def get(self, interval=1.0, key_gen=lambda x: x):
    """Select the next URL to fetch and return ``(request, callback)``.

    Generator-based coroutine: Redis commands are issued through
    ``gen.Task`` and awaited with ``yield``; the URL filter is awaited with
    ``yield from``.

    Every pass of the loop either serves one URL from the ``self.todo``
    list or moves one entry from the head of ``self.queue`` to the tail of
    ``self.todo`` and tries again.

    Args:
        interval: Threshold compared against ``time() - t``, where ``t`` is
            derived from the host's entry in ``self.hostmap``; a URL whose
            host is below the threshold is pushed back and not served.
        key_gen: Maps a URL to the field looked up in ``self.keypool``.
            Defaults to the identity function.

    Returns:
        ``(req_gen(url), self.callback)`` for an accepted URL, or
        ``(None, None)`` when both lists are empty, when the popped URL is
        missing or empty, or when the URL's host is still inside
        ``interval`` (the URL is re-appended to ``self.todo`` first).
    """
    while True:
        todo_l = yield gen.Task(self.rd.llen, self.todo)
        queue_l = yield gen.Task(self.rd.llen, self.queue)
        if todo_l == 0 and queue_l == 0:
            return None, None

        # Serve from `todo` when `queue` is empty; otherwise only with a
        # chance that grows with the `todo` backlog (todo_l / (todo_l + 10000)).
        if queue_l == 0 or random() < todo_l / (todo_l + 10000):
            url = yield gen.Task(self.rd.lpop, self.todo)
            if url is None or len(url) == 0:
                return None, None
            urlkey = key_gen(url)

            # Drop the URL when the filter explicitly answers False.
            if (yield from try_return(self.urlfilter(url))) is False:
                continue
            # NOTE(review): presumably nonetest() is truthy when the hash
            # already holds an entry (URL key known / URL in flight) --
            # confirm against nonetest's definition.
            if nonetest((yield gen.Task(self.rd.hget, self.keypool, urlkey))):
                continue
            if nonetest((yield gen.Task(self.rd.hget, self.doing, url))):
                continue

            hostname = urlparse(url).hostname
            # NOTE(review): time_mapper presumably turns the stored value
            # into a numeric timestamp -- confirm.
            t = time_mapper((yield gen.Task(self.rd.hget, self.hostmap, hostname)))
            if time() - t < interval:
                # Host was used too recently: put the URL back for later.
                yield gen.Task(self.rd.rpush, self.todo, url)
                return None, None

            req = req_gen(url)
            # Record when the host was last used and mark the URL in flight.
            yield gen.Task(self.rd.hset, self.hostmap, hostname, time())
            yield gen.Task(self.rd.hset, self.doing, url, time())
            return req, self.callback

        # Promote one entry from `queue` to `todo`.  Pop first and test the
        # result instead of checking llen beforehand: another coroutine may
        # drain `queue` between the two commands, and a missing value must
        # never be pushed onto `todo`.
        promoted = yield gen.Task(self.rd.lpop, self.queue)
        if promoted:
            yield gen.Task(self.rd.rpush, self.todo, promoted)
def get(self, interval=1.0, key_gen=lambda x: x):
    """Select the next URL to fetch and return ``(request, callback)``.

    Generator-based coroutine: Redis commands are issued through
    ``gen.Task`` and awaited with ``yield``; the URL filter is awaited with
    ``yield from``.

    Every pass of the loop either serves one URL from the ``self.todo``
    list or moves one entry from the head of ``self.queue`` to the tail of
    ``self.todo`` and tries again.

    Args:
        interval: Threshold compared against ``time() - t``, where ``t`` is
            derived from the host's entry in ``self.hostmap``; a URL whose
            host is below the threshold is pushed back and not served.
        key_gen: Maps a URL to the field looked up in ``self.keypool``.
            Defaults to the identity function.

    Returns:
        ``(req_gen(url), self.callback)`` for an accepted URL, or
        ``(None, None)`` when both lists are empty, when the popped URL is
        missing or empty, or when the URL's host is still inside
        ``interval`` (the URL is re-appended to ``self.todo`` first).
    """
    while True:
        todo_l = yield gen.Task(self.rd.llen, self.todo)
        queue_l = yield gen.Task(self.rd.llen, self.queue)
        if todo_l == 0 and queue_l == 0:
            return None, None

        # Serve from `todo` when `queue` is empty; otherwise only with a
        # chance that grows with the `todo` backlog (todo_l / (todo_l + 10000)).
        if queue_l == 0 or random() < todo_l / (todo_l + 10000):
            url = yield gen.Task(self.rd.lpop, self.todo)
            if url is None or len(url) == 0:
                return None, None
            urlkey = key_gen(url)

            # Drop the URL when the filter explicitly answers False.
            if (yield from try_return(self.urlfilter(url))) is False:
                continue
            # NOTE(review): presumably nonetest() is truthy when the hash
            # already holds an entry (URL key known / URL in flight) --
            # confirm against nonetest's definition.
            if nonetest((yield gen.Task(self.rd.hget, self.keypool, urlkey))):
                continue
            if nonetest((yield gen.Task(self.rd.hget, self.doing, url))):
                continue

            hostname = urlparse(url).hostname
            # NOTE(review): time_mapper presumably turns the stored value
            # into a numeric timestamp -- confirm.
            t = time_mapper((yield gen.Task(self.rd.hget, self.hostmap, hostname)))
            if time() - t < interval:
                # Host was used too recently: put the URL back for later.
                yield gen.Task(self.rd.rpush, self.todo, url)
                return None, None

            req = req_gen(url)
            # Record when the host was last used and mark the URL in flight.
            yield gen.Task(self.rd.hset, self.hostmap, hostname, time())
            yield gen.Task(self.rd.hset, self.doing, url, time())
            return req, self.callback

        # Promote one entry from `queue` to `todo`.  Pop first and test the
        # result instead of checking llen beforehand: another coroutine may
        # drain `queue` between the two commands, and a missing value must
        # never be pushed onto `todo`.
        promoted = yield gen.Task(self.rd.lpop, self.queue)
        if promoted:
            yield gen.Task(self.rd.rpush, self.todo, promoted)
import sys
from asyncengin import asyncdo
from webtools import req_gen, host_rebalance
import aiohttp
from urllib import parse
import asyncio_mongo


def print_response(response):
    """Store a fetched page in MongoDB, keyed by ``response.ourl``.

    Generator-based coroutine (driven with ``yield from``).  Upserts the
    document ``{'_id': response.ourl, 'html': response.body}`` into the
    ``url_html`` collection of the ``test`` database, so fetching the same
    ``ourl`` again overwrites the stored HTML instead of adding a document.

    Args:
        response: Object exposing ``ourl`` and ``body`` attributes.
    """
    # NOTE(review): a fresh connection is created for every response and is
    # never closed in this function; consider sharing a single connection --
    # confirm the lifecycle API offered by asyncio_mongo first.
    mongo = yield from asyncio_mongo.Connection.create()
    page_id = response.ourl
    yield from mongo.test.url_html.update(
        {'_id': page_id},
        {'_id': page_id, 'html': response.body},
        upsert=True
    )


def _urls_from(lines):
    """Yield stripped URLs from *lines*, skipping blank lines and .swf links."""
    for line in lines:
        url = line.strip()
        # A blank line would otherwise become a request for an empty URL.
        if url and '.swf' not in url.lower():
            yield url


# Lazily pair every URL read from stdin with the storage callback.
urls = ((req_gen(url), print_response) for url in _urls_from(sys.stdin))
# NOTE(review): interval=0.5 and n=50 are presumably the per-host delay in
# seconds and the number of concurrent workers -- confirm in webtools/asyncengin.
reqs = host_rebalance(urls, interval=0.5)
asyncdo(reqs, n=50)
import sys
from asyncengin import asyncdo
from webtools import req_gen, host_rebalance
import aiohttp
from urllib import parse
import asyncio_mongo


def print_response(response):
    """Store a fetched page in MongoDB, keyed by ``response.ourl``.

    Generator-based coroutine (driven with ``yield from``).  Upserts the
    document ``{'_id': response.ourl, 'html': response.body}`` into the
    ``url_html`` collection of the ``test`` database, so fetching the same
    ``ourl`` again overwrites the stored HTML instead of adding a document.

    Args:
        response: Object exposing ``ourl`` and ``body`` attributes.
    """
    # NOTE(review): a fresh connection is created for every response and is
    # never closed in this function; consider sharing a single connection --
    # confirm the lifecycle API offered by asyncio_mongo first.
    mongo = yield from asyncio_mongo.Connection.create()
    page_id = response.ourl
    yield from mongo.test.url_html.update(
        {'_id': page_id},
        {'_id': page_id, 'html': response.body},
        upsert=True
    )


def _urls_from(lines):
    """Yield stripped URLs from *lines*, skipping blank lines and .swf links."""
    for line in lines:
        url = line.strip()
        # A blank line would otherwise become a request for an empty URL.
        if url and '.swf' not in url.lower():
            yield url


# Lazily pair every URL read from stdin with the storage callback.
urls = ((req_gen(url), print_response) for url in _urls_from(sys.stdin))
# NOTE(review): interval=0.5 and n=50 are presumably the per-host delay in
# seconds and the number of concurrent workers -- confirm in webtools/asyncengin.
reqs = host_rebalance(urls, interval=0.5)
asyncdo(reqs, n=50)