def get(self, interval=1.0, key_gen=lambda x: x):
    """Pick the next URL to fetch and return it as ``(request, callback)``.

    This is a generator-based coroutine: every Redis command is issued
    through ``gen.Task`` and awaited with ``yield``, and the result is
    delivered with ``return``.

    Each pass of the loop either serves a URL from the ``self.todo`` list or
    moves one entry from the ``self.queue`` list onto the tail of
    ``self.todo`` and tries again.

    Args:
        interval: Minimum number of seconds that must have passed since the
            time recorded for the URL's hostname in ``self.hostmap``.
        key_gen: Maps a URL to the field looked up in the ``self.keypool``
            hash. Defaults to the identity function.

    Returns:
        ``(req_gen(url), self.callback)`` for an accepted URL, or
        ``(None, None)`` when
        * both lists are empty,
        * the pop from ``self.todo`` yields ``None`` or an empty value, or
        * the URL's host was used less than ``interval`` seconds ago (the
          URL is re-appended to ``self.todo`` first).
    """
    while True:
        todo_l = yield gen.Task(self.rd.llen, self.todo)
        queue_l = yield gen.Task(self.rd.llen, self.queue)
        if todo_l == 0 and queue_l == 0:
            return None, None

        # With an empty queue always serve from todo; otherwise serve from
        # todo with probability todo_l / (todo_l + 10000).
        if queue_l == 0 or random() < todo_l / (todo_l + 10000):
            url = yield gen.Task(self.rd.lpop, self.todo)
            if url is None or len(url) == 0:
                return None, None
            urlkey = key_gen(url)

            # The three checks below drop the popped URL for good: it is
            # not pushed back before the loop continues.
            if (yield from try_return(self.urlfilter(url))) is False:
                continue
            # NOTE(review): nonetest is defined elsewhere; presumably it is
            # true when the hash already holds an entry -- confirm.
            if nonetest((yield gen.Task(self.rd.hget, self.keypool, urlkey))):
                continue
            if nonetest((yield gen.Task(self.rd.hget, self.doing, url))):
                continue

            hostname = urlparse(url).hostname
            # NOTE(review): time_mapper presumably turns the stored hash
            # value (or a missing one) into a numeric timestamp -- confirm.
            t = time_mapper((yield gen.Task(self.rd.hget, self.hostmap, hostname)))
            if time() - t < interval:
                # Host was hit too recently: put the URL back at the tail
                # of todo and report "nothing to do right now".
                yield gen.Task(self.rd.rpush, self.todo, url)
                return None, None

            req = req_gen(url)
            yield gen.Task(self.rd.hset, self.hostmap, hostname, time())
            yield gen.Task(self.rd.hset, self.doing, url, time())
            return req, self.callback

        # Promote one entry from queue to todo. Pop first and test the
        # result instead of LLEN-then-LPOP: if the queue is drained between
        # the two commands, LPOP yields None and that None must not be
        # pushed onto todo.
        moved = yield gen.Task(self.rd.lpop, self.queue)
        if moved is not None:
            yield gen.Task(self.rd.rpush, self.todo, moved)
Exemple #2
0
 def get(self, interval=1.0, key_gen=lambda x: x):
     """Select the next crawlable URL and return ``(request, callback)``.

     Generator-based coroutine: Redis commands are wrapped in ``gen.Task``
     and awaited with ``yield``; the outcome is delivered with ``return``.

     On every loop pass the method either takes a URL from the head of the
     ``self.todo`` list, or transfers one entry from ``self.queue`` to the
     tail of ``self.todo`` and loops again.

     Args:
         interval: Minimum number of seconds since the time stored for the
             URL's hostname in ``self.hostmap``.
         key_gen: Converts a URL into the field checked in the
             ``self.keypool`` hash. Defaults to the identity function.

     Returns:
         ``(req_gen(url), self.callback)`` when a URL is accepted.
         ``(None, None)`` when both lists are empty, when popping
         ``self.todo`` yields ``None`` or an empty value, or when the URL's
         host was used less than ``interval`` seconds ago (in which case
         the URL is pushed back onto ``self.todo`` first).
     """
     while True:
         todo_l = yield gen.Task(self.rd.llen, self.todo)
         queue_l = yield gen.Task(self.rd.llen, self.queue)
         if todo_l == 0 and queue_l == 0:
             return None, None

         # An empty queue forces the todo branch; otherwise it is taken
         # with probability todo_l / (todo_l + 10000).
         if queue_l == 0 or random() < todo_l / (todo_l + 10000):
             url = yield gen.Task(self.rd.lpop, self.todo)
             if url is None or len(url) == 0:
                 return None, None
             urlkey = key_gen(url)

             # A URL rejected by any of the next three checks is discarded:
             # it has been popped and is not re-queued.
             if (yield from try_return(self.urlfilter(url))) is False:
                 continue
             # NOTE(review): nonetest lives elsewhere; presumably true when
             # the looked-up hash field exists -- confirm.
             if nonetest((yield gen.Task(self.rd.hget, self.keypool,
                                         urlkey))):
                 continue
             if nonetest((yield gen.Task(self.rd.hget, self.doing, url))):
                 continue

             hostname = urlparse(url).hostname
             # NOTE(review): time_mapper presumably converts the stored
             # value (possibly missing) to a numeric timestamp -- confirm.
             t = time_mapper((yield gen.Task(self.rd.hget, self.hostmap,
                                             hostname)))
             if time() - t < interval:
                 # Too soon for this host: re-append the URL and tell the
                 # caller there is nothing to fetch right now.
                 yield gen.Task(self.rd.rpush, self.todo, url)
                 return None, None

             req = req_gen(url)
             yield gen.Task(self.rd.hset, self.hostmap, hostname, time())
             yield gen.Task(self.rd.hset, self.doing, url, time())
             return req, self.callback

         # Transfer one entry from queue to todo. LPOP is issued directly
         # and its result checked, rather than LLEN followed by LPOP: when
         # the queue empties between the two commands LPOP yields None,
         # and None must never be pushed onto todo.
         moved = yield gen.Task(self.rd.lpop, self.queue)
         if moved is not None:
             yield gen.Task(self.rd.rpush, self.todo, moved)
Exemple #3
0
import sys
from asyncengin import asyncdo
from webtools import req_gen, host_rebalance
import aiohttp
from urllib import parse
import asyncio_mongo

def print_response(response):
    """Store a fetched page in MongoDB (despite the name, nothing is printed).

    Generator-based coroutine. It upserts a document whose ``_id`` is
    ``response.ourl`` and whose ``html`` field is ``response.body`` via
    ``mongo.test.url_html.update(..., upsert=True)``.

    NOTE(review): a new connection is created on every call and is not
    closed inside this function -- confirm whether that is intended.
    """
    connection = yield from asyncio_mongo.Connection.create()
    upsert_into = connection.test.url_html.update
    selector = {'_id': response.ourl}
    document = {'_id': response.ourl, 'html': response.body}
    yield from upsert_into(selector, document, upsert=True)


# Lazily turn each stdin line into a (request, callback) pair, skipping any
# line that mentions ".swf" (case-insensitive).
urls = (
    (req_gen(line.strip()), print_response)
    for line in sys.stdin
    if '.swf' not in line.lower()
)

# NOTE(review): host_rebalance / asyncdo are project helpers; presumably
# interval is a per-host delay in seconds and n a concurrency limit -- confirm.
reqs = host_rebalance(urls, interval=0.5)

asyncdo(reqs, n=50)
Exemple #4
0
import sys
from asyncengin import asyncdo
from webtools import req_gen, host_rebalance
import aiohttp
from urllib import parse
import asyncio_mongo


def print_response(response):
    """Persist a response's HTML into MongoDB; nothing is actually printed.

    Generator-based coroutine. Calls ``update`` on ``mongo.test.url_html``
    with ``upsert=True``, using ``response.ourl`` as the document ``_id``
    and ``response.body`` as its ``html`` field.

    NOTE(review): every call creates its own connection and this function
    never closes it -- confirm that this is acceptable.
    """
    client = yield from asyncio_mongo.Connection.create()
    collection = client.test.url_html
    yield from collection.update(
        {'_id': response.ourl},
        {'_id': response.ourl, 'html': response.body},
        upsert=True,
    )


# Lazy pipeline over stdin: drop lines containing ".swf" (case-insensitive),
# then pair a request built from the stripped line with the storage callback.
urls = ((req_gen(raw.strip()), print_response)
        for raw in filter(lambda text: '.swf' not in text.lower(), sys.stdin))

# NOTE(review): host_rebalance and asyncdo are imported project helpers;
# presumably interval is seconds between hits on one host and n is the
# number of concurrent workers -- confirm.
reqs = host_rebalance(urls, interval=0.5)

asyncdo(reqs, n=50)