class Recrawl(object):
    """Re-enqueues URLs held in a Redis-backed recrawl list.

    A cluster data queue is registered under RECRAWL_LIST_TAG at
    construction time; once the spider opens, every hourly_timeout
    signal pushes each listed URL back onto the priority queue.
    """

    def __init__(self, servers, key, shard_dist, crawler):
        """Initialize Recrawler.

        Parameters
        ----------
        servers : Redis instances backing the recrawl list cluster
        key : str
            Where to store fingerprints.
        shard_dist : shard distribution for the data queue cluster
        crawler : the owning crawler (used to hook signals later)
        """
        self.crawler = crawler
        self.queues = QueueManager()
        self.queues.add(
            RECRAWL_LIST_TAG,
            StrDataQueueCluster(servers, key, DataSet, shard_dist))

    @classmethod
    def from_crawler(cls, crawler):
        """Build an instance from crawler settings and wire it to
        spider_opened so signal hookup happens once a spider exists."""
        settings = crawler.settings
        instance = cls(
            connection.from_settings(settings),
            settings.get('RECRAWL_LIST_KEY', RECRAWL_KEY),
            settings.get('RECRAWL_SHARD_DIST', RECRAWL_SHARD_DIST),
            crawler)
        crawler.signals.connect(
            instance.setup_recrawl, signal=scrapy.signals.spider_opened)
        return instance

    def setup_recrawl(self, spider):
        """On spider open, schedule recrawl() on every hourly timeout."""
        self.crawler.signals.connect(
            self.recrawl, signal=le_crawler.signals.hourly_timeout)

    def recrawl(self):
        """Push every member of the recrawl list back onto the
        priority queue, bypassing the duplicate filter."""
        # Snapshot the members first so pushes don't race the iteration.
        for member in list(self.queues.list_members(RECRAWL_LIST_TAG)):
            self.queues.push(
                PRIORITY_QUEUE_TAG, Request(member, dont_filter=True))
def add(request, channel_id):
    """Django view: add a YouTube video to a channel's queue.

    Reads the candidate URL from ``request.GET['element']``, extracts the
    ``v=`` video id, and records it against the requesting client's IP.

    Parameters
    ----------
    request : django.http.HttpRequest
    channel_id : id of the channel whose queue receives the video

    Returns
    -------
    HttpResponse
        Body ``1`` on success, ``0`` when the URL does not parse.
    """
    clock = Clock(logger=logger)
    clock.start()
    queue = QueueManager(channel=channel_id)
    element = request.GET['element']
    creator = __get_client_ip(request)
    # Remove non-printable chars
    url = filter(lambda x: x in string.printable, element)
    # Extract the video id from ".../watch?v=<id>".  The previous pattern
    # spelled each letter as a one-char class and left the '.' before
    # 'v=' unescaped, so ANY character matched there (e.g. "watchXv=");
    # '\?' pins it to a real query string.
    match = re.search(r'.*watch\?v=([^/,&]*)', url)
    if match:
        queue.add(url=match.group(1), creator=creator)
        logger.info('Added ' + url)
        response = HttpResponse(1)
    else:
        logger.critical('Error! URL Invalid ' + url)
        response = HttpResponse(0)
    # Single timing log instead of one duplicated per branch.
    logger.info("add returned in %f seconds" % clock.stop())
    return response
class Scheduler(object):
    """Redis-based scheduler.

    Requests flow through three sharded Redis queue clusters: new requests
    are pushed to the input queue after URL normalization and duplicate
    filtering; the downloader pulls from the priority queue first, then the
    output queue.
    """

    def __init__(self, servers, persist, input_queue_key, input_queue_cls,
                 input_queue_shard_dist, output_queue_key, output_queue_cls,
                 output_queue_shard_dist, priority_queue_key,
                 priority_queue_cls, priority_queue_shard_dist, recrawl_key,
                 dupefilter_key, dupe_filter_ins, idle_before_close):
        """Initialize scheduler.

        Parameters
        ----------
        servers : list of Redis instance
        persist : bool
            When True, queue and fingerprint state is kept in Redis on close.
        input_queue_key, output_queue_key, priority_queue_key : str
            Redis keys for the three queue clusters.
        input_queue_cls, output_queue_cls, priority_queue_cls : queue class
        input_queue_shard_dist, output_queue_shard_dist,
        priority_queue_shard_dist :
            Shard distributions for the respective clusters.
        recrawl_key : str
        dupefilter_key : str
        dupe_filter_ins : dupefilter instance
        idle_before_close : int
            Blocking-pop timeout used by next_request (floored to 0 in open).
        """
        self.persist = persist
        self.input_queue_key = input_queue_key
        self.input_queue_cls = input_queue_cls
        self.input_queue_shard_dist = input_queue_shard_dist
        self.output_queue_key = output_queue_key
        self.output_queue_cls = output_queue_cls
        self.output_queue_shard_dist = output_queue_shard_dist
        self.priority_queue_key = priority_queue_key
        self.priority_queue_cls = priority_queue_cls
        self.priority_queue_shard_dist = priority_queue_shard_dist
        self.dupefilter_key = dupefilter_key
        self.df = dupe_filter_ins
        self.recrawl_key = recrawl_key
        self.idle_before_close = idle_before_close
        self.stats = None
        self.servers = servers
        self.queues = QueueManager()
        self.url_normalize = UrlNormalize.get_instance()

    def __len__(self):
        # Pending work is whatever the downloader has yet to pull:
        # the priority and output queues (the input queue feeds them).
        return self.queues.len(PRIORITY_QUEUE_TAG) + \
            self.queues.len(OUTPUT_QUEUE_TAG)

    @classmethod
    def from_settings(cls, settings):
        """Build a Scheduler from a settings object, resolving queue and
        dupefilter classes by dotted path."""
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        input_queue_key = settings.get(
            'INPUT_QUEUE_KEY', INPUT_QUEUE_KEY)
        input_queue_cls = load_object(settings.get(
            'INPUT_QUEUE_CLASS', INPUT_QUEUE_CLASS))
        input_queue_shard_dist = settings.get(
            'INPUT_QUEUE_SHARD_DIST', INPUT_QUEUE_SHARD_DIST)
        output_queue_key = settings.get(
            'OUTPUT_QUEUE_KEY', OUTPUT_QUEUE_KEY)
        output_queue_cls = load_object(settings.get(
            'OUTPUT_QUEUE_CLASS', OUTPUT_QUEUE_CLASS))
        output_queue_shard_dist = settings.get(
            'OUTPUT_QUEUE_SHARD_DIST', OUTPUT_QUEUE_SHARD_DIST)
        priority_queue_key = settings.get(
            'PRIORITY_QUEUE_KEY', PRIORITY_QUEUE_KEY)
        priority_queue_cls = load_object(settings.get(
            'PRIORITY_QUEUE_CLASS', PRIORITY_QUEUE_CLASS))
        priority_queue_shard_dist = settings.get(
            'PRIORITY_QUEUE_SHARD_DIST', PRIORITY_QUEUE_SHARD_DIST)
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE',
                                         IDLE_BEFORE_CLOSE)
        servers = connection.from_settings(settings)
        dupefilter_ins = load_object(
            settings['DUPEFILTER_CLASS']).from_settings(settings)
        recrawl_key = settings.get('RECRAWL_LIST_KEY', RECRAWL_KEY)
        return cls(servers, persist, input_queue_key, input_queue_cls,
                   input_queue_shard_dist, output_queue_key,
                   output_queue_cls, output_queue_shard_dist,
                   priority_queue_key, priority_queue_cls,
                   priority_queue_shard_dist, recrawl_key, dupefilter_key,
                   dupefilter_ins, idle_before_close)

    @classmethod
    def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        # FIXME: for now, stats are only supported from this constructor
        instance.stats = crawler.stats
        return instance

    def open(self, spider):
        """Create the three queue clusters for this spider and register
        them with the queue manager."""
        self.spider = spider
        input_queue = CachedRequestQueueCluster(
            self.servers, self.input_queue_key, self.input_queue_cls,
            self.input_queue_shard_dist, self.spider)
        output_queue = CachedRequestQueueCluster(
            self.servers, self.output_queue_key, self.output_queue_cls,
            self.output_queue_shard_dist, self.spider)
        priority_queue = RequestQueueCluster(
            self.servers, self.priority_queue_key, self.priority_queue_cls,
            self.priority_queue_shard_dist, self.spider)
        self.queues.add(INPUT_QUEUE_TAG, input_queue)
        self.queues.add(OUTPUT_QUEUE_TAG, output_queue)
        self.queues.add(PRIORITY_QUEUE_TAG, priority_queue)
        if self.idle_before_close < 0:
            self.idle_before_close = 0
        # notice if there are requests already in the queue to resume the
        # crawl
        if len(input_queue):
            spider.log("Resuming crawl (%d requests scheduled)"
                       % len(input_queue))
        if isinstance(self.df, RFPDupeFilter):
            self.df.set_spider(spider)

    def close(self, reason):
        """Drop all Redis state unless persistence is enabled."""
        if not self.persist:
            self.df.clear()
            self.queues.clear(INPUT_QUEUE_TAG)
            self.queues.clear(OUTPUT_QUEUE_TAG)
            self.queues.clear(PRIORITY_QUEUE_TAG)

    def enqueue_request(self, request):
        """Normalize, dedupe, and push a request onto the input queue.

        Raises
        ------
        Exception
            If the URL cannot be normalized.
        """
        if not request:
            return
        # TODO(Xiaohe): move url normalize to some better place
        # process request, url normalize
        # some place we dont need normalize url in process request or
        # response
        tmpurl = self.url_normalize.get_unique_url(request.url)
        if not tmpurl:
            # (The original had an unreachable `return` after this raise;
            # removed as dead code.)
            raise Exception('Bad request url:%s' % request.url)
        # Keep the pre-normalization URL so next_request can tell this
        # request has already been normalized.
        new_meta = request.meta.copy() or {}
        new_meta['Rawurl'] = request.url
        nrequest = request.replace(url=tmpurl, meta=new_meta)
        # NOTE(review): the dupefilter sees the ORIGINAL request while the
        # normalized one is enqueued, so two raw URLs normalizing to the
        # same target are both admitted — confirm this is intended.
        if not request.dont_filter and self.df.request_seen(request):
            return
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/redis',
                                 spider=self.spider)
        self.queues.push(INPUT_QUEUE_TAG, nrequest)

    def next_request(self):
        """Pop the next request, preferring the priority queue, and
        normalize its URL if it never went through enqueue_request."""
        block_pop_timeout = self.idle_before_close
        request = self.queues.pop(PRIORITY_QUEUE_TAG, block_pop_timeout)
        if request is None:
            request = self.queues.pop(OUTPUT_QUEUE_TAG, block_pop_timeout)
        if request and self.stats:
            self.stats.inc_value('scheduler/dequeued/redis',
                                 spider=self.spider)
        # 'Rawurl' marks requests already normalized on the way in;
        # anything without it (e.g. recrawl pushes) is normalized here.
        # (`in` replaces the deprecated Py2-only dict.has_key.)
        if request and 'Rawurl' not in request.meta:
            tmpurl = self.url_normalize.get_unique_url(request.url)
            if not tmpurl:
                raise Exception('Bad request url:%s' % request.url)
            return request.replace(url=tmpurl)
        return request

    def has_pending_requests(self):
        return len(self) > 0
class TestSequenceFunctions(TestCase):
    """Unit tests for QueueManager: queue CRUD, uniqueness, FIFO order,
    and per-creator vote registration."""

    def setUp(self):
        # Fresh user + channel per test; the queue under test is bound to
        # the channel's id.
        user = User.objects.create(username='******')
        user.save()
        channel = Channel(creator=user)
        channel.save()
        self.queue = QueueManager(channel=channel.id)

    def test_db_connection(self):
        print "Test 1"
        self.assertIsNotNone(self.queue.get_db())

    def test_check_initial_emptiness(self):
        # A newly created queue holds no elements.
        print "Test 2"
        self.assertEqual(len(self.queue.getQueue()), 0)

    def test_add_item(self):
        print "Test 2"
        self.queue.add(url="tGiEsjtfJdg", creator="127.0.0.1")
        self.assertEqual(len(self.queue.getQueue()), 1)

    def test_rm_item(self):
        # Removing from an empty queue must leave it empty (no error).
        print "Test 3"
        self.queue.rm(url="tGiEsjtfJdg")
        self.assertEqual(len(self.queue.getQueue()), 0)

    def test_uniqueness(self):
        # The same video id is stored once, regardless of who adds it.
        print "Test 4"
        self.queue.add(url="tGiEsjtfJdg", creator="127.0.0.1")
        self.queue.add(url="tGiEsjtfJdg", creator="127.0.0.1")
        self.queue.add(url="tGiEsjtfJdg", creator="127.0.0.2")
        self.assertEqual(len(self.queue.getQueue()), 1)

    def test_next_video(self):
        # next() drains the queue in insertion (FIFO) order and returns
        # None once empty.
        print "Test 5"
        self.queue.add(url="tGiEsjtfJdg", creator="127.0.0.1")
        self.queue.add(url="XFwVfrAURDg", creator="127.0.0.1")
        self.queue.add(url="EfuVcRdamCY", creator="127.0.0.1")
        self.queue.add(url="4pRPAbCwgSs", creator="127.0.0.1")
        count = 0
        added = []
        self.assertEqual(len(self.queue.getQueue()), 4)
        self.assertEqual(self.queue.next(), "tGiEsjtfJdg")
        self.assertEqual(len(self.queue.getQueue()), 3)
        self.assertEqual(self.queue.next(), "XFwVfrAURDg")
        self.assertEqual(len(self.queue.getQueue()), 2)
        self.assertEqual(self.queue.next(), "EfuVcRdamCY")
        self.assertEqual(len(self.queue.getQueue()), 1)
        self.assertEqual(self.queue.next(), "4pRPAbCwgSs")
        self.assertEqual(len(self.queue.getQueue()), 0)
        self.assertIsNone(self.queue.next())

    def test_votes(self):
        print "Test 6"
        added = []
        # Asserts that it cant register a vote to something that isn't
        # there
        self.assertFalse(
            self.queue.register_vote(url="dummy", positive=1, negative=0,
                                     creator="127.0.0.1"))
        # Asserts votes for queues of a single item
        # add() appears to return (added?, first-time-for-creator?) —
        # TODO confirm against QueueManager.add.
        self.assertEqual(self.queue.add(url="tGiEsjtfJdg",
                                        creator="127.0.0.1"),
                         (True, True))  #1,0
        self.assertEqual(self.queue.add(url="tGiEsjtfJdg",
                                        creator="127.0.0.1"),
                         (True, False))  #1,0
        elements = self.queue.getQueue()
        for element in elements:
            self.assertIsNotNone(element)
            self.assertEqual(element.get("positive"), 1)
            self.assertEqual(element.get("negative"), 0)
        # A second creator's vote counts once; repeating it is rejected.
        self.assertTrue(
            self.queue.register_vote(
                url="tGiEsjtfJdg",  #2,0
                positive=1,
                negative=0,
                creator="127.0.0.2"))
        self.assertFalse(
            self.queue.register_vote(
                url="tGiEsjtfJdg",  #2,0
                positive=1,
                negative=0,
                creator="127.0.0.2"))
        elements = self.queue.getQueue()
        self.assertEqual(len(elements), 1)
        element = [x for x in elements if x.get("url") == "tGiEsjtfJdg"][0]
        self.assertIsNotNone(element)
        self.assertEqual(element.get("positive"), 2)
        self.assertEqual(element.get("negative"), 0)
        # Asserts votes for bigger queues
        self.queue.add(url="XFwVfrAURDg", creator="127.0.0.1")
        self.queue.add(url="EfuVcRdamCY", creator="127.0.0.1")
        self.queue.add(url="4pRPAbCwgSs", creator="127.0.0.1")
        self.assertIsNotNone(
            self.queue.register_vote(url="tGiEsjtfJdg", positive=0,
                                     negative=1, creator="127.0.0.2"))
        self.assertIsNotNone(
            self.queue.register_vote(url="XFwVfrAURDg", positive=1,
                                     negative=0, creator="127.0.0.2"))
        self.assertIsNotNone(
            self.queue.register_vote(url="EfuVcRdamCY", positive=1,
                                     negative=0, creator="127.0.0.2"))
        self.assertIsNotNone(
            self.queue.register_vote(url="4pRPAbCwgSs", positive=1,
                                     negative=0, creator="127.0.0.2"))
        elements = self.queue.getQueue()
        self.assertEqual(len(elements), 4)
        # NOTE(review): this loop asserts counts for only three of the
        # four URLs — there is no branch for "EfuVcRdamCY"; confirm
        # whether that branch was omitted intentionally or lost.
        for element in elements:
            if element.get("url") == "tGiEsjtfJdg":
                self.assertEqual(element.get("positive"), 1)
                self.assertEqual(element.get("negative"), 1)
            elif element.get("url") == "XFwVfrAURDg":
                self.assertEqual(element.get("positive"), 2)
                self.assertEqual(element.get("negative"), 0)
            elif element.get("url") == "4pRPAbCwgSs":
                self.assertEqual(element.get("positive"), 2)
                self.assertEqual(element.get("negative"), 0)