def open(self, spider):
    """Set up the redis queue and request dupefilter when *spider* opens.

    The 'fgilt' spider gets time-prefixed redis keys so that separate
    runs do not share queue/dupefilter state.
    """
    self.spider = spider
    if spider.name == 'fgilt':
        self.queue = self.queue_cls(self.server, spider, self.time + '_' + self.queue_key)
    else:
        self.queue = self.queue_cls(self.server, spider, self.queue_key)
    df_key = self.dupefilter_key % {'spider': spider.name}
    if self.need_dupefilter:
        if spider.name == 'fgilt':
            self.df = RFPDupeFilter(self.server, self.time + '_' + df_key)
        else:
            self.df = RFPDupeFilter(self.server, df_key)
    else:
        # Item-level dedup is used when request-level dedup is disabled.
        self.df = ItemRFPDupeFilter(self.server, df_key)
    if self.idle_before_close < 0:
        self.idle_before_close = 0
    # notice if there are requests already in the queue to resume the crawl
    if len(self.queue):
        spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
def test_log_dupes():
    """log() emits a single debug line by default, one per dupe with debug=True."""
    def _run(dupefilter, n_dupes, expected_calls):
        # Wrap the real logger so call counts can be asserted.
        dupefilter.logger.debug = mock.Mock(wraps=dupefilter.logger.debug)
        for _ in range(n_dupes):
            req = Request('http://example')
            dupefilter.log(req, spider=mock.Mock())
        assert dupefilter.logger.debug.call_count == expected_calls

    server = get_redis_mock()
    # Quiet mode (debug=False): only the first duplicate is reported.
    _run(RFPDupeFilter(server, 'foo'), 5, 1)
    # Debug mode: every duplicate is reported.
    _run(RFPDupeFilter(server, 'foo', debug=True), 5, 5)
def open(self, spider):
    """Bind *spider* and create its redis-backed queue and dupefilter."""
    self.spider = spider
    self.queue = SpiderQueue(self.server, spider, self.queue_key)
    self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {'spider': spider.name})
    # A non-empty queue means a previous crawl is being resumed.
    if len(self.queue):
        spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
def open(self, spider):
    """Attach the scheduler to *spider*: build queue and dupefilter."""
    self.spider = spider
    self.queue = self.queue_cls(self.server, spider, self.queue_key)
    df_key = self.dupefilter_key % {'spider': spider.name}
    self.df = RFPDupeFilter(self.server, df_key)
    # A non-empty queue means requests survived from an earlier run.
    if len(self.queue):
        spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
def open(self, spider):
    """Set up queue and dupefilter for *spider* when the scheduler opens."""
    self.spider = spider
    self.queue = self.queue_cls(self.server, spider, self.queue_key)
    # NOTE(review): mapping key 'spider.domain' must match a
    # '%(spider.domain)s' placeholder in dupefilter_key; sibling
    # implementations use {'spider': spider.name} — confirm this is intended.
    self.df = RFPDupeFilter(self.server, self.dupefilter_key % {'spider.domain': spider.domain})
    if self.idle_before_close < 0:
        self.idle_before_close = 0
    # notice if there are requests already in the queue to resume the crawl
    if len(self.queue):
        spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
def open(self, spider):
    """Build the per-spider queue and dupefilter when the scheduler opens."""
    self.spider = spider
    queue_key = self.queue_key % {'spider': spider.name}
    self.queue = self.queue_cls(self.server, spider, queue_key)
    # NOTE(review): queue uses self.server but the dupefilter uses
    # self.redis_server — verify both attributes exist and point where intended.
    self.df = RFPDupeFilter(self.redis_server, self.dupefilter_key % {'spider': spider.name})
    if self.idle_before_close < 0:
        self.idle_before_close = 0
    if len(self.queue):
        spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
def open(self, spider):
    """Execute this function when a spider is opened.

    Binds the spider, builds its redis queue and request dupefilter,
    and reports any requests already queued from a previous run.
    """
    self.spider = spider
    self.queue = self.queue_cls(self.server, spider, self.queue_key)
    self.dupefilter = RFPDupeFilter(self.server, self.dupefilter_key % {'spider': spider.name})
    if self.idle_before_close < 0:
        self.idle_before_close = 0
    # notice if there are requests already in the queue to resume the crawl
    if len(self.queue):
        spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
    # Removed leftover Python-2 `print` debug banner ("scheduler open"):
    # it polluted stdout and is a SyntaxError under Python 3.
def setUp(self):
    """Create a dupefilter bound to a dedicated test key."""
    key = 'scrapy_redis:tests:dupefilter:'
    self.key = key
    self.df = RFPDupeFilter(self.server, key)
def setUp(self):
    """Connect to the test redis instance and build the dupefilter under test."""
    self.server = redis.Redis(REDIS_HOST, REDIS_PORT)
    key = 'scrapy_redis:tests:dupefilter:'
    self.key = key
    self.df = RFPDupeFilter(self.server, key)
def setup(self):
    """Per-test fixture: a mocked redis server plus a dupefilter on it."""
    server = get_redis_mock()
    self.server = server
    self.key = 'dupefilter:1'
    self.df = RFPDupeFilter(server, self.key)