class TestRFPDupeFilter(object):

    def setup(self):
        self.server = get_redis_mock()
        self.key = 'dupefilter:1'
        self.df = RFPDupeFilter(self.server, self.key)

    def test_request_seen(self):
        req = Request('http://example.com')
        assert not self.df.request_seen(req)
        assert self.df.request_seen(req)

    def test_overridable_request_fingerprinter(self):
        req = Request('http://example.com')
        self.df.request_fingerprint = mock.Mock(wraps=self.df.request_fingerprint)
        assert not self.df.request_seen(req)
        self.df.request_fingerprint.assert_called_with(req)

    def test_clear_deletes(self):
        self.df.clear()
        self.server.delete.assert_called_with(self.key)

    def test_close_calls_clear(self):
        self.df.clear = mock.Mock(wraps=self.df.clear)
        self.df.close()
        self.df.close(reason='foo')
        assert self.df.clear.call_count == 2
class DupeFilterTest(RedisTestMixin, TestCase):

    def setUp(self):
        self.key = 'scrapy_redis:tests:dupefilter:'
        self.df = RFPDupeFilter(self.server, self.key)

    def tearDown(self):
        self.clear_keys(self.key)

    def test_dupe_filter(self):
        req = Request('http://example.com')
        self.assertFalse(self.df.request_seen(req))
        self.assertTrue(self.df.request_seen(req))
        self.df.close('nothing')
class DupeFilterTest(TestCase):

    def setUp(self):
        self.server = redis.Redis(REDIS_HOST, REDIS_PORT)
        self.key = 'scrapy_redis:tests:dupefilter:'
        self.df = RFPDupeFilter(self.server, self.key)

    def tearDown(self):
        self.server.delete(self.key)

    def test_dupe_filter(self):
        req = Request('http://example.com')
        self.assertFalse(self.df.request_seen(req))
        self.assertTrue(self.df.request_seen(req))
        self.df.close('nothing')
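# The tests above exercise a filter with the interface
# RFPDupeFilter(server, key), request_seen(), request_fingerprint(),
# clear() and close(reason).  Below is a minimal sketch consistent with
# that interface, assuming a Redis SET keyed by the request fingerprint;
# the actual RFPDupeFilter implementation may differ in detail.
from scrapy.utils.request import request_fingerprint


class SketchRFPDupeFilter(object):

    def __init__(self, server, key):
        self.server = server  # redis.Redis (or a mock exposing sadd/delete)
        self.key = key        # name of the Redis set holding fingerprints

    def request_fingerprint(self, request):
        # Delegate to Scrapy's default fingerprinting; tests override this.
        return request_fingerprint(request)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        # SADD returns 0 when the member already existed, i.e. a duplicate.
        added = self.server.sadd(self.key, fp)
        return added == 0

    def clear(self):
        self.server.delete(self.key)

    def close(self, reason=''):
        self.clear()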
class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_key):
        self.server = server
        self.persist = persist
        self.queue_key = queue_key

    def __len__(self):
        return len(self.queue)

    # @classmethod
    # def from_settings(cls, settings):
    #     host = settings.get('REDIS_HOST', REDIS_HOST)
    #     port = settings.get('REDIS_PORT', REDIS_PORT)
    #     persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
    #     queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
    #     server = redis.Redis(host, port)
    #     return cls(server, persist, queue_key)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get("REDIS_HOST", REDIS_HOST)
        port = settings.get("REDIS_PORT", REDIS_PORT)
        persist = settings.get("SCHEDULER_PERSIST", SCHEDULER_PERSIST)
        queue_key = settings.get("SCHEDULER_QUEUE_KEY", QUEUE_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key)

    def open(self, spider):
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {"spider": spider.name})
        # notice if there are requests already in the queue
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        return self.queue.pop()

    def has_pending_requests(self):
        return len(self) > 0
class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_key, dupe_filter):
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.dupe_filter = dupe_filter

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get('REDIS_HOST', REDIS_HOST)
        port = settings.get('REDIS_PORT', REDIS_PORT)
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        dupe_filter = settings.get('DUPE_FILTER', True)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key, dupe_filter)

    def open(self, spider):
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)
        if self.dupe_filter:
            self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {'spider': spider.name})
        # notice if there are requests already in the queue
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            if self.dupe_filter:
                self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.dupe_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        return self.queue.pop()

    def has_pending_requests(self):
        return len(self) > 0
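# The schedulers above only rely on a queue with the interface
# SpiderQueue(server, spider, key), push(request), pop(), clear() and
# __len__().  A minimal FIFO sketch backed by a Redis list follows,
# assuming a request can be round-tripped through a small pickled dict
# of its url, method and meta; the real queue class likely handles
# callbacks and other request attributes as well.
import pickle

from scrapy import Request


class SketchSpiderQueue(object):

    def __init__(self, server, spider, key):
        self.server = server  # redis.Redis instance
        self.spider = spider
        self.key = key        # name of the Redis list holding requests

    def __len__(self):
        return self.server.llen(self.key)

    def push(self, request):
        data = pickle.dumps({
            'url': request.url,
            'method': request.method,
            'meta': request.meta,
        })
        self.server.lpush(self.key, data)

    def pop(self):
        data = self.server.rpop(self.key)
        if data is None:
            return None
        d = pickle.loads(data)
        return Request(d['url'], method=d['method'], meta=d['meta'])

    def clear(self):
        self.server.delete(self.key)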
class DupeFilterTest():

    def __init__(self):
        self.setUp()

    def setUp(self):
        self.server = redis.Redis(REDIS_HOST, REDIS_PORT)
        self.key = 'Fcrawler:requests:dupefilter:'
        self.df = RFPDupeFilter(self.server, self.key)

    def tearDown(self):
        self.server.delete(self.key)

    def request_dupe_filter(self, url):
        req = Request(url)
        flag = self.df.request_seen(req)
        self.df.close('nothing')
        return flag
class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_key, queue_cls, dupefilter_key):
        """Initialize scheduler.

        Parameters
        ----------
        server : Redis instance
        persist : bool
        queue_key : str
        queue_cls : queue class
        dupefilter_key : str

        """
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_key = dupefilter_key
        self.stats = None

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', REDIS_HOST)
        port = settings.get('REDIS_PORT', REDIS_PORT)
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key)

    @classmethod
    def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        # FIXME: for now, stats are only supported from this constructor
        instance.stats = crawler.stats
        return instance

    def open(self, spider):
        self.spider = spider
        self.queue = self.queue_cls(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, self.dupefilter_key % {'spider': spider.name})
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
        self.queue.push(request)

    def next_request(self):
        request = self.queue.pop()
        if request and self.stats:
            self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
        return request

    def has_pending_requests(self):
        return len(self) > 0
class Scheduler(object):
    """ A RabbitMQ Scheduler for Scrapy. """

    def __init__(self, server, persist, queue_key, queue_cls, dupefilter_key,
                 idle_before_close, redis_server, *args, **kwargs):
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_key = dupefilter_key
        self.idle_before_close = idle_before_close
        self.stats = None
        self.redis_server = redis_server

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
        server, redis_server = connection.from_settings(settings)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key,
                   idle_before_close, redis_server)

    @classmethod
    def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        instance.stats = crawler.stats
        return instance

    def open(self, spider):
        self.spider = spider
        self.queue = self.queue_cls(self.server, spider, self.queue_key % {'spider': spider.name})
        self.df = RFPDupeFilter(self.redis_server, self.dupefilter_key % {'spider': spider.name})
        if self.idle_before_close < 0:
            self.idle_before_close = 0
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/rabbitmq', spider=self.spider)
        self.queue.push(request)

    def next_request(self):
        block_pop_timeout = self.idle_before_close
        request = self.queue.pop()
        if request and self.stats:
            self.stats.inc_value('scheduler/dequeued/rabbitmq', spider=self.spider)
        return request

    def has_pending_requests(self):
        return len(self) > 0
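# The RabbitMQ variant above obtains its two backends from a helper,
# connection.from_settings(settings), that returns a (server, redis_server)
# pair.  Below is a hedged sketch of such a helper, assuming pika for the
# RabbitMQ side and a hypothetical RABBITMQ_HOST setting; the actual helper
# and its setting names may differ.
import pika
import redis


def from_settings(settings):
    rabbit_host = settings.get('RABBITMQ_HOST', 'localhost')  # assumed setting name
    redis_host = settings.get('REDIS_HOST', 'localhost')
    redis_port = settings.get('REDIS_PORT', 6379)
    # Channel used by the queue class to publish/consume requests.
    rabbit_connection = pika.BlockingConnection(pika.ConnectionParameters(host=rabbit_host))
    channel = rabbit_connection.channel()
    # Plain Redis client used by the dupefilter.
    redis_server = redis.Redis(redis_host, redis_port)
    return channel, redis_server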
class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_cls, queue_key, dupefilter_key,
                 idle_before_close):
        """Initialize scheduler.

        Parameters:
        -----------
        server: redis instance
        persist: bool
        queue_cls: queue class
        queue_key: str
        dupefilter_key: str
        idle_before_close: int
        """
        self.server = server
        self.persist = persist
        self.queue_cls = queue_cls
        self.queue_key = queue_key
        self.dupefilter_key = dupefilter_key
        self.idle_before_close = idle_before_close
        self.stats = None

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        server = redis.Redis(host, port)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        return cls(server, persist, queue_cls, queue_key, dupefilter_key,
                   idle_before_close)

    @classmethod
    def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        # FIXME: for now, stats are only supported from this constructor
        instance.stats = crawler.stats
        return instance
        # settings = crawler.settings
        # return cls.from_settings(settings)

    def open(self, spider):
        """Executed when a spider is opened."""
        self.spider = spider
        self.queue = self.queue_cls(self.server, spider, self.queue_key)
        self.dupefilter = RFPDupeFilter(self.server, self.dupefilter_key % {'spider': spider.name})
        if self.idle_before_close < 0:
            self.idle_before_close = 0
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
        print("++++++++++++++++++++++++")
        print("scheduler open")
        print("++++++++++++++++++++++++")

    def close(self, reason):
        if not self.persist:
            self.dupefilter.clear()
            self.queue.clear()
        print("++++++++++++++++++++++++")
        print("scheduler close")
        print("++++++++++++++++++++++++")

    def enqueue_request(self, request):
        if not request.dont_filter and self.dupefilter.request_seen(request):
            return
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
        self.queue.push(request)
        print("++++++++++++++++++++++++")
        print("scheduler, push request")
        print("++++++++++++++++++++++++")

    def next_request(self):
        print("++++++++++++++++++++++++")
        print("scheduler next_request")
        print("++++++++++++++++++++++++")
        request = self.queue.pop()
        if request and self.stats:
            self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
        return request

    def has_pending_requests(self):
        return len(self) > 0
class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_key, queue_cls, dupefilter_key):
        """Initialize scheduler.

        Parameters
        ----------
        server : Redis instance
        persist : bool
        queue_key : str
        queue_cls : queue class
        dupefilter_key : str

        """
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_key = dupefilter_key

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        host = settings.get("REDIS_HOST", REDIS_HOST)
        port = settings.get("REDIS_PORT", REDIS_PORT)
        persist = settings.get("SCHEDULER_PERSIST", SCHEDULER_PERSIST)
        queue_key = settings.get("SCHEDULER_QUEUE_KEY", QUEUE_KEY)
        queue_cls = load_object(settings.get("SCHEDULER_QUEUE_CLASS", QUEUE_CLASS))
        dupefilter_key = settings.get("DUPEFILTER_KEY", DUPEFILTER_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls.from_settings(settings)

    def open(self, spider):
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, self.dupefilter_key % {"spider": spider.name})
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        return self.queue.pop()

    def has_pending_requests(self):
        return len(self) > 0
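# To use one of these schedulers in a Scrapy project it would be enabled
# via the project settings.  A sketch of the relevant settings.py entries
# follows; the dotted SCHEDULER and SCHEDULER_QUEUE_CLASS paths and the
# key templates are assumptions, while the setting names are exactly the
# ones read by from_settings/from_crawler above.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'              # assumed module path

REDIS_HOST = 'localhost'
REDIS_PORT = 6379

SCHEDULER_PERSIST = True        # keep the queue and dupefilter across runs
SCHEDULER_QUEUE_KEY = '%(spider)s:requests'                 # assumed key template
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderQueue'    # assumed module path
SCHEDULER_IDLE_BEFORE_CLOSE = 10
DUPEFILTER_KEY = '%(spider)s:dupefilter'                    # assumed key template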