Code example #1
 def open(self, spider):
     self.spider = spider
     self.queue = SpiderQueue(self.server, spider, self.queue_key)
     self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {'spider': spider.name})
     # notice if there are requests already in the queue
     if len(self.queue):
         spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
Code example #2
    def open(self, spider):
        self.spider = spider

        # the 'fgilt' spider gets time-prefixed keys so each run is isolated
        prefix = self.time + '_' if spider.name == 'fgilt' else ''

        self.queue = self.queue_cls(self.server, spider,
                                    prefix + self.queue_key)

        if self.need_dupefilter:
            self.df = RFPDupeFilter(
                self.server,
                prefix + self.dupefilter_key % {'spider': spider.name})
        else:
            self.df = ItemRFPDupeFilter(
                self.server, self.dupefilter_key % {'spider': spider.name})

        if self.idle_before_close < 0:
            self.idle_before_close = 0
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" %
                       len(self.queue))
Code example #3
 def open(self, spider):
     self.spider = spider
     self.queue = self.queue_cls(self.server, spider, self.queue_key)
     self.df = RFPDupeFilter(self.server, self.dupefilter_key % {'spider': spider.name})
     if self.idle_before_close < 0:
         self.idle_before_close = 0
     # notice if there are requests already in the queue to resume the crawl
     if len(self.queue):
         spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
Code example #4
File: scheduler.py (project xuemy/scrapy-redis)
 def open(self, spider):
     self.spider = spider
     self.queue = self.queue_cls(self.server, spider, self.queue_key)
     self.df = RFPDupeFilter(self.server,
                             self.dupefilter_key % {'spider': spider.name})
     # notice if there are requests already in the queue to resume the crawl
     if len(self.queue):
         spider.log("Resuming crawl (%d requests scheduled)" %
                    len(self.queue))
Code example #5
    def open(self, spider):
        self.spider = spider
        self.queue = self.queue_cls(self.server, spider, self.queue_key % {'spider': spider.name})
        self.df = RFPDupeFilter(self.redis_server, self.dupefilter_key % {'spider': spider.name})

        if self.idle_before_close < 0:
            self.idle_before_close = 0

        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
Code example #6
class TestRFPDupeFilter(object):
    def setup(self):
        self.server = get_redis_mock()
        self.key = 'dupefilter:1'
        self.df = RFPDupeFilter(self.server, self.key)

    def test_request_seen(self):
        req = Request('http://example.com')
        assert not self.df.request_seen(req)
        assert self.df.request_seen(req)

    def test_overridable_request_fingerprinter(self):
        req = Request('http://example.com')
        self.df.request_fingerprint = mock.Mock(
            wraps=self.df.request_fingerprint)
        assert not self.df.request_seen(req)
        self.df.request_fingerprint.assert_called_with(req)

    def test_clear_deletes(self):
        self.df.clear()
        self.server.delete.assert_called_with(self.key)

    def test_close_calls_clear(self):
        self.df.clear = mock.Mock(wraps=self.df.clear)
        self.df.close()
        self.df.close(reason='foo')
        assert self.df.clear.call_count == 2
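get_redis_mock() is not part of the listing; for these tests to pass, the mock's sadd must behave like Redis SADD, returning 1 the first time a fingerprint is added and 0 on repeats (RFPDupeFilter.request_seen treats a 0 return as "already seen"). A hypothetical sketch:

import mock  # or: from unittest import mock

def get_redis_mock():
    server = mock.Mock()
    seen = set()

    def sadd(key, member):
        # Redis SADD returns the number of members actually added
        if member in seen:
            return 0
        seen.add(member)
        return 1

    server.sadd = mock.Mock(side_effect=sadd)
    return server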
Code example #7
File: test_dupefilter.py (project lnlantian/Scrapy)
class TestRFPDupeFilter(object):

    def setup(self):
        self.server = get_redis_mock()
        self.key = 'dupefilter:1'
        self.df = RFPDupeFilter(self.server, self.key)

    def test_request_seen(self):
        req = Request('http://example.com')
        assert not self.df.request_seen(req)
        assert self.df.request_seen(req)

    def test_overridable_request_fingerprinter(self):
        req = Request('http://example.com')
        self.df.request_fingerprint = mock.Mock(wraps=self.df.request_fingerprint)
        assert not self.df.request_seen(req)
        self.df.request_fingerprint.assert_called_with(req)

    def test_clear_deletes(self):
        self.df.clear()
        self.server.delete.assert_called_with(self.key)

    def test_close_calls_clear(self):
        self.df.clear = mock.Mock(wraps=self.df.clear)
        self.df.close()
        self.df.close(reason='foo')
        assert self.df.clear.call_count == 2
Code example #8
class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_key):
        self.server = server
        self.persist = persist
        self.queue_key = queue_key

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get("REDIS_HOST", REDIS_HOST)
        port = settings.get("REDIS_PORT", REDIS_PORT)
        persist = settings.get("SCHEDULER_PERSIST", SCHEDULER_PERSIST)
        queue_key = settings.get("SCHEDULER_QUEUE_KEY", QUEUE_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key)

    def open(self, spider):
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {"spider": spider.name})
        # notice if there are requests already in the queue
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        return self.queue.pop()

    def has_pending_requests(self):
        return len(self) > 0
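To use a scheduler like this, scrapy-redis is wired in through the project settings; a typical configuration for the classic releases looks roughly like:

# settings.py
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
SCHEDULER_PERSIST = True   # keep the queue and dupefilter across runs
REDIS_HOST = 'localhost'
REDIS_PORT = 6379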
Code example #9
    def __init__(self, server, key, debug=False):
        self.start_time = time.strftime("%Y%m%d", time.localtime())
        self.yesterday_time = (datetime.date.today() + datetime.timedelta(days=-1)).strftime("%Y%m%d")

        RFPDupeFilter.__init__(self, server, key, debug)

        self.redis_key_today = self.key + self.start_time
        self.redis_key_yesterday = self.key + self.yesterday_time

        # the day's fingerprint set expires after two days
        self.server.sadd(self.redis_key_today, "created")
        self.server.expire(self.redis_key_today, 86400 * 2)
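This subclass dates the fingerprint keys so deduplication only covers a rolling two-day window. The matching request_seen override is not shown; presumably it consults both the today and yesterday sets, roughly:

def request_seen(self, request):
    fp = self.request_fingerprint(request)
    # seen if the fingerprint was recorded today or yesterday
    if self.server.sismember(self.redis_key_today, fp) or \
            self.server.sismember(self.redis_key_yesterday, fp):
        return True
    self.server.sadd(self.redis_key_today, fp)
    return False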
Code example #10
def test_log_dupes():
    def _test(df, dupes, logcount):
        df.logger.debug = mock.Mock(wraps=df.logger.debug)
        for i in range(dupes):
            req = Request('http://example')
            df.log(req, spider=mock.Mock())
        assert df.logger.debug.call_count == logcount

    server = get_redis_mock()

    df_quiet = RFPDupeFilter(server, 'foo')  # debug=False
    _test(df_quiet, 5, 1)

    df_debug = RFPDupeFilter(server, 'foo', debug=True)
    _test(df_debug, 5, 5)
Code example #11
class DupeFilterTest(RedisTestMixin, TestCase):
    def setUp(self):
        self.key = 'scrapy_redis:tests:dupefilter:'
        self.df = RFPDupeFilter(self.server, self.key)

    def tearDown(self):
        self.clear_keys(self.key)

    def test_dupe_filter(self):
        req = Request('http://example.com')

        self.assertFalse(self.df.request_seen(req))
        self.assertTrue(self.df.request_seen(req))

        self.df.close('nothing')
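RedisTestMixin is not included in the listing; judging from its use here, it supplies a live connection plus a cleanup helper, along these lines (hypothetical):

import redis

class RedisTestMixin(object):

    @property
    def server(self):
        # lazily connect to a local test Redis instance
        if not hasattr(self, '_server'):
            self._server = redis.Redis('localhost', 6379)
        return self._server

    def clear_keys(self, prefix):
        # delete every key created under the test prefix
        keys = self.server.keys(prefix + '*')
        if keys:
            self.server.delete(*keys)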
Code example #12
File: scheduler.py (project darthbear/scrapy-redis)
class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_key, dupe_filter):
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.dupe_filter = dupe_filter

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get('REDIS_HOST', REDIS_HOST)
        port = settings.get('REDIS_PORT', REDIS_PORT)
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        dupe_filter = settings.get('DUPE_FILTER', True)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key, dupe_filter)

    def open(self, spider):
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)
        if self.dupe_filter:
            self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {'spider': spider.name})
        # notice if there are requests already in the queue
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            if self.dupe_filter:
                self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.dupe_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        return self.queue.pop()

    def has_pending_requests(self):
        return len(self) > 0
Code example #13
 def open(self, spider):
     """Called when a spider is opened."""
     self.spider = spider
     self.queue = self.queue_cls(self.server, spider, self.queue_key)
     self.dupefilter = RFPDupeFilter(self.server, self.dupefilter_key % {'spider': spider.name})
     if self.idle_before_close < 0:
         self.idle_before_close = 0
     # notice if there are requests already in the queue to resume the crawl
     if len(self.queue):
         spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
     print("++++++++++++++++++++++++")
     print("scheduler open")
     print("++++++++++++++++++++++++")
Code example #14
File: scheduler.py (project xuemy/scrapy-redis)
 def open(self, spider):
     self.spider = spider
     self.queue = self.queue_cls(self.server, spider, self.queue_key)
     self.df = RFPDupeFilter(self.server, self.dupefilter_key % {'spider': spider.name})
     # notice if there are requests already in the queue to resume the crawl
     if len(self.queue):
         spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
Code example #15
class DupeFilterTest(RedisTestMixin, TestCase):

    def setUp(self):
        self.key = 'scrapy_redis:tests:dupefilter:'
        self.df = RFPDupeFilter(self.server, self.key)

    def tearDown(self):
        self.clear_keys(self.key)

    def test_dupe_filter(self):
        req = Request('http://example.com')

        self.assertFalse(self.df.request_seen(req))
        self.assertTrue(self.df.request_seen(req))

        self.df.close('nothing')
Code example #16
class DupeFilterTest(TestCase):
    def setUp(self):
        self.server = redis.Redis(REDIS_HOST, REDIS_PORT)
        self.key = 'scrapy_redis:tests:dupefilter:'
        self.df = RFPDupeFilter(self.server, self.key)

    def tearDown(self):
        self.server.delete(self.key)

    def test_dupe_filter(self):
        req = Request('http://example.com')

        self.assertFalse(self.df.request_seen(req))
        self.assertTrue(self.df.request_seen(req))

        self.df.close('nothing')
Code example #17
 def open(self, spider):
     self.spider = spider
     self.queue = SpiderQueue(self.server, spider, self.queue_key)
     self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {"spider": spider.name})
     # notice if there are requests already in the queue
     if len(self.queue):
         spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
Code example #18
File: allPythonContent.py (project Mondego/pyreco)
class DupeFilterTest(TestCase):

    def setUp(self):
        self.server = redis.Redis(REDIS_HOST, REDIS_PORT)
        self.key = 'scrapy_redis:tests:dupefilter:'
        self.df = RFPDupeFilter(self.server, self.key)

    def tearDown(self):
        self.server.delete(self.key)

    def test_dupe_filter(self):
        req = Request('http://example.com')

        self.assertFalse(self.df.request_seen(req))
        self.assertTrue(self.df.request_seen(req))

        self.df.close('nothing')
Code example #19
File: bloom_filter.py (project hcxdata/zergling)
 def __init__(self, server=None, key=None, debug=False, db=0, blockNum=1, redis_key='bloomfilter'):
     """
     :param server: the Redis connection
     :param key: the key name handed to RFPDupeFilter
     :param debug: whether to log every duplicate request
     :param db: which db in Redis
     :param blockNum: one block covers roughly 90,000,000 entries; increase it if you filter more strings
     :param redis_key: the bloom filter's key name in Redis
     """
     RFPDupeFilter.__init__(self, server, key, debug)
     self.logger = logging.getLogger(__name__)
     self.bit_size = 1 << 31  # Redis strings max out at 512 MB; this uses 256 MB
     self.seeds = [5, 7, 11, 13, 31, 37, 61]
     self.logger.info(self.server)
     self.key = redis_key
     self.blockNum = blockNum
     self.hashfunc = [SimpleHash(self.bit_size, seed) for seed in self.seeds]
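SimpleHash is referenced but not defined in this file; in the commonly copied bloom-filter recipe it maps a string into the bit space with a seeded polynomial hash, roughly:

class SimpleHash(object):
    def __init__(self, cap, seed):
        self.cap = cap    # size of the bit space (1 << 31 above)
        self.seed = seed

    def hash(self, value):
        ret = 0
        for ch in value:
            ret = ret * self.seed + ord(ch)
        # mask the result into [0, cap)
        return (self.cap - 1) & ret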
Code example #20
File: Utility.py (project actlea/MyProj)
class DupeFilterTest():   

    def __init__(self):
        self.setUp()
        
    def setUp(self):
        self.server = redis.Redis(REDIS_HOST, REDIS_PORT)
        self.key = 'Fcrawler:requests:dupefilter:'
        self.df = RFPDupeFilter(self.server, self.key) 
        
    def tearDown(self):
        self.server.delete(self.key)

    def request_dupe_filter(self, url):
        req = Request(url)
        flag = self.df.request_seen(req)
        # note: close() clears the fingerprint set in Redis
        self.df.close('nothing')
        return flag
Code example #21
 def setUp(self):
     self.key = 'scrapy_redis:tests:dupefilter:'
     self.df = RFPDupeFilter(self.server, self.key)
Code example #22
 def __init__(self, server, key, debug=False):
     RFPDupeFilter.__init__(self, server, key, debug)
     self.df = RedisBloomFilter(server, key)
Code example #23
File: scheduler.py (project xuemy/scrapy-redis)
class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_key, queue_cls, dupefilter_key):
        """Initialize scheduler.

        Parameters
        ----------
        server : Redis instance
        persist : bool
        queue_key : str
        queue_cls : queue class
        dupefilter_key : str
        """
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_key = dupefilter_key
        self.stats = None

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', REDIS_HOST)
        port = settings.get('REDIS_PORT', REDIS_PORT)
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key)

    @classmethod
    def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        # FIXME: for now, stats are only supported from this constructor
        instance.stats = crawler.stats
        return instance

    def open(self, spider):
        self.spider = spider
        self.queue = self.queue_cls(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, self.dupefilter_key % {'spider': spider.name})
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
        self.queue.push(request)

    def next_request(self):
        request = self.queue.pop()
        if request and self.stats:
            self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
        return request

    def has_pending_requests(self):
        return len(self) > 0
Code example #24
File: scheduler.py (project xuemy/scrapy-redis)
class Scheduler(object):
    """Redis-based scheduler"""
    def __init__(self, server, persist, queue_key, queue_cls, dupefilter_key):
        """Initialize scheduler.

        Parameters
        ----------
        server : Redis instance
        persist : bool
        queue_key : str
        queue_cls : queue class
        dupefilter_key : str
        """
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_key = dupefilter_key
        self.stats = None

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', REDIS_HOST)
        port = settings.get('REDIS_PORT', REDIS_PORT)
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        queue_cls = load_object(
            settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key)

    @classmethod
    def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        # FIXME: for now, stats are only supported from this constructor
        instance.stats = crawler.stats
        return instance

    def open(self, spider):
        self.spider = spider
        self.queue = self.queue_cls(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server,
                                self.dupefilter_key % {'spider': spider.name})
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" %
                       len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/redis',
                                 spider=self.spider)
        self.queue.push(request)

    def next_request(self):
        request = self.queue.pop()
        if request and self.stats:
            self.stats.inc_value('scheduler/dequeued/redis',
                                 spider=self.spider)
        return request

    def has_pending_requests(self):
        return len(self) > 0
Code example #25
File: test_dupefilter.py (project lnlantian/Scrapy)
 def setup(self):
     self.server = get_redis_mock()
     self.key = 'dupefilter:1'
     self.df = RFPDupeFilter(self.server, self.key)
Code example #26
 def test_from_crawler(self, get_redis_from_settings):
     crawler = mock.Mock(settings=self.settings)
     df = RFPDupeFilter.from_crawler(crawler)
     self.assert_dupefilter(df, get_redis_from_settings)
Code example #27
 def test_from_settings(self, get_redis_from_settings):
     df = RFPDupeFilter.from_settings(self.settings)
     self.assert_dupefilter(df, get_redis_from_settings)
Code example #28
class Scheduler(object):
    """Redis-based scheduler"""
    
    
    def __init__(self, server, persist, queue_cls, queue_key, dupefilter_key, idle_before_close):
        """Initialize scheduler.
        
        Parameters:
        -----------
            server: redis instance
            persist: bool
            queue_cls: queue class
            queue_key: str
            dupefilter_key: str
            idle_before_close: int
        """
        
        self.server = server
        self.persist = persist
        self.queue_cls = queue_cls
        self.queue_key = queue_key
        self.dupefilter_key = dupefilter_key
        self.idle_before_close = idle_before_close
        self.stats = None
        
    
    def __len__(self):
        return len(self.queue)
        
        
    @classmethod
    def from_settings(cls, settings):
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        server = redis.Redis(host, port)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        return cls(server, persist, queue_cls, queue_key, dupefilter_key, idle_before_close)
        
    
    @classmethod
    def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        # FIXME: for now, stats are only supported from this constructor
        instance.stats = crawler.stats
        return instance
    
    
    def open(self, spider):
        """Called when a spider is opened."""
        self.spider = spider
        self.queue = self.queue_cls(self.server, spider, self.queue_key)
        self.dupefilter = RFPDupeFilter(self.server, self.dupefilter_key % {'spider': spider.name})
        if self.idle_before_close < 0:
            self.idle_before_close = 0
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
        print("++++++++++++++++++++++++")
        print("scheduler open")
        print("++++++++++++++++++++++++")
        
    
    def close(self, reason):
        if not self.persist:
            self.dupefilter.clear()
            self.queue.clear()
        print("++++++++++++++++++++++++")
        print("scheduler close")
        print("++++++++++++++++++++++++")

    def enqueue_request(self, request):
        if not request.dont_filter and self.dupefilter.request_seen(request):
            return
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
        self.queue.push(request)
        print("++++++++++++++++++++++++")
        print("scheduler, push request")
        print("++++++++++++++++++++++++")
        
    
    def next_request(self):
        print("++++++++++++++++++++++++")
        print("scheduler next_request")
        print("++++++++++++++++++++++++")
        request = self.queue.pop()
        if request and self.stats:
            self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
        return request
    
    
    def has_pending_requests(self):
        return len(self) > 0
Code example #29
File: allPythonContent.py (project Mondego/pyreco)
 def setUp(self):
     self.server = redis.Redis(REDIS_HOST, REDIS_PORT)
     self.key = 'scrapy_redis:tests:dupefilter:'
     self.df = RFPDupeFilter(self.server, self.key)
Code example #30
 def setUp(self):
     self.server = redis.Redis(REDIS_HOST, REDIS_PORT)
     self.key = 'scrapy_redis:tests:dupefilter:'
     self.df = RFPDupeFilter(self.server, self.key)
Code example #31
File: Utility.py (project actlea/MyProj)
 def setUp(self):
     self.server = redis.Redis(REDIS_HOST, REDIS_PORT)
     self.key = 'Fcrawler:requests:dupefilter:'
     self.df = RFPDupeFilter(self.server, self.key) 
Code example #32
 def setUp(self):
     self.key = 'scrapy_redis:tests:dupefilter:'
     self.df = RFPDupeFilter(self.server, self.key)
Code example #33
File: test_dupefilter.py (project lnlantian/Scrapy)
 def test_from_crawler(self, get_redis_from_settings):
     crawler = mock.Mock(settings=self.settings)
     df = RFPDupeFilter.from_crawler(crawler)
     self.assert_dupefilter(df, get_redis_from_settings)
Code example #34
File: scheduler.py (project scraping-xx/scrapy-redis)
class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_key, queue_cls, dupefilter_key):
        """Initialize scheduler.

        Parameters
        ----------
        server : Redis instance
        persist : bool
        queue_key : str
        queue_cls : queue class
        dupefilter_key : str
        """
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_key = dupefilter_key

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        host = settings.get("REDIS_HOST", REDIS_HOST)
        port = settings.get("REDIS_PORT", REDIS_PORT)
        persist = settings.get("SCHEDULER_PERSIST", SCHEDULER_PERSIST)
        queue_key = settings.get("SCHEDULER_QUEUE_KEY", QUEUE_KEY)
        queue_cls = load_object(settings.get("SCHEDULER_QUEUE_CLASS", QUEUE_CLASS))
        dupefilter_key = settings.get("DUPEFILTER_KEY", DUPEFILTER_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls.from_settings(settings)

    def open(self, spider):
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, self.dupefilter_key % {"spider": spider.name})
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        return self.queue.pop()

    def has_pending_requests(self):
        return len(self) > 0
Code example #35
File: test_dupefilter.py (project lnlantian/Scrapy)
 def test_from_settings(self, get_redis_from_settings):
     df = RFPDupeFilter.from_settings(self.settings)
     self.assert_dupefilter(df, get_redis_from_settings)
Code example #36
 def setup(self):
     self.server = get_redis_mock()
     self.key = 'dupefilter:1'
     self.df = RFPDupeFilter(self.server, self.key)
Code example #37
class Scheduler(object):
    """ A RabbitMQ Scheduler for Scrapy.
    """

    def __init__(self, server, persist, queue_key, queue_cls, dupefilter_key, idle_before_close, redis_server, *args,
                 **kwargs):
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_key = dupefilter_key
        self.idle_before_close = idle_before_close
        self.stats = None
        self.redis_server = redis_server

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
        server, redis_server = connection.from_settings(settings)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key, idle_before_close, redis_server)

    @classmethod
    def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        instance.stats = crawler.stats
        return instance

    def open(self, spider):
        self.spider = spider
        self.queue = self.queue_cls(self.server, spider, self.queue_key % {'spider': spider.name})
        self.df = RFPDupeFilter(self.redis_server, self.dupefilter_key % {'spider': spider.name})

        if self.idle_before_close < 0:
            self.idle_before_close = 0

        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/rabbitmq', spider=self.spider)
        self.queue.push(request)

    def next_request(self):
        request = self.queue.pop()
        if request and self.stats:
            self.stats.inc_value('scheduler/dequeued/rabbitmq', spider=self.spider)
        return request

    def has_pending_requests(self):
        return len(self) > 0
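This variant queues requests in RabbitMQ while still deduplicating through Redis, which is why connection.from_settings is expected to return both a queue server and a Redis client. A heavily hedged sketch of such a helper (the names and settings here are assumptions, not a documented API):

import pika
import redis

def from_settings(settings):
    # hypothetical: one connection per backend
    rabbit = pika.BlockingConnection(pika.ConnectionParameters(
        host=settings.get('RABBITMQ_HOST', 'localhost')))
    server = rabbit.channel()
    redis_server = redis.Redis(settings.get('REDIS_HOST', 'localhost'),
                               settings.get('REDIS_PORT', 6379))
    return server, redis_server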