Code example #1
File: test_dupefilter.py Project: lnlantian/Scrapy
class TestRFPDupeFilter(object):
    def setup(self):
        self.server = get_redis_mock()
        self.key = 'dupefilter:1'
        self.df = RFPDupeFilter(self.server, self.key)

    def test_request_seen(self):
        req = Request('http://example.com')
        assert not self.df.request_seen(req)
        assert self.df.request_seen(req)

    def test_overridable_request_fingerprinter(self):
        req = Request('http://example.com')
        self.df.request_fingerprint = mock.Mock(
            wraps=self.df.request_fingerprint)
        assert not self.df.request_seen(req)
        self.df.request_fingerprint.assert_called_with(req)

    def test_clear_deletes(self):
        self.df.clear()
        self.server.delete.assert_called_with(self.key)

    def test_close_calls_clear(self):
        self.df.clear = mock.Mock(wraps=self.df.clear)
        self.df.close()
        self.df.close(reason='foo')
        assert self.df.clear.call_count == 2
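Both duplicate-filter tests depend on a get_redis_mock() helper that the excerpt does not include. A minimal sketch, assuming RFPDupeFilter.request_seen() is backed by Redis SADD (which returns the number of members actually added); the helper's behaviour here is inferred from how the tests use it, not taken from the project:

from unittest import mock

def get_redis_mock():
    """Hypothetical stand-in for the helper the tests above assume."""
    server = mock.MagicMock()
    seen = set()

    def sadd(key, member):
        # Mimic Redis SADD: return 1 for a new member, 0 for a repeat,
        # so request_seen() flips from False to True on the second call.
        if (key, member) in seen:
            return 0
        seen.add((key, member))
        return 1

    server.sadd.side_effect = sadd
    return server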
Code example #2
class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_key):
        self.server = server
        self.persist = persist
        self.queue_key = queue_key

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get("REDIS_HOST", REDIS_HOST)
        port = settings.get("REDIS_PORT", REDIS_PORT)
        persist = settings.get("SCHEDULER_PERSIST", SCHEDULER_PERSIST)
        queue_key = settings.get("SCHEDULER_QUEUE_KEY", QUEUE_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key)

    def open(self, spider):
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {"spider": spider.name})
        # notice if there are requests already in the queue
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        return self.queue.pop()

    def has_pending_requests(self):
        return len(self) > 0
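from_crawler falls back to module-level constants that this excerpt omits (REDIS_HOST, REDIS_PORT, SCHEDULER_PERSIST, QUEUE_KEY, DUPEFILTER_KEY). A plausible defaults block is sketched below; the concrete values are assumptions, not the project's actual settings:

# Assumed module-level defaults referenced by Scheduler.from_crawler above.
# Values are illustrative; DUPEFILTER_KEY must be a %-template because
# open() interpolates {'spider': spider.name} into it.
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
SCHEDULER_PERSIST = False
QUEUE_KEY = 'scrapy:requests'
DUPEFILTER_KEY = '%(spider)s:dupefilter'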
Code example #3
File: scheduler.py Project: darthbear/scrapy-redis
class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_key, dupe_filter):
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.dupe_filter = dupe_filter

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        host = settings.get('REDIS_HOST', REDIS_HOST)
        port = settings.get('REDIS_PORT', REDIS_PORT)
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        dupe_filter = settings.get('DUPE_FILTER', True)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key, dupe_filter)

    def open(self, spider):
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)
        if self.dupe_filter:
            self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {'spider': spider.name})
        # notice if there are requests already in the queue
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            if self.dupe_filter:
                self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.dupe_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        return self.queue.pop()

    def has_pending_requests(self):
        return len(self) > 0
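To route a crawl through a scheduler like this, the class is wired in via the project's Scrapy settings. A minimal sketch, assuming the class is importable under the usual scrapy_redis path (adjust the dotted path to wherever this Scheduler actually lives):

# settings.py -- illustrative wiring; the option names follow the constants
# the schedulers above read via settings.get(...).
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'  # assumed import path
SCHEDULER_PERSIST = True            # keep queue and dupefilter between runs
SCHEDULER_QUEUE_KEY = '%(spider)s:requests'
REDIS_HOST = 'localhost'
REDIS_PORT = 6379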
Code example #4
File: scheduler.py Project: xuemy/scrapy-redis
class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_key, queue_cls, dupefilter_key):
        """Initialize scheduler.

        Parameters
        ----------
        server : Redis instance
        persist : bool
        queue_key : str
        queue_cls : queue class
        dupefilter_key : str
        """
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_key = dupefilter_key
        self.stats = None

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', REDIS_HOST)
        port = settings.get('REDIS_PORT', REDIS_PORT)
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key)

    @classmethod
    def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        # FIXME: for now, stats are only supported from this constructor
        instance.stats = crawler.stats
        return instance

    def open(self, spider):
        self.spider = spider
        self.queue = self.queue_cls(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, self.dupefilter_key % {'spider': spider.name})
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
        self.queue.push(request)

    def next_request(self):
        request = self.queue.pop()
        if request and self.stats:
            self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
        return request

    def has_pending_requests(self):
        return len(self) > 0
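Every scheduler on this page delegates deduplication to RFPDupeFilter, whose code is not shown. Below is a minimal sketch matching the interface the examples call (request_seen, clear, close), backed by a Redis set; scrapy-redis implements essentially this shape, but treat the body as illustrative rather than the project's exact code:

from scrapy.utils.request import request_fingerprint

class RFPDupeFilter(object):
    """Sketch of a Redis-set-backed request duplicate filter."""

    def __init__(self, server, key):
        self.server = server  # Redis client
        self.key = key        # name of the fingerprint set

    def request_fingerprint(self, request):
        # Kept as a method so tests can wrap or override it (see example #1).
        return request_fingerprint(request)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        # SADD returns 0 when the member already existed in the set.
        return self.server.sadd(self.key, fp) == 0

    def clear(self):
        self.server.delete(self.key)

    def close(self, reason=''):
        self.clear()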
Code example #5
class Scheduler(object):
    """ A RabbitMQ Scheduler for Scrapy.
    """

    def __init__(self, server, persist, queue_key, queue_cls, dupefilter_key, idle_before_close, redis_server, *args,
                 **kwargs):
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_key = dupefilter_key
        self.idle_before_close = idle_before_close
        self.stats = None
        self.redis_server = redis_server

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
        server, redis_server = connection.from_settings(settings)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key, idle_before_close, redis_server)

    @classmethod
    def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        instance.stats = crawler.stats
        return instance

    def open(self, spider):
        self.spider = spider
        self.queue = self.queue_cls(self.server, spider, self.queue_key % {'spider': spider.name})
        self.df = RFPDupeFilter(self.redis_server, self.dupefilter_key % {'spider': spider.name})

        if self.idle_before_close < 0:
            self.idle_before_close = 0

        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/rabbitmq', spider=self.spider)
        self.queue.push(request)

    def next_request(self):
        request = self.queue.pop()
        if request and self.stats:
            self.stats.inc_value('scheduler/dequeued/rabbitmq', spider=self.spider)
        return request

    def has_pending_requests(self):
        return len(self) > 0
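The queue classes (SpiderQueue and whatever SCHEDULER_QUEUE_CLASS loads) are also omitted. For the Redis schedulers, a FIFO queue can be sketched as a Redis list of pickled requests; the (server, spider, key) constructor and push/pop/clear/__len__ interface follow how the schedulers above use it, while the serialization details are assumptions:

import pickle
from scrapy.utils.reqser import request_to_dict, request_from_dict

class SpiderQueue(object):
    """Sketch of a FIFO request queue on a Redis list."""

    def __init__(self, server, spider, key):
        self.server = server
        self.spider = spider  # needed to (de)serialize callbacks by name
        self.key = key

    def __len__(self):
        return self.server.llen(self.key)

    def push(self, request):
        data = pickle.dumps(request_to_dict(request, self.spider))
        self.server.lpush(self.key, data)

    def pop(self):
        data = self.server.rpop(self.key)
        if data is not None:
            return request_from_dict(pickle.loads(data), self.spider)

    def clear(self):
        self.server.delete(self.key)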
Code example #6
class Scheduler(object):
    """Redis-based scheduler"""
    def __init__(self, server, persist, queue_cls, queue_key, dupefilter_key, idle_before_close):
        """Initialize scheduler.
        
        Parameters
        ----------
        server : Redis instance
        persist : bool
        queue_cls : queue class
        queue_key : str
        dupefilter_key : str
        idle_before_close : int
        """
        
        self.server = server
        self.persist = persist
        self.queue_cls = queue_cls
        self.queue_key = queue_key
        self.dupefilter_key = dupefilter_key
        self.idle_before_close = idle_before_close
        self.stats = None

    def __len__(self):
        return len(self.queue)
    @classmethod
    def from_settings(cls, settings):
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        server = redis.Redis(host, port)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        return cls(server, persist, queue_cls, queue_key, dupefilter_key, idle_before_close)
    @classmethod
    def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        # FIXME: for now, stats are only supported from this constructor
        instance.stats = crawler.stats
        return instance
    def open(self, spider):
        """
            execute this function when open one spider
        """
        
        self.spider = spider
        self.queue = self.queue_cls(self.server, spider, self.queue_key)
        self.dupefilter = RFPDupeFilter(self.server, self.dupefilter_key % {'spider': spider.name})
        if self.idle_before_close < 0:
            self.idle_before_close = 0
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
        print "++++++++++++++++++++++++"
        print  "scheduler open"
        print "++++++++++++++++++++++++"
        
    
    def close(self, reason):
        if not self.persist:
            self.dupefilter.clear()
            self.queue.clear()
        print "++++++++++++++++++++++++"
        print "scheduler close"
        print "++++++++++++++++++++++++"
    
    
    def enqueue_request(self, request):
        if not request.dont_filter and self.dupefilter.request_seen(request):
            return
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
        self.queue.push(request)
        print("++++++++++++++++++++++++")
        print("scheduler, push request")
        print("++++++++++++++++++++++++")

    def next_request(self):
        print "++++++++++++++++++++++++"
        print "scheduler next_request"
        print "++++++++++++++++++++++++"
        request = self.queue.pop()
        if request:
           self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 
        return request
    
    
    def has_pending_requests(self):
        return len(self) > 0
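The banner prints above are handy while tracing the scheduler, but were Python 2 syntax in the original and are noisy in production. A quieter equivalent using the standard logging module (names illustrative):

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('scheduler')

# Each +++ banner trio collapses to a single leveled call:
logger.debug('scheduler open')
logger.debug('scheduler, push request')
logger.debug('scheduler next_request')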
Code example #7
File: scheduler.py Project: scraping-xx/scrapy-redis
class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_key, queue_cls, dupefilter_key):
        """Initialize scheduler.

        Parameters
        ----------
        server : Redis instance
        persist : bool
        queue_key : str
        queue_cls : queue class
        dupefilter_key : str
        """
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_key = dupefilter_key

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        host = settings.get("REDIS_HOST", REDIS_HOST)
        port = settings.get("REDIS_PORT", REDIS_PORT)
        persist = settings.get("SCHEDULER_PERSIST", SCHEDULER_PERSIST)
        queue_key = settings.get("SCHEDULER_QUEUE_KEY", QUEUE_KEY)
        queue_cls = load_object(settings.get("SCHEDULER_QUEUE_CLASS", QUEUE_CLASS))
        dupefilter_key = settings.get("DUPEFILTER_KEY", DUPEFILTER_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls.from_settings(settings)

    def open(self, spider):
        self.spider = spider
        self.queue = self.queue_cls(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, self.dupefilter_key % {"spider": spider.name})
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        return self.queue.pop()

    def has_pending_requests(self):
        return len(self) > 0
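As a quick end-to-end check of the dupefilter behaviour the tests in example #1 assert, the filter can be exercised against a real Redis. A sketch, assuming a local redis-server and the scrapy_redis package installed (the key name is arbitrary):

import redis
from scrapy import Request
from scrapy_redis.dupefilter import RFPDupeFilter

server = redis.Redis('localhost', 6379)
df = RFPDupeFilter(server, 'dupefilter:demo')

req = Request('http://example.com')
print(df.request_seen(req))  # False: first sighting stores the fingerprint
print(df.request_seen(req))  # True: fingerprint already in the Redis set
df.clear()                   # remove the fingerprint set again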