def test_base(self):
        """Walk an OverusedBuffer through several rounds with changing overused domains.

        After every ``get_next_requests`` call both the returned requests and
        the buffer's pending count are checked; the buffer ends up empty.
        """
        self.req_it = iter(self.requests)
        buf = OverusedBuffer(self.get_once, 100, 10000)
        assert buf._get_pending_count() == 0

        # Two domains are overused: r4 and r5 are returned, four requests
        # remain pending.
        first_batch = buf.get_next_requests(
            10,
            overused_keys=['www.example.com', 'example1.com'],
            key_type='domain',
        )
        assert set(first_batch) == {r4, r5}
        assert buf._get_pending_count() == 4

        # Only www.example.com is still overused: r6 is handed out.
        second_batch = buf.get_next_requests(
            10, overused_keys=['www.example.com'], key_type='domain')
        assert second_batch == [r6]
        assert buf._get_pending_count() == 3

        # Same overused key again: nothing is returned and the pending count
        # is unchanged.
        third_batch = buf.get_next_requests(
            10, overused_keys=['www.example.com'], key_type='domain')
        assert third_batch == []
        assert buf._get_pending_count() == 3

        # max_next_requests is 3 here on purpose, to exercise the
        # "len(requests) == max_next_requests" case.
        fourth_batch = buf.get_next_requests(
            3, overused_keys=['example.com'], key_type='domain')
        assert set(fourth_batch) == {r1, r2, r3}
        assert buf._get_pending_count() == 0

        # No overused keys and nothing pending: the result is empty.
        last_batch = buf.get_next_requests(
            10, overused_keys=[], key_type='domain')
        assert last_batch == []
        assert buf._get_pending_count() == 0
Beispiel #2
0
 def __init__(self, manager):
     """Initialise the parent backend and wrap its request source in an OverusedBuffer.

     The buffer is built from the parent class's ``get_next_requests`` plus
     the OVERUSED_MAX_QUEUE_SIZE and OVERUSED_MAX_KEYS settings, passed
     positionally in that order.
     """
     super(MemoryDFSOverusedBackend, self).__init__(manager)
     settings = manager.settings

     parent_get_next_requests = super(
         MemoryDFSOverusedBackend, self).get_next_requests
     max_queue_size = settings.get("OVERUSED_MAX_QUEUE_SIZE")
     max_keys = settings.get("OVERUSED_MAX_KEYS")

     self.overused_buffer = OverusedBuffer(
         parent_get_next_requests, max_queue_size, max_keys)
Beispiel #3
0
 def __init__(self, manager):
     """Connect the backend to the message bus and create its OverusedBuffer.

     All configuration is read from ``manager.settings``: the message bus
     class (MESSAGE_BUS), the codec path (MESSAGE_BUS_CODEC), the spider
     partition (SPIDER_PARTITION_ID, SPIDER_FEED_PARTITIONS), the get
     timeout (KAFKA_GET_TIMEOUT) and the OVERUSED_* buffer limits.

     Raises:
         ValueError: if SPIDER_PARTITION_ID is negative or is greater than
             or equal to SPIDER_FEED_PARTITIONS.
     """
     settings = manager.settings

     # The message bus class is resolved by load_object from the MESSAGE_BUS setting.
     bus_cls = load_object(settings.get('MESSAGE_BUS'))
     self.mb = bus_cls(settings)

     # Encoder and Decoder are looked up under the MESSAGE_BUS_CODEC path.
     codec = settings.get('MESSAGE_BUS_CODEC')
     encoder_cls = load_object(codec + ".Encoder")
     decoder_cls = load_object(codec + ".Decoder")
     send_body = settings.get('STORE_CONTENT')
     self._encoder = encoder_cls(manager.request_model, send_body=send_body)
     self._decoder = decoder_cls(manager.request_model, manager.response_model)

     self.spider_log_producer = self.mb.spider_log().producer()
     feed = self.mb.spider_feed()

     # The partition id is validated before a consumer is created for it.
     partition = int(settings.get('SPIDER_PARTITION_ID'))
     self.partition_id = partition
     if partition < 0 or partition >= settings.get('SPIDER_FEED_PARTITIONS'):
         raise ValueError(
             "Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS."
         )

     self.consumer = feed.consumer(partition_id=partition)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._logger = logging.getLogger("messagebus-backend")

     # Buffer limits are forwarded by keyword straight from the settings.
     buffer_limits = {
         'max_per_key': settings.get('OVERUSED_MAX_PER_KEY'),
         'keep_per_key': settings.get("OVERUSED_KEEP_PER_KEY"),
         'max_keys': settings.get('OVERUSED_MAX_KEYS'),
         'keep_keys': settings.get('OVERUSED_KEEP_KEYS'),
     }
     self._buffer = OverusedBuffer(self._get_next_requests, **buffer_limits)

     self._logger.info("Consuming from partition id %d", self.partition_id)
Beispiel #4
0
    def test(self):
        """Check returned requests and logged messages over several buffer rounds.

        ``self.logs`` is compared as a set against the expected
        "Overused keys" / "Pending" messages after each call and then reset.
        """
        ob = OverusedBuffer(self.get_func, self.log_func)
        self.requests = [r1, r2, r3, r4, r5, r6]

        def fetch(limit, keys):
            # Every call in this test uses the 'domain' key type.
            return ob.get_next_requests(limit, overused_keys=keys,
                                        key_type='domain')

        batch = fetch(10, ['www.example.com', 'example1.com'])
        assert set(batch) == {r4, r5}
        assert set(self.logs) == {
            "Overused keys: ['www.example.com', 'example1.com']",
            "Pending: 0",
        }
        self.logs = []

        batch = fetch(10, ['www.example.com'])
        assert batch == [r6]
        assert set(self.logs) == {
            "Overused keys: ['www.example.com']",
            "Pending: 4",
        }
        self.logs = []

        batch = fetch(10, ['www.example.com'])
        assert batch == []
        assert set(self.logs) == {
            "Overused keys: ['www.example.com']",
            "Pending: 3",
        }
        self.logs = []

        # max_next_requests is 3 here on purpose, to exercise the
        # "len(requests) == max_next_requests" case.
        batch = fetch(3, ['example.com'])
        assert set(batch) == {r1, r2, r3}
        assert set(self.logs) == {
            "Overused keys: ['example.com']",
            "Pending: 3",
        }
        self.logs = []

        batch = fetch(10, [])
        assert batch == []
        assert set(self.logs) == {"Overused keys: []", "Pending: 0"}
 def test_purging_keys(self):
     """Check the pending count after two rounds with the same two overused domains.

     The count is expected to be 9 after the first round and 7 after the
     second, where www.example.com gets purged.
     """
     self.req_it = cycle(self.requests)
     buf = OverusedBuffer(self.get_once, 10, 100)
     overused = ("example.com", "www.example.com")

     # A fresh list is handed over on every call, as separate literals would be.
     buf.get_next_requests(10, overused_keys=list(overused), key_type="domain")
     assert buf._get_pending_count() == 9

     # This round triggers the purging of www.example.com.
     buf.get_next_requests(10, overused_keys=list(overused), key_type="domain")
     assert buf._get_pending_count() == 7
    def test_purging_keys_set(self):
        """Check that the buffer's key count grows for two rounds and then gets purged.

        With all of ``self.hosts`` overused, the key count is expected to be
        10, then 20, and to drop below 20 after the third round.
        """
        self.generate_requests()
        self.req_it = cycle(self.requests)
        buf = OverusedBuffer(self.get_once, 1000, 10)

        # The first two rounds each add ten keys.
        for expected_keys in (10, 20):
            buf.get_next_requests(10, overused_keys=self.hosts,
                                  key_type="domain")
            assert buf._get_key_count() == expected_keys

        # The third round purges the key set.
        buf.get_next_requests(10, overused_keys=self.hosts, key_type="domain")
        assert buf._get_key_count() < 20
Beispiel #7
0
 def __init__(self, manager):
     """Connect the backend to the message bus and create its OverusedBuffer.

     Keeps a reference to ``manager`` and reads MESSAGE_BUS, STORE_CONTENT,
     SPIDER_PARTITION_ID and KAFKA_GET_TIMEOUT from its settings. The buffer
     is given ``self._get_next_requests`` and ``manager.logger.manager.debug``.
     """
     self._manager = manager
     settings = self._manager.settings

     # The message bus class is resolved by load_object from the MESSAGE_BUS setting.
     bus_cls = load_object(settings.get('MESSAGE_BUS'))
     self.mb = bus_cls(settings)

     send_body = settings.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=send_body)
     self._decoder = Decoder(manager.request_model, manager.response_model)

     self.spider_log_producer = self.mb.spider_log().producer()
     feed = self.mb.spider_feed()

     partition = int(settings.get('SPIDER_PARTITION_ID'))
     self.partition_id = partition
     self.consumer = feed.consumer(partition_id=partition)

     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._buffer = OverusedBuffer(
         self._get_next_requests, manager.logger.manager.debug)
Beispiel #8
0
 def __init__(self, manager):
     """Connect the backend to the message bus and create its OverusedBuffer.

     Reads MESSAGE_BUS, STORE_CONTENT, SPIDER_PARTITION_ID,
     SPIDER_FEED_PARTITIONS and KAFKA_GET_TIMEOUT from ``manager.settings``.
     The buffer is given ``self._get_next_requests`` and the debug method of
     the "messagebus-backend" logger.

     Raises:
         ValueError: if SPIDER_PARTITION_ID is negative or is greater than
             or equal to SPIDER_FEED_PARTITIONS.
     """
     settings = manager.settings

     # The message bus class is resolved by load_object from the MESSAGE_BUS setting.
     bus_cls = load_object(settings.get('MESSAGE_BUS'))
     self.mb = bus_cls(settings)

     send_body = settings.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=send_body)
     self._decoder = Decoder(manager.request_model, manager.response_model)

     self.spider_log_producer = self.mb.spider_log().producer()
     feed = self.mb.spider_feed()

     # The partition id is validated before a consumer is created for it.
     partition = int(settings.get('SPIDER_PARTITION_ID'))
     self.partition_id = partition
     if partition < 0 or partition >= settings.get('SPIDER_FEED_PARTITIONS'):
         raise ValueError(
             "Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS."
         )

     self.consumer = feed.consumer(partition_id=partition)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))

     self._logger = logging.getLogger("messagebus-backend")
     self._buffer = OverusedBuffer(
         self._get_next_requests, self._logger.debug)
     self._logger.info("Consuming from partition id %d", self.partition_id)
Beispiel #9
0
 def __init__(self, manager):
     """Initialise the parent backend and wrap its request source in an OverusedBuffer.

     The buffer receives the parent class's ``get_next_requests`` and
     ``manager.logger.manager.debug``.
     """
     parent = super(KafkaOverusedBackend, self)
     parent.__init__(manager)

     debug_log = manager.logger.manager.debug
     self._buffer = OverusedBuffer(parent.get_next_requests, debug_log)
Beispiel #10
0
 def __init__(self, manager):
     """Initialise the parent backend and wrap its request source in an OverusedBuffer.

     The buffer receives only the parent class's ``get_next_requests``.
     """
     parent = super(MemoryDFSOverusedBackend, self)
     parent.__init__(manager)
     self.overused_buffer = OverusedBuffer(parent.get_next_requests)