Example #1
 def __init__(self, manager):
     super(MemoryDFSOverusedBackend, self).__init__(manager)
     settings = manager.settings
     self.overused_buffer = OverusedBuffer(
         super(MemoryDFSOverusedBackend, self).get_next_requests,
         settings.get("OVERUSED_MAX_QUEUE_SIZE"),
         settings.get("OVERUSED_MAX_KEYS"))
Example #2
 def __init__(self, manager):
     settings = manager.settings
     messagebus = load_object(settings.get('MESSAGE_BUS'))
     self.mb = messagebus(settings)
     codec_path = settings.get('MESSAGE_BUS_CODEC')
     encoder_cls = load_object(codec_path + ".Encoder")
     decoder_cls = load_object(codec_path + ".Decoder")
     store_content = settings.get('STORE_CONTENT')
     self._encoder = encoder_cls(manager.request_model,
                                 send_body=store_content)
     self._decoder = decoder_cls(manager.request_model,
                                 manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
     if self.partition_id < 0 or self.partition_id >= settings.get(
             'SPIDER_FEED_PARTITIONS'):
         raise ValueError(
             "Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS."
         )
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._logger = logging.getLogger("messagebus-backend")
     self._buffer = OverusedBuffer(
         self._get_next_requests,
         max_per_key=settings.get('OVERUSED_MAX_PER_KEY'),
         keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"),
         max_keys=settings.get('OVERUSED_MAX_KEYS'),
         keep_keys=settings.get('OVERUSED_KEEP_KEYS'))
     self._logger.info("Consuming from partition id %d", self.partition_id)
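The four OVERUSED_* settings read here feed straight into OverusedBuffer's purge limits; the names, together with the purge tests below (Examples #3, #8 and #11), suggest that a key's pending queue is trimmed from max_per_key down to keep_per_key, and the tracked key set from max_keys down to keep_keys. A hedged way to supply them, with illustrative values only (not Frontera defaults):

settings.set('OVERUSED_MAX_PER_KEY', 1000)   # trim a key's pending queue above this size...
settings.set('OVERUSED_KEEP_PER_KEY', 100)   # ...down to this many requests
settings.set('OVERUSED_MAX_KEYS', 100)       # trim the tracked key set above this many keys...
settings.set('OVERUSED_KEEP_KEYS', 10)       # ...down to this many keys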
Example #3
    def test_purging_keys_set(self):
        self.generate_requests()
        self.req_it = cycle(self.requests)
        ob = OverusedBuffer(self.get_once, 1000, 100, 10, 1)

        ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain")
        assert ob._get_key_count() == 10

        ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain")
        assert ob._get_key_count() == 20

        ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain")  # purging of keys set
        assert ob._get_key_count() < 20
Example #4
 def __init__(self, manager):
     self._manager = manager
     settings = Settings(attributes=manager.settings.attributes)
     messagebus = load_object(settings.get('MESSAGE_BUS'))
     self.mb = messagebus(settings)
     store_content = settings.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=store_content)
     self._decoder = Decoder(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = settings.get('SPIDER_PARTITION_ID')
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   manager.logger.manager.debug)
Example #5
class MemoryDFSOverusedBackend(MemoryDFSBackend):
    def __init__(self, manager):
        super(MemoryDFSOverusedBackend, self).__init__(manager)
        self.overused_buffer = OverusedBuffer(super(MemoryDFSOverusedBackend, self).get_next_requests)

    def get_next_requests(self, max_next_requests, **kwargs):
        return self.overused_buffer.get_next_requests(max_next_requests, **kwargs)
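Whichever wrapper is used, the call shape is the same: the caller passes the currently overused slots as keyword arguments, and the buffer returns only requests whose key is not in that set, serving held-back requests on later calls. A hedged usage sketch (the manager wiring is elided; names and numbers are illustrative):

backend = MemoryDFSOverusedBackend(manager)   # manager construction elided
requests = backend.get_next_requests(
    256,                               # max_next_requests
    overused_keys=['example.com'],     # slots currently over capacity
    key_type='domain')                 # keys here are domain names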
Example #7
    def test(self):
        ob = OverusedBuffer(self.get_func, self.log_func)
        self.requests = [r1, r2, r3, r4, r5, r6]
        assert set(ob.get_next_requests(10, overused_keys=['www.example.com', 'example1.com'],
                                        key_type='domain')) == set([r4, r5])
        assert set(self.logs) == set(["Overused keys: ['www.example.com', 'example1.com']",
                                      "Pending: 0"])
        self.logs = []

        assert ob.get_next_requests(10, overused_keys=['www.example.com'],
                                    key_type='domain') == [r6]
        assert set(self.logs) == set(["Overused keys: ['www.example.com']",
                                     "Pending: 4"])
        self.logs = []

        assert ob.get_next_requests(10, overused_keys=['www.example.com'],
                                    key_type='domain') == []
        assert set(self.logs) == set(["Overused keys: ['www.example.com']",
                                      "Pending: 3"])
        self.logs = []

        # the max_next_requests is 3 here to cover the "len(requests) == max_next_requests" case.
        assert set(ob.get_next_requests(3, overused_keys=['example.com'],
                                        key_type='domain')) == set([r1, r2, r3])
        assert set(self.logs) == set(["Overused keys: ['example.com']",
                                      "Pending: 3"])
        self.logs = []

        assert ob.get_next_requests(10, overused_keys=[], key_type='domain') == []
        assert set(self.logs) == set(["Overused keys: []", "Pending: 0"])
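The test fixture is not shown in this snippet. Below is one plausible reconstruction that satisfies every assertion above; the Request class, the domain assignments and the meta layout are assumptions (the logged strings themselves are produced by OverusedBuffer):

class Request(object):
    # Toy request object; real Frontera requests carry the domain in request.meta.
    def __init__(self, url, domain):
        self.url = url
        self.meta = {'domain': {'name': domain}}

# Domains chosen so the assertions hold: r1-r3 stay pending while
# www.example.com is overused, r4/r5 pass straight through, and r6 is
# released as soon as example1.com is no longer overused.
r1, r2, r3 = [Request('http://www.example.com/%d' % i, 'www.example.com') for i in range(3)]
r4, r5 = [Request('http://example.net/%d' % i, 'example.net') for i in range(2)]
r6 = Request('http://example1.com/', 'example1.com')

class TestOverusedBuffer(object):
    def setup_method(self, method):
        self.logs = []

    def get_func(self, max_n_requests, **kwargs):
        # Drain self.requests, at most max_n_requests per call.
        batch, self.requests = self.requests[:max_n_requests], self.requests[max_n_requests:]
        return batch

    def log_func(self, msg):
        self.logs.append(msg)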
Example #8
 def test_purging_keys(self):
     self.req_it = cycle(self.requests)
     ob = OverusedBuffer(self.get_once, 10, 100)
     ob.get_next_requests(10,
                          overused_keys=["example.com", "www.example.com"],
                          key_type="domain")
     assert ob._get_pending_count() == 9
     ob.get_next_requests(10,
                          overused_keys=["example.com", "www.example.com"],
                          key_type="domain")  # purging of www.example.com
     assert ob._get_pending_count() == 7
Example #9
class KafkaOverusedBackend(KafkaBackend):
    component_name = 'Kafka Backend taking into account overused slots'

    def __init__(self, manager):
        super(KafkaOverusedBackend, self).__init__(manager)
        self._buffer = OverusedBuffer(super(KafkaOverusedBackend, self).get_next_requests,
                                      manager.logger.manager.debug)

    def get_next_requests(self, max_n_requests, **kwargs):
        return self._buffer.get_next_requests(max_n_requests, **kwargs)
Example #11
 def test_purging_keys(self):
     self.req_it = cycle(self.requests)
     ob = OverusedBuffer(self.get_once, 10, 1, 100, 10)
     ob.get_next_requests(10, overused_keys=["example.com", "www.example.com"],
                          key_type="domain")
     assert ob._get_pending_count() == 9
     ob.get_next_requests(10, overused_keys=["example.com", "www.example.com"],
                          key_type="domain") # purging of www.example.com
     assert ob._get_pending_count() == 7
Example #12
class MemoryDFSOverusedBackend(MemoryDFSBackend):
    def __init__(self, manager):
        super(MemoryDFSOverusedBackend, self).__init__(manager)
        settings = manager.settings
        self.overused_buffer = OverusedBuffer(
            super(MemoryDFSOverusedBackend, self).get_next_requests,
            settings.get("OVERUSED_MAX_QUEUE_SIZE"),
            settings.get("OVERUSED_MAX_KEYS"))

    def get_next_requests(self, max_next_requests, **kwargs):
        return self.overused_buffer.get_next_requests(max_next_requests,
                                                      **kwargs)
Example #14
 def __init__(self, manager):
     settings = manager.settings
     messagebus = load_object(settings.get('MESSAGE_BUS'))
     self.mb = messagebus(settings)
     store_content = settings.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=store_content)
     self._decoder = Decoder(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
     if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
         raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._logger = logging.getLogger("messagebus-backend")
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   self._logger.debug)
     self._logger.info("Consuming from partition id %d", self.partition_id)
Example #15
 def __init__(self, manager):
     settings = manager.settings
     messagebus = load_object(settings.get('MESSAGE_BUS'))
     self.mb = messagebus(settings)
     store_content = settings.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=store_content)
     self._decoder = Decoder(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
     if self.partition_id < 0 or self.partition_id >= settings.get(
             'SPIDER_FEED_PARTITIONS'):
         raise ValueError(
             "Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS."
         )
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._logger = logging.getLogger("messagebus-backend")
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   self._logger.debug)
     self._logger.info("Consuming from partition id %d", self.partition_id)
Example #16
 def __init__(self, manager):
     settings = manager.settings
     messagebus = load_object(settings.get('MESSAGE_BUS'))
     self.mb = messagebus(settings)
     codec_path = settings.get('MESSAGE_BUS_CODEC')
     encoder_cls = load_object(codec_path+".Encoder")
     decoder_cls = load_object(codec_path+".Decoder")
     store_content = settings.get('STORE_CONTENT')
     self._encoder = encoder_cls(manager.request_model, send_body=store_content)
     self._decoder = decoder_cls(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
     if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
         raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._logger = logging.getLogger("messagebus-backend")
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   max_per_key=settings.get('OVERUSED_MAX_PER_KEY'),
                                   keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"),
                                   max_keys=settings.get('OVERUSED_MAX_KEYS'),
                                   keep_keys=settings.get('OVERUSED_KEEP_KEYS'))
     self._logger.info("Consuming from partition id %d", self.partition_id)
Example #17
class MessageBusBackend(Backend):
    def __init__(self, manager):
        settings = manager.settings
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path+".Encoder")
        decoder_cls = load_object(codec_path+".Decoder")
        store_content = settings.get('STORE_CONTENT')
        self._encoder = encoder_cls(manager.request_model, send_body=store_content)
        self._decoder = decoder_cls(manager.request_model, manager.response_model)
        self.spider_log_producer = self.mb.spider_log().producer()
        spider_feed = self.mb.spider_feed()
        self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
        if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
            raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
        self.consumer = spider_feed.consumer(partition_id=self.partition_id)
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
        self._logger = logging.getLogger("messagebus-backend")
        self._buffer = OverusedBuffer(self._get_next_requests,
                                      max_per_key=settings.get('OVERUSED_MAX_PER_KEY'),
                                      keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"),
                                      max_keys=settings.get('OVERUSED_MAX_KEYS'),
                                      keep_keys=settings.get('OVERUSED_KEEP_KEYS'))
        self._logger.info("Consuming from partition id %d", self.partition_id)

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        self.spider_log_producer.flush()
        self.consumer.close()

    def add_seeds(self, seeds):
        raise NotImplementedError("The seeds addition using spider log isn't allowed")

    def page_crawled(self, response):
        host_fprint = get_host_fprint(response)
        self.spider_log_producer.send(host_fprint, self._encoder.encode_page_crawled(response))

    def links_extracted(self, request, links):
        per_host = aggregate_per_host(links)
        for host_fprint, host_links in six.iteritems(per_host):
            self.spider_log_producer.send(host_fprint,
                                          self._encoder.encode_links_extracted(request, host_links))

    def request_error(self, page, error):
        host_fprint = get_host_fprint(page)
        self.spider_log_producer.send(host_fprint, self._encoder.encode_request_error(page, error))

    def _get_next_requests(self, max_n_requests, **kwargs):
        requests = []
        for encoded in self.consumer.get_messages(count=max_n_requests, timeout=self._get_timeout):
            try:
                request = self._decoder.decode_request(encoded)
            except Exception as exc:
                self._logger.warning("Could not decode message: {0}, error {1}".format(encoded, str(exc)))
            else:
                requests.append(request)
        self.spider_log_producer.send(b'0123456789abcdef0123456789abcdef012345678',
                                      self._encoder.encode_offset(self.partition_id,
                                                                  self.consumer.get_offset(self.partition_id)))
        return requests

    def get_next_requests(self, max_n_requests, **kwargs):
        return self._buffer.get_next_requests(max_n_requests, **kwargs)

    def finished(self):
        return False

    @property
    def metadata(self):
        return None

    @property
    def queue(self):
        return None

    @property
    def states(self):
        return None
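page_crawled, links_extracted and request_error above rely on two helpers the snippet does not define: get_host_fprint and aggregate_per_host. Plausible stand-ins, inferred purely from their call sites; the meta layout (bytes keys, a 'domain' dict carrying a 'fingerprint') is an assumption:

def get_host_fprint(obj):
    # Fingerprint of the request/response host, read from its meta dict.
    return obj.meta[b'domain'][b'fingerprint']

def aggregate_per_host(links):
    # Group extracted links by host fingerprint, so each spider_log
    # message produced above carries links for exactly one host.
    per_host = {}
    for link in links:
        host_fprint = link.meta[b'domain'][b'fingerprint']
        per_host.setdefault(host_fprint, []).append(link)
    return per_host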
Example #18
class MessageBusBackend(Backend):
    def __init__(self, manager):
        self._manager = manager
        settings = Settings(attributes=manager.settings.attributes)
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        store_content = settings.get('STORE_CONTENT')
        self._encoder = Encoder(manager.request_model, send_body=store_content)
        self._decoder = Decoder(manager.request_model, manager.response_model)
        self.spider_log_producer = self.mb.spider_log().producer()
        spider_feed = self.mb.spider_feed()
        self.partition_id = settings.get('SPIDER_PARTITION_ID')
        self.consumer = spider_feed.consumer(partition_id=self.partition_id)
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
        self._buffer = OverusedBuffer(self._get_next_requests,
                                      manager.logger.manager.debug)

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        self.spider_log_producer.flush()

    def add_seeds(self, seeds):
        self.spider_log_producer.send(seeds[0].meta['fingerprint'],
                                      self._encoder.encode_add_seeds(seeds))

    def page_crawled(self, response, links):
        self.spider_log_producer.send(
            response.meta['fingerprint'],
            self._encoder.encode_page_crawled(response, links))

    def request_error(self, page, error):
        self.spider_log_producer.send(
            page.meta['fingerprint'],
            self._encoder.encode_request_error(page, error))

    def _get_next_requests(self, max_n_requests, **kwargs):
        requests = []
        for encoded in self.consumer.get_messages(count=max_n_requests,
                                                  timeout=self._get_timeout):
            try:
                request = self._decoder.decode_request(encoded)
                requests.append(request)
            except ValueError:
                self._manager.logger.backend.warning(
                    "Could not decode message: {0}".format(encoded))
        self.spider_log_producer.send(
            '0123456789abcdef0123456789abcdef012345678',
            self._encoder.encode_offset(self.partition_id,
                                        self.consumer.get_offset()))
        return requests

    def get_next_requests(self, max_n_requests, **kwargs):
        return self._buffer.get_next_requests(max_n_requests, **kwargs)

    def finished(self):
        return False

    @property
    def metadata(self):
        return None

    @property
    def queue(self):
        return None

    @property
    def states(self):
        return None
Example #19
    def test_purging_keys_set(self):
        self.generate_requests()
        self.req_it = cycle(self.requests)
        ob = OverusedBuffer(self.get_once, 1000, 10)

        ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain")
        assert ob._get_key_count() == 10

        ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain")
        assert ob._get_key_count() == 20

        ob.get_next_requests(10, overused_keys=self.hosts,
                             key_type="domain")  # purging of keys set
        assert ob._get_key_count() < 20
Example #20
    def test_base(self):
        self.req_it = iter(self.requests)
        ob = OverusedBuffer(self.get_once, 100, 10000)

        assert ob._get_pending_count() == 0
        assert set(
            ob.get_next_requests(
                10,
                overused_keys=['www.example.com', 'example1.com'],
                key_type='domain')) == set([r4, r5])
        assert ob._get_pending_count() == 4
        assert ob.get_next_requests(10,
                                    overused_keys=['www.example.com'],
                                    key_type='domain') == [r6]
        assert ob._get_pending_count() == 3

        assert ob.get_next_requests(10,
                                    overused_keys=['www.example.com'],
                                    key_type='domain') == []
        assert ob._get_pending_count() == 3

        # the max_next_requests is 3 here to cover the "len(requests) == max_next_requests" case.
        assert set(
            ob.get_next_requests(3,
                                 overused_keys=['example.com'],
                                 key_type='domain')) == set([r1, r2, r3])
        assert ob._get_pending_count() == 0

        assert ob.get_next_requests(10, overused_keys=[],
                                    key_type='domain') == []
        assert ob._get_pending_count() == 0
Example #21
 def __init__(self, manager):
     super(KafkaOverusedBackend, self).__init__(manager)
     self._buffer = OverusedBuffer(super(KafkaOverusedBackend, self).get_next_requests,
                                   manager.logger.manager.debug)
Example #22
class MessageBusBackend(Backend):
    def __init__(self, manager):
        settings = manager.settings
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        store_content = settings.get('STORE_CONTENT')
        self._encoder = Encoder(manager.request_model, send_body=store_content)
        self._decoder = Decoder(manager.request_model, manager.response_model)
        self.spider_log_producer = self.mb.spider_log().producer()
        spider_feed = self.mb.spider_feed()
        self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
        if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
            raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
        self.consumer = spider_feed.consumer(partition_id=self.partition_id)
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
        self._logger = logging.getLogger("messagebus-backend")
        self._buffer = OverusedBuffer(self._get_next_requests,
                                      self._logger.debug)
        self._logger.info("Consuming from partition id %d", self.partition_id)

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        self.spider_log_producer.flush()

    def add_seeds(self, seeds):
        self.spider_log_producer.send(seeds[0].meta['fingerprint'], self._encoder.encode_add_seeds(seeds))

    def page_crawled(self, response, links):
        self.spider_log_producer.send(response.meta['fingerprint'], self._encoder.encode_page_crawled(response, links))

    def request_error(self, page, error):
        self.spider_log_producer.send(page.meta['fingerprint'], self._encoder.encode_request_error(page, error))

    def _get_next_requests(self, max_n_requests, **kwargs):
        requests = []
        for encoded in self.consumer.get_messages(count=max_n_requests, timeout=self._get_timeout):
            try:
                request = self._decoder.decode_request(encoded)
            except Exception as exc:
                self._logger.warning("Could not decode message: {0}, error {1}".format(encoded, str(exc)))
            else:
                requests.append(request)
        self.spider_log_producer.send('0123456789abcdef0123456789abcdef012345678',
                                      self._encoder.encode_offset(self.partition_id, self.consumer.get_offset()))
        return requests

    def get_next_requests(self, max_n_requests, **kwargs):
        return self._buffer.get_next_requests(max_n_requests, **kwargs)

    def finished(self):
        return False

    @property
    def metadata(self):
        return None

    @property
    def queue(self):
        return None

    @property
    def states(self):
        return None
Example #23
class MessageBusBackend(Backend):
    def __init__(self, manager):
        self._manager = manager
        settings = Settings(attributes=manager.settings.attributes)
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        store_content = settings.get('STORE_CONTENT')
        self._encoder = Encoder(manager.request_model, send_body=store_content)
        self._decoder = Decoder(manager.request_model, manager.response_model)
        self.spider_log_producer = self.mb.spider_log().producer()
        spider_feed = self.mb.spider_feed()
        self.partition_id = settings.get('SPIDER_PARTITION_ID')
        self.consumer = spider_feed.consumer(partition_id=self.partition_id)
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
        self._buffer = OverusedBuffer(self._get_next_requests,
                                      manager.logger.manager.debug)

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        self.spider_log_producer.flush()

    def add_seeds(self, seeds):
        self.spider_log_producer.send(seeds[0].meta['fingerprint'], self._encoder.encode_add_seeds(seeds))

    def page_crawled(self, response, links):
        self.spider_log_producer.send(response.meta['fingerprint'], self._encoder.encode_page_crawled(response, links))

    def request_error(self, page, error):
        self.spider_log_producer.send(page.meta['fingerprint'], self._encoder.encode_request_error(page, error))

    def _get_next_requests(self, max_n_requests, **kwargs):
        requests = []
        for encoded in self.consumer.get_messages(count=max_n_requests, timeout=self._get_timeout):
            try:
                request = self._decoder.decode_request(encoded)
                requests.append(request)
            except ValueError:
                self._manager.logger.backend.warning("Could not decode message: {0}".format(encoded))
        self.spider_log_producer.send('0123456789abcdef0123456789abcdef012345678',
                                      self._encoder.encode_offset(self.partition_id, self.consumer.get_offset()))
        return requests

    def get_next_requests(self, max_n_requests, **kwargs):
        return self._buffer.get_next_requests(max_n_requests, **kwargs)

    def finished(self):
        return False

    @property
    def metadata(self):
        return None

    @property
    def queue(self):
        return None

    @property
    def states(self):
        return None
Example #24
 def __init__(self, manager):
     super(KafkaOverusedBackend, self).__init__(manager)
     self._buffer = OverusedBuffer(
         super(KafkaOverusedBackend, self).get_next_requests,
         manager.logger.manager.debug)
Example #25
 def __init__(self, manager):
     super(MemoryDFSOverusedBackend, self).__init__(manager)
     self.overused_buffer = OverusedBuffer(super(MemoryDFSOverusedBackend, self).get_next_requests)
Example #26
class MessageBusBackend(Backend):
    def __init__(self, manager):
        settings = manager.settings
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")
        store_content = settings.get('STORE_CONTENT')
        self._encoder = encoder_cls(manager.request_model,
                                    send_body=store_content)
        self._decoder = decoder_cls(manager.request_model,
                                    manager.response_model)
        self.spider_log_producer = self.mb.spider_log().producer()
        spider_feed = self.mb.spider_feed()
        self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
        if self.partition_id < 0 or self.partition_id >= settings.get(
                'SPIDER_FEED_PARTITIONS'):
            raise ValueError(
                "Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS."
            )
        self.consumer = spider_feed.consumer(partition_id=self.partition_id)
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
        self._logger = logging.getLogger("messagebus-backend")
        self._buffer = OverusedBuffer(
            self._get_next_requests,
            max_per_key=settings.get('OVERUSED_MAX_PER_KEY'),
            keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"),
            max_keys=settings.get('OVERUSED_MAX_KEYS'),
            keep_keys=settings.get('OVERUSED_KEEP_KEYS'))
        self._logger.info("Consuming from partition id %d", self.partition_id)

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        self.spider_log_producer.flush()
        self.consumer.close()

    def add_seeds(self, seeds):
        raise NotImplementedError(
            "The seeds addition using spider log isn't allowed")

    def page_crawled(self, response):
        host_fprint = get_host_fprint(response)
        self.spider_log_producer.send(
            host_fprint, self._encoder.encode_page_crawled(response))

    def links_extracted(self, request, links):
        per_host = aggregate_per_host(links)
        for host_fprint, host_links in six.iteritems(per_host):
            self.spider_log_producer.send(
                host_fprint,
                self._encoder.encode_links_extracted(request, host_links))

    def request_error(self, page, error):
        host_fprint = get_host_fprint(page)
        self.spider_log_producer.send(
            host_fprint, self._encoder.encode_request_error(page, error))

    def _get_next_requests(self, max_n_requests, **kwargs):
        requests = []
        for encoded in self.consumer.get_messages(count=max_n_requests,
                                                  timeout=self._get_timeout):
            try:
                request = self._decoder.decode_request(encoded)
            except Exception as exc:
                self._logger.warning(
                    "Could not decode message: {0}, error {1}".format(
                        encoded, str(exc)))
            else:
                requests.append(request)
        self.spider_log_producer.send(
            b'0123456789abcdef0123456789abcdef012345678',
            self._encoder.encode_offset(
                self.partition_id,
                self.consumer.get_offset(self.partition_id)))
        return requests

    def get_next_requests(self, max_n_requests, **kwargs):
        return self._buffer.get_next_requests(max_n_requests, **kwargs)

    def finished(self):
        return False

    @property
    def metadata(self):
        return None

    @property
    def queue(self):
        return None

    @property
    def states(self):
        return None
Example #27
 def __init__(self, manager):
     super(MemoryDFSOverusedBackend, self).__init__(manager)
     self.overused_buffer = OverusedBuffer(
         super(MemoryDFSOverusedBackend, self).get_next_requests)
Example #28
    def test_base(self):
        self.req_it = iter(self.requests)
        ob = OverusedBuffer(self.get_once, None, 100, None, 100)

        assert ob._get_pending_count() == 0
        assert set(ob.get_next_requests(10, overused_keys=['www.example.com', 'example1.com'],
                                        key_type='domain')) == set([r4, r5])
        assert ob._get_pending_count() == 4
        assert ob.get_next_requests(10, overused_keys=['www.example.com'],
                                    key_type='domain') == [r6]
        assert ob._get_pending_count() == 3

        assert ob.get_next_requests(10, overused_keys=['www.example.com'],
                                    key_type='domain') == []
        assert ob._get_pending_count() == 3

        # the max_next_requests is 3 here to cover the "len(requests) == max_next_requests" case.
        assert set(ob.get_next_requests(3, overused_keys=['example.com'],
                                        key_type='domain')) == set([r1, r2, r3])
        assert ob._get_pending_count() == 0

        assert ob.get_next_requests(10, overused_keys=[], key_type='domain') == []
        assert ob._get_pending_count() == 0
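Taken together, the examples show OverusedBuffer's constructor changing shape across Frontera versions: (get_func), (get_func, log_func), positional size limits, and finally the keyword purge limits of Examples #2 and #16. A hedged compatibility shim for constructing it across those variants (Python 3 only; the parameter names it probes for are assumptions lifted from the keyword calls above):

import inspect

def make_overused_buffer(get_func, log_func=None, **limits):
    params = inspect.signature(OverusedBuffer.__init__).parameters
    if 'max_per_key' in params:
        # Newer signature: purge limits passed as keywords.
        return OverusedBuffer(get_func, **limits)
    if 'log_func' in params:
        # Older signature: a debug-logging callable as the second argument.
        return OverusedBuffer(get_func, log_func)
    # Oldest signature: just the fetch callable.
    return OverusedBuffer(get_func)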