def test_base(self):
    # End-to-end check of OverusedBuffer bookkeeping: requests whose key is
    # "overused" are parked in the buffer, others pass straight through.
    # The source iterator is exhausted exactly once (iter, not cycle).
    self.req_it = iter(self.requests)
    ob = OverusedBuffer(self.get_once, 100, 10000)
    # Buffer starts with nothing pending.
    assert ob._get_pending_count() == 0
    # First fetch: only r4 and r5 come out; the rest are parked (4 pending).
    assert set(
        ob.get_next_requests(
            10, overused_keys=['www.example.com', 'example1.com'],
            key_type='domain')) == set([r4, r5])
    assert ob._get_pending_count() == 4
    # example1.com is no longer overused, so r6 is released from the buffer.
    assert ob.get_next_requests(10, overused_keys=['www.example.com'],
                                key_type='domain') == [r6]
    assert ob._get_pending_count() == 3
    # Same overused set again: nothing new to release, pending unchanged.
    assert ob.get_next_requests(10, overused_keys=['www.example.com'],
                                key_type='domain') == []
    assert ob._get_pending_count() == 3
    #the max_next_requests is 3 here to cover the "len(requests) == max_next_requests" case.
    assert set(
        ob.get_next_requests(3, overused_keys=['example.com'],
                             key_type='domain')) == set([r1, r2, r3])
    # Buffer fully drained.
    assert ob._get_pending_count() == 0
    # No overused keys and an exhausted source: nothing left to return.
    assert ob.get_next_requests(10, overused_keys=[], key_type='domain') == []
    assert ob._get_pending_count() == 0
def __init__(self, manager):
    """Initialize the parent backend and wrap its request feed in an
    OverusedBuffer sized from the OVERUSED_* settings."""
    parent = super(MemoryDFSOverusedBackend, self)
    parent.__init__(manager)
    cfg = manager.settings
    # Buffer pulls from the parent's get_next_requests when keys clear.
    self.overused_buffer = OverusedBuffer(
        parent.get_next_requests,
        cfg.get("OVERUSED_MAX_QUEUE_SIZE"),
        cfg.get("OVERUSED_MAX_KEYS"))
def __init__(self, manager):
    """Wire the backend to the message bus: build codec, producer/consumer
    for this spider partition, and an OverusedBuffer over the feed."""
    cfg = manager.settings

    # Message bus and codec classes are resolved from dotted-path settings.
    bus_cls = load_object(cfg.get('MESSAGE_BUS'))
    self.mb = bus_cls(cfg)
    codec = cfg.get('MESSAGE_BUS_CODEC')
    make_encoder = load_object(codec + ".Encoder")
    make_decoder = load_object(codec + ".Decoder")
    self._encoder = make_encoder(manager.request_model,
                                 send_body=cfg.get('STORE_CONTENT'))
    self._decoder = make_decoder(manager.request_model,
                                 manager.response_model)

    self.spider_log_producer = self.mb.spider_log().producer()

    # Validate the partition id before attaching a consumer to it.
    self.partition_id = int(cfg.get('SPIDER_PARTITION_ID'))
    if not 0 <= self.partition_id < cfg.get('SPIDER_FEED_PARTITIONS'):
        raise ValueError(
            "Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS."
        )
    self.consumer = self.mb.spider_feed().consumer(
        partition_id=self.partition_id)
    self._get_timeout = float(cfg.get('KAFKA_GET_TIMEOUT'))

    self._logger = logging.getLogger("messagebus-backend")
    # Buffer limits are fully keyword-configured from OVERUSED_* settings.
    self._buffer = OverusedBuffer(
        self._get_next_requests,
        max_per_key=cfg.get('OVERUSED_MAX_PER_KEY'),
        keep_per_key=cfg.get("OVERUSED_KEEP_PER_KEY"),
        max_keys=cfg.get('OVERUSED_MAX_KEYS'),
        keep_keys=cfg.get('OVERUSED_KEEP_KEYS'))
    self._logger.info("Consuming from partition id %d", self.partition_id)
def test(self):
    # Variant of the buffer test that also verifies the debug-log callback:
    # each get_next_requests call logs the overused key set and the pending
    # count *before* the call's effect.
    ob = OverusedBuffer(self.get_func, self.log_func)
    self.requests = [r1, r2, r3, r4, r5, r6]
    # r4/r5 pass through; the rest are parked. Pending was 0 at call time.
    assert set(ob.get_next_requests(10, overused_keys=['www.example.com', 'example1.com'],
               key_type='domain')) == set([r4, r5])
    assert set(self.logs) == set(["Overused keys: ['www.example.com', 'example1.com']",
                                  "Pending: 0"])
    self.logs = []
    # example1.com cleared: r6 is released; 4 were pending beforehand.
    assert ob.get_next_requests(10, overused_keys=['www.example.com'],
                                key_type='domain') == [r6]
    assert set(self.logs) == set(["Overused keys: ['www.example.com']", "Pending: 4"])
    self.logs = []
    # Same overused set: nothing released, 3 still pending.
    assert ob.get_next_requests(10, overused_keys=['www.example.com'],
                                key_type='domain') == []
    assert set(self.logs) == set(["Overused keys: ['www.example.com']", "Pending: 3"])
    self.logs = []
    #the max_next_requests is 3 here to cover the "len(requests) == max_next_requests" case.
    assert set(ob.get_next_requests(3, overused_keys=['example.com'],
               key_type='domain')) == set([r1, r2, r3])
    assert set(self.logs) == set(["Overused keys: ['example.com']", "Pending: 3"])
    self.logs = []
    # Everything drained: empty result, zero pending logged.
    assert ob.get_next_requests(10, overused_keys=[], key_type='domain') == []
    assert set(self.logs) == set(["Overused keys: []", "Pending: 0"])
def test_purging_keys(self):
    # Exercise per-key purging: max_per_key=10, max_keys=100, with an
    # endless (cycled) request source so the buffer keeps filling.
    self.req_it = cycle(self.requests)
    ob = OverusedBuffer(self.get_once, 10, 100)
    ob.get_next_requests(10, overused_keys=["example.com", "www.example.com"],
                         key_type="domain")
    assert ob._get_pending_count() == 9
    # Second fill pushes www.example.com over its per-key limit, triggering
    # a purge that drops pending entries (9 -> 7 rather than growing).
    ob.get_next_requests(10, overused_keys=["example.com", "www.example.com"],
                         key_type="domain")  # purging of www.example.com
    assert ob._get_pending_count() == 7
def test_purging_keys_set(self):
    # Exercise purging of the *key set* itself: generous per-key room (1000)
    # but only 10 keys allowed, with an endless request source.
    self.generate_requests()
    self.req_it = cycle(self.requests)
    ob = OverusedBuffer(self.get_once, 1000, 10)
    ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain")
    assert (ob._get_key_count()) == 10
    # Key count can temporarily exceed the limit before a purge runs.
    ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain")
    assert (ob._get_key_count()) == 20
    # Third call trips the purge, shrinking the tracked key set.
    ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain")  # purging of keys set
    assert (ob._get_key_count()) < 20
def __init__(self, manager):
    """Connect this backend to the message bus and buffer its spider feed."""
    self._manager = manager
    cfg = self._manager.settings

    # Instantiate the configured message bus implementation.
    bus_cls = load_object(cfg.get('MESSAGE_BUS'))
    self.mb = bus_cls(cfg)

    # Codec: request bodies travel only when STORE_CONTENT is enabled.
    self._encoder = Encoder(manager.request_model,
                            send_body=cfg.get('STORE_CONTENT'))
    self._decoder = Decoder(manager.request_model, manager.response_model)

    self.spider_log_producer = self.mb.spider_log().producer()
    self.partition_id = int(cfg.get('SPIDER_PARTITION_ID'))
    self.consumer = self.mb.spider_feed().consumer(
        partition_id=self.partition_id)
    self._get_timeout = float(cfg.get('KAFKA_GET_TIMEOUT'))
    # Buffer reports its state through the manager's debug logger.
    self._buffer = OverusedBuffer(self._get_next_requests,
                                  manager.logger.manager.debug)
def __init__(self, manager):
    """Attach to the message bus, validate the spider partition, and wrap
    the feed consumer in an OverusedBuffer with debug logging."""
    cfg = manager.settings
    bus_cls = load_object(cfg.get('MESSAGE_BUS'))
    self.mb = bus_cls(cfg)

    # Encode requests (optionally with bodies) and decode responses.
    self._encoder = Encoder(manager.request_model,
                            send_body=cfg.get('STORE_CONTENT'))
    self._decoder = Decoder(manager.request_model, manager.response_model)

    self.spider_log_producer = self.mb.spider_log().producer()

    # Reject out-of-range partition ids before subscribing.
    self.partition_id = int(cfg.get('SPIDER_PARTITION_ID'))
    if not 0 <= self.partition_id < cfg.get('SPIDER_FEED_PARTITIONS'):
        raise ValueError(
            "Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS."
        )
    self.consumer = self.mb.spider_feed().consumer(
        partition_id=self.partition_id)
    self._get_timeout = float(cfg.get('KAFKA_GET_TIMEOUT'))

    self._logger = logging.getLogger("messagebus-backend")
    self._buffer = OverusedBuffer(self._get_next_requests,
                                  self._logger.debug)
    self._logger.info("Consuming from partition id %d", self.partition_id)
def __init__(self, manager):
    """Initialize the base Kafka backend, then interpose an OverusedBuffer
    in front of its request feed."""
    base = super(KafkaOverusedBackend, self)
    base.__init__(manager)
    # Buffer delegates to the parent's feed and logs via the manager logger.
    self._buffer = OverusedBuffer(base.get_next_requests,
                                  manager.logger.manager.debug)
def __init__(self, manager):
    """Initialize the parent DFS backend and buffer its request feed."""
    base = super(MemoryDFSOverusedBackend, self)
    base.__init__(manager)
    # Default-configured buffer over the parent's get_next_requests.
    self.overused_buffer = OverusedBuffer(base.get_next_requests)