def test_fingerprint_partitioner():
    partitions = range(0, 5)
    fp = FingerprintPartitioner(partitions)
    key = '1be68ff556fd0bbe5802d1a100850da29f7f15b1'
    partition = fp.partition(key, partitions)
    assert partition == 4

    partition = fp.partition(key, None)
    assert partition == 4
def test_fingerprint_partitioner():
    partitions = list(range(0, 5))
    fp = FingerprintPartitioner(partitions)
    key = b'1be68ff556fd0bbe5802d1a100850da29f7f15b1'
    # `request` is a module-level fixture in the test file: a Request whose
    # meta carries this fingerprint.
    assert fp.get_key(request) == key
    partition = fp.partition(key, partitions)
    assert partition == 1

    partition = fp.partition(key, None)
    assert partition == 1
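Both tests above exercise the same interface. The sketch below is illustrative only (the import path and the four-consumer setup are assumptions, not from the tests): a partitioner built over a list of partition ids maps a request fingerprint to one of them deterministically.

# Illustrative usage, assuming frontera's FingerprintPartitioner with the
# interface exercised in the tests above.
from frontera.contrib.backends.partitioners import FingerprintPartitioner

partitions = list(range(4))                      # e.g. four downstream consumers
partitioner = FingerprintPartitioner(partitions)

fingerprint = b'1be68ff556fd0bbe5802d1a100850da29f7f15b1'
partition = partitioner.partition(fingerprint, partitions)
assert partition in partitions
# Repeated calls with the same fingerprint return the same partition, so every
# message about a given URL is routed to the same consumer.
assert partitioner.partition(fingerprint, partitions) == partition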
def producer(self):
    return KeyedProducer(self._location, self._enable_ssl, self._cert_path,
                         self._topic, FingerprintPartitioner(self._partitions),
                         self._codec, batch_size=DEFAULT_BATCH_SIZE,
                         buffer_memory=DEFAULT_BUFFER_MEMORY)
def __init__(self, auth, project_id, frontier, batch_size, flush_interval,
             slots_count, slot_prefix, cleanup_on_start):
    self.hcf = HCFClientWrapper(auth=auth,
                                project_id=project_id,
                                frontier=frontier,
                                batch_size=batch_size,
                                flush_interval=flush_interval)
    self.hcf_slots_count = slots_count
    self.hcf_slot_prefix = slot_prefix
    self.logger = logging.getLogger("hcf.queue")
    self.consumed_batches_ids = dict()
    self.partitions = [self.hcf_slot_prefix + str(i) for i in range(0, slots_count)]
    self.partitioner = FingerprintPartitioner(self.partitions)
    if cleanup_on_start:
        for partition_id in self.partitions:
            self.hcf.delete_slot(partition_id)
def producer(self):
    partitioner = Crc32NamePartitioner(self._partitions) if self._hostname_partitioning \
        else FingerprintPartitioner(self._partitions)
    return KeyedProducer(self._location, self._enable_ssl, self._cert_path,
                         self._topic, partitioner, self._codec,
                         batch_size=DEFAULT_BATCH_SIZE,
                         buffer_memory=DEFAULT_BUFFER_MEMORY)
def producer(self):
    partitioner = Crc32NamePartitioner(self._partitions) if self._hostname_partitioning \
        else FingerprintPartitioner(self._partitions)
    return KeyedProducer(self._location, self._enable_ssl, self._cert_path,
                         self._topic, partitioner, self._codec)
def producer(self):
    return KeyedProducer(self._location, self._enable_ssl, self._cert_path,
                         self._topic, FingerprintPartitioner(self._partitions),
                         self._codec)
def __init__(self, context, location, partitions, hwm, hostname_partitioning):
    super(SpiderFeedProducer, self).__init__(context, location, b'sf')
    self.partitioner = Crc32NamePartitioner(partitions) if hostname_partitioning else \
        FingerprintPartitioner(partitions)
    self.sender.set(zmq.SNDHWM, hwm)
def __init__(self, context, location, partitions):
    super(SpiderLogProducer, self).__init__(context, location, b'sl')
    self.partitioner = FingerprintPartitioner(partitions)
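The producers above either always use FingerprintPartitioner (the spider log) or switch to Crc32NamePartitioner when hostname partitioning is enabled (the spider feed and the Kafka producers). The sketch below is a hedged comparison, not taken from the source; it assumes frontera's two partitioners and the call signatures already used in these snippets, and contrasts the keys each one expects.

# Illustrative comparison only, assuming frontera's partitioners.
from frontera.contrib.backends.partitioners import (
    Crc32NamePartitioner,
    FingerprintPartitioner,
)

partitions = list(range(2))
by_host = Crc32NamePartitioner(partitions)
by_fingerprint = FingerprintPartitioner(partitions)

# Hostname partitioning keys on the domain name, so every request for
# www.example.com lands on the same partition.
host_partition = by_host.partition(b'www.example.com', partitions)

# Fingerprint partitioning keys on the request fingerprint, so URLs from a
# single host are spread across partitions.
fp_partition = by_fingerprint.partition(
    b'1be68ff556fd0bbe5802d1a100850da29f7f15b1', partitions)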
class HCFQueue(Queue):
    def __init__(self, auth, project_id, frontier, batch_size, flush_interval,
                 slots_count, slot_prefix, cleanup_on_start):
        self.hcf = HCFClientWrapper(auth=auth,
                                    project_id=project_id,
                                    frontier=frontier,
                                    batch_size=batch_size,
                                    flush_interval=flush_interval)
        self.hcf_slots_count = slots_count
        self.hcf_slot_prefix = slot_prefix
        self.logger = logging.getLogger("hcf.queue")
        self.consumed_batches_ids = dict()
        self.partitions = [self.hcf_slot_prefix + str(i) for i in range(0, slots_count)]
        self.partitioner = FingerprintPartitioner(self.partitions)
        if cleanup_on_start:
            for partition_id in self.partitions:
                self.hcf.delete_slot(partition_id)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        self.hcf.close()

    def get_next_requests(self, max_next_requests, partition_id, **kwargs):
        return_requests = []
        data = True
        while data and len(return_requests) < max_next_requests:
            data = False
            consumed = []
            for batch in self.hcf.read(partition_id, max_next_requests):
                batch_id = batch['id']
                requests = batch['requests']
                data = len(requests) == max_next_requests
                self.logger.debug("got batch %s of size %d from HCF server" % (batch_id, len(requests)))
                for fingerprint, qdata in requests:
                    decoded = _convert_from_saved_type(qdata)
                    request = Request(decoded.get('url', fingerprint), **decoded['request'])
                    if request is not None:
                        request.meta.update({
                            'created_at': datetime.utcnow(),
                            'depth': 0,
                        })
                        request.meta.setdefault(b'scrapy_meta', {})
                        return_requests.append(request)
                consumed.append(batch_id)
            if consumed:
                self.hcf.delete(partition_id, consumed)
        return return_requests

    def schedule(self, batch):
        scheduled = 0
        for _, score, request, schedule in batch:
            if schedule:
                self._process_hcf_link(request, score)
                scheduled += 1
        self.logger.info('scheduled %d links' % scheduled)

    def _process_hcf_link(self, link, score):
        link.meta.pop(b'origin_is_frontier', None)
        hcf_request = {'fp': getattr(link, 'meta', {}).get('hcf_fingerprint', link.url)}
        qdata = {'request': {'method': link.method,
                             'headers': link.headers,
                             'cookies': link.cookies,
                             'meta': link.meta}}
        hcf_request['qdata'] = _convert_and_save_type(qdata)
        partition_id = self.partitioner.partition(link.meta[b'fingerprint'])
        slot = self.hcf_slot_prefix + str(partition_id)
        self.hcf.add_request(slot, hcf_request)

    def count(self):
        """
        Calculates a lower estimate of the number of items in the queue, over all partitions.

        :return: int
        """
        count = 0
        for partition_id in self.partitions:
            for batch in self.hcf.read(partition_id):
                count += len(batch['requests'])
        return count
def __init__(self):
    self.messages = []
    self.offset = 0
    self.partitioner = FingerprintPartitioner([0])
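The last snippet is the constructor of an in-memory producer stub used in tests. Below is a hedged sketch of how such a stub might use its partitioner when sending; the class name FakeProducer and the send() body are assumptions for illustration, not taken from the source.

class FakeProducer(object):
    """Hypothetical in-memory producer; only __init__ mirrors the stub above."""

    def __init__(self):
        self.messages = []
        self.offset = 0
        self.partitioner = FingerprintPartitioner([0])

    def send(self, key, *messages):
        # With the single partition [0], every fingerprint maps to partition 0,
        # so all messages are stored in one list; the offset mimics a broker.
        partition = self.partitioner.partition(key)
        self.messages.extend(messages)
        self.offset += len(messages)
        return partition

# Example call with a fingerprint-style key:
# FakeProducer().send(b'1be68ff556fd0bbe5802d1a100850da29f7f15b1', b'message')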