Example #1
def test_fingerprint_partitioner():
    partitions = range(0, 5)
    fp = FingerprintPartitioner(partitions)
    key = '1be68ff556fd0bbe5802d1a100850da29f7f15b1'
    partition = fp.partition(key, partitions)
    assert partition == 4

    partition = fp.partition(key, None)
    assert partition == 4
Example #2
def test_fingerprint_partitioner():
    partitions = list(range(0, 5))
    fp = FingerprintPartitioner(partitions)

    key = b'1be68ff556fd0bbe5802d1a100850da29f7f15b1'
    # `request` was undefined in the original snippet; assuming a frontera
    # Request (frontera.core.models.Request) whose meta carries the
    # fingerprint used as the partition key.
    request = Request('https://www.example.com', meta={b'fingerprint': key})
    assert fp.get_key(request) == key

    partition = fp.partition(key, partitions)
    assert partition == 1

    partition = fp.partition(key, None)
    assert partition == 1
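Neither test shows the partitioner itself, and the two snippets evidently target different versions of it (str vs bytes keys, expected partitions 4 vs 1). Below is a minimal sketch consistent with Example #1, assuming the key is a hex fingerprint and the partitioner reads four byte pairs of it as a little-endian 32-bit integer; the exact slicing and the get_key() behavior are assumptions, not necessarily the library's code:

from binascii import unhexlify
from struct import unpack

class FingerprintPartitioner(object):
    """Maps a hex fingerprint (str or bytes) onto a fixed set of partitions."""

    def __init__(self, partitions):
        self.partitions = partitions

    def partition(self, key, partitions=None):
        if partitions is None:
            partitions = self.partitions
        # Take four byte pairs of the hex digest and read them as a
        # little-endian unsigned 32-bit integer (slicing is an assumption).
        digest = unhexlify(key[0:2] + key[5:7] + key[10:12] + key[15:17])
        value = unpack("<I", digest)[0]
        return partitions[value % len(partitions)]

    def get_key(self, request):
        # Assumes a fingerprint middleware stored the key in the request meta.
        return request.meta[b'fingerprint']

For the fingerprint above this yields 0xe5fdff1b, and 0xe5fdff1b % 5 == 4, matching Example #1's asserts; Example #2's expected value of 1 presumably comes from a revised hashing scheme.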
Example #3
def producer(self):
    return KeyedProducer(self._location,
                         self._enable_ssl,
                         self._cert_path,
                         self._topic,
                         FingerprintPartitioner(self._partitions),
                         self._codec,
                         batch_size=DEFAULT_BATCH_SIZE,
                         buffer_memory=DEFAULT_BUFFER_MEMORY)
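Example #3 hands the partitioner to a Kafka KeyedProducer, which consults it when routing each keyed message. The producer's internals are not shown anywhere in these examples; the sketch below illustrates only the routing step, and the send() method and in-memory buffering are illustrative assumptions, not the real client:

class KeyedProducer(object):
    """Sketch of a keyed producer: routes messages by partitioned key."""

    def __init__(self, location, enable_ssl, cert_path, topic, partitioner,
                 codec, **kwargs):
        # location/ssl/cert/codec would configure the underlying Kafka
        # client; elided in this sketch.
        self._topic = topic
        self._partitioner = partitioner
        self._buffers = {}  # partition -> queued messages (stand-in for Kafka)

    def send(self, key, *messages):
        # Ask the partitioner which partition this key maps to, then queue
        # the messages for that partition.
        partition = self._partitioner.partition(key)
        self._buffers.setdefault(partition, []).extend(messages)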
Example #4
    def __init__(self, auth, project_id, frontier, batch_size, flush_interval,
                 slots_count, slot_prefix, cleanup_on_start):
        self.hcf = HCFClientWrapper(auth=auth,
                                    project_id=project_id,
                                    frontier=frontier,
                                    batch_size=batch_size,
                                    flush_interval=flush_interval)
        self.hcf_slots_count = slots_count
        self.hcf_slot_prefix = slot_prefix
        self.logger = logging.getLogger("hcf.queue")
        self.consumed_batches_ids = dict()
        self.partitions = [
            self.hcf_slot_prefix + str(i) for i in range(0, slots_count)
        ]
        self.partitioner = FingerprintPartitioner(self.partitions)

        if cleanup_on_start:
            for partition_id in self.partitions:
                self.hcf.delete_slot(partition_id)
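Note that in this backend the partitions are HCF slot names rather than integers: with made-up values slot_prefix='test' and slots_count=3, the list is ['test0', 'test1', 'test2'], and the partitioner deterministically maps each fingerprint to one of those names:

partitions = ['test' + str(i) for i in range(3)]
partitioner = FingerprintPartitioner(partitions)
slot = partitioner.partition('1be68ff556fd0bbe5802d1a100850da29f7f15b1')
# `slot` is one of 'test0', 'test1', 'test2'; the same fingerprint always
# lands on the same slot.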
Example #5
def producer(self):
    partitioner = Crc32NamePartitioner(self._partitions) if self._hostname_partitioning \
        else FingerprintPartitioner(self._partitions)
    return KeyedProducer(self._location,
                         self._enable_ssl,
                         self._cert_path,
                         self._topic,
                         partitioner,
                         self._codec,
                         batch_size=DEFAULT_BATCH_SIZE,
                         buffer_memory=DEFAULT_BUFFER_MEMORY)
Example #6
    def __init__(self, auth, project_id, frontier, batch_size, flush_interval,
                 slots_count, slot_prefix, cleanup_on_start):
        self.hcf = HCFClientWrapper(auth=auth,
                                    project_id=project_id,
                                    frontier=frontier,
                                    batch_size=batch_size,
                                    flush_interval=flush_interval)
        self.hcf_slots_count = slots_count
        self.hcf_slot_prefix = slot_prefix
        self.logger = logging.getLogger("hcf.queue")
        self.consumed_batches_ids = dict()
        self.partitions = [self.hcf_slot_prefix + str(i) for i in range(0, slots_count)]
        self.partitioner = FingerprintPartitioner(self.partitions)

        if cleanup_on_start:
            for partition_id in self.partitions:
                self.hcf.delete_slot(partition_id)
Example #7
def producer(self):
    partitioner = Crc32NamePartitioner(self._partitions) if self._hostname_partitioning \
        else FingerprintPartitioner(self._partitions)
    return KeyedProducer(self._location, self._enable_ssl, self._cert_path,
                         self._topic, partitioner, self._codec)
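Examples #5, #7 and #9 switch strategies on a flag: Crc32NamePartitioner keys on the hostname, so every request for a given host lands on the same partition, while FingerprintPartitioner spreads requests uniformly across partitions. A sketch of the CRC32 variant, assuming the same interface as the fingerprint sketch above; the None fallback and the encoding are assumptions:

from zlib import crc32

class Crc32NamePartitioner(object):
    """Sketch: partitions by CRC32 of a name (typically the hostname)."""

    def __init__(self, partitions):
        self.partitions = partitions

    def partition(self, key, partitions=None):
        if partitions is None:
            partitions = self.partitions
        if key is None:
            # No hostname available: fall back to the first partition.
            return partitions[0]
        value = crc32(key if isinstance(key, bytes) else key.encode('utf-8'))
        return partitions[value % len(partitions)]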
Example #8
def producer(self):
    return KeyedProducer(self._location, self._enable_ssl, self._cert_path,
                         self._topic,
                         FingerprintPartitioner(self._partitions),
                         self._codec)
Example #9
def __init__(self, context, location, partitions, hwm, hostname_partitioning):
    super(SpiderFeedProducer, self).__init__(context, location, b'sf')
    self.partitioner = Crc32NamePartitioner(partitions) if hostname_partitioning else \
        FingerprintPartitioner(partitions)
    self.sender.set(zmq.SNDHWM, hwm)
Example #10
def __init__(self, context, location, partitions):
    super(SpiderLogProducer, self).__init__(context, location, b'sl')
    self.partitioner = FingerprintPartitioner(partitions)
Example #11
class HCFQueue(Queue):
    def __init__(self, auth, project_id, frontier, batch_size, flush_interval,
                 slots_count, slot_prefix, cleanup_on_start):
        self.hcf = HCFClientWrapper(auth=auth,
                                    project_id=project_id,
                                    frontier=frontier,
                                    batch_size=batch_size,
                                    flush_interval=flush_interval)
        self.hcf_slots_count = slots_count
        self.hcf_slot_prefix = slot_prefix
        self.logger = logging.getLogger("hcf.queue")
        self.consumed_batches_ids = dict()
        self.partitions = [self.hcf_slot_prefix + str(i) for i in range(0, slots_count)]
        self.partitioner = FingerprintPartitioner(self.partitions)

        if cleanup_on_start:
            for partition_id in self.partitions:
                self.hcf.delete_slot(partition_id)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        self.hcf.close()

    def get_next_requests(self, max_next_requests, partition_id, **kwargs):
        return_requests = []
        data = True
        while data and len(return_requests) < max_next_requests:
            data = False
            consumed = []
            for batch in self.hcf.read(partition_id, max_next_requests):
                batch_id = batch['id']
                requests = batch['requests']
                data = len(requests) == max_next_requests
                self.logger.debug("got batch %s of size %d from HCF server",
                                  batch_id, len(requests))
                for fingerprint, qdata in requests:
                    decoded = _convert_from_saved_type(qdata)
                    request = Request(decoded.get('url', fingerprint), **decoded['request'])
                    if request is not None:
                        request.meta.update({
                            'created_at': datetime.utcnow(),
                            'depth': 0,
                        })
                        request.meta.setdefault(b'scrapy_meta', {})
                        return_requests.append(request)
                consumed.append(batch_id)
            if consumed:
                self.hcf.delete(partition_id, consumed)
        return return_requests

    def schedule(self, batch):
        scheduled = 0
        for _, score, request, schedule in batch:
            if schedule:
                self._process_hcf_link(request, score)
                scheduled += 1
        self.logger.info('scheduled %d links' % scheduled)

    def _process_hcf_link(self, link, score):
        link.meta.pop(b'origin_is_frontier', None)
        hcf_request = {'fp': getattr(link, 'meta', {}).get('hcf_fingerprint', link.url)}
        qdata = {'request': {
                    'method': link.method,
                    'headers': link.headers,
                    'cookies': link.cookies,
                    'meta': link.meta}
                }
        hcf_request['qdata'] = _convert_and_save_type(qdata)
        partition_id = self.partitioner.partition(link.meta[b'fingerprint'])
        slot = self.hcf_slot_prefix + str(partition_id)
        self.hcf.add_request(slot, hcf_request)

    def count(self):
        """
        Calculates lower estimate of items in the queue for all partitions.
        :return: int
        """
        count = 0
        for partition_id in self.partitions:
            for batch in self.hcf.read(partition_id):
                count += len(batch['requests'])
        return count
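Two details of this backend are worth noting. First, get_next_requests() keeps reading batches until a partial batch signals the slot is drained, deleting each consumed batch afterwards. Second, _process_hcf_link() routes by fingerprint and then builds the slot name as hcf_slot_prefix + str(partition_id); with the stock FingerprintPartitioner, partition() returns a member of the partitions list, which is already prefixed, so this code presumably pairs with a partitioner variant that returns a bare index. The routing step in isolation, with made-up values:

partitioner = FingerprintPartitioner(['test0', 'test1'])
partition_id = partitioner.partition(b'1be68ff556fd0bbe5802d1a100850da29f7f15b1')
# With the stock partitioner, partition_id is already a slot name
# ('test0' or 'test1'); prepending the prefix again would double it.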
Example #12
class HCFQueue(Queue):
    def __init__(self, auth, project_id, frontier, batch_size, flush_interval,
                 slots_count, slot_prefix, cleanup_on_start):
        self.hcf = HCFClientWrapper(auth=auth,
                                    project_id=project_id,
                                    frontier=frontier,
                                    batch_size=batch_size,
                                    flush_interval=flush_interval)
        self.hcf_slots_count = slots_count
        self.hcf_slot_prefix = slot_prefix
        self.logger = logging.getLogger("hcf.queue")
        self.consumed_batches_ids = dict()
        self.partitions = [
            self.hcf_slot_prefix + str(i) for i in range(0, slots_count)
        ]
        self.partitioner = FingerprintPartitioner(self.partitions)

        if cleanup_on_start:
            for partition_id in self.partitions:
                self.hcf.delete_slot(partition_id)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        self.hcf.close()

    def get_next_requests(self, max_next_requests, partition_id, **kwargs):
        return_requests = []
        data = True
        while data and len(return_requests) < max_next_requests:
            data = False
            consumed = []
            for batch in self.hcf.read(partition_id, max_next_requests):
                batch_id = batch['id']
                requests = batch['requests']
                data = len(requests) == max_next_requests
                self.logger.debug("got batch %s of size %d from HCF server" %
                                  (batch_id, len(requests)))
                for fingerprint, qdata in requests:
                    decoded = _convert_from_saved_type(qdata)
                    request = Request(decoded.get('url', fingerprint),
                                      **decoded['request'])
                    if request is not None:
                        request.meta.update({
                            'created_at': datetime.utcnow(),
                            'depth': 0,
                        })
                        request.meta.setdefault(b'scrapy_meta', {})
                        return_requests.append(request)
                consumed.append(batch_id)
            if consumed:
                self.hcf.delete(partition_id, consumed)
        return return_requests

    def schedule(self, batch):
        scheduled = 0
        for _, score, request, schedule in batch:
            if schedule:
                self._process_hcf_link(request, score)
                scheduled += 1
        self.logger.info('scheduled %d links' % scheduled)

    def _process_hcf_link(self, link, score):
        link.meta.pop(b'origin_is_frontier', None)
        hcf_request = {
            'fp': getattr(link, 'meta', {}).get('hcf_fingerprint', link.url)
        }
        qdata = {
            'request': {
                'method': link.method,
                'headers': link.headers,
                'cookies': link.cookies,
                'meta': link.meta
            }
        }
        hcf_request['qdata'] = _convert_and_save_type(qdata)
        partition_id = self.partitioner.partition(link.meta[b'fingerprint'])
        slot = self.hcf_slot_prefix + str(partition_id)
        self.hcf.add_request(slot, hcf_request)

    def count(self):
        """
        Calculates lower estimate of items in the queue for all partitions.
        :return: int
        """
        count = 0
        for partition_id in self.partitions:
            for batch in self.hcf.read(partition_id):
                count += len(batch['requests'])
        return count
Example #13
def __init__(self):
    self.messages = []
    self.offset = 0
    self.partitioner = FingerprintPartitioner([0])
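Example #13 looks like a test double: with a single-element partition list, every key maps to partition 0, so routing becomes deterministic. A quick check, reusing the fingerprint from the earlier tests:

partitioner = FingerprintPartitioner([0])
assert partitioner.partition('1be68ff556fd0bbe5802d1a100850da29f7f15b1') == 0
# any_value % 1 == 0, so the only partition is always chosen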