def __init__(self, session_cls, queue_cls, partitions, ordering='default'):
    self.session = session_cls()
    self.queue_model = queue_cls
    self.logger = logging.getLogger("frontera.contrib.backends.sqlalchemy.components.Queue")
    self.partitions = [i for i in range(0, partitions)]
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.ordering = ordering

def __init__(self, session_cls, queue_cls, partitions):
    self.session = session_cls()
    self.queue_model = queue_cls
    self.logger = logging.getLogger("frontera.contrib.backends.sqlalchemy.revisiting.RevisitingQueue")
    self.partitions = [i for i in range(0, partitions)]
    self.partitioner = Crc32NamePartitioner(self.partitions)

def __init__(self, session_cls, queue_cls, partitions, ordering='default'):
    self.session = session_cls()
    self.queue_model = queue_cls
    self.logger = logging.getLogger("sqlalchemy.queue")
    self.partitions = [i for i in range(0, partitions)]
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.ordering = ordering

def __init__(self, connection, partitions, table_name, drop=False, use_snappy=False):
    self.connection = connection
    self.partitions = [i for i in range(0, partitions)]
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.logger = logging.getLogger("hbase.queue")
    self.table_name = to_bytes(table_name)
    tables = set(self.connection.tables())
    if drop and self.table_name in tables:
        # Drop the existing queue table so it can be recreated from scratch.
        self.connection.delete_table(self.table_name, disable=True)
        tables.remove(self.table_name)
    if self.table_name not in tables:
        # Single-version column family; Snappy compression only if requested.
        schema = {'f': {'max_versions': 1}}
        if use_snappy:
            schema['f']['compression'] = 'SNAPPY'
        self.connection.create_table(self.table_name, schema)

    class DumbResponse:
        pass

    self.decoder = Decoder(Request, DumbResponse)
    self.encoder = Encoder(Request)

def __init__(self, connection, partitions, table_name, drop=False):
    self.connection = connection
    self.partitions = [i for i in range(0, partitions)]
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.logger = logging.getLogger("hbase.queue")
    self.table_name = to_bytes(table_name)
    tables = set(self.connection.tables())
    if drop and self.table_name in tables:
        self.connection.delete_table(self.table_name, disable=True)
        tables.remove(self.table_name)
    if self.table_name not in tables:
        self.connection.create_table(
            self.table_name,
            {'f': {'max_versions': 1, 'block_cache_enabled': 1}})

    class DumbResponse:
        pass

    self.decoder = Decoder(Request, DumbResponse)
    self.encoder = Encoder(Request)

def __init__(self, partitions):
    self.partitions = [i for i in range(0, partitions)]
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.logger = logging.getLogger("memory.queue")
    self.heap = {}
    for partition in self.partitions:
        self.heap[partition] = Heap(self._compare_pages)

def __init__(self, partitions):
    self.partitions = [i for i in range(0, partitions)]
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.logger = logging.getLogger("frontera.contrib.backends.memory.MemoryQueue")
    self.heap = {}
    for partition in self.partitions:
        self.heap[partition] = Heap(self._compare_pages)

def test_queue_with_delay(self):
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, Crc32NamePartitioner([0]), b'queue', True)
    r5 = r3.copy()
    crawl_at = int(time()) + 1000
    r5.meta[b'crawl_at'] = crawl_at
    batch = [(r5.meta[b'fingerprint'], 0.5, r5, True)]
    queue.schedule(batch)
    with mock.patch('frontera.contrib.backends.hbase.time') as mocked_time:
        # Before the scheduled crawl time the delayed request is withheld...
        mocked_time.return_value = time()
        assert queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                       max_requests_per_host=10) == []
        # ...and once the clock passes crawl_at, it becomes available.
        mocked_time.return_value = crawl_at + 1
        assert set([r.url for r in queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                                           max_requests_per_host=10)]) == set([r5.url])

def test_drop_all_tables_when_table_name_is_str(self):
    connection = Connection(host='hbase-docker', port=9090)
    for table in connection.tables():
        connection.delete_table(table, True)
    hbase_queue_table = 'queue'
    hbase_metadata_table = 'metadata'
    connection.create_table(hbase_queue_table, {'f': {'max_versions': 1}})
    connection.create_table(hbase_metadata_table, {'f': {'max_versions': 1}})
    tables = connection.tables()
    assert set(tables) == set([b'metadata', b'queue'])  # Failure of test itself
    try:
        HBaseQueue(connection=connection, partitioner=Crc32NamePartitioner([0]),
                   table_name=hbase_queue_table, drop=True)
        HBaseMetadata(connection=connection, table_name=hbase_metadata_table,
                      drop_all_tables=True, use_snappy=False, batch_size=300000,
                      store_content=True)
    except AlreadyExists:
        assert False, "failed to drop hbase tables"

def producer(self):
    partitioner = Crc32NamePartitioner(self._partitions) if self._hostname_partitioning \
        else FingerprintPartitioner(self._partitions)
    return KeyedProducer(self._location, self._enable_ssl, self._cert_path,
                         self._topic, partitioner, self._codec,
                         batch_size=DEFAULT_BATCH_SIZE,
                         buffer_memory=DEFAULT_BUFFER_MEMORY)

def test_crc32name_partitioner():
    partitions = range(0, 5)
    cp = Crc32NamePartitioner(partitions)
    key = '1be68ff556fd0bbe5802d1a100850da29f7f15b11'

    partition = cp.partition(key, partitions)
    assert partition == 3

    partition = cp.partition(None, partitions)
    assert partition == 0

    partition = cp.partition(key, None)
    assert partition == 3

def __init__(self, partitions, is_fifo=True):
    """
    Deque-based queue (see the collections module): an efficient queue for
    both FIFO and LIFO strategies.

    :param partitions: int, count of partitions
    :param is_fifo: bool, True for FIFO, False for LIFO
    """
    self.partitions = [i for i in range(0, partitions)]
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.logger = logging.getLogger("memory.dequequeue")
    self.queues = {}
    self.is_fifo = is_fifo
    for partition in self.partitions:
        self.queues[partition] = deque()

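# A standalone illustration (separate from the backend class above) of the
# FIFO/LIFO distinction the docstring describes, using a bare collections.deque:
from collections import deque

q = deque()
q.append('a')
q.append('b')
assert q.popleft() == 'a'  # FIFO: oldest element leaves first
q.append('c')
assert q.pop() == 'c'      # LIFO: newest element leaves first
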
def test_queue(self):
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, Crc32NamePartitioner([0, 1]), b'queue', True)
    batch = [('10', 0.5, r1, True), ('11', 0.6, r2, True), ('12', 0.7, r3, True)]
    queue.schedule(batch)
    assert set([r.url for r in queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                                                       max_requests_per_host=10)]) == set([r3.url])
    assert set([r.url for r in queue.get_next_requests(10, 1, min_requests=3, min_hosts=1,
                                                       max_requests_per_host=10)]) == set([r1.url, r2.url])

def __init__(self, manager, pool, partitions, delete_all_keys=False):
    settings = manager.settings
    codec_path = settings.get('REDIS_BACKEND_CODEC')
    encoder_cls = load_object(codec_path + ".Encoder")
    decoder_cls = load_object(codec_path + ".Decoder")
    self._encoder = encoder_cls(manager.request_model)
    self._decoder = decoder_cls(manager.request_model, manager.response_model)
    self._redis = RedisOperation(pool)
    self._redis_pipeline = RedisPipeline(pool)
    self._partitions = [i for i in range(0, partitions)]
    self._partitioner = Crc32NamePartitioner(self._partitions)
    self._logger = logging.getLogger("redis_backend.queue")
    if delete_all_keys:
        self._redis.flushdb()

def test_crc32name_partitioner():
    partitions = list(range(0, 5))
    cp = Crc32NamePartitioner(partitions)
    key = b'www.example.com'
    assert cp.get_key(request) == key

    partition = cp.partition(key, partitions)
    assert partition == 3

    partition = cp.partition(42, partitions)
    assert partition == 2

    partition = cp.partition(None, partitions)
    assert partition == 0

    partition = cp.partition(key, None)
    assert partition == 3

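# A minimal sketch of a CRC32-based name partitioner, consistent with the
# behaviour the tests above assert: a None key falls back to the first
# partition, and passing partitions=None falls back to the instance's own
# list. This is an illustration only, not Frontera's implementation; the
# class name SketchCrc32NamePartitioner is hypothetical, and exact partition
# indices depend on how non-bytes keys are encoded before hashing.
from zlib import crc32

class SketchCrc32NamePartitioner:
    def __init__(self, partitions):
        self.partitions = partitions

    def partition(self, key, partitions=None):
        if partitions is None:
            partitions = self.partitions
        if key is None:
            return partitions[0]  # no key: deterministic fallback
        if not isinstance(key, bytes):
            key = str(key).encode('utf-8')
        value = crc32(key) & 0xffffffff  # mask keeps the hash unsigned
        return partitions[value % len(partitions)]
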
def __init__(self, connection, partitions, logger, table_name, drop=False):
    self.connection = connection
    self.partitions = [i for i in range(0, partitions)]
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.logger = logger
    self.table_name = table_name
    tables = set(self.connection.tables())
    if drop and self.table_name in tables:
        self.connection.delete_table(self.table_name, disable=True)
        tables.remove(self.table_name)
    if self.table_name not in tables:
        self.connection.create_table(
            self.table_name,
            {'f': {'max_versions': 1, 'block_cache_enabled': 1}})

def producer(self):
    partitioner = Crc32NamePartitioner(self._partitions) if self._hostname_partitioning \
        else FingerprintPartitioner(self._partitions)
    return KeyedProducer(self._location, self._enable_ssl, self._cert_path,
                         self._topic, partitioner, self._codec)

def __init__(self, context, location, partitions, hwm, hostname_partitioning):
    super(SpiderFeedProducer, self).__init__(context, location, b'sf')
    self.partitioner = Crc32NamePartitioner(partitions) if hostname_partitioning else \
        FingerprintPartitioner(partitions)
    self.sender.set(zmq.SNDHWM, hwm)

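# Design note: the producer snippets above all choose between the two
# partitioners the same way. Keying on the host name (Crc32NamePartitioner,
# whose get_key returns the host, as the test above shows) keeps every request
# for a given host on one partition, which helps enforce per-host politeness;
# keying on the request fingerprint (FingerprintPartitioner) spreads load
# roughly uniformly instead. A hypothetical helper capturing the shared
# pattern (make_partitioner is not a Frontera API):
def make_partitioner(partitions, hostname_partitioning):
    if hostname_partitioning:
        return Crc32NamePartitioner(partitions)  # same host -> same partition
    return FingerprintPartitioner(partitions)    # uniform spread by fingerprint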