Ejemplo n.º 1
0
 def __init__(self, session_cls, queue_cls, partitions, ordering='default'):
     """Initialize the SQLAlchemy-backed queue component.

     :param session_cls: SQLAlchemy session factory; called once to open the session
     :param queue_cls: declarative model class backing the queue table
     :param partitions: int, number of partitions to spread requests across
     :param ordering: name of the queue ordering strategy
     """
     self.session = session_cls()
     self.queue_model = queue_cls
     self.logger = logging.getLogger("frontera.contrib.backends.sqlalchemy.components.Queue")
     # Partition ids are simply 0..partitions-1; list(range(...)) is the idiomatic form.
     self.partitions = list(range(partitions))
     self.partitioner = Crc32NamePartitioner(self.partitions)
     self.ordering = ordering
Ejemplo n.º 2
0
 def __init__(self, session_cls, queue_cls, partitions):
     """Initialize the revisiting (re-crawl) SQLAlchemy queue.

     :param session_cls: SQLAlchemy session factory; called once to open the session
     :param queue_cls: declarative model class backing the queue table
     :param partitions: int, number of partitions to spread requests across
     """
     self.session = session_cls()
     self.queue_model = queue_cls
     self.logger = logging.getLogger(
         "frontera.contrib.backends.sqlalchemy.revisiting.RevisitingQueue")
     # Partition ids are simply 0..partitions-1; list(range(...)) is the idiomatic form.
     self.partitions = list(range(partitions))
     self.partitioner = Crc32NamePartitioner(self.partitions)
Ejemplo n.º 3
0
 def __init__(self, session_cls, queue_cls, partitions, ordering='default'):
     """Initialize the SQLAlchemy queue.

     :param session_cls: SQLAlchemy session factory; called once to open the session
     :param queue_cls: declarative model class backing the queue table
     :param partitions: int, number of partitions to spread requests across
     :param ordering: name of the queue ordering strategy
     """
     self.session = session_cls()
     self.queue_model = queue_cls
     self.logger = logging.getLogger("sqlalchemy.queue")
     # Partition ids are simply 0..partitions-1; list(range(...)) is the idiomatic form.
     self.partitions = list(range(partitions))
     self.partitioner = Crc32NamePartitioner(self.partitions)
     self.ordering = ordering
Ejemplo n.º 4
0
    def __init__(self,
                 connection,
                 partitions,
                 table_name,
                 drop=False,
                 use_snappy=False):
        """Initialize the HBase-backed queue, (re)creating its table as needed.

        :param connection: open HBase (happybase-style) connection
        :param partitions: int, number of partitions to spread requests across
        :param table_name: str/bytes name of the HBase queue table
        :param drop: when True, delete a pre-existing table before recreating it
        :param use_snappy: when True, enable SNAPPY compression on family 'f'
        """
        self.connection = connection
        # Partition ids are simply 0..partitions-1; list(range(...)) is the idiomatic form.
        self.partitions = list(range(partitions))
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.logger = logging.getLogger("hbase.queue")
        self.table_name = to_bytes(table_name)

        tables = set(self.connection.tables())
        if drop and self.table_name in tables:
            # Table must be disabled before HBase allows deletion.
            self.connection.delete_table(self.table_name, disable=True)
            tables.remove(self.table_name)

        if self.table_name not in tables:
            schema = {'f': {'max_versions': 1}}
            if use_snappy:
                schema['f']['compression'] = 'SNAPPY'
            self.connection.create_table(self.table_name, schema)

        # Placeholder response class: the decoder only needs *some* response type.
        class DumbResponse:
            pass

        self.decoder = Decoder(Request, DumbResponse)
        self.encoder = Encoder(Request)
Ejemplo n.º 5
0
    def __init__(self, connection, partitions, table_name, drop=False):
        """Initialize the HBase-backed queue, (re)creating its table as needed.

        :param connection: open HBase (happybase-style) connection
        :param partitions: int, number of partitions to spread requests across
        :param table_name: str/bytes name of the HBase queue table
        :param drop: when True, delete a pre-existing table before recreating it
        """
        self.connection = connection
        # Partition ids are simply 0..partitions-1; list(range(...)) is the idiomatic form.
        self.partitions = list(range(partitions))
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.logger = logging.getLogger("hbase.queue")
        self.table_name = to_bytes(table_name)

        tables = set(self.connection.tables())
        if drop and self.table_name in tables:
            # Table must be disabled before HBase allows deletion.
            self.connection.delete_table(self.table_name, disable=True)
            tables.remove(self.table_name)

        if self.table_name not in tables:
            self.connection.create_table(
                self.table_name,
                {'f': {
                    'max_versions': 1,
                    'block_cache_enabled': 1
                }})

        # Placeholder response class: the decoder only needs *some* response type.
        class DumbResponse:
            pass

        self.decoder = Decoder(Request, DumbResponse)
        self.encoder = Encoder(Request)
Ejemplo n.º 6
0
 def __init__(self, partitions):
     """Initialize the in-memory queue with one priority heap per partition.

     :param partitions: int, number of partitions to spread requests across
     """
     # Partition ids are simply 0..partitions-1; list(range(...)) is the idiomatic form.
     self.partitions = list(range(partitions))
     self.partitioner = Crc32NamePartitioner(self.partitions)
     self.logger = logging.getLogger("memory.queue")
     # One heap per partition, ordered by the page-comparison callback.
     self.heap = {partition: Heap(self._compare_pages)
                  for partition in self.partitions}
Ejemplo n.º 7
0
 def __init__(self, partitions):
     """Initialize the in-memory queue with one priority heap per partition.

     :param partitions: int, number of partitions to spread requests across
     """
     # Partition ids are simply 0..partitions-1; list(range(...)) is the idiomatic form.
     self.partitions = list(range(partitions))
     self.partitioner = Crc32NamePartitioner(self.partitions)
     self.logger = logging.getLogger("frontera.contrib.backends.memory.MemoryQueue")
     # One heap per partition, ordered by the page-comparison callback.
     self.heap = {partition: Heap(self._compare_pages)
                  for partition in self.partitions}
Ejemplo n.º 8
0
 def test_queue_with_delay(self):
     """A request scheduled with a future crawl_at stays hidden until that time."""
     conn = Connection(host='hbase-docker', port=9090)
     queue = HBaseQueue(conn, Crc32NamePartitioner([0]), b'queue', True)
     delayed = r3.copy()
     crawl_at = int(time()) + 1000
     delayed.meta[b'crawl_at'] = crawl_at
     queue.schedule([(delayed.meta[b'fingerprint'], 0.5, delayed, True)])
     with mock.patch('frontera.contrib.backends.hbase.time') as mocked_time:
         # Before crawl_at, nothing may be released.
         mocked_time.return_value = time()
         fetched = queue.get_next_requests(
             10, 0, min_requests=3, min_hosts=1, max_requests_per_host=10)
         assert fetched == []
         # Once the clock passes crawl_at, the delayed request appears.
         mocked_time.return_value = crawl_at + 1
         fetched = queue.get_next_requests(
             10, 0, min_requests=3, min_hosts=1, max_requests_per_host=10)
         assert set(r.url for r in fetched) == set([delayed.url])
Ejemplo n.º 9
0
 def test_drop_all_tables_when_table_name_is_str(self):
     """Constructing queue/metadata with drop flags must not hit AlreadyExists."""
     conn = Connection(host='hbase-docker', port=9090)
     # Start from a clean HBase instance.
     for existing in conn.tables():
         conn.delete_table(existing, True)
     queue_table = 'queue'
     metadata_table = 'metadata'
     conn.create_table(queue_table, {'f': {'max_versions': 1}})
     conn.create_table(metadata_table, {'f': {'max_versions': 1}})
     # Sanity check of the fixture itself.
     assert set(conn.tables()) == set([b'metadata', b'queue'])  # Failure of test itself
     try:
         HBaseQueue(connection=conn,
                    partitioner=Crc32NamePartitioner([0]),
                    table_name=queue_table,
                    drop=True)
         HBaseMetadata(connection=conn,
                       table_name=metadata_table,
                       drop_all_tables=True,
                       use_snappy=False,
                       batch_size=300000,
                       store_content=True)
     except AlreadyExists:
         assert False, "failed to drop hbase tables"
Ejemplo n.º 10
0
 def producer(self):
     """Build a keyed Kafka producer partitioned by hostname or fingerprint."""
     if self._hostname_partitioning:
         partitioner = Crc32NamePartitioner(self._partitions)
     else:
         partitioner = FingerprintPartitioner(self._partitions)
     return KeyedProducer(self._location,
                          self._enable_ssl,
                          self._cert_path,
                          self._topic,
                          partitioner,
                          self._codec,
                          batch_size=DEFAULT_BATCH_SIZE,
                          buffer_memory=DEFAULT_BUFFER_MEMORY)
Ejemplo n.º 11
0
def test_crc32name_partitioner():
    """CRC32-name partitioning: a known key maps to 3, a None key to 0."""
    partitions = range(0, 5)
    cp = Crc32NamePartitioner(partitions)
    key = '1be68ff556fd0bbe5802d1a100850da29f7f15b11'

    assert cp.partition(key, partitions) == 3
    assert cp.partition(None, partitions) == 0
    # Passing partitions=None presumably falls back to the configured list.
    assert cp.partition(key, None) == 3
Ejemplo n.º 12
0
 def __init__(self, partitions, is_fifo=True):
     """
     Deque-based queue (see collections module). Efficient queue for LIFO and FIFO strategies.

     :param partitions: int count of partitions
     :param is_fifo: bool, True for FIFO, False for LIFO
     """
     # Docstring previously documented a nonexistent param ``type``; the real name is ``is_fifo``.
     # Partition ids are simply 0..partitions-1; list(range(...)) is the idiomatic form.
     self.partitions = list(range(partitions))
     self.partitioner = Crc32NamePartitioner(self.partitions)
     self.logger = logging.getLogger("memory.dequequeue")
     self.is_fifo = is_fifo
     # One deque per partition; pop side is chosen elsewhere based on is_fifo.
     self.queues = {partition: deque() for partition in self.partitions}
Ejemplo n.º 13
0
 def test_queue(self):
     """Scheduled requests are routed to, and fetched from, the right partitions."""
     conn = Connection(host='hbase-docker', port=9090)
     queue = HBaseQueue(conn, Crc32NamePartitioner([0, 1]), b'queue', True)
     queue.schedule([('10', 0.5, r1, True),
                     ('11', 0.6, r2, True),
                     ('12', 0.7, r3, True)])

     def urls(partition):
         fetched = queue.get_next_requests(
             10, partition, min_requests=3, min_hosts=1, max_requests_per_host=10)
         return set(r.url for r in fetched)

     assert urls(0) == set([r3.url])
     assert urls(1) == set([r1.url, r2.url])
Ejemplo n.º 14
0
    def __init__(self, manager, pool, partitions, delete_all_keys=False):
        """Initialize the Redis-backed queue.

        :param manager: frontier manager providing settings and request/response models
        :param pool: Redis connection pool shared by plain and pipelined operations
        :param partitions: int, number of partitions to spread requests across
        :param delete_all_keys: when True, flush the whole Redis database on start
        """
        settings = manager.settings
        # Codec classes are loaded dynamically from the configured module path.
        codec_path = settings.get('REDIS_BACKEND_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")
        self._encoder = encoder_cls(manager.request_model)
        self._decoder = decoder_cls(manager.request_model,
                                    manager.response_model)
        self._redis = RedisOperation(pool)
        self._redis_pipeline = RedisPipeline(pool)
        # Partition ids are simply 0..partitions-1; list(range(...)) is the idiomatic form.
        self._partitions = list(range(partitions))
        self._partitioner = Crc32NamePartitioner(self._partitions)
        self._logger = logging.getLogger("redis_backend.queue")

        if delete_all_keys:
            # NOTE: flushdb wipes the entire current Redis database, not just queue keys.
            self._redis.flushdb()
Ejemplo n.º 15
0
def test_crc32name_partitioner():
    """Key extraction and partition mapping for the CRC32-name partitioner."""
    partitions = list(range(0, 5))
    cp = Crc32NamePartitioner(partitions)
    key = b'www.example.com'

    # The partitioner keys requests by hostname.
    assert cp.get_key(request) == key

    assert cp.partition(key, partitions) == 3
    # Non-bytes keys are accepted as well.
    assert cp.partition(42, partitions) == 2
    assert cp.partition(None, partitions) == 0
    # Passing partitions=None presumably falls back to the configured list.
    assert cp.partition(key, None) == 3
Ejemplo n.º 16
0
    def __init__(self, connection, partitions, logger, table_name, drop=False):
        """Initialize the HBase-backed queue, (re)creating its table as needed.

        :param connection: open HBase (happybase-style) connection
        :param partitions: int, number of partitions to spread requests across
        :param logger: logger instance supplied by the caller
        :param table_name: name of the HBase queue table
        :param drop: when True, delete a pre-existing table before recreating it
        """
        self.connection = connection
        # Partition ids are simply 0..partitions-1; list(range(...)) is the idiomatic form.
        self.partitions = list(range(partitions))
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.logger = logger
        self.table_name = table_name

        tables = set(self.connection.tables())
        if drop and self.table_name in tables:
            # Table must be disabled before HBase allows deletion.
            self.connection.delete_table(self.table_name, disable=True)
            tables.remove(self.table_name)

        if self.table_name not in tables:
            self.connection.create_table(
                self.table_name,
                {'f': {
                    'max_versions': 1,
                    'block_cache_enabled': 1
                }})
Ejemplo n.º 17
0
 def producer(self):
     """Build a keyed producer partitioned by hostname or fingerprint."""
     if self._hostname_partitioning:
         partitioner = Crc32NamePartitioner(self._partitions)
     else:
         partitioner = FingerprintPartitioner(self._partitions)
     return KeyedProducer(self._location, self._enable_ssl, self._cert_path,
                          self._topic, partitioner, self._codec)
Ejemplo n.º 18
0
 def __init__(self, context, location, partitions, hwm, hostname_partitioning):
     """Spider-feed producer over the ZeroMQ channel identified by b'sf'."""
     super(SpiderFeedProducer, self).__init__(context, location, b'sf')
     if hostname_partitioning:
         self.partitioner = Crc32NamePartitioner(partitions)
     else:
         self.partitioner = FingerprintPartitioner(partitions)
     # Cap the outgoing message backlog at the configured high-water mark.
     self.sender.set(zmq.SNDHWM, hwm)