Exemple #1
0
 def test_queue(self):
     """Schedule a three-item batch and check the URLs returned by two fetches.

     The second positional argument of ``get_next_requests`` (0, then 1) is
     presumably a partition id -- confirm against ``HBaseQueue``. The first
     fetch must yield only ``r3``; the second must yield ``r1`` and ``r2``.
     """
     hbase_queue = HBaseQueue(
         Connection(host='hbase-docker', port=9090), 2, b'queue', True)
     hbase_queue.schedule([
         ('10', 0.5, r1, True),
         ('11', 0.6, r2, True),
         ('12', 0.7, r3, True),
     ])
     # Both fetches share the same limits; only the second positional differs.
     fetch_opts = dict(min_requests=3, min_hosts=1, max_requests_per_host=10)

     urls_for_0 = {
         req.url for req in hbase_queue.get_next_requests(10, 0, **fetch_opts)
     }
     assert urls_for_0 == {r3.url}

     urls_for_1 = {
         req.url for req in hbase_queue.get_next_requests(10, 1, **fetch_opts)
     }
     assert urls_for_1 == {r1.url, r2.url}
Exemple #2
0
 def test_queue_with_delay(self):
     """Check that a request with a future ``crawl_at`` is withheld, then returned.

     The request is stamped one second ahead of the current time. The first
     fetch must come back empty; after a real 1.5 s wait the same request
     must be returned.
     """
     hbase_queue = HBaseQueue(
         Connection(host='hbase-docker', port=9090), 1, b'queue', True)

     delayed = r3.copy()
     delayed.meta[b'crawl_at'] = int(time()) + 1
     hbase_queue.schedule([(delayed.meta[b'fingerprint'], 0.5, delayed, True)])

     fetch_opts = dict(min_requests=3, min_hosts=1, max_requests_per_host=10)

     # Too early: crawl_at has not been reached yet.
     too_early = hbase_queue.get_next_requests(10, 0, **fetch_opts)
     assert too_early == []

     # Wall-clock wait until crawl_at has passed.
     sleep(1.5)
     ready_urls = {
         req.url for req in hbase_queue.get_next_requests(10, 0, **fetch_opts)
     }
     assert ready_urls == {delayed.url}
Exemple #3
0
 def test_queue_with_delay(self):
     """Check that a delayed request is released once the patched clock passes ``crawl_at``.

     ``frontera.contrib.backends.hbase.time`` is patched, so the test
     controls the time seen by the backend instead of sleeping.
     """
     hbase_queue = HBaseQueue(
         Connection(host='hbase-docker', port=9090),
         1, b'queue', use_snappy=False, drop=True)

     delayed = r3.copy()
     due_at = int(time()) + 1000
     delayed.meta[b'crawl_at'] = due_at
     hbase_queue.schedule([(delayed.meta[b'fingerprint'], 0.5, delayed, True)])

     fetch_opts = dict(min_requests=3, min_hosts=1, max_requests_per_host=10)

     with mock.patch('frontera.contrib.backends.hbase.time') as fake_clock:
         # Clock reports "now", well before crawl_at: nothing is handed out.
         fake_clock.return_value = time()
         assert hbase_queue.get_next_requests(10, 0, **fetch_opts) == []

         # Clock reports one second past crawl_at: the request is released.
         fake_clock.return_value = due_at + 1
         ready_urls = {
             req.url
             for req in hbase_queue.get_next_requests(10, 0, **fetch_opts)
         }
         assert ready_urls == {delayed.url}
Exemple #4
0
 def test_drop_all_tables_when_table_name_is_str(self):
     """Check that the HBase components can drop tables named with ``str``.

     Every existing table is deleted, the queue/metadata/states tables are
     recreated under ``str`` names, and each component is then constructed
     with its drop flag set. An ``AlreadyExists`` raised by any of the
     constructors is reported as a test failure.

     Raises:
         AssertionError: if the setup tables are not the expected ones, or
             if a constructor raises ``AlreadyExists``.
     """
     connection = Connection(host='hbase-docker', port=9090)
     for table in connection.tables():
         connection.delete_table(table, True)

     hbase_queue_table = 'queue'
     hbase_metadata_table = 'metadata'
     hbase_states_table = 'states'
     for table_name in (hbase_queue_table, hbase_metadata_table,
                        hbase_states_table):
         # A fresh options dict per call, exactly as the separate calls did.
         connection.create_table(table_name, {'f': {'max_versions': 1}})

     # Guards the setup: a failure here means the test itself is broken,
     # not the code under test.
     assert set(connection.tables()) == {b'metadata', b'queue', b'states'}

     try:
         HBaseQueue(connection=connection,
                    partitions=1,
                    table_name=hbase_queue_table,
                    use_snappy=False,
                    drop=True)
         HBaseMetadata(connection=connection,
                       table_name=hbase_metadata_table,
                       drop_all_tables=True,
                       use_snappy=False,
                       batch_size=300000,
                       store_content=True)
         HBaseState(connection,
                    hbase_states_table,
                    cache_size_limit=100,
                    write_log_size=10,
                    drop_all_tables=True)
     except AlreadyExists:
         # Raise explicitly: ``assert False`` is stripped under ``python -O``
         # and would let this failure pass silently.
         raise AssertionError("failed to drop hbase tables")
Exemple #5
0
 def test_queue_with_post_request(self):
     """Check that a scheduled request comes back with method ``POST`` and body ``data``.

     ``r1`` and ``data`` are defined outside this method; the first request
     returned by the queue must carry ``b'POST'`` as its method and ``data``
     as its body.
     """
     hbase_queue = HBaseQueue(
         Connection(host='hbase-docker', port=9090),
         1, b'queue', drop=True, use_snappy=False)
     hbase_queue.schedule([('10', 0.5, r1, True)])

     fetched = hbase_queue.get_next_requests(
         10, 0, min_requests=3, min_hosts=1, max_requests_per_host=10)
     first = fetched[0]

     self.assertEqual(b'POST', first.method)
     self.assertEqual(data, first.body)
Exemple #6
0
 def test_queue_with_delay(self):
     """Check that a delayed request is released once the patched clock passes ``crawl_at``.

     ``frontera.contrib.backends.hbase.time`` is patched, so the test
     controls the time seen by the backend instead of sleeping.
     """
     hbase_queue = HBaseQueue(
         Connection(host='hbase-docker', port=9090),
         1, b'queue', use_snappy=False, drop=True)

     postponed = r3.copy()
     release_time = int(time()) + 1000
     postponed.meta[b'crawl_at'] = release_time
     hbase_queue.schedule(
         [(postponed.meta[b'fingerprint'], 0.5, postponed, True)])

     limits = dict(min_requests=3, min_hosts=1, max_requests_per_host=10)

     with mock.patch('frontera.contrib.backends.hbase.time') as patched_time:
         # Before crawl_at the queue must return nothing.
         patched_time.return_value = time()
         assert hbase_queue.get_next_requests(10, 0, **limits) == []

         # One second after crawl_at the request must be handed out.
         patched_time.return_value = release_time + 1
         released = hbase_queue.get_next_requests(10, 0, **limits)
         assert {req.url for req in released} == {postponed.url}
def queue(request):
    """Yield a queue for the backend named by ``request.param``.

    Recognised values are ``"memory"``, ``"sqlalchemy"`` and ``"hbase"``.
    The statements after each ``yield`` run when the generator is resumed.

    Raises:
        KeyError: if ``request.param`` is none of the recognised names.
    """
    backend = request.param

    if backend == "memory":
        yield MemoryQueue(2)
    elif backend == "sqlalchemy":
        engine = create_engine('sqlite:///:memory:', echo=False)
        session_factory = sessionmaker()
        session_factory.configure(bind=engine)
        QueueModel.__table__.create(bind=engine)
        backend_queue = SQLAlchemyQueue(session_factory, QueueModel, 2)
        yield backend_queue
        # Teardown: stop the queue first, then release the engine.
        backend_queue.frontier_stop()
        engine.dispose()
    elif backend == "hbase":
        backend_queue = HBaseQueue(get_hbase_connection(), 2, b'queue')
        yield backend_queue
        backend_queue.frontier_stop()
    else:
        raise KeyError("Unknown backend param")
def queue(request):
    """Yield a queue for the backend named by ``request.param``.

    Recognised values are ``"memory"``, ``"sqlalchemy"`` and ``"hbase"``.
    The statements after each ``yield`` run when the generator is resumed.

    Raises:
        KeyError: if ``request.param`` is none of the recognised names.
    """
    backend = request.param

    if backend == "memory":
        yield MemoryQueue(2)
    elif backend == "sqlalchemy":
        engine = create_engine('sqlite:///:memory:', echo=False)
        session_factory = sessionmaker()
        session_factory.configure(bind=engine)
        QueueModel.__table__.create(bind=engine)
        backend_queue = SQLAlchemyQueue(session_factory, QueueModel, 2)
        yield backend_queue
        # Teardown: stop the queue first, then release the engine.
        backend_queue.frontier_stop()
        engine.dispose()
    elif backend == "hbase":
        backend_queue = HBaseQueue(get_hbase_connection(), 2, b'queue')
        yield backend_queue
        backend_queue.frontier_stop()
    else:
        raise KeyError("Unknown backend param")
Exemple #9
0
 def test_queue(self):
     """Schedule a three-item batch and check the URLs returned by two fetches.

     The second positional argument of ``get_next_requests`` (0, then 1) is
     presumably a partition id -- confirm against ``HBaseQueue``. The first
     fetch must yield only ``r3``; the second must yield ``r1`` and ``r2``.
     """
     hbase_queue = HBaseQueue(
         Connection(host='hbase-docker', port=9090), 2, b'queue', True)
     hbase_queue.schedule([
         ('10', 0.5, r1, True),
         ('11', 0.6, r2, True),
         ('12', 0.7, r3, True),
     ])
     limits = dict(min_requests=3, min_hosts=1, max_requests_per_host=10)

     first_fetch = hbase_queue.get_next_requests(10, 0, **limits)
     assert {req.url for req in first_fetch} == {r3.url}

     second_fetch = hbase_queue.get_next_requests(10, 1, **limits)
     assert {req.url for req in second_fetch} == {r1.url, r2.url}
Exemple #10
0
 def test_queue_with_delay(self):
     """Check that a request with a future ``crawl_at`` is withheld, then returned.

     The request is stamped one second ahead of the current time. The first
     fetch must come back empty; after a real 1.5 s wait the same request
     must be returned.
     """
     hbase_queue = HBaseQueue(
         Connection(host='hbase-docker', port=9090), 1, b'queue', True)

     postponed = r3.copy()
     postponed.meta[b'crawl_at'] = int(time()) + 1
     hbase_queue.schedule(
         [(postponed.meta[b'fingerprint'], 0.5, postponed, True)])

     limits = dict(min_requests=3, min_hosts=1, max_requests_per_host=10)

     # crawl_at is still in the future, so nothing may be returned.
     assert hbase_queue.get_next_requests(10, 0, **limits) == []

     # Wall-clock wait until crawl_at has passed.
     sleep(1.5)
     released = hbase_queue.get_next_requests(10, 0, **limits)
     assert {req.url for req in released} == {postponed.url}