Example #1
0
def test_prefix():
    """Exercise table-name prefixing and prefix-argument validation.

    Checks the names produced by the shared ``connection``, by a connection
    built without a prefix, and that wrongly typed prefix arguments are
    rejected with ``TypeError``.
    """
    prefixed = TABLE_PREFIX + b'_'
    assert prefixed == connection._table_name('')
    assert prefixed + b'foo' == connection._table_name('foo')

    with_prefix = connection.table('foobar')
    assert with_prefix.name == prefixed + b'foobar'
    without_prefix = connection.table('foobar', use_prefix=False)
    assert without_prefix.name == b'foobar'

    bare = Connection(autoconnect=False)
    assert b'foo' == bare._table_name('foo')

    # Each of these keyword arguments carries a value of the wrong type.
    for bad_kwargs in ({'table_prefix': 123},
                       {'table_prefix_separator': 2.1}):
        with assert_raises(TypeError):
            Connection(autoconnect=False, **bad_kwargs)
Example #2
0
def test_prefix():
    """Exercise table-name prefixing and prefix-argument validation.

    Checks the names produced by the shared ``connection``, by a connection
    built without a prefix, and that wrongly typed prefix arguments are
    rejected with ``TypeError``.
    """
    prefixed = TABLE_PREFIX + '_'
    assert_equal(prefixed, connection._table_name(''))
    assert_equal(prefixed + 'foo', connection._table_name('foo'))

    with_prefix = connection.table('foobar')
    assert_equal(with_prefix.name, prefixed + 'foobar')
    without_prefix = connection.table('foobar', use_prefix=False)
    assert_equal(without_prefix.name, 'foobar')

    bare = Connection(autoconnect=False)
    assert_equal('foo', bare._table_name('foo'))

    # Each of these keyword arguments carries a value of the wrong type.
    for bad_kwargs in ({'table_prefix': 123},
                       {'table_prefix_separator': 2.1}):
        with assert_raises(TypeError):
            Connection(autoconnect=False, **bad_kwargs)
Example #3
0
 def test_state(self):
     """Drive ``HBaseState`` through set/update/flush/fetch and check it.

     After each step the private ``_state_cache`` and the mutation count of
     ``_state_batch`` are compared against the expected values.
     """
     keys = (b'10', b'11', b'12')
     state = HBaseState(Connection(host='hbase-docker', port=9090),
                        b'states',
                        cache_size_limit=300000,
                        write_log_size=5000,
                        drop_all_tables=True)

     state.set_states([r1, r2, r3])
     observed = [req.meta[b'state'] for req in (r1, r2, r3)]
     assert observed == [States.NOT_CRAWLED] * 3

     state.update_cache([r1, r2, r3])
     assert dict(state._state_cache) == dict.fromkeys(keys,
                                                      States.NOT_CRAWLED)
     assert state._state_batch._mutation_count == 3

     for req in (r1, r2, r3):
         req.meta[b'state'] = States.CRAWLED
     state.update_cache([r1, r2, r3])
     assert state._state_batch._mutation_count == 6

     state.flush()
     assert state._state_batch._mutation_count == 0

     state.fetch(list(keys))
     assert dict(state._state_cache) == dict.fromkeys(keys, States.CRAWLED)

     # r4 is given ERROR locally; set_states is expected to replace it.
     r4.meta[b'state'] = States.ERROR
     state.set_states([r1, r2, r4])
     assert r4.meta[b'state'] == States.CRAWLED

     state.flush()
     assert state._state_batch._mutation_count == 0
Example #4
0
 def test_queue_with_delay(self):
     """Check that a request carrying ``crawl_at`` is withheld until due.

     ``time`` in ``frontera.contrib.backends.hbase`` is patched so the
     reported clock can be moved past ``crawl_at`` without sleeping.
     """
     queue = HBaseQueue(Connection(host='hbase-docker', port=9090),
                        1,
                        b'queue',
                        use_snappy=False,
                        drop=True)
     delayed = r3.copy()
     due_at = int(time()) + 1000
     delayed.meta[b'crawl_at'] = due_at
     queue.schedule([(delayed.meta[b'fingerprint'], 0.5, delayed, True)])

     limits = dict(min_requests=3, min_hosts=1, max_requests_per_host=10)
     with mock.patch('frontera.contrib.backends.hbase.time') as mocked_time:
         # Clock still before crawl_at: nothing may be handed out.
         mocked_time.return_value = time()
         assert queue.get_next_requests(10, 0, **limits) == []

         # Clock just past crawl_at: the delayed request is returned.
         mocked_time.return_value = due_at + 1
         ready = queue.get_next_requests(10, 0, **limits)
         assert {req.url for req in ready} == {delayed.url}
Example #5
0
 def test_drop_all_tables_when_table_name_is_str(self):
     """Check that the HBase components cope with ``str`` table names.

     Three tables are created up front; constructing ``HBaseQueue``,
     ``HBaseMetadata`` and ``HBaseState`` with their drop flags set must not
     raise ``AlreadyExists``.
     """
     connection = Connection(host='hbase-docker', port=9090)
     # Remove whatever tables are currently listed.
     for existing in connection.tables():
         connection.delete_table(existing, True)

     hbase_queue_table = 'queue'
     hbase_metadata_table = 'metadata'
     hbase_states_table = 'states'
     for name in (hbase_queue_table, hbase_metadata_table,
                  hbase_states_table):
         connection.create_table(name, {'f': {'max_versions': 1}})

     # Failure of test itself
     assert set(connection.tables()) == {b'metadata', b'queue', b'states'}

     try:
         HBaseQueue(connection=connection,
                    partitions=1,
                    table_name=hbase_queue_table,
                    use_snappy=False,
                    drop=True)
         HBaseMetadata(connection=connection,
                       table_name=hbase_metadata_table,
                       drop_all_tables=True,
                       use_snappy=False,
                       batch_size=300000,
                       store_content=True)
         HBaseState(connection,
                    hbase_states_table,
                    cache_size_limit=100,
                    write_log_size=10,
                    drop_all_tables=True)
     except AlreadyExists:
         assert False, "failed to drop hbase tables"
Example #6
0
    def __init__(self, manager):
        """Read the HBase settings from ``manager`` and create the connection.

        Args:
            manager: object whose ``settings`` attribute provides ``get``;
                the ``HBASE_*``, ``BC_*`` and ``SPIDER_FEED_PARTITIONS``
                keys are read from it.

        Raises:
            TypeError, ValueError: if ``HBASE_THRIFT_PORT`` cannot be
                converted with ``int``.
        """
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')

        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
        # A list/tuple of hosts means "pick one"; anything else is used as is.
        host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
        # Convert once so the connection and the %d log format below both get
        # an int, even when the setting is supplied as a string.
        port = int(port)
        kwargs = {
            'host': host,
            'port': port,
            'table_prefix': namespace,
            'table_prefix_separator': ':',
            'timeout': 60000
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({
                'protocol': 'compact',
                'transport': 'framed'
            })
        self.logger.info("Connecting to %s:%d thrift server.", host, port)
        self.connection = Connection(**kwargs)
        # Placeholders; nothing in this method populates them.
        self._metadata = None
        self._queue = None
        self._states = None
Example #7
0
 def test_state(self):
     """Drive ``HBaseState`` through set/update/flush/fetch and check it.

     The private ``_state_cache`` is compared against the expected mapping
     after each step; ``flush(True)`` is expected to leave it empty.
     """
     keys = (b'10', b'11', b'12')
     state = HBaseState(Connection(host='hbase-docker', port=9090),
                        b'metadata', 300000)

     state.set_states([r1, r2, r3])
     observed = [req.meta[b'state'] for req in (r1, r2, r3)]
     assert observed == [States.NOT_CRAWLED] * 3

     state.update_cache([r1, r2, r3])
     assert state._state_cache == dict.fromkeys(keys, States.NOT_CRAWLED)

     for req in (r1, r2, r3):
         req.meta[b'state'] = States.CRAWLED
     state.update_cache([r1, r2, r3])
     state.flush(True)
     assert state._state_cache == {}

     state.fetch(list(keys))
     assert state._state_cache == dict.fromkeys(keys, States.CRAWLED)

     # r4 is given ERROR locally; set_states is expected to replace it.
     r4.meta[b'state'] = States.ERROR
     state.set_states([r1, r2, r4])
     assert r4.meta[b'state'] == States.CRAWLED

     state.flush(True)
     assert state._state_cache == {}
Example #8
0
 def __init__(self, name):
     """Connect to HBase on ``localhost`` and open the table ``name``.

     Args:
         name: table name passed to ``Connection.table``.

     Raises:
         UserWarning: wrapping a ``TTransportException`` raised while
             connecting or opening the table.
     """
     from happybase import Connection
     from thrift.transport import TTransport
     try:
         self._conn = Connection('localhost')
         self._table = self._conn.table(name)
     # ``except X, e`` is Python 2-only syntax (a SyntaxError on Python 3);
     # the ``as`` form is accepted by Python 2.6+ and Python 3 alike.
     except TTransport.TTransportException as e:
         raise UserWarning(e)
Example #9
0
 def test_queue(self):
     """Schedule three requests and check what each queue slot hands back.

     ``get_next_requests`` is called with second argument 0 and then 1
     (presumably the partition id -- confirm against ``HBaseQueue``).
     """
     queue = HBaseQueue(Connection(host='hbase-docker', port=9090),
                        2, b'queue', True)
     queue.schedule([
         ('10', 0.5, r1, True),
         ('11', 0.6, r2, True),
         ('12', 0.7, r3, True),
     ])

     limits = dict(min_requests=3, min_hosts=1, max_requests_per_host=10)
     first = queue.get_next_requests(10, 0, **limits)
     assert {req.url for req in first} == {r3.url}
     second = queue.get_next_requests(10, 1, **limits)
     assert {req.url for req in second} == {r1.url, r2.url}
Example #10
0
 def test_metadata(self):
     """Feed events to ``HBaseMetadata`` and check the URLs it stored.

     After ``frontier_stop`` the ``m:url`` column of every scanned row in
     the ``metadata`` table must match the URLs of r1, r2 and r3; the rows
     are then removed through ``self.delete_rows``.
     """
     connection = Connection(host='hbase-docker', port=9090)
     metadata = HBaseMetadata(connection, b'metadata',
                              True, False, 300000, True)
     metadata.add_seeds([r1, r2, r3])

     response = Response('https://www.example.com', request=r1)
     metadata.page_crawled(response)
     metadata.links_extracted(response.request, [r2, r3])
     metadata.request_error(r4, 'error')
     metadata.frontier_stop()

     table = connection.table('metadata')
     stored_urls = {
         to_native_str(row[b'm:url'], 'utf-8')
         for _row_key, row in table.scan()
     }
     assert stored_urls == {r1.url, r2.url, r3.url}
     self.delete_rows(table, [b'10', b'11', b'12'])
Example #11
0
    def get_client_addons(self, client_id):
        """Return the decoded addon list for ``client_id``, or ``None``.

        A reverse scan limited to one row starts at
        ``"<client_id>:99999999"``; the ``self.column`` cell of the first
        row yielded is decoded as UTF-8 JSON and returned. ``None`` is
        returned when the scan yields nothing.
        """
        with contextlib.closing(Connection(self._hostname)) as connection:
            start_key = "{}:{}".format(client_id, "99999999")
            scanner = connection.table(self.tablename).scan(
                row_start=start_key,
                limit=1,
                columns=[self.column_family],
                reverse=True)
            # At most one row is requested; return on the first one seen.
            for _row_key, row in scanner:
                return json.loads(row[self.column].decode("utf-8"))
        return None
 def setUp(self):
     """Connect to HBase and reset the ``domain_metadata`` fixture table.

     The table is created when it is not listed yet, then rows ``d1``
     through ``d4`` are deleted.
     """
     logging.basicConfig(level=logging.DEBUG)
     self.conn = Connection(host="hbase-docker")

     table_listed = b'domain_metadata' in self.conn.tables()
     if not table_listed:
         families = {'m': {'max_versions': 1, 'block_cache_enabled': 1}}
         self.conn.create_table('domain_metadata', families)

     table = self.conn.table('domain_metadata')
     for row_key in ('d1', 'd2', 'd3', 'd4'):
         table.delete(row_key)
Example #13
0
def setup_module():
    """Create the module-wide ``connection`` and ``table`` globals.

    The test table is created with three column families before it is
    opened.
    """
    global connection, table

    connection = Connection(**connection_kwargs)
    assert_is_not_none(connection)

    connection.create_table(
        TEST_TABLE_NAME,
        families={
            'cf1': {},
            'cf2': None,
            'cf3': {'max_versions': 1},
        })

    table = connection.table(TEST_TABLE_NAME)
    assert_is_not_none(table)
Example #14
0
 def test_queue_with_post_request(self):
     """Check that method and body of a scheduled request are preserved.

     The first request handed back must report ``b'POST'`` and carry the
     shared ``data`` as its body.
     """
     queue = HBaseQueue(Connection(host='hbase-docker', port=9090),
                        1,
                        b'queue',
                        drop=True,
                        use_snappy=False)
     queue.schedule([('10', 0.5, r1, True)])

     fetched = queue.get_next_requests(10,
                                       0,
                                       min_requests=3,
                                       min_hosts=1,
                                       max_requests_per_host=10)
     first = fetched[0]
     self.assertEqual(b'POST', first.method)
     self.assertEqual(data, first.body)
Example #15
0
def setup_module():
    """Create the module-wide ``connection`` and ``table`` globals.

    ``maybe_delete_table`` runs first, then the test table is created with
    three column families and opened.
    """
    global connection, table

    connection = Connection(**connection_kwargs)
    assert connection is not None

    maybe_delete_table()
    connection.create_table(
        TEST_TABLE_NAME,
        families={
            'cf1': {},
            'cf2': None,
            'cf3': {'max_versions': 1},
        })

    table = connection.table(TEST_TABLE_NAME)
    assert table is not None
    def __init__(self, manager):
        """Connect to HBase and set up the queue, state checker and table.

        Settings are read from ``manager.settings`` with the defaults shown
        below. The metadata table is optionally dropped, created when
        absent, and a write batch on it is kept as ``self.batch``.
        """
        self.manager = manager

        cfg = manager.settings
        port = cfg.get('HBASE_THRIFT_PORT', 9090)
        hosts = cfg.get('HBASE_THRIFT_HOST', 'localhost')
        namespace = cfg.get('HBASE_NAMESPACE', 'crawler')
        drop_all_tables = cfg.get('HBASE_DROP_ALL_TABLES', False)
        self.queue_partitions = cfg.get('HBASE_QUEUE_PARTITIONS', 4)
        self._table_name = cfg.get('HBASE_METADATA_TABLE', 'metadata')
        # A list/tuple of hosts means "pick one"; anything else is used as is.
        if type(hosts) in (list, tuple):
            host = choice(hosts)
        else:
            host = hosts

        # protocol='compact', transport='framed'
        self.connection = Connection(host=host,
                                     port=int(port),
                                     table_prefix=namespace,
                                     table_prefix_separator=':')
        self.queue = HBaseQueue(self.connection,
                                self.queue_partitions,
                                self.manager.logger.backend,
                                drop=drop_all_tables)
        self.state_checker = HBaseState(self.connection, self._table_name)

        # NOTE(review): if tables() yields bytes names, a str table name
        # never matches these membership tests -- confirm for the happybase
        # version in use.
        existing = set(self.connection.tables())
        if drop_all_tables and self._table_name in existing:
            self.connection.delete_table(self._table_name, disable=True)
            existing.remove(self._table_name)

        if self._table_name not in existing:
            column_families = {
                'm': {'max_versions': 5},  # 'compression': 'SNAPPY'
                's': {
                    'max_versions': 1,
                    'block_cache_enabled': 1,
                    'bloom_filter_type': 'ROW',
                    'in_memory': True,
                },
                'c': {'max_versions': 1},
            }
            self.connection.create_table(self._table_name, column_families)

        metadata_table = self.connection.table(self._table_name)
        self.batch = metadata_table.batch(batch_size=9216)
Example #17
0
 def test_queue_with_delay(self):
     """Check that a request carrying ``crawl_at`` is withheld until due.

     ``crawl_at`` is set one second ahead; the queue must return nothing
     at first and the request after sleeping 1.5 seconds.
     """
     queue = HBaseQueue(Connection(host='hbase-docker', port=9090),
                        1, b'queue', True)
     delayed = r3.copy()
     delayed.meta[b'crawl_at'] = int(time()) + 1
     queue.schedule([(delayed.meta[b'fingerprint'], 0.5, delayed, True)])

     limits = dict(min_requests=3, min_hosts=1, max_requests_per_host=10)
     assert queue.get_next_requests(10, 0, **limits) == []

     sleep(1.5)
     ready = queue.get_next_requests(10, 0, **limits)
     assert {req.url for req in ready} == {delayed.url}
Example #18
0
    def get_client_profile(self, client_id):
        """Return the decoded profile stored for ``client_id``, or ``None``.

        The row keyed by ``client_id`` is read, restricted to
        ``self.column_family``; when it is non-empty, its ``self.column``
        cell is decoded as UTF-8 JSON and returned. Any exception raised on
        the way is logged, and ``None`` is returned together with a "not
        found" log entry.
        """
        try:
            closing_connection = contextlib.closing(
                Connection(self.hbase_hostname))
            with closing_connection as connection:
                row = connection.table(self.tablename).row(
                    client_id, columns=[self.column_family])
                if row:
                    raw = row[self.column]
                    return json.loads(raw.decode("utf-8"))
        except Exception:
            logger.exception("Connection to HBase failed",
                             extra={"client_id": client_id})

        # Reached on an empty row and also after a logged failure.
        logger.info("Client information not found",
                    extra={"client_id": client_id})
        return None
def put_data_into_hbase(rdd):
    """
    Store the collected results of ``rdd`` in the ``base_tweets`` table.

    Each collected item is read as ``item[0]`` (0 selects the
    ``tweet_count:neg`` column, any other value ``tweet_count:pos``) and
    ``item[1]`` (stored as ``str``). All values are written to a single row
    keyed by the current local time truncated to seconds.

    Args:
        rdd: object whose ``collect()`` returns the items to store.
    """
    # collecting the results
    results = rdd.collect()
    # row id: first 19 characters of the timestamp, "YYYY-MM-DD HH:MM:SS"
    date = str(datetime.datetime.now())[:19]
    connection = Connection(host='localhost', port=9090, autoconnect=True)
    # Close the connection even when a put fails, so it is not leaked.
    try:
        table = connection.table(name='base_tweets')
        for data in results:
            column = 'tweet_count:neg' if data[0] == 0 else 'tweet_count:pos'
            table.put(row=date, data={column: str(data[1])})
    finally:
        connection.close()
Example #20
0
 def __init__(self, manager):
     """Read the HBase settings from ``manager`` and create the connection.

     The compact protocol with framed transport is requested only when
     ``HBASE_USE_COMPACT_PROTOCOL`` is truthy.
     """
     self.manager = manager
     self.logger = manager.logger.backend
     cfg = manager.settings
     port = cfg.get('HBASE_THRIFT_PORT')
     hosts = cfg.get('HBASE_THRIFT_HOST')
     namespace = cfg.get('HBASE_NAMESPACE')
     self.queue_partitions = cfg.get('SPIDER_FEED_PARTITIONS')
     # A list/tuple of hosts means "pick one"; anything else is used as is.
     if type(hosts) in (list, tuple):
         host = choice(hosts)
     else:
         host = hosts
     connection_args = dict(host=host,
                            port=int(port),
                            table_prefix=namespace,
                            table_prefix_separator=':')
     if cfg.get('HBASE_USE_COMPACT_PROTOCOL'):
         connection_args['protocol'] = 'compact'
         connection_args['transport'] = 'framed'
     self.connection = Connection(**connection_args)
     # Placeholders; nothing in this method populates them.
     self._metadata = None
     self._queue = None
     self._states = None
Example #21
0
from happybase import Connection
import pprint

if __name__ == '__main__':

    # creating a connection with HBase
    hbase_connection = Connection(host='localhost',
                                  port=9090,
                                  autoconnect=True)

    # Close the connection even if listing or creating tables fails.
    try:
        # name of the table to create
        table_name = 'animals2'

        # printing out the tables in HBase before the creation
        tables = hbase_connection.tables()
        print('Current tables :')
        print(tables)

        # column families to create
        families_schema = {
            'id': dict(),  # keeping default parameters
            'features': dict()  # keeping default parameters
        }
        # Creating a table
        hbase_connection.create_table(name=table_name,
                                      families=families_schema)

        # printing out the tables in HBase after the creation
        tables = hbase_connection.tables()
        print('Current tables :')
        print(tables)
    finally:
        hbase_connection.close()
Example #22
0
 def __init__(self):
     """Create a ``Connection`` with default arguments as ``self.conn``."""
     default_connection = Connection()
     self.conn = default_connection
Example #23
0
def test_timeout_arg():
    """Construct a ``Connection`` with ``timeout`` set and no autoconnect."""
    options = {'timeout': 5000, 'autoconnect': False}
    Connection(**options)
Example #24
0
def test_connection_compat():
    """An invalid ``compat`` value must make ``Connection`` raise."""
    bogus_version = '0.1.invalid.version'
    with assert_raises(ValueError):
        Connection(compat=bogus_version)
Example #25
0
 def __init__(self, settings):
     """Initialise the parent, then open the configured HBase table.

     ``settings['HBASE_HOST']`` is passed to ``Connection`` and
     ``settings['HBASE_TABLE']`` names the table kept as ``self._table``.
     """
     super().__init__(settings)
     host, table_name = settings['HBASE_HOST'], settings['HBASE_TABLE']
     self._connection = Connection(host)
     self._table = self._connection.table(table_name)
def get_hbase_connection():
    """Return the global ``hbase_connection``, creating it when ``None``."""
    global hbase_connection
    if hbase_connection is not None:
        return hbase_connection
    hbase_connection = Connection(host='hbase-docker', port=9090)
    return hbase_connection
Example #27
0
def describe_proximity(window_seconds=5):
    """
    Build a figure of mean proximity and close-range counts for a recent window.

    For every prefix in ``pks`` the rows between ``now - window_seconds``
    and ``now`` are scanned. The mean of ``spatial:dr`` is collected per
    prefix, and the rows with ``spatial:dr`` below 10 are counted across all
    prefixes. One new point is then appended to each of the module-level
    series ``time``, ``avgy`` and ``miny``, and both series are plotted.

    Args:
        window_seconds (int): Length of the look-back window, in seconds.

    Returns:
        Figure: line plot of the mean series (left axis) and the
        below-10 count series (right axis).
    """
    now_ = datetime.now()
    start = (now_ - timedelta(seconds=window_seconds)).strftime(dtfmt)
    stop = now_.strftime(dtfmt)
    avg_ = []
    min_ = 0
    conn = Connection(config['hbase'], port=int(config['thrift']))
    # This function opens a new connection on every call; always close it so
    # repeated calls do not accumulate open connections.
    try:
        tab = conn.table(str.encode(config['prox_table']))
        for pk in pks:
            dct = dict(tab.scan(row_start=pk + start, row_stop=pk + stop))
            if not dct:
                continue
            df = pd.DataFrame.from_dict(dct, orient="index").reset_index()
            df[b'spatial:dr'] = df[b'spatial:dr'].astype(float)
            avg_.append(df[b'spatial:dr'].mean())
            min_ += df[df[b'spatial:dr'] < 10].shape[0]
    finally:
        conn.close()

    time.append(str(now_))
    miny.append(min_)
    # No prefix returned rows in the window: record NaN instead of dividing
    # by zero.
    avgy.append(sum(avg_) / len(avg_) if avg_ else np.nan)

    avgline = Scatter(x=list(time),
                      y=list(avgy),
                      type='scatter',
                      mode='lines',
                      name='Mean')
    minline = Scatter(x=list(time),
                      y=list(miny),
                      type='scatter',
                      mode='lines',
                      name='< 10',
                      yaxis="y2")
    layout = {
        'height': 620,
        'yaxis': {
            'title': "Average Proximity (m)",
            'side': "left"
        },
        'yaxis2': {
            'title': 'Within 10 Meters (count)',
            'side': "right",
            'overlaying': "y"
        }
    }
    return Figure(data=[avgline, minline], layout=layout)