def test_prefix():
    """Verify table-name derivation with and without a table prefix."""
    # Names built through the shared module-level connection are expected to
    # be TABLE_PREFIX, an underscore, then the requested name.
    assert TABLE_PREFIX + b'_' == connection._table_name('')
    assert TABLE_PREFIX + b'_foo' == connection._table_name('foo')

    prefixed_table = connection.table('foobar')
    assert prefixed_table.name == TABLE_PREFIX + b'_foobar'

    # use_prefix=False is expected to yield the bare name.
    bare_table = connection.table('foobar', use_prefix=False)
    assert bare_table.name == b'foobar'

    # A connection created without a prefix leaves the name as given.
    plain = Connection(autoconnect=False)
    assert b'foo' == plain._table_name('foo')

    # An int prefix and a float separator must both raise TypeError.
    for bad_option in ({'table_prefix': 123},
                       {'table_prefix_separator': 2.1}):
        with assert_raises(TypeError):
            Connection(autoconnect=False, **bad_option)
def test_prefix():
    """Verify table-name derivation with and without a table prefix."""
    # Names built through the shared module-level connection are expected to
    # be TABLE_PREFIX, an underscore, then the requested name.
    for suffix in ('', 'foo'):
        assert_equal(TABLE_PREFIX + '_' + suffix,
                     connection._table_name(suffix))

    prefixed_table = connection.table('foobar')
    assert_equal(prefixed_table.name, TABLE_PREFIX + '_foobar')

    # use_prefix=False is expected to yield the bare name.
    bare_table = connection.table('foobar', use_prefix=False)
    assert_equal(bare_table.name, 'foobar')

    # A connection created without a prefix leaves the name as given.
    plain = Connection(autoconnect=False)
    assert_equal('foo', plain._table_name('foo'))

    # An int prefix and a float separator must both raise TypeError.
    for bad_option in ({'table_prefix': 123},
                       {'table_prefix_separator': 2.1}):
        with assert_raises(TypeError):
            Connection(autoconnect=False, **bad_option)
def test_state(self):
    # Walks HBaseState through set_states / update_cache / flush / fetch and
    # checks the cache contents and the pending mutation count at each step.
    connection = Connection(host='hbase-docker', port=9090)
    state = HBaseState(
        connection,
        b'states',
        cache_size_limit=300000,
        write_log_size=5000,
        drop_all_tables=True,
    )
    cache_keys = (b'10', b'11', b'12')

    # Nothing is known yet, so every request is expected to be NOT_CRAWLED.
    state.set_states([r1, r2, r3])
    observed = [req.meta[b'state'] for req in [r1, r2, r3]]
    assert observed == [States.NOT_CRAWLED] * 3

    state.update_cache([r1, r2, r3])
    assert dict(state._state_cache) == dict.fromkeys(
        cache_keys, States.NOT_CRAWLED)
    assert state._state_batch._mutation_count == 3

    # A second update with new states adds three more pending mutations.
    for req in (r1, r2, r3):
        req.meta[b'state'] = States.CRAWLED
    state.update_cache([r1, r2, r3])
    assert state._state_batch._mutation_count == 6

    state.flush()
    assert state._state_batch._mutation_count == 0

    state.fetch(list(cache_keys))
    assert dict(state._state_cache) == dict.fromkeys(
        cache_keys, States.CRAWLED)

    # set_states is expected to replace r4's locally assigned ERROR state
    # with CRAWLED.
    r4.meta[b'state'] = States.ERROR
    state.set_states([r1, r2, r4])
    assert r4.meta[b'state'] == States.CRAWLED

    state.flush()
    assert state._state_batch._mutation_count == 0
def test_queue_with_delay(self):
    # A request carrying a future ``crawl_at`` must stay in the queue until
    # the (mocked) clock passes that timestamp.
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 1, b'queue', use_snappy=False, drop=True)

    delayed = r3.copy()
    crawl_at = int(time()) + 1000
    delayed.meta[b'crawl_at'] = crawl_at
    queue.schedule([(delayed.meta[b'fingerprint'], 0.5, delayed, True)])

    limits = dict(min_requests=3, min_hosts=1, max_requests_per_host=10)
    with mock.patch('frontera.contrib.backends.hbase.time') as mocked_time:
        # Clock at "now": the request is not due yet.
        mocked_time.return_value = time()
        assert queue.get_next_requests(10, 0, **limits) == []

        # Clock just past crawl_at: the request must be handed out.
        mocked_time.return_value = crawl_at + 1
        due = queue.get_next_requests(10, 0, **limits)
        assert {req.url for req in due} == {delayed.url}
def test_drop_all_tables_when_table_name_is_str(self):
    # The component constructors must be able to drop and recreate their
    # tables when the table names are given as ``str``.
    connection = Connection(host='hbase-docker', port=9090)

    # Start from an empty HBase instance.
    for existing in connection.tables():
        connection.delete_table(existing, True)

    hbase_queue_table = 'queue'
    hbase_metadata_table = 'metadata'
    hbase_states_table = 'states'
    for table_name in (hbase_queue_table,
                       hbase_metadata_table,
                       hbase_states_table):
        connection.create_table(table_name, {'f': {'max_versions': 1}})

    # Failure of test itself
    tables = connection.tables()
    assert set(tables) == {b'metadata', b'queue', b'states'}

    try:
        HBaseQueue(
            connection=connection,
            partitions=1,
            table_name=hbase_queue_table,
            use_snappy=False,
            drop=True,
        )
        HBaseMetadata(
            connection=connection,
            table_name=hbase_metadata_table,
            drop_all_tables=True,
            use_snappy=False,
            batch_size=300000,
            store_content=True,
        )
        HBaseState(
            connection,
            hbase_states_table,
            cache_size_limit=100,
            write_log_size=10,
            drop_all_tables=True,
        )
    except AlreadyExists:
        assert False, "failed to drop hbase tables"
def __init__(self, manager):
    """Read the HBase settings from ``manager`` and open the Thrift connection.

    :param manager: object exposing a ``settings`` mapping; the
        ``HBASE_THRIFT_PORT``, ``HBASE_THRIFT_HOST``, ``HBASE_NAMESPACE``,
        ``BC_MIN_REQUESTS``, ``BC_MIN_HOSTS``, ``BC_MAX_REQUESTS_PER_HOST``,
        ``SPIDER_FEED_PARTITIONS`` and ``HBASE_USE_FRAMED_COMPACT`` keys
        are read from it.
    """
    self.manager = manager
    self.logger = logging.getLogger("hbase.backend")
    settings = manager.settings
    port = settings.get('HBASE_THRIFT_PORT')
    hosts = settings.get('HBASE_THRIFT_HOST')
    namespace = settings.get('HBASE_NAMESPACE')
    self._min_requests = settings.get('BC_MIN_REQUESTS')
    self._min_hosts = settings.get('BC_MIN_HOSTS')
    self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')
    self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')

    # When several hosts are configured, pick one at random.
    host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts

    # Convert once so the connection and the ``%d`` log placeholder below
    # both receive an int; a string port cannot be formatted with ``%d``.
    port = int(port)

    kwargs = {
        'host': host,
        'port': port,
        'table_prefix': namespace,
        'table_prefix_separator': ':',
        'timeout': 60000,
    }
    if settings.get('HBASE_USE_FRAMED_COMPACT'):
        kwargs.update({
            'protocol': 'compact',
            'transport': 'framed',
        })
    self.logger.info("Connecting to %s:%d thrift server.", host, port)
    self.connection = Connection(**kwargs)

    # Not built here; initialised to None.
    self._metadata = None
    self._queue = None
    self._states = None
def test_state(self):
    # Checks the HBaseState cache contents across update, flush and fetch.
    connection = Connection(host='hbase-docker', port=9090)
    state = HBaseState(connection, b'metadata', 300000)
    cache_keys = (b'10', b'11', b'12')

    # Nothing is known yet, so every request is expected to be NOT_CRAWLED.
    state.set_states([r1, r2, r3])
    observed = [req.meta[b'state'] for req in [r1, r2, r3]]
    assert observed == [States.NOT_CRAWLED] * 3

    state.update_cache([r1, r2, r3])
    assert state._state_cache == dict.fromkeys(
        cache_keys, States.NOT_CRAWLED)

    for req in (r1, r2, r3):
        req.meta[b'state'] = States.CRAWLED
    state.update_cache([r1, r2, r3])

    # flush(True) is expected to leave the cache empty.
    state.flush(True)
    assert state._state_cache == {}

    # Fetching is expected to bring the CRAWLED states back into the cache.
    state.fetch(list(cache_keys))
    assert state._state_cache == dict.fromkeys(cache_keys, States.CRAWLED)

    # set_states is expected to replace r4's locally assigned ERROR state
    # with CRAWLED.
    r4.meta[b'state'] = States.ERROR
    state.set_states([r1, r2, r4])
    assert r4.meta[b'state'] == States.CRAWLED

    state.flush(True)
    assert state._state_cache == {}
def __init__(self, name):
    """Connect to HBase on ``localhost`` and bind the table called ``name``.

    :param name: name of the HBase table to open.
    :raises UserWarning: wrapping the ``TTransportException`` raised while
        connecting or obtaining the table.
    """
    from happybase import Connection
    from thrift.transport import TTransport

    try:
        self._conn = Connection('localhost')
        self._table = self._conn.table(name)
    # ``except X as e`` is valid on Python 2.6+ and Python 3; the former
    # ``except X, e`` spelling is a SyntaxError on Python 3.
    except TTransport.TTransportException as e:
        raise UserWarning(e)
def test_queue(self):
    # Schedules three requests on a queue built with 2 as its second
    # argument and checks which URLs come back for ids 0 and 1.
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 2, b'queue', True)
    queue.schedule([
        ('10', 0.5, r1, True),
        ('11', 0.6, r2, True),
        ('12', 0.7, r3, True),
    ])

    limits = dict(min_requests=3, min_hosts=1, max_requests_per_host=10)

    # NOTE(review): the second positional argument looks like a partition
    # id -- confirm against HBaseQueue.get_next_requests.
    first_batch = queue.get_next_requests(10, 0, **limits)
    assert {req.url for req in first_batch} == {r3.url}

    second_batch = queue.get_next_requests(10, 1, **limits)
    assert {req.url for req in second_batch} == {r1.url, r2.url}
def test_metadata(self):
    # Drives HBaseMetadata through its event methods and checks which URLs
    # end up in the ``m:url`` column of the 'metadata' table.
    connection = Connection(host='hbase-docker', port=9090)
    metadata = HBaseMetadata(
        connection, b'metadata', True, False, 300000, True)

    metadata.add_seeds([r1, r2, r3])
    resp = Response('https://www.example.com', request=r1)
    metadata.page_crawled(resp)
    metadata.links_extracted(resp.request, [r2, r3])
    metadata.request_error(r4, 'error')
    metadata.frontier_stop()

    table = connection.table('metadata')
    stored_urls = {
        to_native_str(row[b'm:url'], 'utf-8') for _, row in table.scan()
    }
    assert stored_urls == {r1.url, r2.url, r3.url}

    # Remove the rows written above.
    self.delete_rows(table, [b'10', b'11', b'12'])
def get_client_addons(self, client_id):
    """Retrieve the list of addons for the given client.

    Only the last known version of the list of addons is retrieved.

    :param client_id: client identifier; row keys are looked up as
        ``"<client_id>:<suffix>"``.
    :returns: the JSON-decoded content of ``self.column`` for the newest
        row of this client, or ``None`` when the client has no row.
    """
    key_prefix = "{}:".format(client_id)
    with contextlib.closing(Connection(self._hostname)) as connection:
        table = connection.table(self.tablename)
        # Reverse scan starting from the largest possible suffix, so the
        # first row returned is the most recent one at or below it.
        row_start = key_prefix + "99999999"
        for key, data in table.scan(row_start=row_start, limit=1,
                                    columns=[self.column_family],
                                    reverse=True):
            # The scan has no lower bound: when this client has no rows it
            # yields the nearest preceding row, which belongs to another
            # client. Never return someone else's data.
            if isinstance(key, bytes):
                key = key.decode("utf-8")
            if not key.startswith(key_prefix):
                return None
            return json.loads(data[self.column].decode("utf-8"))
    return None
def setUp(self):
    # Connects to HBase, makes sure the 'domain_metadata' table exists and
    # deletes the four rows d1..d4.
    logging.basicConfig(level=logging.DEBUG)
    self.conn = Connection(host="hbase-docker")

    table_exists = b'domain_metadata' in self.conn.tables()
    if not table_exists:
        families = {
            'm': {
                'max_versions': 1,
                'block_cache_enabled': 1,
            },
        }
        self.conn.create_table('domain_metadata', families)

    domain_table = self.conn.table('domain_metadata')
    for row_key in ('d1', 'd2', 'd3', 'd4'):
        domain_table.delete(row_key)
def setup_module():
    """Create the module-wide connection and the test table.

    Rebinds the ``connection`` and ``table`` globals.
    """
    global connection, table

    connection = Connection(**connection_kwargs)
    assert_is_not_none(connection)

    # Three column families: empty options, None, and explicit options.
    families = dict(cf1={}, cf2=None, cf3=dict(max_versions=1))
    connection.create_table(TEST_TABLE_NAME, families=families)

    table = connection.table(TEST_TABLE_NAME)
    assert_is_not_none(table)
def test_queue_with_post_request(self):
    # The request read back from the queue must have method b'POST' and a
    # body equal to the module-level ``data``.
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 1, b'queue', drop=True, use_snappy=False)
    queue.schedule([('10', 0.5, r1, True)])

    fetched = queue.get_next_requests(
        10, 0, min_requests=3, min_hosts=1, max_requests_per_host=10)
    self.assertEqual(b'POST', fetched[0].method)
    self.assertEqual(data, fetched[0].body)
def setup_module():
    """Create the module-wide connection and a fresh test table.

    Rebinds the ``connection`` and ``table`` globals.
    """
    global connection, table

    connection = Connection(**connection_kwargs)
    assert connection is not None

    # Runs before the table is created below.
    maybe_delete_table()

    # Three column families: empty options, None, and explicit options.
    families = dict(cf1={}, cf2=None, cf3=dict(max_versions=1))
    connection.create_table(TEST_TABLE_NAME, families=families)

    table = connection.table(TEST_TABLE_NAME)
    assert table is not None
def __init__(self, manager):
    """Connect to HBase and prepare the queue, state checker and metadata table.

    :param manager: object exposing ``settings`` and ``logger.backend``.
    """
    self.manager = manager
    settings = manager.settings
    port = settings.get('HBASE_THRIFT_PORT', 9090)
    hosts = settings.get('HBASE_THRIFT_HOST', 'localhost')
    namespace = settings.get('HBASE_NAMESPACE', 'crawler')
    drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES', False)
    self.queue_partitions = settings.get('HBASE_QUEUE_PARTITIONS', 4)
    self._table_name = settings.get('HBASE_METADATA_TABLE', 'metadata')

    # When several hosts are configured, pick one at random.
    if type(hosts) in (list, tuple):
        host = choice(hosts)
    else:
        host = hosts

    # protocol='compact', transport='framed'
    self.connection = Connection(
        host=host,
        port=int(port),
        table_prefix=namespace,
        table_prefix_separator=':',
    )
    self.queue = HBaseQueue(
        self.connection,
        self.queue_partitions,
        self.manager.logger.backend,
        drop=drop_all_tables,
    )
    self.state_checker = HBaseState(self.connection, self._table_name)

    # Drop the metadata table when asked to, then create it if missing.
    existing = set(self.connection.tables())
    if drop_all_tables and self._table_name in existing:
        self.connection.delete_table(self._table_name, disable=True)
        existing.remove(self._table_name)

    if self._table_name not in existing:
        families = {
            'm': {'max_versions': 5},  # 'compression': 'SNAPPY'
            's': {
                'max_versions': 1,
                'block_cache_enabled': 1,
                'bloom_filter_type': 'ROW',
                'in_memory': True,
            },
            'c': {'max_versions': 1},
        }
        self.connection.create_table(self._table_name, families)

    metadata_table = self.connection.table(self._table_name)
    self.batch = metadata_table.batch(batch_size=9216)
def test_queue_with_delay(self):
    # A request with ``crawl_at`` one second ahead must not be returned
    # immediately, but must be returned after waiting 1.5 seconds.
    connection = Connection(host='hbase-docker', port=9090)
    queue = HBaseQueue(connection, 1, b'queue', True)

    delayed = r3.copy()
    delayed.meta[b'crawl_at'] = int(time()) + 1
    queue.schedule([(delayed.meta[b'fingerprint'], 0.5, delayed, True)])

    limits = dict(min_requests=3, min_hosts=1, max_requests_per_host=10)
    assert queue.get_next_requests(10, 0, **limits) == []

    sleep(1.5)
    due = queue.get_next_requests(10, 0, **limits)
    assert {req.url for req in due} == {delayed.url}
def get_client_profile(self, client_id):
    """Return the JSON-decoded profile stored in HBase for ``client_id``.

    The row keyed by ``client_id`` is read, restricted to
    ``self.column_family``; the value of ``self.column`` is decoded as
    UTF-8 JSON. Any exception is logged and swallowed. ``None`` is returned
    when the row is empty or an exception occurred.
    """
    log_context = {"client_id": client_id}
    try:
        hbase = Connection(self.hbase_hostname)
        with contextlib.closing(hbase) as connection:
            profile_table = connection.table(self.tablename)
            row = profile_table.row(client_id, columns=[self.column_family])
            if row:
                raw_profile = row[self.column]
                return json.loads(raw_profile.decode("utf-8"))
    except Exception:
        logger.exception("Connection to HBase failed", extra=log_context)

    # Reached on an empty row as well as after a logged exception.
    logger.info("Client information not found", extra=log_context)
    return None
def put_data_into_hbase(rdd):
    """Store the records of ``rdd`` in the ``base_tweets`` HBase table.

    All records are written to a single row keyed by the current local
    time truncated to whole seconds. A record whose first element equals 0
    is written to ``tweet_count:neg``, any other record to
    ``tweet_count:pos``; the stored value is ``str`` of the record's second
    element.

    :param rdd: object whose ``collect()`` returns indexable records.
    """
    # collecting the results
    results = rdd.collect()
    # computing the exact time: this will serve as the row id
    date = str(datetime.datetime.now())[:19]

    connection = Connection(host='localhost', port=9090, autoconnect=True)
    try:
        table = connection.table(name='base_tweets')
        for data in results:
            column = 'tweet_count:neg' if data[0] == 0 else 'tweet_count:pos'
            table.put(row=date, data={column: str(data[1])})
    finally:
        # Release the Thrift connection even when a put fails.
        connection.close()
def __init__(self, manager):
    """Read the HBase settings from ``manager`` and open the connection.

    :param manager: object exposing ``settings`` and ``logger.backend``.
    """
    self.manager = manager
    self.logger = manager.logger.backend
    settings = manager.settings
    port = settings.get('HBASE_THRIFT_PORT')
    hosts = settings.get('HBASE_THRIFT_HOST')
    namespace = settings.get('HBASE_NAMESPACE')
    self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')

    # When several hosts are configured, pick one at random.
    if type(hosts) in (list, tuple):
        host = choice(hosts)
    else:
        host = hosts

    kwargs = dict(
        host=host,
        port=int(port),
        table_prefix=namespace,
        table_prefix_separator=':',
    )
    if settings.get('HBASE_USE_COMPACT_PROTOCOL'):
        kwargs['protocol'] = 'compact'
        kwargs['transport'] = 'framed'
    self.connection = Connection(**kwargs)

    # Not built here; initialised to None.
    self._metadata = None
    self._queue = None
    self._states = None
from happybase import Connection
import pprint

if __name__ == '__main__':
    # creating a connection with HBase
    hbase_connection = Connection(host='localhost', port=9090, autoconnect=True)
    try:
        # name of the table to create
        table_name = 'animals2'

        # printing out the tables in HBase
        tables = hbase_connection.tables()
        print('Current tables :')
        print(tables)

        # column families to create
        families_schema = {
            'id': dict(),  # keeping default parameters
            'features': dict(),  # keeping default parameters
        }

        # Creating a table
        hbase_connection.create_table(name=table_name, families=families_schema)

        # printing out the tables in HBase
        tables = hbase_connection.tables()
        print('Current tables :')
        print(tables)
    finally:
        # Release the Thrift connection even when a call above fails.
        hbase_connection.close()
def __init__(self):
    """Create a ``Connection`` with no arguments and keep it on ``self.conn``."""
    default_connection = Connection()
    self.conn = default_connection
def test_timeout_arg():
    """Building a Connection with a ``timeout`` argument must not raise."""
    options = {'timeout': 5000, 'autoconnect': False}
    Connection(**options)
def test_connection_compat():
    """An invalid ``compat`` value must be rejected with ValueError."""
    invalid_compat = '0.1.invalid.version'
    with assert_raises(ValueError):
        Connection(compat=invalid_compat)
def __init__(self, settings):
    """Initialise the parent class, then connect to the configured table.

    :param settings: mapping providing ``HBASE_HOST`` and ``HBASE_TABLE``.
    """
    super().__init__(settings)
    # Both keys are read before connecting, so a missing key fails first.
    host_name, table_name = settings['HBASE_HOST'], settings['HBASE_TABLE']
    self._connection = Connection(host_name)
    self._table = self._connection.table(table_name)
def get_hbase_connection():
    """Return the module-level HBase connection, creating it on first use."""
    global hbase_connection
    if hbase_connection is not None:
        return hbase_connection
    hbase_connection = Connection(host='hbase-docker', port=9090)
    return hbase_connection
def describe_proximity(window_seconds=5):
    """Sample the proximity table and return the updated figure.

    For every key prefix in the module-level ``pks``, rows between
    ``prefix + start`` and ``prefix + stop`` are scanned, where start/stop
    delimit the last ``window_seconds`` seconds. One sample is then
    appended to each of the module-level ``time``, ``avgy`` and ``miny``
    series: the current time, the mean of the per-prefix means of
    ``spatial:dr`` (NaN when no rows were found), and the number of rows
    with ``spatial:dr`` below 10.

    Args:
        window_seconds (int): Length, in seconds, of the look-back window
            ending now.

    Returns:
        Figure: a "Mean" line on the left axis and a "< 10" count line on
        the right axis, both plotted over the accumulated series.
    """
    dt = timedelta(seconds=window_seconds)
    now_ = datetime.now()
    start = (now_ - dt).strftime(dtfmt)
    stop = now_.strftime(dtfmt)

    avg_ = []
    min_ = 0
    conn = Connection(config['hbase'], port=int(config['thrift']))
    try:
        tab = conn.table(str.encode(config['prox_table']))
        for pk in pks:
            dct = dict(tab.scan(row_start=pk + start, row_stop=pk + stop))
            if dct:
                df = pd.DataFrame.from_dict(dct, orient="index").reset_index()
                df[b'spatial:dr'] = df[b'spatial:dr'].astype(float)
                avg_.append(df[b'spatial:dr'].mean())
                min_ += df[df[b'spatial:dr'] < 10].shape[0]
    finally:
        # A new connection is opened on every call; always release it.
        conn.close()

    time.append(str(now_))
    miny.append(min_)
    # No rows in the window means there is no mean to report.
    avgy.append(sum(avg_) / len(avg_) if avg_ else np.nan)

    avgline = Scatter(x=list(time), y=list(avgy), type='scatter',
                      mode='lines', name='Mean')
    minline = Scatter(x=list(time), y=list(miny), type='scatter',
                      mode='lines', name='< 10', yaxis="y2")
    layout = {
        'height': 620,
        'yaxis': {
            'title': "Average Proximity (m)",
            'side': "left",
        },
        'yaxis2': {
            'title': 'Within 10 Meters (count)',
            'side': "right",
            'overlaying': "y",
        },
    }
    return Figure(data=[avgline, minline], layout=layout)