def scanfilter(self, mmsi, table, query):
    """Scan `table` for rows belonging to vessel `mmsi` over the last 100 days.

    Row keys are assumed to be the concatenation "<mmsi><unix_timestamp>",
    so [mmsi+begin, mmsi+end) bounds the scan to the time window.

    :param mmsi: vessel identifier used as the row-key prefix
    :param table: HBase table name
    :param query: HBase filter string passed straight to Table.scan
    :return: list of rows, each a list of decoded cell values in COLUMN order
    """
    # One authoritative column list, reused for both the scan request and the
    # result extraction (the original duplicated it and drifted easily).
    columns = [
        "info:mmsi", "info:source", "motion:cog", "motion:latitude",
        "motion:longitude", "motion:rot", "motion:sog", "motion:time",
        "motion:trueHeading",
    ]
    conn = self.getHbaseConnection()
    try:
        t = happybase.Table(table, conn)
        end_ts = int(time.time())
        begin_ts = end_ts - 8640000  # 8 640 000 s = 100 days back
        row_start = "{0}{1}".format(mmsi, begin_ts)
        row_stop = "{0}{1}".format(mmsi, end_ts)
        res = []
        for _key, value in t.scan(filter=query, columns=columns,
                                  row_start=row_start, row_stop=row_stop):
            res.append([value[col.encode()].decode('utf8') for col in columns])
        return res
    finally:
        # fix: the connection was never released; mirrors scan_table's
        # conn.close() handling elsewhere in this file
        conn.close()
def __init__(self, hbase: str, table: str, filter: str, request: list = None, batch_size: int = 128, train_mode: bool = True, field=FIELD):
    """Bind this object to an HBase table over Thrift.

    :param hbase: Thrift server host to connect to.
    :param table: name of the HBase table to read.
    :param filter: HBase scan filter string (selects which day to read).
    :param request: optional list of requested fields; defaults to [].
    :param batch_size: NOTE(review): not used in this constructor — confirm it
        is consumed elsewhere in the class.
    :param train_mode: NOTE(review): also unused here — confirm.
    :param field: field template; copied so the shared default is not mutated.
    """
    self.connection = happybase.Connection(
        hbase,
        autoconnect=False,  # connect lazily; caller decides when to open
        # transport="framed",
        # protocol="compact"
    )
    # happybase ships a thread-safe connection pool that lets multiple threads
    # share and reuse already-open connections, which is very useful for
    # multithreaded applications. A thread that requests a connection receives
    # a lease granting it exclusive use of that connection; when the thread is
    # done it returns the connection to the pool for other threads to use.
    self.table = happybase.Table(table, self.connection)
    self.filter = filter  # day selector — which day to scan
    self.request = request or []
    assert isinstance(self.request, list), "request must be list!"
    # .copy() matters: mutating self.field must not alter the shared FIELD default
    self.field = field.copy()
def connect_to_hbase(hosts, table_name):
    """Connect to a randomly chosen host and return `table_name`, creating it
    (with a single default 'cf' family) if it does not exist yet."""
    chosen = random.choice(hosts)
    connection = happybase.Connection(chosen)
    connection.open()
    existing = connection.tables()
    if table_name not in existing:
        connection.create_table(table_name, {'cf': dict()})
    return happybase.Table(table_name, connection)
def create_htable(table_name):
    """Connect to a random hadoop2 host and return `table_name`, creating it
    (single default 'cf' family) when missing.

    :param table_name: HBase table to open/create
    :return: happybase.Table bound to an open connection
    """
    # fix: `xrange` is Python 2 only and this file uses Python 3 elsewhere
    # (type annotations); `range` behaves identically for this 3-element span.
    hosts = ["hadoop2-%02d.yandex.ru" % i for i in range(11, 14)]
    host = random.choice(hosts)
    conn = happybase.Connection(host)
    conn.open()
    if table_name not in conn.tables():
        conn.create_table(table_name, {'cf': dict()})
    return happybase.Table(table_name, conn)
def scan_table(self, table, row_start, row_stop, row_prefix):
    """Scan up to 1000 rows and collect per-user accuracy stats into self.l.

    Returns 1 when a full page (1000 rows) was read and the caller should
    continue paging from self.row_stop; returns 0 when done.

    :param table: HBase table name
    :param row_start: first row key of this page (inclusive)
    :param row_stop: scan upper bound (exclusive)
    :param row_prefix: restrict keys to this prefix
    """
    self.row_start = row_start
    conn = self.get_hbase_connection()
    t = happybase.Table(table, conn)
    scan = t.scan(row_start=row_start, row_stop=row_stop, row_prefix=row_prefix, limit=1000)
    # print(self.recourd_count)
    count = 0
    for key, value in scan:
        count += 1
        # record user login event
        distinct_id = str(dict(value)['i:phone'.encode()])
        if distinct_id == '':
            # rows with no phone are counted separately and skipped
            self.zero_count += 1
            continue
        self.recourd_count += 1
        # 'i:grade' is an "_"-separated list; each inner item is ":"-separated
        # with the correct count at index 1 and the attempt count at index 2.
        grade = str(dict(value)['i:grade'.encode()])
        g_list = grade.split("_")[1:-1]
        corr = 0
        num = 0
        for r in g_list:
            corr += int(r.split(":")[1])
            num += int(r.split(":")[2])
        if num == 0:
            accuracy = 0.0
        else:
            accuracy = corr / num
        properties = {
            'HuaTuOnline_exercises': float(dict(value)['i:exerciseNum'.encode()]),
            'HuaTuOnline_prediction_score': float(dict(value)['i:predictScore'.encode()]),
            'HuaTuOnline_accuracy': accuracy
        }
        # self.sa.profile_set(distinct_id, properties, is_login_id=True)
        self.l.append((distinct_id, properties))
        # remember the last key seen so the next page can resume from it
        self.row_stop = key
    # NOTE(review): this early return leaves `conn` open (leak), unlike the
    # paths below — confirm whether that is intentional.
    if self.row_stop == self.row_start:
        return 0
    if count < 1000:
        # short page: nothing more to read
        conn.close()
        return 0
    conn.close()
    return 1
def connect(table_name):
    """Open a connection to a random HBase Thrift server and return the table."""
    server = random.choice(HOSTS)
    connection = happybase.Connection(server)
    logging.debug("Connecting to HBase Thrift Server on %s", server)
    connection.open()
    logging.debug("Using table %s", table_name)
    return happybase.Table(table_name, connection)
def getUserComment_user(self, user_id):
    """Return this user's comments from 'comment_local', post-processed by
    self.revert_user.

    :param user_id: user whose comments to fetch (row keys start "<user_id>_")
    :return: whatever self.revert_user produces from the scanned rows
    """
    c = happybase.Connection(host=hbase_host, port=hbase_port)
    c.open()
    try:
        comment_table = happybase.Table('comment_local', c)
        # Row keys look like "<user_id>_<movie_id>"; match on the prefix.
        query_str = "RowFilter (=, 'substring:" + str(user_id) + "_')"
        result = list(comment_table.scan(filter=query_str, limit=1000))
    finally:
        # fix: the connection leaked whenever scan/list raised
        c.close()
    return self.revert_user(result)
def batchPut(self, table):
    '''
    Create a batch writer for bulk inserts.

    :param table: table name
    :return: a happybase Batch that auto-flushes every 10 mutations

    NOTE(review): the returned Batch stays bound to a connection that has
    already been handed back to the pool when the `with` block exits —
    verify the pool keeps that connection usable, otherwise writes through
    this batch may hit a recycled connection.
    '''
    with pool.connection() as conn:
        t = happybase.Table(table, conn)
        batch = t.batch(batch_size=10)
        return batch
def querySingleLine(self, table, rowkey):
    """Fetch one row by key.

    :param table: table name
    :param rowkey: row key
    :return: the row data as returned by happybase.Table.row
    """
    with pool.connection() as handle:
        target = happybase.Table(table, handle)
        row = target.row(rowkey)
        return row
def queryMultilLines(self, table, list):
    """Fetch several rows at once.

    :param table: table name
    :param list: row keys to fetch
    :return: dict mapping row key -> row data
    """
    with pool.connection() as handle:
        target = happybase.Table(table, handle)
        fetched = target.rows(list)
        return dict(fetched)
def singleDelete(self, table, rowkey):
    """Delete one entire row.

    :param table: table name
    :param rowkey: key of the row to remove
    """
    with pool.connection() as handle:
        happybase.Table(table, handle).delete(rowkey)
def ucf(user_id):
    """Look up the user-CF recommendation string for `user_id`.

    :param user_id: user whose row in 'recommend' is read
    :return: decoded value of the last cell in the row, or '' if the row is empty
    """
    connection = happybase.Connection(host=hbase_host, port=hbase_port)
    connection.open()
    try:
        recommend_table = happybase.Table('recommend', connection)
        row = recommend_table.row(str(user_id))
        movie_id = ''
        # Preserves the original behavior: the loop keeps only the value of
        # the last cell iterated.
        for value in row.values():
            movie_id = value.decode('utf-8')
    finally:
        # fix: the connection leaked whenever row() raised
        connection.close()
    return movie_id
def getRow(tableName, rowkey):
    """Fetch one row by key and re-encode it via changeEncode.

    :param tableName: HBase table name
    :param rowkey: row key
    :return: the re-encoded row, or the empty/None result unchanged
    """
    connect = getConn()
    connect.open()
    try:
        table = happybase.Table(tableName, connect)
        row = table.row(row=rowkey)
    finally:
        # fix: connection leaked if row() raised
        connect.close()
    # fix: `row == None or not row` was redundant (and used == instead of is);
    # `not row` already covers both None and an empty dict.
    if not row:
        return row
    return changeEncode(row)
def deleteDetailColumns(self, table, rowkey, detailColumns):
    """Delete a few specific columns of a column family from one row.

    :param table: table name
    :param rowkey: row key
    :param detailColumns: fully-qualified column names to delete
    """
    with pool.connection() as handle:
        happybase.Table(table, handle).delete(rowkey, columns=detailColumns)
def deleteColumns(self, table, rowkey, columns):
    """Delete data from several column families of one row.

    :param table: table name
    :param rowkey: row key
    :param columns: column families/columns to delete
    """
    with pool.connection() as handle:
        happybase.Table(table, handle).delete(rowkey, columns=columns)
def getRows(tableName, rowkeys):
    """Fetch several rows by key and re-encode each via changeEncode.

    :param tableName: HBase table name
    :param rowkeys: row keys to fetch
    :return: list of re-encoded rows
    """
    connect = getConn()
    connect.open()
    table = happybase.Table(tableName, connect)
    fetched = table.rows(rows=rowkeys)
    connect.close()
    return [changeEncode(entry) for entry in fetched]
def singlePut(self, table, rowkey, data):
    """Insert a single row.

    :param table: table name
    :param rowkey: row key
    :param data: mapping of column -> value to write
    """
    with pool.connection() as handle:
        happybase.Table(table, handle).put(rowkey, data=data)
def scanTable(self, table, limit=10,
              scan_filter="SingleColumnValueFilter('info', 'mmsi', =, 'substring:100704002')"):
    """Scan `table` and print every cell as "<rowkey> <column> <value>".

    Generalized: the previously hard-coded limit and filter are now
    parameters whose defaults reproduce the original behavior exactly.

    :param table: HBase table name
    :param limit: maximum number of rows to scan (default 10, as before)
    :param scan_filter: HBase filter string (default: the original mmsi filter)
    """
    conn = self.getHbaseConnection()
    t = happybase.Table(table, conn)
    # NOTE(review): the connection is not closed here (matching the original);
    # confirm getHbaseConnection hands out pooled/shared connections.
    scan = t.scan(limit=limit, filter=scan_filter)
    for key, value in scan:
        for col, cell in value.items():
            print(key.decode('utf8'), col.decode('utf8'), cell.decode('utf8'))
def batchDelete(self, table, rowkeys):
    """Delete many rows in one batched round-trip.

    :param table: table name
    :param rowkeys: iterable of row keys to delete
    """
    with pool.connection() as handle:
        target = happybase.Table(table, handle)
        with target.batch() as mutator:
            for key in rowkeys:
                mutator.delete(key)
def connect():
    """Connect to a random HBase Thrift server, creating TABLE when absent."""
    server = random.choice(HOSTS)
    connection = happybase.Connection(server)
    logging.debug("Connecting to HBase Thrift Server on %s", server)
    connection.open()
    if TABLE in connection.tables():
        logging.debug("Using table %s", TABLE)
    else:
        # First run: create the table with a single `cf` family, default settings.
        connection.create_table(TABLE, {"cf": dict()})
        logging.debug("Created table %s", TABLE)
    return happybase.Table(TABLE, connection)
def getComments_movie(self, movie_id):
    """Return up to 10 processed comments for `movie_id`, or [] on scan failure.

    :param movie_id: movie whose comments to fetch (row keys end "_<movie_id>")
    :return: first 10 items of self.revert(rows), or [] if the scan failed
    """
    c = happybase.Connection(host=hbase_host, port=hbase_port)
    c.open()
    try:
        comment_table = happybase.Table('comment', c)
        # Row keys look like "<user_id>_<movie_id>"; match on the suffix.
        query_str = "RowFilter (=, 'substring:_" + str(movie_id) + "')"
        try:
            result = list(comment_table.scan(filter=query_str, limit=1000))
        except Exception:
            # fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
            return []
        result = self.revert(result)
    finally:
        # fix: conn leaked if self.revert raised; finally covers all exits
        c.close()
    return result[0:10]
def addUserComment(self, movie_id, user_id, content, star, reviewtime):
    """Store a user's comment on a movie in 'comment_local'.

    Preserves the original contract: returns False if the user already
    commented on this movie, otherwise writes the comment and returns None.

    :param movie_id: movie being commented on
    :param user_id: commenting user (row key is "<user_id>_<movie_id>")
    :param content: comment text
    :param star: star rating
    :param reviewtime: timestamp of the review
    """
    c = happybase.Connection(host=hbase_host, port=hbase_port)
    c.open()
    try:
        comment_table = happybase.Table('comment_local', c)
        if self.hasUserComment(movie_id, user_id):
            return False
        comment_table.put(
            str(user_id) + "_" + str(movie_id), {
                "region:content": str(content),
                "region:star": str(star),
                "region:reviewtime": str(reviewtime)
            })
    finally:
        # fix: connection leaked whenever hasUserComment or put raised
        c.close()
def scanTable(self, table, row_start=None, row_stop=None, row_prefix=None):
    """Scan a table and print every (key, value) pair.

    :param table: table name
    :param row_start: first row key of the scan (inclusive), or None
    :param row_stop: row key at which to stop (exclusive), or None
    :param row_prefix: restrict the scan to keys with this prefix, or None
    """
    with pool.connection() as handle:
        target = happybase.Table(table, handle)
        for key, value in target.scan(row_start=row_start, row_stop=row_stop, row_prefix=row_prefix):
            print(key, value)
def getSimilar(self, movie_id):
    """Return MovieInfo objects similar to `movie_id` from 'movie_sim_1'.

    The HBase row holds a comma-separated list of movie ids; each id is
    resolved to a MovieInfo via the ORM.

    :param movie_id: movie to find similars for
    :return: list of MovieInfo objects (possibly empty)
    """
    c = happybase.Connection(host=hbase_host, port=hbase_port)
    c.open()
    try:
        recommend_table = happybase.Table('movie_sim_1', c)
        row = recommend_table.row(str(movie_id))
        movie_id_str = ''
        # Preserves original behavior: keeps the last cell's value only.
        for value in row.values():
            movie_id_str = value.decode('utf-8')
    finally:
        # fix: connection leaked if row() (or, previously, the ORM lookups
        # below) raised before the close at the end
        c.close()
    movie_list = []
    if movie_id_str:
        for sim_id in movie_id_str.split(','):
            movie_list.append(MovieInfo.objects.get(id=int(sim_id)))
    return movie_list
def __init__(self, host: str, table: str, filter: str, request: list = None, field=FIELD):
    """Bind this object to an HBase table over Thrift.

    :param host: Thrift server host to connect to.
    :param table: name of the HBase table to read.
    :param filter: HBase scan filter string.
    :param request: optional list of requested fields; defaults to [].
    :param field: field template; copied so the shared default is not mutated.
    """
    # 30-second socket timeout; connect lazily (autoconnect=False).
    self.connection = happybase.Connection(host, autoconnect=False, timeout=30 * 1000)
    self.table = happybase.Table(table, self.connection)
    self.filter = filter
    self.request = request or []
    assert isinstance(self.request, list), "request must be list!"
    # .copy() matters: mutating self.field must not alter the shared FIELD default
    self.field = field.copy()
def hasUserComment(self, movie_id, user_id):
    """Return True iff `user_id` has already commented on `movie_id`.

    :param movie_id: movie in question
    :param user_id: user in question
    :return: True when the composite row exists; False otherwise or on scan error
    """
    c = happybase.Connection(host=hbase_host, port=hbase_port)
    c.open()
    try:
        comment_table = happybase.Table('comment_local', c)
        # Exact match on the composite row key "<user_id>_<movie_id>".
        query_str = "RowFilter (=, 'binary:" + str(user_id) + "_" + str(
            movie_id) + "')"
        try:
            result = list(comment_table.scan(filter=query_str, limit=1000))
        except Exception:
            # fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
            return False
    finally:
        c.close()
    return len(result) > 0
def get_batch_yesterday(date=None):
    """Flask endpoint: return the past week's batch-analysis rows as JSON.

    :param date: 'yyyyMMdd' string; falls back to the request's ?date= arg.
    :return: jsonify(items=[(rowkey, {column: value})...]) or the string
        "ERROR" when the scan fails.
    """
    conn = happybase.Connection(HBASE_HOST, port=HBASE_PORT)
    conn.open()
    app.logger.info(date)
    if not date:
        date = request.args.get('date')
    datetimeDate = datetime.datetime.strptime(date, "%Y%m%d")
    # return date
    # Scan window: [date - 7 days, date), both formatted yyyyMMdd.
    pastWeek = (datetimeDate - datetime.timedelta(days=7)).strftime("%Y%m%d")
    yesterday = datetimeDate.strftime("%Y%m%d")
    # arguments formatted to yyyyMMdd
    # Row keys are "001#001#<yyyyMMdd>".
    row_start = '001#001#{}'.format(pastWeek)
    row_start_bytes = row_start.encode('utf-8')
    row_end = '001#001#{}'.format(yesterday)
    row_end_bytes = row_end.encode('utf-8')
    try:
        table = happybase.Table(BATCH_ANALYSIS_TABLE_NAME, conn)
        rows = []
        for key, data in table.scan(row_start=row_start_bytes, row_stop=row_end_bytes):
            keyStr = key.decode('utf-8')
            rowDataDict = {}
            for columnName in data:
                column = columnName.decode('utf-8')
                try:
                    # Java Bytes class converts Double to IEEE-754
                    # String is converted by utf-8
                    n = unpack(b'>d', data[columnName])
                    val = round(n[0], 2)
                except:
                    # NOTE(review): deliberate best-effort fallback — cells that
                    # aren't 8-byte doubles are treated as utf-8 strings; the
                    # bare except is overly broad but preserved here.
                    val = data[columnName].decode('utf-8')
                rowDataDict[column] = val
                # pass
            rows.append((keyStr, rowDataDict))
        app.logger.info('Retrieved data from HBase succesfully')
        return jsonify(items=rows)
    except:
        # NOTE(review): bare except hides the real failure; consider narrowing
        # and logging the exception itself.
        app.logger.error(
            "Table {} doesn't have row {}. Check with the hbase shell that you're retrieving the correct data."
            .format(BATCH_ANALYSIS_TABLE_NAME, row_start))
        return "ERROR"
    finally:
        conn.close()
def __init__(self, hbase: str, table: str, filter: str, request: list = None, batch_size: int = 128, field=FIELD):
    """Bind this object to an HBase table over Thrift.

    :param hbase: Thrift server host to connect to.
    :param table: name of the HBase table to read.
    :param filter: HBase scan filter string (selects which day to read).
    :param request: optional list of requested fields; defaults to [].
    :param batch_size: NOTE(review): not used in this constructor — confirm it
        is consumed elsewhere in the class.
    :param field: field template; copied so the shared default is not mutated.
    """
    self.connection = happybase.Connection(
        hbase,
        autoconnect=False,  # connect lazily; caller decides when to open
        # transport="framed",
        # protocol="compact"
    )
    self.table = happybase.Table(table, self.connection)
    self.filter = filter  # day selector — which day to scan
    self.request = request or []
    assert isinstance(self.request, list), "request must be list!"
    # .copy() matters: mutating self.field must not alter the shared FIELD default
    self.field = field.copy()
def get_averages_past24sim():
    """Flask endpoint: return running averages for the past simulated day.

    Streaming avgs are keyed on actual wall-clock time, so the window is
    [now - SIM_DAY_IN_MIN minutes, now) in 'yyyyMMddHHmm' row keys.

    :return: jsonify(items=[(rowkey, {column: value})...]) on success, or the
        error message string on failure.
    """
    conn = happybase.Connection(HBASE_HOST, port=HBASE_PORT)
    conn.open()
    # yyyyMMddHHmm
    date = datetime.datetime.now()
    pastSimDay = (
        date - datetime.timedelta(minutes=SIM_DAY_IN_MIN)).strftime("%Y%m%d%H%M")
    simToday = date.strftime("%Y%m%d%H%M")
    try:
        table = happybase.Table(RUNNING_AVG_TABLE_NAME, conn)
        row_start = '001#001#{}'.format(pastSimDay)
        row_start_bytes = row_start.encode('utf-8')
        row_end = '001#001#{}'.format(simToday)
        row_end_bytes = row_end.encode('utf-8')
        rows = []
        for key, data in table.scan(row_start=row_start_bytes,
                                    row_stop=row_end_bytes):
            keyStr = key.decode('utf-8')
            rowDataDict = {}
            for columnName in data:
                column = columnName.decode('utf-8')
                # Java Bytes class converts Double to IEEE-754
                # String is converted by utf-8
                n = unpack(b'>d', data[columnName])
                val = round(n[0], 2)
                rowDataDict[column] = val
            rows.append((keyStr, rowDataDict))
        app.logger.info('Retrieved data from HBase succesfully')
        return jsonify(items=rows)
    except Exception:
        # fix: was a bare `except:` (also caught SystemExit/KeyboardInterrupt)
        # fix: the message formatted BATCH_ANALYSIS_TABLE_NAME, but this
        # function scans RUNNING_AVG_TABLE_NAME — report the right table.
        errorMsg = "Table {} doesn't have row {}. Check with the hbase shell that you're retrieving the correct data.".format(
            RUNNING_AVG_TABLE_NAME, pastSimDay)
        app.logger.error(errorMsg)
        return errorMsg
    finally:
        conn.close()
}) # 互联网行为表 internetBehaviorTable if b'internetBehaviorTable' in connection.tables(): connection.delete_table('internetBehaviorTable', disable=True) connection.create_table( 'internetBehaviorTable', { 'news': dict(), 'communications': dict(), 'entertainment': dict(), 'domersticServices': dict(), 'busApp': dict(), 'toolUse': dict(), 'date': dict() }) """连接表""" basicFeaturesTable = happybase.Table('basicFeaturesTable', connection) basicFeaturesTableBat = basicFeaturesTable.batch(batch_size=1000) socialAttributesTable = happybase.Table('socialAttributesTable', connection) socialAttributesTableBat = socialAttributesTable.batch(batch_size=1000) consumptionCharacteristicsTable = happybase.Table( 'consumptionCharacteristicsTable', connection) consumptionCharacteristicsTableBat = consumptionCharacteristicsTable.batch( batch_size=1000) internetBehaviorTable = happybase.Table('internetBehaviorTable', connection) internetBehaviorTableBat = internetBehaviorTable.batch(batch_size=1000) """基础属性表 basicFeaturesTable""" data = pd.read_csv("data/basicFeaturesData.csv") for i in range(data.shape[0]): basicFeaturesTableBat.put( "%s" % data.loc[i, 'key'], {