def puts(self, rowkey, columnFamilies, values):
    """Write several rows to ``self.dbname`` in a single batch.

    The j-th entry of every row in ``values`` is stored in the column
    ``columnFamilies[j] + ':0'``.

    :param rowkey: a list holding one row key per row of ``values``, or a
        single row key that is reused for every row
    :param columnFamilies: column family names, indexed by cell position
    :param values: 2-dimensional list of ``str`` / ``int`` cells
    :return: ``True`` on success, ``False`` when the Thrift call raised one
        of the handled HBase errors (the error is logged and printed)
    :raises TypeError: if a cell is neither ``str`` nor ``int``
    """
    # A list of keys is used as-is; a single key is repeated for every row.
    if isinstance(rowkey, list):
        rowKeys = rowkey
    else:
        rowKeys = [rowkey] * len(values)

    mutationsBatch = []
    try:
        for i, value in enumerate(values):
            mutations = []
            for j, column in enumerate(value):
                if isinstance(column, str):
                    cell = column
                elif isinstance(column, int):
                    cell = encode(column)
                else:
                    # Fail loudly instead of silently re-sending the
                    # previous cell's mutation.
                    raise TypeError(
                        'unsupported cell type %s at row %d, column %d'
                        % (type(column).__name__, i, j))
                mutations.append(
                    Hbase.Mutation(column=columnFamilies[j] + ':' + '0',
                                   value=cell))
            mutationsBatch.append(
                Hbase.BatchMutation(row=rowKeys[i], mutations=mutations))
        self.client.mutateRows(self.dbname, mutationsBatch)
        return True
    except (Hbase.IOError, Hbase.TException, Hbase.TApplicationException,
            Hbase.IllegalArgument) as e:
        logInfo('puts')
        logInfo(e)
        print(e)
        return False
def creat_table(table_name="l_test_table"):
    """Create ``table_name`` with the ``person`` and ``content`` families.

    Works on the module-level ``transport`` and ``client``. Both column
    families keep at most two versions. After creation the table names
    reported by the server are printed.

    :param table_name: name of the table to create
    """
    transport.open()
    try:
        content_1 = Hbase.ColumnDescriptor(name='person:', maxVersions=2)
        content_2 = Hbase.ColumnDescriptor(name='content:', maxVersions=2)
        client.createTable(table_name, [content_1, content_2])
        print(client.getTableNames())
    finally:
        # Close the connection even when table creation fails.
        transport.close()
def write_hbase(data, table_name, ip, server_port):
    """Write image-type records to HBase over a Thrift connection.

    Every item becomes one row: the row key is the hex MD5 digest of
    ``item['url']`` and the ``info:img_type`` column holds
    ``item['img_type']``. All rows are sent in one ``mutateRows`` call;
    nothing is sent when ``data`` is empty.

    :param data: iterable of dicts; each item must provide the ``str``
        values ``'url'`` and ``'img_type'``
    :param table_name: name of the target table (``str`` or ``bytes``)
    :param ip: host of the target Thrift server
    :param server_port: port of the target Thrift server
    """
    if not isinstance(table_name, bytes):
        table_name = bytes(table_name, encoding='utf-8')

    # Set up the Thrift connection.
    transport = TSocket.TSocket(ip, server_port)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()
    try:
        result = []
        for item in data:
            img_type = bytes(item['img_type'], encoding='utf-8')
            row_key = bytes(hashlib.md5(item['url'].encode()).hexdigest(),
                            encoding='utf-8')
            mutations = [Mutation(column=b'info:img_type', value=img_type)]
            result.append(
                Hbase.BatchMutation(row=row_key, mutations=mutations))
        if result:
            client.mutateRows(table_name, result, None)
    finally:
        # Release the socket even if building or sending the batch fails.
        transport.close()
def puts(self, rowKeys, values, qualifier='1'):
    """Put several rows into ``self.table`` in a single batch.

    The j-th cell of a row is written to
    ``self.columnFamilies[j] + ':' + qualifier``. After each row the
    numeric qualifier is incremented by one.

    :param rowKeys: a list with one row key per row, or a single row key
        that is reused for every row
    :param values: 2-dimensional list; one row is e.g. ``[name, sex, age]``
        and every cell must be ``str`` or ``int``
    :param qualifier: starting column qualifier, given as a decimal string
    :raises TypeError: if a cell is neither ``str`` nor ``int``

    Usage::

        >>> HBaseTest().puts('test', [['lee', 'f', '27'], ['clark', 'm', 27], ['dan', 'f', '27']])
    """
    if not isinstance(rowKeys, list):
        rowKeys = [rowKeys] * len(values)

    mutationsBatch = []
    for i, value in enumerate(values):
        mutations = []
        for j, column in enumerate(value):
            if isinstance(column, str):
                cell = column
            elif isinstance(column, int):
                cell = encode(column)
            else:
                # Fail loudly instead of silently re-sending the previous
                # cell's mutation.
                raise TypeError(
                    'unsupported cell type %s at row %d, column %d'
                    % (type(column).__name__, i, j))
            mutations.append(
                Hbase.Mutation(
                    column=self.columnFamilies[j] + ':' + qualifier,
                    value=cell))
        # The next row is written under the next qualifier.
        qualifier = str(int(qualifier) + 1)
        mutationsBatch.append(
            Hbase.BatchMutation(row=rowKeys[i], mutations=mutations))
    self.client.mutateRows(self.table, mutationsBatch, {})
def puts(self, rowKeys, qualifier, values):
    """Put one cell per row into ``self.table`` in a single batch.

    Every value is written to the column
    ``self.columnFamilies[0] + ':' + qualifier`` of its row.

    :param rowKeys: a list with one row key per value, or a single row key
        that is reused for every value; keys that are not ``bytes`` are
        converted with ``str()`` and UTF-8 encoded
    :param qualifier: column qualifier inside the first column family
    :param values: flat list of cells; ``str`` is UTF-8 encoded, ``int``
        goes through ``encode`` and ``bytes`` is stored unchanged
    :raises TypeError: if a value has any other type

    Usage::

        >>> HBaseTest('table').puts(rowKeys=[1,2,3],qualifier="name",values=[1,2,3])
    """
    if not isinstance(rowKeys, list):
        rowKeys = [rowKeys] * len(values)

    # The target column is the same for every row, so build it once.
    column = (self.columnFamilies[0] + ':' + qualifier).encode('utf-8')

    mutationsBatch = []
    for i, value in enumerate(values):
        if isinstance(value, str):
            cell = value.encode('utf-8')
        elif isinstance(value, int):
            cell = encode(value)
        elif isinstance(value, bytes):
            cell = value
        else:
            # Fail loudly instead of silently re-sending the previous
            # value's mutation.
            raise TypeError('unsupported value type %s at index %d'
                            % (type(value).__name__, i))

        row_key = rowKeys[i]
        if not isinstance(row_key, bytes):
            # Accept non-string keys such as the ints in the usage example.
            row_key = str(row_key).encode('utf-8')

        mutations = [Hbase.Mutation(column=column, value=cell)]
        mutationsBatch.append(
            Hbase.BatchMutation(row=row_key, mutations=mutations))
    self.client.mutateRows(self.table, mutationsBatch, {})
def push(self, table):
    """Scan a whole MongoDB collection and copy every document to HBase.

    Each document becomes one HBase row in the table named after the
    collection. The row key is ``md5(_id)[0:10] + ':' + _id`` and every
    field except ``_id`` is stored as ``data:<field>`` holding
    ``str(value)``. Progress is logged every 100000 documents to
    ``./full_sync.log``.

    :param table: source namespace in the form ``"<db>.<collection>"``
    :return: None
    """
    logger = logging.getLogger(table)
    # Attach the file handler only once per logger so that repeated calls
    # for the same table do not duplicate every log line.
    if not any(isinstance(h, RotatingFileHandler) for h in logger.handlers):
        handle = RotatingFileHandler('./full_sync.log',
                                     maxBytes=50 * 1024 * 1024,
                                     backupCount=3)
        handle.setFormatter(logging.Formatter(
            '%(asctime)s %(name)-12s %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s'))
        logger.addHandler(handle)
    logger.setLevel(logging.INFO)
    logger.info('开始推送 ' + table + ' !')

    name_parts = table.split('.')
    db_name = name_parts[0]
    table_name = name_parts[1]
    hbase_table = bytes(table_name, encoding="utf8")
    # hbase.client.keyvalue.maxsize defaults to 10M; larger values are
    # replaced by the string 'None'.
    max_value_size = 10 * 1024 * 1024

    client = pymongo.MongoClient(MONGODB_HOST, MONGODB_PORT,
                                 unicode_decode_error_handler='ignore')
    try:
        admin = client['admin']
        admin.authenticate(USER, PASSWORD)

        transport = TSocket.TSocket(THRIFT_IP, THRIFT_PORT)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        thrift_client = Hbase.Client(protocol)
        transport.open()
        try:
            count = 0
            cursor = client[db_name][table_name].find().sort(
                '$natural', pymongo.ASCENDING)
            for record in cursor:
                count += 1
                mutations = []
                # The row key is md5(_id)[0:10]:_id
                _id = str(record['_id'])
                digest = hashlib.md5(bytes(_id, encoding="utf-8")).hexdigest()
                row_key = bytes(digest[0:10] + ':' + _id, encoding="utf-8")
                for item in record:
                    if item == '_id':
                        continue
                    key = bytes('data:' + item, encoding="utf8")
                    var = bytes(str(record[item]), encoding="utf8")
                    if len(var) >= max_value_size:
                        var = bytes(str(None), encoding="utf8")
                    mutations.append(Hbase.Mutation(column=key, value=var))
                thrift_client.mutateRow(hbase_table, row_key, mutations, {})
                if count % 100000 == 0:
                    # Integer division keeps the logged count free of a
                    # trailing ".0".
                    message = (table + ' 已经读出 ' + str(count // 10000)
                               + ' 万条数据')
                    if 'create_time' in record:
                        message += ' ' + str(record['create_time'])
                    logger.info(message)
        finally:
            transport.close()
    finally:
        client.close()
def open_spider(self, spider):
    """Make sure the HBase table exists before the spider starts.

    Two column families are requested, each keeping a single version:
    ``self.cf_basic`` without expiry and ``self.cf_price`` with a
    time-to-live of 365 days (expressed in seconds).

    :param spider: the spider being opened (not used here)
    """
    one_year_in_seconds = 365 * 24 * 60 * 60
    basic_family = Hbase.ColumnDescriptor(name=self.cf_basic, maxVersions=1)
    price_family = Hbase.ColumnDescriptor(name=self.cf_price,
                                          maxVersions=1,
                                          timeToLive=one_year_in_seconds)
    self.hbase.create_table_if_not_exists((basic_family, price_family))
def put(self, rowKey, qualifier='0', *args):
    """Put a single row into ``self.table``.

    The j-th positional value is written to the column
    ``self.columnFamilies[j] + ':' + qualifier``.

    :param rowKey: key of the row to write
    :param qualifier: column qualifier used for every cell
    :param args: cell values, one per column family; ``str`` is stored
        as-is and ``int`` goes through ``encode``
    :raises TypeError: if a value is neither ``str`` nor ``int``
    """
    mutations = []
    for j, column in enumerate(args):
        if isinstance(column, str):
            cell = column
        elif isinstance(column, int):
            cell = encode(column)
        else:
            # Fail loudly instead of silently re-sending the previous
            # cell's mutation.
            raise TypeError('unsupported cell type %s at position %d'
                            % (type(column).__name__, j))
        mutations.append(
            Hbase.Mutation(column=self.columnFamilies[j] + ':' + qualifier,
                           value=cell))
    self.client.mutateRow(self.table, rowKey, mutations, {})
def write_hbase(self, data):
    """Apply one insert or delete operation to HBase.

    The row key is derived from ``data['_id']`` via
    ``self.generate_rowkey``.

    :param data: one HBase operation, for example::

            {
                # 'i' inserts, 'd' deletes (only 'i' or 'd' are handled)
                'op': 'i',
                # target HBase table
                'table_name': 'hb_charts',
                # id of the record
                '_id': '121314125_img2',
                # values of the fields to write
                'columns': {
                    'title': 'This is a title'
                }
            }
    :return: None
    """
    op = data['op']
    table_name = bytes(data['table_name'], "utf-8")
    # Both branches below need the row key, so it has to be computed here.
    row_key = bytes(self.generate_rowkey(data['_id']), "utf-8")
    columns = data.get('columns', {})
    # hbase.client.keyvalue.maxsize defaults to 10M; larger values are
    # replaced by the string 'None'.
    max_value_size = 10 * 1024 * 1024

    if op == 'i':
        mutations = []
        for item in columns:
            if item == '_id':
                continue
            key = bytes(self.cf + ':' + item, encoding="utf8")
            var = bytes(str(columns[item]), encoding="utf8")
            if len(var) < max_value_size:
                mutations.append(Hbase.Mutation(column=key, value=var))
            else:
                mutations.append(
                    Hbase.Mutation(column=key,
                                   value=bytes(str(None), encoding="utf8")))
                self.logger.warning(
                    self.getName() + ' ' + data['table_name'] + ' 的 _id为 '
                    + data['_id'] + ' 的数据的 ' + str(item)
                    + ' 字段的值大小超过了'
                    + ' HBase 默认规定的键值10M限制,先已经置 None 替代该值')
        self.client.mutateRow(table_name, row_key, mutations, {})
        self.logger.debug(str(QUEUE.qsize()) + ' 插入到 HBase ' + str(data))
    elif op == 'd':
        self.client.deleteAllRow(table_name, row_key, {})
        self.logger.debug(str(QUEUE.qsize()) + ' 删除到 HBase ' + str(data))
def write_data_to_hbase(data, col_names, table_name, ip, server_port):
    """Push the rows of an iterator to HBase in batches of 1000.

    Rows are buffered; every time the buffer holds 1000 rows it is sent
    with ``mutateRows`` and emptied, and whatever is left at the end is
    sent as a final batch. The docstring of the original code describes
    this as the worker function for a per-partition RDD call.

    :param data: iterator of dict-like rows; each row must contain
        ``"rowKey"`` and may contain any of ``col_names``
    :param col_names: names of the columns to push
    :param table_name: name of the target table (``str`` or ``bytes``)
    :param ip: host of the target Thrift server
    :param server_port: port of the target Thrift server
    :return: a list that is always empty
    """
    print("start putDataAsPartition")
    if not isinstance(table_name, bytes):
        table_name = bytes(table_name, encoding='utf-8')
    col_names = HBaseUtils().str_list_to_bytes_list(col_names)
    # Decode each column name once instead of once per row.
    columns = [(str(col_name, encoding='utf-8'), col_name)
               for col_name in col_names]

    # Set up the HBase connection.
    transport = TSocket.TSocket(ip, server_port)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()

    result = []
    return_data = []
    try:
        for line in data:
            # Collect the cells of this row into one BatchMutation.
            mutations_ = [
                Mutation(column=col_bytes,
                         value=bytes(line[col_key], encoding='utf-8'))
                for col_key, col_bytes in columns
                if col_key in line
            ]
            result.append(
                Hbase.BatchMutation(
                    row=bytes(line["rowKey"], encoding='utf-8'),
                    mutations=mutations_))
            # Push to HBase once per 1000 buffered rows.
            if len(result) >= 1000:
                client.mutateRows(table_name, result, None)
                result = []
        # Push whatever is left in the buffer.
        if result:
            client.mutateRows(table_name, result, None)
    finally:
        # Release the socket even when a batch fails.
        transport.close()
    return return_data
def __connect(self):
    """Connect to the first reachable HBase Thrift node.

    The entries of ``self.nodes`` (``"host:port"`` strings) are tried in
    order. Each node gets up to ``self.rety`` connection attempts. On
    success ``self.transport`` and ``self.client`` refer to the open
    connection.

    :raises Exception: when the last node could not be connected either
    """
    node_count = len(self.nodes)
    for index, node in enumerate(self.nodes):
        try:
            host, port = node.split(':')
            for attempt in range(self.rety):
                try:
                    transport = TSocket.TSocket(host, port)
                    transport.setTimeout(self.timeout)
                    self.transport = TTransport.TBufferedTransport(transport)
                    self.client = Hbase.Client(
                        TBinaryProtocol.TBinaryProtocol(self.transport))
                    self.transport.open()
                    break
                except Exception:
                    # Give up on this node after the last attempt.
                    if attempt + 1 >= self.rety:
                        raise Exception('cannot connect hbase, info: %s'
                                        % traceback.format_exc())
            # This node is connected; do not try the remaining ones.
            break
        except Exception as e:
            # Only the failure of the last node is fatal; earlier failures
            # fall through to the next node.
            if index + 1 >= node_count:
                raise Exception(str(e))
def main(args):
    """Print the first rows of an HBase table.

    Reads the connection settings through
    ``getConfiguration('host.properties')``, connects to ``hbaseHost`` on
    port 9090, publishes the Thrift client as the module-level ``client``
    and prints the result of ``getRowsLimit``.

    :param args: command line; ``args[1]`` is the table name and the
        optional ``args[2]`` is the number of rows (default 10)
    """
    global client
    if len(args) < 2:
        print("TableScan.py tableName No[10]")
        sys.exit(1)
    table_name = args[1]
    NO = int(args[2]) if len(args) >= 3 else 10

    getConfiguration('host.properties')
    transport = TBufferedTransport(TSocket(hbaseHost, 9090))
    transport.open()
    try:
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = Hbase.Client(protocol)
        ret = getRowsLimit(table_name, NO)
        printRowsResult(ret)
    finally:
        # Release the connection once the scan result has been printed.
        transport.close()
def __init__(self, columnn_family='data'):
    """Set up logging and open the Thrift connection to HBase.

    Log records of the ``'HBaseSync'`` logger go to ``./hbase_sync.log``
    (rotated at 50 MB, three backups).

    :param columnn_family: column family that is written to in HBase
    """
    super(HBaseSync, self).__init__()

    self.logger = logging.getLogger('HBaseSync')
    # The logger is shared by all instances, so attach the file handler
    # only once; otherwise every extra instance duplicates each log line.
    if not any(isinstance(h, RotatingFileHandler)
               for h in self.logger.handlers):
        handle = RotatingFileHandler('./hbase_sync.log',
                                     maxBytes=50 * 1024 * 1024,
                                     backupCount=3)
        handle.setFormatter(
            logging.Formatter(
                '%(asctime)s %(name)-12s %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s'
            ))
        self.logger.addHandler(handle)

    transport = TSocket.TSocket(THRIFT_IP, THRIFT_PORT)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    self.client = Hbase.Client(protocol)
    transport.open()
    # Keep a reference so the connection can be closed later.
    self.transport = transport
    self.cf = columnn_family
def __init__(self, table='test', columnFamilies=['indexData:', 'result'],
             host='localhost', port=9090):
    """Open a Thrift connection to HBase and register the column families.

    :param table: name of the table this object works on
    :param columnFamilies: column families handed to
        ``set_column_families``
    :param host: Thrift server host; when left at ``'localhost'`` the
        value of ``Utools().HOST_HBASE`` is tried first
    :param port: Thrift server port
    """
    if host == 'localhost':
        try:
            host = Utools().HOST_HBASE
        except Exception:
            print('use the default host(hbase):"localhost"')
            host = 'localhost'
    self.table = table
    self.port = port

    # Connect to HBase Thrift server (10 second socket timeout).
    socket = TSocket.TSocket(host, port)
    socket.setTimeout(1000 * 10)
    self.transport = TTransport.TBufferedTransport(socket)
    self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport)

    # Create and open the client connection
    self.client = Hbase.Client(self.protocol)
    self.transport.open()

    # Pass a copy so that the shared default list (or the caller's list)
    # is never modified through this instance.
    self.set_column_families(list(columnFamilies))
    self._build_column_families()
def scanner(self, numRows=100, startRow=None, stopRow=None):
    """Scan ``self.table`` and return all rows as dictionaries.

    Every returned dict has the row key under ``'row'`` and, for each
    column qualifier, a nested dict mapping column family to value.
    Values of families typed ``int`` in ``self.columnFamiliesType`` are
    passed through ``decode``; values of families typed ``str`` are
    returned unchanged.

    :param numRows: number of rows fetched per round trip
    :param startRow: first row key of the scan, or ``None``
    :param stopRow: row key at which the scan stops, or ``None``
    :return: list of row dictionaries
    """
    scan = Hbase.TScan(startRow, stopRow)
    scannerId = self.client.scannerOpenWithScan(self.table, scan, {})
    ret = []
    try:
        rowList = self.client.scannerGetList(scannerId, numRows)
        while rowList:
            for r in rowList:
                rd = {'row': r.row}
                for k, v in r.columns.items():
                    # Split on the first colon only: the qualifier itself
                    # may contain colons.
                    cf, qualifier = k.split(':', 1)
                    cells = rd.setdefault(qualifier, {})
                    idx = self.columnFamilies.index(cf)
                    family_type = self.columnFamiliesType[idx]
                    if family_type == str:
                        cells[cf] = v.value
                    elif family_type == int:
                        cells[cf] = decode(v.value)
                ret.append(rd)
            rowList = self.client.scannerGetList(scannerId, numRows)
    finally:
        # Always release the server-side scanner.
        self.client.scannerClose(scannerId)
    return ret
def _connect(self):
    """Open a Thrift connection to an HBase storage server.

    When the configuration defines ``HBASE_STORAGE_SERVER_HOSTS`` the host
    is picked from that list using the server port plus
    ``self.hbase_server_offset``; otherwise ``HBASE_STORAGE_SERVER_HOST``
    is used. On success the client is stored in ``self.storage``. On
    failure the error is logged and the offset is advanced by one.
    """
    config = self.context.config
    if hasattr(config, 'HBASE_STORAGE_SERVER_HOSTS'):
        hosts = config.HBASE_STORAGE_SERVER_HOSTS
        host = hosts[(self.context.server.port + self.hbase_server_offset)
                     % len(hosts)]
    else:
        host = config.HBASE_STORAGE_SERVER_HOST

    socket = TSocket(host=host, port=config.HBASE_STORAGE_SERVER_PORT)
    # Timeout is sum of HTTP timeouts, plus a bit.
    try:
        timeout = 5
        socket.setTimeout(timeout * 1000)
    except Exception:
        # Setting the timeout is best effort.
        pass

    try:
        transport = TBufferedTransport(socket)
        transport.open()
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        self.storage = Hbase.Client(protocol)
        logger.info("Connected to HBase server " + host + ":"
                    + str(config.HBASE_STORAGE_SERVER_PORT))
    except Exception:
        logger.error("Error connecting to HBase server " + host + ":"
                     + str(config.HBASE_STORAGE_SERVER_PORT))
        self.hbase_server_offset = self.hbase_server_offset + 1
def __init__(self):
    """Open a buffered Thrift connection and create the HBase client.

    Host and port are fixed in the code. The transport is opened before
    the protocol and client objects are built.
    """
    self.host, self.port = "193.169.100.33", 2181
    raw_socket = TSocket(self.host, self.port)
    self.transport = TBufferedTransport(raw_socket)
    self.transport.open()
    self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
    self.client = Hbase.Client(self.protocol)
def post(self):
    """Return the columns of one ``tabledata`` row as a dictionary.

    The row key is ``int(user_id) * 1000 + int(task_id) + 1000000``,
    built from the ``user_id`` and ``task_id`` form fields.

    :return: dict mapping each column name (with its first five
        characters removed) to the cell value as a string; empty when the
        row does not exist
    """
    task_id = request.form['task_id']
    user_id = request.form['user_id']
    print(task_id, user_id, "===============")

    transport = TSocket.TSocket('172.16.100.200', 9090)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()
    try:
        tableName = 'tabledata'
        res_uid = int(user_id) * 1000
        res_tid = int(task_id)
        res_num = res_tid + res_uid
        rowKey = str(res_num + 1000000)
        print(rowKey)
        result = client.getRow(tableName, rowKey, None)
    finally:
        # A new connection is opened per request, so always release it.
        transport.close()

    la = {}
    if result:
        for (k, v) in result[0].columns.items():
            # The key is left-justified to 20 characters and the text is
            # split on " :"; the resulting names keep part of that padding.
            kk = str("%-20s:%s" % (k, v.value))
            ll = kk.split(" :")
            la[ll[0][5:]] = str(ll[1])
    return la
def run(self):
    """Time random single-row reads from the ``users`` table.

    Performs ``int(sys.argv[2])`` calls to ``get_row`` with a random key
    between 0 and 1999, then prints the total time and the average time
    per call. The average is skipped when the count is 0.
    """
    # Address and port of the Thrift server.
    transport = TSocket.TSocket('localhost', 6666)
    # Socket timeout in milliseconds.
    transport.setTimeout(5000)
    # Transport layer (TFramedTransport or TBufferedTransport).
    trans = TTransport.TBufferedTransport(transport)
    # Wire protocol.
    protocol = TBinaryProtocol.TBinaryProtocol(trans)
    client = Hbase.Client(protocol)
    # Open the connection.
    trans.open()

    total = 0.0
    try:
        rounds = int(sys.argv[2])
        for _ in range(rounds):
            key = str(random.randint(0, 1999))
            beg = time.time()
            get_row('users', key, client)
            end = time.time()
            total += end - beg
    finally:
        trans.close()

    print('total: %s' % total)
    if rounds:
        print('avg: %s' % (total / rounds))
def find_row(self, table_name, column_family, column, column_value):
    """Yield the HBase rows whose column equals a given value.

    Opens its own Thrift connection to ``self.master_ip:self.port`` and
    scans ``table_name`` with a ``SingleColumnValueFilter``. The scanner
    and the connection are released when the generator finishes, is
    closed, or raises.

    :param table_name: name of the table to scan
    :param column_family: column family of the filtered column
    :param column: qualifier of the filtered column
    :param column_value: value the column must be equal to
    :return: generator of dicts mapping column name to cell value
    """
    transport = TSocket.TSocket(self.master_ip, self.port)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()
    try:
        scan = TScan()
        scan.filterString = bytes(
            "SingleColumnValueFilter('{cf}', '{col}', {opt}, 'binary:{val}', true, true)".format(
                cf=column_family, col=column, opt="=", val=column_value),
            encoding='utf-8')
        scanner = client.scannerOpenWithScan(
            bytes(table_name, encoding='utf-8'), scan, None)
        try:
            while True:
                r = client.scannerGet(scanner)
                if not r:
                    break
                res = {}
                for name, cell in r[0].columns.items():
                    res[name] = cell.value
                yield res
        finally:
            # Release the server-side scanner, also when the consumer stops
            # iterating early.
            client.scannerClose(scanner)
    finally:
        transport.close()
def main(args):
    """Print the HBase rows whose keys are listed in a file.

    The file name must look like ``<orgId>_<subOrgId>_...``; the rows are
    read from the table ``<orgId>_<subOrgId>_master_<orgId>``. Without
    ``-all`` only the first line of the file is looked up. The Thrift
    client is published as the module-level ``client``.

    :param args: command line; ``args[1]`` is the file name and an
        optional ``args[2] == "-all"`` processes every line
    """
    global client
    if len(args) < 2:
        print("%s <verified file> -all" % (args[0]))
        sys.exit(1)
    filename = args[1]
    opt_all = len(args) > 2 and args[2] == "-all"
    filenamearray = filename.split("_")
    orgId = filenamearray[0]
    subOrgId = filenamearray[1]

    getConfiguration('host.properties')
    transport = TBufferedTransport(TSocket(hbaseHost, 9090))
    transport.open()
    try:
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = Hbase.Client(protocol)
        tablename = "%s_%s_master_%s" % (orgId, subOrgId, orgId)
        with open(filename, "r") as verified_file:
            for line in verified_file:
                row_key = line.strip()
                row = client.getRow(tablename, row_key)
                print(row_key)
                printRow(row)
                print("")
                if not opt_all:
                    break
    finally:
        # Close the connection even when a lookup fails.
        transport.close()
def get_statuses(self, uid):
    """Yield the stored statuses of one user as plain dictionaries.

    Scans ``self.cfg['table_status']`` between ``pack_mid(uid, 0)`` and
    ``pack_mid(uid, 0x7fffffffffffffff)``. Each row is decoded with
    ``load_status``; rows without a status are skipped. A repost, when
    present, is attached under ``'retweeted_status'``. The ``'batches'``
    attribute is removed from the status and repost objects.

    :param uid: user id whose statuses are read
    :return: generator of status dictionaries
    """
    key_beg = pack_mid(uid, 0)
    key_end = pack_mid(uid, 0x7fffffffffffffff)
    scan = Hbase.TScan(startRow=key_beg, stopRow=key_end)
    client = self._get_client()
    scanner = client.scannerOpenWithScan(self.cfg['table_status'], scan, None)
    try:
        batch_size = 0
        while True:
            # The batch grows by one row with every round trip.
            batch_size += 1
            row_list = client.scannerGetList(scanner, batch_size)
            if not row_list:
                break
            for row in row_list:
                (status, repost) = load_status(row.columns)
                if status is None:
                    continue
                status.__dict__.pop('batches')
                ret = {}
                ret.update(status.__dict__)
                if repost is not None:
                    repost.__dict__.pop('batches')
                    ret['retweeted_status'] = repost.__dict__
                yield ret
    finally:
        # Release the scanner even if the consumer stops early or a row
        # fails to decode.
        client.scannerClose(scanner)
def __init__(self):
    """Connect to the Thrift server on ``student62:9090``.

    Stores the table name ``'database_test'``, the opened buffered
    transport, the binary protocol and the HBase client on the instance.
    """
    self.tableName = 'database_test'
    raw_socket = TSocket.TSocket('student62', 9090)
    self.transport = TTransport.TBufferedTransport(raw_socket)
    self.transport.open()
    self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
    self.client = Hbase.Client(self.protocol)
def __init__(self, host='', port=9090):
    """Create an HBase Thrift client and open its connection.

    :param host: Thrift server host
    :param port: Thrift server port
    """
    raw_socket = TSocket.TSocket(host=host, port=port)
    buffered = TTransport.TBufferedTransport(raw_socket)
    self.transport = buffered
    self.client = Hbase.Client(TBinaryProtocol.TBinaryProtocol(buffered))
    self.transport.open()
def write_to_hbase(result):
    """Store one record as a row of the ``film22`` table.

    Every key of ``result`` becomes the column ``f:<key>`` whose value is
    ``to_byte(result[key])``. The row key is ``to_md5(result["n"])``.

    :param result: dict of field name to value; must contain ``"n"``
    """
    socket = TSocket.TSocket('127.0.0.1', 9090)
    socket.setTimeout(5000)
    transport = TTransport.TBufferedTransport(socket)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()
    try:
        # One mutation per field of the record.
        mutations = [
            Mutation(column=("f:" + x).encode('utf-8'),
                     value=to_byte(result[x]))
            for x in result
        ]
        # Derive the row key.
        row_key = to_md5(result["n"]).encode('utf-8')
        # Write the row.
        client.mutateRow("film22".encode('utf-8'), row_key, mutations, None)
    finally:
        # A connection is opened per call, so always release it.
        transport.close()
    print(result)
    print("录入完成")
def filter_data(x):
    """Delete obsolete rows from the ``hb_charts`` HBase table.

    A row is deleted when its ``state`` is ``'3'``, ``'4'`` or ``'5'``.
    When ``QUERY_MONGODB`` is set it is also deleted if MongoDB has no
    ``hb_charts`` document whose ``_id`` equals the last ``':'``-separated
    part of the row key. Failed deletions are ignored.

    :param x: iterable of dict-like rows providing ``'key'`` and ``'state'``
    """
    host = '10.27.71.108'
    port = 9099
    transport = TTransport.TBufferedTransport(TSocket.TSocket(host, port))
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()
    try:
        user = '******'
        password = '******'
        host = 'dds-bp1d09d4b278ceb41.mongodb.rds.aliyuncs.com'
        port = 3717
        db_name = 'cr_data'
        table = 'hb_charts'
        mongo_client = MongoClient(host, port)
        try:
            db = mongo_client[db_name]
            db.authenticate(user, password)
            collection = db[table]
            for row in x:
                rowkey = row['key'].split(':')[-1]
                state = row['state']
                # MongoDB is only queried when the state alone does not
                # already decide, and only if QUERY_MONGODB is enabled.
                obsolete = state in ['3', '4', '5'] or (
                    QUERY_MONGODB
                    and collection.find({'_id': rowkey}).count() == 0)
                if obsolete:
                    try:
                        client.deleteAllRow(b'hb_charts',
                                            bytes(row['key'], 'utf-8'),
                                            attributes=None)
                    except Exception:
                        # Deletion is best effort; keep going with the
                        # remaining rows.
                        pass
        finally:
            mongo_client.close()
    finally:
        transport.close()
def main():
    """Collect user ids from the ``userrelation`` table and dump them.

    For every scanned row the first ten characters of the row key and all
    ``':'``-separated ids of the ``follower:`` column are added to the
    module-level ``idlist``. Afterwards ``idlist`` is written, one id per
    line, to ``/home/mapred/sinaid/id<unix timestamp>``.
    """
    hbasetransport = TSocket.TSocket("192.168.1.163", 9090)
    hbasetransport = TTransport.TBufferedTransport(hbasetransport)
    hbaseprotocol = TBinaryProtocol.TBinaryProtocol(hbasetransport)
    hbaseclient = Hbase.Client(hbaseprotocol)
    hbasetransport.open()
    try:
        scanner_id = hbaseclient.scannerOpen('userrelation', '', ['follower'])
        try:
            while True:
                li = hbaseclient.scannerGet(scanner_id)
                if len(li) == 0:
                    break
                for item in li:
                    idlist.add(item.row[0:10])
                    ids = item.columns['follower:'].value
                    for i in ids.split(':'):
                        idlist.add(i)

            # int() yields the same digits as the former long() and also
            # exists on Python 3.
            time_str = str(int(time.time()))
            fname = '/home/mapred/sinaid/' + 'id' + time_str
            with open(fname, 'w') as f:
                f.writelines(i + '\n' for i in idlist)
        finally:
            hbaseclient.scannerClose(scanner_id)
    finally:
        hbasetransport.close()
def setup_thrift_transport(host):
    """Open a buffered Thrift connection to ``host`` on port 9090.

    :param host: Thrift server host
    :return: tuple ``(client, transport)``; the transport is already open
        and the caller is responsible for closing it
    """
    raw_socket = TSocket.TSocket(host, 9090)
    buffered = TTransport.TBufferedTransport(raw_socket)
    hbase_client = Hbase.Client(TBinaryProtocol.TBinaryProtocol(buffered))
    buffered.open()
    return hbase_client, buffered
def display_3D_frame(request):
    """Render ``molview.html`` with the first 100 rows of the ``drug`` table.

    Row keys ``'1'`` to ``'100'`` are read; for each one the ``label`` and
    ``data`` cells are fetched and passed to the template as
    ``smiles_list``, a list of ``(label, data, row_key)`` tuples.

    :param request: the incoming HTTP request
    :return: the rendered response
    """
    # Connect to HBase Thrift server
    host = 'ai-master.sh.intel.com'
    port = 9090
    transport = TTransport.TBufferedTransport(TSocket.TSocket(host, port))
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()

    smiles_list = []
    try:
        # row key starts from 1
        for i in range(1, 101):
            row_key = str(i)
            row_label = client.get('drug', row_key, 'label')
            row_data = client.get('drug', row_key, 'data')
            smiles_list.append(
                (row_label[0].value, row_data[0].value, row_key))
    finally:
        # Release the connection even when a lookup fails.
        transport.close()

    context_var = {
        'smiles_list': smiles_list,
    }
    return render(request, 'molview.html', context=context_var)
def __create_table(self, table):
    """Create ``table`` in HBase with one descriptor per column family.

    The families are taken from ``self.columnFamilies``.

    :param table: name of the table to create
    """
    descriptors = [Hbase.ColumnDescriptor(name=family)
                   for family in self.columnFamilies]
    self.client.createTable(table, descriptors)