tables = [ "TEST", "test1", prefix + "AREA", prefix + "RETAIL", prefix + "RETAIL_WARNING", prefix + "WARNING_CODE", prefix + "DATA_INDEX", prefix + "BLOCK_DATA", prefix + "CIGA_PICTURE", prefix + "GEARS_TOSS", prefix + "CIGA_GRADE" ] hbase["table"] = tables[3] hbase["families"] = "0" # hbase["row"]="sale_center_id" # hbase["row"] = "cust_id" # hbase["table"]="V530_TOBACCO.CODE" conn = happybase.Connection(host=hbase["host"]) table = conn.table(hbase["table"]) #查询有哪些表 # tables=conn.tables() # for t in tables: # print(t.decode("utf-8")) # table=conn.table(hbase["table"]) # table = conn.table("V630_TOBACCO.GEARS_TOSS") # rows=table.scan(row_prefix=bytes("YJFL004","utf-8")) # # print(type(rows)) # for row in rows: # print(row) # table.put(row="01111430206",data={"0:COUNTY":"['渌口区']"})
import happybase as hb

conn = hb.Connection()
p = {'personal': dict(), 'professional': dict(), 'custom': dict()}
f = {'nutrition': dict(), 'taste': dict()}
conn.create_table('powers', p)
conn.create_table('food', f)
conn.close()
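# A quick follow-up sketch showing how a row could be written into the newly
# created 'powers' table; the row key and qualifiers are invented for
# illustration and are not part of the original snippet.
conn = hb.Connection()
powers = conn.table('powers')
powers.put('hero_001', {
    'personal:name': 'Diana',
    'professional:occupation': 'pilot',
    'custom:alignment': 'good',
})
print(powers.row('hero_001'))
conn.close()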
#!/opt/anaconda/envs/bd9/bin/python3
import sys

import happybase

connection = happybase.Connection('master.cluster-lab.com')
table = connection.table('sergey.zaytsev')


def main():
    for line in sys.stdin:
        # input
        uid, ts, url = line.strip().split('\t')
        # output
        table.put(uid, {'data:url': url}, timestamp=int(ts))


if __name__ == "__main__":
    main()
def init(self):
    host, port = self.args.connect.split(':')
    connection = happybase.Connection(host=host, port=int(port))
    self.table = connection.table(self.args.table)
import happybase

connection = happybase.Connection(host='hadoop_env.com', port=9090, timeout=1000000)
# conn = happybase.Connection(host=host, port=port, protocol='compact', transport='framed')
connection.open()
print connection.tables()

user = connection.table('user')
user_families = user.families()
regions = user.regions()

row_1 = user.row('1')  # get the data of row '1'
rows_1 = user.rows(['2', '3', '4'])  # get several rows at once
cells = user.cells('1', 'address:city')  # all stored versions of one cell

scanner = user.scan()
for key, data in scanner:
    print key, data

user.put('5', {'user name:first_name': 'kate', 'user name:last_name': 'jane',
               'address:city': 'chengdu', 'address:region': 'tianfu'})
user.delete('5')

batch = user.batch()
batch.put('5', {'user name:first_name': 'kate', 'user name:last_name': 'jane',
                'address:city': 'chengdu', 'address:region': 'tianfu'})
batch.delete('5')
batch.send()
# ...then these comments lose their meaning entirely and will not be taken into account by the Python interpreter.
# In that case the current python is used, i.e. the one whose path is listed in the PATH variable (or the first one
# that is found in PATH).
# If we make mapper.py executable and run
# ./mapper.py
# then the shell (bash) looks at the first line, and if it starts with "#!", it uses whatever follows it as the
# command to which the executable file is fed. So in our case, running "./mapper.py" actually makes the shell execute
# /opt/anaconda/envs/bd9/bin/python mapper.py
# In either case the second comment "#!/usr/bin/python3" is ignored and has no effect.
# If we try to run "./mapper.py" without the first "#!..." line, bash will complain that it does not know
# how to run it.
import sys
import happybase

connection = happybase.Connection('bd-node2.newprolab.com')
table = connection.table('s***n.shafronov')


def map(line):
    # Looks like something was left unfixed when tidying up: i was kept instead of line. The variable i does not exist :)
    objects = i.split('\t')
    if len(objects) != 3:
        return
    uid, timestamp, url = objects
    # It would be good to check here that uid is "a natural number written in decimal form", as the assignment says.
    # Otherwise converting it to int may raise an exception.
    if len(uid) < 11:
        return
    # Following the guidelines, the number 256 should be moved into a constant, and 25 could be made a function
    # parameter (I should have done the same in my own code :) )
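# A minimal sketch of the map function with the review notes above applied
# (line instead of the undefined i, and a check that uid really is a decimal
# number before it is used). The helper name map_line, the length parameter
# and the 'data:url' column follow the other mapper in this collection and
# are assumptions, not the original solution.
def map_line(line, min_uid_length=11):
    objects = line.strip().split('\t')
    if len(objects) != 3:
        return
    uid, timestamp, url = objects
    # uid must be a natural number written in decimal form
    if not uid.isdigit() or len(uid) < min_uid_length:
        return
    table.put(uid, {'data:url': url}, timestamp=int(timestamp))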
rawData = ['3,INFO,name,Rongcheng', '4,INFO,phone,123456']
sc.parallelize(rawData).map(
    lambda x: (x[0], x.split(','))).saveAsNewAPIHadoopDataset(
    conf=hbase_conf, keyConverter=keyConv, valueConverter=valueConv)


if __name__ == '__main__':
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)
    # test_join_func()
    # test_Row()
    # test_groupby()
    # data = ["aaafgftango", "asdasfgftango", "aaafgfhike"]
    # ns_dict = [a.split('fgf') for a in data]
    # kv_dict = {}
    # for k, v in ns_dict:
    #     if kv_dict.get(k) is None:
    #         kv_dict[k] = {"platform": [v], "score": 1}
    #     else:
    #         a = kv_dict.get(k).get("platform")
    #         a.append(v)
    #         kv_dict[k] = {"platform": a, "score": a.__len__()}
    # print kv_dict
    # insert_data = sc.parallelize([{"userid": "uid1", "name": "zy", "mail": "*****@*****.**", "phone": "8612123123",
    #                                "platform": "tango", "salt": "asd", "password": "******", "name_source": "tc",
    #                                "reg_time": "2018"},
    #                               {"userid": "uid1", "name": "zy22", "mail": "*****@*****.**", "phone": "861231231233",
    #                                "platform": "tango", "salt": "asdasf", "password": "******",
    #                                "name_source": "tl", "reg_time": "2018"},
    #                               {"userid": "uid2", "name": "qww", "mail": "*****@*****.**", "phone": "8615631231233",
    #                                "platform": "tango", "salt": "aas", "password": "******", "name_source": "zyb",
    #                                "reg_time": "2018"}])
    # df = hc.createDataFrame(insert_data)
    # test_merge_hbase_data()
    # test_hbaseinsert()
    table = happybase.Connection(host="10.200.11.35", port=19090).table("USER_TEST")
    a = table.row("0074cffb2e2fc36264fb6f7abf21abec-viber")
    for key in json.loads(a.get("INFO:NAME")).iterkeys():
        print key
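# hbase_conf, keyConv and valueConv are used at the top of the snippet above
# but never defined in this fragment. A sketch of what they commonly look like,
# following the standard Spark hbase_outputformat example (the host and table
# name are placeholders; the converter classes live in the Spark examples jar,
# which must be on the classpath):
hbase_conf = {
    "hbase.zookeeper.quorum": "10.200.11.35",
    "hbase.mapred.outputtable": "USER_TEST",
    "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
    "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
    "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable",
}
keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"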
import happybase
from kafka import KafkaClient, SimpleConsumer

# kafka_consumer = SimpleConsumer(KafkaClient('172.31.17.174:6667'), None, 'rvi')
# kafka_consumer.seek(0, 2)

hb_conn = happybase.Connection('172.31.17.174')
table = hb_conn.table('rvi')

"""
for key, data in table.scan(row_prefix="3"):
    print key, data
"""

vin = 'rsixtbmw'
row = table.row('rjsram')
print row

"""
if len(row) == 0:
    print "nothing!"
else:
    print row['user:mostrecent']
"""

# vin = '3'
# start_date = '10000000'
# end_date = '20000000'
# start_key = vin + start_date
# end_key = vin + end_date

count = 0
for key, data in table.scan(row_prefix=vin):
    count = count + 1
import happybase

# server_name = 'node2.newprolab.com'
server_name = 'horton1.ssv.home.internal'
table_name = 'sergey.sirosh'


def create_table(conn, table_name):
    # Family name is 'data'; the 'url' qualifier is chosen per put() call
    conn.create_table(table_name, {'data': dict(max_versions=4096)})


try:
    conn = happybase.Connection(server_name)
    print('Connection is')
except:
    print('Connection isNot')

create_table(conn, table_name)
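# Because the 'data' family keeps up to 4096 versions, a quick sanity check is
# to write two versions of the same cell and read them back with cells();
# the row key and URLs below are made up for illustration.
table = conn.table(table_name)
table.put('uid42', {'data:url': 'http://example.com/a'}, timestamp=1)
table.put('uid42', {'data:url': 'http://example.com/b'}, timestamp=2)
for value in table.cells('uid42', 'data:url', versions=2):
    print(value)  # newest version first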
def get_connection():
    connection = happybase.Connection(host=host)
    return connection
def write_hbase(x):
    global content_loc
    global article_family
    global article_pref
    global user_family
    global user_pref
    global table_name
    global cf1
    global title_col
    if x != None:
        print('keys:')
        print(x['id'] + ' ' + x['username'] + ' ' + x['title'])
        id = x['id']
        content = x['text']
        vec = x['vec'].copy()
        username = x['username']
        title = x['title']
        connection = happybase.Connection('0.0.0.0', port=9090)
        table = connection.table(table_name)

        # Fetch row from table
        row = table.row(article_pref + id)

        # Append contributions
        if content_loc in row:
            content = row[content_loc] + ' ' + content

        # Calculate contrib count for article
        count = 1
        if count_loc in row:
            count = str(count + int(row[count_loc]))
        else:
            count = str(count)

        # Aggregate article vector
        for word in vec:
            key = article_family + ':' + word
            if key in row:
                vec[word] = vec[word] + int(row[key])

        # Copy to new vec
        temp = {}
        for word in vec:
            temp[article_family + ':' + word] = str(vec[word])
        vec = temp

        # Put article vector + content + count + contributor
        vec[content_loc] = content
        vec[count_loc] = count
        vec[cf1 + ':' + user_pref + username] = 'true'
        vec[cf1 + ':' + title_col] = title
        table.put(article_pref + id, vec)

        # Fetch user row from table
        row = table.row(user_pref + username)

        # Aggregate user vector
        vec = x['vec'].copy()
        for word in vec:
            key = user_family + ':' + word
            if key in row:
                vec[word] = vec[word] + int(row[key])

        count = 1
        if count_loc in row:
            count = str(count + int(row[count_loc]))
        else:
            count = str(count)

        # Write user vector + count
        temp = {}
        for word in vec:
            temp[user_family + ':' + word] = str(vec[word])
        vec = temp
        vec[count_loc] = count
        vec[cf1 + ':' + article_pref + id] = 'true'
        table.put(user_pref + username, vec)
#!/usr/bin/env python3
# This script works with the 'plz.data' postal-code data provided in the lab exercise.
# Before the script is run, the database (HBase Thrift server) must be started!
# The file 'plz.data' must be in the same directory as this script.

import happybase
import json

FILE_PATH = ("plz.data")
TABLE_NAME = 'orte'

# Establish the connection
con = happybase.Connection(autoconnect=True)
con.open()
table = con.table(TABLE_NAME)
batch = table.batch()

print(table.row('71646'))

for row in table.scan(columns=[b'daten:city']):
    plz = row[0]
    city = row[1][b'daten:city']
    if city == b'HAMBURG' or city == b'BREMEN':
        batch.put(plz, {b'fussball:': b'ja'})

batch.send()

# print(table.row('71646', columns=[b'fussball:']))
def __init__(self):
    self.connection = happybase.Connection(app.config['HBASE_HOST'],
                                           app.config['HBASE_PORT'])
import happybase

from main_app.models import Projects

connection = happybase.Connection(host='172.16.14.84', port=9090)
connection.open()
table = connection.table('AI133:t_project')

city_list = ['北京', '上海', '广州', '深圳']
job_list = ['web', '爬', '数据', 'ai']
detail_list = [(i, j) for i in city_list for j in job_list]
scanner = table.scan(columns=("choosed",))


def sum_count():
    bj_web = Projects.objects.filter(city__contains='北京', title__icontains='web').count()
    bj_crawl = Projects.objects.filter(city__contains='北京', title__contains='爬').count()
    bj_data = Projects.objects.filter(city__contains='北京', title__contains='数据').count()
    bj_ai = Projects.objects.filter(city__contains='北京', title__icontains='ai').count()
    sh_web = Projects.objects.filter(city__contains='上海', title__icontains='web').count()
    sh_crawl = Projects.objects.filter(city__contains='上海', title__contains='爬').count()
    sh_data = Projects.objects.filter(city__contains='上海', title__contains='数据').count()
    sh_ai = Projects.objects.filter(city__contains='上海', title__icontains='ai').count()
    gz_web = Projects.objects.filter(city__contains='广州', title__icontains='web').count()
    gz_crawl = Projects.objects.filter(city__contains='广州', title__contains='爬').count()
    gz_data = Projects.objects.filter(city__contains='广州', title__contains='数据').count()
    gz_ai = Projects.objects.filter(city__contains='广州', title__icontains='ai').count()
    sz_web = Projects.objects.filter(city__contains='深圳', title__icontains='web').count()
    sz_crawl = Projects.objects.filter(city__contains='深圳', title__contains='爬').count()
    sz_data = Projects.objects.filter(city__contains='深圳', title__contains='数据').count()
    sz_ai = Projects.objects.filter(city__contains='深圳', title__icontains='ai').count()
    return (bj_web, bj_crawl, bj_data, bj_ai,
            sh_web, sh_crawl, sh_data, sh_ai,
            gz_web, gz_crawl, gz_data, gz_ai,
            sz_web, sz_crawl, sz_data, sz_ai)


def hbase_list():
    global detail_list
def establish_connection_with_api(self, **kwargs):
    api_connection = happybase.Connection(**kwargs)
    return api_connection
import impala.dbapi
import happybase

connect = happybase.Connection(host='hadoop3', port=9090, timeout=None,
                               autoconnect=True, table_prefix=None,
                               table_prefix_separator=b'_',
                               transport='buffered', protocol='binary')
connect.open()
families = {"cf": dict(), "df": dict()}
connect.create_table('jobdata', families)
connect.close()
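# Once 'jobdata' exists, rows can be written through a Table object. The row
# key, qualifiers and values below are placeholders that only illustrate the
# call pattern; the connection is reopened because the snippet above closes it.
connect.open()
table = connect.table('jobdata')
table.put('job_0001', {
    'cf:title': 'data engineer',
    'cf:city': 'hangzhou',
    'df:salary': '25k-35k',
})
print(table.row('job_0001'))
connect.close()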
from pywebhdfs.webhdfs import PyWebHdfsClient
import happybase
import subprocess
import time
from random import randint

HBASE_NODE = 'data2'

hdfs = PyWebHdfsClient(host='namenode', port='50070', user_name='root')
conn = happybase.Connection(HBASE_NODE)
t = conn.table('anet')

while True:
    a_net = randint(1, 255)
    ROW = t.row(str(a_net))
    if len(ROW) > 0:
        for key, value in ROW.items():
            if value != str(-1):
                START = randint(1, 255)
                continue
    t.put(str(a_net), {'data:user': '******'})
    print 'scanning the major ' + str(a_net) + '.0.0.0/8 subnet'
    for bnet in range(0, 256):
        if a_net == 10:
            continue
        elif a_net == 192 and bnet == 168:
            continue
        elif a_net == 172 and bnet == 16:
            continue
        elif a_net == 127:
            continue
        IPADDR = str(a_net) + '.' + str(bnet) + '.0.0/16'
        OFILE = str(a_net) + '-' + str(bnet) + '-p80.log'
        A = subprocess.Popen(
import happybase, sys, os, string

# VARIABLES
# Output directory for CSV files
outputDir = "/mnt"
# HBase Thrift server to connect to. Leave blank for localhost
server = ""

# Connect to server
c = happybase.Connection(server)

# Get the full list of tables
tables = c.tables()

# For each table in the tables
for table in tables:
    # Open file to write to
    file = open(outputDir + "/" + table + ".csv", "w")
    t = c.table(table)
    print table + ": ",
    count = 0
    # For each row key
    for prefix in string.printable:
        try:
            for key, data in t.scan(row_prefix=prefix):
                # First key
                if count == 0:
                    startRow = key
#!/usr/bin/env python
'''
Reducer

Purpose: To produce an inverted index and store it in an HBase database.
1) Input format: Word,Frequency,FancyHitBit,DocId
2) Row format:   Word - DocId1(Freq,FHBit)$DocId2(Freq,FHBit)$...
3) Store the output row in the HBase database

Every time the MapReduce job is run, a new column is created in the
InvertedIndex table which stores the inverted-index string of that job.
We wanted to append to the existing inverted-index string, but there was an
unknown issue with modifying existing entries in the HBase table.
'''
import fileinput
import happybase

connection = happybase.Connection('172.31.10.32')  # ip of host running the Thrift server
table = connection.table('InvertedIndex')

prev_word = ''
isFirst = True
invertedIndexString = ''


def insertInTable(word, invertedIndexString):
    # insert in InvertedIndex table
    invertedIndexString = invertedIndexString[:len(invertedIndexString) - 1]  # remove last $
    row = table.row(word)  # returns a dictionary
    postings_no = len(row.keys())
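# The snippet is cut off before the actual write. Following the behaviour the
# docstring describes (each MapReduce run adds a new column holding that run's
# posting string), the end of insertInTable might look roughly like this. The
# family name 'cf' and the 'runN' qualifier scheme are assumptions, not taken
# from the original code.
def insert_in_table_sketch(word, invertedIndexString):
    invertedIndexString = invertedIndexString.rstrip('$')  # drop trailing separator
    row = table.row(word)                                  # columns written so far
    postings_no = len(row.keys())                          # number of previous runs
    column = 'cf:run%d' % postings_no                      # new column for this run
    table.put(word, {column: invertedIndexString})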
import argparse
import atexit

import happybase
from kafka import KafkaConsumer

parser = argparse.ArgumentParser()
parser.add_argument('topic_name')
parser.add_argument('kafka_broker')
parser.add_argument('data_table')
parser.add_argument('hbase_host')

# Parse args
args = parser.parse_args()
topic_name = args.topic_name
kafka_broker = args.kafka_broker
data_table = args.data_table
hbase_host = args.hbase_host

# Initiate a simple kafka consumer
kafka_consumer = KafkaConsumer(topic_name, bootstrap_servers=kafka_broker)

# Initiate a hbase connection
hbase_connection = happybase.Connection(hbase_host)

# Create table if not exists
hbase_tables = [table.decode() for table in hbase_connection.tables()]
if data_table not in hbase_tables:
    hbase_connection.create_table(data_table, {'family': dict()})

# Set up proper shutdown hook
atexit.register(shutdown_hook, kafka_consumer, hbase_connection)

# Start consuming kafka and writing to hbase
for msg in kafka_consumer:
    persist_data(msg.value, hbase_connection, data_table)
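# persist_data and shutdown_hook are referenced above but not defined in this
# fragment. Minimal sketches of what they might look like: the 'family' column
# family matches the one created above, while the row-key choice and the JSON
# payload shape are assumptions for illustration.
import json

def persist_data(data, hbase_connection, data_table):
    """Write one Kafka message (assumed to be a JSON document) into HBase."""
    try:
        payload = json.loads(data)
        row_key = str(payload.get('id', ''))
        columns = {'family:%s' % field: str(value)
                   for field, value in payload.items()}
        hbase_connection.table(data_table).put(row_key, columns)
    except Exception as error:
        print('Failed to persist message: %s' % error)

def shutdown_hook(kafka_consumer, hbase_connection):
    """Close the Kafka consumer and the HBase connection on exit."""
    try:
        kafka_consumer.close()
    finally:
        hbase_connection.close()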
def __init__(self):
    # table_prefix='stock' with separator ':' means the physical HBase table
    # behind 'stock_daily' is 'stock:stock_daily'
    self.connection = happybase.Connection(host='localhost', port=9090,
                                           table_prefix='stock',
                                           table_prefix_separator=':')
    self.table = self.connection.table('stock_daily')
__author__ = 'Maykungth'
# 18/8/2558
import happybase

Master2 = '172.30.224.142'
con = happybase.Connection(Master2)
con.open()
alltable = con.tables()
creatingTable = True

# Reset / delete tables
# con.delete_table('MetaTable', disable=True)
# con.delete_table('EncTable', disable=True)

# Creating HBase schema
# if creatingTable:
if 'MetaTable' not in alltable and 'EncTable' not in alltable:
    # Create tables and column families
    print "Creating table : " + 'MetaTable'
    con.create_table(
        'MetaTable', {
            'pp': dict(max_versions=1,
                       bloom_filter_type='ROW',
                       block_cache_enabled=True)
        })
    print "Creating table : " + 'EncTable'
    con.create_table(
        'EncTable', {
            'enc': dict(max_versions=1,
                        bloom_filter_type='ROW',
def _new_hbase_table_connection(self) -> "happybase.Table":
    return happybase.Connection(
        self.hbase_address, timeout=self.timeout).table(self.hbase_table)
import happybase as hbase

hb_conn = hbase.Connection('localhost', table_prefix='wda')


##############################################################
def create_table(table_name, families):
    is_table_exists = False
    try:
        is_table_exists = hb_conn.is_table_enabled(table_name)
    except:
        is_table_exists = False

    if is_table_exists == False:
        hb_conn.create_table(table_name, families)
        print 'Table ' + table_name + ' created successfully !!'
        return True
    else:
        print 'Table ' + table_name + ' exists !!'
        return False


##############################################################
def delete_table(table_name):
    is_table_exists = True
    try:
        is_table_exists = hb_conn.is_table_enabled(table_name)
        if is_table_exists:
            hb_conn.disable_table(table_name)
            hb_conn.delete_table(table_name)
    except:
        is_table_exists = False

    if is_table_exists == False:
        print 'Table ' + table_name + ' deleted successfully !!'
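# A short usage sketch for the two helpers above; the table name and the
# family definition are illustrative only (with table_prefix='wda' the
# physical table ends up being named 'wda_events').
families = {'cf': dict(max_versions=3)}

if create_table('events', families):
    events = hb_conn.table('events')
    events.put('row1', {'cf:status': 'ok'})
    print(events.row('row1'))

delete_table('events')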
from jsonrpc.authproxy import AuthServiceProxy
import sys, string, getpass, time, datetime
import happybase
import pprint

# rpcuser = "******"
# rpcpass = "******"
# rpcip = "127.0.0.1"

hbase = happybase.Connection('localhost')
# hbase_blocks_table = hbase.table('block_data')
# hbase_live_transactions_table = hbase.table('realtime_transactions')
settings_table = hbase.table('settings')

# settings_table.put('row1234', {"metadata:time": "213124124"})
settings = settings_table.row('row1234')
pprint.pprint(settings)

# hbase_transactions_table = hbase.table('realtime_transactions')
# results = hbase_transactions_table.scan(filter=b"SingleColumnValueFilter('metadata','timestamp',>, 'int:124124')")
# results = hbase_transactions_table.scan(filter=b"KeyOnlyFilter() AND FirstKeyOnlyFilter()")
# results = hbase_live_transactions_table.scan(filter=b"SingleColumnValueFilter('metadata','status',=, 'binary:Error loading block')")
# KeyOnlyFilter() AND FirstKeyOnlyFilter()
# row_start=b'1', row_stop=b'116010',
# live = [{key: data} for data in results]
# pprint.pprint(live)

# full_list = sorted([str(key) for key in range(1, 116010)])
# coding=UTF-8
'''
__author__ = 'Ivy'
created on 2016.3.1
'''
import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
import happybase
from collections import OrderedDict

conn = happybase.Connection('192.168.168.41')
conn.open()
print conn.tables()

table = conn.table('commentTable')
row = table.row('row1')
print row['testColumn:date']

ind = 0
for key, data in table.scan():
    ind += 1
    print '1', key, data
    print '2', data['testColumn:date']
    print '3', data['testColumn:weiboId']
print "total_rows", ind

rows = table.rows(['row1', 'row2'])
for key, data in rows:
    print 'hey', key, data
rows_as_dict = dict(table.rows(['row1', 'row2']))
def __init__(self): self.connection = happybase.Connection(host="192.168.106.129", port=9090) self.table = self.connection.table(b'lasttest5') # TODO 上传前修改 self.do = Down_info()
def main():
    # TODO: Fix this wait
    # sleep until hbase and kafka are up
    time.sleep(60)

    useBeam = True

    # TODO: check first if the kafka topic exists?
    # TODO: check kafka connection
    # TODO: include kafka group ID
    if useBeam:
        # TODO: Check HBase connection
        # Check if the table exists and create it otherwise
        conn = hb.Connection(hbHost, hbPort)
        if hbTableName.encode('utf-8') not in conn.tables():
            conn.create_table(hbTableName, hbFamilies)
        conn.close()

        # Define kafka configuration
        kafka_config = {
            "topic": kafkaTopic,
            "bootstrap_servers": kafkaServers
        }  # , "group_id": kafkaGrId

        # Streaming pipelines
        with beam.Pipeline(options=PipelineOptions()) as p:
            # 3 pipelines: Metadata&Subject, Content&Label, WordCount
            inputTuples = p | "Reading messages from Kafka" >> kafkaio.KafkaConsume(
                kafka_config)
            content = (inputTuples
                       | "Extract content" >> beam.Map(extract_mailContent))
            # TODO: filter empty content mails???? | "filter empty content" >> beam.Filter(is_ContentNotEmpty))
            classifiedContent = content | "Classify as SPAM/HAM and store" >> beam.Map(
                classifyMail)
            wordC = (
                content
                | "Clean content" >> beam.Map(cleanContent)
                # TODO: word count exploiting beam (window strategy?)
                # | 'Fixed-size windows' >> beam.WindowInto()
                # | "Word" >> .....
                # | "Count" >> beam.combiners.Count.PerElement()
                | "Count and store" >> beam.Map(countWordsContent))
            metadata = (inputTuples
                        | "Extract metadata" >> beam.Map(extract_mailMetadata)
                        | "Extract subject and store" >> beam.Map(extract_subjectMetadata))
            # | 'Writing to stdout' >> beam.Map(print))
    else:
        # Create Kafka consumer
        consumer = KafkaConsumer(
            kafkaTopic,
            bootstrap_servers=kafkaServers)  # group_id = kafkaGrId

        # Receive and store kafka data
        dataCollected = []
        for message in consumer:
            dataCollected.append((message.key, message.value))
            print(message.key)
import happybase
import epics
import time
import sys
import os

pv1 = epics.PV('hadoop1:ai1')
connection = happybase.Connection('hadoop1')
table = connection.table('PVSimulator_test')  # , PVSimulator_test


def onChanges(pvname=None, value=None, timestamp=None, status=None,
              severity=None, type=None, **kw):
    print pvname, value, timestamp, status, severity, type
    table.put(pvname + '_' + str(timestamp),
              {'PV:val': str(value), 'PV:status': str(status), 'PV:severity': str(severity)},
              timestamp=int(timestamp))


pv1.add_callback(onChanges)
# print onChanges()

t0 = time.time()
while time.time():  # -t0 < 3:
    time.sleep(1.e-3)
# orig_path = '/p2data/sftp/csvsftp/big_folder/a_csv_file/'
# target_path = '/p2data/sftp/csvsftp/big_folder/b_csv_file/'
# archive_path = '/data/history/csv/aoi_csv/' + rec_dat + '/'
archive_path = '/data/history/csv/aoi_csv/' + rec_dat + '/'
err_path = '/data/history/csv/aoi_csv/err_aoi_csv_hbase/' + rec_dat + '/'
job_name = 'p4_aoi_csv_upload_hbase'
run_log = '/home/armap/log_exc/' + job_name + '_' + rec_dat + '.record'
exc_log = open(run_log, "a+")

if os.path.exists(archive_path) == False:
    os.mkdir(archive_path)
if os.path.exists(err_path) == False:
    os.mkdir(err_path)

## For HBase connection
connection = happybase.Connection('10.41.158.65')
table = connection.table('p8_aoi_csv')

## For MYSQL connection
# db = MySQLdb.connect("10.41.158.65", "root", "admfcs", "aoi_mo_sn")
# cursor = db.cursor()

csv_file = os.listdir(source_path)
time.sleep(5)

if len(csv_file) > 20000:
    n = 20000
else:
    n = len(csv_file)

for i in range(n):
    print i, "of", n