def create(descriptor_path, environment):
    """Create the HBase tables and Hive schemas listed in a JSON descriptor.

    The descriptor file is parsed as JSON and iterated. For every element
    that carries both a ``table`` and a ``col_family`` key, an HBase table is
    created through the REST gateway and each statement in the element's
    ``hive_schema`` list is executed over a pyhs2 Hive connection.

    Args:
        descriptor_path: Path of the JSON descriptor file.
        environment: Mapping that provides ``hbase_rest_server``,
            ``hbase_rest_port``, ``hive_server`` and ``hive_port``.

    Raises:
        KeyError: If a required ``environment`` key is missing, or an element
            that defines a table has no ``hive_schema`` entry.
    """
    # Only the read needs the file handle; release it before any network I/O.
    with open(descriptor_path) as descriptor_file:
        contents = descriptor_file.read()
    descriptor = json.loads(contents)
    logging.debug("_deploy_hbase: %s", descriptor)

    hbase = starbase.Connection(host=environment['hbase_rest_server'],
                                port=int(environment['hbase_rest_port']))
    hive_host = environment['hive_server']
    # Converted like the HBase port so a string value from configuration is
    # handled the same way for both services.
    hive_port = int(environment['hive_port'])

    hive = pyhs2.connect(host=hive_host,
                         port=hive_port,
                         authMechanism="PLAIN",
                         user='******',
                         password='******',
                         database='default')
    try:
        for element in descriptor:
            if 'table' in element and 'col_family' in element:
                table = hbase.table('%s' % element['table'])
                table.create(element['col_family'])
                for qry in element['hive_schema']:
                    hive.cursor().execute(qry)
    finally:
        # Close the Hive session even when table creation or a query fails.
        hive.close()
def clear_table(table_name, table_columns):
    """Drop ``table_name``, recreate it, and return the table listing.

    Args:
        table_name: Name of the HBase table to reset.
        table_columns: Column definition forwarded to ``create_table``.

    Returns:
        The result of ``all_tables()`` after the table has been recreated.
    """
    starbase.Connection(port=HBASE_PORT).table(table_name).drop()
    # The existence flag returned by create_table is not needed here.
    create_table(table_name, table_columns)
    return all_tables()
def fetch_all(table):
    """Fetch all rows of the HBase table named ``table``.

    Rows are requested without their row ids, using a scanner configured
    with ``maxVersions="1"``.

    Args:
        table: Name of the table to scan.

    Returns:
        Whatever ``fetch_all_rows`` of the starbase table object returns.
    """
    # No host is passed, so the starbase default host is used.
    connection = starbase.Connection(port=HBASE_PORT)
    scanner_xml = '<Scanner maxVersions="1"></Scanner>'
    return connection.table(table).fetch_all_rows(
        with_row_id=False,
        scanner_config=scanner_xml)
def fetch(table, rowkey, *args):
    """Fetch a single row from the HBase table named ``table``.

    Args:
        table: Name of the table to read from.
        rowkey: Key of the row to fetch.
        *args: Optional column selectors; when given, the whole tuple is
            passed as the second argument of the starbase ``fetch`` call.

    Returns:
        The result of the starbase ``fetch`` call.
    """
    # No host is passed, so the starbase default host is used.
    hbase_table = starbase.Connection(port=HBASE_PORT).table(table)
    if args:
        return hbase_table.fetch(rowkey, args)
    return hbase_table.fetch(rowkey)
def fetch_part(table, start_row, end_row, *args):
    """Scan the rows of ``table`` between ``start_row`` and ``end_row``.

    The scan is issued with row ids included and ``fail_silently=True``.

    Args:
        table: Name of the table to scan.
        start_row: Value placed in the scanner's ``startRow`` attribute.
        end_row: Value placed in the scanner's ``endRow`` attribute.
        *args: Optional; only the first value is used, as the content of a
            ``<column>`` element in the scanner. Further values are ignored.

    Returns:
        Whatever ``fetch_all_rows`` of the starbase table object returns.
    """
    # No host is passed, so the starbase default host is used.
    connection = starbase.Connection(port=HBASE_PORT)
    hbase_table = connection.table(table)

    if args:
        scanner_xml = '<Scanner maxVersions="1" startRow="{}" endRow="{}"><column>{}</column></Scanner>'.format(
            start_row, end_row, args[0])
    else:
        scanner_xml = '<Scanner maxVersions="1" startRow="{}" endRow="{}"></Scanner>'.format(
            start_row, end_row)

    return hbase_table.fetch_all_rows(with_row_id=True,
                                      fail_silently=True,
                                      scanner_config=scanner_xml)
def save_batch(table, rowkey, batch_data):
    """Insert ``batch_data`` under ``rowkey`` using a starbase batch.

    Nothing is written when the table's ``batch()`` call yields a falsy
    value.

    Args:
        table: Name of the table to write to.
        rowkey: Key of the row to insert.
        batch_data: Data passed to the batch ``insert`` call.
    """
    # No host is passed, so the starbase default host is used.
    connection = starbase.Connection(port=HBASE_PORT)
    batch = connection.table(table).batch()
    if not batch:
        return
    batch.insert(rowkey, batch_data)
    batch.commit(finalize=True)
def insert_data(table, rowkey, columfamily, columqualifier, value):
    """Insert a single cell value into the HBase table named ``table``.

    Args:
        table: Name of the table to write to.
        rowkey: Key of the row to insert.
        columfamily: Column family used as the outer key of the payload.
        columqualifier: Column qualifier used as the inner key.
        value: Value stored under the family/qualifier pair.
    """
    payload = {columfamily: {columqualifier: value}}
    # No host is passed, so the starbase default host is used.
    starbase.Connection(port=HBASE_PORT).table(table).insert(rowkey, payload)
def create(descriptor_path, environment):
    """Create HBase tables and run Hive schema queries from a descriptor.

    The JSON descriptor is iterated; elements lacking either the ``table``
    or the ``col_family`` key are skipped. For the remaining elements the
    HBase table is created through the REST gateway, then every statement in
    ``hive_schema`` is handed to ``run_hive_query`` and its output is logged
    at INFO level.

    Args:
        descriptor_path: Path of the JSON descriptor file.
        environment: Mapping that provides ``hbase_rest_server``,
            ``hbase_rest_port``, ``hive_server`` and ``hive_port``.
    """
    with open(descriptor_path) as descriptor_file:
        descriptor = json.loads(descriptor_file.read())
    logging.debug("_deploy_hbase: %s", descriptor)

    rest_gateway = starbase.Connection(
        host=environment['hbase_rest_server'],
        port=int(environment['hbase_rest_port']))
    hive_host = environment['hive_server']
    hive_port = int(environment['hive_port'])

    for element in descriptor:
        if 'table' not in element or 'col_family' not in element:
            continue
        rest_gateway.table('%s' % element['table']).create(element['col_family'])
        for statement in element['hive_schema']:
            logging.info(run_hive_query(statement, hive_host, hive_port))
def delete_row(table, rowkey):
    """Remove the row identified by ``rowkey`` from the table ``table``.

    Args:
        table: Name of the table to modify.
        rowkey: Key of the row to remove.
    """
    connection = starbase.Connection(port=HBASE_PORT)
    connection.table(table).remove(rowkey)
def all_columns(table):
    """Return the columns reported by starbase for the table ``table``.

    Args:
        table: Name of the table to inspect.

    Returns:
        The result of the table object's ``columns()`` call.
    """
    connection = starbase.Connection(port=HBASE_PORT)
    return connection.table(table).columns()
def create_table(table_name, table_column):
    """Create the HBase table ``table_name`` and report whether it exists.

    Args:
        table_name: Name of the table to create.
        table_column: Column definition passed to the table's ``create``.

    Returns:
        The result of the table object's ``exists()`` call made after
        creation.
    """
    connection = starbase.Connection(port=HBASE_PORT)
    hbase_table = connection.table(table_name)
    hbase_table.create(table_column)
    return hbase_table.exists()
def all_tables():
    """Return the table listing reported by the starbase connection.

    Returns:
        The result of the connection's ``tables()`` call.
    """
    return starbase.Connection(port=HBASE_PORT).tables()
import starbase import os c = starbase.Connection(host="wolf.iems.northwestern.edu", port=20550) directory = "/home/public/course/enron" table = c.table("Emp_Table_Gupta") table.create("EmployeeData", "EmailData") empnames = os.listdir(directory) i = 0 for empname in empnames: mailfiles = os.listdir(os.path.join(directory, empname)) for filename in mailfiles: mailfile = os.path.join(directory, empname, filename) i = i + 1 with open(mailfile) as f: email = f.read() mid = email.split("\n")[0][13:-1] sender_address = email[(email.find("\nFrom:") + 7):email.find("\nTo:")] mail_date = email.split("\n")[1][11:] month = mail_date.split(' ')[1] send_to = email[(email.find("To:") + 4):email.find("Subject:")].replace("\n", "") last_metadat_onwards = email[email.find("X-FileName:"):] body = last_metadat_onwards[last_metadat_onwards.find('\n'):] table.insert( str(i), { 'EmployeeData': {