import jpype
import pandas as pd
from pyhive import hive

jpype.startJVM(
    JVM_path, "-ea",
    "-Djava.class.path=C:\\Users\\CBH\\IdeaProjects\\HiveUtil\\target\\HiveUtil-1.0-SNAPSHOT.jar")

# Print "hello World" through the JVM
jpype.java.lang.System.out.println("hello World")

TA = jpype.JPackage('com.sbh.tj').ConnectorHiveUtil
TA = TA()
a = TA.GetHiveConnctor('jdbc:hive2://192.168.8.10:10000/default', 'show databases')
a = TA.newGetCon()

# Shut down the JVM
jpype.shutdownJVM()

# Connect to Hive with the pyhive third-party package instead
conn = hive.Connection(host="192.168.8.10", port=10000, username='******',
                       password='******', database='default', auth="LDAP")
cursor = conn.cursor()
cursor.execute("select * from xxx")
result = cursor.fetchall()
df = pd.DataFrame(list(result))
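# Note: neither half above releases its Hive resources. A minimal sketch of
# the pyhive half with automatic cleanup, assuming the same LDAP endpoint as
# above (contextlib.closing closes the connection even if the read fails):
from contextlib import closing

import pandas as pd
from pyhive import hive

with closing(hive.Connection(host="192.168.8.10", port=10000,
                             username='******', password='******',
                             database='default', auth="LDAP")) as conn:
    df = pd.read_sql("select * from xxx", conn)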
def __getConnection(self):
    conn = hive.Connection(host=self.host_name, port=self.port,
                           database=self.database, auth='NOSASL')
    return conn
from pyhive import hive
import pandas as pd

# Create Hive connection; the "CRUD" database was created in Hive beforehand
conn = hive.Connection(host="localhost", port=10000, username="******",
                       database="CRUD", auth="NOSASL")

# Join operation: read the joined result straight into a DataFrame
# (pd.read_sql runs the query itself, so no separate cursor.execute is needed)
join_operation = """select address, firstTable.name, firstTable.age
                    from secondTable
                    join firstTable on (secondTable.id = firstTable.id)"""
data = pd.read_sql(join_operation, conn)
print(data)
from pyhive import hive
import pandas as pd

# Establish connection between the Hive server and the database
conn = hive.Connection(host="localhost", port=10000, database="logs", auth="NOSASL")

try:
    # Load the table into a DataFrame
    query = pd.read_sql('select * from cpulogs', conn)
    # Keep rows whose start time is after 9:30 AM
    late_commers = query[query['cpulogs.start_time'] > '2019-10-24 09:30:00']
    # Keep the user names only
    late_commers_usernames = late_commers['cpulogs.user_name']
    print(late_commers_usernames)
except Exception as e:
    print("Query failed:", e)
from os import walk

import pandas as pd
from pyhive import hive

directory = 'BL/'

# Walk the directory tree: each leaf directory names a category, and its
# "domains" file lists one domain per line
categories = pd.DataFrame()
for dirpath, dirnames, filenames in walk(directory):
    if len(dirnames) == 0:
        category = dirpath[len(directory):]
        print(category)
        filepath = dirpath + "/domains"
        with open(filepath) as f:
            content = [x.strip() for x in f.readlines()]
        content = pd.DataFrame(content)
        content['category'] = category
        content.columns = ['domain', 'category']
        categories = pd.concat([categories, content])
print(categories)

conn = hive.Connection(host="YOUR_HIVE_HOST", port="PORT", username="******")
df = pd.read_sql("SELECT cool_stuff FROM hive_table", conn)
def _execute(self, text):
    # Resolve the HiveServer2 thrift address from configuration
    ip, port = sql_util.get_thrift_addr()
    conn = hive.Connection(host=ip, port=port)
    cursor = conn.cursor(arraysize=1000)
    # Per DB-API, the return value of execute() is unspecified; rows should
    # be fetched from the cursor, which is why the cursor is returned too
    result = cursor.execute(text)
    return result, cursor
from pyhive import hive
import pandas
import sys
import ssl
import thrift
import thrift_sasl
import json
from datetime import datetime as dt
import numpy as np
import matplotlib.pyplot as plt

# Instantiate connection to Hive via HiveServer2, using the pyhive library,
# then read a Hive table into a pandas DataFrame
connection = hive.Connection(host="localhost", port=10000, username='******')
dataframe = pandas.read_sql("SELECT * FROM igdbtables.topgames", connection)

# Convert DataFrame rows into a usable dictionary format for processing
games = []
genreDataSet = []
dataframe.to_dict()  # returns a dict; as written the result is unused
limit = 500
for i in range(0, limit):
    game = {}
    game['id'] = int(dataframe['topgames.id'][i])
    game['name'] = dataframe['topgames.name'][i]
    theseGenres = dataframe['topgames.genres'][i]
from pyhive import hive """ 使用kerberos认证,连接程序需要安装客户端 """ conn = hive.Connection(host='master1-de.pagod.com.cn', port=10000, auth="KERBEROS", kerberos_service_name="hive", username='******', database='default') cursor = conn.cursor() cursor.execute('show tables') for result in cursor.fetchall(): print(result)
from pyhive import hive
import pandas as pd

# Establish the connection between the Hive server and the database
conn = hive.Connection(host="localhost", port=10000, username="******",
                       database="logs", auth="NOSASL")

try:
    # Load the table into a DataFrame
    query = pd.read_sql('select * from workinglogs', conn)
    # Convert the working_hours column to datetime
    query['workinglogs.working_hours'] = pd.to_datetime(
        query['workinglogs.working_hours'])
    # Keep the rows below the mean of working_hours
    lowest_avghour_log = query[query['workinglogs.working_hours'] <
                               query['workinglogs.working_hours'].mean()]
    lowest_avghour_log.to_csv("data/lowest_avghours_user_log.csv", index=False)
    # Print the user_name with the lowest average hours
    # LOWEST_AVG_HOURS = lowest_avghour_log['workinglogs.user_name']
    # print(LOWEST_AVG_HOURS)
except Exception as e:
    print("Query failed:", e)
""" def date_tuple(date_string): lst = date_string.split('-') return (datetime.date(int(lst[0]), int(lst[1]), int(lst[2]))) def gen_maturity_date(matu): regex = re.compile(r'\.|、') splt = regex.split(matu) return datetime.date(datetime.datetime.now().year, int(splt[0]), int(splt[1])) conn = hive.Connection(host="202.120.38.90", port=10086, auth="NOSASL") cursor = conn.cursor() # cursor.execute("select DISTINCT dated_date from bonds") # for result in cursor.fetchall(): # print(result) cursor.execute( "select denomination,issue_start_date,dated_date,expiration_date,repayment_method,APR,bond_id from bonds" ) total = 0 f = open("tmp.txt", mode='w', encoding='utf-8') l = [] for result in cursor.fetchall(): s = "" for item in result: s += str(item) + "\t" l.append(s + "\n")
from pyhive import hive
from TCLIService.ttypes import TOperationState
import pandas as pd

cursor = hive.connect(host='localhost', port=10000, username='******').cursor()
cursor.execute("CREATE EXTERNAL TABLE IF NOT EXISTS cc(c_code int,country string) "
               "ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'")
cursor.execute("LOAD DATA LOCAL INPATH '/home/student/Desktop/CCEE_Final_Project/Dataset/country_code.csv' OVERWRITE INTO TABLE cc")
#cursor.execute("CREATE EXTERNAL TABLE IF NOT EXISTS zomato_project_3(Restaurant_ID string,Restaurant_Name string,Country_Code INT,City string,Address string,Locality string,Locality_Verbose string,Longitude string,Latitude string,Cuisines string,Average_Cost_for_two string,Currency string,Has_Table_booking string,Has_Online_delivery string,Is_delivering_now string,Switch_to_order_menu string,Price_range int,Aggregate_rating string,Rating_color string,Rating_text string,Votes string)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'")
#cursor.execute("LOAD DATA LOCAL INPATH '/home/student/Desktop/CCEE_Final_Project/Dataset/Zomato.csv' OVERWRITE INTO TABLE zomato_project_3")

conn = hive.Connection(host='localhost', port=10000, username='******')
df = pd.read_sql("select * from zomato_project_3", conn)

df.head()

df.rename(index=str, columns={"zomato_project_3.restaurant_id": "Restaurant ID",
                              "zomato_project_3.restaurant_name": "Restaurant Name",
                              "zomato_project_3.country_code": "Country Code",
import configparser
import datetime

from pyhive import hive
from TCLIService.ttypes import TOperationState

datetime.timedelta(1)).strftime('%Y%m%d')

# Read connection info from config file
config = configparser.ConfigParser()
config.read('connection.cfg')
hive_conn = config['hive']
hive_host = hive_conn['host']
hive_port = int(hive_conn['port'])
hive_user = hive_conn['user']
path_cfg = config['path']
output_path = path_cfg['output']

# Set up hive connection and execute query
conn = hive.Connection(host=hive_host, port=hive_port, username=hive_user)
cursor = conn.cursor()
cursor.execute("use " + dbName)
# async became a reserved word in Python 3.7; pyhive spells the keyword
# argument async_
cursor.execute("show tables", async_=True)
status = cursor.poll().operationState
while status in (TOperationState.INITIALIZED_STATE, TOperationState.RUNNING_STATE):
    logs = cursor.fetch_logs()
    for message in logs:
        print(message)
    # If needed, an asynchronous query can be cancelled at any time with:
    # cursor.cancel()
    status = cursor.poll().operationState
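# Once the loop exits, the operation is in a terminal state; a short
# continuation sketch (the error branch is an assumption;
# TOperationState.FINISHED_STATE comes from the TCLIService import above):
if status == TOperationState.FINISHED_STATE:
    for table in cursor.fetchall():
        print(table)
else:
    # CANCELED_STATE, ERROR_STATE, etc.
    print("query ended in state:", status)
cursor.close()
conn.close()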
def run(cfg):
    conn = hive.Connection(host='10.213.37.46', username='******',
                           password='******', auth='CUSTOM')
    cursor = conn.cursor()
    cursor.execute('select * from dlpm_11092020_model_stat')
    stat_model = cursor.fetchone()
    model_info = json.loads(stat_model[0])
    stat_info = json.loads(stat_model[1])

    # Collect the tfrecord part files
    names = []
    tfrecord_location = cfg['tfrecords_local_path']
    for file in os.listdir(tfrecord_location):
        if file.startswith("part"):
            names.append(file)
    file_paths = [os.path.join(tfrecord_location, name) for name in names]

    # Read and make the dataset from tfrecord
    dataset = tf.data.TFRecordDataset(file_paths)
    dataset = dataset.map(__data_parser)
    batch_size = cfg['batch_size']
    duration = cfg['duration']
    dataset = dataset.batch(batch_size).shuffle(SHUFFLE_BUFFER)
    iterator = dataset.make_one_shot_iterator()
    next_el = iterator.get_next()

    # lagged_ix = np.ones((duration, 4), dtype=float)
    # lagged_ix = np.where(lagged_ix == 1, -1, lagged_ix)
    lagged_ix = np.stack(lag_indexes(model_info), axis=-1)
    # quarter_autocorr = np.ones((batch_size,), dtype=float)

    date_list = model_info['days']
    dow = get_dow(date_list)
    holiday_list = cfg['holidays']
    holidays = [1 if _ in holiday_list else 0 for _ in date_list]
    a_list = []
    b_list = []
    for _ in holidays:
        a, b = holiday_norm(_)
        a_list.append(a)
        b_list.append(b)
    holiday = (a_list, b_list)

    with tf.Session() as sess:
        x = sess.run(next_el)
        quarter_autocorr = np.ones((x[0].size,), dtype=float)
        page_indx = list(x[0])
        fill_isolated_zeros(x[21])
        tensors = dict(
            hits=pd.DataFrame(x[21], index=page_indx, columns=date_list),
            lagged_ix=lagged_ix,
            page_ix=page_indx,
            pf_age=pd.DataFrame(x[8:15], columns=page_indx,
                                index=(1, 2, 3, 4, 5, 6, 7)).T,
            pf_si=pd.DataFrame(x[20], index=page_indx),
            pf_network=pd.DataFrame(x[15:20], columns=page_indx,
                                    index=('2G', '3G', '4G', 'UNKNOWN', 'WIFI')).T,
            pf_price_cat=pd.DataFrame(x[1:4], columns=page_indx,
                                      index=('pc1', 'pc2', 'pc3')).T,
            pf_gender=pd.DataFrame(x[4:8], columns=page_indx,
                                   index=('none', 'f', 'm', 'x')).T,
            page_popularity=x[22],
            # page_popularity=quarter_autocorr,
            quarter_autocorr=quarter_autocorr,
            dow=pd.DataFrame(dow).T,
            holiday=pd.DataFrame(holiday).T)
        data_len = tensors['hits'].shape[1]
        plain = dict(data_days=data_len - cfg['add_days'],
                     features_days=data_len,
                     data_start=date_list[0],
                     data_end=date_list[-1],
                     features_end=date_list[-1],
                     n_pages=batch_size)
        VarFeeder(cfg['data_dir'], tensors, plain)
#!/bin/env python
# -*- coding: UTF-8 -*-
from pyhive import hive

conn = hive.Connection(host='192.168.137.130', port=10000, username='******',
                       database='default')
cursor = conn.cursor()
# Each branch zeroes out the other two counters so the outer sums line up;
# "union all" would avoid Hive's duplicate-elimination pass here
sql = "select sum(Proc_Rec_Total_Qty),sum(All_Dupl_Rec_Qty),sum(Pk_Dupl_Rec_Qty) from ( "\
      "select count(1) as Proc_Rec_Total_Qty ,0 as All_Dupl_Rec_Qty,0 as Pk_Dupl_Rec_Qty from cls_db_3.lwq_test_tb where data_dt='2019-05-03' and tags is not null "\
      "union "\
      "select 0 as Proc_Rec_Total_Qty ,count(1) as All_Dupl_Rec_Qty,0 as Pk_Dupl_Rec_Qty from isu_db_3.lwq_test_tb where data_dt='2019-05-03' and isu_type='1' "\
      "union "\
      "select 0 as Proc_Rec_Total_Qty,0 as All_Dupl_Rec_Qty,count(1) as Pk_Dupl_Rec_Qty from isu_db_3.lwq_test_tb where data_dt='2019-05-03' and isu_type='3' "\
      ") t"
cursor.execute(sql)
result = cursor.fetchone()
if result is not None:
    print(result)
    Proc_Rec_Total_Qty = result[0]
    All_Dupl_Rec_Qty = result[1]
    Pk_Dupl_Rec_Qty = result[2]
from pyhive import hive

conn = hive.Connection(host='localhost', port=10002, username='******', database='test')
cursor = conn.cursor()
# No trailing semicolon: HiveServer2 rejects it inside a submitted statement
sql = "select dst12 from traffic_matrices where srcid='12' limit 10"
cursor.execute(sql)
# Join each row's fields with a space; str() guards against non-string columns
for result in cursor.fetchall():
    print(" ".join(str(field) for field in result))
from pyhive import hive

conn = hive.Connection(host='192.168.109.172', port=10000, auth='NOSASL',
                       username='******', database='default')
cursor = conn.cursor()
cursor.execute('select * from link_expenses limit 1')
for result in cursor.fetchall():
    print(result)
conn.close()
from pyhive import hive
from flask import Flask, request, jsonify
from flask_restful import Resource, Api

#=======================================================================================================================
# Establish connection with Hive using PyHive
#=======================================================================================================================
app = Flask(__name__)
db = hive.Connection(host="<hive host>", port=<default port>, database="default")
cursor = db.cursor()

#=======================================================================================================================
# Product REST endpoint definitions
#=======================================================================================================================
@app.route('/products', methods=['GET'])
def products():
    if 'benefit' in request.args:
        getbenefit = request.args['benefit']
        splitbenefit = getbenefit.split(",")
        sql_query = '''select * from dev.products_sample
                       where benefit in ('%s')''' % ("','".join(splitbenefit))
        cursor.execute(sql_query)
        content = [dict((cursor.description[i][0], value)
                        for i, value in enumerate(row))
                   for row in cursor.fetchall()]
        return jsonify({'myCollection': content})
    elif 'id' in request.args:
        getid = request.args['id']
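# Interpolating request arguments straight into SQL invites injection.
# PyHive's DB-API cursor performs parameter substitution itself, so the `id`
# branch could continue along these lines (a hedged sketch, not the original
# code; the placeholder style follows PyHive's format/pyformat paramstyle):
        cursor.execute("select * from dev.products_sample where id = %s",
                       (getid,))
        content = [dict((cursor.description[i][0], value)
                        for i, value in enumerate(row))
                   for row in cursor.fetchall()]
        return jsonify({'myCollection': content})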
def connect_to_hive(hive_user):
    hive_host = '10.11.12.144'
    hive_conf = {'job.queue.name': 'default'}
    return hive.Connection(host=hive_host, username=hive_user,
                           configuration=hive_conf)
from pyhive import hive conn = hive.Connection(host="localhost", port=50070, username="******")
# ## Load data (from Hive)
from pyhive import hive
import pandas as pd

conn = hive.Connection(host='mlamairesse-training-1.vpc.cloudera.com',
                       port=10000, auth='KERBEROS',
                       kerberos_service_name='hive')
airlines_pd_df = pd.read_sql('select * from flights.airports', conn)
airlines_pd_df.set_index('airports.iata', inplace=True)

# ## Lookup function
def lookup(arg: dict):
    code = arg['code'].upper()
    return airlines_pd_df.loc[code, :].to_dict()

lookup({"code": "WRL"})