def get_connection(db_conn_conf, use_default_db):
    # Set up which database to use
    if use_default_db:
        db_name = "default"
    else:
        # Set it to be the one specified in the config file
        db_name = db_conn_conf['database']
    # Create the connection
    if db_conn_conf['connector'] == 'hive':
        conn = pyhs2.connect(host=db_conn_conf['host'],
                             port=int(db_conn_conf['port']),
                             authMechanism="PLAIN",
                             user=db_conn_conf['user'],
                             database=db_name)
        return conn
    elif db_conn_conf['connector'] == 'impala':
        conn = impala.dbapi.connect(host=db_conn_conf['host'],
                                    port=db_conn_conf['port'],
                                    user=db_conn_conf['user'],
                                    database=db_name)
        return conn
    else:
        print "Can't find connector"
        sys.exit(0)
def start_requests(self):
    with pyhs2.connect(host='10.15.1.16',
                       port=10000,
                       authMechanism="PLAIN",
                       user='******',
                       password='******',
                       database='ods') as conn:
        with conn.cursor() as cur:
            the_day = time.strftime('%Y-%m-%d', time.localtime())
            cur.execute(
                "select goods_id from ods.daily_goodsList where dt='{}' and user_id in (446338500,890482188,"
                "133227658,1993730769,772352677,167873659, 98563612, 353042333, 1731961317, 353042353, 829273025, "
                "2940972233, 2940677727, 435878238, 281917995, 737997431, 458599810, 1739810699, 1574853209, "
                "1893742894, 1122478447, 373327370, 2647118809, 2786693231, 2434852658, 3000560259, 1916102784, "
                "656650799, 2074964291, 708668355, 720472756, 356579667, 1731961317,98563612,1689954831,387266832"
                ",2073309259,325718097,1891339807,834807033,1974964452,372602234,167486422,356374102,"
                "320083279) group by goods_id".format(the_day))
            for i in cur.fetch():
                nid = i[0]
                url = 'https://api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?appKey=12574478&t=14606167' \
                      '25586&sign=04b5eb36c2ccfebe0d39dab46de5ec18&api=mtop.taobao.detail.getdetail&v=6.0&ttid=' \
                      '2013%40taobao_h5_1.0.0&type=jsonp&dataType=jsonp&callback=&data=%7B%22itemNumId%22%3A%2' \
                      '2{}%22%2C%22exParams%22%3A%22%7B%5C%22id%5C%22%3A%5C%228548526%5C%22%2C%5C%22wp_app%5C' \
                      '%22%3A%5C%22weapp%5C%22%7D%22%7D'.format(nid)
                # print url
                yield scrapy.Request(url, headers=header, callback=self.parse_detail,
                                     meta={'nid': nid, 'retry': 0}, dont_filter=True)
def hs2_pull_data(query, buffername, max_id, field):
    with open('pass.json', 'rb') as passfile:
        passdict = json.load(passfile)
    with pyhs2.connect(host=passdict['HIVE_HOST'],
                       port=10000,
                       authMechanism='PLAIN',
                       user=passdict['HIVE_USER'],
                       database='default',
                       cursorclass=SSCursor) as conn:
        with gzip.open('/home/epeters/public/dumps/%s_redshift.gz' % (buffername), 'wb+') as gzip_csv_file:
            cur = conn.cursor()
            datacsv = csv.writer(gzip_csv_file, delimiter='|', quotechar='`', quoting=csv.QUOTE_MINIMAL)
            yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1)
            query = re.sub('@YEAR@', str(yesterday.year), query)
            query = re.sub('@MONTH@', str(yesterday.month), query)
            query = re.sub('@DAY@', str(yesterday.day), query)
            query = re.sub('@MAX_ID@', '%s > %s' % (max_id, field), query)
            # print "query is now:"
            # print query
            cur.execute(query)
            # print "Query execute successful"
            counter = 0
            for i in cur.fetch():
                counter += 1
                if counter % 10000 == 0:
                    print "counter now at %s" % (counter)
                datacsv.writerow(i)
            return cur.getSchema()
def getHiveData(loadsql, createsql, dropsql, selectsql):
    with pyhs2.connect(host='localhost',
                       port=10000,
                       authMechanism="PLAIN",
                       user='******',
                       password='******',
                       database='default') as conn:
        with conn.cursor() as cur:
            # Show databases
            print("--------")
            cur.execute(dropsql)
            print("--------")
            cur.execute(createsql)
            print("--------")
            cur.execute(loadsql)
            print("--------")
            cur.execute(selectsql)
            print("--------")
            # Return column info from query
            # print cur.getSchema()
            # Fetch table results
            result = []
            for i in cur.fetch():
                result.append(i)
                yield i
def purchase(id, purchase_type):
    """Returning the number of purchases for certain ids.
    Type of purchase (release or track) is needed"""
    # testing if purchase type is correct
    if purchase_type == 'release':
        ctype = 'release_id'
    elif purchase_type == 'track':
        ctype = 'track_id'
    else:
        raise AttributeError("provide valid purchase_type: 'release' or 'track'")
    # splitting ids
    cid = id_split(id)
    cid = cid.replace('src_id', ctype)
    # uncomment the line below to print created Hive query
    # print ('select count(transaction_id), %s from purchase where (%s) group by %s' % (ctype, cid, ctype))
    # connecting to Hive, sending query, returning results of query
    conn = pyhs2.connect(host='nif-nif.zvq.me',
                         port=10000,
                         authMechanism="NOSASL",
                         user='******',
                         password='******',
                         database='default')
    cur = conn.cursor()
    cur.execute('select count(transaction_id), %s from purchase where (%s) group by %s' % (ctype, cid, ctype))
    results = cur.fetch()
    cur.close()
    conn.close()
    return results
def _set_hive(self):
    hive = pyhs2.connect(host=self.config['hive']['host'],
                         port=int(self.config['hive']['port']),
                         authMechanism='PLAIN',
                         user=self.config['hive']['username'],
                         password="")
    return hive.cursor()
def process(self, tup):
    movie_id = tup.values[0]
    # [{'field_name' : 'field_value', ...}, ....]
    tweets = json.loads(tup.values[1])
    self.log('Received %d tweets for movie %s' % (len(tweets), movie_id))
    if self.verbose:
        self.log(tweets)
    with pyhs2.connect(host=get_config('host'),
                       port=get_config('port'),
                       authMechanism=get_config('auth'),
                       user=str(get_config('user')),
                       database=get_config('database')) as conn:
        with conn.cursor() as cur:
            query = "INSERT INTO " + get_config('tweet_data_tablename') + " VALUES "
            for t in tweets:
                # Clean up the tweets before storing
                text = unicodedata.normalize('NFKD', t['text']).encode('ascii', 'ignore')
                text = re.sub(r'\'', '', text)
                text = re.sub(r'\\', ' ', text)
                text = re.sub(r'[\s\n\t\r]', ' ', text)
                query += "('" + movie_id + "', " + str(t['id']) + ", '" + text + "'),"
            if self.verbose:
                self.log(query)
            cur.execute(query[:-1])  # -1 to remove last comma
def call_get_meta_data(table_name, sandbox_ip):
    try:
        with pyhs2.connect(host=sandbox_ip,
                           port=10000,
                           authMechanism="PLAIN",
                           user='******',
                           password='******',
                           database='bigdata') as conn:
            with conn.cursor() as cur:
                # Show databases
                # print cur.getDatabases()
                query = "desc formatted " + table_name
                print(query)
                cur.execute(query)
                meta_data = []
                flag = 0
                for i in cur.fetch():
                    if (i[0] == '' and flag == 0):
                        flag = 1
                    elif (i[0] == '' and flag == 1):
                        flag = 2
                    elif (flag == 1):
                        meta_data.append((i[0].strip(), i[1].strip()))
                return meta_data
    except Exception, e:
        print e
        return None
def execute_sql(self, database_name, sql, fetch_result=False):
    import pyhs2
    conn = pyhs2.connect(host=self.host,
                         port=self.port,
                         authMechanism="PLAIN",
                         user="******",
                         password="",
                         database='default',
                         timeout=5000)
    print "Connected to hiveserver2"
    # turn on tez and add serde jar
    c = conn.cursor()
    c.execute("set hive.execution.engine=tez")
    c.execute("set hive.cache.expr.evaluation=false")
    # c.execute("add jar %s" % self.hive_serdes_path)
    if database_name != None:
        c.execute("use %s" % database_name)
    # run actual command
    print "Executing HiveQL: %s" % (sql)
    c.execute(sql)
    output = []
    if fetch_result:
        rows = c.fetchall()
        for row in rows:
            output.append(row)
    c.close()
    conn.close()
    return output
def processQuery(self, querylist, flag):
    propertyObj = ConfigProperties()
    hostname = propertyObj.localhivehost()
    portnumber = propertyObj.localhiveport()
    authentication = propertyObj.localhiveauthentication()
    username = propertyObj.localhiveuser()
    userpassword = propertyObj.localuserpassword()
    databasename = propertyObj.localhivedatabase()
    conn = pyhs2.connect(host=hostname,
                         port=portnumber,
                         authMechanism=authentication,
                         user=username,
                         password=userpassword,
                         database=databasename)
    cur = conn.cursor()
    cur.execute(querylist['createdb'])
    cur.execute(querylist['workdb'])
    cur.execute(querylist['droptable1'])
    cur.execute(querylist['createtable1'])
    cur.execute(querylist['testcode'])
    cur.execute(querylist['droptable2'])
    cur.execute(querylist['createtable2'])
    if flag == 0:
        cur.close()
        conn.close()
def load_data_orc(self, table_name):
    with pyhs2.connect(host=self.env['HIVE_HOST'], port=10000, authMechanism="PLAIN",
                       user=self.env['USER_NAME'], password='', database=self.config['HIVE_DB']) as conn:
        with conn.cursor() as cur:
            # set_hive = "SET hive.mergejob.maponly=true"
            # self.logger.info("RUNNING at HIVE: %s", set_hive)
            # cur.execute(set_hive)
            # set_hive = "SET hive.merge.mapredfiles=true"
            # self.logger.info("RUNNING at HIVE: %s", set_hive)
            # cur.execute(set_hive)
            # set_hive = "SET hive.merge.mapfiles=true"
            # self.logger.info("RUNNING at HIVE: %s", set_hive)
            # cur.execute(set_hive)
            # set_hive = "SET hive.merge.size.per.task=256000000"
            # self.logger.info("RUNNING at HIVE: %s", set_hive)
            # cur.execute(set_hive)
            # set_hive = "SET hive.merge.smallfiles.avgsize=16000000000"
            # self.logger.info("RUNNING at HIVE: %s", set_hive)
            # cur.execute(set_hive)
            # self.logger.info("RUNNING at HIVE: %s", set_hive)
            # cur.execute(set_hive)
            hive_sql = "INSERT INTO TABLE " + self.config['HIVE_DB'] + "." + table_name
            # + " PARTITION (" + primary_id + "=" + primary_value + ")"
            hive_sql = hive_sql + " SELECT " + self.table.get_table_column(table_name) + \
                       " from " + self.config['HIVE_DB'] + "." + table_name + "_text"
            print hive_sql
            self.logger.info("RUNNING at HIVE: %s", hive_sql)
            cur.execute(hive_sql)
def execute_sql(self, database_name, sql, fetch_result=False): import pyhs2 conn = pyhs2.connect(host=self.host, port=self.port, authMechanism="NOSASL", database='default') # turn on tez and add serde jar c = conn.cursor() c.execute("set hive.execution.engine=tez") c.execute("set hive.cache.expr.evaluation=false") c.execute("add jar %s" % self.hive_serdes_path) if database_name != None: c.execute("use %s" % database_name) # run actual command command print "Executing HiveQL: %s" % (sql) c.execute(sql) output = [] if fetch_result: rows = c.fetchall() for row in rows: output.append(row) c.close() conn.close() return output
def connect(self):
    print "I'm running but will hang some time. Please be patient..."
    with pyhs2.connect(host='cosmos.lab.fi-ware.org',
                       port=10000,
                       authMechanism="PLAIN",
                       user='',
                       password='',
                       database='default') as conn:
        with conn.cursor() as self.cur:
            # Show databases
            # print cur.getDatabases()
            # Execute query
            self.cur.execute("select * from andre_silva_fresh_serv_fresh_servpath_sensor_9_sensor_column")
            self.db_zone1_cols = []
            for i in self.cur.getSchema():
                if ("_md" not in i['columnName'].split('.')[1] and "recv" not in i['columnName'].split('.')[1]):
                    self.db_zone1_cols.append(i['columnName'].split('.')[1])
            self.db_zone1_rows = self.cur.fetch()
            self.cur.execute("select * from andre_silva_fresh_serv_fresh_servpath_sensor_10_sensor_column")
            self.db_zone2_cols = []
            for i in self.cur.getSchema():
                if ("_md" not in i['columnName'].split('.')[1] and "recv" not in i['columnName'].split('.')[1]):
                    self.db_zone2_cols.append(i['columnName'].split('.')[1])
            self.db_zone2_rows = self.cur.fetch()
            print "Whoa! I have a database!"
def create_original_table(self, table_name):
    table_statement = self.table.get_table_column_with_type(table_name) + " )"
    # if table_statement.find("PARTITION BY") > 0:
    #     partition_column = table_statement.split("PARTITION BY")[1].replace(table_name + ".", "").replace("(", "").replace(")", "").replace(" ", "")
    #     sql = "select data_type from v_catalog.columns where column_name= '" + partition_column + "' and table_name='" + table_name + "'"
    #     data_type = self.db.retrieveSQL(sql)
    #     data_type = data_type[0][0]
    #     table_statement = table_statement.split("PARTITION BY")[0]
    #     table_statement = first_part + "PARTITIONED BY (" + partition_column + " " + data_type + " ) "
    #     table_statement = table_statement.replace(partition_column + " " + data_type + ",", "")
    table_statement = table_statement + " ROW FORMAT DELIMITED STORED AS ORC tblproperties ('orc.compress'='" + \
                      self.config['SQOOP_COMPRESSION'] + "','orc.stripe.size'='" + self.config['ORC_STRIPE_SIZE'] + \
                      "','orc.row.index.stride'='" + self.config['ORC_INDEX_STRIDE'] + "','orc.create.index'='true')"
    table_statement = table_statement.replace("\n", " ")
    table_statement = table_statement.replace("numeric", "decimal")
    table_statement = table_statement.replace("timestamp(6)", "timestamp")
    table_statement = "CREATE EXTERNAL TABLE " + self.config['HIVE_DB'] + "." + table_name + " ( " + table_statement
    print table_statement
    self.logger.info("RUNNING at HIVE: %s", table_statement)
    with pyhs2.connect(host=self.env['HIVE_HOST'],
                       port=10000,
                       authMechanism="PLAIN",
                       user=self.env['USER_NAME'],
                       password='',
                       database=self.config['HIVE_DB']) as conn:
        with conn.cursor() as cur:
            cur.execute(table_statement)
            print cur.getDatabases()
def aggregate():
    with pyhs2.connect(host='hive.athena.we7.local',
                       port=10000,
                       authMechanism="KERBEROS",
                       user='',
                       password='',
                       database='davec_sandbox') as conn:
        with conn.cursor() as cur:
            cur.execute('''add file hdfs://athena/user/davec/agg_segment_daily_reducer.py''')
            # Hive chooses only one reducer by default (28 minutes). Force 15 (2.5 mins).
            cur.execute('''set mapred.reduce.tasks=15''')
            cur.execute('''
                create table if not exists davec_sandbox.agg_segment_daily (
                    segment_date string,
                    segment_type string,
                    user_segment string,
                    users        int
                )
            ''')
            cur.execute('''
                insert overwrite table davec_sandbox.agg_segment_daily
                select
                    segment_date,
                    segment_type,
                    user_segment,
                    sum(cast(users as int))
                from (
                    select
                        transform(*)
                        using 'agg_segment_daily_reducer.py'
                        as (
                            segment_date,
                            segment_type,
                            user_segment,
                            users
                        )
                    from (
                        select
                            user_id,
                            segment_type,
                            user_segment,
                            fact_year,
                            fact_month,
                            fact_day
                        from events_super_mart.fact_user_segment
                        --test--where segment_type = 'Value Segment'
                        --test--and fact_year = 2014
                        --test--and fact_month = 11
                        distribute by user_id, segment_type
                        sort by user_id, segment_type, fact_year, cast(fact_month as int), cast(fact_day as int)
                    ) user_segment
                ) segment_by_date
                group by
                    segment_date,
                    segment_type,
                    user_segment
            ''')
def query2hive(self, sql="SHOW TABLES", use_server=True):
    if use_server:
        host = "137.205.118.65"
    else:
        host = "localhost"
    with pyhs2.connect(host=host,
                       port=10000,
                       authMechanism="PLAIN",
                       user='******',
                       password='******',
                       database='default') as conn:
        with conn.cursor() as cur:
            # Show databases
            # print cur.getDatabases()
            # Execute query
            # cur.execute("select * from src")
            start = datetime.now()
            cur.execute(sql)
            end = datetime.now()
            time_cost = (end - start).total_seconds()
            # Return column info from query
            # print cur.getSchema()
            # Fetch table results
            self.logger.logger.info("Time spent for HIVE query: %.4fs." % time_cost)
            for i in cur.fetch():
                self.logger.logger.info(i)
            return i[0], time_cost
def getSchema():
    settings = get_settings_from_file("spec.json")
    print(settings)
    conn = pyhs2.connect(host=settings.Param.HiveServer2_Host,
                         port=int(settings.Param.HiveServer2_Port),
                         authMechanism="PLAIN",
                         user="******",
                         password="",
                         database="default")
    query_sql = "DESCRIBE %s" % settings.Input.table_a.val
    cur = conn.cursor()
    cur.execute(query_sql)
    a_schema = []
    for row in cur.fetch():
        a_schema.append(("a.%s AS a_%s") % (row[0], row[0]))
    query_sql = "DESCRIBE %s" % settings.Input.table_b.val
    cur = conn.cursor()
    cur.execute(query_sql)
    b_schema = []
    for row in cur.fetch():
        b_schema.append(("b.%s AS b_%s") % (row[0], row[0]))
    cur.close()
    conn.close()
    return [a_schema, b_schema]
def start_requests(self): with pyhs2.connect(host='10.15.1.16', port=10000, authMechanism="PLAIN", user='******', password='******', database='ods') as conn: with conn.cursor() as cur: the_day = time.strftime('%Y-%m-%d', time.localtime()) # the_day = '2018-01-04' cur.execute( "select goods_id from ods.daily_goodsList where (dt='2017-11-08' or dt='2017-11-07' or " "dt='2017-12-03' or dt='2018-01-04' or dt='{}') and user_id in (446338500,890482188," "133227658,1993730769,772352677,167873659, 98563612, 353042333, 1731961317, 353042353, 829273025, " "2940972233, 2940677727, 435878238, 281917995, 737997431, 458599810, 1739810699, 1574853209, " "1893742894, 1122478447, 373327370, 2647118809, 2786693231, 2434852658, 3000560259, 1916102784, " "656650799, 2074964291, 708668355, 720472756, 356579667, 1731961317,98563612,1689954831,387266832" ",2073309259,325718097,1891339807,834807033,1974964452,372602234,167486422,356374102," "320083279,411832242,1602582004,2428721558,1574853209,2986712394,2183615086,2424477833,1754310760,1754310760,1122478447,3000560259,2652614726,533230328,2935707588,2945786195,2074690906,3458347554,3383168585,2366121327,3099864367,106852162,152579056,1754310760,783329018,1600687454,205919815,113484749,94092459,6655,2978259752) group by goods_id" .format(the_day)) # print '************' for i in cur.fetch(): nid = i[0] url = 'https://rate.tmall.com/listTagClouds.htm?itemId={}&isAll=true&isInner=true&t=1510632615425' \ '&_ksTS=1510632615425_2407&callback=jsonp2408'.format(nid) yield scrapy.Request(url, headers=header, callback=self.parse_keyword, meta={'nid': nid}, dont_filter=True)
def connect(self, config):
    """
    Connect to HiveServer2 with the given config.

    Parameters:
    -----------
    config : dict.
        | config['host']           The host ip of hive server 2
        | config['port']           The port of hive server 2
        | config['authMechanism']  Most of the time the value is "NOSASL"
        | config['user']           The connect user
        | config['password']       The connect password
        | config['database']       The database which you want to connect

    Returns:
    --------
    None
    """
    self.conn = pyhs2.connect(host=config['host'],
                              port=config['port'],
                              authMechanism=config['authMechanism'],
                              user=config['user'],
                              password=config['password'],
                              database=config['database'])
    self.cur = self.conn.cursor()
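# A minimal usage sketch for the connect() helper above, assuming the enclosing
# class is exposed as HiveClient (a hypothetical name) and that a HiveServer2
# instance is reachable at the address shown; all values here are illustrative.
if __name__ == '__main__':
    sample_config = {
        'host': '127.0.0.1',        # assumed HiveServer2 host
        'port': 10000,              # default HiveServer2 Thrift port
        'authMechanism': 'NOSASL',  # as suggested in the docstring above
        'user': 'hive',
        'password': '',
        'database': 'default',
    }
    client = HiveClient()           # hypothetical wrapper class defining connect()
    client.connect(sample_config)
    client.cur.execute("show tables")
    print client.cur.fetch()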
def getHiveData(loadsql, createsql, dropsql, selectsql):
    with pyhs2.connect(host='localhost',
                       port=10000,
                       authMechanism="PLAIN",
                       user='******',
                       password='******',
                       database='default') as conn:
        with conn.cursor() as cur:
            # Show databases
            # print cur.getDatabases()
            print("--------")
            cur.execute(dropsql)
            print("--------")
            cur.execute(createsql)
            print("--------")
            cur.execute(loadsql)
            print("--------")
            cur.execute(selectsql)
            print("--------")
            # Return column info from query
            # print cur.getSchema()
            # Fetch table results
            result = []
            for i in cur.fetch():
                result.append(i)
                yield i
def get_hiveserver2_results_schema(hiveserver_config_dict, db, sql):
    host = hiveserver_config_dict["host"]
    port = hiveserver_config_dict["port"]
    username = hiveserver_config_dict["username"]
    password = hiveserver_config_dict["password"]
    auth = hiveserver_config_dict["auth"]
    connection = pyhs2.connect(host=host,
                               port=int(port),
                               authMechanism=auth,
                               user=username,
                               password=password,
                               database=db)
    cursor = connection.cursor()
    cursor.execute(sql + " limit 0")
    schemas = cursor.getSchema()
    connection.close()
    column_name_type_list = []
    print "result schemas:", schemas
    for column in schemas:
        column_name_type_list.append({
            "name": column['columnName'],
            "type": HiveType.change_hiveserver2_type(column['type'])
        })
    return column_name_type_list
def __init__(self, using=''):
    """
    @param cursor_hander: database handle
    """
    self.cursor = None
    self.cursor_hander = using
    self.connections = None
    if str(self.cursor_hander).rstrip() == '':
        print('please write Custom_Hive`s using param')
        exit(0)
    databases = {
        'ares_dw': {'host': '10.0.0.2', 'user': '******', 'password': '', 'database': 'test', 'port': 10000, 'authMechanism': 'NOSASL'},
        'hadoops2': {'host': '10.0.0.2', 'user': '******', 'password': '', 'database': 'test', 'port': 10000, 'authMechanism': 'NOSASL'},
        'basic_data': {'host': '10.0.0.2', 'user': '******', 'password': '', 'database': 'basic_data', 'port': 10000, 'authMechanism': 'NOSASL'}
    }
    database = databases[self.cursor_hander]
    self.connections = pyhs2.connect(host=database['host'],
                                     port=int(database['port']),
                                     authMechanism=database['authMechanism'],
                                     user=database['user'],
                                     password=database['password'],
                                     database=database['database'],
                                     )
    self.cursor = self.connections.cursor()
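# Hedged usage sketch for the wrapper above: pick one of the configured
# connection profiles by name and run a simple query. The class name
# Custom_Hive is inferred from the error message in __init__ and is an
# assumption, not confirmed by the snippet itself.
hive_db = Custom_Hive(using='basic_data')   # one of the keys in the databases dict
hive_db.cursor.execute("show tables")
for row in hive_db.cursor.fetch():
    print row
hive_db.cursor.close()
hive_db.connections.close()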
def get_hive_connection(master_host):
    c = pyhs2.connect(host=master_host,
                      port=10000,
                      authMechanism="PLAIN",
                      user='******',
                      database='default')
    return c
def test_result_sets_match(self):
    for lim in (0, 100, 10000, 11000, 20000, 21000):
        with pyhs2.connect(**self.hive_cfg) as conn:
            with conn.cursor() as cur:
                cur.execute('SELECT * FROM {db}.{tbl} LIMIT {limit}'.format(db=self.test_db, tbl=self.test_tbl, limit=lim))
                res = cur.fetch()
                self.assertEqual(res, self.test_data[:lim])
def __call__(self):
    try:
        self.conn = pyhs2.connect(host=self.host,
                                  port=self.port,
                                  authMechanism="PLAIN",
                                  user=self.user,
                                  password=self.password,
                                  database=self.database,
                                  timeout=self.timeout)
        if self.conn is None:
            raise Exception("can't connect hiveserver")
        cur = self.conn.cursor()
        cur.execute("show tables")
        if len(cur.fetch()) > 0:
            self._readq.nput("hive.state %s %s" % (int(time.time()), '0'))
        else:
            raise Exception("can not find any tables")
    except Exception as e:
        self._readq.nput("hive.state %s %s" % (int(time.time()), '1'))
        self.log_error(e)
    finally:
        if self.conn is not None:
            self.conn.close()
def __GetConnect(self):
    if self.database == 'sqlserver':
        import pyodbc
        conn_info = 'DRIVER={SQL Server};DATABASE=%s;SERVER=%s;UID=%s;PWD=%s' % (
            self.db, str(self.host) + ',' + str(self.port), self.user, self.pwd)
        conn = pyodbc.connect(conn_info)
        # except:
        #     import pymssql
        #     conn = pymssql.connect(host=str(self.host)+':'+str(self.port), user=self.user, password=self.pwd, database=self.db)
    elif self.database == 'hive':
        import pyhs2
        conn = pyhs2.connect(host=self.host,
                             port=self.port,
                             user=self.user,
                             authMechanism="PLAIN",
                             password=self.pwd)
    else:
        import pyodbc
        conn_info = ('Driver={MySQL ODBC 5.1 Driver};Server=%s;Port=%s;Database=%s;User=%s; Password=%s;Option=3;' %
                     (self.host, self.port, self.db, self.user, self.pwd))
        conn = pyodbc.connect(conn_info)
    self.conn = conn
    cur = self.conn.cursor()  # create (and return) a cursor object on the connection
    if not cur:
        raise NameError("database connection failed")
    else:
        return cur
def __init__(self, db_host, user, password, port=10000, authMechanism="PLAIN"):
    self.conn = pyhs2.connect(host=db_host,
                              port=port,
                              authMechanism=authMechanism,
                              user=user,
                              password=password,
                              )
def get_tweet(consumer):
    try:
        for message in consumer:
            if message is not None:
                # print message.offset, message.value, time.strftime("%Y%m")
                with pyhs2.connect(host='localhost',
                                   port=10000,
                                   authMechanism="PLAIN",
                                   user='******',
                                   password='******',
                                   database='default') as conn:
                    with conn.cursor() as cur:
                        # Show databases
                        print cur.getDatabases()
                        # Execute query
                        cur.execute(
                            "create table if not exists twitter_grippe(ID varchar(255), tweet string, date_month string)")
                        cur.execute(
                            "INSERT INTO table twitter_grippe values ('{}','{}','{}')"
                            .format(message.offset, message.value, time.strftime("%Y%m")))
            elif not message:
                print 'No message'
            else:
                print 'Something else happened..'
    except KeyboardInterrupt as e:
        pass
    return message.offset, message.value, time.strftime("%Y%m")
def distcp_and_import(src_data_block_url, table_name, hive_table_meta, sandbox_ip, file_path):
    dis_data_block_url = 'hdfs://' + sandbox_ip + ':9000' + file_path
    status_distcp = []
    status_distcp.append(distcp(src_data_block_url, dis_data_block_url))
    status_import = []
    try:
        with pyhs2.connect(host=sandbox_ip,
                           port=10000,
                           authMechanism="PLAIN",
                           user='******',
                           password='******',
                           database='default') as conn:
            with conn.cursor() as cur:
                # Show databases
                # print cur.getDatabases()
                query = "create external table " + table_name + \
                        "(" + hive_table_meta + ")" + \
                        " ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE LOCATION " + \
                        "'" + file_path + "'"
                print(query)
                print(cur.execute(query))
                result = 0
    except BaseException:
        result = 1
    status_import.append(result)
    print [status_distcp, status_import]
def process(self, tup):
    movie_id = tup.values[0]
    # [{'field_name' : 'field_value', ...}, ....]
    tweets = json.loads(tup.values[1])
    # For debugging.
    # movie_id = tup[0]
    # tweets = json.loads(tup[1])
    self.log('Received %d tweets for movie %s' % (len(tweets), movie_id))
    tweets_bow = []
    for t in tweets:
        text = self.processTweet(t['text'])
        bag_of_words = self.getWordsSet(text)
        tweets_bow.append(bag_of_words)
    features = self.build_features(tweets_bow)
    pred = self.classifier.predict(features)
    pos_count = sum(pred)
    neg_count = len(pred) - pos_count
    self.log('SE: %s +ve:%d -ve:%d' % (movie_id, pos_count, neg_count))
    with pyhs2.connect(host=get_config('host'),
                       port=get_config('port'),
                       authMechanism=get_config('auth'),
                       user=str(get_config('user')),
                       database=get_config('database')) as conn:
        with conn.cursor() as cur:
            query = ("INSERT INTO " + get_config('se_score_tablename') +
                     " VALUES (" + str(int(time.time())) + ", '" + movie_id + "', " +
                     str(pos_count) + ", " + str(neg_count) + ")")
            if self.verbose:
                self.log(query)
            cur.execute(query)
def listen(id, listen_type):
    """Returning the number of playevents longer than 30 sec with ok flag for certain ids"""
    # testing if listen type is correct
    if listen_type not in ('release', 'track', 'playlist'):
        raise AttributeError("provide valid listen_type: 'release', 'playlist' or 'track'")
    # splitting ids
    cid = id_split(id)
    # uncomment the lines below to print created Hive query
    # print ("select count(1), src_id, src_type from playevent "
    #        "where (%s) and src_type = '%s' and play_duration > 30 and ok_flag "
    #        "group by src_id, src_type" % (cid, listen_type))
    # connecting to Hive, sending query, returning results of query
    conn = pyhs2.connect(host='nif-nif.zvq.me',
                         port=10000,
                         authMechanism="NOSASL",
                         user='******',
                         password='******',
                         database='default')
    cur = conn.cursor()
    # note: the concatenated string literals need trailing spaces so the
    # generated HiveQL keeps its keywords separated
    cur.execute(
        "select count(1), src_id, src_type from playevent "
        "where (%s) and src_type = '%s' and play_duration > 30 and ok_flag "
        "group by src_id, src_type" % (cid, listen_type))
    results = cur.fetch()
    cur.close()
    conn.close()
    return results
def apriori(): with pyhs2.connect(host="localhost", port=10000, authMechanism="PLAIN", user="******", password="******", database=u"minor") as conn: file_iter = open('data', 'w') with conn.cursor() as cur: cur.execute("select * from trainingset") for i in cur.fetch(): for j in i: file_iter.write(str(j) + ",") file_iter.write("\n") file_iter.close() inFile = dataFromFile('data') parametera = raw_input("first parameter") parameterb = raw_input("second parameter") minSupport = 0.15 minConfidence = 0.15 items, rules = runApriori(inFile, minSupport, minConfidence) printResults(items, rules, str(parametera), str(parameterb))
def create(descriptor_path, environment):
    with open(descriptor_path) as descriptor_file:
        contents = descriptor_file.read()
        descriptor = json.loads(contents)
    logging.debug("_deploy_hbase: %s", descriptor)
    hbase = starbase.Connection(host=environment['hbase_rest_server'],
                                port=int(environment['hbase_rest_port']))
    hive_host = environment['hive_server']
    hive_port = environment['hive_port']
    hive = pyhs2.connect(host=hive_host,
                         port=hive_port,
                         authMechanism="PLAIN",
                         user='******',
                         password='******',
                         database='default')
    for element in descriptor:
        if 'table' in element and 'col_family' in element:
            table = hbase.table('%s' % element['table'])
            table.create(element['col_family'])
            for qry in element['hive_schema']:
                hive.cursor().execute(qry)
    hive.close()
def hive_query(sqls, is_select=True, meta=False):
    '''
    Execute Hive SQL statements.
    :param sqls: semicolon-separated SQL statements
    :param is_select: whether the statement is a select
    :param meta: whether to also return the column names
    :return:
    '''
    hive_connection = {
        "database": "elengjing",
        "host": "172.16.1.12",
        "user": "******",
        "password": "******",
        "port": 10000,
        "authMechanism": "PLAIN"
    }
    conn = pyhs2.connect(**hive_connection)
    try:
        with conn.cursor() as cursor:
            for sql in sqls.split(";"):
                if sql.strip():
                    cursor.execute(sql.strip().encode('utf8'))
            if is_select:
                columns = [_['columnName'] for _ in cursor.getSchema()]
                rows = [dict(zip(columns, _)) for _ in cursor]
                if meta:
                    return rows, columns
                else:
                    return rows
    except Exception, e:
        raise e
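# Illustrative call of hive_query() above; the table name is hypothetical and
# the connection details are the ones hard-coded inside the function, so this
# only sketches the calling convention (meta=True returns rows plus columns).
rows, columns = hive_query("select * from elengjing.some_table limit 10", is_select=True, meta=True)
print columns
for row in rows:
    print row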
def get_cursor():
    conn = pyhs2.connect(host='',
                         port=10000,
                         authMechanism="PLAIN",
                         user='******',
                         password='',
                         database='test')
    return conn.cursor()
def get_conn(self):
    db = self.get_connection(self.hiveserver2_conn_id)
    return pyhs2.connect(host=db.host,
                         port=db.port,
                         authMechanism=db.extra_dejson.get('authMechanism', 'NOSASL'),
                         user=db.login,
                         database=db.schema or 'default')
def get_connection(self):
    conn = pyhs2.connect(host=self.params.get('host'),
                         port=self.params.get('port'),
                         authMechanism=self.params.get('authMechanism'),
                         user=self.params.get('user'),
                         password=self.params.get('password'),
                         database=self.params.get('db'))
    return conn
def __init__(self, **hive_cfg):
    self.args = self.create_hive_config(**hive_cfg)
    self.columns = []
    try:
        self.conn = pyhs2.connect(**self.args)
        self.cursor = self.conn.cursor()
    except Exception as e:
        logging.error("Could not get a cursor. Reason:\n %s", e)
def __init__(self, database_name):
    conn = pyhs2.connect(host='192.168.11.55',
                         port=10000,
                         authMechanism="PLAIN",
                         user='******',
                         password='',
                         database=database_name)
    self.cur = conn.cursor()
def get_conn(self):
    db = self.get_connection(self.hiveserver2_conn_id)
    return pyhs2.connect(
        host=db.host,
        port=db.port,
        authMechanism=db.extra_dejson.get('authMechanism', 'NOSASL'),
        user=db.login,
        database=db.schema or 'default')
def insert_into_pb_dim_pred(self, strat_algos):
    logging.info('')
    logging.info(" Inserting data into hive ...")
    insert_statement = """
        INSERT OVERWRITE TABLE {}.pom_pb_dimension_predictions PARTITION (algo_id)
        SELECT DISTINCT *
        FROM (SELECT strategy_id,
                     dimension_value_structs,
                     probability_score,
                     algo_id
              FROM (SELECT opti.strategy_id,
                           row_number() over (partition BY opti.strategy_id ORDER BY preds.cumulative_users ASC) AS row_n,
                           NAMED_STRUCT('dimension_type_id', 1,
                                        'dimension_value', NAMED_STRUCT('first', CAST(sd.site_domain_id AS STRING),
                                                                        'last', "",
                                                                        'unit', "")) AS dimension_value_structs,
                           preds.percentage_recommendations AS probability_score,
                           opti.algorithm_id AS algo_id
                    FROM {}.pom_programmable_bid_domainbidderinputfield opti
                    JOIN radiumone_master.linez l ON opti.line_id = l.line_id
                    JOIN dchugh.line_recommender_cumsum preds ON l.line_id = preds.line_id
                    JOIN radiumone_master.SITE_DOMAINS sd ON (preds.domain_name = sd.NAME)
                   ) first_domain
              WHERE first_domain.row_n = 1
              UNION ALL
              SELECT opti.strategy_id,
                     NAMED_STRUCT('dimension_type_id', 1,
                                  'dimension_value', NAMED_STRUCT('first', CAST(sd.site_domain_id AS STRING),
                                                                  'last', "",
                                                                  'unit', "")) AS dimension_value_structs,
                     preds.percentage_recommendations AS probability_score,
                     opti.algorithm_id AS algo_id
              FROM {}.pom_programmable_bid_domainbidderinputfield opti
              JOIN radiumone_master.linez l ON opti.line_id = l.line_id
              JOIN dchugh.line_recommender_cumsum preds ON l.line_id = preds.line_id
              JOIN radiumone_master.SITE_DOMAINS sd ON (preds.domain_name = sd.NAME)
              WHERE (preds.cumulative_users <= opti.unique_count or (opti.unique_count = -1))
             ) full_query
        ORDER BY strategy_id
    """.format(pb_properties.hive_db, pb_properties.hive_db, pb_properties.hive_db)
    with pyhs2.connect(
            host=pb_properties.hive_server_host,
            user=pb_properties.hive_server_user,
            password=pb_properties.hive_server_password,
            port=pb_properties.hive_server_port,
            authMechanism=pb_properties.hive_server_auth) as conn:
        with conn.cursor() as cur:
            logging.info('')
            logging.info(' In host, jobserver, executing the hql statement: ' + insert_statement)
            hive_settings_dyn_partition_enable = "set hive.exec.dynamic.partition=true"
            hive_settings_dyn_partition_mode = "set hive.exec.dynamic.partition.mode=nonstrict"
            cur.execute(hive_settings_dyn_partition_enable)
            cur.execute(hive_settings_dyn_partition_mode)
            cur.execute(insert_statement)
def get_conn(self): db = self.get_connection(self.hiveserver2_conn_id) auth_mechanism = db.extra_dejson.get("authMechanism", "NOSASL") if conf.get("core", "security") == "kerberos": auth_mechanism = db.extra_dejson.get("authMechanism", "KERBEROS") return pyhs2.connect( host=db.host, port=db.port, authMechanism=auth_mechanism, user=db.login, database=db.schema or "default" )
def _set_hive(self):
    hive = pyhs2.connect(
        host=self.config['hive']['host'],
        port=int(self.config['hive']['port']),
        authMechanism=self.config['hive']['authMechanism'],
        user=self.config['hive']['username'],
        password="",
        database=self.config['hive']['db'])
    return hive.cursor()
def run(self, connection, date):
    target_date = (date - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    raw_adi_logs_pathname = os.path.join(
        tempfile.gettempdir(),
        "%s.raw_adi_logs.TEMPORARY%s" % (
            target_date,
            '.txt'
        )
    )
    try:
        with open(raw_adi_logs_pathname, 'w') as f:
            hive = pyhs2.connect(
                host=self.config.hive_host,
                port=self.config.hive_port,
                authMechanism=self.config.hive_auth_mechanism,
                user=self.config.hive_user,
                password=self.config.hive_password,
                database=self.config.hive_database
            )
            cur = hive.cursor()
            query = self.config.query % target_date
            cur.execute(query)
            for row in cur:
                if None not in row:
                    f.write(
                        "\t".join(
                            urllib2.unquote(str(v)).replace('\\', '\\\\')
                            for v in row
                        )
                    )
                    f.write("\n")
        with open(raw_adi_logs_pathname, 'r') as f:
            pgcursor = connection.cursor()
            pgcursor.copy_from(
                f,
                'raw_adi_logs',
                null='None',
                columns=[
                    'report_date',
                    'product_name',
                    'product_os_platform',
                    'product_os_version',
                    'product_version',
                    'build',
                    'build_channel',
                    'product_guid',
                    'count'
                ]
            )
            pgcursor.execute(_RAW_ADI_QUERY, (target_date,))
    finally:
        if os.path.isfile(raw_adi_logs_pathname):
            os.remove(raw_adi_logs_pathname)
def __init__(self, host, port, username, password, db, table_name, column_order):
    self.conn = pyhs2.connect(host=host,
                              port=port,
                              authMechanism="PLAIN",
                              user=username,
                              password=password,
                              database=db)
    self.table_name = table_name
    self.column_order = column_order
def __init__(self, host, port, user, passwd, db):
    self._host = host
    self._port = port
    self._user = user
    self._passwd = passwd
    self._db = db
    self._conn = pyhs2.connect(host=self._host,
                               port=self._port,
                               authMechanism="PLAIN",
                               user=self._user,
                               password=self._passwd,
                               database=self._db)
    self._cur = self._conn.cursor()
def get_conn(self):
    db = self.get_connection(self.hiveserver2_conn_id)
    auth_mechanism = db.extra_dejson.get('authMechanism', 'NOSASL')
    if conf.get('core', 'security') == 'kerberos':
        auth_mechanism = db.extra_dejson.get('authMechanism', 'KERBEROS')
    return pyhs2.connect(
        host=db.host,
        port=db.port,
        authMechanism=auth_mechanism,
        user=db.login,
        database=db.schema or 'default')
def __init__(self):
    host = "dma-tools-01.eng.solidfire.net"
    auth = "PLAIN"
    user = "******"
    password = "******"
    database = "aiq_prod"
    self.hive = pyhs2.connect(host=host,
                              port=10000,
                              authMechanism=auth,
                              user=user,
                              password=password,
                              database=database)
def loadResToHive(host, port, database, user, password, authMechanism, resultFilePath, partition):
    # connect to hive database
    with pyhs2.connect(host=host,
                       port=port,
                       database=database,
                       user=user,
                       password=password,
                       authMechanism=authMechanism) as conn:
        with conn.cursor() as cur:
            str = "load data local inpath " + resultFilePath + " into table IntRes partition(dt='" + partition + "')"
            # cur.execute("load data local inpath '/home/hadoop/PycharmProjects/IDWInterpolation/concatenateResult.txt' into table IntRes partition(dt='2016-03-01')")
            cur.execute(str)
def conn(cls):
    """
    return the hive connection handle to the configured server
    """
    config = cls.config
    try:
        connection = hive.connect(host=config['host'],
                                  port=config.get('port', cls.DEFAULT_PORT),
                                  authMechanism='NOSASL',
                                  user=config['username'],
                                  password=config['password'])
    except Exception, e:
        raise DbError("Cannot connect to Hive Server: %s\n"
                      "Ensure that the server is running and you can connect normally"
                      % e.message)
def make_connection(delimiter, host, user, pw, database):
    conn = pyhs2.connect(host=host,
                         user=user,
                         database=database,
                         password=pw,
                         port=10000,
                         authMechanism="PLAIN")
    cur = conn.cursor()
    # Show databases
    # print cur.getDatabases()
    return cur, conn
def create_connection(self):
    """
    Override the create_connection from the Netezza class
    which gets called in its initializer
    """
    import pyhs2
    conn = pyhs2.connect(host=self.host,
                         port=self.port,
                         authMechanism=self.auth_mechanism,
                         user=self.user,
                         password=self.password,
                         database=self.database)
    return conn
def create_flat_table(self, table_name):
    table_statement = self.table.get_table_column_with_type(table_name) + " )"
    table_statement = table_statement + " ROW FORMAT DELIMITED"
    table_statement = table_statement + " FIELDS TERMINATED BY ',' LINES TERMINATED BY "
    table_statement = table_statement + " '\\n' STORED AS TEXTFILE LOCATION '/user/sqoop2/" + table_name + "/'"
    table_statement = table_statement.replace("numeric", "decimal")
    table_statement = table_statement.replace("timestamp(6)", "timestamp")
    table_statement = "CREATE EXTERNAL TABLE " + self.config['HIVE_DB'] + "." + table_name + "_text (" + table_statement
    self.logger.info("RUNNING at HIVE: %s", table_statement)
    with pyhs2.connect(host=self.env['HIVE_HOST'], port=10000, authMechanism="PLAIN",
                       user='******', password='', database=self.config['HIVE_DB']) as conn:
        with conn.cursor() as cur:
            cur.execute(table_statement)
def isConnected():
    with pyhs2.connect(host='localhost',
                       port=10000,
                       authMechanism="PLAIN",
                       user='******',
                       password='******',
                       database='default') as conn:
        with conn.cursor() as cur:
            # Show databases
            try:
                cur.getDatabases()
                return True
            except Exception:
                return False
def getDataForInt():
    with pyhs2.connect(host='172.20.111.54',
                       port=10001,
                       database='default',
                       user='******',
                       password='******',
                       authMechanism="PLAIN") as conn:
        with conn.cursor() as cur:
            # get the table which stores the rainfall values of the meteorological stations
            cur.execute("select site,value from awsrain")
            # cur.execute("select * from awsstationdesc")
            awsrain = cur.fetch()
            return awsrain
def get_results(self, hql, schema='default', arraysize=1000):
    schema = schema or 'default'
    with pyhs2.connect(
            host=self.hiveserver2_conn.host,
            port=self.hiveserver2_conn.port,
            authMechanism="NOSASL",
            user='******',
            database=schema) as conn:
        with conn.cursor() as cur:
            cur.execute(hql)
            return {
                'data': cur.fetchall(),
                'header': cur.getSchema(),
            }
def getStationInfo(host, port, database, user, password, authMechanism):
    with pyhs2.connect(host=host,
                       port=port,
                       database=database,
                       user=user,
                       password=password,
                       authMechanism=authMechanism) as conn:
        with conn.cursor() as cur:
            # get the table which stores the geo info of the meteorological stations
            cur.execute("select sid,lon,lat from awsstationdesc")
            # cur.execute("select * from awsstationdesc")
            listStationInfo = cur.fetch()
            del listStationInfo[0]
            return listStationInfo