def setUpClass(cls):
    """Prepare the shared fixtures: a MongoDB collection and a Hive client.

    Connects to mongod, starts the Hive server, opens a Thrift transport to
    it and registers the JARs needed by the tests. On a Thrift error the
    message is printed and any transport created so far is closed; in that
    case ``cls.transport``, ``cls.client`` and ``cls.mongoc`` are not
    assigned.
    """
    transport = None
    try:
        # MongoDB side: connect and resolve the test database/collection.
        mongo = MongoClient(hostname, mongoPort)
        dbName, collName = Helpers.getDBAndCollNames(mongoTestURI)

        # Hive side: launch the server before talking to it over Thrift.
        cls.hserverpid = Helpers.startHiveServer()
        if verbose:
            print("Successfully started hive server")

        sock = TSocket.TSocket(hostname, hivePort)
        transport = TTransport.TBufferedTransport(sock)
        client = ThriftHive.Client(TBinaryProtocol.TBinaryProtocol(transport))
        transport.open()

        # The tests need their JARs registered before any query runs.
        Helpers.addJars(client)

        cls.transport, cls.client = transport, client
        cls.mongoc = mongo[dbName][collName]
    except Thrift.TException as err:
        print('Error: %s' % (err.message))
        if transport:
            transport.close()
def execute(self, quals, columns):
    """Run the Hive query for this table and yield one dict per row.

    The statement is ``self.query`` when set, otherwise a ``SELECT`` of all
    keys of ``self.columns`` from ``self.table``. Each fetched row is split
    on tabs and mapped positionally onto ``self.columns``.

    ``quals`` and ``columns`` are accepted but not used by this method.

    Thrift errors are reported through ``log_to_postgres`` at ERROR level.
    The transport is always closed, including when the generator is
    abandoned before exhaustion.
    """
    if self.query:
        statement = self.query
    else:
        statement = ("SELECT " + ",".join(self.columns.keys()) +
                     " FROM " + self.table)
    log_to_postgres('Hive query: ' + unicode(statement), DEBUG)

    transport = None
    try:
        transport = TTransport.TBufferedTransport(
            TSocket.TSocket(self.host, self.port))
        client = ThriftHive.Client(TBinaryProtocol.TBinaryProtocol(transport))
        transport.open()
        client.execute(statement)
        for row in client.fetchAll():
            cols = row.split("\t")
            line = {}
            # Positional mapping; a short row still raises IndexError as before.
            for idx, column_name in enumerate(self.columns):
                line[column_name] = cols[idx]
            yield line
    except Thrift.TException as tx:
        log_to_postgres(tx.message, ERROR)
    finally:
        # Previously the connection was never closed.
        if transport is not None:
            transport.close()
def connect(self, ip, port, db, user='', passwd=''):
    """Open a Hive Thrift connection and switch to database ``db``.

    Stores the connection parameters on the instance, opens a buffered
    transport to ``ip:port`` and executes ``use <db>``.

    Returns True on success, with ``self.transport`` and ``self.client``
    set. On a Thrift error returns False, resets both attributes to None,
    logs the error through ``self.logger`` and closes the transport if it
    had been created (it used to stay open when ``use <db>`` failed).
    """
    transport = None
    try:
        self.ip = ip
        self.port = port
        self.db = db
        self.user = user
        self.passwd = passwd

        transport = TTransport.TBufferedTransport(TSocket.TSocket(ip, port))
        client = ThriftHive.Client(TBinaryProtocol.TBinaryProtocol(transport))
        transport.open()
        client.execute('use %s' % db)
        # Disabled setup statements kept from the original:
        #client.execute('add jar /opt/modules/hive/HivePlugin.jar')
        #client.execute("create temporary function getpid as 'com.baofeng.data.hive.UDFGetPid'")
        #if mapred_queue != "":
        #    client.execute('set mapred.job.queue.name=%s' % mapred_queue)

        self.transport = transport
        self.client = client
        return True
    except Thrift.TException as tx:
        self.transport = None
        self.client = None
        self.logger('pdbc hive error: %s' % (tx.message), 'error')
        if transport is not None:
            transport.close()
        return False
def get_metastore_client(self):
    """
    Returns a Hive thrift client.

    The client talks to ``self.metastore_conn`` (host/port). When the
    ``core.security`` setting is ``kerberos`` and the connection's
    ``authMechanism`` extra is ``GSSAPI`` (the default in that mode), the
    socket is wrapped in a SASL transport; otherwise a plain buffered
    transport is used. The transport is not opened here.
    """
    from thrift.transport import TSocket, TTransport
    from thrift.protocol import TBinaryProtocol
    from hive_service import ThriftHive

    ms = self.metastore_conn
    use_kerberos = configuration.get('core', 'security') == 'kerberos'

    auth_mechanism = ms.extra_dejson.get('authMechanism', 'NOSASL')
    if use_kerberos:
        auth_mechanism = ms.extra_dejson.get('authMechanism', 'GSSAPI')
        kerberos_service_name = ms.extra_dejson.get(
            'kerberos_service_name', 'hive')

    socket = TSocket.TSocket(ms.host, ms.port)
    if use_kerberos and auth_mechanism == 'GSSAPI':
        try:
            import saslwrapper as sasl
        except ImportError:
            import sasl

        def sasl_factory():
            sasl_client = sasl.Client()
            sasl_client.setAttr("host", ms.host)
            # Was `sasl_client("service", ...)`, which calls the client
            # object instead of setting the attribute.
            sasl_client.setAttr("service", kerberos_service_name)
            sasl_client.init()
            # The transport uses the factory's return value; the original
            # returned None.
            return sasl_client

        from thrift_sasl import TSaslClientTransport
        transport = TSaslClientTransport(sasl_factory, "GSSAPI", socket)
    else:
        transport = TTransport.TBufferedTransport(socket)

    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    return ThriftHive.Client(protocol)
def get_hive_client(self):
    '''
    Returns a Hive thrift client.

    The client is bound to a buffered transport for ``self.host`` and
    ``self.port``; the transport is not opened here.
    '''
    sock = TSocket.TSocket(self.host, self.port)
    buffered = TTransport.TBufferedTransport(sock)
    return ThriftHive.Client(TBinaryProtocol.TBinaryProtocol(buffered))
def hiveQuery(sql, host='10.60.32.100', port=10000):
    """Execute ``sql`` on a Hive server and return all result rows.

    :param sql: statement passed to the server unchanged.
    :param host: Hive server host; defaults to the previously hard-coded one.
    :param port: Hive server port; defaults to the previously hard-coded one.
    :return: whatever ``client.fetchAll()`` returns for the statement.

    The transport is closed before returning or raising; it used to be
    left open on every call.
    """
    tSocket = TSocket.TSocket(host, port)
    tTransport = TTransport.TBufferedTransport(tSocket)
    protocol = TBinaryProtocol.TBinaryProtocol(tTransport)
    client = ThriftHive.Client(protocol)
    tTransport.open()
    try:
        client.execute(sql)
        return client.fetchAll()
    finally:
        tTransport.close()
def get_metastore_client(self):
    '''
    Returns a Hive thrift client.

    Built on a buffered transport to the host/port of
    ``self.metastore_conn``; the transport is not opened here.
    '''
    conn = self.metastore_conn
    sock = TSocket.TSocket(conn.host, conn.port)
    buffered = TTransport.TBufferedTransport(sock)
    return ThriftHive.Client(TBinaryProtocol.TBinaryProtocol(buffered))
def create_client(self, connection):
    """
    Creates a Hive client.

    ``connection`` is handed directly to ``TBinaryProtocol`` as its
    transport; the resulting protocol backs the returned client.
    """
    from thrift.protocol import TBinaryProtocol
    from hive_service import ThriftHive

    return ThriftHive.Client(TBinaryProtocol.TBinaryProtocol(connection))
def reportCommunities(table,dt,comm):
    """
    Output the detected communities to the suspiciousdnsfailures Hive table

    Writes one tab-separated line per valid (srcip, fqdn) edge of every
    community graph in ``comm`` to
    ``<outputDirectory>/suspiciousdnsfailures_<table>_<dt>.txt``, then loads
    that file into partition ``dt`` of the Hive table.

    table -- name of the source table; stored in each row and in the file name
    dt    -- partition value, also used in the file name
    comm  -- iterable of graphs whose nodes carry a 'bipartite' attribute
             (0 or 1, as tested below)

    Thrift errors during the Hive load are written to stderr.
    """
    sys.stderr.write("Report suspicious IPs.\n")

    outputFile = open("%s/suspiciousdnsfailures_%s_%s.txt" % (outputDirectory,table,dt), "w")
    try:
        for commId, G in enumerate(comm):
            comfqdns = set(n for n,d in G.nodes(data=True) if d['bipartite']==1)
            degrees = bipartite.degree_centrality(G,comfqdns)

            for e in G.edges():
                # Compute all fields to store in the DB
                side0 = G.node[e[0]]["bipartite"]
                side1 = G.node[e[1]]["bipartite"]
                if side0 == 0 and side1 == 1:
                    srcip, fqdn = e[0], e[1]
                elif side0 == 1 and side1 == 0:
                    srcip, fqdn = e[1], e[0]
                else:
                    # `% e` with a 2-tuple raised TypeError; wrap it. Skip the
                    # edge too: falling through wrote stale or unbound values.
                    sys.stderr.write("Error: Invalid edge (%s)\n" % (e,))
                    continue

                # Mean of the two endpoint centralities. The original
                # `a + b/2.0` only halved the second term, so the value could
                # exceed the [0, 1] range the thresholds below assume.
                degree = (degrees[e[0]]+degrees[e[1]])/2.0
                conf = "LOW"
                if degree > 0.66:
                    conf = "HIGH"
                elif degree > 0.33:
                    conf = "MED"

                outputFile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t\n" % (fqdn,srcip,commId,G.order(),degree,conf,table))
    finally:
        outputFile.close()

    # Store results in Hive
    transport = None
    try:
        transport = TSocket.TSocket('localhost', 10000)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()

        client.execute("create table if not exists suspiciousdnsfailures (fqdn string, srcip string, clusterid int, clustersize bigint, degree double, confidence string, table string) partitioned by(dt string) row format delimited fields terminated by '\t'")
        client.execute("load data local inpath '{dir}/suspiciousdnsfailures_{table}_{date}.txt' into table suspiciousdnsfailures partition (dt='{date}')".format(date=dt,dir=outputDirectory,table=table))

        # Disabled follow-up aggregation kept from the original:
        #create table suspiciousdnsfailuresIP_dns_pcaps (ip1 string, ip2 string, fqdn_overlap int) partitioned by (dt string);
        #client.execute("insert table suspiciousdnsfailuresIP partition (dt='{date}') select t1.srcip, t2.srcip, count(*) from suspiciousdnsfailures as t1 join suspiciousdnsfailures as t2 on (t1.clusterid=t2.clusterid and t1.fqdn=t2.fqdn and t1.dt='{date}' and t2.dt='{date}') where t1.srcip!=t2.srcip and t1.table='{table}' and t2.table='{table}' group by t1.srcip, t2.srcip".format(table=table,date=dt))
    except Thrift.TException as tx:
        sys.stderr.write('%s\n' % (tx.message))
    finally:
        # The close call was commented out, so the connection was never released.
        if transport is not None:
            transport.close()
def run_query(q):
    """Execute ``q`` on the Hive server and return its rows as field lists.

    Each fetched row is split on tab characters. The transport is closed
    even when execution or fetching raises; previously it leaked then.
    """
    socket = TSocket.TSocket("ec2-107-20-75-29.compute-1.amazonaws.com", 10000)
    transport = TTransport.TBufferedTransport(socket)
    protocol = TBinaryProtocolAccelerated(transport)
    client = ThriftHive.Client(protocol)

    transport.open()
    try:
        client.execute(q)
        rows = client.fetchAll()
    finally:
        transport.close()
    return [r.split('\t') for r in rows]
def test():
    """Smoke-test the local Hive server: run ``SELECT 1;`` and print one row.

    The transport is closed even if the query or fetch raises.
    """
    transport = TSocket.TSocket('localhost', 10000)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = ThriftHive.Client(protocol)

    transport.open()
    try:
        client.execute("SELECT 1;")
        print(client.fetchOne())
    finally:
        transport.close()
def hive_based_calculations(connection, site_id, work_dir, backfilled_raw_logs_path, do_calculations=do_calculations):
    """Open a Hive client on localhost:10000 and run ``do_calculations`` with it.

    All arguments except ``do_calculations`` are forwarded unchanged,
    followed by the Hive client. The transport is closed when the callable
    returns or raises; previously an exception left it open.
    """
    transport = TSocket.TSocket('localhost', 10000)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = ThriftHive.Client(protocol)

    transport.open()
    try:
        do_calculations(connection, site_id, work_dir, backfilled_raw_logs_path, client)
    finally:
        transport.close()
def get_metastore_client(self):
    """
    Returns a Hive thrift client.

    Built on a buffered transport to the host/port of
    ``self.metastore_conn``; the transport is not opened here.
    """
    from hive_service import ThriftHive

    conn = self.metastore_conn
    sock = TSocket.TSocket(conn.host, conn.port)
    buffered = TTransport.TBufferedTransport(sock)
    return ThriftHive.Client(TBinaryProtocol.TBinaryProtocol(buffered))
def execute_alter_sql(sql, hive_server_addr, port=10000):
    """Execute one statement on the Hive server, printing any Thrift error.

    :param sql: statement passed to the server unchanged.
    :param hive_server_addr: Hive server host.
    :param port: Hive server port.

    Nothing is returned. The transport is now always closed; it was
    previously left open on both the success and the failure path.
    """
    transport = None
    try:
        transport = TSocket.TSocket(hive_server_addr, port)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()

        # Run the caller-supplied statement.
        client.execute(sql)
    except Thrift.TException as tx:
        print('%s' % (tx.message))
    finally:
        if transport is not None:
            transport.close()
def connect(self):
    """Connect to the Hive Thrift server unless a client already exists.

    Returns immediately when ``self.client`` is set and truthy. Otherwise
    opens a buffered transport to ``self.host:self.port`` and stores the
    new client on ``self.client``.

    Raises HiveClientError when a Thrift error occurs while connecting.
    """
    if getattr(self, 'client', None):
        return

    try:
        socket = TSocket.TSocket(self.host, self.port)
        transport = TTransport.TBufferedTransport(socket)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()
    except Thrift.TException as te:
        raise HiveClientError('Failed to connect to Thrift server\n' + te.message)

    # Publish the client only once the transport is open. Assigning it
    # before open() left a dead client behind on failure, which made every
    # later connect() return early.
    self.client = client
def reconnect(self):
    """Drop any existing transport and open a fresh connection.

    After the call ``self.transport`` is an opened buffered transport to
    ``self.host:self.port`` and ``self.client`` is a new client bound to it.
    """
    stale = getattr(self, "transport", None)
    if stale:
        stale.close()
        self.transport = None

    # Buffering is critical. Raw sockets are very slow
    raw_socket = TSocket.TSocket(self.host, self.port)
    self.transport = TTransport.TBufferedTransport(raw_socket)

    # The client encodes its calls through a binary protocol on the transport
    self.client = ThriftHive.Client(
        TBinaryProtocol.TBinaryProtocol(self.transport))

    self.transport.open()
def clean_table_partitions(table_name_list, max_logtime,
                           host='100.5.24.137', port=9991):
    """Drop partitions with ``log_time < max_logtime`` from each table.

    :param table_name_list: iterable of table names.
    :param max_logtime: string spliced into the partition predicate.
    :param host: Hive server host; defaults to the previously hard-coded one.
    :param port: Hive server port; defaults to the previously hard-coded one.

    Each DDL statement and its fetched result are printed. Thrift errors
    are printed and stop the loop. The transport is always closed; it used
    to leak when a statement failed.

    NOTE: the DDL is built by string concatenation, so both arguments must
    come from a trusted source.
    """
    transport = None
    try:
        transport = TSocket.TSocket(host, port)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()

        for table in table_name_list:
            drop_part_ddl = "ALTER TABLE " + table + " DROP PARTITION (log_time<'" + max_logtime + "')"
            print(drop_part_ddl)
            client.execute(drop_part_ddl)
            print(client.fetchAll())
    except Thrift.TException as tx:
        print('%s' % (tx.message))
    finally:
        if transport is not None:
            transport.close()
def query(self, vars_hql, hql, callback):
    """Run ``hql`` and pass its rows to ``callback`` as a list of dicts.

    ``vars_hql`` is executed first. Its rows, up to the first one containing
    ``'\\t \\t '``, supply the column names: the first tab-separated field of
    each row, with spaces removed. Each row of ``hql`` is then split on tabs
    and mapped positionally onto those names.

    ``callback`` receives the list of records on success, or None when a
    Thrift error occurs (the error message is also printed).

    Changes from the previous version: the name scan no longer raises
    IndexError when no separator row exists, the transport is closed on
    every path, and ``callback`` is invoked exactly once.
    """
    try:
        transport = TSocket.TSocket(self.host, self.port)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()
        try:
            # Get the list of the table's column names.
            # e.g. vars_hql = 'desc dmn.us_am_uid_class'
            client.execute(vars_hql)
            vars_name = []
            for row in client.fetchAll():
                row = str(row)
                if '\t \t ' in row:
                    # Separator row: stop collecting names here.
                    break
                vars_name.append(row.split('\t')[0].replace(' ', ''))

            # Get the data records of the table.
            # e.g. hql = 'select * from dmn.us_am_uid_class limit 5'
            client.execute(hql)
            records = []
            for row in client.fetchAll():
                record = {}
                for j, cont in enumerate(row.split('\t')):
                    record[vars_name[j]] = cont
                records.append(record)
        finally:
            transport.close()
    except Thrift.TException as tx:
        callback(None)
        print('%s' % (tx.message))
    else:
        # Outside the try, so an error raised by the callback itself cannot
        # trigger a second callback(None).
        callback(records)
def isvalid(ip, port):
    """Probe a Hive server by running ``conf.hive_valid_sql`` against it.

    :param ip: server host.
    :param port: server port; converted with ``int()``.
    :return: 1 if the statement executed and its result was fetched,
        0 if a Thrift error occurred (the error is logged).

    The socket timeout is set to 80000 before connecting. The transport is
    closed on every path, including non-Thrift exceptions, which previously
    leaked it.
    """
    log.msg("valid %s :%s " % (ip, port))
    sql = conf.hive_valid_sql
    transport = None
    try:
        transport = TSocket.TSocket(ip, int(port))
        transport.setTimeout(80000)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()
        client.execute(sql)
        # Fetch to make sure the server really answers; the rows are unused.
        client.fetchAll()
        return 1
    except Thrift.TException as tx:
        log.msg("Thrift.TException, tx%s" % tx)
        return 0
    finally:
        if transport is not None:
            transport.close()
def hiveExe(sql):
    """Execute ``sql`` on the configured Hive server and return all rows.

    Connects to ``hive_server_ip:hive_server_port``. On a Thrift error the
    message is printed and None is returned implicitly. The transport is
    always closed; it previously leaked when the statement failed.
    """
    transport = None
    try:
        transport = TSocket.TSocket(hive_server_ip, hive_server_port)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()

        client.execute(sql)
        return client.fetchAll()
    except Thrift.TException as tx:
        print('%s' % (tx.message))
    finally:
        if transport is not None:
            transport.close()
def QueryExe(hql, name, dates):
    """Run one query against the Hive server and return its fetched rows.

    The statement and its result are logged. On a Thrift error the error is
    logged, the lock file ``<lpath>/<name>_<dates>.lock`` is removed and the
    process exits with status 1.
    """
    lock_file = join(lpath, name + '_' + dates + '.lock')
    try:
        sock = TSocket.TSocket(ips, 10001)
        buffered = TTransport.TBufferedTransport(sock)
        client = ThriftHive.Client(TBinaryProtocol.TBinaryProtocol(buffered))
        buffered.open()

        logger.info('Query sql is:\n%s', hql)
        client.execute(hql)
        query = client.fetchAll()
        logger.info('Query sql result is:\n%s', query)

        buffered.close()
    except Thrift.TException as tx:
        # Log text: "an exception occurred during execution, details follow"
        logger.error(u'程序执行过程中发生异常, 错误信息如下\n%s', tx.message)
        os.remove(lock_file)
        # Log text: "program is exiting; lock file removed"
        logger.error(u'程序正在退出. 删除锁文件 %s', lock_file)
        sys.exit(1)
    else:
        return (query)
def HiveExe(hql, name, dates):
    """Execute each statement in ``hql`` on the Hive server, in order.

    Every statement is logged before it runs and a success line is logged
    after it. On a Thrift error the error is logged, the lock file
    ``<lpath>/<name>_<dates>.lock`` is removed and the process exits with
    status 1.
    """
    lock_file = join(lpath, name + '_' + dates + '.lock')
    try:
        sock = TSocket.TSocket(ips, 10001)
        buffered = TTransport.TBufferedTransport(sock)
        client = ThriftHive.Client(TBinaryProtocol.TBinaryProtocol(buffered))
        buffered.open()

        for statement in hql:
            logger.info('Executive sql is:\n%s', statement)
            client.execute(statement)
            # Results are intentionally not fetched here.
            logger.info('Successful implementation of this Sql')

        buffered.close()
    except Thrift.TException as tx:
        # Log text: "an exception occurred during execution, details follow"
        logger.error(u'程序执行过程中发生异常, 错误信息如下\n%s', tx.message)
        os.remove(lock_file)
        # Log text: "program is exiting; lock file removed"
        logger.error(u'程序正在退出. 删除锁文件 %s', lock_file)
        sys.exit(1)
def execsql(sql):
    """Execute ``sql`` on the configured Hive server and print its rows.

    Connects to ``conf['hive']['host']:conf['hive']['port']``.

    Returns True when the statement ran and its result was printed, False
    when a Thrift error occurred (the message is printed). The transport is
    closed on both paths; it previously leaked on failure.
    """
    try:
        transport = TSocket.TSocket(conf['hive']['host'], conf['hive']['port'])
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()
        try:
            print("hive connect")
            client.execute(sql)
            print(client.fetchAll())
        finally:
            transport.close()
        print("close hive connect")
        return True
    except Thrift.TException as tx:
        print('%s' % (tx.message))
        return False
def query(self, hsql, callback):
    """Run ``hsql`` and hand the fetched rows to ``callback``.

    ``callback`` receives the result of ``fetchAll()`` on success, or None
    when a Thrift error occurs (the message is also printed).

    The transport is closed on every path, and ``callback`` is invoked
    exactly once. Previously an error raised inside the callback, or by
    ``close()``, could trigger a second ``callback(None)``.
    """
    try:
        transport = TSocket.TSocket(self.host, self.port)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()
        try:
            # Get the data records of the table.
            client.execute(hsql)
            rows = client.fetchAll()
        finally:
            transport.close()
    except Thrift.TException as tx:
        callback(None)
        print('%s' % (tx.message))
    else:
        callback(rows)

#
# app=hiveDB('182.92.183.76',9084)
# app.query()
def executeSql(host, command):
    """Run every ``;``-separated statement in ``command`` on a Hive server.

    :param host: Hive server host (port 10000 is used).
    :param command: one or more statements separated by ``;``.
    :return: a list holding, per statement, its fetched rows followed by a
        ``----------Time: N.NNNs----------`` line. If any exception occurs,
        a one-element list with the exception text is returned instead.

    This function never raises. The transport is always closed.
    """
    transport = None
    try:
        transport = TSocket.TSocket(host, 10000)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()

        # Turn Windows line breaks into spaces. Deleting them outright, as
        # before, glued the last token of one line to the first of the next
        # ("select a\r\nfrom t" became "select afrom t").
        sqls = command.replace("\r\n", " ").split(";")
        result = []
        for sql in sqls:
            sql = sql.strip()
            if not sql:
                continue
            start = time.time()
            client.execute(sql)
            lines = client.fetchAll()
            end = time.time()
            result.extend(lines)
            result.append("----------Time: %.3fs----------" % (end-start))
        return result
    except Exception as e:
        # Deliberate catch-all: callers get the error text as the result.
        return [str(e)]
    finally:
        if transport is not None:
            try:
                transport.close()
            except Exception:
                # Best-effort close; keep the never-raises contract.
                pass
def __init__(self, server='localhost', port=10001, db='default'):
    """Initialize the Hive Client.

    :parameter server(string): server to connect to. Default- localhost
    :parameter port(int): port to connect to. Default- 10001
    :parameter db(string): databased name. Default- default
    :return: None
    :raises AssertionError: if ``get_database(db)`` returns a falsy value
    """
    transport = TSocket.TSocket(server, port)
    self.__transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(self.__transport)
    self.__client = ThriftHive.Client(protocol)
    self.__db = db

    # make sure this DB exists!
    with openclose(self.__transport):
        # Explicit check rather than `assert`, which python -O strips and
        # would skip the lookup altogether. The exception type is kept.
        if not self.__client.get_database(db):
            raise AssertionError('database %r not found' % (db,))
def fetch_db_info_from_hive(hive_server_addr, port=10000):
    """Collect the column layout of every table in every Hive database.

    :param hive_server_addr: Hive server host.
    :param port: Hive server port.
    :return: ``{database: {table: {column_name: column_type}}}``, or None
        (implicitly) when a Thrift error occurs; the message is printed.

    Rows of ``describe`` output with fewer than two whitespace-separated
    fields, or starting with ``#``, are skipped. Previously they raised
    IndexError or were stored as bogus columns. The transport is always
    closed.
    """
    transport = None
    try:
        transport = TSocket.TSocket(hive_server_addr, port)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()

        # Fetch databases
        client.execute("show databases")
        dbs = client.fetchAll()

        # Fetch tables
        db_tbl_map = {}
        for db in dbs:
            client.execute("use " + db)
            client.execute("show tables")
            tbls = client.fetchAll()

            tbl_col_map = {}
            for tbl in tbls:
                col_map = {}
                # Fetch table column name and type
                client.execute("describe " + tbl)
                for col in client.fetchAll():
                    words = col.split()
                    if len(words) < 2 or words[0].startswith('#'):
                        continue
                    col_map[words[0]] = words[1]
                tbl_col_map[tbl] = col_map
            db_tbl_map[db] = tbl_col_map

        return db_tbl_map
    except Thrift.TException as tx:
        print('%s' % (tx.message))
    finally:
        if transport is not None:
            transport.close()
def findHeavyHitters(table, today=None, verbose=False):
    """
    Find heavy hitters in the given traffic (table) and store the results in
    the 'suspiciousheavyhitters' Hive table.

    table   -- traffic table name; must start with "netflow" or "sflow"
               (after scrub()), otherwise the process exits with -1
    today   -- date to analyse; defaults to the current date at call time
               (the former default was evaluated once, at import time)
    verbose -- when true, progress messages are written to stdout

    For each endpoint type, endpoints whose packet or byte count today is
    above mean + 3 * std of today's data are compared with their own
    statistics over the previous 15 days. Those that still stand out are
    written to <outputDirectory>/suspiciousheavyhitters_<table>_<date>.txt,
    which is then loaded into Hive. Thrift errors during the load are
    written to stderr.
    """
    if today is None:
        today = datetime.date.today()

    histNbDay = 15
    date = "%d%02d%02d" % (today.year, today.month, today.day)
    dates = list(
        "%d%02d%02d" % (x.year, x.month, x.day)
        for x in pd.date_range(today - datetime.timedelta(histNbDay),
                               today - datetime.timedelta(1)))
    table = scrub(table)

    ## set some variables regarding the input data
    if table.startswith("netflow"):
        endpointTypes = [("dstip", "da"), ("srcip", "sa")]
        req0 = "select {endpoint}, sum(ipkt) nbpkt, sum(ibyt) nbbyte from {table} where dt=%s group by {endpoint}"
        req1 = "select {genericLabel}, avg(nbpkt) as avgpkt, stddev_samp(nbpkt) as stdpkt, avg(nbbyt) as avgbyt, stddev_samp(nbbyt) as stdbyt from(select {endpointType} as {genericLabel}, dt, sum(ipkt) as nbpkt, sum(ibyt) as nbbyt from {table} where {endpointType} IN ({suspiciousIP}) and dt IN ({dates}) group by {endpointType}, dt order by {endpointType}, dt) group by {genericLabel}"
    elif table.startswith("sflow"):
        endpointTypes = [("dstip", "dstip"), ("srcip", "srcip"),
                         ("dstip", "dstip6"), ("srcip", "srcip6")]
        req0 = "select {endpoint}, count(*) nbpkt, sum(ipsize) nbbyte from {table} where dt=%s and {endpoint}<>'' group by {endpoint}"
        req1 = "select {genericLabel}, avg(nbpkt) as avgpkt, stddev_samp(nbpkt) as stdpkt, avg(nbbyt) as avgbyt, stddev_samp(nbbyt) as stdbyt from(select {endpointType} as {genericLabel}, dt, count(*) as nbpkt, sum(ipsize) as nbbyt from {table} where {endpointType} IN ({suspiciousIP}) and dt IN ({dates}) group by {endpointType}, dt order by {endpointType}, dt) group by {genericLabel}"
    else:
        sys.stderr.write("Data type unknown!")
        sys.exit(-1)

    outputFile = open(
        "%s/suspiciousheavyhitters_%s_%s.txt" % (outputDirectory, table, date),
        "w")
    try:
        cursor = presto.connect('localhost').cursor()
        for genericLabel, endpointType in endpointTypes:
            if verbose:
                sys.stdout.write("Looking for %s heavy hitters... (%s,%s)\n" %
                                 (date, table, genericLabel))
            suspiciousIP = set()

            # get today's data
            formatedReq = req0.format(endpoint=endpointType, table=table)
            cursor.execute(formatedReq, [date])
            res = cursor.fetchall()
            if len(res) == 0:
                continue
            data = pd.DataFrame(res, columns=[genericLabel, "nbpkt", "nbbyt"])
            data.index = data.pop(genericLabel)

            # find today's heavy hitter
            for aggType in ["nbpkt", "nbbyt"]:
                suspiciousIP.update(
                    data.ix[data[aggType] > data[aggType].mean() +
                            3 * data[aggType].std()].index.tolist())

            # check in past data if they had similar behavior
            if verbose:
                sys.stdout.write("Retrieve past data...\n")
            suspiciousIP = list(suspiciousIP)
            # Query in chunks of 100 addresses.
            for i in range(0, len(suspiciousIP), 100):
                susIP = suspiciousIP[i:i + 100]
                # str(list) gives "[u'a', 'b']"; removing u, [ and ] leaves a
                # quoted, comma-separated list for the IN (...) clause.
                formatedReq1 = req1.format(
                    genericLabel=genericLabel,
                    endpointType=endpointType,
                    table=table,
                    suspiciousIP=str.translate(str(list(susIP)), None, "u[]"),
                    dates=str.translate(str(dates), None, "u[]"))
                cursor.execute(formatedReq1)
                res = cursor.fetchall()

                if verbose:
                    sys.stdout.write("Register suspicious IPs...\n")
                for ip, avgpkt, stdpkt, avgbyt, stdbyt in res:
                    currData = data.ix[ip]
                    if genericLabel == "dstip":
                        dstip = ip
                        srcip = ""
                    else:
                        dstip = ""
                        srcip = ip
                    try:
                        if (currData["nbpkt"] > avgpkt + 3 * stdpkt
                                or currData["nbbyt"] > avgbyt + 3 * stdbyt):
                            outputFile.write(
                                "%s\t%s\t%s\t%s\t%s\t\n" %
                                (srcip, dstip, currData["nbpkt"],
                                 currData["nbbyt"],
                                 confidence(currData["nbpkt"], avgpkt, stdpkt,
                                            currData["nbbyt"], avgbyt,
                                            stdbyt)))
                    except TypeError:
                        # The historical statistics could not be used in the
                        # comparison; report the endpoint with confidence MED.
                        if verbose:
                            sys.stdout.write(
                                "!!Warning!! no past data for %s (avgpkt=%s, stdpkt=%s, avgbyt=%s, stdbyt=%s)\n"
                                % (ip, avgpkt, stdpkt, avgbyt, stdbyt))
                        outputFile.write("%s\t%s\t%s\t%s\t%s\t\n" %
                                         (srcip, dstip, currData["nbpkt"],
                                          currData["nbbyt"], "MED"))
                        continue
    finally:
        # Close the file even when a query fails part-way through.
        outputFile.close()

    # Store results in Hive
    transport = None
    try:
        transport = TSocket.TSocket('localhost', 10000)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()

        client.execute(
            "create table if not exists suspiciousheavyhitters (srcip string, dstip string, pkt bigint, byte bigint, confidence string) partitioned by(dt string, dataSrc string) row format delimited fields terminated by '\t'"
        )
        client.execute(
            "load data local inpath '{dir}/suspiciousheavyhitters_{table}_{date}.txt' overwrite into table suspiciousheavyhitters partition (dt='{date}', dataSrc='{table}')"
            .format(table=table, date=date, dir=outputDirectory))
    except Thrift.TException as tx:
        sys.stderr.write('%s\n' % (tx.message))
    finally:
        # The transport used to stay open when a statement failed.
        if transport is not None:
            transport.close()
self.ip = iplist[num] while indexOfRetrytime + 1 < ipcounter and historyip.count( self.ip) > 0: num = random.randint(0, ipcounter - 1) self.ip = iplist[num] print "%d time retry execute connect to hive ip:%s" % ( indexOfRetrytime + 1, self.ip) self.WriteLog("%d time retry execute connect to hive ip:%s" % (indexOfRetrytime + 1, self.ip)) historyip.append(self.ip) self.transport = TSocket.TSocket(self.ip, self.port) #add by cherry end #self.transport = TSocket.TSocket(self.server, self.port) self.transport = TTransport.TBufferedTransport(self.transport) self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport) self.cli = ThriftHive.Client(self.protocol) self.transport.open() self.cli.audit(self.usrname, self.passwd, self.dbname) sname = self.cli.createSession("") self.session = sname[0] #print "create: %s" %(self.session) self.authid = sname[1] res = self.cli.execute("set plcretry=%d" % (indexOfRetrytime + 1)) self.WriteLog("plcretry: %d" % (indexOfRetrytime + 1)) self.WriteLog("new session: " + self.session) self.WriteLog("new session server: " + self.server) self.WriteLog("new session ip: " + self.ip) self.WriteLog( time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
return (outputFilename, dt) if __name__ == "__main__": ### ### Main function gets the current IP list and upload it to the Hive database ### tmpFilename = downloadIPList() (csvFilename, dt) = convert2csv(tmpFilename) # upload data to the Hive server try: transport = TSocket.TSocket('localhost', 10000) transport = TTransport.TBufferedTransport(transport) protocol = TBinaryProtocol.TBinaryProtocol(transport) client = ThriftHive.Client(protocol) transport.open() client.execute("create table if not exists isc_daily_sources (source_ip string, target_port int, protocol int, reports bigint, targets bigint, first_seen string, last_seen string, hostname string) partitioned by(dt string) row format delimited fields terminated by '\t'"); client.execute("load data local inpath '{csvFile}' overwrite into table isc_daily_sources partition (dt='{date}')".format(csvFile=csvFilename,date=dt)) transport.close() except Thrift.TException, tx: sys.stderr.write('%s\n' % (tx.message))