def QueryExe(hql, name, dates):
    """Execute one Hive query over Thrift and return the fetched rows.

    :param hql: HiveQL statement to execute.
    :param name: job name; together with *dates* it names the lock file.
    :param dates: date string; second component of the lock-file name.
    :return: list of rows from client.fetchAll().

    On a Thrift error the lock file is removed, the error is logged and the
    process exits with status 1.
    """
    lock_file = join(lpath, name + '_' + dates + '.lock')
    transport = TSocket.TSocket(ips, 10001)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = ThriftHive.Client(protocol)
    try:
        transport.open()
        logger.info('Query sql is:\n%s', hql)
        client.execute(hql)
        query = client.fetchAll()
        logger.info('Query sql result is:\n%s', query)
        return query
    except Thrift.TException as tx:
        logger.error(u'程序执行过程中发生异常, 错误信息如下\n%s', tx.message)
        # Guard the removal: a missing lock file must not mask the real error.
        if os.path.exists(lock_file):
            os.remove(lock_file)
        logger.error(u'程序正在退出. 删除锁文件 %s', lock_file)
        sys.exit(1)
    finally:
        # Always release the socket; the original leaked it on failure.
        transport.close()
def hiveExe(sql):
    """Run one HiveQL statement on the configured Hive server.

    :param sql: HiveQL statement to execute.
    :return: all rows from client.fetchAll(), or None when the Thrift call
        fails (the error message is printed).
    """
    transport = TSocket.TSocket(hive_server_ip, hive_server_port)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = ThriftHive.Client(protocol)
    try:
        transport.open()
        client.execute(sql)
        result = client.fetchAll()
        return result
    except Thrift.TException as tx:
        print('%s' % (tx.message))
    finally:
        # Close the connection on both success and failure (original leaked
        # the socket whenever execute/fetchAll raised).
        transport.close()
def HiveExe(hql, name, dates):
    """Execute a list of HiveQL statements on one Thrift connection.

    :param hql: iterable of HiveQL statements; results are discarded.
    :param name: job name; together with *dates* it names the lock file.
    :param dates: date string; second component of the lock-file name.

    On a Thrift error the lock file is removed, the error is logged and the
    process exits with status 1.
    """
    lock_file = join(lpath, name + '_' + dates + '.lock')
    transport = TSocket.TSocket(ips, 10001)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = ThriftHive.Client(protocol)
    try:
        transport.open()
        for sql in hql:
            logger.info('Executive sql is:\n%s', sql)
            client.execute(sql)
            # client.fetchAll()
            logger.info('Successful implementation of this Sql')
    except Thrift.TException as tx:
        logger.error(u'程序执行过程中发生异常, 错误信息如下\n%s', tx.message)
        # Guard the removal: a missing lock file must not mask the real error.
        if os.path.exists(lock_file):
            os.remove(lock_file)
        logger.error(u'程序正在退出. 删除锁文件 %s', lock_file)
        sys.exit(1)
    finally:
        # Always release the socket; the original leaked it on failure.
        transport.close()
def execsql(sql):
    """Execute one HiveQL statement and print the fetched rows.

    :param sql: HiveQL statement to execute.
    :return: True on success, False on a Thrift error (error printed).
    """
    transport = TSocket.TSocket(conf['hive']['host'], conf['hive']['port'])
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = ThriftHive.Client(protocol)
    try:
        transport.open()
        print("hive connect")
        client.execute(sql)
        print(client.fetchAll())
        print("close hive connect")
        return True
    except Thrift.TException as tx:
        print('%s' % (tx.message))
        return False
    finally:
        # Close the connection on both paths (original leaked it on failure).
        transport.close()
def query(self, hsql, callback):
    """Execute *hsql* on self.host:self.port and hand the rows to *callback*.

    :param hsql: HiveQL statement to execute.
    :param callback: called with client.fetchAll() rows on success, or with
        None when the Thrift call fails (the error message is printed).
    """
    transport = TSocket.TSocket(self.host, self.port)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = ThriftHive.Client(protocol)
    try:
        transport.open()
        # Fetch the records from the table.
        client.execute(hsql)
        callback(client.fetchAll())
    except Thrift.TException as tx:
        callback(None)
        print('%s' % (tx.message))
    finally:
        # Close the socket on both paths (original leaked it on failure).
        transport.close()

# Example usage:
# app = hiveDB('182.92.183.76', 9084)
# app.query(...)
def executeSql(host, command):
    """Run a semicolon-separated batch of HiveQL statements on *host*:10000.

    :param host: Hive Thrift server host.
    :param command: statements separated by ';' (CRLF sequences stripped).
    :return: concatenated fetchAll() output of every non-empty statement,
        each followed by a "----------Time: X.XXXs----------" marker; on any
        exception a one-element list containing the error text.
    """
    try:
        transport = TSocket.TSocket(host, 10000)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()
        try:
            sqls = command.replace("\r\n", "").split(";")
            result = []
            for sql in sqls:
                sql = sql.strip()
                if not sql:
                    continue
                start = time.time()
                client.execute(sql)
                lines = client.fetchAll()
                end = time.time()
                result = result + lines + ["----------Time: %.3fs----------" % (end - start)]
            return result
        finally:
            # Original only closed on success, leaking the socket whenever a
            # statement in the middle of the batch raised.
            transport.close()
    except Exception as e:
        return [str(e)]
def __init__(self, server='localhost', port=10001, db='default'):
    """Initialize the Hive Client.

    :parameter server(string): server to connect to. Default- localhost
    :parameter port(int): port to connect to. Default- 10001
    :parameter db(string): database name. Default- default
    :return: None
    """
    # NOTE: the docstring previously claimed the default port was 10000,
    # contradicting the actual default of 10001 in the signature.
    transport = TSocket.TSocket(server, port)
    self.__transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(self.__transport)
    self.__client = ThriftHive.Client(protocol)
    self.__db = db
    # make sure this DB exists!  The transport is only opened for this check.
    with openclose(self.__transport):
        assert self.__client.get_database(db)
def fetch_db_info_from_hive(hive_server_addr, port=10000):
    """Walk every database on a Hive server and build a nested schema map.

    :param hive_server_addr: Hive Thrift server host.
    :param port: Hive Thrift server port. Default 10000.
    :return: {database: {table: {column_name: column_type}}}, or None when
        the Thrift call fails (the error message is printed).
    """
    try:
        transport = TSocket.TSocket(hive_server_addr, port)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()
        try:
            # Fetch databases
            client.execute("show databases")
            dbs = client.fetchAll()
            # Fetch tables per database
            db_tbl_map = {}
            for db in dbs:
                client.execute("use " + db)
                client.execute("show tables")
                tbls = client.fetchAll()
                tbl_col_map = {}
                for tbl in tbls:
                    # "describe <tbl>" rows look like "<name> <type> ..."
                    client.execute("describe " + tbl)
                    cols = client.fetchAll()
                    col_map = {}
                    for col in cols:
                        words = col.split()
                        col_map[words[0]] = words[1]
                    tbl_col_map[tbl] = col_map
                db_tbl_map[db] = tbl_col_map
            return db_tbl_map
        finally:
            # Close the socket even when a query fails (original leaked it).
            transport.close()
    except Thrift.TException as tx:
        print('%s' % (tx.message))
def get_metastore_client(self):
    """
    Returns a Hive thrift client.

    Builds either a plain buffered transport or, when Airflow security is
    'kerberos' and the connection asks for GSSAPI, a SASL transport.
    """
    from thrift.transport import TSocket, TTransport
    from thrift.protocol import TBinaryProtocol
    from hive_service import ThriftHive

    conn = self.metastore_conn
    auth_mechanism = conn.extra_dejson.get('authMechanism', 'NOSASL')
    if configuration.get('core', 'security') == 'kerberos':
        # Under kerberos the default mechanism flips to GSSAPI.
        auth_mechanism = conn.extra_dejson.get('authMechanism', 'GSSAPI')
        kerberos_service_name = conn.extra_dejson.get(
            'kerberos_service_name', 'hive')

    socket = TSocket.TSocket(conn.host, conn.port)
    if configuration.get(
            'core', 'security') == 'kerberos' and auth_mechanism == 'GSSAPI':
        # Prefer saslwrapper, fall back to the plain sasl package.
        try:
            import saslwrapper as sasl
        except ImportError:
            import sasl

        def sasl_factory():
            client = sasl.Client()
            client.setAttr("host", conn.host)
            client.setAttr("service", kerberos_service_name)
            client.init()
            return client

        from thrift_sasl import TSaslClientTransport
        transport = TSaslClientTransport(sasl_factory, "GSSAPI", socket)
    else:
        transport = TTransport.TBufferedTransport(socket)

    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    return ThriftHive.Client(protocol)
return (outputFilename, dt) if __name__ == "__main__": ### ### Main function gets the current IP list and upload it to the Hive database ### tmpFilename = downloadIPList() (csvFilename, dt) = convert2csv(tmpFilename) # upload data to the Hive server try: transport = TSocket.TSocket('localhost', 10000) transport = TTransport.TBufferedTransport(transport) protocol = TBinaryProtocol.TBinaryProtocol(transport) client = ThriftHive.Client(protocol) transport.open() client.execute("create table if not exists isc_daily_sources (source_ip string, target_port int, protocol int, reports bigint, targets bigint, first_seen string, last_seen string, hostname string) partitioned by(dt string) row format delimited fields terminated by '\t'"); client.execute("load data local inpath '{csvFile}' overwrite into table isc_daily_sources partition (dt='{date}')".format(csvFile=csvFilename,date=dt)) transport.close() except Thrift.TException, tx: sys.stderr.write('%s\n' % (tx.message))
def findHeavyHitters(table, today=None, verbose=False):
    """ Find heavy hitters in the given traffic (table) and store the results
    in the 'suspiciousheavyhitters' Hive table.

    :param table: Hive table to analyse (name must start with 'netflow' or 'sflow').
    :param today: datetime.date to analyse; defaults to the current day.
        (The original default ``datetime.date.today()`` was evaluated once at
        import time and therefore went stale in long-running processes.)
    :param verbose: write progress information to stdout.
    """
    if today is None:
        today = datetime.date.today()
    histNbDay = 15
    date = "%d%02d%02d" % (today.year, today.month, today.day)
    # The histNbDay days preceding 'today', formatted like the dt partition.
    dates = list("%d%02d%02d" % (x.year, x.month, x.day)
                 for x in pd.date_range(today - datetime.timedelta(histNbDay),
                                        today - datetime.timedelta(1)))
    table = scrub(table)

    ## set some variables regarding the input data
    if table.startswith("netflow"):
        dataType = "netflow"
        endpointTypes = [("dstip", "da"), ("srcip", "sa")]
        req0 = "select {endpoint}, sum(ipkt) nbpkt, sum(ibyt) nbbyte from {table} where dt=%s group by {endpoint}"
        req1 = "select {genericLabel}, avg(nbpkt) as avgpkt, stddev_samp(nbpkt) as stdpkt, avg(nbbyt) as avgbyt, stddev_samp(nbbyt) as stdbyt from(select {endpointType} as {genericLabel}, dt, sum(ipkt) as nbpkt, sum(ibyt) as nbbyt from {table} where {endpointType} IN ({suspiciousIP}) and dt IN ({dates}) group by {endpointType}, dt order by {endpointType}, dt) group by {genericLabel}"
    elif table.startswith("sflow"):
        dataType = "sflow"
        endpointTypes = [("dstip", "dstip"), ("srcip", "srcip"),
                         ("dstip", "dstip6"), ("srcip", "srcip6")]
        req0 = "select {endpoint}, count(*) nbpkt, sum(ipsize) nbbyte from {table} where dt=%s and {endpoint}<>'' group by {endpoint}"
        req1 = "select {genericLabel}, avg(nbpkt) as avgpkt, stddev_samp(nbpkt) as stdpkt, avg(nbbyt) as avgbyt, stddev_samp(nbbyt) as stdbyt from(select {endpointType} as {genericLabel}, dt, count(*) as nbpkt, sum(ipsize) as nbbyt from {table} where {endpointType} IN ({suspiciousIP}) and dt IN ({dates}) group by {endpointType}, dt order by {endpointType}, dt) group by {genericLabel}"
    else:
        sys.stderr.write("Data type unknown!")
        sys.exit(-1)

    outputFile = open("%s/suspiciousheavyhitters_%s_%s.txt" % (outputDirectory, table, date), "w")
    cursor = presto.connect('localhost').cursor()
    for genericLabel, endpointType in endpointTypes:
        if verbose:
            sys.stdout.write("Looking for %s heavy hitters... (%s,%s)\n" % (date, table, genericLabel))
        suspiciousIP = set()
        # get today's data
        formatedReq = req0.format(endpoint=endpointType, table=table)
        cursor.execute(formatedReq, [date])
        res = cursor.fetchall()
        if len(res) == 0:
            continue
        data = pd.DataFrame(res, columns=[genericLabel, "nbpkt", "nbbyt"])
        data.index = data.pop(genericLabel)

        # find today's heavy hitter: anything above mean + 3 * stddev
        for aggType in ["nbpkt", "nbbyt"]:
            suspiciousIP.update(
                data.ix[data[aggType] > data[aggType].mean() + 3 * data[aggType].std()].index.tolist())

        # check in past data if they had similar behavior
        if verbose:
            sys.stdout.write("Retrieve past data...\n")
        suspiciousIP = list(suspiciousIP)
        # Query the history in batches of 100 IPs to bound the IN(...) clause.
        for i in range(0, len(suspiciousIP), 100):
            susIP = suspiciousIP[i:i + 100]
            formatedReq1 = req1.format(
                genericLabel=genericLabel, endpointType=endpointType, table=table,
                suspiciousIP=str.translate(str(list(susIP)), None, "u[]"),
                dates=str.translate(str(dates), None, "u[]"))
            cursor.execute(formatedReq1)
            res = cursor.fetchall()
            if verbose:
                sys.stdout.write("Register suspicious IPs...\n")
            for ip, avgpkt, stdpkt, avgbyt, stdbyt in res:
                currData = data.ix[ip]
                if genericLabel == "dstip":
                    dstip = ip
                    srcip = ""
                else:
                    dstip = ""
                    srcip = ip
                try:
                    if currData["nbpkt"] > avgpkt + 3 * stdpkt or currData["nbbyt"] > avgbyt + 3 * stdbyt:
                        outputFile.write("%s\t%s\t%s\t%s\t%s\t\n" % (
                            srcip, dstip, currData["nbpkt"], currData["nbbyt"],
                            confidence(currData["nbpkt"], avgpkt, stdpkt,
                                       currData["nbbyt"], avgbyt, stdbyt)))
                except TypeError:
                    # No (or NULL) history for this IP: the comparison against
                    # None raises TypeError; record it with MED confidence.
                    if verbose:
                        sys.stdout.write("!!Warning!! no past data for %s (avgpkt=%s, stdpkt=%s, avgbyt=%s, stdbyt=%s)\n" % (ip, avgpkt, stdpkt, avgbyt, stdbyt))
                    outputFile.write("%s\t%s\t%s\t%s\t%s\t\n" % (srcip, dstip, currData["nbpkt"], currData["nbbyt"], "MED"))
                    continue
    outputFile.close()

    # Store results in Hive
    try:
        transport = TSocket.TSocket('localhost', 10000)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()
        try:
            client.execute("create table if not exists suspiciousheavyhitters (srcip string, dstip string, pkt bigint, byte bigint, confidence string) partitioned by(dt string, dataSrc string) row format delimited fields terminated by '\t'")
            client.execute("load data local inpath '{dir}/suspiciousheavyhitters_{table}_{date}.txt' overwrite into table suspiciousheavyhitters partition (dt='{date}', dataSrc='{table}')".format(table=table, date=date, dir=outputDirectory))
        finally:
            # Close the socket even if a statement fails (original leaked it).
            transport.close()
    except Thrift.TException as tx:
        sys.stderr.write('%s\n' % (tx.message))
self.ip = iplist[num]
# NOTE(review): this is a fragment of a larger method (its `def` is outside
# this view); the indentation below is reconstructed -- confirm against the
# full file.  It re-picks a random Hive server IP that has not been tried
# yet, reconnects, and opens a fresh session.
while indexOfRetrytime + 1 < ipcounter and historyip.count(self.ip) > 0:
    num = random.randint(0, ipcounter - 1)
    self.ip = iplist[num]
print "%d time retry execute connect to hive ip:%s" % (indexOfRetrytime + 1, self.ip)
self.WriteLog("%d time retry execute connect to hive ip:%s" % (indexOfRetrytime + 1, self.ip))
# Remember this IP so the next retry avoids it.
historyip.append(self.ip)
self.transport = TSocket.TSocket(self.ip, self.port)
#add by cherry end
#self.transport = TSocket.TSocket(self.server, self.port)
# Rebuild the Thrift client stack on the newly chosen server.
self.transport = TTransport.TBufferedTransport(self.transport)
self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
self.cli = ThriftHive.Client(self.protocol)
self.transport.open()
# Authenticate and open a new session; createSession("") returns
# (session name, auth id) -- presumably project-specific API, verify.
self.cli.audit(self.usrname, self.passwd, self.dbname)
sname = self.cli.createSession("")
self.session = sname[0]
#print "create: %s" %(self.session)
self.authid = sname[1]
# Record the retry count server-side, then log the new session details.
res = self.cli.execute("set plcretry=%d" % (indexOfRetrytime + 1))
self.WriteLog("plcretry: %d" % (indexOfRetrytime + 1))
self.WriteLog("new session: " + self.session)
self.WriteLog("new session server: " + self.server)
self.WriteLog("new session ip: " + self.ip)
self.WriteLog(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
def findNtpAmplifiers(table, today=None, verbose=False):
    """ Find NTP amplifiers in the given traffic (table) and store the results
    in the 'ntpamplifiers' Hive table.

    :param table: Hive table to analyse (name must start with 'netflow' or 'sflow').
    :param today: datetime.date to analyse; defaults to the current day.
        (The original default ``datetime.date.today()`` was evaluated once at
        import time and therefore went stale in long-running processes.)
    :param verbose: write progress information to stdout.
    """
    if today is None:
        today = datetime.date.today()
    date = "%d%02d%02d" % (today.year, today.month, today.day)
    table = scrub(table)

    ## set some variables regarding the input data
    # The filter targets 468-byte UDP packets from source port 123.
    if table.startswith("netflow"):
        dataType = "netflow"
        req0 = "select sa, sum(ibyt), sum(ipkt) from %s where sp=123 and dt='%s' and pr='UDP' and ibyt/ipkt=468 group by sa" % (table, date)
    elif table.startswith("sflow"):
        dataType = "sflow"
        req0 = "select srcip, sum(ipsize), count(*) from %s where udpsrcport=123 and ipprotocol=17 and ipsize=468 and dt='%s' group by srcip" % (table, date)
    else:
        sys.stderr.write("Data type unknown!")
        sys.exit(-1)

    cursor = presto.connect('localhost').cursor()
    if verbose:
        sys.stdout.write("Looking for %s NTP amplifiers... (%s)\n" % (date, table))
    # get today's data
    cursor.execute(req0)
    res = cursor.fetchall()
    if len(res) == 0:
        return
    data = pd.DataFrame(res, columns=["srcip", "nbbyt", "nbpkt"])
    # add the confidence score: higher packet counts -> higher confidence
    data["confidence"] = "LOW"
    data.loc[data.nbpkt >= 100, "confidence"] = "MED"
    data.loc[data.nbpkt >= 1000, "confidence"] = "HIGH"
    outputFile = open("%s/ntpamplifiers_%s_%s.txt" % (outputDirectory, table, date), "w")
    data.to_csv(outputFile, sep="\t", header=False,
                cols=["srcip", "nbbyt", "nbpkt", "confidence"], index=False)
    outputFile.close()

    # Store results in Hive
    try:
        transport = TSocket.TSocket('localhost', 10000)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        transport.open()
        try:
            client.execute("create table if not exists ntpamplifiers (srcip string, byte bigint, pkt bigint, confidence string) partitioned by(dt string, dataSrc string) row format delimited fields terminated by '\t'")
            client.execute("load data local inpath '{dir}/ntpamplifiers_{table}_{date}.txt' overwrite into table ntpamplifiers partition (dt='{date}', dataSrc='{table}')".format(table=table, date=date, dir=outputDirectory))
        finally:
            # Close the socket even if a statement fails (original leaked it).
            transport.close()
    except Thrift.TException as tx:
        sys.stderr.write('%s\n' % (tx.message))