def suspicious_queries(date, ip=None, query=None, limit=250):
    """Return DNS score rows for *date* not yet reviewed by an analyst.

    Optional *ip* / *query* narrow the result; rows are ordered by ml_score
    and capped at *limit*.
    """
    db = Configuration.db()
    base_sql = ("""
        SELECT STRAIGHT_JOIN ds.unix_tstamp,frame_len,ds.ip_dst,ds.dns_qry_name,
            dns_qry_class,dns_qry_type,dns_qry_rcode,ml_score,tld,
            query_rep,hh,dns_qry_class_name,dns_qry_type_name,
            dns_qry_rcode_name,network_context
        FROM {0}.dns_scores ds
        LEFT JOIN {0}.dns_threat_investigation dt
            ON (ds.dns_qry_name = dt.dns_qry_name)
        WHERE ds.y={1} AND ds.m={2} AND ds.d={3}
            AND (dt.dns_qry_name is NULL)
        """).format(db, date.year, date.month, date.day)

    # NOTE(review): filter values are interpolated straight into the SQL
    # text; confirm callers sanitize ip/query (injection risk).
    clauses = []
    if ip:
        clauses.append(" AND ds.ip_dst = '{0}'".format(ip))
    if query:
        clauses.append(" AND ds.dns_qry_name LIKE '%{0}%'".format(query))
    clauses.append(" ORDER BY ds.ml_score limit {0}".format(limit))
    return ImpalaEngine.execute_query_as_list(base_sql + "".join(clauses))
def suspicious_requests(date, uri=None, ip=None, limit=250):
    """Return proxy score rows for *date* that have no analyst verdict yet."""
    db = Configuration.db()
    base_sql = ("""
        SELECT STRAIGHT_JOIN ps.tdate,ps.time,ps.clientip,ps.host,ps.reqmethod,ps.useragent,
            ps.resconttype,ps.duration,ps.username,ps.webcat,ps.referer,
            ps.respcode,ps.uriport,ps.uripath,ps.uriquery,ps.serverip,ps.scbytes,
            ps.csbytes,ps.fulluri,ps.ml_score,ps.uri_rep,ps.respcode_name,
            ps.network_context
        FROM {0}.proxy_scores ps
        LEFT JOIN {0}.proxy_threat_investigation pt ON (ps.fulluri = pt.fulluri)
        WHERE ps.y={1} AND ps.m={2} AND ps.d={3} AND (pt.fulluri is NULL)
        """).format(db, date.year, date.month, date.day)

    # Optional narrowing filters, then the mandatory ordering/limit tail.
    extra = []
    if uri:
        extra.append(" AND ps.fulluri LIKE '%{0}%'".format(uri))
    if ip:
        extra.append(" AND ps.clientip = '{0}'".format(ip))
    extra.append(" ORDER BY ps.ml_score limit {0}".format(limit))
    return ImpalaEngine.execute_query_as_list(base_sql + "".join(extra))
def create_time_line(anchor, inbound, outbound, twoway, date):
    """Materialize the flow timeline for *anchor* into flow_timeline.

    The related-IP set is the union of the keys of the three traffic maps;
    returns a human-readable status string.
    """
    related = []
    # Collect every peer IP seen in any direction.
    for bucket in (twoway, outbound, inbound):
        if len(bucket) > 0:
            related.extend(bucket.keys())

    db = Configuration.db()
    ip_list = "'" + "','".join(related) + "'"
    timeline_sql = ("""
        INSERT INTO TABLE {0}.flow_timeline PARTITION (y={4}, m={5},d={6})
        SELECT '{7}' ,min(treceived) as tstart, max(treceived) as tend,
            sip as srcIP,dip as dstip, proto as proto, sport as sport,
            dport AS dport, ipkt as ipkt, ibyt as ibyt
        FROM {0}.flow
        WHERE y={4} AND m={5} AND d={6}
            AND ((dip IN({1}) AND sip ='{2}') OR (sip IN({1}) AND dip ='{2}'))
        GROUP BY sip, dip, proto, sport, dport, ipkt, ibyt
        ORDER BY tstart
        LIMIT {3}
        """).format(db, ip_list, anchor, 1000, date.year, date.month,
                    date.day, anchor)

    if ImpalaEngine.execute_query(timeline_sql):
        return "Timeline successfully created \n"
    return "Timeline couldn't be created \n"
def expanded_search(date, query=None, ip=None, limit=20):
    """Aggregate raw DNS traffic around either an *ip* or a *query* string.

    Returns False when neither filter is supplied.
    """
    if not ip and not query:
        return False
    db = Configuration.db()
    # Pivot: filtering by ip counts query names, and vice versa.
    if ip:
        agg_col, where_col, where_val = "dns_qry_name", "ip_dst", ip
    else:
        agg_col, where_col, where_val = "ip_dst", "dns_qry_name", query

    sql = ("""
        SELECT COUNT({0}) as total,dns_qry_name,ip_dst
        FROM {1}.dns
        WHERE y={2} AND m={3} AND d={4}
            AND {5} = '{6}'
        GROUP BY {0},{5}
        ORDER BY total DESC
        LIMIT {7}
        """).format(agg_col, db, date.year, date.month, date.day,
                    where_col, where_val, limit if limit else 20)
    return ImpalaEngine.execute_query_as_list(sql)
def expanded_search(date, query=None, ip=None, limit=20):
    """Group raw DNS records by the axis opposite the supplied filter."""
    if not ip and not query:
        return False
    db = Configuration.db()
    by_ip = bool(ip)
    count_col = "dns_qry_name" if by_ip else "ip_dst"
    filter_col = "ip_dst" if by_ip else "dns_qry_name"
    filter_val = ip if by_ip else query

    expanded_query = ("""
        SELECT COUNT({0}) as total,dns_qry_name,ip_dst
        FROM {1}.dns
        WHERE y={2} AND m={3} AND d={4}
            AND {5} = '{6}'
        GROUP BY {0},{5}
        ORDER BY total DESC
        LIMIT {7}
        """).format(count_col, db, date.year, date.month, date.day,
                    filter_col, filter_val, limit if limit else 20)
    return ImpalaEngine.execute_query_as_list(expanded_query)
def suspicious_requests(date, uri=None, ip=None, limit=250):
    """List unreviewed proxy score rows, optionally filtered by uri / client ip."""
    db = Configuration.db()
    sql = ("""
        SELECT STRAIGHT_JOIN ps.tdate,ps.time,ps.clientip,ps.host,ps.reqmethod,ps.useragent,
            ps.resconttype,ps.duration,ps.username,ps.webcat,ps.referer,
            ps.respcode,ps.uriport,ps.uripath,ps.uriquery,ps.serverip,ps.scbytes,
            ps.csbytes,ps.fulluri,ps.ml_score,ps.uri_rep,ps.respcode_name,
            ps.network_context
        FROM {0}.proxy_scores ps
        LEFT JOIN {0}.proxy_threat_investigation pt ON (ps.fulluri = pt.fulluri)
        WHERE ps.y={1} AND ps.m={2} AND ps.d={3} AND (pt.fulluri is NULL)
        """).format(db, date.year, date.month, date.day)

    if uri:
        sql += " AND ps.fulluri LIKE '%{0}%'".format(uri)
    if ip:
        sql += " AND ps.clientip = '{0}'".format(ip)
    sql += " ORDER BY ps.ml_score limit {0}".format(limit)
    return ImpalaEngine.execute_query_as_list(sql)
def chord_details(ip, date):
    """Return chord-diagram rows (src/dst/bytes/packets) for a threat *ip*."""
    db = Configuration.db()
    sql = ("""
        SELECT srcip,dstip,ibyt,ipkt
        FROM {0}.flow_chords
        WHERE y={1} AND m={2} AND d={3} AND ip_threat='{4}'
        """).format(db, date.year, date.month, date.day, ip)
    return ImpalaEngine.execute_query_as_list(sql)
def create_dendro(expanded_search, date, anchor):
    """Persist one dendrogram row per expanded-search result for *anchor*."""
    db = Configuration.db()
    template = ("""
        INSERT INTO {0}.dns_threat_dendro PARTITION (y={1}, m={2},d={3})
        VALUES ( '{4}',{5},'{6}','{7}')
        """)
    for entry in expanded_search:
        ImpalaEngine.execute_query(
            template.format(db, date.year, date.month, date.day, anchor,
                            entry["total"], entry["dnsQuery"],
                            entry["clientIp"]))
def create_dendro(expanded_search, date, anchor):
    """Insert each expanded-search hit into dns_threat_dendro for *anchor*."""
    db = Configuration.db()
    for hit in expanded_search:
        insert_sql = ("""
            INSERT INTO {0}.dns_threat_dendro PARTITION (y={1}, m={2},d={3})
            VALUES ( '{4}',{5},'{6}','{7}')
            """).format(db, date.year, date.month, date.day, anchor,
                        hit["total"], hit["dnsQuery"], hit["clientIp"])
        ImpalaEngine.execute_query(insert_sql)
def get_scored_requests(date):
    """Return already-scored proxy URIs (tdate, fulluri, uri_sev) for *date*."""
    db = Configuration.db()
    sql = ("""
        SELECT tdate,fulluri,uri_sev
        FROM {0}.proxy_threat_investigation
        WHERE y={1} AND m={2} AND d={3}
        """).format(db, date.year, date.month, date.day)
    return ImpalaEngine.execute_query_as_list(sql)
def time_line(date, uri):
    """Return saved proxy timeline rows for threat *uri* on *date*."""
    db = Configuration.db()
    sql = ("""
        SELECT p_threat,tstart,tend,duration,clientip,respcode,respcodename
        FROM {0}.proxy_timeline
        WHERE y={1} AND m={2} AND d={3} AND p_threat = '{4}'
        """).format(db, date.year, date.month, date.day, uri)
    return ImpalaEngine.execute_query_as_list(sql)
def time_line(date, uri):
    """Fetch the proxy_timeline entries recorded for a given threat uri."""
    db = Configuration.db()
    timeline_sql = ("""
        SELECT p_threat,tstart,tend,duration,clientip,respcode,respcodename
        FROM {0}.proxy_timeline
        WHERE y={1} AND m={2} AND d={3} AND p_threat = '{4}'
        """)
    timeline_sql = timeline_sql.format(db, date.year, date.month,
                                       date.day, uri)
    return ImpalaEngine.execute_query_as_list(timeline_sql)
def get_scored_requests(date):
    """List the proxy URIs an analyst has already scored on *date*."""
    db = Configuration.db()
    scored_sql = ("""
        SELECT tdate,fulluri,uri_sev
        FROM {0}.proxy_threat_investigation
        WHERE y={1} AND m={2} AND d={3}
        """)
    return ImpalaEngine.execute_query_as_list(
        scored_sql.format(db, date.year, date.month, date.day))
def get_scored_connections(date):
    """Return DNS threat-investigation verdicts recorded on *date*."""
    db = Configuration.db()
    sql = ("""
        SELECT unix_tstamp,ip_dst,dns_qry_name,ip_sev,dns_sev
        FROM {0}.dns_threat_investigation
        WHERE y={1} AND m={2} AND d={3}
        """).format(db, date.year, date.month, date.day)
    return ImpalaEngine.execute_query_as_list(sql)
def get_scored_connections(date):
    """Fetch all analyst-scored DNS connections for the given day."""
    db = Configuration.db()
    scored_sql = ("""
        SELECT unix_tstamp,ip_dst,dns_qry_name,ip_sev,dns_sev
        FROM {0}.dns_threat_investigation
        WHERE y={1} AND m={2} AND d={3}
        """)
    scored_sql = scored_sql.format(db, date.year, date.month, date.day)
    return ImpalaEngine.execute_query_as_list(scored_sql)
def client_details(date, ip):
    """Return dns_dendro rows describing activity of client *ip* on *date*."""
    db = Configuration.db()
    sql = ("""
        SELECT ip_dst,dns_a,dns_qry_name,ip_dst
        FROM {0}.dns_dendro
        WHERE y={1} AND m={2} AND d={3} AND ip_dst='{4}'
        """).format(db, date.year, date.month, date.day, ip)
    return ImpalaEngine.execute_query_as_list(sql)
def expanded_search(date, uri):
    """Return raw proxy records whose fulluri or referer equals *uri*.

    Month/day partition values are zero-padded strings in the proxy table.
    """
    db = Configuration.db()
    month = str(date.month).zfill(2)
    day = str(date.day).zfill(2)
    sql = ("""
        SELECT p_date, p_time, clientip, username, duration, fulluri,
            webcat, respcode, reqmethod,useragent, resconttype,
            referer, uriport, serverip, scbytes, csbytes
        FROM {0}.proxy
        WHERE y='{1}' AND m='{2}' AND d='{3}'
            AND (fulluri='{4}' OR referer ='{4}')
        ORDER BY p_time
        """).format(db, date.year, month, day, uri)
    return ImpalaEngine.execute_query_as_list(sql)
def time_line(ip, date):
    """Return the stored flow timeline rows for threat *ip* on *date*."""
    db = Configuration.db()
    sql = ("""
        SELECT ip_threat,tstart,tend,srcip,dstip,proto,
            sport,dport,ipkt,ibyt
        FROM {0}.flow_timeline
        WHERE y={1} AND m={2} AND d={3} AND ip_threat = '{4}'
        """).format(db, date.year, date.month, date.day, ip)
    return ImpalaEngine.execute_query_as_list(sql)
def get_scored_connections(date):
    """List analyst-scored flow connections for the given day."""
    db = Configuration.db()
    sql = ("""
        SELECT tstart,srcip,dstip,srcport,dstport,score
        FROM {0}.flow_threat_investigation
        WHERE y={1} AND m={2} AND d={3}
        """).format(db, date.year, date.month, date.day)
    return ImpalaEngine.execute_query_as_list(sql)
def expanded_search(date, uri):
    """Pull raw proxy traffic matching *uri* as either fulluri or referer."""
    db = Configuration.db()
    sql_template = ("""
        SELECT p_date, p_time, clientip, username, duration, fulluri,
            webcat, respcode, reqmethod,useragent, resconttype,
            referer, uriport, serverip, scbytes, csbytes
        FROM {0}.proxy
        WHERE y='{1}' AND m='{2}' AND d='{3}'
            AND (fulluri='{4}' OR referer ='{4}')
        ORDER BY p_time
        """)
    # proxy partitions store month/day as zero-padded strings
    expanded_query = sql_template.format(db, date.year,
                                         str(date.month).zfill(2),
                                         str(date.day).zfill(2), uri)
    return ImpalaEngine.execute_query_as_list(expanded_query)
def client_details(date, ip):
    """Fetch dendrogram detail rows for a single destination ip."""
    db = Configuration.db()
    details_sql = ("""
        SELECT ip_dst,dns_a,dns_qry_name,ip_dst
        FROM {0}.dns_dendro
        WHERE y={1} AND m={2} AND d={3} AND ip_dst='{4}'
        """)
    details_sql = details_sql.format(db, date.year, date.month, date.day, ip)
    return ImpalaEngine.execute_query_as_list(details_sql)
def ingest_summary(start_date, end_date):
    """Return proxy ingest totals between two dates (inclusive).

    NOTE(review): the y/m/d ranges are compared independently, so e.g.
    Jan 20 - Feb 10 excludes days 11-19 of February — confirm intended.
    """
    db = Configuration.db()
    sql = ("""
        SELECT tdate,total
        FROM {0}.proxy_ingest_summary
        WHERE ( y >= {1} and y <= {2}) AND ( m >= {3} and m <= {4})
            AND ( d >= {5} and d <= {6})
        """).format(db, start_date.year, end_date.year,
                    start_date.month, end_date.month,
                    start_date.day, end_date.day)
    return ImpalaEngine.execute_query_as_list(sql)
def ingest_summary(start_date, end_date):
    """Proxy ingest totals with independent y/m/d bound comparisons."""
    db = Configuration.db()
    bounds = (db, start_date.year, end_date.year, start_date.month,
              end_date.month, start_date.day, end_date.day)
    is_query = ("""
        SELECT tdate,total
        FROM {0}.proxy_ingest_summary
        WHERE ( y >= {1} and y <= {2}) AND ( m >= {3} and m <= {4})
            AND ( d >= {5} and d <= {6})
        """).format(*bounds)
    return ImpalaEngine.execute_query_as_list(is_query)
def details(frame_time, query):
    """Return dns_edge rows for *query* in the hour around *frame_time*."""
    db = Configuration.db()
    sql = ("""
        SELECT unix_tstamp,frame_len,ip_dst,ip_src,dns_qry_name,dns_qry_class,
            dns_qry_type,dns_qry_rcode,dns_a,dns_qry_type_name,
            dns_qry_rcode_name,dns_qry_class_name
        FROM {0}.dns_edge
        WHERE y={1} AND m={2} AND d={3} AND hh={4} AND dns_qry_name = '{5}'
        """).format(db, frame_time.year, frame_time.month, frame_time.day,
                    frame_time.hour, query)
    return ImpalaEngine.execute_query_as_list(sql)
def story_board(date):
    """Return proxy storyboard entries, newline-escaped for transport."""
    db = Configuration.db()
    sql = ("""
        SELECT p_threat,title,text
        FROM {0}.proxy_storyboard
        WHERE y={1} AND m={2} AND d={3}
        """).format(db, date.year, date.month, date.day)
    rows = ImpalaEngine.execute_query_as_list(sql)
    # escape raw newlines so the text survives JSON serialization
    for entry in rows:
        entry["text"] = entry["text"].replace("\n", "\\n")
    return rows
def story_board(date):
    """Return flow storyboard entries with embedded newlines escaped."""
    db = Configuration.db()
    sb_sql = ("""
        SELECT ip_threat,title,text
        FROM {0}.flow_storyboard
        WHERE y={1} AND m={2} AND d={3}
        """)
    rows = ImpalaEngine.execute_query_as_list(
        sb_sql.format(db, date.year, date.month, date.day))
    # literal "\n" keeps multi-line comments intact through JSON
    for record in rows:
        record["text"] = record["text"].replace("\n", "\\n")
    return rows
def ingest_summary(start_date, end_date):
    """Return flow ingest totals, constrained by the daterange helper clause."""
    db = Configuration.db()
    where_clause = daterange_query(start_date, end_date)
    sql = ("""
        SELECT tdate,total
        FROM {0}.flow_ingest_summary
        WHERE {1}
        ORDER BY tdate
        """).format(db, where_clause)
    return ImpalaEngine.execute_query_as_list(sql)
def save_comments(anchor, ip, query, title, text, date):
    """Create or update a DNS storyboard entry for *anchor*, then rewrite
    the whole day's partition.

    The day's rows are read, the matching entry (by ip_threat or dns_threat)
    is edited in memory — or a new row appended — the partition's HDFS folder
    is deleted, and every row is re-inserted. Returns True.
    NOTE(review): title/text go into the SQL unescaped — injection risk.
    """
    db = Configuration.db()
    sb_query = ("""
        SELECT ip_threat,dns_threat,title,text
        FROM {0}.dns_storyboard
        WHERE y = {1} AND m= {2} AND d={3}
        """).format(db, date.year, date.month, date.day)
    sb_data = ImpalaEngine.execute_query_as_list(sb_query)

    # find value if already exists.
    saved = False
    for item in sb_data:
        if item["ip_threat"] == anchor or item["dns_threat"] == anchor:
            item["title"] = title
            item["text"] = text
            saved = True

    if not saved:
        sb_data.append({'text': text, 'ip_threat': str(ip), 'title': title,
                        'dns_threat': query})

    # remove old file.
    # Impala INSERT can't update rows, so the partition's files are deleted
    # and everything is re-inserted below.
    app_path = Configuration.spot()
    old_file = "{0}/dns/hive/oa/storyboard/y={1}/m={2}/d={3}/"\
        .format(app_path, date.year, date.month, date.day)
    HDFSClient.delete_folder(old_file, "impala")
    ImpalaEngine.execute_query("invalidate metadata")

    # Re-insert every (possibly edited) row into the cleared partition.
    for item in sb_data:
        insert_query = ("""
            INSERT INTO {0}.dns_storyboard PARTITION(y={1} , m={2} ,d={3})
            VALUES ( '{4}', '{5}', '{6}','{7}')
            """)\
            .format(db, date.year, date.month, date.day,
                    item["ip_threat"], item["dns_threat"], item["title"],
                    item["text"])
        ImpalaEngine.execute_query(insert_query)
    return True
def details(src_ip, dst_ip, date):
    """Return flow_edge rows between two IPs in the minute around *date*,
    matching either direction."""
    db = Configuration.db()
    sql = ("""
        SELECT tstart,srcip,dstip,sport,dport,proto,flags,
            tos,ibyt,ipkt,input,output,rip,obyt,opkt
        FROM {0}.flow_edge
        WHERE y={1} AND m={2} AND d={3} AND hh={4} AND mn={5}
            AND ((srcip='{6}' AND dstip='{7}') OR (srcip='{7}' AND dstip='{6}'))
        ORDER BY tstart
        """).format(db, date.year, date.month, date.day, date.hour,
                    date.minute, src_ip, dst_ip)
    return ImpalaEngine.execute_query_as_list(sql)
def ingest_summary(start_date, end_date):
    """Flow ingest totals between two dates, y/m/d compared independently.

    NOTE(review): independent bounds skip days when the range crosses a
    month boundary — confirm intended.
    """
    db = Configuration.db()
    sql = ("""
        SELECT tdate,total
        FROM {0}.flow_ingest_summary
        WHERE ( y >= {1} AND y <= {2}) AND ( m >= {3} AND m <= {4})
            AND ( d >= {5} AND d <= {6})
        ORDER BY tdate
        """).format(db, start_date.year, end_date.year,
                    start_date.month, end_date.month,
                    start_date.day, end_date.day)
    return ImpalaEngine.execute_query_as_list(sql)
def save_comment(ip, title, text, date):
    """Create or update a flow storyboard comment for threat *ip*, then
    rewrite the whole day's partition. Returns True.
    """
    # Get current table info.
    db = Configuration.db()
    sb_query = ("""
        SELECT ip_threat,title,text
        FROM {0}.flow_storyboard
        WHERE y = {1} AND m= {2} AND d={3}
        """).format(db, date.year, date.month, date.day)
    sb_data = ImpalaEngine.execute_query_as_list(sb_query)

    # find value if already exists.
    saved = False
    for item in sb_data:
        if item["ip_threat"] == ip:
            item["title"] = title
            item["text"] = text
            saved = True

    if not saved:
        sb_data.append({'text': text, 'ip_threat': str(ip), 'title': title})

    # remove old file.
    app_path = Configuration.spot()
    old_file = "{0}/flow/hive/oa/storyboard/y={1}/m={2}/d={3}/" \
        .format(app_path, date.year, date.month, date.day)
    # remove file manually to allow the comments update.
    HDFSClient.delete_folder(old_file, "impala")
    ImpalaEngine.execute_query("invalidate metadata")

    # Re-insert every (possibly edited) row into the cleared partition.
    for item in sb_data:
        insert_query = ("""
            INSERT INTO {0}.flow_storyboard PARTITION(y={1} , m={2} ,d={3})
            VALUES ( '{4}', '{5}','{6}')
            """) \
            .format(db, date.year, date.month, date.day,
                    item["ip_threat"], item["title"], item["text"])
        ImpalaEngine.execute_query(insert_query)
    return True
def details(date, uri, ip):
    """Return proxy_edge rows for a (fulluri, clientIp) pair on *date*.

    Returns None when both filters are missing.
    """
    if not uri and not ip:
        return None
    db = Configuration.db()
    # Escape embedded single quotes with a backslash so the SQL literal
    # stays valid. The previous replacement ("'" -> "//'") left the quote
    # unescaped and broke the query / allowed injection.
    safe_uri = uri.replace("'", "\\'")
    p_details = ("""
        SELECT tdate,time,clientIp,host,webcat,respcode,respcode_name
            ,reqmethod,useragent,resconttype,referer,uriport,serverip
            ,scbytes,csbytes,fulluri,hh
        FROM {0}.proxy_edge
        WHERE y={1} AND m={2} AND d={3} AND (fulluri='{4}' AND clientIp='{5}')
        """).format(db, date.year, date.month, date.day, safe_uri, ip)
    return ImpalaEngine.execute_query_as_list(p_details)
def details(date, uri, ip):
    """Fetch proxy edge detail rows matching both *uri* and *ip*.

    Returns None when neither filter is supplied.
    """
    if not uri and not ip:
        return None
    db = Configuration.db()
    # Backslash-escape quotes inside the uri: the old "//'" substitution
    # did not escape the quote, producing malformed SQL.
    escaped_uri = uri.replace("'", "\\'")
    p_details = ("""
        SELECT tdate,time,clientIp,host,webcat,respcode,respcode_name
            ,reqmethod,useragent,resconttype,referer,uriport,serverip
            ,scbytes,csbytes,fulluri,hh
        FROM {0}.proxy_edge
        WHERE y={1} AND m={2} AND d={3} AND (fulluri='{4}' AND clientIp='{5}')
        """).format(db, date.year, date.month, date.day, escaped_uri, ip)
    return ImpalaEngine.execute_query_as_list(p_details)
def incident_progression(date, query, ip):
    """Return dendrogram rows for an anchor (an ip or a dns query).

    Returns None when neither filter is supplied.
    """
    if not ip and not query:
        return None
    db = Configuration.db()
    # when anchored on an ip we report query names, and vice versa
    detail_col = "dns_qry_name" if ip else "ip_dst"
    anchor = query if query else ip
    sql = ("""
        SELECT anchor,total,{0}
        FROM {1}.dns_threat_dendro
        WHERE y={2} AND m={3} AND d={4}
            AND anchor = '{5}'
        """).format(detail_col, db, date.year, date.month, date.day, anchor)
    return ImpalaEngine.execute_query_as_list(sql)
def expanded_search(date, ip):
    """Aggregate raw flow traffic touching *ip* (as src or dst) on *date*."""
    db = Configuration.db()
    sql = ("""
        SELECT min(treceived) as firstseen, max(treceived) as lastseen,
            sip as srcip, dip as dstip, sport as sport,
            dport as dport, count(sip) as conns, max(ipkt) as maxpkts,
            avg(ipkt) as avgpkts, max(ibyt) as maxbyts, avg(ibyt) as avgbyts
        FROM {0}.flow
        WHERE y={1} AND m={2} AND d={3}
            AND (sip ='{4}' OR dip='{4}')
        GROUP BY sip, dip,sport,dport
        """).format(db, date.year, date.month, date.day, ip)
    return ImpalaEngine.execute_query_as_list(sql)
def incident_progression(date, query, ip):
    """Fetch dns_threat_dendro rows for the given anchor value."""
    if not ip and not query:
        return None
    db = Configuration.db()
    projected = "dns_qry_name" if ip else "ip_dst"
    threat_sql = ("""
        SELECT anchor,total,{0}
        FROM {1}.dns_threat_dendro
        WHERE y={2} AND m={3} AND d={4}
            AND anchor = '{5}'
        """)
    threat_sql = threat_sql.format(projected, db, date.year, date.month,
                                   date.day, query if query else ip)
    return ImpalaEngine.execute_query_as_list(threat_sql)
def suspicious_connections(date, ip=None, limit=250):
    """Return flow score rows for *date* that have not been threat-scored."""
    db = Configuration.db()
    base_sql = ("""
        SELECT STRAIGHT_JOIN fs.tstart,fs.srcip,fs.dstip,fs.sport,fs.dport,proto,
            ipkt,ibyt,opkt,obyt,ml_score,rank,srcip_internal,
            dstip_internal,src_geoloc,dst_geoloc,src_domain,
            dst_domain,src_rep,dst_rep
        FROM {0}.flow_scores fs
        LEFT JOIN {0}.flow_threat_investigation ft
            ON (( fs.srcip = ft.srcip) OR ( fs.dstip = ft.dstip))
        WHERE fs.y={1} AND fs.m={2} and fs.d={3}
            AND ( ft.srcip is NULL AND ft.dstip is NULL )
        """).format(db, date.year, date.month, date.day)

    tail = ""
    if ip:
        # the ip may appear on either end of the connection
        tail = " AND ( fs.srcip='{0}' OR fs.dstip='{0}')".format(ip)
    tail += " ORDER BY rank limit {0}".format(limit)
    return ImpalaEngine.execute_query_as_list(base_sql + tail)
def suspicious_queries(date, ip=None, query=None, limit=250):
    """List unreviewed DNS score rows, optionally filtered by ip / query."""
    db = Configuration.db()
    sql = ("""
        SELECT STRAIGHT_JOIN ds.unix_tstamp,frame_len,ds.ip_dst,ds.dns_qry_name,
            dns_qry_class,dns_qry_type,dns_qry_rcode,ml_score,tld,
            query_rep,hh,dns_qry_class_name,dns_qry_type_name,
            dns_qry_rcode_name,network_context
        FROM {0}.dns_scores ds
        LEFT JOIN {0}.dns_threat_investigation dt
            ON (ds.dns_qry_name = dt.dns_qry_name)
        WHERE ds.y={1} AND ds.m={2} AND ds.d={3}
            AND (dt.dns_qry_name is NULL)
        """).format(db, date.year, date.month, date.day)

    if ip:
        sql += " AND ds.ip_dst = '{0}'".format(ip)
    if query:
        sql += " AND ds.dns_qry_name LIKE '%{0}%'".format(query)
    sql += " ORDER BY ds.ml_score limit {0}".format(limit)
    return ImpalaEngine.execute_query_as_list(sql)
def create_connection():
    """Open an impyla connection honoring optional Kerberos / SSL settings
    from config, and return a cursor bound to the configured database."""
    impala_host, impala_port = config.impala()
    conf = {}
    # TODO: if using hive, kerberos service name must be changed,
    # impyla sets 'impala' as default
    # NOTE(review): service_name is never merged into conf, so the
    # kerberos_service_name override is currently dead code — confirm.
    service_name = {'kerberos_service_name': 'impala'}
    if config.kerberos_enabled():
        # principal/keytab/sasl_mech/security_proto are read but unused here;
        # presumably consumed elsewhere (e.g. kinit) — verify.
        principal, keytab, sasl_mech, security_proto = config.kerberos()
        conf.update({
            'auth_mechanism': 'GSSAPI',
        })
    if config.ssl_enabled():
        ssl_verify, ca_location, cert, key = config.ssl()
        # NOTE(review): passes the client cert as ca_cert and ssl_verify as
        # use_ssl — looks swapped; confirm against impyla connect() kwargs.
        conf.update({'ca_cert': cert, 'use_ssl': ssl_verify})
    db = config.db()
    conn = connect(host=impala_host, port=int(impala_port), database=db, **conf)
    return conn.cursor()
def score_connection(score, date, src_ip=None, dst_ip=None, src_port=None,
                     dst_port=None):
    """Attach an analyst *score* to matching flow connections.

    Copies matching flow_scores rows into flow_threat_investigation with the
    score, then appends them to the day's ml_feedback.csv on HDFS (writing a
    header when the feedback folder is empty). Returns True, or False when no
    filter at all is supplied.
    """
    if not src_ip and not dst_ip and not src_port and not dst_port:
        return False
    db = Configuration.db()
    # get connections to score
    connections_query = ("""
        SELECT tstart,srcip,dstip,sport,dport, ibyt,ipkt
        FROM {0}.flow_scores
        WHERE y = {1} AND m={2} AND d={3}
        """).format(db, date.year, date.month, date.day)

    connections_filter = ""
    connections_filter += " AND srcip = '{0}'".format(src_ip) if src_ip else ""
    connections_filter += " AND dstip = '{0}'".format(dst_ip) if dst_ip else ""
    connections_filter += " AND sport = {0}" \
        .format(str(src_port)) if src_port else ""
    connections_filter += " AND dport = {0}" \
        .format(str(dst_port)) if dst_port else ""
    connections = ImpalaEngine.execute_query(connections_query
                                             + connections_filter)

    # add score to connections: build one multi-row VALUES insert
    insert_command = ("""
        INSERT INTO {0}.flow_threat_investigation
        PARTITION (y={1},m={2},d={3})
        VALUES (""") \
        .format(db, date.year, date.month, date.day)

    fb_data = []
    first = True
    num_rows = 0
    for row in connections:
        # insert into flow_threat_investigation.
        threat_data = (row[0], row[1], row[2], row[3], row[4], score)
        fb_data.append([score, row[0], row[1], row[2], row[3], row[4],
                        row[5], row[6]])
        insert_command += "{0}{1}".format("," if not first else "", threat_data)
        first = False
        num_rows += 1
    insert_command += ")"
    if num_rows > 0:
        ImpalaEngine.execute_query(insert_command)

    # create feedback file.
    app_path = Configuration.spot()
    feedback_path = "{0}/flow/scored_results/{1}{2}{3}/feedback" \
        .format(app_path, date.year, str(date.month).zfill(2),
                str(date.day).zfill(2))
    append_file = True
    if len(HDFSClient.list_dir(feedback_path)) == 0:
        # first write of the day gets a CSV header row
        fb_data.insert(0, ["sev", "tstart", "sip", "dip", "sport", "dport",
                           "ipkt", "ibyt"])
        append_file = False
    HDFSClient.put_file_csv(fb_data, feedback_path, "ml_feedback.csv",
                            append_file=append_file)
    return True
def create_timeline(anchor, clientips, date, top_results):
    """Rebuild the proxy timeline partition for threat *anchor*.

    Saves the other threats' rows, wipes the partition's HDFS folder,
    re-inserts the saved rows, then inserts a freshly aggregated timeline
    for *anchor* limited to the busiest *top_results* client IPs.

    Fixes vs. previous revision: quotes in *anchor* are escaped with a
    backslash (the old "//'" replacement left them unescaped), and the
    status message is now actually returned.
    """
    response = ""
    susp_ips = []
    if clientips:
        # busiest clients first
        srtlist = sorted(list(clientips.items()), key=lambda x: x[1],
                         reverse=True)
        for val in srtlist[:top_results]:
            susp_ips.append(val[0])

    if anchor != "":
        db = Configuration.db()
        # Backslash-escape quotes so the SQL literal stays well formed.
        safe_anchor = anchor.replace("'", "\\'")
        time_line_query = ("""
            SELECT p_threat,tstart,tend,duration,clientip,respcode,respcodename
            FROM {0}.proxy_timeline
            WHERE y={1} AND m={2} AND d={3} AND p_threat != '{4}'
            """).format(db, date.year, date.month, date.day, safe_anchor)
        tmp_timeline_data = ImpalaEngine.execute_query_as_list(time_line_query)

        imp_query = ("""
            INSERT INTO TABLE {0}.proxy_timeline
            PARTITION (y={2}, m={3},d={4})
            SELECT '{7}' as p_threat,
                concat(cast(p_date as string), ' ', cast(MIN(p_time) as string)) AS tstart,
                concat(cast(p_date as string), ' ', cast(MAX(p_time) as string)) AS tend,
                SUM(duration) AS duration, clientip,
                respcode,"respCodeName" as respCodeName
            FROM {0}.proxy
            WHERE fulluri='{1}' AND clientip IN ({5})
                AND y='{2}' AND m='{3}' AND d='{4}'
            GROUP BY clientip, p_time, respcode, p_date
            LIMIT {6}
            """)\
            .format(db, anchor, date.year, str(date.month).zfill(2),
                    str(date.day).zfill(2),
                    ("'" + "','".join(susp_ips) + "'"),
                    top_results, anchor)

        # Impala cannot update in place: drop the partition's files, then
        # re-insert the preserved rows plus the new aggregate.
        app_path = Configuration.spot()
        old_file = "{0}/proxy/hive/oa/timeline/y={1}/m={2}/d={3}"\
            .format(app_path, date.year, date.month, date.day)
        HDFSClient.delete_folder(old_file, "impala")
        ImpalaEngine.execute_query("invalidate metadata")

        # Insert temporary values (rows belonging to other threats).
        for item in tmp_timeline_data:
            insert_query = ("""
                INSERT INTO {0}.proxy_timeline
                PARTITION(y={1} , m={2} ,d={3})
                VALUES ('{4}', '{5}', '{6}',{7},'{8}','{9}','{10}')
                """)\
                .format(db, date.year, date.month, date.day,
                        item["p_threat"], item["tstart"], item["tend"],
                        item["duration"], item["clientip"],
                        item["respcode"], item["respcodename"])
            ImpalaEngine.execute_query(insert_query)

        ImpalaEngine.execute_query(imp_query)
        response = "Timeline successfully saved"
    else:
        response = "Timeline couldn't be created"
    # Previously the computed status was silently dropped.
    return response
def create_timeline(anchor, clientips, date, top_results):
    """Regenerate the proxy timeline partition for *anchor* and return a
    status message.

    Other threats' rows are read first, the partition folder is removed,
    those rows are re-inserted, then a new aggregate for *anchor* over its
    top client IPs is inserted.

    Fixes: single quotes in *anchor* are backslash-escaped (the previous
    "//'" replacement left them unescaped), and the status string is
    returned instead of being discarded.
    """
    response = ""
    susp_ips = []
    if clientips:
        srtlist = sorted(list(clientips.items()), key=lambda x: x[1],
                         reverse=True)
        for val in srtlist[:top_results]:
            susp_ips.append(val[0])

    if anchor != "":
        db = Configuration.db()
        # Keep the SQL literal valid when the uri contains quotes.
        safe_anchor = anchor.replace("'", "\\'")
        time_line_query = ("""
            SELECT p_threat,tstart,tend,duration,clientip,respcode,respcodename
            FROM {0}.proxy_timeline
            WHERE y={1} AND m={2} AND d={3} AND p_threat != '{4}'
            """).format(db, date.year, date.month, date.day, safe_anchor)
        tmp_timeline_data = ImpalaEngine.execute_query_as_list(time_line_query)

        imp_query = ("""
            INSERT INTO TABLE {0}.proxy_timeline
            PARTITION (y={2}, m={3},d={4})
            SELECT '{7}' as p_threat,
                concat(cast(p_date as string), ' ', cast(MIN(p_time) as string)) AS tstart,
                concat(cast(p_date as string), ' ', cast(MAX(p_time) as string)) AS tend,
                SUM(duration) AS duration, clientip,
                respcode,"respCodeName" as respCodeName
            FROM {0}.proxy
            WHERE fulluri='{1}' AND clientip IN ({5})
                AND y='{2}' AND m='{3}' AND d='{4}'
            GROUP BY clientip, p_time, respcode, p_date
            LIMIT {6}
            """)\
            .format(db, anchor, date.year, str(date.month).zfill(2),
                    str(date.day).zfill(2),
                    ("'" + "','".join(susp_ips) + "'"),
                    top_results, anchor)

        # Remove the partition folder so the re-insert starts clean.
        app_path = Configuration.spot()
        old_file = "{0}/proxy/hive/oa/timeline/y={1}/m={2}/d={3}"\
            .format(app_path, date.year, date.month, date.day)
        HDFSClient.delete_folder(old_file, "impala")
        ImpalaEngine.execute_query("invalidate metadata")

        # Insert temporary values (the preserved rows).
        for item in tmp_timeline_data:
            insert_query = ("""
                INSERT INTO {0}.proxy_timeline
                PARTITION(y={1} , m={2} ,d={3})
                VALUES ('{4}', '{5}', '{6}',{7},'{8}','{9}','{10}')
                """)\
                .format(db, date.year, date.month, date.day,
                        item["p_threat"], item["tstart"], item["tend"],
                        item["duration"], item["clientip"],
                        item["respcode"], item["respcodename"])
            ImpalaEngine.execute_query(insert_query)

        ImpalaEngine.execute_query(imp_query)
        response = "Timeline successfully saved"
    else:
        response = "Timeline couldn't be created"
    return response
def score_request(date, score, uri):
    """Record an analyst *score* for a proxy *uri* and write feedback CSV.

    Matching proxy_scores rows are copied into proxy_threat_investigation
    and appended to the day's ml_feedback.csv on HDFS. Returns True, or
    None when neither score nor uri is given.
    NOTE(review): uses the Python 2-only ``md5`` module (md5.new); under
    Python 3 this requires hashlib — confirm runtime.
    """
    if not score and not uri:
        return None
    db = Configuration.db()
    p_query = ("""
        SELECT tdate,time,clientip,host,reqmethod,useragent,resconttype
            ,duration,username,webcat,referer,respcode,uriport
            ,uripath,uriquery,serverip,scbytes,csbytes,fulluri
            ,word,ml_score,uri_rep,respcode_name,network_context
        FROM {0}.proxy_scores
        WHERE y={1} and m={2} and d={3}
            AND fulluri = '{4}'
        """).format(db, date.year, date.month, date.day, uri)
    connections = ImpalaEngine.execute_query(p_query)

    # add score to connections: one multi-row VALUES insert
    insert_command = ("""
        INSERT INTO {0}.proxy_threat_investigation
        PARTITION (y={1},m={2},d={3})
        VALUES (""") \
        .format(db, date.year, date.month, date.day)

    fb_data = []
    first = True
    num_rows = 0
    for row in connections:
        cip_index = row[2]   # clientip
        uri_index = row[18]  # fulluri
        # NOTE(review): tme_index also reads row[2] (clientip); row[1] is
        # the time column — looks like a copy/paste slip, confirm.
        tme_index = row[2]
        hash_field = [str( md5.new(str(cip_index) + str(uri_index)).hexdigest()
            + str((tme_index.split(":"))[0]) )]
        threat_data = (row[0], row[18], score)
        fb_data.append([row[0], row[1], row[2], row[3], row[4], row[5], row[6],
            row[7], row[8], row[9], row[10], row[11], row[12], row[13],
            row[14], row[15], row[16], row[17], row[18], row[19], score,
            row[20], row[21], row[22], row[23], hash_field])
        insert_command += "{0}{1}".format("," if not first else "", threat_data)
        first = False
        num_rows += 1
    insert_command += ")"
    if num_rows > 0:
        ImpalaEngine.execute_query(insert_command)

    # create feedback file.
    app_path = Configuration.spot()
    feedback_path = "{0}/proxy/scored_results/{1}{2}{3}/feedback"\
        .format(app_path, date.year, str(date.month).zfill(2),
                str(date.day).zfill(2))
    ap_file = True
    if len(HDFSClient.list_dir(feedback_path)) == 0:
        # first write of the day gets a header row
        fb_data.insert(0, ["p_date", "p_time", "clientip", "host", "reqmethod",
            "useragent", "resconttype", "duration", "username", "webcat",
            "referer", "respcode", "uriport", "uripath", "uriquery",
            "serverip", "scbytes", "csbytes", "fulluri", "word", "score",
            "uri_rep", "uri_sev", "respcode_name", "network_context", "hash"])
        ap_file = False
    HDFSClient.put_file_csv(fb_data, feedback_path, "ml_feedback.csv",
                            append_file=ap_file)
    return True
def score_connection(date, ip="", dns="", ip_sev=0, dns_sev=0):
    """Score matching DNS connections and record analyst feedback.

    Copies matching dns_scores rows into dns_threat_investigation with the
    given severities (each severity applies only to the side that matched),
    then appends the rows to the day's ml_feedback.csv on HDFS. Returns
    True, or False when neither an ip nor a dns filter is given.
    """
    if (not ip and not ip_sev) and (not dns and not dns_sev):
        return False
    db = Configuration.db()
    sq_query = ("""
        SELECT frame_time,unix_tstamp,frame_len,ip_dst,dns_qry_name,dns_qry_class,
            dns_qry_type,dns_qry_rcode,ml_score,tld,query_rep,
            hh,dns_qry_class_name,dns_qry_type_name,dns_qry_rcode_name,
            network_context
        FROM {0}.dns_scores
        WHERE y={1} and m={2} and d={3} AND (
        """).format(db, date.year, date.month, date.day)

    # open paren above is closed by the filter built here
    connections_filter = ""
    connections_filter += "ip_dst = '{0}' ".format(ip) if ip else ""
    connections_filter += " OR " if ip and dns else ""
    connections_filter += "dns_qry_name = '{0}' ".format(dns) if dns else ""
    connections_filter += ")"
    connections = ImpalaEngine.execute_query(sq_query + connections_filter)

    # add score to connections: one multi-row VALUES insert
    insert_command = ("""INSERT INTO {0}.dns_threat_investigation
        PARTITION (y={1},m={2},d={3})
        VALUES (""") \
        .format(db, date.year, date.month, date.day)

    fb_data = []
    first = True
    num_rows = 0
    for row in connections:
        # insert into dns_threat_investigation.
        # severity counts only for the side (ip/dns) that actually matched
        threat_data = (row[1], row[3], row[4], ip_sev if ip == row[3] else 0,
                       dns_sev if dns == row[4] else 0)
        fb_data.append([row[0], row[2], row[3], row[4], row[5], row[6], row[7],
            row[8], row[9], row[10], row[11], ip_sev, dns_sev, row[12],
            row[13], row[14], row[15], row[1]])
        insert_command += "{0}{1}".format("," if not first else "", threat_data)
        first = False
        num_rows += 1
    insert_command += ")"
    if num_rows > 0:
        ImpalaEngine.execute_query(insert_command)

    # create feedback file.
    app_path = Configuration.spot()
    feedback_path = "{0}/dns/scored_results/{1}{2}{3}/feedback"\
        .format(app_path, date.year, str(date.month).zfill(2),
                str(date.day).zfill(2))
    ap_file = True
    if len(HDFSClient.list_dir(feedback_path)) == 0:
        # first write of the day gets a header row
        fb_data.insert(0, ["frame_time", "frame_len", "ip_dst", "dns_qry_name",
            "dns_qry_class", "dns_qry_type", "dns_qry_rcode", "score", "tld",
            "query_rep", "hh", "ip_sev", "dns_sev", "dns_qry_class_name",
            "dns_qry_type_name", "dns_qry_rcode_name", "network_context",
            "unix_tstamp"])
        ap_file = False
    HDFSClient.put_file_csv(fb_data, feedback_path, "ml_feedback.csv",
                            append_file=ap_file)
    return True
def score_connection(date, ip="", dns="", ip_sev=0, dns_sev=0):
    """Attach analyst severities to matching DNS score rows.

    Matching dns_scores rows are inserted into dns_threat_investigation and
    appended to the day's HDFS feedback CSV (header written on first use).
    Returns True, or False when no ip/dns filter is supplied.
    """
    if (not ip and not ip_sev) and (not dns and not dns_sev):
        return False
    db = Configuration.db()
    sq_query = ("""
        SELECT frame_time,unix_tstamp,frame_len,ip_dst,dns_qry_name,dns_qry_class,
            dns_qry_type,dns_qry_rcode,ml_score,tld,query_rep,
            hh,dns_qry_class_name,dns_qry_type_name,dns_qry_rcode_name,
            network_context
        FROM {0}.dns_scores
        WHERE y={1} and m={2} and d={3} AND (
        """).format(db, date.year, date.month, date.day)

    # closes the trailing "(" of the base query
    connections_filter = ""
    connections_filter += "ip_dst = '{0}' ".format(ip) if ip else ""
    connections_filter += " OR " if ip and dns else ""
    connections_filter += "dns_qry_name = '{0}' ".format(dns) if dns else ""
    connections_filter += ")"
    connections = ImpalaEngine.execute_query(sq_query + connections_filter)

    # add score to connections
    insert_command = ("""INSERT INTO {0}.dns_threat_investigation
        PARTITION (y={1},m={2},d={3})
        VALUES (""") \
        .format(db, date.year, date.month, date.day)

    fb_data = []
    first = True
    num_rows = 0
    for row in connections:
        # insert into dns_threat_investigation.
        threat_data = (row[1], row[3], row[4], ip_sev if ip == row[3] else 0,
                       dns_sev if dns == row[4] else 0)
        fb_data.append([row[0], row[2], row[3], row[4], row[5], row[6], row[7],
            row[8], row[9], row[10], row[11], ip_sev, dns_sev, row[12],
            row[13], row[14], row[15], row[1]])
        insert_command += "{0}{1}".format("," if not first else "", threat_data)
        first = False
        num_rows += 1
    insert_command += ")"
    if num_rows > 0:
        ImpalaEngine.execute_query(insert_command)

    # create feedback file.
    app_path = Configuration.spot()
    feedback_path = "{0}/dns/scored_results/{1}{2}{3}/feedback"\
        .format(app_path, date.year, str(date.month).zfill(2),
                str(date.day).zfill(2))
    ap_file = True
    if len(HDFSClient.list_dir(feedback_path)) == 0:
        fb_data.insert(0, ["frame_time", "frame_len", "ip_dst", "dns_qry_name",
            "dns_qry_class", "dns_qry_type", "dns_qry_rcode", "score", "tld",
            "query_rep", "hh", "ip_sev", "dns_sev", "dns_qry_class_name",
            "dns_qry_type_name", "dns_qry_rcode_name", "network_context",
            "unix_tstamp"])
        ap_file = False
    HDFSClient.put_file_csv(fb_data, feedback_path, "ml_feedback.csv",
                            append_file=ap_file)
    return True
def score_request(date, score, uri):
    """Record an analyst *score* for the given proxy fulluri.

    Matching proxy_scores rows go into proxy_threat_investigation and are
    appended to the day's ml_feedback.csv on HDFS. Returns True, or None
    when neither score nor uri is supplied.
    NOTE(review): relies on the Python 2-only ``md5`` module (md5.new).
    """
    if not score and not uri:
        return None
    db = Configuration.db()
    p_query = ("""
        SELECT tdate,time,clientip,host,reqmethod,useragent,resconttype
            ,duration,username,webcat,referer,respcode,uriport
            ,uripath,uriquery,serverip,scbytes,csbytes,fulluri
            ,word,ml_score,uri_rep,respcode_name,network_context
        FROM {0}.proxy_scores
        WHERE y={1} and m={2} and d={3}
            AND fulluri = '{4}'
        """).format(db, date.year, date.month, date.day, uri)
    connections = ImpalaEngine.execute_query(p_query)

    # add score to connections
    insert_command = ("""
        INSERT INTO {0}.proxy_threat_investigation
        PARTITION (y={1},m={2},d={3})
        VALUES (""") \
        .format(db, date.year, date.month, date.day)

    fb_data = []
    first = True
    num_rows = 0
    for row in connections:
        cip_index = row[2]   # clientip
        uri_index = row[18]  # fulluri
        # NOTE(review): reads row[2] again; row[1] is the time column —
        # looks like a copy/paste slip, confirm before changing.
        tme_index = row[2]
        hash_field = [str( md5.new(str(cip_index) + str(uri_index)).hexdigest()
            + str((tme_index.split(":"))[0]) )]
        threat_data = (row[0], row[18], score)
        fb_data.append([row[0], row[1], row[2], row[3], row[4], row[5], row[6],
            row[7], row[8], row[9], row[10], row[11], row[12], row[13],
            row[14], row[15], row[16], row[17], row[18], row[19], score,
            row[20], row[21], row[22], row[23], hash_field])
        insert_command += "{0}{1}".format("," if not first else "", threat_data)
        first = False
        num_rows += 1
    insert_command += ")"
    if num_rows > 0:
        ImpalaEngine.execute_query(insert_command)

    # create feedback file.
    app_path = Configuration.spot()
    feedback_path = "{0}/proxy/scored_results/{1}{2}{3}/feedback"\
        .format(app_path, date.year, str(date.month).zfill(2),
                str(date.day).zfill(2))
    ap_file = True
    if len(HDFSClient.list_dir(feedback_path)) == 0:
        fb_data.insert(0, ["p_date", "p_time", "clientip", "host", "reqmethod",
            "useragent", "resconttype", "duration", "username", "webcat",
            "referer", "respcode", "uriport", "uripath", "uriquery",
            "serverip", "scbytes", "csbytes", "fulluri", "word", "score",
            "uri_rep", "uri_sev", "respcode_name", "network_context", "hash"])
        ap_file = False
    HDFSClient.put_file_csv(fb_data, feedback_path, "ml_feedback.csv",
                            append_file=ap_file)
    return True
def create_connection():
    """Open an Impala connection and return a cursor bound to the app db."""
    host, port = Config.impala()
    connection = connect(host=host, port=int(port), database=Config.db())
    return connection.cursor()