def expanded_search(date, query=None, ip=None, limit=20): if not ip and not query: return False db = Configuration.db() if ip: count = "dns_qry_name" filter_param = "ip_dst" filter_value = ip else: count = "ip_dst" filter_param = "dns_qry_name" filter_value = query expanded_query = (""" SELECT COUNT({0}) as total,dns_qry_name,ip_dst FROM {1}.dns WHERE y={2} AND m={3} AND d={4} AND {5} = '{6}' GROUP BY {0},{5} ORDER BY total DESC LIMIT {7} """).format(count,db,date.year,date.month,date.day,\ filter_param,filter_value,limit if limit else 20) return ImpalaEngine.execute_query_as_list(expanded_query)
def suspicious_requests(date, uri=None, ip=None, limit=250): db = Configuration.db() proxy_query = (""" SELECT STRAIGHT_JOIN ps.tdate,ps.time,ps.clientip,ps.host,ps.reqmethod,ps.useragent, ps.resconttype,ps.duration,ps.username,ps.webcat,ps.referer, ps.respcode,ps.uriport,ps.uripath,ps.uriquery,ps.serverip,ps.scbytes, ps.csbytes,ps.fulluri,ps.ml_score,ps.uri_rep,ps.respcode_name, ps.network_context FROM {0}.proxy_scores ps LEFT JOIN {0}.proxy_threat_investigation pt ON (ps.fulluri = pt.fulluri) WHERE ps.y={1} AND ps.m={2} AND ps.d={3} AND (pt.fulluri is NULL) """).format(db, date.year, date.month, date.day) p_filter = "" p_filter += " AND ps.fulluri LIKE '%{0}%'".format(uri) if uri else "" p_filter += " AND ps.clientip = '{0}'".format(ip) if ip else "" p_filter += " ORDER BY ps.ml_score limit {0}".format(limit) proxy_query = proxy_query + p_filter return ImpalaEngine.execute_query_as_list(proxy_query)
def expanded_search(date,query=None,ip=None,limit=20): if not ip and not query: return False db = Configuration.db() if ip: count = "dns_qry_name" filter_param = "ip_dst" filter_value = ip else: count = "ip_dst" filter_param = "dns_qry_name" filter_value = query expanded_query = (""" SELECT COUNT({0}) as total,dns_qry_name,ip_dst FROM {1}.dns WHERE y={2} AND m={3} AND d={4} AND {5} = '{6}' GROUP BY {0},{5} ORDER BY total DESC LIMIT {7} """).format(count,db,date.year,date.month,date.day,\ filter_param,filter_value,limit if limit else 20) return ImpalaEngine.execute_query_as_list(expanded_query)
def suspicious_requests(date,uri=None,ip=None,limit=250): db = Configuration.db() proxy_query = (""" SELECT STRAIGHT_JOIN ps.tdate,ps.time,ps.clientip,ps.host,ps.reqmethod,ps.useragent, ps.resconttype,ps.duration,ps.username,ps.webcat,ps.referer, ps.respcode,ps.uriport,ps.uripath,ps.uriquery,ps.serverip,ps.scbytes, ps.csbytes,ps.fulluri,ps.ml_score,ps.uri_rep,ps.respcode_name, ps.network_context FROM {0}.proxy_scores ps LEFT JOIN {0}.proxy_threat_investigation pt ON (ps.fulluri = pt.fulluri) WHERE ps.y={1} AND ps.m={2} AND ps.d={3} AND (pt.fulluri is NULL) """).format(db,date.year,date.month,date.day) p_filter = "" p_filter += " AND ps.fulluri LIKE '%{0}%'".format(uri) if uri else "" p_filter += " AND ps.clientip = '{0}'".format(ip) if ip else "" p_filter += " ORDER BY ps.ml_score limit {0}".format(limit) proxy_query = proxy_query + p_filter return ImpalaEngine.execute_query_as_list(proxy_query)
def suspicious_queries(date, ip=None, query=None, limit=250): db = Configuration.db() sq_query = (""" SELECT STRAIGHT_JOIN ds.unix_tstamp,frame_len,ds.ip_dst,ds.dns_qry_name, dns_qry_class,dns_qry_type,dns_qry_rcode,ml_score,tld, query_rep,hh,dns_qry_class_name,dns_qry_type_name, dns_qry_rcode_name,network_context FROM {0}.dns_scores ds LEFT JOIN {0}.dns_threat_investigation dt ON (ds.dns_qry_name = dt.dns_qry_name) WHERE ds.y={1} AND ds.m={2} AND ds.d={3} AND (dt.dns_qry_name is NULL) """).format(db, date.year, date.month, date.day) sq_filter = "" sq_filter += " AND ds.ip_dst = '{0}'".format(ip) if ip else "" sq_filter += " AND ds.dns_qry_name LIKE '%{0}%'".format( query) if query else "" sq_filter += " ORDER BY ds.ml_score limit {0}".format(limit) sq_query = sq_query + sq_filter return ImpalaEngine.execute_query_as_list(sq_query)
def chord_details(ip,date): db = Configuration.db() chord_query = (""" SELECT srcip,dstip,ibyt,ipkt FROM {0}.flow_chords WHERE y={1} AND m={2} AND d={3} AND ip_threat='{4}' """).format(db,date.year,date.month,date.day,ip) return ImpalaEngine.execute_query_as_list(chord_query)
def chord_details(ip,date): db = Configuration.db() chord_query = (""" SELECT srcip,dstip,ibyt,ipkt FROM {0}.flow_chords WHERE y={1} AND m={2} AND d={3} AND ip_threat='{4}' """).format(db,date.year,date.month,date.day,ip) return ImpalaEngine.execute_query_as_list(chord_query)
def get_scored_requests(date): db = Configuration.db() sc_query = (""" SELECT tdate,fulluri,uri_sev FROM {0}.proxy_threat_investigation WHERE y={1} AND m={2} AND d={3} """).format(db, date.year, date.month, date.day) return ImpalaEngine.execute_query_as_list(sc_query)
def time_line(date,uri): db = Configuration.db() time_line_query = (""" SELECT p_threat,tstart,tend,duration,clientip,respcode,respcodename FROM {0}.proxy_timeline WHERE y={1} AND m={2} AND d={3} AND p_threat = '{4}' """).format(db,date.year,date.month,date.day,uri) return ImpalaEngine.execute_query_as_list(time_line_query)
def get_scored_connections(date): db = Configuration.db() sc_query = (""" SELECT unix_tstamp,ip_dst,dns_qry_name,ip_sev,dns_sev FROM {0}.dns_threat_investigation WHERE y={1} AND m={2} AND d={3} """).format(db,date.year,date.month,date.day) return ImpalaEngine.execute_query_as_list(sc_query)
def get_scored_requests(date): db = Configuration.db() sc_query = (""" SELECT tdate,fulluri,uri_sev FROM {0}.proxy_threat_investigation WHERE y={1} AND m={2} AND d={3} """).format(db,date.year,date.month,date.day) return ImpalaEngine.execute_query_as_list(sc_query)
def time_line(date, uri): db = Configuration.db() time_line_query = (""" SELECT p_threat,tstart,tend,duration,clientip,respcode,respcodename FROM {0}.proxy_timeline WHERE y={1} AND m={2} AND d={3} AND p_threat = '{4}' """).format(db, date.year, date.month, date.day, uri) return ImpalaEngine.execute_query_as_list(time_line_query)
def get_scored_connections(date): db = Configuration.db() sc_query = (""" SELECT unix_tstamp,ip_dst,dns_qry_name,ip_sev,dns_sev FROM {0}.dns_threat_investigation WHERE y={1} AND m={2} AND d={3} """).format(db, date.year, date.month, date.day) return ImpalaEngine.execute_query_as_list(sc_query)
def expanded_search(date,uri): db = Configuration.db() expanded_query = (""" SELECT p_date, p_time, clientip, username, duration, fulluri,\ webcat, respcode, reqmethod,useragent, resconttype,\ referer, uriport, serverip, scbytes, csbytes FROM {0}.proxy WHERE y='{1}' AND m='{2}' AND d='{3}' AND (fulluri='{4}' OR referer ='{4}') ORDER BY p_time """)\ .format(db,date.year,str(date.month).zfill(2),str(date.day).zfill(2),uri) return ImpalaEngine.execute_query_as_list(expanded_query)
def client_details(date, ip): db = Configuration.db() client_query =(""" SELECT ip_dst,dns_a,dns_qry_name,ip_dst FROM {0}.dns_dendro WHERE y={1} AND m={2} AND d={3} AND ip_dst='{4}' """).format(db,date.year,date.month,date.day,ip) return ImpalaEngine.execute_query_as_list(client_query)
def time_line(ip,date): db = Configuration.db() time_line_query = (""" SELECT ip_threat,tstart,tend,srcip,dstip,proto, sport,dport,ipkt,ibyt FROM {0}.flow_timeline WHERE y={1} AND m={2} AND d={3} AND ip_threat = '{4}' """).format(db,date.year,date.month,date.day,ip) return ImpalaEngine.execute_query_as_list(time_line_query)
def get_scored_connections(date): db = Configuration.db() scored_query = (""" SELECT tstart,srcip,dstip,srcport,dstport,score FROM {0}.flow_threat_investigation WHERE y={1} AND m={2} AND d={3} """).format(db,date.year,date.month,date.day) return ImpalaEngine.execute_query_as_list(scored_query)
def get_scored_connections(date): db = Configuration.db() scored_query = (""" SELECT tstart,srcip,dstip,srcport,dstport,score FROM {0}.flow_threat_investigation WHERE y={1} AND m={2} AND d={3} """).format(db,date.year,date.month,date.day) return ImpalaEngine.execute_query_as_list(scored_query)
def expanded_search(date, uri): db = Configuration.db() expanded_query = (""" SELECT p_date, p_time, clientip, username, duration, fulluri,\ webcat, respcode, reqmethod,useragent, resconttype,\ referer, uriport, serverip, scbytes, csbytes FROM {0}.proxy WHERE y='{1}' AND m='{2}' AND d='{3}' AND (fulluri='{4}' OR referer ='{4}') ORDER BY p_time """)\ .format(db,date.year,str(date.month).zfill(2),str(date.day).zfill(2),uri) return ImpalaEngine.execute_query_as_list(expanded_query)
def client_details(date, ip): db = Configuration.db() client_query = (""" SELECT ip_dst,dns_a,dns_qry_name,ip_dst FROM {0}.dns_dendro WHERE y={1} AND m={2} AND d={3} AND ip_dst='{4}' """).format(db, date.year, date.month, date.day, ip) return ImpalaEngine.execute_query_as_list(client_query)
def time_line(ip,date): db = Configuration.db() time_line_query = (""" SELECT ip_threat,tstart,tend,srcip,dstip,proto, sport,dport,ipkt,ibyt FROM {0}.flow_timeline WHERE y={1} AND m={2} AND d={3} AND ip_threat = '{4}' """).format(db,date.year,date.month,date.day,ip) return ImpalaEngine.execute_query_as_list(time_line_query)
def ingest_summary(start_date, end_date): db = Configuration.db() is_query = (""" SELECT tdate,total FROM {0}.proxy_ingest_summary WHERE ( y >= {1} and y <= {2}) AND ( m >= {3} and m <= {4}) AND ( d >= {5} and d <= {6}) """)\ .format(db,start_date.year,end_date.year,start_date.month,end_date.month, start_date.day, end_date.day) return ImpalaEngine.execute_query_as_list(is_query)
def ingest_summary(start_date,end_date): db = Configuration.db() is_query = (""" SELECT tdate,total FROM {0}.proxy_ingest_summary WHERE ( y >= {1} and y <= {2}) AND ( m >= {3} and m <= {4}) AND ( d >= {5} and d <= {6}) """)\ .format(db,start_date.year,end_date.year,start_date.month,end_date.month, start_date.day, end_date.day) return ImpalaEngine.execute_query_as_list(is_query)
def ingest_summary(start_date,end_date): db = Configuration.db() daterange_select = daterange_query(start_date, end_date) is_query = (""" SELECT tdate,total FROM {0}.flow_ingest_summary WHERE {1} ORDER BY tdate """).format(db, daterange_select) return ImpalaEngine.execute_query_as_list(is_query)
def details(frame_time, query): db = Configuration.db() details_query = (""" SELECT unix_tstamp,frame_len,ip_dst,ip_src,dns_qry_name,dns_qry_class, dns_qry_type,dns_qry_rcode,dns_a,dns_qry_type_name, dns_qry_rcode_name,dns_qry_class_name FROM {0}.dns_edge WHERE y={1} AND m={2} AND d={3} AND hh={4} AND dns_qry_name = '{5}' """).format(db,frame_time.year,frame_time.month,frame_time.day,\ frame_time.hour,query) return ImpalaEngine.execute_query_as_list(details_query)
def story_board(date): db = Configuration.db() sb_query= (""" SELECT ip_threat,title,text FROM {0}.flow_storyboard WHERE y={1} AND m={2} AND d={3} """).format(db,date.year,date.month,date.day) results = ImpalaEngine.execute_query_as_list(sb_query) for row in results: row["text"] = row["text"].replace("\n","\\n") return results
def story_board(date): db = Configuration.db() sb_query = (""" SELECT p_threat,title,text FROM {0}.proxy_storyboard WHERE y={1} AND m={2} AND d={3} """).format(db, date.year, date.month, date.day) results = ImpalaEngine.execute_query_as_list(sb_query) for row in results: row["text"] = row["text"].replace("\n", "\\n") return results
def details(frame_time, query): db = Configuration.db() details_query = (""" SELECT unix_tstamp,frame_len,ip_dst,ip_src,dns_qry_name,dns_qry_class, dns_qry_type,dns_qry_rcode,dns_a,dns_qry_type_name, dns_qry_rcode_name,dns_qry_class_name FROM {0}.dns_edge WHERE y={1} AND m={2} AND d={3} AND hh={4} AND dns_qry_name = '{5}' """).format(db,frame_time.year,frame_time.month,frame_time.day,\ frame_time.hour,query) return ImpalaEngine.execute_query_as_list(details_query)
def save_comments(anchor, ip, query, title, text, date): db = Configuration.db() sb_query = (""" SELECT ip_threat,dns_threat,title,text FROM {0}.dns_storyboard WHERE y = {1} AND m= {2} AND d={3} """).format(db, date.year, date.month, date.day) sb_data = ImpalaEngine.execute_query_as_list(sb_query) # find value if already exists. saved = False for item in sb_data: if item["ip_threat"] == anchor or item["dns_threat"] == anchor: item["title"] = title item["text"] = text saved = True if not saved: sb_data.append({ 'text': text, 'ip_threat': str(ip), 'title': title, 'dns_threat': query }) #remove old file. app_path = Configuration.spot() old_file = "{0}/dns/hive/oa/storyboard/y={1}/m={2}/d={3}/"\ .format(app_path,date.year,date.month,date.day) HDFSClient.delete_folder(old_file, "impala") ImpalaEngine.execute_query("invalidate metadata") for item in sb_data: insert_query = (""" INSERT INTO {0}.dns_storyboard PARTITION(y={1} , m={2} ,d={3}) VALUES ( '{4}', '{5}', '{6}','{7}') """)\ .format(db,date.year,date.month,date.day,\ item["ip_threat"],item["dns_threat"],item["title"],item["text"]) ImpalaEngine.execute_query(insert_query) return True
def details(src_ip,dst_ip,date): db = Configuration.db() details_query = (""" SELECT tstart,srcip,dstip,sport,dport,proto,flags, tos,ibyt,ipkt,input,output,rip,obyt,opkt FROM {0}.flow_edge WHERE y={1} AND m={2} AND d={3} AND hh={4} AND mn={5} AND ((srcip='{6}' AND dstip='{7}') OR (srcip='{7}' AND dstip='{6}')) ORDER BY tstart """).format(db,date.year,date.month,date.day,date.hour, \ date.minute,src_ip,dst_ip) return ImpalaEngine.execute_query_as_list(details_query)
def ingest_summary(start_date,end_date): db = Configuration.db() is_query = (""" SELECT tdate,total FROM {0}.flow_ingest_summary WHERE ( y >= {1} AND y <= {2}) AND ( m >= {3} AND m <= {4}) AND ( d >= {5} AND d <= {6}) ORDER BY tdate """).format(db,start_date.year,end_date.year, \ start_date.month,end_date.month, \ start_date.day, end_date.day) return ImpalaEngine.execute_query_as_list(is_query)
def details(src_ip,dst_ip,date): db = Configuration.db() details_query = (""" SELECT tstart,srcip,dstip,sport,dport,proto,flags, tos,ibyt,ipkt,input,output,rip,obyt,opkt FROM {0}.flow_edge WHERE y={1} AND m={2} AND d={3} AND hh={4} AND mn={5} AND ((srcip='{6}' AND dstip='{7}') OR (srcip='{7}' AND dstip='{6}')) ORDER BY tstart """).format(db,date.year,date.month,date.day,date.hour, \ date.minute,src_ip,dst_ip) return ImpalaEngine.execute_query_as_list(details_query)
def save_comment(ip,title,text,date): #Get current table info. db = Configuration.db() sb_query = (""" SELECT ip_threat,title,text FROM {0}.flow_storyboard WHERE y = {1} AND m= {2} AND d={3} """).format(db,date.year,date.month,date.day) sb_data = ImpalaEngine.execute_query_as_list(sb_query) # find value if already exists. saved = False for item in sb_data: if item["ip_threat"] == ip: item["title"] = title item["text"] = text saved = True if not saved: sb_data.append({'text': text, 'ip_threat': str(ip), 'title': title}) #remove old file. app_path = Configuration.spot() old_file = "{0}/flow/hive/oa/storyboard/y={1}/m={2}/d={3}/" \ .format(app_path,date.year,date.month,date.day) # remove file manually to allow the comments update. HDFSClient.delete_folder(old_file,"impala") ImpalaEngine.execute_query("invalidate metadata") for item in sb_data: insert_query = (""" INSERT INTO {0}.flow_storyboard PARTITION(y={1} , m={2} ,d={3}) VALUES ( '{4}', '{5}','{6}') """) \ .format(db,date.year,date.month,date.day, \ item["ip_threat"],item["title"],item["text"]) ImpalaEngine.execute_query(insert_query) return True
def details(date,uri,ip): if not uri and not ip: return None db = Configuration.db() p_details = (""" SELECT tdate,time,clientIp,host,webcat,respcode,respcode_name ,reqmethod,useragent,resconttype,referer,uriport,serverip ,scbytes,csbytes,fulluri,hh FROM {0}.proxy_edge WHERE y={1} AND m={2} AND d={3} AND (fulluri='{4}' AND clientIp='{5}') """).format(db,date.year,date.month,date.day,uri.replace("'","//'"),ip) return ImpalaEngine.execute_query_as_list(p_details)
def details(date, uri, ip): if not uri and not ip: return None db = Configuration.db() p_details = (""" SELECT tdate,time,clientIp,host,webcat,respcode,respcode_name ,reqmethod,useragent,resconttype,referer,uriport,serverip ,scbytes,csbytes,fulluri,hh FROM {0}.proxy_edge WHERE y={1} AND m={2} AND d={3} AND (fulluri='{4}' AND clientIp='{5}') """).format(db, date.year, date.month, date.day, uri.replace("'", "//'"), ip) return ImpalaEngine.execute_query_as_list(p_details)
def expanded_search(date,ip): db = Configuration.db() expanded_query = (""" SELECT min(treceived) as firstseen, max(treceived) as lastseen, sip as srcip, dip as dstip, sport as sport, dport as dport, count(sip) as conns, max(ipkt) as maxpkts, avg(ipkt) as avgpkts, max(ibyt) as maxbyts, avg(ibyt) as avgbyts FROM {0}.flow WHERE y={1} AND m={2} AND d={3} AND (sip ='{4}' OR dip='{4}') GROUP BY sip, dip,sport,dport """).format(db,date.year,date.month,date.day,ip) return ImpalaEngine.execute_query_as_list(expanded_query)
def _ingest_summary(self): # get date parameters. yr = self._date[:4] mn = self._date[4:6] dy = self._date[6:] self._logger.info("Getting ingest summary data for the day") ingest_summary_cols = ["date", "total"] result_rows = [] df_filtered = pd.DataFrame() query_to_load = (""" SELECT frame_time, COUNT(*) as total FROM {0}.{1} WHERE y={2} AND m={3} AND d={4} AND unix_tstamp IS NOT NULL AND frame_time IS NOT NULL AND frame_len IS NOT NULL AND dns_qry_name IS NOT NULL AND ip_src IS NOT NULL AND (dns_qry_class IS NOT NULL AND dns_qry_type IS NOT NULL AND dns_qry_rcode IS NOT NULL ) GROUP BY frame_time; """).format(self._db, self._table_name, yr, mn, dy) results = impala.execute_query_as_list(query_to_load) df = pd.DataFrame(results) # Forms a new dataframe splitting the minutes from the time column df_new = pd.DataFrame([["{0}-{1}-{2} {3}:{4}".format(yr, mn, dy,\ val['frame_time'].replace(" "," ").split(" ")[3].split(":")[0].zfill(2),\ val['frame_time'].replace(" "," ").split(" ")[3].split(":")[1].zfill(2)),\ int(val['total']) if not math.isnan(val['total']) else 0 ] for key,val in df.iterrows()],columns = ingest_summary_cols) #Groups the data by minute sf = df_new.groupby(by=['date'])['total'].sum() df_per_min = pd.DataFrame({'date': sf.index, 'total': sf.values}) df_final = df_filtered.append(df_per_min, ignore_index=True).to_records( False, False) if len(df_final) > 0: query_to_insert = (""" INSERT INTO {0}.dns_ingest_summary PARTITION (y={1}, m={2}, d={3}) VALUES {4}; """).format(self._db, yr, mn, dy, tuple(df_final)) impala.execute_query(query_to_insert)
def incident_progression(date, query,ip): if not ip and not query: return None db = Configuration.db() return_value = "dns_qry_name" if ip else "ip_dst" dns_threat_query = (""" SELECT anchor,total,{0} FROM {1}.dns_threat_dendro WHERE y={2} AND m={3} AND d={4} AND anchor = '{5}' """).format(return_value,db,date.year,date.month,date.day,\ query if query else ip) return ImpalaEngine.execute_query_as_list(dns_threat_query)
def incident_progression(date, query, ip): if not ip and not query: return None db = Configuration.db() return_value = "dns_qry_name" if ip else "ip_dst" dns_threat_query = (""" SELECT anchor,total,{0} FROM {1}.dns_threat_dendro WHERE y={2} AND m={3} AND d={4} AND anchor = '{5}' """).format(return_value,db,date.year,date.month,date.day,\ query if query else ip) return ImpalaEngine.execute_query_as_list(dns_threat_query)
def expanded_search(date,ip): db = Configuration.db() expanded_query = (""" SELECT min(treceived) as firstseen, max(treceived) as lastseen, sip as srcip, dip as dstip, sport as sport, dport as dport, count(sip) as conns, max(ipkt) as maxpkts, avg(ipkt) as avgpkts, max(ibyt) as maxbyts, avg(ibyt) as avgbyts FROM {0}.flow WHERE y={1} AND m={2} AND d={3} AND (sip ='{4}' OR dip='{4}') GROUP BY sip, dip,sport,dport """).format(db,date.year,date.month,date.day,ip) return ImpalaEngine.execute_query_as_list(expanded_query)
def _ingest_summary(self): # get date parameters. yr = self._date[:4] mn = self._date[4:6] dy = self._date[6:] self._logger.info("Getting ingest summary data for the day") ingest_summary_cols = ["date","total"] result_rows = [] df_filtered = pd.DataFrame() query_to_load = (""" SELECT frame_time, COUNT(*) as total FROM {0}.{1} WHERE y={2} AND m={3} AND d={4} AND unix_tstamp IS NOT NULL AND frame_time IS NOT NULL AND frame_len IS NOT NULL AND dns_qry_name IS NOT NULL AND ip_src IS NOT NULL AND (dns_qry_class IS NOT NULL AND dns_qry_type IS NOT NULL AND dns_qry_rcode IS NOT NULL ) GROUP BY frame_time; """).format(self._db,self._table_name, yr, mn, dy) results = impala.execute_query_as_list(query_to_load) df = pd.DataFrame(results) # Forms a new dataframe splitting the minutes from the time column df_new = pd.DataFrame([["{0}-{1}-{2} {3}:{4}".format(yr, mn, dy,\ val['frame_time'].replace(" "," ").split(" ")[3].split(":")[0].zfill(2),\ val['frame_time'].replace(" "," ").split(" ")[3].split(":")[1].zfill(2)),\ int(val['total']) if not math.isnan(val['total']) else 0 ] for key,val in df.iterrows()],columns = ingest_summary_cols) #Groups the data by minute sf = df_new.groupby(by=['date'])['total'].sum() df_per_min = pd.DataFrame({'date':sf.index, 'total':sf.values}) df_final = df_filtered.append(df_per_min, ignore_index=True).to_records(False,False) if len(df_final) > 0: query_to_insert=(""" INSERT INTO {0}.dns_ingest_summary PARTITION (y={1}, m={2}, d={3}) VALUES {4}; """).format(self._db, yr, mn, dy, tuple(df_final)) impala.execute_query(query_to_insert)
def suspicious_connections(date,ip=None,limit=250): db = Configuration.db() sc_query = (""" SELECT STRAIGHT_JOIN fs.tstart,fs.srcip,fs.dstip,fs.sport,fs.dport,proto, ipkt,ibyt,opkt,obyt,ml_score,rank,srcip_internal, dstip_internal,src_geoloc,dst_geoloc,src_domain, dst_domain,src_rep,dst_rep FROM {0}.flow_scores fs LEFT JOIN {0}.flow_threat_investigation ft ON (( fs.srcip = ft.srcip) OR ( fs.dstip = ft.dstip)) WHERE fs.y={1} AND fs.m={2} and fs.d={3} AND ( ft.srcip is NULL AND ft.dstip is NULL ) """).format(db,date.year,date.month,date.day) sc_filter = "" if ip: sc_filter = " AND ( fs.srcip='{0}' OR fs.dstip='{0}')".format(ip) sc_filter += " ORDER BY rank limit {0}".format(limit) sc_query = sc_query + sc_filter return ImpalaEngine.execute_query_as_list(sc_query)
def suspicious_connections(date,ip=None,limit=250): db = Configuration.db() sc_query = (""" SELECT STRAIGHT_JOIN fs.tstart,fs.srcip,fs.dstip,fs.sport,fs.dport,proto, ipkt,ibyt,opkt,obyt,ml_score,rank,srcip_internal, dstip_internal,src_geoloc,dst_geoloc,src_domain, dst_domain,src_rep,dst_rep FROM {0}.flow_scores fs LEFT JOIN {0}.flow_threat_investigation ft ON (( fs.srcip = ft.srcip) OR ( fs.dstip = ft.dstip)) WHERE fs.y={1} AND fs.m={2} and fs.d={3} AND ( ft.srcip is NULL AND ft.dstip is NULL ) """).format(db,date.year,date.month,date.day) sc_filter = "" if ip: sc_filter = " AND ( fs.srcip='{0}' OR fs.dstip='{0}')".format(ip) sc_filter += " ORDER BY rank limit {0}".format(limit) sc_query = sc_query + sc_filter return ImpalaEngine.execute_query_as_list(sc_query)
def suspicious_queries(date, ip=None, query=None,limit=250): db = Configuration.db() sq_query = (""" SELECT STRAIGHT_JOIN ds.unix_tstamp,frame_len,ds.ip_dst,ds.dns_qry_name, dns_qry_class,dns_qry_type,dns_qry_rcode,ml_score,tld, query_rep,hh,dns_qry_class_name,dns_qry_type_name, dns_qry_rcode_name,network_context FROM {0}.dns_scores ds LEFT JOIN {0}.dns_threat_investigation dt ON (ds.dns_qry_name = dt.dns_qry_name) WHERE ds.y={1} AND ds.m={2} AND ds.d={3} AND (dt.dns_qry_name is NULL) """).format(db,date.year,date.month,date.day) sq_filter = "" sq_filter += " AND ds.ip_dst = '{0}'".format(ip) if ip else "" sq_filter += " AND ds.dns_qry_name LIKE '%{0}%'".format(query) if query else "" sq_filter += " ORDER BY ds.ml_score limit {0}".format(limit) sq_query = sq_query + sq_filter return ImpalaEngine.execute_query_as_list(sq_query)
def create_timeline(anchor,clientips,date,top_results): response = "" susp_ips = [] if clientips: srtlist = sorted(list(clientips.items()), key=lambda x: x[1], reverse=True) for val in srtlist[:top_results]: susp_ips.append(val[0]) if anchor != "": db = Configuration.db() time_line_query = (""" SELECT p_threat,tstart,tend,duration,clientip,respcode,respcodename FROM {0}.proxy_timeline WHERE y={1} AND m={2} AND d={3} AND p_threat != '{4}' """).format(db,date.year,date.month,date.day,anchor.replace("'","//'")) tmp_timeline_data = ImpalaEngine.execute_query_as_list(time_line_query) imp_query = (""" INSERT INTO TABLE {0}.proxy_timeline PARTITION (y={2}, m={3},d={4}) SELECT '{7}' as p_threat, concat(cast(p_date as string), ' ', cast(MIN(p_time) as string)) AS tstart, concat(cast(p_date as string), ' ', cast(MAX(p_time) as string)) AS tend, SUM(duration) AS duration, clientip, respcode,"respCodeName" as respCodeName FROM {0}.proxy WHERE fulluri='{1}' AND clientip IN ({5}) AND y='{2}' AND m='{3}' AND d='{4}' GROUP BY clientip, p_time, respcode, p_date LIMIT {6} """)\ .format(db,anchor,date.year,str(date.month).zfill(2),\ str(date.day).zfill(2),("'" + "','".join(susp_ips) + "'")\ ,top_results,anchor) app_path = Configuration.spot() old_file = "{0}/proxy/hive/oa/timeline/y={1}/m={2}/d={3}"\ .format(app_path,date.year,date.month,date.day) HDFSClient.delete_folder(old_file,"impala") ImpalaEngine.execute_query("invalidate metadata") #Insert temporary values for item in tmp_timeline_data: insert_query = (""" INSERT INTO {0}.proxy_timeline PARTITION(y={1} , m={2} ,d={3}) VALUES ('{4}', '{5}', '{6}',{7},'{8}','{9}','{10}') """)\ .format(db,date.year,date.month,date.day,\ item["p_threat"],item["tstart"],item["tend"],item["duration"],item["clientip"],item["respcode"],item["respcodename"]) ImpalaEngine.execute_query(insert_query) ImpalaEngine.execute_query(imp_query) response = "Timeline successfully saved" else: response = "Timeline couldn't be created"
def create_timeline(anchor, clientips, date, top_results): response = "" susp_ips = [] if clientips: srtlist = sorted(list(clientips.items()), key=lambda x: x[1], reverse=True) for val in srtlist[:top_results]: susp_ips.append(val[0]) if anchor != "": db = Configuration.db() time_line_query = (""" SELECT p_threat,tstart,tend,duration,clientip,respcode,respcodename FROM {0}.proxy_timeline WHERE y={1} AND m={2} AND d={3} AND p_threat != '{4}' """).format(db, date.year, date.month, date.day, anchor.replace("'", "//'")) tmp_timeline_data = ImpalaEngine.execute_query_as_list(time_line_query) imp_query = (""" INSERT INTO TABLE {0}.proxy_timeline PARTITION (y={2}, m={3},d={4}) SELECT '{7}' as p_threat, concat(cast(p_date as string), ' ', cast(MIN(p_time) as string)) AS tstart, concat(cast(p_date as string), ' ', cast(MAX(p_time) as string)) AS tend, SUM(duration) AS duration, clientip, respcode,"respCodeName" as respCodeName FROM {0}.proxy WHERE fulluri='{1}' AND clientip IN ({5}) AND y='{2}' AND m='{3}' AND d='{4}' GROUP BY clientip, p_time, respcode, p_date LIMIT {6} """)\ .format(db,anchor,date.year,str(date.month).zfill(2),\ str(date.day).zfill(2),("'" + "','".join(susp_ips) + "'")\ ,top_results,anchor) app_path = Configuration.spot() old_file = "{0}/proxy/hive/oa/timeline/y={1}/m={2}/d={3}"\ .format(app_path,date.year,date.month,date.day) HDFSClient.delete_folder(old_file, "impala") ImpalaEngine.execute_query("invalidate metadata") #Insert temporary values for item in tmp_timeline_data: insert_query = (""" INSERT INTO {0}.proxy_timeline PARTITION(y={1} , m={2} ,d={3}) VALUES ('{4}', '{5}', '{6}',{7},'{8}','{9}','{10}') """)\ .format(db,date.year,date.month,date.day,\ item["p_threat"],item["tstart"],item["tend"],item["duration"],item["clientip"],item["respcode"],item["respcodename"]) ImpalaEngine.execute_query(insert_query) ImpalaEngine.execute_query(imp_query) response = "Timeline successfully saved" else: response = "Timeline couldn't be created"