def _get_proxy_details(self, fulluri, clientip, year, month, day, hh, proxy_iana):
    limit = 250
    value_string = ""

    query_to_load = ("""
        SELECT p_date, p_time, clientip, host, webcat, respcode, reqmethod,
            useragent, resconttype, referer, uriport, serverip, scbytes,
            csbytes, fulluri, {5} as hh
        FROM {0}.{1} WHERE y='{2}' AND m='{3}' AND d='{4}' AND h='{5}'
        AND fulluri='{6}' AND clientip='{7}' LIMIT {8};
        """).format(self._db, self._table_name, year, month, day, hh,
                    fulluri.replace("'", "\\'"), clientip, limit)

    detail_results = impala.execute_query(query_to_load)

    if proxy_iana:
        # add IANA translation of the response code to the results.
        self._logger.info("Adding IANA translation to details results")
        updated_rows = [conn + (proxy_iana.get_name(conn[5], "proxy_http_rcode"),)
                        for conn in detail_results]
        updated_rows = filter(None, updated_rows)
    else:
        # ("",) keeps the appended placeholder a one-element tuple;
        # a bare ("") is just a string and appends nothing.
        updated_rows = [conn + ("",) for conn in detail_results]

    for row in updated_rows:
        value_string += str(tuple(item for item in row)) + ","

    if value_string != "":
        # trim the trailing comma before formatting the batched VALUES list.
        query_to_insert = ("""
            INSERT INTO {0}.proxy_edge PARTITION (y={1}, m={2}, d={3})
            VALUES ({4});
            """).format(self._db, year, month, day, value_string[:-1])

        impala.execute_query(query_to_insert)
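# --- Example: batching rows into a single Impala INSERT ... VALUES statement.
# A minimal, standalone sketch of the pattern _get_proxy_details (and the other
# detail/score writers below) relies on: each row tuple is rendered with str(),
# the pieces are joined with commas, and the trailing comma is trimmed before
# the query is formatted. The rows and table name here are made up for
# illustration only.
rows = [("2016-01-01", "10:05", "10.0.0.1"), ("2016-01-01", "10:06", "10.0.0.2")]
value_string = ""
for row in rows:
    value_string += str(tuple(item for item in row)) + ","
insert_sql = "INSERT INTO example_db.example_table VALUES {0};".format(value_string[:-1])
# insert_sql ==
#   "INSERT INTO example_db.example_table VALUES
#    ('2016-01-01', '10:05', '10.0.0.1'),('2016-01-01', '10:06', '10.0.0.2');"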
def _get_dns_dendrogram(self):
    for conn in self._dns_scores:
        timestamp = conn[self._conf["dns_score_fields"]["unix_tstamp"]]
        full_date = datetime.datetime.utcfromtimestamp(
            int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')

        date = full_date.split(" ")[0].split("-")
        # get date parameters.
        yr = date[0]
        mn = date[1]
        dy = date[2]
        ip_dst = conn[self._conf["dns_score_fields"]["ip_dst"]]

        query_to_load = ("""
            INSERT INTO TABLE {0}.dns_dendro PARTITION (y={2}, m={3}, d={4})
            SELECT unix_tstamp, dns_a, dns_qry_name, ip_dst
            FROM (SELECT unix_tstamp, susp.ip_dst, susp.dns_qry_name, susp.dns_a
                FROM {0}.{1} as susp
                WHERE susp.y={2} AND susp.m={3} AND susp.d={4}
                AND susp.ip_dst='{5}' LIMIT {6}) AS tmp
            GROUP BY dns_a, dns_qry_name, ip_dst, unix_tstamp
            """).format(self._db, self._table_name, yr, mn, dy, ip_dst,
                        self._details_limit)

        impala.execute_query(query_to_load)
def _clear_previous_executions(self):
    self._logger.info("Cleaning data from previous executions for the day")
    yr = self._date[:4]
    mn = self._date[4:6]
    dy = self._date[6:]

    HUSER = self._spot_conf.get('conf', 'HUSER').replace("'", "").replace('"', '')
    table_schema = [
        'suspicious', 'edge', 'dendro', 'threat_dendro',
        'threat_investigation', 'storyboard', 'summary'
    ]

    for path in table_schema:
        HDFSClient.delete_folder(
            "{0}/{1}/hive/oa/{2}/y={3}/m={4}/d={5}".format(
                HUSER, self._table_name, path, yr, int(mn), int(dy)),
            user="******")
    impala.execute_query("invalidate metadata")

    # removes the feedback file.
    HDFSClient.delete_folder(
        "{0}/{1}/scored_results/{2}{3}{4}/feedback/ml_feedback.csv".format(
            HUSER, self._table_name, yr, mn, dy))
    # removes json files from the storyboard.
    HDFSClient.delete_folder("{0}/{1}/oa/{2}/{3}/{4}/{5}".format(
        HUSER, self._table_name, "storyboard", yr, mn, dy))
def create_dendro(expanded_search, date, anchor):
    db = Configuration.db()
    for row in expanded_search:
        dendro_query = ("""
            INSERT INTO {0}.dns_threat_dendro PARTITION (y={1}, m={2}, d={3})
            VALUES ('{4}', {5}, '{6}', '{7}')
            """).format(db, date.year, date.month, date.day, anchor,
                        row["total"], row["dnsQuery"], row["clientIp"])

        ImpalaEngine.execute_query(dendro_query)
def _ingest_summary(self):
    # get date parameters.
    yr = self._date[:4]
    mn = self._date[4:6]
    dy = self._date[6:]

    self._logger.info("Getting ingest summary data for the day")

    ingest_summary_cols = ["date", "total"]
    df_filtered = pd.DataFrame()

    # get ingest summary.
    query_to_load = ("""
        SELECT tryear, trmonth, trday, trhour, trminute, COUNT(*) as total
        FROM {0}.{1} WHERE y={2} AND m={3} AND d={4}
        AND unix_tstamp IS NOT NULL
        AND sip IS NOT NULL AND sport IS NOT NULL
        AND dip IS NOT NULL AND dport IS NOT NULL
        AND ibyt IS NOT NULL AND ipkt IS NOT NULL
        AND tryear={2} AND cast(treceived as timestamp) IS NOT NULL
        GROUP BY tryear, trmonth, trday, trhour, trminute;
        """).format(self._db, self._table_name, yr, mn, dy)

    results = impala.execute_query(query_to_load)

    if results:
        df_results = as_pandas(results)

        # forms a new dataframe with one "YYYY-MM-DD HH:MM" bucket per row.
        df_new = pd.DataFrame([["{0}-{1}-{2} {3}:{4}".format(
            val['tryear'], val['trmonth'], val['trday'],
            val['trhour'], val['trminute']),
            int(val['total']) if not math.isnan(val['total']) else 0]
            for key, val in df_results.iterrows()], columns=ingest_summary_cols)

        # groups the data by minute.
        sf = df_new.groupby(by=['date'])['total'].sum()
        df_per_min = pd.DataFrame({'date': sf.index, 'total': sf.values})

        df_final = df_filtered.append(df_per_min,
                                      ignore_index=True).to_records(False, False)

        if len(df_final) > 0:
            query_to_insert = ("""
                INSERT INTO {0}.flow_ingest_summary PARTITION (y={1}, m={2}, d={3})
                VALUES {4};
                """).format(self._db, yr, mn, dy, tuple(df_final))
            impala.execute_query(query_to_insert)
    else:
        self._logger.info("No data found for the ingest summary")
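# --- Example: the per-minute bucketing step used by the _ingest_summary
# variants, shown on synthetic data (the values are illustrative, not from the
# source). Rows sharing the same "YYYY-MM-DD HH:MM" bucket are summed.
import pandas as pd

df_new = pd.DataFrame([["2016-01-01 10:05", 3],
                       ["2016-01-01 10:05", 2],
                       ["2016-01-01 10:06", 7]], columns=["date", "total"])
sf = df_new.groupby(by=['date'])['total'].sum()
df_per_min = pd.DataFrame({'date': sf.index, 'total': sf.values})
# df_per_min:
#                  date  total
# 0  2016-01-01 10:05      5
# 1  2016-01-01 10:06      7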
def _get_dns_details(self, dns_qry_name, year, month, day, hh, dns_iana):
    value_string = ""
    query_to_load = ("""
        SELECT unix_tstamp, frame_len, ip_dst, ip_src, dns_qry_name,
            dns_qry_class, dns_qry_type, dns_qry_rcode, dns_a, h as hh
        FROM {0}.{1} WHERE y={2} AND m={3} AND d={4}
        AND dns_qry_name LIKE '%{5}%' AND h={6} LIMIT {7};
        """).format(self._db, self._table_name, year, month, day,
                    dns_qry_name, hh, self._details_limit)

    try:
        dns_details = impala.execute_query(query_to_load)
    except Exception:
        self._logger.info(
            "WARNING. Details couldn't be retrieved for {0}, skipping this step"
            .format(dns_qry_name))
    else:
        # add IANA translations to the results.
        if dns_iana:
            self._logger.info("Adding IANA translation to details results")
            dns_details = [
                conn + (dns_iana.get_name(str(conn[5]), "dns_qry_class"),
                        dns_iana.get_name(str(conn[6]), "dns_qry_type"),
                        dns_iana.get_name(str(conn[7]), "dns_qry_rcode"))
                for conn in dns_details
            ]
        else:
            self._logger.info("WARNING: NO IANA configured.")
            dns_details = [conn + ("", "", "") for conn in dns_details]

        # add network context for the destination IP when configured.
        nc_conf_file = "{0}/components/nc/nc_config.json".format(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        if os.path.isfile(nc_conf_file):
            nc_conf = json.loads(open(nc_conf_file).read())["NC"]
            dns_nc = NetworkContext(nc_conf, self._logger)
            dns_details = [conn + (dns_nc.get_nc(conn[2]),)
                           for conn in dns_details]
        else:
            dns_details = [conn + (0,) for conn in dns_details]

        for row in dns_details:
            value_string += str(tuple(item for item in row)) + ","

        if value_string != "":
            query_to_insert = ("""
                INSERT INTO {0}.dns_edge PARTITION (y={1}, m={2}, d={3})
                VALUES ({4});
                """).format(self._db, year, month, day, value_string[:-1])

            impala.execute_query(query_to_insert)
def _create_proxy_scores_csv(self):
    # get date parameters.
    yr = self._date[:4]
    mn = self._date[4:6]
    dy = self._date[6:]
    value_string = ""

    for row in self._proxy_scores:
        value_string += str(tuple(Util.cast_val(item) for item in row)) + ","

    load_into_impala = ("""
        INSERT INTO {0}.proxy_scores partition(y={2}, m={3}, d={4})
        VALUES {1}
        """).format(self._db, value_string[:-1], yr, mn, dy)
    impala.execute_query(load_into_impala)
def _create_flow_scores(self):
    # get date parameters.
    yr = self._date[:4]
    mn = self._date[4:6]
    dy = self._date[6:]
    value_string = ""

    for row in self._flow_scores:
        value_string += str(tuple(Util.cast_val(item) for item in row)) + ","

    load_into_impala = ("""
        INSERT INTO {0}.flow_scores partition(y={2}, m={3}, d={4})
        VALUES {1}
        """).format(self._db, value_string[:-1], yr, mn, dy)
    impala.execute_query(load_into_impala)
def create_time_line(anchor, inbound, outbound, twoway, date):
    top_keys = []
    if len(twoway) > 0:
        top_keys.extend(twoway.keys())
    if len(outbound) > 0:
        top_keys.extend(outbound.keys())
    if len(inbound) > 0:
        top_keys.extend(inbound.keys())

    db = Configuration.db()
    imp_query = ("""
        INSERT INTO TABLE {0}.flow_timeline PARTITION (y={4}, m={5}, d={6})
        SELECT '{7}', min(treceived) as tstart, max(treceived) as tend,
            sip as srcip, dip as dstip, proto as proto, sport as sport,
            dport AS dport, ipkt as ipkt, ibyt as ibyt
        FROM {0}.flow
        WHERE y={4} AND m={5} AND d={6}
        AND ((dip IN({1}) AND sip='{2}') OR (sip IN({1}) AND dip='{2}'))
        GROUP BY sip, dip, proto, sport, dport, ipkt, ibyt
        ORDER BY tstart
        LIMIT {3}
        """)

    ips = "'" + "','".join(top_keys) + "'"
    imp_query = imp_query.format(db, ips, anchor, 1000, date.year, date.month,
                                 date.day, anchor)

    if ImpalaEngine.execute_query(imp_query):
        return "Timeline successfully created \n"
    else:
        return "Timeline couldn't be created \n"
def save_comments(anchor, ip, query, title, text, date):
    db = Configuration.db()
    sb_query = ("""
        SELECT ip_threat, dns_threat, title, text
        FROM {0}.dns_storyboard
        WHERE y = {1} AND m = {2} AND d = {3}
        """).format(db, date.year, date.month, date.day)
    sb_data = ImpalaEngine.execute_query_as_list(sb_query)

    # find value if it already exists.
    saved = False
    for item in sb_data:
        if item["ip_threat"] == anchor or item["dns_threat"] == anchor:
            item["title"] = title
            item["text"] = text
            saved = True

    if not saved:
        sb_data.append({
            'text': text,
            'ip_threat': str(ip),
            'title': title,
            'dns_threat': query
        })

    # remove old file.
    app_path = Configuration.spot()
    old_file = "{0}/dns/hive/oa/storyboard/y={1}/m={2}/d={3}/" \
        .format(app_path, date.year, date.month, date.day)
    HDFSClient.delete_folder(old_file, "impala")
    ImpalaEngine.execute_query("invalidate metadata")

    for item in sb_data:
        insert_query = ("""
            INSERT INTO {0}.dns_storyboard PARTITION(y={1}, m={2}, d={3})
            VALUES ('{4}', '{5}', '{6}', '{7}')
            """).format(db, date.year, date.month, date.day,
                        item["ip_threat"], item["dns_threat"],
                        item["title"], item["text"])
        ImpalaEngine.execute_query(insert_query)

    return True
def _clear_previous_executions(self):
    self._logger.info("Cleaning data from previous executions for the day")
    yr = self._date[:4]
    mn = self._date[4:6]
    dy = self._date[6:]

    HUSER = self._spot_conf.get('conf', 'HUSER').replace("'", "").replace('"', '')
    table_schema = ['suspicious', 'edge', 'chords', 'threat_investigation',
                    'timeline', 'storyboard', 'summary']

    for path in table_schema:
        HDFSClient.delete_folder(
            "{0}/{1}/hive/oa/{2}/y={3}/m={4}/d={5}".format(
                HUSER, self._table_name, path, yr, int(mn), int(dy)),
            user="******")
    impala.execute_query("invalidate metadata")

    # removes the feedback file.
    HDFSClient.delete_folder(
        "{0}/{1}/scored_results/{2}{3}{4}/feedback/ml_feedback.csv".format(
            HUSER, self._table_name, yr, mn, dy))
    # removes json files from the storyboard.
    HDFSClient.delete_folder("{0}/{1}/oa/{2}/{3}/{4}/{5}".format(
        HUSER, self._table_name, "storyboard", yr, mn, dy))
def save_comment(ip, title, text, date):
    # get current table info.
    db = Configuration.db()
    sb_query = ("""
        SELECT ip_threat, title, text
        FROM {0}.flow_storyboard
        WHERE y = {1} AND m = {2} AND d = {3}
        """).format(db, date.year, date.month, date.day)
    sb_data = ImpalaEngine.execute_query_as_list(sb_query)

    # find value if it already exists.
    saved = False
    for item in sb_data:
        if item["ip_threat"] == ip:
            item["title"] = title
            item["text"] = text
            saved = True

    if not saved:
        sb_data.append({'text': text, 'ip_threat': str(ip), 'title': title})

    # remove old file manually to allow the comments update.
    app_path = Configuration.spot()
    old_file = "{0}/flow/hive/oa/storyboard/y={1}/m={2}/d={3}/" \
        .format(app_path, date.year, date.month, date.day)
    HDFSClient.delete_folder(old_file, "impala")
    ImpalaEngine.execute_query("invalidate metadata")

    for item in sb_data:
        insert_query = ("""
            INSERT INTO {0}.flow_storyboard PARTITION(y={1}, m={2}, d={3})
            VALUES ('{4}', '{5}', '{6}')
            """).format(db, date.year, date.month, date.day,
                        item["ip_threat"], item["title"], item["text"])
        ImpalaEngine.execute_query(insert_query)

    return True
def _create_dns_scores(self):
    # get date parameters.
    yr = self._date[:4]
    mn = self._date[4:6]
    dy = self._date[6:]
    value_string = ""

    dns_scores_final = self._move_time_stamp(self._dns_scores)
    self._dns_scores = dns_scores_final

    for row in dns_scores_final:
        value_string += str(tuple(Util.cast_val(item) for item in row)) + ","

    load_into_impala = ("""
        INSERT INTO {0}.dns_scores partition(y={2}, m={3}, d={4})
        VALUES {1}
        """).format(self._db, value_string[:-1], yr, mn, dy)
    impala.execute_query(load_into_impala)
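# --- Sketch: the three *_scores writers above repeat the same VALUES-building
# loop. A shared helper along these lines could consolidate them; the name
# build_values_clause is hypothetical, and the cast_val parameter stands in
# for the codebase's Util.cast_val (assumed to quote/escape a single value).
def build_values_clause(rows, cast_val=lambda v: v):
    """Render rows as a comma-separated VALUES list without the trailing comma."""
    value_string = ""
    for row in rows:
        value_string += str(tuple(cast_val(item) for item in row)) + ","
    return value_string[:-1]

# build_values_clause([(1, 'a'), (2, 'b')])  ->  "(1, 'a'),(2, 'b')"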
def _ingest_summary(self):
    # get date parameters.
    yr = self._date[:4]
    mn = self._date[4:6]
    dy = self._date[6:]

    self._logger.info("Getting ingest summary data for the day")

    ingest_summary_cols = ["date", "total"]
    df_filtered = pd.DataFrame()

    # get ingest summary.
    query_to_load = ("""
        SELECT p_date, p_time, COUNT(*) as total
        FROM {0}.{1} WHERE y='{2}' AND m='{3}' AND d='{4}'
        AND p_date IS NOT NULL AND p_time IS NOT NULL
        AND clientip IS NOT NULL AND p_time != ''
        AND host IS NOT NULL AND fulluri IS NOT NULL
        GROUP BY p_date, p_time;
        """).format(self._db, self._table_name, yr, mn, dy)

    results = impala.execute_query(query_to_load)

    if results:
        df_results = as_pandas(results)

        # forms a new dataframe splitting the minutes from the time column.
        df_new = pd.DataFrame([["{0} {1}:{2}".format(val['p_date'],
            val['p_time'].split(":")[0].zfill(2),
            val['p_time'].split(":")[1].zfill(2)),
            int(val['total']) if not math.isnan(val['total']) else 0]
            for key, val in df_results.iterrows()], columns=ingest_summary_cols)

        # groups the data by minute.
        sf = df_new.groupby(by=['date'])['total'].sum()
        df_per_min = pd.DataFrame({'date': sf.index, 'total': sf.values})

        df_final = df_filtered.append(df_per_min,
                                      ignore_index=True).to_records(False, False)

        if len(df_final) > 0:
            query_to_insert = ("""
                INSERT INTO {0}.proxy_ingest_summary PARTITION (y={1}, m={2}, d={3})
                VALUES {4};
                """).format(self._db, yr, mn, dy, tuple(df_final))
            impala.execute_query(query_to_insert)
    else:
        self._logger.info("No data found for the ingest summary")
def _ingest_summary(self):
    # get date parameters.
    yr = self._date[:4]
    mn = self._date[4:6]
    dy = self._date[6:]

    self._logger.info("Getting ingest summary data for the day")

    ingest_summary_cols = ["date", "total"]
    df_filtered = pd.DataFrame()

    query_to_load = ("""
        SELECT frame_time, COUNT(*) as total
        FROM {0}.{1} WHERE y={2} AND m={3} AND d={4}
        AND unix_tstamp IS NOT NULL AND frame_time IS NOT NULL
        AND frame_len IS NOT NULL AND dns_qry_name IS NOT NULL
        AND ip_src IS NOT NULL
        AND (dns_qry_class IS NOT NULL AND dns_qry_type IS NOT NULL
             AND dns_qry_rcode IS NOT NULL)
        GROUP BY frame_time;
        """).format(self._db, self._table_name, yr, mn, dy)

    results = impala.execute_query_as_list(query_to_load)
    df = pd.DataFrame(results)

    # forms a new dataframe splitting the minutes from the time column;
    # double spaces in frame_time are collapsed first so the positional
    # split on " " stays stable.
    df_new = pd.DataFrame([["{0}-{1}-{2} {3}:{4}".format(yr, mn, dy,
        val['frame_time'].replace("  ", " ").split(" ")[3].split(":")[0].zfill(2),
        val['frame_time'].replace("  ", " ").split(" ")[3].split(":")[1].zfill(2)),
        int(val['total']) if not math.isnan(val['total']) else 0]
        for key, val in df.iterrows()], columns=ingest_summary_cols)

    # groups the data by minute.
    sf = df_new.groupby(by=['date'])['total'].sum()
    df_per_min = pd.DataFrame({'date': sf.index, 'total': sf.values})

    df_final = df_filtered.append(df_per_min,
                                  ignore_index=True).to_records(False, False)

    if len(df_final) > 0:
        query_to_insert = ("""
            INSERT INTO {0}.dns_ingest_summary PARTITION (y={1}, m={2}, d={3})
            VALUES {4};
            """).format(self._db, yr, mn, dy, tuple(df_final))
        impala.execute_query(query_to_insert)
def _get_suspicious_details(self, bar=None):
    # skip header.
    sp_connections = iter(self._flow_scores)

    # loop connections.
    connections_added = []
    for conn in sp_connections:
        # validate that the connection's details are not already extracted.
        if conn in connections_added:
            continue
        else:
            connections_added.append(conn)

        src_ip_index = self._conf["flow_score_fields"]["srcIP"]
        dst_ip_index = self._conf["flow_score_fields"]["dstIP"]

        # get src ip.
        sip = conn[src_ip_index]
        # get dst ip.
        dip = conn[dst_ip_index]

        # get hour and date (i.e. 2014-07-08 10:10:40).
        date_array = conn[0].split(' ')
        date_array_1 = date_array[0].split('-')
        date_array_2 = date_array[1].split(':')

        yr = date_array_1[0]
        dy = date_array_1[2]
        mh = date_array_1[1]
        hr = date_array_2[0]
        mm = date_array_2[1]

        query_to_load = ("""
            INSERT INTO TABLE {0}.flow_edge PARTITION (y={2}, m={3}, d={4})
            SELECT treceived as tstart, sip as srcip, dip as dstip,
                sport as sport, dport as dport, proto as proto, flag as flags,
                stos as tos, ibyt as ibyt, ipkt as ipkt, input as input,
                output as output, rip as rip, obyt as obyt, opkt as opkt,
                h as hh, trminute as mn
            FROM {0}.{1}
            WHERE ((sip='{7}' AND dip='{8}') OR (sip='{8}' AND dip='{7}'))
            AND y={2} AND m={3} AND d={4} AND h={5} AND trminute={6};
            """).format(self._db, self._table_name, yr, mh, dy, hr, mm, sip, dip)

        impala.execute_query(query_to_load)
def reset_scored_connections(date):
    flow_storyboard = "flow/hive/oa/storyboard"
    flow_threat_investigation = "flow/hive/oa/threat_investigation"
    flow_timeline = "flow/hive/oa/timeline"
    app_path = Configuration.spot()

    try:
        # remove parquet files manually to allow the comments update.
        HDFSClient.delete_folder("{0}/{1}/y={2}/m={3}/d={4}/".format(
            app_path, flow_storyboard, date.year, date.month, date.day),
            "impala")
        HDFSClient.delete_folder("{0}/{1}/y={2}/m={3}/d={4}/".format(
            app_path, flow_threat_investigation, date.year, date.month, date.day),
            "impala")
        HDFSClient.delete_folder("{0}/{1}/y={2}/m={3}/d={4}/".format(
            app_path, flow_timeline, date.year, date.month, date.day),
            "impala")
        ImpalaEngine.execute_query("invalidate metadata")
        return True
    except HdfsError:
        return False
def reset_scored_connections(date):
    # parquet folders for the proxy storyboard, threat-investigation and
    # timeline tables.
    proxy_storyboard = "proxy/hive/oa/storyboard"
    proxy_threat_investigation = "proxy/hive/oa/threat_investigation"
    proxy_timeline = "proxy/hive/oa/timeline"
    app_path = Configuration.spot()

    try:
        # remove parquet files manually to allow the comments update.
        HDFSClient.delete_folder("{0}/{1}/y={2}/m={3}/d={4}/".format(
            app_path, proxy_storyboard, date.year, date.month, date.day),
            "impala")
        HDFSClient.delete_folder("{0}/{1}/y={2}/m={3}/d={4}/".format(
            app_path, proxy_threat_investigation, date.year, date.month, date.day),
            "impala")
        HDFSClient.delete_folder("{0}/{1}/y={2}/m={3}/d={4}/".format(
            app_path, proxy_timeline, date.year, date.month, date.day),
            "impala")
        ImpalaEngine.execute_query("invalidate metadata")
        return True
    except HdfsError:
        return False
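# --- Usage sketch for reset_scored_connections (hypothetical driver code):
# both the flow and proxy variants take a date object and return True on
# success, or False if the HDFS delete raises HdfsError.
import datetime

if reset_scored_connections(date=datetime.date(2016, 1, 1)):
    print("scored connections reset")
else:
    print("reset failed")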
def _get_chord_details(self, bar=None):
    # skip header.
    sp_connections = iter(self._flow_scores)

    src_ip_index = self._conf["flow_score_fields"]["srcIP"]
    dst_ip_index = self._conf["flow_score_fields"]["dstIP"]

    # get date parameters.
    yr = self._date[:4]
    mn = self._date[4:6]
    dy = self._date[6:]

    # get number of times each IP appears.
    srcdict = {}
    for conn in sp_connections:
        if conn[src_ip_index] in srcdict:
            srcdict[conn[src_ip_index]] += 1
        else:
            srcdict[conn[src_ip_index]] = 1
        if conn[dst_ip_index] in srcdict:
            srcdict[conn[dst_ip_index]] += 1
        else:
            srcdict[conn[dst_ip_index]] = 1

    for ip, n in srcdict.items():
        if n > 1:
            ip_list = []
            sp_connections = iter(self._flow_scores)
            for row in sp_connections:
                if ip == row[1]:
                    ip_list.append(row[2])
                if ip == row[2]:
                    ip_list.append(row[1])
            ips = list(set(ip_list))

            if len(ips) > 1:
                ips_filter = (",".join("'{0}'".format(ip) for ip in ips))
                query_to_load = ("""
                    INSERT INTO TABLE {0}.flow_chords PARTITION (y={2}, m={3}, d={4})
                    SELECT '{5}' as ip_threat, sip as srcip, dip as dstip,
                        SUM(ibyt) as ibyt, SUM(ipkt) as ipkt
                    FROM {0}.{1}
                    WHERE y={2} AND m={3} AND d={4}
                    AND ((sip='{5}' AND dip IN({6})) OR (sip IN({6}) AND dip='{5}'))
                    GROUP BY sip, dip, m, d;
                    """).format(self._db, self._table_name, yr, mn, dy, ip,
                                ips_filter)

                impala.execute_query(query_to_load)
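# --- Example: the IP frequency count in _get_chord_details expressed with
# collections.Counter; a compact equivalent on synthetic rows (indexes 1 and 2
# standing in for srcIP/dstIP, as in the method above).
from collections import Counter

flow_scores = [("t0", "10.0.0.1", "10.0.0.2"), ("t1", "10.0.0.1", "10.0.0.3")]
srcdict = Counter()
for conn in flow_scores:
    srcdict[conn[1]] += 1
    srcdict[conn[2]] += 1
# Counter({'10.0.0.1': 2, '10.0.0.2': 1, '10.0.0.3': 1})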
def score_connection(date, ip="", dns="", ip_sev=0, dns_sev=0): if (not ip and not ip_sev) and (not dns and not dns_sev): return False db = Configuration.db() sq_query = (""" SELECT frame_time,unix_tstamp,frame_len,ip_dst,dns_qry_name,dns_qry_class, dns_qry_type,dns_qry_rcode,ml_score,tld,query_rep, hh,dns_qry_class_name,dns_qry_type_name,dns_qry_rcode_name, network_context FROM {0}.dns_scores WHERE y={1} and m={2} and d={3} AND ( """).format(db, date.year, date.month, date.day) connections_filter = "" connections_filter += "ip_dst = '{0}' ".format(ip) if ip else "" connections_filter += " OR " if ip and dns else "" connections_filter += "dns_qry_name = '{0}' ".format(dns) if dns else "" connections_filter += ")" connections = ImpalaEngine.execute_query(sq_query + connections_filter) # add score to connections insert_command = ("""INSERT INTO {0}.dns_threat_investigation PARTITION (y={1},m={2},d={3}) VALUES (""") \ .format(db,date.year,date.month,date.day) fb_data = [] first = True num_rows = 0 for row in connections: # insert into dns_threat_investigation. threat_data = (row[1],row[3],row[4],ip_sev if ip == row[3] else 0,\ dns_sev if dns == row[4] else 0) fb_data.append([row[0],row[2],row[3],row[4],row[5],row[6],row[7],\ row[8],row[9],row[10],row[11],ip_sev,dns_sev,row[12],row[13],row[14],\ row[15],row[1]]) insert_command += "{0}{1}".format("," if not first else "", threat_data) first = False num_rows += 1 insert_command += ")" if num_rows > 0: ImpalaEngine.execute_query(insert_command) # create feedback file. app_path = Configuration.spot() feedback_path = "{0}/dns/scored_results/{1}{2}{3}/feedback"\ .format(app_path,date.year,str(date.month).zfill(2),str(date.day).zfill(2)) ap_file = True if len(HDFSClient.list_dir(feedback_path)) == 0: fb_data.insert(0,["frame_time","frame_len","ip_dst","dns_qry_name",\ "dns_qry_class","dns_qry_type","dns_qry_rcode","score","tld","query_rep",\ "hh","ip_sev","dns_sev","dns_qry_class_name","dns_qry_type_name",\ "dns_qry_rcode_name","network_context","unix_tstamp"]) ap_file = False HDFSClient.put_file_csv(fb_data, feedback_path, "ml_feedback.csv", append_file=ap_file) return True
def score_connection(date,ip="", dns="", ip_sev=0, dns_sev=0): if (not ip and not ip_sev) and (not dns and not dns_sev): return False db = Configuration.db() sq_query = (""" SELECT frame_time,unix_tstamp,frame_len,ip_dst,dns_qry_name,dns_qry_class, dns_qry_type,dns_qry_rcode,ml_score,tld,query_rep, hh,dns_qry_class_name,dns_qry_type_name,dns_qry_rcode_name, network_context FROM {0}.dns_scores WHERE y={1} and m={2} and d={3} AND ( """).format(db,date.year,date.month,date.day) connections_filter = "" connections_filter += "ip_dst = '{0}' ".format(ip) if ip else "" connections_filter += " OR " if ip and dns else "" connections_filter += "dns_qry_name = '{0}' ".format(dns) if dns else "" connections_filter += ")" connections = ImpalaEngine.execute_query(sq_query + connections_filter) # add score to connections insert_command = ("""INSERT INTO {0}.dns_threat_investigation PARTITION (y={1},m={2},d={3}) VALUES (""") \ .format(db,date.year,date.month,date.day) fb_data = [] first = True num_rows = 0 for row in connections: # insert into dns_threat_investigation. threat_data = (row[1],row[3],row[4],ip_sev if ip == row[3] else 0,\ dns_sev if dns == row[4] else 0) fb_data.append([row[0],row[2],row[3],row[4],row[5],row[6],row[7],\ row[8],row[9],row[10],row[11],ip_sev,dns_sev,row[12],row[13],row[14],\ row[15],row[1]]) insert_command += "{0}{1}".format("," if not first else "", threat_data) first = False num_rows += 1 insert_command += ")" if num_rows > 0: ImpalaEngine.execute_query(insert_command) # create feedback file. app_path = Configuration.spot() feedback_path = "{0}/dns/scored_results/{1}{2}{3}/feedback"\ .format(app_path,date.year,str(date.month).zfill(2),str(date.day).zfill(2)) ap_file = True if len(HDFSClient.list_dir(feedback_path)) == 0: fb_data.insert(0,["frame_time","frame_len","ip_dst","dns_qry_name",\ "dns_qry_class","dns_qry_type","dns_qry_rcode","score","tld","query_rep",\ "hh","ip_sev","dns_sev","dns_qry_class_name","dns_qry_type_name",\ "dns_qry_rcode_name","network_context","unix_tstamp"]) ap_file = False HDFSClient.put_file_csv(fb_data,feedback_path,"ml_feedback.csv",append_file=ap_file) return True
def score_request(date, score, uri):
    if not score and not uri:
        return None

    db = Configuration.db()
    p_query = ("""
        SELECT tdate, time, clientip, host, reqmethod, useragent, resconttype,
            duration, username, webcat, referer, respcode, uriport, uripath,
            uriquery, serverip, scbytes, csbytes, fulluri, word, ml_score,
            uri_rep, respcode_name, network_context
        FROM {0}.proxy_scores
        WHERE y={1} and m={2} and d={3} AND fulluri = '{4}'
        """).format(db, date.year, date.month, date.day, uri)

    connections = ImpalaEngine.execute_query(p_query)

    # add score to connections.
    insert_command = ("""
        INSERT INTO {0}.proxy_threat_investigation
        PARTITION (y={1}, m={2}, d={3})
        VALUES (""").format(db, date.year, date.month, date.day)

    fb_data = []
    first = True
    num_rows = 0
    for row in connections:
        cip_index = row[2]   # clientip
        uri_index = row[18]  # fulluri
        tme_index = row[1]   # time; its hour is appended to the hash below
        hash_field = [str(md5.new(str(cip_index) + str(uri_index)).hexdigest()
                          + str((tme_index.split(":"))[0]))]

        threat_data = (row[0], row[18], score)
        fb_data.append([row[0], row[1], row[2], row[3], row[4], row[5], row[6],
                        row[7], row[8], row[9], row[10], row[11], row[12],
                        row[13], row[14], row[15], row[16], row[17], row[18],
                        row[19], score, row[20], row[21], row[22], row[23],
                        hash_field])
        insert_command += "{0}{1}".format("," if not first else "", threat_data)
        first = False
        num_rows += 1
    insert_command += ")"

    if num_rows > 0:
        ImpalaEngine.execute_query(insert_command)

    # create feedback file.
    app_path = Configuration.spot()
    feedback_path = "{0}/proxy/scored_results/{1}{2}{3}/feedback".format(
        app_path, date.year, str(date.month).zfill(2), str(date.day).zfill(2))

    ap_file = True
    if len(HDFSClient.list_dir(feedback_path)) == 0:
        fb_data.insert(0, ["p_date", "p_time", "clientip", "host", "reqmethod",
                           "useragent", "resconttype", "duration", "username",
                           "webcat", "referer", "respcode", "uriport",
                           "uripath", "uriquery", "serverip", "scbytes",
                           "csbytes", "fulluri", "word", "score", "uri_rep",
                           "uri_sev", "respcode_name", "network_context",
                           "hash"])
        ap_file = False

    HDFSClient.put_file_csv(fb_data, feedback_path, "ml_feedback.csv",
                            append_file=ap_file)
    return True
def create_timeline(anchor, clientips, date, top_results):
    response = ""
    susp_ips = []

    if clientips:
        srtlist = sorted(list(clientips.items()), key=lambda x: x[1],
                         reverse=True)
        for val in srtlist[:top_results]:
            susp_ips.append(val[0])

    if anchor != "":
        db = Configuration.db()
        time_line_query = ("""
            SELECT p_threat, tstart, tend, duration, clientip, respcode,
                respcodename
            FROM {0}.proxy_timeline
            WHERE y={1} AND m={2} AND d={3} AND p_threat != '{4}'
            """).format(db, date.year, date.month, date.day,
                        anchor.replace("'", "\\'"))

        tmp_timeline_data = ImpalaEngine.execute_query_as_list(time_line_query)

        imp_query = ("""
            INSERT INTO TABLE {0}.proxy_timeline
            PARTITION (y={2}, m={3}, d={4})
            SELECT '{7}' as p_threat,
                concat(cast(p_date as string), ' ',
                       cast(MIN(p_time) as string)) AS tstart,
                concat(cast(p_date as string), ' ',
                       cast(MAX(p_time) as string)) AS tend,
                SUM(duration) AS duration, clientip, respcode,
                "respCodeName" as respCodeName
            FROM {0}.proxy
            WHERE fulluri='{1}' AND clientip IN ({5})
            AND y='{2}' AND m='{3}' AND d='{4}'
            GROUP BY clientip, p_time, respcode, p_date
            LIMIT {6}
            """).format(db, anchor, date.year, str(date.month).zfill(2),
                        str(date.day).zfill(2),
                        ("'" + "','".join(susp_ips) + "'"),
                        top_results, anchor)

        app_path = Configuration.spot()
        old_file = "{0}/proxy/hive/oa/timeline/y={1}/m={2}/d={3}".format(
            app_path, date.year, date.month, date.day)
        HDFSClient.delete_folder(old_file, "impala")
        ImpalaEngine.execute_query("invalidate metadata")

        # re-insert the rows kept for the other threats.
        for item in tmp_timeline_data:
            insert_query = ("""
                INSERT INTO {0}.proxy_timeline
                PARTITION (y={1}, m={2}, d={3})
                VALUES ('{4}', '{5}', '{6}', {7}, '{8}', '{9}', '{10}')
                """).format(db, date.year, date.month, date.day,
                            item["p_threat"], item["tstart"], item["tend"],
                            item["duration"], item["clientip"],
                            item["respcode"], item["respcodename"])
            ImpalaEngine.execute_query(insert_query)

        ImpalaEngine.execute_query(imp_query)
        response = "Timeline successfully saved"
    else:
        response = "Timeline couldn't be created"

    return response
def score_connection(score, date, src_ip=None, dst_ip=None, src_port=None,
                     dst_port=None):
    if not src_ip and not dst_ip and not src_port and not dst_port:
        return False

    db = Configuration.db()
    # get connections to score.
    connections_query = ("""
        SELECT tstart, srcip, dstip, sport, dport, ibyt, ipkt
        FROM {0}.flow_scores
        WHERE y = {1} AND m = {2} AND d = {3}
        """).format(db, date.year, date.month, date.day)

    connections_filter = ""
    connections_filter += " AND srcip = '{0}'".format(src_ip) if src_ip else ""
    connections_filter += " AND dstip = '{0}'".format(dst_ip) if dst_ip else ""
    connections_filter += " AND sport = {0}".format(str(src_port)) if src_port else ""
    connections_filter += " AND dport = {0}".format(str(dst_port)) if dst_port else ""

    connections = ImpalaEngine.execute_query(connections_query +
                                             connections_filter)

    # add score to connections.
    insert_command = ("""
        INSERT INTO {0}.flow_threat_investigation
        PARTITION (y={1}, m={2}, d={3})
        VALUES (""").format(db, date.year, date.month, date.day)

    fb_data = []
    first = True
    num_rows = 0
    for row in connections:
        # insert into flow_threat_investigation.
        threat_data = (row[0], row[1], row[2], row[3], row[4], score)
        fb_data.append([score, row[0], row[1], row[2], row[3], row[4],
                        row[5], row[6]])
        insert_command += "{0}{1}".format("," if not first else "", threat_data)
        first = False
        num_rows += 1
    insert_command += ")"

    if num_rows > 0:
        ImpalaEngine.execute_query(insert_command)

    # create feedback file.
    app_path = Configuration.spot()
    feedback_path = "{0}/flow/scored_results/{1}{2}{3}/feedback".format(
        app_path, date.year, str(date.month).zfill(2), str(date.day).zfill(2))

    append_file = True
    if len(HDFSClient.list_dir(feedback_path)) == 0:
        fb_data.insert(0, ["sev", "tstart", "sip", "dip", "sport", "dport",
                           "ipkt", "ibyt"])
        append_file = False

    HDFSClient.put_file_csv(fb_data, feedback_path, "ml_feedback.csv",
                            append_file=append_file)
    return True
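# --- Usage sketch for the flow score_connection (hypothetical driver code):
# mark every suspicious connection between two endpoints for a given day.
import datetime

score_connection(score=1,
                 date=datetime.date(2016, 1, 1),
                 src_ip="10.0.0.1",
                 dst_ip="10.0.0.2")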