Example #1
    def _get_proxy_details(self,fulluri,clientip,year,month,day,hh,proxy_iana):
        limit = 250 
        value_string = ""
        
        query_to_load =("""
            SELECT p_date, p_time, clientip, host, webcat, respcode, reqmethod, useragent, resconttype,
            referer, uriport, serverip, scbytes, csbytes, fulluri, {5} as hh
            FROM {0}.{1} WHERE y='{2}' AND m='{3}' AND d='{4}' AND
            h='{5}' AND fulluri='{6}' AND clientip='{7}' LIMIT {8};
        """).format(self._db,self._table_name, year,month,day,hh,fulluri.replace("'","\\'"),clientip,limit)

        detail_results = impala.execute_query(query_to_load)
 
        if proxy_iana:
            # add IANA response code translation to results.
            self._logger.info("Adding IANA translation to details results")

            updated_rows = [conn + (proxy_iana.get_name(conn[5], "proxy_http_rcode"),) for conn in detail_results]
            updated_rows = filter(None, updated_rows)
        else:
            updated_rows = [conn + ("",) for conn in detail_results]
 
        for row in updated_rows:
            value_string += str(tuple(item for item in row)) + ","     
        
        if value_string != "":  
            query_to_insert=("""
                INSERT INTO {0}.proxy_edge PARTITION (y={1}, m={2}, d={3}) VALUES ({4});
            """).format(self._db,year, month, day, value_string[:-1])

            impala.execute_query(query_to_insert) 
Example #2
    def _get_dns_dendrogram(self):

        for conn in self._dns_scores:
            timestamp = conn[self._conf["dns_score_fields"]["unix_tstamp"]]
            full_date = datetime.datetime.utcfromtimestamp(
                int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')

            date = full_date.split(" ")[0].split("-")
            # get date parameters.

            yr = date[0]
            mn = date[1]
            dy = date[2]
            ip_dst = conn[self._conf["dns_score_fields"]["ip_dst"]]

            query_to_load = ("""
                INSERT INTO TABLE {0}.dns_dendro PARTITION (y={2}, m={3},d={4})
                SELECT unix_tstamp, dns_a, dns_qry_name, ip_dst
                FROM (SELECT unix_tstamp, susp.ip_dst, susp.dns_qry_name, susp.dns_a
                    FROM {0}.{1} as susp WHERE susp.y={2} AND susp.m={3} AND susp.d={4} AND susp.ip_dst='{5}'
                LIMIT {6}) AS tmp GROUP BY dns_a, dns_qry_name, ip_dst, unix_tstamp
            """).format(self._db, self._table_name, yr, mn, dy, ip_dst,
                        self._details_limit)

            impala.execute_query(query_to_load)
Example #3
    def _clear_previous_executions(self):

        self._logger.info("Cleaning data from previous executions for the day")
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]
        table_schema = []
        HUSER = self._spot_conf.get('conf',
                                    'HUSER').replace("'", "").replace('"', '')
        table_schema = [
            'suspicious', 'edge', 'dendro', 'threat_dendro',
            'threat_investigation', 'storyboard', 'summary'
        ]

        for path in table_schema:
            HDFSClient.delete_folder(
                "{0}/{1}/hive/oa/{2}/y={3}/m={4}/d={5}".format(
                    HUSER, self._table_name, path, yr, int(mn), int(dy)),
                user="******")
        impala.execute_query("invalidate metadata")

        #removes Feedback file
        HDFSClient.delete_folder(
            "{0}/{1}/scored_results/{2}{3}{4}/feedback/ml_feedback.csv".format(
                HUSER, self._table_name, yr, mn, dy))
        #removes json files from the storyboard
        HDFSClient.delete_folder("{0}/{1}/oa/{2}/{3}/{4}/{5}".format(
            HUSER, self._table_name, "storyboard", yr, mn, dy))
Example #4
def create_dendro(expanded_search,date,anchor):

    db = Configuration.db()
    for row in expanded_search:
	dendro_query = ("""
		INSERT INTO {0}.dns_threat_dendro PARTITION (y={1}, m={2},d={3})
		VALUES ( '{4}',{5},'{6}','{7}')
		""")\
        .format(db,date.year,date.month,date.day,anchor,\
        row["total"],row["dnsQuery"],row["clientIp"])

	ImpalaEngine.execute_query(dendro_query)
Example #5
def create_dendro(expanded_search, date, anchor):

    db = Configuration.db()
    for row in expanded_search:
        dendro_query = ("""
		INSERT INTO {0}.dns_threat_dendro PARTITION (y={1}, m={2},d={3})
		VALUES ( '{4}',{5},'{6}','{7}')
		""")\
               .format(db,date.year,date.month,date.day,anchor,\
               row["total"],row["dnsQuery"],row["clientIp"])

        ImpalaEngine.execute_query(dendro_query)
Example #6
    def _ingest_summary(self):
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        self._logger.info("Getting ingest summary data for the day")
        
        ingest_summary_cols = ["date","total"]		
        result_rows = []        
        df_filtered =  pd.DataFrame()

        # get ingest summary.

        query_to_load=("""
                SELECT tryear, trmonth, trday, trhour, trminute, COUNT(*) as total
                FROM {0}.{1} WHERE y={2} AND m={3} AND d={4}
                AND unix_tstamp IS NOT NULL
                AND sip IS NOT NULL
                AND sport IS NOT NULL
                AND dip IS NOT NULL
                AND dport IS NOT NULL
                AND ibyt IS NOT NULL
                AND ipkt IS NOT NULL
                AND tryear={2}
                AND cast(treceived as timestamp) IS NOT NULL
                GROUP BY tryear, trmonth, trday, trhour, trminute;
        """).format(self._db,self._table_name, yr, mn, dy)
        
        results = impala.execute_query(query_to_load) 
 
        if results:
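            # materialize the Impala result cursor as a pandas dataframe.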
            df_results = as_pandas(results) 
            
            # Forms a new dataframe with one "Y-M-D H:M" timestamp per row.
            df_new = pd.DataFrame(
                [["{0}-{1}-{2} {3}:{4}".format(val['tryear'], val['trmonth'], val['trday'], val['trhour'], val['trminute']),
                  int(val['total']) if not math.isnan(val['total']) else 0]
                 for key, val in df_results.iterrows()],
                columns=ingest_summary_cols)
            value_string = ''
            #Groups the data by minute 

            sf = df_new.groupby(by=['date'])['total'].sum()
            df_per_min = pd.DataFrame({'date':sf.index, 'total':sf.values})
            
            df_final = df_filtered.append(df_per_min, ignore_index=True).to_records(False,False) 
            if len(df_final) > 0:
                query_to_insert=("""
                    INSERT INTO {0}.flow_ingest_summary PARTITION (y={1}, m={2}, d={3}) VALUES {4};
                """).format(self._db, yr, mn, dy, tuple(df_final))

                impala.execute_query(query_to_insert)
                
        else:
            self._logger.info("No data found for the ingest summary")
Example #7
    def _get_dns_details(self, dns_qry_name, year, month, day, hh, dns_iana):
        value_string = ""
        query_to_load = ("""
            SELECT unix_tstamp,frame_len,ip_dst,ip_src,dns_qry_name,dns_qry_class,dns_qry_type,dns_qry_rcode,dns_a,h as hh
            FROM {0}.{1} WHERE y={2} AND m={3} AND d={4} AND dns_qry_name LIKE '%{5}%' AND h={6} LIMIT {7};
        """).format(self._db, self._table_name, year, month, day, dns_qry_name,
                    hh, self._details_limit)

        try:
            dns_details = impala.execute_query(query_to_load)
        except Exception:
            self._logger.info(
                "WARNING. Details couldn't be retrieved for {0}, skipping this step"
                .format(dns_qry_name))
        else:
            # add IANA to results.
            update_rows = []
            if dns_iana:
                self._logger.info("Adding IANA translation to details results")

                dns_details = [
                    conn + (dns_iana.get_name(str(conn[5]), "dns_qry_class"),
                            dns_iana.get_name(str(conn[6]), "dns_qry_type"),
                            dns_iana.get_name(str(conn[7]), "dns_qry_rcode"))
                    for conn in dns_details
                ]
            else:
                self._logger.info("WARNING: NO IANA configured.")
                dns_details = [conn + ("", "", "") for conn in dns_details]

            nc_conf_file = "{0}/components/nc/nc_config.json".format(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            if os.path.isfile(nc_conf_file):
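                # enrich each row with network context looked up by destination IP (conn[2]).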
                nc_conf = json.loads(open(nc_conf_file).read())["NC"]
                dns_nc = NetworkContext(nc_conf, self._logger)
                dns_details = [
                    conn + (dns_nc.get_nc(conn[2]), ) for conn in dns_details
                ]
            else:
                dns_details = [conn + (0, ) for conn in dns_details]

            for row in dns_details:
                value_string += str(tuple(item for item in row)) + ","

            if value_string != "":

                query_to_insert = ("""
                    INSERT INTO {0}.dns_edge PARTITION (y={1}, m={2}, d={3}) VALUES ({4});
                """).format(self._db, year, month, day, value_string[:-1])

                impala.execute_query(query_to_insert)
Example #8
    def _create_proxy_scores_csv(self):
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:] 
        value_string = ""
 
        for row in self._proxy_scores:
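            # Util.cast_val coerces each value so the tuple prints as a valid SQL literal.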
            value_string += str(tuple(Util.cast_val(item) for item in row)) + ","              
    
        load_into_impala = ("""
             INSERT INTO {0}.proxy_scores partition(y={2}, m={3}, d={4}) VALUES {1}
        """).format(self._db, value_string[:-1], yr, mn, dy) 
        impala.execute_query(load_into_impala)
Example #9
    def _create_flow_scores(self):

        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:] 
        value_string = ""

        for row in self._flow_scores:
            value_string += str(tuple(Util.cast_val(item) for item in row)) + ","              
    
        load_into_impala = ("""
             INSERT INTO {0}.flow_scores partition(y={2}, m={3}, d={4}) VALUES {1}
        """).format(self._db, value_string[:-1], yr, mn, dy) 
        impala.execute_query(load_into_impala)
Example #10
def create_time_line(anchor,inbound, outbound, twoway,date):

    top_keys = []
    if len(twoway) > 0: top_keys.extend(twoway.keys())
    if len(outbound) > 0: top_keys.extend(outbound.keys())
    if len(inbound) > 0: top_keys.extend(inbound.keys())


    db = Configuration.db()

    imp_query =("""
        INSERT INTO TABLE {0}.flow_timeline PARTITION (y={4}, m={5},d={6})
        SELECT
            '{7}' ,min(treceived) as tstart, max(treceived) as tend,
            sip as srcIP,dip as dstip, proto as proto, sport as sport,
            dport AS dport, ipkt as ipkt, ibyt as ibyt
        FROM
            {0}.flow
        WHERE y={4} AND m={5} AND d={6}
        AND ((dip IN({1}) AND sip ='{2}') OR (sip IN({1}) AND dip ='{2}'))
        GROUP BY sip, dip, proto, sport, dport, ipkt, ibyt
        ORDER BY tstart
        LIMIT {3}
    """)

    ips = "'" + "','".join(top_keys) + "'"
    imp_query = imp_query.format(db,ips,anchor,1000,date.year,date.month, date.day,anchor)

    if ImpalaEngine.execute_query(imp_query):
        return "Timeline successfully created \n"
    else:
        return "Timeline couldn't be created \n"
Example #11
def save_comments(anchor, ip, query, title, text, date):

    db = Configuration.db()
    sb_query = ("""
            SELECT
                ip_threat,dns_threat,title,text
            FROM
                {0}.dns_storyboard
            WHERE
                y = {1} AND m= {2} AND d={3}
            """).format(db, date.year, date.month, date.day)
    sb_data = ImpalaEngine.execute_query_as_list(sb_query)

    # find value if already exists.
    saved = False
    for item in sb_data:
        if item["ip_threat"] == anchor or item["dns_threat"] == anchor:
            item["title"] = title
            item["text"] = text
            saved = True

    if not saved:
        sb_data.append({
            'text': text,
            'ip_threat': str(ip),
            'title': title,
            'dns_threat': query
        })

    #remove old file.
    app_path = Configuration.spot()
    old_file = "{0}/dns/hive/oa/storyboard/y={1}/m={2}/d={3}/"\
    .format(app_path,date.year,date.month,date.day)

    HDFSClient.delete_folder(old_file, "impala")
    ImpalaEngine.execute_query("invalidate metadata")

    for item in sb_data:
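        # rewrite every storyboard row, including the updated comment, into the partition.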
        insert_query = ("""
         	INSERT INTO {0}.dns_storyboard PARTITION(y={1} , m={2} ,d={3})
            	VALUES ( '{4}', '{5}', '{6}','{7}')
            	""")\
                       .format(db,date.year,date.month,date.day,\
                       item["ip_threat"],item["dns_threat"],item["title"],item["text"])
        ImpalaEngine.execute_query(insert_query)

    return True
Example #12
    def _clear_previous_executions(self):
        
        self._logger.info("Cleaning data from previous executions for the day")       
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]  
        table_schema = []
        HUSER = self._spot_conf.get('conf', 'HUSER').replace("'", "").replace('"', '')
        table_schema = ['suspicious', 'edge', 'chords', 'threat_investigation', 'timeline', 'storyboard', 'summary']

        for path in table_schema:
            HDFSClient.delete_folder("{0}/{1}/hive/oa/{2}/y={3}/m={4}/d={5}".format(HUSER,self._table_name,path,yr,int(mn),int(dy)),user="******")        
        impala.execute_query("invalidate metadata")
        #removes Feedback file
        HDFSClient.delete_folder("{0}/{1}/scored_results/{2}{3}{4}/feedback/ml_feedback.csv".format(HUSER,self._table_name,yr,mn,dy))
        #removes json files from the storyboard
        HDFSClient.delete_folder("{0}/{1}/oa/{2}/{3}/{4}/{5}".format(HUSER,self._table_name,"storyboard",yr,mn,dy))
Example #13
def save_comment(ip,title,text,date):

    #Get current table info.
    db = Configuration.db()
    sb_query = ("""
            SELECT
                ip_threat,title,text
            FROM
                {0}.flow_storyboard
            WHERE
                y = {1} AND m= {2} AND d={3}
            """).format(db,date.year,date.month,date.day)

    sb_data = ImpalaEngine.execute_query_as_list(sb_query)

    # find value if already exists.
    saved = False
    for item in sb_data:
        if item["ip_threat"] == ip:
            item["title"] = title
            item["text"] = text
            saved = True

    if not saved:
        sb_data.append({'text': text, 'ip_threat': str(ip), 'title': title})

    #remove old file.
    app_path = Configuration.spot()
    old_file = "{0}/flow/hive/oa/storyboard/y={1}/m={2}/d={3}/" \
    .format(app_path,date.year,date.month,date.day)

    # remove file manually to allow the comments update.
    HDFSClient.delete_folder(old_file,"impala")
    ImpalaEngine.execute_query("invalidate metadata")

    for item in sb_data:
	insert_query = ("""
         	INSERT INTO {0}.flow_storyboard PARTITION(y={1} , m={2} ,d={3})
            	VALUES ( '{4}', '{5}','{6}')
            	""") \
                .format(db,date.year,date.month,date.day, \
                item["ip_threat"],item["title"],item["text"])

        ImpalaEngine.execute_query(insert_query)
    return True
Example #14
    def _create_dns_scores(self):
        
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]
        value_string = ""

        dns_scores_final = self._move_time_stamp(self._dns_scores)
        self._dns_scores = dns_scores_final

        for row in dns_scores_final:
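            # serialize each score row as a SQL tuple; Util.cast_val normalizes the values.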
            value_string += str(tuple(Util.cast_val(item) for item in row)) + ","

        load_into_impala = ("""
             INSERT INTO {0}.dns_scores partition(y={2}, m={3}, d={4}) VALUES {1}
        """).format(self._db, value_string[:-1], yr, mn, dy)
        impala.execute_query(load_into_impala)
Example #15
    def _ingest_summary(self): 
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        self._logger.info("Getting ingest summary data for the day")
        
        ingest_summary_cols = ["date","total"]		
        result_rows = []        
        df_filtered =  pd.DataFrame()

        # get ingest summary.

        query_to_load=("""
                SELECT p_date, p_time, COUNT(*) as total
                FROM {0}.{1} WHERE y='{2}' AND m='{3}' AND d='{4}'
                AND p_date IS NOT NULL AND p_time IS NOT NULL
                AND clientip IS NOT NULL AND p_time != ''
                AND host IS NOT NULL AND fulluri IS NOT NULL
                GROUP BY p_date, p_time;
        """).format(self._db,self._table_name, yr, mn, dy)
        
        results = impala.execute_query(query_to_load) 
 
        if results:
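            # load the counts into pandas so they can be grouped per minute.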
            df_results = as_pandas(results)
            # Forms a new dataframe splitting the minutes from the time column.
            df_new = pd.DataFrame(
                [["{0} {1}:{2}".format(val['p_date'], val['p_time'].split(":")[0].zfill(2), val['p_time'].split(":")[1].zfill(2)),
                  int(val['total']) if not math.isnan(val['total']) else 0]
                 for key, val in df_results.iterrows()],
                columns=ingest_summary_cols)
            value_string = ''
            #Groups the data by minute 
            sf = df_new.groupby(by=['date'])['total'].sum()
            df_per_min = pd.DataFrame({'date':sf.index, 'total':sf.values})
            
            df_final = df_filtered.append(df_per_min, ignore_index=True).to_records(False,False) 
            if len(df_final) > 0:
                query_to_insert=("""
                    INSERT INTO {0}.proxy_ingest_summary PARTITION (y={1}, m={2}, d={3}) VALUES {4};
                """).format(self._db, yr, mn, dy, tuple(df_final))

                impala.execute_query(query_to_insert) 
                
        else:
            self._logger.info("No data found for the ingest summary")
Example #16
    def _ingest_summary(self):
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        self._logger.info("Getting ingest summary data for the day")

        ingest_summary_cols = ["date", "total"]
        result_rows = []
        df_filtered = pd.DataFrame()

        query_to_load = ("""
            SELECT frame_time, COUNT(*) as total FROM {0}.{1}
            WHERE y={2} AND m={3} AND d={4} AND unix_tstamp IS NOT NULL
            AND frame_time IS NOT NULL AND frame_len IS NOT NULL
            AND dns_qry_name IS NOT NULL AND ip_src IS NOT NULL
            AND (dns_qry_class IS NOT NULL AND dns_qry_type IS NOT NULL
            AND dns_qry_rcode IS NOT NULL ) GROUP BY frame_time;
        """).format(self._db, self._table_name, yr, mn, dy)

        results = impala.execute_query_as_list(query_to_load)
        df = pd.DataFrame(results)

        # Forms a new dataframe splitting the minutes from the time column
        df_new = pd.DataFrame([["{0}-{1}-{2} {3}:{4}".format(yr, mn, dy,\
            val['frame_time'].replace("  "," ").split(" ")[3].split(":")[0].zfill(2),\
            val['frame_time'].replace("  "," ").split(" ")[3].split(":")[1].zfill(2)),\
            int(val['total']) if not math.isnan(val['total']) else 0 ] for key,val in df.iterrows()],columns = ingest_summary_cols)

        #Groups the data by minute
        sf = df_new.groupby(by=['date'])['total'].sum()
        df_per_min = pd.DataFrame({'date': sf.index, 'total': sf.values})

        df_final = df_filtered.append(df_per_min,
                                      ignore_index=True).to_records(
                                          False, False)

        if len(df_final) > 0:
            query_to_insert = ("""
                INSERT INTO {0}.dns_ingest_summary PARTITION (y={1}, m={2}, d={3}) VALUES {4};
            """).format(self._db, yr, mn, dy, tuple(df_final))
            impala.execute_query(query_to_insert)
Example #17
    def _get_suspicious_details(self,bar=None):
        
        # skip header
        sp_connections = iter(self._flow_scores)
        # loop connections.
        connections_added = [] 
        for conn in sp_connections:
            
            # validate if the connection's details are not already extracted.            
            if conn in connections_added:
                continue
            else:
                connections_added.append(conn)
            
            src_ip_index = self._conf["flow_score_fields"]["srcIP"]
            dst_ip_index = self._conf["flow_score_fields"]["dstIP"]

            # get src ip 
            sip = conn[src_ip_index]
            # get dst ip
            dip = conn[dst_ip_index]

            # get hour and date  (i.e. 2014-07-08 10:10:40)
            
            date_array = conn[0].split(' ')
            date_array_1 = date_array[0].split('-')
            date_array_2 = date_array[1].split(':')

            yr = date_array_1[0]
            mh = date_array_1[1]
            dy = date_array_1[2]

            hr = date_array_2[0]
            mm = date_array_2[1]
            
            query_to_load = ("""
                INSERT INTO TABLE {0}.flow_edge PARTITION (y={2}, m={3}, d={4})
                SELECT treceived as tstart,sip as srcip,dip as dstip,sport as sport,dport as dport,proto as proto,flag as flags,
                stos as tos,ibyt as ibyt,ipkt as ipkt, input as input, output as output,rip as rip, obyt as obyt, 
                opkt as opkt, h as hh, trminute as mn from {0}.{1} where ((sip='{7}' AND dip='{8}') or (sip='{8}' AND dip='{7}')) 
                AND y={2} AND m={3} AND d={4} AND h={5} AND trminute={6};
                """).format(self._db,self._table_name,yr, mh, dy, hr, mm, sip,dip)
            impala.execute_query(query_to_load)
Example #18
def reset_scored_connections(date):

    flow_storyboard =  "flow/hive/oa/storyboard"
    flow_threat_investigation = "flow/hive/oa/threat_investigation"
    flow_timeline = "flow/hive/oa/timeline"    
    app_path = Configuration.spot()   

    try:
        # remove parquet files manually to allow the comments update.
        HDFSClient.delete_folder("{0}/{1}/y={2}/m={3}/d={4}/".format( \
            app_path,flow_storyboard,date.year,date.month,date.day) , "impala")
        HDFSClient.delete_folder("{0}/{1}/y={2}/m={3}/d={4}/".format( \
            app_path,flow_threat_investigation,date.year,date.month,date.day), "impala")
        HDFSClient.delete_folder("{0}/{1}/y={2}/m={3}/d={4}/".format( \
            app_path,flow_timeline,date.year,date.month,date.day), "impala")
        ImpalaEngine.execute_query("invalidate metadata")
        return True
        
    except HdfsError:
        return False
Example #19
def reset_scored_connections(date):

    proxy_storyboard = "proxy/hive/oa/storyboard"
    proxy_threat_investigation = "dns_threat_dendro/hive/oa/timeline"
    proxy_timeline = "proxy/hive/oa/threat_investigation"
    app_path = Configuration.spot()

    try:
        # remove parquet files manually to allow the comments update.
        HDFSClient.delete_folder("{0}/{1}/y={2}/m={3}/d={4}/".format( \
            app_path,proxy_storyboard,date.year,date.month,date.day) , "impala")
        HDFSClient.delete_folder("{0}/{1}/y={2}/m={3}/d={4}/".format( \
            app_path,proxy_threat_investigation,date.year,date.month,date.day), "impala")
        HDFSClient.delete_folder("{0}/{1}/y={2}/m={3}/d={4}/".format( \
            app_path,proxy_timeline,date.year,date.month,date.day), "impala")
        ImpalaEngine.execute_query("invalidate metadata")
        return True

    except HdfsError:
        return False
Example #20
    def _ingest_summary(self):
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        self._logger.info("Getting ingest summary data for the day")
        
        ingest_summary_cols = ["date","total"]		
        result_rows = []        
        df_filtered =  pd.DataFrame()

        query_to_load = ("""
            SELECT frame_time, COUNT(*) as total FROM {0}.{1}
            WHERE y={2} AND m={3} AND d={4} AND unix_tstamp IS NOT NULL
            AND frame_time IS NOT NULL AND frame_len IS NOT NULL
            AND dns_qry_name IS NOT NULL AND ip_src IS NOT NULL
            AND (dns_qry_class IS NOT NULL AND dns_qry_type IS NOT NULL
            AND dns_qry_rcode IS NOT NULL ) GROUP BY frame_time;
        """).format(self._db,self._table_name, yr, mn, dy)

        results = impala.execute_query_as_list(query_to_load)
        df = pd.DataFrame(results)

        # Forms a new dataframe splitting the minutes from the time column
        df_new = pd.DataFrame([["{0}-{1}-{2} {3}:{4}".format(yr, mn, dy,\
            val['frame_time'].replace("  "," ").split(" ")[3].split(":")[0].zfill(2),\
            val['frame_time'].replace("  "," ").split(" ")[3].split(":")[1].zfill(2)),\
            int(val['total']) if not math.isnan(val['total']) else 0 ] for key,val in df.iterrows()],columns = ingest_summary_cols)

        #Groups the data by minute
        sf = df_new.groupby(by=['date'])['total'].sum()
        df_per_min = pd.DataFrame({'date':sf.index, 'total':sf.values})

        df_final = df_filtered.append(df_per_min, ignore_index=True).to_records(False,False)

        if len(df_final) > 0:
            query_to_insert=("""
                INSERT INTO {0}.dns_ingest_summary PARTITION (y={1}, m={2}, d={3}) VALUES {4};
            """).format(self._db, yr, mn, dy, tuple(df_final))
            impala.execute_query(query_to_insert)
Example #21
    def _get_dns_details(self,dns_qry_name,year,month,day,hh,dns_iana):
        value_string = ""
        query_to_load =("""
            SELECT unix_tstamp,frame_len,ip_dst,ip_src,dns_qry_name,dns_qry_class,dns_qry_type,dns_qry_rcode,dns_a,h as hh
            FROM {0}.{1} WHERE y={2} AND m={3} AND d={4} AND dns_qry_name LIKE '%{5}%' AND h={6} LIMIT {7};
        """).format(self._db,self._table_name,year,month,day,dns_qry_name,hh,self._details_limit)

        try:
            dns_details = impala.execute_query(query_to_load)
        except Exception:
            self._logger.info("WARNING. Details couldn't be retrieved for {0}, skipping this step".format(dns_qry_name))
        else:
            # add IANA to results.
            update_rows = []
            if dns_iana:
                self._logger.info("Adding IANA translation to details results")

                dns_details = [ conn + (dns_iana.get_name(str(conn[5]),"dns_qry_class"),dns_iana.get_name(str(conn[6]),"dns_qry_type"),dns_iana.get_name(str(conn[7]),"dns_qry_rcode")) for conn in dns_details ]
            else:
                self._logger.info("WARNING: NO IANA configured.")
                dns_details = [ conn + ("","","") for conn in dns_details ]

            nc_conf_file = "{0}/components/nc/nc_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            if os.path.isfile(nc_conf_file):
                nc_conf = json.loads(open(nc_conf_file).read())["NC"]
                dns_nc = NetworkContext(nc_conf,self._logger)
                dns_details = [ conn + (dns_nc.get_nc(conn[2]),) for conn in dns_details ]
            else:
                dns_details = [ conn + (0,) for conn in dns_details ]

            for row in dns_details:
                value_string += str(tuple(item for item in row)) + ","

            if value_string != "":
                
                query_to_insert=("""
                    INSERT INTO {0}.dns_edge PARTITION (y={1}, m={2}, d={3}) VALUES ({4});
                """).format(self._db,year, month, day,  value_string[:-1])

                impala.execute_query(query_to_insert)
Example #22
    def _get_chord_details(self,bar=None):

         # skip header
        sp_connections = iter(self._flow_scores)

        src_ip_index = self._conf["flow_score_fields"]["srcIP"]
        dst_ip_index = self._conf["flow_score_fields"]["dstIP"] 

        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        # get number of times each IP appears.
        srcdict = {}
        for conn in sp_connections:
            srcdict[conn[src_ip_index]] = srcdict.get(conn[src_ip_index], 0) + 1
            srcdict[conn[dst_ip_index]] = srcdict.get(conn[dst_ip_index], 0) + 1
        
        for (ip, n) in srcdict.items():
            if n > 1:
                ip_list = []
                sp_connections = iter(self._flow_scores)
                for row in sp_connections:
                    if ip == row[1]: ip_list.append(row[2])
                    if ip == row[2]: ip_list.append(row[1])
                ips = list(set(ip_list))
             
                if len(ips) > 1:
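                    # quote and comma-join the peer IPs for the SQL IN() clause.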
                    ips_filter = (",".join(str("'{0}'".format(ip)) for ip in ips))
 
                    query_to_load = ("""
                        INSERT INTO TABLE {0}.flow_chords PARTITION (y={2}, m={3}, d={4})
                        SELECT '{5}' as ip_threat, sip as srcip, dip as dstip, SUM(ibyt) as ibyt, SUM(ipkt) as ipkt from {0}.{1} where y={2} and m={3}
                        and d={4} and ((sip='{5}' and dip IN({6})) or (sip IN({6}) and dip='{5}')) group by sip,dip,m,d;
                        """).format(self._db,self._table_name,yr,mn,dy,ip,ips_filter)

                    impala.execute_query(query_to_load)
Example #23
    def _get_dns_dendrogram(self):

        for conn in self._dns_scores:
            timestamp = conn[self._conf["dns_score_fields"]["unix_tstamp"]]
            full_date = datetime.datetime.utcfromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')

            date = full_date.split(" ")[0].split("-")
            # get date parameters.

            yr = date[0]
            mn = date[1]
            dy = date[2]
            ip_dst=conn[self._conf["dns_score_fields"]["ip_dst"]]

            query_to_load = ("""
                INSERT INTO TABLE {0}.dns_dendro PARTITION (y={2}, m={3},d={4})
                SELECT unix_tstamp, dns_a, dns_qry_name, ip_dst
                FROM (SELECT unix_tstamp, susp.ip_dst, susp.dns_qry_name, susp.dns_a
                    FROM {0}.{1} as susp WHERE susp.y={2} AND susp.m={3} AND susp.d={4} AND susp.ip_dst='{5}'
                LIMIT {6}) AS tmp GROUP BY dns_a, dns_qry_name, ip_dst, unix_tstamp
            """).format(self._db,self._table_name,yr,mn,dy,ip_dst,self._details_limit)

            impala.execute_query(query_to_load)
Example #24
def score_connection(date, ip="", dns="", ip_sev=0, dns_sev=0):

    if (not ip and not ip_sev) and (not dns and not dns_sev):
        return False

    db = Configuration.db()
    sq_query = ("""
		SELECT
    	    frame_time,unix_tstamp,frame_len,ip_dst,dns_qry_name,dns_qry_class,
		    dns_qry_type,dns_qry_rcode,ml_score,tld,query_rep,
		    hh,dns_qry_class_name,dns_qry_type_name,dns_qry_rcode_name,
		    network_context
		FROM
		    {0}.dns_scores
		WHERE
		    y={1} and m={2} and d={3}
            AND (
		""").format(db, date.year, date.month, date.day)

    connections_filter = ""
    connections_filter += "ip_dst = '{0}' ".format(ip) if ip else ""
    connections_filter += " OR " if ip and dns else ""
    connections_filter += "dns_qry_name = '{0}' ".format(dns) if dns else ""
    connections_filter += ")"
    connections = ImpalaEngine.execute_query(sq_query + connections_filter)

    # add score to connections

    insert_command = ("""INSERT INTO {0}.dns_threat_investigation
                        PARTITION (y={1},m={2},d={3})
                        VALUES (""") \
                        .format(db,date.year,date.month,date.day)

    fb_data = []
    first = True
    num_rows = 0
    for row in connections:
        # insert into dns_threat_investigation.
        threat_data = (row[1],row[3],row[4],ip_sev if ip == row[3] else 0,\
        dns_sev if dns == row[4] else 0)

        fb_data.append([row[0],row[2],row[3],row[4],row[5],row[6],row[7],\
        row[8],row[9],row[10],row[11],ip_sev,dns_sev,row[12],row[13],row[14],\
        row[15],row[1]])

        insert_command += "{0}{1}".format("," if not first else "",
                                          threat_data)
        first = False
        num_rows += 1

    insert_command += ")"
    if num_rows > 0: ImpalaEngine.execute_query(insert_command)

    # create feedback file.
    app_path = Configuration.spot()
    feedback_path = "{0}/dns/scored_results/{1}{2}{3}/feedback"\
    .format(app_path,date.year,str(date.month).zfill(2),str(date.day).zfill(2))
    ap_file = True

    if len(HDFSClient.list_dir(feedback_path)) == 0:
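        # first write of the day: prepend the CSV header row and create the file instead of appending.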
        fb_data.insert(0,["frame_time","frame_len","ip_dst","dns_qry_name",\
        "dns_qry_class","dns_qry_type","dns_qry_rcode","score","tld","query_rep",\
        "hh","ip_sev","dns_sev","dns_qry_class_name","dns_qry_type_name",\
        "dns_qry_rcode_name","network_context","unix_tstamp"])
        ap_file = False

    HDFSClient.put_file_csv(fb_data,
                            feedback_path,
                            "ml_feedback.csv",
                            append_file=ap_file)
    return True
Example #25
def score_connection(date, ip="", dns="", ip_sev=0, dns_sev=0):

    if (not ip and not ip_sev) and (not dns and not dns_sev):
        return False

    db = Configuration.db()
    sq_query = ("""
		SELECT
    	    frame_time,unix_tstamp,frame_len,ip_dst,dns_qry_name,dns_qry_class,
		    dns_qry_type,dns_qry_rcode,ml_score,tld,query_rep,
		    hh,dns_qry_class_name,dns_qry_type_name,dns_qry_rcode_name,
		    network_context
		FROM
		    {0}.dns_scores
		WHERE
		    y={1} and m={2} and d={3}
            AND (
		""").format(db,date.year,date.month,date.day)

    connections_filter = ""
    connections_filter += "ip_dst = '{0}' ".format(ip) if ip else ""
    connections_filter += " OR " if ip and dns else ""
    connections_filter += "dns_qry_name = '{0}' ".format(dns) if dns else ""
    connections_filter += ")"
    connections = ImpalaEngine.execute_query(sq_query + connections_filter)

    # add score to connections

    insert_command = ("""INSERT INTO {0}.dns_threat_investigation
                        PARTITION (y={1},m={2},d={3})
                        VALUES (""") \
                        .format(db,date.year,date.month,date.day)

    fb_data =  []
    first = True
    num_rows = 0
    for row in connections:
        # insert into dns_threat_investigation.
        threat_data = (row[1],row[3],row[4],ip_sev if ip == row[3] else 0,\
        dns_sev if dns == row[4] else 0)

        fb_data.append([row[0],row[2],row[3],row[4],row[5],row[6],row[7],\
        row[8],row[9],row[10],row[11],ip_sev,dns_sev,row[12],row[13],row[14],\
        row[15],row[1]])

        insert_command += "{0}{1}".format("," if not first else "", threat_data)
        first = False
        num_rows += 1

    insert_command += ")"
    if num_rows > 0: ImpalaEngine.execute_query(insert_command)

    # create feedback file.
    app_path = Configuration.spot()
    feedback_path = "{0}/dns/scored_results/{1}{2}{3}/feedback"\
    .format(app_path,date.year,str(date.month).zfill(2),str(date.day).zfill(2))
    ap_file = True

    if len(HDFSClient.list_dir(feedback_path)) == 0:
        fb_data.insert(0,["frame_time","frame_len","ip_dst","dns_qry_name",\
        "dns_qry_class","dns_qry_type","dns_qry_rcode","score","tld","query_rep",\
        "hh","ip_sev","dns_sev","dns_qry_class_name","dns_qry_type_name",\
        "dns_qry_rcode_name","network_context","unix_tstamp"])
        ap_file = False

    HDFSClient.put_file_csv(fb_data,feedback_path,"ml_feedback.csv",append_file=ap_file)
    return True
Example #26
def score_request(date, score, uri):

    if not score and not uri:
        return None

    db = Configuration.db()
    p_query = ("""
		SELECT
		    tdate,time,clientip,host,reqmethod,useragent,resconttype
		    ,duration,username,webcat,referer,respcode,uriport
		    ,uripath,uriquery,serverip,scbytes,csbytes,fulluri
		    ,word,ml_score,uri_rep,respcode_name,network_context
		FROM
		    {0}.proxy_scores
		WHERE
		    y={1} and m={2} and d={3}
		    AND fulluri = '{4}'
		""").format(db, date.year, date.month, date.day, uri)

    connections = ImpalaEngine.execute_query(p_query)

    # add score to connections
    insert_command = ("""
		INSERT INTO {0}.proxy_threat_investigation PARTITION (y={1},m={2},d={3})
		VALUES (""") \
        .format(db,date.year,date.month,date.day)

    fb_data = []
    first = True
    num_rows = 0
    for row in connections:
        cip_index = row[2]
        uri_index = row[18]
        # the hash combines client IP, full URI, and the hour from the time column.
        tme_index = row[1]
        hash_field = [str( md5.new(str(cip_index) + str(uri_index)).hexdigest() \
        + str((tme_index.split(":"))[0]) )]

        threat_data = (row[0], row[18], score)
        fb_data.append([row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7],\
            row[8],row[9],row[10],row[11],row[12],row[13],row[14],row[15],\
            row[16],row[17],row[18],row[19],score,row[20],row[21],row[22],\
            row[23],hash_field])
        insert_command += "{0}{1}".format("," if not first else "",
                                          threat_data)
        first = False
        num_rows += 1

    insert_command += ")"
    if num_rows > 0: ImpalaEngine.execute_query(insert_command)

    # create feedback file.
    app_path = Configuration.spot()
    feedback_path = "{0}/proxy/scored_results/{1}{2}{3}/feedback"\
    .format(app_path,date.year,str(date.month).zfill(2),str(date.day).zfill(2))

    ap_file = True
    if len(HDFSClient.list_dir(feedback_path)) == 0:
        fb_data.insert(0,["p_date","p_time","clientip","host","reqmethod",\
           "useragent","resconttype","duration","username","webcat","referer",\
           "respcode","uriport","uripath","uriquery","serverip","scbytes","csbytes",\
           "fulluri","word","score","uri_rep","uri_sev","respcode_name",\
           "network_context","hash"])
        ap_file = False

    HDFSClient.put_file_csv(fb_data,
                            feedback_path,
                            "ml_feedback.csv",
                            append_file=ap_file)
    return True
Example #27
def create_timeline(anchor, clientips, date, top_results):
    response = ""
    susp_ips = []

    if clientips:
        srtlist = sorted(list(clientips.items()),
                         key=lambda x: x[1],
                         reverse=True)
        for val in srtlist[:top_results]:
            susp_ips.append(val[0])

    if anchor != "":
        db = Configuration.db()
        time_line_query = ("""
                SELECT p_threat,tstart,tend,duration,clientip,respcode,respcodename
                FROM {0}.proxy_timeline
                WHERE
                    y={1} AND m={2} AND d={3} AND p_threat != '{4}'
                """).format(db, date.year, date.month, date.day,
                            anchor.replace("'", "//'"))

        tmp_timeline_data = ImpalaEngine.execute_query_as_list(time_line_query)

        imp_query = ("""
                        INSERT INTO TABLE {0}.proxy_timeline
                        PARTITION (y={2}, m={3},d={4})
                        SELECT
                            '{7}' as p_threat, concat(cast(p_date as string),
                            ' ', cast(MIN(p_time) as string)) AS tstart,
                            concat(cast(p_date as string), ' ',
                            cast(MAX(p_time) as string)) AS tend,
                            SUM(duration) AS duration,
                            clientip, respcode,"respCodeName" as respCodeName
                        FROM {0}.proxy
                        WHERE fulluri='{1}' AND clientip IN ({5})
                        AND y='{2}' AND m='{3}' AND d='{4}'
                        GROUP BY clientip, p_time, respcode, p_date
                        LIMIT {6}
                    """)\
                    .format(db,anchor,date.year,str(date.month).zfill(2),\
                    str(date.day).zfill(2),("'" + "','".join(susp_ips) + "'")\
                    ,top_results,anchor)

        app_path = Configuration.spot()
        old_file = "{0}/proxy/hive/oa/timeline/y={1}/m={2}/d={3}"\
        .format(app_path,date.year,date.month,date.day)

        HDFSClient.delete_folder(old_file, "impala")
        ImpalaEngine.execute_query("invalidate metadata")

        #Insert temporary values
        for item in tmp_timeline_data:
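            # restore rows that belong to other threats; the partition folder was just deleted.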
            insert_query = ("""
                        INSERT INTO {0}.proxy_timeline PARTITION(y={1} , m={2} ,d={3})
                        VALUES ('{4}', '{5}', '{6}',{7},'{8}','{9}','{10}')
                        """)\
                        .format(db,date.year,date.month,date.day,\
                        item["p_threat"],item["tstart"],item["tend"],item["duration"],item["clientip"],item["respcode"],item["respcodename"])

            ImpalaEngine.execute_query(insert_query)

        ImpalaEngine.execute_query(imp_query)
        response = "Timeline successfully saved"
    else:
        response = "Timeline couldn't be created"

    return response
Example #28
def score_connection(score,date,src_ip=None,dst_ip=None,src_port=None,dst_port=None):

    if not src_ip and not dst_ip and not src_port and not dst_port:
        return False

    db = Configuration.db()
    # get connections to score
    connections_query = ("""
            SELECT
                tstart,srcip,dstip,sport,dport, ibyt,ipkt
            FROM {0}.flow_scores
            WHERE
                y = {1} AND m={2} AND d={3}
            """).format(db,date.year,date.month,date.day)

    connections_filter = ""
    connections_filter += " AND srcip = '{0}'".format(src_ip) if src_ip else ""
    connections_filter += " AND dstip = '{0}'".format(dst_ip) if dst_ip else ""

    connections_filter += " AND sport = {0}" \
    .format(str(src_port)) if src_port else ""

    connections_filter += " AND dport = {0}" \
    .format(str(dst_port)) if dst_port else ""
    connections = ImpalaEngine.execute_query(connections_query + connections_filter)


    # add score to connections
    insert_command = ("""
        INSERT INTO {0}.flow_threat_investigation
        PARTITION (y={1},m={2},d={3})
        VALUES (""") \
        .format(db,date.year,date.month,date.day)

    fb_data =  []
    first = True
    num_rows = 0
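    # accumulate one tuple per connection into a single multi-row VALUES clause.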
    for row in connections:
        # insert into flow_threat_investigation.
        threat_data = (row[0],row[1],row[2],row[3],row[4],score)
        fb_data.append([score,row[0],row[1],row[2],row[3],row[4],row[5],row[6]])
        insert_command += "{0}{1}".format("," if not first else "", threat_data)
        first = False
        num_rows += 1

    insert_command += ")"
    if num_rows > 0: ImpalaEngine.execute_query(insert_command)

    # create feedback file.
    app_path = Configuration.spot()
    feedback_path = "{0}/flow/scored_results/{1}{2}{3}/feedback" \
    .format(app_path,date.year,str(date.month).zfill(2),str(date.day).zfill(2))

    append_file = True
    if len(HDFSClient.list_dir(feedback_path)) == 0:
        fb_data.insert(0,["sev","tstart","sip","dip","sport","dport","ipkt","ibyt"])
        append_file = False

    HDFSClient.put_file_csv(fb_data,feedback_path,"ml_feedback.csv",\
    append_file=append_file)
    return True
Example #29
def score_request(date,score,uri):

    if not score and not uri:
        return None

    db = Configuration.db()
    p_query = ("""
		SELECT
		    tdate,time,clientip,host,reqmethod,useragent,resconttype
		    ,duration,username,webcat,referer,respcode,uriport
		    ,uripath,uriquery,serverip,scbytes,csbytes,fulluri
		    ,word,ml_score,uri_rep,respcode_name,network_context
		FROM
		    {0}.proxy_scores
		WHERE
		    y={1} and m={2} and d={3}
		    AND fulluri = '{4}'
		""").format(db,date.year,date.month,date.day,uri)

    connections = ImpalaEngine.execute_query(p_query)

    # add score to connections
    insert_command = ("""
		INSERT INTO {0}.proxy_threat_investigation PARTITION (y={1},m={2},d={3})
		VALUES (""") \
        .format(db,date.year,date.month,date.day)

    fb_data =  []
    first = True
    num_rows = 0
    for row in connections:
        cip_index = row[2]
        uri_index = row[18]
        # the hash combines client IP, full URI, and the hour from the time column.
        tme_index = row[1]
        hash_field = [str( md5.new(str(cip_index) + str(uri_index)).hexdigest() \
        + str((tme_index.split(":"))[0]) )]

        threat_data = (row[0],row[18],score)
        fb_data.append([row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7],\
            row[8],row[9],row[10],row[11],row[12],row[13],row[14],row[15],\
            row[16],row[17],row[18],row[19],score,row[20],row[21],row[22],\
            row[23],hash_field])
        insert_command += "{0}{1}".format("," if not first else "", threat_data)
        first = False
        num_rows += 1

    insert_command += ")"
    if num_rows > 0: ImpalaEngine.execute_query(insert_command)

    # create feedback file.
    app_path = Configuration.spot()
    feedback_path = "{0}/proxy/scored_results/{1}{2}{3}/feedback"\
    .format(app_path,date.year,str(date.month).zfill(2),str(date.day).zfill(2))

    ap_file = True
    if len(HDFSClient.list_dir(feedback_path)) == 0:
        fb_data.insert(0,["p_date","p_time","clientip","host","reqmethod",\
            "useragent","resconttype","duration","username","webcat","referer",\
            "respcode","uriport","uripath","uriquery","serverip","scbytes","csbytes",\
            "fulluri","word","score","uri_rep","uri_sev","respcode_name",\
            "network_context","hash"])
        ap_file = False

    HDFSClient.put_file_csv(fb_data,feedback_path,"ml_feedback.csv",append_file=ap_file)
    return True
Example #30
def create_timeline(anchor,clientips,date,top_results):
    response = ""
    susp_ips = []

    if clientips:
        srtlist = sorted(list(clientips.items()), key=lambda x: x[1], reverse=True)
        for val in srtlist[:top_results]:
            susp_ips.append(val[0])

    if anchor != "":
        db = Configuration.db()
        time_line_query = ("""
                SELECT p_threat,tstart,tend,duration,clientip,respcode,respcodename
                FROM {0}.proxy_timeline
                WHERE
                    y={1} AND m={2} AND d={3} AND p_threat != '{4}'
                """).format(db,date.year,date.month,date.day,anchor.replace("'","//'"))
        
        tmp_timeline_data = ImpalaEngine.execute_query_as_list(time_line_query)

        imp_query = ("""
                        INSERT INTO TABLE {0}.proxy_timeline
                        PARTITION (y={2}, m={3},d={4})
                        SELECT
                            '{7}' as p_threat, concat(cast(p_date as string),
                            ' ', cast(MIN(p_time) as string)) AS tstart,
                            concat(cast(p_date as string), ' ',
                            cast(MAX(p_time) as string)) AS tend,
                            SUM(duration) AS duration,
                            clientip, respcode,"respCodeName" as respCodeName
                        FROM {0}.proxy
                        WHERE fulluri='{1}' AND clientip IN ({5})
                        AND y='{2}' AND m='{3}' AND d='{4}'
                        GROUP BY clientip, p_time, respcode, p_date
                        LIMIT {6}
                    """)\
                    .format(db,anchor,date.year,str(date.month).zfill(2),\
                    str(date.day).zfill(2),("'" + "','".join(susp_ips) + "'")\
                    ,top_results,anchor)

        app_path = Configuration.spot()
        old_file = "{0}/proxy/hive/oa/timeline/y={1}/m={2}/d={3}"\
        .format(app_path,date.year,date.month,date.day)

        HDFSClient.delete_folder(old_file,"impala")
        ImpalaEngine.execute_query("invalidate metadata")

        #Insert temporary values
        for item in tmp_timeline_data:
            insert_query = ("""
                        INSERT INTO {0}.proxy_timeline PARTITION(y={1} , m={2} ,d={3})
                        VALUES ('{4}', '{5}', '{6}',{7},'{8}','{9}','{10}')
                        """)\
                        .format(db,date.year,date.month,date.day,\
                        item["p_threat"],item["tstart"],item["tend"],item["duration"],item["clientip"],item["respcode"],item["respcodename"])

            ImpalaEngine.execute_query(insert_query)

        ImpalaEngine.execute_query(imp_query)
        response = "Timeline successfully saved"
    else:
        response = "Timeline couldn't be created"

    return response