Example #1
    def _initialize_members(self, date, limit, logger):

        # get logger if exists. if not, create new instance.
        self._logger = logging.getLogger(
            'OA.Flow') if logger else Util.get_logger('OA.Flow',
                                                      create_file=False)

        # initialize required parameters.
        self._scrtip_path = os.path.dirname(os.path.abspath(__file__))
        self._date = date
        self._table_name = "flow"
        self._flow_results = []
        self._limit = limit
        self._data_path = None
        self._ipynb_path = None
        self._ingest_summary_path = None
        self._flow_scores = []
        self._results_delimiter = '\t'

        # get app configuration.
        self._spot_conf = Util.get_spot_conf()

        # get scores fields conf
        conf_file = "{0}/flow_conf.json".format(self._scrtip_path)
        self._conf = json.loads(open(conf_file).read(),
                                object_pairs_hook=OrderedDict)

        # initialize data engine
        self._db = self._spot_conf.get('conf',
                                       'DBNAME').replace("'",
                                                         "").replace('"', '')
        self._engine = Data(self._db, self._table_name, self._logger)
Example #2
    def _initialize_members(self,date,limit,logger):
        
        # get logger if exists. if not, create new instance.
        self._logger = logging.getLogger('OA.Flow') if logger else Util.get_logger('OA.Flow',create_file=False)

        # initialize required parameters.
        self._scrtip_path = os.path.dirname(os.path.abspath(__file__))
        self._date = date
        self._table_name = "flow"
        self._flow_results = []
        self._limit = limit
        self._data_path = None
        self._ipynb_path = None
        self._ingest_summary_path = None
        self._flow_scores = []
        self._results_delimiter = '\t'

        # get app configuration.
        self._oni_conf = Util.get_oni_conf()  

        # get scores fields conf
        conf_file = "{0}/flow_conf.json".format(self._scrtip_path)
        self._conf = json.loads(open (conf_file).read(),object_pairs_hook=OrderedDict)     
 
        # initialize data engine
        self._db = self._oni_conf.get('conf','DBNAME').replace("'","").replace('"','') 
        self._engine = Data(self._db, self._table_name,self._logger)
Example #3
class OA(object):

    def __init__(self,date,limit=500,logger=None):

        self._initialize_members(date,limit,logger)

    def _initialize_members(self,date,limit,logger):
        
        # get logger if exists. if not, create new instance.
        self._logger = logging.getLogger('OA.Flow') if logger else Util.get_logger('OA.Flow',create_file=False)

        # initialize required parameters.
        self._scrtip_path = os.path.dirname(os.path.abspath(__file__))
        self._date = date
        self._table_name = "flow"
        self._flow_results = []
        self._limit = limit
        self._data_path = None
        self._ipynb_path = None
        self._ingest_summary_path = None
        self._flow_scores = []
        self._results_delimiter = '\t'

        # get app configuration.
        self._oni_conf = Util.get_oni_conf()  

        # get scores fields conf
        conf_file = "{0}/flow_conf.json".format(self._scrtip_path)
        self._conf = json.loads(open (conf_file).read(),object_pairs_hook=OrderedDict)     
 
        # initialize data engine
        self._db = self._oni_conf.get('conf','DBNAME').replace("'","").replace('"','') 
        self._engine = Data(self._db, self._table_name,self._logger)
              
    def start(self):       
        
        ####################
        start = time.time()
        ####################

        self._create_folder_structure()
        self._add_ipynb()  
        self._get_flow_results()
        self._add_network_context()
        self._add_geo_localization()
        self._add_reputation()        
        self._create_flow_scores_csv()
        self._get_oa_details()

        ##################
        end = time.time()
        print(end - start)
        ##################
       
    def _create_folder_structure(self):

        # create date folder structure if it does not exist.
        self._logger.info("Creating folder structure for OA (data and ipynb)")       
        self._data_path,self._ingest_summary_path,self._ipynb_path = Util.create_oa_folders("flow",self._date)

    def _add_ipynb(self):     

        if os.path.isdir(self._ipynb_path):

            self._logger.info("Adding edge investigation IPython Notebook")
            shutil.copy("{0}/ipynb_templates/Edge_Investigation_master.ipynb".format(self._scrtip_path),"{0}/Edge_Investigation.ipynb".format(self._ipynb_path))

            self._logger.info("Adding threat investigation IPython Notebook")
            shutil.copy("{0}/ipynb_templates/Threat_Investigation_master.ipynb".format(self._scrtip_path),"{0}/Threat_Investigation.ipynb".format(self._ipynb_path))

        else:
            self._logger.error("There was a problem adding the IPython Notebooks, please check the directory exists.")
            
    def _get_flow_results(self):
               
        self._logger.info("Getting {0} Machine Learning Results from HDFS".format(self._date))
        flow_results = "{0}/flow_results.csv".format(self._data_path)

        # get hdfs path from conf file 
        HUSER = self._oni_conf.get('conf','HUSER').replace("'","").replace('"','')   
        hdfs_path = "{0}/flow/scored_results/{1}/scores/flow_results.csv".format(HUSER,self._date)
               
        # get results file from hdfs
        get_command = Util.get_ml_results_form_hdfs(hdfs_path,self._data_path)
        self._logger.info("{0}".format(get_command))

        # validate the results file exists
        if os.path.isfile(flow_results):

            # read the number of results based on the limit specified.
            self._logger.info("Reading {0} flow results file: {1}".format(self._date,flow_results))
            self._flow_results = Util.read_results(flow_results,self._limit,self._results_delimiter)
            if len(self._flow_results) == 0:
                self._logger.error("There are no flow results.")
                sys.exit(1)

        else:
            self._logger.error("There was an error getting ML results from HDFS")
            sys.exit(1)

        # add headers.        
        self._logger.info("Adding headers based on configuration file: score_fields.json")
        self._flow_scores = [ [ str(key) for (key,value) in self._conf['flow_score_fields'].items()] ]

        ldaab_index = self._conf["flow_results_fields"]["lda_score_ab"]
        ldaba_index = self._conf["flow_results_fields"]["lda_score_ba"]

        # filter results add sev and rank.
        self._logger.info("Filtering required columns based on configuration")
        self._flow_scores.extend([ [0] +  [ conn[i] for i in self._conf['column_indexes_filter'] ] + [(conn[ldaab_index] if (conn[ldaab_index]<= conn[ldaba_index]) else conn[ldaba_index])] + [n]  for n, conn in enumerate(self._flow_results) ])
     
    def _create_flow_scores_csv(self):

        flow_scores_csv = "{0}/flow_scores.csv".format(self._data_path)
        Util.create_csv_file(flow_scores_csv,self._flow_scores)

        # create bk file
        flow_scores_bu_csv = "{0}/flow_scores_bu.csv".format(self._data_path)
        Util.create_csv_file(flow_scores_bu_csv,self._flow_scores)  

    def _add_network_context(self):

        # use ipranges to see if the IPs are internal.
        ip_ranges_file = "{0}/context/ipranges.csv".format(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

        # add new headers (srcIpInternal/destIpInternal).
        self._logger.info("Adding network context headers")
        flow_headers = self._flow_scores[0]
        flow_headers.extend(["srcIpInternal","destIpInternal"])

        # add values to srcIpInternal and destIpInternal.
        flow_scores = iter(self._flow_scores)
        next(flow_scores)

        if os.path.isfile(ip_ranges_file):

            self._logger.info("Start adding network context...")

            # get ranges from configuration file.
            self._logger.info("Reading network context file: {0}".format(ip_ranges_file))
            with open(ip_ranges_file, 'rb') as f:
                nc_ranges = [ map(Util.ip_to_int,line.strip('\n').split(',')) for line in f ]

            # get src and dst IPs
            src_ip_index = self._conf["flow_score_fields"]["srcIP"]
            dst_ip_index = self._conf["flow_score_fields"]["dstIP"]              
            
            # add networkcontext per connection.
            ip_internal_ranges = filter(None,nc_ranges[0])    
            self._logger.info("Adding networkcontext to suspicious connections.")
            self._flow_scores = [ conn + [ self._is_ip_internal(conn[src_ip_index],ip_internal_ranges)]+[ self._is_ip_internal(conn[dst_ip_index],ip_internal_ranges)] for conn in flow_scores]
           
        else:

            self._flow_scores = [ conn + ["",""] for conn in flow_scores ]            
            self._logger.info("WARNING: Network context was not added because the file ipranges.csv does not exist.")
        
        self._flow_scores.insert(0,flow_headers)

    def _is_ip_internal(self,ip, ranges):

        if Util.ip_to_int(ip) >= ranges[0] and Util.ip_to_int(ip) <= ranges[1]: return 1
        return 0

    def _add_geo_localization(self):

        # use the iploc file to add geo localization for the IPs.
        iploc_file = "{0}/context/iploc.csv".format(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

        # add new headers (srcGeo/dstGeo/srcDomain/dstDomain).
        self._logger.info("Adding geo localization headers")
        flow_headers = self._flow_scores[0]
        flow_headers.extend(["srcGeo","dstGeo","srcDomain","dstDomain"]) 

        # add values to srcGeo/dstGeo/srcDomain/dstDomain.
        flow_scores = iter(self._flow_scores)
        next(flow_scores)

        if os.path.isfile(iploc_file):

            self._logger.info("Initializing geo localization component")
            geo = GeoLocalization(iploc_file,self._logger)
            
            src_ip_index = self._conf["flow_score_fields"]["srcIP"]
            dst_ip_index = self._conf["flow_score_fields"]["dstIP"] 

            self._logger.info("Adding geo localization...")
            self._flow_scores = []
            for conn in flow_scores:

                # get geo localization for src ip
                self._logger.debug("Searching geo for src ip {0}".format(conn[src_ip_index]))
                src_geo_dict = geo.get_ip_geo_localization(conn[src_ip_index])

                # get geo localization for dst ip.
                self._logger.debug("Searching geo for dst ip {0}".format(conn[dst_ip_index]))
                dst_geo_dict = geo.get_ip_geo_localization(conn[dst_ip_index])

                # adding columns to the current connection list.
                conn.extend([src_geo_dict["geo_loc"],dst_geo_dict["geo_loc"],src_geo_dict["domain"],dst_geo_dict["domain"]])
                self._flow_scores.extend([conn])                
        else:

            self._flow_scores = [ conn + ["","","",""] for conn in flow_scores ]   
            self._logger.info("WARNING: IP location was not added because the file {0} does not exist.".format(iploc_file))

        self._flow_scores.insert(0,flow_headers)       

    def _add_reputation(self):
        
        reputation_conf_file = "{0}/components/reputation/reputation_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        
        # add new headers (srcIp_rep/dstIp_rep).
        self._logger.info("Adding reputation headers")
        flow_headers_rep = self._flow_scores[0]
        flow_headers_rep.extend(["srcIp_rep","dstIp_rep"])
        
        # read configuration.
        self._logger.info("Reading reputation configuration file: {0}".format(reputation_conf_file))
        rep_conf = json.loads(open(reputation_conf_file).read())["gti"]

        if os.path.isfile(rep_conf['refclient']):
           
            # initialize gti module.
            self._logger.info("Initializing GTI component")
            flow_gti = gti.Reputation(rep_conf,self._logger)

            # get all src ips.
            src_ip_index = self._conf["flow_score_fields"]["srcIP"]
            dst_ip_index = self._conf["flow_score_fields"]["dstIP"]

            self._logger.info("Getting GTI reputation for src IPs")
            flow_scores_src = iter(self._flow_scores)
            next(flow_scores_src)

            # getting reputation for src IPs
            src_ips = [ conn[src_ip_index] for conn in flow_scores_src ]            
            src_rep_results = flow_gti.check(src_ips)

            self._logger.info("Getting GTI reputation for dst IPs")
            flow_scores_dst = iter(self._flow_scores)
            next(flow_scores_dst)

            # getting reputation for dst IPs            
            dst_ips = [  conn[dst_ip_index] for conn in flow_scores_dst ]
            dst_rep_results = flow_gti.check(dst_ips)
                       
            flow_scores_final = iter(self._flow_scores)
            next(flow_scores_final)
            
            self._flow_scores = []
            flow_scores = [conn + [src_rep_results[conn[src_ip_index]]] + [dst_rep_results[conn[dst_ip_index]]]  for conn in  flow_scores_final ]
            self._flow_scores = flow_scores           
           
        else:
            # add empty values to srcIp_rep and dstIp_rep.
            flow_scores = iter(self._flow_scores)
            next(flow_scores)

            self._flow_scores = [ conn + ["",""] for conn in flow_scores ]   
            self._logger.info("WARNING: IP reputation was not added. No refclient configured".format(reputation_conf_file))  
    
        self._flow_scores.insert(0,flow_headers_rep)       

    def _get_oa_details(self):

	self._logger.info("Getting OA Flow suspicious details/chord diagram")
	# start suspicious connects details process.
	p_sp = Process(target=self._get_suspicious_details)
	p_sp.start()

	# start chord diagram process.
	p_ch = Process(target=self._get_chord_details)
	p_ch.start()

	p_sp.join()
	p_ch.join()

	
    def _get_suspicious_details(self,bar=None):
        
        # skip header
        sp_connections = iter(self._flow_scores)
        next(sp_connections)
      
        # loop connections.
        connections_added = [] 
        for conn in sp_connections:
            
            # validate if the connection's details are not already extracted.            
            if conn in connections_added:
                continue
            else:
                connections_added.append(conn)
           
            src_ip_index = self._conf["flow_score_fields"]["srcIP"]
            dst_ip_index = self._conf["flow_score_fields"]["dstIP"]

            # get src ip 
            sip = conn[src_ip_index]
            # get dst ip
            dip = conn[dst_ip_index]

            # get hour and date (e.g. 2014-07-08 10:10:40)
            date_array = conn[1].split(' ')
            date_array_1 = date_array[0].split('-')
            date_array_2 = date_array[1].split(':')

            yr = date_array_1[0]                   
            dy = date_array_1[2]
            mh = date_array_1[1]

            hr = date_array_2[0]
            mm = date_array_2[1]
        
            # connection details query.
            sp_query = ("SELECT treceived as tstart,sip as srcip,dip as dstip,sport as sport,dport as dport,proto as proto,flag as flags,stos as TOS,ibyt as bytes,ipkt as pkts,input as input, output as output,rip as rip from {0}.{1} where ((sip='{2}' AND dip='{3}') or (sip='{3}' AND dip='{2}')) AND y={8} AND m={4} AND d={5} AND h={6} AND trminute={7} order by tstart limit 100")
                 
            # sp query.
            sp_query = sp_query.format(self._db,self._table_name,sip,dip,mh,dy,hr,mm,yr)

            # output file.
            edge_file = "{0}/edge-{1}-{2}-{3}-{4}.tsv".format(self._data_path,sip.replace(".","_"),dip.replace(".","_"),hr,mm)

            # execute query
            self._engine.query(sp_query,output_file=edge_file,delimiter="\\t")
    
    def _get_chord_details(self,bar=None):

        # skip header
        sp_connections = iter(self._flow_scores)
        next(sp_connections) 

        src_ip_index = self._conf["flow_score_fields"]["srcIP"]
        dst_ip_index = self._conf["flow_score_fields"]["dstIP"] 

        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        # get number of times each IP appears.
        srcdict = {}
        for conn in sp_connections:
            if conn[src_ip_index] in srcdict: srcdict[conn[src_ip_index]] += 1
            else: srcdict[conn[src_ip_index]] = 1
            if conn[dst_ip_index] in srcdict: srcdict[conn[dst_ip_index]] += 1
            else: srcdict[conn[dst_ip_index]] = 1
        
        for (ip,n) in srcdict.items():            
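            # for each IP seen in more than one suspicious connection, collect its peer IPs and build a chord-diagram query.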
            if n > 1:
                ip_list = []                
                sp_connections = iter(self._flow_scores)
                next(sp_connections)
                for row in sp_connections:                    
                    if ip == row[2]: ip_list.append(row[3])
                    if ip == row[3]: ip_list.append(row[2])
                ips = list(set(ip_list))
             
                if len(ips) > 1:
                    ips_filter = (",".join(str("'{0}'".format(ip)) for ip in ips))
                    chord_file = "{0}/chord-{1}.tsv".format(self._data_path,ip.replace(".","_"))                     
                    ch_query = ("SELECT sip as srcip, dip as dstip, MAX(ibyt) as maxbyte, AVG(ibyt) as avgbyte, MAX(ipkt) as maxpkt, AVG(ipkt) as avgpkt from {0}.{1} where y={2} and m={3} and d={4} and ( (sip='{5}' and dip IN({6})) or (sip IN({6}) and dip='{5}') ) group by sip,dip")
                    self._engine.query(ch_query.format(self._db,self._table_name,yr,mn,dy,ip,ips_filter),chord_file,delimiter="\\t")
Example #4
class OA(object):
    def __init__(self, date, limit=500, logger=None):

        self._initialize_members(date, limit, logger)

    def _initialize_members(self, date, limit, logger):

        # get logger if exists. if not, create new instance.
        self._logger = logging.getLogger(
            'OA.Flow') if logger else Util.get_logger('OA.Flow',
                                                      create_file=False)

        # initialize required parameters.
        self._scrtip_path = os.path.dirname(os.path.abspath(__file__))
        self._date = date
        self._table_name = "flow"
        self._flow_results = []
        self._limit = limit
        self._data_path = None
        self._ipynb_path = None
        self._ingest_summary_path = None
        self._flow_scores = []
        self._results_delimiter = '\t'

        # get app configuration.
        self._spot_conf = Util.get_spot_conf()

        # get scores fields conf
        conf_file = "{0}/flow_conf.json".format(self._scrtip_path)
        self._conf = json.loads(open(conf_file).read(),
                                object_pairs_hook=OrderedDict)

        # initialize data engine
        self._db = self._spot_conf.get('conf',
                                       'DBNAME').replace("'",
                                                         "").replace('"', '')
        self._engine = Data(self._db, self._table_name, self._logger)

    def start(self):

        ####################
        start = time.time()
        ####################

        self._create_folder_structure()
        self._add_ipynb()
        self._get_flow_results()
        self._add_network_context()
        self._add_geo_localization()
        self._add_reputation()
        self._create_flow_scores_csv()
        self._get_oa_details()
        self._ingest_summary()

        ##################
        end = time.time()
        print(end - start)
        ##################

    def _create_folder_structure(self):

        # create date folder structure if it does not exist.
        self._logger.info("Creating folder structure for OA (data and ipynb)")
        self._data_path, self._ingest_summary_path, self._ipynb_path = Util.create_oa_folders(
            "flow", self._date)

    def _add_ipynb(self):

        if os.path.isdir(self._ipynb_path):

            self._logger.info("Adding edge investigation IPython Notebook")
            shutil.copy(
                "{0}/ipynb_templates/Edge_Investigation_master.ipynb".format(
                    self._scrtip_path),
                "{0}/Edge_Investigation.ipynb".format(self._ipynb_path))

            self._logger.info("Adding threat investigation IPython Notebook")
            shutil.copy(
                "{0}/ipynb_templates/Threat_Investigation_master.ipynb".format(
                    self._scrtip_path),
                "{0}/Threat_Investigation.ipynb".format(self._ipynb_path))

        else:
            self._logger.error(
                "There was a problem adding the IPython Notebooks, please check the directory exists."
            )

    def _get_flow_results(self):

        self._logger.info(
            "Getting {0} Machine Learning Results from HDFS".format(
                self._date))
        flow_results = "{0}/flow_results.csv".format(self._data_path)

        # get hdfs path from conf file
        HUSER = self._spot_conf.get('conf',
                                    'HUSER').replace("'", "").replace('"', '')
        hdfs_path = "{0}/flow/scored_results/{1}/scores/flow_results.csv".format(
            HUSER, self._date)

        # get results file from hdfs
        get_command = Util.get_ml_results_form_hdfs(hdfs_path, self._data_path)
        self._logger.info("{0}".format(get_command))

        # validate the results file exists
        if os.path.isfile(flow_results):

            # read the number of results based on the limit specified.
            self._logger.info("Reading {0} flow results file: {1}".format(
                self._date, flow_results))
            self._flow_results = Util.read_results(flow_results, self._limit,
                                                   self._results_delimiter)
            if len(self._flow_results) == 0:
                self._logger.error("There are not flow results.")
                sys.exit(1)

        else:
            self._logger.error(
                "There was an error getting ML results from HDFS")
            sys.exit(1)

        # add headers.
        self._logger.info(
            "Adding headers based on configuration file: score_fields.json")
        self._flow_scores = [[
            str(key)
            for (key, value) in self._conf['flow_score_fields'].items()
        ]]

        # filter results add sev and rank.
        self._logger.info("Filtering required columns based on configuration")
        self._flow_scores.extend(
            [[0] + [conn[i]
                    for i in self._conf['column_indexes_filter']] + [n]
             for n, conn in enumerate(self._flow_results)])

    def _create_flow_scores_csv(self):

        flow_scores_csv = "{0}/flow_scores.csv".format(self._data_path)
        Util.create_csv_file(flow_scores_csv, self._flow_scores)

        # create bk file
        flow_scores_bu_csv = "{0}/flow_scores_bu.csv".format(self._data_path)
        Util.create_csv_file(flow_scores_bu_csv, self._flow_scores)

    def _add_network_context(self):

        # use ipranges to see if the IPs are internal.
        ip_ranges_file = "{0}/context/ipranges.csv".format(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

        # add new headers (srcIpInternal/destIpInternal).
        self._logger.info("Adding network context headers")
        flow_headers = self._flow_scores[0]
        flow_headers.extend(["srcIpInternal", "destIpInternal"])

        # add values to srcIpInternal and destIpInternal.
        flow_scores = iter(self._flow_scores)
        next(flow_scores)

        if os.path.isfile(ip_ranges_file):

            self._logger.info("Start adding network context...")

            # get ranges from configuration file.
            self._logger.info(
                "Reading network context file: {0}".format(ip_ranges_file))
            with open(ip_ranges_file, 'rb') as f:
                nc_ranges = [
                    map(Util.ip_to_int,
                        line.strip('\n').split(',')) for line in f
                ]

            # get src and dst IPs
            src_ip_index = self._conf["flow_score_fields"]["srcIP"]
            dst_ip_index = self._conf["flow_score_fields"]["dstIP"]

            # add networkcontext per connection.
            ip_internal_ranges = filter(None, nc_ranges)
            self._logger.info(
                "Adding networkcontext to suspicious connections.")
            self._flow_scores = [
                conn +
                [self._is_ip_internal(conn[src_ip_index], ip_internal_ranges)
                 ] +
                [self._is_ip_internal(conn[dst_ip_index], ip_internal_ranges)]
                for conn in flow_scores
            ]

        else:

            self._flow_scores = [conn + ["", ""] for conn in flow_scores]
            self._logger.info(
                "WARNING: Network context was not added because the file ipranges.csv does not exist."
            )

        self._flow_scores.insert(0, flow_headers)

    def _is_ip_internal(self, ip, ranges):
        result = 0
        for row in ranges:
            if Util.ip_to_int(ip) >= row[0] and Util.ip_to_int(ip) <= row[1]:
                result = 1
                break
        return result

    def _add_geo_localization(self):

        # use the iploc file to add geo localization for the IPs.
        iploc_file = "{0}/context/iploc.csv".format(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

        # add new headers (srcGeo/dstGeo/srcDomain/dstDomain).
        self._logger.info("Adding geo localization headers")
        flow_headers = self._flow_scores[0]
        flow_headers.extend(["srcGeo", "dstGeo", "srcDomain", "dstDomain"])

        # add values to srcGeo/dstGeo/srcDomain/dstDomain.
        flow_scores = iter(self._flow_scores)
        next(flow_scores)

        if os.path.isfile(iploc_file):

            self._logger.info("Initializing geo localization component")
            geo = GeoLocalization(iploc_file, self._logger)

            src_ip_index = self._conf["flow_score_fields"]["srcIP"]
            dst_ip_index = self._conf["flow_score_fields"]["dstIP"]

            self._logger.info("Adding geo localization...")
            self._flow_scores = []
            for conn in flow_scores:

                # get geo localization for src ip
                self._logger.debug("Searching geo for src ip {0}".format(
                    conn[src_ip_index]))
                src_geo_dict = geo.get_ip_geo_localization(conn[src_ip_index])

                # get geo localization for dst ip.
                self._logger.debug("Searching geo for dst ip {0}".format(
                    conn[dst_ip_index]))
                dst_geo_dict = geo.get_ip_geo_localization(conn[dst_ip_index])

                # adding columns to the current connection list.
                conn.extend([
                    src_geo_dict["geo_loc"], dst_geo_dict["geo_loc"],
                    src_geo_dict["domain"], dst_geo_dict["domain"]
                ])
                self._flow_scores.extend([conn])
        else:

            self._flow_scores = [
                conn + ["", "", "", ""] for conn in flow_scores
            ]
            self._logger.info(
                "WARNING: IP location was not added because the file {0} does not exist."
                .format(iploc_file))

        self._flow_scores.insert(0, flow_headers)

    def _add_reputation(self):

        reputation_conf_file = "{0}/components/reputation/reputation_config.json".format(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

        # add new headers (srcIP_rep/dstIP_rep).
        self._logger.info("Adding reputation headers")
        flow_headers_rep = self._flow_scores[0]
        flow_headers_rep.extend(["srcIP_rep", "dstIP_rep"])

        # read configuration.
        self._logger.info("Reading reputation configuration file: {0}".format(
            reputation_conf_file))
        rep_conf = json.loads(open(reputation_conf_file).read())

        if "gti" in rep_conf and os.path.isfile(rep_conf['gti']['refclient']):
            rep_conf = rep_conf['gti']
            # initialize gti module.
            self._logger.info("Initializing GTI component")
            flow_gti = gti.Reputation(rep_conf, self._logger)

            # get all src ips.
            src_ip_index = self._conf["flow_score_fields"]["srcIP"]
            dst_ip_index = self._conf["flow_score_fields"]["dstIP"]

            self._logger.info("Getting GTI reputation for src IPs")
            flow_scores_src = iter(self._flow_scores)
            next(flow_scores_src)

            # getting reputation for src IPs
            src_ips = [conn[src_ip_index] for conn in flow_scores_src]
            src_rep_results = flow_gti.check(src_ips)

            self._logger.info("Getting GTI reputation for dst IPs")
            flow_scores_dst = iter(self._flow_scores)
            next(flow_scores_dst)

            # getting reputation for dst IPs
            dst_ips = [conn[dst_ip_index] for conn in flow_scores_dst]
            dst_rep_results = flow_gti.check(dst_ips)

            flow_scores_final = iter(self._flow_scores)
            next(flow_scores_final)

            self._flow_scores = []
            flow_scores = [
                conn + [src_rep_results[conn[src_ip_index]]] +
                [dst_rep_results[conn[dst_ip_index]]]
                for conn in flow_scores_final
            ]
            self._flow_scores = flow_scores

        else:
            # add empty values to srcIP_rep and dstIP_rep.
            flow_scores = iter(self._flow_scores)
            next(flow_scores)

            self._flow_scores = [conn + ["", ""] for conn in flow_scores]
            self._logger.info(
                "WARNING: IP reputation was not added. No refclient configured"
            )

        self._flow_scores.insert(0, flow_headers_rep)

    def _get_oa_details(self):

        self._logger.info("Getting OA Flow suspicious details/chord diagram")
        # start suspicious connects details process.
        p_sp = Process(target=self._get_suspicious_details)
        p_sp.start()

        # start chord diagram process.
        p_ch = Process(target=self._get_chord_details)
        p_ch.start()

        p_sp.join()
        p_ch.join()

    def _get_suspicious_details(self, bar=None):

        # skip header
        sp_connections = iter(self._flow_scores)
        next(sp_connections)

        # loop connections.
        connections_added = []
        for conn in sp_connections:

            # validate if the connection's details are not already extracted.
            if conn in connections_added:
                continue
            else:
                connections_added.append(conn)

            src_ip_index = self._conf["flow_score_fields"]["srcIP"]
            dst_ip_index = self._conf["flow_score_fields"]["dstIP"]

            # get src ip
            sip = conn[src_ip_index]
            # get dst ip
            dip = conn[dst_ip_index]

            # get hour and date (e.g. 2014-07-08 10:10:40)
            date_array = conn[1].split(' ')
            date_array_1 = date_array[0].split('-')
            date_array_2 = date_array[1].split(':')

            yr = date_array_1[0]
            dy = date_array_1[2]
            mh = date_array_1[1]

            hr = date_array_2[0]
            mm = date_array_2[1]

            # connection details query.
            sp_query = (
                "SELECT treceived as tstart,sip as srcip,dip as dstip,sport as sport,dport as dport,proto as proto,flag as flags,stos as TOS,ibyt as ibytes,ipkt as ipkts,input as input, output as output,rip as rip, obyt as obytes, opkt as opkts from {0}.{1} where ((sip='{2}' AND dip='{3}') or (sip='{3}' AND dip='{2}')) AND y={8} AND m={4} AND d={5} AND h={6} AND trminute={7} order by tstart limit 100"
            )

            # sp query.
            sp_query = sp_query.format(self._db, self._table_name, sip, dip,
                                       mh, dy, hr, mm, yr)

            # output file.
            edge_file = "{0}/edge-{1}-{2}-{3}-{4}.tsv".format(
                self._data_path, sip.replace(".", "_"), dip.replace(".", "_"),
                hr, mm)

            # execute query
            self._engine.query(sp_query,
                               output_file=edge_file,
                               delimiter="\\t")

    def _get_chord_details(self, bar=None):

        # skip header
        sp_connections = iter(self._flow_scores)
        next(sp_connections)

        src_ip_index = self._conf["flow_score_fields"]["srcIP"]
        dst_ip_index = self._conf["flow_score_fields"]["dstIP"]

        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        # get number of times each IP appears.
        srcdict = {}
        for conn in sp_connections:
            if conn[src_ip_index] in srcdict: srcdict[conn[src_ip_index]] += 1
            else: srcdict[conn[src_ip_index]] = 1
            if conn[dst_ip_index] in srcdict: srcdict[conn[dst_ip_index]] += 1
            else: srcdict[conn[dst_ip_index]] = 1

        for (ip, n) in srcdict.items():
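            # for each IP seen in more than one suspicious connection, collect its peer IPs and build a chord-diagram query.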
            if n > 1:
                ip_list = []
                sp_connections = iter(self._flow_scores)
                next(sp_connections)
                for row in sp_connections:
                    if ip == row[2]: ip_list.append(row[3])
                    if ip == row[3]: ip_list.append(row[2])
                ips = list(set(ip_list))

                if len(ips) > 1:
                    ips_filter = (",".join(
                        str("'{0}'".format(ip)) for ip in ips))
                    chord_file = "{0}/chord-{1}.tsv".format(
                        self._data_path, ip.replace(".", "_"))
                    ch_query = (
                        "SELECT sip as srcip, dip as dstip, SUM(ibyt) as ibytes, SUM(ipkt) as ipkts from {0}.{1} where y={2} and m={3} \
                        and d={4} and ( (sip='{5}' and dip IN({6})) or (sip IN({6}) and dip='{5}') ) group by sip,dip"
                    )
                    self._engine.query(ch_query.format(self._db,
                                                       self._table_name, yr,
                                                       mn, dy, ip, ips_filter),
                                       chord_file,
                                       delimiter="\\t")

    def _ingest_summary(self):

        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        # get ingest summary.
        ingest_summary_qry = (
            "SELECT tryear, trmonth, trday, trhour, trminute, COUNT(*) flows"
            " FROM {0}.flow "
            " WHERE "
            " y={1} "
            " AND m={2} "
            " AND d={3} "
            " AND unix_tstamp IS NOT NULL "
            " GROUP BY tryear, trmonth, trday, trhour, trminute;")

        ingest_summary_qry = ingest_summary_qry.format(self._db, yr, mn, dy)

        results_file = "{0}/results_{1}.csv".format(self._ingest_summary_path,
                                                    self._date)
        self._engine.query(ingest_summary_qry,
                           output_file=results_file,
                           delimiter=",")

        result_rows = []
        with open(results_file, 'rb') as rf:
            csv_reader = csv.reader(rf, delimiter=",")
            result_rows = list(csv_reader)

        result_rows = iter(result_rows)
        next(result_rows)

        ingest_summary_results = [["date", "flows"]]
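        # build one [timestamp, flow count] row per minute, zero-padding hour and minute.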
        ingest_summary_results.extend([[
            "{0}-{1}-{2} {3}:{4}".format(yr, mn, dy, row[3].zfill(2),
                                         row[4].zfill(2)), row[5]
        ] for row in result_rows])
        ingest_summay_file = "{0}/is_{1}{2}.csv".format(
            self._ingest_summary_path, yr, mn)

        write_format = 'a' if os.path.isfile(ingest_summay_file) else 'w+'
        with open(ingest_summay_file, write_format) as u_file:
            writer = csv.writer(u_file, quoting=csv.QUOTE_NONE, delimiter=",")
            writer.writerows(ingest_summary_results)

        rm_big_file = "rm {0}".format(results_file)
        os.remove(results_file)
Example #5
class OA(object):

    def __init__(self,date,limit=500,logger=None):

        self._initialize_members(date,limit,logger)

    def _initialize_members(self,date,limit,logger):

        # get logger if exists. if not, create new instance.
        self._logger = logging.getLogger('OA.PROXY') if logger else Util.get_logger('OA.PROXY',create_file=False)

        # initialize required parameters.
        self._scrtip_path = os.path.dirname(os.path.abspath(__file__))
        self._date = date
        self._table_name = "proxy"
        self._proxy_results = []
        self._limit = limit
        self._data_path = None
        self._ipynb_path = None
        self._ingest_summary_path = None
        self._proxy_scores = []
        self._proxy_scores_headers = []
        self._proxy_extra_columns = []
        self._results_delimiter = '\t'

        # get app configuration.
        self._oni_conf = Util.get_oni_conf()

        # get scores fields conf
        conf_file = "{0}/proxy_conf.json".format(self._scrtip_path)
        self._conf = json.loads(open (conf_file).read(),object_pairs_hook=OrderedDict)

        # initialize data engine
        self._db = self._oni_conf.get('conf','DBNAME').replace("'","").replace('"','')
        self._engine = Data(self._db, self._table_name,self._logger)


    def start(self):

        ####################
        start = time.time()
        ####################

        self._create_folder_structure()
        self._add_ipynb()
        self._get_proxy_results()
        self._add_reputation()
        self._add_severity()
        self._add_iana()
        self._add_network_context()
        self._add_hash()
        self._create_proxy_scores_csv()
        self._get_oa_details()


        ##################
        end = time.time()
        print(end - start)
        ##################

    def _create_folder_structure(self):

        # create date folder structure if it does not exist.
        self._logger.info("Creating folder structure for OA (data and ipynb)")
        self._data_path,self._ingest_summary_path,self._ipynb_path = Util.create_oa_folders("proxy",self._date)


    def _add_ipynb(self):

        if os.path.isdir(self._ipynb_path):

            self._logger.info("Adding edge investigation IPython Notebook")
            shutil.copy("{0}/ipynb_templates/Edge_Investigation_master.ipynb".format(self._scrtip_path),"{0}/Edge_Investigation.ipynb".format(self._ipynb_path))

            self._logger.info("Adding threat investigation IPython Notebook")
            shutil.copy("{0}/ipynb_templates/Threat_Investigation_master.ipynb".format(self._scrtip_path),"{0}/Threat_Investigation.ipynb".format(self._ipynb_path))

        else:
            self._logger.error("There was a problem adding the IPython Notebooks, please check the directory exists.")


    def _get_proxy_results(self):

        self._logger.info("Getting {0} Machine Learning Results from HDFS".format(self._date))
        proxy_results = "{0}/proxy_results.csv".format(self._data_path)

        # get hdfs path from conf file.
        HUSER = self._oni_conf.get('conf','HUSER').replace("'","").replace('"','')
        hdfs_path = "{0}/proxy/scored_results/{1}/scores/proxy_results.csv".format(HUSER,self._date)

        # get results file from hdfs.
        get_command = Util.get_ml_results_form_hdfs(hdfs_path,self._data_path)
        self._logger.info("{0}".format(get_command))

        # validate the results file exists
        if os.path.isfile(proxy_results):

            # read the number of results based on the limit specified.
            self._logger.info("Reading {0} proxy results file: {1}".format(self._date,proxy_results))
            self._proxy_results = Util.read_results(proxy_results,self._limit,self._results_delimiter)[:]
            if len(self._proxy_results) == 0:
                self._logger.error("There are no proxy results.")
                sys.exit(1)
        else:
            self._logger.error("There was an error getting ML results from HDFS")
            sys.exit(1)

        # add headers.
        self._logger.info("Adding headers")
        self._proxy_scores_headers = [  str(key) for (key,value) in self._conf['proxy_score_fields'].items() ]

        self._proxy_scores = self._proxy_results[:]


    def _create_proxy_scores_csv(self):

        proxy_scores_csv = "{0}/proxy_scores.tsv".format(self._data_path)
        proxy_scores_final = self._proxy_scores[:]
        proxy_scores_final.insert(0,self._proxy_scores_headers)
        Util.create_csv_file(proxy_scores_csv,proxy_scores_final, self._results_delimiter)

        # create bk file
        proxy_scores_bu_csv = "{0}/proxy_scores_bu.tsv".format(self._data_path)
        Util.create_csv_file(proxy_scores_bu_csv,proxy_scores_final, self._results_delimiter)


    def _add_reputation(self):

        # read configuration.
        reputation_conf_file = "{0}/components/reputation/reputation_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        self._logger.info("Reading reputation configuration file: {0}".format(reputation_conf_file))
        rep_conf = json.loads(open(reputation_conf_file).read())

        # initialize reputation services.
        self._rep_services = []
        self._logger.info("Initializing reputation services.")
        for service in rep_conf:
             config = rep_conf[service]
             module = __import__("components.reputation.{0}.{0}".format(service), fromlist=['Reputation'])
             self._rep_services.append(module.Reputation(config,self._logger))

        # get columns for reputation.
        rep_cols = {}
        indexes =  [ int(value) for key, value in self._conf["add_reputation"].items()]
        self._logger.info("Getting columns to add reputation based on config file: proxy_conf.json".format())
        for index in indexes:
            col_list = []
            for conn in self._proxy_scores:
                col_list.append(conn[index])
            rep_cols[index] = list(set(col_list))

        # get reputation per column.
        self._logger.info("Getting reputation for each service in config")
        rep_services_results = []
        for key,value in rep_cols.items():
            rep_services_results = [ rep_service.check(None,value,True) for rep_service in self._rep_services]
            rep_results = {}

            for result in rep_services_results:
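                # merge this service's results into rep_results, joining values for the same key with '::'.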
                rep_results = {k: "{0}::{1}".format(rep_results.get(k, ""), result.get(k, "")).strip('::') for k in set(rep_results) | set(result)}

            self._proxy_scores = [ conn + [ rep_results[conn[key]] ]   for conn in self._proxy_scores  ]

    def _add_severity(self):
        # Add severity column
        self._proxy_scores = [conn + [0] for conn in self._proxy_scores]


    def _add_iana(self):

        iana_conf_file = "{0}/components/iana/iana_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        if os.path.isfile(iana_conf_file):
            iana_config  = json.loads(open(iana_conf_file).read())
            proxy_iana = IanaTransform(iana_config["IANA"])
            proxy_rcode_index = self._conf["proxy_score_fields"]["respcode"]
            self._proxy_scores = [ conn + [ proxy_iana.get_name(conn[proxy_rcode_index],"proxy_http_rcode")] for conn in self._proxy_scores ]
        else:
            self._proxy_scores = [ conn + [""] for conn in self._proxy_scores ]


    def _add_network_context(self):

        nc_conf_file = "{0}/components/nc/nc_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        if os.path.isfile(nc_conf_file):
            nc_conf = json.loads(open(nc_conf_file).read())["NC"]
            proxy_nc = NetworkContext(nc_conf,self._logger)
            ip_dst_index = self._conf["proxy_score_fields"]["clientip"]
            self._proxy_scores = [ conn + [proxy_nc.get_nc(conn[ip_dst_index])] for conn in self._proxy_scores ]

        else:
            self._proxy_scores = [ conn + [""] for conn in self._proxy_scores ]


    def _add_hash(self):
        # A hash string is generated to be used as the file name for the edge files.
        # These fields are used for the hash creation, so this combination of values is treated as
        # a 'unique' connection.
        cip_index = self._conf["proxy_score_fields"]["clientip"]
        uri_index = self._conf["proxy_score_fields"]["fulluri"]
        tme_index = self._conf["proxy_score_fields"]["p_time"]

        self._proxy_scores = [conn + [str( md5.new(str(conn[cip_index]) + str(conn[uri_index])).hexdigest() + str((conn[tme_index].split(":"))[0]) )] for conn in self._proxy_scores]


    def _get_oa_details(self):

        self._logger.info("Getting OA Proxy suspicious details")
        # start suspicious connects details process.
        p_sp = Process(target=self._get_suspicious_details)
        p_sp.start()

        # p_sp.join()

    def _get_suspicious_details(self):
        hash_list = []
        iana_conf_file = "{0}/components/iana/iana_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        if os.path.isfile(iana_conf_file):
            iana_config  = json.loads(open(iana_conf_file).read())
            proxy_iana = IanaTransform(iana_config["IANA"])

        for conn in self._proxy_scores:
            conn_hash = conn[self._conf["proxy_score_fields"]["hash"]]
            if conn_hash not in hash_list:
                hash_list.append(conn_hash)
                clientip = conn[self._conf["proxy_score_fields"]["clientip"]]
                fulluri = conn[self._conf["proxy_score_fields"]["fulluri"]]
                date=conn[self._conf["proxy_score_fields"]["p_date"]].split('-')
                if len(date) == 3:
                    year=date[0]
                    month=date[1].zfill(2)
                    day=date[2].zfill(2)
                    hh=(conn[self._conf["proxy_score_fields"]["p_time"]].split(":"))[0]
                    self._get_proxy_details(fulluri,clientip,conn_hash,year,month,day,hh,proxy_iana)


    def _get_proxy_details(self,fulluri,clientip,conn_hash,year,month,day,hh,proxy_iana):

        limit = 250
        output_delimiter = '\t'
        edge_file ="{0}/edge-{1}-{2}.tsv".format(self._data_path,clientip,conn_hash)
        edge_tmp  ="{0}/edge-{1}-{2}.tmp".format(self._data_path,clientip,conn_hash)

        if not os.path.isfile(edge_file):
            proxy_qry = ("SELECT p_date, p_time, clientip, host, webcat, respcode, reqmethod, useragent, resconttype, \
                referer, uriport, serverip, scbytes, csbytes, fulluri FROM {0}.{1} WHERE y=\'{2}\' AND m=\'{3}\' AND d=\'{4}\' AND \
                h=\'{5}\' AND fulluri =\'{6}\' AND clientip = \'{7}\' LIMIT {8};").format(self._db,self._table_name, year,month,day,hh,fulluri,clientip,limit)

            # execute query
            self._engine.query(proxy_qry,edge_tmp,output_delimiter)
            # add IANA to results.
            self._logger.info("Adding IANA translation to details results")
            with open(edge_tmp) as proxy_details_csv:
                rows = csv.reader(proxy_details_csv, delimiter=output_delimiter,quotechar='"')
                next(proxy_details_csv)
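                # replace the respcode column (index 5) with its IANA name; all other columns are kept as is.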
                update_rows = [[conn[0]] + [conn[1]] + [conn[2]] + [conn[3]] + [conn[4]] + [proxy_iana.get_name(conn[5],"proxy_http_rcode") if proxy_iana else conn[5]] + [conn[6]] + [conn[7]] + [conn[8]] + [conn[9]] + [conn[10]] + [conn[11]] + [conn[12]] + [conn[13]] + [conn[14]] if len(conn) > 0 else [] for conn in rows]
                update_rows = filter(None, update_rows)
                header = ["p_date","p_time","clientip","host","webcat","respcode","reqmethod","useragent","resconttype","referer","uriport","serverip","scbytes","csbytes","fulluri"]
                update_rows.insert(0,header)

                # due to an issue with the output of the query, strip stray double quotes.
                update_rows = [ [ w.replace('"','') for w in l ] for l in update_rows ]

            # create edge file.
            self._logger.info("Creating edge file:{0}".format(edge_file))
            with open(edge_file,'wb') as proxy_details_edge:
                writer = csv.writer(proxy_details_edge, quoting=csv.QUOTE_NONE, delimiter=output_delimiter)
                if update_rows:
                    writer.writerows(update_rows)
                else:
                    shutil.copy(edge_tmp,edge_file)

            try:
                os.remove(edge_tmp)
            except OSError:
                pass
Example #6
class OA(object):
    def __init__(self, date, limit=500, logger=None):

        self._initialize_members(date, limit, logger)

    def _initialize_members(self, date, limit, logger):

        # get logger if exists. if not, create new instance.
        self._logger = logging.getLogger(
            'OA.DNS') if logger else Util.get_logger('OA.DNS',
                                                     create_file=False)

        # initialize required parameters.
        self._scrtip_path = os.path.dirname(os.path.abspath(__file__))
        self._date = date
        self._table_name = "dns"
        self._dns_results = []
        self._limit = limit
        self._data_path = None
        self._ipynb_path = None
        self._ingest_summary_path = None
        self._dns_scores = []
        self._dns_scores_headers = []
        self._results_delimiter = '\t'
        self._details_limit = 250

        # get app configuration.
        self._spot_conf = Util.get_spot_conf()

        # get scores fields conf
        conf_file = "{0}/dns_conf.json".format(self._scrtip_path)
        self._conf = json.loads(open(conf_file).read(),
                                object_pairs_hook=OrderedDict)

        # initialize data engine
        self._db = self._spot_conf.get('conf',
                                       'DBNAME').replace("'",
                                                         "").replace('"', '')
        self._engine = Data(self._db, self._table_name, self._logger)

    def start(self):

        ####################
        start = time.time()
        ####################

        self._create_folder_structure()
        self._add_ipynb()
        self._get_dns_results()
        self._add_tld_column()
        self._add_reputation()
        self._add_hh_and_severity()
        self._add_iana()
        self._add_network_context()
        self._create_dns_scores_csv()
        self._get_oa_details()
        self._ingest_summary()

        ##################
        end = time.time()
        print(end - start)
        ##################

    def _create_folder_structure(self):

        # create date folder structure if it does not exist.
        self._logger.info("Creating folder structure for OA (data and ipynb)")
        self._data_path, self._ingest_summary_path, self._ipynb_path = Util.create_oa_folders(
            "dns", self._date)

    def _add_ipynb(self):

        if os.path.isdir(self._ipynb_path):

            self._logger.info("Adding edge investigation IPython Notebook")
            shutil.copy(
                "{0}/ipynb_templates/Edge_Investigation_master.ipynb".format(
                    self._scrtip_path),
                "{0}/Edge_Investigation.ipynb".format(self._ipynb_path))

            self._logger.info("Adding threat investigation IPython Notebook")
            shutil.copy(
                "{0}/ipynb_templates/Threat_Investigation_master.ipynb".format(
                    self._scrtip_path),
                "{0}/Threat_Investigation.ipynb".format(self._ipynb_path))

        else:
            self._logger.error(
                "There was a problem adding the IPython Notebooks, please check the directory exists."
            )

    def _get_dns_results(self):

        self._logger.info(
            "Getting {0} Machine Learning Results from HDFS".format(
                self._date))
        dns_results = "{0}/dns_results.csv".format(self._data_path)

        # get hdfs path from conf file.
        HUSER = self._spot_conf.get('conf',
                                    'HUSER').replace("'", "").replace('"', '')
        hdfs_path = "{0}/dns/scored_results/{1}/scores/dns_results.csv".format(
            HUSER, self._date)

        # get results file from hdfs.
        get_command = Util.get_ml_results_form_hdfs(hdfs_path, self._data_path)
        self._logger.info("{0}".format(get_command))

        # validate the results file exists
        if os.path.isfile(dns_results):

            # read the number of results based on the limit specified.
            self._logger.info("Reading {0} dns results file: {1}".format(
                self._date, dns_results))
            self._dns_results = Util.read_results(dns_results, self._limit,
                                                  self._results_delimiter)[:]
            if len(self._dns_results) == 0:
                self._logger.error("There are not flow results.")
                sys.exit(1)

        else:
            self._logger.error(
                "There was an error getting ML results from HDFS")
            sys.exit(1)

        # add headers.
        self._logger.info("Adding headers")
        self._dns_scores_headers = [
            str(key)
            for (key, value) in self._conf['dns_score_fields'].items()
        ]

        # add dns content.
        self._dns_scores = [conn[:] for conn in self._dns_results][:]

    def _move_time_stamp(self, dns_data):
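        # move the timestamp (second column) of each row to the end of the row.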

        for dns in dns_data:
            time_stamp = dns[1]
            dns.remove(time_stamp)
            dns.append(time_stamp)

        return dns_data

    def _create_dns_scores_csv(self):

        dns_scores_csv = "{0}/dns_scores.csv".format(self._data_path)
        dns_scores_final = self._move_time_stamp(self._dns_scores)
        dns_scores_final.insert(0, self._dns_scores_headers)
        Util.create_csv_file(dns_scores_csv, dns_scores_final)

        # create bk file
        dns_scores_bu_csv = "{0}/dns_scores_bu.csv".format(self._data_path)
        Util.create_csv_file(dns_scores_bu_csv, dns_scores_final)

    def _add_tld_column(self):
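        # add a column with the top-level domain of the query name; prepend a scheme when it is missing so get_tld can parse it.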
        qry_name_col = self._conf['dns_results_fields']['dns_qry_name']
        self._dns_scores = [
            conn + [
                get_tld("http://" + str(conn[qry_name_col]),
                        fail_silently=True)
                if "http://" not in str(conn[qry_name_col]) else get_tld(
                    str(conn[qry_name_col]), fail_silently=True)
            ] for conn in self._dns_scores
        ]

    def _add_reputation(self):

        # read configuration.
        reputation_conf_file = "{0}/components/reputation/reputation_config.json".format(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        self._logger.info("Reading reputation configuration file: {0}".format(
            reputation_conf_file))
        rep_conf = json.loads(open(reputation_conf_file).read())

        # initialize reputation services.
        self._rep_services = []
        self._logger.info("Initializing reputation services.")
        for service in rep_conf:
            config = rep_conf[service]
            module = __import__(
                "components.reputation.{0}.{0}".format(service),
                fromlist=['Reputation'])
            self._rep_services.append(module.Reputation(config, self._logger))

        # get columns for reputation.
        rep_cols = {}
        indexes = [
            int(value) for key, value in self._conf["add_reputation"].items()
        ]
        self._logger.info(
            "Getting columns to add reputation based on config file: dns_conf.json")
        for index in indexes:
            col_list = []
            for conn in self._dns_scores:
                col_list.append(conn[index])
            rep_cols[index] = list(set(col_list))

        # get reputation per column.
        self._logger.info("Getting reputation for each service in config")
        rep_services_results = []

        if self._rep_services:
            for key, value in rep_cols.items():
                rep_services_results = [
                    rep_service.check(None, value)
                    for rep_service in self._rep_services
                ]
                rep_results = {}
                for result in rep_services_results:
                    rep_results = {
                        k: "{0}::{1}".format(rep_results.get(k, ""),
                                             result.get(k, "")).strip('::')
                        for k in set(rep_results) | set(result)
                    }

                self._dns_scores = [
                    conn + [rep_results[conn[key]]]
                    for conn in self._dns_scores
                ]
        else:
            self._dns_scores = [conn + [""] for conn in self._dns_scores]

    def _add_hh_and_severity(self):

        # add hh value and sev columns.
        dns_date_index = self._conf["dns_results_fields"]["frame_time"]
        self._dns_scores = [
            conn +
            [filter(None, conn[dns_date_index].split(" "))[3].split(":")[0]] +
            [0] + [0] for conn in self._dns_scores
        ]

    def _add_iana(self):

        iana_conf_file = "{0}/components/iana/iana_config.json".format(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        if os.path.isfile(iana_conf_file):
            iana_config = json.loads(open(iana_conf_file).read())
            dns_iana = IanaTransform(iana_config["IANA"])

            dns_qry_class_index = self._conf["dns_results_fields"][
                "dns_qry_class"]
            dns_qry_type_index = self._conf["dns_results_fields"][
                "dns_qry_type"]
            dns_qry_rcode_index = self._conf["dns_results_fields"][
                "dns_qry_rcode"]
            self._dns_scores = [
                conn + [
                    dns_iana.get_name(conn[dns_qry_class_index],
                                      "dns_qry_class")
                ] +
                [dns_iana.get_name(conn[dns_qry_type_index], "dns_qry_type")] +
                [
                    dns_iana.get_name(conn[dns_qry_rcode_index],
                                      "dns_qry_rcode")
                ] for conn in self._dns_scores
            ]

        else:
            self._dns_scores = [
                conn + ["", "", ""] for conn in self._dns_scores
            ]

    def _add_network_context(self):

        nc_conf_file = "{0}/components/nc/nc_config.json".format(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        if os.path.isfile(nc_conf_file):
            nc_conf = json.loads(open(nc_conf_file).read())["NC"]
            dns_nc = NetworkContext(nc_conf, self._logger)
            ip_dst_index = self._conf["dns_results_fields"]["ip_dst"]
            self._dns_scores = [
                conn + [dns_nc.get_nc(conn[ip_dst_index])]
                for conn in self._dns_scores
            ]
        else:
            self._dns_scores = [conn + [""] for conn in self._dns_scores]

    def _get_oa_details(self):

        self._logger.info("Getting OA DNS suspicious details/chord diagram")
        # start suspicious connects details process.
        p_sp = Process(target=self._get_suspicious_details)
        p_sp.start()

        # start chord diagram process.
        p_dn = Process(target=self._get_dns_dendrogram)
        p_dn.start()

        p_sp.join()
        p_dn.join()

    def _get_suspicious_details(self):

        iana_conf_file = "{0}/components/iana/iana_config.json".format(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        if os.path.isfile(iana_conf_file):
            iana_config = json.loads(open(iana_conf_file).read())
            dns_iana = IanaTransform(iana_config["IANA"])

        for conn in self._dns_scores:
            # get data to query
            date = conn[self._conf["dns_score_fields"]["frame_time"]].split(
                " ")
            date = filter(None, date)

            if len(date) == 5:
                year = date[2]
                month = datetime.datetime.strptime(date[0],
                                                   '%b').strftime('%m')
                day = date[1]
                hh = conn[self._conf["dns_score_fields"]["hh"]]
                dns_qry_name = conn[self._conf["dns_score_fields"]
                                    ["dns_qry_name"]]
                self._get_dns_details(dns_qry_name, year, month, day, hh,
                                      dns_iana)

    def _get_dns_details(self, dns_qry_name, year, month, day, hh, dns_iana):

        limit = self._details_limit
        edge_file = "{0}/edge-{1}_{2}_00.csv".format(
            self._data_path, dns_qry_name.replace("/", "-"), hh)
        edge_tmp = "{0}/edge-{1}_{2}_00.tmp".format(
            self._data_path, dns_qry_name.replace("/", "-"), hh)

        if not os.path.isfile(edge_file):

            dns_qry = (
                "SELECT frame_time,frame_len,ip_dst,ip_src,dns_qry_name,dns_qry_class,dns_qry_type,dns_qry_rcode,dns_a FROM {0}.{1} WHERE y={2} AND m={3} AND d={4} AND dns_qry_name LIKE '%{5}%' AND h={6} LIMIT {7};"
            ).format(self._db, self._table_name, year, month, day,
                     dns_qry_name, hh, limit)

            # execute query
            self._engine.query(dns_qry, edge_tmp)

            # add IANA to results.
            if dns_iana:
                update_rows = []
                self._logger.info("Adding IANA translation to details results")
                with open(edge_tmp) as dns_details_csv:
                    rows = csv.reader(dns_details_csv,
                                      delimiter=',',
                                      quotechar='|')
                    try:
                        next(rows)
                        update_rows = [
                            [conn[0]] + [conn[1]] + [conn[2]] + [conn[3]] +
                            [conn[4]] +
                            [dns_iana.get_name(conn[5], "dns_qry_class")] +
                            [dns_iana.get_name(conn[6], "dns_qry_type")] +
                            [dns_iana.get_name(conn[7], "dns_qry_rcode")] +
                            [conn[8]] for conn in rows
                        ]
                        update_rows = filter(None, update_rows)
                        header = [
                            "frame_time", "frame_len", "ip_dst", "ip_src",
                            "dns_qry_name", "dns_qry_class_name",
                            "dns_qry_type_name", "dns_qry_rcode_name", "dns_a"
                        ]
                        update_rows.insert(0, header)
                    except IndexError:
                        pass

            else:
                self._logger.info("WARNING: NO IANA configured.")

            # create edge file.
            self._logger.info("Creating edge file:{0}".format(edge_file))
            with open(edge_file, 'wb') as dns_details_edge:
                writer = csv.writer(dns_details_edge, quoting=csv.QUOTE_ALL)
                if update_rows:
                    writer.writerows(update_rows)
                else:
                    shutil.copy(edge_tmp, edge_file)

            try:
                os.remove(edge_tmp)
            except OSError:
                pass

    def _get_dns_dendrogram(self):
        limit = self._details_limit
        for conn in self._dns_scores:
            date = conn[self._conf["dns_score_fields"]["frame_time"]].split(
                " ")
            date = filter(None, date)

            if len(date) == 5:
                year = date[2]
                month = datetime.datetime.strptime(date[0],
                                                   '%b').strftime('%m')
                day = date[1]
                ip_dst = conn[self._conf["dns_score_fields"]["ip_dst"]]
                self._get_dendro(self._db, self._table_name, ip_dst, year,
                                 month, day, limit)

    def _get_dendro(self, db, table, ip_dst, year, month, day, limit):

        dendro_file = "{0}/dendro-{1}.csv".format(self._data_path, ip_dst)
        if not os.path.isfile(dendro_file):
            dndro_qry = (
                "SELECT dns_a, dns_qry_name, ip_dst FROM (SELECT susp.ip_dst, susp.dns_qry_name, susp.dns_a FROM {0}.{1} as susp WHERE susp.y={2} AND susp.m={3} AND susp.d={4} AND susp.ip_dst='{5}' LIMIT {6}) AS tmp GROUP BY dns_a, dns_qry_name, ip_dst"
            ).format(db, table, year, month, day, ip_dst, limit)
            # execute query
            self._engine.query(dndro_qry, dendro_file)

    def _ingest_summary(self):
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        self._logger.info("Getting ingest summary data for the day")

        ingest_summary_cols = ["date", "total"]
        result_rows = []
        df_filtered = pd.DataFrame()

        ingest_summary_file = "{0}/is_{1}{2}.csv".format(
            self._ingest_summary_path, yr, mn)
        ingest_summary_tmp = "{0}.tmp".format(ingest_summary_file)

        if os.path.isfile(ingest_summary_file):
            df = pd.read_csv(ingest_summary_file, delimiter=',')
            #discards previous rows from the same date
            df_filtered = df[df['date'].str.contains("{0}-{1}-{2}".format(
                yr, mn, dy)) == False]
        else:
            df = pd.DataFrame()

        # get ingest summary.
        ingest_summary_qry = (
            "SELECT frame_time, COUNT(*) as total "
            " FROM {0}.{1}"
            " WHERE y={2} AND m={3} AND d={4} "
            " AND unix_tstamp IS NOT NULL AND frame_time IS NOT NULL"
            " AND frame_len IS NOT NULL AND dns_qry_name IS NOT NULL"
            " AND ip_src IS NOT NULL "
            " AND (dns_qry_class IS NOT NULL AND dns_qry_type IS NOT NULL AND dns_qry_rcode IS NOT NULL ) "
            " GROUP BY frame_time;")

        ingest_summary_qry = ingest_summary_qry.format(self._db,
                                                       self._table_name, yr,
                                                       mn, dy)

        results_file = "{0}/results_{1}.csv".format(self._ingest_summary_path,
                                                    self._date)
        self._engine.query(ingest_summary_qry,
                           output_file=results_file,
                           delimiter=",")

        if os.path.isfile(results_file):
            df_results = pd.read_csv(results_file, delimiter=',')

            # Forms a new dataframe splitting the minutes from the time column
            df_new = pd.DataFrame([[
                "{0}-{1}-{2} {3}:{4}".format(
                    yr, mn, dy,
                    val['frame_time'].split(" ")[3].split(":")[0].zfill(2),
                    val['frame_time'].split(" ")[3].split(":")[1].zfill(2)),
                int(val['total']) if not math.isnan(val['total']) else 0
            ] for key, val in df_results.iterrows()],
                                  columns=ingest_summary_cols)

            #Groups the data by minute
            sf = df_new.groupby(by=['date'])['total'].sum()

            df_per_min = pd.DataFrame({'date': sf.index, 'total': sf.values})

            df_final = df_filtered.append(df_per_min, ignore_index=True)
            df_final.to_csv(ingest_summary_tmp, sep=',', index=False)

            os.remove(results_file)
            os.rename(ingest_summary_tmp, ingest_summary_file)
        else:
            self._logger.info("No data found for the ingest summary")
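
The per-minute rollup done in _ingest_summary above is easier to follow in isolation. The following is a minimal, standalone sketch of the same aggregation; the sample frame_time rows and totals are hypothetical, but the string splitting and pandas grouping mirror the code above.

# Standalone sketch of the per-minute aggregation performed in _ingest_summary.
# The sample rows are hypothetical; frame_time follows the "Mon DD YYYY HH:MM:SS..."
# layout assumed by the split(" ")[3] indexing above.
import pandas as pd

yr, mn, dy = "2016", "03", "30"
df_results = pd.DataFrame({
    "frame_time": ["Mar 30 2016 03:07:55.000", "Mar 30 2016 03:07:59.000",
                   "Mar 30 2016 03:08:01.000"],
    "total": [10, 5, 7],
})

# rebuild a "YYYY-MM-DD HH:MM" key from the raw frame_time string.
rows = [["{0}-{1}-{2} {3}:{4}".format(
            yr, mn, dy,
            val["frame_time"].split(" ")[3].split(":")[0].zfill(2),
            val["frame_time"].split(" ")[3].split(":")[1].zfill(2)),
         int(val["total"])]
        for _, val in df_results.iterrows()]

df_new = pd.DataFrame(rows, columns=["date", "total"])
print(df_new.groupby("date")["total"].sum())
# 2016-03-30 03:07    15
# 2016-03-30 03:08     7
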
Example #7
0
class OA(object):
    
    def __init__(self,date,limit=500,logger=None):

        self._initialize_members(date,limit,logger)

    def _initialize_members(self,date,limit,logger):
        
        # get logger if exists. if not, create new instance.
        self._logger = logging.getLogger('OA.DNS') if logger else Util.get_logger('OA.DNS',create_file=False)

        # initialize required parameters.
        self._scrtip_path = os.path.dirname(os.path.abspath(__file__))
        self._date = date
        self._table_name = "dns"
        self._dns_results = []
        self._limit = limit
        self._data_path = None
        self._ipynb_path = None
        self._ingest_summary_path = None
        self._dns_scores = []
        self._dns_scores_headers = []
        self._results_delimiter = '\t'

        # get app configuration.
        self._oni_conf = Util.get_oni_conf()

        # get scores fields conf
        conf_file = "{0}/dns_conf.json".format(self._scrtip_path)
        self._conf = json.loads(open (conf_file).read(),object_pairs_hook=OrderedDict)

        # initialize data engine
        self._db = self._oni_conf.get('conf','DBNAME').replace("'","").replace('"','') 
        self._engine = Data(self._db,self._table_name ,self._logger)


    def start(self):

        ####################
        start = time.time()
        ####################

        self._create_folder_structure()
        self._add_ipynb()
        self._get_dns_results()
        self._add_reputation()
        self._add_hh_and_severity()
        self._add_iana()
        self._add_network_context()
        self._create_dns_scores_csv()
        self._get_oa_details()

        ##################
        end = time.time()
        print(end - start)
        ##################

    def _create_folder_structure(self):

        # create date folder structure if it does not exist.
        self._logger.info("Creating folder structure for OA (data and ipynb)")       
        self._data_path,self._ingest_summary_path,self._ipynb_path = Util.create_oa_folders("dns",self._date)
    
    def _add_ipynb(self):

        if os.path.isdir(self._ipynb_path):

            self._logger.info("Adding edge investigation IPython Notebook")
            shutil.copy("{0}/ipynb_templates/Edge_Investigation_master.ipynb".format(self._scrtip_path),"{0}/Edge_Investigation.ipynb".format(self._ipynb_path))

            self._logger.info("Adding threat investigation IPython Notebook")
            shutil.copy("{0}/ipynb_templates/Threat_Investigation_master.ipynb".format(self._scrtip_path),"{0}/Threat_Investigation.ipynb".format(self._ipynb_path))

        else:
            self._logger.error("There was a problem adding the IPython Notebooks, please check the directory exists.")


    def _get_dns_results(self):

        self._logger.info("Getting {0} Machine Learning Results from HDFS".format(self._date))
        dns_results = "{0}/dns_results.csv".format(self._data_path)

        # get hdfs path from conf file.
        HUSER = self._oni_conf.get('conf','HUSER').replace("'","").replace('"','')   
        hdfs_path = "{0}/dns/scored_results/{1}/scores/dns_results.csv".format(HUSER,self._date)

        # get results file from hdfs.
        get_command = Util.get_ml_results_form_hdfs(hdfs_path,self._data_path)
        self._logger.info("{0}".format(get_command))

        # validate file exists
        if os.path.isfile(dns_results):

            # read number of results based on the limit specified.
            self._logger.info("Reading {0} dns results file: {1}".format(self._date,dns_results))
            self._dns_results = Util.read_results(dns_results,self._limit,self._results_delimiter)[:]
            if len(self._dns_results) == 0:
                self._logger.error("There are no DNS results.")
                sys.exit(1)

        else:
            self._logger.error("There was an error getting ML results from HDFS")
            sys.exit(1)

        # add headers.        
        self._logger.info("Adding headers")
        self._dns_scores_headers = [  str(key) for (key,value) in self._conf['dns_score_fields'].items() ]

        # add dns content.
        self._dns_scores = [ conn[:]  for conn in self._dns_results][:]       

    def _move_time_stamp(self,dns_data):
        
        for dns in dns_data:
            time_stamp = dns[1]
            dns.remove(time_stamp)
            dns.append(time_stamp)
        
        return dns_data        

    def _create_dns_scores_csv(self):
        
        dns_scores_csv = "{0}/dns_scores.csv".format(self._data_path)
        dns_scores_final =  self._move_time_stamp(self._dns_scores)
        dns_scores_final.insert(0,self._dns_scores_headers)
        Util.create_csv_file(dns_scores_csv,dns_scores_final)   

        # create bk file
        dns_scores_bu_csv = "{0}/dns_scores_bu.csv".format(self._data_path)
        Util.create_csv_file(dns_scores_bu_csv,dns_scores_final)  
  
    def _add_reputation(self):

        # read configuration.
        reputation_conf_file = "{0}/components/reputation/reputation_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        self._logger.info("Reading reputation configuration file: {0}".format(reputation_conf_file))
        rep_conf = json.loads(open(reputation_conf_file).read())
       
        # initialize reputation services.
        self._rep_services = []
        self._logger.info("Initializing reputation services.")
        for service in rep_conf:               
            config = rep_conf[service]
            module = __import__("components.reputation.{0}.{0}".format(service), fromlist=['Reputation'])
            self._rep_services.append(module.Reputation(config,self._logger))
                
        # get columns for reputation.
        rep_cols = {}
        indexes =  [ int(value) for key, value in self._conf["add_reputation"].items()]  
        self._logger.info("Getting columns to add reputation based on config file: dns_conf.json".format())
        for index in indexes:
            col_list = []
            for conn in self._dns_scores:
                col_list.append(conn[index])            
            rep_cols[index] = list(set(col_list))

        # get reputation per column.
        self._logger.info("Getting reputation for each service in config")        
        rep_services_results = []
        for key,value in rep_cols.items():
            rep_services_results = [ rep_service.check(None,value) for rep_service in self._rep_services]
            rep_results = {}            
            for result in rep_services_results:            
                rep_results = {k: "{0}::{1}".format(rep_results.get(k, ""), result.get(k, "")).strip('::') for k in set(rep_results) | set(result)}

            self._dns_scores = [ conn + [ rep_results[conn[key]] ]   for conn in self._dns_scores  ]

        

    def _add_hh_and_severity(self):

        # add hh value and sev columns.
        dns_date_index = self._conf["dns_results_fields"]["frame_time"]
        self._dns_scores = [conn + [ filter(None,conn[dns_date_index].split(" "))[3].split(":")[0]] + [0] + [0] for conn in self._dns_scores  ]


    def _add_iana(self):

        iana_conf_file = "{0}/components/iana/iana_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        if os.path.isfile(iana_conf_file):
            iana_config  = json.loads(open(iana_conf_file).read())
            dns_iana = IanaTransform(iana_config["IANA"])

            dns_qry_class_index = self._conf["dns_results_fields"]["dns_qry_class"]
            dns_qry_type_index = self._conf["dns_results_fields"]["dns_qry_type"]
            dns_qry_rcode_index = self._conf["dns_results_fields"]["dns_qry_rcode"]
            self._dns_scores = [ conn + [ dns_iana.get_name(conn[dns_qry_class_index],"dns_qry_class")] + [dns_iana.get_name(conn[dns_qry_type_index],"dns_qry_type")] + [ dns_iana.get_name(conn[dns_qry_rcode_index],"dns_qry_rcode") ] for conn in self._dns_scores ]
            
        else:            
            self._dns_scores = [ conn + ["","",""] for conn in self._dns_scores ] 

    def _add_network_context(self):

        nc_conf_file = "{0}/components/nc/nc_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        if os.path.isfile(nc_conf_file):
            nc_conf = json.loads(open(nc_conf_file).read())["NC"]
            dns_nc = NetworkContext(nc_conf,self._logger)
            ip_dst_index = self._conf["dns_results_fields"]["ip_dst"]
            self._dns_scores = [ conn + [dns_nc.get_nc(conn[ip_dst_index])] for conn in self._dns_scores ]
        else:
            self._dns_scores = [ conn + [""] for conn in self._dns_scores ]

   
    def _get_oa_details(self):
        
        self._logger.info("Getting OA DNS suspicious details/chord diagram")       
        # start suspicious connects details process.
        p_sp = Process(target=self._get_suspicious_details)
        p_sp.start()        

        # start chord diagram process.            
        p_dn = Process(target=self._get_dns_dendrogram)
        p_dn.start()

        p_sp.join()
        p_dn.join()

    def _get_suspicious_details(self):

        iana_conf_file = "{0}/components/iana/iana_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        if os.path.isfile(iana_conf_file):
            iana_config  = json.loads(open(iana_conf_file).read())
            dns_iana = IanaTransform(iana_config["IANA"])
        
        for conn in self._dns_scores:
            # get data to query
            date=conn[self._conf["dns_score_fields"]["frame_time"]].split(" ")
            date = filter(None,date)

            if len(date) == 5:
                year=date[2]
                month=datetime.datetime.strptime(date[0], '%b').strftime('%m')
                day=date[1]                
                hh=conn[self._conf["dns_score_fields"]["hh"]]
                dns_qry_name = conn[self._conf["dns_score_fields"]["dns_qry_name"]]
                self._get_dns_details(dns_qry_name,year,month,day,hh,dns_iana)

    def _get_dns_details(self,dns_qry_name,year,month,day,hh,dns_iana):
                    
        limit = 250
        edge_file ="{0}/edge-{1}_{2}_00.csv".format(self._data_path,dns_qry_name.replace("/","-"),hh)
        edge_tmp  ="{0}/edge-{1}_{2}_00.tmp".format(self._data_path,dns_qry_name.replace("/","-"),hh)

        if not os.path.isfile(edge_file):
    
            dns_qry = ("SELECT frame_time,frame_len,ip_dst,ip_src,dns_qry_name,dns_qry_class,dns_qry_type,dns_qry_rcode,dns_a FROM {0}.{1} WHERE y={2} AND m={3} AND d={4} AND dns_qry_name LIKE '%{5}%' AND h={6} LIMIT {7};").format(self._db,self._table_name,year,month,day,dns_qry_name,hh,limit)
            
            # execute query
            self._engine.query(dns_qry,edge_tmp)
 
            # add IANA to results.
            if dns_iana:
                update_rows = []
                self._logger.info("Adding IANA translation to details results")
                with open(edge_tmp) as dns_details_csv:
                    rows = csv.reader(dns_details_csv, delimiter=',', quotechar='|')
                    try:
                        next(rows)
                        update_rows = [[conn[0]] + [conn[1]] + [conn[2]] + [conn[3]] + [conn[4]] + [dns_iana.get_name(conn[5],"dns_qry_class")] + [dns_iana.get_name(conn[6],"dns_qry_type")] + [dns_iana.get_name(conn[7],"dns_qry_rcode")] + [conn[8]] for conn in rows]
                        update_rows = filter(None, update_rows)
                        header = [ "frame_time", "frame_len", "ip_dst","ip_src","dns_qry_name","dns_qry_class_name","dns_qry_type_name","dns_qry_rcode_name","dns_a" ]
                        update_rows.insert(0,header)
                    except IndexError:
                        pass

            else:
                self._logger.info("WARNING: NO IANA configured.")

            # create edge file.
            self._logger.info("Creating edge file:{0}".format(edge_file))
            with open(edge_file,'wb') as dns_details_edge:
                writer = csv.writer(dns_details_edge, quoting=csv.QUOTE_ALL)
                if update_rows:
                    writer.writerows(update_rows)
                else:            
                    shutil.copy(edge_tmp,edge_file)           
            
            try:
                os.remove(edge_tmp)
            except OSError:
                pass
           

    def _get_dns_dendrogram(self):
       
        
        for conn in self._dns_scores:            
            date=conn[self._conf["dns_score_fields"]["frame_time"]].split(" ")
            date = filter(None,date)

            if len(date) == 5:
                year=date[2]
                month=datetime.datetime.strptime(date[0], '%b').strftime('%m')
                day=date[1]
                ip_dst=conn[self._conf["dns_score_fields"]["ip_dst"]]
                self._get_dendro(self._db,self._table_name,ip_dst,year,month,day)


    def _get_dendro(self,db,table,ip_dst,year,month,day):

        dendro_file = "{0}/dendro-{1}.csv".format(self._data_path,ip_dst)
        if not os.path.isfile(dendro_file):
            dndro_qry = ("SELECT dns_a, dns_qry_name, ip_dst FROM (SELECT susp.ip_dst, susp.dns_qry_name, susp.dns_a FROM {0}.{1} as susp WHERE susp.y={2} AND susp.m={3} AND susp.d={4} AND susp.ip_dst='{5}' ) AS tmp GROUP BY dns_a, dns_qry_name, ip_dst").format(db,table,year,month,day,ip_dst)

            # execute query
            self._engine.query(dndro_qry,dendro_file)
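
One step in _add_reputation above that is easy to misread is the dictionary merge that joins the answers from several reputation services into a single "::"-separated string per queried value. The snippet below isolates that merge; the two input dictionaries are hypothetical service outputs, and only the dict comprehension is taken from the code above.

# Standalone sketch of the reputation-merge step used in _add_reputation.
# The two dictionaries stand in for the output of two hypothetical
# reputation services (i.e. what rep_service.check(None, values) would return).
gti_result = {"bad.example.com": "gti:HIGH", "ok.example.org": "gti:LOW"}
fb_result  = {"bad.example.com": "fb:MALICIOUS"}

rep_results = {}
for result in [gti_result, fb_result]:
    # union of keys; values collected so far are kept and new ones appended with "::".
    rep_results = {k: "{0}::{1}".format(rep_results.get(k, ""),
                                        result.get(k, "")).strip('::')
                   for k in set(rep_results) | set(result)}

print(rep_results["bad.example.com"])   # gti:HIGH::fb:MALICIOUS
print(rep_results["ok.example.org"])    # gti:LOW
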
Example #8
0
class OA(object):
    
    def __init__(self,date,limit=500,logger=None):

        self._initialize_members(date,limit,logger)

    def _initialize_members(self,date,limit,logger):
        
        # get logger if exists. if not, create new instance.
        self._logger = logging.getLogger('OA.DNS') if logger else Util.get_logger('OA.DNS',create_file=False)

        # initialize required parameters.
        self._scrtip_path = os.path.dirname(os.path.abspath(__file__))
        self._date = date
        self._table_name = "dns"
        self._dns_results = []
        self._limit = limit
        self._data_path = None
        self._ipynb_path = None
        self._ingest_summary_path = None
        self._dns_scores = []
        self._dns_scores_headers = []

        # get app configuration.
        self._oni_conf = Util.get_oni_conf()

        # get scores fields conf
        conf_file = "{0}/dns_conf.json".format(self._scrtip_path)
        self._conf = json.loads(open (conf_file).read(),object_pairs_hook=OrderedDict)

        # initialize data engine
        self._db = self._oni_conf.get('conf','DBNAME').replace("'","").replace('"','') 
        self._engine = Data(self._db,self._table_name ,self._logger)


    def start(self):

        ####################
        start = time.time()
        ####################

        self._create_folder_structure()
        self._add_ipynb()
        self._get_dns_results()
        self._add_reputation()
        self._add_hh_and_severity()
        self._add_iana()
        self._add_network_context()
        self._create_dns_scores_csv()
        self._get_oa_details()

        ##################
        end = time.time()
        print(end - start)
        ##################

    def _create_folder_structure(self):

        # create date folder structure if it does not exist.
        self._logger.info("Creating folder structure for OA (data and ipynb)")       
        self._data_path,self._ingest_summary_path,self._ipynb_path = Util.create_oa_folders("dns",self._date)
    
    def _add_ipynb(self):

        if os.path.isdir(self._ipynb_path):

            self._logger.info("Adding edge investigation IPython Notebook")
            shutil.copy("{0}/ipynb_templates/Edge_Investigation_master.ipynb".format(self._scrtip_path),"{0}/Edge_Investigation.ipynb".format(self._ipynb_path))

            self._logger.info("Adding threat investigation IPython Notebook")
            shutil.copy("{0}/ipynb_templates/Threat_Investigation_master.ipynb".format(self._scrtip_path),"{0}/Threat_Investigation.ipynb".format(self._ipynb_path))

        else:
            self._logger.error("There was a problem adding the IPython Notebooks, please check the directory exists.")


    def _get_dns_results(self):

        self._logger.info("Getting {0} Machine Learning Results from HDFS".format(self._date))
        dns_results = "{0}/dns_results.csv".format(self._data_path)

        # get hdfs path from conf file.
        HUSER = self._oni_conf.get('conf','HUSER').replace("'","").replace('"','')   
        hdfs_path = "{0}/dns/scored_results/{1}/scores/dns_results.csv".format(HUSER,self._date)

        # get results file from hdfs.
        get_command = Util.get_ml_results_form_hdfs(hdfs_path,self._data_path)
        self._logger.info("{0}".format(get_command))

        # validate file exists
        if os.path.isfile(dns_results):

            # read number of results based on the limit specified.
            self._logger.info("Reading {0} dns results file: {1}".format(self._date,dns_results))
            self._dns_results = Util.read_results(dns_results,self._limit)[:]
            if len(self._dns_results) == 0:
                self._logger.error("There are no DNS results.")
                sys.exit(1)

        else:
            self._logger.error("There was an error getting ML results from HDFS")
            sys.exit(1)

        # add headers.        
        self._logger.info("Adding headers")
        self._dns_scores_headers = [  str(key) for (key,value) in self._conf['dns_score_fields'].items() ]

        # add dns content.
        self._dns_scores = [ conn[:]  for conn in self._dns_results][:]       

    def _move_time_stamp(self,dns_data):
        
        for dns in dns_data:
            time_stamp = dns[1]
            dns.remove(time_stamp)
            dns.append(time_stamp)
        
        return dns_data        

    def _create_dns_scores_csv(self):
        
        dns_scores_csv = "{0}/dns_scores.csv".format(self._data_path)
        dns_scores_final =  self._move_time_stamp(self._dns_scores)
        dns_scores_final.insert(0,self._dns_scores_headers)
        Util.create_csv_file(dns_scores_csv,dns_scores_final)   

        # create bk file
        dns_scores_bu_csv = "{0}/dns_scores_bu.csv".format(self._data_path)
        Util.create_csv_file(dns_scores_bu_csv,dns_scores_final)  
  
    def _add_reputation(self):

        # read configuration.
        reputation_conf_file = "{0}/components/reputation/reputation_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        self._logger.info("Reading reputation configuration file: {0}".format(reputation_conf_file))
        rep_conf = json.loads(open(reputation_conf_file).read())
       
        # initialize reputation services.
        self._rep_services = []
        self._logger.info("Initializing reputation services.")
        for service in rep_conf:               
             config = rep_conf[service]
             module = __import__("components.reputation.{0}.{0}".format(service), fromlist=['Reputation'])
             self._rep_services.append(module.Reputation(config,self._logger))
                
        # get columns for reputation.
        rep_cols = {}
        indexes =  [ int(value) for key, value in self._conf["add_reputation"].items()]  
        self._logger.info("Getting columns to add reputation based on config file: dns_conf.json".format())
        for index in indexes:
            col_list = []
            for conn in self._dns_scores:
                col_list.append(conn[index])            
            rep_cols[index] = list(set(col_list))

        # get reputation per column.
        self._logger.info("Getting reputation for each service in config")        
        rep_services_results = []
        for key,value in rep_cols.items():
            rep_services_results = [ rep_service.check(None,value) for rep_service in self._rep_services]
            rep_results = {}            
            for result in rep_services_results:            
                rep_results = {k: "{0}::{1}".format(rep_results.get(k, ""), result.get(k, "")).strip('::') for k in set(rep_results) | set(result)}

            self._dns_scores = [ conn + [ rep_results[conn[key]] ]   for conn in self._dns_scores  ]

        

    def _add_hh_and_severity(self):

        # add hh value and sev columns.
        dns_date_index = self._conf["dns_results_fields"]["frame_time"]
        self._dns_scores = [conn + [ filter(None,conn[dns_date_index].split(" "))[3].split(":")[0]] + [0] + [0] for conn in self._dns_scores  ]

    def _add_iana(self):

        iana_conf_file = "{0}/components/iana/iana_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        if os.path.isfile(iana_conf_file):
            iana_config  = json.loads(open(iana_conf_file).read())
            dns_iana = IanaTransform(iana_config["IANA"])

            dns_qry_class_index = self._conf["dns_results_fields"]["dns_qry_class"]
            dns_qry_type_index = self._conf["dns_results_fields"]["dns_qry_type"]
            dns_qry_rcode_index = self._conf["dns_results_fields"]["dns_qry_rcode"]
            self._dns_scores = [ conn + [ dns_iana.get_name(conn[dns_qry_class_index],"dns_qry_class")] + [dns_iana.get_name(conn[dns_qry_type_index],"dns_qry_type")] + [ dns_iana.get_name(conn[dns_qry_rcode_index],"dns_qry_rcode") ] for conn in self._dns_scores ]
            
        else:            
            self._dns_scores = [ conn + ["","",""] for conn in self._dns_scores ] 

    def _add_network_context(self):

        nc_conf_file = "{0}/components/nc/nc_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        if os.path.isfile(nc_conf_file):
            nc_conf = json.loads(open(nc_conf_file).read())["NC"]
            dns_nc = NetworkContext(nc_conf,self._logger)
            ip_dst_index = self._conf["dns_results_fields"]["ip_dst"]
            self._dns_scores = [ conn + [dns_nc.get_nc(conn[ip_dst_index])] for conn in self._dns_scores ]

        else:
            self._dns_scores = [ conn + [""] for conn in self._dns_scores ]

   
    def _get_oa_details(self):
        
        self._logger.info("Getting OA DNS suspicious details/chord diagram")       
        # start suspicious connects details process.
        p_sp = Process(target=self._get_suspicious_details)
        p_sp.start()        

        # start chord diagram process.            
        p_dn = Process(target=self._get_dns_dendrogram)
        p_dn.start()

        p_sp.join()
        p_dn.join()

    def _get_suspicious_details(self):

        iana_conf_file = "{0}/components/iana/iana_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        if os.path.isfile(iana_conf_file):
            iana_config  = json.loads(open(iana_conf_file).read())
            dns_iana = IanaTransform(iana_config["IANA"])
        
        for conn in self._dns_scores:
            # get data to query
            date=conn[self._conf["dns_score_fields"]["frame_time"]].split(" ")
            date = filter(None,date)

            if len(date) == 5:
                year=date[2]
                month=datetime.datetime.strptime(date[0], '%b').strftime('%m')
                day=date[1]                
                hh=conn[self._conf["dns_score_fields"]["hh"]]
                dns_qry_name = conn[self._conf["dns_score_fields"]["dns_qry_name"]]
                self._get_dns_details(dns_qry_name,year,month,day,hh,dns_iana)

    def _get_dns_details(self,dns_qry_name,year,month,day,hh,dns_iana):
                    
        limit = 250
        edge_file ="{0}/edge-{1}_{2}_00.csv".format(self._data_path,dns_qry_name.replace("/","-"),hh)
        edge_tmp  ="{0}/edge-{1}_{2}_00.tmp".format(self._data_path,dns_qry_name.replace("/","-"),hh)

        if not os.path.isfile(edge_file):
    
            dns_qry = ("SELECT frame_time,frame_len,ip_dst,ip_src,dns_qry_name,dns_qry_class,dns_qry_type,dns_qry_rcode,dns_a FROM {0}.{1} WHERE y={2} AND m={3} AND d={4} AND dns_qry_name LIKE '%{5}%' AND h={6} LIMIT {7};").format(self._db,self._table_name,year,month,day,dns_qry_name,hh,limit)
            
            # execute query
            self._engine.query(dns_qry,edge_tmp)
 
            # add IANA to results.
            update_rows = []
            if dns_iana:
                self._logger.info("Adding IANA translation to details results")
                with open(edge_tmp) as dns_details_csv:
                    rows = csv.reader(dns_details_csv, delimiter=',', quotechar='|')
                    next(rows)
                    update_rows = [[conn[0]] + [conn[1]] + [conn[2]] + [conn[3]] + [conn[4]] + [dns_iana.get_name(conn[5],"dns_qry_class")] + [dns_iana.get_name(conn[6],"dns_qry_type")] + [dns_iana.get_name(conn[7],"dns_qry_rcode")] + [conn[8]] for conn in rows]
                    update_rows = filter(None, update_rows)
                    header = [ "frame_time", "frame_len", "ip_dst","ip_src","dns_qry_name","dns_qry_class_name","dns_qry_type_name","dns_qry_rcode_name","dns_a" ]
                    update_rows.insert(0,header)
            else:
                self._logger.info("WARNING: NO IANA configured.")

            # create edge file.
            self._logger.info("Creating edge file:{0}".format(edge_file))
            with open(edge_file,'wb') as dns_details_edge:
                writer = csv.writer(dns_details_edge, quoting=csv.QUOTE_ALL)
                if update_rows:
                    writer.writerows(update_rows)
                else:            
                    shutil.copy(edge_tmp,edge_file)           
            
            try:
                os.remove(edge_tmp)
            except OSError:
                pass
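
For context, here is a minimal driver sketch showing how the OA class in these examples is typically exercised. The module path dns_oa and the YYYYMMDD date string are assumptions based on the constructor signature and the slicing in _ingest_summary, not a confirmed entry point.

# Hypothetical driver for the OA class shown in these examples.
# The import path is an assumption; the date format (YYYYMMDD) matches the
# slicing done in _ingest_summary (self._date[:4], [4:6], [6:]).
import logging
from dns_oa import OA   # assumed module name

logging.basicConfig(level=logging.INFO)
oa = OA(date="20160330", limit=500, logger=logging.getLogger("OA.DNS"))
oa.start()   # folder structure, ML results, reputation, IANA, scores CSV, details
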