def _gather_all_features(self, cur_rec_dict):
        """
        Set the ip_sieve log equal to the cur ip's history and 
        compute features  from feature list for that ip only
        """
        #check for too much memory consumption
        if (self._log_rec_counter == self.MAX_LOG_DB_SIZE):
            oldest_rec = self._ip_log_db.popitem(last=False)
            self._log_rec_counter -= (len(oldest_rec[1]) - 1)
        else:
            self._log_rec_counter += 1

        print "no of ips", len(
            self._ip_log_db), " no of log recs", self._log_rec_counter

        from random import randint
        cur_ip = cur_rec_dict["host"]
        #cur_ip = cur_ip[:-1] + str(randint(1,255))
        cur_ats_rec = ATSRecord(cur_rec_dict)
        if cur_ip not in self._ip_log_db:
            self._ip_log_db[cur_ip] = [cur_ats_rec]
        else:
            #get rid of old session
            if cur_ats_rec.time_to_second() - self._ip_log_db[cur_ip][
                    -1].time_to_second() > self.DEAD_SESSION_PAUSE:
                self._log_rec_counter -= (len(self._ip_log_db[cur_ip]) - 1)
                self._ip_log_db[cur_ip] = []

            self._ip_log_db[cur_ip].append(cur_ats_rec)
            #get rid of this ip's old row in the feature array
            #(np.delete returns a new array, so assign the result back)
            self.ip_feature_array = np.delete(
                self.ip_feature_array, self.ip_row_tracker[cur_ip], axis=0)

        ip_recs = dict(((cur_ip, self._ip_log_db[cur_ip]), ))

        #TODO: this recomputes the features from scratch on every record;
        #they should be accumulated incrementally, and ip_feature_db should
        #be a member of the class (probably a training set)
        ip_feature_db = {}
        for cur_feature_name in self._feature_list:
            cur_feature_tester = self._available_features[cur_feature_name](
                ip_recs, ip_feature_db)
            cur_feature_tester.compute()

        #turning ip_feature_db into a numpy array
        self.ip_row_tracker[cur_ip] = self.ip_feature_array.shape[0]
        cur_ip_row = [[
            ip_feature_db[ip][feature]
            for feature in range(1,
                                 len(self._feature_list) + 1)
        ] for ip in ip_feature_db]
        if (self.ip_feature_array.shape[0] == 0):
            self.ip_feature_array = np.array(cur_ip_row)
        else:
            self.ip_feature_array = np.vstack(
                [self.ip_feature_array, cur_ip_row])
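The feature classes looked up through self._available_features are not shown in these examples; all the method above relies on is that each one is constructed with (ip_recs, ip_feature_db), exposes a compute() method, and writes its result into ip_feature_db[ip] keyed by a 1-based feature index (hence the range(1, len(self._feature_list) + 1) when building the row). A minimal hypothetical sketch of that interface, assuming only the ATSRecord.time_to_second() call already used above (the class name and index are illustrative, not taken from the project):

class FeatureAverageRequestInterval(object):
    #hypothetical feature: mean time between requests of one ip
    FEATURE_INDEX = 1  #assumed 1-based index, matching range(1, len(feature_list) + 1)

    def __init__(self, ip_recs, ip_feature_db):
        self._ip_recs = ip_recs              #dict: ip -> list of ATSRecord
        self._ip_feature_db = ip_feature_db  #dict: ip -> {feature_index: value}

    def compute(self):
        for ip, recs in self._ip_recs.items():
            times = [rec.time_to_second() for rec in recs]
            gaps = [b - a for a, b in zip(times, times[1:])]
            value = float(sum(gaps)) / len(gaps) if gaps else 0.0
            self._ip_feature_db.setdefault(ip, {})[self.FEATURE_INDEX] = value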
Example #2
    def parse_log_old(self):
        """
        Read each line of the log file and batch the
        records corresponding to each (client (ip), session)
        make a dictionary of lists each consisting of all records of that session
        """
        for cur_rec in self._log_lines:
            #Here (at least for now) we only care about the ip and the time record.
            time_pos = cur_rec.find('-')
            if time_pos == -1: #Not a valid record
                continue

            http_req_pos = cur_rec.find('"')
            cur_ip = cur_rec[:time_pos-1]
            rec_time = cur_rec[time_pos + 3:http_req_pos - 2]
            rec_payload = cur_rec[http_req_pos:]
            cur_ats_rec = ATSRecord(cur_ip, rec_time, rec_payload)
            #check if we have already encountered this ip
            if cur_ip not in self._ordered_records:
                self._ordered_records[cur_ip] = [cur_ats_rec]
            else:
                self._ordered_records[cur_ip].append(cur_ats_rec)

        self.dict_invalid = False
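As a side note, the append-or-create branch above is the standard dictionary idiom; when the value is just a plain list per ip, dict.setdefault expresses the same thing in one line. It does not carry over unchanged to the session-aware parsers further down, where a new session deliberately starts a fresh list:

#equivalent to the if/else above when records are keyed by ip alone
ordered_records.setdefault(cur_ip, []).append(cur_ats_rec)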
Example #4
    def get(self, start, stop, target):
        """
        Get deflect log from es
        """
        indexes, body = self._get_param_deflect(start, stop, target)

        print "es.search() start..."
        result = []
        try:
            page = self.es.search(
                index=indexes,
                scroll='5m',  #search_type = 'scan',
                size=10000,
                body=body)

            sid = page['_scroll_id']
            page_index = 0
            scroll_size = page['hits']['total']
            total_size = scroll_size
            print "total # of hits : ", total_size
            num_processed = 0
            while (scroll_size > 0):
                #print "Scrolling...", page_index
                # Do something with the obtained page
                json_result = page['hits']['hits']

                for log in json_result:
                    #print log['_source']['@timestamp']
                    cur_rec_dict = util.es_log_muncher.parse_es_json_object(
                        log)
                    if cur_rec_dict:
                        cur_ats_rec = ATSRecord(cur_rec_dict)
                        result.append(cur_ats_rec)
                        num_processed = num_processed + 1

                print "progress:{},{:.1f}%...".format(
                    num_processed, 100.0 * num_processed / total_size)
                if (num_processed > 5000000):
                    break

                page_index = page_index + 1
                page = self.es.scroll(scroll_id=sid, scroll='5m')
                sid = page['_scroll_id']
                scroll_size = len(page['hits']['hits'])
        except Exception as ex:
            print ex

        return result
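The search/scroll loop above is the usual elasticsearch-py pagination pattern: one search() call with a scroll keep-alive, then repeated scroll() calls until a page comes back empty. A minimal stand-alone sketch of that pattern (the index name and query in the usage note are placeholders, not the ones this project uses):

from elasticsearch import Elasticsearch

def scroll_hits(es, index, body, page_size=10000, keep_alive='5m'):
    """Yield every hit of a query, page by page, using the scroll API."""
    page = es.search(index=index, body=body, scroll=keep_alive, size=page_size)
    sid = page['_scroll_id']
    hits = page['hits']['hits']
    while hits:
        for hit in hits:
            yield hit
        page = es.scroll(scroll_id=sid, scroll=keep_alive)
        sid = page['_scroll_id']
        hits = page['hits']['hits']
    es.clear_scroll(scroll_id=sid)  #release the server-side scroll context

#usage (placeholder index and query):
#es = Elasticsearch(['localhost:9200'])
#for hit in scroll_hits(es, 'deflect.log-*', {"query": {"match_all": {}}}):
#    print hit['_source']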
Example #5
    def parse_log(self):
        """
        Read each line of the log file and batch the records corresponding
        to each client (ip) make a dictionary of lists each consisting of all
         records
        """
        #to check the performance and the sensitivity of the log muncher
        total_failure_munches = 0
        for log_filename in self._log_file_list:
            try:
                self._log_lines = open(log_filename)
            except IOError:
                raise

            self._log_lines.seek(0, 2) #go to end to check the size
            total_file_size = self._log_lines.tell()
            self._log_lines.seek(0, 0) #and go back to the beginning
            previous_progress = 0

            print "Parsing ", log_filename.split('/')[-1]

            #we are going to keep track of each ip and last session number corresponding
            #to that ip
            ip_session_tracker = {}
            for cur_rec in self._log_lines:
                new_session = False
                cur_rec_dict = parse_apache_line(cur_rec)

                if cur_rec_dict:
                    cur_ip = cur_rec_dict["host"];
                    cur_ats_rec = ATSRecord(cur_rec_dict);

                    if cur_ip not in ip_session_tracker:
                        ip_session_tracker[cur_ip] = 0
                        new_session = True

                    #now we are checking if we hit a new session
                    #if we already decided that we are in a new session then there is nothing
                    #to investigate
                    if not new_session:
                        #so we have a session already recorded, compare
                        #the time of that last record of that session with
                        #this session
                        if cur_ats_rec.time_to_second() - self._ordered_records[(cur_ip, ip_session_tracker[cur_ip])][-1].time_to_second() > self.DEAD_SESSION_PAUSE:
                            #the session is dead we have to start a new session
                            ip_session_tracker[cur_ip] += 1
                            new_session = True

                    if new_session:
                        self._ordered_records[(cur_ip, ip_session_tracker[cur_ip])] = [cur_ats_rec]
                    else:
                        self._ordered_records[(cur_ip, ip_session_tracker[cur_ip])].append(cur_ats_rec)

                else:
                    #the log muncher could not parse this record (unrecognizable format)
                    total_failure_munches += 1

                #reporting progress
                current_progress = (self._log_lines.tell()*100)/total_file_size
                if (current_progress != previous_progress):
                    print "%", current_progress
                    previous_progress = current_progress


            self._log_lines.close()

        self._log_file_list = []

        #for debugging; this should be moved into the logger
        print "Parsed ", len(self._ordered_records)
        if total_failure_munches > 0:
            print "Failed to parse ", total_failure_munches, " records"
        self.dict_invalid = False
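The session boundary rule above is simple: within one ip, a gap of more than DEAD_SESSION_PAUSE seconds between consecutive records starts a new session. A small self-contained sketch of that rule (the threshold value and function name are illustrative, not the project's):

DEAD_SESSION_PAUSE = 1800  #illustrative threshold in seconds

def split_into_sessions(timestamps, pause=DEAD_SESSION_PAUSE):
    """Group a sorted list of per-ip timestamps (in seconds) into sessions."""
    sessions = []
    for ts in timestamps:
        if sessions and ts - sessions[-1][-1] <= pause:
            sessions[-1].append(ts)   #still inside the current session
        else:
            sessions.append([ts])     #gap too large: start a new session
    return sessions

#usage: two sessions, split at the 4000-second gap
#print split_into_sessions([0, 60, 120, 4120, 4180], pause=1800)
#-> [[0, 60, 120], [4120, 4180]]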
Example #6
    def parse_log(self, parser="apache"):
        """
        Read each line of the log file and batch the records corresponding
        to each (client ip, session) pair, making a dictionary of lists,
        each consisting of all records of that session
        """
        parser_function = parse_apache_line if parser == "apache" else parse_nginx_line
        #to check the performance and the sensitivity of the log muncher
        total_failure_munches = 0
        for log_filename in self._log_file_list:
            try:
                self._log_lines = open(log_filename)
            except IOError:
                raise

            self._log_lines.seek(0, 2) #go to end to check the size
            total_file_size = self._log_lines.tell()
            self._log_lines.seek(0, 0) #and go back to the beginning
            previous_progress = 0

            print "Parsing ", log_filename.split('/')[-1]

            #we are going to keep track of each ip and last session number corresponding
            #to that ip
            ip_session_tracker = {}
            for cur_rec in self._log_lines:
                new_session = False
                cur_rec_dict = parser_function(cur_rec)

                if cur_rec_dict:
                    cur_ip = cur_rec_dict["host"];
                    cur_ats_rec = ATSRecord(cur_rec_dict);

                    if cur_ip not in ip_session_tracker:
                        ip_session_tracker[cur_ip] = 0
                        new_session = True

                    #now we are checking if we hit a new session
                    #if we already decided that we are in a new session then there is nothing
                    #to investigate
                    if not new_session:
                        #so we have a session already recorded, compare
                        #the time of that last record of that session with
                        #this session
                        if cur_ats_rec.time_to_second() - self._ordered_records[(cur_ip, ip_session_tracker[cur_ip])][-1].time_to_second() > self.DEAD_SESSION_PAUSE:
                            #the session is dead we have to start a new session
                            ip_session_tracker[cur_ip] += 1
                            new_session = True

                    if new_session:
                        self._ordered_records[(cur_ip, ip_session_tracker[cur_ip])] = [cur_ats_rec]
                    else:
                        self._ordered_records[(cur_ip, ip_session_tracker[cur_ip])].append(cur_ats_rec)

                else:
                    #the log muncher could not parse this record (unrecognizable format)
                    total_failure_munches += 1

                #reporting progress
                current_progress = (self._log_lines.tell()*100)/total_file_size
                if (current_progress != previous_progress):
                    print "%", current_progress
                    previous_progress = current_progress


            self._log_lines.close()

        self._log_file_list = []

        #for debugging; this should be moved into the logger
        print "Parsed ", len(self._ordered_records)
        if total_failure_munches > 0:
            print "Failed to parse ", total_failure_munches, " records"
        self.dict_invalid = False
        return self._ordered_records
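parse_apache_line and parse_nginx_line are defined elsewhere in the project; all these methods rely on is that they return a dict with at least a "host" key (plus whatever fields ATSRecord needs), or a falsy value when a line cannot be parsed. A hypothetical stand-in for the Apache case, matching the common combined log format (an assumption, not the project's actual parser):

import re

#combined log format: host ident user [time] "request" status size "referer" "agent"
APACHE_COMBINED = re.compile(
    r'(?P<host>\S+) \S+ \S+ \[(?P<time>[^\]]+)\] '
    r'"(?P<request>[^"]*)" (?P<status>\d{3}) (?P<size>\S+)'
    r'(?: "(?P<referer>[^"]*)" "(?P<agent>[^"]*)")?')

def parse_apache_line_sketch(line):
    """Return a field dict for one access-log line, or None if it does not match."""
    match = APACHE_COMBINED.match(line)
    return match.groupdict() if match else None

#usage:
#rec = parse_apache_line_sketch(
#    '1.2.3.4 - - [10/Oct/2016:13:55:36 +0000] "GET / HTTP/1.1" 200 2326 "-" "curl/7.47"')
#rec["host"] -> '1.2.3.4'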