def parse_log_old(self):
    """
    Read each line of the log file and batch the records corresponding
    to each (client ip, session) pair.

    Builds a dictionary of lists, each consisting of all records of that
    session.
    """
    for cur_rec in self._log_lines:
        #Here (at least for now) we only care about the ip and the time record.
        time_pos = cur_rec.find('-')
        if time_pos == -1: #Not a valid record
            continue

        http_req_pos = cur_rec.find('"')
        cur_ip = cur_rec[:time_pos - 1]
        rec_time = cur_rec[time_pos + 3:http_req_pos - 2]
        rec_payload = cur_rec[http_req_pos:]

        #check if we have already encountered this ip
        cur_ats_rec = ATSRecord(cur_ip, rec_time, rec_payload)
        if not cur_ip in self._ordered_records:
            self._ordered_records[cur_ip] = [cur_ats_rec]
        else:
            self._ordered_records[cur_ip].append(cur_ats_rec)

    self.dict_invalid = False
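#A minimal, self-contained sketch (not part of the class) of how the
#find()/slicing logic in parse_log_old() carves up a record, assuming the
#old log format looked like 'IP - [time] "request" ...'. The sample line
#below is hypothetical and only illustrates the slicing.
def _example_old_slicing():
    cur_rec = '1.2.3.4 - [10/Oct/2016:13:55:36 +0000] "GET /index.html HTTP/1.1" 200 2326'
    time_pos = cur_rec.find('-')          #position of the '-' separator
    http_req_pos = cur_rec.find('"')      #start of the quoted HTTP request
    cur_ip = cur_rec[:time_pos - 1]       #'1.2.3.4'
    rec_time = cur_rec[time_pos + 3:http_req_pos - 2]  #'10/Oct/2016:13:55:36 +0000'
    rec_payload = cur_rec[http_req_pos:]  #'"GET /index.html HTTP/1.1" 200 2326'
    return cur_ip, rec_time, rec_payload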
def _gather_all_features(self, cur_rec_dict):
    """
    Set the ip_sieve log equal to the current ip's history and compute
    the features from the feature list for that ip only.
    """
    #check for too much memory consumption
    if (self._log_rec_counter == self.MAX_LOG_DB_SIZE):
        oldest_rec = self._ip_log_db.popitem(last=False)
        self._log_rec_counter -= (len(oldest_rec[1]) - 1)
    else:
        self._log_rec_counter += 1

    print "no of ips", len(self._ip_log_db), " no of log recs", self._log_rec_counter

    from random import randint
    cur_ip = cur_rec_dict["host"]
    #cur_ip = cur_ip[:-1] + str(randint(1,255))
    cur_ats_rec = ATSRecord(cur_rec_dict)
    if not cur_ip in self._ip_log_db:
        self._ip_log_db[cur_ip] = [cur_ats_rec]
    else:
        #get rid of old session
        if cur_ats_rec.time_to_second() - self._ip_log_db[cur_ip][-1].time_to_second() > self.DEAD_SESSION_PAUSE:
            self._log_rec_counter -= (len(self._ip_log_db[cur_ip]) - 1)
            self._ip_log_db[cur_ip] = []

        self._ip_log_db[cur_ip].append(cur_ats_rec)
        #get rid of ip's old row in the ip feature array
        #(np.delete returns a new array, so the result must be stored back)
        self.ip_feature_array = np.delete(self.ip_feature_array, self.ip_row_tracker[cur_ip], axis=0)

    ip_recs = dict(((cur_ip, self._ip_log_db[cur_ip]),))

    #so this is stupid, we should compute accumulatively
    #instead, so ip_feature_db should be a member of
    #the class, it probably should be a training set
    ip_feature_db = {}
    for cur_feature_name in self._feature_list:
        cur_feature_tester = self._available_features[cur_feature_name](ip_recs, ip_feature_db)
        cur_feature_tester.compute()

    #turning ip_feature_db into a numpy array
    self.ip_row_tracker[cur_ip] = self.ip_feature_array.shape[0]
    cur_ip_row = [[ip_feature_db[ip][feature]
                   for feature in range(1, len(self._feature_list) + 1)]
                  for ip in ip_feature_db]
    if (self.ip_feature_array.shape[0] == 0):
        self.ip_feature_array = np.array(cur_ip_row)
    else:
        self.ip_feature_array = np.vstack([self.ip_feature_array, cur_ip_row])
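#A self-contained sketch (not part of the class) of the row-tracker pattern
#used in _gather_all_features(): one feature row per ip kept in a numpy
#array, plus a dict mapping ip -> row index. It shows why np.delete has to
#be assigned back and that rows after the deleted one shift up by one, so
#their tracker entries need adjusting. Names and values are illustrative.
def _example_row_tracker():
    import numpy as np

    feature_array = np.array([[1., 2., 3.],    #row 0: ip_a
                              [4., 5., 6.],    #row 1: ip_b
                              [7., 8., 9.]])   #row 2: ip_c
    row_tracker = {"ip_a": 0, "ip_b": 1, "ip_c": 2}

    #recompute ip_b's features: first drop its old row
    old_row = row_tracker.pop("ip_b")
    #np.delete returns a new array, the original is left untouched
    feature_array = np.delete(feature_array, old_row, axis=0)
    #every row that came after the deleted one moves up by one
    for ip in row_tracker:
        if row_tracker[ip] > old_row:
            row_tracker[ip] -= 1

    #...then append the fresh row at the end and record its index
    row_tracker["ip_b"] = feature_array.shape[0]
    feature_array = np.vstack([feature_array, [[4.5, 5.5, 6.5]]])
    return feature_array, row_tracker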
def get(self, start, stop, target):
    """
    Get deflect log from es
    """
    indexes, body = self._get_param_deflect(start, stop, target)

    print "es.search() start..."
    result = []
    try:
        page = self.es.search(
            index=indexes,
            scroll='5m',
            #search_type = 'scan',
            size=10000,
            body=body)
        sid = page['_scroll_id']
        page_index = 0
        scroll_size = page['hits']['total']
        total_size = scroll_size
        print "total # of hits : ", total_size

        num_processed = 0
        while (scroll_size > 0):
            #print "Scrolling...", page_index
            #Do something with the obtained page
            json_result = page['hits']['hits']
            for log in json_result:
                #print log['_source']['@timestamp']
                cur_rec_dict = util.es_log_muncher.parse_es_json_object(log)
                if cur_rec_dict:
                    cur_ats_rec = ATSRecord(cur_rec_dict)
                    result.append(cur_ats_rec)
                    num_processed = num_processed + 1

            print "progress:{},{:.1f}%...".format(
                num_processed, 100.0 * num_processed / total_size)
            if (num_processed > 5000000):
                break

            page_index = page_index + 1
            page = self.es.scroll(scroll_id=sid, scroll='5m')
            sid = page['_scroll_id']
            scroll_size = len(page['hits']['hits'])
    except Exception as ex:
        print ex

    return result
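#A minimal, self-contained sketch of the scroll pagination pattern used in
#get() above, assuming the elasticsearch-py client. The client construction,
#index pattern and query below are placeholders, not values used here.
def _example_scroll_all(es, index="logs-*", query=None):
    """Yield every hit of a query by walking the scroll pages."""
    #es = elasticsearch.Elasticsearch(['http://localhost:9200'])  #hypothetical client
    body = query or {"query": {"match_all": {}}}
    page = es.search(index=index, body=body, scroll='5m', size=1000)
    sid = page['_scroll_id']
    hits = page['hits']['hits']
    while hits:
        for hit in hits:
            yield hit['_source']
        #fetch the next page using the scroll id returned by the previous call
        page = es.scroll(scroll_id=sid, scroll='5m')
        sid = page['_scroll_id']
        hits = page['hits']['hits']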
def parse_log(self, parser="apache"):
    """
    Read each line of the log file and batch the records corresponding
    to each client (ip).

    Builds a dictionary of lists, each consisting of all records.
    """
    parser_function = parse_apache_line if parser == "apache" else parse_nginx_line

    #to check the performance and the sensitivity of the log muncher
    total_failure_munches = 0
    for log_filename in self._log_file_list:
        try:
            self._log_lines = open(log_filename)
        except IOError:
            raise

        self._log_lines.seek(0, 2) #go to end to check the size
        total_file_size = self._log_lines.tell()
        self._log_lines.seek(0, 0) #and go back to the beginning
        previous_progress = 0

        print "Parsing ", log_filename.split('/')[-1]

        #we are going to keep track of each ip and the last session number
        #corresponding to that ip
        ip_session_tracker = {}
        for cur_rec in self._log_lines:
            new_session = False
            cur_rec_dict = parser_function(cur_rec)
            if cur_rec_dict:
                cur_ip = cur_rec_dict["host"]
                cur_ats_rec = ATSRecord(cur_rec_dict)

                if not cur_ip in ip_session_tracker:
                    ip_session_tracker[cur_ip] = 0
                    new_session = True

                #now we are checking if we hit a new session
                #if we already decided that we are in a new session then there
                #is nothing to investigate
                if not new_session:
                    #so we have a session already recorded, compare
                    #the time of the last record of that session with
                    #this record
                    if cur_ats_rec.time_to_second() - self._ordered_records[(cur_ip, ip_session_tracker[cur_ip])][-1].time_to_second() > self.DEAD_SESSION_PAUSE:
                        #the session is dead, we have to start a new session
                        ip_session_tracker[cur_ip] += 1
                        new_session = True

                if new_session:
                    self._ordered_records[(cur_ip, ip_session_tracker[cur_ip])] = [cur_ats_rec]
                else:
                    self._ordered_records[(cur_ip, ip_session_tracker[cur_ip])].append(cur_ats_rec)
            else:
                #unable to munch and grasp the data due to unrecognizable format
                total_failure_munches += 1

            #reporting progress
            current_progress = (self._log_lines.tell() * 100) / total_file_size
            if (current_progress != previous_progress):
                print "%", current_progress
                previous_progress = current_progress

        self._log_lines.close()

    self._log_file_list = []

    #for debug, it should be moved to be dumped in the logger
    print "Parsed ", len(self._ordered_records)
    if total_failure_munches > 0:
        print "Failed to parse ", total_failure_munches, " records"

    self.dict_invalid = False
    return self._ordered_records
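#A self-contained sketch (not part of the class) of the session-splitting
#rule used in parse_log(): one ip's records are cut into a new session
#whenever the gap between consecutive timestamps exceeds the dead-session
#pause. The pause value and timestamps below are made up for illustration;
#the class uses its own DEAD_SESSION_PAUSE constant.
def _example_split_sessions(timestamps, dead_session_pause=1800):
    """Group a sorted list of per-ip timestamps (in seconds) into sessions."""
    sessions = []
    for ts in timestamps:
        if sessions and ts - sessions[-1][-1] <= dead_session_pause:
            sessions[-1].append(ts)   #still within the current session
        else:
            sessions.append([ts])     #gap too long (or first record): new session
    return sessions

#_example_split_sessions([0, 60, 5000, 5030]) -> [[0, 60], [5000, 5030]]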