def test_all_features(self):
    """Parse every test log and run each available feature over it."""
    for cur_log_file in self.log_files:
        self.test_ip_sieve.add_log_file(cur_log_file)
        self.test_ip_sieve.parse_log()

    for CurrentFeatureType in Learn2BanFeature.__subclasses__():
        cur_feature_tester = CurrentFeatureType(self.test_ip_sieve, self.test_ip_feature_db)
        cur_feature_tester.compute()

    print self.test_ip_feature_db
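# Both the test above and the gatherers below locate feature classes through
# Python's __subclasses__() hook, so a new feature only has to subclass
# Learn2BanFeature to be picked up. The toy below is a self-contained sketch
# of that discovery pattern; ToyFeature and its subclasses are hypothetical
# stand-ins, not the real Learn2BanFeature API.
class ToyFeature(object):
    """Hypothetical stand-in for the Learn2BanFeature base class."""
    def __init__(self, sieve, feature_db):
        self._sieve = sieve            #assumed shape: ip -> list of status codes
        self._feature_db = feature_db  #assumed shape: ip -> {feature_index: value}

class ToyRequestCount(ToyFeature):
    _FEATURE_INDEX = 1  #each feature claims one column of the feature vector
    def compute(self):
        for ip, hits in self._sieve.items():
            self._feature_db.setdefault(ip, {})[self._FEATURE_INDEX] = len(hits)

class ToyErrorRatio(ToyFeature):
    _FEATURE_INDEX = 2
    def compute(self):
        for ip, hits in self._sieve.items():
            errors = sum(1 for status in hits if status >= 400)
            self._feature_db.setdefault(ip, {})[self._FEATURE_INDEX] = errors / float(len(hits))

if __name__ == "__main__":
    toy_sieve = {"10.0.0.1": [200, 200, 404], "10.0.0.2": [500]}
    toy_feature_db = {}
    for FeatureType in ToyFeature.__subclasses__():  #same discovery as above
        FeatureType(toy_sieve, toy_feature_db).compute()
    print toy_feature_db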
def gather_all_features(self, log_files):
    """
    Parse the given logs and compute every available feature over them.

    INPUT:
        log_files: the log files to gather features from
    """
    for cur_log_file in log_files:
        self.ip_sieve.add_log_file(cur_log_file)
        self.ip_sieve.parse_log()

    for CurrentFeatureType in Learn2BanFeature.__subclasses__():
        cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
        cur_feature_tester.compute()

    return self.ip_feature_db
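# gather_all_features returns self.ip_feature_db. Judging from how the
# features write into it, that db appears to be a nested mapping of
# ip -> {feature_index: value}; the helper below is a hedged, self-contained
# sketch (toy data, assumed shape) of flattening it into the ordered rows a
# trainer could consume.
def rows_from_feature_db(ip_feature_db, feature_indices):
    """Yield (ip, row) pairs, defaulting absent features to 0."""
    for ip, vector in sorted(ip_feature_db.items()):
        yield ip, [vector.get(index, 0) for index in feature_indices]

if __name__ == "__main__":
    toy_db = {"10.0.0.1": {1: 3, 2: 0.33}, "10.0.0.2": {1: 1, 2: 1.0}}
    for ip, row in rows_from_feature_db(toy_db, [1, 2]):
        print ip, row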
def _process_logs(self):
    """
    Retrieve the log names from the db and gather all features
    computed over those logs.
    """
    #this is not an OOP way of retrieving the logs, but we are avoiding
    #db access in classes other than l2btools
    cur_experiment_logs = self.l2btools.retrieve_experiment_logs(self.id)

    #if there is no log associated with this experiment then there is
    #nothing to do
    if len(cur_experiment_logs) == 0:
        logging.info("Giving up on experiment %i with no training log" % self.expr_dict['id'])
        return

    #the log id needs to be sent to the trainer so the trainer knows
    #which regex detects the bots for which log
    self.trainer.add_malicious_history_log_files(
        [(cur_log_info['log_id'], cur_log_info['file_name'])
         for cur_log_info in cur_experiment_logs])

    #extracting the filenames
    #Get IP Features
    log_filenames = tuple(cur_log['file_name'] for cur_log in cur_experiment_logs)

    #at this stage it is only a preliminary list; we might lose features
    #due to zero variance
    self._active_feature_list = []
    #do a dry run on all features just to gather the indices of all
    #available features
    for CurrentFeatureType in Learn2BanFeature.__subclasses__():
        cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
        self._active_feature_list.append(cur_feature_tester._FEATURE_INDEX)

    #in theory it might be more memory efficient to crunch the logs
    #one by one, but python is quite disappointing in memory management
    for cur_log_file in log_filenames:
        try:
            self.ip_sieve.add_log_file(cur_log_file)
            self.ip_sieve.parse_log()
        except IOError:
            print "Unable to read %s, skipping..." % cur_log_file

    for CurrentFeatureType in Learn2BanFeature.__subclasses__():
        cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
        logging.info("Computing feature %i..." % cur_feature_tester._FEATURE_INDEX)
        cur_feature_tester.compute()

    #we have a memory problem here :(
    # import objgraph
    # objgraph.show_refs([self.ip_sieve._ordered_records], filename='ips-graph.png')
    #CPython is reluctant to release this memory, so drop the big
    #structures explicitly
    del self.ip_sieve._ordered_records
    del self.ip_sieve
    # import gc
    # gc.collect()
    # print gc.garbage

    self.trainer.add_to_sample(self.ip_feature_db)

    #we store the non-normalised vectors in a json file
    jsonized_ip_feature_db = {}
    for k, v in self.ip_feature_db.items():
        jsonized_ip_feature_db[str(k)] = v

    import json
    with open(self.base_analyse_log_file + ".prenormal_ip_feature_db.json", "w") as ip_feature_file:
        json.dump(jsonized_ip_feature_db, ip_feature_file)

    del self.ip_feature_db
    del jsonized_ip_feature_db

    #normalise the training set; normalisation should happen after the
    #whole sample is gathered
    self.trainer.normalise(self.expr_dict['norm_mode'])
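# The pre-normalisation dump above passes every db key through str() because
# json refuses non-string dictionary keys; whatever reads the file back has
# to undo, or live with, that transformation. A self-contained illustration
# of the caveat, with hypothetical tuple keys:
import json

def _demo_key_stringification():
    ip_feature_db = {("10.0.0.1", 42): {1: 3, 2: 0.5}}  #hypothetical keys
    try:
        json.dumps(ip_feature_db)
    except TypeError as err:
        print "raw keys are not JSON-serialisable:", err
    jsonized = dict((str(k), v) for k, v in ip_feature_db.items())
    print json.dumps(jsonized, sort_keys=True)
    #prints: {"('10.0.0.1', 42)": {"1": 3, "2": 0.5}}

if __name__ == "__main__":
    _demo_key_stringification()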