class Train2Ban:
    """
    Collect per-ip feature vectors, label them and train a ban classifier.

    The class receives a classifier; the user can then add ip+feature
    dictionaries. Each ip is considered innocent unless proven otherwise.
    The user can call different methods to indicate bad ips. It also has
    the ability of using regexes (through Fail2BanFilter) to mark bad ips.

    The bad ips will be flagged 1 and the good ones 0.

    Finally, when the user is satisfied with the training data, they can
    call train to train the classifier.

    TODO: The training set needs to know its normalisation for prediction
    """

    def __init__(self, ban_classifier):
        """
        Set the classifier and initiate the containers used for labelling.

        Because ban_classifier is an object, self gets a reference to it
        and after training it is ready to use.

        INPUT:
            ban_classifier: an svm object to be trained.
        """
        self._ban_classifier = ban_classifier
        self._training_set = TrainingSet()

        # list of [log_id, Fail2BanFilter] pairs, one entry per log id
        self._log_filters = []
        # ips the user explicitly declared malicious
        self._malicious_ip_list = []
        # list of [log_id, log_filename] pairs to be scanned by the filters
        self._log_files = []

    def _find_log_filter(self, log_id):
        """
        Return the filter registered for log_id, or None if there is none.

        When several entries share the same log_id the first one wins.
        """
        for filter_entry in self._log_filters:
            if filter_entry[0] == log_id:
                return filter_entry[1]
        return None

    def add_to_sample(self, ip_feature_db):
        """
        Get a dictionary cooked up by feature gathering classes and put its
        entries in the format of TrainingSet.

        INPUT:
            ip_feature_db: A dictionary of lists, each entry indexed by an
                           ip address pointing to a list of features.
        """
        for cur_ip in ip_feature_db:
            try:
                self._training_set.add_ip(cur_ip, ip_feature_db[cur_ip])
            except ValueError:
                # deliberate best-effort: just ignore the second coming
                # of the IP
                pass

    def normalise(self, method='individual'):
        """
        Ask the training set to normalise itself.

        INPUT:
            method: 'sparse' selects normalise_sparse; any other value
                    selects normalise_individual.
        """
        if method == 'sparse':
            self._training_set.normalise_sparse()
        else:
            self._training_set.normalise_individual()

    def add_bad_regexes(self, log_id, bad_ip_regexes):
        """
        Submit the fail2ban regexes, so that when they are run over the log
        file they sieve out the bad ips; an ip that doesn't come out of
        that process is considered innocent. If the ip has not already
        been added to the training set, then it will be ignored.

        INPUT:
            log_id: to which log these regexes should be associated
            bad_ip_regexes: a tuple/list of fail2ban regexes to be added
                            to the filter.
        """
        # first we check if we have already associated any filter to this log
        cur_log_filter = self._find_log_filter(log_id)
        if cur_log_filter is None:
            # no filter found: setting the jail as None because we are
            # only using it line by line
            cur_log_filter = Fail2BanFilter(None)
            self._log_filters.append([log_id, cur_log_filter])

        for cur_bad_regex in bad_ip_regexes:
            cur_log_filter.addFailRegex(cur_bad_regex)

    def add_malicious_history_log_files(self, log_file_info):
        """
        Store the names of the files that fail2ban is supposed to analyse
        to find out about the bad ips.

        INPUT:
            log_file_info: an array of [log_id, log_filename] where the
                           log_id is used to keep track of the regexes
                           associated to each log
        """
        self._log_files.extend(log_file_info)

    def add_to_malicious_ips(self, bad_ip_list):
        """
        Get a list of ips that the user knows are malicious and add them
        to _malicious_ip_list.

        INPUT:
            bad_ip_list: the list of ip strs to be indicated as 1 in the
                         training target
        """
        self._malicious_ip_list.extend(bad_ip_list)

    def get_training_set(self):
        """
        Access function for the training set.
        """
        return self._training_set

    def set_training_set(self, prepared_training_set):
        """
        Replace the training set with an externally prepared one.

        As it is desirable to re-use some of the information in the
        training set, one can retrieve a subset of a training set and
        re-set it again. However, this function should be used cautiously
        as the trainer accepts the set without checking it (at least at
        the moment, hence TODO!)
        """
        self._training_set = prepared_training_set

    def predict(self, ip_feature_db):
        """
        For a given data set use the currently constructed model to
        predict class labels for the entities.

        The raw prediction is also kept in self.bad_ip_prediction.

        INPUT:
            ip_feature_db: ip -> feature list dictionary, handed to the
                           training set's precook_to_predict.
        OUTPUT:
            list of the _ip_index entries whose predicted label equals
            BAD_TARGET.
        """
        ip_set = self._training_set.precook_to_predict(ip_feature_db)
        self.bad_ip_prediction = self._ban_classifier.predict(
            ip_set._ip_feature_array)

        return [ip_set._ip_index[i]
                for i, predicted_label in enumerate(self.bad_ip_prediction)
                if predicted_label == ip_set.BAD_TARGET]

    def mark_bad_target(self):
        """
        Go through all means of detecting bad ips, e.g., running fail2ban
        filters over the log files and walking the malicious ip list, and
        create the target for training. If an ip doesn't show up in any of
        these, it is considered good.

        A log file that cannot be read is reported on stdout and skipped.
        """
        # telling the training set that we are done with adding ips
        self._training_set.initiate_target()

        # Fail2ban ip selection
        from datetime import datetime
        for (cur_log_id, cur_log_filename) in self._log_files:
            try:
                # 'with' guarantees the log file is closed again, even
                # when reading or marking fails half way through
                with open(cur_log_filename) as cur_log_file:
                    cur_log_filter = self._find_log_filter(cur_log_id)
                    if cur_log_filter is not None:
                        for cur_line in cur_log_file:
                            # TODO: the time stamp might need to be
                            # changed, for we can simply give now
                            # .strftime("%Y-%m-%d %Y %I:%M%p")
                            for bad_ip in cur_log_filter.findFailure(
                                    str(datetime.now()), cur_line):
                                self._training_set.mark_as_bad(bad_ip[0])
            except IOError:
                # single-argument print: same output on Python 2 and 3
                print("Unable to read %s for marking bad ips, skipping..."
                      % (cur_log_filename,))

        # Manual ip selection
        for bad_ip in self._malicious_ip_list:
            self._training_set.mark_as_bad(bad_ip)

    def train(self):
        """
        Simply run the train procedure of the classifier.

        The user should mark the bad ips first. If all ips are good
        (the targets sum to zero) no actual training will happen.
        """
        if sum(self._training_set._target):
            self._ban_classifier.fit(self._training_set._ip_feature_array,
                                     self._training_set._target)

    def mark_and_train(self):
        """
        Convenience wrapper: mark the bad targets, then train.
        """
        self.mark_bad_target()
        self.train()

    def save_model(self, filename):
        """
        Save the current trainer model (training set plus classifier)
        through ReconstructableModel.save_model.

        This method does not return a value and does not catch anything
        raised while saving.

        INPUT:
            filename: the filename to store the model
        """
        model_to_save = ReconstructableModel(self._training_set,
                                             self._ban_classifier)
        model_to_save.save_model(filename)

    def load_model(self, filename):
        """
        For a given filename attempt to load a stored model as the current
        trainer model: the classifier and the normalisation data of the
        training set are replaced.

        NOTE(review): the stored model is described as a pickle file;
        unpickling untrusted files is unsafe - confirm that only trusted
        files reach this method.

        OUTPUT:
            True once loading completed; nothing raised while loading is
            caught here.
        """
        model_to_load = ReconstructableModel.construct_from_stored_model(
            filename)
        self._ban_classifier = model_to_load.ban_classifier

        training_set = self._training_set
        training_set._normalisation_data = model_to_load.normalisation_data
        # individual normalisation is the default, sparse only on request
        training_set._normalisation_function = \
            training_set.normalise_individual
        if (training_set._normalisation_data[TrainingSet.NORMALISATION_TYPE]
                == 'sparse'):
            training_set._normalisation_function = \
                training_set.normalise_sparse

        return True

    def get_training_model(self):
        """
        Simply an access function for the ip index, the ip feature array
        and the target in the training set. Objects are not safe to
        modify, so be nice.
        """
        return (self._training_set._ip_index,
                self._training_set._ip_feature_array,
                self._training_set._target)
class Train2Ban:
    """
    Collect per-ip feature vectors, label them and train a ban classifier.

    The class receives a classifier; the user can then add ip+feature
    dictionaries. Each ip is considered innocent unless proven otherwise.
    The user can call different methods to indicate bad ips. It also has
    the ability of using regexes (through Fail2BanFilter) to mark bad ips.

    The bad ips will be flagged 1 and the good ones 0.

    Finally, when the user is satisfied with the training data, they can
    call train to train the classifier.

    TODO: The training set needs to know its normalisation for prediction
    """

    def __init__(self, ban_classifier):
        """
        Set the classifier and initiate the containers used for labelling.

        Because ban_classifier is an object, self gets a reference to it
        and after training it is ready to use.

        INPUT:
            ban_classifier: an svm object to be trained.
        """
        self._ban_classifier = ban_classifier
        self._training_set = TrainingSet()

        # list of [log_id, Fail2BanFilter] pairs, one entry per log id
        self._log_filters = []
        # ips the user explicitly declared malicious
        self._malicious_ip_list = []
        # list of [log_id, log_filename] pairs to be scanned by the filters
        self._log_files = []

    def _find_log_filter(self, log_id):
        """
        Return the filter registered for log_id, or None if there is none.

        When several entries share the same log_id the first one wins.
        """
        for filter_entry in self._log_filters:
            if filter_entry[0] == log_id:
                return filter_entry[1]
        return None

    def add_to_sample(self, ip_feature_db):
        """
        Get a dictionary cooked up by feature gathering classes and put its
        entries in the format of TrainingSet.

        INPUT:
            ip_feature_db: A dictionary of lists, each entry indexed by an
                           ip address pointing to a list of features.
        """
        for cur_ip in ip_feature_db:
            try:
                self._training_set.add_ip(cur_ip, ip_feature_db[cur_ip])
            except ValueError:
                # deliberate best-effort: just ignore the second coming
                # of the IP
                pass

    def normalise(self, method='individual'):
        """
        Ask the training set to normalise itself.

        INPUT:
            method: 'sparse' selects normalise_sparse; any other value
                    selects normalise_individual.
        """
        if method == 'sparse':
            self._training_set.normalise_sparse()
        else:
            self._training_set.normalise_individual()

    def add_bad_regexes(self, log_id, bad_ip_regexes):
        """
        Submit the fail2ban regexes, so that when they are run over the log
        file they sieve out the bad ips; an ip that doesn't come out of
        that process is considered innocent. If the ip has not already
        been added to the training set, then it will be ignored.

        INPUT:
            log_id: to which log these regexes should be associated
            bad_ip_regexes: a tuple/list of fail2ban regexes to be added
                            to the filter.
        """
        # first we check if we have already associated any filter to this log
        cur_log_filter = self._find_log_filter(log_id)
        if cur_log_filter is None:
            # no filter found: setting the jail as None because we are
            # only using it line by line
            cur_log_filter = Fail2BanFilter(None)
            self._log_filters.append([log_id, cur_log_filter])

        for cur_bad_regex in bad_ip_regexes:
            cur_log_filter.addFailRegex(cur_bad_regex)

    def add_malicious_history_log_files(self, log_file_info):
        """
        Store the names of the files that fail2ban is supposed to analyse
        to find out about the bad ips.

        INPUT:
            log_file_info: an array of [log_id, log_filename] where the
                           log_id is used to keep track of the regexes
                           associated to each log
        """
        self._log_files.extend(log_file_info)

    def add_to_malicious_ips(self, bad_ip_list):
        """
        Get a list of ips that the user knows are malicious and add them
        to _malicious_ip_list.

        INPUT:
            bad_ip_list: the list of ip strs to be indicated as 1 in the
                         training target
        """
        self._malicious_ip_list.extend(bad_ip_list)

    def get_training_set(self):
        """
        Access function for the training set.
        """
        return self._training_set

    def set_training_set(self, prepared_training_set):
        """
        Replace the training set with an externally prepared one.

        As it is desirable to re-use some of the information in the
        training set, one can retrieve a subset of a training set and
        re-set it again. However, this function should be used cautiously
        as the trainer accepts the set without checking it (at least at
        the moment, hence TODO!)
        """
        self._training_set = prepared_training_set

    def predict(self, ip_feature_db):
        """
        For a given data set use the currently constructed model to
        predict class labels for the entities.

        The raw prediction is also kept in self.bad_ip_prediction.

        INPUT:
            ip_feature_db: ip -> feature list dictionary, handed to the
                           training set's precook_to_predict.
        OUTPUT:
            list holding element 0 of every _ip_index entry whose
            predicted label equals BAD_TARGET.
        """
        ip_set = self._training_set.precook_to_predict(ip_feature_db)
        self.bad_ip_prediction = self._ban_classifier.predict(
            ip_set._ip_feature_array)

        return [ip_set._ip_index[i][0]
                for i, predicted_label in enumerate(self.bad_ip_prediction)
                if predicted_label == ip_set.BAD_TARGET]

    def mark_bad_target(self):
        """
        Go through all means of detecting bad ips, e.g., running fail2ban
        filters over the log files and walking the malicious ip list, and
        create the target for training. If an ip doesn't show up in any of
        these, it is considered good.

        A log file that cannot be read is reported on stdout and skipped;
        a bad/all summary of the sample is printed at the end.

        OUTPUT:
            the list of malicious ips, without duplicates and in order of
            first appearance, for possible further use
        """
        # telling the training set that we are done with adding ips
        self._training_set.initiate_target()

        comp_bad_ip_list = []
        # companion set: constant-time duplicate check while the list
        # keeps the order of first appearance
        seen_bad_ips = set()

        # Fail2ban ip selection
        from datetime import datetime
        for (cur_log_id, cur_log_filename) in self._log_files:
            try:
                # 'with' guarantees the log file is closed again, even
                # when reading or marking fails half way through
                with open(cur_log_filename) as cur_log_file:
                    cur_log_filter = self._find_log_filter(cur_log_id)
                    if cur_log_filter is not None:
                        for cur_line in cur_log_file:
                            # TODO: the time stamp might need to be
                            # changed, for we can simply give now
                            # .strftime("%Y-%m-%d %Y %I:%M%p")
                            for bad_ip in cur_log_filter.findFailure(
                                    str(datetime.now()), cur_line):
                                self._training_set.mark_as_bad(bad_ip[0])
                                if bad_ip[0] not in seen_bad_ips:
                                    seen_bad_ips.add(bad_ip[0])
                                    comp_bad_ip_list.append(bad_ip[0])
            except IOError:
                # single-argument print: same output on Python 2 and 3
                print("Unable to read %s for marking bad ips, skipping..."
                      % (cur_log_filename,))

        # Manual ip selection
        for bad_ip in self._malicious_ip_list:
            self._training_set.mark_as_bad(bad_ip)
            if bad_ip not in seen_bad_ips:
                seen_bad_ips.add(bad_ip)
                comp_bad_ip_list.append(bad_ip)

        target = self._training_set._target
        bad_count = len([1 for cur_target in target
                         if cur_target == self._training_set.BAD_TARGET])
        print("sample: bad/all: %i/%i" % (bad_count, len(target)))

        # return the malicious list for possible further use
        return comp_bad_ip_list

    def train(self):
        """
        Simply run the train procedure of the classifier.

        The user should mark the bad ips first. If all ips are good
        (the targets sum to zero) no actual training will happen.
        """
        if sum(self._training_set._target):
            self._ban_classifier.fit(self._training_set._ip_feature_array,
                                     self._training_set._target)

    def mark_and_train(self):
        """
        Convenience wrapper: mark the bad targets, then train.
        """
        self.mark_bad_target()
        self.train()

    def save_model(self, filename, model_format='pickle'):
        """
        Save the current trainer model (training set plus classifier)
        through ReconstructableModel in the requested format.

        This method does not return a value and does not catch anything
        raised while saving.

        INPUT:
            filename: the filename to store the model
            model_format: to be chosen from
                'pickle': python object pickling,
                'libsvm': libsvm format, no normalisation data,
                'normal_svm': libsvm model with normalisation data
                              attached at the end
        RAISES:
            NotImplementedError: when model_format is none of the above
        """
        model_to_save = ReconstructableModel(self._training_set,
                                             self._ban_classifier)
        if model_format == 'pickle':
            model_to_save.save_model(filename)
        elif model_format == 'libsvm':
            model_to_save.save_bare_svm_model(filename)
        elif model_format == 'normal_svm':
            model_to_save.save_reconstructable_svm_model(filename)
        else:
            # call form of raise: valid on both Python 2 and Python 3
            raise NotImplementedError("model format not recognized")

    def load_model(self, filename):
        """
        For a given filename attempt to load a stored model as the current
        trainer model: the classifier and the normalisation data of the
        training set are replaced.

        NOTE(review): the stored model is described as a pickle file;
        unpickling untrusted files is unsafe - confirm that only trusted
        files reach this method.

        OUTPUT:
            True once loading completed; nothing raised while loading is
            caught here.
        """
        model_to_load = ReconstructableModel.construct_from_stored_model(
            filename)
        self._ban_classifier = model_to_load.ban_classifier

        training_set = self._training_set
        training_set._normalisation_data = model_to_load.normalisation_data
        # individual normalisation is the default, sparse only on request
        training_set._normalisation_function = \
            training_set.normalise_individual
        if (training_set._normalisation_data[TrainingSet.NORMALISATION_TYPE]
                == 'sparse'):
            training_set._normalisation_function = \
                training_set.normalise_sparse

        return True

    def get_training_model(self):
        """
        Simply an access function for the ip index, the ip feature array
        and the target in the training set. Objects are not safe to
        modify, so be nice.
        """
        return (self._training_set._ip_index,
                self._training_set._ip_feature_array,
                self._training_set._target)