Example #1
class Train2Ban:
    """
    The class receives a classifier; the user can then add IP+feature dicts.
    Each IP is considered innocent unless proven otherwise. The user can call
    different methods to indicate bad IPs.

    It can also use regexes (through Fail2BanFilter) to mark bad IPs.

    Bad IPs will be flagged as 1 and good ones as 0.

    Finally, when the user is satisfied with the training data, they can call
    train to train the classifier.

    TODO: The training set needs to know its normalisation for prediction
    """
    def __init__(self, ban_classifier):
        """
        Sets the classifier and initiates the fail2ban filter and other objects.

        Because ban_classifier is an object, self gets a reference to it, and
        after training it is ready to use.

        INPUT:
           ban_classifier: an SVM object to be trained.

        """
        self._ban_classifier = ban_classifier
        self._training_set = TrainingSet()
        self._log_filters = []
        self._malicious_ip_list = []
        self._log_files = []

    def add_to_sample(self, ip_feature_db):
        """
        Gets a dictionary produced by the feature-gathering classes and puts
        it into the format of TrainingSet.

        INPUT:
            ip_feature_db: a dictionary of lists, each entry indexed by an IP
                           address and pointing to a list of features.

        """
        for cur_ip in ip_feature_db:
            try:
                self._training_set.add_ip(cur_ip, ip_feature_db[cur_ip])
            except ValueError:
                #just ignore duplicate occurrences of the same IP
                pass

    def normalise(self, method = 'individual'):
        """
        Asks the training set to normalise itself.
        """
        if method == 'sparse':
            self._training_set.normalise_sparse()
        else:
            self._training_set.normalise_individual()

    def add_bad_regexes(self, log_id, bad_ip_regexes):
        """
        Submits the fail2ban regexes so that, when run over the log file, they
        sieve out the bad IPs; any IP that does not come out of that process is
        considered innocent.

        If the IP has not already been added to the training set, it will be
        ignored.

        INPUT:
           log_id: the log with which these regexes should be associated
           bad_ip_regexes: a tuple/list of fail2ban regexes to be added to the
                           filter.
        """
        #first we check if we have already associated any filter with this log
        cur_log_filter = [cur_filter[1] for cur_filter in self._log_filters if cur_filter[0] == log_id]
        if (len(cur_log_filter) == 0): #no filter found
            #setting the jail as None because we only use the filter line by line
            cur_log_filter = Fail2BanFilter(None)
            self._log_filters.append([log_id, cur_log_filter])
        else: #filter already exists
            cur_log_filter = cur_log_filter[0]

        for cur_bad_regex in bad_ip_regexes:
            cur_log_filter.addFailRegex(cur_bad_regex)

    def add_malicious_history_log_files(self, log_file_info):
        """
        Stores the names of the files that fail2ban is supposed to analyse in
        order to find the bad IPs.

        INPUT:
              log_file_info: an array of [log_id, log_filename] pairs, where
                             log_id is used to keep track of the regexes
                             associated with each log
        """
        self._log_files.extend(log_file_info)

    def add_to_malicious_ips(self, bad_ip_list):
        """
        Gets a list of IPs that the user knows are malicious and adds them to
        _malicious_ip_list.

        INPUT:
           bad_ip_list: a list of IP strings to be marked as 1 in the training
                        target
        """
        self._malicious_ip_list.extend(bad_ip_list)

    def get_training_set(self):
        """
        Access function for the training set
        """
        return self._training_set

    def set_training_set(self, prepared_training_set):
        """
        As it is desirable to re-use some of the information in the training
        set, one can retrieve a subset of a training set and set it again.
        However, this function should be used cautiously as the trainer
        accepts the set without checking it (at least at the moment, hence the
        TODO!)
        """
        self._training_set = prepared_training_set

    def predict(self, ip_feature_db):
        """
        For a given data set use the currently constructed model
        to predict class labels for the entities
        """
        failList = list()

        ip_set = self._training_set.precook_to_predict(ip_feature_db)

        self.bad_ip_prediction = self._ban_classifier.predict(ip_set._ip_feature_array)

        failList.extend([ip_set._ip_index[i]
                         for i in range(0, len(self.bad_ip_prediction))
                         if self.bad_ip_prediction[i] == ip_set.BAD_TARGET])

        return failList

    def mark_bad_target(self):
        """
        Goes through all means of detecting bad IPs, e.g., running fail2ban
        over the log files and going through the malicious IP list, and creates
        the target for training. If an IP doesn't show up in any of these, it
        is considered good.
        """
        #telling the training set that we are done with adding ips
        self._training_set.initiate_target()

        #Fail2ban ip selection
        from datetime import datetime
        for (cur_log_id, cur_log_filename) in self._log_files:
            try:
                cur_log_file = open(cur_log_filename)
                cur_log_filter = [cur_filter[1] for cur_filter in self._log_filters if cur_filter[0] == cur_log_id] #there is at most one filter anyway
                if (len(cur_log_filter) > 0): #filter for this log found
                    for cur_line in cur_log_file:
                        #TODO: this might need to be changed, for we can simply
                        #give now.strftime("%Y-%m-%d %Y %I:%M%p")
                        for bad_ip in cur_log_filter[0].findFailure(str(datetime.now()), cur_line):
                            self._training_set.mark_as_bad(bad_ip[0])
            except IOError:
                print "Unable to read", cur_log_filename, "for marking bad ips, skipping..."

        #Manual ip selection
        for bad_ip in self._malicious_ip_list:
            self._training_set.mark_as_bad(bad_ip)

    def train(self):
        """
        Simply runs the train procedure of the classifier.

        If all IPs are good, no actual training will happen.
        """
        #the user should mark the bad ips first

        #If all ips are good there's nothing to train
        if sum(self._training_set._target):
            self._ban_classifier.fit(self._training_set._ip_feature_array, \
                                     self._training_set._target)

    def mark_and_train(self):
        self.mark_bad_target()
        self.train()

    def save_model(self,filename):
    	"""
        Given a filename this function saves the current trainer model
        as a pickle file using the Sklearn pickle function.
        On success it returns true on failure it returns an error
        message.
        """
        model_to_save = ReconstructableModel(self._training_set, self._ban_classifier);
        model_to_save.save_model(filename)

    def load_model(self,filename):
        """
        For a given filename, this function attempts to load a pickle file as
        the current trainer model. On success it returns True; on failure it
        returns an error.
        """
        model_to_load = ReconstructableModel.construct_from_stored_model(filename)

        self._ban_classifier = model_to_load.ban_classifier
        self._training_set._normalisation_data = model_to_load.normalisation_data
        self._training_set._normalisation_function = self._training_set.normalise_individual
        if self._training_set._normalisation_data[TrainingSet.NORMALISATION_TYPE] == 'sparse':
            self._training_set._normalisation_function = self._training_set.normalise_sparse

        return True

    def get_training_model(self):
        """
        Simply an access function for the IP index, feature array and target
        in the training set. The returned objects are not safe to modify, so
        be nice.
        """
        return (self._training_set._ip_index, \
                    self._training_set._ip_feature_array, \
                    self._training_set._target)
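
A minimal usage sketch for the class above, assuming a scikit-learn SVM as the ban_classifier; the IPs and feature values are purely illustrative, and TrainingSet/Fail2BanFilter are assumed to come from the surrounding project.

# Minimal usage sketch; the IPs and feature values below are hypothetical.
from sklearn import svm

# hypothetical features per IP, e.g. [request_rate, error_ratio, avg_interval]
ip_feature_db = {
    '192.0.2.1':    [120.0, 0.9, 0.05],
    '192.0.2.2':    [  3.0, 0.1, 2.30],
    '198.51.100.7': [ 95.0, 0.8, 0.10],
}

trainer = Train2Ban(svm.SVC(kernel='linear'))
trainer.add_to_sample(ip_feature_db)
trainer.normalise('individual')

# mark the known offenders; every other IP stays labelled as good (0)
trainer.add_to_malicious_ips(['192.0.2.1', '198.51.100.7'])
trainer.mark_and_train()

# predict on data with the same feature layout
suspected_ips = trainer.predict(ip_feature_db)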
Example #2
class Train2Ban:
    """
    The class receives a classifier; the user can then add IP+feature dicts.
    Each IP is considered innocent unless proven otherwise. The user can call
    different methods to indicate bad IPs.

    It can also use regexes (through Fail2BanFilter) to mark bad IPs.

    Bad IPs will be flagged as 1 and good ones as 0.

    Finally, when the user is satisfied with the training data, they can call
    train to train the classifier.

    TODO: The training set needs to know its normalisation for prediction
    """
    def __init__(self, ban_classifier):
        """
        Sets the classifier and initiates the fail2ban filter and other objects.

        Because ban_classifier is an object, self gets a reference to it, and
        after training it is ready to use.

        INPUT:
           ban_classifier: an SVM object to be trained.

        """
        self._ban_classifier = ban_classifier
        self._training_set = TrainingSet()
        self._log_filters = []
        self._malicious_ip_list = []
        self._log_files = []

    def add_to_sample(self, ip_feature_db):
        """
        Gets a dictionary produced by the feature-gathering classes and puts
        it into the format of TrainingSet.

        INPUT:
            ip_feature_db: a dictionary of lists, each entry indexed by an IP
                           address and pointing to a list of features.

        """
        for cur_ip in ip_feature_db:
            try:
                self._training_set.add_ip(cur_ip, ip_feature_db[cur_ip])
            except ValueError:
                #just ignore duplicate occurrences of the same IP
                pass

    def normalise(self, method = 'individual'):
        """
        Asks the training set to normalise itself.
        """
        if method == 'sparse':
            self._training_set.normalise_sparse()
        else:
            self._training_set.normalise_individual()

    def add_bad_regexes(self, log_id, bad_ip_regexes):
        """
        Submits the fail2ban regexes so that, when run over the log file, they
        sieve out the bad IPs; any IP that does not come out of that process is
        considered innocent.

        If the IP has not already been added to the training set, it will be
        ignored.

        INPUT:
           log_id: the log with which these regexes should be associated
           bad_ip_regexes: a tuple/list of fail2ban regexes to be added to the
                           filter.
        """
        #first we check if we have already associated any filter with this log
        cur_log_filter = [cur_filter[1] for cur_filter in self._log_filters if cur_filter[0] == log_id]
        if (len(cur_log_filter) == 0): #no filter found
            #setting the jail as None because we only use the filter line by line
            cur_log_filter = Fail2BanFilter(None)
            self._log_filters.append([log_id, cur_log_filter])
        else: #filter already exists
            cur_log_filter = cur_log_filter[0]

        for cur_bad_regex in bad_ip_regexes:
            cur_log_filter.addFailRegex(cur_bad_regex)

    def add_malicious_history_log_files(self, log_file_info):
        """
        Stores the names of the files that fail2ban is supposed to analyse in
        order to find the bad IPs.

        INPUT:
              log_file_info: an array of [log_id, log_filename] pairs, where
                             log_id is used to keep track of the regexes
                             associated with each log
        """
        self._log_files.extend(log_file_info)

    def add_to_malicious_ips(self, bad_ip_list):
        """
        Gets a list of IPs that the user knows are malicious and adds them to
        _malicious_ip_list.

        INPUT:
           bad_ip_list: a list of IP strings to be marked as 1 in the training
                        target
        """
        self._malicious_ip_list.extend(bad_ip_list)

    def get_training_set(self):
        """
        Access function for the training set
        """
        return self._training_set

    def set_training_set(self, prepared_training_set):
        """
        As it is desirable to re-use some of the information in the training
        set, one can retrieve a subset of a training set and set it again.
        However, this function should be used cautiously as the trainer
        accepts the set without checking it (at least at the moment, hence the
        TODO!)
        """
        self._training_set = prepared_training_set

    def predict(self, ip_feature_db):
        """
        For a given data set use the currently constructed model
        to predict class labels for the entities
        """
        failList = list()

        ip_set = self._training_set.precook_to_predict(ip_feature_db)

        self.bad_ip_prediction = self._ban_classifier.predict(ip_set._ip_feature_array)

        failList.extend([ip_set._ip_index[i][0]
                         for i in range(0, len(self.bad_ip_prediction))
                         if self.bad_ip_prediction[i] == ip_set.BAD_TARGET])

        return failList

    def mark_bad_target(self):
        """
        Goes through all means of detecting bad IPs, e.g., running fail2ban
        over the log files and going through the malicious IP list, and creates
        the target for training. If an IP doesn't show up in any of these, it
        is considered good.

        OUTPUT:
            returns the malicious list for possible further use

        """
        #telling the training set that we are done with adding ips
        self._training_set.initiate_target()
        comp_bad_ip_list = []

        #Fail2ban ip selection
        from datetime import datetime
        for (cur_log_id, cur_log_filename) in self._log_files:
            try:
                cur_log_file = open(cur_log_filename)
                cur_log_filter = [cur_filter[1] for cur_filter in self._log_filters if cur_filter[0] == cur_log_id] #there is at most one filter anyway
                if (len(cur_log_filter) > 0): #filter for this log found
                    for cur_line in cur_log_file:
                        #TODO: this might need to be changed, for we can simply
                        #give now.strftime("%Y-%m-%d %Y %I:%M%p")
                        for bad_ip in cur_log_filter[0].findFailure(str(datetime.now()), cur_line):
                            self._training_set.mark_as_bad(bad_ip[0])
                            if bad_ip[0] not in comp_bad_ip_list:
                                comp_bad_ip_list.append(bad_ip[0])
            except IOError:
                print "Unable to read", cur_log_filename, "for marking bad ips, skipping..."

        #Manual ip selection
        for bad_ip in self._malicious_ip_list:
            self._training_set.mark_as_bad(bad_ip)
            if bad_ip not in comp_bad_ip_list:
                comp_bad_ip_list.append(bad_ip)

        print "sample: bad/all: %i/%i"%(len([1 for i in self._training_set._target if i == self._training_set.BAD_TARGET]),len(self._training_set._target))

        #return the malicious list for possible further use
        return comp_bad_ip_list

    def train(self):
        """
        Simply runs the train procedure of the classifier.

        If all IPs are good, no actual training will happen.
        """
        #the user should mark the bad ips first

        #If all ips are good there's nothing to train
        if sum(self._training_set._target):
            self._ban_classifier.fit(self._training_set._ip_feature_array, \
                                     self._training_set._target)
            #for testing the consistency of the prediction
            # for i in range(0, len(self._training_set._target)):
            #     if self._training_set._target[i] == self._training_set.BAD_TARGET:
            #         print self._training_set._ip_feature_array[i]

    def mark_and_train(self):
        self.mark_bad_target()
        self.train()

    def save_model(self,filename, model_format='pickle'):
    	"""
        Given a filename this function saves the current trainer model
        as a pickle file using the Sklearn pickle function.
        On success it returns true on failure it returns an error
        message.

        INPUT::
            filename: the filename to store the model
            model_format: to be chosen from pickle:python object pickling, libsvm: libsvm
            format no normalization data, 'normal_svm' libsvm model with normalisation data
            attached at the end
        """
        model_to_save = ReconstructableModel(self._training_set, self._ban_classifier)
        if (model_format == 'pickle'):
            model_to_save.save_model(filename)
        elif (model_format == 'libsvm'):
            model_to_save.save_bare_svm_model(filename)
        elif (model_format == 'normal_svm'):
            model_to_save.save_reconstructable_svm_model(filename)
        else:
            raise NotImplementedError("model format not recognized")

    def load_model(self,filename):
        """
        For a given filename, this function attempts to load a pickle file as
        the current trainer model. On success it returns True; on failure it
        returns an error.
        """
        model_to_load = ReconstructableModel.construct_from_stored_model(filename)

        self._ban_classifier = model_to_load.ban_classifier
        self._training_set._normalisation_data = model_to_load.normalisation_data
        self._training_set._normalisation_function = self._training_set.normalise_individual
        if self._training_set._normalisation_data[TrainingSet.NORMALISATION_TYPE] == 'sparse':
            self._training_set._normalisation_function = self._training_set.normalise_sparse

        return True

    def get_training_model(self):
        """
        Simply an access function for the IP index, feature array and target
        in the training set. The returned objects are not safe to modify, so
        be nice.
        """
        return (self._training_set._ip_index, \
                    self._training_set._ip_feature_array, \
                    self._training_set._target)
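
A minimal sketch of the log-driven workflow in this variant, again assuming a scikit-learn SVM as the ban_classifier; the failregex, log path, feature values and model file name are hypothetical and only illustrate the API shown above.

# Sketch of the log-driven workflow; the regex, paths and file names are hypothetical.
from sklearn import svm

ip_feature_db = {'192.0.2.1': [120.0, 0.9, 0.05],
                 '192.0.2.2': [  3.0, 0.1, 2.30]}

trainer = Train2Ban(svm.SVC(kernel='linear'))
trainer.add_to_sample(ip_feature_db)
trainer.normalise('sparse')

# associate fail2ban-style regexes with a log id, then point the trainer at that log
trainer.add_bad_regexes('web_log', [r'^<HOST> .* "GET /wp-login\.php'])
trainer.add_malicious_history_log_files([['web_log', '/var/log/apache2/access.log']])

bad_ips = trainer.mark_bad_target()   # also returns every IP that was marked bad
trainer.train()

# persist the trained model; 'pickle', 'libsvm' and 'normal_svm' are the accepted formats
trainer.save_model('/tmp/ban_model.pkl', model_format='pickle')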