Example No. 1
0
    def test_all_features(self):
        """Parse every configured log file through the test IP sieve and
        run each registered Learn2BanFeature subclass against it,
        accumulating the results in the shared test feature database.
        """
        for cur_log_file in self.log_files:
            self.test_ip_sieve.add_log_file(cur_log_file)
            self.test_ip_sieve.parse_log()

            for CurrentFeatureType in Learn2BanFeature.__subclasses__():
                cur_feature_tester = CurrentFeatureType(self.test_ip_sieve, self.test_ip_feature_db)
                cur_feature_tester.compute()

        # single-argument print() call form is valid in both Python 2 and 3,
        # unlike the Python-2-only print statement used previously
        print(self.test_ip_feature_db)
Example No. 2
0
    def test_all_features(self):
        """Exercise every Learn2BanFeature subclass over all test log
        files, letting each one write its output into the shared
        test feature database, which is printed at the end.
        """
        for cur_log_file in self.log_files:
            self.test_ip_sieve.add_log_file(cur_log_file)
            self.test_ip_sieve.parse_log()

            for CurrentFeatureType in Learn2BanFeature.__subclasses__():
                cur_feature_tester = CurrentFeatureType(
                    self.test_ip_sieve, self.test_ip_feature_db)
                cur_feature_tester.compute()

        # print() function syntax works for a single argument under both
        # Python 2 and Python 3; the bare print statement is Py2-only
        print(self.test_ip_feature_db)
Example No. 3
0
    def gather_all_features(self, log_files):
        """
        gathers all features

        INPUT:
            log_files: the logs that we went through it.
        """
        for cur_log_file in log_files:
            self.ip_sieve.add_log_file(cur_log_file)
            self.ip_sieve.parse_log()
            for CurrentFeatureType in Learn2BanFeature.__subclasses__():
                cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
                cur_feature_tester.compute()

        return self.ip_feature_db
Example No. 4
0
    def gather_all_features(self, log_files):
        """
        gathers all features

        INPUT:
            log_files: the logs that we went through it.
        """
        for cur_log_file in log_files:
            self.ip_sieve.add_log_file(cur_log_file)
            self.ip_sieve.parse_log()
            for CurrentFeatureType in Learn2BanFeature.__subclasses__():
                cur_feature_tester = CurrentFeatureType(
                    self.ip_sieve, self.ip_feature_db)
                cur_feature_tester.compute()

        return self.ip_feature_db
Example No. 5
0
    def _process_logs(self):
        """Retrieve this experiment's training logs from the database,
        compute every registered Learn2BanFeature over them, hand the
        resulting feature vectors to the trainer, dump the pre-normalised
        vectors to a JSON file, and finally normalise the training set.

        Returns early (after logging) when the experiment has no
        associated training logs.
        """
        #this is not an OOP way of retrieving the logs, but we are
        #deliberately avoiding db access in classes other than l2btools
        cur_experiment_logs = self.l2btools.retrieve_experiment_logs(self.id)

        #if there is no log associated with this experiment then there is
        #nothing to do
        if len(cur_experiment_logs) == 0:
            logging.info("Giving up on experiment %i with no training log"%self.expr_dict['id'])
            return

        #the log id is sent along with each filename so the trainer knows
        #which regex detects the bots for which log
        self.trainer.add_malicious_history_log_files([(cur_log_info['log_id'], cur_log_info['file_name']) for cur_log_info in cur_experiment_logs])

        #extract just the filenames to feed to the IP sieve below
        log_filenames = tuple(cur_log['file_name'] for cur_log in cur_experiment_logs)
        #at this stage this is only a preliminary list; features may be
        #dropped later due to 0 variance
        self._active_feature_list = []
        #dry run over all feature classes just to gather the indices of the
        #available features (no computation happens here)
        for CurrentFeatureType in Learn2BanFeature.__subclasses__():
            cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
            self._active_feature_list.append(cur_feature_tester._FEATURE_INDEX)

        for cur_log_file in log_filenames: #in theory it might be more memory
            #efficient to crunch the logs one by one, but Python is quite
            #disappointing in memory management
            try:
                self.ip_sieve.add_log_file(cur_log_file)
                self.ip_sieve.parse_log()
            except IOError:
                print "Unable to read ", cur_log_file, "skipping..."

        #compute each feature over everything the sieve has parsed so far
        for CurrentFeatureType in Learn2BanFeature.__subclasses__():
            cur_feature_tester = CurrentFeatureType(self.ip_sieve, self.ip_feature_db)
            logging.info("Computing feature %i..."%cur_feature_tester._FEATURE_INDEX)
            cur_feature_tester.compute()

            # NOTE: there is a known memory problem here; objgraph was used
            # to investigate the reference graph:
            # import objgraph
            # objgraph.show_refs([self.ip_sieve._ordered_records], filename='ips-graph.png')

        #drop the parsed records to release memory before the next big step
        del self.ip_sieve._ordered_records
        del self.ip_sieve

        #CPython does not reliably return freed memory to the OS; an
        #explicit gc pass was tried without success:
        # import gc
        # gc.collect()
        # print gc.garbage()

        self.trainer.add_to_sample(self.ip_feature_db)

        #store the non-normalised feature vectors in a json file; keys are
        #stringified because JSON object keys must be strings
        jsonized_ip_feature_db = {}
        for k,v in self.ip_feature_db.items():
            jsonized_ip_feature_db[str(k)] = v
        import json
        with open(self.base_analyse_log_file+".prenormal_ip_feature_db.json", "w") as ip_feature_file:
            json.dump(jsonized_ip_feature_db, ip_feature_file)

        del self.ip_feature_db
        del jsonized_ip_feature_db

        #normalise the training set; normalisation must happen only after
        #the whole sample has been gathered
        self.trainer.normalise(self.expr_dict['norm_mode'])
Example No. 6
0
    def _process_logs(self):
        """Retrieve this experiment's training logs from the database,
        compute every registered Learn2BanFeature over them, hand the
        resulting feature vectors to the trainer, dump the pre-normalised
        vectors to a JSON file, and finally normalise the training set.

        Returns early (after logging) when the experiment has no
        associated training logs.
        """
        #this is not an OOP way of retrieving the logs, but we are
        #deliberately avoiding db access in classes other than l2btools
        cur_experiment_logs = self.l2btools.retrieve_experiment_logs(self.id)

        #if there is no log associated with this experiment then there is
        #nothing to do
        if len(cur_experiment_logs) == 0:
            logging.info("Giving up on experiment %i with no training log" %
                         self.expr_dict['id'])
            return

        #the log id is sent along with each filename so the trainer knows
        #which regex detects the bots for which log
        self.trainer.add_malicious_history_log_files([
            (cur_log_info['log_id'], cur_log_info['file_name'])
            for cur_log_info in cur_experiment_logs
        ])

        #extract just the filenames to feed to the IP sieve below
        log_filenames = tuple(cur_log['file_name']
                              for cur_log in cur_experiment_logs)
        #at this stage this is only a preliminary list; features may be
        #dropped later due to 0 variance
        self._active_feature_list = []
        #dry run over all feature classes just to gather the indices of the
        #available features (no computation happens here)
        for CurrentFeatureType in Learn2BanFeature.__subclasses__():
            cur_feature_tester = CurrentFeatureType(self.ip_sieve,
                                                    self.ip_feature_db)
            self._active_feature_list.append(cur_feature_tester._FEATURE_INDEX)

        for cur_log_file in log_filenames:  #in theory it might be more memory
            #efficient to crunch the logs one by one, but Python is quite
            #disappointing in memory management
            try:
                self.ip_sieve.add_log_file(cur_log_file)
                self.ip_sieve.parse_log()
            except IOError:
                print "Unable to read ", cur_log_file, "skipping..."

        #compute each feature over everything the sieve has parsed so far
        for CurrentFeatureType in Learn2BanFeature.__subclasses__():
            cur_feature_tester = CurrentFeatureType(self.ip_sieve,
                                                    self.ip_feature_db)
            logging.info("Computing feature %i..." %
                         cur_feature_tester._FEATURE_INDEX)
            cur_feature_tester.compute()

            # NOTE: there is a known memory problem here; objgraph was used
            # to investigate the reference graph:
            # import objgraph
            # objgraph.show_refs([self.ip_sieve._ordered_records], filename='ips-graph.png')

        #drop the parsed records to release memory before the next big step
        del self.ip_sieve._ordered_records
        del self.ip_sieve

        #CPython does not reliably return freed memory to the OS; an
        #explicit gc pass was tried without success:
        # import gc
        # gc.collect()
        # print gc.garbage()

        self.trainer.add_to_sample(self.ip_feature_db)

        #store the non-normalised feature vectors in a json file; keys are
        #stringified because JSON object keys must be strings
        jsonized_ip_feature_db = {}
        for k, v in self.ip_feature_db.items():
            jsonized_ip_feature_db[str(k)] = v
        import json
        with open(self.base_analyse_log_file + ".prenormal_ip_feature_db.json",
                  "w") as ip_feature_file:
            json.dump(jsonized_ip_feature_db, ip_feature_file)

        del self.ip_feature_db
        del jsonized_ip_feature_db

        #normalise the training set; normalisation must happen only after
        #the whole sample has been gathered
        self.trainer.normalise(self.expr_dict['norm_mode'])