class UpdateModel(object):
    """Train and test every configured model of an iteration.

    The fitted models are stored in ``self.models`` and their cumulated
    training + testing execution times in ``self.times``, both keyed by
    the model kind (the key used in ``models_conf``).
    """

    def __init__(self, iteration):
        self.iteration = iteration
        self.models = {}
        self.times = {}

    def run(self):
        """Run ``runModel`` for each entry of ``iteration.conf.models_conf``."""
        models_conf = self.iteration.conf.models_conf
        # ``items()`` is available on Python 2 and Python 3 mappings alike,
        # whereas ``iteritems()`` only exists on Python 2.
        for kind, conf in models_conf.items():
            self.runModel(kind, conf)

    def runModel(self, kind, conf):
        """Build, train and test the model described by ``conf``.

        The validation step is run only when the datasets carry
        validation instances. Always returns ``None``.
        """
        self.setDatasets(conf)
        model = conf.model_class(conf, self.datasets, cv_monitoring=False)
        model.training()
        model.testing()
        if self.datasets.validation_instances is not None:
            model.validation()
        self.models[kind] = model
        # Execution time monitoring (validation time is not included).
        self.times[kind] = (model.training_execution_time +
                            model.testing_execution_time)

    def setDatasets(self, conf):
        """Rebuild ``self.datasets`` from the iteration datasets for ``conf``."""
        al_datasets = self.iteration.datasets
        self.datasets = ClassifierDatasets(conf)
        self.datasets.setDatasets(al_datasets.getTrainInstances(conf),
                                  al_datasets.getTestInstances())
        self.datasets.setValidationInstances(al_datasets.validation_instances)
def run(self):
    """Load the instances, build the datasets and run the classifier.

    When the test configuration method is ``'test_dataset'``, the test
    instances are loaded from the dedicated test experiment; otherwise
    ``None`` is handed to ``generateDatasets``.
    """
    instances = InstancesFromExperiment(self).getInstances()
    conf = self.classification_conf
    test_conf = conf.test_conf

    if test_conf.method == 'test_dataset':
        test_instances = InstancesFromExperiment(
            test_conf.test_exp).getInstances()
    else:
        test_instances = None

    datasets = ClassifierDatasets(conf)
    datasets.generateDatasets(instances, test_instances)

    # The model class is provided by the classification configuration.
    model_class = conf.model_class
    learning = model_class(conf, datasets)
    learning.run(self.getOutputDirectory(), self)
class TrainTestValidation(object):
    """Run one classification experiment per configured model.

    For each entry of the experiment's ``models_conf`` a child
    ``ClassificationExperiment`` is created and its model is run.
    Models are kept in ``self.models``, execution times in ``self.times``
    and the child experiments in ``self.models_exp``, all keyed by the
    model kind.
    """

    def __init__(self, iteration):
        self.iteration = iteration
        self.models = {}
        self.times = {}

    def run(self):
        """Run every configured model, then export the experiment ids."""
        models_conf = self.iteration.experiment.conf.models_conf
        self.models_exp = {}
        # ``items()`` works on Python 2 and Python 3, ``iteritems()`` does not.
        for kind, conf in models_conf.items():
            self.models_exp[kind] = self.runModel(kind, conf)
        self.exportModelsExperiments()

    def exportModelsExperiments(self):
        """Write ``{kind: experiment_id}`` to ``models_experiments.json``.

        The file is created in the iteration output directory.
        """
        import os

        export_models = {}
        for kind, exp in self.models_exp.items():
            export_models[kind] = exp.experiment_id
        # os.path.join yields the same path as plain concatenation when the
        # directory already ends with a separator, and inserts the missing
        # separator otherwise.
        output_file = os.path.join(self.iteration.output_directory,
                                   'models_experiments.json')
        with open(output_file, 'w') as f:
            json.dump(export_models, f, indent=2)

    def runModel(self, kind, conf):
        """Create the child experiment for ``kind`` and run its model.

        Returns the ``ClassificationExperiment`` that was created.
        """
        self.setDatasets(conf)
        # Create the experiment
        exp = self.iteration.experiment
        name = '-'.join(['AL' + str(exp.experiment_id),
                         'Iter' + str(self.iteration.iteration_number),
                         kind])
        model_exp = ClassificationExperiment(exp.project, exp.dataset,
                                             exp.session,
                                             experiment_name=name,
                                             labels_id=exp.labels_id,
                                             parent=exp.experiment_id)
        model_exp.setFeaturesFilenames(exp.features_filenames)
        model_exp.setClassifierConf(conf)
        model_exp.createExperiment()
        model_exp.export()
        # Build the model
        model = conf.model_class(model_exp.classification_conf,
                                 self.datasets, cv_monitoring=True)
        model.run(model_exp.getOutputDirectory(), model_exp)
        self.models[kind] = model
        # Execution time monitoring
        self.times[kind] = (model.training_execution_time +
                            model.testing_execution_time)
        return model_exp

    def setDatasets(self, conf):
        """Rebuild ``self.datasets`` from the iteration datasets for ``conf``."""
        al_datasets = self.iteration.datasets
        self.datasets = ClassifierDatasets(conf)
        self.datasets.setDatasets(al_datasets.getTrainInstances(conf),
                                  al_datasets.getTestInstances())
        self.datasets.setValidationInstances(al_datasets.validation_instances)
def runNaiveBayes(self):
    """Train a Gaussian naive Bayes model on all the data and predict.

    The test instances are given the labels in ``self.lr_predicted_labels``
    as families, merged into a copy of the training instances, and the
    naive Bayes model is trained on the result. The families of the test
    instances are then reset to ``None`` and the model predicts on them
    (empty lists are stored when there are no test instances).
    """
    naive_bayes_conf = self.createNaiveBayesConf()
    test_set = self.datasets.test_instances

    # Update training data - the naive Bayes classifier is trained on all
    # the data
    test_set.families = list(self.lr_predicted_labels)
    all_datasets = ClassifierDatasets(naive_bayes_conf)
    merged_instances = copy.deepcopy(self.datasets.train_instances)
    merged_instances.union(test_set)
    all_datasets.train_instances = merged_instances
    all_datasets.test_instances = None
    all_datasets.setSampleWeights()
    self.evalClusteringPerf(all_datasets.train_instances)

    # Train the naive Bayes detection model and predict
    self.naive_bayes = GaussianNaiveBayes(naive_bayes_conf, all_datasets)
    self.naive_bayes.training()
    self.nb_time = self.naive_bayes.training_execution_time

    num_test_instances = test_set.numInstances()
    test_set.families = [None] * num_test_instances
    has_test_instances = num_test_instances != 0

    if has_test_instances:
        self.nb_predicted_log_proba = \
            self.naive_bayes.pipeline.predict_log_proba(
                test_set.getFeatures())
    else:
        self.nb_predicted_log_proba = []

    # Only the label prediction is added to the execution time.
    start_time = time.time()
    if has_test_instances:
        self.nb_predicted_labels = self.naive_bayes.pipeline.predict(
            test_set.getFeatures())
    else:
        self.nb_predicted_labels = []
    self.nb_time += time.time() - start_time

    self.nb_class_labels = self.naive_bayes.class_labels
def buildMulticlassClassifier(self):
    """Build and run the multiclass model unless one is already stored.

    The model is trained on ``self.annotated_instances`` and tested on
    the instances whose ids are in ``self.predicted_ids``. The result is
    stored in ``self.multiclass_model``.
    """
    already_built = self.multiclass_model is not None
    if already_built:
        return

    experiment = self.createMulticlassExperiment()
    iteration_datasets = self.iteration.datasets
    to_predict = iteration_datasets.getInstancesFromIds(self.predicted_ids)

    model_datasets = ClassifierDatasets(experiment,
                                        experiment.classification_conf)
    model_datasets.train_instances = self.annotated_instances
    model_datasets.test_instances = to_predict
    model_datasets.setSampleWeights()

    # The model class comes from the experiment's classification
    # configuration.
    model_class = experiment.classification_conf.model_class
    self.multiclass_model = model_class(experiment, model_datasets,
                                        cv_monitoring=True)
    self.multiclass_model.run()
def buildMulticlassClassifier(self, alerts_ids):
    """Build, run and return a multiclass model for the given alerts.

    The model is trained on the malicious training instances and tested
    on the test instances whose ids are in ``alerts_ids``.
    """
    experiment = self.createMulticlassExperiment()
    model_datasets = ClassifierDatasets(experiment,
                                        experiment.classification_conf)

    train_set = self.datasets.train_instances
    malicious_ids = train_set.getMaliciousIds()
    model_datasets.train_instances = \
        self.datasets.train_instances.getInstancesFromIds(malicious_ids)

    test_set = self.datasets.test_instances
    model_datasets.test_instances = test_set.getInstancesFromIds(alerts_ids)
    model_datasets.setSampleWeights()

    # The model class comes from the experiment's classification
    # configuration.
    model_class = experiment.classification_conf.model_class
    model = model_class(experiment, model_datasets, cv_monitoring=False)
    model.run()
    return model
def trainNaiveBayes(self):
    """Train and return a Gaussian naive Bayes model on ``self.instances``.

    During training the families of ``self.instances`` are temporarily
    replaced by ``self.assigned_categories``. The previous families are
    put back before returning, including when training raises.
    """
    naive_bayes_conf = self.getNaiveBayesConf()
    datasets = ClassifierDatasets(naive_bayes_conf)
    current_families = copy.deepcopy(self.instances.families)
    # families are altered
    self.instances.families = self.assigned_categories
    try:
        datasets.train_instances = self.instances
        datasets.test_instances = None
        datasets.setSampleWeights()
        naive_bayes = GaussianNaiveBayes(naive_bayes_conf, datasets)
        naive_bayes.training()
    finally:
        # families are restored, even if the training failed, so that
        # self.instances is never left with the temporary families.
        self.instances.families = current_families
    return naive_bayes
def runNaiveBayes(self):
    """Train a Gaussian naive Bayes model on all the data and predict.

    A dedicated child ``ClassificationExperiment`` is created and
    exported. The test instances are given the labels in
    ``self.lr_predicted_labels`` as families and merged with the training
    instances to train the model. The families of the test instances are
    then reset to ``None`` and the model predicts on them.

    When there are no test instances, the predictions are stored as
    empty lists instead of calling the pipeline on an empty feature set.
    """
    # Create an experiment for the naive Bayes model
    exp = self.iteration.experiment
    name = '-'.join([
        'AL' + str(exp.experiment_id),
        'Iter' + str(self.iteration.iteration_number),
        'all',
        'NaiveBayes'
    ])
    naive_bayes_exp = ClassificationExperiment(
        exp.project, exp.dataset, exp.db, exp.cursor,
        experiment_name=name,
        experiment_label=exp.experiment_label,
        parent=exp.experiment_id)
    naive_bayes_exp.setFeaturesFilenames(exp.features_filenames)
    test_conf = TestConfiguration()
    test_conf.setUnlabeled(labels_annotations='annotations')
    naive_bayes_conf = GaussianNaiveBayesConfiguration(
        exp.conf.models_conf['multiclass'].num_folds, False, True, test_conf)
    naive_bayes_exp.setClassifierConf(naive_bayes_conf)
    naive_bayes_exp.createExperiment()
    naive_bayes_exp.export()

    # Update training data - the naive Bayes classifier is trained on all
    # the data
    self.datasets.test_instances.families = list(self.lr_predicted_labels)
    all_datasets = ClassifierDatasets(naive_bayes_exp,
                                      naive_bayes_exp.classification_conf)
    train_instances = Instances()
    train_instances.union(self.datasets.train_instances,
                          self.datasets.test_instances)
    all_datasets.train_instances = train_instances
    all_datasets.test_instances = None
    all_datasets.setSampleWeights()
    self.evalClusteringPerf(all_datasets.train_instances)

    # Train the naive Bayes detection model and predict
    self.naive_bayes = GaussianNaiveBayes(naive_bayes_exp, all_datasets)
    self.naive_bayes.training()
    self.nb_time = self.naive_bayes.training_execution_time
    num_test_instances = self.datasets.test_instances.numInstances()
    self.datasets.test_instances.families = [None] * num_test_instances

    # Guard against an empty test set: there is nothing to predict, so the
    # pipeline is not called at all.
    if num_test_instances == 0:
        self.nb_predicted_log_proba = []
    else:
        self.nb_predicted_log_proba = \
            self.naive_bayes.pipeline.predict_log_proba(
                self.datasets.test_instances.getFeatures())

    # Only the label prediction is added to the execution time.
    start_time = time.time()
    if num_test_instances == 0:
        self.nb_predicted_labels = []
    else:
        self.nb_predicted_labels = self.naive_bayes.pipeline.predict(
            self.datasets.test_instances.getFeatures())
    self.nb_time += time.time() - start_time
    self.nb_class_labels = self.naive_bayes.class_labels
def trainNaiveBayes(self, iteration_number):
    """Train and return a Gaussian naive Bayes model on ``self.instances``.

    A naive Bayes experiment is created for ``iteration_number``. During
    training the families of ``self.instances`` are temporarily replaced
    by ``self.assigned_categories``. The previous families are put back
    before returning, including when training raises.
    """
    naive_bayes_exp = self.createNaiveBayesExperiment(iteration_number)
    # Train the naive Bayes detection model and predict
    datasets = ClassifierDatasets(naive_bayes_exp,
                                  naive_bayes_exp.classification_conf)
    current_families = copy.deepcopy(self.instances.families)
    # families are altered
    self.instances.families = self.assigned_categories
    try:
        datasets.train_instances = self.instances
        datasets.test_instances = None
        datasets.setSampleWeights()
        naive_bayes = GaussianNaiveBayes(naive_bayes_exp, datasets)
        naive_bayes.training()
    finally:
        # families are restored, even if the training failed, so that
        # self.instances is never left with the temporary families.
        self.instances.families = current_families
    return naive_bayes
def buildMulticlassClassifier(self):
    """Build, train and test the multiclass model unless one is stored.

    The model is trained on ``self.annotated_instances`` and tested on
    the instances whose ids are in ``self.predicted_ids``. Validation is
    run only when the datasets carry validation instances. The result is
    stored in ``self.multiclass_model``.
    """
    already_built = self.multiclass_model is not None
    if already_built:
        return

    conf = self.getMulticlassConf()
    iteration_datasets = self.iteration.datasets
    to_predict = iteration_datasets.getInstancesFromIds(self.predicted_ids)

    model_datasets = ClassifierDatasets(conf)
    model_datasets.train_instances = self.annotated_instances
    model_datasets.test_instances = to_predict
    model_datasets.setSampleWeights()

    # The model class is provided by the multiclass configuration.
    model_class = conf.model_class
    self.multiclass_model = model_class(conf, model_datasets,
                                        cv_monitoring=True)
    self.multiclass_model.training()
    self.multiclass_model.testing()
    has_validation = model_datasets.validation_instances is not None
    if has_validation:
        self.multiclass_model.validation()
def setDatasets(self, conf):
    """Rebuild ``self.datasets`` from the iteration datasets for ``conf``.

    The train instances are requested for ``conf``; the test and
    validation instances are taken as they are from the iteration.
    """
    source = self.iteration.datasets
    self.datasets = ClassifierDatasets(conf)

    train_instances = source.getTrainInstances(conf)
    test_instances = source.getTestInstances()
    self.datasets.setDatasets(train_instances, test_instances)

    validation_instances = source.validation_instances
    self.datasets.setValidationInstances(validation_instances)