def runNaiveBayes(self):
    # Create an experiment for the naive Bayes model
    exp = self.iteration.experiment
    name = '-'.join([
        'AL' + str(exp.experiment_id),
        'Iter' + str(self.iteration.iteration_number), 'all', 'NaiveBayes'
    ])
    naive_bayes_exp = ClassificationExperiment(
        exp.project,
        exp.dataset,
        exp.db,
        exp.cursor,
        experiment_name=name,
        experiment_label=exp.experiment_label,
        parent=exp.experiment_id)
    naive_bayes_exp.setFeaturesFilenames(exp.features_filenames)
    test_conf = TestConfiguration()
    test_conf.setUnlabeled(labels_annotations='annotations')
    naive_bayes_conf = GaussianNaiveBayesConfiguration(
        exp.conf.models_conf['multiclass'].num_folds, False, True, test_conf)
    naive_bayes_exp.setClassifierConf(naive_bayes_conf)
    naive_bayes_exp.createExperiment()
    naive_bayes_exp.export()
    # Update the training data: the naive Bayes classifier is trained on all
    # the instances, the unlabeled ones getting the families predicted by the
    # logistic regression model.
    self.datasets.test_instances.families = list(self.lr_predicted_labels)
    all_datasets = ClassifierDatasets(naive_bayes_exp,
                                      naive_bayes_exp.classification_conf)
    train_instances = Instances()
    train_instances.union(self.datasets.train_instances,
                          self.datasets.test_instances)
    all_datasets.train_instances = train_instances
    all_datasets.test_instances = None
    all_datasets.setSampleWeights()
    self.evalClusteringPerf(all_datasets.train_instances)
    # Train the naive Bayes detection model and predict
    self.naive_bayes = GaussianNaiveBayes(naive_bayes_exp, all_datasets)
    self.naive_bayes.training()
    self.nb_time = self.naive_bayes.training_execution_time
    # Reset the test families before predicting them with the naive Bayes model.
    self.datasets.test_instances.families = [
        None
    ] * self.datasets.test_instances.numInstances()
    self.nb_predicted_log_proba = self.naive_bayes.pipeline.predict_log_proba(
        self.datasets.test_instances.getFeatures())
    # Time only the label prediction, and add it to the training time.
    start_time = time.time()
    self.nb_predicted_labels = self.naive_bayes.pipeline.predict(
        self.datasets.test_instances.getFeatures())
    self.nb_time += time.time() - start_time
    self.nb_class_labels = self.naive_bayes.class_labels
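
# A minimal standalone sketch (not SecuML code) of the prediction step above:
# a fitted scikit-learn pipeline ending in GaussianNB exposes both
# predict_log_proba() and predict(), which is what self.naive_bayes.pipeline
# relies on. All data and names below are synthetic.
import time

import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X_train = np.random.rand(100, 5)             # hypothetical feature matrix
y_train = np.random.choice(['a', 'b'], 100)  # hypothetical family labels
X_test = np.random.rand(20, 5)

pipeline = Pipeline([('scaler', StandardScaler()), ('nb', GaussianNB())])
pipeline.fit(X_train, y_train)

log_proba = pipeline.predict_log_proba(X_test)  # per-class log probabilities
start_time = time.time()
labels = pipeline.predict(X_test)               # hard label predictions
prediction_time = time.time() - start_time      # timed separately, as above
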
def getInstances(self):
    ids = self.getIds()
    features, features_names = self.getFeatures()
    labels, families, annotations, true_labels, true_families = self.getLabels(
        len(ids))
    instances = Instances(ids, features, features_names, labels, families,
                          true_labels, true_families, annotations)
    return instances
def transform(self, instances):
    projected_matrix = self.pipeline.transform(instances.getFeatures())
    projected_instances = Instances(
        instances.getIds(), projected_matrix,
        self.componentLabels(instances.getFeaturesNames()), instances.labels,
        instances.families, instances.true_labels, instances.true_families,
        instances.annotations)
    return projected_instances
def transform(self, instances, visu=True, performance=False):
    projected_instances = Instances()
    projected_matrix = self.pipeline.transform(instances.getFeatures())
    projected_instances.initFromMatrix(
        instances.getIds(),
        projected_matrix,
        self.componentLabels(),
        labels=instances.getLabels(),
        families=instances.families,
        true_labels=instances.getLabels(true_labels=True),
        true_families=instances.getFamilies(true_labels=True),
        annotations=instances.annotations)
    if visu:
        # Use a distinct name to avoid shadowing the `visu` flag.
        visualization = Visualization(self)
        visualization.allHexBin(projected_instances)
    if performance:
        self.performance = self.assessPerformance(projected_instances)
    return projected_instances
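
# A minimal standalone sketch (not SecuML code) of the projection step above:
# a fitted scikit-learn pipeline projects the feature matrix, and the
# resulting components get generic names (the role componentLabels() plays
# in the class above). All data below is synthetic.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X = np.random.rand(50, 10)  # hypothetical feature matrix

pipeline = Pipeline([('scaler', StandardScaler()),
                     ('proj', PCA(n_components=3))])
pipeline.fit(X)

projected_matrix = pipeline.transform(X)  # shape: (50, 3)
component_labels = ['C%d' % i for i in range(projected_matrix.shape[1])]
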
def buildCategories(self):
    self.buildMulticlassClassifier()
    all_instances = Instances()
    train = self.multiclass_model.datasets.train_instances
    test = self.multiclass_model.datasets.test_instances
    all_instances.union(test, train)
    if test.numInstances() > 0:
        # The families of the test instances are predicted by the multiclass
        # model. The training instances keep their annotated families,
        # encoded as one-hot probability rows stacked under the predicted
        # probabilities.
        predicted_families = \
            self.multiclass_model.testing_monitoring.getPredictedLabels()
        all_families = list(predicted_families) + train.families
        predicted_proba = \
            self.multiclass_model.testing_monitoring.getAllPredictedProba()
        for family in train.families:
            probas = [
                int(family == s) for s in self.multiclass_model.class_labels
            ]
            predicted_proba = np.vstack((predicted_proba, np.array(probas)))
    else:
        # Without test instances, all the families come from the annotations.
        all_families = self.annotated_instances.families
        predicted_proba = None
        for family in self.annotated_instances.families:
            probas = [
                int(family == s) for s in self.multiclass_model.class_labels
            ]
            if predicted_proba is None:
                predicted_proba = np.array(probas)
            else:
                predicted_proba = np.vstack((predicted_proba,
                                             np.array(probas)))
    labels_values = list(self.multiclass_model.class_labels)
    assigned_categories = [labels_values.index(x) for x in all_families]
    self.categories = Categories(self.multiclass_model.experiment,
                                 all_instances, assigned_categories,
                                 predicted_proba, self.label,
                                 self.multiclass_model.class_labels)
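
# A minimal standalone sketch (not SecuML code) of the one-hot stacking used
# above: annotated families become degenerate probability rows (probability 1
# on the annotated class) stacked under the model's predicted probabilities.
# All data below is synthetic.
import numpy as np

class_labels = ['adware', 'ransomware', 'trojan']
predicted_proba = np.array([[0.7, 0.2, 0.1]])  # hypothetical model output
train_families = ['trojan', 'adware']          # hypothetical annotations

for family in train_families:
    probas = [int(family == s) for s in class_labels]
    predicted_proba = np.vstack((predicted_proba, np.array(probas)))

# predicted_proba is now:
# [[0.7, 0.2, 0.1],
#  [0. , 0. , 1. ],
#  [1. , 0. , 0. ]]
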
class DescriptiveStatistics(object):
    def __init__(self, experiment):
        self.instances = Instances()
        self.instances.initFromExperiment(experiment)
        self.output_directory = dir_tools.getExperimentOutputDirectory(
            experiment)

    # The file features_types.json contains the list of features with their
    # corresponding type (numeric or binary).
    # This file is updated after the processing of each feature to allow the
    # user to display the results as they become available.
    # The features are sorted alphabetically.
    def generateDescriptiveStatistics(self):
        features_types = {}
        features_types['features'] = []
        features_types['types'] = {}
        for feature in self.instances.features_names:
            stats = FeatureDescriptiveStatistics(self.instances, feature,
                                                 self.output_directory)
            stats.generateDescriptiveStatistics()
            features_types['features'].append(feature)
            features_types['features'].sort()
            features_types['types'][feature] = stats.feature_type
            with open(self.output_directory + 'features_types.json',
                      'w') as f:
                json.dump(features_types, f, indent=2)
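
# A hedged sketch of the features_types.json layout produced above. The
# feature names are hypothetical; the structure follows the dictionary built
# in generateDescriptiveStatistics().
import json

features_types = {
    'features': ['entropy', 'num_sections'],  # sorted alphabetically
    'types': {'entropy': 'numeric', 'num_sections': 'binary'}
}
print(json.dumps(features_types, indent=2))
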
def run(self, instances=None, visu=True, performance=False):
    if instances is None:
        instances = Instances()
        instances.initFromExperiment(self.experiment)
    self.fit(instances, visu=visu)
    self.transform(instances, visu=visu, performance=performance)
class Datasets(object):
    def __init__(self, experiment):
        self.experiment = experiment
        self.instances = Instances()
        self.instances.initFromExperiment(experiment)
        self.setValidationInstances(experiment.validation_conf)
        self.initCounts()

    def setValidationInstances(self, validation_conf):
        self.validation_instances = None
        if validation_conf is not None:
            self.validation_instances = Instances()
            self.validation_instances.initFromExperiment(
                validation_conf.test_exp)

    def update(self, instance_id, label, family, annotation):
        self.new_labels = True
        self.instances.setLabel(instance_id, label == 'malicious')
        self.instances.setFamily(instance_id, family)
        self.instances.setAnnotation(instance_id, annotation)
        ## Update the annotation count
        if annotation:
            self.num_annotations[label] += 1

    def checkLabelsWithDB(self, cursor, experiment_label_id):
        self.instances.checkLabelsWithDB(cursor, experiment_label_id)

    def saveLabeledInstances(self, iteration_number):
        for i in ['annotations', 'labels']:
            filename = dir_tools.getDatasetDirectory(self.experiment.project,
                                                     self.experiment.dataset)
            filename += 'labels/' + i + '_'
            filename += self.experiment.labeling_method + '_'
            filename += 'exp' + str(self.experiment.experiment_id) + '_'
            filename += 'it' + str(iteration_number) + '.csv'
            if i == 'annotations':
                instances = self.instances.getAnnotatedInstances()
            elif i == 'labels':
                instances = self.instances.getLabeledInstances()
            instances.saveInstancesLabels(filename)

    def numAnnotations(self, label='all'):
        if label == 'all':
            num_annotations = self.numAnnotations('malicious')
            num_annotations += self.numAnnotations('benign')
            return num_annotations
        else:
            return self.num_annotations[label]

    def numLabels(self, label='all'):
        if label == 'all':
            num_labels = self.numLabels('malicious')
            num_labels += self.numLabels('benign')
            return num_labels
        else:
            if label == 'benign':
                return len(self.instances.getBenignIds())
            elif label == 'malicious':
                return len(self.instances.getMaliciousIds())

    def numInstances(self, label='all', true_labels=False):
        return self.instances.numInstances(label=label,
                                           true_labels=true_labels)

    def getFeaturesNames(self):
        return self.instances.getFeaturesNames()

    def getTrainInstances(self, conf):
        if conf.semi_supervised:
            return self.instances
        else:
            return self.getAnnotatedInstances()

    def getTestInstances(self):
        return self.getUnlabeledInstances()

    def getAnnotatedInstances(self, label='all'):
        return self.instances.getAnnotatedInstances(label=label)

    def getLabeledInstances(self):
        return self.instances.getLabeledInstances()

    def getUnlabeledInstances(self):
        return self.instances.getUnlabeledInstances()

    def getInstancesFromIds(self, instance_ids):
        return self.instances.getInstancesFromIds(instance_ids)

    #############################
    #############################
    ##### Private functions #####
    #############################
    #############################

    ## The initial labels have been checked by an expert,
    ## so they count as annotations.
    def initCounts(self):
        self.num_instances = {}
        self.num_init = {}
        self.num_annotations = {}
        for label in ['malicious', 'benign']:
            self.num_instances[label] = None
            if self.instances.hasTrueLabels():
                num = self.instances.numInstances(label=label,
                                                  true_labels=True)
                self.num_instances[label] = num
            self.num_init[label] = self.getLabeledInstances().numInstances(
                label=label)
            self.num_annotations[label] = self.num_init[label]
class ClassifierDatasets(object):
    def __init__(self, experiment, classification_conf):
        self.experiment = experiment
        self.classification_conf = classification_conf
        self.validation_instances = None

    def getFeaturesNames(self):
        return self.train_instances.getFeaturesNames()

    def setValidationInstances(self, validation_instances):
        self.validation_instances = validation_instances
        if self.validation_instances is not None:
            self.validation_instances.eraseLabels()

    def setDatasets(self, train_instances, test_instances):
        self.train_instances = train_instances
        self.test_instances = test_instances
        self.test_instances.eraseLabels()
        self.setSampleWeights()

    def generateDatasets(self):
        instances = Instances()
        instances.initFromExperiment(self.experiment)
        test_conf = self.classification_conf.test_conf
        if test_conf.method == 'random_split':
            self.splitTrainDataset(instances, test_conf.test_size)
        elif test_conf.method == 'test_dataset':
            self.generateTrainTestDatasets(instances, test_conf.test_exp)
        elif test_conf.method == 'unlabeled':
            self.unlabeledLabeledDatasets(instances,
                                          test_conf.labels_annotations)
        self.test_instances.eraseLabels()
        self.setSampleWeights()

    def setSampleWeights(self):
        if self.classification_conf.sample_weight:
            self.computeSampleWeights()
        else:
            self.sample_weight = None

    def splitTrainDataset(self, instances, test_size):
        labeled_instances = instances.getLabeledInstances()
        train, test = generateTrainTestIds(labeled_instances.getIds(),
                                           test_size)
        self.train_instances = labeled_instances.getInstancesFromIds(train)
        self.test_instances = labeled_instances.getInstancesFromIds(test)

    def generateTrainTestDatasets(self, instances, validation_exp):
        self.train_instances = instances
        self.test_instances = Instances()
        self.test_instances.initFromExperiment(validation_exp)

    def unlabeledLabeledDatasets(self, instances, labels_annotations):
        if labels_annotations == 'labels':
            self.train_instances = instances.getLabeledInstances()
        elif labels_annotations == 'annotations':
            self.train_instances = instances.getAnnotatedInstances()
        self.test_instances = instances.getUnlabeledInstances()
        self.test_instances.labels = self.test_instances.true_labels
        self.test_instances.families = self.test_instances.true_families

    def semiSupervisedSplit(self, instances, test_size):
        labeled_instances = instances.getLabeledInstances()
        labeled_train, labeled_test = generateTrainTestIds(
            labeled_instances.getIds(), test_size)
        unlabeled_instances = instances.getUnlabeledInstances()
        unlabeled_train, unlabeled_test = generateTrainTestIds(
            unlabeled_instances.getIds(), test_size)
        self.train_instances = instances.getInstancesFromIds(
            labeled_train + unlabeled_train)
        self.test_instances = instances.getInstancesFromIds(
            labeled_test + unlabeled_test)

    def computeSampleWeights(self):
        # Weight each training instance by the inverse of its family
        # proportion, capped at 100, so that rare families are not
        # overwhelmed by the most common ones.
        self.sample_weight = [1] * self.train_instances.numInstances()
        families = self.train_instances.getFamilies()
        families_prop = self.train_instances.getFamiliesProp()
        for i in range(self.train_instances.numInstances()):
            self.sample_weight[i] = 1 / families_prop[families[i]]
            if self.sample_weight[i] > 100:
                self.sample_weight[i] = 100
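
# A minimal standalone sketch (not SecuML code) of the inverse-proportion
# sample weighting in computeSampleWeights() above. The family names and
# proportions are hypothetical.
families = ['trojan', 'trojan', 'adware', 'trojan']
counts = {f: families.count(f) for f in set(families)}
families_prop = {f: c / float(len(families)) for f, c in counts.items()}

sample_weight = [min(1 / families_prop[f], 100) for f in families]
# trojan: 1 / 0.75 ~ 1.33, adware: 1 / 0.25 = 4.0
# weights above 100 are capped, exactly as in the method above.
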