Example no. 1
 def generateDatasets(self):
     instances = Instances()
     instances.initFromExperiment(self.experiment)
     test_conf = self.classification_conf.test_conf
     if test_conf.method == 'random_split':
         self.splitTrainDataset(instances, test_conf.test_size)
     elif test_conf.method == 'test_dataset':
         self.generateTrainTestDatasets(instances, test_conf.test_exp)
     elif test_conf.method == 'unlabeled':
         self.unlabeledLabeledDatasets(instances,
                                       test_conf.labels_annotations)
     self.test_instances.eraseLabels()
     self.setSampleWeights()
Example no. 2
 def runNaiveBayes(self):
     # Create an experiment for the naive Bayes model
     exp = self.iteration.experiment
     name = '-'.join([
         'AL' + str(exp.experiment_id),
         'Iter' + str(self.iteration.iteration_number), 'all', 'NaiveBayes'
     ])
     naive_bayes_exp = ClassificationExperiment(
         exp.project,
         exp.dataset,
         exp.db,
         exp.cursor,
         experiment_name=name,
         experiment_label=exp.experiment_label,
         parent=exp.experiment_id)
     naive_bayes_exp.setFeaturesFilenames(exp.features_filenames)
     test_conf = TestConfiguration()
     test_conf.setUnlabeled(labels_annotations='annotations')
     naive_bayes_conf = GaussianNaiveBayesConfiguration(
         exp.conf.models_conf['multiclass'].num_folds, False, True,
         test_conf)
     naive_bayes_exp.setClassifierConf(naive_bayes_conf)
     naive_bayes_exp.createExperiment()
     naive_bayes_exp.export()
     # Update training data - the naive Bayes classifier is trained on all the data
     self.datasets.test_instances.families = list(self.lr_predicted_labels)
     all_datasets = ClassifierDatasets(naive_bayes_exp,
                                       naive_bayes_exp.classification_conf)
     train_instances = Instances()
     train_instances.union(self.datasets.train_instances,
                           self.datasets.test_instances)
     all_datasets.train_instances = train_instances
     all_datasets.test_instances = None
     all_datasets.setSampleWeights()
     self.evalClusteringPerf(all_datasets.train_instances)
     # Train the naive Bayes detection model and predict
     self.naive_bayes = GaussianNaiveBayes(naive_bayes_exp, all_datasets)
     self.naive_bayes.training()
     self.nb_time = self.naive_bayes.training_execution_time
     self.datasets.test_instances.families = [
         None
     ] * self.datasets.test_instances.numInstances()
     self.nb_predicted_log_proba = self.naive_bayes.pipeline.predict_log_proba(
         self.datasets.test_instances.getFeatures())
     start_time = time.time()
     self.nb_predicted_labels = self.naive_bayes.pipeline.predict(
         self.datasets.test_instances.getFeatures())
     self.nb_time += time.time() - start_time
     self.nb_class_labels = self.naive_bayes.class_labels
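For context, the training and prediction calls in Example no. 2 rely on a standard scikit-learn pipeline pattern. A minimal standalone sketch of that pattern (the scaler step, feature matrices, and labels are illustrative assumptions, not SecuML code):

import time

import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Hypothetical train/test feature matrices and labels.
X_train = np.random.rand(100, 5)
y_train = np.random.randint(0, 3, 100)
X_test = np.random.rand(20, 5)

pipeline = Pipeline([('scaler', StandardScaler()), ('model', GaussianNB())])
pipeline.fit(X_train, y_train)

# One log-probability per class for each test instance.
log_proba = pipeline.predict_log_proba(X_test)

# Time only the prediction step, as runNaiveBayes does.
start_time = time.time()
predicted_labels = pipeline.predict(X_test)
prediction_time = time.time() - start_time

class_labels = pipeline.named_steps['model'].classes_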
Example no. 3
 def getInstances(self):
     ids = self.getIds()
     features, features_names = self.getFeatures()
     labels, families, annotations, true_labels, true_families = self.getLabels(
         len(ids))
     instances = Instances(ids, features, features_names, labels, families,
                           true_labels, true_families, annotations)
     return instances
Example no. 4
 def transform(self, instances):
     projected_matrix = self.pipeline.transform(instances.getFeatures())
     projected_instances = Instances(
         instances.getIds(), projected_matrix,
         self.componentLabels(instances.getFeaturesNames()),
         instances.labels, instances.families, instances.true_labels,
         instances.true_families, instances.annotations)
     return projected_instances
Example no. 5
 def transform(self, instances, visu=True, performance=False):
     projected_instances = Instances()
     projected_matrix = self.pipeline.transform(instances.getFeatures())
     projected_instances.initFromMatrix(
         instances.getIds(),
         projected_matrix,
         self.componentLabels(),
         labels=instances.getLabels(),
         families=instances.families,
         true_labels=instances.getLabels(true_labels=True),
         true_families=instances.getFamilies(true_labels=True),
         annotations=instances.annotations)
     if visu:
         visualization = Visualization(self)
         visualization.allHexBin(projected_instances)
     if performance:
         self.performance = self.assessPerformance(projected_instances)
     return projected_instances
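Examples no. 4 and no. 5 share the same projection pattern: transform the feature matrix with a fitted pipeline, then rebuild an Instances object whose column names are component labels instead of the original feature names. A minimal sketch of that pattern with a scikit-learn PCA (the pipeline content and the component naming scheme are illustrative assumptions):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

X = np.random.rand(50, 10)  # hypothetical feature matrix

pipeline = Pipeline([('projection', PCA(n_components=3))])
pipeline.fit(X)

projected_matrix = pipeline.transform(X)  # shape (50, 3)

# Name the projected columns, in the spirit of componentLabels().
component_labels = ['C%d' % i for i in range(projected_matrix.shape[1])]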
Example no. 6
 def buildCategories(self):
     self.buildMulticlassClassifier()
     all_instances = Instances()
     train = self.multiclass_model.datasets.train_instances
     test = self.multiclass_model.datasets.test_instances
     all_instances.union(test, train)
     if test.numInstances() > 0:
         monitoring = self.multiclass_model.testing_monitoring
         predicted_families = monitoring.getPredictedLabels()
         all_families = list(predicted_families) + train.families
         predicted_proba = monitoring.getAllPredictedProba()
         for family in train.families:
             probas = [
                 int(family == s)
                 for s in self.multiclass_model.class_labels
             ]
             predicted_proba = np.vstack(
                 (predicted_proba, np.array(probas)))
     else:
         all_families = self.annotated_instances.families
         predicted_proba = None
         for family in self.annotated_instances.families:
             probas = [
                 int(family == s)
                 for s in self.multiclass_model.class_labels
             ]
             if predicted_proba is None:
                 predicted_proba = np.array(probas)
             else:
                 predicted_proba = np.vstack(
                     (predicted_proba, np.array(probas)))
     labels_values = list(self.multiclass_model.class_labels)
     assigned_categories = [labels_values.index(x) for x in all_families]
     self.categories = Categories(self.multiclass_model.experiment,
                                  all_instances, assigned_categories,
                                  predicted_proba, self.label,
                                  self.multiclass_model.class_labels)
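The family loops in Example no. 6 turn each known family into a one-hot probability row: 1 for the matching class label, 0 elsewhere. A minimal sketch of that construction with hypothetical family names:

import numpy as np

class_labels = ['trojan', 'worm', 'adware']  # hypothetical class labels
known_families = ['worm', 'trojan']

# One row per known family, one column per class label.
rows = [[int(family == s) for s in class_labels] for family in known_families]
one_hot = np.array(rows)
# array([[0, 1, 0],
#        [1, 0, 0]])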
Example no. 7
class DescriptiveStatistics(object):
    def __init__(self, experiment):
        self.instances = Instances()
        self.instances.initFromExperiment(experiment)
        self.output_directory = dir_tools.getExperimentOutputDirectory(
            experiment)

    # The file features_types.json contains the list of features with their corresponding type (numeric or binary).
    # It is rewritten after each feature is processed, so that the user can display partial results while the computation is running.
    # The features are sorted alphabetically.
    def generateDescriptiveStatistics(self):
        features_types = {}
        features_types['features'] = []
        features_types['types'] = {}
        for feature in self.instances.features_names:
            stats = FeatureDescriptiveStatistics(self.instances, feature,
                                                 self.output_directory)
            stats.generateDescriptiveStatistics()
            features_types['features'].append(feature)
            features_types['features'].sort()
            features_types['types'][feature] = stats.feature_type
            with open(self.output_directory + 'features_types.json', 'w') as f:
                json.dump(features_types, f, indent=2)
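Given the dictionary built above, the generated features_types.json should look like the following (the feature names and types here are hypothetical):

{
  "features": [
    "entropy",
    "num_sections"
  ],
  "types": {
    "entropy": "numeric",
    "num_sections": "binary"
  }
}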
Example no. 8
 def run(self, instances=None, visu=True, performance=False):
     if instances is None:
         instances = Instances()
         instances.initFromExperiment(self.experiment)
     self.fit(instances, visu=visu)
     self.transform(instances, visu=visu, performance=performance)
Example no. 9
 def setValidationInstances(self, validation_conf):
     self.validation_instances = None
     if validation_conf is not None:
         self.validation_instances = Instances()
         self.validation_instances.initFromExperiment(
             validation_conf.test_exp)
Example no. 10
 def __init__(self, experiment):
     self.experiment = experiment
     self.instances = Instances()
     self.instances.initFromExperiment(experiment)
     self.setValidationInstances(experiment.validation_conf)
     self.initCounts()
Example no. 11
class Datasets(object):
    def __init__(self, experiment):
        self.experiment = experiment
        self.instances = Instances()
        self.instances.initFromExperiment(experiment)
        self.setValidationInstances(experiment.validation_conf)
        self.initCounts()

    def setValidationInstances(self, validation_conf):
        self.validation_instances = None
        if validation_conf is not None:
            self.validation_instances = Instances()
            self.validation_instances.initFromExperiment(
                validation_conf.test_exp)

    def update(self, instance_id, label, family, annotation):
        self.new_labels = True
        self.instances.setLabel(instance_id, label == 'malicious')
        self.instances.setFamily(instance_id, family)
        self.instances.setAnnotation(instance_id, annotation)
        ## Update the annotation count
        if annotation:
            self.num_annotations[label] += 1

    def checkLabelsWithDB(self, cursor, experiment_label_id):
        self.instances.checkLabelsWithDB(cursor, experiment_label_id)

    def saveLabeledInstances(self, iteration_number):
        for i in ['annotations', 'labels']:
            filename = dir_tools.getDatasetDirectory(self.experiment.project,
                                                     self.experiment.dataset)
            filename += 'labels/' + i + '_'
            filename += self.experiment.labeling_method + '_'
            filename += 'exp' + str(self.experiment.experiment_id) + '_'
            filename += 'it' + str(iteration_number) + '.csv'
            if i == 'annotations':
                instances = self.instances.getAnnotatedInstances()
            elif i == 'labels':
                instances = self.instances.getLabeledInstances()
            instances.saveInstancesLabels(filename)

    def numAnnotations(self, label='all'):
        if label == 'all':
            num_annotations = self.numAnnotations('malicious')
            num_annotations += self.numAnnotations('benign')
            return num_annotations
        else:
            return self.num_annotations[label]

    def numLabels(self, label='all'):
        if label == 'all':
            num_labels = self.numLabels('malicious')
            num_labels += self.numLabels('benign')
            return num_labels
        else:
            if label == 'benign':
                return len(self.instances.getBenignIds())
            elif label == 'malicious':
                return len(self.instances.getMaliciousIds())

    def numInstances(self, label='all', true_labels=False):
        return self.instances.numInstances(label=label,
                                           true_labels=true_labels)

    def getFeaturesNames(self):
        return self.instances.getFeaturesNames()

    def getTrainInstances(self, conf):
        if conf.semi_supervised:
            return self.instances
        else:
            return self.getAnnotatedInstances()

    def getTestInstances(self):
        return self.getUnlabeledInstances()

    def getAnnotatedInstances(self, label='all'):
        return self.instances.getAnnotatedInstances(label=label)

    def getLabeledInstances(self):
        return self.instances.getLabeledInstances()

    def getUnlabeledInstances(self):
        return self.instances.getUnlabeledInstances()

    def getInstancesFromIds(self, instance_ids):
        return self.instances.getInstancesFromIds(instance_ids)

    #############################
    #############################
    ##### Private functions #####
    #############################
    #############################

    ## We assume the initial labels have been checked by an expert.
    def initCounts(self):
        self.num_instances = {}
        self.num_init = {}
        self.num_annotations = {}
        for label in ['malicious', 'benign']:
            self.num_instances[label] = None
            if self.instances.hasTrueLabels():
                num = self.instances.numInstances(label=label,
                                                  true_labels=True)
                self.num_instances[label] = num
            self.num_init[label] = self.getLabeledInstances().numInstances(
                label=label)
            self.num_annotations[label] = self.num_init[label]
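For reference, saveLabeledInstances above writes two CSV files per iteration; with hypothetical values for the labeling method, experiment id, and iteration number, the resulting paths look like:

<dataset_directory>/labels/annotations_ILAB_exp12_it3.csv
<dataset_directory>/labels/labels_ILAB_exp12_it3.csv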
Example no. 12
 def __init__(self, experiment):
     self.instances = Instances()
     self.instances.initFromExperiment(experiment)
     self.output_directory = dir_tools.getExperimentOutputDirectory(
         experiment)
Example no. 13
 def generateTrainTestDatasets(self, instances, validation_exp):
     self.train_instances = instances
     self.test_instances = Instances()
     self.test_instances.initFromExperiment(validation_exp)
Example no. 14
class ClassifierDatasets(object):
    def __init__(self, experiment, classification_conf):
        self.experiment = experiment
        self.classification_conf = classification_conf
        self.validation_instances = None

    def getFeaturesNames(self):
        return self.train_instances.getFeaturesNames()

    def setValidationInstances(self, validation_instances):
        self.validation_instances = validation_instances
        if self.validation_instances is not None:
            self.validation_instances.eraseLabels()

    def setDatasets(self, train_instances, test_instances):
        self.train_instances = train_instances
        self.test_instances = test_instances
        self.test_instances.eraseLabels()
        self.setSampleWeights()

    def generateDatasets(self):
        instances = Instances()
        instances.initFromExperiment(self.experiment)
        test_conf = self.classification_conf.test_conf
        if test_conf.method == 'random_split':
            self.splitTrainDataset(instances, test_conf.test_size)
        elif test_conf.method == 'test_dataset':
            self.generateTrainTestDatasets(instances, test_conf.test_exp)
        elif test_conf.method == 'unlabeled':
            self.unlabeledLabeledDatasets(instances,
                                          test_conf.labels_annotations)
        self.test_instances.eraseLabels()
        self.setSampleWeights()

    def setSampleWeights(self):
        if self.classification_conf.sample_weight:
            self.computeSampleWeights()
        else:
            self.sample_weight = None

    def splitTrainDataset(self, instances, test_size):
        labeled_instances = instances.getLabeledInstances()
        train, test = generateTrainTestIds(labeled_instances.getIds(),
                                           test_size)
        self.train_instances = labeled_instances.getInstancesFromIds(train)
        self.test_instances = labeled_instances.getInstancesFromIds(test)

    def generateTrainTestDatasets(self, instances, validation_exp):
        self.train_instances = instances
        self.test_instances = Instances()
        self.test_instances.initFromExperiment(validation_exp)

    def unlabeledLabeledDatasets(self, instances, labels_annotations):
        if labels_annotations == 'labels':
            self.train_instances = instances.getLabeledInstances()
        elif labels_annotations == 'annotations':
            self.train_instances = instances.getAnnotatedInstances()
        self.test_instances = instances.getUnlabeledInstances()
        self.test_instances.labels = self.test_instances.true_labels
        self.test_instances.families = self.test_instances.true_families

    def semiSupervisedSplit(self, instances, test_size):
        labeled_instances = instances.getLabeledInstances()
        labeled_train, labeled_test = generateTrainTestIds(
            labeled_instances.getIds(), test_size)
        unlabeled_instances = instances.getUnlabeledInstances()
        unlabeled_train, unlabeled_test = generateTrainTestIds(
            unlabeled_instances.getIds(), test_size)
        self.train_instances = instances.getInstancesFromIds(labeled_train +
                                                             unlabeled_train)
        self.test_instances = instances.getInstancesFromIds(labeled_test +
                                                            unlabeled_test)

    def computeSampleWeights(self):
        self.sample_weight = [1] * self.train_instances.numInstances()
        families = self.train_instances.getFamilies()
        families_prop = self.train_instances.getFamiliesProp()
        for i in range(self.train_instances.numInstances()):
            self.sample_weight[i] = 1 / families_prop[families[i]]
            if self.sample_weight[i] > 100:
                self.sample_weight[i] = 100
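computeSampleWeights gives each training instance a weight inversely proportional to its family's share of the training set, capped at 100 so that very rare families do not dominate. A minimal standalone sketch of the same computation on hypothetical data:

from collections import Counter

# Hypothetical per-instance families.
families = ['trojan', 'trojan', 'worm']

counts = Counter(families)
num_instances = float(len(families))
families_prop = {f: c / num_instances for f, c in counts.items()}

# Inverse-proportion weights, capped at 100.
sample_weight = [min(1.0 / families_prop[f], 100) for f in families]
print(sample_weight)  # [1.5, 1.5, 3.0]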