class MulticlassPerfIndicators(object):
    def __init__(self, num_folds):
        self.num_folds = num_folds
        self.accuracy = [0] * num_folds
        self.f1_micro = [0] * num_folds
        self.f1_macro = [0] * num_folds
        self.clustering_perf = PerformanceIndicators()
        self.true_labels = []
        self.predicted_labels = []

    def addFold(self, fold_id, true_labels, predicted_labels):
        self.f1_micro[fold_id] = f1_score(true_labels,
                                          predicted_labels,
                                          average='micro')
        self.f1_macro[fold_id] = f1_score(true_labels,
                                          predicted_labels,
                                          average='macro')
        self.accuracy[fold_id] = accuracy_score(true_labels, predicted_labels)
        self.true_labels += true_labels
        self.predicted_labels += list(predicted_labels)

    def getAccuracy(self):
        return self.accuracy_mean

    def finalComputations(self):
        self.accuracy_mean = np.mean(self.accuracy)
        self.accuracy_std = np.std(self.accuracy)
        self.f1_micro_mean = np.mean(self.f1_micro)
        self.f1_micro_std = np.std(self.f1_micro)
        self.f1_macro_mean = np.mean(self.f1_macro)
        self.f1_macro_std = np.std(self.f1_macro)
        self.clustering_perf.generateEvaluation(self.true_labels,
                                                self.predicted_labels)

    def toJson(self, f):
        perf = {}
        perf['accuracy'] = {
            'mean': floats_tools.toPercentage(self.accuracy_mean),
            'std': floats_tools.trunc(self.accuracy_std)
        }
        perf['f1_micro'] = {
            'mean': floats_tools.toPercentage(self.f1_micro_mean),
            'std': floats_tools.trunc(self.f1_micro_std)
        }
        perf['f1_macro'] = {
            'mean': floats_tools.toPercentage(self.f1_macro_mean),
            'std': floats_tools.trunc(self.f1_macro_std)
        }
        perf['clustering_perf'] = self.clustering_perf.toJson()
        json.dump(perf, f, indent=2)

    def getCsvHeader(self):
        return ['accuracy', 'f1-micro', 'f1-macro']

    def getCsvLine(self):
        v = []
        v.append(self.accuracy_mean)
        v.append(self.f1_micro_mean)
        v.append(self.f1_macro_mean)
        return v
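
A minimal usage sketch for the class above, assuming scikit-learn is available and that the project-internal pieces it relies on (PerformanceIndicators, floats_tools) are importable; the dataset, model and fold loop are illustrative only:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

X, y = load_iris(return_X_y=True)
num_folds = 5
perf = MulticlassPerfIndicators(num_folds)

skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=0)
for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    model = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
    # addFold stores the per-fold accuracy / F1 scores and accumulates the
    # labels used later for the pooled evaluation.
    perf.addFold(fold_id, list(y[test_idx]), model.predict(X[test_idx]))

perf.finalComputations()   # means, standard deviations, pooled evaluation
print(perf.getAccuracy())  # mean accuracy over the folds
# perf.toJson(open('perf.json', 'w')) would additionally need floats_tools
# from the project.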
Example 3
    def globalClusteringEvaluation(self):
        clusters = []
        true_families = []
        if self.malicious.categories is not None:
            clusters += list(self.malicious.categories.assigned_categories)
            true_families += self.malicious.categories.instances.getFamilies(
                true_labels=True)
        if self.benign.categories is not None:
            max_clusters = 0
            if len(clusters) > 0:
                max_clusters = max(clusters)
            clusters += [
                x + max_clusters + 1
                for x in list(self.benign.categories.assigned_categories)
            ]
            true_families += self.benign.categories.instances.getFamilies(
                true_labels=True)
        if len(clusters) > 0:
            self.global_clustering_perf = PerformanceIndicators()
            self.global_clustering_perf.generateEvaluation(
                clusters, true_families)
        else:
            self.global_clustering_perf = None
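
In the snippet above, the benign cluster ids are shifted by max_clusters + 1 so that they do not collide with the malicious cluster ids when the two clusterings are pooled into a single evaluation. A tiny standalone illustration of that merge, with made-up cluster assignments:

malicious_clusters = [0, 1, 1, 2]   # cluster ids from the malicious analysis
benign_clusters = [0, 0, 1]         # cluster ids from the benign analysis

offset = max(malicious_clusters) + 1
merged = malicious_clusters + [c + offset for c in benign_clusters]
print(merged)  # [0, 1, 1, 2, 3, 3, 4]: benign ids no longer collide with malicious ids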
Example 4
class Ilab(QueryStrategy):
    def __init__(self, iteration):
        QueryStrategy.__init__(self, iteration)
        conf = iteration.experiment.conf
        eps = conf.eps
        self.uncertain = UncertainAnnotationQueries(self.iteration,
                                                    conf.num_uncertain, 0, 1)
        self.malicious = RareCategoryDetectionAnnotationQueries(
            self.iteration, 'malicious', 1 - eps, 1)
        self.benign = RareCategoryDetectionAnnotationQueries(
            self.iteration, 'benign', 0, eps)

    def generateAnnotationQueries(self):
        self.generate_queries_time = 0
        self.uncertain.run()
        self.generate_queries_time += self.uncertain.generate_queries_time
        self.exportAnnotationsTypes(malicious=False, benign=False)
        uncertain_queries = self.uncertain.getInstanceIds()
        self.malicious.run(already_queried=uncertain_queries)
        self.generate_queries_time += self.malicious.generate_queries_time
        self.exportAnnotationsTypes(malicious=True, benign=False)
        self.benign.run(already_queried=uncertain_queries)
        self.generate_queries_time += self.benign.generate_queries_time
        self.exportAnnotationsTypes()
        self.globalClusteringEvaluation()

    def annotateAuto(self):
        self.uncertain.annotateAuto()
        self.malicious.annotateAuto()
        self.benign.annotateAuto()

    def getManualAnnotations(self):
        self.uncertain.getManualAnnotations()
        self.malicious.getManualAnnotations()
        self.benign.getManualAnnotations()

    def getClusteringsEvaluations(self):
        clusterings = {}
        clusterings['all'] = self.global_clustering_perf
        clusterings['malicious'] = None
        clusterings['benign'] = None
        return clusterings

    def globalClusteringEvaluation(self):
        clusters = []
        true_families = []
        if self.malicious.categories is not None:
            clusters += list(self.malicious.categories.assigned_categories)
            true_families += self.malicious.categories.instances.getFamilies(
                true_labels=True)
        if self.benign.categories is not None:
            max_clusters = 0
            if len(clusters) > 0:
                max_clusters = max(clusters)
            clusters += [
                x + max_clusters + 1
                for x in list(self.benign.categories.assigned_categories)
            ]
            true_families += self.benign.categories.instances.getFamilies(
                true_labels=True)
        if len(clusters) > 0:
            self.global_clustering_perf = PerformanceIndicators()
            self.global_clustering_perf.generateEvaluation(
                clusters, true_families)
        else:
            self.global_clustering_perf = None

    ###############################
    ## Execution time monitoring ##
    ###############################

    def executionTimeHeader(self):
        header = ['binary_model', 'malicious_clustering', 'benign_clustering']
        header += QueryStrategy.executionTimeHeader(self)
        return header

    def executionTimeMonitoring(self):
        line = [self.iteration.train_test_validation.times['binary']]
        line += [self.malicious.analysis_time, self.benign.analysis_time]
        line += QueryStrategy.executionTimeMonitoring(self)
        return line

    def executionTimeDisplay(self):
        binary_model = PlotDataset(None, 'Binary model')
        malicious = PlotDataset(None, 'Malicious Analysis')
        malicious.setLinestyle('dotted')
        malicious.setColor(colors_tools.getLabelColor('malicious'))
        benign = PlotDataset(None, 'Benign Analysis')
        benign.setLinestyle('dashed')
        benign.setColor(colors_tools.getLabelColor('benign'))
        return ([binary_model, malicious, benign] +
                QueryStrategy.executionTimeDisplay(self))

    def exportAnnotationsTypes(self, malicious=True, benign=True):
        types = {'uncertain': 'individual', 'malicious': None, 'benign': None}
        if malicious:
            types['malicious'] = self.malicious.annotations_type
        if benign:
            types['benign'] = self.benign.annotations_type
        filename = self.iteration.output_directory
        filename += 'annotations_types.json'
        with open(filename, 'w') as f:
            json.dump(types, f, indent=2)
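
The Ilab constructor above wires three query generators: uncertainty sampling over the whole probability range [0, 1], and rare-category detection restricted to the high-confidence malicious range [1 - eps, 1] and the high-confidence benign range [0, eps] (where the instances are then clustered into families). The internals of UncertainAnnotationQueries and RareCategoryDetectionAnnotationQueries are not shown in these excerpts; a rough, hypothetical sketch of how such ranges partition instances by the binary detector's predicted probability of maliciousness might be:

import numpy as np

def partition_by_proba(proba, eps=0.05, num_uncertain=3):
    """proba: 1-D array of P(malicious) for the unlabelled instances."""
    proba = np.asarray(proba)
    high_conf_malicious = np.where(proba >= 1 - eps)[0]   # range [1 - eps, 1]
    high_conf_benign = np.where(proba <= eps)[0]          # range [0, eps]
    # Uncertainty sampling: instances closest to the decision boundary (0.5).
    uncertain = np.argsort(np.abs(proba - 0.5))[:num_uncertain]
    return uncertain, high_conf_malicious, high_conf_benign

proba = np.array([0.01, 0.48, 0.52, 0.97, 0.60, 0.03])
print(partition_by_proba(proba))
# (array([1, 2, 4]), array([3]), array([0, 5]))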
Example 5
class Ilab(QueryStrategy):
    def __init__(self, iteration):
        QueryStrategy.__init__(self, iteration)
        eps = self.iteration.conf.eps
        self.uncertain = UncertainAnnotationQueries(
            self.iteration, self.iteration.conf.num_uncertain, 0, 1)
        self.malicious = RareCategoryDetectionAnnotationQueries(
            self.iteration, 'malicious', 1 - eps, 1)
        self.benign = RareCategoryDetectionAnnotationQueries(
            self.iteration, 'benign', 0, eps)

    def generateAnnotationQueries(self):
        self.generate_queries_time = 0
        self.uncertain.run()
        self.generate_queries_time += self.uncertain.generate_queries_time
        uncertain_queries = self.uncertain.getInstanceIds()
        self.malicious.run(already_queried=uncertain_queries)
        self.generate_queries_time += self.malicious.generate_queries_time
        self.benign.run(already_queried=uncertain_queries)
        self.generate_queries_time += self.benign.generate_queries_time
        self.globalClusteringEvaluation()

    def annotateAuto(self):
        self.uncertain.annotateAuto()
        self.malicious.annotateAuto()
        self.benign.annotateAuto()

    def getManualAnnotations(self):
        self.uncertain.getManualAnnotations()
        self.malicious.getManualAnnotations()
        self.benign.getManualAnnotations()

    def getClusteringsEvaluations(self):
        clusterings = {}
        clusterings['all'] = self.global_clustering_perf
        clusterings['malicious'] = None
        clusterings['benign'] = None
        return clusterings

    def globalClusteringEvaluation(self):
        clusters = []
        true_families = []
        if self.malicious.categories is not None:
            clusters += list(self.malicious.categories.assigned_categories)
            true_families += self.malicious.categories.instances.getFamilies(
                true_labels=True)
        if self.benign.categories is not None:
            max_clusters = 0
            if len(clusters) > 0:
                max_clusters = max(clusters)
            clusters += [
                x + max_clusters + 1
                for x in list(self.benign.categories.assigned_categories)
            ]
            true_families += self.benign.categories.instances.getFamilies(
                true_labels=True)
        if len(clusters) > 0:
            self.global_clustering_perf = PerformanceIndicators()
            self.global_clustering_perf.generateEvaluation(
                clusters, true_families)
        else:
            self.global_clustering_perf = None

    def checkAnnotationQueriesAnswered(self):
        answered = self.uncertain.checkAnnotationQueriesAnswered()
        if answered:
            answered = self.malicious.checkAnnotationQueriesAnswered()
            if answered:
                return self.benign.checkAnnotationQueriesAnswered()
            else:
                return False
        else:
            return False

    ###############################
    ## Execution time monitoring ##
    ###############################

    def executionTimeHeader(self):
        header = ['malicious_queries', 'uncertain_queries', 'benign_queries']
        return header

    def executionTimeMonitoring(self):
        line = [
            self.malicious.analysis_time + self.malicious.generate_queries_time
        ]
        line += [
            self.iteration.update_model.times['binary'] +
            self.uncertain.generate_queries_time
        ]
        line += [self.benign.analysis_time + self.benign.generate_queries_time]
        return line

    def executionTimeDisplay(self):
        uncertain = PlotDataset(None, 'Uncertain Queries')
        malicious = PlotDataset(None, 'Malicious Queries')
        malicious.setLinestyle('dotted')
        malicious.setColor(colors_tools.getLabelColor('malicious'))
        benign = PlotDataset(None, 'Benign Queries')
        benign.setLinestyle('dashed')
        benign.setColor(colors_tools.getLabelColor('benign'))
        return [malicious, uncertain, benign]
Example 6
class AladinAnnotationQueries(AnnotationQueries):
    def __init__(self, iteration, conf):
        AnnotationQueries.__init__(self, iteration, 'aladin')
        self.num_annotations = conf.num_annotations
        self.datasets = self.iteration.train_test_validation.models[
            'binary'].datasets

    def runModels(self):
        self.getLogisticRegressionResults()
        self.runNaiveBayes()

    def generateAnnotationQueries(self):
        self.computeScores()
        self.generateQueriesFromScores()

    #####################
    ## Private methods ##
    #####################

    def getLogisticRegressionResults(self):
        multiclass = self.iteration.train_test_validation.models['multiclass']
        predictions = multiclass.testing_monitoring.predictions_monitoring
        self.lr_predicted_proba = predictions.predicted_proba_all
        self.lr_predicted_labels = predictions.predictions['predicted_labels']
        self.lr_class_labels = multiclass.class_labels
        self.lr_time = multiclass.training_execution_time
        self.lr_time += multiclass.testing_execution_time

    def runNaiveBayes(self):
        # Create an experiment for the naive Bayes model
        exp = self.iteration.experiment
        name = '-'.join([
            'AL' + str(exp.experiment_id),
            'Iter' + str(self.iteration.iteration_number), 'all', 'NaiveBayes'
        ])
        naive_bayes_exp = ClassificationExperiment(
            exp.project,
            exp.dataset,
            exp.db,
            exp.cursor,
            experiment_name=name,
            experiment_label=exp.experiment_label,
            parent=exp.experiment_id)
        naive_bayes_exp.setFeaturesFilenames(exp.features_filenames)
        test_conf = TestConfiguration()
        test_conf.setUnlabeled(labels_annotations='annotations')
        naive_bayes_conf = GaussianNaiveBayesConfiguration(
            exp.conf.models_conf['multiclass'].num_folds, False, True,
            test_conf)
        naive_bayes_exp.setClassifierConf(naive_bayes_conf)
        naive_bayes_exp.createExperiment()
        naive_bayes_exp.export()
        # Update training data - the naive Bayes classifier is trained on all the data
        self.datasets.test_instances.families = list(self.lr_predicted_labels)
        all_datasets = ClassifierDatasets(naive_bayes_exp,
                                          naive_bayes_exp.classification_conf)
        train_instances = Instances()
        train_instances.union(self.datasets.train_instances,
                              self.datasets.test_instances)
        all_datasets.train_instances = train_instances
        all_datasets.test_instances = None
        all_datasets.setSampleWeights()
        self.evalClusteringPerf(all_datasets.train_instances)
        # Train the naive Bayes detection model and predict
        self.naive_bayes = GaussianNaiveBayes(naive_bayes_exp, all_datasets)
        self.naive_bayes.training()
        self.nb_time = self.naive_bayes.training_execution_time
        self.datasets.test_instances.families = [
            None
        ] * self.datasets.test_instances.numInstances()
        self.nb_predicted_log_proba = self.naive_bayes.pipeline.predict_log_proba(
            self.datasets.test_instances.getFeatures())
        start_time = time.time()
        self.nb_predicted_labels = self.naive_bayes.pipeline.predict(
            self.datasets.test_instances.getFeatures())
        self.nb_time += time.time() - start_time
        self.nb_class_labels = self.naive_bayes.class_labels

    def evalClusteringPerf(self, instances):
        self.clustering_perf = PerformanceIndicators()
        self.clustering_perf.generateEvaluation(instances.true_families,
                                                instances.families)

    def computeScores(self):
        self.createScoresDataFrame()
        self.computeUncertaintyScores()
        self.computeAnomalousScores()

    def createScoresDataFrame(self):
        test_instances = self.datasets.test_instances
        num_test_instances = self.datasets.test_instances.numInstances()
        self.scores = pd.DataFrame(np.zeros((num_test_instances, 5)),
                                   index=test_instances.getIds(),
                                   columns=[
                                       'lr_prediction', 'lr_score',
                                       'nb_prediction', 'nb_score', 'queried'
                                   ])
        self.scores['queried'] = [False] * num_test_instances

    # Uncertain instances have a small margin between the probability of the most
    # likely family and the probability of the second most likely family.
    # Aladin relies on this margin-based uncertainty measure instead of the entropy
    # used by Pelleg and Moore (Active Learning for Anomaly and Rare-Category
    # Detection). A standalone sketch of the margin computation is given after
    # this class.
    def computeUncertaintyScores(self):
        self.scores['lr_prediction'] = self.lr_predicted_labels
        lr_scores = []
        for i, predicted_label in enumerate(self.scores['lr_prediction']):
            predicted_label_index = np.where(
                self.lr_class_labels == predicted_label)[0]
            predicted_proba = self.lr_predicted_proba[i, predicted_label_index]
            proba = predicted_proba - self.lr_predicted_proba[i, :]
            proba[predicted_label_index] = 2
            score = np.min(proba)
            lr_scores.append(score)
        self.scores['lr_score'] = lr_scores

    # Anomalous instances have a low probability of belonging to the assigned family
    def computeAnomalousScores(self):
        self.scores['nb_prediction'] = self.nb_predicted_labels
        features = self.datasets.test_instances.getFeatures()
        for c in set(self.nb_predicted_labels):
            indexes = [
                i for i, x in enumerate(self.nb_predicted_labels) if x == c
            ]
            c_features = features[indexes, :]
            c_likelihood = self.naive_bayes.logLikelihood(c_features, c)
            self.scores['nb_score'].iloc[indexes] = c_likelihood

    def generateQueriesFromScores(self):
        assert (np.array_equal(self.lr_class_labels, self.nb_class_labels))
        lr_predicted_proba_df = self.generateLrPredictedProbaDataFrame()
        num_families = len(self.lr_class_labels)
        self.annotation_queries = []

        # The annotation budget does not exceed the number of families:
        # select instances from a single classifier's score ranking
        # (lr on even iterations, nb on odd ones).
        if self.num_annotations <= num_families:
            if self.iteration.iteration_number % 2 == 0:
                classifier = 'lr'
            else:
                classifier = 'nb'
            matrix_tools.sortDataFrame(self.scores, classifier + '_score',
                                       True, True)
            selected_instances = self.scores.index.tolist()[:self.num_annotations]
            for instance_id in selected_instances:
                query = AnnotationQuery(instance_id, 0, None, None)
                self.annotation_queries.append(query)
            return

        # Otherwise: round-robin over the families, alternating between
        # uncertain (lr) and anomalous (nb) queries for each family until
        # the annotation budget is exhausted.
        num_uncertain = [0] * num_families
        num_anomalous = [0] * num_families
        families_scores = self.generateFamiliesScoresTables()
        num_annotations = 0
        stop = False
        selected_instances = []
        while not stop:
            for i, family in enumerate(list(self.lr_class_labels)):
                if num_uncertain[i] <= num_anomalous[i]:
                    classifier = 'lr'
                    num_uncertain[i] += 1
                else:
                    classifier = 'nb'
                    num_anomalous[i] += 1
                scores = families_scores[classifier][i]
                selected_rows = scores.loc[scores['queried'] == False]
                if len(selected_rows) > 0:
                    query = selected_rows.index.tolist()[0]
                else:
                    # No anomalous and no uncertain instances left for this family:
                    # fall back to the instance that the logistic regression
                    # considers most likely to belong to it.
                    print(family + ': no anomalous, no uncertain instances')
                    selected_rows = lr_predicted_proba_df.loc[
                        lr_predicted_proba_df['queried'] == False]
                    matrix_tools.sortDataFrame(selected_rows, family, False,
                                               True)
                    query = selected_rows.index.tolist()[0]
                # Add annotation query and set queried = True
                num_annotations += 1
                selected_instances.append(query)
                for c in ['nb', 'lr']:
                    predicted_class = self.scores.loc[query, c + '_prediction']
                    # families_scores[c] is a plain list, so a scalar index is needed
                    predicted_class_index = np.where(
                        self.lr_class_labels == predicted_class)[0][0]
                    families_scores[c][predicted_class_index].set_value(
                        query, 'queried', True)
                self.scores.set_value(query, 'queried', True)
                lr_predicted_proba_df.set_value(query, 'queried', True)
                # Break condition
                if num_annotations >= self.num_annotations:
                    stop = True
                    break
        for instance_id in selected_instances:
            query = AnnotationQuery(instance_id, 0, None, None)
            self.annotation_queries.append(query)

    def generateLrPredictedProbaDataFrame(self):
        num_test_instances = self.datasets.test_instances.numInstances()
        lr_predicted_proba_df = pd.DataFrame(
            np.zeros((num_test_instances, len(self.lr_class_labels) + 1)),
            index=self.datasets.test_instances.getIds(),
            columns=list(self.lr_class_labels) + ['queried'])
        lr_predicted_proba_df.iloc[:, :-1] = self.lr_predicted_proba
        lr_predicted_proba_df['queried'] = [False] * num_test_instances
        return lr_predicted_proba_df

    def generateFamiliesScoresTables(self, classifier=None):
        if classifier is None:
            families_scores = {}
            families_scores['lr'] = self.generateFamiliesScoresTables('lr')
            families_scores['nb'] = self.generateFamiliesScoresTables('nb')
            return families_scores
        families_scores = []
        for i, family in enumerate(list(self.lr_class_labels)):
            family_scores = self.scores.loc[self.scores[
                classifier + '_prediction'] == family]
            matrix_tools.sortDataFrame(family_scores, classifier + '_score',
                                       True, True)
            families_scores.append(family_scores)
        return families_scores
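
The two scores driving the Aladin queries can be illustrated on a toy example: the uncertainty score is the smallest margin between the predicted family's probability and every other family's probability (small margin = uncertain), while the anomaly score is the log-likelihood of an instance under its assigned family's model (low likelihood = anomalous). The standalone sketch below reproduces only the margin computation, with made-up family names and probabilities; the naive Bayes log-likelihood side depends on project classes (GaussianNaiveBayes.logLikelihood) not shown here.

import numpy as np

class_labels = np.array(['adware', 'ransomware', 'trojan'])
predicted_proba = np.array([[0.50, 0.45, 0.05],    # small margin -> very uncertain
                            [0.90, 0.05, 0.05]])   # large margin -> confident
predicted_labels = class_labels[np.argmax(predicted_proba, axis=1)]

margins = []
for i, label in enumerate(predicted_labels):
    label_index = np.where(class_labels == label)[0]
    diff = predicted_proba[i, label_index] - predicted_proba[i, :]
    diff[label_index] = 2            # ignore the predicted family itself
    margins.append(np.min(diff))     # distance to the runner-up family
print(margins)  # roughly [0.05, 0.85]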
Example 7
    def evalClusteringPerf(self, instances):
        self.clustering_perf = PerformanceIndicators()
        self.clustering_perf.generateEvaluation(instances.true_families,
                                                instances.families)
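
PerformanceIndicators.generateEvaluation itself is not shown in these excerpts. As an assumption about what a clustering-versus-true-families evaluation typically computes, a minimal stand-in built on scikit-learn's clustering metrics could look like the sketch below; the function name and the choice of metrics are illustrative, not the project's actual implementation.

from sklearn import metrics

def evaluate_clustering(true_families, assigned_clusters):
    # Hypothetical stand-in: standard external clustering metrics.
    return {
        'adjusted_rand_index': metrics.adjusted_rand_score(true_families,
                                                           assigned_clusters),
        'homogeneity': metrics.homogeneity_score(true_families, assigned_clusters),
        'completeness': metrics.completeness_score(true_families, assigned_clusters),
        'v_measure': metrics.v_measure_score(true_families, assigned_clusters),
    }

print(evaluate_clustering(['a', 'a', 'b', 'b'], [0, 0, 1, 1]))
# Perfect agreement: every metric equals 1.0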