コード例 #1
0
ファイル: categories.py プロジェクト: will3b/SecuML
 def train_naive_bayes(self):
     naive_bayes_conf = self.get_naive_bayes_conf()
     current_families = copy.deepcopy(
         self.instances.annotations.get_families())
     # families are altered
     self.instances.annotations.set_families(self.assigned_categories)
     naive_bayes = GaussianNaiveBayes(naive_bayes_conf)
     naive_bayes.training(self.instances)
     # families are restored
     self.instances.annotations.set_families(current_families)
     return naive_bayes
コード例 #2
0
ファイル: aladin.py プロジェクト: tuantmb/SecuML
class AladinQueries(Queries):
    def __init__(self, iteration, conf):
        Queries.__init__(self, iteration)
        self.num_annotations = conf.num_annotations
        self.conf = conf
        datasets = self.iteration.datasets
        self.train_instances = datasets.instances.get_annotated_instances()
        self.test_instances = datasets.get_unlabeled_instances()
        self.check_input_data()

    def run_models(self):
        self._run_logistic_regression()
        self._run_naive_bayes()

    def generate_queries(self, already_queried=None):
        self.compute_scores()
        self._gen_queries_from_scores()

    def check_input_data(self):
        families = set(self.train_instances.annotations.get_families())
        if len(families) < 2:
            raise AladinAtLeastTwoFamilies()

    def _run_logistic_regression(self):
        return None

    def _run_naive_bayes(self):
        naive_bayes_conf = self._create_naive_bayes_conf()
        # Update training data
        # the naive Bayes classifier is trained on all the data
        self.test_instances.annotations.set_families(
            list(self.lr_predicted_labels))
        train_instances = copy.deepcopy(self.train_instances)
        train_instances.union(self.test_instances)
        self.eval_clustering_perf(train_instances)
        # Train the naive Bayes detection model and predict
        self.naive_bayes = GaussianNaiveBayes(naive_bayes_conf)
        _, self.nb_time = self.naive_bayes.training(train_instances)
        num_test_instances = self.test_instances.num_instances()
        self.test_instances.annotations.set_families(
            [None for i in range(num_test_instances)])
        start_time = time.time()
        if num_test_instances == 0:
            self.nb_predicted_labels = []
        else:
            self.nb_predicted_labels = self.naive_bayes.pipeline.predict(
                self.test_instances.features.get_values())
        self.nb_time += time.time() - start_time
        self.nb_class_labels = self.naive_bayes.class_labels

    def eval_clustering_perf(self, instances):
        self.clustering_perf = PerformanceIndicators()
        self.clustering_perf.gen_eval(instances.ground_truth.get_families(),
                                      instances.annotations.get_families())

    def compute_scores(self):
        self._create_scores_df()
        self._compute_uncertainty_scores()
        self._compute_anomalous_scores()

    def _create_scores_df(self):
        num_test_instances = self.test_instances.num_instances()
        header = [
            'lr_prediction', 'lr_score', 'nb_prediction', 'nb_score', 'queried'
        ]
        self.scores = pd.DataFrame(np.zeros((num_test_instances, len(header))),
                                   index=self.test_instances.ids.get_ids(),
                                   columns=header)
        self.scores['queried'] = [False] * num_test_instances

    # Uncertain instances have a low difference between the probability of
    # belonging to the most likely family and the second most likely family.
    # In Aladin, the authors consider this measure of uncertainty instead of
    # the entropy used by Pelleg and Moore (Active Learning for Anomaly and
    # Rare-Category Detection).
    def _compute_uncertainty_scores(self):
        self.scores['lr_prediction'] = self.lr_predicted_labels
        lr_scores = []
        for i, predicted_label in enumerate(self.scores['lr_prediction']):
            predicted_label_index = np.where(
                self.lr_class_labels == predicted_label)[0]
            predicted_proba = self.lr_predicted_proba[i, predicted_label_index]
            proba = predicted_proba - self.lr_predicted_proba[i, :]
            proba[predicted_label_index] = 2
            score = np.min(proba)
            lr_scores.append(score)
        self.scores['lr_score'] = lr_scores

    # Anomalous instances have a low probability of belonging to the assigned
    # family
    def _compute_anomalous_scores(self):
        self.scores['nb_prediction'] = self.nb_predicted_labels
        for c in set(self.nb_predicted_labels):
            selection = self.scores['nb_prediction'] == c
            c_ids = self.scores.loc[selection].index.values
            c_instances = self.test_instances.get_from_ids(c_ids)
            c_features = c_instances.features.values
            c_likelihood = self.naive_bayes.log_likelihood(c_features, c)
            self.scores.loc[selection, 'nb_score'] = c_likelihood

    def _gen_queries_from_scores(self):
        assert (np.array_equal(self.lr_class_labels, self.nb_class_labels))
        lr_predicted_proba_df = self.gen_lr_predicted_proba_df()
        num_families = len(self.lr_class_labels)
        self.annotation_queries = []

        # There are fewer annotation queries than the number of families
        if self.num_annotations <= num_families:
            if self.iteration.iter_num % 2 == 0:
                classifier = 'lr'
            else:
                classifier = 'nb'
            sort_data_frame(self.scores, '%s_score' % classifier, True, True)
            selected_instances = self.scores.index.tolist()[:self.
                                                            num_annotations]
            for instance_id in selected_instances:
                query = self.generate_query(instance_id, 0, None, None)
                self.add_query(query)
            return

        # Otherwise
        num_uncertain = [0] * num_families
        num_anomalous = [0] * num_families
        families_scores = self._gen_families_scores_tables()
        num_annotations = 0
        stop = False
        selected_instances = []
        while not stop:
            for i, family in enumerate(list(self.lr_class_labels)):
                if num_uncertain[i] <= num_anomalous[i]:
                    classifier = 'lr'
                    num_uncertain[i] += 1
                else:
                    classifier = 'nb'
                    num_anomalous[i] += 1
                scores = families_scores[classifier][i]
                selected_rows = scores.loc[scores['queried'] == false()]
                if len(selected_rows) > 0:
                    query = selected_rows.index.tolist()[0]
                else:
                    # No anomalous or uncertain instances available for
                    # annotation
                    # Select the most likely instance according to the
                    # logistic regression output
                    self.conf.logger.debug(
                        family + ': no anomalous, no uncertain instances')
                    selected_rows = lr_predicted_proba_df.loc[
                        lr_predicted_proba_df['queried'] == false()]
                    selected_rows = sort_data_frame(selected_rows, family,
                                                    False, False)
                    selection = selected_rows.index.tolist()
                    # Break condition
                    # There is no instance left in the unlabelled pool
                    if len(selection) == 0:
                        stop = True
                        break
                    else:
                        query = selection[0]
                # Add annotation query and set queried = True
                num_annotations += 1
                selected_instances.append(query)
                for c in ['nb', 'lr']:
                    predicted_class = self.scores.loc[query, c + '_prediction']
                    predicted_class_index = np.where(
                        self.lr_class_labels == predicted_class)[0][0]
                    families_scores[c][predicted_class_index].set_value(
                        query, 'queried', True)
                self.scores.set_value(query, 'queried', True)
                lr_predicted_proba_df.set_value(query, 'queried', True)
                # Break condition
                # self.num_annotations instances have been queried
                if num_annotations >= self.num_annotations:
                    stop = True
                    break
        for instance_id in selected_instances:
            query = self.generate_query(instance_id, 0, None, None)
            self.add_query(query)

    def gen_lr_predicted_proba_df(self):
        num_test_instances = self.test_instances.num_instances()
        lr_predicted_proba_df = pd.DataFrame(
            np.zeros((num_test_instances, len(self.lr_class_labels) + 1)),
            index=self.test_instances.ids.get_ids(),
            columns=list(self.lr_class_labels) + ['queried'])
        if num_test_instances > 0:
            lr_predicted_proba_df.iloc[:, :-1] = self.lr_predicted_proba
            lr_predicted_proba_df['queried'] = [False] * num_test_instances
        return lr_predicted_proba_df

    def _gen_families_scores_tables(self, classifier=None):
        if classifier is None:
            families_scores = {}
            families_scores['lr'] = self._gen_families_scores_tables('lr')
            families_scores['nb'] = self._gen_families_scores_tables('nb')
            return families_scores
        families_scores = []
        for i, family in enumerate(list(self.lr_class_labels)):
            selection = self.scores[classifier + '_prediction']
            if selection.shape[0] > 0:
                family_scores = self.scores.loc[self.scores[
                    classifier + '_prediction'] == family]
                family_scores = sort_data_frame(family_scores,
                                                '%s_score' % classifier, True,
                                                False)
            else:
                col_values = self.scores.columns.values
                family_scores = pd.DataFrame(columns=col_values)
            families_scores.append(family_scores)
        return families_scores