Example #1
 def extract(self, predictions_monitoring):
     predictions = predictions_monitoring.predictions
     detection_threshold = self.alerts_conf.detection_threshold
     self.alerts = extract_rows_with_thresholds(predictions,
                                                detection_threshold,
                                                None,
                                                'predicted_proba')
     sort_data_frame(self.alerts, 'predicted_proba', False, True)
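
Every one of these examples relies on a sort_data_frame(df, column, ascending, inplace) helper that is not shown on this page. A minimal sketch of a compatible helper, assuming it is just a thin wrapper around pandas.DataFrame.sort_values (the real implementation may differ, e.g. to support older pandas releases):

import pandas as pd


def sort_data_frame(df, column, ascending, inplace):
    # Sort df by a single column, mirroring the positional
    # (df, column, ascending, inplace) call pattern used in the examples.
    # Like sort_values, it returns None when inplace=True and the sorted
    # copy otherwise (Examples #10 and #11 use the returned value).
    return df.sort_values(by=column, ascending=ascending, inplace=inplace)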
Example #2
 def _compute_features_scoring_ranking(self):
     self.features_scores = {}
     for i, feature_id in enumerate(self.instances.features.info.ids):
         # Store values / pvalues
         self.features_scores[feature_id] = FeatureScoring(
             feature_id, self.scores, self.scoring_func)
     # Store ranks
     for func, _ in self.scoring_func:
         sort_data_frame(self.scores, func, False, True)
         for rank, feature_id in enumerate(self.scores.index.values):
             self.features_scores[feature_id].set_rank(func, rank)
Example #3
 def final_computations(self):
     features = self.fold_coef.columns
     mean = self.fold_coef.mean(axis=0)
     abs_mean = list(map(abs, mean))
     std = self.fold_coef.std(axis=0)
     zscore = abs(mean / [0.00001 if x == 0 else x for x in std])
     self.coef_summary = pd.DataFrame({'mean': mean,
                                       'std': std,
                                       'Zscore': zscore,
                                       'abs_mean': abs_mean},
                                      index=features)
     sort_data_frame(self.coef_summary, 'abs_mean', False, True)
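
As a standalone illustration of the summary computed above, here is a hedged sketch with a made-up fold_coef table (two hypothetical features, three folds); the z-score guards against a zero standard deviation exactly as in the example:

import pandas as pd

# Hypothetical per-fold coefficients: one row per fold, one column per feature.
fold_coef = pd.DataFrame({'f1': [0.5, 0.7, 0.6], 'f2': [0.0, 0.0, 0.0]})
mean = fold_coef.mean(axis=0)    # f1: 0.6,  f2: 0.0
std = fold_coef.std(axis=0)      # f1: 0.1,  f2: 0.0
zscore = abs(mean / [0.00001 if x == 0 else x for x in std])
coef_summary = pd.DataFrame({'mean': mean, 'std': std, 'Zscore': zscore,
                             'abs_mean': mean.abs()}, index=fold_coef.columns)
coef_summary.sort_values(by='abs_mean', ascending=False, inplace=True)
print(coef_summary)  # f1 comes first: largest absolute mean coefficient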
Example #4
def ndcg(ground_truth, scores, pos_label=1):
    df = pd.DataFrame({
        'scores': scores,
        'ground_truth': ground_truth,
        'index': [0] * len(scores)
    })
    sort_data_frame(df, 'scores', False, True)
    df.loc[:, 'index'] = range(len(scores))
    selection = df.loc[:, 'ground_truth'] == pos_label
    df = df.loc[selection, :]
    score = sum([pow(2, -row['index']) for _, row in df.iterrows()])
    ideal_score = (sum([pow(2, -i) for i in range(len(scores))]))
    return score / ideal_score
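
A hedged usage sketch for this ndcg variant, with made-up inputs and assuming the function above plus a sort_data_frame helper (sorting in descending order when its third argument is False) are in scope. Note that it normalises by the discounted gain of all instances, not only the positives:

scores = [0.9, 0.1, 0.5]
ground_truth = [1, 0, 1]
# Ranking by score: 0.9 (positive, rank 0), 0.5 (positive, rank 1), 0.1 (negative, rank 2)
# gain  = 2**-0 + 2**-1          = 1.5
# ideal = 2**-0 + 2**-1 + 2**-2  = 1.75
print(ndcg(ground_truth, scores))  # 1.5 / 1.75, i.e. about 0.857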
Example #5
 def generate_queries(self, already_queried=None):
     unsure_df = pd.DataFrame({'proba': self.predictions.probas},
                              index=self.predictions.ids.ids)
     # drop already queried instances
     if already_queried is not None:
         unsure_df.drop(labels=already_queried, inplace=True)
     unsure_df['proba'] = abs(unsure_df['proba'] - 0.5)
     sort_data_frame(unsure_df, 'proba', True, True)
     if (self.num_annotations is not None
             and len(unsure_df) > self.num_annotations):
         unsure_df = unsure_df.head(n=self.num_annotations)
     for instance_id, row in unsure_df.iterrows():
         query = self.generate_query(instance_id, row['proba'], None, None)
         self.add_query(query)
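
This is uncertainty sampling: after subtracting 0.5, the smallest values belong to the instances closest to the decision boundary. A minimal standalone sketch of the selection step with invented probabilities, assuming sort_data_frame sorts in ascending order when its third argument is True:

import pandas as pd

unsure_df = pd.DataFrame({'proba': [0.95, 0.52, 0.10, 0.47]},
                         index=['a', 'b', 'c', 'd'])
unsure_df['proba'] = abs(unsure_df['proba'] - 0.5)  # distance to the 0.5 boundary
unsure_df.sort_values(by='proba', ascending=True, inplace=True)
print(unsure_df.head(n=2).index.tolist())  # ['b', 'd'] - the two most uncertain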
Example #6
def getFamiliesBarplot(annotations_id, iteration, label):
    iteration = None if iteration == 'None' else int(iteration)
    family_counts = annotations_db_tools.get_families_counts(
        session, annotations_id, iter_max=iteration, label=label)
    df = pd.DataFrame({
        'families': list(family_counts.keys()),
        'counts': [family_counts[k] for k in list(family_counts.keys())]
    })
    sort_data_frame(df, 'families', ascending=True, inplace=True)
    barplot = BarPlot(df['families'].values)
    dataset = PlotDataset(df['counts'].values, 'Num. Instances')
    dataset.set_color(get_label_color(label))
    barplot.add_dataset(dataset)
    return jsonify(barplot.to_json())
Example #7
 def generate_queries(self, cluster_strategy, already_queried=None):
     queries_types = cluster_strategy.split('_')
     num_queries_types = len(queries_types)
     if already_queried is None:
         already_queried = []
     for q, queries_type in enumerate(queries_types):
         drop_instances = already_queried[:]
         drop_instances.extend(self.annotated_instances)
         if q == (num_queries_types - 1):
             num_queries = self.num_annotations - len(
                 self.annotation_queries)
         else:
             num_queries = self.num_annotations // num_queries_types
         if num_queries == 0:
             continue
         queries_df = self._get_selected_instances(drop_instances)
         if queries_type == 'center':
             confidence = 'high'
             sort_data_frame(queries_df, 'likelihood', False, True)
             queries_df = queries_df.head(num_queries)
         elif queries_type == 'anomalous':
             confidence = 'low'
             sort_data_frame(queries_df, 'likelihood', True, True)
             queries_df = queries_df.head(num_queries)
         elif queries_type == 'uncertain':
             confidence = 'low'
             sort_data_frame(queries_df, 'entropy', False, True)
             queries_df = queries_df.head(num_queries)
         elif queries_type == 'random':
             confidence = 'low'
             queries_df = queries_df.sample(n=num_queries, axis=0)
         else:
              raise ValueError('unknown queries type: %s' % queries_type)
         self._add_queries(confidence, queries_df)
Example #8
 def generate_queries(self, already_queried=None):
     predicted_scores = self.predictions.scores
     if len(predicted_scores) == 0:
         return
     boundary_scores = abs(predicted_scores) / max(abs(predicted_scores))
     neighbours_scores = self._compute_neighbours_scores()
     global_scores = self.delta * boundary_scores
     global_scores += (1 - self.delta) * neighbours_scores
     queries_df = pd.DataFrame(data={
         'scores': predicted_scores,
         'boundary_scores': boundary_scores,
         'neighbours_scores': neighbours_scores,
         'global_scores': global_scores
     },
                               index=self.predictions.ids.ids)
     if already_queried is not None:
         queries_df.drop(labels=already_queried, inplace=True)
     sort_data_frame(queries_df, 'global_scores', True, True)
     queries_df = queries_df.head(n=self.num_annotations)
     for index, row in queries_df.iterrows():
         query = self.generate_query(index, row['scores'], None, None)
         self.add_query(query)
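
The global score in this example is a convex combination, global = delta * boundary + (1 - delta) * neighbours, so delta controls how much weight goes to proximity to the decision boundary versus the neighbourhood-based score. A small numeric sketch with invented values:

import numpy as np

delta = 0.7
boundary_scores = np.array([0.2, 1.0, 0.5])
neighbours_scores = np.array([0.9, 0.1, 0.4])
global_scores = delta * boundary_scores + (1 - delta) * neighbours_scores
print(global_scores)  # approximately [0.41, 0.73, 0.47]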
Example #9
def getSortedFeatures(exp_id, criterion):
    exp = update_curr_exp(exp_id)
    scoring_filename = path.join(exp.output_dir(), 'scores.csv')
    scores = pd.read_csv(scoring_filename, header=0, index_col=0)
    pvalues = None
    if criterion == 'alphabet':
        features = scores.index.values.tolist()
        features.sort()
        values = None
        user_ids = get_feature_user_ids(session, features)
        return jsonify({
            'features': features,
            'values': None,
            'pvalues': None,
            'user_ids': user_ids
        })
    if criterion == 'null_variance':
        selection = scores.loc[:, 'variance'] == 0
        scores = scores.loc[selection, :]
        criterion = 'variance'
    else:
        sort_data_frame(scores, criterion, False, True)
    features = scores.index.values.tolist()
    values = scores[criterion].tolist()
    values = ['%.2f' % v for v in values]
    pvalues_col = '_'.join([criterion, 'pvalues'])
    if pvalues_col in scores.columns:
        pvalues = scores[pvalues_col].tolist()
        pvalues = ['%.2E' % Decimal(v) for v in pvalues]
    user_ids = get_feature_user_ids(session, features)
    return jsonify({
        'features': features,
        'values': values,
        'pvalues': pvalues,
        'user_ids': user_ids
    })
Example #10
 def _gen_families_scores_tables(self, classifier=None):
     if classifier is None:
         families_scores = {}
         families_scores['lr'] = self._gen_families_scores_tables('lr')
         families_scores['nb'] = self._gen_families_scores_tables('nb')
         return families_scores
     families_scores = []
     for i, family in enumerate(list(self.lr_class_labels)):
         selection = self.scores[classifier + '_prediction']
         if selection.shape[0] > 0:
             family_scores = self.scores.loc[self.scores[
                 classifier + '_prediction'] == family]
             family_scores = sort_data_frame(family_scores,
                                             '%s_score' % classifier, True,
                                             False)
         else:
             col_values = self.scores.columns.values
             family_scores = pd.DataFrame(columns=col_values)
         families_scores.append(family_scores)
     return families_scores
Example #11
    def _gen_queries_from_scores(self):
        assert (np.array_equal(self.lr_class_labels, self.nb_class_labels))
        lr_predicted_proba_df = self.gen_lr_predicted_proba_df()
        num_families = len(self.lr_class_labels)
        self.annotation_queries = []

        # There are fewer annotation queries than the number of families
        if self.num_annotations <= num_families:
            if self.iteration.iter_num % 2 == 0:
                classifier = 'lr'
            else:
                classifier = 'nb'
            sort_data_frame(self.scores, '%s_score' % classifier, True, True)
            selected_instances = (
                self.scores.index.tolist()[:self.num_annotations])
            for instance_id in selected_instances:
                query = self.generate_query(instance_id, 0, None, None)
                self.add_query(query)
            return

        # Otherwise
        num_uncertain = [0] * num_families
        num_anomalous = [0] * num_families
        families_scores = self._gen_families_scores_tables()
        num_annotations = 0
        stop = False
        selected_instances = []
        while not stop:
            for i, family in enumerate(list(self.lr_class_labels)):
                if num_uncertain[i] <= num_anomalous[i]:
                    classifier = 'lr'
                    num_uncertain[i] += 1
                else:
                    classifier = 'nb'
                    num_anomalous[i] += 1
                scores = families_scores[classifier][i]
                selected_rows = scores.loc[scores['queried'] == False]
                if len(selected_rows) > 0:
                    query = selected_rows.index.tolist()[0]
                else:
                    # No anomalous or uncertain instances available for
                    # annotation
                    # Select the most likely instance according to the
                    # logistic regression output
                    self.conf.logger.debug(
                        family + ': no anomalous, no uncertain instances')
                    selected_rows = lr_predicted_proba_df.loc[
                        lr_predicted_proba_df['queried'] == False]
                    selected_rows = sort_data_frame(selected_rows, family,
                                                    False, False)
                    selection = selected_rows.index.tolist()
                    # Break condition
                    # There is no instance left in the unlabelled pool
                    if len(selection) == 0:
                        stop = True
                        break
                    else:
                        query = selection[0]
                # Add annotation query and set queried = True
                num_annotations += 1
                selected_instances.append(query)
                for c in ['nb', 'lr']:
                    predicted_class = self.scores.loc[query, c + '_prediction']
                    predicted_class_index = np.where(
                        self.lr_class_labels == predicted_class)[0][0]
                    families_scores[c][predicted_class_index].at[
                        query, 'queried'] = True
                self.scores.at[query, 'queried'] = True
                lr_predicted_proba_df.at[query, 'queried'] = True
                # Break condition
                # self.num_annotations instances have been queried
                if num_annotations >= self.num_annotations:
                    stop = True
                    break
        for instance_id in selected_instances:
            query = self.generate_query(instance_id, 0, None, None)
            self.add_query(query)
Example #12
 def final_computations(self):
     sort_data_frame(self.predictions, 'predicted_proba', True, True)
Example #13
 def sort_instances(self):
     df = pd.DataFrame({'distance': self.distances},
                       index=list(map(str, self.instances_ids)))
     sort_data_frame(df, 'distance', True, True)
     self.instances_ids = list(map(int, df.index.values.tolist()))
     self.distances = df.distance.tolist()