def extract(self, predictions_monitoring):
    """Extract the alerts from the monitored predictions.

    Selects the rows of the predictions whose predicted_proba passes
    the configured detection threshold (lower bound; no upper bound)
    and stores them in self.alerts sorted by decreasing
    predicted_proba.
    """
    predictions_df = predictions_monitoring.predictions
    threshold = self.alerts_conf.detection_threshold
    self.alerts = extract_rows_with_thresholds(predictions_df, threshold,
                                               None, 'predicted_proba')
    # ascending=False: most confident alerts first.
    sort_data_frame(self.alerts, 'predicted_proba', False, True)
def _compute_features_scoring_ranking(self):
    """Compute the score and the rank of each feature.

    First builds one FeatureScoring object per feature (values /
    pvalues), then, for each scoring function, sorts the scores by
    decreasing value and records the resulting rank in the
    corresponding FeatureScoring objects.
    """
    # Store the values / pvalues.
    self.features_scores = {
        feature_id: FeatureScoring(feature_id, self.scores,
                                   self.scoring_func)
        for feature_id in self.instances.features.info.ids}
    # Store the ranks.
    for func, _ in self.scoring_func:
        sort_data_frame(self.scores, func, False, True)
        for rank, feature_id in enumerate(self.scores.index.values):
            self.features_scores[feature_id].set_rank(func, rank)
def final_computations(self):
    """Summarize the coefficients computed on each fold.

    Builds self.coef_summary with, for each feature (column of
    self.fold_coef): the mean and standard deviation of the
    coefficient across folds, the absolute mean, and a Z-score
    (|mean / std|). The summary is sorted by decreasing absolute
    mean.
    """
    features = self.fold_coef.columns
    mean = self.fold_coef.mean(axis=0)
    std = self.fold_coef.std(axis=0)
    # Guard against a zero standard deviation to avoid dividing by zero.
    safe_std = std.replace(0, 0.00001)
    zscore = (mean / safe_std).abs()
    self.coef_summary = pd.DataFrame({'mean': mean,
                                      'std': std,
                                      'Zscore': zscore,
                                      'abs_mean': mean.abs()},
                                     index=features)
    sort_data_frame(self.coef_summary, 'abs_mean', False, True)
def ndcg(ground_truth, scores, pos_label=1):
    """Return a normalized discounted cumulative gain of the ranking.

    Instances are ranked by decreasing score (rank 0 = highest score).
    Each instance whose ground truth equals pos_label contributes
    2 ** -rank; the total is divided by the ideal score.

    NOTE(review): the ideal score sums 2 ** -i over ALL instances
    (matching the original implementation), so a perfect ranking of
    k < n positives scores below 1 — confirm this is intended.

    Args:
        ground_truth: labels, aligned with scores.
        scores: ranking scores, higher means ranked first.
        pos_label: the label counted as positive (default 1).
    """
    num_instances = len(scores)
    # Indices sorted by decreasing score; position = rank.
    order = sorted(range(num_instances),
                   key=lambda i: scores[i], reverse=True)
    ranks = {instance: rank for rank, instance in enumerate(order)}
    score = sum(2.0 ** -ranks[i]
                for i in range(num_instances)
                if ground_truth[i] == pos_label)
    ideal_score = sum(2.0 ** -i for i in range(num_instances))
    return score / ideal_score
def generate_queries(self, already_queried=None):
    """Query the most uncertain predictions for annotation.

    Ranks the unlabelled instances by |predicted proba - 0.5|
    (smallest first, i.e. closest to the decision boundary), keeps at
    most self.num_annotations of them, and adds one annotation query
    per selected instance.

    Args:
        already_queried: optional list of instance ids to exclude.
    """
    candidates = pd.DataFrame({'proba': self.predictions.probas},
                              index=self.predictions.ids.ids)
    # Drop the instances that have already been queried.
    if already_queried is not None:
        candidates.drop(labels=already_queried, inplace=True)
    # Distance to the decision boundary: smaller means more uncertain.
    candidates['proba'] = abs(candidates['proba'] - 0.5)
    sort_data_frame(candidates, 'proba', True, True)
    if (self.num_annotations is not None
            and len(candidates) > self.num_annotations):
        candidates = candidates.head(n=self.num_annotations)
    for instance_id, row in candidates.iterrows():
        self.add_query(self.generate_query(instance_id, row['proba'],
                                           None, None))
def getFamiliesBarplot(annotations_id, iteration, label):
    """Return (as JSON) a barplot of the instance count per family.

    Families are displayed in alphabetical order; the dataset color
    corresponds to the given label.
    """
    iteration = None if iteration == 'None' else int(iteration)
    family_counts = annotations_db_tools.get_families_counts(
            session, annotations_id, iter_max=iteration, label=label)
    # Sort the family names once instead of sorting the DataFrame.
    families = sorted(family_counts.keys())
    df = pd.DataFrame({
        'families': families,
        'counts': [family_counts[f] for f in families]
    })
    barplot = BarPlot(df['families'].values)
    dataset = PlotDataset(df['counts'].values, 'Num. Instances')
    dataset.set_color(get_label_color(label))
    barplot.add_dataset(dataset)
    return jsonify(barplot.to_json())
def generate_queries(self, cluster_strategy, already_queried=None):
    """Generate the annotation queries for each query type.

    Args:
        cluster_strategy: underscore-separated query types among
            'center', 'anomalous', 'uncertain' and 'random'
            (e.g. 'center_anomalous').
        already_queried: optional list of instance ids to exclude.

    The annotation budget (self.num_annotations) is split evenly
    between the query types; the last type consumes whatever budget
    remains after the previous ones.

    Raises:
        ValueError: if a query type is unknown.
    """
    queries_types = cluster_strategy.split('_')
    num_queries_types = len(queries_types)
    if already_queried is None:
        already_queried = []
    for q, queries_type in enumerate(queries_types):
        # Exclude instances already queried or already annotated.
        drop_instances = already_queried[:]
        drop_instances.extend(self.annotated_instances)
        if q == (num_queries_types - 1):
            # The last query type gets the remaining budget.
            num_queries = self.num_annotations - len(
                    self.annotation_queries)
        else:
            num_queries = self.num_annotations // num_queries_types
        if num_queries == 0:
            continue
        queries_df = self._get_selected_instances(drop_instances)
        if queries_type == 'center':
            # Highest likelihood instances.
            confidence = 'high'
            sort_data_frame(queries_df, 'likelihood', False, True)
            queries_df = queries_df.head(num_queries)
        elif queries_type == 'anomalous':
            # Lowest likelihood instances.
            confidence = 'low'
            sort_data_frame(queries_df, 'likelihood', True, True)
            queries_df = queries_df.head(num_queries)
        elif queries_type == 'uncertain':
            # Highest entropy instances.
            confidence = 'low'
            sort_data_frame(queries_df, 'entropy', False, True)
            queries_df = queries_df.head(num_queries)
        elif queries_type == 'random':
            confidence = 'low'
            queries_df = queries_df.sample(n=num_queries, axis=0)
        else:
            # Was a bare ValueError(): give the caller a diagnostic.
            raise ValueError('Unknown queries type: %s.' % queries_type)
        self._add_queries(confidence, queries_df)
def generate_queries(self, already_queried=None):
    """Query instances combining boundary and neighbours scores.

    The global score is a convex combination (weight self.delta) of
    the normalized distance to the decision boundary and the
    neighbours score. The self.num_annotations instances with the
    lowest global score are queried for annotation.

    Args:
        already_queried: optional list of instance ids to exclude.
    """
    predicted_scores = self.predictions.scores
    if len(predicted_scores) == 0:
        return
    abs_scores = abs(predicted_scores)
    # Normalize the distance to the boundary into [0, 1].
    boundary_scores = abs_scores / max(abs_scores)
    neighbours_scores = self._compute_neighbours_scores()
    global_scores = (self.delta * boundary_scores
                     + (1 - self.delta) * neighbours_scores)
    candidates = pd.DataFrame(data={'scores': predicted_scores,
                                    'boundary_scores': boundary_scores,
                                    'neighbours_scores': neighbours_scores,
                                    'global_scores': global_scores},
                              index=self.predictions.ids.ids)
    if already_queried is not None:
        candidates.drop(labels=already_queried, inplace=True)
    sort_data_frame(candidates, 'global_scores', True, True)
    candidates = candidates.head(n=self.num_annotations)
    for instance_id, row in candidates.iterrows():
        self.add_query(self.generate_query(instance_id, row['scores'],
                                           None, None))
def getSortedFeatures(exp_id, criterion):
    """Return (as JSON) the features sorted by the given criterion.

    criterion is either 'alphabet' (no values returned),
    'null_variance' (features whose variance is 0, values from the
    'variance' column), or a column of scores.csv to sort by
    decreasing value. P-values are included when a
    '<criterion>_pvalues' column exists.
    """
    exp = update_curr_exp(exp_id)
    scores_path = path.join(exp.output_dir(), 'scores.csv')
    scores = pd.read_csv(scores_path, header=0, index_col=0)
    if criterion == 'alphabet':
        features = sorted(scores.index.values.tolist())
        return jsonify({
            'features': features,
            'values': None,
            'pvalues': None,
            'user_ids': get_feature_user_ids(session, features)
        })
    if criterion == 'null_variance':
        # Keep only the features with a null variance.
        scores = scores.loc[scores.loc[:, 'variance'] == 0, :]
        criterion = 'variance'
    else:
        sort_data_frame(scores, criterion, False, True)
    features = scores.index.values.tolist()
    values = ['%.2f' % v for v in scores[criterion].tolist()]
    pvalues = None
    pvalues_col = '_'.join([criterion, 'pvalues'])
    if pvalues_col in scores.columns:
        pvalues = ['%.2E' % Decimal(v)
                   for v in scores[pvalues_col].tolist()]
    user_ids = get_feature_user_ids(session, features)
    return jsonify({
        'features': features,
        'values': values,
        'pvalues': pvalues,
        'user_ids': user_ids
    })
def _gen_families_scores_tables(self, classifier=None):
    """Return the per-family score tables.

    With classifier=None, returns a dict with one list per classifier
    ('lr' and 'nb'). Otherwise returns a list containing, for each
    family of self.lr_class_labels, the scores of the instances
    predicted in that family, sorted by increasing
    <classifier>_score (empty DataFrame when there are no scores).
    """
    if classifier is None:
        return {clf: self._gen_families_scores_tables(clf)
                for clf in ['lr', 'nb']}
    prediction_col = classifier + '_prediction'
    score_col = '%s_score' % classifier
    tables = []
    for family in list(self.lr_class_labels):
        if self.scores[prediction_col].shape[0] > 0:
            family_table = self.scores.loc[
                    self.scores[prediction_col] == family]
            family_table = sort_data_frame(family_table, score_col,
                                           True, False)
        else:
            family_table = pd.DataFrame(
                    columns=self.scores.columns.values)
        tables.append(family_table)
    return tables
def _gen_queries_from_scores(self):
    """Generate the annotation queries from the classifiers scores.

    When the budget (self.num_annotations) does not cover all the
    families, the instances with the lowest score of one classifier
    ('lr' on even iterations, 'nb' on odd ones) are queried. Otherwise
    the families are visited round-robin, alternating between the
    'lr' (uncertain) and 'nb' (anomalous) tables; when a family has no
    instance left, the instance most likely to belong to it according
    to the logistic regression output is queried instead. The loop
    stops when the budget is exhausted or the unlabelled pool is
    empty.

    NOTE: pandas' deprecated DataFrame.set_value (removed in
    pandas 1.0) has been replaced with the equivalent .at setter.
    """
    assert np.array_equal(self.lr_class_labels, self.nb_class_labels)
    lr_predicted_proba_df = self.gen_lr_predicted_proba_df()
    num_families = len(self.lr_class_labels)
    self.annotation_queries = []
    # There are fewer annotation queries than the number of families.
    if self.num_annotations <= num_families:
        if self.iteration.iter_num % 2 == 0:
            classifier = 'lr'
        else:
            classifier = 'nb'
        sort_data_frame(self.scores, '%s_score' % classifier, True, True)
        selected_instances = self.scores.index.tolist()[
                :self.num_annotations]
        for instance_id in selected_instances:
            query = self.generate_query(instance_id, 0, None, None)
            self.add_query(query)
        return
    # Otherwise.
    num_uncertain = [0] * num_families
    num_anomalous = [0] * num_families
    families_scores = self._gen_families_scores_tables()
    num_annotations = 0
    stop = False
    selected_instances = []
    while not stop:
        for i, family in enumerate(list(self.lr_class_labels)):
            # Alternate between uncertain ('lr') and anomalous ('nb')
            # queries for each family.
            if num_uncertain[i] <= num_anomalous[i]:
                classifier = 'lr'
                num_uncertain[i] += 1
            else:
                classifier = 'nb'
                num_anomalous[i] += 1
            scores = families_scores[classifier][i]
            selected_rows = scores.loc[scores['queried'] == false()]
            if len(selected_rows) > 0:
                query = selected_rows.index.tolist()[0]
            else:
                # No anomalous or uncertain instances available for
                # annotation.
                # Select the most likely instance according to the
                # logistic regression output.
                self.conf.logger.debug(
                        family + ': no anomalous, no uncertain instances')
                selected_rows = lr_predicted_proba_df.loc[
                        lr_predicted_proba_df['queried'] == false()]
                selected_rows = sort_data_frame(selected_rows, family,
                                                False, False)
                selection = selected_rows.index.tolist()
                # Break condition:
                # there is no instance left in the unlabelled pool.
                if len(selection) == 0:
                    stop = True
                    break
                else:
                    query = selection[0]
            # Add the annotation query and set queried = True.
            num_annotations += 1
            selected_instances.append(query)
            for c in ['nb', 'lr']:
                predicted_class = self.scores.loc[query,
                                                  c + '_prediction']
                predicted_class_index = np.where(
                        self.lr_class_labels == predicted_class)[0][0]
                # .at replaces the deprecated set_value.
                families_scores[c][predicted_class_index].at[
                        query, 'queried'] = True
            self.scores.at[query, 'queried'] = True
            lr_predicted_proba_df.at[query, 'queried'] = True
            # Break condition:
            # self.num_annotations instances have been queried.
            if num_annotations >= self.num_annotations:
                stop = True
                break
    for instance_id in selected_instances:
        query = self.generate_query(instance_id, 0, None, None)
        self.add_query(query)
def final_computations(self):
    """Sort the predictions in place by increasing predicted_proba."""
    sort_data_frame(self.predictions, 'predicted_proba',
                    ascending=True, inplace=True)
def sort_instances(self):
    """Sort instances_ids and distances by increasing distance.

    Reorders self.instances_ids and self.distances in lockstep so
    that the distances are ascending. The previous implementation
    built a DataFrame indexed by str(ids) and converted the index
    back with int(), which only worked for integer ids; this version
    keeps the ids untouched and so works for any id type.
    """
    order = sorted(range(len(self.distances)),
                   key=lambda i: self.distances[i])
    self.instances_ids = [self.instances_ids[i] for i in order]
    self.distances = [self.distances[i] for i in order]