def getTopModelFeatures(exp_id, size, train_test, fold_id):
    """Return a JSON barplot of the first `size` model coefficients.

    The coefficients are read from ``model_coefficients.csv``. The file is
    looked up in the experiment output directory, inside the ``fold_id``
    sub-directory (skipped when ``fold_id`` is ``'None'`` or ``'all'``)
    and then inside the ``train_test`` sub-directory.

    Args:
        exp_id: identifier handed to ``updateCurrentExperiment``.
        size: number of CSV rows to read (converted with ``int``).
        train_test: name of the sub-directory holding the CSV file.
        fold_id: name of the fold sub-directory, or ``'None'`` / ``'all'``.

    Returns:
        The ``jsonify`` result of the barplot. The barplot is built from
        the ``user_id`` of each feature and the tooltips hold the
        ``name`` of each feature.
    """
    experiment = updateCurrentExperiment(exp_id)

    # Assemble <output_dir>[/<fold_id>]/<train_test>/model_coefficients.csv
    path_parts = [experiment.output_dir()]
    if fold_id not in ('None', 'all'):
        path_parts.append(fold_id)
    path_parts.extend([train_test, 'model_coefficients.csv'])
    csv_path = path.join(*path_parts)

    # Only the first `size` rows are loaded; the index column holds the
    # feature ids.
    with open(csv_path, 'r') as csv_file:
        coef_table = pd.read_csv(csv_file, header=0, index_col=0,
                                 nrows=int(size))
    coefficients = list(coef_table['mean'])

    # One database lookup per feature id.
    names = []
    user_ids = []
    for feature_id in coef_table.index:
        feature = (session.query(FeaturesAlchemy)
                   .filter(FeaturesAlchemy.id == int(feature_id))
                   .one())
        names.append(feature.name)
        user_ids.append(feature.user_id)

    barplot = BarPlot(user_ids)
    dataset = PlotDataset(coefficients, None)
    classifier_conf = experiment.exp_conf.core_conf.classifier_conf
    # The red color is applied only for the 'weight' feature importance.
    if classifier_conf.featureImportance() == 'weight':
        dataset.set_color(colors_tools.red)
    barplot.add_dataset(dataset)
    return jsonify(barplot.to_json(tooltip_data=names))
def _gen_binary_histogram(self):
    """Build ``self.barplot`` with the number of 0 and 1 values per label.

    One dataset is added for every entry of ``self.plot_datasets`` that
    holds at least one value. It carries the same label and the same
    color as the dataset it is computed from.
    """
    self.barplot = BarPlot(['0', '1'])
    for label, source in self.plot_datasets.items():
        if len(source.values) == 0:
            # Nothing to count for this label.
            continue
        # Counts are ordered like the x labels: zeros first, then ones.
        counts = [sum(source.values == bit) for bit in (0, 1)]
        histogram = PlotDataset(counts, label)
        histogram.set_color(source.color)
        self.barplot.add_dataset(histogram)
def _gen_histogram(self):
    """Build ``self.barplot`` with a 10-bin histogram per label.

    The bin edges are computed once from ``self.all_values`` and reused
    for every entry of ``self.plot_datasets`` that holds at least one
    value, so all the datasets share the same x axis.
    """
    # 10 equal-width bins computed on all the data
    bin_edges = np.histogram(self.all_values, bins=10, density=False)[1]
    x_labels = ['%.2f - %.2f' % (low, high)
                for low, high in zip(bin_edges[:-1], bin_edges[1:])]
    self.barplot = BarPlot(x_labels)
    for label, source in self.plot_datasets.items():
        if len(source.values) == 0:
            continue
        counts = np.histogram(source.values, bins=bin_edges,
                              density=False)[0]
        histogram = PlotDataset(counts, label)
        histogram.set_color(source.color)
        self.barplot.add_dataset(histogram)
def getFamiliesBarplot(annotations_id, iteration, label):
    """Return a JSON barplot of the number of instances per family.

    Args:
        annotations_id: identifier forwarded to
            ``annotations_db_tools.getFamiliesCounts``.
        iteration: ``'None'`` for no limit, otherwise a value converted
            with ``int`` and forwarded as ``iter_max``.
        label: label forwarded to ``getFamiliesCounts`` and used to pick
            the color of the dataset.

    Returns:
        The ``jsonify`` result of the barplot, with the families sorted
        in ascending order.
    """
    iter_max = int(iteration) if iteration != 'None' else None
    family_counts = annotations_db_tools.getFamiliesCounts(
        session, annotations_id, iter_max=iter_max, label=label)

    families = list(family_counts.keys())
    counts_df = pd.DataFrame({
        'families': families,
        'counts': [family_counts[family] for family in families],
    })
    matrix_tools.sort_data_frame(counts_df, 'families', ascending=True,
                                 inplace=True)

    barplot = BarPlot(list(counts_df['families']))
    dataset = PlotDataset(list(counts_df['counts']), 'Num. Instances')
    dataset.set_color(colors_tools.get_label_color(label))
    barplot.add_dataset(dataset)
    return jsonify(barplot.to_json())
def getClusterStats(experiment_id):
    """Return a JSON barplot of the number of instances in each cluster.

    Args:
        experiment_id: identifier handed to ``updateCurrentExperiment``.

    Returns:
        The ``jsonify`` result of a barplot with one bar per cluster,
        labelled with the cluster label.
    """
    experiment = updateCurrentExperiment(experiment_id)
    clustering = ClusteringExp.from_json(experiment.output_dir())
    clusters = [clustering.clusters[c]
                for c in range(clustering.num_clusters)]

    # Every cluster is reported, including the empty ones.
    cluster_sizes = [len(cluster.instances_ids) for cluster in clusters]
    cluster_labels = [cluster.label for cluster in clusters]

    barplot = BarPlot(cluster_labels)
    barplot.add_dataset(PlotDataset(cluster_sizes, 'Num. Instances'))
    return jsonify(barplot.to_json())
def getTopWeightedFeatures(exp_id, inst_exp_id, instance_id, size, fold_id):
    """Return a JSON barplot of the most heavily weighted features.

    The feature values of the instance are scaled with the ``'scaler'``
    step of the stored pipeline and multiplied by the ``coef_`` of its
    ``'model'`` step. The `size` features with the largest absolute
    weighted value are kept, largest first.

    Args:
        exp_id: experiment whose output directory holds the model.
        inst_exp_id: experiment from which the instance features are read.
        instance_id: instance identifier (converted with ``int``).
        size: number of features to keep (converted with ``int``).
        fold_id: fold sub-directory name, ``'None'`` for no sub-directory,
            or ``'all'``.

    Returns:
        ``None`` when ``fold_id`` is ``'all'``; otherwise the ``jsonify``
        result of the barplot, with ``'<name> (<value>)'`` tooltips.
    """
    if fold_id == 'all':
        return None

    instance_id = int(instance_id)
    exp = updateCurrentExperiment(exp_id)
    inst_exp = updateCurrentExperiment(inst_exp_id)

    # Features of the instance.
    names, raw_values = FeaturesFromExp(inst_exp).get_instance(instance_id)
    values = [float(raw) for raw in raw_values]

    # Stored pipeline; its 'scaler' and 'model' steps are used below.
    model_dir = exp.output_dir()
    if fold_id != 'None':
        model_dir = path.join(model_dir, fold_id)
    pipeline = joblib.load(path.join(model_dir, 'model', 'model.out'))
    steps = pipeline.named_steps

    # Scale the features, then weight them with the model coefficients.
    scaled = steps['scaler'].transform(np.reshape(values, (1, -1)))
    weighted = np.multiply(scaled, steps['model'].coef_)

    # Ascending sort on the absolute weighted value, then a reversed slice
    # picks the `size` last entries, largest first.
    ranked = sorted(zip(names, values, weighted[0]),
                    key=lambda feature: abs(feature[2]))
    top = ranked[:-int(size) - 1:-1]

    labels = [str(name) for name, _, _ in top]
    tooltips = ['%s (%.2f)' % (name, value) for name, value, _ in top]
    top_weighted_values = [w_value for _, _, w_value in top]

    barplot = BarPlot(labels)
    dataset = PlotDataset(top_weighted_values, None)
    dataset.set_color(colors_tools.red)
    barplot.add_dataset(dataset)
    return jsonify(barplot.to_json(tooltip_data=tooltips))
def display(self, directory):
    """Export the predictions barplot to ``predictions_barplot.json``.

    With ground truth, ``displayLabel`` is called for the malicious and
    the benign labels. Without it, a single dataset holding the length
    of each element of ``self.ranges`` is added.

    Args:
        directory: directory in which the JSON file is written.
    """
    # '0-10%', '10-20%', ..., '90-100%'
    labels = ['%d-%d%%' % (low, low + 10) for low in range(0, 100, 10)]
    barplot = BarPlot(labels)
    if self.has_ground_truth:
        self.displayLabel(barplot, labels_tools.MALICIOUS)
        self.displayLabel(barplot, labels_tools.BENIGN)
    else:
        counts = [len(instances) for instances in self.ranges]
        dataset = PlotDataset(counts, 'numInstances')
        dataset.set_color(colors_tools.get_label_color('all'))
        barplot.add_dataset(dataset)
    output_path = path.join(directory, 'predictions_barplot.json')
    with open(output_path, 'w') as output_file:
        barplot.export_json(output_file)
class FeaturePlots(object):
    """Plots describing the values of one feature, split by label.

    Binary features get a 0/1 histogram. Numeric features get a boxplot,
    a 10-bin histogram and a density plot.
    """

    def __init__(self, instances, feature_index):
        """Read the feature metadata and build the per-label datasets.

        Args:
            instances: object exposing ``features`` and the
                ``getAnnotatedInstances`` / ``getUnlabeledInstances``
                accessors.
            feature_index: index of the feature in ``instances.features``.
        """
        self.feature_index = feature_index
        features = instances.features
        index = self.feature_index
        self.feature_type = features.types[index]
        self.feature_name = features.names[index]
        self.feature_id = features.ids[index]
        self.all_values = features.getValuesFromIndex(index)
        self._gen_plot_datasets(instances)

    def compute(self):
        """Generate the plots matching the type of the feature."""
        if self.feature_type == FeatureType.binary:
            self._gen_binary_histogram()
            return
        if self.feature_type == FeatureType.numeric:
            self._gen_bloxplot()
            self._gen_histogram()
            self._gen_density()

    def export(self, output_dir):
        """Write the plots into ``<output_dir>/<feature_id>``.

        The directory is created with ``dir_tools.createDirectory``.
        """
        feature_dir = path.join(output_dir, str(self.feature_id))
        dir_tools.createDirectory(feature_dir)
        if self.feature_type == FeatureType.binary:
            json_path = path.join(feature_dir, 'binary_histogram.json')
            with open(json_path, 'w') as json_file:
                self.barplot.export_json(json_file)
            return
        if self.feature_type == FeatureType.numeric:
            self.boxplot.display(path.join(feature_dir, 'boxplot.png'))
            json_path = path.join(feature_dir, 'histogram.json')
            with open(json_path, 'w') as json_file:
                self.barplot.export_json(json_file)
            self.density.display(path.join(feature_dir, 'density.png'))

    def _gen_plot_datasets(self, instances):
        """Create one dataset for malicious, benign and unlabeled data."""
        self.plot_datasets = {}
        all_labels = (labels_tools.MALICIOUS, labels_tools.BENIGN,
                      'unlabeled')
        for label in all_labels:
            self._gen_label_plot_dataset(instances, label)

    def _gen_label_plot_dataset(self, instances, label):
        """Store in ``self.plot_datasets`` the feature values of `label`."""
        if label == 'unlabeled':
            subset = instances.getUnlabeledInstances()
        else:
            subset = instances.getAnnotatedInstances(label=label)
        values = subset.features.getValuesFromIndex(self.feature_index)
        label_dataset = PlotDataset(values, label)
        label_dataset.set_color(colors_tools.get_label_color(label))
        self.plot_datasets[label] = label_dataset

    def _gen_bloxplot(self):
        """Build ``self.boxplot`` from the non-empty datasets."""
        self.boxplot = BoxPlot(title='Feature %s' % self.feature_name)
        for source in self.plot_datasets.values():
            if len(source.values) == 0:
                continue
            self.boxplot.add_dataset(source)

    def _gen_histogram(self):
        """Build ``self.barplot`` with a 10-bin histogram per label."""
        # 10 equal-width bins computed on all the data
        bin_edges = np.histogram(self.all_values, bins=10,
                                 density=False)[1]
        x_labels = ['%.2f - %.2f' % (low, high)
                    for low, high in zip(bin_edges[:-1], bin_edges[1:])]
        self.barplot = BarPlot(x_labels)
        for label, source in self.plot_datasets.items():
            if len(source.values) == 0:
                continue
            counts = np.histogram(source.values, bins=bin_edges,
                                  density=False)[0]
            histogram = PlotDataset(counts, label)
            histogram.set_color(source.color)
            self.barplot.add_dataset(histogram)

    def _gen_binary_histogram(self):
        """Build ``self.barplot`` with the 0 and 1 counts per label."""
        self.barplot = BarPlot(['0', '1'])
        for label, source in self.plot_datasets.items():
            if len(source.values) == 0:
                continue
            # Counts are ordered like the x labels: zeros, then ones.
            counts = [sum(source.values == bit) for bit in (0, 1)]
            histogram = PlotDataset(counts, label)
            histogram.set_color(source.color)
            self.barplot.add_dataset(histogram)

    def _gen_density(self):
        """Build ``self.density`` from the non-empty datasets."""
        self.density = Density(title='Feature %s' % self.feature_name)
        for source in self.plot_datasets.values():
            if len(source.values) == 0:
                continue
            self.density.add_dataset(source)