def generateHistogram(self):
    """Export a 10-bin histogram of every plot dataset to ``histogram.json``.

    The bin edges are 10 equal-width bins spanning the values of all the
    datasets in ``self.plot_datasets`` taken together, so every dataset is
    counted against the same edges and no value falls outside the binned
    range. One bar series is added per dataset, reusing the dataset's
    label and color, and the bar plot is written as JSON into
    ``self.output_directory``.
    """
    # The shared edges must cover all the data: np.histogram ignores values
    # lying outside explicit bin edges, so edges derived from one dataset
    # only would silently drop the other datasets' out-of-range values.
    all_values = np.concatenate(
        [np.ravel(dataset.values) for dataset in self.plot_datasets.values()])
    _, bin_edges = np.histogram(all_values, bins=10, density=False)

    # One x-axis label per bin: '<lower edge> - <upper edge>'.
    x_labels = [
        str(low) + ' - ' + str(high)
        for low, high in zip(bin_edges[:-1], bin_edges[1:])
    ]
    barplot = BarPlot(x_labels)
    for dataset in self.plot_datasets.values():
        hist, _ = np.histogram(dataset.values, bins=bin_edges, density=False)
        hist_dataset = PlotDataset(hist, dataset.label)
        hist_dataset.setColor(dataset.color)
        barplot.addDataset(hist_dataset)

    output_filename = path.join(self.output_directory, 'histogram.json')
    with open(output_filename, 'w') as f:
        barplot.exportJson(f)
def generateLabelPlotDatasets(self, instances, label):
    """Register in ``self.plot_datasets`` the dataset of one label.

    The instances whose ground truth is annotated with *label* are
    selected, their values for ``self.feature`` are wrapped in a
    ``PlotDataset`` named after the label and colored with the label's
    color, and the result is stored under the key *label*.
    """
    annotated_ids = instances.ground_truth.getAnnotatedIds(label)
    labelled_instances = instances.getInstancesFromIds(annotated_ids)
    feature_values = labelled_instances.features.getFeatureValues(self.feature)

    label_dataset = PlotDataset(feature_values, label)
    label_dataset.setColor(colors_tools.getLabelColor(label))
    self.plot_datasets[label] = label_dataset
def displayLabel(self, barplot, label):
    """Add to *barplot* the per-range counts of instances labelled *label*.

    For each entry of ``self.ranges``, counts the items whose
    ``'ground_truth_label'`` equals the boolean form of *label*. The counts
    form one dataset, named after the label and drawn in the label's color.
    """
    expected = labels_tools.labelStringToBoolean(label)
    counts = [
        sum(1 for item in bucket if item['ground_truth_label'] == expected)
        for bucket in self.ranges
    ]
    label_dataset = PlotDataset(counts, label)
    label_dataset.setColor(colors_tools.getLabelColor(label))
    barplot.addDataset(label_dataset)
def generateBinaryHistogram(self):
    """Export the counts of 0 and 1 values to ``binary_histogram.json``.

    For every dataset in ``self.plot_datasets`` a two-bar series (number of
    values equal to 0, number of values equal to 1) is added to the bar
    plot with the dataset's label and color. The plot is then written as
    JSON into ``self.output_directory``.
    """
    barplot = BarPlot(['0', '1'])
    for dataset in self.plot_datasets.values():
        # Element-wise comparison followed by a sum counts the matches.
        counts = [sum(dataset.values == bit) for bit in (0, 1)]
        series = PlotDataset(counts, dataset.label)
        series.setColor(dataset.color)
        barplot.addDataset(series)

    target = path.join(self.output_directory, 'binary_histogram.json')
    with open(target, 'w') as out:
        barplot.exportJson(out)
def executionTimeDisplay(self):
    """Return the execution-time plot entries of this strategy.

    The list starts with two value-less ``PlotDataset`` entries,
    'Logistic Regression' (dotted line) and 'Naive Bayes' (dashed line),
    followed by the entries returned by
    ``QueryStrategy.executionTimeDisplay``.
    """
    logistic = PlotDataset(None, 'Logistic Regression')
    logistic.setLinestyle('dotted')
    bayes = PlotDataset(None, 'Naive Bayes')
    bayes.setLinestyle('dashed')

    entries = [logistic, bayes]
    entries += QueryStrategy.executionTimeDisplay(self)
    return entries
def display(self, directory):
    """Export the predictions bar plot to ``predictions_barplot.json``.

    The x axis carries ten labels, '0-10%' to '90-100%'. Without ground
    truth a single 'numInstances' series holds the size of each entry of
    ``self.ranges``. With ground truth, one series per label (malicious
    then benign) is added through ``displayLabel``. The plot is written
    as JSON into *directory*.
    """
    decile_labels = ['%d-%d%%' % (low, low + 10) for low in range(0, 100, 10)]
    barplot = BarPlot(decile_labels)

    if self.has_ground_truth:
        for label in (labels_tools.MALICIOUS, labels_tools.BENIGN):
            self.displayLabel(barplot, label)
    else:
        sizes = [len(bucket) for bucket in self.ranges]
        all_instances = PlotDataset(sizes, 'numInstances')
        all_instances.setColor(colors_tools.getLabelColor('all'))
        barplot.addDataset(all_instances)

    target = path.join(directory, 'predictions_barplot.json')
    with open(target, 'w') as out:
        barplot.exportJson(out)
def getFamiliesBarplot(experiment_id, iteration, label):
    """Return, as a JSON response, the bar plot of instances per family.

    The family counts are fetched for *experiment_id* and *label*, limited
    to *iteration* (the string 'None' means no limit). Families are sorted
    in ascending order and plotted as one 'Num. Instances' series drawn in
    the label's color.
    """
    iteration_max = None if iteration == 'None' else iteration
    family_counts = annotations_db_tools.getFamiliesCounts(
        session, experiment_id, iteration_max=iteration_max, label=label)

    families = list(family_counts.keys())
    counts_df = pd.DataFrame({
        'families': families,
        'counts': [family_counts[family] for family in families],
    })
    matrix_tools.sortDataFrame(counts_df, 'families', ascending=True,
                               inplace=True)

    barplot = BarPlot(list(counts_df['families']))
    series = PlotDataset(list(counts_df['counts']), 'Num. Instances')
    series.setColor(colors_tools.getLabelColor(label))
    barplot.addDataset(series)
    return jsonify(barplot.toJson())
def plotEvolutionMonitoring(self, evolution_file, monitoring_dir):
    """Plot the suggestions accuracy across iterations and save it as PNG.

    The monitoring data is loaded from *evolution_file*; the accuracy is
    ``true_suggestions / num_suggestions`` for each iteration. The figure
    is saved in *monitoring_dir* as
    ``<labels_families>_<kind>_suggestions.png`` and the loaded data is
    kept in ``self.data``.

    Raises:
        ValueError: if ``self.labels_families`` is neither 'labels' nor
            'families' (the plot title would otherwise be undefined).
    """
    if self.labels_families == 'labels':
        title = 'Labels Suggestions Accuracy'
    elif self.labels_families == 'families':
        title = 'Families Suggestions Accuracy'
    else:
        raise ValueError("labels_families must be 'labels' or 'families', "
                         'got %r' % (self.labels_families,))

    data = self.loadEvolutionMonitoring(evolution_file)
    plot = PlotDataset(data['true_suggestions'] / data['num_suggestions'],
                       title)
    iterations = list(range(self.monitoring.iteration_number))

    plt.clf()
    plt.plot(iterations, plot.values,
             label=plot.label,
             color=plot.color,
             linewidth=plot.linewidth,
             marker=plot.marker)
    # The accuracy is a ratio, so the y axis is fixed to [0, 1].
    plt.ylim(0, 1)
    plt.xlabel('Iteration')
    plt.ylabel('Suggestions Accuracy')
    # The legend sits above the axes; it is passed to savefig so that the
    # tight bounding box does not crop it.
    lgd = plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2,
                     mode='expand', borderaxespad=0., fontsize='large')
    filename = '_'.join(
        [self.labels_families, self.kind, 'suggestions.png'])
    filename = path.join(monitoring_dir, filename)
    plt.savefig(filename, bbox_extra_artists=(lgd, ), bbox_inches='tight')
    plt.clf()
    self.data = data
def getTopWeightedFeatures(experiment_id, inst_exp_id, instance_id, size,
                           fold_id):
    """Return the bar plot of an instance's top weighted features as JSON.

    The instance's feature values are scaled with the 'scaler' step of the
    stored pipeline and multiplied by the coefficients of its 'model' step.
    The *size* features with the largest absolute weighted value are kept,
    largest first.

    When a kept feature name is longer than 30 characters, the bars are
    labelled with their rank and the tooltips hold 'name (value)';
    otherwise the bars are labelled with the names and the tooltips hold
    the raw values.

    Returns None when *fold_id* is 'all'. An empty selection (for instance
    *size* equal to 0) produces an empty bar plot instead of raising.
    """
    if fold_id == 'all':
        return None
    instance_id = int(instance_id)
    exp = ExperimentFactory.getFactory().fromJson(experiment_id, session)
    validation_experiment = ExperimentFactory.getFactory().fromJson(
        inst_exp_id, session)

    # Raw feature values of the instance.
    features_names, features_values = validation_experiment.getFeatures(
        instance_id)
    features_values = [float(value) for value in features_values]

    # Pipeline holding the scaler and the model.
    experiment_dir = exp.getOutputDirectory()
    if fold_id != 'None':
        experiment_dir = path.join(experiment_dir, fold_id)
    # NOTE(review): joblib.load unpickles the file, and fold_id is joined
    # into the path unvalidated - confirm it cannot come from untrusted
    # input.
    pipeline = joblib.load(path.join(experiment_dir, 'model', 'model.out'))

    # Weighted value = scaled value * model coefficient.
    scaled_values = pipeline.named_steps['scaler'].transform(
        np.reshape(features_values, (1, -1)))
    weighted_values = np.multiply(scaled_values,
                                  pipeline.named_steps['model'].coef_)

    features = list(zip(features_names, features_values, weighted_values[0]))
    # Ascending sort on |weighted value|, then a reversed slice of the tail
    # keeps the `size` largest entries, largest first.
    features.sort(key=lambda tup: abs(tup[2]))
    features = features[:-int(size) - 1:-1]
    features_names = [x[0] for x in features]
    features_values = [x[1] for x in features]
    features_weighted_values = [x[2] for x in features]

    # default=0 keeps an empty selection from raising ValueError.
    max_length = max((len(name) for name in features_names), default=0)
    if max_length > 30:
        # Long names would clutter the axis: show ranks, names in tooltips.
        labels = [str(i) for i in range(len(features_names))]
        tooltips = [
            name + ' (' + str(value) + ')'
            for name, value in zip(features_names, features_values)
        ]
    else:
        labels = features_names
        tooltips = features_values

    barplot = BarPlot(labels)
    dataset = PlotDataset(features_weighted_values, None)
    dataset.setColor(colors_tools.red)
    barplot.addDataset(dataset)
    return jsonify(barplot.toJson(tooltip_data=tooltips))
def generatePlotDatasets(self, instances):
    """Rebuild ``self.plot_datasets`` from *instances*.

    With ground truth, one dataset per label (malicious, then benign) is
    created through ``generateLabelPlotDatasets``. Without it, a single
    'all' dataset holds the values of ``self.feature`` for every instance,
    drawn in the color associated with 'all'.
    """
    self.plot_datasets = {}
    if not self.has_ground_truth:
        feature_values = instances.features.getFeatureValues(self.feature)
        self.plot_datasets['all'] = PlotDataset(feature_values, 'all')
        self.plot_datasets['all'].setColor(colors_tools.getLabelColor('all'))
        return
    for label in (labels_tools.MALICIOUS, labels_tools.BENIGN):
        self.generateLabelPlotDatasets(instances, label)
def plotEvolutionMonitoring(self, monitoring_dir):
    """Plot labels and families suggestions accuracy on one figure.

    Both curves are ``true_suggestions / num_suggestions`` computed from
    the high-confidence counts, plotted against iterations numbered from 1.
    The families curve is drawn in purple. The figure is saved in
    *monitoring_dir* as ``labels_families_high_confidence_suggestions.png``.
    """
    x_values = list(range(1, self.monitoring.iteration_number + 1))
    plt.clf()

    def draw_accuracy(counts, legend_label, color=None):
        # Plot one accuracy curve; fall back to the dataset's own color.
        accuracy = counts['true_suggestions'] / counts['num_suggestions']
        series = PlotDataset(accuracy, legend_label)
        plt.plot(x_values, series.values,
                 label=series.label,
                 color=series.color if color is None else color,
                 linewidth=series.linewidth,
                 marker=series.marker)

    draw_accuracy(self.labels_accuracy.high_confidence_counts.data,
                  'Labels Suggestions')
    draw_accuracy(self.families_accuracy.high_confidence_counts.data,
                  'Families Suggestions', color='purple')

    plt.ylim(0, 1)
    plt.xlabel('Iteration')
    plt.ylabel('Suggestions Accuracy')
    legend = plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2,
                        mode='expand', borderaxespad=0., fontsize='large')
    target = path.join(monitoring_dir,
                       'labels_families_high_confidence_suggestions.png')
    plt.savefig(target, bbox_extra_artists=(legend, ), bbox_inches='tight')
    plt.clf()
def getTopModelFeatures(experiment_id, size, train_test, fold_id):
    """Return the bar plot of the model's top coefficients as JSON.

    The 'mean' column of ``model_coefficients.csv`` is read from the
    experiment output directory (inside *fold_id* when it is neither
    'None' nor 'all', then inside *train_test*). The *size* coefficients
    with the largest absolute value are kept, largest first.

    When a kept feature name is longer than 30 characters, the bars are
    labelled with their rank and the names (with a line break inserted
    before ' WHERE') are passed as tooltips; otherwise the bars are
    labelled with the names. The series is colored red when the
    experiment's feature importance is 'weight'.

    An empty selection (for instance *size* equal to 0) produces an empty
    bar plot instead of raising.
    """
    size = int(size)
    exp = updateCurrentExperiment(experiment_id)
    directory = exp.getOutputDirectory()
    if fold_id not in ('None', 'all'):
        directory = path.join(directory, fold_id)
    directory = path.join(directory, train_test)
    filename = path.join(directory, 'model_coefficients.csv')
    with open(filename, 'r') as f:
        coefficients_df = pd.read_csv(f, header=0, index_col=0)
    model_coefficients = list(coefficients_df['mean'])
    features_names = list(map(str, coefficients_df.index))

    coefficients = list(zip(features_names, model_coefficients))
    # Ascending sort on |coefficient|, then a reversed slice of the tail
    # keeps the `size` largest entries, largest first.
    coefficients.sort(key=lambda tup: abs(tup[1]))
    coefficients = coefficients[:-size - 1:-1]
    coefficients_names = [coef[0] for coef in coefficients]
    coefficients_values = [coef[1] for coef in coefficients]

    # default=0 keeps an empty selection from raising ValueError.
    max_length = max((len(name) for name in coefficients_names), default=0)
    long_names = max_length > 30
    if long_names:
        # Long names would clutter the axis: show ranks, names in tooltips.
        labels = [str(i) for i in range(len(coefficients_names))]
    else:
        labels = coefficients_names

    barplot = BarPlot(labels)
    dataset = PlotDataset(coefficients_values, None)
    if exp.conf.featureImportance() == 'weight':
        dataset.setColor(colors_tools.red)
    barplot.addDataset(dataset)

    if long_names:
        tooltips = [
            name.replace(' WHERE', '\nWHERE') for name in coefficients_names
        ]
        return jsonify(barplot.toJson(tooltip_data=tooltips))
    return jsonify(barplot.toJson())
def getClusterStats(experiment_id):
    """Return, as a JSON response, the bar plot of instances per cluster.

    The clustering is loaded from the experiment's output directory. Each
    of its ``num_clusters`` clusters contributes one bar, labelled with the
    cluster's label and sized by its number of instance ids. Empty
    clusters are included.
    """
    experiment = updateCurrentExperiment(experiment_id)
    clustering = ClusteringExp.fromJson(experiment.getOutputDirectory())

    sizes = []
    cluster_labels = []
    for index in range(clustering.num_clusters):
        cluster = clustering.clusters[index]
        sizes.append(len(cluster.instances_ids))
        cluster_labels.append(cluster.label)

    barplot = BarPlot(cluster_labels)
    barplot.addDataset(PlotDataset(sizes, 'Num. Instances'))
    return jsonify(barplot.toJson())
def plotPerfEvolution(self, estimators, output_filename, data,
                      monitoring_dir):
    """Plot one performance curve per estimator and save the figure as PNG.

    For each name in *estimators*, ``data[name]`` is plotted against
    iterations numbered from 1, with the y axis fixed to [0, 1]. The file
    path is obtained from ``self.outputFilename(monitoring_dir,
    output_filename, 'png')``.
    """
    x_values = list(range(1, self.monitoring.iteration_number + 1))
    plt.clf()
    for name in estimators:
        series = PlotDataset(data[name], name)
        plt.plot(x_values, series.values,
                 label=series.label,
                 color=series.color,
                 linewidth=series.linewidth,
                 marker=series.marker)
    plt.ylim(0, 1)
    plt.xlabel('Iteration')
    plt.ylabel('Performance')
    legend = plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=3,
                        mode='expand', borderaxespad=0., fontsize='large')
    target = self.outputFilename(monitoring_dir, output_filename, 'png')
    plt.savefig(target, bbox_extra_artists=(legend,), bbox_inches='tight')
    plt.clf()
def getFamiliesPerformance(experiment_id, train_test, label, threshold):
    """Return the per-family performance bar plot at a threshold, as JSON.

    For the malicious label the 'Detection Rate' is read from
    ``tp_families_thresholds.csv``; for the benign label the
    'False Positive Rate' is read from ``fp_families_thresholds.csv``.
    Both files are looked up in ``<output dir>/<train_test>/families``.
    *threshold* is given in percent; the row whose index is closest to
    ``threshold / 100`` is plotted (the last index entry is not a
    candidate).

    Raises:
        ValueError: if *label* is neither the malicious nor the benign
            label (the file name would otherwise be undefined).
    """
    experiment = updateCurrentExperiment(experiment_id)
    directory = path.join(experiment.getOutputDirectory(), train_test,
                          'families')
    if label == labels_tools.MALICIOUS:
        filename = 'tp_'
        tp_fp = 'Detection Rate'
    elif label == labels_tools.BENIGN:
        filename = 'fp_'
        tp_fp = 'False Positive Rate'
    else:
        raise ValueError('Unsupported label: %r' % (label,))
    filename += 'families_thresholds.csv'
    with open(path.join(directory, filename), 'r') as f:
        perf = pd.read_csv(f, header=0, index_col=0)

    # NOTE(review): the last column is dropped from the bar labels but the
    # plotted row below still contains it - confirm this is intended.
    families = list(perf.columns.values[:-1])
    threshold = float(threshold) / 100
    thresholds = list(perf.index[:-1])
    # Index value closest to the requested threshold (first one on ties).
    threshold_value = min(thresholds, key=lambda t: abs(t - threshold))
    perf = list(perf.loc[threshold_value])

    barplot = BarPlot(families)
    barplot.addDataset(PlotDataset(perf, tp_fp))
    return jsonify(barplot.toJson())
def executionTimeDisplay(self):
    """Return the execution-time plot entries of this strategy.

    The list holds a single value-less ``PlotDataset`` labelled
    'Queries generation' and colored purple.
    """
    queries_entry = PlotDataset(None, 'Queries generation')
    queries_entry.setColor('purple')
    entries = [queries_entry]
    return entries
def executionTimeDisplay(self):
    """Return the execution-time plot entries of this strategy.

    The list starts with a value-less ``PlotDataset`` labelled
    'Binary model', followed by the entries returned by
    ``QueryStrategy.executionTimeDisplay``.
    """
    entries = [PlotDataset(None, 'Binary model')]
    entries += QueryStrategy.executionTimeDisplay(self)
    return entries
def executionTimeDisplay(self):
    """Return the execution-time plot entries of this strategy.

    The list starts with a value-less ``PlotDataset`` labelled 'Analysis',
    followed by the list returned by
    ``QueryStrategy.executionTimeDisplay``.
    """
    own_entries = [PlotDataset(None, 'Analysis')]
    inherited_entries = QueryStrategy.executionTimeDisplay(self)
    return own_entries + inherited_entries
def executionTimeDisplay(self):
    """Return the execution-time plot entries of this strategy.

    The list starts with a value-less ``PlotDataset`` labelled 'Analysis',
    followed by the entries returned by
    ``QueryStrategy.executionTimeDisplay``.
    """
    entries = [PlotDataset(None, 'Analysis')]
    entries += QueryStrategy.executionTimeDisplay(self)
    return entries
def executionTimeDisplay(self):
    """Return the execution-time plot entries of this strategy.

    Three value-less ``PlotDataset`` entries are returned in the order
    malicious, uncertain, benign. The malicious entry is dotted and the
    benign entry dashed, each drawn in its label's color; the uncertain
    entry keeps the default style.
    """
    def label_entry(title, linestyle, label):
        # Value-less dataset styled with the given line style and label color.
        entry = PlotDataset(None, title)
        entry.setLinestyle(linestyle)
        entry.setColor(colors_tools.getLabelColor(label))
        return entry

    uncertain_entry = PlotDataset(None, 'Uncertain Queries')
    malicious_entry = label_entry('Malicious Queries', 'dotted',
                                  labels_tools.MALICIOUS)
    benign_entry = label_entry('Benign Queries', 'dashed',
                               labels_tools.BENIGN)
    return [malicious_entry, uncertain_entry, benign_entry]
def executionTimeDisplay(self):
    """Return the execution-time plot entries of this strategy.

    The list starts with a value-less ``PlotDataset`` labelled
    'Binary model', followed by the list returned by
    ``QueryStrategy.executionTimeDisplay``.
    """
    own_entries = [PlotDataset(None, 'Binary model')]
    inherited_entries = QueryStrategy.executionTimeDisplay(self)
    return own_entries + inherited_entries