def display(self, directory):
    """Export the barplots of the prediction ranges as JSON files.

    Two files are written. Their names are appended directly to
    ``directory`` (no path separator is inserted):

    * ``predictions_barplot.json``: number of instances in each element
      of ``self.ranges``;
    * ``predictions_barplot_labels.json``: the same counts split into a
      'malicious' dataset (instances whose ``'true_label'`` entry is
      truthy) and a 'benign' dataset (the other instances).
    """
    # '0-10%', '10-20%', ..., '90-100%'
    labels = ['%d-%d%%' % (low, low + 10) for low in range(0, 100, 10)]

    # Overall number of instances per range.
    barplot = BarPlot(labels)
    barplot.addDataset([len(instances) for instances in self.ranges],
                       colors_tools.getLabelColor('all'),
                       'numInstances')
    with open(directory + 'predictions_barplot.json', 'w') as f:
        barplot.display(f)

    # Number of instances per range, split by ground-truth label.
    # Comprehensions are used instead of nested map/filter calls so the
    # counts are real lists whatever the Python version.
    num_malicious = [len([x for x in instances if x['true_label']])
                     for instances in self.ranges]
    num_benign = [len([x for x in instances if not x['true_label']])
                  for instances in self.ranges]
    barplot = BarPlot(labels)
    barplot.addDataset(num_malicious,
                       colors_tools.getLabelColor('malicious'),
                       'malicious')
    barplot.addDataset(num_benign,
                       colors_tools.getLabelColor('benign'),
                       'benign')
    with open(directory + 'predictions_barplot_labels.json', 'w') as f:
        barplot.display(f)
def getTopWeightedFeatures(project, dataset, experiment, instance_dataset,
                           inst_exp_id, instance_id, size):
    """Return a JSON barplot of the most heavily weighted features of an instance.

    The raw feature values of the instance are scaled with the 'scaler'
    step of the model pipeline and multiplied by the ``coef_`` of its
    'model' step. The ``size`` features with the largest absolute weighted
    value are kept, largest first.

    Args:
        project, dataset, experiment: identify the experiment holding the
            model pipeline.
        instance_dataset, inst_exp_id: identify the experiment the
            instance's features are read from.
        instance_id: identifier of the instance (converted with ``int``).
        size: number of features to keep (converted with ``int``).

    Returns:
        The ``jsonify``-ed barplot: feature names as labels, weighted
        values as bars, raw feature values as tooltips.
    """
    instance_id = int(instance_id)
    size = int(size)
    model_experiment_obj = ExperimentFactory.getFactory().fromJson(
        project, dataset, experiment, db, cursor)
    validation_experiment = ExperimentFactory.getFactory().fromJson(
        project, instance_dataset, inst_exp_id, db, cursor)

    # Raw features of the instance.
    features_names, features_values = validation_experiment.getFeatures(
        instance_id)
    features_values = [float(value) for value in features_values]

    # Scale the features (as a single-row matrix) and weight them with the
    # model coefficients.
    pipeline = model_experiment_obj.getModelPipeline()
    scaled_values = pipeline.named_steps['scaler'].transform(
        np.reshape(features_values, (1, -1)))
    weighted_values = np.multiply(scaled_values,
                                  pipeline.named_steps['model'].coef_)

    # (name, raw value, weighted value) triples, sorted by increasing
    # absolute weighted value. sorted(zip(...)) yields a real list on every
    # Python version, unlike calling .sort() on the result of map().
    features = sorted(
        zip(features_names, features_values, weighted_values[0]),
        key=lambda feature: abs(feature[2]))
    # Last `size` entries in reverse order: the top features, largest first.
    features = features[:-size - 1:-1]

    barplot = BarPlot([name for name, _, _ in features])
    barplot.addDataset([weighted for _, _, weighted in features],
                       colors_tools.red, None)
    barplot.addTooltips([value for _, value, _ in features])
    return jsonify(barplot.barplot)
def plotEvolutionMonitoring(self):
    """Write the families evolution barplot to a JSON file.

    One dataset is added per row of ``self.data``; it holds every column
    of the row except the first one, is coloured 'blue' and is named after
    the row position. The output file name is
    ``self.output_directory + self.label + '_families_evolution.json'``.
    """
    evolution_plot = BarPlot(self.families)
    num_rows = self.data.shape[0]
    for row in range(num_rows):
        row_values = list(self.data.iloc[row, 1:])
        evolution_plot.addDataset(row_values, 'blue', str(row))

    output_path = self.output_directory
    output_path += self.label + '_families_evolution.json'
    with open(output_path, 'w') as output_file:
        evolution_plot.display(output_file)
def plotEvolutionMonitoring(self):
    """Export the families evolution barplot as JSON.

    For each row of ``self.data`` a ``PlotDataset`` named after the row
    position is added to a barplot labelled with ``self.families``. The
    result is written to
    ``self.output_directory + self.label + '_families_evolution.json'``.
    """
    evolution_plot = BarPlot(self.families)
    num_rows = self.data.shape[0]
    for row in range(num_rows):
        # NOTE(review): only the single value stored in column 1 of the row
        # is plotted -- confirm that this is intended.
        row_values = [self.data.iloc[row, 1]]
        evolution_plot.addDataset(PlotDataset(row_values, str(row)))

    output_path = self.output_directory
    output_path += self.label + '_families_evolution.json'
    with open(output_path, 'w') as output_file:
        evolution_plot.exportJson(output_file)
def generateBinaryHistogram(self):
    """Export the histogram of a binary feature as JSON.

    For every dataset stored in ``self.plot_datasets``, the number of
    values equal to 0 and the number of values equal to 1 are plotted with
    the dataset's own color and label. The barplot is written to
    ``self.output_directory + 'binary_histogram.json'``.
    """
    barplot = BarPlot(['0', '1'])
    # Only the datasets are needed, not their keys; .values() is also
    # available on every Python version, unlike .iteritems().
    for dataset in self.plot_datasets.values():
        num_0 = sum(dataset.values == 0)
        num_1 = sum(dataset.values == 1)
        barplot.addDataset([num_0, num_1], dataset.color, dataset.label)
    output_filename = self.output_directory + 'binary_histogram.json'
    with open(output_filename, 'w') as f:
        barplot.display(f)
def plotEvolutionMonitoring(self, evolution_file, monitoring_dir):
    """Export the families evolution barplot as JSON.

    The monitoring data is loaded from ``evolution_file`` with
    ``self.loadEvolutionMonitoring``. For each row a ``PlotDataset`` named
    after the row position is added to a barplot labelled with
    ``self.families``. The result is written to
    ``monitoring_dir + self.label + '_families_evolution.json'``.
    """
    monitoring_data = self.loadEvolutionMonitoring(evolution_file)
    evolution_plot = BarPlot(self.families)
    num_rows = monitoring_data.shape[0]
    for row in range(num_rows):
        # NOTE(review): only the single value stored in column 1 of the row
        # is plotted -- confirm that this is intended.
        row_values = [monitoring_data.iloc[row, 1]]
        evolution_plot.addDataset(PlotDataset(row_values, str(row)))

    output_path = monitoring_dir
    output_path += self.label + '_families_evolution.json'
    with open(output_path, 'w') as output_file:
        evolution_plot.exportJson(output_file)
def getTopModelCoefficients(project, dataset, experiment, size):
    """Return a JSON barplot of the largest model coefficients.

    The coefficients are read from the first row of ``coef_`` of the
    'model' step of the experiment's pipeline and paired with the
    experiment's feature names. The ``size`` coefficients with the largest
    absolute value are kept, largest first.

    Args:
        project, dataset, experiment: identify the experiment to load.
        size: number of coefficients to keep (converted with ``int``).

    Returns:
        The ``jsonify``-ed barplot: feature names as labels, coefficients
        as bars.
    """
    size = int(size)
    model_experiment_obj = ExperimentFactory.getFactory().fromJson(
        project, dataset, experiment, db, cursor)
    pipeline = model_experiment_obj.getModelPipeline()
    model_coefficients = pipeline.named_steps['model'].coef_[0]
    features_names = model_experiment_obj.getFeaturesNames()

    # (name, coefficient) pairs sorted by increasing absolute coefficient.
    # sorted(zip(...)) yields a real list on every Python version, unlike
    # calling .sort() on the result of map().
    coefficients = sorted(zip(features_names, model_coefficients),
                          key=lambda pair: abs(pair[1]))
    # Last `size` entries in reverse order: the top ones, largest first.
    coefficients = coefficients[:-size - 1:-1]

    barplot = BarPlot([name for name, _ in coefficients])
    barplot.addDataset([coef for _, coef in coefficients], '#d9534f', None)
    return jsonify(barplot.barplot)
def getTopModelFeatures(experiment_id, size):
    """Return a JSON barplot of the top features of an experiment's model.

    The values returned by the experiment's ``getTopFeatures()`` are paired
    with its feature names; the ``size`` entries with the largest absolute
    value are kept, largest first. The bars are coloured with
    ``colors_tools.red`` only when the configured feature importance is
    'weight'.

    Args:
        experiment_id: identifier of the experiment to load.
        size: number of features to keep (converted with ``int``).

    Returns:
        The ``jsonify``-ed JSON representation of the barplot.
    """
    size = int(size)
    exp = ExperimentFactory.getFactory().fromJson(experiment_id, session)
    model_coefficients = exp.getTopFeatures()
    features_names = exp.getFeaturesNames()

    # (name, value) pairs sorted by increasing absolute value.
    # sorted(zip(...)) yields a real list on every Python version, unlike
    # calling .sort() on the result of map().
    coefficients = sorted(zip(features_names, model_coefficients),
                          key=lambda pair: abs(pair[1]))
    # Last `size` entries in reverse order: the top ones, largest first.
    coefficients = coefficients[:-size - 1:-1]

    barplot = BarPlot([name for name, _ in coefficients])
    dataset = PlotDataset([coef for _, coef in coefficients], None)
    if exp.classification_conf.featureImportance() == 'weight':
        dataset.setColor(colors_tools.red)
    barplot.addDataset(dataset)
    return jsonify(barplot.toJson())
def generateHistogram(self):
    """Export the histogram of a numerical feature as JSON.

    Ten equal-width bins are computed on a reference dataset, then every
    dataset of ``self.plot_datasets`` is binned with these shared edges and
    plotted with its own label and color. The barplot is written to
    ``self.output_directory + 'histogram.json'``.
    """
    # The bin edges come from the 'all' dataset when there are no true
    # labels, and from the 'malicious' dataset otherwise.
    # NOTE(review): in the second case, values of the other datasets lying
    # outside the 'malicious' range fall outside the shared edges -- confirm
    # that this is intended.
    reference = 'malicious' if self.has_true_labels else 'all'
    _, bin_edges = np.histogram(self.plot_datasets[reference].values,
                                bins=10, density=False)
    x_labels = [str(low) + ' - ' + str(high)
                for low, high in zip(bin_edges[:-1], bin_edges[1:])]

    barplot = BarPlot(x_labels)
    # Only the datasets are needed, not their keys; .values() is also
    # available on every Python version, unlike .iteritems().
    for dataset in self.plot_datasets.values():
        hist, _ = np.histogram(dataset.values, bins=bin_edges,
                               density=False)
        hist_dataset = PlotDataset(hist, dataset.label)
        hist_dataset.setColor(dataset.color)
        barplot.addDataset(hist_dataset)

    output_filename = self.output_directory + 'histogram.json'
    with open(output_filename, 'w') as f:
        barplot.exportJson(f)
def getClusterStats(project, dataset, experiment_id):
    """Return a JSON barplot of the number of instances in each cluster.

    The clustering is rebuilt from the experiment identified by
    ``project``, ``dataset`` and ``experiment_id``. Clusters without any
    instance are left out; the others are labelled ``c_<index>``.
    """
    experiment = ExperimentFactory.getFactory().fromJson(
        project, dataset, experiment_id, db, cursor)
    clustering = Clustering.fromJson(experiment)
    num_clusters = clustering.num_clusters
    mysql_tools.useDatabase(cursor, project, dataset)

    cluster_names = []
    cluster_sizes = []
    for cluster_index in range(num_clusters):
        size = len(clustering.clusters[cluster_index].instances_ids)
        if size == 0:
            # Empty clusters are not displayed.
            continue
        cluster_sizes.append(size)
        cluster_names.append('c_' + str(cluster_index))

    barplot = BarPlot(cluster_names)
    barplot.addDataset(cluster_sizes, colors_tools.blue, 'Num. Instances')
    return jsonify(barplot.barplot)
def getClusterStats(experiment_id):
    """Return a JSON barplot of the number of instances in each cluster.

    The clustering is rebuilt from the experiment returned by
    ``updateCurrentExperiment``. Every cluster is included, even when it
    has no instance, and is identified by its own ``label`` attribute.
    """
    experiment = updateCurrentExperiment(experiment_id)
    clustering = Clustering.fromJson(experiment)

    cluster_sizes = []
    cluster_names = []
    for cluster_index in range(clustering.num_clusters):
        cluster = clustering.clusters[cluster_index]
        cluster_sizes.append(len(cluster.instances_ids))
        cluster_names.append(cluster.label)

    barplot = BarPlot(cluster_names)
    barplot.addDataset(PlotDataset(cluster_sizes, 'Num. Instances'))
    return jsonify(barplot.toJson())
def getFamiliesPerformance(project, dataset, experiment, train_test, label,
                           threshold):
    """Return a JSON barplot of the per-family performance at a threshold.

    The performance is read from the CSV file
    ``<experiment dir><train_test>/families/<prefix>families_thresholds.csv``
    where the prefix is ``tp_`` for 'malicious' and ``fp_`` for 'benign'.
    The row whose index is the closest to ``threshold / 100`` is plotted
    (the last row of the file is never selected).

    Args:
        project, dataset, experiment: passed to ``getDir`` to locate the
            experiment directory.
        train_test: name of the sub-directory to read from.
        label: 'malicious' (detection rate) or 'benign' (false positive
            rate).
        threshold: threshold as a percentage (converted with ``float``).

    Returns:
        The ``jsonify``-ed barplot.

    Raises:
        ValueError: if ``label`` is neither 'malicious' nor 'benign'.
    """
    # Fail explicitly on an unknown label rather than building a wrong file
    # name and leaving the dataset name undefined.
    if label == 'malicious':
        prefix, tp_fp = 'tp_', 'Detection Rate'
    elif label == 'benign':
        prefix, tp_fp = 'fp_', 'False Positive Rate'
    else:
        raise ValueError(
            "label must be 'malicious' or 'benign', got %r" % (label,))

    filename = getDir(project, dataset, experiment) + train_test
    filename += '/families/' + prefix + 'families_thresholds.csv'
    with open(filename, 'r') as f:
        perf = pd.read_csv(f, header=0, index_col=0)

    # The last column is not used as a family label.
    families = list(perf.columns.values[:-1])
    threshold = float(threshold) / 100
    # Closest available threshold; the last index entry is not a candidate.
    thresholds = list(perf.index[:-1])
    threshold_value = min(thresholds, key=lambda t: abs(t - threshold))
    # NOTE(review): the selected row keeps every column, whereas the labels
    # drop the last one -- confirm that the extra value is expected.
    values = list(perf.loc[threshold_value])

    barplot = BarPlot(families)
    barplot.addDataset(values, colors_tools.getLabelColor('all'), tp_fp)
    return jsonify(barplot.barplot)
def getFamiliesBarplot(project, dataset, experiment_id, iteration, label):
    """Return a JSON barplot of the number of instances in each family.

    The counts are obtained from ``labels_tools.getFamiliesCounts`` for the
    given ``label``, limited to ``iteration`` (the string 'None' means no
    limit), and are displayed sorted by family in ascending order.
    """
    experiment = ExperimentFactory.getFactory().fromJson(
        project, dataset, experiment_id, db, cursor)
    experiment_label_id = experiment.experiment_label_id
    iteration_max = None if iteration == 'None' else iteration
    family_counts = labels_tools.getFamiliesCounts(cursor,
                                                   experiment_label_id,
                                                   iteration_max=iteration_max,
                                                   label=label)

    counts = [family_counts[family] for family in family_counts.keys()]
    counts_frame = pd.DataFrame({
        'families': family_counts.keys(),
        'counts': counts,
    })
    matrix_tools.sortDataFrame(counts_frame, 'families', ascending=True,
                               inplace=True)

    families_plot = BarPlot(list(counts_frame['families']))
    families_plot.addDataset(list(counts_frame['counts']),
                             colors_tools.getLabelColor(label),
                             'Num. Instances')
    return jsonify(families_plot.barplot)
def getFamiliesBarplot(experiment_id, iteration, label):
    """Return a JSON barplot of the number of instances in each family.

    The counts are obtained from ``labels_tools.getFamiliesCounts`` for the
    given ``label``, limited to ``iteration`` (the string 'None' means no
    limit), and are displayed sorted by family in ascending order with the
    color associated with ``label``.
    """
    experiment = updateCurrentExperiment(experiment_id)
    experiment_label_id = experiment.labels_id
    iteration_max = None if iteration == 'None' else iteration
    family_counts = labels_tools.getFamiliesCounts(experiment.session,
                                                   experiment_label_id,
                                                   iteration_max=iteration_max,
                                                   label=label)

    counts = [family_counts[family] for family in family_counts.keys()]
    counts_frame = pd.DataFrame({
        'families': family_counts.keys(),
        'counts': counts,
    })
    matrix_tools.sortDataFrame(counts_frame, 'families', ascending=True,
                               inplace=True)

    families_plot = BarPlot(list(counts_frame['families']))
    counts_dataset = PlotDataset(list(counts_frame['counts']),
                                 'Num. Instances')
    counts_dataset.setColor(colors_tools.getLabelColor(label))
    families_plot.addDataset(counts_dataset)
    return jsonify(families_plot.toJson())
def getFamiliesPerformance(experiment_id, train_test, label, threshold):
    """Return a JSON barplot of the per-family performance at a threshold.

    The performance is read from the CSV file
    ``<output dir><train_test>/families/<prefix>families_thresholds.csv``
    where the prefix is ``tp_`` for 'malicious' and ``fp_`` for 'benign'.
    The row whose index is the closest to ``threshold / 100`` is plotted
    (the last row of the file is never selected).

    Args:
        experiment_id: identifier passed to ``updateCurrentExperiment``.
        train_test: name of the sub-directory to read from.
        label: 'malicious' (detection rate) or 'benign' (false positive
            rate).
        threshold: threshold as a percentage (converted with ``float``).

    Returns:
        The ``jsonify``-ed JSON representation of the barplot.

    Raises:
        ValueError: if ``label`` is neither 'malicious' nor 'benign'.
    """
    # Fail explicitly on an unknown label rather than building a wrong file
    # name and leaving the dataset name undefined.
    if label == 'malicious':
        prefix, tp_fp = 'tp_', 'Detection Rate'
    elif label == 'benign':
        prefix, tp_fp = 'fp_', 'False Positive Rate'
    else:
        raise ValueError(
            "label must be 'malicious' or 'benign', got %r" % (label,))

    experiment = updateCurrentExperiment(experiment_id)
    filename = experiment.getOutputDirectory() + train_test
    filename += '/families/' + prefix + 'families_thresholds.csv'
    with open(filename, 'r') as f:
        perf = pd.read_csv(f, header=0, index_col=0)

    # The last column is not used as a family label.
    families = list(perf.columns.values[:-1])
    threshold = float(threshold) / 100
    # Closest available threshold; the last index entry is not a candidate.
    thresholds = list(perf.index[:-1])
    threshold_value = min(thresholds, key=lambda t: abs(t - threshold))
    # NOTE(review): the selected row keeps every column, whereas the labels
    # drop the last one -- confirm that the extra value is expected.
    values = list(perf.loc[threshold_value])

    barplot = BarPlot(families)
    barplot.addDataset(PlotDataset(values, tp_fp))
    return jsonify(barplot.toJson())