def getTopWeightedFeatures(experiment_id, inst_exp_id, instance_id, size):
    """Return a bar plot of the most heavily weighted features of an instance.

    The raw feature values of the instance are scaled with the ``'scaler'``
    step of the experiment's model pipeline and multiplied element-wise by
    the ``coef_`` attribute of its ``'model'`` step. The ``size`` features
    with the largest absolute weighted value are plotted, largest first, and
    the raw (unscaled) feature values are passed as tooltip data.

    Args:
        experiment_id: Identifier of the experiment providing the pipeline.
        inst_exp_id: Identifier of the experiment the instance's features
            are read from.
        instance_id: Identifier of the instance; converted with ``int``.
        size: Number of features to display; converted with ``int``.

    Returns:
        The result of ``jsonify`` applied to the bar plot's JSON
        representation.
    """
    instance_id = int(instance_id)
    size = int(size)
    exp = ExperimentFactory.getFactory().fromJson(experiment_id, session)
    validation_experiment = ExperimentFactory.getFactory().fromJson(
        inst_exp_id, session)

    # Raw feature values of the requested instance.
    features_names, features_values = validation_experiment.getFeatures(
        instance_id)
    features_values = [float(value) for value in features_values]

    # Pipeline holding the 'scaler' and 'model' steps.
    pipeline = exp.getModelPipeline()

    # The scaler is given a single-row 2D array, hence the reshape.
    scaled_values = pipeline.named_steps['scaler'].transform(
        np.reshape(features_values, (1, -1)))
    # NOTE(review): assumes coef_ broadcasts against a (1, n_features) row,
    # e.g. a binary linear model - confirm for multi-class models.
    weighted_values = np.multiply(scaled_values,
                                  pipeline.named_steps['model'].coef_)

    # sorted(zip(...)) rather than map(...).sort(): on Python 3 map() returns
    # an iterator that has no sort() method.
    features = sorted(
        zip(features_names, features_values, weighted_values[0]),
        key=lambda feature: abs(feature[2]))
    # Ascending sort followed by a reversed slice of the tail: the `size`
    # entries with the largest absolute weight, in decreasing order.
    features = features[:-size - 1:-1]

    tooltips = [value for _, value, _ in features]
    barplot = BarPlot([name for name, _, _ in features])
    dataset = PlotDataset([weighted for _, _, weighted in features], None)
    dataset.setColor(colors_tools.red)
    barplot.addDataset(dataset)
    return jsonify(barplot.toJson(tooltip_data=tooltips))
def getTopModelFeatures(experiment_id, size):
    """Return a bar plot of the model features with the largest coefficients.

    Feature names are paired with the values returned by
    ``exp.getTopFeatures()`` and the ``size`` pairs with the largest
    absolute value are plotted, largest first.

    Args:
        experiment_id: Identifier of the experiment to load.
        size: Number of features to display; converted with ``int``.

    Returns:
        The result of ``jsonify`` applied to the bar plot's JSON
        representation.
    """
    size = int(size)
    exp = ExperimentFactory.getFactory().fromJson(experiment_id, session)
    # NOTE(review): presumably one coefficient per feature name, in the same
    # order - confirm against getTopFeatures() / getFeaturesNames().
    model_coefficients = exp.getTopFeatures()
    features_names = exp.getFeaturesNames()

    # sorted(zip(...)) rather than map(...).sort(): on Python 3 map() returns
    # an iterator that has no sort() method.
    coefficients = sorted(zip(features_names, model_coefficients),
                          key=lambda pair: abs(pair[1]))
    # Ascending sort followed by a reversed slice of the tail: the `size`
    # entries with the largest absolute value, in decreasing order.
    coefficients = coefficients[:-size - 1:-1]

    barplot = BarPlot([name for name, _ in coefficients])
    dataset = PlotDataset([coef for _, coef in coefficients], None)
    # The red color is only applied when the importance is a 'weight'.
    if exp.classification_conf.featureImportance() == 'weight':
        dataset.setColor(colors_tools.red)
    barplot.addDataset(dataset)
    return jsonify(barplot.toJson())
def getClusterStats(experiment_id):
    """Return a bar plot of the number of instances in each cluster.

    One bar is produced for every cluster index in
    ``range(clustering.num_clusters)``: it is named after the cluster's
    ``label`` and its height is the length of the cluster's
    ``instances_ids``. Empty clusters are kept (no filtering is applied).

    Args:
        experiment_id: Identifier passed to ``updateCurrentExperiment``.

    Returns:
        The result of ``jsonify`` applied to the bar plot's JSON
        representation.
    """
    clustering = Clustering.fromJson(updateCurrentExperiment(experiment_id))
    clusters = [clustering.clusters[index]
                for index in range(clustering.num_clusters)]

    cluster_sizes = [len(cluster.instances_ids) for cluster in clusters]
    cluster_labels = [cluster.label for cluster in clusters]

    barplot = BarPlot(cluster_labels)
    barplot.addDataset(PlotDataset(cluster_sizes, 'Num. Instances'))
    return jsonify(barplot.toJson())
def getFamiliesBarplot(experiment_id, iteration, label):
    """Return a bar plot of the number of instances per family for a label.

    The counts come from ``labels_tools.getFamiliesCounts`` and the bars are
    ordered by family name (ascending), colored with the color associated
    with ``label``.

    Args:
        experiment_id: Identifier passed to ``updateCurrentExperiment``.
        iteration: Forwarded as ``iteration_max``; the string ``'None'`` is
            translated to ``None``.
        label: Label whose families are counted; also selects the color.

    Returns:
        The result of ``jsonify`` applied to the bar plot's JSON
        representation.
    """
    experiment = updateCurrentExperiment(experiment_id)
    experiment_label_id = experiment.labels_id
    # The value arrives as text, so a missing iteration is spelled 'None'.
    if iteration == 'None':
        iteration = None
    family_counts = labels_tools.getFamiliesCounts(experiment.session,
                                                   experiment_label_id,
                                                   iteration_max=iteration,
                                                   label=label)

    # Materialize the keys once: on Python 3 dict.keys() is a view, which
    # should not be handed to the DataFrame constructor as a column.
    families = list(family_counts.keys())
    df = pd.DataFrame({
        'families': families,
        'counts': [family_counts[family] for family in families],
    })
    matrix_tools.sortDataFrame(df, 'families', ascending=True, inplace=True)

    barplot = BarPlot(list(df['families']))
    dataset = PlotDataset(list(df['counts']), 'Num. Instances')
    dataset.setColor(colors_tools.getLabelColor(label))
    barplot.addDataset(dataset)
    return jsonify(barplot.toJson())
def getFamiliesPerformance(experiment_id, train_test, label, threshold):
    """Return a bar plot of the per-family performance at a given threshold.

    The values are read from
    ``<output directory><train_test>/families/<prefix>families_thresholds.csv``
    where the prefix is ``tp_`` for the ``'malicious'`` label and ``fp_`` for
    the ``'benign'`` label. The row whose index is closest to
    ``threshold / 100`` is plotted.

    Args:
        experiment_id: Identifier passed to ``updateCurrentExperiment``.
        train_test: Name of the sub-directory of the output directory.
        label: Either ``'malicious'`` or ``'benign'``.
        threshold: Threshold expressed on a 0-100 scale; converted with
            ``float`` and divided by 100.

    Returns:
        The result of ``jsonify`` applied to the bar plot's JSON
        representation.

    Raises:
        ValueError: If ``label`` is neither ``'malicious'`` nor ``'benign'``.
    """
    experiment = updateCurrentExperiment(experiment_id)

    # label -> (file name prefix, legend of the dataset)
    label_settings = {
        'malicious': ('tp_', 'Detection Rate'),
        'benign': ('fp_', 'False Positive Rate'),
    }
    if label not in label_settings:
        # Without this check the legend was never bound and the file name had
        # no prefix, which surfaced later as an unrelated error.
        raise ValueError(
            "Unknown label {!r}: expected 'malicious' or 'benign'".format(
                label))
    prefix, tp_fp = label_settings[label]

    filename = experiment.getOutputDirectory() + train_test + '/families/'
    filename += prefix + 'families_thresholds.csv'
    with open(filename, 'r') as f:
        perf = pd.read_csv(f, header=0, index_col=0)

    # The last column and the last index entry are deliberately left out.
    families = list(perf.columns.values[:-1])
    thresholds = list(perf.index[:-1])

    # Pick the available threshold closest to the requested one (the first
    # one wins in case of a tie).
    threshold = float(threshold) / 100
    threshold_value = min(thresholds,
                          key=lambda candidate: abs(candidate - threshold))

    # NOTE(review): this row still contains the value of the last column,
    # which is excluded from `families` - confirm this is intended.
    family_values = list(perf.loc[threshold_value])

    barplot = BarPlot(families)
    barplot.addDataset(PlotDataset(family_values, tp_fp))
    return jsonify(barplot.toJson())