Ejemplo n.º 1
0
def getFamiliesBarplot(experiment_id, iteration, label):
    """Return a JSON bar plot of the per-family counts for ``label``.

    Args:
        experiment_id: Identifier forwarded to
            ``annotations_db_tools.getFamiliesCounts``.
        iteration: Forwarded as ``iteration_max``; the literal string
            ``'None'`` is translated to ``None``.
        label: Label forwarded to ``getFamiliesCounts`` and used to pick
            the bar color.

    Returns:
        The result of ``jsonify`` applied to the bar plot, with families
        sorted in ascending order.
    """
    # The caller passes the string 'None' when no iteration is specified.
    iteration_max = None if iteration == 'None' else iteration
    family_counts = annotations_db_tools.getFamiliesCounts(
        session, experiment_id, iteration_max=iteration_max, label=label)

    families = list(family_counts.keys())
    counts_df = pd.DataFrame({
        'families': families,
        'counts': [family_counts[family] for family in families],
    })
    matrix_tools.sortDataFrame(
        counts_df, 'families', ascending=True, inplace=True)

    barplot = BarPlot(list(counts_df['families']))
    counts_dataset = PlotDataset(list(counts_df['counts']), 'Num. Instances')
    counts_dataset.setColor(colors_tools.getLabelColor(label))
    barplot.addDataset(counts_dataset)
    return jsonify(barplot.toJson())
Ejemplo n.º 2
0
def getTopWeightedFeatures(experiment_id, inst_exp_id, instance_id, size,
                           fold_id):
    """Return a JSON bar plot of an instance's top weighted features.

    The instance's feature values are scaled with the pipeline's
    ``'scaler'`` step and multiplied element-wise by the ``coef_`` of its
    ``'model'`` step. The ``size`` features with the largest absolute
    weighted value are kept, largest first.

    Args:
        experiment_id: Experiment whose output directory holds the model.
        inst_exp_id: Experiment from which the instance's features are read.
        instance_id: Instance identifier (converted with ``int``).
        size: Number of features to keep (converted with ``int``).
        fold_id: ``'all'`` disables the plot, ``'None'`` means the model is
            stored directly in the experiment directory, any other value
            is used as a sub-directory name.

    Returns:
        ``None`` when ``fold_id == 'all'``; otherwise the result of
        ``jsonify`` applied to the bar plot. An empty selection (e.g.
        ``size == 0``) yields an empty plot instead of raising.
    """
    if fold_id == 'all':
        return None
    instance_id = int(instance_id)
    exp = ExperimentFactory.getFactory().fromJson(experiment_id, session)
    validation_experiment = ExperimentFactory.getFactory().fromJson(
        inst_exp_id, session)

    # Features of the requested instance.
    features_names, features_values = validation_experiment.getFeatures(
        instance_id)
    features_values = [float(value) for value in features_values]

    # Pipeline holding the scaler and the model.
    experiment_dir = exp.getOutputDirectory()
    if fold_id != 'None':
        experiment_dir = path.join(experiment_dir, fold_id)
    # NOTE(review): joblib.load unpickles the file; it must only be used on
    # trusted model files, and fold_id is joined into the path unchecked.
    pipeline = joblib.load(path.join(experiment_dir, 'model', 'model.out'))

    # Scale the single instance (reshaped to one row) and weight each
    # feature by the corresponding model coefficient.
    scaled_values = pipeline.named_steps['scaler'].transform(
        np.reshape(features_values, (1, -1)))
    weighted_values = np.multiply(scaled_values,
                                  pipeline.named_steps['model'].coef_)

    features = list(zip(features_names, features_values, weighted_values[0]))
    # Ascending sort followed by a reversed slice of the tail: the ``size``
    # largest absolute weights, largest first.
    features.sort(key=lambda tup: abs(tup[2]))
    features = features[:-int(size) - 1:-1]

    features_names = [x[0] for x in features]
    features_values = [x[1] for x in features]
    features_weighted_values = [x[2] for x in features]

    # default=0 keeps an empty selection from raising ValueError.
    max_length = max((len(name) for name in features_names), default=0)
    if max_length > 30:
        # Names too long for the axis: show indices, keep names in tooltips.
        labels = [str(i) for i in range(len(features_names))]
        tooltips = [
            name + ' (' + str(value) + ')'
            for name, value in zip(features_names, features_values)
        ]
    else:
        labels = features_names
        tooltips = features_values
    barplot = BarPlot(labels)
    dataset = PlotDataset(features_weighted_values, None)
    dataset.setColor(colors_tools.red)
    barplot.addDataset(dataset)
    return jsonify(barplot.toJson(tooltip_data=tooltips))
Ejemplo n.º 3
0
def getClusterStats(experiment_id):
    """Return a JSON bar plot of the number of instances in each cluster.

    The clustering is loaded from the experiment's output directory. Every
    cluster index in ``range(num_clusters)`` is reported, including clusters
    without any instance, and bars are labelled with each cluster's
    ``label``.

    Args:
        experiment_id: Identifier passed to ``updateCurrentExperiment``.

    Returns:
        The result of ``jsonify`` applied to the bar plot.
    """
    experiment = updateCurrentExperiment(experiment_id)
    clustering = ClusteringExp.fromJson(experiment.getOutputDirectory())

    cluster_labels = []
    cluster_sizes = []
    for cluster_index in range(clustering.num_clusters):
        cluster = clustering.clusters[cluster_index]
        cluster_sizes.append(len(cluster.instances_ids))
        cluster_labels.append(cluster.label)

    barplot = BarPlot(cluster_labels)
    barplot.addDataset(PlotDataset(cluster_sizes, 'Num. Instances'))
    return jsonify(barplot.toJson())
Ejemplo n.º 4
0
def getTopModelFeatures(experiment_id, size, train_test, fold_id):
    """Return a JSON bar plot of the model's top coefficients.

    Reads ``model_coefficients.csv`` from the experiment output directory
    (inside the ``fold_id`` sub-directory unless ``fold_id`` is ``'None'``
    or ``'all'``, then inside ``train_test``) and keeps the ``size`` entries
    of its ``'mean'`` column with the largest absolute value, largest first.

    Args:
        experiment_id: Identifier passed to ``updateCurrentExperiment``.
        size: Number of coefficients to keep (converted with ``int``).
        train_test: Sub-directory name containing the CSV file.
        fold_id: Fold sub-directory, or ``'None'`` / ``'all'`` for none.

    Returns:
        The result of ``jsonify`` applied to the bar plot. An empty
        selection (e.g. ``size == 0``) yields an empty plot instead of
        raising.
    """
    size = int(size)
    exp = updateCurrentExperiment(experiment_id)

    directory = exp.getOutputDirectory()
    if fold_id not in ('None', 'all'):
        directory = path.join(directory, fold_id)
    directory = path.join(directory, train_test)
    filename = path.join(directory, 'model_coefficients.csv')
    # Only the read needs the file handle; release it before processing.
    with open(filename, 'r') as f:
        coefficients_df = pd.read_csv(f, header=0, index_col=0)

    model_coefficients = list(coefficients_df['mean'])
    features_names = list(map(str, coefficients_df.index))
    coefficients = list(zip(features_names, model_coefficients))
    # Ascending sort followed by a reversed slice of the tail: the ``size``
    # largest absolute coefficients, largest first.
    coefficients.sort(key=lambda tup: abs(tup[1]))
    coefficients = coefficients[:-size - 1:-1]

    coefficients_names = [coef[0] for coef in coefficients]
    coefficients_values = [coef[1] for coef in coefficients]
    # default=0 keeps an empty selection from raising ValueError.
    max_length = max((len(name) for name in coefficients_names), default=0)

    if max_length > 30:
        # Names too long for the axis: label bars with their rank and move
        # the names (with a line break before WHERE) to the tooltips.
        labels = [str(i) for i in range(len(coefficients_names))]
        tooltips = [
            name.replace(' WHERE', '\nWHERE') for name in coefficients_names
        ]
    else:
        labels = coefficients_names
        tooltips = None

    barplot = BarPlot(labels)
    dataset = PlotDataset(coefficients_values, None)
    if exp.conf.featureImportance() == 'weight':
        dataset.setColor(colors_tools.red)
    barplot.addDataset(dataset)
    if tooltips is None:
        return jsonify(barplot.toJson())
    return jsonify(barplot.toJson(tooltip_data=tooltips))
Ejemplo n.º 5
0
 def generateHistogram(self):
     """Write ``histogram.json`` with one histogram per plot dataset.

     Ten equal-width bins are computed from a reference dataset: the
     ``'all'`` dataset when there is no ground truth, the
     ``labels_tools.MALICIOUS`` dataset otherwise. Every dataset in
     ``self.plot_datasets`` is then counted against those same bin edges
     and the resulting bar plot is exported to ``self.output_directory``.
     """
     reference_key = (labels_tools.MALICIOUS
                      if self.has_ground_truth else 'all')
     # Only the edges are needed here; counts are recomputed per dataset.
     _, bin_edges = np.histogram(self.plot_datasets[reference_key].values,
                                 bins=10,
                                 density=False)
     x_labels = [
         str(lower) + ' - ' + str(upper)
         for lower, upper in zip(bin_edges[:-1], bin_edges[1:])
     ]

     barplot = BarPlot(x_labels)
     for _label, dataset in self.plot_datasets.items():
         counts, _ = np.histogram(dataset.values,
                                  bins=bin_edges,
                                  density=False)
         hist_dataset = PlotDataset(counts, dataset.label)
         hist_dataset.setColor(dataset.color)
         barplot.addDataset(hist_dataset)

     output_filename = path.join(self.output_directory, 'histogram.json')
     with open(output_filename, 'w') as f:
         barplot.exportJson(f)
Ejemplo n.º 6
0
def getFamiliesPerformance(experiment_id, train_test, label, threshold):
    """Return a JSON bar plot of per-family performance at a threshold.

    Reads ``tp_families_thresholds.csv`` (malicious label) or
    ``fp_families_thresholds.csv`` (benign label) from the ``families``
    sub-directory of ``train_test`` in the experiment output directory, and
    plots the row whose index is closest to ``threshold / 100``.

    Args:
        experiment_id: Identifier passed to ``updateCurrentExperiment``.
        train_test: Sub-directory name containing the ``families`` folder.
        label: ``labels_tools.MALICIOUS`` or ``labels_tools.BENIGN``.
        threshold: Threshold expressed as a percentage (converted with
            ``float`` and divided by 100).

    Returns:
        The result of ``jsonify`` applied to the bar plot.

    Raises:
        ValueError: If ``label`` is neither the malicious nor the benign
            label, or if the file has no candidate threshold row.
    """
    experiment = updateCurrentExperiment(experiment_id)
    directory = path.join(experiment.getOutputDirectory(), train_test,
                          'families')
    if label == labels_tools.MALICIOUS:
        prefix = 'tp_'
        tp_fp = 'Detection Rate'
    elif label == labels_tools.BENIGN:
        prefix = 'fp_'
        tp_fp = 'False Positive Rate'
    else:
        # Previously fell through and failed with an UnboundLocalError.
        raise ValueError('Unsupported label: %r' % (label,))
    filename = prefix + 'families_thresholds.csv'

    with open(path.join(directory, filename), 'r') as f:
        perf = pd.read_csv(f, header=0, index_col=0)

    # The last column and the last index entry are excluded from the
    # family labels and from the candidate thresholds respectively.
    families = list(perf.columns.values[:-1])
    target = float(threshold) / 100
    thresholds = list(perf.index[:-1])
    # Closest available threshold to the requested one (first on ties).
    threshold_value = min(thresholds, key=lambda value: abs(value - target))
    # NOTE(review): this row keeps the last column while ``families`` drops
    # it, so it holds one more value than there are labels -- confirm that
    # this is intended.
    values = list(perf.loc[threshold_value])

    barplot = BarPlot(families)
    barplot.addDataset(PlotDataset(values, tp_fp))
    return jsonify(barplot.toJson())
Ejemplo n.º 7
0
 def generateBinaryHistogram(self):
     """Write ``binary_histogram.json`` counting 0s and 1s per dataset.

     For every dataset in ``self.plot_datasets`` the number of values equal
     to 0 and equal to 1 is counted; each dataset becomes one series of the
     two-bar plot, which is exported to ``self.output_directory``.
     """
     bin_values = (0, 1)
     barplot = BarPlot(['0', '1'])
     for _label, dataset in self.plot_datasets.items():
         # Element-wise comparison, then count the matches for each bin.
         counts = [sum(dataset.values == value) for value in bin_values]
         binary_dataset = PlotDataset(counts, dataset.label)
         binary_dataset.setColor(dataset.color)
         barplot.addDataset(binary_dataset)

     target = path.join(self.output_directory, 'binary_histogram.json')
     with open(target, 'w') as f:
         barplot.exportJson(f)
Ejemplo n.º 8
0
 def display(self, directory):
     """Write ``predictions_barplot.json`` into ``directory``.

     The plot has ten bars labelled ``'0-10%'`` through ``'90-100%'``. With
     ground truth, one series per label (malicious, then benign) is added
     through ``self.displayLabel``; otherwise a single series holds the
     length of each entry of ``self.ranges``.

     Args:
         directory: Directory in which the JSON file is created.
     """
     decile_labels = [
         '{}-{}%'.format(lower, lower + 10) for lower in range(0, 100, 10)
     ]
     barplot = BarPlot(decile_labels)
     if self.has_ground_truth:
         for label in (labels_tools.MALICIOUS, labels_tools.BENIGN):
             self.displayLabel(barplot, label)
     else:
         sizes = [len(instances) for instances in self.ranges]
         all_dataset = PlotDataset(sizes, 'numInstances')
         all_dataset.setColor(colors_tools.getLabelColor('all'))
         barplot.addDataset(all_dataset)

     output_path = path.join(directory, 'predictions_barplot.json')
     with open(output_path, 'w') as f:
         barplot.exportJson(f)