예제 #1
0
def getTopModelFeatures(exp_id, size, train_test, fold_id):
    """Return the first model coefficients of an experiment as a JSON barplot.

    The first ``size`` rows of ``model_coefficients.csv`` are read from the
    experiment output directory (below ``fold_id`` when one is given, then
    below ``train_test``). Each row index is looked up as a feature id in the
    database to retrieve the feature name (used as tooltip) and its
    ``user_id`` (used as bar label). The bars show the ``mean`` column.

    Args:
        exp_id: experiment identifier handed to ``updateCurrentExperiment``.
        size: number of CSV rows to read; converted with ``int``.
        train_test: name of the sub-directory holding the coefficients file.
        fold_id: name of the fold sub-directory; the strings ``'None'`` and
            ``'all'`` mean that no fold sub-directory is used.

    Returns:
        The ``jsonify`` response built from the barplot JSON.
    """
    exp = updateCurrentExperiment(exp_id)
    directory = exp.output_dir()
    if fold_id not in ('None', 'all'):
        directory = path.join(directory, fold_id)
    directory = path.join(directory, train_test)
    filename = path.join(directory, 'model_coefficients.csv')
    # Only the CSV parsing needs the file handle: close it before running the
    # database queries and building the response.
    with open(filename, 'r') as f:
        coefficients_df = pd.read_csv(f,
                                      header=0,
                                      index_col=0,
                                      nrows=int(size))
    coefficients = list(coefficients_df['mean'])
    tooltip_data = []
    user_ids = []
    # The index of the data frame holds the feature ids.
    for feature_id in coefficients_df.index:
        query = session.query(FeaturesAlchemy)
        query = query.filter(FeaturesAlchemy.id == int(feature_id))
        # NOTE(review): ``one()`` presumably raises when the id matches zero
        # or several rows -- confirm against the SQLAlchemy version in use.
        row = query.one()
        tooltip_data.append(row.name)
        user_ids.append(row.user_id)
    barplot = BarPlot(user_ids)
    dataset = PlotDataset(coefficients, None)
    # Bars are drawn in red only when the classifier reports 'weight' as its
    # feature importance score.
    score = exp.exp_conf.core_conf.classifier_conf.featureImportance()
    if score == 'weight':
        dataset.set_color(colors_tools.red)
    barplot.add_dataset(dataset)
    return jsonify(barplot.to_json(tooltip_data=tooltip_data))
예제 #2
0
 def _gen_binary_histogram(self):
     """Build ``self.barplot`` with the number of 0 and 1 values per dataset.

     One bar dataset is added for each entry of ``self.plot_datasets`` that
     holds at least one value; it reuses the label and the color of the
     source dataset.
     """
     self.barplot = BarPlot(['0', '1'])
     for name, source in self.plot_datasets.items():
         if len(source.values) == 0:
             # Nothing to count for this label.
             continue
         # Count the zeros first, then the ones (same order as the x labels).
         counts = [sum(source.values == bit) for bit in (0, 1)]
         bars = PlotDataset(counts, name)
         bars.set_color(source.color)
         self.barplot.add_dataset(bars)
예제 #3
0
 def _gen_histogram(self):
     """Build ``self.barplot`` as a 10-bin histogram of every dataset.

     The bin edges are computed once from ``self.all_values`` so that all
     the datasets of ``self.plot_datasets`` share the same bins. Datasets
     without any value are skipped.
     """
     # Only the edges of the global histogram are needed, not its counts.
     edges = np.histogram(self.all_values, bins=10, density=False)[1]
     x_labels = ['%.2f - %.2f' % (low, high)
                 for low, high in zip(edges[:-1], edges[1:])]
     self.barplot = BarPlot(x_labels)
     for name, source in self.plot_datasets.items():
         if len(source.values) == 0:
             continue
         counts = np.histogram(source.values, bins=edges,
                               density=False)[0]
         bars = PlotDataset(counts, name)
         bars.set_color(source.color)
         self.barplot.add_dataset(bars)
예제 #4
0
def getFamiliesBarplot(annotations_id, iteration, label):
    """Return a JSON barplot of the number of instances per family.

    Args:
        annotations_id: annotations identifier forwarded to
            ``annotations_db_tools.getFamiliesCounts``.
        iteration: maximum iteration as a string; ``'None'`` means no limit,
            any other value is converted with ``int``.
        label: label forwarded to the counts query and used to pick the
            color of the bars.

    Returns:
        The ``jsonify`` response built from the barplot JSON, with the
        families sorted in ascending order.
    """
    iter_max = None if iteration == 'None' else int(iteration)
    family_counts = annotations_db_tools.getFamiliesCounts(
        session, annotations_id, iter_max=iter_max, label=label)
    families = list(family_counts.keys())
    counts = [family_counts[family] for family in families]
    df = pd.DataFrame({'families': families, 'counts': counts})
    # Sorted in place so that the bars appear in family order.
    matrix_tools.sort_data_frame(df, 'families', ascending=True, inplace=True)
    barplot = BarPlot(list(df['families']))
    bars = PlotDataset(list(df['counts']), 'Num. Instances')
    bars.set_color(colors_tools.get_label_color(label))
    barplot.add_dataset(bars)
    return jsonify(barplot.to_json())
예제 #5
0
def getClusterStats(experiment_id):
    """Return a JSON barplot of the number of instances in each cluster.

    The clustering is loaded from the output directory of the experiment.
    Every cluster gets a bar labelled with the cluster label, including the
    clusters that contain no instance.

    Args:
        experiment_id: identifier handed to ``updateCurrentExperiment``.

    Returns:
        The ``jsonify`` response built from the barplot JSON.
    """
    experiment = updateCurrentExperiment(experiment_id)
    clustering = ClusteringExp.from_json(experiment.output_dir())
    clusters = [clustering.clusters[index]
                for index in range(clustering.num_clusters)]
    sizes = [len(cluster.instances_ids) for cluster in clusters]
    labels = [cluster.label for cluster in clusters]
    barplot = BarPlot(labels)
    barplot.add_dataset(PlotDataset(sizes, 'Num. Instances'))
    return jsonify(barplot.to_json())
예제 #6
0
def getTopWeightedFeatures(exp_id, inst_exp_id, instance_id, size, fold_id):
    """Return the most heavily weighted features of an instance as a barplot.

    The feature values of the instance are scaled with the ``'scaler'`` step
    of the stored pipeline and multiplied by the coefficients of its
    ``'model'`` step. The ``size`` features with the largest absolute
    weighted value are kept, largest first.

    Args:
        exp_id: experiment whose stored model is loaded.
        inst_exp_id: experiment from which the instance features are read.
        instance_id: instance identifier; converted with ``int``.
        size: number of features to keep; converted with ``int``.
        fold_id: fold sub-directory of the model; ``'None'`` means the model
            sits directly in the experiment directory.

    Returns:
        ``None`` when ``fold_id`` is ``'all'``, otherwise the ``jsonify``
        response built from the barplot JSON. The tooltips show the feature
        name followed by its unscaled value.
    """
    if fold_id == 'all':
        return None
    instance_id = int(instance_id)
    exp = updateCurrentExperiment(exp_id)
    inst_exp = updateCurrentExperiment(inst_exp_id)
    # Names and raw values of the features of the instance.
    names, raw_values = FeaturesFromExp(inst_exp).get_instance(instance_id)
    raw_values = [float(raw) for raw in raw_values]
    # Load the pipeline (scaler + model) of the requested fold.
    model_dir = exp.output_dir()
    if fold_id != 'None':
        model_dir = path.join(model_dir, fold_id)
    pipeline = joblib.load(path.join(model_dir, 'model', 'model.out'))
    steps = pipeline.named_steps
    # The scaler expects a 2D input: a single row holding all the features.
    scaled = steps['scaler'].transform(np.reshape(raw_values, (1, -1)))
    weighted = np.multiply(scaled, steps['model'].coef_)
    # Ascending sort on the absolute weight, then the tail is read backwards
    # to get the ``size`` largest entries in decreasing order.
    ranked = sorted(zip(names, raw_values, weighted[0]),
                    key=lambda entry: abs(entry[2]))
    top = ranked[:-int(size) - 1:-1]

    labels = [str(name) for name, _, _ in top]
    tooltips = ['%s (%.2f)' % (name, raw) for name, raw, _ in top]
    weights = [weight for _, _, weight in top]
    barplot = BarPlot(labels)
    bars = PlotDataset(weights, None)
    bars.set_color(colors_tools.red)
    barplot.add_dataset(bars)
    return jsonify(barplot.to_json(tooltip_data=tooltips))
예제 #7
0
 def display(self, directory):
     """Export the predictions barplot to ``predictions_barplot.json``.

     The x axis has ten fixed labels, from '0-10%' to '90-100%'. With
     ground truth, one dataset per label (malicious, then benign) is added
     through ``self.displayLabel``. Without it, a single dataset holding the
     size of each entry of ``self.ranges`` is added.

     Args:
         directory: directory in which the JSON file is written.
     """
     labels = [
         '0-10%', '10-20%', '20-30%', '30-40%', '40-50%', '50-60%',
         '60-70%', '70-80%', '80-90%', '90-100%'
     ]
     barplot = BarPlot(labels)
     if self.has_ground_truth:
         for label in (labels_tools.MALICIOUS, labels_tools.BENIGN):
             self.displayLabel(barplot, label)
     else:
         sizes = [len(instances) for instances in self.ranges]
         bars = PlotDataset(sizes, 'numInstances')
         bars.set_color(colors_tools.get_label_color('all'))
         barplot.add_dataset(bars)
     with open(path.join(directory, 'predictions_barplot.json'), 'w') as f:
         barplot.export_json(f)
예제 #8
0
class FeaturePlots(object):
    """Plots describing the values of one feature, split by label.

    Binary features get a histogram of their 0/1 values. Numeric features
    get a boxplot, a 10-bin histogram and a density plot. The plots are
    built by ``compute`` and written to disk by ``export``.
    """

    def __init__(self, instances, feature_index):
        """Read the metadata and the values of the feature.

        Args:
            instances: object exposing ``features`` and the label selection
                methods used by ``_gen_label_plot_dataset``.
            feature_index: position of the feature in ``instances.features``.
        """
        features = instances.features
        self.feature_index = feature_index
        self.feature_type = features.types[feature_index]
        self.feature_name = features.names[feature_index]
        self.feature_id = features.ids[feature_index]
        self.all_values = features.getValuesFromIndex(feature_index)
        self._gen_plot_datasets(instances)

    def compute(self):
        """Generate the plots matching the type of the feature."""
        if self.feature_type == FeatureType.binary:
            self._gen_binary_histogram()
            return
        if self.feature_type == FeatureType.numeric:
            self._gen_bloxplot()
            self._gen_histogram()
            self._gen_density()

    def export(self, output_dir):
        """Write the plots below ``output_dir/<feature_id>``.

        The directory is created through ``dir_tools.createDirectory``
        whatever the feature type. ``compute`` must have been called
        beforehand since the plot attributes are read here.
        """
        feature_dir = path.join(output_dir, str(self.feature_id))
        dir_tools.createDirectory(feature_dir)
        if self.feature_type == FeatureType.binary:
            json_path = path.join(feature_dir, 'binary_histogram.json')
            with open(json_path, 'w') as f:
                self.barplot.export_json(f)
            return
        if self.feature_type == FeatureType.numeric:
            self.boxplot.display(path.join(feature_dir, 'boxplot.png'))
            json_path = path.join(feature_dir, 'histogram.json')
            with open(json_path, 'w') as f:
                self.barplot.export_json(f)
            self.density.display(path.join(feature_dir, 'density.png'))

    def _gen_plot_datasets(self, instances):
        """Create one dataset per label: malicious, benign and unlabeled."""
        self.plot_datasets = {}
        for label in (labels_tools.MALICIOUS, labels_tools.BENIGN,
                      'unlabeled'):
            self._gen_label_plot_dataset(instances, label)

    def _gen_label_plot_dataset(self, instances, label):
        """Store in ``self.plot_datasets`` the feature values of ``label``."""
        if label == 'unlabeled':
            selection = instances.getUnlabeledInstances()
        else:
            selection = instances.getAnnotatedInstances(label=label)
        values = selection.features.getValuesFromIndex(self.feature_index)
        dataset = PlotDataset(values, label)
        dataset.set_color(colors_tools.get_label_color(label))
        self.plot_datasets[label] = dataset

    def _non_empty_datasets(self):
        """Yield the ``(label, dataset)`` pairs whose dataset holds values."""
        for label, dataset in self.plot_datasets.items():
            if len(dataset.values) > 0:
                yield label, dataset

    def _gen_bloxplot(self):
        """Build ``self.boxplot`` from the non-empty datasets."""
        self.boxplot = BoxPlot(title='Feature %s' % self.feature_name)
        for _, dataset in self._non_empty_datasets():
            self.boxplot.add_dataset(dataset)

    def _gen_histogram(self):
        """Build ``self.barplot`` as a histogram sharing 10 global bins."""
        # The edges come from all the values so that every label uses the
        # same equal-width bins.
        edges = np.histogram(self.all_values, bins=10, density=False)[1]
        x_labels = ['%.2f - %.2f' % (low, high)
                    for low, high in zip(edges[:-1], edges[1:])]
        self.barplot = BarPlot(x_labels)
        for label, dataset in self._non_empty_datasets():
            counts = np.histogram(dataset.values, bins=edges,
                                  density=False)[0]
            bars = PlotDataset(counts, label)
            bars.set_color(dataset.color)
            self.barplot.add_dataset(bars)

    def _gen_binary_histogram(self):
        """Build ``self.barplot`` with the number of 0 and 1 values."""
        self.barplot = BarPlot(['0', '1'])
        for label, dataset in self._non_empty_datasets():
            # Zeros first, then ones, matching the order of the x labels.
            counts = [sum(dataset.values == bit) for bit in (0, 1)]
            bars = PlotDataset(counts, label)
            bars.set_color(dataset.color)
            self.barplot.add_dataset(bars)

    def _gen_density(self):
        """Build ``self.density`` from the non-empty datasets."""
        self.density = Density(title='Feature %s' % self.feature_name)
        for _, dataset in self._non_empty_datasets():
            self.density.add_dataset(dataset)