Beispiel #1
0
 def to_barplot(self, directory):
     """Export the leading coefficients as a bar plot JSON file.

     The first ``NUM_COEFF_EXPORT`` rows of ``self.coef_summary`` are
     plotted: the bars are the ``'mean'`` column, the x labels are the
     ``user_id`` of the matching ``FeaturesAlchemy`` rows and the tooltips
     are their ``name``.

     :param directory: directory in which the JSON file is written.
     :return: the value returned by ``BarPlot.export_to_json``.
     """
     top_rows = self.coef_summary.head(n=NUM_COEFF_EXPORT)
     bar_values = top_rows['mean'].values

     # One database query per exported feature, in index order.
     tooltips, x_labels = [], []
     for raw_id in list(top_rows.index):
         match = (self.exp.session.query(FeaturesAlchemy)
                  .filter(FeaturesAlchemy.id == int(raw_id))
                  .one())
         tooltips.append(match.name)
         x_labels.append(match.user_id)

     plot = BarPlot(x_labels)
     bars = PlotDataset(bar_values, None)
     # Red when the feature importance is 'weight', blue otherwise.
     importance = self.classifier_conf.get_feature_importance()
     bars.set_color(red if importance == 'weight' else blue)
     plot.add_dataset(bars)

     # The class label, when there is one, is part of the file name.
     filename = ('coeff_barplot.json' if self.class_label is None
                 else 'coeff_barplot_%s.json' % self.class_label)
     return plot.export_to_json(path.join(directory, filename),
                                tooltip_data=tooltips)
Beispiel #2
0
def getTopWeightedFeatures(exp_id, instance_id, size):
    """Return a JSON bar plot of the most heavily weighted features.

    The features of the instance are scaled with the classifier's
    ``'scaler'`` step and multiplied by the ``coef_`` of its ``'model'``
    step. The ``size`` features with the largest absolute weighted value
    are kept, largest first.

    :param exp_id: identifier of the experiment holding the classifier.
    :param instance_id: identifier of the instance (converted with ``int``).
    :param size: number of features to keep (converted with ``int``).
    :return: ``jsonify`` of the bar plot; the tooltips read
        ``'<name> (<raw value>)'``.
    """
    instance_id = int(instance_id)
    size = int(size)
    classifier = get_classifier(exp_id)
    # get the features
    exp = update_curr_exp(exp_id)
    f_names, f_values = FeaturesFromExp.get_instance(exp, instance_id)
    # scale the features
    scaled_values = classifier.named_steps['scaler'].transform(
        np.reshape(f_values, (1, -1)))
    weighted_values = np.multiply(scaled_values,
                                  classifier.named_steps['model'].coef_)
    # Only the first row of the weighted values is used.
    features = list(zip(f_names, f_values, weighted_values[0]))
    # Ascending sort followed by a reversed slice of the tail: the `size`
    # largest absolute weights, largest first.
    features.sort(key=lambda tup: abs(tup[2]))
    features = features[:-size - 1:-1]
    if features:
        f_names, f_values, f_weighted = zip(*features)
    else:
        # zip(*[]) yields nothing, so unpacking it into three names would
        # raise ValueError; return an empty plot instead.
        f_names, f_values, f_weighted = (), (), ()
    labels = [str(name) for name in f_names]
    tooltips = [
        '%s (%.2f)' % (name, value) for name, value in zip(f_names, f_values)
    ]
    barplot = BarPlot(labels)
    dataset = PlotDataset(f_weighted, None)
    dataset.set_color(red)
    barplot.add_dataset(dataset)
    return jsonify(barplot.to_json(tooltip_data=tooltips))
Beispiel #3
0
 def display(self, directory):
     """Write the bar plot to ``proba_barplot.json`` in ``directory``.

     With ground truth, one dataset is displayed for ``MALICIOUS`` and one
     for ``BENIGN``; without it, a single ``'all'`` dataset is displayed.
     """
     plot = BarPlot(self.labels)
     if self.has_ground_truth:
         for label in (MALICIOUS, BENIGN):
             self.display_label(plot, label)
     else:
         self.display_label(plot, 'all')
     out_path = path.join(directory, 'proba_barplot.json')
     plot.export_to_json(out_path)
Beispiel #4
0
 def _gen_binary_histogram(self):
     """Build ``self.barplot`` with the counts of 0 and 1 per dataset.

     Datasets without any value are left out of the plot.
     """
     self.barplot = BarPlot(['0', '1'])
     for name, source in self.plot_datasets.items():
         if len(source.values) == 0:
             continue
         # Number of values equal to 0, then number of values equal to 1.
         counts = [sum(source.values == bit) for bit in (0, 1)]
         bars = PlotDataset(counts, name)
         bars.set_color(source.color)
         self.barplot.add_dataset(bars)
Beispiel #5
0
 def display(self, directory):
     """Write the bar plot to ``pred_barplot.json`` in ``directory``.

     The x labels are the keys of ``self.predictions``, passed through
     ``label_bool_to_str`` unless ``self.multiclass`` is set. With ground
     truth, ``_display`` is called once with ``error=False`` and once
     with ``error=True``.
     """
     keys = list(self.predictions.keys())
     x_labels = keys if self.multiclass else [
         label_bool_to_str(key) for key in keys
     ]
     plot = BarPlot(x_labels)
     if self.has_ground_truth:
         for is_error in (False, True):
             self._display(plot, keys, error=is_error)
     else:
         self._display(plot, keys)
     out_path = path.join(directory, 'pred_barplot.json')
     plot.export_to_json(out_path)
Beispiel #6
0
def getClusterStats(exp_id):
    """Return a JSON bar plot of the number of instances per cluster.

    The clustering is loaded from the output directory of the experiment
    ``exp_id``; the bars are labelled with the cluster labels.
    """
    exp = update_curr_exp(exp_id)
    clustering = ClustersExp.from_json(exp.output_dir())
    sizes, names = [], []
    for idx in range(clustering.num_clusters):
        sizes.append(len(clustering.clusters[idx].instances_ids))
        names.append(clustering.clusters[idx].label)
    plot = BarPlot(names)
    plot.add_dataset(PlotDataset(sizes, 'Num. Instances'))
    return jsonify(plot.to_json())
Beispiel #7
0
def getFamiliesBarplot(annotations_id, iteration, label):
    """Return a JSON bar plot of the number of instances per family.

    :param annotations_id: identifier passed to ``get_families_counts``.
    :param iteration: the string ``'None'`` for no limit, otherwise a
        value converted with ``int`` and used as ``iter_max``.
    :param label: label whose families are counted; it also selects the
        colour of the bars.
    """
    if iteration == 'None':
        iteration = None
    else:
        iteration = int(iteration)
    counts_by_family = annotations_db_tools.get_families_counts(
        session, annotations_id, iter_max=iteration, label=label)
    names = list(counts_by_family.keys())
    frame = pd.DataFrame({
        'families': names,
        'counts': [counts_by_family[name] for name in names],
    })
    # Sorted in place by family name, ascending.
    sort_data_frame(frame, 'families', ascending=True, inplace=True)
    plot = BarPlot(frame['families'].values)
    bars = PlotDataset(frame['counts'].values, 'Num. Instances')
    bars.set_color(get_label_color(label))
    plot.add_dataset(bars)
    return jsonify(plot.to_json())
Beispiel #8
0
 def _gen_histogram(self):
     """Build ``self.barplot`` as a 10-bin histogram of each dataset.

     The bin edges are computed once on ``self.all_values`` and shared by
     all the datasets. Datasets without any value are left out.
     """
     # 10 equal-width bins computed on all the data
     edges = np.histogram(self.all_values, bins=10, density=False)[1]
     x_labels = ['%.2f - %.2f' % (low, high)
                 for low, high in zip(edges[:-1], edges[1:])]
     self.barplot = BarPlot(x_labels)
     for name, source in self.plot_datasets.items():
         if len(source.values) == 0:
             continue
         counts = np.histogram(source.values, bins=edges,
                               density=False)[0]
         bars = PlotDataset(counts, name)
         bars.set_color(source.color)
         self.barplot.add_dataset(bars)
Beispiel #9
0
 def display(self, directory):
     """Write the bar plot to ``predictions_barplot.json`` in ``directory``.

     The x axis has ten labels, from ``'0-10%'`` to ``'90-100%'``. Without
     ground truth, a single dataset holds the length of each element of
     ``self.ranges``; otherwise one dataset is displayed for
     ``MALICIOUS`` and one for ``BENIGN``.
     """
     x_labels = ['%d-%d%%' % (low, low + 10) for low in range(0, 100, 10)]
     plot = BarPlot(x_labels)
     if self.has_ground_truth:
         for label in (MALICIOUS, BENIGN):
             self.display_label(plot, label)
     else:
         counts = PlotDataset([len(r) for r in self.ranges],
                              'num_instances')
         counts.set_color(get_label_color('all'))
         plot.add_dataset(counts)
     out_path = path.join(directory, 'predictions_barplot.json')
     plot.export_to_json(out_path)
Beispiel #10
0
class FeaturePlots(object):
    """Plots describing one feature, split by label.

    ``compute`` builds the plots and ``export`` writes them into a
    sub-directory named after the feature id. Binary features get a 0/1
    bar plot; numeric features get a box plot, a histogram and a density
    plot.
    """

    def __init__(self, instances, feature_index):
        """Read the feature metadata and build one dataset per label."""
        self.feature_index = feature_index
        info = instances.features.info
        self.feature_type = info.types[feature_index]
        self.feature_name = info.names[feature_index]
        self.feature_id = info.ids[feature_index]
        self.all_values = instances.features.get_values_from_index(
            feature_index)
        self._gen_plot_datasets(instances)

    def compute(self):
        """Build the plots matching the type of the feature."""
        if self.feature_type == FeatureType.binary:
            self._gen_binary_histogram()
            return
        if self.feature_type == FeatureType.numeric:
            self._gen_bloxplot()
            # Workaround for numpy issue #8627 (null variance): when the
            # histogram cannot be computed, the bar plot is set to None
            # and export() then writes no plot at all, since the scoring
            # metrics already contain all the information.
            try:
                self._gen_histogram()
            except Exception:
                self.barplot = None
            self._gen_density()

    def export(self, output_dir):
        """Write the plots into ``<output_dir>/<feature_id>``.

        The directory is always created; nothing is written into it when
        ``self.barplot`` is None.
        """
        feature_dir = path.join(output_dir, str(self.feature_id))
        os.makedirs(feature_dir)
        if self.barplot is None:
            return
        if self.feature_type == FeatureType.binary:
            target = path.join(feature_dir, 'binary_histogram.json')
            self.barplot.export_to_json(target)
        elif self.feature_type == FeatureType.numeric:
            self.boxplot.display(path.join(feature_dir, 'boxplot.png'))
            self.barplot.export_to_json(
                path.join(feature_dir, 'histogram.json'))
            self.density.display(path.join(feature_dir, 'density.png'))

    def _gen_plot_datasets(self, instances):
        """Fill ``self.plot_datasets`` with one dataset per label."""
        self.plot_datasets = {}
        for label in (MALICIOUS, BENIGN, 'unlabeled'):
            self._gen_label_plot_dataset(instances, label)

    def _gen_label_plot_dataset(self, instances, label):
        """Store the values of the feature for the instances of ``label``."""
        if label == 'unlabeled':
            subset = instances.get_unlabeled_instances()
        else:
            subset = instances.get_annotated_instances(label=label)
        feature_values = subset.features.get_values_from_index(
            self.feature_index)
        plot_dataset = PlotDataset(feature_values, label)
        plot_dataset.set_color(get_label_color(label))
        self.plot_datasets[label] = plot_dataset

    def _gen_bloxplot(self):
        """Build ``self.boxplot`` from the non-empty datasets."""
        self.boxplot = BoxPlot(title='Feature %s' % self.feature_name)
        for source in self.plot_datasets.values():
            if len(source.values) > 0:
                self.boxplot.add_dataset(source)

    def _gen_histogram(self):
        """Build ``self.barplot`` as a 10-bin histogram of each dataset."""
        # 10 equal-width bins computed on all the data
        edges = np.histogram(self.all_values, bins=10, density=False)[1]
        x_labels = ['%.2f - %.2f' % (low, high)
                    for low, high in zip(edges[:-1], edges[1:])]
        self.barplot = BarPlot(x_labels)
        for name, source in self.plot_datasets.items():
            if len(source.values) == 0:
                continue
            counts = np.histogram(source.values, bins=edges,
                                  density=False)[0]
            bars = PlotDataset(counts, name)
            bars.set_color(source.color)
            self.barplot.add_dataset(bars)

    def _gen_binary_histogram(self):
        """Build ``self.barplot`` with the counts of 0 and 1 per dataset."""
        self.barplot = BarPlot(['0', '1'])
        for name, source in self.plot_datasets.items():
            if len(source.values) == 0:
                continue
            counts = [sum(source.values == bit) for bit in (0, 1)]
            bars = PlotDataset(counts, name)
            bars.set_color(source.color)
            self.barplot.add_dataset(bars)

    def _gen_density(self):
        """Build ``self.density`` from the non-empty datasets."""
        self.density = Density(title='Feature %s' % self.feature_name)
        for source in self.plot_datasets.values():
            if len(source.values) > 0:
                self.density.add_dataset(source)
Beispiel #11
0
class FeaturePlots(object):
    """Plots describing one feature, split by label or by family.

    ``compute`` builds the plots and ``export`` writes them into a
    sub-directory named after the feature id. Binary features get a 0/1
    bar plot; numeric features get a box plot, a histogram and, when
    ``with_density`` is set, a density plot.
    """

    def __init__(self, instances, multiclass, feature_index, logger,
                 with_density=True):
        """Read the feature metadata and build the datasets to plot."""
        self.feature_index = feature_index
        self.logger = logger
        self.with_density = with_density
        info = instances.features.info
        self.feature_type = info.types[feature_index]
        self.feature_name = info.names[feature_index]
        self.feature_id = info.ids[feature_index]
        self._gen_plot_datasets(instances, multiclass)

    def compute(self):
        """Build the plots matching the type of the feature."""
        if self.feature_type == FeatureType.binary:
            self._gen_binary_histogram()
            return
        if self.feature_type == FeatureType.numeric:
            self._gen_bloxplot()
            self._gen_histogram()
            if self.with_density:
                self._gen_density()

    def export(self, output_dir):
        """Write the plots into ``<output_dir>/<feature_id>``."""
        feature_dir = path.join(output_dir, str(self.feature_id))
        os.makedirs(feature_dir)
        if self.feature_type == FeatureType.binary:
            target = path.join(feature_dir, 'binary_histogram.json')
            self.barplot.export_to_json(target)
        elif self.feature_type == FeatureType.numeric:
            self.boxplot.display(path.join(feature_dir, 'boxplot.png'))
            self.barplot.export_to_json(
                path.join(feature_dir, 'histogram.json'))
            if self.with_density:
                self.density.display(path.join(feature_dir, 'density.png'))

    def _gen_plot_datasets(self, instances, multiclass):
        """Fill ``self.plot_datasets``.

        One dataset per family (each with its own colour) when
        ``multiclass`` is set, otherwise one per label: ``MALICIOUS``,
        ``BENIGN`` and ``'unlabeled'``.
        """
        self.plot_datasets = {}
        if multiclass:
            families = list(instances.annotations.get_families_values())
            for family, color in zip(families, colors(len(families))):
                self._gen_label_plot_dataset(instances, family=family,
                                             color=color)
        else:
            for label in (MALICIOUS, BENIGN, 'unlabeled'):
                self._gen_label_plot_dataset(instances, label=label)

    def _gen_label_plot_dataset(self, instances, label=None, family=None,
                                color=None):
        """Store the values of the feature for one label or one family.

        ``label`` takes precedence over ``family``. When ``color`` is
        None, the colour comes from ``get_label_color``.
        """
        if label is None:
            subset = instances.get_annotated_instances(family=family)
        elif label == 'unlabeled':
            subset = instances.get_unlabeled_instances()
        else:
            subset = instances.get_annotated_instances(label=label)
        feature_values = subset.features.get_values_from_index(
            self.feature_index)
        # Sparse values are converted to a dense array.
        if isinstance(feature_values, spmatrix):
            feature_values = feature_values.toarray()
        name = family if label is None else label
        if color is None:
            color = get_label_color(name)
        plot_dataset = PlotDataset(feature_values, name)
        plot_dataset.set_color(color)
        self.plot_datasets[name] = plot_dataset

    def _gen_bloxplot(self):
        """Build ``self.boxplot`` from the non-empty datasets."""
        self.boxplot = BoxPlot(title='Feature %s' % self.feature_name)
        for source in self.plot_datasets.values():
            if source.values.shape[0] > 0:
                self.boxplot.add_dataset(source)

    def _gen_histogram(self):
        """Build ``self.barplot`` as a ``Histogram`` of the datasets."""
        histogram = Histogram(self.plot_datasets, self.logger)
        self.barplot = histogram

    def _gen_binary_histogram(self):
        """Build ``self.barplot`` with the counts of 0 and 1 per dataset."""
        self.barplot = BarPlot(['0', '1'])
        for name, source in self.plot_datasets.items():
            if not source.values.shape[0] > 0:
                continue
            counts = np.array([sum(source.values == bit) for bit in (0, 1)])
            bars = PlotDataset(counts, name)
            bars.set_color(source.color)
            self.barplot.add_dataset(bars)

    def _gen_density(self):
        """Build ``self.density`` from the non-empty datasets."""
        self.density = Density(title='Feature %s' % self.feature_name)
        for source in self.plot_datasets.values():
            if source.values.shape[0] > 0:
                self.density.add_dataset(source)
Beispiel #12
0
 def _gen_histogram(self):
     """Build a ``Histogram`` of the datasets and keep it in ``self.barplot``."""
     histogram = Histogram(self.plot_datasets, self.logger)
     self.barplot = histogram