Example 1
    def fit(self, name, n_pca=None):
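        """Load the feature CSV for this configuration, cross-validate the
        configured classifier with a ShuffleSplit, and return the
        cross_validate results with the list of used features attached."""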
        filename = compose_filename(self.config['input_data_folder'],
                                    self.config['filter_latent'],
                                    self.config['standardization'], n_pca,
                                    'features', name, 'csv')

        all_data = pd.read_csv(filename, index_col=0)
        features = [
            col_name for col_name in all_data.columns
            if col_name not in self.config['skip-features']
        ]
        x = all_data[features]
        y = all_data['label']
        k_fold = ShuffleSplit(n_splits=self.config['k-fold-splits'],
                              train_size=self.config['k-fold-train-size'])

        print(f"Classifying with {self.ml_method} and {self.classifier}")

        # The 'RR' method uses the scorer set created with create_scorers(True).
        scorers = create_scorers(True) if self.ml_method == 'RR' else create_scorers()
        cv_results = cross_validate(self.__create_classifier(), x.values,
                                    y.values, scoring=scorers, cv=k_fold)

        cv_results['features'] = features

        return cv_results
    def plot_average_of_all_pr(self, av_kind):
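        """Plot the `av_kind` ('micro' or 'macro') averaged PR curve of every
        ml_method/classifier combination on one figure and save it."""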
        title = compose_configuration(f'{av_kind} PR curves',
                                      self.config['filter_latent'],
                                      self.config['standardization'], 'no-pca',
                                      self.name)
        print(f"Plotting {title}")

        plt.figure(figsize=(6, 4.5))

        for ml_method, classifiers in self.results.items():
            for classifier, result in classifiers.items():
                if len(result['fpr']['early']) > 0:
                    if av_kind == 'macro':
                        self.plot_macro_pr(
                            ml_method, classifier,
                            f'{self.MULTI_CLASS_NAME[ml_method]} {self.CLASSIFIER_NAME[classifier]} {av_kind}-average',
                            '-', None, 1.5)
                    else:
                        self.plot_pr_class(
                            ml_method, classifier, av_kind,
                            f'{self.MULTI_CLASS_NAME[ml_method]} {self.CLASSIFIER_NAME[classifier]} {av_kind}-average',
                            '-', None, 1.5)
        plt.legend()
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.tight_layout()
        filename = compose_filename(
            self.config['output_pr_curves_plot_directory'],
            self.config['filter_latent'], self.config['standardization'],
            'no-pca', f'PR_{av_kind}_average', self.name, '')
        plt.savefig(filename, dpi=150)
        plt.close()
Example 3
    def grid_search(self, name, grid, splits, n_pca=None):
        """Run an exhaustive grid search for this classifier over `grid`,
        scored with adjusted balanced accuracy on a ShuffleSplit with
        `splits` splits, and print the best parameters, the best score and
        each candidate's mean test score and fit time."""
        filename = compose_filename(self.config['input_data_folder'],
                                    self.config['filter_latent'],
                                    self.config['standardization'], n_pca,
                                    'features', name, 'csv')
        all_data = pd.read_csv(filename, index_col=0)
        x = all_data[[
            col_name for col_name in all_data.columns
            if col_name not in self.config['skip-features']
        ]]
        y = all_data['label']
        classifier = self.__create_classifier(grid_search=True)
        gs_cv = GridSearchCV(classifier,
                             grid,
                             scoring=make_scorer(balanced_accuracy_score,
                                                 adjusted=True),
                             cv=ShuffleSplit(n_splits=splits, train_size=0.67))
        gs_cv.fit(x.values, y.values)
        print(gs_cv.best_params_)
        print(gs_cv.best_score_)
        print()
        for i, param in enumerate(gs_cv.cv_results_['params']):
            print(param, gs_cv.cv_results_['mean_test_score'][i], 'time:',
                  gs_cv.cv_results_['mean_fit_time'][i])
            print()
    def plot_pr(self, ml_method, classifier, n_pca):
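        """Plot the micro-average, macro-average and per-class PR curves for
        one ml_method/classifier combination and save the figure."""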
        title = compose_configuration(f'PR curves of {ml_method} {classifier}',
                                      self.config['filter_latent'],
                                      self.config['standardization'], n_pca,
                                      self.name)
        print(f"Plotting {title}")
        plt.figure(figsize=(6.5, 4.3333))

        self.plot_pr_class(ml_method, classifier, 'micro', 'micro-average',
                           ':', color_phase_dict['micro'])
        self.plot_macro_pr(ml_method, classifier, 'macro-average', ':',
                           color_phase_dict['macro'])

        for class_ in ['immediate-early', 'early', 'late']:
            self.plot_pr_class(ml_method, classifier, class_, class_, '-',
                               color_phase_dict[class_])

        plt.legend()
        # plt.title(title)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.tight_layout()
        filename = compose_filename(
            self.config['output_pr_curves_plot_directory'],
            self.config['filter_latent'], self.config['standardization'],
            n_pca,
            f'PR_{self.MULTI_CLASS_NAME[ml_method]}_{self.CLASSIFIER_NAME[classifier]}',
            self.name, '')
        plt.savefig(filename, dpi=150)
        plt.close()
Example 5
    def plot_feature_importance(self, name):
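        """Load the fitted PCA object and the original (non-skipped) feature
        names, then save two PCA feature-importance bar charts: one plain and
        one relative to the explained variance."""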
        pca_file = f"{self.config['input_pca_folder']}None-pca.p"
        with open(pca_file, 'rb') as pca_f:
            pca = pickle.load(pca_f)  # type: PCA

        features_file = compose_filename(self.config["input_data_folder"],
                                         self.config['filter_latent'],
                                         self.config['standardization'],
                                         'no-pca', 'features', name, 'csv')

        original_features = pd.read_csv(features_file, index_col=0).columns
        original_features = [
            col for col in original_features
            if col not in self.config['skip-features']
        ]

        _plot_pca_fi_barchart(False, original_features, pca)
        plt.savefig(
            f"{self.config['output_pca_variance_plot_directory']}PCA_Features_Importance_{name}"
        )
        plt.close()

        _plot_pca_fi_barchart(True, original_features, pca)
        plt.savefig(
            f"{self.config['output_pca_variance_plot_directory']}PCA_Features_Importance_Relative_to_variance_{name}"
        )
        plt.close()
    def plot_average_of_all_roc(self, av_kind):
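        """Plot the `av_kind` averaged ROC curve of every 'ML' classifier on
        one figure, together with the chance diagonal, and save it."""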
        title = compose_configuration(f'{av_kind} average ROC curves',
                                      self.config['filter_latent'],
                                      self.config['standardization'], 'no-pca',
                                      self.name)
        print(f"Plotting {title}")

        plt.figure(figsize=(6, 4.5))

        for ml_method, classifiers in self.results.items():
            if ml_method == 'ML':
                for classifier, result in classifiers.items():
                    if len(result['fpr']['early']) > 0:
                        self.plot_roc_class(
                            ml_method, classifier, av_kind,
                            f'{self.CLASSIFIER_NAME[classifier]} {av_kind}-average',
                            '-', None, 2)

        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend()
        plt.tight_layout()
        filename = compose_filename(
            self.config['output_roc_curves_plot_directory'],
            self.config['filter_latent'], self.config['standardization'],
            'no-pca', f'ROC_{av_kind}_average', self.name, '')
        plt.savefig(filename, dpi=150)
        plt.close()
    def plot_correlation_matrix(self):
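        """Compute the Spearman correlation matrix of the original
        (non-skipped) features and save it as a matshow figure with a
        colorbar."""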
        feature_file = compose_filename(self.config['input_data_folder'], True,
                                        False, 'no-pca', 'features',
                                        'original', 'csv')
        df = pd.read_csv(feature_file, index_col=0)
        columns = [
            col for col in df.columns
            if col not in self.config['skip-features']
        ]
        df = df[columns]

        corr = df.corr(method='spearman')
        fig = plt.figure(figsize=(12, 10))
        ax = plt.gca()
        im = ax.matshow(corr)
        divider = make_axes_locatable(ax)
        cax = divider.append_axes("right", size="5%", pad=0.1)
        cb = fig.colorbar(im, cax=cax)

        ax.set_xticks(range(df.shape[1]))
        ax.set_xticklabels(df.columns,
                           fontsize=12,
                           rotation=-45,
                           rotation_mode='anchor',
                           ha='right')
        ax.set_yticks(range(df.shape[1]))
        ax.set_yticklabels(df.columns, fontsize=12)

        # plt.title('Spearman Feature Correlation Matrix', y=-0.07, x=-11, fontsize=20)
        fig.tight_layout(pad=2)
        plt.savefig(
            'Classification/Output/plots/scatter_plot_matrix/feature_correlation_matrix',
            dpi=150)
    def plot_permutation_importance_summary(self, classifier, max_n, n_pca):
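        """Combine the permutation importances of `classifier` across the
        'ML', '1vsA' and 'RR' multi-class techniques and plot them as a box
        plot, sorted by median importance."""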
        title = compose_configuration(
            f'Summarized Permutation Importances of {classifier}',
            self.config['filter_latent'], self.config['standardization'],
            n_pca, self.name)
        print(f"Plotting {title}")

        features = self.results['RR'][classifier]['features']

        perm_imps = []
        for mc_technique in ['ML', '1vsA', 'RR']:
            perm_imps.extend(self.results[mc_technique][classifier]
                             ['permutation_importance'])

        boxplot_data = {}
        for i, f in enumerate(features):
            feature_imps = [perm_imp[i] for perm_imp in perm_imps]
            boxplot_data[f] = feature_imps
        boxplot_data = {
            k: v
            for k, v in sorted(
                boxplot_data.items(),
                key=lambda item: np.median(item[1]) + 0.001 * np.mean(item[1]),
                reverse=True)
        }

        fig, ax = plt.subplots(figsize=(8.5, 4.5))
        ax.yaxis.grid(True,
                      linestyle='-',
                      which='major',
                      color='lightgrey',
                      alpha=1)
        ax.set_axisbelow(True)
        ax.boxplot(boxplot_data.values(),
                   flierprops=dict(markersize=4, markeredgewidth=0.6))
        ax.set_xticklabels(boxplot_data.keys())

        plt.xticks(rotation=45, rotation_mode='anchor', ha='right')
        plt.ylabel('Permutation Importance')
        plt.xlabel('Feature')
        plt.tight_layout()
        filename = compose_filename(self.config['output_pi_plot_directory'],
                                    self.config['filter_latent'],
                                    self.config['standardization'], n_pca,
                                    f'pi_{classifier}', self.name, '')
        plt.savefig(filename, dpi=150)
        plt.close()
    def plot_permutation_importance(self, ml_method, classifier, n_pca):
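        """Plot the mean permutation importance (with the standard deviation
        as error bars) of the 30 highest-ranked features for one
        ml_method/classifier combination."""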
        title = compose_configuration(
            f'Permutation Importances of {ml_method} {classifier}',
            self.config['filter_latent'], self.config['standardization'],
            n_pca, self.name)
        print(f"Plotting {title}")

        features = self.results[ml_method][classifier]['features']
        perm_imps = self.results[ml_method][classifier][
            'permutation_importance']

        permutation_importances = {}
        for i, f in enumerate(features):
            feature_imps = [perm_imp[i] for perm_imp in perm_imps]
            permutation_importances[f] = (np.mean(feature_imps),
                                          np.std(feature_imps))

        permutation_importances = {
            k: v
            for k, v in sorted(permutation_importances.items(),
                               key=lambda item: item[1][0],
                               reverse=True)[:30]
        }

        x_size = len(permutation_importances) / 2.5
        y_size = x_size / 1.375
        plt.figure(figsize=(x_size, y_size))
        plt.bar(permutation_importances.keys(),
                [val[0] for val in permutation_importances.values()],
                yerr=[val[1] for val in permutation_importances.values()],
                width=1,
                capsize=5)
        plt.xticks(rotation=45, rotation_mode='anchor', ha='right')
        plt.ylabel('Permutation Importance')
        plt.xlabel('Feature')
        plt.title(title, wrap=True)
        plt.tight_layout()
        filename = compose_filename(
            self.config['output_pi_plot_directory'],
            self.config['filter_latent'], self.config['standardization'],
            n_pca, f'permutation_importance_{ml_method}_{classifier}',
            self.name, '')
        plt.savefig(filename)
        plt.close()
    def plot_scatter_matrix(self):
        """Draw a scatter plot matrix of the original (non-skipped) features
        and save it to the scatter_plot_matrix output directory."""
        feature_file = compose_filename(self.config['input_data_folder'], True,
                                        False, 'no-pca', 'features',
                                        'original', 'csv')
        df = pd.read_csv(feature_file, index_col=0)
        columns = [
            col for col in df.columns
            if col not in self.config['skip-features']
        ]
        df = df[columns]
        scatter_plot_matrix(df.to_numpy(),
                            figsize=(60, 60),
                            names=columns,
                            alpha=0.3)
        print('Done plotting')
        plt.tight_layout(h_pad=0, w_pad=0)
        print('Layout done')
        plt.savefig(
            'Classification/Output/plots/scatter_plot_matrix/scatter_plot_matrix',
            dpi=150)
        print('Saving done')
    def load_results(self, n_pca):
        """Load previously saved classification results for this
        configuration from their pickle file into self.results."""
        filename = compose_filename(self.config['output_result_directory'],
                                    self.config['filter_latent'],
                                    self.config['standardization'], n_pca,
                                    'classification_results', self.name, 'p')
        with open(filename, 'rb') as results_file:
            self.results = pickle.load(results_file)
Example 12
    def plot(self, name):
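        """Draw a PCA biplot: loading arrows for each original feature plus
        the samples projected onto the first two principal components,
        colored by phase label."""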
        # title = compose_configuration(f'PCA scatter plot', self.config['filter_latent'],
        #                               self.config['standardization'], 2, name)

        title = ""
        print(f"Plotting {title}")

        features_file = compose_filename(self.config["input_data_folder"],
                                         self.config['filter_latent'],
                                         self.config['standardization'],
                                         'no-pca', 'features', name, 'csv')

        original_features = pd.read_csv(features_file, index_col=0).columns
        original_features = [
            col for col in original_features
            if col not in self.config['skip-features']
        ]

        pca_object_file = f"{self.config['input_pca_folder']}None-pca.p"
        with open(pca_object_file, 'rb') as pca_f:
            pca = pickle.load(pca_f)  # type: PCA

        pca_file = compose_filename(self.config["input_data_folder"],
                                    self.config['filter_latent'],
                                    self.config['standardization'], 2,
                                    'features', name, 'csv')
        data = pd.read_csv(pca_file)
        plt.figure(figsize=(5, 4))

        print(pca.components_[0])
        print(pca.explained_variance_)
        print(pca.singular_values_)

        for i, feature in enumerate(original_features):
            plt.arrow(0,
                      0,
                      pca.components_[0][i],
                      pca.components_[1][i],
                      alpha=0.5,
                      length_includes_head=True,
                      head_width=0.005)
            plt.text(pca.components_[0][i] * 1.1,
                     pca.components_[1][i] * 1.1,
                     feature,
                     fontsize=8)

        xs = []
        ys = []
        for label in ['immediate-early', 'early', 'late']:
            label_data = data[data.label.eq(label)]
            xs.extend(label_data['comp_0'])
            ys.extend(label_data['comp_1'])
        scalex = 0.75 / (max(xs) - min(xs))
        scaley = 0.75 / (max(ys) - min(ys))

        for label in ['immediate-early', 'early', 'late']:
            label_data = data[data.label.eq(label)]
            x = label_data['comp_0']
            y = label_data['comp_1']
            plt.scatter(x * scalex,
                        y * scaley,
                        label=label,
                        s=40,
                        alpha=0.65,
                        edgecolors='none',
                        color=color_phase_dict[label])
        plt.legend()
        plt.title(title, wrap=True)
        plt.xlim(-.3, .8)
        plt.ylim(-.6, .5)
        plt.xlabel('PC 1')
        plt.ylabel('PC 2')
        plt.tight_layout()
        filename = compose_filename(self.config['output_pca_plot_directory'],
                                    self.config['filter_latent'],
                                    self.config['standardization'], 'no-pca',
                                    '2-pca', 'scatter', 'png')
        plt.savefig(filename, dpi=300)
        plt.close()
Example 13
    def save_scores(self, n_pca):
        """Pickle self.results to the configured result directory."""
        filename = compose_filename(self.config['output_result_directory'],
                                    self.config['filter_latent'],
                                    self.config['standardization'], n_pca,
                                    'classification_results', self.name, 'p')
        with open(filename, 'wb') as results_file:
            pickle.dump(self.results, results_file)
    def save(self, name, n_pca):
        """Write self.data_frame to a CSV named after this configuration."""
        filename = compose_filename(self.output_csv_directory,
                                    self.filter_phase, self.standardization,
                                    n_pca, 'features', name, 'csv')
        self.data_frame.to_csv(filename)