def fit(self, name, n_pca=None):
    """Cross-validate the configured classifier on one feature table.

    The feature CSV is located with ``compose_filename`` from the config
    values, ``name`` and ``n_pca``. Every column that is not listed in
    ``self.config['skip-features']`` is used as an input feature; the
    ``label`` column is the target.

    Args:
        name: Data set name forwarded to ``compose_filename``.
        n_pca: PCA setting forwarded to ``compose_filename``.

    Returns:
        The object returned by ``cross_validate``, with the list of used
        feature names stored under the ``'features'`` key.
    """
    config = self.config
    csv_path = compose_filename(config['input_data_folder'],
                                config['filter_latent'],
                                config['standardization'], n_pca,
                                'features', name, 'csv')
    table = pd.read_csv(csv_path, index_col=0)

    features = []
    for column in table.columns:
        if column not in config['skip-features']:
            features.append(column)

    x = table[features]
    y = table['label']
    splitter = ShuffleSplit(n_splits=config['k-fold-splits'],
                            train_size=config['k-fold-train-size'])

    print(f"Classifying with {self.ml_method} and {self.classifier}")

    # Only the 'RR' method asks create_scorers for its alternative scorers.
    wants_rr_scorers = self.ml_method == 'RR'
    estimator = self.__create_classifier()
    inputs, targets = x.values, y.values
    scorers = create_scorers(True) if wants_rr_scorers else create_scorers()

    cv_results = cross_validate(estimator, inputs, targets, scorers, splitter)
    cv_results['features'] = features
    return cv_results
def plot_average_of_all_pr(self, av_kind):
    """Plot one averaged PR curve per (multi-class method, classifier) pair.

    Pairs whose ``result['fpr']['early']`` is empty are skipped. For
    ``av_kind == 'macro'`` the curve is drawn by ``plot_macro_pr``;
    any other value is forwarded to ``plot_pr_class``. The figure is saved
    to the path built by ``compose_filename`` and then closed.

    Args:
        av_kind: Averaging kind; used in the title, legend labels and the
            output file name.
    """
    config = self.config
    title = compose_configuration(f'{av_kind} PR curves',
                                  config['filter_latent'],
                                  config['standardization'], 'no-pca',
                                  self.name)
    print(f"Plotting {title}")

    plt.figure(figsize=(6, 4.5))
    for ml_method, classifiers in self.results.items():
        for classifier, result in classifiers.items():
            if len(result['fpr']['early']) == 0:
                continue  # nothing recorded for this combination
            use_macro = av_kind == 'macro'
            curve_label = (f'{self.MULTI_CLASS_NAME[ml_method]} '
                           f'{self.CLASSIFIER_NAME[classifier]} '
                           f'{av_kind}-average')
            if use_macro:
                self.plot_macro_pr(ml_method, classifier, curve_label, '-',
                                   None, 1.5)
            else:
                self.plot_pr_class(ml_method, classifier, av_kind,
                                   curve_label, '-', None, 1.5)

    plt.legend()
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.tight_layout()

    target = compose_filename(config['output_pr_curves_plot_directory'],
                              config['filter_latent'],
                              config['standardization'], 'no-pca',
                              f'PR_{av_kind}_average', self.name, '')
    plt.savefig(target, dpi=150)
    plt.close()
def grid_search(self, name, grid, splits, n_pca=None):
    """Run a grid search for the configured classifier and print the outcome.

    Loads the feature CSV located by ``compose_filename``, uses every column
    not listed in ``self.config['skip-features']`` as input and ``label`` as
    target, and scores candidates with the adjusted balanced accuracy over
    ``splits`` shuffle splits (67 % training data each).

    Prints the best parameters, the best score and, for every candidate,
    its parameters, mean test score and mean fit time.

    Args:
        name: Data set name forwarded to ``compose_filename``.
        grid: Parameter grid handed to ``GridSearchCV``.
        splits: Number of shuffle splits.
        n_pca: PCA setting forwarded to ``compose_filename``.
    """
    csv_path = compose_filename(self.config['input_data_folder'],
                                self.config['filter_latent'],
                                self.config['standardization'], n_pca,
                                'features', name, 'csv')
    table = pd.read_csv(csv_path, index_col=0)

    feature_columns = [
        column for column in table.columns
        if column not in self.config['skip-features']
    ]
    x = table[feature_columns]
    y = table['label']

    estimator = self.__create_classifier(grid_search=True)
    scorer = make_scorer(balanced_accuracy_score, adjusted=True)
    splitter = ShuffleSplit(n_splits=splits, train_size=0.67)
    search = GridSearchCV(estimator, grid, scoring=scorer, cv=splitter)
    search.fit(x.values, y.values)

    print(search.best_params_)
    print(search.best_score_)
    print()

    outcome = search.cv_results_
    rows = zip(outcome['params'], outcome['mean_test_score'],
               outcome['mean_fit_time'])
    for params, mean_score, mean_fit_time in rows:
        print(params, mean_score, 'time:', mean_fit_time)
    # Single blank line closing the listing.
    print()
def plot_pr(self, ml_method, classifier, n_pca):
    """Plot micro, macro and per-class PR curves for one method/classifier.

    The micro- and macro-average curves are dotted, the three class curves
    ('immediate-early', 'early', 'late') are solid; colours come from
    ``color_phase_dict``. The figure is saved to the path built by
    ``compose_filename`` and then closed.

    Args:
        ml_method: Multi-class method key; also indexes ``MULTI_CLASS_NAME``.
        classifier: Classifier key; also indexes ``CLASSIFIER_NAME``.
        n_pca: PCA setting used in the title and the output file name.
    """
    config = self.config
    title = compose_configuration(f'PR curves of {ml_method} {classifier}',
                                  config['filter_latent'],
                                  config['standardization'], n_pca,
                                  self.name)
    print(f"Plotting {title}")

    plt.figure(figsize=(6.5, 4.3333))

    # Averages first (dotted), then one solid curve per class.
    self.plot_pr_class(ml_method, classifier, 'micro', 'micro-average', ':',
                       color_phase_dict['micro'])
    self.plot_macro_pr(ml_method, classifier, 'macro-average', ':',
                       color_phase_dict['macro'])
    for phase in ('immediate-early', 'early', 'late'):
        self.plot_pr_class(ml_method, classifier, phase, phase, '-',
                           color_phase_dict[phase])

    plt.legend()
    # The computed title is only printed; it is not drawn on the figure.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.tight_layout()

    plot_kind = (f'PR_{self.MULTI_CLASS_NAME[ml_method]}'
                 f'_{self.CLASSIFIER_NAME[classifier]}')
    target = compose_filename(config['output_pr_curves_plot_directory'],
                              config['filter_latent'],
                              config['standardization'], n_pca, plot_kind,
                              self.name, '')
    plt.savefig(target, dpi=150)
    plt.close()
def plot_feature_importance(self, name):
    """Save two PCA feature-importance bar charts for the data set ``name``.

    Loads the pickled PCA object ``None-pca.p`` from
    ``self.config['input_pca_folder']`` and the column names of the
    un-reduced feature CSV (minus ``self.config['skip-features']``), then
    calls ``_plot_pca_fi_barchart`` twice (first argument ``False``, then
    ``True``) and saves each figure into
    ``self.config['output_pca_variance_plot_directory']``.

    Args:
        name: Data set name; forwarded to ``compose_filename`` and appended
            to both output file names.
    """
    pca_file = f"{self.config['input_pca_folder']}None-pca.p"
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle, which the bare open() did not.
    with open(pca_file, 'rb') as pca_handle:
        pca = pickle.load(pca_handle)  # type: PCA

    features_file = compose_filename(self.config["input_data_folder"],
                                     self.config['filter_latent'],
                                     self.config['standardization'],
                                     'no-pca', 'features', name, 'csv')
    original_features = pd.read_csv(features_file, index_col=0).columns
    original_features = [
        col for col in original_features
        if col not in self.config['skip-features']
    ]

    _plot_pca_fi_barchart(False, original_features, pca)
    plt.savefig(
        f"{self.config['output_pca_variance_plot_directory']}PCA_Features_Importance_{name}"
    )
    plt.close()

    _plot_pca_fi_barchart(True, original_features, pca)
    plt.savefig(
        f"{self.config['output_pca_variance_plot_directory']}PCA_Features_Importance_Relative_to_variance{name}"
    )
    plt.close()
def plot_average_of_all_roc(self, av_kind):
    """Plot the ``av_kind``-average ROC curve of every 'ML' classifier.

    Only entries stored under the ``'ML'`` key of ``self.results`` are
    drawn, and classifiers whose ``result['fpr']['early']`` is empty are
    skipped. A dashed diagonal is added as reference line. The figure is
    saved to the path built by ``compose_filename`` and then closed.

    Args:
        av_kind: Averaging kind forwarded to ``plot_roc_class``; also used
            in legend labels and the output file name.
    """
    config = self.config
    title = compose_configuration('Micro and Macro average ROC curves',
                                  config['filter_latent'],
                                  config['standardization'], 'no-pca',
                                  self.name)
    print(f"Plotting {title}")

    plt.figure(figsize=(6, 4.5))
    for ml_method, classifiers in self.results.items():
        if ml_method != 'ML':
            continue
        for classifier, result in classifiers.items():
            if len(result['fpr']['early']) == 0:
                continue  # nothing recorded for this classifier
            curve_label = (f'{self.CLASSIFIER_NAME[classifier]} '
                           f'{av_kind}-average')
            self.plot_roc_class(ml_method, classifier, av_kind, curve_label,
                                '-', None, 2)

    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.tight_layout()

    target = compose_filename(config['output_roc_curves_plot_directory'],
                              config['filter_latent'],
                              config['standardization'], 'no-pca',
                              f'ROC_{av_kind}_average', self.name, '')
    plt.savefig(target, dpi=150)
    plt.close()
def plot_correlation_matrix(self):
    """Save a Spearman correlation matrix plot of the original features.

    Reads the feature CSV located by ``compose_filename`` (called with the
    fixed arguments ``True, False, 'no-pca', 'features', 'original',
    'csv'``), drops the columns listed in ``self.config['skip-features']``
    and plots the Spearman correlation matrix with a colour bar. The figure
    is written to a hard-coded path below ``Classification/Output/plots``.

    Changes compared to the previous version:
      * the figure is closed after saving, like in the other plot methods,
        so repeated calls do not accumulate open figures;
      * tick positions and labels are taken from the correlation matrix
        itself, so they always match what ``matshow`` displays.
    """
    feature_file = compose_filename(self.config['input_data_folder'], True,
                                    False, 'no-pca', 'features', 'original',
                                    'csv')
    df = pd.read_csv(feature_file, index_col=0)
    columns = [
        col for col in df.columns if col not in self.config['skip-features']
    ]
    corr = df[columns].corr(method='spearman')

    fig = plt.figure(figsize=(12, 10))
    ax = plt.gca()
    im = ax.matshow(corr)

    # Narrow colour bar axis attached to the right of the matrix.
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.1)
    fig.colorbar(im, cax=cax)

    tick_positions = range(corr.shape[1])
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(corr.columns,
                       fontsize=12,
                       rotation=-45,
                       rotation_mode='anchor',
                       ha='right')
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(corr.columns, fontsize=12)

    fig.tight_layout(pad=2)
    plt.savefig(
        'Classification/Output/plots/scatter_plot_matrix/feature_correlation_matrix',
        dpi=150)
    plt.close(fig)
def plot_permutation_importance_summary(self, classifier, max_n, n_pca):
    """Box-plot the permutation importances of one classifier per feature.

    The importance runs stored for the 'ML', '1vsA' and 'RR' techniques are
    pooled; feature names are taken from the 'RR' result. Features are
    ordered by descending median importance, with a small contribution of
    the mean (factor 0.001) added to the sort key. The figure is saved to
    the path built by ``compose_filename`` and then closed.

    Args:
        classifier: Classifier key inside ``self.results[technique]``.
        max_n: Not used by this method.
        n_pca: PCA setting used in the title and the output file name.
    """
    config = self.config
    title = compose_configuration(
        f'Summarized Permutation Importances of {classifier}',
        config['filter_latent'], config['standardization'], n_pca, self.name)
    print(f"Plotting {title}")

    features = self.results['RR'][classifier]['features']
    pooled_runs = [
        run
        for technique in ('ML', '1vsA', 'RR')
        for run in self.results[technique][classifier]['permutation_importance']
    ]

    # One list of importances per feature, collected column-wise.
    per_feature = {
        feature: [run[position] for run in pooled_runs]
        for position, feature in enumerate(features)
    }

    def _ranking_value(entry):
        values = entry[1]
        return np.median(values) + 0.001 * np.mean(values)

    boxplot_data = dict(
        sorted(per_feature.items(), key=_ranking_value, reverse=True))

    fig, ax = plt.subplots(figsize=(8.5, 4.5))
    ax.yaxis.grid(True,
                  linestyle='-',
                  which='major',
                  color='lightgrey',
                  alpha=1)
    ax.set_axisbelow(True)
    ax.boxplot(boxplot_data.values(),
               flierprops=dict(markersize=4, markeredgewidth=0.6))
    ax.set_xticklabels(boxplot_data.keys())
    plt.xticks(rotation=45, rotation_mode='anchor', ha='right')
    plt.ylabel('Permutation Importance')
    plt.xlabel('Feature')
    plt.tight_layout()

    target = compose_filename(config['output_pi_plot_directory'],
                              config['filter_latent'],
                              config['standardization'], n_pca,
                              f'pi_{classifier}', self.name, '')
    plt.savefig(target, dpi=150)
    plt.close()
def plot_permutation_importance(self, ml_method, classifier, n_pca):
    """Bar-plot mean permutation importance (with std error bars) per feature.

    Uses the features and importance runs stored in
    ``self.results[ml_method][classifier]``. Only the 30 features with the
    highest mean importance are shown, in descending order; the figure size
    scales with the number of bars. The figure is saved to the path built
    by ``compose_filename`` and then closed.

    Args:
        ml_method: Multi-class method key inside ``self.results``.
        classifier: Classifier key inside ``self.results[ml_method]``.
        n_pca: PCA setting used in the title and the output file name.
    """
    config = self.config
    title = compose_configuration(
        f'Permutation Importances of {ml_method} {classifier}',
        config['filter_latent'], config['standardization'], n_pca, self.name)
    print(f"Plotting {title}")

    stored = self.results[ml_method][classifier]
    features = stored['features']
    runs = stored['permutation_importance']

    # feature -> (mean, std) over all runs
    summary = {}
    for position, feature in enumerate(features):
        samples = [run[position] for run in runs]
        summary[feature] = (np.mean(samples), np.std(samples))

    def _mean_of(entry):
        return entry[1][0]

    ranked = sorted(summary.items(), key=_mean_of, reverse=True)
    top = dict(ranked[:30])

    width_inches = len(top) / 2.5
    height_inches = width_inches / 1.375
    plt.figure(figsize=(width_inches, height_inches))

    means = [stats[0] for stats in top.values()]
    deviations = [stats[1] for stats in top.values()]
    plt.bar(top.keys(), means, yerr=deviations, width=1, capsize=5)
    plt.xticks(rotation=45, rotation_mode='anchor', ha='right')
    plt.ylabel('Permutation Importance')
    plt.xlabel('Feature')
    plt.title(title, wrap=True)
    plt.tight_layout()

    target = compose_filename(
        config['output_pi_plot_directory'], config['filter_latent'],
        config['standardization'], n_pca,
        f'permutation_importance_{ml_method}_{classifier}', self.name, '')
    plt.savefig(target)
    plt.close()
def plot_scatter_matrix(self):
    """Save a scatter plot matrix of the original features.

    Reads the feature CSV located by ``compose_filename`` (called with the
    fixed arguments ``True, False, 'no-pca', 'features', 'original',
    'csv'``), drops the columns listed in ``self.config['skip-features']``
    and hands the remaining values to ``scatter_plot_matrix``. Progress
    messages are printed after plotting, layout and saving. The figure is
    written to a hard-coded path below ``Classification/Output/plots``.

    The current figure is now closed after saving, like in the other plot
    methods; previously the 60x60 inch figure stayed open after every call.
    """
    feature_file = compose_filename(self.config['input_data_folder'], True,
                                    False, 'no-pca', 'features', 'original',
                                    'csv')
    df = pd.read_csv(feature_file, index_col=0)
    columns = [
        col for col in df.columns if col not in self.config['skip-features']
    ]
    df = df[columns]

    # NOTE(review): presumably scatter_plot_matrix draws into the current
    # pyplot figure, since tight_layout/savefig below act on it - confirm.
    scatter_plot_matrix(df.to_numpy(),
                        figsize=(60, 60),
                        names=columns,
                        alpha=0.3)
    print('Done plotting')
    plt.tight_layout(h_pad=0, w_pad=0)
    print('Layout done')
    plt.savefig(
        'Classification/Output/plots/scatter_plot_matrix/scatter_plot_matrix',
        dpi=150)
    print('Saving done')
    # Release the figure so repeated calls do not pile up open figures.
    plt.close()
def load_results(self, n_pca):
    """Load pickled classification results into ``self.results``.

    The file is located with ``compose_filename`` from
    ``self.config['output_result_directory']``, the filter/standardization
    settings, ``n_pca`` and ``self.name``.

    Args:
        n_pca: PCA setting forwarded to ``compose_filename``.
    """
    filename = compose_filename(self.config['output_result_directory'],
                                self.config['filter_latent'],
                                self.config['standardization'], n_pca,
                                'classification_results', self.name, 'p')
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle, which the bare open() did not.
    with open(filename, 'rb') as results_file:
        self.results = pickle.load(results_file)
def plot(self, name):
    """Save a 2-component PCA scatter plot overlaid with feature arrows.

    Reads three inputs: the column names of the un-reduced feature CSV for
    ``name`` (minus ``self.config['skip-features']``), the pickled PCA
    object ``None-pca.p`` from ``self.config['input_pca_folder']`` and the
    2-component feature CSV for ``name``. For every original feature an
    arrow from the origin to its entries in the first two PCA components is
    drawn. The samples of the classes 'immediate-early', 'early' and 'late'
    are scattered on top, each axis rescaled so the pooled data spans 0.75
    units. The figure is saved to the path built by ``compose_filename``
    and then closed.

    The PCA pickle is now read through a context manager so the file
    handle is closed, and each class subset is selected only once.

    Args:
        name: Data set name forwarded to ``compose_filename`` for both
            input CSV files.
    """
    title = ""  # no title is drawn; the message below ends after "Plotting "
    print(f"Plotting {title}")

    features_file = compose_filename(self.config["input_data_folder"],
                                     self.config['filter_latent'],
                                     self.config['standardization'],
                                     'no-pca', 'features', name, 'csv')
    original_features = pd.read_csv(features_file, index_col=0).columns
    original_features = [
        col for col in original_features
        if col not in self.config['skip-features']
    ]

    pca_object_file = f"{self.config['input_pca_folder']}None-pca.p"
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(pca_object_file, 'rb') as pca_handle:
        pca = pickle.load(pca_handle)  # type: PCA

    pca_file = compose_filename(self.config["input_data_folder"],
                                self.config['filter_latent'],
                                self.config['standardization'], 2,
                                'features', name, 'csv')
    data = pd.read_csv(pca_file)

    plt.figure(figsize=(5, 4))
    print(pca.components_[0])
    print(pca.explained_variance_)
    print(pca.singular_values_)

    # One arrow per original feature, labelled slightly beyond its tip.
    for i, feature in enumerate(original_features):
        plt.arrow(0,
                  0,
                  pca.components_[0][i],
                  pca.components_[1][i],
                  alpha=0.5,
                  length_includes_head=True,
                  head_width=0.005)
        plt.text(pca.components_[0][i] * 1.1,
                 pca.components_[1][i] * 1.1,
                 feature,
                 fontsize=8)

    labels = ['immediate-early', 'early', 'late']
    rows_by_label = {label: data[data.label.eq(label)] for label in labels}

    # Scale factors are derived from the pooled samples of all three classes.
    xs = []
    ys = []
    for label_data in rows_by_label.values():
        xs.extend(label_data['comp_0'])
        ys.extend(label_data['comp_1'])
    scalex = 0.75 / (max(xs) - min(xs))
    scaley = 0.75 / (max(ys) - min(ys))

    for label, label_data in rows_by_label.items():
        plt.scatter(label_data['comp_0'] * scalex,
                    label_data['comp_1'] * scaley,
                    label=label,
                    s=40,
                    alpha=0.65,
                    edgecolors='none',
                    color=color_phase_dict[label])

    plt.legend()
    plt.title(title, wrap=True)
    plt.xlim(-.3, .8)
    plt.ylim(-.6, .5)
    plt.xlabel('PC 1')
    plt.ylabel('PC 2')
    plt.tight_layout()

    filename = compose_filename(self.config['output_pca_plot_directory'],
                                self.config['filter_latent'],
                                self.config['standardization'], 'no-pca',
                                '2-pca', 'scatter', 'png')
    plt.savefig(filename, dpi=300)
    plt.close()
def save_scores(self, n_pca):
    """Pickle ``self.results`` to the classification results file.

    The target is located with ``compose_filename`` from
    ``self.config['output_result_directory']``, the filter/standardization
    settings, ``n_pca`` and ``self.name``.

    Args:
        n_pca: PCA setting forwarded to ``compose_filename``.
    """
    filename = compose_filename(self.config['output_result_directory'],
                                self.config['filter_latent'],
                                self.config['standardization'], n_pca,
                                'classification_results', self.name, 'p')
    # The context manager flushes and closes the file deterministically;
    # the bare open() relied on garbage collection to do that.
    with open(filename, 'wb') as results_file:
        pickle.dump(self.results, results_file)
def save(self, name, n_pca):
    """Write ``self.data_frame`` as CSV to the path from ``compose_filename``.

    Args:
        name: Data set name forwarded to ``compose_filename``.
        n_pca: PCA setting forwarded to ``compose_filename``.
    """
    destination = compose_filename(
        self.output_csv_directory,
        self.filter_phase,
        self.standardization,
        n_pca,
        'features',
        name,
        'csv',
    )
    frame = self.data_frame
    frame.to_csv(destination)