# Shared imports for the plotting scripts below. The functions appear to
# come from several files (note the repeated main()/multiple_figures()
# names), so this is the union of what they need. Project-local helpers
# (labels, norm_cm, latexify, ensure_dir, format_axes_for_cm, class_counter,
# the model/data functions, feature_sets and the pipes) are defined
# elsewhere in the repository.
import itertools as it

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn import metrics as imetrics
from imblearn import over_sampling
# imblearn's pipeline (not sklearn's) is required so that a resampler can
# be a pipeline step.
from imblearn import pipeline
from sklearn import (dummy, ensemble, linear_model, metrics, model_selection,
                     neighbors, neural_network, preprocessing, svm, tree)


def main():
    fig, axes = plt.subplots(1, 3, sharey=True, sharex=True)
    for ax, title, model in zip(axes.flat,
                                ['No resample', 'Oversample', 'Undersample'],
                                [no_resample, oversample, undersample]):
        y, y_pred, c = model()
        print(title)
        print(imetrics.classification_report_imbalanced(y, y_pred))
        acc = metrics.accuracy_score(y, y_pred)
        cm = metrics.confusion_matrix(y, y_pred, labels=labels)
        cm = norm_cm(cm)
        cm = pd.DataFrame(cm, index=labels, columns=labels)
        sns.heatmap(cm, vmin=0, vmax=1, annot=True, fmt='.2f', cmap='Greys',
                    ax=ax, cbar=False, square=True)
        ax.set_title(f'{title}\naccuracy={acc:.3f}')
    # Class distribution for the suptitle; note this uses y from the last
    # model in the loop.
    count = class_counter(y)
    fig.suptitle('Population: ' + ', '.join(
        f'{key}: {count[key] * 100:.1f}%' for key in labels))
    fig.tight_layout()
    fig.savefig('./different_resampling.pdf', dpi=92, bbox_inches='tight')
    plt.show()
def multiple_figures():
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify()
    # One confusion-matrix figure per interpolation approach.
    # 'guassian' (sic) is the helper's name at its definition site.
    for title, model in zip(['without', 'gaussian', 'constant'],
                            [without, guassian, constant]):
        y, y_pred = model(pipe)
        print(title, classifier[0])
        print(class_counter(y))
        print(metrics.classification_report(y, y_pred, labels=labels))
        acc = metrics.accuracy_score(y, y_pred)
        cm = metrics.confusion_matrix(y, y_pred, labels=labels)
        cm = norm_cm(cm)
        cm = pd.DataFrame(cm, index=labels, columns=labels)
        fig, ax = plt.subplots(dpi=92)
        sns.heatmap(cm, vmin=0, vmax=1, annot=True, fmt='.2f', cmap='Greys',
                    ax=ax, cbar=False, square=True)
        format_axes_for_cm(ax)
        ax.set_title(f'accuracy = {acc:.3f}')
        fig.tight_layout()
        ensure_dir('./output/interpolations/')
        fig.savefig(f'./output/interpolations/{title}.pdf', dpi=92,
                    bbox_inches='tight')
        # Close the figure (rather than plt.clf()) so figures do not
        # accumulate across iterations.
        plt.close(fig)
def main():
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify()
    w_sizes = (5, 10, 50, 100)
    # Evaluate every (W_PRR, W_history) combination.
    for w_prr, w_history in it.product(w_sizes, repeat=2):
        y, y_pred = different_window_sizes(w_prr, w_history)
        acc = metrics.accuracy_score(y, y_pred)
        prec = metrics.precision_score(y, y_pred, average='weighted',
                                       labels=labels)
        recall = metrics.recall_score(y, y_pred, average='weighted',
                                      labels=labels)
        f1 = metrics.f1_score(y, y_pred, average='weighted', labels=labels)
        # Emit one LaTeX table row per combination.
        print(f'& {w_history}\t& {w_prr}\t& {acc:.3f}\t& {prec:.3f}'
              f'\t& {recall:.3f}\t& {f1:.3f}')
        cm = metrics.confusion_matrix(y, y_pred, labels=labels)
        cm = norm_cm(cm)
        cm = pd.DataFrame(cm, index=labels, columns=labels)
        fig, ax = plt.subplots(dpi=92)
        sns.heatmap(cm, vmin=0, vmax=1, annot=True, fmt='.2f', cmap='Greys',
                    ax=ax, cbar=False, square=True)
        ax.set_title(f'Accuracy = {acc:.3f}\n'
                     f'(prec = {prec:.3f}, rec = {recall:.3f})')
        format_axes_for_cm(ax)
        fig.tight_layout()
        ensure_dir('./output/w_sizes/')
        fig.savefig(f'./output/w_sizes/Wprr{w_prr}_Wh{w_history}.pdf',
                    dpi=92, bbox_inches='tight')
        plt.close(fig)
def multiplots():
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify()
    for model, title in zip([no_resample, undersample, oversample],
                            ['none', 'undersample', 'oversample']):
        y, y_pred, c = model(classifier)
        acc = metrics.accuracy_score(y, y_pred)
        print('Resample:', title, classifier[0], f'accuracy={acc:.3f}')
        print(metrics.classification_report(y, y_pred, labels=labels))
        cm = metrics.confusion_matrix(y, y_pred, labels=labels)
        cm = norm_cm(cm)
        cm = pd.DataFrame(cm, index=labels, columns=labels)
        # constrained_layout handles spacing here, so tight_layout is not
        # called before saving.
        fig, ax = plt.subplots(dpi=92, constrained_layout=True)
        fig.suptitle(f'accuracy={acc:.3f}')
        sns.heatmap(cm, vmin=0, vmax=1, annot=True, fmt='.2f', cmap='Greys',
                    ax=ax, cbar=False, square=True)
        # Per-class sample counts returned by the model helper.
        ax.set_title(
            f'good: {c["good"]:,}\ninterm.: {c["interm."]:,}\nbad: {c["bad"]:,}',
            fontdict={'fontsize': 9}, loc='left')
        format_axes_for_cm(ax)
        ensure_dir('./output/resampling/')
        fig.savefig(f'./output/resampling/{title}.pdf', dpi=92,
                    bbox_inches='tight')
        plt.close(fig)
def main():
    fig, axes = plt.subplots(1, 3, sharey=True, sharex=True)
    for ax, title, model in zip(axes.flat,
                                ['Without', 'Gaussian', 'Constant'],
                                [without, guassian, constant]):
        y, y_pred = model()
        print(title, classifier[0])
        print(metrics.classification_report(y, y_pred, labels=labels))
        acc = metrics.accuracy_score(y, y_pred)
        cm = metrics.confusion_matrix(y, y_pred, labels=labels)
        cm = norm_cm(cm)
        cm = pd.DataFrame(cm, index=labels, columns=labels)
        sns.heatmap(cm, vmin=0, vmax=1, annot=True, fmt='.2f', cmap='Greys',
                    ax=ax, cbar=False, square=True)
        ax.set_title(f'{title}\naccuracy={acc:.3f}')
    fig.tight_layout()
    fig.savefig('./different_interpolations.pdf', dpi=92, bbox_inches='tight')
    plt.show()
def multiple_figures():
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify()
    # The decision tree and logistic regression loops were identical except
    # for the pipeline, the output directory and the averaging mode, so they
    # are folded into one loop. Note the original used 'weighted' averaging
    # for the tree but 'micro' for logistic regression; with all labels
    # included, micro precision and recall both equal accuracy.
    configs = (
        (pipe_dtree, './output/features/dtree/', 'weighted'),
        (pipe_logreg, './output/features/logistic/', 'micro'),
    )
    for pipe, outdir, average in configs:
        for features in feature_sets:
            y, y_pred = different_features(pipe, features)
            acc = metrics.accuracy_score(y, y_pred)
            prec = metrics.precision_score(y, y_pred, average=average,
                                           labels=labels)
            recall = metrics.recall_score(y, y_pred, average=average,
                                          labels=labels)
            cm = metrics.confusion_matrix(y, y_pred, labels=labels)
            cm = norm_cm(cm)
            cm = pd.DataFrame(cm, index=labels, columns=labels)
            fig, ax = plt.subplots(dpi=92)
            sns.heatmap(cm, vmin=0, vmax=1, annot=True, fmt='.2f',
                        cmap='Greys', ax=ax, cbar=False, square=True)
            format_axes_for_cm(ax)
            feature_str = stringify_features(features)
            ax.set_title(f'Accuracy = {acc:.3f}\n'
                         f'(prec = {prec:.3f}, rec = {recall:.3f})')
            fig.tight_layout()
            ensure_dir(outdir)
            fig.savefig(f'{outdir}{feature_str}.pdf', dpi=92,
                        bbox_inches='tight')
            plt.close(fig)
            print(f'Done {features}')
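# stringify_features is defined elsewhere in the project; a minimal sketch,
# assuming it only builds a filesystem-safe name from the feature tuple
# (the real implementation may differ):
def stringify_features(features):
    # e.g. ('prr', 'rssi') -> 'prr-rssi', used as the PDF file name above.
    return '-'.join(features)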
def multiple_figures():
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify()
    # cv and knn are prepared but not used below (kNN is left out of the
    # model tuple).
    cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True)
    scaler = preprocessing.StandardScaler()
    resample = over_sampling.RandomOverSampler()
    # Every model shares the same scale-then-oversample preprocessing.
    baseline = pipeline.make_pipeline(
        scaler, resample,
        dummy.DummyClassifier(strategy='constant', constant='good'))
    logreg = pipeline.make_pipeline(
        scaler, resample,
        linear_model.LogisticRegression(solver='lbfgs', multi_class='ovr'))
    dtree = pipeline.make_pipeline(scaler, resample,
                                   tree.DecisionTreeClassifier())
    knn = pipeline.make_pipeline(scaler, resample,
                                 neighbors.KNeighborsClassifier())
    mlp = pipeline.make_pipeline(
        scaler, resample,
        neural_network.MLPClassifier(hidden_layer_sizes=(100, 100, 100),
                                     activation='relu', solver='adam'))
    svc = pipeline.make_pipeline(scaler, resample, svm.LinearSVC())
    rforest = pipeline.make_pipeline(
        scaler, resample, ensemble.RandomForestClassifier(n_estimators=100))
    models = (
        ('Constant', baseline),
        ('Logistic Regression', logreg),
        ('Decision Tree', dtree),
        #('kNN', knn),
        ('Multi-Layer Perceptron', mlp),
        ('linearSVM', svc),
        ('Random Forest', rforest),
    )

    # Special case: use the recorded overall link class directly as a
    # baseline prediction.
    filename = 'baseline-link-overall'
    df = prepare_data()
    # .to_numpy() instead of the deprecated Series.ravel().
    y, y_pred = df['class'].to_numpy(), df['class_overall'].to_numpy()
    acc = metrics.accuracy_score(y, y_pred)
    prec = metrics.precision_score(y, y_pred, average='weighted',
                                   labels=labels)
    recall = metrics.recall_score(y, y_pred, average='weighted',
                                  labels=labels)
    cm = metrics.confusion_matrix(y, y_pred, labels=labels)
    cm = norm_cm(cm)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    fig, ax = plt.subplots(dpi=92)
    sns.heatmap(cm, vmin=0, vmax=1, annot=True, fmt='.2f', cmap='Greys',
                ax=ax, cbar=False, square=True)
    ax.set_title(f'accuracy = {acc:.3f}\n'
                 f'(prec = {prec:.3f}, rec = {recall:.3f})')
    format_axes_for_cm(ax)
    fig.tight_layout()
    ensure_dir('./output/models/')
    fig.savefig(f'./output/models/{filename}.pdf', dpi=92,
                bbox_inches='tight')
    plt.close(fig)
    print(f'Done {filename}')

    for name, pipe in models:
        filename = name.lower().replace(' ', '_')
        y, y_pred = different_models(pipe)
        acc = metrics.accuracy_score(y, y_pred)
        print(name)
        print(metrics.classification_report(y, y_pred, labels=labels))
        cm = metrics.confusion_matrix(y, y_pred, labels=labels)
        cm = norm_cm(cm)
        cm = pd.DataFrame(cm, index=labels, columns=labels)
        fig, ax = plt.subplots(dpi=92)
        sns.heatmap(cm, vmin=0, vmax=1, annot=True, fmt='.2f', cmap='Greys',
                    ax=ax, cbar=False, square=True)
        ax.set_title(f'accuracy={acc:.3f}')
        format_axes_for_cm(ax)
        fig.tight_layout()
        ensure_dir('./output/models/')
        fig.savefig(f'./output/models/{filename}.pdf', dpi=92,
                    bbox_inches='tight')
        plt.close(fig)
def main():
    cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True)
    poly = preprocessing.PolynomialFeatures(degree=2)  # prepared but unused
    scaler = preprocessing.StandardScaler()
    resample = over_sampling.RandomOverSampler()
    baseline = pipeline.make_pipeline(
        scaler, resample,
        dummy.DummyClassifier(strategy='constant', constant='good'))
    logreg = pipeline.make_pipeline(scaler, resample,
                                    linear_model.LogisticRegression())
    dtree = pipeline.make_pipeline(scaler, resample,
                                   tree.DecisionTreeClassifier())
    #knn = pipeline.make_pipeline(scaler, resample,
    #                             neighbors.KNeighborsClassifier())
    mlp = pipeline.make_pipeline(scaler, resample,
                                 neural_network.MLPClassifier())
    svc = pipeline.make_pipeline(scaler, resample, svm.LinearSVC())
    rforest = pipeline.make_pipeline(scaler, resample,
                                     ensemble.RandomForestClassifier())
    models = (
        ('Constant', baseline),
        ('Logistic Regression', logreg),
        ('Decision Tree', dtree),
        #('kNN', knn),
        ('Multi-Layer Perceptron', mlp),
        ('SVM (linear kernel)', svc),
        ('Random Forest', rforest),
    )
    # One confusion matrix per model on a shared 2x3 grid.
    fig, axes = plt.subplots(nrows=2, ncols=3, dpi=96, sharey=True,
                             sharex=True)
    for (name, pipe), ax in zip(models, axes.reshape(-1)):
        y, y_pred = different_models(pipe)
        acc = metrics.accuracy_score(y, y_pred)
        prec = metrics.precision_score(y, y_pred, average='weighted',
                                       labels=labels)
        recall = metrics.recall_score(y, y_pred, average='weighted',
                                      labels=labels)
        cm = metrics.confusion_matrix(y, y_pred, labels=labels)
        cm = norm_cm(cm)
        cm = pd.DataFrame(cm, index=labels, columns=labels)
        sns.heatmap(cm, vmin=0, vmax=1, annot=True, fmt='.2f', cmap='Greys',
                    ax=ax, cbar=False, square=True)
        ax.set_title(f'{name}\naccuracy={acc:.3f}\n'
                     f'(prec = {prec:.3f}, rec = {recall:.3f})')
    fig.tight_layout()
    fig.savefig('./different_models.pdf', bbox_inches='tight')
    plt.show()
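# ---------------------------------------------------------------------------
# The remaining helpers (norm_cm, ensure_dir, format_axes_for_cm,
# class_counter, latexify, and the data/model functions) live elsewhere in
# the repository; latexify() conventionally sets matplotlib rcParams to
# LaTeX-friendly fonts and column-width figure sizes. Below are minimal
# sketches of the generic helpers, inferred from their call sites above —
# assumptions, not the project's actual implementations:
import os
from collections import Counter


def norm_cm(cm):
    # Row-normalize a confusion matrix so each true class sums to 1,
    # matching the fixed 0..1 colour scale of the heatmaps.
    return cm / cm.sum(axis=1, keepdims=True)


def ensure_dir(path):
    # Create the output directory if it does not already exist.
    os.makedirs(path, exist_ok=True)


def format_axes_for_cm(ax):
    # confusion_matrix puts true classes on rows, predictions on columns.
    ax.set_ylabel('True class')
    ax.set_xlabel('Predicted class')


def class_counter(y):
    # Relative class frequencies, matching the percentage formatting of the
    # 'Population: ...' suptitle.
    counts = Counter(y)
    total = sum(counts.values())
    return {label: n / total for label, n in counts.items()}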