def pca(X, y, outpath, **kwargs):
    # Create a new figure and axes
    _, ax = plt.subplots()

    viz = PCADecomposition(ax=ax, **kwargs)
    viz.fit_transform(X, y)
    viz.poof(outpath=outpath)
def generate_ordinal_diagnostics(x, y, current_best_model, label_type, diagnostic_image_path):
    x = np.array(x)
    y = np.array(y)
    kf = KFold(n_splits=10, shuffle=True)
    guesses = []
    for train_index, test_index in kf.split(x):
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = current_best_model[0].fit(X_train, y_train)
        for guess in zip(y_test.tolist(), model.predict(X_test).tolist()):
            guesses.append(guess)

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    if "VotingClassifier" not in str(current_best_model[0].__class__):
        visualizer = ResidualsPlot(current_best_model[0])
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.poof(outpath=diagnostic_image_path + "/residuals_plot.png")
        plt.clf()

        visualizer = PredictionError(current_best_model[0])
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.poof(outpath=diagnostic_image_path + "/prediction_error.png")
        plt.clf()

    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=2)
    visualizer.fit_transform(x, y)
    print(diagnostic_image_path + "/pca_2.png")
    visualizer.poof(outpath=diagnostic_image_path + "/pca_2.png")
    plt.clf()

    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=3)
    visualizer.fit_transform(x, y)
    visualizer.poof(outpath=diagnostic_image_path + "/pca_3.png")
    plt.clf()

    # guesses holds (actual, predicted) pairs pooled across all folds
    actual, predicted = np.array(guesses).transpose()
    return {
        "mse": mean_squared_error(actual, predicted),
        "r2": r2_score(actual, predicted),
        "mae": median_absolute_error(actual, predicted),
        "evs": explained_variance_score(actual, predicted),
        "rmse": np.sqrt(mean_squared_error(actual, predicted))
    }
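For comparison, a minimal sketch of the same pooled, cross-validated scoring using scikit-learn's cross_val_predict instead of a manual KFold loop; cv_regression_metrics, estimator, x, and y are illustrative names and not part of the function above.

import numpy as np
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import (mean_squared_error, r2_score,
                             median_absolute_error, explained_variance_score)

def cv_regression_metrics(estimator, x, y, n_splits=10):
    # pooled out-of-fold predictions, equivalent to collecting `guesses` by hand
    cv = KFold(n_splits=n_splits, shuffle=True)
    predicted = cross_val_predict(estimator, x, y, cv=cv)
    mse = mean_squared_error(y, predicted)
    return {
        "mse": mse,
        "r2": r2_score(y, predicted),
        "mae": median_absolute_error(y, predicted),
        "evs": explained_variance_score(y, predicted),
        "rmse": float(np.sqrt(mse)),
    }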
def project_pca(X):
    # colors = np.array(['r' if yi else 'b' for yi in y])
    vis = PCADecomposition(scale=True, proj_features=True, proj_dim=3)  # , color=colors)
    vis.fit_transform(X)
    vis.poof()
def pca(X, y, outpath, **kwargs):
    # Create a new figure and axes
    fig = plt.figure()
    ax = fig.add_subplot(111)

    viz = PCADecomposition(**kwargs)
    viz.fit_transform(X, y)
    viz.poof(outpath=outpath)
plt.plot(accuracy_history_test)
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('iteration (x500)')
plt.legend(['test'], loc='upper left')
plt.savefig("accuracy history.png")
plt.show()

from yellowbrick.features.pca import PCADecomposition
from yellowbrick.style.palettes import PALETTES, SEQUENCES, color_palette

# get a color for every class
palette = color_palette("reset")
colors = [palette[idx // test_set.shape[1]] for idx in range(len(pred_test))]

visualizer = PCADecomposition(scale=True, proj_dim=3, color=colors, size=(1080, 720))
visualizer.fit_transform(pred_test, colors)
visualizer.poof(outpath="./pca", dpi=300)


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = (cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]) * 100
        print("Normalized confusion matrix")
    else:
visualizer.fit(X, y)
visualizer.poof()

# %%
visualizer = FeatureCorrelation(method='mutual_info-classification')
visualizer.fit(X, y)
visualizer.poof()

# %%
visualizer = RadViz(classes=class_names)
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()

# %%
colors = np.array(['r' if yi else 'b' for yi in y])

visualizer = PCADecomposition(color=colors, proj_features=True)
visualizer.fit_transform(X, y)
visualizer.poof()

visualizer = PCADecomposition(scale=True, color=colors, proj_dim=3, proj_features=True)
visualizer.fit_transform(X, y)
visualizer.poof()

# %%
viz = FeatureImportances(GradientBoostingClassifier(), relative=False)
viz.fit(X, y)
viz.poof()
# shows how much each feature contributes to each principal component

# Scikitplot
# from scikitplot.decomposition import plot_pca_component_variance
#
# plot_pca_component_variance(pca_scaled.named_steps['pca'])
# plt.show()

# Yellowbrick
from yellowbrick.features.pca import PCADecomposition

colors = np.array(['r' if yi else 'b' for yi in y])

visualizer = PCADecomposition(scale=True, color=colors, proj_dim=3)  # change to 2 for a 2D projection
visualizer.fit_transform(X, y)
visualizer.poof()

# Biplot
visualizer = PCADecomposition(scale=True, proj_features=True, proj_dim=2)
visualizer.fit_transform(X, y)
visualizer.poof()

########## Regularization ##########
# if you used regularization, you no longer get an unbiased estimate
# project the model down into a lower-dimensional space

# Baseline
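A hedged sketch of the same per-component information read directly off a fitted sklearn PCA, alongside the commented-out scikitplot call; X_demo below is stand-in data, not the X used above.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X_demo = np.random.RandomState(0).rand(100, 5)      # stand-in for the X used above
X_scaled = StandardScaler().fit_transform(X_demo)   # analogue of scale=True
pca = PCA(n_components=3).fit(X_scaled)
print(pca.explained_variance_ratio_)  # share of variance captured by each component
print(np.abs(pca.components_))        # |loadings|: each feature's weight in each component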
def generate_binary_diagnostics(x, y, current_best_model, label_type, diagnostic_image_path):
    x = np.array(x)
    y = np.array(y)
    kf = KFold(n_splits=10, shuffle=True)
    guesses = []
    for train_index, test_index in kf.split(x):
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = current_best_model[0].fit(X_train, y_train)
        for guess in zip(y_test.tolist(), model.predict(X_test).tolist()):
            guesses.append(guess)

    conmat = {}
    if len(set(y)) == 2:
        tn, fp, fn, tp = confusion_matrix(
            *np.array(guesses).transpose()).ravel()
        conmat = {"tn": int(tn), "tp": int(tp), "fn": int(fn), "fp": int(fp)}
    else:
        # each guess is an (actual, predicted) pair, so count one-vs-rest per class
        for val in list(set(y)):
            tp = len([el for el in guesses if el[0] == val and el[1] == val])
            tn = len([el for el in guesses if el[0] != val and el[1] != val])
            fn = len([el for el in guesses if el[0] == val and el[1] != val])
            fp = len([el for el in guesses if el[0] != val and el[1] == val])
            conmat[str(val)] = {"tn": tn, "tp": tp, "fn": fn, "fp": fp}

    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=int(np.random.random() * 100))
    current_count = 0
    while current_count < 1000 and (sorted(set(y_train)) != sorted(set(y_test))
                                    or sorted(set(y_test)) != sorted(set(y))):
        X_train, X_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=int(np.random.random() * 100))
        current_count += 1

    # pickle.dump([current_best_model, list(set(y)), X_train, X_test, y_train, y_test],
    #             open(str(int(np.random.random() * 1000000)) + "_binary.pkl", "wb"))

    # visualizer = ROCAUC(current_best_model[0], classes=list(set(y)))
    # visualizer.fit(X_train, y_train)
    # visualizer.score(X_test, y_test)
    # visualizer.poof(outpath=diagnostic_image_path + "/roc_auc.png")
    # plt.clf()

    visualizer = ClassificationReport(current_best_model[0], classes=list(set(y)))
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.poof(outpath=diagnostic_image_path + "/classification_report.png")
    plt.clf()

    cm = ConfusionMatrix(current_best_model[0], classes=list(set(y)))
    cm.fit(X_train, y_train)
    cm.score(X_test, y_test)
    cm.poof(outpath=diagnostic_image_path + "/confusion_matrix.png")
    plt.clf()

    visualizer = ClassBalance(current_best_model[0], classes=list(set(y)))
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.poof(outpath=diagnostic_image_path + "/class_balance.png")
    plt.clf()

    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=2)
    visualizer.fit_transform(x, y)
    visualizer.poof(outpath=diagnostic_image_path + "/pca_2.png")
    plt.clf()

    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=3)
    visualizer.fit_transform(x, y)
    visualizer.poof(outpath=diagnostic_image_path + "/pca_3.png")
    plt.clf()

    # "auc": roc_auc_score(*np.array(guesses).transpose()) NEEDS TO BE FIXED.
    return {"accuracy": current_best_model[1], "confusion_matrix": conmat}
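A hedged cross-check for the manual one-vs-rest counts above: scikit-learn's multilabel_confusion_matrix returns the same per-class tn/fp/fn/tp tallies, as [[tn, fp], [fn, tp]] per class; the arrays below are toy data, not the function's variables.

import numpy as np
from sklearn.metrics import multilabel_confusion_matrix

actual = np.array(["a", "b", "a", "c", "b", "a"])     # hypothetical true labels
predicted = np.array(["a", "a", "a", "c", "b", "b"])  # hypothetical predictions
labels = sorted(set(actual))
for label, mat in zip(labels, multilabel_confusion_matrix(actual, predicted, labels=labels)):
    (tn, fp), (fn, tp) = mat
    print(label, {"tn": int(tn), "tp": int(tp), "fn": int(fn), "fp": int(fp)})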
                 barmode='group', height=400)
fig.update_yaxes(title_text="Model Metrics")
fig.update_layout(title_text="Model Performance")
fig.show()
# -

# ## Dimensionality Reduction

# ### PCA

from yellowbrick.features.pca import (
    PCADecomposition,
)

fig, ax = plt.subplots(figsize=(6, 4))
colors = ["rg"[j] for j in y_train['Bankrupt?']]
pca_viz = PCADecomposition(color=colors)
pca_viz.fit_transform(X_train_prepared, y_train['Bankrupt?'])
pca_viz.poof()

# Dimension Reduction using PCA
from sklearn.decomposition import PCA

pca = PCA(n_components=15)
X_train_prepared_PCA = pca.fit_transform(X_train_prepared)

# +
models = get_model()
names, results, result_df = bl_performance(X_train_prepared_PCA, y_train, models)
result_df.sort_values(by='F1', ascending=False, inplace=True)
plt.imshow(matriz, cmap=plt.cm.Blues, interpolation='nearest')
plt.title("Confusion matrix")
labels = ['positives', 'negatives']
marcador_escalas = range(len(labels))
plt.yticks(marcador_escalas, labels)
plt.xticks(marcador_escalas, labels)

for linha in range(matriz.shape[0]):
    for coluna in range(matriz.shape[1]):
        plt.text(coluna, linha, format(matriz[linha, coluna]),
                 horizontalalignment='center', color='black')

plt.show()

!pip install yellowbrick

from yellowbrick.features.pca import PCADecomposition

print("TRAINING DATA")
cores_treinamento = np.array(['r' if label == 0 else 'b' for label in rotulos_treinamento])
visualizador_treinamento = PCADecomposition(scale=True, color=cores_treinamento, proj_dim=3)
visualizador_treinamento.fit_transform(descritores, rotulos_treinamento)
visualizador_treinamento.poof()

print("TEST DATA")
cores_teste = np.array(['r' if label == 0 else 'b' for label in rotulos_teste])
visualizador_teste = PCADecomposition(scale=True, color=cores_teste, proj_dim=3)
visualizador_teste.fit_transform(img_teste_descritores, rotulos_teste)
visualizador_teste.poof()
def visualize_features(classes, problem_type, curdir, default_features, balance_data, test_size):

    # make features into label encoder here
    features, feature_labels, class_labels = get_features(
        classes, problem_type, default_features, balance_data)

    # now preprocess features for all the other plots
    os.chdir(curdir)

    le = preprocessing.LabelEncoder()
    le.fit(class_labels)
    tclass_labels = le.transform(class_labels)

    # process features to help with clustering
    se = preprocessing.StandardScaler()
    t_features = se.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features, tclass_labels,
                                                        test_size=test_size,
                                                        random_state=42)

    # print(len(features))
    # print(len(feature_labels))
    # print(len(class_labels))
    # print(class_labels)

    # GET TRAINING DATA DURING MODELING PROCESS
    ##################################
    # get filename
    # csvfile=''
    # print(classes)
    # for i in range(len(classes)):
    #     csvfile=csvfile+classes[i]+'_'

    # get training and testing data for later
    # try:
    #     print('loading training files...')
    #     X_train=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'train.csv')
    #     y_train=X_train['class_']
    #     X_train.drop(['class_'], axis=1)
    #     X_test=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'test.csv')
    #     y_test=X_test['class_']
    #     X_test.drop(['class_'], axis=1)
    #     y_train=le.inverse_transform(y_train)
    #     y_test=le.inverse_transform(y_test)
    # except:
    #     print('error loading in training files, making new test data')

    # Visualize each class (quick plot)
    ##################################
    visualization_dir = 'visualization_session'
    try:
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)
    except:
        shutil.rmtree(visualization_dir)
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)

    objects = tuple(set(class_labels))
    y_pos = np.arange(len(objects))
    performance = list()
    for i in range(len(objects)):
        performance.append(class_labels.count(objects[i]))

    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.xticks(rotation=90)
    plt.title('Counts per class')
    plt.ylabel('Count')
    plt.xlabel('Class')
    plt.tight_layout()
    plt.savefig('classes.png')
    plt.close()

    # set current directory
    curdir = os.getcwd()

    # ##################################
    # # CLUSTERING!!!
    # ##################################

    ##################################
    # Manifold type options
    ##################################
    '''
    "lle"      Locally Linear Embedding (LLE) uses many local linear decompositions
               to preserve globally non-linear structures.
    "ltsa"     LTSA LLE: local tangent space alignment is similar to LLE in that it
               uses locality to preserve neighborhood distances.
    "hessian"  Hessian LLE: an LLE regularization method that applies a hessian-based
               quadratic form at each neighborhood.
    "modified" Modified LLE applies a regularization parameter to LLE.
    "isomap"   Isomap seeks a lower dimensional embedding that maintains geometric
               distances between each instance.
    "mds"      MDS: multi-dimensional scaling uses similarity to plot points that are
               near to each other close in the embedding.
    "spectral" Spectral Embedding: a discrete approximation of the low dimensional
               manifold using a graph representation.
    "tsne"     (default) t-SNE: converts the similarity of points into probabilities,
               then uses those probabilities to create an embedding.
    '''

    os.mkdir('clustering')
    os.chdir('clustering')

    # tSNE
    plt.figure()
    viz = Manifold(manifold="tsne", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="tsne.png")
    plt.close()
    # os.system('open tsne.png')
    # viz.show()

    # PCA
    plt.figure()
    visualizer = PCADecomposition(scale=True, classes=set(classes))
    visualizer.fit_transform(np.array(features), tclass_labels)
    visualizer.poof(outpath="pca.png")
    plt.close()
    # os.system('open pca.png')

    # spectral embedding
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # lle embedding
    plt.figure()
    viz = Manifold(manifold="lle", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="lle.png")
    plt.close()

    # ltsa
    # plt.figure()
    # viz = Manifold(manifold="ltsa", classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="ltsa.png")
    # plt.close()

    # hessian
    # plt.figure()
    # viz = Manifold(manifold="hessian", method='dense', classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="hessian.png")
    # plt.close()

    # modified
    plt.figure()
    viz = Manifold(manifold="modified", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="modified.png")
    plt.close()

    # isomap
    plt.figure()
    viz = Manifold(manifold="isomap", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="isomap.png")
    plt.close()

    # mds
    plt.figure()
    viz = Manifold(manifold="mds", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="mds.png")
    plt.close()

    # spectral
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # UMAP embedding
    plt.figure()
    umap = UMAPVisualizer(metric='cosine', classes=set(classes), title="UMAP embedding")
    umap.fit_transform(np.array(features), class_labels)
    umap.poof(outpath="umap.png")
    plt.close()

    # alternative UMAP
    # import umap.plot
    # plt.figure()
    # mapper = umap.UMAP().fit(np.array(features))
    # fig = umap.plot.points(mapper, labels=np.array(tclass_labels))
    # fig = fig.get_figure()
    # fig.tight_layout()
    # fig.savefig('umap2.png')
    # plt.close(fig)

    #################################
    # FEATURE RANKING!!
    #################################
    os.chdir(curdir)
    os.mkdir('feature_ranking')
    os.chdir('feature_ranking')

    # You can get the feature importance of each feature of your dataset
    # by using the feature importance property of the model.
    plt.figure(figsize=(12, 12))
    model = ExtraTreesClassifier()
    model.fit(np.array(features), tclass_labels)
    # print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_, index=feature_labels[0])
    feat_importances.nlargest(20).plot(kind='barh')
    plt.title('Feature importances (ExtraTrees)', size=16)
    plt.title('Feature importances with %s features' % (str(len(features[0]))))
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    # os.system('open feature_importance.png')

    # get selected labels for top 20 features
    selectedlabels = list(dict(feat_importances.nlargest(20)))
    new_features, new_labels = restructure_features(selectedlabels, t_features, feature_labels[0])
    new_features_, new_labels_ = restructure_features(selectedlabels, features, feature_labels[0])

    # Shapiro rank algorithm (1D)
    plt.figure(figsize=(28, 12))
    visualizer = Rank1D(algorithm='shapiro', classes=set(classes), features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    # plt.tight_layout()
    visualizer.poof(outpath="shapiro.png")
    plt.title('Shapiro plot (top 20 features)', size=16)
    plt.close()
    # os.system('open shapiro.png')
    # visualizer.show()

    # pearson ranking algorithm (2D)
    plt.figure(figsize=(12, 12))
    visualizer = Rank2D(algorithm='pearson', classes=set(classes), features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    plt.tight_layout()
    visualizer.poof(outpath="pearson.png")
    plt.title('Pearson ranking plot (top 20 features)', size=16)
    plt.close()
    # os.system('open pearson.png')
    # visualizer.show()

    # feature importances with top 20 features for Lasso
    plt.figure(figsize=(12, 12))
    viz = FeatureImportances(Lasso(), labels=new_labels_)
    viz.fit(np.array(new_features_), tclass_labels)
    plt.tight_layout()
    viz.poof(outpath="lasso.png")
    plt.close()

    # correlation plots with feature removal if corr > 0.90
    # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf
    # now remove correlated features
    # --> p values
    # --> https://towardsdatascience.com/the-next-level-of-data-visualization-in-python-dd6e99039d5e /
    #     https://github.com/WillKoehrsen/Data-Analysis/blob/master/plotly/Plotly%20Whirlwind%20Introduction.ipynb
    #     - plotly for correlation heatmap and scatterplot matrix
    # --> https://seaborn.pydata.org/tutorial/distributions.html
    data = new_features
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap with correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap.png')
    plt.close(fig)

    columns = np.full((corr.shape[0], ), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= 0.9:
                if columns[j]:
                    columns[j] = False

    selected_columns = data.columns[columns]
    data = data[selected_columns]
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap without correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap_clean.png')
    plt.close(fig)

    # radviz
    # Instantiate the visualizer
    plt.figure(figsize=(12, 12))
    visualizer = RadViz(classes=classes, features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    visualizer.poof(outpath="radviz.png")
    visualizer.show()
    plt.close()

    # feature correlation plot
    plt.figure(figsize=(28, 12))
    visualizer = feature_correlation(np.array(new_features), tclass_labels, labels=new_labels)
    visualizer.poof(outpath="correlation.png")
    visualizer.show()
    plt.tight_layout()
    plt.close()

    os.mkdir('feature_plots')
    os.chdir('feature_plots')

    newdata = new_features_
    newdata['classes'] = class_labels

    for j in range(len(new_labels_)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels_[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels_[j]))
        plt.close(fig)

    os.mkdir('feature_plots_transformed')
    os.chdir('feature_plots_transformed')

    newdata = new_features
    newdata['classes'] = class_labels

    for j in range(len(new_labels)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels[j]))
        plt.close(fig)

    ##################################################
    # PRECISION-RECALL CURVES
    ##################################################
    os.chdir(curdir)
    os.mkdir('model_selection')
    os.chdir('model_selection')

    plt.figure()
    visualizer = precision_recall_curve(GaussianNB(), np.array(features), tclass_labels)
    visualizer.poof(outpath="precision-recall.png")
    plt.close()

    plt.figure()
    visualizer = roc_auc(LogisticRegression(), np.array(features), tclass_labels)
    visualizer.poof(outpath="roc_curve_train.png")
    plt.close()

    plt.figure()
    visualizer = discrimination_threshold(
        LogisticRegression(multi_class="auto", solver="liblinear"),
        np.array(features), tclass_labels)
    visualizer.poof(outpath="thresholds.png")
    plt.close()

    plt.figure()
    visualizer = residuals_plot(Ridge(), np.array(features), tclass_labels,
                                train_color="maroon", test_color="gold")
    visualizer.poof(outpath="residuals.png")
    plt.close()

    plt.figure()
    visualizer = prediction_error(Lasso(), np.array(features), tclass_labels)
    visualizer.poof(outpath='prediction_error.png')
    plt.close()

    # outlier detection
    plt.figure()
    visualizer = cooks_distance(np.array(features), tclass_labels,
                                draw_threshold=True, linefmt="C0-", markerfmt=",")
    visualizer.poof(outpath='outliers.png')
    plt.close()

    # cluster numbers
    plt.figure()
    visualizer = silhouette_visualizer(
        KMeans(len(set(tclass_labels)), random_state=42), np.array(features))
    visualizer.poof(outpath='siloutte.png')
    plt.close()

    # cluster distance
    plt.figure()
    visualizer = intercluster_distance(
        KMeans(len(set(tclass_labels)), random_state=777), np.array(features))
    visualizer.poof(outpath='cluster_distance.png')
    plt.close()

    # plot percentile of features plot with SVM to see which percentile for features is optimal
    features = preprocessing.MinMaxScaler().fit_transform(features)

    clf = Pipeline([('anova', SelectPercentile(chi2)),
                    ('scaler', StandardScaler()),
                    ('logr', LogisticRegression())])
    score_means = list()
    score_stds = list()
    percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100)

    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, np.array(features), class_labels)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title(
        'Performance of the LogisticRegression-Anova varying the percent features selected')
    plt.xticks(np.linspace(0, 100, 11, endpoint=True))
    plt.xlabel('Percentile')
    plt.ylabel('Accuracy Score')
    plt.axis('tight')
    plt.savefig('logr_percentile_plot.png')
    plt.close()

    # get PCA
    pca = PCA(random_state=1)
    pca.fit(X_train)
    skplt.decomposition.plot_pca_component_variance(pca)
    plt.savefig('pca_explained_variance.png')
    plt.close()
    # estimators
    rf = RandomForestClassifier()
    skplt.estimators.plot_learning_curve(rf, X_train, y_train)
    plt.title('Learning Curve (Random Forest)')
    plt.savefig('learning_curve.png')
    plt.close()

    # elbow plot
    kmeans = KMeans(random_state=1)
    skplt.cluster.plot_elbow_curve(kmeans, X_train, cluster_ranges=range(1, 30),
                                   title='Elbow plot (KMeans clustering)')
    plt.savefig('elbow.png')
    plt.close()

    # KS statistic (only if 2 classes)
    lr = LogisticRegression()
    lr = lr.fit(X_train, y_train)
    y_probas = lr.predict_proba(X_test)
    skplt.metrics.plot_ks_statistic(y_test, y_probas)
    plt.savefig('ks.png')
    plt.close()

    # precision-recall
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_probas = nb.predict_proba(X_test)
    skplt.metrics.plot_precision_recall(y_test, y_probas)
    plt.tight_layout()
    plt.savefig('precision-recall.png')
    plt.close()

    ## plot calibration curve
    rf = RandomForestClassifier()
    lr = LogisticRegression()
    nb = GaussianNB()
    svm = LinearSVC()
    dt = DecisionTreeClassifier(random_state=0)
    ab = AdaBoostClassifier(n_estimators=100)
    gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                    max_depth=1, random_state=0)
    knn = KNeighborsClassifier(n_neighbors=7)

    rf_probas = rf.fit(X_train, y_train).predict_proba(X_test)
    lr_probas = lr.fit(X_train, y_train).predict_proba(X_test)
    nb_probas = nb.fit(X_train, y_train).predict_proba(X_test)
    # svm_scores = svm.fit(X_train, y_train).predict_proba(X_test)
    dt_scores = dt.fit(X_train, y_train).predict_proba(X_test)
    ab_scores = ab.fit(X_train, y_train).predict_proba(X_test)
    gb_scores = gb.fit(X_train, y_train).predict_proba(X_test)
    knn_scores = knn.fit(X_train, y_train).predict_proba(X_test)

    probas_list = [
        rf_probas, lr_probas, nb_probas,
        # svm_scores,
        dt_scores, ab_scores, gb_scores, knn_scores
    ]

    clf_names = [
        'Random Forest', 'Logistic Regression', 'Gaussian NB',
        # 'SVM',
        'Decision Tree', 'Adaboost', 'Gradient Boost', 'KNN'
    ]

    skplt.metrics.plot_calibration_curve(y_test, probas_list, clf_names)
    plt.savefig('calibration.png')
    plt.tight_layout()
    plt.close()

    # pick classifier type by ROC (without optimization)
    probs = [
        rf_probas[:, 1], lr_probas[:, 1], nb_probas[:, 1],
        # svm_scores[:, 1],
        dt_scores[:, 1], ab_scores[:, 1], gb_scores[:, 1], knn_scores[:, 1]
    ]

    plot_roc_curve(y_test, probs, clf_names)

    # more elaborate ROC example with CV = 5 fold
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py

    os.chdir(curdir)

    return ''
def showPCAProjection():
    # Load the classification data set
    data = load_data('credit')

    # Specify the features of interest
    features = [
        'limit', 'sex', 'edu', 'married', 'age', 'apr_delay', 'may_delay',
        'jun_delay', 'jul_delay', 'aug_delay', 'sep_delay', 'apr_bill', 'may_bill',
        'jun_bill', 'jul_bill', 'aug_bill', 'sep_bill', 'apr_pay', 'may_pay',
        'jun_pay', 'jul_pay', 'aug_pay', 'sep_pay',
    ]

    # Extract the numpy arrays from the data frame
    X = data[features].as_matrix()
    y = data.default.as_matrix()

    visualizer = PCADecomposition(scale=True, center=False, col=y)
    visualizer.fit_transform(X, y)
    visualizer.poof()

    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=3)
    visualizer.fit_transform(X, y)
    visualizer.poof()
def pca_visualization(self, path=None, fileName=None, save=False, scale=True, center=False):
    visualizer = PCADecomposition(scale=scale, center=center, color=self.y, title=self.title)
    visualizer.fit_transform(self.x, self.y)
    self.poof(visualizer, path, fileName, save)