Esempio n. 1
0
def pca(X, y, outpath, **kwargs):
    """Draw a PCA decomposition of ``X`` on a new figure and write it to ``outpath``.

    Any extra keyword arguments are forwarded to ``PCADecomposition``.
    """
    # Only the axes of the freshly created figure are needed.
    axes = plt.subplots()[1]

    decomposition = PCADecomposition(ax=axes, **kwargs)
    decomposition.fit_transform(X, y)
    decomposition.poof(outpath=outpath)
def generate_ordinal_diagnostics(x, y, current_best_model, label_type,
                                 diagnostic_image_path):
    """Cross-validate a regressor, save diagnostic plots and return error metrics.

    Args:
        x: Feature rows; converted with ``np.array``.
        y: Target values aligned with ``x``; converted with ``np.array``.
        current_best_model: Indexable whose item 0 is the estimator to evaluate.
        label_type: Not used by this function.
        diagnostic_image_path: Directory prefix for the PNG files written here.

    Returns:
        dict with keys "mse", "r2", "mae", "evs" and "rmse", computed from the
        held-out predictions of a shuffled 10-fold split. "mae" holds the
        value of ``median_absolute_error``.
    """
    x = np.array(x)
    y = np.array(y)
    estimator = current_best_model[0]

    # Gather (actual, predicted) pairs from every held-out fold.
    guesses = []
    for train_index, test_index in KFold(n_splits=10, shuffle=True).split(x):
        fitted = estimator.fit(x[train_index], y[train_index])
        guesses.extend(
            zip(y[test_index].tolist(),
                fitted.predict(x[test_index]).tolist()))

    # A separate 80/20 split feeds the fit/score based visualizers.
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # Residual and prediction-error plots are skipped for voting ensembles.
    if "VotingClassifier" not in str(estimator.__class__):
        for plot_cls, file_name in ((ResidualsPlot, "/residuals_plot.png"),
                                    (PredictionError, "/prediction_error.png")):
            plot = plot_cls(estimator)
            plot.fit(X_train, y_train)
            plot.score(X_test, y_test)
            plot.poof(outpath=diagnostic_image_path + file_name)
            plt.clf()

    # 2-D PCA projection of the full data set.
    flat_projection = PCADecomposition(
        scale=True, center=False, col=y, proj_dim=2)
    flat_projection.fit_transform(x, y)
    print(diagnostic_image_path + "/pca_2.png")
    flat_projection.poof(outpath=diagnostic_image_path + "/pca_2.png")
    plt.clf()

    # 3-D PCA projection of the full data set.
    solid_projection = PCADecomposition(
        scale=True, center=False, col=y, proj_dim=3)
    solid_projection.fit_transform(x, y)
    solid_projection.poof(outpath=diagnostic_image_path + "/pca_3.png")
    plt.clf()

    # Row 0 holds the actual values, row 1 the predictions.
    pairs = np.array(guesses).transpose()
    mse = mean_squared_error(*pairs)
    r2 = r2_score(*pairs)
    mae = median_absolute_error(*pairs)
    evs = explained_variance_score(*pairs)
    rmse = np.sqrt(mean_squared_error(*pairs))
    return {"mse": mse, "r2": r2, "mae": mae, "evs": evs, "rmse": rmse}
Esempio n. 3
0
def project_pca(X):
    """Display a scaled 3-D PCA projection of ``X`` with feature vectors drawn.

    No labels are passed in, so the points are not coloured per class.
    """
    options = {"scale": True, "proj_features": True, "proj_dim": 3}
    biplot = PCADecomposition(**options)
    biplot.fit_transform(X)
    biplot.poof()
Esempio n. 4
0
def pca(X, y, outpath, **kwargs):
    """Fit a ``PCADecomposition`` on ``(X, y)`` and save the plot to ``outpath``.

    The visualizer draws on the axes of a newly created figure; remaining
    keyword arguments are passed through to ``PCADecomposition``.
    """
    _figure, target_axes = plt.subplots()

    projector = PCADecomposition(ax=target_axes, **kwargs)
    projector.fit_transform(X, y)
    projector.poof(outpath=outpath)
Esempio n. 5
0
def pca(X, y, outpath, **kwargs):
    """Fit a ``PCADecomposition`` on ``(X, y)`` and save the plot to ``outpath``.

    Keyword arguments are forwarded to ``PCADecomposition`` unchanged.
    """
    # A new figure with a single subplot is created first. The axes object is
    # not handed to the visualizer explicitly.
    # NOTE(review): presumably the visualizer draws on the current axes -- confirm.
    plt.figure().add_subplot(111)

    projector = PCADecomposition(**kwargs)
    projector.fit_transform(X, y)
    projector.poof(outpath=outpath)
# Plot the recorded test accuracy history, save it, then display it.
plt.plot(accuracy_history_test)
plt.title('model accuracy')
# The y axis shows accuracy, matching the title and the plotted series.
plt.ylabel('accuracy')
plt.xlabel("500th iteration")
plt.legend(['test'], loc='upper left')
# Save before show() so the written file contains the finished figure.
plt.savefig("accuracy history.png")
plt.show()

from yellowbrick.features.pca import PCADecomposition
from yellowbrick.style.palettes import PALETTES, SEQUENCES, color_palette

# One palette colour per block of ``test_set.shape[1]`` consecutive predictions.
pallette = color_palette("reset")
colors = [
    pallette[idx // test_set.shape[1]] for idx in range(len(pred_test))
]

# 3-D PCA projection of the predictions; the colour list doubles as the target.
visualizer = PCADecomposition(
    scale=True,
    proj_dim=3,
    color=colors,
    size=(1080, 720),
)
visualizer.fit_transform(pred_test, colors)
visualizer.poof(outpath="./pca", dpi=300)

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = (cm.astype('float') / cm.sum(axis=1)[:, np.newaxis])*100
        print("Normalized confusion matrix")
    else:
Esempio n. 7
0
# Fit and render a visualizer that was created before this excerpt begins.
visualizer.fit(X, y)
visualizer.poof()

# %%
# Feature/target correlation using the 'mutual_info-classification' method.
visualizer = FeatureCorrelation(method='mutual_info-classification')
visualizer.fit(X, y)
visualizer.poof()

# %%
# RadViz plot labelled with `class_names`.
visualizer = RadViz(classes=class_names)
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()

# %%
# Colour each sample by label truthiness: 'r' when truthy, 'b' otherwise.
colors = np.array(['r' if yi else 'b' for yi in y])
# PCA plot with the feature vectors projected (default proj_dim, no scaling
# argument passed).
visualizer = PCADecomposition(color=colors, proj_features=True)
visualizer.fit_transform(X, y)
visualizer.poof()
# Same colours again, this time scaled and projected onto three components.
visualizer = PCADecomposition(scale=True,
                              color=colors,
                              proj_dim=3,
                              proj_features=True)
visualizer.fit_transform(X, y)
visualizer.poof()

# %%
# Absolute (relative=False) feature importances from a gradient boosting model.
viz = FeatureImportances(GradientBoostingClassifier(), relative=False)
viz.fit(X, y)
viz.poof()
Esempio n. 8
0
# shows how much each feature contributes to each principal component

# Scikitplot

# from scikitplot.decomposition import plot_pca_component_variance
#
# plot_pca_component_variance(pca_scaled.named_steps['pca'])
# plt.show()

# Yellowbrick

from yellowbrick.features.pca import PCADecomposition

# Colour each sample by label truthiness: 'r' when truthy, 'b' otherwise.
colors = np.array(['r' if yi else 'b' for yi in y])

# Scaled 3-D PCA projection coloured by class.
visualizer = PCADecomposition(scale=True, color=colors,
                              proj_dim=3)  # change to 2 for a 2-D projection
visualizer.fit_transform(X, y)
visualizer.poof()

# Biplot: 2-D projection that also draws the feature vectors
# (proj_features=True); no per-sample colours are passed here.
visualizer = PCADecomposition(scale=True, proj_features=True, proj_dim=2)
visualizer.fit_transform(X, y)
visualizer.poof()

########## Regularization ##########

# if you used regularization, you no longer get an unbiased estimate
# project the model down into a lower-dimensional space

# Baseline
def generate_binary_diagnostics(x, y, current_best_model, label_type,
                                diagnostic_image_path):
    """Cross-validate a classifier, save diagnostic plots and summarise its errors.

    Args:
        x: Feature rows; converted with ``np.array``.
        y: Class labels aligned with ``x``; converted with ``np.array``.
        current_best_model: Indexable whose item 0 is the estimator and whose
            item 1 is reported back unchanged as the accuracy.
        label_type: Not used by this function; kept for interface compatibility.
        diagnostic_image_path: Directory prefix for the PNG files written here.

    Returns:
        dict with "accuracy" (``current_best_model[1]``) and "confusion_matrix".
        With exactly two classes the matrix is ``{"tn", "tp", "fn", "fp"}``;
        otherwise it maps ``str(label)`` to one-vs-rest counts using the same
        four keys.
    """
    x = np.array(x)
    y = np.array(y)
    estimator = current_best_model[0]
    # Sorted unique labels. A plain ``set`` has no guaranteed order, and the
    # visualizers' ``classes`` argument is expected in sorted-label order.
    labels = np.unique(y).tolist()

    # Gather (actual, predicted) pairs from every held-out fold.
    guesses = []
    for train_index, test_index in KFold(n_splits=10, shuffle=True).split(x):
        fitted = estimator.fit(x[train_index], y[train_index])
        guesses.extend(
            zip(y[test_index].tolist(),
                fitted.predict(x[test_index]).tolist()))

    conmat = {}
    if len(labels) == 2:
        # Row 0 is the truth and row 1 the prediction, so this is
        # confusion_matrix(y_true, y_pred).
        tn, fp, fn, tp = confusion_matrix(
            *np.array(guesses).transpose()).ravel()
        conmat = {"tn": int(tn), "tp": int(tp), "fn": int(fn), "fp": int(fp)}
    else:
        # One-vs-rest counts per label. A sample whose true label is ``val``
        # but was predicted as something else is a false NEGATIVE for ``val``;
        # a sample predicted as ``val`` whose true label differs is a false
        # POSITIVE.
        for val in labels:
            tp = fp = fn = tn = 0
            for actual, predicted in guesses:
                if actual == val:
                    if predicted == val:
                        tp += 1
                    else:
                        fn += 1
                elif predicted == val:
                    fp += 1
                else:
                    tn += 1
            conmat[str(val)] = {"tn": tn, "tp": tp, "fn": fn, "fp": fp}

    # Draw an 80/20 split, retrying (one initial draw plus at most 1000
    # re-draws) until both halves contain every label; the last draw is used
    # if none qualifies.
    for _ in range(1001):
        X_train, X_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=int(np.random.random() * 100))
        if (np.unique(y_train).tolist() == labels
                and np.unique(y_test).tolist() == labels):
            break

    # ROC/AUC output is currently disabled: both the ROCAUC plot and the
    # roc_auc_score metric still need fixing before they can be re-enabled.
    for plot_cls, file_name in (
            (ClassificationReport, "/classification_report.png"),
            (ConfusionMatrix, "/confusion_matrix.png"),
            (ClassBalance, "/class_balance.png")):
        plot = plot_cls(estimator, classes=labels)
        plot.fit(X_train, y_train)
        plot.score(X_test, y_test)
        plot.poof(outpath=diagnostic_image_path + file_name)
        plt.clf()

    # 2-D and 3-D PCA projections of the full data set.
    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=2)
    visualizer.fit_transform(x, y)
    visualizer.poof(outpath=diagnostic_image_path + "/pca_2.png")
    plt.clf()
    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=3)
    visualizer.fit_transform(x, y)
    visualizer.poof(outpath=diagnostic_image_path + "/pca_3.png")
    plt.clf()

    return {"accuracy": current_best_model[1], "confusion_matrix": conmat}
Esempio n. 10
0
             barmode='group',
             height=400)

# Label the y axis, title the figure, then display it. `fig` is built by the
# call that ends just above.
fig.update_yaxes(title_text="Model Metrics")
fig.update_layout(title_text="Model Performance")
fig.show()
# -

# ## Dimensionality Reduction
# ### PCA

from yellowbrick.features.pca import (
    PCADecomposition, )
# Create a 6x4 figure. Neither `fig` nor `ax` is passed to the visualizer
# below.
fig, ax = plt.subplots(figsize=(6, 4))
# Index into "rg" with each label: 0 -> 'r', 1 -> 'g'.
colors = ["rg"[j] for j in y_train['Bankrupt?']]
# PCA projection of the prepared training features, coloured by label.
pca_viz = PCADecomposition(color=colors)
pca_viz.fit_transform(X_train_prepared, y_train['Bankrupt?'])
pca_viz.poof()

# Dimension Reduction using PCA
from sklearn.decomposition import PCA
# Reduce the prepared training features to 15 principal components.
pca = PCA(n_components=15)
X_train_prepared_PCA = pca.fit_transform(X_train_prepared)

# +
# Evaluate the models returned by get_model() on the PCA-reduced features.
models = get_model()
names, results, result_df = bl_performance(X_train_prepared_PCA, y_train,
                                           models)

# Sort the results table in place by the 'F1' column, highest first.
result_df.sort_values(by='F1', ascending=False, inplace=True)
# Render the matrix `matriz` as a blue heat map.
plt.imshow(matriz, cmap=plt.cm.Blues, interpolation='nearest')
plt.title("Matriz de confusão")

# Tick labels used on both axes.
labels = ['positivos', 'negativos']

# One tick position per label.
marcador_escalas = range(len(labels))

plt.yticks(marcador_escalas, labels)
plt.xticks(marcador_escalas, labels)

# Write each cell's value, horizontally centred, in its cell (x = column,
# y = row).
for linha in range(matriz.shape[0]):
    for coluna in range(matriz.shape[1]):
        plt.text(coluna, linha, format(matriz[linha,coluna]), horizontalalignment='center', color='black')
plt.show()

!pip install yellowbrick

from yellowbrick.features.pca import PCADecomposition

# Training data: scaled 3-D PCA projection, label 0 in red and any other
# label in blue.
print("DADOS TREINAMENTO")
cores_treinamento = np.array(['r' if label==0 else 'b' for label in rotulos_treinamento])
visualizador_treinamento = PCADecomposition(scale=True, color= cores_treinamento, proj_dim=3)
visualizador_treinamento.fit_transform(descritores, rotulos_treinamento)
visualizador_treinamento.poof()

# Test data: same projection and colour scheme.
print("DADOS TESTE")

cores_teste = np.array(['r' if label == 0 else 'b' for label in rotulos_teste])
visualizador_teste = PCADecomposition(scale=True, color=cores_teste, proj_dim=3)
visualizador_teste.fit_transform(img_teste_descritores, rotulos_teste)
visualizador_teste.poof()
Esempio n. 12
0
def visualize_features(classes, problem_type, curdir, default_features,
                       balance_data, test_size):
    """Write a batch of feature, clustering and model-selection plots to disk.

    Features and labels come from ``get_features(classes, problem_type,
    default_features, balance_data)``. The function changes into ``curdir``,
    (re)creates a ``visualization_session`` directory there and saves PNG
    files into it and into the sub-directories ``clustering``,
    ``feature_ranking`` (with nested ``feature_plots`` folders) and
    ``model_selection``.

    Args:
        classes: Class names; passed to ``get_features`` and, as a set or
            list, to the visualizers' ``classes`` argument.
        problem_type: Forwarded to ``get_features``.
        curdir: Directory to work in. The local name is later rebound to the
            session directory.
        default_features: Forwarded to ``get_features``.
        balance_data: Forwarded to ``get_features``.
        test_size: ``test_size`` for the train/test split.

    Returns:
        The empty string ``''``.

    The process working directory is changed repeatedly and is left at the
    session directory on return.
    """

    # make features into label encoder here
    features, feature_labels, class_labels = get_features(
        classes, problem_type, default_features, balance_data)

    # now preprocess features for all the other plots
    os.chdir(curdir)
    # Encode the class labels for the estimators and visualizers below.
    le = preprocessing.LabelEncoder()
    le.fit(class_labels)
    tclass_labels = le.transform(class_labels)

    # process features to help with clustering
    # (t_features is only consumed by restructure_features further down)
    se = preprocessing.StandardScaler()
    t_features = se.fit_transform(features)

    # The split uses the unscaled `features`, not `t_features`.
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        tclass_labels,
                                                        test_size=test_size,
                                                        random_state=42)

    # print(len(features))
    # print(len(feature_labels))
    # print(len(class_labels))
    # print(class_labels)

    # GET TRAINING DATA DURING MODELING PROCESS
    ##################################
    # get filename
    # csvfile=''
    # print(classes)
    # for i in range(len(classes)):
    # 	csvfile=csvfile+classes[i]+'_'

    # get training and testing data for later
    # try:
    # print('loading training files...')
    # X_train=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'train.csv')
    # y_train=X_train['class_']
    # X_train.drop(['class_'], axis=1)
    # X_test=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'test.csv')
    # y_test=X_test['class_']
    # X_test.drop(['class_'], axis=1)
    # y_train=le.inverse_transform(y_train)
    # y_test=le.inverse_transform(y_test)
    # except:
    # print('error loading in training files, making new test data')

    # Visualize each class (quick plot)
    ##################################
    visualization_dir = 'visualization_session'
    # Create the session directory; on any failure, delete it and start over.
    # NOTE(review): the bare `except` also catches errors unrelated to an
    # already-existing directory -- consider narrowing it.
    try:
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)
    except:
        shutil.rmtree(visualization_dir)
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)

    # Count the samples per distinct class label for the bar chart.
    # `class_labels` must provide `.count()`.
    objects = tuple(set(class_labels))
    y_pos = np.arange(len(objects))
    performance = list()
    for i in range(len(objects)):
        performance.append(class_labels.count(objects[i]))

    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.xticks(rotation=90)
    plt.title('Counts per class')
    plt.ylabel('Count')
    plt.xlabel('Class')
    plt.tight_layout()
    plt.savefig('classes.png')
    plt.close()

    # set current directory
    # From here on `curdir` names the session directory, not the argument
    # passed in, so later os.chdir(curdir) calls return to the session folder.
    curdir = os.getcwd()

    # ##################################
    # # CLUSTERING!!!
    # ##################################

    ##################################
    # Manifold type options
    ##################################
    # The bare string below is an expression statement used as a block comment.
    '''
		"lle"
		Locally Linear Embedding (LLE) uses many local linear decompositions to preserve globally non-linear structures.
		"ltsa"
		LTSA LLE: local tangent space alignment is similar to LLE in that it uses locality to preserve neighborhood distances.
		"hessian"
		Hessian LLE an LLE regularization method that applies a hessian-based quadratic form at each neighborhood
		"modified"
		Modified LLE applies a regularization parameter to LLE.
		"isomap"
		Isomap seeks a lower dimensional embedding that maintains geometric distances between each instance.
		"mds"
		MDS: multi-dimensional scaling uses similarity to plot points that are near to each other close in the embedding.
		"spectral"
		Spectral Embedding a discrete approximation of the low dimensional manifold using a graph representation.
		"tsne" (default)
		t-SNE: converts the similarity of points into probabilities then uses those probabilities to create an embedding.
	'''
    os.mkdir('clustering')
    os.chdir('clustering')

    # tSNE
    plt.figure()
    viz = Manifold(manifold="tsne", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="tsne.png")
    plt.close()
    # os.system('open tsne.png')
    # viz.show()

    # PCA
    plt.figure()
    visualizer = PCADecomposition(scale=True, classes=set(classes))
    visualizer.fit_transform(np.array(features), tclass_labels)
    visualizer.poof(outpath="pca.png")
    plt.close()
    # os.system('open pca.png')

    # spectral embedding
    # NOTE(review): the same embedding is generated again under "# spectral"
    # below and written to the same 'spectral.png'.
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # lle embedding
    plt.figure()
    viz = Manifold(manifold="lle", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="lle.png")
    plt.close()

    # ltsa
    # plt.figure()
    # viz = Manifold(manifold="ltsa", classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="ltsa.png")
    # plt.close()

    # hessian
    # plt.figure()
    # viz = Manifold(manifold="hessian", method='dense', classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="hessian.png")
    # plt.close()

    # modified
    plt.figure()
    viz = Manifold(manifold="modified", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="modified.png")
    plt.close()

    # isomap
    plt.figure()
    viz = Manifold(manifold="isomap", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="isomap.png")
    plt.close()

    # mds
    plt.figure()
    viz = Manifold(manifold="mds", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="mds.png")
    plt.close()

    # spectral (second run; overwrites the 'spectral.png' written earlier)
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # UMAP embedding
    # NOTE(review): this is the only visualizer fitted on the raw
    # `class_labels` instead of the encoded `tclass_labels` -- confirm intent.
    plt.figure()
    umap = UMAPVisualizer(metric='cosine',
                          classes=set(classes),
                          title="UMAP embedding")
    umap.fit_transform(np.array(features), class_labels)
    umap.poof(outpath="umap.png")
    plt.close()

    # alternative UMAP
    # import umap.plot
    # plt.figure()
    # mapper = umap.UMAP().fit(np.array(features))
    # fig=umap.plot.points(mapper, labels=np.array(tclass_labels))
    # fig = fig.get_figure()
    # fig.tight_layout()
    # fig.savefig('umap2.png')
    # plt.close(fig)

    #################################
    # 	  FEATURE RANKING!!
    #################################
    # Back to the session directory (see the `curdir` rebinding above).
    os.chdir(curdir)
    os.mkdir('feature_ranking')
    os.chdir('feature_ranking')

    # You can get the feature importance of each feature of your dataset
    # by using the feature importance property of the model.
    plt.figure(figsize=(12, 12))
    model = ExtraTreesClassifier()
    model.fit(np.array(features), tclass_labels)
    # print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_,
                                 index=feature_labels[0])
    feat_importances.nlargest(20).plot(kind='barh')
    # NOTE(review): the second plt.title call replaces the first, so only the
    # "with %s features" title is kept.
    plt.title('Feature importances (ExtraTrees)', size=16)
    plt.title('Feature importances with %s features' % (str(len(features[0]))))
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    # os.system('open feature_importance.png')

    # get selected labels for top 20 features
    selectedlabels = list(dict(feat_importances.nlargest(20)))
    # new_features / new_labels come from the standardized features;
    # new_features_ / new_labels_ come from the raw features.
    new_features, new_labels = restructure_features(selectedlabels, t_features,
                                                    feature_labels[0])
    new_features_, new_labels_ = restructure_features(selectedlabels, features,
                                                      feature_labels[0])

    # Shapiro rank algorithm (1D)
    plt.figure(figsize=(28, 12))
    visualizer = Rank1D(algorithm='shapiro',
                        classes=set(classes),
                        features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    # plt.tight_layout()
    visualizer.poof(outpath="shapiro.png")
    # NOTE(review): the title is set after poof(outpath=...) has been called;
    # presumably too late to appear in the saved file -- confirm.
    plt.title('Shapiro plot (top 20 features)', size=16)
    plt.close()
    # os.system('open shapiro.png')
    # visualizer.show()

    # pearson ranking algorithm (2D)
    plt.figure(figsize=(12, 12))
    visualizer = Rank2D(algorithm='pearson',
                        classes=set(classes),
                        features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    plt.tight_layout()
    visualizer.poof(outpath="pearson.png")
    # NOTE(review): as above, the title is set only after poof(outpath=...).
    plt.title('Pearson ranking plot (top 20 features)', size=16)
    plt.close()
    # os.system('open pearson.png')
    # visualizer.show()

    # feature importances with top 20 features for Lasso
    # (uses the raw-feature variants new_features_ / new_labels_)
    plt.figure(figsize=(12, 12))
    viz = FeatureImportances(Lasso(), labels=new_labels_)
    viz.fit(np.array(new_features_), tclass_labels)
    plt.tight_layout()
    viz.poof(outpath="lasso.png")
    plt.close()

    # correlation plots with feature removal if corr > 0.90
    # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf

    # now remove correlated features
    # --> p values
    # --> https://towardsdatascience.com/the-next-level-of-data-visualization-in-python-dd6e99039d5e / https://github.com/WillKoehrsen/Data-Analysis/blob/master/plotly/Plotly%20Whirlwind%20Introduction.ipynb- plotly for correlation heatmap and scatterplot matrix
    # --> https://seaborn.pydata.org/tutorial/distributions.html
    # `.corr()` and `.columns` are used below, so new_features is presumably a
    # pandas DataFrame returned by restructure_features -- verify.
    data = new_features
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap with correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap.png')
    plt.close(fig)

    # For every pair (i, j) with i < j and correlation >= 0.9, drop column j.
    # Only positive correlation is tested (no absolute value is taken).
    columns = np.full((corr.shape[0], ), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= 0.9:
                if columns[j]:
                    columns[j] = False
    selected_columns = data.columns[columns]
    data = data[selected_columns]
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap without correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap_clean.png')
    plt.close(fig)

    # radviz
    # Instantiate the visualizer
    plt.figure(figsize=(12, 12))
    visualizer = RadViz(classes=classes, features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    visualizer.poof(outpath="radviz.png")
    # NOTE(review): show() is called in addition to poof(outpath=...) here and
    # for the correlation plot below, unlike the other plots -- confirm.
    visualizer.show()
    plt.close()

    # feature correlation plot
    plt.figure(figsize=(28, 12))
    visualizer = feature_correlation(np.array(new_features),
                                     tclass_labels,
                                     labels=new_labels)
    visualizer.poof(outpath="correlation.png")
    visualizer.show()
    # tight_layout runs after the plot has already been written/shown.
    plt.tight_layout()
    plt.close()

    # Violin plots of the raw top features, written to
    # feature_ranking/feature_plots.
    os.mkdir('feature_plots')
    os.chdir('feature_plots')

    # `newdata` is the same object as new_features_, so the added 'classes'
    # column also appears on new_features_.
    newdata = new_features_
    newdata['classes'] = class_labels

    for j in range(len(new_labels_)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels_[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels_[j]))
        plt.close(fig)

    # No chdir back first: this folder is created INSIDE feature_plots.
    os.mkdir('feature_plots_transformed')
    os.chdir('feature_plots_transformed')

    # Same aliasing as above: the 'classes' column is added to new_features.
    newdata = new_features
    newdata['classes'] = class_labels

    for j in range(len(new_labels)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels[j]))
        plt.close(fig)

    ##################################################
    # PRECISION-RECALL CURVES
    ##################################################

    # Back to the session directory for the model-selection plots.
    os.chdir(curdir)
    os.mkdir('model_selection')
    os.chdir('model_selection')

    plt.figure()
    visualizer = precision_recall_curve(GaussianNB(), np.array(features),
                                        tclass_labels)
    visualizer.poof(outpath="precision-recall.png")
    plt.close()

    plt.figure()
    visualizer = roc_auc(LogisticRegression(), np.array(features),
                         tclass_labels)
    visualizer.poof(outpath="roc_curve_train.png")
    plt.close()

    plt.figure()
    visualizer = discrimination_threshold(
        LogisticRegression(multi_class="auto", solver="liblinear"),
        np.array(features), tclass_labels)
    visualizer.poof(outpath="thresholds.png")
    plt.close()

    # Regression-style diagnostics fitted on the encoded class labels.
    plt.figure()
    visualizer = residuals_plot(Ridge(),
                                np.array(features),
                                tclass_labels,
                                train_color="maroon",
                                test_color="gold")
    visualizer.poof(outpath="residuals.png")
    plt.close()

    plt.figure()
    visualizer = prediction_error(Lasso(), np.array(features), tclass_labels)
    visualizer.poof(outpath='prediction_error.png')
    plt.close()

    # outlier detection
    plt.figure()
    visualizer = cooks_distance(np.array(features),
                                tclass_labels,
                                draw_threshold=True,
                                linefmt="C0-",
                                markerfmt=",")
    visualizer.poof(outpath='outliers.png')
    plt.close()

    # cluster numbers
    # (KMeans is given one cluster per distinct encoded label)
    plt.figure()
    visualizer = silhouette_visualizer(
        KMeans(len(set(tclass_labels)), random_state=42), np.array(features))
    visualizer.poof(outpath='siloutte.png')
    plt.close()

    # cluster distance
    plt.figure()
    visualizer = intercluster_distance(
        KMeans(len(set(tclass_labels)), random_state=777), np.array(features))
    visualizer.poof(outpath='cluster_distance.png')
    plt.close()

    # plot percentile of features plot with SVM to see which percentile for features is optimal
    # `features` is rebound to the MinMaxScaler output from here on; X_train /
    # X_test were split earlier from the unscaled features and are unaffected.
    # NOTE(review): `StandardScaler` is referenced without the `preprocessing.`
    # prefix used near the top of this function -- it must be importable by
    # that bare name.
    # The estimator is LogisticRegression, although the comment above says SVM.
    features = preprocessing.MinMaxScaler().fit_transform(features)
    clf = Pipeline([('anova', SelectPercentile(chi2)),
                    ('scaler', StandardScaler()),
                    ('logr', LogisticRegression())])
    score_means = list()
    score_stds = list()
    percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100)

    # Cross-validate once per percentile (scored against the raw
    # `class_labels`) and record the mean and standard deviation.
    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, np.array(features), class_labels)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title(
        'Performance of the LogisticRegression-Anova varying the percent features selected'
    )
    plt.xticks(np.linspace(0, 100, 11, endpoint=True))
    plt.xlabel('Percentile')
    plt.ylabel('Accuracy Score')
    plt.axis('tight')
    plt.savefig('logr_percentile_plot.png')
    plt.close()

    # get PCA
    pca = PCA(random_state=1)
    pca.fit(X_train)
    skplt.decomposition.plot_pca_component_variance(pca)
    plt.savefig('pca_explained_variance.png')
    plt.close()

    # estimators
    rf = RandomForestClassifier()
    skplt.estimators.plot_learning_curve(rf, X_train, y_train)
    plt.title('Learning Curve (Random Forest)')
    plt.savefig('learning_curve.png')
    plt.close()

    # elbow plot
    kmeans = KMeans(random_state=1)
    skplt.cluster.plot_elbow_curve(kmeans,
                                   X_train,
                                   cluster_ranges=range(1, 30),
                                   title='Elbow plot (KMeans clustering)')
    plt.savefig('elbow.png')
    plt.close()

    # KS statistic (only if 2 classes)
    # NOTE(review): nothing here checks the number of classes before plotting.
    lr = LogisticRegression()
    lr = lr.fit(X_train, y_train)
    y_probas = lr.predict_proba(X_test)
    skplt.metrics.plot_ks_statistic(y_test, y_probas)
    plt.savefig('ks.png')
    plt.close()

    # precision-recall
    # (overwrites the 'precision-recall.png' written earlier in this folder)
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_probas = nb.predict_proba(X_test)
    skplt.metrics.plot_precision_recall(y_test, y_probas)
    plt.tight_layout()
    plt.savefig('precision-recall.png')
    plt.close()

    ## plot calibration curve
    # `svm` is instantiated but never fitted; its predict_proba line below is
    # commented out.
    rf = RandomForestClassifier()
    lr = LogisticRegression()
    nb = GaussianNB()
    svm = LinearSVC()
    dt = DecisionTreeClassifier(random_state=0)
    ab = AdaBoostClassifier(n_estimators=100)
    gb = GradientBoostingClassifier(n_estimators=100,
                                    learning_rate=1.0,
                                    max_depth=1,
                                    random_state=0)
    knn = KNeighborsClassifier(n_neighbors=7)

    # Fit each classifier on the training split and collect its test-set
    # class probabilities.
    rf_probas = rf.fit(X_train, y_train).predict_proba(X_test)
    lr_probas = lr.fit(X_train, y_train).predict_proba(X_test)
    nb_probas = nb.fit(X_train, y_train).predict_proba(X_test)
    # svm_scores = svm.fit(X_train, y_train).predict_proba(X_test)
    dt_scores = dt.fit(X_train, y_train).predict_proba(X_test)
    ab_scores = ab.fit(X_train, y_train).predict_proba(X_test)
    gb_scores = gb.fit(X_train, y_train).predict_proba(X_test)
    knn_scores = knn.fit(X_train, y_train).predict_proba(X_test)

    # probas_list and clf_names are kept in the same order.
    probas_list = [
        rf_probas,
        lr_probas,
        nb_probas,  # svm_scores,
        dt_scores,
        ab_scores,
        gb_scores,
        knn_scores
    ]

    clf_names = [
        'Random Forest',
        'Logistic Regression',
        'Gaussian NB',  # 'SVM',
        'Decision Tree',
        'Adaboost',
        'Gradient Boost',
        'KNN'
    ]

    skplt.metrics.plot_calibration_curve(y_test, probas_list, clf_names)
    # NOTE(review): the figure is saved before tight_layout() is applied, so
    # the layout adjustment cannot affect 'calibration.png'.
    plt.savefig('calibration.png')
    plt.tight_layout()
    plt.close()

    # pick classifier type by ROC (without optimization)
    # Only the second probability column ([:, 1]) of each classifier is used.
    probs = [
        rf_probas[:, 1],
        lr_probas[:, 1],
        nb_probas[:, 1],  # svm_scores[:, 1],
        dt_scores[:, 1],
        ab_scores[:, 1],
        gb_scores[:, 1],
        knn_scores[:, 1]
    ]

    # plot_roc_curve is a helper defined outside this function.
    plot_roc_curve(y_test, probs, clf_names)
    # more elaborate ROC example with CV = 5 fold
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py

    # Finish in the session directory.
    os.chdir(curdir)

    return ''
Esempio n. 13
0
def showPCAProjection():
    """Show PCA projections of the 'credit' data set.

    Loads the data with ``load_data('credit')``, selects the listed feature
    columns as ``X`` and the ``default`` column as ``y``, then displays two
    ``PCADecomposition`` plots: one with the visualizer's default projection
    dimension and one with ``proj_dim=3``.
    """
    # Load the classification data set
    data = load_data('credit')

    # Specify the features of interest
    features = [
        'limit',
        'sex',
        'edu',
        'married',
        'age',
        'apr_delay',
        'may_delay',
        'jun_delay',
        'jul_delay',
        'aug_delay',
        'sep_delay',
        'apr_bill',
        'may_bill',
        'jun_bill',
        'jul_bill',
        'aug_bill',
        'sep_bill',
        'apr_pay',
        'may_pay',
        'jun_pay',
        'jul_pay',
        'aug_pay',
        'sep_pay',
    ]

    # Extract the numpy arrays from the data frame. ``.values`` is used
    # because ``as_matrix()`` was removed in pandas 1.0; ``.values`` works on
    # both old and current pandas releases.
    X = data[features].values
    y = data.default.values

    # Projection with the visualizer's default dimensionality.
    visualizer = PCADecomposition(scale=True, center=False, col=y)
    visualizer.fit_transform(X, y)
    visualizer.poof()

    # Three-component projection.
    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=3)
    visualizer.fit_transform(X, y)
    visualizer.poof()
Esempio n. 14
0
 def pca_visualization(self,path=None,fileName=None,save=False,
                       scale=True, center=False):
     """Fit a PCA decomposition on ``self.x`` / ``self.y`` and hand it to ``self.poof``.

     ``scale`` and ``center`` are forwarded to ``PCADecomposition``; the
     points are coloured by ``self.y`` and the plot is titled ``self.title``.
     ``path``, ``fileName`` and ``save`` are passed on to ``self.poof``.
     """
     settings = dict(scale=scale, center=center,
                     color=self.y, title=self.title)
     decomposition = PCADecomposition(**settings)
     decomposition.fit_transform(self.x, self.y)
     self.poof(decomposition, path, fileName, save)