Esempio n. 1
0
def main():
    """Compare the three resampling strategies in one 1x3 figure.

    Each strategy callable is invoked without arguments and must return
    ``(y, y_pred, <third value>)``; the third value is not used here.
    For every strategy the imbalanced classification report is printed and
    a normalised confusion matrix is drawn on its own axis.  The figure is
    written to ``./different_resampling.pdf`` and then shown.
    """
    strategies = (
        ('No resample', no_resample),
        ('Oversample', oversample),
        ('Undersample', undersample),
    )

    fig, axes = plt.subplots(1, 3, sharey=True, sharex=True)

    for ax, (title, model) in zip(axes.flat, strategies):
        y, y_pred, _ = model()

        print(title)
        print(imetrics.classification_report_imbalanced(y, y_pred))

        accuracy = metrics.accuracy_score(y, y_pred)
        normalised = norm_cm(
            metrics.confusion_matrix(y, y_pred, labels=labels))
        cm_frame = pd.DataFrame(normalised, index=labels, columns=labels)

        sns.heatmap(cm_frame,
                    ax=ax,
                    vmin=0,
                    vmax=1,
                    annot=True,
                    fmt='.2f',
                    cmap='Greys',
                    cbar=False,
                    square=True)
        ax.set_title(f'{title}\naccuracy={accuracy:.3f}')

    # `y` here is whatever the LAST strategy in the loop returned.
    # The values are scaled by 100 for display, so class_counter presumably
    # yields per-class fractions -- TODO confirm against its definition.
    count = class_counter(y)
    shares = [f'{key}: {count[key]*100:.1f}%' for key in labels]
    fig.suptitle('Population: ' + ', '.join(shares))

    fig.tight_layout()
    fig.savefig('./different_resampling.pdf', dpi=92, bbox_inches='tight')

    plt.show()
def multiple_figures():
    """Save one confusion-matrix PDF per interpolation strategy.

    Each strategy callable receives the module-level ``pipe`` and must
    return ``(y, y_pred)``.  For every strategy the class counts and the
    classification report are printed, and a normalised confusion matrix is
    written to ``./output/interpolations/<title>.pdf``.
    """
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify()

    # The target directory does not depend on the loop variable: create once.
    ensure_dir('./output/interpolations/')

    strategies = (
        ('without', without),
        ('gaussian', guassian),  # sic: the callable is spelled `guassian`
        ('constant', constant),
    )

    for title, model in strategies:
        y, y_pred = model(pipe)

        print(title, classifier[0])
        print(class_counter(y))
        print(metrics.classification_report(y, y_pred, labels=labels))

        acc = metrics.accuracy_score(y, y_pred)
        cm = metrics.confusion_matrix(y, y_pred, labels=labels)
        cm = norm_cm(cm)

        cm = pd.DataFrame(cm, index=labels, columns=labels)

        fig, ax = plt.subplots(dpi=92)

        sns.heatmap(cm,
                    vmin=0,
                    vmax=1,
                    annot=True,
                    fmt='.2f',
                    cmap='Greys',
                    ax=ax,
                    cbar=False,
                    square=True)
        format_axes_for_cm(ax)
        ax.set_title(f'accuracy = {acc:.3f}')

        fig.tight_layout()

        fig.savefig(f'./output/interpolations/{title}.pdf',
                    dpi=92,
                    bbox_inches='tight')
        # A fresh figure is created on every iteration, so it must be closed;
        # plt.clf() would only clear it and leave it registered with pyplot.
        plt.close(fig)
Esempio n. 3
0
def main():
    """Sweep every (W_prr, W_history) pair and report/plot the results.

    For each of the 4x4 window-size combinations this prints one
    tab-separated, '&'-delimited row (history window, PRR window, accuracy,
    weighted precision, weighted recall, weighted F1) and saves the
    normalised confusion matrix to
    ``./output/w_sizes/Wprr<w_prr>_Wh<w_history>.pdf``.
    """
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify()

    w_sizes = (5, 10, 50, 100)

    # Keyword arguments shared by the three weighted scores below.
    weighted = {'average': 'weighted', 'labels': labels}

    # Outer loop is the PRR window, inner loop the history window.
    for w_prr in w_sizes:
        for w_history in w_sizes:
            y, y_pred = different_window_sizes(w_prr, w_history)

            acc = metrics.accuracy_score(y, y_pred)
            prec = metrics.precision_score(y, y_pred, **weighted)
            recall = metrics.recall_score(y, y_pred, **weighted)
            f1 = metrics.f1_score(y, y_pred, **weighted)

            row = f'& {w_history}\t& {w_prr}\t& {acc:.3f}\t& {prec:.3f}\t& {recall:.3f}\t& {f1:.3f}'
            print(row)

            normalised = norm_cm(
                metrics.confusion_matrix(y, y_pred, labels=labels))
            cm_frame = pd.DataFrame(normalised, index=labels, columns=labels)

            fig, ax = plt.subplots(dpi=92)
            sns.heatmap(cm_frame,
                        ax=ax,
                        vmin=0,
                        vmax=1,
                        annot=True,
                        fmt='.2f',
                        cmap='Greys',
                        cbar=False,
                        square=True)
            heading = f'Accuracy = {acc:.3f}\n(prec = {prec:.3f}, rec = {recall:.3f})'
            ax.set_title(heading)
            format_axes_for_cm(ax)

            fig.tight_layout()

            ensure_dir('./output/w_sizes/')
            target = f'./output/w_sizes/Wprr{w_prr}_Wh{w_history}.pdf'
            fig.savefig(target, dpi=92, bbox_inches='tight')
            plt.close(fig)
Esempio n. 4
0
def multiplots():
    """Save one confusion-matrix PDF per resampling strategy.

    Each strategy callable receives the module-level ``classifier`` and must
    return ``(y, y_pred, c)``, where ``c`` is indexed by the keys
    ``"good"``, ``"interm."`` and ``"bad"``.  Accuracy goes into the figure
    suptitle, the three ``c`` entries into the left-aligned axis title, and
    the figure is written to ``./output/resampling/<title>.pdf``.
    """
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])

    latexify()

    strategies = (
        (no_resample, 'none'),
        (undersample, 'undersample'),
        (oversample, 'oversample'),
    )

    for model, title in strategies:
        y, y_pred, c = model(classifier)

        acc = metrics.accuracy_score(y, y_pred)

        normalised = norm_cm(
            metrics.confusion_matrix(y, y_pred, labels=labels))
        cm_frame = pd.DataFrame(normalised, index=labels, columns=labels)

        # constrained_layout takes the place of a later fig.tight_layout().
        fig, ax = plt.subplots(dpi=92, constrained_layout=True)

        print('Resample:', title, classifier[0], f'accuracy={acc:.3f}')
        print(metrics.classification_report(y, y_pred, labels=labels))

        # Applies to the current pyplot figure, i.e. the one just created.
        plt.suptitle(f'accuracy={acc:.3f}')

        sns.heatmap(cm_frame,
                    ax=ax,
                    vmin=0,
                    vmax=1,
                    annot=True,
                    fmt='.2f',
                    cmap='Greys',
                    cbar=False,
                    square=True)

        counts_text = f'good:    {c["good"]:,}\ninterm.: {c["interm."]:,}\nbad:      {c["bad"]:,}'
        ax.set_title(counts_text, fontdict={'fontsize': 9}, loc='left')

        format_axes_for_cm(ax)

        ensure_dir('./output/resampling/')
        fig.savefig(f'./output/resampling/{title}.pdf',
                    dpi=92,
                    bbox_inches='tight')
        plt.close(fig)
def main():
    """Compare the three interpolation strategies in one 1x3 figure.

    Each strategy callable is invoked without arguments and must return
    ``(y, y_pred)``.  The classification report is printed per strategy and
    a normalised confusion matrix is drawn on its own axis.  The figure is
    written to ``./different_interpolations.pdf`` and then shown.
    """
    strategies = (
        ('Without', without),
        ('Gaussian', guassian),  # sic: the callable is spelled `guassian`
        ('Constant', constant),
    )

    fig, axes = plt.subplots(1, 3, sharey=True, sharex=True)

    for ax, (title, model) in zip(axes.reshape(-1), strategies):
        y, y_pred = model()

        print(title, classifier[0])
        print(metrics.classification_report(y, y_pred, labels=labels))

        accuracy = metrics.accuracy_score(y, y_pred)
        normalised = norm_cm(
            metrics.confusion_matrix(y, y_pred, labels=labels))
        cm_frame = pd.DataFrame(normalised, index=labels, columns=labels)

        sns.heatmap(cm_frame,
                    ax=ax,
                    vmin=0,
                    vmax=1,
                    annot=True,
                    fmt='.2f',
                    cmap='Greys',
                    cbar=False,
                    square=True)
        ax.set_title(f'{title}\naccuracy={accuracy:.3f}')

    fig.tight_layout()
    fig.savefig('./different_interpolations.pdf', dpi=92, bbox_inches='tight')

    plt.show()
Esempio n. 6
0
def _save_feature_cm(pipe, features, average, title_sep, out_dir):
    """Evaluate `pipe` on one feature set and save its confusion matrix.

    Args:
        pipe: pipeline handed unchanged to ``different_features``.
        features: feature set handed to ``different_features`` and to
            ``stringify_features`` (which supplies the output file name).
        average: ``average`` argument for the precision and recall scores.
        title_sep: text placed between the precision and recall parts of
            the axis title.
        out_dir: output directory, including the trailing slash.
    """
    y, y_pred = different_features(pipe, features)

    acc = metrics.accuracy_score(y, y_pred)
    prec = metrics.precision_score(y,
                                   y_pred,
                                   average=average,
                                   labels=labels)
    recall = metrics.recall_score(y,
                                  y_pred,
                                  average=average,
                                  labels=labels)

    cm = metrics.confusion_matrix(y, y_pred, labels=labels)
    cm = norm_cm(cm)

    cm = pd.DataFrame(cm, index=labels, columns=labels)

    fig, ax = plt.subplots(dpi=92)
    sns.heatmap(cm,
                vmin=0,
                vmax=1,
                annot=True,
                fmt='.2f',
                cmap='Greys',
                ax=ax,
                cbar=False,
                square=True)
    format_axes_for_cm(ax)

    feature_str = stringify_features(features)
    ax.set_title(
        f'Accuracy = {acc:.3f}\n'
        f'(prec = {prec:.3f}{title_sep} rec = {recall:.3f})')

    fig.tight_layout()

    ensure_dir(out_dir)
    fig.savefig(f'{out_dir}{feature_str}.pdf', dpi=92, bbox_inches='tight')
    plt.close(fig)


def multiple_figures():
    """Save a confusion-matrix PDF per feature set for two pipelines.

    Runs every entry of ``feature_sets`` first through ``pipe_dtree``
    (output in ``./output/features/dtree/``) and then through
    ``pipe_logreg`` (output in ``./output/features/logistic/``), printing
    progress along the way.
    """
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])

    latexify()

    # NOTE(review): the two passes differ in averaging mode ('weighted' vs
    # 'micro') and in the title separator (';' vs ','). Both are kept as
    # found so the generated figures are unchanged -- confirm whether the
    # difference is intentional.
    for features in feature_sets:
        _save_feature_cm(pipe_dtree,
                         features,
                         average='weighted',
                         title_sep=';',
                         out_dir='./output/features/dtree/')
        print(f'Done {features}')

    for features in feature_sets:
        print('Features', features)

        _save_feature_cm(pipe_logreg,
                         features,
                         average='micro',
                         title_sep=',',
                         out_dir='./output/features/logistic/')
        print(f'Done {features}')
def _save_model_cm(y, y_pred, title, filename):
    """Plot the normalised confusion matrix and save it under ./output/models/.

    Args:
        y: true labels.
        y_pred: predicted labels.
        title: text used as the axis title.
        filename: output file name without the ``.pdf`` extension.
    """
    cm = metrics.confusion_matrix(y, y_pred, labels=labels)
    cm = norm_cm(cm)

    cm = pd.DataFrame(cm, index=labels, columns=labels)

    fig, ax = plt.subplots(dpi=92)
    sns.heatmap(cm,
                vmin=0,
                vmax=1,
                annot=True,
                fmt='.2f',
                cmap='Greys',
                ax=ax,
                cbar=False,
                square=True)
    ax.set_title(title)
    format_axes_for_cm(ax)

    fig.tight_layout()

    ensure_dir('./output/models/')
    fig.savefig(f'./output/models/{filename}.pdf', dpi=92, bbox_inches='tight')
    plt.close(fig)


def multiple_figures():
    """Save one confusion-matrix PDF per model, plus a data-only baseline.

    The baseline compares the ``class`` column of ``prepare_data()`` with
    its ``class_overall`` column and is saved as ``baseline-link-overall``.
    Every pipeline in ``models`` is then evaluated through
    ``different_models``; its classification report is printed and its
    figure saved under a file name derived from the model name (lower-cased,
    spaces replaced by underscores).
    """
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify()

    # The same scaler and resampler objects are passed to every pipeline.
    # NOTE(review): safe only if different_models clones the pipeline before
    # fitting -- confirm against its definition.
    scaler = preprocessing.StandardScaler()
    resample = over_sampling.RandomOverSampler()

    baseline = pipeline.make_pipeline(
        scaler, resample,
        dummy.DummyClassifier(strategy='constant', constant='good'))

    logreg = pipeline.make_pipeline(
        scaler,
        resample,
        linear_model.LogisticRegression(solver='lbfgs', multi_class='ovr'),
    )

    dtree = pipeline.make_pipeline(
        scaler,
        resample,
        tree.DecisionTreeClassifier(),
    )

    mlp = pipeline.make_pipeline(
        scaler,
        resample,
        neural_network.MLPClassifier(hidden_layer_sizes=(100, 100, 100),
                                     activation='relu',
                                     solver='adam'),
    )

    svc = pipeline.make_pipeline(
        scaler,
        resample,
        svm.LinearSVC(),
    )

    RForest = pipeline.make_pipeline(
        scaler,
        resample,
        ensemble.RandomForestClassifier(n_estimators=100),
    )

    # NOTE: a kNN pipeline and a StratifiedKFold splitter used to be built
    # here but were never used; both have been dropped.
    models = (
        ('Constant', baseline),
        ('Logistic Regression', logreg),
        ('Decision Tree', dtree),
        ('Multi-Layer Perceptron', mlp),
        ('linearSVM', svc),
        ('Random Forest', RForest),
    )

    # Special case of baseline: no model is fitted, two columns of the
    # prepared data are compared directly.
    filename = 'baseline-link-overall'
    df = prepare_data()
    # NOTE(review): Series.ravel() is deprecated in pandas >= 2.2; switch to
    # .to_numpy() once the minimum supported pandas version allows it.
    y, y_pred = df['class'].ravel(), df['class_overall'].ravel()

    acc = metrics.accuracy_score(y, y_pred)
    prec = metrics.precision_score(y,
                                   y_pred,
                                   average='weighted',
                                   labels=labels)
    recall = metrics.recall_score(y, y_pred, average='weighted', labels=labels)

    _save_model_cm(
        y, y_pred,
        f'accuracy = {acc:.3f}\n(prec = {prec:.3f}, rec = {recall:.3f})',
        filename)
    print(f'Done {filename}')

    for name, pipe in models:
        filename = name.lower().replace(' ', '_')

        y, y_pred = different_models(pipe)

        acc = metrics.accuracy_score(y, y_pred)
        print(name)
        print(metrics.classification_report(y, y_pred, labels=labels))

        _save_model_cm(y, y_pred, f'accuracy={acc:.3f}', filename)
def main():
    """Compare six classifiers in one 2x3 grid of confusion matrices.

    Every classifier is wrapped in a pipeline of standard scaling followed
    by random over-sampling and evaluated through ``different_models``.
    Each axis title shows the model name, accuracy, and weighted precision
    and recall.  The figure is written to ``./different_models.pdf`` and
    then shown.
    """
    # The same scaler and resampler objects are passed to every pipeline.
    # NOTE(review): safe only if different_models clones the pipeline before
    # fitting -- confirm against its definition.
    scaler = preprocessing.StandardScaler()
    resample = over_sampling.RandomOverSampler()

    baseline = pipeline.make_pipeline(
        scaler, resample,
        dummy.DummyClassifier(strategy='constant', constant='good'))

    logreg = pipeline.make_pipeline(
        scaler,
        resample,
        linear_model.LogisticRegression(),
    )

    dtree = pipeline.make_pipeline(
        scaler,
        resample,
        tree.DecisionTreeClassifier(),
    )

    mlp = pipeline.make_pipeline(scaler, resample,
                                 neural_network.MLPClassifier())

    svc = pipeline.make_pipeline(scaler, resample, svm.LinearSVC())

    RForest = pipeline.make_pipeline(scaler, resample,
                                     ensemble.RandomForestClassifier())

    # Six entries, one per axis of the 2x3 grid created below.
    # NOTE: an unused StratifiedKFold splitter and an unused
    # PolynomialFeatures transformer used to be built here; both were dropped.
    models = (
        ('Constant', baseline),
        ('Logistic Regression', logreg),
        ('Decision Tree', dtree),
        ('Multi-Layer Perceptron', mlp),
        ('SVM (linear kernel)', svc),
        ('Random Forest', RForest),
    )

    fig, axes = plt.subplots(nrows=2,
                             ncols=3,
                             dpi=96,
                             sharey=True,
                             sharex=True)

    for (name, pipe), ax in zip(models, axes.reshape(-1)):
        y, y_pred = different_models(pipe)

        acc = metrics.accuracy_score(y, y_pred)
        prec = metrics.precision_score(y,
                                       y_pred,
                                       average='weighted',
                                       labels=labels)
        recall = metrics.recall_score(y,
                                      y_pred,
                                      average='weighted',
                                      labels=labels)

        cm = metrics.confusion_matrix(y, y_pred, labels=labels)
        cm = norm_cm(cm)

        cm = pd.DataFrame(cm, index=labels, columns=labels)
        sns.heatmap(cm,
                    vmin=0,
                    vmax=1,
                    annot=True,
                    fmt='.2f',
                    cmap='Greys',
                    ax=ax,
                    cbar=False,
                    square=True)

        ax.set_title(
            f'{name}\naccuracy={acc:.3f}\n(prec = {prec:.3f}, rec = {recall:.3f})'
        )

    fig.tight_layout()
    fig.savefig('./different_models.pdf', bbox_inches='tight')

    plt.show()