Esempio n. 1
0
def model_stats(model, **kwargs):
    """Train ``model``, report its accuracy and export the summary tables.

    The train and test splits are obtained from ``load_train_data`` and
    ``load_test_data``. After training, two CSV files named after
    ``model.name`` (lower-cased) are handed to ``export_dataframe``: one
    holding the train/test accuracy and one holding the confusion matrix
    computed on the test split. The classification report is printed only.

    Args:
        model: Object exposing ``train(X, y, **kwargs)``, ``predict(X)`` and
            a string attribute ``name``.
        **kwargs: Forwarded unchanged to ``model.train``.

    Returns:
        The same ``model`` object, after training.
    """
    train_features, train_labels = load_train_data()
    test_features, test_labels = load_test_data()
    model.train(train_features, train_labels, **kwargs)

    # Accuracy on both splits, stored as a single-row table.
    train_score = model_accuracy(model, train_features, train_labels)
    test_score = model_accuracy(model, test_features, test_labels)
    accuracy_table = pd.DataFrame(
        np.c_[train_score, test_score],
        columns=['train_accuracy', 'test_accuracy'])
    file_stem = model.name.lower()
    export_dataframe('{0}_stats_accuracy.csv'.format(file_stem),
                     accuracy_table)
    print('Train Accuracy: {0}, Test Accuracy: {1}'.format(
        train_score, test_score))

    # Per-class diagnostics on the held-out split.
    test_predictions = model.predict(test_features)
    confusion = confusion_matrix(test_labels, test_predictions)
    report = classification_report(test_labels, test_predictions)
    confusion_table = pd.DataFrame(confusion)

    print('Full Model Stats')
    print('Classification Report')
    print(report)
    print('Confusion Matrix')
    print(confusion)
    export_dataframe('{0}_confusion_matrix.csv'.format(file_stem),
                     confusion_table)
    return model
Esempio n. 2
0
def make_softmax_statistics():
    """Train a softmax classifier, export its statistics and visualize it.

    Training and reporting are delegated to ``model_stats``, which loads the
    train/test data itself; this function therefore does not load the data a
    second time.

    Returns:
        The trained classifier returned by ``model_stats``.
    """
    model = SoftmaxClassifier()
    print('Loading Data')
    model = model_stats(model, reg=1e-4, epochs=1, batch_size=64)
    # NOTE(review): a fresh, untrained classifier is handed to
    # visualize_model rather than the one trained above; presumably
    # visualize_model trains its argument itself -- confirm.
    visualize_model(SoftmaxClassifier())
    return model
Esempio n. 3
0
def best_model(model):
    """Evaluate a grid of regularization strengths and report the best one.

    The grid consists of a few hand-picked values plus the powers
    ``0.30**i`` for ``i`` in 1..19, sorted ascending. ``run_validation``
    scores every candidate; the scores are exported as a CSV table and as a
    line plot, and the candidate with the highest validation score is
    printed and exported together with the accuracy on the test split.

    To experiment, edit the candidate list and put any keyword arguments for
    the training routine in ``train_kwargs``.

    Args:
        model: Model object passed to ``run_validation``; must expose a
            string attribute ``name`` afterwards.
    """
    X_train, y_train = load_train_data()
    X_test, y_test = load_test_data()

    train_kwargs = {}
    hand_picked = [0, 0.001, 0.0001, 0.00001, 0.000001]
    geometric = [0.30**power for power in range(1, 20)]
    candidates = sorted(hand_picked + geometric)

    model, scores = run_validation(model, X_train, y_train, candidates,
                                   **train_kwargs)
    file_stem = model.name.lower()

    # Table of (regularization value, validation score) pairs.
    score_table = pd.DataFrame(np.c_[candidates, scores],
                               columns=['reg. params', 'val. score'])
    export_dataframe('{0}_validation_scores.csv'.format(file_stem),
                     score_table)

    # Same data as a line plot.
    fig, ax = plt.subplots()
    ax.plot(candidates, scores, 'b-*',
            linewidth=2, markersize=6, label='Validation Accuracy')
    print('plot of: ', np.c_[candidates, scores])
    plot_name = '{0}_validation_accuracy'.format(file_stem)
    ax.set_title(plot_name)
    ax.set_xlabel('reg. params')
    ax.set_ylabel('val. accuracy')
    ax.legend()
    export_fig('{0}.png'.format(plot_name), fig)

    # Report the candidate with the highest validation score.
    test_score = model_accuracy(model, X_test, y_test)
    winner = np.argmax(scores)
    summary = (
        'Best model found: reg {0}\nValidation accuracy: {1}\nTest Accuracy: {2}'
    )
    print(summary.format(candidates[winner], scores[winner], test_score))
    winner_table = pd.DataFrame(
        np.c_[candidates[winner], scores[winner], test_score],
        columns=['reg. value', 'validation accuracy', 'test accuracy'])
    export_dataframe('{0}_best_result.csv'.format(file_stem), winner_table)
Esempio n. 4
0
def make_logreg_statistics():
    """Generate simple statistics about the logistic regression implementation for the hand in report.

    Three experiments are run in order:

    1. Digits 2 vs 7: three optimizers from ``log_reg`` are timed with fixed
       parameters; the timing and the train/test accuracy are displayed and
       exported to ``results_2_vs_7.csv``.
    2. Pairwise: a two-class classifier is trained for every digit pair
       ``(i, j)`` with ``j < i``; the accuracies fill the lower triangle of
       two 10x10 tables which are displayed and exported.
    3. Full model: a ``LogisticClassifier`` is trained and reported through
       ``model_stats`` and then visualized.

    The function changes the global pandas display options and returns
    nothing.
    """
    # 'display.height' is intentionally not set: the option was deprecated
    # and later removed from pandas, where setting it raises OptionError.
    # 'display.max_rows' controls the number of rows shown.
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    model = LogisticClassifierTwoClass()
    rounds = 50
    n_digits = 10
    print('Loading Data')
    X_train, y_train = load_train_data()
    X_test, y_test = load_test_data()

    # --- Experiment 1: compare optimizers on the 2 vs 7 problem ---------
    names = [
        'Batch Gradient Descent', 'Mini Batch Gradient Descent',
        'Scipy Optimize'
    ]
    algs = [
        log_reg.batch_grad_descent, log_reg.mini_batch_grad_descent,
        log_reg.fast_descent
    ]
    params = [{
        'reg': 1e-4,
        'rounds': rounds,
        'lr': 1.0
    }, {
        'reg': 1e-4,
        'epochs': rounds,
        'lr': 0.1,
        'batch_size': 16
    }, {
        'reg': 1e-4,
        'rounds': rounds
    }]
    times = np.zeros(len(algs))
    acc_train = np.zeros(len(algs))
    acc_test = np.zeros(len(algs))

    print('Computing 2 vs 7 Statistics on fixed paramters')
    Xbin_train, ybin_train = get_digit_pair_data(X_train, y_train, 2, 7)
    Xbin_test, ybin_test = get_digit_pair_data(X_test, y_test, 2, 7)
    # NOTE(review): the extra leading column is all zeros, whereas a bias
    # column is conventionally all ones -- confirm this is what the
    # optimizers in log_reg expect.
    Xbin_train_padded = np.c_[np.zeros(Xbin_train.shape[0]), Xbin_train]
    for i, (alg, param, name) in enumerate(zip(algs, params, names)):
        print('*** Running {0} ***'.format(name))
        start = timer()
        w = alg(Xbin_train_padded, ybin_train, **param)
        elapsed = timer() - start
        # The weights are plugged into the model directly; accuracy is
        # measured on the unpadded data (presumably the model pads
        # internally -- verify against the classifier).
        model.w = w
        train_acc = model_accuracy(model, Xbin_train, ybin_train)
        test_acc = model_accuracy(model, Xbin_test, ybin_test)
        times[i] = elapsed
        acc_train[i] = train_acc
        acc_test[i] = test_acc
        print('Alg: {0}, Rounds/Epochs: {1}, Time: {2}, Ein: {3}, Etest: {4}'.
              format(name, rounds, elapsed, train_acc, test_acc))
    # Build the table column by column so the numeric columns keep their
    # float dtype (stacking them with the names via np.c_ would turn every
    # value into a string).
    df_2vs7 = pd.DataFrame(
        {
            'Alg.': names,
            'Time': times,
            'Ein': acc_train,
            'Etest': acc_test,
        },
        columns=['Alg.', 'Time', 'Ein', 'Etest'])
    filename_2vs7 = 'results_2_vs_7.csv'
    print('Result of 2 vs 7 Classifier with fixed parameters stored in {0}'.
          format(filename_2vs7))
    display(df_2vs7)
    export_dataframe(filename_2vs7, df_2vs7)
    visualize_model(LogisticClassifierTwoClass(), Xbin_train, ybin_train)

    # --- Experiment 2: accuracy for every pair of digits -----------------
    print('Compute Pairwise scores using mini_batch gradient descent')
    pairwise_scores_train = np.zeros((n_digits, n_digits))
    pairwise_scores_test = np.zeros((n_digits, n_digits))
    model = LogisticClassifierTwoClass()
    # Only pairs with j < i are evaluated, so just the lower triangle of the
    # score tables is filled; the remaining entries stay 0.
    for i in range(n_digits):
        for j in range(i):
            Xbin_train, ybin_train = get_digit_pair_data(
                X_train, y_train, i, j)
            Xbin_test, ybin_test = get_digit_pair_data(X_test, y_test, i, j)
            model.train(Xbin_train,
                        ybin_train,
                        reg=1e-4,
                        epochs=30,
                        lr=0.1,
                        batch_size=16)
            pairwise_scores_train[i, j] = model_accuracy(
                model, Xbin_train, ybin_train)
            pairwise_scores_test[i, j] = model_accuracy(
                model, Xbin_test, ybin_test)

    df_pairwise_train = pd.DataFrame(pairwise_scores_train)
    df_pairwise_test = pd.DataFrame(pairwise_scores_test)
    print('Pairwise Training Accuracy')
    display(df_pairwise_train)
    print('Pairwise Test Accuract')
    display(df_pairwise_test)
    export_dataframe('pairwise_train_accuracy.csv', df_pairwise_train.round(4))
    export_dataframe('pairwise_test_accuracy.csv', df_pairwise_test.round(4))

    # --- Experiment 3: full multi-class model -----------------------------
    print('Training full model with all vs one')
    full_model = LogisticClassifier()
    # model_stats prints and exports the full-model results itself, so no
    # additional "stored in" message is printed here.
    model_stats(full_model, reg=1e-4, epochs=30, batch_size=16)
    visualize_model(LogisticClassifier())