def logregA_varying_regularization(lam, regul1):
    # 5-fold cross-validation for a given regularization strength (lam) and
    # norm (regul1); returns mean test accuracy (pa), mean training accuracy
    # (ta), and the per-fold accuracy lists.
    pa_list = []
    ta_list = []
    total_ta = 0
    total_pa = 0

    for i in range(5):

        Log_ob = LogisticRegression(regLambda=lam, regNorm=regul1)

        Log_ob.fit(folds_X_complete[i], folds_y_complete[i])

        y_test_pred = Log_ob.predict(X_test[i])
        pa_score = accuracy_score(y_complete[i], y_test_pred)
        pa_list.append(pa_score)

        y_train_pred = Log_ob.predict(folds_X_complete[i])
        ta_score = accuracy_score(folds_y_complete[i], y_train_pred)
        ta_list.append(ta_score)

        total_pa = total_pa + pa_score
        total_ta = total_ta + ta_score

    pa = total_pa / 5
    ta = total_ta / 5

    return pa, ta, pa_list, ta_list
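A minimal usage sketch for the helper above, assuming the module-level fold arrays (folds_X_complete, folds_y_complete, X_test, y_complete) are already populated; the regNorm values and lambda grid are illustrative assumptions, not from the original:

# Hypothetical sweep over regularization strengths and norms
for regul in (1, 2):  # assumed: 1 = L1 penalty, 2 = L2 penalty
    for lam in (0.0, 0.01, 0.1, 1.0):
        pa, ta, _, _ = logregA_varying_regularization(lam, regul)
        print("norm=%d lambda=%g: test acc=%.3f, train acc=%.3f" % (regul, lam, pa, ta))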
Example #2
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, t_train = util.load_dataset(train_path,
                                         label_col='t',
                                         add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, t_train)
    x_test, t_test = util.load_dataset(test_path,
                                       label_col='t',
                                       add_intercept=True)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, '{}.png'.format(output_path_true))
    np.savetxt(output_path_true, t_pred)
    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='y',
                                         add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    x_test, t_test = util.load_dataset(test_path,
                                       label_col='t',
                                       add_intercept=True)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, '{}.png'.format(output_path_naive))
    np.savetxt(output_path_naive, t_pred)
    # Part (f): Apply correction factor using validation set and test on true labels
    x_val, y_val = util.load_dataset(valid_path,
                                     label_col='y',
                                     add_intercept=True)
    h_val = model.predict(x_val)
    alpha = np.mean(h_val[y_val == 1])
    py_test = model.predict(x_test)
    pt_test = py_test / alpha
    util.plot(x_test,
              t_test,
              model.theta,
              '{}.png'.format(output_path_adjusted),
              correction=alpha)
    np.savetxt(output_path_adjusted, pt_test)
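Part (f) rests on the positive-only identity p(y = 1 | x) = alpha * p(t = 1 | x), so alpha is estimated as the mean predicted probability over validation examples with y = 1 and then divided out. A standalone sketch of that estimator (estimate_alpha is a hypothetical helper name; it assumes a fitted model whose predict returns probabilities):

import numpy as np

def estimate_alpha(model, x_val, y_val):
    # alpha ~= E[h(x) | y = 1], averaged over the labeled positives
    h_val = model.predict(x_val)
    return np.mean(h_val[y_val == 1])

# p(t = 1 | x) is then approximated by model.predict(x) / alpha.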
Example #3
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train_t = util.load_dataset(train_path,
                                           label_col='t',
                                           add_intercept=True)
    model = LogisticRegression()

    # Fit model on true labels
    model.fit(x_train, y_train_t)

    x_val, y_val_t = util.load_dataset(valid_path,
                                       label_col='t',
                                       add_intercept=True)

    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    np.savetxt(output_path_true, model.predict(x_val))
    util.plot(x_val, y_val_t, model.theta, output_path_true[:-4])

    # Part (b): Train on y-labels and test on true labels
    _, y_train_y = util.load_dataset(train_path,
                                     label_col='y',
                                     add_intercept=True)

    model = LogisticRegression()

    # Train model on y-labels
    model.fit(x_train, y_train_y)

    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    np.savetxt(output_path_naive, model.predict(x_val))
    util.plot(x_val, y_val_t, model.theta, output_path_naive[:-4])
Example #4
def learn():

    stoplist = makeStoplist()
    features = extractFeaturesFromFile(stoplist=stoplist)
    vectorizer = TfidfVectorizer(encoding=ENCODING)
    X_train = vectorizer.fit_transform(
        [" ".join(feature[1:]) for feature in features])
    y_train = np.zeros(len(features))
    for i in range(len(features)):
        if features[i][0] == "+1":
            y_train[i] = 1
    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    io.savemat("X_train", {"X_train": X_train})
    np.save("y_train", y_train)
    joblib.dump(vectorizer, "tfidf.vec")
    clf.save("logreg")
Example #5
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # Part (a):
    x_train, t_train = util.load_dataset(train_path, 't', add_intercept=True)
    x_test, t_test = util.load_dataset(test_path, 't', add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, t_train)
    util.plot(x_test, t_test, clf.theta, 'posonly-true.jpg')
    np.savetxt(output_path_true, clf.predict(x_test))

    # Part (b):
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_test, y_test = util.load_dataset(test_path, add_intercept=True)
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    util.plot(x_test, t_test, clf.theta, 'posonly-naive.jpg')
    np.savetxt(output_path_naive, clf.predict(x_test))

    # Part (f):
    alpha = np.mean(clf.predict(x_valid[y_valid == 1]))
    np.savetxt(output_path_adjusted, clf.predict(x_test) / alpha)
    clf.theta[0] += np.log(2 / alpha - 1)
    util.plot(x_test, t_test, clf.theta, 'posonly-adjusted.jpg')
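The intercept shift above follows from p(y = 1 | x) = alpha * p(t = 1 | x): the corrected boundary p(t = 1 | x) = 1/2 lies where sigmoid(theta^T x) = alpha / 2, i.e. theta^T x = -log(2 / alpha - 1), so adding log(2 / alpha - 1) to theta[0] places the plotted 0.5 level set on the corrected boundary.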
Example #6
if __name__ == '__main__':

    # Create parser
    p = Parser()

    # Create training dataset
    ds = p.create_dataset("en-ud-train-projective.conllu", train=True)
    model_file = 'model.pkl'
    # model_file = 'model_t800.pkl'
    # Train LR model

    if os.path.exists(model_file):
        # if model exists, load from file
        print("Loading existing model...")
        with open(model_file, 'rb') as f:
            lr = pickle.load(f)
    else:
        # train model using minibatch GD
        lr = LogisticRegression()
        lr.fit(*ds.to_arrays())
        with open(model_file, 'wb') as f:
            pickle.dump(lr, f)
    
    # Create test dataset
    test_ds = p.create_dataset("en-ud-dev.conllu")
    # Copy feature maps to ensure that test datapoints are encoded in the same way
    test_ds.copy_feature_maps(ds)
    # Compute move-level accuracy
    lr.classify_datapoints(*test_ds.to_arrays())
    
    # Compute UAS and sentence-level accuracy
    t = TreeConstructor(p)
    t.evaluate(lr, 'en-ud-dev.conllu', ds)
Example #7
data = loadtxt(filename, delimiter=',')
X = data[:, 0:2]
y = np.array([data[:, 2]]).T
n, d = X.shape

# Standardize the data
mean = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mean) / std

# map features into a higher-dimensional feature space
X = mapFeature(X[:, 0], X[:, 1])

# train logistic regression
logregModel = LogisticRegression()
logregModel.fit(X, y)

# reload the data for 2D plotting purposes
data = loadtxt(filename, delimiter=',')
PX = data[:, 0:2]
y = data[:, 2]

# Standardize the data
mean = PX.mean(axis=0)
std = PX.std(axis=0)
PX = (PX - mean) / std

# Plot the decision boundary
h = .02  # step size in the mesh
x_min, x_max = PX[:, 0].min() - .5, PX[:, 0].max() + .5
Example #8
def main(train_path, validation_path, save_path):
    """Problem 2: Logistic regression for imbalanced labels.

    Run under the following conditions:
        1. naive logistic regression
        2. upsampling minority class

    Args:
        train_path: Path to CSV file containing training set.
        validation_path: Path to CSV file containing validation set.
        save_path: Path to save predictions.
    """
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_upsampling = save_path.replace(WILDCARD, 'upsampling')

    # *** START CODE HERE ***
    # Part (b): Vanilla logistic regression
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    print("Vanilla Logistic Regression:")
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    x_val, y_val = util.load_dataset(validation_path, add_intercept=True)

    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_val)

    np.savetxt(output_path_naive, y_predict)
    y_predict = y_predict >= 0.5
    util.plot(x_val, y_predict, clf.theta, output_path_naive[:-4])

    accuracy = np.mean(y_predict == y_val)
    A_0 = np.sum((y_predict == 0) * (y_val == 0)) / np.sum(y_val == 0)
    A_1 = np.sum((y_predict == 1) * (y_val == 1)) / np.sum(y_val == 1)
    balanced_accuracy = 0.5 * (A_0 + A_1)
    print("Accuracy: {},\nAccuracy for class 0: {},\nAccuracy for class 1: {},"
          "\nBalanced Accuracy: {}".format(accuracy, A_0, A_1,
                                           balanced_accuracy))

    # plot the true validation labels for comparison:
    util.plot(x_val, y_val, clf.theta, output_path_naive[:-4] + "validation")
    # Part (d): Upsampling minority class
    # Make sure to save predicted probabilities to output_path_upsampling using np.savetxt()
    # Add int(1 / kappa) - 1 extra copies of each minority example, so each
    # appears roughly 1 / kappa times in total (see the note after this example)
    num_add = int(1 / kappa) - 1

    x_train = np.concatenate(
        (x_train, np.repeat(x_train[y_train == 1, :], num_add, axis=0)),
        axis=0)
    y_train = np.concatenate(
        (y_train, np.repeat(y_train[y_train == 1], num_add, axis=0)), axis=0)

    x_val, y_val = util.load_dataset(validation_path, add_intercept=True)

    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_val)

    np.savetxt(output_path_upsampling, y_predict)
    y_predict = y_predict >= 0.5
    util.plot(x_val, y_predict, clf.theta, output_path_upsampling[:-4])

    accuracy = np.mean(y_predict == y_val)
    A_0 = np.sum((y_predict == 0) * (y_val == 0)) / np.sum(y_val == 0)
    A_1 = np.sum((y_predict == 1) * (y_val == 1)) / np.sum(y_val == 1)
    balanced_accuracy = 0.5 * (A_0 + A_1)
    print("Accuracy: {},\nAccuracy for class 0: {},\nAccuracy for class 1: {},"
          "\nBalanced Accuracy: {}".format(accuracy, A_0, A_1,
                                           balanced_accuracy))
    # plot the true validation labels for comparison:
    util.plot(x_val, y_val, clf.theta,
              output_path_upsampling[:-4] + "validation")
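On the upsampling factor: if positives make up a fraction kappa of the training set, adding int(1 / kappa) - 1 extra copies of each positive yields roughly 1 / kappa copies in total, bringing the classes close to parity. A quick numeric check (kappa = 0.1 is an assumed value for illustration; in the snippet above it is a module-level constant):

import numpy as np

# 2 positives among 20 samples (kappa = 0.1): upsampling yields
# 20 positives vs. 18 negatives, close to an even split.
kappa = 0.1
y = np.array([1, 1] + [0] * 18)
num_add = int(1 / kappa) - 1
y_up = np.concatenate((y, np.repeat(y[y == 1], num_add)))
print((y_up == 1).sum(), (y_up == 0).sum())  # prints: 20 18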
Example #9
def evaluatePerformance(numTrials=1000):
    '''
    Evaluate the performance of decision trees and logistic regression,
    average over 1,000 trials of 10-fold cross validation

    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of logistic regression
      stats[1,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API**
    '''

    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n,d = X.shape

    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    #1000 trials
    num_folds = 10
    percent_incs = 10
    tree_accuracy = np.zeros(shape=[numTrials*num_folds,percent_incs])
    log_accuracy = np.zeros(shape=[numTrials*num_folds,percent_incs])

    #split the data (folds are re-split inside each trial, after shuffling)
    k_fold = sklearn.model_selection.KFold(n_splits=num_folds)

    np.random.seed(13)  # seed once, outside the loop, so each trial gets a fresh shuffle
    for i in range(numTrials):
        #for each trial, shuffle the data
        #print('Iteration:', i + 1)
        idx = np.arange(n)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        j = 0
        for train_index, test_index in k_fold.split(X):
            for k in range(percent_incs):
                #get the data splits for the current fold
                Xtrain, Xtest = X[train_index[0:(n // percent_incs) * (k + 1)]], X[test_index]
                ytrain, ytest = y[train_index[0:(n // percent_incs) * (k + 1)]], y[test_index]

                # train the decision tree
                clf = tree.DecisionTreeClassifier()
                clf = clf.fit(Xtrain, ytrain)

                # output tree predictions on the remaining data and check them
                tree_pred = clf.predict(Xtest)
                tree_accuracy[i*num_folds + j,k] = accuracy_score(ytest, tree_pred)

                #train logistic regression
                logregModel = LogisticRegression(alpha = 0.1, epsilon = 0.005)
                logregModel.fit(Xtrain, ytrain)

                #output logreg predictions on the remaining data and check them
                log_pred = logregModel.predict(Xtest)
                log_accuracy[i*num_folds + j,k] = accuracy_score(ytest, log_pred)

            j += 1

    # mean test accuracy when training on the full training folds (last column)
    meanDecisionTreeAccuracy = np.mean(tree_accuracy[:,percent_incs-1])

    stddevDecisionTreeAccuracy = np.std(tree_accuracy[:,percent_incs-1])
    meanLogisticRegressionAccuracy = np.mean(log_accuracy[:,percent_incs-1])
    stddevLogisticRegressionAccuracy = np.std(log_accuracy[:,percent_incs-1])

    #print graph
    tree_array = np.zeros(percent_incs)
    tree_array_std = np.zeros(percent_incs)
    log_array = np.zeros(percent_incs)
    log_array_std = np.zeros(percent_incs)
    for i in range(percent_incs):
        tree_array[i] = np.mean(tree_accuracy[:,i])
        tree_array_std[i] = np.std(tree_accuracy[:,i])
        log_array[i] = np.mean(log_accuracy[:,i])
        log_array_std[i] = np.std(log_accuracy[:,i])

    x_axis = (np.arange(percent_incs) + 1) * 10
    tree_plot = plt.errorbar(x=x_axis, y=tree_array, yerr=tree_array_std)
    log_plot = plt.errorbar(x=x_axis, y=log_array, yerr=log_array_std)
    plt.xlabel('Training Data Used (percentage)')
    plt.ylabel('Accuracy (mean)')
    plt.title('Learning Curve')
    plt.axis([10, 100, 0.0, 1.0])
    plt.grid(True)
    plt.legend([tree_plot, log_plot], ["Decision Tree", "Logistic Regression"], loc=4)

    plt.savefig('learningcurve.pdf')
    #plt.show()

    # make certain that the return value matches the API specification
    stats = np.zeros((2,2))
    stats[0,0] = meanDecisionTreeAccuracy
    stats[0,1] = stddevDecisionTreeAccuracy
    stats[1,0] = meanLogisticRegressionAccuracy
    stats[1,1] = stddevLogisticRegressionAccuracy
    return stats
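A minimal driver for evaluatePerformance, following the stats layout documented above (hypothetical invocation; it assumes data/SPECTF.dat is present):

stats = evaluatePerformance(numTrials=1000)
print("Decision tree:       %.3f +/- %.3f" % (stats[0, 0], stats[0, 1]))
print("Logistic regression: %.3f +/- %.3f" % (stats[1, 0], stats[1, 1]))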
Example #10
import time

import numpy as np
from scipy import io
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from logreg import LogisticRegression

if __name__ == "__main__":
    X_train = io.loadmat("X_train")["X_train"]
    X_train = X_train.tocsr()  # convert the sparse matrix format (to match what TfidfVectorizer outputs)
    y_train = np.load("y_train.npy")
    kf = KFold(n_splits=5)

    start = time.time()
    for (i, (train, test)) in enumerate(kf.split(X_train), start=1):
        clf = LogisticRegression()
        clf.fit(X_train[train], y_train[train])
        y_predict = clf.predict(X_train[test])
        y_test = y_train[test]
        print("Fold %d" % i)
        print("正解率: %f" % accuracy_score(y_test, y_predict))
        print("適合率: %f" % precision_score(y_test, y_predict))
        print("再現率: %f" % recall_score(y_test, y_predict))
        print("F1スコア: %f" % f1_score(y_test, y_predict))
        print("")
    elapsed_time = time.time() - start
    print(str(elapsed_time) + "[sec]")
if __name__ == "__main__":
    # Load Data
    filename = 'data/data1.dat'
    data = loadtxt(filename, delimiter=',')
    X = data[:, 0:2]
    y = np.array([data[:, 2]]).T
    n, d = X.shape

    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # train logistic regression
    logregModel = LogisticRegression(regLambda=0.0001)
    logregModel.fit(X, y)

    # Plot the decision boundary
    h = .02  # step size in the mesh
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = logregModel.predict(np.c_[xx.ravel(), yy.ravel()])
    print(Z)

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(4, 3))
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
Example #12
def evaluatePerformance(numTrials = 1000):
    '''
    Evaluate the performance of decision trees and logistic regression,
    average over 1,000 trials of 10-fold cross validation
    
    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of logistic regression
      stats[1,1] = std deviation of logistic regression accuracy
      
    ** Note that your implementation must follow this API**
    '''
    

    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n,d = X.shape

    # shuffle the data
    idx = np.arange(n)
    np.random.seed(13)
    # number of folds
    k = 10 
    # fold boundary indices: start/end of each fold (e.g. for n = 267, k = 10: 0, 26, 52, ..., 267)
    fold_index = n // k
    index_arrayX = [i * fold_index for i in range(k)]
    index_arrayX = np.append(index_arrayX, n)
    index_arrayY = [i * fold_index for i in range(k)]
    index_arrayY = np.append(index_arrayY, n)

    stddevLogisticRegressionAccuracy = 0
    meanDecisionTreeAccuracy = 0
    meanLogisticRegressionAccuracy = 0 
    stddevDecisionTreeAccuracy = 0
    # array to store the learning accuracies: k*numTrials rows, one column per 10% increment of the training data
    log_learning = np.matrix(np.zeros((numTrials*k,9)))
    tree_learning = np.matrix(np.zeros((numTrials*k,9)))
    # row index into the learning matrices
    ll = 0

    # making decision tree object and a logistic regression object 

    clf = tree.DecisionTreeClassifier()
    lr = LogisticRegression(alpha = 0.0000001, regLambda=0.001, epsilon=0.0001, maxNumIters = 10000)

    # ~~~~~~~~~~~main loop ~~~~~~~~~~~~~~~~~
    for i in range(numTrials):
        #shuffle data after each cross validation
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]
        for j in range(k):
          # separate test data from train data; the test fold moves one block forward each iteration
          end = j + 1
          Xtest = X[index_arrayX[j]:index_arrayX[end], :]
          ytest = y[index_arrayY[j]:index_arrayY[end], :]
          Xtrain = X[0:index_arrayX[j], :]
          ytrain = y[0:index_arrayY[j], :]
          Xtrain = np.append(Xtrain, X[index_arrayX[end]:n, :], axis=0)
          ytrain = np.append(ytrain, y[index_arrayY[end]:n, :], axis=0)
          size_n, size_d = Xtrain.shape
          #size of 10% blocks
          train_percentage = size_n // 10
          for l in range(1, 10):
            #train / compute accuracy on 10%, then 20%, etc. of the training data
            clf = clf.fit(Xtrain[0:train_percentage*l, :], ytrain[0:train_percentage*l, :])
            treey_pred = clf.predict(Xtest)
            lr.fit(Xtrain[0:train_percentage*l, :], ytrain[0:train_percentage*l, :])
            logy_pred = lr.predict(Xtest)
            # store the accuracies in the learning matrices (evaluated on the full test fold)
            log_learning[ll, l - 1] = accuracy_score(ytest, logy_pred)
            tree_learning[ll, l - 1] = accuracy_score(ytest, treey_pred)
          ll += 1
    for o in range(9):
      # mean accuracy for each percentage block, averaged over all folds and trials
      meanDecisionTreeAccuracy = np.sum(tree_learning[:, o]) / (k * numTrials) + meanDecisionTreeAccuracy
      meanLogisticRegressionAccuracy = np.sum(log_learning[:, o]) / (k * numTrials) + meanLogisticRegressionAccuracy

    # overall mean accuracy across all percentage blocks, and standard deviations over all runs
    meanDecisionTreeAccuracy = meanDecisionTreeAccuracy / 9
    meanLogisticRegressionAccuracy = meanLogisticRegressionAccuracy / 9
    stddevDecisionTreeAccuracy = np.std(tree_learning)
    stddevLogisticRegressionAccuracy = np.std(log_learning)


    # make certain that the return value matches the API specification
    stats = np.zeros((2,2))
    stats[0,0] = meanDecisionTreeAccuracy
    stats[0,1] = stddevDecisionTreeAccuracy
    stats[1,0] = meanLogisticRegressionAccuracy
    stats[1,1] = stddevLogisticRegressionAccuracy
    plot_log = np.zeros((9, 1))
    plot_tree = np.zeros((9, 1))
    #put the mean accuracy for each percentage block into an array
    for q in range(9):
      plot_log[q] = np.sum(log_learning[:, q]) / (k * numTrials)
      plot_tree[q] = np.sum(tree_learning[:, q]) / (k * numTrials)
    percent_array = [10, 20, 30, 40, 50, 60, 70, 80, 90]

    plt.figure(1)
    plt.clf()
    plt.title("Learning Curve")
    plt.xlabel("Percentage")
    plt.ylabel("Accuracy")
    plt.axis([0,100, .6,.8])
    plt.plot(percent_array, plot_log, 'rx', label='Logistic Regression')
    plt.plot(percent_array, plot_tree, 'bx', label='Decision Tree')
    plt.legend(loc='lower right')
    plt.savefig('learningcurve.png')
    #plt.show()
    

    return stats
Example #13
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='t',
                                         add_intercept=True)

    model_true = LogisticRegression()
    model_true.fit(x_train, y_train)

    x_test, y_test = util.load_dataset(test_path,
                                       label_col='t',
                                       add_intercept=True)

    util.plot(x_test, y_test, model_true.theta, 'plot_5a.png')

    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    np.savetxt(output_path_true, model_true.predict(x_test))

    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='y',
                                         add_intercept=True)

    model_naive = LogisticRegression()
    model_naive.fit(x_train, y_train)

    x_test, y_test = util.load_dataset(test_path,
                                       label_col='y',
                                       add_intercept=True)
    util.plot(x_test, y_test, model_naive.theta, 'plot_5b.png')

    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    np.savetxt(output_path_naive, model_naive.predict(x_test))

    # Part (f): Apply correction factor using validation set and test on true labels
    x_valid, y_valid = util.load_dataset(valid_path,
                                         label_col='y',
                                         add_intercept=True)

    # alpha = mean predicted probability over the labeled (y = 1) validation examples
    alpha = np.mean(model_naive.predict(x_valid[y_valid == 1]))

    x_test, y_test = util.load_dataset(test_path,
                                       label_col='y',
                                       add_intercept=True)

    util.plot(x_test,
              y_test,
              model_naive.theta,
              'plot_5f.png',
              correction=alpha)

    np.savetxt(output_path_adjusted, model_naive.predict(x_test) / alpha)
Example #14
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***

    def image_path(path):
        return path[:-3] + "png"

    # Part (a): Train and test on true labels
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    x_train, t_train = util.load_dataset(train_path,
                                         label_col="t",
                                         add_intercept=True)
    x_test, t_test = util.load_dataset(test_path,
                                       label_col="t",
                                       add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, t_train)

    prob_test = model.predict(x_test)
    np.savetxt(output_path_true, prob_test)
    util.plot(x_test,
              t_test,
              model.theta,
              save_path=image_path(output_path_true))

    # Part (b): Train on y-labels and test on true labels
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    x_train, y_train = util.load_dataset(train_path,
                                         label_col="y",
                                         add_intercept=True)
    x_test, y_test = util.load_dataset(test_path,
                                       label_col="y",
                                       add_intercept=True)

    model = LogisticRegression()
    model.fit(x_train, y_train)

    prob_test = model.predict(x_test)
    np.savetxt(output_path_naive, prob_test)

    util.plot(x_test,
              t_test,
              model.theta,
              save_path=image_path(output_path_naive))
    # Part (f): Apply correction factor using validation set and test on true labels
    # Plot and use np.savetxt to save outputs to output_path_adjusted
    # Estimate alpha
    x_val, y_val = util.load_dataset(valid_path,
                                     label_col="y",
                                     add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    h_val = model.predict(x_val)
    alpha = np.mean(h_val[y_val == 1])  # Mean over positive y samples.
    # Adjustment
    py_test = model.predict(x_test)
    pt_test = py_test / alpha
    np.savetxt(output_path_adjusted, pt_test)
    # Plot
    util.plot(x_test,
              t_test,
              model.theta,
              save_path=image_path(output_path_adjusted),
              correction=alpha)
Example #15
# (visualize a single training example)
# plt.imshow(train_set_x_orig[index])
# plt.show()
# print("y = " + str(train_set_y[:, index]) + ", it's a '" + classes[np.squeeze(train_set_y[:, index])].decode("utf-8") + "' picture.")

# Flatten the images
train_set_x_flatten = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T
test_set_x_flatten = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T

# Normalise image values
train_set_x = train_set_x_flatten / 255.
test_set_x = test_set_x_flatten / 255.

# Create model instance
model = LogisticRegression()

# Fit model to the data
model.fit(train_set_x, train_set_y)

# Train the model
model.train(2400, verbose=True)

# Predict values
predictions = model.predict(test_set_x)

# Check accuracy
model.print_accuracy(predictions, test_set_y)

# Plot training loss
model.plot_cost()
Example #16
    batch_size = 100
    n_batches = int(Xtrain.shape[0] / batch_size)
    logReg = LogisticRegression(n_batches=n_batches, allow_early_stop=False)

    etas = [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
    acc_list = []

    accuracies_train = []
    costs_train = []
    accuracies_test = []
    costs_test = []

    for eta in etas:
        a, b, c, d = logReg.fit(Xtrain,
                                ytrain,
                                eta=eta,
                                n_epochs=2000,
                                Xtest=Xtest,
                                ytest=ytest)
        acc_list.append(logReg.accuracy(Xtest, ytest))

        accuracies_train.append(a)
        costs_train.append(b)
        accuracies_test.append(c)
        costs_test.append(d)

        print("Accuracy vs. test data, own logreg:", acc_list[-1])

    plt.figure(figsize=(10, 8))
    plt.title("Accuracy score for varying learning rate, logistic regression")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
Example #17
    test_file = None
    if args.test:
        test_file = args.test
        test = pd.read_csv(test_file)

    if test_file is None:
        print("Splitting train to accommodate for the test set.")
        train, test = train_test_split(train, test_size=0.2)

    train_Y = train['labels'].values
    train_X = train.drop(['labels'], axis=1).values

    test_Y = test['labels'].values
    test_X = test.drop(['labels'], axis=1).values

    print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)
    logreg = LogisticRegression(learning_rate=lr,
                                epochs=epochs,
                                initialiser=init,
                                verbose=verbose)
    logreg.fit(train_X, train_Y)
    predictions = logreg.predict(test_X)

    if args.output == ".":
        args.output = os.getcwd()
    with open(args.output + "/classification_report.txt", 'w') as f:
        f.write(str(classification_report(test_Y, predictions)))

    test['predictions'] = predictions
    test.to_csv(args.output + "/predictions.csv")
Example #18
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         add_intercept=True,
                                         label_col='t')
    x_valid, y_valid = util.load_dataset(valid_path,
                                         add_intercept=True,
                                         label_col='t')
    from logreg import LogisticRegression
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    print(clf.theta)

    fig, ax = plt.subplots(1, 1, figsize=(12, 8))

    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plot_decision_line(clf.theta, x_valid, ax)
    plt.savefig("posonly_all_observed.png")
    plt.show()

    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         add_intercept=True,
                                         label_col='y')
    x_valid, y_valid = util.load_dataset(valid_path,
                                         add_intercept=True,
                                         label_col='y')
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    print(clf.theta)

    fig, ax = plt.subplots(1, 1, figsize=(12, 8))

    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plot_decision_line(clf.theta, x_valid, ax)
    plt.savefig("naive_training_partial.png")
    plt.show()
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    # Part (f): Apply correction factor using validation set and test on true labels
    clf = LogisticRegression()
    clf.fit(x_train, y_train)

    # predicted probabilities on the validation set
    y_pred = clf.predict(x_valid)
    print(y_pred)

    fig, ax = plt.subplots(1, 1, figsize=(12, 8))

    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plt.show()
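The snippet ends before the correction factor is applied. A sketch of the missing part (f) step, following the pattern of Examples #2 and #14 above and reusing the names already defined in this snippet:

# Hypothetical completion: estimate alpha on the labeled validation
# positives and rescale the naive probabilities, as in the earlier examples.
alpha = np.mean(y_pred[y_valid == 1])
np.savetxt(output_path_adjusted, clf.predict(x_valid) / alpha)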