Esempio n. 1
0
def main(train_path, eval_path, pred_path):
    """Problem 1(b): Logistic regression with Newton's Method.

    Fits a logistic regression classifier on the training set, plots its
    decision boundary over the evaluation set and writes the evaluation-set
    predictions to disk.

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions. The plot is written to
            ``pred_path + ".plot.png"``.
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)

    # Start the optimisation from an all-zero parameter vector with one
    # entry per column of the training matrix.
    classifier = LogisticRegression(
        step_size=0.2,
        max_iter=100,
        eps=1e-5,
        theta_0=np.zeros(x_train.shape[1]),
        verbose=True,
    )
    classifier.fit(x_train, y_train)
    eval_predictions = classifier.predict(x_eval)

    # Decision boundary drawn on top of the evaluation set.
    util.plot(x_eval, y_eval, classifier.theta, pred_path + ".plot.png",
              correction=1.0)

    np.savetxt(pred_path, eval_predictions)
Esempio n. 2
0
def main(train_path, test_path):
    """Fit a logistic regression model and print test and train reports.

    Args:
        train_path: Sequence of training-set paths. The first entry is always
            loaded; a second entry is stacked onto it only when the sequence
            has exactly two elements.
        test_path: Path to the test set.
    """
    features, labels = util.load_dataset(train_path[0])

    # Optional second training file: append its rows to the first one.
    if len(train_path) == 2:
        extra_features, extra_labels = util.load_dataset(train_path[1])
        features = np.concatenate((features, extra_features), axis=0)
        labels = np.concatenate((labels, extra_labels), axis=0)

    x_test, y_test = util.load_dataset(test_path)

    # Fit the logistic regression classifier.
    model = LogisticRegression().fit(features, labels)

    # Report on the test data first, then on the data the model was fit on.
    for inputs, targets in ((x_test, y_test), (features, labels)):
        print(classification_report(targets, model.predict(inputs)))
Esempio n. 3
0
def main(lr, train_path, eval_path, pred_path):
    """Problem 3(d): Poisson regression with gradient ascent.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    model = PoissonRegression(max_iter=10000)
    # The learning rate is assigned on the instance after construction.
    model.step_size = lr
    model.fit(x_train, y_train)

    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)

    # Predict one evaluation example at a time into a buffer that has the
    # same shape and dtype as the evaluation targets.
    y_pred = np.empty_like(y_eval)
    for idx, example in enumerate(x_eval):
        y_pred[idx] = model.predict(example)

    np.savetxt(pred_path, y_pred, delimiter=',')
Esempio n. 4
0
def main(tau, train_path, eval_path):
    """Problem 5(b): Locally weighted regression (LWR)

    Fits an LWR model, prints its mean squared error on the evaluation set
    and saves a plot of the evaluation predictions drawn over the training
    data to ``output/p05b.png``.

    Args:
        tau: Bandwidth parameter for LWR.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
    """
    # Load training set
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # Fit a LWR model
    model = LocallyWeightedLinearRegression(tau=tau)
    model.fit(x_train, y_train)

    # Get MSE value on the validation set
    x_val, y_val = util.load_dataset(eval_path, add_intercept=True)
    y_pred = model.predict(x_val)
    print('p5b mse: ', ((y_val - y_pred)**2).mean(axis=0))

    # Plot validation predictions on top of training set.
    # Column 0 is skipped: with add_intercept=True it is presumably the
    # constant intercept column (confirm against util.load_dataset), and
    # handing matplotlib the full 2-D matrix would draw that column as an
    # extra, meaningless series.
    plt.figure()
    plt.plot(x_train[:, 1:], y_train, 'bx', linewidth=2)
    plt.plot(x_val[:, 1:], y_pred, 'ro', linewidth=2)

    plt.xlabel('x')
    plt.ylabel('y')
    plt.savefig('output/p05b.png')
Esempio n. 5
0
def main(train_path, eval_path, pred_path):
    """Problem 1(b): Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # Initial guess of the parameters: one zero per column of the training
    # matrix, so the script is not tied to a fixed number of features.
    theta_0 = np.zeros(x_train.shape[1])

    # Fit the model
    model = LogisticRegression(theta_0=theta_0)
    model.fit(x_train, y_train)

    # Predict using the trained model
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)
    y_pred = model.predict(x_eval)

    # Plot decision boundary on top of validation set; the output file is
    # named after the last component of eval_path.
    util.plot(x_eval, y_eval, model.theta, 'output/{ds}_log_reg.pdf'.format(ds=eval_path.split('/')[-1]))

    # Use np.savetxt to save predictions on eval set to pred_path
    np.savetxt(pred_path, y_pred)
Esempio n. 6
0
def main(train_path, valid_path, save_path):
    """Problem: Gaussian discriminant analysis (GDA)

    Trains a GDA classifier, plots it against the validation set and saves
    both the validation predictions and the fitted ``theta``.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predictions using np.savetxt(). The figure
            and theta files are derived from it ("<base>_fig.png" and
            "<base>_theta<ext>").
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    classifier = GDA()
    classifier.fit(x_train, y_train)

    # The validation set is loaded with an intercept column for plotting;
    # the classifier was fit without one, so column 0 is dropped for predict.
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True)
    valid_pred = classifier.predict(x_valid[:, 1:])

    stem, suffix = os.path.splitext(save_path)

    plot(x_valid, y_valid, classifier.theta, stem + '_fig.png')

    np.savetxt(save_path, valid_pred)
    np.savetxt(stem + '_theta' + suffix, classifier.theta)
Esempio n. 7
0
def main(train_path, eval_path, pred_path, k=0):
    """Problem 1(b): Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions (only used when ``k == 0``).
        k: Mode switch. ``0`` saves the predictions and a plot; ``1`` returns
            the fitted parameters together with predictions thresholded at
            0.5. Any other value does nothing after fitting.

    Returns:
        ``(theta, p)`` when ``k == 1``, otherwise ``None``.
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)

    clf = LogisticRegression()
    # fit() is expected to return the learned parameter vector here.
    theta = clf.fit(x_train, y_train)
    p = clf.predict(x_eval)

    if k == 1:
        # Binarise in place: below 0.5 becomes 0, everything else 1.
        p[p < 0.5] = 0
        p[p >= 0.5] = 1
        return theta, p

    if k == 0:
        np.savetxt(pred_path, p, delimiter=',')
        util.plot(x_eval, y_eval, theta, 'output/p01b_plot')

    return None
Esempio n. 8
0
def main(train_path, valid_path, save_path):
    """Problem: Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
            The plot is written to the same path with a trailing ".txt"
            removed.
    """
    # Load dataset
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_eval, y_eval = util.load_dataset(valid_path, add_intercept=False)

    # Train a GDA classifier
    clf = GDA()
    clf.fit(x_train, y_train)
    preds = clf.predict(x_eval)

    # Plot decision boundary on validation set; the intercept term is
    # prepended so the plotting helper receives one full parameter vector.
    theta_ = np.insert(clf.theta, 0, clf.theta_zero)

    # Remove the ".txt" suffix only. str.strip('.txt') would instead strip
    # any of the characters '.', 't', 'x' from BOTH ends of the path
    # (e.g. "text.txt" -> "e").
    suffix = '.txt'
    if save_path.endswith(suffix):
        save_path_ = save_path[:-len(suffix)]
    else:
        save_path_ = save_path
    util.plot(x_eval, y_eval, theta_, save_path_)

    # Use np.savetxt to save outputs from validation set to save_path
    np.savetxt(save_path, preds)
Esempio n. 9
0
def main(tau, train_path, eval_path):
    """Problem 5(b): Locally weighted regression (LWR)

    Fits an LWR model, prints its mean squared error on the evaluation set
    and saves a plot of the evaluation predictions drawn over the training
    data to ``output/p05b.png``.

    Args:
        tau: Bandwidth parameter for LWR.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
    """
    # Load training set
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    model = LocallyWeightedLinearRegression(tau=tau)
    model.fit(x_train, y_train)

    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)
    y_pred = model.predict(x_eval)

    mse = np.mean((y_pred - y_eval)**2)
    print(f'MSE={mse}')

    # Column 0 is skipped when plotting: with add_intercept=True it is
    # presumably the constant intercept column (confirm against
    # util.load_dataset), and passing the full 2-D matrix would make
    # matplotlib draw it as an extra series.
    plt.figure()
    plt.plot(x_train[:, 1:], y_train, 'bx', linewidth=2)
    plt.plot(x_eval[:, 1:], y_pred, 'ro', linewidth=2)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.savefig('output/p05b.png')
Esempio n. 10
0
def main(train_path, eval_path, pred_path):
    """Problem 1(e): Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    gda = GDA()
    gda.fit(x_train, y_train)

    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=False)
    eval_predictions = gda.predict(x_eval)

    # The plotting helper gets a single parameter list with the intercept
    # term first, followed by the remaining coefficients.
    boundary_params = [gda.theta_0] + list(gda.theta)
    dataset_name = eval_path.split('/')[-1]
    util.plot(x_eval, y_eval, boundary_params,
              'output/{ds}_GDA.pdf'.format(ds=dataset_name))

    np.savetxt(pred_path, eval_predictions)
Esempio n. 11
0
def main(lr, train_path, eval_path, save_path):
    """Problem: Poisson regression with gradient ascent.

    Fits the model, saves the validation predictions and writes a scatter
    plot of true versus predicted counts next to them.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        save_path: Path to save predictions. The image path is built by
            replacing its last three characters with "png".
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    regressor = PoissonRegression(step_size=lr)
    regressor.fit(x_train, y_train)

    x_val, y_val = util.load_dataset(eval_path, add_intercept=True)
    predictions = regressor.predict(x_val)
    np.savetxt(save_path, predictions)

    # True counts on the x-axis, predicted expected counts on the y-axis.
    plt.scatter(x=y_val, y=predictions, label="Predictions")
    plt.xlabel("True Count")
    plt.ylabel("Predicted Expected Count")

    # Reference diagonal spanning the range of the true counts.
    diagonal = np.array([min(y_val), max(y_val)])
    plt.plot(diagonal, diagonal, alpha=0.6, color="red",
             label="45-degree Line")
    plt.legend()

    plt.savefig(save_path[:-3] + "png")
Esempio n. 12
0
def main(train_path, eval_path, pred_path):
    """Problem 1(e): Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions. The plot is written to
            ``pred_path + ".plot.png"``.
    """
    # Training data is loaded without an intercept column, evaluation data
    # with one.
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)

    classifier = GDA(verbose=True)
    classifier.fit(x_train, y_train)
    eval_predictions = classifier.predict(x_eval)

    # Decision boundary on the validation set.
    util.plot(x_eval, y_eval, classifier.theta, pred_path + ".plot.png",
              correction=1.0)

    np.savetxt(pred_path, eval_predictions)
Esempio n. 13
0
def main(tau, train_path, eval_path):
    """Problem 5(b): Locally weighted regression (LWR)

    Fits an LWR model, shows the validation predictions over the training
    data and prints the mean squared error on the validation set.

    Args:
        tau: Bandwidth parameter for LWR.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
    """
    # Load training set
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # Use the caller's bandwidth rather than a hard-coded 0.5, so the
    # documented `tau` argument actually takes effect.
    LWR = LocallyWeightedLinearRegression(tau)
    LWR.fit(x_train, y_train)
    x_val, y_val = util.load_dataset(eval_path, add_intercept=True)
    y_pred = LWR.predict(x_val)

    # Plot against every column except the first one.
    plt.figure()
    plt.plot(x_train[:, 1:], y_train, 'bx')
    plt.plot(x_val[:, 1:], y_pred, 'ro')
    plt.show()

    mse = ((y_pred - y_val) ** 2).mean()
    print(mse)
def main(tau_values, train_path, valid_path, test_path, pred_path):
    """Problem 5(b): Tune the bandwidth parameter tau for LWR.

    Fits one model per candidate tau, keeps the one with the lowest MSE on
    the validation set, reports its MSE on the test set and saves its test
    predictions.

    Args:
        tau_values: List of tau values to try.
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions. Nothing is written when it is
            empty or None.
    """
    # Load the three splits
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_eval, y_eval = util.load_dataset(valid_path, add_intercept=True)
    x_test, y_test = util.load_dataset(test_path, add_intercept=True)

    # Search tau_values for the best tau (lowest MSE on the validation set)
    MSE_values = []
    model_list = []
    for tau in tau_values:
        clf = LocallyWeightedLinearRegression(tau)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_eval)
        MSE = np.linalg.norm(y_pred - y_eval)**2 / y_eval.shape[0]
        print("tau {}, MSE {}".format(tau, MSE))
        MSE_values.append(MSE)
        model_list.append(clf)
        plot_lwr(x_train, y_train, x_eval, y_pred,
                 "output/tau_{}.png".format(tau))

    # Evaluate the best model on the test split
    idx = np.argmin(MSE_values)
    best_model, best_tau = model_list[idx], tau_values[idx]
    y_test_pred = best_model.predict(x_test)
    test_MSE = np.linalg.norm(y_test_pred - y_test)**2 / y_test.shape[0]
    print("best tau {}, MSE on the test split {}".format(best_tau, test_MSE))

    # Previously pred_path was accepted and documented but never used, so
    # the test predictions were silently discarded.
    if pred_path:
        np.savetxt(pred_path, y_test_pred)
Esempio n. 15
0
def load_model_input(args, fold):
    """Load the preprocessed inputs and targets of one data fold.

    Args:
        args: Passed through to ``get_model_path`` to locate the model
            directory.
        fold: Fold name used in the file names ("X_<fold>.json" and
            "Y_<fold>.json").

    Returns:
        Tuple ``(X, Y)`` where ``X`` is whatever ``load_dataset`` returns and
        ``Y`` is converted to a NumPy array.
    """
    logging.info("Loading preprocessed {} dataset".format(fold))

    model_dir = get_model_path(args)
    inputs_file = model_dir / "X_{}.json".format(fold)
    targets_file = model_dir / "Y_{}.json".format(fold)

    inputs = load_dataset(inputs_file)
    targets = load_dataset(targets_file)
    return inputs, np.array(targets)
Esempio n. 16
0
    def parallel(self):
        """Train one AdaBoost_DP model per data partition and log test scores.

        For every combination of ``self.number_of_learners`` and
        ``self.height`` a model is trained on each horizontal partition of
        the training set. Their test predictions are combined using the
        weights from ``get_weight`` and thresholded at 0.5. The micro F1
        score and the ROC AUC are then appended, one per line, to
        ``adaboostdp_output.txt``.
        """
        X_train, y_train = load_dataset(self.train_set)
        X_test, y_test = load_dataset(self.test_set)
        rows = X_train.shape[0]
        hor_X, hor_y = horizontal_split_data(X_train, y_train, self.part)
        lambda_accurancy = np.zeros(self.part)

        for i in self.number_of_learners:
            for j in self.height:
                predict_of_test = np.zeros(y_test.shape[0])
                adaboost_set = []
                # NOTE(review): iterating hor_X directly and using `k` as an
                # index only works if hor_X is a mapping keyed 0..part-1 in
                # order; confirm against horizontal_split_data.
                for k in hor_X:
                    lambda_accurancy[k], adaboost_instance = self.AdaBoost_DP(
                        hor_X[k], hor_y[k], self.privacy_epsilon[k], j, i,
                        rows)
                    adaboost_set.append(adaboost_instance)

                weights = get_weight(lambda_accurancy)
                for k in range(self.part):
                    predict_of_test += weights[k] * adaboost_set[k].predict(
                        X_test)
                predict_of_test = [
                    0.0 if predict <= 0.5 else 1.0
                    for predict in predict_of_test
                ]

                # The scores are numbers, so they must be converted to text
                # before being joined with the newline (number + str raises
                # TypeError). The context manager closes the file even if a
                # metric call fails.
                with open("adaboostdp_output.txt", 'a') as output_file:
                    output_file.write("{}\n".format(
                        f1_score(y_test, predict_of_test, average="micro")))
                    output_file.write("{}\n".format(
                        roc_auc_score(y_test, predict_of_test)))
Esempio n. 17
0
def transforming_stuff(train_path, valid_path, save_path):
    """Fit GDA on a log-transformed second feature and plot the boundary.

    The classifier is trained on ``[x1, log(x2)]``. The validation inputs
    get the same transform before prediction, and the decision boundary is
    mapped back to the original ``(x1, x2)`` space for plotting.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predictions using np.savetxt(). The figure
            ("<base>_fig.png") and theta ("<base>_theta<ext>") paths are
            derived from it. Nothing is written when it is empty or None.
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_train2 = np.stack([x_train[:, 0], np.log(x_train[:, 1])]).T

    # Train a GDA classifier on the transformed features
    clf = GDA()
    clf.fit(x_train2, y_train)

    # The validation set carries an intercept column, so the two features
    # are its last two columns. They must get the same log transform the
    # model was trained with; predicting on raw x2 would be inconsistent.
    x_test, y_test = util.load_dataset(valid_path, add_intercept=True)
    x_test2 = np.stack([x_test[:, -2], np.log(x_test[:, -1])]).T
    y_test_pred = clf.predict(x_test2)

    # Plot the validation points in the original feature space
    x, y = x_test, y_test
    plt.figure()
    plt.plot(x[y == 0, -2], x[y == 0, -1], 'bx', linewidth=2)
    plt.plot(x[y == 1, -2], x[y == 1, -1], 'go', linewidth=2)

    # Boundary in transformed space: theta0 + theta1*x1 + theta2*log(x2) = 0
    # => x2 = exp(-(theta0 + theta1*x1) / theta2).
    # (The intercept has to be divided by -theta2 as well.)
    x1_min, x1_max = x[:, -2].min(), x[:, -2].max()
    x_pts = np.arange(x1_min, x1_max, (x1_max - x1_min) / 100)
    theta0, theta1, theta2 = clf.theta
    y_pts = np.exp(-(theta0 + theta1 * x_pts) / theta2)
    plt.plot(x_pts, y_pts)
    plt.xlabel('x1')
    plt.ylabel('x2')

    if save_path:
        base, ext = os.path.splitext(save_path)
        plt.savefig(base + '_fig.png')

        # Use np.savetxt to save predictions on eval set to save_path
        np.savetxt(save_path, y_test_pred)
        np.savetxt(base + '_theta' + ext, clf.theta)
Esempio n. 18
0
def main(train_path, valid_path, save_path):
    """Problem: Logistic regression with Newton's Method.

    Standardises the features, fits a logistic regression model, prints a
    classification report and confusion matrix for the validation set and
    saves the rounded validation predictions.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path the rounded predictions are written to with
            np.savetxt().
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=False)
    # Drop the first column of both sets.
    x_train = x_train[:, 1:]
    x_valid = x_valid[:, 1:]

    # Normalize the data (the last column is skipped as a binary feature).
    # The statistics come from the TRAINING set and are applied to both
    # splits, so validation data is scaled exactly as the model saw its
    # training data. They are computed before x_train is overwritten.
    feature_mean = np.mean(x_train[:, :-1], axis=0)
    feature_std = np.std(x_train[:, :-1], axis=0)
    x_train[:, :-1] = (x_train[:, :-1] - feature_mean) / feature_std
    x_valid[:, :-1] = (x_valid[:, :-1] - feature_mean) / feature_std

    # add intercept for logistic regression:
    x_train = util.add_intercept(x_train)
    x_valid = util.add_intercept(x_valid)

    clf = logistic.LogisticRegression(step_size=1, max_iter=100000000)
    clf.fit(x_train, y_train)

    y_pred_prob = clf.predict(x_valid)
    y_pred = y_pred_prob.round()

    print(classification_report(y_valid, y_pred))
    print(confusion_matrix(y_valid, y_pred))
    print(np.sum(y_valid))

    np.savetxt(save_path, y_pred)
Esempio n. 19
0
def main(train_path, valid_path, save_path):
    """Problem: Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
            The plot is saved to the same path with ".txt" replaced by
            ".jpg".
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # Train a logistic regression classifier
    model = LogisticRegression()
    model.fit(x_train, y_train)

    # Plot decision boundary on top of validation set.
    # The replacement keeps the dot: replacing ".txt" with "jpg" turned
    # "pred.txt" into "predjpg", a path without any extension.
    x_val, y_val = util.load_dataset(valid_path, add_intercept=True)
    util.plot(x_val,
              y_val,
              model.theta,
              save_path=save_path.replace(".txt", ".jpg"))

    # Use np.savetxt to save predictions on eval set to save_path
    yhat = model.predict(x_val)
    np.savetxt(save_path, yhat)

    print(f"LogReg acc: {util.compute_accuracy(y_val, yhat)}")
    print(f"LogReg log loss: {util.compute_log_loss(y_val, yhat)}")
Esempio n. 20
0
def main(train_path, test_path):
    """Load the data and train the Keras network via ``keras_cat_nn``.

    Args:
        train_path: Sequence of training-set paths. The first entry is always
            loaded; a second entry is stacked onto it only when the sequence
            has exactly two elements.
        test_path: Path to the test set.
    """
    features, labels = util.load_dataset(train_path[0])

    # Optional second training file: append its rows to the first one.
    if len(train_path) == 2:
        extra_features, extra_labels = util.load_dataset(train_path[1])
        features = np.concatenate((features, extra_features), axis=0)
        labels = np.concatenate((labels, extra_labels), axis=0)

    x_test, y_test = util.load_dataset(test_path)

    # Four size arguments (h1..h4) and the epoch count, passed positionally.
    hidden_sizes = (300, 70, 12, 8)
    epoch = 100

    acc_train, acc_test = keras_cat_nn(features, labels, x_test, y_test,
                                       *hidden_sizes, epoch, feature=False)
Esempio n. 21
0
def main(tau, train_path, eval_path):
    """Problem 5(b): Locally weighted regression (LWR)

    Fits an LWR model, prints its mean squared error on the validation set
    and draws the validation predictions over the training data. The
    figure is neither shown nor saved.

    Args:
        tau: Bandwidth parameter for LWR.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
    """
    # Load training set
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    # Fit a LWR model with the caller's bandwidth (previously hard-coded to
    # 0.5, which silently ignored the `tau` argument).
    model = LocallyWeightedLinearRegression(tau)
    model.fit(x_train, y_train)

    # Get MSE value on the validation set
    x_val, y_val = util.load_dataset(eval_path, add_intercept=True)
    y_pred = model.predict(x_val)
    mse = ((y_pred - y_val)**2).mean()
    print(mse)

    # Plot validation predictions on top of training set.
    # Column 0 is skipped: with add_intercept=True it is presumably the
    # constant intercept column (confirm against util.load_dataset), and
    # passing the full 2-D matrix would make matplotlib draw it as an extra
    # series.
    import matplotlib.pyplot as plt
    plt.figure()
    plt.plot(x_train[:, 1:], y_train, 'bx')
    plt.plot(x_val[:, 1:], y_pred, 'ro')
Esempio n. 22
0
def main(train_path, valid_path, save_path):
    """Problem: Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
            The plot is saved to the same path with ".txt" replaced by
            ".jpg".
    """
    # Load dataset
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # Train a GDA classifier
    model = GDA()
    model.fit(x_train, y_train)

    # Plot decision boundary on validation set.
    # The replacement keeps the dot: replacing ".txt" with "jpg" turned
    # "pred.txt" into "predjpg", a path without any extension.
    x_val, y_val = util.load_dataset(valid_path, add_intercept=False)
    util.plot(x_val,
              y_val,
              model.theta,
              save_path=save_path.replace(".txt", ".jpg"))

    # Use np.savetxt to save outputs from validation set to save_path
    yhat = model.predict(x_val)
    np.savetxt(save_path, yhat)

    print(f"GDA acc: {util.compute_accuracy(y_val, yhat)}")
    print(f"GDA log loss: {util.compute_log_loss(y_val, yhat)}")
Esempio n. 23
0
def run_trial_lms(n):
    """Run one logistic-regression trial with the n-th candidate step size.

    Trains on 'ds1_train.csv' using ``lms_fit`` and evaluates on
    'ds1_valid.csv'.

    Args:
        n: Index into the list of candidate step sizes.

    Returns:
        Tuple ``(n, it, ac)``: the index that was passed in, the value
        returned by ``lms_fit`` and the value returned by ``predict`` on the
        validation set (presumably an iteration count and an accuracy;
        confirm against LogisticRegression).
    """
    # NOTE(review): 0.05 appears twice in this list; the first entry may
    # have been meant to be a different value.
    rates = [0.05, 0.01, 0.05, 0.1, 0.5, 1]
    step_size = rates[n]

    x_train, y_train = util.load_dataset('ds1_train.csv', add_intercept=True)

    # Train a logistic regression classifier
    classifier = LogisticRegression(step_size=step_size)
    it = classifier.lms_fit(x_train, y_train)

    # Evaluate on the validation set
    x_valid, y_valid = util.load_dataset('ds1_valid.csv', add_intercept=True)
    ac = classifier.predict(x_valid, y_valid)

    return n, it, ac
Esempio n. 24
0
def main(train_path, eval_path, pred_path):
    """Problem 1(e): Gaussian discriminant analysis (GDA)

    Trains a GDA classifier and saves its predictions, then trains a
    logistic regression classifier on the same data and plots both decision
    boundaries on the evaluation set.

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions. The comparison figure is saved
            to this path with its last four characters replaced by
            "_fig.jpg".
    """
    # Train a GDA classifier
    # NOTE Drop x0 = 1 convention used in regression examples
    # Will need to account for this to write in terms of theta
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=False)
    model = GDA()
    model.fit(x_train, y_train)

    predictions = model.predict(x_eval)
    np.savetxt(pred_path, predictions)

    # Train logistic regression classifier (data reloaded with intercept)
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)
    model2 = LogisticRegression()
    model2.fit(x_train, y_train)

    # Plot decision boundary on validation set
    # Compare decision boundary with logistic
    thetas = [model.theta, model2.theta]
    fig_path = pred_path[:-4] + "_fig.jpg"
    colours = ["red", "orange"]
    # The orange boundary belongs to the logistic regression model; the
    # title used to call it "LinearReg".
    title = "LogReg (Orange) vs. GDA (Red)"
    util.plot_multiple(x_eval, y_eval, thetas, colours, fig_path, title=title)
Esempio n. 25
0
def main(train_path, valid_path, save_path):
    """Problem: Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
            The plot path is this path without its last four characters.
    """
    # Training data is loaded without an intercept column, validation data
    # with one.
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    classifier = GDA()
    classifier.fit(x_train, y_train)

    x_val, y_val = util.load_dataset(valid_path, add_intercept=True)

    # Write the validation predictions first, then draw the boundary.
    val_predictions = classifier.predict(x_val)
    np.savetxt(save_path, val_predictions)

    plot_path = save_path[:-4]
    util.plot(x_val, y_val, classifier.theta, plot_path)
Esempio n. 26
0
def main(lr, train_path, eval_path, save_path):
    """Problem: Poisson regression with gradient ascent.

    Fits the model, saves the validation predictions and writes a scatter
    plot of true versus predicted counts to 'poisson.jpg'.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        save_path: Path to save predictions.
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_valid, y_valid = util.load_dataset(eval_path, add_intercept=True)

    model = PoissonRegression(step_size=lr)
    model.fit(x_train, y_train)

    valid_predictions = model.predict(x_valid)
    np.savetxt(save_path, valid_predictions)

    # True counts on the x-axis, predictions on the y-axis, equal scaling.
    plt.scatter(y_valid, valid_predictions)
    plt.xlabel('True count')
    plt.ylabel('Predicted count')
    plt.axis('equal')
    plt.savefig('poisson.jpg')
Esempio n. 27
0
File: gda.py Progetto: Aitous/CS229
def main(train_path, valid_path, save_path):
    """Problem: Gaussian discriminant analysis (GDA)

    Trains a GDA classifier, saves its validation outputs and draws two
    plots of the validation set: one coloured by the thresholded
    predictions and one coloured by the true labels.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
            Plot paths are this path without its last four characters plus
            "validation_expected" / "validation_real".
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_val, y_val = util.load_dataset(valid_path, add_intercept=False)

    # Both sets are used as loaded. Standardising or min-max scaling them
    # here is an optional step that may improve GDA performance.

    classifier = GDA()
    classifier.fit(x_train, y_train)
    val_outputs = classifier.predict(x_val)

    np.savetxt(save_path, val_outputs)

    plot_prefix = save_path[:-4]

    # Validation points labelled by the model's own decisions (>= 0.5).
    predicted_labels = (val_outputs >= 0.5)
    util.plot(x_val, predicted_labels, classifier.theta,
              plot_prefix + "validation_expected")

    # Validation points labelled by the real classes.
    util.plot(x_val, y_val, classifier.theta,
              plot_prefix + "validation_real")
Esempio n. 28
0
def main(lr, train_path, eval_path, pred_path):
    """Problem 3(d): Poisson regression with gradient ascent.

    Fits the model, saves predictions for the evaluation set and writes a
    measured-versus-predicted scatter plot to
    ``pred_path + ".comparision.png"``.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    # Load training and evaluation sets. The evaluation set must come from
    # eval_path; it used to be loaded from train_path, so the model was
    # "evaluated" on its own training data.
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=False)

    # Fit a Poisson Regression model, starting from all-zero parameters.
    initial_theta = np.zeros(x_train.shape[1])
    # took 2008 iterations to converge using the given learning rate and default epsilon value
    poisson_reg = PoissonRegression(step_size=lr, max_iter=10000, theta_0=initial_theta, verbose=True)
    poisson_reg.fit(x_train, y_train)

    # Run on the validation set, and use np.savetxt to save outputs to pred_path
    prediction = poisson_reg.predict(x_eval)

    np.savetxt(pred_path, prediction)

    # comparing prediction and y_eval copied from:
    # https://scikit-learn.org/0.16/auto_examples/plot_cv_predict.html
    import matplotlib.pyplot as plt
    fig,ax = plt.subplots()
    ax.scatter(y_eval, prediction)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    fig.savefig(pred_path + ".comparision.png")
Esempio n. 29
0
def main(lr, train_path, eval_path, save_path, plot_path):
    """Problem: Poisson regression with gradient ascent.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        save_path: Path to save predictions.
        plot_path: Base plot path; ".png" in it is replaced by "_train.png"
            and "_eval.png" to name the two plots.
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    model = PoissonRegression(step_size=lr)
    model.fit(x_train, y_train)

    # Predictions for the evaluation set (saved) and the training set
    # (only plotted).
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)
    eval_predictions = model.predict(x_eval)
    train_predictions = model.predict(x_train)
    np.savetxt(save_path, eval_predictions)

    # One plot per split.
    train_plot = plot_path.replace(".png", "_train.png")
    eval_plot = plot_path.replace(".png", "_eval.png")
    for targets, outputs, destination in (
            (y_train, train_predictions, train_plot),
            (y_eval, eval_predictions, eval_plot)):
        util.plot_poisson(targets, outputs, destination)
Esempio n. 30
0
def main():
    """Train an SVM classifier and report its confusion matrix and accuracy.

    Each sample is flattened to one row and divided by 255.0 before
    training. Prints the training time, the test confusion matrix and the
    training and testing accuracy.
    """
    # load dataset and flatten every sample to a single row; -1 lets NumPy
    # infer the row length, so samples need not be square
    x_train, y_train = util.load_dataset(training_mode=True)
    x_train = x_train.reshape(x_train.shape[0], -1)
    x_test, y_test = util.load_dataset(training_mode=False)
    x_test = x_test.reshape(x_test.shape[0], -1)

    # scale by 255 (presumably 8-bit pixel values; confirm with the loader)
    x_train = x_train / 255.0
    x_test = x_test / 255.0

    # build SVM model
    svm = SVC(C=5, gamma=0.05)

    start_time = dt.datetime.now()
    svm.fit(x_train, y_train)
    end_time = dt.datetime.now()
    elapsed_time = end_time - start_time
    print('Elapsed learning {}'.format(str(elapsed_time)))

    # predict once on the test set and reuse the result below
    expected = y_test
    predicted = svm.predict(x_test)

    # confusion matrix
    cm = metrics.confusion_matrix(expected, predicted)
    print("Confusion matrix:\n%s" % cm)
    plot_confusion_matrix(cm)

    # get accuracy
    print("Training Accuracy={}".format(metrics.accuracy_score(y_train, svm.predict(x_train))))
    print("Testing Accuracy={}".format(metrics.accuracy_score(y_test, predicted)))
def registry(filename,nf, ptitle, kfstart=2, kfend=5, kstart=1, kend=5):
    """Run k-fold evaluation for a range of fold counts and plot the result.

    For each fold count in ``kfstart..kfend`` (inclusive) the accuracies
    returned by ``get_Allknn_acc_for_kfold`` are collected. The mean
    accuracy per fold count, the overall mean accuracy and the mean standard
    deviation are printed, and the accuracies are passed to ``plot_graph``.

    Args:
        filename: Dataset file passed to ``load_dataset``.
        nf: Passed through to ``get_Allknn_acc_for_kfold``.
        ptitle: Title passed to ``plot_graph``.
        kfstart: First fold count to evaluate.
        kfend: Last fold count to evaluate (inclusive).
        kstart: Passed through to the accuracy helper and ``plot_graph``.
        kend: Passed through to the accuracy helper and ``plot_graph``.
    """
    dataset = load_dataset(filename)
    fold_counts = range(kfstart, kfend+1)
    kf_accuracy = [
        get_Allknn_acc_for_kfold(dataset, kf, kstart, kend, nf)
        for kf in fold_counts
    ]
    kf_mean_acc = [sum(acclist)/len(acclist) for acclist in kf_accuracy]
    sd = [numpy.std(acclist) for acclist in kf_accuracy]

    # Single pre-formatted strings are printed so the function is valid and
    # produces the same text under both Python 2 and Python 3.
    for kf, fold_mean in zip(fold_counts, kf_mean_acc):
        print("%s fold validation ===> accuracy of %s" % (kf, fold_mean))

    mean_sd = sum(sd)/len(sd)
    mean_acc = sum(kf_mean_acc)/len(kf_mean_acc)
    print("Mean accuracy :  %s" % (mean_acc,))
    print("Mean S.D :  %s" % (mean_sd,))
    plot_graph(kf_accuracy, kstart, kend,sd, ptitle)
Esempio n. 32
0
PROJ_GRAD=False # Should we project the gradient onto the tangent space of the Stiefel manifold (orthogonal matrices)?
RETRACT=False # Should we do the retraction step?
THRESHOLD=0 # error threshold at which we do the retraction step
GAIN=1 # multiplicative constant (gain) used for the orthogonal matrices
RETRACT_SKIP=1 # How many batches to wait before we do a retraction
opt_mathods_set=['SGD','ADAM']
OPT_METHOD=opt_mathods_set[0]
# Collect the algorithm settings above into one dictionary.
algorithm={'ORT_INIT':ORT_INIT,'PROJ_GRAD':PROJ_GRAD,'RETRACT':RETRACT,'THRESHOLD':THRESHOLD,
'GAIN':GAIN,'RETRACT_SKIP':RETRACT_SKIP,'OPT_METHOD':OPT_METHOD}

# All settings together; also used to build the name of the results file.
params={'network':network,'training':training,'algorithm':algorithm}
DO_SAVE=True # should we save results?
save_file_name=get_file_name(params)
#%% Initialize network model
    
data, vocab, data_ranges = load_dataset(DATASET)

# define a list of parameters to orthogonalize (recurrent connectivities)
param2orthogonlize=[]      
# The number of features is the number of different letters + 1 unknown letter
FEATURES_NUM=len(vocab)+1
# Construct network

# Input layer
l_in = lasagne.layers.InputLayer(
    (BATCH_SIZE, SEQUENCE_LENGTH-1, FEATURES_NUM)) # the input is one element shorter than the sequence, since we throw away the last character (it is only predicted, in the output)
layers_to_concat = []
# All recurrent layer
for dd in range(DEPTH): 
    if ORT_INIT:
        W_in_to_hid_init=lasagne.init.Orthogonal(gain=GAIN)