def testPdToDict():
    """Print a small spambase subset before and after ``pandas_to_data``.

    Loads the normalized spambase frame, selects its first three columns,
    hands them to ``utils.train_subset`` together with the argument ``5``,
    and prints both the resulting subset and its ``hw3.pandas_to_data``
    conversion for manual inspection.
    """
    frame = hw3.load_and_normalize_spambase()
    leading_columns = frame.columns[0:3]
    sample = utils.train_subset(frame, leading_columns, 5)
    print(sample)
    converted = hw3.pandas_to_data(sample)
    print(converted)
def q7():
    """Fit a gradient-boosted regressor on the housing data and print its MSE.

    The train/test frames from ``utils.load_and_normalize_housing_set`` are
    converted with ``hw3.pandas_to_data`` and split into targets and
    features. A ``gradb.GradientBoostRegressor`` built from 100 depth-1
    regression trees is fitted on the training split; the first ten true and
    predicted test targets and the test MSE are printed.
    """
    def make_stump():
        # Factory handed to the booster so it can create a fresh weak
        # learner whenever it needs one.
        return DecisionTreeRegressor(max_depth=1)

    test_frame, train_frame = utils.load_and_normalize_housing_set()
    test_rows = hw3.pandas_to_data(test_frame)
    train_rows = hw3.pandas_to_data(train_frame)
    y, X = hw4.split_truth_from_data(train_rows)
    y_test, X_test = hw4.split_truth_from_data(test_rows)

    booster = gradb.GradientBoostRegressor(
        learning_rate=.1,
        n_estimators=100,
        max_depth=1,
        learner=make_stump,
    )
    booster.fit(X, y)
    booster.print_stats()

    yhat = booster.predict(X_test)
    print(y_test[:10])
    print(yhat[:10])
    mse = hw4.compute_mse(y_test, yhat)
    print('MSE: {}'.format(mse))
def q1():
    """Run AdaBoost on the first spambase fold and plot error and ROC curves.

    The spambase rows are partitioned into ``k = 10`` folds and only fold 0
    is used as the held-out split. After ``run_adaboost`` returns the
    boosted model, every entry of ``adaboost.snapshots`` is scored on the
    held-out split and its misclassification rate is stored in
    ``adaboost.adaboost_error_test`` under a 1-based round number. Three
    PDFs (local errors, train/test errors, ROC) are then written under the
    hard-coded ``directory``.

    NOTE(review): this file defines ``q1`` a second time further down; the
    later definition replaces this one when the module is loaded.
    NOTE(review): ``q3`` passes both label vectors through
    ``adaboost._check_y`` before comparing them; confirm whether the same
    normalisation is needed here.
    """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    all_folds = hw3.partition_folds(spamData, k)
    for fold in [0]:  # range(len(all_folds)) would run every fold
        kf_data, kf_test = dl.get_train_and_test(all_folds, fold)
        y, X = hw4.split_truth_from_data(kf_data)
        y_test, X_test = hw4.split_truth_from_data(kf_test)
        adaboost = run_adaboost(X, y, X_test, y_test, fold)
        predicted = adaboost.predict(X)
        print(roc_auc_score(y, predicted))

        # A dedicated index name keeps this loop from clobbering the fold
        # index of the enclosing loop.
        for snapshot_index in range(len(adaboost.snapshots)):
            round_number = snapshot_index + 1
            snapshot = adaboost.snapshots[snapshot_index]
            yt_pred = snapshot.predict(X_test)
            # Error rate is the share of MISMATCHED predictions; counting
            # the matches would record accuracy under an "error" key.
            mismatches = sum(1 for yp, yt in zip(yt_pred, y_test) if yp != yt)
            adaboost.adaboost_error_test[round_number] = float(mismatches) / len(y_test)

        print(predicted[:20])
        print(y[:20])
        name = 'q1'
        directory = '/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks'
        path = os.path.join(directory, name + 'hw4errors.pdf')
        tterrpath = os.path.join(directory, name + 'hw4_errors_test_train.pdf')
        print(path)
        plt.Errors([adaboost.local_errors]).plot_all_errors(path)
        plt.Errors([adaboost.adaboost_error, adaboost.adaboost_error_test]).plot_all_errors(tterrpath)
        roc = plt.ROC()
        get_tpr_fpr(adaboost, roc, X_test, y_test, 30)
        roc.plot_ROC(os.path.join(directory, name + 'hw4_roc.pdf'))
def q2_plots():
    """Sweep the decision threshold of each Naive Bayes variant and plot a ROC.

    For every model type (index 0-3) a model is trained on the first of ten
    spambase folds and asked to predict that same fold at ``num_points + 2``
    evenly spaced thresholds between 0 and 1 inclusive. Each prediction is
    printed and fed to the ROC accumulator, which is plotted to a per-model
    PDF and summarised with ``print_info``.
    """
    model_names = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    rows = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    num_points = 50
    k_folds = hw3.partition_folds(rows, k)
    threshold_steps = num_points + 1

    for model_type, model_name in enumerate(model_names):
        roc = ROC.ROC()
        print('\nModel: {}'.format(model_name))
        train_acc_sum = 0  # accumulated below but never reported
        # Only the first model type gets a non-zero alpha.
        if model_type == 0:
            alpha = .001
        else:
            alpha = 0
        for fold_index in [0]:
            nb_model = nb.NaiveBayes(model_type, alpha=alpha)
            truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(k_folds[fold_index])
            nb_model.train(data_rows, truth_rows)
            for step in range(threshold_steps + 1):
                theta = float(step) / threshold_steps
                predict = nb_model.predict(data_rows, theta)
                print(predict)
                train_acc_sum += hw3.get_accuracy(predict, truth_rows)
                roc.add_tp_tn(predict, truth_rows, theta)

        roc.plot_ROC('/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks/roc_{}.pdf'.format(model_type))
        roc.print_info()
def q2():
    """Train per-fold Naive Bayes models on spambase and test their aggregate.

    For each of the four model types, one model is trained on each of the
    first ``k - 1`` folds and scored on that same fold. The per-fold models
    are then merged (``aggregate_model`` for types 0-1, ``aggregate_model3``
    for types 2-3) and the merged model is scored on the last fold. The test
    accuracy and the mean per-fold training accuracy are reported through
    ``print_test_output``.

    NOTE(review): the merged model is always built with ``alpha=.001`` while
    the per-fold models use ``alpha=0`` for every type except 0; confirm
    that this difference is intended.
    """
    models = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    k_folds = hw3.partition_folds(spamData, k)
    for model_type in range(4):
        print('\nModel: {}'.format(models[model_type]))
        train_acc_sum = 0
        nb_models = []
        # alpha depends only on the model type, so compute it once per type.
        alpha = .001 if model_type == 0 else 0
        for ki in range(k - 1):
            nb_model = nb.NaiveBayes(model_type, alpha=alpha)
            truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(k_folds[ki])
            nb_model.train(data_rows, truth_rows)
            predict = nb_model.predict(data_rows)
            print(predict)
            accuracy = hw3.get_accuracy(predict, truth_rows)
            train_acc_sum += accuracy
            print_output(ki, accuracy)
            nb_models.append(nb_model)

        nb_combined = nb.NaiveBayes(model_type, alpha=.001)
        if model_type < 2:
            nb_combined.aggregate_model(nb_models)
        else:
            nb_combined.aggregate_model3(nb_models)

        # The last fold is held out for testing the merged model.
        truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(k_folds[k - 1])
        test_predict = nb_combined.predict(data_rows)
        test_accuracy = hw3.get_accuracy(test_predict, truth_rows)
        print_test_output(test_accuracy, float(train_acc_sum)/(k-1))
def q1():
    """Run Gaussian Discriminant Analysis (GDA) on the spambase data.

    Assignment text: use the k folds from the previous problem (1 for
    testing, k-1 for training, for each fold). With 57 real-valued features,
    each of the two Gaussians (one for the + class, one for the - class) has
    a 57-component mean vector. They either share a common 57x57 covariance
    matrix estimated from all training data (both classes), or use two
    separate 57x57 covariance matrices estimated per class. A built-in
    covariance estimator may be used, although the estimator is easy to code
    up. Looking at training and testing performance, does the Gaussian
    assumption (normally distributed data) appear to hold for this dataset?

    What this function does: for each of the first ``k - 1`` folds it fits a
    ``hw3.GDA`` model on that fold, using a covariance matrix computed from
    the fold's features and a class prior equal to the mean of the fold's
    labels. It then predicts the same fold and reports the
    ``mystats.get_error`` result through ``print_output``.

    NOTE(review): the fitted models are collected in ``gdas`` but nothing
    consumes them, and the last fold is never evaluated here.
    NOTE(review): an earlier ``q1`` in this file is replaced by this
    definition when the module is loaded.
    """
    # pandas_to_data returns an array of arrays, one per row.
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    k_folds = hw3.partition_folds(spamData, k)
    gdas = []
    for ki in range(k - 1):
        gda = hw3.GDA()
        X, truth = hw3.separate_X_and_y(k_folds[ki])
        covariance_matrix = hw3.get_covar(X)
        # Class prior taken as the mean of the labels in this fold.
        gda.p_y = float(sum(truth)) / len(truth)
        gda.train(X, covariance_matrix, truth)
        predictions = gda.predict(X)
        accuracy = mystats.get_error(predictions, truth, True)
        print_output(ki, accuracy)
        gdas.append(gda)
def testTransposeArray():
    """Print a small spambase sample and the result of transposing it.

    Takes the first three columns of the normalized spambase frame, passes
    them to ``utils.train_subset`` with the argument ``5``, converts the
    subset with ``hw3.pandas_to_data`` and prints it, then prints the output
    of ``hw3.transpose_array`` on the converted data.
    """
    frame = hw3.load_and_normalize_spambase()
    leading_columns = frame.columns[0:3]
    sample = utils.train_subset(frame, leading_columns, 5)
    untransposed = hw3.pandas_to_data(sample)
    print(untransposed)
    transposed = hw3.transpose_array(untransposed)
    print(transposed)
def q6():
    """Bagging (sample with replacement) of depth-3 trees on spambase.

    A ``bag.Bagging`` ensemble (100 rounds, sample size 1000) is fitted on
    the full spambase data. It is then evaluated on the first partition
    returned by ``hw4.partition_folds(spamData, .4)``, after passing both
    the true and predicted labels through ``_check_y``. The fraction of
    mismatching labels is printed as the final testing error.
    """
    def make_tree():
        # Factory handed to the ensemble so it can create a fresh learner
        # whenever it needs one.
        return DecisionTreeClassifier(max_depth=3)

    rows = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    y, X = hw4.split_truth_from_data(rows)
    bagged = bag.Bagging(max_rounds=100, sample_size=1000, learner=make_tree)
    bagged.fit(X, y)

    partitions = hw4.partition_folds(rows, .4)
    test_y, test_X = hw4.split_truth_from_data(partitions[0])
    test_pred = bagged.predict(test_X)
    test_y = bagged._check_y(test_y)
    test_pred = bagged._check_y(test_pred)

    mismatches = 0
    for predicted_label, true_label in zip(test_pred, test_y):
        if not (predicted_label == true_label):
            mismatches += 1
    test_error = float(mismatches) / len(test_y)
    print('Final testing error: {}'.format(test_error))
def q3():
    """Run the AdaBoost code from PB1 on spambase to perform Active Learning.

    Specifically:
    - start with a training set of about 5% of the data (selected randomly)
    - iterate M episodes: train AdaBoost for T rounds; from the datapoints
      not in the training set, select the 2% that are closest to the
      separation surface (boosting score F(x) closest to 0) and add these
      to the training set (with labels). Repeat until the size of the
      training set reaches 50% of the data.

    How is the performance improving with the training set increase? Compare
    the performance of the AdaBoost algorithm on the c% randomly selected
    training set with the c% actively-built training set for several values
    of c: 5, 10, 15, 20, 30, 50.

    Implementation notes: ``hw4.partition_folds_q4`` supplies the initial
    training set, the current test pool and a reserve pool. Each episode
    moves the first 2% of the rows named by ``adaboost.rank(X_test)`` from
    the test pool into the training set and refills the test pool from the
    reserve. The loop stops early when no rows can be moved, because the
    training set could then never reach the target size.
    """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    percent = .05
    all_folds = hw4.partition_folds_q4(spamData, percent)
    kf_train = all_folds[0]
    kf_test = all_folds[1]
    left_over = all_folds[2]

    while len(kf_train) < len(spamData)/2:
        y, X = hw4.split_truth_from_data(kf_train)
        y_test, X_test = hw4.split_truth_from_data(kf_test)
        adaboost = run_adaboost(X, y, X_test, y_test, 'q2_crx')

        yt_pred = adaboost.predict(X_test)
        order = adaboost.rank(X_test)
        yt_pred = adaboost._check_y(yt_pred)
        y_test = adaboost._check_y(y_test)
        mismatches = sum(1 for yp, yt in zip(yt_pred, y_test) if yp != yt)
        round_err = float(mismatches) / len(y_test)

        print('Error {}'.format(round_err))
        # Number of rows to move from the test pool into the training set.
        shift_number = int(len(order) * .02)
        if shift_number == 0:
            # With nothing to move the training set stops growing, so the
            # loop condition could never become false.
            print('No samples left to shift; stopping')
            break

        moved = [order[i] for i in range(shift_number)]
        kf_train.extend(kf_test[index] for index in moved)

        # Set membership keeps the filter linear in the size of the pool.
        moved_lookup = set(moved)
        new_test = [row for index, row in enumerate(kf_test)
                    if index not in moved_lookup]

        # Slicing tolerates a reserve pool shorter than the moved batch,
        # where indexing row by row would raise IndexError.
        new_test.extend(left_over[:len(moved)])
        left_over = left_over[len(moved):]
        kf_test = new_test
        print('test len {} train len {} leftover len {} shifting {}'.format(len(kf_test), len(kf_train), len(left_over), shift_number))
def spamData():
    """Return the normalized spambase data converted by ``hw3.pandas_to_data``."""
    normalized_frame = hw3.load_and_normalize_spambase()
    return hw3.pandas_to_data(normalized_frame)