def q2_plots():
    """Plot a training-set ROC curve for each Naive Bayes model variant.

    Each of the four model types is trained on fold 0 of a 10-way partition
    of the normalized spambase data and then re-scored on that same fold for
    ``num_points + 2`` evenly spaced thresholds running from 0 to 1
    inclusive.  Every prediction vector is printed and handed to the ROC
    accumulator, which is finally written to a per-model PDF and summarized.
    """
    model_names = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    rows = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    fold_count = 10
    num_points = 50
    folds = hw3.partition_folds(rows, fold_count)
    pdf_template = '/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks/roc_{}.pdf'
    for model_type, model_name in enumerate(model_names):
        roc = ROC.ROC()
        print('\nModel: {}'.format(model_name))
        train_acc_sum = 0
        # Only model type 0 (Bernoulli) is given a non-zero alpha.
        model_alpha = 0 if model_type != 0 else .001
        nb_model = nb.NaiveBayes(model_type, alpha=model_alpha)
        # Only the first fold is used for the plots.
        truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(folds[0])
        nb_model.train(data_rows, truth_rows)
        steps = num_points + 1
        for step in range(steps + 1):
            theta = float(step) / steps
            predict = nb_model.predict(data_rows, theta)
            print(predict)
            train_acc_sum += hw3.get_accuracy(predict, truth_rows)
            roc.add_tp_tn(predict, truth_rows, theta)

        roc.plot_ROC(pdf_template.format(model_type))
        roc.print_info()
def TreeTest():
    """Fit a depth-2 optimal decision tree per fold and report error rates.

    Every fold except the last is fit and scored on itself; the mean of
    those errors is reported as the training error.  The last fold
    (index ``k - 1``) is likewise fit and scored on itself and reported as
    the testing error.
    """
    def fit_and_score(fold, verbose):
        # Train a fresh tree on the fold and return (fold error, fold size).
        truth, f_data = decTree.split_truth_from_data(fold)
        tree = decTree.TreeOptimal(max_depth=2)
        tree.fit(f_data, truth)
        if verbose:
            print('Prediction...\n')
        predict = tree.predict(f_data)
        if verbose:
            print(predict)
            print(truth)
        return 1. - hw3.get_accuracy(predict, truth)

    k = 10
    folds = hw3.partition_folds(spamData(), k)
    fold_sizes = []
    fold_errors = []
    for idx in range(len(folds) - 1):
        current = folds[idx]
        fold_sizes.append(len(current))
        fold_error = fit_and_score(current, True)
        fold_errors.append(fold_error)
        print('Tree error is: {}'.format(fold_error))

    error = fit_and_score(folds[k - 1], False)

    average_training_error = float(sum(fold_errors)) / len(fold_sizes)
    print('Average training error: {}\nAverage testing error: {}'.format(average_training_error, error))
def testPdToDict():
    """Print a small spambase subset before and after ``pandas_to_data``.

    The subset is built from the first three columns of the normalized
    frame via ``utils.train_subset(df, cols, 5)``.
    """
    frame = hw3.load_and_normalize_spambase()
    first_cols = frame.columns[0:3]
    sample = utils.train_subset(frame, first_cols, 5)
    print(sample)
    converted = hw3.pandas_to_data(sample)
    print(converted)
 def model_bin_train(self, data_row, truth, num_bins=2):
     """Train the binned Naive Bayes model and return per-label bin tables.

     For every feature column a list of bin cutoffs is derived from that
     column's own minimum, maximum, mean, and the means of the values below
     and above the mean.  The cutoffs are stored on ``self.cutoffs`` and the
     fraction of positive labels on ``self.y_prob``.

     Args:
         data_row: feature data, one list per row.
         truth: labels aligned with ``data_row`` (summed to get the
             positive-class rate, so presumably 0/1 -- verify with caller).
         num_bins: ``4`` selects the four-cutoff layout; any other value
             (including the default ``2``) selects the six-cutoff layout.

     Returns:
         dict mapping each label (0 and 1) to the result of
         ``hw3.bins_per_column_by_col`` for the rows carrying that label.
     """
     #TODO add epsilon
     data_col = hw3.transpose_array(data_row)
     cutoffsc = [[] for _ in range(len(data_row[0]))]
     for j, column in enumerate(data_col):
         values = np.asarray(column)
         # Use this column's own extremes; min()/max() over the list of all
         # columns would pick a value belonging to an unrelated column.
         col_min = min(column)
         col_max = max(column)
         mu = values.mean()
         low_mu = values[values < mu].mean()
         high_mu = values[values > mu].mean()
         if num_bins == 4:
             cutoffsc[j] = [col_min, low_mu, mu, high_mu]
         else:
             # NOTE(review): the halved differences below are half-widths,
             # not midpoints, so this list is not guaranteed to be sorted --
             # confirm against what hw3.classify_x expects.
             cutoffsc[j] = [col_min, (low_mu - col_min)/2, mu, (high_mu-mu)/2, high_mu, (col_max-high_mu)/2]
     model = {}
     for label in [0, 1]:
         # transpose to go by column
         sub_data = hw3.transpose_array(hw3.get_sub_at_value(data_row, truth, label))
         # probability of bin given label
         model[label] = hw3.bins_per_column_by_col(sub_data, cutoffsc)
     self.y_prob = float(sum(truth))/len(truth)
     self.cutoffs = cutoffsc
     return model
def q1():
    """GDA: run Gaussian Discriminant Analysis on the spambase k-folds.

    Assignment text: run GDA on the spambase data using the k-folds from the
    previous problem.  With 57 real-valued features each class Gaussian has a
    57-component mean vector and either a shared 57x57 covariance matrix
    (estimated from all training data) or one covariance matrix per class.
    Looking at training and testing performance, does the Gaussian
    assumption appear to hold for this dataset?

    This implementation fits one GDA model per fold for the first ``k - 1``
    folds, using a single covariance matrix estimated from all rows of the
    fold, scores each model on the fold it was trained on, and prints the
    per-fold result.
    """
    # pandas_to_data returns an array of arrays, organized by row.
    rows = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    folds = hw3.partition_folds(rows, k)
    fitted_models = []
    for fold_idx in range(k - 1):
        model = hw3.GDA()
        features, labels = hw3.separate_X_and_y(folds[fold_idx])
        shared_covariance = hw3.get_covar(features)
        model.p_y = float(sum(labels)) / len(labels)
        model.train(features, shared_covariance, labels)
        fold_predictions = model.predict(features)
        score = mystats.get_error(fold_predictions, labels, True)
        print_output(fold_idx, score)
        fitted_models.append(model)
def q1():
    """Run AdaBoost on spambase (fold 0 as test) and plot errors and ROC.

    For the selected fold the booster is trained through ``run_adaboost``,
    the training AUC is printed, and each per-round snapshot is evaluated on
    the held-out fold to fill ``adaboost.adaboost_error_test``.  Three PDFs
    are written: local round errors, train-vs-test error, and the ROC curve.

    The per-round test value is the misclassification rate (fraction of
    predictions that differ from the truth), matching the name of the
    attribute it is stored in and the training-error curve it is plotted
    against.
    """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    all_folds = hw3.partition_folds(spamData, k)
    name = 'q1'
    directory = '/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks'
    for fold in [0]:  # range(len(all_folds)) would evaluate every fold
        kf_data, kf_test = dl.get_train_and_test(all_folds, fold)
        y, X = hw4.split_truth_from_data(kf_data)
        y_test, X_test = hw4.split_truth_from_data(kf_test)
        adaboost = run_adaboost(X, y, X_test, y_test, fold)
        predicted = adaboost.predict(X)
        print(roc_auc_score(y, predicted))
        # A dedicated loop variable keeps the fold index intact for the
        # remainder of the outer iteration.
        for round_number, snapshot in enumerate(adaboost.snapshots, 1):
            yt_pred = snapshot.predict(X_test)
            mistakes = sum(1 for yp, yt in zip(yt_pred, y_test) if yp != yt)
            adaboost.adaboost_error_test[round_number] = float(mistakes) / len(y_test)
        print(predicted[:20])
        print(y[:20])
        path = os.path.join(directory, name + 'hw4errors.pdf')
        tterrpath = os.path.join(directory, name + 'hw4_errors_test_train.pdf')
        print(path)
        plt.Errors([adaboost.local_errors]).plot_all_errors(path)
        plt.Errors([adaboost.adaboost_error, adaboost.adaboost_error_test]).plot_all_errors(tterrpath)
        roc = plt.ROC()
        get_tpr_fpr(adaboost, roc, X_test, y_test, 30)
        roc.plot_ROC(os.path.join(directory, name + 'hw4_roc.pdf'))
def testTransposeArray():
    """Print a small spambase subset as row data and then transposed.

    The subset comes from the first three columns of the normalized frame
    via ``utils.train_subset(frame, cols, 5)``.
    """
    frame = hw3.load_and_normalize_spambase()
    sample = utils.train_subset(frame, frame.columns[0:3], 5)
    by_row = hw3.pandas_to_data(sample)
    print(by_row)
    by_col = hw3.transpose_array(by_row)
    print(by_col)
 def model_gaussian_rand_var_train(self, data, truth):
     """Estimate per-label column means and standard deviations.

     Stores the fraction of positive labels on ``self.y_prob`` and returns
     ``[mus, std_dev, y_prob]`` where ``mus`` and ``std_dev`` are dicts keyed
     by label (0 and 1).
     """
     mus = {}
     std_dev = {}
     for label in (0, 1):
         rows_for_label = hw3.get_sub_at_value(data, truth, label)
         mus[label] = hw3.get_mus(rows_for_label)
         std_dev[label] = hw3.get_std_dev(rows_for_label)
     positive_rate = float(sum(truth))/len(truth)
     self.y_prob = positive_rate
     return [mus, std_dev, positive_rate]
def q6():
    """Bagging - sample with replacement.

    Fits a bagged ensemble of depth-3 decision trees (100 rounds, sample
    size 1000) on the full spambase data, then reports the error on the
    first element returned by ``hw4.partition_folds(spamData, .4)``.
    """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    y, X = hw4.split_truth_from_data(spamData)

    def make_tree():
        return DecisionTreeClassifier(max_depth=3)

    bagged = bag.Bagging(max_rounds=100, sample_size=1000, learner=make_tree)
    bagged.fit(X, y)

    held_out = hw4.partition_folds(spamData, .4)[0]
    test_y, test_X = hw4.split_truth_from_data(held_out)
    test_pred = bagged.predict(test_X)
    # Pass both label vectors through the ensemble's own label check before
    # comparing them.
    test_y = bagged._check_y(test_y)
    test_pred = bagged._check_y(test_pred)
    wrong = sum([1 if py != ty else 0 for py, ty in zip(test_pred, test_y)])
    test_error = float(wrong)/len(test_y)
    print('Final testing error: {}'.format(test_error))
def q2():
    """Train and evaluate the four Naive Bayes variants on spambase.

    For each model type one classifier is trained (and scored on its own
    data) per fold for the first ``k - 1`` folds.  The per-fold models are
    then merged into a combined model, which is scored on the last fold.
    Model types 0 and 1 are merged with ``aggregate_model``; the binned
    types use ``aggregate_model3``.
    """
    model_names = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    folds = hw3.partition_folds(spamData, k)
    for model_type, model_name in enumerate(model_names):
        print('\nModel: {}'.format(model_name))
        train_acc_sum = 0
        fold_models = []
        # Only model type 0 (Bernoulli) trains with a non-zero alpha.
        fold_alpha = 0 if model_type != 0 else .001
        for fold_idx in range(k - 1):
            fold_model = nb.NaiveBayes(model_type, alpha=fold_alpha)
            truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(folds[fold_idx])
            fold_model.train(data_rows, truth_rows)
            predict = fold_model.predict(data_rows)
            print(predict)
            accuracy = hw3.get_accuracy(predict, truth_rows)
            train_acc_sum += accuracy
            print_output(fold_idx, accuracy)
            fold_models.append(fold_model)

        nb_combined = nb.NaiveBayes(model_type, alpha=.001)
        if model_type >= 2:
            nb_combined.aggregate_model3(fold_models)
        else:
            nb_combined.aggregate_model(fold_models)

        # The last fold is the held-out test set.
        truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(folds[k - 1])
        test_predict = nb_combined.predict(data_rows)
        test_accuracy = hw3.get_accuracy(test_predict, truth_rows)
        print_test_output(test_accuracy, float(train_acc_sum)/(k-1))

    # The result of this final call is not used afterwards.
    truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(spamData)
def q7():
    """Fit a gradient-boosted regressor of depth-1 trees on the housing set.

    Trains 100 estimators with learning rate 0.1 on the training split,
    prints the model stats, the first ten true and predicted test targets,
    and the test MSE.
    """
    h_test, h_train = utils.load_and_normalize_housing_set()
    test_rows = hw3.pandas_to_data(h_test)
    train_rows = hw3.pandas_to_data(h_train)
    y, X = hw4.split_truth_from_data(train_rows)
    y_test, X_test = hw4.split_truth_from_data(test_rows)

    def make_stump():
        return DecisionTreeRegressor(max_depth=1)

    gb = gradb.GradientBoostRegressor(
        learning_rate=.1,
        n_estimators=100,
        max_depth=1,
        learner=make_stump,
    )
    gb.fit(X, y)
    gb.print_stats()
    yhat = gb.predict(X_test)
    print(y_test[:10])
    print(yhat[:10])
    mse = hw4.compute_mse(y_test, yhat)
    print('MSE: {}'.format(mse))
def q3():  # Got points off b/c I have 89 accuracy instead of 92
    """Logistic Regression on the polluted spam data, 10-fold cross-validated.

    For each fold a ``LogisticRegression`` and the HW2 ``Ridge`` model are
    fit on the remaining folds and scored on both the training data and the
    held-out fold.  Predictions are thresholded at 0.5 before scoring.
    Per-fold accuracies are printed, followed by the mean of all four
    accuracy series.
    """
    def to_labels(scores):
        # Threshold raw model outputs into 0/1 class labels.
        return [1 if p >= .5 else 0 for p in scores]

    data = utils.load_and_normalize_polluted_spam_data()
    k = 10
    k_folds = hw3u.partition_folds(data, k)
    train_acc = []
    test_acc = []
    hw2_train_acc = []
    hw2_test_acc = []
    for ki in range(k):
        grouped_fold = hw5u.group_fold(k_folds, ki)
        y, X = utils.split_truth_from_data(grouped_fold)
        y_truth, X_test = utils.split_truth_from_data(k_folds[ki])
        clf = lm.LogisticRegression()
        ridge_clf = hw5u.Ridge()
        clf.fit(X, y)
        ridge_clf.fit(X, y)

        y_train = to_labels(clf.predict(X))
        y_test = to_labels(clf.predict(X_test))
        yhat_ridge_train = to_labels(ridge_clf.predict(X))
        yhat_ridge_test = to_labels(ridge_clf.predict(X_test))
        train_acc.append(accuracy_score(y, y_train))
        test_acc.append(accuracy_score(y_truth, y_test))
        hw2_train_acc.append(accuracy_score(y, yhat_ridge_train))
        hw2_test_acc.append(accuracy_score(y_truth, yhat_ridge_test))
        print('Fold {} train acc: {} test acc: {} HW2 ridge train: {}  HW2 ridge test: {}'.format(ki+1, train_acc[-1], test_acc[-1], hw2_train_acc[-1], hw2_test_acc[-1]))
    # One placeholder per series: str.format silently ignores surplus
    # positional arguments, so a missing placeholder hides a value.
    print('Average acc - Train: {}  Test: {}  HW2 ridge train: {}  HW2 ridge test: {}'.format(np.mean(train_acc), np.mean(test_acc), np.mean(hw2_train_acc), np.mean(hw2_test_acc)))
def SpamClassifier(features, skclassifier, myclassifier):
    """Compare a custom KNN classifier against a scikit one on spam data.

    Fold 0 of a 10-fold partition is the test set and the rest is training
    data.  Both classifiers are wrapped in ``hw7u.KNN`` and their test
    accuracies printed.

    Args:
        features: ``'all'`` or a collection of column indices.
        skclassifier: scikit classifier passed to ``hw7u.KNN``.
        myclassifier: custom classifier passed to ``hw7u.KNN``.
    """
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    if features != 'all':
        # Only use the features passed in the features array
        # NOTE(review): the selected columns are collected but never used;
        # ``data`` is rebuilt from the full column set, so this filter has no
        # effect on the data.  Fixing it needs the label column position
        # (it must survive the filter) -- confirm before changing.
        selected = []
        columns = utils.transpose_array(data)
        for col_idx in xrange(len(columns)):
            if col_idx in features:
                selected.append(columns[col_idx])
            data = utils.transpose_array(columns)
    folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    print('start MyKNN')
    knn = hw7u.KNN(classifier=myclassifier)
    print('start scikit')
    knnsci = hw7u.KNN(classifier=skclassifier)

    print('start my pred')
    y_pred = knn.predict(X_test, X, y)
    my_accuracy = accuracy_score(fix_y(y_test), fix_y(y_pred))
    print('My Accuracy: {}'.format(my_accuracy))

    print('start sk pred')
    y_sci = knnsci.predict(X_test, X, y)
    sk_accuracy = accuracy_score(fix_y(y_test), fix_y(y_sci))
    my_accuracy = accuracy_score(fix_y(y_test), fix_y(y_pred))
    print('SciKit Accuracy: {}  My Accuracy: {}'.format(sk_accuracy, my_accuracy))
def runDigits(n, skclf, myclf):
    """Compare a custom classifier with a scikit one on MNIST features.

    Loads the MNIST feature set of size ``n``, uses fold 0 of a 10-fold
    partition as the test set, fits ``myclf`` wrapped in
    ``OneVsRestClassifier`` and ``skclf`` directly, and prints predictions
    and accuracies for both.
    """
    frame = hw6u.load_mnist_features(n)
    data = utils.pandas_to_data(frame)
    k = 10
    folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(folds, 0)

    # Labels are kept as-is (replace_zeros=False).
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y = np.asarray(y, dtype=np.float)
    X = np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test = np.asarray(y_test)
    X_test = np.asarray(X_test, dtype=np.float)

    print('my fit')
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print('scikit fit')
    skclf = skclf.fit(X, y)

    print('my predict')
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print('({})'.format(myacc))

    print('scikit predict')
    sk_pred = skclf.predict(X_test)
    print(sk_pred)
    print(y_test)
    print(y_pred)
    sk_acc = accuracy_score(y_test, sk_pred)
    print('SciKit Accuracy: {}  My Accuracy: {}'.format(sk_acc, myacc))
def runDigitsDensity(n,_i, j):
    """Evaluate the density variant of ``MyKNN`` on MNIST features.

    Args:
        n: size passed to ``hw6u.load_mnist_features``.
        _i: unused.
        j: index into the metric list
            ``['minkowski', 'cosine', 'gaussian', 'poly2']``.

    Fold 0 of a 10-fold partition is the test set.  The classifier is
    wrapped in ``OneVsRestClassifier``; test labels, predictions and accuracy
    are printed.
    """
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    chosen_metric = metric[j]
    # The kernel callable is built but not passed to any classifier here.
    ma = hw7u.Kernel(ktype=chosen_metric+'_sci').compute
    myclf = hw7u.MyKNN(metric=chosen_metric, density=True)

    frame = hw6u.load_mnist_features(n)
    data = utils.pandas_to_data(frame)
    k = 10
    folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(folds, 0)

    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y = np.asarray(y, dtype=np.float)
    X = np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test = np.asarray(y_test)
    X_test = np.asarray(X_test, dtype=np.float)

    print('my fit')
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print('scikit fit')
    print('my predict')
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print('({})'.format(myacc))
    print(y_test)
    print(y_pred)
    print('My Accuracy: {}'.format(myacc))
def tests_radius():
    """Compare radius-based ``MyKNN`` with scikit's RadiusNeighborsClassifier.

    Uses the data from ``testData()``, the first radius (``1e-1``) and the
    first metric (``'minkowski'``).  Fold 0 of a 10-fold partition is the
    test set.  The custom predictions and both accuracies are printed.
    """
    i = 0
    j = 0
    k = 10
    features, labels = testData()
    # Append the labels as a trailing column, then convert every row to a
    # plain list before partitioning.
    joined = np.concatenate([features, labels.reshape((len(labels), 1))], axis=1)
    rows = [list(row.ravel()) for row in joined]

    radius = [1e-1,1e-2,1e-3]  # for radius
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    # The kernel callable is built but not passed to any classifier here.
    ma = speedy.Kernel(ktype=metric[j]).compute
    print('spam radius is {}'.format(radius[i]))
    clf = hw7u.MyKNN(radius=radius[i], metric=metric[j], outlier_label=-1)
    skclf = RadiusNeighborsClassifier(radius=radius[i], algorithm='brute', metric="euclidean", p=2, outlier_label=.5)

    folds = hw3u.partition_folds(rows, k)
    kf_train, kf_test = dl.get_train_and_test(folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    print('start scikit')
    knnsci = hw7u.KNN(classifier=skclf)
    print('start MyKNN')
    knn = hw7u.KNN(classifier=clf)

    print('start sk pred')
    y_sci = knnsci.predict(X_test, X, y)
    print('start my pred')
    y_pred = knn.predict(X_test, X, y)
    print(y_pred)

    sk_acc = accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_sci))
    my_acc = accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_pred))
    print('SciKit Accuracy: {}  My Accuracy: {}'.format(sk_acc, my_acc))
def q1():
    """Feature analysis with AdaBoost on the polluted spam data.

    Fits a 100-round AdaBoost of depth-1 decision trees (``splitter='best'``)
    on the training part of a 10-fold split (fold 0 held out), ranks the
    features by the margin fractions computed from the fitted ensemble, and
    prints the ranking followed by the held-out accuracy.
    """
    spamData = utils.load_and_normalize_polluted_spam_data()
    k = 10
    folds = hw3u.partition_folds(spamData, k)
    kf_train, kf_test = dl.get_train_and_test(folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    # We're not actually cross-validating anything -- we just want feature
    # weights.
    def make_stump():
        return DecisionTreeClassifier(max_depth=1, splitter='best')

    adaboost = adac.AdaboostOptimal(max_rounds=100, do_fast=False, learner=make_stump)
    adaboost.fit(X, y)

    # The first training row is passed alongside the fitted model.
    margin_fractions = get_margin_fractions(adaboost, X[0])
    print_ranks(rank(margin_fractions))

    pred = adaboost.predict(X_test)
    truth_checked = adaboost._check_y_not_zero(y_test)
    pred_checked = adaboost._check_y_not_zero(pred)
    print('Accuracy: {}'.format(accuracy_score(truth_checked, pred_checked)))
def test_NaiveBayes():
    """Train a type-2 NaiveBayes on the sample data and return it.

    Prints the sample data before training and the fitted model afterwards.
    """
    sample = get_nb_data()
    print(sample)
    truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(sample)
    classifier = nb.NaiveBayes(2)
    classifier.train(data_rows, truth_rows)
    print(classifier.model)
    return classifier
def q3():
    """Run your code from PB1 on Spambase dataset to perform Active Learning.
    Specifically:
    - start with a training set of about 5% of the data (selected randomly)
    - iterate M episodes: train the Adaboost for T rounds; from the datapoints
      not in the training set, select the 2% ones that are closest to the
      separation surface (boosting score F(x) closest to 0) and add these to
      the training set (with labels). Repeat until the size of the training
      set reaches 50% of the data.

    How is the performance improving with the training set increase? Compare the
    performance of the Adaboost algorithm on the c% randomly selected training set
    with c% actively-built training set for several values of c : 5, 10, 15, 20,
    30, 50.

    Implementation notes: each episode moves the first 2% of the rows in
    ``adaboost.rank(X_test)`` order from the test pool into the training set
    and refills the test pool from the left-over rows, taking as many as are
    still available.  The loop also stops when an episode would move no rows,
    since the training set could then never reach the 50% target.
    """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    percent = .05
    all_folds = hw4.partition_folds_q4(spamData, percent)
    kf_train = all_folds[0]
    kf_test = all_folds[1]
    left_over = all_folds[2]

    while len(kf_train) < len(spamData)/2:
        y, X = hw4.split_truth_from_data(kf_train)
        y_test, X_test = hw4.split_truth_from_data(kf_test)
        adaboost = run_adaboost(X, y, X_test, y_test, 'q2_crx')

        yt_pred = adaboost.predict(X_test)
        order = adaboost.rank(X_test)
        yt_pred = adaboost._check_y(yt_pred)
        y_test = adaboost._check_y(y_test)
        round_err = float(np.sum([1 if yt!=yp else 0 for yt, yp in zip(yt_pred, y_test)]))/len(y_test)

        print('Error {}'.format(round_err))
        shift_number = int(len(order) * .02)  # number of items to switch into training set
        if shift_number == 0:
            # Nothing would move, so the training set would stop growing and
            # the loop condition could never become false.
            print('test pool too small to shift any rows; stopping')
            break

        moved = [order[pos] for pos in xrange(shift_number)]
        kf_train.extend(kf_test[idx] for idx in moved)

        # Set membership keeps the filter linear in the size of the test pool.
        moved_lookup = set(moved)
        new_test = [row for pos, row in enumerate(kf_test) if pos not in moved_lookup]

        # Slicing tolerates a reserve pool smaller than the batch just moved.
        new_test.extend(left_over[:len(moved)])
        left_over = left_over[len(moved):]
        kf_test = new_test
        print('test len {} train len {} leftover len {} shifting {}'.format(len(kf_test), len(kf_train), len(left_over), shift_number))
def test_NaiveBayes_predict():
    """Train a type-2 NaiveBayes on sample data and print its predictions.

    Prints the training sample, the column means returned by
    ``hw3.get_data_and_mus``, the fitted model, and the predictions for the
    data from ``get_nb_test_data(5)``.
    """
    classifier = nb.NaiveBayes(2)
    sample = get_nb_data()
    held_out = get_nb_test_data(5)
    print(sample)
    truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(sample)
    classifier.train(data_rows, truth_rows)
    print(data_mus)
    print(classifier.model)
    predictions = classifier.predict(held_out)
    print(predictions)
def svm_q1(data, classifier=None):
    """Fit a classifier on a 10-fold split (fold 0 held out) and print accuracy.

    Args:
        data: row data accepted by ``hw3u.partition_folds``.
        classifier: estimator exposing ``fit``/``predict``.  When omitted a
            fresh ``svm.SVC()`` is created for this call, so separate calls
            never share (and refit) one default instance.
    """
    if classifier is None:
        classifier = svm.SVC()
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    print('length train: {} length test {}'.format(len(X), len(X_test)))
    clf = classifier
    clf.fit(X, y)
    # Predict each split once and reuse the result when scoring.
    y_train_pred = clf.predict(X)
    y_pred = clf.predict(X_test)
    train_acc = accuracy_score(fix_y(y), fix_y(y_train_pred))
    test_acc = accuracy_score(fix_y(y_test), fix_y(y_pred))
    print('train acc: {} test acc: {}'.format(train_acc, test_acc))
 def model_average_predict(self, data_row, theta=.5):
     """Predict a 0/1 label per row with the above/below-mean model.

     For each row the class likelihoods are the product, over columns, of
     the trained probability that the value is above (or not above) the
     column mean given the class, multiplied by the class prior:

         P(Y|X) is proportional to P(X|Y) * P(Y)

     The two scores are normalized and a row is labelled 1 when the
     normalized score for class 1 exceeds ``theta``.

     Reads ``self.model`` as
     ``[prob_over_given_1, prob_over_given_0, prob_over, prob_y1]``.

     NOTE(review): the column means are recomputed from ``data_row`` (the
     rows being classified), not taken from training -- confirm intended.

     Args:
         data_row: rows to classify, one list of feature values per row.
         theta: decision threshold on the normalized class-1 score.

     Returns:
         list of 0/1 predictions, one per row.
     """
     mus = hw3.get_mus(data_row)
     prob_over_given_1 = self.model[0]
     prob_over_given_0 = self.model[1]
     prob_y1 = self.model[3]
     predict = []
     for row in data_row:
         prob_1 = 1
         prob_0 = 1
         for c, value in enumerate(row):
             if value > mus[c]:
                 prob_1 = prob_1 * prob_over_given_1[c]
                 prob_0 = prob_0 * prob_over_given_0[c]
             else:
                 prob_1 = prob_1 * (1 - prob_over_given_1[c])
                 prob_0 = prob_0 * (1 - prob_over_given_0[c])
         prob_1 = prob_1 * prob_y1  # P(X|Y=1) * P(Y=1)
         prob_0 = prob_0 * (1 - prob_y1)
         total = prob_0 + prob_1
         if total != 0:
             prob_norm = float(prob_1) / total
         else:
             # Both likelihoods vanished (a zero factor in each product), so
             # the features carry no evidence; fall back to the class prior
             # instead of dividing by zero.
             prob_norm = prob_y1
         predict.append(1 if prob_norm > theta else 0)
     return predict
def GaussianNB(X, num_features=None):
    """Cross-validate a Naive Bayes classifier over 10 folds.

    Despite the function name, the model fitted in every fold is
    ``BernoulliNB()``.  When ``num_features`` is given, the data is first
    reduced to that many columns with ``SelectKBest`` and the truth values
    are re-attached with ``utils.add_row``.

    For each fold the model is trained on the other folds grouped together
    and scored on both that training data and the held-out fold; per-fold
    and averaged accuracies are printed.
    """
    model_type = 1
    k = 10
    train_acc_sum = 0
    test_acc_sum = 0
    if num_features is not None:
        y, X = utils.split_truth_from_data(X)
        selector = SelectKBest(k=num_features).fit(X, y)
        X = utils.add_row(selector.transform(X), y)
    folds = hw3u.partition_folds(X, k)
    for fold_idx in range(k):
        training_rows = hw5u.group_fold(folds, fold_idx)
        # The column check is still run although its result is not used.
        mask_cols = check_cols(training_rows)
        nb_model = BernoulliNB()
        print('len of kfolds {}'.format(len(training_rows)))
        truth_rows, data_rows = utils.split_truth_from_data(training_rows)
        print('len of data {}'.format(len(data_rows)))
        nb_model.fit(data_rows, truth_rows)
        train_accuracy = hw3u.get_accuracy(nb_model.predict(data_rows), truth_rows)
        train_acc_sum += train_accuracy
        print_output(fold_idx, train_accuracy)

        # Score the same model on the held-out fold.
        truth_rows, data_rows = utils.split_truth_from_data(folds[fold_idx])
        test_accuracy = hw3u.get_accuracy(nb_model.predict(data_rows), truth_rows)
        test_acc_sum += test_accuracy
        print_output(fold_idx, test_accuracy, 'test')

    print_test_output(float(train_acc_sum)/k, float(test_acc_sum)/k)
    def model_average_train(self, data_row, truth):
        """Train the above/below-column-mean model.

        Column means are computed over all rows; for every column the
        fraction of values above that mean is then measured over all rows,
        over the rows labelled 1, and over the rows labelled 0.

        If the two per-class vectors differ in length (for example when one
        class has no rows), the shorter one is padded with zeros so both are
        indexable by the same column positions.

        Also stores the fraction of positive labels on ``self.y_prob``.

        Returns:
            ``[prob_over_given_1, prob_over_given_0, prob_over, prob_y1]``
        """
        mus = hw3.get_mus(data_row)
        is_not_spam = hw3.get_sub_at_value(data_row, truth, 0)
        is_spam = hw3.get_sub_at_value(data_row, truth, 1)
        prob_over = get_prob_over(data_row, mus)
        prob_over_given_1 = list(get_prob_over(is_spam, mus))
        prob_over_given_0 = list(get_prob_over(is_not_spam, mus))

        # Pad rather than replace: existing probabilities in the shorter
        # vector are kept and only the missing tail is filled with zeros.
        width = max(len(prob_over_given_1), len(prob_over_given_0))
        prob_over_given_1 += [0] * (width - len(prob_over_given_1))
        prob_over_given_0 += [0] * (width - len(prob_over_given_0))

        prob_y1 = float(sum(truth))/len(truth)
        self.y_prob = prob_y1

        return [prob_over_given_1, prob_over_given_0, prob_over, prob_y1]
Example #25
0
    def initialize(self, data, k=2):
        """Set up ``k`` mixture components and store them on ``self.models``.

        Component means come from ``mu_cheat``; labels are then assigned with
        ``self.assign_labels`` and ``self.prevent_empty`` is invoked.  Every
        component receives the covariance of the full data set, a fixed
        weight of 0.5, and a likelihood from ``self.expectation``.
        """
        # start with k = 2 and std_dev = 1
        self.k = k
        self.labels = list(range(self.k))
        components = [EMModel() for _ in range(self.k)]

        seeded_mus = mu_cheat(hw3.transpose_array(data), k)
        for idx, component in enumerate(components):
            component.mu = seeded_mus[idx]

        self.labels = self.assign_labels(data, components)

        self.prevent_empty(data)

        for idx, component in enumerate(components):
            # The per-label subset is still computed, but covariance and
            # weight below use the full data / a constant instead.
            sub_data = hw3.get_sub_at_value(data, self.labels, idx)
            component.sigma = hw3.get_covar(data)
            component.weight = .5
            component.likelihood = self.expectation(data, component)  # multivarate_normal
        self.models = components
    def model_gaussian_rand_var_predict(self, data, theta=.5):
        """Predict with the per-column Gaussian model.

        ``self.model`` is ``[mus_by_label, std_devs_by_label, prob_y]``.  For
        each label the per-row probabilities come from
        ``hw3.univariate_normal``; a label with no standard deviations gets a
        probability of 0 for every row.  The final decision is delegated to
        ``self.nb_predict``.
        """
        mus = self.model[0]
        std_devs = self.model[1]
        y_prob = self.model[2]
        probabilities = {}
        for label in (0, 1):
            if len(std_devs[label]) != 0:
                prior = (1-y_prob) if label != 1 else y_prob
                probabilities[label] = hw3.univariate_normal(data, std_devs[label], mus[label], prior, .15, ignore_cols=self.ignore_cols)
            else:
                # No statistics were learned for this label.
                probabilities[label] = [0] * len(data)

        return self.nb_predict(probabilities, theta)
def get_prob_over(data_by_row, mus):
    """Return, per column, the fraction of rows whose value exceeds the mean.

    Args:
        data_by_row: data organized as a list of rows.
        mus: one reference mean per column; ``mus[i]`` is compared against
            the values of column ``i``.

    Returns:
        A flat list with one float per column:
        ``count(value > mus[i]) / number_of_rows``.
    """
    size = len(data_by_row)
    by_col = hw3.transpose_array(data_by_row)
    probability_above_mu = []
    for col, column in enumerate(by_col):
        mu_col = mus[col]
        total_over = sum(1 for value in column if value > mu_col)
        probability_above_mu.append(float(total_over)/size)
    return probability_above_mu
def multiclassSVC(classifier, sz=2000):
    """Fit ``classifier`` one-vs-one on MNIST features and print accuracy.

    Loads the MNIST feature set of size ``sz``, holds out fold 0 of a
    10-fold partition, wraps ``classifier`` in ``OneVsOneClassifier`` and
    prints the train/test accuracy line twice.
    """
    frame = hw6u.load_mnist_features(sz)
    data = utils.pandas_to_data(frame)
    k = 10
    folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(folds, 0)

    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y = np.asarray(y)
    X = np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test = np.asarray(y_test)
    X_test = np.asarray(X_test)

    print('Beginning analysis: {}'.format(X.shape))
    clf = OneVsOneClassifier(classifier).fit(X, y)

    # First report: training predictions are computed up front.
    y_pred = clf.predict(X)
    train_acc = accuracy_score(fix_y(y_pred), fix_y(y))
    test_acc = accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test)))
    print('train acc: {} test acc: {}'.format(train_acc, test_acc))

    # Second report: both splits are predicted again from scratch.
    train_acc = accuracy_score(fix_y(clf.predict(X)), fix_y(y))
    test_acc = accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test)))
    print('train acc: {} test acc: {}'.format(train_acc, test_acc))
 def run(self, data, weights):
     """Run one round: fit per training fold, then evaluate on the last fold.

     Each of the first ``number_k_folds - 1`` folds is fit and scored on
     itself; its plain and weighted training errors are appended to
     ``self.training_errors`` and ``self.training_errors_weighted``.  The
     model from the final training fold is then applied to the last fold to
     set ``self.err_matrix``, ``self.testing_error`` and
     ``self.testing_errors_weighted``, after which the weight distribution
     and alpha are updated.

     Note: ``model`` is only bound inside the training loop, so at least
     two folds are required.
     """
     folds = hw3.partition_folds(data, self.number_k_folds)
     test_index = self.number_k_folds - 1
     for fold_idx in xrange(test_index):
         truth, f_data = split_truth_from_data(folds[fold_idx])
         model = self.fit(f_data)
         predicted = self.predict(model, f_data)  # {-1, 1}
         fold_errors = self.compute_error_matrix(truth, predicted)
         self.training_errors.append(self.get_error(fold_errors))
         weighted = self.weight_errors(fold_errors, weights)
         self.training_errors_weighted.append(sum(weighted))

     truth, f_data = utils.split_truth_from_data(folds[test_index])
     predicted = self.predict(model, f_data)
     # Error matrix for round computed from test data
     self.err_matrix = self.compute_error_matrix(truth, predicted)
     self.testing_error = self.get_error(self.err_matrix)
     self.testing_errors_weighted = self.weight_errors(self.err_matrix, weights)
     self.set_weight_distribution_and_total()  # Dt(x) and epsilon
     self.set_alpha()
    def model_bin_predict(self, data_row, alpha=2.00001, theta=.5):
        """Predict with the binned Naive Bayes model.

        For every row and label, the score is the product over columns of
        the trained probability of the value's bin plus ``alpha`` divided by
        the number of rows, multiplied by the label prior taken from
        ``self.y_prob``.

        scores[0] = [row_0_score, row_1_score, ..., row_n_score] for y == 0
        scores[1] = [row_0_score, row_1_score, ..., row_n_score] for y == 1

        The per-row scores are passed to ``self.nb_predict`` with ``theta``.
        """
        labels = (0, 1)
        scores = [[] for _ in labels]  # one list of per-row scores per label
        for row in data_row:
            # Additive term applied to every bin probability for this batch.
            additive = float(alpha) / len(data_row)
            running = [1 for _ in labels]
            for c, value in enumerate(row):
                xbin = hw3.classify_x(value, self.cutoffs[c])
                for label in labels:
                    # model[label] = [col1: prob_bin1, prob_bin2 ...], [col2:...]
                    running[label] = running[label] * (self.model[label][c][xbin] + additive)
            for label in labels:
                prior = 1 - self.y_prob if label != 1 else self.y_prob
                scores[label].append(running[label] * prior)
        return self.nb_predict(scores, theta=theta)