Example #1
def q1():
    """ feature analysis with Adaboost """
    #spamData = hw3u.pandas_to_data(hw3u.load_and_normalize_spambase())
    spamData = utils.load_and_normalize_polluted_spam_data()
    k = 10
    all_folds = hw3u.partition_folds(spamData, k)
    col_errs = []
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    # We're not actually cross-validating anything -- we just want feature weights
    #X = np.concatenate([X, X_test], axis=0)
    #y = np.concatenate([y, y_test], axis=0)

    #adaboost = adac.AdaboostOptimal(max_rounds=100, do_fast=False, learner=lambda: DecisionTreeClassifier(max_depth=1, splitter='random'))
    adaboost = adac.AdaboostOptimal(max_rounds=100, do_fast=False, learner=lambda: DecisionTreeClassifier(max_depth=1, splitter='best'))
    #adaboost = adac.AdaboostOptimal(max_rounds=10, do_fast=False, learner=hw4u.TreeOptimal)
    adaboost.fit(X, y)


    margin_fractions = get_margin_fractions(adaboost, X[0])
    #margin_fractions_v = hw5u.get_margin_fractions_validate(adaboost, X, y)
    #print col_errs
    ranked = rank(margin_fractions)
    print_ranks(ranked)

    pred = adaboost.predict(X_test)
    print 'Accuracy: {}'.format(accuracy_score(adaboost._check_y_not_zero(y_test), adaboost._check_y_not_zero(pred)))
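
For reference, scikit-learn's AdaBoostClassifier exposes a comparable feature ranking directly through feature_importances_ when built on decision stumps. A minimal sketch on synthetic data (the spam loaders above are project-specific helpers, so the matrix here is a stand-in):

import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.rand(200, 57)                     # 57 features, like spambase
y = (X[:, 0] + X[:, 5] > 1).astype(int)
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                         n_estimators=100).fit(X, y)
# rank features by their aggregate importance across the boosted stumps
ranked = sorted(zip(ada.feature_importances_, range(X.shape[1])), reverse=True)
for importance, idx in ranked[:10]:
    print 'feature {}: {:.4f}'.format(idx, importance)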
Example #2
def runDigits(n, skclf, myclf):
    mnsize = n
    df = hw6u.load_mnist_features(mnsize)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y, dtype=np.float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=np.float)
    print 'my fit'
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print 'scikit fit'
    skclf = skclf.fit(X, y)
    print 'my predict'
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print '({})'.format(myacc)
    print 'scikit predict'
    sk_pred = skclf.predict(X_test)
    print sk_pred
    print y_test
    print y_pred
    print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc)
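
A self-contained version of the same one-vs-rest comparison, using scikit-learn's bundled digits set as a stand-in for the project-specific load_mnist_features:

from sklearn.datasets import load_digits
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

digits = load_digits()
X, X_test = digits.data[:1500], digits.data[1500:]
y, y_test = digits.target[:1500], digits.target[1500:]
clf = OneVsRestClassifier(LinearSVC()).fit(X, y)
print 'OvR accuracy: {}'.format(accuracy_score(y_test, clf.predict(X_test)))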
Example #3
def SpamClassifier(features, skclassifier, myclassifier):
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    if features != 'all':
        # Only use the features passed in the features array
        new = []
        t = utils.transpose_array(data)
        for i in xrange(len(t)):
            if i in features:
                new.append(t[i])
        data = utils.transpose_array(new)  # rebuild rows from the kept feature columns
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    print 'start MyKNN'
    knn = hw7u.KNN(classifier=myclassifier)
    print 'start scikit'
    knnsci = hw7u.KNN(classifier=skclassifier)
    print 'start my pred'
    y_pred = knn.predict(X_test, X, y)
    print 'My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_pred)))
    print 'start sk pred'
    y_sci = knnsci.predict(X_test, X, y)
    print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_sci)), accuracy_score(fix_y(y_test), fix_y(y_pred)))
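
As a sanity check against the custom hw7u.KNN, scikit-learn's KNeighborsClassifier gives a baseline on the same kind of split; a sketch on synthetic data, since the spam loaders above are project-specific:

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(0)
X = rng.rand(300, 57)
y = (X[:, 0] + X[:, 1] > 1).astype(int)
X_train, X_test, y_train, y_test = X[:200], X[200:], y[:200], y[200:]
clf = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
print 'kNN acc: {}'.format(accuracy_score(y_test, clf.predict(X_test)))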
Example #4
def TreeTest():
    spamDat = spamData()
    k = 10
    all_folds = hw3.partition_folds(spamDat, k)
    num_in_fold = []
    err_in_fold = []
    for i in range(len(all_folds) - 1):
        spam = all_folds[i]
        num_in_fold.append(len(spam))
        truth, f_data = decTree.split_truth_from_data(spam)
        tree = decTree.TreeOptimal(max_depth=2)
        #tree = decTree.TreeRandom()
        tree.fit(f_data, truth)
        print 'Prediction...\n'
        predict = tree.predict(f_data)
        print predict
        print truth
        error = 1. - hw3.get_accuracy(predict, truth)
        err_in_fold.append(error)
        print 'Tree error is: {}'.format(error)
    spam = all_folds[k - 1]
    truth, f_data = decTree.split_truth_from_data(spam)
    tree = decTree.TreeOptimal(max_depth=2)
    #tree = decTree.TreeRandom()
    tree.fit(f_data, truth)
    predict = tree.predict(f_data)
    error = 1. - hw3.get_accuracy(predict, truth)
    sum_training_err = 0
    for i in range(len(num_in_fold)):
        sum_training_err += err_in_fold[i]
        #sum_training_err += float(err_in_fold)/num_in_fold
    average_training_error = float(sum_training_err)/len(num_in_fold)
    print 'Average training error: {}\nAverage testing error: {}'.format(average_training_error, error)
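
The fold-averaging above can be expressed more compactly with scikit-learn's KFold; a sketch on synthetic data (spamData() is a project helper):

import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.rand(500, 10)
y = (X[:, 0] > 0.5).astype(int)
errors = []
for train_idx, test_idx in KFold(n_splits=10).split(X):
    tree = DecisionTreeClassifier(max_depth=2).fit(X[train_idx], y[train_idx])
    errors.append(1.0 - tree.score(X[test_idx], y[test_idx]))
print 'average error: {}'.format(np.mean(errors))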
Example #5
def runDigitsDensity(n, _i, j):
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    ma = hw7u.Kernel(ktype=metric[j]+'_sci').compute
    #skclf = KernelDensity(metric=ma)
    myclf = hw7u.MyKNN(metric=metric[j], density=True)
    mnsize = n
    df = hw6u.load_mnist_features(mnsize)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y, dtype=np.float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=np.float)
    print 'my fit'
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print 'scikit fit'
    #skclf = skclf.fit(X, y)
    print 'my predict'
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print '({})'.format(myacc)
    #print 'scikit predict'
    #sk_pred = skclf.predict(X_test)
    #print sk_pred
    print y_test
    print y_pred
    #print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc)
    print 'My Accuracy: {}'.format(myacc)
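
The density=True path of MyKNN is not shown in this snippet; one standard way to classify by density, which this sketch assumes, is to fit one KernelDensity per class and predict the class with the highest log-density (class priors are ignored here for brevity):

import numpy as np
from sklearn.neighbors import KernelDensity

def kde_predict(X_train, y_train, X_test, bandwidth=0.5):
    X_train, y_train = np.asarray(X_train), np.asarray(y_train)
    classes = np.unique(y_train)
    log_dens = []
    for c in classes:
        kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
        kde.fit(X_train[y_train == c])        # one density model per class
        log_dens.append(kde.score_samples(np.asarray(X_test)))
    return classes[np.argmax(np.vstack(log_dens), axis=0)]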
Example #6
def tests_radius():
    i = 0
    j = 0
    k = 10
    X, y = testData()
    #print X
    X = np.concatenate([X, y.reshape((len(y), 1))], axis=1)
    X = [list(x.ravel()) for x in X]
    #radius = [3, 5, 7]
    radius = [1e-1, 1e-2, 1e-3]  # for radius
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    ma = speedy.Kernel(ktype=metric[j]).compute
    #ma = hw7u.Kernel(ktype=metric[j]).compute
    print 'spam radius is {}'.format(radius[i])
    clf = hw7u.MyKNN(radius=radius[i], metric=metric[j], outlier_label=-1)
    skclf = RadiusNeighborsClassifier(radius=radius[i], algorithm='brute', metric="euclidean", p=2, outlier_label=.5)
    all_folds = hw3u.partition_folds(X, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    print 'start scikit'
    knnsci = hw7u.KNN(classifier=skclf)
    print 'start MyKNN'
    knn = hw7u.KNN(classifier=clf)
    print 'start sk pred'
    y_sci = knnsci.predict(X_test, X, y)
    print 'start my pred'
    y_pred = knn.predict(X_test, X, y)
    print y_pred
    print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_sci)), accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_pred)))
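
Standalone, scikit-learn's radius-based classifier shows the outlier_label behavior the comparison above relies on: points with no neighbors inside the radius get the outlier label.

import numpy as np
from sklearn.neighbors import RadiusNeighborsClassifier

X = np.array([[0.0], [0.1], [0.9], [1.0]])
y = np.array([0, 0, 1, 1])
clf = RadiusNeighborsClassifier(radius=0.2, outlier_label=-1).fit(X, y)
print clf.predict([[0.05], [0.5]])  # [0 -1]: the midpoint has no neighbors in range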
Example #7
def q1():
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    all_folds = hw3.partition_folds(spamData, k)
    tprs = []
    fprs = []
    for i in [0]:  # only fold 0; use range(len(all_folds)) for the full cross-validation
        kf_data, kf_test = dl.get_train_and_test(all_folds, i)
        y, X = hw4.split_truth_from_data(kf_data)
        y_test, X_test = hw4.split_truth_from_data(kf_test)
        adaboost = run_adaboost(X, y, X_test, y_test, i)
        predicted = adaboost.predict(X)
        print(roc_auc_score(y, predicted))
        for r in range(len(adaboost.snapshots)):  # boosting round, distinct from the fold index i
            round_number = r + 1
            ab = adaboost.snapshots[r]
            yt_pred = ab.predict(X_test)
            round_err = float(np.sum([1 if yt != yp else 0 for yt, yp in zip(yt_pred, y_test)]))/len(y_test)  # fraction misclassified
            adaboost.adaboost_error_test[round_number] = round_err
        print predicted[:20]
        print y[:20]
        name = 'q1'
        directory = '/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks'
        path = os.path.join(directory, name + 'hw4errors.pdf')
        tterrpath = os.path.join(directory, name + 'hw4_errors_test_train.pdf')
        print path
        plt.Errors([adaboost.local_errors]).plot_all_errors(path)
        plt.Errors([adaboost.adaboost_error, adaboost.adaboost_error_test]).plot_all_errors(tterrpath)
        roc = plt.ROC()
        #roc.add_tpr_fpr_arrays(adaboost.tpr.values(), adaboost.fpr.values())
        get_tpr_fpr(adaboost, roc, X_test, y_test, 30)
        roc.plot_ROC(os.path.join(directory, name + 'hw4_roc.pdf'))
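
The ROC bookkeeping above (get_tpr_fpr over 30 thresholds) has a direct scikit-learn equivalent; a sketch with stand-in scores, where in this setting the scores would be adaboost's real-valued margin on X_test:

import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score

y_true = np.array([1, 1, 0, 1, 0, 0])
scores = np.array([0.9, 0.7, 0.6, 0.4, 0.3, 0.1])  # stand-in boosting margins
fpr, tpr, thresholds = roc_curve(y_true, scores)
print 'AUC: {}'.format(roc_auc_score(y_true, scores))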
Example #8
def svm_q1(data, classifier=svm.SVC()):
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    print 'length train: {} length test {}'.format(len(X), len(X_test))
    clf = classifier
    clf.fit(X, y)
    y_pred = clf.predict(X_test)
    print 'train acc: {} test acc: {}'.format(accuracy_score(fix_y(y), fix_y(clf.predict(X))), accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test))))
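
Since only fold 0 is used above, a single split can be noisy; a 10-fold sketch with cross_val_score on synthetic data gives the averaged picture:

import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
X = rng.rand(300, 5)
y = (X[:, 0] > 0.5).astype(int)
scores = cross_val_score(svm.SVC(), X, y, cv=10)
print 'mean CV accuracy: {}'.format(scores.mean())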
Example #9
def q6():
    """ Bagging - sample with replacement """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    y, X = hw4.split_truth_from_data(spamData)
    bagged = bag.Bagging(max_rounds=100, sample_size=1000, learner=lambda: DecisionTreeClassifier(max_depth=3))
    bagged.fit(X, y)
    kf_fold = hw4.partition_folds(spamData, .4)
    test_y, test_X = hw4.split_truth_from_data(kf_fold[0])
    test_pred = bagged.predict(test_X)
    test_y = bagged._check_y(test_y)
    test_pred = bagged._check_y(test_pred)
    test_error = float(sum([0 if py == ty else 1 for py, ty in zip(test_pred, test_y)]))/len(test_y)
    print 'Final testing error: {}'.format(test_error)
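
scikit-learn's BaggingClassifier implements the same sample-with-replacement scheme and makes a useful cross-check for bag.Bagging; a sketch on synthetic data:

import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.rand(1000, 57)
y = (X[:, 0] + X[:, 1] > 1).astype(int)
bagged = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),
                           n_estimators=100, max_samples=0.5,
                           bootstrap=True).fit(X[:600], y[:600])
print 'test error: {}'.format(1.0 - bagged.score(X[600:], y[600:]))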
Example #10
def q7():
    h_test, h_train = utils.load_and_normalize_housing_set()
    housingData_test = hw3.pandas_to_data(h_test)
    housingData_train = hw3.pandas_to_data(h_train)
    y, X = hw4.split_truth_from_data(housingData_train)
    y_test, X_test = hw4.split_truth_from_data(housingData_test)
    #gb = GradientBoostingRegressor(learning_rate=.1, n_estimators=1, max_depth=1)
    gb = gradb.GradientBoostRegressor(learning_rate=.1, n_estimators=100, max_depth=1, learner=lambda: DecisionTreeRegressor(max_depth=1))
    gb.fit(X, y)
    gb.print_stats()
    yhat = gb.predict(X_test)
    print y_test[:10]
    print yhat[:10]
    print 'MSE: {}'.format(hw4.compute_mse(y_test, yhat))
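
The commented-out line above names scikit-learn's GradientBoostingRegressor; with matching learning rate and stump depth it serves as a reference point for gradb.GradientBoostRegressor. A sketch on synthetic data:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

rng = np.random.RandomState(0)
X = rng.rand(400, 13)                    # 13 features, like the housing set
y = 3 * X[:, 0] + 0.1 * rng.randn(400)
gb = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100,
                               max_depth=1).fit(X[:300], y[:300])
print 'MSE: {}'.format(mean_squared_error(y[300:], gb.predict(X[300:])))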
Example #11
def q3():
    """Run your code from PB1 on Spambase dataset to perform Active Learning.
    Specifically:
    - start with a training set of about 5% of the data (selected randomly)
    - iterate M episodes: train the Adaboost for T rounds; from the datapoints
      not in the training set, select the 2% ones that are closest to the
      separation surface (boosting score F(x) closest to ) and add these to the
      training set (with labels). Repeat until the size of the training set
      reaches 50% of the data.

    How is the performance improving with the training set increase? Compare the
    performance of the Adaboost algorithm on the c% randomly selected training set
    with c% actively-built training set for several values of c : 5, 10, 15, 20,
    30, 50.
    """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    percent = .05
    all_folds = hw4.partition_folds_q4(spamData, percent)
    kf_train = all_folds[0]
    kf_test = all_folds[1]
    left_over = all_folds[2]

    while len(kf_train) < len(spamData)/2:
        y, X = hw4.split_truth_from_data(kf_train)
        y_test, X_test = hw4.split_truth_from_data(kf_test)
        adaboost = run_adaboost(X, y, X_test, y_test, 'q2_crx')

        yt_pred = adaboost.predict(X_test)
        order = adaboost.rank(X_test)
        yt_pred = adaboost._check_y(yt_pred)
        y_test = adaboost._check_y(y_test)
        round_err = float(np.sum([1 if yt!=yp else 0 for yt, yp in zip(yt_pred, y_test)]))/len(y_test)

        print 'Error {}'.format(round_err)
        shift_number = int(len(order) * .02)  # number of items to switch into training set
        mask = []
        for i in xrange(shift_number):
            mask.append(order[i])
            kf_train.append(kf_test[order[i]])
        new_test = [kf_test[i] for i in range(len(kf_test)) if i not in mask]
        for i in xrange(len(mask)):
            new_test.append(left_over[i])
        left_over = left_over[len(mask):]
        kf_test = new_test[:]
        print 'test len {} train len {} leftover len {} shifting {}'.format(len(kf_test), len(kf_train), len(left_over), shift_number)
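
The selection rule in this loop (move the unlabeled points whose boosting score F(x) is closest to 0 into the training set) is uncertainty sampling. A self-contained sketch of one episode using scikit-learn's AdaBoostClassifier, whose decision_function stands in for F(x); all names here are illustrative and the inputs are assumed to be numpy arrays:

import numpy as np
from sklearn.ensemble import AdaBoostClassifier

def active_step(clf, X_train, y_train, X_pool, y_pool, frac=0.02):
    # Fit, score the pool by distance from the decision surface, then
    # move the frac closest points (with their labels) into training.
    clf.fit(X_train, y_train)
    scores = np.abs(clf.decision_function(X_pool))
    take = np.argsort(scores)[:max(1, int(frac * len(X_pool)))]
    keep = np.setdiff1d(np.arange(len(X_pool)), take)
    X_train = np.vstack([X_train, X_pool[take]])
    y_train = np.concatenate([y_train, y_pool[take]])
    return X_train, y_train, X_pool[keep], y_pool[keep]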
Example #12
    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)

        self.mean = np.mean(y)
        #y = np.asarray([self.mean]*len(y))
        #hypothesis = self.learner().fit(X, y)
        #self.hypotheses.append(hypothesis)
        for round in xrange(self.max_rounds):
            residual = [(yn - yl) for yn, yl in zip(y, self.predict(X))]
            hypothesis = self.learner().fit(X, residual)
            self.hypotheses.append(hypothesis)

            self.local_error.append(hw4.compute_mse(residual, hypothesis.predict(X)))

            pred_round = self.predict(X)
            self.train_score = hw4.compute_mse(pred_round, y)
            self.training_error.append(self.train_score)
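
fit calls self.predict during boosting, but the matching predict is not part of this snippet. A sketch consistent with the loop above; starting from the training mean is implied by self.mean, while scaling by self.learning_rate is an assumption based on the constructor call in Example #10:

    def predict(self, X):
        # Accumulate the mean plus each residual learner's contribution.
        # The learning_rate scaling here is an assumption, not confirmed
        # by the snippet above.
        pred = np.full(len(X), self.mean)
        for hypothesis in self.hypotheses:
            pred = pred + self.learning_rate * hypothesis.predict(X)
        return pred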
Example #13
def multiclassSVC(classifier, sz=2000):

    mnsize = sz
    df = hw6u.load_mnist_features(mnsize)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test)
    print 'Beginning analysis: {}'.format(X.shape)
    #clf = OneVsRestClassifier(classifier, n_jobs=4).fit(X, y)
    clf = OneVsOneClassifier(classifier).fit(X, y)
    #clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=10, random_state=0).fit(np.asarray(X), y)
    y_pred = clf.predict(X)
    print 'train acc: {} test acc: {}'.format(accuracy_score(fix_y(y_pred), fix_y(y)), accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test))))
Example #14
def q2():
    """Boosting on UCI datasets"""
    crx = dl.data_q3_crx()
    #crx = dl.data_q3_vote()
    num_points = len(crx)
    for i in xrange(5, 85, 5):
        percent = float(i)/100
        all_folds = hw4.partition_folds(crx, percent)
        kf_train = all_folds[0]
        kf_test = all_folds[1]
        y, X = hw4.split_truth_from_data(kf_train)
        y_test, X_test = hw4.split_truth_from_data(kf_test)
        adaboost = run_adaboost(X, y, X_test, y_test, 'q2_crx')
        yt_pred = adaboost.predict(X_test)
        yt_pred = adaboost._check_y(yt_pred)
        y_test = adaboost._check_y(y_test)
        round_err = float(np.sum([1 if yt!=yp else 0 for yt, yp in zip(yt_pred, y_test)]))/len(y_test)
        last_round = adaboost.local_errors.keys()[-1]
        #print 'Error at {}%: Train: {} Test: {}'.format(percent, adaboost.adaboost_error[last_round], round_err)
        print 'Error at {}%: Test: {}'.format(percent, round_err)
Example #15
def relief(n):
    max_iters = 1
    j = 0
    i = 1
    n_neighbors = [1, 3, 7]
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    ma = hw7u.Kernel(ktype=metric[j]).compute
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    loops = 0
    weights = np.zeros(len(X[0]))
    loops += 1
    n_features = len(X[0])
    n_samples = len(X)
    for j in range(n_features): #feature

        for i in range(n_samples):  # data
            closest_same = None
            closest_opp = None
            for z_i in range(n_samples):
                if z_i == i:
                    continue
                diff = (X[z_i][j] - X[i][j]) ** 2
                if y[z_i] == y[i]:  # same
                    if closest_same is None or diff < closest_same:
                        closest_same = diff
                else:  # opp
                    if closest_opp is None or diff < closest_opp:
                        closest_opp = diff
            weights[j] += (-closest_same + closest_opp)
            if i % 1000 == 0:
                print 'feature {} of {}, sample {} of {}'.format(j, n_features, i, n_samples)
    print weights

    return [idx for _, idx in sorted(zip(weights, range(len(weights))), reverse=True)[:n]]  # indices of the top-n weighted features
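
For larger sample counts the per-feature O(n^2) Python loop above gets slow; a numpy-vectorized sketch of the same Relief update (it assumes every class has at least two samples, as the loop above implicitly does):

import numpy as np

def relief_weights(X, y):
    X, y = np.asarray(X, dtype=float), np.asarray(y)
    n, d = X.shape
    w = np.zeros(d)
    same = y[:, None] == y[None, :]
    for j in range(d):
        diff = (X[:, j][:, None] - X[:, j][None, :]) ** 2
        np.fill_diagonal(diff, np.inf)            # exclude self-distance
        closest_opp = np.where(~same, diff, np.inf).min(axis=1)
        closest_same = np.where(same, diff, np.inf).min(axis=1)
        w[j] = (closest_opp - closest_same).sum()
    return w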
Example #16
def tests_density():
    i = 0
    j = 2
    k = 10
    X, y = testData()
    print X
    X = np.concatenate([X, y.reshape((len(y), 1))], axis=1)
    X = [list(x.ravel()) for x in X]
    radius = [3, 5, 7]
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    ma = hw7u.Kernel(ktype=metric[j]).compute
    params = {'bandwidth': np.logspace(-1, 1, 20)}
    grid = GridSearchCV(KernelDensity(), params)
    grid.fit(X)
    clf = hw7u.MyKNN(metric=metric[j], density=True)

    bw = grid.best_estimator_.bandwidth
    print("best bandwidth: {0}".format(bw))

    # use the best estimator to compute the kernel density estimate
    kde = grid.best_estimator_
    skclf = KernelDensity(bandwidth=bw, kernel='gaussian')
    skclf.fit(X[:-10])  # KernelDensity is unsupervised; labels are ignored
    print skclf.score_samples(X[-10:])
    return  # early exit: the kNN comparison below is currently unreachable
    all_folds = hw3u.partition_folds(X, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    print 'start scikit'
    knnsci = hw7u.KNN(classifier=skclf)
    print 'start MyKNN'
    knn = hw7u.KNN(classifier=clf)
    print 'start sk pred'
    y_sci = knnsci.predict(X_test, X, y)
    print 'start my pred'
    y_pred = knn.predict(X_test, X, y)
    print y_pred
    print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_sci)), accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_pred)))
Example #17
def runSpamDensity(_i, j, features='all'):
    metric = ['gaussian', 'poly2', 'cosine_similarity', 'gaussian_density']
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    print(len(X))
    print(len(X_test))

    myclassifier = hw7u.MyKNN(metric=metric[j], density=True)
    print 'start MyKNN'
    myclassifier.fit(X, y)
    #print 'start scikit'
    #knnsci = skclassifier.fit(X, y)
    print 'start my pred'
    y_pred = myclassifier.predict(X_test)
    print(y_pred)
    #print 'start sk pred'
    #y_sci = knnsci.score(X_test)
    #print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_sci)), accuracy_score(fix_y(y_test), fix_y(y_pred)))
    print '2b: My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_pred)))