コード例 #1
0
def regression_line_spam_no_libs():
    """
    HW1 problem 2: regression-line error per column on the spam dataset.

    Loads the normalized spam data, fits
    ``mystats.linear_regression_points`` on the training split, then prints
    the per-column MSE dictionary followed by the average MSE and average
    RMSE over all columns.
    """
    print('Homework 1 problem 2 - No Libraries - Regression Line')
    print('Spam Dataset')
    test, train = utils.split_test_and_train(
        utils.load_and_normalize_spam_data())
    # Every column except the last one is used as an input column.
    columns = train.columns[:-1]
    truth = train['is_spam']
    Y_fit = mystats.linear_regression_points(train[columns], truth)

    # Each column's fit is its own entry plus the final entry of Y_fit.
    col_MSE = {
        col: mystats.compute_MSE_arrays(Y_fit[idx] + Y_fit[-1], truth)
        for idx, col in enumerate(columns)
    }
    print(col_MSE)

    mse_values = list(col_MSE.values())
    average_MSE = utils.average(mse_values)
    average_RMSE = utils.average(np.sqrt(mse_values))
    print('Average MSE: ' + str(average_MSE))
    print('Average RMSE: ' + str(average_RMSE))
コード例 #2
0
def decision_spambase_set_no_libs():
    """
    Solution for HW1 prob 1: regression decision tree on Spambase, no libs.

    Grows a tree on the training split via ``branch_node``, prints the tree,
    its leaves and the training error, then re-uses the same root node on the
    test split and prints the prediction sum, MSE, confusion counts and
    accuracy.
    """
    print('Homework 1 problem 1 - No Libraries - Regression Decision tree')
    print('Spambase Dataset')
    spam_data = utils.load_and_normalize_spam_data()
    test, train = utils.split_test_and_train(spam_data)
    print(str(len(train)) + " # in training set <--> # in test " + str(len(test)))

    # --- training ---
    node = mytree.Node(np.ones(len(train)))
    branch_node(node, train, 5, 'is_spam')
    node.show_children_tree(follow=False)

    model = mytree.Tree(node)
    model.print_leaves()
    print('Trained model error is : ' + str(model.error()))

    # --- testing: the trained root is re-pointed at the test rows ---
    node.presence = np.ones(len(test))
    test_node(node, test, 'is_spam')
    test_tree = mytree.Tree(node)
    prediction = test_tree.predict_obj()
    test_tree.print_leaves_test()
    print('predict sum: ' + str(sum(prediction)))
    print('MSE:' + str(test_tree.error_test()))

    # `.values` replaces the deprecated `Series.as_matrix()` (removed in
    # pandas 1.0); both give the underlying ndarray.
    truth = test['is_spam'].values
    tp, tn, fp, fn = mystats.get_performance_stats(truth, prediction)
    print('TP: {}\tFP: {}\nTN: {}\tFN: {}'.format(tp, fp, tn, fn))
    print('Accuracy: ' + str(mystats.compute_accuracy(tp, tn, fp, fn)))
    print('MSE: ' + str(mystats.compute_MSE_arrays(prediction, test['is_spam'])))
コード例 #3
0
def SpamClassifier(features, skclassifier, myclassifier):
    """
    Compare two classifiers wrapped in ``hw7u.KNN`` on the spam data.

    Fold 0 of a 10-fold partition is used as the test set and the remaining
    folds as the training set. The accuracy of both classifiers is printed.

    Args:
        features: the string 'all' to use every column, otherwise a
            collection of column indices to keep.
        skclassifier: classifier passed to ``hw7u.KNN`` for the "scikit" run.
        myclassifier: classifier passed to ``hw7u.KNN`` for the "my" run.
    """
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    if features != 'all':
        # Keep only the columns whose index is listed in `features`.
        # The previous code collected them but then rebuilt `data` from the
        # unfiltered columns, so the selection never took effect.
        # NOTE(review): the truth column is split off below, so its index
        # presumably has to be part of `features` -- confirm with callers.
        columns = utils.transpose_array(data)
        selected = [column for i, column in enumerate(columns) if i in features]
        data = utils.transpose_array(selected)
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    print('start MyKNN')
    knn = hw7u.KNN(classifier=myclassifier)
    print('start scikit')
    knnsci = hw7u.KNN(classifier=skclassifier)
    print('start my pred')
    y_pred = knn.predict(X_test, X, y)
    # Computed once and reused in the final summary line.
    my_accuracy = accuracy_score(fix_y(y_test), fix_y(y_pred))
    print('My Accuracy: {}'.format(my_accuracy))
    print('start sk pred')
    y_sci = knnsci.predict(X_test, X, y)
    sk_accuracy = accuracy_score(fix_y(y_test), fix_y(y_sci))
    print('SciKit Accuracy: {}  My Accuracy: {}'.format(sk_accuracy, my_accuracy))
コード例 #4
0
def q1a():
    """Run ``svm_q1`` on the spam data with ``svm.SVC(kernel='poly')``.

    Recorded output of a previous run:
        length train: 4140 length test 461
        train acc: 0.806763285024 test acc: 0.819956616052
    """
    spam_frame = utils.load_and_normalize_spam_data()
    data = utils.pandas_to_data(spam_frame)
    classifier = svm.SVC(kernel='poly')
    svm_q1(data, classifier)
コード例 #5
0
def decision_spambase_set():
    """
    HW1 problem 1: train and evaluate a decision tree on Spambase.

    Trains with ``train_decision_tree`` on the training split, predicts the
    test split with ``test_decision_tree`` and prints the binary error.
    """
    print('Homework 1 problem 1 - Regression Decision tree')
    print('Spambase Dataset')
    test, train = utils.split_test_and_train(
        utils.load_and_normalize_spam_data())
    split_sizes = (str(len(train)), str(len(test)))
    print(" # in training set <--> # in test ".join(split_sizes))

    tree = train_decision_tree(train)
    predicted = test_decision_tree(tree, test)
    error = mystats.calculate_binary_error(predicted, test['is_spam'])
    print('Error: ' + str(error))
コード例 #6
0
def q_1():
    """
    Run the enabled spam experiments and pass the results to print_results_1.

    The housing set is still loaded, but all housing experiments are
    commented out, so the housing result list is passed on empty.
    """
    h_test, h_train = utils.load_and_normalize_housing_set()
    h_results = []
    # Housing experiments are currently disabled; recorded notes:
    # h_results.append(dec_or_reg_tree(h_train, h_test, 'MEDV')) # MSE - 568 test- 448
    # h_results.append(linear_reg_errors(h_train, h_test, 'MEDV')) # MSE - 27 test -14
    # h_results.append(linear_reg_errors(h_train, h_test, 'MEDV', True)) # 24176 - 68289
    # h_results.append(linear_gd(h_train, h_test, 'MEDV')) # works but MSE too low? .0022 - .0013
    # h_results.append(logistic_gd(h_train, h_test, 'MEDV'))  # 1.46e_13 - 1.17e+13

    spam_frame = utils.load_and_normalize_spam_data()
    s_test, s_train = utils.split_test_and_train(spam_frame)
    target = "is_spam"
    # List elements are evaluated in order, matching the original appends.
    s_results = [
        dec_or_reg_tree(s_train, s_test, target),          # works .845 - .86
        linear_reg_errors(s_train, s_test, target),        # works .8609 - .903
        linear_reg_errors(s_train, s_test, target, True),  # works .8416 - .8543
        k_folds_linear_gd(s_train, s_test, target),        # does not work .6114 - .6114
        logistic_gd(s_train, s_test, target),              # returns perfect... 1- 1
    ]
    print_results_1(s_results, h_results)
コード例 #7
0
def testLogisticGradient():
    """
    Fit logistic gradient descent on the spam data and report its errors.

    Returns:
        ``[error_train, error_test]`` as computed by ``mystats.get_error``.
    """
    spam_frame = utils.load_and_normalize_spam_data()
    df_test, df_train = utils.split_test_and_train(spam_frame)
    Y = 'is_spam'
    binary = utils.check_binary(df_train[Y])
    model = gd.logistic_gradient(df_train, df_train[Y], .1, max_iterations=5)

    # Evaluate on the training split first, then on the test split.
    errors = []
    for frame in (df_train, df_test):
        predict = gd.predict(frame, model, binary, True)
        print(predict)
        errors.append(mystats.get_error(predict, frame[Y], binary))

    error_train, error_test = errors
    print('error train {} error_test {}'.format(error_train, error_test))
    return errors
コード例 #8
0
def _closest_same_and_opposite(X, y, i, j):
    """Return the smallest squared gap in feature ``j`` between sample ``i``
    and (a) any other sample with the same label, (b) any sample with a
    different label. Either value is None when no such sample exists."""
    closest_same = None
    closest_opp = None
    for z_i in range(len(X)):
        if z_i == i:
            continue
        diff = (X[z_i][j] - X[i][j]) ** 2
        if y[z_i] == y[i]:
            if closest_same is None or diff < closest_same:
                closest_same = diff
        elif closest_opp is None or diff < closest_opp:
            closest_opp = diff
    return closest_same, closest_opp


def relief(n):
    """
    Rank the spam features with a per-feature RELIEF-style weight.

    For every feature and every training sample (fold 0 of a 10-fold
    partition is held out), the smallest squared difference in that single
    feature to a same-label sample and to a different-label sample is found,
    and ``opposite - same`` is added to the feature's weight. Progress and
    the final weight vector are printed.

    Args:
        n: number of top-weighted features to return.

    Returns:
        List with the indices of the ``n`` highest-weighted features, highest
        weight first (ties ordered by descending index).
    """
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, _kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)

    n_features = len(X[0])
    n_samples = len(X)
    weights = np.zeros(n_features)
    for j in range(n_features):  # feature
        for i in range(n_samples):  # data
            closest_same, closest_opp = _closest_same_and_opposite(X, y, i, j)
            # A sample without a same-label or opposite-label neighbour
            # contributes nothing instead of raising on `-None`.
            if closest_same is not None and closest_opp is not None:
                weights[j] += closest_opp - closest_same
            if i % 1000 == 0:
                print('feature {} of {}, sample {} of {}'.format(j, n_features, i, n_samples))
    print(weights)

    # The previous `sorted(...)[:n][1]` returned only the second-ranked
    # (weight, index) pair; return the indices of the top n features instead.
    ranked = sorted(zip(weights, range(len(weights))), reverse=True)
    return [index for _, index in ranked[:n]]
コード例 #9
0
def runSpamDensity(_i, j, features='all'):
    """
    Fit the density variant of ``hw7u.MyKNN`` on the spam data and print its
    accuracy on fold 0 of a 10-fold partition.

    Args:
        _i: not used by this function.
        j: index into the local metric list selecting the kernel name.
        features: not used by this function.
    """
    metric = ['gaussian', 'poly2', 'cosine_similarity', 'gaussian_density']
    spam_frame = utils.load_and_normalize_spam_data()
    data = utils.pandas_to_data(spam_frame)
    k = 10
    folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    # Report the train and test sizes.
    for rows in (X, X_test):
        print(len(rows))

    myclassifier = hw7u.MyKNN(metric=metric[j], density=True)
    print('start MyKNN')
    myclassifier.fit(X, y)
    print('start my pred')
    y_pred = myclassifier.predict(X_test)
    print(y_pred)
    accuracy = accuracy_score(fix_y(y_test), fix_y(y_pred))
    print('2b: My Accuracy: {}'.format(accuracy))
コード例 #10
0
def load_and_normalize_spambase():
    """Return the result of ``utils.load_and_normalize_spam_data()``."""
    spam_data = utils.load_and_normalize_spam_data()
    return spam_data
コード例 #11
0
def analyze_spambase_hw1():
    """HW1 problem 2: run ``mystats.k_folds`` with 10 on the spam data."""
    fold_count = 10
    mystats.k_folds(utils.load_and_normalize_spam_data(), fold_count)
コード例 #12
0
def q2_kernel_poly():
    """Run ``svm_q1`` on the spam data with the custom SMO SVC, 'poly' kernel.

    Recorded settings: c = 1, tol = 1-e2, passes = 1
    """
    spam_frame = utils.load_and_normalize_spam_data()
    data = utils.pandas_to_data(spam_frame)
    classifier = mysvm.SVC(mysvm.SMO, Kernel('poly'))
    svm_q1(data, classifier)
コード例 #13
0
def q2():
    """Run ``svm_q1`` on the spam data with the custom SMO SVC, 'linear' kernel.

    Recorded settings: c= 1, tol = 1-e2, passes = 1
    Recorded result: train acc: 0.904830917874 test acc: 0.917570498915
    """
    spam_frame = utils.load_and_normalize_spam_data()
    data = utils.pandas_to_data(spam_frame)
    classifier = mysvm.SVC(mysvm.SMO, Kernel('linear'))
    svm_q1(data, classifier)