def regression_line_housing_no_libs():
    """
    Solution for HW1 prob 2
    """
    print('Homework 1 problem 2 - No Libraries - Regression Line')
    print('Housing Dataset')
    test, train = utils.load_and_normalize_housing_set()
    print str(len(train)) + ' in training set <--> ' + str(len(test)) + ' in test set'
    columns = train.columns[:-1]
    Y_fit = mystats.linear_regression_points(train[columns], train['MEDV'])
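    # Y_fit is assumed to hold one fitted-value array per feature column, with
    # the intercept contribution as the last entry (see the per-column fit below).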
    print 'Y_fit'
    print Y_fit
    #for i in range(0, len(Y_fit)):
    #    print str(Y_fit[i]) + ' -- ' + str(train['MEDV'][i])

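    # Accumulate per-column contributions so row_sums[i] is the combined fitted
    # value for training row i.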
    row_sums = np.zeros(len(Y_fit[0]))
    for col in Y_fit:
        for i in range(0, len(col)):
            row_sums[i] += col[i]

    print row_sums

    col_MSE = {}
    for i, col in enumerate(columns):
        col_fit = Y_fit[i] + Y_fit[-1]  # per-column fit plus intercept term
        col_MSE[col] = mystats.compute_MSE_arrays(col_fit, train['MEDV'])
    print col_MSE
    RMSE = np.sqrt(col_MSE.values())
    average_MSE = utils.average(col_MSE.values())
    average_RMSE = utils.average(RMSE)
    print 'Average MSE: ' + str(average_MSE)
    print 'Average RMSE: ' + str(average_RMSE)
def test_node(node, df, Y, regression=False):
    """
    :param node: Node object defined in Stats
    :param df: The dataframe being used by the tree
    :param Y: Feature to predict
    :return: void
    """
    print 'Testing Branching Level : ' + str(node.level)
    data = node.get_node_data(df)
    print 'Length of TEST data ' + str(len(data)) + ' len df: ' + str(len(df))
    feature = node.label['feature']
    label = node.label['criteria']
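    # Leaf nodes are assumed (as used here) to store an empty feature string,
    # so a non-empty feature marks an internal split node.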
    if feature != '':
        print 'feature ' + feature
        #print df[feature]
        A_array, B_array = node.split(feature, df[feature], label)
        print 'Test A : {} B: {}'.format(sum(A_array), sum(B_array))
        # Guard against missing children before touching them.
        if node.left is not None:
            node.left.set_presence(A_array)
            test_node(node.left, df, Y, regression)
        if node.right is not None:
            node.right.set_presence(B_array)
            test_node(node.right, df, Y, regression)
    else:
        predict = node.predict
        if not regression:
            error = mystats.binary_error(data, Y, predict)
        else:
            error = mystats.compute_MSE(predict, list(data[Y]))
        node.test_leaf(error)
def branch_node(node, df, threshold, Y, regression=False):
    """
    :param node: Node object defined in Stats
    :param df: The dataframe being used by the tree
    :param threshold: max branching depth
    :param Y: Feature to predict
    :return: void
    """
    print 'Branching Level : ' + str(node.level)
    data = node.get_node_data(df)
    print 'Length of data ' + str(len(data)) + ' len df: ' + str(len(df))
    feature, label = mytree.find_best_feature_and_label_for_split(data, Y, regression)
    print 'feature: {} label: {}'.format(feature, label)
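    # Split only while a usable feature exists and we are under the depth cap;
    # otherwise turn this node into a leaf below.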
    if feature is not None and node.level < threshold:
        A_array, B_array = node.split(feature, df[feature], label)
        print ' A : {} B: {}'.format(sum(A_array), sum(B_array))
        node.add_left(A_array)
        node.add_right(B_array)
        branch_node(node.left, df, threshold, Y, regression)
        branch_node(node.right, df, threshold, Y, regression)
    else:
        if not regression:
            predict = 0
            prob = mystats.binary_probability(data, Y)
            print 'PROBABILITY ' + str(prob)
            if prob >= .5:
                predict = 1
            error = mystats.binary_error(data, Y, predict)
        else:
            print str(feature) + ' is feature; label: ' + str(label) + ' presence: ' + str(node.presence)
            predict = float(sum(data[Y]))/len(data[Y])
            error = mystats.compute_MSE(predict, list(data[Y]))
        node.leaf(predict, error)
def decision_spambase_set_no_libs():
    """
    Solution for HW1 prob 1
    """
    print('Homework 1 problem 1 - No Libraries - Regression Decision tree')
    print('Spambase Dataset')
    spam_data = utils.load_and_normalize_spam_data()
    test, train = utils.split_test_and_train(spam_data)
    print str(len(train)) + ' in training set <--> ' + str(len(test)) + ' in test set'

    node = mytree.Node(np.ones(len(train)))
    branch_node(node, train, 5, 'is_spam')
    #node.show_children_tree()
    node.show_children_tree(follow=False)

    model = mytree.Tree(node)
    model.print_leaves()
    print 'Trained model error is : ' + str(model.error())

    node.presence = np.ones(len(test))
    test_node(node, test, 'is_spam')
    test_tree = mytree.Tree(node)
    prediction = test_tree.predict_obj()
    test_tree.print_leaves_test()
    print 'predict sum: ' + str(sum(prediction))
    print 'MSE: ' + str(test_tree.error_test())

    [tp, tn, fp, fn] = mystats.get_performance_stats(test['is_spam'].values, prediction)
    print 'TP: {}\tFP: {}\nTN: {}\tFN: {}'.format(tp, fp, tn, fn)
    print 'Accuracy: ' + str(mystats.compute_accuracy(tp,tn, fp,fn))
    print 'MSE: ' + str(mystats.compute_MSE_arrays(prediction, test['is_spam']))
def find_best_label_regression(df_old, col, y):
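    """
    Find the value of `col` that best splits the data for predicting `y`: sort
    by `col`, score every split point by the summed least-squares error of the
    two halves, and return the split value with the minimum score.
    """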
    least_sq = {}
    df = df_old.copy()
    sorted_df = df.sort_values([col]).reset_index(drop=True)
    print "Target values before and after sorting by " + col + ":"
    print df[y][0:10]
    print sorted_df[y][0:10]
    i = 0
    print "Finding label for " + col
    for _, row in sorted_df.iterrows():
        i += 1
        # Skip the first and last positions so neither half is trivial.
        if i == 1 or i > len(sorted_df) - 1:
            continue
        lsq = mystats.least_squares(list(sorted_df[y])[0:i]) + mystats.least_squares(list(sorted_df[y])[i:len(sorted_df[y])])
        least_sq[row[col]] = lsq
    return min(least_sq, key=lambda k: least_sq[k])
def dec_or_reg_tree(df_train, df_test, Y):
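    """
    Fit a decision tree when Y is binary, otherwise a regression tree, and
    return [training error, test error].
    """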
    binary = utils.check_binary(df_train[Y])
    if binary:
        newtree = treeHW4.TreeOptimal(max_depth=1)
        y = list(df_train[Y])
        nondf_train = utils.pandas_to_data(df_train)
        nondf_test = utils.pandas_to_data(df_test)
        newtree.fit(nondf_train, y)
        predict = newtree.predict(nondf_train)
        error_train = mystats.get_error(predict, y, binary)

        y = utils.pandas_to_data(df_test[Y])
        predict = newtree.predict(nondf_test)
        error_test = mystats.get_error(predict, y, binary)
    else:
        node = mytree.Node(np.ones(len(df_train)))
        hw1.branch_node(node, df_train, 5, Y)
        model = mytree.Tree(node)
        predict = model.predict_obj()
        error_train = mystats.get_error(predict, df_train[Y], binary)

        node.presence = np.ones(len(df_test))
        hw1.test_node(node, df_test, Y)
        test_tree = mytree.Tree(node)
        predict = test_tree.predict_obj()
        error_test = mystats.get_error(predict, df_test[Y], binary)
    return [error_train, error_test]
def regression_line_spam_no_libs():
    """
    Solution for HW1 prob 2
    """
    print('Homework 1 problem 2 - No Libraries - Regression Line')
    print('Spam Dataset')
    spam_data = utils.load_and_normalize_spam_data()
    test, train = utils.split_test_and_train(spam_data)
    columns = train.columns[:-1]
    Y_fit = mystats.linear_regression_points(train[columns], train['is_spam'])

    #print 'Y_fit'
    #print Y_fit
    #for i in range(0, len(Y_fit)):
    #    print str(Y_fit[i]) + ' -- ' + str(train['is_spam'][i])

    col_MSE = {}
    for i, col in enumerate(columns):
        col_fit = Y_fit[i] + Y_fit[-1]
        col_MSE[col] = mystats.compute_MSE_arrays(col_fit, train['is_spam'])
    print col_MSE
    RMSE = np.sqrt(col_MSE.values())
    average_MSE = utils.average(col_MSE.values())
    average_RMSE = utils.average(RMSE)
    print 'Average MSE: ' + str(average_MSE)
    print 'Average RMSE: ' + str(average_RMSE)
def test_regression_line_housing_no_libs():
    """
    Testing 2 variable solution for HW1 prob 2
    """
    print('Testing linear regression with 2 columns')
    test, train = utils.load_and_normalize_housing_set()
    print str(len(train)) + ' in training set <--> ' + str(len(test)) + ' in test set'
    columns = train.columns[:-1]
    Y_fit = mystats.linear_regression_points(train[columns[0]], train['MEDV'])
    #for i, col in enumerate(columns):
    print 'Y_fit'
    print Y_fit
    for i in range(0, len(Y_fit)):
        print str(Y_fit[i]) + ' -- ' + str(train['MEDV'][i])
    print train[columns[0]]
    #myplot.points([train[columns[0]], train['MEDV']])

    #myplot.points([train[columns[0]], list(Y_fit[0])])
    myplot.fit_v_point([train[columns[0]], train['MEDV'], list(Y_fit[0] + Y_fit[-1])])
    col_MSE = {}
    print columns[0]
    i = 0
    col = columns[0]  # 'CRIM' in the housing data
    col_fit = Y_fit[i] + Y_fit[-1]
    col_MSE[col] = mystats.compute_MSE_arrays(col_fit, train['MEDV'])
    print col_MSE
def test_vector_equality():
    v1 = [1, 2, 3, 4, 5]
    v2 = [1, 2, 3, 4, 5]
    v3 = [1, 2, 3, 4, 6]
    v4 = [1, 2, 3, 4]
    assert_true(mystats.check_vector_equality(v1, v2))
    assert_false(mystats.check_vector_equality(v2, v3))
    assert_false(mystats.check_vector_equality(v3, v4))
def test_dot_prod():
    data = {1:[1, 2, 3, 4, 5],
            2:[1, 2, 3, 4, 5],
            3:[1, 2, 3, 4, 5],
            4:[1, 2, 3, 4, 5]}
    multiplier = [2, 2, 2, 2, 2]
    truth = [30, 30, 30, 30]
    assert_true(mystats.check_vector_equality(truth, mystats.dot_product_sanity(data, multiplier)))
def logistic_regression(dftrain, dftest, predict_col):
    """ Logistic Regression for HW2 part B"""
    features = dftrain.columns.tolist()
    features.remove(predict_col)
    cls = LogisticRegression()
    cls.fit(dftrain[features], dftrain[predict_col])
    predictions = cls.predict(dftest[features])
    print predictions
    print mystats.compute_ACC(predictions, dftest[predict_col])
def compute_info_gain(df, feature, split, y):
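    """
    Information gain from splitting `feature` at `split` for binary target `y`:
    the parent's entropy minus the entropy of each child partition (the child
    terms are used unweighted here).
    """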
    A = df[[feature, y]]
    # series = [split for x in range(0, len(A[feature]))]
    # print series
    mask = A[feature] <= split
    B = A[mask]
    C = A[~mask]
    info_gain = mystats.binary_entropy(A, y) - mystats.binary_entropy(B, y) - mystats.binary_entropy(C, y)
    # print 'Information Gain: %s' % info_gain
    return info_gain
def compute_info_gain_regression(df, feature, split, y):
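    """
    Regression analogue of information gain: the parent's least-squares error
    minus the least-squares error of each child partition of `feature` at `split`.
    """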
    A = df[[feature, y]]
    # series = [split for x in range(0, len(A[feature]))]
    # print series
    mask = A[feature] <= split
    B = A[mask]
    C = A[~mask]
    info_gain = mystats.least_squares(A, y) - mystats.least_squares(B, y) - mystats.least_squares(C, y)
    print "Information Gain: %s" % info_gain
    return info_gain
def compute_mse_regression(df, feature, split, y):
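    """
    Score a candidate split of `feature` at `split` by the summed MSE of the
    two resulting partitions of `y`; lower is better.
    """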
    A = df[[feature, y]]
    # series = [split for x in range(0, len(A[feature]))]
    # print series
    mask = A[feature] <= split
    B = A[mask]
    C = A[~mask]
    # mse = mystats.mse(A, y) - mystats.mse(B, y) - mystats.mse(C, y)
    mse = mystats.mse(B, y) + mystats.mse(C, y)
    print "delta MSE: " + str(mse)
    return mse
def linear_gd(df_train, df_test, Y):
    """ linear gradient descent """
    binary = utils.check_binary(df_train[Y])
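    # A sketch of the update gd.gradient is assumed to perform (batch gradient
    # descent on squared error): w <- w - alpha * X^T (X w - y),
    # here with alpha = 0.00001 and at most 50 iterations.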
    model = gd.gradient(df_train, df_train[Y], 0.00001, max_iterations=50)
    print model
    predict = gd.predict(df_train, model, binary)
    print predict
    error_train = mystats.get_error(predict, df_train[Y], binary)
    predict = gd.predict(df_test, model, binary)
    print predict
    error_test = mystats.get_error(predict, df_test[Y], binary)
    return [error_train, error_test]
def logistic_gd(df_train, df_test, Y):
    """ logistic gradient descent """
    binary = utils.check_binary(df_train[Y])
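    # A sketch of the update gd.logistic_gradient is assumed to perform (gradient
    # ascent on the log-likelihood): w <- w + alpha * X^T (y - sigmoid(X w)),
    # here with alpha = 0.1 and at most 5 iterations.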
    model = gd.logistic_gradient(df_train, df_train[Y], 0.1, max_iterations=5)
    print model
    predict = gd.predict(df_train, model, binary, True)
    print predict
    error_train = mystats.get_error(predict, df_train[Y], binary)
    predict = gd.predict(df_test, model, binary, True)
    print predict
    error_test = mystats.get_error(predict, df_test[Y], binary)
    return [error_train, error_test]
def q1():
    """GDA

    Run Gaussian Discriminant Analysis on the spambase data, using the k-folds
    from the previous problem (1 fold for testing, k-1 for training, for each fold).
    Since there are 57 real-valued features, each of the 2 Gaussians (one for the
    + class, one for the - class) has a mean vector with 57 components, and they
    have either a common (shared) 57x57 covariance matrix, estimated from all
    training data (both classes), or two separate 57x57 covariance matrices
    (estimated separately for each class). You can use a MATLAB, Python, or Java
    built-in function to estimate the covariance matrices, but the estimator is
    easy to code up.
    Looking at the training and testing performance, does it appear that the
    Gaussian assumption (normally distributed data) holds for this particular
    dataset?
    """

    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())  # returns an array of arrays - this is by row
    k = 10
    train_acc_sum = 0
    k_folds = hw3.partition_folds(spamData, k)
    gdas = []
    for ki in range(k - 1):
        subset = []
        gda = hw3.GDA()
        X, truth = hw3.separate_X_and_y(k_folds[ki])
        covariance_matrix = hw3.get_covar(X)
        gda.p_y = float(sum(truth)) / len(truth)
        gda.train(X, covariance_matrix, truth)
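        # NOTE: predictions below are made on the same fold the model was just
        # fit on, so the reported accuracy is training accuracy for that fold.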
        predictions = gda.predict(X)
        #print predictions
        accuracy = mystats.get_error(predictions, truth, True)
        #gdas.append(gda)
        print_output(ki, accuracy)
        #print gda.prob
        gdas.append(gda)
def testLogisticGradient():
    """ logistic gradient descent """
    df_test, df_train = utils.split_test_and_train(utils.load_and_normalize_spam_data())
    Y = 'is_spam'
    binary = utils.check_binary(df_train[Y])
    model = gd.logistic_gradient(df_train, df_train[Y], .1, max_iterations=5)
    #print model
    #raw_input()
    predict = gd.predict(df_train, model, binary, True)
    print predict
    error_train = mystats.get_error(predict, df_train[Y], binary)
    #raw_input()
    predict = gd.predict(df_test, model, binary, True)
    print predict
    error_test = mystats.get_error(predict, df_test[Y], binary)
    print 'error train {} error_test {}'.format(error_train, error_test)
    return [error_train, error_test]
def linear_gd_error(df, Y):
    binary = utils.check_binary(df[Y])
    model = gd.gradient(df, df[Y], 0.00001, max_iterations=50)
    print model
    predict = gd.predict(df, model, binary)
    print predict
    error = mystats.get_error(predict, df[Y], binary)
    return error
def linear_reg(df, Y, binary=False, ridge=False, sigmoid=False):
    means = []
    columns = [col for col in df.columns if (col != "is_spam" and col != "MEDV" and col != "y")]
    if ridge:
        w = mystats.get_linridge_w(df[columns], df[Y], binary)
    else:
        for col in df.columns:
            mean = df[col].mean()
            means.append(mean)
            df[col] -= mean

        w = mystats.get_linreg_w(df[columns], df[Y])

    print ("w:")
    print (w)
    predict = mystats.predict(df[columns], w, binary, means=means)
    error = mystats.get_error(predict, df[Y], binary)
    return error
def decision_housing_set_no_libs():
    """
    Solution for HW1 prob 1
    """
    print('Homework 1 problem 1 - No Libraries - Regression Decision tree')
    print('Housing Dataset')
    test, train = utils.load_and_normalize_housing_set()

    # The following 2 lines are for debugging
    #train = utils.train_subset(train, ['ZN','CRIM', 'TAX', 'DIS', 'MEDV'], n=50)
    #test = utils.train_subset(test, ['ZN', 'CRIM', 'TAX', 'DIS', 'MEDV'], n=3)

    print str(len(train)) + ' in training set <--> ' + str(len(test)) + ' in test set'
    node = mytree.Node(np.ones(len(train)))
    branch_node(node, train, 2, 'MEDV', regression=True)
    #node.show_children_tree()
    node.show_children_tree(follow=False)

    model = mytree.Tree(node)
    model.print_leaves()
    model.print_tree(train)
    print 'Trained model error is : ' + str(model.error())
    train_prediction = model.predict_obj()
    print 'Training MSE is: ' + str(mystats.compute_MSE_arrays(train_prediction, train['MEDV']))
    # sys.exit()  # uncomment to stop after the training-set evaluation

    node.presence = np.ones(len(test))
    test_node(node, test, 'MEDV', regression=True)
    test_tree = mytree.Tree(node)
    prediction = test_tree.predict_obj()
    #raw_input()
    print 'predict sum: ' + str(sum(prediction))
    test_tree.print_leaves_test()
    print 'ERROR: ' + str(test_tree.error_test())
    print prediction
    print 'train'
    print train['MEDV']
    print 'test'
    print test['MEDV']
    MSE = mystats.compute_MSE_arrays(prediction, test['MEDV'])
    print 'MSE: ' + str(MSE)
    print 'RMSE: ' + str(np.sqrt(MSE))

    test_tree.print_tree(test, long=False)
def regression_housing_set():
    """
    Solution for HW1 prob 1
    """
    print('Homework 1 problem 1 - Regression Decision tree')
    print('Housing Dataset')
    test, train = utils.load_and_normalize_housing_set()
    dt_reg = train_regression_tree(train)
    predicted = test_regression_tree(dt_reg, test)
    error = mystats.calculate_chisq_error(predicted, test['MEDV'])
    print 'Error: ' + str(error)
def debug_print(iters, nc, h, y):
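    """Print a short convergence report for the current iteration: the first
    few actual vs. predicted values, the mean absolute difference, and the MSE."""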
    diffs = 0
    error = mystats.get_error(h, y, 0)
    for i, actual in enumerate(y):
        diffs += abs(actual - h[i])
    distance = float(diffs)/len(h)
    print "actual"
    print y[:5]
    print "predicted"
    print h[:5]
    print 'loop: {} num not converged: {} distance: {} MSE: {}'.format(iters, nc, distance, error)
def find_best_label_new(df, col, y):
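    """
    Return (best split value, its MSE) for `col`: try each observed value as a
    threshold and score it by the combined MSE of the two partitions of `y`.
    """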
    # print df[col]
    values = set(df[col])
    mse = {}
    for v in values:
        mask = df[col] <= v
        if len(df[mask]) > 0 and len(df[mask]) < len(df):
            mse[v] = mystats.compute_combined_MSE(list(df[mask][y]), list(df[~mask][y]))
    # print 'MSE ' + str(mse)
    lkey = min(mse, key=lambda k: mse[k])
    return lkey, mse[lkey]
def k_folds_linear_gd(df_test, df_train, Y):
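    """
    k-fold linear gradient descent: partition df_test into k folds, update the
    model on each of the first k-1 folds (theta is chosen once, on the first
    fold), then evaluate on the held-out final fold.
    Returns [last training-fold error, test error].
    """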
    k = 10
    df_test = gd.pandas_to_data(df_test)
    k_folds = partition_folds(df_test, k)
    model = Model_w()
    theta = None
    for ki in range(k - 1):
        print "k fold is {}".format(k)
        data, truth = get_data_and_truth(k_folds[ki])
        binary = True
        model.update(gd.gradient(data, np.array(truth), 0.00001, max_iterations=5, binary=binary))
        print model.w
        if theta is None:
            theta, max_acc = get_best_theta(data, truth, model.w, binary, False)
        predict = gd.predict_data(data, model.w, binary, False, theta)
        error = mystats.get_error(predict, truth, binary)
        print "Error for fold {} is {} with theta =  {}".format(k, error, theta)
    test, truth = get_data_and_truth(k_folds[k - 1])
    predict = gd.predict_data(test, model.w, binary, False, theta)
    test_error = mystats.get_error(predict, truth, binary)
    return [error, test_error]
def decision_spambase_set():
    """
    Solution for HW1 prob 1
    """
    print('Homework 1 problem 1 - Regression Decision tree')
    print('Spambase Dataset')
    spam_data = utils.load_and_normalize_spam_data()
    test, train = utils.split_test_and_train(spam_data)
    print str(len(train)) + ' in training set <--> ' + str(len(test)) + ' in test set'
    dt = train_decision_tree(train)
    predicted = test_decision_tree(dt, test)
    #print predicted
    #print test['is_spam']
    error = mystats.calculate_binary_error(predicted, test['is_spam'])
    print 'Error: ' + str(error)
def get_best_theta(data, truth, model, binary, logistic):
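    """Sweep 100 candidate decision thresholds evenly across the range of model
    scores and return the (threshold, accuracy) pair with the highest accuracy."""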
    best_theta = None
    max_acc = 0
    modmin = min(model)
    modmax = max(model)
    for theta_i in range(100):
        # Sweep candidate thresholds evenly across [modmin, modmax].
        theta = modmin + (float(theta_i) / 100) * (modmax - modmin)
        predict = gd.predict_data(data, model, binary, False, theta)
        acc = mystats.get_error(predict, truth, binary)
        if best_theta is None:
            best_theta = theta
            max_acc = acc
        elif acc > max_acc:
            best_theta = theta
            max_acc = acc
    return best_theta, max_acc
def predict(self, X):
    df = pd.DataFrame(X)
    return mystats.predict(df, self.w, True, [])
def analyze_spambase_hw1():
    """ HW1 - problem 2 """
    spamData = utils.load_and_normalize_spam_data()
    mystats.k_folds(spamData, 10)
def test_column_product():
    v1 = [1, 2, 3, 4, 5]
    v2 = [2, 2, 2, 2, 2]
    truth = 30
    prod = mystats.column_product(v1, v2)
    assert_equal(truth, prod)
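# For reference, a minimal sketch of the contract the two tests above assume
# (hypothetical; the real implementations live in mystats):
#
#     def column_product(v1, v2):
#         # dot product of two equal-length vectors
#         return sum(a * b for a, b in zip(v1, v2))
#
#     def dot_product_sanity(data, multiplier):
#         # dot each column (dict value) with the multiplier vector
#         return [column_product(col, multiplier) for col in data.values()]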