コード例 #1
0
def branch_node(node, df, threshold, Y, regression=False):
    """
    :param node: Node object defined in Stats
    :param df: The dataframe being used by the tree
    :param threshold: max branching depth
    :param Y: Feature to predict
    :return: void
    """
    print 'Branching Level : ' + str(node.level)
    data = node.get_node_data(df)
    print 'Length of data ' + str(len(data)) + ' len df: ' + str(len(df))
    feature, label = mytree.find_best_feature_and_label_for_split(data, Y, regression)
    print 'feature: {} label: {}'.format(feature, label)
    if feature is not None and node.level < threshold:
        A_array, B_array = node.split(feature, df[feature], label)
        print ' A : {} B: {}'.format(sum(A_array), sum(B_array))
        node.add_left(A_array)
        node.add_right(B_array)
        branch_node(node.left, df, threshold, Y, regression)
        branch_node(node.right, df, threshold, Y, regression)
    else:
        if not regression:
            predict = 0
            prob = mystats.binary_probability(data, Y)
            print 'PROBABILITY ' + str(prob)
            if prob >= .5:
                predict = 1
            error = mystats.binary_error(data, Y, predict)
        else:
            print str(feature) +'is fueaturea ' + str(label) + str(node.presence)
            predict = float(sum(data[Y]))/len(data[Y])
            error = mystats.compute_MSE(predict, list(data[Y]))
        node.leaf(predict, error)
コード例 #2
0
def test_node(node, df, Y, regression=False):
    """
    :param node: Node object defined in Stats
    :param df: The dataframe being used by the tree
    :param Y: Feature to predict
    :return: void
    """
    print 'Testing Branching Level : ' + str(node.level)
    data = node.get_node_data(df)
    print 'Length of TEST data ' + str(len(data)) + ' len df: ' + str(len(df))
    feature = node.label['feature']
    label = node.label['criteria']
    if feature is not '':
        print 'feature ' + feature
        #print df[feature]
        A_array, B_array = node.split(feature, df[feature], label)
        print 'Test A : {} B: {}'.format(sum(A_array), sum(B_array))
        node.left.set_presence(A_array)
        node.right.set_presence(B_array)
        if node.left is not None:
            test_node(node.left, df, Y, regression)
        if node.right is not None:
            test_node(node.right, df, Y, regression)
    else:
        predict = node.predict
        if not regression:
            error = mystats.binary_error(data, Y, predict)
        else:
            error = mystats.compute_MSE(predict, list(data[Y]))
        node.test_leaf(error)
コード例 #3
0
def regression_line_housing_no_libs():
    """
    Solution for HW1 prob 2
    """
    print('Homework 1 problem 2 - No Libraries - Regression Line')
    print('Housing Dataset')
    test, train = utils.load_and_normalize_housing_set()
    print str(len(train)) + " # in training set <--> # in test " + str(len(test))
    columns = train.columns[:-1]
    Y_fit = mystats.linear_regression_points(train[columns], train['MEDV'])
    print 'Y_fit'
    print Y_fit
    #for i in range(0, len(Y_fit)):
    #    print str(Y_fit[i]) + ' -- ' + str(train['MEDV'][i])

    row_sums = np.zeros(len(Y_fit[0]))
    for col in Y_fit:
        for i in range(0, len(col)):
            row_sums[i] += col[i]

    print row_sums

    col_MSE = {}
    for i, col in enumerate(columns):
        col_fit = row_sums[i]  # Y_fit[i] + Y_fit[-1]
        col_MSE[col] = mystats.compute_MSE(col_fit, train['MEDV'])
    print col_MSE
    RMSE = np.sqrt(col_MSE.values())
    average_MSE = utils.average(col_MSE.values())
    average_RMSE = utils.average(RMSE)
    print 'Average MSE: ' + str(average_MSE)
    print 'Average RMSE: ' + str(average_RMSE)