def regression_line_housing_no_libs():
    """Run the no-library regression on the housing data (HW1 prob 2).

    Loads the normalized housing split through ``utils``, fits every column
    except the last against ``MEDV`` with
    ``mystats.linear_regression_points``, and prints the fit, the summed
    fit values, the per-column MSE and the average MSE / RMSE computed on
    the training split. Nothing is returned; all output goes to stdout.
    """
    print('Homework 1 problem 2 - No Libraries - Regression Line')
    print('Housing Dataset')
    # The loader's result is unpacked as (test, train), in that order.
    test, train = utils.load_and_normalize_housing_set()
    print(str(len(train)) + " # in training set <--> # in test " + str(len(test)))
    columns = train.columns[:-1]
    Y_fit = mystats.linear_regression_points(train[columns], train['MEDV'])
    print('Y_fit')
    print(Y_fit)

    # Accumulate the entries of Y_fit position by position: row_sums[i] is
    # the sum of element i over every item of Y_fit.
    row_sums = np.zeros(len(Y_fit[0]))
    for col in Y_fit:
        for i in range(0, len(col)):
            row_sums[i] += col[i]

    print(row_sums)

    col_MSE = {}
    for i, col in enumerate(columns):
        # NOTE(review): an earlier variant used Y_fit[i] + Y_fit[-1] here;
        # the summed value is what is currently scored -- confirm intent.
        col_fit = row_sums[i]
        col_MSE[col] = mystats.compute_MSE(col_fit, train['MEDV'])
    print(col_MSE)
    # Materialize the values once so numpy receives a real sequence on both
    # Python 2 (list) and Python 3 (dict view).
    mse_values = list(col_MSE.values())
    RMSE = np.sqrt(mse_values)
    average_MSE = utils.average(mse_values)
    average_RMSE = utils.average(RMSE)
    print('Average MSE: ' + str(average_MSE))
    print('Average RMSE: ' + str(average_RMSE))
def regression_line_spam_no_libs():
    """Run the no-library regression on the spam data (HW1 prob 2).

    Loads and normalizes the spam data through ``utils``, splits it, fits
    every column except the last against ``is_spam`` with
    ``mystats.linear_regression_points``, and prints the per-column MSE
    together with the average MSE / RMSE on the training split. Nothing is
    returned; all output goes to stdout.
    """
    print('Homework 1 problem 2 - No Libraries - Regression Line')
    print('Spam Dataset')
    spam_data = utils.load_and_normalize_spam_data()
    # The split is unpacked as (test, train); only the training part is used.
    _, train = utils.split_test_and_train(spam_data)
    columns = train.columns[:-1]
    Y_fit = mystats.linear_regression_points(train[columns], train['is_spam'])

    col_MSE = {}
    for i, col in enumerate(columns):
        # Each column's fit is combined with the last entry of Y_fit
        # (presumably the intercept term -- confirm against mystats).
        col_fit = Y_fit[i] + Y_fit[-1]
        col_MSE[col] = mystats.compute_MSE_arrays(col_fit, train['is_spam'])
    print(col_MSE)
    # Materialize the values once so numpy receives a real sequence on both
    # Python 2 (list) and Python 3 (dict view).
    mse_values = list(col_MSE.values())
    RMSE = np.sqrt(mse_values)
    average_MSE = utils.average(mse_values)
    average_RMSE = utils.average(RMSE)
    print('Average MSE: ' + str(average_MSE))
    print('Average RMSE: ' + str(average_RMSE))
def compute_combined_MSE(A, B):
    """Add up the MSE of A and the MSE of B, each about its own average.

    Returns 0 immediately when A is empty (B is not checked). An average
    that is exactly zero is shifted to 1e-9 before it is passed to
    ``compute_MSE``.
    """
    if not len(A):
        return 0

    mean_a = utils.average(A)
    mean_b = utils.average(B)
    # Nudge an exactly-zero average off zero; presumably compute_MSE
    # special-cases or mishandles 0 -- confirm against its definition.
    if mean_a == 0:
        mean_a += 1e-9
    if mean_b == 0:
        mean_b += 1e-9

    combined = 0
    for center, values in ((mean_a, A), (mean_b, B)):
        combined += compute_MSE(center, values)
    return combined
def get_mus(arr):
    """Average every entry of the transposed input.

    Takes an array of arrays, runs it through ``transpose_array`` so the
    averages are taken column-wise, and returns them as a list in order.
    """
    by_column = transpose_array(arr)
    return [utils.average(by_column[idx]) for idx in range(len(by_column))]
def test_variance():
    """Print a hand-computed population variance next to ``np.var``.

    Takes the first item returned by ``get_test_data(8)``, computes the mean
    squared deviation from its average by hand, and prints it followed by
    numpy's result so the two can be compared by eye.
    """
    a = get_test_data(8)
    arr = a[0]
    mu = utils.average(arr)
    squared_dev = 0  # avoids shadowing the builtin ``sum``
    for i in range(len(arr)):
        squared_dev += (arr[i] - mu)**2
    # Convert before dividing: on Python 2 an int / int division would
    # truncate the variance before float() ever saw it.
    print(float(squared_dev) / len(arr))

    print(np.var(arr))
def get_covar_X_Y(data, predict):
    """Build a per-row, per-feature grid of ``calc_covar`` results.

    ``data`` and ``predict`` are both indexed by row. For each row, every
    feature value is passed to ``calc_covar`` along with that feature's
    average, the row's ``predict`` value and the average of ``predict``.
    Returns a list of lists with the same layout as ``data``.
    """
    feature_means = get_mus(data)
    target_mean = utils.average(predict)
    grid = []
    for r in range(len(data)):
        target = predict[r]
        features = data[r]
        grid.append([
            calc_covar(features[c], feature_means[c], target, target_mean)
            for c in range(len(features))
        ])
    return grid
def get_data_and_mus(spamData):
    """Split ``spamData`` into truth and feature data and average both.

    Assuming ``transpose_array`` swaps rows and columns, the last column of
    ``spamData`` is taken as the truth values and the remaining columns are
    transposed back into row-oriented feature data.

    Returns a 4-tuple ``(truth_rows, data_rows, data_mus, y_mu)`` where
    ``data_mus`` comes from ``get_mus(data_rows)`` and ``y_mu`` is
    ``utils.average(truth_rows)``.
    """
    # Transpose once and reuse the result for both the truth and the data.
    columns = transpose_array(spamData)
    truth_rows = columns[-1]
    data_rows = transpose_array(columns[:-1])
    data_mus = get_mus(data_rows)
    y_mu = utils.average(truth_rows)
    return truth_rows, data_rows, data_mus, y_mu
def mse(df, col):
    """Return the mean squared deviation of ``df[col]`` from its average."""
    values = df[col]
    center = utils.average(values)
    squared_total = sum((v - center) ** 2 for v in values)
    return float(squared_total) / len(values)
def summary(array):
    """Return ``[mean, variance]`` for *array*.

    The mean comes from ``utils.average`` and the variance from
    ``utils.variance``, which is given the array and its length
    (presumably the sample count -- confirm against ``utils.variance``).
    """
    # The length must come from ``array`` itself; the previous ``len(d)``
    # referred to a name that is not defined in this function.
    return [utils.average(array), utils.variance(array, len(array))]