def regression_line_housing_no_libs():
    """ Solution for HW1 prob 2 - regression line on the housing dataset.

    Loads the normalized housing test/train split, fits ``train['MEDV']``
    against every column except the last with
    ``mystats.linear_regression_points`` and prints the fit, the
    position-wise sums of the fit, the per-column MSE dictionary and the
    average MSE / RMSE.  Nothing is returned; all output goes to stdout.
    """
    print('Homework 1 problem 2 - No Libraries - Regression Line')
    print('Housing Dataset')
    test, train = utils.load_and_normalize_housing_set()
    print(str(len(train)) + " # in training set <--> # in test " + str(len(test)))

    # All columns but the last are used as features
    # (presumably the last column is the MEDV target - confirm).
    columns = train.columns[:-1]
    Y_fit = mystats.linear_regression_points(train[columns], train['MEDV'])
    print('Y_fit')
    print(Y_fit)

    # Accumulate the entries of Y_fit position by position.
    row_sums = np.zeros(len(Y_fit[0]))
    for col in Y_fit:
        for i in range(0, len(col)):
            row_sums[i] += col[i]
    print(row_sums)

    col_MSE = {}
    for i, col in enumerate(columns):
        # NOTE(review): an earlier variant used Y_fit[i] + Y_fit[-1] here.
        col_fit = row_sums[i]
        col_MSE[col] = mystats.compute_MSE(col_fit, train['MEDV'])
    print(col_MSE)

    # Materialize the values once: a list on Python 2 already, and on
    # Python 3 np.sqrt cannot take a dict view directly.
    mse_values = list(col_MSE.values())
    RMSE = np.sqrt(mse_values)
    average_MSE = utils.average(mse_values)
    average_RMSE = utils.average(RMSE)
    print('Average MSE: ' + str(average_MSE))
    print('Average RMSE: ' + str(average_RMSE))
def regression_line_spam_no_libs():
    """ Solution for HW1 prob 2 - regression line on the spam dataset.

    Loads and normalizes the spam data, splits it into test and train,
    fits ``train['is_spam']`` against every column except the last with
    ``mystats.linear_regression_points`` and prints the per-column MSE
    dictionary plus the average MSE / RMSE.  Nothing is returned; all
    output goes to stdout.
    """
    print('Homework 1 problem 2 - No Libraries - Regression Line')
    print('Spam Dataset')
    spam_data = utils.load_and_normalize_spam_data()
    test, train = utils.split_test_and_train(spam_data)

    # All columns but the last are used as features
    # (presumably the last column is the is_spam label - confirm).
    columns = train.columns[:-1]
    Y_fit = mystats.linear_regression_points(train[columns], train['is_spam'])

    col_MSE = {}
    for i, col in enumerate(columns):
        # Combine this column's entry of the fit with the final entry
        # (presumably the intercept term - confirm against mystats).
        col_fit = Y_fit[i] + Y_fit[-1]
        col_MSE[col] = mystats.compute_MSE_arrays(col_fit, train['is_spam'])
    print(col_MSE)

    # Materialize the values once: a list on Python 2 already, and on
    # Python 3 np.sqrt cannot take a dict view directly.
    mse_values = list(col_MSE.values())
    RMSE = np.sqrt(mse_values)
    average_MSE = utils.average(mse_values)
    average_RMSE = utils.average(RMSE)
    print('Average MSE: ' + str(average_MSE))
    print('Average RMSE: ' + str(average_RMSE))
def compute_combined_MSE(A, B):
    """ Sum of the MSE of A and the MSE of B, each around its own average.

    Returns 0 immediately when A is empty.  Otherwise each average is
    computed with ``utils.average``; an average that is exactly zero is
    nudged by 1e-9 before being handed to ``compute_MSE`` together with
    its values.  The two results are added and returned.
    """
    if len(A) == 0:
        return 0

    nudge = .000000001
    # Both averages are computed up front, before any MSE is evaluated.
    centers = [utils.average(A), utils.average(B)]
    centers = [c + nudge if c == 0 else c for c in centers]

    return 0 + compute_MSE(centers[0], A) + compute_MSE(centers[1], B)
def get_mus(arr):
    """ Return the average of each column of ``arr``.

    Expects an array of arrays.  The input is transposed with
    ``transpose_array`` so the averaging runs column by column, and every
    entry of the transposed result is averaged with ``utils.average``.
    The averages are returned as a list in column order.
    """
    by_column = transpose_array(arr)
    return [utils.average(by_column[idx]) for idx in range(len(by_column))]
def test_variance():
    """ Print a hand-computed variance next to ``np.var`` for comparison.

    The sample is the first element of ``get_test_data(8)``.  The mean of
    the squared deviations from ``utils.average(sample)`` is printed,
    followed by ``np.var(sample)``.  Nothing is asserted or returned.
    """
    sample = get_test_data(8)[0]
    mu = utils.average(sample)

    # Named ``squared_dev`` rather than ``sum`` so the builtin is not shadowed.
    squared_dev = 0
    for i in range(len(sample)):
        squared_dev += (sample[i] - mu)**2

    # Convert before dividing so Python 2 integer division cannot truncate
    # the result (same order of operations as ``mse``).
    print(float(squared_dev) / len(sample))
    print(np.var(sample))
def get_covar_X_Y(data, predict):
    """ Build a per-cell table of ``calc_covar`` results for data vs. predict.

    Data and predict are by rows: ``data[r]`` is one row of values and
    ``predict[r]`` is the value paired with that row.  For every cell
    ``data[r][c]`` the result of
    ``calc_covar(cell, average of column c, predict[r], average of predict)``
    is stored at the same position of the returned list of lists.
    """
    column_means = get_mus(data)
    target_mean = utils.average(predict)

    covar = []
    for r in range(len(data)):
        target = predict[r]
        covar.append([
            calc_covar(data[r][c], column_means[c], target, target_mean)
            for c in range(len(data[r]))
        ])
    return covar
def get_data_and_mus(spamData):
    """ Split a dataset into truth values and feature data, with averages.

    ``spamData`` is transposed; the last entry of the transpose is taken as
    the truth values and the remaining entries are transposed back to form
    the feature data (presumably ``spamData`` is a list of rows whose last
    column is the label - confirm against callers).

    Returns a tuple ``(truth_rows, data_rows, data_mus, y_mu)`` where
    ``data_mus`` is ``get_mus(data_rows)`` and ``y_mu`` is the average of
    the truth values.
    """
    # Transpose once and reuse the result instead of transposing the whole
    # dataset twice (assumes transpose_array has no side effects).
    transposed = transpose_array(spamData)
    truth_rows = transposed[-1]
    data_rows = transpose_array(transposed[:-1])

    data_mus = get_mus(data_rows)
    y_mu = utils.average(truth_rows)
    return truth_rows, data_rows, data_mus, y_mu
def mse(df, col):
    """ Mean squared deviation of ``df[col]`` from its own average.

    The average comes from ``utils.average``; the squared differences of
    every value from that average are summed and divided by the number of
    values.  The sum is converted to float before the division.
    """
    values = df[col]
    center = utils.average(values)
    squared_total = sum((v - center)**2 for v in values)
    return float(squared_total) / len(values)
def summary(array):
    """ Return ``[mean, variance]`` of ``array``.

    The mean is ``utils.average(array)`` and the variance is
    ``utils.variance(array, len(array))``.
    """
    # The second argument used to be len(d), an undefined name that raised
    # NameError; the length of the input itself is what is meant here.
    return [utils.average(array), utils.variance(array, len(array))]