def general_CFV(data, prediction_function, target_column, param, k=10):
    """
    :param data: data to perform CFV on
    :param prediction_function: maps (training data, testing data, param)
        to a prediction series for the testing data
    :param target_column: column of df that is being predicted
    :param param: parameter of the model being used
    :param k: k to be used in k fold CFV
    :return: CFV r squared
    """
    train_valid_splits = tools.train_valid_k_fold_sets(data, k)
    total_r_sq = 0
    # iterate through each (train, valid) tuple, finding r squared for each
    # and adding it to the running total
    for train_set, valid_set in train_valid_splits:
        # find predictions of valid_set based on train_set
        valid_set_predictions = prediction_function(train_set, valid_set, param)
        valid_set_r_sq = tools.r_squared(valid_set, target_column, valid_set_predictions)
        total_r_sq += valid_set_r_sq
    # average r squared over the different validation sets
    avg_r_sq = total_r_sq / len(train_valid_splits)
    return avg_r_sq
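# A minimal sketch of a prediction_function compatible with general_CFV,
# assuming a "price" target as in the other helpers in this file: ordinary
# least squares on a feature list passed through the `param` slot. The name
# linear_prediction_function is hypothetical, not part of the original code.
def linear_prediction_function(train_set, valid_set, features):
    # fit OLS with an intercept on the training fold via least squares
    X = np.column_stack([np.ones(len(train_set)), train_set[features].to_numpy()])
    y = train_set["price"].to_numpy()
    coef, *_ = np.linalg.lstsq(X, y, rcond=None)
    # predict the validation fold with the fitted coefficients
    X_v = np.column_stack([np.ones(len(valid_set)), valid_set[features].to_numpy()])
    return X_v @ coef

# example: general_CFV(train_data, linear_prediction_function, "price", features, k=10)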
def lasso_CFV(train, features, l1_pen, k):
    """k-fold cross-validated r squared for Lasso (L1-penalized) regression."""
    my_sets = tools.train_valid_k_fold_sets(train, k)
    total_r_sq = 0
    for t, v in my_sets:
        # Lasso applies an L1 penalty; note normalize= was removed in
        # scikit-learn 1.2, so scale features explicitly on newer versions
        r_model = Lasso(alpha=l1_pen, normalize=True, max_iter=MAX_ITERATIONS)
        r_model.fit(t[features], t["price"])
        r_sq = r_model.score(v[features], v["price"])
        total_r_sq += r_sq
    avg_r_sq = total_r_sq / k
    return avg_r_sq
def ridge_CFV(train, features, l2_pen, k):
    my_sets = tools.train_valid_k_fold_sets(train, k)
    total_r_sq = 0
    for t, v in my_sets:
        r_model = Ridge(alpha=l2_pen, normalize=True)
        r_model.fit(t[features], t["price"])
        r_sq = r_model.score(v[features], v["price"])
        total_r_sq += r_sq
    avg_r_sq = total_r_sq / k
    return avg_r_sq
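# lasso_CFV and ridge_CFV differ only in the estimator they construct, so
# both reduce to one shared loop. A hedged sketch of that refactor follows;
# the helper name sklearn_CFV and the estimator-factory argument are
# assumptions, not part of the original code.
def sklearn_CFV(make_model, train, features, k):
    my_sets = tools.train_valid_k_fold_sets(train, k)
    total_r_sq = 0
    for t, v in my_sets:
        model = make_model()
        model.fit(t[features], t["price"])
        total_r_sq += model.score(v[features], v["price"])
    return total_r_sq / k

# e.g. ridge_CFV(train, features, pen, k) is equivalent to:
# sklearn_CFV(lambda: Ridge(alpha=pen, normalize=True), train, features, k)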
def CFV_r_sq(data, lam, k=10):
    """
    k-fold (default k = 10) cross validation r squared of the data
    :param data: data to perform CFV on
    :param lam: model parameter passed through to data_predictions
    :param k: number of folds
    :return: average r squared over the k validation sets
    """
    my_sets = tools.train_valid_k_fold_sets(data, k)
    total_r_sq = 0
    for t, v in my_sets:
        v_predictions = data_predictions(t, v, lam)
        v_r_sq = tools.r_squared(v, "price", v_predictions)
        total_r_sq += v_r_sq
    avg_r_sq = total_r_sq / k
    return avg_r_sq
def cross_val_lambda(train, lam, k=10):
    """
    Cross validation r squared for Gaussian kernel regression
    :param train: data to perform CFV on
    :param lam: kernel parameter (lambda)
    :param k: number of folds
    :return: average r squared over the k validation sets
    """
    cross_valid_sets = tools.train_valid_k_fold_sets(train, k)
    total_r_sq = 0
    for t, v in cross_valid_sets:
        v_r_sq = gauss_r_sq(t, v, lam)
        total_r_sq += v_r_sq
    avg_r_sq = total_r_sq / k
    return avg_r_sq
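# Illustrative bandwidth search with cross_val_lambda, mirroring the Lasso
# penalty search below. The lambda grid is an assumption for illustration,
# not a value from the original code.
# lambdas = [0.01, 0.1, 1.0]
# best_lam = max(lambdas, key=lambda lam: cross_val_lambda(train_data, lam, k=10))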
numeric_features.remove("zipcode")
numeric_features.remove("long")
numeric_features.remove("lat")
# numeric_features.remove("yr_built")
features = numeric_features

"""
find the best l1 penalty for Lasso:
compute the cross-validated r squared for each candidate penalty,
then select the penalty that gives the highest r squared value
"""
# choose a value of k for k-fold cross validation
k = 50

# candidate L1 penalties (lasso_CFV builds its own k-fold splits internally)
l1_pens = [10, 40, 100]
best_l1_value = None
best_cross_r_sq = -np.inf
for l1_pen in l1_pens:
    cross_r_sq = lasso_CFV(train_data, features, l1_pen, k)
    print("For l1 penalty {}, r squared is {}".format(l1_pen, np.round(cross_r_sq, 5)))
    if cross_r_sq > best_cross_r_sq:
        best_cross_r_sq = cross_r_sq
        best_l1_value = l1_pen
print()
print("The best value for the l1 penalty is", best_l1_value)

# refit using the selected penalty
best_model = Lasso(alpha=best_l1_value, normalize=True, max_iter=MAX_ITERATIONS)
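# Hedged cross-check of the hand-rolled loop above: scikit-learn's own k-fold
# scorer should broadly agree with lasso_CFV for the same penalty (fold
# boundaries differ, so small discrepancies are expected). normalize= is
# omitted here because it was removed in scikit-learn 1.2; scale the features
# first if parity with normalize=True is needed.
from sklearn.model_selection import cross_val_score

sk_scores = cross_val_score(
    Lasso(alpha=best_l1_value, max_iter=MAX_ITERATIONS),
    train_data[features], train_data["price"],
    cv=k, scoring="r2",
)
print("sklearn cross-val r squared:", np.round(sk_scores.mean(), 5))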