Ejemplo n.º 1
0
def general_CFV(data, prediction_function, target_column, param, k=10):
    """
    Average the validation r squared of a model over k-fold splits.

    :param data: data handed to tools.train_valid_k_fold_sets for splitting
    :param prediction_function: callable invoked as
           prediction_function(train_set, valid_set, param); whatever it
           returns is passed to tools.r_squared as the predictions for the
           validation part
    :param target_column: column being predicted, forwarded to
           tools.r_squared
    :param param: model parameter forwarded unchanged to prediction_function
    :param k: number of folds requested from tools.train_valid_k_fold_sets
    :return: sum of the per-fold r squared values divided by the number of
             (train, valid) pairs returned by the splitting helper
    """
    folds = tools.train_valid_k_fold_sets(data, k)

    # collect one r squared per (train, valid) pair
    fold_scores = []
    for fit_part, held_out in folds:
        # predictions for the held-out part, produced from the fitting part
        held_out_predictions = prediction_function(fit_part, held_out, param)
        fold_scores.append(
            tools.r_squared(held_out, target_column, held_out_predictions)
        )

    # the divisor is the length of the helper's result, not k itself
    return sum(fold_scores) / len(folds)
Ejemplo n.º 2
0
def lasso_CFV(train, features, l2_pen, k, target_column="price"):
    """
    Average the validation score of a Lasso model over k-fold splits.

    :param train: data handed to tools.train_valid_k_fold_sets for splitting
    :param features: column selection used as the model inputs
    :param l2_pen: value passed to Lasso as alpha
           (NOTE(review): if this is scikit-learn's Lasso, alpha weights an
           L1 penalty despite this parameter's name)
    :param k: number of folds requested; also the divisor of the average
    :param target_column: column to predict; defaults to "price", the value
           that used to be hard-coded, so existing callers are unaffected
    :return: sum of the per-fold validation scores divided by k
    """
    fold_scores = []
    for fit_part, held_out in tools.train_valid_k_fold_sets(train, k):
        # a fresh estimator per fold so no fitted state carries over
        # NOTE(review): if this is scikit-learn's Lasso, `normalize` was
        # removed in release 1.2 - confirm the version this runs against
        model = Lasso(alpha=l2_pen, normalize=True, max_iter=MAX_ITERATIONS)
        model.fit(fit_part[features], fit_part[target_column])
        fold_scores.append(
            model.score(held_out[features], held_out[target_column])
        )

    # NOTE(review): dividing by k assumes the helper yields exactly k pairs
    return sum(fold_scores) / k
Ejemplo n.º 3
0
def ridge_CFV(train, features, l2_pen, k, target_column="price"):
    """
    Average the validation score of a Ridge model over k-fold splits.

    :param train: data handed to tools.train_valid_k_fold_sets for splitting
    :param features: column selection used as the model inputs
    :param l2_pen: value passed to Ridge as alpha
    :param k: number of folds requested; also the divisor of the average
    :param target_column: column to predict; defaults to "price", the value
           that used to be hard-coded, so existing callers are unaffected
    :return: sum of the per-fold validation scores divided by k
    """
    fold_scores = []
    for fit_part, held_out in tools.train_valid_k_fold_sets(train, k):
        # a fresh estimator per fold so no fitted state carries over
        # NOTE(review): if this is scikit-learn's Ridge, `normalize` was
        # removed in release 1.2 - confirm the version this runs against
        model = Ridge(alpha=l2_pen, normalize=True)
        model.fit(fit_part[features], fit_part[target_column])
        fold_scores.append(
            model.score(held_out[features], held_out[target_column])
        )

    # NOTE(review): dividing by k assumes the helper yields exactly k pairs
    return sum(fold_scores) / k
Ejemplo n.º 4
0
def CFV_r_sq(data, lam, k=10):
    """
    Average the validation r squared for "price" over k-fold splits.
    :param data: data handed to tools.train_valid_k_fold_sets for splitting
    :param lam: parameter forwarded unchanged to data_predictions
    :param k: number of folds requested (default 10); also the divisor of
              the average
    :return: sum of the per-fold r squared values divided by k
    """
    fold_scores = []
    for fit_part, held_out in tools.train_valid_k_fold_sets(data, k):
        # predictions for the held-out part, produced from the fitting part
        held_out_predictions = data_predictions(fit_part, held_out, lam)
        fold_scores.append(
            tools.r_squared(held_out, "price", held_out_predictions)
        )
    return sum(fold_scores) / k
Ejemplo n.º 5
0
def cross_val_lambda(train, lam, k=10):
    """
    Cross validation r squared for gaussian kernel regression.

    :param train: data handed to tools.train_valid_k_fold_sets for splitting
    :param lam: parameter forwarded unchanged to gauss_r_sq
    :param k: number of folds requested (default 10); also the divisor of
              the average
    :return: sum of the per-fold gauss_r_sq values divided by k
    """
    folds = tools.train_valid_k_fold_sets(train, k)
    # one gauss_r_sq value per (train, valid) pair, summed lazily
    summed_r_sq = sum(
        gauss_r_sq(fit_part, held_out, lam) for fit_part, held_out in folds
    )
    return summed_r_sq / k
Ejemplo n.º 6
0
# Drop the "zipcode", "long" and "lat" entries from the candidate features.
numeric_features.remove("zipcode")
numeric_features.remove("long")
numeric_features.remove("lat")
#numeric_features.remove("yr_built")

# NOTE: this binds a second name to the same object; it is not a copy.
features = numeric_features
"""
find the best l2 penalty
done by finding r squared values for different l2 penalties
then select the l2 penalty that gives the highest r squared value
"""

# choose a value of k for k-fold cross validation
k = 50
# (train, valid) pairs for k fold validation
# NOTE(review): my_sets is not read anywhere in the visible part of this
# script; lasso_CFV is given train_data and k directly - confirm whether
# this call is still needed
my_sets = tools.train_valid_k_fold_sets(train_data, k)
# find the r squared values for different l2 penalties
# then select the l2 penalty that gives the highest r squared value
l2_pens = [10, 40, 100]
best_l2_value = None
# start at -inf so the first finite score always becomes the current best
best_cross_r_sq = -np.inf
for l2_pen in l2_pens:
    cross_r_sq = lasso_CFV(train_data, features, l2_pen, k)
    print("For l2 penalty {}, r squared is {}".format(l2_pen,
                                                      np.round(cross_r_sq, 5)))
    # strict comparison: on a tie the earlier penalty in l2_pens is kept
    if cross_r_sq > best_cross_r_sq:
        best_cross_r_sq = cross_r_sq
        best_l2_value = l2_pen
print()
print("The best value for l2 is", best_l2_value)
# NOTE(review): alpha is hard-coded to 0.03 here rather than taken from
# best_l2_value selected above - confirm this override is intended
best_model = Lasso(alpha=0.03, normalize=True, max_iter=MAX_ITERATIONS)