def k_fold_cross_validation(k, l2_pena, data, output_name, features_list, verbose=False):
    degree = 15
    rss_sum = 0.
    for i in range(0, k):
        validation = extract_segment(data, k, i)
        training = extract_train(data, k, i)
        poly_data = polynomial_sframe(training['sqft_living'], degree)
        my_features = poly_data.column_names()
        # print(my_features)
        poly_data['price'] = training['price']
        model = graphlab.linear_regression.create(poly_data, target='price',
                                                  features=my_features,
                                                  l2_penalty=l2_pena,
                                                  validation_set=None,
                                                  verbose=False)
        # model.get("coefficients").print_rows(num_rows=20)
        # Evaluate on the held-out validation segment.
        poly_validation = polynomial_sframe(validation[features_list[0]], degree)
        rss = get_residual_sum_of_squares(model, poly_validation, validation[output_name])
        rss_sum += rss
        print("  Segment %d of %d: l2_pena = %f, avg[train X1 = %f, train Y = %f, validation X1 = %f], RSS = %f"
              % (i, k, l2_pena, training['sqft_living'].mean(), training['price'].mean(),
                 validation['sqft_living'].mean(), rss))
    print("%d-fold CV, Avg. RSS = %f, L2 penalty = %f" % (k, (rss_sum / k), l2_pena))
    # Return the average validation RSS so callers can compare penalties.
    return rss_sum / k
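# extract_segment and extract_train are called above but not defined in this file;
# they presumably live in the same local `regression` module. A minimal sketch of
# what they are assumed to do (for reference only): contiguous folds with a
# half-open [start, end) boundary convention, which is an assumption, not the
# author's exact code.

def extract_segment(data, k, i):
    """Sketch: return the i-th of k contiguous validation segments of `data`."""
    n = len(data)
    start = (n * i) // k
    end = (n * (i + 1)) // k
    return data[start:end]


def extract_train(data, k, i):
    """Sketch: return all rows of `data` except the i-th validation segment."""
    n = len(data)
    start = (n * i) // k
    end = (n * (i + 1)) // k
    return data[0:start].append(data[end:n])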
import graphlab
from regression import polynomial_sframe
from regression import get_residual_sum_of_squares

sales = graphlab.SFrame('kc_house_data.gl/')
sales = sales.sort(['sqft_living', 'price'])

# Split into training, validation and testing sets.
training_and_validation, testing = sales.random_split(.9, seed=1)
training, validation = training_and_validation.random_split(.5, seed=1)
print(len(sales))
print(len(training))
print(len(validation))
print(len(testing))

report = ""  # accumulates one "Degree d, RSS r" line per fitted degree
lowest_rss = None
lowest_degree = None
lowest_model = None
for degree in range(1, 15 + 1):
    print("----------------------------------------------")
    print("Estimating for Degree %d" % degree)
    print("----------------------------------------------")
    poly_data = polynomial_sframe(training['sqft_living'], degree)
    my_features = poly_data.column_names()
    print(my_features)
    poly_data['price'] = training['price']
    model = graphlab.linear_regression.create(poly_data, target='price',
                                              features=my_features,
                                              validation_set=None)
    model.get("coefficients").print_rows(num_rows=20)
    # Score the fitted model on the validation set.
    poly_validation = polynomial_sframe(validation['sqft_living'], degree)
    rss = get_residual_sum_of_squares(model, poly_validation, validation['price'])
    if lowest_rss is None or lowest_rss > rss:
        lowest_rss = rss
        lowest_degree = degree
        lowest_model = model
    report = report + "Degree %d, RSS %f\n" % (degree, rss)
print(report)
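# get_residual_sum_of_squares is imported from the local `regression` module and
# its body is not shown in this file. A minimal sketch of the assumed
# implementation (for reference only; the scripts above import the real one):
# the standard RSS, i.e. predict, take residuals, square, sum.

def get_residual_sum_of_squares(model, data, outcome):
    """Sketch: residual sum of squares of `model` on `data` against `outcome`."""
    predictions = model.predict(data)
    residuals = outcome - predictions
    return (residuals * residuals).sum()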
# ----------------------------------------------
# Polynomial regression, revisited
# ----------------------------------------------
print("*** Polynomial regression, revisited")
sales = graphlab.SFrame('kc_house_data.gl/')
sales = sales.sort(['sqft_living', 'price'])

# Split the data set into training and testing
# (the 3-way training/validation/testing split is left commented out).
#training_and_validation, testing = sales.random_split(.9, seed=1)
#training, validation = training_and_validation.random_split(.5, seed=1)
training, testing = sales.random_split(.9, seed=1)

l2_small_penalty = 1e-5
degree = 15
poly_data = polynomial_sframe(training['sqft_living'], degree)
my_features = poly_data.column_names()
print(my_features)
poly_data['price'] = training['price']
model = graphlab.linear_regression.create(poly_data, target='price',
                                          features=my_features,
                                          l2_penalty=l2_small_penalty,
                                          validation_set=None)
model.get("coefficients").print_rows(num_rows=20)

# ----------------------------------------------
# Observe overfitting
# ----------------------------------------------
print("*** Observe overfitting")
#%matplotlib inline
import graphlab
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
from regression import polynomial_sframe
from regression import polynomial_fit
from regression import get_residual_sum_of_squares as rss

sales = graphlab.SFrame('kc_house_data.gl/')
sales = sales.sort(['sqft_living', 'price'])

# Model 1: 1 feature & degree 1
poly1_data = polynomial_sframe(sales['sqft_living'], 1)
poly1_data['price'] = sales['price']  # add price to the data since it's the target
model1 = graphlab.linear_regression.create(poly1_data, target='price',
                                           features=['power_1'],
                                           validation_set=None)
# let's take a look at the weights before we plot
print "Coefficients of the degree-1 model:"
print model1.get("coefficients")
plt.plot(poly1_data['power_1'], poly1_data['price'], '.',
         poly1_data['power_1'], model1.predict(poly1_data), '-')

# Model 2: 1 feature & degree 2
poly2_data = polynomial_sframe(sales['sqft_living'], 2)
# source ~/test27/bin/activate
from regression import polynomial_sframe
import graphlab as gl
import matplotlib.pyplot as plt

sales = gl.SFrame('kc_house_data.gl/')
sales = sales.sort(['sqft_living', 'price'])

print "******** Small Penalty **********"
L2_small_penalty = 1e-5
poldata = polynomial_sframe(sales['sqft_living'], 15)
poldata['price'] = sales['price']
model_0 = gl.linear_regression.create(poldata, target='price',
                                      l2_penalty=L2_small_penalty,
                                      validation_set=None)

print "Q1: What is the value of the coefficient of power_1?"
print model_0.get("coefficients")  # power_1: 103.09
print "power_1 should be 103.09."

(semi_split1, semi_split2) = sales.random_split(.5, seed=0)
(set_1, set_2) = semi_split1.random_split(0.5, seed=0)
(set_3, set_4) = semi_split2.random_split(0.5, seed=0)
import numpy as np
import graphlab as gl
from regression import slice_data
from regression import k_fold_cross_validation
from regression import polynomial_sframe

print "*** Selecting an L2 penalty via cross-validation"
sales = gl.SFrame('kc_house_data.gl/')
sales = sales.sort(['sqft_living', 'price'])

# Split the data set into training+validation and testing.
(train_valid, test) = sales.random_split(.9, seed=1)
train_valid_shuffled = gl.toolkits.cross_validation.shuffle(train_valid, random_seed=1)

n = len(train_valid_shuffled)
k = 10  # 10-fold cross-validation

# Sanity check: the fourth validation segment (fold index i = 3).
(start, end) = slice_data(n, k, 3)
validation4 = train_valid_shuffled[start:end]
print "Test data slice. Answer should be 536234: ", int(round(validation4['price'].mean(), 0))

feature = polynomial_sframe(train_valid_shuffled['sqft_living'], 15)

for l2 in np.logspace(1, 7, num=13):
    rss = k_fold_cross_validation(10, l2, train_valid_shuffled, 'price', ['sqft_living'])
    print "For L2_penalty =", l2, ", validation error =", rss
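# slice_data is imported from the local `regression` module and not shown here.
# A minimal sketch of the assumed implementation (for reference only; the script
# above imports the real one): fold index i is assumed 0-based and the returned
# bounds half-open, so data[start:end] is validation fold i of k over n rows.

def slice_data(n, k, i):
    """Sketch: (start, end) row bounds of validation fold i out of k for n rows."""
    start = (n * i) // k
    end = (n * (i + 1)) // k
    return (start, end)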
import graphlab
from regression import polynomial_sframe

tmp = graphlab.SArray([1., 2., 3.])
tmp_cubed = tmp.apply(lambda x: x**3)
print tmp
print tmp_cubed

ex_sframe = graphlab.SFrame()
ex_sframe['power_1'] = tmp
print ex_sframe

# -----------------------------------------
# Polynomial_sframe function
# -----------------------------------------
print("*** Polynomial_sframe function")
print polynomial_sframe(tmp, 3)

# -----------------------------------------
# Visualizing polynomial regression
# -----------------------------------------
print("*** Visualizing polynomial regression")
sales = graphlab.SFrame('kc_house_data.gl/')
sales = sales.sort(['sqft_living', 'price'])
poly1_data = polynomial_sframe(sales['sqft_living'], 1)
poly1_data['price'] = sales['price']  # add price to the data since it's the target
model1 = graphlab.linear_regression.create(poly1_data, target='price',
                                           features=['power_1'],
                                           validation_set=None)
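# polynomial_sframe is imported from the local `regression` module; its body is
# not shown in this file. A minimal sketch of the assumed implementation (for
# reference only; the scripts above import the real one): one column per power
# of the input feature, built with the same apply() pattern as tmp_cubed above.

def polynomial_sframe(feature, degree):
    """Sketch: SFrame with columns power_1 .. power_degree of `feature`."""
    poly_sframe = graphlab.SFrame()
    poly_sframe['power_1'] = feature
    for power in range(2, degree + 1):
        poly_sframe['power_' + str(power)] = feature.apply(lambda x: x ** power)
    return poly_sframe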