Example no. 1
def k_fold_cross_validation(k, l2_pena, data, output_name, features_list, verbose=False):
    """Fit a degree-15 polynomial ridge model on each of the k training folds
    and report the average validation RSS for the given L2 penalty."""
    degree = 15
    rss_sum = 0.
    for i in range(0, k):
        validation = extract_segment(data, k, i)
        training = extract_train(data, k, i)

        poly_data = polynomial_sframe(training['sqft_living'], degree)
        my_features = poly_data.column_names()
#        print(my_features)
        poly_data['price'] = training['price']
        model = graphlab.linear_regression.create(poly_data,
                                                  target = 'price',
                                                  features = my_features,
                                                  l2_penalty = l2_pena,
                                                  validation_set = None,
                                                  verbose = False)
#        model.get("coefficients").print_rows(num_rows=20)

        # validation
        poly_validation = polynomial_sframe(validation[features_list[0]], degree)
        rss = get_residual_sum_of_squares(model, poly_validation, validation[output_name])

        rss_sum += rss
        print("  Segment %d of %d: l2_pena = %f, avg[train X1 = %f, train Y = %f, validation X1 = %f], RSS = %f" % (i, k, l2_pena, training['sqft_living'].mean(), training['price'].mean(), validation['sqft_living'].mean(), rss))
    print("%d-folding, Avg. RSS = %f, L2 penalty = %f" % (k, (rss_sum/k), l2_pena))
Example no. 3
#%matplotlib inline

import graphlab
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
from regression import polynomial_sframe
from regression import polynomial_fit
from regression import get_residual_sum_of_squares as rss

sales = graphlab.SFrame('kc_house_data.gl/')

sales = sales.sort(['sqft_living', 'price'])

# Model 1: 1 feature & degree 1
poly1_data = polynomial_sframe(sales['sqft_living'], 1)

poly1_data['price'] = sales[
    'price']  # add price to the data since it's the target

model1 = graphlab.linear_regression.create(poly1_data,
                                           target='price',
                                           features=['power_1'],
                                           validation_set=None)

#let's take a look at the weights before we plot
print "Coefficients of model with 1 degree:"

model1.get("coefficients")

plt.plot(poly1_data['power_1'], poly1_data['price'], '.',
         poly1_data['power_1'], model1.predict(poly1_data), '-')
Example no. 4
training_and_validation,testing = sales.random_split(.9,seed=1)
training,validation = training_and_validation.random_split(.5,seed=1)
print(len(sales))
print(len(training))
print(len(validation))
print(len(testing))

foo = ""
lowest_rss = None
lowest_degree = None
lowest_model = None
for degree in range(1, 15+1):
    print("----------------------------------------------")
    print("Estimating for Degree %d" % degree)
    print("----------------------------------------------")
    poly_data = polynomial_sframe(training['sqft_living'], degree)
    my_features = poly_data.column_names()
    print(my_features)
    poly_data['price'] = training['price']
    model = graphlab.linear_regression.create(poly_data, target = 'price', features = my_features, validation_set = None)
    model.get("coefficients").print_rows(num_rows=20)

    poly_validation = polynomial_sframe(validation['sqft_living'], degree)
    rss = get_residual_sum_of_squares(model, poly_validation, validation['price'])
    if lowest_rss is None or lowest_rss > rss:
        lowest_rss = rss
        lowest_degree = degree
        lowest_model = model
    rss_summary += "Degree %d, RSS %f\n" % (degree, rss)

print(rss_summary)
print("Lowest validation RSS at degree %d" % lowest_degree)
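get_residual_sum_of_squares is imported from the project's regression module and not shown in these examples. A minimal sketch consistent with how it is called here (model, data, true outcome), offered as an assumption rather than the module's actual code:

def get_residual_sum_of_squares(model, data, outcome):
    # predict on the given data, then sum the squared residuals
    predictions = model.predict(data)
    residuals = outcome - predictions
    return (residuals * residuals).sum()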
Example no. 5
# ----------------------------------------------
# Polynomial regression, revisited
# ----------------------------------------------
print("*** Polynomial regression, revisited")
sales = graphlab.SFrame('kc_house_data.gl/')
sales = sales.sort(['sqft_living', 'price'])

# split the data set into training and testing (the train/validation split is kept commented out for reference).
#training_and_validation,testing = sales.random_split(.9,seed=1)
#training,validation = training_and_validation.random_split(.5,seed=1)
training, testing = sales.random_split(.9, seed=1)

l2_small_penalty = 1e-5

degree = 15
poly_data = polynomial_sframe(training['sqft_living'], degree)
my_features = poly_data.column_names()
print(my_features)
poly_data['price'] = training['price']
model = graphlab.linear_regression.create(poly_data,
                                          target='price',
                                          features=my_features,
                                          l2_penalty=l2_small_penalty,
                                          validation_set=None)
model.get("coefficients").print_rows(num_rows=20)

# ----------------------------------------------
# Observe overfitting
# ----------------------------------------------
print("*** Observe overfitting")
Example no. 7
#%matplotlib inline

import graphlab
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
from regression import polynomial_sframe
from regression import polynomial_fit
from regression import get_residual_sum_of_squares as rss

sales = graphlab.SFrame('kc_house_data.gl/')

sales = sales.sort(['sqft_living', 'price'])

# Model 1: 1 feature & degree 1
poly1_data = polynomial_sframe(sales['sqft_living'], 1)

poly1_data['price'] = sales['price'] # add price to the data since it's the target

model1 = graphlab.linear_regression.create(poly1_data, target = 'price', features = ['power_1'], validation_set = None)

#let's take a look at the weights before we plot
print "Coefficients of model with 1 degree:"

model1.get("coefficients")

plt.plot(poly1_data['power_1'],poly1_data['price'],'.',
        poly1_data['power_1'], model1.predict(poly1_data),'-')

# Model 2: 1 feature & degree 2
poly2_data = polynomial_sframe(sales['sqft_living'], 2)
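The snippet stops right after building poly2_data. A sketch of how Model 2 would be completed, mirroring the Model 1 steps above; these lines are an assumption, not part of the original example.

# Assumed continuation of Model 2, following the same pattern as Model 1.
my_features2 = poly2_data.column_names()   # ['power_1', 'power_2']
poly2_data['price'] = sales['price']
model2 = graphlab.linear_regression.create(poly2_data,
                                           target='price',
                                           features=my_features2,
                                           validation_set=None)
print "Coefficients of the degree-2 model:"
print model2.get("coefficients")
plt.plot(poly2_data['power_1'], poly2_data['price'], '.',
         poly2_data['power_1'], model2.predict(poly2_data), '-')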
Example no. 8
#  source ~/test27/bin/activate

from regression import polynomial_sframe
import graphlab as gl
import matplotlib.pyplot as plt

sales = gl.SFrame('kc_house_data.gl/')

sales = sales.sort(['sqft_living', 'price'])

print "********Small Penalty**********"

L2_small_penalty = 1e-5

poldata = polynomial_sframe(sales['sqft_living'], 15)

poldata['price'] = sales['price']

model_0 = gl.linear_regression.create(poldata, target='price',
                                        l2_penalty= L2_small_penalty, validation_set=None)
print "Q1: What is the value of coefficient of Power_1?"

print model_0.get("coefficients")
# power_1: 103.09

print "power_1 should be 103.09."

(semi_split1, semi_split2) = sales.random_split(.5, seed=0)
(set_1, set_2) = semi_split1.random_split(0.5, seed=0)
(set_3, set_4) = semi_split2.random_split(0.5, seed=0)
Example no. 9
import numpy as np
from regression import slice_data
from regression import k_fold_cross_validation
from regression import polynomial_sframe

print "*** Selecting an L2 penalty via cross-validation"

sales = gl.SFrame('kc_house_data.gl/')

sales = sales.sort(['sqft_living','price'])

# split the data set into training, validation and testing.
(train_valid, test) = sales.random_split(.9, seed=1)
train_valid_shuffled = gl.toolkits.cross_validation.shuffle(train_valid, random_seed=1)

n = len(train_valid_shuffled)
k = 10 # 10-fold cross-validation

# i is 0-based: i = 3 selects the 4th of the k segments
(start, end) = slice_data(n, k, 3)

validation4 = train_valid_shuffled[start:end]

print "Test data slice. Answer should be 536234: ", int(round(validation4['price'].mean(), 0))

feature = polynomial_sframe(train_valid_shuffled['sqft_living'],15)

for l2 in np.logspace(1, 7, num=13):
    rss = k_fold_cross_validation(10, l2, train_valid_shuffled,'price',['sqft_living'])
    print "For L2_penalty is ", l2, ", validation error is", rss
Example no. 10
tmp = graphlab.SArray([1., 2., 3.])
tmp_cubed = tmp.apply(lambda x: x**3)
print tmp
print tmp_cubed

ex_sframe = graphlab.SFrame()
ex_sframe['power_1'] = tmp
print ex_sframe

# -----------------------------------------
# Polynomial_sframe function
# -----------------------------------------
print("*** Polynomial_sframe function")

print polynomial_sframe(tmp, 3)

# -----------------------------------------
# Visualizing polynomial regression
# -----------------------------------------
print("*** Visualizing polynomial regression")

sales = graphlab.SFrame('kc_house_data.gl/')

sales = sales.sort(['sqft_living', 'price'])

poly1_data = polynomial_sframe(sales['sqft_living'], 1)
poly1_data['price'] = sales['price'] # add price to the data since it's the target

model1 = graphlab.linear_regression.create(poly1_data, target = 'price', features = ['power_1'], validation_set = None)
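polynomial_sframe itself is imported from the project's regression module in the other examples and never listed on this page. A minimal sketch consistent with how it is used (a column power_k holding the input feature raised to the k-th power, for k = 1 ... degree); the body is an assumption, only the interface comes from the examples above.

def polynomial_sframe(feature, degree):
    # feature: SArray; degree: int >= 1
    poly_sframe = graphlab.SFrame()
    poly_sframe['power_1'] = feature
    for power in range(2, degree + 1):
        # each extra column is the feature raised to the given power
        poly_sframe['power_' + str(power)] = feature.apply(lambda x: x ** power)
    return poly_sframe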