Ejemplo n.º 1
0
def select_polynomial_degree(train_data, val_data, list_of_degrees=None, sales_col='sqft_living', target='price'):
	"""Fit one polynomial regression per degree and select a degree using validation RSS.

	For each degree a polynomial feature frame is built from
	``train_data[sales_col]`` with ``reg.polynomial_sframe``, a model is fitted
	with ``gp.create_linear_regression``, and its residual sum of squares is
	computed on the equivalent polynomial frame built from ``val_data``.

	Args:
		train_data: table indexable by column name, used to fit the models.
		val_data: table indexable by column name, used to score the models.
		list_of_degrees: iterable of polynomial degrees to try. Defaults to
			1..15 (the range that used to be hard-coded).
		sales_col: name of the input column expanded into polynomial features.
		target: name of the column being predicted.

	Returns:
		A tuple ``(best_degree, model_by_degree)``. ``best_degree`` is
		whatever ``gp.find_key_min`` returns for the degree -> validation RSS
		dict (presumably the key with the smallest value -- confirm against
		that helper). ``model_by_degree`` maps every tried degree to its
		fitted model.

	Raises:
		ValueError: if ``list_of_degrees`` is given but contains no degree.
	"""
	if list_of_degrees is None:
		degrees = list(range(1, 16))
	else:
		# Materialise once so generators are supported and emptiness can be checked.
		degrees = list(list_of_degrees)
	if not degrees:
		raise ValueError("list_of_degrees must contain at least one degree")

	model_by_degree = {}
	rss_all = {}
	for degree in degrees:
		data_n_train = reg.polynomial_sframe(train_data[sales_col], degree)
		# Feature names are read before the target column is attached to the frame.
		features_names = data_n_train.column_names()
		data_n_train[target] = train_data[target]
		model_n = gp.create_linear_regression(data_n_train, target=target, features=features_names)

		data_n_val = reg.polynomial_sframe(val_data[sales_col], degree)
		data_n_val[target] = val_data[target]
		rss_all[degree] = reg.get_model_residual_sum_of_squares(model_n, data_n_val, data_n_val[target])
		model_by_degree[degree] = model_n

	return gp.find_key_min(rss_all), model_by_degree
Ejemplo n.º 2
0
def create_regression_model_by_degree(sales, list_of_degrees, sales_col='sqft_living', target='price'):
	"""Fit a polynomial regression of ``target`` on ``sales_col`` for every requested degree.

	Args:
		sales: table indexable by column name.
		list_of_degrees: iterable of polynomial degrees to fit.
		sales_col: name of the column expanded into polynomial features.
		target: name of the column being predicted.

	Returns:
		A dict keyed by ``'power_<degree>'``. Each value is a dict with the
		fitted ``'model'``, its ``'coefficients'`` (``model.get("coefficients")``)
		and the polynomial ``'data'`` frame (target column included) it was
		fitted on.
	"""
	results = {}
	for degree in list_of_degrees:
		poly_frame = reg.polynomial_sframe(sales[sales_col], degree)
		# Read the feature names before the target column is added to the frame.
		feature_columns = poly_frame.column_names()
		poly_frame[target] = sales[target]

		fitted = gp.create_linear_regression(poly_frame, target=target, features=feature_columns)
		results['power_%s' % degree] = {
			'model': fitted,
			'coefficients': fitted.get("coefficients"),
			'data': poly_frame,
		}
	return results
def quiz_1_selecting_l2_penalty(sales):
	"""Choose an L2 penalty by cross-validation and report the refit model's test RSS.

	Splits ``sales`` with ``random_split(.9, seed=1)``, asks
	``selecting_l2_via_cross_validation`` for a penalty -> score mapping on the
	first part, picks the penalty with the smallest score, refits a degree-15
	polynomial ridge regression on that first part and prints the residual sum
	of squares measured on the second part.

	Args:
		sales: table providing ``random_split`` and the ``'sqft_living'`` /
			``'price'`` columns.
	"""
	print("\n**********************************")
	print("*        k-Fold validation       *")
	print("**********************************\n")

	# Fixed seed keeps the train+validation / test split reproducible.
	train_valid, test = sales.random_split(.9, seed=1)

	rss_by_penalty = selecting_l2_via_cross_validation(train_valid)
	# Key whose associated value is the smallest.
	best_l2_penalty = min(rss_by_penalty, key=rss_by_penalty.get)
	best_score = rss_by_penalty[best_l2_penalty]
	print("\nQ6: Best L2 penalty via 10-fold validation L2 (%.2f): %s" % (best_l2_penalty, best_score))

	# Same degree as the one used inside selecting_l2_via_cross_validation.
	degree = 15
	ridge_fit = reg.polynomial_ridge_regression(
		train_valid, degree, target='price', l2_penalty=float(best_l2_penalty))
	# Only the fitted model is needed here; the accompanying training frame is unused.
	model_train_valid, _ = ridge_fit

	test_frame = reg.polynomial_sframe(test['sqft_living'], degree)
	test_frame['price'] = test['price']
	test_rss = reg.get_model_residual_sum_of_squares(model_train_valid, test_frame, test_frame['price'])

	print("\nQ7: Predictions for degree=%s TEST error (RSS)=%s" % (degree, test_rss))
	print("\t- Between 8e13 and 4e14")
def selecting_l2_via_cross_validation(train_valid):
	"""Score a grid of L2 penalties with 10-fold cross-validation on degree-15 features.

	The rows of ``train_valid`` are shuffled with a fixed seed, ``'sqft_living'``
	is expanded into degree-15 polynomial features, and
	``compute_k_fold_cross_validation`` is called with 13 log-spaced penalties
	(``logspace(1, 7, num=13)``). A plot produced by
	``reg.plot_k_cross_vs_penalty`` is saved to
	``../graphs/k_fold_vd_penalty_l2.png``.

	Args:
		train_valid: table with ``'sqft_living'`` and ``'price'`` columns.

	Returns:
		A dict pairing each penalty with the corresponding entry returned by
		``compute_k_fold_cross_validation`` (presumably the average validation
		RSS for that penalty -- confirm against that helper).
	"""
	shuffled = gp.graphlab.toolkits.cross_validation.shuffle(train_valid, random_seed=1)

	k = 10
	target = 'price'
	l2_penalties = np_utils.np.logspace(1, 7, num=13)

	poly_sframe = reg.polynomial_sframe(shuffled['sqft_living'], degree=15)
	# Feature names are read before the target column is attached.
	features_list = poly_sframe.column_names()
	poly_sframe[target] = shuffled[target]

	l2_rss_avg = compute_k_fold_cross_validation(k, poly_sframe, target, features_list, l2_penalties)

	# Save the penalty-vs-score plot, then close the figure.
	plt.figure(figsize=(10, 8))
	reg.plot_k_cross_vs_penalty(l2_penalties, l2_rss_avg)
	plt.savefig('../graphs/k_fold_vd_penalty_l2.png')
	plt.close()

	return dict(zip(l2_penalties, l2_rss_avg))
Ejemplo n.º 5
0
def main():
	"""Run the polynomial-regression quiz and print the answers to Q1-Q4.

	Loads the house-sales data, fits degree-15 models on four seeded
	sub-samples and prints each model's ``power_15`` coefficient (Q1), then
	selects a polynomial degree on a validation split (Q3) and reports the
	RSS of the selected model on the held-out test split (Q4). Any exception
	is reported together with its traceback instead of propagating.
	"""
	try:
		print("\n**********************************")
		print("*  Polynomial Regression Model   *")
		print("**********************************\n")

		sales = gp.load_data('../../data_sets/kc_house_data.gl/')

		# Two rounds of seeded halving give four sub-samples.
		train, test = sales.random_split(0.5, seed=0)
		set_1, set_2 = train.random_split(0.5, seed=0)
		set_3, set_4 = test.random_split(0.5, seed=0)

		list_of_degrees = [15]  # alternative noted by the author: [1,3,5,15]
		list_of_sets = [set_1, set_2, set_3, set_4]
		polynomial_regressions = get_polynomial_regression_by_sets(list_of_degrees, list_of_sets)

		print("\nQ1: power_15 for all four models:")
		pw_degree = 'power_15'
		# Results are looked up under 'set_1' .. 'set_N'; the sets themselves are not needed here.
		for set_number in range(1, len(list_of_sets) + 1):
			set_key = 'set_%s' % set_number
			coefficients = polynomial_regressions[set_key][pw_degree]['coefficients']
			value_by_name = gp.convert_sframe_to_simple_dict(coefficients, 'name', 'value')
			print("\t- %s: %s" % (set_key, value_by_name[pw_degree]))

		print("\nQ2: fitted lines all look the same plots: FALSE")

		# Fresh split for model selection: training is halved into train / validation.
		training, test_data = sales.random_split(0.9, seed=1)
		train_data, val_data = training.random_split(0.5, seed=1)

		best_degree, model_by_degree = select_polynomial_degree(train_data, val_data)
		print("\nQ3: the lowest RSS on Validation data is degree:%s" % best_degree)

		# Score the selected model on the held-out test split.
		test_frame = reg.polynomial_sframe(test_data['sqft_living'], best_degree)
		test_frame['price'] = test_data['price']
		best_model = model_by_degree[best_degree]
		test_rss = reg.get_model_residual_sum_of_squares(best_model, test_frame, test_frame['price'])
		print("\nQ4: RSS on TEST with the degree:%s from Validation data is:%s" % (best_degree, test_rss))

	except Exception as details:
		# Top-level boundary: report the failure and its traceback rather than crash.
		print("Error >> %s" % details)
		traceback.print_exc()