def multiple_regression_model(train_data, test_data):

	print "\n**********************************"
	print "*    Multiple Regression Model   *"
	print "**********************************\n"

	train_data = creat_new_features(train_data)
	test_data = creat_new_features(test_data)

	print "Quiz_1 (week_2):"

	new_features = ['bedrooms_squared','bed_bath_rooms','log_sqft_living','lat_plus_long']
	for idx,feature in enumerate(new_features):
		print "\nQ%s: %s mean is: %s" % (idx + 1,feature,round(test_data[feature].mean(),2))

	multiple_models = learning_multiple_models(train_data)
	rss_train,rss_test = {},{}
	models_names = ['model_1','model_2','model_3']
	for idx_m,model_i in enumerate(models_names):
		coefficients = multiple_models[model_i].get("coefficients").sort('value',ascending=False)
		# print coefficients #.print_rows(12)
		coeff_dict = gp.convert_sframe_to_simple_dict(coefficients,'name','value')
		print "\nQ%s: coefficient for 'bathrooms' in %s is: %s" % (idx_m + 5,model_i,coeff_dict['bathrooms'])

		rss_train[model_i] = reg.get_model_residual_sum_of_squares(multiple_models[model_i],train_data,train_data['price'])
		rss_test[model_i] = reg.get_model_residual_sum_of_squares(multiple_models[model_i],test_data,test_data['price'])

	print "\nQ8: lowest RSS on TRAINING Data is: %s" % (gp.find_key_min(rss_train))
	print "\nQ9: lowest RSS on TESTING Data is: %s" % (gp.find_key_min(rss_test))

	print '\nRSS-train:%s' % rss_train
	print 'RSS-test:%s' % rss_test
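# Neither creat_new_features nor reg.get_model_residual_sum_of_squares is
# shown on this page. A minimal sketch of both, assuming the derived features
# follow their names and the model exposes a GraphLab-style .predict()
# (assumptions, not the original implementations):
import math

def creat_new_features(data):
    # derived features implied by the feature names used in the quiz above
    data['bedrooms_squared'] = data['bedrooms'] * data['bedrooms']
    data['bed_bath_rooms'] = data['bedrooms'] * data['bathrooms']
    data['log_sqft_living'] = data['sqft_living'].apply(lambda x: math.log(x))
    data['lat_plus_long'] = data['lat'] + data['long']
    return data

def get_model_residual_sum_of_squares(model, data, outcome):
    # residuals of the predictions against the true outcome, squared and summed
    residuals = outcome - model.predict(data)
    return (residuals * residuals).sum()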
Example 2
def main():
    try:
        song_data = gp.load_data('../../data_sets/song_data.gl/')

        artist_list = [
            'Kanye West', 'Foo Fighters', 'Taylor Swift', 'Lady GaGa'
        ]
        count_uniques = counting_unique_users(song_data, artist_list)
        # print count_uniques

        #Which of the artists below have had the most unique users listening to their songs?
        print "\nQ1: Most unique users: %s" % (gp.find_key_max(count_uniques))

        #Which of the artists below is the most popular artist,
        # the one with the highest total listen_count, in the data set?
        listen_count = song_data.groupby(
            key_columns='artist',
            operations={
                'total_count': gp.graphlab.aggregate.SUM('listen_count')
            })

        # print listen_count.sort('total_count',ascending=False) #most listened / ascending=True) #least listened
        most_listen_count = gp.convert_sframe_to_simple_dict(
            listen_count, 'artist', 'total_count')
        print "\nQ2: Highest total listen: %s" % (
            gp.find_key_max(most_listen_count))
        print "\nQ3: Smallest total listen: %s" % (
            gp.find_key_min(most_listen_count))

    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
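# gp.find_key_max and gp.find_key_min come from a helper module not shown on
# this page. A minimal sketch, assuming they return the dict key whose value
# is largest or smallest:
def find_key_max(d):
    # key carrying the maximum value
    return max(d, key=d.get)

def find_key_min(d):
    # key carrying the minimum value
    return min(d, key=d.get)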
def quiz_1_ridge_regression(sales):
	l2_small_penalty = 1e-5

	model,poly_sframe = reg.polynomial_ridge_regression(sales,degree=15,target='price',l2_penalty=l2_small_penalty)
	# np_utils.print_coefficients(model)
	coeff_powers = gp.get_model_coefficients_dict(model)

	print "\nQ1: Learned coefficient of feature power_1: %s" % (round(abs(coeff_powers['power_1']),2))

	(semi_split1,semi_split2) = sales.random_split(.5,seed=0)
	(set_1,set_2) = semi_split1.random_split(0.5,seed=0)
	(set_3,set_4) = semi_split2.random_split(0.5,seed=0)

	degree = 15
	data_sets = [set_1,set_2,set_3,set_4]
	w_power1 = 1

	# Small Penalty
	l2_small_penalty = 1e-5
	weights_per_set = create_ridge_regression_and_plot(data_sets,degree,l2_small_penalty,w_power1)
	print "\nQ2: Weights por power_1: %s" % weights_per_set
	print "\nQ2: smallest coefficient L2=%s" % (l2_small_penalty)
	print "\t- Range: Between -1000 and -100"
	print "\nQ3: largest coefficient L2=%s" % (l2_small_penalty)
	print "\t- Range: Between 1000 and 10000"

	#Large Penalty
	l2_large_penalty = 1e5
	l2_weights_per_set = create_ridge_regression_and_plot(data_sets,degree,l2_large_penalty,w_power1)
	print "\nQ4: Weights por power_1: %s" % l2_weights_per_set

	min_set = gp.find_key_min(l2_weights_per_set)
	max_set = gp.find_key_max(l2_weights_per_set)
	print "\nQ4: smallest coeff L2=%s  %s: %s" % (l2_large_penalty,min_set,round(l2_weights_per_set[min_set],2))
	print "\nQ5: largest coeff L2=%s  %s: %s" % (l2_large_penalty,max_set,round(l2_weights_per_set[max_set],2))
Example 4
def query_min_knn_distance(name, people_dist, knn_model, name_model, count_qs):
    raw_model = knn_model.query(people_dist[name], verbose=False)
    # print raw_model
    raw_dict = gp.convert_sframe_to_simple_dict(raw_model, 'reference_label',
                                                'distance')
    raw_dict.pop(name)
    print "\nQ%s: %s: (%s): %s" % (count_qs, name, name_model,
                                   gp.find_key_min(raw_dict))
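# gp.convert_sframe_to_simple_dict is not shown on this page. A minimal
# sketch, assuming it zips two SFrame columns into a plain {key: value} dict:
def convert_sframe_to_simple_dict(sframe, key_column, value_column):
    # pair each key-column entry with its value-column entry
    return dict(zip(sframe[key_column], sframe[value_column]))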
Example 5
def get_nearest_distance_id_query(knn_models, label, tet_query):
    """ create a dict from query-info and find min-key distance"""
    current_query = knn_models[label].query(tet_query, verbose=False)
    # print current_query
    cat_distance = gp.convert_sframe_to_simple_dict(current_query,
                                                    'reference_label',
                                                    'distance')
    return current_query, gp.find_key_min(cat_distance)
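# A hypothetical usage sketch for get_nearest_distance_id_query; the 'cat'
# and 'dog' labels, the 'deep_features' column, and the image_train /
# image_test SFrames are illustrative assumptions, not part of this page:
knn_models = {}
for label in ['cat', 'dog']:
    # one nearest-neighbors model per label, indexed by row id
    knn_models[label] = gp.graphlab.nearest_neighbors.create(
        image_train[image_train['label'] == label],
        features=['deep_features'], label='id')
current_query, nearest_id = get_nearest_distance_id_query(
    knn_models, 'cat', image_test[0:1])
print "nearest 'cat' id: %s" % nearest_id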
Example 6
def select_polynomial_degree(train_data, val_data):
	model_by_degree = {}
	rss_all = {}
	list_of_degrees = range(1,16)
	for degree in list_of_degrees:
		data_n_train = reg.polynomial_sframe(train_data['sqft_living'],degree)
		features_names = data_n_train.column_names()
		data_n_train['price'] = train_data['price']
		model_n = gp.create_linear_regression(data_n_train,target='price',features=features_names)
		data_n_val = reg.polynomial_sframe(val_data['sqft_living'],degree)
		data_n_val['price'] = val_data['price']
		rss_n = reg.get_model_residual_sum_of_squares(model_n,data_n_val,data_n_val['price'])
		rss_all[degree] = rss_n
		# print 'RSS(%s): %s' % (degree,rss_n)
		model_by_degree[degree] = model_n

	return gp.find_key_min(rss_all), model_by_degree
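# reg.polynomial_sframe is not shown on this page. A minimal sketch, assuming
# it expands a single SArray into power_1 ... power_<degree> columns:
def polynomial_sframe(feature, degree):
    poly_sframe = gp.graphlab.SFrame()
    poly_sframe['power_1'] = feature
    for power in range(2, degree + 1):
        # bind power via a default argument so each column gets its own exponent
        poly_sframe['power_' + str(power)] = feature.apply(lambda x, p=power: x ** p)
    return poly_sframe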
Example 7
def main():
    try:
        #Load Data
        products = gp.load_data('../../data_sets/amazon_baby.gl/')

        #Create word-count column
        products['word_count'] = gp.get_text_analytics_count(
            products['review'])

        #Select a group of words
        selected_words = [
            'awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible',
            'bad', 'terrible', 'awful', 'wow', 'hate'
        ]

        # Q1: Out of the 11 words in selected_words, which one
        # is most used in the reviews in the dataset?
        products, most_used = select_most_used_word(products, selected_words,
                                                    products['word_count'])

        key_most_used = gp.find_key_max(most_used)
        least_most_used = gp.find_key_min(most_used)
        print "\nQ1: Most used:%s = %s" % (key_most_used,
                                           most_used[key_most_used])
        print "\nQ2: Least used:%s = %s" % (least_most_used,
                                            most_used[least_most_used])

        #ignore all 3* reviews (to remove unknown rating)
        products = products[products['rating'] != 3]

        #positive sentiment = 4* or 5* reviews
        products['sentiment'] = products['rating'] >= 4

        #Split Data
        train_data, test_data = products.random_split(.8, seed=0)

        #***************
        # Create Model *
        #***************
        #Create Logistic Model (with selected-words as features)
        selected_model = gp.create_logistic_classifier_model(
            train_data,
            target='sentiment',
            features=selected_words,
            validation_set=test_data)
        #Create Logistic Model (with word_count as features)
        sentiment_model = gp.create_logistic_classifier_model(
            train_data,
            target='sentiment',
            features=['word_count'],
            validation_set=test_data)

        # Get weights of Coefficients
        coefficients = selected_model['coefficients'].sort('value',
                                                           ascending=False)
        # print coefficients.print_rows(12)

        #Out of the 11 words in selected_words, which one got
        # the most positive/negative weight in the selected_words_model?
        print gp.find_key_max(gp.convert_sframe_to_simple_dict(coefficients,'name','value'))
        print "\nQ3: Most Positive (w): love"
        print "\nQ4: Most Negative (w): terrible"

        #*****************
        # Evaluate Model *
        #*****************
        #Which of the following ranges contains the accuracy
        # of the selected_words_model on the test_data
        results_selected = selected_model.evaluate(test_data)
        results_sentiment = sentiment_model.evaluate(test_data)
        print "\nQ5: Accuracy selected-model: %s" % results_selected[
            'accuracy']  #0.843111938506
        print "\nQ6: Accuracy sentiment-model: %s" % results_sentiment[
            'accuracy']  #0.916256305549

        #****************
        # Predict Model *
        #****************
        #Which of the following ranges contains the predicted_sentiment for the most positive review
        # for Baby Trend Diaper Champ, according to the sentiment_model?
        diaper_champ_reviews = products[products['name'] ==
                                        'Baby Trend Diaper Champ']
        diaper_champ_reviews['predicted_selected'] = selected_model.predict(
            diaper_champ_reviews, output_type='probability')
        diaper_champ_reviews['predicted_sentiment'] = sentiment_model.predict(
            diaper_champ_reviews, output_type='probability')

        most_positive_review = diaper_champ_reviews.sort('predicted_selected',
                                                         ascending=False)
        most_positive_sentiment = diaper_champ_reviews.sort(
            'predicted_sentiment', ascending=False)

        print "\nQ9: predicted_selected most positive review: %s" % max(
            most_positive_review['predicted_selected'])
        print "\nQ10: predicted_sentiment most positive review: %s" % max(
            most_positive_sentiment['predicted_sentiment'])

    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
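# select_most_used_word is not shown on this page. A minimal sketch, assuming
# it adds one count column per selected word (so the words can later serve as
# model features) and totals each word's usage across all reviews:
def select_most_used_word(products, selected_words, word_counts):
    totals = {}
    for word in selected_words:
        # pull this word's count out of each review's word_count dict
        products[word] = word_counts.apply(lambda wc, w=word: wc.get(w, 0))
        totals[word] = products[word].sum()
    return products, totals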