def multiple_regression_model(train_data, test_data):
    """Run the week-2 multiple-regression quiz and print every answer.

    Both data sets are first passed through ``creat_new_features``. The
    function then prints:

    * Q1-Q4: the mean (rounded to 2 decimals) of each engineered feature,
      computed on the test data;
    * Q5-Q7: the 'bathrooms' coefficient of 'model_1', 'model_2', 'model_3'
      as returned by ``learning_multiple_models(train_data)``;
    * Q8/Q9: the key ``gp.find_key_min`` selects from the per-model RSS
      dictionaries for the training and the testing data;
    * the two raw RSS dictionaries.

    Args:
        train_data: data used to learn the models and to compute the
            training RSS. Must expose a 'price' column.
        test_data: data used for the feature means and the testing RSS.
            Must expose a 'price' column.

    Returns:
        None. All results are written to stdout.
    """
    print("\n**********************************")
    print("* Multiple Regression Model *")
    print("**********************************\n")

    # Add the engineered columns to both splits before anything else uses them.
    train_data = creat_new_features(train_data)
    test_data = creat_new_features(test_data)

    print("Quiz_1 (week_2):")
    new_features = ['bedrooms_squared', 'bed_bath_rooms',
                    'log_sqft_living', 'lat_plus_long']
    for idx, feature in enumerate(new_features):
        feature_mean = round(test_data[feature].mean(), 2)
        print("\nQ%s: %s mean is: %s" % (idx + 1, feature, feature_mean))

    multiple_models = learning_multiple_models(train_data)
    rss_train, rss_test = {}, {}
    models_names = ['model_1', 'model_2', 'model_3']
    for idx_m, model_i in enumerate(models_names):
        # Bind the model once instead of looking it up three times.
        model = multiple_models[model_i]
        coefficients = model.get("coefficients").sort('value', ascending=False)
        coeff_dict = gp.convert_sframe_to_simple_dict(coefficients, 'name', 'value')
        # Question numbers continue after the four feature-mean questions.
        print("\nQ%s: coefficient for 'bathrooms' in %s is: %s"
              % (idx_m + 5, model_i, coeff_dict['bathrooms']))
        rss_train[model_i] = reg.get_model_residual_sum_of_squares(
            model, train_data, train_data['price'])
        rss_test[model_i] = reg.get_model_residual_sum_of_squares(
            model, test_data, test_data['price'])

    print("\nQ8: lowest RSS on TRAINING Data is: %s" % (gp.find_key_min(rss_train)))
    print("\nQ9: lowest RSS on TESTING Data is: %s" % (gp.find_key_min(rss_test)))
    print('\nRSS-train:%s' % rss_train)
    # Label fixed: the original printed the misspelled 'RSS-trest'.
    print('RSS-test:%s' % rss_test)
def main():
    """Answer the song-data quiz questions and print the results.

    Loads the song data set, then prints:

    * Q1: the artist (out of a fixed list of four) selected by
      ``gp.find_key_max`` from the result of ``counting_unique_users``;
    * Q2/Q3: the artists with the highest and the smallest summed
      'listen_count' over the whole data set.

    Any exception is reported on stdout together with its traceback
    instead of propagating to the caller.
    """
    try:
        songs = gp.load_data('../../data_sets/song_data.gl/')
        artists = ['Kanye West', 'Foo Fighters', 'Taylor Swift', 'Lady GaGa']

        # Q1: which of the listed artists had the most unique listeners.
        unique_users = counting_unique_users(songs, artists)
        print("\nQ1: Most unique users: %s" % (gp.find_key_max(unique_users)))

        # Q2/Q3: total listen_count per artist across the whole data set.
        sum_of_listens = gp.graphlab.aggregate.SUM('listen_count')
        totals_frame = songs.groupby(
            key_columns='artist',
            operations={'total_count': sum_of_listens})
        totals_by_artist = gp.convert_sframe_to_simple_dict(
            totals_frame, 'artist', 'total_count')

        most_listened = gp.find_key_max(totals_by_artist)
        print("\nQ2: Highest total listen: %s" % (most_listened))
        least_listened = gp.find_key_min(totals_by_artist)
        print("\nQ3: Smallest total listen: %s" % (least_listened))
    except Exception as details:
        # Top-level boundary: report the failure instead of crashing.
        print("Error >> %s" % details)
        traceback.print_exc()
def quiz_1_ridge_regression(sales):
    """Run the ridge-regression quiz on *sales* and print the answers.

    Steps, in order:

    1. Fit a degree-15 polynomial ridge regression on the full data with a
       small L2 penalty (1e-5) and print the absolute value of the
       'power_1' coefficient (Q1).
    2. Split *sales* into four subsets through two rounds of
       ``random_split(0.5, seed=0)``.
    3. Call ``create_ridge_regression_and_plot`` on the four subsets with
       the small penalty and print its result (Q2/Q3, whose range answers
       are fixed strings).
    4. Repeat with a large L2 penalty (1e5) and print the smallest and
       largest entries of the result (Q4/Q5).

    Returns:
        None. All results are written to stdout.
    """
    degree = 15
    small_penalty = 1e-5
    large_penalty = 1e5
    power_index = 1

    # Q1: model learned on the complete data set; the second return value
    # of the helper is not needed here.
    full_model, _ = reg.polynomial_ridge_regression(
        sales, degree=degree, target='price', l2_penalty=small_penalty)
    power_coeffs = gp.get_model_coefficients_dict(full_model)
    print("\nQ1: Learned coefficient of feature power_1: %s"
          % (round(abs(power_coeffs['power_1']), 2)))

    # Two rounds of halving produce four subsets.
    first_half, second_half = sales.random_split(.5, seed=0)
    set_1, set_2 = first_half.random_split(0.5, seed=0)
    set_3, set_4 = second_half.random_split(0.5, seed=0)
    subsets = [set_1, set_2, set_3, set_4]

    # Small penalty
    small_weights = create_ridge_regression_and_plot(
        subsets, degree, small_penalty, power_index)
    print("\nQ2: Weights por power_1: %s" % small_weights)
    print("\nQ2: smallest coefficient L2=%s" % (small_penalty))
    print("\t- Range: Between -1000 and -100")
    print("\nQ3: largest coefficient L2=%s" % (small_penalty))
    print("\t- Range: Between 1000 and 10000")

    # Large penalty
    large_weights = create_ridge_regression_and_plot(
        subsets, degree, large_penalty, power_index)
    print("\nQ4: Weights por power_1: %s" % large_weights)
    lowest_key = gp.find_key_min(large_weights)
    highest_key = gp.find_key_max(large_weights)
    print("\nQ4: smallest coeff L2=%s %s: %s"
          % (large_penalty, lowest_key, round(large_weights[lowest_key], 2)))
    print("\nQ5: largest coeff L2=%s %s: %s"
          % (large_penalty, highest_key, round(large_weights[highest_key], 2)))
def query_min_knn_distance(name, people_dist, knn_model, name_model, count_qs):
    """Print the closest neighbour of *name*, excluding *name* itself.

    Args:
        name: key into *people_dist*; also the reference label removed
            from the query result before the minimum is taken.
        people_dist: mapping from a name to the value passed to
            ``knn_model.query``.
        knn_model: object exposing ``query(data, verbose=False)``.
        name_model: label of the model, used only in the printed line.
        count_qs: question number, used only in the printed line.

    Raises:
        KeyError: if *name* is not among the returned reference labels.
    """
    neighbours = knn_model.query(people_dist[name], verbose=False)
    distance_by_label = gp.convert_sframe_to_simple_dict(
        neighbours, 'reference_label', 'distance')

    # Drop the queried entry so it cannot be reported as its own neighbour.
    del distance_by_label[name]

    closest = gp.find_key_min(distance_by_label)
    print("\nQ%s: %s: (%s): %s" % (count_qs, name, name_model, closest))
def get_nearest_distance_id_query(knn_models, label, tet_query):
    """Query one model and report the reference label at minimum distance.

    Args:
        knn_models: mapping from a label to a model exposing
            ``query(data, verbose=False)``.
        label: key selecting which model in *knn_models* to query.
        tet_query: data handed to the selected model's ``query``.

    Returns:
        A ``(query_result, nearest)`` tuple: the raw query result and the
        key ``gp.find_key_min`` picks from the
        'reference_label' -> 'distance' dictionary built from it.
    """
    selected_model = knn_models[label]
    query_result = selected_model.query(tet_query, verbose=False)
    distance_by_label = gp.convert_sframe_to_simple_dict(
        query_result, 'reference_label', 'distance')
    nearest = gp.find_key_min(distance_by_label)
    return query_result, nearest
def select_polynomial_degree(train_data, val_data, max_degree=15,
                             feature='sqft_living', target='price'):
    """Pick the polynomial degree with the lowest validation RSS.

    For every degree from 1 to *max_degree* a linear regression is fitted
    on the polynomial expansion of ``train_data[feature]`` and scored by
    residual sum of squares on the same expansion of ``val_data[feature]``.

    Args:
        train_data: data used to fit each model; must expose the *feature*
            and *target* columns.
        val_data: data used to score each model; must expose the *feature*
            and *target* columns.
        max_degree: highest polynomial degree to try (inclusive). Defaults
            to 15, the value that was previously hard-coded.
        feature: name of the input column to expand. Defaults to
            'sqft_living'.
        target: name of the column to predict. Defaults to 'price'.

    Returns:
        A ``(best_degree, model_by_degree)`` tuple, where ``best_degree`` is
        the key ``gp.find_key_min`` selects from the degree -> RSS mapping
        and ``model_by_degree`` maps every tried degree to its fitted model.

    Raises:
        ValueError: if *max_degree* is smaller than 1, since there would be
            no candidate degree to choose from.
    """
    if max_degree < 1:
        raise ValueError("max_degree must be >= 1, got %r" % (max_degree,))

    model_by_degree = {}
    rss_all = {}
    for degree in range(1, max_degree + 1):
        data_n_train = reg.polynomial_sframe(train_data[feature], degree)
        # Capture the feature names before the target column is attached,
        # so the target never ends up in the feature list.
        features_names = data_n_train.column_names()
        data_n_train[target] = train_data[target]
        model_n = gp.create_linear_regression(
            data_n_train, target=target, features=features_names)

        data_n_val = reg.polynomial_sframe(val_data[feature], degree)
        data_n_val[target] = val_data[target]
        rss_all[degree] = reg.get_model_residual_sum_of_squares(
            model_n, data_n_val, data_n_val[target])
        model_by_degree[degree] = model_n

    return gp.find_key_min(rss_all), model_by_degree
def main():
    """Answer the product-review sentiment quiz and print the results.

    Loads the Amazon baby-products reviews, counts words per review, and
    prints:

    * Q1/Q2: the most and least used of the eleven selected words;
    * Q3/Q4: the selected words with the most positive and most negative
      learned weight in the selected-words model;
    * Q5/Q6: test-set accuracy of the selected-words model and of the
      full word-count model;
    * Q9/Q10: the highest predicted probability each model assigns to a
      'Baby Trend Diaper Champ' review.

    Any exception is reported on stdout together with its traceback
    instead of propagating to the caller.
    """
    try:
        # Load data
        products = gp.load_data('../../data_sets/amazon_baby.gl/')

        # Create word-count column
        products['word_count'] = gp.get_text_analytics_count(
            products['review'])

        # Select a group of words
        selected_words = [
            'awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible',
            'bad', 'terrible', 'awful', 'wow', 'hate'
        ]

        # Q1/Q2: out of the 11 selected words, which one is used the most
        # (and the least) in the reviews of the data set?
        products, most_used = select_most_used_word(
            products, selected_words, products['word_count'])
        key_most_used = gp.find_key_max(most_used)
        least_most_used = gp.find_key_min(most_used)
        print("\nQ1: Most used:%s = %s"
              % (key_most_used, most_used[key_most_used]))
        print("\nQ2: Least used:%s = %s"
              % (least_most_used, most_used[least_most_used]))

        # Ignore all 3* reviews (to remove unknown rating)
        products = products[products['rating'] != 3]
        # Positive sentiment = 4* or 5* reviews
        products['sentiment'] = products['rating'] >= 4

        # Split data
        train_data, test_data = products.random_split(.8, seed=0)

        # ***************
        # Create Model  *
        # ***************
        # Logistic model with the selected words as features
        selected_model = gp.create_logistic_classifier_model(
            train_data,
            target='sentiment',
            features=selected_words,
            validation_set=test_data)
        # Logistic model with the full word_count as feature
        sentiment_model = gp.create_logistic_classifier_model(
            train_data,
            target='sentiment',
            features=['word_count'],
            validation_set=test_data)

        # Q3/Q4: out of the 11 selected words, which one got the most
        # positive / most negative weight in the selected-words model?
        # The original built a one-entry dict keyed by the whole 'name'
        # column, which cannot identify a single word; build a real
        # name -> weight mapping instead.
        weight_by_name = gp.convert_sframe_to_simple_dict(
            selected_model['coefficients'], 'name', 'value')
        # Keep only the selected words so that non-word rows (presumably an
        # intercept term -- confirm against the model output) cannot win.
        word_weights = {}
        for word in selected_words:
            if word in weight_by_name:
                word_weights[word] = weight_by_name[word]
        print("\nQ3: Most Positive (w): %s" % gp.find_key_max(word_weights))
        print("\nQ4: Most Negative (w): %s" % gp.find_key_min(word_weights))

        # *****************
        # Evaluate Model  *
        # *****************
        # Accuracy of both models on the test data
        results_selected = selected_model.evaluate(test_data)
        results_sentiment = sentiment_model.evaluate(test_data)
        print("\nQ5: Accuracy selected-model: %s"
              % results_selected['accuracy'])  # 0.843111938506
        print("\nQ6: Accuracy sentiment-model: %s"
              % results_sentiment['accuracy'])  # 0.916256305549

        # ****************
        # Predict Model  *
        # ****************
        # Predicted sentiment of the most positive review for
        # Baby Trend Diaper Champ, according to each model.
        diaper_champ_reviews = products[
            products['name'] == 'Baby Trend Diaper Champ']
        diaper_champ_reviews['predicted_selected'] = selected_model.predict(
            diaper_champ_reviews, output_type='probability')
        diaper_champ_reviews['predicted_sentiment'] = sentiment_model.predict(
            diaper_champ_reviews, output_type='probability')

        # Only the maximum is reported, so sorting first is unnecessary.
        print("\nQ9: predicted_selected most positive review: %s"
              % max(diaper_champ_reviews['predicted_selected']))
        print("\nQ10: predicted_sentiment most positive review: %s"
              % max(diaper_champ_reviews['predicted_sentiment']))
    except Exception as details:
        # Top-level boundary: report the failure instead of crashing.
        print("Error >> %s" % details)
        traceback.print_exc()