def main():
    try:
        image_train = gp.load_data('../../data_sets/image_train_data/')
        image_test = gp.load_data('../../data_sets/image_test_data/')

        # 1) Computing summary statistics of the data:
        label_col = image_train['label'].sketch_summary()
        # print label_col
        print "\nQ1: least common category: 'bird'"

        # 2) Creating category-specific image retrieval models:
        categories = ['automobile', 'cat', 'dog', 'bird']
        train_labels = create_train_labels(image_train, categories)
        # print train_labels.keys()
        knn_models = create_labels_nearest_neighbors_model(
            train_labels, categories)
        cat_test_query = image_test[0:1]
        # cat_test_query['image'].show()  # in IPython this shows the image in the browser
        cat_query, cat_distance = get_nearest_distance_id_query(
            knn_models, 'cat', cat_test_query)
        print "\nQ2: nearest 'cat' labeled image id: %s" % cat_distance
        # train_labels['cat'][train_labels['cat']['id'] == 16289]['image'].show()
        dog_query, dog_distance = get_nearest_distance_id_query(
            knn_models, 'dog', cat_test_query)
        print "\nQ3: nearest 'dog' labeled image id: %s" % dog_distance
        # train_labels['dog'][train_labels['dog']['id'] == 16976]['image'].show()

        # 3) A simple example of nearest-neighbors classification:
        # What is the mean distance between this image and its nearest
        # neighbors in the training data?
        print "\nQ4: 'cat' neighbors mean-distance: %s" % cat_query['distance'].mean()
        print "\nQ5: 'dog' neighbors mean-distance: %s" % dog_query['distance'].mean()
        print "\nQ6: on average, the 1st image in the test data is closer to its nearest neighbors in the 'cat' data"

        # 4) Computing nearest-neighbors accuracy using SFrame operations:
        test_labels = create_train_labels(image_test, categories)
        image_test_dog = test_labels['dog']
        labels_dog_distances = [
            'dog-automobile', 'dog-cat', 'dog-dog', 'dog-bird'
        ]
        dog_distances = get_label_distances(labels_dog_distances, knn_models,
                                            image_test_dog)
        # print 'Dog-distances: \n', dog_distances
        correct_dog_predictions = dog_distances.apply(is_dog_correct)
        # correct_dog_predictions.sketch_summary()
        # print correct_dog_predictions
        accuracy_1knn_dog = (correct_dog_predictions.sum() /
                             float(len(image_test_dog))) * 100
        print "\nQ7: accuracy of 1-knn classifying 'dog' img in test set: %% %.2f" % accuracy_1knn_dog
    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
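
# A minimal sketch of the `is_dog_correct` row function used above, assuming
# `dog_distances` has exactly the four columns listed in `labels_dog_distances`:
# the 1-NN prediction is correct when the nearest 'dog' image is at least as
# close as the nearest image of any other category.
def is_dog_correct(row):
    return 1 if row['dog-dog'] == min(row.values()) else 0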
def main():
    try:
        song_data = gp.load_data('../../data_sets/song_data.gl/')
        artist_list = [
            'Kanye West', 'Foo Fighters', 'Taylor Swift', 'Lady GaGa'
        ]
        count_uniques = counting_unique_users(song_data, artist_list)
        # print count_uniques

        # Which of the artists below has had the most unique users
        # listening to their songs?
        print "\nQ1: Most unique users: %s" % (gp.find_key_max(count_uniques))

        # Which of the artists below is the most popular, i.e. the one
        # with the highest total listen_count in the data set?
        listen_count = song_data.groupby(
            key_columns='artist',
            operations={
                'total_count': gp.graphlab.aggregate.SUM('listen_count')
            })
        # print listen_count.sort('total_count', ascending=False)  # most listened (ascending=True -> least listened)
        most_listen_count = gp.convert_sframe_to_simple_dict(
            listen_count, 'artist', 'total_count')
        print "\nQ2: Highest total listen: %s" % (
            gp.find_key_max(most_listen_count))
        print "\nQ3: Smallest total listen: %s" % (
            gp.find_key_min(most_listen_count))
    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
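
# A minimal sketch of `counting_unique_users`, assuming `song_data` has
# 'artist' and 'user_id' columns: for each artist, count distinct listeners.
def counting_unique_users(song_data, artist_list):
    counts = {}
    for artist in artist_list:
        artist_songs = song_data[song_data['artist'] == artist]
        counts[artist] = len(artist_songs['user_id'].unique())
    return counts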
def main():
    try:
        sales = gp.load_data('../../data_sets/home_data.gl/')
        train_data, test_data = gp.split_data(sales, 0.8)
        # week2_summary(sales, train_data, test_data)
        total_houses = sales.num_rows()
        print '\nData - Total (rows): %s' % total_houses

        # 1. Selection and summary statistics
        avg_house = find_highest_house_price(sales)
        print '\n1) Highest average house price: $%s' % avg_house

        # 2. Filtering data
        num_houses_high = filter_data(sales)
        print '\n2) Selected houses (sqft_living): %s' % num_houses_high

        # 3. Building a regression model with several more features
        info_text = ['(my features)', '(advanced features)']
        models = build_regression_model(train_data)
        print '\n3) Building a regression model (++features)'
        evaluate1, evaluate2 = evaluate_house_price_models(
            info_text, models, test_data)

        print "\nAnswers:"
        print "\nQ1: %s" % avg_house
        print "\nQ2: %s" % (num_houses_high / float(total_houses))
        print "\nQ3: %s" % (evaluate1['rmse'] - evaluate2['rmse'])
    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
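
# A minimal sketch of `find_highest_house_price`, assuming the quiz asks for
# the highest average house price by zip code: group by 'zipcode', average
# the prices, and return the best average.
def find_highest_house_price(sales):
    by_zip = sales.groupby(
        key_columns='zipcode',
        operations={'avg_price': gp.graphlab.aggregate.MEAN('price')})
    return by_zip['avg_price'].max()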
def main():
    try:
        products = gp.load_data('../../data_sets/amazon_baby_subset.gl/')
        # Sentiment: positive (+1) & negative (-1) reviews
        # products['sentiment']  # [1, 1, 1, -1, -1, 1, ...]
        important_words = gp.load_json_file(
            '../../data_sets/important_words.json')
        # print len(important_words)

        # Remove punctuation
        products['review_clean'] = products['review'].apply(
            gp.remove_punctuation)

        # Add each important word and its number of occurrences per review
        for word in important_words:
            products[word] = products['review_clean'].apply(
                lambda s: s.split().count(word))
        # print products[:10]

        lg_class = cl_utils.LogisticRegression()
        # quiz2_implementing_logistic_regression(products, important_words, lg_class)
        quiz3_logistic_regression_l2_penalty(products, important_words,
                                             lg_class)
    except Exception as details:
        print(">> Exit or Errors \n%s, %s" % (details, traceback.print_exc()))
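
# A minimal sketch of the `gp.remove_punctuation` helper assumed above, using
# Python 2's str.translate to delete punctuation characters (unicode reviews
# would need a translate table instead).
import string

def remove_punctuation(text):
    return text.translate(None, string.punctuation)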
def main():
    # Week 1
    # Create/modify new columns in an SFrame
    sframe = gp.load_data('../data_sets/people-example.csv')
    # print sframe.show()
    print sframe.tail()
    # sframe['Country'] = sframe['Country'].apply(lambda x: 'United States' if x == 'USA' else x)
    sframe = gp.transform_column_entry(sframe, 'Country', 'USA',
                                       'United States')
    print 'New SFrame:\n', sframe
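
# A minimal sketch of `gp.transform_column_entry`, assuming it replaces every
# occurrence of `old` with `new` in the named column (the commented-out lambda
# above does the same thing inline).
def transform_column_entry(sframe, column, old, new):
    sframe[column] = sframe[column].apply(lambda x: new if x == old else x)
    return sframe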
def main():
    try:
        sales = gp.load_data('../../data_sets/kc_house_data.gl/')
        train_data, test_data = gp.split_data(sales, 0.8)
        multiple_regression_model(train_data, test_data)
        gradient_descent_model(train_data, test_data)
    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
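
# A minimal sketch of the loop that `gradient_descent_model` presumably runs:
# repeatedly step the weights against the RSS gradient, w := w - 2 * step *
# X^T (Xw - y), until the gradient magnitude drops below a tolerance.
import numpy as np

def regression_gradient_descent(feature_matrix, output, initial_weights,
                                step_size, tolerance):
    weights = np.array(initial_weights, dtype=float)
    while True:
        errors = feature_matrix.dot(weights) - output
        gradient = 2 * feature_matrix.T.dot(errors)
        weights -= step_size * gradient
        if np.sqrt((gradient ** 2).sum()) < tolerance:
            return weights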
def main():
    try:
        sales = gp.load_data('../../data_sets/kc_house_data.gl/')
        train_data, test_data = gp.split_data(sales, 0.8)
        simple_reg = SimpleLinearRegression()
        print "\n**********************************"
        print "* Simple Linear Regression Model *"
        print "**********************************\n"
        sqft_intercept, sqft_slope = simple_reg.simple_linear_regression(
            train_data['sqft_living'], train_data['price'])
        bedroom_intercept, bedroom_slope = simple_reg.simple_linear_regression(
            train_data['bedrooms'], train_data['price'])
        print "Predicting house prices using:"
        print "\t- Square feet model: Intercept:%s & Slope:%s" % (
            sqft_intercept, sqft_slope)
        print "\t- Bedroom model: Intercept:%s & Slope:%s" % (
            bedroom_intercept, bedroom_slope)

        print "\nQuiz (week_1):"
        my_house_sqft = 2650
        estimated_price = reg.get_regression_predictions(
            my_house_sqft, sqft_intercept, sqft_slope)
        print "\nQ1: Predicted price for a house with %s sqft: %s" % (
            my_house_sqft, estimated_price)

        rss_prices_on_sqft = simple_reg.get_residual_sum_of_squares(
            train_data['sqft_living'], train_data['price'], sqft_intercept,
            sqft_slope)
        print "\nQ2: RSS of predicted prices based on sqft is: %s" % rss_prices_on_sqft

        my_house_price = 800000
        estimated_squarefeet = simple_reg.inverse_regression_predictions(
            my_house_price, sqft_intercept, sqft_slope)
        print "\nQ3: Estimated sqft for a house worth $%d is: %.3f" % (
            my_house_price, estimated_squarefeet)

        # Compute RSS when using bedrooms on TEST data:
        rss_prices_on_bedroom_test = simple_reg.get_residual_sum_of_squares(
            test_data['bedrooms'], test_data['price'], bedroom_intercept,
            bedroom_slope)
        rss_prices_on_sqft_test = simple_reg.get_residual_sum_of_squares(
            test_data['sqft_living'], test_data['price'], sqft_intercept,
            sqft_slope)
        print "\nQ4: Which model (square feet or bedrooms) has the lowest RSS on TEST data?"
        print "\t-> RSS (square feet): %s" % (rss_prices_on_sqft_test)
        print "\t-> RSS (bedroom): %s" % (rss_prices_on_bedroom_test)
    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
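
# A minimal sketch of `simple_linear_regression`, assuming the closed-form
# least-squares solution: slope = cov(x, y) / var(x), and the intercept is
# chosen so the fitted line passes through the means.
def simple_linear_regression(input_feature, output):
    x_mean = input_feature.mean()
    y_mean = output.mean()
    slope = ((input_feature * output).mean() - x_mean * y_mean) / \
            ((input_feature * input_feature).mean() - x_mean * x_mean)
    intercept = y_mean - slope * x_mean
    return intercept, slope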
def main(): try: print "\n**********************************" print "* Lasso Regression Model *" print "**********************************\n" sales = gp.load_data('../../data_sets/kc_house_data.gl/') lasso = LassoRegression() quiz1_lasso_to_select_features(lasso, sales) quiz2_lasso_coordinate(lasso, sales) except Exception as details: print "Error >> %s" % details traceback.print_exc()
def main(): try: print "\n**********************************" print "* k-nearest regression Model *" print "**********************************\n" sales = gp.load_data('../../data_sets/kc_house_data_small.gl/') feature_list = ['bedrooms','bathrooms','sqft_living','sqft_lot','floors','waterfront','view','condition', 'grade','sqft_above','sqft_basement','yr_built','yr_renovated','lat','long','sqft_living15','sqft_lot15'] train_and_validation,test = sales.random_split(.8,seed=1) train,validation = train_and_validation.random_split(.8,seed=1) data_sets = get_normalized_datasets(train, test, validation, feature_list) features_train,features_test,features_valid,output_train,output_valid,output_test = data_sets query_10house = np_utils.get_euclidean_distance(features_test[0],features_train[9]) print "\nQ1: Euclidean distance query vs 10th house (training): %s" % (round(query_10house,3)) query_house = features_test[0] closest_dist = closest_distance(9,query_house,features_train) print "\nQ2: House closest to the query house (training): %s" % (closest_dist) close_dist_test = np_utils.get_euclidean_distance_matrix(features_train,features_test[2]) # print close_dist_test print "\nQ3: House (training) closest to query house (test[2]): %s" % (np_utils.np.argmin(close_dist_test)) print "\nQ4: Predicted value query train=%s vs test: %s" % (train['price'][382], test['price'][382]) close_4h = np_utils.find_k_nearest_neighbors(4,features_train,features_test[2]) print "\nQ5: 4 (training) houses closest to query house: %s" % (close_4h) predict_avg_houses = np_utils.single_prediction_k_nearest_neighbors(4,features_train,features_test[2], output_train) print "\nQ6: Predict the value of the query house (avg k-nearest): %s" % (predict_avg_houses) lowest_house, lowest_predict = lowest_predicted_house(10,features_train,features_test[:10],output_train) print "\nQ7: Index-house with query set with lowest predicted value: idx(%s):%s" % (lowest_house, lowest_predict) # plot_RSS_vs_validation_set(15,features_train,features_valid,output_train,output_valid) current_prediction = multiple_predictions_k_nearest_neighbors(8,features_train,features_test,output_train) rss = np_utils.compute_RSS(current_prediction,output_test) print "\nQ8: k-nearest with optimal k, RSS on the TEST data: %s\n" % (rss) except Exception as details: print "Error >> %s" % details traceback.print_exc()
def main(): try: print "\n**********************************" print "* Ridge Regression Model *" print "**********************************\n" sales = gp.load_data('../../data_sets/kc_house_data.gl/') sales_q1 = sales.sort(['sqft_living','price']) quiz_1_ridge_regression(sales_q1) quiz_1_selecting_l2_penalty(sales_q1) quiz_2_ridge_grandient_descent(sales) except Exception as details: print "Error >> %s" % details traceback.print_exc()
def main(): try: print "\n**************************************" print "* Boosting Trees *" print "**************************************\n" loans = gp.load_data('../../data_sets/lending-club-data.gl/') # Remove bad_loands column loans['safe_loans'] = loans['bad_loans'].apply(lambda x:+1 if x == 0 else -1) loans = loans.remove_column('bad_loans') # Extract the feature columns and target column target = 'safe_loans' # prediction target (y) (+1 means safe, -1 is risky) quiz1_boosting_trees(loans,target) quiz2_adaboosting_trees(loans,target) except Exception as details: print (">> Exit or Errors \n%s, %s"%(details, traceback.print_exc()))
def main(): try: print "\n**********************************" print "* Polynomial Regression Model *" print "**********************************\n" sales = gp.load_data('../../data_sets/kc_house_data.gl/') train,test = sales.random_split(0.5,seed=0) set_1,set_2 = train.random_split(0.5,seed=0) set_3,set_4 = test.random_split(0.5,seed=0) list_of_degrees = [15] #[1,3,5,15] list_of_sets = [set_1,set_2,set_3,set_4] polynomial_regressions = get_polynomial_regression_by_sets(list_of_degrees, list_of_sets) print "\nQ1: power_15 for all four models:" pw_degree = 'power_15' for idx,sets in enumerate(list_of_sets): idx_set = 'set_%s' % (idx + 1) poly_n_coeff = polynomial_regressions[idx_set][pw_degree]['coefficients'] coeff_dict = gp.convert_sframe_to_simple_dict(poly_n_coeff,'name','value') print "\t- %s: %s"%(idx_set, coeff_dict[pw_degree]) print "\nQ2: fitted lines all look the same plots: FALSE" training, test_data = sales.random_split(0.9,seed=1) train_data, val_data = training.random_split(0.5,seed=1) best_degree, model_by_degree = select_polynomial_degree(train_data,val_data) print "\nQ3: the lowest RSS on Validation data is degree:%s" % best_degree data_n_test = reg.polynomial_sframe(test_data['sqft_living'],best_degree) data_n_test['price'] = test_data['price'] rss_n = reg.get_model_residual_sum_of_squares(model_by_degree[best_degree],data_n_test,data_n_test['price']) print "\nQ4: RSS on TEST with the degree:%s from Validation data is:%s" % (best_degree,rss_n) except Exception as details: print "Error >> %s" % details traceback.print_exc()
def main():
    try:
        people = gp.load_data('../../data_sets/people_wiki.gl/')

        # Create word-count & TF-IDF analytics columns
        people['word_count'] = gp.get_text_analytics_count(people['text'])
        people['tfidf'] = gp.get_text_analytics_tf_idf(people['word_count'])

        famous_people = ['Elton John', 'Victoria Beckham', 'Paul McCartney']  # Quiz
        # famous_people = ['Barack Obama', 'Bill Clinton', 'David Beckham', 'Taylor Swift', 'George Clooney']
        people_info = {}
        for person in famous_people:
            people_info[person] = people[people['name'] == person]
            people_info['%s table' % person] = stack_columns_to_table(
                people_info[person], 'word_count', ['word', 'count'])
            people_info['%s tfidf' % person] = stack_columns_to_table(
                people_info[person], 'tfidf', ['word', 'tfidf'],
                sort_by='tfidf')

        # 1) Person: 'Elton John'. What are the 3 words in his article
        # with the highest word counts? And with the highest TF-IDF?
        name = 'Elton John'
        print "Person: %s" % name
        print "\nQ1: Highest word counts = %s" % (people_info['%s table' % name])
        print "\nQ2: Top TF-IDF = %s" % (people_info['%s tfidf' % name])

        # 2) What's the cosine distance between the articles on:
        dist1 = 'Elton John_vs_Victoria Beckham'
        dist2 = 'Elton John_vs_Paul McCartney'
        cos_distances = calculate_cos_distance(name, famous_people, people_info)
        print "\nQ3: %s: %s" % (dist1, cos_distances[dist1])
        print "\nQ4: %s: %s" % (dist2, cos_distances[dist2])
        print "\nQ5: closest to 'Elton John' is 'Paul McCartney'"
        # cos_distances = calculate_cos_distance('Barack Obama', famous_people, people_info)
        # for dist in cos_distances.keys():
        #     print "%s: %s" % (dist, cos_distances[dist])

        # 6) Now build two nearest-neighbors models:
        #    - using word counts as features
        #    - using TF-IDF as features
        #    both with the distance function set to cosine similarity
        knn_model_word_count = gp.create_nearest_neighbors_model(
            people, features=['word_count'], label='name', distance='cosine')
        knn_model_tfidf = gp.create_nearest_neighbors_model(
            people, features=['tfidf'], label='name', distance='cosine')

        # What's the most similar article, other than itself, to
        # 'Elton John' & 'Victoria Beckham' using word-count features?
        # And using TF-IDF features?
        print "Find the nearest neighbor of:"
        count_qs = 6
        for name in ['Elton John', 'Victoria Beckham']:
            query_min_knn_distance(name, people_info, knn_model_word_count,
                                   'raw_model', count_qs)
            count_qs += 1
            query_min_knn_distance(name, people_info, knn_model_tfidf,
                                   'tfidf_model', count_qs)
            count_qs += 1
    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
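
# A minimal sketch of `stack_columns_to_table`, assuming it unpacks a dict
# column (e.g. word -> count) into a two-column SFrame and sorts it descending
# so the top words come first.
def stack_columns_to_table(person_row, column, new_names, sort_by=None):
    table = person_row[[column]].stack(column, new_column_name=new_names)
    return table.sort(sort_by or new_names[1], ascending=False)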
def main(): try: print "\n**************************************" print "* Precision & Recall *" print "**************************************\n" products = gp.load_data('../../data_sets/amazon_baby.gl/') # Remove punctuation. review_clean = products['review'].apply(gp.remove_punctuation) # Count words products['word_count'] = gp.graphlab.text_analytics.count_words( review_clean) # Drop neutral sentiment reviews. products = products[products['rating'] != 3] # Positive sentiment to +1 and negative sentiment to -1 products['sentiment'] = products['rating'].apply(lambda rating: +1 if rating > 3 else -1) train_data, test_data = products.random_split(.8, seed=1) model = gp.graphlab.logistic_classifier.create(train_data, target='sentiment', features=['word_count'], validation_set=None, verbose=False) accuracy = model.evaluate(test_data, metric='accuracy')['accuracy'] baseline = len(test_data[test_data['sentiment'] == 1]) / len(test_data) print "\nQ1: YES, logistic regression model was better than the baseline (majority class classifier)" print "\tBaseline: %s" % accuracy print "\tReg-model: %s" % baseline confusion_matrix = model.evaluate( test_data, metric='confusion_matrix')['confusion_matrix'] print confusion_matrix false_positives = 1443 print "\nQ2: False positives: (-1)(+1): %s" % false_positives false_negatives = 1406 print "\nQ3: Cost associated with the logistic regression: $%s" % ( false_negatives + 100 * false_positives) true_positives = 26689 print "\nQ4: Fracion of false positives: %s" % round( (false_positives / float(true_positives)), 2) print "\nQ5: Increase threshold for predicting the positive class (y^=+1)" print "\nQ6: Fracion of false positives: %s" % round( (false_negatives / float(true_positives)), 2) print "\nQ7: classifier that predicts +1 for all data points has recall= 1" precision_and_recall_threshold(model, test_data) precision_and_recall_plot(model, test_data) baby_reviews = test_data[test_data['name'].apply( lambda x: 'baby' in x.lower())] probabilities_baby = model.predict(baby_reviews, output_type='probability') threshold_values_baby = np_utils.np.linspace(0.5, 1, num=100) precision_all_baby, recall_all_baby = get_all_precisions_and_recall( baby_reviews, probabilities_baby, threshold_values_baby) threshold_small_baby = find_smallest_threshold(precision_all_baby, threshold_values_baby) print "\nQ12: smallest threshold-baby value that achieves a precision of 96.5 or better is: %s" % round( threshold_small_baby, 3) print "\nQ13: threshold value is larger: than the threshold used for the entire dataset" output_file = '../graphs/Precision_recall_curve_baby.png' np_plot.plot_pr_curve(precision_all_baby, recall_all_baby, "Precision-Recall (Baby)", output_file) except Exception as details: print(">> Exit or Errors \n%s, %s" % (details, traceback.print_exc()))
def main(): try: print "\n**************************************" print "* Online Learning *" print "**************************************\n" products = gp.load_data('../../data_sets/amazon_baby_subset.gl/') important_words = gp.load_json_file( '../../data_sets/important_words.json') # Remove Punctuation products['review_clean'] = products['review'].apply( gp.remove_punctuation) # Add important_words and its number of ocurrences per review for word in important_words: products[word] = products['review_clean'].apply( lambda s: s.split().count(word)) # print products[:10] train_data, validation_data = products.random_split(.9, seed=1) feature_matrix_train, sentiment_train = np_utils.get_numpy_data( train_data, important_words, 'sentiment') feature_matrix_valid, sentiment_valid = np_utils.get_numpy_data( validation_data, important_words, 'sentiment') print "\nQ1: stochastic gradient ascent affect the number of features NOT: Stays the same" print "\nQ2: llA (w) = (1/N) * ll(w) --> only add (1/N)" print "\nQ3: dli(w)/dwj is a --> scalar" print "\nQ4: dli(w)/dwj (minibatch) is a: scalar" print "\nQ5: to have the same as the full gradient set B=N (size of train_data): %s" % len( train_data) print "\nQ6: logistic_regression_SG act as a standard gradient ascent when B=N (size of train_data): %s" % len( train_data) lg = cl_utils.LogisticRregStochastic() coefficients, log_likelihood = lg.logistic_regression_SG( feature_matrix_train, sentiment_train, initial_coefficients=np_utils.np.zeros(194), step_size=5e-1, batch_size=1, max_iter=10, verbose=False) print "\nQ7: set batch_size = 1, as each iteration passes, the average log likelihood in the batch: Fluctuates" # print coefficients coefficients_batch, log_likelihood_batch = lg.logistic_regression_SG( feature_matrix_train, sentiment_train, initial_coefficients=np_utils.np.zeros(194), step_size=5e-1, batch_size=len(feature_matrix_train), max_iter=200, verbose=False) print "\nQ8: set batch_size = 47780, as each iteration passes, the average log likelihood in the batch: Increases" # print coefficients_batch print "\nQ9: gradient updates are performed at the end of two passes ((2*50000)/100.0) = %s" % ( (2 * 50000) / 100.0) # log_likelihood_metrics(lg,feature_matrix_train,sentiment_train) plot_stochastic_and_batch(lg, feature_matrix_train, sentiment_train, log_likelihood_batch) print "\nQ10: passes needed to achieve a similar log likelihood as stochastic gradient ascent: 150 passes or more" # effects_of_step_size(lg,feature_matrix_train,sentiment_train,train_data) print "\nQ11: worst step size is: 1e2" print "\nQ12: best step size is: 1e0" except Exception as details: print(">> Exit or Errors \n%s, %s" % (details, traceback.print_exc()))
def week2_summary(sales, train_data, test_data):
    # Build a regression model with 1 feature -> 'sqft_living'
    sqft_model = gp.create_linear_regression(train_data,
                                             target='price',
                                             features=['sqft_living'])
    print 'Price test-mean: %s' % test_data['price'].mean()  # 543054.042563
    print 'Price model evaluate: %s' % sqft_model.evaluate(test_data)
    # {'max_error': 4143550.8825285914, 'rmse': 255191.02870527367}
    # import matplotlib.pyplot as plt
    # plt.plot(test_data['sqft_living'], test_data['price'], '.',
    #          test_data['sqft_living'], sqft_model.predict(test_data), '-')
    # plt.show()
    print 'model coefficients: %s\n' % sqft_model.get('coefficients')
    print 'column names: %s' % sales.column_names()
    # print "sales[my_features] %s\n" % sales[my_features].show()
    # sales[my_features].show()
    # sales.show(view='BoxWhisker Plot', x='zipcode', y='price')

    #******************
    #  CREATE MODEL   *
    #******************
    # Build a regression model with more features
    my_features = [
        'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode'
    ]
    print 'my_features: %s' % my_features
    print '\n1) CREATE model:'
    my_features_model = gp.create_linear_regression(train_data,
                                                    target='price',
                                                    features=my_features)

    #******************
    #  EVALUATE MODEL *
    #******************
    info_text = ['(1 feature)', '(more features)']
    models = [sqft_model, my_features_model]
    print '\n2) EVALUATE model:'
    evaluate_house_price_models(info_text, models, test_data)

    #******************
    #  PREDICT MODEL  *
    #******************
    # The first house we will use is considered an "average" house in Seattle.
    house1 = sales[sales['id'] == '5309101200']
    print '\n3) PREDICT model:'
    print '\nhouse1: %s' % house1['price']
    predict_house_price_models(info_text, models, house1)
    house2 = sales[sales['id'] == '1925069082']
    print '\nhouse2: %s' % house2['price']
    predict_house_price_models(info_text, models, house2)
    bill_gates = {
        'bedrooms': [8],
        'bathrooms': [25],
        'sqft_living': [50000],
        'sqft_lot': [225000],
        'floors': [4],
        'zipcode': ['98039'],
        'condition': [10],
        'grade': [10],
        'waterfront': [1],
        'view': [4],
        'sqft_above': [37500],
        'sqft_basement': [12500],
        'yr_built': [1994],
        'yr_renovated': [2010],
        'lat': [47.627606],
        'long': [-122.242054],
        'sqft_living15': [5000],
        'sqft_lot15': [40000]
    }
    # The model receives an SFrame, not a dict
    house_bill_gates = gp.load_data(bill_gates)
    print '\nhouse-Bill-Gates'
    predict_house_price_models(info_text, models, house_bill_gates)
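
# Minimal sketches of the evaluate/predict helpers used above, assuming they
# just iterate over the models, printing each label from `info_text` next to
# the model's evaluation metrics or prediction, and returning the evaluations
# (which is consistent with the two-value unpacking elsewhere in this repo).
def evaluate_house_price_models(info_text, models, test_data):
    results = []
    for text, model in zip(info_text, models):
        evaluation = model.evaluate(test_data)
        print '\t- Model %s evaluate: %s' % (text, evaluation)
        results.append(evaluation)
    return results

def predict_house_price_models(info_text, models, house):
    for text, model in zip(info_text, models):
        print '\t- Model %s predicts: %s' % (text, model.predict(house))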
def main():
    try:
        # Load data
        products = gp.load_data('../../data_sets/amazon_baby.gl/')
        # Create word-count column
        products['word_count'] = gp.get_text_analytics_count(
            products['review'])
        # Select a group of words
        selected_words = [
            'awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible',
            'bad', 'terrible', 'awful', 'wow', 'hate'
        ]

        # Q1: Out of the 11 words in selected_words, which one
        # is most used in the reviews in the dataset?
        products, most_used = select_most_used_word(products, selected_words,
                                                    products['word_count'])
        key_most_used = gp.find_key_max(most_used)
        key_least_used = gp.find_key_min(most_used)
        print "\nQ1: Most used: %s = %s" % (key_most_used,
                                            most_used[key_most_used])
        print "\nQ2: Least used: %s = %s" % (key_least_used,
                                             most_used[key_least_used])

        # Ignore all 3-star reviews (to remove unknown ratings)
        products = products[products['rating'] != 3]
        # Positive sentiment = 4-star or 5-star reviews
        products['sentiment'] = products['rating'] >= 4
        # Split data
        train_data, test_data = products.random_split(.8, seed=0)

        #***************
        # Create Model *
        #***************
        # Logistic model with selected_words as features
        selected_model = gp.create_logistic_classifier_model(
            train_data,
            target='sentiment',
            features=selected_words,
            validation_set=test_data)
        # Logistic model with word_count as features
        sentiment_model = gp.create_logistic_classifier_model(
            train_data,
            target='sentiment',
            features=['word_count'],
            validation_set=test_data)

        # Get the coefficient weights
        coefficients = selected_model['coefficients'].sort('value',
                                                           ascending=False)
        # print coefficients.print_rows(12)
        # Out of the 11 words in selected_words, which one got the most
        # positive/negative weight in the selected_words model?
        weights_by_word = gp.convert_sframe_to_simple_dict(
            coefficients, 'name', 'value')
        print gp.find_key_max(weights_by_word)
        print "\nQ3: Most positive (w): love"
        print "\nQ4: Most negative (w): terrible"

        #*****************
        # Evaluate Model *
        #*****************
        # Which of the following ranges contains the accuracy
        # of the selected_words_model on the test_data?
        results_selected = selected_model.evaluate(test_data)
        results_sentiment = sentiment_model.evaluate(test_data)
        print "\nQ5: Accuracy selected-model: %s" % results_selected[
            'accuracy']  # 0.843111938506
        print "\nQ6: Accuracy sentiment-model: %s" % results_sentiment[
            'accuracy']  # 0.916256305549

        #****************
        # Predict Model *
        #****************
        # Which of the following ranges contains the predicted_sentiment for
        # the most positive review for 'Baby Trend Diaper Champ',
        # according to the sentiment_model?
        diaper_champ_reviews = products[products['name'] ==
                                        'Baby Trend Diaper Champ']
        diaper_champ_reviews['predicted_selected'] = selected_model.predict(
            diaper_champ_reviews, output_type='probability')
        diaper_champ_reviews['predicted_sentiment'] = sentiment_model.predict(
            diaper_champ_reviews, output_type='probability')
        most_positive_review = diaper_champ_reviews.sort('predicted_selected',
                                                         ascending=False)
        most_positive_sentiment = diaper_champ_reviews.sort(
            'predicted_sentiment', ascending=False)
        print "\nQ9: predicted_selected most positive review: %s" % max(
            most_positive_review['predicted_selected'])
        print "\nQ10: predicted_sentiment most positive review: %s" % max(
            most_positive_sentiment['predicted_sentiment'])
    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
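
# A minimal sketch of `select_most_used_word`, assuming it adds one count
# column per selected word (pulled out of the 'word_count' dict column) and
# also returns the total count of each word across the dataset.
def select_most_used_word(products, selected_words, word_counts):
    totals = {}
    for word in selected_words:
        products[word] = word_counts.apply(lambda wc: wc.get(word, 0))
        totals[word] = products[word].sum()
    return products, totals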
def main(): try: print "\n**********************************" print "* Predicting sentiment *" print "**********************************\n" products = gp.load_data('../../data_sets/amazon_baby.gl/') #Remove punction - built-in python string review_without_punctuation = products['review'].apply( gp.remove_punctuation) products['word_count'] = gp.graphlab.text_analytics.count_words( review_without_punctuation) # Ignore neutral-ratings == 3 products = products[products['rating'] != 3] # Create Sentiment column # for every rating in products-data set. Rating > 3 (positive +1) otherwise (negative -1) products['sentiment'] = products['rating'].apply(lambda x: +1 if x > 3 else -1) # 1) Train a Logistic-Classifier model train_data, test_data = products.random_split(.8, seed=1) sentiment_model = gp.create_logistic_classifier_model( train_data, target='sentiment', features=['word_count']) weights = sentiment_model.coefficients num_positive_weights = len(weights[weights['value'] >= 0]) num_negative_weights = len(weights[weights['value'] < 0]) print "\nQ1: How many weights are greater >= 0 is: %s" % ( num_positive_weights) my_predictions = predict_scores_simple_data(sentiment_model, test_data) print "\nQ2: Lowest probability of being classified House: %s" % ( np_utils.np.argmin(my_predictions) + 1) test_data["probability"] = sentiment_model.predict( test_data, output_type='probability') # Compare accuracy in TEST data accuracy_sent_test = quiz1_make_predictions_logistic_regression( sentiment_model, test_data) simple_model = quiz1_learn_classifier_fewer_words( train_data, test_data) accuracy_simp_test = cl_utils.get_model_classification_accuracy( simple_model, test_data, test_data['sentiment']) # Compare accuracy in TRAINING data accuracy_sent_train = cl_utils.get_model_classification_accuracy( sentiment_model, train_data, train_data['sentiment']) accuracy_simp_train = cl_utils.get_model_classification_accuracy( simple_model, train_data, train_data['sentiment']) print "\nQ9: Accuracy on TRAINING data sentiment:%s vs simple:%s" % ( accuracy_sent_train, accuracy_simp_train) print "\nQ10: Accuracy on TEST data sentiment:%s vs simple:%s" % ( accuracy_sent_test, accuracy_simp_test) accuracy_majority = cl_utils.get_majority_class_accuracy( sentiment_model, test_data) print "\nQ11: Accuracy of the majority class classifier model: %s" % ( accuracy_majority) except Exception as details: print(">> Exit or Errors \n%s, %s" % (details, traceback.print_exc()))