Example #1
def main():
    try:
        image_train = gp.load_data('../../data_sets/image_train_data/')
        image_test = gp.load_data('../../data_sets/image_test_data/')

        #1) Computing summary statistics of the data:
        label_col = image_train['label'].sketch_summary()
        # print label_col
        print "\nQ1: least common category: 'bird'"

        #2) Creating category-specific image retrieval models:
        categories = ['automobile', 'cat', 'dog', 'bird']
        train_labels = create_train_labels(image_train, categories)
        # print train_labels.keys()
        knn_models = create_labels_nearest_neighbors_model(
            train_labels, categories)

        cat_test_query = image_test[0:1]
        # cat_test_query['image'].show() # in IPython this shows the image in the browser
        cat_query, cat_distance = get_nearest_distance_id_query(
            knn_models, 'cat', cat_test_query)
        print "\nQ2: nearest 'cat' labeled image id: %s" % cat_distance
        # # train_labels['cat'][train_labels['cat']['id'] == 16289]['image'].show()

        dog_query, dog_distance = get_nearest_distance_id_query(
            knn_models, 'dog', cat_test_query)
        print "\nQ3: nearest 'dog' labeled image id: %s" % dog_distance
        # # train_labels['dog'][train_labels['dog']['id'] == 16976]['image'].show()

        #3) A simple example of nearest-neighbors classification:
        #The mean distance between this image and its nearest neighbors in the training data?
        print "\nQ4: 'cat' neighbors mean-distance: %s" % cat_query[
            'distance'].mean()
        print "\nQ5: 'dog' neighbors mean-distance: %s" % dog_query[
            'distance'].mean()
        print "\nQ6: in average 1st img in test data is closer to nearest neighbors in cat data"

        #4) Computing nearest neighbors accuracy using SFrame operations:
        test_labels = create_train_labels(image_test, categories)
        image_test_dog = test_labels['dog']
        labels_dog_distances = [
            'dog-automobile', 'dog-cat', 'dog-dog', 'dog-bird'
        ]

        dog_distances = get_label_distances(labels_dog_distances, knn_models,
                                            image_test_dog)
        # print 'Dog-distances: \n', dog_distances
        correct_dog_predictions = dog_distances.apply(is_dog_correct)
        # correct_dog_predictions.sketch_summary()
        # print correct_dog_predictions
        accuracy_1knn_dog = (correct_dog_predictions.sum() /
                             float(len(image_test_dog))) * 100
        print "\nQ7: accuracy of 1-knn classifying 'dog' img in test set: %% %.2f" % accuracy_1knn_dog

    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
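For reference, a minimal sketch of the is_dog_correct helper assumed above: it receives one row of dog_distances (a dict keyed by the labels in labels_dog_distances) and returns 1 when the 'dog-dog' model produced the smallest distance, i.e. the 1-NN classifier got the dog image right.

def is_dog_correct(row):
    # row is a dict like {'dog-automobile': d1, 'dog-cat': d2, 'dog-dog': d3, 'dog-bird': d4}
    return 1 if row['dog-dog'] == min(row.values()) else 0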
Example #2
def main():
    try:
        song_data = gp.load_data('../../data_sets/song_data.gl/')

        artist_list = [
            'Kanye West', 'Foo Fighters', 'Taylor Swift', 'Lady GaGa'
        ]
        count_uniques = counting_unique_users(song_data, artist_list)
        # print count_uniques

        #Which of the artists below has had the most unique users listening to their songs?
        print "\nQ1: Most unique users: %s" % (gp.find_key_max(count_uniques))

        #Which of the artists below is the most popular artist,
        # the one with highest total listen_count, in the data set
        listen_count = song_data.groupby(
            key_columns='artist',
            operations={
                'total_count': gp.graphlab.aggregate.SUM('listen_count')
            })

        # print listen_count.sort('total_count',ascending=False) #most listened / ascending=True) #least listened
        most_listen_count = gp.convert_sframe_to_simple_dict(
            listen_count, 'artist', 'total_count')
        print "\nQ2: Highest total listen: %s" % (
            gp.find_key_max(most_listen_count))
        print "\nQ3: Smallest total listen: %s" % (
            gp.find_key_min(most_listen_count))

    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
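A sketch of what counting_unique_users might look like: for each artist it counts the distinct users who listened to that artist. The 'artist' column is visible above; the 'user_id' column name is an assumption.

def counting_unique_users(song_data, artist_list):
    # map each artist to the number of distinct listeners
    counts = {}
    for artist in artist_list:
        listeners = song_data[song_data['artist'] == artist]['user_id'].unique()
        counts[artist] = len(listeners)
    return counts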
Example #3
def main():
    try:
        sales = gp.load_data('../../data_sets/home_data.gl/')
        train_data, test_data = gp.split_data(sales, 0.8)

        #week2_summary(sales,train_data,test_data)

        total_houses = sales.num_rows()
        print '\nData -Total (rows): %s' % total_houses

        #1. Selection and summary statistics
        avg_house = find_highest_house_price(sales)
        print '\n1) Highest average house price: $%s' % avg_house

        #2. Filtering Data
        num_houses_high = filter_data(sales)
        print '\n2) Selected Houses (sqft_living):%s' % num_houses_high

        #3. Building a regression model with several more features
        info_text = ['(my features)', '(advanced features)']
        models = build_regression_model(train_data)
        print '\n3) Building a regression model (++features)'
        evaluate1, evaluate2 = evaluate_house_price_models(
            info_text, models, test_data)

        print "\nAnswers:"
        print "\nQ1: %s" % avg_house
        print "\nQ2: %s" % (num_houses_high / float(total_houses))
        print "\nQ3: %s" % (evaluate1['rmse'] - evaluate2['rmse'])

    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
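One plausible sketch of find_highest_house_price, assuming it groups by zipcode, averages the price, and returns the largest average (the grouping key is an assumption; 'zipcode' and 'price' are column names used elsewhere in these examples).

def find_highest_house_price(sales):
    # average price per zipcode, then keep the highest average
    by_zip = sales.groupby(key_columns='zipcode',
                           operations={'avg_price': gp.graphlab.aggregate.MEAN('price')})
    return by_zip['avg_price'].max()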
def main():
    try:
        products = gp.load_data('../../data_sets/amazon_baby_subset.gl/')

        # Sentiment: Positives (+1) & Negative (-1) reviews
        # products['sentiment'] # [1, 1, 1, -1, -1, 1, ...]

        important_words = gp.load_json_file(
            '../../data_sets/important_words.json')
        # print len(important_words)

        # Remove Punctuation
        products['review_clean'] = products['review'].apply(
            gp.remove_punctuation)

        # Add a column for each word in important_words with its number of occurrences per review
        for word in important_words:
            products[word] = products['review_clean'].apply(
                lambda s: s.split().count(word))
        # print products[:10]

        lg_class = cl_utils.LogisticRegression()

        # quiz2_implementing_logistic_regression(products, important_words, lg_class)

        quiz3_logistic_regression_l2_penalty(products, important_words,
                                             lg_class)

    except Exception as details:
        print(">> Exit or Errors \n%s, %s" % (details, traceback.print_exc()))
Example #5
def main():

    # Week_1
    # Create/modify new columns in an SFrame
    sframe = gp.load_data('../data_sets/people-example.csv')
    # print sframe.show()
    print sframe.tail()
    # sframe['Country'] = sframe['Country'].apply(lambda x: 'United States' if x == 'USA' else x)
    sframe = gp.transform_column_entry(sframe, 'Country', 'USA',
                                       'United States')
    print 'New SFrame:\n', sframe
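The commented-out lambda above suggests what gp.transform_column_entry does; a sketch under that assumption.

def transform_column_entry(sframe, column, old_value, new_value):
    # replace every occurrence of old_value in the given column with new_value
    sframe[column] = sframe[column].apply(
        lambda entry: new_value if entry == old_value else entry)
    return sframe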
def main():
	try:
		sales = gp.load_data('../../data_sets/kc_house_data.gl/')
		train_data, test_data = gp.split_data(sales, 0.8)

		multiple_regression_model(train_data,test_data)

		gradient_descent_model(train_data,test_data)

	except Exception as details:
			print "Error >> %s" % details
			traceback.print_exc()
def main():
    try:
        sales = gp.load_data('../../data_sets/kc_house_data.gl/')
        train_data, test_data = gp.split_data(sales, 0.8)

        simple_reg = SimpleLinearRegression()

        print "\n**********************************"
        print "* Simple Linear Regression Model *"
        print "**********************************\n"
        sqft_intercept, sqft_slope = simple_reg.simple_linear_regression(
            train_data['sqft_living'], train_data['price'])
        bedroom_intercept, bedroom_slope = simple_reg.simple_linear_regression(
            train_data['bedrooms'], train_data['price'])
        print "Predicting house prices using:"
        print "\t- Square feet model: Intercept:%s  &  Slope:%s" % (
            sqft_intercept, sqft_slope)
        print "\t- Bedroom model:     Intercept:%s  &  Slope:%s" % (
            bedroom_intercept, bedroom_slope)

        print "\nQuiz (week_1):"
        my_house_sqft = 2650
        estimated_price = reg.get_regression_predictions(
            my_house_sqft, sqft_intercept, sqft_slope)
        print "\nQ1: Predicted price for a house with %s sqft: %s" % (
            my_house_sqft, estimated_price)

        rss_prices_on_sqft = simple_reg.get_residual_sum_of_squares(
            train_data['sqft_living'], train_data['price'], sqft_intercept,
            sqft_slope)
        print "\nQ2: RSS of predicted prices based on sqft is: %s" % rss_prices_on_sqft

        my_house_price = 800000
        estimated_squarefeet = simple_reg.inverse_regression_predictions(
            my_house_price, sqft_intercept, sqft_slope)
        print "\nQ3: Estimated sqft for a house worth $%d is: %.3f" % (
            my_house_price, estimated_squarefeet)

        # Compute RSS when using bedrooms on TEST data:
        rss_prices_on_bedroom_test = simple_reg.get_residual_sum_of_squares(
            test_data['bedrooms'], test_data['price'], bedroom_intercept,
            bedroom_slope)
        rss_prices_on_sqft_test = simple_reg.get_residual_sum_of_squares(
            test_data['sqft_living'], test_data['price'], sqft_intercept,
            sqft_slope)

        print "\nQ4: Which model (square feet or bedrooms) has lowest RSS on TEST data?"
        print "\t-> RSS (square feet): %s" % (rss_prices_on_sqrt_test)
        print "\t-> RSS (bedroom): %s" % (rss_prices_on_bedroom_test)

    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
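The SimpleLinearRegression class is defined elsewhere; the fit it presumably implements is the standard closed-form least-squares solution, sketched here for SArray inputs and returning (intercept, slope) in the order the caller above unpacks.

def simple_linear_regression(input_feature, output):
    # closed form: slope = sum((x - x_mean)(y - y_mean)) / sum((x - x_mean)^2)
    x_mean = input_feature.mean()
    y_mean = output.mean()
    slope = (((input_feature - x_mean) * (output - y_mean)).sum() /
             ((input_feature - x_mean) * (input_feature - x_mean)).sum())
    intercept = y_mean - slope * x_mean
    return intercept, slope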
def main():
    try:
        print "\n**********************************"
        print "*      Lasso Regression Model    *"
        print "**********************************\n"

        sales = gp.load_data('../../data_sets/kc_house_data.gl/')
        lasso = LassoRegression()

        quiz1_lasso_to_select_features(lasso, sales)

        quiz2_lasso_coordinate(lasso, sales)

    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
Example #9
def main():
	try:
		print "\n**********************************"
		print "*   k-nearest regression Model   *"
		print "**********************************\n"

		sales = gp.load_data('../../data_sets/kc_house_data_small.gl/')

		feature_list = ['bedrooms','bathrooms','sqft_living','sqft_lot','floors','waterfront','view','condition',
			'grade','sqft_above','sqft_basement','yr_built','yr_renovated','lat','long','sqft_living15','sqft_lot15']

		train_and_validation,test = sales.random_split(.8,seed=1)
		train,validation = train_and_validation.random_split(.8,seed=1)
		data_sets = get_normalized_datasets(train, test, validation, feature_list)
		features_train,features_test,features_valid,output_train,output_valid,output_test = data_sets
		query_10house = np_utils.get_euclidean_distance(features_test[0],features_train[9])
		print "\nQ1: Euclidean distance query vs 10th house (training): %s" % (round(query_10house,3))

		query_house = features_test[0]
		closest_dist = closest_distance(9,query_house,features_train)
		print "\nQ2: House closest to the query house (training): %s" % (closest_dist)

		close_dist_test = np_utils.get_euclidean_distance_matrix(features_train,features_test[2])
		# print close_dist_test
		print "\nQ3: House (training) closest to query house (test[2]): %s" % (np_utils.np.argmin(close_dist_test))

		print "\nQ4: Predicted value query train=%s vs test: %s" % (train['price'][382], test['price'][382])

		close_4h = np_utils.find_k_nearest_neighbors(4,features_train,features_test[2])
		print "\nQ5: 4 (training) houses closest to query house: %s" % (close_4h)

		predict_avg_houses = np_utils.single_prediction_k_nearest_neighbors(4,features_train,features_test[2], output_train)
		print "\nQ6: Predict the value of the query house (avg k-nearest): %s" % (predict_avg_houses)

		lowest_house, lowest_predict = lowest_predicted_house(10,features_train,features_test[:10],output_train)
		print "\nQ7: Index-house with query set with lowest predicted value: idx(%s):%s" % (lowest_house, lowest_predict)

		# plot_RSS_vs_validation_set(15,features_train,features_valid,output_train,output_valid)

		current_prediction = multiple_predictions_k_nearest_neighbors(8,features_train,features_test,output_train)
		rss = np_utils.compute_RSS(current_prediction,output_test)
		print "\nQ8: k-nearest with optimal k, RSS on the TEST data: %s\n" % (rss)

	except Exception as details:
		print "Error >> %s" % details
		traceback.print_exc()
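The np_utils helpers used above are external; a numpy sketch of the two core ones (single-pair Euclidean distance and brute-force k nearest neighbors), assuming the feature sets are 2-D arrays with one row per house.

import numpy as np

def get_euclidean_distance(query, reference):
    # distance between two single feature vectors
    return np.sqrt(np.sum((query - reference) ** 2))

def find_k_nearest_neighbors(k, features_train, query):
    # distances from the query to every training row, then the indices of the k smallest
    distances = np.sqrt(np.sum((features_train - query) ** 2, axis=1))
    return np.argsort(distances)[:k]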
def main():
	try:
		print "\n**********************************"
		print "*     Ridge Regression Model     *"
		print "**********************************\n"

		sales = gp.load_data('../../data_sets/kc_house_data.gl/')
		sales_q1 = sales.sort(['sqft_living','price'])

		quiz_1_ridge_regression(sales_q1)

		quiz_1_selecting_l2_penalty(sales_q1)

		quiz_2_ridge_grandient_descent(sales)

	except Exception as details:
		print "Error >> %s" % details
		traceback.print_exc()
def main():
	try:
		print "\n**************************************"
		print "*          Boosting Trees            *"
		print "**************************************\n"

		loans = gp.load_data('../../data_sets/lending-club-data.gl/')

		# Recode bad_loans as safe_loans, then remove the bad_loans column
		loans['safe_loans'] = loans['bad_loans'].apply(lambda x:+1 if x == 0 else -1)
		loans = loans.remove_column('bad_loans')

		# Extract the feature columns and target column
		target = 'safe_loans' # prediction target (y) (+1 means safe, -1 is risky)

		quiz1_boosting_trees(loans,target)
		quiz2_adaboosting_trees(loans,target)

	except Exception as details:
		print (">> Exit or Errors \n%s, %s"%(details, traceback.print_exc()))
Example #12
def main():
	try:
		print "\n**********************************"
		print "*  Polynomial Regression Model   *"
		print "**********************************\n"

		sales = gp.load_data('../../data_sets/kc_house_data.gl/')
		train,test = sales.random_split(0.5,seed=0)

		set_1,set_2 = train.random_split(0.5,seed=0)
		set_3,set_4 = test.random_split(0.5,seed=0)

		list_of_degrees = [15] #[1,3,5,15]
		list_of_sets = [set_1,set_2,set_3,set_4]
		polynomial_regressions = get_polynomial_regression_by_sets(list_of_degrees, list_of_sets)

		print "\nQ1: power_15 for all four models:"
		pw_degree = 'power_15'
		for idx,sets in enumerate(list_of_sets):
			idx_set = 'set_%s' % (idx + 1)
			poly_n_coeff = polynomial_regressions[idx_set][pw_degree]['coefficients']
			coeff_dict = gp.convert_sframe_to_simple_dict(poly_n_coeff,'name','value')
			print "\t- %s: %s"%(idx_set, coeff_dict[pw_degree])

		print "\nQ2: fitted lines all look the same plots: FALSE"

		training, test_data = sales.random_split(0.9,seed=1)
		train_data, val_data = training.random_split(0.5,seed=1)

		best_degree, model_by_degree  = select_polynomial_degree(train_data,val_data)
		print "\nQ3: the lowest RSS on Validation data is degree:%s" % best_degree

		data_n_test = reg.polynomial_sframe(test_data['sqft_living'],best_degree)
		data_n_test['price'] = test_data['price']
		rss_n = reg.get_model_residual_sum_of_squares(model_by_degree[best_degree],data_n_test,data_n_test['price'])
		print "\nQ4: RSS on TEST with the degree:%s from Validation data is:%s" % (best_degree,rss_n)

	except Exception as details:
		print "Error >> %s" % details
		traceback.print_exc()
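reg.polynomial_sframe is assumed to build one column per power of the input feature (power_1 .. power_degree); a sketch under that assumption.

def polynomial_sframe(feature, degree):
    # feature is an SArray; each extra column is the feature raised to a higher power
    poly_sframe = gp.graphlab.SFrame()
    poly_sframe['power_1'] = feature
    for power in range(2, degree + 1):
        poly_sframe['power_%s' % power] = feature.apply(lambda x, p=power: x ** p)
    return poly_sframe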
Example #13
def main():
    try:
        people = gp.load_data('../../data_sets/people_wiki.gl/')

        #Create Word Count & TF_IDF analytics count
        people['word_count'] = gp.get_text_analytics_count(people['text'])
        people['tfidf'] = gp.get_text_analytics_tf_idf(people['word_count'])

        famous_people = ['Elton John', 'Victoria Beckham',
                         'Paul McCartney']  #Quiz
        # famous_people = ['Barack Obama', 'Bill Clinton', 'David Beckham', 'Taylor Swift', 'George Clooney']
        people_info = {}

        for person in famous_people:
            people_info[person] = people[people['name'] == person]
            people_info['%s table' % person] = stack_columns_to_table(
                people_info[person], 'word_count', ['word', 'count'])
            people_info['%s tfidf' % person] = stack_columns_to_table(
                people_info[person],
                'tfidf', ['word', 'tfidf'],
                sort_by='tfidf')

        # 1) Person: 'Elton John'. What are the 3 words in his articles
        # with the highest word counts? And with the highest TF-IDF?
        name = 'Elton John'
        print "Person: %s" % name
        print "\nQ1: Highest word counts = %s" % (people_info['%s table' %
                                                              name])
        print "\nQ2: Top TF-IDF= %s" % (people_info['%s table' % name])

        # 2) What's the cosine distance between the articles for the pairs below?
        dist1 = 'Elton John_vs_Victoria Beckham'
        dist2 = 'Elton John_vs_Paul McCartney'

        cos_distances = calculate_cos_distance(name, famous_people,
                                               people_info)

        print "\nQ3: %s: %s" % (dist1, cos_distances[dist1])
        print "\nQ4: %s: %s" % (dist2, cos_distances[dist2])
        print "\nQ5: closer to 'Elton John is Paul McCartney"

        # cos_distances = calculate_cos_distance('Barack Obama', famous_people, people_info)
        # for dist in cos_distances.keys():
        # 	print "%s: %s"%(dist, cos_distances[dist])

        # 6) Now, you will build two nearest neighbors models:
        # Using word counts as features
        # Using TF-IDF as features
        # set the distance function to cosine similarity

        knn_model_word_count = gp.create_nearest_neighbors_model(
            people, features=['word_count'], label='name', distance='cosine')
        knn_model_tfidf = gp.create_nearest_neighbors_model(people,
                                                            features=['tfidf'],
                                                            label='name',
                                                            distance='cosine')

        # What's the most similar article, other than itself, to
        # Elton John & Victoria Beckham using word count features? And TF-IDF features?
        print "Find the Nearest Neighbor of"
        count_qs = 6
        for name in ['Elton John', 'Victoria Beckham']:
            query_min_knn_distance(name, people_info, knn_model_word_count,
                                   'raw_model', count_qs)
            count_qs += 1
            query_min_knn_distance(name, people_info, knn_model_tfidf,
                                   'tfidf_model', count_qs)
            count_qs += 1

    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
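calculate_cos_distance is assumed to compare the named person's TF-IDF dictionary against each other person's using GraphLab's cosine distance; a sketch of that helper.

def calculate_cos_distance(name, famous_people, people_info):
    # cosine distance between the named person's tfidf dict and everyone else's
    distances = {}
    source_tfidf = people_info[name]['tfidf'][0]
    for other in famous_people:
        if other == name:
            continue
        other_tfidf = people_info[other]['tfidf'][0]
        distances['%s_vs_%s' % (name, other)] = gp.graphlab.distances.cosine(
            source_tfidf, other_tfidf)
    return distances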
Example #14
def main():
    try:
        print "\n**************************************"
        print "*        Precision & Recall          *"
        print "**************************************\n"

        products = gp.load_data('../../data_sets/amazon_baby.gl/')

        # Remove punctuation.
        review_clean = products['review'].apply(gp.remove_punctuation)

        # Count words
        products['word_count'] = gp.graphlab.text_analytics.count_words(
            review_clean)

        # Drop neutral sentiment reviews.
        products = products[products['rating'] != 3]

        # Positive sentiment to +1 and negative sentiment to -1
        products['sentiment'] = products['rating'].apply(lambda rating: +1
                                                         if rating > 3 else -1)
        train_data, test_data = products.random_split(.8, seed=1)
        model = gp.graphlab.logistic_classifier.create(train_data,
                                                       target='sentiment',
                                                       features=['word_count'],
                                                       validation_set=None,
                                                       verbose=False)
        accuracy = model.evaluate(test_data, metric='accuracy')['accuracy']
        baseline = len(test_data[test_data['sentiment'] == 1]) / float(len(test_data))

        print "\nQ1: YES, logistic regression model was better than the baseline (majority class classifier)"
        print "\tBaseline: %s" % accuracy
        print "\tReg-model: %s" % baseline
        confusion_matrix = model.evaluate(
            test_data, metric='confusion_matrix')['confusion_matrix']
        print confusion_matrix
        false_positives = 1443
        print "\nQ2: False positives: (-1)(+1): %s" % false_positives
        false_negatives = 1406
        print "\nQ3: Cost associated with the logistic regression: $%s" % (
            false_negatives + 100 * false_positives)
        true_positives = 26689
        print "\nQ4: Fracion of false positives: %s" % round(
            (false_positives / float(true_positives)), 2)
        print "\nQ5: Increase threshold for predicting the positive class (y^=+1)"
        print "\nQ6: Fracion of false positives: %s" % round(
            (false_negatives / float(true_positives)), 2)
        print "\nQ7: classifier that predicts +1 for all data points has recall= 1"

        precision_and_recall_threshold(model, test_data)

        precision_and_recall_plot(model, test_data)

        baby_reviews = test_data[test_data['name'].apply(
            lambda x: 'baby' in x.lower())]
        probabilities_baby = model.predict(baby_reviews,
                                           output_type='probability')
        threshold_values_baby = np_utils.np.linspace(0.5, 1, num=100)
        precision_all_baby, recall_all_baby = get_all_precisions_and_recall(
            baby_reviews, probabilities_baby, threshold_values_baby)
        threshold_small_baby = find_smallest_threshold(precision_all_baby,
                                                       threshold_values_baby)
        print "\nQ12: smallest threshold-baby value that achieves a precision of 96.5 or better is: %s" % round(
            threshold_small_baby, 3)
        print "\nQ13: threshold value is larger: than the threshold used for the entire dataset"
        output_file = '../graphs/Precision_recall_curve_baby.png'
        np_plot.plot_pr_curve(precision_all_baby, recall_all_baby,
                              "Precision-Recall (Baby)", output_file)

    except Exception as details:
        print(">> Exit or Errors \n%s, %s" % (details, traceback.print_exc()))
def main():
    try:
        print "\n**************************************"
        print "*          Online Learning           *"
        print "**************************************\n"

        products = gp.load_data('../../data_sets/amazon_baby_subset.gl/')
        important_words = gp.load_json_file(
            '../../data_sets/important_words.json')

        # Remove Punctuation
        products['review_clean'] = products['review'].apply(
            gp.remove_punctuation)

        # Add a column for each word in important_words with its number of occurrences per review
        for word in important_words:
            products[word] = products['review_clean'].apply(
                lambda s: s.split().count(word))
        # print products[:10]

        train_data, validation_data = products.random_split(.9, seed=1)
        feature_matrix_train, sentiment_train = np_utils.get_numpy_data(
            train_data, important_words, 'sentiment')
        feature_matrix_valid, sentiment_valid = np_utils.get_numpy_data(
            validation_data, important_words, 'sentiment')
        print "\nQ1: stochastic gradient ascent affect the number of features NOT: Stays the same"
        print "\nQ2: llA (w) = (1/N) * ll(w) --> only add (1/N)"
        print "\nQ3:  dli(w)/dwj is a --> scalar"
        print "\nQ4:  dli(w)/dwj (minibatch) is a: scalar"
        print "\nQ5: to have the same as the full gradient set B=N (size of train_data): %s" % len(
            train_data)
        print "\nQ6: logistic_regression_SG act as a standard gradient ascent when B=N (size of train_data): %s" % len(
            train_data)
        lg = cl_utils.LogisticRregStochastic()
        coefficients, log_likelihood = lg.logistic_regression_SG(
            feature_matrix_train,
            sentiment_train,
            initial_coefficients=np_utils.np.zeros(194),
            step_size=5e-1,
            batch_size=1,
            max_iter=10,
            verbose=False)
        print "\nQ7: set batch_size = 1, as each iteration passes, the average log likelihood in the batch:  Fluctuates"
        # print coefficients
        coefficients_batch, log_likelihood_batch = lg.logistic_regression_SG(
            feature_matrix_train,
            sentiment_train,
            initial_coefficients=np_utils.np.zeros(194),
            step_size=5e-1,
            batch_size=len(feature_matrix_train),
            max_iter=200,
            verbose=False)
        print "\nQ8: set batch_size = 47780, as each iteration passes, the average log likelihood in the batch:  Increases"
        # print coefficients_batch
        print "\nQ9: gradient updates are performed at the end of two passes  ((2*50000)/100.0) = %s" % (
            (2 * 50000) / 100.0)

        # log_likelihood_metrics(lg,feature_matrix_train,sentiment_train)

        plot_stochastic_and_batch(lg, feature_matrix_train, sentiment_train,
                                  log_likelihood_batch)
        print "\nQ10: passes  needed to achieve a similar log likelihood as stochastic gradient ascent: 150 passes or more"

        # effects_of_step_size(lg,feature_matrix_train,sentiment_train,train_data)
        print "\nQ11: worst step size is: 1e2"
        print "\nQ12: best step size is: 1e0"

    except Exception as details:
        print(">> Exit or Errors \n%s, %s" % (details, traceback.print_exc()))
Example #16
def week2_summary(sales, train_data, test_data):
    #Build a regression model with 1 feature -> 'sqft_living'
    sqft_model = gp.create_linear_regression(train_data,
                                             target='price',
                                             features=['sqft_living'])

    print 'Price test-mean: %s' % test_data['price'].mean()
    #543054.042563

    print 'Price model evaluate: %s' % sqft_model.evaluate(test_data)
    #{'max_error': 4143550.8825285914, 'rmse': 255191.02870527367}
    # import matplotlib.pyplot as plt
    # plt.plot(test_data['sqft_living'], test_data['price'],'.',
    # 		 test_data['sqft_living'], sqft_model.predict(test_data),'-')
    # plt.show()

    print 'model coefficients: %s\n' % sqft_model.get('coefficients')
    print 'columns name: %s' % sales.column_names()

    # print "sales[my_features] %s\n" % sales[my_features].show()
    # sales[my_features].show()
    # sales.show(view='BoxWhisker Plot', x='zipcode', y='price')

    #******************
    #   CREATE MODEL  *
    #******************
    #Build a regression model with more features
    my_features = [
        'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode'
    ]
    print 'my_features: %s' % my_features
    print '\n1) CREATE model:'
    my_features_model = gp.create_linear_regression(train_data,
                                                    target='price',
                                                    features=my_features)

    #******************
    #  EVALUATE MODEL *
    #******************
    info_text = ['(1 feature)', '(more features)']
    models = [sqft_model, my_features_model]
    print '\n2) EVALUATE model:'
    evaluate_house_price_models(info_text, models, test_data)

    #******************
    #  PREDICT MODEL  *
    #******************
    #The first house we will use is considered an "average" house in Seattle.
    house1 = sales[sales['id'] == '5309101200']
    print '\n3) PREDICT model:'
    print '\nhouse1:                      %s' % house1['price']
    predict_house_price_models(info_text, models, house1)

    house2 = sales[sales['id'] == '1925069082']
    print '\nhouse2:                      %s' % house2['price']
    predict_house_price_models(info_text, models, house2)

    bill_gates = {
        'bedrooms': [8],
        'bathrooms': [25],
        'sqft_living': [50000],
        'sqft_lot': [225000],
        'floors': [4],
        'zipcode': ['98039'],
        'condition': [10],
        'grade': [10],
        'waterfront': [1],
        'view': [4],
        'sqft_above': [37500],
        'sqft_basement': [12500],
        'yr_built': [1994],
        'yr_renovated': [2010],
        'lat': [47.627606],
        'long': [-122.242054],
        'sqft_living15': [5000],
        'sqft_lot15': [40000]
    }

    #model expects an SFrame, not a dict
    house_bill_gates = gp.load_data(bill_gates)
    print '\nhouse-Bill-Gates'
    predict_house_price_models(info_text, models, house_bill_gates)
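predict_house_price_models and evaluate_house_price_models are helpers defined elsewhere; a minimal sketch of the prediction one, looping the labels and models in parallel.

def predict_house_price_models(info_text, models, house):
    # print each model's predicted price for the given house (an SFrame with one row)
    for label, model in zip(info_text, models):
        print 'Predicted price %s: %s' % (label, model.predict(house))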
Example #17
def main():
    try:
        #Load Data
        products = gp.load_data('../../data_sets/amazon_baby.gl/')

        #Create word-count column
        products['word_count'] = gp.get_text_analytics_count(
            products['review'])

        #Select a group of words
        selected_words = [
            'awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible',
            'bad', 'terrible', 'awful', 'wow', 'hate'
        ]

        # Q1: Out of the 11 words in selected_words, which one
        # is most used in the reviews in the dataset?
        products, most_used = select_most_used_word(products, selected_words,
                                                    products['word_count'])

        key_most_used = gp.find_key_max(most_used)
        least_most_used = gp.find_key_min(most_used)
        print "\nQ1: Most used:%s = %s" % (key_most_used,
                                           most_used[key_most_used])
        print "\nQ2: Least used:%s = %s" % (least_most_used,
                                            most_used[least_most_used])

        #ignore all 3* reviews (to remove unknown rating)
        products = products[products['rating'] != 3]

        #positive sentiment = 4* or 5* reviews
        products['sentiment'] = products['rating'] >= 4

        #Split Data
        train_data, test_data = products.random_split(.8, seed=0)

        #***************
        # Create Model *
        #***************
        #Create Logistic Model (with selected-words as features)
        selected_model = gp.create_logistic_classifier_model(
            train_data,
            target='sentiment',
            features=selected_words,
            validation_set=test_data)
        #Create Logistic Model (with word_count as features)
        sentiment_model = gp.create_logistic_classifier_model(
            train_data,
            target='sentiment',
            features=['word_count'],
            validation_set=test_data)

        # Get weights of Coefficients
        coefficients = selected_model['coefficients'].sort('value',
                                                           ascending=False)
        # print coefficients.print_rows(12)

        #Out of the 11 words in selected_words, which one got
        # the most positive/negative weight in the selected_words_model
        print gp.find_key_max(gp.convert_sframe_to_simple_dict(coefficients, 'name', 'value'))
        print "\nQ3: Most Positive (w): love"
        print "\nQ4: Most Negative (w): terrible"

        #*****************
        # Evaluate Model *
        #*****************
        #Which of the following ranges contains the accuracy
        # of the selected_words_model on the test_data
        results_selected = selected_model.evaluate(test_data)
        results_sentiment = sentiment_model.evaluate(test_data)
        print "\nQ5: Accuracy selected-model: %s" % results_selected[
            'accuracy']  #0.843111938506
        print "\nQ6: Accuracy sentiment-model: %s" % results_sentiment[
            'accuracy']  #0.916256305549

        #****************
        # Predict Model *
        #****************
        #Which of the following ranges contains the predicted_sentiment for the most positive review
        # for Baby Trend Diaper Champ, according to the sentiment_model ?
        diaper_champ_reviews = products[products['name'] ==
                                        'Baby Trend Diaper Champ']
        diaper_champ_reviews['predicted_selected'] = selected_model.predict(
            diaper_champ_reviews, output_type='probability')
        diaper_champ_reviews['predicted_sentiment'] = sentiment_model.predict(
            diaper_champ_reviews, output_type='probability')

        most_positive_review = diaper_champ_reviews.sort('predicted_selected',
                                                         ascending=False)
        most_positive_sentiment = diaper_champ_reviews.sort(
            'predicted_sentiment', ascending=False)

        print "\nQ9: predicted_selected most positive review: %s" % max(
            most_positive_review['predicted_selected'])
        print "\nQ10: predicted_sentiment most positive review: %s" % max(
            most_positive_sentiment['predicted_sentiment'])

    except Exception as details:
        print "Error >> %s" % details
        traceback.print_exc()
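A sketch of what select_most_used_word might do: add one count column per selected word and total it across all reviews; the return shape is an assumption based on how the result is used above.

def select_most_used_word(products, selected_words, word_counts):
    # word_counts is the 'word_count' column of dicts; one new column per selected word
    totals = {}
    for word in selected_words:
        products[word] = word_counts.apply(lambda counts, w=word: counts.get(w, 0))
        totals[word] = products[word].sum()
    return products, totals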
Example #18
def main():
    try:
        print "\n**********************************"
        print "*       Predicting sentiment     *"
        print "**********************************\n"

        products = gp.load_data('../../data_sets/amazon_baby.gl/')

        #Remove punctuation using built-in Python string methods
        review_without_punctuation = products['review'].apply(
            gp.remove_punctuation)
        products['word_count'] = gp.graphlab.text_analytics.count_words(
            review_without_punctuation)

        # Ignore neutral-ratings == 3
        products = products[products['rating'] != 3]

        # Create Sentiment column
        # for every rating in the products data set: rating > 3 is positive (+1), otherwise negative (-1)
        products['sentiment'] = products['rating'].apply(lambda x: +1
                                                         if x > 3 else -1)

        # 1) Train a Logistic-Classifier model
        train_data, test_data = products.random_split(.8, seed=1)
        sentiment_model = gp.create_logistic_classifier_model(
            train_data, target='sentiment', features=['word_count'])
        weights = sentiment_model.coefficients
        num_positive_weights = len(weights[weights['value'] >= 0])
        num_negative_weights = len(weights[weights['value'] < 0])

        print "\nQ1: How many weights are greater >= 0 is: %s" % (
            num_positive_weights)

        my_predictions = predict_scores_simple_data(sentiment_model, test_data)
        print "\nQ2: Lowest probability of being classified House: %s" % (
            np_utils.np.argmin(my_predictions) + 1)

        test_data["probability"] = sentiment_model.predict(
            test_data, output_type='probability')

        # Compare accuracy in TEST data
        accuracy_sent_test = quiz1_make_predictions_logistic_regression(
            sentiment_model, test_data)
        simple_model = quiz1_learn_classifier_fewer_words(
            train_data, test_data)
        accuracy_simp_test = cl_utils.get_model_classification_accuracy(
            simple_model, test_data, test_data['sentiment'])

        # Compare accuracy in TRAINING data
        accuracy_sent_train = cl_utils.get_model_classification_accuracy(
            sentiment_model, train_data, train_data['sentiment'])
        accuracy_simp_train = cl_utils.get_model_classification_accuracy(
            simple_model, train_data, train_data['sentiment'])

        print "\nQ9: Accuracy on TRAINING data  sentiment:%s vs simple:%s" % (
            accuracy_sent_train, accuracy_simp_train)
        print "\nQ10: Accuracy on TEST data  sentiment:%s vs simple:%s" % (
            accuracy_sent_test, accuracy_simp_test)

        accuracy_majority = cl_utils.get_majority_class_accuracy(
            sentiment_model, test_data)
        print "\nQ11: Accuracy of the majority class classifier model: %s" % (
            accuracy_majority)

    except Exception as details:
        print(">> Exit or Errors \n%s, %s" % (details, traceback.print_exc()))