def quiz3_logistic_regression_l2_penalty(products, important_words, lg_class):
    print "\n**************************************"
    print "*  Logistic Regression: L2 penalty   *"
    print "**************************************\n"
    train_data, validation_data = products.random_split(.8, seed=2)
    feature_matrix_train, sentiment_train = np_utils.get_numpy_data(
        train_data, important_words, 'sentiment')
    feature_matrix_valid, sentiment_valid = np_utils.get_numpy_data(
        validation_data, important_words, 'sentiment')

    table = get_table_with_logistic_model(lg_class, important_words,
                                          feature_matrix_train,
                                          sentiment_train)

    coefficients = list(table["coefficients [L2=0]"][1:])  # exclude intercept
    word_coefficient_tuples = [
        (word, coefficient)
        for word, coefficient in zip(important_words, coefficients)
    ]
    word_coefficient_tuples = sorted(word_coefficient_tuples,
                                     key=lambda x: x[1],
                                     reverse=True)
    # print word_coefficient_tuples[:5]
    positive_words = map(lambda x: x[0], word_coefficient_tuples[:5])
    negative_words = map(lambda x: x[0], word_coefficient_tuples[-5:])

    # print "Positive words: %s" % positive_words
    # print "Negative words: %s" % negative_words
    # quiz_word = ['love', 'disappointed', 'great', 'money', 'quality']

    print "\nQ1: feature_derivative_with_L2, regularize the intercept: NO"
    print "\nQ2: L2 regularization increase/decrease the log likelihood ll(w): DECREASE"
    print "\nQ3: words is not listed in either positive_words or negative_words: QUALITY"

    l2_penalty_list = [0, 4, 10, 1e2, 1e3, 1e5]
    output_file = '../graphs/Coefficient_vs_L2penalty.png'
    np_plot.make_coefficient_plot(table, positive_words, negative_words,
                                  l2_penalty_list, output_file)
    print "\nQ4: All coefficients consistently get smaller in size as the L2 penalty is increased -> TRUE"
    train_accuracy = create_accuracy_table(table, feature_matrix_train,
                                           sentiment_train)
    validation_accuracy = create_accuracy_table(table, feature_matrix_valid,
                                                sentiment_valid)
    print "\nComputing accuracy ....\n"
    for key in sorted(validation_accuracy.keys()):
        print "\tL2 penalty = %g" % key
        print "\ttrain accuracy = %s, validation_accuracy = %s" % (
            train_accuracy[key], validation_accuracy[key])
        print "\t--------------------------------------------------------------------------------"
    output_file2 = '../graphs/Classification_Accuracy_vs_L2penalty.png'
    np_plot.make_classsification_accuracy_plot(train_accuracy,
                                               validation_accuracy,
                                               output_file2)

    print "\nQ6: highest accuracy on the training data: L2 penalty = 0"
    print "\nQ7: highest accuracy on the validation data: L2 penalty = 4"
    print "\nQ8: highest accuracy on the training data imply that the model is the best one: NO"
def get_normalized_datasets(train, test, validation, feature_list):
	features_train,output_train = np_utils.get_numpy_data(train,feature_list,'price')
	features_test,output_test = np_utils.get_numpy_data(test,feature_list,'price')
	features_valid,output_valid = np_utils.get_numpy_data(validation,feature_list,'price')

	features_train,norms = np_utils.normalize_features(features_train) # normalize training set features (columns)
	features_test = features_test / norms # normalize test set by training set norms
	features_valid = features_valid / norms # normalize validation set by training set norms

	return (features_train, features_test, features_valid, output_train, output_valid, output_test)
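# A hedged sketch of what np_utils.normalize_features plausibly does (the real
# helper is not shown here): divide each column by its 2-norm and return the
# norms, so that get_normalized_datasets above can scale the test and
# validation columns by the *training* norms.
import numpy as np

def normalize_features_sketch(feature_matrix):
    norms = np.linalg.norm(feature_matrix, axis=0)  # one 2-norm per column
    return feature_matrix / norms, norms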
def quiz_2_ridge_gradient_descent(sales):
	print "\n**********************************"
	print "*     Ridge Gradient Descent     *"
	print "**********************************\n"

	simple_features = ['sqft_living']
	my_output = 'price'
	train_data,test_data = sales.random_split(.8,seed=0)
	(simple_feature_matrix,output) = np_utils.get_numpy_data(train_data,simple_features,my_output)
	(simple_test_feature_matrix,test_output) = np_utils.get_numpy_data(test_data,simple_features,my_output)

	ridge = RidgeRegression()
	l2_no_reg,l2_high_reg = 0,1e11
	initial_weights = np_utils.np.array([0.,0.])
	print "\nQ1 & Q2 coefficients with features: %s" % (simple_features)
	ridge_weights = compute_ridge_regression(ridge,simple_feature_matrix,output,[l2_no_reg,l2_high_reg],initial_weights)
	# print ridge_weights

	print "\nQ3: Line fit with no regularization (l2_penalty=0) is steeper"
	print "\nQ4: high regularization (l2_penalty=1e11)"
	compute_ridge_rss([ridge_weights[l2_high_reg]],simple_test_feature_matrix,test_data)
	print "\t- Between 5e14 and 8e14"

	more_features = ['sqft_living','sqft_living15']
	initial_w_morefeatures = np_utils.np.array([0.0,0.0,0.0])
	(more_feature_matrix,output_more_features) = np_utils.get_numpy_data(train_data,more_features,my_output)
	(more_test_feature_matrix,test_output_more) = np_utils.get_numpy_data(test_data,more_features,my_output)

	print "\nQ5 & Q6 coefficients with features: %s" % (more_features)
	ridge_morefeatures = compute_ridge_regression(ridge,more_feature_matrix,output_more_features,
		[l2_no_reg,l2_high_reg],initial_w_morefeatures)

	print "\nQ7: using all zero weights with features: %s" % (simple_features)
	compute_ridge_rss([initial_w_morefeatures],more_test_feature_matrix,test_data)
	print "\t-Between 1e15 and 3e15"

	num_of_house = 1  # compare predictions for the first house only
	print "\nQ8: Which model makes better predictions for the 1st house:"
	for l2_penalty in [l2_no_reg,l2_high_reg]:
		print "L2:%s:" % l2_penalty
		current_predictions = np_utils.predict_output(more_test_feature_matrix,ridge_morefeatures[l2_penalty])
		for house_predict in range(num_of_house):
			pred,real = current_predictions[house_predict],test_data['price'][house_predict]
			print '\t\t(predict) %s vs %s (real)  diff: %s' % (pred,real,real - pred)
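# Illustrative sketch of the per-feature derivative assumed inside
# RidgeRegression (the class is not shown; names here are hypothetical): the
# RSS part is 2 * dot(errors, feature), and the L2 part 2 * l2_penalty * weight
# is added for every weight except the intercept. A large l2_penalty (1e11)
# shrinks the slope, which is why the unregularized line in Q3 is steeper.
import numpy as np

def ridge_derivative_sketch(errors, feature, weight, l2_penalty,
                            feature_is_constant):
    # errors = predictions - output
    derivative = 2 * np.dot(errors, feature)
    if not feature_is_constant:
        derivative += 2 * l2_penalty * weight
    return derivative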
def quiz2_implementing_logistic_regression(products, important_words,
                                           lg_class):
    print "\n**************************************"
    print "*  Implementing Logistic Regression  *"
    print "**************************************\n"

    # flag reviews where the count of the word 'perfect' is >= 1
    products['contains_perfect'] = products['perfect'] >= 1
    print "\nQ1: # of reviews containing the word 'perfect': %s" % products[
        'contains_perfect'].sum()

    print "\nTransforming data to numpy arrays ...."
    feature_matrix, sentiment = np_utils.get_numpy_data(
        products, important_words, 'sentiment')
    n_samples, n_features = feature_matrix.shape
    print "\nQ2: # of features in the feature_matrix is: %s" % n_features

    #*******************
    # Logistic model
    #*******************
    print "\nCreating logistic model ...."
    coefficients = predict_coefficients_logistic_model(feature_matrix,
                                                       sentiment, lg_class)
    print "\nQ4: As each iteration of gradient ascent passes the log- likelihood: increases"

    predictions_yi, correctly_classified = compute_correct_score_predictions(
        feature_matrix, coefficients)
    print "\nQ5: reviews were predicted to have positive sentiment is: %s" % correctly_classified

    accuracy = compute_accuracy_of_the_model(predictions_yi, products)
    print "\nQ6: accuracy of the model on predictions %s" % round(accuracy, 2)

    word_coefficient_tuples = get_word_coeff_tuples(important_words,
                                                    coefficients)

    top_words = map(lambda x: x[0], word_coefficient_tuples[:10])
    select = list({'love', 'easy', 'great', 'perfect', 'cheap'} -
                  set(top_words))
    print "\nQ7: not present in the top 10 most positive words: %s" % select

    least_words = map(lambda x: x[0], word_coefficient_tuples[-10:])
    select_least = list({'need', 'work', 'disappointed', 'even'} -
                        set(least_words))
    print "\nQ8: not present in the top 10 most negative words: %s" % select_least
def more_features_with_lasso_coordinate(lasso, sales):
    train_data, test_data = sales.random_split(.8, seed=0)

    all_features = [
        'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
        'waterfront', 'view', 'condition', 'grade', 'sqft_above',
        'sqft_basement', 'yr_built', 'yr_renovated'
    ]

    feature_matrix_norm, train_output, train_norms = np_utils.get_normalized_data(
        train_data, all_features, 'price')

    initial_weights = np_utils.np.zeros(len(all_features) + 1)

    weights_info, nnz_features = {}, {}
    penalty_tolerance = [[1e7, 1.0], [1e8, 1.0], [1e4, 5e5]]
    penalty_str = {1e7: '1e7', 1e8: '1e8', 1e4: '1e4'}
    print "\nFeatures assigned for Q5,Q6,Q7:"

    for penalty, tolerance in penalty_tolerance:
        weights = lasso.lasso_cyclical_coordinate_descent(
            feature_matrix_norm, train_output, initial_weights, penalty,
            tolerance)
        # print weights
        weights_normalized = weights / train_norms
        weights_info[penalty] = weights_normalized
        dict_weights = dict(
            zip(['constant'] + all_features, weights_normalized))
        nnz_features[penalty] = filter(lambda x: dict_weights[x] != 0,
                                       dict_weights)
        print "\n\tL1_penalty_%s: %s" % (penalty_str[penalty],
                                         nnz_features[penalty])

    print "\nQ8: three models RSS on the TEST data:"
    test_feature_matrix, test_output = np_utils.get_numpy_data(
        test_data, all_features, 'price')
    for penalty, tolerance in penalty_tolerance:
        current_predictions = np_utils.predict_output(test_feature_matrix,
                                                      weights_info[penalty])
        RSS = reg.compute_RSS(current_predictions, test_output)
        print "\n\tL1_penalty_%s: %s" % (penalty_str[penalty], RSS)
def get_predictions(dataset, features, output, weights):
	feature_matrix, output = np_utils.get_numpy_data(dataset, features, output)
	predictions = np_utils.predict_output(feature_matrix, weights)
	return predictions
def calculate_weights(gradient, dataset, features, output, parameters):
	initial_weights,step_size,tolerance = parameters
	feature_matrix, output_data = np_utils.get_numpy_data(dataset,features,output)
	weights = gradient.regression_gradient_descent(feature_matrix,output_data,initial_weights,step_size,tolerance)
	return weights
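# Hedged sketch of the loop assumed behind gradient.regression_gradient_descent
# (the real class is not shown): step along the negative RSS gradient until
# the gradient magnitude drops below `tolerance`.
import numpy as np

def regression_gradient_descent_sketch(feature_matrix, output, initial_weights,
                                       step_size, tolerance):
    weights = np.array(initial_weights, dtype=float)
    while True:
        errors = np.dot(feature_matrix, weights) - output
        gradient = 2 * np.dot(feature_matrix.T, errors)  # dRSS/dw
        weights -= step_size * gradient
        if np.sqrt(np.dot(gradient, gradient)) < tolerance:
            return weights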
def main():
    try:
        print "\n**************************************"
        print "*          Online Learning           *"
        print "**************************************\n"

        products = gp.load_data('../../data_sets/amazon_baby_subset.gl/')
        important_words = gp.load_json_file(
            '../../data_sets/important_words.json')

        # Remove Punctuation
        products['review_clean'] = products['review'].apply(
            gp.remove_punctuation)

        # Add one column per important word with its number of occurrences in each review
        for word in important_words:
            products[word] = products['review_clean'].apply(
                lambda s: s.split().count(word))
        # print products[:10]

        train_data, validation_data = products.random_split(.9, seed=1)
        feature_matrix_train, sentiment_train = np_utils.get_numpy_data(
            train_data, important_words, 'sentiment')
        feature_matrix_valid, sentiment_valid = np_utils.get_numpy_data(
            validation_data, important_words, 'sentiment')
        print "\nQ1: stochastic gradient ascent affect the number of features NOT: Stays the same"
        print "\nQ2: llA (w) = (1/N) * ll(w) --> only add (1/N)"
        print "\nQ3:  dli(w)/dwj is a --> scalar"
        print "\nQ4:  dli(w)/dwj (minibatch) is a: scalar"
        print "\nQ5: to have the same as the full gradient set B=N (size of train_data): %s" % len(
            train_data)
        print "\nQ6: logistic_regression_SG act as a standard gradient ascent when B=N (size of train_data): %s" % len(
            train_data)
        lg = cl_utils.LogisticRregStochastic()
        coefficients, log_likelihood = lg.logistic_regression_SG(
            feature_matrix_train,
            sentiment_train,
            initial_coefficients=np_utils.np.zeros(194),
            step_size=5e-1,
            batch_size=1,
            max_iter=10,
            verbose=False)
        print "\nQ7: set batch_size = 1, as each iteration passes, the average log likelihood in the batch:  Fluctuates"
        # print coefficients
        coefficients_batch, log_likelihood_batch = lg.logistic_regression_SG(
            feature_matrix_train,
            sentiment_train,
            initial_coefficients=np_utils.np.zeros(194),
            step_size=5e-1,
            batch_size=len(feature_matrix_train),
            max_iter=200,
            verbose=False)
        print "\nQ8: set batch_size = 47780, as each iteration passes, the average log likelihood in the batch:  Increases"
        # print coefficients_batch
        print "\nQ9: gradient updates are performed at the end of two passes  ((2*50000)/100.0) = %s" % (
            (2 * 50000) / 100.0)

        # log_likelihood_metrics(lg,feature_matrix_train,sentiment_train)

        plot_stochastic_and_batch(lg, feature_matrix_train, sentiment_train,
                                  log_likelihood_batch)
        print "\nQ10: passes  needed to achieve a similar log likelihood as stochastic gradient ascent: 150 passes or more"

        # effects_of_step_size(lg,feature_matrix_train,sentiment_train,train_data)
        print "\nQ11: worst step size is: 1e2"
        print "\nQ12: best step size is: 1e0"

    except Exception as details:
        print ">> Exit or Errors\n%s\n%s" % (details, traceback.format_exc())