コード例 #1
0
def deliverable3(ratings_train, ratings_test):
    """
    Deliverable 3.
    --------------
    Sweep the number of latent dimensions d over
    {2, 4, 10, 20, 40, 50, 70, 100, 200} while every other SGD
    hyper-parameter stays fixed (alpha=0.01, eta=25, all four lambdas=0.1).

    For each d a fresh MFModel is built from ``ratings_train``, trained with
    SGD and evaluated against ``ratings_test``. The RMSE and the precision at
    K=5 are printed and written as one ``d,rmse,precision`` line to the
    latent-dimensions analysis file; once the sweep is done that file is
    handed to ``plots.deliverable3_plot``.
    """
    shared_lambda = 0.1
    precision_cutoff = 5
    console_template = "Dimension: {0}, RMSE: {1}, Precision@K: {2}"
    record_template = "{0},{1},{2}\n"

    with open(__latent_dimensions_analysis_output__, 'w') as results_file:
        print("Dimensions Analysis:")
        for dimension in (2, 4, 10, 20, 40, 50, 70, 100, 200):
            settings = algorithm.SGDHyperParameters(
                alpha=0.01,
                eta=25,
                lambda_user=shared_lambda,
                lambda_item=shared_lambda,
                lambda_bias_user=shared_lambda,
                lambda_bias_item=shared_lambda,
                latent_factor=dimension)
            untrained = model.MFModel(ratings_train, settings)
            trained = algorithm.LearnModelFromDataUsingSGD(
                ratings_train, untrained, settings)
            ranking = evaluation.get_ranked_items_list(trained, ratings_test)
            # Tuple order matches the placeholders of both templates:
            # dimension, RMSE, precision.
            metrics = (
                dimension,
                evaluation.root_mean_squared_error(ranking, ratings_test),
                evaluation.precision_at_K(ranking, ratings_test,
                                          precision_cutoff),
            )
            print(console_template.format(*metrics))
            results_file.write(record_template.format(*metrics))

    # The file is closed (and therefore flushed) before it is plotted.
    plots.deliverable3_plot(__latent_dimensions_analysis_output__)
コード例 #2
0
def deliverable4(ratings_train):
    """
    Deliverable 4.
    --------------
    Measure SGD training time as a function of the latent dimension d.

    Uses the same sweep of d and the same fixed hyper-parameters as
    deliverable 3 (alpha=0.01, eta=25, all four lambdas=0.1). Only the call
    to ``algorithm.LearnModelFromDataUsingSGD`` is timed; building the
    MFModel is outside the measured interval. Each ``d,training_time`` pair
    is printed and written to the latent-dimensions analysis file, which is
    then passed to ``plots.deliverable4_plot``.

    NOTE(review): this writes to the same output path that deliverable 3
    uses, so running it replaces that file's contents.
    """
    shared_lambda = 0.1
    console_template = "Dimension: {0},Training Time: {1}"
    record_template = "{0},{1}\n"

    with open(__latent_dimensions_analysis_output__, 'w') as results_file:
        print("Dimensions Runtime:")
        for dimension in (2, 4, 10, 20, 40, 50, 70, 100, 200):
            settings = algorithm.SGDHyperParameters(
                alpha=0.01,
                eta=25,
                lambda_user=shared_lambda,
                lambda_item=shared_lambda,
                lambda_bias_user=shared_lambda,
                lambda_bias_item=shared_lambda,
                latent_factor=dimension)
            candidate = model.MFModel(ratings_train, settings)

            # Time the training call only; its return value is not needed.
            started_at = time.clock()
            algorithm.LearnModelFromDataUsingSGD(ratings_train, candidate,
                                                 settings)
            elapsed = time.clock() - started_at

            print(console_template.format(dimension, elapsed))
            results_file.write(record_template.format(dimension, elapsed))

    plots.deliverable4_plot(__latent_dimensions_analysis_output__)
コード例 #3
0
def deliverable2(ratings_train, ratings_test):
    """
    Deliverable 2.
    --------------
    Sweep a single shared regularization value lambda, used for the user,
    item, user-bias and item-bias terms alike.

    Lambda starts at 0.1 and is multiplied by 10 for as long as it stays
    <= 1000. All other SGD hyper-parameters are fixed (alpha=0.01, eta=25,
    latent_factor=20). For every lambda a fresh MFModel is trained on
    ``ratings_train`` and evaluated on ``ratings_test``; the line
    ``lambda,log10(lambda),rmse,precision`` (precision at K=5) is printed and
    written to the lambda analysis file, which is finally passed to
    ``plots.deliverable2_plot``.
    """

    def geometric_schedule(first, last, factor):
        # Yield first, first*factor, first*factor**2, ... while <= last.
        current = first
        while current <= last:
            yield current
            current *= factor

    console_template = (
        "Lambda: {0}, log(Lambda): {1}, RMSE: {2}, Precision@K: {3}")
    record_template = "{0},{1},{2},{3}\n"

    with open(__lambda_analysis_output__, 'w') as results_file:
        print("Lambda Analysis:")
        for shared_lambda in geometric_schedule(0.1, 1000, 10):
            settings = algorithm.SGDHyperParameters(
                alpha=0.01,
                eta=25,
                lambda_user=shared_lambda,
                lambda_item=shared_lambda,
                lambda_bias_user=shared_lambda,
                lambda_bias_item=shared_lambda,
                latent_factor=20)
            untrained = model.MFModel(ratings_train, settings)
            trained = algorithm.LearnModelFromDataUsingSGD(
                ratings_train, untrained, settings)
            ranking = evaluation.get_ranked_items_list(trained, ratings_test)
            error = evaluation.root_mean_squared_error(ranking, ratings_test)
            precision = evaluation.precision_at_K(ranking, ratings_test, 5)

            # log10(lambda) is the x-axis value stored next to lambda itself.
            row = (shared_lambda, np.log10(shared_lambda), error, precision)
            print(console_template.format(*row))
            results_file.write(record_template.format(*row))

    plots.deliverable2_plot(__lambda_analysis_output__)
コード例 #4
0
def deliverable5(ratings_train, movies_dict):
    """
    Deliverable 5.
    --------------
    Print, as a human readable table, the top 5 recommendations for up to 5
    distinct randomly chosen users whose row in ``ratings_train`` has 3 or
    more non-zero entries.

    The SGD hyper-parameters are read from the "SGD" section of
    ``config.ini``; a model is trained on ``ratings_train`` and the ranked
    list returned by ``evaluation.get_ranked_items_list`` is used as the
    source of recommendations.

    Args:
        ratings_train: 2-D rating matrix indexed as ``[user, movie]``; rows
            are users (``shape[0]`` is the number of users).
        movies_dict: mapping from a movie key to a sequence whose element 0
            is shown in the "Title" column and element 1 in the "Genres"
            column.  # presumably keyed the same way as the ranked items

    Raises:
        ValueError: if no user has at least 3 ratings in ``ratings_train``
            (the previous implementation looped forever in that case).
    """
    num_users = 5
    min_ratings = 3
    k = 5

    # Find every eligible user once, then sample WITHOUT replacement so the
    # same user can never be displayed twice and the selection always
    # terminates.
    eligible_users = [
        user for user in range(ratings_train.shape[0])
        if len(ratings_train[user].nonzero()[0]) >= min_ratings
    ]
    if not eligible_users:
        raise ValueError(
            "no user has rated {0} or more items in the training set".format(
                min_ratings))
    selected_users_indexes = np.random.choice(
        eligible_users, min(num_users, len(eligible_users)), replace=False)

    config = ConfigParser.RawConfigParser()
    config.read('config.ini')
    hyper = algorithm.SGDHyperParameters(
        alpha=config.getfloat("SGD", "LearningRate"),
        eta=config.getint("SGD", "Epochs"),
        lambda_user=config.getfloat("SGD", "LambdaUser"),
        lambda_item=config.getfloat("SGD", "LambdaItem"),
        lambda_bias_user=config.getfloat("SGD", "LambdaUserBias"),
        lambda_bias_item=config.getfloat("SGD", "LambdaItemBias"),
        latent_factor=config.getint("SGD", "LatentDimensions"))
    model_object = model.MFModel(ratings_train, hyper)
    model_object = algorithm.LearnModelFromDataUsingSGD(
        ratings_train, model_object, hyper)
    predictions = evaluation.get_ranked_items_list(model_object, ratings_train)

    # Column widths include one padding space on each side of the cell text,
    # hence the "- 2" when padding the cell contents below.
    user_id_len = 6
    movie_id_len = 7
    movie_title_len = 50
    movie_genre_len = 75
    prediction_len = 12
    ground_truth_len = 12

    # The separator and row layout are the same for every table: build once.
    separator = "+{0}+{1}+{2}+{3}+{4}+{5}+".format("-" * user_id_len,
                                                   "-" * movie_id_len,
                                                   "-" * movie_title_len,
                                                   "-" * movie_genre_len,
                                                   "-" * prediction_len,
                                                   "-" * ground_truth_len)
    row_template = "| {0} | {1} | {2} | {3} | {4} | {5} |"
    header = row_template.format(
        "User".ljust(user_id_len - 2), "Movie".ljust(movie_id_len - 2),
        "Title".ljust(movie_title_len - 2),
        "Genres".ljust(movie_genre_len - 2),
        "Prediction".ljust(prediction_len - 2),
        "True Label".ljust(ground_truth_len - 2))

    for user in selected_users_indexes:
        print("Displaying Top {0} Recommended Movies for User {1}:".format(
            k, user))
        print(separator)
        print(header)
        print(separator)
        # Each ranked entry is unpacked as a (movie, predicted score) pair.
        for movie, prediction in predictions[user][:k]:
            print(row_template.format(
                str(user).ljust(user_id_len - 2),
                str(movie).ljust(movie_id_len - 2),
                movies_dict[movie][0].ljust(movie_title_len - 2),
                movies_dict[movie][1].ljust(movie_genre_len - 2),
                ("%.7f" % prediction).ljust(prediction_len - 2),
                str(ratings_train[user, movie]).ljust(ground_truth_len - 2)))
        print(separator)
コード例 #5
0
def main(sections):
    """
    Run the main train/evaluate flow and the requested diagnostics.

    Args:
        sections: container of section names, tested with ``in``. "Main"
            runs the configured learning algorithm end to end and writes the
            report file; "D1" .. "D5" run the matching
            ``diagnostics.deliverableN`` function.

    Raises:
        ValueError: if "Main" is requested and the algorithm named in
            ``config.ini`` is neither the SGD nor the ALS identifier.
    """
    ratings_train, ratings_test = None, None
    if "Main" in sections:
        # Section 1: load the ratings and split them into train / test.
        ratings = data.Ratings(r"dataset\ratings.dat")
        ratings_train, ratings_test = ratings.split_dataset()

        # Debug helpers, kept for reference:
        # ratings.head(5)
        # movies.head(5)
        # ratings.show_sparsity(ratings.get_matrix(), "Matrix ")
        # ratings.show_sparsity(ratings_train, "Train Matrix ")
        # ratings.show_sparsity(ratings_test, "Test Matrix ")

        # Debug matrices, kept for reference:
        # debug_ratings_train = np.array([[1, 5, 0, 0, 0],
        #                                 [3, 0, 1, 0, 0],
        #                                 [2, 3, 0, 0, 0],
        #                                 [3, 0, 0, 3, 0],
        #                                 [0, 4, 1, 0, 0],
        #                                 [2, 0, 2, 0, 3]])
        # debug_ratings_test = np.array([[0, 0, 0, 0, 4],
        #                                [0, 4, 0, 0, 0],
        #                                [0, 0, 0, 4, 0],
        #                                [0, 0, 2, 0, 3],
        #                                [1, 0, 0, 0, 0],
        #                                [0, 0, 0, 4, 0]])

        # Load configuration file
        config = ConfigParser.RawConfigParser()
        config.read('config.ini')

        # Section 2: build the hyper-parameters of the configured algorithm.
        learning_algorithm = config.get("Configuration", "Algorithm")
        if learning_algorithm == __SGD__:
            hyper = algorithm.SGDHyperParameters(
                alpha=config.getfloat(__SGD__, "LearningRate"),
                eta=config.getint(__SGD__, "Epochs"),
                lambda_user=config.getfloat(__SGD__, "LambdaUser"),
                lambda_item=config.getfloat(__SGD__, "LambdaItem"),
                lambda_bias_user=config.getfloat(__SGD__, "LambdaUserBias"),
                lambda_bias_item=config.getfloat(__SGD__, "LambdaItemBias"),
                latent_factor=config.getint(__SGD__, "LatentDimensions"))
        elif learning_algorithm == __ALS__:
            hyper = algorithm.ALSHyperParameters(
                epsilon=config.getfloat(__ALS__, "Epsilon"),
                eta=config.getint(__ALS__, "Epochs"),
                lambda_user=config.getfloat(__ALS__, "LambdaUser"),
                lambda_item=config.getfloat(__ALS__, "LambdaItem"),
                lambda_bias_user=config.getfloat(__ALS__, "LambdaUserBias"),
                lambda_bias_item=config.getfloat(__ALS__, "LambdaItemBias"),
                latent_factor=config.getint(__ALS__, "LatentDimensions"))
        else:
            # Fail early with a clear message instead of carrying hyper=None
            # forward and crashing later while the report is half written.
            raise ValueError(
                "Unsupported algorithm in config.ini: {0!r}".format(
                    learning_algorithm))
        model_object = model.MFModel(ratings_train, hyper)

        # Section 3: train, timing only the learning call.
        # NOTE: time.clock() exists on Python 2 (which this module targets)
        # but was removed in Python 3.8.
        timer = time.clock()
        if learning_algorithm == __SGD__:
            model_object = algorithm.LearnModelFromDataUsingSGD(
                ratings_train, model_object, hyper, ratings_test)
        else:
            model_object = algorithm.LearnModelFromDataUsingALS(
                ratings_train, model_object, hyper, ratings_test)
        training_time = time.clock() - timer

        # Section 4: evaluation metrics on the test set.
        ranked_items_list = evaluation.get_ranked_items_list(
            model_object, ratings_test)
        rmse = evaluation.root_mean_squared_error(ranked_items_list,
                                                  ratings_test)  # RMSE
        mpr = evaluation.mean_percentile_rank(ranked_items_list,
                                              ratings_test)  # MPR
        p_at_2 = evaluation.precision_at_K(ranked_items_list, ratings_test,
                                           2)  # P@2
        p_at_10 = evaluation.precision_at_K(ranked_items_list, ratings_test,
                                            10)  # P@10
        r_at_2 = evaluation.recall_at_K(ranked_items_list, ratings_test,
                                        2)  # R@2
        r_at_10 = evaluation.recall_at_K(ranked_items_list, ratings_test,
                                         10)  # R@10
        map_ = evaluation.mean_average_precision(ranked_items_list,
                                                 ratings_test)  # MAP

        # Section 5: write the report as "Label: value" lines, in this order.
        report = [
            ("Algorithm", learning_algorithm),
            ("LambdaUser", hyper.lambda_user),
            ("LambdaItem", hyper.lambda_item),
            ("LambdaUserBias", hyper.lambda_bias_user),
            ("LambdaItemBias", hyper.lambda_bias_item),
            ("LatentDimensions", hyper.latent_factor),
            ("Epochs", hyper.eta),
        ]
        # The only algorithm-specific hyper-parameter line.
        if learning_algorithm == __SGD__:
            report.append(("LearningRate", hyper.alpha))
        else:
            report.append(("Epsilon", hyper.epsilon))
        report.extend([
            ("RMSE", rmse),
            ("MPR", mpr),
            ("P@2", p_at_2),
            ("P@10", p_at_10),
            ("R@2", r_at_2),
            ("R@10", r_at_10),
            ("MAP", map_),
            ("TrainingTime", training_time),
        ])
        with open(__main_flow_output__, "w") as output_handle:
            for label, value in report:
                output_handle.write("{0}: {1}{2}".format(
                    label, value, __newline__))

    print("Finished Main Flow\n")

    # Part 7. Diagnostics and Analysis
    print("Starting Diagnostics and Analysis")

    # Compare against None explicitly: taking the truth value of a rating
    # matrix (e.g. a multi-element numpy array) raises ValueError, which made
    # the old `not (train or test)` check crash after the "Main" section ran.
    if ratings_train is None or ratings_test is None:
        ratings = data.Ratings(r"dataset\ratings.dat")
        ratings_train, ratings_test = ratings.split_dataset()

    if "D1" in sections:
        print("\nDeliverable 1")
        diagnostics.deliverable1()

    if "D2" in sections:
        print("\nDeliverable 2")
        diagnostics.deliverable2(ratings_train, ratings_test)

    if "D3" in sections:
        print("\nDeliverable 3")
        diagnostics.deliverable3(ratings_train, ratings_test)

    if "D4" in sections:
        print("\nDeliverable 4")
        diagnostics.deliverable4(ratings_train)

    if "D5" in sections:
        print("\nDeliverable 5")
        movies = data.Movies(r"dataset\movies.dat")
        movies_dict = movies.get_dictionary()
        diagnostics.deliverable5(ratings_train, movies_dict)

    print("Finished Diagnostics and Analysis")