Beispiel #1
0
def infer_true_movie_ratings(num_observations=-1):
    """
    For every movie, computes the posterior distribution and MAP estimate of
    the movie's true/inherent rating given the movie's observed ratings.

    Input
    -----
    - num_observations: integer that specifies how many available ratings to
        use per movie. Any negative value (the default is -1) means that all
        available ratings are used; a non-negative value keeps only the first
        `num_observations` ratings of each movie.

    Output
    ------
    - posteriors: a 2D array consisting of the posterior distributions where
        the number of rows is the number of movies, and the number of columns
        is M, i.e., the number of possible ratings (remember ratings are
        0, 1, ..., M-1); posteriors[i] gives a length M vector that is the
        posterior distribution of the true/inherent rating of the i-th movie
        in `movie_data_helper.get_movie_id_list()`
    - MAP_ratings: a 1D array with length given by the number of movies;
        MAP_ratings[i] gives the true/inherent rating with the highest
        posterior probability in the distribution `posteriors[i]`
    """

    M = 11  # all of our ratings are between 0 and 10
    prior = np.array([1.0 / M] * M)  # uniform distribution
    likelihood = compute_movie_rating_likelihood(M)

    # get the list of all movie IDs to process
    movie_id_list = movie_data_helper.get_movie_id_list()
    num_movies = len(movie_id_list)

    # -------------------------------------------------------------------------
    # CODE FOR PART (d)
    #
    # These are the output variables, filled one row per movie.
    posteriors = np.zeros((num_movies, M))
    MAP_ratings = np.zeros(num_movies)

    # Rows follow the order of movie_id_list; the ratings are looked up by the
    # actual movie ID, not by the row position.
    for row, movie_id in enumerate(movie_id_list):
        ratings = movie_data_helper.get_ratings(movie_id)
        # Only truncate for non-negative counts: slicing with -1 would drop
        # the last rating instead of keeping everything.
        if num_observations >= 0:
            ratings = ratings[:num_observations]
        posteriors[row, :] = compute_posterior(prior, likelihood, ratings)
        MAP_ratings[row] = posteriors[row, :].argmax()

    #
    # END OF CODE FOR PART (d)
    # -------------------------------------------------------------------------

    return posteriors, MAP_ratings
Beispiel #2
0
def main():
    """
    Print the best and worst movies by MAP rating, the movies most likely to
    be a perfect 10, and plot the mean posterior entropy against the number
    of observations used (1 through 200).
    """
    posteriors, MAP_ratings = infer_true_movie_ratings()

    # Pair every movie ID with its MAP estimate, highest estimate first.
    ratings_by_id = sorted(
        zip(movie_data_helper.get_movie_id_list(), MAP_ratings),
        key=lambda pair: pair[1],
        reverse=True)

    print("Best movies by MAP estimate:")
    for rank in range(10):
        movie_id, map_rating = ratings_by_id[rank]
        print(movie_data_helper.get_movie_name(movie_id), map_rating)

    print("---")
    print("Worst movies:")
    # Negative offsets walk the last ten entries of the sorted list.
    for offset in range(-10, 0):
        movie_id, map_rating = ratings_by_id[offset]
        print(movie_data_helper.get_movie_name(movie_id), map_rating)

    # Column 10 of the posteriors is the probability of the rating 10.
    prob_of_10_by_id = sorted(
        zip(movie_data_helper.get_movie_id_list(), posteriors[:, 10]),
        key=lambda pair: pair[1],
        reverse=True)
    print("---")
    print("Movies most likely to be a perfect 10:")
    for rank in range(10):
        movie_id, probability = prob_of_10_by_id[rank]
        print(movie_data_helper.get_movie_name(movie_id), probability)

    # Mean entropy over all movies for each observation count.
    max_observations = 200
    observation_counts = range(1, max_observations + 1)
    entropy_plot = np.zeros(max_observations)
    for slot, count in enumerate(observation_counts):
        entropies = compute_true_movie_rating_posterior_entropies(count)
        entropy_plot[slot] = np.mean(entropies)
        print(count, entropy_plot[slot])
    plt.plot(observation_counts, entropy_plot)
    plt.show()
Beispiel #3
0
def infer_true_movie_ratings(num_observations=-1):
    """
    For every movie, computes the posterior distribution and MAP estimate of
    the movie's true/inherent rating given the movie's observed ratings.

    Input
    -----
    - num_observations: integer that specifies how many available ratings to
        use per movie. Any negative value (the default is -1) means that all
        available ratings are used; a non-negative value keeps only the first
        `num_observations` ratings of each movie.

    Output
    ------
    - posteriors: a 2D array consisting of the posterior distributions where
        the number of rows is the number of movies, and the number of columns
        is M, i.e., the number of possible ratings (remember ratings are
        0, 1, ..., M-1); posteriors[i] gives a length M vector that is the
        posterior distribution of the true/inherent rating of the i-th movie
        in `movie_data_helper.get_movie_id_list()`
    - MAP_ratings: a 1D array with length given by the number of movies;
        MAP_ratings[i] gives the true/inherent rating with the highest
        posterior probability in the distribution `posteriors[i]`
    """
    M = 11  # all of our ratings are between 0 and 10
    prior = np.array([1.0 / M] * M)  # uniform distribution
    likelihood = compute_movie_rating_likelihood(M)

    # get the list of all movie IDs to process
    movie_id_list = movie_data_helper.get_movie_id_list()
    num_movies = len(movie_id_list)

    # Allocate output variables.
    posteriors = np.zeros((num_movies, M))
    MAP_ratings = np.zeros(num_movies)

    for i, movie_id in enumerate(movie_id_list):
        ratings = movie_data_helper.get_ratings(movie_id)

        # Truncate the number of ratings if requested. A plain
        # `ratings[:num_observations]` with the default of -1 would silently
        # drop the last rating, so negative values skip the truncation.
        if num_observations >= 0:
            ratings = ratings[:num_observations]

        # Compute the posterior probability.
        posteriors[i, :] = compute_posterior(prior, likelihood, ratings)

        # MAP Rating is simply the rating with maximum posterior probability.
        MAP_ratings[i] = np.argmax(posteriors[i, :])

    return posteriors, MAP_ratings
Beispiel #4
0
def infer_true_movie_ratings(num_observations=-1):
    """
    For every movie, computes the posterior distribution and MAP estimate of
    the movie's true/inherent rating given the movie's observed ratings.

    Input
    -----
    - num_observations: integer that specifies how many available ratings to
        use per movie. A positive value keeps only the first
        `num_observations` ratings of each movie; any other value (including
        the default of -1) uses all available ratings.

    Output
    ------
    - posteriors: a 2D array consisting of the posterior distributions where
        the number of rows is the number of movies, and the number of columns
        is M, i.e., the number of possible ratings (remember ratings are
        0, 1, ..., M-1); posteriors[i] gives a length M vector that is the
        posterior distribution of the true/inherent rating of the i-th movie
        in `movie_data_helper.get_movie_id_list()`
    - MAP_ratings: a 1D array with length given by the number of movies;
        MAP_ratings[i] gives the true/inherent rating with the highest
        posterior probability in the distribution `posteriors[i]`
    """

    M = 11  # ratings are 0, 1, ..., 10
    prior = np.array([1.0 / M] * M)  # uniform distribution
    likelihood = compute_movie_rating_likelihood(M)

    movie_id_list = movie_data_helper.get_movie_id_list()
    num_movies = len(movie_id_list)

    posteriors = np.zeros((num_movies, M))
    MAP_ratings = np.zeros(num_movies)

    # Index the outputs by position in movie_id_list rather than by the movie
    # ID itself, so the result does not depend on IDs being 0..num_movies-1.
    for row, movie_id in enumerate(movie_id_list):
        ratings = movie_data_helper.get_ratings(movie_id)
        if num_observations > 0:
            ratings = ratings[:num_observations]
        posteriors[row] = compute_posterior(prior, likelihood, ratings)
        MAP_ratings[row] = np.argmax(posteriors[row])

    return posteriors, MAP_ratings
def compute_true_movie_rating_posterior_entropies_fast():
    """
    Plot the mean posterior entropy over all movies for 1..200 observations.

    Each movie's ratings are fetched once; for every prefix length the
    posterior comes from `infer_true_movie_ratings_fast` and its entropy from
    `compute_entropy`. The per-count means are passed to `plt.plot`; nothing
    is returned and `plt.show()` is not called here.
    """
    movie_id_list = movie_data_helper.get_movie_id_list()
    num_movies = len(movie_id_list)
    N = 200  # largest number of observations considered
    finAns = np.zeros((num_movies, N))
    # Rows are addressed by position in movie_id_list (not by movie ID) so the
    # table stays in bounds even if the IDs are not exactly 0..num_movies-1.
    for row, movie_id in enumerate(movie_id_list):
        print("Working on {0} of {1} movies".format(movie_id, num_movies))
        allRatings = movie_data_helper.get_ratings(movie_id)
        for nObs in range(1, N + 1):
            ratings = allRatings[:nObs]
            # NOTE(review): infer_true_movie_ratings_fast is defined outside
            # this snippet; its argument contract could not be verified here.
            posterior, _ = infer_true_movie_ratings_fast(
                movie_id_list, ratings)

            finAns[row, nObs - 1] = compute_entropy(posterior)

    # mean over movies for each observation count, then plot
    tmp = finAns.mean(axis=0)
    plt.plot(tmp)
def compute_true_movie_rating_posterior_entropies(num_observations):
    """
    For every movie, computes the Shannon entropy (in bits) of the posterior
    distribution of the true/inherent rating of the movie given observed
    ratings.

    Input
    -----
    - num_observations: integer that specifies how many available ratings to
        use per movie; it is forwarded unchanged to
        `infer_true_movie_ratings` (this function itself defines no default)

    Output
    ------
    - posterior_entropies: a 1D array; posterior_entropies[i] gives the Shannon
        entropy (in bits) of the posterior distribution of the true/inherent
        rating of the i-th movie given observed ratings (with number of
        observed ratings given by the input `num_observations`)
    """

    # -------------------------------------------------------------------------
    # CODE FOR PART (g)
    #
    # Makes use of the compute_entropy function from part (f).
    posteriors, _ = infer_true_movie_ratings(num_observations)

    # One entropy per posterior row. Rows are addressed by position rather
    # than by movie ID, so IDs need not be exactly 0..num_movies-1.
    posterior_entropies = np.zeros(posteriors.shape[0])
    for row, posterior in enumerate(posteriors):
        posterior_entropies[row] = compute_entropy(posterior)

    #
    # END OF CODE FOR PART (g)
    # -------------------------------------------------------------------------

    return posterior_entropies
def plot_entropies():
    """
    Plot the mean posterior entropy over all movies against the number of
    observations used (0 through 199).

    The ratings of every movie are read only once (capped at the first 200),
    then the posterior and its entropy are recomputed for each prefix length.
    """
    max_ratings = 200

    # Same model setup as infer_true_movie_ratings().
    M = 11  # all of our ratings are between 0 and 10
    prior = np.array([1.0 / M] * M)  # uniform distribution
    likelihood = compute_movie_rating_likelihood(M)

    movie_id_list = movie_data_helper.get_movie_id_list()
    num_movies = len(movie_id_list)

    # Read each movie's ratings a single time, keeping at most 200 of them.
    ratings_per_movie = [
        movie_data_helper.get_ratings(movie_id)[0:max_ratings]
        for movie_id in movie_id_list
    ]

    total_entropies = np.zeros((num_movies, max_ratings))

    print(
        "Computing entropies for plotting (this should take about a minute) ... "
    )
    # Column n holds the entropy after the first n ratings (n = 0 means none).
    for row, movie_ratings in enumerate(ratings_per_movie):
        for n_used in range(max_ratings):
            posterior = compute_posterior(
                prior, likelihood, movie_ratings[0:n_used])
            total_entropies[row, n_used] = compute_entropy(posterior)

    plotting_data = total_entropies.mean(axis=0)
    print("Plotting now ... ")
    plt.plot(plotting_data)
    plt.axis([0, 200, 0, 4])
    plt.show()
def compute_true_movie_rating_posterior_entropies(num_observations):
    """
    Compute, for every movie, the Shannon entropy (in bits) of the posterior
    distribution of its true/inherent rating given the observed ratings.

    Input
    -----
    - num_observations: integer number of available ratings to use per movie;
        passed straight through to `infer_true_movie_ratings`

    Output
    ------
    - posterior_entropies: a 1D array with one entry per movie;
        posterior_entropies[i] is the entropy of the i-th row of the
        posteriors returned by `infer_true_movie_ratings`
    """
    # The output length comes from the movie ID list, as in the inference step.
    movie_count = len(movie_data_helper.get_movie_id_list())
    posteriors, _ = infer_true_movie_ratings(num_observations)

    posterior_entropies = np.zeros(movie_count)
    # Part (g): reuse compute_entropy from part (f) on each posterior row.
    for index, distribution in enumerate(posteriors):
        posterior_entropies[index] = compute_entropy(distribution)

    return posterior_entropies
def main():
    """
    Run the provided sanity checks for compute_posterior and compute_entropy,
    print the names of the ten movies with the highest MAP rating, and plot
    the mean posterior entropy via plot_entropies().
    """

    # -------------------------------------------------------------------------
    # ERROR CHECKS
    #
    # Here are some error checks that you can use to test your code.

    print("Posterior calculation (few observations)")
    prior = np.array([0.6, 0.4])
    likelihood = np.array([
        [0.7, 0.98],
        [0.3, 0.02],
    ])
    y = [0] * 2 + [1] * 1
    print("My answer:")
    print(compute_posterior(prior, likelihood, y))
    print("Expected answer:")
    print(np.array([[0.91986917, 0.08013083]]))

    print("---")
    print("Entropy of fair coin flip")
    distribution = np.array([0.5, 0.5])
    print("My answer:")
    print(compute_entropy(distribution))
    print("Expected answer:")
    print(1.0)

    print("Entropy of coin flip where P(heads) = 0.25 and P(tails) = 0.75")
    distribution = np.array([0.25, 0.75])
    print("My answer:")
    print(compute_entropy(distribution))
    print("Expected answer:")
    print(0.811278124459)

    print("Entropy of coin flip where P(heads) = 0.75 and P(tails) = 0.25")
    distribution = np.array([0.75, 0.25])
    print("My answer:")
    print(compute_entropy(distribution))
    print("Expected answer:")
    print(0.811278124459)

    #
    # END OF ERROR CHECKS
    # -------------------------------------------------------------------------

    # -------------------------------------------------------------------------
    # Code answering the questions in part (e) and part (h).

    # -- Printing top movies --

    # Print the movies with the top ten true ratings
    posteriors, MAP_ratings = infer_true_movie_ratings()
    print("Top ten movies are: ")
    movie_id_list = movie_data_helper.get_movie_id_list()
    # argsort yields positions into MAP_ratings; take the ten largest,
    # highest first (fewer if there are fewer than ten movies).
    top = MAP_ratings.argsort()[-10:][::-1]
    # Translate each position back to its movie ID before the name lookup;
    # the position itself is not necessarily a valid movie ID.
    top_movies = [
        movie_data_helper.get_movie_name(movie_id_list[position])
        for position in top
    ]
    print(top_movies)

    # -- Plotting entropies --

    # I put this in a separate method so I could optimize its speed
    plot_entropies()
Beispiel #10
0
 def __init__(self):
     """Store the movie ID list and start with an empty ratings mapping."""
     self.movie_list = movie_data_helper.get_movie_id_list()
     self.ratings = dict()
def main():
    """
    Run the provided sanity checks for compute_posterior and compute_entropy,
    check the shape of the likelihood matrix (part c), infer the true movie
    ratings (part d), print the highest-rated movies with their titles
    (part e), and print the posterior of movie 0 next to its expected value.
    """

    # -------------------------------------------------------------------------
    # ERROR CHECKS
    #
    # Here are some error checks that you can use to test your code.

    print("Posterior calculation (few observations)")
    prior = np.array([0.6, 0.4])
    likelihood = np.array([
        [0.7, 0.98],
        [0.3, 0.02],
    ])
    y = [0] * 2 + [1] * 1
    print("My answer:")
    print(compute_posterior(prior, likelihood, y))
    print("Expected answer:")
    print(np.array([[0.91986917, 0.08013083]]))

    print("---")
    print("Entropy of fair coin flip")
    distribution = np.array([0.5, 0.5])
    print("My answer:")
    print(compute_entropy(distribution))
    print("Expected answer:")
    print(1.0)

    print("Entropy of coin flip where P(heads) = 0.25 and P(tails) = 0.75")
    distribution = np.array([0.25, 0.75])
    print("My answer:")
    print(compute_entropy(distribution))
    print("Expected answer:")
    print(0.811278124459)

    print("Entropy of coin flip where P(heads) = 0.75 and P(tails) = 0.25")
    distribution = np.array([0.75, 0.25])
    print("My answer:")
    print(compute_entropy(distribution))
    print("Expected answer:")
    print(0.811278124459)

    #
    # END OF ERROR CHECKS
    # -------------------------------------------------------------------------

    # -------------------------------------------------------------------------
    # Code answering the questions in part (e) and part (h).

    # part (c)
    size = 2**4
    print('(c) likelihood %s' % size)
    likelihood = compute_movie_rating_likelihood(size)
    if likelihood.shape != (16, 16):
        # SystemExit is what exit() raises; raising it directly does not rely
        # on the interactive-only `exit` helper installed by the site module.
        raise SystemExit(
            'In compute_movie_rating_likelihood: Matrix size is not (M, M)')

    # part (d)
    posteriors, MAP_ratings = infer_true_movie_ratings()

    # part (e)
    # Assume this list is stable.
    movie_id_list = movie_data_helper.get_movie_id_list()
    movie_titles = [
        movie_data_helper.get_movie_name(movie_id)
        for movie_id in movie_id_list
    ]
    print('result lengths movie_id_list %s, MAP_ratings %s, posteriors %s' %
          (len(movie_id_list), len(MAP_ratings), posteriors.shape))
    # Rows of `results` are [movie_id, MAP_rating]; stacking with the float
    # MAP ratings makes the whole array float.
    results = np.column_stack((movie_id_list, MAP_ratings))
    sort_by_map_rating = results[:, 1].argsort()[::-1]
    print('top_results')
    # Original author's note: there are 72 movies rated 10 (73 rows are shown).
    for position in sort_by_map_rating[:73]:
        # movie_titles is ordered like movie_id_list, so it is indexed with
        # the integer position; the float movie ID stored in the row cannot be
        # used as a list index.
        print('%s %s' % (results[position], movie_titles[position]))

    # plt.figure(1)
    # plt.title("Average of Entropies")
    # plt.xlabel("Number of Samples")
    # plt.ylabel("E (bits)", rotation='horizontal', position=(0, 1.01))
    # plt.plot(num_entropy[0], num_entropy[1])
    # plt.show()

    print('Posterior for movie 0')
    print(posteriors[0])
    print('Expect')
    print(
        np.array([
            0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
            0.00000000e+000, 0.00000000e+000, 2.08691952e-217, 7.41913971e-104,
            1.00000000e+000, 3.12235460e-048, 2.56768318e-058
        ]))