# Size of the subset to display: 30 movies by 18 users.
n_movies, n_users = 30, 18

# Ask the project helper for a subset of the ratings table with those
# dimensions (per its name, ordered by rating density).
most_rated_movies_users_selection = helper.sort_by_rating_density(
    user_movie_ratings,
    n_movies,
    n_users,
)

print('dataset dimensions: ', most_rated_movies_users_selection.shape)
most_rated_movies_users_selection.head()


# That's more like it. Let's also establish a good way to visualize these ratings, so that we can recognize rating patterns (and later, clusters) at a glance when we look at bigger subsets.
# 
# Let's use colors instead of the number ratings:

# In[49]:


# Render the selected users x movies ratings subset as a heatmap via the project helper.
helper.draw_movies_heatmap(most_rated_movies_users_selection)


# Each column is a movie. Each row is a user. The color of the cell is how the user rated that movie based on the scale on the right of the graph.
# 
# Notice how some cells are white? A white cell means the respective user did not rate that movie. This is an issue you'll come across when clustering in real life: unlike the clean example we started with, real-world datasets are often sparse and do not have a value in every cell. This makes it less straightforward to cluster users directly by their movie ratings, because k-means generally does not handle missing values.
# 
# For performance reasons, we'll only use ratings for 1000 movies (out of the 9000+ available in the dataset).

# In[50]:


# Rebuild the users x movie-titles table: one row per userId, one column per
# title, cells filled from the 'rating' column.
user_movie_ratings = ratings_title.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)
# Restrict to 1000 movies using the project helper (per its name, the
# most-rated ones).
most_rated_movies_1k = helper.get_most_rated_movies(user_movie_ratings, 1000)

Example #2
0
# Put the cluster labels next to the ratings as a 'group' column.
# concat aligns the two frames on their index labels, so reset_index() is used
# to give the ratings a fresh default integer index (the old index becomes a
# regular column).
clustered = pd.concat(
    [
        most_rated_movies_1k.reset_index(),
        pd.DataFrame({'group': predictions}),
    ],
    axis='columns',
)
helper.draw_movie_clusters(clustered, max_users, max_movies)

# Which cluster to inspect, and how large a slice of it to show.
cluster_number = 11
n_users, n_movies = 75, 300

# Keep only the rows labelled with the chosen cluster, then drop the 'index'
# and 'group' columns.
cluster = clustered.loc[clustered['group'] == cluster_number].drop(
    columns=['index', 'group'],
)

# Reduce the cluster to the requested size via the helper, then plot it.
cluster = helper.sort_by_rating_density(cluster, n_movies, n_users)
helper.draw_movies_heatmap(cluster, axis_labels=False)

# Preview with missing ratings shown as empty strings; `cluster` itself is
# left unchanged because the result is not assigned.
cluster.fillna('').head()

# Movie whose ratings we want to summarise within the cluster.
movie_name = "Forrest Gump (1994)"

# Mean of that movie's column in the cluster.
cluster.loc[:, movie_name].mean()
# Per-column means across the cluster; show the first 20 entries.
cluster.mean(axis=0).head(20)

# Row label of the user to inspect.
user_id = 19

# Pull that user's row of ratings out of the cluster.
# NOTE(review): the variables are named user_2_* although user_id is 19 --
# looks like a leftover name; confirm before relying on it.
user_2_ratings = cluster.loc[user_id]

# Entries that are null, i.e. movies this user has no rating for.
user_2_unrated_movies = user_2_ratings[user_2_ratings.isna()]
]].values

# Create a KMeans instance configured to find seven clusters.
# NOTE(review): the `_5` suffix does not match n_clusters=7 -- presumably a
# leftover from an earlier step of the exercise; confirm before renaming.
kmeans_5 = KMeans(n_clusters=7)

# Fit the model on X_with_action and obtain the predicted cluster labels.
predictions_5 = kmeans_5.fit_predict(X_with_action)

# Plot the dataset in 3D, passing the predicted cluster labels to the helper.
helper.draw_clusters_3d(biased_dataset_3_genres, predictions_5)

# Join each rating with its movie title (matching on movieId), then pivot the
# result into a users x movie-titles table filled from the 'rating' column.
ratings_title = pd.merge(
    ratings,
    movies[['movieId', 'title']],
    on='movieId',
)
user_movie_ratings = ratings_title.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)

print('dataset dimensions: ', user_movie_ratings.shape, '\n\nSubset example:')
# Peek at the top-left corner: first 6 rows, first 10 columns.
user_movie_ratings.iloc[:6, :10]

# Dimensions of the preview subset: 30 movies, 18 users.
n_movies, n_users = 30, 18

# Ask the project helper for a subset of the ratings table with those
# dimensions (per its name, ordered by rating density).
most_rated_movies_users_selection = helper.sort_by_rating_density(
    user_movie_ratings,
    n_movies,
    n_users,
)

print('dataset dimensions: ', most_rated_movies_users_selection.shape)
most_rated_movies_users_selection.head()

# Draw the selected subset as a heatmap.
helper.draw_movies_heatmap(most_rated_movies_users_selection)