# Cluster MovieLens users by their average romance / sci-fi ratings with k-means.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # was missing although `pd.read_csv` is used below
from scipy.sparse import csr_matrix  # fixed typo: `scr_matrix` does not exist
from sklearn.cluster import KMeans

import helper

# Import the movies dataset.
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Per-user average rating for the Romance and Sci-Fi genres.
genre_ratings = helper.get_genre_ratings(
    ratings, movies, ['Romance', 'Sci-Fi'],
    ['avg_romance_rating', 'avg_scifi_rating'])
genre_ratings.head()  # only displayed when run as a notebook cell

# Bias the dataset so users tend to favour one genre over the other.
biased_dataset = helper.bias_genre_rating_dataset(genre_ratings, 3.2, 2.5)
print("Number of records: ", len(biased_dataset))
biased_dataset.head()  # only displayed when run as a notebook cell

# NOTE: get_ipython() is provided by IPython/Jupyter; it is not defined when
# this file is executed with the plain Python interpreter.
get_ipython().run_line_magic('matplotlib', 'inline')
helper.draw_scatterplot(biased_dataset['avg_scifi_rating'],
                        'Avg scifi rating',
                        biased_dataset['avg_romance_rating'],
                        'Avg romance rating')

# Use k-means. The feature matrix must exist before fitting: the original code
# passed `X` to fit_predict before it had been assigned.
X = biased_dataset[['avg_scifi_rating', 'avg_romance_rating']].values

# The KMeans keyword is `n_clusters` (the original `n_cluster` raises TypeError).
kmeans_1 = KMeans(n_clusters=2)
predictions = kmeans_1.fit_predict(X)
helper.draw_clusters(biased_dataset, predictions)
# Reload the ratings table and report how much data we are working with.
ratings = pd.read_csv('ml-latest-small/ratings.csv')
n_ratings, n_movies = len(ratings), len(movies)
print('The dataset contains: ', n_ratings, ' ratings of ', n_movies, ' movies.')

# Start from a subset of users and look at their preferred genres:
# get_genre_ratings calculates each user's average rating of all romance
# movies and of all sci-fi movies.
genre_columns = ['avg_romance_rating', 'avg_scifi_rating']
genre_ratings = helper.get_genre_ratings(
    ratings, movies, ['Romance', 'Sci-Fi'], genre_columns)

# Bias the dataset a little by removing people who like both sci-fi and
# romance, so that the clusters tend to describe users as preferring one
# genre over the other.
biased_dataset = helper.bias_genre_rating_dataset(genre_ratings, 3.2, 2.5)

summary = (
    "So we can see we have {} users, and for each user we have their average\n"
    "rating of the romance and sci movies they have watched."
)
print(summary.format(len(biased_dataset)))

# Optional scatter plot of the biased dataset (left disabled, as in the original):
# %matplotlib inline
# helper.draw_scatterplot(biased_dataset['avg_scifi_rating'],
#                         'Avg scifi rating', biased_dataset['avg_romance_rating'],
#                         'Avg romance rating')

# Let's apply K-Means to the set above: one column per genre average.
feature_columns = ['avg_scifi_rating', 'avg_romance_rating']
X = biased_dataset[feature_columns].values

# Create an instance of KMeans to find two clusters