Beispiel #1
0
# Example: cluster users by their average romance / sci-fi ratings with k-means.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # was missing although pd.read_csv is used below
from scipy.sparse import csr_matrix  # fixed typo: scipy.sparse has no `scr_matrix`
from sklearn.cluster import KMeans

import helper

# Load the movies and ratings tables.
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Ask the helper for each user's average rating in the Romance and Sci-Fi genres;
# the second list gives the names of the resulting columns.
genre_ratings = helper.get_genre_ratings(
    ratings, movies, ['Romance', 'Sci-Fi'],
    ['avg_romance_rating', 'avg_scifi_rating'])
# Bare expression: shows a preview when run as a notebook cell, no effect in a script.
genre_ratings.head()

# Filter the users through the helper with thresholds 3.2 and 2.5.
# NOTE(review): exact filtering rule lives in helper.bias_genre_rating_dataset -- confirm there.
biased_dataset = helper.bias_genre_rating_dataset(genre_ratings, 3.2, 2.5)

print("Number of records: ", len(biased_dataset))
biased_dataset.head()

# NOTE: get_ipython() is only available when running under IPython/Jupyter.
get_ipython().run_line_magic('matplotlib', 'inline')

helper.draw_scatterplot(biased_dataset['avg_scifi_rating'], 'Avg scifi rating',
                        biased_dataset['avg_romance_rating'],
                        'Avg romance rating')

# use k-means

# Feature matrix for clustering: the two genre-average columns as a plain array.
# (X was previously used without ever being defined.)
X = biased_dataset[['avg_scifi_rating', 'avg_romance_rating']].values

# The KMeans keyword is `n_clusters`; `n_cluster` raises a TypeError.
kmeans_1 = KMeans(n_clusters=2)
# Fit on X and get one cluster label per row.
predictions = kmeans_1.fit_predict(X)
helper.draw_clusters(biased_dataset, predictions)
Beispiel #2
0
# Load the ratings table.
ratings_path = 'ml-latest-small/ratings.csv'
ratings = pd.read_csv(ratings_path)

# Report how many ratings and movies we are working with.
rating_count, movie_count = len(ratings), len(movies)
print('The dataset contains: ', rating_count, ' ratings of ', movie_count,
      ' movies.')

# Start with a subset of users and look at their preferred genres.
# helper.get_genre_ratings calculates each user's average rating of all romance
# movies and all sci-fi movies.
genres = ['Romance', 'Sci-Fi']
rating_columns = ['avg_romance_rating', 'avg_scifi_rating']
genre_ratings = helper.get_genre_ratings(ratings, movies, genres, rating_columns)

# Bias the dataset a little by removing people who like both sci-fi and romance,
# so that the clusters tend to describe users as preferring one genre over the other.
biased_dataset = helper.bias_genre_rating_dataset(genre_ratings, 3.2, 2.5)

summary_template = ("So we can see we have {} users, "
                    "and for each user we have their average\n"
                    "rating of the romance and sci movies they have watched.")
print(summary_template.format(len(biased_dataset)))

# %matplotlib inline
# helper.draw_scatterplot(biased_dataset['avg_scifi_rating'],
#                         'Avg scifi rating', biased_dataset['avg_romance_rating'],
#                         'Avg romance rating')

# Let's apply k-means to the set above: take the two genre-average columns
# as a plain array of features.
feature_columns = ['avg_scifi_rating', 'avg_romance_rating']
X = biased_dataset[feature_columns].values

# Create an instance of KMeans to find two clusters