-
Notifications
You must be signed in to change notification settings - Fork 0
/
content_based_recommendation_system.py
123 lines (103 loc) · 4.36 KB
/
content_based_recommendation_system.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.metrics.pairwise import cosine_similarity
from my_sql import MySQL
import pandas as pd
import numpy as np
"""
Content-based Recommendation System
"""
class ContentBasedRecommendationSystem:
    """
    Content-based recommender that suggests the movie most similar to a watched one

    Titles and genres are read from MySQL, actor and director names from CSV
    files. ``train`` turns their concatenation into word-count vectors and
    ranks all movies by cosine similarity to ``watched_movie``.
    """

    # Columns whose text is concatenated (in this order) before vectorising.
    _FEATURE_COLUMNS = ('title', 'genres', 'actorName', 'directorName')

    def __init__(self, watched_movie, mysql):
        """
        Loads the movie catalogue and joins it with actor and director names
        :param watched_movie: title of the movie to base the recommendation on
        :param mysql: object whose get_connection() returns the db connection
        :raises ConnectionError: if the connection reports it is not connected
        """
        self.watched_movie = watched_movie
        self.mysql = mysql
        self.connection = mysql.get_connection()
        movies = self.read_table(
            """
            SELECT lenslinks.movieId, title, genres, imdbId
            FROM movielenstable JOIN lenslinks ON movielenstable.movieId = lenslinks.movieId
            """
        )
        if movies is None:
            # read_table returns None when the connection is down; fail here
            # with a clear message rather than later inside pd.merge.
            raise ConnectionError('MySQL connection is not available')
        self.movie_by_actor = self._load_people("data/actors-movieIds.csv", 'actorName')
        self.movie_by_director = self._load_people("data/directors-movieIds.csv", 'directorName')
        # Inner joins: only movies that have both actors and directors remain.
        merged = pd.merge(left=movies, right=self.movie_by_actor, on='imdbId')
        self.movies = pd.merge(left=merged, right=self.movie_by_director, on='imdbId')

    @staticmethod
    def _load_people(source, name_column):
        """
        Reads a person-per-row CSV and collapses it to one row per movie
        :param source: path (or file-like object) handed to pd.read_csv; needs
            the columns imdbId and personName
        :param name_column: name given to the space-joined names column
        :return: DataFrame with the columns imdbId (int) and name_column
        """
        grouped = pd.read_csv(source).groupby('imdbId').agg({'personName': ' '.join})
        people = grouped.reset_index().rename(columns={'personName': name_column})
        people['imdbId'] = people['imdbId'].astype(int)
        return people

    def __del__(self):
        # __init__ may have failed before the connection was stored, so do not
        # assume the attribute exists.
        connection = getattr(self, 'connection', None)
        if connection is not None:
            print("Closing db connection")
            connection.close()

    def get_movie_title(self, i):
        """
        Gets the movie row stored under an index label
        :param i: index label of the movie in self.movies
        :return: the whole row (not only the title), or None if there is no
            such label
        """
        for position, label in enumerate(self.movies.index):
            if label == i:
                return self.movies.iloc[position]
        return None

    def get_movie_id(self, title):
        """
        Gets the index label of the first movie with the given title
        :param title: movie title, compared for exact equality
        :return: index label in self.movies, or None if no title matches
        """
        titles = self.movies['title']
        return next((label for label, value in titles.items() if value == title), None)

    def train(self):
        """
        Finds the movie most similar to the watched one and prints it

        Similarity is the cosine similarity between word-count vectors built
        from each movie's title, genres, actor names and director names.
        Nothing is returned; all results are printed.
        """
        watched_label = self.get_movie_id(self.watched_movie)
        if watched_label is None:
            # Indexing the similarity matrix with None would add an axis
            # instead of selecting a row, so stop before any computation.
            print('Movie', self.watched_movie, 'was not found in the catalogue')
            return

        for attribute in self._FEATURE_COLUMNS:
            # Missing values would otherwise be vectorised as the token "nan".
            self.movies[attribute] = self.movies[attribute].fillna('')
        self.movies['merged'] = self.movies.apply(self.merge, axis=1)

        count_vectorized = cv()
        cs = cosine_similarity(count_vectorized.fit_transform(self.movies['merged']))
        # Similarity rows are positional while get_movie_id returns an index
        # label; translate so the lookup also holds for any unique index.
        watched_position = self.movies.index.get_loc(watched_label)
        recommended_movies = list(enumerate(cs[watched_position]))
        if len(recommended_movies) < 2:
            # Only the watched movie itself is available: nothing to recommend.
            print('Something went wrong with the analysis')
            return

        predicted = self.get_highest(recommended_movies, exclude=watched_position)
        recommended_row = self.movies.iloc[predicted[0]]
        watched_row = self.movies.iloc[watched_position]
        print('\nSince you\'ve liked', self.watched_movie, 'We recommend: ',
              recommended_row['title'], 'genres:', recommended_row['genres'])
        # The printed "Accuracy" is the cosine similarity score of the match.
        print('Accuracy', predicted[1])
        print(self.watched_movie, 'movie\'s genre:', watched_row['genres'])
        print()

    def get_highest(self, recommended, exclude=None):
        """
        Gets the best-scoring entry that is not the watched movie
        :param recommended: list of (position, score) pairs
        :param exclude: position of the watched movie; when given, that entry
            is dropped and the top remaining one is returned. When None, the
            second-ranked entry is returned, which assumes the watched movie
            ranks first
        :return: the selected (position, score) pair
        :raises IndexError: if there are too few entries to select from
        """
        ranked = sorted(
            (pair for pair in recommended if exclude is None or pair[0] != exclude),
            key=lambda elem: elem[1],
            reverse=True,
        )
        return ranked[0] if exclude is not None else ranked[1]

    def merge(self, row):
        """
        Merges the text features of a single row into one string
        :param row: mapping with the keys title, genres, actorName, directorName
        :return: the four values separated by single spaces
        """
        return ' '.join(str(row[column]) for column in self._FEATURE_COLUMNS)

    def read_table(self, query):
        """
        Executes a MySQL query and converts the result into pandas format
        :param query: mysql query that needs to be executed
        :return: result of the query as a DataFrame, or None when the
            connection reports it is not connected
        """
        if self.connection.is_connected():
            return pd.read_sql_query(query, self.connection)
        return None
def main():
    """
    Runs the recommender once for a fixed, hard-coded movie title
    """
    database = MySQL()
    recommender = ContentBasedRecommendationSystem('Iron Man (1931)', database)
    recommender.train()
# Run the demo only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()