import sqlite3

import pandas as pd

from utils.utils import extract_features
from utils.opening_feat import save_obj

# load the normalized 128-d bag-of-features extracted from each trailer
_movies_bof_normalized = extract_features('content/bof_128.bin')

conn = sqlite3.connect('content/database.db')

# trailers each user liked (rating > 4); alias userid -> userID so the
# DataFrame column matches the accesses below
_user_ratings = pd.read_sql("SELECT r.userid AS userID, t.id "
                            "FROM movielens_rating r "
                            "JOIN movielens_movie m ON m.movielensid = r.movielensid "
                            "JOIN trailers t ON t.imdbid = m.imdbidtt "
                            "AND t.best_file = 1 "
                            "WHERE r.rating > 4 "
                            "AND r.userid < 5000 "
                            "ORDER BY r.userid", conn)

# build each user's bag-of-features: the feature vectors of the movies
# that user rated highly
users_bof = {}
_users = _user_ratings['userID'].unique()
for user in _users:
    users_bof[user] = []
    _current_user_ratings = _user_ratings[_user_ratings['userID'] == user]
    for key, item in _current_user_ratings.iterrows():
        item_bof = _movies_bof_normalized[item['id']]
        users_bof[user].append(item_bof)

save_obj(users_bof, '3112_users_bof')
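# Quick sanity check (a sketch; assumes load_features, used elsewhere in
# this repo, is the reader counterpart of save_obj and lives in utils.utils):
from utils.utils import load_features  # assumed location
_check = load_features('content/3112_users_bof.pkl')
_first_user = _check.keys()[0]
# each entry should hold one 128-d vector per movie the user liked
print _first_user, len(_check[_first_user]), len(_check[_first_user][0])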
from sklearn.metrics.pairwise import cosine_similarity

from utils.opening_feat import save_obj

# tfidf_array and _all_ratings are assumed to have been loaded above:
# row i of tfidf_array is the tf-idf vector of the trailer in row i of
# _all_ratings (whose first column is the trailer id)
trailer_tfidf_similarities = dict()
for i in range(0, len(tfidf_array)):
    trailer_id = _all_ratings.iloc[i]
    print trailer_id[0]
    trailer_tfidf_similarities[trailer_id[0]] = {}
    for j in range(0, len(tfidf_array)):
        # self-comparison (i == j) is kept; it simply yields similarity 1.0
        sim = cosine_similarity([tfidf_array[i]], [tfidf_array[j]])
        trailer_tfidf_similarities[trailer_id[0]][_all_ratings.iloc[j][0]] = sim[0][0]

save_obj(trailer_tfidf_similarities, 'trailer_tfidf_synopsis_similarities')
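# Faster equivalent (a sketch): sklearn can compute the whole pairwise
# similarity matrix in a single vectorized call instead of O(n^2)
# Python-level calls, producing the same nested-dict structure.
sim_matrix = cosine_similarity(tfidf_array)  # shape (n_trailers, n_trailers)
ids = [_all_ratings.iloc[i][0] for i in range(len(tfidf_array))]
trailer_tfidf_similarities = dict(
    (ids[i], dict(zip(ids, sim_matrix[i]))) for i in range(len(ids)))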
# (fragment) body of the item-item loop over `movie` and `neighbor`;
# the leading guard is assumed to skip self-comparison
        if neighbor == movie:
            continue
        intersect = pd.merge(_all_ratings[_all_ratings['id'] == movie],
                             _all_ratings[_all_ratings['id'] == neighbor],
                             on='userID')
        if not intersect.empty:
            try:
                # adjusted cosine (mean-centering each user's ratings)
                # replaced the earlier plain cosine on raw rating vectors
                sim = adjusted_cosine(intersect, user_profiles)
                movie_similarity[movie].append((neighbor, sim))
            except ValueError:
                continue
        else:
            # no co-rating users: treat the pair as dissimilar
            movie_similarity[movie].append((neighbor, 0))
    movie_similarity[movie] = sort_desc(movie_similarity[movie])

save_obj(movie_similarity, 'item_item_collaborative_similarities')
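# `adjusted_cosine` is defined elsewhere in the repo; a sketch of the
# standard adjusted-cosine formula it presumably implements (an assumption):
# center each rating by that user's average, then take the cosine of the
# centered vectors.
import numpy as np

def adjusted_cosine_sketch(intersect, user_profiles):
    avgs = user_profiles.loc[intersect['userID']]['avg'].values
    dx = intersect['rating_x'].values - avgs
    dy = intersect['rating_y'].values - avgs
    denom = np.sqrt((dx ** 2).sum()) * np.sqrt((dy ** 2).sum())
    if denom == 0:
        # mirrors the ValueError the caller above catches
        raise ValueError('zero variance over co-rated items')
    return dx.dot(dy) / denom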
# (fragment) tail of the per-method metrics dict
            'precision': hducp, 'recall': hducr, 'diversity': hducd,
            'mae': hducm, 'rankscore': hducrs, 'f1': hducf1},
        # 'switching-hybrid': {'precision': swp, 'recall': swr, 'diversity': swd, 'mae': swm, 'rankscore': swrs, 'f1': swf1},
        # 'linear-regression': {'precision': lrp, 'recall': lrr, 'diversity': lrd, 'mae': lrm},
        # 'weighted-hybrid-content-item': {'precision': h2p, 'recall': h2r, 'diversity': h2d, 'mae': h2m, 'rankscore': h2rs, 'f1': h2f1},
        # 'weighted-hybrid-collaborative': {'precision': h3p, 'recall': h3r, 'diversity': h3d, 'mae': h3m, 'rankscore': h3rs, 'f1': h3f1},
    }

# run the experiment for each value of `index` (2..15)
results = {}
for index in range(2, 16):
    results[index] = experiment(index, new_user_profiles, convnet_sim_matrix,
                                low_level_sim_matrix, _trailers_tfidf_sims_matrix,
                                _trailers_tfidf_synopsis_sims_matrix)

print results[15]

save_obj(new_user_profiles, 'profiles_with_predictions')
save_obj(results, 'results_50_users')

end = time.time()
print "Execution time", (end - start), "seconds."
import time

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from utils.utils import sort_desc
from utils.opening_feat import save_obj

start = time.time()
count = 0

item_similarities = dict()
for key, target_movie_movielens_id in _all_movies.iterrows():
    # only movies after the current one: each pair is computed once, so the
    # saved dict holds the upper triangle of the similarity matrix
    other_items = _all_movies.iloc[key + 1:]
    similarities = []
    for sub_key, neighbour_movie_id in other_items.iterrows():
        # users who rated both movies
        join_ratings = pd.merge(
            _all_ratings[_all_ratings['movielensID'] == neighbour_movie_id.iloc[0]],
            _all_ratings[_all_ratings['movielensID'] == target_movie_movielens_id.iloc[0]],
            on='userID')
        sim = 0
        if len(join_ratings) > 0:
            ratings_x = np.array(join_ratings['rating_x'])
            ratings_y = np.array(join_ratings['rating_y'])
            sim = cosine_similarity(ratings_x.reshape(1, -1),
                                    ratings_y.reshape(1, -1))[0][0]
        # store the neighbour's id, not the whole row
        similarities.append((neighbour_movie_id.iloc[0], sim))
    # keep only the 30 most similar neighbours
    ordered = sort_desc(similarities)[:30]
    item_similarities[target_movie_movielens_id.iloc[0]] = ordered
    count += 1
    if count % 100 == 0:
        print count, "movies read"
    # break  # debug leftover: would stop after the first movie

print "finished in", time.time() - start, "seconds"
save_obj(item_similarities, 'item_collaborative_similarity')
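# Downstream sketch (an assumption about how the top-30 lists are used):
# predict a user's rating for a movie as the similarity-weighted average of
# that user's ratings on the movie's stored neighbours. `user_ratings` is a
# hypothetical dict mapping movielensID -> rating.
def predict_rating(user_ratings, neighbors):
    num, den = 0.0, 0.0
    for neighbor_id, sim in neighbors:
        if neighbor_id in user_ratings and sim > 0:
            num += sim * user_ratings[neighbor_id]
            den += sim
    return num / den if den else None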
# (fragment) inner loop of the per-user gradient-descent fit;
# theta_vectors[user] is a 129-d weight vector (bias + 128 deep features)
for movie in user_movies:
    try:
        # prepend the bias term x0 = 1 to the 128-d deep-feature vector
        new_movie_vector = np.insert(_deep_features_bof[movie], 0, 1)
    except KeyError:
        continue
    rating = _all_ratings[(_all_ratings['userID'] == user) &
                          (_all_ratings['id'] == movie)]['rating'].iloc[0]
    # compute the prediction error once so every theta component is updated
    # simultaneously (the original recomputed it after each component update)
    error = theta_vectors[user].reshape(-1, 129).dot(
        new_movie_vector.reshape(129, -1))[0][0] - rating
    # bias term: no regularization
    theta_vectors[user][0] -= _alpha * error * new_movie_vector[0]
    # remaining weights: L2-regularized gradient step
    for index in range(1, len(theta_vectors[user])):
        theta_vectors[user][index] -= _alpha * (
            error * new_movie_vector[index] + _lambda * theta_vectors[user][index])
print "user", user

save_obj(theta_vectors, 'users_theta_vectors')
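# Equivalent vectorized update (a sketch; assumes theta_vectors[user] can
# be treated as a flat 129-d numpy array):
theta = theta_vectors[user].reshape(-1)
error = theta.dot(new_movie_vector) - rating
gradient = error * new_movie_vector + _lambda * theta
gradient[0] = error * new_movie_vector[0]  # the bias term is not regularized
theta -= _alpha * gradient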
from sklearn.feature_extraction.text import TfidfTransformer

# assumed setup: the transformer is fit on the raw tag-count vectors below
transformer = TfidfTransformer()

movies_tag_vectors = []
c = conn.cursor()
for key, movie in _all_ratings.iterrows():
    movie_tag_vector = []
    print movie[0]
    # count how many times each tag was applied to this movie
    for subkey, tag in _all_tags.iterrows():
        count_tag = c.execute(sql_count_tags, (movie[0], tag[0],))
        movie_count_tags = count_tag.fetchall()
        movie_tag_vector.append(movie_count_tags[0][0])
    movies_tag_vectors.append(movie_tag_vector)

# turn the raw count matrix into tf-idf weights
tfidf = transformer.fit_transform(movies_tag_vectors)
save_obj(tfidf.toarray(), 'movies_tfidf_array')
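# Toy sanity check: TfidfTransformer expects exactly this kind of raw count
# matrix and returns row-normalized tf-idf weights, so rare tags end up
# with higher weight than ubiquitous ones.
toy_counts = [[3, 0, 1],
              [2, 0, 0],
              [0, 1, 2]]
toy_tfidf = TfidfTransformer().fit_transform(toy_counts)
print toy_tfidf.toarray()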
import numpy as np
from hausdorff import hausdorff

from utils.utils import sort_desc, load_features  # load_features location assumed
from utils.opening_feat import save_obj

_users_bof = load_features('content/3112_users_bof.pkl')

users_bof_similarities = {}
for key, user_bof in _users_bof.iteritems():
    users_bof_similarities[key] = []
    print "current user", key
    for neighbor, neighbor_bof in _users_bof.iteritems():
        if neighbor == key:
            continue
        # Hausdorff distance between the two users' sets of movie vectors
        sim = hausdorff(np.array(user_bof), np.array(neighbor_bof))
        users_bof_similarities[key].append((neighbor, sim))
    # hausdorff is a distance, so sort ascending: closest neighbours first
    users_bof_similarities[key] = sort_desc(users_bof_similarities[key], desc=False)

save_obj(users_bof_similarities, '3112_user_user_bof_similarities')
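# Performance sketch (assumes unchanged semantics): convert each bag of
# features to a numpy array once instead of inside the O(n^2) loop, and
# exploit the symmetry hausdorff(a, b) == hausdorff(b, a) to halve the work.
_users_bof_np = dict((u, np.array(bof)) for u, bof in _users_bof.iteritems())
users = _users_bof_np.keys()
distances = dict((u, []) for u in users)
for i, u in enumerate(users):
    for v in users[i + 1:]:
        d = hausdorff(_users_bof_np[u], _users_bof_np[v])
        distances[u].append((v, d))
        distances[v].append((u, d))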
import sqlite3

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from utils.opening_feat import save_obj

v = TfidfVectorizer()

conn = sqlite3.connect('/home/ralph/Dev/content-based-recsys/content/database.db')

# one row per trailer, with the movie's plot synopsis
_all_movies = pd.read_sql('select distinct t.id, ms.Plot '
                          'from movielens_rating r '
                          'join movielens_movie m on m.movielensid = r.movielensid '
                          'join trailers t on t.imdbid = m.imdbidtt '
                          'join movies ms on ms.imdbID = t.imdbid '
                          'where t.best_file = 1 '
                          # 'and userid < 5000 '
                          'order by t.id ', conn)

plots = []
for key, movie in _all_movies.iterrows():
    print key
    plots.append(movie['Plot'])

# tf-idf matrix over the plot texts; row i matches row i of _all_movies
x = v.fit_transform(plots)
save_obj(x.toarray(), 'movies_tfidf_synopsis_array')
# build the user x movie ratings matrix; fetch the movie list once
# (the original re-ran the query for every user)
_movies = c.execute(_movies_sql).fetchall()

row = 0  # assumed initialized above in the original fragment
for user in _3112_user_ratings['userID'].unique():
    column = 0
    for movie in _movies:
        _user_rating = get_user_rating(_3112_user_ratings, user, movie[0])
        if not _user_rating.empty:
            _ratings_matrix[row][column] = _user_rating['rating'].iloc[0]
        column += 1
    row += 1

df = pd.DataFrame(_ratings_matrix)
# impute each movie's (column) mean rating into the empty cells
df.fillna(df.mean(), inplace=True)

# save the completed matrix, with item means in the previously empty cells
save_obj(df, 'full_matrix_for_svd')

# earlier variants (scaling, L2-normalization, zero imputation) were tried
# and dropped:
# scaled = preprocessing.scale(np.matrix(full_ratings))
# full_matrix = np.nan_to_num(np.array(full_ratings))
# normalized = preprocessing.normalize(full_matrix, norm='l2')
# np.savetxt('full_matrix_for_svd_item_mean_imputation', np.matrix(full_ratings))
# np.savetxt('full_matrix_for_svd_normalized', normalized)
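# Downstream sketch (an assumption about how the saved matrix feeds the
# SVD step): factorize the mean-imputed matrix and keep the top-k singular
# vectors; the low-rank reconstruction serves as the prediction matrix.
import numpy as np
U, s, Vt = np.linalg.svd(df.values, full_matrices=False)
k = 20  # number of latent factors, a tunable choice
predictions = U[:, :k].dot(np.diag(s[:k])).dot(Vt[:k, :])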
# (fragment) inner loop over candidate neighbours of `user`
        try:
            neighbor_average = user_profiles.loc[neighbor]['avg']
        except IndexError as e:
            print e, "neighbor", neighbor, "failed"
        try:
            # items rated by both the neighbour and the target user
            intersect = pd.merge(_all_ratings[_all_ratings['userID'] == neighbor],
                                 target_user_ratings, on='id')
            if len(intersect) < 5:
                # too little overlap for a reliable correlation
                sim = 0
            else:
                sim = pearsonr(intersect['rating_x'], intersect['rating_y'])[0]
                # (a manual mean-centered cosine using neighbor_average and
                # target_user_average was tried here and disabled)
        except ValueError:
            sim = 0
        if not (sim > 0 or sim < 0):
            # pearsonr returns NaN for constant rating vectors; treat as 0
            sim = 0
        user_user_similarities[user].append((neighbor, sim))
    user_user_similarities[user] = sort_desc(user_user_similarities[user])

save_obj(user_user_similarities, 'user_user_collaborative_similarities')
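# The disabled manual formula is mean-centered cosine; a vectorized sketch
# (assumes target_user_average is the target user's mean rating). Unlike
# pearsonr, which centers on the sample means of the overlap, this centers
# each side on the user's global average:
import numpy as np
dx = intersect['rating_x'].values - neighbor_average
dy = intersect['rating_y'].values - target_user_average
denom = np.sqrt((dx ** 2).sum()) * np.sqrt((dy ** 2).sum())
sim_manual = dx.dot(dy) / denom if denom else 0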