def get_switching_hybrid_recommendations(movies_to_predict, _all_ratings, _target_user_id, sim_matrix):
    """Switching hybrid item-based prediction for one target user.

    For each (trailer_id, rating) pair in movies_to_predict, the similarity
    between trailer_id and every movie the target user already rated is
    computed from co-raters' ratings (cosine similarity over the merged
    rating columns).  When that computation raises ValueError, the code
    "switches" to the precomputed sim_matrix; if that has no entry either
    (KeyError) the rated movie is skipped.  The prediction is the
    similarity-weighted average of the target user's own ratings over the
    top `_limit_top_neighbours_to` neighbours; 0 when no usable neighbour.
    Returns (trailer_id, prediction) pairs ordered by sort_desc.

    NOTE(review): `Series.reshape` was removed in modern pandas; this
    presumably targets an old pandas/sklearn stack where it still worked
    (newer code would need `.values.reshape(1, -1)`) — confirm the pinned
    versions before upgrading.
    """
    predictions = []
    _limit_top_neighbours_to = 20
    # every rating the target user has given
    target_user_ratings = _all_ratings[_all_ratings['userID'] == _target_user_id]
    for trailer_id, rating in movies_to_predict:
        top_neighbours = []
        # find most similar movies
        for rated_movie in target_user_ratings['id']:
            # users who rated BOTH rated_movie and trailer_id
            intersect = pd.merge(
                _all_ratings[_all_ratings['id'] == rated_movie],
                _all_ratings[_all_ratings['id'] == trailer_id],
                on='userID')
            # print intersect
            try:
                sim = cosine_similarity(intersect['rating_x'].reshape(1, -1),
                                        intersect['rating_y'].reshape(1, -1))
                top_neighbours.append((rated_movie, sim[0][0]))
            except ValueError:
                # switch: fall back to the precomputed similarity matrix
                try:
                    sim = sim_matrix[rated_movie][trailer_id]
                    top_neighbours.append((rated_movie, sim))
                except KeyError:
                    continue
        top_n = sort_desc(top_neighbours)[:_limit_top_neighbours_to]
        numerator, denominator = (0, 0)
        # similarity-weighted average of the user's own neighbour ratings
        for neighbour, sim in top_n:
            user_rating = _all_ratings[(_all_ratings['id'] == neighbour) & (
                _all_ratings['userID'] == _target_user_id)]['rating'].iloc[0]
            numerator += sim * user_rating
            denominator += abs(sim)
        try:
            p_ui = numerator / denominator
        except ZeroDivisionError:
            # no neighbours with non-zero similarity
            p_ui = 0
        predictions.append((trailer_id, p_ui))
    return sort_desc(predictions)
def get_tag_based_predictions(user_baseline, movies, all_movies, sim_matrix, _ratings_by_movie, _global_average):
    """Predict ratings for *movies* from tag-based item similarities.

    For each candidate, the similarity between it and every OTHER movie in
    *all_movies* is looked up in sim_matrix and passed to
    predict_user_rating; non-positive predictions are clamped to 0.
    Returns (movie_id, prediction) pairs ordered by sort_desc.
    """
    scored = []
    for candidate in movies:
        # (movie descriptor, tag similarity) pairs, excluding the candidate itself
        neighbours = [(other[1], sim_matrix[other[0]][candidate[0]])
                      for other in all_movies if other[0] != candidate[0]]
        predicted = predict_user_rating(user_baseline, candidate[0], neighbours,
                                        _ratings_by_movie, _global_average)
        scored.append((candidate[0], predicted if predicted > 0 else 0.))
    return sort_desc(scored)
def get_predictions_svd(movies_set, svd_matrix, movies_to_index, user_index, user_average):
    """Reconstruct predicted ratings from a truncated SVD factorisation.

    svd_matrix is the (u, s, v) triple; the prediction for a movie is
    sum_k u[user][k] * s[k] * v[k][movie].  Movies missing from
    movies_to_index fall back to user_average.
    Returns (trailer_id, prediction) pairs ordered by sort_desc.
    """
    u, s, v = svd_matrix
    predictions = []
    for trailer_id, _ in movies_set:
        try:
            column = movies_to_index[trailer_id]
            # dot product across all retained singular values
            score = sum(u[user_index][k] * s[k] * v[k][column]
                        for k in range(len(s)))
        except KeyError:
            # movie not in the factorised matrix: back off to the user's mean
            score = user_average
        predictions.append((trailer_id, score))
    return sort_desc(predictions)
def get_content_based_user_bof_predictions(_movies_set, _user_avg, _all_ratings, _user_user_sim_matrix, _user_profiles, _target_user_id):
    """User-user prediction using bag-of-features similarities.

    For each candidate trailer, every similar user (from
    _user_user_sim_matrix) who has rated the trailer contributes the term
    sim * (neighbour_avg - neighbour_rating); the normalised sum is added
    to the target user's average.  0 when the weights sum to zero.
    Returns (trailer_id, prediction) pairs ordered by sort_desc.

    NOTE(review): the deviation here is `avg - rating`, the OPPOSITE sign
    of `rating - avg` used in
    get_user_collaborative_predictions_precomputed_similarities.  It may be
    deliberate (these sims are Hausdorff distances, smaller = closer) but
    looks like a possible sign inversion — confirm before reuse.
    """
    predictions = []
    for trailer_id, rating in _movies_set:
        # all ratings given to this trailer
        rating_neighbors = _all_ratings[_all_ratings['id'] == trailer_id]
        rating_neighbors_users = list(rating_neighbors['userID'])
        # (neighbour id, similarity, neighbour's rating for the trailer)
        selected_neighbors = [(user, sim,
                               rating_neighbors[rating_neighbors['userID'] == user]['rating'].iloc[0])
                              for user, sim in _user_user_sim_matrix[_target_user_id]
                              if user in rating_neighbors_users]
        # print "Selected Neighbors"
        # print selected_neighbors
        # break
        try:
            # print sum([abs(sim) for u, sim, r in selected_neighbors])
            # break
            p_ui = _user_avg + sum([sim * (_user_profiles.loc[user]['avg'] - user_rating)
                                    for user, sim, user_rating in selected_neighbors]) / \
                sum([abs(sim) for u, sim, r in selected_neighbors])
        except ZeroDivisionError:
            # no neighbour rated this trailer (or all weights are zero)
            p_ui = 0
        predictions.append((trailer_id, p_ui))
    return sort_desc(predictions)
def get_user_collaborative_predictions_precomputed_similarities(
        movies_to_predict, _user_profiles, _all_ratings, _target_user_id,
        _user_avg, _user_user_sim_matrix):
    """User-based CF prediction with precomputed user-user similarities.

    For each candidate trailer, the 50 most similar users who rated it are
    selected; each contributes sim * (their rating - their average), and the
    normalised sum is added to the target user's average (mean-centred
    neighbourhood formula).  0 when the similarity weights sum to zero.

    Fix: removed a dead `global _avg_ratings` declaration — the body never
    reads or writes that name.

    Params: movies_to_predict — (trailer_id, rating) pairs to score;
    _user_profiles — DataFrame indexed by user with an 'avg' column;
    _all_ratings — DataFrame with 'userID', 'id', 'rating' columns;
    _user_user_sim_matrix — {user: [(neighbour, sim), ...]}.
    Returns (trailer_id, prediction) pairs ordered by sort_desc.
    """
    predictions = []
    _limit_top_neighbours_to = 50
    for trailer_id, rating in movies_to_predict:
        # all users who rated this trailer
        rating_neighbors = set(
            _all_ratings[_all_ratings['id'] == trailer_id]['userID'])
        # keep only precomputed neighbours that actually rated the trailer
        top_neighbors = [
            (neighbor, sim)
            for neighbor, sim in _user_user_sim_matrix[_target_user_id]
            if neighbor in rating_neighbors
        ]
        top_n = sort_desc(top_neighbors)[:_limit_top_neighbours_to]
        # mean-centred weighted prediction
        numerator, denominator = (0, 0)
        for neighbour, sim in top_n:
            neighbour_rating = _all_ratings[
                (_all_ratings['userID'] == neighbour) &
                (_all_ratings['id'] == trailer_id)]['rating'].iloc[0]
            numerator += sim * (neighbour_rating -
                                _user_profiles.loc[neighbour]['avg'])
            denominator += abs(sim)
        try:
            p_ui = _user_avg + numerator / denominator
        except ZeroDivisionError:
            # no overlapping neighbours with non-zero similarity
            p_ui = 0
        predictions.append((trailer_id, p_ui))
    return sort_desc(predictions)
def get_weighted_hybrid_recommendations(predictions, movie_set, num_vectors=2):
    """Average the scores several recommenders produced for each movie.

    Generalisation: the number of contributing recommenders was hard-coded
    to 2; it is now the keyword parameter `num_vectors` (default 2, so
    existing callers are unaffected).

    Params: predictions — flat list of (trailer_id, p_ui) pairs pooled from
    the individual recommenders (each trailer_id may appear up to
    num_vectors times); movie_set — (trailer_id, rating) pairs to score;
    num_vectors — divisor applied to each movie's summed score.
    Returns (trailer_id, averaged score) pairs ordered by sort_desc.
    """
    hybrid_predictions = []
    for trailer_id, ratings in movie_set:
        # sum every recommender's score for this movie, then average
        sum_ratings = sum(
            p_ui for tid, p_ui in predictions if tid == trailer_id)
        hybrid_predictions.append((trailer_id, sum_ratings / num_vectors))
    return sort_desc(hybrid_predictions)
def get_predictions_linear_regression(movies_set, _deep_features, _user_theta_vectors, userid):
    """Score each movie with the user's learned linear-regression weights.

    A bias term (1) is prepended to each movie's deep-feature vector, which
    is then dotted with the user's theta vector.
    Returns (trailer_id, prediction) pairs ordered by sort_desc.
    """
    theta = _user_theta_vectors[userid]
    predictions = []
    for trailer_id, _ in movies_set:
        # prepend the intercept/bias component before the dot product
        features = np.insert(_deep_features[trailer_id], 0, 1)
        predictions.append((trailer_id, theta.dot(features)))
    return sort_desc(predictions)
def get_content_based_predictions(user_baseline, movies, all_movies, sim_matrix, _ratings_by_movie, _global_average):
    """Content-based predictions from the item-item similarity matrix.

    Each candidate is scored by predict_user_rating over the similarities
    between it and every movie in *all_movies*.
    Returns (movie_id, prediction) pairs ordered by sort_desc.

    NOTE(review): unlike get_tag_based_predictions, the neighbour list here
    INCLUDES the candidate itself (no `movieJ[0] != movie[0]` filter; the
    filtered variant existed only as commented-out code) — confirm the
    self-similarity term is intentional.
    """
    scored = []
    for candidate in movies:
        neighbours = [(other[1], sim_matrix[other[0]][candidate[0]])
                      for other in all_movies]
        prediction = predict_user_rating(user_baseline, candidate[0],
                                         neighbours, _ratings_by_movie,
                                         _global_average)
        scored.append((candidate[0], prediction))
    return sort_desc(scored)
def get_item_collaborative_predictions_precomputed_similarities(
        movies_to_predict, _all_ratings, _target_user_id, _item_item_sim_matrix):
    """Item-based CF using precomputed item-item similarities.

    For each candidate trailer, walks its precomputed similarity list and
    collects (similarity, target-user rating) pairs for up to the first 20
    similar items the target user has rated; the prediction is the
    similarity-weighted average of those ratings.  0 when the weights sum
    to zero or the trailer has no similarity entry (KeyError).
    Returns (trailer_id, prediction) pairs ordered by sort_desc.

    NOTE(review): `rating` is reused for three things — the loop tuple
    element, a pandas Series, and a float.  `float(rating)` relies on
    single-element Series→float conversion succeeding and an empty Series
    raising TypeError; both are pandas-version dependent — confirm against
    the pinned pandas before refactoring.
    """
    predictions = []
    _limit_top_neighbours_to = 20
    # target_user_ratings = _all_ratings[_all_ratings['userID'] == _target_user_id]
    for trailer_id, rating in movies_to_predict:
        # print "Trailer id is", trailer_id
        try:
            _all_sim_items = _item_item_sim_matrix[trailer_id]
            # print "All sims are", _all_sim_items
            # break
            # _allowed_sim_items = _all_sim_items[:_limit_top_neighbours_to]
            allowed_sim_items = []
            for item in _all_sim_items:
                # the target user's rating for this neighbouring item (Series;
                # empty if the user never rated it)
                rating = _all_ratings[
                    (_all_ratings['userID'] == _target_user_id) &
                    (_all_ratings['id'] == item[0])]['rating']
                try:
                    # the current user rated this item
                    rating = float(rating)
                    allowed_sim_items.append((item[1], rating))
                except TypeError:
                    continue
                # stop once 20 rated neighbours have been gathered
                if len(allowed_sim_items) == _limit_top_neighbours_to:
                    break
            # print "Allowed:", allowed_sim_items
            # b_ui = get_item_baseline(user_baseline, trailer_id, _ratings_by_movie, _global_average)
            try:
                p_ui = (
                    sum([sim * rating for sim, rating in allowed_sim_items]) /
                    sum([abs(sim) for sim, rating in allowed_sim_items]))
            except ZeroDivisionError:
                p_ui = 0
        except KeyError:
            # no precomputed similarity list for this trailer
            p_ui = 0
        predictions.append((trailer_id, p_ui))
    return sort_desc(predictions)
# NOTE(review): fragment — this chunk begins mid-loop (leading `continue`),
# so the enclosing `for` over (movie, neighbor) pairs and the
# `movie_similarity` initialisation are outside this view.  For each
# neighbour it joins both movies' ratings on userID and stores an
# adjusted-cosine similarity (0 when the join is empty; ValueError skipped),
# sorts each movie's neighbour list with sort_desc, and finally pickles the
# whole mapping via save_obj('item_item_collaborative_similarities').
# The line below is the original whitespace-mangled text, left untouched.
continue intersect = pd.merge(_all_ratings[_all_ratings['id'] == movie], _all_ratings[_all_ratings['id'] == neighbor], on='userID') # print intersect # exit() # if len(intersect) > 4: if not intersect.empty: try: # sim = cosine_similarity(intersect['rating_x'].reshape(1, -1), intersect['rating_y'].reshape(1, -1)) # sim = cosine_similarity([intersect['rating_x']], [intersect['rating_y']]) sim = adjusted_cosine(intersect, user_profiles) movie_similarity[movie].append((neighbor, sim)) # sim = cosine_similarity(intersect['rating_x'].reshape(1, -1), intersect['rating_y'].reshape(1, -1)) # movie_similarity[movie].append((neighbor, sim[0][0])) except ValueError: continue else: movie_similarity[movie].append((neighbor, 0)) movie_similarity[movie] = sort_desc(movie_similarity[movie]) # print movie_similarity[movie] # break save_obj(movie_similarity, 'item_item_collaborative_similarities')
# NOTE(review): fragment of a longer script — `count`, `start`, pd, np,
# cosine_similarity, sort_desc and save_obj are bound outside this view.
# Builds an upper-triangular item-item similarity table: for each movie it
# merges co-raters (`movielensID` join on userID) with every LATER movie,
# computes cosine similarity over the shared ratings (0 when no co-raters),
# keeps the top 30 per movie, and pickles the result via
# save_obj('item_collaborative_similarity').  The placement of the `break`
# after the progress print is ambiguous in this whitespace-mangled text
# (it looks like a leftover debug early-exit) — recover the original
# indentation before editing.  Original line kept untouched below.
item_similarities = dict() for key, target_movie_movielens_id in _all_movies.iterrows(): other_items = _all_movies.iloc[key + 1:] similarities = [] for sub_key, neighbour_movie_id in other_items.iterrows(): join_ratings = pd.merge(_all_ratings[_all_ratings['movielensID'] == neighbour_movie_id.iloc[0]], _all_ratings[_all_ratings['movielensID'] == target_movie_movielens_id.iloc[0]], on='userID') sim = 0 if len(join_ratings) > 0: ratings_x = np.array(join_ratings['rating_x']) ratings_y = np.array(join_ratings['rating_y']) sim = cosine_similarity(ratings_x.reshape(1, -1), ratings_y.reshape(1, -1))[0][0] similarities.append((neighbour_movie_id, sim)) ordered = sort_desc(similarities)[:30] item_similarities[target_movie_movielens_id.iloc[0]] = ordered count += 1 if count % 100 == 0: print count, "movies read" break # print item_similarities print "finished in", time.time() - start, "seconds" save_obj(item_similarities, 'item_collaborative_similarity')
from hausdorff import hausdorff from utils.utils import sort_desc import numpy as np _users_bof = load_features('content/3112_users_bof.pkl') # test_user_1 = np.array(_users_bof[1]) # test_user_3 = np.array(_users_bof[7]) # print test_user_1 # print hausdorff(test_user_1, test_user_3) users_bof_similarities = {} for key, user_bof in _users_bof.iteritems(): users_bof_similarities[key] = [] print "current user", key for neighbor, neighbor_bof in _users_bof.iteritems(): if neighbor == key: continue sim = hausdorff(np.array(user_bof), np.array(neighbor_bof)) users_bof_similarities[key].append((neighbor, sim)) users_bof_similarities[key] = sort_desc(users_bof_similarities[key], desc=False) # print users_bof_similarities[key] # break save_obj(users_bof_similarities, '3112_user_user_bof_similarities')
# NOTE(review): fragment — begins mid-`try` (the `except IndexError` below
# implies a `try:` above this view), and `user`, `neighbor`,
# `target_user_ratings` and `user_user_similarities` are bound outside it.
# Computes user-user Pearson similarity over co-rated items (joined on
# 'id'), requiring at least 5 shared ratings; `neighbor_average` is fetched
# but unused here (presumably for the commented-out manual Pearson).  The
# `if not (sim > 0 or sim < 0)` guard maps NaN to 0, since NaN compares
# False both ways.  Result sorted per user and pickled via
# save_obj('user_user_collaborative_similarities').  Original line kept
# untouched below.
neighbor_average = user_profiles.loc[neighbor]['avg'] except IndexError as e: print e, "neighbor", neighbor, "failed" try: intersect = pd.merge( _all_ratings[_all_ratings['userID'] == neighbor], target_user_ratings, on='id') if len(intersect) < 5: sim = 0 else: sim = pearsonr(intersect['rating_x'], intersect['rating_y'])[0] # ssim = sum([(item['rating_x'] - neighbor_average) * (item['rating_y'] - target_user_average) # for k, item in intersect.iterrows()]) / ( # math.sqrt(sum([(item['rating_x'] - neighbor_average) ** 2 for k, item in intersect.iterrows()])) * # math.sqrt(sum([(item['rating_y'] - target_user_average) ** 2 for k, item in intersect.iterrows()]))) except ValueError: sim = 0 if not (sim > 0 or sim < 0): sim = 0 user_user_similarities[user].append((neighbor, sim)) user_user_similarities[user] = sort_desc(user_user_similarities[user]) # break save_obj(user_user_similarities, 'user_user_collaborative_similarities')