def test_nearest_neighbors():
    """Ensure the nearest neighbors are different when using user-user
    similarity vs item-item."""
    # BUG FIX: rating_scale is a Reader option; Dataset.load_from_file only
    # accepts (file_path, reader), so passing rating_scale there raised
    # TypeError.
    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                    rating_scale=(1, 5))
    data_file = os.path.dirname(os.path.realpath(__file__)) + '/custom_train'
    data = Dataset.load_from_file(data_file, reader)
    trainset = data.build_full_trainset()

    algo_ub = KNNBasic(sim_options={'user_based': True})
    algo_ub.fit(trainset)
    algo_ib = KNNBasic(sim_options={'user_based': False})
    algo_ib.fit(trainset)

    # User-user and item-item similarity must yield different neighborhoods.
    assert algo_ub.get_neighbors(0, k=10) != algo_ib.get_neighbors(0, k=10)
class ItemCF():
    """Item-based collaborative filtering backed by surprise's KNNBasic."""

    def __init__(self):
        data_path = os.path.expanduser('user_item_rate.csv')
        rating_reader = Reader(line_format='user item rating', sep=',')
        dataset = Dataset.load_from_file(data_path, reader=rating_reader)
        full_trainset = dataset.build_full_trainset()
        # Train an item-item similarity model.  Other sim_options choices:
        # {'name': 'cosine'|'msd'|'pearson'|'pearson_baseline', 'user_based': ...}
        self.item_algo = KNNBasic(k=10, min_k=3,
                                  sim_options={'user_based': False})
        self.item_algo.fit(full_trainset)

    def get_similar_items(self, top_k, item_id):
        """Similar items.

        Args:
            top_k(int): number of similar items to return
            item_id(str): raw item id

        Returns:
            list generator
        """
        trainset = self.item_algo.trainset
        inner_id = trainset.to_inner_iid(item_id)
        neighbor_inner_ids = self.item_algo.get_neighbors(inner_id, k=top_k)
        return (trainset.to_raw_iid(nid) for nid in neighbor_inner_ids)
def item_based_cf(self, co_pe, df_path):
    """Interactive item-based CF session: train KNNBasic with similarity
    measure `co_pe` on the ratings file at `df_path`, then print the k
    nearest-neighbor movies of a user-chosen title.

    Args:
        co_pe: similarity name ('cosine', 'msd', 'pearson', ...).
        df_path: path to a 'user item rating' CSV file.
    """
    # INITIALIZE REQUIRED PARAMETERS
    path = '/home/mister-t/Projects/PycharmProjects/RecommendationSys/ml-100k/u.item'
    prnt = "ITEM"
    sim_op = {'name': co_pe, 'user_based': False}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating", sep=',',
                    rating_scale=(1, 5))
    df = Dataset.load_from_file(df_path, reader=reader)

    # START TRAINING — fit() replaces the deprecated train() API.
    trainset = df.build_full_trainset()
    res = algo.fit(trainset)
    print("\t\t >>>TRAINED SET<<<<\n\n", res)

    # Read the mappings raw id <-> movie name
    rid_to_name, name_to_rid = self.read_item_names(path)

    print("CF Type:", prnt, "BASED")
    # Python 3: input() returns the raw string (raw_input is gone).
    search_key = input(
        "Enter a Movie Name, \n ex. Toy Story (1995) or Seven (Se7en) (1995)\n Movie name:"
    )
    print("ALGORITHM USED : ", co_pe)
    raw_id = name_to_rid[search_key]

    # MARKERS — record which similarity measure was used.  Text mode ("w"),
    # because writing str to a "wb" handle fails under Python 3; `with`
    # guarantees the file is closed.
    with io.open("cluster/AlgoHist_ib.txt", "w") as f:
        f.write(repr(co_pe))
    # MARKERS END

    print("\t\t RAW ID>>>>>>>", raw_id, "<<<<<<<")
    inner_id = algo.trainset.to_inner_iid(raw_id)
    print("INNER ID >>>>>", inner_id)

    # Retrieve inner ids of the nearest neighbors of the chosen movie.
    # int(...) because Python 3's input() no longer evaluates its input.
    k = int(input("Enter size of Neighborhood (Min:1, Max:40)"))
    neighbors = algo.get_neighbors(inner_id, k=k)
    raw_ids = (algo.trainset.to_raw_iid(nid) for nid in neighbors)
    names = (rid_to_name[rid] for rid in raw_ids)
    print("Nearest ", k, " Matching Items are:")
    for name in names:
        print("\t " * 6, name)
def user_based_cf(co_pe):
    """Interactive user-based CF session: train KNNBasic with similarity
    measure `co_pe` on ml-100k, show a prediction, and list a user's k
    nearest neighbors.

    Args:
        co_pe: similarity name ('cosine', 'msd', 'pearson', ...).

    Returns:
        (top_n, result_u): top-5 recommendations per user and a success flag.
    """
    # INITIALIZE REQUIRED PARAMETERS
    prnt = "USER"
    sim_op = {'name': co_pe, 'user_based': True}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating", sep='\t',
                    rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING — fit() replaces the deprecated train() API.
    trainset = df.build_full_trainset()
    algo.fit(trainset)

    print("ALGORITHM USED", co_pe)

    # MARKERS — text mode ("w") so writing str works under Python 3; `with`
    # guarantees the handle is closed.
    with io.open("_AlgoHist_ub.txt", "w") as f:
        f.write(repr(co_pe))
    # MARKERS END

    print("CF Type:", prnt, "BASED")

    # PEEKING PREDICTED VALUES — Python 3 input() returns strings, so the
    # rating is converted explicitly instead of relying on py2 eval-input.
    search_key = input("Enter User ID:")
    item_id = input("Enter Item ID:")
    actual_rating = float(input("Enter actual Rating:"))
    print(algo.predict(str(search_key), item_id, actual_rating))

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)
    top_n = get_top_n(predictions, 5)
    result_u = True

    k = int(input("Enter size of Neighborhood (Min:1, Max:40)"))
    # BUG FIX: search_key is a *user* id, so it must be mapped with
    # to_inner_uid, not to_inner_iid.
    inner_id = algo.trainset.to_inner_uid(search_key)
    neighbors = algo.get_neighbors(inner_id, k=k)
    print("Nearest Matching users are:")
    for i in neighbors:
        print("\t " * 6, i)
    return top_n, result_u
def FriendRecommender(user):
    """Find the 3 users most similar to `user` (cosine, user-based CF) and
    record their raw ids in the external list `x`."""
    df = pd.DataFrame(rating_dict)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['users', 'colleges', 'rating']], reader)
    trainset = data.build_full_trainset()
    # BUG FIX: surprise's key is 'user_based' (underscore) — 'user-based' was
    # silently ignored.  Also pass the dict by keyword: KNNBasic's first
    # positional parameter is k, so KNNBasic(sim_options) discarded it.
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNBasic(sim_options=sim_options)
    algo.fit(trainset)
    uid = trainset.to_inner_uid(user)
    pred = algo.get_neighbors(uid, 3)
    for i in pred:
        # NOTE(review): `x` comes from outside this function, and inserting at
        # the inner-id position looks unintended — append is probably meant.
        # Left as-is to preserve the caller-visible contract; confirm.
        x.insert(i, (trainset.to_raw_uid(i)))
def FriendRecommender(user):
    """Print the raw ids of the 3 users most similar to `user`
    (cosine similarity, user-based approach)."""
    df = pd.DataFrame(rating_dict)
    reader = Reader(rating_scale=(1, 5))  # the ratings range from 1 to 5
    data = Dataset.load_from_df(df[['user', 'item', 'rating']], reader)
    trainset = data.build_full_trainset()
    # Using cosine to measure similarities, user based approach.
    sim_options = {'name': 'cosine', 'user_based': True}
    # BUG FIX: KNNBasic's first positional parameter is k, so the options dict
    # must be passed by keyword or it is silently ignored (defaulting to MSD).
    algo = KNNBasic(sim_options=sim_options)
    algo.fit(trainset)
    uid = trainset.to_inner_uid(user)
    # returns 3 nearest neighbours of inputted user
    pred = algo.get_neighbors(uid, 3)
    for i in pred:
        print(trainset.to_raw_uid(i))
class Movie_KNN_recommender:
    """Item-based KNN movie recommender over the personal MovieLens CSVs."""

    def __init__(self, mode=0):
        self.index = pd.read_csv('../data/personal/movies.csv')
        self.reader = Reader()
        self.ratings = pd.read_csv('../data/personal/ratings.csv')
        dataset = Dataset.load_from_df(
            self.ratings[['userId', 'movieId', 'rating']], self.reader)
        full_trainset = dataset.build_full_trainset()
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        # mode selects the KNN flavour: 0 baseline, 1 with-means, 2 basic.
        if mode == 0:
            self.algo = KNNBaseline(sim_options=sim_options)
        elif mode == 1:
            self.algo = KNNWithMeans(sim_options=sim_options)
        elif mode == 2:
            self.algo = KNNBasic(sim_options=sim_options)
        else:
            exit(0)
        self.algo.fit(full_trainset)

    def get_similar_movies(self, movieID, num=10):
        """Return (and print) the raw ids of the `num` movies nearest to movieID."""
        trainset = self.algo.trainset
        inner_id = trainset.to_inner_iid(movieID)
        neighbor_ids = [trainset.to_raw_iid(nid)
                        for nid in self.algo.get_neighbors(inner_id, k=num)]
        print(neighbor_ids)
        return neighbor_ids

    def debug(self):
        """Print ratings of user 1 and the movies rated by the nearest neighbor."""
        similar_users = self.get_similar_movies(1, 1)
        print(self.ratings[self.ratings.userId == 1].head())
        for uid in similar_users:
            print(list(self.ratings[self.ratings.userId == uid]['movieId']))

    def recommend(self, movieID, num=10):
        """Return the title Series of each of the `num` movies nearest to movieID."""
        recommending = []
        for similar_id in self.get_similar_movies(movieID, num):
            recommending.append(
                self.index[self.index.movieId == similar_id]['title'])
        return recommending
def FriendRecommender(user):
    """Return the raw ids of the 3 users most similar to `user`
    (cosine similarity, user-based, binary favorite ratings)."""
    df = pd.DataFrame(rating_dic)
    reader = Reader(rating_scale=(0, 1))
    data = Dataset.load_from_df(df[['user', 'game', 'favorite']], reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'cosine', 'user_based': True}
    # BUG FIX: KNNBasic's first positional parameter is k, so the options dict
    # must be passed by keyword or it is silently ignored (defaulting to MSD).
    algo = KNNBasic(sim_options=sim_options)
    algo.fit(trainset)
    uid = trainset.to_inner_uid(user)
    pred = algo.get_neighbors(uid, 3)
    list_result = []
    for i in pred:
        list_result.append(trainset.to_raw_uid(i))
    return list_result
class Personal_KNN_recommender:
    """User-based KNN recommender trained on train.csv and evaluated against
    the users appearing in test.csv."""

    def __init__(self, mode=0):
        self.index = pd.read_csv('../data/personal/movies.csv')
        self.reader = Reader()
        self.ratings = pd.read_csv('../data/personal/train.csv')
        self.testings = pd.read_csv('../data/personal/test.csv')
        dataset = Dataset.load_from_df(
            self.ratings[['userId', 'movieId', 'rating']], self.reader)
        full_trainset = dataset.build_full_trainset()
        sim_options = {'name': 'pearson_baseline', 'user_based': True}
        # mode selects the KNN flavour: 0 baseline, 1 with-means, 2 basic.
        if mode == 0:
            self.algo = KNNBaseline(sim_options=sim_options)
        elif mode == 1:
            self.algo = KNNWithMeans(sim_options=sim_options)
        elif mode == 2:
            self.algo = KNNBasic(sim_options=sim_options)
        else:
            exit(0)
        # Distinct user ids from the test split, kept in first-seen order.
        self.userid = []
        for uid in self.testings['userId']:
            if uid not in self.userid:
                self.userid.append(uid)
        self.algo.fit(full_trainset)

    def get_similar_users(self, usrID, num=10):
        """Return the raw ids of the `num` users most similar to usrID."""
        trainset = self.algo.trainset
        inner_id = trainset.to_inner_uid(usrID)
        return [trainset.to_raw_uid(nid)
                for nid in self.algo.get_neighbors(inner_id, k=num)]

    def debug(self):
        """Print user 1's ratings and the movies rated by the nearest user."""
        similar_users = self.get_similar_users(1, 1)
        print(self.ratings[self.ratings.userId == 1].head())
        for uid in similar_users:
            print(list(self.ratings[self.ratings.userId == uid]['movieId']))

    def recommend(self, usrID, num=5):
        """Recommend up to `num` unseen movies to usrID by summing neighbor
        ratings; returns (titles, movie ids)."""
        seen = list(self.ratings[self.ratings.userId == usrID]['movieId'])
        scores = {}
        for neighbor in self.get_similar_users(usrID, num):
            rows = self.ratings[self.ratings.userId == neighbor]
            # Accumulate ratings of movies the target user has not seen.
            for movie, vote in zip(list(rows['movieId']), list(rows['rating'])):
                if movie not in seen:
                    scores[movie] = scores.get(movie, 0) + vote
        # Rank by accumulated score, keep the top `num`.
        ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:num]
        titles = []
        ids = []
        for movie_id, _ in ranked:
            titles.append(self.index[self.index.movieId == movie_id]['title'])
            ids.append(movie_id)
        # Return the recommended movie titles and ids.
        return titles, ids

    def test(self, num=10):
        """Write recommendations for every test-set user to ./result.csv."""
        recommended = []
        for user in self.userid:
            _, ids = self.recommend(user, num)
            recommended.append(ids)
        with open("./result.csv", "w") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['userId', 'result'])
            for i, row in enumerate(recommended):
                writer.writerow([self.userid[i], row])
# Fragment: tail of read_item_names() — its `def` line precedes this chunk.
# Builds raw-id <-> movie-name maps from a '|'-separated file (latin-1
# encoded, as in MovieLens' u.item).
rid_to_name = {}
name_to_rid = {}
with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
    for line in f:
        line = line.split('|')
        rid_to_name[line[0]] = line[1]
        name_to_rid[line[1]] = line[0]
return rid_to_name, name_to_rid

rid_to_name, name_to_rid = read_item_names()

# Item-based collaborative filtering.
transet = data.build_full_trainset()
algo = KNNBasic(sim_options={'name': 'pearson_baseline', 'user_based': False})
# NOTE(review): train() is the deprecated pre-1.0 surprise API (now fit()) —
# confirm the installed version still provides it.
algo.train(transet)

# Look up the 10 movies most similar to 'Now and Then (1995)'.
toy_story_raw_id = name_to_rid['Now and Then (1995)']
toy_story_raw_id  # bare expression — notebook-cell residue
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
toy_story_inner_id
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
toy_story_neighbors

# Map the 10 neighbor inner ids back to movie names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid] for rid in toy_story_neighbors)
for movie in toy_story_neighbors:
    print(movie)
# Train and evaluate KNNBasic on the (user_pseudo_id, interest, rating) frame,
# then print the 5 items nearest to one specific listing.
data = Dataset.load_from_df(final[['user_pseudo_id', 'interest', 'rating']],
                            reader)
print('Using KNNBasic')
# ALS baseline options; note KNNBasic with the default MSD similarity does
# not use baselines, so these only matter if a baseline-based similarity is
# configured.
bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}

trainset, testset = train_test_split(data, test_size=0.25)
algo = KNNBasic(bsl_options=bsl_options)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

# Each Prediction is (uid, iid, r_ui, est, details).
result = pd.DataFrame(predictions, columns=[
    'visitor_id', 'item_id', 'base_event', 'predict_event', 'details'
])
result.drop(columns={'details'}, inplace=True)
result['erro'] = abs(result['base_event'] - result['predict_event'])
print(result.head())

tuzlaId = algo.trainset.to_inner_iid('Satılik_İstanbul_Tuzla_İçmeler')
# BUG FIX: to_inner_iid returns an int, so `"..." + tuzlaId` raised
# TypeError; format the value instead of concatenating.
print("Satılik_İstanbul_Tuzla_İçmeler : {}".format(tuzlaId))
tuzla_neighbors = algo.get_neighbors(tuzlaId, k=5)
raw_neighbor_ids = (algo.trainset.to_raw_iid(inner_id)
                    for inner_id in tuzla_neighbors)
for n in raw_neighbor_ids:
    print(n)
# Fit a user-based pearson_baseline KNN on (Name, Title, Value) ratings and,
# for every reader, report their most similar neighbors and the number of
# books they have in common.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['Name', 'Title', 'Value']], reader)
train_set = data.build_full_trainset()
# `k` and `shrinkage_parameter` are defined outside this chunk — presumably
# the neighborhood size and similarity shrinkage; TODO confirm.
algo = KNNBasic(k=k, verbose=True,
                sim_options={
                    'name': 'pearson_baseline',
                    'shrinkage': shrinkage_parameter
                })
algo.fit(train_set)
names = df.Name.sort_values().unique()
for name in names:
    user_inner_id = train_set.to_inner_uid(name)
    neighbors = algo.get_neighbors(user_inner_id, k)
    print("\n{}:".format(name))
    print(" {:<12}Similarity (%)\tBooks in Common".format("Name"))
    # Neighbors come back ordered by decreasing similarity.
    for i in range(len(neighbors)):
        neighbor_name = train_set.to_raw_uid(neighbors[i])
        similarity = algo.sim[user_inner_id, neighbors[i]]
        if similarity < 0.0001:  # stop reporting neighbors with neglible similarity
            break
        user_books = set(df[df.Name == name].Title)
        neighbor_books = set(df[df.Name == neighbor_name].Title)
        books_in_common = len(user_books.intersection(neighbor_books))
        # NOTE(review): this statement is truncated in the visible source —
        # the format call continues past the end of this chunk.
        print("{}: {:<9}\t{:0.2f}\t\t\t{}".format(i + 1, neighbor_name,
                                                  similarity * 100,
# Inspect trainset dimensions (bare expressions — notebook-cell residue).
trainset.n_users
trainset.n_items
# Train the collaborative-filtering model; user-based CF here.
# NOTE(review): train() is the deprecated pre-1.0 surprise API (now fit()) —
# confirm the installed version still provides it.
algo=KNNBasic()
algo.train(trainset)
# Compute the 10 nearest-neighbor playlists of playlist #39.
current_playlist_name=list(name_id_dic.keys())[39]
print(current_playlist_name)
current_playlist_id=name_id_dic[current_playlist_name]
print(current_playlist_id)
# Map the playlist id to the internal user id.
playlist_inner_id=algo.trainset.to_inner_uid(current_playlist_id)
# Fetch the 10 nearest neighbors.
playlist_neighbors=algo.get_neighbors(playlist_inner_id,k=10)
# Map the neighbors back to their raw ids.
playlist_neighbors=[algo.trainset.to_raw_uid(inner_id) for inner_id in playlist_neighbors]
# Map playlist ids back to playlist names.
playlist_neighbors=[id_name_dic[id] for id in playlist_neighbors]
playlist_neighbors
# Load the song-id => song-name mapping file.
song_id_name_dic=pickle.load(open("C:\\Users\\T\\Desktop\\python视频\\song.pkl",'rb'))
# Build the song-name => song-id mapping.
song_name_id_dic={}
for song_id in song_id_name_dic:
    song_name_id_dic[song_id_name_dic[song_id]]=song_id
# Recommend for a user; user #4 is chosen below (past this chunk).
# Print the neighbor playlists found earlier, then cross-validate an
# item-based KNN on the NetEase playlist data and query item neighbors.
for playlist_name in playlist_neighbors_name:
    print(playlist_name, name_id_dic[playlist_name])

playlist_recommend_main()

file_path = os.path.expanduser('neteasy_playlist_recommend_data.csv')
# Declare the file format.
reader = Reader(line_format='user item rating timestamp', sep=',')
# Load the data from file.
music_data = Dataset.load_from_file(file_path, reader=reader)
# 5-fold evaluation via cross_validate (replaces the old split(n_folds=5)).
# user-CF is the default; item-CF = KNNBasic(sim_options={"user_based": False}).
algo = KNNBasic(sim_options={"user_based": False})
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5,
                      verbose=True)
print(perf)
# Regression metrics reported: RMSE (root mean squared error) and MAE
# (mean absolute error).

# BUG FIX: cross_validate only fits fold-local clones, so `algo` itself was
# never trained and had no `trainset` attribute — the get_neighbors call
# below crashed.  Fit on the full trainset before querying neighbors.
algo.fit(music_data.build_full_trainset())
print(algo.get_neighbors(algo.trainset.to_inner_iid("424262401"), 3))
def run_rec(dataset, num_rec=20):
    """Item-mean-center the ml-100k sample ratings, fit an item-item cosine
    KNN, and print the raw ids of the `num_rec` items nearest to item 1.

    Args:
        dataset: unused (ratings are read from ml-100k/sample.txt); kept for
            interface compatibility with existing callers.
        num_rec (int): neighborhood size to report.
    """
    ratings = pd.read_csv(
        'ml-100k/sample.txt',
        sep=' ',
        names=['user_id', 'item_id', 'rating', 'unix_timestamp'])

    train_data = ratings.to_numpy()
    n_rows, n_cols = train_data.shape
    # Object array so the rating column can hold centered floats while the
    # id/timestamp columns keep their original values.
    normalized_data = np.ndarray((n_rows, n_cols), dtype=object)
    for r in range(n_rows):
        normalized_data[r, 0] = train_data[r, 0]
        normalized_data[r, 1] = train_data[r, 1]
        normalized_data[r, 2] = float(train_data[r, 2])
        normalized_data[r, 3] = train_data[r, 3]

    # Item-mean centering: subtract each item's mean rating from its rows.
    items = train_data[:, 1]
    n_items = int(np.max(train_data[:, 1]))
    mean_rating_matrix = np.zeros((n_items + 1, ))
    for i in range(1, n_items + 1):
        indices = np.where(items == i)[0].astype(np.int32)
        temp_ratings = train_data[indices, 2]
        mean_rating_matrix[i] = np.mean(
            temp_ratings) if indices.size > 0 else 0
        normalized_data[indices, 2] = temp_ratings - mean_rating_matrix[i]

    new_ratings = pd.DataFrame(
        normalized_data,
        columns=['user_id', 'item_id', 'rating', 'unix_timestamp'])

    reader = Reader()
    data = Dataset.load_from_df(new_ratings[['user_id', 'item_id', 'rating']],
                                reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'cosine', 'user_based': False}
    algo = KNNBasic(sim_options=sim_options)
    algo.fit(trainset)

    print(algo.sim)
    item_raw_id = 1
    item_inner_id = algo.trainset.to_inner_iid(item_raw_id)
    item_neighbors_inner_ids = algo.get_neighbors(item_inner_id, k=num_rec)
    item_neighbors_raw_ids = (algo.trainset.to_raw_iid(inner_id)
                              for inner_id in item_neighbors_inner_ids)
    print('Start')
    for raw_id in item_neighbors_raw_ids:
        print(raw_id)
    print('Done')