def main():
    """Recommend apps similar to the app named on the command line.

    Trains an item-based MSD KNNBaseline model on the ratings file,
    prints the 5 nearest-neighbour apps for ``argv[1]``, then lists the
    other apps that share the input app's category.
    """
    data = get_data('app_clean.csv', 'user item rating timestamp')
    trainset = data.build_full_trainset()
    sim_options = {'name': 'msd', 'user_based': False}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)

    # Get the app name from the command line and convert it to an inner id.
    app_input = argv[1].strip()
    rid_to_name, name_to_rid = read_item_names()
    inner_id = algo.trainset.to_inner_iid(name_to_rid[app_input])

    # Get neighbours and convert them back to app names.
    neighbors = algo.get_neighbors(inner_id, k=5)
    neighbors = (algo.trainset.to_raw_iid(inner_ids) for inner_ids in neighbors)
    neighbors = (rid_to_name[rid] for rid in neighbors)

    # FIX: converted Python-2 print statements to Python-3 print() calls,
    # consistent with the rest of the file.
    print(get_user_recommendations())

    # Print neighbours.
    print("Apps that are available in the app directory similar to",
          rid_to_name[algo.trainset.to_raw_iid(inner_id)])
    for app in neighbors:
        print(app)

    app_to_category, category_to_app, all_categories = get_categories()
    if app_input in app_to_category:
        category = app_to_category[app_input]
        print("Other apps in this category:", category)
        print(category_to_app[category])
def knn_centered(self):
    """Item-based KNNBaseline recommendations for ``self.item_evaluated``.

    Returns a dict mapping 1-based rank to the raw ids of the
    ``self.number_recommendations`` nearest-neighbour items, printing
    each one as it is emitted.
    """
    print("calculating knn centered... File Rating: " + self.file_path)
    print("calculating knn centered... Item to Evaluate: " + self.item_evaluated)
    print("calculating knn centered... Number of recommendations: " + str(self.number_recommendations))

    # Parse the ratings file (item/user/rating columns, header skipped).
    loader = Reader(line_format='item user rating', sep=self.delimiter, skip_lines=1)
    dataset = Dataset.load_from_file(self.file_path, reader=loader)
    full_trainset = dataset.build_full_trainset()

    # Item-based pearson-baseline similarity.
    algo = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': False})
    algo.fit(full_trainset)

    inner_id = algo.trainset.to_inner_iid(self.item_evaluated)
    raw_neighbors = [
        algo.trainset.to_raw_iid(nid)
        for nid in algo.get_neighbors(inner_id, k=int(self.number_recommendations))
    ]

    print("\nTransition Component Based Ratings >> Recommended items by KNN Centered:")
    dictionary_neighbors = {}
    for rank, item in enumerate(raw_neighbors, start=1):
        dictionary_neighbors[rank] = item
        print("- " + item)
    return dictionary_neighbors
def kNNbAlgo(product_ID, nRecos=3):
    """
    Adapted from class tutorial, uses the KNN Baseline algorithm of the
    Surprise library for recommendation. Data must follow an explicit
    format, per below; it is recast using R into "finalMaster.csv".

    Returns a list of `nRecos` recommended products (raw product ids,
    default 3) for the required `product_ID` parameter.
    """
    data = Dataset.load_from_file(
        'backend/finalMaster.csv',
        reader=Reader(line_format='user item rating', sep=','))

    # Item-based pearson-baseline KNN on the full dataset.
    model = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': False})
    model.fit(data.build_full_trainset())

    inner = model.trainset.to_inner_iid(str(product_ID))
    return [model.trainset.to_raw_iid(nid) for nid in model.get_neighbors(inner, k=nRecos)]
def knn_cosine(dataset_path, target, value):
    """User-based cosine KNN over the ratings file at `dataset_path`.

    `target` is used directly as an inner user id (assumes the caller
    already has an inner id — TODO confirm); `value` is the neighbour
    count k. Returns a generator of ids mapped through the file's
    item->user column mapping, as in the original.
    """
    file_path = dataset_path
    reader = Reader(line_format='user item rating', rating_scale=(1, 7), sep='\t')
    data = Dataset.load_from_file(file_path, reader)

    # Construct a training set using the entire dataset (no folds).
    trainset = data.build_full_trainset()

    # Parameters needed to create the rating matrix.
    user_to_item = {}
    item_to_user = {}

    # Train the algorithm to compute the similarities between users.
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNBaseline(sim_options=sim_options)
    # FIX: train() is deprecated in Surprise; fit() is the replacement.
    algo.fit(trainset)

    # FIX: the file was opened and never closed; `with` guarantees closure.
    with open(file_path, "r") as file:
        # Read the mappings user <-> item.
        for line in file:
            line = line.split('\t')
            user_to_item[line[0]] = line[1]
            item_to_user[line[1]] = line[0]

    # Retrieve the neighbours and map them back to raw ids.
    target_neighbors = algo.get_neighbors(target, k=value)
    target_neighbors = (algo.trainset.to_raw_uid(inner_id) for inner_id in target_neighbors)
    target_neighbors = (item_to_user[rid] for rid in target_neighbors)
    return target_neighbors
def get_nearest_neighbors(user_id):
    """Print and return the raw ids of the 10 users most similar to `user_id`.

    Trains a user-based pearson-baseline KNNBaseline model on the full
    dataset at ~/src/project/outward.csv.
    """
    # Path to the dataset file.
    dataset_file = os.path.expanduser('~/src/project/outward.csv')

    # Reader + dataset for our comma-separated ratings.
    loader = Reader(sep=',')
    ratings = Dataset.load_from_file(dataset_file, reader=loader)

    # Train on the whole dataset.
    full_trainset = ratings.build_full_trainset()
    model = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': True})
    model.fit(full_trainset)

    # Raw user id -> inner id used by the similarity matrix.
    inner_uid = model.trainset.to_inner_uid(str(user_id))

    print()
    print(f'The 10 nearest neighbors of {user_id} are:')
    neighbors_lst = []
    for inner_neighbor in model.get_neighbors(inner_uid, k=10):
        raw_neighbor = model.trainset.to_raw_uid(inner_neighbor)
        print(raw_neighbor)
        neighbors_lst.append(raw_neighbor)
    return neighbors_lst
def get_similar_users_recommendations(uid, n=10):
    """Recommend items rated '5' by the 10 users most similar to `uid`.

    Uses the module-level `data`, `data_df`, `item_dict` and
    `user_based_sim_option`. Collects item names until more than `n`
    have been gathered, prints each one, shows the user's own ratings,
    and returns the set of recommendations.
    """
    # Train on the entire dataset.
    trainset = data.build_full_trainset()
    # BUG FIX: the keyword is `sim_options`; the original `sim_option=` was
    # silently swallowed by **kwargs so the default MSD similarity was used.
    algo = KNNBaseline(sim_options=user_based_sim_option)
    algo.fit(trainset)
    inner_id = algo.trainset.to_inner_uid(uid)

    # Get the 10 most similar users via get_neighbors.
    neighbors = algo.get_neighbors(inner_id, k=10)
    neighbors_uid = (algo.trainset.to_raw_uid(x) for x in neighbors)

    recommendations = set()
    # Add movies the neighbours rated 5 to the recommendation list.
    for user in neighbors_uid:
        if len(recommendations) > n:
            break
        item = data_df[data_df['user'] == user]
        item = item[item['rating'] == '5']['item']
        for i in item:
            print(item_dict[i])
            recommendations.add(item_dict[i])
    # FIX: removed a leftover keyboard-mash debug print.
    get_my_ratings(uid)
    return recommendations
def knn_experiment(movie_title):
    """Print the titles of the 10 movies most similar to `movie_title`.

    Resolves the raw movie id by title prefix in item.csv, trains an
    item-based pearson-baseline KNNBaseline on ml-100k, and prints the
    titles of the 10 nearest-neighbour movies (prints 'Movie not Found'
    and returns None when the title does not match anything).
    """
    movie_items = pd.read_csv('item.csv')[['movie_id', 'title']].set_index('title').dropna()
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)

    # movie_id and title mapping (prefix match on the title index).
    row = movie_items.index.str.startswith(movie_title)
    try:
        raw_id = str(movie_items[row]['movie_id'].values[0])
    except IndexError:
        # BUG FIX: was a bare `except:`; only "no matching row" (empty
        # .values) is an expected failure here.
        print('Movie not Found')
        return

    # Getting KNN inner id of the provided movie.
    inner_id = algo.trainset.to_inner_iid(raw_id)

    # Top 10 neighbours; a set makes the per-row membership test below O(1).
    neighbors = algo.get_neighbors(inner_id, k=10)
    neighbors_ids = {algo.trainset.to_raw_iid(nid) for nid in neighbors}
    for x in movie_items['movie_id']:
        if str(x) in neighbors_ids:
            print(movie_items[movie_items['movie_id'] == x].index.values[0])
def knn_basic(self):
    """Item-neighbour KNNBaseline recommendations for ``self.item_evaluated``.

    Returns a dict mapping 1-based rank to the raw ids of the
    ``self.number_recommendations`` most similar items.
    """
    print("calculating knn basic... File Rating: " + self.file_path)
    print("calculating knn basic... Item to Evaluate: " + self.item_evaluated)
    print("calculating knn basic... Number of recommendations: " + str(self.number_recommendations))
    # Reader
    reader = Reader(line_format='item user rating', sep=self.delimiter, skip_lines=1)
    # Dataset
    data = Dataset.load_from_file(self.file_path, reader=reader)
    trainset = data.build_full_trainset()
    # BUG FIX: neighbours are looked up by an *item* inner id below, so the
    # similarity matrix must be item-based. With 'user_based': True the
    # inner id was interpreted against the user similarity matrix (the
    # sibling knn_centered correctly uses False).
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)
    item_inner_id = algo.trainset.to_inner_iid(self.item_evaluated)
    item_neighbors = algo.get_neighbors(item_inner_id, k=int(self.number_recommendations))
    item_neighbors = (algo.trainset.to_raw_iid(inner_id) for inner_id in item_neighbors)
    dictionary_neighbors = {}
    i = 0
    for item in item_neighbors:
        i += 1
        dictionary_neighbors[i] = item
    return dictionary_neighbors
def findSimilarItem(movieName):
    """Return the names of the 10 movies most similar to `movieName`.

    Trains a KNNBaseline (module-level `sim_options`) on the small
    MovieLens dataset and maps neighbour inner ids back to titles via
    the module-level `ml` mappings.
    """
    movieLens = MovieLens()
    data = movieLens.loadMovieLensLatestSmall()

    # Module-level mappings between titles and raw movie ids.
    name_to_movieID = ml.name_to_movieID
    movieID_to_name = ml.movieID_to_name

    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(data.build_full_trainset())

    # Title -> raw id -> inner id used by the similarity matrix.
    inner_iid = algo.trainset.to_inner_iid(name_to_movieID[movieName])

    # Retrieve the nearest neighbours and convert their ids into names.
    moviesArray = [
        movieID_to_name[algo.trainset.to_raw_iid(nid)]
        for nid in algo.get_neighbors(inner_iid, k=10)
    ]
    return moviesArray
def get_similar_users_recommendations(uid, n=10):
    """Print up to 10 items rated '5' by the 10 users most similar to `uid`.

    Uses the module-level `data`, `data_df`, `item_dict` and
    `user_based_sim_option`.
    """
    # Train on the entire dataset.
    trainset = data.build_full_trainset()
    # print(trainset.ur)
    # BUG FIX: the keyword is `sim_options`; the original `sim_option=` was
    # silently swallowed by **kwargs so the default MSD similarity was used.
    algo = KNNBaseline(sim_options=user_based_sim_option)
    # Fit the trainset.
    algo.fit(trainset)
    # BUG FIX: get_neighbors() expects an *inner* id; the raw uid was being
    # passed directly (the sibling implementation converts with
    # to_inner_uid first).
    inner_id = algo.trainset.to_inner_uid(uid)
    # Get the 10 most similar users.
    neighbors = algo.get_neighbors(inner_id, k=10)
    neighbors_uid = (algo.trainset.to_raw_uid(x) for x in neighbors)
    recommendations = set()
    # Add movies the neighbours rated 5 to the recommendation list.
    for user in neighbors_uid:
        if len(recommendations) > n:
            break
        item = data_df[data_df['user'] == str(user)]
        item = item[item['rating'] == '5']['item']
        for i in item:
            recommendations.add(item_dict[i])
    for i, j in enumerate(list(recommendations)):
        if i >= 10:
            break
        print(j)
def get_similar_items(iid, n=10):
    """Return the names of the `n` items most similar to raw item id `iid`.

    Uses the module-level `data`, `item_dict` and `item_based_sim_option`.
    """
    trainset = data.build_full_trainset()
    # BUG FIX: the keyword is `sim_options`; the original `sim_option=` was
    # silently swallowed by **kwargs so the default MSD similarity was used.
    algo = KNNBaseline(sim_options=item_based_sim_option)
    algo.fit(trainset)
    inner_id = algo.trainset.to_inner_iid(iid)
    # Get the n most similar items via get_neighbors.
    neighbors = algo.get_neighbors(inner_id, k=n)
    neighbors_iid = (algo.trainset.to_raw_iid(x) for x in neighbors)
    recommendations = [item_dict[x] for x in neighbors_iid]
    return recommendations
class Recmodel(object):
    """Wrapper around a Surprise recommender (KNNBaseline or NMF).

    Loads `user item rating timestamp` CSV data, trains the chosen
    algorithm, and prints the 10 playlists nearest to a given index.
    """

    def __init__(self, algo='knn_baseline', filepath=None):
        # BUG FIX: with the default filepath=None, os.path.exists(None)
        # raised TypeError instead of the intended FileNotFoundError.
        if filepath is None or not os.path.exists(filepath):
            raise FileNotFoundError("{} not exist".format(filepath))
        self.filepath = filepath
        if algo == 'nmf':
            self.algo = NMF()
            self.model_name = 'nmf'
        else:
            # Any other value falls back to the default KNNBaseline.
            self.algo = KNNBaseline()
            self.model_name = 'knn_baseline'
        self.convertor = DataConvertHelper()

    def buildDataSet(self):
        """Load the ratings file and build the full trainset."""
        reader = Reader(line_format='user item rating timestamp', sep=',')
        music_data = Dataset.load_from_file(file_path=self.filepath, reader=reader)
        self.trainset = music_data.build_full_trainset()

    def train(self):
        """Fit the wrapped algorithm on the trainset built by buildDataSet()."""
        print("begin training...")
        self.algo.fit(self.trainset)

    def evaluate(self, index):
        """Print the 10 playlists most similar to the playlist at `index`."""
        current_playlist_name = self.convertor.get_name_by_index(index)
        print('当前歌单:{}'.format(current_playlist_name))
        current_playlist_rid = self.convertor.get_rid_by_name(
            current_playlist_name)
        print("当前歌单rid: {}".format(current_playlist_rid))
        # Raw id -> inner (trainset) user id.
        playlist_inner_id = self.algo.trainset.to_inner_uid(
            current_playlist_rid)
        print('歌单inid', playlist_inner_id)
        playlist_neighbors_inner_ids = self.algo.get_neighbors(
            playlist_inner_id, k=10)
        # Inner ids -> raw ids -> playlist names.
        playlist_neighbors_rids = (
            self.algo.trainset.to_raw_uid(inner_id)
            for inner_id in playlist_neighbors_inner_ids)
        playlist_neighbors_names = (self.convertor.get_name_by_rid(rid)
                                    for rid in playlist_neighbors_rids)
        print('歌单 《', current_playlist_name, '》 最接近的10个歌单为:')
        for playlist_name in playlist_neighbors_names:
            print(
                playlist_name,
                self.algo.trainset.to_inner_uid(
                    self.convertor.get_rid_by_name(playlist_name)))
def recommend_friends(request):
    """For every user in the Rate table, find the 5 most similar users
    (user-based pearson-baseline KNN) and persist them as Candidates2
    friend recommendations; renders a completion page when done.
    """
    # Pull all Rate rows into a DataFrame by compiling the ORM queryset
    # to raw SQL and reading it back through pandas.
    queryset = Rate.objects.all()
    query, params = queryset.query.as_sql(
        compiler='django.db.backends.sqlite3.compiler.SQLCompiler',
        connection=connections['default'])
    df = pd.read_sql_query(query, con=connections['default'], params=params)
    print("load data")
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'item_id', 'rate']], reader)
    trainset = data.build_full_trainset()
    # No 'user_based' key: Surprise's default (user-based) similarity is used.
    sim_options = {'name': 'pearson_baseline'}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)
    for given_user_id in set(df['user_id']):
        print(given_user_id)
        given_user_id = int(given_user_id)
        # NOTE(review): _from is looked up by profile_id but _to (below) by
        # user_id — confirm this asymmetry is intended.
        _from = get_object_or_404(Profile, profile_id=given_user_id)
        inner_id = algo.trainset.to_inner_uid(given_user_id)
        # to_inner_uid(), to_inner_iid(), to_raw_uid(), and to_raw_iid()
        neighbors = algo.get_neighbors(inner_id, k=5)
        results = [
            algo.trainset.to_raw_uid(inner_user_id)
            for inner_user_id in neighbors
        ]
        print('The 5 nearest neighbors of Given User Id:')
        for raw_user_id in results:
            _to = get_object_or_404(Profile, user_id=int(raw_user_id))
            # print(raw_user_id,Candidates2.objects.filter(user_from=user_from,user_to=user_to))
            if Candidates2.objects.filter(user_from=_from):
                if Candidates2.objects.filter(user_from=_from, user_to=_to):
                    # Pair already recorded — nothing to do.
                    print("user from , to 다 일치")
                    pass
                else:
                    # A row exists for _from: just attach the new _to.
                    cand = Candidates2.objects.get(user_from=_from)
                    cand.user_to.add(_to)
                    print("user from만 일치, to 추가")
            else:
                # First recommendation for _from: create the row.
                cand = Candidates2.objects.create()
                cand.user_from.add(_from)
                cand.user_to.add(_to)
        print("해당 유저 %s 에 대한 데이터 저장완료" % given_user_id)
    return render(request, "recommend_completed.html")
def get(self, item_id):
    """Return (as JSON) the 10 KNN neighbours of story `item_id`,
    re-ranked by TF-IDF cosine similarity of story titles.
    """
    # SQL query
    conn = mysql.connect()
    cursor = conn.cursor()

    # STEP 1 : KNN WITH MSD
    # (NOTE(review): the comment says MSD but pearson_baseline is used below.)
    df = pd.read_sql_query("SELECT * FROM story_reviews", conn)

    # Data and Model
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    model = KNNBaseline(sim_options=sim_options)

    # Training
    training_set = data.build_full_trainset()
    model.fit(training_set)
    item_inner_id = model.trainset.to_inner_iid(item_id)
    item_neighbors_inner = model.get_neighbors(item_inner_id, k=10)
    item_neighbors = [model.trainset.to_raw_iid(inner_id) for inner_id in item_neighbors_inner]

    # STEP 2 : CASCADE IT WITH TF-IDF
    df_stories = pd.read_sql_query("SELECT * FROM stories", conn)

    # Model: word 1-3 grams over story titles.
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tf_idf_matrix = tf.fit_transform(df_stories['title'])
    cosine_similarities = cosine_similarity(tf_idf_matrix, tf_idf_matrix)

    # Retrieve similar items.
    # NOTE(review): the item_id-1 / neighbor-1 indexing assumes story ids are
    # contiguous 1-based integers aligned with df_stories row order — confirm
    # against the stories table.
    cosine_similarities_row = cosine_similarities[item_id-1]
    recommendations_list = []
    n = 10
    for i in range(n):
        recommendations_list.append((item_neighbors[i], cosine_similarities_row[item_neighbors[i]-1]))
    # Sort neighbours by descending title similarity.
    recommendations_list.sort(key=lambda x:x[1], reverse=True)
    formatted_recommendations_list = [item[0] for item in recommendations_list]

    # Return K Nearest Neighbors
    return jsonify(recommendations = formatted_recommendations_list)
def compute_user_neighbors(id_name_dic, name_id_dic, trainset):
    """Train a default KNNBaseline on `trainset` and print the 10 users
    nearest to the second entry of `name_id_dic` (names resolved via
    `id_name_dic`).
    """
    algo = KNNBaseline()
    algo.fit(trainset)
    # BUG FIX: dict.keys() is not subscriptable in Python 3; it must be
    # materialized into a list first.
    user_name = list(name_id_dic.keys())[1]
    print("user_name: ", user_name)
    user_id = name_id_dic[user_name]
    print("user_id: ", user_id)
    user_inner_id = algo.trainset.to_inner_uid(user_id)
    print("内部id: ", user_inner_id)
    user_neighbors = algo.get_neighbors(user_inner_id, k=10)
    user_neighbors = (algo.trainset.to_raw_uid(inner_id) for inner_id in user_neighbors)
    # FIX: renamed the loop variable so it no longer shadows `user_id` above.
    user_neighbors = (id_name_dic[raw_uid] for raw_uid in user_neighbors)
    print()
    print("和user 《", user_name, "》 最接近的10个user为:")
    for user in user_neighbors:
        print(algo.trainset.to_inner_uid(name_id_dic[user]), user)
def compute_movie_neighbors(id_name_dic, name_id_dic, trainset):
    """Train an item-based KNNBaseline on `trainset` and print the 10
    movies closest to the hard-coded title "古墓迷途2".
    """
    algo = KNNBaseline(sim_options={'user_based': False})
    algo.fit(trainset)
    #movie_name = name_id_dic.keys()[1]
    movie_name = "古墓迷途2"
    print("movie_name: ", movie_name)
    movie_id = name_id_dic[movie_name]
    print("movie_id: ", movie_id)
    # Raw id -> inner id used by the similarity matrix.
    inner_iid = algo.trainset.to_inner_iid(movie_id)
    print("内部id: ", inner_iid)
    # Neighbour inner ids -> raw ids -> movie names.
    neighbor_names = [
        id_name_dic[algo.trainset.to_raw_iid(nid)]
        for nid in algo.get_neighbors(inner_iid, k=10)
    ]
    print()
    print("和movie 《", movie_name, "》 最接近的10个movie为:")
    for movie in neighbor_names:
        print(algo.trainset.to_inner_iid(name_id_dic[movie]), movie)
class NearestRecipesBaseline:
    """Item-based KNNBaseline over recipe reviews.

    fit() trains on a reviews table; predict() returns the raw ids of
    the k recipes nearest to a given recipe id.
    """

    def __init__(self, k=40):
        options = {
            'name': 'pearson_baseline',
            'user_based': False
        }
        self.model = KNNBaseline(k=k, sim_options=options, verbose=False)

    def fit(self, reviews):
        """Train on `reviews`; Surprise accepts only a pandas DataFrame here."""
        frame = DataFrame(reviews)
        dataset = Dataset.load_from_df(frame, Reader(rating_scale=(1, 5)))
        full_trainset = dataset.build_full_trainset()
        return self.model.fit(full_trainset)

    def predict(self, recipe_id, k=20):
        """Return the raw ids of the k recipes most similar to `recipe_id`."""
        inner = self.model.trainset.to_inner_iid(recipe_id)
        neighbours = self.model.get_neighbors(inner, k=k)
        return [self.model.trainset.to_raw_iid(n) for n in neighbours]
def start(goods='951'):
    """Print the raw ids of the 10 items most similar to item `goods`.

    Trains an item-based pearson-baseline KNNBaseline on the 'SampleData'
    ratings file (user item rating timestamp, comma-separated).
    """
    file_path = os.path.expanduser('SampleData')
    # Specify the file format.
    reader = Reader(line_format='user item rating timestamp', sep=',')
    # Load the data from the file.
    data = Dataset.load_from_file(file_path, reader=reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    # Use the KNNBaseline algorithm.
    algo = KNNBaseline(sim_options=sim_options)
    # BUG FIX: train() is deprecated in Surprise; fit() is the replacement.
    algo.fit(trainset)
    iid_innerid = algo.trainset.to_inner_iid(goods)
    iid_neighbors = algo.get_neighbors(iid_innerid, k=10)
    iid_neighbors = (algo.trainset.to_raw_iid(inner_id) for inner_id in iid_neighbors)
    print('The 10 nearest neighbors of %s:' % goods)
    for iid in iid_neighbors:
        print(iid)
def get(self, algorithm, item_id):
    """Return (as JSON) the 10 nearest-neighbour story ids for `item_id`,
    using the similarity measure named by `algorithm`; unknown names fall
    back to pearson_baseline.
    """
    # SQL query
    conn = mysql.connect()
    cursor = conn.cursor()
    df = pd.read_sql_query("SELECT * FROM story_reviews", conn)

    # Data and Model
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)

    # Dispatch table replaces the if/elif chain.
    known_options = {
        'pearson': {'name': 'pearson', 'user_based': False},
        'cosine': {'name': 'cosine', 'user_based': False},
        'pearsonbaseline': {'name': 'pearson_baseline', 'user_based': False},
        'msd': {'name': 'msd', 'user_based': False},
    }
    sim_options = known_options.get(
        algorithm, {'name': 'pearson_baseline', 'user_based': False})
    model = KNNBaseline(sim_options=sim_options)

    # Training
    model.fit(data.build_full_trainset())

    # TESTING
    # cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    inner_iid = model.trainset.to_inner_iid(item_id)
    neighbor_ids = [
        model.trainset.to_raw_iid(nid)
        for nid in model.get_neighbors(inner_iid, k=10)
    ]

    # Return K Nearest Neighbors
    return jsonify(recommendations=neighbor_ids)
# Computing the pearson_baseline similarity matrix...
# Done computing similarity matrix.
# UserWarning: train() is deprecated. Use fit() instead
warnings.warn('train() is deprecated. Use fit() instead', UserWarning)
# algo.train(trainset)
algo.fit(trainset)

rid_to_name, name_to_rid = read_item_names()

origin_film_name = 'GoldenEye (1995)'
origin_raw_id = name_to_rid[origin_film_name]
# print(origin_raw_id)
# The algorithm's inner item id is not necessarily the id in the file.
origin_inner_id = algo.trainset.to_inner_iid(origin_raw_id)
# print(origin_inner_id)

# Find the K = 10 nearest neighbours.
K = 10
neighbor_inner_ids = algo.get_neighbors(origin_inner_id, k=K)
print(neighbor_inner_ids)

# Map the neighbour inner ids back to movie names.
neighbor_names = (rid_to_name[algo.trainset.to_raw_iid(nid)]
                  for nid in neighbor_inner_ids)
print('The ' + str(K) + ' nearest neighbors of ' + origin_film_name + ' are:')
for movie in neighbor_names:
    print(movie)
return rid_to_name, name_to_rid # First, train the algortihm to compute the similarities between items data = Dataset.load_builtin('ml-100k') trainset = data.build_full_trainset() sim_options = {'name': 'pearson_baseline', 'user_based': False} algo = KNNBaseline(sim_options=sim_options) algo.fit(trainset) # Read the mappings raw id <-> movie name rid_to_name, name_to_rid = read_item_names() # Retrieve inner id of the movie Toy Story toy_story_raw_id = name_to_rid['Toy Story (1995)'] toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id) # Retrieve inner ids of the nearest neighbors of Toy Story. toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10) # Convert inner ids of the neighbors into names. toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id) for inner_id in toy_story_neighbors) toy_story_neighbors = (rid_to_name[rid] for rid in toy_story_neighbors) print() print('The 10 nearest neighbors of Toy Story are:') for movie in toy_story_neighbors: print(movie)
reader = Reader(line_format='user item rating', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the current playlist id written by an earlier step.
# FIX: use `with` so the files are closed even if an exception occurs.
path = os.path.join(BASE_DIR, os.path.dirname(os.path.abspath(__file__)), 'play_id.txt')
with open(path) as f:
    cur_play_id = f.read()
# print(cur_play_id,'===========')

cur_play_inner_id = algo.trainset.to_inner_iid(cur_play_id)
cur_play_neighbors = algo.get_neighbors(cur_play_inner_id, k=6)
# print(cur_play_neighbors)
cur_play_neighbors = (algo.trainset.to_raw_iid(inner_id)
                      for inner_id in cur_play_neighbors)

# Persist the comma-joined raw neighbour ids for the next step.
path = os.path.join(BASE_DIR, os.path.dirname(os.path.abspath(__file__)), 'recmmond_play_result.txt')
with open(path, 'w') as f:
    f.write(','.join(cur_play_neighbors))
# for play_id in cur_play_neighbors:
#     print(play_id, '----------')
# print(trainset.n_users) # print("开始训练模型...") algo = KNNBaseline() algo.fit(trainset) current_palylist = list(name_id_dic.keys())[39] print("歌单名称", current_palylist) # 取出近邻 # 映射名字到id playlist_id = name_id_dic[current_palylist] print("歌单id", playlist_id) # 取出来对应的内部user id => to_inner_uid playlist_inner_id = algo.trainset.to_inner_uid(playlist_id) print("内部id", playlist_inner_id) playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10) # 把歌曲id转换成歌曲名字 # to_raw_uid映射回去 playlist_neighbors = (algo.trainset.to_raw_uid(inner_id) for inner_id in playlist_neighbors) playlist_neighbors = (id_name_dic[playlist_id] for playlist_id in playlist_neighbors) print("之前的啥:", playlist_neighbors) print("和歌单《", current_palylist, "》最接近的10首歌单为:\n") for playlist in playlist_neighbors: print(playlist, algo.trainset.to_inner_uid(name_id_dic[current_palylist])) # 针对用户进行预测 song_id_name_dic = pickle.load(open("popular_song.pkl", "rb"),
# raw_id_toy_story = rid_to_name['1053']
# print(raw_id_toy_story)
train_data = data.build_full_trainset()

# Pearson-baseline similarity, item-based recommendation.
sim_options = {'name': 'pearson_baseline', 'user_based': False}
from surprise import KNNBaseline
knn = KNNBaseline(sim_options=sim_options)
knn.fit(train_data)

rid_to_name, name_to_rid = read_item_names()
toy_story_raw_id = name_to_rid['Now and Then (1995)']
print("数据中的ID:%s" % toy_story_raw_id)

# The inner id is the item's position in the similarity matrix.
toy_story_inner_id = knn.trainset.to_inner_iid(toy_story_raw_id)
print("矩阵中的ID:%s" % toy_story_inner_id)

toy_story_neighbors = knn.get_neighbors(toy_story_inner_id, k=10)
print("最近邻:%s" % toy_story_neighbors)

# Convert the neighbour ids back to movie names.
neighbor_titles = (rid_to_name[knn.trainset.to_raw_iid(nid)]
                   for nid in toy_story_neighbors)
print()
print('推荐的Top 10 :')
for movie in neighbor_titles:
    print(movie)
file_path = 'ratings_android.dat'
reader = Reader(line_format='user item rating', rating_scale=(1, 5), sep=' ')
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': True}
algo = KNNBaseline(sim_options=sim_options)
# BUG FIX: train() is deprecated in Surprise; fit() is the replacement.
algo.fit(trainset)

# Read the mappings raw id <-> movie name
#rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
#toy_story_raw_id = name_to_rid['Toy Story (1995)']
#toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of user with inner id 1.
toy_story_neighbors = algo.get_neighbors(1, k=10)

# Convert inner ids of the neighbors into names.
#toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
#                       for inner_id in toy_story_neighbors)
#toy_story_neighbors = (rid_to_name[rid]
#                       for rid in toy_story_neighbors)
#print()
print('The 10 nearest neighbors of Toy Story are:')
#for movie in toy_story_neighbors:
#    print(movie)
print(toy_story_neighbors)
if int(line[1]) in uid: uid[int(line[1])].update({int(line[0]):int(line[2])}) else: uid[int(line[1])]={int(line[0]):int(line[2])} # print "done!" return uid file_path=os.path.expanduser('~')+"/Downloads/CSC522/toy/sample.data" reader=Reader(line_format='item user rating timestamp',sep=',') data=Dataset.load_from_file(file_path,reader=reader) data.split(n_folds=60) trainset=data.build_full_trainset() sim_options={'name':'pearson_baseline','user_based':True} algo=KNNBaseline(sim_options=sim_options) algo.train(trainset) user_id='911' user_inner_id=algo.trainset.to_inner_uid(user_id) user_neighbors=algo.get_neighbors(user_inner_id,k=22) user_neighbors=(algo.trainset.to_raw_uid(inner_id) for inner_id in user_neighbors) print() print('The 5 nearest neighbors of the userid %s are:'%user_id) for userid in user_neighbors: print(userid) perf=evaluate(algo,data,measures=['RMSE','MAE']) print (perf)
# Map every mentioned genre/game to its trainset inner id.
genre_inner_id_list = [algo_genre_group.trainset.to_inner_iid(genre)
                       for genre in full_genres]
game_inner_id_list = [algo_game_group.trainset.to_inner_iid(game)
                      for game in full_games]

# Get the list of neighbours (k=3) for those genres/games.
genre_neighbors_list = [algo_genre_group.get_neighbors(inner, k=3)
                        for inner in genre_inner_id_list]
game_neighbors_list = [algo_game_group.get_neighbors(inner, k=3)
                       for inner in game_inner_id_list]

# Prioritize the two closest neighbours of each original genre/game mentioned.
genre_final_list = []
for neighbors in genre_neighbors_list:
    genre_final_list.extend(neighbors[:2])

game_final_list = []
# Initialize all weights to one, push them into the (custom) algorithm,
# then fit on the trainset and score the testset.
W = np.ones(s)
algo.weightUpdate(W)
predictions = algo.fit(trainset).test(testset)
# predictions = algo.fit(trainset)

# Dense prediction matrix indexed [user, item].
# NOTE(review): this assumes raw uids/iids are integer-like and within the
# shape `s` — confirm against the dataset. A Surprise prediction tuple is
# (uid, iid, r_ui, est, details), so index 3 is the estimated rating.
PredictM = np.zeros(s)
for it in predictions:
    PredictM[int(it[0]), int(it[1])] = it[3]
print(PredictM)
PM = pd.DataFrame(PredictM)
PM.to_csv("PredictionMatrix.csv")

# print (trainset.all_items())

# K nearest neighbours (inner ids) for every item in the trainset,
# one row per item.
NeighborM = np.zeros([trainset.n_items, K])
for i, item in enumerate(trainset.all_items()):
    NeighborM[i] = algo.get_neighbors(item, k=K)
NM = pd.DataFrame(NeighborM)
NM.to_csv("NeighborMatrix.csv")

# print ("Inner id: ",iterator_neighbors[1],"Raw id:")
# print (algo.trainset.to_raw_uid(1))
# print Neighbor

# u_id:user id i_id:item id
# un_: user number in_: item number
# real rating matrix: r[u_id][i_id]
# estimate : r_h[m][u_id][i_id]
# ###########similarity matrix: (item_based) s[i_id][i_id]
# rating interval:
# weight matrix: W[u_id][i_id]
# k_neighbor: k_neighbor_li
def read_user_names():
    """Read the u.data file from the MovieLens 100-k dataset and return a
    mapping of raw user ids to themselves (one entry per rating line).

    FIX: the old docstring described u.item and two name mappings, but the
    function reads u.data and only ever built/returned the user-id dict.
    """
    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.data'
    user_id = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('\t')
            user_id[line[0]] = line[0]
    return user_id


# First, train the algortihm to compute the similarities between users
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': True, 'min_support': 10}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

user_neighbors = algo.get_neighbors(130, k=10)
# BUG FIX: the similarity is user-based, so the neighbour inner ids are
# *user* ids and must be converted with to_raw_uid, not to_raw_iid.
user_neighbors = (algo.trainset.to_raw_uid(inner_id)
                  for inner_id in user_neighbors)
print('The 10 nearest neighbors of User 130 are:')
for user in user_neighbors:
    print(user)
'C:/Users/xuwei/workspace/Surprise/src/data.txt') reader = Reader(line_format='user item rating', sep='\t') data = Dataset.load_from_file(file_path, reader=reader) trainset = data.build_full_trainset() #使用pearson_baseline方式计算相似度 False以item为基准计算相似度 本例为电影之间的相似度 sim_options = {'name': 'pearson_baseline', 'user_based': False} ##使用KNNBaseline算法 algo = KNNBaseline(sim_options=sim_options) #algo=KNNBaseline() #训练模型 algo.train(trainset) n = 0 for item in range(trainset.n_items): raw_id = trainset.to_raw_iid(item) neighbors = algo.get_neighbors(item, 5) for neighbor in neighbors: neighbor_raw_id = trainset.to_raw_iid(neighbor) sql = "insert into book_similarity_icf(book_id1,book_id2) values('%d','%d');" % ( int(raw_id), int(neighbor_raw_id)) print(sql) cursor.execute(sql) db.commit() n += 1 if (n % 100 == 0): print('----------------') print(n, 'is done') db.close()
def train_knn(self, df, userId, user_m_ids, movies_watched):
    """
    param: df - movies pandas DataFrame
           userId - user ID to predict movies with
           user_m_ids - List of movieIDs of movies to be recommended upon
                        (as seen in TMDB dataset)
           movies_watched - List of movie titles watched (as seen in TMDB
                            dataset)
    return: pandas DataFrame of the recommended movies with attributes -
            title, id, vote_average, vote_count, popularity, release date

    Collaborative filtering is done using KNN-Baseline and prediction is
    done using pearson_baseline. The technique used is item-item based.
    """
    reader = Reader(rating_scale=(1, 5))
    movie_ids = self.get_movie_ids()
    rec_result = dict()
    sim_options = {"name": "pearson_baseline", "user_based": False}
    data = Dataset.load_from_df(df[RATING_ATTR], reader)
    # Reuse a cached model when available; otherwise train and cache it.
    if isfile(PATH_COLL_FILTERING_CACHE):
        model = joblib.load(PATH_COLL_FILTERING_CACHE)
    else:
        trainset = data.build_full_trainset()
        model = KNNBaseline(sim_options=sim_options)
        model.fit(trainset)
        joblib.dump(model, PATH_COLL_FILTERING_CACHE)
    # Neighbours of the first seed movie (inner-id space, k=10).
    inn_id = model.trainset.to_inner_iid(user_m_ids[0])
    # print(self.get_movie_title(self.get_tmdb_id(user_m_ids[0])))
    inn_id_neigh = model.get_neighbors(inn_id, k=10)
    # print(inn_id_neigh)
    df_pref = pd.DataFrame(columns=[
        "title",
        "id",
        "vote_average",
        "vote_count",
        "popularity",
        "release_date",
    ])
    index = 0
    for m_id in inn_id_neigh:
        title_df = self.get_movie_title(
            self.get_tmdb_id(model.trainset.to_raw_iid(m_id)))
        try:
            if title_df[0] not in movies_watched:
                df_pref.loc[index] = array([
                    title_df[0],
                    title_df[1],
                    title_df[2],
                    title_df[3],
                    title_df[4],
                    title_df[5],
                ])
                index += 1
        except (IndexError, KeyError, TypeError):
            # BUG FIX: was a bare `except: pass`, which swallowed every
            # error (even KeyboardInterrupt). Only failed metadata lookups
            # are expected and safely skippable here.
            pass
    return df_pref