def main():
    """Recommend apps similar to the app named by the first CLI argument.

    Trains an item-based KNNBaseline model on the full ratings file,
    prints the 5 nearest-neighbour apps, then lists other apps that share
    the input app's category.
    """
    data = get_data('app_clean.csv', 'user item rating timestamp')
    trainset = data.build_full_trainset()

    # Item-based similarity using mean squared difference.
    sim_options = {'name': 'msd', 'user_based': False}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)

    # Map the app name from the command line to the model's inner item id.
    app_input = argv[1].strip()
    rid_to_name, name_to_rid = read_item_names()
    inner_id = algo.trainset.to_inner_iid(name_to_rid[app_input])
    # Get neighbours and convert them back to app names.
    neighbors = algo.get_neighbors(inner_id, k=5)
    neighbors = (algo.trainset.to_raw_iid(inner_ids)
                 for inner_ids in neighbors)
    neighbors = (rid_to_name[rid] for rid in neighbors)

    # fix: original used Python-2 print statements, which are a
    # SyntaxError in Python 3 (the rest of this file uses print()).
    print(get_user_recommendations())
    print("Apps that are available in the app directory similar to",
          rid_to_name[algo.trainset.to_raw_iid(inner_id)])
    for app in neighbors:
        print(app)

    app_to_category, category_to_app, all_categories = get_categories()
    if app_input in app_to_category:
        category = app_to_category[app_input]
        print("Other apps in this category:", category)
        print(category_to_app[category])
    def knn_centered(self):
        """Item-based KNNBaseline recommendations (pearson_baseline).

        Trains on the full ratings file and returns a dict mapping rank
        (1-based) -> recommended raw item id, printing each one.
        """
        print("calculating knn centered... File Rating: " + self.file_path)
        print("calculating knn centered... Item to Evaluate: " + self.item_evaluated)
        print("calculating knn centered... Number of recommendations: " + str(self.number_recommendations))

        # One "item user rating" record per line, skipping the header row.
        reader = Reader(line_format='item user rating', sep=self.delimiter, skip_lines=1)
        data = Dataset.load_from_file(self.file_path, reader=reader)

        # Fit on every rating (no hold-out split).
        trainset = data.build_full_trainset()
        algo = KNNBaseline(sim_options={'name': 'pearson_baseline',
                                        'user_based': False})
        algo.fit(trainset)

        # Inner id of the target item and its nearest neighbours.
        target_inner = algo.trainset.to_inner_iid(self.item_evaluated)
        neighbor_inner = algo.get_neighbors(target_inner,
                                            k=int(self.number_recommendations))

        dictionary_neighbors = {}
        print("\nTransition Component Based Ratings >> Recommended items by KNN Centered:")
        for rank, inner in enumerate(neighbor_inner, start=1):
            raw_id = algo.trainset.to_raw_iid(inner)
            dictionary_neighbors[rank] = raw_id
            print("- " + raw_id)

        return dictionary_neighbors
def kNNbAlgo(product_ID, nRecos=3):
    """Return a list of recommended product IDs via Surprise's KNNBaseline.

    Adapted from class tutorial. Trains an item-based model
    (pearson_baseline similarity) on 'backend/finalMaster.csv' — data must
    follow the "user item rating" format, recast using R into that file.

    Returns a list of n recommended products in product-ID format, where n
    is 3 by default but can be specified via `nRecos`.  Takes a product ID
    as the only required parameter.
    """
    data = Dataset.load_from_file(
        'backend/finalMaster.csv',
        reader=Reader(line_format='user item rating', sep=','))
    trainset = data.build_full_trainset()

    model = KNNBaseline(sim_options={'name': 'pearson_baseline',
                                     'user_based': False})
    model.fit(trainset)

    # Raw product id -> inner id, fetch neighbours, map back to raw ids.
    inner = model.trainset.to_inner_iid(str(product_ID))
    recoList = [model.trainset.to_raw_iid(nid)
                for nid in model.get_neighbors(inner, k=nRecos)]

    return recoList
# Exemple #4
# 0
def knn_cosine(dataset_path, target, value):
    """Return raw ids of the `value` nearest user-neighbours of `target`.

    NOTE(review): `target` is passed to get_neighbors unconverted, so it
    must already be an *inner* user id — confirm against callers.
    Neighbour inner ids are converted to raw uids and then mapped through
    the item->user mapping read from the data file.
    """
    # Read the data file: "user item rating", tab-separated, 1-7 scale.
    file_path = dataset_path
    reader = Reader(line_format='user item rating', rating_scale=(1, 7), sep='\t')
    data = Dataset.load_from_file(file_path, reader)

    # Construct a training set using the entire dataset (no folds);
    # trainset is an object of Trainset.
    trainset = data.build_full_trainset()

    # Train the algorithm to compute the similarities between users.
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)  # fix: train() is deprecated in Surprise — use fit()

    # Read the mappings user <-> item.
    # fix: use a context manager so the file is always closed (the
    # original handle was never closed).
    user_to_item = {}
    item_to_user = {}
    with open(file_path, "r") as file:
        for line in file:
            line = line.split('\t')
            user_to_item[line[0]] = line[1]
            item_to_user[line[1]] = line[0]

    # Retrieve neighbours and map inner ids back to raw ids.
    target_neighbors = algo.get_neighbors(target, k=value)
    target_neighbors = (algo.trainset.to_raw_uid(inner_id) for inner_id in target_neighbors)
    target_neighbors = (item_to_user[rid] for rid in target_neighbors)

    return target_neighbors
# Exemple #5
# 0
def get_nearest_neighbors(user_id):
    """Print and return the raw ids of the 10 users most similar to `user_id`."""
    # Location of the ratings CSV.
    file_path = os.path.expanduser('~/src/project/outward.csv')

    # Load the dataset with a comma-separated reader.
    reader = Reader(sep=',')
    data = Dataset.load_from_file(file_path, reader=reader)

    # Fit a user-based KNNBaseline model on every rating.
    trainset = data.build_full_trainset()
    algo = KNNBaseline(sim_options={'name': 'pearson_baseline',
                                    'user_based': True})
    algo.fit(trainset)

    # Inner id of the requested user, then its 10 nearest neighbours.
    inner_uid = algo.trainset.to_inner_uid(str(user_id))
    neighbor_inner_ids = algo.get_neighbors(inner_uid, k=10)

    print()
    print(f'The 10 nearest neighbors of {user_id} are:')
    neighbors_lst = []
    for raw_uid in (algo.trainset.to_raw_uid(nid) for nid in neighbor_inner_ids):
        print(raw_uid)
        neighbors_lst.append(raw_uid)

    return neighbors_lst
def get_similar_users_recommendations(uid, n=10):
    """Recommend items rated 5 by the users most similar to `uid`.

    Collects items from the 10 nearest neighbours until more than `n`
    have been gathered, printing each item name.  Also prints the user's
    own ratings via get_my_ratings.  Returns the set of item names.
    """
    # Train on the full dataset (module-level `data`).
    trainset = data.build_full_trainset()
    # fix: the keyword is `sim_options`; `sim_option` was silently
    # swallowed by **kwargs, so the intended user-based similarity
    # configuration never took effect.
    algo = KNNBaseline(sim_options=user_based_sim_option)
    algo.fit(trainset)
    inner_id = algo.trainset.to_inner_uid(uid)
    # The 10 most similar users.
    neighbors = algo.get_neighbors(inner_id, k=10)
    neighbors_uid = (algo.trainset.to_raw_uid(x) for x in neighbors)
    recommendations = set()
    # Add each neighbour's 5-star items to the recommendation set.
    for user in neighbors_uid:
        if len(recommendations) > n:
            break
        item = data_df[data_df['user'] == user]
        # NOTE(review): ratings compared as the string '5' — assumes
        # data_df stores ratings as strings; confirm.
        item = item[item['rating'] == '5']['item']
        for i in item:
            print(item_dict[i])
            recommendations.add(item_dict[i])
    # fix: removed a leftover keyboard-mash debug print.
    get_my_ratings(uid)
    return recommendations
# Exemple #7
# 0
def knn_experiment(movie_title):
    """Print titles of the 10 movies most similar to `movie_title`.

    `movie_title` is prefix-matched against titles from item.csv; the
    model is an item-based KNNBaseline trained on MovieLens 100k.
    Prints 'Movie not Found' and returns None when no title matches.
    """
    movie_items = pd.read_csv('item.csv')[['movie_id', 'title'
                                           ]].set_index('title').dropna()
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)

    # movie_id and title mapping (prefix match on the title index).
    row = movie_items.index.str.startswith(movie_title)
    # fix: bare `except:` hid every error; only "no matching row"
    # (IndexError from .values[0]) is the expected failure here.
    try:
        raw_id = str(movie_items[row]['movie_id'].values[0])
    except IndexError:
        print('Movie not Found')
        return

    # Getting KNN inner id of the provided movie.
    inner_id = algo.trainset.to_inner_iid(raw_id)

    # Get top 10 matched results as raw ids; a set makes the membership
    # test below O(1) instead of scanning a list per row.
    neighbors = algo.get_neighbors(inner_id, k=10)
    neighbors_ids = {algo.trainset.to_raw_iid(nid) for nid in neighbors}

    for x in movie_items['movie_id']:
        if str(x) in neighbors_ids:
            print(movie_items[movie_items['movie_id'] == x].index.values[0])
# Exemple #8
# 0
    def knn_basic(self):
        """User-based KNNBaseline recommendations (pearson_baseline).

        Trains on the full ratings file and returns a dict mapping rank
        (1-based) -> recommended raw item id.
        """
        print("calculating knn basic... File Rating: " + self.file_path)
        print("calculating knn basic... Item to Evaluate: " +
              self.item_evaluated)
        print("calculating knn basic... Number of recommendations: " +
              str(self.number_recommendations))

        # One "item user rating" record per line, skipping the header row.
        reader = Reader(line_format='item user rating',
                        sep=self.delimiter,
                        skip_lines=1)
        data = Dataset.load_from_file(self.file_path, reader=reader)

        # Fit on the complete dataset.
        trainset = data.build_full_trainset()
        algo = KNNBaseline(sim_options={'name': 'pearson_baseline',
                                        'user_based': True})
        algo.fit(trainset)

        # Nearest neighbours of the evaluated item, keyed by rank.
        inner = algo.trainset.to_inner_iid(self.item_evaluated)
        neighbor_ids = algo.get_neighbors(inner,
                                          k=int(self.number_recommendations))
        return {rank: algo.trainset.to_raw_iid(nid)
                for rank, nid in enumerate(neighbor_ids, start=1)}
def findSimilarItem(movieName):
    """Return the names of the 10 movies most similar to `movieName`."""
    movieLens = MovieLens()
    data = movieLens.loadMovieLensLatestSmall()
    # NOTE(review): the id<->name mappings come from the module-level `ml`,
    # not the local `movieLens` instance — confirm this is intentional.
    name_to_movieID = ml.name_to_movieID
    movieID_to_name = ml.movieID_to_name

    # Fit on the complete dataset, using the module-level sim_options.
    trainSet = data.build_full_trainset()
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainSet)

    # Raw movie id -> inner id for the requested title.
    movie_raw_id = name_to_movieID[movieName]
    movie_inner_id = algo.trainset.to_inner_iid(movie_raw_id)

    # 10 nearest neighbours, converted back to raw ids and then names.
    neighbor_inner_ids = algo.get_neighbors(movie_inner_id, k=10)
    moviesArray = [movieID_to_name[algo.trainset.to_raw_iid(nid)]
                   for nid in neighbor_inner_ids]

    return moviesArray
# Exemple #10
# 0
def get_similar_users_recommendations(uid, n=10):
    """Print up to 10 items rated 5 by the users most similar to `uid`."""
    # Train on the full dataset (module-level `data`).
    trainset = data.build_full_trainset()
    # Baseline-aware collaborative filtering.
    # fix: the keyword is `sim_options`; `sim_option` was silently
    # swallowed by **kwargs, so the user-based configuration never applied.
    algo = KNNBaseline(sim_options=user_based_sim_option)
    algo.fit(trainset)

    # fix: get_neighbors expects an *inner* id — the original passed
    # int(uid) straight through (the sibling function above converts with
    # to_inner_uid; do the same here).
    inner_id = algo.trainset.to_inner_uid(uid)
    neighbors = algo.get_neighbors(inner_id, k=10)
    neighbors_uid = (algo.trainset.to_raw_uid(x) for x in neighbors)

    recommendations = set()
    # Add each neighbour's 5-star items to the recommendation set.
    for user in neighbors_uid:
        if len(recommendations) > n:
            break
        item = data_df[data_df['user'] == str(user)]
        item = item[item['rating'] == '5']['item']
        for i in item:
            recommendations.add(item_dict[i])
    # Print at most 10 of the collected recommendations.
    for i, j in enumerate(list(recommendations)):
        if i >= 10:
            break
        print(j)
def get_similar_items(iid, n=10):
    """Return the names of the `n` items most similar to raw item id `iid`."""
    # Train on the full dataset (module-level `data`).
    trainset = data.build_full_trainset()
    # fix: the keyword is `sim_options`; `sim_option` was silently
    # swallowed by **kwargs, so the item-based configuration never applied.
    algo = KNNBaseline(sim_options=item_based_sim_option)
    algo.fit(trainset)
    inner_id = algo.trainset.to_inner_iid(iid)
    # The n most similar items, converted back to raw ids and then names.
    neighbors = algo.get_neighbors(inner_id, k=n)
    neighbors_iid = (algo.trainset.to_raw_iid(x) for x in neighbors)
    recommendations = [item_dict[x] for x in neighbors_iid]
    return recommendations
# Exemple #12
# 0
class Recmodel(object):
    """Playlist recommender wrapping a Surprise algorithm (NMF or KNNBaseline)."""

    def __init__(self, algo='knn_baseline', filepath=None):
        if not os.path.exists(filepath):
            raise FileNotFoundError("{} not exist".format(filepath))
        self.filepath = filepath
        # Choose the underlying Surprise model; anything other than 'nmf'
        # falls back to KNNBaseline.
        if algo == 'nmf':
            self.algo = NMF()
            self.model_name = 'nmf'
        else:
            self.algo = KNNBaseline()
            self.model_name = 'knn_baseline'

        self.convertor = DataConvertHelper()

    def buildDataSet(self):
        """Load the ratings file and build the full training set."""
        reader = Reader(line_format='user item rating timestamp', sep=',')
        music_data = Dataset.load_from_file(file_path=self.filepath,
                                            reader=reader)
        self.trainset = music_data.build_full_trainset()

    def train(self):
        """Fit the chosen algorithm on the training set."""
        print("begin training...")
        self.algo.fit(self.trainset)

    def evaluate(self, index):
        """Print the 10 playlists nearest to the playlist at `index`."""
        current_playlist_name = self.convertor.get_name_by_index(index)
        print('当前歌单:{}'.format(current_playlist_name))

        current_playlist_rid = self.convertor.get_rid_by_name(
            current_playlist_name)
        print("当前歌单rid: {}".format(current_playlist_rid))

        # Raw playlist id -> model's inner user id.
        playlist_inner_id = self.algo.trainset.to_inner_uid(
            current_playlist_rid)
        print('歌单inid', playlist_inner_id)

        # Nearest neighbours, converted inner id -> raw id -> name lazily.
        inner_neighbor_ids = self.algo.get_neighbors(playlist_inner_id, k=10)
        neighbor_names = (
            self.convertor.get_name_by_rid(self.algo.trainset.to_raw_uid(nid))
            for nid in inner_neighbor_ids)

        print('歌单 《', current_playlist_name, '》 最接近的10个歌单为:')
        for playlist_name in neighbor_names:
            print(
                playlist_name,
                self.algo.trainset.to_inner_uid(
                    self.convertor.get_rid_by_name(playlist_name)))
# Exemple #13
# 0
def recommend_friends(request):
    """Django view: for every user with ratings, find their 5 nearest
    neighbours with a user-based KNNBaseline model and persist the pairs
    as Candidates2 friend candidates.

    Renders "recommend_completed.html" when done.
    """
    queryset = Rate.objects.all()
    # Compile the ORM queryset to raw SQL so pandas can load it directly.
    query, params = queryset.query.as_sql(
        compiler='django.db.backends.sqlite3.compiler.SQLCompiler',
        connection=connections['default'])
    df = pd.read_sql_query(query, con=connections['default'], params=params)
    print("load data")
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'item_id', 'rate']], reader)
    trainset = data.build_full_trainset()
    # No 'user_based' key -> Surprise's default applies; similarities are
    # computed with pearson_baseline.
    sim_options = {'name': 'pearson_baseline'}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)
    for given_user_id in set(df['user_id']):
        print(given_user_id)
        given_user_id = int(given_user_id)
        _from = get_object_or_404(Profile, profile_id=given_user_id)
        inner_id = algo.trainset.to_inner_uid(given_user_id)
        #    to_inner_uid(), to_inner_iid(), to_raw_uid(), and to_raw_iid()
        neighbors = algo.get_neighbors(inner_id, k=5)
        results = [
            algo.trainset.to_raw_uid(inner_user_id)
            for inner_user_id in neighbors
        ]
        print('The 5 nearest neighbors of Given User Id:')

        # Upsert each neighbour into Candidates2: reuse the row for _from
        # when it exists, otherwise create one, then attach _to unless the
        # pair is already stored.
        for raw_user_id in results:
            _to = get_object_or_404(Profile, user_id=int(raw_user_id))
            # print(raw_user_id,Candidates2.objects.filter(user_from=user_from,user_to=user_to))
            if Candidates2.objects.filter(user_from=_from):
                if Candidates2.objects.filter(user_from=_from, user_to=_to):
                    print("user from , to 다 일치")
                    pass
                else:
                    cand = Candidates2.objects.get(user_from=_from)
                    cand.user_to.add(_to)
                    print("user from만 일치, to 추가")
            else:
                cand = Candidates2.objects.create()
                cand.user_from.add(_from)
                cand.user_to.add(_to)
            print("해당 유저 %s 에 대한 데이터 저장완료" % given_user_id)
    return render(request, "recommend_completed.html")
	def get(self, item_id):
		"""Return 10 story recommendations for `item_id` as JSON.

		Step 1: item-item KNNBaseline on star ratings produces candidate
		neighbours.  Step 2: candidates are re-ranked by TF-IDF title
		similarity against the requested story.

		NOTE(review): `item_id - 1` and `item_neighbors[i] - 1` index the
		cosine-similarity matrix, which assumes story ids are consecutive
		1-based integers matching the `stories` table order — confirm.
		"""
		# SQL query
		conn = mysql.connect()
		cursor = conn.cursor()
		# STEP 1 : KNN (sim_options below uses pearson_baseline, item-based)
		df = pd.read_sql_query("SELECT * FROM story_reviews", conn)

		# Data and Model
		reader = Reader(rating_scale=(1, 5))
		data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)
		sim_options = {'name': 'pearson_baseline', 'user_based': False}

		model = KNNBaseline(sim_options=sim_options)
		
		# Training on the full dataset (no hold-out split).
		training_set = data.build_full_trainset()
		model.fit(training_set)

		# Candidate neighbours of the requested story, as raw story ids.
		item_inner_id = model.trainset.to_inner_iid(item_id)
		item_neighbors_inner = model.get_neighbors(item_inner_id, k=10)
		item_neighbors = [model.trainset.to_raw_iid(inner_id) for inner_id in item_neighbors_inner]

		# STEP 2 : CASCADE IT WITH TF-IDF
		df_stories = pd.read_sql_query("SELECT * FROM stories", conn)

		# Model: word 1-3 grams over story titles, English stop words removed.
		tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
		tf_idf_matrix = tf.fit_transform(df_stories['title'])
		cosine_similarities = cosine_similarity(tf_idf_matrix, tf_idf_matrix)

		# Pair each KNN candidate with its title similarity to item_id.
		cosine_similarities_row = cosine_similarities[item_id-1]
		recommendations_list = []
		n = 10
		for i in range(n):
			recommendations_list.append((item_neighbors[i], cosine_similarities_row[item_neighbors[i]-1]))
				
		# Highest title-similarity first; keep only the story ids.
		recommendations_list.sort(key=lambda x:x[1], reverse=True)
		formatted_recommendations_list = [item[0] for item in recommendations_list]

		# Return K Nearest Neighbors
		return jsonify(recommendations = formatted_recommendations_list)
# Exemple #15
# 0
def compute_user_neighbors(id_name_dic, name_id_dic, trainset):
    """Print the 10 users most similar to an arbitrary probe user.

    Picks the second key of `name_id_dic` as the probe, finds its 10
    nearest neighbours with a default KNNBaseline, and prints each
    neighbour's inner id and name.
    """
    algo = KNNBaseline()
    algo.fit(trainset)

    # fix: dict.keys()[1] raises TypeError in Python 3 (dict_keys is not
    # subscriptable) — materialize the keys first.
    user_name = list(name_id_dic.keys())[1]
    print("user_name: ", user_name)
    user_id = name_id_dic[user_name]
    print("user_id: ", user_id)
    user_inner_id = algo.trainset.to_inner_uid(user_id)
    print("内部id: ", user_inner_id)

    user_neighbors = algo.get_neighbors(user_inner_id, k=10)

    # Inner ids -> raw ids -> user names.
    user_neighbors = (algo.trainset.to_raw_uid(inner_id)
                      for inner_id in user_neighbors)
    user_neighbors = (id_name_dic[user_id] for user_id in user_neighbors)
    print()
    print("和user 《", user_name, "》 最接近的10个user为:")
    for user in user_neighbors:
        print(algo.trainset.to_inner_uid(name_id_dic[user]), user)
# Exemple #16
# 0
def compute_movie_neighbors(id_name_dic, name_id_dic, trainset):
    """Print the 10 movies most similar to a hard-coded probe movie."""
    algo = KNNBaseline(sim_options={'user_based': False})
    algo.fit(trainset)

    # Hard-coded probe title.
    movie_name = "古墓迷途2"
    print("movie_name: ", movie_name)
    movie_id = name_id_dic[movie_name]
    print("movie_id: ", movie_id)
    movie_inner_id = algo.trainset.to_inner_iid(movie_id)
    print("内部id: ", movie_inner_id)

    neighbor_inner_ids = algo.get_neighbors(movie_inner_id, k=10)

    # Inner ids -> raw ids -> movie names, lazily.
    neighbor_names = (id_name_dic[algo.trainset.to_raw_iid(nid)]
                      for nid in neighbor_inner_ids)
    print()
    print("和movie 《", movie_name, "》 最接近的10个movie为:")
    for movie in neighbor_names:
        print(algo.trainset.to_inner_iid(name_id_dic[movie]), movie)
# Exemple #17
# 0
class NearestRecipesBaseline:
    """Item-based nearest-neighbour recipe recommender (Surprise KNNBaseline)."""

    def __init__(self, k=40):
        # Pearson-baseline similarity between recipes (items); quiet fit.
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self.model = KNNBaseline(k=k, sim_options=sim_options, verbose=False)

    def fit(self, reviews):
        """Fit on review records (Surprise accepts only DataFrames/folds)."""
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(DataFrame(reviews), reader)
        return self.model.fit(data.build_full_trainset())

    def predict(self, recipe_id, k=20):
        """Return raw ids of the `k` recipes nearest to `recipe_id`."""
        inner_id = self.model.trainset.to_inner_iid(recipe_id)
        neighbor_ids = self.model.get_neighbors(inner_id, k=k)
        return [self.model.trainset.to_raw_iid(nid) for nid in neighbor_ids]
# Exemple #18
# 0
def start(goods='951'):
    """Print the 10 items most similar to raw item id `goods` (default '951')."""
    file_path = os.path.expanduser('SampleData')
    # File format: "user item rating timestamp", comma-separated.
    reader = Reader(line_format='user item rating timestamp', sep=',')
    data = Dataset.load_from_file(file_path, reader=reader)

    trainset = data.build_full_trainset()
    # Item-based pearson_baseline similarity with the KNNBaseline algorithm.
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)  # fix: train() is deprecated in Surprise — use fit()

    iid_innerid = algo.trainset.to_inner_iid(goods)

    iid_neighbors = algo.get_neighbors(iid_innerid, k=10)

    # Inner ids back to raw item ids.
    iid_neighbors = (algo.trainset.to_raw_iid(inner_id)
                     for inner_id in iid_neighbors)

    print('The 10 nearest neighbors of %s:' % goods)
    for iid in iid_neighbors:
        print(iid)
	def get(self, algorithm, item_id):
		"""Return the 10 stories nearest to `item_id` as JSON, using the
		similarity measure named by `algorithm` (unknown names fall back
		to pearson_baseline)."""
		# SQL query
		conn = mysql.connect()
		cursor = conn.cursor()
		df = pd.read_sql_query("SELECT * FROM story_reviews", conn)

		# Build the Surprise dataset from the reviews table.
		reader = Reader(rating_scale=(1, 5))
		data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)

		# Item-based similarity; dispatch table replaces the if/elif chain.
		similarity_names = {
			'pearson': 'pearson',
			'cosine': 'cosine',
			'pearsonbaseline': 'pearson_baseline',
			'msd': 'msd',
		}
		sim_options = {
			'name': similarity_names.get(algorithm, 'pearson_baseline'),
			'user_based': False,
		}

		model = KNNBaseline(sim_options=sim_options)
		
		# Training on the full dataset.
		# TESTING
		# cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
		training_set = data.build_full_trainset()
		model.fit(training_set)

		# Nearest neighbours of the requested story, as raw ids.
		item_inner_id = model.trainset.to_inner_iid(item_id)
		item_neighbors_inner = model.get_neighbors(item_inner_id, k=10)
		item_neighbors = [model.trainset.to_raw_iid(inner_id) for inner_id in item_neighbors_inner]

		# Return K Nearest Neighbors
		return jsonify(recommendations = item_neighbors)
# Exemple #20
# 0
# Computing the pearson_baseline similarity matrix...
# Done computing similarity matrix.

# NOTE: train() is deprecated in Surprise — fit() is used below.
# algo.train(trainset)
algo.fit(trainset)

# Map raw item ids <-> movie names (helper defined elsewhere in the project).
rid_to_name, name_to_rid = read_item_names()

origin_film_name = 'GoldenEye (1995)'
toy_story_raw_id = name_to_rid[origin_film_name]
# print(toy_story_raw_id)

# A movie's inner id inside `algo` is not necessarily its id in the file.
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
# print(toy_story_inner_id)

# Find the nearest k = 10 neighbours.
K = 10
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=K)
print(toy_story_neighbors)

# Map the neighbour ids back to movie names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)

print('The ' + str(K) + ' nearest neighbors of ' + origin_film_name + ' are:')
for movie in toy_story_neighbors:
    print(movie)
    # NOTE(review): stray `return` below is illegal outside a function
    # (SyntaxError) — left over from the function this snippet was cut from.
    return rid_to_name, name_to_rid


# First, train the algorithm to compute the similarities between items.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name.
rid_to_name, name_to_rid = read_item_names()

# Inner id of the movie Toy Story.
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Inner ids of Toy Story's nearest neighbours.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert the neighbours' inner ids straight to movie names.
toy_story_neighbors = (rid_to_name[algo.trainset.to_raw_iid(inner_id)]
                       for inner_id in toy_story_neighbors)

print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)
# Exemple #22
# 0
# Train an item-based KNNBaseline model on the tab-separated ratings file.
reader = Reader(line_format='user item rating', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)

trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}

algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

path = os.path.join(BASE_DIR, os.path.dirname(os.path.abspath(__file__)),
                    'play_id.txt')

# fix: use context managers so both files are closed even on error
# (the original handles were closed manually / only on success).
with open(path) as f:
    cur_play_id = f.read()

cur_play_inner_id = algo.trainset.to_inner_iid(cur_play_id)
cur_play_neighbors = algo.get_neighbors(cur_play_inner_id, k=6)
# Inner ids back to raw playlist ids.
cur_play_neighbors = (algo.trainset.to_raw_iid(inner_id)
                      for inner_id in cur_play_neighbors)

path = os.path.join(BASE_DIR, os.path.dirname(os.path.abspath(__file__)),
                    'recmmond_play_result.txt')

with open(path, 'w') as f:
    f.write(','.join(cur_play_neighbors))
# Exemple #23
# 0
# print(trainset.n_users)
# Train a KNNBaseline model with default options (user-based similarity).
algo = KNNBaseline()
algo.fit(trainset)

# Pick the 40th playlist name as the probe.
current_palylist = list(name_id_dic.keys())[39]
print("歌单名称", current_palylist)
# Map the playlist name to its raw id...
playlist_id = name_id_dic[current_palylist]
print("歌单id", playlist_id)
# ...then to the model's internal user id (to_inner_uid).
playlist_inner_id = algo.trainset.to_inner_uid(playlist_id)
print("内部id", playlist_inner_id)

playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)

# Convert inner ids back to raw ids (to_raw_uid), then to playlist names.
playlist_neighbors = (algo.trainset.to_raw_uid(inner_id)
                      for inner_id in playlist_neighbors)
playlist_neighbors = (id_name_dic[playlist_id]
                      for playlist_id in playlist_neighbors)

# NOTE(review): this prints the generator object itself, not its contents.
print("之前的啥:", playlist_neighbors)
print("和歌单《", current_palylist, "》最接近的10首歌单为:\n")
for playlist in playlist_neighbors:
    print(playlist, algo.trainset.to_inner_uid(name_id_dic[current_palylist]))

# 针对用户进行预测
song_id_name_dic = pickle.load(open("popular_song.pkl", "rb"),
# Exemple #24
# 0
    # raw_id_toy_story = rid_to_name['1053']
    # print(raw_id_toy_story)

    # Build the full training set (uses `data` defined earlier).
    train_data = data.build_full_trainset()
    # Pearson-baseline similarity, item-based recommendation.
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    from surprise import KNNBaseline

    knn = KNNBaseline(sim_options=sim_options)
    knn.fit(train_data)

    rid_to_name, name_to_rid = read_item_names()
    toy_story_raw_id = name_to_rid['Now and Then (1995)']
    print("数据中的ID:%s" % toy_story_raw_id)

    # The inner id is the item's position in the similarity matrix, which
    # is not necessarily the id used in the data file.
    toy_story_inner_id = knn.trainset.to_inner_iid(toy_story_raw_id)
    print("矩阵中的ID:%s" % toy_story_inner_id)
    toy_story_neighbors = knn.get_neighbors(toy_story_inner_id, k=10)
    print("最近邻:%s" % toy_story_neighbors)

    # Convert the neighbour set's ids back to movie names.
    toy_story_neighbors = (knn.trainset.to_raw_iid(inner_id)
                           for inner_id in toy_story_neighbors)
    toy_story_neighbors = (rid_to_name[rid]
                           for rid in toy_story_neighbors)
    print()
    print('推荐的Top 10 :')
    for movie in toy_story_neighbors:
        print(movie)
# Train a user-based KNNBaseline model on space-separated Android ratings.
file_path = 'ratings_android.dat'
reader = Reader(line_format='user item rating', rating_scale=(1, 5), sep=' ')
data = Dataset.load_from_file(file_path, reader=reader)

trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': True}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)  # fix: train() is deprecated in Surprise — use fit()

# Inner ids of the 10 nearest neighbours of inner user id 1.
# NOTE(review): 1 is passed as an *inner* id without to_inner_uid — confirm.
toy_story_neighbors = algo.get_neighbors(1, k=10)

print('The 10 nearest neighbors of Toy Story are:')
print(toy_story_neighbors)
# Exemple #26
# 0
            if int(line[1]) in uid:
                uid[int(line[1])].update({int(line[0]):int(line[2])})
            else:
                uid[int(line[1])]={int(line[0]):int(line[2])}
#        print "done!"
    return uid


# Load "item user rating timestamp" records; folds are used by evaluate().
file_path = os.path.expanduser('~') + "/Downloads/CSC522/toy/sample.data"
reader = Reader(line_format='item user rating timestamp', sep=',')
data = Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=60)

trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': True}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)  # fix: train() is deprecated in Surprise — use fit()

# Nearest neighbours of user '911', converted back to raw user ids.
user_id = '911'
k = 22
user_inner_id = algo.trainset.to_inner_uid(user_id)
user_neighbors = algo.get_neighbors(user_inner_id, k=k)
user_neighbors = (algo.trainset.to_raw_uid(inner_id) for inner_id in user_neighbors)

print()
# fix: the message claimed "5 nearest" while 22 neighbours are requested.
print('The %d nearest neighbors of the userid %s are:' % (k, user_id))
for userid in user_neighbors:
    print(userid)

perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print(perf)
# Exemple #27
# 0
# Inner ids for every genre/game mentioned.
genre_inner_id_list = [algo_genre_group.trainset.to_inner_iid(genre)
                       for genre in full_genres]

game_inner_id_list = [algo_game_group.trainset.to_inner_iid(game)
                      for game in full_games]

# Top-3 neighbour lists for each of those genres/games.
genre_neighbors_list = [algo_genre_group.get_neighbors(inner, k=3)
                        for inner in genre_inner_id_list]

game_neighbors_list = [algo_game_group.get_neighbors(inner, k=3)
                       for inner in game_inner_id_list]

# Prioritize the two closest neighbours of every original genre mentioned.
genre_final_list = []
for neighbor_trio in genre_neighbors_list:
    genre_final_list.append(neighbor_trio[0])
    genre_final_list.append(neighbor_trio[1])

game_final_list = []
Exemple #28
0
# Uniform weight matrix of shape `s`; assumes `s`, `algo`, `trainset`,
# `testset`, and `K` are defined earlier in the script.
W = np.ones(s)
algo.weightUpdate(W)
predictions = algo.fit(trainset).test(testset)
# predictions = algo.fit(trainset)

# Dense prediction matrix: PredictM[user, item] = estimated rating (it[3]).
# NOTE(review): assumes raw uids/iids are numeric and usable as array
# indices — confirm against the dataset.
PredictM = np.zeros(s)
for it in predictions:
    PredictM[int(it[0]), int(it[1])] = it[3]
print(PredictM)
PM = pd.DataFrame(PredictM)
PM.to_csv("PredictionMatrix.csv")
# print (trainset.all_items())

# K nearest neighbours (inner ids) of every item, one row per item.
NeighborM = np.zeros([trainset.n_items, K])
for i, item in enumerate(trainset.all_items()):
    NeighborM[i] = algo.get_neighbors(item, k=K)
NM = pd.DataFrame(NeighborM)
NM.to_csv("NeighborMatrix.csv")
# print ("Inner id: ",iterator_neighbors[1],"Raw id:")
# print (algo.trainset.to_raw_uid(1))

# print Neighbor

# u_id:user id      i_id:item id
# un_: user number  in_: item number
# real rating matrix: r[u_id][i_id]
# estimate : r_h[m][u_id][i_id]
# ###########similarity matrix: (item_based) s[i_id][i_id]
# rating interval:
# weight matrix: W[u_id][i_id]
# k_neighbor: k_neighbor_li
# Exemple #29
# 0
def read_user_names():
    """Read the u.data file from the MovieLens 100-k dataset and return a
    mapping of raw user id -> raw user id (every uid seen in the file).

    fix: the old docstring was copied from read_item_names and wrongly
    claimed two name mappings were returned; only `user_id` is.
    """
    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.data'
    user_id = {}
    # fix: removed `name_to_rid`, which was never populated or returned.
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('\t')
            # First tab-separated field of u.data is the user id.
            user_id[line[0]] = line[0]

    return user_id


# First, train the algorithm to compute the similarities between users.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': True, 'min_support': 10}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

user_neighbors = algo.get_neighbors(130, k=10)

# fix: the model is user-based, so the neighbours are *user* inner ids —
# convert with to_raw_uid, not to_raw_iid as the original did.
user_neighbors = (algo.trainset.to_raw_uid(inner_id)
                  for inner_id in user_neighbors)

print('The 10 nearest neighbors of User 130 are:')
for user in user_neighbors:
    print(user)
# Exemple #30
# 0
    'C:/Users/xuwei/workspace/Surprise/src/data.txt')
# Tab-separated "user item rating" records.
reader = Reader(line_format='user item rating', sep='\t')

data = Dataset.load_from_file(file_path, reader=reader)

trainset = data.build_full_trainset()
# pearson_baseline similarity; user_based=False -> similarity between
# items (here: between books/movies), not users.
sim_options = {'name': 'pearson_baseline', 'user_based': False}
# Use the KNNBaseline algorithm.
algo = KNNBaseline(sim_options=sim_options)
# algo=KNNBaseline()
algo.fit(trainset)  # fix: train() is deprecated in Surprise — use fit()

# Persist the 5 nearest neighbours of every item into the database.
n = 0
for item in range(trainset.n_items):
    raw_id = trainset.to_raw_iid(item)
    neighbors = algo.get_neighbors(item, 5)
    for neighbor in neighbors:
        neighbor_raw_id = trainset.to_raw_iid(neighbor)
        # fix: parameterized query instead of %-interpolated SQL string.
        sql = "insert into book_similarity_icf(book_id1,book_id2) values(%s,%s);"
        cursor.execute(sql, (int(raw_id), int(neighbor_raw_id)))
        db.commit()
        n += 1
        if (n % 100 == 0):
            print('----------------')
            print(n, 'is done')
db.close()
# Exemple #31
# 0
    def train_knn(self, df, userId, user_m_ids, movies_watched):
        """Item-item collaborative filtering with Surprise's KNN-Baseline.

        param: df - movies pandas DataFrame
               userId - user ID to predict movies with
               user_m_ids - list of movie IDs of movies to be recommended
                            upon (as seen in TMDB dataset)
               movies_watched - list of movie titles watched
                                (as seen in TMDB dataset)

        return: pandas DataFrame of the recommended movies with attributes
                title, id, vote_average, vote_count, popularity, release date

        Similarities use pearson_baseline with user_based=False (item-item).
        """
        reader = Reader(rating_scale=(1, 5))
        movie_ids = self.get_movie_ids()
        rec_result = dict()

        sim_options = {"name": "pearson_baseline", "user_based": False}

        data = Dataset.load_from_df(df[RATING_ATTR], reader)
        # Reuse a cached fitted model when available to skip retraining.
        if isfile(PATH_COLL_FILTERING_CACHE):
            model = joblib.load(PATH_COLL_FILTERING_CACHE)
        else:
            trainset = data.build_full_trainset()
            model = KNNBaseline(sim_options=sim_options)
            model.fit(trainset)
            joblib.dump(model, PATH_COLL_FILTERING_CACHE)

        # Neighbours (inner ids) of the first candidate movie.
        inn_id = model.trainset.to_inner_iid(user_m_ids[0])
        # print(self.get_movie_title(self.get_tmdb_id(user_m_ids[0])))
        inn_id_neigh = model.get_neighbors(inn_id, k=10)
        # print(inn_id_neigh)

        df_pref = pd.DataFrame(columns=[
            "title",
            "id",
            "vote_average",
            "vote_count",
            "popularity",
            "release_date",
        ])
        index = 0

        for m_id in inn_id_neigh:
            title_df = self.get_movie_title(
                self.get_tmdb_id(model.trainset.to_raw_iid(m_id)))
            # fix: narrowed the bare `except: pass` — only a missing or
            # short lookup result is expected here; other errors surface.
            try:
                if title_df[0] not in movies_watched:
                    df_pref.loc[index] = array([
                        title_df[0],
                        title_df[1],
                        title_df[2],
                        title_df[3],
                        title_df[4],
                        title_df[5],
                    ])
                    index += 1
            except (IndexError, KeyError, TypeError):
                pass

        return df_pref