Example 1
def knn_item(trainset, testset, predset):
    
    modelname = 'knnitem'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return
    
    bsl_options = { 'method': 'als',
                    'reg_i': 1.e-5,
                    'reg_u': 14.6,
                    'n_epochs': 10
                   }
    sim_options = {
                    'name': 'pearson_baseline',
                    'shrinkage': 100,
                    'user_based': False
                    }
    algo = KNNBaseline(k=60, sim_options=sim_options, bsl_options=bsl_options)
    print('KNN item based Model')
    algo.fit(trainset)  # train() was removed from Surprise; fit() is the current API
    
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds, 'test')

    print('   Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds)
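
The function above leans on two helpers that are not shown. A minimal sketch of what they might look like, assuming predictions are cached as .npy files under a preds/ directory (the directory layout and file naming are assumptions, not part of the original):

import os
import numpy as np

PRED_DIR = 'preds'  # assumed cache directory

def is_already_predicted(modelname):
    # Treat the model as already done if its prediction file exists.
    return os.path.exists(os.path.join(PRED_DIR, modelname + '.npy'))

def save_predictions(modelname, rmse, preds, suffix=''):
    # Persist the predicted ratings; the suffix separates test-set dumps
    # from final prediction dumps, mirroring how the caller uses it.
    os.makedirs(PRED_DIR, exist_ok=True)
    name = modelname + ('_' + suffix if suffix else '') + '.npy'
    np.save(os.path.join(PRED_DIR, name), preds)
    print('   Saved {} (RMSE {:.5f})'.format(name, rmse))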
Example 2
def knn_baseline_movie(train, test, ids, Xtest, Xids):
    """
    Nearest-neighbour approach using the movie baseline.
    Arguments: train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """

    print('kNN Baseline Movie')
    bsl_option = {'method': 'als', 'n_epochs': 100, 'reg_u': 15, 'reg_i': 0.01}

    sim_option = {
        'name': 'pearson_baseline',
        'min_support': 1,
        'user_based': False
    }

    algo = KNNBaseline(k=100,
                       bsl_options=bsl_option,
                       sim_options=sim_option,
                       verbose=False)

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
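
A possible way to drive knn_baseline_movie, assuming the ratings live in a pandas DataFrame with user, movie and rating columns (the file name, column names and the ids pairs below are placeholders):

import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

df = pd.read_csv('ratings.csv')  # hypothetical file with columns user, movie, rating
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user', 'movie', 'rating']], reader)
train, test = train_test_split(data, test_size=0.1)

# ids holds the (user, movie) pairs whose ratings must be predicted.
ids = (df['user'].values, df['movie'].values)
Xtest, Xids = [], []
rmse, Xtest, Xids, preds_test, preds_ids = knn_baseline_movie(train, test, ids, Xtest, Xids)
print('blending features collected for', len(Xtest), 'model(s)')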
Example 3
def surprise_knn_ub(train_file, test_file):
    """
    KNN user-based with the Surprise library.
    Compute the predictions on a test set after training on a train set using the KNNBaseline method from Surprise.
    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        k : the (max) number of neighbors to take into account for aggregation
        sim_options (dict): a dictionary of options for the similarity measure.

    Returns:
        numpy array: predictions
    """
    print("knnUB")
    algo = KNNBaseline(k=300,
                       sim_options={
                           'name': 'pearson_baseline',
                           'user_based': True
                       })
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        val = predictions[i].est
        pred[i] = val
    return pred
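
Calling it only requires the two comma-separated rating files; both paths below are placeholders:

train_file = 'train_ratings.csv'  # lines formatted as user,item,rating
test_file = 'test_ratings.csv'
predictions = surprise_knn_ub(train_file, test_file)
print(predictions[:10])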
Example 4
def get_top_n_for_user(target_user_id, recom_alg, recom_size):
    
    file_path = os.path.expanduser('static/CRdata.csv')
    reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0,100))
    data = Dataset.load_from_file(file_path,reader=reader)
    trainset = data.build_full_trainset()
    testset = trainset.build_anti_testset()
    
    
    if recom_alg == 'KNNBaseline':
        similarity = {'name': 'cosine',
                      'user_based': True  # compute similarities between users
                      }
        algo = KNNBaseline(sim_options=similarity)
    elif recom_alg == 'CoClustering':
        algo = CoClustering()
    else:
        algo = SVD()

    algo.fit(trainset)
    predictions  = algo.test(testset)

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:recom_size]

    return top_n[str(target_user_id)]
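
Example usage, assuming static/CRdata.csv exists and user 42 appears in it (both assumptions):

recommendations = get_top_n_for_user(42, 'KNNBaseline', 5)
for iid, est in recommendations:
    print(iid, round(est, 2))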
Example 5
def main():

	# Loads dataset
	rating_data_set = load_dataset(TRAINING_SET_PATH)

	# Clean data
	rating_data_set = remove_missing_values(rating_data_set)

	# Slice data
	drop_movie_list, rating_data_set = slice_data(rating_data_set)

	# Loads movie file
	movies = load_movies_file(drop_movie_list, MOVIES_FILE_PATH)


	reader = Reader()

	sim_options = {'name': 'cosine', 'min_support': 2, 'shrinkage': 100, 'user_based': True}
	bsl_options = {'method': 'sgd'}
	data = Dataset.load_from_df(rating_data_set[['CustomerID', 'MovieID', 'Rating']][:1000], reader)

	kf = KFold(n_splits=5)
	#algo = SVD()
	algo = KNNBaseline(k=N, sim_options=sim_options, bsl_options=bsl_options)

	i = 0

	for trainset, testset in kf.split(data):
		print("Running fold: ", i)
		algo.fit(trainset)
		predictions = algo.test(testset)
		precisions, recalls = precision_recall(predictions, 20)

		# Precision and recall can then be averaged over all users
		print(sum(prec for prec in precisions.values()) / len(precisions))
		print(sum(rec for rec in recalls.values()) / len(recalls))

		i += 1
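
The precision_recall helper is not shown in this example. A sketch in the style of the Surprise FAQ's precision/recall@k recipe; the relevance threshold of 3.5 is an assumption:

from collections import defaultdict

def precision_recall(predictions, k, threshold=3.5):
    # Map each user to their (estimated, true) rating pairs.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, ratings in user_est_true.items():
        ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for _, true_r in ratings)
        n_rec_k = sum(est >= threshold for est, _ in ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold)
                              for est, true_r in ratings[:k])
        # Precision@K and Recall@K, defined as 1 when the denominator is 0.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 1
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 1
    return precisions, recalls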
Example 6
# read the file into a dictionary
def read():
    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_name = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as file:
        for line in file:
            line = line.split('|')
            rid_name[line[0]] = (line[1], line[2])
    return rid_name


# score with the best parameters (test)
test = train.build_anti_testset()
test = filter(lambda x: x[0] == user, test)
pr = algorithm.test(test)
name = read()

# build a list of (iid, est) per user (uid)
top = collections.defaultdict(list)
for uid, iid, _, est, _ in pr:
    top[uid].append((iid, round(est, 3)))

# sort and keep only the top_n items
for uid, ratings in top.items():
    ratings.sort(key=lambda x: x[1], reverse=True)
    top[uid] = ratings[:top_n]

print(f'User {user}:')
for movie, rating in top[user]:
    print(movie, str(name[movie]), rating)
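
This fragment references several names (train, algorithm, user, top_n) defined before the excerpt begins. A minimal setup that would make it runnable; the dataset, similarity options and user id are assumptions:

import io
import collections
from surprise import Dataset, KNNBaseline, get_dataset_dir

data = Dataset.load_builtin('ml-100k')
train = data.build_full_trainset()

algorithm = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': False})
algorithm.fit(train)

user = '196'  # raw ml-100k user id, chosen arbitrarily
top_n = 10    # number of recommendations to keep per user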
Example 7
def collaborative_filtering(raw_uid):
    # To read the data from a txt file
    # =============== Data preprocessing ===========================
    # Dump all the data from the database to a file
    # dir_data = '/www/wwwroot/music_recommender/page/cf_recommendation/cf_data'
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # Database operations
    # Open the database connection
    db = pymysql.connect(host="localhost",
                         user="root",
                         password="password",
                         database="music_recommender",
                         charset='utf8')

    # Create a cursor object with the cursor() method
    cursor = db.cursor()
    songData = defaultdict(list)
    sql = """SELECT uid, song_id, rating
              FROM user_rating
               WHERE 1"""
    cursor.execute(sql)
    results = cursor.fetchall()
    with open(file_path, "w+") as data_f:
        for result in results:
            uid, song_id, rating = result
            # songData is a defaultdict(list), so new keys start as empty lists
            songData[song_id].append(rating)
            data_f.write("{}\t{}\t{}\n".format(uid, song_id, rating))
  
    if not os.path.exists(file_path):
        raise IOError("Dataset file does not exist!")

    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    # Build the training set
    trainset = data.build_full_trainset()
  
    bsl_options = {'method': 'sgd',
                    'learning_rate': 0.0005,
                 }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)  # train the model

    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=10)
    # print(predictions)
    # uid      raw user id
    # iid      raw item id
    # r_ui     true rating (float)
    # est      estimated rating (float)
    # details  other details about the prediction
    # print(top_n_baselineonly, 'top_n_baselineonly')
    

    # KNNBasic
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)

    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
  
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)

    top_n_knnbasic = get_top_n(predictions, n=1000)
    # print(predictions, 'top_n_knnbasic')
    # KNNBaseline
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)

    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=1000)

    evaluationMSEResult = evaluationMSE([top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline], raw_uid)

    recommendset = set()
    for results in [top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline]:
        for key in results.keys():
            for recommendations in results[key]:
                iid, rating, true_score = recommendations
                recommendset.add(iid)

    items_baselineonly = set()
    for key in top_n_baselineonly.keys():
        for recommendations in top_n_baselineonly[key]:
            iid, rating, true_score = recommendations
            items_baselineonly.add(iid)

    items_knnbasic = set()
    for key in top_n_knnbasic.keys():
        for recommendations in top_n_knnbasic[key]:
            iid, rating, true_score = recommendations
            items_knnbasic.add(iid)

    items_knnbaseline = set()
    for key in top_n_knnbaseline.keys():
        for recommendations in top_n_knnbaseline[key]:
            iid, rating, true_score = recommendations
            items_knnbaseline.add(iid)

    rank = dict()
    for recommendation in recommendset:
        if recommendation not in rank:
            rank[recommendation] = 0
        if recommendation in items_baselineonly:
            rank[recommendation] += 1
        if recommendation in items_knnbasic:
            rank[recommendation] += 1
        if recommendation in items_knnbaseline:
            rank[recommendation] += 1

    max_rank = max(rank.values())  # highest agreement count across the models
    evaluationMSEResult1 = {}
    if max_rank == 1:
        return items_baselineonly
    else:
        resultAll = dict()
        result = nlargest(10, rank, key=lambda s: rank[s])
        for k in result:
            resultAll[k] = rank[k]
        # print("排名结果: {}".format(resultAll))
        evaluation(songData, resultAll)
        for key in evaluationMSEResult:
            if key in resultAll:
                evaluationMSEResult1[key] = evaluationMSEResult[key]
        print(evaluationMSEResult1, 'evaluationMSEResult1==')  # final evaluation
        return resultAll
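
get_top_n is assumed rather than shown here. A sketch matching how this example unpacks its output as (iid, rating, true_score) triples:

from collections import defaultdict

def get_top_n(predictions, n=10):
    # Map each user to their n highest-estimated items, keeping the true
    # rating alongside the estimate so callers can unpack all three.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est, true_r))
    for uid, ratings in top_n.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = ratings[:n]
    return top_n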
Example 8
    def recommend(self, params):
        user = self.fullTrainSet.to_inner_uid(params["user"])

        antiTestSet = self._buildAntiTestSetForUser(user)

        algo = params["algorithm"]
        path = "models/" + algo

        if algo == "svd":
            if "models" not in params.keys():
                args = {
                    "random_state": 0,
                    "reg_all": float(params["rr"]),
                    "lr_all": float(params["lr"]),
                    "n_epochs": int(params["ne"]),
                    "n_factors": int(params["factors"])
                }

                svd = SVD(**args)
                svd = svd.fit(self.fullTrainSet)
                predictions = svd.test(antiTestSet)

                if "name" in params.keys():
                    mm = ModelManager()
                    name = params["name"]
                    path = path + "/" + name
                    mm.saveModel(svd, path)
                    self.models[algo].append(name)
            else:
                mm = ModelManager()
                model = params["models"]
                path = path + "/" + model

                svd, _ = mm.loadModel(path)
                predictions = svd.test(antiTestSet)

            topN = self._getTopNForUser(predictions)
            topN = [(self.getAdditionalData(movieId), int(round(estimated, 0))) for movieId, estimated in topN]

        elif algo == "knnItemBaseline":

            if "models" not in params.keys():
                args = {
                    "sim_options" : {'name': 'cosine', 'user_based': False},
                    "k": int(params["k"])
                }

                knn = KNNBaseline(**args)
                knn = knn.fit(self.fullTrainSet)
                predictions = knn.test(antiTestSet)

                if "name" in params.keys():
                    mm = ModelManager()
                    name = params["name"]
                    path = path + "/" + name
                    mm.saveModel(knn, path)
                    self.models[algo].append(name)
            else:
                mm = ModelManager()
                model = params["models"]
                path = path + "/" + model

                knn, _ = mm.loadModel(path)
                predictions = knn.test(antiTestSet)

            topN = self._getTopNForUser(predictions)
            topN = [(self.getAdditionalData(movieId), int(round(estimated, 0))) for movieId, estimated in topN]

        elif algo == "weightedHybrid":

            svd = SVD(random_state=0, reg_all=0.1, lr_all=0.003, n_factors=30, verbose=False)
            knn = KNNBaseline(sim_options={'name': 'cosine', 'user_based': False}, k=150)
            weightedHybrid = WeightedHybridAlgorithm(svd, knn, weights=[0.6, 0.4])
            weightedHybrid.fit(self.fullTrainSet)
            predictions = weightedHybrid.test(antiTestSet)
            topN = self._getTopNForUser(predictions)
            topN = [(self.getAdditionalData(movieId), int(round(estimated, 0))) for movieId, estimated in topN]

        elif algo == "userCollaborative":

            if "models" not in params.keys():
                args = {
                    "k": int(params["k"]),
                    "sim_options": {'name': 'cosine', 'user_based': True}
                }

                knn = knnRecAlgorithm(**args)
                knn = knn.fit(self.fullTrainSet)
                predictions = knn.test(antiTestSet)

                if "name" in params.keys():
                    mm = ModelManager()
                    name = params["name"]
                    path = path + "/" + name
                    mm.saveModel(knn, path)
                    self.models[algo].append(name)
            else:
                mm = ModelManager()
                model = params["models"]
                path = path + "/" + model

                knn, _ = mm.loadModel(path)
                predictions = knn.test(antiTestSet)

            topN = self._getTopNForUser(predictions, minimumRating=0.0)
            # topN = [(self.getAdditionalData(movieId), round(estimated, 2)) for movieId, estimated in topN]
            topN = [(self.getAdditionalData(movieId), "") for movieId, estimated in topN]

        elif algo == "bpr":

            if "models" not in params.keys():

                args = {
                    "reg": float(params["rr"]),
                    'learning_rate': float(params["lr"]),
                    'n_iters': int(params["ni"]),
                    'n_factors': int(params["factors"]),
                    'batch_size': 100
                }

                bpr = BPRecommender(args)
                bpr = bpr.fit()

                if "name" in params.keys():
                    mm = ModelManager()
                    name = params["name"]
                    path = path + "/" + name
                    mm.saveBprModel(bpr, path)
                    self.models[algo].append(name)
            else:
                mm = ModelManager()
                model = params["models"]
                path = path + "/" + model
                bpr = mm.loadBprModel(path)

            topN = bpr.recommend(user)

            topN = [(self.getAdditionalData(movieId), "") for movieId in topN]

        return topN
Example 9
def run_knn_baseline(sparse_data):
    #filename = "test.json"
    prefix = "knn_baseline_"
    trainFile = prefix + "train.txt"
    testFile = prefix + "test.txt"

    raw_data, userPurchasedSet, userTrueTestSet = preprocess(
        sparse_data, trainFile, testFile)
    folds_files = [(trainFile, testFile)]
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()
    bsl_options = {
        'method': 'sgd',
        'n_epochs': 20,
        'learning_rate': 0.005,
    }
    ### sim name: cosine    msd       pearson     pearson_baseline
    ### user_based : True ---- similarity will be computed based on users
    ###            : False ---- similarity will be computed based on items.
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    predictions = {}
    top_n = {}
    testsSet = None
    total_precisions = 0.0
    total_recalls = 0.0
    total_hit = 0.0
    total_nDCG = 0.0
    total_ffeature = 0.0
    result_file = prefix + "result.txt"
    result_f = open(result_file, "w")
    for trainset, testset in pkf.split(data):
        testsSet = testset

        #algo = SVD(n_factors = 5)
        algo = KNNBaseline(bsl_options=bsl_options, sim_options=sim_options)
        algo.fit(trainset)
        pre = algo.test(testset)
        accuracy.rmse(pre)
        accuracy.mae(pre)
        #calculate_rmse(predictions)

        ### test
        rowNum = raw_data.get_row_size()
        colNum = raw_data.get_col_size()
        cur_time = time.time()
        time_cost = 0

        for i in range(rowNum):
            user = raw_data.get_userID(i)
            predictions[user] = set()
            pq = []
            heapq.heapify(pq)
            for j in range(colNum):
                item = raw_data.get_itemID(j)
                if user not in userPurchasedSet or item in userPurchasedSet[
                        user]:
                    continue
                value = raw_data.get_val(user, item, 'rating')
                predict = algo.predict(user, item, r_ui=0, verbose=False).est
                if len(pq) < 10:
                    heapq.heappush(pq, (predict, item))
                else:
                    # push then pop the minimum, so the 10 highest-scoring
                    # items are kept (pop-then-push could drop a better item)
                    heapq.heappushpop(pq, (predict, item))
            top_n[user] = set()
            for items in pq:
                top_n[user].add(items[1])
            if user in userTrueTestSet:
                curPrecisions = calculate_precision(top_n[user],
                                                    userTrueTestSet[user])
                curRecalls = calculate_recall(top_n[user],
                                              userTrueTestSet[user])
                ffeature = calculate_f_feature(curPrecisions, curRecalls)
                curHit = isHit(top_n[user], userTrueTestSet[user])
                cur_nDCG = calculate_NDCG(top_n[user], userTrueTestSet[user])
                total_precisions += curPrecisions
                total_recalls += curRecalls
                total_hit += curHit
                total_nDCG += cur_nDCG
                total_ffeature += ffeature
                result_f.write(user + "\t" + str(curPrecisions) + "\t" +
                               str(curRecalls) + "\t" + str(ffeature) + "\t" +
                               str(curHit) + '\t' + str(cur_nDCG) + "\n")
            if i != 0 and i % 1000 == 0:
                duration = (time.time() - cur_time) / 60
                time_cost += duration
                remaining_time = ((rowNum - i) / 1000) * duration
                cur_time = time.time()
                # print('precisions', total_precisions, ' recalls', total_recalls, ' nDCG', total_nDCG)
                print('i:', i, "/", rowNum, 'remaining time:', remaining_time, 'min')
    print('precisions', total_precisions, ' recalls', total_recalls, ' hit', total_hit, 'nDCG:', total_nDCG)
    rowNum = raw_data.get_row_size()
    print('avg_precisions:', total_precisions / rowNum, 'avg_recalls:', total_recalls / rowNum,
          'avg_ffeature', total_ffeature / rowNum, 'avg_hit:', total_hit / rowNum,
          'avg_nDCG:', total_nDCG / rowNum)
    result_f.write("avg:\t" + str(total_precisions / rowNum) + "\t" +
                   str(total_recalls / rowNum) + "\t" +
                   str(total_ffeature / rowNum) + "\t" +
                   str(total_hit / rowNum) + '\t' + str(total_nDCG / rowNum) +
                   "\n")
    result_f.close()
Example 10
rating.columns = ['route_id', 'user_id', 'rating']
df = pd.merge(user, rating, on='user_id', how='inner')
#df.drop(['user_id', 'Age'], axis=1, inplace=True)
df.head()

reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df[['user_id', 'route_id', 'rating']], reader)

train, test = train_test_split(data, test_size=.2)

sim_options = {'name': 'msd',
               'min_support': 5,
               'user_based': True}
base1 = KNNBaseline(k=30, sim_options=sim_options)

base1.fit(train)
base1_preds = base1.test(test)
accuracy.rmse(base1_preds)

sim_options1 = {'name': 'cosine',
               'min_support': 5,
               'user_based': True}
base13 = KNNBaseline(k=2, sim_options=sim_options1)


base13.fit(train)
base13_preds = base13.test(test)
acc = accuracy.rmse(base13_preds)


dump.dump('KNNFinal_Model', algo=base13, predictions=base13_preds)
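
The dumped model can be restored later; dump.load returns the (predictions, algo) pair that dump.dump saved:

from surprise import dump

loaded_predictions, loaded_algo = dump.load('KNNFinal_Model')
print(len(loaded_predictions), 'predictions restored')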
Example 11
def collaborative_filtering(raw_uid):
    # =============== Data preprocessing ===========================
    # Dump all the data from the database to a file
    # dir_data = '/www/wwwroot/music_recommender/page/cf_recommendation/cf_data'
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # Database operations
    # Open the database connection
    db = pymysql.connect(host="localhost",
                         user="music_system",
                         password="music_system",
                         database="music_recommender",
                         charset='utf8')

    # Create a cursor object with the cursor() method
    cursor = db.cursor()

    sql = """SELECT uid, song_id, rating
              FROM user_rating
               WHERE 1"""
    cursor.execute(sql)
    results = cursor.fetchall()
    with open(file_path, "w+") as data_f:
        for result in results:
            uid, song_id, rating = result

            data_f.writelines("{}\t{}\t{}\n".format(uid, song_id, rating))

    if not os.path.exists(file_path):
        raise IOError("Dataset file does not exist!")

    # ===========  cf recommend ==================
    # Load the data
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)

    # Build the training set from all the data
    trainset = data.build_full_trainset()

    # ================= BaselineOnly  ==================
    # start = time.clock()

    bsl_options = {
        'method': 'sgd',
        'learning_rate': 0.0005,
    }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)

    # Get the recommendation results
    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=5)

    # end = time.clock()
    # print("user-50NN --- BaselineOnly 耗时: %.2fs\n" % (end-start))
    # print("BaselineOnly 推荐结果:{}\n".format(top_n_baselineonly))

    # ================= KNNBasic  ==================
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)

    # Get the recommendation results --- only consider the knn users
    # start = time.clock()
    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)
    top_n_knnbasic = get_top_n(predictions, n=5)

    # end = time.clock()
    # print("user-50NN --- KNNBasic 耗时: %.2fs\n" % (end-start))
    # print("KNNBasic 推荐结果:{}\n".format(top_n_knnbasic))

    # ================= KNNBaseline  ==================
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)

    # Get the recommendation results --- only consider the knn users
    # start = time.clock()
    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=5)

    # end = time.clock()
    # print("user-50NN --- KNNBaseline 耗时: %.2fs\n" % (end-start))
    # print("KNNBaseline 推荐结果:{}\n".format(top_n_knnbaseline))

    # =============== Generate the final recommendations by vote ==================
    recommendset = set()
    for results in [top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline]:
        for key in results.keys():
            for recommendations in results[key]:
                iid, rating = recommendations
                recommendset.add(iid)

    items_baselineonly = set()
    for key in top_n_baselineonly.keys():
        for recommendations in top_n_baselineonly[key]:
            iid, rating = recommendations
            items_baselineonly.add(iid)

    items_knnbasic = set()
    for key in top_n_knnbasic.keys():
        for recommendations in top_n_knnbasic[key]:
            iid, rating = recommendations
            items_knnbasic.add(iid)

    items_knnbaseline = set()
    for key in top_n_knnbaseline.keys():
        for recommendations in top_n_knnbaseline[key]:
            iid, rating = recommendations
            items_knnbaseline.add(iid)

    rank = dict()
    for recommendation in recommendset:
        if recommendation not in rank:
            rank[recommendation] = 0
        if recommendation in items_baselineonly:
            rank[recommendation] += 1
        if recommendation in items_knnbasic:
            rank[recommendation] += 1
        if recommendation in items_knnbaseline:
            rank[recommendation] += 1

    max_rank = max(rank.values())  # highest agreement count across the models
    if max_rank == 1:
        # print(items_baselineonly)
        return items_baselineonly
    else:
        result = nlargest(5, rank, key=lambda s: rank[s])
        # print(result)
        return result
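
PredictionSet and user_build_anti_testset are helpers these database-backed examples assume. A sketch of what they might look like, mirroring Trainset.build_anti_testset for a single user (the class is simplified to consider all of the user's unseen items):

class PredictionSet:
    def __init__(self, algo, trainset, raw_uid):
        self.algo = algo
        self.trainset = trainset
        self.raw_uid = raw_uid

    def user_build_anti_testset(self):
        return user_build_anti_testset(self.trainset, self.raw_uid)


def user_build_anti_testset(trainset, raw_uid):
    # All (user, item, fill) triples for items this user has NOT rated,
    # with the global mean as the fill value, like build_anti_testset().
    fill = trainset.global_mean
    inner_uid = trainset.to_inner_uid(raw_uid)
    rated = {iid for (iid, _) in trainset.ur[inner_uid]}
    return [(raw_uid, trainset.to_raw_iid(iid), fill)
            for iid in trainset.all_items() if iid not in rated]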
Example 12
class recsysBase:
    data = ''
    trainset = ''
    testset = ''
    algorithm = ''
    algo = ''
    predictions = ''

    def __init__(self,
                 data,
                 algorithm='svd',
                 algo_options={},
                 testset_percent=0):
        if not data:
            return

        self.data = data
        self.algorithm = algorithm

        ##
        if testset_percent == 0:
            self.trainset = self.data.build_full_trainset()
            self.testset = self.trainset.build_anti_testset()
        else:
            self.trainset, self.testset = train_test_split(
                self.data, test_size=testset_percent)

        if self.algorithm == 'svd':
            self.algo = SVD()
        elif self.algorithm == 'knn_basic':
            self.algo = KNNBasic()
        elif self.algorithm == 'knn_baseline':
            if not algo_options:
                algo_options = {
                    'name': 'pearson_baseline',
                    'user_based': False
                }

            self.algo = KNNBaseline(sim_options=algo_options)

        self.algo.fit(self.trainset)

    def exec(self):
        self.step1()
        self.step2()
        self.step3()

    def step1(self):
        pass

    def step2(self):
        pass

    def step3(self):
        pass

    def compute_rmse(self):
        if not self.predictions:
            self.test()

        accuracy.rmse(self.predictions)

    def load_from_file(self, file_path='predictions.csv'):
        self.predictions = pd.read_csv(file_path)

    def save_to_file(self, file_path='predictions.csv'):
        pd.DataFrame(self.predictions).to_csv(file_path, index=False)

    def benchmark(self):
        cross_validate(self.algo,
                       self.data,
                       measures=['RMSE', 'MAE'],
                       cv=5,
                       verbose=True)

    def tune(self,
             opt_field='rmse',
             param_grid={
                 'n_epochs': [5, 10],
                 'lr_all': [0.002, 0.005],
                 'reg_all': [0.4, 0.6]
             },
             SHOW_RESULT=False):

        if self.algorithm == 'svd':
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

        ## Start tuning
        gs.fit(self.data)

        ## Save to self.algo
        self.algo = gs.best_estimator[opt_field]
        self.algo.fit(self.trainset)

        if SHOW_RESULT:
            # best RMSE score
            print(gs.best_score['rmse'])

            # combination of parameters that gave the best RMSE score
            print(gs.best_params['rmse'])

        return self

    def tune_and_test(self,
                      unbiased_percent=0.1,
                      opt_field='rmse',
                      param_grid={
                          'n_epochs': [5, 10],
                          'lr_all': [0.001, 0.01]
                      }):

        ## Get RAW
        raw_ratings = self.data.raw_ratings

        ## Shuffle ratings if you want
        random.shuffle(raw_ratings)

        ##
        threshold = int((1 - unbiased_percent) * len(raw_ratings))
        A_raw_ratings = raw_ratings[:threshold]
        B_raw_ratings = raw_ratings[threshold:]

        data = self.data
        data.raw_ratings = A_raw_ratings

        ## Select your best algo with grid search.
        grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
        grid_search.fit(data)

        self.algo = grid_search.best_estimator[opt_field]

        # retrain on the whole set A
        trainset = data.build_full_trainset()
        self.algo.fit(trainset)

        # Compute biased accuracy on A
        predictions = self.algo.test(trainset.build_testset())
        print('Biased accuracy on A,', end='   ')
        accuracy.rmse(predictions)

        # Compute unbiased accuracy on B
        testset = data.construct_testset(
            B_raw_ratings)  # testset is now the set B
        predictions = self.algo.test(testset)
        print('Unbiased accuracy on B,', end=' ')
        accuracy.rmse(predictions)

        return self

    def test(self):
        self.predictions = self.algo.test(self.testset)
        self.compute_rmse()

    def get_top_n(self, target_uid=None, n=10, SHOW_RESULT=False):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendations to output for each user.
                Default is 10.

        Returns:
            A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''

        if target_uid:
            target_uid = str(target_uid)

        # Check if testset is valid
        if not self.predictions:
            self.predictions = self.algo.test(self.testset)

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in self.predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            if target_uid and target_uid != uid:
                continue

            user_ratings.sort(key=lambda x: x[1], reverse=True)

            if target_uid:
                top_n = user_ratings[:n]
                break
            else:
                top_n[uid] = user_ratings[:n]

        # Print the recommended items for each user
        if SHOW_RESULT:
            try:
                for uid, user_ratings in top_n.items():
                    print(uid, [iid for (iid, _) in user_ratings])
            except AttributeError:
                # top_n is a plain list when target_uid was given
                print(top_n)

        return top_n

    def precision_recall_at_k(self,
                              target_uid=1,
                              threshold=3.5,
                              k=10,
                              num_of_testset=5,
                              SHOW_RESULT=True):
        ## target_uid:  User ID to get result
        ## threshold:   the lowerbound that the rating should be higher
        ## k:           to get number of relevant and recommended items in top k

        if target_uid:
            target_uid = str(target_uid)

        kf = KFold(n_splits=num_of_testset)

        final_precision = []
        final_recalls = []

        for trainset, testset in kf.split(self.data):
            self.algo.fit(trainset)
            predictions = self.algo.test(testset)
            '''Return precision and recall at k metrics for each user.'''
            # First map the predictions to each user.
            user_est_true = defaultdict(list)
            for uid, _, true_r, est, _ in predictions:
                user_est_true[uid].append((est, true_r))

            precisions = dict()
            recalls = dict()
            for uid, user_ratings in user_est_true.items():
                # Sort user ratings by estimated value
                user_ratings.sort(key=lambda x: x[0], reverse=True)

                # Number of relevant items
                n_rel = sum(
                    (true_r >= threshold) for (_, true_r) in user_ratings)

                # Number of recommended items in top k
                n_rec_k = sum(
                    (est >= threshold) for (est, _) in user_ratings[:k])

                # Number of relevant and recommended items in top k
                n_rel_and_rec_k = sum(
                    ((true_r >= threshold) and (est >= threshold))
                    for (est, true_r) in user_ratings[:k])

                # Precision@K: Proportion of recommended items that are relevant
                precisions[
                    uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

                # Recall@K: Proportion of relevant items that are recommended
                recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

            if SHOW_RESULT:
                print('Relevant: ' + str(
                    sum(prec
                        for prec in precisions.values()) / len(precisions)))
                print('Recommended: ' +
                      str(sum(rec for rec in recalls.values()) / len(recalls)))

            final_precision.append(precisions[target_uid])
            final_recalls.append(recalls[target_uid])

        if SHOW_RESULT:
            print(final_precision, final_recalls)

        return final_precision, final_recalls

    def read_item_names(self,
                        file_name=get_dataset_dir() +
                        '/ml-100k/ml-100k/u.item'):
        """Read the u.item file from MovieLens 100-k dataset and return two
        mappings to convert raw ids into movie names and movie names into raw ids.
        """

        rid_to_name = {}
        name_to_rid = {}
        with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
            for line in f:
                line = line.split('|')
                rid_to_name[line[0]] = line[1]
                name_to_rid[line[1]] = line[0]

        return rid_to_name, name_to_rid

    def get_k_neighbors(self, name='Toy Story (1995)', k=10, SHOW_RESULT=True):
        ###########################################
        ## You need to use algorithm='knn_baseline' at the beginning
        ###########################################
        if self.algorithm != 'knn_baseline':
            self.__init__(data=self.data,
                          algorithm='knn_baseline',
                          testset_percent=0)

        ###########################################
        ###########################################
        ## Read the mappings raw id <-> movie name
        rid_to_name, name_to_rid = self.read_item_names()

        ##
        input_raw_id = name_to_rid[name]
        input_inner_id = self.algo.trainset.to_inner_iid(input_raw_id)

        ## Retrieve inner ids of the nearest neighbors of Toy Story.
        input_neighbors = self.algo.get_neighbors(input_inner_id, k=k)

        ## Convert inner ids of the neighbors into names.
        input_neighbors = (self.algo.trainset.to_raw_iid(inner_id)
                           for inner_id in input_neighbors)
        input_neighbors = (rid_to_name[rid] for rid in input_neighbors)

        ## Show result
        if SHOW_RESULT:
            print('\nThe ' + str(k) + ' nearest neighbors of "' + name +
                  '" are:')

            for neighbor in input_neighbors:
                print(neighbor)

        return input_neighbors
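
A minimal usage sketch for the class above, on the builtin MovieLens 100k set:

from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
rec = recsysBase(data, algorithm='knn_baseline')
rec.get_k_neighbors(name='Toy Story (1995)', k=5)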
Example 13
    We are setting the minimum number of neighbours (min_k) to 1 and the maximum number of neighbours (k) to 40.
    We train the model on the train set '''

algo2 = KNNBasic(sim_options=sim_options, k=40, min_k=1)
algo2.fit(trainset)

predictions2 = algo2.test(testset)
print("RMSE for KNNBasic:", accuracy.rmse(predictions2, verbose=True))

# In[ ]:
''' We build the model by making use of KNNBaseline, which is a collaborative filtering based algorithm.
    We are setting the minimum number of neighbours (min_k) to 1 and the maximum number of neighbours (k) to 40.
    We train the model on the train set '''

algo3 = KNNBaseline(sim_options=sim_options, k=40, min_k=1)
algo3.fit(trainset)

predictions3 = algo3.test(testset)
print("RMSE for KNNBaseline:", accuracy.rmse(predictions3, verbose=True))

# In[ ]:
''' We build the model by making use of KNNWithZScore, which is a collaborative filtering based algorithm.
    We are setting the minimum number of neighbours (min_k) to 1 and the maximum number of neighbours (k) to 40.
    We train the model on the train set '''

algo4 = KNNWithZScore(sim_options=sim_options, k=40, min_k=1)
algo4.fit(trainset)

predictions4 = algo4.test(testset)
print("RMSE for KNNBasic:", accuracy.rmse(predictions4, verbose=True))
Example 14
import pandas as pd
import numpy as np
from tqdm import tqdm
from surprise import KNNWithMeans, KNNBasic, KNNWithZScore, KNNBaseline
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from scipy.spatial.distance import cityblock, cosine, euclidean, hamming, jaccard, rogerstanimoto
data = Dataset.load_builtin('ml-1m')

trainset, testset = train_test_split(data, test_size=.15)

algo = KNNBaseline(k=50,
                   min_k=1,
                   sim_options={
                       'name': 'pearson_baseline',
                       'user_based': True
                   })
algo.fit(trainset)

test_pred = algo.test(testset)

print('accuracy', accuracy.rmse(test_pred, verbose=True))
# Raw ids in the builtin ml-1m dataset are strings of MovieLens ids;
# '2959' is Fight Club (1999). Passing unknown raw ids (e.g. a title)
# silently falls back to the default prediction.
print('predict', algo.predict(uid='2', iid='2959').est)
Example 15
pred = sv.test(testset)
odf = pd.DataFrame(pred, columns=['uid', 'iid', 'rui', 'est', 'details'])
odf['err'] = abs(odf.est - odf.rui)

print("\n***** SVD Model Prediction Result via model file for Two record*****")
accuracy.rmse(pred, verbose=True)
accuracy.mae(test_pred, verbose=True)
print(odf.head())

# Part-4 - Train the model using KNNBaseline item-item similarity

sim_options = {'name': 'pearson_baseline', 'user_based': False}
simsAlgo = KNNBaseline(sim_options=sim_options)
simsAlgo.fit(trainset)

test_pred = simsAlgo.test(testset)
df = pd.DataFrame(test_pred, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['err'] = abs(df.est - df.rui)
print(
    "****************KNNBaseline item-item similarity: Accuracy Score *****************"
)
accuracy.rmse(test_pred, verbose=True)
accuracy.mae(test_pred, verbose=True)
print(df.head())

# Part-5 - Train the model using KNNBaseline User-User similarity and get the Top-10 movies predictions for each user

sim_options = {'name': 'pearson_baseline', 'user_based': True}
simsAlgo = KNNBaseline(sim_options=sim_options)
simsAlgo.fit(trainset)
Example 16
data = Dataset.load_from_file(file_path=custom_dataset_path, reader=reader)
trainingSet = data.build_full_trainset()

sim_options = {
    'name': 'pearson_baseline',
    'shrinkage': 0  # no shrinkage
}

knn = KNNBaseline(sim_options=sim_options)

knn.fit(trainingSet)


testSet = trainingSet.build_anti_testset()
predictions = knn.test(testSet)


from collections import defaultdict
def get_top3_recommendations(predictions, topN=10):
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))

    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_recs[uid] = user_ratings[:topN]
    return top_recs
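
Usage with the predictions computed above:

top_recs = get_top3_recommendations(predictions, topN=3)
for uid, user_ratings in top_recs.items():
    print(uid, [iid for (iid, _) in user_ratings])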


import os, io
Example 17
    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


reader = Reader(line_format='user item rating',
                sep=',',
                skip_lines=1,
                rating_scale=(0, 5))
data = Dataset.load_from_file('base.csv', reader=reader)
trainset = data.build_full_trainset()

# Train the algorithm to compute the similarities between users
sim_options = {'name': 'pearson_baseline'}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()

predictions = algo.test(testset)

top_n = get_top_n(predictions, n=5)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])
Example 18
def collab_recommendations(user_id,
                           df1,
                           ratings,
                           movieIds,
                           movies_ratings,
                           keep_movies1,
                           df2,
                           keep_movies2,
                           content_recommendation_system=False,
                           collab_recommendation_system=False,
                           top_n=10,
                           precision=False):

    # generate recommendations on train/test set
    if precision:
        test_ratings = df2.copy()
        # set parameters for KNN model
        user_based = {
            'name': 'pearson_baseline',
            'shrinkage': 0  # no shrinkage
        }
        collab_ratings = ratings[['userId', 'movieId', 'rating']]
        # set scale between min and max rating
        min_rat = collab_ratings.rating.min()
        max_rat = collab_ratings.rating.max()
        reader = Reader(rating_scale=(min_rat, max_rat))
        # fit on train set
        data = Dataset.load_from_df(collab_ratings, reader)
        trainset = data.build_full_trainset()
        algo = KNNBaseline(sim_options=user_based)
        algo.fit(trainset)

        # predict on test set
        test_ratings = test_ratings[['userId', 'movieId', 'rating']]
        testset = [tuple(x) for x in test_ratings.to_numpy()]
        predictions = algo.test(testset)

        # return predictions on test set
        collab_predictions = pd.DataFrame(predictions)
        collab_predictions = collab_predictions[['uid', 'iid', 'est']]
        collab_predictions = collab_predictions.rename(columns={
            'est': 'prediction',
            'uid': 'userId',
            'iid': 'movieId'
        })[['userId', 'movieId', 'prediction']]
        collab_predictions[['userId', 'movieId'
                            ]] = collab_predictions[['userId',
                                                     'movieId']].astype(int)

    # use precomputed
    else:
        collab_predictions = df2.copy()

    # get recommendations from collab filtering model
    collab_rec = collab_predictions[collab_predictions.userId == user_id]
    # merge with movie ratings + sort on prediction and secondarily on weighted average of ratings
    collab_rec = pd.merge(collab_rec, movies_ratings, on='movieId')
    collab_rec = collab_rec.sort_values(['prediction', 'weighted_avg'],
                                        ascending=[False, True])

    return collab_rec
Example 19
def three_ensemble_predict(m1_preds, m2_preds, m3_preds, w_1, w_2, w_3):
    final_preds = []
    for x, y, z in zip(m1_preds, m2_preds, m3_preds):
        assert x[0] == y[0] == z[0]
        assert x[1] == y[1] == z[1]

        # avg_pred = (x[3] + y[3] + z[3]) / 3.0
        avg_pred = (w_1 * x[3]) + (w_2 * y[3]) + (w_3 * z[3])
        final_preds.append(Prediction(x[0], x[1], x[2], avg_pred, x[4]))
    return final_preds
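
two_ensemble_predict, used below, is not shown in this excerpt; a two-model analogue of the function above would look like this:

def two_ensemble_predict(m1_preds, m2_preds, w_1, w_2):
    final_preds = []
    for x, y in zip(m1_preds, m2_preds):
        assert x[0] == y[0]  # same user
        assert x[1] == y[1]  # same item
        # Weighted blend of the two estimates.
        blended = (w_1 * x[3]) + (w_2 * y[3])
        final_preds.append(Prediction(x[0], x[1], x[2], blended, x[4]))
    return final_preds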


# Compute biased accuracy on A
train_svd_preds = final_svd_algo.test(trainset.build_testset())
train_knn_preds = item_knn_algo.test(trainset.build_testset())

train_preds = two_ensemble_predict(train_svd_preds, train_knn_preds, opt_w_svd,
                                   opt_w_knn)

train_rmse = accuracy.rmse(train_preds)
train_mae = accuracy.mae(train_preds)
print('Biased RMSE on training set: {}'.format(train_rmse))
print('Biased MAE on training set: {}'.format(train_mae))

# Compute unbiased accuracy on B
testset = data.construct_testset(test_raw_ratings)  # testset is now set B

test_svd_preds = final_svd_algo.test(testset)
test_knn_preds = item_knn_algo.test(testset)
Example 20
def collaborative_filtering(raw_uid):
    # To read the data from a txt file
    # TODO: To modify the file path of the data set
    # =============== Data preprocessing ===========================
    # Dump all the data from the database to a file
    # dir_data = '/www/wwwroot/music_recommender/page/cf_recommendation/cf_data'
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # Database operations
    # Open the database connection
    db = pymysql.connect(host="localhost",
                         user="root",
                         password="password",
                         database="music_recommender",
                         charset='utf8')

    # Create a cursor object with the cursor() method
    cursor = db.cursor()

    sql = """SELECT uid, song_id, rating
              FROM user_rating
               WHERE 1"""
    cursor.execute(sql)
    results = cursor.fetchall()
    with open(file_path, "w+") as data_f:
        # print(data_f)
        # exit()
        for result in results:
            uid, song_id, rating = result

            data_f.writelines("{}\t{}\t{}\n".format(uid, song_id, rating))

    if not os.path.exists(file_path):
        raise IOError("Dataset file does not exist!")
    # file_path = ""

    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)

    # Build the training set
    trainset = data.build_full_trainset()
    # print(trainset)
    # exit()
    # Baselineonly
    bsl_options = {
        'method': 'sgd',
        'learning_rate': 0.0005,
    }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)

    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=5)

    # KNNBasic
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)

    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)
    top_n_knnbasic = get_top_n(predictions, n=5)

    # KNNBaseline
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)

    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=5)
    # l = list(top_n_baselineonly)[0]
    # a = list()
    # a[l] = top_n_baselineonly[l]
    # print(a)
    # print(l,'predictions == ')
    print({raw_uid: top_n_baselineonly[raw_uid]})
    print({raw_uid: top_n_knnbasic[raw_uid]})
    print({raw_uid: top_n_knnbaseline[raw_uid]})