Ejemplo n.º 1
0
    def recommender_random(self, train_file, test_file, output):
        """Fit a NormalPredictor baseline and print its RMSE/MAE on the test split.

        Args:
            train_file: Forwarded to ``prepare_datasets`` as its first argument.
            test_file: Forwarded to ``prepare_datasets`` as its second argument.
            output: Not used by this method.

        Returns:
            The fitted ``NormalPredictor`` instance.
        """
        # prepare_datasets returns four values; only the first two are needed.
        trainset, testset, _train_dataset, _test_dataset = prepare_datasets(
            train_file, test_file)

        algo_random = NormalPredictor()
        algo_random.fit(trainset)
        predictions_random = algo_random.test(testset, verbose=False)

        # RMSE is computed before MAE, then both go into a single output line.
        rmse_text = str(rmse(predictions_random, verbose=False))
        mae_text = str(mae(predictions_random, verbose=False))
        print('RANDOM: ' + ' RMSE ' + rmse_text + ' MAE ' + mae_text)

        return algo_random
Ejemplo n.º 2
0
class NormalPredictorX(BaseSurpriseSTLEstimator, ABC):
    """Estimator wrapper that delegates fitting and prediction to a NormalPredictor."""

    def __init__(self, name="Normal Prediction"):
        """Initialise the base estimator under *name* and create the wrapped model."""
        super().__init__(name, 'non_feature_based')
        self.model = NormalPredictor()

    def _fit(self, x):
        """Fit the wrapped model on *x*; nothing is returned."""
        self.model.fit(x)

    def _predict(self, x):
        """Return the result of the wrapped model's ``test`` call on *x*."""
        predictions = self.model.test(x)
        return predictions
def normal_predictor(train, test, ids, Xtest, Xids):
    """
    Fit a NormalPredictor on the trainset and collect its predictions.

    Prints the training and test RMSE, then appends this model's predictions to
    the blending accumulators Xtest and Xids (both are modified in place).

    Arguments : train, the trainset
                test, the testset
                ids, unknown ratings; ids[0][i] and ids[1][i] are converted to
                     strings and passed as the two ids of the i-th prediction
                Xtest, predicted ratings for testset, to be used for final blending
                Xids, predicted ratings for unknown ratings, to be used for final blending
    Returns   : (test RMSE, Xtest, Xids, testset estimates, unknown-rating estimates)
    """
    print('Normal Predictor')
    model = NormalPredictor()
    model.fit(train)

    # RMSE on the same ratings the model was fitted on
    train_predictions = model.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(train_predictions, verbose=False))

    # RMSE on the testset
    test_predictions = model.test(test)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    # Estimated ratings for the testset, as a float array
    preds_test = np.array([p.est for p in test_predictions], dtype=float)

    # Estimated ratings for the unknown (first id, second id) pairs
    preds_ids = [
        model.predict(str(ids[0][k]), str(ids[1][k])).est
        for k in range(len(ids[0]))
    ]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
Ejemplo n.º 4
0
def run_NORMPRED(x_train, x_test, k):
    """Fit a NormalPredictor on *x_train* and print APK, precision and recall at *k*.

    Both frames must provide the columns ``userId``, ``movieId`` and ``rating``.
    Ratings are read with a 1-5 rating scale. Nothing is returned; the three
    metrics are only printed.
    """
    columns = ['userId', 'movieId', 'rating']
    reader = Reader(rating_scale=(1, 5))
    train_data = Dataset.load_from_df(x_train[columns], reader)
    test_data = Dataset.load_from_df(x_test[columns], reader)

    trainset = train_data.build_full_trainset()
    # The test frame is turned into a trainset first so that build_testset()
    # can produce the testset handed to algo.test().
    testset = test_data.build_full_trainset().build_testset()

    model = NormalPredictor()
    model.fit(trainset)
    predictions = model.test(testset)

    rec = format_baselines(predictions)
    seen = format_baselines_apk(predictions, x_test)
    print(f'APK: {yallah(seen, k)}')

    precisions, recalls = precision_recall_at_k(rec, k)
    mean_precision = sum(precisions.values()) / len(precisions)
    print(f'|NORMAL PREDICTOR : Precision| = {mean_precision}')
    mean_recall = sum(recalls.values()) / len(recalls)
    print(f'|NORMAL PREDICTOR : Recall| = {mean_recall}')
Ejemplo n.º 5
0
from surprise import accuracy
from surprise.model_selection import KFold

# Load the ratings file
file_path = 'E:/python/machina/kaggle_practice/week4/data/ratings.csv'
reader = Reader(
    line_format='user item rating timestamp',
    sep=',',
    skip_lines=1,
)
data = Dataset.load_from_file(file_path, reader=reader)
train_set = data.build_full_trainset()

# SGD parameters (reference note):
#     reg: regularization term of the cost function, default 0.02.
#     learning_rate: learning rate, default 0.005.
#     n_epochs: number of iterations, default 20.

# NormalPredictor
# NOTE(review): bsl_options is defined here but never passed to anything below.
bsl_options = {'method': 'sgd', 'n_epochs': 5}
algo = NormalPredictor()

# K-fold cross-validation iterator, K=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

# Single prediction using the model as fitted in the last fold
uid = "196"
iid = "302"
pred = algo.predict(uid, iid, r_ui=4, verbose=True)
print(pred)
Ejemplo n.º 6
0
from surprise.model_selection import train_test_split, GridSearchCV, KFold
import random
import pandas as pd
import numpy as np
# Seed NumPy's and Python's global random generators
np.random.seed(0)
random.seed(0)
pd.set_option('display.max_columns', 500)

data = GetBookData(density_filter=False)
trainset, testset = train_test_split(data, test_size=0.2)
results = {}
top_n = {}

# --- NormalPredictor: fit, predict, and collect metrics ---
norm = NormalPredictor()
norm.fit(trainset)
norm_pred = norm.test(testset)

rmse = accuracy.rmse(norm_pred)
precisions, recalls = precision_recall_at_k(norm_pred, k=10, threshold=4)
# Precision and recall are averaged over all keys of the returned dicts
avg_precision = sum(precisions.values()) / len(precisions)
avg_recall = sum(recalls.values()) / len(recalls)
metrics = dict(rmse=rmse,
               avg_precision=avg_precision,
               avg_recall=avg_recall)
results['NormalPredictor'] = metrics

top_n['NormalPredictor'] = get_top_n(norm_pred, n=10)


# --- BaselineOnly: grid search over the baseline estimation method ---
param_grid = {'bsl_options': {'method': ['als', 'sgd']}}
gs = GridSearchCV(BaselineOnly, param_grid, measures=['rmse'], cv=5)
gs.fit(data)
Ejemplo n.º 7
0
class ExplicitModels:
    """Train, query and evaluate one explicit-feedback recommender model.

    The model is chosen by name: 'KNN' (cosine KNNBasic), 'SVD', 'SVD++' or
    'Random' (NormalPredictor). Any other name leaves ``self.model`` unset.
    """

    def __init__(self, df, algo='KNN', user_based=False):
        """Wrap *df* in an EvaluationData object and build the selected model.

        :param df: ratings dataframe passed to ``Dataset.load_from_df``
        :param algo: model name, see the class docstring
        :param user_based: forwarded as ``sim_options['user_based']`` for 'KNN'
        """
        self.df = df
        self.algo = algo
        self.user_based = user_based

        reader = Reader(line_format='user item rating')
        dataset = Dataset.load_from_df(df=self.df, reader=reader)
        self.eval_data = EvaluationData(dataset)

        if self.algo == 'KNN':
            self.model = KNNBasic(sim_options={
                'name': 'cosine',
                'user_based': self.user_based,
            })
        elif self.algo == 'SVD':
            self.model = SVD()
        elif self.algo == 'SVD++':
            self.model = SVDpp()
        elif self.algo == 'Random':
            self.model = NormalPredictor()

    def predict_all(self, n=10):
        """Fit on the full trainset and return the top-*n* over the full anti-testset."""
        full_trainset = self.eval_data.GetFullTrainSet()
        full_anti_testset = self.eval_data.GetFullAntiTestSet()
        self.model.fit(full_trainset)
        all_predictions = self.model.test(full_anti_testset)
        return RecommenderMetrics.GetTopN(all_predictions, n=n)

    def predict_user(self, n=10, ruid=85):
        """Return the *n* best ``(item_id, estimated_rating)`` pairs for user *ruid*.

        The model is re-fitted on the full trainset on every call.
        """
        full_trainset = self.eval_data.GetFullTrainSet()
        user_anti_testset = self.eval_data.GetAntiTestSetForUser(ruid)
        self.model.fit(full_trainset)

        # Each prediction unpacks into five fields; only the item id and the
        # estimated rating are kept.
        scored = [(item_id, estimate)
                  for _, item_id, _, estimate, _
                  in self.model.test(user_anti_testset)]
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
        return ranked[:n]

    def user_seen(self, n=10, ruid=0):
        """Return up to *n* ``(raw_item_id, rating)`` pairs of user *ruid*, best rated first."""
        trainset = self.eval_data.GetFullTrainSet()
        inner_uid = trainset.to_inner_uid(ruid)
        rated = [(trainset.to_raw_iid(entry[0]), entry[1])
                 for entry in trainset.ur[inner_uid]]
        rated.sort(key=lambda pair: pair[1], reverse=True)
        return rated[:n]

    def eval(self, n=10, doTopN=True):
        """Return a dict with 'RMSE' and 'MAE', plus 'HitRate' when *doTopN* is true.

        RMSE/MAE come from the train/test split. HitRate is computed from the
        top-*n* lists obtained after re-fitting on the LOOCV trainset.
        """
        train = self.eval_data.GetTrainSet()
        test = self.eval_data.GetTestSet()
        self.model.fit(train)
        predictions = self.model.test(test)

        metrics = {
            'RMSE': RecommenderMetrics.RMSE(predictions),
            'MAE': RecommenderMetrics.MAE(predictions),
        }

        if doTopN:
            loocv_train = self.eval_data.GetLOOCVTrainSet()
            loocv_test = self.eval_data.GetLOOCVTestSet()
            loocv_anti_test = self.eval_data.GetLOOCVAntiTestSet()
            self.model.fit(loocv_train)
            predictions = self.model.test(loocv_anti_test)
            top_n = RecommenderMetrics.GetTopN(predictions, n=n)
            metrics['HitRate'] = RecommenderMetrics.HitRate(top_n, loocv_test)

        return metrics
Ejemplo n.º 8
0
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader

# Dummy algo
algo = NormalPredictor()

# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {
    'itemID': [1, 1, 1, 2, 2],
    'userID': [9, 32, 2, 45, 'user_foo'],
    'rating': [3, 2, 4, 3, 1]
}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

# Dataset.split()/folds() and algo.train() belong to the legacy Surprise API.
# A cross-validation iterator together with algo.fit() is the current
# equivalent and matches the KFold usage elsewhere in this file.
from surprise.model_selection import KFold

kf = KFold(n_splits=2)  # two folds, as with the former data.split(2)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    algo.test(testset)
Ejemplo n.º 9
0
class CFSurprise(object):
    """Wrapper that builds and drives one Surprise collaborative-filtering model.

    The constructor selects and configures the model; the remaining methods
    forward to the wrapped ``self.model`` (fit, test, predict, neighbors,
    metrics, persistence, trainset introspection).

    Attributes set by the constructor:
        model: the Surprise algorithm instance (left unset when ``module_type``
            is not one of the supported names).
        sim_options / bsl_options: the option dicts derived from the arguments.
        sim, bu, bi: ``None`` until :meth:`fit` is called.
    """

    def __init__(self, module_type, baseline_type, cf_type, similar, sim_type,
                 params):
        """
        :param module_type: algorithm to build: "KNNmeans", "KNNzscore",
            "KNNbase", "KNNbasic", "SVD", "SVDpp", "NMF", "SlopeOne", "cc",
            "BaselineOnly" or "Np"
        :param baseline_type: "ALS", "SGD" or "default" (empty bsl_options)
        :param cf_type: None, "base_user" or "base_item"; only "base_item"
            selects item-based similarities
        :param similar: similarity name (upper or lower case) or None ("msd")
        :param sim_type: None or "default"; "default" passes an empty
            sim_options dict, so cf_type/similar are not forwarded
        :param params: dict of model hyper-parameters; may contain the nested
            dicts "bsl_options" and "sim_options". It is not modified.
        """
        assert baseline_type in {"ALS", "SGD", "default"}
        assert cf_type in {None, "base_user", "base_item"}
        assert similar in {
            None, "COSINE", "cosine", "MSD", "msd", "PEARSON", "pearson",
            "PEARSON_BASELINE", "pearson_baseline", "JACCARD", "jaccard",
            "EUCLIDEAN", "euclidean"
        }
        assert sim_type in {None, "default"}
        params = {} if params is None else params
        self.module_type = module_type
        self.baseline_type = baseline_type
        self.cf_type = cf_type
        self.similar = similar
        self.sim_type = sim_type
        self.bu = None
        self.bi = None
        self.sim = None

        # A missing "bsl_options" entry falls back to the per-method defaults
        # instead of raising KeyError.
        bsl_params = params.get("bsl_options") or {}
        if self.baseline_type == "ALS":
            bsl_options = {
                'method': bsl_params.get("method", 'als'),
                'n_epochs': bsl_params.get("n_epochs", 10),
                'reg_u': bsl_params.get("reg_u", 15),
                'reg_i': bsl_params.get("reg_i", 10)
            }
        elif self.baseline_type == "SGD":
            bsl_options = {
                'method': bsl_params.get("method", 'sgd'),
                'n_epochs': bsl_params.get("n_epochs", 20),
                'reg': bsl_params.get("reg", 0.02),
                'learning_rate': bsl_params.get("learning_rate", 0.005)
            }
        else:  # defaults
            bsl_options = {}

        # Work on a copy: caller-supplied min_support/shrinkage are kept and
        # the caller's params dict is left untouched.
        sim_params = dict(params.get("sim_options") or {})
        # Item-based filtering means user_based=False; "base_user" and None
        # both mean user-based.
        sim_params["user_based"] = self.cf_type != "base_item"
        # The accepted names differ only by case; None falls back to "msd".
        sim_params["name"] = (self.similar or "msd").lower()

        if self.sim_type == "default":
            sim_options = {}
        else:
            # sim_options keys:
            #   'name': name of the similarity to use, as defined in the
            #       similarities module.
            #   'user_based': whether similarities are computed between users
            #       or between items; this has a large impact on the
            #       performance of the prediction algorithm.
            #   'min_support': minimum number of common items (user_based
            #       True) or common users (user_based False) for the
            #       similarity not to be zero, i.e. if |Iuv| < min_support
            #       then sim(u, v) = 0. The same holds for items.
            #   'shrinkage': shrinkage parameter (left undocumented by the
            #       original author).
            sim_options = {
                "name": sim_params["name"],
                "user_based": sim_params["user_based"],
                "min_support": sim_params.get("min_support", 5),
                "shrinkage": sim_params.get("shrinkage", 100)
            }
        self.sim_options = sim_options
        self.bsl_options = bsl_options

        if self.module_type == "KNNmeans":
            # KNNBasic that additionally takes the user or item mean into account
            self.model = KNNWithMeans(k=params.get("k", 40),
                                      min_k=params.get("min_k", 1),
                                      sim_options=sim_options,
                                      verbose=params.get("verbose", True))
        elif self.module_type == "KNNzscore":
            # Introduces the z-score idea
            self.model = KNNWithZScore(k=params.get("k", 40),
                                       min_k=params.get("min_k", 1),
                                       sim_options=sim_options,
                                       verbose=params.get("verbose", True))
        elif self.module_type == "KNNbase":
            # Differs from KNNWithMeans in using a bias (baseline) instead of the mean
            self.model = KNNBaseline(
                k=params.get("k", 40),
                min_k=params.get("min_k", 1),  # minimum number of neighbors
                sim_options=sim_options,
                bsl_options=bsl_options,
                verbose=params.get("verbose", True))
        elif self.module_type == "KNNbasic":
            # The most basic KNN algorithm; user-based or item-based
            self.model = KNNBasic(k=params.get("k", 40),
                                  min_k=params.get("min_k", 1),
                                  sim_options=sim_options,
                                  verbose=params.get("verbose", True))
        elif self.module_type == "SVD":
            # SVD parameters (author's notes):
            #   n_factors - number of factors. Default 100.
            #   n_epochs - number of SGD iterations. Default 20.
            #   biased (bool) - whether to use baselines (biases). Default True.
            #   init_mean - mean of the normal distribution used to initialise
            #       the factor vectors. Default 0.
            #   init_std_dev - standard deviation of that distribution. Default 0.1.
            #   lr_all - learning rate for all parameters. Default 0.005.
            #   reg_all - regularization term for all parameters. Default 0.02.
            #   lr_bu / lr_bi / lr_pu / lr_qi - learning rate for bu / bi / pu /
            #       qi; takes precedence over lr_all when set. Default None.
            #   reg_bu / reg_bi / reg_pu / reg_qi - regularization term for
            #       bu / bi / pu / qi; takes precedence over reg_all when set.
            #       Default None.
            #   random_state (int, numpy RandomState instance or None) - RNG
            #       used for initialisation. An int seeds a new RNG, which gives
            #       the same initialisation across several fit() calls; a
            #       RandomState instance is used as is; None uses numpy's
            #       current RNG. Default None.
            #   verbose - if True, print the current epoch. Default False.
            self.model = SVD(n_factors=params.get("n_factors", 100),
                             n_epochs=params.get("n_epochs", 20),
                             init_mean=params.get("init_mean", 0),
                             init_std_dev=params.get("init_std_dev", 0.1),
                             lr_all=params.get("lr_all", 0.005),
                             reg_all=params.get("reg_all", 0.02),
                             lr_bu=params.get("lr_bu", None),
                             lr_bi=params.get("lr_bi", None),
                             lr_pu=params.get("lr_pu", None),
                             lr_qi=params.get("lr_qi", None),
                             reg_bu=params.get("reg_bu", None),
                             reg_bi=params.get("reg_bi", None),
                             reg_pu=params.get("reg_pu", None),
                             reg_qi=params.get("reg_qi", None),
                             random_state=params.get("random_state", None),
                             verbose=params.get("verbose", False))
        elif self.module_type == "SVDpp":
            # SVD++ parameters (author's notes). Same meaning as for SVD, with
            # these library defaults: n_factors 20, n_epochs 20, init_mean 0,
            # init_std_dev 0.1, lr_all 0.007, reg_all 0.02. The per-parameter
            # rates/terms lr_bu, lr_bi, lr_pu, lr_qi, lr_yj and reg_bu, reg_bi,
            # reg_pu, reg_qi, reg_yj default to None and take precedence over
            # lr_all / reg_all when set. random_state and verbose as for SVD.
            # Note that this wrapper passes its own defaults (100 factors,
            # lr_all 0.005) rather than the library ones.
            self.model = SVDpp(n_factors=params.get("n_factors", 100),
                               n_epochs=params.get("n_epochs", 20),
                               init_mean=params.get("init_mean", 0),
                               init_std_dev=params.get("init_std_dev", 0.1),
                               lr_all=params.get("lr_all", 0.005),
                               reg_all=params.get("reg_all", 0.02),
                               lr_bu=params.get("lr_bu", None),
                               lr_bi=params.get("lr_bi", None),
                               lr_pu=params.get("lr_pu", None),
                               lr_qi=params.get("lr_qi", None),
                               reg_bu=params.get("reg_bu", None),
                               reg_bi=params.get("reg_bi", None),
                               reg_pu=params.get("reg_pu", None),
                               reg_qi=params.get("reg_qi", None),
                               random_state=params.get("random_state", None),
                               verbose=params.get("verbose", False))
        elif self.module_type == "NMF":
            # Non-negative matrix factorization: the p and q matrices are both
            # required to be non-negative.
            # NMF parameters (author's notes):
            #   n_factors - number of factors. Library default 15.
            #   n_epochs - number of SGD iterations. Library default 50.
            #   biased (bool) - whether to use baselines (biases). Default False.
            #   reg_pu - regularization term for users. Default 0.06.
            #   reg_qi - regularization term for items. Default 0.06.
            #   reg_bu / reg_bi - regularization term for bu / bi; only
            #       relevant for the biased version. Default 0.02.
            #   lr_bu / lr_bi - learning rate for bu / bi; only relevant for
            #       the biased version. Default 0.005.
            #   init_low - lower bound for the random initialisation of the
            #       factors; must be >= 0 to keep factors non-negative. Default 0.
            #   init_high - upper bound for that initialisation. Default 1.
            #   random_state, verbose - as for SVD.
            # Only these keywords are forwarded: the SVD-style ones
            # (init_mean, init_std_dev, lr_all, reg_all, lr_pu, lr_qi) are not
            # part of the NMF parameter list above. n_factors/n_epochs keep
            # this wrapper's own defaults (100 / 20).
            self.model = NMF(n_factors=params.get("n_factors", 100),
                             n_epochs=params.get("n_epochs", 20),
                             biased=params.get("biased", False),
                             reg_pu=params.get("reg_pu", 0.06),
                             reg_qi=params.get("reg_qi", 0.06),
                             reg_bu=params.get("reg_bu", 0.02),
                             reg_bi=params.get("reg_bi", 0.02),
                             lr_bu=params.get("lr_bu", 0.005),
                             lr_bi=params.get("lr_bi", 0.005),
                             init_low=params.get("init_low", 0),
                             init_high=params.get("init_high", 1),
                             random_state=params.get("random_state", None),
                             verbose=params.get("verbose", False))
        elif self.module_type == "SlopeOne":
            # SlopeOne has no hyper-parameters, so nothing from params
            # (which also carries options meant for other models) is forwarded.
            self.model = SlopeOne()

        elif self.module_type == "cc":
            # Clustering-based collaborative filtering.
            # CoClustering parameters (author's notes):
            #   n_cltr_u (int) - number of user clusters. Default 3.
            #   n_cltr_i (int) - number of item clusters. Default 3.
            #   n_epochs (int) - number of iterations of the optimisation loop.
            #       Default 20.
            #   random_state, verbose - as for SVD.
            self.model = CoClustering(n_cltr_u=params.get("n_cltr_u", 3),
                                      n_cltr_i=params.get("n_cltr_i", 3),
                                      n_epochs=params.get("n_epochs", 20),
                                      random_state=params.get(
                                          "random_state", None),
                                      verbose=params.get("verbose", False))

        elif self.module_type == "BaselineOnly":
            # Does not take user preferences into account
            self.model = BaselineOnly(bsl_options=bsl_options, verbose=True)

        elif self.module_type == "Np":
            # Random prediction: ratings are assumed to follow a normal
            # distribution and predictions are drawn from it.
            self.model = NormalPredictor()

    def fit(self, trainset):
        """Fit the model on *trainset*, then cache similarities and baselines.

        :return: self
        """
        self.model.fit(trainset=trainset)
        # Compute the similarity matrix; what is compared (users or items) and
        # how depends on the sim_options the algorithm was created with.
        self.sim = self.model.compute_similarities()
        # Compute the user and item baselines. Author's note: only applicable
        # to the Pearson-baseline similarity or the BaselineOnly algorithm.
        # Returns a tuple (user baselines, item baselines).
        self.bu, self.bi = self.model.compute_baselines()
        return self

    def test(self, testset, verson=False):
        """Return the model's predictions for *testset*.

        :param verson: forwarded as the model's ``verbose`` flag (the
            parameter name is kept for backward compatibility)
        """
        predictions = self.model.test(testset=testset, verbose=verson)
        return predictions

    def onekey_transform(self, trainset, testset, verbose=False):
        """Fit on *trainset* and return the predictions for *testset* in one call."""
        predictions = self.model.fit(trainset=trainset).test(testset=testset,
                                                             verbose=verbose)
        return predictions

    def predict(self, uid, iid, r_ui, verbose=False):
        """Predict the rating of raw user *uid* for raw item *iid*.

        Both ids may be given as ``str`` or ``int``; ints are converted to
        strings before being passed to the model.

        :param r_ui: the true rating, forwarded to the model
        :return: the model's prediction object
        """
        assert isinstance(uid, (str, int))
        # Ints must be accepted here as well, since they are converted below.
        assert isinstance(iid, (str, int))
        if isinstance(uid, int):
            uid = str(uid)
        if isinstance(iid, int):
            iid = str(iid)
        evluations = self.model.predict(uid=uid,
                                        iid=iid,
                                        r_ui=r_ui,
                                        verbose=verbose)
        return evluations

    def get_neighbors(self, iid, k):
        """
        :param iid:  inner id of a user or of an item, depending on sim_options
        :param k:  number of nearest neighbors to return
        :return: the result of the model's ``get_neighbors``
        """
        return self.model.get_neighbors(iid=iid, k=k)

    @staticmethod
    def metric(predictions, verbose=True, metric_type="rmse"):
        """Compute one accuracy metric ("mse", "fcp", "mae" or "rmse") on *predictions*."""
        assert metric_type in {"mse", "fcp", "mae", "rmse"}
        if metric_type == "mse":
            metric = accuracy.mse(predictions=predictions, verbose=verbose)
        elif metric_type == "fcp":
            metric = accuracy.fcp(predictions=predictions, verbose=verbose)
        elif metric_type == "mae":
            metric = accuracy.mae(predictions=predictions, verbose=verbose)
        else:
            metric = accuracy.rmse(predictions=predictions, verbose=verbose)
        return metric

    def _estimate(self, trainset, uid, iid, top_k=10):  # generally not used
        """Print the *top_k* users most similar to *uid* among those who rated *iid*.

        Requires :meth:`fit` to have been called (reads ``self.sim``).

        :param trainset: trainset the ids belong to
        :param uid:  inner user id
        :param iid:  inner item id
        :param top_k: number of neighbors to print
        :return: None
        :raises PredictionImpossible: if the user or the item is unknown
        """
        if not (trainset.knows_user(uid=uid) and trainset.knows_item(iid=iid)):
            raise PredictionImpossible('User and/or item is unkown.')
        # Similarity between u and every other user v who also rated item i
        neighbors = [(vid, self.sim[uid, vid])
                     for (vid, r) in trainset.ir[iid]]
        # Sort by similarity, descending
        neighbors = sorted(neighbors, key=lambda x: x[1], reverse=True)
        for v, sim_uv in neighbors[:top_k]:
            print('user {0:} with sim {1:1.2f}'.format(v, sim_uv))

    # # Recommendation list for a single id
    # def recommender_single(self, trainset, ids, top_k=10):
    #     """
    #     :param trainset:
    #     ur, dict of user rating lists (item_inner_id, rating), keyed by the user's inner_id
    #     ir, dict of item rating lists (user_inner_id, rating), keyed by the item's inner_id
    #     :param ids:  inner id to use
    #     :param top_k:
    #     :return:
    #     """
    #     if not trainset.knows_user(uid=ids):  # id is not a known user
    #         raise PredictionImpossible('User is unkown.')
    #     if not trainset.knows_item(iid=ids):   # id is not a known item
    #         raise PredictionImpossible("Iterm is unkown")
    #     neighbors = self.model.get_neighbors(iid=ids, k=top_k)  # fetch the neighbors
    #     innerid_2_rawid(trainset=trainset,inner_id=ids,data_type="")

    # def recommender(self, trainset, ids, top_k=10,verbose=True):
    #     # Iterate over every user and all items to build recommendation lists
    #     for inner_user_id in range(self._get_n_u_i(object_tytpe="user", verbose=False)):
    #         if verbose:
    #             print("开始处理用户:{}".format(inner_user_id))
    #             top_k_list =[]
    #             count = 0
    #             #

    def model_save(self, out_file, predictions=None, verbose=0):  # save the model
        """
        :param out_file: where to save
        :param predictions:  predictions to store alongside the model
        :param verbose: 0, 1
        :return: None; the wrapped model is what gets stored as ``algo``
        """
        dump(file_name=out_file,
             predictions=predictions,
             algo=self.model,
             verbose=verbose)

    # Number of users or number of items in the model's trainset
    def _get_n_u_i(self, object_tytpe, verbose=True):
        """Return the user count ("user") or item count ("item") of the fitted trainset."""
        assert object_tytpe in {"user", "item"}
        if object_tytpe == "user":
            n_u_i = self.model.trainset.n_users
            if verbose:
                print("总的用户数是:%d" % n_u_i)
        else:
            n_u_i = self.model.trainset.n_items
            if verbose:
                # Item branch: report an item count, not a user count.
                print("总的物品数是:%d" % n_u_i)
        return n_u_i

    def _get_some_arrtibute(self, attribute_type):
        """Return one attribute of the fitted trainset, selected by name.

        :param attribute_type: "n_ratings", "rating_scale", "global_mean",
            "all_items", "all_user" or "all_ratings"
        :raises ValueError: for an unknown name (reachable only when
            assertions are disabled)
        """
        assert attribute_type in {
            "n_ratings", "rating_scale", "global_mean", "all_items",
            "all_user", "all_ratings"
        }
        if attribute_type == "n_ratings":
            res = self.model.trainset.n_ratings
        elif attribute_type == "rating_scale":
            res = self.model.trainset.rating_scale
        elif attribute_type == "global_mean":
            res = self.model.trainset.global_mean
        elif attribute_type == "all_items":
            res = self.model.trainset.all_items()  # inner ids of all items
        elif attribute_type == "all_user":
            res = self.model.trainset.all_users()  # inner ids of all users
        elif attribute_type == "all_ratings":
            res = self.model.trainset.all_ratings(
            )  # (uid, iid, rating) tuples
        else:
            raise ValueError("未知属性,请输出正确的属性")
        return res
Ejemplo n.º 10
0
class RandomRecommender(AbstractRecommender):

    """ Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal. """

    def __init__(self, ratings_file_path=None, separator=None):
        """
        Create the underlying NormalPredictor and load the rating data model.
        :param ratings_file_path: Forwarded to RatingDataModel.
        :param separator: Forwarded to RatingDataModel and stored on the instance.
        """
        # NOTE(review): super(AbstractRecommender, self) starts the lookup
        # *after* AbstractRecommender in the MRO, so AbstractRecommender.__init__
        # itself is not called - confirm this is intended.
        super(AbstractRecommender, self).__init__()
        # Create the recommendation input_model and configure its input parameters:
        self.model = NormalPredictor()
        self.rating_data_model = RatingDataModel(ratings_file_path=ratings_file_path, separator=separator)
        self.separator = separator

    def recommend(self, user_id, how_many):

        """
        Recommends the best items for a specific user.
        Every unseen item is scored through estimate_preference (which re-fits
        the model on each call) and each score is printed.
        :param user_id: Id of the user to recommend.
        :param how_many: Number of items that we recommend to the specific user.
        :return: List of [item_id, preference] pairs, highest preference first, at most how_many long.
        """

        # Items not seen by a specific user.
        item_ids_not_seen_from_user = self.rating_data_model.get_item_ids_not_seen_from_user(user_id)

        list_recommend = []
        for item_id in item_ids_not_seen_from_user:
            preference = self.estimate_preference(user_id, item_id)
            list_recommend.append([item_id, preference])
            print(item_id, ', ', preference)

        list_recommend.sort(key=lambda x: x[1], reverse=True)

        return list_recommend[:how_many]

    def estimate_preference(self, user_id, item_id):

        """
        Estimate the preference value by a specific user.
        The model is re-fitted on the whole ratings dataframe on every call.
        :param user_id: Id of the user to recommend.
        :param item_id: Id of the item to recommend.
        :return: The estimated preference by the specific recommender, as a float.
        """

        # train file:
        df_ratings = self.rating_data_model.df_ratings
        # A reader is still needed but only the rating_scale param is required.
        reader = Reader(rating_scale=(self.rating_data_model.get_min_preference(), self.rating_data_model.get_max_preference()))
        train_data = Dataset(reader=reader)
        # The columns must correspond to user id, item id and ratings (in that order).
        raw_trainset = train_data.load_from_df(df_ratings[['user_id', 'item_id', 'rating']], reader)
        trainset = train_data.construct_trainset(raw_trainset.raw_ratings)

        # Train recommendation input_model:
        self.model.fit(trainset)

        # NOTE(review): the result of estimate() is indexed with [0], so it is
        # expected to be a sequence here - confirm against the Surprise
        # version this project runs on.
        return float(self.model.estimate(u=user_id, i=item_id)[0])

    def recommend_rival(self, n_folds, train_test_file_path, reader, recommendation_file_path):

        """
        Prepare the predictions to take them to RiVaL Toolkit.
        For each fold the model is fitted on the fold's train file, evaluated
        on its test file, and one "user<TAB>item<TAB>estimate" line per
        prediction is written to recs_<fold>.csv.
        :param n_folds: Number of folds.
        :param train_test_file_path: Path with train and input_test files.
        :param reader: Reader used to build the train and test datasets.
        :param recommendation_file_path: Path where the suitable files to run RiVaL Toolkit are saved.
        :return: None. The suitable files to run RiVaL Toolkit are saved.
        """

        for i in range(n_folds):
            print('Fold: ', i)

            timestart = time.time()
            # train file:
            train_file_name = train_test_file_path + 'train_bin_verified_sep_' + str(i) + '.csv'
            train_data = Dataset(reader=reader)
            raw_trainset = train_data.read_ratings(file_name=train_file_name)
            trainset = train_data.construct_trainset(raw_trainset)
            timeend = time.time()
            print('Train file loading time: ', (timeend - timestart), 'seconds')

            timestart = time.time()
            # Train recommendation input_model:
            self.model.fit(trainset)
            timeend = time.time()
            print('Training time: ', (timeend - timestart), 'seconds')

            # input_test file:
            timestart = time.time()
            test_file_name = train_test_file_path + 'test_bin_verified_sep_' + str(i) + '.csv'
            test_data = Dataset(reader=reader)
            raw_testset = test_data.read_ratings(file_name=test_file_name)
            testset = test_data.construct_testset(raw_testset)
            timeend = time.time()
            print('Load time of the input_test file: ', (timeend - timestart), 'seconds')

            # Predictions:
            timestart = time.time()
            predictions = self.model.test(testset)
            recs_file_name = recommendation_file_path + 'recs_' + str(i) + '.csv'
            # The context manager flushes and closes the fold's file even if a
            # write fails; previously one handle per fold was left open.
            with open(recs_file_name, 'w') as recs_file:
                for pred in predictions:
                    # pred fields used: 0 = user id, 1 = item id, 3 = estimated rating
                    user_id = pred[0]
                    item_id = pred[1]
                    rating_estimated = pred[3]
                    recs_file.write(user_id + "\t" + item_id + "\t" + str(rating_estimated) + '\n')
            timeend = time.time()
            print('Prediction time: ', (timeend - timestart), 'seconds')
        sum(rec for rec in recalls_combined.values()) / len(recalls_combined))

# Print the mean of each per-fold precision/recall list, algorithm by algorithm
for _title, _scores in (
        ("averaged_precision for SVD algorithm:", averaged_precision_SVD),
        ("averaged_recall for SVD algorithm:", averaged_recall_SVD),
        ("averaged_precision for KNN algorithm:", averaged_precision_KNN),
        ("averaged_recall for KNN algorithm:", averaged_recall_KNN),
        ("averaged_precision for combined algorithm:", averaged_precision_combined),
        ("averaged_recall for combined algorithm:", averaged_recall_combined),
):
    print(_title)
    print(sum(_scores) / len(_scores))

# We compare our results with a random predictor
algo_random = NormalPredictor()
algo_random.fit(trainset)
prediction_random = algo_random.test(testset)

# Compare RMSE, then MAE, of the different algorithms (same order as above)
_all_predictions = (predictions_SVD, predictions_KNN, predictions_combined,
                    prediction_random)
for _preds in _all_predictions:
    rmse(_preds)

for _preds in _all_predictions:
    mae(_preds)