コード例 #1
0
def knn_z(data, training, testing):
    '''
    Tune KNN with Z-score by grid search, refit it with the best parameters
    and evaluate it on the test data.

    Args:
        data(Dataset): the whole dataset divided into 5 folds (used for the grid search)
        training(Dataset): training dataset
        testing(Dataset): test dataset

    Returns:
        rmse: RMSE of KNN with Z-score with optimized parameters
        top_n: result of get_top_n(predictions, n=5) on the test predictions
    '''

    # candidate parameters
    knn_param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False],
        },
    }

    # optimize parameters
    knnz_grid_search = GridSearch(KNNWithZScore, knn_param_grid, measures=['RMSE'], verbose=False)
    knnz_grid_search.evaluate(data)
    param = knnz_grid_search.best_params['RMSE']
    print('KNNWithZScore:', param)

    # Fit the model using the optimized parameters. The similarity settings
    # have to travel inside the ``sim_options`` dict: Surprise ignores bare
    # ``name=`` / ``min_support=`` / ``user_based=`` keywords, which would
    # silently refit the model with the default similarity configuration.
    knnz = KNNWithZScore(k=param['k'], sim_options=dict(param['sim_options']))
    knnz.train(training)

    # evaluate the model using test data
    predictions = knnz.test(testing)
    rmse = accuracy.rmse(predictions, verbose=True)
    top_n = get_top_n(predictions, n=5)

    return rmse, top_n
コード例 #2
0
        def cal_KNNWithZScore(trainset, df):
            """Fit a user-based cosine KNNWithZScore model and save its per-row errors.

            Args:
                trainset: Surprise trainset the model is fitted on.
                df: DataFrame with ``user``, ``store`` and ``stars`` columns;
                    one prediction is made per row.

            Writes a CSV to ``save_file2`` (a name from the enclosing scope)
            with the columns user, item, r_ui, est and err = |est - r_ui|.
            """
            # Surprise reads the 'user_based' key (underscore); a hyphenated
            # key is never looked at.
            sim_options = {'name': 'cosine', 'user_based': True}
            algo_knnz = KNNWithZScore(k=40, min_k=1, sim_options=sim_options)
            algo_knnz.fit(trainset)

            users = []
            items = []
            real = []
            estimate = []
            # Walk the three columns in lockstep instead of slicing one-row frames.
            for uid, iid, r_ui in zip(df.user.values, df.store.values, df.stars.values):
                # Predict with the model fitted above, not with some other
                # ``algo`` that happens to exist in an outer scope.
                pred = algo_knnz.predict(uid, iid, r_ui, verbose=True)
                users.append(uid)
                items.append(iid)
                real.append(r_ui)
                estimate.append(pred.est)
            print("end")

            df5 = pd.DataFrame({
                'user': users,
                'item': items,
                'r_ui': real,
                'est': estimate,
            })
            df5['err'] = abs(df5.est - df5.r_ui)
            df5.to_csv(save_file2)
コード例 #3
0
ファイル: NDCG.py プロジェクト: doyun317/Recommendation_HW
    def CFZ(self):
        """Predict ratings with a user-based cosine KNNWithZScore model and score them.

        Fits the model on ``self.trainset``, predicts for the rows of
        ``self.data`` belonging to each user in ``self.list``, stores the
        predictions in ``self.df_est``, the distinct user ids in ``self.arr``
        and the result of ``self.Calculate_NDCG()`` in ``self.CFWZ_ndcg_``.
        """
        model = KNNWithZScore(
            k=40,
            min_k=1,
            sim_options={'name': 'cosine', 'user_based': True},
        )
        model.fit(self.trainset)

        user_ids = []
        item_ids = []
        true_ratings = np.array([])
        estimates = np.array([])

        for uid in self.list:
            user_rows = self.data[self.data.uid == uid]

            # NOTE(review): the loop stops one row short, so each user's last
            # row is never predicted - confirm whether that is intended.
            for pos in range(len(user_rows) - 1):
                row = user_rows[pos:pos + 1]
                pred = model.predict(uid, row.lid.values[0], row.rate.values[0],
                                     verbose=True)
                user_ids.append(int(pred.uid))
                item_ids.append(int(pred.iid))
                true_ratings = np.append(true_ratings, pred.r_ui)
                estimates = np.append(estimates, pred.est)

        self.df_est = pd.DataFrame({
            'uid': user_ids,
            'Iid': item_ids,
            'r_ui': true_ratings,
            'est': estimates,
        })
        self.arr = self.df_est['uid'].unique()

        self.CFWZ_ndcg_ = self.Calculate_NDCG()
コード例 #4
0
def randomize():
    """Build the catalogue of candidate algorithms and return one at random.

    Returns:
        A ``(label, algorithm)`` tuple picked uniformly from the catalogue:
        every kNN family combined with every item-based similarity setting,
        plus the non-kNN algorithms.
    """
    # Item-based similarity configurations shared by all kNN families.
    similarities = [
        ('Cosine', {'name': 'cosine', 'user_based': False}),
        ('MSD', {'name': 'msd', 'user_based': False}),
        ('Pearson', {'name': 'pearson', 'user_based': False}),
        ('Pearson B', {
            'name': 'pearson_baseline',
            'user_based': False,
            'shrinkage': 0,
        }),
    ]
    knn_families = [
        ('kNN Basic', KNNBasic),
        ('kNN Means', KNNWithMeans),
        ('kNN Z', KNNWithZScore),
        ('kNN Baseline', KNNBaseline),
    ]

    algorithms = [
        ('{} - {}'.format(family, sim_label),
         knn_cls(sim_options=sim_options, verbose=False))
        for family, knn_cls in knn_families
        for sim_label, sim_options in similarities
    ]
    algorithms += [
        ('SVD', SVD(verbose=False)),
        ('SVDpp', SVDpp(verbose=False)),
        ('Baseline Only', BaselineOnly(verbose=False)),
        ('CoClustering', CoClustering(verbose=False)),
        ('SlopeOne', SlopeOne()),
        ('NMF', NMF(verbose=False)),
    ]

    # randrange() excludes the upper bound. randint(0, len(algorithms)) can
    # return len(algorithms) itself and index one past the end of the list.
    return algorithms[random.randrange(len(algorithms))]
コード例 #5
0
def get_algo(algo_id):
    """Return the KNNWithZScore variant selected by ``algo_id``.

    Args:
        algo_id: algorithm selector (per the original comment, taken from the
            second command-line parameter). ``2`` selects item-based KNN with
            Z-score; every other value selects the user-based variant.

    Returns:
        A configured, unfitted ``KNNWithZScore`` instance.
    """
    # Surprise only reads 'user_based' from the ``sim_options`` dict. Passed
    # as a bare keyword it is ignored and the model is always user-based.
    # NOTE(review): the original comment labelled algo_id == 3 as "SVD", but
    # that branch built a user-based KNNWithZScore, same as the default
    # branch - confirm whether SVD was intended.
    user_based = algo_id != 2
    algo = KNNWithZScore(sim_options={'user_based': user_based})

    return algo
コード例 #6
0
    def CFZ(self):
        """Fit a user-based cosine KNNWithZScore model and predict per user.

        The model is fitted on ``self.trainset``; predictions are made for the
        rows of ``self.data`` belonging to each user in ``self.list``.

        Returns:
            The last prediction made, or ``None`` when no prediction was made
            (for example when ``self.list`` is empty).
        """
        sim_options = {'name': 'cosine', 'user_based': True}
        algo = KNNWithZScore(k=40, min_k=1, sim_options=sim_options)
        algo.fit(self.trainset)

        # Bound up front so the return below cannot hit an unassigned name.
        pred = None
        for uid in self.list:
            user_rows = self.data[self.data.uid == uid]

            # NOTE(review): the loop stops one row short, so each user's last
            # row is never predicted - confirm whether that is intended.
            for pos in range(len(user_rows) - 1):
                row = user_rows[pos:pos + 1]
                lid = row.lid.values[0]
                r_ui = row.rate.values[0]
                pred = algo.predict(uid, lid, r_ui, verbose=True)

        return pred
コード例 #7
0
def run_baselines(ratings_dict, compressed_test_ratings_dict, data_origin):
    """Evaluate every algorithm named in the module-level ``algos`` and print its metrics.

    Args:
        ratings_dict: passed through to ``testing``.
        compressed_test_ratings_dict: passed through to ``testing``.
        data_origin: dataset tag; ``testing`` is only called for
            'netflix', 'small' and '100k'.
    """
    algorithm_classes = {
        "KNNBasic": KNNBasic,
        "KNNWithZScore": KNNWithZScore,
        "SVD": SVD,
        "NMF": NMF,
        "SlopeOne": SlopeOne,
        "CoClustering": CoClustering,
    }
    supported_origins = ('netflix', 'small', '100k')

    for alg in algos:
        # An unrecognised name leaves ``algo`` as it was.
        algo_cls = algorithm_classes.get(alg)
        if algo_cls is not None:
            algo = algo_cls()

        if data_origin in supported_origins:
            (nr_predictions, accuracy, rmse, mae,
             precision, recall, f1) = testing(algo, ratings_dict,
                                              compressed_test_ratings_dict,
                                              data_origin)

        # print results
        print("\n\nAlg %s" % alg)
        print("Number of user-items pairs: %d" % nr_predictions)
        print("Accuracy: %.2f " % accuracy)
        print("RMSE: %.2f" % rmse)
        print("MAE: %.2f" % mae)
        print("Precision: %.2f" % precision)
        print("Recall: %.2f" % recall)
        print("F1: %.2f" % f1)
コード例 #8
0
    def generate_knn(self,rating_data):
        """
            Build the kNN algorithms to compare: four untuned variants plus a
            tuned KNNBaseline. Tuning is kept as a separate step at the end
            because it can take a long time and is easy to comment out.

            Args:
                param1: rating_data: the main data set
            Return:
                    a dictionary of algorithms; key: name of algo, val: algo object

        """
        untuned = (
            ('bcKNN', KNNBasic),
            ('wmKNN', KNNWithMeans),
            ('wzKNN', KNNWithZScore),
            ('blKNN', KNNBaseline),
        )
        # Every untuned variant gets its own user-based cosine options dict.
        algo = {
            label: knn_cls(sim_options={'name': 'cosine', 'user_based': True})
            for label, knn_cls in untuned
        }

        # tune param for knnBaseline, since it has best accuracy
        k_candidates = {'k': [10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 100]}
        tuned_params = self.tune_and_find_parameter('blKNN', KNNBaseline, rating_data, k_candidates)
        algo['blKNN_tuned'] = KNNBaseline(k=tuned_params['k'])

        return algo
コード例 #9
0
def get_model(model_name):
    """Build a Surprise algorithm from an underscore-separated model name.

    kNN names look like ``<Family>[_<U|I>[_<similarity>[_<k>]]]``: the second
    field selects item-based when it is ``I``, the similarity defaults to
    'msd' and ``k`` to 20. Matrix-factorisation names look like
    ``<SVDpp|SVD|NMF>[_<n_factors>]`` with ``n_factors`` defaulting to 25.

    Returns:
        The configured algorithm, or ``None`` when the name is not recognised.
    """
    if 'KNN' in model_name:
        fields = model_name.split('_')
        family = fields[0]
        item_based = len(fields) > 1 and fields[1] == 'I'
        similarity = fields[2] if len(fields) >= 3 else 'msd'
        k = int(fields[3]) if len(fields) >= 4 else 20
        sim_options = {'user_based': not item_based, 'name': similarity}

        if family == 'KNNBasic':
            return KNNBasic(sim_options=sim_options, k=k)
        if family == 'KNNWithMeans':
            return KNNWithMeans(sim_options=sim_options, k=k)
        if family == 'KNNWithZScore':
            return KNNWithZScore(sim_options=sim_options, k=k)
        return None

    if any(tag in model_name for tag in ('SVDpp', 'SVD', 'NMF')):
        fields = model_name.split('_')
        family = fields[0]
        n_factors = int(fields[1]) if len(fields) != 1 else 25

        if family == 'SVDpp':
            return SVDpp(n_factors=n_factors)
        if family == 'SVD':
            return SVD(n_factors=n_factors)
        if family == 'NMF':
            return NMF(n_factors=n_factors)

    return None
コード例 #10
0
def crossvalidate(data):
    """Cross-validate a fixed set of algorithms and rank them by RMSE.

    Args:
        data: dataset handed to ``cross_validate`` (5 folds, RMSE only).

    Returns:
        DataFrame indexed by algorithm name holding the mean of each
        cross-validation column, sorted by ascending ``test_rmse``.
    """
    algorithms = [
        NormalPredictor(),
        KNNBaseline(k=15, sim_options=similarity_measure('pearson', 1)),
        KNNBasic(k=15, sim_options=similarity_measure('pearson', 1)),
        KNNWithMeans(k=15, sim_options=similarity_measure('pearson', 1)),
        KNNWithZScore(k=15, sim_options=similarity_measure('pearson', 1)),
        BaselineOnly(),
        SVD(),
        SVDpp(),
        NMF(),
        SlopeOne(),
        CoClustering(),
    ]

    results = []
    for algorithm in algorithms:
        result = cross_validate(algorithm,
                                data,
                                measures=['RMSE'],
                                cv=5,
                                verbose=False)
        mean_scores = pd.DataFrame.from_dict(result).mean(axis=0)
        # The class name is recovered from the object's string form.
        label = pd.Series([str(algorithm).split(' ')[0].split(".")[-1]],
                          index=['Algorithm'])
        # Series.append was removed in pandas 2.0; pd.concat builds the same
        # row on old and new pandas alike.
        results.append(pd.concat([mean_scores, label]))

    rmse_values = pd.DataFrame(results).set_index('Algorithm').sort_values(
        'test_rmse')
    return rmse_values
コード例 #11
0
    def generate_knn(self, rating_data):
        """Return a dict of kNN algorithms keyed by short name.

        Contains four untuned user-based cosine variants (basic, with means,
        with Z-score, baseline) and a KNNBaseline whose ``k`` comes from
        ``self.tune_and_find_parameter``.

        Args:
            rating_data: data set handed to the tuning helper.
        """

        def cosine_user_based():
            # A fresh options dict per algorithm, so none of them share state.
            return {'name': 'cosine', 'user_based': True}

        algo = dict(
            bcKNN=KNNBasic(sim_options=cosine_user_based()),
            wmKNN=KNNWithMeans(sim_options=cosine_user_based()),
            wzKNN=KNNWithZScore(sim_options=cosine_user_based()),
            blKNN=KNNBaseline(sim_options=cosine_user_based()),
        )

        # tune param for knnBaseline, since it has best accuracy
        neighbour_grid = {'k': [10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 100]}
        best = self.tune_and_find_parameter('blKNN', KNNBaseline,
                                            rating_data, neighbour_grid)
        algo['blKNN_tuned'] = KNNBaseline(k=best['k'])

        return algo
コード例 #12
0
def EvaluateDifferentAlgorithms():
    """Cross-validate a fixed set of algorithms on ``data_6months`` and print the ranking.

    Each algorithm is cross-validated with 3 folds on RMSE. After every
    algorithm the table of mean scores collected so far is printed, sorted by
    ascending ``test_rmse``. Nothing is returned.
    """
    benchmark = []
    algorithms = [
        SVD(),
        SVDpp(),
        SlopeOne(),
        NMF(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering(),
    ]
    # Iterate over all algorithms
    for algorithm in algorithms:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_6months,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name
        mean_scores = pd.DataFrame.from_dict(results).mean(axis=0)
        label = pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                          index=['Algorithm'])
        # Series.append was removed in pandas 2.0; pd.concat builds the same
        # row on old and new pandas alike.
        benchmark.append(pd.concat([mean_scores, label]))

        print(
            pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
                'test_rmse'))
コード例 #13
0
def compAlgos(data):
    """Compare mean RMSE and MAE of three kNN and two SVD algorithms.

    Every algorithm is cross-validated on ``data`` (5 folds, 5 jobs) before
    any result line is printed.
    """
    print("\nLet us compare performance of KNN and SVD algorithms\n")

    # (report line template, algorithm class), in evaluation and print order
    contenders = (
        ('\nKNN Basic: RMSE: {}, MAE: {}', KNNBasic),
        ('\nKNN Means: RMSE: {}, MAE: {}', KNNWithMeans),
        ('\nKNN Z Score: RMSE: {}, MAE: {}', KNNWithZScore),
        ('\nSVD: RMSE: {}, MAE: {}', SVD),
        ('\nSVD ++: RMSE: {}, MAE: {}', SVDpp),
    )

    scored = [
        (template,
         cross_validate(algo_cls(), data, cv=5, n_jobs=5, verbose=False))
        for template, algo_cls in contenders
    ]

    for template, cv_result in scored:
        print(template.format(cv_result['test_rmse'].mean(),
                              cv_result['test_mae'].mean()))

    print('\nBoth SVDs perform better on the dataset\n')
    print(
        '\nWe will test with KNN means from KNN family and SVDPP from svd family\n'
    )
コード例 #14
0
def to_test(k, option, model):
  """Load the training and test files and fit the requested item-based kNN model.

  Args:
      k: number of neighbours passed to the algorithm.
      option: similarity name placed in the ``sim_options`` dict.
      model: one of 'Basic', 'WithMeans', 'WithZScore' or 'Baseline';
          any other value fits nothing.
  """
  train_frame = pd.read_csv('training_set.dat')
  test_frame = pd.read_csv('test_set.dat')
  reader = Reader(rating_scale=(1, 5))
  trainingSet = Dataset.load_from_df(train_frame, reader).build_full_trainset()
  test_as_trainset = Dataset.load_from_df(test_frame, reader).build_full_trainset()
  testSet = test_as_trainset.build_testset()

  opt = {'name': option, 'user_based': False}

  knn_by_model = {
    'Basic': KNNBasic,
    'WithMeans': KNNWithMeans,
    'WithZScore': KNNWithZScore,
    'Baseline': KNNBaseline,
  }
  # Dumping the fitted model to disk was disabled in the original code.
  if model in knn_by_model:
    algo = knn_by_model[model](k=k, sim_options=opt)
    algo.fit(trainingSet)
コード例 #15
0
def check_for_args():
    """Append one algorithm instance to ``alg_list`` per recognised command-line argument.

    Arguments are taken from ``sys.argv`` in order; unrecognised ones are
    skipped.

    Returns:
        The module-level ``alg_list`` after the additions.
    """
    by_name = {
        'SVD': SVD,
        'SVDpp': SVDpp,
        'SlopeOne': SlopeOne,
        'NMF': NMF,
        'NormalPredictor': NormalPredictor,
        'KNNBaseline': KNNBaseline,
        'KNNBasic': KNNBasic,
        'KNNWithMeans': KNNWithMeans,
        'KNNWithZScore': KNNWithZScore,
        'BaselineOnly': BaselineOnly,
        'CoClustering': CoClustering,
    }
    for arg in sys.argv:
        algo_cls = by_name.get(arg)
        if algo_cls is not None:
            alg_list.append(algo_cls())

    return alg_list
コード例 #16
0
def benchmark(data):
    """Cross-validate a fixed set of algorithms and store the ranking as CSV.

    Each algorithm is cross-validated on ``data`` with 3 folds on RMSE, MAE
    and FCP. The mean of every cross-validation column is collected per
    algorithm, sorted by ascending ``test_rmse`` and handed to
    ``store_dataframe`` under the name 'Algorithm_Benchmark.csv'.
    """
    performance = []
    algorithms = [
        SVD(),
        SVDpp(),
        SlopeOne(),
        NMF(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering(),
        SVD_SGD_momentum(),
        SVDpp_SGD_momentum()
    ]
    for algorithm in algorithms:
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE', 'MAE', 'FCP'],
                                 cv=3,
                                 verbose=False)
        mean_scores = pd.DataFrame.from_dict(results).mean(axis=0)
        label = pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                          index=['Algorithm'])
        # Series.append was removed in pandas 2.0; pd.concat builds the same
        # row on old and new pandas alike.
        performance.append(pd.concat([mean_scores, label]))
    output_df = pd.DataFrame(performance).set_index('Algorithm').sort_values(
        'test_rmse')
    store_dataframe(output_df, 'Algorithm_Benchmark.csv')
コード例 #17
0
    def CFZ(self):
        """Print precision, recall and F1 of user-based cosine KNNWithZScore per fold.

        ``self.data`` is split into 5 folds. For each fold the model is
        refitted, the per-user precision and recall from
        ``self.precision_recall_at_k`` are averaged, and the three figures
        are printed.
        """
        kf = KFold(n_splits=5)
        sim_options = {'name': 'cosine', 'user_based': True}
        algo = KNNWithZScore(k=40, min_k=1, sim_options=sim_options)

        for trainset, testset in kf.split(self.data):
            algo.fit(trainset)
            predictions = algo.test(testset)
            precisions, recalls = self.precision_recall_at_k(predictions)

            P = sum(precisions.values()) / len(precisions)
            R = sum(recalls.values()) / len(recalls)
            # F1 is undefined when precision and recall are both 0; report 0
            # for that fold instead of raising ZeroDivisionError.
            F1 = 2 * P * R / (P + R) if (P + R) else 0.0

            print("Precision : ", P)
            print("Recall    : ", R)
            print("F1        : ", F1)
コード例 #18
0
 def _hyperopt(self, params):
     """Return the mean cross-validated ``self._metric`` of KNNWithZScore built from ``params``."""
     candidate = KNNWithZScore(**params)
     cv_scores = cross_validate(candidate,
                                self._data,
                                measures=ACCURACY_METRICS,
                                cv=self._cv,
                                n_jobs=self._cv_n_jobs,
                                verbose=self._debug)
     per_fold = cv_scores[self._metric]
     return per_fold.mean()
コード例 #19
0
def get_model(model_name, sim_options):
    """Return the kNN algorithm called ``model_name``, configured with ``sim_options``.

    Args:
        model_name: 'KNNBasic', 'KNNWithMeans', 'KNNWithZScore' or 'KNNBaseline'.
        sim_options: similarity options dict passed to the algorithm.

    Returns:
        The unfitted algorithm instance, built with ``verbose=False``.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    if model_name == 'KNNBasic':
        model_cls = KNNBasic
    elif model_name == 'KNNWithMeans':
        model_cls = KNNWithMeans
    elif model_name == 'KNNWithZScore':
        model_cls = KNNWithZScore
    elif model_name == 'KNNBaseline':
        model_cls = KNNBaseline
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError('Unknown model name: {!r}'.format(model_name))
    return model_cls(sim_options=sim_options, verbose=False)
コード例 #20
0
def computeKNNZScoreMovie(data, test_np):
    """Predict test ratings with item-based k-NN with Z-score.

    Settings used:
        - similarity function: Pearson baseline, item based
        - number of closest neighbours: 108

    Args:
        data: data frame representing the train set
        test_np: data frame for which predictions are returned

    Returns:
        The test frame produced by ``dataTrainSurprise`` with an added
        'knnzscore_item_rating' prediction column.
    """
    trainset, test = dataTrainSurprise(data, test_np)

    knnz_algo = KNNWithZScore(
        k=108,
        sim_options={'name': 'pearson_baseline', 'user_based': False},
    ).fit(trainset)

    def estimate(row):
        # Index 3 of a Surprise prediction is the estimated rating.
        return knnz_algo.predict(row['user_id'], row['movie_id'])[3]

    id_pairs = test[['user_id', 'movie_id']]
    test['knnzscore_item_rating'] = id_pairs.apply(estimate, axis=1)

    return test
コード例 #21
0
def get_model_old(model_name):
    """Map a model name to a configured Surprise algorithm.

    Exact names are tried first. After that, names containing 'NMF_',
    'SVDpp_' or 'SVD_' take ``n_factors`` from their second
    underscore-separated field, and names containing 'KNNBasic_U_' or
    'KNNBasic_I_' take ``k`` from their last field.

    Returns:
        The algorithm, or ``None`` when nothing matches.
    """
    # Builders are lambdas so that a class is only looked up when selected.
    exact = {
        'KNNBasic_U': lambda: KNNBasic(sim_options={'user_based': True}, k=20),
        'KNNBasic_I': lambda: KNNBasic(sim_options={'user_based': False}, k=20),
        'KNNWithMeans_I': lambda: KNNWithMeans(sim_options={'user_based': False}, k=20),
        'KNNWithMeans_U': lambda: KNNWithMeans(sim_options={'user_based': True}, k=20),
        'KNNWithZScore_I': lambda: KNNWithZScore(sim_options={'user_based': False}, k=20),
        'KNNWithZScore_U': lambda: KNNWithZScore(sim_options={'user_based': True}, k=20),
        'SVDpp': lambda: SVDpp(),
        'SVD': lambda: SVD(),
        'NMF': lambda: NMF(),
    }
    if model_name in exact:
        return exact[model_name]()

    # Order matters: it mirrors the order in which the markers are tested.
    factor_models = (
        ('NMF_', lambda n: NMF(n_factors=n)),
        ('SVDpp_', lambda n: SVDpp(n_factors=n)),
        ('SVD_', lambda n: SVD(n_factors=n)),
    )
    for marker, build in factor_models:
        if marker in model_name:
            return build(int(model_name.split("_")[1]))

    for marker, user_based in (('KNNBasic_U_', True), ('KNNBasic_I_', False)):
        if marker in model_name:
            k = int(model_name.split("_")[-1])
            return KNNBasic(sim_options={'user_based': user_based}, k=k)

    return None
コード例 #22
0
ファイル: KNN.py プロジェクト: LLNL/MTLRecSys
class KNN_Normalized(BaseSurpriseSTLEstimator):
    """Estimator wrapping Surprise's ``KNNWithZScore`` with a tunable ``k``."""

    def __init__(self, k, name='KNN_Normalized'):
        """Create the estimator.

        Args:
            k: number of neighbours passed to ``KNNWithZScore``.
            name: estimator name forwarded to the base class.
        """
        super().__init__(name, 'non_feature_based')
        self.k = k
        self.model = KNNWithZScore(k=self.k, verbose=False)

    def _fit(self, x):
        """Fit the wrapped model on ``x``."""
        self.model.fit(x)

    def _predict(self, x):
        """Return the wrapped model's predictions for ``x``."""
        return self.model.test(x)

    def get_hyper_params(self):
        """Describe the tunable hyper-parameters (an integer ``k`` with values [2, 13])."""
        hparams = {'k': {'type': 'integer', 'values': [2, 13]}}
        return hparams

    def set_hyper_params(self, **kwargs):
        """Set ``k`` on the estimator and on the wrapped model.

        Raises:
            KeyError: if ``k`` is not supplied.
        """
        self.k = kwargs['k']
        # The model was built in __init__ with the old k. Without this update
        # the new value would never reach the algorithm.
        self.model.k = self.k

    def similarity_matrix(self):
        """Return the result of the wrapped model's ``compute_similarities()``."""
        return self.model.compute_similarities()
コード例 #23
0
def generate_svd_recommendation_df() -> pd.DataFrame:
    """Cross-validate and fit SVD and NMF on the score data, then cross-validate further algorithms.

    For SVD and then NMF: run 5-fold cross-validation, fit on the full
    trainset, predict the anti-testset, report accuracy and compute the top-5
    items with ``get_top_n``. Afterwards several kNN, matrix-factorisation and
    other collaborative-filtering algorithms are cross-validated.

    NOTE(review): the signature promises a ``pd.DataFrame`` but no ``return``
    statement is visible in this function - confirm against the full file.
    """
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    # --- SVD ---
    algo = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    #print (recommendation_df)
    
    
    # --- NMF ---
    # NOTE(review): nmf_cv is computed here and assigned again further down.
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    algo = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the NMF
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE and MAE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    # NOTE(review): this overwrites the SVD result stored under the same name.
    recommendation_df_svd = get_top_n(predictions, n=5)
    #print (recommendation_df)
    
    
    
    #---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    # kNN Based Algorithms
    knnbasic_cv = cross_validate(KNNBasic(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnmeans_cv = cross_validate(KNNWithMeans(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnz_cv = cross_validate(KNNWithZScore(), svd_data, cv=5, n_jobs=5, verbose=False)

    # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(SVD(), svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(SVDpp(),svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    
    # Other Collaborative Filtering Algorithms
    slope_cv = cross_validate(SlopeOne(), svd_data, cv=5, n_jobs=5, verbose=False)
    coclus_cv = cross_validate(CoClustering(), svd_data, cv=5, n_jobs=5, verbose=False)
コード例 #24
0
    def EvaluateAllModels(self):
        """
        Cross-validate every candidate algorithm on ``self.data`` (3 folds,
        RMSE) and rank them. Example output:

                         test_rmse   fit_time  test_time
        Algorithm
        SVDpp             0.965824   9.401286   0.151476
        SVD               0.967286   1.474139   0.062471
        BaselineOnly      0.972408   0.108964   0.057277
        NMF               0.992677   4.073005   0.171846
        KNNWithZScore     1.001898   0.620192   0.083341
        KNNWithMeans      1.002924   0.489803   0.078121
        SlopeOne          1.006664  19.091191   1.275676
        KNNBaseline       1.007437   0.890452   0.088495
        KNNBasic          1.016717   0.432159   0.072929
        NormalPredictor   1.253265   0.041646   0.078105
        CoClustering      1.828291   3.020921   0.052071
        :return: DataFrame of mean scores indexed by algorithm and sorted by
            ascending test_rmse; the algorithm with the lowest test_rmse is
            the one to pick.
        """
        benchmark = []
        algorithms = [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering(),
        ]
        # Iterate over all algorithms
        for algorithm in algorithms:
            # Perform cross validation
            results = cross_validate(algorithm,
                                     self.data,
                                     measures=['RMSE'],
                                     cv=3,
                                     verbose=False)

            # Get results & append algorithm name
            mean_scores = pd.DataFrame.from_dict(results).mean(axis=0)
            label = pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                              index=['Algorithm'])
            # Series.append was removed in pandas 2.0; pd.concat builds the
            # same row on old and new pandas alike.
            benchmark.append(pd.concat([mean_scores, label]))

        result = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
            'test_rmse')
        print(result)

        return result
コード例 #25
0
ファイル: tavsiye.py プロジェクト: berkayytu/Ara-Proje
    def calculateRMSE(self, method=9, similarityMeasure=1, isUserBased="Yes"):
        """Train the selected algorithm on 80% of the stored ratings and return its RMSE.

        Args:
            method: algorithm selector - 1 SVD, 2 SlopeOne, 3 NMF,
                4 NormalPredictor, 5 KNNBaseline, 6 KNNBasic, 7 KNNWithMeans,
                8 KNNWithZScore, 9 BaselineOnly, anything else CoClustering.
            similarityMeasure: 1 cosine, 2 pearson, anything else
                pearson_baseline (only used by the kNN methods 5-8).
            isUserBased: "Yes" for user-based similarity, anything else for
                item-based (only used by the kNN methods 5-8).

        Returns:
            RMSE on the 20% test split, rounded to 4 decimals.
        """
        conn = sqlite3.connect(DATABASE_NAME)
        try:
            df = pd.read_sql_query(
                "SELECT userID, glassID, relativeRating FROM ratings", conn)
        finally:
            # The frame is fully loaded at this point. Closing here releases
            # the connection even when reading, fitting or testing raises.
            conn.close()

        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(
            df[['userID', 'glassID', 'relativeRating']], reader)

        trainset, testset = train_test_split(data, test_size=.20)

        similarity_names = {1: "cosine", 2: "pearson"}
        sim_options = {
            'name': similarity_names.get(similarityMeasure, "pearson_baseline"),
            'user_based': isUserBased == "Yes",
        }

        # kNN algorithms take the similarity options; the others take none.
        knn_methods = {
            5: KNNBaseline,
            6: KNNBasic,
            7: KNNWithMeans,
            8: KNNWithZScore,
        }
        other_methods = {
            1: SVD,
            2: SlopeOne,
            3: NMF,
            4: NormalPredictor,
            9: BaselineOnly,
        }
        if method in knn_methods:
            algo = knn_methods[method](sim_options=sim_options)
        else:
            algo = other_methods.get(method, CoClustering)()

        algo.fit(trainset)
        predictions = algo.test(testset)

        return round(accuracy.rmse(predictions, verbose=False), 4)
コード例 #26
0
 def __init__(self, modelName, dataPath):
     """Instantiate every supported model, select ``modelName`` and load the data.

     Args:
         modelName: key into ``self.modelDict``; an unknown name raises KeyError.
         dataPath: data location; ``~`` is expanded before it is passed to
             ``self.loadData``.
     """
     supported = (
         ("KNNBasic", KNNBasic),
         ("KNNWithMeans", KNNWithMeans),
         ("KNNWithZScore", KNNWithZScore),
         ("SVD", SVD),
         ("SVDpp", SVDpp),
         ("NMF", NMF),
         ("SlopeOne", SlopeOne),
         ("CoClustering", CoClustering),
     )
     self.modelDict = {label: model_cls() for label, model_cls in supported}
     self.trainset = self.testset = self.data = None
     self.model = self.modelDict[modelName]
     expanded_path = os.path.expanduser(dataPath)
     self.loadData(expanded_path)
コード例 #27
0
def knnz_running_time(data):
    '''
        Calculates the running times for training and predictions for KNN with Z-score

        The parameters are tuned once by grid search on ``data[3]`` and then
        reused for every dataset in ``data``.

        Args:
            data(Dataset): a list of datasets with different numbers of users

        Returns:
            elapsedtime_KnnZtrain: running time for training, one entry per dataset
            elapsedtime_KnnZtest: running time for predictions on testset, one entry per dataset
    '''
    elapsedtime_KnnZtrain = []
    elapsedtime_KnnZtest = []

    # tune the parameters on the entire data
    param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False]
        }
    }
    grid_search = GridSearch(KNNWithZScore,
                             param_grid,
                             measures=['RMSE'],
                             verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    k = param['k']
    sim_options = param['sim_options']

    # using the tuned parameters calculate running times
    for dataset in data:
        # training running time (includes building the train and anti-test sets)
        training_start = time.time()
        training = dataset.build_full_trainset()
        testing = training.build_anti_testset()
        # The similarity settings have to travel inside ``sim_options``:
        # Surprise ignores bare name= / min_support= / user_based= keywords,
        # which would time the default configuration instead of the tuned one.
        knnz = KNNWithZScore(k=k, sim_options=dict(sim_options))
        knnz.train(training)
        elapsedtime_KnnZtrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        knnz.test(testing)
        elapsedtime_KnnZtest.append(time.time() - test_start)
    return elapsedtime_KnnZtrain, elapsedtime_KnnZtest
コード例 #28
0
def main():
    """Evaluate a fixed list of recommender algorithms on the book ratings.

    Reads the filtered ratings CSV, builds a dataset with a 1-5 rating
    scale, splits it 80/20 into train and test sets, and runs
    ``basic_model_based`` on every algorithm, printing the precision, recall
    and F1 values it returns.
    """
    book_df = pd.read_csv("../../data/processed/filtered_ratings.csv")
    # 'Unnamed: 0' is presumably a stray index column written by to_csv;
    # drop it when present instead of failing when the file lacks it.
    book_df = book_df.drop(columns='Unnamed: 0', errors='ignore')

    # Reader object and rating scale specification
    reader = Reader(rating_scale=(1, 5))
    # Load data
    data = Dataset.load_from_df(book_df[["user_id", "book_id", "rating"]],
                                reader)

    # Split data into train and test sets
    train_set, test_set = train_test_split(data, test_size=0.20)

    # All KNN variants share the same configuration; each instance gets its
    # own sim_options object from a separate similarity_measure() call.
    knn_classes = (KNNWithZScore, KNNWithMeans, KNNBaseline, KNNBasic)
    knn_algorithms = [
        knn_cls(k=10, sim_options=similarity_measure('pearson', 1))
        for knn_cls in knn_classes
    ]
    algorithm_list = (
        [NormalPredictor(), BaselineOnly()]
        + knn_algorithms
        + [SVDpp(), SVD(), NMF()]
    )

    for algo in algorithm_list:
        # The RMSE is returned as well but is not part of the printed summary.
        _rmse, preci, recall, f1 = basic_model_based(train_set, test_set, algo)
        print("Algorithm:", algo, preci, recall, f1)
        print(
            "**------------------------------------------------------------------------------------------**"
        )
コード例 #29
0
    def checkBestAlgorithm(self):
        """Cross-validate several algorithms and return the best one by RMSE.

        Loads the ratings from ``csv_name`` into ``self.df``, runs 3-fold
        cross-validation (RMSE only) for every candidate algorithm, prints a
        benchmark table sorted by mean test RMSE, and prints the class name
        of the winner.

        Returns:
            tuple: ``(algorithm, mean_test_rmse)`` for the algorithm with the
            lowest mean test RMSE.
        """
        self.df = pd.read_csv(csv_name)
        reader = Reader(rating_scale=(1, 10))
        data = Dataset.load_from_df(self.df[['user_id', 'item_id', 'rating']],
                                    reader)
        benchmark = []
        rmseTuple = []
        candidates = [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering(),
        ]
        # Run the same evaluation for every candidate algorithm.
        for algorithm in candidates:
            # Cross-validation step.
            results = cross_validate(algorithm,
                                     data,
                                     measures=['RMSE'],
                                     cv=3,
                                     verbose=False)

            # Store the averaged metrics together with the algorithm name.
            tmp = pd.DataFrame.from_dict(results).mean(axis=0)
            rmseTuple.append((algorithm, tmp['test_rmse']))
            # Series.append was removed in pandas 2.0; pd.concat is the
            # equivalent that works on both old and new versions.
            tmp = pd.concat([
                tmp,
                pd.Series([type(algorithm).__name__], index=['Algorithm']),
            ])
            benchmark.append(tmp)
        print(
            pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
                'test_rmse'))
        print("\n")
        # min() returns the first entry with the lowest RMSE, which is the
        # same element a stable ascending sort would place first.
        best = min(rmseTuple, key=lambda entry: entry[1])

        print("Best algorithm : ")
        print(type(best[0]).__name__)
        return best
コード例 #30
0
def collab_recommender(train_data,
                       test_data,
                       user_based=True,
                       normalization=False,
                       k=100,
                       sim='cosine'):
    """
    Fit a KNN collaborative filter on train_data and predict ratings for the
    user/movie pairs in test_data.

    Input: 
    - train_data: dataframe, n*3, columns are ['userid','movieid','rating']
    - test_data: dataframe, n*2, columns are ['userid', 'movieid']
    - user_based: boolean, use user-based knn algorithm if True, use item-based knn algorithm if False
    - normalization: boolean, use KNNWithZScore (z-score normalization) if True, KNNWithMeans otherwise
    - k: int, number of nearest neighbors
    - sim: string, define the similarity matrix from ['cosine', 'pearson', 'msd', 'pearson_baseline']
    
    Output:
    - pred_rating: dataframe, n*2, columns are ['movieid', 'rating'],
      one row per row of test_data, in the same order

    Raises:
    - ValueError: a ValueError raised while fitting/predicting is logged as a
      warning and re-raised unchanged.

    Any other exception is logged via function_log.exception.
    """

    try:
        function_log.trace('Start collaborative recommendation function')

        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(train_data, reader)

        sim_options = {'name': sim, 'user_based': user_based}

        knn_cls = KNNWithZScore if normalization else KNNWithMeans
        algo = knn_cls(k=k, sim_options=sim_options, verbose=False)

        train_set = data.build_full_trainset()
        algo.fit(train_set)

        # Walk the two columns in lockstep rather than via .loc[idx, col]:
        # with duplicate index labels .loc would return a Series instead of
        # a scalar.
        movie_ids = []
        ratings = []
        for user_id, movie_id in zip(test_data['userid'],
                                     test_data['movieid']):
            movie_ids.append(movie_id)
            ratings.append(algo.predict(user_id, movie_id).est)
        function_log.trace('Finish collaborative recommendation function')
        return pd.DataFrame({'movieid': movie_ids, 'rating': ratings})
    except ValueError:
        function_log.warn("Training and test data cannot be none.")
        # Bare raise keeps the original message and traceback; the exception
        # type seen by callers is still ValueError.
        raise
    except Exception as x:
        function_log.exception(
            f'collaborative recommendation function failed {x}')