Ejemplo n.º 1
0
def knn_z(data, training, testing):
    '''
    Tune KNNWithZScore by grid search, then fit the tuned model and evaluate it.

    Args:
        data(Dataset): the whole dataset divided into 5 folds
        training(Dataset): training dataset
        testing(Dataset): test dataset

    Returns:
        rmse: RMSE of KNN with Z-score with optimized parameters
        top_n: result of ``get_top_n(predictions, n=5)`` on the test predictions
    '''

    # candidate parameters
    knn_param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False],
        },
    }

    # optimize parameters
    knnz_grid_search = GridSearch(KNNWithZScore, knn_param_grid, measures=['RMSE'], verbose=False)
    knnz_grid_search.evaluate(data)
    param = knnz_grid_search.best_params['RMSE']
    print('KNNWithZScore:', param)

    # fit model using the optimized parameters.
    # Surprise reads the similarity settings only from the ``sim_options``
    # dict; passing name/min_support/user_based as separate keyword arguments
    # leaves the model on its default similarity. A copy is passed so the
    # grid-search result is not shared with the algorithm instance.
    knnz = KNNWithZScore(k=param['k'], sim_options=dict(param['sim_options']))
    knnz.train(training)

    # evaluate the model using test data
    predictions = knnz.test(testing)
    rmse = accuracy.rmse(predictions, verbose=True)
    top_n = get_top_n(predictions, n=5)

    return rmse, top_n
        def cal_KNNWithZScore(trainset, df):
            """Fit a user-based cosine KNNWithZScore and save its per-row errors.

            The model is fitted on ``trainset``; a prediction is then made for
            every row of ``df`` (columns ``user``, ``store`` and ``stars`` are
            read). A table with the columns user, item, r_ui, est and err
            (absolute error) is written to ``save_file2`` as CSV.
            """
            # The option key is 'user_based' (underscore); the hyphenated
            # spelling is not a key Surprise looks up.
            sim_options = {'name': 'cosine', 'user_based': True}
            algo_knnz = KNNWithZScore(k=40, min_k=1, sim_options=sim_options)
            algo_knnz.fit(trainset)

            users = []
            items = []
            real = []
            estimate = []
            for uid, iid, r_ui in zip(df.user.values, df.store.values,
                                      df.stars.values):
                # Predict with the model fitted above (not an outer `algo`).
                pred = algo_knnz.predict(uid, iid, r_ui, verbose=True)
                users.append(uid)
                items.append(iid)
                real.append(r_ui)
                estimate.append(pred.est)
            print("end")

            df5 = pd.DataFrame(columns=['user', 'item', 'r_ui', 'est'])
            df5['user'] = users
            df5['item'] = items
            df5['r_ui'] = real
            df5['est'] = estimate
            df5['err'] = abs(df5.est - df5.r_ui)
            df5.to_csv(save_file2)
Ejemplo n.º 3
0
    def CFZ(self):
        """Predict ratings with user-based cosine KNNWithZScore and score them.

        Fits the model on ``self.trainset``, predicts for the rows of
        ``self.data`` belonging to each user in ``self.list``, stores the
        predictions in ``self.df_est`` (columns uid, Iid, r_ui, est), the
        distinct user ids in ``self.arr`` and the value returned by
        ``self.Calculate_NDCG()`` in ``self.CFWZ_ndcg_``.
        """
        u_id = []
        I_id = []
        # Collected in plain lists and converted once at the end; appending to
        # a NumPy array inside the loop copies the whole array every time.
        true_ratings = []
        estimates = []

        sim_options = {'name': 'cosine', 'user_based': True}
        algo = KNNWithZScore(k=40, min_k=1, sim_options=sim_options)
        algo.fit(self.trainset)

        for uid in self.list:
            user_rows = self.data[self.data.uid == uid]

            # NOTE(review): as in the original loop (range(1, len)), the last
            # row of every user is not predicted — confirm this is intended.
            for lid, r_ui in zip(user_rows.lid.values[:-1],
                                 user_rows.rate.values[:-1]):
                pred = algo.predict(uid, lid, r_ui, verbose=True)
                u_id.append(int(pred.uid))
                I_id.append(int(pred.iid))
                true_ratings.append(pred.r_ui)
                estimates.append(pred.est)

        self.df_est = pd.DataFrame({
            'uid': u_id,
            'Iid': I_id,
            'r_ui': np.asarray(true_ratings, dtype=float),
            'est': np.asarray(estimates, dtype=float)
        })
        self.arr = self.df_est['uid'].unique()

        self.CFWZ_ndcg_ = self.Calculate_NDCG()
Ejemplo n.º 4
0
def randomize():
    """Return one randomly chosen ``(label, algorithm)`` pair.

    The candidates are the four KNN families (Basic, Means, Z, Baseline), each
    item-based with cosine, MSD, Pearson and Pearson-baseline similarity, plus
    SVD, SVDpp, BaselineOnly, CoClustering, SlopeOne and NMF.
    """
    sim_options_cosine = {'name': 'cosine', 'user_based': False}
    sim_options_msd = {'name': 'msd', 'user_based': False}
    sim_options_pearson = {'name': 'pearson', 'user_based': False}
    sim_options_baseline = {
        'name': 'pearson_baseline',
        'user_based': False,
        'shrinkage': 0
    }

    algorithms = [
        ('kNN Basic - Cosine',
         KNNBasic(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Basic - MSD', KNNBasic(sim_options=sim_options_msd,
                                     verbose=False)),
        ('kNN Basic - Pearson',
         KNNBasic(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Basic - Pearson B',
         KNNBasic(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Means - Cosine',
         KNNWithMeans(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Means - MSD',
         KNNWithMeans(sim_options=sim_options_msd, verbose=False)),
        ('kNN Means - Pearson',
         KNNWithMeans(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Means - Pearson B',
         KNNWithMeans(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Z - Cosine',
         KNNWithZScore(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Z - MSD',
         KNNWithZScore(sim_options=sim_options_msd, verbose=False)),
        ('kNN Z - Pearson',
         KNNWithZScore(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Z - Pearson B',
         KNNWithZScore(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Baseline - Cosine',
         KNNBaseline(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Baseline - MSD',
         KNNBaseline(sim_options=sim_options_msd, verbose=False)),
        ('kNN Baseline - Pearson',
         KNNBaseline(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Baseline - Pearson B',
         KNNBaseline(sim_options=sim_options_baseline, verbose=False)),
        ('SVD', SVD(verbose=False)), ('SVDpp', SVDpp(verbose=False)),
        ('Baseline Only', BaselineOnly(verbose=False)),
        ('CoClustering', CoClustering(verbose=False)),
        ('SlopeOne', SlopeOne()), ('NMF', NMF(verbose=False))
    ]

    # random.randint(0, len(algorithms)) includes the upper bound and could
    # index one past the end of the list; random.choice always picks a valid
    # element.
    return random.choice(algorithms)
Ejemplo n.º 5
0
def get_algo(algo_id):
    """Return the KNNWithZScore variant selected by ``algo_id``.

    ``algo_id == 2`` gives the item-based variant; every other value
    (including 3) gives the user-based variant.
    """
    # Selects the algorithm from the second command-line parameter.
    # NOTE(review): the original comment labelled algo_id 3 as "SVD", but the
    # code has always built a user-based KNNWithZScore for it; that is kept.
    user_based = algo_id != 2

    # Surprise reads `user_based` only from the sim_options dict; passing it
    # as a direct keyword argument leaves the algorithm user-based.
    return KNNWithZScore(sim_options={'user_based': user_based})
    def CFZ(self):
        """Predict with user-based cosine KNNWithZScore and return the last prediction.

        Fits the model on ``self.trainset`` and predicts for the rows of
        ``self.data`` belonging to each user in ``self.list`` (each prediction
        is printed because ``verbose=True``).

        Returns:
            The last prediction made, or ``None`` when no prediction was made.
        """
        sim_options = {'name': 'cosine', 'user_based': True}
        algo = KNNWithZScore(k=40, min_k=1, sim_options=sim_options)
        algo.fit(self.trainset)

        # Stays None if self.list is empty or no user has more than one row,
        # instead of failing on an unbound name at the return.
        pred = None
        for uid in self.list:
            user_rows = self.data[self.data.uid == uid]

            # NOTE(review): as in the original loop (range(1, len)), the last
            # row of every user is not predicted — confirm this is intended.
            for lid, r_ui in zip(user_rows.lid.values[:-1],
                                 user_rows.rate.values[:-1]):
                pred = algo.predict(uid, lid, r_ui, verbose=True)

        return pred
Ejemplo n.º 7
0
def run_baselines(ratings_dict, compressed_test_ratings_dict, data_origin):
    """Run every baseline named in the module-level ``algos`` and print its metrics.

    Args:
        ratings_dict: training ratings, forwarded unchanged to ``testing``.
        compressed_test_ratings_dict: test ratings, forwarded unchanged to
            ``testing``.
        data_origin: one of 'netflix', 'small' or '100k'; forwarded to
            ``testing``.

    Raises:
        ValueError: if ``data_origin`` or an entry of ``algos`` is not one of
            the supported names. Previously this led to an unbound-name error
            or to the previous iteration's values being reused silently.
    """
    if data_origin not in ('netflix', 'small', '100k'):
        raise ValueError("Unsupported data_origin: %r" % (data_origin,))

    factories = {
        "KNNBasic": KNNBasic,
        "KNNWithZScore": KNNWithZScore,
        "SVD": SVD,
        "NMF": NMF,
        "SlopeOne": SlopeOne,
        "CoClustering": CoClustering,
    }

    for alg in algos:
        if alg not in factories:
            raise ValueError("Unsupported algorithm: %r" % (alg,))
        algo = factories[alg]()

        # The three data_origin branches made the same call and only differed
        # in the literal passed, which equals data_origin itself.
        nr_predictions, acc, rmse, mae, precision, recall, f1 = testing(
            algo, ratings_dict, compressed_test_ratings_dict, data_origin)

        # print results
        print("\n\nAlg %s" % alg)
        print("Number of user-items pairs: %d" % nr_predictions)
        print("Accuracy: %.2f " % acc)
        print("RMSE: %.2f" % rmse)
        print("MAE: %.2f" % mae)
        print("Precision: %.2f" % precision)
        print("Recall: %.2f" % recall)
        print("F1: %.2f" % f1)
Ejemplo n.º 8
0
    def generate_knn(self,rating_data):
        """
            Build the KNN-family algorithms: four untuned ones plus a tuned KNNBaseline.

            Untuned and tuned algorithms are kept apart because tuning can take
            a long time; the tuning part is easy to comment out if needed.

            Args:
                param1: rating_data: the main data set
            Return:
                    a dictionary of algorithms; key: name of algo, val: algo object

        """

        def cosine_user_based():
            # A fresh dict per algorithm, so no two instances share options.
            return {'name': 'cosine', 'user_based': True}

        untuned = (
            ('bcKNN', KNNBasic),
            ('wmKNN', KNNWithMeans),
            ('wzKNN', KNNWithZScore),
            ('blKNN', KNNBaseline),
        )
        algo = {
            label: algo_class(sim_options=cosine_user_based())
            for label, algo_class in untuned
        }

        # Only KNNBaseline is tuned (over k), since it has the best accuracy.
        k_grid = {'k': [10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 100]}
        best = self.tune_and_find_parameter('blKNN', KNNBaseline, rating_data, k_grid)
        algo['blKNN_tuned'] = KNNBaseline(k=best['k'])

        return algo
Ejemplo n.º 9
0
def get_model(model_name):
    """Build an algorithm from an underscore-separated model name.

    KNN names have the form ``<Family>[_<U|I>[_<similarity>[_<k>]]]``: the
    second field ``I`` selects item-based (anything else is user-based), the
    similarity defaults to 'msd' and k to 20. Matrix-factorisation names have
    the form ``<SVDpp|SVD|NMF>[_<n_factors>]`` with n_factors defaulting to 25.

    Returns:
        The algorithm instance, or ``None`` when the name matches no known
        family.
    """
    if 'KNN' in model_name:
        fields = model_name.split('_')
        family = fields[0]
        item_based = len(fields) > 1 and fields[1] == 'I'
        similarity = fields[2] if len(fields) >= 3 else 'msd'
        k = int(fields[3]) if len(fields) >= 4 else 20
        sim_options = {'user_based': not item_based, 'name': similarity}

        if family == 'KNNBasic':
            return KNNBasic(sim_options=sim_options, k=k)
        if family == 'KNNWithMeans':
            return KNNWithMeans(sim_options=sim_options, k=k)
        if family == 'KNNWithZScore':
            return KNNWithZScore(sim_options=sim_options, k=k)
        return None

    if any(tag in model_name for tag in ('SVDpp', 'SVD', 'NMF')):
        fields = model_name.split('_')
        n_factors = int(fields[1]) if len(fields) != 1 else 25
        family = fields[0]

        if family == 'SVDpp':
            return SVDpp(n_factors=n_factors)
        if family == 'SVD':
            return SVD(n_factors=n_factors)
        if family == 'NMF':
            return NMF(n_factors=n_factors)

    return None
Ejemplo n.º 10
0
def crossvalidate(data):
    """Cross-validate a fixed set of algorithms and rank them by test RMSE.

    Each algorithm is evaluated with 5-fold cross-validation on ``data``; the
    KNN variants use k=15 and ``similarity_measure('pearson', 1)``.

    Returns:
        A DataFrame indexed by algorithm class name holding the mean of every
        cross-validation output column, sorted by ascending ``test_rmse``.
    """
    algorithms = [
        NormalPredictor(),
        KNNBaseline(k=15, sim_options=similarity_measure('pearson', 1)),
        KNNBasic(k=15, sim_options=similarity_measure('pearson', 1)),
        KNNWithMeans(k=15, sim_options=similarity_measure('pearson', 1)),
        KNNWithZScore(k=15, sim_options=similarity_measure('pearson', 1)),
        BaselineOnly(),
        SVD(),
        SVDpp(),
        NMF(),
        SlopeOne(),
        CoClustering()
    ]

    results = []
    for algorithm in algorithms:
        result = cross_validate(algorithm,
                                data,
                                measures=['RMSE'],
                                cv=5,
                                verbose=False)
        fold_means = pd.DataFrame.from_dict(result).mean(axis=0)
        # Series.append was removed in pandas 2.0; pd.concat builds the same
        # row on old and new pandas alike.
        label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
        results.append(pd.concat([fold_means, label]))

    rmse_values = pd.DataFrame(results).set_index('Algorithm').sort_values(
        'test_rmse')
    return rmse_values
Ejemplo n.º 11
0
    def generate_knn(self, rating_data):
        """Return a dict of KNN algorithms keyed by short name.

        Contains four untuned user-based cosine variants ('bcKNN', 'wmKNN',
        'wzKNN', 'blKNN') and 'blKNN_tuned', a KNNBaseline whose ``k`` comes
        from ``self.tune_and_find_parameter`` run on ``rating_data``.
        """
        algo = {}
        for label, algo_class in [('bcKNN', KNNBasic),
                                  ('wmKNN', KNNWithMeans),
                                  ('wzKNN', KNNWithZScore),
                                  ('blKNN', KNNBaseline)]:
            # Each instance gets its own options dict.
            options = {'name': 'cosine', 'user_based': True}
            algo[label] = algo_class(sim_options=options)

        # Tune k for KNNBaseline only, since it has the best accuracy.
        candidate_k = [10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 100]
        tuned = self.tune_and_find_parameter('blKNN', KNNBaseline,
                                             rating_data,
                                             {'k': candidate_k})
        algo['blKNN_tuned'] = KNNBaseline(k=tuned['k'])

        return algo
def EvaluateDifferentAlgorithms():
    """Cross-validate the default algorithms on ``data_6months`` and print rankings.

    Every algorithm is evaluated with 3-fold cross-validation (RMSE). After
    each one, the table of results accumulated so far is printed, sorted by
    ascending ``test_rmse``.
    """
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_6months,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name.
        # Series.append was removed in pandas 2.0; pd.concat builds the same
        # row on old and new pandas alike.
        fold_means = pd.DataFrame.from_dict(results).mean(axis=0)
        label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
        benchmark.append(pd.concat([fold_means, label]))

        print(
            pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
                'test_rmse'))
def compAlgos(data):
    """Print mean RMSE and MAE of three KNN and two SVD algorithms on ``data``.

    Every algorithm is run through 5-fold cross-validation with 5 jobs. All
    runs finish before any score is printed. The closing messages are fixed
    text and do not depend on the measured scores.
    """

    def run_cv(algorithm):
        return cross_validate(algorithm, data, cv=5, n_jobs=5, verbose=False)

    print("\nLet us compare performance of KNN and SVD algorithms\n")

    # (report line, cross-validation result), evaluated in listing order:
    # the KNN family first, then the SVD family.
    reports = [
        ('\nKNN Basic: RMSE: {}, MAE: {}', run_cv(KNNBasic())),
        ('\nKNN Means: RMSE: {}, MAE: {}', run_cv(KNNWithMeans())),
        ('\nKNN Z Score: RMSE: {}, MAE: {}', run_cv(KNNWithZScore())),
        ('\nSVD: RMSE: {}, MAE: {}', run_cv(SVD())),
        ('\nSVD ++: RMSE: {}, MAE: {}', run_cv(SVDpp())),
    ]
    for line, scores in reports:
        print(line.format(scores['test_rmse'].mean(),
                          scores['test_mae'].mean()))

    print('\nBoth SVDs perform better on the dataset\n')
    print(
        '\nWe will test with KNN means from KNN family and SVDPP from svd family\n'
    )
Ejemplo n.º 14
0
def to_test(k, option, model):
    """Load the rating files and fit the requested item-based KNN variant.

    Reads 'training_set.dat' and 'test_set.dat', builds a full trainset from
    the first and a testset from the second, then fits the chosen algorithm on
    the trainset. Nothing is returned.

    Args:
        k: passed as ``k`` to the algorithm.
        option: similarity name stored under ``sim_options['name']``.
        model: 'Basic', 'WithMeans', 'WithZScore' or 'Baseline'; any other
            value fits nothing.
    """
    train_frame = pd.read_csv('training_set.dat')
    test_frame = pd.read_csv('test_set.dat')
    reader = Reader(rating_scale=(1, 5))
    trainingSet = Dataset.load_from_df(train_frame, reader).build_full_trainset()
    # Built but not used by this function.
    testSet = Dataset.load_from_df(test_frame, reader).build_full_trainset().build_testset()

    variants = {
        'Basic': KNNBasic,
        'WithMeans': KNNWithMeans,
        'WithZScore': KNNWithZScore,
        'Baseline': KNNBaseline,
    }
    algo_class = variants.get(model)
    if algo_class is not None:
        algo = algo_class(k=k, sim_options={'name': option, 'user_based': False})
        algo.fit(trainingSet)
def check_for_args():
    """Append an algorithm to ``alg_list`` for each recognised command-line argument.

    Every entry of ``sys.argv`` that equals a supported algorithm name adds a
    new default-constructed instance to the module-level ``alg_list``; other
    arguments are ignored.

    Returns:
        The same ``alg_list`` object.
    """
    known = {
        'SVD': SVD,
        'SVDpp': SVDpp,
        'SlopeOne': SlopeOne,
        'NMF': NMF,
        'NormalPredictor': NormalPredictor,
        'KNNBaseline': KNNBaseline,
        'KNNBasic': KNNBasic,
        'KNNWithMeans': KNNWithMeans,
        'KNNWithZScore': KNNWithZScore,
        'BaselineOnly': BaselineOnly,
        'CoClustering': CoClustering,
    }
    for arg in sys.argv:
        algo_class = known.get(arg)
        if algo_class is not None:
            alg_list.append(algo_class())

    return alg_list
Ejemplo n.º 16
0
def benchmark(data):
    """Cross-validate all algorithms on ``data`` and store the ranking as CSV.

    Each algorithm is evaluated with 3-fold cross-validation on RMSE, MAE and
    FCP. The per-algorithm means, sorted by ascending ``test_rmse``, are
    passed to ``store_dataframe`` with the name 'Algorithm_Benchmark.csv'.
    """
    performance = []
    algorithms = [
        SVD(),
        SVDpp(),
        SlopeOne(),
        NMF(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering(),
        SVD_SGD_momentum(),
        SVDpp_SGD_momentum()
    ]
    for algorithm in algorithms:
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE', 'MAE', 'FCP'],
                                 cv=3,
                                 verbose=False)
        fold_means = pd.DataFrame.from_dict(results).mean(axis=0)
        # Series.append was removed in pandas 2.0; pd.concat builds the same
        # row on old and new pandas alike.
        label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
        performance.append(pd.concat([fold_means, label]))
    output_df = pd.DataFrame(performance).set_index('Algorithm').sort_values(
        'test_rmse')
    store_dataframe(output_df, 'Algorithm_Benchmark.csv')
Ejemplo n.º 17
0
    def CFZ(self):
        """Print precision, recall and F1 of KNNWithZScore for each of 5 folds.

        A user-based cosine KNNWithZScore (k=40, min_k=1) is fitted on every
        training fold of ``self.data``. The per-user precision and recall
        returned by ``self.precision_recall_at_k`` are averaged and printed
        together with their F1.
        """
        kf = KFold(n_splits=5)
        sim_options = {'name': 'cosine', 'user_based': True}
        algo = KNNWithZScore(k=40, min_k=1, sim_options=sim_options)

        for trainset, testset in kf.split(self.data):
            algo.fit(trainset)
            predictions = algo.test(testset)
            precisions, recalls = self.precision_recall_at_k(predictions)

            # An empty fold result, or precision and recall both 0, would
            # otherwise divide by zero; such metrics are reported as 0.0.
            P = sum(precisions.values()) / len(precisions) if precisions else 0.0
            R = sum(recalls.values()) / len(recalls) if recalls else 0.0
            F1 = 2 * P * R / (P + R) if (P + R) else 0.0

            print("Precision : ", P)
            print("Recall    : ", R)
            print("F1        : ", F1)
 def _hyperopt(self, params):
     """Return the mean cross-validated ``self._metric`` of KNNWithZScore(**params)."""
     candidate = KNNWithZScore(**params)
     cv_scores = cross_validate(candidate,
                                self._data,
                                measures=ACCURACY_METRICS,
                                cv=self._cv,
                                n_jobs=self._cv_n_jobs,
                                verbose=self._debug)
     metric_per_fold = cv_scores[self._metric]
     return metric_per_fold.mean()
Ejemplo n.º 19
0
def get_model(model_name, sim_options):
    """Return a non-verbose KNN algorithm of the requested family.

    Args:
        model_name: 'KNNBasic', 'KNNWithMeans', 'KNNWithZScore' or
            'KNNBaseline'.
        sim_options: similarity options passed to the algorithm.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names
            (previously this surfaced as an unbound-local error at the
            return statement).
    """
    if model_name == 'KNNBasic':
        model = KNNBasic(sim_options=sim_options, verbose=False)
    elif model_name == 'KNNWithMeans':
        model = KNNWithMeans(sim_options=sim_options, verbose=False)
    elif model_name == 'KNNWithZScore':
        model = KNNWithZScore(sim_options=sim_options, verbose=False)
    elif model_name == 'KNNBaseline':
        model = KNNBaseline(sim_options=sim_options, verbose=False)
    else:
        raise ValueError('Unknown model name: {!r}'.format(model_name))
    return model
Ejemplo n.º 20
0
def computeKNNZScoreMovie(data, test_np):
    """Predict test ratings with an item-based k-NN with z-score model.

    The model is fitted on the train set obtained from ``dataTrainSurprise``
    with these settings:
        - Similarity function : Pearson baseline, item based
        - Number of closest neighbors : 108

    data : data frame which represents the train set
    test_np : data frame on which the prediction will be returned

    return : the test frame from ``dataTrainSurprise`` with a prediction
        column named 'knnzscore_item_rating'
    """
    trainset, test = dataTrainSurprise(data, test_np)

    item_pearson_baseline = {'name': 'pearson_baseline', 'user_based': False}
    knnz_algo = KNNWithZScore(k=108, sim_options=item_pearson_baseline).fit(trainset)

    def estimated_rating(row):
        # Field 3 of a prediction is the estimated rating.
        prediction = knnz_algo.predict(row['user_id'], row['movie_id'])
        return prediction[3]

    id_columns = test[['user_id', 'movie_id']]
    test['knnzscore_item_rating'] = id_columns.apply(estimated_rating, axis=1)

    return test
Ejemplo n.º 21
0
def get_model_old(model_name):
    """Build an algorithm from one of the legacy model names.

    Exact names are tried first: KNN variants with a ``_U`` (user-based) or
    ``_I`` (item-based) suffix and k=20, and the bare 'SVDpp', 'SVD' and
    'NMF'. Parameterised names follow: ``NMF_<n>``, ``SVDpp_<n>`` and
    ``SVD_<n>`` set n_factors; ``KNNBasic_U_<k>`` and ``KNNBasic_I_<k>`` set k.

    Returns:
        The algorithm instance, or ``None`` if the name is not recognised.
    """
    # Zero-argument builders, so only the selected algorithm is constructed.
    exact_builders = {
        'KNNBasic_U': lambda: KNNBasic(sim_options={'user_based': True}, k=20),
        'KNNBasic_I': lambda: KNNBasic(sim_options={'user_based': False}, k=20),
        'KNNWithMeans_I': lambda: KNNWithMeans(sim_options={'user_based': False}, k=20),
        'KNNWithMeans_U': lambda: KNNWithMeans(sim_options={'user_based': True}, k=20),
        'KNNWithZScore_I': lambda: KNNWithZScore(sim_options={'user_based': False}, k=20),
        'KNNWithZScore_U': lambda: KNNWithZScore(sim_options={'user_based': True}, k=20),
        'SVDpp': lambda: SVDpp(),
        'SVD': lambda: SVD(),
        'NMF': lambda: NMF(),
    }
    build = exact_builders.get(model_name)
    if build is not None:
        return build()

    # Parameterised names, checked in the same order as before.
    if 'NMF_' in model_name:
        n_factors = int(model_name.split("_")[1])
        return NMF(n_factors=n_factors)
    if 'SVDpp_' in model_name:
        n_factors = int(model_name.split("_")[1])
        return SVDpp(n_factors=n_factors)
    if 'SVD_' in model_name:
        n_factors = int(model_name.split("_")[1])
        return SVD(n_factors=n_factors)
    if 'KNNBasic_U_' in model_name:
        k = int(model_name.split("_")[-1])
        return KNNBasic(sim_options={'user_based': True}, k=k)
    if 'KNNBasic_I_' in model_name:
        k = int(model_name.split("_")[-1])
        return KNNBasic(sim_options={'user_based': False}, k=k)

    return None
Ejemplo n.º 22
0
class KNN_Normalized(BaseSurpriseSTLEstimator):
    """Estimator wrapping Surprise's KNNWithZScore with a tunable ``k``."""

    def __init__(self, k, name='KNN_Normalized'):
        super().__init__(name, 'non_feature_based')
        self.k = k
        self.model = self._build_model()

    def _build_model(self):
        """Create a non-verbose KNNWithZScore using the current ``self.k``."""
        return KNNWithZScore(k=self.k, verbose=False)

    def _fit(self, x):
        """Fit the wrapped model on ``x``."""
        self.model.fit(x)

    def _predict(self, x):
        """Return the wrapped model's ``test`` output for ``x``."""
        return self.model.test(x)

    def get_hyper_params(self):
        """Return the search space: integer ``k`` with values [2, 13]."""
        hparams = {'k': {'type': 'integer', 'values': [2, 13]}}
        return hparams

    def set_hyper_params(self, **kwargs):
        """Set ``k`` and rebuild the wrapped model so the new value is used.

        Previously only ``self.k`` changed while the model kept the ``k`` it
        was constructed with. The rebuilt model is unfitted and must be fitted
        again before predicting.
        """
        self.k = kwargs['k']
        self.model = self._build_model()

    def similarity_matrix(self):
        """Return the result of the wrapped model's ``compute_similarities()``."""
        return self.model.compute_similarities()
Ejemplo n.º 23
0
def generate_svd_recommendation_df() -> pd.DataFrame:
    """Fit SVD on the score data, return its top-5 output and benchmark other algorithms.

    SVD and NMF are each cross-validated (5 folds, RMSE and MAE, verbose),
    fitted on the full trainset and evaluated on its anti-testset. A further
    set of algorithms is then cross-validated for comparison; those results
    are not used.

    Returns:
        The value of ``get_top_n(predictions, n=5)`` for the SVD predictions.
        Previously the function returned nothing, and the SVD result was
        overwritten by the NMF one.
        NOTE(review): presumably a DataFrame, as annotated — confirm against
        ``get_top_n``.
    """
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)

    # --- SVD ---
    algo = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fit on the full trainset, then compute RMSE on the anti-testset
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    accuracy.rmse(predictions)
    recommendation_df_svd = get_top_n(predictions, n=5)

    # --- NMF ---
    # (An extra silent NMF cross-validation that used to run here was an exact
    # duplicate of the one in the comparison section below and was dropped.)
    algo = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fit on the full trainset, then compute RMSE and MAE on the anti-testset
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Kept under its own name so it no longer replaces the SVD result.
    recommendation_df_nmf = get_top_n(predictions, n=5)

    #---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    knnbasic_cv = cross_validate(KNNBasic(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnmeans_cv = cross_validate(KNNWithMeans(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnz_cv = cross_validate(KNNWithZScore(), svd_data, cv=5, n_jobs=5, verbose=False)

    # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(SVD(), svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(SVDpp(), svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)

    # Other Collaborative Filtering Algorithms
    slope_cv = cross_validate(SlopeOne(), svd_data, cv=5, n_jobs=5, verbose=False)
    coclus_cv = cross_validate(CoClustering(), svd_data, cv=5, n_jobs=5, verbose=False)

    return recommendation_df_svd
Ejemplo n.º 24
0
    def EvaluateAllModels(self):
        """
        Cross-validate all algorithms on ``self.data`` and rank them by RMSE.

        Example output:

                         test_rmse   fit_time  test_time
        Algorithm
        SVDpp             0.965824   9.401286   0.151476
        SVD               0.967286   1.474139   0.062471
        BaselineOnly      0.972408   0.108964   0.057277
        NMF               0.992677   4.073005   0.171846
        KNNWithZScore     1.001898   0.620192   0.083341
        KNNWithMeans      1.002924   0.489803   0.078121
        SlopeOne          1.006664  19.091191   1.275676
        KNNBaseline       1.007437   0.890452   0.088495
        KNNBasic          1.016717   0.432159   0.072929
        NormalPredictor   1.253265   0.041646   0.078105
        CoClustering      1.828291   3.020921   0.052071
        :return: the results table sorted by ascending test_rmse; the
            algorithm with the lowest test_rmse is the one to pick.
        """
        benchmark = []
        # Iterate over all algorithms
        for algorithm in [
                SVD(),
                SVDpp(),
                SlopeOne(),
                NMF(),
                NormalPredictor(),
                KNNBaseline(),
                KNNBasic(),
                KNNWithMeans(),
                KNNWithZScore(),
                BaselineOnly(),
                CoClustering()
        ]:
            # Perform cross validation
            results = cross_validate(algorithm,
                                     self.data,
                                     measures=['RMSE'],
                                     cv=3,
                                     verbose=False)

            # Get results & append algorithm name.
            # Series.append was removed in pandas 2.0; pd.concat builds the
            # same row on old and new pandas alike.
            fold_means = pd.DataFrame.from_dict(results).mean(axis=0)
            label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
            benchmark.append(pd.concat([fold_means, label]))

        result = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
            'test_rmse')
        print(result)

        return result
Ejemplo n.º 25
0
    def calculateRMSE(self, method=9, similarityMeasure=1, isUserBased="Yes"):
        """Train the selected algorithm on the stored ratings and return its test RMSE.

        Ratings are read from the ``ratings`` table of ``DATABASE_NAME``,
        split 80/20 into train and test sets, and the chosen algorithm is
        fitted on the train part.

        Args:
            method: 1 SVD, 2 SlopeOne, 3 NMF, 4 NormalPredictor,
                5 KNNBaseline, 6 KNNBasic, 7 KNNWithMeans, 8 KNNWithZScore,
                9 BaselineOnly, anything else CoClustering.
            similarityMeasure: 1 cosine, 2 pearson, anything else
                pearson_baseline. Passed only to the KNN methods (5-8).
            isUserBased: "Yes" for user-based similarity, anything else for
                item-based.

        Returns:
            RMSE on the test split, rounded to 4 decimals.
        """
        conn = sqlite3.connect(DATABASE_NAME)
        try:
            df = pd.read_sql_query(
                "SELECT userID, glassID, relativeRating FROM ratings", conn)
        finally:
            # Closed as soon as the data is loaded, even if the query fails,
            # so the connection is neither leaked nor held during training.
            conn.close()

        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(
            df[['userID', 'glassID', 'relativeRating']], reader)

        trainset, testset = train_test_split(data, test_size=.20)

        if similarityMeasure == 1:
            similarity_name = "cosine"
        elif similarityMeasure == 2:
            similarity_name = "pearson"
        else:
            similarity_name = "pearson_baseline"

        sim_options = {
            'name': similarity_name,
            'user_based': isUserBased == "Yes"
        }

        if method == 1:
            algo = SVD()
        elif method == 2:
            algo = SlopeOne()
        elif method == 3:
            algo = NMF()
        elif method == 4:
            algo = NormalPredictor()
        elif method == 5:
            algo = KNNBaseline(sim_options=sim_options)
        elif method == 6:
            algo = KNNBasic(sim_options=sim_options)
        elif method == 7:
            algo = KNNWithMeans(sim_options=sim_options)
        elif method == 8:
            algo = KNNWithZScore(sim_options=sim_options)
        elif method == 9:
            algo = BaselineOnly()
        else:
            algo = CoClustering()

        algo.fit(trainset)
        predictions = algo.test(testset)

        return round(accuracy.rmse(predictions, verbose=False), 4)
Ejemplo n.º 26
0
 def __init__(self, modelName, dataPath):
     """Create all supported models, select ``modelName`` and load the data.

     ``self.modelDict`` maps each supported name to a default-constructed
     algorithm. ``self.model`` is the entry for ``modelName``; an unknown
     name raises KeyError. ``dataPath`` is user-expanded before being passed
     to ``self.loadData``.
     """
     supported = (
         ("KNNBasic", KNNBasic),
         ("KNNWithMeans", KNNWithMeans),
         ("KNNWithZScore", KNNWithZScore),
         ("SVD", SVD),
         ("SVDpp", SVDpp),
         ("NMF", NMF),
         ("SlopeOne", SlopeOne),
         ("CoClustering", CoClustering),
     )
     self.modelDict = {label: algo_class() for label, algo_class in supported}
     self.trainset = None
     self.testset = None
     self.data = None
     self.model = self.modelDict[modelName]
     expanded_path = os.path.expanduser(dataPath)
     self.loadData(expanded_path)
Ejemplo n.º 27
0
def knnz_running_time(data):
    '''
        Calculates the running times for training and predictions for KNN with Z-score

        The parameters are tuned once by grid search on ``data[3]`` and then
        reused for every dataset in ``data``.

        Args:
            data(Dataset): a list of datasets with different numbers of users

        Returns:
            elapsedtime_KnnZtrain: running time for training (building the
                full trainset plus fitting), one entry per dataset
            elapsedtime_KnnZtest: running time for predictions on the
                anti-testset, one entry per dataset
    '''
    elapsedtime_KnnZtrain = []
    elapsedtime_KnnZtest = []

    # tune the parameters on the entire data
    param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False]
        }
    }
    grid_search = GridSearch(KNNWithZScore,
                             param_grid,
                             measures=['RMSE'],
                             verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    k = param['k']
    tuned_sim_options = param['sim_options']

    # using the tuned parameters calculate running times
    for dataset in data:
        # training running time
        training_start = time.time()
        training = dataset.build_full_trainset()
        # Surprise reads the similarity settings only from the sim_options
        # dict; passing name/min_support/user_based as separate keyword
        # arguments leaves the model on its default similarity. Each model
        # gets its own copy of the tuned options.
        knnz = KNNWithZScore(k=k, sim_options=dict(tuned_sim_options))
        knnz.train(training)
        elapsedtime_KnnZtrain.append(time.time() - training_start)

        # Built outside both timed sections: it is needed only for the
        # predictions and is not part of training.
        testing = training.build_anti_testset()

        # prediction running time
        test_start = time.time()
        knnz.test(testing)
        elapsedtime_KnnZtest.append(time.time() - test_start)
    return elapsedtime_KnnZtrain, elapsedtime_KnnZtest
Ejemplo n.º 28
0
def main():
    """Evaluate a fixed set of Surprise algorithms on the filtered book ratings.

    Reads ``../../data/processed/filtered_ratings.csv``, builds a Surprise
    dataset from its ``user_id``, ``book_id`` and ``rating`` columns on a 1-5
    rating scale, holds out 20% of the ratings as a test set and runs
    ``basic_model_based`` once per algorithm, printing the last three values
    it returns (unpacked here as precision, recall and F1) followed by a
    separator line.
    """
    book_df = pd.read_csv("../../data/processed/filtered_ratings.csv")
    # 'Unnamed: 0' is presumably an index column saved with the CSV -- it is
    # not a rating feature, so drop it before loading.
    book_df = book_df.drop('Unnamed: 0', axis=1)

    # Reader object and rating scale specification
    reader = Reader(rating_scale=(1, 5))
    # Load data
    data = Dataset.load_from_df(book_df[["user_id", "book_id", "rating"]],
                                reader)

    # Split data into train and test sets
    train_set, test_set = train_test_split(data, test_size=0.20)

    # Every KNN variant shares the same neighbourhood size and similarity
    # settings; each gets its own sim_options object.
    knn_variants = [KNNWithZScore, KNNWithMeans, KNNBaseline, KNNBasic]
    algorithm_list = (
        [NormalPredictor(), BaselineOnly()]
        + [
            knn_cls(k=10, sim_options=similarity_measure('pearson', 1))
            for knn_cls in knn_variants
        ]
        + [SVDpp(), SVD(), NMF()]
    )

    separator = "**------------------------------------------------------------------------------------------**"
    for algo in algorithm_list:
        # The first returned value (RMSE) is not reported here.
        _rmse, preci, recall, f1 = basic_model_based(train_set, test_set, algo)
        print("Algorithm:", algo, preci, recall, f1)
        print(separator)
Ejemplo n.º 29
0
    def checkBestAlgorithm(self):
        """Cross-validate a set of Surprise algorithms and report the best one.

        Loads the ratings from ``csv_name`` into ``self.df``, runs a 3-fold
        cross-validation (RMSE only) for every candidate algorithm, prints a
        benchmark table sorted by mean test RMSE and prints the name of the
        algorithm with the lowest mean test RMSE.

        Returns:
            tuple: ``(algorithm_instance, mean_test_rmse)`` of the best
            algorithm.
        """
        self.df = pd.read_csv(csv_name)
        reader = Reader(rating_scale=(1, 10))
        data = Dataset.load_from_df(self.df[['user_id', 'item_id', 'rating']],
                                    reader)
        benchmark = []
        rmseTuple = []
        # Put every algorithm in one iterable so they all run through the
        # same evaluation loop.
        for algorithm in [
                SVD(),
                SVDpp(),
                SlopeOne(),
                NormalPredictor(),
                KNNBaseline(),
                KNNBasic(),
                KNNWithMeans(),
                KNNWithZScore(),
                BaselineOnly(),
                CoClustering()
        ]:
            # Cross-validation step.
            results = cross_validate(algorithm,
                                     data,
                                     measures=['RMSE'],
                                     cv=3,
                                     verbose=False)

            # Average the per-fold results and tag the row with the
            # algorithm's class name.
            summary = pd.DataFrame.from_dict(results).mean(axis=0)
            rmseTuple.append((algorithm, summary['test_rmse']))
            # pd.concat replaces Series.append, which was removed in pandas 2.0.
            label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
            benchmark.append(pd.concat([summary, label]))
        print(
            pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
                'test_rmse'))
        print("\n")
        # Stable sort: ties keep the order in which the algorithms were run.
        rmseTuple.sort(key=lambda x: x[1])
        best = rmseTuple[0]

        print("Best algorithm : ")
        # Class name taken directly instead of parsing the tuple's repr text.
        print(type(best[0]).__name__)
        return best
Ejemplo n.º 30
0
def collab_recommender(train_data,
                       test_data,
                       user_based=True,
                       normalization=False,
                       k=100,
                       sim='cosine'):
    """
    Predict ratings for the (user, movie) pairs in test_data with a KNN
    collaborative-filtering model fitted on all of train_data.

    Input:
    - train_data: dataframe, n*3, columns are ['userid','movieid','rating']
    - test_data: dataframe, n*2, columns are ['userid', 'movieid']
    - user_based: boolean, use user-based knn algorithm if True, use item-based knn algorithm if False
    - normalization: boolean, use KNNWithZScore (z-score normalization) if True,
      KNNWithMeans otherwise
    - k: int, number of nearest neighbors
    - sim: string, define the similarity matrix from ['cosine', 'pearson', 'msd', 'pearson_baseline']

    Output:
    - pred_rating: dataframe, n*2, columns are ['movieid', 'rating'], one row
      per row of test_data, where 'rating' is the estimated rating

    Raises:
    - ValueError: any ValueError raised inside the body is logged and replaced
      by a new, message-less ValueError
    """

    try:
        function_log.trace('Start collaborative recommendation function')

        # Ratings are declared to lie on a 1-5 scale.
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(train_data, reader)

        sim_options = {'name': sim, 'user_based': user_based}

        # Choose the z-score-normalised variant or the mean-centred one.
        if normalization:
            algo = KNNWithZScore(k=k, sim_options=sim_options, verbose=False)
        else:
            algo = KNNWithMeans(k=k, sim_options=sim_options, verbose=False)

        # No hold-out: the model is fitted on every rating in train_data.
        train_set = data.build_full_trainset()
        algo.fit(train_set)

        # Collect one estimated rating per test row, in test_data's index order.
        pred_rating = {'movieid': [], 'rating': []}
        for idx in test_data.index:
            pred_rating['movieid'].append(test_data.loc[idx, 'movieid'])
            pred = algo.predict(test_data.loc[idx, 'userid'],
                                test_data.loc[idx, 'movieid'])
            pred_rating['rating'].append(pred.est)
        function_log.trace('Finish collaborative recommendation function')
        return pd.DataFrame(pred_rating)
    except ValueError:
        # NOTE(review): the log message assumes the cause is missing input
        # data, but this handler catches every ValueError raised above, and
        # the bare `raise ValueError` drops the original message.
        function_log.warn("Training and test data cannot be none.")
        raise ValueError
    except Exception as x:
        # NOTE(review): no re-raise is visible after this log call, so the
        # function appears to fall through and return None on any other
        # error -- confirm this is intended.
        function_log.exception(
            f'collaborative recommendation function failed {x}')