Example #1
def test_wrong_file_name():
    """Ensure file names are checked when creating a (custom) Dataset."""
    wrong_files = [('does_not_exist', 'does_not_either')]

    with pytest.raises(ValueError):
        Dataset.load_from_folds(folds_files=wrong_files, reader=Reader(),
                                rating_scale=(1, 5))
Example #2
def test_deprecated_way():
    """Test all Dataset constructors without passing rating_scale as a
    parameter. Make sure we revert back to the Reader object, with a warning
    message.

    Also, make sure ValueError is raised if reader has no rating_scale in this
    context.

    Not using dataset fixtures here for more control.
    """

    # test load_from_file
    toy_data_path = (os.path.dirname(os.path.realpath(__file__)) +
                     '/custom_dataset')
    with pytest.warns(UserWarning):
        reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                        rating_scale=(1, 5))
        data = Dataset.load_from_file(file_path=toy_data_path,
                                      reader=reader)

    with pytest.raises(ValueError):
        reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                        rating_scale=None)
        data = Dataset.load_from_file(file_path=toy_data_path,
                                      reader=reader)

    # test load_from_folds
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    with pytest.warns(UserWarning):
        reader = Reader(line_format='user item rating timestamp', sep='\t',
                        rating_scale=(1, 5))
        data = Dataset.load_from_folds([(train_file, test_file)], reader=reader)
    with pytest.raises(ValueError):
        reader = Reader(line_format='user item rating timestamp', sep='\t',
                        rating_scale=None)
        data = Dataset.load_from_folds([(train_file, test_file)],
                                       reader=reader)
    # test load_from_df
    ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                    'userID': [9, 32, 2, 45, '10000'],
                    'rating': [3, 2, 4, 3, 1]}
    df = pd.DataFrame(ratings_dict)

    with pytest.warns(UserWarning):
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                    reader=reader)
    with pytest.raises(ValueError):
        reader = Reader(rating_scale=None)
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],  # noqa
                                    reader=reader)
Example #3
def test_cross_validate(toy_data):

    # First test with a specified CV iterator.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader,
                                   rating_scale=(1, 5))

    algo = NormalPredictor()
    pkf = ms.PredefinedKFold()
    ret = ms.cross_validate(algo, data, measures=['rmse', 'mae'], cv=pkf,
                            verbose=1)
    # Basically just test that keys (dont) exist as they should
    assert len(ret['test_rmse']) == 1
    assert len(ret['test_mae']) == 1
    assert len(ret['fit_time']) == 1
    assert len(ret['test_time']) == 1
    assert 'test_fcp' not in ret
    assert 'train_rmse' not in ret
    assert 'train_mae' not in ret

    # Test that 5 fold CV is used when cv=None
    # Also check that train_* key exist when return_train_measures is True.
    ret = ms.cross_validate(algo, toy_data, measures=['rmse', 'mae'], cv=None,
                            return_train_measures=True, verbose=True)
    assert len(ret['test_rmse']) == 5
    assert len(ret['test_mae']) == 5
    assert len(ret['fit_time']) == 5
    assert len(ret['test_time']) == 5
    assert len(ret['train_rmse']) == 5
    assert len(ret['train_mae']) == 5
Example #4
def u1_ml100k():
    """Return a Dataset object that contains 10% of the u1 fold from movielens
    100k. Trainset has 8000 ratings and testset has 2000.
    """
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'), rating_scale=(1, 5))

    return data
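Note: in the Surprise test suite this helper is used as a pytest fixture, so it would normally carry the decorator (an assumption about the surrounding context, not shown in the snippet):

@pytest.fixture
def u1_ml100k():
    ...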
Example #5
    def get_rating_predictions(self, test_set, cluster_user_mapping=None):
        self.test_set = test_set
        test_path_tmp = "..\\resources\\tmp\\test_file.csv"
        train_path_tmp = "..\\resources\\tmp\\train_file.csv"

        self.train_set.to_csv(train_path_tmp, index=False, header=False)
        self.test_set.to_csv(test_path_tmp, index=False, header=False)

        fold_files = [(train_path_tmp, test_path_tmp)]
        reader = Reader(rating_scale=(1, 10),
                        line_format='user item rating',
                        sep=',')
        data = Dataset.load_from_folds(fold_files, reader=reader)

        for trainset, testset in PredefinedKFold().split(data):

            if cluster_user_mapping is None:
                self.method.fit(trainset)
            else:
                # NOTE: this per-cluster branch was left unfinished in the
                # original code: the groupby column is missing and the
                # per-cluster fit loop is commented out.
                df_users_in_clusters = pd.DataFrame.from_dict(
                    cluster_user_mapping)
                df_cluster_users = df_users_in_clusters.groupby('')  # FIXME: missing column name
                # Distinct clusters:
                clusters = list(set(cluster_user_mapping.values()))

                # for cluster in clusters:
                #     cluster_train_data = trainset[trainset.userID.isin(...)]
                pass

        # DataFrame.append was removed in pandas 2.0; collect rows in a list
        # and build the frame once instead.
        rows = []

        pbar = tqdm(total=len(self.test_set.index))

        for key, val in self.test_set.iterrows():
            prediction = self.method.predict(str(val.userID),
                                             str(val.itemID),
                                             clip=False)
            rows.append({
                "userID": int(val.userID),
                "itemID": int(val.itemID),
                "real": int(val.rating),
                "est": int(prediction.est),
            })
            pbar.update(1)
        pbar.close()
        return pd.DataFrame(rows, columns=['userID', 'itemID', 'real', 'est'])
Example #6
    def predict_rating_split_by_time(self, files_pair, algo_test):

        algo = algo_test[0]

        use_auto_parse = algo_test[1]
        if use_auto_parse:
            fold_files = [files_pair]
            reader = Reader(rating_scale=(1, 10),
                            line_format='user item rating',
                            sep=',')
            data = Dataset.load_from_folds(fold_files, reader=reader)

            for trainset, testset in PredefinedKFold().split(data):
                algo.fit(trainset)
                predictions = algo.test(testset)
                rmse = accuracy.rmse(predictions, verbose=False)
                return rmse
        else:

            # Prepare dataset

            train_set = pd.read_csv(files_pair[0], parse_dates=[3])
            test_set = pd.read_csv(files_pair[1], parse_dates=[3])

            item_to_id_mapping = {}
            user_to_id_mapping = {}

            item_index = 0
            user_index = 0
            all_sets = pd.concat([train_set, test_set])
            for item in all_sets['itemID']:
                if item not in item_to_id_mapping:
                    item_to_id_mapping[item] = item_index
                    item_index += 1
            for user in all_sets['userID']:
                if user not in user_to_id_mapping:
                    user_to_id_mapping[user] = user_index
                    user_index += 1

            train_set['itemID'] = train_set['itemID'].map(item_to_id_mapping)
            test_set['itemID'] = test_set['itemID'].map(item_to_id_mapping)
            train_set['userID'] = train_set['userID'].map(user_to_id_mapping)
            test_set['userID'] = test_set['userID'].map(user_to_id_mapping)

            algo.fit(train_set)
            rec_list = algo.get_top_n_recommendations(test_set)
            return rec_list
Example #7
    def get_top_n_recommendations(self, test_set, top_n):
        self.test_set = test_set

        test_path_tmp = "..\\resources\\tmp\\test_file.csv"
        train_path_tmp = "..\\resources\\tmp\\train_file.csv"

        self.train_set.to_csv(train_path_tmp, index=False, header=False)
        self.test_set.to_csv(test_path_tmp, index=False, header=False)

        fold_files = [(train_path_tmp, test_path_tmp)]
        reader = Reader(rating_scale=(1, 10),
                        line_format='user item rating',
                        sep=',')
        data = Dataset.load_from_folds(fold_files, reader=reader)

        for trainset, testset in PredefinedKFold().split(data):
            self.method.fit(trainset)

        already_ranked_items_by_users = self.train_set.groupby(
            'userID')['itemID'].apply(list)

        recommendations = {}
        pbar = tqdm(total=len(self.test_set.userID.unique()))
        for userID in self.test_set.userID.unique():
            pbar.update(1)

            if userID not in self.train_set.userID.unique():
                recommendations[str(userID)] = []
                continue

            items_expected_ranking = {}
            for itemID in self.train_set.itemID.unique():
                if itemID in already_ranked_items_by_users[userID]:
                    continue
                # Calc prediction for item for user
                predicted = self.method.predict(str(userID),
                                                str(itemID),
                                                clip=False)
                items_expected_ranking[itemID] = predicted.est
            sorted_predictions = sorted(items_expected_ranking.items(),
                                        key=operator.itemgetter(1))
            sorted_predictions.reverse()
            sorted_predictions = [str(x[0]) for x in sorted_predictions]
            user_recommendations = sorted_predictions[:top_n]
            recommendations[str(userID)] = user_recommendations
        pbar.close()
        return recommendations
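Note: Examples #5-#7 are methods of the same recommender wrapper class and rely on imports that are not shown. Presumably something like the following (an assumption based on the names used):

import operator

import pandas as pd
from tqdm import tqdm
from surprise import Dataset, Reader
from surprise.model_selection import PredefinedKFold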
Example #8
def test_cross_validate():

    # First test with a specified CV iterator.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating',
                    sep=' ',
                    skip_lines=3,
                    rating_scale=(1, 5))
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    algo = NormalPredictor()
    pkf = ms.PredefinedKFold()
    ret = ms.cross_validate(algo,
                            data,
                            measures=[['neg_rmse', neg_rmse],
                                      ['neg_mae', neg_mae]],
                            cv=pkf,
                            verbose=1)
    # Basically just test that keys (dont) exist as they should
    assert len(ret['test_neg_rmse']) == 1
    assert len(ret['test_neg_mae']) == 1
    assert len(ret['fit_time']) == 1
    assert len(ret['test_time']) == 1
    assert 'test_fcp' not in ret
    assert 'train_neg_rmse' not in ret
    assert 'train_neg_mae' not in ret

    # Test that 5 fold CV is used when cv=None
    # Also check that train_* key exist when return_train_measures is True.
    data = Dataset.load_from_file(current_dir + '/custom_dataset', reader)
    ret = ms.cross_validate(algo,
                            data,
                            measures=[['neg_rmse', neg_rmse],
                                      ['neg_mae', neg_mae]],
                            cv=None,
                            return_train_measures=True,
                            verbose=True)
    assert len(ret['test_neg_rmse']) == 5
    assert len(ret['test_neg_mae']) == 5
    assert len(ret['fit_time']) == 5
    assert len(ret['test_time']) == 5
    assert len(ret['train_neg_rmse']) == 5
    assert len(ret['train_neg_mae']) == 5
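Note: Example #8 passes two custom measure callables, neg_rmse and neg_mae, that are not defined in the snippet. A minimal sketch, assuming each measure receives the list of predictions returned by algo.test() and returns a float (the signature is inferred, not confirmed by the source):

from surprise import accuracy

def neg_rmse(predictions):
    # Negated RMSE, so that higher values mean better performance.
    return -accuracy.rmse(predictions, verbose=False)

def neg_mae(predictions):
    # Negated MAE, same convention.
    return -accuracy.mae(predictions, verbose=False)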
Example #9
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    for trainset, testset in data.folds():
        pass  # just need trainset and testset to be set

    # test rm:
    rm = trainset.rm
    assert rm[0, 0] == 4
    assert rm[1, 0] == 4
    assert rm[3, 1] == 5
    assert rm[40, 20000] == 0  # not in the trainset

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, r_min, r_max
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.r_min == 1
    assert trainset.r_max == 5

    # test raw2inner: ensure inner ids are given in proper order
    raw2inner_id_users = trainset._raw2inner_id_users
    for i in range(4):
        assert raw2inner_id_users['user' + str(i)] == i

    raw2inner_id_items = trainset._raw2inner_id_items
    for i in range(2):
        assert raw2inner_id_items['item' + str(i)] == i
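Note: this example (and several later ones that parse the same custom_train/custom_test files) references a module-level reader that is not shown. Judging from the sibling examples, it is presumably defined along these lines (an assumption, not part of the original snippet):

reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                rating_scale=(1, 5))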
Example #10
def test_PredefinedKFold():

    reader = Reader(line_format='user item rating',
                    sep=' ',
                    skip_lines=3,
                    rating_scale=(1, 5))

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    # Make sure rating files are read correctly
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3
Example #11
def test_PredefinedKFold(toy_data_reader):

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader, rating_scale=(1, 5))

    # Make sure rating files are read correctly
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3

    # Make sure pkf returns the same folds as the deprecated data.folds()
    with pytest.warns(UserWarning):
        trainset_, testset_ = next(data.folds())
    assert testset_ == testset
Example #12
def surprise_SVDpp(train_file, test_file):
    """
    Svd++ with Surprise library.
    Compute the predictions on a test set after training on a train set using the SVD++ method from Surprise.
    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        n_factors : The number of factors.
        n_epochs : The number of iterations of the SGD procedure
        lr_'x' : The learning rate for 'x'
        reg_'x' : The regularization term for 'x'
    'x':
        bi : The item biases
        bu : The user biases
        qi : The item factors
        yj : The (implicit) item factors
        pu : The user factors


    Returns:
        numpy array: predictions
    """
    print("SVDpp")
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    # Algorithm

    algo = SVDpp(n_epochs=40, n_factors=100, lr_all=0.01, reg_all=0.01)
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        val = predictions[i].est
        pred[i] = val
    return pred
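A hypothetical call, assuming comma-separated user,item,rating files (the paths are placeholders):

predictions = surprise_SVDpp('train.csv', 'test.csv')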
Example #13
def test_performances():
    """Test the returned dict. Also do dumping."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader,
                                   rating_scale=(1, 5))

    algo = NormalPredictor()
    tmp_dir = tempfile.mkdtemp()  # create tmp dir
    with pytest.warns(UserWarning):
        performances = evaluate(algo, data, measures=['RmSe', 'Mae'],
                                with_dump=True, dump_dir=tmp_dir, verbose=2)
    shutil.rmtree(tmp_dir)  # remove tmp dir

    assert performances['RMSE'] is performances['rmse']
    assert performances['MaE'] is performances['mae']
Example #14
def test_PredefinedKFold(toy_data_reader):

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader,
                                   rating_scale=(1, 5))

    # Make sure rating files are read correctly
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3

    # Make sure pkf returns the same folds as the deprecated data.folds()
    with pytest.warns(UserWarning):
        trainset_, testset_ = next(data.folds())
    assert testset_ == testset
Example #15
def test_gridsearchcv_best_estimator():
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)"""

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))

    param_grid = {'n_epochs': [5], 'lr_all': [0.002, 0.005],
                  'reg_all': [0.4, 0.6], 'n_factors': [1], 'init_std_dev': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['mae'],
                      cv=PredefinedKFold(), joblib_verbose=100)
    gs.fit(data)
    best_estimator = gs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator, data, measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == gs.best_score['mae']
Example #16
def load_data(file_dict, dataformat):  # load data
    if dataformat == "builtin":
        data = Dataset.load_builtin(name=file_dict["name"], prompt=True)
    elif dataformat == "file":
        reader = Reader(line_format=file_dict["line_format"], sep=file_dict.get("sep", None),
                        rating_scale=file_dict.get("rating_scale", (1, 5)),
                        skip_lines=file_dict.get("skip_lines", 0))
        data = Dataset.load_from_file(file_path=file_dict["file_path"], reader=reader)
    elif dataformat == "dataframe":
        reader = Reader(rating_scale=file_dict.get("rating_scale", (1, 5)))
        data = Dataset.load_from_df(df=file_dict["df"][file_dict["header"]], reader=reader)
    elif dataformat == "folds":   # files already split into k folds
        files_dir = os.path.expanduser(file_dict["file_dir"])
        reader = Reader(name=file_dict["name"])
        train_file = files_dir + file_dict["train_name"]
        test_file = files_dir + file_dict["test_name"]
        folds_files = [(train_file % i, test_file % i) for i in file_dict["file_num"]]
        print(folds_files)
        data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)
    else:
        raise ValueError("dataformat is outside the range of file types that can be handled")
    return data
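A hypothetical call for the "folds" branch, assuming ml-100k style fold files (the keys mirror the ones read inside load_data; the paths are placeholders):

file_dict = {
    "name": "ml-100k",
    "file_dir": "~/.surprise_data/ml-100k/ml-100k/",
    "train_name": "u%d.base",
    "test_name": "u%d.test",
    "file_num": (1, 2, 3, 4, 5),
}
data = load_data(file_dict, dataformat="folds")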
Example #17
def test_knns():
    """Ensure the k and min_k parameters are effective for knn algorithms."""

    # the test and train files are from the ml-100k dataset (10% of u1.base and
    # 10 % of u1.test)
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))

    # Actually, as KNNWithMeans and KNNBaseline have back up solutions for when
    # there are not enough neighbors, we can't really test them...
    klasses = (KNNBasic, )  # KNNWithMeans, KNNBaseline)

    k, min_k = 20, 5
    for klass in klasses:
        algo = klass(k=k, min_k=min_k)
        for trainset, testset in data.folds():
            algo.fit(trainset)
            predictions = algo.test(testset)
            for pred in predictions:
                if not pred.details['was_impossible']:
                    assert min_k <= pred.details['actual_k'] <= k
Example #18
def func7():
    import os
    from surprise import SVD
    from surprise import Dataset
    from surprise import Reader
    from surprise import accuracy
    from surprise.model_selection import PredefinedKFold

    files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')
    reader = Reader('ml-100k')

    train_file = files_dir + 'u%d.base'
    test_file = files_dir + 'u%d.test'
    folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()

    algo = SVD()
    for trainset, testset in pkf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        accuracy.rmse(predictions, verbose=True)
Example #19
def basic_rec(model_name, train_path, test_path, target_id):
    # build data
    # TODO check float and min_r
    reader = Reader(line_format='user item rating',
                    sep='\t',
                    rating_scale=(1, 5))
    data = Dataset.load_from_folds([(train_path, test_path)], reader=reader)
    trainset, testset = None, None
    pkf = PredefinedKFold()
    for trainset_, testset_ in pkf.split(data):
        trainset, testset = trainset_, testset_

    # train model
    rec_algo = get_model(model_name)
    rec_algo.fit(trainset)
    # eval
    preds = rec_algo.test(testset)
    rmse = accuracy.rmse(preds, verbose=True)

    # predor target
    fn_pred = lambda uid: rec_algo.predict(str(uid), str(target_id), r_ui=0
                                           ).est
    target_predictions = list(map(fn_pred, range(trainset.n_users)))

    # topn
    testset = trainset.build_anti_testset()
    predictions = rec_algo.test(testset)
    top_n = get_top_n(predictions, n=50)

    hit_ratios = {}
    for uid, user_ratings in top_n.items():
        topN = [int(iid) for (iid, _) in user_ratings]
        hits = [
            1 if target_id in topN[:i] else 0 for i in [1, 3, 5, 10, 20, 50]
        ]
        hit_ratios[int(uid)] = hits
    return target_predictions, hit_ratios
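Note: Example #19 calls a get_top_n helper that is not defined here. A minimal sketch along the lines of the well-known Surprise FAQ recipe (assuming predictions is the list returned by algo.test()):

from collections import defaultdict

def get_top_n(predictions, n=10):
    # Map each user id to their n highest-estimated (item id, est) pairs.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n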
Example #20
for x in ratingRDD.collect():
    ratingMap[(x[0], x[1])] = x[2]
# print("total data: " + str(len(ratingMap)))

# add avg business stars
# avgBusList = list()
# userRatingRDD = userRatingRDD.filter(lambda t: int(preUserMap[t[0]]) % 30 == 0)
# busRatingRDD = busRatingRDD.filter(lambda t: int(preBusinessMap[t[0]]) % 40 == 0)
# print("add user avg count: " + str(userRatingRDD.count()))
# print("add bus avg count: " + str(busRatingRDD.count()))

reader = Reader(line_format='user item rating', sep=",", skip_lines=1)

folds_files = [(trainingFilePath, validationFilePath)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo = SVD()

predictionList = list()
for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    for uid, iid, true_r, est, _ in predictions:
        predictionList.append((uid, iid, est))

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
Example #21
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os
import pytest

from surprise import NMF
from surprise import Dataset
from surprise import Reader
from surprise import evaluate

# the test and train files are from the ml-100k dataset (10% of u1.base and
# 10 % of u1.test)
train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))


def test_NMF_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = NMF(n_factors=1, n_epochs=1)
    rmse_default = evaluate(algo, data, measures=['rmse'])['rmse']

    # n_factors
    algo = NMF(n_factors=2, n_epochs=1)
    rmse_factors = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_factors

    # n_epochs
Example #22
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    for trainset, testset in data.folds():
        pass  # just need trainset and testset to be set

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.train(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.train(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
Example #23
def test_wrong_file_name():
    """Ensure file names are checked when creating a (custom) Dataset."""
    wrong_files = [('does_not_exist', 'does_not_either')]

    with pytest.raises(ValueError):
        Dataset.load_from_folds(folds_files=wrong_files, reader=reader)
Example #24
import os

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader, rating_scale=(1, 5))
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
Example #25
def k_recommend(model, k, testset):

    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    fold_files = [('~/Desktop/Tufts/Fall2018/COMP135/Project3/trainset.csv',
                   '~/Desktop/Tufts/Fall2018/COMP135/Project3/testset.csv')]

    pdkfold = sp.model_selection.split.PredefinedKFold()
    clf = model.best_estimator['mae']
    data = Dataset.load_from_folds(fold_files, reader=reader)

    for train, test in pdkfold.split(data):
        clf.fit(train)
        test1 = train.build_anti_testset()
        preds = clf.test(test1)

    top_n = defaultdict(list)

    for uid, iid, true_r, est, _ in preds:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:k]
    """
    for uid, user_ratings in top_n.items():
        print(uid, [iid for (iid, _) in user_ratings])
    for uid, user_ratings in top_n.items():
        print uid, user_ratings
    """

    for uid in top_n:
        i = 0
        for iid in top_n[uid]:
            found = False
            for iid2 in testset[uid]:
                if iid[0] == str(iid2[0]):
                    a = iid[0]
                    top_n[uid].remove(top_n[uid][i])
                    top_n[uid].insert(i, (a, iid2[1]))
                    found = True
                    i += 1
                    break
            if not found:
                a = iid[0]
                top_n[uid].remove(top_n[uid][i])
                top_n[uid].insert(i, (a, 2))
                i += 1

    total_sum = 0.0
    user_sum = 0.0
    us_rec = []
    for uid in top_n:
        i = 0.0
        for iid in top_n[uid]:
            i += 1.0
            user_sum += iid[1]
        total_sum += float(user_sum / i)
        us_rec.append(user_sum / i)
        user_sum = 0.0

    # print(us_rec)
    print("Average rating:", total_sum / float(len(top_n)))
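Note: Example #25 relies on imports that are not shown; presumably something like (an assumption based on the names used):

import surprise as sp
from collections import defaultdict
from surprise import Dataset, Reader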
Example #26
def test_refit():

    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, Reader('ml-100k'))

    param_grid = {
        'n_epochs': [5],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [2]
    }

    # assert gs.fit() and gs.test will use best estimator for mae (first
    # appearing in measures)
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae', 'rmse'],
                      cv=2,
                      refit=True)
    gs.fit(data)
    gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    mae_preds = gs.best_estimator['mae'].test(
        data.construct_testset(data.raw_ratings))
    assert gs_preds == mae_preds

    # assert gs.fit() and gs.test will use best estimator for rmse
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae', 'rmse'],
                      cv=2,
                      refit='rmse')
    gs.fit(data)
    gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    rmse_preds = gs.best_estimator['rmse'].test(
        data.construct_testset(data.raw_ratings))
    assert gs_preds == rmse_preds
    # test that predict() can be called
    gs.predict(2, 4)

    # assert test() and predict() cannot be used when refit is false
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae', 'rmse'],
                      cv=2,
                      refit=False)
    gs.fit(data)
    with pytest.raises(ValueError):
        gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    with pytest.raises(ValueError):
        gs.predict('1', '2')

    # test that error is raised if used with load_from_folds
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae', 'rmse'],
                      cv=2,
                      refit=True)
    with pytest.raises(ValueError):
        gs.fit(data)
Example #27
"""
Created on Fri Nov  6 09:40:38 2020

@author: sasha
"""
from surprise import Dataset, Reader
from surprise.model_selection import PredefinedKFold
from surprise import accuracy
from lsh_jaccard import lsh_jaccard



train_file_path = "train.csv"
test_file_path = "test.csv"
reader = Reader(line_format='user item rating timestamp', sep=',')
#data = Dataset.load_from_file(train_file_path, reader=reader)
data = Dataset.load_from_folds([(train_file_path, test_file_path)], reader=reader)
pkf = PredefinedKFold()

algo = lsh_jaccard(threshold=0.1)
for trainset, testset in pkf.split(data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
Example #28
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4, None)]
    assert ur[1] == [(0, 4, None), (1, 2, None)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4, None), (1, 4, None), (2, 1, None)]
    assert ir[1] == [(1, 2, None), (2, 1, None), (3, 5, None)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test user features
    u_features = trainset.u_features
    assert u_features[0] == []  # no u_features_df added
    assert u_features[1] == []  # no u_features_df added
    assert u_features[3] == []  # no u_features_df added
    assert u_features[40] == []  # not in trainset and no u_features_df
    assert trainset.user_features_labels == []
    assert trainset.n_user_features == 0

    # test item features
    i_features = trainset.i_features
    assert i_features[0] == []  # no i_features_df added
    assert i_features[1] == []  # no i_features_df added
    assert i_features[20000] == []  # not in trainset and no i_features_df
    assert trainset.item_features_labels == []
    assert trainset.n_item_features == 0

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [], [], 4) in testset
    assert ('user3', 'item1', [], [], 5) in testset
    assert ('user3', 'item1', [], [], 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [], [], trainset.global_mean) not in testset
    assert ('user3', 'item1', [], [], trainset.global_mean) not in testset
    assert ('user0', 'item1', [], [], trainset.global_mean) in testset
    assert ('user3', 'item0', [], [], trainset.global_mean) in testset
Example #29
def run_knn_baseline(sparse_data):
    #filename = "test.json"
    prefix = "knn_baseline_"
    trainFile = prefix + "train.txt"
    testFile = prefix + "test.txt"

    raw_data, userPurchasedSet, userTrueTestSet = preprocess(
        sparse_data, trainFile, testFile)
    folds_files = [(trainFile, testFile)]
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()
    bsl_options = {
        'method': 'sgd',
        'n_epochs': 20,
        'learning_rate': 0.005,
    }
    ### sim name: cosine    msd       pearson     pearson_baseline
    ### user_based : True ---- similarity will be computed based on users
    ###            : False ---- similarity will be computed based on items.
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    predictions = {}
    top_n = {}
    testsSet = None
    total_precisions = 0.0
    total_recalls = 0.0
    total_hit = 0.0
    total_nDCG = 0.0
    total_ffeature = 0.0
    result_file = prefix + "result.txt"
    result_f = open(result_file, "w")
    for trainset, testset in pkf.split(data):
        testsSet = testset

        #algo = SVD(n_factors = 5)
        algo = KNNBaseline(bsl_options=bsl_options, sim_options=sim_options)
        algo.fit(trainset)
        pre = algo.test(testset)
        accuracy.rmse(pre)
        accuracy.mae(pre)
        #calculate_rmse(predictions)

        ### test
        rowNum = raw_data.get_row_size()
        colNum = raw_data.get_col_size()
        cur_time = time.time()
        time_cost = 0

        for i in range(rowNum):
            user = raw_data.get_userID(i)
            predictions[user] = set()
            pq = []
            heapq.heapify(pq)
            for j in range(colNum):
                item = raw_data.get_itemID(j)
                if user not in userPurchasedSet or item in userPurchasedSet[
                        user]:
                    continue
                value = raw_data.get_val(user, item, 'rating')
                predict = algo.predict(user, item, r_ui=0, verbose=False)[3]
                if len(pq) >= 10:
                    heapq.heappop(pq)
                heapq.heappush(pq, (predict, item))
            top_n[user] = set()
            for items in pq:
                top_n[user].add(items[1])
            if user in userTrueTestSet:
                curPrecisions = calculate_precision(top_n[user],
                                                    userTrueTestSet[user])
                curRecalls = calculate_recall(top_n[user],
                                              userTrueTestSet[user])
                ffeature = calculate_f_feature(curPrecisions, curRecalls)
                curHit = isHit(top_n[user], userTrueTestSet[user])
                cur_nDCG = calculate_NDCG(top_n[user], userTrueTestSet[user])
                total_precisions += curPrecisions
                total_recalls += curRecalls
                total_hit += curHit
                total_nDCG += cur_nDCG
                total_ffeature += ffeature
                result_f.write(user + "\t" + str(curPrecisions) + "\t" +
                               str(curRecalls) + "\t" + str(ffeature) + "\t" +
                               str(curHit) + '\t' + str(cur_nDCG) + "\n")
            if i != 0 and i % 1000 == 0:
                duration = (time.time() - cur_time) / 60
                time_cost += duration
                remaining_time = ((rowNum - i) / 1000) * duration
                cur_time = time.time()
                # print('precisions', total_precisions, ' recalls', total_recalls, ' nDCG', total_nDCG)
                print('i:', i, "/", rowNum, 'remaining time:', remaining_time, 'min')
    print('precisions', total_precisions, ' recalls', total_recalls,
          ' hit', total_hit, 'nDCG:', total_nDCG)
    rowNum = raw_data.get_row_size()
    print('avg_precisions:', total_precisions / rowNum,
          'avg_recalls:', total_recalls / rowNum,
          'avg_ffeature', total_ffeature / rowNum,
          'avg_hit:', total_hit / rowNum,
          'avg_nDCG:', total_nDCG / rowNum)
    result_f.write("avg:\t" + str(total_precisions / rowNum) + "\t" +
                   str(total_recalls / rowNum) + "\t" +
                   str(total_ffeature / rowNum) + "\t" +
                   str(total_hit / rowNum) + '\t' + str(total_nDCG / rowNum) +
                   "\n")
    result_f.close()
Example #30
def test_randomizedsearchcv_refit():
    """Test refit method of RandomizedSearchCV class."""

    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, Reader('ml-100k'))

    param_distributions = {
        'n_epochs': [5],
        'lr_all': uniform(0.002, 0.003),
        'reg_all': uniform(0.4, 0.2),
        'n_factors': [2]
    }

    # assert rs.fit() and rs.test will use best estimator for mae (first
    # appearing in measures)
    rs = RandomizedSearchCV(SVD,
                            param_distributions,
                            measures=['mae', 'rmse'],
                            cv=2,
                            refit=True)
    rs.fit(data)
    rs_preds = rs.test(data.construct_testset(data.raw_ratings))
    mae_preds = rs.best_estimator['mae'].test(
        data.construct_testset(data.raw_ratings))
    assert rs_preds == mae_preds

    # assert rs.fit() and rs.test will use best estimator for rmse
    rs = RandomizedSearchCV(SVD,
                            param_distributions,
                            measures=['mae', 'rmse'],
                            cv=2,
                            refit='rmse')
    rs.fit(data)
    rs_preds = rs.test(data.construct_testset(data.raw_ratings))
    rmse_preds = rs.best_estimator['rmse'].test(
        data.construct_testset(data.raw_ratings))
    assert rs_preds == rmse_preds
    # test that predict() can be called
    rs.predict(2, 4)

    # assert test() and predict() cannot be used when refit is false
    rs = RandomizedSearchCV(SVD,
                            param_distributions,
                            measures=['mae', 'rmse'],
                            cv=2,
                            refit=False)
    rs.fit(data)
    with pytest.raises(ValueError):
        rs.test(data.construct_testset(data.raw_ratings))
    with pytest.raises(ValueError):
        rs.predict('1', '2')

    # test that error is raised if used with load_from_folds
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))
    rs = RandomizedSearchCV(SVD,
                            param_distributions,
                            measures=['mae', 'rmse'],
                            cv=2,
                            refit=True)
    with pytest.raises(ValueError):
        rs.fit(data)
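Note: the uniform used in param_distributions is not imported in the snippet; it presumably comes from scipy:

from scipy.stats import uniform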
Example #31
import numpy as np
import pandas as pd
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import PredefinedKFold
from matrix_vis import visualization

u_cols = ['movie_id', 'movie_title', 'unknown', 'action', 'adventure', 'animation', 'childrens', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'film-noir', 'horror', 'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western']
# users = pd.read_csv('data/movies.txt', sep='\t', names = u_cols, encoding = 'latin-1')

r_cols = ['user_id', 'movie_id', 'rating']
reader = Reader(line_format='user item rating', sep='\t')
data = Dataset.load_from_folds([('data/train.txt', 'data/test.txt')], reader=reader)
pkf = PredefinedKFold()

for trainset, testset in pkf.split(data):
    algo = SVD()
    algo.fit(trainset)
    u = algo.pu
    v = algo.qi
    v = np.transpose(v)
    a, _, _ = np.linalg.svd(v)
    a = a[:2]
    vplot = np.dot(a, v)
    predictions = algo.test(testset)
    print('method 2 error: %f' % accuracy.rmse(predictions))

    movie_ratings = np.genfromtxt("data/summary.txt",  names=True)
    movie_titles = []
    movie_data = []
    with open("data/movies.txt", mode="r", encoding="ISO-8859-1") as f:
        for line in f:
Example #32
def test_trainset_testset_ui_features():
    """Test the construct_trainset and construct_testset methods with user and
    item features."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    u_features_df = pd.DataFrame(
        {
            'urid': ['user0', 'user2', 'user3', 'user1'],
            'isMale': [False, True, False, True]
        },
        columns=['urid', 'isMale'])
    data = data.load_features_df(u_features_df, user_features=True)

    i_features_df = pd.DataFrame(
        {
            'irid': ['item0', 'item1'],
            'isNew': [False, True],
            'webRating': [4, 3],
            'isComedy': [True, False]
        },
        columns=['irid', 'isNew', 'webRating', 'isComedy'])
    data = data.load_features_df(i_features_df, user_features=False)

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4, None)]
    assert ur[1] == [(0, 4, None), (1, 2, None)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4, None), (1, 4, None), (2, 1, None)]
    assert ir[1] == [(1, 2, None), (2, 1, None), (3, 5, None)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test user features
    u_features = trainset.u_features
    assert u_features[0] == [False]
    assert u_features[40] == []  # not in trainset and u_features_df
    assert trainset.user_features_labels == ['isMale']
    assert trainset.n_user_features == 1

    # test item features
    i_features = trainset.i_features
    assert i_features[0] == [False, 4, True]
    assert i_features[20000] == []  # not in trainset and i_features_df
    assert trainset.item_features_labels == ['isNew', 'webRating', 'isComedy']
    assert trainset.n_item_features == 3

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [False], [False, 4, True], 4) in testset
    assert ('user2', 'item1', [True], [True, 3, False], 1) in testset
    assert ('user3', 'item1', [False], [True, 3, False], 5) in testset
    assert ('user3', 'item1', [False], [True, 3, False], 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert (('user0', 'item0', [False], [False, 4, True], trainset.global_mean)
            not in testset)
    assert (('user3', 'item1', [False], [True, 3, False], trainset.global_mean)
            not in testset)
    assert (('user0', 'item1', [False], [True, 3, False], trainset.global_mean)
            in testset)
    assert (('user3', 'item0', [False], [False, 4, True], trainset.global_mean)
            in testset)
Example #33
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader, rating_scale=(1, 5))

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset