Example #1
def test_old_style_algo(small_ml):
    '''Test that old algorithms (i.e. algorithms that only define train()) can
    support both calls to fit() and to train()
    - supporting algo.fit() is needed so that custom algorithms that only
    define train() can still use up-to-date tools (such as evaluate, which has
    been updated to use fit()).
    - algo.train() is the old way, and must still be supported for custom
    algorithms and tools.
    '''
    class CustomAlgoTrain(AlgoBase):
        def __init__(self):
            AlgoBase.__init__(self)
            self.cnt = -1

        def train(self, trainset):

            AlgoBase.train(self, trainset)
            self.est = 3
            self.bu, self.bi = 1, 1
            self.cnt += 1

        def estimate(self, u, i):
            return self.est

    with pytest.warns(UserWarning):
        algo = CustomAlgoTrain()

    kf = KFold(n_splits=2)
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        with pytest.warns(UserWarning):
            algo.fit(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoTrain.train has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoTrain.train
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of train() is only called once
        assert algo.cnt == i

    with pytest.warns(UserWarning):
        algo = CustomAlgoTrain()
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        with pytest.warns(UserWarning):
            algo.train(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoTrain.train has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoTrain.train
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of train() is only called once
        assert algo.cnt == i
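Both this test and the next one exercise a base class that bridges fit() and train() and emits a deprecation warning when the old entry point is involved. The sketch below illustrates one way such a bridge can work; BaseSketch is a hypothetical class written for this note, not the library's actual AlgoBase implementation.

import warnings

class BaseSketch(object):
    '''Hypothetical base class sketching a fit()/train() bridge.'''

    def fit(self, trainset):
        # If the subclass still defines the old train() entry point and we are
        # not already inside a bridged call, defer to it once, so that legacy
        # algorithms keep working with tools that call fit().
        if (type(self).train is not BaseSketch.train
                and not getattr(self, '_bridging', False)):
            warnings.warn('train() is deprecated, define fit() instead.',
                          UserWarning)
            self._bridging = True
            try:
                self.train(trainset)
            finally:
                self._bridging = False
            return self
        # Bookkeeping shared by every algorithm (what the tests check with
        # hasattr(algo, 'trainset')).
        self.trainset = trainset
        return self

    def train(self, trainset):
        # Old entry point kept for backward compatibility: warn, then route
        # through fit(). A legacy subclass calling BaseSketch.train() from its
        # own train() lands in the bookkeeping branch of fit() above because
        # _bridging is set.
        warnings.warn('train() is deprecated, use fit() instead.', UserWarning)
        self._bridging = True
        try:
            self.fit(trainset)
        finally:
            self._bridging = False
        return self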
Example #2
def test_new_style_algo(small_ml):
    '''Test that new algorithms (i.e. algorithms that only define fit()) can
    support both calls to fit() and to train()
    - algo.fit() is the new way of doing things
    - supporting algo.train() is needed for the (unlikely?) case where a user
    has defined custom tools that use algo.train().
    '''
    class CustomAlgoFit(AlgoBase):
        def __init__(self):
            AlgoBase.__init__(self)
            self.cnt = -1

        def fit(self, trainset):

            AlgoBase.fit(self, trainset)
            self.est = 3
            self.bu, self.bi = 1, 1
            self.cnt += 1

        def estimate(self, u, i):
            return self.est

    algo = CustomAlgoFit()
    kf = KFold(n_splits=2)
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        algo.fit(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoFit.fit has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoFit.fit
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of fit() is only called once
        assert algo.cnt == i

    algo = CustomAlgoFit()
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        with pytest.warns(UserWarning):
            algo.train(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoFit.fit has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoFit.fit
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of fit() is only called once
        assert algo.cnt == i
Example #3
def test_gridsearchcv_same_splits():
    """Ensure that all parameter combinations are tested on the same splits (we
    check their RMSE scores are the same once averaged over the splits, which
    should be enough). We use as much parallelism as possible."""

    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, reader=Reader('ml-100k'),
                                  rating_scale=(1, 5))
    kf = KFold(3, shuffle=True, random_state=4)

    # all RMSE should be the same (as param combinations are the same)
    param_grid = {'n_epochs': [5], 'lr_all': [.2, .2],
                  'reg_all': [.4, .4], 'n_factors': [5], 'random_state': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['RMSE'], cv=kf,
                      n_jobs=1)
    gs.fit(data)

    rmse_scores = [m for m in gs.cv_results['mean_test_rmse']]
    assert len(set(rmse_scores)) == 1  # assert rmse_scores are all equal

    # Note: actually, even when setting random_state=None in kf, the same folds
    # are used within one call to fit() because we use
    # product(param_comb, kf.split(...)). However, a fixed random_state is
    # needed to get the same folds when calling fit() again:
    gs.fit(data)
    rmse_scores += [m for m in gs.cv_results['mean_test_rmse']]
    assert len(set(rmse_scores)) == 1  # assert rmse_scores are all equal
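The note above relies on itertools.product consuming its second argument once: the folds produced by kf.split(...) are materialized a single time and then paired with every parameter combination. A small self-contained sketch of that behaviour, independent of the library:

import random
from itertools import product

def random_splits(n_folds=3):
    # Stand-in for kf.split(data) with random_state=None: every run of this
    # generator would produce different 'folds'.
    for _ in range(n_folds):
        yield random.random()

param_combinations = ['params-1', 'params-2']

# product() exhausts random_splits() once and pairs the same materialized
# folds with every parameter combination, so within one call all combinations
# are evaluated on identical splits even though the splitter is random.
jobs = list(product(param_combinations, random_splits()))
folds_for_p1 = [fold for (params, fold) in jobs if params == 'params-1']
folds_for_p2 = [fold for (params, fold) in jobs if params == 'params-2']
assert folds_for_p1 == folds_for_p2

# Calling product() again would re-run the generator and yield different
# folds, which is why a fixed random_state is still needed to compare two
# separate fit() calls.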
Example #4
def test_randomizedsearchcv_cv_results():
    """Test the cv_results attribute"""

    f = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(f, Reader('ml-100k'), rating_scale=(1, 5))
    kf = KFold(3, shuffle=True, random_state=4)
    param_distributions = {'n_epochs': [5], 'lr_all': uniform(.2, .3),
                           'reg_all': uniform(.4, .3), 'n_factors': [5],
                           'random_state': [0]}
    n_iter = 5
    rs = RandomizedSearchCV(SVD, param_distributions, n_iter=n_iter,
                            measures=['RMSE', 'mae'], cv=kf,
                            return_train_measures=True)
    rs.fit(data)

    # test keys split*_test_rmse, mean and std dev.
    assert rs.cv_results['split0_test_rmse'].shape == (n_iter,)
    assert rs.cv_results['split1_test_rmse'].shape == (n_iter,)
    assert rs.cv_results['split2_test_rmse'].shape == (n_iter,)
    assert rs.cv_results['mean_test_rmse'].shape == (n_iter,)
    assert np.allclose(rs.cv_results['mean_test_rmse'],
                       np.mean([rs.cv_results['split0_test_rmse'],
                                rs.cv_results['split1_test_rmse'],
                                rs.cv_results['split2_test_rmse']], axis=0))
    assert np.allclose(rs.cv_results['std_test_rmse'],
                       np.std([rs.cv_results['split0_test_rmse'],
                               rs.cv_results['split1_test_rmse'],
                               rs.cv_results['split2_test_rmse']], axis=0))

    # test keys split*_train_rmse, mean and std dev.
    assert rs.cv_results['split0_train_rmse'].shape == (n_iter,)
    assert rs.cv_results['split1_train_rmse'].shape == (n_iter,)
    assert rs.cv_results['split2_train_rmse'].shape == (n_iter,)
    assert rs.cv_results['mean_train_rmse'].shape == (n_iter,)
    assert np.allclose(rs.cv_results['mean_train_rmse'],
                       np.mean([rs.cv_results['split0_train_rmse'],
                                rs.cv_results['split1_train_rmse'],
                                rs.cv_results['split2_train_rmse']], axis=0))
    assert np.allclose(rs.cv_results['std_train_rmse'],
                       np.std([rs.cv_results['split0_train_rmse'],
                               rs.cv_results['split1_train_rmse'],
                               rs.cv_results['split2_train_rmse']], axis=0))

    # test fit and train times dimensions.
    assert rs.cv_results['mean_fit_time'].shape == (n_iter,)
    assert rs.cv_results['std_fit_time'].shape == (n_iter,)
    assert rs.cv_results['mean_test_time'].shape == (n_iter,)
    assert rs.cv_results['std_test_time'].shape == (n_iter,)

    assert rs.cv_results['params'] is rs.param_combinations

    # assert that the best parameter combination according to
    # rs.cv_results['rank_test_<measure>'] is indeed the best_params attribute.
    best_index = np.argmin(rs.cv_results['rank_test_rmse'])
    assert rs.cv_results['params'][best_index] == rs.best_params['rmse']
    best_index = np.argmin(rs.cv_results['rank_test_mae'])
    assert rs.cv_results['params'][best_index] == rs.best_params['mae']
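The two assertions above assume that rank_test_<measure> assigns rank 1 to the combination with the lowest mean error. A short sketch of how such ranks can be derived from the mean scores, assuming the usual 'lower error is better' convention (tie handling may differ in the library):

import numpy as np

mean_test_rmse = np.array([0.97, 0.94, 0.95])

# argsort of an argsort gives each entry's 0-based rank; add 1 so that the
# smallest mean error gets rank 1.
rank_test_rmse = np.argsort(np.argsort(mean_test_rmse)) + 1
print(rank_test_rmse)  # [3 1 2]

# The combination ranked 1 is then the best_params candidate.
best_index = np.argmin(rank_test_rmse)
assert best_index == np.argmin(mean_test_rmse)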
Example #5
def test_get_cv():

    get_cv(None)
    get_cv(4)
    get_cv(KFold())

    with pytest.raises(ValueError):
        get_cv(23.2)
    with pytest.raises(ValueError):
        get_cv('bad')
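From the accepted and rejected inputs, get_cv plausibly normalizes None, an int, or a ready-made splitter into a cross-validation iterator and raises ValueError otherwise. The sketch below is written from what the assertions imply; get_cv_sketch and its default of five folds are assumptions, not the library source.

from amaze.model_selection import KFold

def get_cv_sketch(cv):
    '''Return a cross-validation iterator from a user-supplied value.'''
    if cv is None:
        return KFold(n_splits=5)   # assumed default splitter
    if isinstance(cv, int):
        return KFold(n_splits=cv)  # an int means 'use this many folds'
    if hasattr(cv, 'split') and not isinstance(cv, str):
        return cv                  # already a splitter: use it as-is
    raise ValueError('Expecting None, an int, or an object with a split() '
                     'method, got {}'.format(cv))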
Example #6
def main():
    class MyParser(argparse.ArgumentParser):
        '''A parser which prints the help message when an error occurs. Taken from
        http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu.'''  # noqa

        def error(self, message):
            sys.stderr.write('error: %s\n' % message)
            self.print_help()
            sys.exit(2)

    parser = MyParser(
        description='Evaluate the performance of a rating prediction ' +
        'algorithm ' +
        'on a given dataset using cross validation. You can use a built-in ' +
        'or a custom dataset, and you can choose to automatically split the ' +
        'dataset into folds, or manually specify train and test files. ' +
        'Please refer to the documentation page ' +
        '(http://amaze.readthedocs.io/) for more details.',
        epilog="""Example:\n
        amaze -algo SVD -params "{'n_epochs': 5, 'verbose': True}"
        -load-builtin ml-100k -n-folds 3""")

    algo_choices = {
        'NormalPredictor': NormalPredictor,
        'BaselineOnly': BaselineOnly,
        'KNNBasic': KNNBasic,
        'KNNBaseline': KNNBaseline,
        'KNNWithMeans': KNNWithMeans,
        'SVD': SVD,
        'SVDpp': SVDpp,
        'NMF': NMF,
        'SlopeOne': SlopeOne,
        'CoClustering': CoClustering,
    }

    parser.add_argument('-algo',
                        type=str,
                        choices=algo_choices,
                        help='The prediction algorithm to use. ' +
                        'Allowed values are ' +
                        ', '.join(algo_choices.keys()) + '.',
                        metavar='<prediction algorithm>')

    parser.add_argument('-params',
                        type=str,
                        metavar='<algorithm parameters>',
                        default='{}',
                        help='A kwargs dictionary that contains all the ' +
                        'algorithm parameters. ' +
                        'Example: "{\'n_epochs\': 10}".')

    parser.add_argument('-load-builtin',
                        type=str,
                        dest='load_builtin',
                        metavar='<dataset name>',
                        default='ml-100k',
                        help='The name of the built-in dataset to use. ' +
                        'Allowed values are ' +
                        ', '.join(dataset.BUILTIN_DATASETS.keys()) +
                        '. Default is ml-100k.')

    parser.add_argument(
        '-load-custom',
        type=str,
        dest='load_custom',
        metavar='<file path>',
        default=None,
        help='A file path to a custom dataset to use. ' +
        'Ignored if -load-builtin is set. The -reader parameter needs ' +
        'to be set.')

    parser.add_argument('-folds-files',
                        type=str,
                        dest='folds_files',
                        metavar='<train1 test1 train2 test2... >',
                        default=None,
                        help='A list of custom train and test files. ' +
                        'Ignored if -load-builtin or -load-custom is set. '
                        'The -reader parameter needs to be set.')

    parser.add_argument('-reader',
                        type=str,
                        metavar='<reader>',
                        default=None,
                        help='A Reader to read the custom dataset. Example: ' +
                        '"Reader(line_format=\'user item rating timestamp\',' +
                        ' sep=\'\\t\')"')

    parser.add_argument('-n-folds',
                        type=int,
                        dest='n_folds',
                        metavar="<number of folds>",
                        default=5,
                        help='The number of folds for cross-validation. ' +
                        'Default is 5.')

    parser.add_argument('-seed',
                        type=int,
                        metavar='<random seed>',
                        default=None,
                        help='The seed to use for RNG. ' +
                        'Default is the current system time.')

    parser.add_argument('--with-dump',
                        dest='with_dump',
                        action='store_true',
                        help='Dump the algorithm ' +
                        'results in a file (one file per fold). ' +
                        'Default is False.')

    parser.add_argument('-dump-dir',
                        dest='dump_dir',
                        type=str,
                        metavar='<dir>',
                        default=None,
                        help='Where to dump the files. Ignored if ' +
                        '--with-dump is not set. Default is ' +
                        os.path.join(get_dataset_dir(), 'dumps/'))

    parser.add_argument('--clean',
                        dest='clean',
                        action='store_true',
                        help='Remove the ' + get_dataset_dir() +
                        ' directory and exit.')

    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version=__version__)

    args = parser.parse_args()

    if args.clean:
        folder = get_dataset_dir()
        shutil.rmtree(folder)
        print('Removed', folder)
        exit()

    # setup RNG
    rd.seed(args.seed)
    np.random.seed(args.seed)

    # setup algorithm
    params = eval(args.params)
    if args.algo is None:
        parser.error('No algorithm was specified.')
    algo = algo_choices[args.algo](**params)

    # setup dataset
    if args.load_custom is not None:  # load custom and split
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        data = Dataset.load_from_file(args.load_custom, reader=reader)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)

    elif args.folds_files is not None:  # load from files
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        folds_files = args.folds_files.split()
        folds_files = [(folds_files[i], folds_files[i + 1])
                       for i in range(0,
                                      len(folds_files) - 1, 2)]
        data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)
        cv = PredefinedKFold()

    else:  # load builtin dataset and split
        data = Dataset.load_builtin(args.load_builtin)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)

    cross_validate(algo, data, cv=cv, verbose=True)
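One detail worth calling out: the -folds-files handling above pairs consecutive entries of the space-separated list into (train, test) tuples. A tiny standalone illustration of that pairing step, using made-up file names:

# Hypothetical value of args.folds_files as passed on the command line.
raw = 'u1.base u1.test u2.base u2.test'

folds_files = raw.split()
folds_files = [(folds_files[i], folds_files[i + 1])
               for i in range(0, len(folds_files) - 1, 2)]

print(folds_files)
# [('u1.base', 'u1.test'), ('u2.base', 'u2.test')]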
Example #7
                                             stable +
                                             'basic_algorithms.html#amaze.prediction_algorithms.random_pred.NormalPredictor'),
        'ml-100k': '[{}]({})'.format('Movielens 100k',
                                     'http://grouplens.org/datasets/movielens/100k'),
        'ml-1m': '[{}]({})'.format('Movielens 1M',
                                   'http://grouplens.org/datasets/movielens/1m'),
        }


# set RNG
np.random.seed(0)
random.seed(0)

dataset = 'ml-1m'
data = Dataset.load_builtin(dataset)
kf = KFold(random_state=0)  # folds will be the same for all algorithms.

table = []
for klass in classes:
    start = time.time()
    out = cross_validate(klass(), data, ['rmse', 'mae'], kf)
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    link = LINK[klass.__name__]
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))

    new_line = [link, mean_rmse, mean_mae, cv_time]
    print(tabulate([new_line], tablefmt="pipe"))  # print current algo perf
    table.append(new_line)

header = [LINK[dataset],
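The snippet is cut off at this point, but the accumulated rows would typically be rendered in a final tabulate call once the header list is complete, roughly along these lines (a hypothetical completion, not the original ending):

# Render all accumulated rows as one pipe-formatted table.
header = ['Dataset', 'RMSE', 'MAE', 'Time']
print(tabulate(table, header, tablefmt="pipe"))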
Example #8
def test_KFold(toy_data):

    # Test n_folds parameter
    kf = KFold(n_splits=5)
    assert len(list(kf.split(toy_data))) == 5

    with pytest.raises(ValueError):
        kf = KFold(n_splits=10)
        next(kf.split(toy_data))  # Too big (greater than number of ratings)

    with pytest.raises(ValueError):
        kf = KFold(n_splits=1)
        next(kf.split(toy_data))  # Too low (must be >= 2)

    # Make sure data has not been shuffled. If not shuffled, the users in the
    # testsets are 0, 1, 2... 4 (in that order).
    kf = KFold(n_splits=5, shuffle=False)
    users = [int(testset[0][0][-1]) for (_, testset) in kf.split(toy_data)]
    assert users == list(range(5))

    # Make sure that when called two times without shuffling, folds are the
    # same.
    kf = KFold(n_splits=5, shuffle=False)
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b
    # test once again with another KFold instance
    kf = KFold(n_splits=5, shuffle=False)
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b

    # We'll now shuffle b and check that folds are different.
    # (this relies on the seed set at the beginning of the file)
    kf = KFold(n_splits=5, random_state=None, shuffle=True)
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a != testsets_b
    # test once again: two calls to kf.split make different splits when
    # random_state=None
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a != testsets_b

    # Make sure that folds are the same when the same KFold instance is used
    # with shuffle=True and random_state set to some value
    kf = KFold(n_splits=5, random_state=1, shuffle=True)
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b

    # Make sure raw ratings are not shuffled by KFold
    old_raw_ratings = copy(toy_data.raw_ratings)
    kf = KFold(n_splits=5, shuffle=True)
    next(kf.split(toy_data))
    assert old_raw_ratings == toy_data.raw_ratings

    # Make sure kf.split() and the old toy_data.split() have the same folds.
    np.random.seed(3)
    with pytest.warns(UserWarning):
        toy_data.split(2, shuffle=True)
        testsets_a = [testset for (_, testset) in toy_data.folds()]
    kf = KFold(n_splits=2, random_state=3, shuffle=True)
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b
Example #9
def precision_recall_at_k(predictions, k, threshold):
    '''Return precision and recall at k for each user.'''

    # Map the predictions to each user.
    user_est_true = {}
    for uid, _, true_r, est, _ in predictions:
        user_est_true.setdefault(uid, []).append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value, best first, so that
        # user_ratings[:k] is the top-k recommendation list.
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls
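As a quick sanity check of the two definitions above, here is a tiny hand-worked example with made-up ratings, k=2 and threshold=4:

# (estimated rating, true rating) pairs for one user, sorted by estimate.
user_ratings = [(4.8, 5.0), (4.2, 2.0), (3.0, 4.5)]
k, threshold = 2, 4

n_rel = sum(true_r >= threshold for (_, true_r) in user_ratings)          # 2
n_rec_k = sum(est >= threshold for (est, _) in user_ratings[:k])          # 2
n_rel_and_rec_k = sum(est >= threshold and true_r >= threshold
                      for (est, true_r) in user_ratings[:k])              # 1

print(n_rel_and_rec_k / n_rec_k)  # precision@2 = 1/2 = 0.5
print(n_rel_and_rec_k / n_rel)    # recall@2    = 1/2 = 0.5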


data = Dataset.load_builtin('ml-100k')
kf = KFold(n_splits=5)
algo = SVD()

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))
from amaze import accuracy
from amaze.model_selection import KFold

data = Dataset.load_builtin('ml-100k')

algo = SVD()

trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

# We can also do this during a cross-validation procedure!
print('CV procedure:')

kf = KFold(n_splits=3)
for i, (trainset_cv, testset_cv) in enumerate(kf.split(data)):
    print('fold number', i + 1)
    algo.fit(trainset_cv)

    print('On testset,', end='  ')
    predictions = algo.test(testset_cv)
    accuracy.rmse(predictions, verbose=True)

    print('On trainset,', end=' ')
    predictions = algo.test(trainset_cv.build_testset())
    accuracy.rmse(predictions, verbose=True)