Example #1
def test_wrong_file_name():
    """Ensure file names are checked when creating a (custom) Dataset."""
    wrong_files = [('does_not_exist', 'does_not_either')]

    with pytest.raises(ValueError):
        Dataset.load_from_folds(folds_files=wrong_files, reader=Reader(),
                                rating_scale=(1, 5))
Example #2
def test_gridsearchcv_same_splits():
    """Ensure that all parameter combinations are tested on the same splits (we
    check their RMSE scores are the same once averaged over the splits, which
    should be enough). We use as much parallelism as possible."""

    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, reader=Reader('ml-100k'),
                                  rating_scale=(1, 5))
    kf = KFold(3, shuffle=True, random_state=4)

    # all RMSE should be the same (as param combinations are the same)
    param_grid = {'n_epochs': [5], 'lr_all': [.2, .2],
                  'reg_all': [.4, .4], 'n_factors': [5], 'random_state': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['RMSE'], cv=kf,
                      n_jobs=1)
    gs.fit(data)

    rmse_scores = [m for m in gs.cv_results['mean_test_rmse']]
    assert len(set(rmse_scores)) == 1  # assert rmse_scores are all equal

    # Note: actually, even when setting random_state=None in kf, the same folds
    # are used because we use product(param_comb, kf.split(...)). However,
    # random_state is needed to get the same folds when fit is called again:
    gs.fit(data)
    rmse_scores += [m for m in gs.cv_results['mean_test_rmse']]
    assert len(set(rmse_scores)) == 1  # assert rmse_scores are all equal
Example #3
def test_cross_validate(toy_data):

    # First test with a specified CV iterator.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader,
                                   rating_scale=(1, 5))

    algo = NormalPredictor()
    pkf = ms.PredefinedKFold()
    ret = ms.cross_validate(algo, data, measures=['rmse', 'mae'], cv=pkf,
                            verbose=1)
    # Basically just test that keys (don't) exist as they should
    assert len(ret['test_rmse']) == 1
    assert len(ret['test_mae']) == 1
    assert len(ret['fit_time']) == 1
    assert len(ret['test_time']) == 1
    assert 'test_fcp' not in ret
    assert 'train_rmse' not in ret
    assert 'train_mae' not in ret

    # Test that 5 fold CV is used when cv=None
    # Also check that train_* key exist when return_train_measures is True.
    ret = ms.cross_validate(algo, toy_data, measures=['rmse', 'mae'], cv=None,
                            return_train_measures=True, verbose=True)
    assert len(ret['test_rmse']) == 5
    assert len(ret['test_mae']) == 5
    assert len(ret['fit_time']) == 5
    assert len(ret['test_time']) == 5
    assert len(ret['train_rmse']) == 5
    assert len(ret['train_mae']) == 5
Example #4
def test_zero_rating_canary():

    ratings_dict = {'itemID': [0, 0, 0, 0, 1, 1],
                    'userID': [0, 1, 2, 3, 3, 4],
                    'rating': [-10, 10, 0, -5, 0, 5]}
    df = pd.DataFrame(ratings_dict)
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                rating_scale=(-10, 10))
    trainset = data.build_full_trainset()

    # test ur and ir fields. Kind of OK, but the purpose of the test is
    # precisely to test what would happen if we removed them...
    assert trainset.ir[0] == [(0, -10), (1, 10), (2, 0), (3, -5)]
    assert trainset.ir[1] == [(3, 0), (4, 5)]

    assert trainset.ur[0] == [(0, -10)]
    assert trainset.ur[1] == [(0, 10)]
    assert trainset.ur[2] == [(0, 0)]
    assert trainset.ur[3] == [(0, -5), (1, 0)]
    assert trainset.ur[4] == [(1, 5)]
    print(trainset.ur)

    # ... so also test all_ratings which should be more reliable.
    all_ratings = list(trainset.all_ratings())
    assert (0, 0, -10) in all_ratings
    assert (1, 0, 10) in all_ratings
    assert (2, 0, 0) in all_ratings
    assert (3, 0, -5) in all_ratings
    assert (3, 1, 0) in all_ratings
    assert (4, 1, 5) in all_ratings
Example #5
def small_ml():
    """Return a Dataset object with 2000 movielens-100k ratings.
    """
    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, Reader('ml-100k'),
                                  rating_scale=(1, 5))

    return data
Example #6
def toy_data(toy_data_reader):

    toy_data_path = (os.path.dirname(os.path.realpath(__file__)) +
                     '/custom_dataset')
    data = Dataset.load_from_file(file_path=toy_data_path,
                                  reader=toy_data_reader, rating_scale=(1, 5))

    return data
Example #7
def test_randomizedsearchcv_cv_results():
    """Test the cv_results attribute"""

    f = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(f, Reader('ml-100k'), rating_scale=(1, 5))
    kf = KFold(3, shuffle=True, random_state=4)
    param_distributions = {'n_epochs': [5], 'lr_all': uniform(.2, .3),
                           'reg_all': uniform(.4, .3), 'n_factors': [5],
                           'random_state': [0]}
    n_iter = 5
    rs = RandomizedSearchCV(SVD, param_distributions, n_iter=n_iter,
                            measures=['RMSE', 'mae'], cv=kf,
                            return_train_measures=True)
    rs.fit(data)

    # test keys split*_test_rmse, mean and std dev.
    assert rs.cv_results['split0_test_rmse'].shape == (n_iter,)
    assert rs.cv_results['split1_test_rmse'].shape == (n_iter,)
    assert rs.cv_results['split2_test_rmse'].shape == (n_iter,)
    assert rs.cv_results['mean_test_rmse'].shape == (n_iter,)
    assert np.allclose(rs.cv_results['mean_test_rmse'],
                       np.mean([rs.cv_results['split0_test_rmse'],
                                rs.cv_results['split1_test_rmse'],
                                rs.cv_results['split2_test_rmse']], axis=0))
    assert np.allclose(rs.cv_results['std_test_rmse'],
                       np.std([rs.cv_results['split0_test_rmse'],
                               rs.cv_results['split1_test_rmse'],
                               rs.cv_results['split2_test_rmse']], axis=0))

    # test keys split*_train_rmse, mean and std dev.
    assert rs.cv_results['split0_train_rmse'].shape == (n_iter,)
    assert rs.cv_results['split1_train_rmse'].shape == (n_iter,)
    assert rs.cv_results['split2_train_rmse'].shape == (n_iter,)
    assert rs.cv_results['mean_train_rmse'].shape == (n_iter,)
    assert np.allclose(rs.cv_results['mean_train_rmse'],
                       np.mean([rs.cv_results['split0_train_rmse'],
                                rs.cv_results['split1_train_rmse'],
                                rs.cv_results['split2_train_rmse']], axis=0))
    assert np.allclose(rs.cv_results['std_train_rmse'],
                       np.std([rs.cv_results['split0_train_rmse'],
                               rs.cv_results['split1_train_rmse'],
                               rs.cv_results['split2_train_rmse']], axis=0))

    # test fit and train times dimensions.
    assert rs.cv_results['mean_fit_time'].shape == (n_iter,)
    assert rs.cv_results['std_fit_time'].shape == (n_iter,)
    assert rs.cv_results['mean_test_time'].shape == (n_iter,)
    assert rs.cv_results['std_test_time'].shape == (n_iter,)

    assert rs.cv_results['params'] is rs.param_combinations

    # assert that best parameter in rs.cv_results['rank_test_measure'] is
    # indeed the best_param attribute.
    best_index = np.argmin(rs.cv_results['rank_test_rmse'])
    assert rs.cv_results['params'][best_index] == rs.best_params['rmse']
    best_index = np.argmin(rs.cv_results['rank_test_mae'])
    assert rs.cv_results['params'][best_index] == rs.best_params['mae']
Example #8
  def __init__(self, algo: AlgoBase, path: str = None, fmt='user item rating', sep=','):
    self.algo = algo
    if path:
      self.data = Dataset.load_from_file(path, reader=Reader(line_format=fmt, sep=sep, skip_lines=1))
    else:
      self.data = None
    self.trainset = None

    self.init()
Example #9
def u1_ml100k():
    """Return a Dataset object that contains 10% of the u1 fold from movielens
    100k. Trainset has 8000 ratings and testset has 2000.
    """
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'), rating_scale=(1, 5))

    return data
Example #10
def test_load_from_df():
    """Ensure reading dataset from pandas dataframe is OK."""

    # DF creation.
    ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                    'userID': [9, 32, 2, 45, '10000'],
                    'rating': [3, 2, 4, 3, 1]}
    df = pd.DataFrame(ratings_dict)

    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                rating_scale=(1, 5))

    # Assert split and folds can be used without problems
    with pytest.warns(UserWarning):
        data.split(2)
        assert sum(1 for _ in data.folds()) == 2

    # assert users and items are correctly mapped
    trainset = data.build_full_trainset()
    assert trainset.knows_user(trainset.to_inner_uid(9))
    assert trainset.knows_user(trainset.to_inner_uid('10000'))
    assert trainset.knows_item(trainset.to_inner_iid(2))

    # assert r(9, 1) = 3 and r(2, 1) = 4
    uid9 = trainset.to_inner_uid(9)
    uid2 = trainset.to_inner_uid(2)
    iid1 = trainset.to_inner_iid(1)
    assert trainset.ur[uid9] == [(iid1, 3)]
    assert trainset.ur[uid2] == [(iid1, 4)]

    # mess up the column ordering and assert that users are not correctly
    # mapped
    data = Dataset.load_from_df(df[['rating', 'itemID', 'userID']],
                                rating_scale=(1, 5))
    trainset = data.build_full_trainset()
    with pytest.raises(ValueError):
        trainset.to_inner_uid('10000')
Example #11
def test_nearest_neighbors():
    """Ensure the nearest neighbors are different when using user-user
    similarity vs item-item."""

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)

    data_file = os.path.dirname(os.path.realpath(__file__)) + '/custom_train'
    data = Dataset.load_from_file(data_file, reader, rating_scale=(1, 5))
    trainset = data.build_full_trainset()

    algo_ub = KNNBasic(sim_options={'user_based': True})
    algo_ub.fit(trainset)
    algo_ib = KNNBasic(sim_options={'user_based': False})
    algo_ib.fit(trainset)
    assert algo_ub.get_neighbors(0, k=10) != algo_ib.get_neighbors(0, k=10)
Example #12
    def collaborative(self, ratings, user_id):

        reader = Reader()
        #ratings.head()

        temp_ratings = ratings



        data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)
        data.split(n_folds=2)

        ## Training the data ##
        svd = SVD()
        evaluate(svd, data, measures=['RMSE', 'MAE'])

        trainset = data.build_full_trainset()

        algo = SVD()
        algo.fit(trainset)

        #svd.train(trainset)
        ## Testing the data ##

        from collections import defaultdict
        testset = trainset.build_anti_testset()
        predictions = algo.test(testset)

        count = 0
     
        for uid, iid, true_r, est, _ in predictions:
            if uid == user_id:
                count = count + 1
                temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]

        #print("count\n")
        #print(count)
        #print("\n--------here-------\n")	
        #print(temp_ratings)

        cb = temp_ratings[(temp_ratings['user_id'] == user_id)][['book_id', 'rating']]
        #print("\n--------here-------\n")
        #print(cb)
        
        cb = temp_ratings[(temp_ratings['user_id'] == user_id)][['book_id', 'rating']]

        return(cb)
Example #13
def test_gridsearchcv_refit(u1_ml100k):
    """Test refit function of GridSearchCV."""

    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, Reader('ml-100k'),
                                  rating_scale=(1, 5))

    param_grid = {'n_epochs': [5], 'lr_all': [0.002, 0.005],
                  'reg_all': [0.4, 0.6], 'n_factors': [2]}

    # assert gs.fit() and gs.test will use best estimator for mae (first
    # appearing in measures)
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit=True)
    gs.fit(data)
    gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    mae_preds = gs.best_estimator['mae'].test(
        data.construct_testset(data.raw_ratings))
    assert gs_preds == mae_preds

    # assert gs.fit() and gs.test will use best estimator for rmse
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit='rmse')
    gs.fit(data)
    gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    rmse_preds = gs.best_estimator['rmse'].test(
        data.construct_testset(data.raw_ratings))
    assert gs_preds == rmse_preds
    # test that predict() can be called
    gs.predict(2, 4)

    # assert test() and predict() cannot be used when refit is false
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit=False)
    gs.fit(data)
    with pytest.raises(ValueError):
        gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    with pytest.raises(ValueError):
        gs.predict('1', '2')

    # test that error is raised if used with load_from_folds
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit=True)
    with pytest.raises(ValueError):
        gs.fit(u1_ml100k)
Example #14
def test_randomizedsearchcv_refit(u1_ml100k):
    """Test refit method of RandomizedSearchCV class."""

    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, Reader('ml-100k'),
                                  rating_scale=(1, 5))

    param_distributions = {'n_epochs': [5], 'lr_all': uniform(0.002, 0.003),
                           'reg_all': uniform(0.4, 0.2), 'n_factors': [2]}

    # assert rs.fit() and rs.test will use best estimator for mae (first
    # appearing in measures)
    rs = RandomizedSearchCV(SVD, param_distributions, measures=['mae', 'rmse'],
                            cv=2, refit=True)
    rs.fit(data)
    rs_preds = rs.test(data.construct_testset(data.raw_ratings))
    mae_preds = rs.best_estimator['mae'].test(
        data.construct_testset(data.raw_ratings))
    assert rs_preds == mae_preds

    # assert rs.fit() and rs.test will use best estimator for rmse
    rs = RandomizedSearchCV(SVD, param_distributions, measures=['mae', 'rmse'],
                            cv=2, refit='rmse')
    rs.fit(data)
    rs_preds = rs.test(data.construct_testset(data.raw_ratings))
    rmse_preds = rs.best_estimator['rmse'].test(
        data.construct_testset(data.raw_ratings))
    assert rs_preds == rmse_preds
    # test that predict() can be called
    rs.predict(2, 4)

    # assert test() and predict() cannot be used when refit is false
    rs = RandomizedSearchCV(SVD, param_distributions, measures=['mae', 'rmse'],
                            cv=2, refit=False)
    rs.fit(data)
    with pytest.raises(ValueError):
        rs.test(data.construct_testset(data.raw_ratings))
    with pytest.raises(ValueError):
        rs.predict('1', '2')

    # test that error is raised if used with load_from_folds
    rs = RandomizedSearchCV(SVD, param_distributions, measures=['mae', 'rmse'],
                            cv=2, refit=True)
    with pytest.raises(ValueError):
        rs.fit(u1_ml100k)
Example #15
def test_deprecated_way():
    """Test all Dataset constructors without passing rating_scale as a
    parameter. Make sure we revert back to the Reader object, with a warning
    message.

    Also, make sure ValueError is raised if reader has no rating_scale in this
    context.

    Not using dataset fixtures here for more control.
    """

    # test load_from_file
    toy_data_path = (os.path.dirname(os.path.realpath(__file__)) +
                     '/custom_dataset')
    with pytest.warns(UserWarning):
        reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                        rating_scale=(1, 5))
        data = Dataset.load_from_file(file_path=toy_data_path,
                                      reader=reader)

    with pytest.raises(ValueError):
        reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                        rating_scale=None)
        data = Dataset.load_from_file(file_path=toy_data_path,
                                      reader=reader)

    # test load_from_folds
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    with pytest.warns(UserWarning):
        reader = Reader(line_format='user item rating timestamp', sep='\t',
                        rating_scale=(1, 5))
        data = Dataset.load_from_folds([(train_file, test_file)], reader=reader)
    with pytest.raises(ValueError):
        reader = Reader(line_format='user item rating timestamp', sep='\t',
                        rating_scale=None)
        data = Dataset.load_from_folds([(train_file, test_file)],
                                       reader=reader)
    # test load_from_df
    ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                    'userID': [9, 32, 2, 45, '10000'],
                    'rating': [3, 2, 4, 3, 1]}
    df = pd.DataFrame(ratings_dict)

    with pytest.warns(UserWarning):
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                    reader=reader)
    with pytest.raises(ValueError):
        reader = Reader(rating_scale=None)
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],  # noqa
                                    reader=reader)
Example #16
def test_PredefinedKFold(toy_data_reader):

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader, rating_scale=(1, 5))

    # Make sure rating files are read correctly
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3

    # Make sure pkf returns the same folds as the deprecated data.folds()
    with pytest.warns(UserWarning):
        trainset_, testset_ = next(data.folds())
    assert testset_ == testset
Example #17
def test_LeaveOneOut(toy_data):

    loo = LeaveOneOut()
    with pytest.raises(ValueError):
        next(loo.split(toy_data))  # each user only has 1 item so trainsets fail

    reader = Reader('ml-100k')
    data_path = (os.path.dirname(os.path.realpath(__file__)) +
                 '/u1_ml100k_test')
    data = Dataset.load_from_file(file_path=data_path, reader=reader,
                                  rating_scale=(1, 5))

    # Test random_state parameter
    # If random_state is None, you get a different split each time (subject to
    # the rng, of course)
    loo = LeaveOneOut(random_state=None)
    testsets_a = [testset for (_, testset) in loo.split(data)]
    testsets_b = [testset for (_, testset) in loo.split(data)]
    assert testsets_a != testsets_b
    # Repeated calls to split() when random_state is set lead to the same folds
    loo = LeaveOneOut(random_state=1)
    testsets_a = [testset for (_, testset) in loo.split(data)]
    testsets_b = [testset for (_, testset) in loo.split(data)]
    assert testsets_a == testsets_b

    # Make sure only one rating per user is present in the testset
    loo = LeaveOneOut()
    for _, testset in loo.split(data):
        cnt = Counter([uid for (uid, _, _) in testset])
        assert all(val == 1 for val in itervalues(cnt))

    # test the min_n_ratings parameter
    loo = LeaveOneOut(min_n_ratings=5)
    for trainset, _ in loo.split(data):
        assert all(len(ratings) >= 5 for ratings in itervalues(trainset.ur))

    loo = LeaveOneOut(min_n_ratings=10)
    for trainset, _ in loo.split(data):
        assert all(len(ratings) >= 10 for ratings in itervalues(trainset.ur))

    loo = LeaveOneOut(min_n_ratings=10000)  # too high
    with pytest.raises(ValueError):
        next(loo.split(data))
Example #18
def test_performances():
    """Test the returned dict. Also do dumping."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader,
                                   rating_scale=(1, 5))

    algo = NormalPredictor()
    tmp_dir = tempfile.mkdtemp()  # create tmp dir
    with pytest.warns(UserWarning):
        performances = evaluate(algo, data, measures=['RmSe', 'Mae'],
                                with_dump=True, dump_dir=tmp_dir, verbose=2)
    shutil.rmtree(tmp_dir)  # remove tmp dir

    assert performances['RMSE'] is performances['rmse']
    assert performances['MaE'] is performances['mae']
Example #19
def test_build_anti_testset():
    ratings_dict = {'itemID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                    'userID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                    'rating': [1, 2, 3, 4, 5, 6, 7, 8, 9]}
    df = pd.DataFrame(ratings_dict)

    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                rating_scale=(1, 5))
    with pytest.warns(UserWarning):
        data.split(2)
        trainset, __testset = next(data.folds())
    # fill with some specific value
    for fillvalue in (0, 42., -1):
        anti = trainset.build_anti_testset(fill=fillvalue)
        for (u, i, r) in anti:
            assert r == fillvalue
    # fill with global_mean
    anti = trainset.build_anti_testset(fill=None)
    for (u, i, r) in anti:
        assert r == trainset.global_mean
    expect = trainset.n_users * trainset.n_items
    assert trainset.n_ratings + len(anti) == expect
Example #20
import os

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader, rating_scale=(1, 5))
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
Example #21
        'NormalPredictor': '[{}]({})'.format('Random',
                                             stable +
                                             'basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor'),
        'ml-100k': '[{}]({})'.format('Movielens 100k',
                                     'http://grouplens.org/datasets/movielens/100k'),
        'ml-1m': '[{}]({})'.format('Movielens 1M',
                                   'http://grouplens.org/datasets/movielens/1m'),
        }


# set RNG
np.random.seed(0)
random.seed(0)

dataset = 'ml-1m'
data = Dataset.load_builtin(dataset)
kf = KFold(random_state=0)  # folds will be the same for all algorithms.

table = []
for klass in classes:
    start = time.time()
    out = cross_validate(klass(), data, ['rmse', 'mae'], kf)
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    link = LINK[klass.__name__]
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))

    new_line = [link, mean_rmse, mean_mae, cv_time]
    print(tabulate([new_line], tablefmt="pipe"))  # print current algo perf
    table.append(new_line)
Example #22
import pandas as pd
from collections import defaultdict

from surprise import Dataset, Reader, SVDpp, accuracy
# the DataFrame is split below, so this is assumed to be sklearn's splitter
from sklearn.model_selection import train_test_split

# reading files
df_ratings = pd.read_csv('input/ratings.csv')
df_movies = pd.read_csv('input/movies.csv')
df_ratings = df_ratings.drop(columns='timestamp')
print(df_movies.head(5))
print(df_ratings.head(5))
#splitting data into train and test sets
train_split, test_split = train_test_split(df_ratings,
                                           test_size=0.25,
                                           random_state=20)
print("Training data size:", train_split.shape)
print("Test data size:", test_split.shape)
#reader for parsing the ratings file
reader = Reader(rating_scale=(1, 5))
#building the train and test set, loading the data from dataframe
train_build = Dataset.load_from_df(train_split, reader)
test_build = Dataset.load_from_df(test_split, reader)
trainset = train_build.build_full_trainset()
testset = test_build.build_full_trainset().build_testset()
print("Test set size:", len(testset))
#model building
#takes in factors, epochs, learning rate and regularization parameter
model = SVDpp(n_factors=20, n_epochs=5, lr_all=0.09, reg_all=0.5)
model.fit(trainset)
#making predictions
predictions = model.test(testset)
#calculating rmse
accuracy.rmse(predictions, verbose=True)
#Save all the predicted ratings and convert it to a dataframe
all_recommendations_list = defaultdict(list)
all_recommendations_df = pd.DataFrame([])
Example #23
    def _convert_data(self, follow_resp):
        # Must be (user_id, item_id, rating)
        user_id = []
        item_id = []
        rating = []

        follows = defaultdict(set)
        inverse_follows = defaultdict(set)
        for follow in follow_resp.results:
            # Do not have to worry about follow state, because even a rejected
            # follow is still a strong signal of interest by the followee.
            user_id.append(follow.follower)
            item_id.append(follow.followed)
            rating.append(1)

            follows[follow.follower].add(follow.followed)
            inverse_follows[follow.followed].add(follow.follower)

        # Now randomly put zeros in non-existing links in the network.
        # This is necessary as the problem is an example of PU-learning, where
        # we have no negative samples to "drag down" the recommendation
        # confidence. That's to say, without zeros, the model will never have
        # incentive not to recommend everyone, as it is never told that an
        # unsuitable recommendation is bad.
        self._logger.debug('Assigning zeros randomly.')

        # Create a set of all users we can create recommendations for by
        # getting the set union of both sides of all follow connections.
        all_users = tuple(set(follows) | set(inverse_follows))

        # We only want to add as many zeros as there are ones.
        num_zeros_added = 0
        num_ones = len(follow_resp.results)

        # Important to keep track of attempts to randomly add zeros, so that
        # in a densely connected graph (eg. on a small instance where everyone
        # follows everyone else) the loop doesn't continue forever.
        # We set the max attempts (rather arbitrarily) to the square of the
        # number of different users; this is the number of possible ways of
        # choosing two random users from the set of all users.
        num_attempts = 0
        max_attempts = len(all_users)**2

        # Continue adding zeros until there are the same number as there are
        # ones, or until we give up.
        while num_zeros_added < num_ones and num_attempts < max_attempts:
            num_attempts += 1
            follower = random.choice(all_users)
            followed = random.choice(all_users)

            if follower == followed:
                # Cannot follow yourself.
                continue
            if followed in follows[follower]:
                # Follow already exists.
                continue

            user_id.append(follower)
            item_id.append(followed)
            rating.append(0)
            num_zeros_added += 1

            # We don't want to accidentally re-add this as another zero later,
            # so reuse the follow set to ensure if this (follower, followed)
            # pair comes up again randomly, it is skipped.
            follows[follower].add(followed)

        d = {'follower': user_id, 'followee': item_id, 'rating': rating}
        df = pd.DataFrame(data=d)

        reader = Reader(rating_scale=(0, 1))
        return Dataset.load_from_df(df[['follower', 'followee', 'rating']],
                                    reader)
Example #24
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader)

    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
Example #25
import numpy as np
import pandas as pd

from surprise import Dataset
from surprise import KNNWithMeans
from surprise import SVD
from surprise.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

import PredictedRating

# 0. Data Load - Movie lens 1M data
data = Dataset.load_builtin('ml-1m')
kf = KFold(n_splits=5)
sim_options = {'name': 'cosine', 'user_based': True}


# 1. Precision & Recall & F1-measure
class Precision_Recall_F1:
    def __init__(self, data, algo):
        self.data = data
        self.algo = algo

    def precision_recall_at_k(self, predictions, k=10, threshold=3.5):
        '''Return precision and recall at k metrics for each user.'''

        # First map the predictions to each user.
        user_est_true = defaultdict(list)
Example #26
import os

import pytest

from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNBaseline
from surprise import Dataset
from surprise import Reader
from surprise.accuracy import neg_rmse
from surprise.model_selection import cross_validate
from surprise.model_selection import PredefinedKFold

# the test and train files are from the ml-100k dataset (10% of u1.base and
# 10% of u1.test)
train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))
pkf = PredefinedKFold()


def test_name_field():
    """Ensure the name field is taken into account."""

    sim_options = {'name': 'cosine'}
    algo = KNNBasic(sim_options=sim_options)
    rmse_cosine = cross_validate(algo, data, [['neg_rmse', neg_rmse]],
                                 pkf)['test_neg_rmse']

    sim_options = {'name': 'msd'}
    algo = KNNBasic(sim_options=sim_options)
    rmse_msd = cross_validate(algo, data, [['neg_rmse', neg_rmse]],
                              pkf)['test_neg_rmse']
Example #27
import numpy as np
import csv

from surprise import Dataset, KNNBasic, SVD, SVDpp, BaselineOnly
from surprise.model_selection import KFold, cross_validate
from cf_models import EbcrMsdKNN, EbcrCosKNN, EbcrNormPccKNN, NormPcc, SW_Norm_PccKNN, SW_MSD_KNN, SW_COS_KNN, LS_MSD_KNN, LS_COS_KNN, LS_Norm_PccKNN

__author__ = "Yu DU"

# Datasets initialisation
ml_100k = Dataset.load_builtin('ml-100k')
ml_1m = Dataset.load_builtin('ml-1m')
jester = Dataset.load_builtin('jester')

# Split train and test set
kf = KFold(random_state=0, n_splits=5)

list_k = [5, 10, 20, 40, 60, 80, 100, 200]
list_k2 = [5, 10, 15, 20, 25, 30, 35, 40]

# The Ml-100k Dataset
with open('results_ml100k_all.csv', mode='w') as result_file:
    fieldnames = ['k', 'algo', 'MAE', 'RMSE']
    writer = csv.DictWriter(result_file, fieldnames=fieldnames)
    writer.writeheader()

    # SVD algo
    svd = SVD()
    out_svd = cross_validate(svd,
                             ml_100k, ['rmse', 'mae'],
                             kf,
Example #28
    def ubcf_eval(self, co_pe, df_path):
        kfold = int(input("Enter number of folds required to Evaluate:"))

        reader = Reader(line_format="user item rating",
                        sep=',',
                        rating_scale=(1, 5))
        df = Dataset.load_from_file(df_path, reader=reader)

        self.splitter(kfold, df)

        # SIMILARITY & ALGORITHM DEFINING
        sim_op = {'name': co_pe, 'user_based': True}
        algo = KNNBasic(sim_options=sim_op)

        # RESPONSIBLE TO EXECUTE DATA SPLITS MENTIONED IN STEP 4
        start = time.time()
        perf = evaluate(algo, df, measures=['RMSE', 'MAE'])
        end = time.time()

        print_perf(perf)

        print "\nTotal Time elapsed =", (end - start)
        print "Average time per fold =", (end - start) / kfold, "\n"

        print perf

        ds = pd.read_csv("pred_matrix-full_ubcf.csv")
        confusion_matrix = np.matrix(ds)

        FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
        FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
        TP = np.diag(confusion_matrix)
        TN = confusion_matrix.sum() - (FP + FN + TP)

        # Sensitivity, hit rate, recall, or true positive rate
        TPR = TP / (TP + FN)
        # Specificity or true negative rate
        TNR = TN / (TN + FP)
        # Precision or positive predictive value
        PPV = TP / (TP + FP)
        # Negative predictive value
        NPV = TN / (TN + FN)
        # Fall out or false positive rate
        FPR = FP / (FP + TN)
        # False negative rate
        FNR = FN / (TP + FN)
        # False discovery rate
        FDR = FP / (TP + FP)

        # Overall accuracy
        ACC = (TP + TN) / (TP + FP + FN + TN)

        print "\nTrue Positive:\n", TP, "\n\nTrue Negative\n", TN, "\n\nFalse Positive\n", FP, "\n\nFalse Negative\n", FN
        print "-" * 30
        print "\nTrue Postive Ratio =", TPR, "\n\nFalse Positive Ratio =", FPR
        print "-" * 30

        print "*" * 20
        print confusion_matrix

        print "Accuracy with current Algorithm", algo, "is ", ACC.mean(axis=0)
Example #29
def get_full_df():
    return pd.read_csv(
        '/Users/ronlitman/Ronlitman/University/Statistic/שנה א׳ - סמט׳ א׳/למידה סטטיסטית/Netflix/df_join.csv'
    )


def hide_y(df, size=0.2):
    pass


# df = get_full_df()

df = pd.read_csv(
    '/Users/ronlitman/Ronlitman/University/Statistic/שנה א׳ - סמט׳ א׳/למידה סטטיסטית/Netflix/df_join.csv'
)

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(df[['uid', 'iid', 'rating']], reader)

# Split data into 5 folds

print('Split data into 5 folds')
data.split(n_folds=5)

# svd
print('SVD')
algo = SVD()

evaluate(algo, data, measures=['RMSE'])

# Retrieve the trainset.
trainset = data.build_full_trainset()
algo.train(trainset)
Example #30
def parameter_tuning():
    """
    After deciding to use the KNNWithMeans algorithm our next step is to tune its parameters to further increase its
    accuracy. There are three parameters we can tune:
    (1) The similarity options, in particular which option we use for computing the similarity matrix. [*]
    (2) The min_k parameter.
    (3) The k parameter.


    1. Sim options:
        We can choose between the standard (naive) cosine similarity, Pearson correlation (centered cosine
        similarity) or MSD (mean squared differences). Since the Pearson similarity outperforms the others, we
        stick with it.
    2. The min_k parameter:
        The minimum number of neighbors taken into account when computing the weighted adjusted ratings. If fewer
        than min_k neighbors are available (i.e. the user has not rated enough games with a similarity >= 0), the
        prediction falls back to the average rating of the particular game.
    3. The k parameter:
        The maximum number of neighbors taken into account when computing the weighted adjusted ratings: in our
        case, the k games rated by the target user that are most similar to the game whose rating we are trying
        to predict. We focus on this parameter below.


    More information can be found here:
    https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans


    As a result we chose the following parameters in our production environment:
    (1) Pearson correlation (centred cosine similarity)
    (2) k = 40
    (3) min_k = 5


    [*] The similarity options actually include another parameter that determines whether we use item-item or
        user-user similarities. Since we already distinguished between the two in our benchmarking, we focus here
        purely on which measure to use for computing the similarity matrix.
    """

    # import reduced dataset:
    df = import_all_reviews('C:/Users/lukas/PycharmProjects/board-game-recommender/import/Data/Joined/Results/Reviews_Reduced_SMALL.csv')

    # check for duplicates:
    duplicates = len(df) - len(df.drop_duplicates(subset=['game_key', 'user_key']))

    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])
    print('duplicates removed: ' + str(duplicates))

    ## Surprise:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    results = []

    sim_option = {'name': 'pearson', 'user_based': False}
    min_k = 5

    # try out different parameters for k:
    k_parameter = list(range(10, 200, 10))
    min_k_parameter = [1, 5, 10]

    # Cross validate:
    for k in k_parameter:
        for min_k in min_k_parameter:
            algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)
            results.append(
                cross_validate(algo, data, measures=['RMSE'], cv=5, return_train_measures=True, n_jobs=-3, verbose=True))

    # Print results:
    for i, result in enumerate(results):
        print('k = ' + str(k_parameter[i // len(min_k_parameter)]) + '\t \t'
              + 'min_k = ' + str(min_k_parameter[i % len(min_k_parameter)])
              + '\t \t RMSE Score: \t' + str(result['test_rmse'].mean())
              + '\t\t Fit-Time: ' + str(result['fit_time'])
              + '\t\t Test-Time: ' + str(result['test_time']))
Example #31

df = pd.DataFrame(r)
df.columns = ['mem_id', 'res_id', 'rating']

# extract the (mem_id, res_id) pairs that have no rating
R_ori = df.pivot_table('rating', index='mem_id', columns='res_id')
zero_matrix = R_ori.reset_index().melt('mem_id', var_name='res_id', value_name='rating')
zero_matrix = zero_matrix[zero_matrix['rating'].isna()][['mem_id', 'res_id']]


# train on the loaded data
from surprise import Reader

reader = Reader(rating_scale=(0.01, 5))
data = Dataset.load_from_df(df[['mem_id', 'res_id', 'rating']], reader=reader)
trainset = data.build_full_trainset()

algo = SVD(n_epochs=20, n_factors=50, random_state=0)
algo.fit(trainset)

# store the current time
now = datetime.datetime.now()
formattedDate = now.strftime("%Y%m%d_%H%M%S")
print(formattedDate)

# create a temp table named after the current timestamp
sql1 = """CREATE TABLE res_recommend_svd_{}(
            res_id INT,
            pred_rating Float,
            member_id INT)""".format(formattedDate)
Example #32
    def execute(self, params, **kwargs):
        # Load the movielens-100k dataset (download it if needed),
        data = Dataset.load_builtin('ml-100k')

        self.marvin_initial_dataset = {"data": data}
Example #33
        if self.trainset.knows_user(u) and self.trainset.knows_item(i):
            return np.dot(self.p[u], self.q[i])
        else:
            return self.trainset.global_mean


timex = []
mem = []
m1 = psutil.virtual_memory().percent

#For 100 record dataset
start = time.time()
df1 = pd.read_csv('C:/Users/dell pc/Desktop/Project/ratings_1million1.csv',
                  dtype={'rating': float})
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df1[['user_id', 'book_id', 'rating']], reader)
data.split(2)
algo = MatrixFacto(learning_rate=.01, n_epochs=10, n_factors=10)
result1 = surprise.evaluate(algo, data, measures=['RMSE'])
end = time.time()
print("Time1", end - start)
timex.append(end - start)
m2 = psutil.virtual_memory().percent
#print(m2)
mem.append(m2)

#For 1000 record dataset
start = time.time()
df2 = pd.read_csv('C:/Users/dell pc/Desktop/Project/ratings_1million2.csv',
                  dtype={'rating': float})
reader = Reader(rating_scale=(1, 5))
Example #34

from collections import defaultdict

from surprise import Dataset, Reader, SVD


def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n


# First train an SVD algorithm on the movielens dataset.
# Note: load_from_file() requires a reader argument; the line format below is
# an assumption, and the 'df' path is kept from the original snippet.
data = Dataset.load_from_file('df', reader=Reader(line_format='user item rating', sep=','))
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

#------------------- To compute precision@k and recall@k using surprise-----------------------------------
Example #35
import pickle
import pandas as pd
from surprise import SVD, Reader, Dataset

print('Now offline-train Amazon')
df = pd.read_csv('./exp_data/amazon_exp.csv')

reader = Reader()
data = Dataset.load_from_df(
    df=df[['user_id', 'item_id', 'rating']], reader=reader, rating_scale=(1, 5))
train_set = data.build_full_trainset()

raw_ratings = [(uid, iid, float(r)) for (uid, iid, r, time) in df.itertuples(index=False)]
raw2inner_id_users = {}
raw2inner_id_items = {}
current_u_index = 0
current_i_index = 0
for urid, irid, r in raw_ratings:
    try:
        uid = raw2inner_id_users[urid]
    except KeyError:
        uid = current_u_index
        raw2inner_id_users[urid] = current_u_index
        current_u_index += 1
    try:
        iid = raw2inner_id_items[irid]
    except KeyError:
        iid = current_i_index
        raw2inner_id_items[irid] = current_i_index
        current_i_index += 1
user_dict = {val: key for key, val in raw2inner_id_users.items()}
Example #36
from util import *


user, book, user_test, book_test, rate, user_all, book_all, user_dict, book_dict = read_data()

# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {'itemID': book,
                'userID': user,
                'rating': rate}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 10))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)


# Models
algos = []
algos_name = []

algos_name.append('BS_ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 1,
               'reg_i': 5
               }
algos.append(BaselineOnly(bsl_options=bsl_options))

algos_name.append('BS_SGD')
Example #37
def test_wrong_file_name():
    """Ensure file names are checked when creating a (custom) Dataset."""
    wrong_files = [('does_not_exist', 'does_not_either')]

    with pytest.raises(ValueError):
        Dataset.load_from_folds(folds_files=wrong_files, reader=Reader())
Example #38
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader, rating_scale=(1, 5))

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
Example #39
# writing data to file
from surprise import dump

from collections import defaultdict

## Load user reviews
df_users = pd.read_csv(
    r'D:\data science\nutrition\epi_reviews_75plus_w_usernames.csv',
    index_col=0)
df_users = df_users.loc[:, 'user':'******']

# formalize rating scale
reader = Reader(rating_scale=(1, 4))  # for centered: (-3, 3)

# put data into surprise format
data = Dataset.load_from_df(df_users, reader)
print(get_methods(data))

# Do a Grid Search for different hyperparameter values (earlier I tried this
# using only users with at least 8 ratings and the defaults were best for
# n_epochs, lr_all and reg_all, so I will fix them here and vary n_factors):

# # Note that the handbook suggests using different lrs for different params
# param_grid = {'n_factors': [10, 15, 20]}
# gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=5)

# gs.fit(data)

# # best RMSE score
# print(gs.best_score['rmse'])
Example #40
    one_data = one_data + user_info_dic[a_data[0]] + busi_info_dic[
        a_data[1]] + [float(a_data[2])]
    new_test_data.append(one_data)
new_test_data = pd.DataFrame(new_test_data)
new_train_only_data = new_train_data.iloc[:, 0:4]
new_train_label = new_train_data.iloc[:, 4]
new_test_only_data = new_test_data.iloc[:, 0:4]
clf = LinearRegression().fit(new_train_only_data, new_train_label)
y_pre = clf.predict(new_test_only_data)
linear_prediction = []
for i in range(len(y_pre)):
    all_info = [test_data_get[i][0]] + [test_data_get[i][1]] + [y_pre[i]]
    linear_prediction.append(all_info)
####################################surprise######################################
surprise_reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
surprise_train = Dataset.load_from_file(input_file, reader=surprise_reader)
surprise_train = surprise_train.build_full_trainset()
surprise_test_data = sc.parallelize(test_data_get).map(
    lambda s: (s[0], s[1], float(s[2]))).collect()
params = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
surprise_formula = BaselineOnly(bsl_options=params)
surprise_formula.fit(surprise_train)
surprise_predict = surprise_formula.test(surprise_test_data)
surprise_prediction = []
for i in range(len(surprise_predict)):
    surprise_prediction.append([
        surprise_predict[i][0], surprise_predict[i][1], surprise_predict[i][3]
    ])
################################SVD########################################
from surprise import SVD
svd_surprise = SVD(n_epochs=30, lr_all=0.008, reg_all=0.2)
Example #41
def SVD_surprise_only(Trainset, N=30):
    reader = Reader()
    Trainset_changetype = Dataset.load_from_df(
        Trainset[['Member_encoding', 'Game_encoding', 'score']], reader)
    Trainset_changetype_result = Trainset_changetype.build_full_trainset()
    svd = SVD(
        n_factors=20,
        n_epochs=20,
        lr_all=0.01,  #0.0001,
        random_state=1234)
    svd.fit(Trainset_changetype_result)

    games = list(Trainset.Game_encoding.unique()
                 )  # Get our unique games that were purchased

    x = np.zeros([len(games), len(games)])

    for k in range(0, round(np.shape(x)[0] / 200) + 1):
        for l in range(0, round(np.shape(x)[0] / 200) + 1):
            minxindex = k * 200
            minyindex = l * 200
            maxxindex = ((k + 1) * 200)  #- 1
            maxyindex = ((l + 1) * 200)  #- 1
            if k == round(np.shape(x)[0] / 200):
                maxxindex = np.shape(x)[1] + 1
            if l == round(np.shape(x)[0] / 200):
                maxyindex = np.shape(x)[1] + 1
            cut0 = np.dot(svd.pu, np.transpose(svd.qi[minxindex:maxxindex, :]))
            cut1 = np.dot(svd.pu, np.transpose(svd.qi[minyindex:maxyindex, :]))
            x[minxindex:maxxindex,
              minyindex:maxyindex] = cosine_similarity(np.transpose(cut0),
                                                       np.transpose(cut1))

    #model SVD_New
    cosine_sim_x = pd.DataFrame(data=x, index=games, columns=games)
    gamesplayed = Trainset.groupby([
        'Member_encoding'
    ])['Game_encoding'].apply(list).reset_index(name='games')
    gamesmax = np.array(
        gamesplayed.games.map(lambda x:
                              ((cosine_sim_x.loc[x, :].values).max(axis=0))))
    gamelist = np.array(cosine_sim_x.columns)

    def Get_neighbor_30(x):
        # x[x>0.99] = 0.0
        return (gamelist[np.flip(np.argsort(x, axis=0))[0:N, ]])

    filtered = list(map(Get_neighbor_30, gamesmax))
    filtered_array = np.array(filtered)
    filtered_array = filtered_array.reshape(
        filtered_array.shape[0] * filtered_array.shape[1], -1)
    filtered_array = filtered_array.reshape(-1, )
    SVD_Neighbor = pd.DataFrame({
        'Member_encoding':
        np.repeat(np.array(np.unique(Trainset.Member_encoding)), N, axis=0),
        'Game_encoding':
        filtered_array
    })
    #SVD_Neighbor_result = SVD_Neighbor.groupby('member_id').head(12)
    SVD_Neighbor_result = SVD_Neighbor.merge(
        Trainset[['Member_encoding', 'Game_encoding', 'score']],
        how='left',
        on=['Member_encoding', 'Game_encoding'])
    SVD_Neighbor_result.score = np.where(SVD_Neighbor_result.score.isna(), 0,
                                         SVD_Neighbor_result.score)
    SVD_Neighbor_result = SVD_Neighbor_result.sort_values(
        by=['Member_encoding', 'score'], ascending=False)
    SVD_Neighbor_result = SVD_Neighbor_result.groupby('Member_encoding').head(
        12)

    return SVD_Neighbor, SVD_Neighbor_result
Example #42
popular_products = pd.DataFrame(new_df.groupby('productId')['Rating'].count())
most_popular = popular_products.sort_values('Rating', ascending=False)
most_popular.head(30).plot(kind="bar")

plt.show()

from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

#Reading the dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(new_df, reader)

#Splitting the dataset
trainset, testset = train_test_split(data, test_size=0.3, random_state=10)

# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=5,
                    sim_options={
                        'name': 'pearson_baseline',
                        'user_based': False
                    })
algo.fit(trainset)

# run the trained model against the testset
test_pred = algo.test(testset)
print(test_pred)
Example #43
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Use the famous SVD algorithm.
algo = SVD()
# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
Example #44
# Reference https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD
import pandas as pd
import numpy as np

from surprise import SVD
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate

reader = Reader(line_format='user item rating',
                sep=',',
                rating_scale=(1, 5),
                skip_lines=1)

## Load the training set into surprise's custom dataset object
train_set = Dataset.load_from_file('data_movie_lens_100k/ratings_train.csv',
                                   reader=reader)
train_set = train_set.build_full_trainset()

## Load the test set into surprise's custom dataset object
## (Need to use intermediate pandas DataFrame because the true ratings are missing)
test_df = pd.read_csv('data_movie_lens_100k/ratings_test_masked.csv')
test_set = Dataset.load_from_df(test_df, reader=reader)
test_set = test_set.build_full_trainset().build_testset()

# Use the SVD algorithm
for n_factors in [1]:
    ## Fit model to training set
    model = SVD(n_factors=n_factors)
    model.fit(train_set)

    ## Measure error on the train set (completing the truncated snippet)
    train_predictions = model.test(train_set.build_testset())
    accuracy.rmse(train_predictions)
Example #45
0
    # get predictions based on training set
    testSet = trainingSet.build_anti_testset()
    testPredictions = knn.test(testSet)

    top3_recommendations = get_top_recommendations(testPredictions)
    print_recommendations(top3_recommendations)


    def parse_input(input):
        return input


    def recommend(input):
        return top3_recommendations[input['uid']]

legion.model.export(
    recommend,
    {
        'uid': legion.model.int32
    })
legion.model.save()

recommendation_example = recommend({'uid': 1})
print(repr(recommendation_example))


# Additional memory workload
file_path = (os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/u.data')
reader = Reader(line_format='user item rating timestamp', sep='\t', skip_lines=0)
data = Dataset.load_from_file(file_path, reader=reader)
Example #46
0
"""
This module describes how to load a dataset from a pandas dataframe.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise.model_selection import cross_validate


# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                'userID': [9, 32, 2, 45, 'user_foo'],
                'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                            rating_scale=(1, 5))

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(NormalPredictor(), data, cv=2)
Example #47
0
from surprise import SVD
from surprise import Dataset, accuracy
from surprise.model_selection import cross_validate, train_test_split

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)

# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

# Run 5-fold cross-validation and print results.
#cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
def hybrid(userId,train_rd):
    #get_ipython().magic('matplotlib inline')
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD
    from surprise.model_selection import cross_validate

    import warnings; warnings.simplefilter('ignore')


    # In[2]:


    #Popularity#

    md = pd.read_csv('CustomData/FinalData.csv')

    fd = pd.read_csv('avg_ratings1.csv')



    # (chained assignment on a filtered copy is a no-op; convert the column directly)
    fd['rating'] = fd['rating'].astype('float')
    vote_averages = fd[fd['rating'].notnull()]['rating']
    C = vote_averages.mean()


    fd1 = pd.read_csv('ratings_count.csv')


    fd1['rating'] = fd1['rating'].astype('float')
    vote_counts = fd1[fd1['rating'].notnull()]['rating']


    # In[3]:


    m = vote_counts.quantile(0.75)



    # In[4]:


    md['ratings_count'] = fd1['rating']
    md['average_rating'] = fd['rating']


    # In[28]:


    #print(md.shape)
    qualified = md[(md['ratings_count'].notnull())][['book_id','title', 'authors', 'ratings_count', 'average_rating']]

    qualified['ratings_count'] = qualified['ratings_count'].astype('float')

    qualified['average_rating'] = qualified['average_rating'].astype('float')

    #qualified.shape


    # In[29]:


    def weighted_rating(x):
        v = x['ratings_count']
        R = x['average_rating']
        return (v/(v+m) * R) + (m/(m+v) * C)
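    # For intuition, with hypothetical values m = 100 and C = 3.5: a book with
    # v = 300 ratings averaging R = 4.5 scores
    # (300/400)*4.5 + (100/400)*3.5 = 3.375 + 0.875 = 4.25,
    # i.e. the raw average is shrunk toward the global mean C.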


    # In[30]:


    qualified['popularity_rating'] = qualified.apply(weighted_rating, axis=1)
    #qualified['wr']
    #qualified = qualified.sort_values('popularity_rating', ascending=False).head(250)
    pop = qualified[['book_id','popularity_rating']]
    #print(qualified.shape)
    #print(pop.shape)


    # In[11]:


    ### Collaborative ##

    reader = Reader()
    ratings=train_rd
    #ratings = pd.read_csv('ratings.csv')
    #ratings.head()

    temp_ratings = ratings[0:1000]

    #print(temp_ratings)
    data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)
    # (data.split(n_folds=2) is the removed pre-1.0 API; folding is handled by cv=2 below)


    # In[12]:


    svd = SVD()
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=2)


    # In[13]:


    trainset = data.build_full_trainset()
    #svd.train(trainset)
    algo = SVD()
    algo.fit(trainset)

    ## usefule = temp_rating[rating]


    # In[14]:


#print(len(temp_ratings[temp_ratings['user_id']==userId]))


    # In[ ]:


    def get_top_n(predictions, n=10):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n


    # In[15]:


    from collections import defaultdict
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    '''
    top_n = get_top_n(predictions, n=10000)

    #print(top_n)
    #result = pd.DataFrame(top_n)
    #print(result)
    for uid, user_ratings in top_n.items():
    
        #print(uid, [iid for (iid  , _) in user_ratings])
        for uid, iid, true_r, est, _ in predictions:
        
            temp_ratings.loc[uid]= [uid,iid,est]
        #temp_ratings[i]['cf'] = temp_ratings[(temp_ratings['user_id'] == uid)][['book_id']]
        
    '''
    count = 0
    for uid, iid, true_r, est, _ in predictions:
        if uid == userId:
            count = count + 1
            temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]
            #print('here')

            #print(uid)
            #temp_ratings.append([uid,iid,est],ignore_index=True)

    #print(count)
    #print(temp_ratings)



    # In[16]:


    #print(len(temp_ratings[temp_ratings['user_id']==2]))


    # In[ ]:





    # In[46]:


    ##### CONTENT ######

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD
    import csv
    import warnings; warnings.simplefilter('ignore')


    # In[48]:



    md=pd.read_csv('CustomData/FinalData.csv')
    rd=train_rd
    #rd=pd.read_csv('ratings.csv')
    md['book_id'] = md['book_id'].astype('int')
    rd['book_id'] = rd['book_id'].astype('int')
    rd['user_id'] = rd['user_id'].astype('int')
    rd['rating'] = rd['rating'].astype('int')

    #print(md.head())


    md['authors'] = md['authors'].str.replace(' ','')
    md['authors'] = md['authors'].str.lower()
    md['authors'] = md['authors'].str.replace(',',' ')

    #print(md.head())

    md['authors'] = md['authors'].apply(lambda x: [x,x])
    #print(md['authors'])

    md['Genres']=md['Genres'].str.split(';')
    #print(md['Genres'])

    md['soup'] = md['authors'] + md['Genres']
    #print(md['soup'])

    md['soup'] = md['soup'].str.join(' ')

    #md['soup'].fillna({})
    #print(md['soup'])

    count = CountVectorizer(analyzer='word',ngram_range=(1,1),min_df=0, stop_words='english')
    count_matrix = count.fit_transform(md['soup'])
    #print (count_matrix.shape)
    #print np.array(count.get_feature_names())
    #print(count_matrix.shape)

    cosine_sim = cosine_similarity(count_matrix, count_matrix)


    # In[91]:


    def build_user_profiles():
        user_profiles=np.zeros((53421,999))
        #print(rd.iloc[0]['user_id'])
        #len(rd['book_id'])
        for i in range(0,1000):
            u=rd.iloc[i]['user_id']
            b=rd.iloc[i]['book_id']
            #print(u,b)
            #print(i)
            #if b<999:
                #print("match at "+str(b))
            user_profiles[u][b-1]=rd.iloc[i]['rating']
        #print(user_profiles)
        return user_profiles

    user_profiles=build_user_profiles()
    def _get_similar_items_to_user_profile(person_id):
            #Computes the cosine similarity between the user profile and all item profiles
            #print(user_profiles[person_id])
        #print("\n---------\n")
        #print(cosine_sim[0])
        user_ratings = np.empty((999, 1))
        cnt = 0
        # range(0, 998) would miss the last item and leave an uninitialized
        # np.empty() entry, so iterate over all 999 items.
        for i in range(999):
            book_sim = cosine_sim[i]
            user_sim = user_profiles[person_id]
            user_ratings[i] = book_sim.dot(user_sim) / sum(cosine_sim[i])
        maxval = max(user_ratings)

        # Rescale so the best item scores 5, and count items rated above 3.
        for i in range(999):
            user_ratings[i] = (user_ratings[i] * 5.0) / maxval
            if user_ratings[i] > 3:
                cnt += 1

        return user_ratings
    content_ratings = _get_similar_items_to_user_profile(userId)



    # In[100]:


    num = md[['book_id']]
    #print(num)

    num1 = pd.DataFrame(data=content_ratings[0:,0:])


    frames = [num, num1]
    #result = pd.concat([df1, df4], axis=1, join_axes=[df1.index])

    # (join_axes was removed from pd.concat; reindex to num's index instead)
    mer = pd.concat(frames, axis=1).reindex(num.index)
    mer.columns=['book_id', 'content_rating']
    #print(mer.shape)
    #print('here')
    #print(mer)





    # In[102]:


    ## for user 2 #

#print(temp_ratings.shape)
    cb = temp_ratings[(temp_ratings['user_id'] == userId)][['book_id', 'rating']]
#   print(cb.shape)
#   print(pop.shape)
    hyb = md[['book_id']]
    hyb = hyb.merge(cb,on = 'book_id')
    hyb = hyb.merge(pop, on='book_id')
    hyb = hyb.merge(mer, on='book_id')
    #hyb.shape


    # In[106]:


    def weighted_rating(x):
        v = x['rating']
        R = x['popularity_rating']
        c = x['content_rating']
        return 0.4*v + 0.2*R + 0.4 * c


    # In[107]:


    hyb['final'] = hyb.apply(weighted_rating, axis=1)
    hyb = hyb.sort_values('final', ascending=False).head(999)
    print(hyb)
    return hyb
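# Hypothetical invocation (assumes a ratings DataFrame with user_id, book_id
# and rating columns, plus the CSV files read inside hybrid()):
# recommendations = hybrid(userId=2, train_rd=ratings)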
Example #49
0
"""# arrange dataset"""

data = rating_author.groupby(['User-ID', 'Author'])["Book-Rating"].agg(['mean']).reset_index()
data.sort_values(by=['User-ID', 'Author'], inplace=True)
data.columns = ["userID", "author", "raw_ratings"]

## binning raw_ratings (the means produced by .agg(['mean']) are floats, so
## bin by range rather than by integer membership)
data.raw_ratings = data.raw_ratings.apply(
    lambda x: 0 if x == 0 else (1 if x <= 4 else (2 if x <= 7 else 3)))

data.raw_ratings = data.raw_ratings.astype("int")
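# An equivalent, arguably clearer binning with pd.cut (a sketch of an
# alternative to the lambda above; don't run both, or the already-binned
# values would be re-binned):
# data.raw_ratings = pd.cut(data.raw_ratings, bins=[-0.001, 0, 4, 7, 10],
#                           labels=[0, 1, 2, 3]).astype("int")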

"""# make dataset for surprise"""

reader = Reader(rating_scale=(0, 3))  # ratings were binned to 0-3 above
data = Dataset.load_from_df(data[["userID", "author", "raw_ratings"]], reader)

del user, item, rating, rating_author
gc.collect()

"""# train by surprise"""

kf = KFold(random_state=0, n_splits=3)

classes = (SVD, SVDpp, NMF, KNNBaseline, BaselineOnly, CoClustering)
for klass in classes:
  print(klass.__name__)
  for trainset, testset in kf.split(data):
    # train and test each algorithm
    algo = klass()
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)
from surprise.model_selection import cross_validate
reader = Reader()

ratings.head()


# In[25]:


ratings.shape


# In[27]:


data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
#data.split(n_folds=5)

svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'])


# In[28]:


trainset = data.build_full_trainset()
svd.fit(trainset)


# In[29]:
from collections import defaultdict

from surprise import Dataset, Reader
from surprise.model_selection import PredefinedKFold


def k_recommend(model, k, testset):

    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    fold_files = [('~/Desktop/Tufts/Fall2018/COMP135/Project3/trainset.csv',
                   '~/Desktop/Tufts/Fall2018/COMP135/Project3/testset.csv')]

    pdkfold = PredefinedKFold()
    clf = model.best_estimator['mae']
    data = Dataset.load_from_folds(fold_files, reader=reader)

    for train, test in pdkfold.split(data):
        clf.fit(train)
        test1 = train.build_anti_testset()
        preds = clf.test(test1)

    top_n = defaultdict(list)

    for uid, iid, true_r, est, _ in preds:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:k]
    """
    for uid, user_ratings in top_n.items():
        print(uid, [iid for (iid, _) in user_ratings])
    for uid, user_ratings in top_n.items():
        print uid, user_ratings
    """

    for uid in top_n:
        i = 0
        for iid in top_n[uid]:
            found = False
            for iid2 in testset[uid]:
                if iid[0] == str(iid2[0]):
                    a = iid[0]
                    top_n[uid].remove(top_n[uid][i])
                    top_n[uid].insert(i,(a, iid2[1]))
                    found = True
                    i += 1
                    break
            if not found:
                a = iid[0]
                top_n[uid].remove(top_n[uid][i])
                top_n[uid].insert(i,(a, 2))
                i += 1

    total_sum = 0.0
    user_sum = 0.0
    us_rec = []
    for uid in top_n:
        i = 0.0
        for iid in top_n[uid]:
            i += 1.0
            user_sum += iid[1]
        total_sum += float(user_sum / i)
        us_rec.append(user_sum / i)
        user_sum = 0.0

    #print(us_rec)
    print("Average rating: ", (total_sum / float(len(top_n))))
Example #52
0
from collections import defaultdict

import pandas as pd
from surprise import Dataset, Reader, SVD, NMF, NormalPredictor, KNNBasic
from surprise.model_selection import cross_validate

#Load the movielens-100k dataset
##should be ratings
url = 'https://raw.githubusercontent.com/MutugiD/Data-Problems/master/Recommender/movie_ratings.csv'
data = pd.read_csv(url)
rating_dict = {
    'itemID': list(data.movieId),
    'userID': list(data.userId),
    'rating': list(data.rating)
}
df = pd.DataFrame(rating_dict)

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
df.groupby('itemID')['rating'].count().reset_index().sort_values(
    'rating', ascending=False)[:10]

benchmark = []
# Iterate over SVD, NMF, NormalPredictor, KNNBasic
for algo in [SVD(), NMF(), NormalPredictor(), KNNBasic()]:
    # Perform cross validation
    results = cross_validate(
        algo, data, measures=['RMSE'], cv=3,
        verbose=False)  # Get results & append algorithm name
    temp = pd.DataFrame.from_dict(results).mean(axis=0)
    # Series.append was removed in pandas 2.0; concatenate instead
    temp = pd.concat([temp,
                      pd.Series([str(algo).split(' ')[0].split('.')[-1]],
                                index=['Algorithm'])])
    benchmark.append(temp)
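# With benchmark.append(temp) collecting one row per algorithm above, the
# comparison table can be assembled like this (a sketch):
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm')
print(surprise_results.sort_values('test_rmse'))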
Example #53
0
from surprise import KNNWithMeans
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
import os

# Path to the data file
file_path = os.path.expanduser('mydata.csv')
# Tell the Reader what format the file is in
reader = Reader(line_format='user item rating', sep=',')
# Load the data
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()

# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=50, sim_options={'user_based': False})  # only the k most similar neighbors are used
algo.fit(trainset)

# we can now query for specific predictions
uid = str(5)  # raw user id
iid = str(1)  # raw item id

# get a prediction for specific users and items.
pred = algo.predict(uid, iid)
print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)# rating of user-5 to item-1

#----------------------------
uid = str(5)  # raw user id
iid = str(5)  # raw item id
# get a prediction for specific users and items.
pred = algo.predict(uid, iid)
print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)
Example #54
0
        # * Precision at K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # * Recall at K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls
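# The fragment above starts mid-function; a self-contained sketch of the
# whole helper, following the standard Surprise FAQ recipe (names inferred
# from the fragment):
from collections import defaultdict


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k for each user."""
    # Map each user to a list of (estimated rating, true rating) pairs.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = dict(), dict()
    for uid, user_ratings in user_est_true.items():
        # Rank this user's items by estimated rating.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for (_, true_r) in user_ratings)
        n_rec_k = sum(est >= threshold for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold)
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
    return precisions, recalls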


# * imports assumed by this fragment
from surprise import Dataset, Reader, KNNWithMeans, accuracy
from surprise.model_selection import KFold, train_test_split

# * using reader to be able to deal with the imported CSV
reader = Reader(line_format="user item rating timestamp",
                sep=",",
                rating_scale=(0.5, 5),  # ml-latest-small uses half-star ratings
                skip_lines=1)
# * loading the csv
data = Dataset.load_from_file(
    file_path="../../ML_Dataset/ml-latest-small/ratings.csv", reader=reader)
# * dividing in train and test sets
trainset, testset = train_test_split(data, test_size=0.25)

# * define a cross-validation iterator
kf = KFold(n_splits=5)

# * Choosing KNN With Means as algorithm
algo = KNNWithMeans()

# * Train the algorithm on the trainset, and predict ratings for the testset
for trainset, testset in kf.split(data):
    predictions = algo.fit(trainset).test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
Example #55
0
import pandas as pd
from surprise import Reader
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

if __name__ == '__main__':
    df = pd.read_csv("data_1m.csv")
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'song_id', 'rating']], reader)
    algo = SVD()
    cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
# In[83]:


listOfRMSE = []
models = []


# In[84]:


from surprise import Reader, Dataset, SVD, SVDpp, accuracy
from surprise.model_selection import train_test_split
reader = Reader(line_format='user item rating', sep=',', rating_scale=(0, 6))
df_temp1 = df_final_user_repo_star_v3.copy(deep=True)
data = Dataset.load_from_df(df_temp1, reader)
# Test that surprise is working by running SVD on the dataset

# We'll use the famous SVD algorithm.
algo = SVD(n_factors= 100, n_epochs= 20, biased=True, init_std_dev=0.1, lr_all=0.005)

# Train the algorithm on the trainset, and predict ratings for the testset
trainset = data.build_full_trainset()

algo.fit(trainset)

testset = trainset.build_anti_testset()
svd_predictions = algo.test(testset)

rmse_svd = accuracy.rmse(svd_predictions)
print(rmse_svd)
Example #57
0
"""
This module describes how to use the GridSearchCV() class for finding the best
parameter combination of a given algorithm.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

# Use movielens-100K
data = Dataset.load_builtin('ml-100k')

param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())
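# The full grid results live in gs.cv_results, a dict of lists that pandas
# can consume directly -- a small sketch for inspecting all combinations:
import pandas as pd

results_df = pd.DataFrame(gs.cv_results)
print(results_df[['params', 'mean_test_rmse', 'mean_test_mae']])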
Example #58
0
import zipfile

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# Unzip ml-100k.zip (use a context manager; don't shadow the zipfile module)
with zipfile.ZipFile('ml-100k.zip', 'r') as zf:
    zf.extractall()

# Read data into an array of strings
with open('./ml-100k/u.data') as f:
    all_lines = f.readlines()

# Prepare the data to be used in Surprise
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file('./ml-100k/u.data', reader=reader)

# Choose the algorithm, then run 5-fold cross-validation reporting the RMSE
# and MAE scores (data.split() and evaluate() are the removed pre-1.0 API)
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Retrieve the trainset.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Predict a certain item
userid = str(196)
itemid = str(302)
actual_rating = 4
pred = algo.predict(userid, itemid, r_ui=actual_rating)
print(pred)