def test_predict(rating_true):
    svd = surprise.SVD()
    train_set = surprise.Dataset.load_from_df(
        rating_true, reader=surprise.Reader()
    ).build_full_trainset()
    svd.fit(train_set)

    preds = predict(svd, rating_true)
    assert set(preds.columns) == {"userID", "itemID", "prediction"}
    assert preds["userID"].dtypes == rating_true["userID"].dtypes
    assert preds["itemID"].dtypes == rating_true["itemID"].dtypes
    user = rating_true.iloc[0]["userID"]
    item = rating_true.iloc[0]["itemID"]
    assert preds[(preds["userID"] == user) & (preds["itemID"] == item)][
        "prediction"
    ].values == pytest.approx(svd.predict(user, item).est, rel=TOL)

    preds = predict(
        svd,
        rating_true.rename(columns={"userID": "uid", "itemID": "iid"}),
        usercol="uid",
        itemcol="iid",
        predcol="pred",
    )
    assert set(preds.columns) == {"uid", "iid", "pred"}
    assert preds["uid"].dtypes == rating_true["userID"].dtypes
    assert preds["iid"].dtypes == rating_true["itemID"].dtypes
    user = rating_true.iloc[1]["userID"]
    item = rating_true.iloc[1]["itemID"]
    assert preds[(preds["uid"] == user) & (preds["iid"] == item)][
        "pred"
    ].values == pytest.approx(svd.predict(user, item).est, rel=TOL)
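The `predict` helper under test isn't shown here. A minimal sketch of what it presumably does, assuming it scores every (user, item) pair in the input frame and preserves the input id dtypes (which the assertions above require):

import pandas as pd

def predict(algo, data, usercol="userID", itemcol="itemID", predcol="prediction"):
    # Score each (user, item) pair in `data` with a fitted surprise model.
    rows = [
        (u, i, algo.predict(u, i).est)
        for u, i in zip(data[usercol], data[itemcol])
    ]
    out = pd.DataFrame(rows, columns=[usercol, itemcol, predcol])
    # Preserve the original id dtypes, as the tests assert.
    return out.astype({usercol: data[usercol].dtype, itemcol: data[itemcol].dtype})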
Example #2
    def __init__(self, hyper_params, user_count, item_count):
        # latent_size is only needed by the factor models below
        latent_size = hyper_params.get('latent_size')

        if hyper_params['model_type'] == 'kNN':
            self.model = surprise.prediction_algorithms.knns.KNNBasic(
                k=10, verbose=True)
        elif hyper_params['model_type'] == 'NMF':
            self.model = surprise.NMF(n_factors=latent_size,
                                      biased=False,
                                      n_epochs=50,
                                      verbose=True)
        elif hyper_params['model_type'] == 'SVD':
            self.model = surprise.SVD(n_factors=latent_size, verbose=True)
        elif hyper_params['model_type'] == 'SVD++':
            self.model = surprise.SVDpp(n_factors=latent_size, verbose=True)
        elif hyper_params['model_type'] == 'baseline':
            bsl_options = {
                'method': 'sgd',
                'n_epochs': 20,
            }
            self.model = surprise.prediction_algorithms.baseline_only.BaselineOnly(
                bsl_options=bsl_options, verbose=True)
        else:
            raise ValueError("Unknown model_type: %s" % hyper_params['model_type'])

        self.hyper_params = hyper_params
        self.user_count = user_count
        self.item_count = item_count
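The enclosing class isn't shown; a hedged usage sketch, where the class name (`CFModel`) and the hyper-parameter keys are assumptions based on the constructor above:

# Hypothetical usage; `CFModel` and the trainset are assumptions.
hyper_params = {'model_type': 'SVD', 'latent_size': 32}
wrapper = CFModel(hyper_params, user_count=943, item_count=1682)
wrapper.model.fit(trainset)  # trainset: a surprise Trainset built elsewhere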
Example #3
def get_matrix_factorization(ratings, meta_data, n_user, n_movies):
    # Matrix factorization (note: meta_data and n_user are currently unused)
    algo = surprise.SVD(n_factors=50, biased=False)
    reader = surprise.Reader(rating_scale=(0.5, 5))
    surprise_data = surprise.Dataset.load_from_df(
        ratings[["userId", "movieId", "rating"]],
        reader).build_full_trainset()
    algo.fit(surprise_data)

    pred = algo.test(surprise_data.build_testset())
    print("MSE: ", surprise.accuracy.mse(pred))
    print("RMSE: ", surprise.accuracy.rmse(pred))

    ranking_matrix = np.dot(algo.pu, algo.qi.T)
    # ranking_matrix = np.clip(ranking_matrix, 0.5, 5)

    # movie_idx_to_id = [surprise_data.to_raw_iid(x) for x in movies_to_pick]
    movie_idx_to_id = [surprise_data.to_raw_iid(x) for x in range(n_movies)]
    features_matrix_factorization = algo.pu
    print("Means: ", np.mean(features_matrix_factorization),
          np.mean(algo.qi.T))
    print("Feature STD:", np.std(features_matrix_factorization),
          np.std(algo.qi))
    print("Full Matrix Shape", np.shape(ranking_matrix), "rankinG_shape",
          np.shape(ranking_matrix))

    return ranking_matrix, features_matrix_factorization, movie_idx_to_id
def svd_surprise(k=20, epochs=20, learning_rate=0.005, bias=True, test_fraction=0.0):
    """
    Performs SVD on the ratings data using surprise.

    """
    # Load the data
    reader = surprise.Reader(rating_scale=(1, 5), sep='\t')
    data = surprise.Dataset.load_from_file('data/data.txt', reader)

    if test_fraction == 0.0:
        # Train on the full data set; hold out a 25% sample only for reporting
        # (these test ratings are then also seen during training).
        _, test_set = surprise.model_selection.train_test_split(data, test_size=0.25)
        train_set = data.build_full_trainset()
    else:
        # Split the data into a training set and test set
        train_set, test_set = surprise.model_selection.train_test_split(data, test_size=test_fraction)

    # Declare the model
    model = surprise.SVD(n_factors=k, n_epochs=epochs, lr_all=learning_rate, biased=bias)

    # Train the model on the data
    model.fit(train_set)
    predictions = model.test(test_set)

    # Print the accuracy of the predictions
    print("SVD Test RMSE: " + str(surprise.accuracy.rmse(predictions, verbose=False)))

    # Return U, V, the user bias terms, and the movie bias terms
    return model.pu, model.qi, model.bu, model.bi
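The returned factors can be assembled into a dense prediction matrix. A hedged sketch (with biased=True a prediction is mu + b_u + b_i + p_u . q_i; mu is not returned by svd_surprise, so it is recomputed from the raw file here, assuming column 2 of the tab-separated file holds the rating; note that surprise clips served predictions to the rating scale, while this raw matrix does not):

import numpy as np
import pandas as pd

U, V, bu, bi = svd_surprise(k=20, test_fraction=0.1)
mu = pd.read_csv('data/data.txt', sep='\t', header=None)[2].mean()
full_matrix = mu + bu[:, None] + bi[None, :] + U @ V.T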
Example #5
def getUserBaseData(user, addr, raw, count):
    # when importing from a DF, you only need to specify the scale of the ratings.
    reader = surprise.Reader(rating_scale=(1, 4))
    # Load into surprise:
    dataframe = surprise.Dataset.load_from_df(raw, reader)
    trainset = dataframe.build_full_trainset()

    algo = surprise.SVD()
    algo.fit(trainset)

    iids = raw['around'].unique()
    iids_visited = raw.loc[raw['user'] == user, 'around']
    iids_to_pred = np.setdiff1d(iids, iids_visited)  # items the user has not visited (set difference)
    # Build a testset from the items this user has not visited
    testset = [[user, iid, 4.] for iid in iids_to_pred]
    predictions = algo.test(testset)
    # print(surprise.accuracy.rmse(predictions))
    pred_ratings = np.array([pred.est for pred in predictions])
    # print(len(pred_ratings))
    if len(pred_ratings) < count:
        i_max = pred_ratings.argsort()[::-1]
    else:
        i_max = pred_ratings.argsort()[::-1][:count]
    #i_max = pred_ratings.argmax()
    iid = iids_to_pred[i_max]
    results = {}
    results_ids = []
    for i, m in zip(iid, i_max):
        # print('{0} : {1}'.format(i,pred_ratings[m]))
        results[i] = pred_ratings[m]
        results_ids.append(i)
    print(results_ids)
    return results_ids
def train(args):
    """Training script taking parsed command line / SageMaker variable arguments
    """
    input_files = [
        os.path.join(args.train, file) for file in os.listdir(args.train)
    ]
    if len(input_files) == 0:
        raise ValueError(
            "There are no files in {}.\n"
            "This usually indicates that the channel ({}) was incorrectly specified,\n"
            "the data specification in S3 was incorrectly specified, or the role specified\n"
            "does not have permission to access the data.".format(args.train, "train"))
    train_df = pd.concat(
        [pd.read_csv(file, engine="python") for file in input_files])
    train_data = surprise.Dataset.load_from_df(
        train_df,
        surprise.Reader(line_format=u"user item rating", rating_scale=(1, 5)))
    algo = surprise.SVD()

    # Note: Quality metrics like this can be exposed to SageMaker if wanted, see:
    # https://sagemaker.readthedocs.io/en/stable/overview.html#training-metrics
    results = surprise.model_selection.cross_validate(
        algo,
        train_data,
        measures=("RMSE", "MAE"),
        verbose=True,
        cv=args.cross_validation_folds)

    # The main mission of our script is to train a model and then save it to file:
    algo.fit(train_data.build_full_trainset())
    surprise.dump.dump(os.path.join(args.model_dir, ALGO_FILE_NAME), algo=algo)
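For inference, the matching load step uses surprise.dump.load; a minimal sketch, assuming the same ALGO_FILE_NAME constant and SageMaker's model_fn convention:

def model_fn(model_dir):
    # SageMaker-convention loader; surprise.dump.load returns (predictions, algo).
    _, algo = surprise.dump.load(os.path.join(model_dir, ALGO_FILE_NAME))
    return algo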
def train_model(M, N, K, eta, reg, Y, eps=0.0001, max_epochs=300):
    """
    Given a training data matrix Y containing rows (i, j, Y_ij),
    where Y_ij is user i's rating on movie j, learns latent factor
    matrices U (M x K) and V (N x K) such that rating Y_ij is
    approximated by (UV^T)_ij.

    Note: this version delegates to surprise.SVD with its default
    hyper-parameters, so M, N, K, eta, reg, eps, and max_epochs are
    currently unused.

    Returns a tuple (model, err, trainset), where err is the
    unregularized MSE of the model on Y.
    """
    df = pd.DataFrame(Y)
    df = df.sort_values(1)
    model = surprise.SVD()
    reader = surprise.Reader(rating_scale=(1, 5))
    data = surprise.Dataset.load_from_df(df[[0, 1, 2]], reader)

    trainset = data.build_full_trainset()

    model.fit(trainset)

    return (model, get_err(model, Y), trainset)
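The docstring above describes an explicit SGD loop with learning rate eta, regularization reg, and an eps-based stopping rule, which the body no longer performs. A self-contained hedged sketch of that procedure, assuming the (i, j) indices in Y are zero-based:

import numpy as np

def train_model_sgd(M, N, K, eta, reg, Y, eps=0.0001, max_epochs=300):
    # Plain SGD matrix factorization matching the docstring's description.
    Y = np.asarray(Y, dtype=float)
    U = np.random.normal(scale=0.1, size=(M, K))
    V = np.random.normal(scale=0.1, size=(N, K))

    def mse(regularized):
        err = sum((y - U[int(i)] @ V[int(j)]) ** 2 for i, j, y in Y) / len(Y)
        if regularized:
            err += reg * (np.sum(U ** 2) + np.sum(V ** 2)) / len(Y)
        return err

    prev, first_drop = mse(True), None
    for _ in range(max_epochs):
        np.random.shuffle(Y)
        for i, j, y in Y:
            i, j = int(i), int(j)
            e = y - U[i] @ V[j]
            u_old = U[i].copy()  # update both factors from the pre-step values
            U[i] += eta * (e * V[j] - reg * U[i])
            V[j] += eta * (e * u_old - reg * V[j])
        cur = mse(True)
        if first_drop is None:
            first_drop = prev - cur
        elif prev - cur < eps * first_drop:
            break
        prev = cur
    return U, V, mse(False)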
Example #8
def test_compute_rating_predictions(python_data):
    rating_true, _, _ = python_data(binary_rating=False)
    svd = surprise.SVD()
    train_set = surprise.Dataset.load_from_df(
        rating_true, reader=surprise.Reader()).build_full_trainset()
    svd.fit(train_set)

    preds = compute_rating_predictions(svd, rating_true)
    assert set(preds.columns) == {'userID', 'itemID', 'prediction'}
    assert preds['userID'].dtypes == rating_true['userID'].dtypes
    assert preds['itemID'].dtypes == rating_true['itemID'].dtypes
    user = rating_true.iloc[0]['userID']
    item = rating_true.iloc[0]['itemID']
    assert preds[(preds['userID'] == user) & (preds['itemID'] == item)]['prediction'].values == \
           pytest.approx(svd.predict(user, item).est, rel=TOL)

    preds = compute_rating_predictions(svd,
                                       rating_true.rename(columns={
                                           'userID': 'uid',
                                           'itemID': 'iid'
                                       }),
                                       usercol='uid',
                                       itemcol='iid',
                                       predcol='pred')
    assert set(preds.columns) == {'uid', 'iid', 'pred'}
    assert preds['uid'].dtypes == rating_true['userID'].dtypes
    assert preds['iid'].dtypes == rating_true['itemID'].dtypes
    user = rating_true.iloc[1]['userID']
    item = rating_true.iloc[1]['itemID']
    assert preds[(preds['uid'] == user) & (preds['iid'] == item)]['pred'].values == \
           pytest.approx(svd.predict(user, item).est, rel=TOL)
Example #9
def train(
    dataset: surprise.dataset.Dataset
) -> surprise.prediction_algorithms.AlgoBase:
    algo = surprise.SVD()

    cv_iterator = 5
    # cv_iterator = surprise.model_selection.ShuffleSplit(n_splits=10, test_size=0.2)

    surprise.model_selection.cross_validate(
        algo,
        dataset,
        cv=cv_iterator,
        n_jobs=-1,
        measures=['rmse', 'mae'],  # 'fcp'
        return_train_measures=True,
        verbose=True,
    )

    trainset = dataset.build_full_trainset()
    testset = trainset.build_testset()

    # TODO: verify
    algo.fit(trainset)

    print('running test')
    predictions = algo.test(testset)
    print('test done')
    surprise.accuracy.rmse(predictions)

    return algo
Example #10
def surprise_SVD(trainset, finalset):
    "SVD model"
    algo = spr.SVD(n_factors=40, n_epochs=20, lr_all=0.001)

    algo.fit(trainset)
    predictions_final = algo.test(finalset)

    return spr_estimate_to_vect(predictions_final)
Example #11
 def train(self, dataset: RecommendationDataset) -> None:
     train_set = surprise.Dataset.load_from_df(
         dataset.data[[
             # load_from_df expects (user, item, rating) columns; item_col is
             # assumed to be the dataset's item id column.
             dataset.user_col, dataset.item_col, dataset.score_col
         ]],
         reader=surprise.Reader()).build_full_trainset()
     self.svd = surprise.SVD(random_state=0,
                             n_factors=200,
                             n_epochs=self.epochs,
                             verbose=True)
     self.svd.fit(train_set)
def SVD(train, test, rate):
    """
    Run the SVD model from Surprise library. The number of factors is 40. The number of iterations is 20.
    @param train: the training set in the Surprise format.
    @param test: the test set in the Surprise format.
    @param rate: the learning rate of all parameters.
    @return: the predictions in a numpy array.
    """
    algo = spr.SVD(n_factors=40, lr_all=rate)
    algo.fit(train)
    predictions = algo.test(test)
    return get_predictions(predictions)
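`get_predictions` here (and `spr_estimate_to_vect` in Example #10) are not shown; both presumably just collect the `.est` field of each prediction into a numpy array. A minimal sketch under that assumption:

import numpy as np

def get_predictions(predictions):
    # Collect the estimated rating from each surprise Prediction object.
    return np.array([pred.est for pred in predictions])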
Example #13
def main(args):

    user_item_based = 'item_based' if args.item_based else 'user_based'
    filename = '_'.join([
        args.exp_name, args.algorithm, args.sim_name, user_item_based,
        str(args.num_rows)
    ]) + '.pkl'

    output_file = Path(filename)
    if output_file.exists():
        print(f'ERROR! Output file {output_file} already exists. Exiting!')
        sys.exit(1)

    print(f'Saving scores in {output_file}\n')

    reader = surprise.Reader(rating_scale=(1, 5))
    df = pq.read_table('all_ratings_with_indices.parquet',
                       columns=['user_idx', 'movie_idx',
                                'rating']).to_pandas()
    df.user_idx = df.user_idx.astype(np.uint32)
    df.movie_idx = df.movie_idx.astype(np.uint16)
    df.rating = df.rating.astype(np.uint8)
    print(df.dtypes)
    data = surprise.Dataset.load_from_df(df[:args.num_rows], reader=reader)
    del df
    sim_options = {
        'name': args.sim_name,
        'user_based': not args.item_based
    }

    if args.algorithm == 'knn':
        algo = surprise.KNNBasic(sim_options=sim_options)
    elif args.algorithm == 'baseline':
        algo = surprise.BaselineOnly()
    elif args.algorithm == 'normal':
        algo = surprise.NormalPredictor()
    elif args.algorithm == 'knn_zscore':
        algo = surprise.KNNWithZScore(sim_options=sim_options)
    elif args.algorithm == 'svd':
        algo = surprise.SVD()
    elif args.algorithm == 'nmf':
        algo = surprise.NMF()
    else:
        print(f'Algorithm {args.algorithm} is not a valid choice.')
        sys.exit(1)

    scores = surprise.model_selection.cross_validate(algo,
                                                     data,
                                                     cv=args.cv_folds,
                                                     verbose=True,
                                                     n_jobs=-1)

    pickle.dump(scores, open(output_file, 'wb'))
Example #14
 def factorisation(self, n_user, n_item):
     # Returns the complete rating matrix for n_user users and n_item items
     reader = Reader()
     data = Dataset.load_from_df(self.data, reader)
     svd = surprise.SVD(n_factors=10, n_epochs=10, lr_all=.01, reg_all=.01)
     results = surprise.model_selection.validation.cross_validate(
         svd, data, measures=['MSE'], cv=3, verbose=True)
     # Fit on the full trainset before predicting (cross_validate alone leaves
     # the model fit on whichever fold ran last)
     svd.fit(data.build_full_trainset())
     # Now fill in the matrix
     print("Estimated wait time:", round(n_user * n_item / 105000), "seconds.")
     M = []
     for u in range(n_user):
         M.append([svd.predict(u, i).est for i in range(n_item)])
     return np.array(M)
Example #15
def test_recommend_k_items(rating_true):
    n_users = len(rating_true["userID"].unique())
    n_items = len(rating_true["itemID"].unique())
    svd = surprise.SVD()
    train_set = surprise.Dataset.load_from_df(
        rating_true, reader=surprise.Reader()).build_full_trainset()
    svd.fit(train_set)

    preds = compute_ranking_predictions(svd, rating_true, remove_seen=True)
    assert set(preds.columns) == {"userID", "itemID", "prediction"}
    assert preds["userID"].dtypes == rating_true["userID"].dtypes
    assert preds["itemID"].dtypes == rating_true["itemID"].dtypes
    user = preds.iloc[0]["userID"]
    item = preds.iloc[0]["itemID"]
    assert preds[(preds["userID"] == user) & (
        preds["itemID"] == item)]["prediction"].values == pytest.approx(
            svd.predict(user, item).est, rel=TOL)
    # Test default remove_seen=True
    assert pd.merge(rating_true, preds, on=["userID", "itemID"]).shape[0] == 0
    assert preds.shape[0] == (n_users * n_items - rating_true.shape[0])

    preds = compute_ranking_predictions(
        svd,
        rating_true.rename(columns={
            "userID": "uid",
            "itemID": "iid",
            "rating": "r"
        }),
        usercol="uid",
        itemcol="iid",
        predcol="pred",
        remove_seen=False,
    )
    assert set(preds.columns) == {"uid", "iid", "pred"}
    assert preds["uid"].dtypes == rating_true["userID"].dtypes
    assert preds["iid"].dtypes == rating_true["itemID"].dtypes
    user = preds.iloc[1]["uid"]
    item = preds.iloc[1]["iid"]
    assert preds[(preds["uid"] == user)
                 & (preds["iid"] == item)]["pred"].values == pytest.approx(
                     svd.predict(user, item).est, rel=TOL)

    # Test remove_seen=False
    assert (pd.merge(rating_true,
                     preds,
                     left_on=["userID", "itemID"],
                     right_on=["uid", "iid"]).shape[0] == rating_true.shape[0])
    assert preds.shape[0] == n_users * n_items
Example #16
    def __init__(self, arm_list, cf_params=None):
        # Item-based Pearson similarity, used by the KNN variant below
        sim_options = {
            "name": "pearson",
            "user_based": False,  # compute similarities between items
        }
        if cf_params:
            if cf_params['algo'] == 'svd':
                self.algo = surprise.SVD()
            else:
                self.algo = surprise.KNNBaseline(sim_options=sim_options,
                                                 verbose=False)

        self.first_train = False
        self.data = []
        self.action_num_played_arr = np.zeros(len(arm_list))
        self.is_action_initialized_arr = np.zeros(len(arm_list))
        self.num_rounds = 0
        self.action_rewards = np.zeros(len(arm_list))
        self.action_list = arm_list
Example #17
def algo_tester(data_object):
    '''
    Produces a dataframe comparing the RMSE and the test & train times of
    the different surprise algorithms.

    ---Parameters---
    data_object: a surprise Dataset, e.g. created by the read_data_surprise function

    ---Returns---
    A dataframe for comparing the performance of the different algorithms
    '''
    benchmark = []
    algos = [
        sp.SVDpp(),
        sp.SVD(),
        sp.SlopeOne(),
        sp.NMF(),
        sp.NormalPredictor(),
        sp.KNNBaseline(),
        sp.KNNBasic(),
        sp.KNNWithMeans(),
        sp.KNNWithZScore(),
        sp.BaselineOnly(),
        sp.CoClustering()
    ]

    # Iterate over all algorithms
    for algorithm in algos:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_object,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        benchmark.append(tmp)

    benchmark = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_rmse')
    return benchmark
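read_data_surprise is not shown; any surprise Dataset works. A hedged usage sketch with the built-in MovieLens data:

from surprise import Dataset

data = Dataset.load_builtin('ml-100k')  # downloads the MovieLens 100k set on first use
leaderboard = algo_tester(data)
print(leaderboard)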
def svd_training(args):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    print("Start training...")
    train_data = pd.read_pickle(path=os.path.join(args.datastore, args.train_datapath))
    validation_data = pd.read_pickle(path=os.path.join(args.datastore, args.validation_datapath))

    svd = surprise.SVD(random_state=args.random_state, n_epochs=args.epochs, verbose=args.verbose, biased=args.biased,
                       n_factors=args.n_factors, init_mean=args.init_mean, init_std_dev=args.init_std_dev,
                       lr_all=args.lr_all, reg_all=args.reg_all, lr_bu=args.lr_bu, lr_bi=args.lr_bi, lr_pu=args.lr_pu,
                       lr_qi=args.lr_qi, reg_bu=args.reg_bu, reg_bi=args.reg_bi, reg_pu=args.reg_pu,
                       reg_qi=args.reg_qi)

    train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(args.surprise_reader)) \
        .build_full_trainset()
    svd.fit(train_set)

    print("Evaluating...")

    rating_metrics = args.rating_metrics
    if len(rating_metrics) > 0:
        predictions = compute_rating_predictions(svd, validation_data, usercol=args.usercol, itemcol=args.itemcol)
        for metric in rating_metrics:
            result = eval(metric)(validation_data, predictions)
            print(metric, result)
            if HAS_AML:
                run.log(metric, result)

    ranking_metrics = args.ranking_metrics
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(svd, train_data, usercol=args.usercol, itemcol=args.itemcol,
                                                      remove_seen=args.remove_seen)
        k = args.k
        for metric in ranking_metrics:
            result = eval(metric)(validation_data, all_predictions, col_prediction='prediction', k=k)
            print("{}@{}".format(metric, k), result)
            if HAS_AML:
                run.log(metric, result)

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    return svd
Example #19
def algo_metrics(df):
    '''
    Return the RMSE of an out-of-the-box SVD fit on df.
    ---Parameters---
    df (Pandas DataFrame) RUS DataFrame with user, item, and rating columns
    ---Returns---
    RMSE on a held-out 20% test set
    '''
    reader = sp.Reader(line_format='user item rating', sep=',', skip_lines=1)  # only rating_scale matters for load_from_df
    data = sp.Dataset.load_from_df(df, reader=reader)
    trainset, testset = train_test_split(data, test_size=.2)

    # Fit out of the box SVD to trainset and predict on test set
    algo = sp.SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)
    return sp.accuracy.rmse(predictions)
Example #20
def test_compute_ranking_predictions(python_data):
    rating_true, _, _ = python_data(binary_rating=False)
    n_users = len(rating_true['userID'].unique())
    n_items = len(rating_true['itemID'].unique())
    svd = surprise.SVD()
    train_set = surprise.Dataset.load_from_df(
        rating_true, reader=surprise.Reader()).build_full_trainset()
    svd.fit(train_set)

    preds = compute_ranking_predictions(svd, rating_true)
    assert set(preds.columns) == {'userID', 'itemID', 'prediction'}
    assert preds['userID'].dtypes == rating_true['userID'].dtypes
    assert preds['itemID'].dtypes == rating_true['itemID'].dtypes
    user = preds.iloc[0]['userID']
    item = preds.iloc[0]['itemID']
    assert preds[(preds['userID'] == user) & (preds['itemID'] == item)]['prediction'].values == \
           pytest.approx(svd.predict(user, item).est, rel=TOL)
    # Test default recommend_seen=False
    assert pd.merge(rating_true, preds, on=['userID', 'itemID']).shape[0] == 0
    assert preds.shape[0] == (n_users * n_items - rating_true.shape[0])

    preds = compute_ranking_predictions(
        svd,
        rating_true.rename(columns={
            'userID': 'uid',
            'itemID': 'iid',
            'rating': 'r'
        }),
        usercol='uid',
        itemcol='iid',
        predcol='pred',
        recommend_seen=True)
    assert set(preds.columns) == {'uid', 'iid', 'pred'}
    assert preds['uid'].dtypes == rating_true['userID'].dtypes
    assert preds['iid'].dtypes == rating_true['itemID'].dtypes
    user = preds.iloc[1]['uid']
    item = preds.iloc[1]['iid']
    assert preds[(preds['uid'] == user) & (preds['iid'] == item)]['pred'].values == \
           pytest.approx(svd.predict(user, item).est, rel=TOL)
    # Test recommend_seen=True
    assert pd.merge(rating_true, preds, left_on=['userID', 'itemID'], right_on=['uid', 'iid']).shape[0] == \
           rating_true.shape[0]
    assert preds.shape[0] == n_users * n_items
def find_recommendations(data, usr_pref, user_data, user_id):
    # Append the user-preferences dataframe to our training data
    # (DataFrame.append was removed in pandas 2.0; use pd.concat)
    data = pd.concat([data, usr_pref], ignore_index=True)

    # making movielens data compatible with the model by creating a reader object
    reader = Reader(rating_scale=(1, 5))
    data_dr = Dataset.load_from_df(data[['userId', 'movieId', 'rating']],
                                   reader)

    # Model parameters chosen for the lowest RMSE at an efficient running time.
    # n_factors=15 both cuts training time drastically and keeps accuracy high:
    # it shrinks the latent user/movie factor vectors to the most significant
    # factors, which explain the majority of user behaviour.
    algo = surprise.SVD(n_factors=15,
                        n_epochs=10,
                        lr_all=0.03,
                        reg_all=0.04,
                        verbose=True)

    # training the model on the dataset that now includes the users preferences
    algo.fit(data_dr.build_full_trainset())

    # list to hold the models predictions
    predictions = []

    # getting arrays that can be traversed of the user id, movie (item) id,
    # these are used to apply the predict function
    uids = user_data['userId'].to_numpy()
    iids = user_data['movieId'].to_numpy()

    # Add a prediction for each (userId, movieId) pair
    for uid, iid in zip(uids, iids):
        predictions.append(algo.predict(uid=uid, iid=iid))

    # Return the list containing our predictions
    return predictions
    def model_fit(self):
        '''
        Train the model using the surprise algorithm selected by self._algo_choise.
        '''
        self.build_trainset()
        algo = self._algo_choise
        if algo == 'SVD':
            self.algorithm = surprise.SVD()
        elif algo == 'Baseline':
            self.algorithm = surprise.BaselineOnly()
        elif algo == 'SlopeOne':
            self.algorithm = surprise.SlopeOne()
        elif algo == 'CoClustering':
            self.algorithm = surprise.CoClustering()
        else:
            self.algorithm = surprise.KNNBasic()

        print('Training Recommender System using %s...' % algo)

        self.algorithm.fit(self.trainset)
        self.ratings_changed = False
        print('Done')
def train_matrix(ratings, factor, k_folds):
    """
    Train a model and return it. Then we can use the model and evaluate it elsewhere.
    @param ratings: pandas dataframe to train on, with columns UserId, MovieId, Rating
    @param factor: number of latent factors for SVD
    @param k_folds: number of folds for cross validation
    @returns tuple (algo, test data)
    We can call methods such as `test` on the returned algo.
    """

    train_data, test_data = cv.train_test_split(ratings, test_size=0.20)
    reader = sp.Reader(rating_scale=(1, 5))

    trainset = sp.Dataset.load_from_df(train_data, reader)
    testset = sp.Dataset.load_from_df(test_data, reader)

    algo = sp.SVD(n_factors=factor)

    # The old Dataset.split()/folds()/algo.train() API was removed in surprise 1.1;
    # use model_selection.KFold with algo.fit instead.
    kf = sp.model_selection.KFold(n_splits=k_folds)
    for fold_trainset, _ in kf.split(trainset):
        algo.fit(fold_trainset)

    testset = testset.build_full_trainset().build_testset()
    return (algo, testset)
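A short usage sketch for the returned pair, assuming `ratings` is the pandas frame described in the docstring and `sp` is the surprise import:

algo, testset = train_matrix(ratings, factor=40, k_folds=5)
predictions = algo.test(testset)
sp.accuracy.rmse(predictions)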
Example #24
def Initialize_q_models(q, r, dat):
    U = {} # each U_t: m-by-r
    V = {} # each V_t: n-by-r
    anchors = {} # (u_t, i_t) index
    
    for t in range(1, (q+1)):
        # Step 5: initialize U_t and V_t by using SVD

        tmp_reader = surprise.Reader(rating_scale=(dat.click.min(), dat.click.max()))
        tmp_data = surprise.Dataset.load_from_df(dat, tmp_reader)
        tmp_svd = surprise.SVD(random_state=123 + t, n_factors=r)

        tmp_output = tmp_svd.fit(tmp_data.build_full_trainset())

        U[t] = tmp_output.pu # user factors (m, r)
        V[t] = tmp_output.qi # item factors (n, r)

        # Step 6: pick an observed pair (u_t, i_t) from M at random

        tmp_anchor = dat[['userId', 'articleId']].sample(1, random_state=123+t)
        anchors[t] = [(user_IdtoInd[u], article_IdtoInd[i]) for u, i in tmp_anchor.values][0]

    return U, V, anchors
Example #25
def svd_training(params):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    logger.debug("Start training...")
    train_data = pd.read_pickle(
        os.path.join(params["datastore"], params["train_datapath"]))
    validation_data = pd.read_pickle(
        os.path.join(params["datastore"], params["validation_datapath"]))

    svd_params = {
        p: params[p]
        for p in [
            "random_state",
            "n_epochs",
            "verbose",
            "biased",
            "n_factors",
            "init_mean",
            "init_std_dev",
            "lr_all",
            "reg_all",
            "lr_bu",
            "lr_bi",
            "lr_pu",
            "lr_qi",
            "reg_bu",
            "reg_bi",
            "reg_pu",
            "reg_qi",
        ]
    }
    svd = surprise.SVD(**svd_params)

    train_set = surprise.Dataset.load_from_df(
        train_data, reader=surprise.Reader(
            params["surprise_reader"])).build_full_trainset()
    svd.fit(train_set)

    logger.debug("Evaluating...")

    metrics_dict = {}
    rating_metrics = params["rating_metrics"]
    if len(rating_metrics) > 0:
        predictions = predict(svd,
                              validation_data,
                              usercol=params["usercol"],
                              itemcol=params["itemcol"])
        for metric in rating_metrics:
            result = getattr(evaluation, metric)(validation_data, predictions)
            logger.debug("%s = %g", metric, result)
            if metric == params["primary_metric"]:
                metrics_dict["default"] = result
            else:
                metrics_dict[metric] = result

    ranking_metrics = params["ranking_metrics"]
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(
            svd,
            train_data,
            usercol=params["usercol"],
            itemcol=params["itemcol"],
            remove_seen=params["remove_seen"],
        )
        k = params["k"]
        for metric in ranking_metrics:
            result = getattr(evaluation, metric)(validation_data,
                                                 all_predictions,
                                                 col_prediction="prediction",
                                                 k=k)
            logger.debug("%s@%d = %g", metric, k, result)
            if metric == params["primary_metric"]:
                metrics_dict["default"] = result
            else:
                metrics_dict[metric] = result

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    # Report the metrics
    nni.report_final_result(metrics_dict)

    # Save the metrics in a JSON file
    output_dir = os.environ.get("NNI_OUTPUT_DIR")
    with open(os.path.join(output_dir, "metrics.json"), "w") as fp:
        temp_dict = metrics_dict.copy()
        temp_dict[params["primary_metric"]] = temp_dict.pop("default")
        json.dump(temp_dict, fp)

    return svd
          epochs=2,
          validation_split=0.1,
          shuffle=True)

y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf])
y_true = df_hybrid_test['Rating'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse))

# Load dataset into surprise specific data-structure
data = sp.Dataset.load_from_df(df_filterd[['User', 'Movie', 'Rating']].sample(20000), sp.Reader())

benchmark = []
# Iterate over all algorithms
for algorithm in [
        sp.SVD(), sp.SVDpp(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(),
        sp.KNNBaseline(), sp.KNNBasic(), sp.KNNWithMeans(), sp.KNNWithZScore(),
        sp.BaselineOnly(), sp.CoClustering()
]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    
    # Store data
    benchmark.append(tmp)

# Collect the results
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse', ascending=False)

# Get data
data = surprise_results[['test_rmse', 'test_mae']]
Example #27
        idcgs[uid] = sum(rel_true/discount_true)
        
    dcg = sum(dcgu for (_,dcgu) in dcgs.items())
    idcg = sum(idcgu for (_,idcgu) in idcgs.items())
    return dcg/idcg
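The head of this NDCG helper was lost in extraction; a self-contained hedged sketch of the computation its tail implies (true ratings as relevance, log2 position discount):

import numpy as np
from collections import defaultdict

def ndcg(predictions, k=10):
    # Group surprise predictions per user and rank them by estimated rating.
    per_user = defaultdict(list)
    for p in predictions:
        per_user[p.uid].append((p.est, p.r_ui))

    dcgs, idcgs = {}, {}
    for uid, pairs in per_user.items():
        # Relevance of the top-k items when ranked by prediction vs. ideally.
        rel_pred = [true for _, true in sorted(pairs, key=lambda x: -x[0])][:k]
        rel_true = sorted((true for _, true in pairs), reverse=True)[:k]
        discount = np.log2(np.arange(2, len(rel_pred) + 2))
        dcgs[uid] = np.sum(np.asarray(rel_pred) / discount)
        idcgs[uid] = np.sum(np.asarray(rel_true) / discount)

    dcg = sum(dcgs.values())
    idcg = sum(idcgs.values())
    return dcg / idcg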

data = pd.read_csv('sampled.csv')
print "Users: "+str(len(np.unique(data['User-ID'])))+ " items: "+str(len(np.unique(data['ISBN'])))
print "No. of ratings: "+str(len(data))

sim_options = {'name': 'pearson',
               'user_based': False
               }

algo_knn = surprise.KNNBasic(k=5, sim_options=sim_options)
algo_svd = surprise.SVD(n_factors=10, lr_all=0.001, reg_all=1)

#Around 80% train data for each of these splits
sample_sizes = [0.4, 0.2, 0.1,0.05, 0.01]


time_knn = []
time_svd = []
for s in sample_sizes:
    a = data.sample(frac=s, random_state=111)
    print("s= " + str(len(a)))

    print("Removing users with less than 20 ratings....")
    b = a.groupby('User-ID').filter(lambda x: len(x) >= 20)
    densityu = (float(len(b)) / (len(np.unique(b['User-ID'])) * len(np.unique(b['ISBN'])))) * 100
    print("Density after filtering users: " + str(densityu))  # 0.061
Example #28
# In[2]:

pandas_df.head()

# In[3]:

u_matrix = (pandas_df.pivot(index="UserID", columns="MovieID",
                            values="Rating").fillna(0))

# In[4]:

import time
import surprise

svd = surprise.SVD(random_state=2, n_factors=200, n_epochs=1000, verbose=True)
df_train = pandas_df.drop(columns='Timestamp')

# In[5]:

from surprise import Reader
reader = Reader()
df_set_train = surprise.Dataset.load_from_df(
    df_train[['UserID', 'MovieID', 'Rating']],
    reader)  # load_from_df expects (user, item, rating) column order

# In[6]:

from surprise.model_selection import cross_validate

cross_validate(svd, df_set_train, measures=['RMSE', 'MAE'], cv=5, verbose=True)
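cross_validate leaves svd fit on whichever fold ran last, so a final fit on the full trainset is needed before predicting; a short hedged sketch:

# In[7]:

trainset = df_set_train.build_full_trainset()
svd.fit(trainset)
# Example ids are assumptions; any (UserID, MovieID) pair from the data works.
print(svd.predict(uid=1, iid=1193).est)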
    songs_ex.drop(['isrc', 'name'], axis=1, inplace=True)  # 'name' in songs_ex is left unhandled for now
    train = train.merge(songs_ex, on='song_id', how='left')
    test = test.merge(songs_ex, on='song_id', how='left')
    del members, songs, songs_ex
    gc.collect()
    # 2.4: feature extraction - statistical features

    # 2.5: feature extraction - informational features (web crawling)

    ######################################################################
    # 3: train user preference over item features: song_area\song_year\language\genre_ids\song_length\artist_name\composer\lyricist
    reader = surprise.Reader(rating_scale=(0, 1))
    # 3.1: msno <-> song_area
    mtrain = train[['msno', 'song_area',
                    'target']].dropna().drop_duplicates()  # drop NaNs and duplicates
    algo_area = surprise.SVD()
    data = surprise.Dataset.load_from_df(mtrain, reader)
    algo_area.fit(data.build_full_trainset())  # AlgoBase.train was removed in surprise 1.1; use fit
    print('Finished msno<->song_area training')
    # 3.2: msno <-> song_year
    mtrain = train[['msno', 'song_year',
                    'target']].dropna().drop_duplicates()  # drop NaNs and duplicates
    algo_year = surprise.SVD()
    data = surprise.Dataset.load_from_df(mtrain, reader)
    algo_year.fit(data.build_full_trainset())
    print('Finished msno<->song_year training')
    # 3.3: msno <-> language
    mtrain = train[['msno', 'language',
                    'target']].dropna().drop_duplicates()  # drop NaNs and duplicates
    algo_lang = surprise.SVD()
    data = surprise.Dataset.load_from_df(mtrain, reader)
Example #30
import surprise
import pandas as pd

rating_df = pd.read_csv(
    '/Users/mac/Desktop/推荐系统/RecommendedSystemCallPackage/data_set/MovieLens/ratings.csv',
    sep=';')
rating_df = rating_df[['UserID', 'MovieID', 'Rating']]
reader = surprise.Reader(rating_scale=(1, 5))
rating_data = surprise.Dataset.load_from_df(rating_df, reader=reader)
svd = surprise.SVD()
svd_temp = surprise.model_selection.cross_validate(svd,
                                                   rating_data,
                                                   measures=['RMSE', 'MAE'],
                                                   cv=5,
                                                   verbose=True)
print('SVD--------------')
print(svd_temp)
normalPredictor = surprise.NormalPredictor()
normalPredictor_temp = surprise.model_selection.cross_validate(
    normalPredictor, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('normalPredictor--------------')
print(normalPredictor_temp)
baselineOnly = surprise.BaselineOnly()
baselineOnly_temp = surprise.model_selection.cross_validate(
    baselineOnly, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('baselineOnly-----------------')
print(baselineOnly_temp)
knnBasic = surprise.KNNBasic()
knnBasic_temp = surprise.model_selection.cross_validate(
    knnBasic, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBasic-----------------')