def baseline_bias_model(df):
    """
        Shows the performance of model based on just bias
    """
    ratings_pandas_df = df.drop(columns=['date', 'text'])
    #    ratings_pandas_df.columns = ['user_id', 'business_id', 'rating']

    reader = Reader(rating_scale=(1, 5))  #TODO figure out

    data = surprise.dataset.Dataset.load_from_df(df=ratings_pandas_df,
                                                 reader=reader)

    ts = data.build_full_trainset()
    # Private surprise mappings from raw user/item ids to inner integer ids.
    dusers = ts._raw2inner_id_users
    ditems = ts._raw2inner_id_items

    trainset, testset = train_test_split(data)  # default test_size is 0.2

    algo = BaselineOnly()
    algo.fit(trainset)

    # testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    print('\n')
    return (trainset, testset, predictions, dusers, ditems)
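
# A minimal usage sketch (hypothetical review data; baseline_bias_model drops
# the 'date' and 'text' columns and expects the rest to be user, item, rating):
#
#     demo = pd.DataFrame({
#         'user_id': ['u1', 'u1', 'u2', 'u2', 'u3'],
#         'business_id': ['b1', 'b2', 'b1', 'b3', 'b2'],
#         'rating': [5, 3, 4, 2, 1],
#         'date': ['2020-01-01'] * 5,
#         'text': [''] * 5,
#     })
#     trainset, testset, predictions, dusers, ditems = baseline_bias_model(demo)
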
def use_als():
    start = time.time()
    performance = []

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using ALS')
    bsl_options = {'method': 'als', 'n_epochs': 20, 'reg_u': 12, 'reg_i': 5}
    algo_ALS = BaselineOnly(bsl_options=bsl_options)
    algo_ALS.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions_ALS = algo_ALS.test(testset)

    accuracy_rmse = accuracy.rmse(predictions_ALS)
    accuracy_mae = accuracy.mae(predictions_ALS)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)

    end = time.time()
    performance.append(end - start)

    return performance
def normalize_affinity_scores_by_user_item_bs(user_item_affinities: List[Tuple[str, str, float]], rating_scale=(1, 5)) \
        -> Tuple[float, Dict[str, float], Dict[str, float], float, List[Tuple[str, str, float]]]:
    train = pd.DataFrame(user_item_affinities)
    reader = Reader(rating_scale=rating_scale)
    trainset = Dataset.load_from_df(train, reader).build_full_trainset()
    trainset_for_testing = trainset.build_testset()
    algo = BaselineOnly(bsl_options={'method': 'sgd'})
    algo.fit(trainset)
    predictions = algo.test(trainset_for_testing)
    mean = algo.trainset.global_mean
    bu = {
        u: algo.bu[algo.trainset.to_inner_uid(u)]
        for u in set([u for u, i, r in user_item_affinities])
    }
    bi = {
        i: algo.bi[algo.trainset.to_inner_iid(i)]
        for i in set([i for u, i, r in user_item_affinities])
    }
    residuals = [[p.uid, p.iid, p.r_ui - p.est] for p in predictions]
    estimates = [p.est for p in predictions]
    estimates_2 = [
        p.r_ui - (mean + bu[p.uid] + bi[p.iid]) for p in predictions
    ]
    residuals = pd.DataFrame(residuals, columns=["user", "item", "rating"])
    spread = max(residuals["rating"].max(), np.abs(residuals["rating"].min()))
    residuals = list(zip(residuals['user'], residuals['item'],
                         residuals['rating']))
    bu = defaultdict(float, bu)
    bi = defaultdict(float, bi)
    # assert estimates == estimates_2
    return mean, bu, bi, spread, residuals
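
# A quick sanity check with hypothetical affinity triples (illustrative only):
affinities = [('u1', 'i1', 4.0), ('u1', 'i2', 2.0), ('u2', 'i1', 5.0),
              ('u2', 'i3', 3.0), ('u3', 'i2', 1.0)]
mean, bu, bi, spread, residuals = \
    normalize_affinity_scores_by_user_item_bs(affinities)
print(mean, spread)  # global mean and the largest absolute residual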
Example #4
def test_dump():
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct ones, and
    that the predictions of the dumped algorithm are also equal to the other
    ones."""

    random.seed(0)

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))
    pkf = PredefinedKFold()

    trainset, testset = next(pkf.split(data))

    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)

        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_dumped
        assert predictions == predictions_algo_dumped
def predict(trainset):
    print("Training the model for prediction .....")
    # predict ratings for all pairs (u, i) that are NOT in the training set.
    algo = BaselineOnly(bsl_options=bsl_options)
    testset = trainset.build_anti_testset()
    predictions = algo.fit(trainset).test(testset)
    return predictions
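
# This snippet assumes a module-level `bsl_options` dict and a loaded surprise
# Dataset; the values below are illustrative, not from the original source:
#
#     bsl_options = {'method': 'als', 'n_epochs': 20, 'reg_u': 12, 'reg_i': 5}
#     trainset = data.build_full_trainset()
#     predictions = predict(trainset)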
Example #6
def surprise_baseline(train_file, test_file):
    """
    Baseline with Surprise library.
    Compute the predictions on a test_set after training on a train_set using the method Baseline from Surprise.
    Args:
        train_file (string): path to created test file
        test_file (string): path to created train file
    Hyperparameters:
        -
    Returns:
        numpy array: predictions
    """
    print("baseline")
    algo = BaselineOnly()
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    # Collect the estimated ratings into a numpy array.
    pred = np.array([p.est for p in predictions])
    return pred
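
# Usage sketch (hypothetical paths; each CSV holds `user,item,rating` lines):
#
#     estimates = surprise_baseline('data/train.csv', 'data/test.csv')
#     print(estimates[:10])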
def use_sgd():
    start = time.time()
    performance = []

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using SGD')
    bsl_options = {
        'method': 'sgd',
        'learning_rate': .005,
    }

    algo_SGD = BaselineOnly(bsl_options=bsl_options)
    algo_SGD.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions_SGD = algo_SGD.test(testset)

    accuracy_rmse = accuracy.rmse(predictions_SGD)
    accuracy_mae = accuracy.mae(predictions_SGD)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)

    end = time.time()
    performance.append(end - start)

    return performance
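
# Usage sketch, assuming both use_als() (above) and use_sgd() are in scope;
# each returns [rmse, mae, elapsed_seconds] measured on the full anti-testset:
#
#     als_rmse, als_mae, als_time = use_als()
#     sgd_rmse, sgd_mae, sgd_time = use_sgd()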
Example #8
def baseline(trainset, testset):
    algo = BaselineOnly()
    algo.fit(trainset)
    print("Predictions")
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    return predictions
Example #9
    class ALSModelSurprise(ALSModel):
        def __init__(self, params):
            super().__init__(params)
            self.algo = BaselineOnly(bsl_options=self.params)

        def parse_data(self, ratings):
            reader = Reader(rating_scale=(1, 5))
            self.data = Dataset.load_from_df(ratings, reader)

        def update_parameters(self):
            self.algo.bsl_options = self.params

        def fit(self):
            self.train = self.data.build_full_trainset()
            self.algo.fit(self.train)

        def predict(self, uid, iid):
            '''
            uid, iid should be consistent with ratings['UID','IID']
            '''
            return self.algo.predict(uid, iid).est

        def top_n_recommendations(self, uid, n=5):
            '''
            Obtain the top-n recommendations for a given user
            (implementation specific to the surprise library).
            '''
            scores = []
            for i in range(self.train.n_items):
                iid = self.train.to_raw_iid(i)
                scores.append((iid, self.predict(uid, iid)))
            scores.sort(key=lambda x: x[1], reverse=True)
            top_n_iid = [l[0] for l in scores[:n]]
            pred = [l[1] for l in scores[:n]]
            return top_n_iid, pred

        def cross_validate(self, cv=5, verbose=False):
            cv_result = cross_validate(self.algo, self.data,
                                       cv=cv, verbose=verbose)
            rmse = cv_result['test_rmse'].mean()
            return rmse

        def grid_search(self):
            self._best_params = self.params
            self._best_rmse = self.cross_validate(cv=5)
            for n_epochs in [5, 10, 15, 20, 25]:
                for reg_u in [5, 10, 15, 20]:
                    for reg_i in [5, 10, 15]:
                        self.set_params(n_epochs=n_epochs,
                                        reg_u=reg_u,
                                        reg_i=reg_i)
                        rmse = self.cross_validate(cv=5)
                        print(n_epochs, reg_u, reg_i, rmse)
                        if (rmse < self._best_rmse):
                            self._best_rmse = rmse
                            self._best_params = self.params
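
# Usage sketch (assumes the ALSModel base class stores `params` and provides
# set_params(); the option values here are illustrative):
#
#     model = ALSModelSurprise({'method': 'als', 'n_epochs': 20,
#                               'reg_u': 12, 'reg_i': 5})
#     model.parse_data(ratings)  # ratings: DataFrame of (uid, iid, rating)
#     model.fit()
#     items, scores = model.top_n_recommendations('some_user', n=5)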
Example #10
def baseline(trainset, testset):

    print("\n" + "-" * 5 + " Baseline algorithm using surprise package " +
          "-" * 5)
    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
Example #11
 def fit(self, train):
     """
     Fit the model
     """
     self.model = BaselineOnly(bsl_options={
         'method': 'sgd',
         'n_epochs': 30,
         'reg': 0.01,
         'learning_rate': 0.01
     })
     self.model.fit(train)
Example #12
 def fit(self, train):
     """
     Fit the model
     """
     baselineOnly = BaselineOnly(bsl_options={
         'method': 'als',
         'n_epochs': 25,
         'reg_u': 5,
         'reg_i': 3
     })
     baselineOnly.fit(train)
     self.model = baselineOnly
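
# Note on the two fit() variants above: surprise's baseline estimator takes
# optimizer-specific options. SGD uses a single 'reg' plus 'learning_rate',
# while ALS uses separate 'reg_u' and 'reg_i' regularization strengths.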
def grid_search_knn(data_train, data_test, n_epochs, reg_us, reg_is,
                    file_name):

    print('KNN Surprise manual grid search')

    result_train = pd.DataFrame()
    result_test = pd.DataFrame()

    # loops on the parameters
    for n_epoch in n_epochs:
        for reg_u in reg_us:
            for reg_i in reg_is:

                bsl_options = {
                    'method': 'als',
                    'n_epochs': n_epoch,
                    'reg_u': reg_u,
                    'reg_i': reg_i
                }

                algo = BaselineOnly(bsl_options=bsl_options)
                # Retrieve the trainset.
                trainset = data_train.build_full_trainset()

                # Build an algorithm, and train it.
                algo.fit(trainset)  # train() was replaced by fit() in newer surprise
                # Evaluate the performance (legacy surprise evaluate() API)
                perf_train = evaluate(algo, data_train, measures=['RMSE'])
                perf_test = evaluate(algo, data_test, measures=['RMSE'])

                perf_train["n_epoch"] = n_epoch
                perf_train["reg_u"] = reg_u
                perf_train["reg_i"] = reg_i
                #Store the mean performance RMSE on train
                perf_train["rmse"] = np.mean(perf_train['rmse'])

                perf_test["n_epoch"] = n_epoch
                perf_test["reg_u"] = reg_u
                perf_test["reg_i"] = reg_i
                #Store the mean performance RMSE on test
                perf_test["rmse"] = np.mean(perf_test['rmse'])

                #Store on a dataframe
                result_train = result_train.append(perf_train,
                                                   ignore_index=True)
                result_test = result_test.append(perf_test, ignore_index=True)

    # Save the dataframes so we can inspect or plot the differences if needed
    writer = pd.ExcelWriter(file_name, engine='xlsxwriter')
    result_train.to_excel(writer, 'Sheet1')
    result_test.to_excel(writer, 'Sheet2')
    writer.save()
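
# Note: DataFrame.append and ExcelWriter.save() were removed in recent pandas
# releases; sketches of the current-API equivalents:
#
#     result_train = pd.concat([result_train, pd.DataFrame([perf_train])],
#                              ignore_index=True)
#     writer.close()  # instead of writer.save()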
def train(trainset, testset):
    """
    Train the recommender model using the baseline algorithm, which estimates a
    global mean together with per-user and per-item rating biases
    :param trainset: the train set from which the model learns the rating biases
    :param testset: the test set on which the trained model is validated
    :return: the predictions of ratings for all (user, item) pairs in the test set
    """
    print("Training the model for prediction ....")
    # BaselineOnly algorithm gave us the best rmse,
    # therefore, we will train and predict with BaselineOnly and use Alternating Least Squares (ALS).
    algo = BaselineOnly(bsl_options=bsl_options)
    predictions = algo.fit(trainset).test(testset)
    return predictions
Example #15
 def baseline_only(self):
     """
     Basic baseline prediction using global mean and user-item biases. 
     Returns:
         predictions_df: The predictions of the model on the test data in
             Pandas Data Frame format
     """
     algorithm = BaselineOnly()
     predictions = algorithm.fit(self.train_data).test(self.test_data)
     predictions_df = self.data.test_df.copy()
     predictions_df['Rating'] = [x.est for x in predictions]
     if self.test_purpose: 
         self.evalueate_model(predictions_df['Rating'], 'Surprise baseline_only')
     return predictions_df
def als_predictions(trainset, dataset_test):
    algo = BaselineOnly(bsl_options={
        'method': 'als',
        'n_epochs': 30,
        'reg_u': 6,
        'reg_i': 4
    })
    model = algo.fit(trainset)  # fit() returns the fitted algorithm itself
    list_1 = []
    for x in dataset_test:
        # `mode` is assumed to be defined at module level; outside test mode
        # the known rating x[2] is passed along as r_ui.
        p = model.predict(x[0], x[1]) if mode == 'test' else model.predict(
            x[0], x[1], x[2])
        list_1.append((p.uid, p.iid, p.r_ui, p.est))
    return list_1
    return list_1
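
# Usage sketch (`mode` and a surprise trainset must already exist; the ids
# below are hypothetical):
#
#     mode = 'test'
#     results = als_predictions(trainset, [('u1', 'i1'), ('u2', 'i3')])
#     # each entry is (uid, iid, r_ui, est)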
Example #17
def test_sgd_n_epoch_field():
    """Ensure the n_epoch field is taken into account."""

    bsl_options = {'method': 'sgd',
                   'n_epochs': 1,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_n_epoch_1 = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    bsl_options = {'method': 'sgd',
                   'n_epochs': 20,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_n_epoch_20 = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    assert rmse_sgd_n_epoch_1 != rmse_sgd_n_epoch_20
def get_surprise_base_model(trainset, testset, train_reg, test_reg,
                            model_train_evaluation, model_test_evaluation,
                            error_table):
    bsl_options = {"method": "sgd", "learning_rate": 0.01, "n_epochs": 25}
    algo = BaselineOnly(bsl_options=bsl_options)
    train_result, test_result, error_table = run_surprise(
        algo, trainset, testset, "BaselineOnly", error_table)
    model_train_evaluation["BaselineOnly"] = train_result
    model_test_evaluation["BaselineOnly"] = test_result
    train_reg["BaselineOnly"] = model_train_evaluation["BaselineOnly"][
        "Prediction"]
    st.write("Number of nan values = " + str(train_reg.isnull().sum().sum()))
    test_reg["BaselineOnly"] = model_test_evaluation["BaselineOnly"][
        "Prediction"]
    st.write("Number of nan values = " + str(test_reg.isnull().sum().sum()))
    x_train = train_reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1)
    x_test = test_reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1)
    y_train = train_reg["Rating"]
    y_test = test_reg["Rating"]
    train_result, test_result, error_table, fig = train_test_xgboost(
        x_train, x_test, y_train, y_test, "XGB_BSL", error_table)
    model_train_evaluation["XGB_BSL"] = train_result
    model_test_evaluation["XGB_BSL"] = test_result
    return model_test_evaluation, model_train_evaluation, error_table, fig
Example #19
 def __init__(self,
              train_data,
              model_to_use=["baselineonly", "svd", "coClustering", "knn"]):
     """initialize class with full dataset and a set of base models to use"""
     AlgoBase.__init__(self)
     self.available_models = {
         "baselineonly":
         BaselineOnly(
             bsl_options={
                 "method": "sgd",
                 "n_epochs": 30,
                 "reg": 0.1,
                 "learning_rate": 0.005
             }),
         "svd":
         SVD(lr_all=0.005, n_factors=50, reg_all=0.1),
         "coClustering":
         CoClustering(n_epochs=3, n_cltr_u=3, n_cltr_i=3),
         "knn":
         KNNWithMeans(k=40,
                      sim_options={
                          "name": "cosine",
                          "user_based": False
                      }),
     }
     self.model_selection = []
     for model in model_to_use:
         self.model_selection.append([model, self.available_models[model]])
     self.model_rmse = {}
     self.model_mae = {}
     self.model_list = {}
     self.trainset = train_data.build_full_trainset()
Example #20
def test_surprise(train, test, items, algo=["baseline", "svd", "svdpp"], algo_params={}, rating_scale=(1, 5)):
    train_affinities = train
    validation_affinities = test
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)
    reader = Reader(rating_scale=rating_scale)
    trainset = Dataset.load_from_df(train, reader).build_full_trainset()
    # testset = Dataset.load_from_df(test, reader).build_full_trainset().build_anti_testset()
    testset = Dataset.load_from_df(test, reader).build_full_trainset().build_testset()
    trainset_for_testing = trainset.build_testset()

    def use_algo(algo, name):
        start = time.time()
        algo.fit(trainset)
        predictions = algo.test(testset)
        end = time.time()
        total_time = end - start
        rmse = accuracy.rmse(predictions, verbose=False)
        mae = accuracy.mae(predictions, verbose=False)

        ex_ee = extraction_efficiency(algo, train_affinities, validation_affinities, surprise_get_topk, items)

        predictions = algo.test(trainset_for_testing)
        train_rmse = accuracy.rmse(predictions, verbose=False)
        train_mae = accuracy.mae(predictions, verbose=False)
        return {"algo": name, "rmse": rmse, "mae": mae, "map": ex_ee["map"], "retrieval_time": ex_ee["retrieval_time"],
                "train_rmse": train_rmse, "train_mae": train_mae, "time": total_time}

    algo_map = {"svd": SVD(**(algo_params["svd"] if "svd" in algo_params else {})),
                "svdpp": SVDpp(**(algo_params["svdpp"] if "svdpp" in algo_params else {})),
                "baseline": BaselineOnly(bsl_options={'method': 'sgd'})}
    results = list(map(lambda a: use_algo(algo_map[a], a), algo))
    return results
Example #21
def run_surprise():
    # Load the movielens-100k dataset (download it if needed).
    data = Dataset.load_builtin('ml-100k')

    # Use the famous SVD algorithm.
    algo_svd = SVD()
    algo_normal = NormalPredictor()
    algo_baseline = BaselineOnly()
    algo_knnBasic = KNNBasic()

    # Run 5-fold cross-validation and print results.
    cross_validate(algo_svd,
                   data,
                   measures=['RMSE', 'MAE'],
                   cv=5,
                   verbose=True)
    cross_validate(algo_normal,
                   data,
                   measures=['RMSE', 'MAE'],
                   cv=5,
                   verbose=True)
    cross_validate(algo_baseline,
                   data,
                   measures=['RMSE', 'MAE'],
                   cv=5,
                   verbose=True)
    cross_validate(algo_knnBasic,
                   data,
                   measures=['RMSE', 'MAE'],
                   cv=5,
                   verbose=True)
Example #22
def benchmark(data):
    performance = []
    algorithms = [
        SVD(),
        SVDpp(),
        SlopeOne(),
        NMF(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering(),
        SVD_SGD_momentum(),
        SVDpp_SGD_momentum()
    ]
    for algorithm in algorithms:
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE', 'MAE', 'FCP'],
                                 cv=3,
                                 verbose=False)
        output = pd.DataFrame.from_dict(results).mean(axis=0)
        output = output.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        performance.append(output)
    output_df = pd.DataFrame(performance).set_index('Algorithm').sort_values(
        'test_rmse')
    store_dataframe(output_df, 'Algorithm_Benchmark.csv')
Example #23
def test_als_reg_i_field():
    """Ensure the reg_i field is taken into account."""

    bsl_options = {'method': 'als',
                   'reg_i': 0,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_regi_0 = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    bsl_options = {'method': 'als',
                   'reg_i': 10,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_regi_10 = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    assert rmse_als_regi_0 != rmse_als_regi_10
Example #24
 def __init__(self, train_data):
     AlgoBase.__init__(self)
     self.model_selection = [[
         'baselineonly',
         BaselineOnly(bsl_options={
             'method': 'als',
             'n_epochs': 25,
             'reg_u': 5,
             'reg_i': 3
         })
     ], ['svd', SVD(lr_all=0.01, n_epochs=25, reg_all=0.2)],
                             [
                                 'coClustering',
                                 CoClustering(n_epochs=3,
                                              n_cltr_u=3,
                                              n_cltr_i=3)
                             ],
                             [
                                 'knn',
                                 KNNBasic(k=40,
                                          sim_options={
                                              'name': 'cosine',
                                              'user_based': False
                                          })
                             ]]
     self.model_rmse = {}
     self.model_list = {}
     self.trainset = train_data.build_full_trainset()
Example #25
def batchrunSVDpp(data, al, folds):
    '''
    Run one batch of data through the chosen algorithm.
    Args:
        data: data file name as a string.
        al: algorithm name as a string.
        folds: number of folds for cross-validation, integer.
    Returns:
        None
    '''

    # load the data with the given data format
    # (`path` and `reader` are assumed to be defined at module level)
    print("load data...")
    data = Dataset.load_from_file(path + data, reader=reader)

    # split the data into x folds for cross-validation (legacy surprise API)
    print("Split data...")
    data.split(n_folds=folds)

    # Choose between the SVD++ and baseline algorithms.

    if al == 'SVDpp':
        algo = SVDpp()
    elif al == 'Base':
        algo = BaselineOnly(bsl_options=bsl_options)

    # Evaluate performances of the algorithm on the dataset
    # (legacy surprise evaluate() API).
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

    print_perf(perf)
def check_for_args():
    args = sys.argv
    for arg in args:
        if (arg == 'SVD'):
            alg_list.append(SVD())
        elif (arg == 'SVDpp'):
            alg_list.append(SVDpp())
        elif (arg == 'SlopeOne'):
            alg_list.append(SlopeOne())
        elif (arg == 'NMF'):
            alg_list.append(NMF())
        elif (arg == 'NormalPredictor'):
            alg_list.append(NormalPredictor())
        elif (arg == 'KNNBaseline'):
            alg_list.append(KNNBaseline())
        elif (arg == 'KNNBasic'):
            alg_list.append(KNNBasic())
        elif (arg == 'KNNWithMeans'):
            alg_list.append(KNNWithMeans())
        elif (arg == 'KNNWithZScore'):
            alg_list.append(KNNWithZScore())
        elif (arg == 'BaselineOnly'):
            alg_list.append(BaselineOnly())
        elif (arg == 'CoClustering'):
            alg_list.append(CoClustering())

    return alg_list
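
# Usage sketch: algorithms are selected from the command line, e.g.
#
#     python benchmark.py SVD BaselineOnly KNNBasic
#
# (`alg_list` is assumed to be an empty list defined at module level, and
# `benchmark.py` is a hypothetical script name.)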
Example #27
def crossvalidate(data):
    results = []
    for algorithm in [
            NormalPredictor(),
            KNNBaseline(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNBasic(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithMeans(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithZScore(k=15, sim_options=similarity_measure('pearson', 1)),
            BaselineOnly(),
            SVD(),
            SVDpp(),
            NMF(),
            SlopeOne(),
            CoClustering()
    ]:
        result = cross_validate(algorithm,
                                data,
                                measures=['RMSE'],
                                cv=5,
                                verbose=False)
        temp = pd.DataFrame.from_dict(result).mean(axis=0)
        temp = temp.append(
            pd.Series([str(algorithm).split(' ')[0].split(".")[-1]],
                      index=['Algorithm']))
        results.append(temp)
    rmse_values = pd.DataFrame(results).set_index('Algorithm').sort_values(
        'test_rmse')
    return rmse_values
def EvaluateDifferentAlgorithms():
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_6months,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        benchmark.append(tmp)

        print(
            pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
                'test_rmse'))
Example #29
def test_method_field(u1_ml100k, pkf):
    """Ensure the method field is taken into account."""

    bsl_options = {'method': 'als'}
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    bsl_options = {'method': 'sgd'}
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    assert rmse_als != rmse_sgd

    with pytest.raises(ValueError):
        bsl_options = {'method': 'wrong_name'}
        algo = BaselineOnly(bsl_options=bsl_options)
        cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
 def _hyperopt(self, params):
     algo = BaselineOnly(**params)
     return cross_validate(algo,
                           self._data,
                           measures=ACCURACY_METRICS,
                           cv=self._cv,
                           n_jobs=self._cv_n_jobs,
                           verbose=self._debug)[self._metric].mean()
Example #31
def test_dump(u1_ml100k):
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct ones, and
    that the predictions of the dumped algorithm are also equal to the other
    ones."""

    random.seed(0)

    trainset, testset = next(PredefinedKFold().split(u1_ml100k))

    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)

        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_dumped
        assert predictions == predictions_algo_dumped
Example #32
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader, rating_scale=(1, 5))

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset