Example no. 1
def surprise_SVD(train_file, test_file):
    """
    SVD with the Surprise library.
    Compute the predictions on a test set after training on a train set using the SVD method from Surprise.
    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        n_factors : The number of factors.
        n_epochs : The number of iterations of the SGD procedure.
        lr_all : The learning rate for all parameters.
        reg_all : The regularization term for all parameters.


    Returns:
        numpy array: predictions
    """
    print("SVD")
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    # Algorithm
    algo = SVD(n_epochs=30, lr_all=0.01, reg_all=0.1)
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    # collect the estimated ratings into a numpy array
    pred = np.array([p.est for p in predictions])
    return pred
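These excerpts omit their imports. A minimal preamble that would make Example no. 1 runnable, together with a hypothetical call (the CSV paths are placeholders; the files must be in the user,item,rating format declared by the Reader):

import numpy as np
from surprise import SVD, Dataset, Reader
from surprise.model_selection import PredefinedKFold

predictions = surprise_SVD("datas/train.csv", "datas/test.csv")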
Example no. 2
def surprise_slopeOne(train_file, test_file):
    """
    SlopeOne with the Surprise library.
    Compute the predictions on a test set after training on a train set using the SlopeOne method from Surprise.
    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        -
    Returns:
        numpy array: predictions
    """
    print("slopeone")
    algo = SlopeOne()
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    # collect the estimated ratings into a numpy array
    pred = np.array([p.est for p in predictions])
    return pred
Example no. 3
def test_dump():
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct ones, and
    that the predictions of the dumped algorithm are also equal to the other
    ones."""

    random.seed(0)

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))
    pkf = PredefinedKFold()

    trainset, testset = next(pkf.split(data))

    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)

        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_dumped
        assert predictions == predictions_algo_dumped
Example no. 4
def surprise_algo(algo, train_path="datas/train.csv", test_path="datas/test.csv", verbose=True):
    # reader with rating scale
    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 5))
    
    # Specify the training and test dataset
    folds_files = [(train_path, test_path)]

    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()
    
    print("Start prediction...")
    for trainset, testset in pkf.split(data):
        # Train the algorithm on this fold and predict
        algo.fit(trainset)
        predictions = algo.test(testset)
    
    pred = pd.read_csv(test_path, names = ["User", "Movie", "Rating"])
    
    print("Postprocessing predictions...")
    for index, row in pred.iterrows():
        rating = round(predictions[index].est)
        if rating > 5:
            rating = 5
        elif rating < 1:
            rating = 1
        row.Rating = rating
    
    return pred
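surprise_algo accepts any Surprise algorithm instance and additionally relies on pandas (import pandas as pd). A minimal usage sketch with the function's default paths, using the same SVD settings as Example no. 1:

import pandas as pd
from surprise import SVD

submission = surprise_algo(SVD(n_epochs=30, lr_all=0.01, reg_all=0.1))
print(submission.head())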
Example no. 5
def surprise_knn_ub(train_file, test_file):
    """
    User-based KNN with the Surprise library.
    Compute the predictions on a test set after training on a train set using the KNNBaseline method from Surprise.
    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        k : The (max) number of neighbors to take into account for aggregation.
        sim_options (dict) : A dictionary of options for the similarity measure.

    Returns:
        numpy array: predictions
    """
    print("knnUB")
    algo = KNNBaseline(k=300,
                       sim_options={
                           'name': 'pearson_baseline',
                           'user_based': True
                       })
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    # collect the estimated ratings into a numpy array
    pred = np.array([p.est for p in predictions])
    return pred
Example no. 6
    def prepare_data(self):
        super(RecommenderOnSurprice, self).prepare_data()

        reader = Reader(line_format='user item rating', sep='\t', rating_scale=(1, 5))
        data = Dataset.load_from_folds([(self.train_path, self.test_path)], reader=reader)
        trainset, testset = None, None
        pkf = PredefinedKFold()
        for trainset_, testset_ in pkf.split(data):
            trainset, testset = trainset_, testset_
        self.trainset, self.testset = trainset, testset
Example no. 7
    def load_test_files(self):
        reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
        train_file = 'test-data-train.csv'
        test_file = 'test-data-test.csv'
        folds_files = [(train_file, test_file)]

        data = Dataset.load_from_folds(folds_files, reader=reader)
        pkf = PredefinedKFold()

        trainset, testset = next(pkf.split(data))

        return trainset, testset
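A sketch of how the returned pair would typically be consumed; loader stands for a hypothetical instance of the class that defines load_test_files, and the choice of BaselineOnly is arbitrary:

from surprise import BaselineOnly, accuracy

trainset, testset = loader.load_test_files()
algo = BaselineOnly()
algo.fit(trainset)
accuracy.rmse(algo.test(testset), verbose=True)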
Example no. 8
def precompute_data(experiment_dir):
    rating_train_path = os.path.join(experiment_dir, 'ratings_train.txt')
    rating_test_path = os.path.join(experiment_dir, 'ratings_test.txt')
    ratings_reader = Reader(line_format="user item rating", sep=' ')
    dataset = Dataset.load_from_folds([(rating_train_path, rating_test_path)],
                                      ratings_reader)
    pkf = PredefinedKFold()
    trainset, testset = list(pkf.split(dataset))[0]

    n_x, yr = trainset.n_users, trainset.ir
    min_support = 1
    args = [n_x, yr, min_support]
    sims = pearson(*args).astype(np.float32)
    return trainset, testset, sims
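precompute_data calls the low-level Cython similarity routine directly: pearson(n_x, yr, min_support) over the item-grouped ratings trainset.ir yields a user-user similarity matrix. The excerpt assumes imports along these lines:

import os
import numpy as np
from surprise import Dataset, Reader
from surprise.model_selection import PredefinedKFold
from surprise.similarities import pearson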
Example no. 9
def load_data_hyper(train_file, submission_file):
    train_file, submission_file, df, df_toBeSubmitted = modify_data(
        train_file, submission_file)
    reader = Reader(line_format='user item rating', sep=',')

    fold = [(train_file, submission_file)]
    trainAndTest = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()

    # Go through 1 fold
    for trainset, testset in pkf.split(trainAndTest):
        data = trainset
        test = testset

    return data, test, df, df_toBeSubmitted
Example no. 10
def test_PredifinedKFold():

    reader = Reader(line_format='user item rating',
                    sep=' ',
                    skip_lines=3,
                    rating_scale=(1, 5))

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    # Make sure rating files are read correctly
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3
Example no. 11
def test_PredifinedKFold(toy_data_reader):

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader, rating_scale=(1, 5))

    # Make sure rating files are read correctly
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3

    # Make sure pkf returns the same folds as the deprecated data.folds()
    with pytest.warns(UserWarning):
        trainset_, testset_ = next(data.folds())
    assert testset_ == testset
Example no. 12
def func7():
    import os
    from surprise import SVD
    from surprise import Dataset
    from surprise import Reader
    from surprise import accuracy
    from surprise.model_selection import PredefinedKFold

    files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')
    reader = Reader('ml-100k')

    train_file = files_dir + 'u%d.base'
    test_file = files_dir + 'u%d.test'
    folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()

    algo = SVD()
    for trainset, testset in pkf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        accuracy.rmse(predictions, verbose=True)
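Since accuracy.rmse returns the computed value, the loop in func7 can also aggregate the per-fold scores instead of only printing them; a minimal variation:

rmses = []
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    rmses.append(accuracy.rmse(algo.test(testset), verbose=False))
print('mean RMSE over the 5 folds:', sum(rmses) / len(rmses))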
Example no. 13
def basic_rec(model_name, train_path, test_path, target_id):
    # build data
    # TODO check float and min_r
    reader = Reader(line_format='user item rating',
                    sep='\t',
                    rating_scale=(1, 5))
    data = Dataset.load_from_folds([(train_path, test_path)], reader=reader)
    trainset, testset = None, None
    pkf = PredefinedKFold()
    for trainset_, testset_ in pkf.split(data):
        trainset, testset = trainset_, testset_

    # train model
    rec_algo = get_model(model_name)
    rec_algo.fit(trainset)
    # eval
    preds = rec_algo.test(testset)
    rmse = accuracy.rmse(preds, verbose=True)

    # predor target
    fn_pred = lambda uid: rec_algo.predict(str(uid), str(target_id), r_ui=0).est
    target_predictions = list(map(fn_pred, range(trainset.n_users)))

    # topn
    testset = trainset.build_anti_testset()
    predictions = rec_algo.test(testset)
    top_n = get_top_n(predictions, n=50)

    hit_ratios = {}
    for uid, user_ratings in top_n.items():
        topN = [int(iid) for (iid, _) in user_ratings]
        hits = [
            1 if target_id in topN[:i] else 0 for i in [1, 3, 5, 10, 20, 50]
        ]
        hit_ratios[int(uid)] = hits
    return target_predictions, hit_ratios
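basic_rec calls two helpers that the excerpt does not define, get_model and get_top_n. A minimal get_top_n in the style of the Surprise FAQ (the upstream implementation may differ):

from collections import defaultdict

def get_top_n(predictions, n=10):
    # map each user id to their n highest-estimated (item id, rating) pairs
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n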
Example no. 14
def test_knns():
    """Ensure the k and min_k parameters are effective for knn algorithms."""

    # the test and train files are from the ml-100k dataset (10% of u1.base and
    # 10% of u1.test)
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))
    pkf = PredefinedKFold()

    # Actually, as KNNWithMeans and KNNBaseline have back up solutions for when
    # there are not enough neighbors, we can't really test them...
    klasses = (KNNBasic, )  # KNNWithMeans, KNNBaseline)

    k, min_k = 20, 5
    for klass in klasses:
        algo = klass(k=k, min_k=min_k)
        for trainset, testset in pkf.split(data):
            algo.fit(trainset)
            predictions = algo.test(testset)
            for pred in predictions:
                if not pred.details['was_impossible']:
                    assert min_k <= pred.details['actual_k'] <= k
Example no. 15
def run_knn_baseline(sparse_data):
    #filename = "test.json"
    prefix = "knn_baseline_"
    trainFile = prefix + "train.txt"
    testFile = prefix + "test.txt"

    raw_data, userPurchasedSet, userTrueTestSet = preprocess(
        sparse_data, trainFile, testFile)
    folds_files = [(trainFile, testFile)]
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()
    bsl_options = {
        'method': 'sgd',
        'n_epochs': 20,
        'learning_rate': 0.005,
    }
    ### sim name: cosine    msd       pearson     pearson_baseline
    ### user_based : True ---- similarity will be computed based on users
    ###            : False ---- similarity will be computed based on items.
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    predictions = {}
    top_n = {}
    testsSet = None
    total_precisions = 0.0
    total_recalls = 0.0
    total_hit = 0.0
    total_nDCG = 0.0
    total_ffeature = 0.0
    result_file = prefix + "result.txt"
    result_f = open(result_file, "w")
    for trainset, testset in pkf.split(data):
        testsSet = testset

        #algo = SVD(n_factors = 5)
        algo = KNNBaseline(bsl_options=bsl_options, sim_options=sim_options)
        algo.fit(trainset)
        pre = algo.test(testset)
        accuracy.rmse(pre)
        accuracy.mae(pre)
        #calculate_rmse(predictions)

        ### test
        rowNum = raw_data.get_row_size()
        colNum = raw_data.get_col_size()
        cur_time = time.time()
        time_cost = 0

        for i in range(rowNum):
            user = raw_data.get_userID(i)
            predictions[user] = set()
            pq = []
            heapq.heapify(pq)
            for j in range(colNum):
                item = raw_data.get_itemID(j)
                if user not in userPurchasedSet or item in userPurchasedSet[user]:
                    continue
                value = raw_data.get_val(user, item, 'rating')
                predict = algo.predict(user, item, r_ui=0, verbose=False)[3]
                if len(pq) >= 10:
                    # keep the 10 largest estimates: replace the current
                    # minimum only if the new prediction beats it
                    heapq.heappushpop(pq, (predict, item))
                else:
                    heapq.heappush(pq, (predict, item))
            top_n[user] = set()
            for items in pq:
                top_n[user].add(items[1])
            if user in userTrueTestSet:
                curPrecisions = calculate_precision(top_n[user],
                                                    userTrueTestSet[user])
                curRecalls = calculate_recall(top_n[user],
                                              userTrueTestSet[user])
                ffeature = calculate_f_feature(curPrecisions, curRecalls)
                curHit = isHit(top_n[user], userTrueTestSet[user])
                cur_nDCG = calculate_NDCG(top_n[user], userTrueTestSet[user])
                total_precisions += curPrecisions
                total_recalls += curRecalls
                total_hit += curHit
                total_nDCG += cur_nDCG
                total_ffeature += ffeature
                result_f.write(user + "\t" + str(curPrecisions) + "\t" +
                               str(curRecalls) + "\t" + str(ffeature) + "\t" +
                               str(curHit) + '\t' + str(cur_nDCG) + "\n")
            if i != 0 and i % 1000 == 0:
                duration = (time.time() - cur_time) / 60
                time_cost += duration
                remaining_time = ((rowNum - i) / 1000) * duration
                cur_time = time.time()
                # print('precisions', total_precisions, 'recalls', total_recalls, 'nDCG', total_nDCG)
                print('i:', i, "/", rowNum, 'remaining time:', remaining_time, 'min')
    print('precisions', total_precisions, ' recalls', total_recalls, ' hit', total_hit, 'nDCG:', total_nDCG)
    rowNum = raw_data.get_row_size()
    print('avg_precisions:', total_precisions / rowNum, 'avg_recalls:', total_recalls / rowNum,
          'avg_ffeature:', total_ffeature / rowNum, 'avg_hit:', total_hit / rowNum,
          'avg_nDCG:', total_nDCG / rowNum)
    result_f.write("avg:\t" + str(total_precisions / rowNum) + "\t" +
                   str(total_recalls / rowNum) + "\t" +
                   str(total_ffeature / rowNum) + "\t" +
                   str(total_hit / rowNum) + '\t' + str(total_nDCG / rowNum) +
                   "\n")
    result_f.close()
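The metric helpers used by run_knn_baseline (preprocess, calculate_precision, calculate_recall, and friends) are not shown. Minimal sketches of the two set-based metrics under their usual definitions, with hypothetical bodies matching the calls above:

def calculate_precision(recommended, relevant):
    # fraction of the recommended items that are actually relevant
    return len(recommended & relevant) / len(recommended) if recommended else 0.0

def calculate_recall(recommended, relevant):
    # fraction of the relevant items that were recommended
    return len(recommended & relevant) / len(relevant) if relevant else 0.0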
Example no. 16
# userRatingRDD = userRatingRDD.filter(lambda t: int(preUserMap[t[0]]) % 30 == 0)
# busRatingRDD = busRatingRDD.filter(lambda t: int(preBusinessMap[t[0]]) % 40 == 0)
# print("add user avg count: " + str(userRatingRDD.count()))
# print("add bus avg count: " + str(busRatingRDD.count()))

reader = Reader(line_format='user item rating', sep=",", skip_lines=1)

folds_files = [(trainingFilePath, validationFilePath)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo = SVD()

predictionList = list()
for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    for uid, iid, true_r, est, _ in predictions:
        predictionList.append((uid, iid, est))

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

# ((user_id, business_id), rating)
predictionSVD = sc.parallelize(predictionList).map(lambda t:
                                                   ((t[0], t[1]), t[2]))

ratingMap = dict()
Example no. 17
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader)

    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
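One detail worth noting from the block above: build_anti_testset() fills the unknown ratings with the trainset's global mean by default, which is why the assertions compare against trainset.global_mean; a fixed fill value can be passed instead:

testset = trainset.build_anti_testset(fill=3.0)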
Example no. 18
import os

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader, rating_scale=(1, 5))
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
Example no. 19
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)

train_file = '../data/train%d.csv'
test_file =  '../data/test%d.csv'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader)

pkf = PredefinedKFold()

#kf = KFold(n_splits=5)

algorithm = SVD()

i = 1
for train, test in pkf.split(data):
    print(f'Fitting model {i}')
    algorithm.fit(train)
    print(f'Testing model {i}')
    predictions = algorithm.test(test)


    print(f'Saving model {i}')
    model_dump = dump.dump(f'../svd_data/model{i}.model', predictions, algorithm)
    i += 1

# for trainset, testset in kf.split(data):
#     algorithm.fit(trainset)
#     predictions = algorithm.test(testset)
#
#     accuracy.rmse(predictions)
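The dumped files can later be reloaded with surprise.dump.load, which returns the (predictions, algorithm) pair in that order, as Example no. 3 shows. A minimal sketch, assuming the model files written above exist:

from surprise import accuracy, dump

predictions, algorithm = dump.load('../svd_data/model1.model')
accuracy.rmse(predictions, verbose=True)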