def baseline(trainset, testset, predset):
    
    modelname = 'baseline'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return
    
    bsl_options = {'method': 'als',
                   'reg_i': 1.e-5,
                   'reg_u': 14.6,
                   'n_epochs': 10}
    
    algo = BaselineOnly(bsl_options=bsl_options)
    print('Baseline Model')
    algo.fit(trainset)
    
    predictions = algo.test(trainset.build_testset())
    print('   RMSE on Train: ', accuracy.rmse(predictions, verbose=False))
    
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds, 'test')

    print('   Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds)
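For reference, BaselineOnly fits the bias model r_ui = mu + b_u + b_i. Below is a minimal, self-contained sketch of the same train/test pattern with a toy DataFrame standing in for the repo's data; the is_already_predicted/save_predictions helpers are repo-specific and omitted here.

import pandas as pd
from surprise import BaselineOnly, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

ratings = pd.DataFrame({'user': [1, 1, 2, 2, 3, 3],
                        'item': [10, 20, 10, 30, 20, 30],
                        'rating': [4, 2, 5, 3, 1, 4]})
data = Dataset.load_from_df(ratings[['user', 'item', 'rating']],
                            Reader(rating_scale=(1, 5)))
trainset, testset = train_test_split(data, test_size=0.5, random_state=0)

algo = BaselineOnly(bsl_options={'method': 'als', 'reg_i': 1.e-5,
                                 'reg_u': 14.6, 'n_epochs': 10})
algo.fit(trainset)
print('RMSE on Test:', accuracy.rmse(algo.test(testset), verbose=False))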
Example #2
def base_running_time(data):
    '''
        Calculates the running times for training and prediction for the Baseline algorithm

        Args:
            data (list of Dataset): datasets with different numbers of users

        Returns:
            elapsedtime_Basetrain: running times for training
            elapsedtime_Basetest: running times for predictions on the testsets
    '''
    elapsedtime_Basetrain = []
    elapsedtime_Basetest = []

    # calculate running times
    for i in range(len(data)):
        # training running time
        training_start = time.time()
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()
        baseline = BaselineOnly()
        baseline.fit(training)
        elapsedtime_Basetrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        baseline.test(testing)
        elapsedtime_Basetest.append(time.time() - test_start)
    return elapsedtime_Basetrain, elapsedtime_Basetest
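A hypothetical driver for base_running_time; ratings_df and its column names are illustrative assumptions, not part of the original snippet.

import pandas as pd
from surprise import Dataset, Reader

# ratings_df is assumed to hold user_id / item_id / rating columns.
reader = Reader(rating_scale=(1, 5))
datasets = []
for n_users in (100, 200, 400):
    subset = ratings_df[ratings_df['user_id'] <= n_users]
    datasets.append(Dataset.load_from_df(
        subset[['user_id', 'item_id', 'rating']], reader))

train_times, test_times = base_running_time(datasets)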
def normalize_affinity_scores_by_user_item_bs(user_item_affinities: List[Tuple[str, str, float]], rating_scale=(1, 5)) \
        -> Tuple[float, Dict[str, float], Dict[str, float], float, List[Tuple[str, str, float]]]:
    train = pd.DataFrame(user_item_affinities)
    reader = Reader(rating_scale=rating_scale)
    trainset = Dataset.load_from_df(train, reader).build_full_trainset()
    trainset_for_testing = trainset.build_testset()
    algo = BaselineOnly(bsl_options={'method': 'sgd'})
    algo.fit(trainset)
    predictions = algo.test(trainset_for_testing)
    mean = algo.trainset.global_mean
    bu = {
        u: algo.bu[algo.trainset.to_inner_uid(u)]
        for u in set([u for u, i, r in user_item_affinities])
    }
    bi = {
        i: algo.bi[algo.trainset.to_inner_iid(i)]
        for i in set([i for u, i, r in user_item_affinities])
    }
    uid = [[p.uid, p.iid, p.r_ui - p.est] for p in predictions]
    estimates = [p.est for p in predictions]
    estimates_2 = [
        p.r_ui - (mean + bu[p.uid] + bi[p.iid]) for p in predictions
    ]
    uid = pd.DataFrame(uid, columns=["user", "item", "rating"])
    spread = max(uid["rating"].max(), np.abs(uid["rating"].min()))
    uid = list(zip(uid['user'], uid['item'], uid['rating']))
    bu = defaultdict(float, bu)
    bi = defaultdict(float, bi)
    # assert estimates == estimates_2
    return mean, bu, bi, spread, uid
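A short usage sketch for the function above with toy affinities (assumes the function and its imports are in scope):

affinities = [('u1', 'i1', 4.0), ('u1', 'i2', 2.0),
              ('u2', 'i1', 5.0), ('u2', 'i2', 3.0)]
mean, bu, bi, spread, residuals = \
    normalize_affinity_scores_by_user_item_bs(affinities)
# Each residual is r_ui minus the baseline estimate, where the estimate is
# mean + bu[user] + bi[item] up to rating-scale clipping.
for u, i, res in residuals:
    print(u, i, mean + bu[u] + bi[i] + res)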
def use_als():
    start = time.time()
    performance = []

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using ALS')
    bsl_options = {'method': 'als', 'n_epochs': 20, 'reg_u': 12, 'reg_i': 5}
    algo_ALS = BaselineOnly(bsl_options=bsl_options)
    algo_ALS.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions_ALS = algo_ALS.test(testset)

    accuracy_rmse = accuracy.rmse(predictions_ALS)
    accuracy_mae = accuracy.mae(predictions_ALS)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)

    end = time.time()
    performance.append(end - start)

    return performance
def use_sgd():
    start = time.time()
    performance = []

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using SGD')
    bsl_options = {
        'method': 'sgd',
        'learning_rate': .005,
    }

    algo_SGD = BaselineOnly(bsl_options=bsl_options)
    algo_SGD.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions_SGD = algo_SGD.test(testset)

    accuracy_rmse = accuracy.rmse(predictions_SGD)
    accuracy_mae = accuracy.mae(predictions_SGD)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)

    end = time.time()
    performance.append(end - start)

    return performance
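For a lighter comparison of the two option sets above, a sketch using Surprise's cross_validate (held-out folds rather than the anti-testset, so the numbers are not directly comparable to the functions above):

from surprise import BaselineOnly, Dataset
from surprise.model_selection import cross_validate

data = Dataset.load_builtin('ml-100k')
for opts in ({'method': 'als', 'n_epochs': 20, 'reg_u': 12, 'reg_i': 5},
             {'method': 'sgd', 'learning_rate': .005}):
    cross_validate(BaselineOnly(bsl_options=opts), data,
                   measures=['RMSE', 'MAE'], cv=3, verbose=True)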
Example #6
def surprise_baseline(train_file, test_file):
    """
    Baseline with the Surprise library.
    Compute predictions on a test set after training on a train set using Surprise's BaselineOnly.
    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        -
    Returns:
        numpy array: predictions
    """
    print("baseline")
    algo = BaselineOnly()
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        val = predictions[i].est
        pred[i] = val
    return pred
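Hypothetical usage of surprise_baseline: write tiny comma-separated 'user item rating' files and score them (file names and values are illustrative).

import csv

with open('toy_train.csv', 'w', newline='') as f:
    csv.writer(f).writerows([(1, 1, 4), (1, 2, 2), (2, 1, 5), (2, 2, 3)])
with open('toy_test.csv', 'w', newline='') as f:
    csv.writer(f).writerows([(1, 1, 4), (2, 2, 3)])

print(surprise_baseline('toy_train.csv', 'toy_test.csv'))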
Example #7
def test_dump():
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct ones, and
    that the predictions of the dumped algorithm are also equal to the other
    ones."""

    random.seed(0)

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))
    pkf = PredefinedKFold()

    trainset, testset = next(pkf.split(data))

    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)

        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_dumped
        assert predictions == predictions_algo_dumped
def baseline_bias_model(df):
    """
        Shows the performance of a model based on bias terms alone
    """
    ratings_pandas_df = df.drop(columns=['date', 'text'])
    #    ratings_pandas_df.columns = ['user_id', 'business_id', 'rating']

    reader = Reader(rating_scale=(1, 5))  #TODO figure out

    data = surprise.dataset.Dataset.load_from_df(df=ratings_pandas_df,
                                                 reader=reader)

    ts = data.build_full_trainset()
    dusers = ts._raw2inner_id_users
    ditems = ts._raw2inner_id_items

    trainset, testset = train_test_split(data)

    algo = BaselineOnly()
    algo.fit(trainset)

    # testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    print('\n')
    return (trainset, testset, predictions, dusers, ditems)
Example #9
def baseline(trainset, testset):
    algo = BaselineOnly()
    algo.fit(trainset)
    print("Predictions")
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    return predictions
def predict(path):
    # Read the data and build a full trainset from it
    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    data = Dataset.load_from_file("{}".format(path), reader=reader)
    all_train = data.build_full_trainset()
    bsl = BaselineOnly()
    svd = SVD()
    bsl.fit(all_train)
    svd.fit(all_train)
    all_test = all_train.build_anti_testset()
    bsl_predictions = bsl.test(all_test)
    bsl_pred = get_top_n(bsl_predictions, 100)
    svd_predictions = svd.test(all_test)
    svd_pred = get_top_n(svd_predictions, 100)
    with open("baseline_predictions.pickle", "wb") as f:
        pickle.dump([bsl_pred, svd_pred], f, protocol=2)
    print("Done recommending using baseline model and SVD model.")
def baseline_only(train, test, ids, Xtest, Xids):
    """
    Combines user and item mean with user and item biases
    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('Baseline Only')
    bsl_options = {
        'method': 'als',
        'n_epochs': 100,
        'reg_u': 15,
        'reg_i': 0.01
    }

    algo = BaselineOnly(bsl_options=bsl_options, verbose=False)

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
Example #12
def baseline(trainset, testset):

    print("\n" + "-" * 5 + " Baseline algorithm using surprise package " +
          "-" * 5)
    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
Example #13
class BaseLineRecommender(object):
    """
    Use surprise's baselineonly algorithm as the baseline of prediction
    """
    def __init__(self):
        self.model = None

    def fit(self, train):
        """
        Fit the model
        """
        self.model = BaselineOnly(bsl_options={
            'method': 'sgd',
            'n_epochs': 30,
            'reg': 0.01,
            'learning_rate': 0.01
        })
        self.model.fit(train)

    def predict(self, user_id, item_id):
        """
        Predict ratings
        """
        return self.model.predict(user_id, item_id)

    def rmse(self, test):
        """
        Calculate RMSE for the predicted ratings
        """
        pred = self.model.test(test)
        return accuracy.rmse(pred)

    def mae(self, test):
        """
        Calculate MAE for the predicted ratings
        """
        pred = self.model.test(test)
        return accuracy.mae(pred)
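A short usage sketch for BaseLineRecommender, assuming Surprise's built-in ml-100k download:

from surprise import Dataset
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
train, test = train_test_split(data, test_size=0.2)

rec = BaseLineRecommender()
rec.fit(train)
rec.rmse(test)
print(rec.predict('196', '302').est)  # raw ids are strings in ml-100k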
Example #14
def test_dump(u1_ml100k):
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct ones, and
    that the predictions of the dumped algorithm are also equal to the other
    ones."""

    random.seed(0)

    trainset, testset = next(PredefinedKFold().split(u1_ml100k))

    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)

        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_dumped
        assert predictions == predictions_algo_dumped
Example #15
def baseline(training, testing):
    '''
    Calculates RMSE, coverage and running time of Baseline model

    Args:
        training(Dataset): training dataset
        testing(Dataset): test dataset

    Returns:
        rmse: RMSE of Baseline with optimized parameters
        top_n: number of unique predictions for top n items
    '''

    # fit model
    baseline = BaselineOnly()
    baseline.fit(training)

    # evaluate the model using test data
    predictions = baseline.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)

    return rmse, top_n
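Several snippets in this collection call a get_top_n helper without defining it. A common definition, adapted from the Surprise FAQ, is sketched below; note that some examples here expect 3-tuples that also carry the true rating, while this minimal variant returns (iid, est) pairs.

from collections import defaultdict

def get_top_n(predictions, n=10):
    """Map each user id to its n highest-estimated (iid, est) pairs."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n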
Example #16
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    for trainset, testset in data.folds():
        pass  # just need trainset and testset to be set

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
Example #17
    nmf_results = []

    reader = Reader(rating_scale=(0, np.inf))
    data = Dataset.load_from_df(
        usergroups_df[["user_id", "item_id", "rating"]], reader)
    folds_it = KFold(n_splits=5).split(data)
    i = 1
    pl_fit = []
    for trainset, testset in folds_it:
        print("Fold: %d" % i)
        i += 1

        print("Baseline")
        baseline = BaselineOnly()
        baseline.fit(trainset)
        baseline_predictions = baseline.test(testset)
        results = get_group_measures(preds_all=baseline_predictions,
                                     U1=U1_users,
                                     U2=U2_users,
                                     U3=U3_users,
                                     U4=U4_users)
        baseline_results.append(results)

        print("KNN")
        knn = KNNBasic(sim_options={"name": "pearson"})
        #knn = KNNBasic(sim_options={"name": "cosine"})
        knn.fit(trainset)
        knn_predictions = knn.test(testset)
        results = get_group_measures(preds_all=knn_predictions,
                                     U1=U1_users,
                                     U2=U2_users,
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly
from surprise import accuracy
from surprise.model_selection import KFold

# Read the data
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
train_set = data.build_full_trainset()

# ALS optimization; 'sgd' could be chosen instead
# Set the regularization terms for users and items
bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
model = BaselineOnly(bsl_options=bsl_options)

# K-fold cross-validation
kf = KFold(n_splits=5)
for trainset, testset in kf.split(data):
    model.fit(trainset)
    pred = model.test(testset)
    # Compute RMSE
    accuracy.rmse(pred)

uid = str(300)
iid = str(180)

# Print the prediction of uid for iid
pred = model.predict(uid, iid, r_ui=4, verbose=True)
Example #19
def collaborative_filtering(raw_uid):
    # To read the data from a txt file
    # =============== Data preprocessing ===========================
    # Read all the data from the database and dump it to a file
    # dir_data = '/www/wwwroot/music_recommender/page/cf_recommendation/cf_data'
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # Database operations
    # Open the database connection
    db = pymysql.connect("localhost",
                         "root",
                         "password",
                         "music_recommender",
                         charset='utf8')

    # Create a cursor object using the cursor() method
    cursor = db.cursor()
    songData = defaultdict(list)
    sql = """SELECT uid, song_id, rating
              FROM user_rating
               WHERE 1"""
    cursor.execute(sql)
    results = cursor.fetchall()
    with open(file_path, "w+") as data_f:
        for result in results:
            uid, song_id, rating = result
            songData[song_id].append(rating)  # defaultdict(list) creates new keys
            data_f.writelines("{}\t{}\t{}\n".format(uid, song_id, rating))
  
    if not os.path.exists(file_path):
        raise IOError("Dataset file is not exists!")

    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    # Build the training set
    trainset = data.build_full_trainset()
  
    bsl_options = {'method': 'sgd',
                   'learning_rate': 0.0005}
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)  # Train the model

    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=10)
    # print(predictions)
    # uid      raw user id
    # iid      raw item id
    # r_ui     true rating (float)
    # est      estimated rating (float)
    # details  additional details about the prediction
    # print(top_n_baselineonly, 'top_n_baselineonly')
    

    # KNNBasic
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)

    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
  
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)

    top_n_knnbasic = get_top_n(predictions, n=1000)
    # print(predictions, 'top_n_knnbasic')
    # KNNBaseline
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)

    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=1000)

    evaluationMSEResult = evaluationMSE([top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline], raw_uid)

    recommendset = set()
    for results in [top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline]:
        for key in results.keys():
            for recommendations in results[key]:
                iid, rating, true_score = recommendations
                recommendset.add(iid)

    items_baselineonly = set()
    for key in top_n_baselineonly.keys():
        for recommendations in top_n_baselineonly[key]:
            iid, rating, true_score = recommendations
            items_baselineonly.add(iid)

    items_knnbasic = set()
    for key in top_n_knnbasic.keys():
        for recommendations in top_n_knnbasic[key]:
            iid, rating, true_score = recommendations
            items_knnbasic.add(iid)

    items_knnbaseline = set()
    for key in top_n_knnbaseline.keys():
        for recommendations in top_n_knnbaseline[key]:
            iid, rating, true_score = recommendations
            items_knnbaseline.add(iid)

    rank = dict()
    for recommendation in recommendset:
        if recommendation not in rank:
            rank[recommendation] = 0
        if recommendation in items_baselineonly:
            rank[recommendation] += 1
        if recommendation in items_knnbasic:
            rank[recommendation] += 1
        if recommendation in items_knnbaseline:
            rank[recommendation] += 1

    max_rank = max(rank, key=lambda s: rank[s])
    evaluationMSEResult1 = {}
    if rank[max_rank] == 1:  # no item was recommended by more than one model
        return items_baselineonly
    else:
        resultAll = dict()
        result = nlargest(10, rank, key=lambda s: rank[s])
        for k in result:
            resultAll[k] = rank[k]
        # print("排名结果: {}".format(resultAll))
        evaluation(songData, resultAll)
        for key in evaluationMSEResult:
            if key in resultAll:
                evaluationMSEResult1[key] = evaluationMSEResult[key]
        print(evaluationMSEResult1, 'evaluationMSEResult1==')  # Final evaluation
        return resultAll
Example #20
def collaborative_filtering(raw_uid):
    # =============== Data preprocessing ===========================
    # Read all the data from the database and dump it to a file
    # dir_data = '/www/wwwroot/music_recommender/page/cf_recommendation/cf_data'
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # Database operations
    # Open the database connection
    db = pymysql.connect("localhost",
                         "music_system",
                         "music_system",
                         "music_recommender",
                         charset='utf8')

    # Create a cursor object using the cursor() method
    cursor = db.cursor()

    sql = """SELECT uid, song_id, rating
              FROM user_rating
               WHERE 1"""
    cursor.execute(sql)
    results = cursor.fetchall()
    with open(file_path, "w+") as data_f:
        for result in results:
            uid, song_id, rating = result

            data_f.writelines("{}\t{}\t{}\n".format(uid, song_id, rating))

    if not os.path.exists(file_path):
        raise IOError("Dataset file is not exists!")

    # ===========  cf recommend ==================
    # Load the data
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)

    # Build the training set from all the data
    trainset = data.build_full_trainset()

    # ================= BaselineOnly  ==================
    # start = time.clock()

    bsl_options = {
        'method': 'sgd',
        'learning_rate': 0.0005,
    }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)

    # Get the recommendation results
    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=5)

    # end = time.clock()
    # print("user-50NN --- BaselineOnly 耗时: %.2fs\n" % (end-start))
    # print("BaselineOnly 推荐结果:{}\n".format(top_n_baselineonly))

    # ================= KNNBasic  ==================
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)

    # Get recommendations --- only consider the knn users
    # start = time.clock()
    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)
    top_n_knnbasic = get_top_n(predictions, n=5)

    # end = time.clock()
    # print("user-50NN --- KNNBasic 耗时: %.2fs\n" % (end-start))
    # print("KNNBasic 推荐结果:{}\n".format(top_n_knnbasic))

    # ================= KNNBaseline  ==================
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)

    # Get recommendations --- only consider the knn users
    # start = time.clock()
    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=5)

    # end = time.clock()
    # print("user-50NN --- KNNBaseline 耗时: %.2fs\n" % (end-start))
    # print("KNNBaseline 推荐结果:{}\n".format(top_n_knnbaseline))

    # =============== Generate recommendation results proportionally ==================
    recommendset = set()
    for results in [top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline]:
        for key in results.keys():
            for recommendations in results[key]:
                iid, rating = recommendations
                recommendset.add(iid)

    items_baselineonly = set()
    for key in top_n_baselineonly.keys():
        for recommendations in top_n_baselineonly[key]:
            iid, rating = recommendations
            items_baselineonly.add(iid)

    items_knnbasic = set()
    for key in top_n_knnbasic.keys():
        for recommendations in top_n_knnbasic[key]:
            iid, rating = recommendations
            items_knnbasic.add(iid)

    items_knnbaseline = set()
    for key in top_n_knnbaseline.keys():
        for recommendations in top_n_knnbaseline[key]:
            iid, rating = recommendations
            items_knnbaseline.add(iid)

    rank = dict()
    for recommendation in recommendset:
        if recommendation not in rank:
            rank[recommendation] = 0
        if recommendation in items_baselineonly:
            rank[recommendation] += 1
        if recommendation in items_knnbasic:
            rank[recommendation] += 1
        if recommendation in items_knnbaseline:
            rank[recommendation] += 1

    max_rank = max(rank, key=lambda s: rank[s])
    if rank[max_rank] == 1:  # no item was recommended by more than one model
        # print(items_baselineonly)
        return items_baselineonly
    else:
        result = nlargest(5, rank, key=lambda s: rank[s])
        # print(result)
        return result
Example #21
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader, rating_scale=(1, 5))

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
Example #22
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4, None)]
    assert ur[1] == [(0, 4, None), (1, 2, None)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4, None), (1, 4, None), (2, 1, None)]
    assert ir[1] == [(1, 2, None), (2, 1, None), (3, 5, None)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test user features
    u_features = trainset.u_features
    assert u_features[0] == []  # no u_features_df added
    assert u_features[1] == []  # no u_features_df added
    assert u_features[3] == []  # no u_features_df added
    assert u_features[40] == []  # not in trainset and no u_features_df
    assert trainset.user_features_labels == []
    assert trainset.n_user_features == 0

    # test item features
    i_features = trainset.i_features
    assert i_features[0] == []  # no i_features_df added
    assert i_features[1] == []  # no i_features_df added
    assert i_features[20000] == []  # not in trainset and no i_features_df
    assert trainset.item_features_labels == []
    assert trainset.n_item_features == 0

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [], [], 4) in testset
    assert ('user3', 'item1', [], [], 5) in testset
    assert ('user3', 'item1', [], [], 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [], [], trainset.global_mean) not in testset
    assert ('user3', 'item1', [], [], trainset.global_mean) not in testset
    assert ('user0', 'item1', [], [], trainset.global_mean) in testset
    assert ('user3', 'item0', [], [], trainset.global_mean) in testset
Example #23

################# Predictions using BaselineOnly
print('')
print('Making recommendations...')
print('')

algo4 = BaselineOnly()
algo4.fit(trainset)

predictions4 = algo4.test(testset)
dictMovies4 = get_top_n(predictions4)
topMovies4 = dictMovies4.get(672)

print('')
print('Here are the top 5 recommendations based on the Baseline algorithm!')

for i in range(5):

    movieRecc4 = topMovies4[i]
    movieRawID4 = movieRecc4[0]
    movieName4 = movie[movieRawID4]
    print(str(i + 1) + '. ' + movieName4)


###################### Predictions using Matrix-Factorization
from surprise import Dataset, Reader, BaselineOnly
import pandas as pd

train_rating_df = pd.read_csv("train_rating.txt", header=0, index_col=0)
test = pd.read_csv('test_rating.txt', header=0, index_col=0)
test['dummy_rating'] = -1  # numeric placeholder; the value itself is never used
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_rating_df[['user_id', 'business_id', 'rating']], reader)
trainset = data.build_full_trainset()
# Note: 'reg_u' and 'reg_i' apply only to ALS; SGD uses 'reg' and 'learning_rate'
bsl_options = {'method': 'sgd', 'reg': 0.08, 'n_epochs': 50,
               'learning_rate': 0.0035, 'reg_u': 0.07, 'reg_i': 0.2}
algo = BaselineOnly(bsl_options=bsl_options)
algo.fit(trainset)
testdata = Dataset.load_from_df(test[['user_id', 'business_id', 'dummy_rating']], reader)
predictions = algo.test(testdata.construct_testset(raw_testset=testdata.raw_ratings))
df = pd.DataFrame(predictions)
newdf = df['est']
newdf.rename('rating', inplace=True)
newdf.to_csv('submission.csv', header=True, index_label='test_id')
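An arguably clearer route for the scoring step above is to skip the dummy-rated testset and call predict() per row; a sketch, assuming the ids in test have the same type as those used at fit time:

test['rating'] = [algo.predict(u, b).est
                  for u, b in zip(test['user_id'], test['business_id'])]
test['rating'].to_csv('submission.csv', header=True, index_label='test_id')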
Example #25
#import pandas as pd

# Read the data
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
train_set = data.build_full_trainset()

# ALS optimization
#bsl_options = {'method': 'als','n_epochs': 5,'reg_u': 12,'reg_i': 5}
# SGD optimization
bsl_options = {'method': 'sgd', 'n_epochs': 5}
algo = BaselineOnly(bsl_options=bsl_options)
#algo = BaselineOnly()
#algo = NormalPredictor()

# Define a K-fold cross-validation iterator, K=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # Train and predict
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute RMSE
    accuracy.rmse(predictions, verbose=True)

uid = str(196)
iid = str(302)
# Print the prediction of uid for iid
pred = algo.predict(uid, iid, r_ui=4, verbose=True)
def make_predictions(user_id):
    performance = []
    algorithms = ['SVD', 'KNN', 'ALS']

    # First train an SVD algorithm on the movielens dataset.
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    algo_SVD = SVD()
    algo_SVD.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    # SVD algorithm
    testset = trainset.build_anti_testset()
    predictions_SVD = algo_SVD.test(testset)

    accuracy_SVD = accuracy.rmse(predictions_SVD)
    performance.append(accuracy_SVD)

    algo_KNN = KNNBasic()
    algo_KNN.fit(trainset)

    predictions_KNN = algo_KNN.test(testset)

    accuracy_KNN = accuracy.rmse(predictions_KNN)
    performance.append(accuracy_KNN)

    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    algo_ALS = BaselineOnly(bsl_options=bsl_options)
    algo_ALS.fit(trainset)

    predictions_ALS = algo_ALS.test(testset)

    accuracy_ALS = accuracy.rmse(predictions_ALS)
    performance.append(accuracy_ALS)

    # comparing algorithms by performance
    best_performance_index = performance.index(min(performance))
    best_algorithm = algorithms[best_performance_index]

    if best_algorithm == 'SVD':
        top_n = get_top_n(predictions_SVD, n=10)
    elif best_algorithm == 'KNN':
        top_n = get_top_n(predictions_KNN, n=10)
    elif best_algorithm == 'ALS':
        top_n = get_top_n(predictions_ALS, n=10)

    i_cols = [
        'movie_id', 'movie_title', 'release_date', 'video_release_date',
        'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
        'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
        'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
        'Thriller', 'War', 'Western'
    ]

    items = pd.read_csv('../../ml-100k/u.item',
                        sep='|',
                        names=i_cols,
                        encoding='latin-1')

    predictions = []
    # Print the recommended items for the user
    for uid, user_ratings in top_n.items():
        if int(uid) == int(user_id):
            # print(uid, [iid for (iid, _) in user_ratings])
            for (iid, _) in user_ratings:
                title = items[items['movie_id'] == int(iid) + 1]['movie_title']
                title_t = str(title)
                title_split = title_t.split()
                print(title_split)
                # print(title_split(1))
                # print(title_split(2))
                # print(title_t)
                predictions.append(title_t)

    return predictions
Example #27
avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
metrics = {'rmse': rmse,
           'avg_precision': avg_precision,
           'avg_recall': avg_recall}
results['NormalPredictor'] = metrics

top_n['NormalPredictor'] = get_top_n(norm_pred, n=10)


param_grid = {'bsl_options':{'method': ['als', 'sgd']}}
gs = GridSearchCV(BaselineOnly, param_grid, measures = ['rmse'], cv = 5)
gs.fit(data)
params = gs.best_params['rmse']
algo = BaselineOnly(bsl_options = params['bsl_options'])
algo.fit(trainset)
base_pred  = algo.test(testset)
rmse = accuracy.rmse(base_pred)
precisions, recalls = precision_recall_at_k(base_pred, k = 10, threshold = 4)
avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
metrics = {'rmse': rmse,
           'avg_precision': avg_precision,
           'avg_recall': avg_recall,
           'best_parameters': params}
results['BaselineOnly'] = metrics

top_n['BaselineOnly'] = get_top_n(base_pred, n=10)
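
# Note: Surprise's GridSearchCV also exposes the winning configuration
# directly, so the rebuild from best_params above can be shortened; an
# equivalent sketch:
best_algo = gs.best_estimator['rmse']  # BaselineOnly with the best options
best_algo.fit(trainset)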


clf = LinearRegression().fit(new_train_only_data, new_train_label)
y_pre = clf.predict(new_test_only_data)
linear_prediction = []
for i in range(len(y_pre)):
    all_info = [test_data_get[i][0]] + [test_data_get[i][1]] + [y_pre[i]]
    linear_prediction.append(all_info)
####################################surprise######################################
surprise_reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
surprise_train = Dataset.load_from_file(input_file, reader=surprise_reader)
surprise_train = surprise_train.build_full_trainset()
surprise_test_data = sc.parallelize(test_data_get).map(
    lambda s: (s[0], s[1], float(s[2]))).collect()
params = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
surprise_formula = BaselineOnly(bsl_options=params)
surprise_formula.fit(surprise_train)
surprise_predict = surprise_formula.test(surprise_test_data)
surprise_prediction = []
for i in range(len(surprise_predict)):
    surprise_prediction.append([
        surprise_predict[i][0], surprise_predict[i][1], surprise_predict[i][3]
    ])
################################SVD########################################
from surprise import SVD
svd_surprise = SVD(n_epochs=30, lr_all=0.008, reg_all=0.2)
svd_surprise.fit(surprise_train)
surprise_svd_prediction = svd_surprise.test(surprise_test_data)
svd_prediction = []
for i in range(len(surprise_svd_prediction)):
    svd_prediction.append([
        surprise_svd_prediction[i][0], surprise_svd_prediction[i][1],
        surprise_svd_prediction[i][3]
Example #29
def test_trainset_testset_ui_features():
    """Test the construct_trainset and construct_testset methods with user and
    item features."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    u_features_df = pd.DataFrame(
        {
            'urid': ['user0', 'user2', 'user3', 'user1'],
            'isMale': [False, True, False, True]
        },
        columns=['urid', 'isMale'])
    data = data.load_features_df(u_features_df, user_features=True)

    i_features_df = pd.DataFrame(
        {
            'irid': ['item0', 'item1'],
            'isNew': [False, True],
            'webRating': [4, 3],
            'isComedy': [True, False]
        },
        columns=['irid', 'isNew', 'webRating', 'isComedy'])
    data = data.load_features_df(i_features_df, user_features=False)

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4, None)]
    assert ur[1] == [(0, 4, None), (1, 2, None)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4, None), (1, 4, None), (2, 1, None)]
    assert ir[1] == [(1, 2, None), (2, 1, None), (3, 5, None)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test user features
    u_features = trainset.u_features
    assert u_features[0] == [False]
    assert u_features[40] == []  # not in trainset and u_features_df
    assert trainset.user_features_labels == ['isMale']
    assert trainset.n_user_features == 1

    # test item features
    i_features = trainset.i_features
    assert i_features[0] == [False, 4, True]
    assert i_features[20000] == []  # not in trainset and i_features_df
    assert trainset.item_features_labels == ['isNew', 'webRating', 'isComedy']
    assert trainset.n_item_features == 3

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [False], [False, 4, True], 4) in testset
    assert ('user2', 'item1', [True], [True, 3, False], 1) in testset
    assert ('user3', 'item1', [False], [True, 3, False], 5) in testset
    assert ('user3', 'item1', [False], [True, 3, False], 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert (('user0', 'item0', [False], [False, 4, True], trainset.global_mean)
            not in testset)
    assert (('user3', 'item1', [False], [True, 3, False], trainset.global_mean)
            not in testset)
    assert (('user0', 'item1', [False], [True, 3, False], trainset.global_mean)
            in testset)
    assert (('user3', 'item0', [False], [False, 4, True], trainset.global_mean)
            in testset)
Example #30
def collaborative_filtering(raw_uid):
    # To read the data from a txt file
    # TODO: To modify the file path of the data set
    # =============== Data preprocessing ===========================
    # Read all the data from the database and dump it to a file
    # dir_data = '/www/wwwroot/music_recommender/page/cf_recommendation/cf_data'
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # Database operations
    # Open the database connection
    db = pymysql.connect("localhost",
                         "root",
                         "password",
                         "music_recommender",
                         charset='utf8')

    # Create a cursor object using the cursor() method
    cursor = db.cursor()

    sql = """SELECT uid, song_id, rating
              FROM user_rating
               WHERE 1"""
    cursor.execute(sql)
    results = cursor.fetchall()
    with open(file_path, "w+") as data_f:
        # print(data_f)
        # exit()
        for result in results:
            uid, song_id, rating = result

            data_f.writelines("{}\t{}\t{}\n".format(uid, song_id, rating))

    if not os.path.exists(file_path):
        raise IOError("Dataset file is not exists!")
    # file_path = ""

    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)

    # Build the training set
    trainset = data.build_full_trainset()
    # print(trainset)
    # exit()
    # Baselineonly
    bsl_options = {
        'method': 'sgd',
        'learning_rate': 0.0005,
    }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)

    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=5)

    # KNNBasic
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)

    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)
    top_n_knnbasic = get_top_n(predictions, n=5)

    # KNNBaseline
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)

    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=5)
    # l = list(top_n_baselineonly)[0]
    # a = list()
    # a[l] = top_n_baselineonly[l]
    # print(a)
    # print(l,'predictions == ')
    print({raw_uid: top_n_baselineonly[raw_uid]})
    print({raw_uid: top_n_knnbasic[raw_uid]})
    print({raw_uid: top_n_knnbaseline[raw_uid]})