def use_als():
    start = time.time()
    performance = []

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using ALS')
    bsl_options = {'method': 'als', 'n_epochs': 20, 'reg_u': 12, 'reg_i': 5}
    algo_ALS = BaselineOnly(bsl_options=bsl_options)
    algo_ALS.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions_ALS = algo_ALS.test(testset)

    accuracy_rmse = accuracy.rmse(predictions_ALS)
    accuracy_mae = accuracy.mae(predictions_ALS)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)

    end = time.time()
    performance.append(end - start)

    return performance
def use_sgd():
    start = time.time()
    performance = []

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using SGD')
    bsl_options = {
        'method': 'sgd',
        'learning_rate': .005,
    }

    algo_SGD = BaselineOnly(bsl_options=bsl_options)
    algo_SGD.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions_SGD = algo_SGD.test(testset)

    accuracy_rmse = accuracy.rmse(predictions_SGD)
    accuracy_mae = accuracy.mae(predictions_SGD)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)

    end = time.time()
    performance.append(end - start)

    return performance
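A minimal driver for the two helpers above could look like the sketch below; it assumes both functions live in the same module, with `time` and the `surprise` imports (`Dataset`, `BaselineOnly`, `accuracy`) already in place, and simply compares the returned [rmse, mae, seconds] lists.

if __name__ == '__main__':
    # Run both baseline variants and print their [rmse, mae, seconds] results side by side.
    for label, helper in (('ALS', use_als), ('SGD', use_sgd)):
        rmse, mae, seconds = helper()
        print('{}: RMSE={:.4f}  MAE={:.4f}  time={:.1f}s'.format(label, rmse, mae, seconds))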
Example No. 3
def test_dump():
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct ones, and
    that the predictions of the dumped algorithm are also equal to the other
    ones."""

    random.seed(0)

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))
    pkf = PredefinedKFold()

    trainset, testset = next(pkf.split(data))

    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)

        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_dumped
        assert predictions == predictions_algo_dumped
Example No. 4
def surprise_baseline(train_file, test_file):
    """
    Baseline with the Surprise library.
    Compute predictions on a test set after training on a train set, using Surprise's BaselineOnly algorithm.
    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        -
    Returns:
        numpy array: predictions
    """
    print("baseline")
    algo = BaselineOnly()
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        val = predictions[i].est
        pred[i] = val
    return pred
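A possible invocation of this helper, as a sketch: the CSV paths are made up, and the files are assumed to hold comma-separated user,item,rating rows with no header line, matching the Reader configured above.

train_path = 'data/train_ratings.csv'   # hypothetical path
test_path = 'data/test_ratings.csv'     # hypothetical path
baseline_preds = surprise_baseline(train_path, test_path)
print(baseline_preds[:10])              # first ten predicted ratings as a numpy array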
def normalize_affinity_scores_by_user_item_bs(user_item_affinities: List[Tuple[str, str, float]], rating_scale=(1, 5)) \
        -> Tuple[float, Dict[str, float], Dict[str, float], float, List[Tuple[str, str, float]]]:
    train = pd.DataFrame(user_item_affinities)
    reader = Reader(rating_scale=rating_scale)
    trainset = Dataset.load_from_df(train, reader).build_full_trainset()
    trainset_for_testing = trainset.build_testset()
    algo = BaselineOnly(bsl_options={'method': 'sgd'})
    algo.fit(trainset)
    predictions = algo.test(trainset_for_testing)
    mean = algo.trainset.global_mean
    bu = {
        u: algo.bu[algo.trainset.to_inner_uid(u)]
        for u in set([u for u, i, r in user_item_affinities])
    }
    bi = {
        i: algo.bi[algo.trainset.to_inner_iid(i)]
        for i in set([i for u, i, r in user_item_affinities])
    }
    uid = [[p.uid, p.iid, p.r_ui - p.est] for p in predictions]
    estimates = [p.est for p in predictions]
    estimates_2 = [
        p.r_ui - (mean + bu[p.uid] + bi[p.iid]) for p in predictions
    ]
    uid = pd.DataFrame(uid, columns=["user", "item", "rating"])
    spread = max(uid["rating"].max(), np.abs(uid["rating"].min()))
    uid = list(zip(uid['user'], uid['item'], uid['rating']))
    bu = defaultdict(float, bu)
    bi = defaultdict(float, bi)
    # assert estimates == estimates_2
    return mean, bu, bi, spread, uid
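A toy call to the normalizer above might look like this sketch; the affinity triples are invented for illustration and stay inside the default (1, 5) rating scale.

affinities = [('u1', 'i1', 4.0), ('u1', 'i2', 2.0),
              ('u2', 'i1', 5.0), ('u2', 'i3', 3.0)]   # made-up (user, item, rating) triples
mean, bu, bi, spread, residuals = normalize_affinity_scores_by_user_item_bs(affinities)
print(mean, spread)                  # global mean and largest absolute residual
print(bu['u1'], bi['i1'])            # learned user / item biases
print(residuals[:2])                 # (user, item, rating - baseline estimate) triples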
Example No. 6
def baseline_bias_model(df):
    """
        Shows the performance of model based on just bias
    """
    ratings_pandas_df = df.drop(columns=['date', 'text'])
    #    ratings_pandas_df.columns = ['user_id', 'business_id', 'rating']

    reader = Reader(rating_scale=(1, 5))  #TODO figure out

    data = surprise.dataset.Dataset.load_from_df(df=ratings_pandas_df,
                                                 reader=reader)

    ts = data.build_full_trainset()
    dusers = ts._raw2inner_id_users
    ditems = ts._raw2inner_id_items

    trainset, testset = train_test_split(data)

    algo = BaselineOnly()
    algo.fit(trainset)

    # testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    print('\n')
    return (trainset, testset, predictions, dusers, ditems)
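A hedged follow-up on the function above: scoring the returned predictions with Surprise's accuracy helpers, assuming `from surprise import accuracy` is available and that `reviews_df` (not defined in this snippet) is a reviews DataFrame carrying 'date' and 'text' columns alongside the user, business and rating columns.

# reviews_df is a hypothetical DataFrame with user, business, rating, date and text columns.
trainset, testset, predictions, dusers, ditems = baseline_bias_model(reviews_df)
accuracy.rmse(predictions)   # RMSE of the bias-only model on the held-out split
accuracy.mae(predictions)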
Example No. 7
def baseline(trainset, testset):
    algo = BaselineOnly()
    algo.fit(trainset)
    print("Predictions")
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    return predictions
Example No. 8
    class ALSModelSurprise(ALSModel):
        def __init__(self, params):
            super().__init__(params)
            self.algo = BaselineOnly(bsl_options=self.params)

        def parse_data(self, ratings):
            reader = Reader(rating_scale=(1, 5))
            self.data = Dataset.load_from_df(ratings, reader)

        def update_parameters(self):
            self.algo.bsl_options = self.params

        def fit(self):
            self.train = self.data.build_full_trainset()
            self.algo.fit(self.train)

        def predict(self, uid, iid):
            '''
            uid, iid should be consistent with ratings['UID','IID']
            '''
            return self.algo.predict(uid, iid).est

        def top_n_recommendations(self, uid, n=5):
            '''
            Obtain the top n recommendation for any user.
            Method for the surprise library
            '''
            scores = []
            for i in range(self.train.n_items):
                iid = self.train.to_raw_iid(i)
                scores.append((iid, self.predict(uid, iid)))
            scores.sort(key=lambda x: x[1], reverse=True)
            top_n_iid = [l[0] for l in scores[:n]]
            pred = [l[1] for l in scores[:n]]
            return top_n_iid, pred

        def cross_validate(self, cv=5, verbose=False):
            cv_result = cross_validate(self.algo, self.data,
                                       cv=cv, verbose=verbose)
            rmse = cv_result['test_rmse'].mean()
            return rmse

        def grid_search(self):
            self._best_params = self.params
            self._best_rmse = self.cross_validate(cv=5)
            for n_epochs in [5, 10, 15, 20, 25]:
                for reg_u in [5, 10, 15, 20]:
                    for reg_i in [5, 10, 15]:
                        self.set_params(n_epochs=n_epochs,
                                        reg_u=reg_u,
                                        reg_i=reg_i)
                        rmse = self.cross_validate(cv=5)
                        print(n_epochs, reg_u, reg_i, rmse)
                        if (rmse < self._best_rmse):
                            self._best_rmse = rmse
                            self._best_params = self.params
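A usage sketch for ALSModelSurprise, under the assumption that the parent ALSModel (not shown here) simply stores the passed dict as `self.params`, and that `ratings_df` is a hypothetical DataFrame with user, item and rating columns.

params = {'method': 'als', 'n_epochs': 20, 'reg_u': 12, 'reg_i': 5}
model = ALSModelSurprise(params)            # assumes ALSModel.__init__ keeps params as self.params
model.parse_data(ratings_df)                # ratings_df: hypothetical [user, item, rating] DataFrame
model.fit()
print(model.predict('user_1', 'item_9'))    # single estimated rating
top_items, top_scores = model.top_n_recommendations('user_1', n=5)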
Example No. 9
def baseline(trainset, testset):

    print("\n" + "-" * 5 + " Baseline algorithm using surprise package " +
          "-" * 5)
    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
Example No. 10
    def fit(self, train):
        """
        Fit the model
        """
        baselineOnly = BaselineOnly(bsl_options={
            'method': 'als',
            'n_epochs': 25,
            'reg_u': 5,
            'reg_i': 3
        })
        baselineOnly.fit(train)
        self.model = baselineOnly
def predict(trainset):
    print("Training the model for prediction .....")
    # predict ratings for all pairs (u, i) that are NOT in the training set.
    algo = BaselineOnly(bsl_options=bsl_options)
    testset = trainset.build_anti_testset()
    predictions = algo.fit(trainset).test(testset)
    return predictions
def predict(path):
    # read the data and build the full trainset
    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    data = Dataset.load_from_file("{}".format(path), reader=reader)
    all_train = data.build_full_trainset()
    bsl = BaselineOnly()
    svd = SVD()
    bsl.fit(all_train)
    svd.fit(all_train)
    all_test = all_train.build_anti_testset()
    bsl_predictions = bsl.test(all_test)
    bsl_pred = get_top_n(bsl_predictions, 100)
    svd_predictions = svd.test(all_test)
    svd_pred = get_top_n(svd_predictions, 100)
    with open("baseline_predictions.pickle", "wb") as f:
        pickle.dump([bsl_pred, svd_pred], f, protocol=2)
    print("Done recommending using baseline model and SVD model.")
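To check the dump written above, the two top-N structures can be loaded back; this is just a sketch of the round trip, assuming `pickle` is already imported as in the function body.

with open("baseline_predictions.pickle", "rb") as f:
    bsl_top_n, svd_top_n = pickle.load(f)
print(len(bsl_top_n), "users with baseline recommendations")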
def baseline_only(train, test, ids, Xtest, Xids):
    """
    Combines user and item mean with user and item biases
    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('Baseline Only')
    bsl_options = {
        'method': 'als',
        'n_epochs': 100,
        'reg_u': 15,
        'reg_i': 0.01
    }

    algo = BaselineOnly(bsl_options=bsl_options, verbose=False)

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
Example No. 14
def test_dump(u1_ml100k):
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct ones, and
    that the predictions of the dumped algorithm are also equal to the other
    ones."""

    random.seed(0)

    trainset, testset = next(PredefinedKFold().split(u1_ml100k))

    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)

        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_dumped
        assert predictions == predictions_algo_dumped
Example No. 15
class BaseLineRecommender(object):
    """
    Use surprise's baselineonly algorithm as the baseline of prediction
    """
    def __init__(self):
        self.model = None

    def fit(self, train):
        """
        Fit the model
        """
        self.model = BaselineOnly(bsl_options={
            'method': 'sgd',
            'n_epochs': 30,
            'reg': 0.01,
            'learning_rate': 0.01
        })
        self.model.fit(train)

    def predict(self, user_id, item_id):
        """
        Predict ratings
        """
        return self.model.predict(user_id, item_id)

    def rmse(self, test):
        """
        Calculate RMSE for the predicted ratings
        """
        pred = self.model.test(test)
        return accuracy.rmse(pred)

    def mae(self, test):
        """
        Calculate MAE for the predicted ratings
        """
        pred = self.model.test(test)
        return accuracy.mae(pred)
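A usage sketch for BaseLineRecommender, assuming the MovieLens data is loaded through Surprise's builtin loader and split with train_test_split.

from surprise import Dataset
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=0.2)

rec = BaseLineRecommender()
rec.fit(trainset)
rec.rmse(testset)                     # prints and returns the RMSE
rec.mae(testset)                      # prints and returns the MAE
print(rec.predict('196', '302').est)  # estimated rating for one (user, item) pair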
def train(trainset, testset):
    """
    Train the recommender model using the BaselineOnly algorithm, which estimates each rating from
    the global mean plus user and recipe biases (optimized here with ALS via bsl_options)
    :param trainset: the train set from which the model learns the global mean and the user/recipe biases
    :param testset: the test set on which the trained model is evaluated
    :return: a list of predictions for all (user, recipe) pairs in the test set
    """
    print("Training the model for prediction ....")
    # BaselineOnly algorithm gave us the best rmse,
    # therefore, we will train and predict with BaselineOnly and use Alternating Least Squares (ALS).
    algo = BaselineOnly(bsl_options=bsl_options)
    predictions = algo.fit(trainset).test(testset)
    return predictions
Example No. 17
    def baseline_only(self):
        """
        Basic baseline prediction using the global mean and user-item biases.
        Returns:
            predictions_df: The predictions of the model on the test data in
                Pandas DataFrame format
        """
        algorithm = BaselineOnly()
        predictions = algorithm.fit(self.train_data).test(self.test_data)
        predictions_df = self.data.test_df.copy()
        predictions_df['Rating'] = [x.est for x in predictions]
        if self.test_purpose:
            self.evalueate_model(predictions_df['Rating'], 'Surprise baseline_only')
        return predictions_df
Example No. 18
def als_predictions(trainset, dataset_test):
    algo = BaselineOnly(bsl_options={
        'method': 'als',
        'n_epochs': 30,
        'reg_u': 6,
        'reg_i': 4
    })
    predictions = algo.fit(trainset)
    list_1 = []
    for x in dataset_test:
        i = predictions.predict(
            x[0], x[1]) if mode == 'test' else predictions.predict(
                x[0], x[1], x[2])
        list_1.append((i[0], i[1], i[2], i[3]))
    return list_1
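A wrapper sketch around the helper above; it assumes the global `mode` flag it reads is set (e.g. to 'test'), that pandas is imported as pd, and that `trainset` is a built Surprise trainset while `dataset_test` is a list of (user, item) or (user, item, rating) tuples.

mode = 'test'                               # hypothetical flag read inside als_predictions
preds = als_predictions(trainset, dataset_test)
preds_df = pd.DataFrame(preds, columns=['user_id', 'item_id', 'true_rating', 'est_rating'])
print(preds_df.head())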
Example No. 19
         Train_CV.cv_results['mean_test_rmse'], '.k')
plt.xscale('log')
plt.xlabel(r'Regularization Parameter ($\lambda$) - bu')
plt.ylabel('RMSE')
plt.grid()
plt.title(r'3-Fold CV - Regularization Parameter ($\lambda$) - bu')
plt.savefig('3_fold_CV_Reg_Param_Baseline_bu.png')


# %% Best Hyper-parameters Training
alg = BaselineOnly(
    bsl_options=Grid_Search_Result.best_params['rmse']['bsl_options'])

start = time.time()

alg.fit(data_train.build_full_trainset())

end = time.time()
print("***********************************************")
print("Exe time:")
print(end - start)

# %% Loading Test Data
file_path = "Data/sample_submission.csv"
data_test = utils.load_data_desired(file_path)

# %% Prediction
Predict_Test = []

for line in data_test:
    Predict_Test.append(alg.predict(str(line[1]), str(line[0])).est)
Example No. 20
if __name__ == "__main__":
    # Read data
    df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'ratings.csv'))

    # Drop unneeded column 'timestamp'
    df.drop('timestamp', axis=1, inplace=True)

    # Load the data into the surprise format
    reader = Reader()
    data = Dataset.load_from_df(df, reader=reader)

    # Train ALS model
    print('Using ALS')
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    trainset, testset = train_test_split(data, test_size=0.25)
    algo = BaselineOnly(bsl_options=bsl_options)
    predictions = algo.fit(trainset).test(testset)

    # Get the RMSE of our predictions
    rmse = accuracy.rmse(predictions)

    # Get the cross-validated RMSE of our predictions
    cv_results = cross_validate(algo, data)
    cv_rmse = cv_results['test_rmse'].mean()
    print(f'CV RMSE: {cv_rmse}')

    # Get true values and predicted values for our test set
    y_true = [x.r_ui for x in predictions]
    y_pred = [x.est for x in predictions]
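    # A simple sanity-check sketch: mean absolute error computed directly from the
    # y_true / y_pred lists built above, alongside the RMSE reported by Surprise.
    mae = sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)
    print(f'Test MAE: {mae:.4f}')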
Example No. 21
avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
metrics = {'rmse': rmse,
           'avg_precision': avg_precision,
           'avg_recall': avg_recall}
results['NormalPredictor'] = metrics

top_n['NormalPredictor'] = get_top_n(norm_pred, n=10)


param_grid = {'bsl_options': {'method': ['als', 'sgd']}}
gs = GridSearchCV(BaselineOnly, param_grid, measures=['rmse'], cv=5)
gs.fit(data)
params = gs.best_params['rmse']
algo = BaselineOnly(bsl_options=params['bsl_options'])
algo.fit(trainset)
base_pred = algo.test(testset)
rmse = accuracy.rmse(base_pred)
precisions, recalls = precision_recall_at_k(base_pred, k=10, threshold=4)
avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
metrics = {'rmse': rmse,
           'avg_precision': avg_precision,
           'avg_recall': avg_recall,
           'best_parameters': params}
results['BaselineOnly'] = metrics

top_n['BaselineOnly'] = get_top_n(base_pred, n=10)
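The same GridSearchCV pattern can be pushed further; the sketch below widens the grid over the ALS baseline options (the value ranges are arbitrary choices for illustration, not tuned results).

# Hypothetical wider grid over the ALS baseline options, reusing `data` from above.
param_grid = {'bsl_options': {'method': ['als'],
                              'n_epochs': [5, 10, 20],
                              'reg_u': [10, 15],
                              'reg_i': [5, 10]}}
gs = GridSearchCV(BaselineOnly, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)
print(gs.best_score['rmse'], gs.best_params['rmse'])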


Example No. 22
dataset_test = dataset_test.apply(tuple, axis=1).tolist()
trainset = trainset.build_full_trainset()
"""
Hyper-parameter notes:
    'method': 'als', 'n_epochs': 92, 'reg_u': 6, 'reg_i': 4
    n_epochs=75, lr_all=0.005, reg_all=0.3; 72,0.005,0.23,145; (55,0.005,0.21,105); (35,0.004,0.18,73)
"""

bsl_options = {'method': 'als', 'n_epochs': 94, 'reg_u': 6, 'reg_i': 4}
algo1 = BaselineOnly(bsl_options=bsl_options)
predictions1 = algo1.fit(trainset)
predictions1 = predictions1.test(dataset_test)
predList1 = []
for i in predictions1:
    predList1.append((i[0], i[1], i[2], i[3]))
resdf1 = pd.DataFrame(
    list(predList1), columns=['user_id', 'business_id', 'stars', 'est_stars1'])

algo2 = SVD(n_factors=105, n_epochs=55, lr_all=0.005, reg_all=0.22)
predictions2 = algo2.fit(trainset)
end = time.time()
predictions2 = predictions2.test(dataset_test)
predList2 = []
for i in predictions2:
    predList2.append((i[0], i[1], i[2], i[3]))
resdf2 = pd.DataFrame(
Example No. 23
for trainset, testset in kf.split(data):   
    # Train and predict
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute the RMSE
    accuracy.rmse(predictions, verbose=True)   #RMSE: 0.8653

# Predict the rating for a specified user and item
uid = str(196) 
iid = str(302) 
pred = algo.predict(uid, iid, r_ui=4, verbose=True)


"""方法2:使用baseline算法"""
# Baseline算法,使用ALS进行优化,迭代次数5,reg_u为user正则化系数为12,reg_i为item正则化系数为5
bsl_options = {'method': 'als','n_epochs': 5,'reg_u': 12,'reg_i': 5}
algo2 = BaselineOnly(bsl_options = bsl_options)

kf = KFold(n_splits=5)
for trainset, testset in kf.split(data):
    # Train and predict
    algo2.fit(trainset)
    predictions = algo2.test(testset)
    # Compute the RMSE
    accuracy.rmse(predictions, verbose=True)
    
uid = str(196)
iid = str(302)
# Output the predicted rating of uid for iid
pred = algo2.predict(uid, iid, r_ui=4, verbose=True)
Example No. 24
    knnmean_results = []
    nmf_results = []

    reader = Reader(rating_scale=(0, np.inf))
    data = Dataset.load_from_df(
        usergroups_df[["user_id", "item_id", "rating"]], reader)
    folds_it = KFold(n_splits=5).split(data)
    i = 1
    pl_fit = []
    for trainset, testset in folds_it:
        print("Fold: %d" % i)
        i += 1

        print("Baseline")
        baseline = BaselineOnly()
        baseline.fit(trainset)
        baseline_predictions = baseline.test(testset)
        results = get_group_measures(preds_all=baseline_predictions,
                                     U1=U1_users,
                                     U2=U2_users,
                                     U3=U3_users,
                                     U4=U4_users)
        baseline_results.append(results)

        print("KNN")
        knn = KNNBasic(sim_options={"name": "pearson"})
        #knn = KNNBasic(sim_options={"name": "cosine"})
        knn.fit(trainset)
        knn_predictions = knn.test(testset)
        results = get_group_measures(preds_all=knn_predictions,
                                     U1=U1_users,
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly
from surprise import accuracy
from surprise.model_selection import KFold

# Read the data
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
train_set = data.build_full_trainset()

# Optimize the baselines with ALS (the 'sgd' method could be used instead)
# Set the regularization terms for users and items
bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
model = BaselineOnly(bsl_options=bsl_options)

# k-fold cross-validation
kf = KFold(n_splits=5)
for trainset, testset in kf.split(data):
    model.fit(trainset)
    pred = model.test(testset)
    # Compute RMSE
    accuracy.rmse(pred)

uid = str(300)
iid = str(180)

# Output the predicted rating of uid for iid
pred = model.predict(uid, iid, r_ui=4, verbose=True)
"""

from surprise import Dataset, Reader, BaselineOnly, KNNBasic, NormalPredictor, accuracy
from surprise.model_selection import KFold

# 1. Read the data
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('data/ratings.csv', reader=reader)

# 2. Choose the model
# ALS optimization
# bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
# SGD optimization
bsl_options = {'method': 'sgd', 'n_epochs': 5}
algo = BaselineOnly(bsl_options=bsl_options)

# 3. Define a K-fold cross-validation iterator with K=3
kf = KFold(n_splits=3)
for train_set, test_set in kf.split(data):
    # 4. Train and predict
    algo.fit(train_set)
    predictions = algo.test(test_set)
    # 5. Compute RMSE
    accuracy.rmse(predictions, verbose=True)

uid = str(196)
iid = str(302)

# Output the predicted rating of uid for iid
pred = algo.predict(uid, iid, r_ui=4, verbose=True)
Example No. 27
def collaborative_filtering(raw_uid):
    # To read the data from a txt file
    # =============== Data preprocessing ===========================
    # Read all ratings from the database and write them to a file
    # dir_data = '/www/wwwroot/music_recommender/page/cf_recommendation/cf_data'
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # Database operations
    # Open the database connection
    db = pymysql.connect("localhost",
                         "root",
                         "password",
                         "music_recommender",
                         charset='utf8')

    # Create a cursor object using the cursor() method
    cursor = db.cursor()
    songData = defaultdict(list)
    sql = """SELECT uid, song_id, rating
              FROM user_rating
               WHERE 1"""
    cursor.execute(sql)
    results = cursor.fetchall()
    with open(file_path, "w+") as data_f:
        a = 0
        for result in results:
            uid, song_id, rating = result
            if song_id in songData:
                songData[song_id].append(rating)
            else:
                songData[song_id] = [rating]
            
            data_f.writelines("{}\t{}\t{}\n".format(uid, song_id, rating))
            a += 1
  
    if not os.path.exists(file_path):
        raise IOError("Dataset file does not exist!")

    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    # Build the training set
    trainset = data.build_full_trainset()
  
    bsl_options = {'method': 'sgd',
                   'learning_rate': 0.0005,
                   }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)  # train the model

    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=10)
    # print(predictions)
    # uid      raw user id
    # iid      raw item id
    # r_ui     the true rating (float)
    # est      the estimated rating (float)
    # details  other details related to the prediction
    # print(top_n_baselineonly, 'top_n_baselineonly')
    

    # KNNBasic
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)

    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
  
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)

    top_n_knnbasic = get_top_n(predictions, n=1000)
    # print(predictions, 'top_n_knnbasic')
    # KNNBaseline
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)

    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=1000)

    evaluationMSEResult = evaluationMSE([top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline], raw_uid)

    recommendset = set()
    for results in [top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline]:
        for key in results.keys():
            for recommendations in results[key]:
                iid, rating, true_score = recommendations
                recommendset.add(iid)

    items_baselineonly = set()
    for key in top_n_baselineonly.keys():
        for recommendations in top_n_baselineonly[key]:
            iid, rating, true_score = recommendations
            items_baselineonly.add(iid)

    items_knnbasic = set()
    for key in top_n_knnbasic.keys():
        for recommendations in top_n_knnbasic[key]:
            iid, rating, true_score = recommendations
            items_knnbasic.add(iid)

    items_knnbaseline = set()
    for key in top_n_knnbaseline.keys():
        for recommendations in top_n_knnbaseline[key]:
            iid, rating, true_score = recommendations
            items_knnbaseline.add(iid)

    rank = dict()
    for recommendation in recommendset:
        if recommendation not in rank:
            rank[recommendation] = 0
        if recommendation in items_baselineonly:
            rank[recommendation] += 1
        if recommendation in items_knnbasic:
            rank[recommendation] += 1
        if recommendation in items_knnbaseline:
            rank[recommendation] += 1

    max_rank = max(rank, key=lambda s: rank[s])
    evaluationMSEResult1 = {}
    if rank[max_rank] == 1:  # even the top-ranked item was suggested by only one recommender
        return items_baselineonly
    else:
        resultAll = dict()
        result = nlargest(10, rank, key=lambda s: rank[s])
        for k in result:
            resultAll[k] = rank[k]
        # print("Ranking result: {}".format(resultAll))
        evaluation(songData, resultAll)
        for key in evaluationMSEResult:
            if key in resultAll:
                evaluationMSEResult1[key] = evaluationMSEResult[key]
        print(evaluationMSEResult1, 'evaluationMSEResult1==')  # final evaluation
        return resultAll
def make_predictions(user_id):
    performance = []
    algorithms = ['SVD', 'KNN', 'ALS']

    # First train an SVD algorithm on the movielens dataset.
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    algo_SVD = SVD()
    algo_SVD.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    # SVD algorithm
    testset = trainset.build_anti_testset()
    predictions_SVD = algo_SVD.test(testset)

    accuracy_SVD = accuracy.rmse(predictions_SVD)
    performance.append(accuracy_SVD)

    algo_KNN = KNNBasic()
    algo_KNN.fit(trainset)

    predictions_KNN = algo_KNN.test(testset)

    accuracy_KNN = accuracy.rmse(predictions_KNN)
    performance.append(accuracy_KNN)

    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    algo_ALS = BaselineOnly(bsl_options=bsl_options)
    algo_ALS.fit(trainset)

    predictions_ALS = algo_ALS.test(testset)

    accuracy_ALS = accuracy.rmse(predictions_ALS)
    performance.append(accuracy_ALS)

    # comparing algorithms by performance
    best_performance_index = performance.index(min(performance))
    best_algorithm = algorithms[best_performance_index]

    if best_algorithm == 'SVD':
        top_n = get_top_n(predictions_SVD, n=10)
    elif best_algorithm == 'KNN':
        top_n = get_top_n(predictions_KNN, n=10)
    elif best_algorithm == 'ALS':
        top_n = get_top_n(predictions_ALS, n=10)

    i_cols = [
        'movie_id', 'movie_title', 'release_date', 'video_release_date',
        'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
        'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
        'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
        'Thriller', 'War', 'Western'
    ]

    items = pd.read_csv('../../ml-100k/u.item',
                        sep='|',
                        names=i_cols,
                        encoding='latin-1')

    predictions = []
    # Print the recommended items for the user
    for uid, user_ratings in top_n.items():
        if int(uid) == int(user_id):
            # print(uid, [iid for (iid, _) in user_ratings])
            for (iid, _) in user_ratings:
                title = items[items['movie_id'] == int(iid) + 1]['movie_title']
                title_t = str(title)
                title_split = title_t.split()
                print(title_split)
                # print(title_split(1))
                # print(title_split(2))
                # print(title_t)
                predictions.append(title_t)

    return predictions
Example No. 29
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4, None)]
    assert ur[1] == [(0, 4, None), (1, 2, None)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4, None), (1, 4, None), (2, 1, None)]
    assert ir[1] == [(1, 2, None), (2, 1, None), (3, 5, None)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test user features
    u_features = trainset.u_features
    assert u_features[0] == []  # no u_features_df added
    assert u_features[1] == []  # no u_features_df added
    assert u_features[3] == []  # no u_features_df added
    assert u_features[40] == []  # not in trainset and no u_features_df
    assert trainset.user_features_labels == []
    assert trainset.n_user_features == 0

    # test item features
    i_features = trainset.i_features
    assert i_features[0] == []  # no i_features_df added
    assert i_features[1] == []  # no i_features_df added
    assert i_features[20000] == []  # not in trainset and no i_features_df
    assert trainset.item_features_labels == []
    assert trainset.n_item_features == 0

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [], [], 4) in testset
    assert ('user3', 'item1', [], [], 5) in testset
    assert ('user3', 'item1', [], [], 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [], [], trainset.global_mean) not in testset
    assert ('user3', 'item1', [], [], trainset.global_mean) not in testset
    assert ('user0', 'item1', [], [], trainset.global_mean) in testset
    assert ('user3', 'item0', [], [], trainset.global_mean) in testset
Example No. 30
def test_trainset_testset_ui_features():
    """Test the construct_trainset and construct_testset methods with user and
    item features."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    u_features_df = pd.DataFrame(
        {
            'urid': ['user0', 'user2', 'user3', 'user1'],
            'isMale': [False, True, False, True]
        },
        columns=['urid', 'isMale'])
    data = data.load_features_df(u_features_df, user_features=True)

    i_features_df = pd.DataFrame(
        {
            'irid': ['item0', 'item1'],
            'isNew': [False, True],
            'webRating': [4, 3],
            'isComedy': [True, False]
        },
        columns=['irid', 'isNew', 'webRating', 'isComedy'])
    data = data.load_features_df(i_features_df, user_features=False)

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4, None)]
    assert ur[1] == [(0, 4, None), (1, 2, None)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4, None), (1, 4, None), (2, 1, None)]
    assert ir[1] == [(1, 2, None), (2, 1, None), (3, 5, None)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test user features
    u_features = trainset.u_features
    assert u_features[0] == [False]
    assert u_features[40] == []  # not in the trainset nor in u_features_df
    assert trainset.user_features_labels == ['isMale']
    assert trainset.n_user_features == 1

    # test item features
    i_features = trainset.i_features
    assert i_features[0] == [False, 4, True]
    assert i_features[20000] == []  # not in the trainset nor in i_features_df
    assert trainset.item_features_labels == ['isNew', 'webRating', 'isComedy']
    assert trainset.n_item_features == 3

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [False], [False, 4, True], 4) in testset
    assert ('user2', 'item1', [True], [True, 3, False], 1) in testset
    assert ('user3', 'item1', [False], [True, 3, False], 5) in testset
    assert ('user3', 'item1', [False], [True, 3, False], 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert (('user0', 'item0', [False], [False, 4, True], trainset.global_mean)
            not in testset)
    assert (('user3', 'item1', [False], [True, 3, False], trainset.global_mean)
            not in testset)
    assert (('user0', 'item1', [False], [True, 3, False], trainset.global_mean)
            in testset)
    assert (('user3', 'item0', [False], [False, 4, True], trainset.global_mean)
            in testset)
Example No. 31
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader, rating_scale=(1, 5))

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset