Example #1
from surprise import BaselineOnly, Dataset, Reader
from surprise.model_selection import cross_validate


class ALSModelSurprise(ALSModel):  # ALSModel is a project-specific base class, not shown here
    def __init__(self, params):
        super().__init__(params)
        self.algo = BaselineOnly(bsl_options=self.params)

    def parse_data(self, ratings):
        reader = Reader(rating_scale=(1, 5))
        self.data = Dataset.load_from_df(ratings, reader)

    def update_parameters(self):
        self.algo.bsl_options = self.params

    def fit(self):
        self.train = self.data.build_full_trainset()
        self.algo.fit(self.train)

    def predict(self, uid, iid):
        '''
        uid, iid should be consistent with ratings['UID', 'IID']
        '''
        return self.algo.predict(uid, iid).est

    def top_n_recommendations(self, uid, n=5):
        '''
        Obtain the top n recommendations for any user.
        Method for the surprise library.
        '''
        scores = []
        for i in range(self.train.n_items):
            iid = self.train.to_raw_iid(i)
            scores.append((iid, self.predict(uid, iid)))
        scores.sort(key=lambda x: x[1], reverse=True)
        top_n_iid = [s[0] for s in scores[:n]]
        pred = [s[1] for s in scores[:n]]
        return top_n_iid, pred

    def cross_validate(self, cv=5, verbose=False):
        cv_result = cross_validate(self.algo, self.data,
                                   cv=cv, verbose=verbose)
        rmse = cv_result['test_rmse'].mean()
        return rmse

    def grid_search(self):
        # dict() copies params so later set_params calls cannot overwrite the best set
        self._best_params = dict(self.params)
        self._best_rmse = self.cross_validate(cv=5)
        for n_epochs in [5, 10, 15, 20, 25]:
            for reg_u in [5, 10, 15, 20]:
                for reg_i in [5, 10, 15]:
                    self.set_params(n_epochs=n_epochs,
                                    reg_u=reg_u,
                                    reg_i=reg_i)
                    rmse = self.cross_validate(cv=5)
                    print(n_epochs, reg_u, reg_i, rmse)
                    if rmse < self._best_rmse:
                        self._best_rmse = rmse
                        self._best_params = dict(self.params)
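
# Usage sketch for ALSModelSurprise (not from the original source). It assumes
# the unshown ALSModel base class stores the bsl_options dict as self.params
# and that set_params updates that dict.
import pandas as pd

ratings = pd.DataFrame({'UID': [1, 1, 2, 2], 'IID': [10, 20, 10, 30],
                        'rating': [4.0, 3.0, 5.0, 2.0]})
model = ALSModelSurprise({'method': 'als', 'n_epochs': 10, 'reg_u': 15, 'reg_i': 10})
model.parse_data(ratings[['UID', 'IID', 'rating']])
model.fit()
print(model.predict(1, 10))                 # single estimated rating
print(model.top_n_recommendations(1, n=2))  # (item ids, estimates)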
Example #2
from surprise import BaselineOnly, accuracy
import numpy as np


def baseline_only(train, test, ids, Xtest, Xids):
    """
    Combines user and item mean with user and item biases
    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('Baseline Only')
    bsl_options = {
        'method': 'als',
        'n_epochs': 100,
        'reg_u': 15,
        'reg_i': 0.01
    }

    algo = BaselineOnly(bsl_options=bsl_options, verbose=False)

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.array([pred.est for pred in predictions])

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
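
# Usage sketch for baseline_only (not from the original source): it assumes a
# surprise Dataset named `data` built elsewhere, and that `ids` is a pair of
# equal-length lists of raw user and item ids, as the ids[0]/ids[1] indexing
# above implies.
from surprise.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.1)
ids = (['1', '2'], ['10', '20'])  # hypothetical unknown (user, item) pairs
Xtest, Xids = [], []
rmse, Xtest, Xids, preds_test, preds_ids = baseline_only(train, test, ids,
                                                         Xtest, Xids)
print('blending features collected:', len(Xtest), len(Xids))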
Example #3
from surprise import BaselineOnly, accuracy


class BaseLineRecommender(object):
    """
    Use surprise's BaselineOnly algorithm as the prediction baseline
    """
    def __init__(self):
        self.model = None

    def fit(self, train):
        """
        Fit the model
        """
        self.model = BaselineOnly(bsl_options={
            'method': 'sgd',
            'n_epochs': 30,
            'reg': 0.01,
            'learning_rate': 0.01
        })
        self.model.fit(train)

    def predict(self, user_id, item_id):
        """
        Predict ratings
        """
        return self.model.predict(user_id, item_id)

    def rmse(self, test):
        """
        Calculate RMSE for the predicted ratings
        """
        pred = self.model.test(test)
        return accuracy.rmse(pred)

    def mae(self, test):
        """
        Calculate MAE for the predicted ratings
        """
        pred = self.model.test(test)
        return accuracy.mae(pred)
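
# Usage sketch (not from the original source), assuming a surprise Dataset
# named `data` is available:
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, test_size=0.25)
rec = BaseLineRecommender()
rec.fit(trainset)
print(rec.predict('196', '302').est)  # predict returns a Prediction namedtuple
print(rec.rmse(testset))
print(rec.mae(testset))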
Example #4
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker
from surprise import BaselineOnly, Dataset, Reader

import config  # project-specific settings module providing DB_URI


def compute_recommendations(user_id, prediction_table,
                            numeric_prediction_table):

    algo = 'Baseline'

    algorithm = BaselineOnly()

    # add_pageview(user_id=user_id, item_id=None, page="Model Predictions", activity_type="Initialize Predictions - " + algo, rating=None) #pageview

    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))

    #reading in the database

    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    df_ratings2 = pd.read_csv('data/ratings.csv', low_memory=False)
    df_ratings2 = df_ratings2.rename(columns={'movie_id': 'item_id'})
    df_ratings2 = df_ratings2[['user_id', 'item_id', 'rating']]
    df_ratings2 = df_ratings2.dropna()
    df_ratings2 = df_ratings2.drop_duplicates()

    df_ratings = pd.concat([df_ratings, df_ratings2], axis=0)

    # line_format/sep are ignored by load_from_df; only the rating scale matters
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df_ratings, reader=reader)

    trainset = data.build_full_trainset()

    #     algorithm = eval(algo + "()")# set the algorithm...............................................

    algorithm.fit(trainset)  # fit() replaced the deprecated train() method

    items = pd.read_sql('SELECT distinct id FROM items;', con=engine)
    df_user_items = df_ratings.loc[df_ratings['user_id'] == user_id]
    total_items = items.id.unique()
    user_items = df_user_items.item_id.unique()
    # user_id = str(user_id)
    prediction_items = [x for x in total_items if x not in user_items]

    predictions = pd.DataFrame(columns=['user_id', 'item_id', 'prediction'])

    predicted_ratings = []

    for i in prediction_items:
        pred = algorithm.predict(user_id, i)
        predicted_ratings.append(pred.est)

    predictions['item_id'] = prediction_items
    predictions['user_id'] = pd.Series(
        [user_id for x in range(len(predictions.index))],
        index=predictions.index)

    predictions['prediction'] = predicted_ratings

    predictions = predictions.sort_values('prediction', ascending=False)
    test_prediction = predictions
    predictions = predictions.head(n=10)

    cols = [
        'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7',
        'pred_8', 'pred_9', 'pred_10'
    ]

    df_pred = predictions[['item_id']].T

    df_pred.columns = cols

    df_pred['id'] = user_id

    df_pred = df_pred[[
        'id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6',
        'pred_7', 'pred_8', 'pred_9', 'pred_10'
    ]]

    df_pred['id'] = df_pred['id'].astype(int)

    df_pred.to_sql(prediction_table, engine, if_exists='append',
                   index=False)  #if_exists='append'
    session.commit()

    df_num_ratings = test_prediction

    df_num_ratings = df_num_ratings.head(n=20)

    df_num_ratings['algorithm'] = algo
    df_num_ratings.rename(columns={'prediction': 'predicted_rating'},
                          inplace=True)

    df_num_ratings.to_sql('numeric_predictions',
                          engine,
                          if_exists='append',
                          index=False)  #if_exists='append'
    session.commit()

    predcols = [
        'num_1', 'num_2', 'num_3', 'num_4', 'num_5', 'num_6', 'num_7', 'num_8',
        'num_9', 'num_10'
    ]

    df_num_ratings_transpose = predictions[['prediction']].T
    df_num_ratings_transpose.columns = predcols

    df_num_ratings_transpose['id'] = user_id

    df_num_ratings_transpose = df_num_ratings_transpose[[
        'id', 'num_1', 'num_2', 'num_3', 'num_4', 'num_5', 'num_6', 'num_7',
        'num_8', 'num_9', 'num_10'
    ]]

    df_num_ratings_transpose['id'] = df_num_ratings_transpose['id'].astype(int)

    df_num_ratings_transpose.to_sql(numeric_prediction_table,
                                    engine,
                                    if_exists='append',
                                    index=False)  #if_exists='append'
    session.commit()
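
# Invocation sketch (not from the original source): the table names are
# hypothetical and must already match the pred_*/num_* layouts built above.
compute_recommendations(user_id=42,
                        prediction_table='predictions',
                        numeric_prediction_table='numeric_predictions_wide')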
Example #5
alg.fit(data_train.build_full_trainset())

end = time.time()
print("***********************************************")
print("Exe time:")
print(end - start)

# %% Loading Test Data
file_path = "Data/sample_submission.csv"
data_test = utils.load_data_desired(file_path)

# %% Prediction
Predict_Test = []

for line in data_test:
    Predict_Test.append(alg.predict(str(line[1]), str(line[0])).est)


# %% Save CV Details
file = open("Details.txt", "w")

file.write("+ Best Score: \n \n")
file.write(str(Train_CV.best_score) + "\n \n")
file.write("************************************************************ \n")
file.write("+ Best Param: \n \n")
file.write(str(Train_CV.best_params) + "\n \n")
file.write("************************************************************ \n")
file.write("+ CV Summary: \n \n")
file.write(str(Train_CV.cv_results) + "\n \n")
file.write("************************************************************ \n")
Example #6
from surprise import BaselineOnly, Dataset, Reader, accuracy
from surprise.model_selection import KFold

# Load the data
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
train_set = data.build_full_trainset()

# ALS options (commented out)
#bsl_options = {'method': 'als','n_epochs': 5,'reg_u': 12,'reg_i': 5}
# SGD options
bsl_options = {'method': 'sgd', 'n_epochs': 5}
algo = BaselineOnly(bsl_options=bsl_options)
#algo = BaselineOnly()
#algo = NormalPredictor()

# Define a K-fold cross-validation iterator, K=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # Train and predict
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute RMSE
    accuracy.rmse(predictions, verbose=True)

uid = str(196)
iid = str(302)
# Print the prediction of uid's rating for iid
pred = algo.predict(uid, iid, r_ui=4, verbose=True)
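
# The returned Prediction is a namedtuple; its fields can be read directly
# (this follows surprise's documented API):
print(pred.uid, pred.iid, pred.r_ui, pred.est, pred.details)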
Example #7
from surprise import Reader
from surprise import Dataset
from surprise import BaselineOnly
from surprise import accuracy
from surprise.model_selection import KFold

# Load the data
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
train_set = data.build_full_trainset()

# ALS baseline options; other methods (e.g. 'sgd') can be chosen instead
# Set the user and item regularization terms
bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
model = BaselineOnly(bsl_options=bsl_options)

# K-fold cross-validation
kf = KFold(n_splits=5)
for trainset, testset in kf.split(data):
    model.fit(trainset)
    pred = model.test(testset)
    # Compute RMSE
    accuracy.rmse(pred)

uid = str(300)
iid = str(180)

# Print the prediction of uid's rating for iid
pred = model.predict(uid, iid, r_ui=4, verbose=True)
Example #8
from surprise.model_selection import train_test_split
# Import the built-in MovieLens 100K dataset
data = Dataset.load_builtin('ml-100k')

# Specify the Baseline algorithm
algo = BaselineOnly()
# cv=4 splits the data into 4 folds, each used once as the test set
result = cross_validate(algo,
                        data,
                        measures=['RMSE', 'MAE'],
                        cv=4,
                        verbose=True)

# Build the full trainset and fit before predicting
trainset = data.build_full_trainset()
algo.fit(trainset)
pred = algo.predict('1', '2', r_ui=3,
                    verbose=True)  # user_id, item_id, true rating r_ui

# Load from a csv file
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('C:/RecoSys/Data/u.data',
                      names=r_cols,
                      sep='\t',
                      encoding='latin-1')
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)
result = cross_validate(algo,
                        data,
                        measures=['RMSE', 'MAE'],
                        cv=4,
                        verbose=True)
Example #9
start = time.time()
for line in data_test:
    Pred_Test_KNN.append(
        alg_KNN.predict(str(line[1]), str(line[0]), clip=False).est)

    Pred_Test_SVD.append(
        alg_SVD.predict(str(line[1]), str(line[0]), clip=False).est)

    Pred_Test_NMF.append(
        alg_NMF.predict(str(line[1]), str(line[0]), clip=False).est)

    Pred_Test_SL1.append(
        alg_SL1.predict(str(line[1]), str(line[0]), clip=False).est)

    Pred_Test_BSL.append(
        alg_BSL.predict(str(line[1]), str(line[0]), clip=False).est)

end = time.time()
print("***********************************************")
print("Exe time:")
print(end - start)

X_Test = np.matrix([
    Pred_Test_SVD, Pred_Test_NMF, Pred_Test_SL1, Pred_Test_KNN, Pred_Test_BSL
])
X_Test = X_Test.T

# %% Prior Based
X_Test = np.matrix(
    [Pred_Test_SVD, Pred_Test_NMF, Pred_Test_SL1, Pred_Test_KNN])
X_Test = X_Test.T
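
# Minimal blending sketch (not from the original source): X_Val and y_val are
# hypothetical held-out base-model predictions and true ratings; least-squares
# weights learned on them blend the test predictions stacked above.
import numpy as np

w, *_ = np.linalg.lstsq(np.asarray(X_Val), np.asarray(y_val), rcond=None)
blended = np.asarray(X_Test) @ w  # weighted combination of the base models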
Example #10
# Define a K-fold cross-validation iterator, K=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(suprise_data):
    # Train and predict
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute RMSE
    accuracy.rmse(predictions, verbose=True)

# Read and preprocess the ratings to be predicted
probe = pd.read_table('probe.txt', sep='\t', header=None)
processed_probe = process_probe(probe)

# Only part of the training data was loaded, so keep the (user_id, movie_id) pairs that appear in it
pre = pd.merge(data, processed_probe, how='inner', on=['user_id', 'movie_id'])
print('start predict')
############ Final result: 0.989714596450271 ################
count = 0
error = 0
for user, movie, rating in zip(pre['user_id'], pre['movie_id'], pre['rating']):
    count += 1
    rui_value = int(rating)
    prediction = algo.predict(str(user),
                              str(movie),
                              r_ui=rui_value,
                              verbose=True)
    error += np.square(prediction.est - rui_value)
print("RMSE:{}".format(np.sqrt(error / count)))
Example #11
print("Predicted Rating:")
pred[3]


# print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo_2 = BaselineOnly(bsl_options=bsl_options)

trainset = data.build_full_trainset()
algo_2.fit(trainset)  # fit() replaced the deprecated train() method

pred = algo_2.predict('374', '500')

print("Prediction Object:")
pred

print("Predicted Rating:")
pred[3]

#Predicting all missing entries
#First lets start by visualising our matrix of all observed entries.
#This matrix is quite sparse.
import numpy as np

n_users = trainset.n_users
n_items = trainset.n_items
Example #12
trainset = data.build_full_trainset()
del data

print(time.asctime(), 'training set built, now training')
# algo = SlopeOne()
#
#
# MODEL DEFINITION
algo = BaselineOnly(verbose=True)
#
#
#
algo.fit(trainset)

print(time.asctime(), 'training complete, now loading prediction data')
to_predict = pd.read_csv(file_path_test, delimiter=' ', header=None)
to_predict = to_predict.values.T[0:2].T
predicted = np.zeros(len(to_predict))

print(time.asctime(), 'prediction data loaded, now predicting')
for i in range(len(predicted)):
    user = to_predict[i][0]
    item = to_predict[i][1]
    predicted[i] = algo.predict(uid=user, iid=item, verbose=False).est
    if i % 500000 == 0:
        print(i, 'of', len(predicted), 'predicted')

print(time.asctime(), 'now saving predictions')
np.savetxt('../custom_data/' + title + '.dta', predicted, fmt='%.3f')

print(time.asctime(), 'done')
Example #13
relation_file = open("../relation.txt", "r")
relation_dict = {}
for line in relation_file:
    temp0 = line.split(":")
    relation_dict[temp0[0].strip()] = temp0[1].strip()

#print(relation_dict)
#print(relation_dict[str(2159)])

testset = open("../test1.csv", "r")
result = open("./result/result_BaselineOnly.txt", "w")
full_result = open("./full_result/result_BaselineOnly.txt", "w")

for line in testset:
    temp = line.split(",")
    pred = algo.predict(temp[0], temp[1], verbose=False)
    score = pred.est
    if temp[0] in relation_dict:
        # average the predictions for the user's related members
        sum_score = 0
        member_list = relation_dict[temp[0]].split(",")
        num = len(member_list)
        for i in range(num):
            sum_score = algo.predict(member_list[i], temp[1],
                                     verbose=False).est + sum_score
        final_score = round(0.9 * score + 0.1 * (sum_score / num))
    else:
        # no relations known for this user: fall back to the plain prediction
        final_score = round(score)
    result.write(str(final_score) + "\n")
    full_result.write(str(score) + "\n")

result.close()
full_result.close()
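
# For reference (an assumption inferred from the parsing above), relation.txt
# maps a user id to a comma-separated member list, one "id: members" per line:
#
#   2159: 101,202,303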