def final_model(data):
    """Train the repeat-customer SVD++ recommender and pickle it.

    Args:
        data -- a dataframe containing user id, item id, and ratings columns
            in that order.

    The fitted algorithm and its hold-out predictions are written to the file
    'repeat_customer_model' with surprise's ``dump`` helper. Nothing is
    returned.
    """
    # Hyper-parameters previously selected with GridSearchCV.
    best_params = {'n_factors': 10, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}

    # Convert the dataframe to a surprise dataset and hold out 20 % of it.
    ratings = surprise_df(data)
    train_part, holdout_part = train_test_split(ratings, test_size=0.2,
                                                random_state=19)

    recommender = SVDpp(**best_params)
    recommender.fit(train_part)
    holdout_predictions = recommender.test(holdout_part)

    # The surprise wrapper pickles the predictions and the model together.
    dump.dump('repeat_customer_model', predictions=holdout_predictions,
              algo=recommender, verbose=0)
def trainSVD_surprise3D(training_data, colorlabels, plot=True, savefig="figures/"):
    """Fit SVD++ and optionally plot the first three user factors in 3-D.

    Args:
        training_data: surprise trainset handed to ``SVDpp.fit``.
        colorlabels: per-point values forwarded to ``scatter(c=...)``.
        plot: when true, build and show the 3-D scatter figure.
        savefig: path prefix; when truthy (and ``plot`` is true) the figure is
            saved to ``savefig + "svd_counties_3D"``.

    Returns:
        The fitted user-factor matrix ``algo.pu`` (previously nothing was
        returned, unlike the 2-D sibling ``trainSVD_surprise``).
    """
    algo = SVDpp(n_factors=10, n_epochs=1000)
    algo.fit(training_data)
    U = algo.pu

    if plot:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(111, projection='3d')
        ax.set_xlabel('First', fontsize=15)
        ax.set_ylabel('Second', fontsize=15)
        # The third plotted factor had no axis label in the original.
        ax.set_zlabel('Third', fontsize=15)
        ax.set_title('Reduced SVD', fontsize=20)
        # Colours can encode features such as demographics or age.
        scatter = ax.scatter(U[:, 0], U[:, 1], U[:, 2],
                             c=colorlabels, s=10, alpha=0.7)
        ax.grid()
        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label("state")
        if savefig:
            plt.savefig(savefig + "svd_counties_3D")
        plt.show()

    return U
def svdPP(data):
    """Fit a tuned and a default SVD++ model and estimate the test ratings.

    Reads the module-level names ``model_params``, ``x_test``, ``y_test``,
    ``testlen``, ``avg_rat`` and ``cold_itm``.

    Args:
        data: surprise dataset; ``build_full_trainset()`` is called on it
            once per model.

    Returns:
        ``[res, res1, svdModel, svdModel_1]``: the estimates of the tuned
        model, the estimates of the default model, and the two fitted models.
        For test rows whose (zero-based) item index is in ``cold_itm`` both
        estimates are replaced by ``avg_rat`` of the (zero-based) user index.
    """
    print("\nTraining SVDPP model..\n")
    tuned = model_params[1]
    n_epochs, lr_all, reg_all = tuned['n_epochs'], tuned['lr_all'], tuned['reg_all']

    tuned_model = SVDpp(n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all)
    tuned_model.fit(data.build_full_trainset())
    print("\nTraining done..\nPrediction started..")

    # (user, item, true rating) triples for the first `testlen` test rows.
    held_out = []
    for pos in range(testlen):
        pair = x_test[pos]
        held_out.append((pair[0], pair[1], y_test[pos]))
    tuned_preds = tuned_model.test(held_out)

    default_model = SVDpp()
    default_model.fit(data.build_full_trainset())
    default_preds = default_model.test(held_out)

    # Raw ids are turned into zero-based indices by subtracting one.
    user_idx = [int(p[0]) - 1 for p in tuned_preds]
    item_idx = [int(p[1]) - 1 for p in tuned_preds]
    tuned_est = [p[3] for p in tuned_preds]
    default_est = [p[3] for p in default_preds]

    for pos in range(testlen):
        if item_idx[pos] in cold_itm:
            tuned_est[pos] = avg_rat[user_idx[pos]]
            default_est[pos] = avg_rat[user_idx[pos]]

    print("\nPrediction done..\n")
    return [tuned_est, default_est, tuned_model, default_model]
def SVDPP(PointFrame, RecommendNum=10, TypeNum=5):
    """Recommend funds per user with one SVD++ model per user type.

    Args:
        PointFrame: rating frame with a ``Type`` column plus ``User`` and
            ``FundCode`` columns inside the slice used for training.
        RecommendNum: number of funds recommended to each user.
        TypeNum: number of user types; types ``0 .. TypeNum - 1`` are each
            modelled separately (the original hard-coded 5 and ignored this
            parameter).

    Returns:
        DataFrame with columns ``User`` and ``RecommendFundCode``, holding the
        highest-estimated funds for every user, best first.
    """
    OutUserList = []
    OutFundList = []

    for TypeIndex in range(TypeNum):
        # `.ix` was removed in pandas 1.0; boolean selection is `.loc`.
        Frame = PointFrame.loc[PointFrame.Type == TypeIndex]
        if Frame.empty:
            # No users of this type, so there is nothing to fit or recommend.
            continue
        # Label-based column slice; presumably user, item and rating columns
        # in that order -- TODO confirm the end label.
        Frame = Frame.loc[:, 'User':'******']
        UserList = Frame.User.unique()
        FundList = Frame.FundCode.unique()

        reader = Reader(rating_scale=(0, 2))
        data = Dataset.load_from_df(Frame, reader=reader).build_full_trainset()

        # The fourth group (Type == 3) uses a smaller latent dimension, as in
        # the original 1-based `UserType == 4` check.
        model = SVDpp(n_factors=5) if TypeIndex + 1 == 4 else SVDpp()
        model.fit(data)

        for User in UserList:
            UserPointList = [model.predict(User, Fund).est for Fund in FundList]
            RecommendList = np.argsort(UserPointList)[::-1][0:RecommendNum]
            for FundIndex in RecommendList:
                OutUserList.append(User)
                OutFundList.append(FundList[FundIndex])

    return pd.DataFrame({
        "User": OutUserList,
        "RecommendFundCode": OutFundList
    })
class RecommenderSVDpp(Recommender):
    """SVD++ recommender that refits on the data set extended with the new user."""

    def __init__(self, recommendation_dataset: RecommendationDataSet):
        super().__init__(recommendation_dataset.movies)
        self.recommendation_dataset = recommendation_dataset
        self.algorithm = SVDpp()

    def fit(self, dataset):
        """Fit the wrapped SVD++ algorithm and return the result of ``fit``."""
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        """Return the wrapped algorithm's predictions for ``test_set``."""
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20):
        """Recommend up to ``k`` movies the user has not watched.

        Args:
            watched: mapping from raw movie id to the value given by the user.
            k: maximum number of recommendations.

        Returns:
            Whatever ``self.movies.get_movie_by_movie_ids`` returns for the
            selected movie ids.
        """
        extended = self.recommendation_dataset.get_dataset_with_extended_user(watched)
        new_user_id, full_dataset = extended
        # The original looked the inner user id up without using the result;
        # the call is kept so a missing user still fails at the same point.
        full_dataset.to_inner_uid(new_user_id)

        # The model has to be retrained on the data set containing the new user.
        self.algorithm.fit(full_dataset)

        seen_inner_ids = {full_dataset.to_inner_iid(raw_id)
                          for raw_id, _ in watched.items()}

        candidate_predictions = []
        for inner_iid in range(0, full_dataset.n_items):
            if inner_iid in seen_inner_ids:
                continue
            raw_iid = full_dataset.to_raw_iid(inner_iid)
            candidate_predictions.append(
                self.algorithm.predict(new_user_id, raw_iid))

        ranked = get_top_n(candidate_predictions, n=k, minimum_rating=1.0)[new_user_id]
        top_ids = [entry[0] for entry in ranked]
        return self.movies.get_movie_by_movie_ids(top_ids)
def time_location_model(df):
    """Fit SVD++ on ``df`` after dropping the other rating columns.

    The rating scale is taken from the min/max of ``date_dist_rating``; the
    columns ``rating``, ``dist_rating`` and ``date_rating`` are dropped before
    the frame is loaded into surprise.

    Returns:
        ``(trainset, testset, predictions, dusers, ditems)`` where ``dusers``
        and ``ditems`` are the raw-to-inner id maps of the *full* trainset.

    NOTE(review): the id maps come from ``build_full_trainset()`` while the
    returned ``trainset`` comes from a separate ``train_test_split`` -- the
    inner ids of the two may not line up; confirm against callers.
    """
    rating_min = df['date_dist_rating'].min()
    rating_max = df['date_dist_rating'].max()
    reader = Reader(rating_scale=(rating_min, rating_max))  # TODO figure out

    remaining = df.drop(columns=["rating", "dist_rating", "date_rating"])
    data = surprise.dataset.Dataset.load_from_df(df=remaining, reader=reader)

    full_trainset = data.build_full_trainset()
    dusers = full_trainset._raw2inner_id_users
    ditems = full_trainset._raw2inner_id_items

    trainset, testset = train_test_split(data)
    model = SVDpp()
    model.fit(trainset)
    predictions = model.test(testset)
    print('\n')
    return (trainset, testset, predictions, dusers, ditems)
def top_ten_df(df):
    '''
    Build a frame with the ten best predictions for every user.

    inputs: df (Pandas DF) the dataframe to train on.
            NOTE: use f.df_samp_unique_vals() to get a smaller DF if there is
            not enough memory to run the full DF.
    outputs: (Pandas DF) the result of ``f.get_top_n(predictions, n=10)``
            wrapped in a DataFrame.
    '''
    dataset = f.read_data_surprise(df)

    # Train SVD++ on the entire data set.
    full_trainset = dataset.build_full_trainset()
    model = SVDpp()
    model.fit(full_trainset)

    # Predict every (user, item) pair that is NOT in the training set.
    # Building the anti-testset is the most memory-hungry step.
    unseen_pairs = full_trainset.build_anti_testset()
    unseen_predictions = model.test(unseen_pairs)

    # Dictionary of top predictions -> DataFrame.
    return pd.DataFrame(f.get_top_n(unseen_predictions, n=10))
def SVDpp_calculation(data, trainset, testset, time, cv):
    """Time a default SVD++ fit/test followed by an RMSE cross-validation.

    Args:
        data: surprise dataset passed to ``cross_validate``.
        trainset: trainset the model is fitted on before cross-validating.
        testset: testset predicted once; the predictions are discarded.
        time: object exposing a ``time()`` callable (the code calls
            ``time.time()`` on this argument).
        cv: forwarded to ``cross_validate``.

    Returns:
        ``(elapsed, cv_results)``: elapsed time covering the fit, the test and
        the cross-validation, and the dict returned by ``cross_validate``.
    """
    started_at = time.time()

    model = SVDpp()
    model.fit(trainset)
    # The result is unused, but the call is part of the measured work.
    model.test(testset)
    cv_results = cross_validate(model, data, measures=['RMSE'], cv=cv,
                                verbose=True)

    elapsed = time.time() - started_at
    return elapsed, cv_results
def svdpp(trainset, testset):
    """Fit a default SVD++ model and score it on ``testset``.

    Returns:
        ``(rmse, mae, predictions)`` as computed by ``accuracy.rmse`` and
        ``accuracy.mae`` on the test predictions.
    """
    # Matrix factorization - SVD++
    rule = "-" * 5
    print("\n" + rule + " SVD++ algorithm using surprise package " + rule)

    model = SVDpp()
    model.fit(trainset)
    predictions = model.test(testset)
    return accuracy.rmse(predictions), accuracy.mae(predictions), predictions
class TrainModel:
    """Thin wrapper around an SVD++ model for ratings on a 0-1 scale."""

    def __init__(self, lr_all=0.006, n_epochs=40):
        self.algo = SVDpp(lr_all=lr_all, n_epochs=n_epochs)
        self.reader = Reader(rating_scale=(0, 1))
        self.filename = 'trained_model.pkl'

    def read_from_df(self, dataframe, user_col, item_col, rating_col):
        """Return the full surprise trainset built from the three columns."""
        data = Dataset.load_from_df(
            dataframe[[user_col, item_col, rating_col]], self.reader)
        return data.build_full_trainset()

    def train_mod(self, dataframe, user_col, item_col, rating_col):
        """Fit the model on every rating in ``dataframe``."""
        self.algo.fit(
            self.read_from_df(dataframe, user_col, item_col, rating_col))

    def dump_model(self, predictions):
        """Pickle the model and ``predictions`` to ``self.filename``."""
        return dump.dump(self.filename, algo=self.algo,
                         predictions=predictions)

    def load_model(self):
        """Load and return the ``(predictions, algo)`` pair from ``self.filename``."""
        predictions, loaded_ent = dump.load(self.filename)
        return predictions, loaded_ent

    def get_user_pred(self, user_id, dataframe, user_col, item_col,
                      rating_col, n=2):
        """Predict all unrated pairs and return them with the user's top ``n``.

        Returns:
            ``(predictions, top_n)`` where ``predictions`` covers the whole
            anti-testset and ``top_n`` is a list of ``(item id, estimate)``
            tuples for ``user_id``, best first.
        """
        # Reuse the shared helpers instead of duplicating the data-set
        # construction and the ranking logic.
        trainset = self.read_from_df(dataframe, user_col, item_col, rating_col)
        predictions = self.algo.test(trainset.build_anti_testset())
        return predictions, self.get_user_pred_stable(user_id, predictions, n)

    def get_user_pred_stable(self, user_id, predictions, n=2):
        """Return the top ``n`` ``(item id, estimate)`` pairs for ``user_id``.

        Args:
            user_id: raw user id to filter on.
            predictions: iterable of 5-tuples ``(uid, iid, r_ui, est, details)``.
            n: maximum number of pairs returned.
        """
        estimates = {}
        for uid, iid, _, est, _ in predictions:
            if uid == user_id:
                # A repeated item keeps its last estimate.
                estimates[iid] = est
        ranked = sorted(estimates.items(), key=lambda kv: kv[1], reverse=True)
        return ranked[:n]
class SvdPP(RecommenderBase):
    """
    SVDpp algorithm. Actually working badly, just a draft
    """

    def __init__(self, URM):
        print('train set built')

    def fit(self, urm, n_factors=20, n_epochs=20, lr_all=0.007, reg_all=0.02,
            init_mean=0, init_std_dev=0.1, verbose=True):
        """Fit SVD++ on the non-zero entries of ``urm``, each rated 1.

        Args:
            urm: matrix exposing ``nonzero()``; rows are used as user ids and
                columns as item ids.
            n_factors, n_epochs, lr_all, reg_all, init_mean, init_std_dev,
            verbose: forwarded to ``SVDpp``. ``reg_all`` was previously
                accepted but never passed on.
        """
        # Build a (user, item, 1) triple for every non-zero cell.
        rows, cols = urm.nonzero()
        ones = np.ones(len(rows), dtype=np.int32)
        triples = np.vstack((rows, cols, ones)).transpose()
        df = pd.DataFrame(triples)
        df.columns = ['userID', 'itemID', 'rating']

        reader = Reader()
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
        self.trainset = data.build_full_trainset()

        self.algo = SVDpp(n_factors=n_factors, n_epochs=n_epochs,
                          lr_all=lr_all, reg_all=reg_all,
                          init_mean=init_mean, init_std_dev=init_std_dev,
                          verbose=verbose)
        self.algo.fit(self.trainset)

    def recommend(self, userid, N=10, urm=None, filter_already_liked=True,
                  with_scores=True, items_to_exclude=()):
        """Return ``[userid, rec_1, ..., rec_N]`` for the best-scoring tracks.

        Args:
            userid: user to recommend for.
            N: number of recommendations (previously ignored; 10 was
                hard-coded). Capped at the number of tracks.
            urm: matrix exposing ``getrow``; required when
                ``filter_already_liked`` is true.
            filter_already_liked: zero the score of items in the user's row.
            with_scores: emit ``(item, score)`` tuples instead of bare items.
            items_to_exclude: not implemented.

        Raises:
            NotImplementedError: more than one item to exclude was given.
            ValueError: filtering was requested without a ``urm``.
        """
        # NOTE(review): a single excluded item is silently ignored here;
        # the threshold is kept as in the original -- confirm intent.
        if len(items_to_exclude) > 1:
            raise NotImplementedError('Items to exclude functionality is not implemented yet')

        # One estimate per track, built in a single pass instead of repeated
        # array concatenation.
        scores = np.array(
            [self.algo.predict(userid, track)[3] for track in range(d.N_TRACKS)],
            dtype=float)

        if filter_already_liked:
            # `== None` on a sparse matrix is not a plain boolean test.
            if urm is None:
                raise ValueError('Please provide a URM in order to items already liked')
            scores[urm.getrow(userid).nonzero()[1]] = 0

        result = [userid]
        top_k = min(N, scores.size)
        if top_k <= 0:
            return result

        # Select the top-k indices, then order them best first (argpartition
        # alone gives no ordering guarantee).
        best = np.argpartition(scores, -top_k)[-top_k:]
        best = best[np.argsort(-scores[best])]
        for item in best:
            result.append((item, scores[item]) if with_scores else item)
        return result
def create_model(self):
    """Train SVD++ on the first million ratings and persist it with joblib.

    Reads ``self.get_ratings()``, keeps the first 1,000,000 rows, fills
    missing values with 0 and uses the ``userId``, ``id`` and ``rating``
    columns. The fitted model is written to a fixed path.
    """
    n = 1000000
    raw_data = self.get_ratings()[:n].fillna(0)[["userId", "id", "rating"]]
    reader = Reader()
    data = Dataset.load_from_df(raw_data, reader)

    # The former `data.split(n_folds=5)` call was dropped: the method no
    # longer exists in current Surprise releases, and the folds were never
    # used because the model is trained on the full trainset.
    svdpp = SVDpp()
    svdpp.fit(data.build_full_trainset())

    filename = "C:/datasets/the-movies-dataset/models/collaborative_based/coll_svdpp.sav"
    joblib.dump(svdpp, filename)
def svdpp(dataset):
    """Run 5-fold SVD++ cross-validation, printing RMSE and elapsed minutes.

    Returns:
        The RMSE of the last fold.

    NOTE(review): only the last fold's RMSE is returned, not the mean over
    folds -- confirm this is what callers expect.
    """
    started = time.time()
    model = SVDpp()
    splitter = KFold(n_splits=5)
    for train_fold, test_fold in splitter.split(dataset):
        model.fit(train_fold)
        fold_predictions = model.test(test_fold)
        acc = accuracy.rmse(fold_predictions, verbose=True)
    finished = time.time()
    print('svdpp花分钟数为:', (finished - started) / 60)
    return acc
def model(train_set, test_set):
    """Fit SVD++ with fixed hyper-parameters and score it on ``test_set``.

    Returns:
        ``(predictions, rmse)`` for the test set.
    """
    hyper = dict(n_factors=3, n_epochs=50, lr_all=0.01, reg_all=0.1)
    estimator = SVDpp(**hyper)
    estimator.fit(train_set)
    test_predictions = estimator.test(test_set)
    return test_predictions, accuracy.rmse(test_predictions, verbose=False)
def _plot_svd_scatter(xs, ys, colorlabels, savefig, file_stem, alpha=None):
    """Draw one 2-D 'Reduced SVD' scatter, optionally save it, then show it."""
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('First', fontsize=15)
    ax.set_ylabel('Second', fontsize=15)
    ax.set_title('Reduced SVD', fontsize=20)
    # Colours can encode features such as demographics or age.
    scatter = ax.scatter(xs, ys, c=colorlabels, s=10, alpha=alpha)
    ax.grid()
    cbar = fig.colorbar(scatter, ax=ax)
    cbar.set_label("state")
    if savefig:
        plt.savefig(savefig + file_stem)
    plt.show()


def trainSVD_surprise(training_data, colorlabels, plot=True, simplify=False,
                      savefig="figures/"):
    """Fit a 3-factor SVD++ model and optionally plot the user factors.

    Args:
        training_data: surprise trainset handed to ``SVDpp.fit``.
        colorlabels: per-point values forwarded to ``scatter(c=...)``.
        plot: when true, show a scatter of the first two factors (and of the
            projected factors when ``simplify`` is true).
        simplify: when true, project the factors onto their two leading left
            singular vectors and rescale each dimension to unit std.
        savefig: path prefix for the saved figures; falsy disables saving.

    Returns:
        ``algo.pu`` when ``simplify`` is false, otherwise the 2-row projected
        and rescaled matrix.
    """
    algo = SVDpp(n_factors=3, n_epochs=1000)
    algo.fit(training_data)
    U = algo.pu

    if plot:
        _plot_svd_scatter(U[:, 0], U[:, 1], colorlabels, savefig,
                          "svd_counties", alpha=0.7)

    if not simplify:
        return U

    factors = U.transpose()
    # Only the left singular vectors are needed; the reduced decomposition
    # avoids building the n_users x n_users right-hand matrix.
    left_vectors = np.linalg.svd(factors, full_matrices=False)[0]
    U_proj = np.dot(left_vectors[:, :2].transpose(), factors)
    # Rescale dimensions
    U_proj /= U_proj.std(axis=1).reshape(2, 1)

    if plot:
        _plot_svd_scatter(U_proj[0], U_proj[1], colorlabels, savefig,
                          "svd_counties_simplfied")
    return U_proj
def predict():
    """Fit SVD++ on the default ratings and store the top 10 in ``top_n``.

    Ratings come from ``get_default_ratings()``, are split 70/30, and the
    result of ``get_top_n(predictions, n=10)`` on the test predictions is
    written to the module-level ``top_n``.
    """
    global top_n
    print("--predict start--------------------------------")

    # Data set import
    ratings_frame = pd.DataFrame(get_default_ratings())
    dataset = Dataset.load_from_df(df=ratings_frame,
                                   reader=Reader(rating_scale=(0, 5)))
    train_part, test_part = train_test_split(dataset, test_size=0.3)

    fitted = SVDpp().fit(train_part)
    test_predictions = fitted.test(test_part)

    top_n = get_top_n(test_predictions, n=10)
    print("--predict end--------------------------------")
def fit_model(mlr_df):
    """Fit SVD++ on the ``userId``, ``id`` and ``rating`` columns of ``mlr_df``.

    Returns:
        The value returned by ``SVDpp.fit`` on the full trainset.
    """
    # The reader only carries the rating scale used to parse the frame.
    rating_reader = Reader(rating_scale=(1, 5))
    ratings = mlr_df[['userId', 'id', 'rating']]
    full_trainset = Dataset.load_from_df(ratings, rating_reader).build_full_trainset()
    return SVDpp().fit(full_trainset)
def run_colab_filter(self, user_id=106540415):
    """Fit SVD++ on ``self.df20`` and store the user's top 20 unrated routes.

    Args:
        user_id: user to recommend for. Defaults to the id that used to be
            hard-coded (a mixed/alpine, advanced climber).

    Side effects:
        Dumps the model and predictions to 'SVD_tuned.p' and sets
        ``self.df_top_climbs_mf`` to a frame with the columns
        'predicted rating' and 'route id', best first.
    """
    # A reader is still needed, but only the rating_scale param is required.
    reader = Reader(rating_scale=(1, 4))
    # The columns must be user id, item id and rating, in that order.
    data = Dataset.load_from_df(
        self.df20[['user_id', 'route_id', 'rating']], reader)
    trainset = data.build_full_trainset()

    algo_tuned = SVDpp(n_factors=20)
    algo_tuned.fit(trainset)

    # Candidates are every route the user has NOT rated. The original read
    # the 'user_id' column here, so no rated route was ever excluded.
    all_route_ids = self.df20['route_id'].unique()
    rated_route_ids = self.df20.loc[self.df20['user_id'] == user_id, 'route_id']
    iids_to_pred = np.setdiff1d(all_route_ids, rated_route_ids)

    # The third element is a placeholder true rating; only `est` is read.
    testset = [[user_id, route_id, 2] for route_id in iids_to_pred]
    predictions_tuned = algo_tuned.test(testset)
    dump.dump(file_name='SVD_tuned.p', predictions=predictions_tuned,
              algo=algo_tuned)

    pred_ratings_tuned = np.array([pred.est for pred in predictions_tuned])
    # A descending sort with a slice also works with fewer than 20 candidates.
    i_max = np.argsort(-pred_ratings_tuned)[:20]

    # Top 20 recommended climbs: ratings as index, then flattened to columns.
    top_climbs = pd.DataFrame(iids_to_pred[i_max],
                              pred_ratings_tuned[i_max]).reset_index()
    top_climbs.columns = ['predicted rating', 'route id']
    self.df_top_climbs_mf = top_climbs
def SVD_pp():
    """Cross-validate SVD++ on the module-level ``data`` and print timings.

    Prints the RMSE of each of the 3 folds, one verbose sample prediction,
    and the seconds elapsed since the module-level ``time1``.
    """
    model = SVDpp()
    # K-fold cross-validation iterator, k=3
    folds = KFold(n_splits=3)
    for train_fold, test_fold in folds.split(data):
        # Train and predict
        model.fit(train_fold)
        fold_predictions = model.test(test_fold)
        # Compute RMSE; verbose=True prints the computed value
        accuracy.rmse(fold_predictions, verbose=True)

    uid, iid = str(196), str(302)
    # Print the prediction of uid for iid
    model.predict(uid, iid, r_ui=4, verbose=True)

    time2 = time.time()
    print(time2 - time1)
def train_collaborative_explicit(user_id):
    """Train an SVD++ model on one customer's explicit feedback and dump it.

    Loads the customer's explicit documents from the ``collaboratives``
    collection, fits SVD++ (5 epochs) on them, dumps the model under
    ``models/<user_id>_collaborative_explicit`` and publishes a completion
    message on ``STATUS_QUEUE``.

    Args:
        user_id: customer id as a string (converted with ``ObjectId``).

    Raises:
        Exception: "Unable to connect mongo" for any failure in the body. The
            original error is chained as ``__cause__`` instead of being
            discarded, and ``KeyboardInterrupt``/``SystemExit`` are no longer
            swallowed.
    """
    try:
        db = getDb()
        # NOTE(review): truthiness test kept from the original; if getDb()
        # returns a pymongo Database an explicit `is None` check may be
        # needed -- confirm.
        if not db:
            raise Exception("Database not found")

        print("### Start training collaborative explicit")
        records = list(db.collaboratives.find({
            "customer": ObjectId(user_id),
            "explicit": True
        }))
        ratings = pd.DataFrame(records)[['userId', 'itemId', 'feedBack']]
        ratings = ratings.rename(columns={
            'userId': 'user',
            'itemId': 'item',
            'feedBack': 'rating'
        })

        # The rating scale is derived from the observed feedback range.
        reader = Reader(rating_scale=(ratings['rating'].min(),
                                      ratings['rating'].max()))
        dataset = Dataset.load_from_df(ratings[["user", "item", "rating"]],
                                       reader)

        svdpp = SVDpp(verbose=True, n_epochs=5)
        svdpp.fit(dataset.build_full_trainset())

        file_name = os.path.expanduser('models/' + user_id +
                                       '_collaborative_explicit')
        dump.dump(file_name, algo=svdpp)
        print("### Training collaborative explicit complete")

        channel.basic_publish(
            '', STATUS_QUEUE,
            'complete|' + user_id + '|collaborative_explicit')
        print(" [x] Sent to {0}: complete_{1}".format(STATUS_QUEUE, user_id))
    except Exception as err:
        raise Exception("Unable to connect mongo") from err
def surprise_SVDpp(train_file, test_file):
    """
    Svd++ with Surprise library.

    Compute the predictions on a test_set after training on a train_set using
    the method Svd++ from Surprise.

    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file

    Hyperparameters:
        n_factors : The number of factors.
        n_epochs : The number of iteration of the SGD procedure
        lr_'x': The learning rate for 'x'
        reg_'x' : The regularization term for 'x'
        'x':
            bi : The item biases
            bu : The user biases
            qi : The item factors
            yj : The (implicit) item factors
            pu : The user factors

    Returns:
        numpy array: predictions (the estimated rating of every test row;
        empty if the fold iterator yields nothing)
    """
    print("SVDpp")
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()

    # Algorithm
    algo = SVDpp(n_epochs=40, n_factors=100, lr_all=0.01, reg_all=0.01)

    predictions = []
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)
        # Predict
        predictions = algo.test(testset)

    return np.array([prediction.est for prediction in predictions],
                    dtype=float)
def evaluate_on_test(self, train_set, test_set):
    """
    Fit a default SVD++ model on the train set and score it on the test set.

    :param train_set: trainset to fit on
    :param test_set: testset to predict
    :return: RMSE value on the test set, or None when either set is None
    """
    if train_set is None or test_set is None:
        return None

    print("Evaluate RMSE on test data")
    self.LOG_HANDLE.info("Evaluate RMSE on test data")

    # http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp
    model = SVDpp()
    # Train on the trainset, then predict ratings for the testset.
    model.fit(train_set)
    test_predictions = model.test(test_set)
    # Then compute RMSE
    return accuracy.rmse(test_predictions)
def train_best_model_generate_ratings_test(self, ratings_set, test_set):
    """
    Train SVD++ with ``model_params.svdpp_best_params`` on the complete
    ratings set and predict the test set.

    :param ratings_set: The complete ratings data set
    :param test_set: The streams for the users for which ratings are not yet available
    :return: The predictions returned by ``algo.test(test_set)``, or None when
        either argument is falsy
    """
    if not (ratings_set and test_set):
        return None

    message = "Training the best model and generating the ratings for the test data set"
    print(message)
    self.LOG_HANDLE.info(message)

    best_model = SVDpp(**model_params.svdpp_best_params)
    best_model.fit(ratings_set)
    return best_model.test(test_set)
def svdpp(train, test, ids, Xtest, Xids):
    """
    Extension of svd taking the implicit ratings into account.

    Arguments:
        train, the trainset
        test, the testset
        ids, unknown ratings: ids[0] holds the user ids and ids[1] the item ids
        Xtest, list the test-set estimates are appended to (for final blending)
        Xids, list the unknown-rating estimates are appended to (for final blending)

    Returns:
        (rmse, Xtest, Xids, preds_test, preds_ids) where rmse is the test RMSE.
    """
    print('SVD++')
    model = SVDpp(n_factors=100, n_epochs=10, lr_all=0.0015, reg_all=0.05,
                  random_state=15)

    # Train algorithm on training set
    model.fit(train)

    # Predict on train and compute RMSE
    train_predictions = model.test(train.build_testset())
    print(' Training RMSE: ', accuracy.rmse(train_predictions, verbose=False))

    # Predict on test and compute RMSE
    test_predictions = model.test(test)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print(' Test RMSE: ', rmse)

    preds_test = np.array([p.est for p in test_predictions], dtype=float)

    # Predict unknown ratings
    preds_ids = [
        model.predict(str(ids[0][row]), str(ids[1][row])).est
        for row in range(len(ids[0]))
    ]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
class SurpriseRecommender(Recommender):
    """Recommender backed by surprise's SVD++ on a 0-1 rating scale."""

    name = 'surprise-svdpp'

    def train(self, data):
        """Fit SVD++ on ``data``.

        Column 0 of ``data`` is used as the user id, column 1 as the item id
        and column 2 as the rating.
        """
        frame = pd.DataFrame({
            'itemID': data[:, 1],
            'userID': data[:, 0],
            'rating': data[:, 2],
        })
        ordered = frame[['userID', 'itemID', 'rating']]
        trainset = Dataset.load_from_df(
            ordered, Reader(rating_scale=(0, 1))).build_full_trainset()

        self.algo = SVDpp(verbose=True)
        self.algo.fit(trainset)

    def rate(self, user, movie):
        """Return the estimated rating of ``movie`` for ``user``."""
        # The third element is a placeholder true rating.
        single_prediction = self.algo.test([[user, movie, 0]])[0]
        return single_prediction.est

    def rate_bool(self, user, movie):
        """Return whether the estimated rating exceeds 0.5."""
        return self.rate(user, movie) > 0.5
def SVDPPThreadFuc(Frame):
    """Fit a default SVD++ on ``Frame`` and recommend 10 funds per user.

    Returns:
        DataFrame with columns ``User`` and ``RecommendFundCode`` listing, for
        every user, the funds with the highest estimates, best first.
    """
    # Label-based column slice; presumably user, item and rating columns in
    # that order -- TODO confirm the end label.
    ratings = Frame.loc[:, 'User':'******']
    users = ratings.User.unique()
    funds = ratings.FundCode.unique()

    trainset = Dataset.load_from_df(
        ratings, reader=Reader(rating_scale=(0, 2))).build_full_trainset()
    model = SVDpp()
    model.fit(trainset)

    rec_users = []
    rec_funds = []
    for user in users:
        scores = [model.predict(user, fund).est for fund in funds]
        best_positions = np.argsort(scores)[::-1][0:10]
        rec_users.extend(user for _ in best_positions)
        rec_funds.extend(funds[position] for position in best_positions)

    return pd.DataFrame({
        "User": rec_users,
        "RecommendFundCode": rec_funds
    })
def main():
    """Train SVD++ on the chopped ratings and persist the model and route ids.

    Reads the CSV at ``rus_master_loc``, filters it with
    ``ah.rus_chop(df, 5, 10)``, fits SVD++ on the full data, writes the route
    ids known to the model to ``algo_rids_write_loc`` and pickles the model to
    ``pickle_write_loc``.
    """
    print('Reading in and chopping data...')
    df = pd.read_csv(rus_master_loc)
    df_chopped = ah.rus_chop(df, 5, 10)
    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    data = Dataset.load_from_df(df_chopped[['user', 'route', 'num_stars']],
                                reader=reader)

    print('Training on full data...')
    trainset = data.build_full_trainset()
    algo = SVDpp(n_epochs=100, reg_all=.1, lr_all=.006)
    algo.fit(trainset)

    print('Writing relevant route_ids...')
    algo_rids = pd.DataFrame(df_chopped.groupby('route').count().index)
    algo_rids.to_csv(algo_rids_write_loc, index=None)

    print('Pickling algo...')
    # The context manager closes the file even if pickling fails; the
    # original left the handle open.
    with open(pickle_write_loc, 'wb') as pickle_file:
        pickle.dump(algo, pickle_file)

    print('Done, pickled algo is here: {}'.format(pickle_write_loc))
    print(
        'Route IDs contained in algo are here: {}'.format(algo_rids_write_loc))
def svd_model(df):
    """
    Fill the missing target values of ``df`` with SVD++ estimates.

    ``df`` is melted to long form (``smiles``, ``Target``, ``TargetValue``);
    rows with a known value are used for 3-fold cross-validation and rows with
    a missing value are predicted.

    Returns the unknown rows with a ``ToxicProb`` column and without
    ``TargetValue``.

    NOTE(review): the estimates come from the model as fitted on the last
    cross-validation fold, not on all known rows -- confirm this is intended.
    """
    long_df = df.melt(id_vars='smiles',
                      value_vars=list(df.columns[1:]),
                      var_name='Target',
                      value_name='TargetValue')
    is_missing = long_df.TargetValue.isna()
    unknown = long_df.loc[is_missing]
    known = long_df.loc[~is_missing]

    dataset = Dataset.load_from_df(
        known[['smiles', 'Target', 'TargetValue']],
        Reader(rating_scale=(0, 1)))
    folds = KFold(n_splits=3, random_state=57)
    model = SVDpp(n_factors=12, reg_all=0.003, lr_all=0.006, random_state=132)

    for train_fold, test_fold in folds.split(dataset):
        model.fit(train_fold)
        fold_predictions = model.test(test_fold)
        rmse = round(accuracy.rmse(fold_predictions, verbose=True), 3)
        print('RMSE of SVD model for cross validation' + str(rmse))

    def estimate(row):
        return model.predict(row.smiles, row.Target).est

    result = unknown.copy()
    result['ToxicProb'] = result.apply(estimate, axis=1)
    return result.drop(columns='TargetValue')
class RecommenderSVDppSimilarUsers(Recommender):
    """
    Instead of rebuilding the data set for a new user, look up similar
    existing users and score movies through their predictions.
    """

    def __init__(self, movies):
        super().__init__(movies)
        self.algorithm = SVDpp()

    def fit(self, dataset):
        """Fit the wrapped SVD++ algorithm and return the result of ``fit``."""
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        """Return the wrapped algorithm's predictions for ``test_set``."""
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20, k_inner_item=10):
        """Recommend up to ``k`` unwatched movies via similar users.

        Args:
            watched: mapping from raw movie id to the value given by the user.
            k: maximum number of recommendations.
            k_inner_item: forwarded as ``k`` to ``get_similar_user_ids``.

        Returns:
            Whatever ``self.movies.get_movie_by_movie_ids`` returns for the
            ``k`` movies with the highest similarity-weighted estimate sum.
        """
        trainset = self.algorithm.trainset

        # Watched movies keyed by inner item id.
        watched = {trainset.to_inner_iid(raw_id): value
                   for raw_id, value in watched.items()}
        neighbours = self.get_similar_user_ids(watched, k=k_inner_item)

        # Sum, per unwatched movie, each neighbour's estimate weighted by
        # that neighbour's similarity.
        weighted_scores = defaultdict(float)
        unseen = (inner_iid for inner_iid in range(0, trainset.n_items)
                  if inner_iid not in watched)
        for inner_iid in unseen:
            raw_iid = trainset.to_raw_iid(inner_iid)
            for inner_uid, similarity in neighbours.items():
                estimate = self.algorithm.predict(
                    trainset.to_raw_uid(inner_uid), raw_iid).est
                weighted_scores[raw_iid] += similarity * estimate

        best_ids = heapq.nlargest(k, weighted_scores, key=weighted_scores.get)
        return self.movies.get_movie_by_movie_ids(best_ids)
def SVDpp(self, n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.02):
    """
    An extension of the Singular Value Decomposition algorithm that takes
    implicit ratings into account.

    Args:
        n_factors: Number of latent features, factors
        n_epochs: Number of iterations of the optimization loop
        lr_all: The learning rate for all parameters
        reg_all: The regularization term for all parameters

    Returns:
        predictions_df: A copy of ``self.data.test_df`` whose 'Rating' column
        holds the model's estimates for ``self.test_data``
    """
    hyper = dict(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all,
                 reg_all=reg_all)
    fitted = SVDpp(**hyper).fit(self.train_data)
    test_predictions = fitted.test(self.test_data)

    predictions_df = self.data.test_df.copy()
    predictions_df['Rating'] = [prediction.est for prediction in test_predictions]

    if self.test_purpose:
        self.evalueate_model(predictions_df['Rating'], 'Surprise SVDpp')
    return predictions_df