def svdpp(trainset, testset, predset):
    """Train an SVD++ model, print its RMSE and hand its estimates to
    ``save_predictions``.

    The function returns early (doing nothing) when
    ``is_already_predicted('svdpp')`` is truthy.

    Args:
        trainset: Surprise trainset the model is trained on; its own
            ``build_testset()`` is used to compute the train RMSE.
        testset: testset used to compute the test RMSE; its estimates are
            saved with the ``'test'`` tag.
        predset: testset whose estimates are saved without a tag.

    Returns:
        None
    """
    modelname = 'svdpp'

    # Skip the whole run when this model's predictions are already there.
    if is_already_predicted(modelname):
        return

    # Baseline estimate configuration handed to the algorithm.
    bsl_options = dict(method='als', reg_i=1.e-5, reg_u=14.6, n_epochs=10)

    hyper_params = dict(
        n_epochs=40,
        n_factors=100,
        bsl_options=bsl_options,
        lr_bu=0.01,
        lr_bi=0.01,
        lr_pu=0.1,
        lr_qi=0.1,
        lr_yj=0.01,
        reg_bu=0.05,
        reg_bi=0.05,
        reg_pu=0.09,
        reg_qi=0.1,
        reg_yj=0.01,
    )
    algo = SVDpp(**hyper_params)

    print('SVDpp Model')
    algo.train(trainset)

    # Accuracy on the data the model was trained on.
    train_predictions = algo.test(trainset.build_testset())
    print(' RMSE on Train: ', accuracy.rmse(train_predictions, verbose=False))

    # Accuracy on the held-out data; this RMSE is passed to both saves below.
    test_predictions = algo.test(testset)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print(' RMSE on Test: ', rmse)

    test_estimates = np.fromiter(
        (prediction.est for prediction in test_predictions),
        dtype=float,
        count=len(test_predictions),
    )
    save_predictions(modelname, rmse, test_estimates, 'test')

    print(' Evaluate predicted ratings...')
    final_predictions = algo.test(predset)
    final_estimates = np.fromiter(
        (prediction.est for prediction in final_predictions),
        dtype=float,
        count=len(final_predictions),
    )
    save_predictions(modelname, rmse, final_estimates)
def RecommendPredictions():
    """Train SVD++ on the normalised training counts and dump test predictions.

    Reads ``data_source/train_count_norm_1_10.csv`` and
    ``data_source/test_count_norm_1_10.csv`` (no header row; columns 0, 1, 2
    are fed to Surprise as user, item, rating), replaces missing values with
    10.0, trains a one-epoch SVD++ model and writes one
    ``(uid, iid, r_ui, est)`` row per test prediction to
    ``data_source/predictions_results_svdpp.csv``, overwriting any previous
    content of that file.

    Returns:
        None
    """
    results_path = "data_source/predictions_results_svdpp.csv"

    ## Load train and test data into Dataframes
    trainDF = pan.read_csv("data_source/train_count_norm_1_10.csv",
                           header=None, dtype={2: np.float16})
    trainDF = trainDF.fillna(10.0)
    reader = Reader(rating_scale=(1, 10))

    # Parenthesised single-argument prints behave identically under the
    # Python 2 print statement and the Python 3 print function.
    print("Load train set....")
    dataTrain = Dataset.load_from_df(trainDF[[0, 1, 2]], reader=reader)
    trainset = dataTrain.build_full_trainset()

    print("Initiate Training .....")
    algo = SVDpp(n_epochs=1, lr_all=0.01, reg_all=0.02, verbose=True)
    algo.train(trainset)

    ## Predictions for test set with ground truth present
    print(" Load test set...")
    testDF = pan.read_csv("data_source/test_count_norm_1_10.csv",
                          header=None, dtype={2: np.float16})
    testDF = testDF.fillna(10.0)
    dataTest = Dataset.load_from_df(testDF[[0, 1, 2]], reader=reader)
    # Round-trip through a trainset to obtain (uid, iid, rating) triples.
    testset = dataTest.build_full_trainset().build_testset()

    print("Start predictions")
    predictions = algo.test(testset)

    print("Saving Prediction results in File")
    # Mode "w" truncates the file, replacing the former delete-then-append
    # sequence; the context manager closes the handle even if a write fails.
    with open(results_path, "w") as resultFile:
        csv_writer = csv.writer(resultFile)
        csv_writer.writerows(
            (item.uid, item.iid, item.r_ui, item.est) for item in predictions
        )

    ## Predictions for test set with random products present
    ## LEFT
    #rmse = accuracy.rmse(predictions, verbose=True)
def svdpp_running_time(data):
    '''
    Calculates the running times for training and predictions for SVD++

    Args:
        data(Dataset): a list of datasets with different numbers of users;
            ``data[3]`` is used for parameter tuning, so at least four
            datasets are required

    Returns:
        elapsedtime_SVDpptrain: list with the training time (seconds) of
            SVD++ on each dataset, in the order of ``data``
        elapsedtime_SVDpptest: list with the prediction time (seconds) on
            each dataset's anti-testset, in the order of ``data``
    '''
    elapsedtime_SVDpptrain = []
    elapsedtime_SVDpptest = []

    # tune the parameters on the entire data
    # NOTE(review): the search is run with SVD, not SVDpp, although the
    # tuned values are applied to SVDpp below -- confirm this is intended.
    param_grid = {
        'n_factors': [25, 50, 100, 250],
        'n_epochs': [10, 20, 30, 40, 50]
    }
    grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    n_factors = param['n_factors']
    n_epochs = param['n_epochs']

    # using the tuned parameters calculate running times
    for dataset in data:
        # Build the train/test inputs before starting any timer so that data
        # preparation (notably the anti-testset construction) is not counted
        # as training time.
        training = dataset.build_full_trainset()
        testing = training.build_anti_testset()
        svdpp = SVDpp(n_factors=n_factors, n_epochs=n_epochs)

        # training running time
        training_start = time.time()
        svdpp.train(training)
        elapsedtime_SVDpptrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        svdpp.test(testing)
        elapsedtime_SVDpptest.append(time.time() - test_start)

    return elapsedtime_SVDpptrain, elapsedtime_SVDpptest
def svdpp(data, training, testing):
    '''
    Grid-search SVD++ hyper-parameters, then score the tuned model.

    The grid-search results are also written to
    'data/svdpp_rmse_against_param.csv'.

    Args:
        data(Dataset): dataset the grid search is evaluated on
        training(Dataset): training dataset for the final model
        testing(Dataset): test dataset for the final model

    Returns:
        rmse: RMSE of the tuned SVD++ model on ``testing``
        top_n: result of ``get_top_n(predictions, n=5)`` on the test
            predictions
    '''
    # candidate parameters
    candidates = {
        'n_factors': [25, 50, 100, 250],
        'n_epochs': [10, 20, 30, 40, 50],
    }

    # optimize parameters
    tuner = GridSearch(SVDpp, candidates, measures=['RMSE'], verbose=False)
    tuner.evaluate(data)
    best = tuner.best_params['RMSE']
    print('SVDpp:', best)

    # RMSE against parameters
    pd.DataFrame.from_dict(tuner.cv_results).to_csv(
        'data/svdpp_rmse_against_param.csv')

    # fit model using the optimized parameters
    model = SVDpp(n_factors=best['n_factors'], n_epochs=best['n_epochs'])
    model.train(training)

    # evaluate the model using test data
    test_predictions = model.test(testing)
    top_n = get_top_n(test_predictions, n=5)
    rmse = accuracy.rmse(test_predictions, verbose=True)
    return rmse, top_n
testset = rating_test2.build_full_trainset().build_testset()

# SVD++ model: exhaustive sweep over every combination of the candidate
# values below. Each combination is trained on `trainset`, scored on
# `testset` (RMSE, FCP, MAE) and timed.
n_factors = [20]   # where default = 20
n_epochs = [5]     # where default = 20
lr_all = [0.007]   # where default = 0.007
reg_all = [0.02]   # where default = 0.02

count = 1  # 1-based index of the combination being evaluated
for i in n_factors:
    for j in n_epochs:
        for k in lr_all:
            for m in reg_all:
                start = dt.datetime.today()
                print("================================================")

                algo = SVDpp(n_factors=i, n_epochs=j, lr_all=k, reg_all=m)
                algo.train(trainset)
                print("This is the #%s parameter combination" % (count,))

                predictions = algo.test(testset)
                print("n_factors=%s, n_epochs=%s, lr_all=%s, reg_all=%s"
                      % (i, j, k, m))
                accuracy.rmse(predictions, verbose=True)
                accuracy.fcp(predictions, verbose=True)
                accuracy.mae(predictions, verbose=True)

                count += 1
                end = dt.datetime.today()
                print("Runtime: %s" % (end - start,))
# Usage: <script> RATINGS_CSV TEST_CSV
# Trains SVD++ on the ratings file and writes one predicted rating per row of
# the test file to 'SVDOutput.csv'.
dfRatings = pd.read_csv(sys.argv[1])
dfTest = pd.read_csv(sys.argv[2])

# Delete unused columns
del dfRatings['date']
del dfRatings['train_id']
del dfTest['date']
del dfTest['test_id']

# Set the rating scale and create the data for Surprise to use
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    dfRatings[['user_id', 'business_id', 'rating']], reader)
train_set = data.build_full_trainset()

# Use SVD++ with surprise
algo = SVDpp()
algo.train(train_set)

# The context manager guarantees the output file is flushed and closed even
# if a prediction raises part-way through the loop.
with open('SVDOutput.csv', 'w') as f:
    f.write("test_id,rating\n")
    for i in range(len(dfTest)):
        # r_ui=4 is a fixed placeholder for the unknown true rating; only the
        # estimate (`est`) is written out. The row position is used as the
        # test id.
        prediction = algo.predict(dfTest.at[i, 'user_id'],
                                  dfTest.at[i, 'business_id'],
                                  r_ui=4, verbose=True)
        predRating = prediction.est
        f.write(str(i) + "," + str(predRating) + '\n')
# Loading of the pickled splits is currently disabled; `training_set` and
# `test_set` are expected to be defined before this point.
#with open('/Shared/bdagroup7/download/test_set.dat', "rb") as f:
#    test_set = pickle.load(f)
#with open('/Shared/bdagroup7/download/training_set.dat', "rb") as f:
#    training_set = pickle.load(f)

# Learning options
sim_options = dict(name='cosine', min_support=50, user_based=True)
bsl_options = dict(method='sgd', learning_rate=.0005)

# Algorithms (only select one)
#algo = SVD()
#algo = KNNBasic(k=10, min_k=8, sim_options=sim_options)
#algo = KNNWithMeans(k=15, min_k=5, sim_options=sim_options)
#algo = CoClustering()
algo = SVDpp()

# Train on the training split, then predict every rating of the test split.
algo.train(training_set)
predictions = algo.test(test_set)

# Persist the raw predictions so they can be reused later.
with open('/Shared/bdagroup7/download/predictions_svd_pp.dat', "wb") as f:
    pickle.dump(predictions, f)

# TODO: Ensemble

# Report the accuracy of the predictions.
rmse = accuracy.rmse(predictions, verbose=True)
print("RMSE is: ")
print(rmse)
class SurSVDpp:
    """Wrapper around Surprise's SVD++ that stores its estimates in a
    long-format DataFrame (``self.predictions``) for later lookup.

    After ``fit`` or ``fit_directly``, ``self.predictions`` has the columns
    ``userID``, ``itemID`` and ``ratings``, one row per (user, item) pair of
    the training set's anti-testset.
    """

    def __init__(self, k=5):
        """
        :param k: int | number of latent factors passed to SVDpp
        :raises IOError: if k is not a positive integer
        """
        if not isinstance(k, int) or k <= 0:
            # IOError is kept for backward compatibility with callers that
            # may already catch it.
            raise IOError("Parameter k should be a positive integer.")
        self.data = None
        self.k = k
        self.algo = SVDpp(n_factors=self.k)
        self.predictions = pd.DataFrame()

    @staticmethod
    def _estimates_to_columns(predictions):
        """Split Surprise predictions into user, item and estimate lists.

        Each prediction unpacks as ``(uid, iid, r_ui, est, details)``. The
        estimate ``est`` is collected, not ``r_ui``: for an anti-testset
        ``r_ui`` is only the placeholder rating, not the model's output.
        """
        users = []
        items = []
        ratings = []
        for uid, iid, _, est, _ in predictions:
            users.append(uid)
            items.append(iid)
            ratings.append(est)
        return users, items, ratings

    def _train_and_predict(self, data_long):
        """Train SVD++ on ``data_long`` and estimate its anti-testset pairs."""
        reader = Reader(rating_scale=(0, 1))
        data = Dataset.load_from_df(data_long, reader)
        trainset = data.build_full_trainset()
        self.algo.train(trainset)
        testset = trainset.build_anti_testset()
        return self._estimates_to_columns(self.algo.test(testset))

    def fit_directly(self, data_long):
        """
        This function directly computes the predictions of the algorithm for
        the data provided. The data needs to be in the long shape format. It
        then adds the predictions made by the algorithm to the class
        attributes (maintaining the long format).

        :param data_long: pd.DataFrame | DataFrame in the long shape format
        :return void:
        """
        users, items, ratings = self._train_and_predict(data_long)

        # Reconstruct predictions
        self.predictions = pd.DataFrame(
            {"userID": users, "itemID": items, "ratings": ratings},
            columns=["userID", "itemID", "ratings"],
        )

    def fit(self, rating_matrix):
        """
        Fits the instance to the rating matrix. The index must be the users
        and the columns the items.

        :param rating_matrix: pd.DataFrame | rating matrix
        :return: void
        """
        data_long = rating_matrix.stack().reset_index()
        data_long.columns = ["user_id", "item_id", "ratings"]

        users, items, ratings = self._train_and_predict(data_long)

        # Reconstruct predictions (column order kept as before for this
        # method: itemID, ratings, userID).
        self.predictions = pd.DataFrame(
            {"itemID": items, "ratings": ratings, "userID": users},
            columns=["itemID", "ratings", "userID"],
        )

    def predict(self, user, item):
        """
        Predict the probability that input user will like input item

        Looks the pair up in ``self.predictions`` and sums the matching
        ``ratings`` entries, so a pair with no stored row yields 0.0.

        :param user: int | user ID
        :param item: int | item ID
        :return: float | probability that user likes item
        """
        cond1 = self.predictions["userID"] == user
        cond2 = self.predictions["itemID"] == item
        mask = cond1 & cond2
        temp = np.array(self.predictions.loc[mask, "ratings"])
        proba = np.sum(temp)
        return proba