algo = gs.best_estimator['rmse'] return algo if __name__ == '__main__': movie_names = pd.read_csv("./ml-latest-small/movies.csv") df = pd.read_csv("./ml-latest-small/ratings.csv") train_set, test_users = data_handler.remove_users_from_trainset(df, 120) # only necessary once if values are not saved # algo = do_grid_search(df) # With saved best values, gives RSME of 0.8706802008822863 on full dataset algo = SVD() algo.n_factors = 150 algo.n_epochs = 30 algo.lr_all = 0.005 algo.reg_all = 0.2 with Pool(processes=multiprocessing.cpu_count()) as pool: # the minimal rating for a user to want to watch a movie _rating_threshold = 4 # the amount of movies a user gets recommended before updating the model with new swipes _movies_per_refresh = 2 # ratio of the movies per refresh that is random vs already rated by other users random_movie_ratio_range = [0.5] # the group sizes to test group_size_range = range(5, 15) # the minimal fraction of the group that needs to agree for a certain movie ratio_agreement_range = np.arange(1, 1.1, .25) # the amount of runs per setting n = 100
def train_model(trainFilePath, testFilePath, K, eta, reg, Y_train, Y_test):
    """Fit a Surprise SVD model and compute its train and test error.

    The model is fitted on ``./data/trainTest1.txt`` (tab-separated) rather
    than on the files named by the path arguments. Both files had to be
    concatenated because otherwise Surprise would not recognize that there
    are N movies: not all movies are rated in the train set.

    Args:
        trainFilePath: Unused; kept so existing callers keep working.
        testFilePath: Unused; kept so existing callers keep working.
        K: Number of latent factors (``n_factors``).
        eta: Learning rate applied to every parameter (``lr_all``).
        reg: Regularization constant applied to every parameter
            (``reg_all``); also forwarded to ``get_err``.
        Y_train: Training ratings, passed unchanged to ``get_err``.
        Y_test: Test ratings, passed unchanged to ``get_err``.

    Returns:
        Tuple ``(pu, qi, bu, bi, errorTrain, errorTest)`` where ``pu`` and
        ``qi`` are the fitted user and item factor arrays, ``bu`` and ``bi``
        the user and item biases, and the two errors are the values returned
        by ``get_err`` for ``Y_train`` and ``Y_test``.
    """
    print('Surprise! V.2')

    file_pathTrain = os.path.expanduser('./data/trainTest1.txt')
    reader = Reader(sep='\t')
    dataLocal = Dataset.load_from_file(file_pathTrain, reader=reader)

    # Hyperparameters must be given to the constructor: SVD.__init__ copies
    # lr_all / reg_all into the per-parameter rates (lr_bu, reg_pu, ...) that
    # fit() actually reads, so assigning alg.lr_all / alg.reg_all afterwards
    # is silently ignored and the library defaults would be used instead.
    alg = SVD(n_factors=K, n_epochs=100, lr_all=eta, reg_all=reg)

    trainset = dataLocal.build_full_trainset()
    alg.fit(trainset)

    print('number of users:', trainset.n_users)
    print('number of movies:', trainset.n_items)
    print('number of ratings:', trainset.n_ratings)

    # pu, qi, bu and bi only exist once alg.fit() has been called.
    print('U matrix', type(alg.pu))
    print('V matrix', type(alg.qi))
    print('bu array', type(alg.bu))
    print('bi array', type(alg.bi))

    # get_err is called with numpy matrices for the factors.
    U = np.asmatrix(alg.pu)
    V = np.asmatrix(alg.qi)
    print('U shape', U.shape)
    print('V shape', V.shape)

    errorTrain = get_err(U, V, alg.bu, alg.bi, Y_train, reg)
    errorTest = get_err(U, V, alg.bu, alg.bi, Y_test, reg)

    print('pu array:', alg.pu)
    print('qi array:', alg.qi)

    return alg.pu, alg.qi, alg.bu, alg.bi, errorTrain, errorTest
def train_model(trainFilePath, testFilePath, K, eta, reg, Y_train):
    """Fit a Surprise SVD model and compute its training error.

    The model is fitted on the full contents of ``./data/trainTest1.txt``
    (tab-separated); the path arguments are not read.

    Args:
        trainFilePath: Unused; kept so existing callers keep working.
        testFilePath: Unused; kept so existing callers keep working.
        K: Number of latent factors (``n_factors``).
        eta: Learning rate applied to every parameter (``lr_all``).
        reg: Regularization constant applied to every parameter
            (``reg_all``); also forwarded to ``get_err``.
        Y_train: Training ratings, passed unchanged to ``get_err``.

    Returns:
        Tuple ``(pu, qi, bu, bi, errorTrain, errorTest)`` where ``pu`` and
        ``qi`` are the fitted user and item factor arrays, ``bu`` and ``bi``
        the user and item biases, ``errorTrain`` is the value returned by
        ``get_err`` for ``Y_train``, and ``errorTest`` is always ``-1``
        because this variant does not evaluate a test set.
    """
    file_pathTrain = os.path.expanduser('./data/trainTest1.txt')
    reader = Reader(sep='\t')
    dataLocal = Dataset.load_from_file(file_pathTrain, reader=reader)

    # Hyperparameters must be given to the constructor: SVD.__init__ copies
    # lr_all / reg_all into the per-parameter rates (lr_bu, reg_pu, ...) that
    # fit() actually reads, so assigning alg.lr_all / alg.reg_all afterwards
    # is silently ignored and the library defaults would be used instead.
    alg = SVD(n_factors=K, n_epochs=100, lr_all=eta, reg_all=reg)

    trainset = dataLocal.build_full_trainset()
    alg.fit(trainset)

    # pu, qi, bu and bi only exist once alg.fit() has been called.
    print('U matrix', type(alg.pu))
    print('V matrix', type(alg.qi))
    print('bu array', type(alg.bu))
    print('bi array', type(alg.bi))

    # get_err is called with numpy matrices for the factors.
    U = np.asmatrix(alg.pu)
    V = np.asmatrix(alg.qi)
    print('U shape', U.shape)
    print('V shape', V.shape)

    errorTrain = get_err(U, V, alg.bu, alg.bi, Y_train, reg)
    # Placeholder: no test error is computed in this variant.
    errorTest = -1

    print('pu array:', alg.pu)
    print('qi array:', alg.qi)

    return alg.pu, alg.qi, alg.bu, alg.bi, errorTrain, errorTest
def train_model(trainFilePath, testFilePath, K, eta, reg, Y_train, Y_test):
    """Fit a Surprise SVD model and compute its train and test error.

    The model is fitted on ``./data/trainTest1.txt`` (tab-separated), which
    is the concatenation of train.txt and test.txt. Both files had to be
    concatenated because otherwise Surprise would not recognize that there
    are N movies: not all movies are rated in the train set. K-fold cross
    validation was abandoned for the same reason (a fold always left out a
    movie whose rating had to be predicted).

    Args:
        trainFilePath: Unused; kept so existing callers keep working.
        testFilePath: Unused; kept so existing callers keep working.
        K: Number of latent factors (``n_factors``).
        eta: Learning rate applied to every parameter (``lr_all``).
        reg: Regularization constant applied to every parameter
            (``reg_all``); also forwarded to ``get_err``.
        Y_train: Training ratings, passed unchanged to ``get_err``.
        Y_test: Test ratings, passed unchanged to ``get_err``.

    Returns:
        Tuple ``(U, V, bu, bi, errorTrain, errorTest)`` where ``U`` and ``V``
        are the fitted user and item(movie) factors as numpy matrices, ``bu``
        and ``bi`` the user and item(movie) biases, and the two errors are
        the values returned by ``get_err`` for ``Y_train`` and ``Y_test``.
    """
    print('Surprise! V.3')

    file_pathTrain = os.path.expanduser('./data/trainTest1.txt')
    reader = Reader(sep='\t')
    dataLocal = Dataset.load_from_file(file_pathTrain, reader=reader)

    # Hyperparameters must be given to the constructor: SVD.__init__ copies
    # lr_all / reg_all into the per-parameter rates (lr_bu, reg_pu, ...) that
    # fit() actually reads, so assigning alg.lr_all / alg.reg_all afterwards
    # is silently ignored and the library defaults would be used instead.
    alg = SVD(n_factors=K, n_epochs=100, lr_all=eta, reg_all=reg)

    # Train on all of the data at once, both train & test.
    trainset = dataLocal.build_full_trainset()
    alg.fit(trainset)

    # Report Surprise's own RMSE on the data the model was fitted on; the
    # call is kept for its printed output (verbose=True).
    testset = trainset.build_testset()
    prediction = alg.test(testset)
    accuracy.rmse(prediction, verbose=True)

    # pu / qi only exist once alg.fit() has been called. Convert to numpy
    # matrices for the error function.
    U = np.asmatrix(alg.pu)
    V = np.asmatrix(alg.qi)

    # Sanity check that the returned U and V have the right dimensions.
    print('U shape, MxK', U.shape)
    print('V shape, NxK', V.shape)

    # Use the same error function as the other models so results compare.
    errorTrain = get_err(U, V, alg.bu, alg.bi, Y_train, reg)
    errorTest = get_err(U, V, alg.bu, alg.bi, Y_test, reg)

    return U, V, alg.bu, alg.bi, errorTrain, errorTest
# Based on each grid search, we apply the same parameters for each algorithm
# on the sample test set to get individual predictions.

# ## SVD

# In[ ]:

# SVD with baselines.
# Hyperparameters are passed to the constructor: SVD.__init__ copies lr_all /
# reg_all into the per-parameter rates that fit() actually reads, so assigning
# algo.lr_all / algo.reg_all after construction is silently ignored and the
# library defaults would be used instead.
algo = SVD(
    n_factors=400,
    n_epochs=500,
    biased=True,
    lr_all=0.01,
    reg_all=0.1,
    random_state=seed,
    verbose=False,
)

print("Training SVD...")
algo.fit(trainset)

print("Computing predictions for SVD... \n")
# Get real predictions to append to the big final matrix.
test_predictions_svd = algo.test(testset)

# In[ ]:

# Each Surprise prediction is a (uid, iid, r_ui, est, details) tuple, so
# column 3 of the resulting array holds the estimated ratings.
test_predictions_svd = np.asarray(test_predictions_svd)
test_predictions_svd_filtered = test_predictions_svd[:, 3]