Example #1
    algo = gs.best_estimator['rmse']
    return algo
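
# The snippet above kept only the tail of do_grid_search; the grid-search
# setup was cut off in this listing. Below is a minimal sketch of the full
# function, assuming Surprise's GridSearchCV: the parameter grid is a guess
# built around the saved best values used in __main__, not the original code.
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV

def do_grid_search(df):
    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
    param_grid = {'n_factors': [100, 150], 'n_epochs': [20, 30],
                  'lr_all': [0.002, 0.005], 'reg_all': [0.1, 0.2]}
    gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
    gs.fit(data)
    algo = gs.best_estimator['rmse']
    return algo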


if __name__ == '__main__':
    movie_names = pd.read_csv("./ml-latest-small/movies.csv")
    df = pd.read_csv("./ml-latest-small/ratings.csv")
    train_set, test_users = data_handler.remove_users_from_trainset(df, 120)

    # only necessary once, if the best values are not saved yet
    # algo = do_grid_search(df)
    # With the saved best values, this gives an RMSE of 0.8706802008822863 on the full dataset
    algo = SVD()
    algo.n_factors = 150
    algo.n_epochs = 30
    algo.lr_all = 0.005
    algo.reg_all = 0.2
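    # Equivalently (and more idiomatic in Surprise) the hyper-parameters can
    # be passed to the constructor in one call, e.g.:
    #   algo = SVD(n_factors=150, n_epochs=30, lr_all=0.005, reg_all=0.2)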

    with Pool(processes=multiprocessing.cpu_count()) as pool:
        # the minimum rating at which a user is considered to want to watch a movie
        _rating_threshold = 4
        # the number of movies a user gets recommended before the model is updated with new swipes
        _movies_per_refresh = 2
        # ratio of the movies per refresh that are random vs. already rated by other users
        random_movie_ratio_range = [0.5]
        # the group sizes to test
        group_size_range = range(5, 15)
        # the minimum fraction of the group that needs to agree on a movie
        ratio_agreement_range = np.arange(1, 1.1, .25)
        # the number of runs per setting
        n = 100
Example #2
def train_model(trainFilePath, testFilePath, K, eta, reg, Y_train, Y_test):
    print('Surprise! V.2')

    # Get the training and testing data from the file
    # NOTE: We had to concatenate both files; otherwise Surprise would not
    # recognize that there are N movies, because not all movies are rated in
    # the train set.
    file_pathTrain = os.path.expanduser('./data/trainTest1.txt')
    reader = Reader(sep='\t')  # default line_format is 'user item rating'
    dataLocal = Dataset.load_from_file(file_pathTrain, reader=reader)
    
    alg = SVD() # using the SVD algorithm
    
    alg.n_factors = K  # n_factors is the number of latent factors, the K in our matrices
    alg.n_epochs = 100
    alg.lr_all = eta  # set the learning rate
    alg.reg_all = reg  # the regularization constant

    trainset = dataLocal.build_full_trainset()
    alg.fit(trainset)

    print('number of users:', trainset.n_users)
    print('number of movies:', trainset.n_items)
    print('number of ratings:', trainset.n_ratings)

    #testset = trainset.build_testset()
    #predictionsTrain = alg.test(testset) # testing on the training data
    #errorTrain = accuracy.rmse(predictionsTrain, verbose=True)
    
    print('U matrix', type(alg.pu))
    print('V matrix', type(alg.qi))
    print('bu array', type(alg.bu))
    print('bi array', type(alg.bi))

    U = np.asmatrix(alg.pu)
    V = np.asmatrix(alg.qi)
    
    print('U shape', U.shape)
    print('V shape', V.shape)
    
    errorTrain = get_err(U, V, alg.bu, alg.bi, Y_train, reg)
    errorTest = get_err(U, V, alg.bu, alg.bi, Y_test, reg)

    # to make a test set for the testing data, we need to go through the
    # convoluted process of first calling .build_full_trainset() -> .build_testset()
    #trainsetForTest = dataLocalTrain.build_full_trainset()
    #testsetForTest = trainsetForTest.build_testset()
    #testsetForTest = dataLocalTrain.build_testset()
    #predictionsTest = alg.test(testsetForTest)
    #errorTest = accuracy.rmse(predictionsTest, verbose=True)

    '''
    NOTE: alg.fit() must be called before this works; this is what we return:
     alg.pu is the numpy array of user factors U
     alg.qi is the numpy array of item factors V
     alg.bu is the numpy array of user biases
     alg.bi is the numpy array of item biases
    '''
    
    print('pu array:', alg.pu) 
    print('qi array:', alg.qi)


    return alg.pu, alg.qi, alg.bu, alg.bi, errorTrain, errorTest
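
The helper get_err used above is not part of this listing. A minimal sketch of
what it plausibly computes, assuming Y is an array of (user, movie, rating)
triples whose indices line up with the rows of U and V (in general Surprise's
inner ids differ from the raw ids, so trainset.to_inner_uid/to_inner_iid would
be needed):

def get_err(U, V, bu, bi, Y, reg, mu=0.0):
    # regularized squared-error objective of the biased SVD model:
    # 0.5 * sum_(i,j) (y_ij - (mu + bu_i + bi_j + u_i . v_j))^2
    #   + 0.5 * reg * (||U||_F^2 + ||V||_F^2), averaged over the ratings
    err = 0.0
    for user, movie, rating in Y:
        pred = mu + bu[user] + bi[movie] + float(U[user] @ V[movie].T)
        err += 0.5 * (rating - pred) ** 2
    err += 0.5 * reg * (np.linalg.norm(U, 'fro') ** 2 +
                        np.linalg.norm(V, 'fro') ** 2)
    return err / len(Y)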
Example #3
def train_model(trainFilePath, testFilePath, K, eta, reg, Y_train):

    # Get the training data from the file
    file_pathTrain = os.path.expanduser('./data/trainTest1.txt')
    reader = Reader(sep='\t')
    dataLocal = Dataset.load_from_file(file_pathTrain, reader=reader)
    
    # Get the testing data from the file
#    file_pathTest = os.path.expanduser(testFilePath) #'./data/train.txt')
#    reader = Reader(sep='\t')
#    dataLocalTest = Dataset.load_from_file(file_pathTrain, reader=reader)
    
    alg = SVD() # using the SVD algorithm
    
    alg.n_factors = K  # n_factors is the number of latent factors, the K in our matrices
    alg.n_epochs = 100
    alg.lr_all = eta  # set the learning rate
    alg.reg_all = reg  # the regularization constant

    # define a cross-validation iterator 
    #kf = KFold(n_splits=5)

    #for trainset, testset in kf.split(dataLocal):
    #    # train and test algorithm
    #    alg.fit(trainset)
    #    predictions = alg.test(testset)

    trainset = dataLocal.build_full_trainset()
    alg.fit(trainset)


    #trainset = dataLocalTrain.build_full_trainset()
    #alg.fit(trainset)
    #print('number of users:', trainset.n_users)
    #print('number of movies:', trainset.n_items)
    #print('number of ratings:', trainset.n_ratings)

    #testset = trainset.build_testset()
    #predictionsTrain = alg.test(testset) # testing on data in
    #errorTrain = accuracy.rmse(predictionsTrain, verbose=True)
    
    print('U matrix', type(alg.pu))
    print('V matrix', type(alg.qi))
    print('bu array', type(alg.bu))
    print('bi array', type(alg.bi))

    U = np.asmatrix(alg.pu)
    V = np.asmatrix(alg.qi)
    
    print('U shape', U.shape)
    print('V shape', V.shape)
    
    errorTrain = get_err(U, V, alg.bu, alg.bi, Y_train, reg)
    errorTest = -1  # no held-out test error is computed in this variant

    # to make a test set for the testing data, we need to go through the
    # convoluted process of first calling .build_full_trainset() -> .build_testset()
    #trainsetForTest = dataLocalTrain.build_full_trainset()
    #testsetForTest = trainsetForTest.build_testset()
    #testsetForTest = dataLocalTrain.build_testset()
    #predictionsTest = alg.test(testsetForTest)
    #errorTest = accuracy.rmse(predictionsTest, verbose=True)

    '''
    NOTE: alg.fit() must be called before this works; this is what we return:
     alg.pu is the numpy array of user factors U
     alg.qi is the numpy array of item factors V
     alg.bu is the numpy array of user biases
     alg.bi is the numpy array of item biases
    '''
    
    print('pu array:', alg.pu) 
    print('qi array:', alg.qi)


    return alg.pu, alg.qi, alg.bu, alg.bi, errorTrain, errorTest
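
This variant never computes a held-out test error (errorTest stays -1). For
reference, a minimal sketch of how Surprise's own train/test split could
supply one, assuming the same tab-separated file layout:

from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

data = Dataset.load_from_file('./data/trainTest1.txt', reader=Reader(sep='\t'))
trainset, testset = train_test_split(data, test_size=0.25)
alg = SVD(n_epochs=100)
alg.fit(trainset)
accuracy.rmse(alg.test(testset), verbose=True)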
Example #4
def train_model(trainFilePath, testFilePath, K, eta, reg, Y_train, Y_test):
    print('Surprise! V.3')

    # NOTE: We had to concatenate both files; otherwise Surprise would not
    # recognize that there are N movies, because not all movies are rated in
    # the train set.

    # Get the training and testing data from the file, trainTest1.txt is the
    # concatenation of train.txt and test.txt
    file_pathTrain = os.path.expanduser('./data/trainTest1.txt')
    reader = Reader(sep='\t')
    dataLocal = Dataset.load_from_file(file_pathTrain, reader=reader)

    alg = SVD()  # use the SVD algorithm

    # Set parameters:
    alg.n_factors = K  # n_factors is the number of latent factors, the K in our matrices
    alg.n_epochs = 100
    alg.lr_all = eta  # set the learning rate
    alg.reg_all = reg  # the regularization constant

    # We know this works: training on all of the data at once, both train & test
    trainset = dataLocal.build_full_trainset()  # use all data to train
    alg.fit(trainset)  # train on the trainset
    testset = trainset.build_testset()
    prediction = alg.test(testset)
    acc = accuracy.rmse(prediction, verbose=True)

    # Cross-validation does not work here: every split leaves out movie
    # ratings that we are trying to predict.
    #kf = KFold(n_splits=50)
    #
    #for trainset1, testset in kf.split(dataLocal):
    #    alg.fit(trainset1)
    #    predictions = alg.test(testset)
    #    accuracy.rmse(predictions, verbose=True)
    '''
    NOTE: alg.fit() must be called before this works; this is what we return:
     alg.pu is the numpy array of user factors U
     alg.qi is the numpy array of item (movie) factors V
     alg.bu is the numpy array of user biases
     alg.bi is the numpy array of item (movie) biases
    '''

    U = np.asmatrix(alg.pu)  # convert to numpy matrices for error function
    V = np.asmatrix(alg.qi)

    # Sanity checks to make sure our returned U and V matrices have right
    # dimensions:
    #print('number of users:', trainset.n_users)
    #print('number of movies:', trainset.n_items)
    #print('number of ratings:', trainset.n_ratings)
    print('U shape, MxK', U.shape)
    print('V shape, NxK', V.shape)

    # Get the training and test error using the same error function we used
    # with our other models
    errorTrain = get_err(U, V, alg.bu, alg.bi, Y_train, reg)
    errorTest = get_err(U, V, alg.bu, alg.bi, Y_test, reg)

    return U, V, alg.bu, alg.bi, errorTrain, errorTest
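
For context, a hypothetical driver for this function; the file names and
hyper-parameter values below are assumptions for illustration, not from the
original project:

import numpy as np

Y_train = np.loadtxt('./data/train.txt').astype(int)
Y_test = np.loadtxt('./data/test.txt').astype(int)
U, V, bu, bi, e_in, e_out = train_model('./data/train.txt', './data/test.txt',
                                        K=20, eta=0.03, reg=0.1,
                                        Y_train=Y_train, Y_test=Y_test)
print('E_in:', e_in, 'E_out:', e_out)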
Example #5
# Based on each grid search, we apply the same parameters for each algorithm
# on the sample test set to get individual predictions.

# ## SVD

# In[ ]:

# SVD with baselines

algo = SVD()
algo.n_factors = 400
algo.verbose = False
algo.biased = True
algo.reg_all = 0.1
algo.lr_all = 0.01
algo.n_epochs = 500
algo.random_state = seed

print("Training SVD...")
algo.fit(trainset)

print("Computing predictions for SVD... \n")
test_predictions_svd = algo.test(
    testset)  # get the real predictions to append to the big final matrix

# In[ ]:

test_predictions_svd = np.asarray(test_predictions_svd)
test_predictions_svd_filtered = test_predictions_svd[:, 3]  # column 3 of each Prediction is the estimate (est)
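
# In[ ]:

# Equivalent extraction, shown as a sketch: each Prediction returned by
# algo.test() is a named tuple (uid, iid, r_ui, est, details), so the
# estimates can be read by attribute instead of slicing column 3 of an
# object array.
est_by_attribute = np.array([p.est for p in algo.test(testset)])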