Example 1
def svdpp(trainset, testset, predset):
    
    modelname = 'svdpp'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return
    
    # Note: surprise's SVDpp has no bsl_options parameter (baseline options
    # apply to BaselineOnly/KNNBaseline); its bias terms are learned by SGD.
    algo = SVDpp(n_epochs=40, n_factors=100,
                 lr_bu=0.01, lr_bi=0.01, lr_pu=0.1, lr_qi=0.1, lr_yj=0.01,
                 reg_bu=0.05, reg_bi=0.05, reg_pu=0.09, reg_qi=0.1, reg_yj=0.01)
    print('SVDpp Model')
    algo.fit(trainset)
    
    predictions = algo.test(trainset.build_testset())
    print('   RMSE on Train: ', accuracy.rmse(predictions, verbose=False))
    
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.array([pred.est for pred in predictions])
    save_predictions(modelname, rmse, preds, 'test')

    print('   Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.array([pred.est for pred in predictions])
    save_predictions(modelname, rmse, preds)
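The helpers is_already_predicted and save_predictions, as well as the three arguments, come from the surrounding project. A minimal sketch of how the inputs might be built; the file name, column names, and pairs_to_predict are assumptions:

import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

df = pd.read_csv('ratings.csv')  # hypothetical file with user, item, rating
data = Dataset.load_from_df(df[['user', 'item', 'rating']],
                            Reader(rating_scale=(1, 5)))
trainset, testset = train_test_split(data, test_size=0.2)
# predset: (user, item, filler) triples for the pairs to score;
# pairs_to_predict is a hypothetical list from the project
predset = [(u, i, 0) for u, i in pairs_to_predict]
svdpp(trainset, testset, predset)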
    
Example 2
import csv

import numpy as np
import pandas as pan
from surprise import Dataset, Reader, SVDpp


def RecommendPredictions():
    ## Load train and test data into Dataframes
    trainDF = pan.read_csv("data_source/train_count_norm_1_10.csv",
                           header=None,
                           dtype={2: np.float16})
    trainDF = trainDF.fillna(10.0)

    reader = Reader(rating_scale=(1, 10))

    print "Load train set...."
    dataTrain = Dataset.load_from_df(trainDF[[0, 1, 2]], reader=reader)
    trainset = dataTrain.build_full_trainset()

    print "Initiate Training ....."
    algo = SVDpp(n_epochs=1, lr_all=0.01, reg_all=0.02, verbose=True)
    algo.fit(trainset)

    ## Predictions for test set with ground truth present
    print " Load test set..."
    testDF = pan.read_csv("data_source/test_count_norm_1_10.csv",
                          header=None,
                          dtype={2: np.float16})
    testDF = testDF.fillna(10.0)
    dataTest = Dataset.load_from_df(testDF[[0, 1, 2]], reader=reader)
    testset = dataTest.build_full_trainset().build_testset()

    print "Start predictions"
    predictions = algo.test(testset)

    print("Saving prediction results to file")
    # "w" mode truncates the file, so stale results are overwritten
    with open("data_source/predictions_results_svdpp.csv", "w",
              newline="") as resultFile:
        csv_writer = csv.writer(resultFile)
        for item in predictions:
            csv_writer.writerow((item.uid, item.iid, item.r_ui, item.est))

    ## Predictions for test set with random products present
    ##	LEFT

    #rmse = accuracy.rmse(predictions, verbose=True)
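Because each saved row stores both the ground truth (r_ui) and the estimate, the RMSE can also be recovered later from the file instead of restoring the commented-out accuracy.rmse call. A short sketch, assuming the CSV layout written above:

import numpy as np
import pandas as pan

# Columns as written above: uid, iid, r_ui (truth), est (prediction)
results = pan.read_csv("data_source/predictions_results_svdpp.csv",
                       header=None, names=["uid", "iid", "r_ui", "est"])
rmse = np.sqrt(np.mean((results["r_ui"] - results["est"]) ** 2))
print("RMSE:", rmse)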
Example 3
def svdpp_running_time(data):
    '''
        Calculates the running times for training and predictions for SVD++

        Args:
            data(list of Dataset): datasets with different numbers of users

        Returns:
            elapsedtime_SVDpptrain: running time for training
            elapsedtime_SVDpptest: running time for predictions on testset
    '''
    elapsedtime_SVDpptrain = []
    elapsedtime_SVDpptest = []

    # tune the parameters on the entire data
    param_grid = {
        'n_factors': [25, 50, 100, 250],
        'n_epochs': [10, 20, 30, 40, 50]
    }
    grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'])
    grid_search.fit(data[3])
    param = grid_search.best_params['rmse']
    n_factors = param['n_factors']
    n_epochs = param['n_epochs']

    # using the tuned parameters calculate running times
    for i in range(len(data)):
        # build the trainset and anti-testset outside the timed section,
        # so only model fitting is measured as training time
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()

        # training running time
        training_start = time.time()
        svdpp = SVDpp(n_factors=n_factors, n_epochs=n_epochs)
        svdpp.fit(training)
        elapsedtime_SVDpptrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        svdpp.test(testing)
        elapsedtime_SVDpptest.append(time.time() - test_start)
    return elapsedtime_SVDpptrain, elapsedtime_SVDpptest
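A sketch of how the function might be driven; subsample_users, raw_df, and reader are hypothetical stand-ins for however the project builds its differently sized datasets:

# Hypothetical helper: subsample_users(raw_df, n) keeps only n users
sizes = [1000, 2000, 5000, 10000]
datasets = [Dataset.load_from_df(subsample_users(raw_df, n), reader)
            for n in sizes]
train_times, test_times = svdpp_running_time(datasets)
for n, tr, te in zip(sizes, train_times, test_times):
    print(n, 'users: train %.1fs, predict %.1fs' % (tr, te))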
Example 4
def svdpp(data, training, testing):
    '''
    Tunes SVD++ parameters, then calculates the RMSE, coverage, and running time of SVD++

    Args:
        data(Dataset): the whole dataset divided into 5 folds
        training(Dataset): training dataset
        testing(Dataset): test dataset

    Returns:
        rmse: RMSE of SVD++ with optimized parameters
        top_n: number of unique predictions for top n items
    '''
    # candidate parameters
    param_grid = {'n_factors': [25, 50, 100, 250], 'n_epochs': [10, 20, 30, 40, 50]}

    # optimize parameters with cross-validated grid search
    grid_search = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=5)
    grid_search.fit(data)
    param = grid_search.best_params['rmse']
    print('SVDpp:', param)
    # RMSE against parameters
    result_df = pd.DataFrame.from_dict(grid_search.cv_results)
    result_df.to_csv('data/svdpp_rmse_against_param.csv')


    # fit model using the optimized parameters
    svdpp = SVDpp(n_factors=param['n_factors'], n_epochs=param['n_epochs'])
    svdpp.fit(training)

    # evaluate the model using test data
    predictions = svdpp.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)

    return rmse, top_n
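get_top_n is not defined in this example; a minimal sketch along the lines of the Surprise FAQ recipe (the helper in the original project may differ):

from collections import defaultdict

def get_top_n(predictions, n=5):
    """Return the n highest-estimate items for each user."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n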
Example 5
testset = rating_test2.build_full_trainset().build_testset()

# SVD++ Model

n_factors = [20]  # where default = 20
n_epochs = [5]  # where default = 20
lr_all = [0.007]  # where default = 0.007
reg_all = [0.02]  # where default = 0.02

count = 1

for i in n_factors:
    for j in n_epochs:
        for k in lr_all:
            for m in reg_all:
                start = dt.datetime.today()
                print("================================================")
                algo = SVDpp(n_factors=i, n_epochs=j, lr_all=k, reg_all=m)

                algo.fit(trainset)
                print("This is the #" + str(count) + " parameter combination")
                predictions = algo.test(testset)

                print("n_factors=" + str(i) + ", n_epochs=" + str(j) +
                      ", lr_all=" + str(k) + ", reg_all=" + str(m))
                accuracy.rmse(predictions, verbose=True)
                accuracy.fcp(predictions, verbose=True)
                accuracy.mae(predictions, verbose=True)
                count += 1
                end = dt.datetime.today()
                print("Runtime: " + str(end - start))
Example 6
    dfRatings = pd.read_csv(sys.argv[1])
    dfTest = pd.read_csv(sys.argv[2])

    # Delete unused columns
    del dfRatings['date']
    del dfRatings['train_id']
    del dfTest['date']
    del dfTest['test_id']

    # Set the rating scale and create the data for Surprise to use
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        dfRatings[['user_id', 'business_id', 'rating']], reader)

    train_set = data.build_full_trainset()

    # Use SVD++ with Surprise
    algo = SVDpp()
    algo.fit(train_set)

    with open('SVDOutput.csv', 'w') as f:
        f.write("test_id,rating\n")
        for i in range(len(dfTest)):
            prediction = algo.predict(dfTest.at[i, 'user_id'],
                                      dfTest.at[i, 'business_id'],
                                      r_ui=4,
                                      verbose=True)
            f.write(str(i) + "," + str(prediction.est) + '\n')
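The row-by-row predict loop above can equivalently be phrased as one algo.test call on a list of (user, item, filler) triples, which returns the same estimates in order. A sketch; the filler 4 mirrors the r_ui used above:

    # Batch alternative: predict all test pairs in one call
    batch = [(dfTest.at[i, 'user_id'], dfTest.at[i, 'business_id'], 4)
             for i in range(len(dfTest))]
    predictions = algo.test(batch)
    with open('SVDOutput.csv', 'w') as f:
        f.write("test_id,rating\n")
        for i, p in enumerate(predictions):
            f.write(str(i) + "," + str(p.est) + '\n')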
Example 7
#with open('/Shared/bdagroup7/download/test_set.dat', "rb") as f:
#    test_set = pickle.load(f)
#with open('/Shared/bdagroup7/download/training_set.dat', "rb") as f:
#    training_set = pickle.load(f)

# Learning options

sim_options = {'name': 'cosine', 'min_support': 50, 'user_based': True}
bsl_options = {'method': 'sgd', 'learning_rate': .0005}

# Algorithms (only select one)
#algo = SVD()
#algo = KNNBasic(k=10, min_k=8, sim_options=sim_options)
#algo = KNNWithMeans(k=15, min_k=5, sim_options=sim_options)
#algo = CoClustering()
algo = SVDpp()

algo.fit(training_set)

predictions = algo.test(test_set)

with open('/Shared/bdagroup7/download/predictions_svd_pp.dat', "wb") as f:
    pickle.dump(predictions, f)

# TODO: Ensemble

rmse = accuracy.rmse(predictions, verbose=True)

print("RMSE is:", rmse)
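To decide between the commented-out candidates above, a quick cross-validated comparison is one option. A sketch, assuming the raw Dataset (here called data) is available rather than the pre-split pickles:

from surprise.model_selection import cross_validate

for candidate in (SVD(), KNNWithMeans(k=15, min_k=5, sim_options=sim_options),
                  CoClustering(), SVDpp()):
    scores = cross_validate(candidate, data, measures=['rmse'], cv=3)
    print(type(candidate).__name__, scores['test_rmse'].mean())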
class SurSVDpp:
    def __init__(self, k=5):
        if not isinstance(k, int) or k <= 0:
            raise ValueError("Parameter k should be a positive integer.")
        self.data = None
        self.k = k
        self.algo = SVDpp(n_factors=self.k)
        self.predictions = pd.DataFrame()

    def fit_directly(self, data_long):
        """
        This function directly computes the predictions
        of the algorithm for the data provided. The
        data needs to be in the long shape format. It then
        add to the class attributes the predictions made
        by the algorithm (maintaining the long format)
        :param data_long: pd.DataFrame | DataFrame in the long
                                    shape format
        :return void:
        """
        # Run SVD++
        reader = Reader(rating_scale=(0, 1))
        data = Dataset.load_from_df(data_long, reader)
        trainset = data.build_full_trainset()
        self.algo.fit(trainset)
        testset = trainset.build_anti_testset()
        predictions = self.algo.test(testset)

        # Reconstruct predictions
        users = []
        items = []
        ratings = []
        dataframe = pd.DataFrame()
        for uid, iid, _, est, _ in predictions:
            users.append(uid)
            items.append(iid)
            ratings.append(est)  # store the estimate, not the r_ui fill value

        dataframe["userID"] = users
        dataframe["itemID"] = items
        dataframe["ratings"] = ratings

        self.predictions = dataframe

    def fit(self, rating_matrix):
        """
        Fits the instance to the rating matrix. The index must be
        the users and the columns the items.
        :param rating_matrix: pd.DataFrame | rating matrix
        :return: void
        """
        data_long = rating_matrix.stack().reset_index()
        data_long.columns = ["user_id", "item_id", "ratings"]

        # Run SVD++
        reader = Reader(rating_scale=(0, 1))
        data = Dataset.load_from_df(data_long, reader)
        trainset = data.build_full_trainset()
        self.algo.fit(trainset)
        testset = trainset.build_anti_testset()
        predictions = self.algo.test(testset)

        # Reconstruct predictions
        users = []
        items = []
        ratings = []
        dataframe = pd.DataFrame()
        for uid, iid, _, est, _ in predictions:
            users.append(uid)
            items.append(iid)
            ratings.append(est)  # store the estimate, not the r_ui fill value

        dataframe["userID"] = users
        dataframe["itemID"] = items
        dataframe["ratings"] = ratings
        self.predictions = dataframe

    def predict(self, user, item):
        """
        Predict the probability that input user will like input item
        :param user: int | user ID
        :param item: int | item ID
        :return: float | probability that user likes item
        """
        cond1 = self.predictions["userID"] == user
        cond2 = self.predictions["itemID"] == item
        mask = cond1 & cond2
        temp = np.array(self.predictions.loc[mask, "ratings"])
        proba = np.sum(temp)
        return proba
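A short usage sketch for the class; the toy matrix and IDs are made up, and the 0/1 values match the Reader(rating_scale=(0, 1)) used in fit:

import numpy as np
import pandas as pd

# Toy 0/1 rating matrix: rows are users, columns are items; NaN = unobserved
matrix = pd.DataFrame([[1, 0, np.nan],
                       [np.nan, 1, 1],
                       [1, np.nan, 0]],
                      index=[1, 2, 3], columns=[10, 20, 30])
model = SurSVDpp(k=5)
model.fit(matrix)            # predicts every unobserved (user, item) pair
print(model.predict(1, 30))  # estimated probability that user 1 likes item 30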