Example #1
import numpy as np
import torch
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel

def train_spotlight_models(train, test, dataset_testing, embedding_dims, n_iters, batch_sizes, l2s, learning_rates, is_save=False):
    """
    Takes train, test and dataset_testing datasets as spotlight Interactions.
    Trains multiple Spotlight models using ExplicitFactorizationModel, one per
    combination of the parameters given in embedding_dims, n_iters, batch_sizes,
    l2s and learning_rates.
    Returns the predictions on the train, test and dataset_testing datasets as
    well as the RMSE on train and test.
    """
    
    # initialize train_rmses and test_rmses; these store the RMSE on the train and test sets
    train_rmses = np.array([])
    test_rmses = np.array([])
    # initialize preds_train_trains, preds_train_tests, preds_tests; these store the models' predictions on the train, test and dataset_testing datasets
    preds_train_trains = []
    preds_train_tests = []
    preds_tests = []
    
    # traverse all parameter combinations:
    # embedding_dim, n_iter, batch_size, l2 regularization, learning_rate
    for embedding_dim in embedding_dims:
        for n_iter in n_iters:
            for batch_size in batch_sizes:
                for l2 in l2s:
                    for learning_rate in learning_rates:
                        # initialize the model with these parameters; use the GPU if torch.cuda.is_available() returns True, otherwise use the CPU
                        model = ExplicitFactorizationModel(loss='regression',
                                                           embedding_dim=embedding_dim,  # latent dimensionality
                                                           n_iter=n_iter,  # number of epochs of training
                                                           batch_size=batch_size,  # minibatch size
                                                           l2=l2,  # strength of L2 regularization
                                                           learning_rate=learning_rate,
                                                           use_cuda=torch.cuda.is_available())
                        
                        # print which model is being trained
                        print("embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))
                        # fit model
                        model.fit(train, verbose=True)
                        # find rmse on train
                        train_rmse = rmse_score(model, train)
                        # find rmse on test
                        test_rmse = rmse_score(model, test)
                        # store rmse on train and test sets
                        train_rmses = np.append(train_rmses, train_rmse)
                        test_rmses = np.append(test_rmses, test_rmse)   
                        # print train and test rmses
                        print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
                        # if is_save is True, save the model to disk
                        if is_save:
                            torch.save(model, "models/embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))
                        # compute predictions on the train, test and dataset_testing datasets
                        preds_train_train = model.predict(train.user_ids, train.item_ids)
                        preds_train_test = model.predict(test.user_ids, test.item_ids)
                        preds_test = model.predict(dataset_testing.user_ids, dataset_testing.item_ids)
                        # store those predictions
                        preds_train_trains.append(preds_train_train)
                        preds_train_tests.append(preds_train_test)
                        preds_tests.append(preds_test)
    
    # return stored predictions on train, test, dataset_testing; return rmses on train and test
    return preds_train_trains, preds_train_tests, preds_tests, train_rmses, test_rmses
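A minimal usage sketch for train_spotlight_models, assuming train, test and dataset_testing are already built as spotlight Interactions objects; the parameter grids here are illustrative, not from the original source:

preds_train_trains, preds_train_tests, preds_tests, train_rmses, test_rmses = \
    train_spotlight_models(train, test, dataset_testing,
                           embedding_dims=[32, 64],
                           n_iters=[10],
                           batch_sizes=[256],
                           l2s=[1e-9],
                           learning_rates=[1e-3, 1e-2])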
Example #2
import torch
from spotlight.factorization.explicit import ExplicitFactorizationModel

def train_spotlight_models_using_all_data(train, dataset_testing, embedding_dims, n_iters, batch_sizes, l2s, learning_rates, verbose=True):
    """
    Takes the train dataset as spotlight Interactions.
    Trains multiple Spotlight models using ExplicitFactorizationModel, one per
    combination of the parameters given in embedding_dims, n_iters, batch_sizes,
    l2s and learning_rates.
    Saves each trained model to disk and returns the predictions on dataset_testing.
    """
    
    # store predictions on dataset_testing
    preds_tests = []

    # traverse all parameter combinations:
    # embedding_dim, n_iter, batch_size, l2 regularization, learning_rate
    for embedding_dim in embedding_dims:
        for n_iter in n_iters:
            for batch_size in batch_sizes:
                for l2 in l2s:
                    for learning_rate in learning_rates:
                        # initialize model
                        model = ExplicitFactorizationModel(loss='regression',
                                                           embedding_dim=embedding_dim,  # latent dimensionality
                                                           n_iter=n_iter,  # number of epochs of training
                                                           batch_size=batch_size,  # minibatch size
                                                           l2=l2,  # strength of L2 regularization
                                                           learning_rate=learning_rate,
                                                           use_cuda=torch.cuda.is_available())
                        
                        # print the parameter combination if verbose is True
                        if verbose:
                            print("embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))
                        # fit model on the train dataset
                        model.fit(train, verbose=verbose)
                        # compute predictions on dataset_testing
                        preds_test = model.predict(dataset_testing.user_ids, dataset_testing.item_ids)
                        preds_tests.append(preds_test)
                        # save model to disk
                        torch.save(model, "models_all_data/embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))

    # return stored predictions on dataset_testing
    return preds_tests
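A model written with torch.save above can be reloaded later without retraining, as long as the Spotlight classes are importable; a minimal sketch, where the path is a hypothetical parameter combination formatted the same way as at save time:

import torch

model = torch.load('models_all_data/embedding_dim=64, n_iter=10, batch_size=256, l2=1e-09, learning_rate=0.001')
preds = model.predict(dataset_testing.user_ids, dataset_testing.item_ids)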
Example #3
import numpy as np
import torch
from spotlight.factorization.explicit import ExplicitFactorizationModel

# `dataset` is assumed to be a spotlight Interactions object built earlier in
# the source (see the loader sketch after this example)
model = ExplicitFactorizationModel(
    loss='regression',
    embedding_dim=5,  # latent dimensionality
    n_iter=10,  # number of epochs of training
    batch_size=256,  # minibatch size
    l2=1e-9,  # strength of L2 regularization
    learning_rate=2e-2,
    use_cuda=torch.cuda.is_available())
# model = ImplicitFactorizationModel(loss='bpr',
#                                    embedding_dim=128,  # latent dimensionality
#                                    n_iter=10,  # number of epochs of training
#                                    batch_size=256,  # minibatch size
#                                    l2=1e-9,  # strength of L2 regularization
#                                    learning_rate=1e-2,
#                                    use_cuda=torch.cuda.is_available())
from spotlight.cross_validation import random_train_test_split

train, test = random_train_test_split(dataset,
                                      random_state=np.random.RandomState(42))

print('Split into \n {} and \n {}.'.format(train, test))
model.fit(train, verbose=True)
torch.save(model, 'spotlight.model')

from spotlight.evaluation import rmse_score

train_rmse = rmse_score(model, train)
test_rmse = rmse_score(model, test)

print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
predictions = model.predict(test.user_ids, test.item_ids)
print(((predictions > 0.5) == (test.ratings > 0)).sum() / len(predictions))
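The `dataset` split above is assumed to be a spotlight Interactions object; a minimal sketch of how it is typically obtained, assuming the MovieLens 100K variant:

from spotlight.datasets.movielens import get_movielens_dataset

dataset = get_movielens_dataset(variant='100K')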
Example #4
import numpy as np
import torch
from spotlight.factorization.explicit import ExplicitFactorizationModel
from spotlight.interactions import Interactions

# the matching train file is assumed to sit next to the test file
train = np.load('data/loocv_train.npz')
test = np.load('data/loocv_test.npz')
train_feat = train['train_feat'].astype('int64')
train_scor = train['train_scor'][:, None].astype('float32')
test_feat = test['test_feat'].astype('int64')
test_scor = test['test_scor'][:, None].astype('float32')

model = ExplicitFactorizationModel(
    loss='regression',
    embedding_dim=64,  # latent dimensionality
    n_iter=20,  # number of epochs of training
    batch_size=1024 * 4,  # minibatch size
    l2=1e-9,  # strength of L2 regularization
    learning_rate=1e-3,
    use_cuda=torch.cuda.is_available())


def features(feat, scor):
    # split a (user, item) feature array and a score column into 1-D arrays
    user = feat[:, 0].astype('int64')
    item = feat[:, 1].astype('int64')
    y = scor[:, 0].astype('float32')
    return user, item, y


train = Interactions(*features(train_feat, train_scor))
test_user, test_item, test_y = features(test_feat, test_scor)
model.fit(train, verbose=True)
pred_y = model.predict(test_user, test_item)

rmse = np.sqrt(((pred_y - test_y)**2.0).mean())
print(rmse)
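The hand-rolled RMSE above should agree with Spotlight's own metric; a quick cross-check, reusing the arrays returned by features to build a test-side Interactions object:

from spotlight.evaluation import rmse_score
from spotlight.interactions import Interactions

test_interactions = Interactions(test_user, test_item, ratings=test_y)
print(rmse_score(model, test_interactions))  # should match the manual computation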
Example #5
import logging
from datetime import datetime

import numpy as np

# `movie_ind`, `user_ind`, `ratings_df` (pandas data frames) and a fitted
# `model` are assumed to be defined earlier in the source
log = logging.getLogger(__name__)

full_movies = movie_ind.movie_int.unique()
recommendations = []
# Convert datetime to string to ensure serialization success
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
batch_count = 0

for device, user_row in user_ind.iterrows():
    # Get list of all movies this user voted on
    log.info("Generating recommendations for user {}".format(device))
    user = user_row.user_int
    user_votes = ratings_df[ratings_df.user_int == user].movie_int.unique()
    # Calculate difference in the two lists - rate those movies only
    m = np.setdiff1d(full_movies, user_votes)
    user_rank = 0
    # for each movie and prediction for a given user, create a recommendation row
    for movie, pred in zip(m, model.predict(user_ids=user, item_ids=m)):
        batch_count += 1
        user_rank += 1
        log.debug('...movie {}'.format(user_rank))
        # For each prediction, make a recommendation row
        recommendations.append({
            'user_id': device,
            'rank': user_rank,
            'movie_id': movie_ind[movie_ind.movie_int == movie].index[0],
            'pred_rating': float(pred),
            'pred_time': timestamp,
        })
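The snippet ends after assembling the list of rows; a minimal continuation sketch, assuming pandas is available downstream (the output path is hypothetical):

import pandas as pd

recommendations_df = pd.DataFrame(recommendations)
recommendations_df.to_csv('recommendations.csv', index=False)  # hypothetical output path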
Example #6
import numpy as np
from spotlight.cross_validation import random_train_test_split
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel

# `dataSplit` and `stopTraining` are helper functions assumed to be defined
# elsewhere in the source (hedged sketches follow after this example)
def trainModelUntilOverfit(dataset, modelSteps, modelIterations,
                           numberDataSplits, embedding_dim, learning_rate):

    numUsers = dataset.num_users
    numMovies = dataset.num_items
    train, test = random_train_test_split(dataset, 0.2)

    print('Split into \n {} and \n {}.'.format(train, test))

    #add random seed
    seed = np.random.RandomState(seed=55555)
    model = ExplicitFactorizationModel(n_iter=modelIterations,
                                       embedding_dim=embedding_dim,
                                       learning_rate=learning_rate,
                                       random_state=seed)

    rmseResults = np.empty((modelSteps * numberDataSplits, 2))
    indexPreviousClosest = ["0"]

    if (numberDataSplits > 1):
        arraySplits = dataSplit(train, numberDataSplits)
        print("Data set split into", len(arraySplits), "*", (arraySplits[1]))
    # Each model step fits the entire dataset
    arrayOfSteps = []
    splitCounter = 0
    fullStepCounter = 0  # increases each time the entire data set has been visited
    currentStep = 0  # increases at every split of the data set (does not reset)
    for i in range(modelSteps * numberDataSplits):
        print("\nStarting step", fullStepCounter)
        print("Data split", splitCounter)
        if (numberDataSplits == 1):
            model.fit(train, verbose=True)
        elif (numberDataSplits > 1):
            print(arraySplits[splitCounter])
            model.fit(arraySplits[splitCounter], verbose=True)

        else:
            print("Invalid number of data splits")
            break

        #predictions for any user are made for all items, matrix has shape (944, 1683)
        modelPredict = np.empty((numUsers, numMovies))
        for userIndex in range(numUsers):
            modelPredict[userIndex, :] = model.predict(userIndex)

        # We take the transpose for tsne formatting (should be more rows than columns)
        modelPredict = modelPredict.T

        # Measure the model's effectiveness (how good its predictions are):
        rmseTrain = rmse_score(model, train)
        rmseTest = rmse_score(model, test)
        print("RMSE TEST:", rmseTest, "\n")
        rmseResults[i, :] = [rmseTrain, rmseTest]
        arrayOfSteps += [i]

        if (stopTraining(rmseResults, arrayOfSteps)):
            rmseResults = rmseResults[:len(arrayOfSteps)]
            break

        if (numberDataSplits > 1):
            splitCounter += 1
            if (splitCounter >= len(arraySplits)):
                splitCounter = 0
                fullStepCounter += 1

        currentStep += 1

    return (model, rmseResults)
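dataSplit and stopTraining are called above but not shown; minimal sketches under stated assumptions: dataSplit is taken to partition an Interactions object into consecutive, roughly equal chunks, and stopTraining to signal once the test RMSE has risen for a few consecutive steps:

import numpy as np
from spotlight.interactions import Interactions

def dataSplit(interactions, numberDataSplits):
    # hypothetical helper: split an Interactions object into consecutive chunks
    indices = np.array_split(np.arange(len(interactions.user_ids)), numberDataSplits)
    return [Interactions(interactions.user_ids[idx],
                         interactions.item_ids[idx],
                         ratings=interactions.ratings[idx],
                         num_users=interactions.num_users,
                         num_items=interactions.num_items)
            for idx in indices]

def stopTraining(rmseResults, arrayOfSteps, patience=3):
    # hypothetical helper: stop once test RMSE (column 1) has risen `patience` steps in a row
    testRmse = rmseResults[np.asarray(arrayOfSteps), 1]
    if len(testRmse) <= patience:
        return False
    return bool(np.all(np.diff(testRmse[-(patience + 1):]) > 0))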
Example #7
import numpy as np

# `train` and `test` (spotlight Interactions), `ratings` (a user x item rating
# matrix) and SEQ_LEN are assumed to be defined earlier in the source;
# ExplicitSequenceModel appears to be a class from the example's own code base
train_seq = train.to_sequence(SEQ_LEN)
test_seq = test.to_sequence(SEQ_LEN)

model = ExplicitSequenceModel(n_iter=30, representation='lstm', batch_size=1)
model.fit(train_seq, ratings, verbose=True)

# evaluate one training sequence against the known ratings
SEQ_ID = 0
user_batch = train_seq.user_ids[SEQ_ID]
item_batch = train_seq.sequences[SEQ_ID]
print('seq', item_batch)
item_batch = np.trim_zeros(item_batch)

truth = np.array([
    ratings[u, i] for u, i in np.broadcast(user_batch, item_batch)
]).reshape(1, -1)
pred = model.predict(item_batch, truth)
print(pred)
print(truth)

# repeat for the corresponding test sequence
user_batch = test_seq.user_ids[SEQ_ID]
item_batch = test_seq.sequences[SEQ_ID]
print('seq', item_batch)
item_batch = np.trim_zeros(item_batch)

truth = np.array([
    ratings[u, i] for u, i in np.broadcast(user_batch, item_batch)
]).reshape(1, -1)
pred = model.predict(item_batch, truth)
print(pred)
print(truth)