Esempio n. 1
0
def train_initial_model():
    """Train an implicit factorization model on MovieLens 100K and persist it.

    Fits an ImplicitFactorizationModel on a reproducible 42-seeded split,
    pickles the model to 'models/filmclub.model' and the dataset (with an
    inflated num_users) to 'data/dataset.pkl', then prints train/test RMSE.
    """
    dataset = get_movielens_dataset(variant='100K')

    # Fixed seed so the train/test split is reproducible across runs.
    train, test = random_train_test_split(dataset, random_state=np.random.RandomState(42))

    model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                       embedding_dim=128,  # latent dimensionality
                                       n_iter=10,  # number of epochs of training
                                       batch_size=1024,  # minibatch size
                                       l2=1e-9,  # strength of L2 regularization
                                       learning_rate=1e-3,
                                       use_cuda=torch.cuda.is_available())

    print('Fitting the model')

    model.fit(train, verbose=True)
    print(type(model))

    # Context managers guarantee the files are closed even if pickling fails
    # (the original open/close pairs leaked the handle on an exception).
    with open('models/filmclub.model', 'wb') as model_file:
        pickle.dump(model, model_file)

    # num_users is inflated before saving, presumably so new users can be
    # appended later without re-indexing -- TODO confirm downstream assumption.
    dataset.num_users = 1000000

    with open('data/dataset.pkl', 'wb') as dataset_file:
        pickle.dump(dataset, dataset_file)

    train_rmse = rmse_score(model, train)
    test_rmse = rmse_score(model, test)

    print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
Esempio n. 2
0
def train_spotlight_models(train, test, dataset_testing, embedding_dims, n_iters, batch_sizes, l2s, learning_rates, is_save = False):
    """
    Train one ExplicitFactorizationModel per hyper-parameter combination.

    Takes train, test, dataset_testing datasets as spotlight.interactions.
    Hyper-parameter grids are given in embedding_dims, n_iters, batch_sizes,
    l2s, learning_rates; every combination is trained.

    Returns
    -------
    (preds_train_trains, preds_train_tests, preds_tests,
     train_rmses, test_rmses):
        lists of per-model predictions on train / test / dataset_testing,
        plus numpy arrays of the per-model RMSE on train and test.
    """
    from itertools import product  # stdlib; flattens five nested loops

    # Accumulate in Python lists: np.append re-allocates the whole array on
    # every call, which is quadratic over the grid.
    train_rmse_list = []
    test_rmse_list = []
    preds_train_trains = []
    preds_train_tests = []
    preds_tests = []

    # traverse all parameter combinations in the same order as the original
    # nested loops: embedding_dim, n_iter, batch_size, l2, learning_rate
    for embedding_dim, n_iter, batch_size, l2, learning_rate in product(
            embedding_dims, n_iters, batch_sizes, l2s, learning_rates):
        # initialize model; use GPU if torch.cuda.is_available(), else CPU
        model = ExplicitFactorizationModel(loss='regression',
                                           embedding_dim=embedding_dim,  # latent dimensionality
                                           n_iter=n_iter,  # number of epochs of training
                                           batch_size=batch_size,  # minibatch size
                                           l2=l2,  # strength of L2 regularization
                                           learning_rate=learning_rate,
                                           use_cuda=torch.cuda.is_available())

        # report which combination is being trained
        print("embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))
        model.fit(train, verbose=True)

        # evaluate and record rmse on train and test
        train_rmse = rmse_score(model, train)
        test_rmse = rmse_score(model, test)
        train_rmse_list.append(train_rmse)
        test_rmse_list.append(test_rmse)
        print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))

        # optionally persist the model to disk
        if is_save:
            torch.save(model, "models/embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))

        # store predictions on train, test and dataset_testing datasets
        preds_train_trains.append(model.predict(train.user_ids, train.item_ids))
        preds_train_tests.append(model.predict(test.user_ids, test.item_ids))
        preds_tests.append(model.predict(dataset_testing.user_ids, dataset_testing.item_ids))

    # convert once at the end to preserve the original numpy-array return type
    return (preds_train_trains, preds_train_tests, preds_tests,
            np.asarray(train_rmse_list), np.asarray(test_rmse_list))
Esempio n. 3
0
    def fit(self, num_epochs, report_int=1):
        """Run `num_epochs` MCMC steps, reporting RMSE every `report_int` steps.

        Parameters
        ----------
        num_epochs : int
            Number of MCMC steps to run.
        report_int : int, optional
            Evaluate and print train/test RMSE every this many steps.

        Returns
        -------
        (rmse_train, rmse_test) from the most recent reporting step, or
        (None, None) if no step was evaluated (e.g. num_epochs == 0).
        """
        # Bug fix: previously these were only bound inside the reporting
        # branch, so num_epochs == 0 raised UnboundLocalError on return.
        rmse_train = rmse_test = None
        for t in range(num_epochs):
            self.step_mcmc()

            ## REPORTING ###
            if t % report_int == 0:
                rmse_train = rmse_score(self, self.train)
                rmse_test = rmse_score(self, self.test)
                print(
                    f'step: {t} \t rmse train: {rmse_train:.2f}, test: {rmse_test:.2f}'
                )
        return rmse_train, rmse_test
Esempio n. 4
0
def test_bloom(compression_ratio, expected_rmse):
    """Fit an explicit model with Bloom-compressed embeddings and check that
    its test RMSE stays within EPSILON of the expected value."""
    data = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(data, random_state=RANDOM_STATE)

    def make_embedding(num_entities):
        # Users and items share the same embedding configuration.
        return BloomEmbedding(num_entities,
                              32,
                              compression_ratio=compression_ratio,
                              num_hash_functions=2)

    network = BilinearNet(data.num_users,
                          data.num_items,
                          user_embedding_layer=make_embedding(data.num_users),
                          item_embedding_layer=make_embedding(data.num_items))

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-5,
                                       representation=network,
                                       use_cuda=CUDA)
    model.fit(train)
    print(model)

    rmse = rmse_score(model, test)
    print(rmse)

    assert rmse - EPSILON < expected_rmse
Esempio n. 5
0
def test_explicit_serialization(data):
    """Round-trip an explicit model through serialization and verify the
    reloaded copy scores identically on the held-out set."""
    train, test = data

    hyperparams = dict(loss='regression',
                       n_iter=3,
                       batch_size=1024,
                       learning_rate=1e-3,
                       l2=1e-5,
                       use_cuda=CUDA)
    model = ExplicitFactorizationModel(**hyperparams)
    model.fit(train)

    # Equal RMSE before and after reload implies the parameters survived
    # the serialization round trip.
    assert rmse_score(model, test) == rmse_score(_reload(model), test)
Esempio n. 6
0
def best_params_spotlight(losses,
                          n_iters,
                          batch_sizes,
                          l2s,
                          learning_rates,
                          embedding_dims,
                          train_data,
                          t=None):
    """Grid-search ExplicitFactorizationModel hyper-parameters.

    Every combination of loss, n_iter, batch_size, l2, learning_rate and
    embedding_dim is trained on a fixed 42-seeded split of `train_data` and
    scored on the held-out part, timing each run with `t`.

    Returns
    -------
    rmses : dict
        Maps each achieved test RMSE to the parameter dict (including the
        fitted model under key 'model') that produced it.
    """
    from itertools import product  # stdlib; same order as the nested loops

    # Bug fix: the default was `t=Timer()`, a mutable default argument whose
    # state would be shared across calls.
    if t is None:
        t = Timer()

    rmses = dict()
    t.start()
    for loss, n_iter, batch_size, l2, learning_rate, embedding_dim in product(
            losses, n_iters, batch_sizes, l2s, learning_rates, embedding_dims):
        # Bug fix: build a fresh params dict per combination. Previously one
        # shared dict was mutated and stored, so every rmses entry aliased
        # the final combination's parameters.
        params = {
            'loss': loss,
            'n_iter': n_iter,
            'batch_size': batch_size,
            'l2': l2,
            'learning_rate': learning_rate,
            'embedding_dim': embedding_dim,
        }

        model = ExplicitFactorizationModel(
            loss=loss,  # bug fix: was hard-coded to 'regression', ignoring `losses`
            embedding_dim=embedding_dim,  # latent dimensionality
            n_iter=n_iter,  # number of epochs of training
            batch_size=batch_size,  # minibatch size
            l2=l2,  # strength of L2 regularization
            learning_rate=learning_rate,
            use_cuda=torch.cuda.is_available())

        params['model'] = model

        train_tr_data, test_tr_data = random_train_test_split(
            train_data,
            random_state=np.random.RandomState(42))

        model.fit(train_tr_data, verbose=True)

        rmse = rmse_score(model, test_tr_data)

        rmses[rmse] = params
        print(
            "-----------Time: {}, Loss: {}, n_iter: {}, l2: {}, batch_size: {}, learning_rate: {}, embedding_dim: {}, rmse: {}-------------\n\n"
            .format(t.stop(), loss, n_iter, l2, batch_size,
                    learning_rate, embedding_dim, rmse))
        # restart timer for the next combination
        t.start()

    # Previously the collected results were discarded; return them.
    return rmses
Esempio n. 7
0
def load_spotlight_models(train, test, dataset_testing, verbose=False):
    """
    Load pretrained spotlight models from the "models" folder.

    Takes train, test datasets and dataset_testing; for each saved model it
    computes RMSE on train/test and predictions on all three datasets.

    Returns
    -------
    (preds_train_trains, preds_train_tests, preds_tests,
     train_rmses, test_rmses):
        lists of per-model predictions plus numpy arrays of RMSEs.
    """
    # predictions on train, test and dataset_testing datasets
    preds_train_trains = []
    preds_train_tests = []
    preds_tests = []
    # Accumulate RMSEs in Python lists: np.append re-allocates the whole
    # array on every call (quadratic over the number of models).
    train_rmse_list = []
    test_rmse_list = []

    # for each file in the "models" folder in the directory
    for file in glob.glob("models/*"):
        # print filename, if verbose
        if verbose:
            print(file)
        # NOTE(review): torch.load unpickles arbitrary objects and can execute
        # code -- only load model files from a trusted source.
        model = torch.load(file)
        # calculate and store rmses on train and test datasets
        train_rmse_list.append(rmse_score(model, train))
        test_rmse_list.append(rmse_score(model, test))
        # make and store predictions on train, test and dataset_testing
        preds_train_trains.append(model.predict(train.user_ids, train.item_ids))
        preds_train_tests.append(model.predict(test.user_ids, test.item_ids))
        preds_tests.append(model.predict(dataset_testing.user_ids, dataset_testing.item_ids))

    # convert once at the end to preserve the original numpy-array return type
    return (preds_train_trains, preds_train_tests, preds_tests,
            np.asarray(train_rmse_list), np.asarray(test_rmse_list))
    def resultados_factorizacion_explicito(self):
        """
        Compute and print the metrics of the explicit factorization model.

        Reads the module-level `train`, `test` and `modelo` globals, computes
        RMSE, mean MRR and precision/recall@10, then prints them via
        `imprimir_resultados_dl`.

        This method is only used in the text interface.
        """
        
        global train, test, modelo
        
        # Compute the metrics (assumes `modelo`, `train`, `test` were set up
        # elsewhere in the module -- TODO confirm against the caller).
        rmse = rmse_score(modelo, test)
        mrr = mrr_score(modelo, test, train=train).mean()
        precision, recall = precision_recall_score(modelo, test, train=train, k=10)
        
        # Print the metrics
        imprimir_resultados_dl(mrr, precision.mean(), recall.mean(), rmse)
Esempio n. 9
0
def build_model(data, loss, embedding_dim, n_iter, batch_size, l2,
                learning_rate, **kwargs):
    """Fit an ExplicitFactorizationModel with the given hyper-parameters on a
    fixed 42-seeded split of `data` and return the held-out RMSE."""
    model = ExplicitFactorizationModel(
        loss=loss,
        embedding_dim=embedding_dim,  # latent dimensionality
        n_iter=n_iter,                # number of epochs of training
        batch_size=batch_size,        # minibatch size
        l2=l2,                        # strength of L2 regularization
        learning_rate=learning_rate,
        use_cuda=torch.cuda.is_available())

    # Deterministic split so repeated calls evaluate on the same hold-out.
    fit_part, eval_part = random_train_test_split(
        data, random_state=np.random.RandomState(42))
    model.fit(fit_part, verbose=True)
    return rmse_score(model, eval_part)
Esempio n. 10
0
def test_poisson():
    """Poisson-loss explicit factorization should reach RMSE below 1.0."""
    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='poisson',
                                       batch_size=1024,
                                       n_iter=10,
                                       l2=1e-6,
                                       learning_rate=1e-3)
    model.fit(train)

    assert rmse_score(model, test) < 1.0
Esempio n. 11
0
def test_poisson():
    """Poisson-loss explicit factorization should reach RMSE below 1.0.

    NOTE(review): this is a byte-for-byte duplicate of the test_poisson
    defined earlier in this file; this later definition shadows the former.
    """
    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    config = dict(loss='poisson',
                  n_iter=10,
                  batch_size=1024,
                  learning_rate=1e-3,
                  l2=1e-6)
    model = ExplicitFactorizationModel(**config)
    model.fit(train)

    rmse = rmse_score(model, test)

    assert rmse < 1.0
    def obtener_metricas_gui(self):
        """
        Compute the metrics of the chosen model.

        Depending on `self.opcion_modelo` (1 = explicit model with RMSE,
        2 = implicit/ranking model, otherwise a sequence model), the relevant
        metrics are computed, saved to a .csv via `guardar_resultados`, and
        returned for display.

        This method is only used in the web interface.

        Returns
        -------

        metricas_devueltas: dict
            dictionary with the model's metrics, formatted to 4 decimals
        """

        global train, test, modelo

        # NOTE(review): `metricas` is assigned but never used below.
        metricas = dict()

        # Compute the metrics and store them, formatted, in dictionaries:
        # one for returning to the GUI, one (list-valued) for the .csv writer.
        if self.opcion_modelo == 1:
            rmse = rmse_score(modelo, test)
            mrr = mrr_score(modelo, test, train=train).mean()
            precision, recall = precision_recall_score(modelo, test, train=train, k=10)
            metricas_devueltas = {"RMSE": format(rmse, '.4f'), "MRR": format(mrr, '.4f'), "Precisión k": format(precision.mean(), '.4f'), "Recall k": format(recall.mean(), '.4f')}
            metricas_a_guardar = {"RMSE": [format(rmse, '.4f')], "MRR": [format(mrr, '.4f')], "Precisión k": [format(precision.mean(), '.4f')], "Recall k": [format(recall.mean(), '.4f')]}
        elif self.opcion_modelo == 2:
            mrr = mrr_score(modelo, test, train=train).mean()
            precision, recall = precision_recall_score(modelo, test, train=train, k=10)
            metricas_devueltas = {"MRR": format(mrr, '.4f'), "Precisión k": format(precision.mean(), '.4f'), "Recall k": format(recall.mean(), '.4f')}
            metricas_a_guardar = {"MRR": [format(mrr, '.4f')], "Precisión k": [format(precision.mean(), '.4f')], "Recall k": [format(recall.mean(), '.4f')]}
        else:
            mrr = sequence_mrr_score(modelo, test).mean()
            metricas_devueltas = {"MRR": format(mrr, '.4f')}
            metricas_a_guardar = {"MRR": [format(mrr, '.4f')]}
        
        # Save the metrics to a .csv file
        guardar_resultados(metricas_a_guardar)

        return metricas_devueltas
Esempio n. 13
0
def test_logistic():
    """Logistic loss on binarized (+1/-1) MovieLens ratings stays under
    1.05 RMSE (within EPSILON)."""
    interactions = movielens.get_movielens_dataset('100K')

    # Binarize ratings (above 3 -> 1.0, else 0.0), then map (0, 1) to (-1, 1).
    binary = (interactions.ratings > 3).astype(np.float32)
    interactions.ratings = binary * 2 - 1

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='logistic',
                                       use_cuda=CUDA,
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train)

    assert rmse_score(model, test) - EPSILON < 1.05
Esempio n. 14
0
from spotlight.interactions import Interactions

# Build a Spotlight Interactions dataset from `lite` -- presumably a
# dataframe-like object defined earlier in the file (not visible here);
# TODO confirm its columns.
user_ids = np.array(lite['user']).astype(np.int32)
item_ids = np.array(lite['item']).astype(np.int32)
ratings = np.array(lite['rating']).astype(np.float32)
times = np.array(lite['time']).astype(np.int32)
dataset = Interactions(user_ids, item_ids, ratings, times)

# Prepare train test
train, test = user_based_train_test_split(dataset)
# train, test = random_train_test_split(dataset)

# Test baseline
model = ExplicitFactorizationModel(n_iter=20)
model.fit(train, verbose=True)
print('RMSE', rmse_score(model, test))

from scipy.sparse import coo_matrix

# Sparse (num_users x num_items) ratings matrix in CSR form.
ratings = coo_matrix((dataset.ratings, (dataset.user_ids, dataset.item_ids)),
                     shape=(dataset.num_users, dataset.num_items)).tocsr()

# SEQ_LEN is presumably defined earlier in the file -- TODO confirm.
train_seq = train.to_sequence(SEQ_LEN)
test_seq = test.to_sequence(SEQ_LEN)

model = ExplicitSequenceModel(n_iter=30, representation='lstm', batch_size=1)
model.fit(train_seq, ratings, verbose=True)

# Select a single training sequence to inspect.
SEQ_ID = 0
user_batch = train_seq.user_ids[SEQ_ID]
item_batch = train_seq.sequences[SEQ_ID]
Esempio n. 15
0
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel

# Minimal end-to-end example: load MovieLens 100K, fit an explicit
# factorization model for a single epoch, and print the held-out RMSE.
dataset = get_movielens_dataset(variant='100K')

# No fixed random_state, so the split differs between runs.
train, test = random_train_test_split(dataset)

model = ExplicitFactorizationModel(n_iter=1)
model.fit(train)

rmse = rmse_score(model, test)

print(rmse)
Esempio n. 16
0
def trainModelUntilOverfit(dataset, modelSteps, modelIterations,
                           numberDataSplits, embedding_dim, learning_rate):
    """Train an ExplicitFactorizationModel step by step until overfitting.

    The training set is optionally partitioned into `numberDataSplits` chunks
    fitted in round-robin order; after every fit the train/test RMSE is
    recorded and `stopTraining` decides whether to stop early.

    Returns
    -------
    (model, rmseResults):
        the trained model and an array of [train_rmse, test_rmse] rows,
        one per completed step.
    """
    numUsers = dataset.num_users
    numMovies = dataset.num_items
    train, test = random_train_test_split(dataset, 0.2)

    print('Split into \n {} and \n {}.'.format(train, test))

    # Fixed seed makes model initialization reproducible.
    seed = np.random.RandomState(seed=55555)
    model = ExplicitFactorizationModel(n_iter=modelIterations,
                                       embedding_dim=embedding_dim,
                                       learning_rate=learning_rate,
                                       random_state=seed)

    rmseResults = np.empty((modelSteps * numberDataSplits, 2))

    if (numberDataSplits > 1):
        arraySplits = dataSplit(train, numberDataSplits)
        print("Data set split into", len(arraySplits), "*", (arraySplits[1]))

    # (Removed from the original: unused locals `indexPreviousClosest` and
    # `currentStep`, and a duplicate rmse_score(model, test) call per step.)
    arrayOfSteps = []
    splitCounter = 0
    fullStepCounter = 0  # increases each time the entire data set has been visited
    for i in range(modelSteps * numberDataSplits):
        print("\nStarting step", fullStepCounter)
        print("Data split", splitCounter)
        if (numberDataSplits == 1):
            model.fit(train, verbose=True)
        elif (numberDataSplits > 1):
            print(arraySplits[splitCounter])
            model.fit(arraySplits[splitCounter], verbose=True)
        else:
            print("Invalid number of data splits")
            break

        # Predictions for every user are made for all items.
        # NOTE(review): modelPredict is never used afterwards; kept because
        # removing the predict() calls could not be verified side-effect-free
        # from this file alone -- consider deleting.
        modelPredict = np.empty((numUsers, numMovies))
        for userIndex in range(numUsers):
            modelPredict[userIndex, :] = model.predict(userIndex)

        # Transpose for tsne formatting (should be more rows than columns).
        modelPredict = modelPredict.T

        # Measure the model's effectiveness (how good predictions are).
        rmseTrain = rmse_score(model, train)
        rmseTest = rmse_score(model, test)
        print("RMSE TEST:", rmseTest, "\n")
        rmseResults[i, :] = [rmseTrain, rmseTest]
        arrayOfSteps += [i]

        # Early stop once the external criterion says we are overfitting;
        # trim rmseResults to the steps actually completed.
        if (stopTraining(rmseResults, arrayOfSteps)):
            rmseResults = rmseResults[:len(arrayOfSteps)]
            break

        if (numberDataSplits > 1):
            splitCounter += 1
            if (splitCounter >= len(arraySplits)):
                splitCounter = 0
                fullStepCounter += 1

    return (model, rmseResults)
Esempio n. 17
0
    # NOTE(review): this fragment begins mid-way through a model constructor
    # call whose opening line is outside this chunk.
    embedding_dim=5,  # latent dimensionality
    n_iter=10,  # number of epochs of training
    batch_size=256,  # minibatch size
    l2=1e-9,  # strength of L2 regularization
    learning_rate=2e-2,
    use_cuda=torch.cuda.is_available())
# Alternative implicit-feedback configuration, kept commented out:
# model = ImplicitFactorizationModel(loss='bpr',
#                                    embedding_dim=128,  # latent dimensionality
#                                    n_iter=10,  # number of epochs of training
#                                    batch_size=256,  # minibatch size
#                                    l2=1e-9,  # strength of L2 regularization
#                                    learning_rate=1e-2,
#                                    use_cuda=torch.cuda.is_available())
from spotlight.cross_validation import random_train_test_split

# Reproducible split via a fixed seed.
train, test = random_train_test_split(dataset,
                                      random_state=np.random.RandomState(42))

print('Split into \n {} and \n {}.'.format(train, test))
model.fit(train, verbose=True)
# Persist the fitted model (torch.save pickles the full object).
torch.save(model, 'spotlight.model')

from spotlight.evaluation import rmse_score

train_rmse = rmse_score(model, train)
test_rmse = rmse_score(model, test)

print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
# Fraction of predictions whose thresholded sign matches the held-out ratings.
predictions = model.predict(test.user_ids, test.item_ids)
print(((predictions > 0.5) == (test.ratings > 0)).sum() / len(predictions))