Ejemplo n.º 1
0
def train_initial_model():
    """Train an implicit factorization model on MovieLens 100K and persist it.

    Fits an adaptive-hinge factorization model, pickles the model to
    ``models/filmclub.model`` and the dataset to ``data/dataset.pkl``,
    then prints train/test RMSE.
    """
    dataset = get_movielens_dataset(variant='100K')

    train, test = random_train_test_split(dataset, random_state=np.random.RandomState(42))

    model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                       embedding_dim=128,  # latent dimensionality
                                       n_iter=10,  # number of epochs of training
                                       batch_size=1024,  # minibatch size
                                       l2=1e-9,  # strength of L2 regularization
                                       learning_rate=1e-3,
                                       use_cuda=torch.cuda.is_available())

    print('Fitting the model')

    model.fit(train, verbose=True)
    print(type(model))

    # Context managers guarantee the handles are closed even if pickling fails
    # (the original left files open on exceptions).
    with open('models/filmclub.model', 'wb') as model_file:
        pickle.dump(model, model_file)

    # NOTE(review): num_users is inflated before saving — presumably headroom
    # so new users can be added without re-indexing; confirm intent.
    dataset.num_users = 1000000

    with open('data/dataset.pkl', 'wb') as dataset_file:
        pickle.dump(dataset, dataset_file)

    train_rmse = rmse_score(model, train)
    test_rmse = rmse_score(model, test)

    print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
Ejemplo n.º 2
0
def main(args):
    """Run a hyperparameter search for a sequence model on the chosen dataset.

    ``args`` is a raw argument list (e.g. ``sys.argv[1:]``); it is parsed by
    ``parse_args`` and must provide at least ``dataset``, ``model`` and
    ``num_trials``.
    """
    status = 'available' if CUDA else 'not available'
    print("CUDA is {}!".format(status))
    args = parse_args(args)

    # Fix random_state
    seed = 72
    set_seed(seed)
    random_state = np.random.RandomState(seed)

    # Default sequence windows: non-overlapping, since step_size equals
    # max_sequence_length.
    max_sequence_length = 100
    min_sequence_length = 20
    step_size = max_sequence_length

    if args.dataset == 'amazon':
        # Amazon histories are shorter, so use shorter windows.
        max_sequence_length = 50
        min_sequence_length = 5
        step_size = max_sequence_length
        dataset = get_amazon_dataset()
    elif args.dataset == 'goodbooks':
        dataset = get_goodbooks_dataset()
    else:
        # Any other name is treated as a MovieLens variant (e.g. '100k' -> '100K').
        dataset = get_movielens_dataset(args.dataset.upper())

    args.variant = args.dataset
    # User-based 80/10/10 split: 20% of users held out, then halved into
    # test and validation.
    train, rest = user_based_train_test_split(
        dataset,
        test_percentage=0.2,
        random_state=random_state)
    test, valid = user_based_train_test_split(
        rest,
        test_percentage=0.5,
        random_state=random_state)
    train = train.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    valid = valid.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    print('model: {}, data: {}'.format(args.model, train))

    fname = 'experiment_{}_{}.pickle'.format(args.model, args.dataset)
    objective = get_objective(train, valid, test, random_state)
    space = hyperparameter_space(args.model)

    # Calling optimize with max_evals=iteration+1 resumes trials stored in
    # fname, adding one new evaluation per loop iteration.
    for iteration in range(args.num_trials):
        print('Iteration {}'.format(iteration))
        trials = optimize(objective,
                          space,
                          trials_fname=fname,
                          max_evals=iteration + 1)

        summarize_trials(trials)
Ejemplo n.º 3
0
def generate_dataset_table():
    """Build a LaTeX (booktabs) table summarizing each dataset's size and skew.

    Columns: dataset name, user count, item count, interaction density, and
    the 95th/50th percentile ratio of per-item interaction counts (a measure
    of popularity skew).
    """
    headers = ['Dataset', 'Users', 'Items', 'Density', '95th/50th']

    rows = []

    for name, dataset in (('Movielens 10M', get_movielens_dataset('10M')),
                          ('Amazon', get_amazon_dataset()),
                          ('Goodbooks', get_goodbooks_dataset())):

        # Number of interactions per item (nonzeros per column).
        item_counts = dataset.tocoo().getnnz(axis=0)

        # Compute the skew ratio once instead of four times.
        ratio = np.percentile(item_counts, 95) / np.percentile(item_counts, 50)

        print('Dataset {}, ratio: {:0,}'.format(name, ratio))

        row = [
            name,
            '{:0,}'.format(dataset.num_users),
            '{:0,}'.format(dataset.num_items),
            # Density: fraction of the user x item matrix that is observed.
            len(dataset) / dataset.num_users / dataset.num_items,
            '{0:.2f}'.format(ratio)
        ]

        rows.append(row)

    return _full_width_table(
        tabulate(rows,
                 headers=headers,
                 floatfmt='.4f',
                 tablefmt='latex_booktabs'))
Ejemplo n.º 4
0
def load_data(dataset, random_state):
    """Load a MovieLens variant and return user-based sequence splits.

    Returns ``(train_nonsequence, train, validation, test)``; the last three
    are sequence datasets built from non-overlapping windows.
    """
    dataset = get_movielens_dataset(dataset)

    window = 100
    min_window = 50
    seq_kwargs = dict(max_sequence_length=window,
                      min_sequence_length=min_window,
                      step_size=window)

    # Hold out 20% of users, then halve the holdout into test/validation.
    train_nonsequence, rest = user_based_train_test_split(
        dataset, test_percentage=0.2, random_state=random_state)
    test, validation = user_based_train_test_split(rest,
                                                   test_percentage=0.5,
                                                   random_state=random_state)

    train = train_nonsequence.to_sequence(**seq_kwargs)
    test = test.to_sequence(**seq_kwargs)
    validation = validation.to_sequence(**seq_kwargs)

    return train_nonsequence, train, validation, test
Ejemplo n.º 5
0
def test_bloom(compression_ratio, expected_rmse):
    """An explicit model with Bloom-compressed embeddings reaches the expected RMSE."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    # Both embedding layers share the same compression configuration.
    bloom_kwargs = dict(compression_ratio=compression_ratio,
                        num_hash_functions=2)
    network = BilinearNet(
        interactions.num_users,
        interactions.num_items,
        user_embedding_layer=BloomEmbedding(interactions.num_users, 32,
                                            **bloom_kwargs),
        item_embedding_layer=BloomEmbedding(interactions.num_items, 32,
                                            **bloom_kwargs))

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-5,
                                       representation=network,
                                       use_cuda=CUDA)
    model.fit(train)
    print(model)

    rmse = rmse_score(model, test)
    print(rmse)

    assert rmse - EPSILON < expected_rmse
Ejemplo n.º 6
0
def data_implicit_sequence():
    """Fixture: sequence train/test splits and a fitted LSTM sequence model."""
    # Non-overlapping 200-item windows; drop sequences shorter than 20.
    seq_kwargs = dict(max_sequence_length=200,
                      min_sequence_length=20,
                      step_size=200)

    interactions = movielens.get_movielens_dataset('100K')
    train, test = user_based_train_test_split(interactions,
                                              random_state=RANDOM_STATE)

    train = train.to_sequence(**seq_kwargs)
    test = test.to_sequence(**seq_kwargs)

    model = ImplicitSequenceModel(loss='adaptive_hinge',
                                  representation='lstm',
                                  batch_size=8,
                                  learning_rate=1e-2,
                                  l2=1e-3,
                                  n_iter=2,
                                  use_cuda=CUDA,
                                  random_state=RANDOM_STATE)
    model.fit(train, verbose=True)

    return train, test, model
Ejemplo n.º 7
0
def test_bpr_bloom(compression_ratio, expected_mrr):
    """A BPR model with Bloom-compressed embeddings should beat the expected MRR."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    # Identical compression settings for user and item embeddings.
    bloom_kwargs = dict(compression_ratio=compression_ratio,
                        num_hash_functions=2)
    network = BilinearNet(
        interactions.num_users,
        interactions.num_items,
        user_embedding_layer=BloomEmbedding(interactions.num_users, 32,
                                            **bloom_kwargs),
        item_embedding_layer=BloomEmbedding(interactions.num_items, 32,
                                            **bloom_kwargs))

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       representation=network,
                                       use_cuda=CUDA)
    model.fit(train)
    print(model)

    mrr = mrr_score(model, test, train=train).mean()
    assert mrr > expected_mrr
Ejemplo n.º 8
0
def data():
    """Fixture: deterministic random train/test split of MovieLens 100K."""
    interactions = movielens.get_movielens_dataset('100K')
    return random_train_test_split(interactions, random_state=RANDOM_STATE)
Ejemplo n.º 9
0
def get_sequence_data():
    """Return MovieLens 1M as non-overlapping sequences of up to 200 items."""
    window = 200
    data = get_movielens_dataset('1M').to_sequence(
        max_sequence_length=window,
        min_sequence_length=20,
        step_size=window)
    print(data.sequences.shape)
    return data
Ejemplo n.º 10
0
def load_data(dataset, random_state):
    """Randomly split a MovieLens variant into train/validation/test."""
    interactions = get_movielens_dataset(dataset)

    train, rest = random_train_test_split(interactions,
                                          random_state=random_state)
    # Halve the held-out portion into test and validation.
    test, validation = random_train_test_split(rest,
                                               test_percentage=0.5,
                                               random_state=random_state)

    return train, validation, test
Ejemplo n.º 11
0
def test_user_based_split():
    """User-based split partitions interactions and holds out ~20% of users."""
    interactions = movielens.get_movielens_dataset('100K')

    train, test = cross_validation.user_based_train_test_split(
        interactions,
        test_percentage=0.2,
        random_state=RANDOM_STATE)

    # No interaction is lost or duplicated by the split.
    assert len(train) + len(test) == len(interactions)

    test_user_fraction = float(len(np.unique(test.user_ids))) / interactions.num_users
    assert np.allclose(test_user_fraction, 0.2, atol=0.001)
Ejemplo n.º 12
0
def test_to_sequence_min_length():
    """min_sequence_length must filter out sequences shorter than the bound."""
    min_sequence_length = 10
    interactions = movielens.get_movielens_dataset('100K')

    # With default arguments some sequences fall below the bound.
    lengths = (interactions.to_sequence(max_sequence_length=20)
               .sequences != 0).sum(axis=1)
    assert np.any(lengths < min_sequence_length)

    # Once the minimum is specified, none do.
    lengths = (interactions.to_sequence(
        max_sequence_length=20,
        min_sequence_length=min_sequence_length).sequences != 0).sum(axis=1)
    assert not np.any(lengths < min_sequence_length)
Ejemplo n.º 13
0
def test_user_based_split():
    """Splitting by user keeps all interactions and puts ~20% of users in test."""
    interactions = movielens.get_movielens_dataset('100K')

    splits = cross_validation.user_based_train_test_split(
        interactions, test_percentage=0.2, random_state=RANDOM_STATE)
    train, test = splits

    # Every interaction ends up in exactly one of the two splits.
    assert len(train) + len(test) == len(interactions)

    n_test_users = np.unique(test.user_ids).shape[0]
    assert np.allclose(float(n_test_users) / interactions.num_users,
                       0.2, atol=0.001)
Ejemplo n.º 14
0
def test_to_sequence_min_length():
    """Specifying min_sequence_length removes too-short sequences."""
    bound = 10
    interactions = movielens.get_movielens_dataset('100K')

    def shortest(**kwargs):
        # Shortest non-padding length across all generated sequences.
        seqs = interactions.to_sequence(max_sequence_length=20, **kwargs)
        return (seqs.sequences != 0).sum(axis=1).min()

    # Default arguments leave some sequences under the bound...
    assert shortest() < bound
    # ...which disappear once the minimum is enforced.
    assert shortest(min_sequence_length=bound) >= bound
Ejemplo n.º 15
0
def test_to_sequence(max_sequence_length):
    """With default step size, to_sequence yields one row per interaction."""
    interactions = movielens.get_movielens_dataset('100K')

    sequences = interactions.to_sequence(
        max_sequence_length=max_sequence_length)

    expected_shape = (len(interactions), max_sequence_length)
    assert sequences.sequences.shape == expected_shape

    seq_array = sequences.sequences
    _test_just_padding(seq_array)
    _test_final_column_no_padding(seq_array)
    _test_shifted(seq_array)
    _test_temporal_order(sequences.user_ids, seq_array, interactions)
Ejemplo n.º 16
0
def test_precision_recall(data, k):
    """Precision/recall score shapes should match and reflect the ``k`` argument."""
    (train, test, model) = data

    # NOTE(review): the fixture's train/test are immediately overwritten by
    # the fresh split below — only ``model`` from the fixture is actually
    # used. This matches the fixture's data only if both splits consume
    # RANDOM_STATE identically; confirm the re-split is intentional.
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    precision, recall = precision_recall_score(model, test, train, k=k)

    assert precision.shape == recall.shape

    # Scalar k -> 1-D per-user scores; list k -> one column per cutoff.
    if not isinstance(k, list):
        assert len(precision.shape) == 1
    else:
        assert precision.shape[1] == len(k)
Ejemplo n.º 17
0
def test_adaptive_hinge():
    """Adaptive hinge loss should reach an MRR above 0.07 on MovieLens 100K."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6)
    model.fit(train)

    # Exclude training interactions when ranking.
    assert mrr_score(model, test, train=train).mean() > 0.07
Ejemplo n.º 18
0
def data():
    """Fixture: MovieLens 100K split and a one-epoch BPR model fitted on train."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    hyperparams = dict(loss='bpr',
                       n_iter=1,
                       batch_size=1024,
                       learning_rate=1e-2,
                       l2=1e-6,
                       random_state=RANDOM_STATE,
                       use_cuda=CUDA)
    model = ImplicitFactorizationModel(**hyperparams)
    model.fit(train)

    return train, test, model
Ejemplo n.º 19
0
def test_poisson():
    """Poisson regression loss should achieve a test RMSE under 1.0."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='poisson',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train)

    assert rmse_score(model, test) < 1.0
Ejemplo n.º 20
0
def test_poisson():
    """An explicit model with Poisson loss keeps test RMSE below 1.0."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    params = dict(loss='poisson',
                  n_iter=10,
                  batch_size=1024,
                  learning_rate=1e-3,
                  l2=1e-6)
    model = ExplicitFactorizationModel(**params)
    model.fit(train)

    rmse = rmse_score(model, test)
    assert rmse < 1.0
Ejemplo n.º 21
0
def test_predict_movielens(model_class):
    """All predict() call signatures must agree for the same user."""
    interactions = movielens.get_movielens_dataset('100K')

    model = model_class(n_iter=1, use_cuda=CUDA)
    model.fit(interactions)

    # item_ids is loop-invariant, so build it once.
    n_items = interactions.num_items
    item_ids = np.arange(n_items)

    for user_id in np.random.randint(0, interactions.num_users, size=10):
        user_ids = np.repeat(user_id, n_items)

        by_uid = model.predict(user_id)
        by_uid_items = model.predict(user_id, item_ids)
        by_pairs = model.predict(user_ids, item_ids)

        assert (by_uid == by_uid_items).all()
        assert (by_uid == by_pairs).all()
Ejemplo n.º 22
0
def test_adaptive_hinge():
    """Implicit model trained with adaptive hinge loss must exceed MRR 0.07."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    config = dict(loss='adaptive_hinge',
                  n_iter=10,
                  batch_size=1024,
                  learning_rate=1e-2,
                  l2=1e-6)
    model = ImplicitFactorizationModel(**config)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()
    assert mrr > 0.07
Ejemplo n.º 23
0
def test_to_sequence(max_sequence_length, step_size):
    """Sequence conversion honours the window length for any step size."""
    interactions = movielens.get_movielens_dataset('100K')
    # Keep only the test portion of a random split.
    _, interactions = random_train_test_split(interactions)

    sequences = interactions.to_sequence(
        max_sequence_length=max_sequence_length,
        step_size=step_size)

    shape = sequences.sequences.shape
    if step_size == 1:
        # Sliding by one step yields one window per interaction.
        assert shape == (len(interactions), max_sequence_length)
    else:
        assert shape[1] == max_sequence_length

    _test_just_padding(sequences.sequences)
    _test_final_column_no_padding(sequences.sequences)
    _test_shifted(sequences.user_ids, sequences.sequences, step_size)
    _test_temporal_order(sequences.user_ids, sequences.sequences, interactions)
Ejemplo n.º 24
0
def test_bpr():
    """BPR loss should reach an MRR of roughly 0.07 or better."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()
    # Allow a small tolerance around the 0.07 threshold.
    assert mrr + EPSILON > 0.07
Ejemplo n.º 25
0
def test_check_input():
    """Fitting with user ids outside the trained range must raise ValueError."""
    # Train for single iter.
    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=1,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train)

    # Modify data to make it incompatible with the original model:
    # introduce a user id one past the maximum seen during training.
    train.user_ids[0] = train.user_ids.max() + 1
    with pytest.raises(ValueError):
        model.fit(train)
Ejemplo n.º 26
0
def load_data(dataset, random_state):
    """Resolve ``dataset`` by name and return random train/validation/test splits."""
    if 'goodbooks' in dataset:
        interactions = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        interactions = get_amazon_dataset()
    else:
        # Treat any other name as a MovieLens variant.
        interactions = get_movielens_dataset(dataset)

    # Hold out 5%, then split the holdout evenly into test and validation.
    train, rest = random_train_test_split(interactions,
                                          test_percentage=0.05,
                                          random_state=random_state)
    test, validation = random_train_test_split(rest,
                                               test_percentage=0.5,
                                               random_state=random_state)

    return train, validation, test
Ejemplo n.º 27
0
def test_bpr_custom_optimizer():
    """A user-supplied optimizer factory should train to a reasonable MRR."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    def adagrad_optimizer(model_params, lr=1e-2, weight_decay=1e-6):
        # Factory handed to the model in place of the default optimizer.
        return torch.optim.Adagrad(model_params,
                                   lr=lr,
                                   weight_decay=weight_decay)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       optimizer_func=adagrad_optimizer)
    model.fit(train)

    assert mrr_score(model, test, train=train).mean() > 0.06
Ejemplo n.º 28
0
def test_to_sequence(max_sequence_length, step_size):
    """Check sequence shapes and structural invariants on the test split."""
    interactions = movielens.get_movielens_dataset('100K')
    _, interactions = random_train_test_split(interactions)

    sequences = interactions.to_sequence(
        max_sequence_length=max_sequence_length,
        step_size=step_size)
    seq_array, user_ids = sequences.sequences, sequences.user_ids

    # Window length always equals the requested maximum.
    assert seq_array.shape[1] == max_sequence_length
    if step_size == 1:
        # Sliding by one step yields one row per interaction.
        assert seq_array.shape[0] == len(interactions)

    _test_just_padding(seq_array)
    _test_final_column_no_padding(seq_array)
    _test_shifted(user_ids, seq_array, step_size)
    _test_temporal_order(user_ids, seq_array, interactions)
Ejemplo n.º 29
0
def test_logistic():
    """Logistic loss on binarized ratings keeps test RMSE around 1.05 or less."""
    interactions = movielens.get_movielens_dataset('100K')

    # Map ratings to {-1, 1}: positive iff the rating exceeds 3.
    binary = (interactions.ratings > 3).astype(np.float32)
    interactions.ratings = binary * 2 - 1

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='logistic',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)

    assert rmse_score(model, test) - EPSILON < 1.05
Ejemplo n.º 30
0
def load_data(dataset, random_state):
    """Load ``dataset`` by name and return user-based sequence splits.

    Returns ``(train_nonsequence, train, validation, test)`` where the last
    three are sequence datasets windowed with per-dataset defaults.
    """
    # Default windows: non-overlapping since step_size == max_sequence_length.
    max_sequence_length = 100
    min_sequence_length = 20
    step_size = max_sequence_length

    if 'goodbooks' in dataset:
        dataset = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        dataset = get_amazon_dataset()

        # This is a dataset with shorter sequences
        max_sequence_length = 50
        min_sequence_length = 5
        step_size = max_sequence_length
    else:
        # Fall back to interpreting the name as a MovieLens variant.
        dataset = get_movielens_dataset(dataset)

    # 80/10/10 split by user: 20% held out, then halved into test/validation.
    train_nonsequence, rest = user_based_train_test_split(
        dataset, test_percentage=0.2, random_state=random_state)
    test, validation = user_based_train_test_split(rest,
                                                   test_percentage=0.5,
                                                   random_state=random_state)

    train = train_nonsequence.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)
    validation = validation.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    return train_nonsequence, train, validation, test
Ejemplo n.º 31
0
def test_bpr_custom_optimizer():
    """Passing optimizer_func should work end to end and reach MRR > 0.06."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    def make_adagrad(model_params, lr=1e-2, weight_decay=1e-6):
        # Builds the optimizer the model will use instead of its default.
        return torch.optim.Adagrad(model_params,
                                   lr=lr,
                                   weight_decay=weight_decay)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       optimizer_func=make_adagrad)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()
    assert mrr > 0.06
Ejemplo n.º 32
0
def generate_data(size_variant, **kwargs):
    """Fetch the MovieLens dataset for ``size_variant`` (extra kwargs are ignored)."""
    return get_movielens_dataset(variant=size_variant)
Ejemplo n.º 33
0
            test_mrr.mean(), val_mrr.mean()
        ))

        results.save(hyperparameters, test_mrr.mean(), val_mrr.mean())

    return results


if __name__ == '__main__':

    # Non-overlapping 200-item windows; drop users with fewer than 20 events.
    max_sequence_length = 200
    min_sequence_length = 20
    step_size = 200
    random_state = np.random.RandomState(100)

    dataset = get_movielens_dataset('1M')

    # User-based split: hold out a share of users (default test_percentage),
    # then halve the holdout into test and validation.
    train, rest = user_based_train_test_split(dataset,
                                              random_state=random_state)
    test, validation = user_based_train_test_split(rest,
                                                   test_percentage=0.5,
                                                   random_state=random_state)
    train = train.to_sequence(max_sequence_length=max_sequence_length,
                              min_sequence_length=min_sequence_length,
                              step_size=step_size)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)
    validation = validation.to_sequence(max_sequence_length=max_sequence_length,
                                        min_sequence_length=min_sequence_length,
                                        step_size=step_size)
Ejemplo n.º 34
0
def get_data():
    """Return the MovieLens 100K interactions dataset."""
    return get_movielens_dataset(variant='100K')
Ejemplo n.º 35
0
        print('Test MRR {} val MRR {}'.format(test_mrr.mean(), val_mrr.mean()))

        results.save(hyperparameters, test_mrr.mean(), val_mrr.mean())

    return results


if __name__ == '__main__':

    max_sequence_length = 200
    min_sequence_length = 20
    step_size = 200
    random_state = np.random.RandomState(100)

    dataset = get_movielens_dataset('1M')

    train, rest = user_based_train_test_split(dataset,
                                              random_state=random_state)
    test, validation = user_based_train_test_split(rest,
                                                   test_percentage=0.5,
                                                   random_state=random_state)
    train = train.to_sequence(max_sequence_length=max_sequence_length,
                              min_sequence_length=min_sequence_length,
                              step_size=step_size)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)
    validation = validation.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
Ejemplo n.º 36
0
def get_factorization_data():
    """Return the MovieLens 1M interactions dataset for factorization models."""
    return get_movielens_dataset('1M')
Ejemplo n.º 37
0
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel

# Minimal end-to-end example: fetch MovieLens 100K, fit an explicit
# factorization model for a single epoch, and report the test RMSE.
dataset = get_movielens_dataset(variant='100K')
train, test = random_train_test_split(dataset)

model = ExplicitFactorizationModel(n_iter=1)
model.fit(train)

rmse = rmse_score(model, test)
print(rmse)