def implicit_model(train_val_implicit_data, gpu_count):
    """
    Build a small ``MatrixFactorizationModel``, fit it, freeze it, and return it.

    Parameters
    -------------
    train_val_implicit_data:
        Two-element ``(train, val)`` pair handed straight to the model
    gpu_count:
        Forwarded to ``CollieTrainer`` as ``gpus``

    Returns
    -------------
    model: MatrixFactorizationModel
        The model after ``fit`` (``max_epochs=10``) and ``freeze()``

    """
    train_data, val_data = train_val_implicit_data

    mf_model = MatrixFactorizationModel(train=train_data,
                                        val=val_data,
                                        embedding_dim=10,
                                        lr=1e-1)

    # logging and checkpointing are switched off for this trainer
    trainer_options = {
        'gpus': gpu_count,
        'max_epochs': 10,
        'deterministic': True,
        'logger': False,
        'checkpoint_callback': False,
    }
    fitting_trainer = CollieTrainer(model=mf_model, **trainer_options)
    fitting_trainer.fit(mf_model)

    mf_model.freeze()

    return mf_model
Beispiel #2
0
def test_instantiation_of_sparse_model_with_weight_decay(
        train_val_implicit_data, capfd):
    """
    Check how ``weight_decay`` ends up in ``hparams`` for dense and sparse models.

    A dense model (``sparse=False``) is expected to keep the requested ``weight_decay`` of
    ``100``. Building a sparse model (``sparse=True``) with the same value is expected to emit a
    ``UserWarning`` and leave ``hparams.weight_decay`` at ``0``.

    ``capfd`` is requested in the signature but not read in the body.

    """
    train_data, val_data = train_val_implicit_data
    requested_weight_decay = 100

    dense_model = MatrixFactorizationModel(train=train_data,
                                           val=val_data,
                                           sparse=False,
                                           weight_decay=requested_weight_decay)
    assert dense_model.hparams.weight_decay == requested_weight_decay

    with pytest.warns(UserWarning):
        sparse_model = MatrixFactorizationModel(train=train_data,
                                                val=val_data,
                                                sparse=True,
                                                weight_decay=requested_weight_decay)

    assert sparse_model.hparams.weight_decay == 0
Beispiel #3
0
def test_loading_and_saving_implicit_model(implicit_model,
                                           untrained_implicit_model, tmpdir):
    """
    Round-trip a fitted model through ``save_model`` / ``load_model_path``.

    The item predictions for user ``42`` must be identical before saving and after loading, and
    must differ from the predictions of ``untrained_implicit_model``.

    """
    prediction_kwargs = {'user_id': 42, 'unseen_items_only': False}

    expected = implicit_model.get_item_predictions(**prediction_kwargs)

    # every file this test writes or reads lives under ``tmpdir``
    checkpoint_path = os.path.join(str(tmpdir), 'test_mf_model_save.pth')
    implicit_model.save_model(checkpoint_path)

    restored_model = MatrixFactorizationModel(load_model_path=checkpoint_path)
    actual = restored_model.get_item_predictions(**prediction_kwargs)

    assert expected.equals(actual)

    # a model that was never fit should not reproduce the fitted model's predictions
    untrained_preds = untrained_implicit_model.get_item_predictions(**prediction_kwargs)

    assert not expected.equals(untrained_preds)
Beispiel #4
0
def test_okay_mismatched_train_and_val_loaders(train_val_implicit_data):
    """
    Fit one step with ``num_negative_samples`` of ``2`` on train and ``3`` on val.

    The test passes as long as model construction and ``fit`` complete without raising.

    """
    original_train, original_val = train_val_implicit_data

    # work on shallow copies so the attribute changes stay local to this test
    train_copy = copy.copy(original_train)
    val_copy = copy.copy(original_val)

    train_copy.num_negative_samples = 2
    val_copy.num_negative_samples = 3

    mismatched_model = MatrixFactorizationModel(train=train_copy, val=val_copy)
    one_step_trainer = CollieTrainer(model=mismatched_model,
                                     logger=False,
                                     checkpoint_callback=False,
                                     max_steps=1)
    one_step_trainer.fit(mismatched_model)
Beispiel #5
0
def test_instantiation_of_model_loss(train_val_implicit_data):
    """
    Check which ``loss_function`` a model gets for each ``loss`` argument.

    With ``num_negative_samples == 1`` on both datasets, ``'hinge'`` and ``'bpr'`` are expected to
    map to ``hinge_loss`` and ``bpr_loss``, and ``'warp'`` is expected to raise ``ValueError``.
    With ``num_negative_samples == 2``, ``'hinge'``, ``'bpr'``, and ``'warp'`` are expected to map
    to ``adaptive_hinge_loss``, ``adaptive_bpr_loss``, and ``warp_loss``. A callable is expected
    to be used as-is, and an unknown loss name is expected to raise ``ValueError``.

    """
    original_train, original_val = train_val_implicit_data

    train = copy.copy(original_train)
    val = copy.copy(original_val)

    def set_negative_samples(count):
        train.num_negative_samples = count
        val.num_negative_samples = count

    def assert_loss_resolves(loss, expected_function):
        model = MatrixFactorizationModel(train=train, val=val, loss=loss)
        assert model.loss_function == expected_function

    # a single negative sample per interaction
    set_negative_samples(1)

    for loss_name, expected_function in (('hinge', hinge_loss), ('bpr', bpr_loss)):
        assert_loss_resolves(loss_name, expected_function)

    with pytest.raises(ValueError):
        MatrixFactorizationModel(train=train, val=val, loss='warp')

    # more than one negative sample per interaction
    set_negative_samples(2)

    for loss_name, expected_function in (('hinge', adaptive_hinge_loss),
                                         ('bpr', adaptive_bpr_loss),
                                         ('warp', warp_loss)):
        assert_loss_resolves(loss_name, expected_function)

    def custom_loss_function(*args, **kwargs):
        return 42

    assert_loss_resolves(custom_loss_function, custom_loss_function)

    with pytest.raises(ValueError):
        MatrixFactorizationModel(train=train, val=val, loss='nonexistent_loss')
Beispiel #6
0
def test_mismatched_train_and_val_loaders(train_val_implicit_data,
                                          change_to_make):
    """
    Check that inconsistent train and val data make model construction fail.

    ``change_to_make`` names the inconsistency to introduce (presumably supplied through
    parametrization that is not visible here). ``'bad_train_num_negative_samples'`` expects a
    ``ValueError``; every other value expects an ``AssertionError``.

    """
    original_train, original_val = train_val_implicit_data

    train = copy.copy(original_train)
    val = copy.copy(original_val)

    # attribute overrides to apply for each recognized ``change_to_make`` value
    overrides_by_change = {
        'num_users': ((train, 'num_users', 3), (val, 'num_users', 2)),
        'num_items': ((train, 'num_items', 1), (val, 'num_items', 2)),
        'num_negative_samples': ((train, 'num_negative_samples', 1),
                                 (val, 'num_negative_samples', 2)),
        'bad_train_num_negative_samples': ((train, 'num_negative_samples', 0),),
    }

    for dataset, attribute_name, new_value in overrides_by_change.get(change_to_make, ()):
        setattr(dataset, attribute_name, new_value)

    if change_to_make == 'bad_train_num_negative_samples':
        expected_error = ValueError
    else:
        expected_error = AssertionError

    with pytest.raises(expected_error):
        MatrixFactorizationModel(train=train, val=val)
def models_trained_for_one_step(request,
                                train_val_implicit_data,
                                movielens_metadata_df,
                                movielens_implicit_df,
                                train_val_implicit_pandas_data,
                                gpu_count):
    """
    Build the model variant named by ``request.param``, fit it for one step, and return it frozen.

    Parameters
    -------------
    request:
        Object whose ``param`` attribute selects the model variant to build
    train_val_implicit_data:
        Two-element ``(train, val)`` pair used by most variants
    movielens_metadata_df:
        Item metadata; only read by the ``'hybrid_pretrained*'`` variants
    movielens_implicit_df:
        Not read anywhere in this function body
    train_val_implicit_pandas_data:
        Two-element ``(train, val)`` pair of DataFrames; only read by the ``'mf_hdf5'`` variant
    gpu_count:
        Forwarded to every ``CollieTrainer`` as ``gpus``

    Returns
    -------------
    model:
        The selected model after a ``max_steps=1`` fit and a call to ``freeze()``

    Notes
    -------------
    NOTE(review): there is no ``else`` branch. A ``request.param`` value that matches none of the
    branches below leaves ``model`` unassigned, so the shared ``CollieTrainer(model=model, ...)``
    call fails with an unbound-local error rather than a descriptive message.

    """
    train, val = train_val_implicit_data

    if request.param == 'mf_hdf5':
        # create, fit, and return the model all at once so we can close the HDF5 file
        train_pandas_df, val_pandas_df = train_val_implicit_pandas_data

        with tempfile.TemporaryDirectory() as temp_dir:
            # write both DataFrames to HDF5 files inside the temporary directory
            pandas_df_to_hdf5(df=train_pandas_df,
                              out_path=os.path.join(temp_dir, 'train.h5'),
                              key='interactions')
            pandas_df_to_hdf5(df=val_pandas_df,
                              out_path=os.path.join(temp_dir, 'val.h5'),
                              key='interactions')

            train_loader = HDF5InteractionsDataLoader(hdf5_path=os.path.join(temp_dir, 'train.h5'),
                                                      user_col='user_id',
                                                      item_col='item_id',
                                                      num_users=train.num_users,
                                                      num_items=train.num_items,
                                                      batch_size=1024,
                                                      shuffle=True)
            val_loader = HDF5InteractionsDataLoader(hdf5_path=os.path.join(temp_dir, 'val.h5'),
                                                    user_col='user_id',
                                                    item_col='item_id',
                                                    num_users=val.num_users,
                                                    num_items=val.num_items,
                                                    batch_size=1024,
                                                    shuffle=False)

            model = MatrixFactorizationModel(train=train_loader,
                                             val=val_loader,
                                             embedding_dim=15,
                                             dropout_p=0.1,
                                             lr=1e-1,
                                             bias_lr=1e-2,
                                             optimizer='adam',
                                             bias_optimizer='sgd',
                                             weight_decay=1e-7,
                                             loss='bpr',
                                             sparse=False)

            model_trainer = CollieTrainer(model=model,
                                          gpus=gpu_count,
                                          max_steps=1,
                                          deterministic=True,
                                          logger=False,
                                          checkpoint_callback=False)

            model_trainer.fit(model)
            model.freeze()

            # early return: this variant never reaches the shared fit/freeze code at the bottom
            return model

    elif request.param == 'sparse_mf':
        model = MatrixFactorizationModel(train=train,
                                         val=val,
                                         embedding_dim=15,
                                         dropout_p=0.1,
                                         lr=1e-1,
                                         bias_lr=1e-2,
                                         optimizer='sparse_adam',
                                         bias_optimizer='sgd',
                                         weight_decay=0,
                                         loss='hinge',
                                         sparse=True)
    elif request.param == 'mf_no_val':
        model = MatrixFactorizationModel(train=train, val=None)
    elif request.param == 'mf_non_approximate' or request.param == 'mf_approximate':
        # both variants share the same model settings and differ only in the data loader class
        if request.param == 'mf_non_approximate':
            train_loader = InteractionsDataLoader(interactions=train, batch_size=1024, shuffle=True)
            val_loader = InteractionsDataLoader(interactions=val, batch_size=1024, shuffle=False)
        else:
            train_loader = ApproximateNegativeSamplingInteractionsDataLoader(interactions=train,
                                                                             batch_size=1024,
                                                                             shuffle=True)
            val_loader = ApproximateNegativeSamplingInteractionsDataLoader(interactions=val,
                                                                           batch_size=1024,
                                                                           shuffle=False)

        model = MatrixFactorizationModel(train=train_loader,
                                         val=val_loader,
                                         embedding_dim=15,
                                         dropout_p=0.1,
                                         lr=1e-1,
                                         bias_lr=1e-2,
                                         optimizer='adam',
                                         bias_optimizer='sgd',
                                         weight_decay=1e-7,
                                         loss='bpr',
                                         sparse=False)
    elif request.param == 'mf_with_y_range':
        model = MatrixFactorizationModel(train=train,
                                         val=val,
                                         y_range=(0, 4))
    elif request.param == 'nonlinear_mf':
        model = NonlinearMatrixFactorizationModel(train=train,
                                                  val=val,
                                                  user_embedding_dim=15,
                                                  item_embedding_dim=15,
                                                  user_dense_layers_dims=[15, 10],
                                                  item_dense_layers_dims=[15, 10],
                                                  embedding_dropout_p=0.05,
                                                  dense_dropout_p=0.1,
                                                  lr=1e-1,
                                                  bias_lr=1e-2,
                                                  optimizer='adam',
                                                  bias_optimizer='sgd',
                                                  weight_decay=1e-7,
                                                  loss='bpr')
    elif request.param == 'nonlinear_mf_with_y_range':
        model = NonlinearMatrixFactorizationModel(train=train,
                                                  val=val,
                                                  y_range=(0, 4))
    elif request.param == 'neucf':
        model = NeuralCollaborativeFiltering(train=train,
                                             val=val,
                                             embedding_dim=10,
                                             num_layers=1,
                                             dropout_p=0.1,
                                             lr=1e-3,
                                             weight_decay=0.,
                                             optimizer='adam',
                                             loss='adaptive')
    elif request.param == 'neucf_sigmoid':
        model = NeuralCollaborativeFiltering(train=train,
                                             val=val,
                                             final_layer='sigmoid')
    elif request.param == 'neucf_relu':
        model = NeuralCollaborativeFiltering(train=train,
                                             val=val,
                                             final_layer='relu')
    # NOTE(review): 'rulu' looks like a misspelling of 'relu'; the string has to match whatever
    # parameter list drives ``request.param`` (not visible here), so confirm before renaming
    elif request.param == 'neucf_leaky_rulu':
        model = NeuralCollaborativeFiltering(train=train,
                                             val=val,
                                             final_layer='leaky_relu')
    elif request.param == 'neucf_custom':
        model = NeuralCollaborativeFiltering(train=train,
                                             val=val,
                                             final_layer=torch.tanh)
    elif (
        request.param == 'hybrid_pretrained' or request.param == 'hybrid_pretrained_metadata_layers'
    ):
        # stage 1: fit a plain matrix factorization model for one step and freeze it so it can be
        # passed to the hybrid models as ``trained_model``
        implicit_model = MatrixFactorizationModel(train=train,
                                                  val=val,
                                                  embedding_dim=10,
                                                  lr=1e-1,
                                                  optimizer='adam')
        implicit_model_trainer = CollieTrainer(model=implicit_model,
                                               gpus=gpu_count,
                                               max_steps=1,
                                               deterministic=True,
                                               logger=False,
                                               checkpoint_callback=False)
        implicit_model_trainer.fit(implicit_model)
        implicit_model.freeze()

        # for each metadata row, take the position of the largest value among the columns whose
        # name contains 'genre', flattened to a 1-D tensor
        genres = (
            torch.tensor(movielens_metadata_df[
                [c for c in movielens_metadata_df.columns if 'genre' in c]
            ].values)
            .topk(1)
            .indices
            .view(-1)
        )

        if request.param == 'hybrid_pretrained_metadata_layers':
            metadata_layers_dims = [16, 12]
        else:
            metadata_layers_dims = None

        # stage 2: fit a hybrid model for one step with ``freeze_embeddings=True``
        model_frozen = HybridPretrainedModel(train=train,
                                             val=val,
                                             item_metadata=movielens_metadata_df,
                                             trained_model=implicit_model,
                                             metadata_layers_dims=metadata_layers_dims,
                                             freeze_embeddings=True,
                                             dropout_p=0.15,
                                             loss='warp',
                                             lr=.01,
                                             optimizer=torch.optim.Adam,
                                             metadata_for_loss={'genre': genres},
                                             metadata_for_loss_weights={'genre': .4},
                                             weight_decay=0.0)
        model_frozen_trainer = CollieTrainer(model=model_frozen,
                                             gpus=gpu_count,
                                             max_steps=1,
                                             deterministic=True,
                                             logger=False,
                                             checkpoint_callback=False)
        model_frozen_trainer.fit(model_frozen)

        # stage 3: build a second hybrid model with ``freeze_embeddings=False`` and initialize it
        # from the stage 2 model; the shared code below fits it for one step
        model = HybridPretrainedModel(train=train,
                                      val=val,
                                      item_metadata=movielens_metadata_df,
                                      trained_model=implicit_model,
                                      metadata_layers_dims=metadata_layers_dims,
                                      freeze_embeddings=False,
                                      dropout_p=0.15,
                                      loss='bpr',
                                      lr=1e-4,
                                      optimizer=torch.optim.Adam,
                                      metadata_for_loss={'genre': genres},
                                      metadata_for_loss_weights={'genre': .4},
                                      weight_decay=0.0)
        model.load_from_hybrid_model(model_frozen)

    # shared path for every variant except 'mf_hdf5': fit for a single step
    model_trainer = CollieTrainer(model=model,
                                  gpus=gpu_count,
                                  max_steps=1,
                                  deterministic=True,
                                  logger=False,
                                  checkpoint_callback=False)

    if request.param == 'mf_no_val':
        # fitting the model built with ``val=None`` is expected to emit a ``UserWarning``
        with pytest.warns(UserWarning):
            model_trainer.fit(model)
    else:
        model_trainer.fit(model)

    model.freeze()

    return model
def untrained_implicit_model_no_val_data(train_val_implicit_data):
    """
    Return an unfitted ``MatrixFactorizationModel`` built from the train split with ``val=None``.

    The second element of ``train_val_implicit_data`` is unpacked but deliberately ignored.

    """
    train_data, _ignored_val = train_val_implicit_data

    return MatrixFactorizationModel(train=train_data, val=None)
def untrained_implicit_model(train_val_implicit_data):
    """
    Return an unfitted ``MatrixFactorizationModel`` built from the ``(train, val)`` pair.

    No trainer is created and ``fit`` is never called here.

    """
    train_data, val_data = train_val_implicit_data

    return MatrixFactorizationModel(train=train_data, val=val_data)
Beispiel #10
0
def test_instantiation_of_model_optimizer(train_val_implicit_data):
    """
    Check optimizer and learning-rate-scheduler setup for several model configurations.

    Each scenario builds a ``MatrixFactorizationModel``, fits it for one step, and then inspects
    ``model.optimizers()`` and ``trainer.lr_schedulers``:

    * ``bias_optimizer=None``: ``optimizers()`` is not a list; one scheduler by default and none
      when ``lr_scheduler_func=None``
    * ``bias_optimizer='infer'``: two optimizers, ``bias_optimizer`` equal to ``optimizer``;
      ``bias_lr`` equals ``lr`` when ``bias_lr='infer'`` and differs when ``bias_lr=10``
    * an unknown ``optimizer`` or ``bias_optimizer`` name: ``fit`` raises ``ValueError``
    * optimizer classes ``torch.optim.Adadelta`` and ``torch.optim.LBFGS``: ``fit`` completes

    """
    train, val = train_val_implicit_data

    def build_model(**model_kwargs):
        return MatrixFactorizationModel(train=train, val=val, **model_kwargs)

    def build_one_step_trainer(model):
        return CollieTrainer(model=model,
                             logger=False,
                             checkpoint_callback=False,
                             max_steps=1)

    def collect_schedulers(trainer):
        return [entry['scheduler'] for entry in trainer.lr_schedulers]

    # --- no separate bias optimizer ---------------------------------------------------------
    single_optimizer_cases = (
        ({'bias_optimizer': None}, 1),
        ({'bias_optimizer': None, 'lr_scheduler_func': None}, 0),
    )
    for model_kwargs, expected_scheduler_count in single_optimizer_cases:
        model = build_model(**model_kwargs)
        trainer = build_one_step_trainer(model)
        trainer.fit(model)

        assert not isinstance(model.optimizers(), list)
        assert len(collect_schedulers(trainer)) == expected_scheduler_count

    # --- bias optimizer inferred from the main optimizer --------------------------------------
    inferred_optimizer_cases = (
        ({'bias_optimizer': 'infer', 'bias_lr': 'infer'}, True, 2),
        ({'bias_optimizer': 'infer', 'bias_lr': 'infer', 'lr_scheduler_func': None}, True, 0),
        ({'bias_optimizer': 'infer', 'bias_lr': 10, 'lr_scheduler_func': None}, False, 0),
    )
    for model_kwargs, lrs_should_match, expected_scheduler_count in inferred_optimizer_cases:
        model = build_model(**model_kwargs)
        trainer = build_one_step_trainer(model)
        trainer.fit(model)

        assert len(model.optimizers()) == 2
        assert model.bias_optimizer == model.optimizer
        if lrs_should_match:
            assert model.hparams.bias_lr == model.hparams.lr
        else:
            assert model.hparams.bias_lr != model.hparams.lr
        assert len(collect_schedulers(trainer)) == expected_scheduler_count

    # --- unknown optimizer names only fail once ``fit`` is called -----------------------------
    for model_kwargs in ({'optimizer': 'fake_optimizer'},
                         {'bias_optimizer': 'fake_optimizer'}):
        model = build_model(**model_kwargs)
        trainer = build_one_step_trainer(model)
        with pytest.raises(ValueError):
            trainer.fit(model)

    # --- optimizer classes passed directly ----------------------------------------------------
    optimizer_class_cases = (
        # ``Adadelta`` accepts ``weight_decay`` parameter
        {'optimizer': torch.optim.Adadelta},
        # ``LBFGS`` does not accept ``weight_decay`` parameter
        {'optimizer': torch.optim.LBFGS, 'sparse': True},
    )
    for model_kwargs in optimizer_class_cases:
        model = build_model(**model_kwargs)
        trainer = build_one_step_trainer(model)
        trainer.fit(model)
Beispiel #11
0
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Run the MovieLens example end to end: load, split, train, evaluate, and save.

    The steps, each bracketed by ``Timer.timecheck`` messages, are:

    1. read the MovieLens DataFrame
    2. convert it to implicit interactions and split into train, validation, and test sets
    3. fit a ``MatrixFactorizationModel`` with early stopping on ``val_loss_epoch``
    4. print AUC, MRR, and MAP@10 computed on the test set
    5. save the model to ``DATA_PATH / 'fitted_model'``

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie_recs/movielens/run.py  --epochs 20

    Parameters
    -------------
    epochs: int
        Maximum number of training epochs, passed to the trainer as ``max_epochs``
    gpus: int
        Number of gpus to train on, passed to the trainer as ``gpus``

    """
    timer = Timer()

    timer.timecheck('  1.0 - retrieving MovieLens 100K dataset')
    ratings_df = read_movielens_df(decrement_ids=True)
    timer.timecheck('  1.0 complete')

    timer.timecheck('  2.0 - splitting data')
    implicit_df = convert_to_implicit(ratings_df)
    all_interactions = Interactions(users=implicit_df['user_id'],
                                    items=implicit_df['item_id'],
                                    allow_missing_ids=True)
    train_split, val_split, test_split = stratified_split(all_interactions,
                                                          val_p=0.1,
                                                          test_p=0.1)
    # only the training loader shuffles its data
    train_loader = InteractionsDataLoader(train_split, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val_split, batch_size=1024, shuffle=False)
    timer.timecheck('  2.0 complete')

    timer.timecheck('  3.0 - training the model')
    mf_model = MatrixFactorizationModel(train=train_loader,
                                        val=val_loader,
                                        dropout_p=0.05,
                                        loss='adaptive',
                                        lr=5e-2,
                                        embedding_dim=10,
                                        optimizer='adam',
                                        weight_decay=1e-7)
    early_stopping = EarlyStopping(monitor='val_loss_epoch', mode='min')
    fitting_trainer = CollieTrainer(model=mf_model,
                                    gpus=gpus,
                                    max_epochs=epochs,
                                    deterministic=True,
                                    logger=False,
                                    checkpoint_callback=False,
                                    callbacks=[early_stopping],
                                    weights_summary='full',
                                    terminate_on_nan=True)
    fitting_trainer.fit(mf_model)
    mf_model.eval()
    timer.timecheck('\n  3.0 complete')

    timer.timecheck('  4.0 - evaluating model')
    metrics_to_compute = [auc, mrr, mapk]
    auc_score, mrr_score, mapk_score = evaluate_in_batches(metrics_to_compute,
                                                           test_split,
                                                           mf_model,
                                                           k=10)
    print(f'AUC:          {auc_score}')
    print(f'MRR:          {mrr_score}')
    print(f'MAP@10:       {mapk_score}')
    timer.timecheck('  4.0 complete')

    timer.timecheck('  5.0 - saving model')
    mf_model.save_model(DATA_PATH / 'fitted_model')
    timer.timecheck('  5.0 complete')