def implicit_model(train_val_implicit_data, gpu_count):
    """
    Build a ``MatrixFactorizationModel``, fit it, and return it frozen.

    The model is fit with a ``CollieTrainer`` configured for at most 10 epochs, with
    logging and checkpointing disabled.

    Parameters
    -------------
    train_val_implicit_data: tuple
        Two-element ``(train, val)`` pair passed to the model as ``train`` and ``val``
    gpu_count:
        Value forwarded to ``CollieTrainer`` as ``gpus``

    Returns
    -------------
    model: MatrixFactorizationModel
        The fitted model, after ``freeze()`` has been called on it

    """
    train_data, val_data = train_val_implicit_data

    fitted_model = MatrixFactorizationModel(
        train=train_data,
        val=val_data,
        embedding_dim=10,
        lr=1e-1,
    )

    trainer = CollieTrainer(
        model=fitted_model,
        gpus=gpu_count,
        max_epochs=10,
        deterministic=True,
        logger=False,
        checkpoint_callback=False,
    )
    trainer.fit(fitted_model)

    fitted_model.freeze()

    return fitted_model
def test_instantiation_of_sparse_model_with_weight_decay(train_val_implicit_data, capfd):
    """
    Check how ``weight_decay`` is stored for dense versus sparse models.

    A model built with ``sparse=False`` must keep the requested ``weight_decay``. A model
    built with ``sparse=True`` must emit a ``UserWarning`` and end up with
    ``hparams.weight_decay == 0``.

    """
    train_data, val_data = train_val_implicit_data

    dense_model = MatrixFactorizationModel(
        train=train_data,
        val=val_data,
        sparse=False,
        weight_decay=100,
    )
    assert dense_model.hparams.weight_decay == 100

    # the warning must be raised while the sparse model is being constructed
    with pytest.warns(UserWarning):
        sparse_model = MatrixFactorizationModel(
            train=train_data,
            val=val_data,
            sparse=True,
            weight_decay=100,
        )

    assert sparse_model.hparams.weight_decay == 0
def test_loading_and_saving_implicit_model(implicit_model, untrained_implicit_model, tmpdir):
    """
    Check that a saved model reloads with identical item predictions.

    Predictions for ``user_id=42`` from the reloaded model must equal those of the model
    that was saved, and must differ from the predictions of ``untrained_implicit_model``.

    """
    prediction_kwargs = {'user_id': 42, 'unseen_items_only': False}

    expected = implicit_model.get_item_predictions(**prediction_kwargs)

    # every file this test writes or reads lives under the ``tmpdir`` it was given
    save_model_path = os.path.join(str(tmpdir), 'test_mf_model_save.pth')
    implicit_model.save_model(save_model_path)

    reloaded_model = MatrixFactorizationModel(load_model_path=save_model_path)
    actual = reloaded_model.get_item_predictions(**prediction_kwargs)

    assert expected.equals(actual)

    # a model that was never fit should not reproduce the trained model's predictions
    untrained_preds = untrained_implicit_model.get_item_predictions(**prediction_kwargs)

    assert not expected.equals(untrained_preds)
def test_okay_mismatched_train_and_val_loaders(train_val_implicit_data):
    """
    Check that a model can be built and fit when ``num_negative_samples`` differs.

    ``train`` is given 2 negative samples and ``val`` 3; constructing the model and fitting
    it for a single step must not raise.

    """
    original_train, original_val = train_val_implicit_data

    # work on shallow copies so the attribute changes stay off the objects passed in
    train_copy = copy.copy(original_train)
    val_copy = copy.copy(original_val)

    train_copy.num_negative_samples = 2
    val_copy.num_negative_samples = 3

    model = MatrixFactorizationModel(train=train_copy, val=val_copy)

    one_step_trainer = CollieTrainer(
        model=model,
        logger=False,
        checkpoint_callback=False,
        max_steps=1,
    )
    one_step_trainer.fit(model)
def test_instantiation_of_model_loss(train_val_implicit_data):
    """
    Check which ``loss_function`` a model ends up with for each ``loss`` argument.

    With one negative sample, ``'hinge'`` and ``'bpr'`` must map to ``hinge_loss`` and
    ``bpr_loss``, and ``'warp'`` must raise ``ValueError``. With two negative samples,
    ``'hinge'``, ``'bpr'``, and ``'warp'`` must map to ``adaptive_hinge_loss``,
    ``adaptive_bpr_loss``, and ``warp_loss``. A callable must be used as-is, and an unknown
    loss name must raise ``ValueError``.

    """
    train, val = train_val_implicit_data

    train = copy.copy(train)
    val = copy.copy(val)

    def model_with_loss(loss):
        # ``train`` and ``val`` are read at call time, so later attribute changes apply
        return MatrixFactorizationModel(train=train, val=val, loss=loss)

    # a single negative sample selects the non-adaptive losses
    train.num_negative_samples = 1
    val.num_negative_samples = 1

    for loss_name, expected_function in (('hinge', hinge_loss), ('bpr', bpr_loss)):
        assert model_with_loss(loss_name).loss_function == expected_function

    with pytest.raises(ValueError):
        model_with_loss('warp')

    # more than one negative sample selects the adaptive losses and allows WARP
    train.num_negative_samples = 2
    val.num_negative_samples = 2

    multiple_negative_sample_cases = (
        ('hinge', adaptive_hinge_loss),
        ('bpr', adaptive_bpr_loss),
        ('warp', warp_loss),
    )
    for loss_name, expected_function in multiple_negative_sample_cases:
        assert model_with_loss(loss_name).loss_function == expected_function

    def custom_loss_function(*args, **kwargs):
        return 42

    assert model_with_loss(custom_loss_function).loss_function == custom_loss_function

    with pytest.raises(ValueError):
        model_with_loss('nonexistent_loss')
def test_mismatched_train_and_val_loaders(train_val_implicit_data, change_to_make):
    """
    Check that inconsistent ``train`` and ``val`` attributes make model creation fail.

    Parameters
    -------------
    train_val_implicit_data: tuple
        Two-element ``(train, val)`` pair; shallow copies of both are modified
    change_to_make: str
        Which inconsistency to introduce. ``'num_users'``, ``'num_items'``, and
        ``'num_negative_samples'`` expect an ``AssertionError``;
        ``'bad_train_num_negative_samples'`` expects a ``ValueError``. Any other value
        leaves both copies untouched while still expecting an ``AssertionError``

    """
    train, val = train_val_implicit_data

    train = copy.copy(train)
    val = copy.copy(val)

    # change_to_make -> (expected error, attributes to set on train, attributes to set on val)
    scenarios = {
        'num_users': (AssertionError, {'num_users': 3}, {'num_users': 2}),
        'num_items': (AssertionError, {'num_items': 1}, {'num_items': 2}),
        'num_negative_samples': (
            AssertionError,
            {'num_negative_samples': 1},
            {'num_negative_samples': 2},
        ),
        'bad_train_num_negative_samples': (ValueError, {'num_negative_samples': 0}, {}),
    }
    no_change = (AssertionError, {}, {})

    expected_error, train_overrides, val_overrides = scenarios.get(change_to_make, no_change)

    for attribute, value in train_overrides.items():
        setattr(train, attribute, value)

    for attribute, value in val_overrides.items():
        setattr(val, attribute, value)

    with pytest.raises(expected_error):
        MatrixFactorizationModel(train=train, val=val)
def models_trained_for_one_step(request,
                                train_val_implicit_data,
                                movielens_metadata_df,
                                movielens_implicit_df,
                                train_val_implicit_pandas_data,
                                gpu_count):
    """
    Build the model selected by ``request.param``, fit it for one step, and freeze it.

    NOTE(review): presumably registered as a parametrized pytest fixture - the decorator
    and its list of parameters are not visible in this block.

    Parameters
    -------------
    request:
        Object whose ``param`` attribute names the model configuration to build. Handled
        values are ``'mf_hdf5'``, ``'sparse_mf'``, ``'mf_no_val'``,
        ``'mf_non_approximate'``, ``'mf_approximate'``, ``'mf_with_y_range'``,
        ``'nonlinear_mf'``, ``'nonlinear_mf_with_y_range'``, ``'neucf'``,
        ``'neucf_sigmoid'``, ``'neucf_relu'``, ``'neucf_leaky_rulu'``, ``'neucf_custom'``,
        ``'hybrid_pretrained'``, and ``'hybrid_pretrained_metadata_layers'``
    train_val_implicit_data: tuple
        Two-element ``(train, val)`` pair used to build the models and data loaders
    movielens_metadata_df:
        Item metadata used by the hybrid models; columns whose name contains ``'genre'``
        are also used to build the ``metadata_for_loss`` tensor
    movielens_implicit_df:
        Not used in the body; kept so the signature is unchanged
    train_val_implicit_pandas_data: tuple
        Two-element ``(train, val)`` pair written to HDF5 files for ``'mf_hdf5'``
    gpu_count:
        Value forwarded to every ``CollieTrainer`` as ``gpus``

    Returns
    -------------
    model:
        The selected model after one training step and a call to ``freeze()``

    Raises
    -------------
    ValueError
        If ``request.param`` is not one of the handled values

    """
    train, val = train_val_implicit_data

    # every trainer in this function is configured the same way
    one_step_trainer_kwargs = {
        'gpus': gpu_count,
        'max_steps': 1,
        'deterministic': True,
        'logger': False,
        'checkpoint_callback': False,
    }

    # hyperparameters shared by the matrix factorization models fit on explicit data loaders
    data_loader_mf_kwargs = {
        'embedding_dim': 15,
        'dropout_p': 0.1,
        'lr': 1e-1,
        'bias_lr': 1e-2,
        'optimizer': 'adam',
        'bias_optimizer': 'sgd',
        'weight_decay': 1e-7,
        'loss': 'bpr',
        'sparse': False,
    }

    if request.param == 'mf_hdf5':
        # create, fit, and return the model all at once so we can close the HDF5 file
        train_pandas_df, val_pandas_df = train_val_implicit_pandas_data

        with tempfile.TemporaryDirectory() as temp_dir:
            train_hdf5_path = os.path.join(temp_dir, 'train.h5')
            val_hdf5_path = os.path.join(temp_dir, 'val.h5')

            pandas_df_to_hdf5(df=train_pandas_df, out_path=train_hdf5_path, key='interactions')
            pandas_df_to_hdf5(df=val_pandas_df, out_path=val_hdf5_path, key='interactions')

            train_loader = HDF5InteractionsDataLoader(hdf5_path=train_hdf5_path,
                                                      user_col='user_id',
                                                      item_col='item_id',
                                                      num_users=train.num_users,
                                                      num_items=train.num_items,
                                                      batch_size=1024,
                                                      shuffle=True)
            val_loader = HDF5InteractionsDataLoader(hdf5_path=val_hdf5_path,
                                                    user_col='user_id',
                                                    item_col='item_id',
                                                    num_users=val.num_users,
                                                    num_items=val.num_items,
                                                    batch_size=1024,
                                                    shuffle=False)

            model = MatrixFactorizationModel(train=train_loader,
                                             val=val_loader,
                                             **data_loader_mf_kwargs)
            model_trainer = CollieTrainer(model=model, **one_step_trainer_kwargs)
            model_trainer.fit(model)
            model.freeze()

            return model
    elif request.param == 'sparse_mf':
        model = MatrixFactorizationModel(train=train,
                                         val=val,
                                         embedding_dim=15,
                                         dropout_p=0.1,
                                         lr=1e-1,
                                         bias_lr=1e-2,
                                         optimizer='sparse_adam',
                                         bias_optimizer='sgd',
                                         weight_decay=0,
                                         loss='hinge',
                                         sparse=True)
    elif request.param == 'mf_no_val':
        model = MatrixFactorizationModel(train=train, val=None)
    elif request.param in ('mf_non_approximate', 'mf_approximate'):
        if request.param == 'mf_non_approximate':
            train_loader = InteractionsDataLoader(interactions=train,
                                                  batch_size=1024,
                                                  shuffle=True)
            val_loader = InteractionsDataLoader(interactions=val,
                                                batch_size=1024,
                                                shuffle=False)
        else:
            train_loader = ApproximateNegativeSamplingInteractionsDataLoader(interactions=train,
                                                                             batch_size=1024,
                                                                             shuffle=True)
            val_loader = ApproximateNegativeSamplingInteractionsDataLoader(interactions=val,
                                                                           batch_size=1024,
                                                                           shuffle=False)

        model = MatrixFactorizationModel(train=train_loader,
                                         val=val_loader,
                                         **data_loader_mf_kwargs)
    elif request.param == 'mf_with_y_range':
        model = MatrixFactorizationModel(train=train, val=val, y_range=(0, 4))
    elif request.param == 'nonlinear_mf':
        model = NonlinearMatrixFactorizationModel(train=train,
                                                  val=val,
                                                  user_embedding_dim=15,
                                                  item_embedding_dim=15,
                                                  user_dense_layers_dims=[15, 10],
                                                  item_dense_layers_dims=[15, 10],
                                                  embedding_dropout_p=0.05,
                                                  dense_dropout_p=0.1,
                                                  lr=1e-1,
                                                  bias_lr=1e-2,
                                                  optimizer='adam',
                                                  bias_optimizer='sgd',
                                                  weight_decay=1e-7,
                                                  loss='bpr')
    elif request.param == 'nonlinear_mf_with_y_range':
        model = NonlinearMatrixFactorizationModel(train=train, val=val, y_range=(0, 4))
    elif request.param == 'neucf':
        model = NeuralCollaborativeFiltering(train=train,
                                             val=val,
                                             embedding_dim=10,
                                             num_layers=1,
                                             dropout_p=0.1,
                                             lr=1e-3,
                                             weight_decay=0.,
                                             optimizer='adam',
                                             loss='adaptive')
    elif request.param == 'neucf_sigmoid':
        model = NeuralCollaborativeFiltering(train=train, val=val, final_layer='sigmoid')
    elif request.param == 'neucf_relu':
        model = NeuralCollaborativeFiltering(train=train, val=val, final_layer='relu')
    elif request.param == 'neucf_leaky_rulu':
        # NOTE(review): 'rulu' looks like a typo for 'relu', but the string has to match
        # the parameter list defined outside this block, so it is left untouched
        model = NeuralCollaborativeFiltering(train=train, val=val, final_layer='leaky_relu')
    elif request.param == 'neucf_custom':
        model = NeuralCollaborativeFiltering(train=train, val=val, final_layer=torch.tanh)
    elif request.param in ('hybrid_pretrained', 'hybrid_pretrained_metadata_layers'):
        # the hybrid models are built on top of an already-fit matrix factorization model
        implicit_model = MatrixFactorizationModel(train=train,
                                                  val=val,
                                                  embedding_dim=10,
                                                  lr=1e-1,
                                                  optimizer='adam')
        implicit_model_trainer = CollieTrainer(model=implicit_model, **one_step_trainer_kwargs)
        implicit_model_trainer.fit(implicit_model)
        implicit_model.freeze()

        # per row, the index of the largest value across the ``genre`` columns
        genre_columns = [c for c in movielens_metadata_df.columns if 'genre' in c]
        genres = (
            torch.tensor(movielens_metadata_df[genre_columns].values)
            .topk(1)
            .indices
            .view(-1)
        )

        if request.param == 'hybrid_pretrained_metadata_layers':
            metadata_layers_dims = [16, 12]
        else:
            metadata_layers_dims = None

        # first fit a hybrid model with frozen embeddings...
        model_frozen = HybridPretrainedModel(train=train,
                                             val=val,
                                             item_metadata=movielens_metadata_df,
                                             trained_model=implicit_model,
                                             metadata_layers_dims=metadata_layers_dims,
                                             freeze_embeddings=True,
                                             dropout_p=0.15,
                                             loss='warp',
                                             lr=.01,
                                             optimizer=torch.optim.Adam,
                                             metadata_for_loss={'genre': genres},
                                             metadata_for_loss_weights={'genre': .4},
                                             weight_decay=0.0)
        model_frozen_trainer = CollieTrainer(model=model_frozen, **one_step_trainer_kwargs)
        model_frozen_trainer.fit(model_frozen)

        # ... then load it into an unfrozen hybrid model, which is the one that is returned
        model = HybridPretrainedModel(train=train,
                                      val=val,
                                      item_metadata=movielens_metadata_df,
                                      trained_model=implicit_model,
                                      metadata_layers_dims=metadata_layers_dims,
                                      freeze_embeddings=False,
                                      dropout_p=0.15,
                                      loss='bpr',
                                      lr=1e-4,
                                      optimizer=torch.optim.Adam,
                                      metadata_for_loss={'genre': genres},
                                      metadata_for_loss_weights={'genre': .4},
                                      weight_decay=0.0)
        model.load_from_hybrid_model(model_frozen)
    else:
        # without this branch ``model`` would be unbound below and fail with an opaque
        # ``UnboundLocalError`` instead of naming the offending parameter
        raise ValueError(f'Unrecognized ``request.param``: {request.param!r}')

    model_trainer = CollieTrainer(model=model, **one_step_trainer_kwargs)

    if request.param == 'mf_no_val':
        # fitting without validation data is expected to emit a ``UserWarning``
        with pytest.warns(UserWarning):
            model_trainer.fit(model)
    else:
        model_trainer.fit(model)

    model.freeze()

    return model
def untrained_implicit_model_no_val_data(train_val_implicit_data):
    """
    Return a ``MatrixFactorizationModel`` built with ``val=None`` that has not been fit.

    Parameters
    -------------
    train_val_implicit_data: tuple
        Two-element pair whose first element is passed to the model as ``train``; the
        second element is ignored

    Returns
    -------------
    model: MatrixFactorizationModel
        A newly constructed model with default hyperparameters

    """
    train_data, _ignored_val_data = train_val_implicit_data

    return MatrixFactorizationModel(train=train_data, val=None)
def untrained_implicit_model(train_val_implicit_data):
    """
    Return a ``MatrixFactorizationModel`` that has been constructed but not fit.

    Parameters
    -------------
    train_val_implicit_data: tuple
        Two-element ``(train, val)`` pair passed to the model as ``train`` and ``val``

    Returns
    -------------
    model: MatrixFactorizationModel
        A newly constructed model with default hyperparameters

    """
    train_data, val_data = train_val_implicit_data

    return MatrixFactorizationModel(train=train_data, val=val_data)
def test_instantiation_of_model_optimizer(train_val_implicit_data):
    """
    Check the optimizers and learning rate schedulers set up for different arguments.

    Each model is fit for a single step. The test then checks how many optimizers and
    schedulers were configured, that ``'infer'`` copies the main optimizer and learning
    rate to the bias terms, and that an unknown optimizer name raises ``ValueError``
    during ``fit``.

    """
    train, val = train_val_implicit_data

    def build_model(**model_kwargs):
        return MatrixFactorizationModel(train=train, val=val, **model_kwargs)

    def build_one_step_trainer(model):
        return CollieTrainer(model=model,
                             logger=False,
                             checkpoint_callback=False,
                             max_steps=1)

    def fit_for_one_step(model):
        trainer = build_one_step_trainer(model)
        trainer.fit(model)
        return trainer

    def count_lr_schedulers(trainer):
        return len([s['scheduler'] for s in trainer.lr_schedulers])

    # no bias optimizer, default scheduler: a single optimizer with a single scheduler
    model_1 = build_model(bias_optimizer=None)
    trainer_1 = fit_for_one_step(model_1)
    assert not isinstance(model_1.optimizers(), list)
    assert count_lr_schedulers(trainer_1) == 1

    # no bias optimizer, no scheduler
    model_2 = build_model(bias_optimizer=None, lr_scheduler_func=None)
    trainer_2 = fit_for_one_step(model_2)
    assert not isinstance(model_2.optimizers(), list)
    assert count_lr_schedulers(trainer_2) == 0

    # inferred bias optimizer and learning rate, default scheduler
    model_3 = build_model(bias_optimizer='infer', bias_lr='infer')
    trainer_3 = fit_for_one_step(model_3)
    assert len(model_3.optimizers()) == 2
    assert model_3.bias_optimizer == model_3.optimizer
    assert model_3.hparams.bias_lr == model_3.hparams.lr
    assert count_lr_schedulers(trainer_3) == 2

    # inferred bias optimizer and learning rate, no scheduler
    model_4 = build_model(bias_optimizer='infer', bias_lr='infer', lr_scheduler_func=None)
    trainer_4 = fit_for_one_step(model_4)
    assert len(model_4.optimizers()) == 2
    assert model_4.bias_optimizer == model_4.optimizer
    assert model_4.hparams.bias_lr == model_4.hparams.lr
    assert count_lr_schedulers(trainer_4) == 0

    # inferred bias optimizer with an explicit bias learning rate, no scheduler
    model_5 = build_model(bias_optimizer='infer', bias_lr=10, lr_scheduler_func=None)
    trainer_5 = fit_for_one_step(model_5)
    assert len(model_5.optimizers()) == 2
    assert model_5.bias_optimizer == model_5.optimizer
    assert model_5.hparams.bias_lr != model_5.hparams.lr
    assert count_lr_schedulers(trainer_5) == 0

    # unknown optimizer names are only rejected once fitting starts
    model_6 = build_model(optimizer='fake_optimizer')
    trainer_6 = build_one_step_trainer(model_6)
    with pytest.raises(ValueError):
        trainer_6.fit(model_6)

    model_7 = build_model(bias_optimizer='fake_optimizer')
    trainer_7 = build_one_step_trainer(model_7)
    with pytest.raises(ValueError):
        trainer_7.fit(model_7)

    # ``Adadelta`` accepts ``weight_decay`` parameter
    model_8 = build_model(optimizer=torch.optim.Adadelta)
    fit_for_one_step(model_8)

    # ``LBFGS`` does not accept ``weight_decay`` parameter
    model_9 = build_model(optimizer=torch.optim.LBFGS, sparse=True)
    fit_for_one_step(model_9)
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Retrieve and split data, train and evaluate a model, and save it.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie_recs/movielens/run.py  --epochs 20

    Parameters
    -------------
    epochs: int
        Number of epochs for model training
    gpus: int
        Number of gpus to train on

    """
    timer = Timer()

    # step 1: read the ratings data
    timer.timecheck(' 1.0 - retrieving MovieLens 100K dataset')
    ratings_df = read_movielens_df(decrement_ids=True)
    timer.timecheck(' 1.0 complete')

    # step 2: convert to implicit interactions, split, and wrap in data loaders
    timer.timecheck(' 2.0 - splitting data')
    implicit_df = convert_to_implicit(ratings_df)
    interactions = Interactions(
        users=implicit_df['user_id'],
        items=implicit_df['item_id'],
        allow_missing_ids=True,
    )
    train, val, test = stratified_split(interactions, val_p=0.1, test_p=0.1)
    train_loader = InteractionsDataLoader(train, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val, batch_size=1024, shuffle=False)
    timer.timecheck(' 2.0 complete')

    # step 3: fit the model, with early stopping monitoring ``val_loss_epoch``
    timer.timecheck(' 3.0 - training the model')
    model = MatrixFactorizationModel(
        train=train_loader,
        val=val_loader,
        dropout_p=0.05,
        loss='adaptive',
        lr=5e-2,
        embedding_dim=10,
        optimizer='adam',
        weight_decay=1e-7,
    )
    early_stopping = EarlyStopping(monitor='val_loss_epoch', mode='min')
    trainer = CollieTrainer(
        model=model,
        gpus=gpus,
        max_epochs=epochs,
        deterministic=True,
        logger=False,
        checkpoint_callback=False,
        callbacks=[early_stopping],
        weights_summary='full',
        terminate_on_nan=True,
    )
    trainer.fit(model)
    model.eval()
    timer.timecheck('\n 3.0 complete')

    # step 4: score the fitted model on the held-out test split
    timer.timecheck(' 4.0 - evaluating model')
    auc_score, mrr_score, mapk_score = evaluate_in_batches([auc, mrr, mapk], test, model, k=10)
    for metric_label, metric_score in (
        ('AUC', auc_score),
        ('MRR', mrr_score),
        ('MAP@10', mapk_score),
    ):
        print(f'{metric_label}: {metric_score}')
    timer.timecheck(' 4.0 complete')

    # step 5: save the fitted model under ``DATA_PATH``
    timer.timecheck(' 5.0 - saving model')
    model.save_model(DATA_PATH / 'fitted_model')
    timer.timecheck(' 5.0 complete')