def test_stratified_split_with_user_with_only_one_interaction(
    interactions_to_split_with_a_user_with_only_one_interaction,
):
    """
    Expect ``stratified_split`` to raise ``ValueError`` for the single-interaction fixture.

    The split is requested with ``test_p=0.2`` and ``seed=42``.

    """
    split_kwargs = {
        'interactions': interactions_to_split_with_a_user_with_only_one_interaction,
        'test_p': 0.2,
        'seed': 42,
    }

    with pytest.raises(ValueError):
        stratified_split(**split_kwargs)
def train_val_implicit_data(movielens_implicit_interactions):
    """
    Split ``movielens_implicit_interactions`` with ``stratified_split``.

    The split uses ``val_p=0.``, ``test_p=0.2``, and ``seed=42``; whatever
    ``stratified_split`` returns for those arguments is passed back unchanged.

    """
    split_result = stratified_split(
        interactions=movielens_implicit_interactions,
        val_p=0.,
        test_p=0.2,
        seed=42,
    )

    return split_result
def test_splits_vary_number_of_processes(interactions_to_split):
    """
    Check that a seeded ``stratified_split`` is unaffected by the ``processes`` argument.

    The same interactions are split with ``seed=42`` for ``processes`` values of
    ``-1``, ``0``, ``1``, and ``2``, and the resulting train and test sets are
    required to be equal across all four runs.

    """
    process_counts = (-1, 0, 1, 2)
    splits = [
        stratified_split(interactions=interactions_to_split, seed=42, processes=n_processes)
        for n_processes in process_counts
    ]
    train_sets = [train for train, _ in splits]
    test_sets = [test for _, test in splits]

    # comparing each split only to its neighbor is enough: equality is transitive
    for earlier, later in zip(train_sets, train_sets[1:]):
        np.testing.assert_array_equal(earlier.toarray(), later.toarray())

    for earlier, later in zip(test_sets, test_sets[1:]):
        np.testing.assert_array_equal(earlier.toarray(), later.toarray())
def test_stratified_split(interactions_to_split):
    """
    Compare ``stratified_split`` output against hand-written expected splits.

    The fixture is split with ``val_p=0.1``, ``test_p=0.2``, and ``seed=46``. Each
    of the three returned objects must match its expected counterpart when
    converted with ``toarray()``, and all six objects must agree on ``num_users``
    and ``num_items``.

    """
    expected_shape = (interactions_to_split.num_users, interactions_to_split.num_items)

    def build_expected(user_ids, item_ids, ratings):
        """Wrap the given (user, item, rating) triplets in an ``Interactions`` object."""
        frame = pd.DataFrame(
            data={
                'user_id': user_ids,
                'item_id': item_ids,
                'rating': ratings,
            })

        return Interactions(
            mat=coo_matrix(
                (
                    frame['rating'],
                    (frame['user_id'], frame['item_id']),
                ),
                shape=expected_shape,
            ),
            allow_missing_ids=True,
            check_num_negative_samples_is_valid=False,
        )

    train_expected = build_expected(
        user_ids=[0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 4, 4],
        item_ids=[1, 2, 3, 4, 6, 8, 1, 2, 3, 4, 2, 4, 5],
        ratings=[2, 3, 4, 5, 3, 1, 1, 2, 4, 5, 5, 5, 4],
    )
    validate_expected = build_expected(
        user_ids=[0, 1, 2, 3, 4],
        item_ids=[7, 3, 2, 1, 2],
        ratings=[2, 3, 3, 1, 3],
    )
    test_expected = build_expected(
        user_ids=[0, 0, 1, 2, 3, 4],
        item_ids=[0, 5, 4, 1, 4, 1],
        ratings=[1, 4, 4, 2, 4, 2],
    )

    train_actual, validate_actual, test_actual = stratified_split(
        interactions=interactions_to_split,
        val_p=0.1,
        test_p=0.2,
        seed=46,
    )

    actual_and_expected = (
        (train_actual, train_expected),
        (validate_actual, validate_expected),
        (test_actual, test_expected),
    )
    for actual, expected in actual_and_expected:
        np.testing.assert_array_equal(actual.toarray(), expected.toarray())

    assert (
        train_actual.num_users
        == train_expected.num_users
        == validate_actual.num_users
        == validate_expected.num_users
        == test_actual.num_users
        == test_expected.num_users
    )

    assert (
        train_actual.num_items
        == train_expected.num_items
        == validate_actual.num_items
        == validate_expected.num_items
        == test_actual.num_items
        == test_expected.num_items
    )
def test_test_p_negative_stratified(self, interactions_to_split):
    """Expect ``stratified_split`` to raise ``ValueError`` for a negative ``test_p``."""
    negative_test_p = -0.7

    with pytest.raises(ValueError):
        stratified_split(interactions=interactions_to_split, test_p=negative_test_p)
def test_test_p_equal_one_stratified(self, interactions_to_split):
    """Expect ``stratified_split`` to raise ``ValueError`` when ``test_p`` is ``1``."""
    whole_dataset_test_p = 1

    with pytest.raises(ValueError):
        stratified_split(interactions=interactions_to_split, test_p=whole_dataset_test_p)
def test_val_negative_but_combined_good_stratified(self, interactions_to_split):
    """
    Expect ``stratified_split`` to raise ``ValueError`` for a negative ``val_p``.

    The values used are ``val_p=-0.1`` and ``test_p=0.3``.

    """
    proportions = {'val_p': -0.1, 'test_p': 0.3}

    with pytest.raises(ValueError):
        stratified_split(interactions=interactions_to_split, **proportions)
def test_combined_equal_one_stratified(self, interactions_to_split):
    """
    Expect ``stratified_split`` to raise ``ValueError`` for ``val_p=0.7``, ``test_p=0.3``.

    """
    proportions = {'val_p': 0.7, 'test_p': 0.3}

    with pytest.raises(ValueError):
        stratified_split(interactions=interactions_to_split, **proportions)
def test_combined_too_large_stratified(self, interactions_to_split):
    """
    Expect ``stratified_split`` to raise ``ValueError`` for ``val_p=0.9``, ``test_p=0.2``.

    """
    proportions = {'val_p': 0.9, 'test_p': 0.2}

    with pytest.raises(ValueError):
        stratified_split(interactions=interactions_to_split, **proportions)
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Run the MovieLens 100K example end to end.

    The steps, each bracketed by ``Timer.timecheck`` calls, are: read the data,
    convert it to implicit interactions and split it, train a
    ``MatrixFactorizationModel`` with a ``CollieTrainer``, print AUC, MRR, and
    MAP@10 computed on the test split, and save the model under ``DATA_PATH``.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie_recs/movielens/run.py --epochs 20

    Parameters
    -------------
    epochs: int
        Number of epochs for model training
    gpus: int
        Number of gpus to train on

    """
    timer = Timer()

    timer.timecheck(' 1.0 - retrieving MovieLens 100K dataset')
    ratings_df = read_movielens_df(decrement_ids=True)
    timer.timecheck(' 1.0 complete')

    timer.timecheck(' 2.0 - splitting data')
    implicit_df = convert_to_implicit(ratings_df)
    interactions = Interactions(
        users=implicit_df['user_id'],
        items=implicit_df['item_id'],
        allow_missing_ids=True,
    )
    train, val, test = stratified_split(interactions, val_p=0.1, test_p=0.1)

    batch_size = 1024
    train_loader = InteractionsDataLoader(train, batch_size=batch_size, shuffle=True)
    val_loader = InteractionsDataLoader(val, batch_size=batch_size, shuffle=False)
    timer.timecheck(' 2.0 complete')

    timer.timecheck(' 3.0 - training the model')
    model = MatrixFactorizationModel(
        train=train_loader,
        val=val_loader,
        dropout_p=0.05,
        loss='adaptive',
        lr=5e-2,
        embedding_dim=10,
        optimizer='adam',
        weight_decay=1e-7,
    )

    early_stopping = EarlyStopping(monitor='val_loss_epoch', mode='min')
    trainer = CollieTrainer(
        model=model,
        gpus=gpus,
        max_epochs=epochs,
        deterministic=True,
        logger=False,
        checkpoint_callback=False,
        callbacks=[early_stopping],
        weights_summary='full',
        terminate_on_nan=True,
    )
    trainer.fit(model)
    model.eval()
    timer.timecheck('\n 3.0 complete')

    timer.timecheck(' 4.0 - evaluating model')
    metrics = [auc, mrr, mapk]
    auc_score, mrr_score, mapk_score = evaluate_in_batches(metrics, test, model, k=10)

    print(f'AUC: {auc_score}')
    print(f'MRR: {mrr_score}')
    print(f'MAP@10: {mapk_score}')
    timer.timecheck(' 4.0 complete')

    timer.timecheck(' 5.0 - saving model')
    model.save_model(DATA_PATH / 'fitted_model')
    timer.timecheck(' 5.0 complete')