def test_stratified_split_with_user_with_only_one_interaction(
    interactions_to_split_with_a_user_with_only_one_interaction, ):
    """Expect ``stratified_split`` to raise ``ValueError`` for this fixture's data."""
    split_kwargs = dict(
        interactions=interactions_to_split_with_a_user_with_only_one_interaction,
        test_p=0.2,
        seed=42,
    )

    with pytest.raises(ValueError):
        stratified_split(**split_kwargs)
# Example no. 2
# 0
def train_val_implicit_data(movielens_implicit_interactions):
    """
    Split the MovieLens implicit interactions with ``stratified_split``.

    The split is requested with ``val_p=0.``, ``test_p=0.2`` and a fixed
    ``seed=42``; whatever ``stratified_split`` returns is handed back
    unchanged.

    NOTE(review): this looks like a pytest fixture whose decorator is not
    visible here - confirm against the original module.
    """
    split_data = stratified_split(
        interactions=movielens_implicit_interactions,
        val_p=0.,
        test_p=0.2,
        seed=42,
    )
    return split_data
def test_splits_vary_number_of_processes(interactions_to_split):
    """Check that a seeded split is the same for every ``processes`` value tried."""
    train_splits = []
    test_splits = []
    for n_processes in (-1, 0, 1, 2):
        train_part, test_part = stratified_split(
            interactions=interactions_to_split,
            seed=42,
            processes=n_processes,
        )
        train_splits.append(train_part)
        test_splits.append(test_part)

    # Comparing each split only with its neighbor is enough: equality is
    # transitive, so 1 == 2, 2 == 3 and 3 == 4 implies they all match.
    for split_group in (train_splits, test_splits):
        for earlier, later in zip(split_group, split_group[1:]):
            np.testing.assert_array_equal(earlier.toarray(), later.toarray())
def test_stratified_split(interactions_to_split):
    """Compare a seeded train/validate/test ``stratified_split`` to hard-coded expectations."""

    def build_expected(frame):
        # Wrap the hard-coded rows in an ``Interactions`` object that has the
        # same user/item dimensions as the fixture being split.
        return Interactions(
            mat=coo_matrix(
                (
                    frame['rating'],
                    (frame['user_id'], frame['item_id']),
                ),
                shape=(interactions_to_split.num_users,
                       interactions_to_split.num_items),
            ),
            allow_missing_ids=True,
            check_num_negative_samples_is_valid=False,
        )

    train_expected = build_expected(
        pd.DataFrame(
            data={
                'user_id': [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 4, 4],
                'item_id': [1, 2, 3, 4, 6, 8, 1, 2, 3, 4, 2, 4, 5],
                'rating': [2, 3, 4, 5, 3, 1, 1, 2, 4, 5, 5, 5, 4],
            }))

    validate_expected = build_expected(
        pd.DataFrame(
            data={
                'user_id': [0, 1, 2, 3, 4],
                'item_id': [7, 3, 2, 1, 2],
                'rating': [2, 3, 3, 1, 3],
            }))

    test_expected = build_expected(
        pd.DataFrame(
            data={
                'user_id': [0, 0, 1, 2, 3, 4],
                'item_id': [0, 5, 4, 1, 4, 1],
                'rating': [1, 4, 4, 2, 4, 2],
            }))

    train_actual, validate_actual, test_actual = stratified_split(
        interactions=interactions_to_split,
        val_p=0.1,
        test_p=0.2,
        seed=46,
    )

    comparisons = (
        (train_actual, train_expected),
        (validate_actual, validate_expected),
        (test_actual, test_expected),
    )

    # The contents of each split must match the expectation exactly.
    for actual, expected in comparisons:
        np.testing.assert_array_equal(actual.toarray(), expected.toarray())

    # All six objects (actual and expected) must agree on both dimensions;
    # neighbors are compared pairwise, like a chained ``==``.
    for dimension in ('num_users', 'num_items'):
        sizes = [
            getattr(part, dimension) for pair in comparisons for part in pair
        ]
        assert all(left == right for left, right in zip(sizes, sizes[1:]))
 def test_test_p_negative_stratified(self, interactions_to_split):
     """A negative ``test_p`` is expected to make ``stratified_split`` raise ``ValueError``."""
     negative_test_p = -0.7

     with pytest.raises(ValueError):
         stratified_split(interactions=interactions_to_split,
                          test_p=negative_test_p)
 def test_test_p_equal_one_stratified(self, interactions_to_split):
     """A ``test_p`` of exactly 1 is expected to make ``stratified_split`` raise ``ValueError``."""
     whole_dataset_test_p = 1

     with pytest.raises(ValueError):
         stratified_split(interactions=interactions_to_split,
                          test_p=whole_dataset_test_p)
 def test_val_negative_but_combined_good_stratified(self,
                                                    interactions_to_split):
     """A negative ``val_p`` is expected to raise ``ValueError`` even when paired with a positive ``test_p``."""
     proportions = dict(val_p=-0.1, test_p=0.3)

     with pytest.raises(ValueError):
         stratified_split(interactions=interactions_to_split, **proportions)
 def test_combined_equal_one_stratified(self, interactions_to_split):
     """``val_p=0.7`` with ``test_p=0.3`` (nominally summing to one) is expected to raise ``ValueError``."""
     proportions = dict(val_p=0.7, test_p=0.3)

     with pytest.raises(ValueError):
         stratified_split(interactions=interactions_to_split, **proportions)
 def test_combined_too_large_stratified(self, interactions_to_split):
     """``val_p=0.9`` with ``test_p=0.2`` (together above one) is expected to raise ``ValueError``."""
     proportions = dict(val_p=0.9, test_p=0.2)

     with pytest.raises(ValueError):
         stratified_split(interactions=interactions_to_split, **proportions)
# Example no. 10
# 0
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Run the MovieLens example end to end: load, split, train, evaluate, save.

    Progress is reported through ``Timer.timecheck`` around each numbered
    step, and the evaluation metrics are printed to standard output.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie_recs/movielens/run.py  --epochs 20

    Parameters
    ----------
    epochs: int
        Passed to the trainer as ``max_epochs``; an ``EarlyStopping``
        callback monitoring ``val_loss_epoch`` is also attached
    gpus: int
        Number of gpus handed to the trainer

    """
    timer = Timer()

    # Step 1: load the raw ratings.
    timer.timecheck('  1.0 - retrieving MovieLens 100K dataset')
    ratings_df = read_movielens_df(decrement_ids=True)
    timer.timecheck('  1.0 complete')

    # Step 2: convert to implicit feedback, split, and wrap in data loaders.
    timer.timecheck('  2.0 - splitting data')
    implicit_df = convert_to_implicit(ratings_df)
    interactions = Interactions(
        users=implicit_df['user_id'],
        items=implicit_df['item_id'],
        allow_missing_ids=True,
    )
    train, val, test = stratified_split(interactions, val_p=0.1, test_p=0.1)
    batch_size = 1024
    train_loader = InteractionsDataLoader(train,
                                          batch_size=batch_size,
                                          shuffle=True)
    val_loader = InteractionsDataLoader(val,
                                        batch_size=batch_size,
                                        shuffle=False)
    timer.timecheck('  2.0 complete')

    # Step 3: fit a matrix factorization model.
    timer.timecheck('  3.0 - training the model')
    model = MatrixFactorizationModel(
        train=train_loader,
        val=val_loader,
        dropout_p=0.05,
        loss='adaptive',
        lr=5e-2,
        embedding_dim=10,
        optimizer='adam',
        weight_decay=1e-7,
    )
    early_stopping = EarlyStopping(monitor='val_loss_epoch', mode='min')
    trainer = CollieTrainer(
        model=model,
        gpus=gpus,
        max_epochs=epochs,
        deterministic=True,
        logger=False,
        checkpoint_callback=False,
        callbacks=[early_stopping],
        weights_summary='full',
        terminate_on_nan=True,
    )
    trainer.fit(model)
    model.eval()
    timer.timecheck('\n  3.0 complete')

    # Step 4: score the fitted model on the held-out test split.
    timer.timecheck('  4.0 - evaluating model')
    metrics = [auc, mrr, mapk]
    auc_score, mrr_score, mapk_score = evaluate_in_batches(metrics,
                                                           test,
                                                           model,
                                                           k=10)
    print(f'AUC:          {auc_score}')
    print(f'MRR:          {mrr_score}')
    print(f'MAP@10:       {mapk_score}')
    timer.timecheck('  4.0 complete')

    # Step 5: persist the fitted model under DATA_PATH.
    timer.timecheck('  5.0 - saving model')
    absolute_data_path = DATA_PATH / 'fitted_model'
    model.save_model(absolute_data_path)
    timer.timecheck('  5.0 complete')