Beispiel #1
0
def test_stratified_split_with_user_with_only_one_interaction(
    interactions_to_split_with_a_user_with_only_one_interaction,
):
    """
    Expect ``stratified_split`` to raise a ``ValueError`` for the fixture that (per its name)
    contains a user with only one interaction.

    """
    split_kwargs = {'test_p': 0.2, 'seed': 42}

    with pytest.raises(ValueError):
        stratified_split(
            interactions=interactions_to_split_with_a_user_with_only_one_interaction,
            **split_kwargs,
        )
Beispiel #2
0
def train_val_explicit_data(movielens_explicit_interactions):
    """
    Return the result of ``stratified_split`` on the explicit MovieLens interactions.

    The split is requested with ``val_p=0.``, ``test_p=0.2``, and a fixed ``seed`` of 42.

    """
    split_kwargs = {'val_p': 0., 'test_p': 0.2, 'seed': 42}

    return stratified_split(interactions=movielens_explicit_interactions, **split_kwargs)
Beispiel #3
0
def test_splits_vary_number_of_processes(implicit_interactions_to_split):
    """
    Check that ``stratified_split`` yields the same train and test splits for every value of
    ``processes`` tried here (-1, 0, 1, and 2) when the seed is fixed.

    """
    train_splits = []
    test_splits = []
    for num_processes in (-1, 0, 1, 2):
        train_split, test_split = stratified_split(
            interactions=implicit_interactions_to_split,
            seed=42,
            processes=num_processes,
        )
        train_splits.append(train_split)
        test_splits.append(test_split)

    # comparing each split only to its neighbor is enough: equality is transitive
    for previous, current in zip(train_splits, train_splits[1:]):
        np.testing.assert_array_equal(previous.toarray(), current.toarray())

    for previous, current in zip(test_splits, test_splits[1:]):
        np.testing.assert_array_equal(previous.toarray(), current.toarray())
Beispiel #4
0
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Run the MovieLens 100K example end to end: load the data, split it, train a matrix
    factorization model, evaluate it on the test split, and save the fitted model.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie/movielens/run.py  --epochs 20

    Parameters
    ----------
    epochs: int
        Maximum number of epochs for model training (passed to the trainer as ``max_epochs``)
    gpus: int
        Number of gpus to train on

    """
    timer = Timer()

    # Step 1: load the raw ratings data.
    timer.timecheck('  1.0 - retrieving MovieLens 100K dataset')
    ratings_df = read_movielens_df(decrement_ids=True)
    timer.timecheck('  1.0 complete')

    # Step 2: convert to implicit feedback and build train / validation / test splits.
    timer.timecheck('  2.0 - splitting data')
    implicit_df = convert_to_implicit(ratings_df)
    interactions = Interactions(
        users=implicit_df['user_id'],
        items=implicit_df['item_id'],
        allow_missing_ids=True,
    )
    train, val, test = stratified_split(interactions, val_p=0.1, test_p=0.1)
    train_loader = InteractionsDataLoader(train, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val, batch_size=1024, shuffle=False)
    timer.timecheck('  2.0 complete')

    # Step 3: fit the model, stopping early based on the monitored validation loss.
    timer.timecheck('  3.0 - training the model')
    model = MatrixFactorizationModel(
        train=train_loader,
        val=val_loader,
        dropout_p=0.05,
        loss='adaptive',
        lr=5e-2,
        embedding_dim=10,
        optimizer='adam',
        weight_decay=1e-7,
    )
    early_stopping = EarlyStopping(monitor='val_loss_epoch', mode='min')
    trainer = CollieTrainer(
        model=model,
        gpus=gpus,
        max_epochs=epochs,
        deterministic=True,
        logger=False,
        checkpoint_callback=False,
        callbacks=[early_stopping],
        weights_summary='full',
        terminate_on_nan=True,
    )
    trainer.fit(model)
    model.eval()
    timer.timecheck('\n  3.0 complete')

    # Step 4: score the fitted model on the held-out test split.
    timer.timecheck('  4.0 - evaluating model')
    metrics = [auc, mrr, mapk]
    auc_score, mrr_score, mapk_score = evaluate_in_batches(metrics, test, model, k=10)
    print(f'AUC:          {auc_score}')
    print(f'MRR:          {mrr_score}')
    print(f'MAP@10:       {mapk_score}')
    timer.timecheck('  4.0 complete')

    # Step 5: persist the fitted model under ``DATA_PATH``.
    timer.timecheck('  5.0 - saving model')
    model.save_model(DATA_PATH / 'fitted_model')
    timer.timecheck('  5.0 complete')
Beispiel #5
0
 def test_test_p_negative_stratified(self, implicit_interactions_to_split):
     """Expect ``stratified_split`` to raise a ``ValueError`` for a negative ``test_p``."""
     negative_test_p = -0.7

     with pytest.raises(ValueError):
         stratified_split(
             interactions=implicit_interactions_to_split,
             test_p=negative_test_p,
         )
Beispiel #6
0
 def test_test_p_equal_one_stratified(self, implicit_interactions_to_split):
     """Expect ``stratified_split`` to raise a ``ValueError`` when ``test_p`` is exactly 1."""
     full_test_p = 1

     with pytest.raises(ValueError):
         stratified_split(
             interactions=implicit_interactions_to_split,
             test_p=full_test_p,
         )
Beispiel #7
0
 def test_val_negative_but_combined_good_stratified(
     self,
     implicit_interactions_to_split,
 ):
     """
     Expect ``stratified_split`` to raise a ``ValueError`` for a negative ``val_p``, even
     though ``val_p + test_p`` (here ``-0.1 + 0.3``) lands between 0 and 1.

     """
     proportions = {'val_p': -0.1, 'test_p': 0.3}

     with pytest.raises(ValueError):
         stratified_split(interactions=implicit_interactions_to_split, **proportions)
Beispiel #8
0
 def test_combined_equal_one_stratified(
     self,
     implicit_interactions_to_split,
 ):
     """
     Expect ``stratified_split`` to raise a ``ValueError`` when ``val_p`` and ``test_p``
     (here ``0.7`` and ``0.3``) add up to one.

     """
     proportions = {'val_p': 0.7, 'test_p': 0.3}

     with pytest.raises(ValueError):
         stratified_split(interactions=implicit_interactions_to_split, **proportions)
Beispiel #9
0
 def test_combined_too_large_stratified(
     self,
     implicit_interactions_to_split,
 ):
     """
     Expect ``stratified_split`` to raise a ``ValueError`` when ``val_p`` and ``test_p``
     (here ``0.9`` and ``0.2``) add up to more than one.

     """
     proportions = {'val_p': 0.9, 'test_p': 0.2}

     with pytest.raises(ValueError):
         stratified_split(interactions=implicit_interactions_to_split, **proportions)
Beispiel #10
0
def test_bad_stratified_split_HDF5Interactions(hdf5_interactions):
    """
    Expect ``stratified_split`` to raise an ``AssertionError`` when it is handed the
    ``hdf5_interactions`` fixture.

    """
    split_input = hdf5_interactions

    with pytest.raises(AssertionError):
        stratified_split(interactions=split_input)
Beispiel #11
0
def test_stratified_split(implicit_interactions_to_split,
                          explicit_interactions_to_split, data_type):
    """
    Compare the output of ``stratified_split`` (``val_p=0.1``, ``test_p=0.2``, ``seed=46``)
    against hard-coded expected train, validation, and test splits.

    ``data_type`` selects whether the implicit or the explicit fixture (and the matching
    interactions class) is exercised. Besides the matrix contents, the test also checks that
    all splits agree on ``num_users`` and ``num_items`` and are of the selected class.

    """
    is_implicit = data_type == 'implicit'
    interactions_class = Interactions if is_implicit else ExplicitInteractions
    interactions_kwargs = (
        {'check_num_negative_samples_is_valid': False} if is_implicit else {}
    )
    interactions_to_split = (
        implicit_interactions_to_split if is_implicit else explicit_interactions_to_split
    )

    def build_expected(user_ids, item_ids, ratings):
        # Wrap the expected triplets in the same interactions class and matrix shape as the
        # data being split.
        frame = pd.DataFrame(
            data={
                'user_id': user_ids,
                'item_id': item_ids,
                'rating': ratings,
            })
        matrix = coo_matrix(
            (frame['rating'], (frame['user_id'], frame['item_id'])),
            shape=(interactions_to_split.num_users,
                   interactions_to_split.num_items),
        )
        return interactions_class(
            mat=matrix,
            allow_missing_ids=True,
            **interactions_kwargs,
        )

    train_expected = build_expected(
        user_ids=[0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 4, 4],
        item_ids=[1, 2, 3, 4, 6, 8, 1, 2, 3, 4, 2, 4, 5],
        ratings=[2, 3, 4, 5, 3, 1, 1, 2, 4, 5, 5, 5, 4],
    )
    validate_expected = build_expected(
        user_ids=[0, 1, 2, 3, 4],
        item_ids=[7, 3, 2, 1, 2],
        ratings=[2, 3, 3, 1, 3],
    )
    test_expected = build_expected(
        user_ids=[0, 0, 1, 2, 3, 4],
        item_ids=[0, 5, 4, 1, 4, 1],
        ratings=[1, 4, 4, 2, 4, 2],
    )

    train_actual, validate_actual, test_actual = stratified_split(
        interactions=interactions_to_split,
        val_p=0.1,
        test_p=0.2,
        seed=46,
    )

    actual_and_expected = (
        (train_actual, train_expected),
        (validate_actual, validate_expected),
        (test_actual, test_expected),
    )
    for actual, expected in actual_and_expected:
        np.testing.assert_array_equal(actual.toarray(), expected.toarray())

    # flatten to: train actual/expected, validate actual/expected, test actual/expected
    all_splits = [split for pair in actual_and_expected for split in pair]
    reference = all_splits[0]

    assert all(split.num_users == reference.num_users for split in all_splits)

    assert all(split.num_items == reference.num_items for split in all_splits)

    assert all(type(split) == interactions_class for split in all_splits)