Esempio n. 1
0
def test_explicit_to_implicit(explicit_df):
    """
    Check ``convert_to_implicit`` against a hand-written expected frame.

    ``explicit_df`` is converted using ``userId`` / ``itemId`` as the user and item
    columns, and the result must equal a two-row frame with columns ``userId``,
    ``itemId``, and ``rating``.

    Parameters
    -------------
    explicit_df:
        Input frame to convert (presumably supplied by a pytest fixture defined
        elsewhere - confirm against the test module's fixtures)

    """
    expected_columns = {
        'userId': [3, 1],
        'itemId': [1, 3],
        'rating': [1, 1],
    }
    expected_df = pd.DataFrame(expected_columns)

    implicit_df = convert_to_implicit(
        explicit_df,
        user_col='userId',
        item_col='itemId',
    )

    pd.testing.assert_frame_equal(implicit_df, expected_df)
Esempio n. 2
0
def test_explicit_to_implicit_with_duplicate_user_id_item_id_pairs(
    explicit_df_with_duplicate_user_item_pairs
):
    """
    Check ``convert_to_implicit`` on input named as containing duplicate user/item pairs.

    The input is converted using ``userId`` / ``itemId`` as the user and item columns,
    and the result must equal a three-row frame with columns ``userId``, ``itemId``,
    and ``rating``.

    Parameters
    -------------
    explicit_df_with_duplicate_user_item_pairs:
        Input frame to convert (presumably supplied by a pytest fixture defined
        elsewhere - confirm against the test module's fixtures)

    """
    expected_columns = {
        'userId': [3, 2, 1],
        'itemId': [1, 0, 3],
        'rating': [1, 1, 1],
    }
    expected_df = pd.DataFrame(expected_columns)

    implicit_df = convert_to_implicit(
        explicit_df_with_duplicate_user_item_pairs,
        user_col='userId',
        item_col='itemId',
    )

    pd.testing.assert_frame_equal(implicit_df, expected_df)
Esempio n. 3
0
def movielens_implicit_df(movielens_explicit_df):
    """
    Return ``movielens_explicit_df`` passed through ``convert_to_implicit``.

    ``convert_to_implicit`` is called with its default arguments and its result is
    returned unchanged.

    """
    implicit_df = convert_to_implicit(movielens_explicit_df)
    return implicit_df
Esempio n. 4
0
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Run the MovieLens 100K example end to end.

    The pipeline has five stages, each bracketed by ``Timer.timecheck`` calls: read the
    data, convert it to implicit interactions and split it, fit a
    ``MatrixFactorizationModel``, print AUC / MRR / MAP@10 on the test split, and save
    the fitted model under ``DATA_PATH / 'fitted_model'``.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie_recs/movielens/run.py  --epochs 20

    Parameters
    -------------
    epochs: int
        Forwarded to the trainer as ``max_epochs``
    gpus: int
        Forwarded to the trainer as ``gpus``

    """
    timer = Timer()

    # Stage 1: load the raw ratings.
    timer.timecheck('  1.0 - retrieving MovieLens 100K dataset')
    ratings_df = read_movielens_df(decrement_ids=True)
    timer.timecheck('  1.0 complete')

    # Stage 2: build interactions and split them into train / validation / test.
    timer.timecheck('  2.0 - splitting data')
    implicit_df = convert_to_implicit(ratings_df)
    interactions = Interactions(
        users=implicit_df['user_id'],
        items=implicit_df['item_id'],
        allow_missing_ids=True,
    )
    train_split, val_split, test_split = stratified_split(interactions, val_p=0.1, test_p=0.1)
    # Only the training loader shuffles.
    train_loader = InteractionsDataLoader(train_split, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val_split, batch_size=1024, shuffle=False)
    timer.timecheck('  2.0 complete')

    # Stage 3: configure and fit the model.
    timer.timecheck('  3.0 - training the model')
    model_kwargs = dict(
        train=train_loader,
        val=val_loader,
        dropout_p=0.05,
        loss='adaptive',
        lr=5e-2,
        embedding_dim=10,
        optimizer='adam',
        weight_decay=1e-7,
    )
    model = MatrixFactorizationModel(**model_kwargs)

    early_stopping = EarlyStopping(monitor='val_loss_epoch', mode='min')
    trainer = CollieTrainer(
        model=model,
        gpus=gpus,
        max_epochs=epochs,
        deterministic=True,
        logger=False,
        checkpoint_callback=False,
        callbacks=[early_stopping],
        weights_summary='full',
        terminate_on_nan=True,
    )
    trainer.fit(model)
    # Switch the model to evaluation mode before scoring it.
    model.eval()
    timer.timecheck('\n  3.0 complete')

    # Stage 4: score the fitted model on the held-out test split.
    timer.timecheck('  4.0 - evaluating model')
    metrics = [auc, mrr, mapk]
    auc_score, mrr_score, mapk_score = evaluate_in_batches(metrics, test_split, model, k=10)
    print(f'AUC:          {auc_score}')
    print(f'MRR:          {mrr_score}')
    print(f'MAP@10:       {mapk_score}')
    timer.timecheck('  4.0 complete')

    # Stage 5: persist the fitted model.
    timer.timecheck('  5.0 - saving model')
    model.save_model(DATA_PATH / 'fitted_model')
    timer.timecheck('  5.0 complete')