Esempio n. 1
0
def get_recommendation_visualizations(
    model: collie_recs.model.BasePipeline,
    user_id: int,
    df_user: Optional[pd.DataFrame] = None,
    df_item: Optional[pd.DataFrame] = None,
    movielens_posters_df: Optional[pd.DataFrame] = None,
    num_user_movies_to_display: int = 10,
    num_similar_movies: int = 10,
    filter_films: bool = True,
    shuffle: bool = True,
    detailed: bool = False,
    image_width: int = 500,
) -> str:
    """
    Get visual recommendations through Movielens posters for a given user.

    Parameters
    -------------
    model: collie_recs.model.BasePipeline
        Model whose ``get_item_predictions`` method is called with ``user_id - 1`` to score items
    user_id: int
        User ID to pull recommendations for
    df_user: DataFrame
        ``u.data`` from MovieLens data. If ``None``, will set to the output of
        ``read_movielens_df(decrement_ids=False)``
        Note: User and item IDs should start at 1 for this, which is the default for MovieLens 100K
    df_item: DataFrame
        ``u.item`` from MovieLens data. If ``None``, will set to the output of
        ``read_movielens_df_item()``
    movielens_posters_df: DataFrame
        DataFrame containing item_ids from MovieLens data and the poster url. If ``None``, will set
        to the output of ``read_movielens_posters_df()``
    num_user_movies_to_display: int
        Number of movies rated 4 or 5 to display for the user
    num_similar_movies: int
        Number of movies recommendations to display
    filter_films: bool
        Filter already-seen films from recommendations
    shuffle: bool
        Shuffle user-loved movie order
    detailed: bool
        Of the top N unfiltered recommendations, displays how many movies the user gave a positive
        and negative rating to
    image_width: int
        Image width for HTML images, applied to both the loved-films and the recommended-films
        poster rows

    Returns
    -------------
    html: str
        HTML string of movies a user loved and the model recommended for a given user, ready for
        displaying

    Raises
    -------------
    AssertionError
        If ``num_similar_movies`` is not greater than ``0``
    ValueError
        If the model returns no predictions to display for ``user_id``

    """
    assert num_similar_movies > 0, 'Number of similar movies returned must be 1 or greater.'

    if df_user is None:
        df_user = read_movielens_df(decrement_ids=False)

    if df_item is None:
        df_item = read_movielens_df_item()

    if movielens_posters_df is None:
        movielens_posters_df = read_movielens_posters_df()

    # Select this user's ratings once; both the poster row and the ``detailed`` summary reuse it
    # rather than re-filtering the full ratings table.
    user_df = df_user.query(f'user_id=={user_id}')
    loved_movies = user_df[user_df['rating'] >= 4]['item_id'].tolist()

    # ``sorted`` returns a new list, so shuffling it below leaves ``loved_movies`` untouched.
    user_liked_movies = sorted(loved_movies)

    if shuffle:
        random.shuffle(user_liked_movies)

    user_liked_movies = user_liked_movies[:num_user_movies_to_display]

    # The model is queried with ``user_id - 1`` because the IDs in ``df_user`` start at 1.
    top_movies = model.get_item_predictions(user_id - 1,
                                            unseen_items_only=filter_films,
                                            sort_values=True)
    top_movies_k = top_movies[:num_similar_movies]

    if len(top_movies_k) == 0:
        if filter_films:
            raise ValueError(f'User {user_id} cannot have rated every movie.')
        else:
            raise ValueError(f'User {user_id} has no top rated films.')

    html = f'<h3>User {user_id}:</h3>'
    html += _get_posters_html(movielens_posters_df, df_item, user_liked_movies,
                              col_description='Some loved films:', image_width=image_width)
    # Shift the prediction index back by one to match the 1-based item IDs of the poster data,
    # and forward ``image_width`` so both poster rows are rendered at the requested size.
    html += _get_posters_html(movielens_posters_df, df_item, top_movies_k.index + 1,
                              image_width=image_width)

    if detailed:
        hated_movies = user_df[user_df['rating'] < 4]['item_id'].tolist()

        # The summary is always computed against unfiltered predictions, regardless of
        # ``filter_films``, so already-rated films can appear in the top ``N``.
        unfiltered_top_movies = model.get_item_predictions(user_id - 1,
                                                           unseen_items_only=False,
                                                           sort_values=True)
        unfiltered_top_movies_k = set(
            (unfiltered_top_movies[:num_similar_movies].index + 1).tolist()
        )

        percent_captured = round(len(set(loved_movies) & unfiltered_top_movies_k)
                                 / num_similar_movies * 100, 3)
        percent_bad = round(len(set(hated_movies) & unfiltered_top_movies_k)
                            / num_similar_movies * 100, 3)

        html += '-----'
        html += f'<p style="margin:0">User {user_id} has rated <strong>{len(loved_movies)}'
        html += '</strong> films with a 4 or 5</p>'
        html += f'<p style="margin:0">User {user_id} has rated <strong>{len(hated_movies)}'
        html += '</strong> films with a 1, 2, or 3</p>'
        html += '<p style="margin:0">% of these films rated 5 or 4 appearing in the '
        html += f'first {num_similar_movies} recommendations:'
        html += f'<strong style="color:green">{percent_captured}%</strong></p>'
        html += '<p style="margin:0">% of these films rated 1, 2, or 3 appearing in the '
        html += f'first {num_similar_movies} recommendations: '
        html += f'<strong style="color:red">{percent_bad}%</strong></p>'

    return html
Esempio n. 2
0
def get_recommendation_visualizations(
    model: collie_recs.model.BasePipeline,
    user_id: int,
    df_user: Optional[pd.DataFrame] = None,
    df_item: Optional[pd.DataFrame] = None,
    movielens_posters_df: Optional[pd.DataFrame] = None,
    num_user_movies_to_display: int = 10,
    num_similar_movies: int = 10,
    filter_films: bool = True,
    shuffle: bool = True,
    detailed: bool = False,
    image_width: int = 500,
) -> str:
    """
    Build an HTML view of MovieLens 100K posters a user loved next to the model's recommendations.

    Parameters
    -------------
    model: collie_recs.model.BasePipeline
        Model whose ``get_item_predictions`` method is called with ``user_id - 1``
    user_id: int
        User ID to retrieve recommendations for
    df_user: DataFrame
        ``u.data`` from MovieLens data, with columns:

        * ``user_id`` (starting at ``1``)

        * ``item_id`` (starting at ``1``)

        * ``rating`` (explicit ratings)

        Defaults to the output of ``read_movielens_df(decrement_ids=False)`` when ``None``.
    df_item: DataFrame
        ``u.item`` from MovieLens data, with columns:

        * ``item_id`` (starting at ``1``)

        * ``movie_title``

        Defaults to the output of ``read_movielens_df_item()`` when ``None``.
    movielens_posters_df: DataFrame
        Poster lookup table for MovieLens items, with columns:

        * ``item_id`` (starting at ``1``)

        * ``url``

        Defaults to the output of ``read_movielens_posters_df()`` when ``None``.
    num_user_movies_to_display: int
        Number of movies rated ``4`` or ``5`` to display for the user
    num_similar_movies: int
        Number of recommended movies to display
    filter_films: bool
        Leave films the user has already interacted with out of the recommendations
    shuffle: bool
        Shuffle the user's liked films before taking the first ``num_user_movies_to_display``
    detailed: bool
        Append a summary of how many of the top ``N`` unfiltered recommendations the user rated
        positively and negatively
    image_width: int
        Image width for HTML images

    Returns
    -------------
    html: str
        HTML string of movies a user loved and the model recommended for a given user, ready for
        displaying

    Raises
    -------------
    AssertionError
        If ``num_similar_movies`` is not greater than ``0``
    ValueError
        If ``df_user`` or ``df_item`` IDs do not start at ``1``, or if the model returns no
        predictions to display for ``user_id``

    """
    assert num_similar_movies > 0, 'Number of similar movies returned must be 1 or greater.'

    # Fall back to the default loaders for any table the caller did not supply.
    df_user = read_movielens_df(decrement_ids=False) if df_user is None else df_user
    df_item = read_movielens_df_item() if df_item is None else df_item
    movielens_posters_df = (
        read_movielens_posters_df() if movielens_posters_df is None else movielens_posters_df
    )

    # ``and`` short-circuits, so the item ID check only runs once the user ID check has passed.
    if not (df_user['user_id'].min() == 1 and df_user['item_id'].min() == 1):
        raise ValueError(
            'Both user and item IDs must start at ``1`` for MovieLens 100K ``df_user`` data.'
        )
    if not df_item['item_id'].min() == 1:
        raise ValueError(
            'Item IDs must start at ``1`` for MovieLens 100K ``df_item`` data.'
        )

    ratings_for_user = df_user.query(f'user_id=={user_id}')
    liked_item_ids = ratings_for_user.loc[ratings_for_user['rating'] >= 4, 'item_id'].tolist()
    liked_item_ids.sort()

    if shuffle:
        random.shuffle(liked_item_ids)

    liked_item_ids = liked_item_ids[:num_user_movies_to_display]

    # The model is queried with ``user_id - 1`` because the IDs validated above start at 1.
    predictions = model.get_item_predictions(
        user_id - 1,
        unseen_items_only=filter_films,
        sort_values=True,
    )
    top_k_predictions = predictions[:num_similar_movies]

    if len(top_k_predictions) == 0:
        message = (
            f'User {user_id} cannot have rated every movie.'
            if filter_films
            else f'User {user_id} has no top rated films.'
        )
        raise ValueError(message)

    html_parts = [
        f'<h3>User {user_id}:</h3>',
        _get_posters_html(
            movielens_posters_df=movielens_posters_df,
            df_item=df_item,
            item_ids=liked_item_ids,
            col_description='Some loved films:',
            image_width=image_width,
        ),
        # Shift the prediction index by one to get back to the 1-based item IDs.
        _get_posters_html(
            movielens_posters_df=movielens_posters_df,
            df_item=df_item,
            item_ids=(top_k_predictions.index + 1),
            col_description='Recommended films:',
            image_width=image_width,
        ),
    ]

    if detailed:
        positive_rows = df_user.query(f'user_id == {user_id} and (rating >= 4)')
        positively_rated = positive_rows.item_id.tolist()
        negative_rows = df_user.query(f'user_id == {user_id} and (rating < 4)')
        negatively_rated = negative_rows.item_id.tolist()

        # The summary always uses unfiltered predictions, regardless of ``filter_films``.
        unfiltered_predictions = model.get_item_predictions(
            user_id - 1,
            unseen_items_only=False,
            sort_values=True,
        )
        unfiltered_top_k_ids = set(
            (unfiltered_predictions[:num_similar_movies].index + 1).tolist()
        )

        def _percent_in_top_k(item_ids):
            # Share of the top ``num_similar_movies`` slots taken by ``item_ids``, as a percent.
            overlap = set(item_ids) & unfiltered_top_k_ids
            return round(len(overlap) / num_similar_movies * 100, 3)

        percent_captured = _percent_in_top_k(positively_rated)
        percent_bad = _percent_in_top_k(negatively_rated)

        html_parts.extend([
            '-----',
            f'<p style="margin:0">User {user_id} has rated <strong>{len(positively_rated)}',
            '</strong> films with a 4 or 5</p>',
            f'<p style="margin:0">User {user_id} has rated <strong>{len(negatively_rated)}',
            '</strong> films with a 1, 2, or 3</p>',
            '<p style="margin:0">% of these films rated 5 or 4 appearing in the ',
            f'first {num_similar_movies} recommendations:',
            f'<strong style="color:green">{percent_captured}%</strong></p>',
            '<p style="margin:0">% of these films rated 1, 2, or 3 appearing in the ',
            f'first {num_similar_movies} recommendations: ',
            f'<strong style="color:red">{percent_bad}%</strong></p>',
        ])

    return ''.join(html_parts)
Esempio n. 3
0
def movielens_explicit_df():
    """Return the output of ``read_movielens_df`` called with ``decrement_ids=True``."""
    explicit_df = read_movielens_df(decrement_ids=True)
    return explicit_df
Esempio n. 4
0
def movielens_explicit_df_not_decremented():
    """Return the output of ``read_movielens_df`` called with ``decrement_ids=False``."""
    explicit_df = read_movielens_df(decrement_ids=False)
    return explicit_df
Esempio n. 5
0
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Run the MovieLens 100K example end to end: load, split, train, evaluate, and save a model.

    The steps are timed with a ``Timer`` and the AUC, MRR, and MAP@10 scores computed on the test
    split are printed. The fitted model is saved to ``DATA_PATH / 'fitted_model'``.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie_recs/movielens/run.py  --epochs 20

    Parameters
    -------------
    epochs: int
        Maximum number of epochs for model training (passed to the trainer as ``max_epochs``)
    gpus: int
        Number of gpus to train on

    """
    timer = Timer()

    # Step 1: load the ratings with decremented IDs.
    timer.timecheck('  1.0 - retrieving MovieLens 100K dataset')
    ratings_df = read_movielens_df(decrement_ids=True)
    timer.timecheck('  1.0 complete')

    # Step 2: convert to implicit interactions and split into train / validation / test.
    timer.timecheck('  2.0 - splitting data')
    implicit_df = convert_to_implicit(ratings_df)
    interactions = Interactions(
        users=implicit_df['user_id'],
        items=implicit_df['item_id'],
        allow_missing_ids=True,
    )
    train_split, val_split, test_split = stratified_split(interactions, val_p=0.1, test_p=0.1)
    train_loader = InteractionsDataLoader(train_split, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val_split, batch_size=1024, shuffle=False)
    timer.timecheck('  2.0 complete')

    # Step 3: fit a matrix factorization model, stopping early on the validation loss.
    timer.timecheck('  3.0 - training the model')
    model = MatrixFactorizationModel(
        train=train_loader,
        val=val_loader,
        dropout_p=0.05,
        loss='adaptive',
        lr=5e-2,
        embedding_dim=10,
        optimizer='adam',
        weight_decay=1e-7,
    )
    early_stopping = EarlyStopping(monitor='val_loss_epoch', mode='min')
    trainer = CollieTrainer(
        model=model,
        gpus=gpus,
        max_epochs=epochs,
        deterministic=True,
        logger=False,
        checkpoint_callback=False,
        callbacks=[early_stopping],
        weights_summary='full',
        terminate_on_nan=True,
    )
    trainer.fit(model)
    model.eval()
    timer.timecheck('\n  3.0 complete')

    # Step 4: score the held-out test split with ``k=10``.
    timer.timecheck('  4.0 - evaluating model')
    metrics = [auc, mrr, mapk]
    scores = evaluate_in_batches(metrics, test_split, model, k=10)
    auc_score, mrr_score, mapk_score = scores
    print(f'AUC:          {auc_score}')
    print(f'MRR:          {mrr_score}')
    print(f'MAP@10:       {mapk_score}')
    timer.timecheck('  4.0 complete')

    # Step 5: persist the fitted model under the data directory.
    timer.timecheck('  5.0 - saving model')
    model.save_model(DATA_PATH / 'fitted_model')
    timer.timecheck('  5.0 complete')