def get_recommendation_visualizations(
    model: collie_recs.model.BasePipeline,
    user_id: int,
    df_user: Optional[pd.DataFrame] = None,
    df_item: Optional[pd.DataFrame] = None,
    movielens_posters_df: Optional[pd.DataFrame] = None,
    num_user_movies_to_display: int = 10,
    num_similar_movies: int = 10,
    filter_films: bool = True,
    shuffle: bool = True,
    detailed: bool = False,
    image_width: int = 500,
) -> str:
    """
    Get visual recommendations through Movielens posters for a given user.

    Parameters
    -------------
    model: collie_recs.model.BasePipeline
        Model queried through ``get_item_predictions``, which is called with ``user_id - 1``
    user_id: int
        User ID to pull recommendations for
    df_user: DataFrame
        ``u.data`` from MovieLens data. The ``user_id``, ``item_id``, and ``rating`` columns are
        read. If ``None``, will set to the output of ``read_movielens_df(decrement_ids=False)``
        Note: User and item IDs should start at 1 for this, which is the default for MovieLens 100K
    df_item: DataFrame
        ``u.item`` from MovieLens data. If ``None``, will set to the output of
        ``read_movielens_df_item()``
    movielens_posters_df: DataFrame
        DataFrame containing item_ids from MovieLens data and the poster url. If ``None``, will set
        to the output of ``read_movielens_posters_df()``
    num_user_movies_to_display: int
        Number of movies rated 4 or 5 to display for the user
    num_similar_movies: int
        Number of movies recommendations to display
    filter_films: bool
        Filter already-seen films from recommendations
    shuffle: bool
        Shuffle user-loved movie order
    detailed: bool
        Of the top N unfiltered recommendations, displays how many movies the user gave a positive
        and negative rating to
    image_width: int
        Image width for HTML images, applied to both the loved and the recommended poster rows

    Returns
    -------------
    html: str
        HTML string of movies a user loved and the model recommended for a given user, ready for
        displaying

    Raises
    -------------
    AssertionError
        If ``num_similar_movies`` is not greater than ``0``
    ValueError
        If the model returns no predictions for the user

    """
    assert num_similar_movies > 0, 'Number of similar movies returned must be 1 or greater.'

    if df_user is None:
        df_user = read_movielens_df(decrement_ids=False)

    if df_item is None:
        df_item = read_movielens_df_item()

    if movielens_posters_df is None:
        movielens_posters_df = read_movielens_posters_df()

    user_df = df_user.query(f'user_id=={user_id}')
    # a rating of 4 or 5 counts as "loved"; anything below counts as "hated"
    user_loved_df = user_df[user_df['rating'] >= 4]
    user_hated_df = user_df[user_df['rating'] < 4]

    user_liked_movies = sorted(user_loved_df['item_id'].tolist())
    if shuffle:
        random.shuffle(user_liked_movies)
    user_liked_movies = user_liked_movies[:num_user_movies_to_display]

    # IDs are shifted by one between the 1-based DataFrames and the model: ``user_id - 1`` goes
    # in, and ``index + 1`` is applied to the predictions coming out
    top_movies = model.get_item_predictions(user_id - 1,
                                            unseen_items_only=filter_films,
                                            sort_values=True)
    top_movies_k = top_movies[:num_similar_movies]

    if len(top_movies_k) == 0:
        if filter_films:
            raise ValueError(f'User {user_id} cannot have rated every movie.')
        else:
            raise ValueError(f'User {user_id} has no top rated films.')

    html = f'<h3>User {user_id}:</h3>'
    html += _get_posters_html(movielens_posters_df,
                              df_item,
                              user_liked_movies,
                              col_description='Some loved films:',
                              image_width=image_width)
    # forward ``image_width`` here too, so the recommended row honors the requested width
    # instead of silently falling back to the helper's default
    html += _get_posters_html(movielens_posters_df,
                              df_item,
                              top_movies_k.index + 1,
                              image_width=image_width)

    if detailed:
        # reuse the rows already selected for this user rather than re-scanning ``df_user``
        loved_movies = user_loved_df['item_id'].tolist()
        hated_movies = user_hated_df['item_id'].tolist()

        unfiltered_top_movies = model.get_item_predictions(user_id - 1,
                                                           unseen_items_only=False,
                                                           sort_values=True)
        unfiltered_top_movies_k = set(
            (unfiltered_top_movies[:num_similar_movies].index + 1).tolist()
        )

        # both percentages are relative to ``num_similar_movies``
        percent_captured = round(
            len(set(loved_movies) & unfiltered_top_movies_k) / num_similar_movies * 100, 3
        )
        percent_bad = round(
            len(set(hated_movies) & unfiltered_top_movies_k) / num_similar_movies * 100, 3
        )

        html += '-----'
        html += f'<p style="margin:0">User {user_id} has rated <strong>{len(loved_movies)}'
        html += '</strong> films with a 4 or 5</p>'
        html += f'<p style="margin:0">User {user_id} has rated <strong>{len(hated_movies)}'
        html += '</strong> films with a 1, 2, or 3</p>'
        html += '<p style="margin:0">% of these films rated 5 or 4 appearing in the '
        html += f'first {num_similar_movies} recommendations:'
        html += f'<strong style="color:green">{percent_captured}%</strong></p>'
        html += '<p style="margin:0">% of these films rated 1, 2, or 3 appearing in the '
        html += f'first {num_similar_movies} recommendations: '
        html += f'<strong style="color:red">{percent_bad}%</strong></p>'

    return html
def get_recommendation_visualizations(
    model: collie_recs.model.BasePipeline,
    user_id: int,
    df_user: Optional[pd.DataFrame] = None,
    df_item: Optional[pd.DataFrame] = None,
    movielens_posters_df: Optional[pd.DataFrame] = None,
    num_user_movies_to_display: int = 10,
    num_similar_movies: int = 10,
    filter_films: bool = True,
    shuffle: bool = True,
    detailed: bool = False,
    image_width: int = 500,
) -> str:
    """
    Render a user's loved films and model recommendations as Movielens 100K poster HTML.

    Parameters
    -------------
    model: collie_recs.model.BasePipeline
        Model whose ``get_item_predictions`` method is called with ``user_id - 1``
    user_id: int
        User ID to retrieve recommendations for
    df_user: DataFrame
        ``u.data`` from MovieLens data. This DataFrame must have columns:

            * ``user_id`` (starting at ``1``)

            * ``item_id`` (starting at ``1``)

            * ``rating`` (explicit ratings)

        If ``None``, will set to the output of ``read_movielens_df(decrement_ids=False)``.
    df_item: DataFrame
        ``u.item`` from MovieLens data. This DataFrame must have columns:

            * ``item_id`` (starting at ``1``)

            * ``movie_title``

        If ``None``, will set to the output of ``read_movielens_df_item()``
    movielens_posters_df: DataFrame
        DataFrame containing item_ids from MovieLens data and the poster url. This DataFrame must
        have columns:

            * ``item_id`` (starting at ``1``)

            * ``url``

        If ``None``, will set to the output of ``read_movielens_posters_df()``
    num_user_movies_to_display: int
        Number of movies rated ``4`` or ``5`` to display for the user
    num_similar_movies: int
        Number of movies recommendations to display
    filter_films: bool
        Filter films out of recommendations if the user has already interacted with them
    shuffle: bool
        Shuffle order of ``num_user_movies_to_display`` films
    detailed: bool
        Of the top ``N`` unfiltered recommendations, display how many movies the user gave a
        positive and negative rating to
    image_width: int
        Image width for HTML images

    Returns
    -------------
    html: str
        HTML string of movies a user loved and the model recommended for a given user, ready for
        displaying

    Raises
    -------------
    AssertionError
        If ``num_similar_movies`` is not greater than ``0``
    ValueError
        If the minimum ``user_id`` or ``item_id`` in ``df_user``, or the minimum ``item_id`` in
        ``df_item``, is not ``1``, or if the model returns no predictions for the user

    """
    assert num_similar_movies > 0, 'Number of similar movies returned must be 1 or greater.'

    # fall back to the MovieLens readers for any DataFrame that was not supplied
    df_user = read_movielens_df(decrement_ids=False) if df_user is None else df_user
    df_item = read_movielens_df_item() if df_item is None else df_item
    movielens_posters_df = (
        read_movielens_posters_df() if movielens_posters_df is None else movielens_posters_df
    )

    if not (df_user['user_id'].min() == 1 and df_user['item_id'].min() == 1):
        raise ValueError(
            'Both user and item IDs must start at ``1`` for MovieLens 100K ``df_user`` data.'
        )
    if not df_item['item_id'].min() == 1:
        raise ValueError(
            'Item IDs must start at ``1`` for MovieLens 100K ``df_item`` data.'
        )

    ratings_for_user = df_user.query(f'user_id=={user_id}')
    liked_item_ids = ratings_for_user[ratings_for_user['rating'] >= 4]['item_id'].tolist()
    liked_item_ids.sort()
    if shuffle:
        random.shuffle(liked_item_ids)
    liked_item_ids = liked_item_ids[:num_user_movies_to_display]

    # the model is addressed with ``user_id - 1``; its prediction index is shifted back by ``+ 1``
    model_user_id = user_id - 1
    ranked_predictions = model.get_item_predictions(model_user_id,
                                                    unseen_items_only=filter_films,
                                                    sort_values=True)
    top_k_predictions = ranked_predictions[:num_similar_movies]

    if len(top_k_predictions) == 0:
        if filter_films:
            raise ValueError(f'User {user_id} cannot have rated every movie.')
        raise ValueError(f'User {user_id} has no top rated films.')

    html_parts = [f'<h3>User {user_id}:</h3>']
    html_parts.append(
        _get_posters_html(movielens_posters_df=movielens_posters_df,
                          df_item=df_item,
                          item_ids=liked_item_ids,
                          col_description='Some loved films:',
                          image_width=image_width)
    )
    html_parts.append(
        _get_posters_html(movielens_posters_df=movielens_posters_df,
                          df_item=df_item,
                          item_ids=(top_k_predictions.index + 1),
                          col_description='Recommended films:',
                          image_width=image_width)
    )

    if detailed:
        loved_item_ids = df_user.query(
            f'user_id == {user_id} and (rating >= 4)'
        )['item_id'].tolist()
        hated_item_ids = df_user.query(
            f'user_id == {user_id} and (rating < 4)'
        )['item_id'].tolist()

        unfiltered_predictions = model.get_item_predictions(model_user_id,
                                                            unseen_items_only=False,
                                                            sort_values=True)
        unfiltered_top_k = set(
            (unfiltered_predictions[:num_similar_movies].index + 1).tolist()
        )

        # both percentages are relative to ``num_similar_movies``
        num_loved_in_top_k = len(unfiltered_top_k.intersection(loved_item_ids))
        num_hated_in_top_k = len(unfiltered_top_k.intersection(hated_item_ids))
        percent_captured = round(num_loved_in_top_k / num_similar_movies * 100, 3)
        percent_bad = round(num_hated_in_top_k / num_similar_movies * 100, 3)

        num_loved = len(loved_item_ids)
        num_hated = len(hated_item_ids)

        html_parts.extend([
            '-----',
            f'<p style="margin:0">User {user_id} has rated <strong>{num_loved}',
            '</strong> films with a 4 or 5</p>',
            f'<p style="margin:0">User {user_id} has rated <strong>{num_hated}',
            '</strong> films with a 1, 2, or 3</p>',
            '<p style="margin:0">% of these films rated 5 or 4 appearing in the ',
            f'first {num_similar_movies} recommendations:',
            f'<strong style="color:green">{percent_captured}%</strong></p>',
            '<p style="margin:0">% of these films rated 1, 2, or 3 appearing in the ',
            f'first {num_similar_movies} recommendations: ',
            f'<strong style="color:red">{percent_bad}%</strong></p>',
        ])

    return ''.join(html_parts)
def movielens_explicit_df():
    """
    Return the output of ``read_movielens_df(decrement_ids=True)``.

    NOTE(review): presumably this shifts user and item IDs to start at ``0`` - confirm against
    ``read_movielens_df``.

    """
    explicit_df = read_movielens_df(decrement_ids=True)
    return explicit_df
def movielens_explicit_df_not_decremented():
    """
    Return the output of ``read_movielens_df(decrement_ids=False)``.

    NOTE(review): presumably this keeps user and item IDs as they appear in the raw data -
    confirm against ``read_movielens_df``.

    """
    explicit_df = read_movielens_df(decrement_ids=False)
    return explicit_df
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Run the MovieLens 100K example: load and split data, then fit, evaluate, and save a model.

    Progress for each of the five stages is reported through ``Timer.timecheck``, and the AUC,
    MRR, and MAP@10 scores on the test split are printed. The fitted model is saved to
    ``DATA_PATH / 'fitted_model'``.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie_recs/movielens/run.py --epochs 20

    Parameters
    -------------
    epochs: int
        Number of epochs for model training
    gpus: int
        Number of gpus to train on

    """
    timer = Timer()

    # stage 1: read the ratings data
    timer.timecheck('  1.0 - retrieving MovieLens 100K dataset')
    ratings_df = read_movielens_df(decrement_ids=True)
    timer.timecheck('  1.0 complete')

    # stage 2: convert to implicit interactions and split 80 / 10 / 10
    timer.timecheck('  2.0 - splitting data')
    implicit_df = convert_to_implicit(ratings_df)
    interactions = Interactions(
        users=implicit_df['user_id'],
        items=implicit_df['item_id'],
        allow_missing_ids=True,
    )
    train, val, test = stratified_split(interactions, val_p=0.1, test_p=0.1)
    train_loader = InteractionsDataLoader(train, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val, batch_size=1024, shuffle=False)
    timer.timecheck('  2.0 complete')

    # stage 3: fit a matrix factorization model
    timer.timecheck('  3.0 - training the model')
    model_hyperparameters = {
        'dropout_p': 0.05,
        'loss': 'adaptive',
        'lr': 5e-2,
        'embedding_dim': 10,
        'optimizer': 'adam',
        'weight_decay': 1e-7,
    }
    model = MatrixFactorizationModel(train=train_loader, val=val_loader, **model_hyperparameters)

    early_stopping = EarlyStopping(monitor='val_loss_epoch', mode='min')
    trainer = CollieTrainer(
        model=model,
        gpus=gpus,
        max_epochs=epochs,
        deterministic=True,
        logger=False,
        checkpoint_callback=False,
        callbacks=[early_stopping],
        weights_summary='full',
        terminate_on_nan=True,
    )
    trainer.fit(model)
    model.eval()
    timer.timecheck('\n  3.0 complete')

    # stage 4: score the model on the held-out test split
    timer.timecheck('  4.0 - evaluating model')
    metrics = [auc, mrr, mapk]
    auc_result, mrr_result, mapk_result = evaluate_in_batches(metrics, test, model, k=10)

    print(f'AUC:    {auc_result}')
    print(f'MRR:    {mrr_result}')
    print(f'MAP@10: {mapk_result}')
    timer.timecheck('  4.0 complete')

    # stage 5: save the fitted model under the data directory
    timer.timecheck('  5.0 - saving model')
    fitted_model_path = DATA_PATH / 'fitted_model'
    model.save_model(fitted_model_path)
    timer.timecheck('  5.0 complete')