import numpy as np
import pytest
from numpy import intersect1d, ones_like

from xanthus.datasets import Dataset, DatasetEncoder
# NOTE: the module paths for 'split' and 'groupby' are assumed from the
# package layout; both functions are used as-is throughout this suite.
from xanthus.datasets.utils import groupby
from xanthus.evaluate import he_sampling, split


def test_he_sampling_correctness(sample_dataframes):
    """
    Each item array returned by 'he_sampling' should contain exactly one
    item from the corresponding user's observed interactions: the held-out
    positive item.
    """
    df, _, _ = sample_dataframes
    df = df.drop_duplicates(subset=["user", "item"])

    encoder = DatasetEncoder()
    encoder.fit(df["user"], df["item"])

    train, test = split(df, n_test=1)

    dataset = Dataset.from_df(df, encoder=encoder, normalize=lambda _: ones_like(_))
    train_dataset = Dataset.from_df(
        train, encoder=encoder, normalize=lambda _: ones_like(_)
    )
    test_dataset = Dataset.from_df(
        test, encoder=encoder, normalize=lambda _: ones_like(_)
    )

    users, items = he_sampling(test_dataset, train_dataset)

    a, b, _ = dataset.to_components()
    all_users, all_items = groupby(a, b)

    for i in range(len(items)):
        # 'items[i]' shares exactly one element with the user's full
        # interaction history ('all_items[i]'): the held-out positive.
        assert len(intersect1d(items[i], all_items[i])) == 1
def test_fit_transform_user_item_succeeds(sample_dataset):
    d = DatasetEncoder()
    r = d.fit_transform(users=sample_dataset[:, 0], items=sample_dataset[:, 1])

    # check the overall shape of the responses matches the inputs.
    assert r["users"].shape == sample_dataset[:, 0].shape
    assert r["items"].shape == sample_dataset[:, 1].shape

    # check the number of unique elements matches.
    assert np.unique(r["users"]).shape == np.unique(sample_dataset[:, 0]).shape
    assert np.unique(r["items"]).shape == np.unique(sample_dataset[:, 1]).shape
def test_partial_fit_item_user_succeeds(sample_dataset):
    users, items = sample_dataset[:, 0], sample_dataset[:, 1]

    # split the dataset in half and fit the encoder incrementally.
    n = int(users.shape[0] / 2)
    a_users, a_items = users[:n], items[:n]
    b_users, b_items = users[n:], items[n:]

    d = DatasetEncoder()
    d.partial_fit(users=a_users, items=a_items)
    d.partial_fit(users=b_users, items=b_items)

    r = d.transform(users=users, items=items)

    assert r["users"].shape == users.shape
    assert r["items"].shape == items.shape
    assert np.unique(r["users"]).shape == np.unique(users).shape
    assert np.unique(r["items"]).shape == np.unique(items).shape
def test_shared_encoder_interactions_shapes(sample_dataframes):
    """
    Datasets built with the same DatasetEncoder should produce interaction
    matrices of the same shape, regardless of which rows each contains.
    """
    interactions, _, _ = sample_dataframes

    encoder = DatasetEncoder()
    encoder.fit(interactions["user"], interactions["item"])

    # split the frame in two; each half sees only a subset of users/items.
    n = int(len(interactions) / 2)
    train_interactions = interactions.iloc[:n]
    test_interactions = interactions.iloc[n:]

    train_dataset = Dataset.from_df(train_interactions, encoder=encoder)
    test_dataset = Dataset.from_df(test_interactions, encoder=encoder)
    dataset = Dataset.from_df(interactions, encoder=encoder)

    assert dataset.interactions.shape == train_dataset.interactions.shape
    assert dataset.interactions.shape == test_dataset.interactions.shape
def test_to_df_fails_with_mismatched_inputs(sample_dataset):
    users, items = sample_dataset[:, 0], sample_dataset[:, 1]

    d = DatasetEncoder()
    d.fit(users=users, items=items)
    encoded = d.transform(users=users, items=items)

    # generate recommendations for only half the users, so the lengths of
    # 'users' and 'recommended' no longer match.
    recommended = [
        np.random.choice(encoded["items"], 10, replace=False)
        for _ in range(int(len(users) / 2))
    ]

    with pytest.raises(ValueError):
        d.to_df(users, recommended)
def test_transform_fails_for_unknown_elements(sample_dataset):
    users, items = sample_dataset[:, 0], sample_dataset[:, 1]

    n = int(users.shape[0] / 4)
    a_users, a_items = users[:n], items[:n]
    b_users, b_items = users[n:], items[n:]

    # fit on the first quarter only, outside the 'raises' block so a failure
    # during setup cannot satisfy the expected exception.
    d = DatasetEncoder()
    d.fit(users=a_users, items=a_items)

    # transforming elements the encoder has never seen should raise.
    with pytest.raises(KeyError):
        d.transform(users=b_users, items=b_items)
def test_fit_reversible_user_item_transform(sample_dataset):
    users, items = sample_dataset[:, 0], sample_dataset[:, 1]

    d = DatasetEncoder()
    d.fit(users=users, items=items)

    # a round trip through transform/inverse_transform should be lossless.
    r = d.inverse_transform(**d.transform(users=users, items=items))

    assert r["users"].shape == users.shape
    assert r["items"].shape == items.shape
    assert np.all(r["users"] == users)
    assert np.all(r["items"] == items)
def test_to_df_succeeds(sample_dataset):
    users, items = sample_dataset[:, 0], sample_dataset[:, 1]

    d = DatasetEncoder()
    d.fit(users=users, items=items)
    encoded = d.transform(users=users, items=items)

    # ten random (encoded) recommendations per user.
    recommended = [
        np.random.choice(encoded["items"], 10, replace=False) for _ in users
    ]

    rdf = d.to_df(encoded["users"], recommended, target_col="user")

    assert np.all(rdf["user"].values == users)
    # each remaining row should decode back to that user's recommendations.
    for i, row in enumerate(rdf.values[:, 1:]):
        assert np.all(row == d.inverse_transform(items=recommended[i])["items"])
def test_inverse_fit_transform_using_metadata(sample_metadata_dataset):
    users = sample_metadata_dataset[:, 0]
    items = sample_metadata_dataset[:, 1]
    user_meta = sample_metadata_dataset[:, 2]
    item_meta = sample_metadata_dataset[:, 3]

    d = DatasetEncoder()
    d.fit(users=users, items=items, user_tags=user_meta, item_tags=item_meta)

    # the round trip should also preserve user and item tag metadata.
    r = d.inverse_transform(
        **d.transform(
            users=users, items=items, user_tags=user_meta, item_tags=item_meta
        )
    )

    assert r["users"].shape == users.shape
    assert r["items"].shape == items.shape
    assert r["user_tags"].shape == user_meta.shape
    assert r["item_tags"].shape == item_meta.shape
    assert np.all(r["users"] == users)
    assert np.all(r["items"] == items)
    assert np.all(r["user_tags"] == user_meta)
    assert np.all(r["item_tags"] == item_meta)
import pandas as pd

from xanthus.models import GeneralizedMatrixFactorizationModel as GMFModel
from xanthus.datasets import Dataset, DatasetEncoder, utils
from xanthus.evaluate import leave_one_out, score, metrics, he_sampling

# load the MovieLens-100k ratings and map each movie ID onto its title.
ratings = pd.read_csv("data/movielens-100k/ratings.csv")
movies = pd.read_csv("data/movielens-100k/movies.csv")

title_mapping = dict(zip(movies["movieId"], movies["title"]))

ratings = ratings.rename(columns={"userId": "user", "movieId": "item"})
ratings.loc[:, "item"] = ratings["item"].apply(lambda _: title_mapping[_])

# treat only ratings above 3.0 as positive (implicit) interactions.
ratings = ratings[ratings["rating"] > 3.0]

train_df, test_df = leave_one_out(ratings)

# fit the encoder on the full frame so train and test share one index space.
encoder = DatasetEncoder()
encoder.fit(ratings["user"], ratings["item"])

train_ds = Dataset.from_df(train_df, normalize=utils.as_implicit, encoder=encoder)
test_ds = Dataset.from_df(test_df, normalize=utils.as_implicit, encoder=encoder)

model = GMFModel(
    fit_params=dict(epochs=10, batch_size=256), n_factors=32, negative_samples=4
)
model.fit(train_ds)
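# The imports above pull in 'he_sampling', 'score' and 'metrics', but the
# script stops at fitting. A minimal sketch of the evaluation step they
# enable follows; the 'model.predict' signature, the 'Dataset.history'
# attribute and the 'metrics.ndcg'/'metrics.hit_ratio' names are assumptions
# inferred from the imports, not confirmed API.
users, items = he_sampling(test_ds, train_ds)
recommended = model.predict(users=users, items=items, n=10)

print("nDCG:", score(metrics.ndcg, test_ds.history, recommended).mean())
print("Hit ratio:", score(metrics.hit_ratio, test_ds.history, recommended).mean())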
def test_fit_user_item_succeeds(sample_dataset):
    d = DatasetEncoder()

    # fit should return the encoder itself, supporting method chaining.
    assert d.fit(users=sample_dataset[:, 0], items=sample_dataset[:, 1]) == d
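# The fixtures used throughout this suite ('sample_dataset',
# 'sample_metadata_dataset' and 'sample_dataframes') are provided elsewhere,
# typically in a conftest.py. The sketch below illustrates the shapes the
# tests assume; the generation logic is a hypothetical stand-in, not the
# project's own fixtures.


@pytest.fixture
def sample_dataset():
    # an (n, 2) array of raw user/item identifier pairs.
    rng = np.random.default_rng(42)
    users = rng.integers(0, 50, size=200).astype(str)
    items = rng.integers(0, 100, size=200).astype(str)
    return np.column_stack([users, items])


@pytest.fixture
def sample_metadata_dataset():
    # an (n, 4) array: user/item pairs plus one tag column for each.
    rng = np.random.default_rng(42)
    users = rng.integers(0, 50, size=200).astype(str)
    items = rng.integers(0, 100, size=200).astype(str)
    user_tags = rng.choice(["a", "b", "c"], size=200)
    item_tags = rng.choice(["x", "y", "z"], size=200)
    return np.column_stack([users, items, user_tags, item_tags])


@pytest.fixture
def sample_dataframes():
    # a 'user'/'item'/'rating' interactions frame, plus two placeholder
    # values matching the 'df, _, _' unpacking used above.
    import pandas as pd

    rng = np.random.default_rng(42)
    frame = pd.DataFrame(
        {
            "user": rng.integers(0, 50, size=500).astype(str),
            "item": rng.integers(0, 100, size=500).astype(str),
            "rating": rng.integers(1, 6, size=500),
        }
    )
    return frame, None, None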