def test_load_pandas_df(
    size,
    num_samples,
    num_movies,
    movie_example,
    title_example,
    genres_example,
    year_example,
):
    """Verify that the MovieLens ratings load into a pd.DataFrame.

    Covers a one-name header, an over-long header combined with the title,
    genres and year columns, the number of files left in the cache
    directory, and a call that passes only the dataset size.
    """
    with TemporaryDirectory() as cache_dir:
        # A single header name: at least 2 columns (user and item) should
        # still be loaded.
        short_header = ["a"]
        ratings = load_pandas_df(
            size=size, local_cache_path=cache_dir, header=short_header
        )
        assert len(ratings) == num_samples
        expected_width = max(len(short_header), 2)
        assert len(ratings.columns) == expected_width

        # Five header names together with title, genres and released year.
        long_header = ["a", "b", "c", "d", "e"]
        with pytest.warns(Warning):
            enriched = load_pandas_df(
                size=size,
                local_cache_path=cache_dir,
                header=long_header,
                title_col="Title",
                genres_col="Genres",
                year_col="Year",
            )
            assert len(enriched) == num_samples
            # 4 header columns (user, item, rating, timestamp) and
            # 3 feature columns.
            assert len(enriched.columns) == 7
            # Only the first 4 header columns are used.
            assert "e" not in enriched.columns

            # Two records of the same item must carry identical features,
            # and those features must match the expected example values.
            same_item = enriched.loc[enriched["b"] == movie_example][:2]
            for column, expected in (
                ("Title", title_example),
                ("Genres", genres_example),
                ("Year", year_example),
            ):
                observed = same_item[column].values
                assert observed[0] == observed[1]
                assert observed[0] == expected

        # The raw zip file, the rating file and the item file are cached.
        assert len(os.listdir(cache_dir)) == 3

    # Only the size is passed; every other argument keeps its default.
    ratings = load_pandas_df(size)
    assert len(ratings) == num_samples
    assert len(ratings.columns) == 4
def test_load_pandas_df(
    size,
    num_samples,
    num_movies,
    movie_example,
    title_example,
    genres_example,
    year_example,
    tmp,
):
    """Verify that the MovieLens ratings load as a pd.DataFrame.

    Covers a three-name header with caching into ``tmp``, an over-long
    header combined with the title, genres and year columns, and a call
    that passes only the dataset size.
    """
    # Three header names: row count and column count must match.
    three_names = ["a", "b", "c"]
    ratings = load_pandas_df(size=size, local_cache_path=tmp, header=three_names)
    assert len(ratings) == num_samples
    assert len(ratings.columns) == len(three_names)

    # The raw zip file, the rating file and the item file are cached.
    cached_entries = os.listdir(tmp)
    assert len(cached_entries) == 3

    # Five header names together with title, genres and released year.
    five_names = ["a", "b", "c", "d", "e"]
    with pytest.warns(Warning):
        enriched = load_pandas_df(
            size=size,
            header=five_names,
            local_cache_path=tmp,
            title_col="Title",
            genres_col="Genres",
            year_col="Year",
        )
        assert len(enriched) == num_samples
        # 4 header columns (user, item, rating, timestamp) and
        # 3 feature columns.
        assert len(enriched.columns) == 7
        # Only the first 4 header columns are used.
        assert "e" not in enriched.columns

        # Two records of the same item must carry identical features,
        # and those features must match the expected example values.
        same_item = enriched.loc[enriched["b"] == movie_example][:2]
        for column, expected in (
            ("Title", title_example),
            ("Genres", genres_example),
            ("Year", year_example),
        ):
            observed = same_item[column].values
            assert observed[0] == observed[1]
            assert observed[0] == expected

    # Only the size is passed; every other argument keeps its default.
    ratings = load_pandas_df(size)
    assert len(ratings) == num_samples
    # user, item, rating and timestamp
    assert len(ratings.columns) == 4
def run(self, meta: dict = None):
    """Load the MovieLens ratings selected by this object's settings.

    Args:
        meta: Accepted for interface compatibility; not read by this method.

    Returns:
        The result of ``load_pandas_df`` for ``self.data_size``, using the
        header ``UserId``/``ItemId``/``Rating``/``Timestamp`` and the
        ``include_title``/``include_genre``/``include_year`` attributes as
        the title, genres and year column arguments.
    """
    rating_header = ('UserId', 'ItemId', 'Rating', 'Timestamp')
    return load_pandas_df(
        self.data_size,
        rating_header,
        title_col=self.include_title,
        genres_col=self.include_genre,
        year_col=self.include_year,
    )
def test_lightgcn_component_definition(resource_path):
    """Build a LightGCN model on MovieLens 100k and check its components.

    The model is only constructed, never trained; the test asserts that the
    adjacency matrix, embeddings, rating tensor, loss and optimizer exist
    and that the user/item embeddings have the configured shape.
    """
    config_parts = (
        "..",
        "..",
        "reco_utils",
        "recommender",
        "deeprec",
        "config",
        "lightgcn.yaml",
    )
    yaml_file = os.path.join(resource_path, *config_parts)

    ratings = movielens.load_pandas_df(size="100k")
    train, test = python_stratified_split(ratings, ratio=0.75)
    data = ImplicitCF(train=train, test=test)

    embed_size = 64
    hparams = prepare_hparams(yaml_file, embed_size=embed_size)
    model = LightGCN(hparams, data)

    assert model.norm_adj is not None
    assert model.ua_embeddings.shape == [data.n_users, embed_size]
    assert model.ia_embeddings.shape == [data.n_items, embed_size]

    # Every remaining component must have been created by the constructor.
    for component in (
        "u_g_embeddings",
        "pos_i_g_embeddings",
        "neg_i_g_embeddings",
        "batch_ratings",
        "loss",
        "opt",
    ):
        assert getattr(model, component) is not None
def load(self) -> None:
    """Load the MovieLens ratings into ``self.data``.

    The dataset size is taken from ``self.variation`` and the resulting
    columns are named after ``self.user_col``, ``self.item_col``,
    ``self.score_col`` and ``self.timestamp_col``, in that order.
    """
    loader = movielens.load_pandas_df
    dataset_size = self.variation
    column_names = [
        self.user_col,
        self.item_col,
        self.score_col,
        self.timestamp_col,
    ]
    self.data = loader(size=dataset_size, header=column_names)
def test_model_lightgcn(resource_path):
    """Run LightGCN end to end on MovieLens 100k for a single epoch.

    Checks that evaluation and top-k recommendation return something and
    that the exported user/item embedding files are not empty.
    """
    data_path = os.path.join(resource_path, "..", "resources", "deeprec", "dkn")
    config_parts = (
        "..",
        "..",
        "reco_utils",
        "recommender",
        "deeprec",
        "config",
        "lightgcn.yaml",
    )
    yaml_file = os.path.join(resource_path, *config_parts)
    user_file = os.path.join(data_path, r"user_embeddings.csv")
    item_file = os.path.join(data_path, r"item_embeddings.csv")

    ratings = movielens.load_pandas_df(size="100k")
    train, test = python_stratified_split(ratings, ratio=0.75)
    data = ImplicitCF(train=train, test=test)

    hparams = prepare_hparams(yaml_file, epochs=1)
    model = LightGCN(hparams, data)

    # Evaluation is exercised before fitting, then recommendation after it.
    assert model.run_eval() is not None
    model.fit()
    assert model.recommend_k_items(test) is not None

    # Export the embeddings and make sure both files received content.
    model.infer_embedding(user_file, item_file)
    for exported in (user_file, item_file):
        assert os.path.getsize(exported) != 0
def test_load_pandas_df():
    """Verify that the MovieLens ratings load into a pd.DataFrame.

    Covers the row counts of every dataset size, rejection of an unknown
    size and of an invalid cache path, and headers that are shorter or
    longer than the four rating columns.
    """
    # Each dataset size must yield its known row count and four columns.
    expected_rows = (
        ("100k", 100000),
        ("1m", 1000209),
        ("10m", 10000054),
        ("20m", 20000263),
    )
    for size_name, row_count in expected_rows:
        ratings = movielens.load_pandas_df(size=size_name)
        assert len(ratings) == row_count
        assert len(ratings.columns) == 4

    # An unknown size argument is rejected.
    with pytest.raises(ValueError):
        movielens.load_pandas_df(size="10k")

    # A wrong cache path argument is rejected.
    with pytest.raises(ValueError):
        movielens.load_pandas_df(local_cache_path=".")

    # A header shorter than four names limits the loaded columns.
    short_header = ["a", "b", "c"]
    renamed = movielens.load_pandas_df(header=short_header)
    assert len(renamed) == 100000
    assert len(renamed.columns) == len(short_header)

    # A header longer than four names triggers a warning and is truncated.
    long_header = ["a", "b", "c", "d", "e"]
    with pytest.warns(Warning):
        renamed = movielens.load_pandas_df(header=long_header)
        assert len(renamed) == 100000
        assert len(renamed.columns) == 4
def test_load_pandas_df(size, num_samples, num_movies, title_example, genres_example):
    """Verify that the MovieLens data load into a pd.DataFrame.

    Covers the plain rating load, headers of different lengths, the title
    and genres feature columns, and the movie-only load (no rating data).
    """
    ratings = movielens.load_pandas_df(size=size)
    assert len(ratings) == num_samples
    assert len(ratings.columns) == 4

    # A one-name header limits the loaded columns to one.
    short_header = ["a"]
    ratings = movielens.load_pandas_df(header=short_header)
    assert len(ratings.columns) == len(short_header)

    # A five-name header triggers a warning and only four columns load.
    long_header = ["a", "b", "c", "d", "e"]
    with pytest.warns(Warning):
        ratings = movielens.load_pandas_df(header=long_header)
        assert len(ratings.columns) == 4

    # Title load, then genres load: each adds a fifth column, and two
    # records of movie 1 (Toy Story) must agree with each other and with
    # the expected example value.
    for keyword, column, expected in (
        ("title_col", "Title", title_example),
        ("genres_col", "Genres", genres_example),
    ):
        ratings = movielens.load_pandas_df(size=size, **{keyword: column})
        assert len(ratings.columns) == 5
        observed = ratings.loc[ratings[DEFAULT_ITEM_COL] == 1][:2][column].values
        assert observed[0] == observed[1]
        assert observed[0] == expected

    # Movie data load (not rating data): one row per movie, three columns.
    movies = movielens.load_pandas_df(
        size=size, header=None, title_col="Title", genres_col="Genres"
    )
    assert len(movies) == num_movies
    assert len(movies.columns) == 3
on user history using collaborative filtering. It produces easily explainable and interpretable recommendations and handles "cold item" and "semi-cold user" scenarios. The training data schema is: <User ID> <Item ID> <Time> [<Event Type>] [<Event Weight>]. Each observation is an interaction between a user and item (e.g., a movie watched on a streaming site or an item clicked on an e-commerce website). The MovieLens dataset records movie ratings provided by viewers. The ratings are treated as the event weights. The smaller of the available datasets is used, consisting of 100K users. Press Enter to load the dataset and show the first few observations: """) data = movielens.load_pandas_df( size=MOVIELENS, header=['UserId', 'MovieId', 'Rating', 'Timestamp']) # Convert float precision to 32-bit to reduce memory consumption. data.loc[:, 'Rating'] = data['Rating'].astype(np.float32) # Load the movie title index. titles = pd.read_table('titles.txt', sep='|', header=None, encoding="ISO-8859-1") titles = titles.loc[:, 0:1] titles.columns = ["MovieId", "MovieTitle"] answer = input() # Wait for user.
if dataset == "ciao": data_path = "/cluster/home/it_stu110/data/ciao/ciao_with_rating_timestamp/rating_with_timestamp.mat" import scipy.io as scio data = scio.loadmat(data_path) # userid, productid, categoryid, rating, helpfulness and time point df = pd.DataFrame(data['rating'][:, [0, 1, 3, 5]], columns=["userID", "itemID", "rating", "timestamp"]) elif dataset == "yelp_ON": data_path = "/cluster/home/it_stu110/data/yelp/state/ON_reindex.csv" df = pd.read_csv(data_path, index_col=0)[['user_id', 'business_id', 'stars', 'date']] df.columns = ["userID", "itemID", "rating", "timestamp"] elif dataset == "movielens": MOVIELENS_DATA_SIZE = '100k' df = movielens.load_pandas_df( size=MOVIELENS_DATA_SIZE, header=["userID", "itemID", "rating", "timestamp"]) # Select MovieLens data size: 100k, 1m, 10m, or 20m # MOVIELENS_DATA_SIZE = '100k' # df = movielens.load_pandas_df( # size=MOVIELENS_DATA_SIZE, # header=["userID", "itemID", "rating", "timestamp"] # ) train, test = python_chrono_split(df, 0.75) print("start getting data") ''' data = NCFDataset(train=train, test=test, seed=SEED) print("start getting model") model = NCF (
import logging import numpy as np from reco_utils.dataset import movielens from reco_utils.dataset.python_splitters import python_stratified_split from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k from reco_utils.recommender.sar.sar_singlenode import SARSingleNode # top k items to recommend TOP_K = 10 if __name__ == "__main__": data = movielens.load_pandas_df( size='100k', header=['UserId', 'MovieId', 'Rating', 'Timestamp'], title_col='Title') # Convert the float precision to 32-bit in order to reduce memory consumption data.loc[:, 'Rating'] = data['Rating'].astype(np.float32) header = { "col_user": "******", "col_item": "MovieId", "col_rating": "Rating", "col_timestamp": "Timestamp", "col_prediction": "Prediction", } train, test = python_stratified_split(data, ratio=0.75, col_user=header["col_user"],