def test_load_pandas_df(
    size,
    num_samples,
    num_movies,
    movie_example,
    title_example,
    genres_example,
    year_example,
):
    """Verify that MovieLens data load into a pd.DataFrame as expected.

    Exercises three scenarios: a too-short header, a too-long header combined
    with the title/genres/year feature columns, and the default arguments.
    The first two share a temporary cache directory whose contents are also
    checked.
    """
    with TemporaryDirectory() as tmp_dir:
        # A header shorter than two names must still yield at least two
        # columns (user and item).
        short_header = ["a"]
        df = load_pandas_df(
            size=size, local_cache_path=tmp_dir, header=short_header
        )
        assert len(df) == num_samples
        expected_width = max(len(short_header), 2)
        assert len(df.columns) == expected_width

        # A header longer than four names is expected to emit a warning;
        # feature columns are requested at the same time.
        long_header = ["a", "b", "c", "d", "e"]
        with pytest.warns(Warning):
            df = load_pandas_df(
                size=size,
                local_cache_path=tmp_dir,
                header=long_header,
                title_col="Title",
                genres_col="Genres",
                year_col="Year",
            )
            assert len(df) == num_samples
            # 4 header columns (user, item, rating, timestamp) + 3 features
            assert len(df.columns) == 7
            # only the first 4 header names are used
            assert "e" not in df.columns

            # Two ratings of the same movie must carry identical item
            # features, and those features must match the expected values.
            head = df.loc[df["b"] == movie_example][:2]
            feature_expectations = (
                ("Title", title_example),
                ("Genres", genres_example),
                ("Year", year_example),
            )
            for column, expected in feature_expectations:
                values = head[column].values
                assert values[0] == values[1]
                assert values[0] == expected

        # The raw zip file, the rating file and the item file should be cached
        cached_entries = os.listdir(tmp_dir)
        assert len(cached_entries) == 3

    # Default arguments
    df = load_pandas_df(size)
    assert len(df) == num_samples
    assert len(df.columns) == 4
def test_load_pandas_df(
    size,
    num_samples,
    num_movies,
    movie_example,
    title_example,
    genres_example,
    year_example,
    tmp,
):
    """Verify that MovieLens data load as a pd.DataFrame.

    Uses the ``tmp`` argument as the local cache path and checks the plain
    rating load, the load with title/genres/year feature columns, and the
    load with default arguments.
    """
    # Plain rating load with a three-name header
    column_names = ["a", "b", "c"]
    df = load_pandas_df(size=size, local_cache_path=tmp, header=column_names)
    assert len(df) == num_samples
    assert len(df.columns) == len(column_names)
    # The raw zip file, the rating file and the item file should be cached
    assert len(os.listdir(tmp)) == 3

    # Load with title, genres and release year; the five-name header is
    # expected to emit a warning.
    column_names = ["a", "b", "c", "d", "e"]
    expected_features = {
        "Title": title_example,
        "Genres": genres_example,
        "Year": year_example,
    }
    with pytest.warns(Warning):
        df = load_pandas_df(
            size=size,
            header=column_names,
            local_cache_path=tmp,
            title_col="Title",
            genres_col="Genres",
            year_col="Year",
        )
        assert len(df) == num_samples
        # 4 header columns (user, item, rating, timestamp) + 3 features
        assert len(df.columns) == 7
        # only the first 4 header names are used
        assert "e" not in df.columns

        # Take two records of the same item: their item features must agree
        # with each other and with the expected example values.
        same_item_rows = df.loc[df["b"] == movie_example][:2]
        for feature_name, expected_value in expected_features.items():
            pair = same_item_rows[feature_name].values
            assert pair[0] == pair[1]
            assert pair[0] == expected_value

    # Default arguments: user, item, rating and timestamp
    df = load_pandas_df(size)
    assert len(df) == num_samples
    assert len(df.columns) == 4
 def run(self, meta: dict = None):
     """Load the rating data with ``load_pandas_df`` and return the result.

     The size comes from ``self.data_size``; the title, genres and year
     column arguments come from ``self.include_title``,
     ``self.include_genre`` and ``self.include_year``. ``meta`` is accepted
     but not used here.
     """
     # Passed as the second positional argument (presumably the header
     # names for the rating columns; confirm against load_pandas_df).
     size = self.data_size
     rating_columns = ('UserId', 'ItemId', 'Rating', 'Timestamp')
     return load_pandas_df(
         size,
         rating_columns,
         title_col=self.include_title,
         genres_col=self.include_genre,
         year_col=self.include_year,
     )
Exemple #4
0
def test_lightgcn_component_definition(resource_path):
    """Check that a freshly built LightGCN model defines its components.

    Builds the model from the ``lightgcn.yaml`` config and a stratified
    split of MovieLens 100k, then asserts that the expected attributes
    exist and that the embedding shapes match the data dimensions.
    """
    config_location = (
        "..",
        "..",
        "reco_utils",
        "recommender",
        "deeprec",
        "config",
        "lightgcn.yaml",
    )
    yaml_file = os.path.join(resource_path, *config_location)

    ratings = movielens.load_pandas_df(size="100k")
    train, test = python_stratified_split(ratings, ratio=0.75)
    data = ImplicitCF(train=train, test=test)

    embed_size = 64
    model = LightGCN(prepare_hparams(yaml_file, embed_size=embed_size), data)

    assert model.norm_adj is not None
    assert model.ua_embeddings.shape == [data.n_users, embed_size]
    assert model.ia_embeddings.shape == [data.n_items, embed_size]

    # Remaining components only need to be defined.
    required_attributes = (
        "u_g_embeddings",
        "pos_i_g_embeddings",
        "neg_i_g_embeddings",
        "batch_ratings",
        "loss",
        "opt",
    )
    for attribute in required_attributes:
        assert getattr(model, attribute) is not None
Exemple #5
0
 def load(self) -> None:
     """Load MovieLens ratings into ``self.data``.

     The dataset size is taken from ``self.variation`` and the header is
     built from the configured user, item, score and timestamp column names.
     """
     size = self.variation
     column_names = [
         self.user_col,
         self.item_col,
         self.score_col,
         self.timestamp_col,
     ]
     self.data = movielens.load_pandas_df(size=size, header=column_names)
Exemple #6
0
def test_model_lightgcn(resource_path):
    """Smoke-test LightGCN: evaluate, fit, recommend and export embeddings.

    The model is built from the ``lightgcn.yaml`` config and a stratified
    split of MovieLens 100k, trained for a single epoch, and then asked to
    write its user and item embeddings to disk.

    The embedding files are written into a fresh temporary directory rather
    than into the shared resource tree, so that files left over from an
    earlier run cannot make the non-empty checks pass vacuously and the test
    leaves nothing behind.
    """
    # Local import keeps this block self-contained.
    from tempfile import TemporaryDirectory

    yaml_file = os.path.join(
        resource_path,
        "..",
        "..",
        "reco_utils",
        "recommender",
        "deeprec",
        "config",
        "lightgcn.yaml",
    )

    df = movielens.load_pandas_df(size="100k")
    train, test = python_stratified_split(df, ratio=0.75)

    data = ImplicitCF(train=train, test=test)

    hparams = prepare_hparams(yaml_file, epochs=1)
    model = LightGCN(hparams, data)

    assert model.run_eval() is not None
    model.fit()
    assert model.recommend_k_items(test) is not None

    with TemporaryDirectory() as output_dir:
        user_file = os.path.join(output_dir, "user_embeddings.csv")
        item_file = os.path.join(output_dir, "item_embeddings.csv")
        model.infer_embedding(user_file, item_file)
        # Both files must have been produced by this call and be non-empty.
        assert os.path.getsize(user_file) != 0
        assert os.path.getsize(item_file) != 0
def test_load_pandas_df():
    """Verify that MovieLens data load into a pd.DataFrame.

    Checks the row and column counts for every supported size, rejection of
    invalid arguments, and the handling of headers that are shorter or
    longer than the four rating columns.
    """
    # Each supported size must load the expected number of ratings.
    expected_row_counts = (
        ("100k", 100000),
        ("1m", 1000209),
        ("10m", 10000054),
        ("20m", 20000263),
    )
    for size, row_count in expected_row_counts:
        ratings = movielens.load_pandas_df(size=size)
        assert len(ratings) == row_count
        assert len(ratings.columns) == 4

    # An unknown size and an invalid cache path must both be rejected.
    invalid_arguments = ({"size": "10k"}, {"local_cache_path": "."})
    for kwargs in invalid_arguments:
        with pytest.raises(ValueError):
            movielens.load_pandas_df(**kwargs)

    # A header with fewer than four names limits the loaded columns.
    short_header = ["a", "b", "c"]
    with_header = movielens.load_pandas_df(header=short_header)
    assert len(with_header) == 100000
    assert len(with_header.columns) == len(short_header)

    # A header with more than four names is expected to emit a warning and
    # still produce four columns.
    long_header = ["a", "b", "c", "d", "e"]
    with pytest.warns(Warning):
        with_header = movielens.load_pandas_df(header=long_header)
        assert len(with_header) == 100000
        assert len(with_header.columns) == 4
def test_load_pandas_df(size, num_samples, num_movies, title_example,
                        genres_example):
    """Verify that MovieLens data load into a pd.DataFrame.

    Covers the default rating load, headers of different lengths, the
    optional title and genres columns, and the movie-only load selected
    with ``header=None``.
    """
    ratings = movielens.load_pandas_df(size=size)
    assert len(ratings) == num_samples
    assert len(ratings.columns) == 4

    # Header shorter than the rating columns
    short_header = ["a"]
    ratings = movielens.load_pandas_df(header=short_header)
    assert len(ratings.columns) == len(short_header)

    # Header longer than the rating columns: a warning is expected
    long_header = ["a", "b", "c", "d", "e"]
    with pytest.warns(Warning):
        ratings = movielens.load_pandas_df(header=long_header)
        assert len(ratings.columns) == 4

    # Title load, then genres load. Movie 1 is Toy Story; two of its ratings
    # must carry the same feature value, equal to the expected example.
    feature_cases = (
        ("title_col", "Title", title_example),
        ("genres_col", "Genres", genres_example),
    )
    for keyword, column, expected in feature_cases:
        ratings = movielens.load_pandas_df(size=size, **{keyword: column})
        assert len(ratings.columns) == 5
        movie_rows = ratings.loc[ratings[DEFAULT_ITEM_COL] == 1][:2]
        values = movie_rows[column].values
        assert values[0] == values[1]
        assert values[0] == expected

    # Movie data load (not rating data)
    movies = movielens.load_pandas_df(
        size=size,
        header=None,
        title_col="Title",
        genres_col="Genres",
    )
    assert len(movies) == num_movies
    assert len(movies.columns) == 3
Exemple #9
0
on user history using collaborative filtering. It produces easily explainable
and interpretable recommendations and handles "cold item" and "semi-cold user"
scenarios. The training data schema is:

  <User ID> <Item ID> <Time> [<Event Type>] [<Event Weight>].

Each observation is an interaction between a user and item (e.g., a movie
watched on a streaming site or an item clicked on an e-commerce website).

The MovieLens dataset records movie ratings provided by viewers. The ratings
are treated as the event weights. The smaller of the available datasets is
used, consisting of 100K users.

Press Enter to load the dataset and show the first few observations: """)

# Load the MovieLens ratings with explicit column names.
data = movielens.load_pandas_df(
    size=MOVIELENS, header=['UserId', 'MovieId', 'Rating', 'Timestamp'])

# Convert float precision to 32-bit to reduce memory consumption.
# The column is replaced by direct assignment: with pandas >= 2.0,
# `data.loc[:, 'Rating'] = ...` writes the values into the existing column
# in place and keeps its original dtype, so the downcast would be lost.

data['Rating'] = data['Rating'].astype(np.float32)

# Load the movie title index. The file is '|'-separated with no header row;
# only the first two columns (labels 0 and 1) are kept.

titles = pd.read_table('titles.txt',
                       sep='|',
                       header=None,
                       encoding="ISO-8859-1")
titles = titles.loc[:, 0:1]
titles.columns = ["MovieId", "MovieTitle"]

answer = input()  # Wait for user.
Exemple #10
0
# Build `df` with the columns userID, itemID, rating, timestamp from the
# dataset selected by `dataset`.
if dataset == "ciao":
    data_path = "/cluster/home/it_stu110/data/ciao/ciao_with_rating_timestamp/rating_with_timestamp.mat"
    import scipy.io as scio
    data = scio.loadmat(data_path)
    # Columns of data['rating']: userid, productid, categoryid, rating,
    # helpfulness and time point. Keep user, product, rating and time.
    df = pd.DataFrame(data['rating'][:, [0, 1, 3, 5]],
                      columns=["userID", "itemID", "rating", "timestamp"])
elif dataset == "yelp_ON":
    data_path = "/cluster/home/it_stu110/data/yelp/state/ON_reindex.csv"
    df = pd.read_csv(data_path,
                     index_col=0)[['user_id', 'business_id', 'stars', 'date']]
    df.columns = ["userID", "itemID", "rating", "timestamp"]
elif dataset == "movielens":
    # MovieLens data size: 100k, 1m, 10m, or 20m
    MOVIELENS_DATA_SIZE = '100k'
    df = movielens.load_pandas_df(
        size=MOVIELENS_DATA_SIZE,
        header=["userID", "itemID", "rating", "timestamp"])
else:
    # Fail here with a clear message instead of a NameError on `df` below.
    raise ValueError(
        "Unknown dataset {!r}; expected 'ciao', 'yelp_ON' or "
        "'movielens'".format(dataset))

train, test = python_chrono_split(df, 0.75)
print("start getting data")
'''
data = NCFDataset(train=train, test=test, seed=SEED)
print("start getting model")
model = NCF (
Exemple #11
0
import logging
import numpy as np

from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from reco_utils.recommender.sar.sar_singlenode import SARSingleNode

# top k items to recommend
TOP_K = 10

if __name__ == "__main__":
    data = movielens.load_pandas_df(
        size='100k',
        header=['UserId', 'MovieId', 'Rating', 'Timestamp'],
        title_col='Title')

    # Convert the float precision to 32-bit in order to reduce memory consumption
    data.loc[:, 'Rating'] = data['Rating'].astype(np.float32)

    header = {
        "col_user": "******",
        "col_item": "MovieId",
        "col_rating": "Rating",
        "col_timestamp": "Timestamp",
        "col_prediction": "Prediction",
    }

    train, test = python_stratified_split(data,
                                          ratio=0.75,
                                          col_user=header["col_user"],