Exemple #1
0
def python_dataset_ncf(test_specs_ncf):
    """Build a seeded random ratings frame and split it chronologically.

    Args:
        test_specs_ncf (dict): Settings read here: ``"seed"``,
            ``"number_of_rows"`` and ``"ratio"``.

    Returns:
        tuple: The ``(train, test)`` pair returned by ``python_chrono_split``.
    """
    def random_date_generator(start_date, range_in_days):
        """Return ``range_in_days`` random dates on or after ``start_date``.

        Every date is ``start_date`` plus a day offset drawn independently
        from ``0 .. range_in_days - 1``, so duplicates are possible.

        Reference:
        https://stackoverflow.com/questions/41006182/generate-random-dates-within-a-range-in-numpy
        """
        first_day = np.datetime64(start_date)
        offsets = np.arange(0, range_in_days)
        # One np.random.choice call per date keeps the seeded stream
        # consumed one draw at a time.
        return [
            first_day + np.random.choice(offsets)
            for _ in range(range_in_days)
        ]

    np.random.seed(test_specs_ncf["seed"])
    n_rows = test_specs_ncf["number_of_rows"]

    # The columns are drawn in this exact order; reordering the calls would
    # change the data produced for a given seed.
    users = np.random.randint(1, 100, n_rows)
    items = np.random.randint(1, 100, n_rows)
    ratings = np.random.randint(1, 5, n_rows)
    timestamps = random_date_generator("2018-01-01", n_rows)

    rating = pd.DataFrame({
        DEFAULT_USER_COL: users,
        DEFAULT_ITEM_COL: items,
        DEFAULT_RATING_COL: ratings,
        DEFAULT_TIMESTAMP_COL: timestamps,
    })

    train, test = python_chrono_split(rating, ratio=test_specs_ncf["ratio"])
    return train, test
def python_dataset_ncf(test_specs_ncf):
    """Create seeded random ratings and split them with a randomly picked ratio.

    Args:
        test_specs_ncf (dict): Settings read here: ``"seed"``,
            ``"number_of_rows"`` and ``"ratios"`` (the candidates the split
            ratio is drawn from).

    Returns:
        tuple: The ``(train, test)`` pair returned by ``python_chrono_split``.
    """

    def random_date_generator(start_date, range_in_days):
        """Generate ``range_in_days`` random dates starting from ``start_date``.

        Each date adds an independently drawn offset of
        ``0 .. range_in_days - 1`` days to ``start_date``.

        Reference:
        https://stackoverflow.com/questions/41006182/generate-random-dates-within-a-range-in-numpy
        """
        origin = np.datetime64(start_date)
        candidate_offsets = np.arange(0, range_in_days)
        drawn = []
        remaining = range_in_days
        while remaining > 0:
            drawn.append(origin + np.random.choice(candidate_offsets))
            remaining -= 1
        return drawn

    np.random.seed(test_specs_ncf["seed"])
    row_count = test_specs_ncf["number_of_rows"]

    # Fill the columns one by one; the insertion order fixes both the column
    # order of the frame and the order in which the seeded RNG is consumed.
    columns = {}
    columns[DEFAULT_USER_COL] = np.random.randint(1, 100, row_count)
    columns[DEFAULT_ITEM_COL] = np.random.randint(1, 100, row_count)
    columns[DEFAULT_RATING_COL] = np.random.randint(1, 5, row_count)
    columns[DEFAULT_TIMESTAMP_COL] = random_date_generator("2018-01-01", row_count)
    rating = pd.DataFrame(columns)

    # The ratio is drawn last, after all column data has been generated.
    chosen_ratio = np.random.choice(test_specs_ncf["ratios"])
    train, test = python_chrono_split(rating, ratio=chosen_ratio)

    return train, test
Exemple #3
0
def test_chrono_splitter(test_specs, python_dataset):
    """Exercise ``python_chrono_split`` with one ratio and with a list of ratios.

    For both calls the test checks the relative size of the splits, that each
    split keeps the columns of ``python_dataset``, that the splits cover the
    same set of users, and that, per user, the latest timestamp of a split is
    strictly earlier than the earliest timestamp of the following split.

    Args:
        test_specs (dict): Settings read here: ``"ratio"``, ``"ratios"``,
            ``"number_of_rows"`` and ``"tolerance"``.
        python_dataset: Ratings frame handed to ``python_chrono_split``.
    """
    n_rows = test_specs["number_of_rows"]
    tolerance = test_specs["tolerance"]
    source_columns = set(python_dataset.columns)

    def check_share(part, expected_share):
        """Assert that ``part`` holds roughly ``expected_share`` of all rows."""
        assert len(part) / n_rows == pytest.approx(expected_share, tolerance)

    def users_of(part):
        """Return the distinct users appearing in ``part`` as a set."""
        return set(part[DEFAULT_USER_COL].unique())

    def check_strictly_before(earlier, later, lsuffix, rsuffix):
        """Assert each user's last ``earlier`` time precedes the first ``later`` time."""
        time_cols = [DEFAULT_USER_COL, DEFAULT_TIMESTAMP_COL]
        last_seen = earlier[time_cols].groupby(DEFAULT_USER_COL).max()
        first_seen = later[time_cols].groupby(DEFAULT_USER_COL).min()
        # Both frames carry the same timestamp column name, so the join
        # needs suffixes to tell them apart.
        paired = last_seen.join(first_seen, lsuffix=lsuffix, rsuffix=rsuffix)
        is_before = (
            paired[DEFAULT_TIMESTAMP_COL + lsuffix]
            < paired[DEFAULT_TIMESTAMP_COL + rsuffix]
        )
        assert all(is_before.values)

    # --- Single-split case -------------------------------------------------
    splits = python_chrono_split(
        python_dataset,
        ratio=test_specs["ratio"],
        min_rating=10,
        filter_by="user",
    )

    check_share(splits[0], test_specs["ratio"])
    check_share(splits[1], 1 - test_specs["ratio"])

    for part in splits:
        assert set(part.columns) == source_columns

    # Chrono split is stratified, so both parts must contain the same users.
    assert users_of(splits[0]) == users_of(splits[1])

    # Every test timestamp must be later than every train timestamp per user.
    check_strictly_before(splits[0], splits[1], "_0", "_1")

    # --- Multi-split case --------------------------------------------------
    splits = python_chrono_split(
        python_dataset,
        ratio=test_specs["ratios"],
        min_rating=10,
        filter_by="user",
    )

    assert len(splits) == 3
    for position in range(3):
        check_share(splits[position], test_specs["ratios"][position])

    for part in splits:
        assert set(part.columns) == source_columns

    # Chrono split is stratified, so all three parts share one user list.
    users_train = users_of(splits[0])
    users_test = users_of(splits[1])
    users_val = users_of(splits[2])
    assert users_train == users_test
    assert users_train == users_val

    # Timestamps must increase strictly from one split to the next.
    check_strictly_before(splits[0], splits[1], "_0", "_1")
    check_strictly_before(splits[1], splits[2], "_1", "_2")
def test_chrono_splitter(test_specs, python_dataset):
    """Test the chronological splitter on the Python dataset fixture.

    Runs ``python_chrono_split`` once with a single ratio and once with a
    list of ratios, then checks the relative split sizes, the per-user
    chronological order across consecutive splits and (for the single-ratio
    call) that train and test contain the same users.

    Args:
        test_specs (dict): Settings read here: ``"ratio"``, ``"ratios"``,
            ``"number_of_rows"``, ``"tolerance"`` and ``"user_ids"``.
        python_dataset: Ratings frame handed to ``python_chrono_split``.
    """
    df_rating = python_dataset
    n_rows = test_specs["number_of_rows"]
    tolerance = test_specs["tolerance"]

    def user_timestamps(split, user):
        """Return the timestamps of ``user`` inside ``split``."""
        return split[split[DEFAULT_USER_COL] == user][DEFAULT_TIMESTAMP_COL]

    def is_chronological(earlier, later):
        """Return True when no ``earlier`` timestamp is after a ``later`` one.

        An empty side yields True: with nothing to compare, nothing is out
        of order.
        """
        return all(a <= b for a, b in product(earlier, later))

    splits = python_chrono_split(df_rating,
                                 ratio=test_specs["ratio"],
                                 min_rating=10,
                                 filter_by="user")

    assert len(splits[0]) / n_rows == pytest.approx(
        test_specs["ratio"], tolerance)
    assert len(splits[1]) / n_rows == pytest.approx(
        1 - test_specs["ratio"], tolerance)

    # Test all time stamps in test are later than that in train for all users.
    # This is for single-split case.
    # The individual comparisons are asserted per user: collecting the
    # per-user result lists and calling all() on that outer list would only
    # check that each inner list is non-empty, not that its entries are True.
    for user in test_specs["user_ids"]:
        assert is_chronological(
            user_timestamps(splits[0], user),
            user_timestamps(splits[1], user),
        ), "train/test timestamps out of order for user %r" % (user,)

    # Test if both contains the same user list. This is because chrono split is stratified.
    users_train = splits[0][DEFAULT_USER_COL].unique()
    users_test = splits[1][DEFAULT_USER_COL].unique()

    assert set(users_train) == set(users_test)

    splits = python_chrono_split(df_rating,
                                 ratio=test_specs["ratios"],
                                 min_rating=10,
                                 filter_by="user")

    assert len(splits) == 3
    assert len(splits[0]) / n_rows == pytest.approx(
        test_specs["ratios"][0], tolerance)
    assert len(splits[1]) / n_rows == pytest.approx(
        test_specs["ratios"][1], tolerance)
    assert len(splits[2]) / n_rows == pytest.approx(
        test_specs["ratios"][2], tolerance)

    # Test if timestamps are correctly split. This is for multi-split case.
    # Here the three splits are treated as train, validation and test, in
    # that order.
    for user in test_specs["user_ids"]:
        ts_train = user_timestamps(splits[0], user)
        ts_valid = user_timestamps(splits[1], user)
        ts_test = user_timestamps(splits[2], user)

        assert is_chronological(ts_train, ts_valid), (
            "train/valid timestamps out of order for user %r" % (user,))
        assert is_chronological(ts_valid, ts_test), (
            "valid/test timestamps out of order for user %r" % (user,))
Exemple #5
0
                     index_col=0)[['user_id', 'business_id', 'stars', 'date']]
    df.columns = ["userID", "itemID", "rating", "timestamp"]
elif dataset == "movielens":
    MOVIELENS_DATA_SIZE = '100k'
    df = movielens.load_pandas_df(
        size=MOVIELENS_DATA_SIZE,
        header=["userID", "itemID", "rating", "timestamp"])

# Select MovieLens data size: 100k, 1m, 10m, or 20m
# MOVIELENS_DATA_SIZE = '100k'
# df = movielens.load_pandas_df(
#     size=MOVIELENS_DATA_SIZE,
#     header=["userID", "itemID", "rating", "timestamp"]
# )

train, test = python_chrono_split(df, 0.75)
print("start getting data")
'''
data = NCFDataset(train=train, test=test, seed=SEED)
print("start getting model")
model = NCF (
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[16,8,4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=1,
    seed=SEED
def test_chrono_splitter(test_specs, python_dataset):
    """Validate ``python_chrono_split`` for a two-way and a three-way split.

    Each call is checked for split proportions, preserved columns, identical
    user sets across splits, and strictly increasing per-user timestamps from
    one split to the next.

    Args:
        test_specs (dict): Settings read here: ``"ratio"``, ``"ratios"``,
            ``"number_of_rows"`` and ``"tolerance"``.
        python_dataset: Ratings frame handed to ``python_chrono_split``.
    """
    total_rows = test_specs["number_of_rows"]
    rel_tol = test_specs["tolerance"]
    expected_columns = set(python_dataset.columns)
    user_time_cols = [DEFAULT_USER_COL, DEFAULT_TIMESTAMP_COL]

    # ---------------------------------------------------------------- single
    splits = python_chrono_split(
        python_dataset, ratio=test_specs["ratio"], min_rating=10, filter_by="user"
    )

    single_expectations = (
        (0, test_specs["ratio"]),
        (1, 1 - test_specs["ratio"]),
    )
    for idx, share in single_expectations:
        assert len(splits[idx]) / total_rows == pytest.approx(share, rel_tol)

    for frame in splits:
        assert set(frame.columns) == expected_columns

    # Test if both contains the same user list. This is because chrono split is stratified.
    user_sets = [set(splits[idx][DEFAULT_USER_COL].unique()) for idx in (0, 1)]
    assert user_sets[0] == user_sets[1]

    # Test all time stamps in test are later than that in train for all users.
    # This is for single-split case.
    latest_train = splits[0][user_time_cols].groupby(DEFAULT_USER_COL).max()
    earliest_test = splits[1][user_time_cols].groupby(DEFAULT_USER_COL).min()
    joined = latest_train.join(earliest_test, lsuffix="_0", rsuffix="_1")
    ordered = joined[DEFAULT_TIMESTAMP_COL + "_0"] < joined[DEFAULT_TIMESTAMP_COL + "_1"]
    assert all(ordered.values)

    # ----------------------------------------------------------------- multi
    splits = python_chrono_split(
        python_dataset, ratio=test_specs["ratios"], min_rating=10, filter_by="user"
    )

    assert len(splits) == 3
    for idx in (0, 1, 2):
        assert len(splits[idx]) / total_rows == pytest.approx(
            test_specs["ratios"][idx], rel_tol
        )

    for frame in splits:
        assert set(frame.columns) == expected_columns

    # Test if all splits contain the same user list. This is because chrono split is stratified.
    user_sets = [set(splits[idx][DEFAULT_USER_COL].unique()) for idx in (0, 1, 2)]
    assert user_sets[0] == user_sets[1]
    assert user_sets[0] == user_sets[2]

    # Test if timestamps are correctly split. This is for multi-split case.
    # Each entry pairs an earlier split with the one that follows it, plus
    # the suffixes used to tell their timestamp columns apart after the join.
    consecutive_pairs = (
        (0, "_0", 1, "_1"),
        (1, "_1", 2, "_2"),
    )
    for early_idx, early_suffix, late_idx, late_suffix in consecutive_pairs:
        latest_early = (
            splits[early_idx][user_time_cols].groupby(DEFAULT_USER_COL).max()
        )
        earliest_late = (
            splits[late_idx][user_time_cols].groupby(DEFAULT_USER_COL).min()
        )
        joined = latest_early.join(
            earliest_late, lsuffix=early_suffix, rsuffix=late_suffix
        )
        ordered = (
            joined[DEFAULT_TIMESTAMP_COL + early_suffix]
            < joined[DEFAULT_TIMESTAMP_COL + late_suffix]
        )
        assert all(ordered.values)