Example #1
def test_train_loader(python_dataset_ncf):
    train, test = python_dataset_ncf
    data = Dataset(train=train,
                   test=test,
                   n_neg=N_NEG,
                   n_neg_test=N_NEG_TEST,
                   seed=SEED)

    # collect the set of positive items per user
    positive_pool = {}
    for u in train[DEFAULT_USER_COL].unique():
        positive_pool[u] = set(
            train[train[DEFAULT_USER_COL] == u][DEFAULT_ITEM_COL])

    # without negative sampling
    for batch in data.train_loader(batch_size=BATCH_SIZE, shuffle=False):
        user, item, labels = batch
        # shape
        assert len(user) == BATCH_SIZE
        assert len(item) == BATCH_SIZE
        assert len(labels) == BATCH_SIZE
        assert max(labels) == min(labels)

        # right labels
        for u, i, is_pos in zip(user, item, labels):
            if is_pos:
                assert i in positive_pool[u]
            else:
                assert i not in positive_pool[u]

    # resample: pair each positive training interaction with N_NEG sampled negatives
    data.negative_sampling()
    label_list = []
    for idx, batch in enumerate(data.train_loader(batch_size=1)):
        user, item, labels = batch
        assert len(user) == 1
        assert len(item) == 1
        assert len(labels) == 1

        # right labels
        for u, i, is_pos in zip(user, item, labels):
            if is_pos:
                assert i in positive_pool[u]
            else:
                assert i not in positive_pool[u]

            label_list.append(is_pos)

    # with negative sampling, each positive label is matched by N_NEG negatives
    assert len(label_list) == (N_NEG + 1) * sum(label_list)
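
Both test functions rely on pytest fixtures and module-level constants defined elsewhere in the test suite. A minimal sketch of what the python_dataset_ncf fixture and the constants used in Example #1 could look like (every name and value below is a hypothetical placeholder, not the project's actual conftest):

import pandas as pd
import pytest

# Hypothetical constants; the real test module imports or defines these elsewhere.
DEFAULT_USER_COL = "userID"
DEFAULT_ITEM_COL = "itemID"
DEFAULT_RATING_COL = "rating"
N_NEG = 5          # negatives sampled per positive interaction in the train set
N_NEG_TEST = 10    # negatives sampled per positive interaction in the test set
BATCH_SIZE = 32
SEED = 42

@pytest.fixture(scope="module")
def python_dataset_ncf():
    # Toy positive-only interactions; the real fixture generates larger data
    # sized so that the batch-shape assertions in the test hold.
    train = pd.DataFrame({
        DEFAULT_USER_COL: [1, 1, 2, 2, 3],
        DEFAULT_ITEM_COL: [10, 11, 10, 12, 11],
        DEFAULT_RATING_COL: [1.0] * 5,
    })
    test = pd.DataFrame({
        DEFAULT_USER_COL: [1, 2, 3],
        DEFAULT_ITEM_COL: [12, 11, 10],
        DEFAULT_RATING_COL: [1.0] * 3,
    })
    return train, test

With definitions along these lines, the final assertion in Example #1 reads as a ratio check: after negative_sampling(), each positive row is accompanied by N_NEG sampled negatives, so the loader yields (N_NEG + 1) rows per positive and len(label_list) == (N_NEG + 1) * sum(label_list).
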
Example #2
def test_train_loader(tmp_path, dataset_ncf_files_sorted):
    train_path, _, _ = dataset_ncf_files_sorted
    train = pd.read_csv(train_path)
    users = train[DEFAULT_USER_COL].unique()
    items = train[DEFAULT_ITEM_COL].unique()

    n_neg = 1
    dataset = Dataset(train_path, n_neg=n_neg)
    assert dataset.n_users == len(users)
    assert dataset.n_items == len(items)
    assert set(dataset.user2id.keys()) == set(users)
    assert set(dataset.item2id.keys()) == set(items)
    assert len(set(dataset.user2id.values())) == len(users)
    assert len(set(dataset.item2id.values())) == len(items)

    # check that the number of batches and the total data size are as expected
    # after loading all training data
    full_data_len = train.shape[0] * 2
    batch_size = full_data_len // 10
    expected_batches = full_data_len // batch_size
    train_save_path = os.path.join(tmp_path, "train_full.csv")
    batch_records = []
    for batch in dataset.train_loader(batch_size, shuffle_size=batch_size, yield_id=True, write_to=train_save_path):
        assert type(batch[0][0]) == int
        assert type(batch[1][0]) == int
        assert type(batch[2][0]) == float
        batch_data = {
            DEFAULT_USER_COL: [dataset.id2user[user] for user in batch[0]],
            DEFAULT_ITEM_COL: [dataset.id2item[item] for item in batch[1]],
            DEFAULT_RATING_COL: batch[2]
        }
        batch_records.append(pd.DataFrame(batch_data))
    
    assert len(batch_records) == expected_batches
    train_loader_df = pd.concat(batch_records).reset_index(drop=True)
    assert train_loader_df.shape[0] == expected_batches * batch_size
    assert set(train_loader_df[DEFAULT_USER_COL]) == set(users)
    assert set(train_loader_df[DEFAULT_ITEM_COL]) == set(items)

    # test that data is successfully saved
    assert os.path.exists(train_save_path)
    train_file_data = pd.read_csv(train_save_path)
    assert train_file_data.equals(train_loader_df)
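
The batch bookkeeping in Example #2 follows from n_neg=1: the loader pairs each positive row with one sampled negative, so it yields twice as many rows as the training file, chunked into batches of batch_size. A short sketch of that arithmetic, with an assumed row count:

# Hypothetical training-file size; the real fixture writes its own CSV.
train_rows = 1000
n_neg = 1

# One negative per positive doubles the number of rows the loader yields.
full_data_len = train_rows * (1 + n_neg)        # 2000
batch_size = full_data_len // 10                # 200
expected_batches = full_data_len // batch_size  # 10

# Only full batches are counted here, which is why the test compares the
# concatenated row count against expected_batches * batch_size rather than
# against full_data_len itself.
assert expected_batches * batch_size <= full_data_len
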