コード例 #1
0
def test_test_loader(python_dataset_ncf):
    """Check that test_loader yields leave-one-out batches: exactly one
    positive and N_NEG_TEST negative items per user, with labels that agree
    with the observed train+test interactions."""
    train, test = python_dataset_ncf
    data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST)

    # Positive user-item dict; the pool spans train+test so sampled negatives
    # are guaranteed unseen in either split.
    # NOTE: DataFrame.append was removed in pandas 2.0 — use pd.concat instead.
    df = pd.concat([train, test])
    positive_pool = {
        u: set(df[df[DEFAULT_USER_COL] == u][DEFAULT_ITEM_COL])
        for u in df[DEFAULT_USER_COL].unique()
    }

    for batch in data.test_loader():
        user, item, labels = batch
        # Each batch holds one positive plus n_neg_test negatives.
        assert len(user) == N_NEG_TEST + 1
        assert len(item) == N_NEG_TEST + 1
        assert len(labels) == N_NEG_TEST + 1

        label_list = []

        for u, i, is_pos in zip(user, item, labels):
            # Positives must map to observed interactions; negatives must not.
            if is_pos:
                assert i in positive_pool[u]
            else:
                assert i not in positive_pool[u]

            label_list.append(is_pos)

        # leave-one-out: exactly one positive per batch
        assert sum(label_list) == 1
        # right labels: batch size equals (n_neg_test + 1) per positive
        assert len(label_list) == (N_NEG_TEST + 1) * sum(label_list)
コード例 #2
0
def test_test_loader(dataset_ncf_files_sorted):
    """Verify the file-backed Dataset's test_loader covers every test user
    and yields the expected batch count, batch sizes, and id/rating types."""
    train_path, _, leave_one_out_test_path = dataset_ncf_files_sorted
    leave_one_out_test = pd.read_csv(leave_one_out_test_path)
    test_users = leave_one_out_test[DEFAULT_USER_COL].unique()

    n_neg = 1
    n_neg_test = 1
    dataset = Dataset(train_path, test_file=leave_one_out_test_path, n_neg=n_neg, n_neg_test=n_neg_test)
    assert set(dataset.test_full_datafile.users) == set(test_users)

    # One batch is expected per row of the leave-one-out test set.
    expected_test_batches = leave_one_out_test.shape[0]
    assert max(dataset.test_full_datafile.batch_indices_range) + 1 == expected_test_batches

    frames = []
    for users_batch, items_batch, ratings_batch in dataset.test_loader(yield_id=True):
        # yield_id=True must produce integer ids and float ratings.
        assert type(users_batch[0]) is int
        assert type(items_batch[0]) is int
        assert type(ratings_batch[0]) is float
        frames.append(pd.DataFrame({
            DEFAULT_USER_COL: [dataset.id2user[u] for u in users_batch],
            DEFAULT_ITEM_COL: [dataset.id2item[i] for i in items_batch],
            DEFAULT_RATING_COL: ratings_batch,
        }))

    assert len(frames) == expected_test_batches
    test_loader_df = pd.concat(frames).reset_index(drop=True)
    # Each batch contributes one positive plus n_neg_test negatives.
    assert test_loader_df.shape[0] == expected_test_batches * n_neg_test * 2
    assert set(test_loader_df[DEFAULT_USER_COL]) == set(test_users)
コード例 #3
0
def test_data_preprocessing(python_dataset_ncf):
    """Confirm that Dataset preserves split sizes and that the user/item id
    mappings round-trip between raw values and encoded ids on every row."""
    train, test = python_dataset_ncf
    data = Dataset(train=train,
                   test=test,
                   n_neg=N_NEG,
                   n_neg_test=N_NEG_TEST,
                   seed=SEED)

    # Row counts must be unchanged by preprocessing.
    assert len(data.train) == len(train)
    assert len(data.test) == len(test)

    # user2id/id2user and item2id/id2item must be mutually consistent when
    # comparing the encoded frames against the raw frames row by row.
    for encoded_df, raw_df in ((data.train, train), (data.test, test)):
        for (_, enc), (_, raw) in zip(encoded_df.iterrows(), raw_df.iterrows()):
            assert enc[DEFAULT_USER_COL] == data.user2id[raw[DEFAULT_USER_COL]]
            assert raw[DEFAULT_USER_COL] == data.id2user[enc[DEFAULT_USER_COL]]
            assert enc[DEFAULT_ITEM_COL] == data.item2id[raw[DEFAULT_ITEM_COL]]
            assert raw[DEFAULT_ITEM_COL] == data.id2item[enc[DEFAULT_ITEM_COL]]
コード例 #4
0
def test_fit(python_dataset_ncf, model_type):
    """Smoke-test that NCF trains for one epoch on an in-memory Dataset."""
    train, test = python_dataset_ncf
    ncf_data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST)
    ncf_model = NCF(
        n_users=ncf_data.n_users,
        n_items=ncf_data.n_items,
        model_type=model_type,
        n_epochs=1,
    )
    ncf_model.fit(ncf_data)
コード例 #5
0
def test_train_loader(tmp_path, dataset_ncf_files_sorted):
    """Check train_loader batching, id-mapping coverage, negative-sampling
    output size, and that the generated training data is persisted to disk."""
    train_path, _, _ = dataset_ncf_files_sorted
    train = pd.read_csv(train_path)
    users = train[DEFAULT_USER_COL].unique()
    items = train[DEFAULT_ITEM_COL].unique()

    n_neg = 1
    dataset = Dataset(train_path, n_neg=n_neg)

    # The id mappings must cover every raw user/item exactly once.
    assert dataset.n_users == len(users)
    assert dataset.n_items == len(items)
    assert set(dataset.user2id) == set(users)
    assert set(dataset.item2id) == set(items)
    assert len(set(dataset.user2id.values())) == len(users)
    assert len(set(dataset.item2id.values())) == len(items)

    # With n_neg=1, each positive gains one negative, doubling the row count.
    full_data_len = train.shape[0] * 2
    batch_size = full_data_len // 10
    expected_batches = full_data_len // batch_size
    train_save_path = os.path.join(tmp_path, "train_full.csv")

    frames = []
    loader = dataset.train_loader(batch_size, shuffle_size=batch_size, yield_id=True, write_to=train_save_path)
    for users_batch, items_batch, ratings_batch in loader:
        # yield_id=True must produce integer ids and float ratings.
        assert type(users_batch[0]) is int
        assert type(items_batch[0]) is int
        assert type(ratings_batch[0]) is float
        frames.append(pd.DataFrame({
            DEFAULT_USER_COL: [dataset.id2user[u] for u in users_batch],
            DEFAULT_ITEM_COL: [dataset.id2item[i] for i in items_batch],
            DEFAULT_RATING_COL: ratings_batch,
        }))

    assert len(frames) == expected_batches
    train_loader_df = pd.concat(frames).reset_index(drop=True)
    assert train_loader_df.shape[0] == expected_batches * batch_size
    assert set(train_loader_df[DEFAULT_USER_COL]) == set(users)
    assert set(train_loader_df[DEFAULT_ITEM_COL]) == set(items)

    # The loader must also have written the full training data to disk.
    assert os.path.exists(train_save_path)
    assert pd.read_csv(train_save_path).equals(train_loader_df)
コード例 #6
0
def test_fit(dataset_ncf_files_sorted, model_type):
    """Smoke-test that NCF trains for one epoch on a file-backed Dataset."""
    train_path, test_path, _ = dataset_ncf_files_sorted
    ncf_data = Dataset(
        train_file=train_path,
        test_file=test_path,
        n_neg=N_NEG,
        n_neg_test=N_NEG_TEST,
    )
    ncf_model = NCF(
        n_users=ncf_data.n_users,
        n_items=ncf_data.n_items,
        model_type=model_type,
        n_epochs=1,
    )
    ncf_model.fit(ncf_data)
コード例 #7
0
def test_predict(python_dataset_ncf, model_type):
    """Check NCF.predict on a single (user, item) pair and on lists."""
    train, test = python_dataset_ncf
    data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST)
    model = NCF(
        n_users=data.n_users,
        n_items=data.n_items,
        model_type=model_type,
        n_epochs=1,
    )
    model.fit(data)

    test_users = list(test[DEFAULT_USER_COL])
    test_items = list(test[DEFAULT_ITEM_COL])

    # Scalar prediction returns a plain float.
    assert type(model.predict(test_users[0], test_items[0])) is float

    # List prediction returns one score per (user, item) pair.
    scores = model.predict(test_users, test_items, is_list=True)
    assert type(scores) is list
    assert len(scores) == len(test)
コード例 #8
0
def test_train_loader(python_dataset_ncf):
    """Validate train_loader labels against the observed train interactions,
    both before and after negative sampling."""
    train, test = python_dataset_ncf
    data = Dataset(train=train,
                   test=test,
                   n_neg=N_NEG,
                   n_neg_test=N_NEG_TEST,
                   seed=SEED)

    # Collect the positive user-item interactions from the raw train split.
    positive_pool = {
        u: set(train[train[DEFAULT_USER_COL] == u][DEFAULT_ITEM_COL])
        for u in train[DEFAULT_USER_COL].unique()
    }

    # Without negative sampling every batch is full-sized and homogeneous.
    for users, items, labels in data.train_loader(batch_size=BATCH_SIZE, shuffle=False):
        assert len(users) == BATCH_SIZE
        assert len(items) == BATCH_SIZE
        assert len(labels) == BATCH_SIZE
        assert max(labels) == min(labels)

        # Labels must agree with the observed interactions.
        for u, i, is_pos in zip(users, items, labels):
            if is_pos:
                assert i in positive_pool[u]
            else:
                assert i not in positive_pool[u]

    data.negative_sampling()
    label_list = []
    for users, items, labels in data.train_loader(batch_size=1):
        assert len(users) == 1
        assert len(items) == 1
        assert len(labels) == 1

        # Labels must still agree with the observed interactions.
        for u, i, is_pos in zip(users, items, labels):
            if is_pos:
                assert i in positive_pool[u]
            else:
                assert i not in positive_pool[u]

            label_list.append(is_pos)

    # Negative sampling adds N_NEG negatives per positive.
    assert len(label_list) == (N_NEG + 1) * sum(label_list)
コード例 #9
0
def test_predict(dataset_ncf_files_sorted, model_type):
    """Check NCF.predict (scalar and list modes) with a file-backed Dataset."""
    train_path, test_path, _ = dataset_ncf_files_sorted
    test = pd.read_csv(test_path)
    data = Dataset(
        train_file=train_path,
        test_file=test_path,
        n_neg=N_NEG,
        n_neg_test=N_NEG_TEST,
    )
    model = NCF(
        n_users=data.n_users,
        n_items=data.n_items,
        model_type=model_type,
        n_epochs=1,
    )
    model.fit(data)

    test_users = list(test[DEFAULT_USER_COL])
    test_items = list(test[DEFAULT_ITEM_COL])

    # Scalar prediction returns a plain float.
    assert type(model.predict(test_users[0], test_items[0])) is float

    # List prediction returns one score per (user, item) pair.
    scores = model.predict(test_users, test_items, is_list=True)
    assert type(scores) is list
    assert len(scores) == len(test)