Example no. 1
def test_empty_cols(tmpdir, df, dataset, engine):
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    # first with no continuous columns
    no_conts = torch_dataloader.TorchAsyncItr(
        dataset, cats=["id"], conts=[], labels=["label"], batch_size=1
    )
    assert all(conts is None for _, conts, _ in no_conts)

    # and with no categorical columns
    no_cats = torch_dataloader.TorchAsyncItr(dataset, cats=[], conts=["x"], labels=["label"])
    assert all(cats is None for cats, _, _ in no_cats)
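These excerpts rely on pytest fixtures (tmpdir, df, dataset, engine) and module-level imports defined elsewhere in the suite. A minimal, self-contained sketch of the "no continuous columns" check above might look like the following; the column names and the nvtabular.loader.torch import path are assumptions, not part of the original test.

# standalone sketch, not part of the test suite
import nvtabular as nvt
from nvtabular.loader.torch import TorchAsyncItr

df = nvt.dispatch._make_df({"id": [0, 1, 2], "x": [0.1, 0.2, 0.3], "label": [0, 1, 0]})
no_conts = TorchAsyncItr(
    nvt.Dataset(df), cats=["id"], conts=[], labels=["label"], batch_size=1
)
# with an empty conts list, every batch yields conts=None
assert all(conts is None for _, conts, _ in no_conts)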
Example no. 2
def test_sparse_tensors(sparse_dense):
    # create small dataset, add values to sparse_list
    df = nvt.dispatch._make_df({
        "spar1": [[1, 2, 3, 4], [4, 2, 4, 4], [1, 3, 4, 3], [1, 1, 3, 3]],
        "spar2": [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14],
                  [15, 16]],
    })
    spa_lst = ["spar1", "spar2"]
    spa_mx = {"spar1": 5, "spar2": 6}
    batch_size = 2
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df),
        cats=spa_lst,
        conts=[],
        labels=[],
        batch_size=batch_size,
        sparse_names=spa_lst,
        sparse_max=spa_mx,
        sparse_as_dense=sparse_dense,
    )
    for batch in data_itr:
        feats, labs = batch
        for col in spa_lst:
            feature_tensor = feats[col]
            if not sparse_dense:
                assert list(feature_tensor.shape) == [batch_size, spa_mx[col]]
                assert feature_tensor.is_sparse
            else:
                assert feature_tensor.shape[1] == spa_mx[col]
                assert not feature_tensor.is_sparse
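The shape assertions above can be illustrated with plain PyTorch, independent of the loader: each list column is padded out to a fixed width given by sparse_max, either as a dense tensor or as a sparse tensor with the same logical shape. This is an illustration only, not the loader's internal code, and the padding side is illustrative.

import torch

rows = [[1, 2, 3, 4], [4, 2, 4, 4]]  # one batch of "spar1" with batch_size=2
sparse_width = 5                     # sparse_max["spar1"]

# dense layout: shape [batch_size, sparse_width], zero-padded
dense = torch.zeros(len(rows), sparse_width, dtype=torch.int64)
for i, row in enumerate(rows):
    dense[i, : len(row)] = torch.tensor(row)
assert list(dense.shape) == [2, sparse_width] and not dense.is_sparse

# sparse layout: same logical shape, stored as a sparse COO tensor
sparse = dense.to_sparse()
assert sparse.is_sparse and list(sparse.shape) == [2, sparse_width]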
Example no. 3
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    # first with no continuous columns
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMedian()])
    processor.add_feature(ops.Normalize())
    processor.add_feature(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_format=None,
    )
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
Example no. 4
def test_mh_support(tmpdir):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    cats = cat_names >> ops.HashBucket(num_buckets=10)

    processor = nvt.Workflow(cats + label_name)
    df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are hashed the same
    authors = df_out["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name
    )
    idx = 0
    for batch in data_itr:
        idx = idx + 1
        cats, conts, labels = batch
        cats, mh = cats
        # mh maps each multihot column name to a (values, offsets) tuple
        assert len(mh) == len(cat_names)
        assert not cats
    assert idx > 0
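The (values, offsets) pairs referred to above can be unpacked back into per-row lists. A small sketch with hypothetical tensors, assuming offsets holds each row's start position plus a trailing total length (the exact layout in the loader may differ):

import torch

# hypothetical hashed "Authors" column for the 4-row dataframe above
values = torch.tensor([7, 7, 2, 5, 3, 3])
offsets = torch.tensor([0, 1, 3, 5, 6])  # row i spans values[offsets[i]:offsets[i + 1]]

off = offsets.tolist()
rows = [values[off[i]:off[i + 1]].tolist() for i in range(len(off) - 1)]
assert rows == [[7], [7, 2], [5, 3], [3]]
assert rows[0][0] == rows[1][0]  # same hash for 'User_A'
assert rows[2][1] == rows[3][0]  # same hash for 'User_C'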
Example no. 5
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):

    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names:
        features.append(cat_names >> ops.Categorify())

    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.ColumnGroup(label_name))
    if not graph.columns:
        # if we don't have conts/cats/labels we're done
        return

    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
Example no. 6
def test_gpu_dl_break(tmpdir, df, dataset, batch_size, part_mem_fraction,
                      engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train)
        if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0],
                           engine="parquet",
                           part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        device=device,
    )
    len_dl = len(data_itr) - 1

    first_chunk = 0
    idx = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk = len(chunk[0])
        last_chk = len(chunk[0])
        print(last_chk)
        if idx == 1:
            break
        del chunk

    assert idx < len_dl

    first_chunk_2 = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk_2 = len(chunk[0])
        del chunk
    assert idx == len_dl

    assert first_chunk == first_chunk_2
Example no. 7
def test_kill_dl(tmpdir, df, dataset, part_mem_fraction, engine):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data, cats=cat_names, conts=cont_names, labels=["label"]
    )

    results = {}

    for batch_size in [2 ** i for i in range(9, 25, 1)]:
        print("Checking batch size: ", batch_size)
        num_iter = max(10 * 1000 * 1000 // batch_size, 100)  # load ~1e7 samples
        data_itr.batch_size = batch_size
        start = time.time()
        for i, data in enumerate(data_itr):
            if i >= num_iter:
                break
            del data

        stop = time.time()

        throughput = i * batch_size / (stop - start)
        results[batch_size] = throughput
        print(
            "batch size: ",
            batch_size,
            ", throughput: ",
            throughput,
            "items",
            i * batch_size,
            "time",
            stop - start,
        )
Example no. 8
def test_torch_drp_reset(tmpdir, batch_size, drop_last, num_rows):
    df = nvt.dispatch._make_df({
        "cat1": [1] * num_rows,
        "cat2": [2] * num_rows,
        "cat3": [3] * num_rows,
        "label": [0] * num_rows,
        "cont3": [3.0] * num_rows,
        "cont2": [2.0] * num_rows,
        "cont1": [1.0] * num_rows,
    })
    path = os.path.join(tmpdir, "dataset.parquet")
    df.to_parquet(path)
    cat_names = ["cat3", "cat2", "cat1"]
    cont_names = ["cont3", "cont2", "cont1"]
    label_name = ["label"]

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset([path]),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=batch_size,
        drop_last=drop_last,
        device="cpu",
    )

    all_len = len(data_itr) if drop_last else len(data_itr) - 1
    all_rows = 0
    df_cols = df.columns.to_list()
    for idx, chunk in enumerate(data_itr):
        all_rows += len(chunk[0]["cat1"])
        if idx < all_len:
            for col in df_cols:
                if col in chunk[0].keys():
                    if nvt.dispatch.HAS_GPU:
                        assert (chunk[0][col].cpu().numpy() == df[col].values_host).all()
                    else:
                        assert (chunk[0][col].cpu().numpy() == df[col].values).all()

    if drop_last and num_rows % batch_size > 0:
        assert num_rows > all_rows
    else:
        assert num_rows == all_rows
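A quick arithmetic illustration of the drop_last assertion above, with hypothetical numbers (the real values come from the test's parametrization):

# illustrative numbers only
num_rows, batch_size = 100, 32
full_batches = num_rows // batch_size            # 3 complete batches
rows_with_drop_last = full_batches * batch_size  # 96: the 4-row remainder is dropped
rows_without_drop_last = num_rows                # 100: the partial batch is kept
assert rows_with_drop_last < num_rows
assert rows_without_drop_last == num_rows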
Example no. 9
def test_mh_support(tmpdir):
    df = nvt.dispatch._make_df({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                      ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]
    if HAS_GPU:
        cats = cat_names >> ops.HashBucket(num_buckets=10)
    else:
        cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(cats + label_name)
    df_out = processor.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are hashed the same
    if HAS_GPU:
        authors = df_out["Authors"].to_arrow().to_pylist()
    else:
        authors = df_out["Authors"]
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    data_itr = torch_dataloader.TorchAsyncItr(nvt.Dataset(df_out),
                                              cats=cat_names,
                                              conts=cont_names,
                                              labels=label_name)
    idx = 0
    for batch in data_itr:
        idx = idx + 1
        cats_conts, labels = batch
        assert "Reviewers" in cats_conts
        # check it is multihot
        assert isinstance(cats_conts["Reviewers"], tuple)
        # each multihot column is returned as a (values, offsets) tuple
        assert "Authors" in cats_conts
        assert isinstance(cats_conts["Authors"], tuple)
    assert idx > 0
Example no. 10
def test_torch_drp_reset(tmpdir, batch_size, drop_last, num_rows):
    df = cudf.DataFrame(
        {
            "cat1": [1] * num_rows,
            "cat2": [2] * num_rows,
            "cat3": [3] * num_rows,
            "label": [0] * num_rows,
            "cont3": [3.0] * num_rows,
            "cont2": [2.0] * num_rows,
            "cont1": [1.0] * num_rows,
        }
    )
    path = os.path.join(tmpdir, "dataset.parquet")
    df.to_parquet(path)
    cat_names = ["cat3", "cat2", "cat1"]
    cont_names = ["cont3", "cont2", "cont1"]
    label_name = ["label"]

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset([path]),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=batch_size,
        drop_last=drop_last,
    )

    all_len = len(data_itr) if drop_last else len(data_itr) - 1
    all_rows = 0
    for idx, chunk in enumerate(data_itr):
        all_rows += len(chunk[0])
        if idx < all_len:
            for sub in chunk[:2]:
                sub = sub.cpu()
                assert list(sub[:, 0].numpy()) == [1] * batch_size
                assert list(sub[:, 1].numpy()) == [2] * batch_size
                assert list(sub[:, 2].numpy()) == [3] * batch_size

    if drop_last and num_rows % batch_size > 0:
        assert num_rows > all_rows
    else:
        assert num_rows == all_rows
Example no. 11
def test_dataloader_schema(tmpdir, df, dataset, batch_size, engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train)
        if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths, engine="parquet")

    data_loader = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        shuffle=False,
        labels=label_name,
    )

    batch = next(iter(data_loader))
    assert all(name in batch[0] for name in cat_names)
    assert all(name in batch[0] for name in cont_names)

    num_label_cols = batch[1].shape[1] if len(batch[1].shape) > 1 else 1
    assert num_label_cols == len(label_name)
Example no. 12
def test_shuffling():
    num_rows = 10000
    batch_size = 10000

    df = pd.DataFrame({
        "a": np.asarray(range(num_rows)),
        "b": np.asarray([0] * num_rows)
    })

    train_dataset = torch_dataloader.TorchAsyncItr(Dataset(df),
                                                   conts=["a"],
                                                   labels=["b"],
                                                   batch_size=batch_size,
                                                   shuffle=True)

    batch = next(iter(train_dataset))

    first_batch = batch[0]["a"].cpu()
    in_order = torch.arange(0, batch_size)

    assert (first_batch != in_order).any()
    assert (torch.sort(first_batch).values == in_order).all()
Example no. 13
def test_gpu_dl(tmpdir, df, dataset, batch_size, part_mem_fraction, engine,
                devices):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train)
        if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0],
                           engine="parquet",
                           part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        devices=devices,
    )

    columns = mycols_pq
    df_test = cudf.read_parquet(tar_paths[0])[columns]
    df_test.columns = [x for x in range(0, len(columns))]
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        tar_paths[0])
    rows = 0
    # works with the iterator alone; still needs to be exercised inside the torch DataLoader below

    for idx, chunk in enumerate(data_itr):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])
        del chunk
    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert rows == num_rows

    def gen_col(batch):
        batch = batch[0]
        return batch[0], batch[1], batch[2]

    t_dl = torch_dataloader.DLDataLoader(data_itr,
                                         collate_fn=gen_col,
                                         pin_memory=False,
                                         num_workers=0)
    rows = 0
    for idx, chunk in enumerate(t_dl):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])

    if os.path.exists(output_train):
        shutil.rmtree(output_train)
Example no. 14
def test_mh_model_support(tmpdir):
    df = cudf.DataFrame({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                      ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Null User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
        "Cont1": [0.3, 0.4, 0.5, 0.6],
        "Cont2": [0.3, 0.4, 0.5, 0.6],
        "Cat1": ["A", "B", "A", "C"],
    })
    cat_names = ["Cat1", "Null User", "Authors",
                 "Reviewers"]  # , "Engaging User"]
    cont_names = ["Cont1", "Cont2"]
    label_name = ["Post"]
    out_path = os.path.join(tmpdir, "train/")
    os.mkdir(out_path)

    cats = cat_names >> ops.Categorify()
    conts = cont_names >> ops.Normalize()

    processor = nvt.Workflow(cats + conts + label_name)
    df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute()
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=2,
    )
    emb_sizes = nvt.ops.get_embedding_sizes(processor)
    EMBEDDING_DROPOUT_RATE = 0.04
    DROPOUT_RATES = [0.001, 0.01]
    HIDDEN_DIMS = [1000, 500]
    LEARNING_RATE = 0.001
    model = Model(
        embedding_table_shapes=emb_sizes,
        num_continuous=len(cont_names),
        emb_dropout=EMBEDDING_DROPOUT_RATE,
        layer_hidden_dims=HIDDEN_DIMS,
        layer_dropout_rates=DROPOUT_RATES,
    ).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    def rmspe_func(y_pred, y):
        "Return y_pred and y to non-log space and compute RMSPE"
        y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
        pct_var = (y_pred - y) / y
        return (pct_var**2).mean().pow(0.5)

    train_loss, y_pred, y = process_epoch(
        data_itr,
        model,
        train=True,
        optimizer=optimizer,
        # transform=batch_transform,
        amp=False,
    )
    train_rmspe = None
    train_rmspe = rmspe_func(y_pred, y)
    assert train_rmspe is not None
    assert len(y_pred) > 0
    assert len(y) > 0
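As its docstring suggests, rmspe_func above maps predictions back from log1p space before computing the error; a quick numeric check with hypothetical values (plain PyTorch, independent of the model):

import torch

y = torch.log(torch.tensor([10.0, 20.0]) + 1)
y_pred = torch.log(torch.tensor([11.0, 18.0]) + 1)  # +10% and -10% errors

# same steps as rmspe_func
y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
pct_var = (y_pred - y) / y
rmspe = (pct_var ** 2).mean().pow(0.5)
assert abs(rmspe.item() - 0.1) < 1e-4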
Example no. 15
def test_empty_cols(tmpdir, engine, cat_names, mh_names, cont_names,
                    label_name, num_rows):
    json_sample["num_rows"] = num_rows

    cols = datagen._get_cols_from_schema(json_sample)

    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    dataset = df_gen.create_df(num_rows, cols)
    dataset = nvt.Dataset(dataset)
    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names or mh_names:
        features.append(cat_names + mh_names >> ops.Categorify())
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.WorkflowNode(label_name))
    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    if processor.output_node.output_schema.apply_inverse(
            ColumnSelector("lab_1")):
        # if we don't have conts/cats/labels we're done
        return

    data_itr = None

    with pytest.raises(ValueError) as exc_info:
        data_itr = torch_dataloader.TorchAsyncItr(
            nvt.Dataset(df_out),
            cats=cat_names + mh_names,
            conts=cont_names,
            labels=label_name,
            batch_size=2,
        )
    assert "Neither Categorical or Continuous columns were found by the dataloader. " in str(
        exc_info.value)

    if data_itr:
        for nvt_batch in data_itr:
            cats_conts, labels = nvt_batch
            if cat_names:
                assert set(cat_names).issubset(set(list(cats_conts.keys())))
            if cont_names:
                assert set(cont_names).issubset(set(list(cats_conts.keys())))

        if cat_names or cont_names or mh_names:
            emb_sizes = nvt.ops.get_embedding_sizes(processor)

            EMBEDDING_DROPOUT_RATE = 0.04
            DROPOUT_RATES = [0.001, 0.01]
            HIDDEN_DIMS = [1000, 500]
            LEARNING_RATE = 0.001
            model = Model(
                embedding_table_shapes=emb_sizes,
                num_continuous=len(cont_names),
                emb_dropout=EMBEDDING_DROPOUT_RATE,
                layer_hidden_dims=HIDDEN_DIMS,
                layer_dropout_rates=DROPOUT_RATES,
            ).cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

            def rmspe_func(y_pred, y):
                "Return y_pred and y to non-log space and compute RMSPE"
                y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
                pct_var = (y_pred - y) / y
                return (pct_var**2).mean().pow(0.5)

            train_loss, y_pred, y = process_epoch(
                data_itr,
                model,
                train=True,
                optimizer=optimizer,
                amp=False,
            )
            train_rmspe = None
            train_rmspe = rmspe_func(y_pred, y)
            assert train_rmspe is not None
            assert len(y_pred) > 0
            assert len(y) > 0