Ejemplo n.º 1
0
def test_inspect_datagen(tmpdir, datasets, engine, dist):
    """Round-trip check: inspect a dataset, regenerate it, inspect again.

    The files found under ``datasets[engine]`` are inspected into a JSON
    report, a new dataset is generated from that report with ``datagen``,
    and the generated dataset is inspected in turn. The two reports must
    agree: ``num_rows`` and ``dtype`` entries exactly (an ``object`` dtype
    may come back as ``int64``), every other entry within
    ``pytest.approx(..., rel=1, abs=1)``.

    Args:
        tmpdir: directory receiving the JSON reports and generated files.
        datasets: mapping from engine name to the location of its files.
        engine: engine name; the part before the first ``-`` is used as
            the file extension when globbing.
        dist: ``"uniform"`` selects ``UniformDistro``; any other value
            selects ``PowerLawDistro(0.1)``.
    """
    # Input files for the requested engine
    data_dir = str(datasets[engine])
    extension = engine.split("-")[0]
    paths = glob.glob(data_dir + "/*." + extension)

    # Column roles handed to the inspector; "name-cat" is only listed for
    # the parquet engine
    if engine == "parquet":
        cat_columns = ["name-cat", "name-string"]
    else:
        cat_columns = ["name-string"]
    columns_dict = {
        "cats": cat_columns,
        "conts": ["x", "y"],
        "labels": ["label"],
    }

    # First inspection: describe the original dataset
    report_path_1 = tmpdir + "/dataset_info1.json"
    original_dataset = Dataset(paths, engine=engine)
    inspector = datains.DatasetInspector()
    inspector.inspect(original_dataset, columns_dict, report_path_1)
    assert os.path.isfile(report_path_1)

    # Generate a new dataset from the first report using the data_gen tool
    datagen_dir = tmpdir + "/datagen"
    os.mkdir(datagen_dir)
    with fsspec.open(report_path_1) as report_file:
        output1 = json.load(report_file)
    cols = datagen._get_cols_from_schema(output1)
    distribution = (datagen.UniformDistro()
                    if dist == "uniform" else datagen.PowerLawDistro(0.1))
    generator = datagen.DatasetGen(distribution, gpu_frac=0.00001)
    generated_files = generator.full_df_create(
        output1["num_rows"], cols, entries=True, output=datagen_dir)

    # Second inspection: describe the generated dataset
    report_path_2 = tmpdir + "/dataset_info2.json"
    generated_dataset = Dataset(generated_files, engine=engine)
    inspector.inspect(generated_dataset, columns_dict, report_path_2)
    assert os.path.isfile(report_path_2)

    # Compare the two reports entry by entry. Apart from "num_rows" the
    # report is nested three levels deep (presumably
    # group -> column -> statistic; confirm against the inspector output).
    with fsspec.open(report_path_2) as report_file:
        output2 = json.load(report_file)
    for group, expected_group in output1.items():
        if group == "num_rows":
            assert expected_group == output2[group]
            continue
        for column, expected_entries in expected_group.items():
            for field, expected in expected_entries.items():
                actual = output2[group][column][field]
                if field != "dtype":
                    # Deliberately loose tolerance on everything but dtype
                    assert expected == pytest.approx(
                        actual, rel=1e-0, abs=1e-0)
                elif expected == "object":
                    # An object column may be regenerated as int64
                    assert expected == actual or actual == "int64"
                else:
                    assert expected == actual
Ejemplo n.º 2
0
def test_powerlaw(num_rows, distro):
    """Check that generated categorical columns pass ``verify_df``.

    Ten frames of ``num_rows`` rows each are generated from ``json_sample``
    with a ``PowerLawDistro(0.1)`` generator and stacked row-wise. The
    selected ``cats`` columns are handed to ``DatasetGen.verify_df`` and
    every value in its first result must exceed 0.9.

    Args:
        num_rows: number of rows requested for each generated frame.
        distro: forwarded as ``distros`` to
            ``datagen._get_cols_from_schema``.
    """
    # Every key of json_sample["cats"] except the first one is verified
    cats = list(json_sample["cats"])[1:]

    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    # Generate all frames first and concatenate once, rather than
    # re-copying an ever-growing frame on each iteration
    frames = [df_gen.create_df(num_rows, cols) for _ in range(10)]
    df_pw = cudf.concat(frames, axis=0)
    # verify_df returns two values; only the first is checked here
    sts, _ = df_gen.verify_df(df_pw[cats])
    assert all(s > 0.9 for s in sts)
def test_empty_cols(tmpdir, engine, cat_names, mh_names, cont_names,
                    label_name, num_rows):
    """Run a workflow over generated data and check the dataloader error.

    A frame of ``num_rows`` rows is generated from ``json_sample``, passed
    through a workflow built from whichever of the continuous and
    categorical column lists are non-empty, and the transformed output is
    handed to ``TorchAsyncItr``. Unless the early return below is taken,
    constructing the dataloader must raise ``ValueError`` with the
    "Neither Categorical or Continuous columns" message.

    Args:
        tmpdir: directory in which a ``train/`` sub-directory is created.
        engine: not used in the body.
        cat_names: categorical column names (may be empty).
        mh_names: second list of categorical column names, concatenated
            with ``cat_names`` (may be empty).
        cont_names: continuous column names (may be empty).
        label_name: label column name(s) seeding the workflow graph.
        num_rows: number of rows to generate.
    """
    # json_sample is defined outside this function; this updates it in
    # place before the columns are derived from it
    json_sample["num_rows"] = num_rows

    cols = datagen._get_cols_from_schema(json_sample)

    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    dataset = nvt.Dataset(df_gen.create_df(num_rows, cols))

    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names or mh_names:
        # `+` binds tighter than `>>`; the parentheses only make that explicit
        features.append((cat_names + mh_names) >> ops.Categorify())
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.WorkflowNode(label_name))
    processor = nvt.Workflow(graph)

    # The directory is created but not otherwise used in this test
    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    if processor.output_node.output_schema.apply_inverse(
            ColumnSelector("lab_1")):
        # if we don't have conts/cats/labels we're done
        # NOTE(review): the exact meaning of a truthy apply_inverse result
        # is not visible here - confirm it matches the comment above
        return

    with pytest.raises(ValueError) as exc_info:
        torch_dataloader.TorchAsyncItr(
            nvt.Dataset(df_out),
            cats=cat_names + mh_names,
            conts=cont_names,
            labels=label_name,
            batch_size=2,
        )
    assert "Neither Categorical or Continuous columns were found by the dataloader. " in str(
        exc_info.value)

    # A batch-iteration/model-training section used to follow, guarded by
    # `if data_itr:`. It could never run: the iterator was only assigned
    # inside the pytest.raises block above, so whenever execution reaches
    # this point the constructor has raised and nothing was assigned.
    # That unreachable section has been removed.