Beispiel #1
0
def test_inspect_datagen(tmpdir, datasets, engine, dist):
    """Inspect a dataset, regenerate it from the report, and compare reports.

    The test inspects the fixture dataset for ``engine``, feeds the resulting
    JSON report to the data generator, inspects the generated files with the
    same inspector, and checks that both reports agree (exactly for
    ``num_rows`` and dtypes, approximately for every other statistic).
    """
    # Source files: the extension is the part of ``engine`` before any "-"
    extension = engine.split("-")[0]
    paths = glob.glob(str(datasets[engine]) + "/*." + extension)

    # Column groups handed to the inspector; only parquet has "name-cat"
    if engine == "parquet":
        cat_columns = ["name-cat", "name-string"]
    else:
        cat_columns = ["name-string"]
    columns_dict = {
        "cats": cat_columns,
        "conts": ["x", "y"],
        "labels": ["label"],
    }

    # First inspection pass, on the original files
    first_report_path = tmpdir + "/dataset_info1.json"
    inspector = datains.DatasetInspector()
    inspector.inspect(
        Dataset(paths, engine=engine), columns_dict, first_report_path
    )
    assert os.path.isfile(first_report_path)

    # Build a synthetic dataset from the first report
    output_datagen = tmpdir + "/datagen"
    os.mkdir(output_datagen)
    with fsspec.open(first_report_path) as report_file:
        output1 = json.load(report_file)
    cols = datagen._get_cols_from_schema(output1)
    distro = (
        datagen.UniformDistro()
        if dist == "uniform"
        else datagen.PowerLawDistro(0.1)
    )
    df_gen = datagen.DatasetGen(distro, gpu_frac=0.00001)
    generated_files = df_gen.full_df_create(
        output1["num_rows"], cols, entries=True, output=output_datagen
    )

    # Second inspection pass, on the generated files
    second_report_path = tmpdir + "/dataset_info2.json"
    inspector.inspect(
        Dataset(generated_files, engine=engine),
        columns_dict,
        second_report_path,
    )
    assert os.path.isfile(second_report_path)

    # Walk the first report and check each entry against the second one
    with fsspec.open(second_report_path) as report_file:
        output2 = json.load(report_file)
    for section, section_stats in output1.items():
        if section == "num_rows":
            assert section_stats == output2[section]
            continue
        for column, column_stats in section_stats.items():
            for stat_name, expected in column_stats.items():
                actual = output2[section][column][stat_name]
                if stat_name != "dtype":
                    # Loose comparison: relative and absolute tolerance of 1
                    assert expected == pytest.approx(
                        actual, rel=1e-0, abs=1e-0)
                elif expected == "object":
                    # An "object" column may be reported as "int64" after
                    # regeneration; both are accepted
                    assert expected == actual or actual == "int64"
                else:
                    assert expected == actual
Beispiel #2
0
def main(args):
    """Inspect the dataset described by ``args`` and write the report.

    Reads from ``args``: ``device_limit_frac``, ``device_pool_frac``,
    ``part_mem_frac``, ``config_file``, ``data_path``, ``format``,
    ``devices``, ``protocol`` and ``output_file``.
    """
    # Scale the fractional arguments by the total reported by device_mem_size
    total_mem = device_mem_size(kind="total")
    limit_size = int(args.device_limit_frac * total_mem)
    pool_size = int(args.device_pool_frac * total_mem)
    partition_size = int(args.part_mem_frac * total_mem)

    # Load the JSON column configuration passed on to the inspector
    with fsspec.open(args.config_file) as config_handle:
        columns_config = json.load(config_handle)

    # Dataset to inspect, partitioned according to part_mem_frac
    dataset = Dataset(
        args.data_path, engine=args.format, part_size=partition_size
    )

    # Run the inspection inside the managed client context
    with managed_client(args.devices, limit_size, args.protocol) as client:
        setup_rmm_pool(client, pool_size)
        datains.DatasetInspector(client).inspect(
            dataset, columns_config, args.output_file
        )