def test_inspect_datagen(tmpdir, datasets, engine, dist):
    """Round-trip check between the dataset inspector and the data generator.

    The test inspects the dataset found under ``datasets[engine]``, feeds the
    resulting JSON report to the data generator, inspects the generated
    files, and finally compares the two inspection reports entry by entry.

    Parameters
    ----------
    tmpdir
        Directory (pytest fixture) that receives the inspection reports and
        the generated dataset.
    datasets
        Mapping from engine name to the location of the source files.
    engine
        Dataset engine; the part before the first ``-`` is used as the file
        extension when globbing for input files.
    dist
        ``"uniform"`` selects ``UniformDistro``; any other value selects
        ``PowerLawDistro(0.1)``.
    """
    # Locate the input files for the requested engine.
    source_dir = str(datasets[engine])
    extension = engine.split("-")[0]
    paths = glob.glob(source_dir + "/*." + extension)

    # Column-type configuration handed to the inspector.
    columns_dict = {
        "cats": ["name-cat", "name-string"] if engine == "parquet" else ["name-string"],
        "conts": ["x", "y"],
        "labels": ["label"],
    }

    # First inspection: run on the original dataset.
    first_report_path = tmpdir + "/dataset_info1.json"
    inspector = datains.DatasetInspector()
    inspector.inspect(Dataset(paths, engine=engine), columns_dict, first_report_path)
    assert os.path.isfile(first_report_path)

    # Generate a new dataset from the first inspection report.
    generated_dir = tmpdir + "/datagen"
    os.mkdir(generated_dir)
    with fsspec.open(first_report_path) as handle:
        first_report = json.load(handle)
    cols = datagen._get_cols_from_schema(first_report)
    distribution = datagen.UniformDistro() if dist == "uniform" else datagen.PowerLawDistro(0.1)
    generator = datagen.DatasetGen(distribution, gpu_frac=0.00001)
    generated_files = generator.full_df_create(
        first_report["num_rows"], cols, entries=True, output=generated_dir
    )

    # Second inspection: run on the generated dataset.
    second_report_path = tmpdir + "/dataset_info2.json"
    inspector.inspect(Dataset(generated_files, engine=engine), columns_dict, second_report_path)
    assert os.path.isfile(second_report_path)

    # Compare the two JSON reports.
    with fsspec.open(second_report_path) as handle:
        second_report = json.load(handle)

    for section, expected_section in first_report.items():
        if section == "num_rows":
            assert expected_section == second_report[section]
            continue

        for column, expected_stats in expected_section.items():
            for stat, expected in expected_stats.items():
                actual = second_report[section][column][stat]
                if stat != "dtype":
                    # Non-dtype entries only have to agree within rel=1 / abs=1.
                    assert expected == pytest.approx(actual, rel=1e-0, abs=1e-0)
                elif expected == "object":
                    # An "object" column may be reported as "int64" after generation.
                    assert expected == actual or actual == "int64"
                else:
                    assert expected == actual
def main(args):
    """Inspect a dataset and write the result to ``args.output_file``.

    Parameters
    ----------
    args
        Object exposing the attributes read below: ``device_limit_frac``,
        ``device_pool_frac``, ``part_mem_frac``, ``config_file``,
        ``data_path``, ``format``, ``devices``, ``protocol`` and
        ``output_file``.
    """
    # Turn the fractional settings into absolute sizes of total device memory.
    total_device_memory = device_mem_size(kind="total")
    memory_limit = int(args.device_limit_frac * total_device_memory)
    pool_size = int(args.device_pool_frac * total_device_memory)
    partition_size = int(args.part_mem_frac * total_device_memory)

    # Load the column configuration (JSON) passed on to the inspector.
    with fsspec.open(args.config_file) as handle:
        columns_config = json.load(handle)

    # Build the dataset to inspect.
    dataset = Dataset(args.data_path, engine=args.format, part_size=partition_size)

    # Run the inspection inside a managed client with the RMM pool set up.
    with managed_client(args.devices, memory_limit, args.protocol) as client:
        setup_rmm_pool(client, pool_size)
        datains.DatasetInspector(client).inspect(dataset, columns_config, args.output_file)