Example #1
def test_full_df(num_rows, tmpdir, distro):
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    df_files = df_gen.full_df_create(num_rows,
                                     cols,
                                     entries=True,
                                     output=tmpdir)
    test_size = 0
    full_df = cudf.DataFrame()
    for fi in df_files:
        df = cudf.read_parquet(fi)
        test_size = test_size + df.shape[0]
        full_df = cudf.concat([full_df, df])
    assert test_size == num_rows
    conts_rep = cols["conts"]
    cats_rep = cols["cats"]
    labels_rep = cols["labels"]
    assert df.shape[1] == len(conts_rep) + len(cats_rep) + len(labels_rep)
    for idx, cat in enumerate(cats[1:]):
        dist = cats_rep[idx + 1].distro or df_gen.dist
        if not is_string_dtype(full_df[cat]._column):
            sts, ps = dist.verify(full_df[cat].to_pandas())
            assert all(s > 0.9 for s in sts)
        assert full_df[cat].nunique() == cats_rep[idx + 1].cardinality
        assert full_df[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert full_df[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    check_ser = cudf.Series(full_df[cats[0]]._column.elements.values_host)
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
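The tests in this listing rely on a module-level json_sample schema and on pytest parametrization of num_rows and distro that the excerpt does not show. The sketch below is a minimal, illustrative stand-in: the column names, cardinalities, and parameter values are assumptions, not the actual fixture; only the schema keys mirror the tests above.

# Illustrative sketch only; the real module-level fixture may differ.
import numpy as np
import pytest
from nvtabular.tools import data_gen as datagen

json_sample = {
    "conts": {
        "cont_1": {"dtype": np.float32, "min_val": 0, "max_val": 1},
    },
    "cats": {
        # cats[0] is a multi-hot (list) column, which is why the tests check it separately
        "cat_mh": {
            "dtype": None,
            "cardinality": 50,
            "min_entry_size": 1,
            "max_entry_size": 5,
            "multi_min": 2,
            "multi_max": 4,
            "multi_avg": 3,
        },
        "cat_1": {"dtype": None, "cardinality": 50, "min_entry_size": 1, "max_entry_size": 5},
    },
    "labels": {"lab_1": {"dtype": None, "cardinality": 2}},
}

# Assumed parametrization (the real suite may use different values):
# @pytest.mark.parametrize("num_rows", [1000, 10000])
# @pytest.mark.parametrize("distro", [None])
# def test_full_df(num_rows, tmpdir, distro): ...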
Example #2
def test_inspect_datagen(tmpdir, datasets, engine, dist):
    # Dataset
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Dataset columns type config
    columns_dict = {}
    columns_dict["cats"] = ["name-cat", "name-string"
                            ] if engine == "parquet" else ["name-string"]
    columns_dict["conts"] = ["x", "y"]
    columns_dict["labels"] = ["label"]

    # Create inspector and inspect
    output_inspect1 = tmpdir + "/dataset_info1.json"
    dataset = Dataset(paths, engine=engine)
    a = datains.DatasetInspector()
    a.inspect(dataset, columns_dict, output_inspect1)
    assert os.path.isfile(output_inspect1)

    # Generate dataset using data_gen tool
    output_datagen = tmpdir + "/datagen"
    os.mkdir(output_datagen)
    with fsspec.open(output_inspect1) as f:
        output1 = json.load(f)
    cols = datagen._get_cols_from_schema(output1)
    if dist == "uniform":
        df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    else:
        df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1),
                                    gpu_frac=0.00001)

    output_datagen_files = df_gen.full_df_create(output1["num_rows"],
                                                 cols,
                                                 entries=True,
                                                 output=output_datagen)

    # Inspect again and check that the outputs are the same
    output_inspect2 = tmpdir + "/dataset_info2.json"
    dataset = Dataset(output_datagen_files, engine=engine)
    a.inspect(dataset, columns_dict, output_inspect2)
    assert os.path.isfile(output_inspect2)

    # Compare json outputs
    with fsspec.open(output_inspect2) as f:
        output2 = json.load(f)
    for k1 in output1.keys():
        if k1 == "num_rows":
            assert output1[k1] == output2[k1]
        else:
            for k2 in output1[k1].keys():
                for k3 in output1[k1][k2].keys():
                    if k3 == "dtype":
                        if output1[k1][k2][k3] == "object":
                            assert (output1[k1][k2][k3] == output2[k1][k2][k3]
                                    or output2[k1][k2][k3] == "int64")
                        else:
                            assert output1[k1][k2][k3] == output2[k1][k2][k3]
                    else:
                        assert output1[k1][k2][k3] == pytest.approx(
                            output2[k1][k2][k3], rel=1e-0, abs=1e-0)
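Outside of pytest, the same inspect-then-regenerate round trip could look roughly like the sketch below. The paths, column names, and gpu_frac value are assumptions for illustration; the API calls (nvt.Dataset, DatasetInspector.inspect, _get_cols_from_schema, full_df_create) mirror the test above, assuming the usual aliases for nvtabular.tools.data_gen and nvtabular.tools.dataset_inspector.

# Sketch: inspect an existing parquet dataset, then generate synthetic data from its schema.
import glob
import json

import fsspec
import nvtabular as nvt
from nvtabular.tools import data_gen as datagen
from nvtabular.tools import dataset_inspector as datains

columns_dict = {"cats": ["name-string"], "conts": ["x", "y"], "labels": ["label"]}

# Inspect the source data and write its statistics to JSON (paths are hypothetical).
paths = glob.glob("data/*.parquet")
dataset = nvt.Dataset(paths, engine="parquet")
datains.DatasetInspector().inspect(dataset, columns_dict, "dataset_info.json")

# Regenerate a synthetic dataset with the same schema and row count.
with fsspec.open("dataset_info.json") as f:
    schema = json.load(f)
cols = datagen._get_cols_from_schema(schema)
df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1), gpu_frac=0.1)
df_gen.full_df_create(schema["num_rows"], cols, entries=True, output="generated/")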
Example #3
def test_uniform(num_rows, distro):
    cats = list(json_sample["cats"].keys())[1:]
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro())
    df_uni = df_gen.create_df(num_rows, cols)
    sts, ps = df_gen.verify_df(df_uni[cats])
    assert all(s > 0.9 for s in sts)
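For comparison, a power-law variant of the same check might look as follows. It reuses PowerLawDistro(0.1), the exponent used in the inspector test above, and assumes cols, cats, and num_rows are built exactly as in test_uniform; this is a sketch, not a test from the suite.

# Sketch: the same verification with a power-law distribution instead of uniform.
df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
df_pow = df_gen.create_df(num_rows, cols)
sts, ps = df_gen.verify_df(df_pow[cats])
assert all(s > 0.9 for s in sts)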
Example #4
def _get_random_movielens_data(tmpdir, rows, dataset="movie", valid=None):
    if dataset == "movie":
        json_sample_movie = {
            "conts": {},
            "cats": {
                "genres": {
                    "dtype": None,
                    "cardinality": 50,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                    "multi_min": 2,
                    "multi_max": 4,
                    "multi_avg": 3,
                },
                "movieId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
            },
        }
        cols = datagen._get_cols_from_schema(json_sample_movie)
    if dataset == "ratings":
        json_sample_ratings = {
            "conts": {},
            "cats": {
                "movieId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
                "userId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
            },
            "labels": {"rating": {"dtype": None, "cardinality": 5}},
        }
        cols = datagen._get_cols_from_schema(json_sample_ratings)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.1)
    target_path = tmpdir
    df_gen.full_df_create(rows, cols, output=target_path)

    if dataset == "movie":
        movies_converted = cudf.read_parquet(os.path.join(tmpdir, "dataset_0.parquet"))
        movies_converted = movies_converted.drop_duplicates(["movieId"], keep="first")
        movies_converted.to_parquet(os.path.join(tmpdir, "movies_converted.parquet"))

    elif dataset == "ratings" and not valid:
        os.rename(os.path.join(tmpdir, "dataset_0.parquet"), os.path.join(tmpdir, "train.parquet"))
    else:
        os.rename(os.path.join(tmpdir, "dataset_0.parquet"), os.path.join(tmpdir, "valid.parquet"))
Example #5
def test_sparse_tensors(tmpdir, sparse_dense):
    # create small dataset, add values to sparse_list
    json_sample = {
        "conts": {},
        "cats": {
            "spar1": {
                "dtype": None,
                "cardinality": 50,
                "min_entry_size": 1,
                "max_entry_size": 5,
                "multi_min": 2,
                "multi_max": 4,
                "multi_avg": 3,
            },
            "spar2": {
                "dtype": None,
                "cardinality": 50,
                "min_entry_size": 1,
                "max_entry_size": 5,
                "multi_min": 3,
                "multi_max": 5,
                "multi_avg": 4,
            },
            # "": {"dtype": None, "cardinality": 500, "min_entry_size": 1, "max_entry_size": 5},
        },
        "labels": {"rating": {"dtype": None, "cardinality": 2}},
    }
    cols = datagen._get_cols_from_schema(json_sample)
    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.0001)
    target_path = os.path.join(tmpdir, "input/")
    os.mkdir(target_path)
    df_files = df_gen.full_df_create(10000, cols, output=target_path)
    spa_lst = ["spar1", "spar2"]
    spa_mx = {"spar1": 5, "spar2": 6}
    batch_size = 10
    data_itr = tf_dataloader.KerasSequenceLoader(
        df_files,
        cat_names=spa_lst,
        cont_names=[],
        label_names=["rating"],
        batch_size=batch_size,
        buffer_size=0.1,
        sparse_names=spa_lst,
        sparse_max=spa_mx,
        sparse_as_dense=sparse_dense,
    )
    for batch in data_itr:
        feats, labs = batch
        for col in spa_lst:
            feature_tensor = feats[f"{col}"]
            if not sparse_dense:
                assert list(feature_tensor.shape) == [batch_size, spa_mx[col]]
                assert isinstance(feature_tensor, tf.sparse.SparseTensor)
            else:
                assert feature_tensor.shape[1] == spa_mx[col]
                assert not isinstance(feature_tensor, tf.sparse.SparseTensor)
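When sparse_as_dense is False, the multi-hot features come back as tf.sparse.SparseTensor objects. The sketch below shows one way to densify such a feature; it assumes TensorFlow is imported as tf (as the assertions above imply) and reuses data_itr and spa_mx from the test.

# Sketch: densify a sparse multi-hot feature from the KerasSequenceLoader output.
feats, labs = next(iter(data_itr))
spar1 = feats["spar1"]
if isinstance(spar1, tf.sparse.SparseTensor):
    spar1 = tf.sparse.to_dense(spar1)  # dense shape: [batch_size, spa_mx["spar1"]]
print(spar1.shape)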
Example #6
def test_width(num_rows, distro):
    json_sample_1 = {
        "conts": {
            "cont_1": {"dtype": np.float32, "min_val": 0, "max_val": 1, "width": 20},
        }
    }
    json_sample_1["num_rows"] = num_rows
    cols = datagen._get_cols_from_schema(json_sample_1, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro())
    df_uni = df_gen.create_df(num_rows, cols)
    assert df_uni.shape[1] == 20
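The "width" key in the continuous-column spec fans a single definition out into that many generated columns, which is what the shape assertion above checks. A minimal standalone sketch of the same idea with a width of 3 (row count is arbitrary):

# Sketch: a single continuous spec with width=3 yields a 3-column dataframe.
import numpy as np
from nvtabular.tools import data_gen as datagen

schema = {
    "conts": {"cont_1": {"dtype": np.float32, "min_val": 0, "max_val": 1, "width": 3}},
    "num_rows": 100,
}
cols = datagen._get_cols_from_schema(schema)
df = datagen.DatasetGen(datagen.UniformDistro()).create_df(100, cols)
assert df.shape == (100, 3)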
Example #7
def test_cat_rep(num_rows, distro):
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro())
    df_uni = df_gen.create_df(num_rows, cols, entries=True)
    df_cats = df_uni[cats]
    assert df_cats.shape[1] == len(cats)
    assert df_cats.shape[0] == num_rows
    cats_rep = cols["cats"]
    for idx, cat in enumerate(cats[1:]):
        assert df_uni[cat].nunique() == cats_rep[idx + 1].cardinality
        assert df_uni[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert df_uni[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    check_ser = cudf.Series(df_uni[cats[0]]._column.elements.values_host)
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
Example #8
def test_full_df(num_rows, tmpdir, distro):
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    df_files = df_gen.full_df_create(num_rows, cols, entries=True, output=tmpdir)
    test_size = 0
    full_df = _make_df()
    for fi in df_files:
        df = Dataset(fi).to_ddf().compute()
        test_size = test_size + df.shape[0]
        full_df = _concat([full_df, df])
    assert test_size == num_rows
    conts_rep = cols["conts"]
    cats_rep = cols["cats"]
    labels_rep = cols["labels"]
    assert df.shape[1] == len(conts_rep) + len(cats_rep) + len(labels_rep)
    for idx, cat in enumerate(cats[1:]):
        dist = cats_rep[idx + 1].distro or df_gen.dist
        if HAS_GPU:
            if not _is_string_dtype(full_df[cat]._column):
                sts, ps = dist.verify(full_df[cat].to_pandas())
                assert all(s > 0.9 for s in sts)
        else:
            if not _is_string_dtype(full_df[cat]):
                sts, ps = dist.verify(full_df[cat])
                assert all(s > 0.9 for s in sts)
        # these are not multi-hot series
        assert full_df[cat].nunique() == cats_rep[idx + 1].cardinality
        assert full_df[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert full_df[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    # check the multi-hot list column (cats[0]) separately
    if HAS_GPU:
        check_ser = _make_df(list(full_df[cats[0]]._column.elements.values_host))[0]
    else:
        check_ser = _pull_apart_list(full_df[cats[0]])[0]
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
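The _make_df, _concat, _is_string_dtype, and _pull_apart_list helpers, together with the HAS_GPU flag, are dispatch utilities that let the same test run on either cuDF or pandas. A plausible import block is sketched below; the exact module path is an assumption and depends on the NVTabular version (newer releases moved these helpers into merlin.core.dispatch).

# Assumed imports for the dispatch helpers used above; adjust to the installed version.
from nvtabular.dispatch import (
    HAS_GPU,
    _concat,
    _is_string_dtype,
    _make_df,
    _pull_apart_list,
)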
Example #9
def test_cat_rep(num_rows, distro):
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro())
    df_uni = df_gen.create_df(num_rows, cols, entries=True)
    df_cats = df_uni[cats]
    assert df_cats.shape[1] == len(cats)
    assert df_cats.shape[0] == num_rows
    cats_rep = cols["cats"]
    for idx, cat in enumerate(cats[1:]):
        assert df_uni[cat].nunique() == cats_rep[idx + 1].cardinality
        assert df_uni[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert df_uni[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    if HAS_GPU:
        check_ser = _make_df(list(df_uni[cats[0]]._column.elements.values_host))[0]
    else:
        check_ser = df_uni[cats[0]]
    if isinstance(check_ser[0], (list, np.ndarray)):
        check_ser = _pull_apart_list(check_ser)[0]
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
Example #10
def test_horovod_multigpu(tmpdir):
    json_sample = {
        "conts": {},
        "cats": {
            "genres": {
                "dtype": None,
                "cardinality": 50,
                "min_entry_size": 1,
                "max_entry_size": 5,
                "multi_min": 2,
                "multi_max": 4,
                "multi_avg": 3,
            },
            "movieId": {
                "dtype": None,
                "cardinality": 500,
                "min_entry_size": 1,
                "max_entry_size": 5,
            },
            "userId": {
                "dtype": None,
                "cardinality": 500,
                "min_entry_size": 1,
                "max_entry_size": 5
            },
        },
        "labels": {
            "rating": {
                "dtype": None,
                "cardinality": 2
            }
        },
    }
    cols = datagen._get_cols_from_schema(json_sample)
    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.0001)
    target_path = os.path.join(tmpdir, "input/")
    os.mkdir(target_path)
    df_files = df_gen.full_df_create(10000, cols, output=target_path)
    # process them
    cat_features = nvt.ColumnGroup(["userId", "movieId", "genres"]) >> nvt.ops.Categorify()
    ratings = nvt.ColumnGroup(["rating"]) >> (lambda col: (col > 3).astype("int8"))
    output = cat_features + ratings
    proc = nvt.Workflow(output)
    train_iter = nvt.Dataset(df_files, part_size="10MB")
    proc.fit(train_iter)
    target_path_train = os.path.join(tmpdir, "train/")
    os.mkdir(target_path_train)
    proc.transform(train_iter).to_parquet(output_path=target_path_train,
                                          out_files_per_proc=5)
    # add new location
    target_path = os.path.join(tmpdir, "workflow/")
    os.mkdir(target_path)
    proc.save(target_path)
    curr_path = os.path.abspath(__file__)
    repo_root = os.path.relpath(
        os.path.normpath(os.path.join(curr_path, "../../..")))
    hvd_wrap_path = os.path.join(
        repo_root, "examples/multi-gpu-movielens/hvd_wrapper.sh")
    hvd_exam_path = os.path.join(repo_root,
                                 "examples/multi-gpu-movielens/tf_trainer.py")
    process = subprocess.Popen(
        [
            "horovodrun",
            "-np",
            "2",
            "-H",
            "localhost:2",
            "sh",
            hvd_wrap_path,
            "python",
            hvd_exam_path,
            "--dir_in",
            f"{tmpdir}",
            "--batch_size",
            "1024",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # communicate() waits for the process and drains the pipes, avoiding a deadlock
    # that wait() can cause when stdout/stderr are redirected to PIPE.
    stdout, stderr = process.communicate()
    print(stdout, stderr)
    assert "Loss:" in str(stdout)