Example #1
def read_parquet_dask(path, columns=None, categories=None, storage_options=None, **kwargs):
    result = dd_read_parquet(
        path,
        columns=columns,
        categories=categories,
        storage_options=storage_options,
        engine="pyarrow",
        **kwargs
    )

    # Import geometry columns, not needed for pyarrow >= 0.16
    metadata = _load_parquet_pandas_metadata(path)
    geom_cols = _get_geometry_columns(metadata)
    if not geom_cols:
        # No geometry columns found, regular DaskDataFrame
        return result

    # Convert the Dask DataFrame into a DaskGeoDataFrame, converting both the
    # partitions and the metadata to GeoDataFrames
    result = result.map_partitions(
        lambda df: GeoDataFrame(_import_geometry_columns(df, geom_cols)),
    )

    result = DaskGeoDataFrame(
        result.dask,
        result._name,
        GeoDataFrame(_import_geometry_columns(result._meta, geom_cols)),
        result.divisions,
    )
    # Load bounding box info from _metadata
    pqds = pq.ParquetDataset(path)
    if b'spatialpandas' in pqds.common_metadata.metadata:
        spatial_metadata = json.loads(
            pqds.common_metadata.metadata[b'spatialpandas'].decode('utf-8')
        )
        if "partition_bounds" in spatial_metadata:
            partition_bounds = {}
            for name in spatial_metadata['partition_bounds']:
                bounds_df = pd.DataFrame(
                    spatial_metadata['partition_bounds'][name]
                )

                # Index labels will be read in as strings.
                # Here we convert to integers, sort by index, then drop index just in
                # case the rows got shuffled on read
                bounds_df = (bounds_df
                             .set_index(bounds_df.index.astype('int'))
                             .sort_index()
                             .reset_index(drop=True))
                bounds_df.index.name = 'partition'

                partition_bounds[name] = bounds_df
            result._partition_bounds = partition_bounds
    return result
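
A minimal usage sketch of the reader above, assuming the module-level names it relies on (dd_read_parquet, GeoDataFrame, DaskGeoDataFrame, pq, json, pd) are imported as in spatialpandas; the parquet path and column names below are hypothetical.

# Hypothetical call: the path and column list are placeholders.
ddf = read_parquet_dask("./data/buildings.parq", columns=["geometry", "height"])

# If the file carries spatialpandas metadata, per-partition bounds are attached
# to the returned DaskGeoDataFrame.
print(ddf.npartitions)
print(getattr(ddf, "_partition_bounds", None))
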
Example #2
def test_dask_preproc_cpu(client, tmpdir, datasets, engine, shuffle, cpu):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_size="1MB", cpu=cpu)
    else:
        dataset = Dataset(paths, names=allcols_csv, part_size="1MB", cpu=cpu)

    # Simple transform (normalize)
    cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]
    conts = cont_names >> ops.FillMissing() >> ops.Normalize()
    workflow = Workflow(conts + cat_names + label_name, client=client)
    transformed = workflow.fit_transform(dataset)

    # Write out dataset
    output_path = os.path.join(tmpdir, "processed")
    transformed.to_parquet(output_path=output_path,
                           shuffle=shuffle,
                           out_files_per_proc=4)

    # Check the final result
    df_disk = dd_read_parquet(output_path, engine="pyarrow").compute()
    assert_eq(
        df0.sort_values(["id", "x"])[["name-string", "label"]],
        df_disk.sort_values(["id", "x"])[["name-string", "label"]],
        check_index=False,
    )
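
For context, a standalone sketch of the fit_transform -> to_parquet flow this test exercises, without the pytest fixtures; the glob pattern, output path, and column names are placeholders, and nvtabular is assumed to be installed.

import nvtabular as nvt
from nvtabular import ops

# Placeholder schema matching the test above.
cat_names = ["name-string"]
cont_names = ["x", "y", "id"]
label_name = ["label"]

# Simple transform: fill missing values, then normalize the continuous columns.
conts = cont_names >> ops.FillMissing() >> ops.Normalize()
workflow = nvt.Workflow(conts + cat_names + label_name)

dataset = nvt.Dataset("my_data/*.parquet", part_size="1MB")  # placeholder path
transformed = workflow.fit_transform(dataset)
transformed.to_parquet(output_path="my_data/processed", out_files_per_proc=4)
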
Example #3
def _perform_read_parquet_dask(
    paths,
    columns,
    filesystem,
    load_divisions,
    geometry=None,
    bounds=None,
    categories=None,
):
    filesystem = validate_coerce_filesystem(paths[0], filesystem)
    datasets = [
        pa.parquet.ParquetDataset(path,
                                  filesystem=filesystem,
                                  validate_schema=False) for path in paths
    ]

    # Create delayed partition for each piece
    pieces = []
    for dataset in datasets:
        # Perform natural sort on pieces so that "part.10" comes after "part.2"
        dataset_pieces = sorted(dataset.pieces,
                                key=lambda piece: natural_sort_key(piece.path))
        pieces.extend(dataset_pieces)

    delayed_partitions = [
        delayed(read_parquet)(piece.path,
                              columns=columns,
                              filesystem=filesystem) for piece in pieces
    ]

    # Load divisions
    if load_divisions:
        div_mins_list, div_maxes_list = zip(
            *[_load_divisions(dataset) for dataset in datasets])

        div_mins = reduce(lambda a, b: a + b, div_mins_list, [])
        div_maxes = reduce(lambda a, b: a + b, div_maxes_list, [])
    else:
        div_mins = None
        div_maxes = None

    # load partition bounds
    partition_bounds_list = [
        _load_partition_bounds(dataset) for dataset in datasets
    ]
    if not any([b is None for b in partition_bounds_list]):
        partition_bounds = {}
        # We have partition bounds for all datasets
        for partition_bounds_el in partition_bounds_list:
            for col, col_bounds in partition_bounds_el.items():
                col_bounds_list = partition_bounds.get(col, [])
                col_bounds_list.append(col_bounds)
                partition_bounds[col] = col_bounds_list

        # Concat bounds for each geometry column
        for col in list(partition_bounds):
            partition_bounds[col] = pd.concat(partition_bounds[col],
                                              axis=0).reset_index(drop=True)
            partition_bounds[col].index.name = 'partition'
    else:
        partition_bounds = {}

    # Use Dask's read_parquet to get metadata
    if columns is not None:
        cols_no_index = [col for col in columns if col != "hilbert_distance"]
    else:
        cols_no_index = None

    meta = dd_read_parquet(
        paths[0],
        columns=cols_no_index,
        filesystem=filesystem,
        engine='pyarrow',
        categories=categories,
        gather_statistics=False,
    )._meta

    # Import geometry columns in meta, not needed for pyarrow >= 0.16
    metadata = _load_parquet_pandas_metadata(paths[0], filesystem=filesystem)
    geom_cols = _get_geometry_columns(metadata)
    if geom_cols:
        meta = _import_geometry_columns(meta, geom_cols)
    meta = GeoDataFrame(meta)

    # Handle geometry in meta
    if geometry:
        meta = meta.set_geometry(geometry)

    geometry = meta.geometry.name

    # Filter partitions by bounding box
    if bounds and geometry in partition_bounds:
        # Unpack the bounds coordinates
        x0, y0, x1, y1 = bounds

        # Make sure x0 < x1
        if x0 > x1:
            x0, x1 = x1, x0

        # Make sure y0 < y1
        if y0 > y1:
            y0, y1 = y1, y0

        # Make DataFrame with bounds and parquet piece
        partitions_df = partition_bounds[geometry].assign(
            delayed_partition=delayed_partitions)

        if load_divisions:
            partitions_df = partitions_df.assign(div_mins=div_mins,
                                                 div_maxes=div_maxes)

        inds = ~((partitions_df.x1 < x0) | (partitions_df.y1 < y0) |
                 (partitions_df.x0 > x1) | (partitions_df.y0 > y1))

        partitions_df = partitions_df[inds]
        for col in list(partition_bounds):
            partition_bounds[col] = partition_bounds[col][inds]
            partition_bounds[col].reset_index(drop=True, inplace=True)
            partition_bounds[col].index.name = "partition"

        delayed_partitions = partitions_df.delayed_partition.tolist()
        if load_divisions:
            # Convert to plain lists so the division check below works after filtering
            div_mins = partitions_df.div_mins.tolist()
            div_maxes = partitions_df.div_maxes.tolist()

    if load_divisions:
        divisions = div_mins + [div_maxes[-1]]
        if divisions != sorted(divisions):
            raise ValueError(
                "Cannot load divisions because the discovered divisions are unsorted.\n"
                "Set load_divisions=False to skip loading divisions.")
    else:
        divisions = None

    # Create DaskGeoDataFrame
    if delayed_partitions:
        result = from_delayed(delayed_partitions,
                              divisions=divisions,
                              meta=meta,
                              verify_meta=False)
    else:
        # Single partition empty result
        result = from_pandas(meta, npartitions=1)

    # Set partition bounds
    if partition_bounds:
        result._partition_bounds = partition_bounds

    return result
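
A hedged sketch of calling the helper above directly with a bounding-box filter; in practice it is reached through the library's public reader, and the paths and bounds tuple below are placeholders.

# Hypothetical direct call; the paths and the (x0, y0, x1, y1) box are made up.
ddf = _perform_read_parquet_dask(
    paths=["./data/part.0.parq", "./data/part.1.parq"],
    columns=None,
    filesystem=None,
    load_divisions=False,
    bounds=(-74.3, 40.5, -73.7, 40.9),
)

# Partitions whose stored bounds do not intersect the box have been dropped.
print(ddf.npartitions)
print(ddf._partition_bounds)
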
Example #4
def test_dask_workflow_api_dlrm(
    client,
    tmpdir,
    datasets,
    freq_threshold,
    part_mem_fraction,
    engine,
    cat_cache,
    on_host,
    shuffle,
    cpu,
):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    paths = sorted(paths)
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)
    df0 = df0.to_pandas() if cpu else df0

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    cats = cat_names >> ops.Categorify(freq_threshold=freq_threshold,
                                       out_path=str(tmpdir),
                                       cat_cache=cat_cache,
                                       on_host=on_host)

    conts = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp()

    workflow = Workflow(cats + conts + label_name, client=client)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, cpu=cpu, part_mem_fraction=part_mem_fraction)
    else:
        dataset = Dataset(paths,
                          cpu=cpu,
                          names=allcols_csv,
                          part_mem_fraction=part_mem_fraction)

    output_path = os.path.join(tmpdir, "processed")

    transformed = workflow.fit_transform(dataset)
    transformed.to_parquet(output_path=output_path,
                           shuffle=shuffle,
                           out_files_per_proc=1)

    result = transformed.to_ddf().compute()
    assert len(df0) == len(result)
    assert result["x"].min() == 0.0
    assert result["x"].isna().sum() == 0
    assert result["y"].min() == 0.0
    assert result["y"].isna().sum() == 0

    # Check categories.  Need to sort first to make sure we are comparing
    # "apples to apples"
    expect = df0.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    got = result.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    dfm = expect.merge(got, on="index",
                       how="inner")[["name-string_x", "name-string_y"]]
    dfm_gb = dfm.groupby(["name-string_x", "name-string_y"]).agg(
        {"name-string_x": "count", "name-string_y": "count"}
    )
    if freq_threshold:
        dfm_gb = dfm_gb[dfm_gb["name-string_x"] >= freq_threshold]
    assert_eq(dfm_gb["name-string_x"],
              dfm_gb["name-string_y"],
              check_names=False)

    # Read back from disk
    if cpu:
        df_disk = dd_read_parquet(output_path).compute()
    else:
        df_disk = dask_cudf.read_parquet(output_path).compute()

    # we don't have a deterministic ordering here, especially when using
    # a dask client with multiple workers - so we need to sort the values here
    columns = ["label", "x", "y", "id"] + cat_names
    got = result.sort_values(columns).reset_index(drop=True)
    expect = df_disk.sort_values(columns).reset_index(drop=True)
    assert_eq(got, expect)
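
The read-back check at the end of this test can be reproduced outside pytest with the sketch below; the output path is a placeholder and the CPU/GPU branch mirrors the cpu flag above.

import dask.dataframe as dd

output_path = "./processed"  # placeholder
use_cpu = True

if use_cpu:
    df_disk = dd.read_parquet(output_path).compute()
else:
    import dask_cudf  # requires a GPU / RAPIDS environment
    df_disk = dask_cudf.read_parquet(output_path).compute()

# Ordering on disk is not deterministic with a multi-worker client,
# so sort before comparing against the in-memory result.
df_disk = df_disk.sort_values(["label", "x", "y", "id"]).reset_index(drop=True)
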