def read_parquet_dask(path, columns=None, categories=None, storage_options=None, **kwargs):
    result = dd_read_parquet(
        path,
        columns=columns,
        categories=categories,
        storage_options=storage_options,
        engine="pyarrow",
        **kwargs
    )

    # Import geometry columns, not needed for pyarrow >= 0.16
    metadata = _load_parquet_pandas_metadata(path)
    geom_cols = _get_geometry_columns(metadata)
    if not geom_cols:
        # No geometry columns found, regular DaskDataFrame
        return result

    # Convert Dask DataFrame to DaskGeoDataFrame and the partitions and metadata
    # to GeoDataFrames
    result = result.map_partitions(
        lambda df: GeoDataFrame(_import_geometry_columns(df, geom_cols)),
    )
    result = DaskGeoDataFrame(
        result.dask,
        result._name,
        GeoDataFrame(_import_geometry_columns(result._meta, geom_cols)),
        result.divisions,
    )

    # Load bounding box info from _metadata
    pqds = pq.ParquetDataset(path)
    if b'spatialpandas' in pqds.common_metadata.metadata:
        spatial_metadata = json.loads(
            pqds.common_metadata.metadata[b'spatialpandas'].decode('utf')
        )
        if "partition_bounds" in spatial_metadata:
            partition_bounds = {}
            for name in spatial_metadata['partition_bounds']:
                bounds_df = pd.DataFrame(
                    spatial_metadata['partition_bounds'][name]
                )

                # Index labels will be read in as strings.
                # Here we convert to integers, sort by index, then drop index just in
                # case the rows got shuffled on read
                bounds_df = (bounds_df
                             .set_index(bounds_df.index.astype('int'))
                             .sort_index()
                             .reset_index(drop=True))
                bounds_df.index.name = 'partition'
                partition_bounds[name] = bounds_df

            result._partition_bounds = partition_bounds

    return result
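# Minimal usage sketch (assumption: a spatialpandas-style parquet dataset exists at
# "./example_geodata.parquet" with a geometry column named "geometry"; the path and
# column names are hypothetical, not taken from the function above). It shows how
# the function could be called and how the returned DaskGeoDataFrame exposes the
# per-partition bounds loaded from _metadata.
#
# ddf = read_parquet_dask("./example_geodata.parquet", columns=["geometry", "value"])
# if hasattr(ddf, "_partition_bounds"):
#     # One bounds DataFrame per geometry column, indexed by partition
#     print(ddf._partition_bounds["geometry"].head())
# df = ddf.compute()  # materialize as a (Geo)DataFrame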
def test_dask_preproc_cpu(client, tmpdir, datasets, engine, shuffle, cpu):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_size="1MB", cpu=cpu)
    else:
        dataset = Dataset(paths, names=allcols_csv, part_size="1MB", cpu=cpu)

    # Simple transform (normalize)
    cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]
    conts = cont_names >> ops.FillMissing() >> ops.Normalize()
    workflow = Workflow(conts + cat_names + label_name, client=client)
    transformed = workflow.fit_transform(dataset)

    # Write out dataset
    output_path = os.path.join(tmpdir, "processed")
    transformed.to_parquet(output_path=output_path, shuffle=shuffle, out_files_per_proc=4)

    # Check the final result
    df_disk = dd_read_parquet(output_path, engine="pyarrow").compute()
    assert_eq(
        df0.sort_values(["id", "x"])[["name-string", "label"]],
        df_disk.sort_values(["id", "x"])[["name-string", "label"]],
        check_index=False,
    )
def _perform_read_parquet_dask(
        paths,
        columns,
        filesystem,
        load_divisions,
        geometry=None,
        bounds=None,
        categories=None,
):
    filesystem = validate_coerce_filesystem(paths[0], filesystem)
    datasets = [
        pa.parquet.ParquetDataset(path, filesystem=filesystem, validate_schema=False)
        for path in paths
    ]

    # Create delayed partition for each piece
    pieces = []
    for dataset in datasets:
        # Perform natural sort on pieces so that "part.10" comes after "part.2"
        dataset_pieces = sorted(dataset.pieces, key=lambda piece: natural_sort_key(piece.path))
        pieces.extend(dataset_pieces)

    delayed_partitions = [
        delayed(read_parquet)(piece.path, columns=columns, filesystem=filesystem)
        for piece in pieces
    ]

    # Load divisions
    if load_divisions:
        div_mins_list, div_maxes_list = zip(
            *[_load_divisions(dataset) for dataset in datasets])

        div_mins = reduce(lambda a, b: a + b, div_mins_list, [])
        div_maxes = reduce(lambda a, b: a + b, div_maxes_list, [])
    else:
        div_mins = None
        div_maxes = None

    # Load partition bounds
    partition_bounds_list = [
        _load_partition_bounds(dataset) for dataset in datasets
    ]

    if not any([b is None for b in partition_bounds_list]):
        # We have partition bounds for all datasets
        partition_bounds = {}
        for partition_bounds_el in partition_bounds_list:
            for col, col_bounds in partition_bounds_el.items():
                col_bounds_list = partition_bounds.get(col, [])
                col_bounds_list.append(col_bounds)
                partition_bounds[col] = col_bounds_list

        # Concat bounds for each geometry column
        for col in list(partition_bounds):
            partition_bounds[col] = pd.concat(
                partition_bounds[col], axis=0
            ).reset_index(drop=True)
            partition_bounds[col].index.name = 'partition'
    else:
        partition_bounds = {}

    # Use Dask's read_parquet to get metadata
    if columns is not None:
        cols_no_index = [col for col in columns if col != "hilbert_distance"]
    else:
        cols_no_index = None

    meta = dd_read_parquet(
        paths[0],
        columns=cols_no_index,
        filesystem=filesystem,
        engine='pyarrow',
        categories=categories,
        gather_statistics=False,
    )._meta

    # Import geometry columns in meta, not needed for pyarrow >= 0.16
    metadata = _load_parquet_pandas_metadata(paths[0], filesystem=filesystem)
    geom_cols = _get_geometry_columns(metadata)
    if geom_cols:
        meta = _import_geometry_columns(meta, geom_cols)
    meta = GeoDataFrame(meta)

    # Handle geometry in meta
    if geometry:
        meta = meta.set_geometry(geometry)
    geometry = meta.geometry.name

    # Filter partitions by bounding box
    if bounds and geometry in partition_bounds:
        # Unpack bounds coordinates
        x0, y0, x1, y1 = bounds

        # Make sure x0 < x1
        if x0 > x1:
            x0, x1 = x1, x0

        # Make sure y0 < y1
        if y0 > y1:
            y0, y1 = y1, y0

        # Make DataFrame with bounds and parquet piece
        partitions_df = partition_bounds[geometry].assign(
            delayed_partition=delayed_partitions)
        if load_divisions:
            partitions_df = partitions_df.assign(div_mins=div_mins, div_maxes=div_maxes)

        # Keep only partitions whose bounding box overlaps the query rectangle
        inds = ~(
            (partitions_df.x1 < x0) |
            (partitions_df.y1 < y0) |
            (partitions_df.x0 > x1) |
            (partitions_df.y0 > y1)
        )
        partitions_df = partitions_df[inds]

        for col in list(partition_bounds):
            partition_bounds[col] = partition_bounds[col][inds]
            partition_bounds[col].reset_index(drop=True, inplace=True)
            partition_bounds[col].index.name = "partition"

        delayed_partitions = partitions_df.delayed_partition.tolist()
        if load_divisions:
            div_mins = partitions_df.div_mins.tolist()
            div_maxes = partitions_df.div_maxes.tolist()

    if load_divisions:
        divisions = div_mins + [div_maxes[-1]]
        if divisions != sorted(divisions):
            raise ValueError(
                "Cannot load divisions because the discovered divisions are unsorted.\n"
                "Set load_divisions=False to skip loading divisions."
            )
    else:
        divisions = None

    # Create DaskGeoDataFrame
    if delayed_partitions:
        result = from_delayed(
            delayed_partitions, divisions=divisions, meta=meta, verify_meta=False)
    else:
        # Single partition empty result
        result = from_pandas(meta, npartitions=1)

    # Set partition bounds
    if partition_bounds:
        result._partition_bounds = partition_bounds

    return result
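# Minimal sketch of the bounding-box partition filter used above, assuming a
# partition_bounds frame with columns x0, y0, x1, y1 (one row per partition).
# The sample values and query rectangle are illustrative only. A partition is
# kept when its bounding box overlaps the query rectangle, i.e. it is NOT
# entirely to the left/right of it or entirely above/below it.
import pandas as pd

bounds_df = pd.DataFrame({
    "x0": [0.0, 5.0, 12.0],
    "y0": [0.0, 5.0, 12.0],
    "x1": [4.0, 9.0, 15.0],
    "y1": [4.0, 9.0, 15.0],
})
qx0, qy0, qx1, qy1 = 3.0, 3.0, 8.0, 8.0  # hypothetical query rectangle

keep = ~(
    (bounds_df.x1 < qx0) | (bounds_df.y1 < qy0) |
    (bounds_df.x0 > qx1) | (bounds_df.y0 > qy1)
)
print(bounds_df[keep])  # partitions 0 and 1 overlap the query; partition 2 is dropped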
def test_dask_workflow_api_dlrm(
    client,
    tmpdir,
    datasets,
    freq_threshold,
    part_mem_fraction,
    engine,
    cat_cache,
    on_host,
    shuffle,
    cpu,
):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    paths = sorted(paths)
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)
    df0 = df0.to_pandas() if cpu else df0

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    cats = cat_names >> ops.Categorify(
        freq_threshold=freq_threshold,
        out_path=str(tmpdir),
        cat_cache=cat_cache,
        on_host=on_host,
    )
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()

    workflow = Workflow(cats + conts + label_name, client=client)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, cpu=cpu, part_mem_fraction=part_mem_fraction)
    else:
        dataset = Dataset(paths, cpu=cpu, names=allcols_csv, part_mem_fraction=part_mem_fraction)

    output_path = os.path.join(tmpdir, "processed")
    transformed = workflow.fit_transform(dataset)
    transformed.to_parquet(output_path=output_path, shuffle=shuffle, out_files_per_proc=1)

    result = transformed.to_ddf().compute()
    assert len(df0) == len(result)
    assert result["x"].min() == 0.0
    assert result["x"].isna().sum() == 0
    assert result["y"].min() == 0.0
    assert result["y"].isna().sum() == 0

    # Check categories. Need to sort first to make sure we are comparing
    # "apples to apples"
    expect = df0.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()
    got = result.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()
    dfm = expect.merge(got, on="index", how="inner")[["name-string_x", "name-string_y"]]
    dfm_gb = dfm.groupby(["name-string_x", "name-string_y"]).agg(
        {"name-string_x": "count", "name-string_y": "count"}
    )
    if freq_threshold:
        dfm_gb = dfm_gb[dfm_gb["name-string_x"] >= freq_threshold]
    assert_eq(dfm_gb["name-string_x"], dfm_gb["name-string_y"], check_names=False)

    # Read back from disk
    if cpu:
        df_disk = dd_read_parquet(output_path).compute()
    else:
        df_disk = dask_cudf.read_parquet(output_path).compute()

    # We don't have a deterministic ordering here, especially when using
    # a dask client with multiple workers - so we need to sort the values here
    columns = ["label", "x", "y", "id"] + cat_names
    got = result.sort_values(columns).reset_index(drop=True)
    expect = df_disk.sort_values(columns).reset_index(drop=True)
    assert_eq(got, expect)
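# Minimal sketch of the sort-then-compare pattern used by the tests above: when
# rows may come back in a nondeterministic order (e.g. from multiple dask
# workers), sort both frames on a shared key and reset the index before
# asserting equality. The data and column names below are illustrative only.
import pandas as pd
import pandas.testing as pdt

expected = pd.DataFrame({"id": [2, 1, 3], "x": [0.2, 0.1, 0.3]})
actual = pd.DataFrame({"id": [3, 2, 1], "x": [0.3, 0.2, 0.1]})  # same rows, shuffled

key = ["id", "x"]
pdt.assert_frame_equal(
    expected.sort_values(key).reset_index(drop=True),
    actual.sort_values(key).reset_index(drop=True),
)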