def build_and_process_graph(
    self,
    dataset,
    end_phase=None,
    output_path=None,
    record_stats=True,
    shuffle=None,
    output_format=None,
    out_files_per_proc=None,
    apply_ops=True,
    num_io_threads=0,
    dtypes=None,
):
    """Build Dask-task graph for workflow.

    Full graph is only executed if `output_format` is specified.
    """
    # Check shuffle argument
    shuffle = _check_shuffle_arg(shuffle)

    # Reorder tasks for two-phase workflows
    # TODO: Generalize this type of optimization
    self.reorder_tasks()

    end = end_phase if end_phase else len(self.phases)

    if output_format not in ("parquet", "hugectr", None):
        raise ValueError(f"Output format {output_format} not yet supported with Dask.")

    # Clear worker caches to be "safe"
    if self.client:
        self.client.run(clean_worker_cache)
    else:
        clean_worker_cache()

    self.set_ddf(dataset, shuffle=(shuffle is not None))
    if apply_ops:
        self._base_phase = 0  # Set _base_phase
        for idx, _ in enumerate(self.phases[:end]):
            self.exec_phase(idx, record_stats=record_stats, update_ddf=(idx == (end - 1)))
        self._base_phase = 0  # Reset _base_phase

    if dtypes:
        ddf = self.get_ddf()
        _meta = _set_dtypes(ddf._meta, dtypes)
        self.set_ddf(ddf.map_partitions(_set_dtypes, dtypes, meta=_meta))

    if output_format:
        output_path = output_path or "./"
        output_path = str(output_path)
        self.ddf_to_dataset(
            output_path,
            output_format=output_format,
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
            num_threads=num_io_threads,
        )
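# Hypothetical usage sketch (not from the source): `build_and_process_graph`
# is normally driven by `apply`, but it can be called directly to build the
# lazy Dask graph without writing anything. `workflow` and `dataset` below
# are assumed placeholders for an already-constructed workflow and dataset.
#
#     workflow.build_and_process_graph(
#         dataset,
#         record_stats=True,
#         output_format=None,  # no output written; the graph is built only
#     )
#     ddf = workflow.get_ddf()  # lazily-transformed Dask collection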
def iterate_online(
    self,
    dataset,
    end_phase=None,
    output_path=None,
    shuffle=None,
    output_format=None,
    out_files_per_proc=None,
    apply_ops=True,
    num_io_threads=0,
    dtypes=None,
):
    """Iterate through dataset and (optionally) apply/shuffle/write."""
    # Check shuffle argument
    shuffle = _check_shuffle_arg(shuffle)

    # Check if we have a (supported) writer
    output_path = output_path or "./"
    output_path = str(output_path)
    writer = writer_factory(
        output_format,
        output_path,
        out_files_per_proc,
        shuffle,
        bytes_io=(shuffle == Shuffle.PER_WORKER),
        num_threads=num_io_threads,
    )

    # Iterate through dataset, apply ops, and write out processed data
    if apply_ops:
        columns = self.columns_ctx["all"]["base"]
        for gdf in dataset.to_iter(shuffle=(shuffle is not None), columns=columns):
            self.apply_ops(gdf, output_path=output_path, writer=writer, dtypes=dtypes)

    # Close writer and write general/specialized metadata
    if writer:
        general_md, special_md = writer.close()

        # Note that we "could" have the special and general metadata
        # written during `writer.close()` (just above) for the single-GPU case.
        # Instead, the metadata logic is separated from the `Writer` object to
        # simplify multi-GPU integration. When using Dask, we cannot assume
        # that the "shared" metadata files can/will be written by the same
        # process that writes the data.
        writer.write_special_metadata(special_md, writer.fs, output_path)
        writer.write_general_metadata(general_md, writer.fs, output_path)
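# Hypothetical usage sketch (not from the source): online iteration streams
# the dataset chunk-by-chunk through the ops and into a writer, rather than
# building a Dask graph. `workflow` and `dataset` are assumed placeholders.
#
#     workflow.iterate_online(
#         dataset,
#         output_path="./processed",
#         output_format="parquet",
#         shuffle=Shuffle.PER_PARTITION,
#         out_files_per_proc=4,
#     )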
def write_to_dataset(
    self,
    path,
    dataset,
    apply_ops=False,
    out_files_per_proc=None,
    shuffle=None,
    output_format="parquet",
    iterate=False,
    nfiles=None,
    num_io_threads=0,
    dtypes=None,
):
    """Write data to shuffled parquet dataset.

    Assumes statistics are already gathered.
    """
    # Check shuffle argument
    shuffle = _check_shuffle_arg(shuffle)

    if nfiles:
        warnings.warn("nfiles is deprecated. Use out_files_per_proc")
        if out_files_per_proc is None:
            out_files_per_proc = nfiles
    out_files_per_proc = out_files_per_proc or 1

    path = str(path)
    if iterate:
        self.iterate_online(
            dataset,
            output_path=path,
            shuffle=shuffle,
            output_format=output_format,
            out_files_per_proc=out_files_per_proc,
            apply_ops=apply_ops,
            num_io_threads=num_io_threads,
            dtypes=dtypes,
        )
    else:
        self.build_and_process_graph(
            dataset,
            output_path=path,
            record_stats=False,
            shuffle=shuffle,
            output_format=output_format,
            out_files_per_proc=out_files_per_proc,
            apply_ops=apply_ops,
            num_io_threads=num_io_threads,
            dtypes=dtypes,
        )
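# Hypothetical usage sketch (not from the source): `iterate=True` routes
# through `iterate_online`, while the default `iterate=False` routes through
# `build_and_process_graph` with `record_stats=False`. `workflow` and
# `dataset` are assumed placeholders.
#
#     workflow.write_to_dataset(
#         "./out",
#         dataset,
#         apply_ops=True,
#         shuffle=Shuffle.PER_WORKER,
#         out_files_per_proc=8,
#     )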
def to_hugectr(
    self,
    output_path,
    cats,
    conts,
    labels,
    shuffle=None,
    file_partition_map=None,
    out_files_per_proc=None,
    num_threads=0,
    dtypes=None,
):
    """Writes out to a HugeCTR dataset

    Parameters
    ----------
    output_path : string
        Path to write processed/shuffled output data
    cats : list of str
        List of categorical columns
    conts : list of str
        List of continuous columns
    labels : list of str
        List of label columns
    shuffle : nvt.io.Shuffle, optional
        How to shuffle the output dataset. Shuffling is only
        performed if the data is written to disk. For all options,
        other than `None` (which means no shuffling), the partitions
        of the underlying dataset/ddf will be randomly ordered. If
        `PER_PARTITION` is specified, each worker/process will also
        shuffle the rows within each partition before splitting and
        appending the data to a number (`out_files_per_proc`) of output
        files. Output files are distinctly mapped to each worker process.
        If `PER_WORKER` is specified, each worker will follow the same
        procedure as `PER_PARTITION`, but will re-shuffle each file after
        all data is persisted. This results in a full shuffle of the
        data processed by each worker. To improve performance, this option
        currently uses host-memory `BytesIO` objects for the intermediate
        persist stage. The `FULL` option is not yet implemented.
    file_partition_map : dict
        Dictionary mapping of output file names to partition indices
        that should be written to that file name. If this argument is
        passed, only the partitions included in the dictionary will be
        written to disk, and the `out_files_per_proc` argument will be
        ignored.
    out_files_per_proc : integer
        Number of files to create (per process) after shuffling the data
    num_threads : integer
        Number of IO threads to use for writing the output dataset.
        For `0` (default), no dedicated IO threads will be used.
    dtypes : dict
        Dictionary containing desired datatypes for output columns.
        Keys are column names, values are datatypes.
    """
    # For now, we must move to the GPU to write an output dataset.
    # TODO: Support CPU-mode output
    self.to_gpu()

    shuffle = _check_shuffle_arg(shuffle)
    ddf = self.to_ddf(shuffle=shuffle)

    if dtypes:
        _meta = _set_dtypes(ddf._meta, dtypes)
        ddf = ddf.map_partitions(_set_dtypes, dtypes, meta=_meta)

    fs = get_fs_token_paths(output_path)[0]
    fs.mkdirs(output_path, exist_ok=True)

    # Output dask_cudf DataFrame to dataset
    _ddf_to_dataset(
        ddf,
        fs,
        output_path,
        shuffle,
        file_partition_map,
        out_files_per_proc,
        cats,
        conts,
        labels,
        "hugectr",
        self.client,
        num_threads,
        self.cpu,
    )
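# Hypothetical usage sketch (not from the source): exporting to HugeCTR
# format requires explicit categorical/continuous/label column lists. The
# column names below are illustrative placeholders, and `ds` stands in for
# the object exposing `to_hugectr`.
#
#     ds.to_hugectr(
#         "./hugectr_out",
#         cats=["user_id", "item_id"],
#         conts=["price"],
#         labels=["click"],
#         shuffle=Shuffle.PER_WORKER,
#         out_files_per_proc=2,
#     )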
def to_parquet(
    self,
    output_path,
    shuffle=None,
    preserve_files=False,
    output_files=None,
    out_files_per_proc=None,
    num_threads=0,
    dtypes=None,
    cats=None,
    conts=None,
    labels=None,
    suffix=".parquet",
    partition_on=None,
):
    """Writes out to a parquet dataset

    Parameters
    ----------
    output_path : string
        Path to write processed/shuffled output data
    shuffle : nvt.io.Shuffle enum
        How to shuffle the output dataset. For all options, other
        than `None` (which means no shuffling), the partitions of the
        underlying dataset/ddf will be randomly ordered. If
        `PER_PARTITION` is specified, each worker/process will also
        shuffle the rows within each partition before splitting and
        appending the data to a number (`out_files_per_proc`) of output
        files. Output files are distinctly mapped to each worker process.
        If `PER_WORKER` is specified, each worker will follow the same
        procedure as `PER_PARTITION`, but will re-shuffle each file after
        all data is persisted. This results in a full shuffle of the
        data processed by each worker. To improve performance, this option
        currently uses host-memory `BytesIO` objects for the intermediate
        persist stage. The `FULL` option is not yet implemented.
    partition_on : str or list(str)
        Columns to use for hive-partitioning. If this option is used,
        `preserve_files`, `output_files`, and `out_files_per_proc` will
        all be ignored. Also, the `PER_WORKER` shuffle will not be
        supported.
    preserve_files : bool
        Whether to preserve the original file-to-partition mapping of
        the base dataset. This option is only available if the base
        dataset is known, and if it corresponds to csv or parquet format.
        If True, the `out_files_per_proc` option will be ignored, but
        the `output_files` option will take precedence. Default is False.
    output_files : dict, list or int
        Dictionary mapping of output file names to partition indices.
        If a list of file names is specified, a contiguous range of
        output partitions will be mapped to each file. The same
        procedure is used if an integer is specified, but the file
        names will be written as "part_*". If anything is specified for
        `output_files`, the `out_files_per_proc` argument will be
        ignored. Also, if a dictionary is specified, excluded partition
        indices will not be written to disk.
    out_files_per_proc : integer
        Number of files to create (per process) after shuffling the
        data. This option will be ignored if `output_files` is
        specified.
    num_threads : integer
        Number of IO threads to use for writing the output dataset.
        For `0` (default), no dedicated IO threads will be used.
    dtypes : dict
        Dictionary containing desired datatypes for output columns.
        Keys are column names, values are datatypes.
    suffix : str or False
        File-name extension to use for all output files. This argument
        is ignored if a specific list of file names is specified using
        the ``output_files`` option. If ``preserve_files=True``, this
        suffix will be appended to the original name of each file,
        unless the original extension is ".csv", ".parquet", ".avro",
        or ".orc" (in which case the old extension will be replaced).
    cats : list of str, optional
        List of categorical columns
    conts : list of str, optional
        List of continuous columns
    labels : list of str, optional
        List of label columns
    """
    shuffle = _check_shuffle_arg(shuffle)

    if isinstance(output_files, dict) or (not output_files and preserve_files):
        # Do not shuffle partitions if we are preserving files or
        # if a specific file-partition mapping is already specified
        ddf = self.to_ddf()
    else:
        ddf = self.to_ddf(shuffle=shuffle)

    # Replace None/False suffix argument with ""
    suffix = suffix or ""

    # Convert `output_files` argument to a dict mapping
    if output_files:
        if isinstance(output_files, int):
            output_files = [f"part_{i}" + suffix for i in range(output_files)]
        if isinstance(output_files, list):
            new = {}
            split = math.ceil(ddf.npartitions / len(output_files))
            for i, fn in enumerate(output_files):
                start = i * split
                stop = min(start + split, ddf.npartitions)
                new[fn] = np.arange(start, stop)
            output_files = new
            suffix = ""  # Don't add a suffix later - Names already include it
        if not isinstance(output_files, dict):
            raise TypeError(f"{type(output_files)} not a supported type for `output_files`.")

    # If we are preserving files, use the stored dictionary,
    # or use file_partition_map to extract the mapping
    elif preserve_files:
        try:
            _output_files = self.base_dataset.file_partition_map
        except AttributeError as e:
            raise AttributeError(
                f"`to_parquet(..., preserve_files=True)` is not currently supported "
                f"for datasets with a {type(self.base_dataset.engine)} engine. Check "
                f"that `dataset.base_dataset` is backed by csv or parquet files."
            ) from e
        if suffix == "":
            output_files = _output_files
        else:
            output_files = {}
            for fn, rgs in _output_files.items():
                split_fn = fn.split(".")
                if split_fn[-1] in ("parquet", "avro", "orc", "csv"):
                    output_files[".".join(split_fn[:-1]) + suffix] = rgs
                else:
                    output_files[fn + suffix] = rgs
            suffix = ""  # Don't add a suffix later - Names already include it

    if dtypes:
        _meta = _set_dtypes(ddf._meta, dtypes)
        ddf = ddf.map_partitions(_set_dtypes, dtypes, meta=_meta)

    fs = get_fs_token_paths(output_path)[0]
    fs.mkdirs(output_path, exist_ok=True)

    # Output dask_cudf DataFrame to dataset
    _ddf_to_dataset(
        ddf,
        fs,
        output_path,
        shuffle,
        output_files,
        out_files_per_proc,
        cats or [],
        conts or [],
        labels or [],
        "parquet",
        self.client,
        num_threads,
        self.cpu,
        suffix=suffix,
        partition_on=partition_on,
    )
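# Hypothetical usage sketch (not from the source): the mutually exclusive
# ways of controlling output file names, per the docstring above. `ds`
# stands in for the object exposing `to_parquet`.
#
#     ds.to_parquet("./out", out_files_per_proc=4)  # N files per process
#     ds.to_parquet("./out", output_files=16)       # 16 files named "part_*"
#     ds.to_parquet("./out", preserve_files=True)   # keep input file mapping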
def apply(
    self,
    dataset,
    apply_offline=True,
    record_stats=True,
    shuffle=None,
    output_path="./ds_export",
    output_format="parquet",
    out_files_per_proc=None,
    num_io_threads=0,
    dtypes=None,
):
    """
    Runs all the preprocessing and feature engineering operators.
    Also shuffles the data if a `shuffle` option is specified.

    Parameters
    ----------
    dataset : object
    apply_offline : boolean
        Whether to run the operators in offline mode.
    record_stats : boolean
        Whether to record statistics in a file. Only available
        when `apply_offline=True`.
    shuffle : nvt.io.Shuffle enum
        How to shuffle the output dataset. Shuffling is only
        performed if the data is written to disk. For all options,
        other than `None` (which means no shuffling), the partitions
        of the underlying dataset/ddf will be randomly ordered. If
        `PER_PARTITION` is specified, each worker/process will also
        shuffle the rows within each partition before splitting and
        appending the data to a number (`out_files_per_proc`) of output
        files. Output files are distinctly mapped to each worker process.
        If `PER_WORKER` is specified, each worker will follow the same
        procedure as `PER_PARTITION`, but will re-shuffle each file after
        all data is persisted. This results in a full shuffle of the
        data processed by each worker. To improve performance, this option
        currently uses host-memory `BytesIO` objects for the intermediate
        persist stage. The `FULL` option is not yet implemented.
    output_path : string
        Path to write processed/shuffled output data
    output_format : {"parquet", "hugectr", None}
        Output format to write processed/shuffled data. If None,
        no output dataset will be written (and shuffling is skipped).
    out_files_per_proc : integer
        Number of files to create (per process) after shuffling the data
    num_io_threads : integer
        Number of IO threads to use for writing the output dataset.
        For `0` (default), no dedicated IO threads will be used.
    dtypes : dict
        Dictionary containing desired datatypes for output columns.
        Keys are column names, values are datatypes.
    """
    # Check shuffle argument
    shuffle = _check_shuffle_arg(shuffle)

    # If no tasks have been loaded, then we need to load the internal config
    if not self.phases:
        self.finalize()

    # Gather statistics (if apply_offline), and/or transform
    # and write out processed data
    if apply_offline:
        self.build_and_process_graph(
            dataset,
            output_path=output_path,
            record_stats=record_stats,
            shuffle=shuffle,
            output_format=output_format,
            out_files_per_proc=out_files_per_proc,
            num_io_threads=num_io_threads,
            dtypes=dtypes,
        )
    else:
        self.iterate_online(
            dataset,
            output_path=output_path,
            shuffle=shuffle,
            output_format=output_format,
            out_files_per_proc=out_files_per_proc,
            num_io_threads=num_io_threads,
            dtypes=dtypes,
        )
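# Hypothetical usage sketch (not from the source): a full offline pass that
# gathers statistics, applies the ops, shuffles, and writes parquet output.
# `workflow` and `dataset` are assumed placeholders.
#
#     workflow.apply(
#         dataset,
#         apply_offline=True,
#         record_stats=True,
#         shuffle=Shuffle.PER_PARTITION,
#         output_path="./ds_export",
#         output_format="parquet",
#         out_files_per_proc=8,
#     )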