def build_and_process_graph(
    self,
    dataset,
    end_phase=None,
    output_path=None,
    record_stats=True,
    shuffle=None,
    output_format=None,
    out_files_per_proc=None,
    apply_ops=True,
    num_io_threads=0,
):
    """Build the Dask task graph for the workflow and optionally write it out.

    The leading phases are executed when `apply_ops` is true. The resulting
    ddf is only written to disk if `output_format` is specified.

    Parameters
    -----------
    dataset : object
        Dataset handed to `self.set_ddf`.
    end_phase : integer, optional
        Number of leading phases to process. Any falsy value (`None` or
        `0`) selects every phase in `self.phases`.
    output_path : string, optional
        Where to write the output. Falls back to "./" when empty and is
        converted with `str()`. Only used if `output_format` is given.
    record_stats : boolean
        Forwarded to `self.exec_phase` for every executed phase.
    shuffle : object, optional
        Validated with `nvt_io._check_shuffle_arg`. Any non-`None` result
        turns on `shuffle=True` for `self.set_ddf`; the validated value
        itself is forwarded to `self.ddf_to_dataset`.
    output_format : {"parquet", "hugectr", None}
        Output format. If falsy, nothing is written.
    out_files_per_proc : integer, optional
        Forwarded to `self.ddf_to_dataset`.
    apply_ops : boolean
        If false, no phase is executed.
    num_io_threads : integer
        Forwarded to `self.ddf_to_dataset` as `num_threads`.

    Raises
    ------
    ValueError
        If `output_format` is not one of the supported values.
    """
    shuffle = nvt_io._check_shuffle_arg(shuffle)

    # A falsy `end_phase` means "run all phases".
    last_phase = end_phase or len(self.phases)

    supported_formats = ("parquet", "hugectr", None)
    if output_format not in supported_formats:
        raise ValueError(
            f"Output format {output_format} not yet supported with Dask."
        )

    # Reorder tasks for two-phase workflows
    # TODO: Generalize this type of optimization
    self.reorder_tasks(last_phase)

    # Clear worker caches to be "safe": directly when there is no client,
    # otherwise by dispatching the cleanup through the client.
    if not self.client:
        clean_worker_cache()
    else:
        self.client.run(clean_worker_cache)

    self.set_ddf(dataset, shuffle=(shuffle is not None))

    if apply_ops:
        # The slice fixes the number of phases before any of them runs.
        n_phases = len(self.phases[:last_phase])
        for phase_idx in range(n_phases):
            self.exec_phase(phase_idx, record_stats=record_stats)

    # Without an output format the graph is built/processed but not written.
    if not output_format:
        return

    destination = str(output_path or "./")
    self.ddf_to_dataset(
        destination,
        output_format=output_format,
        shuffle=shuffle,
        out_files_per_proc=out_files_per_proc,
        num_threads=num_io_threads,
    )
def write_to_dataset(
    self,
    path,
    dataset,
    apply_ops=False,
    out_files_per_proc=None,
    shuffle=None,
    output_format="parquet",
    iterate=False,
    nfiles=None,
    num_io_threads=0,
):
    """Write `dataset` to `path` in the requested output format.

    Statistics are never recorded here (`record_stats=False` is passed on
    the non-iterating path), so they are assumed to be gathered already.

    Parameters
    -----------
    path : string or path-like
        Output location; converted with `str()`.
    dataset : object
        Dataset forwarded to the selected processing method.
    apply_ops : boolean
        Forwarded as `apply_ops` to the selected processing method.
    out_files_per_proc : integer, optional
        Number of output files per process. Defaults to `nfiles` when that
        deprecated argument is given, otherwise to 1.
    shuffle : object, optional
        Validated with `nvt_io._check_shuffle_arg` and forwarded.
    output_format : string
        Output format forwarded to the selected processing method.
    iterate : boolean
        If true, use `self.iterate_online`; otherwise use
        `self.build_and_process_graph`.
    nfiles : integer, optional
        Deprecated alias of `out_files_per_proc`. It is only honoured when
        `out_files_per_proc` is `None`.
    num_io_threads : integer
        Forwarded as `num_io_threads` to the selected processing method.
    """
    # Check shuffle argument
    shuffle = nvt_io._check_shuffle_arg(shuffle)

    if nfiles:
        # stacklevel=2 attributes the warning to the caller that passed
        # `nfiles`, rather than to this line inside the library.
        warnings.warn(
            "nfiles is deprecated. Use out_files_per_proc", stacklevel=2
        )
        if out_files_per_proc is None:
            out_files_per_proc = nfiles
    out_files_per_proc = out_files_per_proc or 1

    path = str(path)

    if iterate:
        self.iterate_online(
            dataset,
            output_path=path,
            shuffle=shuffle,
            output_format=output_format,
            out_files_per_proc=out_files_per_proc,
            apply_ops=apply_ops,
            num_io_threads=num_io_threads,
        )
    else:
        self.build_and_process_graph(
            dataset,
            output_path=path,
            record_stats=False,
            shuffle=shuffle,
            output_format=output_format,
            out_files_per_proc=out_files_per_proc,
            apply_ops=apply_ops,
            num_io_threads=num_io_threads,
        )
def iterate_online(
    self,
    dataset,
    end_phase=None,
    output_path=None,
    shuffle=None,
    output_format=None,
    out_files_per_proc=None,
    apply_ops=True,
    num_io_threads=0,
):
    """Stream through `dataset` and (optionally) apply/shuffle/write.

    Parameters
    -----------
    dataset : object
        Must provide `to_iter(shuffle=...)`, which yields the chunks to
        process.
    end_phase : integer, optional
        Accepted for signature compatibility; not used by this method.
    output_path : string, optional
        Output location. Falls back to "./" when empty and is converted
        with `str()`.
    shuffle : object, optional
        Validated with `nvt_io._check_shuffle_arg`. Any non-`None` result
        requests shuffled iteration from `dataset.to_iter`.
    output_format : string, optional
        Passed to `nvt_io.writer_factory` to select a writer.
    out_files_per_proc : integer, optional
        Passed to `nvt_io.writer_factory`.
    apply_ops : boolean
        If false, the dataset is not iterated at all; only the writer
        close/metadata step below still runs.
    num_io_threads : integer
        Passed to `nvt_io.writer_factory` as `num_threads`.
    """
    shuffle = nvt_io._check_shuffle_arg(shuffle)

    # Check if we have a (supported) writer
    output_path = str(output_path or "./")
    per_worker_shuffle = shuffle == nvt_io.shuffle.per_worker
    writer = nvt_io.writer_factory(
        output_format,
        output_path,
        out_files_per_proc,
        shuffle,
        bytes_io=per_worker_shuffle,
        num_threads=num_io_threads,
    )

    # Iterate through dataset, apply ops, and write out processed data
    if apply_ops:
        shuffle_chunks = shuffle is not None
        for gdf in dataset.to_iter(shuffle=shuffle_chunks):
            self.apply_ops(gdf, output_path=output_path, writer=writer)

    # No writer means there is nothing to close and no metadata to emit.
    if not writer:
        return

    # Close writer and write general/specialized metadata.
    # Note that we "could" have the special and general metadata written
    # during `writer.close()` for the single-GPU case. Instead, the metadata
    # logic is separated from the `Writer` object to simplify multi-GPU
    # integration. When using Dask, we cannot assume that the "shared"
    # metadata files can/will be written by the same process that writes
    # the data.
    general_md, special_md = writer.close()
    filesystem = writer.fs
    writer.write_special_metadata(special_md, filesystem, output_path)
    writer.write_general_metadata(general_md, writer.fs, output_path)
def apply(
    self,
    dataset,
    apply_offline=True,
    record_stats=True,
    shuffle=None,
    output_path="./ds_export",
    output_format="parquet",
    out_files_per_proc=None,
    num_io_threads=0,
):
    """
    Run all preprocessing and feature-engineering operators.

    The data is also shuffled if a `shuffle` option is specified.

    Parameters
    -----------
    dataset : object
        Dataset to process.
    apply_offline : boolean
        Whether to run the operators in offline mode. Offline mode goes
        through `build_and_process_graph`; otherwise `iterate_online`
        is used.
    record_stats : boolean
        Whether to record the statistics. Only used when
        `apply_offline=True`.
    shuffle : nvt.io.Shuffle enum
        How to shuffle the output dataset. Shuffling is only performed if
        the data is written to disk. For every option other than `None`
        (which means no shuffling), the partitions of the underlying
        dataset/ddf are randomly ordered. With `PER_PARTITION`, each
        worker/process also shuffles the rows within each partition before
        splitting and appending the data to a number
        (`out_files_per_proc`) of output files. Output files are
        distinctly mapped to each worker process. With `PER_WORKER`, each
        worker follows the same procedure as `PER_PARTITION`, but
        re-shuffles each file after all data is persisted, giving a full
        shuffle of the data processed by each worker. To improve
        performance, this option currently uses host-memory `BytesIO`
        objects for the intermediate persist stage. The `FULL` option is
        not yet implemented.
    output_path : string
        Path to write the processed/shuffled output data to.
    output_format : {"parquet", "hugectr", None}
        Output format for the processed/shuffled data. If None, no output
        dataset is written (and shuffling is skipped).
    out_files_per_proc : integer
        Number of files to create (per process) after shuffling the data.
    num_io_threads : integer
        Number of IO threads to use for writing the output dataset.
        For `0` (default), no dedicated IO threads are used.
    """
    shuffle = nvt_io._check_shuffle_arg(shuffle)

    # If no tasks have been loaded yet, load the internal config first.
    if not self.phases:
        self.finalize()

    # Options common to both the offline and the online code path.
    common_options = {
        "output_path": output_path,
        "shuffle": shuffle,
        "output_format": output_format,
        "out_files_per_proc": out_files_per_proc,
        "num_io_threads": num_io_threads,
    }

    if not apply_offline:
        # Online: transform and write the data chunk by chunk.
        self.iterate_online(dataset, **common_options)
        return

    # Offline: gather statistics and/or transform and write the data.
    self.build_and_process_graph(
        dataset, record_stats=record_stats, **common_options
    )