Example #1
    def build_and_process_graph(
        self,
        dataset,
        end_phase=None,
        output_path=None,
        record_stats=True,
        shuffle=None,
        output_format=None,
        out_files_per_proc=None,
        apply_ops=True,
        num_io_threads=0,
    ):
        """ Build Dask-task graph for workflow.

            Full graph is only executed if `output_format` is specified.
        """
        # Check shuffle argument
        shuffle = nvt_io._check_shuffle_arg(shuffle)

        end = end_phase if end_phase else len(self.phases)

        if output_format not in ("parquet", "hugectr", None):
            raise ValueError(
                f"Output format {output_format} not yet supported with Dask.")

        # Reorder tasks for two-phase workflows
        # TODO: Generalize this type of optimization
        self.reorder_tasks(end)

        # Clear worker caches to be "safe"
        if self.client:
            self.client.run(clean_worker_cache)
        else:
            clean_worker_cache()

        self.set_ddf(dataset, shuffle=(shuffle is not None))
        if apply_ops:
            for idx, _ in enumerate(self.phases[:end]):
                self.exec_phase(idx, record_stats=record_stats)
        if output_format:
            output_path = output_path or "./"
            output_path = str(output_path)
            self.ddf_to_dataset(
                output_path,
                output_format=output_format,
                shuffle=shuffle,
                out_files_per_proc=out_files_per_proc,
                num_threads=num_io_threads,
            )
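
This graph-building step is internal and would normally be reached through Workflow.apply (Example #4) rather than called directly. As a rough sketch of a direct call, assuming `workflow` is an already-configured NVTabular Workflow and `dataset` an nvt.Dataset (both illustrative, not taken from the source):

    import nvtabular as nvt

    # Assumed setup (not shown in the source):
    # workflow = nvt.Workflow(...)              # configured elsewhere
    # dataset = nvt.Dataset("data/*.parquet")   # illustrative input

    # Phases execute because apply_ops defaults to True; the processed data
    # is only written out because output_format is specified (per docstring).
    workflow.build_and_process_graph(
        dataset,
        output_path="./processed",
        record_stats=True,
        shuffle=None,               # no shuffling of the output
        output_format="parquet",
        out_files_per_proc=4,
    )
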
Example #2
    def write_to_dataset(
        self,
        path,
        dataset,
        apply_ops=False,
        out_files_per_proc=None,
        shuffle=None,
        output_format="parquet",
        iterate=False,
        nfiles=None,
        num_io_threads=0,
    ):
        """ Write data to shuffled parquet dataset.

            Assumes statistics are already gathered.
        """
        # Check shuffle argument
        shuffle = nvt_io._check_shuffle_arg(shuffle)

        if nfiles:
            warnings.warn("nfiles is deprecated. Use out_files_per_proc")
            if out_files_per_proc is None:
                out_files_per_proc = nfiles
        out_files_per_proc = out_files_per_proc or 1

        path = str(path)
        if iterate:
            self.iterate_online(
                dataset,
                output_path=path,
                shuffle=shuffle,
                output_format=output_format,
                out_files_per_proc=out_files_per_proc,
                apply_ops=apply_ops,
                num_io_threads=num_io_threads,
            )
        else:
            self.build_and_process_graph(
                dataset,
                output_path=path,
                record_stats=False,
                shuffle=shuffle,
                output_format=output_format,
                out_files_per_proc=out_files_per_proc,
                apply_ops=apply_ops,
                num_io_threads=num_io_threads,
            )
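
A hedged usage sketch for this writer entry point; per the docstring, statistics are assumed to have been gathered already (e.g. by a prior offline pass). The `workflow`/`dataset` objects, paths, and enum spelling below are assumptions, not from the source:

    # Assumes `workflow` and `dataset` already exist and stats are gathered.
    workflow.write_to_dataset(
        "./shuffled_out",           # output path
        dataset,
        apply_ops=True,             # transform while writing
        out_files_per_proc=2,
        shuffle=nvt.io.Shuffle.PER_PARTITION,  # assumed enum spelling
        output_format="parquet",
        iterate=False,              # use the Dask graph path (Example #1)
    )
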
Example #3
    def iterate_online(
        self,
        dataset,
        end_phase=None,
        output_path=None,
        shuffle=None,
        output_format=None,
        out_files_per_proc=None,
        apply_ops=True,
        num_io_threads=0,
    ):
        """ Iterate through dataset and (optionally) apply/shuffle/write.
        """
        # Check shuffle argument
        shuffle = nvt_io._check_shuffle_arg(shuffle)

        # Check if we have a (supported) writer
        output_path = output_path or "./"
        output_path = str(output_path)
        writer = nvt_io.writer_factory(
            output_format,
            output_path,
            out_files_per_proc,
            shuffle,
            bytes_io=(shuffle == nvt_io.shuffle.per_worker),
            num_threads=num_io_threads,
        )

        # Iterate through dataset, apply ops, and write out processed data
        if apply_ops:
            for gdf in dataset.to_iter(shuffle=(shuffle is not None)):
                self.apply_ops(gdf, output_path=output_path, writer=writer)

        # Close writer and write general/specialized metadata
        if writer:
            general_md, special_md = writer.close()

            # Note that we "could" have the special and general metadata
            # written during `writer.close()` (just above) for the single-GPU case.
            # Instead, the metadata logic is separated from the `Writer` object to
            # simplify multi-GPU integration. When using Dask, we cannot assume
            # that the "shared" metadata files can/will be written by the same
            # process that writes the data.
            writer.write_special_metadata(special_md, writer.fs, output_path)
            writer.write_general_metadata(general_md, writer.fs, output_path)
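
For the online path, a sketch along the same lines (again with illustrative names): chunks are streamed through apply_ops, and the writer handles shuffling and the final metadata writes shown above.

    # Assumes `workflow` and `dataset` already exist; names are illustrative.
    workflow.iterate_online(
        dataset,
        output_path="./online_out",
        shuffle=nvt.io.Shuffle.PER_WORKER,  # assumed enum spelling; per the
                                            # code above, buffers in host memory
        output_format="parquet",
        out_files_per_proc=1,
        apply_ops=True,
    )
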
Example #4
    def apply(
        self,
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=None,
        output_path="./ds_export",
        output_format="parquet",
        out_files_per_proc=None,
        num_io_threads=0,
    ):
        """
        Runs all the preprocessing and feature engineering operators.
        Also, shuffles the data if a `shuffle` option is specified.

        Parameters
        -----------
        dataset : object
        apply_offline : boolean
            Whether to run the operators in offline mode
        record_stats : boolean
            Whether to record the stats in a file. Only available
            when apply_offline=True
        shuffle : nvt.io.Shuffle enum
            How to shuffle the output dataset. Shuffling is only
            performed if the data is written to disk. For all options,
            other than `None` (which means no shuffling), the partitions
            of the underlying dataset/ddf will be randomly ordered. If
            `PER_PARTITION` is specified, each worker/process will also
            shuffle the rows within each partition before splitting and
            appending the data to a number (`out_files_per_proc`) of output
            files. Output files are distinctly mapped to each worker process.
            If `PER_WORKER` is specified, each worker will follow the same
            procedure as `PER_PARTITION`, but will re-shuffle each file after
            all data is persisted.  This results in a full shuffle of the
            data processed by each worker.  To improve performance, this option
            currently uses host-memory `BytesIO` objects for the intermediate
            persist stage. The `FULL` option is not yet implemented.
        output_path : string
            Path to write processed/shuffled output data
        output_format : {"parquet", "hugectr", None}
            Output format to write processed/shuffled data. If None,
            no output dataset will be written (and shuffling skipped).
        out_files_per_proc : integer
            Number of files to create (per process) after
            shuffling the data
        num_io_threads : integer
            Number of IO threads to use for writing the output dataset.
            For `0` (default), no dedicated IO threads will be used.
        """

        # Check shuffle argument
        shuffle = nvt_io._check_shuffle_arg(shuffle)

        # If no tasks have been loaded then we need to load internal config
        if not self.phases:
            self.finalize()

        # Gather statistics (if apply_offline), and/or transform
        # and write out processed data
        if apply_offline:
            self.build_and_process_graph(
                dataset,
                output_path=output_path,
                record_stats=record_stats,
                shuffle=shuffle,
                output_format=output_format,
                out_files_per_proc=out_files_per_proc,
                num_io_threads=num_io_threads,
            )
        else:
            self.iterate_online(
                dataset,
                output_path=output_path,
                shuffle=shuffle,
                output_format=output_format,
                out_files_per_proc=out_files_per_proc,
                num_io_threads=num_io_threads,
            )
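
`apply` is the user-facing entry point that dispatches to Example #1 (offline) or Example #3 (online). A minimal end-to-end sketch, with the Workflow construction, input path, and enum spelling as assumptions:

    import nvtabular as nvt

    # Assumed setup; the Workflow construction is not shown in the source.
    # workflow = nvt.Workflow(...)
    # dataset = nvt.Dataset("data/*.parquet")   # illustrative input

    # Offline pass: gather statistics, apply ops, shuffle, and write parquet.
    workflow.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_WORKER,   # assumed enum spelling
        output_path="./ds_export",
        output_format="parquet",
        out_files_per_proc=8,
    )
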