Example #1
    def build_and_process_graph(
        self,
        dataset,
        end_phase=None,
        output_path=None,
        record_stats=True,
        shuffle=None,
        output_format=None,
        out_files_per_proc=None,
        apply_ops=True,
        num_io_threads=0,
        dtypes=None,
    ):
        """Build Dask-task graph for workflow.

        Full graph is only executed if `output_format` is specified.
        """
        # Check shuffle argument
        shuffle = _check_shuffle_arg(shuffle)

        # Reorder tasks for two-phase workflows
        # TODO: Generalize this type of optimization
        self.reorder_tasks()

        end = end_phase if end_phase else len(self.phases)

        if output_format not in ("parquet", "hugectr", None):
            raise ValueError(
                f"Output format {output_format} not yet supported with Dask.")

        # Clear worker caches to be "safe"
        if self.client:
            self.client.run(clean_worker_cache)
        else:
            clean_worker_cache()

        self.set_ddf(dataset, shuffle=(shuffle is not None))
        if apply_ops:
            self._base_phase = 0  # Set _base_phase
            for idx, _ in enumerate(self.phases[:end]):
                self.exec_phase(idx,
                                record_stats=record_stats,
                                update_ddf=(idx == (end - 1)))
            self._base_phase = 0  # Re-Set _base_phase

        if dtypes:
            ddf = self.get_ddf()
            _meta = _set_dtypes(ddf._meta, dtypes)
            self.set_ddf(ddf.map_partitions(_set_dtypes, dtypes, meta=_meta))

        if output_format:
            output_path = output_path or "./"
            output_path = str(output_path)
            self.ddf_to_dataset(
                output_path,
                output_format=output_format,
                shuffle=shuffle,
                out_files_per_proc=out_files_per_proc,
                num_threads=num_io_threads,
            )
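
A minimal usage sketch for `build_and_process_graph`, assuming `workflow` is a preconfigured instance of the (legacy) NVTabular Workflow class shown above; the input/output paths, shuffle choice, and file count are illustrative assumptions, not values from the source:

import nvtabular as nvt

# Hypothetical, preconfigured `workflow` (an instance of the class shown above).
dataset = nvt.Dataset("raw_data/*.parquet", engine="parquet")  # illustrative input path
workflow.build_and_process_graph(
    dataset,
    output_path="./processed",              # illustrative output location
    record_stats=True,
    shuffle=nvt.io.Shuffle.PER_PARTITION,
    output_format="parquet",                # the full graph executes only when this is set
    out_files_per_proc=8,
)
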
Example #2
    def iterate_online(
        self,
        dataset,
        end_phase=None,
        output_path=None,
        shuffle=None,
        output_format=None,
        out_files_per_proc=None,
        apply_ops=True,
        num_io_threads=0,
        dtypes=None,
    ):
        """Iterate through dataset and (optionally) apply/shuffle/write."""
        # Check shuffle argument
        shuffle = _check_shuffle_arg(shuffle)

        # Check if we have a (supported) writer
        output_path = output_path or "./"
        output_path = str(output_path)
        writer = writer_factory(
            output_format,
            output_path,
            out_files_per_proc,
            shuffle,
            bytes_io=(shuffle == Shuffle.PER_WORKER),
            num_threads=num_io_threads,
        )

        # Iterate through dataset, apply ops, and write out processed data
        if apply_ops:
            columns = self.columns_ctx["all"]["base"]
            for gdf in dataset.to_iter(shuffle=(shuffle is not None),
                                       columns=columns):
                self.apply_ops(gdf,
                               output_path=output_path,
                               writer=writer,
                               dtypes=dtypes)

        # Close writer and write general/specialized metadata
        if writer:
            general_md, special_md = writer.close()

            # Note that we "could" have the special and general metadata
            # written during `writer.close()` (just above) for the single-GPU case.
            # Instead, the metadata logic is separated from the `Writer` object to
            # simplify multi-GPU integration. When using Dask, we cannot assume
            # that the "shared" metadata files can/will be written by the same
            # process that writes the data.
            writer.write_special_metadata(special_md, writer.fs, output_path)
            writer.write_general_metadata(general_md, writer.fs, output_path)
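
A minimal usage sketch for `iterate_online`, again assuming a preconfigured `workflow` instance of the class above; paths and parameter values are illustrative assumptions:

import nvtabular as nvt

# Hypothetical, preconfigured `workflow` (the class shown above).
dataset = nvt.Dataset("raw_data/*.csv", engine="csv")      # illustrative input
workflow.iterate_online(
    dataset,
    output_path="./processed_online",       # illustrative output location
    shuffle=nvt.io.Shuffle.PER_WORKER,      # buffered in host-memory BytesIO (see writer_factory call)
    output_format="parquet",
    out_files_per_proc=2,
    apply_ops=True,
)
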
Example #3
    def write_to_dataset(
        self,
        path,
        dataset,
        apply_ops=False,
        out_files_per_proc=None,
        shuffle=None,
        output_format="parquet",
        iterate=False,
        nfiles=None,
        num_io_threads=0,
        dtypes=None,
    ):
        """Write data to shuffled parquet dataset.

        Assumes statistics are already gathered.
        """
        # Check shuffle argument
        shuffle = _check_shuffle_arg(shuffle)

        if nfiles:
            warnings.warn("nfiles is deprecated. Use out_files_per_proc")
            if out_files_per_proc is None:
                out_files_per_proc = nfiles
        out_files_per_proc = out_files_per_proc or 1

        path = str(path)
        if iterate:
            self.iterate_online(
                dataset,
                output_path=path,
                shuffle=shuffle,
                output_format=output_format,
                out_files_per_proc=out_files_per_proc,
                apply_ops=apply_ops,
                num_io_threads=num_io_threads,
                dtypes=dtypes,
            )
        else:
            self.build_and_process_graph(
                dataset,
                output_path=path,
                record_stats=False,
                shuffle=shuffle,
                output_format=output_format,
                out_files_per_proc=out_files_per_proc,
                apply_ops=apply_ops,
                num_io_threads=num_io_threads,
                dtypes=dtypes,
            )
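
A minimal usage sketch for `write_to_dataset`, assuming a preconfigured `workflow` and that statistics have already been gathered (as the docstring requires); paths and counts are illustrative assumptions:

import nvtabular as nvt

# Hypothetical, preconfigured `workflow` (the class shown above).
dataset = nvt.Dataset("raw_data/*.parquet", engine="parquet")   # illustrative input
workflow.write_to_dataset(
    "./shuffled_out",                       # illustrative output path
    dataset,
    apply_ops=True,
    shuffle=nvt.io.Shuffle.PER_PARTITION,
    out_files_per_proc=4,
    iterate=False,    # False -> build_and_process_graph path; True -> iterate_online path
)
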
Example #4
    def to_hugectr(
        self,
        output_path,
        cats,
        conts,
        labels,
        shuffle=None,
        file_partition_map=None,
        out_files_per_proc=None,
        num_threads=0,
        dtypes=None,
    ):
        """Writes out to a parquet dataset

        Parameters
        ----------
        output_path : string
            Path to write processed/shuffled output data
        cats : list of str
            List of categorical columns
        conts : list of str
            List of continuous columns
        labels : list of str
            List of label columns
        shuffle : nvt.io.Shuffle, optional
            How to shuffle the output dataset. Shuffling is only
            performed if the data is written to disk. For all options,
            other than `None` (which means no shuffling), the partitions
            of the underlying dataset/ddf will be randomly ordered. If
            `PER_PARTITION` is specified, each worker/process will also
            shuffle the rows within each partition before splitting and
            appending the data to a number (`out_files_per_proc`) of output
            files. Output files are distinctly mapped to each worker process.
            If `PER_WORKER` is specified, each worker will follow the same
            procedure as `PER_PARTITION`, but will re-shuffle each file after
            all data is persisted.  This results in a full shuffle of the
            data processed by each worker.  To improve performance, this option
            currently uses host-memory `BytesIO` objects for the intermediate
            persist stage. The `FULL` option is not yet implemented.
        file_partition_map : dict
            Dictionary mapping of output file names to partition indices
            that should be written to that file name.  If this argument
            is passed, only the partitions included in the dictionary
            will be written to disk, and the `out_files_per_proc` argument
            will be ignored.
        out_files_per_proc : integer
            Number of files to create (per process) after
            shuffling the data
        num_threads : integer
            Number of IO threads to use for writing the output dataset.
            For `0` (default), no dedicated IO threads will be used.
        dtypes : dict
            Dictionary containing desired datatypes for output columns.
            Keys are column names, values are datatypes.
        """

        # For now, we must move to the GPU to
        # write an output dataset.
        # TODO: Support CPU-mode output
        self.to_gpu()

        shuffle = _check_shuffle_arg(shuffle)
        ddf = self.to_ddf(shuffle=shuffle)
        if dtypes:
            _meta = _set_dtypes(ddf._meta, dtypes)
            ddf = ddf.map_partitions(_set_dtypes, dtypes, meta=_meta)

        fs = get_fs_token_paths(output_path)[0]
        fs.mkdirs(output_path, exist_ok=True)

        # Output dask_cudf DataFrame to dataset
        _ddf_to_dataset(
            ddf,
            fs,
            output_path,
            shuffle,
            file_partition_map,
            out_files_per_proc,
            cats,
            conts,
            labels,
            "hugectr",
            self.client,
            num_threads,
            self.cpu,
        )
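
A minimal usage sketch for `to_hugectr`, assuming the method belongs to an `nvt.Dataset`-like object; the path and the categorical/continuous/label column lists are illustrative assumptions:

import nvtabular as nvt

# The dataset is assumed to already contain the processed cats/conts/labels columns.
dataset = nvt.Dataset("processed_data/*.parquet", engine="parquet")  # illustrative input
dataset.to_hugectr(
    "./hugectr_out",                        # illustrative output path
    cats=["user_id", "item_id"],            # hypothetical categorical columns
    conts=["price"],                        # hypothetical continuous column
    labels=["click"],                       # hypothetical label column
    shuffle=nvt.io.Shuffle.PER_PARTITION,
    out_files_per_proc=4,
)
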
Example #5
    def to_parquet(
        self,
        output_path,
        shuffle=None,
        preserve_files=False,
        output_files=None,
        out_files_per_proc=None,
        num_threads=0,
        dtypes=None,
        cats=None,
        conts=None,
        labels=None,
        suffix=".parquet",
        partition_on=None,
    ):
        """Writes out to a parquet dataset

        Parameters
        ----------
        output_path : string
            Path to write processed/shuffled output data
        shuffle : nvt.io.Shuffle enum
            How to shuffle the output dataset. For all options,
            other than `None` (which means no shuffling), the partitions
            of the underlying dataset/ddf will be randomly ordered. If
            `PER_PARTITION` is specified, each worker/process will also
            shuffle the rows within each partition before splitting and
            appending the data to a number (`out_files_per_proc`) of output
            files. Output files are distinctly mapped to each worker process.
            If `PER_WORKER` is specified, each worker will follow the same
            procedure as `PER_PARTITION`, but will re-shuffle each file after
            all data is persisted.  This results in a full shuffle of the
            data processed by each worker.  To improve performance, this option
            currently uses host-memory `BytesIO` objects for the intermediate
            persist stage. The `FULL` option is not yet implemented.
        partition_on : str or list(str)
            Columns to use for hive-partitioning.  If this option is used,
            `preserve_files`, `output_files`, and `out_files_per_proc` will
            all be ignored.  Also, the `PER_WORKER` shuffle will not be
            supported.
        preserve_files : bool
            Whether to preserve the original file-to-partition mapping of
            the base dataset. This option is only available if the base
            dataset is known, and if it corresponds to csv or parquet format.
            If True, the `out_files_per_proc` option will be ignored, but the
            `output_files` option will take precedence. Default is False.
        output_files : dict, list or int
            Dictionary mapping of output file names to partition indices.
            If a list of file names is specified, a contiguous range of
            output partitions will be mapped to each file. The same procedure
            is used if an integer is specified, but the file names will be
            written as "part_*". If anything is specified for `output_files`,
            the `out_files_per_proc` argument will be ignored.  Also, if
            a dictionary is specified, excluded partition indices will not
            be written to disk.
        out_files_per_proc : integer
            Number of files to create (per process) after shuffling the
            data. This option will be ignored if `output_files`
            is specified.
        num_threads : integer
            Number of IO threads to use for writing the output dataset.
            For `0` (default), no dedicated IO threads will be used.
        dtypes : dict
            Dictionary containing desired datatypes for output columns.
            Keys are column names, values are datatypes.
        suffix : str or False
            File-name extension to use for all output files. This argument
            is ignored if a specific list of file names is specified using
            the ``output_files`` option. If ``preserve_files=True``, this
            suffix will be appended to the original name of each file,
            unless the original extension is ".csv", ".parquet", ".avro",
            or ".orc" (in which case the old extension will be replaced).
        cats : list of str, optional
            List of categorical columns
        conts : list of str, optional
            List of continuous columns
        labels : list of str, optional
            List of label columns
        """

        shuffle = _check_shuffle_arg(shuffle)

        if isinstance(output_files, dict) or (not output_files and preserve_files):
            # Do not shuffle partitions if we are preserving files or
            # if a specific file-partition mapping is already specified
            ddf = self.to_ddf()
        else:
            ddf = self.to_ddf(shuffle=shuffle)

        # Replace None/False suffix argument with ""
        suffix = suffix or ""

        # Convert `output_files` argument to a dict mapping
        if output_files:

            if isinstance(output_files, int):
                output_files = [f"part_{i}" + suffix for i in range(output_files)]
            if isinstance(output_files, list):
                new = {}
                split = math.ceil(ddf.npartitions / len(output_files))
                for i, fn in enumerate(output_files):
                    start = i * split
                    stop = min(start + split, ddf.npartitions)
                    new[fn] = np.arange(start, stop)
                output_files = new
                suffix = ""  # Don't add a suffix later - Names already include it
            if not isinstance(output_files, dict):
                raise TypeError(f"{type(output_files)} not a supported type for `output_files`.")

        # If we are preserving files, use the stored dictionary,
        # or use file_partition_map to extract the mapping
        elif preserve_files:
            try:
                _output_files = self.base_dataset.file_partition_map
            except AttributeError as e:
                raise AttributeError(
                    f"`to_parquet(..., preserve_files=True)` is not currently supported "
                    f"for datasets with a {type(self.base_dataset.engine)} engine. Check "
                    f"that `dataset.base_dataset` is backed by csv or parquet files."
                ) from e
            if suffix == "":
                output_files = _output_files
            else:
                output_files = {}
                for fn, rgs in _output_files.items():
                    split_fn = fn.split(".")
                    if split_fn[-1] in ("parquet", "avro", "orc", "csv"):
                        output_files[".".join(split_fn[:-1]) + suffix] = rgs
                    else:
                        output_files[fn + suffix] = rgs
            suffix = ""  # Don't add a suffix later - Names already include it

        if dtypes:
            _meta = _set_dtypes(ddf._meta, dtypes)
            ddf = ddf.map_partitions(_set_dtypes, dtypes, meta=_meta)

        fs = get_fs_token_paths(output_path)[0]
        fs.mkdirs(output_path, exist_ok=True)

        # Output dask_cudf DataFrame to dataset
        _ddf_to_dataset(
            ddf,
            fs,
            output_path,
            shuffle,
            output_files,
            out_files_per_proc,
            cats or [],
            conts or [],
            labels or [],
            "parquet",
            self.client,
            num_threads,
            self.cpu,
            suffix=suffix,
            partition_on=partition_on,
        )
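
A minimal usage sketch for `to_parquet`, assuming the method belongs to an `nvt.Dataset`-like object; the path, dtype override, and file count are illustrative assumptions:

import nvtabular as nvt

dataset = nvt.Dataset("processed_data/*.parquet", engine="parquet")  # illustrative input
dataset.to_parquet(
    "./parquet_out",                        # illustrative output path
    shuffle=nvt.io.Shuffle.PER_WORKER,      # full per-worker shuffle via host-memory BytesIO
    output_files=8,                         # expands to part_0.parquet ... part_7.parquet
    num_threads=2,                          # two dedicated IO threads
    dtypes={"price": "float32"},            # hypothetical dtype override
)
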
Example #6
    def apply(
        self,
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=None,
        output_path="./ds_export",
        output_format="parquet",
        out_files_per_proc=None,
        num_io_threads=0,
        dtypes=None,
    ):
        """
        Runs all the preprocessing and feature engineering operators.
        Also, shuffles the data if a `shuffle` option is specified.

        Parameters
        -----------
        dataset : object
        apply_offline : boolean
            Whether to run the operators in offline mode (using the
            Dask task graph)
        record_stats : boolean
            Whether to record dataset statistics. Only available
            when apply_offline=True
        shuffle : nvt.io.Shuffle enum
            How to shuffle the output dataset. Shuffling is only
            performed if the data is written to disk. For all options,
            other than `None` (which means no shuffling), the partitions
            of the underlying dataset/ddf will be randomly ordered. If
            `PER_PARTITION` is specified, each worker/process will also
            shuffle the rows within each partition before splitting and
            appending the data to a number (`out_files_per_proc`) of output
            files. Output files are distinctly mapped to each worker process.
            If `PER_WORKER` is specified, each worker will follow the same
            procedure as `PER_PARTITION`, but will re-shuffle each file after
            all data is persisted.  This results in a full shuffle of the
            data processed by each worker.  To improve performance, this option
            currently uses host-memory `BytesIO` objects for the intermediate
            persist stage. The `FULL` option is not yet implemented.
        output_path : string
            Path to write processed/shuffled output data
        output_format : {"parquet", "hugectr", None}
            Output format to write processed/shuffled data. If None,
            no output dataset will be written (and shuffling skipped).
        out_files_per_proc : integer
            Number of files to create (per process) after
            shuffling the data
        num_io_threads : integer
            Number of IO threads to use for writing the output dataset.
            For `0` (default), no dedicated IO threads will be used.
        dtypes : dict
            Dictionary containing desired datatypes for output columns.
            Keys are column names, values are datatypes.
        """

        # Check shuffle argument
        shuffle = _check_shuffle_arg(shuffle)

        # If no tasks have been loaded then we need to load internal config
        if not self.phases:
            self.finalize()

        # Gather statistics (if apply_offline), and/or transform
        # and write out processed data
        if apply_offline:
            self.build_and_process_graph(
                dataset,
                output_path=output_path,
                record_stats=record_stats,
                shuffle=shuffle,
                output_format=output_format,
                out_files_per_proc=out_files_per_proc,
                num_io_threads=num_io_threads,
                dtypes=dtypes,
            )
        else:
            self.iterate_online(
                dataset,
                output_path=output_path,
                shuffle=shuffle,
                output_format=output_format,
                out_files_per_proc=out_files_per_proc,
                num_io_threads=num_io_threads,
                dtypes=dtypes,
            )
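
A minimal end-to-end sketch for `apply`, assuming `workflow` is a fully configured instance of the class shown above; the input path is an illustrative assumption and `output_path` uses the documented default:

import nvtabular as nvt

# Hypothetical, fully configured `workflow` (the class shown above).
dataset = nvt.Dataset("raw_data/*.parquet", engine="parquet")   # illustrative input
workflow.apply(
    dataset,
    apply_offline=True,                     # gather statistics via the Dask graph
    record_stats=True,
    shuffle=nvt.io.Shuffle.PER_PARTITION,
    output_path="./ds_export",              # the documented default
    output_format="parquet",
    out_files_per_proc=8,
)
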