Example 1: Filter op transform
    def transform(self, col_selector: ColumnSelector,
                  df: DataFrameType) -> DataFrameType:
        # Apply the filter callable stored on the op to the full DataFrame
        filtered = self.f(df)
        if _is_dataframe_object(filtered):
            # The callable returned an already-filtered DataFrame
            new_df = filtered
        elif _is_series_object(filtered) and filtered.dtype == bool:
            # The callable returned a boolean mask; use it to subset the rows
            new_df = df[filtered]
        else:
            raise ValueError(
                f"Invalid output from filter op: {filtered.__class__}")

        new_df.reset_index(drop=True, inplace=True)
        return new_df
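The op delegates row selection to a user-supplied callable, self.f, which may return either an already-filtered DataFrame or a boolean mask Series; anything else raises. A minimal pandas-only sketch of the same dispatch logic (apply_filter and the toy frame are illustrative names, not part of the library):

    import pandas as pd

    def apply_filter(df, f):
        # Same dispatch as the transform above: accept either a filtered
        # DataFrame or a boolean mask from the filter callable.
        filtered = f(df)
        if isinstance(filtered, pd.DataFrame):
            new_df = filtered
        elif isinstance(filtered, pd.Series) and filtered.dtype == bool:
            new_df = df[filtered]
        else:
            raise ValueError(f"Invalid output from filter op: {filtered.__class__}")
        return new_df.reset_index(drop=True)

    df = pd.DataFrame({"a": [1, -2, 3], "b": [10, 20, 30]})
    # Mask style: keep rows where "a" is positive
    print(apply_filter(df, lambda d: d["a"] > 0))
    # DataFrame style: the callable does the subsetting itself
    print(apply_filter(df, lambda d: d[d["a"] > 0]))
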
Example 2: partition-aware lag-difference transform
    def transform(self, columns: ColumnNames,
                  df: DataFrameType) -> DataFrameType:
        # compute a mask indicating partition boundaries, handling multiple partition_cols
        # represent partition boundaries by None values
        output = {}
        for shift in self.shifts:
            mask = df[self.partition_cols] == df[self.partition_cols].shift(
                shift)
            if _is_dataframe_object(mask):
                # Multiple partition columns: collapse the per-column comparison
                # into a single row-wise boolean mask
                mask = mask.fillna(False).all(axis=1)
            # Convert False -> None so the multiplication below nulls out
            # any difference that would cross a partition boundary
            mask[mask == False] = None  # noqa pylint: disable=singleton-comparison

            for col in columns:
                output[self._column_name(
                    col, shift)] = (df[col] - df[col].shift(shift)) * mask
        return type(df)(output)
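The False-to-None trick is what keeps lag differences from leaking across partitions: multiplying by None propagates a null, so the first shift rows of every partition come out as missing. A small pandas-only illustration with a single partition column and a single shift (the user/x columns and values are made up):

    import pandas as pd

    df = pd.DataFrame({
        "user": ["a", "a", "a", "b", "b"],
        "x": [1, 4, 9, 2, 7],
    })
    shift = 1

    # True where the row shares a partition with the row `shift` positions back
    mask = df["user"] == df["user"].shift(shift)
    # Replace False with None so the product below nulls out boundary rows
    mask[mask == False] = None  # noqa pylint: disable=singleton-comparison

    diff = (df["x"] - df["x"].shift(shift)) * mask
    print(diff)
    # Rows 0 and 3 (the first row of each "user" partition) come out as null
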
Example 3: NVTabular Dataset constructor
    def __init__(
        self,
        path_or_source,
        engine=None,
        npartitions=None,
        part_size=None,
        part_mem_fraction=None,
        storage_options=None,
        dtypes=None,
        client=None,
        cpu=None,
        base_dataset=None,
        **kwargs,
    ):
        self.dtypes = dtypes
        self.client = client

        # Check if we are keeping data in cpu memory
        self.cpu = cpu or False

        # Keep track of base dataset (optional)
        self.base_dataset = base_dataset or self

        # For now, let's warn the user that "cpu mode" is experimental
        if self.cpu:
            warnings.warn(
                "Initializing an NVTabular Dataset in CPU mode. "
                "This is an experimental feature with extremely limited support!"
            )

        npartitions = npartitions or 1
        if isinstance(path_or_source, dask.dataframe.DataFrame) or _is_dataframe_object(
            path_or_source
        ):
            # User is passing in a <dask.dataframe|cudf|pd>.DataFrame
            # Use DataFrameDatasetEngine
            _path_or_source = _convert_data(
                path_or_source, cpu=self.cpu, to_collection=True, npartitions=npartitions
            )
            # Check if this is a collection that has now moved between host <-> device
            moved_collection = isinstance(path_or_source, dask.dataframe.DataFrame) and (
                not isinstance(_path_or_source._meta, type(path_or_source._meta))
            )
            if part_size:
                warnings.warn("part_size is ignored for DataFrame input.")
            if part_mem_fraction:
                warnings.warn("part_mem_fraction is ignored for DataFrame input.")
            self.engine = DataFrameDatasetEngine(
                _path_or_source, cpu=self.cpu, moved_collection=moved_collection
            )
        else:
            if part_size:
                # If a specific partition size is given, use it directly
                part_size = parse_bytes(part_size)
            else:
                # Otherwise, derive part_size from a fraction of total memory
                part_mem_fraction = part_mem_fraction or 0.125
                assert 0.0 < part_mem_fraction < 1.0
                if part_mem_fraction > 0.25:
                    warnings.warn(
                        "Using very large partition sizes for Dask. "
                        "Memory-related errors are likely."
                    )
                part_size = int(device_mem_size(kind="total", cpu=self.cpu) * part_mem_fraction)

            # Engine-agnostic path handling
            paths = path_or_source
            if hasattr(paths, "name"):
                paths = stringify_path(paths)
            if isinstance(paths, str):
                paths = [paths]
            paths = sorted(paths, key=natural_sort_key)

            storage_options = storage_options or {}
            # If engine is not provided, infer it from the file extension of paths[0]
            if engine is None:
                engine = paths[0].split(".")[-1]
            if isinstance(engine, str):
                if engine == "parquet":
                    self.engine = ParquetDatasetEngine(
                        paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                    )
                elif engine == "csv":
                    self.engine = CSVDatasetEngine(
                        paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                    )
                elif engine == "avro":
                    try:
                        from .avro import AvroDatasetEngine
                    except ImportError as e:
                        raise RuntimeError(
                            "Failed to import AvroDatasetEngine. Make sure uavro is installed."
                        ) from e

                    self.engine = AvroDatasetEngine(
                        paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                    )
                else:
                    raise ValueError("Only parquet, csv, and avro supported (for now).")
            else:
                self.engine = engine(
                    paths, part_size, cpu=self.cpu, storage_options=storage_options
                )
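Putting the constructor together: DataFrame inputs go through DataFrameDatasetEngine (part_size and part_mem_fraction are ignored), while path inputs get a file-based engine inferred from the extension and a part_size derived from either an explicit byte string or a memory fraction. A hedged usage sketch, assuming the class is nvtabular.Dataset and that the example files exist (the paths and the toy column "a" are illustrative):

    import pandas as pd
    import nvtabular as nvt

    # In-memory input: wrapped in a Dask collection with the requested
    # number of partitions; cpu=True keeps it in host memory.
    df = pd.DataFrame({"a": [1, 2, 3, 4]})
    ds_from_df = nvt.Dataset(df, npartitions=2, cpu=True)

    # File input: engine inferred from the ".parquet" extension,
    # partition size derived from 10% of total memory.
    ds_parquet = nvt.Dataset("data/train.parquet", part_mem_fraction=0.1)

    # File input with an explicit engine and a fixed partition size.
    ds_csv = nvt.Dataset(["data/day_0.csv"], engine="csv", part_size="256MB")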