Example #1
    def _fit(self, dataset: Dataset) -> Preprocessor:
        low = self.quantile_range[0]
        med = 0.50
        high = self.quantile_range[1]

        num_records = dataset.count()
        max_index = num_records - 1
        split_indices = [
            int(percentile * max_index) for percentile in (low, med, high)
        ]

        self.stats_ = {}

        # TODO(matt): Handle case where quantile lands between 2 numbers.
        # The current implementation will simply choose the closest index.
        # This will affect the results of small datasets more than large datasets.
        for col in self.columns:
            filtered_dataset = dataset.map_batches(
                lambda df: df[[col]], batch_format="pandas"
            )
            sorted_dataset = filtered_dataset.sort(col)
            # Split at the quantile indices; the first split (rows below the
            # low-quantile index) is discarded.
            _, low_ds, med_ds, high_ds = sorted_dataset.split_at_indices(split_indices)

            def _get_first_value(ds: Dataset, c: str):
                return ds.take(1)[0][c]

            low_val = _get_first_value(low_ds, col)
            med_val = _get_first_value(med_ds, col)
            high_val = _get_first_value(high_ds, col)

            self.stats_[f"low_quantile({col})"] = low_val
            self.stats_[f"median({col})"] = med_val
            self.stats_[f"high_quantile({col})"] = high_val

        return self
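
A minimal standalone sketch (record count and quantile range made up for illustration) of how the index arithmetic above maps quantiles onto row positions:

num_records = 11
max_index = num_records - 1  # 10
split_indices = [int(p * max_index) for p in (0.25, 0.50, 0.75)]
print(split_indices)  # [2, 5, 7]; int() truncates toward the lower index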
Example #2
def train_test_split(
    dataset: Dataset,
    test_size: Union[int, float],
    *,
    shuffle: bool = False,
    seed: Optional[int] = None,
) -> Tuple[Dataset, Dataset]:
    """Split a Dataset into train and test subsets.

    Example:
        .. code-block:: python

            import ray
            from ray.ml import train_test_split

            ds = ray.data.range(8)
            train, test = train_test_split(ds, test_size=0.25)
            print(train.take())  # [0, 1, 2, 3, 4, 5]
            print(test.take())  # [6, 7]

    Args:
        dataset: Dataset to split.
        test_size: If float, should be between 0.0 and 1.0 and represent the proportion
            of the dataset to include in the test split. If int, represents the
            absolute number of test samples. The train split will always be the
            complement of the test split.
        shuffle: Whether or not to globally shuffle the dataset before splitting.
            Defaults to False. This may be a very expensive operation with large
            datasets.
        seed: Fix the random seed to use for shuffle, otherwise one will be chosen
            based on system randomness. Ignored if ``shuffle=False``.

    Returns:
        Train and test subsets as two Datasets.
    """
    if shuffle:
        dataset = dataset.random_shuffle(seed=seed)

    if not isinstance(test_size, (int, float)):
        raise TypeError(f"`test_size` must be an int or float, got {type(test_size)}.")
    if isinstance(test_size, float):
        if test_size <= 0 or test_size >= 1:
            raise ValueError(
                "If `test_size` is a float, it must be bigger than 0 and smaller than "
                f"1. Got {test_size}."
            )
        return dataset.split_proportionately([1 - test_size])
    else:
        dataset_length = dataset.count()
        if test_size <= 0 or test_size >= dataset_length:
            raise ValueError(
                "If `test_size` is an int, it must be bigger than 0 and smaller than "
                f"the size of the dataset ({dataset_length}). Got {test_size}."
            )
        return dataset.split_at_indices([dataset_length - test_size])
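
A hedged usage sketch for the integer test_size branch (the import path is taken from the docstring above; dataset size chosen for illustration):

import ray
from ray.ml import train_test_split

ds = ray.data.range(10)
# Absolute test size, with a seeded global shuffle before splitting.
train, test = train_test_split(ds, test_size=3, shuffle=True, seed=42)
assert train.count() == 7
assert test.count() == 3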
Example #3
def _get_unique_value_indices(
    dataset: Dataset,
    columns: List[str],
    drop_na_values: bool = False,
    key_format: str = "unique_values({0})",
) -> Dict[str, Dict[str, int]]:
    """If drop_na_values is True, will silently drop NA values."""
    def get_pd_unique_values(df: pd.DataFrame) -> List[Dict[str, set]]:
        return [{col: set(df[col].unique()) for col in columns}]

    uniques = dataset.map_batches(get_pd_unique_values, batch_format="pandas")
    final_uniques = {col: set() for col in columns}
    for batch in uniques.iter_batches():
        for col_uniques in batch:
            for col, values in col_uniques.items():
                final_uniques[col].update(values)

    for col, values in final_uniques.items():
        if drop_na_values:
            final_uniques[col] = {v for v in values if not pd.isnull(v)}
        else:
            if any(pd.isnull(v) for v in values):
                raise ValueError(
                    f"Unable to fit column '{col}' because it contains null values. "
                    f"Consider imputing missing values first.")

    unique_values_with_indices = {
        key_format.format(column):
        {k: j
         for j, k in enumerate(sorted(final_uniques[column]))}
        for column in columns
    }
    return unique_values_with_indices
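
A toy sketch of the final indexing step above, run on plain Python values instead of a Ray Dataset (column name and values made up):

final_uniques = {"color": {"red", "blue", "green"}}
key_format = "unique_values({0})"
print({
    key_format.format(col): {v: i for i, v in enumerate(sorted(vals))}
    for col, vals in final_uniques.items()
})
# {'unique_values(color)': {'blue': 0, 'green': 1, 'red': 2}}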
Example #4
    def _fit(self, dataset: Dataset) -> Preprocessor:
        def get_pd_value_counts(df: pd.DataFrame) -> List[Counter]:
            def get_token_counts(col):
                token_series = df[col].apply(self.tokenization_fn)
                tokens = token_series.sum()
                return Counter(tokens)

            return [get_token_counts(col) for col in self.columns]

        value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas")
        total_counts = [Counter() for _ in self.columns]
        for batch in value_counts.iter_batches():
            for i, col_value_counts in enumerate(batch):
                total_counts[i].update(col_value_counts)

        def most_common(counter: Counter, n: int):
            return Counter(dict(counter.most_common(n)))

        top_counts = [
            most_common(counter, self.max_features) for counter in total_counts
        ]

        self.stats_ = {
            f"token_counts({col})": counts
            for (col, counts) in zip(self.columns, top_counts)
        }

        return self
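
A toy illustration of the per-batch token counting above, with a plain whitespace tokenizer standing in for self.tokenization_fn (data made up):

import pandas as pd
from collections import Counter

df = pd.DataFrame({"text": ["a b a", "b c"]})
tokens = df["text"].apply(str.split).sum()  # summing lists concatenates them
print(Counter(tokens))  # Counter({'a': 2, 'b': 2, 'c': 1})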
Example #5
def _get_unique_value_indices(
    dataset: Dataset,
    columns: List[str],
    drop_na_values: bool = False,
    key_format: str = "unique_values({0})",
    limit: Optional[Dict[str, int]] = None,
) -> Dict[str, Dict[str, int]]:
    """If drop_na_values is True, will silently drop NA values."""
    limit = limit or {}
    for column in limit:
        if column not in columns:
            raise ValueError(
                f"You set limit for {column}, which is not present in {columns}."
            )

    def get_pd_value_counts(df: pd.DataFrame) -> List[Dict[str, Counter]]:
        result = [
            {
                col: Counter(df[col].value_counts(dropna=False).to_dict())
                for col in columns
            }
        ]
        return result

    value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas")
    final_counters = {col: Counter() for col in columns}
    for batch in value_counts.iter_batches():
        for col_value_counts in batch:
            for col, counts in col_value_counts.items():
                final_counters[col] += counts

    # Check whether there are any NA values.
    for col in columns:
        if drop_na_values:
            counter = final_counters[col]
            counter_dict = dict(counter)
            sanitized_dict = {k: v for k, v in counter_dict.items() if not pd.isnull(k)}
            final_counters[col] = Counter(sanitized_dict)
        else:
            if any(pd.isnull(k) for k in final_counters[col]):
                raise ValueError(
                    f"Unable to fit column '{col}' because it contains null"
                    f" values. Consider imputing missing values first."
                )

    unique_values_with_indices = dict()
    for column in columns:
        if column in limit:
            # Output sorted by frequency.
            unique_values_with_indices[key_format.format(column)] = {
                value: j
                for j, (value, _count) in enumerate(
                    final_counters[column].most_common(limit[column])
                )
            }
        else:
            # Output sorted by value.
            unique_values_with_indices[key_format.format(column)] = {
                k: j for j, k in enumerate(sorted(final_counters[column].keys()))
            }
    return unique_values_with_indices
Example #6
    def _fit(self, dataset: Dataset) -> Preprocessor:
        if self.strategy == "mean":
            aggregates = [Mean(col) for col in self.columns]
            self.stats_ = dataset.aggregate(*aggregates)
        elif self.strategy == "most_frequent":
            self.stats_ = _get_most_frequent_values(dataset, *self.columns)

        return self
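
A minimal sketch of the "mean" strategy above; the 'mean(A)' key assumes Ray's default aggregate naming, consistent with the 'max(count())' key used in Example #10:

import ray
from ray.data.aggregate import Mean

ds = ray.data.from_items([{"A": 1.0}, {"A": 2.0}, {"A": 3.0}])
print(ds.aggregate(Mean("A")))  # e.g. {'mean(A)': 2.0}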
Example #7
    def _transform(self, dataset: Dataset) -> Dataset:
        # TODO(matt): Expose `batch_size` or similar configurability.
        # The default may be too small for some datasets and too large for others.

        dataset_format = dataset._dataset_format()
        if dataset_format not in ("pandas", "arrow"):
            raise ValueError(
                f"Unsupported Dataset format: '{dataset_format}'. Only 'pandas' and "
                "'arrow' Dataset formats are supported.")

        transform_type = self._determine_transform_to_use(dataset_format)

        if transform_type == "pandas":
            return dataset.map_batches(self._transform_pandas,
                                       batch_format="pandas")
        elif transform_type == "arrow":
            return dataset.map_batches(self._transform_arrow,
                                       batch_format="pyarrow")
        else:
            raise ValueError(
                "Invalid transform type returned from _determine_transform_to_use; "
                f'"pandas" and "arrow" allowed, but got: {transform_type}')
Example #8
def _get_most_frequent_values(dataset: Dataset,
                              *columns: str) -> Dict[str, Union[str, Number]]:
    columns = list(columns)

    def get_pd_value_counts(df: pd.DataFrame) -> List[Counter]:
        return [Counter(df[col].value_counts().to_dict()) for col in columns]

    value_counts = dataset.map_batches(get_pd_value_counts,
                                       batch_format="pandas")
    final_counters = [Counter() for _ in columns]
    for batch in value_counts.iter_batches():
        for i, col_value_counts in enumerate(batch):
            final_counters[i] += col_value_counts

    return {
        f"most_frequent({column})": final_counters[i].most_common(1)[0][0]
        for i, column in enumerate(columns)
    }
Example #9
def _get_most_frequent_values(dataset: Dataset,
                              *columns: str) -> Dict[str, Union[str, Number]]:
    columns = list(columns)

    def get_pd_value_counts(df: pd.DataFrame) -> List[Dict[str, Counter]]:
        return [{
            col: Counter(df[col].value_counts().to_dict())
            for col in columns
        }]

    value_counts = dataset.map_batches(get_pd_value_counts,
                                       batch_format="pandas")
    final_counters = {col: Counter() for col in columns}
    for batch in value_counts.iter_batches():
        for col_value_counts in batch:
            for col, counts in col_value_counts.items():
                final_counters[col] += counts

    return {
        f"most_frequent({column})": final_counters[column].most_common(1)[0][0]
        for column in columns
    }
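
A toy illustration of the Counter-merging idiom above, with made-up per-batch results:

from collections import Counter

batches = [
    {"fruit": Counter({"apple": 2, "pear": 1})},
    {"fruit": Counter({"apple": 1, "plum": 1})},
]
final_counters = {"fruit": Counter()}
for batch in batches:
    for col, counts in batch.items():
        final_counters[col] += counts
print(final_counters["fruit"].most_common(1)[0][0])  # 'apple'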
Example #10
def _get_most_frequent_values(dataset: Dataset,
                              *columns: str) -> Dict[str, Union[str, Number]]:
    # TODO(matt): Optimize this.
    results = {}
    for column in columns:
        # Remove nulls.
        nonnull_dataset = dataset.map_batches(
            lambda df: df.dropna(subset=[column]), batch_format="pandas")
        # Count values.
        counts = nonnull_dataset.groupby(column).count()
        # Find max count.
        max_aggregate = counts.aggregate(Max("count()"))
        max_count = max_aggregate["max(count())"]
        # Find values with max_count.
        most_frequent_values = counts.map_batches(
            lambda df: df.drop(df[df["count()"] < max_count].index),
            batch_format="pandas",
        )
        # Take first (sorted) value.
        most_frequent_value_count = most_frequent_values.take(1)[0]
        most_frequent_value = most_frequent_value_count[column]
        results[f"most_frequent({column})"] = most_frequent_value

    return results
Example #11
    def _transform(self, dataset: Dataset) -> Dataset:
        # TODO(matt): Expose `batch_size` or similar configurability.
        # The default may be too small for some datasets and too large for others.
        return dataset.map_batches(self._transform_pandas, batch_format="pandas")
Example #12
    def _fit(self, dataset: Dataset) -> Preprocessor:
        # Compute the min and max of every column in a single pass.
        aggregates = [Agg(col) for Agg in [Min, Max] for col in self.columns]
        self.stats_ = dataset.aggregate(*aggregates)
        return self
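
A quick sketch of how the nested comprehension above orders the aggregates, using hypothetical columns ["A", "B"]:

from ray.data.aggregate import Min, Max

columns = ["A", "B"]
aggregates = [Agg(col) for Agg in [Min, Max] for col in columns]
# Equivalent to [Min("A"), Min("B"), Max("A"), Max("B")]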
Example #13
    def _fit(self, dataset: Dataset) -> Preprocessor:
        # Compute the mean and standard deviation of every column in one pass.
        mean_aggregates = [Mean(col) for col in self.columns]
        std_aggregates = [Std(col, ddof=self.ddof) for col in self.columns]
        self.stats_ = dataset.aggregate(*mean_aggregates, *std_aggregates)
        return self
Example #14
    def _get_first_value(ds: Dataset, c: str):
        # Return the value of column `c` from the first row of the dataset.
        return ds.take(1)[0][c]
Example #15
def _get_unique_values(dataset: Dataset, column: str) -> Set[str]:
    agg_ds = dataset.groupby(column).count()
    # TODO: Support an upper limit by using `agg_ds.take(N)` instead.
    return {row[column] for row in agg_ds.iter_rows()}
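
A toy check of the groupby-based trick above for collecting unique values (column name and data made up):

import ray

ds = ray.data.from_items([{"color": c} for c in ["red", "blue", "red"]])
agg_ds = ds.groupby("color").count()
print({row["color"] for row in agg_ds.iter_rows()})  # {'blue', 'red'} (order may vary)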
Example #16
def get_max(ds: Dataset):
    return ds.aggregate(Max("value"))
Example #17
    def get_max_a(ds: Dataset):
        # Calculate max value for column A.
        max_a = ds.aggregate(Max("A"))
        return max_a

    def execute_if_needed(self, ds: Dataset) -> Dataset:
        # Fully execute the dataset once, remembering it by UUID so repeated
        # calls don't trigger re-execution.
        if ds._uuid not in self.set:
            ds = ds.fully_executed()
            self.set.add(ds._uuid)
        return ds