Example #1
    def _fit(self, dataset: Dataset) -> Preprocessor:
        def get_pd_value_counts(df: pd.DataFrame) -> List[Counter]:
            def get_token_counts(col):
                # Tokenize each row, then flatten: summing a Series of lists
                # concatenates them into one token list.
                token_series = df[col].apply(self.tokenization_fn)
                tokens = token_series.sum()
                return Counter(tokens)

            return [get_token_counts(col) for col in self.columns]

        value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas")
        # Merge the per-batch counters into one Counter per column.
        total_counts = [Counter() for _ in self.columns]
        for batch in value_counts.iter_batches():
            for i, col_value_counts in enumerate(batch):
                total_counts[i].update(col_value_counts)

        def most_common(counter: Counter, n: int):
            return Counter(dict(counter.most_common(n)))

        # Keep only the max_features most frequent tokens per column.
        top_counts = [
            most_common(counter, self.max_features) for counter in total_counts
        ]

        self.stats_ = {
            f"token_counts({col})": counts
            for (col, counts) in zip(self.columns, top_counts)
        }

        return self
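The snippet's core move is merging per-batch token counters into one Counter per column. A minimal local sketch of that aggregation, with plain pandas frames as stand-in batches and a hypothetical tokenizer in place of self.tokenization_fn:

from collections import Counter

import pandas as pd

# Illustrative batches and tokenizer, not from the source.
tokenize = lambda text: text.lower().split()
batches = [pd.DataFrame({"text": ["the cat", "the dog"]}),
           pd.DataFrame({"text": ["a cat"]})]

total = Counter()
for df in batches:
    # Summing a Series of lists concatenates them, as in get_token_counts.
    total.update(df["text"].apply(tokenize).sum())

print(total.most_common(2))  # [('the', 2), ('cat', 2)]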
Example #2
def _get_unique_value_indices(
    dataset: Dataset,
    columns: List[str],
    drop_na_values: bool = False,
    key_format: str = "unique_values({0})",
) -> Dict[str, Dict[str, int]]:
    """If drop_na_values is True, NA values are silently dropped."""

    def get_pd_unique_values(df: pd.DataFrame) -> List[Dict[str, set]]:
        return [{col: set(df[col].unique()) for col in columns}]

    unique_values = dataset.map_batches(get_pd_unique_values, batch_format="pandas")
    final_uniques = {col: set() for col in columns}
    for batch in unique_values.iter_batches():
        for col_uniques in batch:
            for col, batch_uniques in col_uniques.items():
                final_uniques[col].update(batch_uniques)

    for col, uniques in final_uniques.items():
        if drop_na_values:
            final_uniques[col] = {v for v in uniques if not pd.isnull(v)}
        elif any(pd.isnull(v) for v in uniques):
            raise ValueError(
                f"Unable to fit column '{col}' because it contains null values. "
                f"Consider imputing missing values first.")

    # Map each sorted unique value to a consecutive integer index.
    unique_values_with_indices = {
        key_format.format(column): {
            value: index
            for index, value in enumerate(sorted(final_uniques[column]))
        }
        for column in columns
    }
    return unique_values_with_indices
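A local sketch of the core idea, collecting unique values across batches and assigning indices in sorted order (the in-memory batches are hypothetical stand-ins for the Dataset):

import pandas as pd

batches = [pd.DataFrame({"color": ["red", "green"]}),
           pd.DataFrame({"color": ["green", "blue"]})]

final_uniques = set()
for df in batches:
    final_uniques.update(df["color"].unique())

# Sorted unique values get consecutive indices, as in the snippet above.
indices = {value: i for i, value in enumerate(sorted(final_uniques))}
print(indices)  # {'blue': 0, 'green': 1, 'red': 2}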
Example #3
    def _fit(self, dataset: Dataset) -> Preprocessor:
        low = self.quantile_range[0]
        med = 0.50
        high = self.quantile_range[1]

        num_records = dataset.count()
        max_index = num_records - 1
        split_indices = [
            int(percentile * max_index) for percentile in (low, med, high)
        ]

        self.stats_ = {}

        # TODO(matt): Handle the case where a quantile lands between 2 numbers.
        # The current implementation simply chooses the closest index, which
        # affects small datasets more than large ones.
        for col in self.columns:
            # Bind `col` as a default argument so the lambda keeps its own
            # column rather than the loop's last value.
            filtered_dataset = dataset.map_batches(
                lambda df, col=col: df[[col]], batch_format="pandas")
            sorted_dataset = filtered_dataset.sort(col)
            # split_at_indices yields four splits; the quantile values sit at
            # the start of the last three.
            _, low_split, med_split, high_split = sorted_dataset.split_at_indices(
                split_indices)

            def _get_first_value(ds: Dataset, c: str):
                return ds.take(1)[0][c]

            low_val = _get_first_value(low_split, col)
            med_val = _get_first_value(med_split, col)
            high_val = _get_first_value(high_split, col)

            self.stats_[f"low_quantile({col})"] = low_val
            self.stats_[f"median({col})"] = med_val
            self.stats_[f"high_quantile({col})"] = high_val

        return self
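The quantiles are found by sorting and indexing rather than interpolating. A minimal in-memory sketch of that truncation behavior:

values = [5, 1, 9, 3, 7]
low_q, high_q = 0.25, 0.75

sorted_vals = sorted(values)
max_index = len(sorted_vals) - 1
# int() truncates, so a quantile landing between two numbers snaps to an index.
low, med, high = (sorted_vals[int(q * max_index)] for q in (low_q, 0.50, high_q))
print(low, med, high)  # 3 5 7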
Example #4
def _get_unique_value_indices(
    dataset: Dataset,
    columns: List[str],
    drop_na_values: bool = False,
    key_format: str = "unique_values({0})",
    limit: Optional[Dict[str, int]] = None,
) -> Dict[str, Dict[str, int]]:
    """If drop_na_values is True, NA values are silently dropped."""
    limit = limit or {}
    for column in limit:
        if column not in columns:
            raise ValueError(
                f"You set a limit for {column}, which is not present in {columns}."
            )

    def get_pd_value_counts(df: pd.DataFrame) -> List[Dict[str, Counter]]:
        return [
            {
                col: Counter(df[col].value_counts(dropna=False).to_dict())
                for col in columns
            }
        ]

    value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas")
    final_counters = {col: Counter() for col in columns}
    for batch in value_counts.iter_batches():
        for col_value_counts in batch:
            for col, batch_counts in col_value_counts.items():
                final_counters[col] += batch_counts

    # Check for NA values.
    for col in columns:
        if drop_na_values:
            counter = final_counters[col]
            final_counters[col] = Counter(
                {k: v for k, v in counter.items() if not pd.isnull(k)}
            )
        elif any(pd.isnull(k) for k in final_counters[col]):
            raise ValueError(
                f"Unable to fit column '{col}' because it contains null"
                f" values. Consider imputing missing values first."
            )

    unique_values_with_indices = dict()
    for column in columns:
        if column in limit:
            # Output sorted by descending frequency.
            unique_values_with_indices[key_format.format(column)] = {
                value: index
                for index, (value, _) in enumerate(
                    final_counters[column].most_common(limit[column])
                )
            }
        else:
            # Output sorted by value.
            unique_values_with_indices[key_format.format(column)] = {
                value: index
                for index, value in enumerate(sorted(final_counters[column]))
            }
    return unique_values_with_indices
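A local sketch of the limit branch, assuming two hypothetical pandas batches: with a limit set, indices follow descending frequency instead of sort order:

from collections import Counter

import pandas as pd

batches = [pd.DataFrame({"fruit": ["apple", "apple", "pear"]}),
           pd.DataFrame({"fruit": ["pear", "pear", "kiwi"]})]

counter = Counter()
for df in batches:
    counter += Counter(df["fruit"].value_counts(dropna=False).to_dict())

limit = 2
indices = {value: i for i, (value, _) in enumerate(counter.most_common(limit))}
print(indices)  # {'pear': 0, 'apple': 1}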
Example #5
    def _transform(self, dataset: Dataset) -> Dataset:
        # TODO(matt): Expose `batch_size` or similar configurability.
        # The default may be too small for some datasets and too large for others.

        dataset_format = dataset._dataset_format()
        if dataset_format not in ("pandas", "arrow"):
            raise ValueError(
                f"Unsupported Dataset format: '{dataset_format}'. Only 'pandas' and "
                "'arrow' Dataset formats are supported.")

        transform_type = self._determine_transform_to_use(dataset_format)

        if transform_type == "pandas":
            return dataset.map_batches(self._transform_pandas,
                                       batch_format="pandas")
        elif transform_type == "arrow":
            return dataset.map_batches(self._transform_arrow,
                                       batch_format="pyarrow")
        else:
            raise ValueError(
                "Invalid transform type returned from _determine_transform_to_use; "
                f'"pandas" and "arrow" allowed, but got: {transform_type}')
Example #6
def _get_most_frequent_values(dataset: Dataset,
                              *columns: str) -> Dict[str, Union[str, Number]]:
    columns = list(columns)

    def get_pd_value_counts(df: pd.DataFrame) -> List[Counter]:
        return [Counter(df[col].value_counts().to_dict()) for col in columns]

    value_counts = dataset.map_batches(get_pd_value_counts,
                                       batch_format="pandas")
    # Merge the per-batch counters, one per column, in positional order.
    final_counters = [Counter() for _ in columns]
    for batch in value_counts.iter_batches():
        for i, col_value_counts in enumerate(batch):
            final_counters[i] += col_value_counts

    return {
        f"most_frequent({column})": final_counters[i].most_common(1)[0][0]
        for i, column in enumerate(columns)
    }
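The same positional merge, sketched locally with hypothetical in-memory batches:

from collections import Counter

import pandas as pd

columns = ["fruit"]
batches = [pd.DataFrame({"fruit": ["apple", "pear", "pear"]}),
           pd.DataFrame({"fruit": ["pear", "kiwi"]})]

final_counters = [Counter() for _ in columns]
for df in batches:
    for i, col in enumerate(columns):
        final_counters[i] += Counter(df[col].value_counts().to_dict())

print(final_counters[0].most_common(1)[0][0])  # pear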
Example #7
def _get_most_frequent_values(dataset: Dataset,
                              *columns: str) -> Dict[str, Union[str, Number]]:
    columns = list(columns)

    def get_pd_value_counts(df: pd.DataFrame) -> List[Dict[str, Counter]]:
        return [{
            col: Counter(df[col].value_counts().to_dict())
            for col in columns
        }]

    value_counts = dataset.map_batches(get_pd_value_counts,
                                       batch_format="pandas")
    # Merge the per-batch counters, keyed by column name.
    final_counters = {col: Counter() for col in columns}
    for batch in value_counts.iter_batches():
        for col_value_counts in batch:
            for col, batch_counts in col_value_counts.items():
                final_counters[col] += batch_counts

    return {
        f"most_frequent({column})": final_counters[column].most_common(1)[0][0]
        for column in columns
    }
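Compared with Example #6, this variant keys the per-batch counters by column name instead of relying on positional alignment between batch entries and columns, which makes the merge step robust to reordering.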
Example #8
def _get_most_frequent_values(dataset: Dataset,
                              *columns: str) -> Dict[str, Union[str, Number]]:
    # TODO(matt): Optimize this.
    results = {}
    for column in columns:
        # Remove nulls. Bind `column` as a default argument so the lambda
        # keeps its own column rather than the loop's last value.
        nonnull_dataset = dataset.map_batches(
            lambda df, column=column: df.dropna(subset=[column]),
            batch_format="pandas")
        # Count values.
        counts = nonnull_dataset.groupby(column).count()
        # Find the max count.
        max_aggregate = counts.aggregate(Max("count()"))
        max_count = max_aggregate["max(count())"]
        # Keep only the values that reach max_count.
        most_frequent_values = counts.map_batches(
            lambda df, max_count=max_count: df.drop(
                df[df["count()"] < max_count].index),
            batch_format="pandas",
        )
        # Take the first (sorted) value.
        most_frequent_value_count = most_frequent_values.take(1)[0]
        most_frequent_value = most_frequent_value_count[column]
        results[f"most_frequent({column})"] = most_frequent_value

    return results
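A single-frame pandas sketch of the groupby route, including the tie-breaking by sorted group key:

import pandas as pd

df = pd.DataFrame({"fruit": ["apple", "pear", "pear", None]})
counts = df.dropna(subset=["fruit"]).groupby("fruit").size()
max_count = counts.max()
# Ties resolve to the first value in sorted (group-key) order.
most_frequent = counts[counts == max_count].index.min()
print(most_frequent)  # pear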
Example #9
    def _transform(self, dataset: Dataset) -> Dataset:
        # TODO(matt): Expose `batch_size` or similar configurability.
        # The default may be too small for some datasets and too large for others.
        return dataset.map_batches(self._transform_pandas,
                                   batch_format="pandas")