Esempio n. 1
0
    def fit_transform(self, input_df: XDataFrame) -> XDataFrame:
        """Drop duplicated columns and return the remaining ones.

        The frame is transposed so that every column becomes a row, duplicated
        rows are dropped (the first occurrence is kept), and the labels that
        survive are stored in ``self._selected_cols``.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : ``input_df`` restricted to ``self._selected_cols``.
        """
        is_gpu_frame = cudf_is_available() and isinstance(input_df, cudf.DataFrame)
        # A cuDF frame is moved to pandas for the duplicate detection only;
        # the result below is still sliced out of the original input.
        frame = input_df.to_pandas() if is_gpu_frame else input_df
        unique_rows = frame.T.drop_duplicates(keep="first")
        self._selected_cols = unique_rows.index.values.tolist()
        return input_df[self._selected_cols]
Esempio n. 2
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Apply ``self._lambda_func`` to each selected column.

        For every selected column a new column named
        ``self._output_prefix + col + self._output_suffix`` is added. When
        ``self._fillna`` is not ``None`` the source column is filled with it
        before the function is applied. The work is done on a pandas copy of
        the input; a cuDF input is converted back to cuDF before returning.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Only the generated columns when ``self._drop_origin``
                is set, otherwise the original columns plus the generated ones.
        Raises:
            RuntimeError: If ``input_df`` is neither a pandas nor a cuDF frame.
        """
        if isinstance(input_df, pd.DataFrame):
            work_df = input_df.copy()
        elif cudf_is_available() and isinstance(input_df, cudf.DataFrame):
            work_df = input_df.to_pandas()
        else:
            raise RuntimeError("Unexpected data type: {}".format(type(input_df)))

        # An empty/None ``_input_cols`` means "use every column of the frame".
        target_cols = self._input_cols or work_df.columns.tolist()
        excluded = self._exclude_cols
        if len(excluded) > 0:
            target_cols = [name for name in target_cols if name not in excluded]

        generated_cols = []
        for name in target_cols:
            out_name = self._output_prefix + name + self._output_suffix
            source = work_df[name]
            if self._fillna is not None:
                source = source.fillna(self._fillna)
            work_df[out_name] = source.apply(self._lambda_func)
            generated_cols.append(out_name)

        # Hand back the same frame flavour the caller passed in.
        if cudf_is_available() and isinstance(input_df, cudf.DataFrame):
            work_df = cudf.from_pandas(work_df)

        return work_df[generated_cols] if self._drop_origin else work_df
Esempio n. 3
0
def reduce_mem_usage(df: XDataFrame,
                     verbose: bool = True,
                     debug: bool = True) -> XDataFrame:
    """Compress a data frame with ``compress_df`` and report the memory saved.

    A cuDF input is converted to pandas before compression and is not
    converted back here, so the caller receives whatever ``compress_df``
    returns for the pandas frame.

    Args:
        df (XDataFrame): Data frame to compress.
        verbose (bool): Print the summary message when true.
        debug (bool): Send the summary message to ``logging.debug`` when true.
    Returns:
        XDataFrame : The compressed data frame.
    """
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage().sum() / bytes_per_mb

    frame = df.to_pandas() if is_cudf(df) else df
    df = compress_df(frame)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    reduction = (start_mem - end_mem) / start_mem

    msg = (f"Mem. usage decreased to {end_mem:5.2f} MB"
           f" ({reduction * 100:.1f} % reduction)")

    if verbose:
        print(msg)
    if debug:
        logging.debug(msg)

    return df
Esempio n. 4
0
    def fit(self, input_df: XDataFrame) -> None:
        """Select features by dropping one of each highly correlated pair.

        Columns are compared pairwise in their original order. When the
        absolute correlation coefficient of a pair exceeds
        ``self._threshold``, the later column of the pair is marked as
        removed. Compared pairs and removed pairs are persisted under
        ``self.save_path`` (``seen_feats_pairs.pkl`` and
        ``removed_feats_pairs.pkl``) and reloaded on the next call, so pairs
        that were already examined are skipped.

        With ``self.dry_run`` set, no correlation is computed and nothing is
        written: the selection is derived from the stored removals only.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            None. The surviving column names are stored in
            ``self._selected_cols``.
        """
        org_cols = input_df.columns.tolist()

        input_df = (input_df.to_pandas()
                    if isinstance(input_df, cudf.DataFrame) else input_df)

        seen_path = self.save_path / "seen_feats_pairs.pkl"
        removed_path = self.save_path / "removed_feats_pairs.pkl"

        # NOTE(review): load_pickle presumably unpickles these files, which is
        # only safe when self.save_path is a trusted location -- confirm.
        seen_cols_pairs = (load_pickle(seen_path)
                           if seen_path.exists() else defaultdict(list))
        removed_cols_pairs = (load_pickle(removed_path)
                              if removed_path.exists() else defaultdict(list))

        # Flatten {kept feature: [removed features]} into one list of names.
        removed_cols = [
            name for names in removed_cols_pairs.values() for name in names
        ]

        if self.dry_run:
            already_removed = set(removed_cols)
            self._selected_cols = [
                col for col in org_cols if col not in already_removed
            ]
            return

        already_removed = set(removed_cols)
        org_cols = [col for col in org_cols if col not in already_removed]

        counter = 0
        # The last column has no later partner, hence the [:-1].
        for i, feat_a_name in enumerate(tqdm(org_cols[:-1])):
            # Features removed earlier in this run are not used as a
            # reference for further comparisons.
            if feat_a_name in removed_cols:
                continue

            feat_a = input_df[feat_a_name]

            for feat_b_name in org_cols[i + 1:]:
                if self._has_seen(feat_a_name, feat_b_name, seen_cols_pairs):
                    continue

                # Record the pair in both directions before any further
                # check, so that it is skipped the next time fit() runs.
                seen_cols_pairs[feat_a_name].append(feat_b_name)
                seen_cols_pairs[feat_b_name].append(feat_a_name)

                if self._has_removed(feat_a_name, feat_b_name, removed_cols):
                    continue

                feat_b = input_df[feat_b_name]
                c = np.corrcoef(feat_a, feat_b)[0][1]

                if abs(c) > self._threshold:
                    counter += 1
                    removed_cols.append(feat_b_name)
                    removed_cols_pairs[feat_a_name].append(feat_b_name)
                    print("{}: FEAT_A: {} FEAT_B: {} - Correlation: {}".format(
                        counter, feat_a_name, feat_b_name, c))

        save_pickle(removed_cols_pairs, removed_path)
        save_pickle(seen_cols_pairs, seen_path)

        # Build the lookup set once instead of once per column.
        final_removed = set(removed_cols)
        self._selected_cols = [
            col for col in org_cols if col not in final_removed
        ]