def fit_transform(self, input_df: XDataFrame) -> XDataFrame:
    """Drop duplicated columns and return the remaining ones.

    Columns are compared by transposing the frame and dropping duplicated
    rows, keeping the first occurrence. The names of the surviving columns
    are stored in ``self._selected_cols``.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        XDataFrame : ``input_df`` restricted to the selected columns.
    """
    # A cuDF frame is converted to pandas for the duplicate search only;
    # the returned frame is still indexed from the original input.
    is_gpu_frame = cudf_is_available() and isinstance(input_df, cudf.DataFrame)
    host_df = input_df.to_pandas() if is_gpu_frame else input_df

    unique_rows = host_df.T.drop_duplicates(keep="first")
    self._selected_cols = unique_rows.index.values.tolist()
    return input_df[self._selected_cols]
def transform(self, input_df: XDataFrame) -> XDataFrame:
    """Apply ``self._lambda_func`` element-wise to the target columns.

    The target columns are ``self._input_cols`` (or every column when that
    is empty), minus ``self._exclude_cols``. Each result is written to a
    new column named ``self._output_prefix + col + self._output_suffix``.
    The work is done on a pandas copy; a cuDF input is converted to pandas
    first and the result is converted back to cuDF before returning.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        XDataFrame : Only the generated columns when ``self._drop_origin``
            is truthy, otherwise the original columns plus the generated
            ones.

    Raises:
        RuntimeError: If ``input_df`` is neither a pandas nor a cuDF frame.
    """
    if isinstance(input_df, pd.DataFrame):
        work_df = input_df.copy()
        came_from_cudf = False
    elif cudf_is_available() and isinstance(input_df, cudf.DataFrame):
        work_df = input_df.to_pandas()
        came_from_cudf = True
    else:
        raise RuntimeError("Unexpected data type: {}".format(type(input_df)))

    # Resolve the target columns before any new column is added.
    target_cols = self._input_cols or work_df.columns.tolist()
    if len(self._exclude_cols) > 0:
        target_cols = [
            name for name in target_cols if name not in self._exclude_cols
        ]

    produced_cols = []
    for name in target_cols:
        out_name = self._output_prefix + name + self._output_suffix
        values = work_df[name]
        if self._fillna is not None:
            # Missing values are replaced before the function is applied.
            values = values.fillna(self._fillna)
        work_df[out_name] = values.apply(self._lambda_func)
        produced_cols.append(out_name)

    if came_from_cudf:
        work_df = cudf.from_pandas(work_df)

    return work_df[produced_cols] if self._drop_origin else work_df
def reduce_mem_usage(df: XDataFrame, verbose: bool = True, debug: bool = True) -> XDataFrame:
    """Compress a data frame with ``compress_df`` and report the saving.

    Memory usage is measured before and after compression and reported in
    MB together with the relative reduction.

    Args:
        df (XDataFrame): Input data frame. A cuDF frame is converted to
            pandas before being handed to ``compress_df``.
        verbose (bool): Print the summary message when true.
        debug (bool): Emit the summary message via ``logging.debug`` when
            true.

    Returns:
        XDataFrame : The value returned by ``compress_df``. A cuDF input is
            not converted back to cuDF by this function.
    """
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage().sum() / bytes_per_mb

    host_df = df.to_pandas() if is_cudf(df) else df
    df = compress_df(host_df)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    reduction = (start_mem - end_mem) / start_mem
    msg = (
        f"Mem. usage decreased to {end_mem:5.2f} MB"
        f" ({reduction * 100:.1f} % reduction)"
    )

    if verbose:
        print(msg)
    if debug:
        logging.debug(msg)
    return df
def fit(self, input_df: XDataFrame) -> None:
    """Select columns by dropping one column of each highly correlated pair.

    Columns are compared pairwise with ``np.corrcoef``. When the absolute
    correlation of a pair exceeds ``self._threshold``, the second column of
    the pair is marked as removed. Progress is persisted under
    ``self.save_path`` in ``seen_feats_pairs.pkl`` and
    ``removed_feats_pairs.pkl``; existing files are loaded at the start so
    earlier results are reused.

    When ``self.dry_run`` is truthy no correlation is computed: the
    selection is derived from the previously saved removals only.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        None: The selected column names are stored in
            ``self._selected_cols``.
    """
    org_cols = input_df.columns.tolist()
    # Guard the isinstance check so ``cudf.DataFrame`` is never touched
    # when cuDF is not available.
    if cudf_is_available() and isinstance(input_df, cudf.DataFrame):
        input_df = input_df.to_pandas()

    seen_path = self.save_path / "seen_feats_pairs.pkl"
    removed_path = self.save_path / "removed_feats_pairs.pkl"

    # NOTE(review): these are pickle files; only point ``save_path`` at a
    # trusted location, since unpickling untrusted data can execute code.
    seen_cols_pairs = (
        load_pickle(seen_path) if seen_path.exists() else defaultdict(list)
    )
    removed_cols_pairs = (
        load_pickle(removed_path) if removed_path.exists() else defaultdict(list)
    )

    # ``removed_cols`` stays a list because it is passed to
    # ``self._has_removed``; the set mirrors it for O(1) membership tests.
    removed_cols = [col for cols in removed_cols_pairs.values() for col in cols]
    removed_set = set(removed_cols)

    if self.dry_run:
        self._selected_cols = [col for col in org_cols if col not in removed_set]
        return

    org_cols = [col for col in org_cols if col not in removed_set]
    counter = 0
    for i in tqdm(range(len(org_cols) - 1)):
        feat_a_name = org_cols[i]
        # Skip columns that were removed earlier in this same run.
        if feat_a_name in removed_set:
            continue

        feat_a = input_df[feat_a_name]
        for feat_b_name in org_cols[i + 1:]:
            if self._has_seen(feat_a_name, feat_b_name, seen_cols_pairs):
                continue
            # Record the pair in both directions before evaluating it.
            seen_cols_pairs[feat_a_name].append(feat_b_name)
            seen_cols_pairs[feat_b_name].append(feat_a_name)

            if self._has_removed(feat_a_name, feat_b_name, removed_cols):
                continue

            feat_b = input_df[feat_b_name]
            c = np.corrcoef(feat_a, feat_b)[0][1]

            if abs(c) > self._threshold:
                counter += 1
                removed_cols.append(feat_b_name)
                removed_set.add(feat_b_name)
                removed_cols_pairs[feat_a_name].append(feat_b_name)
                print("{}: FEAT_A: {} FEAT_B: {} - Correlation: {}".format(
                    counter, feat_a_name, feat_b_name, c))

        # NOTE(review): state is persisted once per outer column; confirm
        # this matches the intended checkpoint frequency.
        save_pickle(removed_cols_pairs, removed_path)
        save_pickle(seen_cols_pairs, seen_path)

    self._selected_cols = [col for col in org_cols if col not in removed_set]