def fit(self, X: dd, y=None):
    """Calculate which columns should be removed, based on the defined entropy thresholds.

    Columns whose entropy falls outside the ``self.entropy_thresholds``
    interval are recorded in ``self.feature_names`` for later removal.

    Args:
        X (dd): Dataframe to be processed.
        y (dd, optional): Target. Defaults to None. Unused; kept for
            scikit-learn estimator API compatibility.

    Returns:
        self: the fitted instance, so calls can be chained (sklearn convention).
    """
    # Entropy is only meaningful for categorical data: drop numeric and datetime columns.
    subset = X.select_dtypes(exclude=[np.number, "datetime64[ns]"])

    # Materialize the dask frame, then compute the entropy column-wise.
    entropies_df = subset.compute().apply(entropy, axis=0).to_frame(name="entropy")
    entropies_df.reset_index(inplace=True)
    entropies_df.rename(columns={"index": "column_name"}, inplace=True)
    entropies_df.sort_values(by="entropy", inplace=True, ascending=False)

    # Normalize thresholds and build the "keep" mask for the accepted band.
    thresholds = [float(value) for value in self.entropy_thresholds]
    # pandas >= 2.0 only accepts the string form of `inclusive`
    # ("both"/"neither"/"left"/"right"); translate a legacy bool so older
    # callers that pass True/False keep working.
    inclusive = self.inclusive
    if isinstance(inclusive, bool):
        inclusive = "both" if inclusive else "neither"
    mask_entropy = entropies_df["entropy"].between(
        min(thresholds), max(thresholds), inclusive=inclusive)

    # Columns outside the accepted band are scheduled for removal.
    self.feature_names = list(entropies_df.loc[~mask_entropy, "column_name"].values)
    mask_removed = entropies_df["column_name"].isin(self.feature_names)
    entropies_df.loc[mask_removed, "filtered_entropy"] = 1
    # Fill the complement too, mirroring the std-filter's bookkeeping.
    entropies_df.loc[~mask_removed, "filtered_entropy"] = 0
    return self
def make_filter_std_pipeline(data: dd,
                             numerical_columns: "list[str] | bool" = True,
                             thresholds: "list[float] | None" = None,
                             inclusive: bool = False):
    # TODO: write unit tests
    """Make a pipeline that filters columns according to their standard deviation.

    Args:
        data (dd): Data frame to be filtered.
        numerical_columns (list[str] | bool, optional): Columns to subset the
            filtering. When a bool is given, all numeric columns of ``data``
            are used. Defaults to True.
        thresholds (list[float], optional): Interval of std values to keep;
            columns whose std falls outside it are removed. Defaults to None.
        inclusive (bool, optional): Whether the interval boundaries are
            included. Defaults to False.

    Returns:
        EPipeline: Pipeline to filter the data frame.
    """
    # NOTE: the previous annotation `list[str] or bool` evaluated to plain
    # `list[str]`; the string annotation preserves the intended union.
    # A bool (True or False) means "no explicit subset": fall back to every
    # numeric column in the frame.
    if isinstance(numerical_columns, bool):
        selected_columns = data.select_dtypes(include=[np.number]).columns.values
    else:
        selected_columns = numerical_columns
    steps = [
        ("extract", Extract(selected_columns)),
        ("std_filter", Filter_Std(std_thresholds=thresholds, inclusive=inclusive)),
    ]
    return EPipeline(steps)
def make_filter_entropy_pipeline(data: dd,
                                 categorical_columns: "list[str] | bool" = True,
                                 thresholds: "list[float] | None" = None,
                                 inclusive: bool = False):
    # TODO: write unit tests
    """Make a pipeline that filters categorical columns according to entropy.

    Args:
        data (dd): Data frame to be filtered.
        categorical_columns (list[str] | bool, optional): Columns to subset
            the filtering. When a bool is given, all object (categorical)
            columns of ``data`` are used. Defaults to True.
        thresholds (list[float], optional): Interval of entropy values to
            keep; columns whose entropy falls outside it are removed.
            Defaults to None.
        inclusive (bool, optional): Whether the interval boundaries are
            included. Defaults to False.

    Returns:
        EPipeline: Pipeline to filter the data frame.
    """
    # NOTE: the previous annotation `list[str] or bool` evaluated to plain
    # `list[str]`; the string annotation preserves the intended union.
    # A bool means "no explicit subset": use every object column. Pass the
    # column NAMES to Extract (`.columns.values`), matching
    # make_filter_std_pipeline — previously the whole sub-frame was passed.
    if isinstance(categorical_columns, bool):
        selected_columns = data.select_dtypes(
            exclude=[np.number], include=["object"]).columns.values
    else:
        selected_columns = categorical_columns
    steps = [
        ("extract", Extract(selected_columns)),
        ("entropy_filter", Filter_Entropy(entropy_thresholds=thresholds,
                                          inclusive=inclusive)),
    ]
    return EPipeline(steps)
def fit(self, X: dd, y=None): """Calculate what columns should be removed, based on the defined thresholds Args: X (dd): Dataframe to be processed y (dd, optional): Target. Defaults to None. Returns: None """ subset = X.select_dtypes(include=[np.number]) # Calculate the standad deviation column-wisely stds = np.nanstd(subset, axis=0) stds_df = pd.DataFrame.from_dict({ "column_name": subset.columns.values, "std": stds }) stds_df.sort_values(by="std", inplace=True, ascending=False) # Get thresholds and calculate what columns will be removed thresholds = [float(value) for value in self.std_thresholds] mask_variance = stds_df["std"].between(min(thresholds), max(thresholds), inclusive=self.inclusive) # Get list of columns to be removed self.feature_names = list(stds_df.loc[~mask_variance, "column_name"].values) mask_removed = stds_df["column_name"].isin(self.feature_names) stds_df.loc[mask_removed, "filtered_variance"] = 1 stds_df.loc[~mask_removed, "filtered_variance"] = 0 return self