def fit(self, X: dd, y=None):
    """
    Work out which columns exceed the allowed proportion of missing values.

    Counts the nulls in every column of ``X``, divides by the number of
    rows, and stores in ``self.feature_names`` the names of the columns
    whose null proportion is strictly greater than ``self.nulls_threshold``.
    The names are ordered from the column with the most nulls to the one
    with the fewest.

    Args:
        X (dd): Dataframe to be processed
        y (dd, optional): Target. Not used by this method. Defaults to None.

    Returns:
        self: the same instance, with ``feature_names`` set
    """
    # Materialise the per-column null counts and the total row count
    nulls_count = X.isnull().sum().compute()
    n_rows = X.shape[0].compute()

    summary_df = nulls_count.to_frame(name="nulls_count")
    summary_df["nulls_proportions"] = summary_df["nulls_count"] / n_rows
    # Sort before masking so feature_names lists the emptiest columns first
    summary_df = summary_df.sort_values(by="nulls_count", ascending=False)

    # Select what columns should be removed, based on proportions
    mask_nulls = summary_df["nulls_proportions"] > self.nulls_threshold
    self.feature_names = list(summary_df[mask_nulls].index.values)
    return self
def null_data_check(data: dd = None) -> bool:
    """
    Check whether a dataframe contains any null values.

    Args:
        data (dd): Dataframe to inspect. Despite the ``None`` default, a
            dataframe must be supplied; passing ``None`` raises
            ``AttributeError``.

    Returns:
        bool: True if at least one value is null, False otherwise.
    """
    # Reduce lazily and compute only the final flag, instead of pulling the
    # whole dataframe into memory just to test it for nulls.
    has_nulls = data.isnull().values.any().compute()
    # The reduction may hand back a numpy boolean; convert it so the return
    # value matches the annotated ``bool``.
    return bool(has_nulls)