Example #1
0
    def fit(self, X: dd, y=None):
        """
        Determine which columns exceed the allowed proportion of null values.

        The names of the columns whose null proportion is strictly greater
        than ``self.nulls_threshold`` are stored in ``self.feature_names``,
        ordered by descending null count.

        Args:
            X (dd): Dataframe to be processed
            y (dd, optional): Target. Not used by this method. Defaults to None.

        Returns:
            self: The instance itself, with ``feature_names`` set.
        """

        # Null count per column; the lazy result has to be computed explicitly
        summary_df = X.isnull().sum().compute().to_frame(name="nulls_count")

        # Null proportion per column, relative to the total number of rows
        n_rows = X.shape[0].compute()
        summary_df["nulls_proportions"] = summary_df["nulls_count"] / n_rows

        # Most affected columns first; this fixes the order of feature_names
        summary_df = summary_df.sort_values(by="nulls_count", ascending=False)

        # Columns whose null proportion exceeds the threshold are the ones
        # selected for removal
        mask_nulls = summary_df["nulls_proportions"] > self.nulls_threshold
        self.feature_names = list(summary_df[mask_nulls].index.values)

        return self
Example #2
0
def null_data_check(data: dd = None) -> bool:
    """
    Check whether a dataframe contains at least one null value.

    The null mask is reduced to a single flag before anything is computed,
    so for a lazy (dask) collection only that flag is materialised instead
    of the full dataframe. Eager objects such as pandas dataframes are
    accepted as well.

    Args:
        data (dd): Dataframe to be checked. Despite the ``None`` default a
            dataframe has to be supplied; ``None`` raises ``AttributeError``.

    Returns:
        bool: True if any value is null, False otherwise.
    """
    has_nulls = data.isnull().values.any()

    # A lazy collection yields a deferred result that still has to be
    # computed; an eager one already holds the final value.
    if hasattr(has_nulls, "compute"):
        has_nulls = has_nulls.compute()

    # Convert numpy.bool_ into the built-in bool promised by the annotation
    return bool(has_nulls)