Example #1
    def test_compare_columns(self):
        df1 = pd.DataFrame({"a": [1, 2], "b": [2, 3]})
        df2 = pd.DataFrame({"b": [3, 4], "c": [4, 5]})
        comparison = compare_columns(df1, df2)
        self.assertTrue(comparison["mismatch"])
        self.assertListEqual(comparison["df1_not_in_df2"], ["a"])
        self.assertListEqual(comparison["df2_not_in_df1"], ["c"])

        comparison2 = compare_columns(df1, df1)
        self.assertFalse(comparison2["mismatch"])

        comparison3 = compare_columns(df1, df2, ignore=["c"])
        self.assertTrue(comparison3["mismatch"])
        self.assertListEqual(comparison3["df1_not_in_df2"], ["a"])
        self.assertListEqual(comparison3["df2_not_in_df1"], [])
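
For context, here is a minimal, hypothetical sketch of a column-comparison helper that satisfies the assertions above. It is not the actual automatminer implementation; the dict keys and the `ignore` handling are inferred from the test.

    def compare_columns(df1, df2, ignore=None):
        # Hypothetical sketch inferred from the test above, not automatminer's code.
        ignore = [ignore] if isinstance(ignore, str) else (ignore or [])
        cols1 = [c for c in df1.columns if c not in ignore]
        cols2 = [c for c in df2.columns if c not in ignore]
        df1_not_in_df2 = [c for c in cols1 if c not in cols2]
        df2_not_in_df1 = [c for c in cols2 if c not in cols1]
        return {
            "mismatch": bool(df1_not_in_df2 or df2_not_in_df1),
            "df1_not_in_df2": df1_not_in_df2,
            "df2_not_in_df1": df2_not_in_df1,
        }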
Example #2
    def test_DataCleaner(self):
        """
        A basic test ensuring DataCleaner can handle numerical features and
        features/targets that may be strings but should be numeric.

        Returns: None
        """
        df = self.test_df
        dc = DataCleaner()

        # Test the case of numbers as strings
        df[self.target] = df[self.target].astype(str)
        df = dc.fit_transform(df, self.target)
        self.assertAlmostEqual(df[self.target].iloc[0], 0.35)

        # Test if there is a NaN in the target
        df[self.target].iloc[8] = np.nan
        df = dc.fit_transform(df, self.target)
        self.assertEqual(df.shape[0], self.test_df.shape[0] - 1)

        # Test if there is a NaN in a feature
        df["HOMO_energy"].iloc[40] = np.nan
        df = dc.fit_transform(df, self.target)
        self.assertEqual(df.shape[0], self.test_df.shape[0] - 2)

        # Test if the NaN threshold is exceeded for a feature
        df["LUMO_energy"].iloc[:-2] = [np.nan] * (df.shape[0] - 2)
        df = dc.fit_transform(df, self.target)
        self.assertEqual(df.shape[1], self.test_df.shape[1] - 1)

        # Test transferability of the fitted cleaner
        df2 = self.test_df
        df2 = df2.drop(columns=[self.target])
        df2 = dc.transform(df2, self.target)
        self.assertFalse(
            compare_columns(df, df2, ignore=self.target)["mismatch"])
        self.assertTrue(self.target not in df2.columns)
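
A hedged usage sketch of the pattern exercised above (the import path, constructor defaults, and column/target names here are assumptions, not taken from the test):

    import pandas as pd
    from automatminer.preprocessing import DataCleaner  # assumed import path

    df = pd.DataFrame({
        "HOMO_energy": [-5.1, -4.8, -5.3],
        "LUMO_energy": [-1.2, -1.0, -0.9],
        "gap": ["3.9", "3.8", "4.4"],  # target stored as strings; the cleaner coerces to numbers
    })

    dc = DataCleaner()
    train = dc.fit_transform(df, "gap")    # fit and clean the training data
    new_df = df.drop(columns=["gap"])      # unseen data without the target
    cleaned = dc.transform(new_df, "gap")  # reuse the fitted cleaner on new data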
Example #3
    def handle_na(self, df, target, na_method, coerce_mismatch=True):
        """
        First pass for handling cells without values (null or nan). Additional
        preprocessing may be necessary as one column may be filled with
        median while the other with mean or mode, etc.

        Args:
            df (pandas.DataFrame): The dataframe containing features
            target (str): The key defining the ML target.
            coerce_mismatch (bool): If there is a mismatch between the fitted
                dataframe columns and the argument dataframe columns, create
                and drop mismatch columns so the dataframes are matching. If
                False, raises an error. New columns are instantiated as all
                zeros, as most of the time this is a onehot encoding issue.
            na_method (str): How to deal with samples still containing nans
                after troublesome columns are already dropped. Default is
                'drop'. Other options are from pandas.DataFrame.fillna:
                {‘bfill’, ‘pad’, ‘ffill’}, or 'ignore' to ignore nans.
                Alternatively, specify a value to replace the nans, e.g. 0.

        Returns:
            (pandas.DataFrame) The cleaned df
        """
        self.logger.info(self._log_prefix +
                         "Before handling na: {} samples, {} features"
                         "".format(*df.shape))

        # Drop targets containing na before further processing
        if self.drop_na_targets and target in df.columns:
            clean_df = df.dropna(axis=0, how='any', subset=[target])
            self.dropped_samples = df[~df.index.isin(clean_df.index)]
            self.logger.info(
                self._log_prefix +
                "{} samples did not have target values. They were "
                "dropped.".format(len(self.dropped_samples)))
            df = clean_df

        # Remove features failing the max_na_frac limit
        feats0 = set(df.columns)
        if not self.is_fit:
            self.logger.info(self._log_prefix +
                             "Handling feature na by max na threshold of {} "
                             "with method '{}'.".format(
                                 self.max_na_frac, self.feature_na_method))
            threshold = int((1 - self.max_na_frac) * len(df))
            if self.feature_na_method == "drop":
                df = df.dropna(axis=1, thresh=threshold)
            else:
                df = df.dropna(axis=1, thresh=1)
                problem_cols = df.columns[
                    df.isnull().mean() > self.max_na_frac]
                dfp = df[problem_cols]
                if self.feature_na_method == "fill":
                    dfp = dfp.fillna(method="ffill")
                    dfp = dfp.fillna(method="bfill")
                elif self.feature_na_method == "mean":
                    # Take the mean of all numeric columns
                    dfpn = dfp[[
                        ncol for ncol in dfp.columns
                        if ncol in self.number_cols
                    ]]
                    dfpn = dfpn.fillna(value=dfpn.mean())
                    dfp[dfpn.columns] = dfpn

                    # Simply fill one-hot encoded columns
                    dfp = dfp.fillna(method="ffill")
                    dfp = dfp.fillna(method="bfill")
                else:
                    dfp = dfp.fillna(value=self.feature_na_method)
                df[problem_cols] = dfp

            if len(df.columns) < len(feats0):
                feats = set(df.columns)
                n_feats = len(feats0) - len(feats)
                napercent = self.max_na_frac * 100
                feat_names = feats0 - feats
                self.logger.info(
                    self._log_prefix +
                    'These {} features were removed as they had more '
                    'than {}% missing values: {}'.format(
                        n_feats, napercent, feat_names))
        else:
            mismatch = compare_columns(self.fitted_df, df, ignore=target)
            if mismatch["mismatch"]:
                self.logger.warning(self._log_prefix +
                                    "Mismatched columns found in dataframe "
                                    "used for fitting and argument dataframe.")
                if coerce_mismatch:
                    self.logger.warning(self._log_prefix +
                                        "Coercing mismatched columns...")
                    if mismatch["df1_not_in_df2"]:  # in fitted, not in arg
                        self.logger.warning(
                            self._log_prefix +
                            "Assuming missing columns in argument df are "
                            "one-hot encoding issues. Setting to zero the "
                            "following new columns:\n{}".format(
                                mismatch["df1_not_in_df2"]))
                        for c in self.fitted_df.columns:
                            if c not in df.columns and c != target:
                                # Interpret as one-hot problems...
                                df[c] = np.zeros((df.shape[0]))
                    if mismatch["df2_not_in_df1"]:  # arg cols not in fitted
                        self.logger.warning(
                            self._log_prefix +
                            "Following columns are being dropped:\n{}".format(
                                mismatch["df2_not_in_df1"]))
                        df = df.drop(columns=mismatch["df2_not_in_df1"])
                else:
                    raise AutomatminerError(
                        "Mismatch between columns found in "
                        "arg dataframe and dataframe used "
                        "for fitting!")

            # Handle the case where all samples of a transform-df column are
            # NaN but the feature is required by the fitted input df, so there
            # is no way to impute by sample or drop the column...
            nan_cols = [c for c in df.columns if df[c].isna().all()]
            if nan_cols:
                self.logger.error(
                    self._log_prefix + "Columns {} are all nan "
                    "in transform df but are required by the fit "
                    "df. Using mean values of fitted df to "
                    "impute transformed df. This may result in "
                    "highly erroenous imputed values!"
                    "".format(nan_cols))
                for col in nan_cols:
                    mean_val = self.fitted_df[col].mean()
                    df[col] = [mean_val] * df.shape[0]

        self.dropped_features = [
            c for c in feats0 if c not in df.columns.values
        ]

        # Handle all rows that still contain any nans
        if na_method == "drop":
            clean_df = df.dropna(axis=0, how='any')
            self.dropped_samples = pd.concat(
                (df[~df.index.isin(clean_df.index)], self.dropped_samples),
                axis=0,
                sort=True)
            df = clean_df
        elif na_method == "ignore":
            pass
        elif na_method == "fill":
            df = df.fillna(method="ffill")
            df = df.fillna(method="bfill")
        elif na_method == "mean":
            # NaNs in numeric columns are replaced with the column mean
            dfn = df[[ncol for ncol in df.columns if ncol in self.number_cols]]
            dfn = dfn.fillna(value=dfn.mean())
            df[dfn.columns] = dfn

            # The remaining columns are forward/back-filled
            df = df.fillna(method="ffill")
            df = df.fillna(method="bfill")
        else:
            df = df.fillna(value=na_method)
        self.logger.info(self._log_prefix +
                         "After handling na: {} samples, {} features".format(
                             *df.shape))
        return df
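
The na_method branch at the end maps onto plain pandas operations. A standalone, hypothetical helper mirroring that final per-row handling (not part of automatminer) could look like:

    import numpy as np
    import pandas as pd

    def handle_remaining_na(df, na_method="drop"):
        # Hypothetical sketch mirroring the last branch of handle_na above.
        if na_method == "drop":
            return df.dropna(axis=0, how="any")
        if na_method == "ignore":
            return df
        if na_method == "fill":
            return df.ffill().bfill()
        if na_method == "mean":
            numeric = df.select_dtypes(include=np.number)
            df = df.copy()
            df[numeric.columns] = numeric.fillna(numeric.mean())
            return df.ffill().bfill()
        return df.fillna(value=na_method)  # e.g. na_method=0 fills NaNs with zeros

    df = pd.DataFrame({"x": [1.0, np.nan, 3.0], "label": ["a", None, "c"]})
    print(handle_remaining_na(df, na_method="mean"))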