def test_significance(self):
    """
    Check the significance tests exposed by ``Inspector``.

    Exercises ``significance_test`` on one pair of fields and
    ``significance_test_features`` against the field "label".
    """
    inspector = Inspector(pd.read_csv(self.test_data), m_cats=20)

    ## A single pairwise test is expected to come back as a Series.
    result = inspector.significance_test("fnlwgt", "age")
    self.assertIsInstance(result, pd.Series)
    ## field1, field2, test, statistic, p-value
    self.assertEqual(len(result), 5)
    ## Default correlation
    self.assertEqual(result["test"], "Spearman correlation")

    ## Testing the features against "label" should give five columns.
    pval_table = inspector.significance_test_features("label")
    self.assertEqual(pval_table.shape[1], 5)

    ## Index by feature name so each row can be looked up directly.
    pval_table.set_index("field1", inplace=True)
    expected_tests = (
        ("age", "one-way ANOVA on ranks"),
        ("education-num", "chi-square test"),
    )
    for field, test_name in expected_tests:
        self.assertEqual(pval_table.loc[field, "test"], test_name)
#
# |feature\target|categorical|continuous|
# |-|-|-|
# |categorical|$\chi^2$-test|ANOVA|
# |continuous|ANOVA|correlation|
#
# **Remark**
#
# 1. It is common for the p-values to be extremely small, in particular when you have a relatively large dataset.
# 2. The null-hypotheses of the tests are quite different.
#     - $\chi^2$-test: the two categorical variables are independent.
#     - one-way ANOVA on ranks: consider the groups defined by the value of the categorical variable. The null-hypothesis is that the medians of the continuous variable in the groups are the same.
#     - Correlation: the two continuous variables are not correlated.
# 3. The result of the chi-square test for `dummy_ts` is shown. We have to think about whether it is meaningful.

inspector.significance_test_features("label", verbose=False)

# According to the above table, the p-value of the ANOVA for `fnlwgt` and `label` is relatively large. In fact, the KDEs of `fnlwgt` by `label` are quite similar.

inspector.visualize_two_fields("fnlwgt", "label")  ## continuous vs. categorical

# ## 3. Convert the DataFrame into a feature matrix
#
# 1. Fill missing values with the most-frequent value.
# 2. Apply a one-hot-encoder to each categorical variable.
#
# These two steps can be combined into one step by choosing some classes. If we just apply a one-hot-encoder to a categorical variable without missing values, then the feature matrix (including a constant column) has a collinear tuple of columns. The existence of linearly dependent variables is harmful when we train a linear model.

df["capital-gain"] = df["capital-gain"] - df["capital-loss"]  ## merge the two fields
#
# - The $p$-values are not suitable for ranking, only for screening.
# - The $p$-value measures how small/large the difference is under the null-hypothesis. (The $p$-value is small => the difference is large.)
# - We ignore the possibility that a feature is important only in a certain combination with another feature.

# +
## Actually we should apply this method after imputation.
## Because this is just a demonstration and we do not use the result,
## we just drop the rows with missing values and the field "Utilities",
## which is constant because of dropping missing values.
df_na_dropped = df_train.dropna(how="any")
df_na_dropped = df_na_dropped.drop(columns="Utilities")
inspector_na_dropped = Inspector(df_na_dropped)

## Lift the row limit so that the whole table is displayed.
with pd.option_context("display.max_rows", None):
    display(
        inspector_na_dropped.significance_test_features(target).sort_values(by="pval")
    )
# -

# ## ML Pipeline

# +
def separate_x_y(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Split ``data`` into a feature frame and a transformed target.

    The target column name is read from the module-level name ``target``,
    which is defined outside this function.

    :param data: DataFrame containing the column ``target``
    :return: ``(X, y)`` where ``X`` is ``data`` without the target column
        and ``y`` is ``np.log1p`` of the target column, renamed to
        "LogSalePrice"
    """
    log_target = np.log1p(data[target]).rename("LogSalePrice")
    features = data.drop(columns=target)
    return features, log_target


X_train, y_train = separate_x_y(df_train)
X_dev, y_dev = separate_x_y(df_dev)
# -