def test_significance(self):
    """Verify the single-pair and the feature-wise significance tests."""
    data = pd.read_csv(self.test_data)
    inspector = Inspector(data, m_cats=20)

    pair_result = inspector.significance_test("fnlwgt", "age")
    self.assertIsInstance(pair_result, pd.Series)
    # The series carries: field1, field2, test, statistic, p-value
    self.assertEqual(5, len(pair_result))
    # Two continuous fields use the default correlation test.
    self.assertEqual("Spearman correlation", pair_result["test"])

    pval_table = inspector.significance_test_features("label")
    self.assertEqual(5, pval_table.shape[1])
    pval_table = pval_table.set_index("field1")
    # Continuous vs categorical target and categorical vs categorical
    # pairs get their respective tests.
    self.assertEqual("one-way ANOVA on ranks", pval_table.loc["age", "test"])
    self.assertEqual("chi-square test", pval_table.loc["education-num", "test"])
def test_regard_as(self):
    """Verify the manual override of a variable type and its reset."""
    data = pd.read_csv(self.test_data)
    inspector = Inspector(data, m_cats=20)

    self.assertEqual(VariableType.continuous.name,
                     inspector.result.loc["age", "variable"])
    inspector.regard_as_categorical("age")
    self.assertEqual(VariableType.categorical.name,
                     inspector.result.loc["age", "variable"])
    # Setting m_cats re-executes the inspection logic, so the manual
    # override above is lost again.
    inspector.m_cats = 21
    self.assertEqual(VariableType.continuous.name,
                     inspector.result.loc["age", "variable"])
def test_visualize_two_fields(self):
    """Smoke test: every field-type combination must plot without error."""
    np.random.seed(1)
    data = load_iris(target="species")
    data["cat"] = np.random.choice(["a", "b", "c"],
                                   size=data.shape[0], replace=True)
    inspector = Inspector(data)
    field_pairs = [
        ("sepal_width", "sepal_length"),  # continuous x continuous
        ("sepal_length", "species"),      # continuous x categorical
        ("species", "petal_width"),       # categorical x continuous
        ("species", "cat"),               # categorical x categorical
    ]
    for field1, field2 in field_pairs:
        inspector.visualize_two_fields(field1, field2)
def test_distribution_timestamps_dates(self):
    """A date column containing NaN still yields timestamp statistics."""
    start = date(year=2019, month=4, day=1)
    values = [start + timedelta(days=offset) for offset in range(6)]
    values[0] = np.nan  # the missing entry must be excluded from the stats
    frame = pd.DataFrame({"col": values})

    stats = Inspector(frame).distribution_timestamps(fields=["col"])
    self.assertEqual(1, stats.shape[0])
    self.assertEqual(5, stats.loc["col", "count"])
    # Mean of April 2nd..6th is April 4th.
    self.assertEqual(4, stats.loc["col", "mean"].day)
    self.assertIsInstance(stats.loc["col", "std"], timedelta)
def test_an_inspection(self):
    """Verify the inspection result and the effect of the m_cats threshold."""
    data = pd.read_csv(self.test_data)
    inspector = Inspector(data, m_cats=20)

    # The threshold given at construction time is kept as an attribute.
    self.assertEqual(20, inspector.m_cats)
    self.assertEqual(VariableType.categorical.name,
                     inspector.result.loc["education-num", "variable"])
    # NaN must be ignored when counting distinct values.
    self.assertEqual(8, inspector.result.loc["workclass", "n_unique"])
    self.assertEqual(VariableType.binary.name,
                     inspector.result.loc["sex", "variable"])

    data["const"] = 1
    # TODO: [datetime(year=2019,month=1,day=1) + timedelta(hours=h) for h in range(360)]
    inspector = Inspector(data, m_cats=15)
    self.assertEqual(VariableType.constant.name,
                     inspector.result.loc["const", "variable"])
    # With the smaller threshold the same field is continuous again.
    self.assertEqual(VariableType.continuous.name,
                     inspector.result.loc["education-num", "variable"])

    # An "object" column must always be categorical.
    self.assertIn("education", inspector.get_cats())
    self.assertEqual(
        ["workclass", "education", "marital-status", "occupation",
         "relationship", "race", "sex", "native-country", "label"],
        inspector.get_cats())
    self.assertEqual(
        ["age", "fnlwgt", "education-num", "capital-gain",
         "capital-loss", "hours-per-week"],
        inspector.get_cons())
def test_distribution_timestamps(self):
    """Timestamp columns are summarised; non-timestamp columns are not."""
    center = datetime(year=2019, month=4, day=1, tzinfo=utc)
    frame = pd.DataFrame({
        "col1": [center + timedelta(days=step) for step in range(-2, 3)],
        "col2": [center + timedelta(hours=3 * step) for step in range(-2, 3)],
        "dummy": list(range(-2, 3)),  # integer column: must not appear
    })

    stats = Inspector(frame).distribution_timestamps()
    self.assertEqual(2, stats.shape[0])
    # Both series are symmetric around the base timestamp.
    self.assertEqual(center, stats.loc["col1", "mean"])
    self.assertEqual(center, stats.loc["col2", "mean"])
    self.assertIsInstance(stats.loc["col1", "std"], timedelta)
def test_distribution(self):
    """Check the DataFrames describing the variable distributions."""
    data = pd.read_csv(self.test_data)
    n_rows = data.shape[0]
    inspector = Inspector(data, m_cats=20)

    df_cat = inspector.distribution_cats()
    private_row = df_cat.loc["workclass"].loc["Private"]
    # The rate must be the count divided by the total number of rows.
    self.assertAlmostEqual(private_row["count"] / n_rows, private_row["rate"])

    df_con = inspector.distribution_cons()
    # It is just a transpose of describe(), hence 8 statistics per field.
    self.assertEqual((len(inspector.get_cons()), 8), df_con.shape)
for d in np.random.normal(loc=0, scale=30, size=df.shape[0])
]
# NOTE(review): chained assignment — under pandas copy-on-write this may
# not modify df; presumably df.loc[0, "dummy_ts"] = np.nan is meant. Confirm.
df["dummy_ts"][0] = np.nan
# Truncate each timestamp to the first day of its month.
df["dummy_ym"] = df["dummy_ts"].apply(lambda ts: ts.date().replace(day=1))
df.head()
# -

# ### 1. Check the quality of data
#
# Creating an instance of `Inspector`, you can get an overview of the data quality of your dataset.

# +
from adhoc.processing import Inspector

inspector = Inspector(df, m_cats=20)
inspector
# -

# First of all the instance `inspector` is **not a DataFrame**. The default representation of the instance is the result of the inspection of the given DataFrame. You can access the DataFrame by the property `inspector.result`.

# Show only the columns that contain at least one missing value.
inspector.result.query("count_na > 0")

# #### Description of fields of `inspector.result`
#
# - dtype: This is the result of `df.dtypes`
# - count_na: The number of missing values (NA) in the column. `df.isna().sum()`
# - rate_na: The rate of missing values (NA) in the column. `df.isna().mean()`
# - n_unique: The number of distinct values in the column. **We ignore missing values here.**
# - distinct: If a different row has a different number, then `True` else `False`. When it is `True`, then the column can be an ID such as a primary key or just a continuous variable.
# - variable: See below
from adhoc.processing import Inspector
from adhoc.modeling import show_tree
from adhoc.utilities import load_iris, facet_grid_scatter_plot, bins_heatmap

# + (jupytext cell)
np.random.seed(1)
df = load_iris(target="species")
# Add a purely random categorical column.
df["cat1"] = np.random.choice(["a", "b", "c"], size=df.shape[0], replace=True)
# Add a binary column derived from the four numeric columns.
df["cat2"] = (df.iloc[:, 0] * df.iloc[:, 1] - df.iloc[:, 2] * df.iloc[:, 3] > 11).map({
    True: 1,
    False: 0
})
inspector = Inspector(df)
inspector  ## 4 continuous variables and 3 categorical variables
# -

# Demonstrate every combination of field types.
inspector.visualize_two_fields("sepal_width", "sepal_length")  ## continuous x continuous
inspector.visualize_two_fields("petal_width", "species")       ## continuous x categorical
inspector.visualize_two_fields("species", "petal_width")       ## categorical x continuous
inspector.visualize_two_fields("species", "cat2")
inspector.visualize_two_fields("species", "cat2", heatmap=True)
# - ScreenPorch: Screen porch area in square feet # - PoolArea: Pool area in square feet # - PoolQC: Pool quality # - Fence: Fence quality # - MiscFeature: Miscellaneous feature not covered in other categories # - MiscVal: $Value of miscellaneous feature # - MoSold: Month Sold # - YrSold: Year Sold # - SaleType: Type of sale # - SaleCondition: Condition of sale # # + from adhoc.processing import Inspector inspector_train = Inspector(df_train) with pd.option_context("display.max_rows",None): display(inspector_train) # - # ### Data type correction # # Some categorical values are described as numbers in the data set so that they look like continuous variables. For example `MSSubClass` (the building class). def correct_dtype(data:pd.DataFrame) -> pd.DataFrame: df = data.copy() df["MSSubClass"] = df["MSSubClass"].apply(lambda x: f"C{x:03d}") df["MoSold"] = df["MoSold"].apply(lambda x: f"M{x:02d}")