def test_an_inspection(self):
    """Check the inspection table and the categorical/continuous field lists.

    The CSV at ``self.test_data`` is inspected twice: first with
    ``m_cats=20`` and then, after a constant column has been added, with
    ``m_cats=15``. ``education-num`` is asserted to be categorical in the
    first pass and continuous in the second.
    """
    df = pd.read_csv(self.test_data)
    inspector = Inspector(df, m_cats=20)

    ## attribute check
    self.assertEqual(20, inspector.m_cats)

    self.assertEqual(
        inspector.result.loc["education-num", "variable"],
        VariableType.categorical.name,
    )

    ## nan must be ignored
    self.assertEqual(inspector.result.loc["workclass", "n_unique"], 8)

    self.assertEqual(
        inspector.result.loc["sex", "variable"],
        VariableType.binary.name,
    )

    ## second pass: add a constant column and lower m_cats to 15
    df["const"] = 1
    # TODO: [datetime(year=2019,month=1,day=1) + timedelta(hours=h) for h in range(360)]
    inspector = Inspector(df, m_cats=15)

    self.assertEqual(
        inspector.result.loc["const", "variable"],
        VariableType.constant.name,
    )

    self.assertEqual(
        inspector.result.loc["education-num", "variable"],
        VariableType.continuous.name,
    )

    ## An "object" column must always be categorical
    cats = inspector.get_cats()
    ## assertIn reports both the member and the container on failure
    self.assertIn("education", cats)

    ## "const" appears in neither of the two lists below
    self.assertEqual(
        cats,
        [
            "workclass",
            "education",
            "marital-status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "native-country",
            "label",
        ],
    )

    self.assertEqual(
        inspector.get_cons(),
        [
            "age",
            "fnlwgt",
            "education-num",
            "capital-gain",
            "capital-loss",
            "hours-per-week",
        ],
    )
def test_distribution(self):
    """Verify the DataFrames returned by the two distribution methods."""
    data = pd.read_csv(self.test_data)
    n_rows = len(data)
    inspector = Inspector(data, m_cats=20)

    ## categorical side: "rate" must agree with "count" / (number of rows)
    cat_dist = inspector.distribution_cats()
    workclass_dist = cat_dist.loc["workclass"]
    private_count = workclass_dist.loc["Private", "count"]
    private_rate = workclass_dist.loc["Private", "rate"]
    self.assertAlmostEqual(private_count / n_rows, private_rate)

    ## continuous side: since it is just a transpose of describe(),
    ## the number of columns is equal to 8
    con_dist = inspector.distribution_cons()
    expected_shape = (len(inspector.get_cons()), 8)
    self.assertEqual(con_dist.shape, expected_shape)
# If we assign a number to `m_cats`, then the inspection is computed again. As a result your manual modification of variable types will be lost. inspector.m_cats = 20 inspector.result.query( "dtype == 'int64'") ## The variable type of age is now continuous. # If you want to calculate the inspection once again because you converted a column, then `make_an_inspection()` does the job. inspector.make_an_inspection() # We can get easily the list of categorical/continuous variables. (Note that a constant variable is neither categorical nor continuous.) print(inspector.get_cats() ) ## list of categorical variables (binary or categorical) print(inspector.get_cons()) ## list of continuous variables # #### Distributions # # In order to find a special/strange values we need to check the distributions of the variables. # # The distributions of categorical variables is shown by `distribution_cats()` inspector.distribution_cats(fields=["workclass", "dummy_ym"], sort=True) # If `fields` is not given, then the distributions of all categorical variables are shown. # # You can also give selected fields to see their distribution in the above form. You can also give a continuous variable. ## Obviously a histogram is better and easier than this, but this is just an example. inspector.distribution_cats(["age"]).reset_index(level=0)["count"].plot()