Example #1
0
    def test_significance(self):
        """
        Significance tests for a single pair of fields and for
        significance_test_features
        """

        inspector = Inspector(pd.read_csv(self.test_data), m_cats=20)

        ## The test for a single pair of fields must be a Series with
        ## 5 entries: field1, field2, test, statistic, p-value
        result = inspector.significance_test("fnlwgt", "age")
        self.assertIsInstance(result, pd.Series)
        self.assertEqual(len(result), 5)

        ## Default correlation
        self.assertEqual(result["test"], "Spearman correlation")

        ## The table of significance_test_features must have 5 columns.
        df_pval = inspector.significance_test_features("label")
        self.assertEqual(df_pval.shape[1], 5)

        ## Look up the name of the applied test by the value of field1.
        df_pval = df_pval.set_index("field1")
        expected_tests = {
            "age": "one-way ANOVA on ranks",
            "education-num": "chi-square test",
        }
        for field, test_name in expected_tests.items():
            self.assertEqual(df_pval.loc[field, "test"], test_name)
Example #2
0
    def test_regard_as(self):
        """
        Manual conversion of the variable type of a field
        """

        inspector = Inspector(pd.read_csv(self.test_data), m_cats=20)

        def variable_type_of_age():
            ## Always read the current inspection result.
            return inspector.result.loc["age", "variable"]

        self.assertEqual(variable_type_of_age(),
                         VariableType.continuous.name)

        inspector.regard_as_categorical("age")
        self.assertEqual(variable_type_of_age(),
                         VariableType.categorical.name)

        ## Assigning m_cats executes the inspection logic again,
        ## so that the manual setting above is lost.
        inspector.m_cats = 21
        self.assertEqual(variable_type_of_age(),
                         VariableType.continuous.name)
Example #3
0
    def test_visualize_two_fields(self):
        ## Smoke test: every call below only has to finish without an error.
        np.random.seed(1)
        df = load_iris(target="species")
        n_rows = df.shape[0]
        df["cat"] = np.random.choice(["a","b","c"], size=n_rows, replace=True)
        inspector = Inspector(df)

        field_pairs = [
            ("sepal_width", "sepal_length"),  ## continuous x continuous
            ("sepal_length", "species"),      ## continuous x categorical
            ("species", "petal_width"),       ## categorical x continuous
            ("species", "cat"),               ## categorical x categorical
        ]

        for field1, field2 in field_pairs:
            inspector.visualize_two_fields(field1, field2)
Example #4
0
    def test_distribution_timestamps_dates(self):
        ## A column of dates whose first value is missing:
        ## NaN, 2019-04-02, 2019-04-03, ..., 2019-04-06
        first_day = date(year=2019, month=4, day=1)
        values = [np.nan]
        values.extend(first_day + timedelta(days=d) for d in range(1, 6))

        inspector = Inspector(pd.DataFrame({"col": values}))
        df_stats = inspector.distribution_timestamps(fields=["col"])

        ## One row for the only field
        self.assertEqual(1, df_stats.shape[0])

        ## 5 dates are left and their mean falls on 2019-04-04.
        self.assertEqual(5, df_stats.loc["col","count"])
        self.assertEqual(4, df_stats.loc["col","mean"].day)
        self.assertIsInstance(df_stats.loc["col","std"], timedelta)
Example #5
0
    def test_an_inspection(self):
        """
        Check the inspection result (variable types and n_unique)
        for two different values of m_cats
        """
        df = pd.read_csv(self.test_data)

        inspector = Inspector(df, m_cats=20)

        ## attribute check
        self.assertEqual(20, inspector.m_cats)

        ## With m_cats=20 the field education-num is categorical.
        self.assertEqual(
            inspector.result.loc["education-num", "variable"],
            VariableType.categorical.name
        )

        ## nan must be ignored
        self.assertEqual(
            inspector.result.loc["workclass", "n_unique"],
            8
        )

        self.assertEqual(
            inspector.result.loc["sex", "variable"],
            VariableType.binary.name
        )

        ## A column with a single value must be detected as a constant.
        df["const"] = 1
        # TODO: [datetime(year=2019,month=1,day=1) + timedelta(hours=h) for h in range(360)]
        inspector = Inspector(df, m_cats=15)

        self.assertEqual(
            inspector.result.loc["const", "variable"],
            VariableType.constant.name
        )

        ## With m_cats=15 the field education-num becomes continuous.
        self.assertEqual(
            inspector.result.loc["education-num", "variable"],
            VariableType.continuous.name
        )

        ## An "object" column must always be categorical.
        ## assertIn shows the list of fields when the check fails.
        self.assertIn("education", inspector.get_cats())

        self.assertEqual(inspector.get_cats(),
                         ["workclass", "education", "marital-status",
                          "occupation", "relationship", "race",
                          "sex", "native-country", "label"])

        self.assertEqual(inspector.get_cons(),
                         ["age", "fnlwgt", "education-num","capital-gain",
                          "capital-loss", "hours-per-week"])
Example #6
0
    def test_distribution_timestamps(self):
        center = datetime(year=2019, month=4, day=1, tzinfo=utc)

        ## The offsets are symmetric around 0, so that the mean of
        ## each timestamp column is equal to center.
        offsets = range(-2, 3)

        df = pd.DataFrame({
            "col1": [center + timedelta(days=k) for k in offsets],
            "col2": [center + timedelta(hours=3*k) for k in offsets],
            "dummy": list(offsets)
        })

        df_stats = Inspector(df).distribution_timestamps()

        ## 2 rows are expected; presumably only col1 and col2 are
        ## picked up and the integer column "dummy" is left out.
        self.assertEqual(2, df_stats.shape[0])

        for field in ("col1", "col2"):
            self.assertEqual(center, df_stats.loc[field, "mean"])

        self.assertIsInstance(df_stats.loc["col1","std"], timedelta)
Example #7
0
    def test_distribution(self):
        """
        DataFrames of distributions of categorical and continuous variables
        """

        df = pd.read_csv(self.test_data)
        n_rows = df.shape[0]
        inspector = Inspector(df, m_cats=20)

        ## Categorical variables: the rate of a value must agree with
        ## its count divided by the number of rows.
        workclass = inspector.distribution_cats().loc["workclass"]
        self.assertAlmostEqual(
            workclass.loc["Private", "count"] / n_rows,
            workclass.loc["Private", "rate"]
        )

        ## Continuous variables: the DataFrame is just a transpose of
        ## describe(), so it has one row for each continuous variable
        ## and 8 columns.
        df_con = inspector.distribution_cons()
        expected_shape = (len(inspector.get_cons()), 8)
        self.assertEqual(df_con.shape, expected_shape)
Example #8
0
    for d in np.random.normal(loc=0, scale=30, size=df.shape[0])
]
# Put a missing value into the row with label 0 of "dummy_ts".
# A single .loc assignment is used instead of the chained form
# df["dummy_ts"][0] = ..., which may write to a temporary copy rather than df.
# NOTE(review): assumes that df has an integer index containing the label 0.
df.loc[0, "dummy_ts"] = np.nan

# First day of the month of each timestamp
df["dummy_ym"] = df["dummy_ts"].apply(lambda ts: ts.date().replace(day=1))

df.head()
# -

# ### 1. Check the quality of data
#
# Creating an instance of `Inspector`, you can get an overview of the data quality of your dataset.

# +
from adhoc.processing import Inspector

inspector = Inspector(df, m_cats=20)
inspector  # last expression of the cell, so the notebook shows its representation
# -

# First of all the instance `inspector` is **not a DataFrame**. The default representation of the instance is the result of the inspection of the given DataFrame. You can access the DataFrame by the property `inspector.result`.

# Rows of the inspection result whose count_na is positive
inspector.result.query("count_na > 0")

# #### Description of fields of `inspector.result`
#
# - dtype: This is the result of `df.dtypes`
# - count_na: The number of missing values (NA) in the column. `df.isna().sum()`
# - rate_na: The rate (proportion) of missing values (NA) in the column. `df.isna().mean()`
# - n_unique: The number of distinct values in the column. **We ignore missing values here.**
# - distinct: `True` if every row has a different value, otherwise `False`. When it is `True`, the column can be an ID such as a primary key, or just a continuous variable.
# - variable: See below
Example #9
0
    from adhoc.processing import Inspector
    from adhoc.modeling import show_tree
    from adhoc.utilities import load_iris, facet_grid_scatter_plot, bins_heatmap

# +
# Fix the seed so that the random column "cat1" is reproducible.
np.random.seed(1)

df = load_iris(target="species")

# "cat1": one of "a", "b", "c" chosen at random for each row
df["cat1"] = np.random.choice(
    ["a", "b", "c"],
    size=df.shape[0],
    replace=True,
)

# "cat2": 1 if (column 0) * (column 1) - (column 2) * (column 3) > 11 else 0
df["cat2"] = (
    df.iloc[:, 0] * df.iloc[:, 1] - df.iloc[:, 2] * df.iloc[:, 3] > 11
).map({True: 1, False: 0})

inspector = Inspector(df)
inspector  ## 4 continuous variables and 3 categorical variables
# -

inspector.visualize_two_fields("sepal_width", "sepal_length")  ## continuous x continuous

inspector.visualize_two_fields("petal_width", "species")  ## continuous x categorical

inspector.visualize_two_fields("species", "petal_width")  ## categorical x continuous

inspector.visualize_two_fields("species", "cat2")  ## categorical x categorical

inspector.visualize_two_fields("species", "cat2", heatmap=True)  ## the same pair with heatmap=True
Example #10
0
# - ScreenPorch: Screen porch area in square feet
# - PoolArea: Pool area in square feet
# - PoolQC: Pool quality
# - Fence: Fence quality
# - MiscFeature: Miscellaneous feature not covered in other categories
# - MiscVal: $Value of miscellaneous feature
# - MoSold: Month Sold
# - YrSold: Year Sold
# - SaleType: Type of sale
# - SaleCondition: Condition of sale
#

# +
from adhoc.processing import Inspector

inspector_train = Inspector(df_train)

# Lift the limit on the number of displayed rows within this block only,
# so that the displayed table is not truncated.
with pd.option_context("display.max_rows",None):
    display(inspector_train)


# -

# ### Data type correction
#
# Some categorical values are described as numbers in the data set so that they look like continuous variables. For example `MSSubClass` (the building class). 

def correct_dtype(data:pd.DataFrame) -> pd.DataFrame:
    df = data.copy()
    df["MSSubClass"] = df["MSSubClass"].apply(lambda x: f"C{x:03d}")
    df["MoSold"] = df["MoSold"].apply(lambda x: f"M{x:02d}")