コード例 #1
0
    def test_an_inspection(self):
        """
        Check the inspection table produced by ``Inspector``.

        The CSV at ``self.test_data`` is inspected twice: first with
        ``m_cats=20`` and then, after a constant column has been added,
        with ``m_cats=15``. The assertions pin the expected variable type
        of selected columns and the exact lists returned by ``get_cats()``
        and ``get_cons()`` for the second inspection.
        """
        df = pd.read_csv(self.test_data)

        inspector = Inspector(df, m_cats=20)

        ## attribute check
        self.assertEqual(20, inspector.m_cats)

        ## with m_cats=20 the integer column education-num is expected
        ## to be categorical (presumably m_cats is the threshold on the
        ## number of distinct values -- confirm against Inspector)
        self.assertEqual(
            inspector.result.loc["education-num", "variable"],
            VariableType.categorical.name
        )

        ## nan must be ignored
        self.assertEqual(
            inspector.result.loc["workclass", "n_unique"],
            8
        )

        self.assertEqual(
            inspector.result.loc["sex", "variable"],
            VariableType.binary.name
        )

        ## add a column with a single value and inspect again
        df["const"] = 1
        # TODO: [datetime(year=2019,month=1,day=1) + timedelta(hours=h) for h in range(360)]
        inspector = Inspector(df, m_cats=15)

        self.assertEqual(
            inspector.result.loc["const", "variable"],
            VariableType.constant.name
        )

        ## with m_cats=15 the same column is expected to be continuous
        self.assertEqual(
            inspector.result.loc["education-num", "variable"],
            VariableType.continuous.name
        )

        ## An "object" column must always be categorical
        ## (assertIn reports the actual list when the check fails)
        self.assertIn("education", inspector.get_cats())

        ## the order of the names matters: the lists are compared as they are
        self.assertEqual(inspector.get_cats(),
                         ["workclass", "education", "marital-status",
                          "occupation", "relationship", "race",
                          "sex", "native-country", "label"])

        self.assertEqual(inspector.get_cons(),
                         ["age", "fnlwgt", "education-num","capital-gain",
                          "capital-loss", "hours-per-week"])
コード例 #2
0
    def test_distribution(self):
        """
        Verify the distribution tables built by ``Inspector``.
        """
        data = pd.read_csv(self.test_data)
        n_rows = len(data)
        inspector = Inspector(data, m_cats=20)

        ## categorical variables: "rate" must agree with "count" divided
        ## by the number of rows
        workclass_dist = inspector.distribution_cats().loc["workclass"]
        private_count = workclass_dist.loc["Private", "count"]
        private_rate = workclass_dist.loc["Private", "rate"]
        self.assertAlmostEqual(private_count / n_rows, private_rate)

        ## continuous variables: one row per continuous variable.
        ## Since the table is just a transpose of describe(),
        ## the number of columns is equal to 8
        cons_table = inspector.distribution_cons()
        actual_shape = cons_table.shape
        expected_shape = (len(inspector.get_cons()), 8)
        self.assertEqual(actual_shape, expected_shape)
コード例 #3
0
ファイル: usage-processing.py プロジェクト: stdiff/adhoc
# Assigning a number to `m_cats` recomputes the inspection, so any manual modification of the variable types is lost.

inspector.m_cats = 20
inspector.result.query("dtype == 'int64'")  ## The variable type of age is now continuous.

# If you need to recompute the inspection, for example after converting a column, call `make_an_inspection()`.

inspector.make_an_inspection()

# The lists of categorical/continuous variables are easy to get. (Note that a constant variable is neither categorical nor continuous.)

print(inspector.get_cats())  ## list of categorical variables (binary or categorical)

print(inspector.get_cons())  ## list of continuous variables

# #### Distributions
#
# To find special/strange values we need to check the distributions of the variables.
#
# `distribution_cats()` shows the distributions of categorical variables.

inspector.distribution_cats(sort=True, fields=["workclass", "dummy_ym"])

# If `fields` is not given, then the distributions of all categorical variables are shown.
#
# You can also pass selected fields, including a continuous variable, to see their distributions in the same form.

## Obviously a histogram is better and easier than this, but this is just an example.
(
    inspector.distribution_cats(["age"])
    .reset_index(level=0)["count"]
    .plot()
)