Example #1
0
    def test_significance_matrix(self):
        """Check asymptotic significances for selected variable pairs.

        Reads the fake car insurance fixture, computes the significance
        matrix with ``significance_method='asymptotic'`` and compares four
        entries (two variable pairs, each in both index orders) against
        reference values using ``np.isclose``.
        """

        import numpy as np
        import pandas as pd
        from phik import resources

        # load the fixture and remember the column order for index lookups
        frame = pd.read_csv(resources.fixture('fake_insurance_data.csv.gz'))
        names = list(frame.columns)

        significances = frame.significance_matrix(
            interval_cols=['driver_age', 'mileage'],
            significance_method='asymptotic',
        )

        # (row variable, column variable, reference value), listed in the
        # order in which the checks are performed
        expectations = (
            ('car_color', 'area', 37.66184429195198),
            ('area', 'car_color', 37.66184429195198),
            ('mileage', 'car_size', 49.3323049685695),
            ('car_size', 'mileage', 49.3323049685695),
        )
        for row_name, col_name, reference in expectations:
            observed = significances.values[names.index(row_name),
                                            names.index(col_name)]
            self.assertTrue(np.isclose(observed, reference))
Example #2
0
    def test_phik_matrix(self):
        """Check selected entries of the Phi_K correlation matrix.

        Reads the fake car insurance fixture, computes the Phi_K matrix
        over all columns and compares four entries (two variable pairs,
        each in both index orders) against reference values using
        ``np.isclose``.
        """

        import numpy as np
        import pandas as pd
        from phik import resources

        # load the fixture and remember the column order for index lookups
        frame = pd.read_csv(resources.fixture('fake_insurance_data.csv.gz'))
        names = list(frame.columns)

        # Phi_K correlation matrix between all variables
        correlations = frame.phik_matrix(
            interval_cols=['driver_age', 'mileage'])

        # (row variable, column variable, reference value), listed in the
        # order in which the checks are performed
        expectations = (
            ('car_color', 'area', 0.5904561614620166),
            ('area', 'car_color', 0.5904561614620166),
            ('mileage', 'car_size', 0.768588987856336),
            ('car_size', 'mileage', 0.768588987856336),
        )
        for row_name, col_name, reference in expectations:
            observed = correlations.values[names.index(row_name),
                                           names.index(col_name)]
            self.assertTrue(np.isclose(observed, reference))
Example #3
0
    def test_outlier_significance_matrices(self):
        """Check that outlier_significance_matrices returns a dict.

        Only the type of the result is verified; its contents are not
        inspected here.
        """

        # load the fake car insurance fixture
        fixture_path = resources.fixture('fake_insurance_data.csv.gz')
        frame = pd.read_csv(fixture_path)

        # outlier significances, with the listed columns passed as
        # interval_cols
        result = frame.outlier_significance_matrices(
            interval_cols=['mileage', 'driver_age'])

        self.assertTrue(isinstance(result, dict))
Example #4
0
    def test_hist2d_array(self):
        """Check the contingency matrix built from two individual columns.

        Calls ``hist2d`` on the ``mileage`` column with the ``car_size``
        column as argument and verifies the counts in two cells.
        """

        # load the fake car insurance fixture
        fixture_path = resources.fixture('fake_insurance_data.csv.gz')
        frame = pd.read_csv(fixture_path)

        # contingency matrix of mileage versus car_size
        mileage = frame['mileage']
        car_size = frame['car_size']
        contingency = mileage.hist2d(car_size, interval_cols=['mileage'])

        # (row, column, expected count)
        for row, col, expected in ((1, 1, 10), (5, 5, 217)):
            self.assertEqual(contingency.values[row, col], expected)
Example #5
0
    def test_outlier_significance_matrix(self):
        """Check two entries of the mileage/car_size outlier significances.

        Restricts the fixture to the ``mileage`` and ``car_size`` columns,
        computes the outlier significance matrix and compares two cells
        against reference values using ``np.isclose``.
        """

        # load the fake car insurance fixture
        fixture_path = resources.fixture('fake_insurance_data.csv.gz')
        frame = pd.read_csv(fixture_path)

        # outlier significances for the selected pair of columns
        pair = frame[['mileage', 'car_size']]
        outliers = pair.outlier_significance_matrix(interval_cols=['mileage'])

        # (row, column, reference value)
        expectations = (
            (0, 1, 21.483476494343552),
            (2, 4, -1.246784034214704),
        )
        for row, col, reference in expectations:
            self.assertTrue(np.isclose(outliers.values[row, col], reference))
Example #6
0
    def test_significance_matrix_hybrid(self):
        """Check hybrid-method significances for selected variable pairs.

        Computes the significance matrix with
        ``significance_method='hybrid'`` and compares four entries (two
        variable pairs, each in both index orders) against reference
        values with an absolute tolerance.
        """

        # load the fixture and remember the column order for index lookups
        frame = pd.read_csv(resources.fixture('fake_insurance_data.csv.gz'))
        names = list(frame.columns)

        significances = frame.significance_matrix(
            interval_cols=['driver_age', 'mileage'],
            significance_method='hybrid',
        )

        # (row variable, column variable, reference value), listed in the
        # order in which the checks are performed
        expectations = (
            ('car_color', 'area', 37.63086023595297),
            ('area', 'car_color', 37.63086023595297),
            ('mileage', 'car_size', 49.28345609465683),
            ('car_size', 'mileage', 49.28345609465683),
        )
        for row_name, col_name, reference in expectations:
            observed = significances.values[names.index(row_name),
                                            names.index(col_name)]
            # note: atol=10e-2 is an absolute tolerance of 0.1
            self.assertTrue(np.isclose(observed, reference, atol=10e-2))
Example #7
0
    def test_significance_matrix_mc(self):
        """Check MC-method significances for selected variable pairs.

        Computes the significance matrix with ``significance_method='MC'``
        and asserts that four entries (two variable pairs, each in both
        index orders) compare as close to ``np.inf``.
        """

        # load the fixture and remember the column order for index lookups
        frame = pd.read_csv(resources.fixture('fake_insurance_data.csv.gz'))
        names = list(frame.columns)

        significances = frame.significance_matrix(
            interval_cols=['driver_age', 'mileage'],
            significance_method='MC',
        )

        # (row variable, column variable), listed in the order in which
        # the checks are performed
        checked_pairs = (
            ('car_color', 'area'),
            ('area', 'car_color'),
            ('mileage', 'car_size'),
            ('car_size', 'mileage'),
        )
        for row_name, col_name in checked_pairs:
            observed = significances.values[names.index(row_name),
                                            names.index(col_name)]
            self.assertTrue(np.isclose(observed, np.inf))
Example #8
0
    def test_hist2d(self):
        """Check the contingency matrix built from a two-column frame.

        Calls ``hist2d`` on the fixture restricted to the ``mileage`` and
        ``car_size`` columns and verifies the counts in two cells.
        """

        import pandas as pd
        from phik import resources

        # load the fake car insurance fixture
        fixture_path = resources.fixture('fake_insurance_data.csv.gz')
        frame = pd.read_csv(fixture_path)

        # contingency matrix of the selected pair of columns
        pair = frame[['mileage', 'car_size']]
        contingency = pair.hist2d(interval_cols=['mileage'])

        # (row, column, expected count)
        for row, col, expected in ((1, 1, 10), (5, 5, 217)):
            self.assertEqual(contingency.values[row, col], expected)
Example #9
0
    def test_global_phik(self):
        """Check two global Phi_K values by position.

        Computes the global Phi_K result for the fixture and compares the
        entries at positions 0 and 4 of its first element against
        reference values using ``np.isclose``.
        """

        import numpy as np
        import pandas as pd
        from phik import resources

        # load the fake car insurance fixture
        fixture_path = resources.fixture('fake_insurance_data.csv.gz')
        frame = pd.read_csv(fixture_path)

        # global Phi_K values; only the first element of the result is used
        result = frame.global_phik(interval_cols=['driver_age', 'mileage'])

        # (position, reference value)
        # NOTE(review): positions are hard-coded and presumably follow the
        # column order of the fixture - confirm before reordering columns
        expectations = (
            (0, 0.6057528003711345),
            (4, 0.768588987856336),
        )
        for position, reference in expectations:
            self.assertTrue(np.isclose(result[0][position][0], reference))
Example #10
0
    def test_global_phik(self):
        """Check global Phi_K values looked up by variable name.

        Computes the global Phi_K result for the fixture, locates the
        positions of ``area``, ``car_size`` and ``mileage`` in the second
        element of the result and compares the matching entries of the
        first element against reference values using ``np.isclose``.
        """

        # load the fake car insurance fixture
        fixture_path = resources.fixture('fake_insurance_data.csv.gz')
        frame = pd.read_csv(fixture_path)

        # global Phi_K values
        result = frame.global_phik(interval_cols=['driver_age', 'mileage'])

        # (variable name, reference value), in the order of the checks
        expectations = (
            ('area', 0.6057528003711345),
            ('car_size', 0.76858883),
            ('mileage', 0.768588987856336),
        )

        # resolve every position up front, before any assertion runs
        positions = [(np.where(result[1] == label))[0][0]
                     for label, _ in expectations]

        for position, (_, reference) in zip(positions, expectations):
            self.assertTrue(np.isclose(result[0][position][0], reference))
Example #11
0
import pandas as pd

import phik
from phik import resources, report

# Walk-through of the phik pandas accessors on the bundled example data.
# The bare expressions below display their value only in an interactive
# session or notebook; run as a plain script they are evaluated and discarded.

# open fake car insurance data
df = pd.read_csv(resources.fixture('fake_insurance_data.csv.gz'))
df.head()

# Pearson's correlation matrix between numeric variables (pandas functionality).
# The numeric (and boolean) columns are selected explicitly: since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# when it meets one, and the frame presumably holds categorical columns such
# as car_color. Selecting by dtype also works on older pandas releases.
df.select_dtypes(include=['number', 'bool']).corr()

# get the phi_k correlation matrix between all variables
df.phik_matrix()

# get global correlations based on phi_k correlation matrix
df.global_phik()

# get the significance matrix (expressed as one-sided Z)
# of the hypothesis test of each variable-pair dependency
df.significance_matrix()

# contingency table of two columns
cols = ['mileage', 'car_size']
df[cols].hist2d()

# normalized residuals of contingency test applied to cols
df[cols].outlier_significance_matrix()

# show the normalized residuals of each variable-pair
df.outlier_significance_matrices()