Beispiel #1
0
    def test_get_perc_works_as_expected(self):
        """Check the strings ``DataFrameSummary._percent`` returns for sample ratios."""
        # Ratios expected to be rendered with two decimals (and a thousands
        # separator once the percentage reaches four digits).
        two_decimal_cases = [
            (0.123, '12.30%'),
            (3.1243453, '312.43%'),
            (213.12312, '21,312.31%'),
        ]
        # Ratios expected to be rendered without a decimal part.
        whole_number_cases = [
            (0.14, '14%'),
            (1.300, '130%'),
        ]

        # Every case gets the same assertion, so walk both groups in one
        # pass (two-decimal cases first, then the whole-number ones).
        for ratio, rendered in two_decimal_cases + whole_number_cases:
            self.assertEqual(DataFrameSummary._percent(ratio), rendered)
    def test_get_perc_works_as_expected(self):
        """Verify ``DataFrameSummary._percent`` output against known strings."""

        def check_all(cases):
            # Each case is an (input ratio, expected percent string) pair.
            for ratio, text in cases:
                self.assertEqual(DataFrameSummary._percent(ratio), text)

        # Inputs whose expected string carries two decimal places.
        check_all([
            (0.123, '12.30%'),
            (3.1243453, '312.43%'),
            (213.12312, '21,312.31%'),
        ])

        # Inputs whose expected string has no decimal part.
        check_all([
            (0.14, '14%'),
            (1.300, '130%'),
        ])
    def test_bool1_summary(self):
        """Compare the ``dbool1`` column summary with a hand-built Series.

        Both sides are sorted by index before comparison, so the check covers
        labels and values but not the order in which the summary lists them.
        """
        column = self.df['dbool1']
        frequencies = column.value_counts()
        n_valid = column.count()
        n_zero = frequencies[0]
        n_one = frequencies[1]
        share_zero = DataFrameSummary._percent(n_zero / n_valid)
        share_one = DataFrameSummary._percent(n_one / n_valid)

        # Keep each label next to its value so the two cannot drift apart.
        rows = [
            ('"0" count', str(n_zero)),
            ('"0" perc', share_zero),
            ('"1" count', str(n_one)),
            ('"1" perc', share_one),
            ('counts', self.size),
            ('uniques', 2),
            ('missing', 0),
            ('missing_perc', '0%'),
            ('types', DataFrameSummary.TYPE_BOOL),
        ]
        labels = [label for label, _ in rows]
        values = [value for _, value in rows]
        wanted = pd.Series(data=values, index=labels, name='dbool1', dtype=object)

        assert_series_equal(self.dfs['dbool1'].sort_index(), wanted.sort_index())
    def test_numerics_summary(self):
        """Compare the ``dnumerics1`` column summary with a hand-built Series.

        The expected statistics are recomputed from ``self.df['dnumerics1']``
        with plain pandas/numpy calls; the deviation figures are taken from
        the summary object's own helper methods.
        """
        num1 = self.df['dnumerics1']
        dm, dmp = self.dfs._get_deviation_of_mean(num1)
        dam, damp = self.dfs._get_median_absolute_deviation(num1)

        mean = num1.mean()
        std = num1.std()
        q25 = num1.quantile(0.25)
        q75 = num1.quantile(0.75)
        # ``Series.mad()`` was deprecated in pandas 1.5 and removed in 2.0;
        # this is the same mean absolute deviation written out explicitly.
        mad = (num1 - mean).abs().mean()
        zeros_num = self.size - np.count_nonzero(num1)

        # Each label sits next to its value so the two lists cannot drift
        # out of step; the order below is the order the summary must have.
        rows = [
            ('mean', mean),
            ('std', std),
            ('variance', num1.var()),
            ('min', num1.min()),
            ('max', num1.max()),
            ('5%', num1.quantile(0.05)),
            ('25%', q25),
            ('50%', num1.quantile(0.5)),
            ('75%', q75),
            ('95%', num1.quantile(0.95)),
            ('iqr', q75 - q25),
            ('kurtosis', num1.kurt()),
            ('skewness', num1.skew()),
            ('sum', num1.sum()),
            ('mad', mad),
            # Coefficient of variation is undefined for a zero mean.
            ('cv', std / mean if mean else np.nan),
            ('zeros_num', zeros_num),
            ('zeros_perc', DataFrameSummary._percent(zeros_num / self.size)),
            ('deviating_of_mean', dm),
            ('deviating_of_mean_perc', dmp),
            ('deviating_of_median', dam),
            ('deviating_of_median_perc', damp),
            ('top_correlations', 'dnumerics2: 100%'),
            ('counts', self.size),
            ('uniques', self.size),
            ('missing', 0),
            ('missing_perc', '0%'),
            ('types', DataFrameSummary.TYPE_NUMERIC),
        ]
        expected = pd.Series(index=[label for label, _ in rows],
                             data=[value for _, value in rows],
                             name='dnumerics1',
                             dtype=object)

        assert_series_equal(self.dfs['dnumerics1'], expected)
Beispiel #5
0
    def test_bool1_summary(self):
        """Check the ``dbool1`` column summary, including the order of its entries."""
        bool_col = self.df['dbool1']
        tally = bool_col.value_counts()
        present = bool_col.count()
        zeros = tally[0]
        ones = tally[1]

        # Labels in the exact order the summary is expected to list them.
        index = [
            '"0" count',
            '"0" perc',
            '"1" count',
            '"1" perc',
            'counts',
            'uniques',
            'missing',
            'missing_perc',
            'types',
        ]
        # Values aligned one-to-one with ``index`` above.
        data = [
            str(zeros),
            DataFrameSummary._percent(zeros / present),
            str(ones),
            DataFrameSummary._percent(ones / present),
            self.size,
            2,
            0,
            '0%',
            DataFrameSummary.TYPE_BOOL,
        ]
        reference = pd.Series(data=data, index=index, name='dbool1', dtype=object)

        assert_series_equal(self.dfs['dbool1'], reference)
Beispiel #6
0
    def test_numerics_summary(self):
        """Compare the ``dnumerics1`` column summary with a hand-built Series.

        The expected statistics (including the mode) are recomputed from
        ``self.df['dnumerics1']`` with plain pandas/numpy calls; the deviation
        figures are taken from the summary object's own helper methods.
        """
        num1 = self.df['dnumerics1']
        dm, dmp = self.dfs._get_deviation_of_mean(num1)
        dam, damp = self.dfs._get_median_absolute_deviation(num1)

        mean = num1.mean()
        std = num1.std()
        q25 = num1.quantile(0.25)
        q75 = num1.quantile(0.75)
        # ``Series.mad()`` was deprecated in pandas 1.5 and removed in 2.0;
        # this is the same mean absolute deviation written out explicitly.
        mad = (num1 - mean).abs().mean()
        zeros_num = self.size - np.count_nonzero(num1)

        # Each label sits next to its value so the two lists cannot drift
        # out of step; the order below is the order the summary must have.
        rows = [
            ('mean', mean),
            ('std', std),
            ('variance', num1.var()),
            ('min', num1.min()),
            ('max', num1.max()),
            # ``mode()`` returns a Series; the first entry is the one compared.
            ('mode', num1.mode()[0]),
            ('5%', num1.quantile(0.05)),
            ('25%', q25),
            ('50%', num1.quantile(0.5)),
            ('75%', q75),
            ('95%', num1.quantile(0.95)),
            ('iqr', q75 - q25),
            ('kurtosis', num1.kurt()),
            ('skewness', num1.skew()),
            ('sum', num1.sum()),
            ('mad', mad),
            # Coefficient of variation is undefined for a zero mean.
            ('cv', std / mean if mean else np.nan),
            ('zeros_num', zeros_num),
            ('zeros_perc', DataFrameSummary._percent(zeros_num / self.size)),
            ('deviating_of_mean', dm),
            ('deviating_of_mean_perc', dmp),
            ('deviating_of_median', dam),
            ('deviating_of_median_perc', damp),
            ('top_correlations', 'dnumerics2: 100%'),
            ('counts', self.size),
            ('uniques', self.size),
            ('missing', 0),
            ('missing_perc', '0%'),
            ('types', DataFrameSummary.TYPE_NUMERIC),
        ]
        expected = pd.Series(
            index=[label for label, _ in rows],
            data=[value for _, value in rows],
            name='dnumerics1',
            dtype=object)

        assert_series_equal(self.dfs['dnumerics1'], expected)