コード例 #1
0
    def setUp(self):
        """Build the shared nine-row fixture frame and its description.

        Populates ``self.data`` (raw column values), ``self.df`` (the
        DataFrame, with ``somedate`` passed through ``pd.to_datetime``),
        ``self.results`` (the output of ``describe(self.df)``) and
        ``self.test_dir`` (a fresh directory from ``tempfile.mkdtemp()``).
        Removal of that directory is not handled in this method.
        """
        # Identifiers 'b'..'j' (character codes 98..106).
        ids = [chr(code) for code in range(98, 107)]
        x_values = [50, 50, -10, 0, 0, 5, 15, -3, None]
        y_values = [
            0.000001, 654.152, None, 15.984512, 3122, -3.1415926535, 111,
            15.9, 13.5,
        ]
        categories = [
            'a',
            'long text value',
            u'Élysée',
            '',
            None,
            'some <b> B.s </div> </div> HTML stuff',
            'c',
            'c',
            'c',
        ]
        constant_text = [u'some constant text $ % value {obj} '] * 9

        # Mix of date, datetime and NaN entries; several values repeat.
        dec_1990 = datetime.datetime(1990, 12, 9)
        dec_1950 = datetime.datetime(1950, 12, 9)
        dates = [
            datetime.date(2011, 7, 4),
            datetime.datetime(2022, 1, 1, 13, 57),
            dec_1990,
            np.nan,
            dec_1990,
            dec_1950,
            datetime.datetime(1898, 1, 2),
            dec_1950,
            dec_1950,
        ]
        flags = [True, True, False, True, False, True, True, False, True]

        self.data = {
            'id': ids,
            'x': x_values,
            'y': y_values,
            'cat': categories,
            's1': np.ones(9),
            's2': constant_text,
            'somedate': dates,
            'bool': flags,
        }

        frame = pd.DataFrame(self.data)
        frame['somedate'] = pd.to_datetime(frame['somedate'])
        self.df = frame

        self.results = describe(self.df)
        self.test_dir = tempfile.mkdtemp()
コード例 #2
0
ファイル: tests.py プロジェクト: romainx/pandas-profiling
    def test_recoding_reject(self):
        # Columns 'x' and 'y' carry the same grouping under different
        # labels; the assertions below expect 'x' to be typed RECODED with
        # 'y' recorded as its correlation_var.
        french = ['chien'] * 4 + ['chat'] * 2 + ['chameaux'] * 2
        english = ['dog'] * 4 + ['cat'] * 2 + ['camel'] * 2
        self.data = {'x': french, 'y': english}
        self.df = pd.DataFrame(self.data)
        self.results = describe(self.df)

        x_type = self.results['variables'].loc['x']['type']
        self.assertEqual(x_type, 'RECODED')
        x_partner = self.results['variables'].loc['x']['correlation_var']
        self.assertEqual(x_partner, 'y')

        # Expected entries of the summary table for this two-column frame.
        expected_results = {
            'total_missing': 0.0,
            'UNIQUE': 0,
            'CONST': 0,
            'nvar': 2,
            'REJECTED': 1,
            'n': 8,
            'RECODED': 1,
            'CORR': 0,
            'DATE': 0,
            'NUM': 0,
            'CAT': 1,
            'n_duplicates': 5,
        }
        for key, expected in expected_results.items():
            self.assertEqual(self.results['table'][key], expected)

        # Second pass with check_correlation=False: no correlation_var is
        # expected on 'x' and the REJECTED count is expected to be zero.
        self.results2 = describe(self.df, check_correlation=False)
        x_row = self.results2['variables'].loc['x']
        self.assertIsNone(x_row.get('correlation_var'))
        rejected = self.results2['table']['REJECTED']
        self.assertEqual(rejected, 0)
コード例 #3
0
ファイル: tests.py プロジェクト: sleitner/pandas-profiling
    def test_recoding_reject(self):
        # With check_recoded=True, column 'x' (same grouping as 'y' under
        # different labels) is expected to be typed RECODED with 'y' as its
        # correlation_var.
        x_labels = ['chien'] * 4 + ['chat'] * 2 + ['chameaux'] * 2
        y_labels = ['dog'] * 4 + ['cat'] * 2 + ['camel'] * 2
        self.data = {'x': x_labels, 'y': y_labels}
        self.df = pd.DataFrame(self.data)
        self.results = describe(self.df, check_recoded=True)

        self.assertEqual(
            self.results['variables'].loc['x']['type'], 'RECODED')
        self.assertEqual(
            self.results['variables'].loc['x']['correlation_var'], 'y')

        # Expected entries of the summary table for this two-column frame.
        expected_results = {
            'total_missing': 0.0, 'UNIQUE': 0, 'CONST': 0, 'nvar': 2,
            'REJECTED': 1, 'n': 8, 'RECODED': 1, 'CORR': 0, 'DATE': 0,
            'NUM': 0, 'CAT': 1, 'n_duplicates': 5,
        }
        for name, wanted in expected_results.items():
            self.assertEqual(self.results['table'][name], wanted)

        # Second pass with check_correlation=False: 'x' should carry no
        # correlation_var and nothing should be counted as REJECTED.
        self.results2 = describe(self.df, check_correlation=False)
        uncorrelated_x = self.results2['variables'].loc['x']
        self.assertIsNone(uncorrelated_x.get('correlation_var'))
        self.assertEqual(self.results2['table']['REJECTED'], 0)
コード例 #4
0
ファイル: tests.py プロジェクト: romainx/pandas-profiling
    def setUp(self):
        """Create the nine-row fixture frame shared by the tests.

        Sets ``self.data``, ``self.df`` (with ``somedate`` converted by
        ``pd.to_datetime``), ``self.results`` (``describe(self.df)``) and
        ``self.test_dir`` (a new ``tempfile.mkdtemp()`` directory).
        """
        day_1990 = datetime.datetime(1990, 12, 9)
        day_1950 = datetime.datetime(1950, 12, 9)

        # Columns are added one at a time, in the original key order.
        columns = {}
        columns['id'] = [chr(code) for code in range(98, 107)]  # 'b'..'j'
        columns['x'] = [50, 50, -10, 0, 0, 5, 15, -3, None]
        columns['y'] = [
            0.000001, 654.152, None, 15.984512, 3122,
            -3.1415926535, 111, 15.9, 13.5,
        ]
        columns['cat'] = [
            'a', 'long text value', u'Élysée', '', None,
            'some <b> B.s </div> </div> HTML stuff',
            'c', 'c', 'c',
        ]
        columns['s1'] = np.ones(9)
        columns['s2'] = [u'some constant text $ % value {obj} '] * 9
        columns['somedate'] = [
            datetime.date(2011, 7, 4),
            datetime.datetime(2022, 1, 1, 13, 57),
            day_1990,
            np.nan,
            day_1990,
            day_1950,
            datetime.datetime(1898, 1, 2),
            day_1950,
            day_1950,
        ]
        columns['bool'] = [
            True, True, False, True, False, True, True, False, True,
        ]
        self.data = columns

        self.df = pd.DataFrame(self.data)
        parsed_dates = pd.to_datetime(self.df['somedate'])
        self.df['somedate'] = parsed_dates

        self.results = describe(self.df)
        self.test_dir = tempfile.mkdtemp()
コード例 #5
0
ファイル: tests.py プロジェクト: sleitner/pandas-profiling
 def test_bins(self):
     # Replace the fixture's description with one computed using bins=100,
     # then re-run test_describe_df (presumably it reads self.results --
     # confirm against that method).
     binned_results = describe(self.df, bins=100)
     self.results = binned_results
     self.test_describe_df()
コード例 #6
0
ファイル: tests.py プロジェクト: sleitner/pandas-profiling
    def setUp(self):
        """Build the extended nine-row fixture frame and its description.

        Besides string, numeric, constant and date columns, the frame holds
        several boolean-like columns (with and without NaN) and columns of
        list, mixed, dict and tuple values.  Sets ``self.data``,
        ``self.df`` (with ``somedate`` converted by ``pd.to_datetime``),
        ``self.results`` (``describe(self.df)``) and ``self.test_dir``
        (a new ``tempfile.mkdtemp()`` directory).
        """
        dec_1990 = datetime.datetime(1990, 12, 9)
        dec_1950 = datetime.datetime(1950, 12, 9)
        dates = [
            datetime.date(2011, 7, 4),
            datetime.datetime(2022, 1, 1, 13, 57),
            dec_1990,
            np.nan,
            dec_1990,
            dec_1950,
            datetime.datetime(1898, 1, 2),
            dec_1950,
            dec_1950,
        ]

        # Nine separate [1, 2] lists (not one shared object).
        list_cells = [[1, 2] for _ in range(9)]
        # Single-entry dicts {'a': 'a'} .. {'i': 'i'}.
        dict_cells = [{letter: letter} for letter in 'abcdefghi']
        # Pairs (1, 2), (3, 4), ..., (17, 18).
        tuple_cells = [(odd, odd + 1) for odd in range(1, 18, 2)]

        self.data = {
            'id': [chr(code) for code in range(98, 107)],  # 'b'..'j'
            'x': [50, 50, -10, 0, 0, 5, 15, -3, None],
            'y': [
                0.000001, 654.152, None, 15.984512, 3122, -3.1415926535,
                111, 15.9, 13.5,
            ],
            'cat': [
                'a',
                'long text value',
                u'Élysée',
                '',
                None,
                'some <b> B.s </div> </div> HTML stuff',
                'c',
                'c',
                'c',
            ],
            's1': np.ones(9),
            's2': [u'some constant text $ % value {obj} '] * 9,
            'somedate': dates,
            'bool_tf': [
                True, True, False, True, False, True, True, False, True,
            ],
            'bool_tf_with_nan': [
                True, False, False, False, False, True, True, False, np.nan,
            ],
            'bool_01': [1, 1, 0, 1, 1, 0, 0, 0, 1],
            'bool_01_with_nan': [1, 0, 1, 0, 0, 1, 1, 0, np.nan],
            'list': list_cells,
            'mixed': [1, 2, "a", 4, 5, 6, 7, 8, 9],
            'dict': dict_cells,
            'tuple': tuple_cells,
        }

        frame = pd.DataFrame(self.data)
        frame['somedate'] = pd.to_datetime(frame['somedate'])
        self.df = frame

        self.results = describe(self.df)
        self.test_dir = tempfile.mkdtemp()
コード例 #7
0
ファイル: tests.py プロジェクト: romainx/pandas-profiling
 def test_bins(self):
     # Swap in a description computed with bins=100 before delegating to
     # test_describe_df (presumably it checks self.results -- confirm).
     results_with_bins = describe(self.df, bins=100)
     self.results = results_with_bins
     self.test_describe_df()