Example #1
0
    def test_datetime(self):
        """Describe frames that contain date/time objects or a DatetimeIndex.

        Only the reported row and column counts are verified.
        """
        # col2 mixes a date with a time; col3 mixes two timezone-aware
        # datetimes (UTC and the local zone).
        now_utc = datetime.datetime.now().astimezone(pytz.timezone('UTC'))
        now_local = datetime.datetime.now().astimezone(None)
        mixed_frame = pd.DataFrame(data={
            'col1': [1, 2],
            'col2': [datetime.date(2000, 1, 1), datetime.time(10, 30)],
            'col3': [now_utc, now_local],
        })
        mixed_summary = describe_pd_dataframe(mixed_frame)
        self.assertEqual(mixed_summary['row_count'], 2)
        self.assertEqual(mixed_summary['column_count'], 3)

        # Numeric frame whose index is a range of timestamps.
        timestamps = pd.date_range('1/1/2000', periods=2)
        indexed_frame = pd.DataFrame(
            np.random.randn(2, 3),
            index=timestamps,
            columns=['A', 'B', 'C'],
        )
        indexed_summary = describe_pd_dataframe(indexed_frame)
        self.assertEqual(indexed_summary['row_count'], 2)
        self.assertEqual(indexed_summary['column_count'], 3)
Example #2
0
 def test_categorical_columns(self):
     """Check the category statistics reported for non-numeric columns.

     Verifies the overall counts, that all rows are returned in
     ``rows_top`` (with no ``rows_bottom``), the full description of
     ``cat1`` and the category breakdown of ``cat2``.
     """
     frame = pd.DataFrame(data={
         'cat1': ['a', 'b', 'c', 'd'],
         'cat2': ['a', 'b', None, 'd'],
         # NOTE: a variant of cat3 containing an unhashable element
         # ([1, (2,3), '4', []]) is left disabled here.
         'cat3': [1, (2, 3), '4', 5],
         'cat4': [True, True, True, False],
     })
     summary = describe_pd_dataframe(frame)

     self.assertEqual(summary['row_count'], 4)
     self.assertEqual(summary['column_count'], 4)
     self.assertEqual(len(summary['rows_top']), 4)
     self.assertIsNone(summary['rows_bottom'])

     expected_cat1 = {
         'name': 'cat1',
         'dtype': 'object',
         'stats': {
             'unique_count': 4,
             'nan_count': 0,
             'categories': [
                 {'name': 'a', 'count': 1},
                 {'name': 'b', 'count': 1},
                 {'name': '2 others', 'count': 2},
             ],
         },
     }
     self.assertDictEqual(summary['columns'][0], expected_cat1)

     # With a missing value present, the expected breakdown ends with a
     # dedicated 'Missing' bucket.
     expected_cat2_categories = [
         {'name': 'a', 'count': 1},
         {'name': '2 others', 'count': 2},
         {'name': 'Missing', 'count': 1},
     ]
     cat2_stats = summary['columns'][1]['stats']
     self.assertEqual(cat2_stats['categories'], expected_cat2_categories)
Example #3
0
 def dataframe_formatter(df):
     """Build the Deepnote dataframe MIME bundle for ``df``.

     Returns a single-entry dict keyed by the Deepnote dataframe MIME type.
     The value is whatever ``describe_pd_dataframe(df)`` returns, or, if
     that call raises, ``{'error': <formatted traceback>}`` so the failure
     is reported in the bundle instead of propagating to the caller.
     """
     # inspired by https://jupyter.readthedocs.io/en/latest/reference/mimetype.html
     MIME_TYPE = 'application/vnd.deepnote.dataframe.v2+json'
     try:
         payload = describe_pd_dataframe(df)
     except Exception:
         # Catch Exception rather than using a bare ``except`` so that
         # KeyboardInterrupt and SystemExit still propagate.
         payload = {'error': traceback.format_exc()}
     return {MIME_TYPE: payload}
Example #4
0
 def test_no_rows(self):
     """An empty frame still reports its columns and a zero row count."""
     empty_frame = pd.DataFrame(data={'col1': [], 'col2': []})
     summary = describe_pd_dataframe(empty_frame)
     self.assertEqual(summary['row_count'], 0)
     self.assertEqual(summary['column_count'], 2)
Example #5
0
 def test_dataframe(self):
     """Smoke test on a small 2x2 integer frame."""
     frame = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
     summary = describe_pd_dataframe(frame)

     self.assertEqual(summary['row_count'], 2)
     self.assertEqual(summary['column_count'], 2)
     # Both rows are expected in rows_top, with no rows_bottom section.
     self.assertEqual(len(summary['rows_top']), 2)
     self.assertIsNone(summary['rows_bottom'])
     first_column = summary['columns'][0]
     self.assertEqual(first_column['name'], 'col1')
Example #6
0
 def test_dataframe_sort(self):
     """Rows of a sorted frame are reported in the sorted order."""
     unsorted_frame = pd.DataFrame(data={'col1': [3, 1, 2]})
     summary = describe_pd_dataframe(unsorted_frame.sort_values('col1'))
     top_rows = summary['rows_top']

     for position, expected_value in enumerate([1, 2, 3]):
         self.assertEqual(top_rows[position]['col1'], expected_value)

     # _deepnote_index_column is hidden on frontend. See variable_explorer_helpers for more info.
     # The value 1 sat at index 1 before sorting, so the first row must carry it.
     self.assertEqual(top_rows[0]['_deepnote_index_column'], 1)
Example #7
0
 def test_duplicate_columns(self):
     """Columns sharing a name are reported under distinct names.

     The second ``col1`` is expected to come back as ``col1.1``.
     """
     frame = pd.DataFrame(data={
         'col1': ['a', 'b', 'c', 'd'],
         'col2': [1, 2, 3, 4],
     })
     # Rename after construction: a dict cannot hold the same key twice.
     frame.columns = ['col1', 'col1']
     summary = describe_pd_dataframe(frame)

     self.assertEqual(summary['row_count'], 4)
     self.assertEqual(summary['column_count'], 2)
     reported_columns = summary['columns']
     self.assertEqual(reported_columns[0]['name'], 'col1')
     self.assertEqual(reported_columns[1]['name'], 'col1.1')
Example #8
0
    def test_big_dataframe(self):
        """Check how large frames are trimmed in the description.

        For 100000 rows the test expects 166 top rows, 167 bottom rows and
        ``stats`` on the first column only; for 200000 rows it expects no
        ``stats`` even on the first column.
        """
        import numpy as np
        column_names = ('col1', 'col2', 'col3')

        medium_frame = pd.DataFrame(
            data={name: np.arange(100000) for name in column_names}
        )
        medium_summary = describe_pd_dataframe(medium_frame)
        self.assertEqual(medium_summary['row_count'], 100000)
        self.assertEqual(medium_summary['column_count'], 3)
        self.assertEqual(len(medium_summary['rows_top']), 166)
        self.assertEqual(len(medium_summary['rows_bottom']), 167)
        self.assertIn('stats', medium_summary['columns'][0])
        self.assertNotIn('stats', medium_summary['columns'][1])

        large_frame = pd.DataFrame(
            data={name: np.arange(200000) for name in column_names}
        )
        large_summary = describe_pd_dataframe(large_frame)
        self.assertNotIn('stats', large_summary['columns'][0])
Example #9
0
 def test_numerical_columns(self):
     """Describe a frame of integer, nullable and float columns."""
     frame = pd.DataFrame(data={
         'col1': [1, 2, 3, 4],
         'col2': [1, 2, None, 4],
         # NOTE: a variant of col3 with a complex number and a huge integer
         # ([1, 2.1, complex(-1.0, 0.0), 10**1000]) is left disabled here.
         'col3': [1, 2.1, 3, 4],
     })
     summary = describe_pd_dataframe(frame)

     self.assertEqual(summary['row_count'], 4)
     self.assertEqual(summary['column_count'], 3)
     # All four rows are expected in rows_top, with no rows_bottom section.
     self.assertEqual(len(summary['rows_top']), 4)
     self.assertIsNone(summary['rows_bottom'])
     first_column = summary['columns'][0]
     self.assertEqual(first_column['name'], 'col1')
Example #10
0
 def test_object_to_string_casting(self):
     """Values of object columns must be returned as strings.

     ``col2`` mixes a date with a time and ``col3`` mixes two
     timezone-aware datetimes (UTC and the local zone). The test checks
     that their values in ``rows_top`` are ``str`` and that the reported
     dtypes are ``int64`` for ``col1`` and ``object`` for the other two.
     """
     df1 = pd.DataFrame(data={
         'col1': [1, 2],
         'col2': [datetime.date(2000, 1, 1), datetime.time(10, 30)],
         'col3': [
             datetime.datetime.now().astimezone(pytz.timezone('UTC')),
             datetime.datetime.now().astimezone(None),
         ],
     })
     result1 = describe_pd_dataframe(df1)

     for row_index in (0, 1):
         for column in ('col2', 'col3'):
             self.assertIs(type(result1['rows_top'][row_index][column]), str)

     self.assertEqual(result1['columns'][0]["dtype"], "int64")
     self.assertEqual(result1['columns'][1]["dtype"], "object")
     # Previously this line re-checked columns[1], leaving col3 unverified.
     # NOTE(review): assumes the mixed-timezone col3 is reported as
     # "object" like col2 - confirm against describe_pd_dataframe.
     self.assertEqual(result1['columns'][2]["dtype"], "object")
Example #11
0
 def test_nans(self):
     """A column holding only missing values yields a single 'Missing' bucket."""
     frame = pd.DataFrame(data={'col1': [None, None, None]})
     summary = describe_pd_dataframe(frame)

     self.assertEqual(summary['row_count'], 3)
     self.assertEqual(summary['column_count'], 1)

     expected_stats = {
         'unique_count': 0,
         'nan_count': 3,
         'categories': [
             {'name': 'Missing', 'count': 3},
         ],
     }
     self.assertEqual(summary['columns'][0]['stats'], expected_stats)