def test_datetime(self):
    """Describe frames that carry date/time values or a date-range index.

    Only the reported ``row_count`` and ``column_count`` are checked.
    """
    # Two timezone-aware timestamps: one converted to pytz UTC, the other
    # converted with ``astimezone(None)``.
    utc_stamp = datetime.datetime.now().astimezone(pytz.timezone('UTC'))
    local_stamp = datetime.datetime.now().astimezone(None)

    mixed_frame = pd.DataFrame(data={
        'col1': [1, 2],
        'col2': [datetime.date(2000, 1, 1), datetime.time(10, 30)],
        'col3': [utc_stamp, local_stamp],
    })
    mixed_summary = describe_pd_dataframe(mixed_frame)
    self.assertEqual(mixed_summary['row_count'], 2)
    self.assertEqual(mixed_summary['column_count'], 3)

    # Random numeric payload indexed by two consecutive dates.
    random_values = np.random.randn(2, 3)
    date_index = pd.date_range('1/1/2000', periods=2)
    indexed_frame = pd.DataFrame(
        random_values,
        index=date_index,
        columns=['A', 'B', 'C'],
    )
    indexed_summary = describe_pd_dataframe(indexed_frame)
    self.assertEqual(indexed_summary['row_count'], 2)
    self.assertEqual(indexed_summary['column_count'], 3)
def test_categorical_columns(self):
    """Check the description of a frame made of non-numeric columns.

    Verifies the overall counts, the full description of ``cat1`` and the
    category breakdown of ``cat2``, which contains one ``None`` entry.
    """
    frame = pd.DataFrame(data={
        'cat1': ['a', 'b', 'c', 'd'],
        'cat2': ['a', 'b', None, 'd'],
        # Disabled variant of cat3 that ends with a list element:
        # 'cat3': [1, (2,3), '4', []],
        'cat3': [1, (2, 3), '4', 5],
        'cat4': [True, True, True, False],
    })

    expected_cat1 = {
        'name': 'cat1',
        'dtype': 'object',
        'stats': {
            'unique_count': 4,
            'nan_count': 0,
            'categories': [
                {'name': 'a', 'count': 1},
                {'name': 'b', 'count': 1},
                {'name': '2 others', 'count': 2},
            ],
        },
    }
    expected_cat2_categories = [
        {'name': 'a', 'count': 1},
        {'name': '2 others', 'count': 2},
        {'name': 'Missing', 'count': 1},
    ]

    summary = describe_pd_dataframe(frame)

    self.assertEqual(summary['row_count'], 4)
    self.assertEqual(summary['column_count'], 4)
    self.assertEqual(len(summary['rows_top']), 4)
    self.assertEqual(summary['rows_bottom'], None)

    described_columns = summary['columns']
    self.assertDictEqual(described_columns[0], expected_cat1)
    self.assertEqual(
        described_columns[1]['stats']['categories'],
        expected_cat2_categories,
    )
def dataframe_formatter(df):
    """Build the Deepnote dataframe MIME bundle for ``df``.

    Args:
        df: The object handed to ``describe_pd_dataframe``.

    Returns:
        A dict with a single key, the Deepnote dataframe MIME type. Its value
        is the result of ``describe_pd_dataframe(df)``; if that call raises,
        the value is ``{'error': <formatted traceback>}`` instead.
    """
    # inspired by https://jupyter.readthedocs.io/en/latest/reference/mimetype.html
    MIME_TYPE = 'application/vnd.deepnote.dataframe.v2+json'
    try:
        return {MIME_TYPE: describe_pd_dataframe(df)}
    except Exception:
        # Best effort: report the failure inside the bundle rather than
        # raising. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        return {MIME_TYPE: {'error': traceback.format_exc()}}
def test_no_rows(self):
    """A frame with two empty columns reports zero rows and two columns."""
    empty_columns = {'col1': [], 'col2': []}
    empty_frame = pd.DataFrame(data=empty_columns)

    summary = describe_pd_dataframe(empty_frame)

    self.assertEqual(summary['row_count'], 0)
    self.assertEqual(summary['column_count'], 2)
def test_dataframe(self):
    """Describe a small 2x2 integer frame and check the basic fields.

    Both rows are expected in ``rows_top``, ``rows_bottom`` is expected to
    be ``None``, and the first described column is named ``col1``.
    """
    frame = pd.DataFrame(data={
        'col1': [1, 2],
        'col2': [3, 4],
    })

    summary = describe_pd_dataframe(frame)

    self.assertEqual(summary['row_count'], 2)
    self.assertEqual(summary['column_count'], 2)
    self.assertEqual(len(summary['rows_top']), 2)
    self.assertEqual(summary['rows_bottom'], None)

    first_column = summary['columns'][0]
    self.assertEqual(first_column['name'], 'col1')
def test_dataframe_sort(self):
    """Rows of a frame sorted by ``col1`` come back in sorted order.

    The first returned row must also carry the original index label (1)
    under ``_deepnote_index_column``.
    """
    unsorted_frame = pd.DataFrame(data={'col1': [3, 1, 2]})
    sorted_frame = unsorted_frame.sort_values('col1')

    summary = describe_pd_dataframe(sorted_frame)
    top_rows = summary['rows_top']

    for position, expected_value in enumerate([1, 2, 3]):
        self.assertEqual(top_rows[position]['col1'], expected_value)

    # _deepnote_index_column is hidden on frontend. See variable_explorer_helpers for more info.
    self.assertEqual(top_rows[0]['_deepnote_index_column'], 1)
def test_duplicate_columns(self):
    """Two columns sharing the name ``col1`` are reported as ``col1`` and ``col1.1``."""
    frame = pd.DataFrame(data={
        'col1': ['a', 'b', 'c', 'd'],
        'col2': [1, 2, 3, 4],
    })
    # Rename after construction so both columns carry the same label.
    frame.columns = ['col1', 'col1']

    summary = describe_pd_dataframe(frame)

    self.assertEqual(summary['row_count'], 4)
    self.assertEqual(summary['column_count'], 2)

    reported_columns = summary['columns']
    self.assertEqual(reported_columns[0]['name'], 'col1')
    self.assertEqual(reported_columns[1]['name'], 'col1.1')
def test_big_dataframe(self):
    """Check how large frames are truncated and which columns get stats.

    With 100000 rows and three columns the test expects 166 rows in
    ``rows_top``, 167 in ``rows_bottom``, and ``stats`` on the first column
    but not the second. With 200000 rows the first column is expected to
    have no ``stats`` either.
    """
    import numpy as np

    column_names = ('col1', 'col2', 'col3')

    medium_frame = pd.DataFrame(
        data={name: np.arange(100000) for name in column_names},
    )
    medium_summary = describe_pd_dataframe(medium_frame)
    self.assertEqual(medium_summary['row_count'], 100000)
    self.assertEqual(medium_summary['column_count'], 3)
    self.assertEqual(len(medium_summary['rows_top']), 166)
    self.assertEqual(len(medium_summary['rows_bottom']), 167)
    self.assertIn('stats', medium_summary['columns'][0])
    self.assertNotIn('stats', medium_summary['columns'][1])

    large_frame = pd.DataFrame(
        data={name: np.arange(200000) for name in column_names},
    )
    large_summary = describe_pd_dataframe(large_frame)
    self.assertNotIn('stats', large_summary['columns'][0])
def test_numerical_columns(self):
    """Describe a frame of numeric columns, one of which contains ``None``.

    Checks the counts, that all four rows land in ``rows_top`` with
    ``rows_bottom`` equal to ``None``, and the first column's name.
    """
    numeric_data = {
        'col1': [1, 2, 3, 4],
        'col2': [1, 2, None, 4],
        # Disabled variant of col3 with a complex number and a huge integer:
        # 'col3': [1, 2.1, complex(-1.0, 0.0), 10**1000]
        'col3': [1, 2.1, 3, 4],
    }
    frame = pd.DataFrame(data=numeric_data)

    summary = describe_pd_dataframe(frame)

    self.assertEqual(summary['row_count'], 4)
    self.assertEqual(summary['column_count'], 3)
    self.assertEqual(len(summary['rows_top']), 4)
    self.assertEqual(summary['rows_bottom'], None)

    leading_column = summary['columns'][0]
    self.assertEqual(leading_column['name'], 'col1')
def test_object_to_string_casting(self):
    """Object-typed cells are returned as strings; dtypes are still reported.

    ``col2`` holds a ``date`` and a ``time``; ``col3`` holds two
    timezone-aware datetimes built with different ``astimezone`` arguments.
    Every returned cell of those columns must be exactly ``str``, and the
    reported dtypes must be ``int64`` for ``col1`` and ``object`` for the
    other two columns.
    """
    df1 = pd.DataFrame(data={
        'col1': [1, 2],
        'col2': [datetime.date(2000, 1, 1), datetime.time(10, 30)],
        'col3': [
            datetime.datetime.now().astimezone(pytz.timezone('UTC')),
            datetime.datetime.now().astimezone(None),
        ],
    })

    result1 = describe_pd_dataframe(df1)

    # assertIs on the exact type keeps the original strictness (no str
    # subclasses) while reporting the offending type on failure.
    rows_top = result1['rows_top']
    for row_position in (0, 1):
        for column_name in ('col2', 'col3'):
            self.assertIs(type(rows_top[row_position][column_name]), str)

    columns = result1['columns']
    self.assertEqual(columns[0]["dtype"], "int64")
    self.assertEqual(columns[1]["dtype"], "object")
    # This check previously repeated index 1, so col3's dtype was never
    # verified. NOTE(review): 'object' is expected because col3 mixes two
    # different tzinfo objects - confirm on CI.
    self.assertEqual(columns[2]["dtype"], "object")
def test_nans(self):
    """A column holding only ``None`` is summarised as three missing values."""
    all_missing = pd.DataFrame(data={'col1': [None, None, None]})

    expected_stats = {
        'unique_count': 0,
        'nan_count': 3,
        'categories': [
            {'name': 'Missing', 'count': 3},
        ],
    }

    summary = describe_pd_dataframe(all_missing)

    self.assertEqual(summary['row_count'], 3)
    self.assertEqual(summary['column_count'], 1)
    self.assertEqual(summary['columns'][0]['stats'], expected_stats)