def setUp(self):
    """Prepare the shared fixture used by the tests.

    Populates ``self.data`` (raw column values, nine rows each),
    ``self.df`` (the DataFrame built from it, with ``somedate`` passed
    through ``pd.to_datetime``), ``self.results`` (the output of
    ``describe(self.df)``) and ``self.test_dir`` (a fresh temporary
    directory obtained from ``tempfile.mkdtemp``).
    """
    row_count = 9

    # Timestamps that occur more than once in the date column.
    dec_1990 = datetime.datetime(1990, 12, 9)
    dec_1950 = datetime.datetime(1950, 12, 9)

    # Plain date, datetimes and a NaN are mixed on purpose in this column.
    date_values = [
        datetime.date(2011, 7, 4),
        datetime.datetime(2022, 1, 1, 13, 57),
        dec_1990,
        np.nan,
        dec_1990,
        dec_1950,
        datetime.datetime(1898, 1, 2),
        dec_1950,
        dec_1950,
    ]

    # Text column containing an empty string, None, non-ASCII and HTML.
    category_values = [
        'a',
        'long text value',
        u'Élysée',
        '',
        None,
        'some <b> B.s </div> </div> HTML stuff',
        'c',
        'c',
        'c',
    ]

    self.data = {
        # chr(98)..chr(106), i.e. the letters 'b' through 'j'.
        'id': [chr(code) for code in range(98, 98 + row_count)],
        'x': [50, 50, -10, 0, 0, 5, 15, -3, None],
        'y': [
            0.000001, 654.152, None, 15.984512, 3122,
            -3.1415926535, 111, 15.9, 13.5,
        ],
        'cat': category_values,
        's1': np.ones(row_count),
        's2': [u'some constant text $ % value {obj} '] * row_count,
        'somedate': date_values,
        'bool': [True, True, False, True, False, True, True, False, True],
    }

    frame = pd.DataFrame(self.data)
    frame['somedate'] = pd.to_datetime(frame['somedate'])
    self.df = frame

    self.results = describe(self.df)
    self.test_dir = tempfile.mkdtemp()
def test_recoding_reject(self):
    """Check that a relabelled copy of a column is reported as RECODED.

    ``x`` and ``y`` hold the same eight-row grouping under different
    labels. With the default ``describe`` options, ``x`` must be typed
    'RECODED' with ``correlation_var`` 'y', and the summary table must
    match the expected counts. A second run with
    ``check_correlation=False`` must report no ``correlation_var`` for
    ``x`` and zero rejected variables.
    """
    french_labels = ['chien'] * 4 + ['chat'] * 2 + ['chameaux'] * 2
    english_labels = ['dog'] * 4 + ['cat'] * 2 + ['camel'] * 2

    self.data = {
        'x': french_labels,
        'y': english_labels,
    }
    self.df = pd.DataFrame(self.data)
    self.results = describe(self.df)

    x_info = self.results['variables'].loc['x']
    self.assertEqual(x_info['type'], 'RECODED')
    self.assertEqual(x_info['correlation_var'], 'y')

    expected_table = {
        'total_missing': 0.0,
        'UNIQUE': 0,
        'CONST': 0,
        'nvar': 2,
        'REJECTED': 1,
        'n': 8,
        'RECODED': 1,
        'CORR': 0,
        'DATE': 0,
        'NUM': 0,
        'CAT': 1,
        'n_duplicates': 5,
    }
    table = self.results['table']
    for stat_name, wanted in expected_table.items():
        self.assertEqual(table[stat_name], wanted)

    # Rerun without checking for correlation
    self.results2 = describe(self.df, check_correlation=False)
    x_info_uncorrelated = self.results2['variables'].loc['x']
    self.assertIsNone(x_info_uncorrelated.get('correlation_var'))
    self.assertEqual(self.results2['table']['REJECTED'], 0)
def test_recoding_reject(self):
    """Check that a relabelled copy of a column is reported as RECODED.

    ``x`` and ``y`` hold the same eight-row grouping under different
    labels. With ``check_recoded=True``, ``x`` must be typed 'RECODED'
    with ``correlation_var`` 'y', and the summary table must match the
    expected counts. A second run with ``check_correlation=False`` must
    report no ``correlation_var`` for ``x`` and zero rejected variables.
    """
    french_labels = ['chien'] * 4 + ['chat'] * 2 + ['chameaux'] * 2
    english_labels = ['dog'] * 4 + ['cat'] * 2 + ['camel'] * 2

    self.data = {
        'x': french_labels,
        'y': english_labels,
    }
    self.df = pd.DataFrame(self.data)
    self.results = describe(self.df, check_recoded=True)

    x_info = self.results['variables'].loc['x']
    self.assertEqual(x_info['type'], 'RECODED')
    self.assertEqual(x_info['correlation_var'], 'y')

    expected_table = {
        'total_missing': 0.0,
        'UNIQUE': 0,
        'CONST': 0,
        'nvar': 2,
        'REJECTED': 1,
        'n': 8,
        'RECODED': 1,
        'CORR': 0,
        'DATE': 0,
        'NUM': 0,
        'CAT': 1,
        'n_duplicates': 5,
    }
    table = self.results['table']
    for stat_name, wanted in expected_table.items():
        self.assertEqual(table[stat_name], wanted)

    # Rerun without checking for correlation
    self.results2 = describe(self.df, check_correlation=False)
    x_info_uncorrelated = self.results2['variables'].loc['x']
    self.assertIsNone(x_info_uncorrelated.get('correlation_var'))
    self.assertEqual(self.results2['table']['REJECTED'], 0)
def setUp(self):
    """Create the per-test fixture.

    Sets ``self.data`` (a dict of nine-element columns), ``self.df``
    (that dict as a DataFrame, with ``somedate`` run through
    ``pd.to_datetime``), ``self.results`` (``describe(self.df)``) and
    ``self.test_dir`` (a new directory from ``tempfile.mkdtemp``).
    """
    n = 9
    fixture = {}

    # Letters 'b'..'j' (code points 98..106).
    fixture['id'] = [chr(point) for point in range(98, 98 + n)]

    # Numeric columns, each with one missing entry.
    fixture['x'] = [50, 50, -10, 0, 0, 5, 15, -3, None]
    fixture['y'] = [
        0.000001, 654.152, None, 15.984512, 3122,
        -3.1415926535, 111, 15.9, 13.5,
    ]

    # Text column containing an empty string, None, non-ASCII and HTML.
    fixture['cat'] = [
        'a',
        'long text value',
        u'Élysée',
        '',
        None,
        'some <b> B.s </div> </div> HTML stuff',
        'c',
        'c',
        'c',
    ]

    # Two constant columns: one numeric, one textual.
    fixture['s1'] = np.ones(n)
    fixture['s2'] = [u'some constant text $ % value {obj} '] * n

    # Date column mixing a plain date, datetimes and a NaN.
    repeated_1990 = datetime.datetime(1990, 12, 9)
    repeated_1950 = datetime.datetime(1950, 12, 9)
    fixture['somedate'] = [
        datetime.date(2011, 7, 4),
        datetime.datetime(2022, 1, 1, 13, 57),
        repeated_1990,
        np.nan,
        repeated_1990,
        repeated_1950,
        datetime.datetime(1898, 1, 2),
        repeated_1950,
        repeated_1950,
    ]

    fixture['bool'] = [
        True, True, False, True, False, True, True, False, True,
    ]

    self.data = fixture
    self.df = pd.DataFrame(self.data)
    self.df['somedate'] = pd.to_datetime(self.df['somedate'])
    self.results = describe(self.df)
    self.test_dir = tempfile.mkdtemp()
def test_bins(self):
    """Re-describe the fixture frame with ``bins=100`` and re-check it.

    The new result replaces ``self.results`` before ``test_describe_df``
    is invoked (that method is defined outside this block; presumably it
    inspects ``self.results`` -- confirm against the class).
    """
    rebinned = describe(self.df, bins=100)
    self.results = rebinned
    self.test_describe_df()
def setUp(self):
    """Prepare the extended fixture used by the tests.

    Populates ``self.data`` with nine-element columns covering ids,
    numbers, text, constants, dates, several boolean encodings (with and
    without NaN) and object-valued columns (lists, mixed scalars, dicts,
    tuples). ``self.df`` is the DataFrame built from it with ``somedate``
    passed through ``pd.to_datetime``; ``self.results`` is
    ``describe(self.df)``; ``self.test_dir`` is a fresh directory from
    ``tempfile.mkdtemp``.
    """
    row_count = 9

    # Timestamps that occur more than once in the date column.
    dec_1990 = datetime.datetime(1990, 12, 9)
    dec_1950 = datetime.datetime(1950, 12, 9)

    # Plain date, datetimes and a NaN are mixed on purpose in this column.
    date_values = [
        datetime.date(2011, 7, 4),
        datetime.datetime(2022, 1, 1, 13, 57),
        dec_1990,
        np.nan,
        dec_1990,
        dec_1950,
        datetime.datetime(1898, 1, 2),
        dec_1950,
        dec_1950,
    ]

    # Text column containing an empty string, None, non-ASCII and HTML.
    category_values = [
        'a',
        'long text value',
        u'Élysée',
        '',
        None,
        'some <b> B.s </div> </div> HTML stuff',
        'c',
        'c',
        'c',
    ]

    self.data = {
        # chr(98)..chr(106), i.e. the letters 'b' through 'j'.
        'id': [chr(code) for code in range(98, 98 + row_count)],
        'x': [50, 50, -10, 0, 0, 5, 15, -3, None],
        'y': [
            0.000001, 654.152, None, 15.984512, 3122,
            -3.1415926535, 111, 15.9, 13.5,
        ],
        'cat': category_values,
        's1': np.ones(row_count),
        's2': [u'some constant text $ % value {obj} '] * row_count,
        'somedate': date_values,
        'bool_tf': [
            True, True, False, True, False, True, True, False, True,
        ],
        'bool_tf_with_nan': [
            True, False, False, False, False, True, True, False, np.nan,
        ],
        'bool_01': [1, 1, 0, 1, 1, 0, 0, 0, 1],
        'bool_01_with_nan': [1, 0, 1, 0, 0, 1, 1, 0, np.nan],
        # A separate [1, 2] list object per row, as in a literal.
        'list': [[1, 2] for _ in range(row_count)],
        'mixed': [1, 2, "a", 4, 5, 6, 7, 8, 9],
        # {'a': 'a'} through {'i': 'i'}.
        'dict': [{letter: letter} for letter in 'abcdefghi'],
        # (1, 2), (3, 4), ..., (17, 18).
        'tuple': [(odd, odd + 1) for odd in range(1, 2 * row_count, 2)],
    }

    frame = pd.DataFrame(self.data)
    frame['somedate'] = pd.to_datetime(frame['somedate'])
    self.df = frame

    self.results = describe(self.df)
    self.test_dir = tempfile.mkdtemp()
def test_bins(self):
    """Run the describe checks again on a result computed with 100 bins.

    ``self.results`` is overwritten with ``describe(self.df, bins=100)``
    and then ``test_describe_df`` is called (defined outside this block;
    presumably it validates ``self.results`` -- confirm against the class).
    """
    describe_options = {'bins': 100}
    self.results = describe(self.df, **describe_options)
    self.test_describe_df()