def test_columns_of_type(self):
    """Print the numeric columns reported by the Excel-based summary,
    together with their ascii-encoded names."""
    summary = DataFrameSummary(self.xdf)
    numeric_columns = summary.columns_of_type("numeric")
    print(numeric_columns)
    encoded_names = []
    for name in numeric_columns:
        encoded_names.append(name.encode('ascii'))
    print(encoded_names)
def test_clean_column_on_excel(self):
    """Every column name of the Excel dataframe cleans to a truthy value."""
    summary = DataFrameSummary(self.xdf)
    column_names = self.xdf.columns.tolist()
    print(summary._clean_column(column_names[0]))
    for name in column_names:
        self.assertTrue(summary._clean_column(name))
def test_numer_format_works_as_expected(self):
    """_number_format renders floats with two decimals and thousands
    separators, and whole-valued numbers without a decimal part."""
    cases = [
        (123.123, '123.12'),
        (123.1243453, '123.12'),
        (213213213.123, '213,213,213.12'),
        (213214, '213,214'),
        (123213.00, '123,213'),
    ]
    for value, rendered in cases:
        self.assertEqual(DataFrameSummary._number_format(value), rendered)
def test_get_perc_works_as_expected(self):
    """_percent renders ratios as percentages: two decimals for floats,
    none for whole-valued percentages."""
    cases = [
        (0.123, '12.30%'),
        (3.1243453, '312.43%'),
        (213.12312, '21,312.31%'),
        (0.14, '14%'),
        (1.300, '130%'),
    ]
    for value, rendered in cases:
        self.assertEqual(DataFrameSummary._percent(value), rendered)
def test_bool1_summary(self):
    """Summary of a 0/1 boolean column: per-value counts and percentages
    followed by the base stats rows."""
    value_counts = self.df['dbool1'].value_counts()
    total = self.df['dbool1'].count()
    zeros = value_counts[0]
    ones = value_counts[1]
    perc_zeros = DataFrameSummary._percent(zeros / total)
    perc_ones = DataFrameSummary._percent(ones / total)
    labels = ['"0" count', '"0" perc', '"1" count', '"1" perc',
              'counts', 'uniques', 'missing', 'missing_perc', 'types']
    values = [str(zeros), perc_zeros, str(ones), perc_ones,
              self.size, 2, 0, '0%', DataFrameSummary.TYPE_BOOL]
    expected = pd.Series(index=labels, data=values,
                         name='dbool1', dtype=object)
    assert_series_equal(self.dfs['dbool1'], expected)
def setUp(self):
    """Build a 1000-row dataframe with two columns per summary type.

    Review fix: the original ``'a'.format(i)`` etc. calls contained no
    placeholders, so ``.format`` was a no-op — replaced with the plain
    literals (behavior identical, intent clearer).
    """
    self.size = 1000
    # 10% NaNs mixed with repeated digits 0-9, randomly shuffled.
    missing = [np.nan] * (self.size // 10) + \
        list(range(10)) * ((self.size - self.size // 10) // 10)
    shuffle(missing)
    self.types = [DataFrameSummary.TYPE_NUMERIC,
                  DataFrameSummary.TYPE_BOOL,
                  DataFrameSummary.TYPE_CATEGORICAL,
                  DataFrameSummary.TYPE_CONSTANT,
                  DataFrameSummary.TYPE_UNIQUE,
                  DataFrameSummary.TYPE_DATE]
    self.columns = ['dbool1', 'dbool2', 'duniques1', 'duniques2',
                    'dcategoricals1', 'dcategoricals2',
                    'dnumerics1', 'dnumerics2', 'dnumerics3',
                    'dmissing', 'dconstant', 'ddates1', 'ddates2']
    self.df = pd.DataFrame(dict(
        dbool1=np.random.choice([0, 1], size=self.size),
        dbool2=np.random.choice(['a', 'b'], size=self.size),
        duniques1=['x{}'.format(i) for i in range(self.size)],
        duniques2=['y{}'.format(i) for i in range(self.size)],
        dcategoricals1=['a' if i % 2 == 0
                        else 'b' if i % 3 == 0
                        else 'c'
                        for i in range(self.size)],
        dcategoricals2=['x' if i % 2 == 0
                        else 'y' if i % 3 == 0
                        else 'z'
                        for i in range(self.size)],
        dnumerics1=range(self.size),
        dnumerics2=range(self.size, 2 * self.size),
        dnumerics3=list(range(self.size - self.size // 10)) +
        list(range(-self.size // 10, 0)),
        dmissing=missing,
        dconstant=['a'] * self.size,
        ddates1=pd.date_range('2010-01-01', periods=self.size, freq='1M'),
        ddates2=pd.date_range('2000-01-01', periods=self.size, freq='1W'),
    ))
    self.dfs = DataFrameSummary(self.df)
class DataFrameSummaryTest(unittest.TestCase):
    """Test the new methods added by Alfonso R. Reyes.

    The dataframe has been expanded to show more columns of the same type
    (two per type), as needed for the per-type summary methods.

    Fixes applied in review:
    - Python 2 ``print ser`` statements converted to ``print()`` calls
      (the old statement form is a SyntaxError under Python 3).
    - ``frame.ix['types']`` replaced with ``frame.loc['types']`` — ``.ix``
      was deprecated and later removed from pandas, and every lookup here
      is label-based.
    - no-op ``'a'.format(i)`` calls in setUp replaced with plain literals.

    NOTE(review): a later class in this file reuses this same class name
    and shadows this one at import time, so these tests never run under
    unittest discovery — consider renaming one of the two classes.
    """

    def setUp(self):
        """Build a 1000-row dataframe with two columns per summary type."""
        self.size = 1000
        # 10% NaNs mixed with repeated digits 0-9, randomly shuffled.
        missing = [np.nan] * (self.size // 10) + \
            list(range(10)) * ((self.size - self.size // 10) // 10)
        shuffle(missing)
        self.types = [DataFrameSummary.TYPE_NUMERIC,
                      DataFrameSummary.TYPE_BOOL,
                      DataFrameSummary.TYPE_CATEGORICAL,
                      DataFrameSummary.TYPE_CONSTANT,
                      DataFrameSummary.TYPE_UNIQUE,
                      DataFrameSummary.TYPE_DATE]
        self.columns = ['dbool1', 'dbool2', 'duniques1', 'duniques2',
                        'dcategoricals1', 'dcategoricals2',
                        'dnumerics1', 'dnumerics2', 'dnumerics3',
                        'dmissing', 'dconstant', 'ddates1', 'ddates2']
        self.df = pd.DataFrame(dict(
            dbool1=np.random.choice([0, 1], size=self.size),
            dbool2=np.random.choice(['a', 'b'], size=self.size),
            duniques1=['x{}'.format(i) for i in range(self.size)],
            duniques2=['y{}'.format(i) for i in range(self.size)],
            dcategoricals1=['a' if i % 2 == 0
                            else 'b' if i % 3 == 0
                            else 'c'
                            for i in range(self.size)],
            dcategoricals2=['x' if i % 2 == 0
                            else 'y' if i % 3 == 0
                            else 'z'
                            for i in range(self.size)],
            dnumerics1=range(self.size),
            dnumerics2=range(self.size, 2 * self.size),
            dnumerics3=list(range(self.size - self.size // 10)) +
            list(range(-self.size // 10, 0)),
            dmissing=missing,
            dconstant=['a'] * self.size,
            ddates1=pd.date_range('2010-01-01', periods=self.size,
                                  freq='1M'),
            ddates2=pd.date_range('2000-01-01', periods=self.size,
                                  freq='1W'),
        ))
        self.dfs = DataFrameSummary(self.df)

    def test__columns_stats(self):
        """The _columns_stats attribute is a DataFrame whose columns are
        all the columns of the test dataframe (sorted).

        :return:
        """
        columns_stats = self.dfs._columns_stats
        print(type(columns_stats))
        self.assertIsInstance(columns_stats, pd.core.frame.DataFrame)
        expected = ['dbool1', 'dbool2', 'dcategoricals1', 'dcategoricals2',
                    'dconstant', 'ddates1', 'ddates2', 'dmissing',
                    'dnumerics1', 'dnumerics2', 'dnumerics3',
                    'duniques1', 'duniques2']
        result = columns_stats.columns.tolist()
        print(result)
        self.assertEqual(expected, result)

    def test__is_all_numeric_false(self):
        """Not all the columns provided in the list are "numeric".
        It must return False.

        :return:
        """
        columns = ['dbool1', 'dbool2', 'dcategoricals', 'dconstant',
                   'ddates', 'dmissing', 'dnumerics1', 'dnumerics2',
                   'dnumerics3', 'duniques']
        result = self.dfs._is_all_numeric(columns)
        print(result)
        self.assertFalse(result)

    def test__is_all_numeric_true(self):
        """All columns passed are "numeric". It must be True.

        :return:
        """
        columns = ['dnumerics1', 'dnumerics2', 'dnumerics3']
        result = self.dfs._is_all_numeric(columns)
        print(result)
        self.assertTrue(result)

    def test__is_all_numeric_true_missing(self):
        """Numeric columns provided this time include NaNs.
        It must be True.

        :return:
        """
        #: includes missing nan column, which is numeric as well
        columns = ['dnumerics1', 'dnumerics2', 'dnumerics3', 'dmissing']
        result = self.dfs._is_all_numeric(columns)
        print(result)
        self.assertTrue(result)

    def test_columns_of_type_numeric(self):
        """A list of numeric columns matches the test dataframe.

        :return:
        """
        expected = ['dmissing', 'dnumerics1', 'dnumerics2', 'dnumerics3']
        result = self.dfs.columns_of_type("numeric")
        print(result)
        print(self.dfs[result])
        self.assertTrue(expected == result)

    def test_columns_of_type_numeric_generic(self):
        """All the columns returned are of the same `numeric` type.

        :return:
        """
        the_type = "numeric"
        columns = self.dfs.columns_of_type(the_type)
        frame = self.dfs[columns]
        print(frame)
        # .ix was removed from pandas; 'types' is a row label, so use .loc
        types = frame.loc['types']
        set_of_types = set(types.tolist())
        result = the_type in set_of_types
        print(result)
        self.assertTrue(result)

    def test_type_summary_numeric(self):
        """The column types reduce to a unique numeric value and match.

        :return:
        """
        frame = self.dfs.type_summary('numeric')
        print(frame)
        # .ix was removed from pandas; use label-based .loc instead
        result = self.dfs.TYPE_NUMERIC in set(frame.loc['types'])
        print(result)
        self.assertTrue(result)

    def test_columns_of_type_boolean(self):
        """Boolean columns match the type `bool`.

        :return:
        """
        expected = ['dbool1', 'dbool2']
        result = self.dfs.columns_of_type("bool")
        print(result)
        self.assertTrue(expected == result)

    def test_show_dataframe_per_type(self):
        """Show the columns, one by one, grouping by column type.

        :return:
        """
        for column in self.types:
            print(column)
            columns = self.dfs.columns_of_type(column)
            list_of = columns
            for col in list_of:
                print(self.dfs[col])

    def test_columns_of_type_bool_generic(self):
        """This is an OLD behavior. Now corrected.

        There is a problem when the list of columns specified is not
        numeric: what returns when dfs[columns] is specified could be a
        list of the columns values. Not what we are looking for.
        """
        the_type = "bool"
        columns = self.dfs.columns_of_type(the_type)
        print(columns)
        df = self.dfs[['dbool1', 'dbool2']]
        print(df)
        self.assertTrue(df.shape[1] == 2)

    def test_get_all_series_bool(self):
        """Boolean summaries return the same number of rows.
        With the new behavior the number of rows must be 9 for booleans.

        :return:
        """
        list_of = ['dbool1', 'dbool2']
        for col in list_of:
            ser = self.dfs[col]
            # converted from Python 2 print statements to print() calls
            print(ser)
            print(ser.shape[0])
            self.assertTrue(ser.shape[0] == 9)

    def test_show_columns_types(self):
        """The columns in the test dataframe are a subset of the class
        variable "types".

        :return:
        """
        self.assertTrue(
            set(self.dfs.columns_types.index).issubset(self.dfs.types))

    def test__is_type_the_same_bool(self):
        """The columns passed are of the same type.

        :return:
        """
        columns = ['dbool1', 'dbool2']
        list_of_types = self.dfs._is_type_the_same(columns)
        self.assertTrue(list_of_types)

    def test__is_type_the_same_many_false(self):
        """The columns passed are NOT all of the same type.

        :return:
        """
        columns = ['dbool1', 'dbool2', 'dnumerics1']
        list_of_types = self.dfs._is_type_the_same(columns)
        self.assertFalse(list_of_types)

    def test__is_type_the_same_numeric(self):
        """The numeric columns passed are all of the same type.

        :return:
        """
        columns = ['dnumerics1', 'dnumerics2', 'dnumerics3', 'dmissing']
        list_of_types = self.dfs._is_type_the_same(columns)
        self.assertTrue(list_of_types)

    def test_get_all_the_same_unique(self):
        """The unique columns passed are all 'unique'.

        :return:
        """
        columns = ['duniques1', 'duniques2']
        self.assertTrue(
            set(self.dfs[columns].loc['types'].tolist()) == {'unique'})

    def test_get_all_the_same_numeric(self):
        """All the numeric columns are all 'numeric'."""
        columns = ['dnumerics1', 'dnumerics2', 'dnumerics3', 'dmissing']
        self.assertTrue(
            set(self.dfs[columns].loc['types'].tolist()) == {'numeric'})

    def test_get_all_the_same_categorical(self):
        """All categorical columns reduce to 'categorical'.

        :return:
        """
        columns = ['dcategoricals1', 'dcategoricals2']
        self.assertTrue(
            set(self.dfs[columns].loc['types'].tolist()) == {'categorical'})

    def test_get_all_the_same_dates(self):
        """All the date columns reduce to a unique type 'date'.

        :return:
        """
        columns = ['ddates1', 'ddates2']
        self.assertTrue(
            set(self.dfs[columns].loc['types'].tolist()) == {'date'})
def test_pandas_summary_on_csv_df(self):
    """Numeric type summary works on the CSV-loaded dataframe."""
    summary = DataFrameSummary(self.cdf)
    print(summary.type_summary('numeric'))
def test_pandas_summary_on_excel_df(self):
    """Numeric type summary on the Excel-loaded dataframe."""
    #: fixme: this returns error
    summary = DataFrameSummary(self.xdf)
    print(summary.type_summary('numeric'))
def test_numerics_summary(self):
    """Full numeric summary for 'dnumerics1' matches a hand-built series.

    Builds the expected summary row-by-row from the raw column (mean,
    std, quantiles, iqr, ...) and compares it against the summary
    produced by DataFrameSummary.

    :return:
    """
    #: fixed on 20161026
    num1 = self.df['dnumerics1']
    dm, dmp = self.dfs._get_deviation_of_mean(num1)
    dam, damp = self.dfs._get_median_absolute_deviation(num1)
    #: new expected variable with `top_correlations` removed
    expected = pd.Series(
        index=[
            'mean', 'std', 'variance', 'min', 'max', '5%', '25%', '50%',
            '75%', '95%', 'iqr', 'kurtosis', 'skewness', 'sum', 'mad',
            'cv', 'zeros_num', 'zeros_perc', 'deviating_of_mean',
            'deviating_of_mean_perc', 'deviating_of_median',
            'deviating_of_median_perc',
            # 'top_correlations',  #: removing top_correlations
            'counts', 'uniques', 'missing', 'missing_perc', 'types'
        ],
        data=[
            num1.mean(), num1.std(), num1.var(), num1.min(), num1.max(),
            num1.quantile(0.05), num1.quantile(0.25), num1.quantile(0.5),
            num1.quantile(0.75), num1.quantile(0.95),
            # interquartile range
            num1.quantile(0.75) - num1.quantile(0.25),
            num1.kurt(), num1.skew(), num1.sum(),
            # NOTE(review): Series.mad() was removed in pandas 2.0 —
            # confirm the pinned pandas version supports it
            num1.mad(),
            # coefficient of variation; NaN when the mean is zero
            num1.std() / num1.mean() if num1.mean() else np.nan,
            self.size - np.count_nonzero(num1),
            DataFrameSummary._percent(
                (self.size - np.count_nonzero(num1)) / self.size),
            dm, dmp, dam, damp,
            # 'dnumerics2: 100%',  #: removing top_correlations
            self.size, self.size, 0, '0%', DataFrameSummary.TYPE_NUMERIC
        ],
        name='dnumerics1', dtype=object)
    print(self.dfs['dnumerics1'])
    print(expected)
    assert_series_equal(self.dfs['dnumerics1'], expected)
class DataFrameSummaryTest(unittest.TestCase):
    """Test suite for DataFrameSummary on a one-column-per-type dataframe.

    #: fixed some TYPE comparisons on 20161026

    Review fix: the no-op ``'a'.format(i)`` calls in setUp (format
    strings with no placeholders) were replaced with the plain literals —
    behavior identical, intent clearer.

    NOTE(review): this class reuses the name of an earlier class in this
    file and therefore shadows it at import time — consider renaming one
    of the two classes so both suites run.
    """

    def setUp(self):
        """Build a 1000-row dataframe with one column per summary type."""
        self.size = 1000
        # 10% NaNs mixed with repeated digits 0-9, randomly shuffled.
        missing = [np.nan] * (self.size // 10) + list(range(10)) * (
            (self.size - self.size // 10) // 10)
        shuffle(missing)
        self.types = [
            DataFrameSummary.TYPE_NUMERIC, DataFrameSummary.TYPE_BOOL,
            DataFrameSummary.TYPE_CATEGORICAL, DataFrameSummary.TYPE_CONSTANT,
            DataFrameSummary.TYPE_UNIQUE, DataFrameSummary.TYPE_DATE
        ]
        self.columns = [
            'dbool1', 'dbool2', 'duniques', 'dcategoricals', 'dnumerics1',
            'dnumerics2', 'dnumerics3', 'dmissing', 'dconstant', 'ddates'
        ]
        self.df = pd.DataFrame(
            dict(dbool1=np.random.choice([0, 1], size=self.size),
                 dbool2=np.random.choice(['a', 'b'], size=self.size),
                 duniques=['x{}'.format(i) for i in range(self.size)],
                 dcategoricals=['a' if i % 2 == 0
                                else 'b' if i % 3 == 0
                                else 'c'
                                for i in range(self.size)],
                 dnumerics1=range(self.size),
                 dnumerics2=range(self.size, 2 * self.size),
                 dnumerics3=list(range(self.size - self.size // 10)) +
                 list(range(-self.size // 10, 0)),
                 dmissing=missing,
                 dconstant=['a'] * self.size,
                 ddates=pd.date_range('2010-01-01', periods=self.size,
                                      freq='1M')))
        self.dfs = DataFrameSummary(self.df)

    def test_get_columns_works_as_expected(self):
        """get_columns honors the ALL / INCLUDE / EXCLUDE selectors."""
        assert len(self.dfs.get_columns(self.df, DataFrameSummary.ALL)) == 10
        assert len(
            self.dfs.get_columns(
                self.df, DataFrameSummary.INCLUDE,
                ['dnumerics1', 'dnumerics2', 'dnumerics3'])) == 3
        assert len(
            self.dfs.get_columns(
                self.df, DataFrameSummary.EXCLUDE,
                ['dnumerics1', 'dnumerics2', 'dnumerics3'])) == 7

    def test_column_types_works_as_expected(self):
        """columns_types counts 4 numeric, 2 bool and 1 of each other type."""
        expected = pd.Series(index=self.types,
                             data=[4, 2, 1, 1, 1, 1],
                             name='types')
        assert_series_equal(self.dfs.columns_types[self.types],
                            expected[self.types])

    def test_column_stats_works_as_expected(self):
        """_columns_stats exposes counts, uniques, missing, missing_perc
        and types rows with the expected values per column."""
        column_stats = self.dfs._columns_stats
        self.assertTupleEqual(column_stats.shape, (5, 10))
        # counts: all columns full except dmissing (100 NaNs)
        expected = pd.Series(index=self.columns,
                             data=self.size,
                             name='counts',
                             dtype='object')
        expected['dmissing'] -= 100
        assert_series_equal(column_stats[self.columns].loc['counts'],
                            expected[self.columns])
        # uniques
        expected = pd.Series(index=self.columns,
                             data=self.size,
                             name='uniques',
                             dtype='object')
        expected[['dbool1', 'dbool2']] = 2
        expected['dcategoricals'] = 3
        expected['dconstant'] = 1
        expected['dmissing'] = 10
        print(column_stats[self.columns].loc['uniques'])
        print(expected[self.columns])
        assert_series_equal(column_stats[self.columns].loc['uniques'],
                            expected[self.columns].astype('object'))
        # missing
        expected = pd.Series(index=self.columns,
                             data=0,
                             name='missing',
                             dtype='object')
        expected[['dmissing']] = 100
        assert_series_equal(column_stats[self.columns].loc['missing'],
                            expected[self.columns].astype('object'))
        # missing_perc
        expected = pd.Series(index=self.columns,
                             data=['0%'],
                             name='missing_perc',
                             dtype='object')
        expected[['dmissing']] = '10%'
        assert_series_equal(column_stats[self.columns].loc['missing_perc'],
                            expected[self.columns].astype('object'))
        # types
        expected = pd.Series(index=self.columns,
                             data=[np.nan],
                             name='types',
                             dtype='object')
        expected[['dbool1', 'dbool2']] = DataFrameSummary.TYPE_BOOL
        expected[['dcategoricals']] = DataFrameSummary.TYPE_CATEGORICAL
        expected[['dconstant']] = DataFrameSummary.TYPE_CONSTANT
        expected[['ddates']] = DataFrameSummary.TYPE_DATE
        expected[['duniques']] = DataFrameSummary.TYPE_UNIQUE
        expected[['dnumerics1', 'dnumerics2', 'dnumerics3',
                  'dmissing']] = DataFrameSummary.TYPE_NUMERIC
        assert_series_equal(column_stats[self.columns].loc['types'],
                            expected[self.columns].astype('object'))

    def test_numer_format_works_as_expected(self):
        """_number_format: two decimals for floats, thousands separators,
        no decimal part for whole-valued numbers."""
        float_nums = [(123.123, '123.12'), (123.1243453, '123.12'),
                      (213213213.123, '213,213,213.12')]
        int_nums = [(213214, '213,214'), (123213.00, '123,213')]
        for num, expected in float_nums:
            self.assertEqual(DataFrameSummary._number_format(num), expected)
        for num, expected in int_nums:
            self.assertEqual(DataFrameSummary._number_format(num), expected)

    def test_get_perc_works_as_expected(self):
        """_percent renders ratios as percentage strings."""
        float_nums = [(0.123, '12.30%'), (3.1243453, '312.43%'),
                      (213.12312, '21,312.31%')]
        int_nums = [(0.14, '14%'), (1.300, '130%')]
        for num, expected in float_nums:
            self.assertEqual(DataFrameSummary._percent(num), expected)
        for num, expected in int_nums:
            self.assertEqual(DataFrameSummary._percent(num), expected)

    def test_uniques_summary(self):
        """A fully-unique column reports only the base stats rows."""
        expected = pd.Series(
            index=['counts', 'uniques', 'missing', 'missing_perc', 'types'],
            data=[self.size, self.size, 0, '0%',
                  DataFrameSummary.TYPE_UNIQUE],
            name='duniques',
            dtype=object)
        assert_series_equal(self.dfs['duniques'], expected)

    def test_constant_summary(self):
        """A constant column reports its single top value plus base stats."""
        #: fixed on 20161026
        expected = pd.Series(index=[
            'top', 'counts', 'uniques', 'missing', 'missing_perc', 'types'
        ],
                             data=[
                                 'a: 1000', self.size, 1, 0, '0%',
                                 DataFrameSummary.TYPE_CONSTANT
                             ],
                             name='dconstant',
                             dtype=object)
        print(expected)
        print(self.dfs['dconstant'])
        assert_series_equal(self.dfs['dconstant'], expected)

    def test_bool1_summary(self):
        """Summary of a 0/1 boolean column: per-value counts/percentages
        followed by the base stats rows."""
        count_values = self.df['dbool1'].value_counts()
        total_count = self.df['dbool1'].count()
        count0 = count_values[0]
        count1 = count_values[1]
        perc0 = DataFrameSummary._percent(count0 / total_count)
        perc1 = DataFrameSummary._percent(count1 / total_count)
        expected = pd.Series(index=[
            '"0" count', '"0" perc', '"1" count', '"1" perc', 'counts',
            'uniques', 'missing', 'missing_perc', 'types'
        ],
                             data=[
                                 str(count0), perc0, str(count1), perc1,
                                 self.size, 2, 0, '0%',
                                 DataFrameSummary.TYPE_BOOL
                             ],
                             name='dbool1',
                             dtype=object)
        assert_series_equal(self.dfs['dbool1'], expected)

    def test_bool2_summary(self):
        """Summary of an 'a'/'b' boolean-like column."""
        count_values = self.df['dbool2'].value_counts()
        total_count = self.df['dbool2'].count()
        count0 = count_values['a']
        count1 = count_values['b']
        perc0 = DataFrameSummary._percent(count0 / total_count)
        perc1 = DataFrameSummary._percent(count1 / total_count)
        expected = pd.Series(index=[
            '"a" count', '"a" perc', '"b" count', '"b" perc', 'counts',
            'uniques', 'missing', 'missing_perc', 'types'
        ],
                             data=[
                                 str(count0), perc0, str(count1), perc1,
                                 self.size, 2, 0, '0%',
                                 DataFrameSummary.TYPE_BOOL
                             ],
                             name='dbool2',
                             dtype=object)
        assert_series_equal(self.dfs['dbool2'], expected)
        print(expected)

    def test_categorical_summary(self):
        """Categorical summary reports the category set, top value and
        base stats."""
        #: fixed on 20161026
        expected = pd.Series(index=[
            'cats', 'top', 'counts', 'uniques', 'missing', 'missing_perc',
            'types'
        ],
                             data=[{'a', 'c', 'b'}, 'a: 500', self.size, 3,
                                   0, '0%',
                                   DataFrameSummary.TYPE_CATEGORICAL],
                             name='dcategoricals',
                             dtype=object)
        assert_series_equal(self.dfs['dcategoricals'], expected)
        print(self.dfs['dcategoricals'])
        print(expected)

    def test_dates_summary(self):
        """Date summary reports inferred frequency, min/max/range and
        base stats."""
        #: fixed on 20161026
        dmin = self.df['ddates'].min()
        dmax = self.df['ddates'].max()
        freq = pd.infer_freq(self.df['ddates'])
        expected = pd.Series(index=[
            'freq', 'max', 'min', 'range', 'counts', 'uniques', 'missing',
            'missing_perc', 'types'
        ],
                             data=[
                                 freq, dmax, dmin, dmax - dmin, self.size,
                                 self.size, 0, '0%',
                                 DataFrameSummary.TYPE_DATE
                             ],
                             name='ddates',
                             dtype=object)
        assert_series_equal(self.dfs['ddates'], expected)
        print(self.dfs['ddates'])
        print(expected)

    def test_numerics_summary(self):
        """Full numeric summary for 'dnumerics1' matches a hand-built
        series (top_correlations row intentionally excluded)."""
        #: fixed on 20161026
        num1 = self.df['dnumerics1']
        dm, dmp = self.dfs._get_deviation_of_mean(num1)
        dam, damp = self.dfs._get_median_absolute_deviation(num1)
        #: new expected variable with `top_correlations` removed
        expected = pd.Series(
            index=[
                'mean', 'std', 'variance', 'min', 'max', '5%', '25%',
                '50%', '75%', '95%', 'iqr', 'kurtosis', 'skewness', 'sum',
                'mad', 'cv', 'zeros_num', 'zeros_perc',
                'deviating_of_mean', 'deviating_of_mean_perc',
                'deviating_of_median', 'deviating_of_median_perc',
                # 'top_correlations',  #: removing top_correlations
                'counts', 'uniques', 'missing', 'missing_perc', 'types'
            ],
            data=[
                num1.mean(), num1.std(), num1.var(), num1.min(),
                num1.max(), num1.quantile(0.05), num1.quantile(0.25),
                num1.quantile(0.5), num1.quantile(0.75),
                num1.quantile(0.95),
                num1.quantile(0.75) - num1.quantile(0.25),
                num1.kurt(), num1.skew(), num1.sum(),
                # NOTE(review): Series.mad() was removed in pandas 2.0 —
                # confirm the pinned pandas version supports it
                num1.mad(),
                num1.std() / num1.mean() if num1.mean() else np.nan,
                self.size - np.count_nonzero(num1),
                DataFrameSummary._percent(
                    (self.size - np.count_nonzero(num1)) / self.size),
                dm, dmp, dam, damp,
                # 'dnumerics2: 100%',  #: removing top_correlations
                self.size, self.size, 0, '0%',
                DataFrameSummary.TYPE_NUMERIC
            ],
            name='dnumerics1',
            dtype=object)
        print(self.dfs['dnumerics1'])
        print(expected)
        assert_series_equal(self.dfs['dnumerics1'], expected)