def test_bootstrap_confidence_interval(self):
    normal = pd.Series(np.random.normal(size=10000))
    expo = pd.Series(np.random.exponential(size=10000))
    result = DataDiscovery.bootstrap_confidence_interval(normal)
    print(result)
    result = DataDiscovery.bootstrap_confidence_interval(expo)
    print(result)
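# For reference, a minimal sketch of the percentile bootstrap the test above exercises,
# assuming DataDiscovery.bootstrap_confidence_interval follows the standard
# resample-with-replacement approach (the helper name, defaults and return shape
# here are illustrative, not the library's actual signature):
def _bootstrap_ci_sketch(values: pd.Series, n_boot: int = 1000, ci: float = 0.95) -> tuple:
    # resample the series with replacement and collect the mean of each resample
    means = [values.sample(frac=1, replace=True).mean() for _ in range(n_boot)]
    # the interval is the central `ci` mass of the bootstrap distribution
    lower = np.percentile(means, (1 - ci) / 2 * 100)
    upper = np.percentile(means, (1 + ci) / 2 * 100)
    return lower, upper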
def test_analyse_date(self):
    tools = SyntheticBuilder.scratch_pad()
    str_dates = tools.get_datetime('12/01/2016', '12/01/2018', date_format='%d-%m-%Y', size=10, seed=31)
    ts_dates = tools.get_datetime('12/01/2016', '12/01/2018', size=10, seed=31)
    result = Discover.analyse_date(str_dates, granularity=3, date_format='%Y-%m-%d')
    control = {'intent': {'date_format': '%Y-%m-%d',
                          'day_first': False,
                          'dtype': 'date',
                          'granularity': 3,
                          'lowest': '2017-12-02',
                          'selection': [('2017-12-02', '2018-03-13', 'both'),
                                        ('2018-03-13', '2018-06-22', 'right'),
                                        ('2018-06-22', '2018-10-01', 'right')],
                          'highest': '2018-10-01',
                          'freq_precision': 2,
                          'year_first': False},
               'patterns': {'sample_distribution': [1, 0, 3],
                            'relative_freq': [25.0, 0.0, 75.0]},
               'stats': {'bootstrap_bci': (17572.5, 17797.75),
                         'emp_outliers': [0, 0],
                         'excluded_percent': 0.0,
                         'irq_outliers': [1, 0],
                         'kurtosis': 3.86,
                         'mean': '2018-07-04',
                         'nulls_percent': 60.0,
                         'sample': 4,
                         'skew': -1.96}}
    self.assertEqual(control, result)
    result = Discover.analyse_date(ts_dates, granularity=3)
    control = {'intent': {'day_first': False,
                          'dtype': 'date',
                          'granularity': 3,
                          'lowest': pd.Timestamp('2017-02-12 19:02:11.531780+0000', tz='UTC'),
                          'selection': [(pd.Timestamp('2017-02-12 19:02:11.531780+0000', tz='UTC'),
                                         pd.Timestamp('2017-09-08 17:43:30.973860+0000', tz='UTC'), 'both'),
                                        (pd.Timestamp('2017-09-08 17:43:30.973860+0000', tz='UTC'),
                                         pd.Timestamp('2018-04-04 16:24:50.415940+0000', tz='UTC'), 'right'),
                                        (pd.Timestamp('2018-04-04 16:24:50.415940+0000', tz='UTC'),
                                         pd.Timestamp('2018-10-29 15:06:09.858020+0000', tz='UTC'), 'right')],
                          'highest': pd.Timestamp('2018-10-29 15:06:09.858020+0000', tz='UTC'),
                          'freq_precision': 2,
                          'year_first': False},
               'patterns': {'sample_distribution': [2, 3, 5],
                            'relative_freq': [20.0, 30.0, 50.0]},
               'stats': {'bootstrap_bci': (17493.5054775573, 17724.4628926684),
                         'emp_outliers': [0, 0],
                         'excluded_percent': 0.0,
                         'irq_outliers': [1, 0],
                         'kurtosis': 0.64,
                         'mean': pd.Timestamp('2018-03-22 17:31:12+0000', tz='UTC'),
                         'nulls_percent': 0.0,
                         'sample': 10,
                         'skew': -0.94}}
    self.assertEqual(control, result)
def test_number_zero_count(self):
    dataset = [1, 0, 0, 1, 1, 1, 0, 0, 1, 0]
    result = DataAnalytics(analysis=Discover.analyse_number(dataset))
    self.assertEqual([50.0], result.get('dominance_weighting'))
    dataset = [1, 0, 0, 2, 5, 4, 0, 4, 1, 0]
    result = Discover.analyse_number(dataset, lower=0, granularity=3)
    self.assertEqual([40.0], result.get('dominance_weighting'))
    self.assertEqual([10], result.get('sample'))
    result = Discover.analyse_number(dataset, lower=1, granularity=3)
    self.assertEqual([40.0], result.get('dominance_weighting'))
    self.assertEqual([6], result.get('sample'))
def test_filter_fisher_score(self):
    df = sns.load_dataset('titanic')
    result = Discover.filter_fisher_score(df, target='survived')
    self.assertCountEqual(['class', 'pclass', 'deck', 'parch', 'sibsp', 'age'], result)
    result = Discover.filter_fisher_score(df, target='survived', inc_zero_score=True)
    self.assertCountEqual(['class', 'pclass', 'deck', 'parch', 'sibsp', 'age', 'fare'], result)
    result = Discover.filter_fisher_score(df, target='survived', inc_zero_score=True, top=3)
    self.assertEqual(3, len(result))
    result = Discover.filter_fisher_score(df, target='survived', top=0)
    self.assertEqual(6, len(result))
    result = Discover.filter_fisher_score(df, target='survived', top=20)
    self.assertEqual(6, len(result))
    result = Discover.filter_fisher_score(df, target='survived', inc_zero_score=True, top=20)
    self.assertEqual(7, len(result))
    result = Discover.filter_fisher_score(df, target='survived', inc_zero_score=True, top=0.3)
    self.assertEqual(2, len(result))
    result = Discover.filter_fisher_score(df, target='survived', inc_zero_score=True, top=0.999)
    self.assertEqual(5, len(result))
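# For reference, a minimal sketch of the per-feature Fisher score the filter above
# ranks by, assuming the classic between-class over within-class variance ratio
# (the helper name is illustrative; the library's NaN handling and categorical
# encoding are its own):
def _fisher_score_sketch(feature: pd.Series, target: pd.Series) -> float:
    overall_mean = feature.mean()
    between, within = 0.0, 0.0
    for _, group in feature.groupby(target):
        between += len(group) * (group.mean() - overall_mean) ** 2
        within += len(group) * group.var(ddof=0)
    # guard the degenerate case of zero within-class variance
    return between / within if within else 0.0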
def test_analysis_granularity_list(self):
    dataset = [0, 1, 2, 3]
    intervals = [(0, 1, 'both'), (1, 2), (2, 3)]
    result = DataAnalytics(Discover.analyse_number(dataset, granularity=intervals))
    control = [50.0, 25.0, 25.0]
    self.assertEqual(control, result.patterns.relative_freq)
    intervals = [(0, 1, 'both'), (1, 2, 'both'), (2, 3)]
    result = DataAnalytics(Discover.analyse_number(dataset, granularity=intervals))
    control = [50.0, 50.0, 25.0]
    self.assertEqual(control, result.patterns.relative_freq)
    # percentile
    dataset = [0, 0, 2, 2, 1, 2, 3]
    percentile = [.25, .5, .75]
    result = DataAnalytics(Discover.analyse_number(dataset, granularity=percentile))
    control = [28.57, 57.14, 0.0, 14.29]
    self.assertEqual(control, result.patterns.relative_freq)
def test_discovery_associate(self):
    tools = SyntheticBuilder.scratch_pad()
    df = pd.DataFrame()
    df['cat'] = tools.get_category(list('AB'), relative_freq=[1, 3], size=1000)
    df['gender'] = tools.get_category(list('MF'), relative_freq=[1, 3], size=1000)
    result = Discover.analyse_association(df, columns_list=['cat', 'gender'])
    self.assertEqual(['cat', 'gender'], list(result))
def test_associate_analysis_from_discovery(self):
    df = pd.DataFrame()
    df['cat'] = self.tools.get_category(selection=list('ABC'), quantity=0.9, size=100)
    df['values'] = self.tools.get_number(from_value=20, size=100)
    # discover
    associate = [{'cat': {'dtype': 'category'},
                  'values': {'dtype': 'category', 'granularity': 5, 'precision': 3}}]
    analysis = Discover.analyse_association(df, columns_list=associate)
    # build
    sample_size = 173
    result = self.tools.model_analysis(pd.DataFrame(index=range(sample_size)), analysis)
    self.assertCountEqual(['cat', 'values'], result.keys())
    for key in result.keys():
        self.assertEqual(sample_size, len(result.get(key)))
def test_associate_analysis_from_discovery_empty_frame(self):
    df = pd.DataFrame()
    df['cat'] = self.tools.get_category(selection=list('ABC'), quantity=0.9, size=100, save_intent=False)
    df['values'] = self.tools.get_number(from_value=20, size=100, save_intent=False)
    # discover
    associate = [{'cat': {'dtype': 'category'},
                  'values': {'dtype': 'category', 'granularity': 5, 'precision': 3}}]
    analysis = Discover.analyse_association(df, columns_list=associate)
    # build
    sample_size = 1973
    empty_frame = self.tools.canonical2dict(method='@empty', size=sample_size)
    result = self.tools.model_analysis(canonical=empty_frame, analytics_model=analysis, save_intent=False)
    self.assertCountEqual(['cat', 'values'], result.keys())
    for key in result.keys():
        self.assertEqual(sample_size, len(result.get(key)))
def test_discovery_analytics_class(self):
    tools = SyntheticBuilder.scratch_pad()
    dataset = tools.get_category(list('ABCDE') + [np.nan], relative_freq=[1, 3, 2, 7, 4], size=694)
    result = Discover.analyse_category(dataset)
    analytics = DataAnalytics(analysis=result)
    self.assertEqual(analytics.intent.selection, analytics.sample_map.index.to_list())
    self.assertEqual(analytics.patterns.sample_distribution, analytics.sample_map.to_list())
def test_shapiro_wilks(self):
    norm = pd.Series(np.random.normal(size=10000))
    expo = pd.Series(np.random.exponential(size=10000))
    logistic = pd.Series(np.random.logistic(size=10000))
    norm.sort_values(inplace=True)
    # result = DataDiscovery.shapiro_wilk_normality(norm)
    # print(result)
    # result = DataDiscovery.dagostinos_k2_normality(norm)
    # print(result)
    result = DataDiscovery.anderson_darling_tests(norm, dist='norm')
    print(result)
    result = DataDiscovery.anderson_darling_tests(expo, dist='expon')
    print(result)
    result = DataDiscovery.anderson_darling_tests(logistic, dist='logistic')
    print(result)
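# For reference, the wrapper above plausibly delegates to scipy's Anderson-Darling
# test; a minimal sketch using scipy.stats.anderson directly (the helper name is
# illustrative, and the library's actual return format is its own):
def _anderson_darling_sketch(values: pd.Series, dist: str = 'norm'):
    from scipy import stats
    result = stats.anderson(values, dist=dist)
    # the statistic is compared against the critical value at each significance level;
    # exceeding a critical value rejects the hypothesised distribution at that level
    return result.statistic, result.critical_values, result.significance_level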
def test_analyse_number_lower_upper(self):
    # Default
    dataset = list(range(1, 10))
    result = DataAnalytics(Discover.analyse_number(dataset))
    self.assertEqual(1, result.stats.lowest)
    self.assertEqual(9, result.stats.highest)
    # Outer Boundaries
    result = DataAnalytics(Discover.analyse_number(dataset, lower=0, upper=10))
    self.assertEqual(0, result.params.lower)
    self.assertEqual(10, result.params.upper)
    self.assertEqual(0, result.stats.lowest)
    self.assertEqual(10, result.stats.highest)
    # Inner Boundaries
    result = DataAnalytics(Discover.analyse_number(dataset, lower=2, upper=8))
    self.assertEqual(2, result.params.lower)
    self.assertEqual(8, result.params.upper)
    self.assertEqual(2, result.stats.lowest)
    self.assertEqual(8, result.stats.highest)
def get_weights(self, df, columns: list, index: int, weighting: dict):
    col = columns[index]
    weighting.update({col: Discover.analyse_category(df[col])})
    # stop once the last column in the chain has been analysed
    if index == len(columns) - 1:
        return
    # otherwise nest an analysis of the next column under each category value
    for category in weighting.get(col).get('selection'):
        if weighting.get(col).get('sub_category') is None:
            weighting[col].update({'sub_category': {}})
        weighting.get(col).get('sub_category').update({category: {}})
        sub_category = weighting.get(col).get('sub_category').get(category)
        self.get_weights(df[df[col] == category], columns, index + 1, sub_category)
    return
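# Hypothetical usage of the recursive helper above, with illustrative column names:
#     weighting = {}
#     self.get_weights(df, columns=['gender', 'lived'], index=0, weighting=weighting)
# `weighting` then holds the analysis of 'gender', with a 'sub_category' entry per
# gender value containing the analysis of 'lived' within that slice of the frame.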
def test_filter_correlated(self):
    tools = SyntheticBuilder.scratch_pad()
    df = pd.DataFrame()
    df['col1'] = [1, 2, 3, 4, 5, 6, 7]
    df['col2'] = [1, 2, 3, 4, 5, 6, 7]
    df['col3'] = [2, 2, 3, 2, 2, 2, 3]
    df['col4'] = [2, 2, 3, 2, 2, 2, 3]
    df['col5'] = [2, 2, 3, 2, 2, 2, 3]
    df['col6'] = [7, 2, 4, 2, 1, 6, 4]
    df['target'] = [1, 0, 1, 1, 0, 0, 1]
    result = DataDiscovery.filter_correlated(df, target='target')
    print(result)
def test_analyse_number(self):
    dataset = []
    result = Discover.analyse_number(dataset)
    control = [1]
    self.assertEqual(control, result.get('patterns').get('relative_freq'))
    self.assertEqual((0, 0, 3), (result.get('intent').get('lowest'),
                                 result.get('intent').get('highest'),
                                 result.get('intent').get('granularity')))
    self.assertEqual(0, result.get('stats').get('sample_size'))
    dataset = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    result = Discover.analyse_number(dataset, granularity=2)
    control = [50.0, 50.0]
    self.assertEqual(control, result.get('patterns').get('relative_freq'))
    self.assertEqual((1, 10), (result.get('intent').get('lowest'), result.get('intent').get('highest')))
    control = [(1.0, 5.5, 'both'), (5.5, 10.0, 'right')]
    self.assertEqual(control, result.get('intent').get('intervals'))
    self.assertEqual(2, result.get('intent').get('granularity'))
    result = Discover.analyse_number(dataset, granularity=3.0)
    control = [30.0, 30.0, 40.0]
    self.assertEqual(control, result.get('patterns').get('relative_freq'))
    self.assertEqual((1, 10), (result.get('intent').get('lowest'), result.get('intent').get('highest')))
    control = [(1.0, 4.0, 'left'), (4.0, 7.0, 'left'), (7.0, 10.0, 'both')]
    self.assertEqual(control, result.get('intent').get('intervals'))
    self.assertEqual(3.0, result.get('intent').get('granularity'))
    result = Discover.analyse_number(dataset, granularity=2.0)
    control = [20.0, 20.0, 20.0, 20.0, 20.0]
    self.assertEqual(control, result.get('patterns').get('relative_freq'))
    self.assertEqual((1, 10), (result.get('intent').get('lowest'), result.get('intent').get('highest')))
    control = [(1.0, 3.0, 'left'), (3.0, 5.0, 'left'), (5.0, 7.0, 'left'), (7.0, 9.0, 'left'), (9.0, 11.0, 'both')]
    self.assertEqual(control, result.get('intent').get('intervals'))
    self.assertEqual(2.0, result.get('intent').get('granularity'))
    result = Discover.analyse_number(dataset, granularity=1.0, lower=0, upper=5)
    control = [0.0, 20.0, 20.0, 20.0, 20.0, 20.0]
    self.assertEqual(control, result.get('patterns').get('relative_freq'))
    self.assertEqual((0, 5), (result.get('intent').get('lowest'), result.get('intent').get('highest')))
    control = [(0.0, 1.0, 'left'), (1.0, 2.0, 'left'), (2.0, 3.0, 'left'), (3.0, 4.0, 'left'), (4.0, 5.0, 'left'),
               (5.0, 6.0, 'both')]
    self.assertEqual(control, result.get('intent').get('intervals'))
    self.assertEqual(1.0, result.get('intent').get('granularity'))
    pprint(result)
def test_analyse_category_limits(self):
    top = 2
    dataset = ['A'] * 8 + ['B'] * 6 + ['C'] * 4 + ['D'] * 2
    result = Discover.analyse_category(dataset, top=top, freq_precision=0)
    control = ['dtype', 'categories', 'top', 'highest_unique', 'lowest_unique', 'category_count']
    self.assertCountEqual(control, list(result.get('intent')))
    self.assertEqual(top, len(result.get('intent').get('categories')))
    self.assertCountEqual(['A', 'B'], result.get('intent').get('categories'))
    self.assertCountEqual([40, 30], result.get('patterns').get('relative_freq'))
    self.assertEqual(30, result.get('stats').get('excluded_percent'))
    self.assertEqual(14, result.get('stats').get('sample_size'))
    lower = 0.2
    upper = 7
    result = Discover.analyse_category(dataset, lower=lower, upper=upper, freq_precision=0)
    control = ['dtype', 'categories', 'highest_unique', 'lowest_unique', 'granularity']
    self.assertCountEqual(control, list(result.get('intent').keys()))
    self.assertEqual(lower, result.get('intent').get('lowest_unique'))
    self.assertEqual(upper, result.get('intent').get('highest_unique'))
    self.assertCountEqual(['C', 'B'], result.get('intent').get('categories'))
    self.assertCountEqual([33, 50], result.get('patterns').get('relative_freq'))
    self.assertEqual(50, result.get('stats').get('excluded_percent'))
    self.assertEqual(10, result.get('stats').get('sample_size'))
def test_analyse_associate_multi(self):
    tools = SyntheticBuilder.scratch_pad()
    size = 50
    df = pd.DataFrame()
    df['gender'] = tools.get_category(selection=['M', 'F'], relative_freq=[6, 4], size=size)
    df['lived'] = tools.get_category(selection=['yes', 'no'], quantity=80.0, seed=31, size=size)
    df['age'] = tools.get_number(from_value=20, to_value=80, relative_freq=[1, 2, 5, 6, 2, 1, 0.5], seed=31, size=size)
    df['fare'] = tools.get_number(from_value=1000, relative_freq=[5, 0, 2], size=size, quantity=0.9, seed=31)
    df['numbers'] = tools.get_number(from_value=1000, relative_freq=[5, 0, 2], size=size, quantity=0.9, seed=31)
    df['dates'] = tools.get_datetime('10/10/2000', '31/12/2018', relative_freq=[1, 9, 4], size=size, quantity=0.9,
                                     seed=31)
    columns_list = ['numbers', 'age', 'fare']
    result = Discover.analyse_association(df, columns_list)
    pprint(result)
def test_filter_univariate_roc_auc(self):
    tr = Transition.from_env('test', default_save=False, default_save_intent=False, has_contract=False)
    tr.set_source('paribas.csv', nrows=5000)
    data = tr.load_source_canonical()
    result = Discover.filter_univariate_roc_auc(data, target='target', threshold=0.55)
    self.assertCountEqual(['v10', 'v129', 'v14', 'v62', 'v50'], result)
    # Custom classifier
    classifier_kwargs = {'iterations': 2, 'learning_rate': 1, 'depth': 2}
    result = Discover.filter_univariate_roc_auc(data, target='target', threshold=0.55, package='catboost',
                                                model='CatBoostClassifier', classifier_kwargs=classifier_kwargs,
                                                fit_kwargs={'verbose': False})
    self.assertCountEqual(['v50', 'v10', 'v14', 'v12', 'v129', 'v62', 'v21', 'v34'], result)
def test_filter_univariate_mse(self):
    tr = Transition.from_env('test', default_save=False, default_save_intent=False, has_contract=False)
    tr.set_source('ames_housing.csv', nrows=5000)
    data = tr.load_source_canonical()
    result = Discover.filter_univariate_mse(data, target='SalePrice', as_series=False, top=5)
    self.assertEqual(['OverallQual', 'GarageCars', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt'], result)
def test_analyse_category(self):
    builder = SyntheticBuilder.from_memory()
    tools = builder.tools
    dataset = tools.get_category(list('ABCDE') + [np.nan], relative_freq=[1, 3, 2, 7, 4], size=694)
    result = Discover.analyse_category(dataset)
    control = ['intent', 'patterns', 'stats', 'params']
    self.assertCountEqual(control, list(result.keys()))
    control = ['dtype', 'categories', 'highest_unique', 'lowest_unique', 'category_count']
    self.assertCountEqual(control, list(result.get('intent').keys()))
    control = ['relative_freq', 'sample_distribution']
    self.assertCountEqual(control, list(result.get('patterns').keys()))
    control = ['nulls_percent', 'sample_size', 'excluded_percent']
    self.assertCountEqual(control, list(result.get('stats').keys()))
    control = ['freq_precision']
    self.assertCountEqual(control, list(result.get('params').keys()))
def test_analyse_associate_levels(self):
    tools = SyntheticBuilder.scratch_pad()
    size = 50
    df = pd.DataFrame()
    df['gender'] = tools.get_category(selection=['M', 'F'], relative_freq=[6, 4], size=size)
    df['lived'] = tools.get_category(selection=['yes', 'no'], quantity=80.0, size=size)
    df['age'] = tools.get_number(from_value=20, to_value=80, relative_freq=[1, 2, 5, 6, 2, 1, 0.5], size=size)
    df['fare'] = tools.get_number(from_value=1000, relative_freq=[5, 0, 2], size=size, quantity=0.9)
    columns_list = [{'gender': {}, 'age': {}}, {'lived': {}}]
    exclude = ['age.lived']
    result = Discover.analyse_association(df, columns_list, exclude)
    self.assertCountEqual(['age', 'gender'], list(result.keys()))
    self.assertNotIn('sub_category', result.get('age').keys())
    self.assertIn('sub_category', result.get('gender').keys())
    self.assertCountEqual(['M', 'F'], list(result.get('gender').get('sub_category').keys()))
    self.assertCountEqual(['lived'], list(result.get('gender').get('sub_category').get('M').keys()))
    self.assertCountEqual(['lived'], list(result.get('gender').get('sub_category').get('F').keys()))
def save_canonical_schema(self, schema_name: str = None, canonical: pd.DataFrame = None, schema_tree: list = None,
                          exclude_associate: list = None, detail_numeric: bool = None, strict_typing: bool = None,
                          category_limit: int = None, save: bool = None):
    """ Saves the canonical schema to the Property contract. By default the clean canonical is loaded, but
    optionally a canonical can be passed to base the schema on, and a name given other than the default.

    :param schema_name: (optional) the name of the schema to save
    :param canonical: (optional) the canonical to base the schema on
    :param schema_tree: (optional) an analytics dict (see Discovery.analyse_association(...))
    :param exclude_associate: (optional) a list of dot-notation tree items to exclude from iteration
            (e.g. ['age.gender.salary'] will cut the 'salary' branch from gender and all sub-branches)
    :param detail_numeric: (optional) if numeric columns should have detail stats, slowing analysis. Default False
    :param strict_typing: (optional) stops object and string types being seen as categories. Default True
    :param category_limit: (optional) a global cap on categories captured. Default is 10
    :param save: (optional) if True, save to file. Default is True
    """
    schema_name = schema_name if isinstance(schema_name, str) else self.REPORT_SCHEMA
    canonical = canonical if isinstance(canonical, pd.DataFrame) else self.load_persist_canonical()
    schema_tree = schema_tree if isinstance(schema_tree, list) else canonical.columns.to_list()
    detail_numeric = detail_numeric if isinstance(detail_numeric, bool) else False
    strict_typing = strict_typing if isinstance(strict_typing, bool) else True
    category_limit = category_limit if isinstance(category_limit, int) else 10
    analytics = DataDiscovery.analyse_association(canonical, columns_list=schema_tree,
                                                  exclude_associate=exclude_associate,
                                                  detail_numeric=detail_numeric, strict_typing=strict_typing,
                                                  category_limit=category_limit)
    self.pm.set_canonical_schema(name=schema_name, schema=analytics)
    self.pm_persist(save=save)
    return
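# A hypothetical call, assuming a component instance `tr` with a loaded canonical
# (the arguments shown are illustrative):
#     tr.save_canonical_schema(schema_tree=['age', 'gender'], detail_numeric=True)
# The resulting analytics dict is stored against the Property contract under the
# default REPORT_SCHEMA name unless a schema_name is given.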
def canonical_report(canonical, stylise: bool = True, inc_next_dom: bool = False, report_header: str = None,
                     condition: str = None):
    """The Canonical Report is a data dictionary of the canonical, providing a reference view of the dataset's
    attribute properties.

    :param canonical: the DataFrame to view
    :param stylise: if True, present the report stylised
    :param inc_next_dom: (optional) if True, include the next dominant element column
    :param report_header: (optional) filter on a header where the condition is true. Condition must exist
    :param condition: (optional) the condition to apply to the header. Header must exist.
            examples: ' > 0.95', ".str.contains('shed')"
    :return:
    """
    return DataDiscovery.data_dictionary(df=canonical, stylise=stylise, inc_next_dom=inc_next_dom,
                                         report_header=report_header, condition=condition)
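# A hypothetical call, filtering the dictionary to headers where a condition holds
# (the header name shown is illustrative, not a confirmed report column):
#     canonical_report(df, report_header='%_Null', condition=' > 0.95')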
def test_jensen_shannon_distance(self):
    p = [0.20, 0.60, 0.20]
    q = [0.20, 0.599, 0.201]
    result = Discover.jensen_shannon_distance(p, q, precision=5)
    print(result)
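# For reference, a minimal sketch of the Jensen-Shannon distance the test above
# exercises, assuming the standard definition as the square root of the JS
# divergence (natural-log base here; the library may use a different base):
def _jensen_shannon_sketch(p, q, precision: int = 5) -> float:
    from scipy.stats import entropy
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    m = (p + q) / 2
    # JS divergence is the mean of the two KL divergences against the midpoint
    divergence = (entropy(p, m) + entropy(q, m)) / 2
    return round(float(np.sqrt(divergence)), precision)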
def test_interquartile_outliers(self):
    expo = pd.Series(np.random.exponential(size=10000))
    result = DataDiscovery.interquartile_outliers(expo)
    print(result)
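# For reference, a minimal sketch of interquartile-range outlier detection,
# assuming the conventional Tukey fences at 1.5 * IQR (the library method's
# fences and return structure are its own):
def _iqr_outliers_sketch(values: pd.Series, k: float = 1.5) -> pd.Series:
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    # anything beyond the lower or upper fence is flagged as an outlier
    return values[(values < q1 - k * iqr) | (values > q3 + k * iqr)]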
def test_analyse_associate_single(self):
    tools = SyntheticBuilder.scratch_pad()
    size = 50
    df = pd.DataFrame()
    df['gender'] = tools.get_category(selection=['M', 'F'], relative_freq=[6, 4], bounded_weighting=True, size=size)
    # category
    columns_list = [{'gender': {}}]
    result = Discover.analyse_association(df, columns_list)
    control = {'gender': {'analysis': {'intent': {'dtype': 'category',
                                                  'granularity': 2,
                                                  'lowest': 40.0,
                                                  'selection': ['M', 'F'],
                                                  'highest': 60.0,
                                                  'freq_precision': 2},
                                       'patterns': {'sample_distribution': [30, 20],
                                                    'relative_freq': [60.0, 40.0]},
                                       'stats': {'excluded_percent': 0.0,
                                                 'nulls_percent': 0.0,
                                                 'sample': 50}},
                          'associate': 'gender'}}
    self.assertEqual(control, result)
    columns_list = [{'gender': {'chunk_size': 1, 'replace_zero': 0}}]
    result = Discover.analyse_association(df, columns_list)
    self.assertEqual(control, result)
    # number
    df['numbers'] = tools.get_number(from_value=1000, relative_freq=[5, 0, 2], size=size, quantity=0.9, seed=31)
    columns_list = [{'numbers': {'type': 'number', 'granularity': 3}}]
    result = Discover.analyse_association(df, columns_list)
    control = {'numbers': {'analysis': {'intent': {'dtype': 'number',
                                                   'granularity': 3,
                                                   'lowest': 9.0,
                                                   'precision': 3,
                                                   'selection': [(9.0, 330.0, 'both'),
                                                                 (330.0, 651.0, 'right'),
                                                                 (651.0, 972.0, 'right')],
                                                   'highest': 972.0,
                                                   'freq_precision': 2},
                                        'patterns': {'dominance_weighting': [50.0, 50.0],
                                                     'dominant_percent': 9.09,
                                                     'dominant_values': [100.0, 139.0],
                                                     'sample_distribution': [31, 0, 13],
                                                     'freq_mean': [140.484, 0.0, 827.231],
                                                     'relative_freq': [70.45, 0.0, 29.55],
                                                     'freq_std': [7568.791, 0.0, 7760.859]},
                                        'stats': {'bootstrap_bci': (253.857, 445.214),
                                                  'emp_outliers': [0, 0],
                                                  'excluded_percent': 0.0,
                                                  'irq_outliers': [0, 0],
                                                  'kurtosis': -1.03,
                                                  'mad': 285.91,
                                                  'mean': 343.39,
                                                  'nulls_percent': 12.0,
                                                  'sample': 44,
                                                  'sem': 49.52,
                                                  'skew': 0.84,
                                                  'var': 107902.71}},
                           'associate': 'numbers'}}
    self.assertEqual(control, result)
    # dates
    df['dates'] = tools.get_datetime('10/10/2000', '31/12/2018', relative_freq=[1, 9, 4], size=size, quantity=0.9,
                                     seed=31)
    columns_list = [{'dates': {'dtype': 'datetime', 'granularity': 3, 'date_format': '%d-%m-%Y'}}]
    control = {'dates': {'analysis': {'intent': {'date_format': '%d-%m-%Y',
                                                 'day_first': False,
                                                 'dtype': 'date',
                                                 'granularity': 3,
                                                 'lowest': '14-01-2003',
                                                 'selection': [('14-01-2003', '29-04-2008', 'both'),
                                                               ('29-04-2008', '13-08-2013', 'right'),
                                                               ('13-08-2013', '27-11-2018', 'right')],
                                                 'highest': '27-11-2018',
                                                 'freq_precision': 2,
                                                 'year_first': False},
                                      'patterns': {'sample_distribution': [12, 21, 11],
                                                   'relative_freq': [27.27, 47.73, 25.0]},
                                      'stats': {'bootstrap_bci': (14622.3654489759, 15435.002697157),
                                                'emp_outliers': [0, 0],
                                                'excluded_percent': 0.0,
                                                'irq_outliers': [0, 0],
                                                'kurtosis': -0.5,
                                                'mean': '14-03-2011',
                                                'nulls_percent': 12.0,
                                                'sample': 44,
                                                'skew': 0.25}},
                         'associate': 'dates'}}
    result = Discover.analyse_association(df, columns_list)
    self.assertEqual(control, result)
@classmethod
def discovery_pad(cls) -> DataDiscovery:
    """A class method to use the component's discovery methods as a scratch pad"""
    return DataDiscovery()
@property
def discover(self) -> DataDiscovery:
    """The component's discovery instance"""
    return DataDiscovery()