def test_discovery_associate(self):
    """Analysing two category columns returns one top-level entry per column.

    Both columns are synthetic categories of 1000 rows with a 1:3 relative
    frequency; the result must list 'cat' then 'gender' in that order.
    """
    builder = SyntheticBuilder.scratch_pad()
    frame = pd.DataFrame()
    # (column name, category labels) pairs, generated in this order
    for column, labels in (('cat', 'AB'), ('gender', 'MF')):
        frame[column] = builder.get_category(list(labels), relative_freq=[1, 3], size=1000)
    outcome = Discover.analyse_association(frame, columns_list=['cat', 'gender'])
    self.assertEqual(['cat', 'gender'], list(outcome))
Ejemplo n.º 2
0
 def test_associate_analysis_from_discovery(self):
     """Build a synthetic dataset from a discovered association and check its size.

     A 100-row frame with a category column and a numeric column is analysed
     with ``Discovery.analyse_association``; the analysis is then fed to
     ``model_analysis`` on an empty canonical of ``sample_size`` rows. The
     result must contain exactly the two analysed columns, each with
     ``sample_size`` entries.
     """
     # One definition of the target row count, shared by the empty canonical
     # and the length assertion below so the two cannot drift apart.
     sample_size = 1973
     df = pd.DataFrame()
     df['cat'] = self.tools.get_category(selection=list('ABC'),
                                         quantity=0.9,
                                         size=100,
                                         save_intent=False)
     df['values'] = self.tools.get_number(from_value=20,
                                          size=100,
                                          save_intent=False)
     # discover
     associate = [{
         'cat': {
             'dtype': 'category'
         },
         'values': {
             'dtype': 'category',
             'granularity': 5,
             'precision': 3
         }
     }]
     analysis = Discovery.analyse_association(df, columns_list=associate)
     # build
     empty_frame = self.tools.canonical2dict(method='@empty', size=sample_size)
     result = self.tools.model_analysis(canonical=empty_frame,
                                        analytics_model=analysis,
                                        save_intent=False)
     self.assertCountEqual(['cat', 'values'], result.keys())
     for key in result.keys():
         self.assertEqual(sample_size, len(result.get(key)))
Ejemplo n.º 3
0
 def test_associate_analysis_from_discovery(self):
     """Model an empty indexed frame from a discovered association and check its size.

     The association is discovered on a 100-row frame ('cat' and 'values',
     both treated as categories) and replayed onto a frame that only has an
     index of ``sample_size`` rows. Every resulting column must have
     ``sample_size`` entries.
     """
     sample_size = 173
     source = pd.DataFrame()
     source['cat'] = self.tools.get_category(selection=list('ABC'), quantity=0.9, size=100)
     source['values'] = self.tools.get_number(from_value=20, size=100)

     # discover: one association level covering both columns
     cat_spec = {'dtype': 'category'}
     values_spec = {'dtype': 'category', 'granularity': 5, 'precision': 3}
     tree = [{'cat': cat_spec, 'values': values_spec}]
     blueprint = Discovery.analyse_association(source, columns_list=tree)

     # build: replay the analysis onto an index-only frame
     target = pd.DataFrame(index=range(sample_size))
     result = self.tools.model_analysis(target, blueprint)

     self.assertCountEqual(['cat', 'values'], result.keys())
     for column in result.keys():
         generated = result.get(column)
         self.assertEqual(sample_size, len(generated))
 def test_analyse_associate_multi(self):
     """Smoke test: analyse three numeric columns as a flat list and print the result.

     No assertions are made; the test only checks that the analysis runs and
     prints the outcome for manual inspection.
     """
     builder = SyntheticBuilder.scratch_pad()
     size = 50
     frame = pd.DataFrame()
     # columns are generated in this fixed order; only some are analysed below
     frame['gender'] = builder.get_category(selection=['M', 'F'],
                                            relative_freq=[6, 4],
                                            size=size)
     frame['lived'] = builder.get_category(selection=['yes', 'no'],
                                           quantity=80.0,
                                           seed=31,
                                           size=size)
     frame['age'] = builder.get_number(from_value=20,
                                       to_value=80,
                                       relative_freq=[1, 2, 5, 6, 2, 1, 0.5],
                                       seed=31,
                                       size=size)
     frame['fare'] = builder.get_number(from_value=1000,
                                        relative_freq=[5, 0, 2],
                                        size=size,
                                        quantity=0.9,
                                        seed=31)
     frame['numbers'] = builder.get_number(from_value=1000,
                                           relative_freq=[5, 0, 2],
                                           size=size,
                                           quantity=0.9,
                                           seed=31)
     frame['dates'] = builder.get_datetime('10/10/2000',
                                           '31/12/2018',
                                           relative_freq=[1, 9, 4],
                                           size=size,
                                           quantity=0.9,
                                           seed=31)
     analysed_columns = ['numbers', 'age', 'fare']
     pprint(Discover.analyse_association(frame, analysed_columns))
 def test_analyse_associate_levels(self):
     """Check a two-level association tree with an excluded branch.

     Level one is 'gender' and 'age', level two is 'lived'. The 'age.lived'
     branch is excluded, so only 'gender' should carry a 'sub_category'
     section, holding a 'lived' entry under each of 'M' and 'F'.
     """
     builder = SyntheticBuilder.scratch_pad()
     size = 50
     frame = pd.DataFrame()
     frame['gender'] = builder.get_category(selection=['M', 'F'],
                                            relative_freq=[6, 4],
                                            size=size)
     frame['lived'] = builder.get_category(selection=['yes', 'no'],
                                           quantity=80.0,
                                           size=size)
     frame['age'] = builder.get_number(from_value=20,
                                       to_value=80,
                                       relative_freq=[1, 2, 5, 6, 2, 1, 0.5],
                                       size=size)
     frame['fare'] = builder.get_number(from_value=1000,
                                        relative_freq=[5, 0, 2],
                                        size=size,
                                        quantity=0.9)
     tree = [{'gender': {}, 'age':  {}}, {'lived': {}}]
     pruned = ['age.lived']
     result = Discover.analyse_association(frame, tree, pruned)

     # top level holds exactly the first-level columns
     self.assertCountEqual(['age', 'gender'], list(result.keys()))
     # the excluded branch leaves 'age' without children, 'gender' keeps them
     self.assertNotIn('sub_category', result.get('age').keys())
     gender_node = result.get('gender')
     self.assertIn('sub_category', gender_node.keys())
     children = gender_node.get('sub_category')
     self.assertCountEqual(['M', 'F'], list(children.keys()))
     for label in ('M', 'F'):
         self.assertCountEqual(['lived'], list(children.get(label).keys()))
    def save_canonical_schema(self,
                              schema_name: str = None,
                              canonical: pd.DataFrame = None,
                              schema_tree: list = None,
                              exclude_associate: list = None,
                              detail_numeric: bool = None,
                              strict_typing: bool = None,
                              category_limit: int = None,
                              save: bool = None):
        """ Analyses a canonical with DataDiscovery.analyse_association(...) and stores the outcome as a named
        canonical schema in the Property contract. Any argument that is not of the expected type (including None)
        is replaced by its default.

        :param schema_name: (optional) the name to store the schema under. default is self.REPORT_SCHEMA
        :param canonical: (optional) the DataFrame to analyse. default is the result of load_persist_canonical()
        :param schema_tree: (optional) the columns list passed to the analysis. default is all canonical columns
        :param exclude_associate: (optional) a list of dot notation tree of items to exclude from iteration
                (e.g. ['age.gender.salary']  will cut 'salary' branch from gender and all sub branches)
        :param detail_numeric: (optional) if numeric columns should have detail stats, slowing analysis. default False
        :param strict_typing: (optional) stops objects and string types being seen as categories. default True
        :param category_limit: (optional) a global cap on categories captured. default is 10
        :param save: (optional) forwarded unchanged to pm_persist(save=...)
        """
        if not isinstance(schema_name, str):
            schema_name = self.REPORT_SCHEMA
        if not isinstance(canonical, pd.DataFrame):
            canonical = self.load_persist_canonical()
        # the column default depends on the canonical resolved above
        if not isinstance(schema_tree, list):
            schema_tree = canonical.columns.to_list()
        if not isinstance(detail_numeric, bool):
            detail_numeric = False
        if not isinstance(strict_typing, bool):
            strict_typing = True
        if not isinstance(category_limit, int):
            category_limit = 10
        discovered = DataDiscovery.analyse_association(canonical,
                                                       columns_list=schema_tree,
                                                       exclude_associate=exclude_associate,
                                                       detail_numeric=detail_numeric,
                                                       strict_typing=strict_typing,
                                                       category_limit=category_limit)
        self.pm.set_canonical_schema(name=schema_name, schema=discovered)
        self.pm_persist(save=save)
 def test_analyse_associate_single(self):
     """Compare single-column association analysis against fixed control dictionaries.

     Three cases are checked in turn on the same growing frame: a category
     column ('gender'), a numeric column ('numbers') and a datetime column
     ('dates'). Each result is compared for equality with a hand-written
     control dictionary.
     """
     tools = SyntheticBuilder.scratch_pad()
     size = 50
     df = pd.DataFrame()
     df['gender'] = tools.get_category(selection=['M', 'F'], relative_freq=[6, 4], bounded_weighting=True, size=size)
     # category
     columns_list = [{'gender': {}}]
     result = Discover.analyse_association(df, columns_list)
     control = {'gender': {'analysis': {'intent': {'dtype': 'category',
                                                   'granularity': 2,
                                                   'lowest': 40.0,
                                                   'selection': ['M', 'F'],
                                                   'highest': 60.0,
                                                   'freq_precision': 2},
                                        'patterns': {'sample_distribution': [30, 20],
                                                     'relative_freq': [60.0, 40.0]},
                                        'stats': {'excluded_percent': 0.0,
                                                  'nulls_percent': 0.0,
                                                  'sample': 50}},
                           'associate': 'gender'}}
     self.assertEqual(control, result)
     # the same control must hold when these extra options are supplied
     columns_list = [{'gender': {'chunk_size': 1, 'replace_zero': 0}}]
     result = Discover.analyse_association(df, columns_list)
     self.assertEqual(control, result)
     # number
     df['numbers'] = tools.get_number(from_value=1000, relative_freq=[5,0,2], size=size, quantity=0.9, seed=31)
     # NOTE(review): this spec uses the key 'type' while the dates spec below uses 'dtype' - confirm this is intended
     columns_list = [{'numbers': {'type': 'number', 'granularity': 3}}]
     result = Discover.analyse_association(df, columns_list)
     # each stats key appears once: a repeated 'excluded_percent' entry was removed
     control = {'numbers': {'analysis': {'intent': {'dtype': 'number',
                                                    'granularity': 3,
                                                    'lowest': 9.0,
                                                    'precision': 3,
                                                    'selection': [(9.0, 330.0, 'both'),
                                                                  (330.0, 651.0, 'right'),
                                                                  (651.0, 972.0, 'right')],
                                                    'highest': 972.0,
                                                    'freq_precision': 2},
                                         'patterns': {'dominance_weighting': [50.0, 50.0],
                                                      'dominant_percent': 9.09,
                                                      'dominant_values': [100.0, 139.0],
                                                      'sample_distribution': [31, 0, 13],
                                                      'freq_mean': [140.484, 0.0, 827.231],
                                                      'relative_freq': [70.45, 0.0, 29.55],
                                                      'freq_std': [7568.791, 0.0, 7760.859]},
                                         'stats': {'bootstrap_bci': (253.857, 445.214),
                                                   'emp_outliers': [0, 0],
                                                   'excluded_percent': 0.0,
                                                   'irq_outliers': [0, 0],
                                                   'kurtosis': -1.03,
                                                   'mad': 285.91,
                                                   'mean': 343.39,
                                                   'nulls_percent': 12.0,
                                                   'sample': 44,
                                                   'sem': 49.52,
                                                   'skew': 0.84,
                                                   'var': 107902.71}},
                            'associate': 'numbers'}}
     self.assertEqual(control, result)
     # dates
     df['dates'] = tools.get_datetime('10/10/2000', '31/12/2018', relative_freq=[1, 9, 4], size=size, quantity=0.9, seed=31)
     columns_list = [{'dates': {'dtype': 'datetime', 'granularity': 3, 'date_format': '%d-%m-%Y'}}]
     control = {'dates': {'analysis': {'intent': {'date_format': '%d-%m-%Y',
                                                  'day_first': False,
                                                  'dtype': 'date',
                                                  'granularity': 3,
                                                  'lowest': '14-01-2003',
                                                  'selection': [('14-01-2003', '29-04-2008', 'both'),
                                                                ('29-04-2008', '13-08-2013', 'right'),
                                                                ('13-08-2013', '27-11-2018', 'right')],
                                                  'highest': '27-11-2018',
                                                  'freq_precision': 2,
                                                  'year_first': False},
                                       'patterns': {'sample_distribution': [12, 21, 11],
                                                    'relative_freq': [27.27, 47.73, 25.0]},
                                       'stats': {'bootstrap_bci': (14622.3654489759,
                                                                  15435.002697157),
                                                 'emp_outliers': [0, 0],
                                                 'excluded_percent': 0.0,
                                                 'irq_outliers': [0, 0],
                                                 'kurtosis': -0.5,
                                                 'mean': '14-03-2011',
                                                 'nulls_percent': 12.0,
                                                 'sample': 44,
                                                 'skew': 0.25}},
                          'associate': 'dates'}}
     result = Discover.analyse_association(df, columns_list)
     self.assertEqual(control, result)