Beispiel #1
0
 def synthetic_agent(agent_name: str, size: int, remote_uri: str):
     """Run the synthetic, transition and feature pipelines for one agent.

     Each component is built with ``from_env`` and its pipeline is run before
     the next component is built.

     :param agent_name: task name handed to every ``from_env`` call
     :param size: forwarded as ``size`` to ``run_synthetic_pipeline``
     :param remote_uri: forwarded as ``uri_pm_repo`` to every ``from_env`` call
     """
     synthetic = SyntheticBuilder.from_env(agent_name, uri_pm_repo=remote_uri)
     synthetic.run_synthetic_pipeline(size=size)

     transition = Transition.from_env(agent_name, uri_pm_repo=remote_uri)
     transition.run_transition_pipeline()

     catalog = FeatureCatalog.from_env(agent_name, uri_pm_repo=remote_uri)
     catalog.run_feature_pipeline()
Beispiel #2
0
 def test_associate_analysis_complex(self):
     """Model synthetic rows from an age -> pregnancies association analysis.

     Loads the remote diabetes dataset, analyses the association between
     ``age`` and ``pregnancies``, generates 1973 rows from the analysis blob
     and checks the resulting shape. It then checks that the mean of the
     standardised synthetic ``pregnancies`` falls inside the bootstrap
     confidence interval computed from the standardised source column.
     """
     source_uri = 'https://assets.datacamp.com/production/repositories/628/datasets/444cdbf175d5fbf564b564bd36ac21740627a834/diabetes.csv'
     builder = SyntheticBuilder.from_memory()
     builder.add_connector_uri('clinical_health', uri=source_uri)
     discover: DataDiscovery = Transition.from_memory().discover

     # association levels: age first, then pregnancies
     age_level = discover.analysis2dict(
         header='age', dtype='int', granularity=10.0, lower=21, upper=90)
     pregnancies_level = discover.analysis2dict(header='pregnancies')
     levels = [age_level, pregnancies_level]

     source_df = builder.load_canonical('clinical_health')
     blob = discover.analyse_association(source_df, columns_list=levels)

     empty_canonical = builder.tools.canonical2dict(method='@empty', size=1973)
     synthetic_df = builder.tools.model_analysis(
         empty_canonical, analytics_model=blob, column_name='clinical')
     self.assertEqual((1973, 2), synthetic_df.shape)

     # confidence interval of the mean for the source column
     source_values = SyntheticCommons.list_formatter(source_df.pregnancies)
     source_values = SyntheticCommons.list_standardize(source_values)
     low, high = discover.bootstrap_confidence_interval(
         pd.Series(source_values), func=np.mean)

     # the synthetic mean must fall inside that interval
     synthetic_values = SyntheticCommons.list_formatter(synthetic_df.pregnancies)
     synthetic_values = SyntheticCommons.list_standardize(synthetic_values)
     self.assertTrue(low <= np.mean(synthetic_values) <= high)
    def setUp(self):
        """Reset the HADRON environment and persist a three-task chain.

        Removes every ``HADRON*`` environment variable, points
        ``HADRON_PM_PATH`` and ``HADRON_DEFAULT_PATH`` at ``work/config`` and
        ``work/data``, creates both directories and clears the
        ``PropertyManager``. It then persists a ``SyntheticBuilder``
        ('task1'), a ``Transition`` ('task2') sourced from the builder's
        persist URI, and a ``Wrangle`` ('task3') sourced from the
        transition's persist URI.

        :raises IOError: if either directory cannot be created (this includes
            the case where it already exists); the underlying ``OSError`` is
            chained as the cause
        """
        # clean out any old environments; snapshot the keys so entries can be
        # deleted safely while looping
        for key in list(os.environ):
            if key.startswith('HADRON'):
                del os.environ[key]

        os.environ['HADRON_PM_PATH'] = os.path.join('work', 'config')
        os.environ['HADRON_DEFAULT_PATH'] = os.path.join('work', 'data')
        try:
            os.makedirs(os.environ['HADRON_PM_PATH'])
            os.makedirs(os.environ['HADRON_DEFAULT_PATH'])
        except OSError as err:
            # only filesystem errors are translated; KeyboardInterrupt and
            # SystemExit are no longer swallowed by a bare except
            raise IOError('Unable to create directories') from err
        PropertyManager._remove_all()
        builder = SyntheticBuilder.from_env('task1', has_contract=False)
        builder.set_persist()
        builder.pm_persist()
        tr = Transition.from_env('task2', has_contract=False)
        tr.set_source_uri(builder.get_persist_contract().raw_uri)
        tr.set_persist()
        tr.pm_persist()
        wr = Wrangle.from_env('task3', has_contract=False)
        wr.set_source_uri(tr.get_persist_contract().raw_uri)
        wr.set_persist()
        wr.pm_persist()
Beispiel #4
0
 def setUp(self):
     """Reset the HADRON environment and register a two-task controller.

     Removes every ``HADRON*`` environment variable, configures a JSON
     property manager under ``working/contracts`` with default data under
     ``working/data``, makes sure both directories exist and clears the
     ``PropertyManager``. A ``Transition`` ('task1') is sourced from the
     remote titanic CSV, a ``Wrangle`` ('task2') is sourced from the
     transition's persist URI, and both are registered on a ``Controller``
     with intent levels 'transition' and 'wrangle'.
     """
     # clean out any old environments; snapshot the keys so entries can be
     # deleted safely while looping
     for key in list(os.environ):
         if key.startswith('HADRON'):
             del os.environ[key]
     # Local Domain Contract
     os.environ['HADRON_PM_PATH'] = os.path.join('working', 'contracts')
     os.environ['HADRON_PM_TYPE'] = 'json'
     # Local Connectivity
     os.environ['HADRON_DEFAULT_PATH'] = Path('working/data').as_posix()
     # Specialist Component
     # exist_ok tolerates directories left by an earlier run, while real
     # failures (e.g. permissions) surface instead of being swallowed
     for env_name in ('HADRON_PM_PATH', 'HADRON_DEFAULT_PATH'):
         os.makedirs(os.environ[env_name], exist_ok=True)
     PropertyManager._remove_all()
     tr = Transition.from_env('task1', has_contract=False)
     tr.set_source_uri(
         "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
     )
     tr.set_persist()
     wr = Wrangle.from_env('task2', has_contract=False)
     wr.set_source_uri(tr.get_persist_contract().raw_uri)
     wr.set_persist()
     controller = Controller.from_env(has_contract=False)
     controller.intent_model.transition(canonical=pd.DataFrame(),
                                        task_name='task1',
                                        intent_level='transition')
     controller.intent_model.wrangle(canonical=pd.DataFrame(),
                                     task_name='task2',
                                     intent_level='wrangle')
Beispiel #5
0
    def setUp(self):
        """Reset the HADRON environment and register a two-task controller.

        Removes every ``HADRON*`` environment variable, points
        ``HADRON_PM_PATH`` and ``HADRON_DEFAULT_PATH`` at ``work/config`` and
        ``work/data``, makes sure both directories exist and clears the
        ``PropertyManager``. A ``Transition`` ('task1') is sourced from the
        remote titanic CSV, a ``Wrangle`` ('task2') is sourced from the
        transition's persist URI, and both are registered on a ``Controller``
        with intent levels 'task1_tr' and 'task2_wr'.
        """
        # clean out any old environments; snapshot the keys so entries can be
        # deleted safely while looping
        for key in list(os.environ):
            if key.startswith('HADRON'):
                del os.environ[key]

        os.environ['HADRON_PM_PATH'] = os.path.join('work', 'config')
        os.environ['HADRON_DEFAULT_PATH'] = os.path.join('work', 'data')
        # create each directory independently: with a shared try/except an
        # already existing config directory skipped the data directory
        for env_name in ('HADRON_PM_PATH', 'HADRON_DEFAULT_PATH'):
            os.makedirs(os.environ[env_name], exist_ok=True)
        PropertyManager._remove_all()
        tr = Transition.from_env('task1', has_contract=False)
        tr.set_source_uri(
            "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
        )
        tr.set_persist()
        wr = Wrangle.from_env('task2', has_contract=False)
        wr.set_source_uri(tr.get_persist_contract().raw_uri)
        wr.set_persist()
        controller = Controller.from_env(has_contract=False)
        controller.intent_model.transition(canonical=pd.DataFrame(),
                                           task_name='task1',
                                           intent_level='task1_tr')
        controller.intent_model.wrangle(canonical=pd.DataFrame(),
                                        task_name='task2',
                                        intent_level='task2_wr')
Beispiel #6
0
 def test_run_transition_pipeline(self):
     """Build the 'hk_income' Transition from the remote repo and print its connectors.

     Sets ``HADRON_PM_REPO`` to the sample contracts location before the
     component is created. The test makes no assertions.
     """
     repo_uri = "https://raw.githubusercontent.com/project-hadron/hadron-asset-bank/master/bundles/samples/hk_income_sample/contracts/"
     os.environ['HADRON_PM_REPO'] = repo_uri
     options = {
         'default_save': False,
         'default_save_intent': False,
         'has_contract': False,
     }
     tr: Transition = Transition.from_env('hk_income', **options)
     connectors = tr.pm.report_connectors()
     pprint(connectors)
Beispiel #7
0
 def test_set_report_persist(self):
     """After bootstrap, the first reported connector file starts with the project name."""
     tr: Transition = Transition.from_env(
         'tester', default_save=False, has_contract=False)
     tr.setup_bootstrap(
         domain='domain', project_name='project_name', path=None)
     connectors = tr.report_connectors(stylise=False)
     first_uri = connectors.uri.iloc[0]
     # only the file name part of the uri is checked
     file_name = os.path.basename(first_uri)
     self.assertTrue(file_name.startswith('project_name'))
 def setUp(self):
     """Prepare the work directories and scratch-pad helpers for each test.

     Points ``HADRON_PM_PATH`` and ``HADRON_DEFAULT_PATH`` at ``work/config``
     and ``work/data``, makes sure both directories exist, clears the
     ``PropertyManager`` and stores the ``SyntheticBuilder`` and
     ``Transition`` scratch pads on ``self.tools`` and ``self.clean``.
     """
     os.environ['HADRON_PM_PATH'] = os.path.join('work', 'config')
     os.environ['HADRON_DEFAULT_PATH'] = os.path.join('work', 'data')
     # create each directory independently: with a shared try/except an
     # already existing config directory skipped the data directory
     for env_name in ('HADRON_PM_PATH', 'HADRON_DEFAULT_PATH'):
         os.makedirs(os.environ[env_name], exist_ok=True)
     PropertyManager._remove_all()
     self.tools = SyntheticBuilder.scratch_pad()
     self.clean = Transition.scratch_pad()
Beispiel #9
0
 def test_filter_correlate(self):
     """Check the column count after each auto_brute_force_correlated pass.

     The source is the persisted output of the 'builder' task. The expected
     shapes are (1000, 9) as loaded, (1000, 7) after the default pass and
     (1000, 3) after a second pass with ``threshold=0.8``.
     """
     builder = SyntheticBuilder.from_env('builder')
     tr = Transition.from_env("tr1", has_contract=False)
     cleaners: TransitionIntentModel = tr.cleaners
     source_uri = builder.get_persist_contract().raw_uri
     tr.set_source_uri(source_uri)
     tr.set_persist()

     loaded = tr.load_source_canonical()
     self.assertEqual((1000, 9), loaded.shape)

     first_pass = cleaners.auto_brute_force_correlated(loaded)
     self.assertEqual((1000, 7), first_pass.shape)

     second_pass = cleaners.auto_brute_force_correlated(first_pass,
                                                        threshold=0.8)
     self.assertEqual((1000, 3), second_pass.shape)
Beispiel #10
0
 def test_provenance_from_bootstrap(self):
     """The provenance report index holds the bootstrap entries plus those set explicitly."""
     expected_index = [
         'title', 'domain', 'description', 'license_type', 'license_name',
         'license_uri', 'cost_price', 'provider_name', 'author_name'
     ]
     tr: Transition = Transition.from_memory()
     tr.setup_bootstrap(domain='heathcare', project_name='datalake_gen')
     tr.set_provenance(
         provider_name="Project Hadron",
         author_name='doatridge',
         cost_price="$0.00",
     )
     provenance = tr.report_provenance(stylise=False)
     reported_index = provenance.index.to_list()
     self.assertCountEqual(expected_index, reported_index)
Beispiel #11
0
 def test_provenance_report(self):
     """The provenance report has six single-column rows with the expected index labels."""
     options = {
         'default_save': False,
         'default_save_intent': False,
         'has_contract': False,
     }
     tr: Transition = Transition.from_env('test', **options)
     tr.set_provenance(
         title='new_title', domain='Healthcare', author_name='Joe Bloggs')
     result = tr.report_provenance(stylise=False)
     self.assertEqual((6, 1), result.shape)
     expected_index = [
         'title', 'domain', 'license_type', 'license_name', 'license_uri',
         'author_name'
     ]
     self.assertCountEqual(expected_index, list(result.index))
Beispiel #12
0
    def test_from_env(self):
        """Check the property-manager contract follows the HADRON_PM_* settings.

        For each module/handler pair, a ``Transition`` is created from the
        environment. Its property-manager connector contract must report the
        pickle file under ``HADRON_PM_PATH`` together with the configured
        module name and handler.

        The HADRON_PM_* variables are removed from ``os.environ`` when the
        test ends, even if an assertion fails. ``os.unsetenv`` does not
        update ``os.environ``, so it would leave the values visible to later
        tests.
        """
        # PWD is set by the shell and may be absent; fall back to the cwd
        working_dir = os.environ.get('PWD') or os.getcwd()
        handler_pairs = (
            ('aistac.handlers.python_handlers', 'PythonPersistHandler'),
            ('ds_discovery.handlers.pandas_handlers', 'PandasPersistHandler'),
        )
        try:
            os.environ['HADRON_PM_PATH'] = Path(working_dir, 'work').as_posix()
            os.environ['HADRON_PM_TYPE'] = 'pickle'
            for module_name, handler in handler_pairs:
                os.environ['HADRON_PM_MODULE'] = module_name
                os.environ['HADRON_PM_HANDLER'] = handler
                tr = Transition.from_env('task', has_contract=False)
                contract = tr.pm.get_connector_contract(
                    tr.pm.CONNECTOR_PM_CONTRACT)
                self.assertEqual(
                    os.environ['HADRON_PM_PATH'] +
                    "/hadron_pm_transition_task.pickle", contract.uri)
                self.assertEqual(os.environ['HADRON_PM_MODULE'],
                                 contract.module_name)
                self.assertEqual(os.environ['HADRON_PM_HANDLER'],
                                 contract.handler)
        finally:
            # deleting from os.environ also unsets the process variable
            for name in ('HADRON_PM_PATH', 'HADRON_PM_TYPE',
                         'HADRON_PM_MODULE', 'HADRON_PM_HANDLER'):
                os.environ.pop(name, None)
 def test_filter_univariate_mse(self):
     """The top five filter_univariate_mse features for 'SalePrice' match the expected list, in order."""
     options = {
         'default_save': False,
         'default_save_intent': False,
         'has_contract': False,
     }
     tr = Transition.from_env('test', **options)
     tr.set_source('ames_housing.csv', nrows=5000)
     housing = tr.load_source_canonical()
     expected = [
         'OverallQual', 'GarageCars', 'FullBath', 'TotRmsAbvGrd',
         'YearBuilt'
     ]
     top_features = Discover.filter_univariate_mse(
         housing, target='SalePrice', as_series=False, top=5)
     self.assertEqual(expected, top_features)
Beispiel #14
0
 def test_transition_summary_report(self):
     """The quality summary, returned as a dict, exposes the five expected keys in order."""
     options = {
         'default_save': False,
         'default_save_intent': False,
         'has_contract': False,
     }
     tr: Transition = Transition.from_env('test', **options)
     # the sample file is located under the HOME directory
     sample_uri = os.path.join(os.environ['HOME'], 'code', 'projects',
                               'data', 'sample', 'synthetic_customer.csv')
     cc = ConnectorContract(uri=sample_uri,
                            module_name=tr.DEFAULT_MODULE,
                            handler=tr.DEFAULT_SOURCE_HANDLER)
     tr.set_source_contract(connector_contract=cc)
     report = tr.report_quality_summary(as_dict=True)
     expected_keys = ['score', 'data_shape', 'data_type', 'usability', 'cost']
     self.assertEqual(expected_keys, list(report.keys()))
 def test_analyse_associate_levels_nums(self):
     """Print the intent of the 'age' and 'age.0.glucose' analysis branches.

     Runs an association analysis of ``age`` -> ``glucose`` on the remote
     diabetes dataset. The test makes no assertions.
     """
     source_uri = 'https://assets.datacamp.com/production/repositories/628/datasets/444cdbf175d5fbf564b564bd36ac21740627a834/diabetes.csv'
     tr = Transition.from_memory()
     tr.set_source_uri(source_uri)
     # alternative, wider column set kept for reference:
     # columns_list = [{'diabetes': {'dtype': 'category'}},
     #                 {'age': {'dtype': 'int', 'granularity': 10.0, 'lower': 21, 'upper': 90, }},
     #                 {'pregnancies': {}, 'glucose': {}, 'diastolic': {}, 'triceps': {}, 'insulin': {}, 'bmi': {},
     #                  'dpf': {}}]
     age_level = {'age': {'dtype': 'int', 'granularity': 1}}
     glucose_level = {'glucose': {}}
     levels = [age_level, glucose_level]
     source_df = tr.load_source_canonical()
     discover: DataDiscovery = tr.discover
     blob = discover.analyse_association(source_df, columns_list=levels)
     age = DataAnalytics.from_branch(analytics_blob=blob, branch="age")
     glucose = DataAnalytics.from_branch(analytics_blob=blob,
                                         branch="age.0.glucose")
     for branch in (age, glucose):
         pprint(branch.intent.to_dict())
Beispiel #16
0
 def test_dictionary_report(self):
     """The attribute report for a four-column frame has shape (4, 2) after notes are uploaded."""
     frame = pd.DataFrame({column: [1, 2, 3] for column in ('A', 'B', 'C', 'D')})

     # notes cover A, B and D, plus 'F' which is not a column of the frame
     notes = pd.DataFrame()
     notes['label'] = ['A', 'B', 'D', 'F']
     notes['text'] = [
         'This is the Alpha',
         'Beta follows it closely',
         'D is the last',
         'F is out of place',
     ]

     cp = Transition.from_env('task', has_contract=False)
     cp.upload_attributes(canonical=notes, label_key='label', text_key='text')
     report = cp.report_attributes(frame, stylise=False)
     self.assertEqual((4, 2), report.shape)
Beispiel #17
0
 def test_controller_check_changed(self):
     """The run-cycle report text matches the expected sequence for two repeats.

     'task1' reads and persists 'sample.csv'. The controller runs with
     ``repeat=2`` and uses the persisted file as its source check, and the
     text written to 'report.csv' is compared line by line.
     """
     expected_text = [
         'start run-cycle 0', 'start run count 0', 'running task1_tr',
         'canonical shape is (4, 1)', 'running task2_wr',
         'canonical shape is (4, 1)', 'tasks complete', 'start run count 1',
         'Source has not changed', 'end of report'
     ]
     tr = Transition.from_env('task1')
     tr.set_source(uri_file='sample.csv')
     tr.set_persist(uri_file='sample.csv')
     tr.save_persist_canonical(pd.DataFrame({'A': [1, 2, 3, 4]}))

     controller = Controller.from_env()
     check_uri = tr.get_persist_contract().raw_uri
     controller.run_controller(repeat=2,
                               source_check_uri=check_uri,
                               run_cycle_report='report.csv')
     cycle_report = controller.load_canonical(
         connector_name='run_cycle_report')
     self.assertEqual(expected_text, cycle_report['text'].to_list())
Beispiel #18
0
 def test_associate_analysis_dominance(self):
     """The share of the most frequent value is preserved in the modelled data.

     Analyses a small integer sample with ``exclude_dominant=True``, models
     1000 rows from it with ``apply_bias=True`` and compares the top
     value-count ratio of the modelled and sample columns to two places.
     """
     sample = pd.DataFrame()
     sample['values'] = [0, 1, 0, 0, 7, 0, 0, 4, 2, 0, 0, 5, 8, 7, 0, 0]
     discover: DataDiscovery = Transition.from_memory().discover
     values_level = discover.analysis2dict(header='values',
                                           dtype='int',
                                           precision=0,
                                           exclude_dominant=True)
     blob = discover.analyse_association(sample, columns_list=[values_level])

     builder = SyntheticBuilder.from_memory()
     empty_canonical = builder.tools.canonical2dict(method='@empty', size=1000)
     modelled = builder.tools.model_analysis(empty_canonical,
                                             analytics_model=blob,
                                             apply_bias=True)

     modelled_ratio = modelled['values'].value_counts().iloc[0] / modelled.shape[0]
     sample_ratio = sample['values'].value_counts().iloc[0] / sample.shape[0]
     self.assertAlmostEqual(modelled_ratio, sample_ratio, places=2)
 def test_filter_univariate_roc_auc(self):
     """filter_univariate_roc_auc selects the expected columns with the default and a custom classifier."""
     options = {
         'default_save': False,
         'default_save_intent': False,
         'has_contract': False,
     }
     tr = Transition.from_env('test', **options)
     tr.set_source('paribas.csv', nrows=5000)
     paribas = tr.load_source_canonical()

     # default classifier
     selected = Discover.filter_univariate_roc_auc(
         paribas, target='target', threshold=0.55)
     self.assertCountEqual(['v10', 'v129', 'v14', 'v62', 'v50'], selected)

     # custom classifier
     catboost_params = {'iterations': 2, 'learning_rate': 1, 'depth': 2}
     selected = Discover.filter_univariate_roc_auc(
         paribas,
         target='target',
         threshold=0.55,
         package='catboost',
         model='CatBoostClassifier',
         classifier_kwargs=catboost_params,
         fit_kwargs={'verbose': False},
     )
     expected_custom = [
         'v50', 'v10', 'v14', 'v12', 'v129', 'v62', 'v21', 'v34'
     ]
     self.assertCountEqual(expected_custom, selected)
Beispiel #20
0
 def test_repo_load(self):
     """Creating the 'hk_income' Transition with HADRON_PM_REPO set must not raise."""
     repo_uri = "https://raw.githubusercontent.com/project-hadron/hadron-asset-bank/master/bundles/samples/hk_income_sample/contracts/"
     os.environ['HADRON_PM_REPO'] = repo_uri
     tr: Transition = Transition.from_env(
         'hk_income',
         has_contract=False,
     )
Beispiel #21
0
 def test_runs(self):
     """Smoke test: creating a Transition from the environment must not raise."""
     agent_name = 'TestAgent'
     Transition.from_env(agent_name, has_contract=False)