def test_from_component(self):
    """Create the 'members' builder from environment defaults pointing at an event book."""
    # EventBook
    # NOTE(review): the source handler is set to 'EventPersistHandler' and the persist
    # handler to 'EventSourceHandler' - looks swapped, confirm this is intentional.
    event_book_defaults = {
        'HADRON_DEFAULT_PATH': 'eb://grey_storage/',
        'HADRON_DEFAULT_MODULE': 'ds_engines.handlers.event_handlers',
        'HADRON_DEFAULT_SOURCE_HANDLER': 'EventPersistHandler',
        'HADRON_DEFAULT_PERSIST_HANDLER': 'EventSourceHandler',
    }
    for env_name, env_value in event_book_defaults.items():
        os.environ[env_name] = env_value
    # Portfolio
    members = SyntheticBuilder.from_env('members', has_contract=False)
    members.set_outcome(uri_file="synthetic_members")
    # second call uses from_env's default contract handling
    members = SyntheticBuilder.from_env('members')
def test_associate_analysis_complex(self):
    """Model synthetic data from an association analysis of the diabetes dataset.

    Checks the shape of the modelled frame and that the mean of the
    standardised synthetic 'pregnancies' values falls inside the bootstrap
    confidence interval calculated from the source values.
    """
    builder = SyntheticBuilder.from_memory()
    clinical_uri = 'https://assets.datacamp.com/production/repositories/628/datasets/444cdbf175d5fbf564b564bd36ac21740627a834/diabetes.csv'
    builder.add_connector_uri('clinical_health', uri=clinical_uri)
    discover: DataDiscovery = Transition.from_memory().discover
    age_analysis = discover.analysis2dict(header='age', dtype='int', granularity=10.0, lower=21, upper=90)
    pregnancies_analysis = discover.analysis2dict(header='pregnancies')
    df_source = builder.load_canonical('clinical_health')
    association = discover.analyse_association(df_source, columns_list=[age_analysis, pregnancies_analysis])
    empty_frame = builder.tools.canonical2dict(method='@empty', size=1973)
    df_synthetic = builder.tools.model_analysis(empty_frame, analytics_model=association, column_name='clinical')
    self.assertEqual((1973, 2), df_synthetic.shape)

    # confidence interval of the mean taken from the source data
    source_values = SyntheticCommons.list_formatter(df_source.pregnancies)
    source_values = SyntheticCommons.list_standardize(source_values)
    low, high = discover.bootstrap_confidence_interval(pd.Series(source_values), func=np.mean)

    # the synthetic mean is expected to sit inside that interval
    synthetic_values = SyntheticCommons.list_formatter(df_synthetic.pregnancies)
    synthetic_values = SyntheticCommons.list_standardize(synthetic_values)
    synthetic_mean = np.mean(synthetic_values)
    self.assertTrue(low <= synthetic_mean <= high)
def test_runs(self):
    """Basic smoke test: the builder exposes a SyntheticIntentModel as its intent model."""
    im = SyntheticBuilder.from_env('tester', default_save=False, default_save_intent=False,
                                   reset_templates=False).intent_model
    # assertTrue(cls, type(im)) only checked that the class object is truthy (the
    # second argument is the failure message), so it could never fail.
    self.assertIsInstance(im, SyntheticIntentModel)
def test_dict_generate(self):
    """Resolve '@generate' canonical dictionaries for the 'generator' task.

    Covers the plain request, a request with an explicit size, and a request
    with a selection keeping only gender 'M'.
    """
    builder = SyntheticBuilder.from_env('generator', has_contract=False)
    tools: SyntheticIntentModel = builder.tools
    seed_frame = pd.DataFrame()
    seed_frame['gender'] = tools.get_category(selection=['M', 'F'], column_name='gender')
    seed_frame['age'] = tools.get_number(from_value=18, to_value=90, column_name='age')

    # no size given
    request = {'method': '@generate', 'task_name': 'generator'}
    generated = tools._get_canonical(data=request)
    self.assertCountEqual(['age', 'gender'], generated.columns.to_list())

    # explicit size
    request = {'method': '@generate', 'task_name': 'generator', 'size': 100}
    generated = tools._get_canonical(data=request)
    self.assertCountEqual(['age', 'gender'], generated.columns.to_list())
    self.assertEqual(100, generated.shape[0])

    # size plus a selection on gender
    only_male = [tools.select2dict(column='gender', condition="@=='M'")]
    request = {
        'method': '@generate',
        'task_name': 'generator',
        'size': 100,
        'selection': only_male,
    }
    generated = tools._get_canonical(data=request)
    self.assertGreater(generated.shape[0], 0)
    females = generated[generated['gender'] == 'F']
    self.assertEqual(0, females.shape[0])
def test_model_us_zip(self):
    """model_us_zip on a 300 row frame, filtered by state codes 'NY', 'TX' and 'FRED'."""
    builder = SyntheticBuilder.from_env('test', default_save=False, default_save_intent=False,
                                        has_contract=False)
    frame = pd.DataFrame(index=range(300))
    zipped = builder.tools.model_us_zip(frame, state_code_filter=['NY', 'TX', 'FRED'])
    # only 'NY' and 'TX' are expected back from the three requested codes
    state_codes = zipped['StateCode'].value_counts().index.to_list()
    self.assertCountEqual(['NY', 'TX'], state_codes)
    expected_columns = ['StateAbbrev', 'Zipcode', 'City', 'State', 'StateCode', 'Phone']
    self.assertCountEqual(expected_columns, zipped.columns.to_list())
    self.assertEqual(300, zipped.shape[0])
def test_remove_unwanted_headers(self):
    """frame_selection on the titanic source with a selection and a header subset."""
    builder = SyntheticBuilder.from_env('test', default_save=False, default_save_intent=False,
                                        has_contract=False)
    builder.set_source_uri(uri="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv")
    survivors = [builder.tools.select2dict(column='survived', condition='==1')]
    wanted = ['survived', 'sex', 'fare']
    selected = builder.tools.frame_selection(canonical=builder.CONNECTOR_SOURCE, selection=survivors,
                                             headers=['survived', 'sex', 'fare'])
    self.assertCountEqual(wanted, list(selected.columns))
    # every remaining row satisfies the 'survived==1' condition
    self.assertEqual(1, selected['survived'].min())
def test_run_pipeline_with_analytics(self):
    """Register and load the clinical health sample dataset (no assertions are made)."""
    builder: SyntheticBuilder = SyntheticBuilder.from_env('sample', has_contract=False)
    tools: SyntheticIntentModel = builder.tools
    # load the sample dataset to analyse and rename columns
    clinical_uri = 'https://assets.datacamp.com/production/repositories/628/datasets/444cdbf175d5fbf564b564bd36ac21740627a834/diabetes.csv'
    builder.add_connector_uri('clinical_health', uri=clinical_uri)
    df_clinical = builder.load_canonical('clinical_health')
def test_model_sample_map(self):
    """model_sample_map with 'us_healthcare_practitioner' on empty, sized and header-limited frames."""
    builder = SyntheticBuilder.from_memory(default_save_intent=False)
    sample_name = 'us_healthcare_practitioner'

    # an empty frame is expected to come back with shape (70655, 10)
    mapped = builder.tools.model_sample_map(pd.DataFrame(), sample_map=sample_name)
    self.assertEqual((70655, 10), mapped.shape)

    # a 50 row frame; this result is not asserted on
    mapped = builder.tools.model_sample_map(pd.DataFrame(index=range(50)), sample_map=sample_name)

    # a 50 row frame limited to a single header
    mapped = builder.tools.model_sample_map(pd.DataFrame(index=range(50)), sample_map=sample_name,
                                            headers=['pcp_tax_id'])
    self.assertEqual((50, 1), mapped.shape)
def test_model_columns_headers(self):
    """model_concat of selected titanic headers, as columns, onto a 300 row frame."""
    builder = SyntheticBuilder.from_env('test', default_save=False, default_save_intent=False,
                                        has_contract=False)
    tools: SyntheticIntentModel = builder.tools
    builder.set_source_uri(uri="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv")
    base = pd.DataFrame(index=range(300))
    combined = tools.model_concat(base, other=builder.CONNECTOR_SOURCE, as_rows=False,
                                  headers=['survived', 'sex', 'fare'])
    self.assertCountEqual(['survived', 'sex', 'fare'], list(combined.columns))
    self.assertEqual(300, combined.shape[0])
def test_canonical_run_pipeline_dict(self):
    """Exercise get_number and correlate_numbers alongside a second 'sub_set' builder.

    No assertions are made; the test only checks the calls complete.
    """
    tools = self.builder.intent_model
    frame = pd.DataFrame()
    frame['numbers'] = tools.get_number(1, 2, column_name='numbers')
    # create a remote pm contract
    remote = SyntheticBuilder.from_env('sub_set', has_contract=False)
    _ = remote.tools.get_category(selection=['A', 'B'], column_name='value')
    # the returned parameter dictionary is not used further
    sub_set_params = SyntheticCommons.param2dict()
    frame['corr_num'] = tools.correlate_numbers(frame, offset=1, header='numbers', column_name='numbers',
                                                intent_order=1)
def test_str(self):
    """_get_canonical given a connector name returns the frame saved under that connector."""
    builder = SyntheticBuilder.from_memory()
    tools = builder.tools
    expected = pd.DataFrame(data={'A': list('12345')})
    builder.add_connector_persist(connector_name='test', uri_file='test.pickle')
    builder.save_canonical(connector_name='test', canonical=expected)
    loaded = tools._get_canonical(data='test')
    self.assertDictEqual(expected.to_dict(), loaded.to_dict())
def test_model_us_person(self):
    """model_sample_map with the 'us_persona' sample, with and without a female bias."""
    builder = SyntheticBuilder.from_memory(default_save_intent=False)

    # default parameters on a 300 row frame
    frame = pd.DataFrame(index=range(300))
    persona = builder.tools.model_sample_map(canonical=frame, sample_map='us_persona')
    expected_columns = ['first_name', 'middle_name', 'gender', 'family_name', 'email']
    self.assertCountEqual(expected_columns, persona.columns.to_list())
    self.assertEqual(300, persona.shape[0])

    # female_bias=0.3 on a 1000 row frame
    frame = pd.DataFrame(index=range(1000))
    frame = builder.tools.model_sample_map(canonical=frame, sample_map='us_persona', female_bias=0.3)
    self.assertEqual((1000, 5), frame.shape)
    # the female count is only printed, not asserted
    female_count = frame['gender'].value_counts().loc['F']
    print(female_count)
def test_list(self):
    """_get_canonical accepts a list or a Series and places it under the given header."""
    builder = SyntheticBuilder.from_memory()
    tools = builder.tools
    values = list('12345')

    # without a header the column is named 'default'
    frame = tools._get_canonical(data=values)
    self.assertEqual(values, frame['default'].to_list())

    # list with an explicit header
    frame = tools._get_canonical(data=values, header='sample')
    self.assertEqual(values, frame['sample'].to_list())

    # pandas Series with an explicit header
    series = pd.Series(values)
    frame = tools._get_canonical(data=series, header='sample')
    self.assertEqual(series.to_list(), frame['sample'].to_list())
def test_dict_method(self):
    """_get_canonical resolves a canonical dictionary naming the 'model_sample_map' method."""
    builder = SyntheticBuilder.from_env('generator', has_contract=False)
    tools: SyntheticIntentModel = builder.tools
    empty_100 = tools.action2dict(method='@empty', size=100)
    action = tools.canonical2dict(method='model_sample_map', canonical=empty_100, sample_map='us_persona',
                                  female_bias=0.3)
    persona = tools._get_canonical(data=action)
    self.assertEqual((100, 5), persona.shape)
    # with female_bias=0.3 over 100 rows, exactly 30 'F' entries are expected
    female_count = persona['gender'].value_counts().loc['F']
    self.assertEqual(30, female_count)
def test_complex_sample_modelling(self):
    """Merge grouped practitioner tax ids onto a zipcode sample, joined on 'city'."""
    tools = SyntheticBuilder.from_memory().tools
    state_code = ['CA', 'NY', 'LA', 'NJ', 'VA', 'CO', 'NV', 'GA', 'IN', 'OH', 'KY', 'ME', 'MO', 'WI']
    zip_frame = tools.model_sample_map(canonical={'method': '@empty', 'size':100}, sample_map='us_zipcode',
                                       state_filter=state_code, column_name='zipcodes')
    # action: practitioner sample restricted to 'city' and 'pcp_tax_id'
    practitioner_action = tools.action2dict(method='model_sample_map',
                                            canonical=tools.action2dict(method='@empty'),
                                            sample_map='us_healthcare_practitioner',
                                            headers=['city', 'pcp_tax_id'], shuffle=False)
    # action: group the practitioner sample by city with the 'list' aggregator
    grouped_action = tools.action2dict(method='model_group', canonical=practitioner_action,
                                       headers='pcp_tax_id', group_by='city', aggregator='list')
    merged = tools.model_merge(zip_frame, grouped_action, how='left', left_on='city', right_on='city',
                               column_name='pcp_tax_id')
    expected_columns = ['city', 'state_abbr', 'state', 'county_fips', 'county', 'zipcode', 'pcp_tax_id']
    self.assertCountEqual(expected_columns, merged.columns.to_list())
def setUp(self):
    """Prepare the working directories and a bootstrapped 'sample' builder for each test.

    Sets HADRON_PM_PATH and HADRON_DEFAULT_PATH to directories under 'work',
    makes sure both exist, clears the PropertyManager and stores the builder
    and its tools on the test instance.
    """
    os.environ['HADRON_PM_PATH'] = os.path.join('work', 'config')
    os.environ['HADRON_DEFAULT_PATH'] = os.path.join('work', 'data')
    # Create each directory independently: previously both calls sat in one try block
    # with a bare except, so an already existing config directory skipped creation of
    # the data directory and any other error was silently swallowed.
    for env_name in ('HADRON_PM_PATH', 'HADRON_DEFAULT_PATH'):
        os.makedirs(os.environ[env_name], exist_ok=True)
    PropertyManager._remove_all()
    self.builder: SyntheticBuilder = SyntheticBuilder.from_env('sample', has_contract=False)
    self.builder.setup_bootstrap()
    self.tools: SyntheticIntentModel = self.builder.tools
def test_model_group(self):
    """model_group on the titanic dataset with 'sum' and 'set' aggregators and weighting."""
    builder = SyntheticBuilder.from_memory()
    tools: SyntheticIntentModel = builder.tools
    builder.add_connector_uri('titanic', uri="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv")

    # single header summed over two group-by columns
    grouped = tools.model_group('titanic', headers='fare', group_by=['survived', 'sex'], aggregator='sum')
    self.assertEqual((4, 3), grouped.shape)

    # two headers aggregated as a set, with list_choice=2
    grouped = tools.model_group('titanic', headers=['class', 'embark_town'], group_by=['survived', 'sex'],
                                aggregator='set', list_choice=2)
    self.assertEqual((4, 4), grouped.shape)
    self.assertCountEqual(['class', 'embark_town', 'survived', 'sex'], grouped.columns.to_list())

    # summed by sex, with the extra 'weighting' column requested
    grouped = tools.model_group('titanic', headers=['fare', 'survived'], group_by='sex', aggregator='sum',
                                include_weighting=True)
    self.assertEqual((2, 4), grouped.shape)
    self.assertCountEqual(['survived', 'sex', 'fare', 'weighting'], grouped.columns.to_list())
def test_run_synthetic_pipeline_seed(self):
    """Two pipeline runs with the same seed give the same gender counts and age mean."""
    builder = SyntheticBuilder.from_env('tester', has_contract=False)
    builder.set_persist()
    tools: SyntheticIntentModel = builder.tools
    _ = tools.get_category(selection=['M', 'F'], relative_freq=[4, 3], column_name='gender')
    _ = tools.get_number(from_value=18, to_value=80, column_name='age')

    # first run: capture the reference statistics
    builder.run_synthetic_pipeline(size=1000, seed=23)
    first_run = builder.load_synthetic_canonical()
    gender_counts = first_run['gender'].value_counts().values
    age_mean = first_run['age'].mean()

    # second run with the identical seed
    builder.run_synthetic_pipeline(size=1000, seed=23)
    second_run = builder.load_synthetic_canonical()
    self.assertCountEqual(gender_counts, second_run['gender'].value_counts().values)
    self.assertEqual(age_mean, second_run['age'].mean())
def test_dict_empty(self):
    """_get_canonical resolves '@empty' canonical dictionaries to frames of the expected shape."""
    builder = SyntheticBuilder.from_env('generator', has_contract=False)
    tools: SyntheticIntentModel = builder.tools
    # (extra canonical2dict keyword arguments, expected frame shape)
    cases = [
        ({}, (0, 0)),
        ({'size': 100}, (100, 0)),
        ({'size': 100}, (100, 0)),
        ({'size': 100, 'headers': ['A', 'B', 'C']}, (100, 3)),
    ]
    for extra_kwargs, expected_shape in cases:
        action = tools.canonical2dict(method='@empty', **extra_kwargs)
        frame = tools._get_canonical(data=action)
        self.assertEqual(expected_shape, frame.shape)
def test_dict_generate_remote(self):
    """model_concat with an '@generate' canonical whose contract repo is a remote URI.

    The resulting columns are printed; no assertions are made.
    """
    builder = SyntheticBuilder.from_memory()
    tools: SyntheticIntentModel = builder.tools
    base = tools.canonical2dict(method='@empty', size=1000)
    remote_members = tools.canonical2dict(
        method='@generate',
        task_name='members',
        uri_pm_repo='https://raw.githubusercontent.com/project-hadron/hadron-asset-bank/master/contracts/healthcare/factory/members/',
    )
    wanted_headers = ['member_id', 'state', 'prev_flu_shot', 'age', 'channel_pref']
    combined = builder.intent_model.model_concat(canonical=base, other=remote_members, as_rows=False,
                                                 headers=wanted_headers, column_name='member_reference')
    print(combined.columns)
def test_associate_analysis_dominance(self):
    """The share of the most frequent value is preserved by model_analysis with apply_bias.

    The analysis is built with exclude_dominant=True; the proportion of the top
    value in the modelled data must match the sample's to two decimal places.
    """
    sample = pd.DataFrame()
    sample['values'] = [0, 1, 0, 0, 7, 0, 0, 4, 2, 0, 0, 5, 8, 7, 0, 0]
    discover: DataDiscovery = Transition.from_memory().discover
    values_analysis = discover.analysis2dict(header='values', dtype='int', precision=0, exclude_dominant=True)
    association = discover.analyse_association(sample, columns_list=[values_analysis])
    builder = SyntheticBuilder.from_memory()
    empty_frame = builder.tools.canonical2dict(method='@empty', size=1000)
    modelled = builder.tools.model_analysis(empty_frame, analytics_model=association, apply_bias=True)
    modelled_share = modelled['values'].value_counts().iloc[0] / modelled.shape[0]
    sample_share = sample['values'].value_counts().iloc[0] / sample.shape[0]
    self.assertAlmostEqual(modelled_share, sample_share, places=2)
def recommend_heuristic(profile: pd.Series, items: pd.DataFrame, recommend: int = None, top: int = None,
                        exclude_items: list = None) -> list:
    """ takes a profile of an entity where the index of the profile represents the columns in the items.
    For example, the profile will be an index list of film genres and how many times these categories
    have been watched. The items will be columns of categories, with the index being the films and the
    row values being the count of film watches in the column categories.

    :param profile: a pandas series of category (index) counters for a single profile
    :param items: a pandas dataframe of item (index) counts per category (columns)
    :param recommend: (optional) the number of recommended items to select. Default is 10
    :param top: (optional) limits the cut-off of the top categories to select from.
                Default is 10, also used when the value is less than 1
    :param exclude_items: (optional) item index values not to include. Unknown values are ignored
    :return: a list of recommendations, empty if the profile is None or has no entries
    """
    # nothing to base a recommendation on, so leave before touching the items
    if profile is None or profile.size == 0:
        return []
    recommend = 10 if recommend is None else recommend
    top = 10 if top is None or top < 1 else top
    # drop the excluded entities; DataFrame.drop raises a ValueError when given index=None,
    # so the default of no exclusions must bypass the drop
    _df = items if exclude_items is None else items.drop(index=exclude_items, errors='ignore')
    categories = profile.sort_values(ascending=False).iloc[:top]
    # choose a category for each recommendation, weighted by the profile counters
    choices = SyntheticBuilder.scratch_pad().get_category(selection=categories.index.to_list(),
                                                          weight_pattern=categories.values.tolist(),
                                                          size=recommend)
    choices_count = pd.Series(choices).value_counts()
    # for each chosen category take as many of its highest-count items as it was chosen
    selection_dict = {
        category: _df[category].sort_values(ascending=False).iloc[:choices_count.loc[category]].index.to_list()
        for category in choices_count.index
    }
    rtn_list = []
    for item in choices:
        candidates = selection_dict[item]
        # a category can run out when items has fewer rows than the times it was chosen
        if candidates:
            rtn_list.append(candidates.pop())
    return rtn_list
def test_model_iterator(self):
    """model_iterator on the titanic dataset: plain, marker, selection, iteration and actions."""
    builder = SyntheticBuilder.from_env('test', default_save=False, default_save_intent=False,
                                        has_contract=False)
    tools: SyntheticIntentModel = builder.tools
    builder.add_connector_uri('titanic', uri="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv")

    # do nothing
    iterated = tools.model_iterator(canonical='titanic')
    self.assertEqual(builder.load_canonical('titanic').shape, iterated.shape)

    # add marker
    iterated = tools.model_iterator(canonical='titanic', marker_col='marker')
    self.assertEqual(builder.load_canonical('titanic').shape[1]+1, iterated.shape[1])

    # with selection
    survivors = [tools.select2dict(column='survived', condition="==1")]
    control = tools.frame_selection(canonical='titanic', selection=survivors)
    iterated = tools.model_iterator(canonical='titanic', marker_col='marker', selection=survivors)
    self.assertEqual(control.shape[0], iterated.shape[0])

    # with iteration
    iterated = tools.model_iterator(canonical='titanic', marker_col='marker', iter_stop=3)
    markers = iterated['marker'].value_counts().index.to_list()
    self.assertCountEqual([0,1,2], markers)

    # with actions
    actions = {2: (tools.action2dict(method='get_category', selection=[4,5]))}
    iterated = tools.model_iterator(canonical='titanic', marker_col='marker', iter_stop=3,
                                    iteration_actions=actions)
    markers = iterated['marker'].value_counts().index.to_list()
    self.assertCountEqual([0,1,4,5], markers)
def test_dataframe(self):
    """_get_canonical given a DataFrame returns a frame with the same content."""
    tools = SyntheticBuilder.from_memory().tools
    original = pd.DataFrame(data={'A': list('12345')})
    returned = tools._get_canonical(data=original)
    self.assertDictEqual(original.to_dict(), returned.to_dict())
def builder(self) -> SyntheticBuilder:
    """Return a SyntheticBuilder created with from_env('tester')."""
    instance = SyntheticBuilder.from_env('tester')
    return instance
def builder(self) -> SyntheticBuilder:
    """Return a SyntheticBuilder created with from_env('tester', has_contract=False)."""
    instance = SyntheticBuilder.from_env('tester', has_contract=False)
    return instance
def test_set_report_persist(self):
    """After setup_bootstrap, the last connector's file name starts with the project name."""
    builder = SyntheticBuilder.from_env('tester', default_save=False, has_contract=False)
    builder.setup_bootstrap(domain='domain', project_name='project_name', path=None)
    connectors = builder.report_connectors(stylise=False)
    last_uri = connectors.uri.iloc[-1]
    # only the file part of the uri is of interest
    file_name = os.path.split(last_uri)[1]
    self.assertTrue(file_name.startswith('project_name'))
def test_runs(self):
    """Basic smoke test: from_env without a contract returns exactly a SyntheticBuilder."""
    instance = SyntheticBuilder.from_env('tester', has_contract=False)
    self.assertEqual(SyntheticBuilder, type(instance))
def tools(self) -> SyntheticIntentModel:
    """Return the object given by SyntheticBuilder.scratch_pad()."""
    scratch_pad = SyntheticBuilder.scratch_pad()
    return scratch_pad
def test_runs(self):
    """Basic smoke test: from_env('tester') returns exactly a SyntheticBuilder."""
    instance = SyntheticBuilder.from_env('tester')
    self.assertEqual(SyntheticBuilder, type(instance))