def populate_achilles(hpo_bucket, hpo_id=FAKE_HPO_ID, include_heel=True):
    from validation import achilles, achilles_heel
    import app_identity

    app_id = app_identity.get_application_id()

    test_file_name = achilles.ACHILLES_ANALYSIS + '.csv'
    achilles_analysis_file_path = os.path.join(TEST_DATA_EXPORT_PATH,
                                               test_file_name)
    schema_name = achilles.ACHILLES_ANALYSIS
    write_cloud_file(hpo_bucket, achilles_analysis_file_path)
    gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
    dataset_id = bq_utils.get_dataset_id()
    table_id = bq_utils.get_table_id(hpo_id, achilles.ACHILLES_ANALYSIS)
    bq_utils.load_csv(schema_name, gcs_path, app_id, dataset_id, table_id)

    table_names = [achilles.ACHILLES_RESULTS, achilles.ACHILLES_RESULTS_DIST]
    if include_heel:
        table_names.append(achilles_heel.ACHILLES_HEEL_RESULTS)

    running_jobs = []
    for table_name in table_names:
        test_file_name = table_name + '.csv'
        test_file_path = os.path.join(TEST_DATA_EXPORT_SYNPUF_PATH,
                                      table_name + '.csv')
        write_cloud_file(hpo_bucket, test_file_path)
        gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
        dataset_id = bq_utils.get_dataset_id()
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        load_results = bq_utils.load_csv(table_name, gcs_path, app_id,
                                         dataset_id, table_id)
        running_jobs.append(load_results['jobReference']['jobId'])
    bq_utils.wait_on_jobs(running_jobs)

def test_merge_EHR(self, mock_check_cron):
    self._load_datasets()
    # enable exception propagation as described at https://goo.gl/LqDgnj
    old_dataset_items = bq_utils.list_dataset_contents(
        bq_utils.get_dataset_id())
    expected_items = ['visit_id_mapping_table']
    expected_items.extend(
        ['unioned_ehr_' + table_name for table_name in common.CDM_TABLES])

    ehr_merge.merge(bq_utils.get_dataset_id(), self.project_id)

    # check the unioned tables were created in the dataset
    dataset_items = bq_utils.list_dataset_contents(bq_utils.get_dataset_id())
    for table_name in common.CDM_TABLES:
        cmd = 'SELECT COUNT(1) FROM unioned_ehr_{}'.format(table_name)
        result = bq_utils.query(cmd)
        self.assertEqual(
            int(result['rows'][0]['f'][0]['v']),
            2 * globals().get(table_name.upper() + '_COUNT', 0),
            msg='failed for table unioned_ehr_{}'.format(table_name))
    self.assertSetEqual(set(old_dataset_items + expected_items),
                        set(dataset_items))

    table_name = 'condition_occurrence'
    cmd_union = 'SELECT * FROM unioned_ehr_{}'.format(table_name)
    cmd_pitt = 'SELECT * FROM pitt_{}'.format(table_name)
    cmd_visit_mapping = ("SELECT global_visit_id, mapping_visit_id "
                         "FROM visit_id_mapping_table "
                         "WHERE hpo='pitt'")
    qr_union = bq_utils.query(cmd_union)
    qr_pitt = bq_utils.query(cmd_pitt)
    qr_visit_mapping = bq_utils.query(cmd_visit_mapping)
    union_result = query_result_to_payload(qr_union)
    pitt_result = query_result_to_payload(qr_pitt)
    visit_mapping_result = query_result_to_payload(qr_visit_mapping)

    def get_element_from_list_of_lists(index, list_of_lists):
        return [list_item[index] for list_item in list_of_lists]

    for ind, pitt_visit_id in enumerate(pitt_result['VISIT_OCCURRENCE_ID']):
        if pitt_visit_id not in visit_mapping_result['MAPPING_VISIT_ID']:
            continue
        global_visit_id_index = visit_mapping_result['MAPPING_VISIT_ID'].index(
            pitt_visit_id)
        global_visit_id = visit_mapping_result['GLOBAL_VISIT_ID'][
            global_visit_id_index]
        union_visit_id_index = union_result['VISIT_OCCURRENCE_ID'].index(
            global_visit_id)
        pitt_cols_without_id = [
            values for key, values in pitt_result.items()
            if key not in [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
        ]
        union_cols_without_id = [
            values for key, values in union_result.items()
            if key not in [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
        ]
        self.assertListEqual(
            get_element_from_list_of_lists(ind, pitt_cols_without_id),
            get_element_from_list_of_lists(union_visit_id_index,
                                           union_cols_without_id))

def mapping_query(domain_table):
    """
    Returns query used to get mapping of all records from RDR combined with
    EHR records of consented participants

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence',
        'condition_occurrence')
    :return: the mapping query
    """
    if combine_consts.PERSON_ID in [
            field['name'] for field in resources.fields_for(domain_table)
    ]:
        return combine_consts.MAPPING_QUERY_WITH_PERSON_CHECK.format(
            rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
            ehr_dataset_id=bq_utils.get_dataset_id(),
            ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
            domain_table=domain_table,
            mapping_constant=common.RDR_ID_CONSTANT,
            ehr_consent_table_id=combine_consts.EHR_CONSENT_TABLE_ID)
    else:
        return combine_consts.MAPPING_QUERY_WITHOUT_PERSON_CHECK.format(
            rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
            ehr_dataset_id=bq_utils.get_dataset_id(),
            ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
            domain_table=domain_table,
            mapping_constant=common.RDR_ID_CONSTANT)

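# Hedged usage sketch (not part of the source module): domain tables that
# include a person_id field get the consent-checked mapping template, others
# the unchecked one. Both branches only render SQL; no BigQuery job runs here.
for domain in ['visit_occurrence', 'condition_occurrence']:
    rendered_sql = mapping_query(domain)
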
def test_merge_with_unmatched_schema(self):
    running_jobs = []
    with open(NYC_FIVE_PERSONS_MEASUREMENT_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                'measurement.csv', fp)
    result = bq_utils.load_cdm_csv('nyc', 'measurement')
    running_jobs.append(result['jobReference']['jobId'])
    with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                'person.csv', fp)
    result = bq_utils.load_cdm_csv('pitt', 'person')
    running_jobs.append(result['jobReference']['jobId'])

    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    self.assertEqual(
        len(incomplete_jobs), 0,
        'loading tables {},{} timed out'.format('nyc_measurement',
                                                'pitt_person'))

    table_names = ['nyc_measurement', 'pitt_person']
    success, error = bq_utils.merge_tables(bq_utils.get_dataset_id(),
                                           table_names,
                                           bq_utils.get_dataset_id(),
                                           'merged_nyc_pitt')
    self.assertFalse(success)

def test_merge_bad_table_names(self):
    table_ids = ['nyc_person_foo', 'pitt_person_foo']
    success_flag, error_msg = bq_utils.merge_tables(
        bq_utils.get_dataset_id(), table_ids, bq_utils.get_dataset_id(),
        'merged_nyc_pitt')
    # print error_msg
    assert not success_flag

def test_merge_with_good_data(self):
    running_jobs = []
    with open(NYC_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                'person.csv', fp)
    result = bq_utils.load_cdm_csv('nyc', 'person')
    running_jobs.append(result['jobReference']['jobId'])
    with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                'person.csv', fp)
    result = bq_utils.load_cdm_csv('pitt', 'person')
    running_jobs.append(result['jobReference']['jobId'])

    nyc_person_ids = [
        int(row['person_id'])
        for row in resources._csv_to_list(NYC_FIVE_PERSONS_PERSON_CSV)
    ]
    pitt_person_ids = [
        int(row['person_id'])
        for row in resources._csv_to_list(PITT_FIVE_PERSONS_PERSON_CSV)
    ]
    expected_result = nyc_person_ids + pitt_person_ids
    expected_result.sort()

    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    self.assertEqual(
        len(incomplete_jobs), 0,
        'loading tables {},{} timed out'.format('nyc_person', 'pitt_person'))

    table_ids = ['nyc_person', 'pitt_person']
    success_flag, error = bq_utils.merge_tables(bq_utils.get_dataset_id(),
                                                table_ids,
                                                bq_utils.get_dataset_id(),
                                                'merged_nyc_pitt')
    self.assertTrue(success_flag)
    self.assertEqual(error, "")

    query_string = "SELECT person_id FROM {}.{} LIMIT 1000".format(
        bq_utils.get_dataset_id(), 'merged_nyc_pitt')
    merged_query_job_result = bq_utils.query_table(query_string)

    self.assertIsNone(merged_query_job_result.get('errors', None))
    actual_result = [
        int(row['f'][0]['v']) for row in merged_query_job_result['rows']
    ]
    actual_result.sort()
    self.assertListEqual(expected_result, actual_result)

def clean_ehr_dataset(project_id=None, dataset_id=None):
    """
    Run all clean rules defined for the ehr dataset.

    :param project_id: Name of the BigQuery project.
    :param dataset_id: Name of the dataset to clean
    """
    if project_id is None:
        project_id = app_identity.get_application_id()
        LOGGER.info('Project is unspecified.  Using default value of:\t%s',
                    project_id)

    if dataset_id is None:
        dataset_id = bq_utils.get_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s',
                    dataset_id)

    sandbox_dataset_id = sandbox.create_sandbox_dataset(project_id=project_id,
                                                        dataset_id=dataset_id)
    query_list = _gather_ehr_queries(project_id, dataset_id,
                                     sandbox_dataset_id)

    LOGGER.info("Cleaning ehr_dataset")
    clean_engine.clean_dataset(project_id, query_list, stage.EHR)

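# Hedged usage sketch: both arguments are optional and fall back to the
# environment as shown above; the explicit values here are placeholders, not
# real project or dataset names.
clean_ehr_dataset(project_id='my-project-id', dataset_id='my_ehr_dataset')
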
def get_lab_concept_summary_query(hpo_id):
    """
    Get the query that checks if the HPO site has submitted the required labs

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    project_id = app_identity.get_application_id()
    dataset_id = bq_utils.get_dataset_id()
    hpo_measurement_table = bq_utils.get_table_id(hpo_id, common.MEASUREMENT)

    # Create measurement_concept_sets_table if it does not exist
    if not bq_utils.table_exists(MEASUREMENT_CONCEPT_SETS_TABLE, dataset_id):
        load_measurement_concept_sets_table(project_id, dataset_id)

    # Create measurement_concept_sets_descendants_table if it does not exist
    if not bq_utils.table_exists(MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE,
                                 dataset_id):
        load_measurement_concept_sets_descendants_table(project_id, dataset_id)

    return CHECK_REQUIRED_LAB_QUERY.format(
        project_id=project_id,
        ehr_ops_dataset_id=dataset_id,
        hpo_measurement_table=hpo_measurement_table,
        measurement_concept_sets_descendants=
        MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE)

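# Hedged usage sketch: render and run the required-labs check for one site.
# The hpo_id is illustrative; bq_utils.query and bq_utils.response2rows are
# the same helpers used elsewhere in this section.
summary_query = get_lab_concept_summary_query('pitt')
summary_rows = bq_utils.response2rows(bq_utils.query(summary_query))
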
def mapping_query(domain_table):
    """
    Returns query used to get mapping of all records from RDR combined with
    EHR records of consented participants

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence',
        'condition_occurrence')
    :return: the mapping query
    """
    return '''
    SELECT DISTINCT
      '{rdr_dataset_id}'                     AS src_dataset_id,
      {domain_table}_id                      AS src_{domain_table}_id,
      'rdr'                                  AS src_hpo_id,
      {domain_table}_id + {mapping_constant} AS {domain_table}_id
    FROM {rdr_dataset_id}.{domain_table}

    UNION ALL

    SELECT DISTINCT
      '{ehr_dataset_id}'  AS src_dataset_id,
      t.{domain_table}_id AS src_{domain_table}_id,
      v.src_hpo_id        AS src_hpo_id,
      t.{domain_table}_id AS {domain_table}_id
    FROM {ehr_dataset_id}.{domain_table} t
    JOIN {ehr_dataset_id}._mapping_{domain_table} v
      ON t.{domain_table}_id = v.{domain_table}_id
    WHERE EXISTS
      (SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c
       WHERE t.person_id = c.person_id)
    '''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
               ehr_dataset_id=bq_utils.get_dataset_id(),
               ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
               domain_table=domain_table,
               mapping_constant=common.RDR_ID_CONSTANT,
               ehr_consent_table_id=EHR_CONSENT_TABLE_ID)

def mapping_query(domain_table):
    """
    Returns query used to get mapping of all records from RDR combined with
    EHR records of consented participants

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence',
        'condition_occurrence')
    :return: the mapping query
    """
    return '''
    WITH all_records AS (
        SELECT
          '{rdr_dataset_id}' AS src_dataset_id,
          {domain_table}_id  AS src_{domain_table}_id
        FROM {rdr_dataset_id}.{domain_table}

        UNION ALL

        SELECT
          '{ehr_dataset_id}' AS src_dataset_id,
          {domain_table}_id  AS src_{domain_table}_id
        FROM {ehr_dataset_id}.{domain_table} t
        WHERE EXISTS
          (SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c
           WHERE t.person_id = c.person_id)
    )
    SELECT
      ROW_NUMBER() OVER (ORDER BY src_dataset_id, src_{domain_table}_id)
        AS {domain_table}_id,
      src_dataset_id,
      src_{domain_table}_id
    FROM all_records
    '''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
               ehr_dataset_id=bq_utils.get_dataset_id(),
               ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
               domain_table=domain_table,
               ehr_consent_table_id=EHR_CONSENT_TABLE_ID)

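# Illustration only (not in the source): ROW_NUMBER() OVER (ORDER BY
# src_dataset_id, src_id) assigns each source record a dense new id, which
# this pure-Python sketch reproduces on made-up values.
records = [('rdr2019', 101), ('ehr2019', 101), ('ehr2019', 102)]
new_ids = {
    pair: new_id for new_id, pair in enumerate(sorted(records), start=1)
}
# new_ids == {('ehr2019', 101): 1, ('ehr2019', 102): 2, ('rdr2019', 101): 3}
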
def setUp(self):
    self.app_id = app_identity.get_application_id()
    self.dataset_id = bq_utils.get_dataset_id()
    self.bucket = gcs_utils.get_drc_bucket()
    test_util.empty_bucket(self.bucket)
    test_util.delete_all_tables(self.dataset_id)
    self.load_test_data(hpo_id=HPO_NYC)

def setUp(self):
    self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
    self.dataset = bq_utils.get_dataset_id()
    self.project_id = app_identity.get_application_id()
    self.storage_client = StorageClient(self.project_id)
    self.storage_client.empty_bucket(self.hpo_bucket)
    test_util.delete_all_tables(self.dataset)

def most_common_heel_errors(app_id=None, dataset_id=None, hpo_ids=None):
    """
    :param app_id: Application id
    :param dataset_id: Dataset id
    :param hpo_ids: list of HPO ids
    :return: None
    """
    heel_errors = list()
    if app_id is None:
        app_id = app_identity.get_application_id()
    if dataset_id is None:
        dataset_id = bq_utils.get_dataset_id()
    if not os.path.exists(HEEL_ERRORS_JSON) and not os.path.exists(
            HEEL_ERRORS_CSV):
        for hpo_id in hpo_ids:
            if bq_utils.table_exists(
                    table_id='{hpo_id}_achilles_heel_results'.format(
                        hpo_id=hpo_id),
                    dataset_id=dataset_id):
                query = heel_error_query.format(app_id=app_id,
                                                dataset_id=dataset_id,
                                                hpo_id=hpo_id)
                query_job = bq_utils.query(query)
                result = bq_utils.response2rows(query_job)
                heel_errors.extend(result)
        with open(HEEL_ERRORS_JSON, 'w') as fp:
            json.dump(heel_errors, fp, sort_keys=True, indent=4)
    parse_json_csv()

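# Hedged usage sketch: aggregate heel errors across a few sites. The HPO ids
# are examples reused from other snippets here; results are written to
# HEEL_ERRORS_JSON and then converted to HEEL_ERRORS_CSV by parse_json_csv().
most_common_heel_errors(hpo_ids=['nyc', 'pitt', 'chs'])
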
def create_dose_form_route_mappings_table(project_id, dataset_id=None):
    """
    Creates "_logging_dose_form_route_mappings" table with only id columns
    from resources/dose_form_route_mappings.csv

    :param project_id: identifies the BQ project
    :param dataset_id: BQ dataset_id
    :return: upload metadata for created table
    """
    if dataset_id is None:
        # Using table created in bq_dataset instead of re-creating in every dataset
        dataset_id = bq_utils.get_dataset_id()

    dose_form_routes_table_id = DOSE_FORM_ROUTES_TABLE_ID

    LOGGER.info("Creating %s.%s", dataset_id, DOSE_FORM_ROUTES_TABLE_ID)

    # create empty table
    bq_utils.create_table(DOSE_FORM_ROUTES_TABLE_ID,
                          DOSE_FORM_ROUTE_FIELDS,
                          drop_existing=True,
                          dataset_id=dataset_id)

    dose_form_route_mappings_csv = os.path.join(resources.resource_path,
                                                DOSE_FORM_ROUTES_FILE + ".csv")
    dose_form_route_mappings_list = resources.csv_to_list(
        dose_form_route_mappings_csv)
    dose_form_routes_populate_query = INSERT_ROUTES_QUERY.format(
        dataset_id=dataset_id,
        project_id=project_id,
        routes_table_id=DOSE_FORM_ROUTES_TABLE_ID,
        mapping_list=get_mapping_list(dose_form_route_mappings_list))
    result = bq_utils.query(dose_form_routes_populate_query)
    LOGGER.info("Created %s.%s", dataset_id, dose_form_routes_table_id)
    return result

def export_from_path(p, hpo_id):
    """
    Export results

    :param p: path to a directory containing SQL files (and subdirectories)
    :param hpo_id: HPO to run export for
    :return: `dict` structured for report render
    """
    result = dict()
    for f in list_files_only(p):
        name = f[0:-4].upper()  # strip the '.sql' extension
        abs_path = os.path.join(p, f)
        with open(abs_path, 'r') as fp:
            sql = fp.read()
            sql = render(sql,
                         hpo_id,
                         results_schema=bq_utils.get_dataset_id(),
                         vocab_schema='')
            query_result = bq_utils.query(sql)
            # TODO reshape results
            result[name] = query_result_to_payload(query_result)
    for d in list_dirs_only(p):
        abs_path = os.path.join(p, d)
        name = d.upper()
        # recursive call
        dir_result = export_from_path(abs_path, hpo_id)
        if name in result:
            # a sql file generated the item already
            result[name].update(dir_result)
        else:
            # add the item
            result[name] = dir_result
    return result

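# Hedged sketch of the output shape, assuming a directory layout such as
# export/achilles/datadensity/total.sql: each SQL file becomes an upper-cased
# key and each subdirectory nests another dict, e.g.
# {'DATADENSITY': {'TOTAL': <payload>}, ...}.
report_payload = export_from_path(os.path.join('export', 'achilles'), 'nyc')
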
def test_load_csv(self):
    from google.appengine.api import app_identity

    app_id = app_identity.get_application_id()
    table_name = 'achilles_analysis'
    schema_file_name = table_name + '.json'
    csv_file_name = table_name + '.csv'
    schema_path = os.path.join(resources.fields_path, schema_file_name)
    local_csv_path = os.path.join(test_util.TEST_DATA_EXPORT_PATH,
                                  csv_file_name)
    with open(local_csv_path, 'r') as fp:
        response = gcs_utils.upload_object(self.hpo_bucket, csv_file_name, fp)
    hpo_bucket = self.hpo_bucket
    gcs_object_path = 'gs://%(hpo_bucket)s/%(csv_file_name)s' % locals()
    dataset_id = bq_utils.get_dataset_id()
    load_results = bq_utils.load_csv(schema_path, gcs_object_path, app_id,
                                     dataset_id, table_name)

    load_job_id = load_results['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(table_name))
    query_response = bq_utils.query('SELECT COUNT(1) FROM %(table_name)s' %
                                    locals())
    self.assertEqual(query_response['kind'], 'bigquery#queryResponse')

def _export_query_response_by_path(p, hpo_id):
    """Utility to create response test payloads"""
    import json

    from validation import export

    for f in export.list_files_only(p):
        abs_path = os.path.join(p, f)
        with open(abs_path, 'r') as fp:
            sql = fp.read()
        sql = export.render(sql,
                            hpo_id,
                            results_schema=bq_utils.get_dataset_id(),
                            vocab_schema='synpuf_100')
        query_result = bq_utils.query(sql)
        out_file = os.path.join(TEST_DATA_EXPORT_PATH,
                                f.replace('.sql', '_response.json'))
        with open(out_file, 'w') as fp:
            data = dict()
            if 'rows' in query_result:
                data['rows'] = query_result['rows']
            if 'schema' in query_result:
                data['schema'] = query_result['schema']
            json.dump(data, fp, sort_keys=True, indent=4,
                      separators=(',', ': '))

def setUp(self):
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_app_identity_stub()
    self.testbed.init_memcache_stub()
    self.testbed.init_urlfetch_stub()
    self.testbed.init_blobstore_stub()
    self.testbed.init_datastore_v3_stub()
    self.project_id = bq_utils.app_identity.get_application_id()
    self.hpo_ids = [NYC_HPO_ID, PITT_HPO_ID]
    self.input_dataset_id = bq_utils.get_dataset_id()
    self.output_dataset_id = bq_utils.get_unioned_dataset_id()
    self._empty_hpo_buckets()
    test_util.delete_all_tables(self.input_dataset_id)
    test_util.delete_all_tables(self.output_dataset_id)

    # TODO Generalize to work for all foreign key references
    # Collect all primary key fields in CDM tables
    mapped_fields = []
    for table in cdm.tables_to_map():
        field = table + '_id'
        mapped_fields.append(field)
    self.mapped_fields = mapped_fields
    self.implemented_foreign_keys = [
        eu_constants.VISIT_OCCURRENCE_ID, eu_constants.CARE_SITE_ID,
        eu_constants.LOCATION_ID
    ]

def test_load_ehr_observation(self):
    hpo_id = 'pitt'
    dataset_id = bq_utils.get_dataset_id()
    table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
    q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
        dataset_id=dataset_id, table_id=table_id)
    expected_observation_ids = [
        int(row['observation_id'])
        for row in resources._csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
    ]
    with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id),
                                'observation.csv', fp)
    result = bq_utils.load_cdm_csv(hpo_id, 'observation')
    job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'pitt_observation load job did not complete')
    load_job_result = bq_utils.get_job_details(job_id)
    load_job_result_status = load_job_result['status']
    load_job_errors = load_job_result_status.get('errors')
    self.assertIsNone(load_job_errors,
                      msg='pitt_observation load job failed: ' +
                      str(load_job_errors))
    query_results_response = bq_utils.query(q)
    query_job_errors = query_results_response.get('errors')
    self.assertIsNone(query_job_errors)
    actual_result = [
        int(row['f'][0]['v']) for row in query_results_response['rows']
    ]
    self.assertListEqual(actual_result, expected_observation_ids)

def tearDownClass(cls):
    ehr_dataset_id = bq_utils.get_dataset_id()
    rdr_dataset_id = bq_utils.get_rdr_dataset_id()
    test_util.delete_all_tables(ehr_dataset_id)
    test_util.delete_all_tables(rdr_dataset_id)
    cls.testbed.deactivate()
    logger.handlers = []

def mapping_query(table_name, hpo_ids, dataset_id=None, project_id=None):
    """
    Get query used to generate new ids for a CDM table

    :param table_name: name of CDM table
    :param hpo_ids: identifies the HPOs
    :param dataset_id: identifies the BQ dataset containing the input table
    :param project_id: identifies the GCP project containing the dataset
    :return: the query
    """
    if dataset_id is None:
        dataset_id = bq_utils.get_dataset_id()
    if project_id is None:
        project_id = app_identity.get_application_id()
    subqueries = _mapping_subqueries(table_name, hpo_ids, dataset_id,
                                     project_id)
    union_all_query = UNION_ALL.join(subqueries)
    return '''
    WITH all_{table_name} AS (
    {union_all_query}
    )
    SELECT
      src_table_id,
      src_{table_name}_id,
      ROW_NUMBER() OVER () AS {table_name}_id
    FROM all_{table_name}
    '''.format(union_all_query=union_all_query, table_name=table_name)

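# Hedged usage sketch mirroring the test further down: render the id-mapping
# query for the measurement tables of two sites (their hpo-prefixed tables
# must already exist in the dataset), then execute it.
sql = mapping_query('measurement', ['chs', 'pitt'])
mapping_response = bq_utils.query(sql)
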
def setUpClass(cls):
    print('\n**************************************************************')
    print(cls.__name__)
    print('**************************************************************')
    dataset_id = bq_utils.get_dataset_id()
    test_util.delete_all_tables(dataset_id)
    test_util.populate_achilles()

def assert_ehr_and_rdr_tables():
    """
    Raise assertion error if any CDM tables are missing from the EHR or RDR
    dataset
    """
    ehr_dataset_id = bq_utils.get_dataset_id()
    assert_tables_in(ehr_dataset_id)
    rdr_dataset_id = bq_utils.get_rdr_dataset_id()
    assert_tables_in(rdr_dataset_id)

def setUp(self):
    super(CombineEhrRdrTest, self).setUp()
    self.APP_ID = bq_utils.app_identity.get_application_id()
    self.ehr_dataset_id = bq_utils.get_dataset_id()
    self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()
    self.combined_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
    self.drc_bucket = gcs_utils.get_drc_bucket()
    test_util.delete_all_tables(self.combined_dataset_id)

def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = bq_utils.get_dataset_id()
    self.bucket: str = gcs_utils.get_drc_bucket()
    self.storage_client = StorageClient(self.project_id)
    self.storage_client.empty_bucket(self.bucket)
    test_util.delete_all_tables(self.dataset_id)
    self.load_test_data(hpo_id=HPO_NYC)

def get_unconsented_ehr_records_count(self, table_name):
    q = UNCONSENTED_EHR_COUNTS_QUERY.format(
        rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
        ehr_dataset_id=bq_utils.get_dataset_id(),
        ehr_rdr_dataset_id=self.combined_dataset_id,
        domain_table=table_name,
        ehr_consent_table_id='_ehr_consent')
    response = bq_utils.query(q)
    rows = bq_utils.response2rows(response)
    return rows[0]['n']

def _test_mapping_query(self):
    table = 'measurement'
    hpo_ids = ['chs', 'pitt']
    project_id = bq_utils.app_identity.get_application_id()
    dataset_id = bq_utils.get_dataset_id()
    created_tables = []
    for hpo_id in hpo_ids:
        hpo_table = self._create_hpo_table(hpo_id, table, dataset_id)
        created_tables.append(hpo_table)
    q = ehr_union.mapping_query(table, hpo_ids, dataset_id, project_id)

def main(args):
    dataset_id = get_dataset_id()
    target_bucket = args.bucket
    folder_prefix = args.folder + '/'
    _run_achilles()
    _run_export(datasource_id=dataset_id,
                folder_prefix=folder_prefix,
                target_bucket=target_bucket)
    _upload_achilles_files(folder_prefix=folder_prefix,
                           target_bucket=target_bucket)

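# Hedged sketch of the argument wiring main() expects; the flag names are
# assumptions inferred from the attribute access above (args.bucket,
# args.folder).
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--bucket', required=True, help='target GCS bucket')
    parser.add_argument('--folder', required=True, help='output folder prefix')
    main(parser.parse_args())
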
def setUpClass(cls):
    print('\n**************************************************************')
    print(cls.__name__)
    print('**************************************************************')
    fake_bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
    dataset_id = bq_utils.get_dataset_id()
    test_util.delete_all_tables(dataset_id)
    test_util.get_synpuf_results_files()
    test_util.populate_achilles(fake_bucket)

def clean_ehr_dataset(project=None, dataset=None):
    if dataset is None or dataset == '' or dataset.isspace():
        dataset = bq_utils.get_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s',
                    dataset)

    query_list = _gather_ehr_queries(project, dataset)

    LOGGER.info("Cleaning ehr_dataset")
    clean_engine.clean_dataset(project, dataset, query_list)