def _load_datasets(self):
    """
    Load the five-person test data set for each test HPO.

    Builds self.expected_tables, which maps each output table name to the
    list of records expected in it,
    e.g. "unioned_ehr_visit_occurrence" -> [{...}, {...}, ...]
    """
    expected_tables = dict()
    running_jobs = []
    for cdm_table in resources.CDM_TABLES:
        output_table = ehr_union.output_table_for(cdm_table)
        expected_tables[output_table] = []
        for hpo_id in self.hpo_ids:
            # upload csv into hpo bucket
            if hpo_id == NYC_HPO_ID:
                cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                             cdm_table + '.csv')
            else:
                cdm_file_name = os.path.join(
                    test_util.PITT_FIVE_PERSONS_PATH, cdm_table + '.csv')
            bucket = gcs_utils.get_hpo_bucket(hpo_id)
            if os.path.exists(cdm_file_name):
                test_util.write_cloud_file(bucket, cdm_file_name)
                csv_rows = resources.csv_to_list(cdm_file_name)
            else:
                # a header-only file results in an empty table
                test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                          'dummy\n')
                csv_rows = []
            # load table from csv
            result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
            running_jobs.append(result['jobReference']['jobId'])
            expected_tables[output_table] += list(csv_rows)
    # ensure person to observation output is as expected: the union step
    # turns each person record into four observation records, so four
    # copies of the expected person records are appended
    output_table_person = ehr_union.output_table_for(
        combine_ehr_rdr.PERSON_TABLE)
    output_table_observation = ehr_union.output_table_for(
        combine_ehr_rdr.OBSERVATION_TABLE)
    expected_tables[output_table_observation] += 4 * expected_tables[
        output_table_person]
    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    self.expected_tables = expected_tables
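# A minimal sketch (illustration only, not pipeline code) of the factor of
# four above: the union step is assumed to move four demographic fields off
# each person row into separate observation rows. The field list and the
# helper name here are assumptions for illustration.
def _person_to_observation_rows(person_row):
    mapped_fields = [
        'gender_concept_id', 'race_concept_id', 'birth_datetime',
        'ethnicity_concept_id'
    ]
    # one observation record per mapped person attribute
    return [{
        'person_id': person_row['person_id'],
        'source_field': field
    } for field in mapped_fields]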
def test_bad_file_names(self):
    bad_file_names = [
        "avisit_occurrence.csv",
        "condition_occurence.csv",  # misspelled
        "person_final.csv",
        "procedure_occurrence.tsv"  # unsupported file extension
    ]
    expected_warnings = []
    for file_name in bad_file_names:
        test_util.write_cloud_str(self.hpo_bucket,
                                  self.folder_prefix + file_name, ".")
        expected_item = (file_name, common.UNKNOWN_FILE)
        expected_warnings.append(expected_item)
    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
    r = main.validate_submission(self.hpo_id, self.hpo_bucket, folder_items,
                                 self.folder_prefix)
    self.assertCountEqual(expected_warnings, r['warnings'])
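# Hedged sketch of the rule this test exercises: a submission file is
# recognized only if its name is exactly "<cdm_table>.csv"; anything else
# is expected to surface in r['warnings'] as common.UNKNOWN_FILE. This
# helper is a hypothetical illustration, not main.validate_submission.
def _is_known_file(file_name, cdm_tables):
    return file_name in {table + '.csv' for table in cdm_tables}

# e.g. _is_known_file('condition_occurence.csv', resources.CDM_TABLES)
# returns False because of the misspelling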
def _load_dataset(self, hpo_id):
    for cdm_table in resources.CDM_TABLES:
        cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                     cdm_table + '.csv')
        if os.path.exists(cdm_file_name):
            test_util.write_cloud_file(self.hpo_bucket, cdm_file_name)
        else:
            # a header-only file results in an empty table
            test_util.write_cloud_str(self.hpo_bucket, cdm_table + '.csv',
                                      'dummy\n')
        bq_utils.load_cdm_csv(hpo_id, cdm_table)

    # ensure concept table exists
    if not bq_utils.table_exists(common.CONCEPT):
        bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
        q = """INSERT INTO {dataset}.concept
               SELECT * FROM {vocab}.concept""".format(
            dataset=self.dataset, vocab=common.VOCABULARY_DATASET)
        bq_utils.query(q)
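# Why 'dummy\n' yields an empty table: the CSV load is assumed to skip the
# leading header row, so a header-only file loads zero records. A quick
# self-contained illustration with the standard csv module:
import csv
import io

assert list(csv.DictReader(io.StringIO('dummy\n'))) == []  # header only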
def _upload_file_to_bucket(bucket, dataset_id, path, table):
    app_id = bq_utils.app_identity.get_application_id()
    filename = table + '.csv'
    file_path = os.path.join(path, filename)
    try:
        with open(file_path, 'rb') as fp:
            gcs_utils.upload_object(bucket, filename, fp)
    except OSError:
        # no local test data for this table; upload a blank file so the
        # load job still creates the (empty) table
        test_util.write_cloud_str(bucket, filename, '\n')
    gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                 filename=filename)
    load_results = bq_utils.load_csv(table,
                                     gcs_path,
                                     app_id,
                                     dataset_id,
                                     table,
                                     allow_jagged_rows=True)
    load_job_id = load_results['jobReference']['jobId']
    return load_job_id
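# Hedged usage sketch: load every CDM table through the helper above and
# wait for the BigQuery load jobs to finish. The wrapper name is an
# assumption for illustration; bucket, dataset_id, and path come from the
# caller's test fixtures.
def _load_all_tables(bucket, dataset_id, path):
    job_ids = []
    for table in resources.CDM_TABLES:
        job_ids.append(_upload_file_to_bucket(bucket, dataset_id, path, table))
    incomplete = bq_utils.wait_on_jobs(job_ids)
    if incomplete:
        raise RuntimeError('Job id(s) %s failed to complete' % incomplete)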