def test_all_files_unparseable_output(self, mock_check_cron):
    # TODO possible bug: if no pre-existing table, results in bq table not found error
    folder_prefix = 'dummy-prefix-2018-03-22/'
    for cdm_table in common.CDM_FILES:
        test_util.write_cloud_str(self.hpo_bucket, folder_prefix + cdm_table, ".\n .")
    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check the result file was put in bucket
        list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
        bucket_item_names = [
            item['name'] for item in list_bucket_result
            if item['name'].startswith(folder_prefix)
        ]
        expected_items = common.CDM_FILES + common.IGNORE_LIST
        expected_items = [
            folder_prefix + item_name for item_name in expected_items
        ]
        self.assertSetEqual(set(bucket_item_names), set(expected_items))

        # check content of the file is correct
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, folder_prefix + common.RESULT_CSV)
        actual_result = resources._csv_file_to_list(
            StringIO.StringIO(actual_result))
        expected = [{
            'cdm_file_name': cdm_file_name,
            'found': '1',
            'parsed': '0',
            'loaded': '0'
        } for cdm_file_name in common.CDM_FILES]
        self.assertEqual(expected, actual_result)
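
# The TODO above notes that validation can hit a BigQuery 'table not found' error
# when no tables pre-exist. A hedged sketch of a guard a setUp could run first;
# bq_utils.get_table_id and bq_utils.create_standard_table are assumed to behave
# as they do elsewhere in this project -- adjust if those helpers differ.
def _create_empty_cdm_tables(hpo_id):
    """Pre-create empty CDM tables so validation never hits 'table not found'."""
    for cdm_table in common.CDM_TABLES:
        # assumed helper: builds the '<hpo_id>_<table>' table id
        table_id = bq_utils.get_table_id(hpo_id, cdm_table)
        # assumed helper: creates an empty table from the standard CDM schema
        bq_utils.create_standard_table(cdm_table, table_id, drop_existing=True)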
def _load_datasets(self):
    """
    Load five persons data for each test hpo
    """
    # expected_tables is for testing output
    # it maps table name to list of expected records
    # ex: "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
    expected_tables = dict()
    running_jobs = []
    for cdm_table in common.CDM_TABLES:
        cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH, cdm_table + '.csv')
        output_table = ehr_union.output_table_for(cdm_table)
        expected_tables[output_table] = []
        for hpo_id in self.hpo_ids:
            # upload csv into hpo bucket
            bucket = gcs_utils.get_hpo_bucket(hpo_id)
            if os.path.exists(cdm_file_name):
                test_util.write_cloud_file(bucket, cdm_file_name)
                csv_rows = resources._csv_to_list(cdm_file_name)
            else:
                # results in empty table
                test_util.write_cloud_str(bucket, cdm_table + '.csv', 'dummy\n')
                csv_rows = []
            # load table from csv
            result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
            running_jobs.append(result['jobReference']['jobId'])
            expected_tables[output_table] += list(csv_rows)
    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    self.expected_tables = expected_tables
def test_bad_file_names(self, mock_check_cron):
    folder_prefix = 'dummy-prefix-2018-03-22/'
    exclude_file_list = ["person_final.csv",
                         "condition_occurence.csv",  # misspelled
                         "avisit_occurrence.csv",
                         "procedure_occurrence.tsv"]  # unsupported file extension
    exclude_file_list = [folder_prefix + item for item in exclude_file_list]
    expected_result_items = []
    for file_name in exclude_file_list:
        test_util.write_cloud_str(self.hpo_bucket, file_name, ".")
        expected_item = dict(file_name=file_name.split('/')[1],
                             message=main.UNKNOWN_FILE)
        expected_result_items.append(expected_item)
    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check content of the bucket is correct
        # IGNORE_LIST is [common.RESULT_CSV, common.WARNINGS_CSV]
        expected_bucket_items = exclude_file_list + [
            folder_prefix + item for item in common.IGNORE_LIST
        ]
        list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
        actual_bucket_items = [item['name'] for item in list_bucket_result]
        self.assertSetEqual(set(expected_bucket_items), set(actual_bucket_items))

        # check content of the warnings file is correct
        actual_result = test_util.read_cloud_file(self.hpo_bucket,
                                                  folder_prefix + common.WARNINGS_CSV)
        actual_result_file = StringIO.StringIO(actual_result)
        actual_result_items = resources._csv_file_to_list(actual_result_file)
        # sort in order to compare
        expected_result_items.sort()
        actual_result_items.sort()
        self.assertListEqual(expected_result_items, actual_result_items)
def test_errors_csv(self, mock_check_cron):
    folder_prefix = 'dummy-prefix-2018-03-22/'
    test_util.write_cloud_str(self.hpo_bucket, folder_prefix + 'person.csv', ".\n .,.,.")
    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check the result file was put in bucket
        list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
        bucket_item_names = [
            item['name'] for item in list_bucket_result
            if item['name'].startswith(folder_prefix)
        ]
        expected_items = ['person.csv'] + common.IGNORE_LIST
        expected_items = [folder_prefix + item for item in expected_items]
        self.assertSetEqual(set(bucket_item_names), set(expected_items))

        # check content of the file is correct
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, folder_prefix + common.ERRORS_CSV)
        with open(test_util.BAD_PERSON_FILE_BQ_LOAD_ERRORS_CSV, 'r') as f:
            expected = f.read()
        self.assertEqual(expected, actual_result)
def load_dataset_from_files(dataset_id, path):
    app_id = bq_utils.app_identity.get_application_id()
    bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
    test_util.empty_bucket(bucket)
    job_ids = []
    for table in common.CDM_TABLES:
        filename = table + '.csv'
        schema = os.path.join(resources.fields_path, table + '.json')
        f = os.path.join(path, filename)
        if os.path.exists(f):
            with open(f, 'r') as fp:
                gcs_utils.upload_object(bucket, filename, fp)
        else:
            test_util.write_cloud_str(bucket, filename, '\n')
        gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                     filename=filename)
        load_results = bq_utils.load_csv(schema, gcs_path, app_id, dataset_id,
                                         table, allow_jagged_rows=True)
        load_job_id = load_results['jobReference']['jobId']
        job_ids.append(load_job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    test_util.empty_bucket(bucket)
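
# Minimal usage sketch for load_dataset_from_files. The helper resolves the
# bucket from test_util.FAKE_HPO_ID itself; bq_utils.get_dataset_id() is an
# assumed helper for obtaining the target dataset id, as tests in this project
# typically do.
def _example_load_five_persons():
    # load the five_persons fixtures into the configured test dataset
    load_dataset_from_files(bq_utils.get_dataset_id(), test_util.FIVE_PERSONS_PATH)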
def _load_datasets(self):
    load_jobs = []
    self.expected_tables = dict()
    for cdm_table in common.CDM_TABLES:
        cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH, cdm_table + '.csv')
        result_table = ehr_merge.result_table_for(cdm_table)
        if os.path.exists(cdm_file_name):
            # one copy for chs, the other for pitt
            csv_rows = resources._csv_to_list(cdm_file_name)
            self.expected_tables[result_table] = csv_rows + list(csv_rows)
            test_util.write_cloud_file(self.chs_bucket, cdm_file_name)
            test_util.write_cloud_file(self.pitt_bucket, cdm_file_name)
        else:
            self.expected_tables[result_table] = []
            test_util.write_cloud_str(self.chs_bucket, cdm_table + '.csv', 'dummy\n')
            test_util.write_cloud_str(self.pitt_bucket, cdm_table + '.csv', 'dummy\n')
        chs_load_results = bq_utils.load_cdm_csv(CHS_HPO_ID, cdm_table)
        pitt_load_results = bq_utils.load_cdm_csv(PITT_HPO_ID, cdm_table)
        chs_load_job_id = chs_load_results['jobReference']['jobId']
        pitt_load_job_id = pitt_load_results['jobReference']['jobId']
        load_jobs.append(chs_load_job_id)
        load_jobs.append(pitt_load_job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(load_jobs)
    if len(incomplete_jobs) > 0:
        raise RuntimeError('BigQuery jobs %s failed to complete' % incomplete_jobs)
def test_errors_csv(self, mock_check_cron):
    folder_prefix = 'dummy-prefix-2018-03-22/'
    test_util.write_cloud_str(self.hpo_bucket, folder_prefix + 'person.csv', ".\n .,.,.")
    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check the result file was put in bucket
        list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
        bucket_item_names = [
            item['name'] for item in list_bucket_result
            if item['name'].startswith(folder_prefix)
        ]
        expected_items = ['person.csv'] + common.IGNORE_LIST
        expected_items = [folder_prefix + item for item in expected_items]
        self.assertSetEqual(set(bucket_item_names), set(expected_items))

        # check content of the file is correct
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, folder_prefix + common.ERRORS_CSV)
        actual = resources._csv_file_to_list(
            StringIO.StringIO(actual_result))
        for row in actual:
            # error messages are verbose and environment-dependent;
            # compare only the stable fields
            row.pop('message', None)
        expected = [{'file_name': 'person.csv', 'type': 'error'}]
        self.assertEqual(expected, actual)
def _load_dataset(self):
    for cdm_table in common.CDM_TABLES:
        cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH, cdm_table + '.csv')
        if os.path.exists(cdm_file_name):
            test_util.write_cloud_file(self.hpo_bucket, cdm_file_name)
        else:
            test_util.write_cloud_str(self.hpo_bucket, cdm_table + '.csv', 'dummy\n')
        bq_utils.load_cdm_csv(FAKE_HPO_ID, cdm_table)
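
# Unlike the other loaders in this section, _load_dataset fires the load jobs and
# returns without waiting on them. A sketch of a variant that blocks until the
# loads finish, mirroring the wait_on_jobs pattern used above (no names beyond
# those already in this module are assumed):
def _load_dataset_and_wait(self):
    job_ids = []
    for cdm_table in common.CDM_TABLES:
        cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH, cdm_table + '.csv')
        if os.path.exists(cdm_file_name):
            test_util.write_cloud_file(self.hpo_bucket, cdm_file_name)
        else:
            # placeholder upload; results in an empty table
            test_util.write_cloud_str(self.hpo_bucket, cdm_table + '.csv', 'dummy\n')
        load_results = bq_utils.load_cdm_csv(FAKE_HPO_ID, cdm_table)
        job_ids.append(load_results['jobReference']['jobId'])
    # block until every load job completes
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if len(incomplete_jobs) > 0:
        raise RuntimeError('BigQuery jobs %s failed to complete' % incomplete_jobs)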
def test_check_processed(self):
    folder_prefix = 'folder/'
    test_util.write_cloud_str(self.hpo_bucket, folder_prefix + 'person.csv', '\n')
    test_util.write_cloud_str(self.hpo_bucket, folder_prefix + common.PROCESSED_TXT, '\n')
    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)

    result = main._get_to_process_list(self.hpo_bucket, bucket_items,
                                       force_process=False)
    self.assertListEqual([], result)
    result = main._get_to_process_list(self.hpo_bucket, bucket_items,
                                       force_process=True)
    self.assertListEqual(result, [folder_prefix])
def test_pii_files_ignore(self, mock_check_cron):
    folder_prefix = 'dummy-prefix-2018-03-22/'
    test_util.write_cloud_str(self.hpo_bucket, folder_prefix + 'pii_person.csv',
                              contents_str='.')
    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)
        actual_result = test_util.read_cloud_file(self.hpo_bucket,
                                                  folder_prefix + common.WARNINGS_CSV)
        with open(test_util.EMPTY_WARNINGS_CSV, 'r') as f:
            expected = f.read()
        self.assertEqual(expected, actual_result)
def test_latest_folder_validation(self, mock_check_cron):
    folder_prefix_1 = 'dummy-prefix-2018-03-22-v1/'
    folder_prefix_2 = 'dummy-prefix-2018-03-22-v2/'
    folder_prefix_3 = 'dummy-prefix-2018-03-22-v3/'
    exclude_file_list = [folder_prefix_1 + 'person.csv',
                         folder_prefix_2 + 'blah.csv',
                         folder_prefix_3 + 'visit_occurrence.csv']
    for filename in exclude_file_list:
        test_util.write_cloud_str(self.hpo_bucket, filename, ".\n .")
    main.app.testing = True
    with main.app.test_client() as c:
        return_string = c.get(test_util.VALIDATE_HPO_FILES_URL).data
def _upload_file_to_bucket(bucket, dataset_id, path, table):
    app_id = bq_utils.app_identity.get_application_id()
    filename = table + '.csv'
    schema = os.path.join(resources.fields_path, table + '.json')
    f = os.path.join(path, filename)
    if os.path.exists(f):
        with open(f, 'r') as fp:
            gcs_utils.upload_object(bucket, filename, fp)
    else:
        test_util.write_cloud_str(bucket, filename, '\n')
    gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                 filename=filename)
    load_results = bq_utils.load_csv(schema, gcs_path, app_id, dataset_id,
                                     table, allow_jagged_rows=True)
    load_job_id = load_results['jobReference']['jobId']
    return load_job_id
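
# Sketch showing how _upload_file_to_bucket composes with bq_utils.wait_on_jobs
# to load every CDM table, mirroring the load_dataset_from_files pattern above
# (no names beyond those already in this module are assumed):
def _load_all_cdm_tables(bucket, dataset_id, path):
    # start one load job per CDM table and collect the job ids
    job_ids = [_upload_file_to_bucket(bucket, dataset_id, path, table)
               for table in common.CDM_TABLES]
    # block until every load job completes
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if len(incomplete_jobs) > 0:
        raise RuntimeError("Job id(s) %s failed to complete" % incomplete_jobs)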
def test_folder_list(self):
    folder_prefix_1 = 'dummy-prefix-2018-03-22-v1/'
    folder_prefix_2 = 'dummy-prefix-2018-03-22-v2/'
    folder_prefix_3 = 'dummy-prefix-2018-03-22-v3/'
    file_list = [
        folder_prefix_1 + 'person.csv',
        folder_prefix_2 + 'blah.csv',
        folder_prefix_3 + 'visit_occurrence.csv',
        'person.csv'
    ]
    for filename in file_list:
        test_util.write_cloud_str(self.hpo_bucket, filename, ".\n .")
    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    folder_list = main._get_to_process_list(self.hpo_bucket, bucket_items)
    self.assertListEqual(folder_list, [folder_prefix_3])
def _load_datasets(self):
    load_jobs = []
    for cdm_table in common.CDM_TABLES:
        cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH, cdm_table + '.csv')
        if os.path.exists(cdm_file_name):
            test_util.write_cloud_file(self.chs_bucket, cdm_file_name)
            test_util.write_cloud_file(self.pitt_bucket, cdm_file_name)
        else:
            test_util.write_cloud_str(self.chs_bucket, cdm_table + '.csv', 'dummy\n')
            test_util.write_cloud_str(self.pitt_bucket, cdm_table + '.csv', 'dummy\n')
        chs_load_results = bq_utils.load_cdm_csv(CHS_HPO_ID, cdm_table)
        pitt_load_results = bq_utils.load_cdm_csv(PITT_HPO_ID, cdm_table)
        chs_load_job_id = chs_load_results['jobReference']['jobId']
        pitt_load_job_id = pitt_load_results['jobReference']['jobId']
        load_jobs.append(chs_load_job_id)
        load_jobs.append(pitt_load_job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(load_jobs)
    if len(incomplete_jobs) > 0:
        raise RuntimeError('BigQuery jobs %s failed to complete' % incomplete_jobs)
def test_validation_done_folder(self, mock_check_cron):
    folder_prefix_v1 = 'dummy-prefix-2018-03-22-v1/'
    folder_prefix = 'dummy-prefix-2018-03-22/'

    # write dummy person files to both folders and mark folder_prefix
    # as already processed
    test_util.write_cloud_str(self.hpo_bucket, folder_prefix_v1 + 'person.csv',
                              contents_str='.')
    test_util.write_cloud_str(self.hpo_bucket, folder_prefix + 'person.csv',
                              contents_str='.')
    test_util.write_cloud_str(self.hpo_bucket, folder_prefix + common.PROCESSED_TXT,
                              contents_str='.')

    main.app.testing = True
    with main.app.test_client() as c:
        return_string = c.get(test_util.VALIDATE_HPO_FILES_URL).data
        self.assertFalse(folder_prefix in return_string)
        self.assertFalse(folder_prefix_v1 in return_string)