def setUpClass(cls):
    cls.testbed = testbed.Testbed()
    cls.testbed.activate()
    cls.testbed.init_app_identity_stub()
    cls.testbed.init_memcache_stub()
    cls.testbed.init_urlfetch_stub()
    cls.testbed.init_blobstore_stub()
    cls.testbed.init_datastore_v3_stub()
    fake_bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
    dataset_id = bq_utils.get_dataset_id()
    test_util.delete_all_tables(dataset_id)
    test_util.get_synpuf_results_files()
    test_util.populate_achilles(fake_bucket)
def load_dataset_from_files(dataset_id, path, mappings=False):
    bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
    test_util.empty_bucket(bucket)
    job_ids = []
    for table in resources.CDM_TABLES:
        job_ids.append(
            CombineEhrRdrTest._upload_file_to_bucket(bucket, dataset_id, path,
                                                     table))
        if mappings and table in DOMAIN_TABLES:
            mapping_table = '_mapping_{table}'.format(table=table)
            job_ids.append(
                CombineEhrRdrTest._upload_file_to_bucket(
                    bucket, dataset_id, path, mapping_table))
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    test_util.empty_bucket(bucket)
def process_hpo(hpo_id, force_run=False):
    """
    Runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it
        has already been processed before
    :raises BucketDoesNotExistError: raised when a configured bucket does not exist
    :raises InternalValidationError: raised when an internal error is encountered
        during validation
    """
    try:
        logging.info(f"Processing hpo_id {hpo_id}")
        bucket = gcs_utils.get_hpo_bucket(hpo_id)
        bucket_items = list_bucket(bucket)
        folder_prefix = _get_submission_folder(bucket, bucket_items, force_run)
        if folder_prefix is None:
            logging.info(
                f"No submissions to process in {hpo_id} bucket {bucket}")
        else:
            folder_items = []
            if is_valid_folder_prefix_name(folder_prefix):
                # perform validation
                folder_items = get_folder_items(bucket_items, folder_prefix)
                summary = validate_submission(hpo_id, bucket, folder_items,
                                              folder_prefix)
                report_data = generate_metrics(hpo_id, bucket, folder_prefix,
                                               summary)
            else:
                # do not perform validation
                report_data = generate_empty_report(hpo_id, folder_prefix)
            perform_reporting(hpo_id, report_data, folder_items, bucket,
                              folder_prefix)
    except BucketDoesNotExistError as bucket_error:
        bucket = bucket_error.bucket
        # App engine converts an env var set but left empty to be the string 'None'
        if bucket and bucket.lower() != 'none':
            logging.warning(
                f"Bucket '{bucket}' configured for hpo_id '{hpo_id}' does not exist"
            )
        else:
            logging.info(
                f"Bucket '{bucket}' configured for hpo_id '{hpo_id}' is empty/unset"
            )
    except HttpError as http_error:
        message = (f"Failed to process hpo_id '{hpo_id}' due to the following "
                   f"HTTP error: {http_error.content.decode()}")
        logging.exception(message)
def test_merge_with_unmatched_schema(self):
    running_jobs = []
    with open(NYC_FIVE_PERSONS_MEASUREMENT_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                'measurement.csv', fp)
    result = bq_utils.load_cdm_csv('nyc', 'measurement')
    running_jobs.append(result['jobReference']['jobId'])

    with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                'person.csv', fp)
    result = bq_utils.load_cdm_csv('pitt', 'person')
    running_jobs.append(result['jobReference']['jobId'])

    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    self.assertEqual(
        len(incomplete_jobs), 0,
        'loading tables {},{} timed out'.format('nyc_measurement',
                                                'pitt_person'))

    table_names = ['nyc_measurement', 'pitt_person']
    success, error = bq_utils.merge_tables(bq_utils.get_dataset_id(),
                                           table_names,
                                           bq_utils.get_dataset_id(),
                                           'merged_nyc_pitt')
    self.assertFalse(success)
def setUp(self):
    self.hpo_id = test_util.FAKE_HPO_ID
    self.bucket = gcs_utils.get_hpo_bucket(self.hpo_id)
    self.site_bucket = 'test_bucket'
    self.folder_1 = '2019-01-01-v1/'
    self.folder_2 = '2019-02-02-v2/'
    self.folder_prefix_1 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_1
    self.folder_prefix_2 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_2
    self.pids = [17, 20]
    self.skip_pids = [10, 25]
    self.project_id = 'project_id'
    self.sandbox_dataset_id = bq_utils.get_unioned_dataset_id()
    self.pid_table_id = 'pid_table'
    self._empty_bucket()
def test_target_bucket_upload(self):
    bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
    folder_prefix = 'test-folder-fake/'
    test_util.empty_bucket(bucket_nyc)

    main._upload_achilles_files(hpo_id=None,
                                folder_prefix=folder_prefix,
                                target_bucket=bucket_nyc)
    actual_bucket_files = set(
        [item['name'] for item in gcs_utils.list_bucket(bucket_nyc)])
    expected_bucket_files = set([
        'test-folder-fake/' + item
        for item in resources.ALL_ACHILLES_INDEX_FILES
    ])
    self.assertSetEqual(expected_bucket_files, actual_bucket_files)
def setUp(self):
    super(ValidationTest, self).setUp()
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_app_identity_stub()
    self.testbed.init_memcache_stub()
    self.testbed.init_urlfetch_stub()
    self.testbed.init_blobstore_stub()
    self.testbed.init_datastore_v3_stub()
    self.hpo_id = test_util.FAKE_HPO_ID
    self.hpo_bucket = gcs_utils.get_hpo_bucket(self.hpo_id)
    self.bigquery_dataset_id = bq_utils.get_dataset_id()
    self.folder_prefix = '2019-01-01/'
    self._empty_bucket()
    test_util.delete_all_tables(self.bigquery_dataset_id)
    self._create_drug_class_table()
def setUp(self):
    self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
    self.project_id = app_identity.get_application_id()
    self.dataset_id = bq_utils.get_dataset_id()
    self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()
    self.folder_prefix = '2019-01-01/'
    test_util.delete_all_tables(self.dataset_id)
    test_util.empty_bucket(self.hpo_bucket)

    mock_get_hpo_name = mock.patch('validation.main.get_hpo_name')
    self.mock_get_hpo_name = mock_get_hpo_name.start()
    self.mock_get_hpo_name.return_value = 'Fake HPO'
    self.addCleanup(mock_get_hpo_name.stop)
    self._load_data()
def setUp(self):
    self.hpo_id = test_util.FAKE_HPO_ID
    self.hpo_bucket = gcs_utils.get_hpo_bucket(self.hpo_id)
    self.project_id = app_identity.get_application_id()
    self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()

    mock_get_hpo_name = mock.patch('validation.main.get_hpo_name')
    self.mock_get_hpo_name = mock_get_hpo_name.start()
    self.mock_get_hpo_name.return_value = 'Fake HPO'
    self.addCleanup(mock_get_hpo_name.stop)

    self.bigquery_dataset_id = bq_utils.get_dataset_id()
    self.folder_prefix = '2019-01-01-v1/'
    self._empty_bucket()
    test_util.delete_all_tables(self.bigquery_dataset_id)
    self._create_drug_class_table(self.bigquery_dataset_id)
def load_pii_csv(hpo_id, pii_table_name, source_folder_prefix=""):
    """
    Load a PII file from a bucket into a table in bigquery

    :param hpo_id: ID for the HPO site
    :param pii_table_name: name of the PII table
    :param source_folder_prefix: folder prefix in the bucket to load from (empty by default)
    :return: an object describing the associated bigquery job
    """
    if pii_table_name not in common.PII_TABLES:
        raise ValueError(
            '{} is not a valid table to load'.format(pii_table_name))

    app_id = app_identity.get_application_id()
    dataset_id = get_dataset_id()
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    fields_filename = os.path.join(resources.fields_path,
                                   pii_table_name + '.json')
    gcs_object_path = 'gs://%s/%s%s.csv' % (bucket, source_folder_prefix,
                                            pii_table_name)
    table_id = get_table_id(hpo_id, pii_table_name)
    return load_csv(fields_filename, gcs_object_path, app_id, dataset_id,
                    table_id)
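# Hedged usage sketch (not part of the source module): load one PII table for a
# site and wait for the BigQuery load job, following the job-polling pattern
# used by the tests in this section. The table name 'pii_name' is assumed to be
# a member of common.PII_TABLES.
result = load_pii_csv('pitt', 'pii_name')
incomplete_jobs = bq_utils.wait_on_jobs([result['jobReference']['jobId']])
if incomplete_jobs:
    raise RuntimeError('Job id(s) %s failed to complete' % incomplete_jobs)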
def _load_datasets(self):
    """
    Load five persons data for each test hpo

    # expected_tables is for testing output
    # it maps table name to list of expected records
    #  ex: "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
    """
    expected_tables = dict()
    running_jobs = []
    for cdm_table in resources.CDM_TABLES:
        output_table = ehr_union.output_table_for(cdm_table)
        expected_tables[output_table] = []
        for hpo_id in self.hpo_ids:
            # upload csv into hpo bucket
            if hpo_id == NYC_HPO_ID:
                cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                             cdm_table + '.csv')
            else:
                cdm_file_name = os.path.join(test_util.PITT_FIVE_PERSONS_PATH,
                                             cdm_table + '.csv')
            bucket = gcs_utils.get_hpo_bucket(hpo_id)
            if os.path.exists(cdm_file_name):
                test_util.write_cloud_file(bucket, cdm_file_name)
                csv_rows = resources.csv_to_list(cdm_file_name)
            else:
                # results in empty table
                test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                          'dummy\n')
                csv_rows = []
            # load table from csv
            result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
            running_jobs.append(result['jobReference']['jobId'])
            expected_tables[output_table] += list(csv_rows)

    # ensure person to observation output is as expected
    output_table_person = ehr_union.output_table_for(
        combine_ehr_rdr.PERSON_TABLE)
    output_table_observation = ehr_union.output_table_for(
        combine_ehr_rdr.OBSERVATION_TABLE)
    expected_tables[output_table_observation] += 4 * expected_tables[
        output_table_person]

    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    self.expected_tables = expected_tables
def setUp(self):
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_app_identity_stub()
    self.testbed.init_memcache_stub()
    self.testbed.init_urlfetch_stub()
    self.testbed.init_blobstore_stub()
    self.testbed.init_datastore_v3_stub()
    self.hpo_id = test_util.FAKE_HPO_ID
    self.bucket = gcs_utils.get_hpo_bucket(self.hpo_id)
    self.site_bucket = 'test_bucket'
    self.folder_1 = '2019-01-01-v1/'
    self.folder_2 = '2019-02-02-v2/'
    self.folder_prefix_1 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_1
    self.folder_prefix_2 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_2
    self.pids = [17, 20]
    self.skip_pids = [10, 25]
    self._empty_bucket()
def _upload_achilles_files(hpo_id, folder_prefix):
    """
    Uploads achilles web files to the corresponding hpo bucket

    :param hpo_id: which hpo bucket these files go into
    :param folder_prefix: folder in the bucket to upload the files under
    :return: list of upload results
    """
    results = []
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    for filename in common.ACHILLES_INDEX_FILES:
        logging.debug('Uploading achilles file `%s` to bucket `%s`' %
                      (filename, bucket))
        bucket_file_name = filename.split(resources.resource_path +
                                          os.sep)[1].strip()
        with open(filename, 'r') as fp:
            upload_result = gcs_utils.upload_object(
                bucket, folder_prefix + bucket_file_name, fp)
            results.append(upload_result)
    return results
def load_cdm_csv(hpo_id, cdm_table_name, source_folder_prefix="", dataset_id=None):
    """
    Load a CDM file from a bucket into a table in bigquery

    :param hpo_id: ID for the HPO site
    :param cdm_table_name: name of the CDM table
    :param source_folder_prefix: folder prefix in the bucket to load from (empty by default)
    :param dataset_id: destination dataset; defaults to the value of get_dataset_id()
    :return: an object describing the associated bigquery job
    """
    if cdm_table_name not in resources.CDM_TABLES:
        raise ValueError(
            '{} is not a valid table to load'.format(cdm_table_name))

    app_id = app_identity.get_application_id()
    if dataset_id is None:
        dataset_id = get_dataset_id()
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    fields_filename = os.path.join(resources.fields_path,
                                   cdm_table_name + '.json')
    gcs_object_path = 'gs://%s/%s%s.csv' % (bucket, source_folder_prefix,
                                            cdm_table_name)
    table_id = get_table_id(hpo_id, cdm_table_name)
    allow_jagged_rows = cdm_table_name == 'observation'
    return load_csv(fields_filename,
                    gcs_object_path,
                    app_id,
                    dataset_id,
                    table_id,
                    allow_jagged_rows=allow_jagged_rows)
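# Hedged usage sketch (not part of the source module): load a single CDM table
# for the 'pitt' HPO from its bucket and block until the load job completes,
# mirroring the pattern used by test_load_ehr_observation below.
result = load_cdm_csv('pitt', 'observation')
job_id = result['jobReference']['jobId']
incomplete_jobs = bq_utils.wait_on_jobs([job_id])
if incomplete_jobs:
    raise RuntimeError('Job id(s) %s failed to complete' % incomplete_jobs)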
def test_run_export_with_target_bucket_and_hpo_id(self):
    folder_prefix = 'dummy-prefix-2018-03-24/'
    bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
    main.run_export(hpo_id=test_util.FAKE_HPO_ID,
                    folder_prefix=folder_prefix,
                    target_bucket=bucket_nyc)
    bucket_objects = gcs_utils.list_bucket(bucket_nyc)
    actual_object_names = [obj['name'] for obj in bucket_objects]
    for report in common.ALL_REPORT_FILES:
        prefix = folder_prefix + common.ACHILLES_EXPORT_PREFIX_STRING + test_util.FAKE_HPO_ID + '/'
        expected_object_name = prefix + report
        self.assertIn(expected_object_name, actual_object_names)

    datasources_json_path = folder_prefix + common.ACHILLES_EXPORT_DATASOURCES_JSON
    self.assertIn(datasources_json_path, actual_object_names)
    datasources_json = gcs_utils.get_object(bucket_nyc, datasources_json_path)
    datasources_actual = json.loads(datasources_json)
    datasources_expected = {
        'datasources': [{
            'name': test_util.FAKE_HPO_ID,
            'folder': test_util.FAKE_HPO_ID,
            'cdmVersion': 5
        }]
    }
    self.assertDictEqual(datasources_expected, datasources_actual)
def setUp(self):
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_app_identity_stub()
    self.testbed.init_memcache_stub()
    self.testbed.init_urlfetch_stub()
    self.testbed.init_blobstore_stub()
    self.testbed.init_datastore_v3_stub()
    self.hpo_id = test_util.FAKE_HPO_ID
    self.hpo_bucket = gcs_utils.get_hpo_bucket(self.hpo_id)

    mock_get_hpo_name = mock.patch('validation.main.get_hpo_name')
    self.mock_get_hpo_name = mock_get_hpo_name.start()
    self.mock_get_hpo_name.return_value = 'Fake HPO'
    self.addCleanup(mock_get_hpo_name.stop)

    self.bigquery_dataset_id = bq_utils.get_dataset_id()
    self.folder_prefix = '2019-01-01/'
    self._empty_bucket()
    test_util.delete_all_tables(self.bigquery_dataset_id)
    self._create_drug_class_table()
def copy_files(hpo_id):
    """
    Copies files over from an hpo bucket to the drc bucket

    :param hpo_id: hpo from which to copy
    :return: JSON string indicating the copy status
    """
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    drc_private_bucket = gcs_utils.get_drc_bucket()

    bucket_items = gcs_utils.list_bucket(hpo_bucket)

    prefix = hpo_id + '/' + hpo_bucket + '/'

    for item in bucket_items:
        item_name = item['name']
        gcs_utils.copy_object(source_bucket=hpo_bucket,
                              source_object_id=item_name,
                              destination_bucket=drc_private_bucket,
                              destination_object_id=prefix + item_name)

    return '{"copy-status": "done"}'
def run_export(hpo_id=None, folder_prefix="", target_bucket=None):
    """
    Run export queries for an HPO and store JSON payloads in specified folder
    in (optional) target bucket

    :param hpo_id: ID of the HPO to run export for. This is the data source
        name in the report.
    :param folder_prefix: relative base path to store the report; empty by default
    :param target_bucket: bucket to save the report in; if None, use the bucket
        associated with hpo_id
    """
    results = []

    # Using separate var rather than hpo_id here because hpo_id None needed in calls below
    datasource_name = 'default'
    if hpo_id is None:
        if target_bucket is None:
            raise RuntimeError(
                'Cannot export if neither hpo_id nor target_bucket is specified.')
    else:
        datasource_name = hpo_id
        if target_bucket is None:
            target_bucket = gcs_utils.get_hpo_bucket(hpo_id)

    logging.info('Exporting %s report to bucket %s', datasource_name,
                 target_bucket)

    # Run export queries and store json payloads in specified folder in the target bucket
    reports_prefix = folder_prefix + ACHILLES_EXPORT_PREFIX_STRING + datasource_name + '/'
    for export_name in common.ALL_REPORTS:
        sql_path = os.path.join(export.EXPORT_PATH, export_name)
        result = export.export_from_path(sql_path, hpo_id)
        content = json.dumps(result)
        fp = StringIO(content)
        result = gcs_utils.upload_object(
            target_bucket, reports_prefix + export_name + '.json', fp)
        results.append(result)
    result = save_datasources_json(hpo_id=hpo_id,
                                   folder_prefix=folder_prefix,
                                   target_bucket=target_bucket)
    results.append(result)
    return results
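# Hedged usage sketch (not part of the source module): export reports for a
# single HPO into its own bucket, then export a 'default' datasource report
# into an explicitly chosen target bucket, mirroring
# test_run_export_with_target_bucket below. The folder prefix is illustrative.
run_export(hpo_id=test_util.FAKE_HPO_ID, folder_prefix='2019-01-01/')
run_export(folder_prefix='2019-01-01/',
           target_bucket=gcs_utils.get_hpo_bucket('nyc'))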
def test_run_export_with_target_bucket(self):
    folder_prefix = 'dummy-prefix-2018-03-24/'
    bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
    test_util.get_synpuf_results_files()
    test_util.populate_achilles(self.hpo_bucket, hpo_id=None)
    main.run_export(folder_prefix=folder_prefix, target_bucket=bucket_nyc)
    bucket_objects = gcs_utils.list_bucket(bucket_nyc)
    actual_object_names = [obj['name'] for obj in bucket_objects]
    for report in common.ALL_REPORT_FILES:
        expected_object_name = folder_prefix + common.ACHILLES_EXPORT_PREFIX_STRING + 'default' + '/' + report
        self.assertIn(expected_object_name, actual_object_names)

    datasources_json_path = folder_prefix + common.ACHILLES_EXPORT_DATASOURCES_JSON
    self.assertIn(datasources_json_path, actual_object_names)
    datasources_json = gcs_utils.get_object(bucket_nyc, datasources_json_path)
    datasources_actual = json.loads(datasources_json)
    datasources_expected = {
        'datasources': [{
            'name': 'default',
            'folder': 'default',
            'cdmVersion': 5
        }]
    }
    self.assertDictEqual(datasources_expected, datasources_actual)
def process_hpo(hpo_id, force_run=False):
    """
    Runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it
        has already been processed before
    :raises BucketDoesNotExistError: raised when a configured bucket does not exist
    :raises InternalValidationError: raised when an internal error is encountered
        during validation
    """
    try:
        logging.info('Processing hpo_id %s', hpo_id)
        bucket = gcs_utils.get_hpo_bucket(hpo_id)
        bucket_items = list_bucket(bucket)
        folder_prefix = _get_submission_folder(bucket, bucket_items, force_run)
        if folder_prefix is None:
            logging.info('No submissions to process in %s bucket %s', hpo_id,
                         bucket)
        else:
            if is_valid_folder_prefix_name(folder_prefix):
                # perform validation
                summary = validate_submission(hpo_id, bucket, bucket_items,
                                              folder_prefix)
                generate_metrics(hpo_id, bucket, folder_prefix, summary)
            else:
                # do not perform validation. Generate empty report and processed.txt
                generate_empty_report(hpo_id, bucket, folder_prefix)
    except BucketDoesNotExistError as bucket_error:
        bucket = bucket_error.bucket
        logging.warning(
            'Bucket `%s` configured for hpo_id `%s` does not exist', bucket,
            hpo_id)
    except HttpError as http_error:
        message = 'Failed to process hpo_id `%s` due to the following HTTP error: %s' % (
            hpo_id, http_error.content.decode())
        logging.exception(message)
def run_export(hpo_id, folder_prefix):
    """
    Run export queries for the HPO and upload the JSON payloads to its bucket.
    This function also changes the datasources.json file.
    """
    results = []
    logging.info('running export for hpo_id %s' % hpo_id)
    # TODO : add check for required tables
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    _reports_prefix = ACHILLES_EXPORT_PREFIX_STRING + hpo_id + "/"
    for export_name in common.ALL_REPORTS:
        sql_path = os.path.join(export.EXPORT_PATH, export_name)
        result = export.export_from_path(sql_path, hpo_id)
        content = json.dumps(result)
        fp = StringIO.StringIO(content)
        result = gcs_utils.upload_object(
            hpo_bucket, folder_prefix + _reports_prefix + export_name + '.json',
            fp)
        results.append(result)
    datasources_json_result = save_datasources_json(hpo_id, folder_prefix)
    results.append(datasources_json_result)
    return results
def load_dataset_from_files(dataset_id, path):
    app_id = bq_utils.app_identity.get_application_id()
    bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
    test_util.empty_bucket(bucket)
    job_ids = []
    for table in common.CDM_TABLES:
        filename = table + '.csv'
        schema = os.path.join(resources.fields_path, table + '.json')
        f = os.path.join(path, filename)
        if os.path.exists(os.path.join(path, filename)):
            with open(f, 'r') as fp:
                gcs_utils.upload_object(bucket, filename, fp)
        else:
            test_util.write_cloud_str(bucket, filename, '\n')
        gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                     filename=filename)
        load_results = bq_utils.load_csv(schema,
                                         gcs_path,
                                         app_id,
                                         dataset_id,
                                         table,
                                         allow_jagged_rows=True)
        load_job_id = load_results['jobReference']['jobId']
        job_ids.append(load_job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    test_util.empty_bucket(bucket)
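# Hedged usage sketch (not part of the source module): stage the five-person
# test data into BigQuery for the fake HPO before running dataset-level tests.
# test_util.FIVE_PERSONS_PATH is the local folder of per-table CSVs referenced
# in _load_datasets above.
load_dataset_from_files(bq_utils.get_dataset_id(), test_util.FIVE_PERSONS_PATH)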
def test_load_ehr_observation(self):
    hpo_id = 'pitt'
    dataset_id = bq_utils.get_dataset_id()
    table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
    q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
        dataset_id=dataset_id, table_id=table_id)
    expected_observation_ids = [
        int(row['observation_id'])
        for row in resources._csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
    ]
    with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id),
                                'observation.csv', fp)
    result = bq_utils.load_cdm_csv(hpo_id, 'observation')
    job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'pitt_observation load job did not complete')
    load_job_result = bq_utils.get_job_details(job_id)
    load_job_result_status = load_job_result['status']
    load_job_errors = load_job_result_status.get('errors')
    self.assertIsNone(load_job_errors,
                      msg='pitt_observation load job failed: ' +
                      str(load_job_errors))
    query_results_response = bq_utils.query(q)
    query_job_errors = query_results_response.get('errors')
    self.assertIsNone(query_job_errors)
    actual_result = [
        int(row['f'][0]['v']) for row in query_results_response['rows']
    ]
    self.assertListEqual(actual_result, expected_observation_ids)
def setUp(self):
    self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
    self.gcs_path = '/'.join([self.hpo_bucket, 'dummy'])
    self._empty_bucket()
def setUp(self):
    self.hpo_bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
    test_util.empty_bucket(self.hpo_bucket)
    test_util.delete_all_tables(bq_utils.get_dataset_id())
def tearDown(self):
    self._empty_bucket()
    bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
    test_util.empty_bucket(bucket_nyc)
    test_util.empty_bucket(gcs_utils.get_drc_bucket())
    self.testbed.deactivate()
def setUpClass(cls):
    fake_bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
    dataset_id = bq_utils.get_dataset_id()
    test_util.delete_all_tables(dataset_id)
    test_util.get_synpuf_results_files()
    test_util.populate_achilles(fake_bucket)
def tearDown(self):
    self._empty_bucket()
    bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
    test_util.empty_bucket(bucket_nyc)
    test_util.empty_bucket(gcs_utils.get_drc_bucket())
    test_util.delete_all_tables(self.bigquery_dataset_id)
def _empty_hpo_buckets(self):
    for hpo_id in self.hpo_ids:
        bucket = gcs_utils.get_hpo_bucket(hpo_id)
        test_util.empty_bucket(bucket)
def run_validation(hpo_id, force_run=False):
    """
    Runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it
        has already been processed before
    :raises BucketDoesNotExistError: raised when a configured bucket does not exist
    :raises InternalValidationError: raised when an internal error is encountered
        during validation
    """
    logging.info(' Validating hpo_id %s' % hpo_id)
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    bucket_items = list_bucket(bucket)
    to_process_folder_list = _get_to_process_list(bucket, bucket_items,
                                                  force_run)

    for folder_prefix in to_process_folder_list:
        logging.info('Processing gs://%s/%s' % (bucket, folder_prefix))
        # separate cdm from the unknown (unexpected) files
        found_cdm_files = []
        unknown_files = []
        found_pii_files = []
        folder_items = [
            item['name'].split('/')[1]
            for item in bucket_items
            if item['name'].startswith(folder_prefix)
        ]
        for item in folder_items:
            if _is_cdm_file(item):
                found_cdm_files.append(item)
            elif _is_pii_file(item):
                found_pii_files.append(item)
            else:
                is_known_file = item in common.IGNORE_LIST
                if not is_known_file:
                    unknown_files.append(item)

        errors = []
        results = []

        # Create all tables first to simplify downstream processes
        # (e.g. ehr_union doesn't have to check if tables exist)
        for file_name in common.CDM_FILES + common.PII_FILES:
            table_name = file_name.split('.')[0]
            table_id = bq_utils.get_table_id(hpo_id, table_name)
            bq_utils.create_standard_table(table_name, table_id,
                                           drop_existing=True)

        for cdm_file_name in common.CDM_FILES:
            file_results, file_errors = perform_validation_on_file(
                cdm_file_name, found_cdm_files, hpo_id, folder_prefix, bucket)
            results.extend(file_results)
            errors.extend(file_errors)

        for pii_file_name in common.PII_FILES:
            file_results, file_errors = perform_validation_on_file(
                pii_file_name, found_pii_files, hpo_id, folder_prefix, bucket)
            results.extend(file_results)
            errors.extend(file_errors)

        # (filename, message) for each unknown file
        warnings = [
            (unknown_file, UNKNOWN_FILE) for unknown_file in unknown_files
        ]

        # output to GCS
        _save_result_in_gcs(bucket, folder_prefix + RESULT_CSV, results)
        _save_errors_warnings_in_gcs(bucket, folder_prefix + ERRORS_CSV,
                                     errors, warnings)

        if all_required_files_loaded(hpo_id, folder_prefix=folder_prefix):
            run_achilles(hpo_id)
            run_export(hpo_id=hpo_id, folder_prefix=folder_prefix)
            logging.info('Uploading achilles index files to `gs://%s/%s`.' %
                         (bucket, folder_prefix))
            _upload_achilles_files(hpo_id, folder_prefix)

        now_datetime_string = datetime.datetime.now().strftime(
            '%Y-%m-%dT%H:%M:%S')
        # note: arguments reordered to match the format string
        # (timestamp, bucket, object path)
        logging.info(
            'Processing complete. Saving timestamp %s to `gs://%s/%s`.' %
            (now_datetime_string, bucket,
             folder_prefix + common.PROCESSED_TXT))
        _write_string_to_file(bucket, folder_prefix + common.PROCESSED_TXT,
                              now_datetime_string)