def load_measurement_concept_sets_table(project_id, dataset_id):
    """
    Loads the required lab table from resource_files/measurement_concept_sets.csv
    into project_id.ehr_ops

    :param project_id: Project where the dataset resides
    :param dataset_id: Dataset where the required lab table needs to be created
    :return: None
    """
    check_and_copy_tables(project_id, dataset_id)

    try:
        LOGGER.info(
            f'Upload {MEASUREMENT_CONCEPT_SETS_TABLE}.csv to {dataset_id} in {project_id}'
        )
        bq_utils.load_table_from_csv(project_id, dataset_id,
                                     MEASUREMENT_CONCEPT_SETS_TABLE)
    except (oauth2client.client.HttpAccessTokenRefreshError,
            googleapiclient.errors.HttpError):
        LOGGER.exception(
            f"FAILED: CSV file could not be uploaded:\n{app_identity}")
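# A minimal usage sketch for the loader above. The values are hypothetical;
# in the curation environment the project and dataset would typically come
# from app_identity.get_application_id() and bq_utils.get_dataset_id():
#
#     project_id = app_identity.get_application_id()
#     dataset_id = bq_utils.get_dataset_id()
#     load_measurement_concept_sets_table(project_id, dataset_id)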
def load_operational_pii_fields_lookup_table(project_id, sandbox_dataset_id):
    """
    Loads the operational pii fields from resources/operational_pii_fields.csv
    into project_id.sandbox_dataset_id.operational_pii_fields in BQ

    :param project_id: Project where the sandbox dataset resides
    :param sandbox_dataset_id: Dataset where the operational pii fields lookup table needs to be created
    :return: None
    """
    bq_utils.load_table_from_csv(project_id,
                                 sandbox_dataset_id,
                                 OPERATIONAL_PII_FIELDS_TABLE,
                                 csv_path=None,
                                 fields=None)
def load_smoking_lookup_table(project_id, sandbox_dataset_id):
    """
    Loads the smoking lookup table from resource_files/smoking_lookup.csv
    into project_id.sandbox_dataset_id.smoking_lookup in BQ

    :param project_id: Project where the sandbox dataset resides
    :param sandbox_dataset_id: Dataset where the smoking lookup table needs to be created
    :return: None
    """
    bq_utils.load_table_from_csv(project_id,
                                 sandbox_dataset_id,
                                 SMOKING_LOOKUP_TABLE,
                                 csv_path=None,
                                 fields=SMOKING_LOOKUP_FIELDS)
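# Usage sketch for the two lookup loaders above. Passing csv_path=None relies
# on bq_utils.load_table_from_csv resolving the CSV from the table name in the
# resource files; the project and dataset names below are placeholders:
#
#     load_operational_pii_fields_lookup_table('my-project', 'my_sandbox')
#     load_smoking_lookup_table('my-project', 'my_sandbox')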
def _load_data(self):
    # Load measurement_concept_sets
    required_labs.load_measurement_concept_sets_table(
        project_id=self.project_id, dataset_id=self.dataset_id)
    # Load measurement_concept_sets_descendants
    required_labs.load_measurement_concept_sets_descendants_table(
        project_id=self.project_id, dataset_id=self.dataset_id)

    # Create empty concept and concept_ancestor tables if they do not exist
    if not bq_utils.table_exists(common.CONCEPT, self.dataset_id):
        bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
    if not bq_utils.table_exists(common.CONCEPT_ANCESTOR, self.dataset_id):
        bq_utils.create_standard_table(common.CONCEPT_ANCESTOR,
                                       common.CONCEPT_ANCESTOR)

    # We need to load measurement.csv into bigquery_dataset_id in advance
    # for the other integration tests
    ehr_measurement_result = bq_utils.load_table_from_csv(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_name=bq_utils.get_table_id(FAKE_HPO_ID, common.MEASUREMENT),
        csv_path=test_util.FIVE_PERSONS_MEASUREMENT_CSV,
        fields=resources.fields_for(common.MEASUREMENT))
    bq_utils.wait_on_jobs([ehr_measurement_result['jobReference']['jobId']])
def load_analyses(hpo_id):
    """
    Populate the achilles lookup table

    :param hpo_id: hpo_id of the site to run achilles on
    :return: None
    """
    project_id = app_identity.get_application_id()
    dataset_id = bq_utils.get_dataset_id()
    if hpo_id is None:
        table_prefix = ""
    else:
        table_prefix = hpo_id + '_'
    table_name = table_prefix + ACHILLES_ANALYSIS
    csv_path = os.path.join(resources.resource_files_path,
                            ACHILLES_ANALYSIS + '.csv')
    schema = resources.fields_for(ACHILLES_ANALYSIS)
    bq_utils.load_table_from_csv(project_id, dataset_id, table_name, csv_path,
                                 schema)
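# Sketch of the table naming load_analyses produces, assuming the
# ACHILLES_ANALYSIS constant is 'achilles_analysis' and 'nyc_hpo' is a
# hypothetical site id:
#
#     load_analyses('nyc_hpo')  # loads <dataset>.nyc_hpo_achilles_analysis
#     load_analyses(None)       # loads <dataset>.achilles_analysis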
def get_update_questions_answers_not_mapped_to_omop(project_id, dataset_id,
                                                    sandbox_dataset_id):
    """
    This function gets the queries required to update the questions and answers
    that were not mapped to OMOP concepts

    :param project_id: Name of the project
    :param dataset_id: Name of the dataset where the queries should be run
    :param sandbox_dataset_id: Name of the sandbox dataset
    :return: a list of query dicts
    """
    bq_utils.load_table_from_csv(project_id=project_id,
                                 dataset_id=sandbox_dataset_id,
                                 table_name=OLD_MAP_SHORT_CODES_TABLE,
                                 fields=OLD_MAP_SHORT_CODES_TABLE_FIELDS)

    queries_list = []

    # Update concept_ids for questions using OLD_MAP_SHORT_CODES_TABLE.
    query = dict()
    query[cdr_consts.QUERY] = UPDATE_QUESTIONS_MAP_QUERY.format(
        dataset=dataset_id,
        project=project_id,
        old_map=OLD_MAP_SHORT_CODES_TABLE,
        sandbox=sandbox_dataset_id)
    queries_list.append(query)

    # Update concept_ids for answers using OLD_MAP_SHORT_CODES_TABLE.
    query = dict()
    query[cdr_consts.QUERY] = UPDATE_ANSWERS_MAP_QUERY.format(
        dataset=dataset_id,
        project=project_id,
        old_map=OLD_MAP_SHORT_CODES_TABLE,
        sandbox=sandbox_dataset_id)
    queries_list.append(query)

    return queries_list
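# Sketch of how the returned query dicts are typically consumed. The loop
# below is illustrative, not the project's actual cleaning-rule entry point;
# the project, dataset, and sandbox names are placeholders:
#
#     queries = get_update_questions_answers_not_mapped_to_omop(
#         'my-project', 'my_dataset', 'my_sandbox')
#     for q in queries:
#         bq_utils.query(q[cdr_consts.QUERY])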
def test_load_table_from_csv(self):
    table_id = 'test_csv_table'
    csv_file = 'load_csv_test_data.csv'
    csv_path = os.path.join(test_util.TEST_DATA_PATH, csv_file)
    with open(csv_path, 'r') as f:
        expected = list(csv.DictReader(f))
    bq_utils.load_table_from_csv(self.project_id, self.dataset_id, table_id,
                                 csv_path, self.TEST_FIELDS)

    q = """
    SELECT *
    FROM `{project_id}.{dataset_id}.{table_id}`""".format(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_id=table_id)
    r = bq_utils.query(q)
    actual = bq_utils.response2rows(r)

    # Convert the epoch times to datetime with time zone
    for row in actual:
        row['timestamp_field'] = time.strftime(
            self.DT_FORMAT + ' UTC', time.gmtime(row['timestamp_field']))
    expected.sort(key=lambda row: row['integer_field'])
    actual.sort(key=lambda row: row['integer_field'])

    for i, _ in enumerate(expected):
        self.assertCountEqual(expected[i], actual[i])
def test_html_report_five_person(self, mock_check_cron, mock_first_run,
                                 mock_rdr_date, mock_required_files_loaded):
    mock_required_files_loaded.return_value = False
    mock_first_run.return_value = False
    rdr_date = '2020-01-01'
    mock_rdr_date.return_value = rdr_date

    for cdm_file in test_util.FIVE_PERSONS_FILES:
        test_util.write_cloud_file(self.hpo_bucket,
                                   cdm_file,
                                   prefix=self.folder_prefix)
    # load person table in RDR
    bq_utils.load_table_from_csv(self.project_id, self.rdr_dataset_id,
                                 common.PERSON,
                                 test_util.FIVE_PERSONS_PERSON_CSV)

    # Load measurement_concept_sets
    required_labs.load_measurement_concept_sets_table(
        project_id=self.project_id, dataset_id=self.bigquery_dataset_id)
    # Load measurement_concept_sets_descendants
    required_labs.load_measurement_concept_sets_descendants_table(
        project_id=self.project_id, dataset_id=self.bigquery_dataset_id)

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, self.folder_prefix + common.RESULTS_HTML)

    # ensure emails are not sent
    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
    self.assertFalse(main.is_first_validation_run(folder_items))

    # parse html
    soup = bs(actual_result, parser="lxml", features="lxml")

    missing_pii_html_table = soup.find('table', id='missing_pii')
    table_headers = missing_pii_html_table.find_all('th')
    self.assertEqual('Missing Participant Record Type',
                     table_headers[0].get_text())
    self.assertEqual('Count', table_headers[1].get_text())

    table_rows = missing_pii_html_table.find_next('tbody').find_all('tr')
    missing_record_types = [
        table_row.find('td').text for table_row in table_rows
    ]
    self.assertIn(main_consts.EHR_NO_PII, missing_record_types)
    self.assertIn(main_consts.PII_NO_EHR, missing_record_types)
    self.assertIn(main_consts.EHR_NO_RDR.format(date=rdr_date),
                  missing_record_types)
    self.assertIn(main_consts.EHR_NO_PARTICIPANT_MATCH, missing_record_types)

    required_lab_html_table = soup.find('table', id='required-lab')
    table_headers = required_lab_html_table.find_all('th')
    self.assertEqual(3, len(table_headers))
    self.assertEqual('Ancestor Concept ID', table_headers[0].get_text())
    self.assertEqual('Ancestor Concept Name', table_headers[1].get_text())
    self.assertEqual('Found', table_headers[2].get_text())

    table_rows = required_lab_html_table.find_next('tbody').find_all('tr')
    table_rows_last_column = [
        table_row.find_all('td')[-1] for table_row in table_rows
    ]
    submitted_labs = [
        row for row in table_rows_last_column
        if 'result-1' in row.attrs['class']
    ]
    missing_labs = [
        row for row in table_rows_last_column
        if 'result-0' in row.attrs['class']
    ]
    self.assertTrue(len(table_rows) > 0)
    self.assertTrue(len(submitted_labs) > 0)
    self.assertTrue(len(missing_labs) > 0)
def get_queries_health_insurance(project_id, dataset_id, sandbox_dataset_id,
                                 pid_file):
    """
    Queries to run for updating health insurance information

    :param project_id: project id associated with the dataset to run the queries on
    :param dataset_id: dataset id to run the queries on
    :param sandbox_dataset_id: dataset id of the sandbox
    :param pid_file: path to the file containing pids
    :return: list of query dicts
    """
    pids_list = extract_pids_from_file(pid_file)
    pids = ', '.join([str(pid) for pid in pids_list])

    bq_utils.load_table_from_csv(project_id,
                                 sandbox_dataset_id,
                                 INSURANCE_LOOKUP,
                                 csv_path=None,
                                 fields=INSURANCE_LOOKUP_FIELDS)

    queries = []

    sandbox_query = dict()
    sandbox_query[cdr_consts.QUERY] = SANDBOX_CREATE_QUERY.format(
        project_id=project_id,
        combined_dataset_id=dataset_id,
        sandbox_dataset_id=sandbox_dataset_id,
        new_insurance_rows=NEW_INSURANCE_ROWS,
        insurance_lookup=INSURANCE_LOOKUP,
        ORIGINAL_OBSERVATION_SOURCE_CONCEPT_ID=
        ORIGINAL_OBSERVATION_SOURCE_CONCEPT_ID,
        HCAU_OBSERVATION_SOURCE_CONCEPT_ID=HCAU_OBSERVATION_SOURCE_CONCEPT_ID,
        pids=pids)
    queries.append(sandbox_query)

    invalidate_query = dict()
    invalidate_query[cdr_consts.QUERY] = UPDATE_INVALID_QUERY.format(
        project_id=project_id,
        combined_dataset_id=dataset_id,
        ORIGINAL_OBSERVATION_SOURCE_CONCEPT_ID=
        ORIGINAL_OBSERVATION_SOURCE_CONCEPT_ID,
        pids=pids)
    queries.append(invalidate_query)

    delete_query = dict()
    delete_query[
        cdr_consts.QUERY] = DELETE_ORIGINAL_FOR_HCAU_PARTICIPANTS.format(
            project_id=project_id,
            combined_dataset_id=dataset_id,
            sandbox_dataset_id=sandbox_dataset_id,
            new_insurance_rows=NEW_INSURANCE_ROWS,
            ORIGINAL_OBSERVATION_SOURCE_CONCEPT_ID=
            ORIGINAL_OBSERVATION_SOURCE_CONCEPT_ID)
    queries.append(delete_query)

    insert_query = dict()
    insert_query[
        cdr_consts.QUERY] = INSERT_ANSWERS_FOR_HCAU_PARTICIPANTS.format(
            project_id=project_id,
            combined_dataset_id=dataset_id,
            sandbox_dataset_id=sandbox_dataset_id,
            new_insurance_rows=NEW_INSURANCE_ROWS)
    queries.append(insert_query)

    return queries
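# Sketch of running the health insurance queries. List order appears to
# matter here, since the later delete/insert queries read the sandboxed
# new_insurance_rows table created by the first query, so the sketch runs
# them in sequence; the dataset names and pid file path are placeholders:
#
#     for q in get_queries_health_insurance('my-project', 'combined_dataset',
#                                           'sandbox_dataset',
#                                           '/path/to/pids.txt'):
#         bq_utils.query(q[cdr_consts.QUERY])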
def test_html_report_five_person(self, mock_check_cron, mock_first_run,
                                 mock_required_files_loaded,
                                 mock_has_all_required_files,
                                 mock_updated_datetime_object):
    mock_required_files_loaded.return_value = False
    mock_first_run.return_value = False
    mock_has_all_required_files.return_value = True
    mock_updated_datetime_object.return_value = datetime.datetime.today(
    ) - datetime.timedelta(minutes=7)

    for cdm_file in test_util.FIVE_PERSONS_FILES:
        blob_name = f'{self.folder_prefix}{os.path.basename(cdm_file)}'
        test_blob = self.storage_bucket.blob(blob_name)
        test_blob.upload_from_filename(cdm_file)

    # load person table in RDR
    bq_utils.load_table_from_csv(self.project_id, self.rdr_dataset_id,
                                 common.PERSON,
                                 test_util.FIVE_PERSONS_PERSON_CSV)

    # Load measurement_concept_sets
    required_labs.load_measurement_concept_sets_table(
        project_id=self.project_id, dataset_id=self.bigquery_dataset_id)
    # Load measurement_concept_sets_descendants
    required_labs.load_measurement_concept_sets_descendants_table(
        project_id=self.project_id, dataset_id=self.bigquery_dataset_id)

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, self.folder_prefix + common.RESULTS_HTML)

    # ensure emails are not sent
    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
    self.assertFalse(main.is_first_validation_run(folder_items))

    # parse html
    soup = bs(actual_result, parser="lxml", features="lxml")

    missing_pii_html_table = soup.find('table', id='missing_pii')
    table_headers = missing_pii_html_table.find_all('th')
    self.assertEqual('Missing Participant Record Type',
                     table_headers[0].get_text())
    self.assertEqual('Count', table_headers[1].get_text())

    table_rows = missing_pii_html_table.find_next('tbody').find_all('tr')
    missing_record_types = [
        table_row.find('td').text for table_row in table_rows
    ]
    self.assertIn(main_consts.EHR_NO_PII, missing_record_types)
    self.assertIn(main_consts.PII_NO_EHR, missing_record_types)

    # the missing-from-RDR component is obsolete (see DC-1932);
    # this confirms it was removed successfully from the report
    rdr_date = '2020-01-01'
    self.assertNotIn(main_consts.EHR_NO_RDR.format(date=rdr_date),
                     missing_record_types)
    self.assertIn(main_consts.EHR_NO_PARTICIPANT_MATCH, missing_record_types)

    required_lab_html_table = soup.find('table', id='required-lab')
    table_headers = required_lab_html_table.find_all('th')
    self.assertEqual(3, len(table_headers))
    self.assertEqual('Ancestor Concept ID', table_headers[0].get_text())
    self.assertEqual('Ancestor Concept Name', table_headers[1].get_text())
    self.assertEqual('Found', table_headers[2].get_text())

    table_rows = required_lab_html_table.find_next('tbody').find_all('tr')
    table_rows_last_column = [
        table_row.find_all('td')[-1] for table_row in table_rows
    ]
    submitted_labs = [
        row for row in table_rows_last_column
        if 'result-1' in row.attrs['class']
    ]
    missing_labs = [
        row for row in table_rows_last_column
        if 'result-0' in row.attrs['class']
    ]
    self.assertTrue(len(table_rows) > 0)
    self.assertTrue(len(submitted_labs) > 0)
    self.assertTrue(len(missing_labs) > 0)