Example #1
def populate_achilles(hpo_bucket, hpo_id=FAKE_HPO_ID, include_heel=True):
    from validation import achilles, achilles_heel
    import app_identity

    app_id = app_identity.get_application_id()

    test_file_name = achilles.ACHILLES_ANALYSIS + '.csv'
    achilles_analysis_file_path = os.path.join(TEST_DATA_EXPORT_PATH,
                                               test_file_name)
    schema_name = achilles.ACHILLES_ANALYSIS
    write_cloud_file(hpo_bucket, achilles_analysis_file_path)
    gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
    dataset_id = bq_utils.get_dataset_id()
    table_id = bq_utils.get_table_id(hpo_id, achilles.ACHILLES_ANALYSIS)
    bq_utils.load_csv(schema_name, gcs_path, app_id, dataset_id, table_id)

    table_names = [achilles.ACHILLES_RESULTS, achilles.ACHILLES_RESULTS_DIST]
    if include_heel:
        table_names.append(achilles_heel.ACHILLES_HEEL_RESULTS)

    running_jobs = []
    for table_name in table_names:
        test_file_name = table_name + '.csv'
        test_file_path = os.path.join(TEST_DATA_EXPORT_SYNPUF_PATH,
                                      table_name + '.csv')
        write_cloud_file(hpo_bucket, test_file_path)
        gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
        dataset_id = bq_utils.get_dataset_id()
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        load_results = bq_utils.load_csv(table_name, gcs_path, app_id,
                                         dataset_id, table_id)
        running_jobs.append(load_results['jobReference']['jobId'])
    bq_utils.wait_on_jobs(running_jobs)
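Every example on this page revolves around bq_utils.get_table_id, which builds the name of an HPO site's copy of a table. The helper itself is not shown here; the following is only a sketch of its apparent behavior, inferred from how the examples use it (for instance, Example #8 below refers to the resulting table as 'pitt_observation'). The real implementation in bq_utils may differ.

def get_table_id(hpo_id, table_name):
    # Sketch only (assumption): the site-specific table is the table name
    # prefixed with the HPO identifier, e.g. get_table_id('pitt', 'person')
    # would yield 'pitt_person'.
    return f'{hpo_id}_{table_name}'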
Example #2
def get_hpo_missing_pii_query(hpo_id):
    """
    Query to retrieve counts of records with missing PII for an HPO site

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    person_table_id = bq_utils.get_table_id(hpo_id, common.PERSON)
    pii_name_table_id = bq_utils.get_table_id(hpo_id, common.PII_NAME)
    pii_wildcard = bq_utils.get_table_id(hpo_id, common.PII_WILDCARD)
    participant_match_table_id = bq_utils.get_table_id(
        hpo_id, common.PARTICIPANT_MATCH)
    rdr_dataset_id = bq_utils.get_rdr_dataset_id()
    rdr_date = extract_date_from_rdr_dataset_id(rdr_dataset_id)
    ehr_no_rdr_with_date = consts.EHR_NO_RDR.format(date=rdr_date)
    rdr_person_table_id = common.PERSON
    return render_query(
        consts.MISSING_PII_QUERY,
        person_table_id=person_table_id,
        rdr_dataset_id=rdr_dataset_id,
        rdr_person_table_id=rdr_person_table_id,
        ehr_no_pii=consts.EHR_NO_PII,
        ehr_no_rdr=ehr_no_rdr_with_date,
        pii_no_ehr=consts.PII_NO_EHR,
        ehr_no_participant_match=consts.EHR_NO_PARTICIPANT_MATCH,
        pii_name_table_id=pii_name_table_id,
        pii_wildcard=pii_wildcard,
        participant_match_table_id=participant_match_table_id)
Example #3
 def test_is_hpo_col(self):
     col = dict()
     hpo_id = 'bogus_hpo'
     hpo_id_fake = 'bogus_hpo_fake'
     table_name = 'condition_occurrence'
     col[consts.TABLE_NAME] = bq_utils.get_table_id(hpo_id, table_name)
     self.assertTrue(completeness.is_hpo_col(hpo_id, col))
     col[consts.TABLE_NAME] = table_name
     self.assertFalse(completeness.is_hpo_col(hpo_id, col))
     col[consts.TABLE_NAME] = bq_utils.get_table_id(hpo_id_fake, table_name)
     self.assertFalse(completeness.is_hpo_col(hpo_id, col))
Example #4
    def _load_data(self):

        # Load measurement_concept_sets
        required_labs.load_measurement_concept_sets_table(
            project_id=self.project_id, dataset_id=self.dataset_id)
        # Load measurement_concept_sets_descendants
        required_labs.load_measurement_concept_sets_descendants_table(
            project_id=self.project_id, dataset_id=self.dataset_id)

        # Create empty concept and concept_ancestor tables if they do not exist
        if not bq_utils.table_exists(common.CONCEPT, self.dataset_id):
            bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
        if not bq_utils.table_exists(common.CONCEPT_ANCESTOR, self.dataset_id):
            bq_utils.create_standard_table(common.CONCEPT_ANCESTOR,
                                           common.CONCEPT_ANCESTOR)

        # we need to load measurement.csv into bigquery_dataset_id in advance for the other integration tests
        ehr_measurement_result = bq_utils.load_table_from_csv(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            table_name=bq_utils.get_table_id(FAKE_HPO_ID, common.MEASUREMENT),
            csv_path=test_util.FIVE_PERSONS_MEASUREMENT_CSV,
            fields=resources.fields_for(common.MEASUREMENT))
        bq_utils.wait_on_jobs(
            [ehr_measurement_result['jobReference']['jobId']])
Example #5
def _union_subqueries(table_name, hpo_ids, input_dataset_id,
                      output_dataset_id):
    """
    Get list of subqueries (one for each HPO table found in the source) that comprise the load query

    :param table_name: name of a CDM table to load
    :param hpo_ids: list of HPOs to process
    :param input_dataset_id: identifies the source dataset
    :param output_dataset_id: identifies the output dataset
    :return: list of subqueries
    """
    result = []
    # Exclude subqueries that reference tables that are missing from source dataset
    all_table_ids = bq_utils.list_all_table_ids(input_dataset_id)
    for hpo_id in hpo_ids:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        if table_id in all_table_ids:
            if table_name == common.FACT_RELATIONSHIP:
                subquery = fact_relationship_hpo_subquery(
                    hpo_id, input_dataset_id, output_dataset_id)
                result.append(subquery)
            else:
                subquery = table_hpo_subquery(table_name, hpo_id,
                                              input_dataset_id,
                                              output_dataset_id)
                result.append(subquery)
        else:
            logging.info(
                f'Excluding table {table_id} from mapping query because it does not exist'
            )
    return result
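The subqueries returned here are presumably stitched together into the single load query mentioned in the docstring elsewhere in the pipeline. The sketch below only illustrates that step; the function name _union_all_query and the UNION ALL layout are assumptions, not taken from the code above.

def _union_all_query(table_name, hpo_ids, input_dataset_id, output_dataset_id):
    # Sketch only: combine the per-HPO subqueries so the unioned table
    # contains rows from every participating site.
    subqueries = _union_subqueries(table_name, hpo_ids, input_dataset_id,
                                   output_dataset_id)
    return '\nUNION ALL\n'.join(subqueries)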
Example #6
def get_lab_concept_summary_query(hpo_id):
    """
    Get the query that checks if the HPO site has submitted the required labs

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    project_id = app_identity.get_application_id()
    dataset_id = bq_utils.get_dataset_id()
    hpo_measurement_table = bq_utils.get_table_id(hpo_id, common.MEASUREMENT)

    # Create measurement_concept_sets_table if not exist
    if not bq_utils.table_exists(MEASUREMENT_CONCEPT_SETS_TABLE, dataset_id):
        load_measurement_concept_sets_table(project_id, dataset_id)

    # Create measurement_concept_sets_descendants_table if not exist
    if not bq_utils.table_exists(MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE,
                                 dataset_id):
        load_measurement_concept_sets_descendants_table(project_id, dataset_id)

    return CHECK_REQUIRED_LAB_QUERY.format(
        project_id=project_id,
        ehr_ops_dataset_id=dataset_id,
        hpo_measurement_table=hpo_measurement_table,
        measurement_concept_sets_descendants=
        MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE)
Example #7
def render(sql, hpo_id, results_schema, vocab_schema=''):
    table_id = bq_utils.get_table_id(hpo_id, '')
    vocab_replacement = vocab_schema + '.' if vocab_schema else ''
    sql = sql.replace(RESULTS_SCHEMA_PLACEHOLDER,
                      results_schema + '.' + table_id)
    sql = sql.replace(VOCAB_SCHEMA_PLACEHOLDER, vocab_replacement)
    return sql
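A possible usage sketch for render follows. The placeholder constants and the SQL string are hypothetical stand-ins; the actual RESULTS_SCHEMA_PLACEHOLDER and VOCAB_SCHEMA_PLACEHOLDER values live in the module this excerpt comes from.

# Hypothetical placeholder values, assumed to include the trailing dot
RESULTS_SCHEMA_PLACEHOLDER = '@results_schema.'
VOCAB_SCHEMA_PLACEHOLDER = '@vocab_schema.'

sql = ('SELECT count(*) FROM @results_schema.achilles_results, '
       '@vocab_schema.concept')
rendered = render(sql, hpo_id='pitt', results_schema='my_dataset', vocab_schema='vocab')
# '@results_schema.' -> 'my_dataset.pitt_', so the first table becomes
# my_dataset.pitt_achilles_results; '@vocab_schema.' -> 'vocab.' (or '' when
# vocab_schema is empty)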
Example #8
 def test_load_ehr_observation(self):
     hpo_id = 'pitt'
     dataset_id = self.dataset_id
     table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
     q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
         dataset_id=dataset_id, table_id=table_id)
     expected_observation_ids = [
         int(row['observation_id'])
         for row in resources.csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
     ]
     sc_bucket = self.client.get_bucket(gcs_utils.get_hpo_bucket(hpo_id))
     bucket_blob = sc_bucket.blob('observation.csv')
     with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
         bucket_blob.upload_from_file(fp)
     result = bq_utils.load_cdm_csv(hpo_id, 'observation')
     job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([job_id])
     self.assertEqual(len(incomplete_jobs), 0,
                      'pitt_observation load job did not complete')
     load_job_result = bq_utils.get_job_details(job_id)
     load_job_result_status = load_job_result['status']
     load_job_errors = load_job_result_status.get('errors')
     self.assertIsNone(load_job_errors,
                       msg='pitt_observation load job failed: ' +
                       str(load_job_errors))
     query_results_response = bq_utils.query(q)
     query_job_errors = query_results_response.get('errors')
     self.assertIsNone(query_job_errors)
     actual_result = [
         int(row['f'][0]['v']) for row in query_results_response['rows']
     ]
     self.assertCountEqual(actual_result, expected_observation_ids)
Example #9
    def test_validate_five_persons_success(self, mock_check_cron):
        expected_results = []
        test_file_names = [
            os.path.basename(f) for f in test_util.FIVE_PERSONS_FILES
        ]

        for cdm_file in common.SUBMISSION_FILES:
            if cdm_file in test_file_names:
                expected_result = (cdm_file, 1, 1, 1)
                test_file = os.path.join(test_util.FIVE_PERSONS_PATH, cdm_file)
                test_util.write_cloud_file(self.hpo_bucket,
                                           test_file,
                                           prefix=self.folder_prefix)
            else:
                expected_result = (cdm_file, 0, 0, 0)
            expected_results.append(expected_result)
        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
        r = main.validate_submission(self.hpo_id, self.hpo_bucket, folder_items,
                                     self.folder_prefix)
        self.assertSetEqual(set(r['results']), set(expected_results))

        # check tables exist and are clustered as expected
        for table in resources.CDM_TABLES + common.PII_TABLES:
            fields_file = os.path.join(resources.fields_path, table + '.json')
            table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
            table_info = bq_utils.get_table_info(table_id)
            with open(fields_file, 'r') as fp:
                fields = json.load(fp)
                field_names = [field['name'] for field in fields]
                if 'person_id' in field_names:
                    self.table_has_clustering(table_info)
Example #10
    def test_validate_five_persons_success(self, mock_check_cron):
        expected_results: list = []
        test_file_names: list = [
            os.path.basename(f) for f in test_util.FIVE_PERSONS_FILES
        ]

        for cdm_filename in common.SUBMISSION_FILES:
            if cdm_filename in test_file_names:
                expected_result: tuple = (cdm_filename, 1, 1, 1)
                test_filepath: str = os.path.join(test_util.FIVE_PERSONS_PATH,
                                                  cdm_filename)
                test_blob = self.storage_bucket.blob(
                    f'{self.folder_prefix}{cdm_filename}')
                test_blob.upload_from_filename(test_filepath)

            else:
                expected_result: tuple = (cdm_filename, 0, 0, 0)
            expected_results.append(expected_result)
        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
        r = main.validate_submission(self.hpo_id, self.hpo_bucket, folder_items,
                                     self.folder_prefix)
        self.assertSetEqual(set(r['results']), set(expected_results))

        # check tables exist and are clustered as expected
        for table in resources.CDM_TABLES + common.PII_TABLES:
            table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
            table_info = bq_utils.get_table_info(table_id)
            fields = resources.fields_for(table)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self.table_has_clustering(table_info)
Example #11
    def load_test_data(self, hpo_id: str = None):
        """
        Load test achilles heel results data from a csv file into BigQuery

        :param hpo_id: if specified, prefix to use on csv test file and bq table, otherwise no prefix is used
        :return: contents of the file as list of objects
        """

        table_name: str = common.ACHILLES_HEEL_RESULTS
        if hpo_id is not None:
            table_id: str = bq_utils.get_table_id(hpo_id, table_name)
        else:
            table_id: str = table_name
        test_file_name: str = f'{table_id}.csv'
        test_file_path: str = os.path.join(test_util.TEST_DATA_PATH,
                                           test_file_name)

        target_bucket = self.storage_client.get_bucket(self.bucket)
        test_blob = target_bucket.blob(test_file_name)
        test_blob.upload_from_filename(test_file_path)

        gcs_path: str = f'gs://{self.bucket}/{test_file_name}'
        load_results = bq_utils.load_csv(table_name, gcs_path, self.project_id,
                                         self.dataset_id, table_id)
        job_id = load_results['jobReference']['jobId']
        bq_utils.wait_on_jobs([job_id])
        return resources.csv_to_list(test_file_path)
Example #12
def _mapping_subqueries(table_name, hpo_ids, dataset_id, project_id):
    """
    Get list of subqueries (one for each HPO table found in the source) that comprise the ID mapping query

    :param table_name: name of a CDM table whose ID field must be remapped
    :param hpo_ids: list of HPOs to process
    :param dataset_id: identifies the source dataset
    :param project_id: identifies the GCP project
    :return: list of subqueries
    """
    result = []
    hpo_unique_identifiers = get_hpo_offsets(hpo_ids)

    # Exclude subqueries that reference tables that are missing from source dataset
    all_table_ids = bq_utils.list_all_table_ids(dataset_id)
    for hpo_id in hpo_ids:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        if table_id in all_table_ids:
            subquery = f'''
                (SELECT '{table_id}' AS src_table_id,
                  {table_name}_id AS src_{table_name}_id,
                  {table_name}_id + {hpo_unique_identifiers[hpo_id]} as {table_name}_id
                  FROM `{project_id}.{dataset_id}.{table_id}`)
                '''
            result.append(subquery)
        else:
            logging.info(
                f'Excluding table {table_id} from mapping query because it does not exist'
            )
    return result
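get_hpo_offsets is not shown in this example. The sketch below only illustrates the idea implied by the subquery above: each HPO is assigned a disjoint numeric offset, so adding the offset to a site's original IDs yields IDs that are unique across all sites. The constant and the spacing of the ranges are assumptions.

HPO_ID_RANGE = 100_000_000_000  # assumed width of each site's ID range

def get_hpo_offsets(hpo_ids):
    # Sketch only: map each HPO to its own offset, e.g. the first site gets
    # 1 * HPO_ID_RANGE, the second 2 * HPO_ID_RANGE, and so on.
    return {hpo_id: (i + 1) * HPO_ID_RANGE for i, hpo_id in enumerate(hpo_ids)}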
Example #13
    def test_queries_to_retract_from_ehr_dataset(self):
        hpo_person = bq_utils.get_table_id(self.hpo_id, common.PERSON)
        hpo_death = bq_utils.get_table_id(self.hpo_id, common.DEATH)

        # hpo tables
        existing_table_ids = [hpo_person, hpo_death]
        for table in self.tables_to_retract_unioned:
            table_id = bq_utils.get_table_id(self.hpo_id, table)
            existing_table_ids.append(table_id)

        # unioned tables
        ignored_tables = []
        for cdm_table in resources.CDM_TABLES:
            unioned_table_id = rbq.UNIONED_EHR + cdm_table
            existing_table_ids.append(unioned_table_id)

            if cdm_table not in self.tables_to_retract_unioned:
                ignored_tables.append(unioned_table_id)

        # mock existing tables
        mock_table_ids = []
        for table_id in existing_table_ids:
            mock_table_id = MagicMock()
            mock_table_id.table_id = table_id
            mock_table_ids.append(mock_table_id)
        self.client.list_tables.return_value = mock_table_ids

        person_id_query = rbq.JINJA_ENV.from_string(
            rbq.PERSON_ID_QUERY).render(
                person_research_id=rbq.PERSON_ID,
                pid_project=self.project_id,
                sandbox_dataset_id=self.sandbox_dataset_id,
                pid_table_id=self.pid_table_id)
        qs = rbq.queries_to_retract_from_ehr_dataset(self.client,
                                                     self.project_id,
                                                     self.ehr_dataset_id,
                                                     self.hpo_id,
                                                     person_id_query)

        expected_tables = set(existing_table_ids) - set(ignored_tables)
        actual_tables = set()
        for query in qs:
            fq_table = re.search('`(.*)`', query)
            if fq_table:
                table = fq_table.group(1).split('.')[-1]
                actual_tables.add(table)
        self.assertSetEqual(expected_tables, actual_tables)
Example #14
def get_heel_error_query(hpo_id):
    """
    Query to retrieve errors in Achilles Heel for an HPO site

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    table_id = bq_utils.get_table_id(hpo_id, consts.ACHILLES_HEEL_RESULTS_TABLE)
    return render_query(consts.HEEL_ERROR_QUERY_VALIDATION, table_id=table_id)
Example #15
def get_drug_class_counts_query(hpo_id):
    """
    Query to retrieve counts of drug classes in an HPO site's drug_exposure table

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    table_id = bq_utils.get_table_id(hpo_id, consts.DRUG_CHECK_TABLE)
    return render_query(consts.DRUG_CHECKS_QUERY_VALIDATION, table_id=table_id)
Example #16
    def test_integration_queries_to_retract_from_fake_dataset(self, mock_list_datasets, mock_is_ehr_dataset):
        mock_list_datasets.return_value = [{'id': self.project_id+':'+self.bq_dataset_id}]
        mock_is_ehr_dataset.return_value = True

        job_ids = []
        row_count_queries = {}
        # load the cdm files into dataset
        for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
            cdm_file_name = os.path.basename(cdm_file)
            cdm_table = cdm_file_name.split('.')[0]
            hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
            # store query for checking number of rows to delete
            row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(dataset_id=self.bq_dataset_id,
                                                                      table_id=hpo_table,
                                                                      pids=retract_data_bq.int_list_to_bq(
                                                                                self.person_ids))
            retract_data_bq.logger.debug('Preparing to load table %s.%s' % (self.bq_dataset_id,
                                                                            hpo_table))
            with open(cdm_file, 'rb') as f:
                gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id), cdm_file_name, f)
            result = bq_utils.load_cdm_csv(self.hpo_id, cdm_table, dataset_id=self.bq_dataset_id)
            retract_data_bq.logger.debug('Loading table %s.%s' % (self.bq_dataset_id,
                                                                  hpo_table))
            job_id = result['jobReference']['jobId']
            job_ids.append(job_id)
        incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
        self.assertEqual(len(incomplete_jobs), 0, 'NYC five person load job did not complete')
        retract_data_bq.logger.debug('All tables loaded successfully')

        # use query results to count number of expected row deletions
        expected_row_count = {}
        for table in row_count_queries:
            result = bq_utils.query(row_count_queries[table])
            expected_row_count[table] = retract_data_bq.to_int(result['totalRows'])

        # separate check to find number of actual deleted rows
        q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_before_retraction = {}
        for row in result:
            row_count_before_retraction[row['table_id']] = row['row_count']
        deid_flag = False

        # perform retraction
        retract_data_bq.run_retraction(self.test_project_id, self.person_ids, self.hpo_id, deid_flag)

        # find actual deleted rows
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_after_retraction = {}
        for row in result:
            row_count_after_retraction[row['table_id']] = row['row_count']
        for table in expected_row_count:
            self.assertEqual(expected_row_count[table],
                             row_count_before_retraction[table] - row_count_after_retraction[table])
Example #17
    def test_validate_five_persons_success(self, mock_check_cron):
        prefix = 'dummy-prefix-2018-03-22/'
        expected_result_items = resources._csv_to_list(
            test_util.FIVE_PERSONS_SUCCESS_RESULT_CSV)
        json_export_files = self.get_json_export_files(test_util.FAKE_HPO_ID)

        # upload all five_persons files
        for cdm_file in test_util.FIVE_PERSONS_FILES:
            test_util.write_cloud_file(self.hpo_bucket,
                                       cdm_file,
                                       prefix=prefix)

        expected_tables = [
            'person', 'visit_occurrence', 'condition_occurrence',
            'procedure_occurrence', 'drug_exposure', 'measurement'
        ]
        cdm_files = [table + '.csv' for table in expected_tables]

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)

            # check the result file was put in bucket
            expected_object_names = cdm_files + common.IGNORE_LIST + json_export_files
            expected_objects = [
                prefix + item for item in expected_object_names
            ]

            list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
            actual_objects = [item['name'] for item in list_bucket_result]
            self.assertSetEqual(set(expected_objects), set(actual_objects))

            # result says file found, parsed, loaded
            actual_result = test_util.read_cloud_file(
                self.hpo_bucket, prefix + common.RESULT_CSV)
            actual_result_file = StringIO.StringIO(actual_result)
            actual_result_items = resources._csv_file_to_list(
                actual_result_file)

            expected_result_items.sort()
            actual_result_items.sort()
            self.assertListEqual(expected_result_items, actual_result_items)
            self.assertTrue(
                main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                               folder_prefix=prefix))

        # check tables exist and are clustered as expected
        for table in expected_tables:
            fields_file = os.path.join(resources.fields_path, table + '.json')
            table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
            table_info = bq_utils.get_table_info(table_id)
            with open(fields_file, 'r') as fp:
                fields = json.load(fp)
                field_names = [field['name'] for field in fields]
                if 'person_id' in field_names:
                    self.table_has_clustering(table_info)
Example #18
def create_tables(hpo_id, drop_existing=False):
    """
    Create the achilles related tables
    :param hpo_id: associated hpo id
    :param drop_existing: if True, drop existing tables
    :return:
    """
    for table_name in ACHILLES_HEEL_TABLES:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        bq_utils.create_standard_table(table_name, table_id, drop_existing)
Example #19
def exist_participant_match(ehr_dataset_id, hpo_id):
    """
    Check whether the HPO has submitted the participant_match data

    :param ehr_dataset_id: identifies the EHR dataset
    :param hpo_id: identifies the HPO site
    :return: True if the participant_match table exists in the EHR dataset, False otherwise
    """
    return bq_utils.table_exists(
        bq_utils.get_table_id(hpo_id, PARTICIPANT_MATCH), ehr_dataset_id)
Example #20
 def setUp(self):
     self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
     self.person_table_id = bq_utils.get_table_id(FAKE_HPO_ID,
                                                  common.PERSON)
     self.dataset_id = bq_utils.get_dataset_id()
     test_util.delete_all_tables(self.dataset_id)
     self.project_id = app_identity.get_application_id()
     self.TEST_FIELDS = [
         {
             "type": "integer",
             "name": "integer_field",
             "mode": "required",
             "description": "An integer field"
         },
         # DC-586 Import RDR rules should support null fields
         {
             "type": "integer",
             "name": "nullable_integer_field",
             "mode": "nullable",
             "description": "A nullable integer field"
         },
         {
             "type": "string",
             "name": "string_field",
             "mode": "required",
             "description": "A string field"
         },
         {
             "type": "date",
             "name": "date_field",
             "mode": "required",
             "description": "A date field"
         },
         {
             "type": "timestamp",
             "name": "timestamp_field",
             "mode": "required",
             "description": "A timestamp field"
         },
         {
             "type": "boolean",
             "name": "boolean_field",
             "mode": "required",
             "description": "A boolean field"
         },
         {
             "type": "float",
             "name": "float_field",
             "mode": "required",
             "description": "A float field"
         }
     ]
     self.DT_FORMAT = '%Y-%m-%d %H:%M:%S'
     self.client = StorageClient(self.project_id)
     self.client.empty_bucket(self.hpo_bucket)
Example #21
def get_hpo_subqueries(app_id, dataset_id, all_table_ids):
    result = []
    for hpo_id in get_hpo_ids():
        table_id = bq_utils.get_table_id(hpo_id, common.ACHILLES_HEEL_RESULTS)
        if table_id in all_table_ids:
            subquery = QUERY_FORMAT(dataset_name=hpo_id,
                                    app_id=app_id,
                                    dataset_id=dataset_id,
                                    table_id=table_id)
            result.append(subquery)
    return result
Example #22
    def test_queries_to_retract_from_ehr_dataset(self,
                                                 mock_list_existing_tables):
        hpo_person = bq_utils.get_table_id(self.hpo_id, common.PERSON)
        hpo_death = bq_utils.get_table_id(self.hpo_id, common.DEATH)

        # hpo tables
        existing_table_ids = [hpo_person, hpo_death]
        for table in self.tables_to_retract_unioned:
            table_id = bq_utils.get_table_id(self.hpo_id, table)
            existing_table_ids.append(table_id)

        # unioned tables
        ignored_tables = []
        for cdm_table in resources.CDM_TABLES:
            unioned_table_id = retract_data_bq.UNIONED_EHR + cdm_table
            existing_table_ids.append(unioned_table_id)

            if cdm_table not in self.tables_to_retract_unioned:
                ignored_tables.append(unioned_table_id)

        mapped_tables = cdm.tables_to_map()

        # fact_relationship does not have pid, is handled separate from other mapped tables
        for mapped_table in mapped_tables:
            mapping_table = ehr_union.mapping_table_for(mapped_table)
            existing_table_ids.append(mapping_table)
            legacy_mapping_table = retract_data_bq.UNIONED_EHR + mapping_table
            existing_table_ids.append(legacy_mapping_table)
            if mapped_table not in self.tables_to_retract_unioned:
                ignored_tables.append(mapping_table)
                ignored_tables.append(legacy_mapping_table)

        mock_list_existing_tables.return_value = existing_table_ids
        mqs, qs = retract_data_bq.queries_to_retract_from_ehr_dataset(
            self.project_id, self.ehr_dataset_id, self.project_id,
            self.sandbox_dataset_id, self.hpo_id, self.pid_table_id)
        actual_dest_tables = set(q[retract_data_bq.DEST_TABLE]
                                 for q in qs + mqs)
        expected_dest_tables = set(existing_table_ids) - set(hpo_person) - set(
            ignored_tables)
        self.assertSetEqual(expected_dest_tables, actual_dest_tables)
Example #23
    def test_get_lab_concept_summary_query(self):
        summary_query = required_labs.get_lab_concept_summary_query(
            FAKE_HPO_ID)
        summary_response = bq_utils.query(summary_query)
        summary_rows = bq_utils.response2rows(summary_response)
        submitted_labs = [
            row for row in summary_rows
            if row['measurement_concept_id_exists'] == 1
        ]
        actual_total_labs = summary_response['totalRows']

        # Count the total number of labs required; this should equal the total
        # number of rows returned by get_lab_concept_summary_query, including
        # both submitted and missing labs.
        unique_ancestor_concept_query = sql_wrangle.qualify_tables(
            """SELECT DISTINCT ancestor_concept_id FROM `{project_id}.{dataset_id}.{table_id}`"""
            .format(project_id=self.project_id,
                    dataset_id=self.dataset_id,
                    table_id=MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE))
        unique_ancestor_concept_response = bq_utils.query(
            unique_ancestor_concept_query)
        expected_total_labs = unique_ancestor_concept_response['totalRows']

        # Count the number of labs in the measurement table; this should equal
        # the number of labs submitted by the fake site.
        unique_measurement_concept_id_query = '''
                SELECT
                  DISTINCT c.ancestor_concept_id
                FROM
                  `{project_id}.{dataset_id}.{measurement_concept_sets_descendants}` AS c
                JOIN
                  `{project_id}.{dataset_id}.{measurement}` AS m
                ON
                  c.descendant_concept_id = m.measurement_concept_id
                '''.format(project_id=self.project_id,
                           dataset_id=self.dataset_id,
                           measurement_concept_sets_descendants=
                           MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE,
                           measurement=bq_utils.get_table_id(
                               FAKE_HPO_ID, common.MEASUREMENT))

        unique_measurement_concept_id_response = bq_utils.query(
            unique_measurement_concept_id_query)
        unique_measurement_concept_id_total_labs = unique_measurement_concept_id_response[
            'totalRows']

        self.assertEqual(int(expected_total_labs),
                         int(actual_total_labs),
                         msg='Compare the total number of labs')
        self.assertEqual(int(unique_measurement_concept_id_total_labs),
                         len(submitted_labs),
                         msg='Compare the number of labs submitted '
                             'in the measurement')
Example #24
def is_hpo_col(hpo_id, col):
    """
    True if col is on specified HPO table

    :param hpo_id: identifies the HPO
    :param col: column summary
    :return: True if col is on specified HPO table, False otherwise
    """
    hpo_tables = [
        bq_utils.get_table_id(hpo_id, table) for table in resources.CDM_TABLES
    ]
    return col[consts.TABLE_NAME] in hpo_tables
Example #25
 def setUp(self):
     self.testbed = testbed.Testbed()
     self.testbed.activate()
     self.testbed.init_app_identity_stub()
     self.testbed.init_memcache_stub()
     self.testbed.init_urlfetch_stub()
     self.testbed.init_blobstore_stub()
     self.testbed.init_datastore_v3_stub()
     self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
     self.person_table_id = bq_utils.get_table_id(FAKE_HPO_ID, PERSON)
     test_util.delete_all_tables(self.EHR_DATASET_ID)
     self._empty_bucket()
Example #26
def _filter_hpo_tables(tables: List[bigquery.table.TableListItem],
                       hpo_id: str) -> List[bigquery.table.TableListItem]:
    """
    Given a list of tables get those associated with an HPO submission

    :param tables: list of tables to filter
    :param hpo_id: identifies the HPO
    :return: list of tables associated with the HPO
    """
    expected_tables = [
        bq_utils.get_table_id(hpo_id, table) for table in resources.CDM_TABLES
    ]
    return [table for table in tables if table.table_id in expected_tables]
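A hedged usage sketch for _filter_hpo_tables with the standard BigQuery client; the project and dataset names below are placeholders, not values from this codebase.

from google.cloud import bigquery

client = bigquery.Client()
# list_tables yields TableListItem objects, matching the type hint above
all_tables = list(client.list_tables('my-project.my_ehr_dataset'))
pitt_tables = _filter_hpo_tables(all_tables, hpo_id='pitt')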
Example #27
    def test_query_result(self):
        with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
        result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)

        load_job_id = result['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0, 'loading table {} timed out'.format(PERSON))

        table_id = bq_utils.get_table_id(FAKE_HPO_ID, PERSON)
        q = 'SELECT person_id FROM %s' % table_id
        result = bq_utils.query(q)
        self.assertEqual(5, int(result['totalRows']))
Example #28
 def setUp(self):
     super(BqUtilsTest, self).setUp()
     self.testbed = testbed.Testbed()
     self.testbed.activate()
     self.testbed.init_app_identity_stub()
     self.testbed.init_memcache_stub()
     self.testbed.init_urlfetch_stub()
     self.testbed.init_blobstore_stub()
     self.testbed.init_datastore_v3_stub()
     self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
     self.person_table_id = bq_utils.get_table_id(FAKE_HPO_ID, PERSON)
     self._drop_tables()
     self._empty_bucket()
Example #29
 def test_query_result(self):
     sc_bucket = self.client.get_bucket(self.hpo_bucket)
     bucket_blob = sc_bucket.blob('person.csv')
     with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
         bucket_blob.upload_from_file(fp)
     result = bq_utils.load_cdm_csv(FAKE_HPO_ID, common.PERSON)
     load_job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
     self.assertEqual(len(incomplete_jobs), 0,
                      'loading table {} timed out'.format(common.PERSON))
     table_id = bq_utils.get_table_id(FAKE_HPO_ID, common.PERSON)
     q = 'SELECT person_id FROM %s' % table_id
     result = bq_utils.query(q)
     self.assertEqual(5, int(result['totalRows']))
Example #30
def validate_submission(hpo_id, bucket, bucket_items, folder_prefix):
    """
    Load submission in BigQuery and summarize outcome

    :param hpo_id: identifies the HPO site
    :param bucket: bucket containing the submission
    :param bucket_items: list of items in the bucket
    :param folder_prefix: prefix of the submission folder within the bucket
    :return: a dict with keys results, errors, warnings
      results is list of tuples (file_name, found, parsed, loaded)
      errors and warnings are both lists of tuples (file_name, message)
    """
    logging.info('Validating %s submission in gs://%s/%s', hpo_id, bucket,
                 folder_prefix)
    # separate cdm from the unknown (unexpected) files
    folder_items = [
        item['name'][len(folder_prefix):]
        for item in bucket_items
        if item['name'].startswith(folder_prefix)
    ]
    found_cdm_files, found_pii_files, unknown_files = categorize_folder_items(
        folder_items)

    errors = []
    results = []

    # Create all tables first to simplify downstream processes
    # (e.g. ehr_union doesn't have to check if tables exist)
    for file_name in resources.CDM_FILES + common.PII_FILES:
        table_name = file_name.split('.')[0]
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        bq_utils.create_standard_table(table_name,
                                       table_id,
                                       drop_existing=True)

    for cdm_file_name in sorted(resources.CDM_FILES):
        file_results, file_errors = perform_validation_on_file(
            cdm_file_name, found_cdm_files, hpo_id, folder_prefix, bucket)
        results.extend(file_results)
        errors.extend(file_errors)

    for pii_file_name in sorted(common.PII_FILES):
        file_results, file_errors = perform_validation_on_file(
            pii_file_name, found_pii_files, hpo_id, folder_prefix, bucket)
        results.extend(file_results)
        errors.extend(file_errors)

    # (filename, message) for each unknown file
    warnings = [(unknown_file, common.UNKNOWN_FILE)
                for unknown_file in unknown_files]
    return dict(results=results, errors=errors, warnings=warnings)