Example #1
    def test_ehr_person_to_observation_counts(self):
        self._load_datasets()

        # perform ehr union
        ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                       self.project_id, self.hpo_ids)

        q_person = '''
                    SELECT *
                    FROM {output_dataset_id}.unioned_ehr_person AS p
                    '''.format(output_dataset_id=self.output_dataset_id)
        person_response = bq_utils.query(q_person)
        person_rows = bq_utils.response2rows(person_response)
        q_observation = '''
                    SELECT *
                    FROM {output_dataset_id}.unioned_ehr_observation
                    WHERE observation_type_concept_id = 38000280
                    '''.format(output_dataset_id=self.output_dataset_id)
        # observation should contain 4 records per person of type EHR
        expected = len(person_rows) * 4
        observation_response = bq_utils.query(q_observation)
        observation_rows = bq_utils.response2rows(observation_response)
        actual = len(observation_rows)
        self.assertEqual(
            actual, expected,
            'Expected %s EHR person records in observation but found %s' %
            (expected, actual))
Example #2
def run_heel(hpo_id):
    """
    Run heel commands

    :param hpo_id:  string name for the hpo identifier
    :returns: None
    :raises RuntimeError: Raised if BigQuery takes longer than 30 seconds
        to complete a job on a temporary table
    """
    # note: running the full set of heel commands can take a long time
    commands = _get_heel_commands(hpo_id)
    count = 0
    for command in commands:
        count += 1
        logging.debug(' ---- running query # {}'.format(count))
        logging.debug(' ---- Running `%s`...\n' % command)
        if sql_wrangle.is_to_temp_table(command):
            table_id = sql_wrangle.get_temp_table_name(command)
            query = sql_wrangle.get_temp_table_query(command)
            insert_query_job_result = bq_utils.query(query, False, table_id)
            query_job_id = insert_query_job_result['jobReference']['jobId']

            incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
            if len(incomplete_jobs) > 0:
                logging.critical('temp results table was not created within 30 seconds')
                raise RuntimeError('Temp results table took too long to create')
        elif sql_wrangle.is_truncate(command):
            table_id = sql_wrangle.get_truncate_table_name(command)
            query = 'DELETE FROM %s WHERE TRUE' % table_id
            bq_utils.query(query)
        elif sql_wrangle.is_drop(command):
            table_id = sql_wrangle.get_drop_table_name(command)
            bq_utils.delete_table(table_id)
        else:
            bq_utils.query(command)
Example #3
    def test_ehr_person_to_observation_counts(self, mock_tables_map):
        self._load_datasets()
        mock_tables_map.return_value = [
            common.OBSERVATION, common.LOCATION, common.CARE_SITE,
            common.VISIT_OCCURRENCE
        ]

        # perform ehr union
        ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                       self.project_id, self.hpo_ids)

        q_person = '''
                    SELECT *
                    FROM {output_dataset_id}.unioned_ehr_person AS p
                    '''.format(output_dataset_id=self.output_dataset_id)
        person_response = bq_utils.query(q_person)
        person_rows = bq_utils.response2rows(person_response)
        q_observation = '''
                    SELECT *
                    FROM {output_dataset_id}.unioned_ehr_observation
                    WHERE observation_type_concept_id = 38000280
                    '''.format(output_dataset_id=self.output_dataset_id)
        # observation should contain 4 records per person of type EHR
        expected = len(person_rows) * 4
        observation_response = bq_utils.query(q_observation)
        observation_rows = bq_utils.response2rows(observation_response)
        actual = len(observation_rows)
        self.assertEqual(
            actual, expected,
            'Expected %s EHR person records in observation but found %s' %
            (expected, actual))
Example #4
    def _create_drug_class_table(self):
        table_name = 'drug_class'
        fields = [{
            "type": "integer",
            "name": "concept_id",
            "mode": "required"
        }, {
            "type": "string",
            "name": "concept_name",
            "mode": "required"
        }, {
            "type": "string",
            "name": "drug_class_name",
            "mode": "required"
        }]
        bq_utils.create_table(table_id=table_name,
                              fields=fields,
                              drop_existing=True,
                              dataset_id=self.bigquery_dataset_id)

        bq_utils.query(q=main_constants.DRUG_CLASS_QUERY.format(
            dataset_id=self.bigquery_dataset_id),
                       use_legacy_sql=False,
                       destination_table_id='drug_class',
                       retry_count=bq_consts.BQ_DEFAULT_RETRY_COUNT,
                       write_disposition='WRITE_TRUNCATE',
                       destination_dataset_id=self.bigquery_dataset_id)
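A quick sanity check on the populated lookup, sketched under the assumption that the drug_class table lands in the dataset used above (bigquery_dataset_id here is a hypothetical stand-in):

response = bq_utils.query(
    'SELECT drug_class_name, COUNT(1) AS n '
    'FROM {dataset_id}.drug_class '
    'GROUP BY drug_class_name'.format(dataset_id=bigquery_dataset_id))  # hypothetical dataset id
rows = bq_utils.response2rows(response)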
Example #5
def retraction_query_runner(queries):
    query_job_ids = []
    for query_dict in queries:
        logger.debug('Retracting from %s.%s using query %s' %
                     (query_dict[DEST_DATASET], query_dict[DEST_TABLE],
                      query_dict[QUERY]))
        if query_dict[DELETE_FLAG]:
            job_results = bq_utils.query(q=query_dict[QUERY], batch=True)
            rows_affected = job_results['numDmlAffectedRows']
            logger.debug('%s rows deleted from %s.%s' %
                         (rows_affected, query_dict[DEST_DATASET],
                          query_dict[DEST_TABLE]))
        else:
            job_results = bq_utils.query(
                q=query_dict[QUERY],
                destination_table_id=query_dict[DEST_TABLE],
                write_disposition=WRITE_TRUNCATE,
                destination_dataset_id=query_dict[DEST_DATASET],
                batch=True)
        query_job_id = job_results['jobReference']['jobId']
        query_job_ids.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(query_job_ids)
    if incomplete_jobs:
        logger.debug('Failed on {count} job ids {ids}'.format(
            count=len(incomplete_jobs), ids=incomplete_jobs))
        logger.debug('Terminating retraction')
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
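A minimal sketch of assembling the queries argument, assuming the module-level key constants (DEST_DATASET, DEST_TABLE, QUERY, DELETE_FLAG) referenced above; the dataset, table, and DELETE statement are illustrative only:

queries = [{
    DEST_DATASET: 'combined',   # hypothetical destination dataset
    DEST_TABLE: 'observation',  # hypothetical destination table
    QUERY: 'DELETE FROM combined.observation WHERE person_id IN (1, 2)',
    DELETE_FLAG: True  # run as a DML delete rather than a WRITE_TRUNCATE rewrite
}]
retraction_query_runner(queries)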
Example #6
def run_analyses(hpo_id):
    """
    Run the achilles analyses
    :param hpo_id:
    :return:
    """
    commands = _get_run_analysis_commands(hpo_id)
    for command in commands:
        logging.debug(' ---- Running `%s`...\n' % command)
        if sql_wrangle.is_to_temp_table(command):
            table_id = sql_wrangle.get_temp_table_name(command)
            query = sql_wrangle.get_temp_table_query(command)
            insert_query_job_result = bq_utils.query(query, False, table_id)
            query_job_id = insert_query_job_result['jobReference']['jobId']

            incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
            if len(incomplete_jobs) > 0:
                logging.critical('temp results table was not created within 15 seconds')
                raise RuntimeError('Temp results table took too long to create')
        elif sql_wrangle.is_truncate(command):
            table_id = sql_wrangle.get_truncate_table_name(command)
            if bq_utils.table_exists(table_id):
                bq_utils.delete_table(table_id)
        elif sql_wrangle.is_drop(command):
            table_id = sql_wrangle.get_drop_table_name(command)
            if bq_utils.table_exists(table_id):
                bq_utils.delete_table(table_id)
        else:
            bq_utils.query(command)
Example #7
def run_heel(hpo_id):
    # note: running the full set of heel commands can take a long time
    commands = _get_heel_commands(hpo_id)
    count = 0
    for command in commands:
        count += 1
        logging.debug(' ---- running query # {}'.format(count))
        logging.debug(' ---- Running `%s`...\n' % command)
        if sql_wrangle.is_to_temp_table(command):
            table_id = sql_wrangle.get_temp_table_name(command)
            query = sql_wrangle.get_temp_table_query(command)
            insert_query_job_result = bq_utils.query(query, False, table_id)
            query_job_id = insert_query_job_result['jobReference']['jobId']

            incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
            if len(incomplete_jobs) > 0:
                logging.critical('temp results table was not created within 30 seconds')
                raise RuntimeError('Temp results table took too long to create')
        elif sql_wrangle.is_truncate(command):
            table_id = sql_wrangle.get_truncate_table_name(command)
            query = 'DELETE FROM %s WHERE TRUE' % table_id
            bq_utils.query(query)
        elif sql_wrangle.is_drop(command):
            table_id = sql_wrangle.get_drop_table_name(command)
            bq_utils.delete_table(table_id)
        else:
            bq_utils.query(command)
            time.sleep(1)
Example #8
    def test_merge_EHR(self, mock_check_cron):
        self._load_datasets()
        # enable exception propagation as described at https://goo.gl/LqDgnj
        old_dataset_items = bq_utils.list_dataset_contents(
            bq_utils.get_dataset_id())
        expected_items = ['visit_id_mapping_table']
        expected_items.extend(
            ['unioned_ehr_' + table_name for table_name in common.CDM_TABLES])

        ehr_merge.merge(bq_utils.get_dataset_id(), self.project_id)
        # check the expected tables were created in the dataset
        dataset_items = bq_utils.list_dataset_contents(
            bq_utils.get_dataset_id())
        for table_name in common.CDM_TABLES:
            cmd = 'SELECT COUNT(1) FROM unioned_ehr_{}'.format(table_name)
            result = bq_utils.query(cmd)
            self.assertEqual(
                int(result['rows'][0]['f'][0]['v']),
                2 * globals().get(table_name.upper() + '_COUNT', 0),
                msg='failed for table unioned_ehr_{}'.format(table_name))
        self.assertSetEqual(set(old_dataset_items + expected_items),
                            set(dataset_items))

        table_name = 'condition_occurrence'
        cmd_union = 'SELECT * FROM unioned_ehr_{}'.format(table_name)
        cmd_pitt = 'SELECT * FROM pitt_{}'.format(table_name)
        cmd_visit_mapping = ("SELECT global_visit_id, mapping_visit_id "
                             "FROM visit_id_mapping_table WHERE hpo='pitt'")
        qr_union = bq_utils.query(cmd_union)
        qr_pitt = bq_utils.query(cmd_pitt)
        qr_visit_mapping = bq_utils.query(cmd_visit_mapping)

        union_result = query_result_to_payload(qr_union)
        pitt_result = query_result_to_payload(qr_pitt)
        visit_mapping_result = query_result_to_payload(qr_visit_mapping)

        def get_element_from_list_of_lists(index, list_of_lists):
            return [list_item[index] for list_item in list_of_lists]

        for ind, pitt_visit_id in enumerate(
                pitt_result['VISIT_OCCURRENCE_ID']):
            if pitt_visit_id not in visit_mapping_result['MAPPING_VISIT_ID']:
                continue
            global_visit_id_index = visit_mapping_result[
                'MAPPING_VISIT_ID'].index(pitt_visit_id)
            global_visit_id = visit_mapping_result['GLOBAL_VISIT_ID'][
                global_visit_id_index]
            union_visit_id_index = union_result['VISIT_OCCURRENCE_ID'].index(
                global_visit_id)
            pitt_cols_without_id = [
                values for key, values in pitt_result.items() if key not in
                [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
            ]
            union_cols_without_id = [
                values for key, values in union_result.items() if key not in
                [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
            ]
            self.assertListEqual(
                get_element_from_list_of_lists(ind, pitt_cols_without_id),
                get_element_from_list_of_lists(union_visit_id_index,
                                               union_cols_without_id))
Example #9
def get_most_recent(app_id=None, drc_bucket=None, report_for=None):
    """
    Query audit logs for paths to the most recent datasources.json files in the DRC bucket.

    Note: Results are cached in a local json file to avoid unnecessary queries.
    :param app_id: identifies the GCP project
    :param drc_bucket: identifies the DRC bucket
    :param report_for: denotes which query to use b/w achilles and results
    :return: list of dict with keys `file_path`, `upload_timestamp`
    """
    if app_id is None:
        app_id = app_identity.get_application_id()
    if drc_bucket is None:
        drc_bucket = gcs_utils.get_drc_bucket()
    if report_for == common.REPORT_FOR_ACHILLES:
        if not os.path.exists(common.LATEST_REPORTS_JSON):
            query = LATEST_REPORTS_QUERY.format(app_id=app_id,
                                                drc_bucket=drc_bucket,
                                                year=common.LOG_YEAR)
            query_job = bq_utils.query(query)
            result = bq_utils.response2rows(query_job)
            with open(common.LATEST_REPORTS_JSON, 'w') as fp:
                json.dump(result, fp, sort_keys=True, indent=4)
        with open(common.LATEST_REPORTS_JSON, 'r') as fp:
            return json.load(fp)
    elif report_for == common.REPORT_FOR_RESULTS:
        if not os.path.exists(common.LATEST_RESULTS_JSON):
            query = LATEST_RESULTS_QUERY.format(app_id=app_id,
                                                drc_bucket=drc_bucket,
                                                year=common.LOG_YEAR)
            query_job = bq_utils.query(query)
            result = bq_utils.response2rows(query_job)
            with open(common.LATEST_RESULTS_JSON, 'w') as fp:
                json.dump(result, fp, sort_keys=True, indent=4)
        with open(common.LATEST_RESULTS_JSON, 'r') as fp:
            return json.load(fp)
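A hedged usage sketch relying only on the documented return shape (file_path, upload_timestamp); app_id and drc_bucket fall back to environment defaults when omitted:

# hypothetical call; results are served from the local cache on repeat invocations
reports = get_most_recent(report_for=common.REPORT_FOR_ACHILLES)
for report in reports:
    print(report['file_path'], report['upload_timestamp'])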
Example #10
def drop_concept_table(dataset_id):
    if bq_utils.table_exists(common.CONCEPT):
        q = "DROP TABLE {dataset}.concept;".format(dataset=dataset_id)
        try:
            bq_utils.query(q)
        except HttpError as err:
            if err.resp.status != 404:
                raise
Example #11
def load_analyses(hpo_id):
    """
    Populate achilles lookup table
    :param hpo_id:
    :return:
    """
    commands = _get_load_analysis_commands(hpo_id)
    for command in commands:
        bq_utils.query(command)
Example #12
    def test_ehr_person_to_observation(self, mock_tables_map):
        # ehr person table converts to observation records
        self._load_datasets()
        mock_tables_map.return_value = [
            common.OBSERVATION, common.LOCATION, common.CARE_SITE,
            common.VISIT_OCCURRENCE
        ]

        # perform ehr union
        ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                       self.project_id, self.hpo_ids)

        person_query = '''
            SELECT 
                person_id,
                gender_concept_id,
                gender_source_value,
                race_concept_id,
                race_source_value,
                CAST(birth_datetime AS STRING) AS birth_datetime,
                ethnicity_concept_id,
                ethnicity_source_value,
                EXTRACT(DATE FROM birth_datetime) AS birth_date
            FROM {output_dataset_id}.unioned_ehr_person
            '''.format(output_dataset_id=self.output_dataset_id)
        person_response = bq_utils.query(person_query)
        person_rows = bq_utils.response2rows(person_response)

        # construct dicts of expected values
        expected = []
        for person_row in person_rows:
            expected.extend(self.convert_ehr_person_to_observation(person_row))

        # query for observation table records
        query = '''
            SELECT person_id,
                    observation_concept_id,
                    value_as_concept_id,
                    value_as_string,
                    observation_source_value,
                    observation_date
            FROM {output_dataset_id}.unioned_ehr_observation AS obs
            WHERE obs.observation_concept_id IN ({gender_concept_id},{race_concept_id},{dob_concept_id},
            {ethnicity_concept_id})
            '''

        obs_query = query.format(
            output_dataset_id=self.output_dataset_id,
            gender_concept_id=eu_constants.GENDER_CONCEPT_ID,
            race_concept_id=eu_constants.RACE_CONCEPT_ID,
            dob_concept_id=eu_constants.DOB_CONCEPT_ID,
            ethnicity_concept_id=eu_constants.ETHNICITY_CONCEPT_ID)
        obs_response = bq_utils.query(obs_query)
        obs_rows = bq_utils.response2rows(obs_response)
        actual = obs_rows

        self.assertCountEqual(expected, actual)
Example #13
    def test_integration_queries_to_retract_from_fake_dataset(self, mock_list_datasets, mock_is_ehr_dataset):
        mock_list_datasets.return_value = [{'id': self.project_id + ':' + self.bq_dataset_id}]
        mock_is_ehr_dataset.return_value = True

        job_ids = []
        row_count_queries = {}
        # load the cdm files into dataset
        for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
            cdm_file_name = os.path.basename(cdm_file)
            cdm_table = cdm_file_name.split('.')[0]
            hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
            # store query for checking number of rows to delete
            row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(dataset_id=self.bq_dataset_id,
                                                                      table_id=hpo_table,
                                                                      pids=retract_data_bq.int_list_to_bq(
                                                                                self.person_ids))
            retract_data_bq.logger.debug('Preparing to load table %s.%s' % (self.bq_dataset_id,
                                                                            hpo_table))
            with open(cdm_file, 'rb') as f:
                gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id), cdm_file_name, f)
            result = bq_utils.load_cdm_csv(self.hpo_id, cdm_table, dataset_id=self.bq_dataset_id)
            retract_data_bq.logger.debug('Loading table %s.%s' % (self.bq_dataset_id,
                                                                  hpo_table))
            job_id = result['jobReference']['jobId']
            job_ids.append(job_id)
        incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
        self.assertEqual(len(incomplete_jobs), 0, 'NYC five person load job did not complete')
        retract_data_bq.logger.debug('All tables loaded successfully')

        # use query results to count number of expected row deletions
        expected_row_count = {}
        for table in row_count_queries:
            result = bq_utils.query(row_count_queries[table])
            expected_row_count[table] = retract_data_bq.to_int(result['totalRows'])

        # separate check to find number of actual deleted rows
        q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_before_retraction = {}
        for row in result:
            row_count_before_retraction[row['table_id']] = row['row_count']
        deid_flag = False

        # perform retraction
        retract_data_bq.run_retraction(self.test_project_id, self.person_ids, self.hpo_id, deid_flag)

        # find actual deleted rows
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_after_retraction = {}
        for row in result:
            row_count_after_retraction[row['table_id']] = row['row_count']
        for table in expected_row_count:
            self.assertEqual(expected_row_count[table],
                             row_count_before_retraction[table] - row_count_after_retraction[table])
Example #14
    def test_get_lab_concept_summary_query(self):
        summary_query = required_labs.get_lab_concept_summary_query(
            FAKE_HPO_ID)
        summary_response = bq_utils.query(summary_query)
        summary_rows = bq_utils.response2rows(summary_response)
        submitted_labs = [
            row for row in summary_rows
            if row['measurement_concept_id_exists'] == 1
        ]
        actual_total_labs = summary_response['totalRows']

        # Count the total number of labs required, this number should be equal to the total number of rows in the
        # results generated by get_lab_concept_summary_query including the submitted and missing labs.
        unique_ancestor_concept_query = sql_wrangle.qualify_tables(
            """SELECT DISTINCT ancestor_concept_id FROM `{project_id}.{dataset_id}.{table_id}`"""
            .format(project_id=self.project_id,
                    dataset_id=self.dataset_id,
                    table_id=MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE))
        unique_ancestor_concept_response = bq_utils.query(
            unique_ancestor_concept_query)
        expected_total_labs = unique_ancestor_concept_response['totalRows']

        # Count the number of labs in the measurement table, this number should be equal to the number of labs
        # submitted by the fake site
        unique_measurement_concept_id_query = '''
                SELECT
                  DISTINCT c.ancestor_concept_id
                FROM
                  `{project_id}.{dataset_id}.{measurement_concept_sets_descendants}` AS c
                JOIN
                  `{project_id}.{dataset_id}.{measurement}` AS m
                ON
                  c.descendant_concept_id = m.measurement_concept_id
                '''.format(project_id=self.project_id,
                           dataset_id=self.dataset_id,
                           measurement_concept_sets_descendants=
                           MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE,
                           measurement=bq_utils.get_table_id(
                               FAKE_HPO_ID, common.MEASUREMENT))

        unique_measurement_concept_id_response = bq_utils.query(
            unique_measurement_concept_id_query)
        unique_measurement_concept_id_total_labs = unique_measurement_concept_id_response[
            'totalRows']

        self.assertEqual(int(expected_total_labs),
                         int(actual_total_labs),
                         msg='Compare the total number of labs')
        self.assertEqual(int(unique_measurement_concept_id_total_labs),
                         len(submitted_labs),
                         msg='Compare the number of labs submitted in the measurement table')
Example #15
    def test_execute_queries(self):
        gender_nonbinary_concept_id = 1585841
        gender_nonbinary_source_concept_id = 123
        sex_female_concept_id = 1585847
        sex_female_source_concept_id = 45878463
        for tmpl in INSERT_FAKE_PARTICIPANTS_TMPLS:
            query = tmpl.render(
                project_id=self.project_id,
                dataset_id=self.dataset_id,
                gender_concept_id=repopulate_person_post_deid.GENDER_CONCEPT_ID,
                gender_nonbinary_concept_id=gender_nonbinary_concept_id,
                gender_nonbinary_source_concept_id=
                gender_nonbinary_source_concept_id,
                sex_at_birth_concept_id=repopulate_person_post_deid.
                SEX_AT_BIRTH_CONCEPT_ID,
                sex_female_concept_id=sex_female_concept_id,
                sex_female_source_concept_id=sex_female_source_concept_id)
            try:
                resp = bq_utils.query(query)
            except HttpError as e:
                self.fail("failed to execute query '{}': {}".format(
                    query, e.content))
            self.assertTrue(resp["jobComplete"])

        clean_cdr_engine.clean_dataset(
            self.project_id, self.dataset_id, self.sandbox_dataset_id,
            [(repopulate_person_post_deid.
              get_repopulate_person_post_deid_queries,)])

        rows = bq_utils.response2rows(
            bq_utils.query("SELECT * FROM `{}.{}.person`".format(
                self.project_id, self.dataset_id)))
        self.assertEqual(len(rows), 2)

        by_participant = {r["person_id"]: r for r in rows}
        self.assertPersonFields(
            by_participant[1], {
                "gender_concept_id": gender_nonbinary_concept_id,
                "gender_source_value": "nonbinary_src",
                "gender_source_concept_id": gender_nonbinary_source_concept_id,
                "sex_at_birth_concept_id": sex_female_concept_id,
                "sex_at_birth_source_value": "female_src",
                "sex_at_birth_source_concept_id": sex_female_source_concept_id
            })
        self.assertPersonFields(
            by_participant[2], {
                "gender_concept_id": 0,
                "gender_source_value": "No matching concept",
                "gender_source_concept_id": 0,
                "sex_at_birth_concept_id": 0,
                "sex_at_birth_source_value": "No matching concept",
                "sex_at_birth_source_concept_id": 0
            })
Example #16
    def tearDown(self):
        test_util.delete_all_tables(self.dataset_id)
        # Delete concept table
        drop_concept_table(self.dataset_id)

        # recreate the concept table
        q = """CREATE OR REPLACE TABLE `{project}.{dataset}.concept` AS (
            SELECT * FROM `{project}.{vocab}.concept`)""".format(
            project=self.project_id,
            dataset=self.dataset_id,
            vocab=common.VOCABULARY_DATASET)
        bq_utils.query(q)
Example #17
def drop_or_truncate_table(command):
    """
    Deletes or truncates table
    :param command: query to run
    :return: None
    """
    if sql_wrangle.is_truncate(command):
        table_id = sql_wrangle.get_truncate_table_name(command)
        query = 'DELETE FROM %s WHERE TRUE' % table_id
        bq_utils.query(query)
    else:
        table_id = sql_wrangle.get_drop_table_name(command)
        bq_utils.delete_table(table_id)
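An illustrative invocation with hypothetical table names; the exact statement syntax that sql_wrangle recognizes is assumed:

drop_or_truncate_table('TRUNCATE TABLE synpuf_100.achilles_results')  # routed to DELETE ... WHERE TRUE
drop_or_truncate_table('DROP TABLE synpuf_100.tempresults')  # routed to bq_utils.delete_table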
Example #18
def copy_metadata_table(project_id, source_dataset_id, target_dataset_id,
                        table_fields):
    """

    :param project_id:
    :param source_dataset_id:
    :param target_dataset_id:
    :param table_fields:
    :return:
    """
    create_metadata_table(target_dataset_id, table_fields)
    query = COPY_QUERY.format(project=project_id, dataset=source_dataset_id)
    bq_utils.query(query,
                   destination_dataset_id=target_dataset_id,
                   destination_table_id=METADATA_TABLE)
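A usage sketch with hypothetical identifiers; table_fields follows the same schema-dict format shown in Example #4:

fields = [{'type': 'string', 'name': 'etl_version', 'mode': 'nullable'}]  # hypothetical schema
copy_metadata_table('my-project', 'rdr_snapshot', 'combined', fields)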
Example #19
 def test_load_ehr_observation(self):
     hpo_id = 'pitt'
     dataset_id = self.dataset_id
     table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
     q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
         dataset_id=dataset_id, table_id=table_id)
     expected_observation_ids = [
         int(row['observation_id'])
         for row in resources.csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
     ]
     sc_bucket = self.client.get_bucket(gcs_utils.get_hpo_bucket(hpo_id))
     bucket_blob = sc_bucket.blob('observation.csv')
     with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
         bucket_blob.upload_from_file(fp)
     result = bq_utils.load_cdm_csv(hpo_id, 'observation')
     job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([job_id])
     self.assertEqual(len(incomplete_jobs), 0,
                      'pitt_observation load job did not complete')
     load_job_result = bq_utils.get_job_details(job_id)
     load_job_result_status = load_job_result['status']
     load_job_errors = load_job_result_status.get('errors')
     self.assertIsNone(load_job_errors,
                       msg='pitt_observation load job failed: ' +
                       str(load_job_errors))
     query_results_response = bq_utils.query(q)
     query_job_errors = query_results_response.get('errors')
     self.assertIsNone(query_job_errors)
     actual_result = [
         int(row['f'][0]['v']) for row in query_results_response['rows']
     ]
     self.assertCountEqual(actual_result, expected_observation_ids)
Example #20
    def test_consented_person_id(self):
        """
        Test observation data has seven (7) persons with consent records as described below
         1: No
         2: Yes
         3: NULL
         4: No  followed by Yes
         5: Yes followed by No
         6: Yes followed by NULL
         7: NULL and Yes with same date/time
        """
        # sanity check
        # pre-conditions
        self.assertFalse(
            bq_utils.table_exists(EHR_CONSENT_TABLE_ID,
                                  self.combined_dataset_id))

        # test
        ehr_consent()

        # post conditions
        self.assertTrue(
            bq_utils.table_exists(EHR_CONSENT_TABLE_ID,
                                  self.combined_dataset_id),
            'Table {dataset}.{table} created by consented_person'.format(
                dataset=self.combined_dataset_id, table=EHR_CONSENT_TABLE_ID))
        response = bq_utils.query('SELECT * FROM {dataset}.{table}'.format(
            dataset=self.combined_dataset_id, table=EHR_CONSENT_TABLE_ID))
        rows = bq_utils.response2rows(response)
        expected = {2, 4}
        actual = set(row['person_id'] for row in rows)
        self.assertSetEqual(
            expected, actual, 'Records in {dataset}.{table}'.format(
                dataset=self.combined_dataset_id, table=EHR_CONSENT_TABLE_ID))
Example #21
 def _all_rdr_records_included(self):
     """
     All rdr records are included whether or not there is corresponding ehr record
     """
     for domain_table in DOMAIN_TABLES:
         mapping_table = mapping_table_for(domain_table)
         query = (
             'SELECT rt.{domain_table}_id as id '
             'FROM `{rdr_dataset_id}.{domain_table}` AS rt '
             'LEFT JOIN `{combined_dataset_id}.{mapping_table}` AS m '
             'ON rt.{domain_table}_id = m.src_{domain_table}_id '
             'WHERE '
             '  m.{domain_table}_id IS NULL '
             'OR NOT EXISTS '
             ' (SELECT 1 FROM `{combined_dataset_id}.{domain_table}` AS t '
             '  WHERE t.{domain_table}_id = m.{domain_table}_id)').format(
                 domain_table=domain_table,
                 rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
                 combined_dataset_id=bq_utils.get_combined_dataset_id(),
                 mapping_table=mapping_table)
         response = bq_utils.query(query)
         rows = bq_utils.response2rows(response)
         self.assertEqual(
             0, len(rows),
             "RDR records should map to records in mapping and combined tables"
         )
Example #22
 def _ehr_only_records_excluded(self):
     """
     EHR person records which are missing from RDR are excluded from combined
     """
     query = ('WITH ehr_only AS '
              ' (SELECT person_id '
              '  FROM `{ehr_dataset_id}.person` AS ep '
              '  WHERE NOT EXISTS '
              '    (SELECT 1 '
              '     FROM `{rdr_dataset_id}.person` AS rp '
              '     WHERE rp.person_id = ep.person_id) '
              ' ) '
              'SELECT '
              'ehr_only.person_id AS ehr_person_id, '
              'p.person_id AS combined_person_id '
              'FROM ehr_only '
              'LEFT JOIN `{combined_dataset_id}.person` AS p '
              'ON ehr_only.person_id = p.person_id').format(
                  ehr_dataset_id=self.ehr_dataset_id,
                  rdr_dataset_id=self.rdr_dataset_id,
                  combined_dataset_id=self.combined_dataset_id)
     response = bq_utils.query(query)
     rows = bq_utils.response2rows(response)
     self.assertGreater(len(rows), 0,
                        'Test data is missing EHR-only records')
     for row in rows:
         combined_person_id = row['combined_person_id']
         self.assertIsNone(
             combined_person_id,
             'EHR-only person_id `{ehr_person_id}` found in combined when it should be excluded'
             .format(ehr_person_id=row['ehr_person_id']))
Example #23
    def test_copy_rdr_tables(self):
        for table in RDR_TABLES_TO_COPY:
            self.assertFalse(
                bq_utils.table_exists(
                    table, self.combined_dataset_id))  # sanity check
            copy_rdr_table(table)
            actual = bq_utils.table_exists(table, self.combined_dataset_id)
            self.assertTrue(
                actual,
                msg='RDR table {table} should be copied'.format(table=table))

            # Check that row count in combined is same as rdr
            query = (
                'WITH rdr AS '
                ' (SELECT COUNT(1) n FROM `{rdr_dataset_id}.{table}`), '
                'combined AS '
                ' (SELECT COUNT(1) n FROM `{combined_dataset_id}.{table}`) '
                'SELECT '
                'rdr.n AS rdr_count, '
                'combined.n AS combined_count '
                'FROM rdr, combined ').format(
                    rdr_dataset_id=self.rdr_dataset_id,
                    combined_dataset_id=self.combined_dataset_id,
                    table=table)
            response = bq_utils.query(query)
            rows = bq_utils.response2rows(response)
            self.assertTrue(len(rows) == 1)  # sanity check
            row = rows[0]
            rdr_count, combined_count = row['rdr_count'], row['combined_count']
            msg_fmt = 'Table {table} has {rdr_count} in rdr and {combined_count} in combined (expected to be equal)'
            self.assertEqual(
                rdr_count, combined_count,
                msg_fmt.format(table=table,
                               rdr_count=rdr_count,
                               combined_count=combined_count))
Example #24
 def _all_rdr_records_included(self):
     """
     All rdr records are included whether or not there is corresponding ehr record
     """
     for domain_table in DOMAIN_TABLES:
         mapping_table = mapping_table_for(domain_table)
         q = '''SELECT rt.{domain_table}_id as id
            FROM {rdr_dataset_id}.{domain_table} rt
            LEFT JOIN {ehr_rdr_dataset_id}.{mapping_table} m
            ON rt.{domain_table}_id = m.src_{domain_table}_id
            WHERE
              m.{domain_table}_id IS NULL
            OR NOT EXISTS
              (SELECT 1 FROM {ehr_rdr_dataset_id}.{domain_table} t
               WHERE t.{domain_table}_id = m.{domain_table}_id)'''.format(
             domain_table=domain_table,
             rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
             ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
             mapping_table=mapping_table)
         response = bq_utils.query(q)
         rows = test_util.response2rows(response)
         self.assertEqual(
             0, len(rows),
             "RDR records should map to records in mapping and combined tables"
         )
Example #25
def export_from_path(p, hpo_id):
    """
    Export results
    :param p: path to SQL file
    :param hpo_id: HPO to run export for
    :return: `dict` structured for report render
    """
    result = dict()
    for f in list_files_only(p):
        name = f[0:-4].upper()
        abs_path = os.path.join(p, f)
        with open(abs_path, 'r') as fp:
            sql = fp.read()
            sql = render(sql,
                         hpo_id,
                         results_schema=bq_utils.get_dataset_id(),
                         vocab_schema='')
            query_result = bq_utils.query(sql)
            # TODO reshape results
            result[name] = query_result_to_payload(query_result)

    for d in list_dirs_only(p):
        abs_path = os.path.join(p, d)
        name = d.upper()
        # recursive call
        dir_result = export_from_path(abs_path, hpo_id)
        if name in result:
            # a sql file generated the item already
            result[name].update(dir_result)
        else:
            # add the item
            result[name] = dir_result
    return result
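A hedged example of kicking off an export, assuming a directory of per-metric .sql files (the path and HPO id are hypothetical):

report_payload = export_from_path('export/datadensity', 'fake_hpo')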
Example #26
def query(q, dst_table_id, dst_dataset_id, write_disposition='WRITE_APPEND'):
    """
    Run query and save results to a table

    :param q: SQL statement
    :param dst_table_id: save results in a table with the specified id
    :param dst_dataset_id: identifies output dataset
    :param write_disposition: WRITE_TRUNCATE, WRITE_EMPTY, or WRITE_APPEND (default, to preserve schema)
    :return: query result
    """
    query_job_result = bq_utils.query(q,
                                      destination_table_id=dst_table_id,
                                      destination_dataset_id=dst_dataset_id,
                                      write_disposition=write_disposition)
    query_job_id = query_job_result['jobReference']['jobId']
    logging.info(f'Job {query_job_id} started for table {dst_table_id}')
    job_status = query_job_result['status']
    error_result = job_status.get('errorResult')
    if error_result is not None:
        msg = f'Job {query_job_id} failed because: {error_result}'
        raise bq_utils.InvalidOperationError(msg)
    incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
    if len(incomplete_jobs) > 0:
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
    return query_job_result
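A minimal usage sketch of this wrapper; the query, table, and dataset names are hypothetical:

result = query('SELECT person_id FROM `my-project.rdr.person`',
               dst_table_id='person_backup',
               dst_dataset_id='sandbox',
               write_disposition='WRITE_TRUNCATE')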
Example #27
    def test_load_csv(self):
        from google.appengine.api import app_identity

        app_id = app_identity.get_application_id()
        table_name = 'achilles_analysis'
        schema_file_name = table_name + '.json'
        csv_file_name = table_name + '.csv'
        schema_path = os.path.join(resources.fields_path, schema_file_name)
        local_csv_path = os.path.join(test_util.TEST_DATA_EXPORT_PATH,
                                      csv_file_name)
        with open(local_csv_path, 'r') as fp:
            response = gcs_utils.upload_object(self.hpo_bucket, csv_file_name,
                                               fp)
        hpo_bucket = self.hpo_bucket
        gcs_object_path = 'gs://%(hpo_bucket)s/%(csv_file_name)s' % locals()
        dataset_id = bq_utils.get_dataset_id()
        load_results = bq_utils.load_csv(schema_path, gcs_object_path, app_id,
                                         dataset_id, table_name)

        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0,
                         'loading table {} timed out'.format(table_name))
        query_response = bq_utils.query('SELECT COUNT(1) FROM %(table_name)s' %
                                        locals())
        self.assertEqual(query_response['kind'], 'bigquery#queryResponse')
Example #28
    def test_integration_create_drug_route_mappings_table(self):
        if bq_utils.table_exists(populate_route_ids.DRUG_ROUTES_TABLE_ID,
                                 dataset_id=self.dataset_id):
            bq_utils.delete_table(populate_route_ids.DRUG_ROUTES_TABLE_ID,
                                  dataset_id=self.dataset_id)

        if not bq_utils.table_exists(
                populate_route_ids.DOSE_FORM_ROUTES_TABLE_ID,
                dataset_id=self.dataset_id):
            populate_route_ids.create_dose_form_route_mappings_table(
                self.project_id, self.dataset_id)

        populate_route_ids.create_drug_route_mappings_table(
            self.project_id, self.dataset_id,
            populate_route_ids.DOSE_FORM_ROUTES_TABLE_ID,
            self.route_mapping_prefix)
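        # allow the newly created mappings table time to become queryable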
        time.sleep(10)
        query = ("SELECT COUNT(*) AS n "
                 "FROM `{project_id}.{dataset_id}.{table_id}`").format(
                     project_id=self.project_id,
                     dataset_id=self.dataset_id,
                     table_id=populate_route_ids.DRUG_ROUTES_TABLE_ID)

        result = bq_utils.query(query)
        actual = bq_utils.response2rows(result)
        self.assertGreater(actual[0]["n"], 0)
Example #29
def _export_query_response_by_path(p, hpo_id):
    """Utility to create response test payloads"""

    import json
    from validation import export

    for f in export.list_files_only(p):
        abs_path = os.path.join(p, f)
        with open(abs_path, 'r') as fp:
            sql = fp.read()
            sql = export.render(sql,
                                hpo_id,
                                results_schema=bq_utils.get_dataset_id(),
                                vocab_schema='synpuf_100')
            query_result = bq_utils.query(sql)
            out_file = os.path.join(TEST_DATA_EXPORT_PATH,
                                    f.replace('.sql', '_response.json'))
            with open(out_file, 'w') as fp:
                data = dict()
                if 'rows' in query_result:
                    data['rows'] = query_result['rows']
                if 'schema' in query_result:
                    data['schema'] = query_result['schema']
                json.dump(data,
                          fp,
                          sort_keys=True,
                          indent=4,
                          separators=(',', ': '))
Example #30
def most_common_heel_errors(app_id=None, dataset_id=None, hpo_ids=None):
    """
    :param app_id: Application Id
    :param dataset_id: Dataset Id
    :param hpo_ids: list of Hpo_ids
    :return: None
    """
    heel_errors = list()
    if app_id is None:
        app_id = app_identity.get_application_id()
    if dataset_id is None:
        dataset_id = bq_utils.get_dataset_id()
    if not os.path.exists(HEEL_ERRORS_JSON) and not os.path.exists(
            HEEL_ERRORS_CSV):
        for hpo_id in hpo_ids:
            if bq_utils.table_exists(
                    table_id='{hpo_id}_achilles_heel_results'.format(
                        hpo_id=hpo_id),
                    dataset_id=dataset_id):
                query = heel_error_query.format(app_id=app_id,
                                                dataset_id=dataset_id,
                                                hpo_id=hpo_id)
                query_job = bq_utils.query(query)
                result = bq_utils.response2rows(query_job)
                heel_errors.extend(result)
        with open(HEEL_ERRORS_JSON, 'w') as fp:
            json.dump(heel_errors, fp, sort_keys=True, indent=4)
        parse_json_csv()
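A sketch of a typical call with hypothetical HPO ids; results are written to HEEL_ERRORS_JSON and converted to CSV by parse_json_csv:

most_common_heel_errors(hpo_ids=['fake', 'pitt'])  # app_id and dataset_id fall back to defaults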