Example #1
def populate_achilles(hpo_bucket, hpo_id=FAKE_HPO_ID, include_heel=True):
    from validation import achilles, achilles_heel
    import app_identity

    app_id = app_identity.get_application_id()

    test_file_name = achilles.ACHILLES_ANALYSIS + '.csv'
    achilles_analysis_file_path = os.path.join(TEST_DATA_EXPORT_PATH,
                                               test_file_name)
    schema_name = achilles.ACHILLES_ANALYSIS
    write_cloud_file(hpo_bucket, achilles_analysis_file_path)
    gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
    dataset_id = bq_utils.get_dataset_id()
    table_id = bq_utils.get_table_id(hpo_id, achilles.ACHILLES_ANALYSIS)
    bq_utils.load_csv(schema_name, gcs_path, app_id, dataset_id, table_id)

    table_names = [achilles.ACHILLES_RESULTS, achilles.ACHILLES_RESULTS_DIST]
    if include_heel:
        table_names.append(achilles_heel.ACHILLES_HEEL_RESULTS)

    running_jobs = []
    for table_name in table_names:
        test_file_name = table_name + '.csv'
        test_file_path = os.path.join(TEST_DATA_EXPORT_SYNPUF_PATH,
                                      table_name + '.csv')
        write_cloud_file(hpo_bucket, test_file_path)
        gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
        dataset_id = bq_utils.get_dataset_id()
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        load_results = bq_utils.load_csv(table_name, gcs_path, app_id,
                                         dataset_id, table_id)
        running_jobs.append(load_results['jobReference']['jobId'])
    bq_utils.wait_on_jobs(running_jobs)
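Example #29 below shows test_util.populate_achilles being called with just a bucket; a minimal sketch of the full call, assuming the FAKE_HPO_ID constant referenced above and the gcs_utils.get_hpo_bucket lookup used in the later examples:

    # Sketch only: stage and load the Achilles tables for the fake HPO.
    hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
    populate_achilles(hpo_bucket, hpo_id=FAKE_HPO_ID, include_heel=True)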
Example #2
    def test_merge_EHR(self, mock_check_cron):
        self._load_datasets()
        # enable exception propagation as described at https://goo.gl/LqDgnj
        old_dataset_items = bq_utils.list_dataset_contents(
            bq_utils.get_dataset_id())
        expected_items = ['visit_id_mapping_table']
        expected_items.extend(
            ['unioned_ehr_' + table_name for table_name in common.CDM_TABLES])

        ehr_merge.merge(bq_utils.get_dataset_id(), self.project_id)
        # check the result files were placed in bucket
        dataset_items = bq_utils.list_dataset_contents(
            bq_utils.get_dataset_id())
        for table_name in common.CDM_TABLES:
            cmd = 'SELECT COUNT(1) FROM unioned_ehr_{}'.format(table_name)
            result = bq_utils.query(cmd)
            self.assertEqual(
                int(result['rows'][0]['f'][0]['v']),
                2 * globals().get(table_name.upper() + '_COUNT', 0),
                msg='failed for table unioned_ehr_{}'.format(table_name))
        self.assertSetEqual(set(old_dataset_items + expected_items),
                            set(dataset_items))

        table_name = 'condition_occurrence'
        cmd_union = 'SELECT * FROM unioned_ehr_{}'.format(table_name)
        cmd_pitt = 'SELECT * FROM pitt_{}'.format(table_name)
        cmd_visit_mapping = "SELECT global_visit_id, mapping_visit_id FROM visit_id_mapping_table where hpo='pitt'"
        qr_union = bq_utils.query(cmd_union)
        qr_pitt = bq_utils.query(cmd_pitt)
        qr_visit_mapping = bq_utils.query(cmd_visit_mapping)

        union_result = query_result_to_payload(qr_union)
        pitt_result = query_result_to_payload(qr_pitt)
        visit_mapping_result = query_result_to_payload(qr_visit_mapping)

        def get_element_from_list_of_lists(index, list_of_lists):
            return [list_item[index] for list_item in list_of_lists]

        for ind, pitt_visit_id in enumerate(
                pitt_result['VISIT_OCCURRENCE_ID']):
            if pitt_visit_id not in visit_mapping_result['MAPPING_VISIT_ID']:
                continue
            global_visit_id_index = visit_mapping_result[
                'MAPPING_VISIT_ID'].index(pitt_visit_id)
            global_visit_id = visit_mapping_result['GLOBAL_VISIT_ID'][
                global_visit_id_index]
            union_visit_id_index = union_result['VISIT_OCCURRENCE_ID'].index(
                global_visit_id)
            pitt_cols_without_id = [
                values for key, values in pitt_result.items() if key not in
                [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
            ]
            union_cols_without_id = [
                values for key, values in union_result.items() if key not in
                [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
            ]
            self.assertListEqual(
                get_element_from_list_of_lists(ind, pitt_cols_without_id),
                get_element_from_list_of_lists(union_visit_id_index,
                                               union_cols_without_id))
Example #3
def mapping_query(domain_table):
    """
    Returns query used to get mapping of all records from RDR combined with EHR records of consented participants

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence')
    :return:
    """

    if combine_consts.PERSON_ID in [
            field['name'] for field in resources.fields_for(domain_table)
    ]:
        return combine_consts.MAPPING_QUERY_WITH_PERSON_CHECK.format(
            rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
            ehr_dataset_id=bq_utils.get_dataset_id(),
            ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
            domain_table=domain_table,
            mapping_constant=common.RDR_ID_CONSTANT,
            ehr_consent_table_id=combine_consts.EHR_CONSENT_TABLE_ID)
    else:
        return combine_consts.MAPPING_QUERY_WITHOUT_PERSON_CHECK.format(
            rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
            ehr_dataset_id=bq_utils.get_dataset_id(),
            ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
            domain_table=domain_table,
            mapping_constant=common.RDR_ID_CONSTANT)
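The rendered SQL can be executed with the same bq_utils helpers used throughout these examples; a minimal sketch (the domain table name is only illustrative, and the response2rows conversion mirrors Examples #13 and #26):

    # Sketch only: run the generated mapping query and convert the raw
    # BigQuery response into plain dict rows.
    response = bq_utils.query(mapping_query('visit_occurrence'))
    mapping_rows = bq_utils.response2rows(response)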
Example #4
    def test_merge_with_unmatched_schema(self):
        running_jobs = []
        with open(NYC_FIVE_PERSONS_MEASUREMENT_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                    'measurement.csv', fp)
        result = bq_utils.load_cdm_csv('nyc', 'measurement')
        running_jobs.append(result['jobReference']['jobId'])

        with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                    'person.csv', fp)
        result = bq_utils.load_cdm_csv('pitt', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        self.assertEqual(
            len(incomplete_jobs), 0,
            'loading tables {},{} timed out'.format('nyc_measurement',
                                                    'pitt_person'))

        table_names = ['nyc_measurement', 'pitt_person']
        success, error = bq_utils.merge_tables(bq_utils.get_dataset_id(),
                                               table_names,
                                               bq_utils.get_dataset_id(),
                                               'merged_nyc_pitt')
        self.assertFalse(success)
Example #5
    def test_merge_bad_table_names(self):
        table_ids = ['nyc_person_foo', 'pitt_person_foo']
        success_flag, error_msg = bq_utils.merge_tables(
            bq_utils.get_dataset_id(), table_ids, bq_utils.get_dataset_id(),
            'merged_nyc_pitt')

        self.assertFalse(success_flag)
Example #6
    def test_merge_with_good_data(self):
        running_jobs = []
        with open(NYC_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                    'person.csv', fp)
        result = bq_utils.load_cdm_csv('nyc', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                    'person.csv', fp)
        result = bq_utils.load_cdm_csv('pitt', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        nyc_person_ids = [
            int(row['person_id'])
            for row in resources._csv_to_list(NYC_FIVE_PERSONS_PERSON_CSV)
        ]
        pitt_person_ids = [
            int(row['person_id'])
            for row in resources._csv_to_list(PITT_FIVE_PERSONS_PERSON_CSV)
        ]
        expected_result = nyc_person_ids + pitt_person_ids
        expected_result.sort()

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        self.assertEqual(
            len(incomplete_jobs), 0,
            'loading tables {},{} timed out'.format('nyc_person',
                                                    'pitt_person'))

        table_ids = ['nyc_person', 'pitt_person']
        success_flag, error = bq_utils.merge_tables(bq_utils.get_dataset_id(),
                                                    table_ids,
                                                    bq_utils.get_dataset_id(),
                                                    'merged_nyc_pitt')

        self.assertTrue(success_flag)
        self.assertEqual(error, "")

        query_string = "SELECT person_id FROM {}.{} LIMIT 1000".format(
            bq_utils.get_dataset_id(), 'merged_nyc_pitt')

        merged_query_job_result = bq_utils.query_table(query_string)

        self.assertIsNone(merged_query_job_result.get('errors', None))
        actual_result = [
            int(row['f'][0]['v']) for row in merged_query_job_result['rows']
        ]
        actual_result.sort()
        self.assertListEqual(expected_result, actual_result)
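The row['f'][0]['v'] indexing above follows the raw BigQuery REST response layout, where each row carries an 'f' list of cells and each cell stores its value under 'v'. A small illustrative helper (not part of bq_utils) for flattening such a response:

    def rows_to_tuples(query_response):
        # each row looks like {'f': [{'v': '123'}, {'v': 'abc'}, ...]}
        return [tuple(cell['v'] for cell in row['f'])
                for row in query_response.get('rows', [])]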
Example #7
def clean_ehr_dataset(project_id=None, dataset_id=None):
    """
    Run all clean rules defined for the ehr dataset.

    :param project_id:  Name of the BigQuery project.
    :param dataset_id:  Name of the dataset to clean
    """
    if project_id is None:
        project_id = app_identity.get_application_id()
        LOGGER.info('Project is unspecified.  Using default value of:\t%s',
                    project_id)

    if dataset_id is None:
        dataset_id = bq_utils.get_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s',
                    dataset_id)

    sandbox_dataset_id = sandbox.create_sandbox_dataset(project_id=project_id,
                                                        dataset_id=dataset_id)

    query_list = _gather_ehr_queries(project_id, dataset_id,
                                     sandbox_dataset_id)

    LOGGER.info("Cleaning ehr_dataset")
    clean_engine.clean_dataset(project_id, query_list, stage.EHR)
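A minimal invocation sketch; both arguments fall back to the environment-driven defaults resolved inside the function (the explicit names below are placeholders):

    clean_ehr_dataset()  # resolve project and dataset from defaults
    clean_ehr_dataset(project_id='my-gcp-project', dataset_id='my_ehr_dataset')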
Example #8
def get_lab_concept_summary_query(hpo_id):
    """
    Get the query that checks if the HPO site has submitted the required labs
    :param hpo_id: 
    :return: 
    """
    project_id = app_identity.get_application_id()
    dataset_id = bq_utils.get_dataset_id()
    hpo_measurement_table = bq_utils.get_table_id(hpo_id, common.MEASUREMENT)

    # Create measurement_concept_sets_table if not exist
    if not bq_utils.table_exists(MEASUREMENT_CONCEPT_SETS_TABLE, dataset_id):
        load_measurement_concept_sets_table(project_id, dataset_id)

    # Create measurement_concept_sets_descendants_table if not exist
    if not bq_utils.table_exists(MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE,
                                 dataset_id):
        load_measurement_concept_sets_descendants_table(project_id, dataset_id)

    return CHECK_REQUIRED_LAB_QUERY.format(
        project_id=project_id,
        ehr_ops_dataset_id=dataset_id,
        hpo_measurement_table=hpo_measurement_table,
        measurement_concept_sets_descendants=
        MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE)
Example #9
def mapping_query(domain_table):
    """
    Returns query used to get mapping of all records from RDR combined with EHR records of consented participants

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence')
    :return:
    """

    return '''SELECT DISTINCT
          '{rdr_dataset_id}'  AS src_dataset_id,
          {domain_table}_id  AS src_{domain_table}_id,
          'rdr' as src_hpo_id,
          {domain_table}_id + {mapping_constant}  AS {domain_table}_id
        FROM {rdr_dataset_id}.{domain_table}

        UNION ALL

        SELECT DISTINCT
          '{ehr_dataset_id}'  AS src_dataset_id,
          t.{domain_table}_id AS src_{domain_table}_id,
          v.src_hpo_id AS src_hpo_id,
          t.{domain_table}_id  AS {domain_table}_id
        FROM {ehr_dataset_id}.{domain_table} t
        JOIN {ehr_dataset_id}._mapping_{domain_table}  v on t.{domain_table}_id = v.{domain_table}_id
        WHERE EXISTS
           (SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c
            WHERE t.person_id = c.person_id)
    '''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
               ehr_dataset_id=bq_utils.get_dataset_id(),
               ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
               domain_table=domain_table,
               mapping_constant=common.RDR_ID_CONSTANT,
               ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
Example #10
def mapping_query(domain_table):
    """
    Returns query used to get mapping of all records from RDR combined with EHR records of consented participants

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence')
    :return:
    """
    return '''
    WITH all_records AS
    (
        SELECT
          '{rdr_dataset_id}'  AS src_dataset_id, 
          {domain_table}_id AS src_{domain_table}_id
        FROM {rdr_dataset_id}.{domain_table}

        UNION ALL

        SELECT
          '{ehr_dataset_id}'  AS src_dataset_id, 
          {domain_table}_id AS src_{domain_table}_id
        FROM {ehr_dataset_id}.{domain_table} t
        WHERE EXISTS
           (SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c 
            WHERE t.person_id = c.person_id)
    )
    SELECT 
      ROW_NUMBER() OVER (ORDER BY src_dataset_id, src_{domain_table}_id) AS {domain_table}_id,
      src_dataset_id,
      src_{domain_table}_id
    FROM all_records
    '''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
               ehr_dataset_id=bq_utils.get_dataset_id(),
               ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
               domain_table=domain_table,
               ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
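The ROW_NUMBER() OVER (ORDER BY ...) clause assigns a fresh, dense surrogate id to every record once the RDR rows and the consented EHR rows are unioned. The same idea, sketched in plain Python for illustration only:

    def assign_surrogate_ids(records):
        # records: iterable of (src_dataset_id, src_record_id) pairs
        ordered = sorted(records)  # ORDER BY src_dataset_id, src_{domain_table}_id
        return [(new_id, src_dataset, src_id)
                for new_id, (src_dataset, src_id) in enumerate(ordered, start=1)]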
Example #11
 def setUp(self):
     self.app_id = app_identity.get_application_id()
     self.dataset_id = bq_utils.get_dataset_id()
     self.bucket = gcs_utils.get_drc_bucket()
     test_util.empty_bucket(self.bucket)
     test_util.delete_all_tables(self.dataset_id)
     self.load_test_data(hpo_id=HPO_NYC)
Example #12
 def setUp(self):
     self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
     self.dataset = bq_utils.get_dataset_id()
     self.project_id = app_identity.get_application_id()
     self.storage_client = StorageClient(self.project_id)
     self.storage_client.empty_bucket(self.hpo_bucket)
     test_util.delete_all_tables(self.dataset)
Example #13
def most_common_heel_errors(app_id=None, dataset_id=None, hpo_ids=None):
    """
    :param app_id: Application Id
    :param dataset_id: Dataset Id
    :param hpo_ids: list of Hpo_ids
    :return: None
    """
    heel_errors = list()
    if app_id is None:
        app_id = app_identity.get_application_id()
    if dataset_id is None:
        dataset_id = bq_utils.get_dataset_id()
    if not os.path.exists(HEEL_ERRORS_JSON) and not os.path.exists(
            HEEL_ERRORS_CSV):
        for hpo_id in hpo_ids:
            if bq_utils.table_exists(
                    table_id='{hpo_id}_achilles_heel_results'.format(
                        hpo_id=hpo_id),
                    dataset_id=dataset_id):
                query = heel_error_query.format(app_id=app_id,
                                                dataset_id=dataset_id,
                                                hpo_id=hpo_id)
                query_job = bq_utils.query(query)
                result = bq_utils.response2rows(query_job)
                heel_errors.extend(result)
    with open(HEEL_ERRORS_JSON, 'w') as fp:
        json.dump(heel_errors, fp, sort_keys=True, indent=4)
    parse_json_csv()
Example #14
def create_dose_form_route_mappings_table(project_id, dataset_id=None):
    """
    Creates "_logging_dose_form_route_mappings" table with only id columns from resources/dose_form_route_mappings.csv

    :param project_id:
    :param dataset_id: BQ dataset_id
    :return: upload metadata for created table
    """
    if dataset_id is None:
        # Using table created in bq_dataset instead of re-creating in every dataset
        dataset_id = bq_utils.get_dataset_id()

    dose_form_routes_table_id = DOSE_FORM_ROUTES_TABLE_ID

    LOGGER.info("Creating %s.%s", dataset_id, DOSE_FORM_ROUTES_TABLE_ID)

    # create empty table
    bq_utils.create_table(DOSE_FORM_ROUTES_TABLE_ID,
                          DOSE_FORM_ROUTE_FIELDS,
                          drop_existing=True,
                          dataset_id=dataset_id)

    dose_form_route_mappings_csv = os.path.join(resources.resource_path,
                                                DOSE_FORM_ROUTES_FILE + ".csv")
    dose_form_route_mappings_list = resources.csv_to_list(
        dose_form_route_mappings_csv)
    dose_form_routes_populate_query = INSERT_ROUTES_QUERY.format(
        dataset_id=dataset_id,
        project_id=project_id,
        routes_table_id=DOSE_FORM_ROUTES_TABLE_ID,
        mapping_list=get_mapping_list(dose_form_route_mappings_list))
    result = bq_utils.query(dose_form_routes_populate_query)
    LOGGER.info("Created %s.%s", dataset_id, dose_form_routes_table_id)
    return result
Example #15
def export_from_path(p, hpo_id):
    """
    Export results
    :param p: path to SQL file
    :param hpo_id: HPO to run export for
    :return: `dict` structured for report render
    """
    result = dict()
    for f in list_files_only(p):
        name = f[0:-4].upper()
        abs_path = os.path.join(p, f)
        with open(abs_path, 'r') as fp:
            sql = fp.read()
            sql = render(sql,
                         hpo_id,
                         results_schema=bq_utils.get_dataset_id(),
                         vocab_schema='')
            query_result = bq_utils.query(sql)
            # TODO reshape results
            result[name] = query_result_to_payload(query_result)

    for d in list_dirs_only(p):
        abs_path = os.path.join(p, d)
        name = d.upper()
        # recursive call
        dir_result = export_from_path(abs_path, hpo_id)
        if name in result:
            # a sql file generated the item already
            result[name].update(dir_result)
        else:
            # add the item
            result[name] = dir_result
    return result
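A minimal sketch of driving this exporter and persisting its payload, reusing the json.dump pattern from Example #17 (the export path and HPO id below are placeholders):

    import json

    # Sketch only: render every SQL file under a report directory for one HPO.
    report = export_from_path('validation/export/achilles', 'fake')
    with open('achilles_report.json', 'w') as out:
        json.dump(report, out, sort_keys=True, indent=4)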
Example #16
    def test_load_csv(self):
        from google.appengine.api import app_identity

        app_id = app_identity.get_application_id()
        table_name = 'achilles_analysis'
        schema_file_name = table_name + '.json'
        csv_file_name = table_name + '.csv'
        schema_path = os.path.join(resources.fields_path, schema_file_name)
        local_csv_path = os.path.join(test_util.TEST_DATA_EXPORT_PATH,
                                      csv_file_name)
        with open(local_csv_path, 'r') as fp:
            response = gcs_utils.upload_object(self.hpo_bucket, csv_file_name,
                                               fp)
        hpo_bucket = self.hpo_bucket
        gcs_object_path = 'gs://%(hpo_bucket)s/%(csv_file_name)s' % locals()
        dataset_id = bq_utils.get_dataset_id()
        load_results = bq_utils.load_csv(schema_path, gcs_object_path, app_id,
                                         dataset_id, table_name)

        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0,
                         'loading table {} timed out'.format(table_name))
        query_response = bq_utils.query('SELECT COUNT(1) FROM %(table_name)s' %
                                        locals())
        self.assertEqual(query_response['kind'], 'bigquery#queryResponse')
Example #17
def _export_query_response_by_path(p, hpo_id):
    """Utility to create response test payloads"""

    from validation import export

    for f in export.list_files_only(p):
        abs_path = os.path.join(p, f)
        with open(abs_path, 'r') as fp:
            sql = fp.read()
            sql = export.render(sql,
                                hpo_id,
                                results_schema=bq_utils.get_dataset_id(),
                                vocab_schema='synpuf_100')
            query_result = bq_utils.query(sql)
            out_file = os.path.join(TEST_DATA_EXPORT_PATH,
                                    f.replace('.sql', '_response.json'))
            with open(out_file, 'w') as fp:
                data = dict()
                if 'rows' in query_result:
                    data['rows'] = query_result['rows']
                if 'schema' in query_result:
                    data['schema'] = query_result['schema']
                import json
                json.dump(data,
                          fp,
                          sort_keys=True,
                          indent=4,
                          separators=(',', ': '))
Example #18
    def setUp(self):
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        self.testbed.init_app_identity_stub()
        self.testbed.init_memcache_stub()
        self.testbed.init_urlfetch_stub()
        self.testbed.init_blobstore_stub()
        self.testbed.init_datastore_v3_stub()
        self.project_id = bq_utils.app_identity.get_application_id()
        self.hpo_ids = [NYC_HPO_ID, PITT_HPO_ID]
        self.input_dataset_id = bq_utils.get_dataset_id()
        self.output_dataset_id = bq_utils.get_unioned_dataset_id()
        self._empty_hpo_buckets()
        test_util.delete_all_tables(self.input_dataset_id)
        test_util.delete_all_tables(self.output_dataset_id)

        # TODO Generalize to work for all foreign key references
        # Collect all primary key fields in CDM tables
        mapped_fields = []
        for table in cdm.tables_to_map():
            field = table + '_id'
            mapped_fields.append(field)
        self.mapped_fields = mapped_fields
        self.implemented_foreign_keys = [
            eu_constants.VISIT_OCCURRENCE_ID, eu_constants.CARE_SITE_ID,
            eu_constants.LOCATION_ID
        ]
Example #19
 def test_load_ehr_observation(self):
     hpo_id = 'pitt'
     dataset_id = bq_utils.get_dataset_id()
     table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
     q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
         dataset_id=dataset_id, table_id=table_id)
     expected_observation_ids = [
         int(row['observation_id']) for row in resources._csv_to_list(
             PITT_FIVE_PERSONS_OBSERVATION_CSV)
     ]
     with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
         gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id),
                                 'observation.csv', fp)
     result = bq_utils.load_cdm_csv(hpo_id, 'observation')
     job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([job_id])
     self.assertEqual(len(incomplete_jobs), 0,
                      'pitt_observation load job did not complete')
     load_job_result = bq_utils.get_job_details(job_id)
     load_job_result_status = load_job_result['status']
     load_job_errors = load_job_result_status.get('errors')
     self.assertIsNone(load_job_errors,
                       msg='pitt_observation load job failed: ' +
                       str(load_job_errors))
     query_results_response = bq_utils.query(q)
     query_job_errors = query_results_response.get('errors')
     self.assertIsNone(query_job_errors)
     actual_result = [
         int(row['f'][0]['v']) for row in query_results_response['rows']
     ]
     self.assertListEqual(actual_result, expected_observation_ids)
Example #20
 def tearDownClass(cls):
     ehr_dataset_id = bq_utils.get_dataset_id()
     rdr_dataset_id = bq_utils.get_rdr_dataset_id()
     test_util.delete_all_tables(ehr_dataset_id)
     test_util.delete_all_tables(rdr_dataset_id)
     cls.testbed.deactivate()
     logger.handlers = []
Example #21
def mapping_query(table_name, hpo_ids, dataset_id=None, project_id=None):
    """
    Get query used to generate new ids for a CDM table

    :param table_name: name of CDM table
    :param hpo_ids: identifies the HPOs
    :param dataset_id: identifies the BQ dataset containing the input table
    :param project_id: identifies the GCP project containing the dataset
    :return: the query
    """
    if dataset_id is None:
        dataset_id = bq_utils.get_dataset_id()
    if project_id is None:
        project_id = app_identity.get_application_id()
    subqueries = _mapping_subqueries(table_name, hpo_ids, dataset_id, project_id)
    union_all_query = UNION_ALL.join(subqueries)
    return '''
    WITH all_{table_name} AS (
      {union_all_query}
    )
    SELECT 
        src_table_id,
        src_{table_name}_id,
        ROW_NUMBER() OVER () AS {table_name}_id
    FROM all_{table_name}
    '''.format(union_all_query=union_all_query, table_name=table_name)
Example #22
 def setUpClass(cls):
     print(
         '\n**************************************************************')
     print(cls.__name__)
     print('**************************************************************')
     dataset_id = bq_utils.get_dataset_id()
     test_util.delete_all_tables(dataset_id)
     test_util.populate_achilles()
Example #23
def assert_ehr_and_rdr_tables():
    """
    Raise assertion error if any CDM tables missing from EHR or RDR dataset
    """
    ehr_dataset_id = bq_utils.get_dataset_id()
    assert_tables_in(ehr_dataset_id)
    rdr_dataset_id = bq_utils.get_rdr_dataset_id()
    assert_tables_in(rdr_dataset_id)
Example #24
 def setUp(self):
     super(CombineEhrRdrTest, self).setUp()
     self.APP_ID = bq_utils.app_identity.get_application_id()
     self.ehr_dataset_id = bq_utils.get_dataset_id()
     self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()
     self.combined_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
     self.drc_bucket = gcs_utils.get_drc_bucket()
     test_util.delete_all_tables(self.combined_dataset_id)
Example #25
    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.dataset_id = bq_utils.get_dataset_id()
        self.bucket: str = gcs_utils.get_drc_bucket()
        self.storage_client = StorageClient(self.project_id)

        self.storage_client.empty_bucket(self.bucket)
        test_util.delete_all_tables(self.dataset_id)
        self.load_test_data(hpo_id=HPO_NYC)
Example #26
 def get_unconsented_ehr_records_count(self, table_name):
     q = UNCONSENTED_EHR_COUNTS_QUERY.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
                                             ehr_dataset_id=bq_utils.get_dataset_id(),
                                             ehr_rdr_dataset_id=self.combined_dataset_id,
                                             domain_table=table_name,
                                             ehr_consent_table_id='_ehr_consent')
     response = bq_utils.query(q)
     rows = bq_utils.response2rows(response)
     return rows[0]['n']
Example #27
 def _test_mapping_query(self):
     table = 'measurement'
     hpo_ids = ['chs', 'pitt']
     project_id = bq_utils.app_identity.get_application_id()
     dataset_id = bq_utils.get_dataset_id()
     created_tables = []
     for hpo_id in hpo_ids:
         hpo_table = self._create_hpo_table(hpo_id, table, dataset_id)
         created_tables.append(hpo_table)
     q = ehr_union.mapping_query(table, hpo_ids, dataset_id, project_id)
Example #28
def main(args):
    dataset_id = get_dataset_id()
    target_bucket = args.bucket
    folder_prefix = args.folder + '/'
    _run_achilles()
    _run_export(datasource_id=dataset_id,
                folder_prefix=folder_prefix,
                target_bucket=target_bucket)
    _upload_achilles_files(folder_prefix=folder_prefix,
                           target_bucket=target_bucket)
Example #29
 def setUpClass(cls):
     print(
         '\n**************************************************************')
     print(cls.__name__)
     print('**************************************************************')
     fake_bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
     dataset_id = bq_utils.get_dataset_id()
     test_util.delete_all_tables(dataset_id)
     test_util.get_synpuf_results_files()
     test_util.populate_achilles(fake_bucket)
Example #30
def clean_ehr_dataset(project=None, dataset=None):
    if dataset is None or dataset == '' or dataset.isspace():
        dataset = bq_utils.get_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s',
                    dataset)

    query_list = _gather_ehr_queries(project, dataset)

    LOGGER.info("Cleaning ehr_dataset")
    clean_engine.clean_dataset(project, dataset, query_list)