def test_create_table(self):
    """Creating a table returns a bigquery#table resource and the new
    table is clustered (checked via ``_table_has_clustering``)."""
    table_id = 'some_random_table_id'
    fields = [
        dict(name='person_id', type='integer', mode='required'),
        dict(name='name', type='string', mode='nullable')
    ]
    result = bq_utils.create_table(table_id, fields)
    # assertIn reports the actual keys on failure, unlike assertTrue('kind' in ...)
    self.assertIn('kind', result)
    self.assertEqual(result['kind'], 'bigquery#table')
    table_info = bq_utils.get_table_info(table_id)
    self._table_has_clustering(table_info)
def test_load_cdm_csv(self):
    """Upload the five-persons person CSV, load it into the CDM person
    table, and verify the load job finishes with five rows."""
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as person_file:
        gcs_utils.upload_object(self.hpo_bucket, 'person.csv', person_file)
    load_result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)
    self.assertEqual(load_result['status']['state'], 'RUNNING')
    job_id = load_result['jobReference']['jobId']
    dest_table_id = load_result['configuration']['load']['destinationTable']['tableId']
    # wait_on_jobs returns the jobs still running after the timeout
    still_running = bq_utils.wait_on_jobs([job_id])
    self.assertEqual(len(still_running), 0,
                     'loading table {} timed out'.format(dest_table_id))
    # numRows comes back from the API as a string
    row_count = bq_utils.get_table_info(dest_table_id).get('numRows')
    self.assertEqual(row_count, '5')
def get_table_columns(self, tablename):
    """
    Return a list of column names for the given table.

    :param tablename: name of the table to inspect (looked up in
        ``self.idataset``)
    :return: list of field names; empty when the table has no schema or
        no fields
    """
    info = bq_utils.get_table_info(tablename, dataset_id=self.idataset)
    schema = info.get('schema', {})
    # default to [] — the original schema.get('fields') returned None for a
    # missing key, which made the iteration below raise TypeError
    fields = schema.get('fields', [])
    return [field.get('name') for field in fields]
def _mapping_table_checks(self):
    """
    Check mapping tables exist, have correct schema, have expected number of records
    """
    # Restrict EHR counts to consented persons; alias `t` is presumably
    # supplied by the count query built in test_util.get_table_counts
    where = (
        'WHERE EXISTS '
        ' (SELECT 1 FROM `{combined_dataset_id}.{ehr_consent_table_id}` AS c '
        ' WHERE t.person_id = c.person_id)').format(
            combined_dataset_id=self.combined_dataset_id,
            ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
    ehr_counts = test_util.get_table_counts(self.ehr_dataset_id, DOMAIN_TABLES,
                                            where)
    rdr_counts = test_util.get_table_counts(self.rdr_dataset_id)
    combined_counts = test_util.get_table_counts(self.combined_dataset_id)
    output_tables = combined_counts.keys()
    expected_counts = dict()
    # tables whose combined count legitimately differs from EHR + RDR
    expected_diffs = ['observation']
    for table in DOMAIN_TABLES:
        expected_mapping_table = mapping_table_for(table)
        self.assertIn(expected_mapping_table, output_tables)
        # schema check: normalize the live schema payload before comparing
        expected_fields = resources.fields_for(expected_mapping_table)
        actual_table_info = bq_utils.get_table_info(expected_mapping_table,
                                                    self.combined_dataset_id)
        actual_fields = actual_table_info.get('schema', dict()).get('fields', [])
        actual_fields_norm = map(test_util.normalize_field_payload,
                                 actual_fields)
        self.assertCountEqual(expected_fields, actual_fields_norm)
        # Count should be sum of EHR and RDR
        # (except for tables like observation where extra records are created for demographics)
        if 'person_id' in [
                field.get('name', '') for field in resources.fields_for(table)
        ]:
            unconsented_ehr_records = self.get_unconsented_ehr_records_count(
                table)
        else:
            unconsented_ehr_records = 0
        actual_count = combined_counts[expected_mapping_table]
        if table in expected_diffs:
            expected_count = actual_count
        else:
            expected_count = (ehr_counts[table] -
                              unconsented_ehr_records) + rdr_counts[table]
        expected_counts[expected_mapping_table] = expected_count
    # assertDictContainsSubset was deprecated in Python 3.2 and removed in
    # 3.12; compare the expected subset key-by-key instead
    for mapping_table, expected_count in expected_counts.items():
        self.assertEqual(expected_count, combined_counts.get(mapping_table),
                         'unexpected count for {}'.format(mapping_table))
def test_validate_five_persons_success(self, mock_check_cron):
    """Upload the five-persons submission, run validation, and verify the
    result file, loaded tables, and clustering on person_id tables."""
    prefix = 'dummy-prefix-2018-03-22/'
    expected_result_items = resources._csv_to_list(
        test_util.FIVE_PERSONS_SUCCESS_RESULT_CSV)
    json_export_files = self.get_json_export_files(test_util.FAKE_HPO_ID)

    # stage every five_persons file under the submission prefix
    for cdm_file in test_util.FIVE_PERSONS_FILES:
        test_util.write_cloud_file(self.hpo_bucket, cdm_file, prefix=prefix)

    expected_tables = [
        'person',
        'visit_occurrence',
        'condition_occurrence',
        'procedure_occurrence',
        'drug_exposure',
        'measurement',
    ]
    cdm_files = ['{}.csv'.format(name) for name in expected_tables]

    main.app.testing = True
    with main.app.test_client() as client:
        client.get(test_util.VALIDATE_HPO_FILES_URL)

    # check the result file was put in bucket
    expected_object_names = cdm_files + common.IGNORE_LIST + json_export_files
    expected_objects = {prefix + name for name in expected_object_names}
    actual_objects = {
        item['name'] for item in gcs_utils.list_bucket(self.hpo_bucket)
    }
    self.assertSetEqual(expected_objects, actual_objects)

    # result says file found, parsed, loaded
    actual_result = test_util.read_cloud_file(self.hpo_bucket,
                                              prefix + common.RESULT_CSV)
    actual_result_items = resources._csv_file_to_list(
        StringIO.StringIO(actual_result))
    self.assertListEqual(sorted(expected_result_items),
                         sorted(actual_result_items))
    self.assertTrue(
        main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                       folder_prefix=prefix))

    # check tables exist and are clustered as expected
    for table in expected_tables:
        fields_file = os.path.join(resources.fields_path, table + '.json')
        table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
        table_info = bq_utils.get_table_info(table_id)
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
        field_names = [field['name'] for field in fields]
        if 'person_id' in field_names:
            self.table_has_clustering(table_info)
def _mapping_table_checks(self):
    """ Check mapping tables exist, have correct schema, have expected number of records """
    # Restrict EHR counts to consented persons; alias `t` is presumably
    # supplied by the count query built inside test_util.get_table_counts —
    # TODO confirm against that helper.
    where = '''
        WHERE EXISTS
        (SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c
        WHERE t.person_id = c.person_id)
        '''.format(ehr_rdr_dataset_id=self.combined_dataset_id,
                   ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
    ehr_counts = test_util.get_table_counts(self.ehr_dataset_id, DOMAIN_TABLES,
                                            where)
    rdr_counts = test_util.get_table_counts(self.rdr_dataset_id)
    combined_counts = test_util.get_table_counts(self.combined_dataset_id)
    output_tables = combined_counts.keys()
    expected_counts = dict()
    # tables whose combined count legitimately differs from EHR + RDR
    expected_diffs = ['observation']
    # show full diffs on assertion failure
    self.maxDiff = None
    for t in DOMAIN_TABLES:
        expected_mapping_table = mapping_table_for(t)
        self.assertIn(expected_mapping_table, output_tables)
        # schema check: normalize the live schema payload before comparing
        expected_fields = resources.fields_for(expected_mapping_table)
        actual_table_info = bq_utils.get_table_info(
            expected_mapping_table, self.combined_dataset_id)
        actual_fields = actual_table_info.get('schema', dict()).get('fields', [])
        actual_fields_norm = map(test_util.normalize_field_payload,
                                 actual_fields)
        # NOTE(review): assertItemsEqual and the expected=/actual= keyword
        # form of assertDictContainsSubset below exist only on Python 2 —
        # this variant presumably targets py2; confirm before porting.
        self.assertItemsEqual(expected_fields, actual_fields_norm)
        # Count should be sum of EHR and RDR
        # (except for tables like observation where extra records are created for demographics)
        actual_count = combined_counts[expected_mapping_table]
        expected_count = actual_count if t in expected_diffs else ehr_counts[
            t] + rdr_counts[t]
        expected_counts[expected_mapping_table] = expected_count
    self.assertDictContainsSubset(expected=expected_counts,
                                  actual=combined_counts)
def test_union_ehr(self):
    """End-to-end check of ehr_union.main: verifies the input dataset is
    untouched, mapping tables have the expected schema and row counts,
    every CDM output table has the expected rows (and clustering when it
    has person_id), and output person_ids match the input HPO tables."""
    self._load_datasets()
    input_tables_before = set(self._dataset_tables(self.input_dataset_id))
    # output should be mapping tables and cdm tables
    output_tables_before = self._dataset_tables(self.output_dataset_id)
    mapping_tables = [
        ehr_union.mapping_table_for(table)
        for table in ehr_union.tables_to_map()
    ]
    output_cdm_tables = [
        ehr_union.output_table_for(table) for table in common.CDM_TABLES
    ]
    expected_output = set(output_tables_before + mapping_tables +
                          output_cdm_tables)
    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)
    # input dataset should be unchanged
    input_tables_after = set(self._dataset_tables(self.input_dataset_id))
    self.assertSetEqual(input_tables_before, input_tables_after)
    # mapping tables
    tables_to_map = ehr_union.tables_to_map()
    for table_to_map in tables_to_map:
        # each mapping table must carry exactly these four columns
        mapping_table = ehr_union.mapping_table_for(table_to_map)
        expected_fields = {
            'src_table_id',
            'src_%s_id' % table_to_map,
            '%s_id' % table_to_map, 'src_hpo_id'
        }
        mapping_table_info = bq_utils.get_table_info(
            mapping_table, dataset_id=self.output_dataset_id)
        mapping_table_fields = mapping_table_info.get('schema', dict()).get(
            'fields', [])
        actual_fields = set([f['name'] for f in mapping_table_fields])
        message = 'Table %s has fields %s when %s expected' % (
            mapping_table, actual_fields, expected_fields)
        self.assertSetEqual(expected_fields, actual_fields, message)
        # mapping table should have one row per row of the output table
        result_table = ehr_union.output_table_for(table_to_map)
        expected_num_rows = len(self.expected_tables[result_table])
        actual_num_rows = int(mapping_table_info.get('numRows', -1))
        message = 'Table %s has %s rows when %s expected' % (
            mapping_table, actual_num_rows, expected_num_rows)
        self.assertEqual(expected_num_rows, actual_num_rows, message)
    # check for each output table
    for table_name in common.CDM_TABLES:
        # output table exists and row count is sum of those submitted by hpos
        result_table = ehr_union.output_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(
            result_table, dataset_id=self.output_dataset_id)
        actual_count = int(table_info.get('numRows'))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(expected_count, actual_count, msg)
        # TODO Compare table rows to expected accounting for the new ids and ignoring field types
        # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
        # query_response = bq_utils.query(q)
        # actual_rows = test_util.response2rows(query_response)
        # output table has clustering on person_id where applicable
        fields_file = os.path.join(resources.fields_path, table_name + '.json')
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self._table_has_clustering(table_info)
    actual_output = set(self._dataset_tables(self.output_dataset_id))
    self.assertSetEqual(expected_output, actual_output)
    # explicit check that output person_ids are same as input
    chs_person_table_id = bq_utils.get_table_id(CHS_HPO_ID, 'person')
    pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
    q = '''SELECT DISTINCT person_id FROM (
           SELECT person_id FROM {dataset_id}.{chs_person_table_id}
           UNION ALL
           SELECT person_id FROM {dataset_id}.{pitt_person_table_id}
           ) ORDER BY person_id ASC'''.format(
        dataset_id=self.input_dataset_id,
        chs_person_table_id=chs_person_table_id,
        pitt_person_table_id=pitt_person_table_id)
    response = bq_utils.query(q)
    expected_rows = test_util.response2rows(response)
    person_table_id = ehr_union.output_table_for('person')
    q = '''SELECT DISTINCT person_id
           FROM {dataset_id}.{table_id}
           ORDER BY person_id ASC'''.format(
        dataset_id=self.output_dataset_id, table_id=person_table_id)
    response = bq_utils.query(q)
    actual_rows = test_util.response2rows(response)
    self.assertListEqual(expected_rows, actual_rows)
def create_person_id_src_hpo_map(input_dataset, credentials):
    """
    Create a table containing person_ids and src_hpo_ids

    :param input_dataset: the input dataset to deid
    :param credentials: the credentials needed to create a new table.
        NOTE(review): this parameter is not referenced in the body —
        kept for interface compatibility; confirm whether callers need it.
    """
    map_tablename = "_mapping_person_src_hpos"
    mapping_prefix = '_mapping_'
    # one UNION ALL branch per table with a person_id column
    sql = ("select person_id, src_hpo_id "
           "from {input_dataset}._mapping_{table} "
           "join {input_dataset}.{table} "
           "using ({table}_id) "
           "where src_hpo_id not like 'rdr'")
    # list dataset contents
    dataset_tables = bq_utils.list_dataset_contents(input_dataset)
    # tables that have a corresponding _mapping_ table
    # (len(mapping_prefix) replaces the magic slice table[9:])
    mapped_tables = [
        table[len(mapping_prefix):]
        for table in dataset_tables
        if table.startswith(mapping_prefix)
    ]
    # make sure mapped tables all exist
    check_tables = [table for table in mapped_tables if table in dataset_tables]
    # keep only tables that contain a person_id field; break after the first
    # match so a table is never added (and unioned) twice, and default the
    # field name to '' so a schema entry without 'name' cannot raise
    person_id_tables = []
    for table in check_tables:
        info = bq_utils.get_table_info(table, dataset_id=input_dataset)
        schema = info.get('schema', {})
        for field_info in schema.get('fields', []):
            if 'person_id' in field_info.get('name', ''):
                person_id_tables.append(table)
                break
    sql_statement = [
        sql.format(table=table, input_dataset=input_dataset)
        for table in person_id_tables
    ]
    final_query = ' UNION ALL '.join(sql_statement)
    # create the mapping table
    if map_tablename not in dataset_tables:
        fields = [{
            "type": "integer",
            "name": "person_id",
            "mode": "required",
            "description": "the person_id of someone with an ehr record"
        }, {
            "type": "string",
            "name": "src_hpo_id",
            "mode": "required",
            "description": "the src_hpo_id of an ehr record"
        }]
        bq_utils.create_table(map_tablename, fields, dataset_id=input_dataset)
    if not sql_statement:
        # no person_id tables found: running an empty query would fail
        LOGGER.warning(
            f"No person_id tables found in {input_dataset}; "
            f"{map_tablename} left empty")
        return
    bq_utils.query(final_query,
                   destination_table_id=map_tablename,
                   destination_dataset_id=input_dataset,
                   write_disposition=bq_consts.WRITE_TRUNCATE)
    LOGGER.info(f"Created mapping table:\t{input_dataset}.{map_tablename}")
def test_union_ehr(self):
    """End-to-end check of ehr_union.main: verifies the input dataset is
    untouched, fact_relationship ids are re-offset per HPO, mapping tables
    have the expected schema and row counts, every CDM output table has the
    expected rows (and clustering when it has person_id), and output
    person_ids match the input HPO person tables."""
    self._load_datasets()
    input_tables_before = set(self._dataset_tables(self.input_dataset_id))
    # output should be mapping tables and cdm tables
    output_tables_before = self._dataset_tables(self.output_dataset_id)
    mapping_tables = [
        ehr_union.mapping_table_for(table)
        for table in cdm.tables_to_map() + [combine_ehr_rdr.PERSON_TABLE]
    ]
    output_cdm_tables = [
        ehr_union.output_table_for(table) for table in resources.CDM_TABLES
    ]
    expected_output = set(output_tables_before + mapping_tables +
                          output_cdm_tables)
    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)
    # input dataset should be unchanged
    input_tables_after = set(self._dataset_tables(self.input_dataset_id))
    self.assertSetEqual(input_tables_before, input_tables_after)
    # fact_relationship from pitt: ids in the unioned table should equal the
    # source ids shifted by the per-HPO offset
    hpo_unique_identifiers = ehr_union.get_hpo_offsets(self.hpo_ids)
    pitt_offset = hpo_unique_identifiers[PITT_HPO_ID]
    q = '''SELECT fact_id_1, fact_id_2
           FROM `{input_dataset}.{hpo_id}_fact_relationship`
           where domain_concept_id_1 = 21 and domain_concept_id_2 = 21'''.format(
        input_dataset=self.input_dataset_id, hpo_id=PITT_HPO_ID)
    response = bq_utils.query(q)
    result = bq_utils.response2rows(response)
    expected_fact_id_1 = result[0]["fact_id_1"] + pitt_offset
    expected_fact_id_2 = result[0]["fact_id_2"] + pitt_offset
    q = '''SELECT fr.fact_id_1, fr.fact_id_2
           FROM `{dataset_id}.unioned_ehr_fact_relationship` fr
           join `{dataset_id}._mapping_measurement` mm
           on fr.fact_id_1 = mm.measurement_id
           and mm.src_hpo_id = "{hpo_id}"'''.format(
        dataset_id=self.output_dataset_id, hpo_id=PITT_HPO_ID)
    response = bq_utils.query(q)
    result = bq_utils.response2rows(response)
    actual_fact_id_1, actual_fact_id_2 = result[0]["fact_id_1"], result[0][
        "fact_id_2"]
    self.assertEqual(expected_fact_id_1, actual_fact_id_1)
    self.assertEqual(expected_fact_id_2, actual_fact_id_2)
    # mapping tables
    tables_to_map = cdm.tables_to_map()
    for table_to_map in tables_to_map:
        # each mapping table must carry exactly these four columns
        mapping_table = ehr_union.mapping_table_for(table_to_map)
        expected_fields = {
            'src_table_id',
            'src_%s_id' % table_to_map,
            '%s_id' % table_to_map, 'src_hpo_id'
        }
        mapping_table_info = bq_utils.get_table_info(
            mapping_table, dataset_id=self.output_dataset_id)
        mapping_table_fields = mapping_table_info.get('schema', dict()).get(
            'fields', [])
        actual_fields = set([f['name'] for f in mapping_table_fields])
        message = 'Table %s has fields %s when %s expected' % (
            mapping_table, actual_fields, expected_fields)
        self.assertSetEqual(expected_fields, actual_fields, message)
        # mapping table should have one row per row of the output table
        result_table = ehr_union.output_table_for(table_to_map)
        expected_num_rows = len(self.expected_tables[result_table])
        actual_num_rows = int(mapping_table_info.get('numRows', -1))
        message = 'Table %s has %s rows when %s expected' % (
            mapping_table, actual_num_rows, expected_num_rows)
        self.assertEqual(expected_num_rows, actual_num_rows, message)
    # check for each output table
    for table_name in resources.CDM_TABLES:
        # output table exists and row count is sum of those submitted by hpos
        result_table = ehr_union.output_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(
            result_table, dataset_id=self.output_dataset_id)
        actual_count = int(table_info.get('numRows'))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(expected_count, actual_count, msg)
        # TODO Compare table rows to expected accounting for the new ids and ignoring field types
        # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
        # query_response = bq_utils.query(q)
        # actual_rows = bq_utils.response2rows(query_response)
        # output table has clustering on person_id where applicable
        fields_file = os.path.join(resources.fields_path, table_name + '.json')
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self._table_has_clustering(table_info)
    actual_output = set(self._dataset_tables(self.output_dataset_id))
    self.assertSetEqual(expected_output, actual_output)
    # explicit check that output person_ids are same as input
    nyc_person_table_id = bq_utils.get_table_id(NYC_HPO_ID, 'person')
    pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
    q = '''SELECT DISTINCT person_id FROM (
           SELECT person_id FROM {dataset_id}.{nyc_person_table_id}
           UNION ALL
           SELECT person_id FROM {dataset_id}.{pitt_person_table_id}
           ) ORDER BY person_id ASC'''.format(
        dataset_id=self.input_dataset_id,
        nyc_person_table_id=nyc_person_table_id,
        pitt_person_table_id=pitt_person_table_id)
    response = bq_utils.query(q)
    expected_rows = bq_utils.response2rows(response)
    person_table_id = ehr_union.output_table_for('person')
    q = '''SELECT DISTINCT person_id
           FROM {dataset_id}.{table_id}
           ORDER BY person_id ASC'''.format(
        dataset_id=self.output_dataset_id, table_id=person_table_id)
    response = bq_utils.query(q)
    actual_rows = bq_utils.response2rows(response)
    self.assertCountEqual(expected_rows, actual_rows)
def test_ehr_person_to_observation(self):
    # ehr person table converts to observation records
    """Verify move_ehr_person_to_observation turns each EHR person row into
    four observation rows (gender, race, DOB, ethnicity) with matching
    values and dates."""
    create_cdm_tables()
    copy_rdr_table('person')
    move_ehr_person_to_observation()
    # person table query
    # NOTE(review): the parenthesized SELECT list yields a single STRUCT
    # column, which is why the payload is read under the 'F0_' key below.
    q_person = '''
        SELECT (person_id,
                gender_concept_id, gender_source_value,
                race_concept_id, race_source_value,
                CAST(birth_datetime as STRING),
                ethnicity_concept_id, ethnicity_source_value,
                EXTRACT(DATE FROM birth_datetime))
        FROM {ehr_dataset_id}.person
        '''.format(ehr_dataset_id=self.ehr_dataset_id)
    # flatten each struct row into a plain list of values
    response_ehr_person = [[
        item['v'] for item in row['f']
    ] for row in query_result_to_payload(bq_utils.query(q_person))['F0_']]
    q_obs = '''
        SELECT (person_id,
                observation_concept_id,
                value_as_concept_id,
                value_as_string,
                observation_source_value,
                observation_date)
        FROM {ehr_dataset_id}.observation obs
        WHERE obs.observation_concept_id=4013886 -- Race - 4013886
        OR obs.observation_concept_id=4271761 -- Ethnic group - 4271761
        OR obs.observation_concept_id=4135376 -- Gender - 4135376
        OR obs.observation_concept_id=4083587 -- DOB - 4083587
        '''.format(ehr_dataset_id=self.combined_dataset_id)
    response_obs = [[
        item['v'] for item in row['f']
    ] for row in query_result_to_payload(bq_utils.query(q_obs))['F0_']]
    # concept ids (values come back from the payload as strings)
    gender_concept_id = '4135376'
    race_concept_id = '4013886'
    dob_concept_id = '4083587'
    ethnicity_concept_id = '4271761'
    # expected lists: (person_id, concept_id, source value, birth date)
    # built from the person-row columns selected above
    expected_gender_list = [(row[0], gender_concept_id, row[1], row[8])
                            for row in response_ehr_person]
    expected_race_list = [(row[0], race_concept_id, row[3], row[8])
                          for row in response_ehr_person]
    expected_dob_list = [(row[0], dob_concept_id, row[5], row[8])
                         for row in response_ehr_person]
    expected_ethnicity_list = [(row[0], ethnicity_concept_id, row[6], row[8])
                               for row in response_ehr_person]
    # actual lists, filtered from the observation rows by concept id
    # (DOB uses value_as_string, row[3]; the others use value_as_concept_id, row[2])
    actual_gender_list = [(row[0], row[1], row[2], row[5])
                          for row in response_obs
                          if row[1] == gender_concept_id]
    actual_race_list = [(row[0], row[1], row[2], row[5])
                        for row in response_obs
                        if row[1] == race_concept_id]
    actual_dob_list = [(row[0], row[1], row[3], row[5])
                       for row in response_obs
                       if row[1] == dob_concept_id]
    actual_ethnicity_list = [(row[0], row[1], row[2], row[5])
                             for row in response_obs
                             if row[1] == ethnicity_concept_id]
    self.assertListEqual(sorted(expected_gender_list),
                         sorted(actual_gender_list), 'gender check fails')
    self.assertListEqual(sorted(expected_race_list),
                         sorted(actual_race_list), 'race check fails')
    self.assertListEqual(sorted(expected_dob_list), sorted(actual_dob_list),
                         'dob check fails')
    self.assertListEqual(sorted(expected_ethnicity_list),
                         sorted(actual_ethnicity_list),
                         'ethnicity check fails')
    # four observation rows (gender/race/dob/ethnicity) per person row
    person_ehr_row_count = int(
        bq_utils.get_table_info('person', self.ehr_dataset_id)['numRows'])
    obs_row_count = int(
        bq_utils.get_table_info('observation',
                                self.combined_dataset_id)['numRows'])
    self.assertEqual(person_ehr_row_count * 4, obs_row_count)
def test_merge_EHR(self, mock_check_cron):
    """Run ehr_merge.merge and verify: each merged table has the expected
    row count (and clustering when it has person_id), the dataset gains
    exactly the expected tables, and pitt condition_occurrence rows survive
    the merge with their visit ids remapped via visit_id_mapping_table."""
    self._load_datasets()
    dataset_id = bq_utils.get_dataset_id()
    old_dataset_items = bq_utils.list_dataset_contents(dataset_id)
    # merge is expected to add the visit id mapping table plus one result
    # table per CDM table
    expected_items = ['visit_id_mapping_table']
    expected_items.extend([
        ehr_merge.result_table_for(table_name)
        for table_name in common.CDM_TABLES
    ])
    ehr_merge.merge(dataset_id, self.project_id)
    # Check row counts for each output table
    dataset_items = bq_utils.list_dataset_contents(dataset_id)
    for table_name in common.CDM_TABLES:
        result_table = ehr_merge.result_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(result_table)
        actual_count = int(table_info.get('numRows'))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(actual_count, expected_count, msg)
        # Check for clustering if table has person_id
        fields_file = os.path.join(resources.fields_path, table_name + '.json')
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self._table_has_clustering(table_info)
    self.assertSetEqual(set(old_dataset_items + expected_items),
                        set(dataset_items))
    # spot-check one HPO/table pair: pitt condition_occurrence
    table_name = 'condition_occurrence'
    hpo_id = 'pitt'
    result_table = ehr_merge.result_table_for(table_name)
    pitt_table = bq_utils.get_table_id(hpo_id, table_name)
    cmd_union = 'SELECT * FROM ' + result_table
    cmd_pitt = 'SELECT * FROM ' + pitt_table
    cmd_visit_mapping = """
        SELECT global_visit_id, mapping_visit_id
        FROM visit_id_mapping_table
        WHERE hpo='{hpo_id}'""".format(hpo_id=hpo_id)
    qr_union = bq_utils.query(cmd_union)
    qr_pitt = bq_utils.query(cmd_pitt)
    qr_visit_mapping = bq_utils.query(cmd_visit_mapping)
    # query_result_to_payload presumably returns a dict of
    # column-name -> list-of-values (column-major) — the indexing below
    # relies on that layout; confirm against the helper.
    union_result = query_result_to_payload(qr_union)
    pitt_result = query_result_to_payload(qr_pitt)
    visit_mapping_result = query_result_to_payload(qr_visit_mapping)

    def get_element_from_list_of_lists(index, list_of_lists):
        # pick the item at `index` from every inner list, i.e. one row
        # across all columns of a column-major payload
        return [list_item[index] for list_item in list_of_lists]

    for ind, pitt_visit_id in enumerate(
            pitt_result['VISIT_OCCURRENCE_ID']):
        if pitt_visit_id not in visit_mapping_result['MAPPING_VISIT_ID']:
            continue
        # follow source visit id -> global visit id -> row in union result
        global_visit_id_index = visit_mapping_result[
            'MAPPING_VISIT_ID'].index(pitt_visit_id)
        global_visit_id = visit_mapping_result['GLOBAL_VISIT_ID'][
            global_visit_id_index]
        union_visit_id_index = union_result['VISIT_OCCURRENCE_ID'].index(
            global_visit_id)
        # compare all columns except the remapped id columns
        pitt_cols_without_id = [
            values for key, values in pitt_result.items()
            if key not in [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
        ]
        union_cols_without_id = [
            values for key, values in union_result.items()
            if key not in [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
        ]
        self.assertListEqual(
            get_element_from_list_of_lists(ind, pitt_cols_without_id),
            get_element_from_list_of_lists(union_visit_id_index,
                                           union_cols_without_id))