def test_consented_person_id(self):
     """
     Check that ehr_consent() creates the EHR consent table with the expected persons.

     The test observation data has seven (7) persons with consent records as described below
      1: No
      2: Yes
      3: NULL
      4: No  followed by Yes
      5: Yes followed by No
      6: Yes followed by NULL
      7: NULL and Yes with same date/time

     Only person_ids 2 and 4 are expected in the resulting table.
     """
     dataset_id = self.combined_dataset_id
     table_id = EHR_CONSENT_TABLE_ID

     # sanity check: the consent table must not exist before the step runs
     exists_before = bq_utils.table_exists(table_id, dataset_id)
     self.assertFalse(exists_before)

     ehr_consent()

     # the step under test is expected to have created the table
     exists_after = bq_utils.table_exists(table_id, dataset_id)
     created_msg = 'Table {dataset}.{table} created by consented_person'.format(
         dataset=dataset_id, table=table_id)
     self.assertTrue(exists_after, created_msg)

     # collect the distinct person_ids which ended up in the consent table
     select_all = 'SELECT * FROM {dataset}.{table}'.format(dataset=dataset_id,
                                                          table=table_id)
     consent_rows = test_util.response2rows(bq_utils.query(select_all))
     found_person_ids = {
         consent_row['person_id'] for consent_row in consent_rows
     }
     records_msg = 'Records in {dataset}.{table}'.format(dataset=dataset_id,
                                                        table=table_id)
     self.assertSetEqual({2, 4}, found_person_ids, records_msg)
 def _all_rdr_records_included(self):
     """
     All rdr records are included whether or not there is corresponding ehr record

     For every table in DOMAIN_TABLES, query for RDR rows which either have no
     row in the domain's mapping table or whose mapped id is absent from the
     combined domain table, and assert that no such rows are returned.
     """
     # the dataset ids do not depend on the domain table; look them up once
     rdr_dataset_id = bq_utils.get_rdr_dataset_id()
     ehr_rdr_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
     for domain_table in DOMAIN_TABLES:
         mapping_table = mapping_table_for(domain_table)
         q = '''SELECT rt.{domain_table}_id as id
            FROM {rdr_dataset_id}.{domain_table} rt
            LEFT JOIN {ehr_rdr_dataset_id}.{mapping_table} m
            ON rt.{domain_table}_id = m.src_{domain_table}_id
            WHERE
              m.{domain_table}_id IS NULL
            OR NOT EXISTS
              (SELECT 1 FROM {ehr_rdr_dataset_id}.{domain_table} t
               WHERE t.{domain_table}_id = m.{domain_table}_id)'''.format(
             domain_table=domain_table,
             rdr_dataset_id=rdr_dataset_id,
             ehr_rdr_dataset_id=ehr_rdr_dataset_id,
             mapping_table=mapping_table)
         response = bq_utils.query(q)
         rows = test_util.response2rows(response)
         # name the table in the message: the assertion runs once per domain
         # table, so a bare message would not say which one is at fault
         message = (
             "RDR records should map to records in mapping and combined tables"
             " (domain table: {domain_table})").format(
                 domain_table=domain_table)
         self.assertEqual(0, len(rows), message)
 def _ehr_only_records_excluded(self):
     """
     EHR person records which are missing from RDR are excluded from combined

     Selects person_ids present in the EHR person table but absent from the
     RDR person table and left-joins them to the combined person table. The
     test data must contain at least one such EHR-only person, and none of
     them may have a matching row in combined.
     """
     q = '''
     WITH ehr_only AS
     (SELECT person_id
      FROM {ehr_dataset_id}.person ep
      WHERE NOT EXISTS
        (SELECT 1
         FROM {rdr_dataset_id}.person rp
         WHERE rp.person_id = ep.person_id)
     )
     SELECT
       ehr_only.person_id AS ehr_person_id,
       p.person_id        AS combined_person_id
     FROM ehr_only
       LEFT JOIN {ehr_rdr_dataset_id}.person p
         ON ehr_only.person_id = p.person_id
     '''.format(ehr_dataset_id=self.ehr_dataset_id,
                rdr_dataset_id=self.rdr_dataset_id,
                ehr_rdr_dataset_id=self.combined_dataset_id)
     response = bq_utils.query(q)
     rows = test_util.response2rows(response)
     # without EHR-only persons the loop below would pass vacuously
     self.assertGreater(len(rows), 0,
                        'Test data is missing EHR-only records')
     for row in rows:
         combined_person_id = row['combined_person_id']
         # the LEFT JOIN yields NULL when the person is absent from combined;
         # fill in the placeholder so a failure names the offending person_id
         message = 'EHR-only person_id `{ehr_person_id}` found in combined when it should be excluded'.format(
             ehr_person_id=row['ehr_person_id'])
         self.assertIsNone(combined_person_id, message)
    def test_copy_rdr_tables(self):
        """
        Check that copy_rdr_table copies each RDR table into the combined dataset.

        For every table in RDR_TABLES_TO_COPY: the table must not exist in the
        combined dataset beforehand, must exist after copy_rdr_table(table) and
        must have the same row count as the table of the same name in the RDR
        dataset.
        """
        for table in RDR_TABLES_TO_COPY:
            self.assertFalse(
                bq_utils.table_exists(
                    table, self.combined_dataset_id))  # sanity check
            copy_rdr_table(table)
            actual = bq_utils.table_exists(table, self.combined_dataset_id)
            self.assertTrue(
                actual,
                msg='RDR table {table} should be copied'.format(table=table))

            # Check that row count in combined is same as rdr
            q = '''
              WITH rdr AS
               (SELECT COUNT(1) n FROM {rdr_dataset_id}.{table}),
              combined AS
               (SELECT COUNT(1) n FROM {combined_dataset_id}.{table})
              SELECT
                rdr.n      AS rdr_count,
                combined.n AS combined_count
              FROM rdr, combined
            '''.format(rdr_dataset_id=self.rdr_dataset_id,
                       combined_dataset_id=self.combined_dataset_id,
                       table=table)
            response = bq_utils.query(q)
            rows = test_util.response2rows(response)
            # sanity check; assertEqual reports the actual number of rows on
            # failure, which assertTrue(len(rows) == 1) would hide
            self.assertEqual(1, len(rows))
            row = rows[0]
            rdr_count, combined_count = row['rdr_count'], row['combined_count']
            msg_fmt = 'Table {table} has {rdr_count} in rdr and {combined_count} in combined (expected to be equal)'
            self.assertEqual(
                rdr_count, combined_count,
                msg_fmt.format(table=table,
                               rdr_count=rdr_count,
                               combined_count=combined_count))
 def _check_ehr_person_observation(self):
     """
     Check the number of EHR person records stored in the combined observation table.

     Counts the rows of the EHR person table and the combined observation rows
     whose observation_type_concept_id is 38000280, then asserts that the
     latter equals four times the former.
     """
     person_query = '''SELECT * FROM {dataset_id}.person'''.format(
         dataset_id=self.ehr_dataset_id)
     ehr_persons = test_util.response2rows(bq_utils.query(person_query))

     observation_query = '''SELECT * 
            FROM {ehr_rdr_dataset_id}.observation
            WHERE observation_type_concept_id = 38000280'''.format(
         ehr_rdr_dataset_id=self.combined_dataset_id)

     # observation should contain 4 records per person of type EHR
     expected = 4 * len(ehr_persons)

     ehr_person_observations = test_util.response2rows(
         bq_utils.query(observation_query))
     # TODO check row content is as expected
     actual = len(ehr_person_observations)

     failure_msg = 'Expected %s EHR person records in observation but found %s' % (
         expected, actual)
     self.assertEqual(actual, expected, failure_msg)
    def test_union_ehr(self):
        """
        Run ehr_union.main and verify the resulting output dataset.

        Checks performed, in order:
         - the set of tables in the input dataset is unchanged
         - each mapping table has the expected fields and row count
         - each output CDM table has the expected row count and, when its
           schema has a person_id field, passes self._table_has_clustering
         - the output dataset holds exactly the pre-existing tables plus the
           mapping tables and output CDM tables
         - the distinct person_ids in the output person table equal those in
           the CHS and PITT input person tables
        """
        self._load_datasets()
        input_tables_before = set(self._dataset_tables(self.input_dataset_id))

        # output should be mapping tables and cdm tables
        output_tables_before = self._dataset_tables(self.output_dataset_id)
        mapping_tables = [
            ehr_union.mapping_table_for(table)
            for table in ehr_union.tables_to_map()
        ]
        output_cdm_tables = [
            ehr_union.output_table_for(table) for table in common.CDM_TABLES
        ]
        expected_output = set(output_tables_before + mapping_tables +
                              output_cdm_tables)

        # perform ehr union
        ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                       self.project_id, self.hpo_ids)

        # input dataset should be unchanged
        input_tables_after = set(self._dataset_tables(self.input_dataset_id))
        self.assertSetEqual(input_tables_before, input_tables_after)

        # mapping tables
        tables_to_map = ehr_union.tables_to_map()
        for table_to_map in tables_to_map:
            mapping_table = ehr_union.mapping_table_for(table_to_map)
            expected_fields = {
                'src_table_id',
                'src_%s_id' % table_to_map,
                '%s_id' % table_to_map, 'src_hpo_id'
            }
            mapping_table_info = bq_utils.get_table_info(
                mapping_table, dataset_id=self.output_dataset_id)
            # a missing schema is treated as a table without fields
            mapping_table_fields = mapping_table_info.get('schema', {}).get(
                'fields', [])
            actual_fields = {f['name'] for f in mapping_table_fields}
            message = 'Table %s has fields %s when %s expected' % (
                mapping_table, actual_fields, expected_fields)
            self.assertSetEqual(expected_fields, actual_fields, message)
            result_table = ehr_union.output_table_for(table_to_map)
            expected_num_rows = len(self.expected_tables[result_table])
            # -1 can never equal an expected count, so a missing numRows
            # fails the assertion below instead of raising
            actual_num_rows = int(mapping_table_info.get('numRows', -1))
            message = 'Table %s has %s rows when %s expected' % (
                mapping_table, actual_num_rows, expected_num_rows)
            self.assertEqual(expected_num_rows, actual_num_rows, message)

        # check for each output table
        for table_name in common.CDM_TABLES:
            # output table exists and row count is sum of those submitted by hpos
            result_table = ehr_union.output_table_for(table_name)
            expected_rows = self.expected_tables[result_table]
            expected_count = len(expected_rows)
            table_info = bq_utils.get_table_info(
                result_table, dataset_id=self.output_dataset_id)
            # default to -1 (as for the mapping tables above) so that a
            # missing numRows produces the descriptive assertion failure
            # rather than a TypeError from int(None)
            actual_count = int(table_info.get('numRows', -1))
            msg = 'Unexpected row count in table {result_table} after ehr union'.format(
                result_table=result_table)
            self.assertEqual(expected_count, actual_count, msg)
            # TODO Compare table rows to expected accounting for the new ids and ignoring field types
            # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
            # query_response = bq_utils.query(q)
            # actual_rows = test_util.response2rows(query_response)

            # output table has clustering on person_id where applicable
            fields_file = os.path.join(resources.fields_path,
                                       table_name + '.json')
            # only the load needs the open file; close it before the check
            with open(fields_file, 'r') as fp:
                fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self._table_has_clustering(table_info)

        actual_output = set(self._dataset_tables(self.output_dataset_id))
        self.assertSetEqual(expected_output, actual_output)

        # explicit check that output person_ids are same as input
        chs_person_table_id = bq_utils.get_table_id(CHS_HPO_ID, 'person')
        pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
        q = '''SELECT DISTINCT person_id FROM (
           SELECT person_id FROM {dataset_id}.{chs_person_table_id}
           UNION ALL
           SELECT person_id FROM {dataset_id}.{pitt_person_table_id}
        ) ORDER BY person_id ASC'''.format(
            dataset_id=self.input_dataset_id,
            chs_person_table_id=chs_person_table_id,
            pitt_person_table_id=pitt_person_table_id)
        response = bq_utils.query(q)
        expected_rows = test_util.response2rows(response)
        person_table_id = ehr_union.output_table_for('person')
        q = '''SELECT DISTINCT person_id 
               FROM {dataset_id}.{table_id} 
               ORDER BY person_id ASC'''.format(
            dataset_id=self.output_dataset_id, table_id=person_table_id)
        response = bq_utils.query(q)
        actual_rows = test_util.response2rows(response)
        # both queries order by person_id, so the row lists compare directly
        self.assertListEqual(expected_rows, actual_rows)