Example #1
0
def get_tables_with_person_id(input_dataset):
    """
    Get list of tables that have a person_id column

    :param input_dataset: identifier of the dataset to inspect; substituted
        for the INPUT_DATASET placeholder in PERSON_TABLE_QUERY
    :return: list of the table_name values returned by the query
    """
    person_table_query = PERSON_TABLE_QUERY.format(INPUT_DATASET=input_dataset)
    person_tables_df = bq.query(person_table_query)
    # Series.get_values() was deprecated in pandas 0.25 and removed in 1.0;
    # tolist() is the supported way to obtain a plain Python list
    return person_tables_df.table_name.tolist()
Example #2
0
  row_count
FROM
(SELECT *
 FROM {DEID}.__TABLES__
 WHERE table_id LIKE '%\\\_ext'

 UNION ALL

 SELECT * 
 FROM {COMBINED}.__TABLES__ d1
 WHERE table_id LIKE '\\\_mapping\\\_%')

ORDER BY REPLACE(REPLACE(table_id, '_mapping_', ''), '_ext', ''), dataset_id
"""
# Fill the dataset names into ROW_COUNTS_QUERY, run it and show the result
q = ROW_COUNTS_QUERY.format(COMBINED=COMBINED, DEID=DEID)
row_counts_df = bq.query(q)
render.dataframe(row_counts_df)

# ## Side by side comparison of row counts

# Reshape to one row per mapped_table and one column per dataset_id,
# with row_count as the cell value
compare_df = row_counts_df.pivot(index='mapped_table',
                                 columns='dataset_id',
                                 values='row_count')
render.dataframe(compare_df)

# ## Row count differences
# The combined mapping tables and deid ext tables are expected to have the same number of rows. Below we find where the row counts differ.

# Keep only the rows of compare_df where the DEID and COMBINED columns differ.
# DataFrame.query parses the expression as Python, where `<>` is a syntax
# error on Python 3, so the inequality is written as `!=`.
query_str = '{DEID} != {COMBINED}'.format(COMBINED=COMBINED, DEID=DEID)
diff_row_counts_df = compare_df.query(query_str)
render.dataframe(diff_row_counts_df)
(
  SELECT
    observation_id
  FROM (
    SELECT
      DENSE_RANK() OVER(PARTITION BY person_id,
        observation_source_concept_id,
        value_source_concept_id
      ORDER BY
        observation_datetime DESC,
        observation_id DESC) AS rank_order,
      observation_id
    FROM
      `{DEID}.observation` 
    JOIN
      `{COMBINED}._mapping_observation` as map
    USING
    (observation_id)
      WHERE observation_source_concept_id IN (1586140, 1585838, 1585952) -- race, gender, employment status
      AND value_source_concept_id IN (2000000008, 2000000005, 2000000004, 2000000002)
      AND map.src_hpo_id like "rdr"
    ) o
  WHERE 
    o.rank_order <> 1
) unique_observation_ids 
ON o.observation_id = unique_observation_ids.observation_id
"""
# Fill the dataset names into DUPLICATE_GEN_RACE_QUERY and run it
q = DUPLICATE_GEN_RACE_QUERY.format(DEID=DEID, COMBINED=COMBINED)
df = bq.query(q)
# Bare expression -- presumably so the notebook displays the result; confirm
df
Example #4
0
def rdr_ehr_name_match(hpo_ids):
    """
    Run the RDR/EHR name match query for the given hpo_ids

    :param hpo_ids: passed to get_rdr_ehr_name_match_query to build the query
    :return: whatever bq.query returns for the generated query
    """
    return bq.query(get_rdr_ehr_name_match_query(hpo_ids))
Example #5
0
# # IDs whose records must be retracted

# Determine associated research IDs for RDR participants whose data must be retracted
# The query returns distinct (person_id, research_id) pairs by joining
# `{RDR}.observation` to `{COMBINED}.deid_map` on person_id, keeping only
# observations with source concept 1586140 and value source concept 1586141.
# NOTE(review): the AIAN_ prefix suggests 1586141 is the AIAN race answer --
# confirm against the vocabulary
AIAN_PID_QUERY = """
SELECT DISTINCT 
       rdr.person_id    AS person_id,
       deid.research_id AS research_id
FROM `{RDR}.observation` rdr
 JOIN `{COMBINED}.deid_map` deid
  ON rdr.person_id = deid.person_id
WHERE 
    rdr.observation_source_concept_id = 1586140 
AND rdr.value_source_concept_id       = 1586141
"""
q = AIAN_PID_QUERY.format(RDR=RDR, COMBINED=COMBINED)
aian_pid_df = bq.query(q)
render.dataframe(aian_pid_df)

# Save research IDs to a table in the sandbox
# if_exists='fail' makes the upload raise rather than overwrite an existing ID_TABLE
aian_pid_df.to_gbq(destination_table=ID_TABLE, if_exists='fail')

# # Expected row counts after retraction vs. actual row counts

# +
# Template for a query listing the table_name of every column named
# person_id in a dataset's INFORMATION_SCHEMA.COLUMNS view;
# {INPUT_DATASET} is a placeholder for the dataset identifier
PERSON_TABLE_QUERY = """
SELECT table_name
FROM `{INPUT_DATASET}.INFORMATION_SCHEMA.COLUMNS`
WHERE COLUMN_NAME = 'person_id'
"""
def get_tables_with_person_id(input_dataset):
    """
  LEFT JOIN `{rdr_project}.GcsBucketLogging.cloudaudit_googleapis_com_data_access_*` l
   ON l.resource.labels.bucket_name = h.bucket_name
WHERE
  _TABLE_SUFFIX >= '{end_suffix}'
  AND protopayload_auditlog.authenticationInfo.principalEmail IS NOT NULL
  AND ENDS_WITH(protopayload_auditlog.authenticationInfo.principalEmail, 'pmi-ops.org')
  AND protopayload_auditlog.methodName = 'storage.objects.create'
  AND resource.labels.bucket_name LIKE 'aou%'
  AND protopayload_auditlog.resourceName LIKE '%person.csv'
GROUP BY
  h.hpo_id,
  m.Site_Name,
  resource.labels.bucket_name,
  protopayload_auditlog.authenticationInfo.principalEmail
ORDER BY MAX(timestamp) ASC""".format(rdr_project=RDR_PROJECT_ID, end_suffix=end_suffix)
# Run `query`; the returned value is not bound to a name
bq.query(query)

# ## EHR Site Submission Counts

bq.query('''
SELECT 
  l.Org_ID AS org_id,
  l.HPO_ID AS hpo_id,
  l.Site_Name AS site_name,
  table_id AS table_id, 
  row_count AS row_count
FROM `{EHR_DATASET_ID}.__TABLES__` AS t
JOIN `lookup_tables.hpo_site_id_mappings` AS l  
  ON STARTS_WITH(table_id,lower(l.HPO_ID))=true
WHERE table_id like '%person%' AND
NOT(table_id like '%unioned_ehr_%') AND 
Example #7
0
# # Person
# ## Person ID validation

from notebooks import bq
from notebooks.parameters import RDR_DATASET_ID, EHR_DATASET_ID

# Report sites where the number of invalid / total participant IDs
# exceeds this threshold and provide diagnostics
# (the threshold is a ratio, so 0.5 corresponds to half of the participant IDs)
INVALID_THRESHOLD = 0.5

# Get list of all hpo_ids
# Each hpo_id is a table_id ending in 'person' with '_person' removed; tables
# containing 'unioned_ehr_' or starting with an underscore are excluded.
# The backslashes are written as `\\\\_` (runtime text `\\_`) because `\_` is
# an invalid escape sequence in a non-raw Python string literal.
hpo_ids = bq.query("""
SELECT REPLACE(table_id, '_person', '') AS hpo_id
FROM `{EHR_DATASET_ID}.__TABLES__`
WHERE table_id LIKE '%person' 
AND table_id NOT LIKE '%unioned_ehr_%' AND table_id NOT LIKE '\\\\_%'
""".format(EHR_DATASET_ID=EHR_DATASET_ID)).hpo_id.tolist()

# For each site submission, how many person_ids cannot be found in the latest RDR dump (*not_in_rdr*) or are not valid 9-digit participant identifiers (_invalid_).

subqueries = []
subquery = """
SELECT
 '{h}' AS hpo_id,
 not_in_rdr.n AS not_in_rdr,
 invalid.n AS invalid,
 CAST(T.row_count AS INT64) AS total
FROM {EHR_DATASET_ID}.__TABLES__ T
LEFT JOIN
(SELECT COUNT(1) AS n
 FROM {EHR_DATASET_ID}.{h}_person e
Example #8
0
(SELECT o.person_id, 
  o.questionnaire_response_id, 
  STRING_AGG(REPLACE(c.concept_code, 'WhatRaceEthnicity_', ''), ' ' ORDER BY value_source_value) selected_races
 FROM {DATASET}.observation o
 JOIN {VOCAB}.concept c ON o.value_source_concept_id = c.concept_id  
 WHERE observation_source_concept_id = 1586140
 GROUP BY person_id, questionnaire_response_id)
 
SELECT 
  selected_races, 
  (LENGTH(selected_races) - LENGTH(REPLACE(selected_races, ' ', '')) + 1) AS selected_count,
  COUNT(DISTINCT person_id) row_count
FROM race_combo 
GROUP BY selected_races
ORDER BY selected_count, selected_races
"""

render.md('## In dataset `{RDR}`'.format(RDR=RDR))

# Run the distribution query against the RDR dataset and show the result
q = MULTIRACIAL_DIST_QUERY.format(DATASET=RDR, VOCAB=VOCAB)
multi_race_count_df = bq.query(q)
render.dataframe(multi_race_count_df)

render.md('## In dataset `{DEID}`'.format(DEID=DEID))

# Generalization during the privacy methodology should limit the populations represented in the deidentified dataset to those who selected 1 or 2 races only. Where 2 races are selected, Hispanic must be one of them.

# Same query against the deid dataset; q and multi_race_count_df are rebound,
# so the RDR results above are no longer held in these names
q = MULTIRACIAL_DIST_QUERY.format(DATASET=DEID, VOCAB=VOCAB)
multi_race_count_df = bq.query(q)
render.dataframe(multi_race_count_df)
  FROM
    `{DATASET}.person` p
    LEFT JOIN `{DATASET}.observation` o
     ON p.person_id = o.person_id AND observation_source_concept_id = 1585838)

SELECT g.person_id,
  g.gender_concept_id,
  g.gender_source_concept_id,
  c.concept_code AS gender_source_value
FROM gender g
JOIN `{DATASET}.concept` c
  ON g.gender_source_concept_id = c.concept_id
"""

# Run the sex-at-birth query against DEID_DATASET_ID and show the result
q = SEX_AT_BIRTH_QUERY.format(DATASET=DEID_DATASET_ID)
sex_at_birth_df = bq.query(q)
render.dataframe(sex_at_birth_df)

# Run the gender query against the same dataset and show the result
q = GENDER_QUERY.format(DATASET=DEID_DATASET_ID)
gender_df = bq.query(q)
render.dataframe(gender_df)

# +
from pandas_gbq.gbq import TableCreationError

def df_to_gbq(df, destination_table, table_schema=None):
    """
    Upload a dataframe to a BigQuery table without replacing an existing one

    If the upload raises TableCreationError, a message naming the
    destination table is printed and the error is not propagated.

    :param df: object whose to_gbq method performs the upload
    :param destination_table: name of the table to write to
    :param table_schema: optional schema forwarded to to_gbq
    """
    upload_options = dict(destination_table=destination_table,
                          if_exists='fail',
                          table_schema=table_schema)
    try:
        df.to_gbq(**upload_options)
    except TableCreationError:
        print('Using existing {} table'.format(destination_table))
  AND value_source_concept_id = 1585839   -- GenderIdentity_Man
  AND person_id IN (
  SELECT
    person_id
  FROM
    `{DATASET}.observation`
  WHERE
    observation_source_concept_id = 1585845  -- BiologicalSexAtBirth_SexAtBirth
      AND value_source_concept_id = 1585847) -- SexAtBirth_Female
"""

# ## Deid dataset
# How many rows in the deid dataset **do not** have the appropriate generalization applied for each scenario?

# Run the check against the deid dataset
q = GENDER_SEX_DIFF_QUERY.format(DATASET=DEID)
deid_gender_sex_diff_df = bq.query(q)
# True when at least one returned row has row_count > 0
deid_has_diff_obs = not deid_gender_sex_diff_df.query('row_count > 0').empty
render.dataframe(deid_gender_sex_diff_df)

# Show number of person_ids associated with each combination (person.gender, ppi)
GENDER_SEX_HIST_QUERY = """
WITH sex_at_birth AS (
  SELECT
    person_id,
    value_source_concept_id AS sex_at_birth_concept_id 
  FROM
    `{DATASET}.observation`
  WHERE
    observation_source_concept_id = 1585845) -- BiologicalSexAtBirth_SexAtBirth
,
  m3.bin_lower_bound,
  m3.bin_upper_bound
ORDER BY
  m3.measurement_concept_id,
  m3.unit_concept_id,
  m3.bin
"""

# Check the number of records associated with each unit before and after the unit transformation. In theory, the number of records should be the same before and after the transformation.

# Fill the table and dataset names into the count template, run it and show the result
unit_conversion_count_query = UNIT_CONVERSION_COUNT_TEMPLATE.format(
    TABLE_BEFORE_CONVERSION=TABLE_BEFORE_CONVERSION,
    TABLE_AFTER_CONVERSION=TABLE_AFTER_CONVERSION,
    UNIT_MAPPING=UNIT_MAPPING,
    VOCAB=VOCAB)
unit_conversion_count = bq.query(unit_conversion_count_query)
render.dataframe(unit_conversion_count)

# Compute the first, median and third quartiles before and after the unit transformation

# The stats template takes the same four placeholders as the count template
unit_conversion_stats_query = UNIT_CONVERSION_STATS_TEMPLATE.format(
    TABLE_BEFORE_CONVERSION=TABLE_BEFORE_CONVERSION,
    TABLE_AFTER_CONVERSION=TABLE_AFTER_CONVERSION,
    UNIT_MAPPING=UNIT_MAPPING,
    VOCAB=VOCAB)
unit_conversion_stats = bq.query(unit_conversion_stats_query)
# Convert measurement_concept_id values to str -- presumably so the ids are
# rendered as plain text rather than formatted numbers; confirm
unit_conversion_stats.measurement_concept_id = unit_conversion_stats.measurement_concept_id.apply(
    str)
render.dataframe(unit_conversion_stats)

# +
(
  SELECT
    measurement_id,
    person_id,
    IF(measurement_concept_id IS NULL OR measurement_concept_id=0, measurement_source_concept_id, measurement_concept_id) AS measurement_concept_id
  FROM
    `{DATASET_ID}.measurement` 
) meas
JOIN
  `{DATASET_ID}._mapping_measurement`
USING
  (measurement_id)
JOIN 
  get_measurement_concept_sets_descendants AS valid_lab
ON
  meas.measurement_concept_id = valid_lab.descendant_concept_id
GROUP BY
  1,
  2,
  3,
  4,
  5
ORDER BY 
  1,2
"""

# Fill the vocabulary and dataset ids into IDENTIFY_LABS_QUERY, run it and show the result
identify_labs_query_results = bq.query(
    IDENTIFY_LABS_QUERY.format(VOCAB_DATASET_ID=VOCAB_DATASET_ID,
                               DATASET_ID=DATASET_ID))
render.dataframe(identify_labs_query_results)