コード例 #1
0
  SELECT
    person_id
  FROM
    `{DATASET}.observation`
  WHERE
    observation_source_concept_id = 1585845  -- BiologicalSexAtBirth_SexAtBirth
      AND value_source_concept_id = 1585847) -- SexAtBirth_Female
"""

# ## Deid dataset
# How many rows in the deid dataset **do not** have the appropriate generalization applied for each scenario?

q = GENDER_SEX_DIFF_QUERY.format(DATASET=DEID)
deid_gender_sex_diff_df = utils.bq.query(q)
deid_has_diff_obs = len(utils.bq.query('row_count > 0').index) > 0
render.dataframe(deid_gender_sex_diff_df)

# Show number of person_ids associated with each combination (person.gender, ppi)
GENDER_SEX_HIST_QUERY = """
WITH sex_at_birth AS (
  SELECT
    person_id,
    value_source_concept_id AS sex_at_birth_concept_id 
  FROM
    `{DATASET}.observation`
  WHERE
    observation_source_concept_id = 1585845) -- BiologicalSexAtBirth_SexAtBirth
,

gender_identity AS (
  SELECT
コード例 #2
0
FROM
(SELECT *
 FROM {DEID}.__TABLES__
 WHERE table_id LIKE '%\\\_ext'

 UNION ALL

 SELECT * 
 FROM {COMBINED}.__TABLES__ d1
 WHERE table_id LIKE '\\\_mapping\\\_%')

ORDER BY REPLACE(REPLACE(table_id, '_mapping_', ''), '_ext', ''), dataset_id
"""
q = ROW_COUNTS_QUERY.format(COMBINED=COMBINED, DEID=DEID)
row_counts_df = bq.query(q)
render.dataframe(row_counts_df)

# ## Side by side comparison of row counts

compare_df = row_counts_df.pivot(index='mapped_table',
                                 columns='dataset_id',
                                 values='row_count')
render.dataframe(compare_df)

# ## Row count differences
# The combined mapping tables and deid ext tables are expected to have the same number of rows. Below we find where the row counts differ.

query_str = '{DEID} <> {COMBINED}'.format(COMBINED=COMBINED, DEID=DEID)
diff_row_counts_df = compare_df.query(query_str)
render.dataframe(diff_row_counts_df)
コード例 #3
0
(SELECT o.person_id, 
  o.questionnaire_response_id, 
  STRING_AGG(REPLACE(c.concept_code, 'WhatRaceEthnicity_', ''), ' ' ORDER BY value_source_value) selected_races
 FROM {DATASET}.observation o
 JOIN {VOCAB}.concept c ON o.value_source_concept_id = c.concept_id  
 WHERE observation_source_concept_id = 1586140
 GROUP BY person_id, questionnaire_response_id)
 
SELECT 
  selected_races, 
  (LENGTH(selected_races) - LENGTH(REPLACE(selected_races, ' ', '')) + 1) AS selected_count,
  COUNT(DISTINCT person_id) row_count
FROM race_combo 
GROUP BY selected_races
ORDER BY selected_count, selected_races
"""

render.md('## In dataset `{RDR}`'.format(RDR=RDR))

q = MULTIRACIAL_DIST_QUERY.format(DATASET=RDR, VOCAB=VOCAB)
multi_race_count_df = utils.bq.query(q)
render.dataframe(multi_race_count_df)

render.md('## In dataset `{DEID}`'.format(DEID=DEID))

# Generalization during the privacy methodology should limit the populations represented in the deidentified dataset to those who selected 1 or 2 races only. Where 2 races are selected, Hispanic must be one of them.

q = MULTIRACIAL_DIST_QUERY.format(DATASET=DEID, VOCAB=VOCAB)
multi_race_count_df = utils.bq.query(q)
render.dataframe(multi_race_count_df)
コード例 #4
0
  m3.bin_upper_bound
ORDER BY
  m3.measurement_concept_id,
  m3.unit_concept_id,
  m3.bin
"""

# Check the number of records associated with the units before and after the unit transformation. Theoretically the number of records units should be same as before after the unit transformation.

unit_conversion_count_query = UNIT_CONVERSION_COUNT_TEMPLATE.format(
    TABLE_BEFORE_CONVERSION=TABLE_BEFORE_CONVERSION,
    TABLE_AFTER_CONVERSION=TABLE_AFTER_CONVERSION,
    UNIT_MAPPING=UNIT_MAPPING,
    VOCAB=VOCAB)
unit_conversion_count = utils.bq.query(unit_conversion_count_query)
render.dataframe(unit_conversion_count)

# Compute the first, median and third quartiles before and after the unit transformation

unit_conversion_stats_query = UNIT_CONVERSION_STATS_TEMPLATE.format(
    TABLE_BEFORE_CONVERSION=TABLE_BEFORE_CONVERSION,
    TABLE_AFTER_CONVERSION=TABLE_AFTER_CONVERSION,
    UNIT_MAPPING=UNIT_MAPPING,
    VOCAB=VOCAB)
unit_conversion_stats = utils.bq.query(unit_conversion_stats_query)
unit_conversion_stats.measurement_concept_id = unit_conversion_stats.measurement_concept_id.apply(
    str)
render.dataframe(unit_conversion_stats)

# +
before_unit_conversion_dist_query = UNIT_DISTRIBUTION_QUERY.format(
コード例 #5
0
# Determine associated research IDs for RDR participants whose data must be retracted
AIAN_PID_QUERY = """
SELECT DISTINCT 
       rdr.person_id    AS person_id,
       deid.research_id AS research_id
FROM `{RDR}.observation` rdr
 JOIN `{COMBINED}.deid_map` deid
  ON rdr.person_id = deid.person_id
WHERE 
    rdr.observation_source_concept_id = 1586140 
AND rdr.value_source_concept_id       = 1586141
"""
q = AIAN_PID_QUERY.format(RDR=RDR, COMBINED=COMBINED)
aian_pid_df = bq.query(q)
render.dataframe(aian_pid_df)

# Save research IDs to a table in the sandbox
aian_pid_df.to_gbq(destination_table=ID_TABLE, if_exists='fail')

# # Expected row counts after retraction vs. actual row counts

# +
PERSON_TABLE_QUERY = """
SELECT table_name
FROM `{INPUT_DATASET}.INFORMATION_SCHEMA.COLUMNS`
WHERE COLUMN_NAME = 'person_id'
"""
def get_tables_with_person_id(input_dataset):
    """
    Get list of tables that have a person_id column
コード例 #6
0
(
  SELECT
    measurement_id,
    person_id,
    IF(measurement_concept_id IS NULL OR measurement_concept_id=0, measurement_source_concept_id, measurement_concept_id) AS measurement_concept_id
  FROM
    `{DATASET_ID}.measurement` 
) meas
JOIN
  `{DATASET_ID}._mapping_measurement`
USING
  (measurement_id)
JOIN 
  get_measurement_concept_sets_descendants AS valid_lab
ON
  meas.measurement_concept_id = valid_lab.descendant_concept_id
GROUP BY
  1,
  2,
  3,
  4,
  5
ORDER BY 
  1,2
"""

identify_labs_query_results = utils.bq.query(
    IDENTIFY_LABS_QUERY.format(VOCAB_DATASET_ID=VOCAB_DATASET_ID,
                               DATASET_ID=DATASET_ID))
render.dataframe(identify_labs_query_results)
コード例 #7
0
    `{DATASET}.person` p
    LEFT JOIN `{DATASET}.observation` o
     ON p.person_id = o.person_id AND observation_source_concept_id = 1585838)

SELECT g.person_id,
  g.gender_concept_id,
  g.gender_source_concept_id,
  c.concept_code AS gender_source_value
FROM gender g
JOIN `{DATASET}.concept` c
  ON g.gender_source_concept_id = c.concept_id
"""

q = SEX_AT_BIRTH_QUERY.format(DATASET=DEID_DATASET_ID)
sex_at_birth_df = bq.query(q)
render.dataframe(sex_at_birth_df)

q = GENDER_QUERY.format(DATASET=DEID_DATASET_ID)
gender_df = bq.query(q)
render.dataframe(gender_df)

# +
from pandas_gbq.gbq import TableCreationError

def df_to_gbq(df, destination_table, table_schema=None):
    try:
        df.to_gbq(destination_table=destination_table, if_exists='fail', table_schema=table_schema)
    except TableCreationError as table_creation_error:
        print('Using existing {} table'.format(destination_table))

コード例 #8
0
ファイル: sex_gender_540.py プロジェクト: rfrancis1/curation
`{DEID_DATASET_ID}.observation` o
WHERE
o.observation_concept_id = 1585838
)

GROUP BY 1, 2
ORDER BY count DESC
"""

# +
gender = gender.format(
    DEID_DATASET_ID=DEID_DATASET_ID,
    PERSON_TABLE_BEFORE_CLEANING_RULE=PERSON_TABLE_BEFORE_CLEANING_RULE)

gender_output = utils.bq.query(gender)
render.dataframe(gender_output)
# -

# ### After the cleaning rule - get the counts for the gender concept

gender_cleaned = """
SELECT
DISTINCT
p.gender_concept_id, c.concept_name, COUNT(*) as count
FROM
`{PERSON_TABLE_AFTER_CLEANING_RULE}` p
JOIN
`{CONCEPT_TABLE}` c
ON
p.gender_concept_id = c.concept_id