Example #1
def add_metadata(dataset_id, project_id, table_fields, field_values=None):
    """
    Adds the metadata values passed in as parameters to the metadata table

    :param dataset_id: Name of the dataset
    :param project_id: Name of the project
    :param table_fields: field list of a table
    :param field_values: dictionary of field values passed as parameters
    :return: None
    """
    etl_check = get_etl_version(dataset_id, project_id)
    if not etl_check:
        add_etl_query = ADD_ETL_METADATA_QUERY.format(
            project=project_id,
            dataset=dataset_id,
            etl_version=ETL_VERSION,
            field_value=field_values[ETL_VERSION])
        bq.query(add_etl_query, project_id=project_id)

    update_statement = parse_update_statement(table_fields, field_values)
    if update_statement != '':
        q = UPDATE_QUERY.format(project=project_id,
                                dataset=dataset_id,
                                table=METADATA_TABLE,
                                statement=update_statement,
                                etl_version=ETL_VERSION,
                                etl_value=field_values[ETL_VERSION])
        bq.query(q, project_id=project_id)
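A minimal usage sketch for add_metadata (not from the source); the project and dataset names are placeholders, and it assumes the module constant ETL_VERSION equals the ETL-version field name (e.g. 'etl_version'):

# hypothetical names; assumes ETL_VERSION == 'etl_version'
table_fields = ['etl_version', 'copy_date']
field_values = {'etl_version': '5', 'copy_date': '2020-01-01'}
add_metadata('combined20200101', 'my-gcp-project', table_fields, field_values)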
Example #2
def get_etl_version(dataset_id, project_id):
    etl_version = bq.query(ETL_VERSION_CHECK.format(etl=ETL_VERSION,
                                                    project=project_id,
                                                    dataset=dataset_id,
                                                    table=METADATA_TABLE),
                           project_id=project_id)[ETL_VERSION].tolist()
    return etl_version
Example #3
def get_etl_version(dataset_id, project_id):
    """
    Gets the etl version

    :param dataset_id: Name of the dataset
    :param project_id: Name of the project
    :return: list of etl versions recorded for the dataset (may be empty)
    """
    etl_version = bq.query(ETL_VERSION_CHECK.render(etl=ETL_VERSION,
                                                    project=project_id,
                                                    dataset=dataset_id,
                                                    table=METADATA_TABLE),
                           project_id=project_id)[ETL_VERSION].tolist()
    return etl_version
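A usage sketch for get_etl_version; an empty result is what add_metadata in Example #1 treats as "no ETL version recorded yet". Names are placeholders:

# placeholder project/dataset names
versions = get_etl_version('combined20200101', 'my-gcp-project')
if not versions:
    print('no ETL version recorded for this dataset')
else:
    print('ETL version(s):', versions)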
Example #4
def count_pid_rows_in_dataset(project_id, dataset_id, hpo_id, pid_source):
    """
    Returns df containing tables and counts of participant rows for pids in pid_source

    :param project_id: identifies the project
    :param dataset_id: identifies the dataset
    :param hpo_id: identifies the HPO site that submitted the pids
    :param pid_source: string containing query or list containing pids
    :return: df with headers table_id, all_counts, all_ehr_counts, and map_ehr_counts
    """
    dataset_type = ru.get_dataset_type(dataset_id)
    counts_df = pd.DataFrame(columns=[
        ru_consts.TABLE_ID, consts.ALL_COUNT, consts.ALL_EHR_COUNT,
        consts.MAP_EHR_COUNT
    ])
    bq_client = bq.get_client(project_id)
    cols_query = bq.dataset_columns_query(project_id, dataset_id)
    table_df = bq_client.query(cols_query).to_dataframe()

    if dataset_type == common.COMBINED:
        query = get_combined_deid_query(project_id, dataset_id, pid_source,
                                        table_df)
    elif dataset_type == common.DEID or dataset_type == common.RELEASE:
        query = get_combined_deid_query(project_id,
                                        dataset_id,
                                        pid_source,
                                        table_df,
                                        for_deid=True)
    elif dataset_type == common.EHR:
        query = get_ehr_query(project_id, dataset_id, pid_source, hpo_id,
                              table_df)
    elif dataset_type == common.RDR:
        query = get_dataset_query(project_id,
                                  dataset_id,
                                  pid_source,
                                  table_df,
                                  for_rdr=True)
    else:
        query = get_dataset_query(project_id, dataset_id, pid_source, table_df)

    if query:
        counts_df = bq.query(query, project_id)
        # sort by count desc
        counts_df = counts_df.sort_values(by=consts.ALL_COUNT, ascending=False)
    return counts_df
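A usage sketch for count_pid_rows_in_dataset, assuming pid_source may be a plain list of person_ids as the docstring allows; all identifiers are placeholders:

pids = [101, 102, 103]  # hypothetical participant IDs
counts_df = count_pid_rows_in_dataset('my-gcp-project', 'combined20200101',
                                      'hpo_site_x', pids)  # placeholder ids
# one row per table, sorted by all_counts descending
print(counts_df.head())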
Example #5
    def get_tables(self, table_type):
        """
        Retrieves mapping/ext tables in dataset

        :param table_type: can take values 'mapping' or 'ext', identifies
            tables in the dataset with the given type

        :return: list of tables in the dataset which are mapping or ext tables of cdm_tables
        """
        tables_query = GET_TABLES_QUERY.format(project=self.project_id,
                                               dataset=self.dataset_id,
                                               table_type=table_type)
        tables = bq.query(tables_query).get(TABLE_NAME).to_list()
        cdm_tables = set(resources.CDM_TABLES)
        tables = [
            table for table in tables
            if self.get_cdm_table(table, table_type) in cdm_tables
        ]
        return tables
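A usage sketch for get_tables; DatasetHandler is a hypothetical stand-in for whatever class defines this method, with project_id and dataset_id set on the instance:

handler = DatasetHandler(project_id='my-gcp-project',
                         dataset_id='combined20200101')  # hypothetical class
mapping_tables = handler.get_tables('mapping')  # e.g. ['_mapping_observation', ...]
ext_tables = handler.get_tables('ext')          # e.g. ['observation_ext', ...]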
Example #6
def row_counts(dataset_ids):
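    """
    Summarizes row counts per table across the given datasets

    :param dataset_ids: list of dataset names; tables whose ids contain
        'union' or 'ipmc' are excluded
    :return: df with columns dataset_id, table_id, row_count and a derived
        load_date; also renders a bar chart per table as a side effect
    """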
    sq = "SELECT '{dataset_id}' dataset_id, table_id, row_count FROM `{dataset_id}.__TABLES__`"
    sqs = [sq.format(dataset_id=d) for d in dataset_ids]
    iq = "\nUNION ALL\n".join(sqs)
    q = """ 
    SELECT dataset_id, table_id, row_count 
    FROM ({iq})
    WHERE table_id NOT LIKE '%union%' 
      AND table_id NOT LIKE '%ipmc%'
    ORDER BY table_id, dataset_id""".format(iq=iq)
    df = bq.query(q)
    df['load_date'] = df.dataset_id.str[-8:]
    df['load_date'] = df['load_date'].astype('category')
    df['dataset_id'] = df['dataset_id'].astype('category')
    df['table_id'] = df['table_id'].astype('category')
    g = sns.FacetGrid(df, col='table_id', sharey=False, col_wrap=5)
    g.map(sns.barplot, 'dataset_id', 'row_count', ci=None)
    g.set_xticklabels(rotation=45, ha='right')
    return df
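A usage sketch for row_counts; it assumes dataset names end in a YYYYMMDD date (which is what the load_date derivation above expects), and these names are placeholders:

dataset_ids = ['unioned_ehr20200101', 'unioned_ehr20200201']  # hypothetical names
df = row_counts(dataset_ids)  # also renders one bar chart per table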
Example #7
def gender_by_race(dataset_id):
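    """
    Plots counts of persons by gender within each race category

    :param dataset_id: dataset whose person table is summarized; concept
        names come from the DEFAULT_DATASETS.latest vocabulary
    :return: None (renders a FacetGrid of bar charts as a side effect)
    """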
    df = bq.query('''
    SELECT 
     c1.concept_name AS gender,
     c2.concept_name AS race,
     COUNT(1) AS `count`
    FROM `{dataset_id}.person` p
    JOIN `{latest.vocabulary}.concept` c1 
      ON p.gender_concept_id = c1.concept_id
    JOIN `{latest.vocabulary}.concept` c2
      ON p.race_concept_id = c2.concept_id
    GROUP BY c2.concept_name, c1.concept_name
    '''.format(dataset_id=dataset_id, latest=DEFAULT_DATASETS.latest))
    df['race'] = df['race'].astype('category')
    df['gender'] = df['gender'].astype('category')
    g = sns.FacetGrid(df, col='race', hue='gender', col_wrap=5)
    g.map(sns.barplot, 'gender', 'count', ci=None)
    g.set_xticklabels([])
    g.set_axis_labels('', '')
    g.add_legend()
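A usage sketch; since gender_by_race only plots, it is meant for a notebook cell. The dataset name is a placeholder and DEFAULT_DATASETS.latest must be configured as in the source module:

gender_by_race('combined20200101')  # placeholder dataset name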
Example #8
def csv_file_updates(csv_file):
    """
    Summarize the deletes associated with a CSV file
    
    :param csv_file: path to a file where each column is a list of pids and the header is an observation_source_value
    :return: list of dictionaries with keys file_name, observation_source_value, num_pids, num_rows, q
    """

    if not os.path.exists(csv_file):
        raise IOError('File "%s" not found' % csv_file)
    obs_count_fmt = OBS_COUNT_FMT
    obs_query_fmt = OBS_QUERY_FMT
    if DEID:
        obs_count_fmt = DEID_OBS_COUNT_FMT
        obs_query_fmt = DEID_OBS_QUERY_FMT
    file_name = os.path.basename(csv_file)
    csv_df = pd.read_csv(csv_file)
    cols = list(csv_df.columns.to_native_types())
    results = list()
    for col in cols:
        person_ids = csv_df[col].dropna().apply(str).to_list()
        q = obs_count_fmt.format(PROJECT_ID=PROJECT_ID,
                                 TARGET_DATASET_ID=TARGET_DATASET_ID,
                                 COMBINED=COMBINED,
                                 OBSERVATION_SOURCE_VALUE=col,
                                 PERSON_IDS=', '.join(person_ids))
        num_rows_result = query(q)
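        # rebuild q as the row-level query so the returned dict carries an
        # inspectable query rather than the count query just executed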
        q = obs_query_fmt.format(PROJECT_ID=PROJECT_ID,
                                 TARGET_DATASET_ID=TARGET_DATASET_ID,
                                 COMBINED=COMBINED,
                                 OBSERVATION_SOURCE_VALUE=col,
                                 PERSON_IDS=', '.join(person_ids))
        num_rows = num_rows_result.iloc[0]['n']
        result = dict(file_name=file_name,
                      observation_source_value=col,
                      q=q,
                      num_pids=len(person_ids),
                      num_rows=num_rows)
        results.append(result)
    return results
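A usage sketch for csv_file_updates; the path is hypothetical, and the module-level constants (PROJECT_ID, TARGET_DATASET_ID, COMBINED, DEID and the query templates) must already be defined as in the source module:

for update in csv_file_updates('/path/to/retraction_pids.csv'):  # hypothetical path
    print(update['observation_source_value'],
          update['num_pids'], update['num_rows'])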
Example #9
# NOTE: the opening of this query is an assumed reconstruction; the original
# head (including the exact age expression) was truncated
q = """
SELECT
  EXTRACT(YEAR FROM CURRENT_DATE()) - r.year_of_birth AS age,
  gc.concept_name AS gender,
  rc.concept_name AS race,
  ec.concept_name AS ethnicity,
  CASE WHEN e.person_id IS NULL THEN 'no' ELSE 'yes' END AS has_ehr_data
FROM `{latest.rdr}.person` r
  LEFT JOIN `{latest.unioned}.person` e 
    ON r.person_id = e.person_id
JOIN `{latest.vocabulary}.concept` gc 
  ON r.gender_concept_id = gc.concept_id
JOIN `{latest.vocabulary}.concept` rc
  ON r.race_concept_id = rc.concept_id
JOIN `{latest.vocabulary}.concept` ec
  ON r.ethnicity_concept_id = ec.concept_id
ORDER BY age, gender, race
""".format(latest=DEFAULT_DATASETS.latest)
df = bq.query(q)

# ## Presence of EHR data by race

# +
df['race'] = df['race'].astype('category')
df['ethnicity'] = df['ethnicity'].astype('category')
df['has_ehr_data'] = df['has_ehr_data'].astype('category')

# exclude anomalous records where age < 18 or age >= 100
f = df[(df.age > 17) & (df.age < 100)]
g = sns.factorplot('race',
                   data=f,
                   aspect=4,
                   size=3.25,
                   kind='count',