def add_metadata(dataset_id, project_id, table_fields, field_values=None):
    """
    Adds the metadata values passed in as parameters to the metadata table

    :param dataset_id: Name of the dataset
    :param project_id: Name of the project
    :param table_fields: field list of a table
    :param field_values: dictionary of field values passed as parameters
    :return: None
    """
    etl_check = get_etl_version(dataset_id, project_id)
    if not etl_check:
        add_etl_query = ADD_ETL_METADATA_QUERY.format(
            project=project_id,
            dataset=dataset_id,
            etl_version=ETL_VERSION,
            field_value=field_values[ETL_VERSION])
        bq.query(add_etl_query, project_id=project_id)
    update_statement = parse_update_statement(table_fields, field_values)
    if update_statement != '':
        q = UPDATE_QUERY.format(project=project_id,
                                dataset=dataset_id,
                                table=METADATA_TABLE,
                                statement=update_statement,
                                etl_version=ETL_VERSION,
                                etl_value=field_values[ETL_VERSION])
        bq.query(q, project_id=project_id)
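# A minimal usage sketch (hypothetical names throughout): record the ETL
# version for a dataset. metadata_table_fields stands in for the metadata
# table's field list as consumed by parse_update_statement.
add_metadata(dataset_id='deid_dataset',
             project_id='my-gcp-project',
             table_fields=metadata_table_fields,  # hypothetical: metadata table schema fields
             field_values={ETL_VERSION: '2'})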
def get_etl_version(dataset_id, project_id):
    """
    Gets the etl version

    :param dataset_id: Name of the dataset
    :param project_id: Name of the project
    :return: list of etl versions found in the metadata table
    """
    etl_version = bq.query(
        ETL_VERSION_CHECK.render(etl=ETL_VERSION,
                                 project=project_id,
                                 dataset=dataset_id,
                                 table=METADATA_TABLE),
        project_id=project_id)[ETL_VERSION].tolist()
    return etl_version
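# Illustrative check (hypothetical ids): get_etl_version returns a list, so an
# empty result means no etl_version row exists yet, which is the condition
# add_metadata tests before inserting one.
if not get_etl_version('deid_dataset', 'my-gcp-project'):
    print('no ETL version recorded yet')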
def count_pid_rows_in_dataset(project_id, dataset_id, hpo_id, pid_source):
    """
    Returns df containing tables and counts of participant rows for pids in pid_source

    :param project_id: identifies the project
    :param dataset_id: identifies the dataset
    :param hpo_id: identifies the hpo site that submitted the pids
    :param pid_source: string containing query or list containing pids
    :return: df with headers table_id, all_counts, all_ehr_counts, and map_ehr_counts
    """
    dataset_type = ru.get_dataset_type(dataset_id)
    counts_df = pd.DataFrame(columns=[
        ru_consts.TABLE_ID, consts.ALL_COUNT, consts.ALL_EHR_COUNT,
        consts.MAP_EHR_COUNT
    ])
    bq_client = bq.get_client(project_id)
    cols_query = bq.dataset_columns_query(project_id, dataset_id)
    table_df = bq_client.query(cols_query).to_dataframe()
    if dataset_type == common.COMBINED:
        query = get_combined_deid_query(project_id, dataset_id, pid_source,
                                        table_df)
    elif dataset_type == common.DEID or dataset_type == common.RELEASE:
        query = get_combined_deid_query(project_id,
                                        dataset_id,
                                        pid_source,
                                        table_df,
                                        for_deid=True)
    elif dataset_type == common.EHR:
        query = get_ehr_query(project_id, dataset_id, pid_source, hpo_id,
                              table_df)
    elif dataset_type == common.RDR:
        query = get_dataset_query(project_id,
                                  dataset_id,
                                  pid_source,
                                  table_df,
                                  for_rdr=True)
    else:
        query = get_dataset_query(project_id, dataset_id, pid_source, table_df)
    if query:
        counts_df = bq.query(query, project_id)
    # sort by count desc
    counts_df = counts_df.sort_values(by=consts.ALL_COUNT, ascending=False)
    return counts_df
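# A usage sketch (hypothetical identifiers): count participant rows for two
# pids in a combined dataset; hpo_id is only consulted for EHR datasets.
pid_counts = count_pid_rows_in_dataset(project_id='my-gcp-project',
                                       dataset_id='combined_dataset',
                                       hpo_id='hpo_site',
                                       pid_source=[123456, 234567])
print(pid_counts)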
def get_tables(self, table_type):
    """
    Retrieves mapping/ext tables in dataset

    :param table_type: can take values 'mapping' or 'ext', identifies
        tables in the dataset with the given type
    :return: list of tables in the dataset which are mapping or ext
        tables of cdm_tables
    """
    tables_query = GET_TABLES_QUERY.format(project=self.project_id,
                                           dataset=self.dataset_id,
                                           table_type=table_type)
    tables = bq.query(tables_query).get(TABLE_NAME).to_list()
    cdm_tables = set(resources.CDM_TABLES)
    tables = [
        table for table in tables
        if self.get_cdm_table(table, table_type) in cdm_tables
    ]
    return tables
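# Illustrative call (hypothetical instance of the enclosing class): list the
# mapping and ext tables that correspond to CDM tables, e.g.
# '_mapping_observation' for 'observation'.
mapping_tables = cleaner.get_tables('mapping')  # 'cleaner' is hypothetical
ext_tables = cleaner.get_tables('ext')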
def row_counts(dataset_ids):
    """
    Plot and return row counts per table for the given datasets

    :param dataset_ids: dataset ids whose __TABLES__ metadata to compare;
        the last 8 characters of each id are treated as a load date
    :return: df with columns dataset_id, table_id, row_count, load_date
    """
    sq = "SELECT '{dataset_id}' dataset_id, table_id, row_count FROM `{dataset_id}.__TABLES__`"
    sqs = [sq.format(dataset_id=d) for d in dataset_ids]
    iq = "\nUNION ALL\n".join(sqs)
    q = """
    SELECT dataset_id, table_id, row_count
    FROM ({iq})
    WHERE table_id NOT LIKE '%union%'
      AND table_id NOT LIKE '%ipmc%'
    ORDER BY table_id, dataset_id""".format(iq=iq)
    df = bq.query(q)
    df['load_date'] = df.dataset_id.str[-8:]
    df['load_date'] = df['load_date'].astype('category')
    df['dataset_id'] = df['dataset_id'].astype('category')
    df['table_id'] = df['table_id'].astype('category')
    g = sns.FacetGrid(df, col='table_id', sharey=False, col_wrap=5)
    g.map(sns.barplot, 'dataset_id', 'row_count', ci=None)
    g.set_xticklabels(rotation=45, ha='right')
    return df
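# Example invocation (hypothetical dataset ids; the trailing 8 characters are
# assumed to be a YYYYMMDD load date, which load_date above is sliced from):
counts = row_counts(['unioned_ehr_20190801', 'unioned_ehr_20190901'])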
def gender_by_race(dataset_id):
    """
    Plot counts of gender faceted by race for the person table in dataset_id
    """
    df = bq.query('''
    SELECT
     c1.concept_name AS gender,
     c2.concept_name AS race,
     COUNT(1) AS `count`
    FROM `{dataset_id}.person` p
    JOIN `{latest.vocabulary}.concept` c1
      ON p.gender_concept_id = c1.concept_id
    JOIN `{latest.vocabulary}.concept` c2
      ON p.race_concept_id = c2.concept_id
    GROUP BY c2.concept_name, c1.concept_name
    '''.format(dataset_id=dataset_id, latest=DEFAULT_DATASETS.latest))
    df['race'] = df['race'].astype('category')
    df['gender'] = df['gender'].astype('category')
    g = sns.FacetGrid(df, col='race', hue='gender', col_wrap=5)
    g.map(sns.barplot, 'gender', 'count', ci=None)
    g.set_xticklabels([])
    g.set_axis_labels('', '')
    g.add_legend()
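# Example (a sketch; assumes DEFAULT_DATASETS.latest.rdr names a dataset
# containing a person table):
gender_by_race(DEFAULT_DATASETS.latest.rdr)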
def csv_file_updates(csv_file):
    """
    Summarize the deletes associated with a CSV file

    :param csv_file: path to a file where each column is a list of pids and
        the header is an observation_source_value
    :return: list of dictionaries with keys file_name,
        observation_source_value, num_pids, num_rows, q
    """
    if not os.path.exists(csv_file):
        raise IOError('File "%s" not found' % csv_file)
    obs_count_fmt = OBS_COUNT_FMT
    obs_query_fmt = OBS_QUERY_FMT
    if DEID:
        obs_count_fmt = DEID_OBS_COUNT_FMT
        obs_query_fmt = DEID_OBS_QUERY_FMT
    file_name = os.path.basename(csv_file)
    csv_df = pd.read_csv(csv_file)
    cols = list(csv_df.columns.to_native_types())
    results = list()
    for col in cols:
        person_ids = csv_df[col].dropna().apply(str).to_list()
        q = obs_count_fmt.format(PROJECT_ID=PROJECT_ID,
                                 TARGET_DATASET_ID=TARGET_DATASET_ID,
                                 COMBINED=COMBINED,
                                 OBSERVATION_SOURCE_VALUE=col,
                                 PERSON_IDS=', '.join(person_ids))
        num_rows_result = query(q)
        q = obs_query_fmt.format(PROJECT_ID=PROJECT_ID,
                                 TARGET_DATASET_ID=TARGET_DATASET_ID,
                                 COMBINED=COMBINED,
                                 OBSERVATION_SOURCE_VALUE=col,
                                 PERSON_IDS=', '.join(person_ids))
        num_rows = num_rows_result.iloc[0]['n']
        result = dict(file_name=file_name,
                      observation_source_value=col,
                      q=q,
                      num_pids=len(person_ids),
                      num_rows=num_rows)
        results.append(result)
    return results
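# A usage sketch (hypothetical file path): summarize the pending deletes for
# each observation_source_value column in the CSV.
for summary in csv_file_updates('pids_to_delete.csv'):
    print(summary['observation_source_value'], summary['num_pids'],
          summary['num_rows'])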
 gc.concept_name AS gender,
 rc.concept_name AS race,
 ec.concept_name AS ethnicity,
 CASE WHEN e.person_id IS NULL THEN 'no' ELSE 'yes' END AS has_ehr_data
FROM `{latest.rdr}.person` r
LEFT JOIN `{latest.unioned}.person` e
  ON r.person_id = e.person_id
JOIN `{latest.vocabulary}.concept` gc
  ON r.gender_concept_id = gc.concept_id
JOIN `{latest.vocabulary}.concept` rc
  ON r.race_concept_id = rc.concept_id
JOIN `{latest.vocabulary}.concept` ec
  ON r.ethnicity_concept_id = ec.concept_id
ORDER BY age, gender, race
""".format(latest=DEFAULT_DATASETS.latest)
df = bq.query(q)

# ## Presence of EHR data by race

# +
df['race'] = df['race'].astype('category')
df['ethnicity'] = df['ethnicity'].astype('category')
df['has_ehr_data'] = df['has_ehr_data'].astype('category')

# exclude anomalous records where age<18 or age>100
f = df[(df.age > 17) & (df.age < 100)]
g = sns.factorplot('race',
                   data=f,
                   aspect=4,
                   size=3.25,
                   kind='count',