def _union_subqueries(table_name, hpo_ids, input_dataset_id, output_dataset_id):
    """
    Get list of subqueries (one for each HPO table found in the source) that comprise the load query

    :param table_name: name of a CDM table to load
    :param hpo_ids: list of HPOs to process
    :param input_dataset_id: identifies the source dataset
    :param output_dataset_id: identifies the output dataset
    :return: list of subqueries
    """
    result = []
    # Exclude subqueries that reference tables that are missing from source dataset
    all_table_ids = bq_utils.list_all_table_ids(input_dataset_id)
    for hpo_id in hpo_ids:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        if table_id not in all_table_ids:
            # Fixed log text: this builds the load (union) query, not the
            # mapping query; the old message was copy-pasted from
            # _mapping_subqueries and misattributed the exclusion.
            logging.info(
                f'Excluding table {table_id} from load query because it does not exist'
            )
            continue
        # fact_relationship is special-cased by its own subquery builder
        if table_name == common.FACT_RELATIONSHIP:
            subquery = fact_relationship_hpo_subquery(hpo_id, input_dataset_id,
                                                      output_dataset_id)
        else:
            subquery = table_hpo_subquery(table_name, hpo_id, input_dataset_id,
                                          output_dataset_id)
        result.append(subquery)
    return result
def _mapping_subqueries(table_name, hpo_ids, dataset_id, project_id):
    """
    Get list of subqueries (one for each HPO table found in the source) that
    comprise the ID mapping query

    :param table_name: name of a CDM table whose ID field must be remapped
    :param hpo_ids: list of HPOs to process
    :param dataset_id: identifies the source dataset
    :param project_id: identifies the GCP project
    :return: list of subqueries
    """
    hpo_offsets = get_hpo_offsets(hpo_ids)
    # Subqueries referencing tables absent from the source dataset are skipped
    existing_table_ids = bq_utils.list_all_table_ids(dataset_id)
    subqueries = []
    for hpo_id in hpo_ids:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        if table_id not in existing_table_ids:
            logging.info(
                f'Excluding table {table_id} from mapping query because it does not exist'
            )
            continue
        subqueries.append(f'''
        (SELECT '{table_id}' AS src_table_id,
        {table_name}_id AS src_{table_name}_id,
        {table_name}_id + {hpo_offsets[hpo_id]} as {table_name}_id
        FROM `{project_id}.{dataset_id}.{table_id}`)
        ''')
    return subqueries
def get_duplicate_counts_query(hpo_id):
    """
    Query to retrieve count of duplicate primary keys in domain tables for an HPO site

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    # NOTE(review): unlike the other builders, list_all_table_ids is called
    # without a dataset argument here — presumably it falls back to a default
    # dataset; confirm against bq_utils.
    existing_table_ids = bq_utils.list_all_table_ids()
    sub_queries = []
    for table_name in cdm.tables_to_map():
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        if table_id not in existing_table_ids:
            continue
        sub_queries.append(
            render_query(consts.DUPLICATE_IDS_SUBQUERY,
                         table_name=table_name,
                         table_id=table_id))
    return consts.DUPLICATE_IDS_WRAPPER.format(
        union_of_subqueries=consts.UNION_ALL.join(sub_queries))
def _mapping_subqueries(table_name, hpo_ids, dataset_id, project_id):
    """
    Get list of subqueries (one for each HPO table found in the source) that
    comprise the ID mapping query

    :param table_name: name of a CDM table whose ID field must be remapped
    :param hpo_ids: list of HPOs to process
    :param dataset_id: identifies the source dataset
    :param project_id: identifies the GCP project
    :return: list of subqueries
    """
    # Until dynamic queries are refactored to use either a single template or
    # dynamic SQL, the template is defined locally (rather than at the top of
    # the module) so it stays close to the code that renders it below
    subquery_tpl = common.JINJA_ENV.from_string('''
        (SELECT '{{table_id}}' AS src_table_id,
        {{table_name}}_id AS src_{{table_name}}_id,
        -- offset is added to the destination key only if add_hpo_offset == True --
        {{table_name}}_id
        {%- if add_hpo_offset %} + {{hpo_offset}}
        {%- endif %} AS {{table_name}}_id
        FROM `{{project_id}}.{{dataset_id}}.{{table_id}}`)
        ''')
    offsets_by_hpo = get_hpo_offsets(hpo_ids)
    # Subqueries referencing tables absent from the source dataset are skipped
    existing_table_ids = bq_utils.list_all_table_ids(dataset_id)
    subqueries = []
    for hpo_id in hpo_ids:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        if table_id not in existing_table_ids:
            logging.info(
                f'Excluding table {table_id} from mapping query because it does not exist'
            )
            continue
        # person keys are left unshifted; all other domain tables get the
        # per-HPO offset added to their destination id
        rendered = subquery_tpl.render(
            table_id=table_id,
            table_name=table_name,
            add_hpo_offset=table_name != common.PERSON,
            hpo_offset=offsets_by_hpo[hpo_id],
            project_id=project_id,
            dataset_id=dataset_id)
        subqueries.append(rendered)
    return subqueries
def copy_tables_to_new_dataset(project_id, dataset_id, snapshot_dataset_id):
    """
    List the tables in the dataset and copy each table to a new dataset.

    One batch copy query is issued per source table, then this call blocks
    until every copy job has been waited on.

    :param project_id: identifies the GCP project
    :param dataset_id: identifies the source dataset
    :param snapshot_dataset_id: identifies the destination (snapshot) dataset
    :raises BigQueryJobWaitError: if any copy jobs are still incomplete after
        waiting on them
    :return: None
    """
    copy_table_job_ids = []
    for table_id in list_all_table_ids(dataset_id):
        q = get_copy_table_query(project_id, dataset_id, table_id)
        results = query(q,
                        use_legacy_sql=False,
                        destination_table_id=table_id,
                        destination_dataset_id=snapshot_dataset_id,
                        batch=True)
        copy_table_job_ids.append(results['jobReference']['jobId'])
    incomplete_jobs = wait_on_jobs(copy_table_job_ids)
    # Idiomatic truthiness check (was: len(incomplete_jobs) > 0)
    if incomplete_jobs:
        raise BigQueryJobWaitError(incomplete_jobs)