Beispiel #1
0
def _union_subqueries(table_name, hpo_ids, input_dataset_id,
                      output_dataset_id):
    """
    Build the per-HPO subqueries that together make up the load query.

    One subquery is produced for each HPO whose table exists in the source
    dataset; HPOs without the table are skipped (and logged).

    :param table_name: name of a CDM table to load
    :param hpo_ids: list of HPOs to process
    :param input_dataset_id: identifies the source dataset
    :param output_dataset_id: identifies the output dataset
    :return: list of subqueries
    """
    subqueries = []
    # Snapshot of tables present in the source dataset, used to filter out
    # HPO tables that were never submitted
    existing_table_ids = bq_utils.list_all_table_ids(input_dataset_id)
    for hpo_id in hpo_ids:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        if table_id not in existing_table_ids:
            logging.info(
                f'Excluding table {table_id} from mapping query because it does not exist'
            )
            continue
        # fact_relationship is built by a dedicated subquery helper; every
        # other table goes through the generic one
        if table_name == common.FACT_RELATIONSHIP:
            subqueries.append(
                fact_relationship_hpo_subquery(hpo_id, input_dataset_id,
                                               output_dataset_id))
        else:
            subqueries.append(
                table_hpo_subquery(table_name, hpo_id, input_dataset_id,
                                   output_dataset_id))
    return subqueries
Beispiel #2
0
def _mapping_subqueries(table_name, hpo_ids, dataset_id, project_id):
    """
    Build the per-HPO subqueries that make up the ID mapping query.

    Each subquery maps the HPO-local ID to a globally unique ID by adding
    the HPO's numeric offset.

    :param table_name: name of a CDM table whose ID field must be remapped
    :param hpo_ids: list of HPOs to process
    :param dataset_id: identifies the source dataset
    :param project_id: identifies the GCP project
    :return: list of subqueries
    """
    subqueries = []
    # Per-HPO numeric offsets that make each site's IDs globally unique
    offsets = get_hpo_offsets(hpo_ids)

    # Skip HPO tables that are missing from the source dataset
    existing_table_ids = bq_utils.list_all_table_ids(dataset_id)
    for hpo_id in hpo_ids:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        if table_id not in existing_table_ids:
            logging.info(
                f'Excluding table {table_id} from mapping query because it does not exist'
            )
            continue
        subquery = f'''
                (SELECT '{table_id}' AS src_table_id,
                  {table_name}_id AS src_{table_name}_id,
                  {table_name}_id + {offsets[hpo_id]} as {table_name}_id
                  FROM `{project_id}.{dataset_id}.{table_id}`)
                '''
        subqueries.append(subquery)
    return subqueries
Beispiel #3
0
def get_duplicate_counts_query(hpo_id):
    """
    Build the query that counts duplicate primary keys in an HPO site's
    domain tables.

    Tables the site has not submitted are silently skipped.

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    subqueries = []
    existing_table_ids = bq_utils.list_all_table_ids()
    for table_name in cdm.tables_to_map():
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        # Only query tables that actually exist for this site
        if table_id not in existing_table_ids:
            continue
        subqueries.append(
            render_query(consts.DUPLICATE_IDS_SUBQUERY,
                         table_name=table_name,
                         table_id=table_id))
    return consts.DUPLICATE_IDS_WRAPPER.format(
        union_of_subqueries=consts.UNION_ALL.join(subqueries))
Beispiel #4
0
def _mapping_subqueries(table_name, hpo_ids, dataset_id, project_id):
    """
    Get list of subqueries (one for each HPO table found in the source) that comprise the ID mapping query

    :param table_name: name of a CDM table whose ID field must be remapped
    :param hpo_ids: list of HPOs to process
    :param dataset_id: identifies the source dataset
    :param project_id: identifies the GCP project
    :return: list of subqueries
    """
    # Until dynamic queries are refactored to use either a single template or dynamic SQL,
    # defining template locally (rather than top of module) so it is closer to code
    # that references it below
    # NOTE: the template is whitespace-sensitive ({%- ... %} trims whitespace);
    # edit with care so the rendered SQL stays stable
    hpo_subquery_tpl = common.JINJA_ENV.from_string('''
    (SELECT '{{table_id}}' AS src_table_id,
      {{table_name}}_id AS src_{{table_name}}_id,
      -- offset is added to the destination key only if add_hpo_offset == True --
      {{table_name}}_id 
        {%- if add_hpo_offset %} + {{hpo_offset}} {%- endif %} AS {{table_name}}_id
      FROM `{{project_id}}.{{dataset_id}}.{{table_id}}`)
    ''')
    result = []
    # Per-HPO numeric offsets used to make each site's IDs globally unique
    hpo_unique_identifiers = get_hpo_offsets(hpo_ids)

    # Exclude subqueries that reference tables that are missing from source dataset
    all_table_ids = bq_utils.list_all_table_ids(dataset_id)
    for hpo_id in hpo_ids:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        hpo_offset = hpo_unique_identifiers[hpo_id]
        if table_id in all_table_ids:
            # person IDs are left as-is; every other table gets the offset
            add_hpo_offset = table_name != common.PERSON
            subquery = hpo_subquery_tpl.render(table_id=table_id,
                                               table_name=table_name,
                                               add_hpo_offset=add_hpo_offset,
                                               hpo_offset=hpo_offset,
                                               project_id=project_id,
                                               dataset_id=dataset_id)
            result.append(subquery)
        else:
            logging.info(
                f'Excluding table {table_id} from mapping query because it does not exist'
            )
    return result
Beispiel #5
0
def copy_tables_to_new_dataset(project_id, dataset_id, snapshot_dataset_id):
    """
    Copy every table of a dataset into a snapshot dataset via batch query jobs.

    Jobs are submitted for all tables first, then awaited together.

    :param project_id: identifies the GCP project
    :param dataset_id: identifies the source dataset
    :param snapshot_dataset_id: identifies the destination dataset
    :return: None
    :raises BigQueryJobWaitError: if any copy job did not complete
    """
    job_ids = []
    for table_id in list_all_table_ids(dataset_id):
        copy_query = get_copy_table_query(project_id, dataset_id, table_id)
        job_result = query(copy_query,
                           use_legacy_sql=False,
                           destination_table_id=table_id,
                           destination_dataset_id=snapshot_dataset_id,
                           batch=True)
        job_ids.append(job_result['jobReference']['jobId'])
    # Block until all copy jobs finish; any stragglers are an error
    incomplete_jobs = wait_on_jobs(job_ids)
    if incomplete_jobs:
        raise BigQueryJobWaitError(incomplete_jobs)