Python build_combined_schema Exemples, common_etl.support.build_combined_schema Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : build_open_somatic_mut_bq_table.py Projet : oshahzada98/NextGenETL

def main(args):
    """Run the workflow steps that are enabled in a YAML configuration file.

    The configuration is loaded with load_config(); each stage below only
    runs when its name appears in the loaded ``steps`` collection, so a
    partial run can be requested by trimming that list in the config.

    Args:
        args: Command-line style list. ``args[0]`` is the program name (only
            used in the usage message) and ``args[1]`` is the path of the
            YAML configuration file.

    Returns:
        None on success and on most failures (a message is printed before an
        early return). ``False`` is returned when the 'replace_schema_tags'
        step fails; this is kept as-is for backward compatibility.

    Raises:
        Exception: if a schema tag value uses a '~' prefix with a suffix
            other than 'programs' or 'builds'.
    """
    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, filters, bq_filters, steps, callers, update_schema_tables, schema_tags = load_config(
            yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    # Which table are we building?
    release = "".join(["r", str(params['RELEASE'])])
    use_schema = params['VER_SCHEMA_FILE_NAME']
    if 'current' in steps:
        print('This workflow will update the schema for the "current" table')
        release = 'current'
        use_schema = params['SCHEMA_FILE_NAME']

    # Create table names. The trailing '{}' in the draft and publication
    # names is filled in later with a release tag or 'current'.
    concat_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], 'concat'])
    barcode_table = '_'.join(
        [params['PROGRAM'], params['DATA_TYPE'], 'barcode'])
    draft_table = '_'.join(
        [params['PROGRAM'], params['DATA_TYPE'], params['BUILD'], 'gdc', '{}'])
    publication_table = '_'.join(
        [params['DATA_TYPE'], params['BUILD'], 'gdc', '{}'])
    manifest_table = '_'.join(
        [params['PROGRAM'], params['DATA_TYPE'], 'manifest'])

    # NOTE(review): this only warns; the run continues and falls back to the
    # release tag for the metadata tables. Confirm whether it should abort.
    if params['RELEASE'] < 21 and 'METADATA_REL' not in params:
        print("The input release is before new metadata process, "
              "please specify which release of the metadata to use.")

    metadata_rel = "".join(["r", str(params['METADATA_REL'])
                            ]) if 'METADATA_REL' in params else release

    if 'build_manifest_from_filters' in steps:

        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
        manifest_success = get_the_bq_manifest(
            params['FILE_TABLE'].format(metadata_rel), bq_filters, max_files,
            params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
            manifest_table, params['WORKING_BUCKET'],
            params['BUCKET_MANIFEST_TSV'], manifest_file,
            params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # Best practice is to clear out the directory where the files are going. Don't want anything left over:
    #

    if 'clear_target_directory' in steps:
        create_clean_target(local_files_dir)

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, you can do it using IndexD calls on a manifest file, OR using BQ as long as you have
    # built the manifest using BQ (that route uses the BQ Manifest table that was created).
    #

    if 'build_pull_list' in steps:

        build_pull_list_with_bq(
            "{}.{}.{}".format(params['WORKING_PROJECT'],
                              params['SCRATCH_DATASET'], manifest_table),
            params['INDEXD_BQ_TABLE'].format(metadata_rel),
            params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
            "_".join([params['PROGRAM'], params['DATA_TYPE'], 'pull',
                      'list']), params['WORKING_BUCKET'],
            params['BUCKET_PULL_LIST'], local_pull_list, params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets, not "downloading". Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        pull_from_buckets(pull_list, local_files_dir)

    #
    # Traverse the tree of downloaded files and create a flat list of all files:
    #

    if 'build_traversal_list' in steps:
        all_files = build_file_list(local_files_dir)
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    if 'concat_all_files' in steps:
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
            concat_all_files(all_files, one_big_tsv, params['PROGRAM'],
                             callers, params['FIELDS_TO_FIX'])
    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'],
                                   params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    for table in update_schema_tables:
        if table == 'current':
            use_schema = params['SCHEMA_FILE_NAME']
            schema_release = 'current'
        else:
            use_schema = params['VER_SCHEMA_FILE_NAME']
            schema_release = release

        if 'process_git_schemas' in steps:
            print('process_git_schema')
            # Where do we dump the schema git repository?
            schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                            params['RAW_SCHEMA_DIR'],
                                            use_schema)
            full_file_prefix = "{}/{}".format(
                params['PROX_DESC_PREFIX'], draft_table.format(schema_release))
            # Write out the details
            success = generate_table_detail_files(schema_file,
                                                  full_file_prefix)
            if not success:
                print("process_git_schemas failed")
                return

        # Customize generic schema to this data program:

        if 'replace_schema_tags' in steps:
            print('replace_schema_tags')
            pn = params['PROGRAM']
            dataset_tuple = (pn, pn.replace(".", "_"))
            tag_map_list = []
            for tag_pair in schema_tags:
                for tag in tag_pair:
                    val = tag_pair[tag]
                    use_pair = {}
                    tag_map_list.append(use_pair)
                    # Values starting with '~-', '~lc-' or '~lcbqs-' are
                    # placeholders; anything else is used verbatim.
                    if val.startswith(('~-', '~lc-', '~lcbqs-')):
                        chunks = val.split('-', 1)
                        if chunks[1] == 'programs':
                            if val.startswith('~lcbqs-'):
                                rep_val = dataset_tuple[1].lower(
                                )  # can't have "." in a tag...
                            else:
                                rep_val = dataset_tuple[0]
                        elif chunks[1] == 'builds':
                            rep_val = params['BUILD']
                        else:
                            # Same exception type as before, but now it says
                            # which tag value could not be resolved.
                            raise Exception(
                                "Unrecognized schema tag replacement "
                                "'{}' for tag '{}'".format(val, tag))
                        if val.startswith('~lc-'):
                            rep_val = rep_val.lower()
                        use_pair[tag] = rep_val
                    else:
                        use_pair[tag] = val
            full_file_prefix = "{}/{}".format(
                params['PROX_DESC_PREFIX'], draft_table.format(schema_release))

            # Write out the details
            success = customize_labels_and_desc(full_file_prefix, tag_map_list)

            if not success:
                print("replace_schema_tags failed")
                return False

        if 'analyze_the_schema' in steps:
            print('analyze_the_schema')
            typing_tups = build_schema(one_big_tsv,
                                       params['SCHEMA_SAMPLE_SKIPS'])
            full_file_prefix = "{}/{}".format(
                params['PROX_DESC_PREFIX'], draft_table.format(schema_release))
            schema_dict_loc = "{}_schema.json".format(full_file_prefix)
            build_combined_schema(None, schema_dict_loc, typing_tups,
                                  hold_schema_list, hold_schema_dict)

    bucket_target_blob = '{}/{}-{}-{}.tsv'.format(params['WORKING_BUCKET_DIR'],
                                                  params['DATE'],
                                                  params['PROGRAM'],
                                                  params['DATA_TYPE'])

    if 'upload_to_bucket' in steps:
        print('upload_to_bucket')
        upload_to_bucket(params['WORKING_BUCKET'], bucket_target_blob,
                         one_big_tsv)

    #
    # Create the BQ table from the TSV:
    #

    if 'create_bq_from_tsv' in steps:
        print('create_bq_from_tsv')
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                             bucket_target_blob)
        with open(hold_schema_list, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['SCRATCH_DATASET'],
                  concat_table, params['BQ_AS_BATCH'])

    #
    # Need to merge in aliquot and sample barcodes from other tables:
    #

    if 'collect_barcodes' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['SCRATCH_DATASET'], concat_table)

        # Releases before 25 are pinned to the '25' case table.
        if params['RELEASE'] < 25:
            case_table = params['CASE_TABLE'].format('25')
        else:
            case_table = params['CASE_TABLE'].format(release)

        if params['PROGRAM'] == 'TCGA':
            success = attach_aliquot_ids(skel_table,
                                         params['FILE_TABLE'].format(release),
                                         params['SCRATCH_DATASET'],
                                         '_'.join([barcode_table, 'pre']),
                                         params['BQ_AS_BATCH'])
            if not success:
                print("attach_aliquot_ids job failed")
                return

            step_1_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                             params['SCRATCH_DATASET'],
                                             '_'.join([barcode_table, 'pre']))
        else:
            step_1_table = skel_table

        success = attach_barcodes(step_1_table,
                                  params['ALIQUOT_TABLE'].format(release),
                                  params['SCRATCH_DATASET'], barcode_table,
                                  params['BQ_AS_BATCH'], params['PROGRAM'],
                                  case_table)
        if not success:
            print("attach_barcodes job failed")
            return

    #
    # Merge the barcode info into the final table we are building:
    #

    if 'create_final_table' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['SCRATCH_DATASET'], concat_table)
        barcodes_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                           params['SCRATCH_DATASET'],
                                           barcode_table)
        success = final_merge(skel_table, barcodes_table,
                              params['SCRATCH_DATASET'],
                              draft_table.format(release),
                              params['BQ_AS_BATCH'], params['PROGRAM'])
        if not success:
            print("Join job failed")
            return

    #
    # Create second table
    #

    if 'create_current_table' in steps:
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['SCRATCH_DATASET'],
                                         draft_table.format(release))
        current_dest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['SCRATCH_DATASET'],
                                         draft_table.format('current'))

        success = publish_table(source_table, current_dest)

        if not success:
            print("create current table failed")
            return

    #
    # The derived table we generate has no field descriptions. Add them from the github json files:
    #
    for table in update_schema_tables:
        schema_release = 'current' if table == 'current' else release
        if 'update_final_schema' in steps:
            success = update_schema(params['SCRATCH_DATASET'],
                                    draft_table.format(schema_release),
                                    hold_schema_dict)
            if not success:
                print("Schema update failed")
                return

        #
        # Add the table description:
        #

        if 'add_table_description' in steps:
            print('update_table_description')
            full_file_prefix = "{}/{}".format(
                params['PROX_DESC_PREFIX'], draft_table.format(schema_release))
            success = install_labels_and_desc(
                params['SCRATCH_DATASET'], draft_table.format(schema_release),
                full_file_prefix)
            if not success:
                print("update_table_description failed")
                return

    #
    # compare and remove old current table
    #

    # compare the two tables
    if 'compare_remove_old_current' in steps:
        previous_release = "".join(["r", str(params['PREVIOUS_RELEASE'])])
        old_current_table = '{}.{}.{}'.format(
            params['PUBLICATION_PROJECT'], params['PUBLICATION_DATASET'],
            publication_table.format('current'))
        previous_ver_table = '{}.{}.{}'.format(
            params['PUBLICATION_PROJECT'],
            "_".join([params['PUBLICATION_DATASET'], 'versioned']),
            publication_table.format(previous_release))
        table_temp = '{}.{}.{}'.format(
            params['WORKING_PROJECT'], params['SCRATCH_DATASET'], "_".join([
                params['PROGRAM'],
                publication_table.format(previous_release), 'backup'
            ]))

        print('Compare {} to {}'.format(old_current_table, previous_ver_table))

        compare = compare_two_tables(old_current_table, previous_ver_table,
                                     params['BQ_AS_BATCH'])

        # Check for failure BEFORE touching the result: reading total_rows
        # from a falsy result would raise instead of reporting the failure.
        if not compare:
            print('compare_tables failed')
            return

        num_rows = compare.total_rows

        if num_rows == 0:
            print('the tables are the same')
            # move old table to a temporary location
            print('Move old table to temp location')
            table_moved = publish_table(old_current_table, table_temp)

            if not table_moved:
                print('Old Table was not moved and will not be deleted')
            else:
                # remove old table
                print('Deleting old table: {}'.format(old_current_table))
                delete_table = delete_table_bq_job(
                    params['PUBLICATION_DATASET'],
                    publication_table.format('current'))
                if not delete_table:
                    print('delete table failed')
                    return
        else:
            print('the tables are NOT the same and differ by {} rows'.format(
                num_rows))

    #
    # publish table:
    #

    if 'publish' in steps:

        tables = ['versioned', 'current']

        for table in tables:
            if table == 'versioned':
                print(table)
                source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                                 params['SCRATCH_DATASET'],
                                                 draft_table.format(release))
                publication_dest = '{}.{}.{}'.format(
                    params['PUBLICATION_PROJECT'],
                    "_".join([params['PUBLICATION_DATASET'], 'versioned']),
                    publication_table.format(release))
            elif table == 'current':
                print(table)
                source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                                 params['SCRATCH_DATASET'],
                                                 draft_table.format('current'))
                publication_dest = '{}.{}.{}'.format(
                    params['PUBLICATION_PROJECT'],
                    params['PUBLICATION_DATASET'],
                    publication_table.format('current'))
            success = publish_table(source_table, publication_dest)

            # Checked per table: testing only after the loop would let a
            # failed 'versioned' publish be masked by a successful 'current'.
            if not success:
                print("publish table failed")
                return

    #
    # Update previous versioned table with archived tag
    #

    if 'update_status_tag' in steps:
        print('Update previous table')

        success = update_status_tag(
            "_".join([params['PUBLICATION_DATASET'], 'versioned']),
            publication_table.format("".join(
                ["r", str(params['PREVIOUS_RELEASE'])])), 'archived')

        if not success:
            print("update status tag table failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_tables = [
            concat_table, barcode_table,
            draft_table.format('current'),
            draft_table.format(release), manifest_table
        ]
        for table in dump_tables:
            delete_table_bq_job(params['SCRATCH_DATASET'], table)
    #
    # Done!
    #

    print('job completed')

    if 'archive' in steps:

        print('archive files from VM')
        archive_file_prefix = "{}_{}".format(date.today(),
                                             params['PUBLICATION_DATASET'])
        if params['ARCHIVE_YAML']:
            # Take the file name from the config path. The regex only matches
            # "<dir>/<word chars>.yaml", so fall back to the last path
            # component instead of failing on e.g. "cfg.yaml" or "my-cfg.yaml".
            yaml_match = re.search(r"\/(\w*\.yaml)$", args[1])
            if yaml_match:
                yaml_name = yaml_match.group(1)
            else:
                yaml_name = args[1].rsplit('/', 1)[-1]
            archive_yaml = "{}/{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'],
                                                params['ARCHIVE_CONFIG'],
                                                archive_file_prefix,
                                                yaml_name)
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_yaml, args[1])
        # The bucket object names keep the configured (relative) names, but
        # the local files are read from the home-anchored paths that the
        # earlier steps wrote to.
        archive_pull_file = "{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'],
                                              archive_file_prefix,
                                              params['LOCAL_PULL_LIST'])
        upload_to_bucket(params['ARCHIVE_BUCKET'], archive_pull_file,
                         local_pull_list)
        archive_manifest_file = "{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'],
                                                  archive_file_prefix,
                                                  params['MANIFEST_FILE'])
        upload_to_bucket(params['ARCHIVE_BUCKET'], archive_manifest_file,
                         manifest_file)

Exemple #2

0

Afficher le fichier

Fichier : build_mirna_expr_bq_table.py Projet : CancerDataAggregator/etl

def main(args):
    """Run the workflow steps that are enabled in a YAML configuration file.

    The configuration is loaded with load_config(); every stage below runs
    only when its name is present in the loaded ``steps`` collection. A
    failed stage prints a message and ends the run early.

    Args:
        args: Command-line style list. ``args[0]`` is the program name (only
            used in the usage message) and ``args[1]`` is the path of the
            YAML configuration file.

    Returns:
        None in every case; failures are reported via print().
    """

    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, bq_filters, steps, extra_cols = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    # Schema that describes table columns:

    AUGMENTED_SCHEMA_FILE = "SchemaFiles/mirna_augmented_schema_list.json"

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    #
    # Best practice is to clear out the directory where the files are going. Don't want anything left over.
    # Also creates the destination directory
    #

    if 'clear_target_directory' in steps:
        create_clean_target(local_files_dir)

    #
    # Use the filter set to get a manifest. Note that if a pull list is
    # provided, these steps can be omitted:
    #
    
    if 'build_manifest_from_filters' in steps:
        # MAX_FILES is optional in the config; None is passed when it is absent.
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
        manifest_success = get_the_bq_manifest(params['FILE_TABLE'], bq_filters, max_files,
                                               params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                               params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
                                               params['BUCKET_MANIFEST_TSV'], manifest_file,
                                               params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # If you have already created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step.
    #
    
    if 'build_pull_list' in steps:
        # Fully qualified "project.dataset.table" name of the manifest table.
        full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                          params['TARGET_DATASET'],
                                          params['BQ_MANIFEST_TABLE'])

        build_pull_list_with_bq(full_manifest, params['INDEXD_BQ_TABLE'],
                                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                params['BQ_PULL_LIST_TABLE'],
                                params['WORKING_BUCKET'],
                                params['BUCKET_PULL_LIST'],
                                local_pull_list, params['BQ_AS_BATCH'])
 
    #
    # Now hitting GDC cloud buckets, not "downloading". Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:       
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        # NOTE(review): the meaning of the constructor argument 10 is not
        # visible here (presumably a worker/thread count) -- confirm.
        bp = BucketPuller(10)
        bp.pull_from_buckets(pull_list, local_files_dir)

    #
    # Traverse the tree of downloaded files and create a flat list of all files:
    #
    
    if 'build_traversal_list' in steps:
        all_files = build_file_list(local_files_dir)
        # One path per line, so the next step can read it back with splitlines().
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line)) 
   
    #
    # Take all the files and make one BIG TSV file to upload:
    #
    
    if 'concat_all_files' in steps:       
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()  
        # NOTE(review): file_info is not defined inside this function; it is
        # presumably a module-level name defined elsewhere -- confirm.
        concat_all_files(all_files, one_big_tsv,
                         params['PROGRAM_PREFIX'], extra_cols, file_info, None)

    #
    # For the legacy table, the descriptions had lots of analysis tidbits. Very nice, but hard to maintain.
    # We just use hardwired schema descriptions now, most directly pulled from the GDC website:
    #

    if 'build_the_schema' in steps:
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, AUGMENTED_SCHEMA_FILE,
                              typing_tups, hold_schema_list, hold_schema_dict)

    #
    # Upload the giant TSV into a cloud bucket:
    #
    
    if 'upload_to_bucket' in steps:
        upload_to_bucket(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'], one_big_tsv)

    #
    # Create the BQ table from the TSV:
    #
        
    if 'create_bq_from_tsv' in steps:
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'])
        # The typed schema is read back from the JSON file at hold_schema_list.
        with open(hold_schema_list, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'], params['SKELETON_TABLE'], params['BQ_AS_BATCH'])

    #
    # Need to merge in aliquot and sample barcodes from other tables:
    #
           
    if 'collect_barcodes' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], 
                                       params['TARGET_DATASET'], 
                                       params['SKELETON_TABLE'])
        
        # Two-stage process: the table written by attach_aliquot_ids
        # (BARCODE_STEP_1_TABLE) is the input to attach_barcodes.
        success = attach_aliquot_ids(skel_table, params['FILE_TABLE'], 
                                     params['TARGET_DATASET'], 
                                     params['BARCODE_STEP_1_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("attach_aliquot_ids job failed")
            return

        step_1_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], 
                                         params['TARGET_DATASET'], 
                                         params['BARCODE_STEP_1_TABLE'])
        success = attach_barcodes(step_1_table, params['ALIQUOT_TABLE'], 
                                  params['TARGET_DATASET'], params['BARCODE_STEP_2_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("attach_barcodes job failed")
            return
   
    #
    # Merge the barcode info into the final table we are building:
    #

    if 'create_final_table' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], 
                                       params['TARGET_DATASET'], 
                                       params['SKELETON_TABLE'])
        barcodes_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], 
                                           params['TARGET_DATASET'], 
                                           params['BARCODE_STEP_2_TABLE'])        
        success = final_merge(skel_table, barcodes_table, 
                              params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            return

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            # Start from a clean directory, clone the repo, then check out the configured branch.
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'], params['RAW_SCHEMA_DIR'], params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        # Same "<prefix>_schema.json" path that update_field_descriptions reads below.
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        build_combined_schema(None, schema_dict_loc,
                                typing_tups, hold_schema_list, hold_schema_dict)

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        # Re-key the list of field entries by field name, keeping only the description.
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}

        success = update_schema_with_dict(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    #
    # Clear out working temp tables:
    #
    
    if 'dump_working_tables' in steps:   
        # These are config keys; the actual table names are looked up in params.
        dump_table_tags = ['SKELETON_TABLE', 'BARCODE_STEP_1_TABLE', 'BARCODE_STEP_2_TABLE', 
                           'BQ_MANIFEST_TABLE', 'BQ_PULL_LIST_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        # NOTE(review): the result of delete_table_bq_job is not checked here.
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    #
    # publish table:
    #

    if 'publish' in steps:
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'], params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])
        success = publish_table(source_table, publication_dest)
        if not success:
            print("publish table failed")
            return

    print('job completed')

Exemple #3

0

Afficher le fichier

Fichier : build_dna_methylation_bq_table.py Projet : oshahzada98/NextGenETL

def main():
    """
    Drive the DNA methylation BigQuery table build.

    Configuration is read through load_config(yaml_config). Each stage below is
    gated on its name being present in the loaded steps; a stage that reports
    failure prints a message and stops the run.
    """

    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    print('job started')

    # Everything is driven by the configuration, so load it before anything else.
    (params, filters, bq_filters, steps,
     retain_cols, extra_cols, retain_platform_ref_fields) = load_config(yaml_config)

    if params is None:
        print("Bad YAML load")
        return

    def working_table(table_tag):
        # "project.dataset.table" name for a table whose name is stored under table_tag.
        return '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                 params['TARGET_DATASET'],
                                 params[table_tag])

    def read_traversal_list():
        # The flat file list written by the 'build_traversal_list' stage, one path per line.
        with open(params['FILE_TRAVERSAL_LIST'], mode='r') as listing:
            return listing.read().splitlines()

    # Stage: build a manifest, either through the GDC API or through BQ. Not needed
    # when a pull list is supplied directly.
    if 'build_manifest_from_filters' in steps:
        max_files = None
        if 'MAX_FILES' in params:
            max_files = params['MAX_FILES']

        if params['USE_GDC_API_FOR_MANIFEST']:
            manifest_success = get_the_manifest(build_manifest_filter(filters),
                                                params['API_URL'], params['MANIFEST_FILE'], max_files)
        else:
            manifest_success = get_the_bq_manifest(params['FILE_TABLE'], bq_filters, max_files,
                                                   params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                                   params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
                                                   params['BUCKET_MANIFEST_TSV'], params['MANIFEST_FILE'],
                                                   params['BQ_AS_BATCH'])

        if not manifest_success:
            print("Failure generating manifest")
            return

    # Stage: start from an empty download directory so nothing stale is left over.
    if 'clear_target_directory' in steps:
        create_clean_target(params['LOCAL_FILES_DIR'])

    # Stage: produce the "pull list" of gs:// URLs. An existing list can simply be
    # placed at 'LOCAL_PULL_LIST' and this stage skipped. Otherwise it is built either
    # with IndexD calls on the manifest file, or with BQ (which relies on the BQ
    # manifest table created by the manifest stage).
    if 'build_pull_list' in steps:
        if not params['USE_INDEXD_FOR_PULL']:
            manifest_table = working_table('BQ_MANIFEST_TABLE')
            build_pull_list_with_bq(manifest_table, params['INDEXD_BQ_TABLE'],
                                    params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                    params['BQ_PULL_LIST_TABLE'], params['WORKING_BUCKET'],
                                    params['BUCKET_PULL_LIST'], params['LOCAL_PULL_LIST'],
                                    params['BQ_AS_BATCH'])
        else:
            build_pull_list_with_indexd(params['MANIFEST_FILE'], params['INDEXD_IDS_PER_CALL'],
                                        params['INDEXD_URL'], params['LOCAL_PULL_LIST'])

    # Stage: fetch every file named in the pull list from the GDC cloud buckets.
    if 'download_from_gdc' in steps:
        with open(params['LOCAL_PULL_LIST'], mode='r') as pull_fh:
            gs_urls = pull_fh.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(gs_urls))
        puller = BucketPuller(10)
        puller.pull_from_buckets(gs_urls, params['LOCAL_FILES_DIR'])

    # Stage: walk the download tree and record a flat list of the files found.
    if 'build_traversal_list' in steps:
        found_files = build_file_list(params['LOCAL_FILES_DIR'])
        with open(params['FILE_TRAVERSAL_LIST'], mode='w') as out_fh:
            out_fh.writelines("{}\n".format(path) for path in found_files)

    # Stage: glue all the downloaded files into a single large TSV for upload.
    print("fix me have to toss out NA rows!")
    if 'concat_all_files' in steps:
        data_files = read_traversal_list()
        concat_all_files_selected_cols(data_files, params['ONE_BIG_TSV'], params['PROGRAM_PREFIX'],
                                       retain_cols, extra_cols, file_info, None, "Beta_value", "NA")

    # Stage: build the platform reference table from the same set of files.
    if 'build_plat_ref' in steps:
        ref_files = read_traversal_list()
        concat_all_files_selected_cols(ref_files, params['ONE_BIG_REF_TSV'], params['PROGRAM_PREFIX'],
                                       retain_platform_ref_fields, [], file_info, None, None, None)
        set_from_file(params['ONE_BIG_REF_TSV'], params['ONE_BIG_DISTINCT_REF_TSV'])

    # Stage: derive the schema. Descriptions are hardwired these days (mostly taken
    # from the GDC website) instead of the hard-to-maintain legacy analysis notes.
    if 'build_the_schema' in steps:
        column_types = build_schema(params['ONE_BIG_TSV'], params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, params['AUGMENTED_SCHEMA_FILE'], column_types,
                              params['HOLD_SCHEMA_LIST'], params['HOLD_SCHEMA_DICT'])

    # Stage: push the big TSV into the working cloud bucket.
    if 'upload_to_bucket' in steps:
        upload_to_bucket(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'], params['ONE_BIG_TSV'])

    # Stage: load the bucket TSV into a BQ table using the held schema.
    if 'create_bq_from_tsv' in steps:
        tsv_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'])
        with open(params['HOLD_SCHEMA_LIST'], mode='r') as schema_fh:
            schema_fields = json_loads(schema_fh.read())
        csv_to_bq(schema_fields, tsv_url, params['TARGET_DATASET'],
                  params['SKELETON_TABLE'], params['BQ_AS_BATCH'])

    # Stage: bring in aliquot and sample barcodes from other tables, in two passes.
    if 'collect_barcodes' in steps:
        skeleton = working_table('SKELETON_TABLE')
        if not attach_aliquot_ids(skeleton, params['FILE_TABLE'], params['TARGET_DATASET'],
                                  params['BARCODE_STEP_1_TABLE'], params['BQ_AS_BATCH']):
            print("attach_aliquot_ids job failed")
            return

        first_pass = working_table('BARCODE_STEP_1_TABLE')
        if not attach_barcodes(first_pass, params['ALIQUOT_TABLE'], params['TARGET_DATASET'],
                               params['BARCODE_STEP_2_TABLE'], params['BQ_AS_BATCH']):
            print("attach_barcodes job failed")
            return

    # Stage: join the barcode information onto the skeleton to make the final table.
    if 'create_final_table' in steps:
        skeleton = working_table('SKELETON_TABLE')
        barcodes = working_table('BARCODE_STEP_2_TABLE')
        if not final_merge(skeleton, barcodes, params['TARGET_DATASET'],
                           params['FINAL_TARGET_TABLE'], params['BQ_AS_BATCH']):
            print("Join job failed")
            return

    # Stage: the derived table has no field descriptions yet; install them.
    if 'update_final_schema' in steps:
        if not update_schema(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'],
                             params['HOLD_SCHEMA_DICT']):
            print("Schema update failed")
            return

    # Stage: set the table-level description.
    if 'add_table_description' in steps:
        update_description(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'],
                           params['TABLE_DESCRIPTION'])

    # Stage: delete the intermediate working tables. All names are resolved before
    # the first delete is issued.
    if 'dump_working_tables' in steps:
        temp_tags = ('SKELETON_TABLE', 'BARCODE_STEP_1_TABLE', 'BARCODE_STEP_2_TABLE',
                     'BQ_MANIFEST_TABLE', 'BQ_PULL_LIST_TABLE')
        temp_tables = [params[tag] for tag in temp_tags]
        for temp_table in temp_tables:
            delete_table_bq_job(params['TARGET_DATASET'], temp_table)

    # All done.
    print('job completed')

Exemple #4

0

Afficher le fichier

Fichier : build_open_somatic_mut_bq_table.py Projet : CancerDataAggregator/etl

def main(args):
    """
    Run the open somatic mutation table workflow described by a YAML configuration.

    :param args: argv-style list; args[0] is the program name (used in the usage
                 message) and args[1] is the path to the YAML configuration file.
    :return: None. Progress and failures are reported with print(); a failed stage
             ends the run early.

    Each stage only runs when its name appears in the configured steps. When the
    merged output is requested, the list of programs is needed by later stages; if
    the 'build_traversal_list' stage is not part of this run, that list is rebuilt
    from the traversal list file left by an earlier run.
    """

    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, filters, bq_filters, steps, extra_cols, key_fields, callers = load_config(
            yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])
    hold_scraped_dict = "{}/{}".format(home, params['HOLD_SCRAPED_DICT'])

    AUGMENTED_SCHEMA_FILE = "SchemaFiles/augmented_schema_list.json"

    #
    # Empirical evidence suggests this workflow is going to be very memory hungry if you are doing
    # merging, and requires at least 26 GB to be safe. Confirm that before starting!
    #

    do_merging = params['DO_MERGED_OUTPUT']
    if do_merging:
        # Use a context manager so the /proc/meminfo handle is closed deterministically.
        with open('/proc/meminfo', mode='r') as meminfo_file:
            meminfo = dict((i.split()[0].rstrip(':'), int(i.split()[1]))
                           for i in meminfo_file)
        mem_kib = meminfo['MemTotal']
        print("Machine memory: {}".format(mem_kib))
        if int(mem_kib) < 26000000:
            print("Job requires at least 26 GB physical memory to complete")
            return

    #
    # Next, use the filter set to get a manifest from GDC using their API. Note that if a pull list is
    # provided, these steps can be omitted:
    #

    if 'build_manifest_from_filters' in steps:
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
        manifest_success = get_the_bq_manifest(
            params['FILE_TABLE'], bq_filters, max_files,
            params['WORKING_PROJECT'], params['TARGET_DATASET'],
            params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
            params['BUCKET_MANIFEST_TSV'], manifest_file,
            params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # Best practice is to clear out the directory where the files are going. Don't want anything left over:
    #

    if 'clear_target_directory' in steps:
        create_clean_target(local_files_dir)

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, you can do it using IndexD calls on a manifest file, OR using BQ as long as you have
    # built the manifest using BQ (that route uses the BQ Manifest table that was created).
    #

    if 'build_pull_list' in steps:
        full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                          params['TARGET_DATASET'],
                                          params['BQ_MANIFEST_TABLE'])

        build_pull_list_with_bq(
            full_manifest, params['INDEXD_BQ_TABLE'],
            params['WORKING_PROJECT'], params['TARGET_DATASET'],
            params['BQ_PULL_LIST_TABLE'], params['WORKING_BUCKET'],
            params['BUCKET_PULL_LIST'], local_pull_list, params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets, not "downloading". Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        pull_from_buckets(pull_list, local_files_dir)

    #
    # Traverse the tree of downloaded files and create a flat list of all files:
    #

    # Stays None unless the traversal stage runs; rebuilt below when merging needs it.
    program_list = None

    if 'build_traversal_list' in steps:
        all_files = build_file_list(local_files_dir)
        program_list = build_program_list(all_files)
        if not check_caller_list(all_files, callers):
            print("Unexpected caller mismatch! Expecting {}".format(callers))
            return
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    #
    # We can create either a table that merges identical mutations from the different callers into
    # one row, or keep them separate:
    #

    if do_merging:
        if program_list is None:
            # The traversal stage was skipped in this run (e.g. a resumed job). Recover the
            # program list from the traversal file written by the earlier run instead of
            # failing with a NameError.
            # NOTE(review): assumes build_program_list depends only on the file list - confirm.
            with open(file_traversal_list, mode='r') as traversal_list_file:
                program_list = build_program_list(traversal_list_file.read().splitlines())

        do_debug = params['DO_DEBUG_LOGGING']
        target_count = int(params['EXPECTED_COLUMNS'])
        for program in program_list:
            print("Look at MAFS for {}".format(program))
            if 'run_maf_reader' in steps:
                with open(file_traversal_list,
                          mode='r') as traversal_list_file:
                    all_files = traversal_list_file.read().splitlines()
                print("Start reading MAFS for {}".format(program))
                mut_calls, hdr_pick = read_MAFs(program, all_files,
                                                params['PROGRAM_PREFIX'],
                                                extra_cols, target_count,
                                                do_debug, key_fields,
                                                params['FIRST_MAF_COL'],
                                                file_info)
                print("Finish reading MAFS for {}".format(program))

            if 'run_maf_writer' in steps:
                if 'run_maf_reader' not in steps:
                    # The writer consumes the in-memory result of the reader; without it
                    # there is nothing to write (previously an unhandled NameError).
                    print("run_maf_writer requires run_maf_reader in the same run")
                    return
                print("Start writing MAFS for {}".format(program))
                hist_count = write_MAFs(program, mut_calls, hdr_pick, callers,
                                        do_debug)
                # Print only the non-empty entries of the returned counts, by index.
                for ii in range(len(hist_count)):
                    if hist_count[ii] > 0:
                        print(" %6d  %9d " % (ii, hist_count[ii]))
                print("Finish writing MAFS for {}".format(program))

    #
    # Take all the files and make one BIG TSV file to upload:
    #

    if 'concat_all_files' in steps:
        if do_merging:
            # program_list is guaranteed to be set here by the merging block above.
            maf_list = ["mergeA." + tumor + ".maf" for tumor in program_list]
            concat_all_merged_files(maf_list, one_big_tsv)
        else:
            with open(file_traversal_list, mode='r') as traversal_list_file:
                all_files = traversal_list_file.read().splitlines()
            concat_all_files(all_files, one_big_tsv, params['PROGRAM_PREFIX'],
                             extra_cols, file_info)

    #
    # Scrape the column descriptions from the GDC web page
    #

    if 'scrape_schema' in steps:
        scrape_list = scrape_schema(params['MAF_URL'], params['FIRST_MAF_COL'])
        with open(hold_scraped_dict, mode='w') as scraped_hold_list:
            scraped_hold_list.write(json_dumps(scrape_list))

    #
    # For the legacy table, the descriptions had lots of analysis tidbits. Very nice, but hard to maintain.
    # We just use hardwired schema descriptions now, most directly pulled from the GDC website:
    #

    if 'build_the_schema' in steps:
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(hold_scraped_dict, AUGMENTED_SCHEMA_FILE,
                              typing_tups, hold_schema_list, hold_schema_dict)

    #
    # Upload the giant TSV into a cloud bucket:
    #

    if 'upload_to_bucket' in steps:
        upload_to_bucket(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'],
                         one_big_tsv)

    #
    # Create the BQ table from the TSV:
    #

    if 'create_bq_from_tsv' in steps:
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                             params['BUCKET_SKEL_TSV'])
        with open(hold_schema_list, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['SKELETON_TABLE'], params['BQ_AS_BATCH'])

    #
    # Need to merge in aliquot and sample barcodes from other tables:
    #

    if 'collect_barcodes' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])

        success = attach_aliquot_ids(skel_table, params['FILE_TABLE'],
                                     params['TARGET_DATASET'],
                                     params['BARCODE_STEP_1_TABLE'],
                                     params['BQ_AS_BATCH'])
        if not success:
            print("attach_aliquot_ids job failed")
            return

        step_1_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['BARCODE_STEP_1_TABLE'])
        success = attach_barcodes(step_1_table, params['ALIQUOT_TABLE'],
                                  params['TARGET_DATASET'],
                                  params['BARCODE_STEP_2_TABLE'],
                                  params['BQ_AS_BATCH'])
        if not success:
            print("attach_barcodes job failed")
            return

    #
    # Merge the barcode info into the final table we are building:
    #

    if 'create_final_table' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])
        barcodes_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                           params['TARGET_DATASET'],
                                           params['BARCODE_STEP_2_TABLE'])
        success = final_merge(skel_table, barcodes_table,
                              params['TARGET_DATASET'],
                              params['FINAL_TARGET_TABLE'],
                              params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            return

    #
    # The derived table we generate has no field descriptions. Add them from the scraped page:
    #

    if 'update_final_schema' in steps:
        success = update_schema(params['TARGET_DATASET'],
                                params['FINAL_TARGET_TABLE'], hold_schema_dict)
        if not success:
            print("Schema update failed")
            return

    #
    # Add the table description:
    #

    if 'add_table_description' in steps:
        # The configured description is a template with a slot for the MAF URL.
        desc = params['TABLE_DESCRIPTION'].format(params['MAF_URL'])
        update_description(params['TARGET_DATASET'],
                           params['FINAL_TARGET_TABLE'], desc)

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = [
            'SKELETON_TABLE', 'BARCODE_STEP_1_TABLE', 'BARCODE_STEP_2_TABLE',
            'BQ_MANIFEST_TABLE', 'BQ_PULL_LIST_TABLE'
        ]
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)
    #
    # Done!
    #

    print('job completed')

Exemple #5

0

Afficher le fichier

def main(args):
    """
    Run the per-file-type table workflow described by a YAML configuration.

    :param args: argv-style list; args[0] is the program name (used in the usage
                 message) and args[1] is the path to the YAML configuration file.
    :return: None. Progress and failures are reported with print(); a failed stage
             ends the run early.

    Each stage only runs when its name appears in the configured steps. Several
    stages iterate over the groups produced by 'group_by_type'; when that stage is
    not part of this run, the grouping is rebuilt from the traversal list file so
    that a job can be resumed part-way through.
    """

    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, bq_filters, na_values, steps = load_config(yaml_file.read())

    # Same guard the sibling workflows use: stop cleanly instead of failing on the
    # first params[...] lookup when the configuration did not load.
    if params is None:
        print("Bad YAML load")
        return

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    #
    # Actual fields have brackets:
    #

    na_set = {"[{}]".format(val) for val in na_values}

    if 'clear_target_directory' in steps:
        print('clear_target_directory')
        create_clean_target(local_files_dir)

    #
    # Use the filter set to build a manifest. Note that if a pull list is
    # provided, these steps can be omitted:
    #

    if 'build_manifest_from_filters' in steps:
        print('build_manifest_from_filters')
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None

        manifest_success = get_the_bq_manifest(params['FILE_TABLE'], bq_filters, max_files,
                                               params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                               params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
                                               params['BUCKET_MANIFEST_TSV'], manifest_file,
                                               params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, uses BQ as long as you have built the manifest using BQ (that route uses the BQ Manifest
    # table that was created).
    #

    if 'build_pull_list' in steps:
        print('build_pull_list')
        full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                          params['TARGET_DATASET'],
                                          params['BQ_MANIFEST_TABLE'])
        success = build_pull_list_with_bq_public(full_manifest, params['INDEXD_BQ_TABLE'],
                                                 params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                                 params['BQ_PULL_LIST_TABLE'],
                                                 params['WORKING_BUCKET'],
                                                 params['BUCKET_PULL_LIST'],
                                                 local_pull_list, params['BQ_AS_BATCH'])

        if not success:
            print("Build pull list failed")
            return

    #
    # Now hitting GDC cloud buckets. Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        print('download_from_gdc')
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        bp = BucketPuller(10)
        bp.pull_from_buckets(pull_list, local_files_dir)

    if 'build_file_list' in steps:
        print('build_file_list')
        all_files = build_file_list(local_files_dir)
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    #
    # Group the files by suffix. The grouping drives the concat, schema, upload and
    # load stages below:
    #

    group_dict = None

    if 'group_by_type' in steps:
        print('group_by_type')
        print(file_traversal_list)
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        group_dict = group_by_suffixes(all_files)

    group_consumers = ('concat_all_files', 'analyze_the_schema', 'upload_to_bucket', 'create_bq_from_tsv')
    if group_dict is None and any(step in steps for step in group_consumers):
        # 'group_by_type' is not part of this run but a later stage needs its result.
        # Rebuild it from the traversal list rather than failing with a NameError.
        # NOTE(review): assumes group_by_suffixes depends only on the file list - confirm.
        with open(file_traversal_list, mode='r') as traversal_list_file:
            group_dict = group_by_suffixes(traversal_list_file.read().splitlines())

    if 'convert_excel_to_csv' in steps:
        print('convert_excel_to_csv')
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        convert_excel_to_csv(all_files, local_files_dir)

    if 'concat_all_files' in steps:
        print('concat_all_files')
        # one_big_tsv is a template: the group key is substituted into the path.
        for k, v in group_dict.items():
            concat_all_files(v, one_big_tsv.format(k), na_set)

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'], params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        # One schema per group; the hold-file paths are templates keyed by group.
        for k in group_dict:
            typing_tups = build_schema(one_big_tsv.format(k), params['SCHEMA_SAMPLE_SKIPS'])
            hold_schema_dict_for_group = hold_schema_dict.format(k)
            hold_schema_list_for_group = hold_schema_list.format(k)
            build_combined_schema(None, None,
                                  typing_tups, hold_schema_list_for_group, hold_schema_dict_for_group)

    bucket_target_blob = '{}/{}'.format(params['WORKING_BUCKET_DIR'], params['BUCKET_TSV'])

    if 'upload_to_bucket' in steps:
        print('upload_to_bucket')
        for k in group_dict:
            upload_to_bucket(params['WORKING_BUCKET'], bucket_target_blob.format(k), one_big_tsv.format(k))

    if 'create_bq_from_tsv' in steps:
        print('create_bq_from_tsv')
        for k in group_dict:
            bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], bucket_target_blob.format(k))
            with open(hold_schema_list.format(k), mode='r') as schema_hold_dict:
                typed_schema = json_loads(schema_hold_dict.read())
            # "." and "-" in the group key are replaced with "_" before the key is
            # substituted into the table name.
            csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                      params['FINAL_TARGET_TABLE'].format(k.replace(".", "_").replace("-", "_")),
                      params['BQ_AS_BATCH'])

    if 'add_aliquot_fields' in steps:
        print('add_aliquot_fields')
        full_target_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['TARGET_DATASET'],
                                              params['TARGET_TABLE'])
        success = join_with_aliquot_table(full_target_table, params['ALIQUOT_TABLE'],
                                          params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'],
                                          params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            # Stop here like every other failed stage; the stages below operate on
            # the table this join was supposed to produce.
            return

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        # Reduce each schema entry to just its description, keyed by field name.
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}

        success = update_schema_with_dict(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    #
    # publish table:
    #

    if 'publish' in steps:

        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'], params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])

        success = publish_table(source_table, publication_dest)

        if not success:
            print("publish table failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = ['TARGET_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    print('job completed')

Exemple #6

0

Afficher le fichier

Fichier : build_pdc_metadata.py Projet : CancerDataAggregator/etl

def main(args):
    """
    Run the PDC metadata workflow steps selected in a YAML configuration file.

    The steps are attempted in a fixed order, and each one runs only if its
    name appears in the ``steps`` value returned by ``load_config``. Steps
    that check a success flag print a message and end the run early when the
    flag is false.

    Args:
        args: argv-style sequence; ``args[0]`` is the program name (used in
            the usage message) and ``args[1]`` is the path of the YAML
            configuration file.

    Returns:
        None in all cases; progress and failures are reported with print().
    """

    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())

    # Without this guard a failed load would surface as a TypeError on the
    # first params[...] lookup below.
    # NOTE(review): assumes load_config signals failure by returning None for
    # params, as the sibling workflow scripts check for -- confirm.
    if params is None:
        print("Bad YAML load")
        return

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    prog_tsv = "{}/{}".format(home, params['PROG_TSV'])
    case_tsv = "{}/{}".format(home, params['CASE_TSV'])
    sample_tsv = "{}/{}".format(home, params['SAMPLE_TSV'])
    aliquot_tsv = "{}/{}".format(home, params['ALIQUOT_TSV'])

    hold_schema_dict_prog = "{}/{}".format(home,
                                           params['HOLD_SCHEMA_DICT_PROG'])
    hold_schema_list_prog = "{}/{}".format(home,
                                           params['HOLD_SCHEMA_LIST_PROG'])
    hold_schema_dict_case = "{}/{}".format(home,
                                           params['HOLD_SCHEMA_DICT_CASE'])
    hold_schema_list_case = "{}/{}".format(home,
                                           params['HOLD_SCHEMA_LIST_CASE'])
    hold_schema_dict_sample = "{}/{}".format(home,
                                             params['HOLD_SCHEMA_DICT_SAMPLE'])
    hold_schema_list_sample = "{}/{}".format(home,
                                             params['HOLD_SCHEMA_LIST_SAMPLE'])
    hold_schema_dict_aliquot = "{}/{}".format(
        home, params['HOLD_SCHEMA_DICT_ALIQUOT'])
    hold_schema_list_aliquot = "{}/{}".format(
        home, params['HOLD_SCHEMA_LIST_ALIQUOT'])

    if 'clear_target_directory' in steps:
        print('clear_target_directory')
        create_clean_target(local_files_dir)

    #
    # Pull the raw metadata from the PDC endpoint into the local TSV files:
    #

    if 'pull_cases_per_program_from_pdc' in steps:
        endpoint = params["PDC_ENDPOINT"]
        success = pull_cases_per_program_from_pdc(endpoint, prog_tsv)
        if not success:
            print("Failure pulling programs")
            return

    if 'pull_aliquots_from_pdc' in steps:
        endpoint = params["PDC_ENDPOINT"]
        success = pull_aliquots_from_pdc(endpoint, case_tsv, sample_tsv,
                                         aliquot_tsv)
        if not success:
            # Message used to repeat the "programs" text from the step above.
            print("Failure pulling aliquots")
            return

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'],
                                   params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                        params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        # build_combined_schema is given None for its first two arguments, so
        # no per-table schema file location is needed here. Earlier revisions
        # computed one per table and never used it, which only added params
        # lookups that could raise KeyError for nothing.
        schema_inputs = (
            (prog_tsv, hold_schema_list_prog, hold_schema_dict_prog),
            (case_tsv, hold_schema_list_case, hold_schema_dict_case),
            (sample_tsv, hold_schema_list_sample, hold_schema_dict_sample),
            (aliquot_tsv, hold_schema_list_aliquot, hold_schema_dict_aliquot),
        )
        for tsv_file, schema_list_file, schema_dict_file in schema_inputs:
            typing_tups = build_schema(tsv_file, params['SCHEMA_SAMPLE_SKIPS'])
            build_combined_schema(None, None, typing_tups, schema_list_file,
                                  schema_dict_file)

    bucket_target_program = '{}/{}'.format(params['WORKING_BUCKET_DIR'],
                                           params['BUCKET_TSV_PROGRAM'])
    bucket_target_case = '{}/{}'.format(params['WORKING_BUCKET_DIR'],
                                        params['BUCKET_TSV_CASE'])
    bucket_target_sample = '{}/{}'.format(params['WORKING_BUCKET_DIR'],
                                          params['BUCKET_TSV_SAMPLE'])
    bucket_target_aliquot = '{}/{}'.format(params['WORKING_BUCKET_DIR'],
                                           params['BUCKET_TSV_ALIQUOT'])

    if 'upload_to_bucket' in steps:
        print('upload_to_bucket')
        uploads = (
            (bucket_target_program, prog_tsv),
            (bucket_target_case, case_tsv),
            (bucket_target_sample, sample_tsv),
            (bucket_target_aliquot, aliquot_tsv),
        )
        for bucket_target, local_tsv in uploads:
            upload_to_bucket(params['WORKING_BUCKET'], bucket_target,
                             local_tsv)

    if 'create_bq_from_tsv' in steps:
        print('create_bq_from_tsv')
        # (bucket blob, typed-schema file written by analyze_the_schema,
        #  params key naming the destination table), in load order.
        loads = (
            (bucket_target_program, hold_schema_list_prog,
             'TARGET_TABLE_PROG'),
            (bucket_target_case, hold_schema_list_case, 'TARGET_TABLE_CASE'),
            (bucket_target_sample, hold_schema_list_sample,
             'TARGET_TABLE_SAMPLE'),
            (bucket_target_aliquot, hold_schema_list_aliquot,
             'TARGET_TABLE_ALIQUOT'),
        )
        for bucket_target, schema_list_file, table_key in loads:
            bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                                 bucket_target)
            with open(schema_list_file, mode='r') as schema_hold_dict:
                typed_schema = json_loads(schema_hold_dict.read())
            csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                      params[table_key], params['BQ_AS_BATCH'])

    if 'join_case_tables' in steps:
        print('join_case_tables')
        full_target_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['TARGET_DATASET'],
                                              params['TARGET_TABLE'])
        success = join_with_aliquot_table(full_target_table,
                                          params['ALIQUOT_TABLE'],
                                          params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            # Stop here like every other checked step: the later steps all
            # operate on FINAL_TARGET_TABLE, which this join was meant to
            # produce.
            return

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        # Map each field name to just its description.
        schema_dict = {
            entry['name']: {'description': entry['description']}
            for entry in full_schema_list
        }

        success = update_schema_with_dict(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    #
    # publish table:
    #

    if 'publish' in steps:

        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                             params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])

        success = publish_table(source_table, publication_dest)

        if not success:
            print("publish table failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = ['TARGET_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    print('job completed')

Exemple #7

0

Afficher le fichier

Fichier : build_rna_seq_gexp_bq_table.py Projet : oshahzada98/NextGenETL

def main(args):

    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, file_sets, update_schema_tables, schema_tags, steps = load_config(
            yaml_file.read())

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    # Which release is the workflow running on?
    release = "".join(["r", str(params['RELEASE'])])

    # Create table names
    upload_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], '{}'])
    manifest_table = '_'.join(
        [params['PROGRAM'], params['DATA_TYPE'], 'manifest', '{}'])
    pull_list_table = '_'.join(
        [params['PROGRAM'], params['DATA_TYPE'], 'pull', 'list', '{}'])
    files_to_case_table = '_'.join(
        [params['PROGRAM'], params['DATA_TYPE'], 'files_to_case'])
    files_to_case_w_plat_table = '_'.join(
        [params['PROGRAM'], params['DATA_TYPE'], 'files_to_case_with_plat'])
    barcodes_table = '_'.join(
        [params['PROGRAM'], params['DATA_TYPE'], 'barcodes'])
    counts_w_metadata_table = '_'.join(
        [params['PROGRAM'], params['DATA_TYPE'], 'counts_and_meta', '{}'])
    merged_counts_table = '_'.join(
        [params['PROGRAM'], params['DATA_TYPE'], 'merged_counts'])
    draft_table = '_'.join(
        [params['PROGRAM'], params['DATA_TYPE'], params['BUILD'], 'gdc', '{}'])
    publication_table = '_'.join(
        [params['DATA_TYPE'], params['BUILD'], 'gdc', '{}'])

    if params['RELEASE'] < 21 and 'METADATA_REL' not in params:
        print("The input release is before new metadata process, "
              "please specify which release of the metadata to use.")

    metadata_rel = "".join(["r", str(params['METADATA_REL'])
                            ]) if 'METADATA_REL' in params else release

    if 'clear_target_directory' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            create_clean_target(local_files_dir.format(count_name))

    if 'build_manifest_from_filters' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            mani_for_count = manifest_file.format(count_name)
            table_for_count = manifest_table.format(count_name)
            tsv_for_count = params['BUCKET_MANIFEST_TSV'].format(count_name)
            max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
            manifest_success = get_the_bq_manifest(
                params['FILE_TABLE'].format(metadata_rel),
                count_dict['filters'], max_files, params['WORKING_PROJECT'],
                params['SCRATCH_DATASET'], table_for_count,
                params['WORKING_BUCKET'], tsv_for_count, mani_for_count,
                params['BQ_AS_BATCH'])
            if not manifest_success:
                print("Failure generating manifest")
                return

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, uses BQ as long as you have built the manifest using BQ (that route uses the BQ Manifest
    # table that was created).
    #

    if 'build_pull_list' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            table_for_count = manifest_table.format(count_name)
            local_pull_for_count = local_pull_list.format(count_name)
            pull_table_for_count = pull_list_table.format(count_name)
            bucket_pull_list_for_count = params['BUCKET_PULL_LIST'].format(
                count_name)
            full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['SCRATCH_DATASET'],
                                              table_for_count)
            build_pull_list_with_bq(
                full_manifest, params['INDEXD_BQ_TABLE'].format(metadata_rel),
                params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                pull_table_for_count, params['WORKING_BUCKET'],
                bucket_pull_list_for_count, local_pull_for_count,
                params['BQ_AS_BATCH'])
    #
    # Now hitting GDC cloud buckets. Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            pull_for_count = local_pull_list.format(count_name)
            with open(pull_for_count, mode='r') as pull_list_file:
                pull_list = pull_list_file.read().splitlines()
            print("Preparing to download %s files from buckets\n" %
                  len(pull_list))
            bp = BucketPuller(10)
            local_files_dir_for_count = local_files_dir.format(count_name)
            bp.pull_from_buckets(pull_list, local_files_dir_for_count)

    if 'build_file_list' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            local_files_dir_for_count = local_files_dir.format(count_name)
            all_files = build_file_list(local_files_dir_for_count)
            file_traversal_list_for_count = file_traversal_list.format(
                count_name)
            with open(file_traversal_list_for_count,
                      mode='w') as traversal_list:
                for line in all_files:
                    traversal_list.write("{}\n".format(line))

    if 'concat_all_files' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            header = count_dict['header'] if 'header' in count_dict else None
            file_traversal_list_for_count = file_traversal_list.format(
                count_name)
            with open(file_traversal_list_for_count,
                      mode='r') as traversal_list_file:
                all_files = traversal_list_file.read().splitlines()
                concat_all_files(all_files, one_big_tsv.format(count_name),
                                 header)

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'],
                                   params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    for table in update_schema_tables:
        if table == 'current':
            use_schema = params['SCHEMA_FILE_NAME']
            schema_release = 'current'
        else:
            use_schema = params['VER_SCHEMA_FILE_NAME']
            schema_release = release

        if 'process_git_schemas' in steps:
            print('process_git_schema')
            # Where do we dump the schema git repository?
            schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                            params['RAW_SCHEMA_DIR'],
                                            use_schema)
            full_file_prefix = "{}/{}".format(
                params['PROX_DESC_PREFIX'], draft_table.format(schema_release))
            # Write out the details
            success = generate_table_detail_files(schema_file,
                                                  full_file_prefix)
            if not success:
                print("process_git_schemas failed")
                return

        # Customize generic schema to this data program:

        if 'replace_schema_tags' in steps:
            print('replace_schema_tags')
            pn = params['PROGRAM']
            dataset_tuple = (pn, pn.replace(".", "_"))
            tag_map_list = []
            for tag_pair in schema_tags:
                for tag in tag_pair:
                    val = tag_pair[tag]
                    use_pair = {}
                    tag_map_list.append(use_pair)
                    if val.find('~-') == 0 or val.find(
                            '~lc-') == 0 or val.find('~lcbqs-') == 0:
                        chunks = val.split('-', 1)
                        if chunks[1] == 'programs':
                            if val.find('~lcbqs-') == 0:
                                rep_val = dataset_tuple[1].lower(
                                )  # can't have "." in a tag...
                            else:
                                rep_val = dataset_tuple[0]
                        elif chunks[1] == 'builds':
                            rep_val = params['BUILD']
                        else:
                            raise Exception()
                        if val.find('~lc-') == 0:
                            rep_val = rep_val.lower()
                        use_pair[tag] = rep_val
                    else:
                        use_pair[tag] = val
            full_file_prefix = "{}/{}".format(
                params['PROX_DESC_PREFIX'], draft_table.format(schema_release))

            # Write out the details
            success = customize_labels_and_desc(full_file_prefix, tag_map_list)

            if not success:
                print("replace_schema_tags failed")
                return False

        if 'analyze_the_schema' in steps:
            print('analyze_the_schema')
            for file_set in file_sets:
                count_name, _ = next(iter(file_set.items()))
                typing_tups = build_schema(one_big_tsv.format(count_name),
                                           params['SCHEMA_SAMPLE_SKIPS'])
                full_file_prefix = "{}/{}".format(
                    params['PROX_DESC_PREFIX'],
                    draft_table.format(schema_release))
                schema_dict_loc = "{}_schema.json".format(full_file_prefix)
                build_combined_schema(None, schema_dict_loc, typing_tups,
                                      hold_schema_list.format(count_name),
                                      hold_schema_dict.format(count_name))

    bucket_target_blob_sets = {}
    for file_set in file_sets:
        count_name, _ = next(iter(file_set.items()))
        bucket_target_blob_sets[count_name] = '{}/{}-{}-{}-{}.tsv'.format(
            params['WORKING_BUCKET_DIR'], params['DATE'], params['PROGRAM'],
            params['DATA_TYPE'], count_name)

    if 'upload_to_bucket' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            upload_to_bucket(params['WORKING_BUCKET'],
                             bucket_target_blob_sets[count_name],
                             one_big_tsv.format(count_name))

    if 'delete_all_bq' in steps:
        table_cleaner(params, file_sets, True)

    if 'create_bq_from_tsv' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            bucket_src_url = 'gs://{}/{}'.format(
                params['WORKING_BUCKET'], bucket_target_blob_sets[count_name])
            hold_schema_list_for_count = hold_schema_list.format(count_name)
            with open(hold_schema_list_for_count,
                      mode='r') as schema_hold_dict:
                typed_schema = json_loads(schema_hold_dict.read())
            csv_to_bq_write_depo(typed_schema, bucket_src_url,
                                 params['SCRATCH_DATASET'],
                                 upload_table.format(count_name),
                                 params['BQ_AS_BATCH'], None)

    if 'attach_ids_to_files' in steps:
        count = 0
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            write_depo = "WRITE_TRUNCATE" if (count == 0) else "WRITE_APPEND"
            gexp_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                           params['SCRATCH_DATASET'],
                                           upload_table.format(count_name))
            success = build_aliquot_and_case(
                gexp_table, params['FILEDATA_TABLE'].format(release),
                params['SCRATCH_DATASET'], files_to_case_table, write_depo, {},
                params['BQ_AS_BATCH'])
            count += 1

        if not success:
            print("attach_ids_to_files failed")
            return

    if 'extract_platform' in steps:
        step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                        params['SCRATCH_DATASET'],
                                        files_to_case_table)
        success = extract_platform_for_files(
            step2_table, params['FILEDATA_TABLE'].format(release),
            params['SCRATCH_DATASET'], files_to_case_w_plat_table, True, {},
            params['BQ_AS_BATCH'])

        if not success:
            print("extract_platform failed")
            return

    if 'attach_barcodes_to_ids' in steps:
        step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                        params['SCRATCH_DATASET'],
                                        files_to_case_w_plat_table)

        if params['RELEASE'] < 25:
            case_table = params['CASE_TABLE'].format("r25")
        else:
            case_table = params['CASE_TABLE'].format(release)

        success = attach_barcodes(step2_table,
                                  params['ALIQUOT_TABLE'].format(metadata_rel),
                                  case_table, params['SCRATCH_DATASET'],
                                  barcodes_table, True, params['BQ_AS_BATCH'])

        if not success:
            print("attach_barcodes_to_ids failed")
            return

    if 'merge_counts_and_metadata' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            if 'header' not in count_dict:
                print("must have defined headers to work")
                break
            header = count_dict['header']
            print(header)
            sql_dict = {}
            sql_dict['count_column'] = header.split(',')[1].strip()
            sql_dict['file_column'] = 'file_gdc_id_{}'.format(count_name)

            step3_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                            params['SCRATCH_DATASET'],
                                            barcodes_table)
            counts_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                             params['SCRATCH_DATASET'],
                                             upload_table.format(count_name))

            success = merge_counts_and_metadata(
                step3_table, counts_table, params['SCRATCH_DATASET'],
                counts_w_metadata_table.format(count_name), True, sql_dict,
                params['BQ_AS_BATCH'])

            if not success:
                print("merge_counts_and_metadata failed")
                return

    if 'merge_all' in steps:
        sql_dict = {}
        count = 0
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            dict_for_set = {}
            sql_dict['table_{}'.format(count)] = dict_for_set
            count += 1
            if 'header' not in count_dict:
                print("must have defined headers to work")
                return
            header = count_dict['header']
            dict_for_set['count_column'] = header.split(',')[1].strip()
            dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name)
            dict_for_set['table'] = '{}.{}.{}'.format(
                params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                counts_w_metadata_table.format(count_name))

        success = all_counts_to_one_table(params['SCRATCH_DATASET'],
                                          merged_counts_table, True, sql_dict,
                                          params['BQ_AS_BATCH'])

        if not success:
            print("merge_counts_and_metadata failed")
            return

    if 'glue_gene_names' in steps:
        sql_dict = {}
        count = 0
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            dict_for_set = {}
            sql_dict['table_{}'.format(count)] = dict_for_set
            count += 1
            if 'header' not in count_dict:
                print("must have defined headers to work")
                return
            header = count_dict['header']
            dict_for_set['count_column'] = header.split(',')[1].strip()
            dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name)

        three_counts_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                               params['SCRATCH_DATASET'],
                                               merged_counts_table)

        success = glue_in_gene_names(three_counts_table,
                                     params['GENE_NAMES_TABLE'],
                                     params['SCRATCH_DATASET'],
                                     draft_table.format(release), True,
                                     sql_dict, params['BQ_AS_BATCH'])

        if not success:
            print("glue_gene_names failed")
            return

        #
        # Create second table
        #

    if 'create_current_table' in steps:
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['SCRATCH_DATASET'],
                                         draft_table.format(release))
        current_dest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['SCRATCH_DATASET'],
                                         draft_table.format('current'))

        success = publish_table(source_table, current_dest)

        if not success:
            print("create current table failed")
            return

    #
    # The derived table we generate has no field descriptions. Add them from the github json files:
    #
    for table in update_schema_tables:
        schema_release = 'current' if table == 'current' else release
        if 'update_final_schema' in steps:
            success = update_schema(params['SCRATCH_DATASET'],
                                    draft_table.format(schema_release),
                                    hold_schema_dict.format('counts'))
            if not success:
                print("Schema update failed")
                return

        #
        # Add description and labels to the target table:
        #

        if 'add_table_description' in steps:
            print('update_table_description')
            full_file_prefix = "{}/{}".format(
                params['PROX_DESC_PREFIX'], draft_table.format(schema_release))
            success = install_labels_and_desc(
                params['SCRATCH_DATASET'], draft_table.format(schema_release),
                full_file_prefix)
            if not success:
                print("update_table_description failed")
                return

    #
    # compare and remove old current table
    #

    # compare the two tables
    if 'compare_remove_old_current' in steps:
        old_current_table = '{}.{}.{}'.format(
            params['PUBLICATION_PROJECT'], params['PUBLICATION_DATASET'],
            publication_table.format('current'))
        previous_ver_table = '{}.{}.{}'.format(
            params['PUBLICATION_PROJECT'],
            "_".join([params['PUBLICATION_DATASET'], 'versioned']),
            publication_table.format("".join(
                ["r", str(params['PREVIOUS_RELEASE'])])))
        table_temp = '{}.{}.{}'.format(
            params['WORKING_PROJECT'], params['SCRATCH_DATASET'], "_".join([
                params['PROGRAM'],
                publication_table.format("".join(
                    ["r", str(params['PREVIOUS_RELEASE'])])), 'backup'
            ]))

        print('Compare {} to {}'.format(old_current_table, previous_ver_table))

        compare = compare_two_tables(old_current_table, previous_ver_table,
                                     params['BQ_AS_BATCH'])

        num_rows = compare.total_rows

        if num_rows == 0:
            print('the tables are the same')
        else:
            print('the tables are NOT the same and differ by {} rows'.format(
                num_rows))

        if not compare:
            print('compare_tables failed')
            return
        # move old table to a temporary location
        elif compare and num_rows == 0:
            print('Move old table to temp location')
            table_moved = publish_table(old_current_table, table_temp)

            if not table_moved:
                print('Old Table was not moved and will not be deleted')
            # remove old table
            elif table_moved:
                print('Deleting old table: {}'.format(old_current_table))
                delete_table = delete_table_bq_job(
                    params['PUBLICATION_DATASET'],
                    publication_table.format('current'))
                if not delete_table:
                    print('delete table failed')
                    return

    #
    # publish table:
    #

    if 'publish' in steps:
        print('publish tables')
        tables = ['versioned', 'current']

        for table in tables:
            if table == 'versioned':
                print(table)
                source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                                 params['SCRATCH_DATASET'],
                                                 draft_table.format(release))
                publication_dest = '{}.{}.{}'.format(
                    params['PUBLICATION_PROJECT'],
                    "_".join([params['PUBLICATION_DATASET'], 'versioned']),
                    publication_table.format(release))
            elif table == 'current':
                print(table)
                source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                                 params['SCRATCH_DATASET'],
                                                 draft_table.format('current'))
                publication_dest = '{}.{}.{}'.format(
                    params['PUBLICATION_PROJECT'],
                    params['PUBLICATION_DATASET'],
                    publication_table.format('current'))
            success = publish_table(source_table, publication_dest)

        if not success:
            print("publish table failed")
            return

    #
    # Update previous versioned table with archived tag
    #

    if 'update_status_tag' in steps:
        print('Update previous table')

        success = update_status_tag(
            "_".join([params['PUBLICATION_DATASET'], 'versioned']),
            publication_table.format("".join(
                ["r", str(params['PREVIOUS_RELEASE'])])), 'archived')

        if not success:
            print("update status tag table failed")
            return

    if 'dump_working_tables' in steps:
        dump_tables = [
            files_to_case_table, files_to_case_w_plat_table, barcodes_table,
            counts_w_metadata_table, merge_counts_and_metadata,
            merged_counts_table, draft_table
        ]
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            dump_tables.append(upload_table.format(count_name))
            dump_tables.append(counts_w_metadata_table.format(count_name))
            dump_tables.append(manifest_table.format(count_name))
            dump_tables.append(pull_list_table.format(count_name))

        table_cleaner(dump_tables, False)

    #
    # archive files on VM:
    #

    bucket_archive_blob_sets = {}
    for file_set in file_sets:
        count_name, _ = next(iter(file_set.items()))
        bucket_target_blob_sets[count_name] = '{}/{}-{}-{}-{}'.format(
            params['ARCHIVE_BUCKET_DIR'], params['DATE'], params['PROGRAM'],
            params['DATA_TYPE'], release, count_name)

    if 'archive' in steps:

        print('archive files from VM')
        archive_file_prefix = "{}_{}".format(date.today(),
                                             params['PUBLICATION_DATASET'])
        if params['ARCHIVE_YAML']:
            yaml_file = re.search(r"\/(\w*.yaml)$", args[1])
            archive_yaml = "{}/{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'],
                                                params['ARCHIVE_CONFIG'],
                                                archive_file_prefix,
                                                yaml_file.group(1))
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_yaml, args[1])

        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            pull_file_name = params['LOCAL_PULL_LIST']
            archive_pull_file = "{}/{}_{}".format(
                params['ARCHIVE_BUCKET_DIR'], archive_file_prefix,
                pull_file_name.format(count_name))
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_pull_file,
                             local_pull_list.format(count_name))
            manifest_file_name = params['MANIFEST_FILE']
            archive_manifest_file = "{}/{}_{}".format(
                params['ARCHIVE_BUCKET_DIR'], archive_file_prefix,
                manifest_file_name.format(count_name))
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_manifest_file,
                             manifest_file.format(count_name))

    print('job completed')

Exemple #8

0

Afficher le fichier

def main(args):
    """
    Run the configured steps of the multi-count-file BigQuery table build.

    args is the command-line argument list: args[0] is the program name (only
    used in the usage message) and args[1] is the path to the YAML
    configuration file. load_config() turns that file into:

      params    -- settings looked up by key (paths, datasets, table names...)
      file_sets -- an iterable of dictionaries; only the first (count name,
                   settings) entry of each one is used
      steps     -- the names of the workflow steps to perform

    Each step below only runs if its name is present in steps. The function
    always returns None; when a step reports failure a message is printed and
    the function returns early without printing 'job completed'.
    """

    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, file_sets, steps = load_config(yaml_file.read())

    # Same guard the sibling build scripts use; without it a failed load
    # surfaces as a TypeError on the first params[...] lookup below.
    if params is None:
        print("Bad YAML load")
        return

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #
    # NOTE: these paths are templates; each one is .format()-ed with the count
    # name before it is used.
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    if 'clear_target_directory' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            create_clean_target(local_files_dir.format(count_name))

    if 'build_manifest_from_filters' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            mani_for_count = manifest_file.format(count_name)
            table_for_count = params['BQ_MANIFEST_TABLE'].format(count_name)
            tsv_for_count = params['BUCKET_MANIFEST_TSV'].format(count_name)
            max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
            manifest_success = get_the_bq_manifest(
                params['FILE_TABLE'], count_dict['filters'], max_files,
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                table_for_count, params['WORKING_BUCKET'], tsv_for_count,
                mani_for_count, params['BQ_AS_BATCH'])
            if not manifest_success:
                print("Failure generating manifest")
                return

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, uses BQ as long as you have built the manifest using BQ (that route uses the BQ Manifest
    # table that was created).
    #

    if 'build_pull_list' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            table_for_count = params['BQ_MANIFEST_TABLE'].format(count_name)
            local_pull_for_count = local_pull_list.format(count_name)
            pull_table_for_count = params['BQ_PULL_LIST_TABLE'].format(
                count_name)
            bucket_pull_list_for_count = params['BUCKET_PULL_LIST'].format(
                count_name)
            full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['TARGET_DATASET'],
                                              table_for_count)
            build_pull_list_with_bq(
                full_manifest, params['INDEXD_BQ_TABLE'],
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                pull_table_for_count, params['WORKING_BUCKET'],
                bucket_pull_list_for_count, local_pull_for_count,
                params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets. Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            pull_for_count = local_pull_list.format(count_name)
            with open(pull_for_count, mode='r') as pull_list_file:
                pull_list = pull_list_file.read().splitlines()
            print("Preparing to download %s files from buckets\n" %
                  len(pull_list))
            bp = BucketPuller(10)
            local_files_dir_for_count = local_files_dir.format(count_name)
            bp.pull_from_buckets(pull_list, local_files_dir_for_count)

    #
    # Record every downloaded file, one path per line, in the traversal list:
    #

    if 'build_file_list' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            local_files_dir_for_count = local_files_dir.format(count_name)
            all_files = build_file_list(local_files_dir_for_count)
            file_traversal_list_for_count = file_traversal_list.format(
                count_name)
            with open(file_traversal_list_for_count,
                      mode='w') as traversal_list:
                for line in all_files:
                    traversal_list.write("{}\n".format(line))

    #
    # Concatenate the files named in the traversal list into one TSV per count:
    #

    if 'concat_all_files' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            header = count_dict['header'] if 'header' in count_dict else None
            file_traversal_list_for_count = file_traversal_list.format(
                count_name)
            with open(file_traversal_list_for_count,
                      mode='r') as traversal_list_file:
                all_files = traversal_list_file.read().splitlines()
                concat_all_files(all_files, one_big_tsv.format(count_name),
                                 header)

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'],
                                   params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                        params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            typing_tups = build_schema(one_big_tsv.format(count_name),
                                       params['SCHEMA_SAMPLE_SKIPS'])
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                              params['FINAL_TARGET_TABLE'])
            schema_dict_loc = "{}_schema.json".format(full_file_prefix)
            build_combined_schema(None, schema_dict_loc, typing_tups,
                                  hold_schema_list.format(count_name),
                                  hold_schema_dict.format(count_name))

    #
    # Working-bucket blob path for each count's TSV. Built unconditionally
    # because both the upload and the BQ load steps below read it:
    #

    bucket_target_blob_sets = {}
    for file_set in file_sets:
        count_name, _ = next(iter(file_set.items()))
        bucket_target_blob_sets[count_name] = '{}/{}'.format(
            params['WORKING_BUCKET_DIR'],
            params['BUCKET_TSV'].format(count_name))

    if 'upload_to_bucket' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            upload_to_bucket(params['WORKING_BUCKET'],
                             bucket_target_blob_sets[count_name],
                             one_big_tsv.format(count_name))

    if 'delete_all_bq' in steps:
        table_cleaner(params, file_sets, True)

    if 'create_bq_from_tsv' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            bucket_src_url = 'gs://{}/{}'.format(
                params['WORKING_BUCKET'], bucket_target_blob_sets[count_name])
            hold_schema_list_for_count = hold_schema_list.format(count_name)
            with open(hold_schema_list_for_count,
                      mode='r') as schema_hold_dict:
                typed_schema = json_loads(schema_hold_dict.read())
            csv_to_bq_write_depo(typed_schema, bucket_src_url,
                                 params['TARGET_DATASET'],
                                 params['TARGET_TABLE'].format(count_name),
                                 params['BQ_AS_BATCH'], None)

    if 'attach_ids_to_files' in steps:
        for index, file_set in enumerate(file_sets):
            count_name, _ = next(iter(file_set.items()))
            # The first file set replaces the step-2 table; the rest append.
            write_depo = "WRITE_TRUNCATE" if (index == 0) else "WRITE_APPEND"
            gexp_table = '{}.{}.{}'.format(
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                params['TARGET_TABLE'].format(count_name))
            success = build_aliquot_and_case(
                gexp_table, params['FILEDATA_TABLE'], params['TARGET_DATASET'],
                params['STEP_2_TABLE'], write_depo, {}, params['BQ_AS_BATCH'])

            # Check every iteration: checking only after the loop would miss
            # a failure on any file set but the last, and would hit an unbound
            # name when file_sets is empty.
            if not success:
                print("attach_ids_to_files failed")
                return

    if 'extract_platform' in steps:
        step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                        params['TARGET_DATASET'],
                                        params['STEP_2_TABLE'])
        success = extract_platform_for_files(step2_table,
                                             params['FILEDATA_TABLE'],
                                             params['TARGET_DATASET'],
                                             params['STEP_2A_TABLE'], True, {},
                                             params['BQ_AS_BATCH'])

        if not success:
            print("extract_platform failed")
            return

    if 'attach_barcodes_to_ids' in steps:
        step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                        params['TARGET_DATASET'],
                                        params['STEP_2A_TABLE'])
        success = attach_barcodes(step2_table, params['ALIQUOT_TABLE'],
                                  params['TARGET_DATASET'],
                                  params['STEP_3_TABLE'], True, {},
                                  params['BQ_AS_BATCH'])

        if not success:
            print("attach_barcodes_to_ids failed")
            return

    if 'merge_counts_and_metadata' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            if 'header' not in count_dict:
                # Abort like the merge_all and glue_gene_names steps do; just
                # leaving the loop would let later steps run on missing tables.
                print("must have defined headers to work")
                return
            header = count_dict['header']
            print(header)
            sql_dict = {}
            # The count column is the second comma-separated header field.
            sql_dict['count_column'] = header.split(',')[1].strip()
            sql_dict['file_column'] = 'file_gdc_id_{}'.format(count_name)

            step3_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                            params['TARGET_DATASET'],
                                            params['STEP_3_TABLE'])
            counts_table = '{}.{}.{}'.format(
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                params['TARGET_TABLE'].format(count_name))

            success = merge_counts_and_metadata(
                step3_table, counts_table, params['TARGET_DATASET'],
                params['COUNTS_WITH_METADATA_TABLE'].format(count_name), True,
                sql_dict, params['BQ_AS_BATCH'])

            if not success:
                print("merge_counts_and_metadata failed")
                return

    if 'merge_all' in steps:
        # sql_dict maps 'table_<n>' to the column/table names of file set n.
        sql_dict = {}
        count = 0
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            dict_for_set = {}
            sql_dict['table_{}'.format(count)] = dict_for_set
            count += 1
            if 'header' not in count_dict:
                print("must have defined headers to work")
                return
            header = count_dict['header']
            dict_for_set['count_column'] = header.split(',')[1].strip()
            dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name)
            dict_for_set['table'] = '{}.{}.{}'.format(
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                params['COUNTS_WITH_METADATA_TABLE'].format(count_name))

        success = all_counts_to_one_table(params['TARGET_DATASET'],
                                          params['THREE_COUNTS_TABLE'], True,
                                          sql_dict, params['BQ_AS_BATCH'])

        if not success:
            # Name the step that actually failed.
            print("merge_all failed")
            return

    if 'glue_gene_names' in steps:
        sql_dict = {}
        count = 0
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            dict_for_set = {}
            sql_dict['table_{}'.format(count)] = dict_for_set
            count += 1
            if 'header' not in count_dict:
                print("must have defined headers to work")
                return
            header = count_dict['header']
            dict_for_set['count_column'] = header.split(',')[1].strip()
            dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name)

        three_counts_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                               params['TARGET_DATASET'],
                                               params['THREE_COUNTS_TABLE'])

        success = glue_in_gene_names(three_counts_table,
                                     params['GENE_NAMES_TABLE'],
                                     params['TARGET_DATASET'],
                                     params['FINAL_TARGET_TABLE'], True,
                                     sql_dict, params['BQ_AS_BATCH'])

        if not success:
            print("glue_gene_names failed")
            return

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        # Reduce the schema list to {field name: {'description': ...}}.
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}

        success = update_schema_with_dict(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    if 'dump_working_tables' in steps:
        table_cleaner(params, file_sets, False)

    #
    # archive files on VM:
    #
    # Archive-bucket blob path for each count's TSV. These go in their own
    # dictionary so the working-bucket paths above are not overwritten.
    # NOTE(review): nothing below reads this dictionary yet.
    #

    bucket_archive_blob_sets = {}
    for file_set in file_sets:
        count_name, _ = next(iter(file_set.items()))
        bucket_archive_blob_sets[count_name] = '{}/{}'.format(
            params['ARCHIVE_BUCKET_DIR'],
            params['BUCKET_TSV'].format(count_name))

    if 'archive' in steps:
        # Local import keeps this block self-contained.
        from os.path import basename

        print('archive files from VM')
        archive_file_prefix = "{}_{}".format(date.today(),
                                             params['PUBLICATION_DATASET'])
        # Take the file name straight from the path. A regex match on
        # "/<name>.yaml" returns None (and then fails on .group) when the
        # config is given without a directory part.
        yaml_file_name = basename(args[1])
        archive_yaml = "{}/{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'],
                                            params['ARCHIVE_CONFIG'],
                                            archive_file_prefix,
                                            yaml_file_name)
        upload_to_bucket(params['ARCHIVE_BUCKET'], archive_yaml, args[1])
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            pull_file_name = params['LOCAL_PULL_LIST']
            archive_pull_file = "{}/{}_{}".format(
                params['ARCHIVE_BUCKET_DIR'], archive_file_prefix,
                pull_file_name.format(count_name))
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_pull_file,
                             local_pull_list.format(count_name))
            manifest_file_name = params['MANIFEST_FILE']
            archive_manifest_file = "{}/{}_{}".format(
                params['ARCHIVE_BUCKET_DIR'], archive_file_prefix,
                manifest_file_name.format(count_name))
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_manifest_file,
                             manifest_file.format(count_name))

    #
    # publish table:
    #

    if 'publish' in steps:

        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                             params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])

        success = publish_table(source_table, publication_dest)

        if not success:
            print("publish table failed")
            return

    print('job completed')

Exemple #9

0

Afficher le fichier

Fichier : build_cnvr_bq_table.py Projet : madelyngreyes/NextGenETL

def main(args):
    """
    Run the configured steps of the CNVR BigQuery table build.

    args is the command-line argument list: args[0] is the program name (only
    used in the usage message) and args[1] is the path to the YAML
    configuration file. load_config() turns that file into params (settings
    looked up by key), bq_filters (passed through to the manifest query) and
    steps (the names of the workflow steps to perform).

    Each step below only runs if its name is present in steps. The function
    always returns None; when a checked step reports failure a message is
    printed and the function returns early without printing 'job completed'.
    """

    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, bq_filters, steps = load_config(yaml_file.read())

    # Same guard the sibling build scripts use; without it a failed load
    # surfaces as a TypeError on the first params[...] lookup below.
    if params is None:
        print("Bad YAML load")
        return

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    # Schema that describes CNVR table:

    AUGMENTED_SCHEMA_FILE = "SchemaFiles/cnvr_augmented_schema_list.json"

    #
    # Use the filter set to get a manifest from GDC using their API. Note that if a pull list is
    # provided, these steps can be omitted:
    #

    if 'build_manifest_from_filters' in steps:
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None

        manifest_success = get_the_bq_manifest(
            params['FILE_TABLE'], bq_filters, max_files,
            params['WORKING_PROJECT'], params['TARGET_DATASET'],
            params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
            params['BUCKET_MANIFEST_TSV'], manifest_file,
            params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    if 'clear_target_directory' in steps:
        create_clean_target(local_files_dir)

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, uses BQ as long as you have built the manifest using BQ (that route uses the BQ Manifest
    # table that was created).
    #

    if 'build_pull_list' in steps:
        full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                          params['TARGET_DATASET'],
                                          params['BQ_MANIFEST_TABLE'])
        build_pull_list_with_bq(
            full_manifest, params['INDEXD_BQ_TABLE'],
            params['WORKING_PROJECT'], params['TARGET_DATASET'],
            params['BQ_PULL_LIST_TABLE'], params['WORKING_BUCKET'],
            params['BUCKET_PULL_LIST'], local_pull_list, params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets. Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        bp = BucketPuller(10)
        bp.pull_from_buckets(pull_list, local_files_dir)

    #
    # Record every downloaded file, one path per line, in the traversal list:
    #

    if 'build_file_list' in steps:
        all_files = build_file_list(local_files_dir)
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    #
    # Concatenate the files named in the traversal list into one TSV:
    #

    if 'concat_all_files' in steps:
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        concat_all_files(all_files, one_big_tsv)

    if 'build_the_schema' in steps:
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, AUGMENTED_SCHEMA_FILE, typing_tups,
                              hold_schema_list, hold_schema_dict)

    # Built unconditionally: both the upload and the BQ load steps read it.
    bucket_target_blob = '{}/{}'.format(params['WORKING_BUCKET_DIR'],
                                        params['BUCKET_TSV'])

    if 'upload_to_bucket' in steps:
        # Upload the absolute path the concat step wrote to. The raw
        # params['ONE_BIG_TSV'] value is relative to the home directory and
        # only resolves when the job happens to be started from there.
        upload_to_bucket(params['WORKING_BUCKET'], bucket_target_blob,
                         one_big_tsv)

    if 'create_bq_from_tsv' in steps:
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                             bucket_target_blob)
        with open(hold_schema_list, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['TARGET_TABLE'], params['BQ_AS_BATCH'])

    if 'add_aliquot_fields' in steps:
        full_target_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['TARGET_DATASET'],
                                              params['TARGET_TABLE'])
        success = join_with_aliquot_table(full_target_table,
                                          params['ALIQUOT_TABLE'],
                                          params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          params['BQ_AS_BATCH'])
        if not success:
            # Stop here: carrying on would delete the working table the join
            # reads from and then report 'job completed'.
            print("Join job failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = ['TARGET_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    print('job completed')

Exemple #10

0

Afficher le fichier

Fichier : build_mirna_isoform_expr_bq_table.py Projet : CancerDataAggregator/etl

def main(args):
    """
    Drive the isoform table build from a YAML configuration file.

    args is the command-line style argument list: args[0] is the program name
    (used only in the usage message) and args[1] is the path of the YAML
    configuration. Each stage below runs only when its name appears in the
    `steps` collection returned by load_config(). Nothing is returned; a
    failed stage prints a message and ends the run early.
    """

    # Guard clauses: wrong host or wrong argument count means we do nothing.
    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    # Load the YAML configuration:
    with open(args[1], mode='r') as config_handle:
        params, bq_filters, steps, extra_cols = load_config(config_handle.read())

    if params is None:
        print("Bad YAML load")
        return

    # Schema that describes table columns:
    augmented_schema_file = "SchemaFiles/isoform_augmented_schema_list.json"

    # BQ does not like to be given paths that have "~", so every local path is
    # anchored on the expanded home directory instead.
    home = expanduser("~")

    def in_home(param_key):
        # Local path for the configured (home-relative) value of param_key.
        return "{}/{}".format(home, params[param_key])

    def qualified(table_key):
        # Fully qualified project.dataset.table name for a configured table.
        return '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                 params['TARGET_DATASET'],
                                 params[table_key])

    local_files_dir = in_home('LOCAL_FILES_DIR')
    one_big_tsv = in_home('ONE_BIG_TSV')
    manifest_file = in_home('MANIFEST_FILE')
    local_pull_list = in_home('LOCAL_PULL_LIST')
    file_traversal_list = in_home('FILE_TRAVERSAL_LIST')
    hold_schema_dict = in_home('HOLD_SCHEMA_DICT')
    hold_schema_list = in_home('HOLD_SCHEMA_LIST')

    # Use the filter set to get a manifest from GDC using their API. Note that
    # if a pull list is provided, these steps can be omitted.
    if 'build_manifest_from_filters' in steps:
        # MAX_FILES is optional; None is passed when it is not configured.
        max_files = params.get('MAX_FILES')
        manifest_built = get_the_bq_manifest(
            params['FILE_TABLE'], bq_filters, max_files,
            params['WORKING_PROJECT'], params['TARGET_DATASET'],
            params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
            params['BUCKET_MANIFEST_TSV'], manifest_file,
            params['BQ_AS_BATCH'])
        if not manifest_built:
            print("Failure generating manifest")
            return

    # Best practice is to clear out the directory where the files are going,
    # so nothing is left over from an earlier run.
    if 'clear_target_directory' in steps:
        create_clean_target(local_files_dir)

    # We need a "pull list" of gs:// URLs to pull from GDC buckets. If one
    # already exists, place it at 'LOCAL_PULL_LIST' and skip this step.
    # Building it here goes through BQ and relies on the BQ manifest table.
    if 'build_pull_list' in steps:
        manifest_table = qualified('BQ_MANIFEST_TABLE')
        build_pull_list_with_bq(
            manifest_table, params['INDEXD_BQ_TABLE'],
            params['WORKING_PROJECT'], params['TARGET_DATASET'],
            params['BQ_PULL_LIST_TABLE'], params['WORKING_BUCKET'],
            params['BUCKET_PULL_LIST'], local_pull_list, params['BQ_AS_BATCH'])

    # Now hitting GDC cloud buckets: fetch every file named in the pull list.
    if 'download_from_gdc' in steps:
        with open(local_pull_list, mode='r') as pull_handle:
            pull_list = pull_handle.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        puller = BucketPuller(10)
        puller.pull_from_buckets(pull_list, local_files_dir)

    # Traverse the tree of downloaded files and record a flat list of them,
    # one path per line.
    if 'build_traversal_list' in steps:
        found_files = build_file_list(local_files_dir)
        with open(file_traversal_list, mode='w') as traversal_out:
            traversal_out.writelines(
                "{}\n".format(entry) for entry in found_files)

    # Take all the files and make one BIG TSV file to upload:
    if 'concat_all_files' in steps:
        with open(file_traversal_list, mode='r') as traversal_in:
            files_to_concat = traversal_in.read().splitlines()
        concat_all_files(files_to_concat, one_big_tsv,
                         params['PROGRAM_PREFIX'], extra_cols, file_info,
                         split_col_func)

    # For the legacy table, the descriptions had lots of analysis tidbits.
    # Very nice, but hard to maintain. We just use hardwired schema
    # descriptions now, most directly pulled from the GDC website.
    if 'build_the_schema' in steps:
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, augmented_schema_file, typing_tups,
                              hold_schema_list, hold_schema_dict)

    # Upload the giant TSV into a cloud bucket:
    if 'upload_to_bucket' in steps:
        upload_to_bucket(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'],
                         one_big_tsv)

    # Create the BQ table from the TSV, using the held schema list file:
    if 'create_bq_from_tsv' in steps:
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                             params['BUCKET_SKEL_TSV'])
        with open(hold_schema_list, mode='r') as schema_list_handle:
            typed_schema = json_loads(schema_list_handle.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['SKELETON_TABLE'], params['BQ_AS_BATCH'])

    # Need to merge in aliquot and sample barcodes from other tables. This is
    # two chained jobs; the second reads the table the first one wrote.
    if 'collect_barcodes' in steps:
        skeleton = qualified('SKELETON_TABLE')
        aliquots_attached = attach_aliquot_ids(skeleton, params['FILE_TABLE'],
                                               params['TARGET_DATASET'],
                                               params['BARCODE_STEP_1_TABLE'],
                                               params['BQ_AS_BATCH'])
        if not aliquots_attached:
            print("attach_aliquot_ids job failed")
            return

        step_one = qualified('BARCODE_STEP_1_TABLE')
        barcodes_attached = attach_barcodes(step_one, params['ALIQUOT_TABLE'],
                                            params['TARGET_DATASET'],
                                            params['BARCODE_STEP_2_TABLE'],
                                            params['BQ_AS_BATCH'])
        if not barcodes_attached:
            print("attach_barcodes job failed")
            return

    # Merge the barcode info into the final table we are building:
    if 'create_final_table' in steps:
        skeleton = qualified('SKELETON_TABLE')
        barcodes = qualified('BARCODE_STEP_2_TABLE')
        merged = final_merge(skeleton, barcodes,
                             params['TARGET_DATASET'],
                             params['FINAL_TARGET_TABLE'],
                             params['BQ_AS_BATCH'])
        if not merged:
            print("Join job failed")
            return

    # The derived table we generate has no field descriptions. Add them from
    # the held schema dictionary file:
    if 'update_final_schema' in steps:
        schema_updated = update_schema(params['TARGET_DATASET'],
                                       params['FINAL_TARGET_TABLE'],
                                       hold_schema_dict)
        if not schema_updated:
            print("Schema update failed")
            return

    # Add the table description:
    if 'add_table_description' in steps:
        update_description(params['TARGET_DATASET'],
                           params['FINAL_TARGET_TABLE'],
                           params['TABLE_DESCRIPTION'])

    # Clear out working temp tables. All table names are resolved before the
    # first delete is issued.
    if 'dump_working_tables' in steps:
        scratch_tags = (
            'SKELETON_TABLE', 'BARCODE_STEP_1_TABLE', 'BARCODE_STEP_2_TABLE',
            'BQ_MANIFEST_TABLE', 'BQ_PULL_LIST_TABLE',
        )
        scratch_tables = [params[tag] for tag in scratch_tags]
        for scratch in scratch_tables:
            delete_table_bq_job(params['TARGET_DATASET'], scratch)

    # Done!
    print('job completed')