def main(args):

    #if not confirm_google_vm():
    #    print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
    #    return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())
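
    # (Hedged note: based on the keys used below, load_config() is expected to return the params
    #  dict with entries like DO_ACTIVE, DO_LEGACY, WORKING_PROJECT, WORKING_BUCKET, TARGET_DATASET,
    #  BQ_AS_BATCH, the *_MANIFEST_TSV / *_MANIFEST_BQ / *_FILE_MAP_BQ names, and the two table
    #  description strings, plus the list of steps to run.)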

    if params is None:
        print("Bad YAML load")
        return

    # Schema that describes DCF manifests:

    MANIFEST_SCHEMA_LIST = "SchemaFiles/dcf_manifest_schema.json"

    # Schema that describes our final map table:

    FILE_MAP_SCHEMA_LIST = "SchemaFiles/dcf_file_map_schema.json"

    #
    # Decide if we are doing active, legacy, or both manifests:
    #

    mani_dict = {}
    map_dict = {}
    if params['DO_ACTIVE']:
        mani_dict['ACTIVE_MANIFEST_TSV'] = 'ACTIVE_MANIFEST_BQ'
        map_dict['ACTIVE_MANIFEST_BQ'] = 'ACTIVE_FILE_MAP_BQ'

    if params['DO_LEGACY']:
        mani_dict['LEGACY_MANIFEST_TSV'] = 'LEGACY_MANIFEST_BQ'
        map_dict['LEGACY_MANIFEST_BQ'] = 'LEGACY_FILE_MAP_BQ'

    #
    # Create a manifest BQ table from a TSV:
    #

    if 'create_bq_manifest_from_tsv' in steps:
        with open(MANIFEST_SCHEMA_LIST, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())

        for manikey in list(mani_dict.keys()):
            bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], params[manikey])
            success = csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                                params[mani_dict[manikey]], params['BQ_AS_BATCH'])
            if not success:
                print("create_bq_manifest_from_tsv failed")
                return

    #
    # Create the file map tables:
    #

    if 'create_file_map_bq' in steps:

        for mapkey in list(map_dict.keys()):
            mani_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params[mapkey])
            success = build_file_map(mani_table, params['TARGET_DATASET'], params[map_dict[mapkey]], params['BQ_AS_BATCH'])
            if not success:
                print("create_file_map_bq failed")
                return

            # Install a schema in the new table:
            schema_dict = schema_list_to_dict(FILE_MAP_SCHEMA_LIST)
            success = update_schema_with_dict(params['TARGET_DATASET'], params[map_dict[mapkey]], schema_dict)
            if not success:
                print("install file map schema failed")
                return

    #
    # Add descriptions
    #

    if 'add_table_descriptions' in steps:
        for mapkey in list(map_dict.keys()):
            success = update_description(params['TARGET_DATASET'], params[mapkey],
                                         params['DCF_MANIFEST_TABLE_DESCRIPTION'])
            if not success:
                print("install manifest description failed")
                return

            success = update_description(params['TARGET_DATASET'], params[map_dict[mapkey]],
                                         params['FILE_MAP_TABLE_DESCRIPTION'])
            if not success:
                print("install file map description failed")
                return

    print('job completed')
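
#
# For reference: csv_to_bq(), build_file_map(), and the other helpers called above are defined
# elsewhere in this repo. The sketch below is only an illustration of the core idea behind the
# TSV-load step (a schema-driven load with the google-cloud-bigquery client, assuming each
# schema entry carries 'name', 'type', and optionally 'description'); it is not the repo's
# actual implementation.
#

def _sketch_tsv_to_bq(typed_schema, bucket_src_url, dataset_id, table_name):
    """Illustrative only: load a headered TSV from GCS into <dataset>.<table>."""
    from google.cloud import bigquery

    client = bigquery.Client()
    schema = [bigquery.SchemaField(field['name'], field['type'],
                                   description=field.get('description'))
              for field in typed_schema]
    job_config = bigquery.LoadJobConfig(schema=schema,
                                        skip_leading_rows=1,
                                        field_delimiter='\t',
                                        source_format=bigquery.SourceFormat.CSV)
    table_id = '{}.{}.{}'.format(client.project, dataset_id, table_name)
    load_job = client.load_table_from_uri(bucket_src_url, table_id,
                                          job_config=job_config)
    load_job.result()  # blocks until the load finishes; raises on failure
    return True
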
def main(args):

    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, filters, bq_filters, steps, extra_cols, key_fields, callers = load_config(
            yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])
    hold_scraped_dict = "{}/{}".format(home, params['HOLD_SCRAPED_DICT'])

    AUGMENTED_SCHEMA_FILE = "SchemaFiles/augmented_schema_list.json"

    #
    # Empirical evidence suggests this workflow is going to be very memory hungry if you are doing
    # merging, and requires at least 26 GB to be safe. Confirm that before starting!
    #

    do_merging = params['DO_MERGED_OUTPUT']
    if do_merging:
        with open('/proc/meminfo') as meminfo_file:
            meminfo = dict((i.split()[0].rstrip(':'), int(i.split()[1]))
                           for i in meminfo_file.readlines())
        mem_kib = meminfo['MemTotal']  # /proc/meminfo reports sizes in kB
        print("Machine memory: {} kB".format(mem_kib))
        if mem_kib < 26000000:
            print("Job requires at least 26 GB physical memory to complete")
            return

    #
    # Next, use the filter set to build a manifest from the GDC file table in BQ. Note that if a pull
    # list is provided, these steps can be omitted:
    #

    if 'build_manifest_from_filters' in steps:
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
        manifest_success = get_the_bq_manifest(
            params['FILE_TABLE'], bq_filters, max_files,
            params['WORKING_PROJECT'], params['TARGET_DATASET'],
            params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
            params['BUCKET_MANIFEST_TSV'], manifest_file,
            params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # Best practice is to clear out the directory where the files are going. Don't want anything left over:
    #

    if 'clear_target_directory' in steps:
        create_clean_target(local_files_dir)

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. Otherwise, this
    # step builds the pull list with BQ, using the BQ manifest table created above.
    #

    if 'build_pull_list' in steps:
        full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                          params['TARGET_DATASET'],
                                          params['BQ_MANIFEST_TABLE'])

        build_pull_list_with_bq(
            full_manifest, params['INDEXD_BQ_TABLE'],
            params['WORKING_PROJECT'], params['TARGET_DATASET'],
            params['BQ_PULL_LIST_TABLE'], params['WORKING_BUCKET'],
            params['BUCKET_PULL_LIST'], local_pull_list, params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets, not "downloading". Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        pull_from_buckets(pull_list, local_files_dir)

    #
    # Traverse the tree of downloaded files and create a flat list of all files:
    #

    if 'build_traversal_list' in steps:
        all_files = build_file_list(local_files_dir)
        program_list = build_program_list(all_files)
        if not check_caller_list(all_files, callers):
            print("Unexpected caller mismatch! Expecting {}".format(callers))
            return
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    #
    # We can create either a table that merges identical mutations from the different callers into
    # one row, or keep them separate:
    #
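    # Note: when merging is enabled, 'run_maf_writer' consumes the mut_calls / hdr_pick values
    # produced by 'run_maf_reader', and program_list comes from the 'build_traversal_list' step,
    # so those steps need to run together in the same invocation.
    #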

    if do_merging:
        do_debug = params['DO_DEBUG_LOGGING']
        target_count = int(params['EXPECTED_COLUMNS'])
        for program in program_list:
            print("Look at MAFS for {}".format(program))
            if 'run_maf_reader' in steps:
                with open(file_traversal_list,
                          mode='r') as traversal_list_file:
                    all_files = traversal_list_file.read().splitlines()
                print("Start reading MAFS for {}".format(program))
                mut_calls, hdr_pick = read_MAFs(program, all_files,
                                                params['PROGRAM_PREFIX'],
                                                extra_cols, target_count,
                                                do_debug, key_fields,
                                                params['FIRST_MAF_COL'],
                                                file_info)
                print("Finish reading MAFS for {}".format(program))

            if 'run_maf_writer' in steps:
                print("Start writing MAFS for {}".format(program))
                hist_count = write_MAFs(program, mut_calls, hdr_pick, callers,
                                        do_debug)
                for ii, count in enumerate(hist_count):
                    if count > 0:
                        print(" %6d  %9d " % (ii, count))
                print("Finish writing MAFS for {}".format(program))

    #
    # Take all the files and make one BIG TSV file to upload:
    #

    if 'concat_all_files' in steps:
        if do_merging:
            maf_list = ["mergeA." + tumor + ".maf" for tumor in program_list]
            concat_all_merged_files(maf_list, one_big_tsv)
        else:
            with open(file_traversal_list, mode='r') as traversal_list_file:
                all_files = traversal_list_file.read().splitlines()
            concat_all_files(all_files, one_big_tsv, params['PROGRAM_PREFIX'],
                             extra_cols, file_info)

    #
    # Scrape the column descriptions from the GDC web page
    #

    if 'scrape_schema' in steps:
        scrape_list = scrape_schema(params['MAF_URL'], params['FIRST_MAF_COL'])
        with open(hold_scraped_dict, mode='w') as scraped_hold_list:
            scraped_hold_list.write(json_dumps(scrape_list))

    #
    # For the legacy table, the descriptions had lots of analysis tidbits. Very nice, but hard to maintain.
    # We just use hardwired schema descriptions now, mostly pulled directly from the GDC website:
    #

    if 'build_the_schema' in steps:
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(hold_scraped_dict, AUGMENTED_SCHEMA_FILE,
                              typing_tups, hold_schema_list, hold_schema_dict)

    #
    # Upload the giant TSV into a cloud bucket:
    #

    if 'upload_to_bucket' in steps:
        upload_to_bucket(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'],
                         one_big_tsv)

    #
    # Create the BQ table from the TSV:
    #

    if 'create_bq_from_tsv' in steps:
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                             params['BUCKET_SKEL_TSV'])
        with open(hold_schema_list, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['SKELETON_TABLE'], params['BQ_AS_BATCH'])

    #
    # Need to merge in aliquot and sample barcodes from other tables:
    #

    if 'collect_barcodes' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])

        success = attach_aliquot_ids(skel_table, params['FILE_TABLE'],
                                     params['TARGET_DATASET'],
                                     params['BARCODE_STEP_1_TABLE'],
                                     params['BQ_AS_BATCH'])
        if not success:
            print("attach_aliquot_ids job failed")
            return

        step_1_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['BARCODE_STEP_1_TABLE'])
        success = attach_barcodes(step_1_table, params['ALIQUOT_TABLE'],
                                  params['TARGET_DATASET'],
                                  params['BARCODE_STEP_2_TABLE'],
                                  params['BQ_AS_BATCH'])
        if not success:
            print("attach_barcodes job failed")
            return

    #
    # Merge the barcode info into the final table we are building:
    #

    if 'create_final_table' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])
        barcodes_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                           params['TARGET_DATASET'],
                                           params['BARCODE_STEP_2_TABLE'])
        success = final_merge(skel_table, barcodes_table,
                              params['TARGET_DATASET'],
                              params['FINAL_TARGET_TABLE'],
                              params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            return

    #
    # The derived table we generate has no field descriptions. Add them from the scraped page:
    #

    if 'update_final_schema' in steps:
        success = update_schema(params['TARGET_DATASET'],
                                params['FINAL_TARGET_TABLE'], hold_schema_dict)
        if not success:
            print("Schema update failed")
            return

    #
    # Add the table description:
    #

    if 'add_table_description' in steps:
        desc = params['TABLE_DESCRIPTION'].format(params['MAF_URL'])
        update_description(params['TARGET_DATASET'],
                           params['FINAL_TARGET_TABLE'], desc)

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = [
            'SKELETON_TABLE', 'BARCODE_STEP_1_TABLE', 'BARCODE_STEP_2_TABLE',
            'BQ_MANIFEST_TABLE', 'BQ_PULL_LIST_TABLE'
        ]
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)
    #
    # Done!
    #

    print('job completed')
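
#
# For reference: pull_from_buckets() / BucketPuller used above live elsewhere in this repo. The
# sketch below only illustrates the idea (copying a list of gs:// URLs into a local directory with
# the google-cloud-storage client, mirroring each object path on disk); it is not the repo's
# actual implementation.
#

def _sketch_pull_from_buckets(pull_list, local_files_dir):
    """Illustrative only: download each gs://bucket/object in pull_list under local_files_dir."""
    import os
    from google.cloud import storage

    client = storage.Client()
    for url in pull_list:
        bucket_name, blob_name = url[len('gs://'):].split('/', 1)
        local_path = os.path.join(local_files_dir, bucket_name, blob_name)
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        client.bucket(bucket_name).blob(blob_name).download_to_filename(local_path)
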
def main():

    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    print('job started')

    #
    # First thing is to load the configuration:
    #

    params, filters, bq_filters, steps, retain_cols, extra_cols, retain_platform_ref_fields = load_config(
        yaml_config)
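    # (yaml_config is not defined inside this function; presumably it is supplied at module
    #  scope, e.g. the YAML text is read in before main() is invoked.)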

    if params is None:
        print("Bad YAML load")
        return

    #
    # Use the filter set to get a manifest from GDC, either via their API or from the BQ file table.
    # Note that if a pull list is
    # provided, these steps can be omitted:
    #

    if 'build_manifest_from_filters' in steps:
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
        if params['USE_GDC_API_FOR_MANIFEST']:
            manifest_filter = build_manifest_filter(filters)
            manifest_success = get_the_manifest(manifest_filter,
                                                params['API_URL'],
                                                params['MANIFEST_FILE'],
                                                max_files)
        else:
            manifest_success = get_the_bq_manifest(
                params['FILE_TABLE'], bq_filters, max_files,
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
                params['BUCKET_MANIFEST_TSV'], params['MANIFEST_FILE'],
                params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # Best practice is to clear out the directory where the files are going. Don't want anything left over:
    #

    if 'clear_target_directory' in steps:
        create_clean_target(params['LOCAL_FILES_DIR'])

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, you can do it using IndexD calls on a manifest file, OR using BQ as long as you have
    # built the manifest using BQ (that route uses the BQ Manifest table that was created).
    #

    if 'build_pull_list' in steps:

        if params['USE_INDEXD_FOR_PULL']:
            build_pull_list_with_indexd(params['MANIFEST_FILE'],
                                        params['INDEXD_IDS_PER_CALL'],
                                        params['INDEXD_URL'],
                                        params['LOCAL_PULL_LIST'])
        else:
            full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['TARGET_DATASET'],
                                              params['BQ_MANIFEST_TABLE'])

            build_pull_list_with_bq(
                full_manifest, params['INDEXD_BQ_TABLE'],
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                params['BQ_PULL_LIST_TABLE'], params['WORKING_BUCKET'],
                params['BUCKET_PULL_LIST'], params['LOCAL_PULL_LIST'],
                params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets, not "downloading". Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        with open(params['LOCAL_PULL_LIST'], mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        bp = BucketPuller(10)
        bp.pull_from_buckets(pull_list, params['LOCAL_FILES_DIR'])

    #
    # Traverse the tree of downloaded files and create a flat list of all files:
    #

    if 'build_traversal_list' in steps:
        all_files = build_file_list(params['LOCAL_FILES_DIR'])
        with open(params['FILE_TRAVERSAL_LIST'], mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    #
    # Take all the files and make one BIG TSV file to upload:
    #

    print("fix me have to toss out NA rows!")
    if 'concat_all_files' in steps:
        with open(params['FILE_TRAVERSAL_LIST'],
                  mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        concat_all_files_selected_cols(all_files, params['ONE_BIG_TSV'],
                                       params['PROGRAM_PREFIX'], retain_cols,
                                       extra_cols, file_info, None,
                                       "Beta_value", "NA")

    #
    # Build the platform reference table
    #

    if 'build_plat_ref' in steps:
        with open(params['FILE_TRAVERSAL_LIST'],
                  mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        concat_all_files_selected_cols(all_files, params['ONE_BIG_REF_TSV'],
                                       params['PROGRAM_PREFIX'],
                                       retain_platform_ref_fields, [],
                                       file_info, None, None, None)
        set_from_file(params['ONE_BIG_REF_TSV'],
                      params['ONE_BIG_DISTINCT_REF_TSV'])

    #
    # For the legacy table, the descriptions had lots of analysis tidbits. Very nice, but hard to maintain.
    # We just use hardwired schema descriptions now, mostly pulled directly from the GDC website:
    #

    if 'build_the_schema' in steps:
        typing_tups = build_schema(params['ONE_BIG_TSV'],
                                   params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, params['AUGMENTED_SCHEMA_FILE'],
                              typing_tups, params['HOLD_SCHEMA_LIST'],
                              params['HOLD_SCHEMA_DICT'])

    #
    # Upload the giant TSV into a cloud bucket:
    #

    if 'upload_to_bucket' in steps:
        upload_to_bucket(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'],
                         params['ONE_BIG_TSV'])

    #
    # Create the BQ table from the TSV:
    #

    if 'create_bq_from_tsv' in steps:
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                             params['BUCKET_SKEL_TSV'])
        with open(params['HOLD_SCHEMA_LIST'], mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['SKELETON_TABLE'], params['BQ_AS_BATCH'])

    #
    # Need to merge in aliquot and sample barcodes from other tables:
    #

    if 'collect_barcodes' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])

        success = attach_aliquot_ids(skel_table, params['FILE_TABLE'],
                                     params['TARGET_DATASET'],
                                     params['BARCODE_STEP_1_TABLE'],
                                     params['BQ_AS_BATCH'])
        if not success:
            print("attach_aliquot_ids job failed")
            return

        step_1_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['BARCODE_STEP_1_TABLE'])
        success = attach_barcodes(step_1_table, params['ALIQUOT_TABLE'],
                                  params['TARGET_DATASET'],
                                  params['BARCODE_STEP_2_TABLE'],
                                  params['BQ_AS_BATCH'])
        if not success:
            print("attach_barcodes job failed")
            return

    #
    # Merge the barcode info into the final table we are building:
    #

    if 'create_final_table' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])
        barcodes_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                           params['TARGET_DATASET'],
                                           params['BARCODE_STEP_2_TABLE'])
        success = final_merge(skel_table, barcodes_table,
                              params['TARGET_DATASET'],
                              params['FINAL_TARGET_TABLE'],
                              params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            return

    #
    # The derived table we generate has no field descriptions. Add them from the saved schema dictionary:
    #

    if 'update_final_schema' in steps:
        success = update_schema(params['TARGET_DATASET'],
                                params['FINAL_TARGET_TABLE'],
                                params['HOLD_SCHEMA_DICT'])
        if not success:
            print("Schema update failed")
            return

    #
    # Add the table description:
    #

    if 'add_table_description' in steps:
        update_description(params['TARGET_DATASET'],
                           params['FINAL_TARGET_TABLE'],
                           params['TABLE_DESCRIPTION'])

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = [
            'SKELETON_TABLE', 'BARCODE_STEP_1_TABLE', 'BARCODE_STEP_2_TABLE',
            'BQ_MANIFEST_TABLE', 'BQ_PULL_LIST_TABLE'
        ]
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)
    #
    # Done!
    #

    print('job completed')
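
#
# For reference: set_from_file() above reduces the big platform-reference TSV to its distinct
# rows. The sketch below only illustrates the idea (assuming the first line is a header to keep
# as-is); it is not the repo's actual implementation.
#

def _sketch_distinct_rows(in_tsv, out_tsv):
    """Illustrative only: write the header plus each distinct data row of in_tsv, in order."""
    seen = set()
    with open(in_tsv, mode='r') as src, open(out_tsv, mode='w') as dst:
        dst.write(src.readline())  # keep the header row
        for line in src:
            if line not in seen:
                seen.add(line)
                dst.write(line)
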
def main(args):

    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, bq_filters, steps, extra_cols = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    # Schema that describes table columns:

    AUGMENTED_SCHEMA_FILE = "SchemaFiles/isoform_augmented_schema_list.json"

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    #
    # Use the filter set to build a manifest from the GDC file table in BQ. Note that if a pull list
    # is provided, these steps can be omitted:
    #

    if 'build_manifest_from_filters' in steps:
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
        manifest_success = get_the_bq_manifest(
            params['FILE_TABLE'], bq_filters, max_files,
            params['WORKING_PROJECT'], params['TARGET_DATASET'],
            params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
            params['BUCKET_MANIFEST_TSV'], manifest_file,
            params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # Best practice is to clear out the directory where the files are going. Don't want anything left over:
    #

    if 'clear_target_directory' in steps:
        create_clean_target(local_files_dir)

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. Otherwise, this
    # step builds the pull list with BQ, using the BQ manifest table created above.
    #

    if 'build_pull_list' in steps:
        full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                          params['TARGET_DATASET'],
                                          params['BQ_MANIFEST_TABLE'])

        build_pull_list_with_bq(
            full_manifest, params['INDEXD_BQ_TABLE'],
            params['WORKING_PROJECT'], params['TARGET_DATASET'],
            params['BQ_PULL_LIST_TABLE'], params['WORKING_BUCKET'],
            params['BUCKET_PULL_LIST'], local_pull_list, params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets. Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        bp = BucketPuller(10)
        bp.pull_from_buckets(pull_list, local_files_dir)

    #
    # Traverse the tree of downloaded files and create a flat list of all files:
    #

    if 'build_traversal_list' in steps:
        all_files = build_file_list(local_files_dir)
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    #
    # Take all the files and make one BIG TSV file to upload:
    #

    if 'concat_all_files' in steps:
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        concat_all_files(all_files, one_big_tsv, params['PROGRAM_PREFIX'],
                         extra_cols, file_info, split_col_func)
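    # (file_info and split_col_func are helpers referenced here but defined or imported at
    #  module scope elsewhere in this script.)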

    #
    # For the legacy table, the descriptions had lots of analysis tidbits. Very nice, but hard to maintain.
    # We just use hardwired schema descriptions now, mostly pulled directly from the GDC website:
    #

    if 'build_the_schema' in steps:
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, AUGMENTED_SCHEMA_FILE, typing_tups,
                              hold_schema_list, hold_schema_dict)

    #
    # Upload the giant TSV into a cloud bucket:
    #

    if 'upload_to_bucket' in steps:
        upload_to_bucket(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'],
                         one_big_tsv)

    #
    # Create the BQ table from the TSV:
    #

    if 'create_bq_from_tsv' in steps:
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                             params['BUCKET_SKEL_TSV'])
        with open(hold_schema_list, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['SKELETON_TABLE'], params['BQ_AS_BATCH'])

    #
    # Need to merge in aliquot and sample barcodes from other tables:
    #

    if 'collect_barcodes' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])

        success = attach_aliquot_ids(skel_table, params['FILE_TABLE'],
                                     params['TARGET_DATASET'],
                                     params['BARCODE_STEP_1_TABLE'],
                                     params['BQ_AS_BATCH'])
        if not success:
            print("attach_aliquot_ids job failed")
            return

        step_1_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['BARCODE_STEP_1_TABLE'])
        success = attach_barcodes(step_1_table, params['ALIQUOT_TABLE'],
                                  params['TARGET_DATASET'],
                                  params['BARCODE_STEP_2_TABLE'],
                                  params['BQ_AS_BATCH'])
        if not success:
            print("attach_barcodes job failed")
            return

    #
    # Merge the barcode info into the final table we are building:
    #

    if 'create_final_table' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])
        barcodes_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                           params['TARGET_DATASET'],
                                           params['BARCODE_STEP_2_TABLE'])
        success = final_merge(skel_table, barcodes_table,
                              params['TARGET_DATASET'],
                              params['FINAL_TARGET_TABLE'],
                              params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            return

    #
    # The derived table we generate has no field descriptions. Add them from the saved schema dictionary:
    #

    if 'update_final_schema' in steps:
        success = update_schema(params['TARGET_DATASET'],
                                params['FINAL_TARGET_TABLE'], hold_schema_dict)
        if not success:
            print("Schema update failed")
            return

    #
    # Add the table description:
    #

    if 'add_table_description' in steps:
        update_description(params['TARGET_DATASET'],
                           params['FINAL_TARGET_TABLE'],
                           params['TABLE_DESCRIPTION'])

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = [
            'SKELETON_TABLE', 'BARCODE_STEP_1_TABLE', 'BARCODE_STEP_2_TABLE',
            'BQ_MANIFEST_TABLE', 'BQ_PULL_LIST_TABLE'
        ]
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)
    #
    # Done!
    #

    print('job completed')
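
#
# For reference: build_file_list() used above flattens the downloaded directory tree into a flat
# list of file paths. The sketch below only illustrates the idea with os.walk(); it is not the
# repo's actual implementation.
#

def _sketch_build_file_list(local_files_dir):
    """Illustrative only: return every file path found under local_files_dir."""
    import os

    all_files = []
    for root, _dirs, files in os.walk(local_files_dir):
        for name in files:
            all_files.append(os.path.join(root, name))
    return all_files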