Ejemplo n.º 1
0
def main(args):
    """Apply git-maintained schemas and descriptions to a list of BQ tables.

    args[1] is a configuration YAML that yields a params dict and a list of
    step names. params['FIX_LIST'] is a list of single-entry dicts, each
    mapping a 'dataset.table' name to its schema file in the schema repo;
    every requested step runs once per listed table.
    """

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    # BUGFIX: loop variable renamed from 'dict', which shadowed the builtin.
    for fix_entry in params['FIX_LIST']:

        # Each FIX_LIST entry is a one-item mapping: table name -> repo schema file.
        table, repo_file = next(iter(fix_entry.items()))

        if 'process_git_schemas' in steps:
            print('process_git_schemas: {}'.format(table))
            # Where do we dump the schema git repository?
            schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'], params['RAW_SCHEMA_DIR'], repo_file)
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], table)
            # Write out the details
            success = generate_table_detail_files(schema_file, full_file_prefix)
            if not success:
                print("process_git_schemas failed")
                return

        #
        # Update the per-field descriptions:
        #

        if 'update_field_descriptions' in steps:
            print('update_field_descriptions: {}'.format(table))
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], table)
            schema_dict_loc = "{}_schema.json".format(full_file_prefix)
            schema_dict = {}
            with open(schema_dict_loc, mode='r') as schema_hold_dict:
                full_schema_list = json_loads(schema_hold_dict.read())
            for entry in full_schema_list:
                schema_dict[entry['name']] = {'description': entry['description']}
            set_and_table = table.split('.', maxsplit=1)
            success = update_schema_with_dict(set_and_table[0], set_and_table[1], schema_dict, project=params['TARGET_PROJECT'])
            if not success:
                print("update_field_descriptions failed")
                return

        #
        # Add description and labels to the target table:
        #

        if 'update_table_description' in steps:
            print('update_table_description: {}'.format(table))
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], table)
            set_and_table = table.split('.', maxsplit=1)
            success = install_labels_and_desc(set_and_table[0], set_and_table[1], full_file_prefix, project=params['TARGET_PROJECT'])
            if not success:
                print("update_table_description failed")
                return

    # BUGFIX: moved out of the per-table loop so this prints once after all
    # tables are processed, not once per table.
    print('job completed')
def main(args):
    """Build and publish the miRNA expression BQ table from GDC files.

    args[1] is a configuration YAML yielding (params, bq_filters, steps,
    extra_cols). Steps (each gated by name in `steps`): build a file
    manifest via BQ filters, pull the files from GDC buckets, concatenate
    them into one TSV, derive/augment a schema, load the TSV into a
    skeleton BQ table, join in aliquot/sample barcodes, attach
    git-maintained schemas and descriptions, clean up temp tables, and
    publish the final table. Refuses to run off a Google Cloud VM.
    """

    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, bq_filters, steps, extra_cols = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    # Schema that describes table columns:

    AUGMENTED_SCHEMA_FILE = "SchemaFiles/mirna_augmented_schema_list.json"

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    #
    # Best practice is to clear out the directory where the files are going. Don't want anything left over.
    # Also creates the destination directory
    #

    if 'clear_target_directory' in steps:
        create_clean_target(local_files_dir)

    #
    # Use the filter set to get a manifest. Note that if a pull list is
    # provided, these steps can be omitted:
    #

    if 'build_manifest_from_filters' in steps:
        # MAX_FILES is optional; None means no cap on manifest size.
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
        manifest_success = get_the_bq_manifest(params['FILE_TABLE'], bq_filters, max_files,
                                               params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                               params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
                                               params['BUCKET_MANIFEST_TSV'], manifest_file,
                                               params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # If you have already created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step.
    #

    if 'build_pull_list' in steps:
        full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                          params['TARGET_DATASET'],
                                          params['BQ_MANIFEST_TABLE'])

        build_pull_list_with_bq(full_manifest, params['INDEXD_BQ_TABLE'],
                                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                params['BQ_PULL_LIST_TABLE'],
                                params['WORKING_BUCKET'],
                                params['BUCKET_PULL_LIST'],
                                local_pull_list, params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets, not "downloading". Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        # NOTE(review): 10 is presumably a worker/thread count for the
        # puller — confirm against BucketPuller's definition.
        bp = BucketPuller(10)
        bp.pull_from_buckets(pull_list, local_files_dir)

    #
    # Traverse the tree of downloaded files and create a flat list of all files:
    #

    if 'build_traversal_list' in steps:
        all_files = build_file_list(local_files_dir)
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    #
    # Take all the files and make one BIG TSV file to upload:
    #

    if 'concat_all_files' in steps:
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        # NOTE(review): `file_info` is not defined in this function —
        # presumably a module-level helper; verify it is in scope.
        concat_all_files(all_files, one_big_tsv,
                         params['PROGRAM_PREFIX'], extra_cols, file_info, None)

    #
    # For the legacy table, the descriptions had lots of analysis tidbits. Very nice, but hard to maintain.
    # We just use hardwired schema descriptions now, most directly pulled from the GDC website:
    #

    if 'build_the_schema' in steps:
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, AUGMENTED_SCHEMA_FILE,
                              typing_tups, hold_schema_list, hold_schema_dict)

    #
    # Upload the giant TSV into a cloud bucket:
    #

    if 'upload_to_bucket' in steps:
        upload_to_bucket(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'], one_big_tsv)

    #
    # Create the BQ table from the TSV:
    #

    if 'create_bq_from_tsv' in steps:
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'])
        with open(hold_schema_list, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'], params['SKELETON_TABLE'], params['BQ_AS_BATCH'])

    #
    # Need to merge in aliquot and sample barcodes from other tables:
    #

    if 'collect_barcodes' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])

        success = attach_aliquot_ids(skel_table, params['FILE_TABLE'],
                                     params['TARGET_DATASET'],
                                     params['BARCODE_STEP_1_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("attach_aliquot_ids job failed")
            return

        step_1_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['BARCODE_STEP_1_TABLE'])
        success = attach_barcodes(step_1_table, params['ALIQUOT_TABLE'],
                                  params['TARGET_DATASET'], params['BARCODE_STEP_2_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("attach_barcodes job failed")
            return

    #
    # Merge the barcode info into the final table we are building:
    #

    if 'create_final_table' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])
        barcodes_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                           params['TARGET_DATASET'],
                                           params['BARCODE_STEP_2_TABLE'])
        success = final_merge(skel_table, barcodes_table,
                              params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            return

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'], params['RAW_SCHEMA_DIR'], params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        build_combined_schema(None, schema_dict_loc,
                                typing_tups, hold_schema_list, hold_schema_dict)

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        # Re-key the schema list by field name for the per-field update call.
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}

        success = update_schema_with_dict(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = ['SKELETON_TABLE', 'BARCODE_STEP_1_TABLE', 'BARCODE_STEP_2_TABLE',
                           'BQ_MANIFEST_TABLE', 'BQ_PULL_LIST_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    #
    # publish table:
    #

    if 'publish' in steps:
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'], params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])
        success = publish_table(source_table, publication_dest)
        if not success:
            print("publish table failed")
            return

    print('job completed')
def main(args):
    """Build DCF manifest and file-map BigQuery tables per a YAML config.

    args[1] names the configuration YAML, which supplies a params dict and
    the list of steps to run. Depending on DO_ACTIVE / DO_LEGACY, the active
    and/or legacy manifest TSVs are loaded into BQ manifest tables, mapped
    into file-map tables with installed schemas, and given descriptions.
    """

    #if not confirm_google_vm():
    #    print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
    #    return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    # Load the YAML configuration:
    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    # Schema that describes DCF manifests:
    MANIFEST_SCHEMA_LIST = "SchemaFiles/dcf_manifest_schema.json"

    # Schema that describes our final map table:
    FILE_MAP_SCHEMA_LIST = "SchemaFiles/dcf_file_map_schema.json"

    # Decide whether we handle the active manifest, the legacy manifest, or
    # both. Each mapping goes "source param key" -> "destination param key".
    tsv_to_manifest = {}
    manifest_to_map = {}
    if params['DO_ACTIVE']:
        tsv_to_manifest['ACTIVE_MANIFEST_TSV'] = 'ACTIVE_MANIFEST_BQ'
        manifest_to_map['ACTIVE_MANIFEST_BQ'] = 'ACTIVE_FILE_MAP_BQ'
    if params['DO_LEGACY']:
        tsv_to_manifest['LEGACY_MANIFEST_TSV'] = 'LEGACY_MANIFEST_BQ'
        manifest_to_map['LEGACY_MANIFEST_BQ'] = 'LEGACY_FILE_MAP_BQ'

    # Create a manifest BQ table from each TSV sitting in the working bucket:
    if 'create_bq_manifest_from_tsv' in steps:
        with open(MANIFEST_SCHEMA_LIST, mode='r') as schema_fh:
            typed_schema = json_loads(schema_fh.read())

        for tsv_key, manifest_key in tsv_to_manifest.items():
            bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], params[tsv_key])
            ok = csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                           params[manifest_key], params['BQ_AS_BATCH'])
            if not ok:
                print("create_bq_manifest_from_tsv failed")
                return

    # Derive a file-map table from each manifest table:
    if 'create_file_map_bq' in steps:

        for manifest_key, map_key in manifest_to_map.items():
            mani_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                           params['TARGET_DATASET'],
                                           params[manifest_key])
            ok = build_file_map(mani_table, params['TARGET_DATASET'],
                                params[map_key], params['BQ_AS_BATCH'])
            if not ok:
                print("create_file_map_bq failed")
                return

            # Install a schema in the new table:
            schema_dict = schema_list_to_dict(FILE_MAP_SCHEMA_LIST)
            ok = update_schema_with_dict(params['TARGET_DATASET'], params[map_key], schema_dict)
            if not ok:
                print("install file map schema failed")
                return

    # Attach table descriptions to both the manifest and file-map tables:
    if 'add_table_descriptions' in steps:
        for manifest_key, map_key in manifest_to_map.items():
            ok = update_description(params['TARGET_DATASET'], params[manifest_key],
                                    params['DCF_MANIFEST_TABLE_DESCRIPTION'])
            if not ok:
                print("install manifest description failed")
                return

            ok = update_description(params['TARGET_DATASET'], params[map_key],
                                    params['FILE_MAP_TABLE_DESCRIPTION'])
            if not ok:
                print("install file map description failed")
                return

    print('job completed')
Ejemplo n.º 4
0
def main(args):
    """Run the PDC program/case/sample/aliquot ETL pipeline from a YAML config.

    args[1] names the configuration YAML, which supplies a params dict and
    the list of steps to execute. Steps pull program/case/sample/aliquot
    TSVs from the PDC API, derive schemas, upload the TSVs to a bucket,
    load them into BQ tables, join case tables with aliquot data, attach
    git-maintained field/table descriptions, publish the final table, and
    clean up working tables. Refuses to run off a Google Cloud VM.
    """

    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    prog_tsv = "{}/{}".format(home, params['PROG_TSV'])
    case_tsv = "{}/{}".format(home, params['CASE_TSV'])
    sample_tsv = "{}/{}".format(home, params['SAMPLE_TSV'])
    aliquot_tsv = "{}/{}".format(home, params['ALIQUOT_TSV'])

    hold_schema_dict_prog = "{}/{}".format(home,
                                           params['HOLD_SCHEMA_DICT_PROG'])
    hold_schema_list_prog = "{}/{}".format(home,
                                           params['HOLD_SCHEMA_LIST_PROG'])
    hold_schema_dict_case = "{}/{}".format(home,
                                           params['HOLD_SCHEMA_DICT_CASE'])
    hold_schema_list_case = "{}/{}".format(home,
                                           params['HOLD_SCHEMA_LIST_CASE'])
    hold_schema_dict_sample = "{}/{}".format(home,
                                             params['HOLD_SCHEMA_DICT_SAMPLE'])
    hold_schema_list_sample = "{}/{}".format(home,
                                             params['HOLD_SCHEMA_LIST_SAMPLE'])
    hold_schema_dict_aliquot = "{}/{}".format(
        home, params['HOLD_SCHEMA_DICT_ALIQUOT'])
    hold_schema_list_aliquot = "{}/{}".format(
        home, params['HOLD_SCHEMA_LIST_ALIQUOT'])

    if 'clear_target_directory' in steps:
        print('clear_target_directory')
        create_clean_target(local_files_dir)

    #
    # Use the filter set to build a manifest. Note that if a pull list is
    # provided, these steps can be omitted:
    #

    if 'pull_cases_per_program_from_pdc' in steps:
        endpoint = params["PDC_ENDPOINT"]
        success = pull_cases_per_program_from_pdc(endpoint, prog_tsv)
        if not success:
            print("Failure pulling programs")
            return

    if 'pull_aliquots_from_pdc' in steps:
        endpoint = params["PDC_ENDPOINT"]
        success = pull_aliquots_from_pdc(endpoint, case_tsv, sample_tsv,
                                         aliquot_tsv)
        if not success:
            print("Failure pulling programs")
            return

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'],
                                   params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                        params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        # CLEANUP: removed dead full_file_prefix/schema_dict_loc assignments
        # that were computed here for each TSV but never used (the calls
        # below pass None, None for the schema-file arguments).
        typing_tups = build_schema(prog_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, None, typing_tups, hold_schema_list_prog,
                              hold_schema_dict_prog)
        typing_tups = build_schema(case_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, None, typing_tups, hold_schema_list_case,
                              hold_schema_dict_case)
        typing_tups = build_schema(sample_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, None, typing_tups, hold_schema_list_sample,
                              hold_schema_dict_sample)
        typing_tups = build_schema(aliquot_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, None, typing_tups,
                              hold_schema_list_aliquot,
                              hold_schema_dict_aliquot)

    bucket_target_program = '{}/{}'.format(params['WORKING_BUCKET_DIR'],
                                           params['BUCKET_TSV_PROGRAM'])
    bucket_target_case = '{}/{}'.format(params['WORKING_BUCKET_DIR'],
                                        params['BUCKET_TSV_CASE'])
    bucket_target_sample = '{}/{}'.format(params['WORKING_BUCKET_DIR'],
                                          params['BUCKET_TSV_SAMPLE'])
    bucket_target_aliquot = '{}/{}'.format(params['WORKING_BUCKET_DIR'],
                                           params['BUCKET_TSV_ALIQUOT'])

    if 'upload_to_bucket' in steps:
        print('upload_to_bucket')
        upload_to_bucket(params['WORKING_BUCKET'], bucket_target_program,
                         prog_tsv)
        upload_to_bucket(params['WORKING_BUCKET'], bucket_target_case,
                         case_tsv)
        upload_to_bucket(params['WORKING_BUCKET'], bucket_target_sample,
                         sample_tsv)
        upload_to_bucket(params['WORKING_BUCKET'], bucket_target_aliquot,
                         aliquot_tsv)

    if 'create_bq_from_tsv' in steps:
        print('create_bq_from_tsv')
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                             bucket_target_program)
        with open(hold_schema_list_prog, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['TARGET_TABLE_PROG'], params['BQ_AS_BATCH'])

        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                             bucket_target_case)
        with open(hold_schema_list_case, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['TARGET_TABLE_CASE'], params['BQ_AS_BATCH'])

        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                             bucket_target_sample)
        with open(hold_schema_list_sample, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['TARGET_TABLE_SAMPLE'], params['BQ_AS_BATCH'])

        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                             bucket_target_aliquot)
        with open(hold_schema_list_aliquot, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['TARGET_TABLE_ALIQUOT'], params['BQ_AS_BATCH'])

    if 'join_case_tables' in steps:
        print('join_case_tables')
        full_target_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['TARGET_DATASET'],
                                              params['TARGET_TABLE'])
        success = join_with_aliquot_table(full_target_table,
                                          params['ALIQUOT_TABLE'],
                                          params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            # BUGFIX: bail out on failure like every other step; previously
            # the pipeline kept going and would describe/publish a stale or
            # missing FINAL_TARGET_TABLE.
            return

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        # Re-key the schema list by field name for the per-field update call.
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}

        success = update_schema_with_dict(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    #
    # publish table:
    #

    if 'publish' in steps:

        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                             params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])

        success = publish_table(source_table, publication_dest)

        if not success:
            print("publish table failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = ['TARGET_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    print('job completed')
Ejemplo n.º 5
0
def main(args):
    """Run the GDC download / transform / BigQuery-load pipeline.

    args[1] names a YAML configuration supplying `params` (key/value
    settings), `bq_filters` (GDC manifest filters), `na_values` (strings
    treated as NA in the TSVs), and `steps` (which pipeline stages to
    execute). Stages run in a fixed order; each is skipped unless it is
    listed in `steps`.
    """

    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, bq_filters, na_values, steps = load_config(yaml_file.read())

    # Bail out on a bad YAML parse (consistent with the sibling ETL mains):
    if params is None:
        print("Bad YAML load")
        return

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    #
    # Actual fields have brackets:
    #

    na_set = {"[{}]".format(val) for val in na_values}

    # Suffix-group name -> list of files, filled in by the 'group_by_type'
    # step. Initialized here so the steps that iterate it simply do nothing
    # (instead of raising NameError) when 'group_by_type' is skipped.
    # TODO: persist group_dict as JSON so the dependent steps can run in a
    # separate invocation.
    group_dict = {}

    if 'clear_target_directory' in steps:
        print('clear_target_directory')
        create_clean_target(local_files_dir)

    #
    # Use the filter set to build a manifest. Note that if a pull list is
    # provided, these steps can be omitted:
    #

    if 'build_manifest_from_filters' in steps:
        print('build_manifest_from_filters')
        max_files = params.get('MAX_FILES')  # None -> no cap on file count

        manifest_success = get_the_bq_manifest(params['FILE_TABLE'], bq_filters, max_files,
                                               params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                               params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
                                               params['BUCKET_MANIFEST_TSV'], manifest_file,
                                               params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, uses BQ as long as you have built the manifest using BQ (that route uses the BQ Manifest
    # table that was created).
    #

    if 'build_pull_list' in steps:
        print('build_pull_list')
        full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                          params['TARGET_DATASET'],
                                          params['BQ_MANIFEST_TABLE'])
        success = build_pull_list_with_bq_public(full_manifest, params['INDEXD_BQ_TABLE'],
                                                 params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                                 params['BQ_PULL_LIST_TABLE'],
                                                 params['WORKING_BUCKET'],
                                                 params['BUCKET_PULL_LIST'],
                                                 local_pull_list, params['BQ_AS_BATCH'])
        if not success:
            print("Build pull list failed")
            return  # was "return;" — dropped the stray semicolon

    #
    # Now hitting GDC cloud buckets. Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        print('download_from_gdc')
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        bp = BucketPuller(10)  # 10 concurrent pull threads
        bp.pull_from_buckets(pull_list, local_files_dir)

    if 'build_file_list' in steps:
        print('build_file_list')
        all_files = build_file_list(local_files_dir)
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    if 'group_by_type' in steps:
        print('group_by_type')
        print(file_traversal_list)
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        group_dict = group_by_suffixes(all_files)

    if 'convert_excel_to_csv' in steps:
        print('convert_excel_to_csv')
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        convert_excel_to_csv(all_files, local_files_dir)

    if 'concat_all_files' in steps:
        print('concat_all_files')
        for k, v in group_dict.items():
            concat_all_files(v, one_big_tsv.format(k), na_set)

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'], params['RAW_SCHEMA_DIR'], params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        for k in group_dict:
            # Sample the concatenated TSV to infer column types per group:
            typing_tups = build_schema(one_big_tsv.format(k), params['SCHEMA_SAMPLE_SKIPS'])
            hold_schema_dict_for_group = hold_schema_dict.format(k)
            hold_schema_list_for_group = hold_schema_list.format(k)
            build_combined_schema(None, None,
                                  typing_tups, hold_schema_list_for_group, hold_schema_dict_for_group)

    bucket_target_blob = '{}/{}'.format(params['WORKING_BUCKET_DIR'], params['BUCKET_TSV'])

    if 'upload_to_bucket' in steps:
        print('upload_to_bucket')
        for k in group_dict:
            upload_to_bucket(params['WORKING_BUCKET'], bucket_target_blob.format(k), one_big_tsv.format(k))

    if 'create_bq_from_tsv' in steps:
        print('create_bq_from_tsv')
        for k in group_dict:
            bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], bucket_target_blob.format(k))
            with open(hold_schema_list.format(k), mode='r') as schema_hold_dict:
                typed_schema = json_loads(schema_hold_dict.read())
            # Group names may contain '.'/'-', which are illegal in BQ table names:
            csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                      params['FINAL_TARGET_TABLE'].format(k.replace(".", "_").replace("-", "_")), params['BQ_AS_BATCH'])

    if 'add_aliquot_fields' in steps:
        print('add_aliquot_fields')
        full_target_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['TARGET_DATASET'],
                                              params['TARGET_TABLE'])
        success = join_with_aliquot_table(full_target_table, params['ALIQUOT_TABLE'],
                                          params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            return  # was missing: later steps assume the join succeeded

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}

        success = update_schema_with_dict(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    #
    # publish table:
    #

    if 'publish' in steps:

        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'], params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])

        success = publish_table(source_table, publication_dest)

        if not success:
            print("publish table failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = ['TARGET_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    print('job completed')
def main(args):
    """Patch descriptions, labels, and field docs onto published BQ tables.

    The YAML configuration (args[1]) yields `params`, `steps`, and
    `tables_to_patch` — a list of one-entry dicts mapping a fully-qualified
    "project.dataset.table" name to its schema-file and tag info. For each
    table, the requested steps stage a generic schema out of the git repo,
    customize its tags, and install the per-field descriptions plus the
    table-level labels and description.
    """

    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(f" Usage : {args[0]} <configuration_yaml>")
        return

    print('job started')

    # Load the YAML configuration:
    with open(args[1], mode='r') as yaml_file:
        params, steps, tables_to_patch = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    # Schemas and table descriptions live in the github repo; clone it once:
    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'],
                                   params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as exc:
            print(f"pull_table_info_from_git failed: {exc}")
            return

    for patch_entry in tables_to_patch:

        full_table, table_info = next(iter(patch_entry.items()))

        # Break "project.dataset.table" into its pieces:
        table_parts = full_table.split('.')
        target_project = table_parts[0]
        target_dataset = table_parts[1]
        target_table = table_parts[2]

        # Stage the generic schema file from the cloned repo:
        if 'process_git_schemas' in steps:
            print('process_git_schema')
            schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                            params['RAW_SCHEMA_DIR'],
                                            table_info["generic_schema_file"])
            prox_prefix = f"{params['PROX_DESC_PREFIX']}/{full_table}"
            if not generate_table_detail_files(schema_file, prox_prefix):
                print("process_git_schemas failed")
                return False

        # Customize generic schema to this data program:
        if 'replace_schema_tags' in steps:
            print('replace_schema_tags')
            prox_prefix = f"{params['PROX_DESC_PREFIX']}/{full_table}"
            if not customize_labels_and_desc(prox_prefix,
                                             table_info["schema_tags"]):
                print("replace_schema_tags failed")
                return False

        # Push the per-field descriptions into the live table schema:
        if 'install_field_descriptions' in steps:
            print(f'install_field_descriptions: {full_table}')
            prox_prefix = f"{params['PROX_DESC_PREFIX']}/{full_table}"
            with open(f"{prox_prefix}_schema.json", mode='r') as schema_json:
                field_list = json_loads(schema_json.read())
            desc_by_field = {field['name']: {'description': field['description']}
                             for field in field_list}
            if not update_schema_with_dict(target_dataset, target_table,
                                           desc_by_field,
                                           project=target_project):
                print("install_field_descriptions failed")
                return False

        # Attach the table-level description and labels:
        if 'install_table_description' in steps:
            print(f'install_table_description: {full_table}')
            prox_prefix = f"{params['PROX_DESC_PREFIX']}/{full_table}"
            if not install_labels_and_desc(target_dataset, target_table,
                                           prox_prefix,
                                           project=target_project):
                print("install_table_description failed")
                return False

    print('job completed')
Ejemplo n.º 7
0
def main(args):
    """Build and publish the final PDC/GDC joined BigQuery table.

    Steps (selected by the YAML `steps` list): clone schema metadata from
    git, stage per-table detail files, build the final joined table,
    install field descriptions and table labels/description, publish the
    table, and drop working temp tables.
    """

    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())

    # Bail out on a bad YAML parse (consistent with the sibling ETL mains):
    if params is None:
        print("Bad YAML load")
        return

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'],
                                   params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                        params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'build_final_table' in steps:
        print('build_final_table')

        success = build_final_table(
            params['PDC_META_ALIQUOT_TABLE'],
            params['PDC_QUANT_ALIQUOT_TABLE'], params['PDC_META_CASES_TABLE'],
            params['GDC_CASE_DATA_TABLE'], params['TARGET_DATASET'],
            params['FINAL_TARGET_TABLE'], params['BQ_AS_BATCH'])

        if not success:
            print("Join job failed")
            return  # was missing: later steps assume the final table exists

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}

        success = update_schema_with_dict(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    #
    # publish table:
    #

    if 'publish' in steps:

        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                             params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])

        success = publish_table(source_table, publication_dest)

        if not success:
            print("publish table failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = ['TARGET_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    print('job completed')
Ejemplo n.º 8
0
def main(args):
    """Import radiology metadata into BigQuery and publish the table.

    Steps (selected by the YAML `steps` list): clone schema metadata from
    git, pull the radiology entries into a working table (deleting it again
    if the pull produced no rows), stage schema detail files, install field
    descriptions and table labels/description, and publish. The
    description/publish steps are each guarded by a table-existence check
    so the job degrades gracefully when the pull yielded an empty table.
    """

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    #
    # Schemas and table descriptions are maintained in the github repo. Only do this once:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'],
                                   params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    #
    # Pull the radiology entries
    #

    if 'pull_radiology' in steps:
        success = import_radiology(params['RADIOLOGY_SOURCE'],
                                   params['TARGET_DATASET'],
                                   params['RADIOLOGY_TABLE_NAME'],
                                   params['BQ_AS_BATCH'])

        if not success:
            print("pull_radiology job failed")
            return

        # An empty result table is dropped so downstream steps skip it:
        if bq_table_is_empty(params['TARGET_DATASET'],
                             params['RADIOLOGY_TABLE_NAME']):
            delete_table_bq_job(params['TARGET_DATASET'],
                                params['RADIOLOGY_TABLE_NAME'])
            # Message previously said "pull_slide" — copy/paste error:
            print("{} pull_radiology table result was empty: table deleted".format(
                params['RADIOLOGY_TABLE_NAME']))

    #
    # Stage the schema metadata from the repo copy:
    #

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                        params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['RADIOLOGY_TABLE_NAME'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return False

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')

        if bq_table_exists(params['TARGET_DATASET'],
                           params['RADIOLOGY_TABLE_NAME']):
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                              params['RADIOLOGY_TABLE_NAME'])
            schema_dict_loc = "{}_schema.json".format(full_file_prefix)
            schema_dict = {}
            with open(schema_dict_loc, mode='r') as schema_hold_dict:
                full_schema_list = json_loads(schema_hold_dict.read())
            for entry in full_schema_list:
                schema_dict[entry['name']] = {
                    'description': entry['description']
                }

            success = update_schema_with_dict(
                params['TARGET_DATASET'],
                params['RADIOLOGY_TABLE_NAME'],
                schema_dict,
                project=params['WORKING_PROJECT'])
            if not success:
                print("update_field_descriptions failed")
                return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')

        if bq_table_exists(params['TARGET_DATASET'],
                           params['RADIOLOGY_TABLE_NAME']):
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                              params['RADIOLOGY_TABLE_NAME'])
            success = install_labels_and_desc(params['TARGET_DATASET'],
                                              params['RADIOLOGY_TABLE_NAME'],
                                              full_file_prefix)
            if not success:
                print("update_table_description failed")
                return

    #
    # publish table:
    #

    if 'publish' in steps:

        if bq_table_exists(params['TARGET_DATASET'],
                           params['RADIOLOGY_TABLE_NAME']):
            source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                             params['TARGET_DATASET'],
                                             params['RADIOLOGY_TABLE_NAME'])
            publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                                 params['PUBLICATION_DATASET'],
                                                 params['PUBLICATION_TABLE'])
            success = publish_table(source_table, publication_dest)
            if not success:
                print("publish table failed")
                return

    print('job completed')
Ejemplo n.º 9
0
def main(args):

    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, file_sets, steps = load_config(yaml_file.read())

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    if 'clear_target_directory' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            create_clean_target(local_files_dir.format(count_name))

    if 'build_manifest_from_filters' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            mani_for_count = manifest_file.format(count_name)
            table_for_count = params['BQ_MANIFEST_TABLE'].format(count_name)
            tsv_for_count = params['BUCKET_MANIFEST_TSV'].format(count_name)
            max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
            manifest_success = get_the_bq_manifest(
                params['FILE_TABLE'], count_dict['filters'], max_files,
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                table_for_count, params['WORKING_BUCKET'], tsv_for_count,
                mani_for_count, params['BQ_AS_BATCH'])
            if not manifest_success:
                print("Failure generating manifest")
                return

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, uses BQ as long as you have built the manifest using BQ (that route uses the BQ Manifest
    # table that was created).
    #

    if 'build_pull_list' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            table_for_count = params['BQ_MANIFEST_TABLE'].format(count_name)
            local_pull_for_count = local_pull_list.format(count_name)
            pull_table_for_count = params['BQ_PULL_LIST_TABLE'].format(
                count_name)
            bucket_pull_list_for_count = params['BUCKET_PULL_LIST'].format(
                count_name)
            full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['TARGET_DATASET'],
                                              table_for_count)
            build_pull_list_with_bq(
                full_manifest, params['INDEXD_BQ_TABLE'],
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                pull_table_for_count, params['WORKING_BUCKET'],
                bucket_pull_list_for_count, local_pull_for_count,
                params['BQ_AS_BATCH'])
    #
    # Now hitting GDC cloud buckets. Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            pull_for_count = local_pull_list.format(count_name)
            with open(pull_for_count, mode='r') as pull_list_file:
                pull_list = pull_list_file.read().splitlines()
            print("Preparing to download %s files from buckets\n" %
                  len(pull_list))
            bp = BucketPuller(10)
            local_files_dir_for_count = local_files_dir.format(count_name)
            bp.pull_from_buckets(pull_list, local_files_dir_for_count)

    if 'build_file_list' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            local_files_dir_for_count = local_files_dir.format(count_name)
            all_files = build_file_list(local_files_dir_for_count)
            file_traversal_list_for_count = file_traversal_list.format(
                count_name)
            with open(file_traversal_list_for_count,
                      mode='w') as traversal_list:
                for line in all_files:
                    traversal_list.write("{}\n".format(line))

    if 'concat_all_files' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            header = count_dict['header'] if 'header' in count_dict else None
            file_traversal_list_for_count = file_traversal_list.format(
                count_name)
            with open(file_traversal_list_for_count,
                      mode='r') as traversal_list_file:
                all_files = traversal_list_file.read().splitlines()
                concat_all_files(all_files, one_big_tsv.format(count_name),
                                 header)

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            # Wipe and recreate the local checkout directory, then clone the
            # schema repo and switch to the configured branch.
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'],
                                   params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        # NOTE(review): step name is 'process_git_schemas' but the progress
        # message says 'process_git_schema' — confirm which is intended.
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                        params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            # Sample the concatenated TSV to infer column types, then merge
            # the typing tuples with the repo-derived "_schema.json" file
            # into the per-count hold-schema list/dict files.
            typing_tups = build_schema(one_big_tsv.format(count_name),
                                       params['SCHEMA_SAMPLE_SKIPS'])
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                              params['FINAL_TARGET_TABLE'])
            schema_dict_loc = "{}_schema.json".format(full_file_prefix)
            build_combined_schema(None, schema_dict_loc, typing_tups,
                                  hold_schema_list.format(count_name),
                                  hold_schema_dict.format(count_name))

    # Map each count name to its destination blob path inside the working
    # bucket; used by both the upload and the BQ-load steps below.
    bucket_target_blob_sets = {}
    for file_set in file_sets:
        count_name, _ = next(iter(file_set.items()))
        bucket_target_blob_sets[count_name] = '{}/{}'.format(
            params['WORKING_BUCKET_DIR'],
            params['BUCKET_TSV'].format(count_name))

    if 'upload_to_bucket' in steps:
        # Push each concatenated TSV from the VM into the working bucket.
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            upload_to_bucket(params['WORKING_BUCKET'],
                             bucket_target_blob_sets[count_name],
                             one_big_tsv.format(count_name))

    if 'delete_all_bq' in steps:
        # NOTE(review): the boolean flag's meaning is defined inside
        # table_cleaner — confirm which set of tables True selects.
        table_cleaner(params, file_sets, True)

    if 'create_bq_from_tsv' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            bucket_src_url = 'gs://{}/{}'.format(
                params['WORKING_BUCKET'], bucket_target_blob_sets[count_name])
            # Load the typed schema written by the analyze step, then create
            # the per-count BQ table from the bucket-resident TSV.
            hold_schema_list_for_count = hold_schema_list.format(count_name)
            with open(hold_schema_list_for_count,
                      mode='r') as schema_hold_dict:
                typed_schema = json_loads(schema_hold_dict.read())
            csv_to_bq_write_depo(typed_schema, bucket_src_url,
                                 params['TARGET_DATASET'],
                                 params['TARGET_TABLE'].format(count_name),
                                 params['BQ_AS_BATCH'], None)

    if 'attach_ids_to_files' in steps:
        count = 0
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            write_depo = "WRITE_TRUNCATE" if (count == 0) else "WRITE_APPEND"
            gexp_table = '{}.{}.{}'.format(
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                params['TARGET_TABLE'].format(count_name))
            success = build_aliquot_and_case(
                gexp_table, params['FILEDATA_TABLE'], params['TARGET_DATASET'],
                params['STEP_2_TABLE'], write_depo, {}, params['BQ_AS_BATCH'])
            count += 1

        if not success:
            print("attach_ids_to_files failed")
            return

    if 'extract_platform' in steps:
        # Fully-qualified name of the step-2 table built upstream.
        step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                        params['TARGET_DATASET'],
                                        params['STEP_2_TABLE'])
        success = extract_platform_for_files(step2_table,
                                             params['FILEDATA_TABLE'],
                                             params['TARGET_DATASET'],
                                             params['STEP_2A_TABLE'], True, {},
                                             params['BQ_AS_BATCH'])

        if not success:
            print("extract_platform failed")
            return

    if 'attach_barcodes_to_ids' in steps:
        step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                        params['TARGET_DATASET'],
                                        params['STEP_2A_TABLE'])
        success = attach_barcodes(step2_table, params['ALIQUOT_TABLE'],
                                  params['TARGET_DATASET'],
                                  params['STEP_3_TABLE'], True, {},
                                  params['BQ_AS_BATCH'])

        if not success:
            print("attach_barcodes_to_ids failed")
            return

    if 'merge_counts_and_metadata' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            if 'header' not in count_dict:
                # NOTE(review): 'break' skips the remaining file sets without
                # signalling failure to the caller; sibling steps treat a
                # missing header as a hard error ('return') — confirm intent.
                print("must have defined headers to work")
                break
            header = count_dict['header']
            print(header)
            # The second comma-separated header field names the count column.
            sql_dict = {}
            sql_dict['count_column'] = header.split(',')[1].strip()
            sql_dict['file_column'] = 'file_gdc_id_{}'.format(count_name)

            step3_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                            params['TARGET_DATASET'],
                                            params['STEP_3_TABLE'])
            counts_table = '{}.{}.{}'.format(
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                params['TARGET_TABLE'].format(count_name))

            success = merge_counts_and_metadata(
                step3_table, counts_table, params['TARGET_DATASET'],
                params['COUNTS_WITH_METADATA_TABLE'].format(count_name), True,
                sql_dict, params['BQ_AS_BATCH'])

            if not success:
                print("merge_counts_and_metadata failed")
                return

    if 'merge_all' in steps:
        sql_dict = {}
        count = 0
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            dict_for_set = {}
            sql_dict['table_{}'.format(count)] = dict_for_set
            count += 1
            if 'header' not in count_dict:
                print("must have defined headers to work")
                return
            header = count_dict['header']
            dict_for_set['count_column'] = header.split(',')[1].strip()
            dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name)
            dict_for_set['table'] = '{}.{}.{}'.format(
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                params['COUNTS_WITH_METADATA_TABLE'].format(count_name))

        success = all_counts_to_one_table(params['TARGET_DATASET'],
                                          params['THREE_COUNTS_TABLE'], True,
                                          sql_dict, params['BQ_AS_BATCH'])

        if not success:
            print("merge_counts_and_metadata failed")
            return

    if 'glue_gene_names' in steps:
        # Attach gene names to the combined counts table, producing the
        # final target table. sql_dict mirrors the merge_all layout:
        # one sub-dict per file set, keyed table_0, table_1, ...
        sql_dict = {}
        count = 0
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            dict_for_set = {}
            sql_dict['table_{}'.format(count)] = dict_for_set
            count += 1
            if 'header' not in count_dict:
                print("must have defined headers to work")
                return
            header = count_dict['header']
            # The second comma-separated header field names the count column.
            dict_for_set['count_column'] = header.split(',')[1].strip()
            dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name)

        three_counts_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                               params['TARGET_DATASET'],
                                               params['THREE_COUNTS_TABLE'])

        success = glue_in_gene_names(three_counts_table,
                                     params['GENE_NAMES_TABLE'],
                                     params['TARGET_DATASET'],
                                     params['FINAL_TARGET_TABLE'], True,
                                     sql_dict, params['BQ_AS_BATCH'])

        if not success:
            print("glue_gene_names failed")
            return

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        # Re-shape the schema list [{'name': ..., 'description': ...}, ...]
        # into {name: {'description': ...}} for the schema-update call.
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}

        success = update_schema_with_dict(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    if 'dump_working_tables' in steps:
        # NOTE(review): the boolean flag's meaning is defined inside
        # table_cleaner — confirm which set of tables False selects.
        table_cleaner(params, file_sets, False)

    #
    # archive files on VM:
    #

    # Map each count name to its archive blob path. Fixed: the original loop
    # populated bucket_target_blob_sets (the working-bucket map built
    # earlier) instead of the freshly created bucket_archive_blob_sets,
    # leaving the archive map empty and clobbering the working-bucket map.
    # NOTE(review): neither map is read after this point in the visible
    # code — confirm whether the archive step was meant to consume this.
    bucket_archive_blob_sets = {}
    for file_set in file_sets:
        count_name, _ = next(iter(file_set.items()))
        bucket_archive_blob_sets[count_name] = '{}/{}'.format(
            params['ARCHIVE_BUCKET_DIR'],
            params['BUCKET_TSV'].format(count_name))

    if 'archive' in steps:

        print('archive files from VM')
        # Prefix archived copies with today's date plus the publication
        # dataset so repeated runs don't collide.
        archive_file_prefix = "{}_{}".format(date.today(),
                                             params['PUBLICATION_DATASET'])
        # NOTE(review): re.search returns None when args[1] lacks a
        # "/<name>.yaml" suffix, making .group(1) raise AttributeError —
        # confirm the config path always matches.
        yaml_file = re.search(r"\/(\w*.yaml)$", args[1])
        archive_yaml = "{}/{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'],
                                            params['ARCHIVE_CONFIG'],
                                            archive_file_prefix,
                                            yaml_file.group(1))
        # Archive the YAML config itself, then the per-count pull lists and
        # manifests that drove this run.
        upload_to_bucket(params['ARCHIVE_BUCKET'], archive_yaml, args[1])
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            pull_file_name = params['LOCAL_PULL_LIST']
            archive_pull_file = "{}/{}_{}".format(
                params['ARCHIVE_BUCKET_DIR'], archive_file_prefix,
                pull_file_name.format(count_name))
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_pull_file,
                             local_pull_list.format(count_name))
            manifest_file_name = params['MANIFEST_FILE']
            archive_manifest_file = "{}/{}_{}".format(
                params['ARCHIVE_BUCKET_DIR'], archive_file_prefix,
                manifest_file_name.format(count_name))
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_manifest_file,
                             manifest_file.format(count_name))

    #
    # publish table:
    #

    if 'publish' in steps:

        # Copy the finished table from the working project into the public
        # publication project/dataset.
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                             params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])

        success = publish_table(source_table, publication_dest)

        if not success:
            print("publish table failed")
            return

    print('job completed')
# Ejemplo n.º 10
# 0
def do_dataset_and_build(steps, build, build_tag, path_tag, dataset_tuple,
                         aliquot_map_programs, params, schema_tags):
    """Run the configured pipeline steps for one (program, build) pair.

    Each named step is executed only if present in `steps`. Steps build a
    chain of intermediate BigQuery tables named
    "<dataset>_<build>_<step table>", culminating in params['FINAL_TABLE'],
    which is then documented and published.

    Args:
        steps: collection of step names to execute (membership-tested).
        build: build name used in generated table names.
        build_tag: suffix appended to params['FILE_TABLE'] to select the
            release file table.
        path_tag: suffix appended to params['UUID_2_URL_TABLE']; also
            substituted into '~-path_tags' schema tags.
        dataset_tuple: pair where [0] is the program name (filtering and
            log messages) and [1] is the dataset name (table naming).
        aliquot_map_programs: programs whose aliquots are mapped via
            params['ALIQUOT_TABLE'] (membership-tested).
        params: configuration dictionary loaded from YAML.
        schema_tags: list of {tag: value} dicts customizing the generic
            schema; values beginning '~-', '~lc-' or '~lcbqs-' are replaced
            with program / path tag / build values.

    Returns:
        True on success, False on failure. (Fixed: several failure paths
        previously fell through with a bare `return`, yielding None; both
        are falsy, so callers testing truthiness are unaffected.)

    Raises:
        Exception: if a substitutable schema-tag value names an unknown
            category.
    """

    file_table = "{}_{}".format(params['FILE_TABLE'], build_tag)

    #
    # Pull stuff from rel:
    #

    if 'pull_slides' in steps:
        step_one_table = "{}_{}_{}".format(dataset_tuple[1], build,
                                           params['SLIDE_STEP_1_TABLE'])
        success = extract_active_slide_file_data(file_table, dataset_tuple[0],
                                                 params['TARGET_DATASET'],
                                                 step_one_table,
                                                 params['BQ_AS_BATCH'])
        if not success:
            print("{} {} pull_slides job failed".format(
                dataset_tuple[0], build))
            return False

        # An empty result means the program has no slide files; drop the
        # table so later steps see it as absent.
        if bq_table_is_empty(params['TARGET_DATASET'], step_one_table):
            delete_table_bq_job(params['TARGET_DATASET'], step_one_table)
            print("{} pull_slide table result was empty: table deleted".format(
                params['SLIDE_STEP_1_TABLE']))

    if 'pull_aliquot' in steps:
        step_one_table = "{}_{}_{}".format(dataset_tuple[1], build,
                                           params['ALIQUOT_STEP_1_TABLE'])
        success = extract_active_aliquot_file_data(file_table,
                                                   dataset_tuple[0],
                                                   params['TARGET_DATASET'],
                                                   step_one_table,
                                                   params['BQ_AS_BATCH'])
        if not success:
            print("{} {} pull_aliquot job failed".format(
                dataset_tuple[0], build))
            return False

        if bq_table_is_empty(params['TARGET_DATASET'], step_one_table):
            delete_table_bq_job(params['TARGET_DATASET'], step_one_table)
            print(
                "{} pull_aliquot table result was empty: table deleted".format(
                    params['ALIQUOT_STEP_1_TABLE']))

    if 'pull_case' in steps:
        step_one_table = "{}_{}_{}".format(dataset_tuple[1], build,
                                           params['CASE_STEP_1_TABLE'])
        success = extract_active_case_file_data(file_table, dataset_tuple[0],
                                                params['TARGET_DATASET'],
                                                step_one_table,
                                                params['BQ_AS_BATCH'])
        if not success:
            # NOTE(review): message says "pull_clinbio" though the step is
            # 'pull_case' — looks like a legacy name; confirm before changing
            # since log scrapers may depend on it.
            print("{} {} pull_clinbio job failed".format(
                dataset_tuple[0], build))
            return False

        if bq_table_is_empty(params['TARGET_DATASET'], step_one_table):
            delete_table_bq_job(params['TARGET_DATASET'], step_one_table)
            print("{} pull_case table result was empty: table deleted".format(
                params['CASE_STEP_1_TABLE']))

    if 'slide_barcodes' in steps:
        table_name = "{}_{}_{}".format(dataset_tuple[1], build,
                                       params['SLIDE_STEP_1_TABLE'])
        in_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                     params['TARGET_DATASET'], table_name)

        # Step-1 table may have been deleted above if it came back empty.
        if bq_table_exists(params['TARGET_DATASET'], table_name):
            step_two_table = "{}_{}_{}".format(dataset_tuple[1], build,
                                               params['SLIDE_STEP_2_TABLE'])
            success = extract_slide_barcodes(in_table, params['SLIDE_TABLE'],
                                             dataset_tuple[0],
                                             params['TARGET_DATASET'],
                                             step_two_table,
                                             params['BQ_AS_BATCH'])

            if not success:
                print("{} {} slide_barcodes job failed".format(
                    dataset_tuple[0], build))
                return False

    if 'aliquot_barcodes' in steps:
        table_name = "{}_{}_{}".format(dataset_tuple[1], build,
                                       params['ALIQUOT_STEP_1_TABLE'])
        in_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                     params['TARGET_DATASET'], table_name)

        if bq_table_exists(params['TARGET_DATASET'], table_name):
            step_two_table = "{}_{}_{}".format(dataset_tuple[1], build,
                                               params['ALIQUOT_STEP_2_TABLE'])

            # Programs with an aliquot map join against ALIQUOT_TABLE;
            # others fall back to case-level preparation.
            if dataset_tuple[0] in aliquot_map_programs:
                success = extract_aliquot_barcodes(in_table,
                                                   params['ALIQUOT_TABLE'],
                                                   dataset_tuple[0],
                                                   params['TARGET_DATASET'],
                                                   step_two_table,
                                                   params['BQ_AS_BATCH'])

                if not success:
                    print("{} {} align_barcodes job failed".format(
                        dataset_tuple[0], build))
                    return False
            else:
                success = prepare_aliquot_without_map(in_table,
                                                      params['CASE_TABLE'],
                                                      dataset_tuple[0],
                                                      params['TARGET_DATASET'],
                                                      step_two_table,
                                                      params['BQ_AS_BATCH'])

                if not success:
                    print("{} {} align_barcodes job failed".format(
                        dataset_tuple[0], build))
                    return False

        else:
            print(
                "{} {} aliquot_barcodes step skipped (no input table)".format(
                    dataset_tuple[0], build))

    if 'case_barcodes' in steps:
        table_name = "{}_{}_{}".format(dataset_tuple[1], build,
                                       params['CASE_STEP_1_TABLE'])
        in_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                     params['TARGET_DATASET'], table_name)

        if bq_table_exists(params['TARGET_DATASET'], table_name):
            step_two_table = "{}_{}_{}".format(dataset_tuple[1], build,
                                               params['CASE_STEP_2_TABLE'])
            success = extract_case_barcodes(in_table, params['CASE_TABLE'],
                                            dataset_tuple[0],
                                            params['TARGET_DATASET'],
                                            step_two_table,
                                            params['BQ_AS_BATCH'])

            if not success:
                print("{} {} case_barcodes job failed".format(
                    dataset_tuple[0], build))
                return False

    if 'union_tables' in steps:
        # Union whichever step-2 tables were actually produced.
        table_list = []

        union_table_tags = [
            'SLIDE_STEP_2_TABLE', 'ALIQUOT_STEP_2_TABLE', 'CASE_STEP_2_TABLE'
        ]

        for tag in union_table_tags:
            if tag in params:
                table_name = "{}_{}_{}".format(dataset_tuple[1], build,
                                               params[tag])
                if bq_table_exists(params['TARGET_DATASET'], table_name):
                    full_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                                   params['TARGET_DATASET'],
                                                   table_name)
                    table_list.append(full_table)

        union_table = "{}_{}_{}".format(dataset_tuple[1], build,
                                        params['UNION_TABLE'])
        success = build_union(table_list, params['TARGET_DATASET'],
                              union_table, params['BQ_AS_BATCH'])
        if not success:
            print("{} {} union_tables job failed".format(
                dataset_tuple[0], build))
            return False

    # Merge the URL info into the final table we are building:

    if 'create_final_table' in steps:
        union_table = '{}.{}.{}'.format(
            params['WORKING_PROJECT'], params['TARGET_DATASET'],
            "{}_{}_{}".format(dataset_tuple[1], build, params['UNION_TABLE']))
        success = install_uris(
            union_table, "{}{}".format(params['UUID_2_URL_TABLE'],
                                       path_tag), params['TARGET_DATASET'],
            "{}_{}_{}".format(dataset_tuple[1], build,
                              params['FINAL_TABLE']), params['BQ_AS_BATCH'])
        if not success:
            print("{} {} create_final_table job failed".format(
                dataset_tuple[0], build))
            return False

    # Stage the schema metadata from the repo copy:

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                        params['RAW_SCHEMA_DIR'],
                                        params['GENERIC_SCHEMA_FILE_NAME'])
        table_name = "{}_{}_{}".format(dataset_tuple[1], build,
                                       params['FINAL_TABLE'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          table_name)
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return False  # fixed: was a bare `return` (None)

    # Customize generic schema to this data program:

    if 'replace_schema_tags' in steps:
        print('replace_schema_tags')
        tag_map_list = []
        for tag_pair in schema_tags:
            for tag in tag_pair:
                val = tag_pair[tag]
                use_pair = {}
                tag_map_list.append(use_pair)
                # Values like '~-programs', '~lc-builds', '~lcbqs-programs'
                # are placeholders: the suffix picks the replacement source,
                # '~lc-' lowercases it, '~lcbqs-' lowercases the dataset
                # name for BQ-safe label values.
                if val.find('~-') == 0 or val.find('~lc-') == 0 or val.find(
                        '~lcbqs-') == 0:
                    chunks = val.split('-', 1)
                    if chunks[1] == 'programs':
                        if val.find('~lcbqs-') == 0:
                            rep_val = dataset_tuple[1].lower(
                            )  # can't have "." in a tag...
                        else:
                            rep_val = dataset_tuple[0]
                    elif chunks[1] == 'path_tags':
                        rep_val = path_tag
                    elif chunks[1] == 'builds':
                        rep_val = build
                    else:
                        # fixed: previously raised a message-less Exception
                        raise Exception(
                            "unknown schema tag value: {}".format(val))
                    if val.find('~lc-') == 0:
                        rep_val = rep_val.lower()
                    use_pair[tag] = rep_val
                else:
                    use_pair[tag] = val
        table_name = "{}_{}_{}".format(dataset_tuple[1], build,
                                       params['FINAL_TABLE'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          table_name)
        # Write out the details
        success = customize_labels_and_desc(full_file_prefix, tag_map_list)
        if not success:
            print("replace_schema_tags failed")
            return False  # fixed: was a bare `return` (None)

    #
    # Update the per-field descriptions:
    #

    if 'install_field_descriptions' in steps:
        table_name = "{}_{}_{}".format(dataset_tuple[1], build,
                                       params['FINAL_TABLE'])
        print('install_field_descriptions: {}'.format(table_name))
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          table_name)
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        # Re-shape [{'name': ..., 'description': ...}, ...] into
        # {name: {'description': ...}} for the schema-update call.
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}
        success = update_schema_with_dict(params['TARGET_DATASET'],
                                          table_name,
                                          schema_dict,
                                          project=params['WORKING_PROJECT'])
        if not success:
            print("install_field_descriptions failed")
            return False  # fixed: was a bare `return` (None)

    #
    # Add description and labels to the target table:
    #

    if 'install_table_description' in steps:
        table_name = "{}_{}_{}".format(dataset_tuple[1], build,
                                       params['FINAL_TABLE'])
        print('install_table_description: {}'.format(table_name))
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          table_name)
        success = install_labels_and_desc(params['TARGET_DATASET'],
                                          table_name,
                                          full_file_prefix,
                                          project=params['WORKING_PROJECT'])
        if not success:
            print("install_table_description failed")
            return False  # fixed: was a bare `return` (None)

    #
    # publish table:
    #

    if 'publish' in steps:
        table_name = "{}_{}_{}".format(dataset_tuple[1], build,
                                       params['FINAL_TABLE'])
        print('publish: {}'.format(table_name))

        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'], table_name)
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                             dataset_tuple[1], table_name)

        success = publish_table(source_table, publication_dest)

        if not success:
            print("publish failed")
            return False  # fixed: was a bare `return` (None)

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        print('dump_working_tables')
        dump_tables = []
        dump_table_tags = [
            'SLIDE_STEP_1_TABLE', 'SLIDE_STEP_2_TABLE', 'ALIQUOT_STEP_1_TABLE',
            'ALIQUOT_STEP_2_TABLE', 'CASE_STEP_1_TABLE', 'CASE_STEP_2_TABLE',
            'UNION_TABLE'
        ]
        for tag in dump_table_tags:
            table_name = "{}_{}_{}".format(dataset_tuple[1], build,
                                           params[tag])
            if bq_table_exists(params['TARGET_DATASET'], table_name):
                dump_tables.append(table_name)

        # Deletion failures are logged but do not fail the run.
        for table in dump_tables:
            success = delete_table_bq_job(params['TARGET_DATASET'], table)
            if not success:
                print("problem deleting table {}".format(table))

    #
    # Done!
    #

    return True