def main():

    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    print('job started')

    #
    # First thing is to load the configuration:
    #

    params, filters, bq_filters, steps, retain_cols, extra_cols, retain_platform_ref_fields = load_config(yaml_config)

    if params is None:
        print("Bad YAML load")
        return

    #
    # Use the filter set to get a manifest from GDC using their API. Note that if a pull list is
    # provided, these steps can be omitted:
    #

    if 'build_manifest_from_filters' in steps:
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
        if params['USE_GDC_API_FOR_MANIFEST']:
            manifest_filter = build_manifest_filter(filters)
            manifest_success = get_the_manifest(manifest_filter, params['API_URL'],
                                                params['MANIFEST_FILE'], max_files)
        else:
            manifest_success = get_the_bq_manifest(params['FILE_TABLE'], bq_filters, max_files,
                                                   params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                                   params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
                                                   params['BUCKET_MANIFEST_TSV'], params['MANIFEST_FILE'],
                                                   params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # Best practice is to clear out the directory where the files are going. Don't want anything left over:
    #

    if 'clear_target_directory' in steps:
        create_clean_target(params['LOCAL_FILES_DIR'])

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, you can do it using IndexD calls on a manifest file, OR using BQ as long as you have
    # built the manifest using BQ (that route uses the BQ Manifest table that was created).
    #

    if 'build_pull_list' in steps:
        if params['USE_INDEXD_FOR_PULL']:
            build_pull_list_with_indexd(params['MANIFEST_FILE'], params['INDEXD_IDS_PER_CALL'],
                                        params['INDEXD_URL'], params['LOCAL_PULL_LIST'])
        else:
            full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['TARGET_DATASET'],
                                              params['BQ_MANIFEST_TABLE'])
            build_pull_list_with_bq(full_manifest, params['INDEXD_BQ_TABLE'],
                                    params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                    params['BQ_PULL_LIST_TABLE'],
                                    params['WORKING_BUCKET'], params['BUCKET_PULL_LIST'],
                                    params['LOCAL_PULL_LIST'], params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets, not "downloading". Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        with open(params['LOCAL_PULL_LIST'], mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        bp = BucketPuller(10)
        bp.pull_from_buckets(pull_list, params['LOCAL_FILES_DIR'])

    #
    # Traverse the tree of downloaded files and create a flat list of all files:
    #

    if 'build_traversal_list' in steps:
        all_files = build_file_list(params['LOCAL_FILES_DIR'])
        with open(params['FILE_TRAVERSAL_LIST'], mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    #
    # Take all the files and make one BIG TSV file to upload:
    #

    print("fix me have to toss out NA rows!")
    if 'concat_all_files' in steps:
        with open(params['FILE_TRAVERSAL_LIST'], mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        concat_all_files_selected_cols(all_files, params['ONE_BIG_TSV'], params['PROGRAM_PREFIX'],
                                       retain_cols, extra_cols, file_info,
                                       None, "Beta_value", "NA")

    #
    # Build the platform reference table
    #

    if 'build_plat_ref' in steps:
        with open(params['FILE_TRAVERSAL_LIST'], mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        concat_all_files_selected_cols(all_files, params['ONE_BIG_REF_TSV'], params['PROGRAM_PREFIX'],
                                       retain_platform_ref_fields, [], file_info,
                                       None, None, None)
        set_from_file(params['ONE_BIG_REF_TSV'], params['ONE_BIG_DISTINCT_REF_TSV'])

    #
    # For the legacy table, the descriptions had lots of analysis tidbits. Very nice, but hard to maintain.
    # We just use hardwired schema descriptions now, most directly pulled from the GDC website:
    #

    if 'build_the_schema' in steps:
        typing_tups = build_schema(params['ONE_BIG_TSV'], params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, params['AUGMENTED_SCHEMA_FILE'],
                              typing_tups, params['HOLD_SCHEMA_LIST'], params['HOLD_SCHEMA_DICT'])

    #
    # Upload the giant TSV into a cloud bucket:
    #

    if 'upload_to_bucket' in steps:
        upload_to_bucket(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'], params['ONE_BIG_TSV'])

    #
    # Create the BQ table from the TSV:
    #

    if 'create_bq_from_tsv' in steps:
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'])
        with open(params['HOLD_SCHEMA_LIST'], mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['SKELETON_TABLE'], params['BQ_AS_BATCH'])

    #
    # Need to merge in aliquot and sample barcodes from other tables:
    #

    if 'collect_barcodes' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])
        success = attach_aliquot_ids(skel_table, params['FILE_TABLE'],
                                     params['TARGET_DATASET'],
                                     params['BARCODE_STEP_1_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("attach_aliquot_ids job failed")
            return
        step_1_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                         params['BARCODE_STEP_1_TABLE'])
        success = attach_barcodes(step_1_table, params['ALIQUOT_TABLE'],
                                  params['TARGET_DATASET'], params['BARCODE_STEP_2_TABLE'],
                                  params['BQ_AS_BATCH'])
        if not success:
            print("attach_barcodes job failed")
            return

    #
    # Merge the barcode info into the final table we are building:
    #

    if 'create_final_table' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])
        barcodes_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                           params['BARCODE_STEP_2_TABLE'])
        success = final_merge(skel_table, barcodes_table,
                              params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'],
                              params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            return

    #
    # The derived table we generate has no field descriptions. Add them from the scraped page:
    #

    if 'update_final_schema' in steps:
        success = update_schema(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'],
                                params['HOLD_SCHEMA_DICT'])
        if not success:
            print("Schema update failed")
            return

    #
    # Add the table description:
    #

    if 'add_table_description' in steps:
        update_description(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'],
                           params['TABLE_DESCRIPTION'])

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = ['SKELETON_TABLE', 'BARCODE_STEP_1_TABLE', 'BARCODE_STEP_2_TABLE',
                           'BQ_MANIFEST_TABLE', 'BQ_PULL_LIST_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    #
    # Done!
    #

    print('job completed')
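
# confirm_google_vm() is called at the top of every workflow above but is defined elsewhere in the
# repo's support library. A minimal sketch of what such a check might look like, assuming it simply
# probes the GCE metadata server (the name confirm_google_vm_sketch and use of requests are
# illustrative; the actual helper may differ):

import requests

def confirm_google_vm_sketch():
    # The metadata server only answers this request (with the required header) from inside a GCE VM.
    try:
        resp = requests.get('http://metadata.google.internal/computeMetadata/v1/instance/id',
                            headers={'Metadata-Flavor': 'Google'}, timeout=2)
        return resp.status_code == 200
    except requests.exceptions.RequestException:
        return False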
def main(args):

    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, filters, bq_filters, steps, callers, update_schema_tables, schema_tags = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    # Which table are we building?
    release = "".join(["r", str(params['RELEASE'])])
    use_schema = params['VER_SCHEMA_FILE_NAME']
    if 'current' in steps:
        print('This workflow will update the schema for the "current" table')
        release = 'current'
        use_schema = params['SCHEMA_FILE_NAME']

    # Create table names
    concat_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], 'concat'])
    barcode_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], 'barcode'])
    draft_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], params['BUILD'], 'gdc', '{}'])
    publication_table = '_'.join([params['DATA_TYPE'], params['BUILD'], 'gdc', '{}'])
    manifest_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], 'manifest'])

    if params['RELEASE'] < 21 and 'METADATA_REL' not in params:
        print("The input release is before the new metadata process; "
              "please specify which release of the metadata to use.")

    metadata_rel = "".join(["r", str(params['METADATA_REL'])]) if 'METADATA_REL' in params else release

    if 'build_manifest_from_filters' in steps:
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
        manifest_success = get_the_bq_manifest(params['FILE_TABLE'].format(metadata_rel),
                                               bq_filters, max_files,
                                               params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                               manifest_table, params['WORKING_BUCKET'],
                                               params['BUCKET_MANIFEST_TSV'], manifest_file,
                                               params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # Best practice is to clear out the directory where the files are going. Don't want anything left over:
    #

    if 'clear_target_directory' in steps:
        create_clean_target(local_files_dir)

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, you can do it using IndexD calls on a manifest file, OR using BQ as long as you have
    # built the manifest using BQ (that route uses the BQ Manifest table that was created).
    #

    if 'build_pull_list' in steps:
        build_pull_list_with_bq("{}.{}.{}".format(params['WORKING_PROJECT'],
                                                  params['SCRATCH_DATASET'],
                                                  manifest_table),
                                params['INDEXD_BQ_TABLE'].format(metadata_rel),
                                params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                "_".join([params['PROGRAM'], params['DATA_TYPE'], 'pull', 'list']),
                                params['WORKING_BUCKET'], params['BUCKET_PULL_LIST'],
                                local_pull_list, params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets, not "downloading". Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        pull_from_buckets(pull_list, local_files_dir)

    #
    # Traverse the tree of downloaded files and create a flat list of all files:
    #

    if 'build_traversal_list' in steps:
        all_files = build_file_list(local_files_dir)
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    if 'concat_all_files' in steps:
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        concat_all_files(all_files, one_big_tsv, params['PROGRAM'], callers, params['FIELDS_TO_FIX'])

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    for table in update_schema_tables:
        if table == 'current':
            use_schema = params['SCHEMA_FILE_NAME']
            schema_release = 'current'
        else:
            use_schema = params['VER_SCHEMA_FILE_NAME']
            schema_release = release

        if 'process_git_schemas' in steps:
            print('process_git_schema')
            # Where do we dump the schema git repository?
            schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'], params['RAW_SCHEMA_DIR'], use_schema)
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], draft_table.format(schema_release))
            # Write out the details
            success = generate_table_detail_files(schema_file, full_file_prefix)
            if not success:
                print("process_git_schemas failed")
                return

        # Customize generic schema to this data program:
        if 'replace_schema_tags' in steps:
            print('replace_schema_tags')
            pn = params['PROGRAM']
            dataset_tuple = (pn, pn.replace(".", "_"))
            tag_map_list = []
            for tag_pair in schema_tags:
                for tag in tag_pair:
                    val = tag_pair[tag]
                    use_pair = {}
                    tag_map_list.append(use_pair)
                    if val.find('~-') == 0 or val.find('~lc-') == 0 or val.find('~lcbqs-') == 0:
                        chunks = val.split('-', 1)
                        if chunks[1] == 'programs':
                            if val.find('~lcbqs-') == 0:
                                rep_val = dataset_tuple[1].lower()  # can't have "." in a tag...
                            else:
                                rep_val = dataset_tuple[0]
                        elif chunks[1] == 'builds':
                            rep_val = params['BUILD']
                        else:
                            raise Exception()
                        if val.find('~lc-') == 0:
                            rep_val = rep_val.lower()
                        use_pair[tag] = rep_val
                    else:
                        use_pair[tag] = val
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], draft_table.format(schema_release))
            # Write out the details
            success = customize_labels_and_desc(full_file_prefix, tag_map_list)
            if not success:
                print("replace_schema_tags failed")
                return False

        if 'analyze_the_schema' in steps:
            print('analyze_the_schema')
            typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], draft_table.format(schema_release))
            schema_dict_loc = "{}_schema.json".format(full_file_prefix)
            build_combined_schema(None, schema_dict_loc, typing_tups, hold_schema_list, hold_schema_dict)

    bucket_target_blob = '{}/{}-{}-{}.tsv'.format(params['WORKING_BUCKET_DIR'], params['DATE'],
                                                  params['PROGRAM'], params['DATA_TYPE'])

    if 'upload_to_bucket' in steps:
        print('upload_to_bucket')
        upload_to_bucket(params['WORKING_BUCKET'], bucket_target_blob, one_big_tsv)

    #
    # Create the BQ table from the TSV:
    #

    if 'create_bq_from_tsv' in steps:
        print('create_bq_from_tsv')
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], bucket_target_blob)
        with open(hold_schema_list, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['SCRATCH_DATASET'], concat_table, params['BQ_AS_BATCH'])

    #
    # Need to merge in aliquot and sample barcodes from other tables:
    #

    if 'collect_barcodes' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'], concat_table)
        if params['RELEASE'] < 25:
            case_table = params['CASE_TABLE'].format('25')
        else:
            case_table = params['CASE_TABLE'].format(release)

        if params['PROGRAM'] == 'TCGA':
            success = attach_aliquot_ids(skel_table, params['FILE_TABLE'].format(release),
                                         params['SCRATCH_DATASET'],
                                         '_'.join([barcode_table, 'pre']), params['BQ_AS_BATCH'])
            if not success:
                print("attach_aliquot_ids job failed")
                return
            step_1_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                             '_'.join([barcode_table, 'pre']))
        else:
            step_1_table = skel_table

        success = attach_barcodes(step_1_table, params['ALIQUOT_TABLE'].format(release),
                                  params['SCRATCH_DATASET'], barcode_table, params['BQ_AS_BATCH'],
                                  params['PROGRAM'], case_table)
        if not success:
            print("attach_barcodes job failed")
            return

    #
    # Merge the barcode info into the final table we are building:
    #

    if 'create_final_table' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'], concat_table)
        barcodes_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'], barcode_table)
        success = final_merge(skel_table, barcodes_table,
                              params['SCRATCH_DATASET'], draft_table.format(release),
                              params['BQ_AS_BATCH'], params['PROGRAM'])
        if not success:
            print("Join job failed")
            return

    #
    # Create second table
    #

    if 'create_current_table' in steps:
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                         draft_table.format(release))
        current_dest = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                         draft_table.format('current'))
        success = publish_table(source_table, current_dest)
        if not success:
            print("create current table failed")
            return

    #
    # The derived table we generate has no field descriptions. Add them from the github json files:
    #

    for table in update_schema_tables:
        schema_release = 'current' if table == 'current' else release

        if 'update_final_schema' in steps:
            success = update_schema(params['SCRATCH_DATASET'], draft_table.format(schema_release), hold_schema_dict)
            if not success:
                print("Schema update failed")
                return

        #
        # Add the table description:
        #

        if 'add_table_description' in steps:
            print('update_table_description')
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], draft_table.format(schema_release))
            success = install_labels_and_desc(params['SCRATCH_DATASET'],
                                              draft_table.format(schema_release), full_file_prefix)
            if not success:
                print("update_table_description failed")
                return

    #
    # compare and remove old current table
    #

    # compare the two tables
    if 'compare_remove_old_current' in steps:
        old_current_table = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'], params['PUBLICATION_DATASET'],
                                              publication_table.format('current'))
        previous_ver_table = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                               "_".join([params['PUBLICATION_DATASET'], 'versioned']),
                                               publication_table.format("".join(["r", str(params['PREVIOUS_RELEASE'])])))
        table_temp = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                       "_".join([params['PROGRAM'],
                                                 publication_table.format("".join(["r", str(params['PREVIOUS_RELEASE'])])),
                                                 'backup']))

        print('Compare {} to {}'.format(old_current_table, previous_ver_table))
        compare = compare_two_tables(old_current_table, previous_ver_table, params['BQ_AS_BATCH'])

        num_rows = compare.total_rows

        if num_rows == 0:
            print('the tables are the same')
        else:
            print('the tables are NOT the same and differ by {} rows'.format(num_rows))

        if not compare:
            print('compare_tables failed')
            return
        # move old table to a temporary location
        elif compare and num_rows == 0:
            print('Move old table to temp location')
            table_moved = publish_table(old_current_table, table_temp)

            if not table_moved:
                print('Old Table was not moved and will not be deleted')
            # remove old table
            elif table_moved:
                print('Deleting old table: {}'.format(old_current_table))
                delete_table = delete_table_bq_job(params['PUBLICATION_DATASET'],
                                                   publication_table.format('current'))
                if not delete_table:
                    print('delete table failed')
                    return

    #
    # publish table:
    #

    if 'publish' in steps:
        tables = ['versioned', 'current']
        for table in tables:
            if table == 'versioned':
                print(table)
                source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                                 draft_table.format(release))
                publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                                     "_".join([params['PUBLICATION_DATASET'], 'versioned']),
                                                     publication_table.format(release))
            elif table == 'current':
                print(table)
                source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                                 draft_table.format('current'))
                publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                                     params['PUBLICATION_DATASET'],
                                                     publication_table.format('current'))
            success = publish_table(source_table, publication_dest)
            if not success:
                print("publish table failed")
                return

    #
    # Update previous versioned table with archived tag
    #

    if 'update_status_tag' in steps:
        print('Update previous table')
        success = update_status_tag("_".join([params['PUBLICATION_DATASET'], 'versioned']),
                                    publication_table.format("".join(["r", str(params['PREVIOUS_RELEASE'])])),
                                    'archived')
        if not success:
            print("update status tag table failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_tables = [concat_table, barcode_table, draft_table.format('current'),
                       draft_table.format(release), manifest_table]
        for table in dump_tables:
            delete_table_bq_job(params['SCRATCH_DATASET'], table)

    #
    # Done!
    #

    print('job completed')

    if 'archive' in steps:
        print('archive files from VM')
        archive_file_prefix = "{}_{}".format(date.today(), params['PUBLICATION_DATASET'])
        if params['ARCHIVE_YAML']:
            yaml_file = re.search(r"\/(\w*.yaml)$", args[1])
            archive_yaml = "{}/{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'],
                                                params['ARCHIVE_CONFIG'],
                                                archive_file_prefix,
                                                yaml_file.group(1))
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_yaml, args[1])
        archive_pull_file = "{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'], archive_file_prefix,
                                              params['LOCAL_PULL_LIST'])
        upload_to_bucket(params['ARCHIVE_BUCKET'], archive_pull_file, params['LOCAL_PULL_LIST'])
        archive_manifest_file = "{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'], archive_file_prefix,
                                                  params['MANIFEST_FILE'])
        upload_to_bucket(params['ARCHIVE_BUCKET'], archive_manifest_file, params['MANIFEST_FILE'])
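
# The replace_schema_tags step above resolves 'schema_tags' entries whose values start with the
# prefixes ~-, ~lc-, or ~lcbqs- followed by 'programs' or 'builds'. A purely illustrative example
# (the tag names and program/build values are hypothetical, not from an actual config), assuming
# PROGRAM = 'BEATAML1.0' and BUILD = 'hg38':
#
#   schema_tags:
#     - program-name: ~-programs        # resolves to {'program-name': 'BEATAML1.0'}
#     - program-label: ~lcbqs-programs  # resolves to {'program-label': 'beataml1_0'} (BQ-safe, lower-cased)
#     - ref-genome: ~-builds            # resolves to {'ref-genome': 'hg38'}
#     - source: GDC                     # no prefix: passed through unchanged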
def main(args):

    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, bq_filters, na_values, steps = load_config(yaml_file.read())

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    #
    # Actual fields have brackets:
    #

    na_set = set()
    for val in na_values:
        na_set.add("[{}]".format(val))

    if 'clear_target_directory' in steps:
        print('clear_target_directory')
        create_clean_target(local_files_dir)

    #
    # Use the filter set to build a manifest. Note that if a pull list is
    # provided, these steps can be omitted:
    #

    if 'build_manifest_from_filters' in steps:
        print('build_manifest_from_filters')
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
        manifest_success = get_the_bq_manifest(params['FILE_TABLE'], bq_filters, max_files,
                                               params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                               params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
                                               params['BUCKET_MANIFEST_TSV'], manifest_file,
                                               params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, this uses BQ as long as you have built the manifest using BQ (that route uses the BQ
    # Manifest table that was created).
    #

    if 'build_pull_list' in steps:
        print('build_pull_list')
        full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                          params['BQ_MANIFEST_TABLE'])
        success = build_pull_list_with_bq_public(full_manifest, params['INDEXD_BQ_TABLE'],
                                                 params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                                 params['BQ_PULL_LIST_TABLE'],
                                                 params['WORKING_BUCKET'], params['BUCKET_PULL_LIST'],
                                                 local_pull_list, params['BQ_AS_BATCH'])
        if not success:
            print("Build pull list failed")
            return

    #
    # Now hitting GDC cloud buckets. Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        print('download_from_gdc')
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        bp = BucketPuller(10)
        bp.pull_from_buckets(pull_list, local_files_dir)

    if 'build_file_list' in steps:
        print('build_file_list')
        all_files = build_file_list(local_files_dir)
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    if 'group_by_type' in steps:
        print('group_by_type')
        print(file_traversal_list)
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        group_dict = group_by_suffixes(all_files)  # WRITE OUT AS JSON!!

    if 'convert_excel_to_csv' in steps:
        print('convert_excel_to_csv')
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        convert_excel_to_csv(all_files, local_files_dir)

    if 'concat_all_files' in steps:
        print('concat_all_files')
        for k, v in group_dict.items():
            concat_all_files(v, one_big_tsv.format(k), na_set)

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'], params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        for k in group_dict:
            typing_tups = build_schema(one_big_tsv.format(k), params['SCHEMA_SAMPLE_SKIPS'])
            # full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
            # schema_dict_loc = "{}_schema.json".format(full_file_prefix)
            hold_schema_dict_for_group = hold_schema_dict.format(k)
            hold_schema_list_for_group = hold_schema_list.format(k)
            build_combined_schema(None, None, typing_tups,
                                  hold_schema_list_for_group, hold_schema_dict_for_group)

    bucket_target_blob = '{}/{}'.format(params['WORKING_BUCKET_DIR'], params['BUCKET_TSV'])

    if 'upload_to_bucket' in steps:
        print('upload_to_bucket')
        for k in group_dict:
            upload_to_bucket(params['WORKING_BUCKET'], bucket_target_blob.format(k), one_big_tsv.format(k))

    if 'create_bq_from_tsv' in steps:
        print('create_bq_from_tsv')
        for k in group_dict:
            bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], bucket_target_blob.format(k))
            with open(hold_schema_list.format(k), mode='r') as schema_hold_dict:
                typed_schema = json_loads(schema_hold_dict.read())
            csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                      params['FINAL_TARGET_TABLE'].format(k.replace(".", "_").replace("-", "_")),
                      params['BQ_AS_BATCH'])

    if 'add_aliquot_fields' in steps:
        print('add_aliquot_fields')
        full_target_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                              params['TARGET_TABLE'])
        success = join_with_aliquot_table(full_target_table, params['ALIQUOT_TABLE'],
                                          params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'],
                                          params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}
        success = update_schema_with_dict(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    #
    # publish table:
    #

    if 'publish' in steps:
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'], params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])
        success = publish_table(source_table, publication_dest)
        if not success:
            print("publish table failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = ['TARGET_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    print('job completed')
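
# group_by_suffixes() above is defined elsewhere in the repo's support library. A minimal sketch of
# the grouping it appears to perform (bucketing the traversal list by file-name suffix so each group
# gets its own TSV and BQ table), assuming the key is everything after the first '.' in the basename;
# the helper name and keying rule here are illustrative, not the library's actual implementation:

import os

def group_by_suffixes_sketch(all_files):
    groups = {}
    for path in all_files:
        base = os.path.basename(path)
        # e.g. "nationwidechildrens.org_clinical_patient_coad.txt" -> key "org_clinical_patient_coad.txt"
        key = base.split('.', 1)[1] if '.' in base else base
        groups.setdefault(key, []).append(path)
    return groups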
def main(args):

    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, bq_filters, steps, extra_cols = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    # Schema that describes table columns:
    AUGMENTED_SCHEMA_FILE = "SchemaFiles/mirna_augmented_schema_list.json"

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    #
    # Best practice is to clear out the directory where the files are going. Don't want anything left over.
    # Also creates the destination directory
    #

    if 'clear_target_directory' in steps:
        create_clean_target(local_files_dir)

    #
    # Use the filter set to get a manifest. Note that if a pull list is
    # provided, these steps can be omitted:
    #

    if 'build_manifest_from_filters' in steps:
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
        manifest_success = get_the_bq_manifest(params['FILE_TABLE'], bq_filters, max_files,
                                               params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                               params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
                                               params['BUCKET_MANIFEST_TSV'], manifest_file,
                                               params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # If you have already created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step.
    #

    if 'build_pull_list' in steps:
        full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                          params['BQ_MANIFEST_TABLE'])
        build_pull_list_with_bq(full_manifest, params['INDEXD_BQ_TABLE'],
                                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                params['BQ_PULL_LIST_TABLE'],
                                params['WORKING_BUCKET'], params['BUCKET_PULL_LIST'],
                                local_pull_list, params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets, not "downloading". Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        bp = BucketPuller(10)
        bp.pull_from_buckets(pull_list, local_files_dir)

    #
    # Traverse the tree of downloaded files and create a flat list of all files:
    #

    if 'build_traversal_list' in steps:
        all_files = build_file_list(local_files_dir)
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    #
    # Take all the files and make one BIG TSV file to upload:
    #

    if 'concat_all_files' in steps:
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        concat_all_files(all_files, one_big_tsv,
                         params['PROGRAM_PREFIX'], extra_cols, file_info, None)

    #
    # For the legacy table, the descriptions had lots of analysis tidbits. Very nice, but hard to maintain.
    # We just use hardwired schema descriptions now, most directly pulled from the GDC website:
    #

    if 'build_the_schema' in steps:
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, AUGMENTED_SCHEMA_FILE,
                              typing_tups, hold_schema_list, hold_schema_dict)

    #
    # Upload the giant TSV into a cloud bucket:
    #

    if 'upload_to_bucket' in steps:
        upload_to_bucket(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'], one_big_tsv)

    #
    # Create the BQ table from the TSV:
    #

    if 'create_bq_from_tsv' in steps:
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'])
        with open(hold_schema_list, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['SKELETON_TABLE'], params['BQ_AS_BATCH'])

    #
    # Need to merge in aliquot and sample barcodes from other tables:
    #

    if 'collect_barcodes' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])
        success = attach_aliquot_ids(skel_table, params['FILE_TABLE'],
                                     params['TARGET_DATASET'],
                                     params['BARCODE_STEP_1_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("attach_aliquot_ids job failed")
            return
        step_1_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                         params['BARCODE_STEP_1_TABLE'])
        success = attach_barcodes(step_1_table, params['ALIQUOT_TABLE'],
                                  params['TARGET_DATASET'], params['BARCODE_STEP_2_TABLE'],
                                  params['BQ_AS_BATCH'])
        if not success:
            print("attach_barcodes job failed")
            return

    #
    # Merge the barcode info into the final table we are building:
    #

    if 'create_final_table' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])
        barcodes_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                           params['BARCODE_STEP_2_TABLE'])
        success = final_merge(skel_table, barcodes_table,
                              params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'],
                              params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            return

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'], params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        build_combined_schema(None, schema_dict_loc,
                              typing_tups, hold_schema_list, hold_schema_dict)

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}
        success = update_schema_with_dict(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = ['SKELETON_TABLE', 'BARCODE_STEP_1_TABLE', 'BARCODE_STEP_2_TABLE',
                           'BQ_MANIFEST_TABLE', 'BQ_PULL_LIST_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    #
    # publish table:
    #

    if 'publish' in steps:
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'], params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])
        success = publish_table(source_table, publication_dest)
        if not success:
            print("publish table failed")
            return

    print('job completed')
def main(args):

    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, file_sets, update_schema_tables, schema_tags, steps = load_config(yaml_file.read())

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    # Which release is the workflow running on?
    release = "".join(["r", str(params['RELEASE'])])

    # Create table names
    upload_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], '{}'])
    manifest_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], 'manifest', '{}'])
    pull_list_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], 'pull', 'list', '{}'])
    files_to_case_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], 'files_to_case'])
    files_to_case_w_plat_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], 'files_to_case_with_plat'])
    barcodes_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], 'barcodes'])
    counts_w_metadata_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], 'counts_and_meta', '{}'])
    merged_counts_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], 'merged_counts'])
    draft_table = '_'.join([params['PROGRAM'], params['DATA_TYPE'], params['BUILD'], 'gdc', '{}'])
    publication_table = '_'.join([params['DATA_TYPE'], params['BUILD'], 'gdc', '{}'])

    if params['RELEASE'] < 21 and 'METADATA_REL' not in params:
        print("The input release is before the new metadata process; "
              "please specify which release of the metadata to use.")

    metadata_rel = "".join(["r", str(params['METADATA_REL'])]) if 'METADATA_REL' in params else release

    if 'clear_target_directory' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            create_clean_target(local_files_dir.format(count_name))

    if 'build_manifest_from_filters' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            mani_for_count = manifest_file.format(count_name)
            table_for_count = manifest_table.format(count_name)
            tsv_for_count = params['BUCKET_MANIFEST_TSV'].format(count_name)
            max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
            manifest_success = get_the_bq_manifest(params['FILE_TABLE'].format(metadata_rel),
                                                   count_dict['filters'], max_files,
                                                   params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                                   table_for_count, params['WORKING_BUCKET'],
                                                   tsv_for_count, mani_for_count,
                                                   params['BQ_AS_BATCH'])
            if not manifest_success:
                print("Failure generating manifest")
                return

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, this uses BQ as long as you have built the manifest using BQ (that route uses the BQ
    # Manifest table that was created).
    #

    if 'build_pull_list' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            table_for_count = manifest_table.format(count_name)
            local_pull_for_count = local_pull_list.format(count_name)
            pull_table_for_count = pull_list_table.format(count_name)
            bucket_pull_list_for_count = params['BUCKET_PULL_LIST'].format(count_name)
            full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                              table_for_count)
            build_pull_list_with_bq(full_manifest, params['INDEXD_BQ_TABLE'].format(metadata_rel),
                                    params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                    pull_table_for_count,
                                    params['WORKING_BUCKET'], bucket_pull_list_for_count,
                                    local_pull_for_count, params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets. Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            pull_for_count = local_pull_list.format(count_name)
            with open(pull_for_count, mode='r') as pull_list_file:
                pull_list = pull_list_file.read().splitlines()
            print("Preparing to download %s files from buckets\n" % len(pull_list))
            bp = BucketPuller(10)
            local_files_dir_for_count = local_files_dir.format(count_name)
            bp.pull_from_buckets(pull_list, local_files_dir_for_count)

    if 'build_file_list' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            local_files_dir_for_count = local_files_dir.format(count_name)
            all_files = build_file_list(local_files_dir_for_count)
            file_traversal_list_for_count = file_traversal_list.format(count_name)
            with open(file_traversal_list_for_count, mode='w') as traversal_list:
                for line in all_files:
                    traversal_list.write("{}\n".format(line))

    if 'concat_all_files' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            header = count_dict['header'] if 'header' in count_dict else None
            file_traversal_list_for_count = file_traversal_list.format(count_name)
            with open(file_traversal_list_for_count, mode='r') as traversal_list_file:
                all_files = traversal_list_file.read().splitlines()
            concat_all_files(all_files, one_big_tsv.format(count_name), header)

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    for table in update_schema_tables:
        if table == 'current':
            use_schema = params['SCHEMA_FILE_NAME']
            schema_release = 'current'
        else:
            use_schema = params['VER_SCHEMA_FILE_NAME']
            schema_release = release

        if 'process_git_schemas' in steps:
            print('process_git_schema')
            # Where do we dump the schema git repository?
            schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'], params['RAW_SCHEMA_DIR'], use_schema)
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], draft_table.format(schema_release))
            # Write out the details
            success = generate_table_detail_files(schema_file, full_file_prefix)
            if not success:
                print("process_git_schemas failed")
                return

        # Customize generic schema to this data program:
        if 'replace_schema_tags' in steps:
            print('replace_schema_tags')
            pn = params['PROGRAM']
            dataset_tuple = (pn, pn.replace(".", "_"))
            tag_map_list = []
            for tag_pair in schema_tags:
                for tag in tag_pair:
                    val = tag_pair[tag]
                    use_pair = {}
                    tag_map_list.append(use_pair)
                    if val.find('~-') == 0 or val.find('~lc-') == 0 or val.find('~lcbqs-') == 0:
                        chunks = val.split('-', 1)
                        if chunks[1] == 'programs':
                            if val.find('~lcbqs-') == 0:
                                rep_val = dataset_tuple[1].lower()  # can't have "." in a tag...
                            else:
                                rep_val = dataset_tuple[0]
                        elif chunks[1] == 'builds':
                            rep_val = params['BUILD']
                        else:
                            raise Exception()
                        if val.find('~lc-') == 0:
                            rep_val = rep_val.lower()
                        use_pair[tag] = rep_val
                    else:
                        use_pair[tag] = val
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], draft_table.format(schema_release))
            # Write out the details
            success = customize_labels_and_desc(full_file_prefix, tag_map_list)
            if not success:
                print("replace_schema_tags failed")
                return False

        if 'analyze_the_schema' in steps:
            print('analyze_the_schema')
            for file_set in file_sets:
                count_name, _ = next(iter(file_set.items()))
                typing_tups = build_schema(one_big_tsv.format(count_name), params['SCHEMA_SAMPLE_SKIPS'])
                full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], draft_table.format(schema_release))
                schema_dict_loc = "{}_schema.json".format(full_file_prefix)
                build_combined_schema(None, schema_dict_loc, typing_tups,
                                      hold_schema_list.format(count_name),
                                      hold_schema_dict.format(count_name))

    bucket_target_blob_sets = {}
    for file_set in file_sets:
        count_name, _ = next(iter(file_set.items()))
        bucket_target_blob_sets[count_name] = '{}/{}-{}-{}-{}.tsv'.format(params['WORKING_BUCKET_DIR'],
                                                                          params['DATE'],
                                                                          params['PROGRAM'],
                                                                          params['DATA_TYPE'],
                                                                          count_name)

    if 'upload_to_bucket' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            upload_to_bucket(params['WORKING_BUCKET'], bucket_target_blob_sets[count_name],
                             one_big_tsv.format(count_name))

    if 'delete_all_bq' in steps:
        table_cleaner(params, file_sets, True)

    if 'create_bq_from_tsv' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], bucket_target_blob_sets[count_name])
            hold_schema_list_for_count = hold_schema_list.format(count_name)
            with open(hold_schema_list_for_count, mode='r') as schema_hold_dict:
                typed_schema = json_loads(schema_hold_dict.read())
            csv_to_bq_write_depo(typed_schema, bucket_src_url, params['SCRATCH_DATASET'],
                                 upload_table.format(count_name), params['BQ_AS_BATCH'], None)

    if 'attach_ids_to_files' in steps:
        count = 0
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            write_depo = "WRITE_TRUNCATE" if (count == 0) else "WRITE_APPEND"
            gexp_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                           upload_table.format(count_name))
            success = build_aliquot_and_case(gexp_table, params['FILEDATA_TABLE'].format(release),
                                             params['SCRATCH_DATASET'], files_to_case_table,
                                             write_depo, {}, params['BQ_AS_BATCH'])
            count += 1
            if not success:
                print("attach_ids_to_files failed")
                return

    if 'extract_platform' in steps:
        step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                        files_to_case_table)
        success = extract_platform_for_files(step2_table, params['FILEDATA_TABLE'].format(release),
                                             params['SCRATCH_DATASET'], files_to_case_w_plat_table,
                                             True, {}, params['BQ_AS_BATCH'])
        if not success:
            print("extract_platform failed")
            return

    if 'attach_barcodes_to_ids' in steps:
        step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                        files_to_case_w_plat_table)
        if params['RELEASE'] < 25:
            case_table = params['CASE_TABLE'].format("r25")
        else:
            case_table = params['CASE_TABLE'].format(release)
        success = attach_barcodes(step2_table, params['ALIQUOT_TABLE'].format(metadata_rel),
                                  case_table, params['SCRATCH_DATASET'], barcodes_table,
                                  True, params['BQ_AS_BATCH'])
        if not success:
            print("attach_barcodes_to_ids failed")
            return

    if 'merge_counts_and_metadata' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            if 'header' not in count_dict:
                print("must have defined headers to work")
                break
            header = count_dict['header']
            print(header)
            sql_dict = {}
            sql_dict['count_column'] = header.split(',')[1].strip()
            sql_dict['file_column'] = 'file_gdc_id_{}'.format(count_name)
            step3_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                            barcodes_table)
            counts_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                             upload_table.format(count_name))
            success = merge_counts_and_metadata(step3_table, counts_table,
                                                params['SCRATCH_DATASET'],
                                                counts_w_metadata_table.format(count_name),
                                                True, sql_dict, params['BQ_AS_BATCH'])
            if not success:
                print("merge_counts_and_metadata failed")
                return

    if 'merge_all' in steps:
        sql_dict = {}
        count = 0
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            dict_for_set = {}
            sql_dict['table_{}'.format(count)] = dict_for_set
            count += 1
            if 'header' not in count_dict:
                print("must have defined headers to work")
                return
            header = count_dict['header']
            dict_for_set['count_column'] = header.split(',')[1].strip()
            dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name)
            dict_for_set['table'] = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                                      counts_w_metadata_table.format(count_name))
        success = all_counts_to_one_table(params['SCRATCH_DATASET'], merged_counts_table,
                                          True, sql_dict, params['BQ_AS_BATCH'])
        if not success:
            print("merge_counts_and_metadata failed")
            return

    if 'glue_gene_names' in steps:
        sql_dict = {}
        count = 0
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            dict_for_set = {}
            sql_dict['table_{}'.format(count)] = dict_for_set
            count += 1
            if 'header' not in count_dict:
                print("must have defined headers to work")
                return
            header = count_dict['header']
            dict_for_set['count_column'] = header.split(',')[1].strip()
            dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name)
        three_counts_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                               merged_counts_table)
        success = glue_in_gene_names(three_counts_table, params['GENE_NAMES_TABLE'],
                                     params['SCRATCH_DATASET'], draft_table.format(release),
                                     True, sql_dict, params['BQ_AS_BATCH'])
        if not success:
            print("glue_gene_names failed")
            return

    #
    # Create second table
    #

    if 'create_current_table' in steps:
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                         draft_table.format(release))
        current_dest = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                         draft_table.format('current'))
        success = publish_table(source_table, current_dest)
        if not success:
            print("create current table failed")
            return

    #
    # The derived table we generate has no field descriptions. Add them from the github json files:
    #

    for table in update_schema_tables:
        schema_release = 'current' if table == 'current' else release

        if 'update_final_schema' in steps:
            success = update_schema(params['SCRATCH_DATASET'], draft_table.format(schema_release),
                                    hold_schema_dict.format('counts'))
            if not success:
                print("Schema update failed")
                return

        #
        # Add description and labels to the target table:
        #

        if 'add_table_description' in steps:
            print('update_table_description')
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], draft_table.format(schema_release))
            success = install_labels_and_desc(params['SCRATCH_DATASET'],
                                              draft_table.format(schema_release), full_file_prefix)
            if not success:
                print("update_table_description failed")
                return

    #
    # compare and remove old current table
    #

    # compare the two tables
    if 'compare_remove_old_current' in steps:
        old_current_table = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'], params['PUBLICATION_DATASET'],
                                              publication_table.format('current'))
        previous_ver_table = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                               "_".join([params['PUBLICATION_DATASET'], 'versioned']),
                                               publication_table.format("".join(["r", str(params['PREVIOUS_RELEASE'])])))
        table_temp = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                       "_".join([params['PROGRAM'],
                                                 publication_table.format("".join(["r", str(params['PREVIOUS_RELEASE'])])),
                                                 'backup']))

        print('Compare {} to {}'.format(old_current_table, previous_ver_table))
        compare = compare_two_tables(old_current_table, previous_ver_table, params['BQ_AS_BATCH'])

        num_rows = compare.total_rows

        if num_rows == 0:
            print('the tables are the same')
        else:
            print('the tables are NOT the same and differ by {} rows'.format(num_rows))

        if not compare:
            print('compare_tables failed')
            return
        # move old table to a temporary location
        elif compare and num_rows == 0:
            print('Move old table to temp location')
            table_moved = publish_table(old_current_table, table_temp)

            if not table_moved:
                print('Old Table was not moved and will not be deleted')
            # remove old table
            elif table_moved:
                print('Deleting old table: {}'.format(old_current_table))
                delete_table = delete_table_bq_job(params['PUBLICATION_DATASET'],
                                                   publication_table.format('current'))
                if not delete_table:
                    print('delete table failed')
                    return

    #
    # publish table:
    #

    if 'publish' in steps:
        print('publish tables')
        tables = ['versioned', 'current']
        for table in tables:
            if table == 'versioned':
                print(table)
                source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                                 draft_table.format(release))
                publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                                     "_".join([params['PUBLICATION_DATASET'], 'versioned']),
                                                     publication_table.format(release))
            elif table == 'current':
                print(table)
                source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['SCRATCH_DATASET'],
                                                 draft_table.format('current'))
                publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                                     params['PUBLICATION_DATASET'],
                                                     publication_table.format('current'))
            success = publish_table(source_table, publication_dest)
            if not success:
                print("publish table failed")
                return

    #
    # Update previous versioned table with archived tag
    #

    if 'update_status_tag' in steps:
        print('Update previous table')
        success = update_status_tag("_".join([params['PUBLICATION_DATASET'], 'versioned']),
                                    publication_table.format("".join(["r", str(params['PREVIOUS_RELEASE'])])),
                                    'archived')
        if not success:
            print("update status tag table failed")
            return

    if 'dump_working_tables' in steps:
        dump_tables = [files_to_case_table, files_to_case_w_plat_table, barcodes_table,
                       counts_w_metadata_table, merge_counts_and_metadata, merged_counts_table,
                       draft_table]
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            dump_tables.append(upload_table.format(count_name))
            dump_tables.append(counts_w_metadata_table.format(count_name))
            dump_tables.append(manifest_table.format(count_name))
            dump_tables.append(pull_list_table.format(count_name))
        table_cleaner(dump_tables, False)

    #
    # archive files on VM:
    #

    bucket_archive_blob_sets = {}
    for file_set in file_sets:
        count_name, _ = next(iter(file_set.items()))
        bucket_target_blob_sets[count_name] = '{}/{}-{}-{}-{}'.format(params['ARCHIVE_BUCKET_DIR'],
                                                                      params['DATE'],
                                                                      params['PROGRAM'],
                                                                      params['DATA_TYPE'],
                                                                      release, count_name)

    if 'archive' in steps:
        print('archive files from VM')
        archive_file_prefix = "{}_{}".format(date.today(), params['PUBLICATION_DATASET'])
        if params['ARCHIVE_YAML']:
            yaml_file = re.search(r"\/(\w*.yaml)$", args[1])
            archive_yaml = "{}/{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'],
                                                params['ARCHIVE_CONFIG'],
                                                archive_file_prefix,
                                                yaml_file.group(1))
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_yaml, args[1])
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            pull_file_name = params['LOCAL_PULL_LIST']
            archive_pull_file = "{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'], archive_file_prefix,
                                                  pull_file_name.format(count_name))
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_pull_file, local_pull_list.format(count_name))
            manifest_file_name = params['MANIFEST_FILE']
            archive_manifest_file = "{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'], archive_file_prefix,
                                                      manifest_file_name.format(count_name))
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_manifest_file, manifest_file.format(count_name))

    print('job completed')
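
# Each 'file_sets' entry consumed above is a single-key mapping: the key is the count flavor and the
# value holds that flavor's GDC filters plus the expected two-column header of its count files (the
# second header column becomes the count column name in the merge SQL). A purely illustrative
# fragment, not an actual config:
#
#   file_sets:
#     - htseq_counts:
#         filters:
#           - analysis.workflow_type: HTSeq - Counts
#         header: gene_id, htseq_counts
#     - fpkm_unstranded:
#         filters:
#           - analysis.workflow_type: HTSeq - FPKM
#         header: gene_id, fpkm_unstranded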
def main(args): if not confirm_google_vm(): print( 'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]' ) return if len(args) != 2: print(" ") print(" Usage : {} <configuration_yaml>".format(args[0])) return print('job started') # # Get the YAML config loaded: # with open(args[1], mode='r') as yaml_file: params, filters, bq_filters, steps, extra_cols, key_fields, callers = load_config( yaml_file.read()) if params is None: print("Bad YAML load") return # # BQ does not like to be given paths that have "~". So make all local paths absolute: # home = expanduser("~") local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR']) one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV']) manifest_file = "{}/{}".format(home, params['MANIFEST_FILE']) local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST']) file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST']) hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT']) hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST']) hold_scraped_dict = "{}/{}".format(home, params['HOLD_SCRAPED_DICT']) AUGMENTED_SCHEMA_FILE = "SchemaFiles/augmented_schema_list.json" # # Empirical evidence suggests this workflow is going to be very memory hungry if you are doing # merging, and requires at least 26 GB to be safe. Confirm that before starting! # do_merging = params['DO_MERGED_OUTPUT'] if do_merging: meminfo = dict((i.split()[0].rstrip(':'), int(i.split()[1])) for i in open('/proc/meminfo').readlines()) mem_kib = meminfo['MemTotal'] print("Machine memory: {}".format(mem_kib)) if int(mem_kib) < 26000000: print("Job requires at least 26 GB physical memory to complete") return # # Next, use the filter set to get a manifest from GDC using their API. Note that is a pull list is # provided, these steps can be omitted: # if 'build_manifest_from_filters' in steps: max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None manifest_success = get_the_bq_manifest( params['FILE_TABLE'], bq_filters, max_files, params['WORKING_PROJECT'], params['TARGET_DATASET'], params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'], params['BUCKET_MANIFEST_TSV'], manifest_file, params['BQ_AS_BATCH']) if not manifest_success: print("Failure generating manifest") return # # Best practice is to clear out the directory where the files are going. Don't want anything left over: # if 'clear_target_directory' in steps: create_clean_target(local_files_dir) # # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull # list, you can do it using IndexD calls on a manifest file, OR using BQ as long as you have # built the manifest using BQ (that route uses the BQ Manifest table that was created). # if 'build_pull_list' in steps: full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['BQ_MANIFEST_TABLE']) build_pull_list_with_bq( full_manifest, params['INDEXD_BQ_TABLE'], params['WORKING_PROJECT'], params['TARGET_DATASET'], params['BQ_PULL_LIST_TABLE'], params['WORKING_BUCKET'], params['BUCKET_PULL_LIST'], local_pull_list, params['BQ_AS_BATCH']) # # Now hitting GDC cloud buckets, not "downloading". 
Get the files in the pull list: # if 'download_from_gdc' in steps: with open(local_pull_list, mode='r') as pull_list_file: pull_list = pull_list_file.read().splitlines() pull_from_buckets(pull_list, local_files_dir) # # Traverse the tree of downloaded files and create a flat list of all files: # if 'build_traversal_list' in steps: all_files = build_file_list(local_files_dir) program_list = build_program_list(all_files) if not check_caller_list(all_files, callers): print("Unexpected caller mismatch! Expecting {}".format(callers)) return with open(file_traversal_list, mode='w') as traversal_list: for line in all_files: traversal_list.write("{}\n".format(line)) # # We can create either a table that merges identical mutations from the different callers into # one row, or keep them separate: # if do_merging: do_debug = params['DO_DEBUG_LOGGING'] target_count = int(params['EXPECTED_COLUMNS']) for program in program_list: print("Look at MAFS for {}".format(program)) if 'run_maf_reader' in steps: with open(file_traversal_list, mode='r') as traversal_list_file: all_files = traversal_list_file.read().splitlines() print("Start reading MAFS for {}".format(program)) mut_calls, hdr_pick = read_MAFs(program, all_files, params['PROGRAM_PREFIX'], extra_cols, target_count, do_debug, key_fields, params['FIRST_MAF_COL'], file_info) print("Finish reading MAFS for {}".format(program)) if 'run_maf_writer' in steps: print("Start writing MAFS for {}".format(program)) hist_count = write_MAFs(program, mut_calls, hdr_pick, callers, do_debug) for ii in range(len(hist_count)): if hist_count[ii] > 0: print(" %6d %9d " % (ii, hist_count[ii])) print("Finish writing MAFS for {}".format(program)) # # Take all the files and make one BIG TSV file to upload: # if 'concat_all_files' in steps: if do_merging: maf_list = ["mergeA." + tumor + ".maf" for tumor in program_list] concat_all_merged_files(maf_list, one_big_tsv) else: with open(file_traversal_list, mode='r') as traversal_list_file: all_files = traversal_list_file.read().splitlines() concat_all_files(all_files, one_big_tsv, params['PROGRAM_PREFIX'], extra_cols, file_info) # # Scrape the column descriptions from the GDC web page # if 'scrape_schema' in steps: scrape_list = scrape_schema(params['MAF_URL'], params['FIRST_MAF_COL']) with open(hold_scraped_dict, mode='w') as scraped_hold_list: scraped_hold_list.write(json_dumps(scrape_list)) # # For the legacy table, the descriptions had lots of analysis tidbits. Very nice, but hard to maintain. 
# We just use hardwired schema descriptions now, most directly pulled from the GDC website: # if 'build_the_schema' in steps: typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS']) build_combined_schema(hold_scraped_dict, AUGMENTED_SCHEMA_FILE, typing_tups, hold_schema_list, hold_schema_dict) # # Upload the giant TSV into a cloud bucket: # if 'upload_to_bucket' in steps: upload_to_bucket(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'], one_big_tsv) # # Create the BQ table from the TSV: # if 'create_bq_from_tsv' in steps: bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV']) with open(hold_schema_list, mode='r') as schema_hold_dict: typed_schema = json_loads(schema_hold_dict.read()) csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'], params['SKELETON_TABLE'], params['BQ_AS_BATCH']) # # Need to merge in aliquot and sample barcodes from other tables: # if 'collect_barcodes' in steps: skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['SKELETON_TABLE']) success = attach_aliquot_ids(skel_table, params['FILE_TABLE'], params['TARGET_DATASET'], params['BARCODE_STEP_1_TABLE'], params['BQ_AS_BATCH']) if not success: print("attach_aliquot_ids job failed") return step_1_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['BARCODE_STEP_1_TABLE']) success = attach_barcodes(step_1_table, params['ALIQUOT_TABLE'], params['TARGET_DATASET'], params['BARCODE_STEP_2_TABLE'], params['BQ_AS_BATCH']) if not success: print("attach_barcodes job failed") return # # Merge the barcode info into the final table we are building: # if 'create_final_table' in steps: skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['SKELETON_TABLE']) barcodes_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['BARCODE_STEP_2_TABLE']) success = final_merge(skel_table, barcodes_table, params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], params['BQ_AS_BATCH']) if not success: print("Join job failed") return # # The derived table we generate has no field descriptions. Add them from the scraped page: # if 'update_final_schema' in steps: success = update_schema(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], hold_schema_dict) if not success: print("Schema update failed") return # # Add the table description: # if 'add_table_description' in steps: desc = params['TABLE_DESCRIPTION'].format(params['MAF_URL']) update_description(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], desc) # # Clear out working temp tables: # if 'dump_working_tables' in steps: dump_table_tags = [ 'SKELETON_TABLE', 'BARCODE_STEP_1_TABLE', 'BARCODE_STEP_2_TABLE', 'BQ_MANIFEST_TABLE', 'BQ_PULL_LIST_TABLE' ] dump_tables = [params[x] for x in dump_table_tags] for table in dump_tables: delete_table_bq_job(params['TARGET_DATASET'], table) # # Done! # print('job completed')
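#
# Aside: the merged-MAF path above gates on machine memory by parsing /proc/meminfo inline.
# A standalone version of that check, shown only as a sketch (Linux-only; the 26 GB number
# simply mirrors the empirical threshold used above), could be:
#

def has_enough_memory_sketch(min_kib=26 * 1000 * 1000):
    """Return True if /proc/meminfo reports at least min_kib KiB of physical memory."""
    with open('/proc/meminfo') as meminfo:
        for line in meminfo:
            key, value = line.split()[0:2]
            if key.rstrip(':') == 'MemTotal':
                total_kib = int(value)  # /proc/meminfo reports KiB
                print("Machine memory: {} KiB".format(total_kib))
                return total_kib >= min_kib
    return False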
def main(args): if not confirm_google_vm(): print( 'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]' ) return if len(args) != 2: print(" ") print(" Usage : {} <configuration_yaml>".format(args[0])) return print('job started') # # Get the YAML config loaded: # with open(args[1], mode='r') as yaml_file: params, file_sets, steps = load_config(yaml_file.read()) # # BQ does not like to be given paths that have "~". So make all local paths absolute: # home = expanduser("~") local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR']) one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV']) manifest_file = "{}/{}".format(home, params['MANIFEST_FILE']) local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST']) file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST']) hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT']) hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST']) if 'clear_target_directory' in steps: for file_set in file_sets: count_name, _ = next(iter(file_set.items())) create_clean_target(local_files_dir.format(count_name)) if 'build_manifest_from_filters' in steps: for file_set in file_sets: count_name, count_dict = next(iter(file_set.items())) mani_for_count = manifest_file.format(count_name) table_for_count = params['BQ_MANIFEST_TABLE'].format(count_name) tsv_for_count = params['BUCKET_MANIFEST_TSV'].format(count_name) max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None manifest_success = get_the_bq_manifest( params['FILE_TABLE'], count_dict['filters'], max_files, params['WORKING_PROJECT'], params['TARGET_DATASET'], table_for_count, params['WORKING_BUCKET'], tsv_for_count, mani_for_count, params['BQ_AS_BATCH']) if not manifest_success: print("Failure generating manifest") return # # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull # list, uses BQ as long as you have built the manifest using BQ (that route uses the BQ Manifest # table that was created). # if 'build_pull_list' in steps: for file_set in file_sets: count_name, count_dict = next(iter(file_set.items())) table_for_count = params['BQ_MANIFEST_TABLE'].format(count_name) local_pull_for_count = local_pull_list.format(count_name) pull_table_for_count = params['BQ_PULL_LIST_TABLE'].format( count_name) bucket_pull_list_for_count = params['BUCKET_PULL_LIST'].format( count_name) full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], table_for_count) build_pull_list_with_bq( full_manifest, params['INDEXD_BQ_TABLE'], params['WORKING_PROJECT'], params['TARGET_DATASET'], pull_table_for_count, params['WORKING_BUCKET'], bucket_pull_list_for_count, local_pull_for_count, params['BQ_AS_BATCH']) # # Now hitting GDC cloud buckets. 
Get the files in the pull list: # if 'download_from_gdc' in steps: for file_set in file_sets: count_name, _ = next(iter(file_set.items())) pull_for_count = local_pull_list.format(count_name) with open(pull_for_count, mode='r') as pull_list_file: pull_list = pull_list_file.read().splitlines() print("Preparing to download %s files from buckets\n" % len(pull_list)) bp = BucketPuller(10) local_files_dir_for_count = local_files_dir.format(count_name) bp.pull_from_buckets(pull_list, local_files_dir_for_count) if 'build_file_list' in steps: for file_set in file_sets: count_name, _ = next(iter(file_set.items())) local_files_dir_for_count = local_files_dir.format(count_name) all_files = build_file_list(local_files_dir_for_count) file_traversal_list_for_count = file_traversal_list.format( count_name) with open(file_traversal_list_for_count, mode='w') as traversal_list: for line in all_files: traversal_list.write("{}\n".format(line)) if 'concat_all_files' in steps: for file_set in file_sets: count_name, count_dict = next(iter(file_set.items())) header = count_dict['header'] if 'header' in count_dict else None file_traversal_list_for_count = file_traversal_list.format( count_name) with open(file_traversal_list_for_count, mode='r') as traversal_list_file: all_files = traversal_list_file.read().splitlines() concat_all_files(all_files, one_big_tsv.format(count_name), header) # # Schemas and table descriptions are maintained in the github repo: # if 'pull_table_info_from_git' in steps: print('pull_table_info_from_git') try: create_clean_target(params['SCHEMA_REPO_LOCAL']) repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL']) repo.git.checkout(params['SCHEMA_REPO_BRANCH']) except Exception as ex: print("pull_table_info_from_git failed: {}".format(str(ex))) return if 'process_git_schemas' in steps: print('process_git_schema') # Where do we dump the schema git repository? 
schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'], params['RAW_SCHEMA_DIR'], params['SCHEMA_FILE_NAME']) full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE']) # Write out the details success = generate_table_detail_files(schema_file, full_file_prefix) if not success: print("process_git_schemas failed") return if 'analyze_the_schema' in steps: print('analyze_the_schema') for file_set in file_sets: count_name, _ = next(iter(file_set.items())) typing_tups = build_schema(one_big_tsv.format(count_name), params['SCHEMA_SAMPLE_SKIPS']) full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE']) schema_dict_loc = "{}_schema.json".format(full_file_prefix) build_combined_schema(None, schema_dict_loc, typing_tups, hold_schema_list.format(count_name), hold_schema_dict.format(count_name)) bucket_target_blob_sets = {} for file_set in file_sets: count_name, _ = next(iter(file_set.items())) bucket_target_blob_sets[count_name] = '{}/{}'.format( params['WORKING_BUCKET_DIR'], params['BUCKET_TSV'].format(count_name)) if 'upload_to_bucket' in steps: for file_set in file_sets: count_name, _ = next(iter(file_set.items())) upload_to_bucket(params['WORKING_BUCKET'], bucket_target_blob_sets[count_name], one_big_tsv.format(count_name)) if 'delete_all_bq' in steps: table_cleaner(params, file_sets, True) if 'create_bq_from_tsv' in steps: for file_set in file_sets: count_name, _ = next(iter(file_set.items())) bucket_src_url = 'gs://{}/{}'.format( params['WORKING_BUCKET'], bucket_target_blob_sets[count_name]) hold_schema_list_for_count = hold_schema_list.format(count_name) with open(hold_schema_list_for_count, mode='r') as schema_hold_dict: typed_schema = json_loads(schema_hold_dict.read()) csv_to_bq_write_depo(typed_schema, bucket_src_url, params['TARGET_DATASET'], params['TARGET_TABLE'].format(count_name), params['BQ_AS_BATCH'], None) if 'attach_ids_to_files' in steps: count = 0 for file_set in file_sets: count_name, _ = next(iter(file_set.items())) write_depo = "WRITE_TRUNCATE" if (count == 0) else "WRITE_APPEND" gexp_table = '{}.{}.{}'.format( params['WORKING_PROJECT'], params['TARGET_DATASET'], params['TARGET_TABLE'].format(count_name)) success = build_aliquot_and_case( gexp_table, params['FILEDATA_TABLE'], params['TARGET_DATASET'], params['STEP_2_TABLE'], write_depo, {}, params['BQ_AS_BATCH']) count += 1 if not success: print("attach_ids_to_files failed") return if 'extract_platform' in steps: step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['STEP_2_TABLE']) success = extract_platform_for_files(step2_table, params['FILEDATA_TABLE'], params['TARGET_DATASET'], params['STEP_2A_TABLE'], True, {}, params['BQ_AS_BATCH']) if not success: print("extract_platform failed") return if 'attach_barcodes_to_ids' in steps: step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['STEP_2A_TABLE']) success = attach_barcodes(step2_table, params['ALIQUOT_TABLE'], params['TARGET_DATASET'], params['STEP_3_TABLE'], True, {}, params['BQ_AS_BATCH']) if not success: print("attach_barcodes_to_ids failed") return if 'merge_counts_and_metadata' in steps: for file_set in file_sets: count_name, count_dict = next(iter(file_set.items())) if 'header' not in count_dict: print("must have defined headers to work") break header = count_dict['header'] print(header) sql_dict = {} sql_dict['count_column'] = header.split(',')[1].strip() sql_dict['file_column'] = 'file_gdc_id_{}'.format(count_name) 
step3_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['STEP_3_TABLE']) counts_table = '{}.{}.{}'.format( params['WORKING_PROJECT'], params['TARGET_DATASET'], params['TARGET_TABLE'].format(count_name)) success = merge_counts_and_metadata( step3_table, counts_table, params['TARGET_DATASET'], params['COUNTS_WITH_METADATA_TABLE'].format(count_name), True, sql_dict, params['BQ_AS_BATCH']) if not success: print("merge_counts_and_metadata failed") return if 'merge_all' in steps: sql_dict = {} count = 0 for file_set in file_sets: count_name, count_dict = next(iter(file_set.items())) dict_for_set = {} sql_dict['table_{}'.format(count)] = dict_for_set count += 1 if 'header' not in count_dict: print("must have defined headers to work") return header = count_dict['header'] dict_for_set['count_column'] = header.split(',')[1].strip() dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name) dict_for_set['table'] = '{}.{}.{}'.format( params['WORKING_PROJECT'], params['TARGET_DATASET'], params['COUNTS_WITH_METADATA_TABLE'].format(count_name)) success = all_counts_to_one_table(params['TARGET_DATASET'], params['THREE_COUNTS_TABLE'], True, sql_dict, params['BQ_AS_BATCH']) if not success: print("merge_counts_and_metadata failed") return if 'glue_gene_names' in steps: sql_dict = {} count = 0 for file_set in file_sets: count_name, count_dict = next(iter(file_set.items())) dict_for_set = {} sql_dict['table_{}'.format(count)] = dict_for_set count += 1 if 'header' not in count_dict: print("must have defined headers to work") return header = count_dict['header'] dict_for_set['count_column'] = header.split(',')[1].strip() dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name) three_counts_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['THREE_COUNTS_TABLE']) success = glue_in_gene_names(three_counts_table, params['GENE_NAMES_TABLE'], params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], True, sql_dict, params['BQ_AS_BATCH']) if not success: print("glue_gene_names failed") return # # Update the per-field descriptions: # if 'update_field_descriptions' in steps: print('update_field_descriptions') full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE']) schema_dict_loc = "{}_schema.json".format(full_file_prefix) schema_dict = {} with open(schema_dict_loc, mode='r') as schema_hold_dict: full_schema_list = json_loads(schema_hold_dict.read()) for entry in full_schema_list: schema_dict[entry['name']] = {'description': entry['description']} success = update_schema_with_dict(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], schema_dict) if not success: print("update_field_descriptions failed") return # # Add description and labels to the target table: # if 'update_table_description' in steps: print('update_table_description') full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE']) success = install_labels_and_desc(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], full_file_prefix) if not success: print("update_table_description failed") return if 'dump_working_tables' in steps: table_cleaner(params, file_sets, False) # # archive files on VM: # bucket_archive_blob_sets = {} for file_set in file_sets: count_name, _ = next(iter(file_set.items())) bucket_target_blob_sets[count_name] = '{}/{}'.format( params['ARCHIVE_BUCKET_DIR'], params['BUCKET_TSV'].format(count_name)) if 'archive' in steps: print('archive files from VM') archive_file_prefix = 
"{}_{}".format(date.today(), params['PUBLICATION_DATASET']) yaml_file = re.search(r"\/(\w*.yaml)$", args[1]) archive_yaml = "{}/{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'], params['ARCHIVE_CONFIG'], archive_file_prefix, yaml_file.group(1)) upload_to_bucket(params['ARCHIVE_BUCKET'], archive_yaml, args[1]) for file_set in file_sets: count_name, count_dict = next(iter(file_set.items())) pull_file_name = params['LOCAL_PULL_LIST'] archive_pull_file = "{}/{}_{}".format( params['ARCHIVE_BUCKET_DIR'], archive_file_prefix, pull_file_name.format(count_name)) upload_to_bucket(params['ARCHIVE_BUCKET'], archive_pull_file, local_pull_list.format(count_name)) manifest_file_name = params['MANIFEST_FILE'] archive_manifest_file = "{}/{}_{}".format( params['ARCHIVE_BUCKET_DIR'], archive_file_prefix, manifest_file_name.format(count_name)) upload_to_bucket(params['ARCHIVE_BUCKET'], archive_manifest_file, manifest_file.format(count_name)) # # publish table: # if 'publish' in steps: source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['FINAL_TARGET_TABLE']) publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'], params['PUBLICATION_DATASET'], params['PUBLICATION_TABLE']) success = publish_table(source_table, publication_dest) if not success: print("publish table failed") return print('job completed')
def main(args): if not confirm_google_vm(): print( 'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]' ) return if len(args) != 2: print(" ") print(" Usage : {} <configuration_yaml>".format(args[0])) return print('job started') # # Get the YAML config loaded: # with open(args[1], mode='r') as yaml_file: params, bq_filters, steps = load_config(yaml_file.read()) # # BQ does not like to be given paths that have "~". So make all local paths absolute: # home = expanduser("~") local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR']) one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV']) manifest_file = "{}/{}".format(home, params['MANIFEST_FILE']) local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST']) file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST']) hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT']) hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST']) # Schema that describes CNVR table: AUGMENTED_SCHEMA_FILE = "SchemaFiles/cnvr_augmented_schema_list.json" # # Use the filter set to get a manifest from GDC using their API. Note that if a pull list is # provided, these steps can be omitted: # if 'build_manifest_from_filters' in steps: max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None manifest_success = get_the_bq_manifest( params['FILE_TABLE'], bq_filters, max_files, params['WORKING_PROJECT'], params['TARGET_DATASET'], params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'], params['BUCKET_MANIFEST_TSV'], manifest_file, params['BQ_AS_BATCH']) if not manifest_success: print("Failure generating manifest") return if 'clear_target_directory' in steps: create_clean_target(local_files_dir) # # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull # list, uses BQ as long as you have built the manifest using BQ (that route uses the BQ Manifest # table that was created). # if 'build_pull_list' in steps: full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['BQ_MANIFEST_TABLE']) build_pull_list_with_bq( full_manifest, params['INDEXD_BQ_TABLE'], params['WORKING_PROJECT'], params['TARGET_DATASET'], params['BQ_PULL_LIST_TABLE'], params['WORKING_BUCKET'], params['BUCKET_PULL_LIST'], local_pull_list, params['BQ_AS_BATCH']) # # Now hitting GDC cloud buckets. 
Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        bp = BucketPuller(10)
        bp.pull_from_buckets(pull_list, local_files_dir)

    if 'build_file_list' in steps:
        all_files = build_file_list(local_files_dir)
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    if 'concat_all_files' in steps:
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        concat_all_files(all_files, one_big_tsv)

    if 'build_the_schema' in steps:
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, AUGMENTED_SCHEMA_FILE,
                              typing_tups, hold_schema_list, hold_schema_dict)

    bucket_target_blob = '{}/{}'.format(params['WORKING_BUCKET_DIR'], params['BUCKET_TSV'])

    if 'upload_to_bucket' in steps:
        # Upload the home-anchored TSV that the concat step actually wrote:
        upload_to_bucket(params['WORKING_BUCKET'], bucket_target_blob, one_big_tsv)

    if 'create_bq_from_tsv' in steps:
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], bucket_target_blob)
        with open(hold_schema_list, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['TARGET_TABLE'], params['BQ_AS_BATCH'])

    if 'add_aliquot_fields' in steps:
        full_target_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['TARGET_DATASET'],
                                              params['TARGET_TABLE'])
        success = join_with_aliquot_table(full_target_table, params['ALIQUOT_TABLE'],
                                          params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = ['TARGET_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    print('job completed')
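#
# Aside: csv_to_bq() above is also shared support code. For reference, loading a tab-delimited
# file from the working bucket into BigQuery with an explicit schema reduces to a load job like
# the sketch below (google-cloud-bigquery assumed; treating typed_schema entries as dicts with
# "name", "type", and "description" keys is an assumption, not a documented contract):
#

from google.cloud import bigquery


def load_tsv_sketch(typed_schema, gcs_uri, project, dataset, table_name):
    """Load a header-bearing TSV from GCS into dataset.table_name using typed_schema."""
    client = bigquery.Client(project=project)
    schema = [bigquery.SchemaField(f['name'], f['type'], description=f.get('description', ''))
              for f in typed_schema]
    job_config = bigquery.LoadJobConfig(
        schema=schema,
        source_format=bigquery.SourceFormat.CSV,
        field_delimiter='\t',
        skip_leading_rows=1,                 # the concatenated TSV carries a header row
        write_disposition='WRITE_TRUNCATE')  # reruns replace the working table
    table_id = '{}.{}.{}'.format(project, dataset, table_name)
    load_job = client.load_table_from_uri(gcs_uri, table_id, job_config=job_config)
    load_job.result()  # wait for completion; raises on load errors
    return True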
def main(args): if not confirm_google_vm(): print( 'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]' ) return if len(args) != 2: print(" ") print(" Usage : {} <configuration_yaml>".format(args[0])) return print('job started') # # Get the YAML config loaded: # with open(args[1], mode='r') as yaml_file: params, bq_filters, steps, extra_cols = load_config(yaml_file.read()) if params is None: print("Bad YAML load") return # Schema that describes table columns: AUGMENTED_SCHEMA_FILE = "SchemaFiles/isoform_augmented_schema_list.json" # # BQ does not like to be given paths that have "~". So make all local paths absolute: # home = expanduser("~") local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR']) one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV']) manifest_file = "{}/{}".format(home, params['MANIFEST_FILE']) local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST']) file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST']) hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT']) hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST']) # # Use the filter set to get a manifest from GDC using their API. Note that is a pull list is # provided, these steps can be omitted: # if 'build_manifest_from_filters' in steps: max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None manifest_success = get_the_bq_manifest( params['FILE_TABLE'], bq_filters, max_files, params['WORKING_PROJECT'], params['TARGET_DATASET'], params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'], params['BUCKET_MANIFEST_TSV'], manifest_file, params['BQ_AS_BATCH']) if not manifest_success: print("Failure generating manifest") return # # Best practice is to clear out the directory where the files are going. Don't want anything left over: # if 'clear_target_directory' in steps: create_clean_target(local_files_dir) # # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull # list, uses BQ as long as you have built the manifest using BQ (that route uses the BQ Manifest # table that was created). # if 'build_pull_list' in steps: full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['BQ_MANIFEST_TABLE']) build_pull_list_with_bq( full_manifest, params['INDEXD_BQ_TABLE'], params['WORKING_PROJECT'], params['TARGET_DATASET'], params['BQ_PULL_LIST_TABLE'], params['WORKING_BUCKET'], params['BUCKET_PULL_LIST'], local_pull_list, params['BQ_AS_BATCH']) # # Now hitting GDC cloud buckets. 
Get the files in the pull list: # if 'download_from_gdc' in steps: with open(local_pull_list, mode='r') as pull_list_file: pull_list = pull_list_file.read().splitlines() print("Preparing to download %s files from buckets\n" % len(pull_list)) bp = BucketPuller(10) bp.pull_from_buckets(pull_list, local_files_dir) # # Traverse the tree of downloaded files and create a flat list of all files: # if 'build_traversal_list' in steps: all_files = build_file_list(local_files_dir) with open(file_traversal_list, mode='w') as traversal_list: for line in all_files: traversal_list.write("{}\n".format(line)) # # Take all the files and make one BIG TSV file to upload: # if 'concat_all_files' in steps: with open(file_traversal_list, mode='r') as traversal_list_file: all_files = traversal_list_file.read().splitlines() concat_all_files(all_files, one_big_tsv, params['PROGRAM_PREFIX'], extra_cols, file_info, split_col_func) # # For the legacy table, the descriptions had lots of analysis tidbits. Very nice, but hard to maintain. # We just use hardwired schema descriptions now, most directly pulled from the GDC website: # if 'build_the_schema' in steps: typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS']) build_combined_schema(None, AUGMENTED_SCHEMA_FILE, typing_tups, hold_schema_list, hold_schema_dict) # # Upload the giant TSV into a cloud bucket: # if 'upload_to_bucket' in steps: upload_to_bucket(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'], one_big_tsv) # # Create the BQ table from the TSV: # if 'create_bq_from_tsv' in steps: bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV']) with open(hold_schema_list, mode='r') as schema_hold_dict: typed_schema = json_loads(schema_hold_dict.read()) csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'], params['SKELETON_TABLE'], params['BQ_AS_BATCH']) # # Need to merge in aliquot and sample barcodes from other tables: # if 'collect_barcodes' in steps: skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['SKELETON_TABLE']) success = attach_aliquot_ids(skel_table, params['FILE_TABLE'], params['TARGET_DATASET'], params['BARCODE_STEP_1_TABLE'], params['BQ_AS_BATCH']) if not success: print("attach_aliquot_ids job failed") return step_1_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['BARCODE_STEP_1_TABLE']) success = attach_barcodes(step_1_table, params['ALIQUOT_TABLE'], params['TARGET_DATASET'], params['BARCODE_STEP_2_TABLE'], params['BQ_AS_BATCH']) if not success: print("attach_barcodes job failed") return # # Merge the barcode info into the final table we are building: # if 'create_final_table' in steps: skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['SKELETON_TABLE']) barcodes_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['BARCODE_STEP_2_TABLE']) success = final_merge(skel_table, barcodes_table, params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], params['BQ_AS_BATCH']) if not success: print("Join job failed") return # # The derived table we generate has no field descriptions. 
Add them from the saved schema dictionary (no page scraping in this workflow):
    #

    if 'update_final_schema' in steps:
        success = update_schema(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'],
                                hold_schema_dict)
        if not success:
            print("Schema update failed")
            return

    #
    # Add the table description:
    #

    if 'add_table_description' in steps:
        update_description(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'],
                           params['TABLE_DESCRIPTION'])

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = ['SKELETON_TABLE', 'BARCODE_STEP_1_TABLE', 'BARCODE_STEP_2_TABLE',
                           'BQ_MANIFEST_TABLE', 'BQ_PULL_LIST_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    #
    # Done!
    #

    print('job completed')
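#
# Aside: build_schema() above samples the big TSV to guess a BigQuery type for each column,
# with SCHEMA_SAMPLE_SKIPS controlling the sampling. A toy version of that inference, shown
# only to make the idea concrete (its output format does not match the real helper), could be:
#

import csv


def guess_column_types_sketch(tsv_path, sample_skip=1000):
    """Return {column: 'INTEGER'|'FLOAT'|'STRING'} guessed from every sample_skip-th row."""
    def classify(value):
        for caster, label in ((int, 'INTEGER'), (float, 'FLOAT')):
            try:
                caster(value)
                return label
            except ValueError:
                pass
        return 'STRING'

    order = ['INTEGER', 'FLOAT', 'STRING']
    guesses = {}
    with open(tsv_path) as tsv_in:
        reader = csv.DictReader(tsv_in, delimiter='\t')
        for row_num, row in enumerate(reader):
            if row_num % sample_skip != 0:
                continue
            for col, value in row.items():
                seen = guesses.setdefault(col, 'INTEGER')
                # STRING beats FLOAT beats INTEGER once any sampled value fails the stricter parse:
                guesses[col] = max(seen, classify(value), key=order.index)
    return guesses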
def main(args):
    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    local_file = "{}/{}".format(home, params['DOWNLOAD_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    if 'clear_target_directory' in steps:
        print('clear_target_directory')
        create_clean_target(local_files_dir)

    if 'build_pull_list' in steps:
        bucket_to_local(params['WORKING_BUCKET'], params['COSMIC_FILE'], local_file)
        print('build_pull_list')
        success = build_pull_list_from_txt(local_file, local_pull_list, params['VERSION'])
        if not success:
            print("Build pull list failed")
            return

    if 'download_from_cosmic' in steps:
        print("Download from Sanger")
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download {} files from AWS buckets\n".format(len(pull_list)))
        for line in pull_list:
            file_name, url = line.split('\t')
            file_location = ''.join([local_files_dir, "/", file_name])
            with open(file_location, mode='wb') as data_file:
                response = requests.get(url)
                if response.status_code == 200:
                    data_file.write(response.content)
                    # add an unzip step & dump zip file
                else:
                    print("Download failed.
Problem downloading {}".format( file_name)) return file, ext = os.path.splitext(file_name.split('/')[-1]) new_file_location = ''.join([local_files_dir, "/", file]) if ext == ".gz": # Unzip the file and remove zip file print("Uncompressing {}".format(file)) with gzip.open(file_location, "rb") as gzip_in: with open(new_file_location, "wb") as uncomp_out: shutil.copyfileobj(gzip_in, uncomp_out) os.remove(file_location) else: print("{} doesn't need to be uncompressed".format(file)) if 'build_file_list' in steps: print('build_file_list') all_files = build_file_list(local_files_dir) with open(file_traversal_list, mode='w') as traversal_list: for line in all_files: traversal_list.write("{}\n".format(line)) # # Schemas and table descriptions are maintained in the github repo: # if 'pull_table_info_from_git' in steps: print('pull_table_info_from_git') try: create_clean_target(params['SCHEMA_REPO_LOCAL']) repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL']) repo.git.checkout(params['SCHEMA_REPO_BRANCH']) except Exception as ex: print("pull_table_info_from_git failed: {}".format(str(ex))) return if 'process_and_create_schema' in steps: with open(file_traversal_list, mode='r') as traversal_list_file: all_files = traversal_list_file.read().splitlines() for line in all_files: file_name, ext = os.path.splitext(line.split('/')[-1]) file_components = file_name.split("_") data_type = "_".join(file_components[0:(len(file_components) - 2)]) version = ''.join(['VERSION ', file_components[-1]]) hg = 'hg19' if file_components[-2] == 'GRCh37' else 'hg38' schema_tags = { '---tag-ref-genome-0---': hg, '---tag-release---': version } if 'process_git_schemas' in steps: print('process_git_schema: {}'.format(line)) # Where do we dump the schema git repository? schema_file_name = ''.join([data_type, ".json"]) print("schema_file_name: " + schema_file_name) schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'], params['RAW_SCHEMA_DIR'], schema_file_name) full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], schema_file_name) print(schema_file + "\t" + full_file_prefix) # Write out the details success = generate_table_detail_files(schema_file, full_file_prefix) if not success: print("process_git_schemas failed") return # Customize generic schema to this data program: if 'replace_schema_tags' in steps: print('replace_schema_tags') version = ''.join(['VERSION ', file_components[-1]]) hg = 'hg19' if file_components[-2] == 'GRCh37' else 'hg38' schema_tags = { '---tag-ref-genome-0---': hg, '---tag-release---': version } tag_map_list = [] for tag in schema_tags: use_pair = {tag: schema_tags[tag]} tag_map_list.append(use_pair) full_file_prefix = "{}/{}".format( params['PROX_DESC_PREFIX'], '_'.join(file_components[:-2])) # Write out the details success = customize_labels_and_desc(full_file_prefix, tag_map_list) if not success: print("replace_schema_tags failed") return # Create BQ tables if 'create_bq_tables' in steps: with open(file_traversal_list, mode='r') as traversal_list_file: all_files = traversal_list_file.read().splitlines() for line in all_files: file = line.split('/')[-1] file_name, ext = os.path.splitext(file) file_components = file_name.split("_") data_type = "_".join(file_components[0:(len(file_components) - 2)]) bucket_target_blob = '{}/{}'.format(params['WORKING_BUCKET_DIR'], file) if 'upload_to_bucket' in steps: print('upload_to_bucket') upload_to_bucket(params['WORKING_BUCKET'], bucket_target_blob, line)
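#
# Aside: the replace_schema_tags branch above swaps "---tag-...---" placeholders in the generic
# schema files for concrete values (reference genome, release) via customize_labels_and_desc().
# As a plain illustration of the substitution itself (file layout and tag names are assumptions
# drawn from the code above, not the repo's API), a file-level version might be:
#

def replace_schema_tags_sketch(schema_path, schema_tags):
    """Rewrite schema_path in place, replacing each '---tag-...---' key with its value."""
    with open(schema_path, mode='r') as schema_in:
        text = schema_in.read()
    for tag, value in schema_tags.items():
        text = text.replace(tag, value)
    with open(schema_path, mode='w') as schema_out:
        schema_out.write(text)

# e.g. replace_schema_tags_sketch(full_file_prefix + "_schema.json",
#                                 {'---tag-ref-genome-0---': 'hg38',
#                                  '---tag-release---': 'VERSION 91'})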
def main(args): if not confirm_google_vm(): print( 'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]' ) return if len(args) != 2: print(" ") print(" Usage : {} <configuration_yaml>".format(args[0])) return print('job started') # # Get the YAML config loaded: # with open(args[1], mode='r') as yaml_file: params, file_sets, steps = load_config(yaml_file.read()) # # BQ does not like to be given paths that have "~". So make all local paths absolute: # home = expanduser("~") local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR']) one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV']) manifest_file = "{}/{}".format(home, params['MANIFEST_FILE']) local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST']) file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST']) hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT']) hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST']) if 'clear_target_directory' in steps: for file_set in file_sets: count_name, _ = next(iter(file_set.items())) create_clean_target(local_files_dir.format(count_name)) if 'build_manifest_from_filters' in steps: for file_set in file_sets: count_name, count_dict = next(iter(file_set.items())) mani_for_count = manifest_file.format(count_name) table_for_count = params['BQ_MANIFEST_TABLE'].format(count_name) tsv_for_count = params['BUCKET_MANIFEST_TSV'].format(count_name) max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None manifest_success = get_the_bq_manifest( params['FILE_TABLE'], count_dict['filters'], max_files, params['WORKING_PROJECT'], params['TARGET_DATASET'], table_for_count, params['WORKING_BUCKET'], tsv_for_count, mani_for_count, params['BQ_AS_BATCH']) if not manifest_success: print("Failure generating manifest") return # # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull # list, uses BQ as long as you have built the manifest using BQ (that route uses the BQ Manifest # table that was created). # if 'build_pull_list' in steps: for file_set in file_sets: count_name, count_dict = next(iter(file_set.items())) table_for_count = params['BQ_MANIFEST_TABLE'].format(count_name) local_pull_for_count = local_pull_list.format(count_name) pull_table_for_count = params['BQ_PULL_LIST_TABLE'].format( count_name) bucket_pull_list_for_count = params['BUCKET_PULL_LIST'].format( count_name) full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], table_for_count) build_pull_list_with_bq( full_manifest, params['INDEXD_BQ_TABLE'], params['WORKING_PROJECT'], params['TARGET_DATASET'], pull_table_for_count, params['WORKING_BUCKET'], bucket_pull_list_for_count, local_pull_for_count, params['BQ_AS_BATCH']) # # Now hitting GDC cloud buckets. 
Get the files in the pull list: # if 'download_from_gdc' in steps: for file_set in file_sets: count_name, _ = next(iter(file_set.items())) pull_for_count = local_pull_list.format(count_name) with open(pull_for_count, mode='r') as pull_list_file: pull_list = pull_list_file.read().splitlines() print("Preparing to download %s files from buckets\n" % len(pull_list)) bp = BucketPuller(10) local_files_dir_for_count = local_files_dir.format(count_name) bp.pull_from_buckets(pull_list, local_files_dir_for_count) if 'build_file_list' in steps: for file_set in file_sets: count_name, _ = next(iter(file_set.items())) local_files_dir_for_count = local_files_dir.format(count_name) all_files = build_file_list(local_files_dir_for_count) file_traversal_list_for_count = file_traversal_list.format( count_name) with open(file_traversal_list_for_count, mode='w') as traversal_list: for line in all_files: traversal_list.write("{}\n".format(line)) if 'concat_all_files' in steps: for file_set in file_sets: count_name, count_dict = next(iter(file_set.items())) header = count_dict['header'] if 'header' in count_dict else None file_traversal_list_for_count = file_traversal_list.format( count_name) with open(file_traversal_list_for_count, mode='r') as traversal_list_file: all_files = traversal_list_file.read().splitlines() concat_all_files(all_files, one_big_tsv.format(count_name), header) if 'build_the_schema' in steps: for file_set in file_sets: count_name, _ = next(iter(file_set.items())) typing_tups = build_schema(one_big_tsv.format(count_name), params['SCHEMA_SAMPLE_SKIPS']) for tup in typing_tups: print(tup) hold_schema_list_for_count = hold_schema_list.format(count_name) typing_tups_to_schema_list(typing_tups, hold_schema_list_for_count) #hold_schema_list_for_count = hold_schema_list.format(count_name) #hold_schema_dict_for_count = hold_schema_dict.format(count_name) ## build_combined_schema(None, AUGMENTED_SCHEMA_FILE, # typing_tups, hold_schema_list_for_count, hold_schema_dict_for_count) bucket_target_blob_sets = {} for file_set in file_sets: count_name, _ = next(iter(file_set.items())) bucket_target_blob_sets[count_name] = '{}/{}'.format( params['WORKING_BUCKET_DIR'], params['BUCKET_TSV'].format(count_name)) if 'upload_to_bucket' in steps: for file_set in file_sets: count_name, _ = next(iter(file_set.items())) upload_to_bucket(params['WORKING_BUCKET'], bucket_target_blob_sets[count_name], one_big_tsv.format(count_name)) if 'delete_all_bq' in steps: table_cleaner(params, file_sets, True) if 'create_bq_from_tsv' in steps: for file_set in file_sets: count_name, _ = next(iter(file_set.items())) bucket_src_url = 'gs://{}/{}'.format( params['WORKING_BUCKET'], bucket_target_blob_sets[count_name]) hold_schema_list_for_count = hold_schema_list.format(count_name) with open(hold_schema_list_for_count, mode='r') as schema_hold_dict: typed_schema = json_loads(schema_hold_dict.read()) csv_to_bq_write_depo(typed_schema, bucket_src_url, params['TARGET_DATASET'], params['TARGET_TABLE'].format(count_name), params['BQ_AS_BATCH'], None) if 'attach_ids_to_files' in steps: count = 0 for file_set in file_sets: count_name, _ = next(iter(file_set.items())) write_depo = "WRITE_TRUNCATE" if (count == 0) else "WRITE_APPEND" gexp_table = '{}.{}.{}'.format( params['WORKING_PROJECT'], params['TARGET_DATASET'], params['TARGET_TABLE'].format(count_name)) success = build_aliquot_and_case( gexp_table, params['FILEDATA_TABLE'], params['TARGET_DATASET'], params['STEP_2_TABLE'], write_depo, {}, params['BQ_AS_BATCH']) count += 1 if not success: 
print("attach_ids_to_files failed") return if 'extract_platform' in steps: step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['STEP_2_TABLE']) success = extract_platform_for_files(step2_table, params['FILEDATA_TABLE'], params['TARGET_DATASET'], params['STEP_2A_TABLE'], True, {}, params['BQ_AS_BATCH']) if not success: print("extract_platform failed") return if 'attach_barcodes_to_ids' in steps: step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['STEP_2A_TABLE']) success = attach_barcodes(step2_table, params['ALIQUOT_TABLE'], params['TARGET_DATASET'], params['STEP_3_TABLE'], True, {}, params['BQ_AS_BATCH']) if not success: print("attach_barcodes_to_ids failed") return if 'merge_counts_and_metadata' in steps: for file_set in file_sets: count_name, count_dict = next(iter(file_set.items())) if 'header' not in count_dict: print("must have defined headers to work") break header = count_dict['header'] print(header) sql_dict = {} sql_dict['count_column'] = header.split(',')[1].strip() sql_dict['file_column'] = 'file_gdc_id_{}'.format(count_name) step3_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['STEP_3_TABLE']) counts_table = '{}.{}.{}'.format( params['WORKING_PROJECT'], params['TARGET_DATASET'], params['TARGET_TABLE'].format(count_name)) success = merge_counts_and_metadata( step3_table, counts_table, params['TARGET_DATASET'], params['COUNTS_WITH_METADATA_TABLE'].format(count_name), True, sql_dict, params['BQ_AS_BATCH']) if not success: print("merge_counts_and_metadata failed") return if 'merge_all' in steps: sql_dict = {} count = 0 for file_set in file_sets: count_name, count_dict = next(iter(file_set.items())) dict_for_set = {} sql_dict['table_{}'.format(count)] = dict_for_set count += 1 if 'header' not in count_dict: print("must have defined headers to work") return header = count_dict['header'] dict_for_set['count_column'] = header.split(',')[1].strip() dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name) dict_for_set['table'] = '{}.{}.{}'.format( params['WORKING_PROJECT'], params['TARGET_DATASET'], params['COUNTS_WITH_METADATA_TABLE'].format(count_name)) success = all_counts_to_one_table(params['TARGET_DATASET'], params['THREE_COUNTS_TABLE'], True, sql_dict, params['BQ_AS_BATCH']) if not success: print("merge_counts_and_metadata failed") return if 'glue_gene_names' in steps: sql_dict = {} count = 0 for file_set in file_sets: count_name, count_dict = next(iter(file_set.items())) dict_for_set = {} sql_dict['table_{}'.format(count)] = dict_for_set count += 1 if 'header' not in count_dict: print("must have defined headers to work") return header = count_dict['header'] dict_for_set['count_column'] = header.split(',')[1].strip() dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name) three_counts_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'], params['THREE_COUNTS_TABLE']) success = glue_in_gene_names(three_counts_table, params['GENE_NAMES_TABLE'], params['TARGET_DATASET'], params['FINAL_FINAL_TABLE'], True, sql_dict, params['BQ_AS_BATCH']) if not success: print("glue_gene_names failed") return if 'dump_working_tables' in steps: table_cleaner(params, file_sets, False) print('job completed')