def main(args):
    """Apply schema/description fixes to a list of BigQuery tables.

    args: command-line argument vector; args[1] is the path to the YAML
    configuration file.  The config supplies ``params`` (settings) and
    ``steps`` (which stages to run).  Each entry of params['FIX_LIST'] is a
    one-item mapping of {table_name: schema_repo_file}.
    """
    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #
    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    #
    # Schemas and table descriptions are maintained in the github repo:
    #
    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    # FIX: the loop variable was previously named "dict", shadowing the builtin.
    for fix_entry in params['FIX_LIST']:
        table, repo_file = next(iter(fix_entry.items()))

        if 'process_git_schemas' in steps:
            print('process_git_schemas: {}'.format(table))
            # Where do we dump the schema git repository?
            schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                            params['RAW_SCHEMA_DIR'], repo_file)
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], table)
            # Write out the details
            success = generate_table_detail_files(schema_file, full_file_prefix)
            if not success:
                print("process_git_schemas failed")
                return

        #
        # Update the per-field descriptions:
        #
        if 'update_field_descriptions' in steps:
            print('update_field_descriptions: {}'.format(table))
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], table)
            schema_dict_loc = "{}_schema.json".format(full_file_prefix)
            schema_dict = {}
            with open(schema_dict_loc, mode='r') as schema_hold_dict:
                full_schema_list = json_loads(schema_hold_dict.read())
            for entry in full_schema_list:
                schema_dict[entry['name']] = {'description': entry['description']}
            # Table names are "dataset.table"; split once to get both halves:
            set_and_table = table.split('.', maxsplit=1)
            success = update_schema_with_dict(set_and_table[0], set_and_table[1],
                                              schema_dict,
                                              project=params['TARGET_PROJECT'])
            if not success:
                print("update_field_descriptions failed")
                return

        #
        # Add description and labels to the target table:
        #
        if 'update_table_description' in steps:
            print('update_table_description: {}'.format(table))
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], table)
            set_and_table = table.split('.', maxsplit=1)
            success = install_labels_and_desc(set_and_table[0], set_and_table[1],
                                              full_file_prefix,
                                              project=params['TARGET_PROJECT'])
            if not success:
                print("update_table_description failed")
                return

    print('job completed')
def main(args):
    """Build the miRNA BigQuery table from GDC files, end to end.

    args: command-line argument vector; args[1] is the path to the YAML
    configuration file supplying ``params``, ``bq_filters``, ``steps`` and
    ``extra_cols``.  Steps run in order: manifest -> pull list -> download ->
    concat -> schema -> load -> barcode merge -> schema/description install ->
    cleanup -> publish.
    """
    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #
    with open(args[1], mode='r') as yaml_file:
        params, bq_filters, steps, extra_cols = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    # Schema that describes table columns:
    AUGMENTED_SCHEMA_FILE = "SchemaFiles/mirna_augmented_schema_list.json"

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #
    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    #
    # Best practice is to clear out the directory where the files are going. Don't want anything left over.
    # Also creates the destination directory
    #
    if 'clear_target_directory' in steps:
        create_clean_target(local_files_dir)

    #
    # Use the filter set to get a manifest. Note that is a pull list is
    # provided, these steps can be omitted:
    #
    if 'build_manifest_from_filters' in steps:
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
        manifest_success = get_the_bq_manifest(params['FILE_TABLE'], bq_filters, max_files,
                                               params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                               params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
                                               params['BUCKET_MANIFEST_TSV'], manifest_file,
                                               params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # If you have already created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step.
    #
    if 'build_pull_list' in steps:
        full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                          params['TARGET_DATASET'],
                                          params['BQ_MANIFEST_TABLE'])
        # NOTE(review): the return value of build_pull_list_with_bq is not checked
        # here; its success contract is not visible from this file — confirm.
        build_pull_list_with_bq(full_manifest, params['INDEXD_BQ_TABLE'],
                                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                params['BQ_PULL_LIST_TABLE'],
                                params['WORKING_BUCKET'],
                                params['BUCKET_PULL_LIST'],
                                local_pull_list, params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets, not "downloading". Get the files in the pull list:
    #
    if 'download_from_gdc' in steps:
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        bp = BucketPuller(10)
        bp.pull_from_buckets(pull_list, local_files_dir)

    #
    # Traverse the tree of downloaded files and create a flat list of all files:
    #
    if 'build_traversal_list' in steps:
        all_files = build_file_list(local_files_dir)
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    #
    # Take all the files and make one BIG TSV file to upload:
    #
    if 'concat_all_files' in steps:
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        # NOTE(review): `file_info` is not defined in this function — presumably a
        # module-level helper; confirm it is in scope at the file level.
        concat_all_files(all_files, one_big_tsv,
                         params['PROGRAM_PREFIX'], extra_cols, file_info, None)

    #
    # For the legacy table, the descriptions had lots of analysis tidbits. Very nice, but hard to maintain.
    # We just use hardwired schema descriptions now, most directly pulled from the GDC website:
    #
    if 'build_the_schema' in steps:
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, AUGMENTED_SCHEMA_FILE,
                              typing_tups, hold_schema_list, hold_schema_dict)

    #
    # Upload the giant TSV into a cloud bucket:
    #
    if 'upload_to_bucket' in steps:
        upload_to_bucket(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'], one_big_tsv)

    #
    # Create the BQ table from the TSV:
    #
    if 'create_bq_from_tsv' in steps:
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], params['BUCKET_SKEL_TSV'])
        with open(hold_schema_list, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        # FIX: the success flag of csv_to_bq was silently discarded; sibling
        # scripts in this file check it, so do the same here.
        success = csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                            params['SKELETON_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("csv_to_bq job failed")
            return

    #
    # Need to merge in aliquot and sample barcodes from other tables:
    #
    if 'collect_barcodes' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])
        success = attach_aliquot_ids(skel_table, params['FILE_TABLE'],
                                     params['TARGET_DATASET'],
                                     params['BARCODE_STEP_1_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("attach_aliquot_ids job failed")
            return

        step_1_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['BARCODE_STEP_1_TABLE'])
        success = attach_barcodes(step_1_table, params['ALIQUOT_TABLE'],
                                  params['TARGET_DATASET'],
                                  params['BARCODE_STEP_2_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("attach_barcodes job failed")
            return

    #
    # Merge the barcode info into the final table we are building:
    #
    if 'create_final_table' in steps:
        skel_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                       params['TARGET_DATASET'],
                                       params['SKELETON_TABLE'])
        barcodes_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                           params['TARGET_DATASET'],
                                           params['BARCODE_STEP_2_TABLE'])
        success = final_merge(skel_table, barcodes_table,
                              params['TARGET_DATASET'],
                              params['FINAL_TARGET_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            return

    #
    # Schemas and table descriptions are maintained in the github repo:
    #
    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                        params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        typing_tups = build_schema(one_big_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        build_combined_schema(None, schema_dict_loc,
                              typing_tups, hold_schema_list, hold_schema_dict)

    #
    # Update the per-field descriptions:
    #
    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}
        success = update_schema_with_dict(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'], schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #
    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'], full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    #
    # Clear out working temp tables:
    #
    if 'dump_working_tables' in steps:
        dump_table_tags = ['SKELETON_TABLE', 'BARCODE_STEP_1_TABLE', 'BARCODE_STEP_2_TABLE',
                           'BQ_MANIFEST_TABLE', 'BQ_PULL_LIST_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    #
    # publish table:
    #
    if 'publish' in steps:
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                             params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])
        success = publish_table(source_table, publication_dest)
        if not success:
            print("publish table failed")
            return

    print('job completed')
def main(args):
    """Load DCF manifest TSVs into BigQuery and derive file-map tables.

    args: command-line argument vector; args[1] names the YAML configuration
    file.  DO_ACTIVE / DO_LEGACY in the config select which manifest flavors
    (active, legacy, or both) get processed by each step.
    """
    #if not confirm_google_vm():
    #    print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
    #    return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #
    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    # Schema that describes DCF manifests:
    MANIFEST_SCHEMA_LIST = "SchemaFiles/dcf_manifest_schema.json"

    # Schema that describes our final map table:
    FILE_MAP_SCHEMA_LIST = "SchemaFiles/dcf_file_map_schema.json"

    #
    # Decide if we are doing active, legacy, or both manifests.  Each mapping
    # sends a params key for an input to the params key for its output:
    #
    mani_dict = {}
    map_dict = {}
    if params['DO_ACTIVE']:
        mani_dict['ACTIVE_MANIFEST_TSV'] = 'ACTIVE_MANIFEST_BQ'
        map_dict['ACTIVE_MANIFEST_BQ'] = 'ACTIVE_FILE_MAP_BQ'
    if params['DO_LEGACY']:
        mani_dict['LEGACY_MANIFEST_TSV'] = 'LEGACY_MANIFEST_BQ'
        map_dict['LEGACY_MANIFEST_BQ'] = 'LEGACY_FILE_MAP_BQ'

    #
    # Create a manifest BQ table from a TSV:
    #
    if 'create_bq_manifest_from_tsv' in steps:
        with open(MANIFEST_SCHEMA_LIST, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        for tsv_key, manifest_bq_key in mani_dict.items():
            bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], params[tsv_key])
            success = csv_to_bq(typed_schema, bucket_src_url,
                                params['TARGET_DATASET'],
                                params[manifest_bq_key], params['BQ_AS_BATCH'])
            if not success:
                print("create_bq_manifest_from_tsv failed")
                return

    #
    # Create the file map tables:
    #
    if 'create_file_map_bq' in steps:
        for manifest_bq_key, file_map_key in map_dict.items():
            mani_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                           params['TARGET_DATASET'],
                                           params[manifest_bq_key])
            success = build_file_map(mani_table, params['TARGET_DATASET'],
                                     params[file_map_key], params['BQ_AS_BATCH'])
            if not success:
                print("create_file_map_bq failed")
                return
            # Install a schema in the new table:
            schema_dict = schema_list_to_dict(FILE_MAP_SCHEMA_LIST)
            success = update_schema_with_dict(params['TARGET_DATASET'],
                                              params[file_map_key], schema_dict)
            if not success:
                print("install file map schema failed")
                return

    #
    # Add descriptions
    #
    if 'add_table_descriptions' in steps:
        for manifest_bq_key, file_map_key in map_dict.items():
            success = update_description(params['TARGET_DATASET'], params[manifest_bq_key],
                                         params['DCF_MANIFEST_TABLE_DESCRIPTION'])
            if not success:
                print("install manifest description failed")
                return
            success = update_description(params['TARGET_DATASET'], params[file_map_key],
                                         params['FILE_MAP_TABLE_DESCRIPTION'])
            if not success:
                print("install file map description failed")
                return

    print('job completed')
def main(args):
    """Pull PDC program/case/sample/aliquot data and build BigQuery tables.

    args: command-line argument vector; args[1] names the YAML configuration
    file supplying ``params`` and ``steps``.  Pulls TSVs from the PDC API,
    uploads them, loads BQ tables, joins with the aliquot table, then installs
    schema descriptions and publishes.
    """
    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #
    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())

    # FIX: sibling scripts guard against a bad YAML load; this one did not and
    # would crash on the first params[...] access.
    if params is None:
        print("Bad YAML load")
        return

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #
    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    prog_tsv = "{}/{}".format(home, params['PROG_TSV'])
    case_tsv = "{}/{}".format(home, params['CASE_TSV'])
    sample_tsv = "{}/{}".format(home, params['SAMPLE_TSV'])
    aliquot_tsv = "{}/{}".format(home, params['ALIQUOT_TSV'])
    hold_schema_dict_prog = "{}/{}".format(home, params['HOLD_SCHEMA_DICT_PROG'])
    hold_schema_list_prog = "{}/{}".format(home, params['HOLD_SCHEMA_LIST_PROG'])
    hold_schema_dict_case = "{}/{}".format(home, params['HOLD_SCHEMA_DICT_CASE'])
    hold_schema_list_case = "{}/{}".format(home, params['HOLD_SCHEMA_LIST_CASE'])
    hold_schema_dict_sample = "{}/{}".format(home, params['HOLD_SCHEMA_DICT_SAMPLE'])
    hold_schema_list_sample = "{}/{}".format(home, params['HOLD_SCHEMA_LIST_SAMPLE'])
    hold_schema_dict_aliquot = "{}/{}".format(home, params['HOLD_SCHEMA_DICT_ALIQUOT'])
    hold_schema_list_aliquot = "{}/{}".format(home, params['HOLD_SCHEMA_LIST_ALIQUOT'])

    if 'clear_target_directory' in steps:
        print('clear_target_directory')
        create_clean_target(local_files_dir)

    #
    # Use the filter set to build a manifest. Note that if a pull list is
    # provided, these steps can be omitted:
    #
    if 'pull_cases_per_program_from_pdc' in steps:
        endpoint = params["PDC_ENDPOINT"]
        success = pull_cases_per_program_from_pdc(endpoint, prog_tsv)
        if not success:
            print("Failure pulling programs")
            return

    if 'pull_aliquots_from_pdc' in steps:
        endpoint = params["PDC_ENDPOINT"]
        success = pull_aliquots_from_pdc(endpoint, case_tsv, sample_tsv, aliquot_tsv)
        if not success:
            print("Failure pulling programs")
            return

    #
    # Schemas and table descriptions are maintained in the github repo:
    #
    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                        params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        # Build typed schemas for each of the four pulled TSVs.  (Unused
        # full_file_prefix/schema_dict_loc locals from the original removed.)
        typing_tups = build_schema(prog_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, None, typing_tups,
                              hold_schema_list_prog, hold_schema_dict_prog)
        typing_tups = build_schema(case_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, None, typing_tups,
                              hold_schema_list_case, hold_schema_dict_case)
        typing_tups = build_schema(sample_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, None, typing_tups,
                              hold_schema_list_sample, hold_schema_dict_sample)
        typing_tups = build_schema(aliquot_tsv, params['SCHEMA_SAMPLE_SKIPS'])
        build_combined_schema(None, None, typing_tups,
                              hold_schema_list_aliquot, hold_schema_dict_aliquot)

    bucket_target_program = '{}/{}'.format(params['WORKING_BUCKET_DIR'], params['BUCKET_TSV_PROGRAM'])
    bucket_target_case = '{}/{}'.format(params['WORKING_BUCKET_DIR'], params['BUCKET_TSV_CASE'])
    bucket_target_sample = '{}/{}'.format(params['WORKING_BUCKET_DIR'], params['BUCKET_TSV_SAMPLE'])
    bucket_target_aliquot = '{}/{}'.format(params['WORKING_BUCKET_DIR'], params['BUCKET_TSV_ALIQUOT'])

    if 'upload_to_bucket' in steps:
        print('upload_to_bucket')
        upload_to_bucket(params['WORKING_BUCKET'], bucket_target_program, prog_tsv)
        upload_to_bucket(params['WORKING_BUCKET'], bucket_target_case, case_tsv)
        upload_to_bucket(params['WORKING_BUCKET'], bucket_target_sample, sample_tsv)
        upload_to_bucket(params['WORKING_BUCKET'], bucket_target_aliquot, aliquot_tsv)

    if 'create_bq_from_tsv' in steps:
        print('create_bq_from_tsv')
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], bucket_target_program)
        with open(hold_schema_list_prog, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['TARGET_TABLE_PROG'], params['BQ_AS_BATCH'])
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], bucket_target_case)
        with open(hold_schema_list_case, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['TARGET_TABLE_CASE'], params['BQ_AS_BATCH'])
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], bucket_target_sample)
        with open(hold_schema_list_sample, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['TARGET_TABLE_SAMPLE'], params['BQ_AS_BATCH'])
        bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'], bucket_target_aliquot)
        with open(hold_schema_list_aliquot, mode='r') as schema_hold_dict:
            typed_schema = json_loads(schema_hold_dict.read())
        csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                  params['TARGET_TABLE_ALIQUOT'], params['BQ_AS_BATCH'])

    if 'join_case_tables' in steps:
        print('join_case_tables')
        full_target_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['TARGET_DATASET'],
                                              params['TARGET_TABLE'])
        success = join_with_aliquot_table(full_target_table, params['ALIQUOT_TABLE'],
                                          params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            # FIX: previously fell through and kept running against a failed join.
            return

    #
    # Update the per-field descriptions:
    #
    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}
        success = update_schema_with_dict(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'], schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #
    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'], full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    #
    # publish table:
    #
    if 'publish' in steps:
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                             params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])
        success = publish_table(source_table, publication_dest)
        if not success:
            print("publish table failed")
            return

    #
    # Clear out working temp tables:
    #
    if 'dump_working_tables' in steps:
        dump_table_tags = ['TARGET_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    print('job completed')
def main(args):
    """Build per-suffix-group BigQuery tables from GDC files.

    args: command-line argument vector; args[1] names the YAML configuration
    file supplying ``params``, ``bq_filters``, ``na_values`` and ``steps``.
    Files pulled from GDC are grouped by filename suffix; several path/table
    params are format templates filled in with the group key.
    """
    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #
    with open(args[1], mode='r') as yaml_file:
        params, bq_filters, na_values, steps = load_config(yaml_file.read())

    # FIX: sibling scripts guard against a bad YAML load; this one did not.
    if params is None:
        print("Bad YAML load")
        return

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #
    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    #
    # Actual fields have brackets:
    #
    na_set = set()
    for val in na_values:
        na_set.add("[{}]".format(val))

    if 'clear_target_directory' in steps:
        print('clear_target_directory')
        create_clean_target(local_files_dir)

    #
    # Use the filter set to build a manifest. Note that if a pull list is
    # provided, these steps can be omitted:
    #
    if 'build_manifest_from_filters' in steps:
        print('build_manifest_from_filters')
        max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
        manifest_success = get_the_bq_manifest(params['FILE_TABLE'], bq_filters, max_files,
                                               params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                               params['BQ_MANIFEST_TABLE'], params['WORKING_BUCKET'],
                                               params['BUCKET_MANIFEST_TSV'], manifest_file,
                                               params['BQ_AS_BATCH'])
        if not manifest_success:
            print("Failure generating manifest")
            return

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, uses BQ as long as you have built the manifest using BQ (that route uses the BQ Manifest
    # table that was created).
    #
    if 'build_pull_list' in steps:
        print('build_pull_list')
        full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                          params['TARGET_DATASET'],
                                          params['BQ_MANIFEST_TABLE'])
        success = build_pull_list_with_bq_public(full_manifest, params['INDEXD_BQ_TABLE'],
                                                 params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                                 params['BQ_PULL_LIST_TABLE'],
                                                 params['WORKING_BUCKET'],
                                                 params['BUCKET_PULL_LIST'],
                                                 local_pull_list, params['BQ_AS_BATCH'])
        if not success:
            print("Build pull list failed")
            return

    #
    # Now hitting GDC cloud buckets. Get the files in the pull list:
    #
    if 'download_from_gdc' in steps:
        print('download_from_gdc')
        with open(local_pull_list, mode='r') as pull_list_file:
            pull_list = pull_list_file.read().splitlines()
        print("Preparing to download %s files from buckets\n" % len(pull_list))
        bp = BucketPuller(10)
        bp.pull_from_buckets(pull_list, local_files_dir)

    if 'build_file_list' in steps:
        print('build_file_list')
        all_files = build_file_list(local_files_dir)
        with open(file_traversal_list, mode='w') as traversal_list:
            for line in all_files:
                traversal_list.write("{}\n".format(line))

    if 'group_by_type' in steps:
        print('group_by_type')
        print(file_traversal_list)
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        group_dict = group_by_suffixes(all_files)  # WRITE OUT AS JSON!!
        # NOTE(review): group_dict is only defined when this step runs, but later
        # steps (concat/analyze/upload/load) read it — skipping 'group_by_type'
        # while running those steps raises NameError.  Persisting it as JSON (per
        # the TODO above) would fix this; confirm intended step combinations.

    if 'convert_excel_to_csv' in steps:
        print('convert_excel_to_csv')
        with open(file_traversal_list, mode='r') as traversal_list_file:
            all_files = traversal_list_file.read().splitlines()
        convert_excel_to_csv(all_files, local_files_dir)

    if 'concat_all_files' in steps:
        print('concat_all_files')
        for k, v in group_dict.items():
            concat_all_files(v, one_big_tsv.format(k), na_set)

    #
    # Schemas and table descriptions are maintained in the github repo:
    #
    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                        params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        for k in group_dict:
            typing_tups = build_schema(one_big_tsv.format(k), params['SCHEMA_SAMPLE_SKIPS'])
            hold_schema_dict_for_group = hold_schema_dict.format(k)
            hold_schema_list_for_group = hold_schema_list.format(k)
            build_combined_schema(None, None, typing_tups,
                                  hold_schema_list_for_group, hold_schema_dict_for_group)

    bucket_target_blob = '{}/{}'.format(params['WORKING_BUCKET_DIR'], params['BUCKET_TSV'])

    if 'upload_to_bucket' in steps:
        print('upload_to_bucket')
        for k in group_dict:
            upload_to_bucket(params['WORKING_BUCKET'],
                             bucket_target_blob.format(k), one_big_tsv.format(k))

    if 'create_bq_from_tsv' in steps:
        print('create_bq_from_tsv')
        for k in group_dict:
            bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                                 bucket_target_blob.format(k))
            with open(hold_schema_list.format(k), mode='r') as schema_hold_dict:
                typed_schema = json_loads(schema_hold_dict.read())
            # Group keys may carry "." or "-", which are illegal in table names:
            csv_to_bq(typed_schema, bucket_src_url, params['TARGET_DATASET'],
                      params['FINAL_TARGET_TABLE'].format(k.replace(".", "_").replace("-", "_")),
                      params['BQ_AS_BATCH'])

    if 'add_aliquot_fields' in steps:
        print('add_aliquot_fields')
        full_target_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['TARGET_DATASET'],
                                              params['TARGET_TABLE'])
        success = join_with_aliquot_table(full_target_table, params['ALIQUOT_TABLE'],
                                          params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'], params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            # FIX: previously fell through and kept running against a failed join.
            return

    #
    # Update the per-field descriptions:
    #
    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}
        success = update_schema_with_dict(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'], schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #
    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'], full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    #
    # publish table:
    #
    if 'publish' in steps:
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                             params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])
        success = publish_table(source_table, publication_dest)
        if not success:
            print("publish table failed")
            return

    #
    # Clear out working temp tables:
    #
    if 'dump_working_tables' in steps:
        dump_table_tags = ['TARGET_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    print('job completed')
def main(args):
    """Patch schemas, field descriptions and labels onto existing BQ tables.

    args: command-line argument vector; args[1] names the YAML configuration
    file.  The config yields ``tables_to_patch``: a list of one-item mappings
    of {"project.dataset.table": per-table settings dict}.
    """
    if not confirm_google_vm():
        print(
            'This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]'
        )
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #
    with open(args[1], mode='r') as yaml_file:
        params, steps, tables_to_patch = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    #
    # Schemas and table descriptions are maintained in the github repo. Only do this once:
    #
    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    for patch_entry in tables_to_patch:
        full_table, table_dict = next(iter(patch_entry.items()))

        #
        # Extract the project, dataset, and table name:
        #
        split_table = full_table.split('.')
        target_project = split_table[0]
        target_dataset = split_table[1]
        target_table = split_table[2]

        if 'process_git_schemas' in steps:
            print('process_git_schema')
            # Where do we dump the schema git repository?
            schema_file_name = table_dict["generic_schema_file"]
            schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                            params['RAW_SCHEMA_DIR'], schema_file_name)
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], full_table)
            # Write out the details
            success = generate_table_detail_files(schema_file, full_file_prefix)
            if not success:
                print("process_git_schemas failed")
                return False

        # Customize generic schema to this data program:
        if 'replace_schema_tags' in steps:
            print('replace_schema_tags')
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], full_table)
            # Write out the details
            success = customize_labels_and_desc(full_file_prefix, table_dict["schema_tags"])
            if not success:
                print("replace_schema_tags failed")
                return False

        #
        # Update the per-field descriptions:
        #
        if 'install_field_descriptions' in steps:
            print('install_field_descriptions: {}'.format(full_table))
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], full_table)
            schema_dict_loc = "{}_schema.json".format(full_file_prefix)
            schema_dict = {}
            with open(schema_dict_loc, mode='r') as schema_hold_dict:
                full_schema_list = json_loads(schema_hold_dict.read())
            for entry in full_schema_list:
                schema_dict[entry['name']] = {'description': entry['description']}
            success = update_schema_with_dict(target_dataset, target_table,
                                              schema_dict, project=target_project)
            if not success:
                print("install_field_descriptions failed")
                return False

        #
        # Add description and labels to the target table:
        #
        if 'install_table_description' in steps:
            print('install_table_description: {}'.format(full_table))
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], full_table)
            success = install_labels_and_desc(target_dataset, target_table,
                                              full_file_prefix, project=target_project)
            if not success:
                print("install_table_description failed")
                return False

    print('job completed')
def main(args):
    """Build the final PDC/GDC joined table, attach schema metadata, publish it,
    and clean up working tables.

    Driven by a YAML config file; `load_config` returns run parameters and the
    set of step names to execute.
    """

    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())

    # FIX: guard against a bad YAML load (every sibling driver in this file
    # does this; without it a None `params` raises TypeError below).
    if params is None:
        print("Bad YAML load")
        return

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            # Wipe/recreate the local clone target, then check out the configured branch.
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'], params['RAW_SCHEMA_DIR'], params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'build_final_table' in steps:
        print('build_final_table')
        success = build_final_table(params['PDC_META_ALIQUOT_TABLE'],
                                    params['PDC_QUANT_ALIQUOT_TABLE'],
                                    params['PDC_META_CASES_TABLE'],
                                    params['GDC_CASE_DATA_TABLE'],
                                    params['TARGET_DATASET'],
                                    params['FINAL_TARGET_TABLE'],
                                    params['BQ_AS_BATCH'])
        if not success:
            print("Join job failed")
            # FIX: bail out like every other failed step; previously execution
            # fell through and tried to decorate/publish a table that was
            # never successfully built.
            return

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        # The staged *_schema.json is a list of {"name": ..., "description": ...} entries.
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}
        success = update_schema_with_dict(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'], params['FINAL_TARGET_TABLE'], full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    #
    # publish table:
    #

    if 'publish' in steps:
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'], params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])
        success = publish_table(source_table, publication_dest)
        if not success:
            print("publish table failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        dump_table_tags = ['TARGET_TABLE']
        dump_tables = [params[x] for x in dump_table_tags]
        for table in dump_tables:
            delete_table_bq_job(params['TARGET_DATASET'], table)

    print('job completed')
def main(args):
    """Import radiology file metadata into BigQuery, attach schema metadata,
    and publish the result.

    Driven by a YAML config file; `load_config` returns run parameters and the
    set of step names to execute. Steps after the import tolerate the table
    having been deleted (empty import) by checking `bq_table_exists` first.
    """

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())

    if params is None:
        print("Bad YAML load")
        return

    #
    # Schemas and table descriptions are maintained in the github repo. Only do this once:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            # Wipe/recreate the local clone target, then check out the configured branch.
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    #
    # Pull the radiology entries
    #

    if 'pull_radiology' in steps:
        success = import_radiology(params['RADIOLOGY_SOURCE'], params['TARGET_DATASET'],
                                   params['RADIOLOGY_TABLE_NAME'], params['BQ_AS_BATCH'])
        if not success:
            print("pull_radiology job failed")
            return
        # An empty result table is deleted; later steps skip it via bq_table_exists().
        if bq_table_is_empty(params['TARGET_DATASET'], params['RADIOLOGY_TABLE_NAME']):
            delete_table_bq_job(params['TARGET_DATASET'], params['RADIOLOGY_TABLE_NAME'])
            print("{} pull_slide table result was empty: table deleted".format(
                params['RADIOLOGY_TABLE_NAME']))

    #
    # Stage the schema metadata from the repo copy:
    #

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'], params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['RADIOLOGY_TABLE_NAME'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            # NOTE(review): sibling steps use bare `return`; False is discarded by callers.
            return False

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        if bq_table_exists(params['TARGET_DATASET'], params['RADIOLOGY_TABLE_NAME']):
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['RADIOLOGY_TABLE_NAME'])
            schema_dict_loc = "{}_schema.json".format(full_file_prefix)
            schema_dict = {}
            # The staged *_schema.json is a list of {"name": ..., "description": ...} entries.
            with open(schema_dict_loc, mode='r') as schema_hold_dict:
                full_schema_list = json_loads(schema_hold_dict.read())
            for entry in full_schema_list:
                schema_dict[entry['name']] = {
                    'description': entry['description']
                }
            success = update_schema_with_dict(
                params['TARGET_DATASET'], params['RADIOLOGY_TABLE_NAME'],
                schema_dict, project=params['WORKING_PROJECT'])
            if not success:
                print("update_field_descriptions failed")
                return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        if bq_table_exists(params['TARGET_DATASET'], params['RADIOLOGY_TABLE_NAME']):
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], params['RADIOLOGY_TABLE_NAME'])
            success = install_labels_and_desc(params['TARGET_DATASET'],
                                              params['RADIOLOGY_TABLE_NAME'], full_file_prefix)
            if not success:
                print("update_table_description failed")
                return

    #
    # publish table:
    #

    if 'publish' in steps:
        if bq_table_exists(params['TARGET_DATASET'], params['RADIOLOGY_TABLE_NAME']):
            source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'], params['TARGET_DATASET'],
                                             params['RADIOLOGY_TABLE_NAME'])
            publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                                 params['PUBLICATION_DATASET'],
                                                 params['PUBLICATION_TABLE'])
            success = publish_table(source_table, publication_dest)
            if not success:
                print("publish table failed")
                return

    print('job completed')
def main(args):
    """Drive the counts-file ETL pipeline: manifest build, GDC download,
    TSV concatenation, schema staging, BigQuery load, metadata joins,
    final-table assembly, archiving, and publication.

    Driven by a YAML config file; `load_config` returns run parameters, the
    per-count-type file sets, and the set of step names to execute. Each
    file set maps a count name to its filters/header configuration; most
    path and table-name templates are `.format(count_name)`-ed per set.
    """

    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print('job started')

    #
    # Get the YAML config loaded:
    #

    with open(args[1], mode='r') as yaml_file:
        params, file_sets, steps = load_config(yaml_file.read())

    #
    # BQ does not like to be given paths that have "~". So make all local paths absolute:
    #

    home = expanduser("~")
    local_files_dir = "{}/{}".format(home, params['LOCAL_FILES_DIR'])
    one_big_tsv = "{}/{}".format(home, params['ONE_BIG_TSV'])
    manifest_file = "{}/{}".format(home, params['MANIFEST_FILE'])
    local_pull_list = "{}/{}".format(home, params['LOCAL_PULL_LIST'])
    file_traversal_list = "{}/{}".format(home, params['FILE_TRAVERSAL_LIST'])
    hold_schema_dict = "{}/{}".format(home, params['HOLD_SCHEMA_DICT'])
    hold_schema_list = "{}/{}".format(home, params['HOLD_SCHEMA_LIST'])

    if 'clear_target_directory' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            create_clean_target(local_files_dir.format(count_name))

    if 'build_manifest_from_filters' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            mani_for_count = manifest_file.format(count_name)
            table_for_count = params['BQ_MANIFEST_TABLE'].format(count_name)
            tsv_for_count = params['BUCKET_MANIFEST_TSV'].format(count_name)
            max_files = params['MAX_FILES'] if 'MAX_FILES' in params else None
            manifest_success = get_the_bq_manifest(
                params['FILE_TABLE'], count_dict['filters'], max_files,
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                table_for_count, params['WORKING_BUCKET'], tsv_for_count,
                mani_for_count, params['BQ_AS_BATCH'])
            if not manifest_success:
                print("Failure generating manifest")
                return

    #
    # We need to create a "pull list" of gs:// URLs to pull from GDC buckets. If you have already
    # created a pull list, just plunk it in 'LOCAL_PULL_LIST' and skip this step. If creating a pull
    # list, uses BQ as long as you have built the manifest using BQ (that route uses the BQ Manifest
    # table that was created).
    #

    if 'build_pull_list' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            table_for_count = params['BQ_MANIFEST_TABLE'].format(count_name)
            local_pull_for_count = local_pull_list.format(count_name)
            pull_table_for_count = params['BQ_PULL_LIST_TABLE'].format(count_name)
            bucket_pull_list_for_count = params['BUCKET_PULL_LIST'].format(count_name)
            full_manifest = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                              params['TARGET_DATASET'],
                                              table_for_count)
            # NOTE(review): this helper's success flag is not checked; a failed
            # pull-list build only surfaces downstream. Left as-is to preserve behavior.
            build_pull_list_with_bq(
                full_manifest, params['INDEXD_BQ_TABLE'],
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                pull_table_for_count, params['WORKING_BUCKET'],
                bucket_pull_list_for_count, local_pull_for_count,
                params['BQ_AS_BATCH'])

    #
    # Now hitting GDC cloud buckets. Get the files in the pull list:
    #

    if 'download_from_gdc' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            pull_for_count = local_pull_list.format(count_name)
            with open(pull_for_count, mode='r') as pull_list_file:
                pull_list = pull_list_file.read().splitlines()
            print("Preparing to download %s files from buckets\n" % len(pull_list))
            bp = BucketPuller(10)  # 10 parallel puller threads
            local_files_dir_for_count = local_files_dir.format(count_name)
            bp.pull_from_buckets(pull_list, local_files_dir_for_count)

    if 'build_file_list' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            local_files_dir_for_count = local_files_dir.format(count_name)
            all_files = build_file_list(local_files_dir_for_count)
            file_traversal_list_for_count = file_traversal_list.format(count_name)
            with open(file_traversal_list_for_count, mode='w') as traversal_list:
                for line in all_files:
                    traversal_list.write("{}\n".format(line))

    if 'concat_all_files' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            header = count_dict['header'] if 'header' in count_dict else None
            file_traversal_list_for_count = file_traversal_list.format(count_name)
            with open(file_traversal_list_for_count, mode='r') as traversal_list_file:
                all_files = traversal_list_file.read().splitlines()
            concat_all_files(all_files, one_big_tsv.format(count_name), header)

    #
    # Schemas and table descriptions are maintained in the github repo:
    #

    if 'pull_table_info_from_git' in steps:
        print('pull_table_info_from_git')
        try:
            create_clean_target(params['SCHEMA_REPO_LOCAL'])
            repo = Repo.clone_from(params['SCHEMA_REPO_URL'], params['SCHEMA_REPO_LOCAL'])
            repo.git.checkout(params['SCHEMA_REPO_BRANCH'])
        except Exception as ex:
            print("pull_table_info_from_git failed: {}".format(str(ex)))
            return

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                        params['RAW_SCHEMA_DIR'],
                                        params['SCHEMA_FILE_NAME'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            return

    if 'analyze_the_schema' in steps:
        print('analyze_the_schema')
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            typing_tups = build_schema(one_big_tsv.format(count_name),
                                       params['SCHEMA_SAMPLE_SKIPS'])
            full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                              params['FINAL_TARGET_TABLE'])
            schema_dict_loc = "{}_schema.json".format(full_file_prefix)
            build_combined_schema(None, schema_dict_loc, typing_tups,
                                  hold_schema_list.format(count_name),
                                  hold_schema_dict.format(count_name))

    # Per-count-type destination blobs for the working bucket:
    bucket_target_blob_sets = {}
    for file_set in file_sets:
        count_name, _ = next(iter(file_set.items()))
        bucket_target_blob_sets[count_name] = '{}/{}'.format(
            params['WORKING_BUCKET_DIR'],
            params['BUCKET_TSV'].format(count_name))

    if 'upload_to_bucket' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            upload_to_bucket(params['WORKING_BUCKET'],
                             bucket_target_blob_sets[count_name],
                             one_big_tsv.format(count_name))

    if 'delete_all_bq' in steps:
        table_cleaner(params, file_sets, True)

    if 'create_bq_from_tsv' in steps:
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            bucket_src_url = 'gs://{}/{}'.format(params['WORKING_BUCKET'],
                                                 bucket_target_blob_sets[count_name])
            hold_schema_list_for_count = hold_schema_list.format(count_name)
            with open(hold_schema_list_for_count, mode='r') as schema_hold_dict:
                typed_schema = json_loads(schema_hold_dict.read())
            # NOTE(review): load result is not checked; left as-is to preserve behavior.
            csv_to_bq_write_depo(typed_schema, bucket_src_url,
                                 params['TARGET_DATASET'],
                                 params['TARGET_TABLE'].format(count_name),
                                 params['BQ_AS_BATCH'], None)

    if 'attach_ids_to_files' in steps:
        count = 0
        for file_set in file_sets:
            count_name, _ = next(iter(file_set.items()))
            # First set truncates the step-2 table; later sets append to it.
            write_depo = "WRITE_TRUNCATE" if (count == 0) else "WRITE_APPEND"
            gexp_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                           params['TARGET_DATASET'],
                                           params['TARGET_TABLE'].format(count_name))
            success = build_aliquot_and_case(gexp_table, params['FILEDATA_TABLE'],
                                             params['TARGET_DATASET'],
                                             params['STEP_2_TABLE'],
                                             write_depo, {}, params['BQ_AS_BATCH'])
            count += 1
            # FIX: check each file set's result; a failure in an earlier set
            # must not be masked by a later success.
            if not success:
                print("attach_ids_to_files failed")
                return

    if 'extract_platform' in steps:
        step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                        params['TARGET_DATASET'],
                                        params['STEP_2_TABLE'])
        success = extract_platform_for_files(step2_table, params['FILEDATA_TABLE'],
                                             params['TARGET_DATASET'],
                                             params['STEP_2A_TABLE'], True, {},
                                             params['BQ_AS_BATCH'])
        if not success:
            print("extract_platform failed")
            return

    if 'attach_barcodes_to_ids' in steps:
        step2_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                        params['TARGET_DATASET'],
                                        params['STEP_2A_TABLE'])
        success = attach_barcodes(step2_table, params['ALIQUOT_TABLE'],
                                  params['TARGET_DATASET'], params['STEP_3_TABLE'],
                                  True, {}, params['BQ_AS_BATCH'])
        if not success:
            print("attach_barcodes_to_ids failed")
            return

    if 'merge_counts_and_metadata' in steps:
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            if 'header' not in count_dict:
                print("must have defined headers to work")
                break
            header = count_dict['header']
            print(header)
            sql_dict = {}
            # Second header column names the count column for this set.
            sql_dict['count_column'] = header.split(',')[1].strip()
            sql_dict['file_column'] = 'file_gdc_id_{}'.format(count_name)
            step3_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                            params['TARGET_DATASET'],
                                            params['STEP_3_TABLE'])
            counts_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                             params['TARGET_DATASET'],
                                             params['TARGET_TABLE'].format(count_name))
            success = merge_counts_and_metadata(
                step3_table, counts_table, params['TARGET_DATASET'],
                params['COUNTS_WITH_METADATA_TABLE'].format(count_name),
                True, sql_dict, params['BQ_AS_BATCH'])
            if not success:
                print("merge_counts_and_metadata failed")
                return

    if 'merge_all' in steps:
        sql_dict = {}
        count = 0
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            dict_for_set = {}
            sql_dict['table_{}'.format(count)] = dict_for_set
            count += 1
            if 'header' not in count_dict:
                print("must have defined headers to work")
                return
            header = count_dict['header']
            dict_for_set['count_column'] = header.split(',')[1].strip()
            dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name)
            dict_for_set['table'] = '{}.{}.{}'.format(
                params['WORKING_PROJECT'], params['TARGET_DATASET'],
                params['COUNTS_WITH_METADATA_TABLE'].format(count_name))
        success = all_counts_to_one_table(params['TARGET_DATASET'],
                                          params['THREE_COUNTS_TABLE'], True,
                                          sql_dict, params['BQ_AS_BATCH'])
        if not success:
            # FIX: message previously said "merge_counts_and_metadata failed",
            # copy-pasted from the step above; report the actual failing step.
            print("merge_all failed")
            return

    if 'glue_gene_names' in steps:
        sql_dict = {}
        count = 0
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            dict_for_set = {}
            sql_dict['table_{}'.format(count)] = dict_for_set
            count += 1
            if 'header' not in count_dict:
                print("must have defined headers to work")
                return
            header = count_dict['header']
            dict_for_set['count_column'] = header.split(',')[1].strip()
            dict_for_set['file_column'] = 'file_gdc_id_{}'.format(count_name)
        three_counts_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                               params['TARGET_DATASET'],
                                               params['THREE_COUNTS_TABLE'])
        success = glue_in_gene_names(three_counts_table, params['GENE_NAMES_TABLE'],
                                     params['TARGET_DATASET'],
                                     params['FINAL_TARGET_TABLE'], True,
                                     sql_dict, params['BQ_AS_BATCH'])
        if not success:
            print("glue_gene_names failed")
            return

    #
    # Update the per-field descriptions:
    #

    if 'update_field_descriptions' in steps:
        print('update_field_descriptions')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}
        success = update_schema_with_dict(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          schema_dict)
        if not success:
            print("update_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'update_table_description' in steps:
        print('update_table_description')
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'],
                                          params['FINAL_TARGET_TABLE'])
        success = install_labels_and_desc(params['TARGET_DATASET'],
                                          params['FINAL_TARGET_TABLE'],
                                          full_file_prefix)
        if not success:
            print("update_table_description failed")
            return

    if 'dump_working_tables' in steps:
        table_cleaner(params, file_sets, False)

    #
    # archive files on VM:
    #

    bucket_archive_blob_sets = {}
    for file_set in file_sets:
        count_name, _ = next(iter(file_set.items()))
        # FIX: this loop previously assigned into bucket_target_blob_sets,
        # clobbering the working-bucket upload mapping and leaving the
        # archive mapping empty.
        bucket_archive_blob_sets[count_name] = '{}/{}'.format(
            params['ARCHIVE_BUCKET_DIR'],
            params['BUCKET_TSV'].format(count_name))

    if 'archive' in steps:
        print('archive files from VM')
        archive_file_prefix = "{}_{}".format(date.today(), params['PUBLICATION_DATASET'])
        yaml_file = re.search(r"\/(\w*.yaml)$", args[1])
        archive_yaml = "{}/{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'],
                                            params['ARCHIVE_CONFIG'],
                                            archive_file_prefix,
                                            yaml_file.group(1))
        upload_to_bucket(params['ARCHIVE_BUCKET'], archive_yaml, args[1])
        for file_set in file_sets:
            count_name, count_dict = next(iter(file_set.items()))
            pull_file_name = params['LOCAL_PULL_LIST']
            archive_pull_file = "{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'],
                                                  archive_file_prefix,
                                                  pull_file_name.format(count_name))
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_pull_file,
                             local_pull_list.format(count_name))
            manifest_file_name = params['MANIFEST_FILE']
            archive_manifest_file = "{}/{}_{}".format(params['ARCHIVE_BUCKET_DIR'],
                                                      archive_file_prefix,
                                                      manifest_file_name.format(count_name))
            upload_to_bucket(params['ARCHIVE_BUCKET'], archive_manifest_file,
                             manifest_file.format(count_name))

    #
    # publish table:
    #

    if 'publish' in steps:
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         params['FINAL_TARGET_TABLE'])
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                             params['PUBLICATION_DATASET'],
                                             params['PUBLICATION_TABLE'])
        success = publish_table(source_table, publication_dest)
        if not success:
            print("publish table failed")
            return

    print('job completed')
def do_dataset_and_build(steps, build, build_tag, path_tag, dataset_tuple,
                         aliquot_map_programs, params, schema_tags):
    """Run the per-program, per-build pipeline: pull slide/aliquot/case file
    data, attach barcodes, union the results, install URIs and metadata,
    publish, and clean up.

    Args:
        steps: collection of step names to execute.
        build: build identifier used in table names (e.g. a genome build).
        build_tag: suffix appended to params['FILE_TABLE'] to pick the release file table.
        path_tag: suffix appended to params['UUID_2_URL_TABLE']; also substituted
            into schema tags.
        dataset_tuple: pair where [0] is the program name used for filtering and
            [1] is the dataset-style name used in table names and publication.
        aliquot_map_programs: programs that have an aliquot mapping table.
        params: YAML-derived configuration dict.
        schema_tags: list of single-entry {tag: value} dicts; values beginning
            with '~-', '~lc-' or '~lcbqs-' are substituted per program/path/build.

    Returns:
        False on any step failure; True when all requested steps succeed.
    """

    file_table = "{}_{}".format(params['FILE_TABLE'], build_tag)

    #
    # Pull stuff from rel:
    #

    if 'pull_slides' in steps:
        step_one_table = "{}_{}_{}".format(dataset_tuple[1], build, params['SLIDE_STEP_1_TABLE'])
        success = extract_active_slide_file_data(file_table, dataset_tuple[0],
                                                 params['TARGET_DATASET'],
                                                 step_one_table, params['BQ_AS_BATCH'])
        if not success:
            print("{} {} pull_slides job failed".format(dataset_tuple[0], build))
            return False
        # Empty result tables are dropped; later steps skip them via bq_table_exists().
        if bq_table_is_empty(params['TARGET_DATASET'], step_one_table):
            delete_table_bq_job(params['TARGET_DATASET'], step_one_table)
            print("{} pull_slide table result was empty: table deleted".format(
                params['SLIDE_STEP_1_TABLE']))

    if 'pull_aliquot' in steps:
        step_one_table = "{}_{}_{}".format(dataset_tuple[1], build, params['ALIQUOT_STEP_1_TABLE'])
        success = extract_active_aliquot_file_data(file_table, dataset_tuple[0],
                                                   params['TARGET_DATASET'],
                                                   step_one_table, params['BQ_AS_BATCH'])
        if not success:
            print("{} {} pull_aliquot job failed".format(dataset_tuple[0], build))
            return False
        if bq_table_is_empty(params['TARGET_DATASET'], step_one_table):
            delete_table_bq_job(params['TARGET_DATASET'], step_one_table)
            print("{} pull_aliquot table result was empty: table deleted".format(
                params['ALIQUOT_STEP_1_TABLE']))

    if 'pull_case' in steps:
        step_one_table = "{}_{}_{}".format(dataset_tuple[1], build, params['CASE_STEP_1_TABLE'])
        success = extract_active_case_file_data(file_table, dataset_tuple[0],
                                                params['TARGET_DATASET'],
                                                step_one_table, params['BQ_AS_BATCH'])
        if not success:
            print("{} {} pull_clinbio job failed".format(dataset_tuple[0], build))
            return False
        if bq_table_is_empty(params['TARGET_DATASET'], step_one_table):
            delete_table_bq_job(params['TARGET_DATASET'], step_one_table)
            print("{} pull_case table result was empty: table deleted".format(
                params['CASE_STEP_1_TABLE']))

    if 'slide_barcodes' in steps:
        table_name = "{}_{}_{}".format(dataset_tuple[1], build, params['SLIDE_STEP_1_TABLE'])
        in_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                     params['TARGET_DATASET'],
                                     table_name)
        if bq_table_exists(params['TARGET_DATASET'], table_name):
            step_two_table = "{}_{}_{}".format(dataset_tuple[1], build, params['SLIDE_STEP_2_TABLE'])
            success = extract_slide_barcodes(in_table, params['SLIDE_TABLE'], dataset_tuple[0],
                                             params['TARGET_DATASET'], step_two_table,
                                             params['BQ_AS_BATCH'])
            if not success:
                print("{} {} slide_barcodes job failed".format(dataset_tuple[0], build))
                return False

    if 'aliquot_barcodes' in steps:
        table_name = "{}_{}_{}".format(dataset_tuple[1], build, params['ALIQUOT_STEP_1_TABLE'])
        in_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                     params['TARGET_DATASET'],
                                     table_name)
        if bq_table_exists(params['TARGET_DATASET'], table_name):
            step_two_table = "{}_{}_{}".format(dataset_tuple[1], build, params['ALIQUOT_STEP_2_TABLE'])
            # Programs with an aliquot map use it; others fall back to the case table.
            if dataset_tuple[0] in aliquot_map_programs:
                success = extract_aliquot_barcodes(in_table, params['ALIQUOT_TABLE'],
                                                   dataset_tuple[0], params['TARGET_DATASET'],
                                                   step_two_table, params['BQ_AS_BATCH'])
                if not success:
                    print("{} {} align_barcodes job failed".format(dataset_tuple[0], build))
                    return False
            else:
                success = prepare_aliquot_without_map(in_table, params['CASE_TABLE'],
                                                      dataset_tuple[0], params['TARGET_DATASET'],
                                                      step_two_table, params['BQ_AS_BATCH'])
                if not success:
                    print("{} {} align_barcodes job failed".format(dataset_tuple[0], build))
                    return False
        else:
            print("{} {} aliquot_barcodes step skipped (no input table)".format(
                dataset_tuple[0], build))

    if 'case_barcodes' in steps:
        table_name = "{}_{}_{}".format(dataset_tuple[1], build, params['CASE_STEP_1_TABLE'])
        in_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                     params['TARGET_DATASET'],
                                     table_name)
        if bq_table_exists(params['TARGET_DATASET'], table_name):
            step_two_table = "{}_{}_{}".format(dataset_tuple[1], build, params['CASE_STEP_2_TABLE'])
            success = extract_case_barcodes(in_table, params['CASE_TABLE'], dataset_tuple[0],
                                            params['TARGET_DATASET'], step_two_table,
                                            params['BQ_AS_BATCH'])
            if not success:
                print("{} {} case_barcodes job failed".format(dataset_tuple[0], build))
                return False

    if 'union_tables' in steps:
        # Union whichever step-2 tables actually exist (empty pulls were deleted above).
        table_list = []
        union_table_tags = [
            'SLIDE_STEP_2_TABLE', 'ALIQUOT_STEP_2_TABLE', 'CASE_STEP_2_TABLE'
        ]
        for tag in union_table_tags:
            if tag in params:
                table_name = "{}_{}_{}".format(dataset_tuple[1], build, params[tag])
                if bq_table_exists(params['TARGET_DATASET'], table_name):
                    full_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                                   params['TARGET_DATASET'],
                                                   table_name)
                    table_list.append(full_table)
        union_table = "{}_{}_{}".format(dataset_tuple[1], build, params['UNION_TABLE'])
        success = build_union(table_list, params['TARGET_DATASET'], union_table,
                              params['BQ_AS_BATCH'])
        if not success:
            print("{} {} union_tables job failed".format(dataset_tuple[0], build))
            return False

    # Merge the URL info into the final table we are building:

    if 'create_final_table' in steps:
        union_table = '{}.{}.{}'.format(
            params['WORKING_PROJECT'], params['TARGET_DATASET'],
            "{}_{}_{}".format(dataset_tuple[1], build, params['UNION_TABLE']))
        success = install_uris(
            union_table, "{}{}".format(params['UUID_2_URL_TABLE'], path_tag),
            params['TARGET_DATASET'],
            "{}_{}_{}".format(dataset_tuple[1], build, params['FINAL_TABLE']),
            params['BQ_AS_BATCH'])
        if not success:
            print("{} {} create_final_table job failed".format(dataset_tuple[0], build))
            return False

    # Stage the schema metadata from the repo copy:

    if 'process_git_schemas' in steps:
        print('process_git_schema')
        # Where do we dump the schema git repository?
        schema_file = "{}/{}/{}".format(params['SCHEMA_REPO_LOCAL'],
                                        params['RAW_SCHEMA_DIR'],
                                        params['GENERIC_SCHEMA_FILE_NAME'])
        table_name = "{}_{}_{}".format(dataset_tuple[1], build, params['FINAL_TABLE'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], table_name)
        # Write out the details
        success = generate_table_detail_files(schema_file, full_file_prefix)
        if not success:
            print("process_git_schemas failed")
            # NOTE(review): plain `return` here (returns None, falsy) while other
            # failure paths return False explicitly.
            return

    # Customize generic schema to this data program:

    if 'replace_schema_tags' in steps:
        print('replace_schema_tags')
        tag_map_list = []
        for tag_pair in schema_tags:
            # Each tag_pair is a single-entry dict: {tag: value}.
            for tag in tag_pair:
                val = tag_pair[tag]
                use_pair = {}
                tag_map_list.append(use_pair)
                # Values beginning '~-<key>', '~lc-<key>' or '~lcbqs-<key>' are
                # placeholders substituted from the current program/path/build;
                # '~lc*' variants are lower-cased.
                if val.find('~-') == 0 or val.find('~lc-') == 0 or val.find('~lcbqs-') == 0:
                    chunks = val.split('-', 1)
                    if chunks[1] == 'programs':
                        if val.find('~lcbqs-') == 0:
                            rep_val = dataset_tuple[1].lower()  # can't have "." in a tag...
                        else:
                            rep_val = dataset_tuple[0]
                    elif chunks[1] == 'path_tags':
                        rep_val = path_tag
                    elif chunks[1] == 'builds':
                        rep_val = build
                    else:
                        raise Exception()
                    if val.find('~lc-') == 0:
                        rep_val = rep_val.lower()
                    use_pair[tag] = rep_val
                else:
                    # Literal value: copied through unchanged.
                    use_pair[tag] = val
        table_name = "{}_{}_{}".format(dataset_tuple[1], build, params['FINAL_TABLE'])
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], table_name)
        # Write out the details
        success = customize_labels_and_desc(full_file_prefix, tag_map_list)
        if not success:
            print("replace_schema_tags failed")
            return

    #
    # Update the per-field descriptions:
    #

    if 'install_field_descriptions' in steps:
        table_name = "{}_{}_{}".format(dataset_tuple[1], build, params['FINAL_TABLE'])
        print('install_field_descriptions: {}'.format(table_name))
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], table_name)
        schema_dict_loc = "{}_schema.json".format(full_file_prefix)
        schema_dict = {}
        # The staged *_schema.json is a list of {"name": ..., "description": ...} entries.
        with open(schema_dict_loc, mode='r') as schema_hold_dict:
            full_schema_list = json_loads(schema_hold_dict.read())
        for entry in full_schema_list:
            schema_dict[entry['name']] = {'description': entry['description']}
        success = update_schema_with_dict(params['TARGET_DATASET'], table_name,
                                          schema_dict, project=params['WORKING_PROJECT'])
        if not success:
            print("install_field_descriptions failed")
            return

    #
    # Add description and labels to the target table:
    #

    if 'install_table_description' in steps:
        table_name = "{}_{}_{}".format(dataset_tuple[1], build, params['FINAL_TABLE'])
        print('install_table_description: {}'.format(table_name))
        full_file_prefix = "{}/{}".format(params['PROX_DESC_PREFIX'], table_name)
        success = install_labels_and_desc(params['TARGET_DATASET'], table_name,
                                          full_file_prefix,
                                          project=params['WORKING_PROJECT'])
        if not success:
            print("install_table_description failed")
            return

    #
    # publish table:
    #

    if 'publish' in steps:
        table_name = "{}_{}_{}".format(dataset_tuple[1], build, params['FINAL_TABLE'])
        print('publish: {}'.format(table_name))
        source_table = '{}.{}.{}'.format(params['WORKING_PROJECT'],
                                         params['TARGET_DATASET'],
                                         table_name)
        # Published into the program-named dataset, same table name.
        publication_dest = '{}.{}.{}'.format(params['PUBLICATION_PROJECT'],
                                             dataset_tuple[1], table_name)
        success = publish_table(source_table, publication_dest)
        if not success:
            print("publish failed")
            return

    #
    # Clear out working temp tables:
    #

    if 'dump_working_tables' in steps:
        print('dump_working_tables')
        dump_tables = []
        dump_table_tags = [
            'SLIDE_STEP_1_TABLE', 'SLIDE_STEP_2_TABLE', 'ALIQUOT_STEP_1_TABLE',
            'ALIQUOT_STEP_2_TABLE', 'CASE_STEP_1_TABLE', 'CASE_STEP_2_TABLE',
            'UNION_TABLE'
        ]
        for tag in dump_table_tags:
            table_name = "{}_{}_{}".format(dataset_tuple[1], build, params[tag])
            if bq_table_exists(params['TARGET_DATASET'], table_name):
                dump_tables.append(table_name)
        for table in dump_tables:
            success = delete_table_bq_job(params['TARGET_DATASET'], table)
            if not success:
                # Best-effort cleanup: a failed delete is reported but not fatal.
                print("problem deleting table {}".format(table))

    #
    # Done!
    #

    return True