def main(): p, args = parse_args() df = pd.read_table(args.cram_and_tsv_paths_table) if {"sample_id", "output_bamout_bam", "output_bamout_bai", "variants_tsv_bgz"} - set(df.columns): p.error(f"{args.tsv_path} must contain 'sample_id', 'output_bamout_bam', 'variants_tsv_bgz' columns") if args.num_samples_to_process: if args.random: df = df.sample(n=args.num_samples_to_process) else: df = df.iloc[:args.num_samples_to_process] if args.sample_to_process: df = df[df.sample_id.isin(set(args.sample_to_process))] logging.info(f"Processing {len(df)} samples") # check that all buckets are in "US-CENTRAL1" or are multi-regional to avoid egress charges to the Batch cluster batch_utils.set_gcloud_project(GCLOUD_PROJECT) with open("deidentify_bamout.py", "rt") as f: deidentify_bamouts_script = f.read() # process sample(s) if not args.sample_to_process and not args.num_samples_to_process: # if processing entire table, listing all files up front ends up being faster existing_deidentify_output_bams = subprocess.check_output(f"gsutil -m ls {args.output_dir}/*.deidentify_output.bam", shell=True, encoding="UTF-8").strip().split("\n") existing_deidentify_output_sorted_bams = subprocess.check_output(f"gsutil -m ls {args.output_dir}/*.deidentify_output.sorted.bam", shell=True, encoding="UTF-8").strip().split("\n") hl.init(log="/dev/null") with batch_utils.run_batch(args, batch_name=f"deidentify bamouts: {len(df)} samples") as batch: for _, row in tqdm.tqdm(df.iterrows(), unit=" samples"): output_bam_path = os.path.join(args.output_dir, f"{row.sample_id}.deidentify_output.bam") output_sorted_bam_path = os.path.join(args.output_dir, f"{row.sample_id}.deidentify_output.sorted.bam") if args.sample_to_process or args.num_samples_to_process: run_deidentify = args.force or not hl.hadoop_is_file(output_bam_path) run_sort = run_deidentify or not hl.hadoop_is_file(output_sorted_bam_path) else: run_deidentify = args.force or output_bam_path not in existing_deidentify_output_bams run_sort = run_deidentify or output_sorted_bam_path not in existing_deidentify_output_sorted_bams if run_deidentify or run_sort: bamout_stat = hl.hadoop_stat(row.output_bamout_bam) cpu = 0.25 if bamout_stat['size_bytes'] > 0.25 * 20_000_000_000: cpu = 0.5 if bamout_stat['size_bytes'] > 0.5 * 20_000_000_000: cpu = 1 if bamout_stat['size_bytes'] > 1 * 20_000_000_000: cpu = 2 if run_deidentify: j = batch_utils.init_job(batch, f"{row.sample_id} - deidentify - cpu:{cpu}", DOCKER_IMAGE if not args.raw else None, cpu=cpu, disk_size=21*cpu) batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT) local_tsv_path = batch_utils.localize_file(j, row.variants_tsv_bgz, use_gcsfuse=True) local_exclude_tsv_path = batch_utils.localize_file(j, row.exclude_variants_tsv_bgz, use_gcsfuse=True) local_bamout_path = batch_utils.localize_file(j, row.output_bamout_bam, use_gcsfuse=True) batch_utils.localize_file(j, row.output_bamout_bai, use_gcsfuse=True) j.command(f"""echo -------------- echo "Start - time: $(date)" df -kh cat <<EOF > deidentify_bamout.py {deidentify_bamouts_script} EOF time python3 deidentify_bamout.py -x "{local_exclude_tsv_path}" "{row.sample_id}" "{local_bamout_path}" "{local_tsv_path}" ls -lh gsutil -m cp "{row.sample_id}.deidentify_output.bam" {args.output_dir}/ gsutil -m cp "{row.sample_id}.deidentify_output.db" {args.output_dir}/ echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo -------------- """) else: logger.info(f"Skipping deidentify {row.sample_id}...") if 
run_sort: j2 = batch_utils.init_job(batch, f"{row.sample_id} - sort - cpu:{cpu}", DOCKER_IMAGE if not args.raw else None, cpu=cpu) batch_utils.switch_gcloud_auth_to_user_account(j2, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT) if run_deidentify: j2.depends_on(j) local_bamout_path = batch_utils.localize_file(j2, output_bam_path, use_gcsfuse=True) j2.command(f"""echo -------------- echo "Start - time: $(date)" df -kh samtools sort -o "{row.sample_id}.deidentify_output.sorted.bam" "{local_bamout_path}" samtools index "{row.sample_id}.deidentify_output.sorted.bam" ls -lh gsutil -m cp "{row.sample_id}.deidentify_output.sorted.bam" {args.output_dir}/ gsutil -m cp "{row.sample_id}.deidentify_output.sorted.bam.bai" {args.output_dir}/ echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo -------------- """) elif run_sort: logger.info(f"Sorted output files exist (eg. {output_sorted_bam_path}). Skipping sort for {row.sample_id}...")
def main(): p, args = parse_args() df = pd.read_table(args.cram_and_tsv_paths_table) if {"sample_id", "cram_path", "crai_path", "variants_tsv_bgz"} - set( df.columns): p.error( f"{args.tsv_path} must contain 'sample_id', 'cram_path' columns") # check that all buckets are in "US-CENTRAL1" or are multi-regional to avoid egress charges to the Batch cluster batch_utils.set_gcloud_project(GCLOUD_PROJECT) if args.cluster: batch_utils.check_storage_bucket_region(df.cram_path) if not args.force: hl.init(log="/dev/null", quiet=True) # process samples with batch_utils.run_batch(args, batch_name=f"HaplotypeCaller -bamout") as batch: counter = 0 for _, row in tqdm.tqdm(df.iterrows(), unit=" rows", total=len(df)): if args.sample_to_process and row.sample_id not in set( args.sample_to_process): continue input_filename = os.path.basename(row.cram_path) output_prefix = input_filename.replace(".bam", "").replace(".cram", "") output_bam_path = os.path.join(args.output_dir, f"{output_prefix}.bamout.bam") output_bai_path = os.path.join(args.output_dir, f"{output_prefix}.bamout.bai") if not args.force and hl.hadoop_is_file( output_bam_path) and hl.hadoop_is_file(output_bai_path): logger.info( f"Output files exist (eg. {output_bam_path}). Skipping {input_filename}..." ) continue counter += 1 if args.num_samples_to_process and counter > args.num_samples_to_process: break j = batch_utils.init_job(batch, f"readviz: {row.sample_id}", DOCKER_IMAGE if not args.raw else None, args.cpu, args.memory) batch_utils.switch_gcloud_auth_to_user_account( j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT) local_exclude_intervals = batch_utils.localize_file( j, EXCLUDE_INTERVALS) local_fasta = batch_utils.localize_file( j, batch_utils.HG38_REF_PATHS.fasta, use_gcsfuse=True) local_fasta_fai = batch_utils.localize_file( j, batch_utils.HG38_REF_PATHS.fai, use_gcsfuse=True) batch_utils.localize_file(j, batch_utils.HG38_REF_PATHS.dict, use_gcsfuse=True) local_tsv_bgz = batch_utils.localize_file(j, row.variants_tsv_bgz) local_cram_path = batch_utils.localize_file(j, row.cram_path) local_crai_path = batch_utils.localize_file(j, row.crai_path) j.command(f"""echo -------------- echo "Start - time: $(date)" df -kh # 1) Convert variants_tsv_bgz to sorted interval list gunzip -c "{local_tsv_bgz}" | awk '{{ OFS="\t" }} {{ print( "chr"$1, $2, $2 ) }}' | bedtools slop -b {PADDING_AROUND_VARIANT} -g {local_fasta_fai} > variant_windows.bed # Sort the .bed file so that chromosomes are in the same order as in the input_cram file. # Without this, if the input_cram has a different chromosome ordering (eg. chr1, chr10, .. vs. chr1, chr2, ..) # than the interval list passed to GATK tools' -L arg, then GATK may silently skip some of regions in the -L intervals. # The sort is done by first retrieving the input_cram header and passing it to GATK BedToIntervalList. 
java -Xms2g -jar /gatk/gatk.jar PrintReadsHeader \ --gcs-project-for-requester-pays {GCLOUD_PROJECT} \ -R {local_fasta} \ -I "{local_cram_path}" \ -O header.bam java -Xms2g -jar /gatk/gatk.jar BedToIntervalList \ --SORT true \ --SEQUENCE_DICTIONARY header.bam \ --INPUT variant_windows.bed \ --OUTPUT variant_windows.interval_list # 2) Get reads from the input_cram for the intervals in variant_windows.interval_list time java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+DisableAttachMechanism -XX:MaxHeapSize=2000m -Xmx30000m \ -jar /gatk/GATK35.jar \ -T HaplotypeCaller \ -R {local_fasta} \ -I "{local_cram_path}" \ -L variant_windows.interval_list \ -XL {local_exclude_intervals} \ --disable_auto_index_creation_and_locking_when_reading_rods \ -ERC GVCF \ --max_alternate_alleles 3 \ -variant_index_parameter 128000 \ -variant_index_type LINEAR \ --read_filter OverclippedRead \ -bamout "{output_prefix}.bamout.bam" \ -o "{output_prefix}.gvcf" |& grep -v "^DEBUG" bgzip "{output_prefix}.gvcf" tabix "{output_prefix}.gvcf.gz" gsutil -m cp "{output_prefix}.bamout.bam" {args.output_dir} gsutil -m cp "{output_prefix}.bamout.bai" {args.output_dir} gsutil -m cp "{output_prefix}.gvcf.gz" {args.output_dir} gsutil -m cp "{output_prefix}.gvcf.gz.tbi" {args.output_dir} ls -lh echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo -------------- """)
def main():
    p = batch_utils.init_arg_parser(
        default_cpu=NUM_CPU,
        default_memory=NUM_CPU*3.75,
        gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))

    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("--all", action="store_true", help="run all samples")
    grp.add_argument("-s", "--sample", help="process specific sample name(s)", action="append")
    grp.add_argument("-n", "--n-samples", type=int, help="run on the 1st n samples only. Useful for debugging")
    p.add_argument("--offset", type=int, default=0, help="apply this offset before applying -n. Useful for debugging")
    p.add_argument("--model", help="Which DeepTrio model to use", choices={"WES", "WGS", "PACBIO"}, required=True)
    p.add_argument("trios_tsv", help="Trios tsv", default="trios.tsv")
    args = p.parse_args()

    if not os.path.isfile(args.trios_tsv):
        p.error(f"File not found: {args.trios_tsv}")

    if args.trios_tsv.endswith(".xls") or args.trios_tsv.endswith(".xlsx"):
        df = pd.read_excel(args.trios_tsv)
    else:
        df = pd.read_table(args.trios_tsv)

    missing_columns = EXPECTED_COLUMNS - set(df.columns)
    if missing_columns:
        p.error(f"{args.trios_tsv} is missing columns: {missing_columns}")

    if args.n_samples:
        df = df[args.offset:args.offset+args.n_samples]

    if args.sample:
        df = df[df.sample_id.isin(set(args.sample))]
        if len(df) < len(set(filter(None, args.sample))):
            p.error(", ".join(set(args.sample) - set(df.sample_id)) + ": sample ids not found or don't have a bam file path")
        logger.info(f"Processing {len(df)} sample(s): " + ", ".join(list(df.sample_id[:10])))
    else:
        logger.info(f"Processing all {len(df)} samples")

    output_subdir = ".".join(os.path.basename(args.trios_tsv).split(".")[:-1])
    existing_output_files = batch_utils.generate_path_to_file_size_dict(
        os.path.join(OUTPUT_BASE_DIR, f"{output_subdir}/results_*.tar.gz"))

    # process samples
    with batch_utils.run_batch(args, batch_name=f"DeepTrio: " + (", ".join(df.individual_id) if len(df) < 5 else f"{len(df)} trio(s)")) as batch:
        for i, row in df.iterrows():
            name = re.sub(".bam$|.cram$", "", os.path.basename(row.reads))
            name_parent1 = re.sub(".bam$|.cram$", "", os.path.basename(row.parent1_reads))
            name_parent2 = re.sub(".bam$|.cram$", "", os.path.basename(row.parent2_reads))

            output_file = os.path.join(OUTPUT_BASE_DIR, f"{output_subdir}/results_{name}.tar.gz")
            if not args.force and output_file in existing_output_files:
                logger.info(f"Output file exists: {output_file} . Skipping {row.individual_id}...")
                continue

            # init Job
            j = batch_utils.init_job(batch, None, DEEP_TRIO_DOCKER_IMAGE_WITHOUT_GPU if not args.raw else None, cpu=NUM_CPU)
            batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

            # localize files
            local_ref_fasta_path = batch_utils.localize_file(j, row.ref_fasta, use_gcsfuse=True)
            local_reads_path = batch_utils.localize_via_temp_bucket(j, row.reads)
            local_parent1_reads_path = batch_utils.localize_via_temp_bucket(j, row.parent1_reads)
            local_parent2_reads_path = batch_utils.localize_via_temp_bucket(j, row.parent2_reads)

            batch_utils.localize_file(j, row.ref_fasta_fai, use_gcsfuse=True)
            batch_utils.localize_via_temp_bucket(j, row.reads_index)
            batch_utils.localize_via_temp_bucket(j, row.parent1_reads_index)
            batch_utils.localize_via_temp_bucket(j, row.parent2_reads_index)

            local_ref_cache_tar_gz_path = batch_utils.localize_file(
                j, "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.ref_cache.tar.gz", use_gcsfuse=True)

            # --regions chr22:38982347-38992804 \
            j.command(f"""mkdir ref_cache
cd ref_cache
tar xzf {local_ref_cache_tar_gz_path} 2>&1 | grep -v '^tar:' || true
export REF_PATH="/ref_cache/ref/cache/%2s/%2s/%s:http://www.ebi.ac.uk/ena/cram/md5/%s"
export REF_CACHE="/ref_cache/ref/cache/%2s/%2s/%s"

mkdir "/results_{name}"
cd "/results_{name}"

/opt/deepvariant/bin/deeptrio/run_deeptrio \
    --model_type {args.model} \
    --ref {local_ref_fasta_path} \
    --reads_child "{local_reads_path}" \
    --reads_parent1 "{local_parent1_reads_path}" \
    --reads_parent2 "{local_parent2_reads_path}" \
    --output_gvcf_child "variants_{name}.gvcf.gz" \
    --output_gvcf_parent1 "variants_{name_parent1}.gvcf.gz" \
    --output_gvcf_parent2 "variants_{name_parent2}.gvcf.gz" \
    --output_vcf_child "variants_{name}.vcf.gz" \
    --output_vcf_parent1 "variants_{name_parent1}.vcf.gz" \
    --output_vcf_parent2 "variants_{name_parent2}.vcf.gz" \
    --sample_name_child "{name}" \
    --sample_name_parent1 "{name_parent1}" \
    --sample_name_parent2 "{name_parent2}" \
    --vcf_stats_report

rm *.gvcf.gz*

cd /
tar czf "results_{name}.tar.gz" "/results_{name}"
gsutil -m cp "results_{name}.tar.gz" {output_file}""")
def combine_splice_junctions(args, batch, batch_name, SJ_out_tab_paths, save_individual_tables, normalize_read_counts, output_dir):
    output_filename = f"combined.{batch_name}.{len(SJ_out_tab_paths)}_samples.SJ.out.tsv.gz"
    output_path = os.path.join(output_dir, output_filename)
    output_path_exists = hl.hadoop_is_file(output_path)

    output_filename2 = output_filename.replace(".SJ.out.tsv.gz", ".junctions.bed.gz")
    output_path2 = os.path.join(output_dir, output_filename2)
    output_path2_exists = hl.hadoop_is_file(output_path2)

    if not args.force and output_path_exists and output_path2_exists:
        logger.info(f"Output files \n{output_path} and \n{output_path2} exist.\nSkipping...")
        return

    j = batch_utils.init_job(
        batch,
        f"combine junctions: {batch_name} ({len(SJ_out_tab_paths)} files)",
        DOCKER_IMAGE if not args.raw else None,
        args.cpu,
        args.cpu * 3.75)
    batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

    if args.force or not output_path_exists:
        local_SJ_out_tab_paths = []
        for SJ_out_tab_path in SJ_out_tab_paths:
            local_path = batch_utils.localize_file(j, SJ_out_tab_path, use_gcsfuse=False)
            local_SJ_out_tab_paths.append(local_path)

        #local_gencode_gff_path = batch_utils.localize_file(j, "gencode.v26.annotation.gff3.gz", use_gcsfuse=False)

        save_individual_tables_option = "--save-individual-tables" if save_individual_tables else ""
        normalize_read_counts_option = "--normalize-read-counts" if normalize_read_counts else ""

        local_SJ_out_tab_paths = " ".join(local_SJ_out_tab_paths)
        j.command(
            f"python3 -u combine_splice_junctions_using_pandas.py --add-sample-id-column -n 20 "
            f"{save_individual_tables_option} "
            f"{normalize_read_counts_option} "
            f"{local_SJ_out_tab_paths}")
        j.command(f"mv combined.{len(SJ_out_tab_paths)}_samples.SJ.out.tsv.gz {output_filename}")
        j.command(f"gsutil -m cp {output_filename} {output_dir}")

        input_path_for_step2 = output_filename
    else:
        logger.info(f"Output file {output_path} exists. Skipping...")
        local_path = batch_utils.localize_file(j, output_path, use_gcsfuse=False)
        j.command(f"mv {local_path} .")
        input_path_for_step2 = os.path.basename(local_path)

    if args.force or not output_path2_exists:
        j.command(
            f"python3 -u convert_SJ_out_tab_to_junctions_bed.py "
            f"-g gencode.v35.annotation.gff3.gz "
            f"{input_path_for_step2}")
        j.command(f"gsutil -m cp {output_filename2}* {output_dir}")
    else:
        logger.info(f"Output file {output_path2} exists. Skipping...")

    logger.info(f"Output: {output_path}")
    logger.info(f"Output2: {output_path2}")
def add_command_to_combine_dbs(j, output_db_filename, input_db_paths, select_chrom=None, set_combined_bamout_id=None, create_index=False, temp_dir="./temp"):
    sqlite_queries = []
    sqlite_queries.append(
        'CREATE TABLE "variants" ('
            '"id" INTEGER NOT NULL PRIMARY KEY, '
            '"chrom" VARCHAR(2) NOT NULL, '
            '"pos" INTEGER NOT NULL, '
            '"ref" TEXT NOT NULL, '
            '"alt" TEXT NOT NULL, '
            '"zygosity" INTEGER NOT NULL, '
            '"qual" INTEGER NOT NULL, '
            '"combined_bamout_id" TEXT, '
            '"read_group_id" INTEGER NOT NULL);')

    column_names_string = "chrom, pos, ref, alt, zygosity, qual, combined_bamout_id, read_group_id"
    where_clause = f'WHERE chrom="{select_chrom}"' if select_chrom else ""
    for input_db_path in input_db_paths:
        sqlite_queries.append(
            f'ATTACH "{input_db_path}" as toMerge; '
            f'BEGIN; '
            f'INSERT INTO variants ({column_names_string}) SELECT {column_names_string} FROM toMerge.variants {where_clause}; '
            f'COMMIT; '
            f'DETACH toMerge;')

    if set_combined_bamout_id:
        sqlite_queries.append(f'UPDATE variants SET combined_bamout_id="{set_combined_bamout_id}";')

    if create_index:
        sqlite_queries.append('CREATE INDEX variant_index ON "variants" ("chrom", "pos", "ref", "alt", "zygosity", "qual");')

    sqlite_queries = "\n".join(sqlite_queries)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    if not os.path.isdir(temp_dir):
        os.mkdir(temp_dir)

    sqlite_queries_filename = os.path.join(
        os.path.normpath(temp_dir),
        f"sqlite_queries__{output_db_filename}__merge_{len(input_db_paths)}_dbs__{timestamp}.sql")
    with open(sqlite_queries_filename, "wt") as f:
        f.write(sqlite_queries)

    sqlite_queries_temp_google_bucket_path = os.path.join("gs://gnomad-bw2/", sqlite_queries_filename)
    #hl.hadoop_copy(sqlite_queries_filename, sqlite_queries_temp_google_bucket_path)
    local_sqlite_queries_file_path = batch_utils.localize_file(j, sqlite_queries_temp_google_bucket_path)

    j.command(f"""echo --------------
echo "Start - time: $(date)"
df -kh
ls -lh
wc -l {local_sqlite_queries_file_path}

time sqlite3 {output_db_filename} < {local_sqlite_queries_file_path}
""")
def combine_db_files_in_group_for_chrom(args, batch, combined_bamout_id, group, chrom_to_combine_db_jobs, input_db_size_dict, existing_combined_dbs, temp_dir="./temp"):
    # check how much disk will be needed
    try:
        chr1_db_size_estimate = 0
        for _, row in group.iterrows():
            # multiply by 0.1 because chr1 is < 10% of the genome
            chr1_db_size_estimate += input_db_size_dict[f"{INPUT_BAM_BUCKET}/{row.sample_id}.deidentify_output.db"] * 0.1
    except Exception as e:
        logger.error(f"ERROR in group {combined_bamout_id}: {e}. Unable to combine dbs for group {combined_bamout_id}. Skipping...")
        return 1

    for chrom in ALL_CHROMOSOMES:
        cpu = 0.25
        if chr1_db_size_estimate > 0.25 * 20_000_000_000:
            cpu = 0.5
        if chr1_db_size_estimate > 0.5 * 20_000_000_000:
            cpu = 1

        combined_db_filename = f"{combined_bamout_id}.chr{chrom}.db"
        output_db_path = os.path.join(args.output_dir, combined_db_filename)

        if args.db_names_to_process and combined_db_filename not in args.db_names_to_process:
            continue

        if not args.force and output_db_path in existing_combined_dbs:
            logger.info(f"Combined db already exists: {output_db_path}. Skipping combine db step for {combined_bamout_id}...")
            continue

        j2 = batch_utils.init_job(
            batch,
            f"combine dbs (cpu: {cpu}): {combined_db_filename}",
            DOCKER_IMAGE if not args.raw else None,
            cpu)
        batch_utils.switch_gcloud_auth_to_user_account(j2, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)
        chrom_to_combine_db_jobs[chrom].append(j2)

        local_input_db_paths = []
        for _, row in group.iterrows():
            local_input_db_path = batch_utils.localize_file(
                j2, f"{INPUT_BAM_BUCKET}/{row.sample_id}.deidentify_output.db", use_gcsfuse=False)
            local_input_db_paths.append(local_input_db_path)

        add_command_to_combine_dbs(
            j2, combined_db_filename, local_input_db_paths,
            select_chrom=chrom, set_combined_bamout_id=combined_bamout_id, create_index=False, temp_dir=temp_dir)
        j2.command(f"gsutil -m cp {combined_db_filename} {args.output_dir}/")

    return 0
def combine_bam_files_in_group(args, batch, combined_bamout_id, group, input_bam_size_dict, existing_combined_bamout_bams):
    output_bam_path = os.path.join(args.output_dir, f"{combined_bamout_id}.bam")
    if not args.force and output_bam_path in existing_combined_bamout_bams:
        logger.info(f"Combined bam already exists: {output_bam_path}. Skipping {combined_bamout_id}...")
        return 0

    # check how much disk will be needed
    total_bam_size = 0
    try:
        for _, row in group.iterrows():
            total_bam_size += input_bam_size_dict[f"{INPUT_BAM_BUCKET}/{row.sample_id}.deidentify_output.sorted.bam"]
    except Exception as e:
        logger.error(f"ERROR in group {combined_bamout_id}: {e}. Unable to combine bams for group {combined_bamout_id}. Skipping...")
        return 1

    cpu = 0.25
    if total_bam_size > 0.25 * 20_000_000_000:
        cpu = 0.5
    if total_bam_size > 0.5 * 20_000_000_000:
        cpu = 1
    if total_bam_size > 1 * 20_000_000_000:
        cpu = 2
    if total_bam_size > 2 * 20_000_000_000:
        cpu = 4

    j = batch_utils.init_job(
        batch,
        f"combine bams (cpu: {cpu}): {combined_bamout_id}",
        DOCKER_IMAGE if not args.raw else None,
        cpu)
    batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)

    for_loop_bam_list = ""
    picard_merge_bam_inputs = ""
    for _, row in group.iterrows():
        local_input_bam_path = batch_utils.localize_file(
            j, f"{INPUT_BAM_BUCKET}/{row.sample_id}.deidentify_output.sorted.bam", use_gcsfuse=True)
        #local_input_bai_path = batch_utils.localize_file(j, f"{INPUT_BAM_BUCKET}/{row.sample_id}.deidentify_output.sorted.bam.bai", use_gcsfuse=True)
        for_loop_bam_list += f" '{local_input_bam_path}'"
        picard_merge_bam_inputs += f" -I '{os.path.basename(local_input_bam_path).replace(' ', '_').replace(':', '_')}' "

    j.command(f"""echo --------------
echo "Start - time: $(date)"
df -kh

# create symlinks to make the filenames shorter, so the merge command doesn't get too long
for p in {for_loop_bam_list}; do
    ln -s "${{p}}" $(basename "${{p}}" | sed "s/ /_/g" | sed "s/:/_/g")
done
ls -lh

# run the merge command
java -jar /gatk/gatk.jar MergeSamFiles --VALIDATION_STRINGENCY SILENT --ASSUME_SORTED --CREATE_INDEX {picard_merge_bam_inputs} -O {combined_bamout_id}.bam

gsutil -m cp {combined_bamout_id}.bam {combined_bamout_id}.bai {args.output_dir}/

echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo --------------
""")

    return 0