def merge_vcf(b: hb.batch.Batch, gvcf_list: Optional[List[str]] = None,
              output_vcf_name: Optional[str] = None,
              merge_vcfs_img: Optional[str] = None, memory: int = 3,
              out_dir: Optional[str] = None, storage: Optional[int] = None):
    """
    Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs.

    NOTE(review): an identical ``merge_vcf`` is defined again later in this
    file and shadows this one at import time — one of the two should be
    removed.

    :param b: Batch to which the merge job is added
    :param gvcf_list: list of GVCF file paths to merge
    :param output_vcf_name: basename for the merged output GVCF (also used
        as the job name)
    :param merge_vcfs_img: Docker image for the job; defaults to the Broad
        genomes-in-the-cloud image
    :param memory: job memory in Gi
    :param out_dir: root output directory the merged GVCF is written under
    :param storage: job storage in Gi; when None, no storage request is made
        (previously this produced the invalid spec ``'NoneGi'``)
    :return: the Batch job that performs the merge
    """
    docker_image = merge_vcfs_img if merge_vcfs_img else \
        'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'
    outname = output_vcf_name + '.g.vcf.gz'

    # Localise each input GVCF into the batch and build the repeated
    # I=<file> arguments expected by picard MergeVcfs.
    merge_vcf_i = ''
    for line in gvcf_list:
        input_gvcf = b.read_input(line)
        merge_vcf_i += f'I={input_gvcf} \t'

    merge_vcfs = b.new_job(name=output_vcf_name)
    merge_vcfs.image(docker_image)
    merge_vcfs.memory(f'{memory}Gi')
    if storage is not None:
        # Only request storage when a size was actually supplied; the
        # parameter defaults to None and 'NoneGi' is not a valid spec.
        merge_vcfs.storage(f'{storage}Gi')
    merge_vcfs.command(f'java -Xms2000m -jar /usr/gitc/picard.jar \
        MergeVcfs \
        {merge_vcf_i} \
        O={outname}')
    merge_vcfs.command(f'mv {outname} {merge_vcfs.ofile}')
    b.write_output(merge_vcfs.ofile,
                   f'{out_dir}/merged-gvcf/{output_vcf_name}/{outname}')

    return merge_vcfs
def merge_vcf(b: hb.batch.Batch, gvcf_list: Optional[List[str]] = None,
              output_vcf_name: Optional[str] = None,
              merge_vcfs_img: Optional[str] = None, memory: int = 3,
              out_dir: Optional[str] = None, storage: Optional[int] = None):
    """
    Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs.

    :param b: batch to which the merge job is added
    :param gvcf_list: list of GVCF files to merge
    :param output_vcf_name: output GVCF name (also used as the job name)
    :param merge_vcfs_img: image to use for the job; defaults to the Broad
        genomes-in-the-cloud image
    :param storage: storage to use for the job, in Gi; when None, no storage
        request is made (previously this produced the invalid spec 'NoneGi')
    :param out_dir: output directory
    :param memory: job memory in Gi
    :return: the Batch job that performs the merge
    """
    docker_image = merge_vcfs_img if merge_vcfs_img else \
        'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'
    outname = output_vcf_name + '.g.vcf.gz'

    # Localise each input GVCF into the batch and build the repeated
    # I=<file> arguments expected by picard MergeVcfs.
    merge_vcf_i = ''
    for line in gvcf_list:
        input_gvcf = b.read_input(line)
        merge_vcf_i += f'I={input_gvcf} \t'

    merge_vcfs = b.new_job(name=output_vcf_name)
    merge_vcfs.image(docker_image)
    merge_vcfs.memory(f'{memory}Gi')
    if storage is not None:
        # Only request storage when a size was actually supplied; the
        # parameter defaults to None and 'NoneGi' is not a valid spec.
        merge_vcfs.storage(f'{storage}Gi')
    merge_vcfs.command(f'java -Xms2000m -jar /usr/gitc/picard.jar \
        MergeVcfs \
        {merge_vcf_i} \
        O={outname}')
    merge_vcfs.command(f'mv {outname} {merge_vcfs.ofile}')
    b.write_output(merge_vcfs.ofile,
                   f'{out_dir}/merged-gvcf/{output_vcf_name}/{outname}')

    return merge_vcfs
def cram_to_bam(b: hb.batch.Batch, input_cram_file: str = None, ref_fasta: str = None,
                ref_dict: str = None, ref_ind: str = None, bam_out_name: str = None,
                memory: int = 15, samtools_image: str = None, out_dir: str = None):
    """
    Convert a CRAM file to an indexed BAM using samtools.

    :param b: batch to which the conversion job is added
    :param input_cram_file: path to the input CRAM
    :param ref_fasta: reference FASTA the CRAM was compressed against
    :param ref_dict: reference sequence dictionary
    :param ref_ind: reference FASTA index (.fai)
    :param bam_out_name: basename for the output BAM (also the job name)
    :param memory: job memory in Gi
    :param samtools_image: image for the job; defaults to
        gcr.io/genomics-tools/samtools
    :param out_dir: output directory; BAM and index land under {out_dir}/BAMS/
    :return: the Batch job that performs the conversion
    """
    docker_image = samtools_image if samtools_image else 'gcr.io/genomics-tools/samtools'
    out_bam_name = bam_out_name + '.bam'

    # Disk sizing: the BAM is estimated at 2.5x the CRAM (cram / 0.40),
    # plus the input itself, the reference files, and 25 Gi of head-room.
    cram_size = bytes_to_gb(input_cram_file)
    output_bam_size: float = cram_size / 0.40
    ref_size: float = bytes_to_gb(ref_fasta) + bytes_to_gb(ref_ind)
    disk_size: int = round(cram_size + output_bam_size + ref_size) + 25

    # Localise the CRAM and the reference bundle into the batch.
    in_cram = b.read_input(input_cram_file)
    fasta = b.read_input_group(**{'fasta': ref_fasta,
                                  'fasta.fai': ref_ind,
                                  'dict': ref_dict})

    crams_to_bams = b.new_job(name=out_bam_name)
    crams_to_bams.image(docker_image)
    crams_to_bams.memory(f'{memory}Gi')
    crams_to_bams.storage(f'{disk_size}Gi')

    # Decode to BAM, index it, then move both outputs to resource files so
    # they can be written out of the job.
    crams_to_bams.command(
        f'samtools view -b -T {fasta.fasta} -o {out_bam_name} {in_cram}')
    crams_to_bams.command(f'samtools index {out_bam_name}')
    crams_to_bams.command(f'mv {out_bam_name} {crams_to_bams.bamout}')
    crams_to_bams.command(f'mv {out_bam_name}.bai {crams_to_bams.bamind}')

    b.write_output(crams_to_bams.bamout, f'{out_dir}/BAMS/{out_bam_name}')
    b.write_output(crams_to_bams.bamind, f'{out_dir}/BAMS/{out_bam_name}.bai')

    return crams_to_bams