import hailtop.batch as hb
from typing import List, Union

# NOTE: `bytes_to_gb` (returns a file's size in GB) is assumed to be defined elsewhere
# in this codebase; it is used below for sizing job storage.


def index_gvcf(b: hb.batch.Batch, input_vcf: hb.resource.ResourceFile, output_vcf_ind_name: str = None,
               memory: int = 3, storage: int = 5, docker_img: str = None, out_dir: str = None):
    """
    Index a GVCF file

    :param b: batch
    :param input_vcf: GVCF file to index
    :param output_vcf_ind_name: output GVCF index name
    :param memory: job memory
    :param storage: storage to use for the job
    :param docker_img: image to use for the job
    :param out_dir: output directory
    :return:
    """
    docker_image = docker_img if docker_img else 'us.gcr.io/broad-gatk/gatk:4.2.0.0'
    outname = output_vcf_ind_name + '.g.vcf.gz.tbi'

    index_gvcf_file = b.new_job(name=f'index-{output_vcf_ind_name}')
    index_gvcf_file.image(docker_image)
    index_gvcf_file.memory(f'{memory}Gi')
    index_gvcf_file.storage(f'{storage}Gi')
    index_gvcf_file.command(f'gatk IndexFeatureFile \
        -I {input_vcf} \
        -O {outname}')
    index_gvcf_file.command(f'mv {outname} {index_gvcf_file.ofile}')
    b.write_output(index_gvcf_file.ofile,
                   f'{out_dir}/merged-gvcf/{output_vcf_ind_name}/{outname}')

    return index_gvcf_file
def merge_vcf(b: hb.batch.Batch, gvcf_list: List = None, output_vcf_name: str = None, merge_vcfs_img: str = None,
              memory: int = 3, out_dir: str = None, storage: int = None):
    # Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs
    docker_image = merge_vcfs_img if merge_vcfs_img else \
        'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'
    outname = output_vcf_name + '.g.vcf.gz'
    # disk_size = bytes_to_gb((inputs_vcfs_list * 2.5)) + 10

    merge_vcf_i = ''

    for line in gvcf_list:
        input_gvcf = b.read_input(line)
        merge_vcf_i += f'I={input_gvcf} \t'

    merge_vcfs = b.new_job(name=output_vcf_name)
    merge_vcfs.image(docker_image)
    merge_vcfs.memory(f'{memory}Gi')
    merge_vcfs.storage(f'{storage}Gi')
    merge_vcfs.command(f'java -Xms2000m -jar /usr/gitc/picard.jar \
        MergeVcfs \
        {merge_vcf_i} \
        O={outname}')
    merge_vcfs.command(f'mv {outname} {merge_vcfs.ofile}')
    b.write_output(merge_vcfs.ofile, f'{out_dir}/merged-gvcf/{output_vcf_name}/{outname}')

    return merge_vcfs
def validate_vcf(b: hb.batch.Batch, input_vcf: hb.resource.ResourceFile, ref_fasta: hb.resource.ResourceGroup,
                 dbsnp_vcf_file: hb.resource.ResourceGroup, calling_int_file: hb.resource.ResourceFile,
                 validate_vcf_img: str = None, memory: int = 7, storage: int = None,
                 output_vcf_ind_name: str = None):
    # Validate the (g)VCF output of HaplotypeCaller
    docker_image = validate_vcf_img if validate_vcf_img else 'us.gcr.io/broad-gatk/gatk:4.2.0.0'
    # ref_size = bytes_to_gb(ref_fasta) + bytes_to_gb(ref_fasta_index) + bytes_to_gb(ref_dict)
    # disk_size = bytes_to_gb(input_vcf) + bytes_to_gb(dbsnp_vcf) + ref_size + 20

    validate_gvcf = b.new_job(name=output_vcf_ind_name)
    validate_gvcf.image(docker_image)
    validate_gvcf.memory(f'{memory}Gi')
    validate_gvcf.storage(f'{storage}Gi')
    validate_gvcf.command(f'gatk IndexFeatureFile \
        -I {input_vcf}')
    validate_gvcf.command(f'gatk --java-options -Xms6000m \
        ValidateVariants \
        -V {input_vcf} \
        -R {ref_fasta.fasta} \
        -L {calling_int_file} \
        -gvcf \
        --validation-type-to-exclude ALLELES \
        --dbsnp {dbsnp_vcf_file.vcf}')

    return validate_gvcf
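# A minimal usage sketch (not part of the pipeline itself): wire merge_vcf, index_gvcf and
# validate_vcf together in one Batch. Every gs:// path, the billing project and the sample
# name are hypothetical placeholders; the reference and dbSNP resource-group keys are chosen
# to match what the helpers above expect (.fasta/.dict and .vcf respectively).
def example_post_process_gvcfs():
    backend = hb.ServiceBackend(billing_project='my-billing-project',
                                remote_tmpdir='gs://my-tmp-bucket/batch')
    b = hb.Batch(backend=backend, name='post-process-gvcfs')
    out_dir = 'gs://my-bucket/out'

    ref = b.read_input_group(**{'fasta': 'gs://my-bucket/ref.fasta',
                                'fasta.fai': 'gs://my-bucket/ref.fasta.fai',
                                'dict': 'gs://my-bucket/ref.dict'})
    dbsnp = b.read_input_group(vcf='gs://my-bucket/dbsnp.vcf.gz',
                               ind='gs://my-bucket/dbsnp.vcf.gz.tbi')
    calling_intervals = b.read_input('gs://my-bucket/wgs_calling_regions.interval_list')

    merged = merge_vcf(b, gvcf_list=['gs://my-bucket/NA12878.scatter1.g.vcf.gz',
                                     'gs://my-bucket/NA12878.scatter2.g.vcf.gz'],
                       output_vcf_name='NA12878', storage=20, out_dir=out_dir)
    index_gvcf(b, input_vcf=merged.ofile, output_vcf_ind_name='NA12878', out_dir=out_dir)
    validate_vcf(b, input_vcf=merged.ofile, ref_fasta=ref, dbsnp_vcf_file=dbsnp,
                 calling_int_file=calling_intervals, storage=30, output_vcf_ind_name='NA12878')
    b.run()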
def scatter_interval_list(b: hb.batch.Batch, interval_list_file: hb.resource.ResourceFile, scatter_count: int = 50,
                          break_bands_at_multiples_of: int = 1000000, scatter_img: str = None, memory: int = 2,
                          out_dir: str = None):
    """
    Break the calling interval list into sub-intervals

    :param b: batch
    :param interval_list_file: one or more interval lists
    :param scatter_count: the number of files into which to scatter the resulting list by locus
    :param break_bands_at_multiples_of: if set to a positive value, will create a new interval list with the
        original intervals broken up at integer multiples of this value. Set to 0 to NOT break up intervals
    :param scatter_img: image to use for the job
    :param memory: job memory
    :param out_dir: output directory
    :return:
    """
    # break the calling interval list into sub-intervals
    docker_image = scatter_img if scatter_img else 'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'

    scatter_list = b.new_job(name='scatter-interval-list')
    scatter_list.image(docker_image)
    scatter_list.cpu(1)
    scatter_list.memory(f'{memory}Gi')
    scatter_list.command('mkdir /scatter_intervals')
    scatter_list.command(f'java -Xms1g -jar /usr/gitc/picard.jar \
        IntervalListTools \
        SCATTER_COUNT={scatter_count} \
        SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \
        UNIQUE=true \
        SORT=true \
        BREAK_BANDS_AT_MULTIPLES_OF={break_bands_at_multiples_of} \
        INPUT={interval_list_file} \
        OUTPUT=/scatter_intervals')
    scatter_list.command('''
cat > my_script.py <<EOF
import sys
import os
import glob

intervals = sorted(glob.glob('/scatter_intervals/*/*.interval_list'))
for i, interval in enumerate(intervals):
    (directory, filename) = os.path.split(interval)
    newName = os.path.join(directory, str(i + 1) + filename)
    os.rename(interval, newName)
EOF
python3 my_script.py
''')
    scatter_list.command(f'mv /scatter_intervals {scatter_list.outfiles}')
    b.write_output(scatter_list.outfiles, f'{out_dir}/scatter-intervals')

    # We return the `scatter_list` Job object that can be used in downstream jobs.
    return scatter_list
def collect_variant_calling_metrics(b: hb.batch.Batch, input_vcf: hb.resource.ResourceGroup,
                                    dbsnp_vcf_file: hb.resource.ResourceGroup, ref_dict: hb.resource.ResourceGroup,
                                    evaluation_int_list: hb.resource.ResourceFile, metrics_basename: str = None,
                                    memory: int = 6, docker_img: str = None, storage: int = None,
                                    out_dir: str = None):
    """
    Collect variant-calling metrics for a GVCF

    :param b: batch
    :param input_vcf: GVCF file to collect variant calling metrics for
    :param dbsnp_vcf_file: DBSNP VCF and its index to use in collecting metrics
    :param ref_dict: reference dictionary file from a reference ResourceGroup (fasta, index, and dict)
    :param evaluation_int_list: evaluation interval list file
    :param metrics_basename: name to be used for output
    :param memory: job memory
    :param docker_img: image to use for the job
    :param storage: storage to use for the job
    :param out_dir: output directory
    :return:
    """
    docker_image = docker_img if docker_img else 'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'

    collect_vc_metrics = b.new_job(name=metrics_basename)
    collect_vc_metrics.image(docker_image)
    collect_vc_metrics.memory(f'{memory}Gi')
    collect_vc_metrics.storage(f'{storage}Gi')
    collect_vc_metrics.command(f'java -Xms2000m -jar /usr/gitc/picard.jar \
        CollectVariantCallingMetrics \
        INPUT={input_vcf.vcf} \
        OUTPUT={metrics_basename} \
        DBSNP={dbsnp_vcf_file.vcf} \
        SEQUENCE_DICTIONARY={ref_dict.dict} \
        TARGET_INTERVALS={evaluation_int_list} \
        GVCF_INPUT=true')
    collect_vc_metrics.command(
        f'mv {metrics_basename}.variant_calling_detail_metrics {collect_vc_metrics.detail}')
    collect_vc_metrics.command(
        f'mv {metrics_basename}.variant_calling_summary_metrics {collect_vc_metrics.summary}')
    b.write_output(
        collect_vc_metrics.detail,
        f'{out_dir}/variant-calling-metrics/{metrics_basename}/{metrics_basename}.variant_calling_detail_metrics')
    b.write_output(
        collect_vc_metrics.summary,
        f'{out_dir}/variant-calling-metrics/{metrics_basename}/{metrics_basename}.variant_calling_summary_metrics')

    return collect_vc_metrics
def haplotype_caller_gatk(b: hb.batch.Batch, input_bam: hb.resource.ResourceGroup,
                          ref_fasta: hb.resource.ResourceGroup, interval_list_file: hb.resource.ResourceFile,
                          bam_filename_no_ext: str = None, out_dir: str = None, interval_list_name: str = None,
                          storage: int = None, contamination: float = None, gatk_img: str = None,
                          memory: float = 6.5, ncpu: int = 2):
    """
    Call germline SNPs and indels

    :param b: batch
    :param input_bam: BAM file
    :param ref_fasta: reference files, including fasta and index
    :param interval_list_file: interval list file with intervals to run variant calling on
    :param bam_filename_no_ext: BAM filename without extension
    :param interval_list_name: interval list name, used to name output GVCF
    :param storage: storage to use for the job
    :param contamination: fraction of contamination in sequencing data to aggressively remove
    :param gatk_img: image to use for the job
    :param out_dir: output directory
    :param memory: job memory
    :param ncpu: number of CPUs
    :return:
    """
    docker_image = gatk_img if gatk_img else 'us.gcr.io/broad-gatk/gatk:4.2.0.0'
    output_file_name = bam_filename_no_ext + '_' + interval_list_name + '.g.vcf.gz'

    variant_calling = b.new_job(name=bam_filename_no_ext)
    variant_calling.image(docker_image)
    variant_calling.cpu(ncpu)
    variant_calling.memory(f'{memory}Gi')
    variant_calling.storage(f'{storage}Gi')
    variant_calling.command(f'gatk --java-options "-Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \
        HaplotypeCaller \
        -R {ref_fasta.fasta} \
        -I {input_bam.bam} \
        -L {interval_list_file} \
        -O {variant_calling.ofile} \
        -contamination {contamination} \
        -G StandardAnnotation -G StandardHCAnnotation -G AS_StandardAnnotation \
        -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \
        -ERC GVCF')
    # variant_calling.command(f'mv {output_file_name} {variant_calling.ofile}')
    b.write_output(variant_calling.ofile,
                   f'{out_dir}/variant-calling/{bam_filename_no_ext}/{output_file_name}')

    return variant_calling
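# Usage sketch (hypothetical paths): fan HaplotypeCaller out over the sub-interval lists that
# scatter_interval_list wrote to {out_dir}/scatter-intervals in a previous run. Listing those
# files with hl.hadoop_ls is an assumption about how the caller discovers them.
def example_scatter_haplotypecaller():
    import hail as hl

    out_dir = 'gs://my-bucket/out'
    b = hb.Batch(name='scatter-haplotypecaller')

    bam = b.read_input_group(bam='gs://my-bucket/NA12878.bam',
                             bai='gs://my-bucket/NA12878.bam.bai')
    ref = b.read_input_group(**{'fasta': 'gs://my-bucket/ref.fasta',
                                'fasta.fai': 'gs://my-bucket/ref.fasta.fai',
                                'dict': 'gs://my-bucket/ref.dict'})

    # scatter-intervals contains one sub-directory per scatter shard, each holding one interval list
    for sub_dir in hl.hadoop_ls(f'{out_dir}/scatter-intervals'):
        for entry in hl.hadoop_ls(sub_dir['path']):
            if not entry['path'].endswith('.interval_list'):
                continue
            interval_file = b.read_input(entry['path'])
            interval_name = entry['path'].split('/')[-1].replace('.interval_list', '')
            haplotype_caller_gatk(b, input_bam=bam, ref_fasta=ref, interval_list_file=interval_file,
                                  bam_filename_no_ext='NA12878', interval_list_name=interval_name,
                                  storage=50, contamination=0.0, out_dir=out_dir)
    b.run()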
def imputation(b: hb.batch.Batch, vcf: str = None, vcf_filename_no_ext: str = None, ref: hb.ResourceGroup = None,
               ref_size: Union[int, float] = None, region: str = None, chromosome: str = None, cpu: int = 8,
               memory: str = 'highmem', img: str = 'docker.io/lindonkambule/gwaspy:v1', threads: int = 7,
               out_dir: str = None):
    # in_vcf = b.read_input(vcf)
    in_vcf = b.read_input_group(**{'bcf': vcf, 'bcf.csi': f'{vcf}.csi'})
    vcf_size = bytes_to_gb(vcf)

    output_file_name = vcf_filename_no_ext + '.imputed.bcf'
    file_dir = vcf_filename_no_ext.split('.')[0]

    disk_size = ref_size + (vcf_size * 4)

    map_file = f'/shapeit4/maps/b38/{chromosome}.b38.gmap.gz'

    impute = b.new_job(name=output_file_name)
    impute.cpu(cpu)
    impute.memory(memory)
    impute.storage(f'{disk_size}Gi')
    impute.image(img)

    cmd = f'''
        impute5_1.1.5_static \
            --h {ref.bcf} \
            --m {map_file} \
            --g {in_vcf.bcf} \
            --r {region} \
            --out-gp-field \
            --o {output_file_name} \
            --threads {threads}
    '''

    impute.command(cmd)
    # index file to use when merging
    impute.command(f'bcftools index {output_file_name}')

    impute.command(f'mv {output_file_name} {impute.ofile}')
    impute.command(f'mv {output_file_name}.csi {impute.ind}')
    b.write_output(impute.ofile,
                   f'{out_dir}/GWASpy/Imputation/{file_dir}/imputed_chunks/{output_file_name}')
    b.write_output(impute.ind,
                   f'{out_dir}/GWASpy/Imputation/{file_dir}/imputed_chunks/{output_file_name}.csi')
def concat_vcfs(b: hb.batch.Batch, vcf_basename: str = None, vcfs_to_merge: List = None, output_type: str = 'vcf',
                chrom: str = None, cpu: int = 16, memory: str = 'standard',
                docker_img: str = 'docker.io/lindonkambule/gwaspy:v1', out_dir: str = None):
    out_type = 'b' if output_type == 'bcf' else 'z'
    vcfs_sizes_sum = 0
    merge_vcf_i = ''

    out_filename = f'{vcf_basename}.{chrom}.merged.bcf' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.merged.vcf.gz'
    out_index_name = f'{vcf_basename}.{chrom}.merged.bcf.csi' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.merged.vcf.gz.csi'

    for line in vcfs_to_merge:
        vcfs_sizes_sum += 2 + bytes_to_gb(line)

    disk_size = int(round(10 + (2 * vcfs_sizes_sum)))
    threads = cpu - 1

    concat = b.new_job(name=f'concat-{vcf_basename}')
    concat.memory(memory)
    concat.storage(f'{disk_size}Gi')
    concat.image(docker_img)
    concat.cpu(cpu)

    for line in vcfs_to_merge:
        input_vcf = b.read_input_group(vcf=line, ind=f'{line}.csi')
        merge_vcf_i += f'{input_vcf.vcf} '

    cmd = f'''
        bcftools concat \
            --no-version \
            --output-type {out_type} \
            --output {out_filename} \
            --threads {threads} \
            {merge_vcf_i}
    '''

    concat.command(cmd)
    # index the merged output
    concat.command(f'bcftools index --force {out_filename}')

    concat.command(f'mv {out_filename} {concat.ofile}')
    concat.command(f'mv {out_index_name} {concat.idx}')
    b.write_output(concat.ofile, f'{out_dir}/GWASpy/{vcf_basename}/Imputation/imputed_merged/{out_filename}')
    b.write_output(concat.idx, f'{out_dir}/GWASpy/{vcf_basename}/Imputation/imputed_merged/{out_index_name}')
def aut_impute(b: hb.batch.Batch, vcf: hb.ResourceGroup = None, vcf_filename_no_ext: str = None,
               ref: hb.ResourceGroup = None, region: str = None, chromosome: str = None, buffer: int = 250,
               storage: int = None, memory: str = None, cpu: int = None,
               img: str = 'docker.io/lindonkambule/gwaspy:v1', out_dir: str = None):
    out_file_name = vcf_filename_no_ext + '.imputed.bcf'
    file_dir = vcf_filename_no_ext.split('.')[0]

    map_file = f'/shapeit4/maps/b38/{chromosome}.b38.gmap.gz'

    threads = cpu - 1

    impute = b.new_job(name=out_file_name)
    impute.cpu(cpu)
    impute.memory(memory)
    impute.storage(f'{storage}Gi')
    impute.image(img)

    cmd = f'''
        impute5_1.1.5_static \
            --h {ref.bcf} \
            --m {map_file} \
            --g {vcf.bcf} \
            --r {region} \
            --out-gp-field \
            --o {out_file_name} \
            --b {buffer} \
            --threads {threads}
    '''

    impute.command(cmd)
    # index file to use when merging
    impute.command(f'bcftools index {out_file_name}')

    impute.command(f'mv {out_file_name} {impute.ofile}')
    impute.command(f'mv {out_file_name}.csi {impute.idx}')
    b.write_output(impute.ofile,
                   f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}')
    b.write_output(impute.idx,
                   f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}.csi')
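# Usage sketch (hypothetical paths, chunk size and resources): run aut_impute over fixed-size
# chunks of one autosome. A follow-up batch can stitch the written chunk files back together
# with the imputation variant of concat_vcfs above, once they exist in out_dir.
def example_impute_chr20_in_chunks():
    b = hb.Batch(name='impute-chr20')
    out_dir = 'gs://my-bucket/out'

    target = b.read_input_group(**{'bcf': 'gs://my-bucket/study.chr20.phased.bcf',
                                   'bcf.csi': 'gs://my-bucket/study.chr20.phased.bcf.csi'})
    ref_panel = b.read_input_group(**{'bcf': 'gs://my-ref/reference.chr20.bcf',
                                      'bcf.csi': 'gs://my-ref/reference.chr20.bcf.csi'})

    chunk_size = 20_000_000
    chrom_len = 64_444_167  # GRCh38 chr20 length
    for start in range(1, chrom_len, chunk_size):
        end = min(start + chunk_size - 1, chrom_len)
        aut_impute(b, vcf=target, vcf_filename_no_ext=f'study.chr20.{start}.{end}',
                   ref=ref_panel, region=f'chr20:{start}-{end}', chromosome='chr20',
                   storage=30, memory='highmem', cpu=8, out_dir=out_dir)
    b.run()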
def clump(batch: hb.batch.Batch, image: str, bfile: hb.resource.ResourceGroup, assoc: hb.resource.ResourceFile,
          chr: int) -> hb.job.Job:
    """
    Clump association results with PLINK.
    https://zzz.bwh.harvard.edu/plink/clump.shtml
    """
    # Define a new job `c` in Batch `batch` with name `clump-CHR`
    c = batch.new_job(name=f'clump-{chr}')

    # TODO: Tell the new `c` job the name of the image you pushed to GCR that contains your Hail script.
    # This image name is defined by the `image` variable passed as an argument to the function.
    c.image(image)

    # Tell Batch to use 1Gi of memory for this job
    c.memory('1Gi')
    # Tell Batch to use 1 cpu for this job
    c.cpu(1)

    # Notice that we can simply call plink here because we put it on the PATH in the Dockerfile.
    # TODO: Fill in the argument <BFILE> which uses the `bfile` argument passed in to the function above.
    #   `bfile` is a resource group and is expected to have three files at a common root name:
    #   {root}.bed, {root}.bim, {root}.fam
    # TODO: Fill in the argument <ASSOC> which uses the `assoc` argument passed in to the function above.
    #   This file has the p-values of the GWAS.
    # TODO: Fill in the argument <CHR> which uses the `chr` argument passed in to the function above.
    #   This tells PLINK to only compute the clumping results for this chromosome; this is how we
    #   achieve parallelism by chromosome.
    c.command(f'''
plink --bfile {bfile} \
      --clump {assoc} \
      --chr {chr} \
      --clump-p1 0.0001 \
      --clump-p2 0.001 \
      --clump-r2 0.5 \
      --clump-kb 1000 \
      --memory 1024 \
      --threads 1

mv plink.clumped {c.clumped} || \
  echo " CHR F SNP BP P TOTAL NSIG S05 S01 S001 S0001 SP2" > {c.clumped}
''')
    # PLINK outputs the results at a hardcoded path, so we move it to a path Batch will know to copy.
    # PLINK doesn't output a file if there are no results, so we make an empty one in that case.

    # We return the `c` Job object that can be used in downstream jobs.
    return c
def scatter_interval_list(b: hb.batch.Batch, interval_list_file: hb.resource.ResourceFile, scatter_count: int = 50,
                          break_bands_at_multiples_of: int = 1000000, scatter_img: str = None, memory: int = 2,
                          out_dir: str = None):
    # break the calling interval list into sub-intervals
    docker_image = scatter_img if scatter_img else 'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'

    scatter_list = b.new_job(name='scatter-interval-list')
    scatter_list.image(docker_image)
    scatter_list.cpu(1)  # this should be lower, check DSP pipeline
    scatter_list.memory(f'{memory}Gi')
    scatter_list.command('mkdir /scatter_intervals')
    scatter_list.command(f'java -Xms1g -jar /usr/gitc/picard.jar \
        IntervalListTools \
        SCATTER_COUNT={scatter_count} \
        SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \
        UNIQUE=true \
        SORT=true \
        BREAK_BANDS_AT_MULTIPLES_OF={break_bands_at_multiples_of} \
        INPUT={interval_list_file} \
        OUTPUT=/scatter_intervals')
    scatter_list.command('''
cat > my_script.py <<EOF
import sys
import os
import glob

intervals = sorted(glob.glob('/scatter_intervals/*/*.interval_list'))
for i, interval in enumerate(intervals):
    (directory, filename) = os.path.split(interval)
    newName = os.path.join(directory, str(i + 1) + filename)
    os.rename(interval, newName)
EOF
python3 my_script.py
''')
    scatter_list.command(f'mv /scatter_intervals {scatter_list.outfiles}')
    b.write_output(scatter_list.outfiles, f'{out_dir}/scatter-intervals')

    # We return the `scatter_list` Job object that can be used in downstream jobs.
    return scatter_list
def run_gwas(batch: hb.batch.Batch, image: str, vcf: hb.resource.ResourceFile,
             phenotypes: hb.resource.ResourceFile) -> hb.job.Job:
    """
    QC data
    Get association test statistics
    Also, export PLINK file
    """
    # TODO: Take the input batch and create a new job object called `gwas`. You can give it the name 'run-gwas',
    # which will be useful when looking at the Batch UI to see what job corresponds to your code.
    gwas = batch.new_job(name='run-gwas')

    # TODO: Tell the new `gwas` job the name of the image you pushed to GCR that contains your Hail script.
    # This image name is defined by the `image` variable passed as an argument to the function.
    gwas.image(image)

    # This is how we tell Batch that we want this job to have 4 cores. This number must match the argument
    # to `gwas_hail.py`, which tells Hail to run in local mode with 4 cores available.
    gwas.cpu(4)

    # This is how we tell Batch that we're defining a new ResourceGroup that is the output of the `gwas` Job.
    # PLINK will output four files here with a common root name. We designate this common file root with `{root}`
    # and hard-code the extensions pertaining to each file. Now we can reference the common file root as `gwas.ofile`.
    # To reference the bim file specifically, we can use `gwas.ofile.bim` or `gwas.ofile['bim']`.
    gwas.declare_resource_group(ofile={
        'bed': '{root}.bed',
        'bim': '{root}.bim',
        'fam': '{root}.fam',
        'assoc': '{root}.assoc'
    })

    # The command definition below uses f-strings. The contents in between curly braces ({, }) are evaluated
    # as Python expressions.
    # TODO: Fill in the <PATH> to the Python script `gwas_hail.py` with its location in the Docker image specified above.
    # TODO: Fill in the argument <VCF> which represents the VCF file we passed in to the function above.
    # TODO: Fill in the argument <OUTPUT_FILE> to the `--output-file` argument below. This should be the file root
    #   of the resource group declared above.
    gwas.command(f'''
python3 /gwas_hail.py \
    --vcf {vcf} \
    --phenotypes {phenotypes} \
    --output-file {gwas.ofile} \
    --cores 4
''')

    # We return the `gwas` Job object that can be used in downstream jobs.
    return gwas
def merge_vcf(b: hb.batch.Batch, gvcf_list: List = None, output_vcf_name: str = None, merge_vcfs_img: str = None,
              memory: int = 3, out_dir: str = None, storage: int = None):
    """
    Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs

    :param b: batch
    :param gvcf_list: list of GVCF files to merge
    :param output_vcf_name: output GVCF name
    :param merge_vcfs_img: image to use for the job
    :param storage: storage to use for the job
    :param out_dir: output directory
    :param memory: job memory
    :return:
    """
    docker_image = merge_vcfs_img if merge_vcfs_img else \
        'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'
    outname = output_vcf_name + '.g.vcf.gz'

    merge_vcf_i = ''

    for line in gvcf_list:
        input_gvcf = b.read_input(line)
        merge_vcf_i += f'I={input_gvcf} \t'

    merge_vcfs = b.new_job(name=output_vcf_name)
    merge_vcfs.image(docker_image)
    merge_vcfs.memory(f'{memory}Gi')
    merge_vcfs.storage(f'{storage}Gi')
    merge_vcfs.command(f'java -Xms2000m -jar /usr/gitc/picard.jar \
        MergeVcfs \
        {merge_vcf_i} \
        O={outname}')
    merge_vcfs.command(f'mv {outname} {merge_vcfs.ofile}')
    b.write_output(merge_vcfs.ofile, f'{out_dir}/merged-gvcf/{output_vcf_name}/{outname}')

    return merge_vcfs
def collect_variant_calling_metrics(b: hb.batch.Batch, input_vcf: hb.resource.ResourceGroup,
                                    dbsnp_vcf_file: hb.resource.ResourceGroup, ref_dict: hb.resource.ResourceGroup,
                                    evaluation_int_list: hb.resource.ResourceFile, metrics_basename: str = None,
                                    memory: int = 6, docker_img: str = None, storage: int = None,
                                    out_dir: str = None):
    docker_image = docker_img if docker_img else 'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'

    collect_vc_metrics = b.new_job(name=metrics_basename)
    collect_vc_metrics.image(docker_image)
    collect_vc_metrics.memory(f'{memory}Gi')
    collect_vc_metrics.storage(f'{storage}Gi')
    collect_vc_metrics.command(f'java -Xms2000m -jar /usr/gitc/picard.jar \
        CollectVariantCallingMetrics \
        INPUT={input_vcf.vcf} \
        OUTPUT={metrics_basename} \
        DBSNP={dbsnp_vcf_file.vcf} \
        SEQUENCE_DICTIONARY={ref_dict.dict} \
        TARGET_INTERVALS={evaluation_int_list} \
        GVCF_INPUT=true')
    # collect_vc_metrics.command(f'ls')
    collect_vc_metrics.command(
        f'mv {metrics_basename}.variant_calling_detail_metrics {collect_vc_metrics.detail}')
    collect_vc_metrics.command(
        f'mv {metrics_basename}.variant_calling_summary_metrics {collect_vc_metrics.summary}')
    b.write_output(
        collect_vc_metrics.detail,
        f'{out_dir}/variant-calling-metrics/{metrics_basename}/{metrics_basename}.variant_calling_detail_metrics')
    b.write_output(
        collect_vc_metrics.summary,
        f'{out_dir}/variant-calling-metrics/{metrics_basename}/{metrics_basename}.variant_calling_summary_metrics')

    return collect_vc_metrics
def cram_to_bam(b: hb.batch.Batch, input_cram_file: str = None, ref_fasta: str = None, ref_dict: str = None,
                ref_ind: str = None, bam_out_name: str = None, memory: int = 15, samtools_image: str = None,
                out_dir: str = None):
    docker_image = samtools_image if samtools_image else 'gcr.io/genomics-tools/samtools'

    out_bam_name = bam_out_name + '.bam'

    output_bam_size: float = bytes_to_gb(input_cram_file) / 0.40
    ref_size: float = bytes_to_gb(ref_fasta) + bytes_to_gb(ref_ind)
    disk_size: int = round(bytes_to_gb(input_cram_file) + output_bam_size + ref_size) + 25

    job_memory = str(memory) + 'Gi'
    job_storage = str(disk_size) + 'Gi'

    crams_to_bams = b.new_job(name=out_bam_name)

    in_cram = b.read_input(input_cram_file)
    fasta = b.read_input_group(**{'fasta': ref_fasta,
                                  'fasta.fai': ref_ind,
                                  'dict': ref_dict})

    crams_to_bams.memory(job_memory)
    crams_to_bams.image(docker_image)
    crams_to_bams.storage(job_storage)
    crams_to_bams.command(f'samtools view -b -T {fasta.fasta} -o {out_bam_name} {in_cram}')
    crams_to_bams.command(f'samtools index {out_bam_name}')
    crams_to_bams.command(f'mv {out_bam_name} {crams_to_bams.bamout}')
    crams_to_bams.command(f'mv {out_bam_name}.bai {crams_to_bams.bamind}')
    b.write_output(crams_to_bams.bamout, f'{out_dir}/BAMS/{out_bam_name}')
    b.write_output(crams_to_bams.bamind, f'{out_dir}/BAMS/{out_bam_name}.bai')

    return crams_to_bams
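# Usage sketch (hypothetical paths): convert a CRAM to an indexed BAM so it can later be fed to
# haplotype_caller_gatk. bytes_to_gb is assumed to work on gs:// paths for the disk-size estimate.
def example_cram_to_bam():
    b = hb.Batch(name='cram-to-bam')
    cram_to_bam(b,
                input_cram_file='gs://my-bucket/NA12878.cram',
                ref_fasta='gs://my-bucket/ref.fasta',
                ref_ind='gs://my-bucket/ref.fasta.fai',
                ref_dict='gs://my-bucket/ref.dict',
                bam_out_name='NA12878',
                out_dir='gs://my-bucket/out')
    b.run()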
def haplotype_caller_gatk(b: hb.batch.Batch, input_bam: hb.resource.ResourceGroup,
                          ref_fasta: hb.resource.ResourceGroup, interval_list_file: hb.resource.ResourceFile,
                          bam_filename_no_ext: str = None, out_dir: str = None, interval_list_name: str = None,
                          storage: int = None, contamination: float = None, gatk_img: str = None,
                          memory: float = 6.5, ncpu: int = 2):
    docker_image = gatk_img if gatk_img else 'us.gcr.io/broad-gatk/gatk:4.2.0.0'
    output_file_name = bam_filename_no_ext + '_' + interval_list_name + '.g.vcf.gz'

    variant_calling = b.new_job(name=bam_filename_no_ext)
    variant_calling.image(docker_image)
    variant_calling.cpu(ncpu)
    variant_calling.memory(f'{memory}Gi')
    variant_calling.storage(f'{storage}Gi')
    variant_calling.command(f'gatk --java-options "-Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \
        HaplotypeCaller \
        -R {ref_fasta.fasta} \
        -I {input_bam.bam} \
        -L {interval_list_file} \
        -O {variant_calling.ofile} \
        -contamination {contamination} \
        -G StandardAnnotation -G StandardHCAnnotation -G AS_StandardAnnotation \
        -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \
        -ERC GVCF')
    # variant_calling.command(f'mv {output_file_name} {variant_calling.ofile}')
    b.write_output(variant_calling.ofile,
                   f'{out_dir}/variant-calling/{bam_filename_no_ext}/{output_file_name}')

    # We return the `variant_calling` Job object that can be used in downstream jobs.
    return variant_calling
def index_gvcf(b: hb.batch.Batch, input_vcf: hb.resource.ResourceFile, output_vcf_ind_name: str = None,
               memory: int = 3, storage: int = 5, docker_img: str = None, out_dir: str = None):
    docker_image = docker_img if docker_img else 'us.gcr.io/broad-gatk/gatk:4.2.0.0'
    outname = output_vcf_ind_name + '.g.vcf.gz.tbi'

    index_gvcf_file = b.new_job(name=f'index-{output_vcf_ind_name}')
    index_gvcf_file.image(docker_image)
    index_gvcf_file.memory(f'{memory}Gi')
    index_gvcf_file.storage(f'{storage}Gi')
    index_gvcf_file.command(f'gatk IndexFeatureFile \
        -I {input_vcf} \
        -O {outname}')
    index_gvcf_file.command(f'mv {outname} {index_gvcf_file.ofile}')
    b.write_output(index_gvcf_file.ofile,
                   f'{out_dir}/merged-gvcf/{output_vcf_ind_name}/{outname}')

    return index_gvcf_file
def validate_vcf(b: hb.batch.Batch, input_vcf: hb.resource.ResourceFile, ref_fasta: hb.resource.ResourceGroup,
                 dbsnp_vcf_file: hb.resource.ResourceGroup, calling_int_file: hb.resource.ResourceFile,
                 validate_vcf_img: str = None, memory: int = 7, storage: int = None,
                 output_vcf_ind_name: str = None):
    """
    Validate the GVCF output of HaplotypeCaller

    :param b: batch
    :param input_vcf: GVCF file to validate
    :param ref_fasta: reference files, including fasta and index
    :param dbsnp_vcf_file: DBSNP VCF and its index to use in the validation
    :param calling_int_file: calling interval file
    :param validate_vcf_img: image to use for the job
    :param memory: job memory
    :param storage: storage to use for the job
    :param output_vcf_ind_name: name to be used for the job
    :return:
    """
    docker_image = validate_vcf_img if validate_vcf_img else 'us.gcr.io/broad-gatk/gatk:4.2.0.0'

    validate_gvcf = b.new_job(name=output_vcf_ind_name)
    validate_gvcf.image(docker_image)
    validate_gvcf.memory(f'{memory}Gi')
    validate_gvcf.storage(f'{storage}Gi')
    validate_gvcf.command(f'gatk IndexFeatureFile \
        -I {input_vcf}')
    validate_gvcf.command(f'gatk --java-options -Xms6000m \
        ValidateVariants \
        -V {input_vcf} \
        -R {ref_fasta.fasta} \
        -L {calling_int_file} \
        -gvcf \
        --validation-type-to-exclude ALLELES \
        --dbsnp {dbsnp_vcf_file.vcf}')

    return validate_gvcf
def merge(batch: hb.batch.Batch, results: List[hb.resource.ResourceFile]) -> hb.job.Job:
    """
    Merge clumped results files together
    """
    # Define a new job `merger` in Batch `batch` with name `merge-results`
    merger = batch.new_job(name='merge-results')

    # Use the ubuntu:18.04 image which Batch caches
    merger.image('ubuntu:18.04')

    # Do some file munging to concatenate all of the clumped results together for all chromosomes
    if results:
        merger.command(f'''
head -n 1 {results[0]} > {merger.ofile}
for result in {" ".join(str(r) for r in results)}
do
    tail -n +2 "$result" >> {merger.ofile}
done
sed -i '/^$/d' {merger.ofile}
''')

    # We return the `merger` Job object that can be used in downstream jobs.
    return merger
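# Usage sketch for the GWAS trio above (run_gwas, clump, merge): one Hail QC/association job,
# clumping fanned out per chromosome, and a final merge of the clumped results. The image name,
# bucket paths and phenotype file are hypothetical placeholders.
def example_gwas_clump_merge():
    backend = hb.ServiceBackend(billing_project='my-billing-project',
                                remote_tmpdir='gs://my-tmp-bucket/batch')
    batch = hb.Batch(backend=backend, name='gwas-clump-merge')

    vcf = batch.read_input('gs://my-bucket/genotypes.vcf.bgz')
    phenotypes = batch.read_input('gs://my-bucket/phenotypes.tsv')

    gwas = run_gwas(batch, image='gcr.io/my-project/1kg-gwas:latest', vcf=vcf, phenotypes=phenotypes)

    clumped_per_chrom = []
    for chrom in range(1, 23):
        c = clump(batch, image='gcr.io/my-project/1kg-gwas:latest', bfile=gwas.ofile,
                  assoc=gwas.ofile.assoc, chr=chrom)
        clumped_per_chrom.append(c.clumped)

    merger = merge(batch, clumped_per_chrom)
    batch.write_output(merger.ofile, 'gs://my-bucket/out/clumped_results.txt')
    batch.run()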
def sex_impute(b: hb.batch.Batch, vcf: hb.ResourceGroup = None, vcf_filename_no_ext: str = None,
               females_list: hb.ResourceFile = None, ref: hb.ResourceGroup = None, region: str = None,
               buffer: int = 250, storage: int = None, memory: str = None, cpu: int = None,
               img: str = 'docker.io/lindonkambule/gwaspy:v1', out_dir: str = None):
    in_females = females_list

    out_file_name = vcf_filename_no_ext + '.imputed.bcf'
    file_dir = vcf_filename_no_ext.split('.')[0]

    threads = cpu - 1

    impute = b.new_job(name=out_file_name)
    impute.cpu(cpu)
    impute.memory(memory)
    impute.storage(f'{storage}Gi')
    impute.image(img)

    start = int(region.split(':')[1].split('-')[0])
    end = int(region.split(':')[1].split('-')[1])

    # A. PAR1 REGION ONLY
    if end <= 2781479:
        # run diploid imputation
        impute.command('echo THIS CHUNK IS IN PAR1 REGION, SO WE WILL RUN DIPLOID IMPUTATION')
        map_file = '/shapeit4/maps/b38/chrX_par1.b38.gmap.gz'

        if start < 10001:
            start_new = 10001
            end_new = end
        else:
            start_new = start
            end_new = end

        new_imp_region = f'chrX:{start_new}-{end_new}'

        cmd = f'''
            impute5_1.1.5_static \
                --h {ref.bcf} \
                --m {map_file} \
                --g {vcf.bcf} \
                --r {new_imp_region} \
                --out-gp-field \
                --o {out_file_name} \
                --b {buffer} \
                --threads {threads}
        '''

        impute.command(cmd)
        # index file to use when merging
        impute.command(f'bcftools index {out_file_name}')

        impute.command(f'mv {out_file_name} {impute.ofile}')
        impute.command(f'mv {out_file_name}.csi {impute.idx}')
        b.write_output(impute.ofile,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}')
        b.write_output(impute.idx,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}.csi')

    # B. PAR2 REGION ONLY
    elif start >= 155701383:
        # run diploid imputation
        impute.command('echo THIS CHUNK IS IN PAR2 REGION, SO WE WILL RUN DIPLOID IMPUTATION')
        map_file = '/shapeit4/maps/b38/chrX_par2.b38.gmap.gz'

        if end > 156030895:
            end_new = 156030895
            start_new = start
        else:
            start_new = start
            end_new = end

        new_imp_region = f'chrX:{start_new}-{end_new}'

        cmd = f'''
            impute5_1.1.5_static \
                --h {ref.bcf} \
                --m {map_file} \
                --g {vcf.bcf} \
                --r {new_imp_region} \
                --out-gp-field \
                --o {out_file_name} \
                --b {buffer} \
                --threads {threads}
        '''

        impute.command(cmd)
        # index file to use when merging
        impute.command(f'bcftools index {out_file_name}')

        impute.command(f'mv {out_file_name} {impute.ofile}')
        impute.command(f'mv {out_file_name}.csi {impute.idx}')
        b.write_output(impute.ofile,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}')
        b.write_output(impute.idx,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}.csi')

    # C. NON-PAR REGION ONLY
    elif (start >= 2781479) & (end <= 155701382):
        map_file = '/shapeit4/maps/b38/chrX.b38.gmap.gz'

        start_new = start
        end_new = end
        new_imp_region = f'chrX:{start_new}-{end_new}'

        # (1) split by sex. NB: THE USER SHOULD SUPPLY A FILE CONTAINING EITHER ONLY FEMALE OR ONLY MALE
        # SAMPLE IDS, WITH ONE SAMPLE ID PER LINE
        cmd_split = f'''
            echo THIS CHUNK IS ONLY IN THE NON-PAR REGION, SO WE WILL SPLIT IT BY SEX BEFORE RUNNING IMPUTATION
            echo GETTING THE ORDER OF SAMPLES
            bcftools query -l {vcf.bcf} > samples_order.txt
            echo SPLITTING SAMPLES BY SEX
            bcftools view -S {in_females} {vcf.bcf} --output-type b --output females.bcf
            bcftools view -S ^{in_females} {vcf.bcf} --output-type b --output males.bcf
            echo INDEXING THE FILES BEFORE RUNNING IMPUTATION
            bcftools index females.bcf
            bcftools index males.bcf
        '''
        impute.command(cmd_split)

        # (2) run imputation separately for females and males
        cmd_impute = f'''
            impute5_1.1.5_static --h {ref.bcf} --m {map_file} --g females.bcf --r {new_imp_region} --out-gp-field \
                --o females.imputed.bcf --b {buffer} --threads {threads}

            impute5_1.1.5_static --h {ref.bcf} --m {map_file} --g males.bcf --r {new_imp_region} --out-gp-field \
                --o males.imputed.bcf --b {buffer} --threads {threads} -- haploid
        '''
        impute.command(cmd_impute)

        # (3) merge the imputed files back together into one chunk
        # (3a) index females and males files to be merged and create a file with these for bcftools
        cmd_index_chunks = f'''
            bcftools index females.imputed.bcf
            bcftools index males.imputed.bcf
            rm females.bcf* males.bcf*
        '''
        impute.command(cmd_index_chunks)

        # (3b) sometimes there are duplicates (raw and imputed with flipped alleles) and this causes an error
        # when merging, so we remove any duplicates
        cmd_sort = f'''
            echo CHECKING FOR DUPLICATE VARIANTS AND REMOVING THEM
            bcftools norm -d any females.imputed.bcf --output-type b --output females.imputed.sorted.bcf
            rm females.imputed.bcf*
            bcftools norm -d any males.imputed.bcf --output-type b --output males.imputed.sorted.bcf
            rm males.imputed.bcf*
            bcftools index females.imputed.sorted.bcf
            bcftools index males.imputed.sorted.bcf
            echo -e "females.imputed.sorted.bcf\nmales.imputed.sorted.bcf" > merge.txt
        '''
        impute.command(cmd_sort)

        # (3c) merge the files into one
        cmd_merge = f'''
            echo MERGING THE MALES AND FEMALES FILE BACK TOGETHER
            bcftools merge --file-list merge.txt --output-type b --output merged.sex.bcf
        '''
        impute.command(cmd_merge)

        # (3d) reorder samples back to how they were initially
        # splitting and re-merging samples changes their order compared to the initial file,
        # so we have to restore the initial sample order for the per-chromosome concat step of imputation
        cmd_order = f'''
            echo ORDERING SAMPLES TO HOW THEY WERE INITIALLY
            bcftools view -S samples_order.txt merged.sex.bcf --output-type b --output {out_file_name}
        '''
        impute.command(cmd_order)

        impute.command(f'bcftools index {out_file_name}')

        impute.command(f'mv {out_file_name} {impute.ofile}')
        impute.command(f'mv {out_file_name}.csi {impute.idx}')
        b.write_output(impute.ofile,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}')
        b.write_output(impute.idx,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}.csi')

    # D. MIXED REGIONS
    else:
        if (start <= 2781479) & (end >= 2781479):
            map_file_par = '/shapeit4/maps/b38/chrX_par1.b38.gmap.gz'
            par_region = f'chrX:{start}-{2781479}'
            non_par_region = f'chrX:{2781479}-{end}'
            # to be used in step (3d)
            cmd_merge_regs = f'''
                echo ORDERING PAR1 SAMPLES TO HOW THEY WERE INITIALLY
                bcftools view -S samples_order.txt par.imputed.bcf --output-type b --output par.imputed.sort.bcf
                bcftools index par.imputed.sort.bcf
                rm par.imputed.bcf*
                echo MERGING THE PAR1 AND NON-PAR FILES BACK TOGETHER
                echo -e "par.imputed.sort.bcf\nnonpar.imputed.sort.bcf" > merge_regions.txt
                bcftools concat --naive --file-list merge_regions.txt --output-type b --output {out_file_name}
            '''
        else:
            map_file_par = '/shapeit4/maps/b38/chrX_par2.b38.gmap.gz'
            if end > 156030895:
                par_region = f'chrX:{155701383}-{156030895}'
            else:
                par_region = f'chrX:{155701383}-{end}'
            non_par_region = f'chrX:{start}-{155701382}'
            # to be used in step (3d)
            cmd_merge_regs = f'''
                echo ORDERING PAR2 SAMPLES TO HOW THEY WERE INITIALLY
                bcftools view -S samples_order.txt par.imputed.bcf --output-type b --output par.imputed.sort.bcf
                bcftools index par.imputed.sort.bcf
                rm par.imputed.bcf*
                echo MERGING THE PAR2 AND NON-PAR FILES BACK TOGETHER
                echo -e "nonpar.imputed.sort.bcf\npar.imputed.sort.bcf" > merge_regions.txt
                bcftools concat --naive --file-list merge_regions.txt --output-type b --output {out_file_name}
            '''

        # (1) split out the PAR and non-PAR regions, then split the non-PAR region by sex
        cmd_split = f'''
            echo THIS CHUNK IS IN PAR1/2 and NON-PAR REGION, SO WE WILL SPLIT THESE TWO REGIONS
            echo GETTING THE ORDER OF SAMPLES
            bcftools query -l {vcf.bcf} > samples_order.txt
            echo SPLITTING OUT THE PAR REGION
            bcftools view {vcf.bcf} --regions {par_region} --output-type b --output par.bcf
            echo SPLITTING OUT THE NON-PAR REGION
            bcftools view {vcf.bcf} --regions {non_par_region} --output-type b --output nonpar.bcf
            echo SPLITTING THE NON-PAR REGION BY SEX
            bcftools view -S {in_females} nonpar.bcf --output-type b --output females.bcf
            bcftools view -S ^{in_females} nonpar.bcf --output-type b --output males.bcf
            echo INDEXING THE FILES BEFORE RUNNING IMPUTATION
            bcftools index par.bcf
            bcftools index females.bcf
            bcftools index males.bcf
            rm {vcf.bcf}
            rm nonpar.bcf
        '''
        impute.command(cmd_split)

        # (2) run imputation separately for par, females, and males
        map_file = '/shapeit4/maps/b38/chrX.b38.gmap.gz'
        cmd_impute = f'''
            impute5_1.1.5_static --h {ref.bcf} --m {map_file_par} --g par.bcf --r {par_region} --out-gp-field \
                --o par.imputed.bcf --b {buffer} --threads {threads}

            impute5_1.1.5_static --h {ref.bcf} --m {map_file} --g females.bcf --r {non_par_region} --out-gp-field \
                --o females.imputed.bcf --b {buffer} --threads {threads}

            impute5_1.1.5_static --h {ref.bcf} --m {map_file} --g males.bcf --r {non_par_region} --out-gp-field \
                --o males.imputed.bcf --b {buffer} --threads {threads} -- haploid
        '''
        impute.command(cmd_impute)

        # (3) merge the imputed files back together into one chunk
        # (3a) index par, females, and males files to be merged
        cmd_index_chunks = f'''
            bcftools index par.imputed.bcf
            bcftools index females.imputed.bcf
            bcftools index males.imputed.bcf
        '''
        impute.command(cmd_index_chunks)

        # (3b) sometimes there are duplicates (raw and imputed with flipped alleles) and this causes an error
        # when merging, so we remove any duplicates
        # bcftools view Nepal_PTSD_GSA_Updated_May2021_qced.chrX.phased.bcf | grep "#"
        cmd_sort = f'''
            echo CHECKING FOR DUPLICATE VARIANTS AND REMOVING THEM
            bcftools norm -d any females.imputed.bcf --output-type b --output females.imputed.sorted.bcf
            rm females.imputed.bcf*
            bcftools norm -d any males.imputed.bcf --output-type b --output males.imputed.sorted.bcf
            rm males.imputed.bcf*
            bcftools index females.imputed.sorted.bcf
            bcftools index males.imputed.sorted.bcf
            echo -e "females.imputed.sorted.bcf\nmales.imputed.sorted.bcf" > merge_sex.txt
        '''
        impute.command(cmd_sort)

        # (3c) merge the sex files into one
        cmd_merge_sex = f'''
            echo MERGING THE MALES AND FEMALES FILE BACK TOGETHER
            bcftools merge --file-list merge_sex.txt --output-type b --output nonpar.imputed.bcf
            bcftools index nonpar.imputed.bcf
            echo ORDERING NON-PAR SAMPLES TO HOW THEY WERE INITIALLY
            bcftools view -S samples_order.txt nonpar.imputed.bcf --output-type b --output nonpar.imputed.sort.bcf
            bcftools index nonpar.imputed.sort.bcf
            rm nonpar.imputed.bcf* females.imputed.sorted.bcf* males.imputed.sorted.bcf
        '''
        impute.command(cmd_merge_sex)

        # (3d) merge the non-par and par regions together
        impute.command(cmd_merge_regs)

        impute.command(f'bcftools index {out_file_name}')

        impute.command(f'mv {out_file_name} {impute.ofile}')
        impute.command(f'mv {out_file_name}.csi {impute.idx}')
        b.write_output(impute.ofile,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}')
        b.write_output(impute.idx,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}.csi')
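# Usage sketch (hypothetical paths): impute one chrX chunk with sex_impute. The function picks the
# PAR1 / PAR2 / non-PAR / mixed strategy from the region string; the females list is expected to
# contain one sample ID per line.
def example_impute_chrx_chunk():
    b = hb.Batch(name='impute-chrX-chunk')
    out_dir = 'gs://my-bucket/out'

    target = b.read_input_group(**{'bcf': 'gs://my-bucket/study.chrX.phased.bcf',
                                   'bcf.csi': 'gs://my-bucket/study.chrX.phased.bcf.csi'})
    ref_panel = b.read_input_group(**{'bcf': 'gs://my-ref/reference.chrX.bcf',
                                      'bcf.csi': 'gs://my-ref/reference.chrX.bcf.csi'})
    females = b.read_input('gs://my-bucket/female_sample_ids.txt')

    sex_impute(b, vcf=target, vcf_filename_no_ext='study.chrX.10001.20000000',
               females_list=females, ref=ref_panel, region='chrX:10001-20000000',
               storage=30, memory='highmem', cpu=8, out_dir=out_dir)
    b.run()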
def concat_vcfs(b: hb.batch.Batch, vcf_basename: str = None, vcfs_to_merge: List = None, output_type: str = 'bcf',
                software: str = None, chrom: str = None, docker_img: str = 'docker.io/lindonkambule/gwaspy:v1',
                cpu: int = 8, out_dir: str = None):
    out_type = 'b' if output_type == 'bcf' else 'z'
    threads = cpu - 1
    vcfs_sizes_sum = 0
    merge_vcf_i = ''

    out_filename = f'{vcf_basename}.{chrom}.phased.{software}.bcf' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.phased.{software}.vcf.gz'
    out_index_name = f'{vcf_basename}.{chrom}.phased.{software}.bcf.csi' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.phased.{software}.vcf.gz.csi'

    for line in vcfs_to_merge:
        vcfs_sizes_sum += 1 + bytes_to_gb(line)

    mem = 'highmem' if vcfs_sizes_sum > 2 else 'standard'
    disk_size = 10 + vcfs_sizes_sum

    concat = b.new_job(name=f'concat-{vcf_basename}')
    concat.memory(mem)
    concat.storage(f'{disk_size}Gi')
    concat.image(docker_img)
    concat.cpu(cpu)

    for line in vcfs_to_merge:
        input_vcf = b.read_input_group(vcf=line, ind=f'{line}.csi')
        merge_vcf_i += f'{input_vcf.vcf} '

    cmd = f'''
        bcftools concat \
            --no-version \
            --output-type {out_type} \
            --output {out_filename} \
            --threads {threads} \
            --ligate \
            {merge_vcf_i}
    '''

    concat.command(cmd)
    # index the merged output
    concat.command(f'bcftools index {out_filename}')

    concat.command(f'mv {out_filename} {concat.ofile}')
    concat.command(f'mv {out_index_name} {concat.idx}')
    b.write_output(concat.ofile,
                   f'{out_dir}/GWASpy/{vcf_basename}/Phasing/phased_merged/{out_filename}')
    b.write_output(concat.idx,
                   f'{out_dir}/GWASpy/{vcf_basename}/Phasing/phased_merged/{out_index_name}')
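# Usage sketch (hypothetical chunk paths and naming): ligate per-chunk phased BCFs for one
# chromosome with the phasing variant of concat_vcfs defined directly above. Each chunk is
# expected to have a matching .csi index next to it.
def example_concat_phased_chunks():
    b = hb.Batch(name='concat-phased-chr22')
    chunks = [f'gs://my-bucket/out/GWASpy/study/Phasing/phased_chunks/study.chr22.chunk{i}.shapeit.bcf'
              for i in range(1, 4)]
    concat_vcfs(b, vcf_basename='study', vcfs_to_merge=chunks, output_type='bcf',
                software='shapeit', chrom='chr22', out_dir='gs://my-bucket/out')
    b.run()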