def haplotype_caller(core_req=16,
                     mem_req=29 * 1024,
                     in_bams=find('bam$', n='>0'),
                     in_bais=find('bai$', n='>0'),
                     in_target_bed=find('target.bed'),
                     out_vcf=out_dir('raw_variants.g.vcf')):
    in_bams = bam_list_to_inputs(in_bams)
    intervals = arg('--intervals', in_target_bed)
    return r"""
        {gatk} \
        -T HaplotypeCaller \
        -R {s[ref][reference_fasta]} \
        -D {s[ref][dbsnp_vcf]} \
        -nct {core_req} \
        --emitRefConfidence GVCF \
        -I {in_bams} \
        -o {out_vcf} \
        {intervals} \
        -A Coverage \
        -A GCContent \
        -A AlleleBalanceBySample \
        -A AlleleBalance \
        -A MappingQualityRankSumTest \
        -A InbreedingCoeff \
        -A FisherStrand \
        -A QualByDepth
    """.format(s=s, gatk=gatk(mem_req), **locals())


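# The helpers used above -- gatk(), bam_list_to_inputs(), and arg() -- are
# defined elsewhere in the repo.  The sketches below are inferred from their
# call sites, not copied from the real code: the jar-path key is a guess, and
# the underscore names are used to avoid shadowing the real helpers.
def _gatk_sketch(mem_req):
    # Presumably renders the GATK invocation with a Java heap sized in MB.
    return 'java -Xmx{mem}m -jar {jar}'.format(mem=mem_req, jar=s['opt']['gatk'])


def _bam_list_to_inputs_sketch(in_bams):
    # The templates already contain one leading "-I ", so joining with " -I "
    # yields "-I a.bam -I b.bam" once substituted.
    return ' -I '.join(map(str, in_bams))


def _arg_sketch(flag, value=None):
    # Renders "flag value" when a value is present, else the empty string, so
    # optional arguments like --intervals drop out of the command cleanly.
    return '%s %s' % (flag, value) if value is not None else ''

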
def realigner_target_creator(core_req=8,
                             mem_req=8 * 1024,
                             in_target_bed=find('target.bed'),
                             in_bams=find('bam$', n='>0'),
                             in_bais=find('bai$', n='>0'),
                             out_bams=forward('in_bams'),
                             out_bais=forward('in_bais'),
                             out_sites=out_dir('denovo_realign_targets.bed')):
    in_bams = bam_list_to_inputs(in_bams)
    if s['ref']['version'] == 'b37':
        in_knowns = s['ref']['1kg_indel_vcf'], s['ref']['mills_vcf']
    elif s['ref']['version'] == 'hg38':
        in_knowns = [s['ref']['mills_and_1kg_indel_vcf']]
    else:
        raise ValueError('Unsupported reference version: %s' % s['ref']['version'])

    # TODO should we pad intervals?  There might be indels on the perimeter
    #   that need realignment.  Not too worried, though, because we're using
    #   HaplotypeCaller downstream.
    return r"""
        # Could add more knowns from ESP and other sequencing projects...
        {gatk} \
        -T RealignerTargetCreator \
        -R {s[ref][reference_fasta]} \
        -I {in_bams} \
        -o {out_sites} \
        {knowns} \
        -nt {core_req} \
        {args}
    """.format(s=s,
               gatk=gatk(mem_req),
               args=arg('--intervals', in_target_bed),
               knowns=' '.join('-known %s' % p for p in in_knowns),
               **locals())


def haplotype_caller(core_req=16,
                     mem_req=12 * 1024,
                     in_bams=find('bam$', n='>0'),
                     in_bais=find('bai$', n='>0'),
                     in_target_bed=find('target.bed'),
                     out_vcf=out_dir('raw_variants.g.vcf')):
    in_bams = bam_list_to_inputs(in_bams)
    intervals = arg('--intervals', in_target_bed)
    return r"""
        {gatk} \
        -T HaplotypeCaller \
        -R {s[ref][reference_fasta]} \
        -D {s[ref][dbsnp_vcf]} \
        -nct {core_req} \
        --emitRefConfidence GVCF \
        -stand_call_conf 30 \
        -stand_emit_conf 10 \
        -I {in_bams} \
        -o {out_vcf} \
        {intervals} \
        -A Coverage \
        -A GCContent \
        -A AlleleBalanceBySample \
        -A AlleleBalance \
        -A MappingQualityRankSumTest \
        -A InbreedingCoeff \
        -A FisherStrand \
        -A QualByDepth
    """.format(s=s, gatk=gatk(mem_req), **locals())


def indel_realigner(core_req=4,  # proxy for mem_req until mem_req is tested out
                    mem_req=8 * 1024,
                    contig=None,
                    in_bams=find('bam$', n='>0'),
                    in_bais=find('bai$', n='>0'),
                    in_sites=find('denovo_realign_targets.bed'),
                    out_bam=out_dir('realigned.bam'),
                    out_bai=out_dir('realigned.bai')):
    in_bams = bam_list_to_inputs(in_bams)
    if s['ref']['version'] == 'b37':
        in_knowns = s['ref']['1kg_indel_vcf'], s['ref']['mills_vcf']
    elif s['ref']['version'] == 'hg38':
        in_knowns = [s['ref']['mills_and_1kg_indel_vcf']]
    else:
        raise ValueError('Unsupported reference version: %s' % s['ref']['version'])

    return r"""
        # IndelRealigner does not support parallelization
        {gatk} \
        -T IndelRealigner \
        -R {s[ref][reference_fasta]} \
        -I {in_bams} \
        -o {out_bam} \
        -targetIntervals {in_sites} \
        {knowns} \
        -model USE_READS \
        --filter_bases_not_stored \
        {intervals}

        {s[opt][samtools]} index {out_bam}
    """.format(s=s,
               intervals=arg('--intervals', contig),
               gatk=gatk(mem_req),
               knowns=' '.join('-known %s' % p for p in in_knowns),
               **locals())


def trim_galore(in_fastq1=find(r'fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='1')),
                in_fastq2=find(r'fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='2')),
                out_directory=out_dir(''),
                out_fastq1=out_dir('trimmed_r1.fastq.gz'),
                out_fastq2=out_dir('trimmed_r2.fastq.gz')):
    return r"""
        {s[opt][trim_galore]} \
        --paired \
        --dont_gzip \
        -o {out_directory} \
        --path_to_cutadapt {s[opt][cutadapt]} \
        {in_fastq1} {in_fastq2}
    """.format(s=s, **locals())


def filter_bed_by_contig(contig,
                         drm='local',
                         in_bed=find('bed$'),
                         out_bed=out_dir('target.bed')):
    return r"""
        grep -P "^{contig}\t" {in_bed} > {out_bed}
    """.format(**locals())


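# Hypothetical usage, with explicit arguments standing in for the
# find()/out_dir() defaults (paths invented; template whitespace trimmed):
#
#   filter_bed_by_contig('20', in_bed='capture.bed', out_bed='target.bed')
#   # -> grep -P "^20\t" capture.bed > target.bed

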
def word_count(chars=False,
               in_txts=find('txt$', n='>=1'),
               out_txt=out_dir('wc.txt')):
    c = ' -c' if chars else ''
    return 'wc{c} {input} > {out_txt}'.format(input=' '.join(map(str, in_txts)),
                                              **locals())


def freebayes(reference_fasta=settings['ref']['reference_fasta'],
              max_complex_gap=2,
              no_complex=True,
              in_target_bed=find('bed$'),
              in_bam=find('bam$'),
              out_vcf=out_dir('variants.vcf')):
    return r"""
        {s[opt][freebayes]} -f {reference_fasta} \
        --vcf {out_vcf} \
        --targets {in_target_bed} \
        {args} \
        -m 30 -q 10 -R 0 -S 0 -F 0.1 \
        {in_bam}
    """.format(s=settings,
               args=args(('--max-complex-gap', max_complex_gap),
                         ('--no-complex', no_complex)),
               **locals())


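# freebayes() above and cut_adapt() below pass (flag, value) tuples to an
# args() helper defined elsewhere in the repo.  Judging from
# ('--no-complex', no_complex) taking a boolean, it presumably emits the bare
# flag for True, "flag value" for other truthy values, and nothing for
# False/None.  A sketch under those assumptions (not the real implementation):
def _args_sketch(*pairs):
    rendered = []
    for flag, value in pairs:
        if value is True:
            rendered.append(flag)
        elif value not in (None, False):
            rendered.append('%s %s' % (flag, value))
    return ' '.join(rendered)

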
def sam_to_fastq_interleave(in_bam=find('bam$'),
                            out_fastq=out_dir('reads.fastq')):
    return r"""
        {picard} SamToFastq \
        I={in_bam} \
        FASTQ={out_fastq} \
        INTERLEAVE=true
    """.format(picard=picard(), **locals())


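# The Picard wrappers in this file shell out through a picard() helper defined
# elsewhere in the repo.  A minimal sketch of what it presumably renders; the
# jar-path key and the default heap size are assumptions, not taken from the
# real code:
def _picard_sketch(mem_req=4 * 1024):
    return 'java -Xmx{mem}m -jar {jar}'.format(mem=mem_req, jar=s['opt']['picard'])

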
def cut_adapt(minimum_length=50,
              in_fastq1=find(r'fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='1')),
              in_fastq2=find(r'fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='2')),
              out_fastq1=out_dir('trimmed_r1.fastq.gz'),
              out_fastq2=out_dir('trimmed_r2.fastq.gz')):
    # out_fastq1 = '>( gzip > %s)' % out_fastq1
    # out_fastq2 = '>( gzip > %s)' % out_fastq2
    return r"""
        {s[opt][cutadapt]} \
        -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \
        -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \
        {args} \
        -o {out_fastq1} -p {out_fastq2} \
        {in_fastq1} {in_fastq2}
    """.format(s=s,
               args=args(('--minimum-length', minimum_length)),
               **locals())


def split_fastq_file(num_chunks,
                     prefix,
                     out_fastqs,
                     in_fastq=find(r'fq.gz|\.fastq|fastq.gz')):
    return r"""
        python {b} {in_fastq} {prefix} {num_chunks}
    """.format(b=bin('fastq/split_fastq_file.py'), **locals())


def ngsutils_fastq_split(num_chunks,
                         prefix,
                         in_fastq=find(r'fq.gz|\.fastq|fastq.gz')):
    """
    Doesn't work with streams :(
    """
    return r"""
        {s[opt][ngsutils]}/fastqutils split {in_fastq} {prefix} {num_chunks} -gz
    """.format(s=s, **locals())


def fastq_to_sam(rgid, sample_name, library, platform, platform_unit,
                 in_fastq1=find('.fastq', tags=dict(read_pair='1')),
                 in_fastq2=find('.fastq', tags=dict(read_pair='2')),
                 out_bam=out_dir('unaligned.bam')):
    return r"""
        {picard} FastqToSam \
        FASTQ={in_fastq1} \
        FASTQ2={in_fastq2} \
        O={out_bam} \
        SAMPLE_NAME={sample_name} \
        LIBRARY_NAME={library} \
        PLATFORM_UNIT={platform_unit} \
        PLATFORM={platform} \
        READ_GROUP_NAME={rgid}
    """.format(picard=picard(), **locals())


def collect_variant_calling_metrics(in_vcf=find('vcf$'),
                                    in_dbsnp=s['ref']['dbsnp_vcf'],
                                    out_path=out_dir('picard.variant_metrics')):
    return r"""
        {picard} CollectVariantCallingMetrics \
        I={in_vcf} \
        DBSNP={in_dbsnp} \
        O={out_path}
    """.format(picard=picard(), **locals())


def merge_sam_files(in_bams=find('bam', n='>=1'),
                    out_bam=out_dir('merged.bam'),
                    out_bai=out_dir('merged.bai')):
    return r"""
        {picard} MergeSamFiles \
        {inputs} \
        O={out_bam} \
        ASSUME_SORTED=True \
        CREATE_INDEX=True
    """.format(picard=picard(), inputs=list_to_input(in_bams), **locals())


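# list_to_input() is another repo helper; since the Picard templates contain a
# bare {inputs} placeholder, it presumably renders the full repeated I=
# parameter list.  A sketch of that assumed behavior:
def _list_to_input_sketch(files):
    return ' '.join('I=%s' % f for f in files)  # e.g. "I=a.bam I=b.bam"

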
def mark_illumina_adapters(mem_req=8 * 1024,
                           in_bam=find('bam'),
                           out_bam=out_dir('unaligned_trimmed.bam'),
                           out_metrics=out_dir('adapter.metrics')):
    return r"""
        {picard} MarkIlluminaAdapters \
        I={in_bam} \
        O={out_bam} \
        METRICS={out_metrics}
    """.format(picard=picard(), **locals())


def bwa_mem(rgid, sample_name, library, platform, platform_unit,
            reference=s['ref']['reference_fasta'],
            core_req=16,
            in_fastqs=find('.fastq|.fq|.fq.gz|.fastq.gz', n=2),
            out_cutadapt_log=out_dir('cutadapt.log'),
            out_bam=out_dir('aligned.bam'),
            out_bai=out_dir('aligned.bai')):
    in_fastq1, in_fastq2 = in_fastqs
    return r"""
        {s[opt][bwa]} mem \
        -t {bwa_cores} -L 0 -M \
        -R "@RG\tID:{rgid}\tLB:{library}\tSM:{sample_name}\tPL:{platform}\tPU:{platform_unit}" \
        {reference} \
        {in_fastq1} \
        {in_fastq2} \
        | {picard} SortSam I=/dev/stdin O={out_bam} CREATE_INDEX=true SORT_ORDER=coordinate
    """.format(s=s, bwa_cores=core_req - 2, picard=picard.picard(), **locals())


# @can_stream(['in_fastq1', 'in_fastq2'])
# def bwa_mem_with_trimming(rgid, sample_name, library, platform, platform_unit,
#                           reference=s['ref']['reference_fasta'],
#                           core_req=16,
#                           in_fastq1=find('.fastq', tags=dict(read_pair='1')),
#                           in_fastq2=find('.fastq', tags=dict(read_pair='2')),
#                           out_bam=out_dir('aligned.bam'),
#                           out_bai=out_dir('aligned.bam.bai'),
#                           out_adapter_metrics=out_dir('adapter.metrics')):
#     return r"""
#         {fastq_to_sam} \
#         | {mark_illumina_adapters} \
#         | {sam_to_fastq} \
#         | {s[opt][bwa]} mem \
#         -t {core_req} -L 0 -M -p \
#         -R "@RG\tID:{rgid}\tLB:{library}\tSM:{sample_name}\tPL:{platform}\tPU:{platform_unit}" \
#         {reference} \
#         /dev/stdin \
#         | {s[opt][samtools]} sort -@ 2 -m 2G - {samtools_out}
#
#         {s[opt][samtools]} index {out_bam}
#     """.format(s=s,
#                fastq_to_sam=picard.fastq_to_sam(rgid=rgid, sample_name=sample_name, library=library,
#                                                 platform=platform, platform_unit=platform_unit,
#                                                 in_fastq1=in_fastq1, in_fastq2=in_fastq2,
#                                                 out_bam='/dev/stdout').strip(),
#                mark_illumina_adapters=picard.mark_illumina_adapters(in_bam='/dev/stdin', out_bam='/dev/stdout',
#                                                                     metrics=out_adapter_metrics).strip(),
#                sam_to_fastq=picard.sam_to_fastq_interleave('/dev/stdin', '/dev/stdout'),
#                samtools_out=out_bam.replace('.bam', ''),
#                **locals())


def mark_duplicates(core_req=4,  # for scratch space
                    mem_req=12 * 1024,
                    in_bams=find('bam$', n='>=1'),
                    in_bais=find('bai$', n='>=1'),
                    out_bam=out_dir('deduped.bam'),
                    out_bai=out_dir('deduped.bam.bai'),
                    out_metrics=out_dir('deduped.metrics')):
    return r"""
        {picard} MarkDuplicates \
        {inputs} \
        O={out_bam} \
        METRICS_FILE={out_metrics} \
        ASSUME_SORTED=True \
        MAX_RECORDS_IN_RAM=1000000 \
        VALIDATION_STRINGENCY=SILENT \
        VERBOSITY=INFO

        {s[opt][samtools]} index {out_bam}
    """.format(inputs=list_to_input(in_bams), s=s, picard=picard(), **locals())


def merge(in_bams=find('bam$', n='>0'),
          out_bam=out_dir('merged.bam')):
    if len(in_bams) == 1:
        # Can't merge a single BAM; just copy it.
        return r"""
            cp {in_bams[0]} {out_bam}
        """.format(**locals())
    else:
        in_bams = ' '.join(map(str, in_bams))
        return r"""
            {s[opt][samtools]} merge -f {out_bam} {in_bams}
        """.format(s=s, **locals())


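# Hypothetical usage of merge(), with explicit arguments standing in for the
# find()/out_dir() defaults (paths invented; template whitespace trimmed):
#
#   merge(in_bams=['a.bam', 'b.bam'], out_bam='merged.bam')
#   # -> {s[opt][samtools]} merge -f merged.bam a.bam b.bam, rendered
#   merge(in_bams=['only.bam'], out_bam='merged.bam')
#   # -> cp only.bam merged.bam

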
def collect_wgs_metrics(in_bam=find('bam'),
                        out_path=out_dir('picard.raw_wgs_metrics.txt'),
                        reference_fasta=s['ref']['reference_fasta']):
    return r"""
        {picard} CollectRawWgsMetrics \
        I={in_bam} \
        O={out_path} \
        R={reference_fasta} \
        INCLUDE_BQ_HISTOGRAM=true
    """.format(picard=picard(), **locals())


def fastqc(core_req=8,
           in_r1s=find(r'fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='1')),
           in_r2s=find(r'fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='2')),
           out_dir=out_dir('fastqc/')):
    assert len(in_r1s) == len(in_r2s)

    # FastQC does not support streaming, so cat all of the fastqs together
    # (catting compressed files together seems fine) and decompress if
    # necessary before reading from /dev/stdin.
    # TODO make sure we are concatenating on local temp disk if available.  For
    #   the usual S3 option this is fine, since we're already in a tmp dir.
    # TODO stream from S3 into the cat command when input files start with s3://
    cat = 'cat {fqs} | {zcat_or_cat}'.format(
        fqs=' '.join(map(str, in_r1s + in_r2s)),
        zcat_or_cat='zcat' if '.gz' in str(in_r1s[0]) else 'cat')

    return r"""
        mkdir -p {out_dir}

        {cat} | \
        {s[opt][fastqc]} \
        --threads {core_req} \
        --dir {s[gk][tmp_dir]} \
        -o {out_dir} \
        /dev/stdin
    """.format(s=s, **locals())


def combine_gvcfs(mem_req=12 * 1024,
                  in_vcfs=find('vcf|vcf.gz$', n='>0'),
                  out_vcf=out_dir('variants.g.vcf')):
    in_vcfs = vcf_list_to_input(in_vcfs)
    return r"""
        {gatk} \
        -T CombineGVCFs \
        -R {s[ref][reference_fasta]} \
        {in_vcfs} \
        -o {out_vcf}
    """.format(s=s, gatk=gatk(mem_req), **locals())


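# vcf_list_to_input() mirrors list_to_input() for GATK's repeated --variant
# argument; the CombineGVCFs/GenotypeGVCFs templates substitute its result as a
# bare placeholder.  A sketch of the assumed behavior:
def _vcf_list_to_input_sketch(in_vcfs):
    return ' '.join('--variant %s' % v for v in map(str, in_vcfs))

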
def genotype_gvcfs(core_req=8,
                   mem_req=12 * 1024,
                   in_vcfs=find('vcf|vcf.gz$', n='>0'),
                   out_vcf=out_dir('variants.vcf')):
    return r"""
        {gatk} \
        -T GenotypeGVCFs \
        -R {s[ref][reference_fasta]} \
        -D {s[ref][dbsnp_vcf]} \
        -nt {core_req} \
        {inputs} \
        -o {out_vcf}
    """.format(s=s, gatk=gatk(mem_req), inputs=vcf_list_to_input(in_vcfs), **locals())


def fastqc(core_req=8,
           in_r1s=find(r'fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='1')),
           in_r2s=find(r'fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='2')),
           out_dir=out_dir('fastqc/')):
    assert len(in_r1s) == len(in_r2s)

    return r"""
        mkdir -p {out_dir}

        {s[opt][fastqc]} \
        --threads {core_req} \
        --dir {s[gk][tmp_dir]} \
        -o {out_dir} \
        {fqs}
    """.format(s=s, fqs=' '.join(map(str, in_r1s + in_r2s)), **locals())


def collect_multiple_metrics(in_bam=find('bam'),
                             out_path=out_dir('picard'),
                             reference_fasta=s['ref']['reference_fasta']):
    return r"""
        {picard} CollectMultipleMetrics \
        I={in_bam} \
        O={out_path} \
        R={reference_fasta} \
        {programs}
    """.format(picard=picard(),
               programs=' '.join('PROGRAM=%s' % p for p in [
                   'CollectAlignmentSummaryMetrics',
                   'CollectInsertSizeMetrics',
                   'QualityScoreDistribution',
                   'MeanQualityByCycle',
                   'CollectBaseDistributionByCycle',
                   'CollectGcBiasMetrics',
                   'CollectSequencingArtifactMetrics',
                   'CollectQualityYieldMetrics',
                   'CollectWgsMetrics',
               ]),
               **locals())


def select_variants(select_type,
                    in_vcfs=find('vcf|vcf.gz$', n='>0'),
                    out_vcf=out_dir('variants.vcf'),
                    in_reference_fasta=s['ref']['reference_fasta'],
                    mem_req=6 * 1024):
    """
    :param select_type: "SNP" or "INDEL"
    """
    return r"""
        {gatk} \
        -T SelectVariants \
        -R {in_reference_fasta} \
        {inputs} \
        -selectType {select_type} \
        -o {out_vcf}
    """.format(gatk=gatk(mem_req), inputs=vcf_list_to_input(in_vcfs), **locals())


def cat(in_txts=find('txt$', n='>=1'),
        out_txt=out_dir('cat.txt')):
    return 'cat {input_str} > {out_txt}'.format(input_str=' '.join(map(str, in_txts)),
                                                **locals())


def md5sum(in_file=find('.*', n=1),
           out_md5=out_dir('checksum.md5')):
    out_md5.basename = in_file.basename + '.md5'
    return 'md5sum {in_file} > {out_md5}'.format(**locals())


def paste(in_txts=find('txt$', n='>=1'),
          out_txt=out_dir('paste.txt')):
    return 'paste {input} > {out_txt}'.format(input=' '.join(map(str, in_txts)),
                                              **locals())


def view(f,
         in_bam=find('bam$'),
         out_bam=out_dir('reads.bam')):
    return '{s[opt][samtools]} view -f {f} -h {in_bam} > {out_bam}'.format(s=s, **locals())


def vcf_concat_parts(in_vcfs=find('vcf$', n='>0'),
                     out_vcf=out_dir('freebayes.vcf')):
    return r"""
        {s[opt][vcf_concat_parts]} {vcfs} > {out_vcf}
    """.format(s=settings, vcfs=' '.join(map(str, in_vcfs)), **locals())