def indel_realigner(core_req=4,  # proxy for mem_req until i test mem_req out
                    mem_req=8 * 1024,
                    contig=None,
                    in_bams=find('bam$', n='>0'),
                    in_bais=find('bai$', n='>0'),
                    in_sites=find('denovo_realign_targets.bed'),
                    out_bam=out_dir('realigned.bam'),
                    out_bai=out_dir('realigned.bai')):
    in_bams = bam_list_to_inputs(in_bams)

    if s['ref']['version'] == 'b37':
        in_knowns = s['ref']['1kg_indel_vcf'], s['ref']['mills_vcf']
    elif s['ref']['version'] == 'hg38':
        in_knowns = [s['ref']['mills_and_1kg_indel_vcf']]

    return r"""
    # IR does not support parallelization
    {gatk} \
    -T IndelRealigner \
    -R {s[ref][reference_fasta]} \
    -I {in_bams} \
    -o {out_bam} \
    -targetIntervals {in_sites} \
    {knowns} \
    -model USE_READS \
    --filter_bases_not_stored \
    {intervals}

    {s[opt][samtools]} index {out_bam}
    """.format(s=s,
               intervals=arg('--intervals', contig),
               gatk=gatk(mem_req),
               knowns=' '.join('-known %s' % p for p in in_knowns),
               **locals())
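# --- Helper sketch (assumption) ---------------------------------------------
# The GATK wrappers in this module call bam_list_to_inputs() and arg(), which are defined
# elsewhere in the project.  The definitions below are only a minimal sketch inferred from
# the call sites (e.g. "-I {in_bams}" and arg('--intervals', contig)); the real helpers may differ.

def bam_list_to_inputs(in_bams):
    # Join BAM paths so the template's single "-I {in_bams}" expands to
    # "-I a.bam -I b.bam ..." for any number of inputs.
    return ' -I '.join(map(str, in_bams))


def arg(flag, value=None):
    # Render an optional flag: empty string when value is None,
    # e.g. arg('--intervals', 'chr20') -> '--intervals chr20'.
    return '%s %s' % (flag, value) if value is not None else ''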
def merge_sam_files(in_bams=find('bam', n='>=1'),
                    out_bam=out_dir('merged.bam'),
                    out_bai=out_dir('merged.bai')):
    return r"""
    {picard} MergeSamFiles \
    {inputs} \
    O={out_bam} \
    ASSUME_SORTED=True \
    CREATE_INDEX=True
    """.format(picard=picard(), inputs=list_to_input(in_bams), **locals())
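# --- Helper sketch (assumption) ---------------------------------------------
# list_to_input() is used by the Picard wrappers (MergeSamFiles above, MarkDuplicates below) but is
# defined elsewhere.  This is a plausible minimal version based on Picard's repeated-argument
# syntax; the project's real helper may differ.

def list_to_input(in_bams):
    # Render each BAM as its own INPUT= argument, e.g. "INPUT=a.bam INPUT=b.bam".
    return ' '.join('INPUT=%s' % b for b in in_bams)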
def mark_illumina_adapters(mem_req=8 * 1024,
                           in_bam=find('bam'),
                           out_bam=out_dir('unaligned_trimmed.bam'),
                           out_metrics=out_dir('adapter.metrics')):
    return r"""
    {picard} MarkIlluminaAdapters \
    I={in_bam} \
    O={out_bam} \
    METRICS={out_metrics}
    """.format(s=s, picard=picard(), **locals())
def bwa_mem(rgid, sample_name, library, platform, platform_unit,
            reference=s['ref']['reference_fasta'],
            core_req=16,
            in_fastqs=find('.fastq|.fq|.fq.gz|.fastq.gz', n=2),
            out_cutadapt_log=out_dir('cutadapt.log'),
            out_bam=out_dir('aligned.bam'),
            out_bai=out_dir('aligned.bai')):
    in_fastq1, in_fastq2 = in_fastqs
    fifo1 = out_bam.replace('aligned.bam', 'fifo1')
    fifo2 = out_bam.replace('aligned.bam', 'fifo2')

    return r"""
    {s[opt][bwa]} mem \
    -t {bwa_cores} -L 0 -M \
    -R "@RG\tID:{rgid}\tLB:{library}\tSM:{sample_name}\tPL:{platform}\tPU:{platform_unit}" \
    {reference} \
    {in_fastq1} \
    {in_fastq2} \
    | {picard} SortSam I=/dev/stdin O={out_bam} CREATE_INDEX=true SORT_ORDER=coordinate
    """.format(s=s, bwa_cores=core_req - 2, picard=picard.picard(), **locals())


# @can_stream(['in_fastq1', 'in_fastq2'])
# def bwa_mem_with_trimming(rgid, sample_name, library, platform, platform_unit,
#                           reference=s['ref']['reference_fasta'],
#                           core_req=16,
#                           in_fastq1=find('.fastq', tags=dict(read_pair='1')),
#                           in_fastq2=find('.fastq', tags=dict(read_pair='2')),
#                           out_bam=out_dir('aligned.bam'),
#                           out_bai=out_dir('aligned.bam.bai'),
#                           out_adapter_metrics=out_dir('adapter.metrics')):
#     return r"""
#     {fastq_to_sam} \
#     | {mark_illumina_adapters} \
#     | {sam_to_fastq} \
#     | {s[opt][bwa]} mem \
#     -t {core_req} -L 0 -M -p \
#     -R "@RG\tID:{rgid}\tLB:{library}\tSM:{sample_name}\tPL:{platform}\tPU:{platform_unit}" \
#     {reference} \
#     /dev/stdin \
#     | {s[opt][samtools]} sort -@ 2 -m 2G - {samtools_out}
#
#     {s[opt][samtools]} index {out_bam}
#     """.format(s=s,
#                fastq_to_sam=picard.fastq_to_sam(rgid=rgid, sample_name=sample_name, library=library,
#                                                 platform=platform, platform_unit=platform_unit,
#                                                 in_fastq1=in_fastq1, in_fastq2=in_fastq2,
#                                                 out_bam='/dev/stdout').strip(),
#                mark_illumina_adapters=picard.mark_illumina_adapters(in_bam='/dev/stdin', out_bam='/dev/stdout',
#                                                                     metrics=out_adapter_metrics).strip(),
#                sam_to_fastq=picard.sam_to_fastq_interleave('/dev/stdin', '/dev/stdout'),
#
#                samtools_out=out_bam.replace('.bam', ''),
#                **locals())
def trim_galore(in_fastq1=find('fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='1')),
                in_fastq2=find('fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='2')),
                out_directory=out_dir(''),
                out_fastq1=out_dir('trimmed_r1.fastq.gz'),
                out_fastq2=out_dir('trimmed_r2.fastq.gz')):
    return r"""
    {s[opt][trim_galore]} \
    --paired \
    --dont_gzip \
    -o {out_directory} \
    --path_to_cutadapt {s[opt][cutadapt]} \
    {in_fastq1} {in_fastq2}
    """.format(s=s, **locals())
def haplotype_caller(core_req=16,
                     mem_req=12 * 1024,
                     in_bams=find('bam$', n='>0'),
                     in_bais=find('bai$', n='>0'),
                     in_target_bed=find('target.bed'),
                     out_vcf=out_dir('raw_variants.g.vcf')):
    in_bams = bam_list_to_inputs(in_bams)
    intervals = arg('--intervals', in_target_bed)

    return r"""
    {gatk} \
    -T HaplotypeCaller \
    -R {s[ref][reference_fasta]} \
    -D {s[ref][dbsnp_vcf]} \
    -nct {core_req} \
    --emitRefConfidence GVCF \
    -stand_call_conf 30 \
    -stand_emit_conf 10 \
    -I {in_bams} \
    -o {out_vcf} \
    {intervals} \
    -A Coverage \
    -A GCContent \
    -A AlleleBalanceBySample \
    -A AlleleBalance \
    -A MappingQualityRankSumTest \
    -A InbreedingCoeff \
    -A FisherStrand \
    -A QualByDepth
    """.format(s=s, gatk=gatk(mem_req), **locals())
def word_count(chars=False,
               in_txts=find('txt$', n='>=1'),
               out_txt=out_dir('wc.txt')):
    c = ' -c' if chars else ''
    return 'wc{c} {input} > {out_txt}'.format(input=' '.join(map(str, in_txts)), **locals())
def realigner_target_creator(core_req=8,
                             mem_req=8 * 1024,
                             in_target_bed=find('target.bed'),
                             in_bams=find('bam$', n='>0'),
                             in_bais=find('bai$', n='>0'),
                             out_bams=forward('in_bams'),
                             out_bais=forward('in_bais'),
                             out_sites=out_dir('denovo_realign_targets.bed')):
    in_bams = bam_list_to_inputs(in_bams)

    if s['ref']['version'] == 'b37':
        in_knowns = s['ref']['1kg_indel_vcf'], s['ref']['mills_vcf']
    elif s['ref']['version'] == 'hg38':
        in_knowns = [s['ref']['mills_and_1kg_indel_vcf']]

    # TODO should we pad intervals?  There might be indels on the perimeter that need realignment.
    # TODO Not too worried because we're using HaplotypeCaller, though.
    return r"""
    # could add more knowns from ESP and other seq projects...
    {gatk} \
    -T RealignerTargetCreator \
    -R {s[ref][reference_fasta]} \
    -I {in_bams} \
    -o {out_sites} \
    {knowns} \
    -nt {core_req} \
    {args}
    """.format(s=s,
               gatk=gatk(mem_req),
               args=arg('--intervals', in_target_bed),
               knowns=' '.join('-known %s' % p for p in in_knowns),
               **locals())
def sam_to_fastq_interleave(in_bam=find('bam$'), out_fastq=out_dir('reads.fastq')):
    return r"""
    {picard} SamToFastq \
    I={in_bam} \
    FASTQ={out_fastq}
    """.format(s=s, picard=picard(), **locals())
def haplotype_caller(core_req=16,
                     mem_req=29 * 1024,
                     in_bams=find('bam$', n='>0'),
                     in_bais=find('bai$', n='>0'),
                     in_target_bed=find('target.bed'),
                     out_vcf=out_dir('raw_variants.g.vcf')):
    in_bams = bam_list_to_inputs(in_bams)
    intervals = arg('--intervals', in_target_bed)

    return r"""
    {gatk} \
    -T HaplotypeCaller \
    -R {s[ref][reference_fasta]} \
    -D {s[ref][dbsnp_vcf]} \
    -nct {core_req} \
    --emitRefConfidence GVCF \
    -I {in_bams} \
    -o {out_vcf} \
    {intervals} \
    -A Coverage \
    -A GCContent \
    -A AlleleBalanceBySample \
    -A AlleleBalance \
    -A MappingQualityRankSumTest \
    -A InbreedingCoeff \
    -A FisherStrand \
    -A QualByDepth
    """.format(s=s, gatk=gatk(mem_req), **locals())
def filter_bed_by_contig(contig, drm='local', in_bed=find('bed$'), out_bed=out_dir('target.bed')):
    return r"""
    grep -P "^{contig}\t" {in_bed} > {out_bed}
    """.format(s=s, **locals())
def run_germline(execution, max_cores, max_attempts, target_bed, input_path=None, s3fs=None):
    """
    Executes the germline variant calling pipeline

    :type execution: Execution
    :param str target_bed: The target bed to call variants in
    :param str input_path: The path to the input_file tsv of fastq files
    """
    #: chrom -> target_bed_path
    target_bed = os.path.abspath(os.path.expanduser(target_bed))
    input_path = os.path.abspath(os.path.expanduser(input_path))

    # Copy the target.bed to the output_dir
    assert os.path.exists(target_bed), '%s does not exist' % target_bed
    cp_target_bed_task = execution.add_task(
        lambda drm='local', out_bed=out_dir('target.bed'): 'cp %s %s' % (target_bed, out_bed),
        out_dir='', stage_name='Copy_Target_Bed')

    target_bed_tasks = [execution.add_task(bed.filter_bed_by_contig, dict(contig=contig),
                                           [cp_target_bed_task], 'work/contigs/{contig}')
                        for contig in util.get_bed_contigs(target_bed)]

    fastq_tasks = list(util.gen_fastq_tasks(execution, input_path))
    # fastq_tasks = [execution.add_task(load_input, dict(in_file=fastq_path, **tags), stage_name='Load_Fastqs')
    #                for fastq_path, tags in parse_inputs(input_path)]

    fastqc_tasks = many2one(fastqc.fastqc, fastq_tasks, ['sample_name', 'library'],
                            out_dir='SM_{sample_name}/qc/LB_{library}')

    # fastq_tasks = split_large_fastq_files(execution, fastq_tasks)  # not working yet
    aligned_tasks = align(execution, fastq_tasks, target_bed_tasks)
    call_task = variant_call(execution, aligned_tasks, target_bed_tasks)

    execution.run(max_cores=max_cores, max_attempts=max_attempts,
                  cmd_wrapper=make_s3_cmd_fxn_wrapper(s3fs) if s3fs else shared_fs_cmd_fxn_wrapper)

    if execution.successful:
        execution.log.info('Final vcf: %s' % opj(s3fs if s3fs else execution.output_dir,
                                                 call_task.output_files[0]))

    # Copy the sqlite db to s3
    dburl = env.config['gk']['database_url']
    if s3fs and dburl.startswith('sqlite'):
        # TODO implement so there is a 1-to-1 relationship between a sqlite database and an Execution.
        # TODO Currently this is pushing way too much information, but will soon be replaced.
        # TODO Alternative: use amazon RDS!  Or perhaps both?  Could do a sqlalchemy merge and save to
        # TODO sqlite, or implement cosmos multiverse
        s3cmd.cp(dburl.replace('sqlite:///', ''), opj(s3fs, 'sqlite.db.backup'))
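# A hypothetical example of the fastq input TSV that parse_inputs()/gen_fastq_tasks() consume.
# The exact column names and order are an assumption; they mirror the tags used downstream
# (sample_name, library, platform, platform_unit, rgid, read_pair, chunk) plus the fastq path,
# which may be a local path or an s3:// URL.
#
#   sample_name  library  platform  platform_unit  rgid  read_pair  chunk  fastq_path
#   NA12878      lib1     ILLUMINA  FLOWCELL.1     rg1   1          1      s3://my-bucket/NA12878_R1.fastq.gz
#   NA12878      lib1     ILLUMINA  FLOWCELL.1     rg1   2          1      s3://my-bucket/NA12878_R2.fastq.gz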
def cut_adapt(minimum_length=50,
              in_fastq1=find('fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='1')),
              in_fastq2=find('fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='2')),
              out_fastq1=out_dir('trimmed_r1.fastq.gz'),
              out_fastq2=out_dir('trimmed_r2.fastq.gz')):
    # out_fastq1 = '>( gzip > %s)' % out_fastq1
    # out_fastq2 = '>( gzip > %s)' % out_fastq2

    return r"""
    {s[opt][cutadapt]} \
    -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \
    -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \
    {args} \
    -o {out_fastq1} -p {out_fastq2} \
    {in_fastq1} {in_fastq2}
    """.format(s=s, args=args(('--minimum-length', minimum_length)), **locals())
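# --- Helper sketch (assumption) ---------------------------------------------
# args() is used by cut_adapt above and freebayes below to render several optional flags at once.
# This is a plausible minimal version inferred from the call sites, where each argument is a
# (flag, value) pair and a boolean True toggles a bare flag; the project's real helper may differ.

def args(*flag_value_pairs):
    parts = []
    for flag, value in flag_value_pairs:
        if value is True:
            parts.append(flag)                     # bare flag, e.g. --no-complex
        elif value not in (None, False):
            parts.append('%s %s' % (flag, value))  # valued flag, e.g. --minimum-length 50
    return ' '.join(parts)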
def gen_fastq_tasks(execution, input_path):
    for fastq_path, tags in parse_inputs(input_path):
        if fastq_path.startswith('s3://'):
            yield execution.add_task(download_from_s3,
                                     dict(in_file=fastq_path,
                                          out_file=out_dir('SM_{sample_name}/work/input/%s' % os.path.basename(fastq_path)),
                                          **tags),
                                     stage_name='Download_Fastqs_From_S3')
        else:
            yield execution.add_task(load_input, dict(in_file=fastq_path, **tags), stage_name='Load_Fastqs')
def collect_variant_calling_metrics(in_vcf=find('in_vcf'),
                                    in_dbsnp=s['ref']['dbsnp_vcf'],
                                    out_path=out_dir('picard.variant_metrics')):
    return r"""
    {picard} CollectVariantCallingMetrics \
    I={in_vcf} \
    DBSNP={in_dbsnp} \
    O={out_path}
    """.format(picard=picard(), **locals())
def mark_duplicates(core_req=4,  # for scratch space
                    mem_req=12 * 1024,
                    in_bams=find('bam$', n='>=1'),
                    in_bais=find('bai$', n='>=1'),
                    out_bam=out_dir('deduped.bam'),
                    out_bai=out_dir('deduped.bam.bai'),
                    out_metrics=out_dir('deduped.metrics')):
    return r"""
    {picard} MarkDuplicates \
    {inputs} \
    O={out_bam} \
    METRICS_FILE={out_metrics} \
    ASSUME_SORTED=True \
    MAX_RECORDS_IN_RAM=1000000 \
    VALIDATION_STRINGENCY=SILENT \
    VERBOSITY=INFO

    {s[opt][samtools]} index {out_bam}
    """.format(inputs=list_to_input(in_bams), s=s, picard=picard(), **locals())
def merge(in_bams=find('bam$', n='>0'), out_bam=out_dir('merged.bam')):
    if len(in_bams) == 1:
        # Can't merge 1 bam, just copy it
        return r"""
        cp {in_bams[0]} {out_bam}
        """.format(**locals())
    else:
        in_bams = ' '.join(map(str, in_bams))
        return r"""
        {s[opt][samtools]} merge -f {out_bam} {in_bams}
        """.format(s=s, **locals())
def collect_wgs_metrics(in_bam=find('bam'),
                        out_path=out_dir('picard.raw_wgs_metrics.txt'),
                        reference_fasta=s['ref']['reference_fasta']):
    return r"""
    {picard} CollectRawWgsMetrics \
    I={in_bam} \
    O={out_path} \
    R={reference_fasta} \
    INCLUDE_BQ_HISTOGRAM=true
    """.format(picard=picard(), **locals())
def combine_gvcfs(mem_req=12 * 1024,
                  in_vcfs=find('vcf|vcf.gz$', n='>0'),
                  out_vcf=out_dir('variants.g.vcf')):
    in_vcfs = vcf_list_to_input(in_vcfs)

    return r"""
    {gatk} \
    -T CombineGVCFs \
    -R {s[ref][reference_fasta]} \
    {in_vcfs} \
    -o {out_vcf}
    """.format(s=s, gatk=gatk(mem_req), **locals())
def genotype_gvcfs(core_req=8,
                   mem_req=12 * 1024,
                   in_vcfs=find('vcf|vcf.gz$', n='>0'),
                   out_vcf=out_dir('variants.vcf')):
    return r"""
    {gatk} \
    -T GenotypeGVCFs \
    -R {s[ref][reference_fasta]} \
    -D {s[ref][dbsnp_vcf]} \
    -nt {core_req} \
    {inputs} \
    -o {out_vcf}
    """.format(s=s, gatk=gatk(mem_req), inputs=vcf_list_to_input(in_vcfs), **locals())
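# --- Helper sketch (assumption) ---------------------------------------------
# vcf_list_to_input() is used by combine_gvcfs, genotype_gvcfs and select_variants but is defined
# elsewhere.  A plausible minimal version, inferred from the templates above (which contain no
# --variant prefix of their own); the real helper may differ.

def vcf_list_to_input(in_vcfs):
    # Render each gVCF as its own --variant argument, e.g. "--variant a.g.vcf --variant b.g.vcf".
    return ' '.join('--variant %s' % v for v in in_vcfs)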
def freebayes(reference_fasta=settings['ref']['reference_fasta'],
              max_complex_gap=2,
              no_complex=True,
              in_target_bed=find('bed$'),
              in_bam=find('bam$'),
              out_vcf=out_dir('variants.vcf')):
    return r"""
    {s[opt][freebayes]} -f {reference_fasta} \
    --vcf {out_vcf} \
    --targets {in_target_bed} \
    {args} \
    -m 30 -q 10 -R 0 -S 0 -F 0.1 \
    {in_bam}
    """.format(s=settings,
               args=args(('--max-complex-gap', max_complex_gap), ('--no-complex', no_complex)),
               **locals())
def collect_multiple_metrics(in_bam=find('bam'),
                             out_path=out_dir('picard'),
                             reference_fasta=s['ref']['reference_fasta']):
    return r"""
    {picard} CollectMultipleMetrics \
    I={in_bam} \
    O={out_path} \
    R={reference_fasta} \
    {programs}
    """.format(picard=picard(),
               programs=' '.join('PROGRAM=%s' % p for p in ['CollectAlignmentSummaryMetrics',
                                                            'CollectInsertSizeMetrics',
                                                            'QualityScoreDistribution',
                                                            'MeanQualityByCycle',
                                                            'CollectBaseDistributionByCycle',
                                                            'CollectGcBiasMetrics',
                                                            'CollectSequencingArtifactMetrics',
                                                            'CollectQualityYieldMetrics']),
               **locals())
def fastq_to_sam(rgid, sample_name, library, platform, platform_unit,
                 in_fastq1=find('.fastq', tags=dict(read_pair='1')),
                 in_fastq2=find('.fastq', tags=dict(read_pair='2')),
                 out_bam=out_dir('unaligned.bam')):
    return r"""
    {picard} FastqToSam \
    FASTQ={in_fastq1} \
    FASTQ2={in_fastq2} \
    O={out_bam} \
    SAMPLE_NAME={sample_name} \
    LIBRARY_NAME={library} \
    PLATFORM_UNIT={platform_unit} \
    PLATFORM={platform} \
    READ_GROUP_NAME={rgid}
    """.format(s=s, picard=picard(), **locals())
def select_variants(select_type,
                    in_vcfs=find('vcf|vcf.gz$', n='>0'),
                    out_vcf=out_dir('variants.vcf'),
                    in_reference_fasta=s['ref']['reference_fasta'],
                    mem_req=6 * 1024):
    """
    :param select_type: "SNP" or "INDEL"
    """
    return r"""
    {gatk} \
    -T SelectVariants \
    -R {in_reference_fasta} \
    {inputs} \
    -selectType {select_type} \
    -o {out_vcf}
    """.format(s=s, gatk=gatk(mem_req), inputs=vcf_list_to_input(in_vcfs), **locals())
def fastqc(core_req=8,
           in_r1s=find('fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='1')),
           in_r2s=find('fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='2')),
           out_dir=out_dir('fastqc/')):
    assert len(in_r1s) == len(in_r2s)

    # if len(in_r1s) > 1 or in_r1s[0].startswith('<('):
    #     # If there are more than 1 fastqs per read_pair, merge them into one file per read_pair
    #     # Note, catting compressed files together seems fine
    #     # Have to cat because fastqc does not support streaming
    #     # TODO make sure we are concating to local temp disc if available.  For the usual S3 option this is fine,
    #     # TODO since we're already in a tmp dir
    #     # TODO stream from s3 into a cat command when input files start with s3://
    #
    #     r1, r2 = 'cat_r1.fastq.gz', 'cat_r2.fastq.gz'
    #     cat = r"""
    #     cat {r1s_join} > {r1}
    #     cat {r2s_join} > {r2}
    #     """.format(s=s,
    #                r1s_join=' '.join(map(str, in_r1s)),
    #                r2s_join=' '.join(map(str, in_r2s)),
    #                **locals())
    #     cleanup = 'rm %s %s' % (r1, r2)
    # else:
    #     r1, r2 = in_r1s[0], in_r2s[0]
    #     cat = ""
    #     cleanup = ""

    cat = 'cat {fqs} | {zcat_or_cat}'.format(fqs=' '.join(in_r1s + in_r2s),
                                             zcat_or_cat='zcat' if '.gz' in in_r1s[0] else 'cat')

    return r"""
    mkdir -p {out_dir}

    {cat} | \
    {s[opt][fastqc]} \
    --threads {core_req} \
    --dir {s[gk][tmp_dir]} \
    -o {out_dir} \
    /dev/stdin
    """.format(s=s, **locals())
def fastqc(core_req=8,
           in_r1s=find('fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='1')),
           in_r2s=find('fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='2')),
           out_dir=out_dir('fastqc/')):
    assert len(in_r1s) == len(in_r2s)

    # if len(in_r1s) > 1 or in_r1s[0].startswith('<('):
    #     # If there are more than 1 fastqs per read_pair, merge them into one file per read_pair
    #     # Note, catting compressed files together seems fine
    #     # Have to cat because fastqc does not support streaming
    #     # TODO make sure we are concating to local temp disc if available.  For the usual S3 option this is fine,
    #     # TODO since we're already in a tmp dir
    #     # TODO stream from s3 into a cat command when input files start with s3://
    #
    #     r1, r2 = 'cat_r1.fastq.gz', 'cat_r2.fastq.gz'
    #     cat = r"""
    #     cat {r1s_join} > {r1}
    #     cat {r2s_join} > {r2}
    #     """.format(s=s,
    #                r1s_join=' '.join(map(str, in_r1s)),
    #                r2s_join=' '.join(map(str, in_r2s)),
    #                **locals())
    #     cleanup = 'rm %s %s' % (r1, r2)
    # else:
    #     r1, r2 = in_r1s[0], in_r2s[0]
    #     cat = ""
    #     cleanup = ""

    return r"""
    mkdir -p {out_dir}

    {s[opt][fastqc]} \
    --threads {core_req} \
    --dir {s[gk][tmp_dir]} \
    -o {out_dir} \
    {fqs}
    """.format(s=s, fqs=' '.join(in_r1s + in_r2s), **locals())
def view(f, in_bam=find('bam$'), out_bam=out_dir('reads.bam')):
    return '{s[opt][samtools]} view -f {f} -h {in_bam} > {out_bam}'.format(s=s, **locals())
def cat(in_txts=find('txt$', n='>=1'), out_txt=out_dir('cat.txt')):
    return 'cat {input_str} > {out_txt}'.format(input_str=' '.join(map(str, in_txts)), **locals())
def echo(word, out_txt=out_dir('echo.txt')):
    return '{s[echo_path]} {word} > {out_txt}'.format(s=s, **locals())
def md5sum(in_file=find('.*', n=1), out_md5=out_dir('checksum.md5')):
    out_md5.basename = in_file.basename + '.md5'
    return 'md5sum {in_file}'.format(**locals())
def download_from_s3(in_file, out_file=out_dir('{in_file}')):
    assert in_file.startswith('s3://')
    return s3cmd.cp(in_file, out_file)
def paste(in_txts=find('txt$', n='>=1'), out_txt=out_dir('paste.txt')):
    return 'paste {input} > {out_txt}'.format(input=' '.join(map(str, in_txts)), **locals())
def vcf_concat_parts(in_vcfs=find('vcf$', n='>0'), out_vcf=out_dir('freebayes.vcf')):
    return r"""
    {s[opt][vcf_concat_parts]} {vcfs} > {out_vcf}
    """.format(s=settings, vcfs=' '.join(in_vcfs), **locals())
def align(execution, fastq_tasks, target_bed_tasks):
    """
    Reads -> Alignments

    :param Execution execution: The Execution instance to create Tasks in
    :param list[Task] | [(str, dict)] fastq_tasks: Fastq input (file_path, dict) tuples or Tasks
    :param list[Task] target_bed_tasks: target beds to parallelize/split on
    :return: Indel Realigned Tasks
    """
    # Do we need to split fastqs into smaller pieces?
    aligns = []
    for tags, fastq_task_group in group(fastq_tasks, by=['sample_name', 'library', 'platform',
                                                         'platform_unit', 'rgid', 'chunk']):
        # trim_task = execution.add_task(fastq.trim_galore,
        #                                tags=dict(**tags),
        #                                parents=fastq_task_group,
        #                                out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')

        align_task = execution.add_task(bwa.bwa_mem,
                                        tags=dict(**tags),
                                        parents=fastq_task_group,
                                        out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')
        aligns.append(align_task)

    dedupe = many2one(picard.mark_duplicates, aligns, groupby=['sample_name', 'library'],
                      out_dir='SM_{sample_name}/work/LB_{library}')

    # Note, could get slightly improved results by indel realigning over multiple samples,
    # especially if coverage is low
    # for tags, parents in group(dedupe, ['sample_name']):
    #     for target_bed_task in target_bed_tasks:
    #         d = dict(contig=target_bed_task.tags['contig'],
    #                  in_target_bed=target_bed_task.output_files[0],
    #                  **tags)

    rtc_tasks = [execution.add_task(gatk.realigner_target_creator,
                                    dict(contig=target_bed_task.tags['contig'],
                                         in_target_bed=target_bed_task.output_files[0],
                                         **tags),
                                    parents + [target_bed_task],
                                    out_dir='SM_{sample_name}/work/contigs/{contig}')
                 for tags, parents in group(dedupe, ['sample_name'])  # Many2one
                 for target_bed_task in target_bed_tasks]  # One2many

    realigned_by_sample_contig_tasks = one2one(gatk.indel_realigner, rtc_tasks)

    realigned_by_sample_contig_tasks += [execution.add_task(samtools.view,
                                                            dict(out_bam=out_dir('both_pairs_unmapped.bam'),
                                                                 f='12',
                                                                 sample_name=tags['sample_name'],
                                                                 contig='BOTH_PAIRS_UNMAPPED',
                                                                 library=lb_task.tags['library']),
                                                            parents=lb_task,
                                                            out_dir='SM_{sample_name}/work/LB_{library}',
                                                            stage_name='Filter_Both_Pairs_Unmapped')
                                         for tags, sm_tasks in group(dedupe, ['sample_name'])
                                         for lb_task in sm_tasks]

    # Skipping BQSR.  Will improve results only slightly, if at all.

    # Merge bams so we have a sample bam.  Returning realigned tasks as well, so bams remain split
    # by contig for downstream parallelization
    merged = many2one(picard.merge_sam_files, realigned_by_sample_contig_tasks, ['sample_name'],
                      out_dir='SM_{sample_name}', stage_name="Merge_Sample_Bams")

    one2one(picard.collect_multiple_metrics, merged, out_dir='SM_{sample_name}/metrics')
    one2one(picard.collect_wgs_metrics, merged, out_dir='SM_{sample_name}/metrics')

    return merged