def run_ex2(execution):
    """
    Build and run a small demo DAG: echo -> cat (one2many) -> word_count (one2one) -> summary cat (many2one).

    :param Execution execution: the Execution instance to add Tasks to and run.
    """
    # Create two jobs that echo "hello" and "world" respectively (source nodes in the graph).
    echos = [execution.add_task(echo, tags=dict(word=word), out_dir='{word}')
             for word in ['hello', 'world']]

    # Split each echo into two jobs (a one2many relationship).
    cats = [execution.add_task(cat, tags=dict(n=n, **echo_task.tags),
                               parents=[echo_task], out_dir='{word}/{n}')
            for echo_task in echos
            for n in [1, 2]]

    # Count the words in the previous stage.  An example of a one2one relationship,
    # the most common stage dependency pattern.  For each task in StageA, you create a
    # single dependent task in StageB.
    # BUG FIX: one2one takes the execution as its first argument (the sibling definition
    # of run_ex2 in this file passes it; omitting it here dropped a required argument).
    word_counts = one2one(execution, cmd_fxn=word_count, parents=cats, tag=dict(chars=True))

    # Cat the contents of all word_counts into one file.  Note only one node is being
    # created whose parents are all of the WordCounts (a many2one relationship).
    summarize = execution.add_task(cat, dict(), word_counts, '', 'Summary_Analysis')

    if pygraphviz_available:
        # These images can also be seen on the fly in the web-interface.
        # BUG FIX: the stage graph was being written to *_task_graph.png and the
        # task graph to *_stage_graph.png; filenames now match their contents.
        draw_stage_graph(execution.stage_graph(), '/tmp/ex2_stage_graph.png', format='png')
        draw_task_graph(execution.task_graph(), '/tmp/ex2_task_graph.png', format='png')
    else:
        # Parenthesized print: identical output under Python 2, valid under Python 3.
        print('Pygraphviz is not available :(')

    execution.run(max_attempts=1, max_cores=10)
def run_ex2(execution):
    """
    Build and run a small demo DAG: echo -> cat (one2many) -> word_count (one2one) -> summary cat (many2one).

    :param Execution execution: the Execution instance to add Tasks to and run.
    """
    # Create two jobs that echo "hello" and "world" respectively (source nodes in the graph).
    echos = [execution.add_task(echo, tags=dict(word=word), out_dir='{word}')
             for word in ['hello', 'world']]

    # Split each echo into two jobs (a one2many relationship).
    cats = [execution.add_task(cat, tags=dict(n=n, **echo_task.tags),
                               parents=[echo_task], out_dir='{word}/{n}')
            for echo_task in echos
            for n in [1, 2]]

    # Count the words in the previous stage.  An example of a one2one relationship,
    # the most common stage dependency pattern.  For each task in StageA, you create a
    # single dependent task in StageB.
    word_counts = one2one(execution, cmd_fxn=word_count, parents=cats, tag=dict(chars=True))

    # Cat the contents of all word_counts into one file.  Note only one node is being
    # created whose parents are all of the WordCounts (a many2one relationship).
    summarize = execution.add_task(cat, dict(), word_counts, '', 'Summary_Analysis')

    if pygraphviz_available:
        # These images can also be seen on the fly in the web-interface.
        # BUG FIX: filenames said "ex1" inside run_ex2 (copy/paste slip), one path had a
        # doubled slash, and the stage/task graph filenames were swapped.
        draw_stage_graph(execution.stage_graph(), '/tmp/ex2_stage_graph.png', format='png')
        draw_task_graph(execution.task_graph(), '/tmp/ex2_task_graph.png', format='png')
    else:
        # Parenthesized print: identical output under Python 2, valid under Python 3.
        print('Pygraphviz is not available :(')

    execution.run()
def variant_call(execution, bam_path, target_bed_path, max_complex_gap):
    """
    Bioinformatics variant calling workflow: split the target bed by contig,
    call variants per contig with freebayes, then concatenate the VCF parts.
    """
    # One contig name per line: column 1 of the bed, adjacent duplicates collapsed.
    contig_names = sp.check_output("cat %s |cut -f1|uniq" % target_bed_path,
                                   shell=True).strip().split("\n")

    # One filter task per contig (parallelize over contigs).
    per_contig_bed_tasks = []
    for contig_name in contig_names:
        filter_task = execution.add_task(
            tools.filter_bed_by_contig,
            tags=dict(in_bam=bam_path, in_bed=target_bed_path, contig=contig_name),
            out_dir='work/{contig}')
        per_contig_bed_tasks.append(filter_task)

    # One freebayes call per per-contig bed (one2one) ...
    caller_tasks = one2one(tools.freebayes, per_contig_bed_tasks,
                           dict(max_complex_gap=max_complex_gap))
    # ... then merge all per-contig VCFs back together (many2one).
    merge_vcf_tasks = many2one(tools.vcf_concat_parts, caller_tasks)

    execution.run()
def variant_call(execution, bam_path, target_bed_path, max_complex_gap):
    """
    Bioinformatics variant calling workflow: split the target bed by contig,
    call variants per contig with freebayes, then concatenate the VCF parts.

    :param Execution execution: the Execution instance to add Tasks to and run.
    :param str bam_path: path to the input BAM.
    :param str target_bed_path: path to the target BED file (column 1 = contig).
    :param max_complex_gap: forwarded to tools.freebayes.
    """
    # IMPROVED: parse the contigs directly in Python instead of shelling out to
    # `cat | cut | uniq` with shell=True -- removes a shell-injection risk on
    # target_bed_path and a useless use of cat, and handles an empty bed file
    # (the old pipeline yielded [''] and would have created a bogus task).
    contigs = _contigs_from_bed(target_bed_path)

    bed_tasks = [execution.add_task(tools.filter_bed_by_contig,
                                    tags=dict(in_bam=bam_path, in_bed=target_bed_path,
                                              contig=contig),
                                    out_dir='work/{contig}')
                 for contig in contigs]

    # One freebayes task per per-contig bed (one2one) ...
    freebayes_tasks = one2one(tools.freebayes, bed_tasks,
                              dict(max_complex_gap=max_complex_gap))
    # ... then merge all per-contig VCFs back together (many2one).
    merge_vcf_tasks = many2one(tools.vcf_concat_parts, freebayes_tasks)

    execution.run()


def _contigs_from_bed(bed_path):
    """Return column-1 contig names from *bed_path*, collapsing adjacent
    duplicates (same semantics as `cut -f1 | uniq`).  Returns [] for an
    empty file."""
    contigs = []
    with open(bed_path) as fh:
        for line in fh:
            contig = line.rstrip('\n').split('\t', 1)[0]
            if not contigs or contigs[-1] != contig:
                contigs.append(contig)
    return contigs
def align(execution, fastq_tasks, target_bed_tasks):
    """
    Reads -> Alignments

    :param Execution execution: The Execution instance to create Tasks in
    :param list[Task] | [(str, dict)] fastq_tasks: Fastq input (file_path, dict) tuples or Tasks
    :param list[Task] target_bed_tasks: target beds to parallelize/split on
    :return: Indel Realigned Tasks
    """
    # Do we need to split fastqs into smaller pieces?

    # One bwa-mem alignment per read group chunk.
    aligns = []
    for tags, fastq_task_group in group(fastq_tasks,
                                        by=['sample_name', 'library', 'platform',
                                            'platform_unit', 'rgid', 'chunk']):
        align_task = execution.add_task(bwa.bwa_mem,
                                        tags=dict(**tags),
                                        parents=fastq_task_group,
                                        out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')
        aligns.append(align_task)

    # Mark duplicates per sample+library (many2one over the per-chunk alignments).
    dedupe = many2one(picard.mark_duplicates, aligns,
                      groupby=['sample_name', 'library'],
                      out_dir='SM_{sample_name}/work/LB_{library}')

    # Note, could get slightly improved results by indel realigning over multiple samples,
    # especially if low coverage.
    rtc_tasks = [execution.add_task(gatk.realigner_target_creator,
                                    dict(contig=target_bed_task.tags['contig'],
                                         in_target_bed=target_bed_task.output_files[0],
                                         **tags),
                                    parents + [target_bed_task],
                                    out_dir='SM_{sample_name}/work/contigs/{contig}')
                 for tags, parents in group(dedupe, ['sample_name'])  # Many2one
                 for target_bed_task in target_bed_tasks]  # One2many

    realigned_by_sample_contig_tasks = one2one(gatk.indel_realigner, rtc_tasks)

    # Keep the both-pairs-unmapped reads (samtools view -f 12) alongside the realigned bams.
    # BUG FIX: the original wrote out_dir('both_pairs_unmapped.bam' % lb_task), which raises
    # TypeError at runtime ("%" applied to a format string with no conversion specifier).
    realigned_by_sample_contig_tasks += [
        execution.add_task(samtools.view,
                           dict(out_bam=out_dir('both_pairs_unmapped.bam'),
                                f='12',
                                sample_name=tags['sample_name'],
                                contig='BOTH_PAIRS_UNMAPPED',
                                library=lb_task.tags['library']),
                           parents=lb_task,
                           out_dir='SM_{sample_name}/work/LB_{library}',
                           stage_name='Filter_Both_Pairs_Unmapped')
        for tags, sm_tasks in group(dedupe, ['sample_name'])
        for lb_task in sm_tasks]

    # Skipping BQSR.  Will improve results only slightly, if at all.

    # Merge bams so we have a sample bam.  Returning realign, so bams remained split by
    # contig for downstream parallelization.
    merged = many2one(picard.merge_sam_files, realigned_by_sample_contig_tasks,
                      ['sample_name'],
                      out_dir='SM_{sample_name}', stage_name="Merge_Sample_Bams")

    one2one(picard.collect_multiple_metrics, merged, out_dir='SM_{sample_name}/metrics')

    return merged
def align(execution, fastq_tasks, target_bed_tasks):
    """
    Reads -> Alignments

    :param Execution execution: The Execution instance to create Tasks in
    :param list[Task] | [(str, dict)] fastq_tasks: Fastq input (file_path, dict) tuples or Tasks
    :param list[Task] target_bed_tasks: target beds to parallelize/split on
    :return: Indel Realigned Tasks
    """
    # Do we need to split fastqs into smaller pieces?

    # One bwa-mem alignment per read group chunk.
    aligns = []
    for tags, fastq_task_group in group(fastq_tasks,
                                        by=['sample_name', 'library', 'platform',
                                            'platform_unit', 'rgid', 'chunk']):
        align_task = execution.add_task(bwa.bwa_mem,
                                        tags=dict(**tags),
                                        parents=fastq_task_group,
                                        out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')
        aligns.append(align_task)

    # Mark duplicates per sample+library (many2one over the per-chunk alignments).
    dedupe = many2one(picard.mark_duplicates, aligns,
                      groupby=['sample_name', 'library'],
                      out_dir='SM_{sample_name}/work/LB_{library}')

    # Note, could get slightly improved results by indel realigning over multiple samples,
    # especially if low coverage.
    rtc_tasks = [execution.add_task(gatk.realigner_target_creator,
                                    dict(contig=target_bed_task.tags['contig'],
                                         in_target_bed=target_bed_task.output_files[0],
                                         **tags),
                                    parents + [target_bed_task],
                                    out_dir='SM_{sample_name}/work/contigs/{contig}')
                 for tags, parents in group(dedupe, ['sample_name'])  # Many2one
                 for target_bed_task in target_bed_tasks]  # One2many

    realigned_by_sample_contig_tasks = one2one(gatk.indel_realigner, rtc_tasks)

    # Keep the both-pairs-unmapped reads (samtools view -f 12) alongside the realigned bams.
    # BUG FIX: the original wrote out_dir('both_pairs_unmapped.bam' % lb_task), which raises
    # TypeError at runtime ("%" applied to a format string with no conversion specifier).
    realigned_by_sample_contig_tasks += [
        execution.add_task(samtools.view,
                           dict(out_bam=out_dir('both_pairs_unmapped.bam'),
                                f='12',
                                sample_name=tags['sample_name'],
                                contig='BOTH_PAIRS_UNMAPPED',
                                library=lb_task.tags['library']),
                           parents=lb_task,
                           out_dir='SM_{sample_name}/work/LB_{library}',
                           stage_name='Filter_Both_Pairs_Unmapped')
        for tags, sm_tasks in group(dedupe, ['sample_name'])
        for lb_task in sm_tasks]

    # Skipping BQSR.  Will improve results only slightly, if at all.

    # Merge bams so we have a sample bam.  Returning realign, so bams remained split by
    # contig for downstream parallelization.
    merged = many2one(picard.merge_sam_files, realigned_by_sample_contig_tasks,
                      ['sample_name'],
                      out_dir='SM_{sample_name}', stage_name="Merge_Sample_Bams")

    one2one(picard.collect_multiple_metrics, merged, out_dir='SM_{sample_name}/metrics')
    one2one(picard.collect_wgs_metrics, merged, out_dir='SM_{sample_name}/metrics')

    return merged