def test_newstyle_collate(self):
    """
    As above but create pipeline on the fly using object orientated syntax rather than decorators
    """

    #
    # Create pipeline on the fly, joining up tasks
    #
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=generate_initial_files,
                            output=original_files)\
        .mkdir(tempdir, tempdir + "/test")

    test_pipeline.subdivide(task_func=split_fasta_file,
                            input=generate_initial_files,
                            # match original files
                            filter=regex(r".*\/original_(\d+).fa"),
                            output=[tempdir + r"/files.split.\1.success",  # flag file for each original file
                                    tempdir + r"/files.split.\1.*.fa"],    # glob pattern
                            extras=[r"\1"])\
        .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))

    test_pipeline.transform(task_func=align_sequences,
                            input=split_fasta_file,
                            filter=suffix(".fa"),
                            output=".aln")\
        .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

    test_pipeline.transform(task_func=percentage_identity,
                            input=align_sequences,    # find all results from align_sequences
                            filter=suffix(".aln"),    # replace suffix with:
                            output=[r".pcid",         # .pcid suffix for the result
                                    r".pcid_success"  # .pcid_success to indicate job completed
                                    ])\
        .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))

    test_pipeline.collate(task_func=combine_results,
                          input=percentage_identity,
                          filter=regex(r".*files.split\.(\d+)\.\d+.pcid"),
                          output=[tempdir + r"/\1.all.combine_results",
                                  tempdir + r"/\1.all.combine_results_success"])\
        .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

    #
    # Cleanup, printout and run
    #
    self.cleanup_tmpdir()
    s = StringIO()
    test_pipeline.printout(s, [combine_results], verbose=5, wrap_width=10000)
    self.assertTrue(re.search('Job needs update:.*Missing files.*',
                              s.getvalue(), re.DOTALL) is not None)
    test_pipeline.run(verbose=0)
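# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test): how collate() groups
# many inputs into fewer outputs. Every input file whose name yields the same
# regex substitution is handed to a single job. The file names and task bodies
# below are hypothetical.
# ---------------------------------------------------------------------------
from ruffus import Pipeline, regex


def make_parts(output_file):
    open(output_file, "w").close()


def combine_parts(input_files, output_file):
    with open(output_file, "w") as out:
        out.write("\n".join(input_files))


demo_pipeline = Pipeline("collate_demo")
demo_pipeline.originate(task_func=make_parts,
                        output=["demo.1.a.pcid", "demo.1.b.pcid",
                                "demo.2.a.pcid", "demo.2.b.pcid"])
# r"\1" is the first captured group: demo.1.* collapse into demo.1.combined
# and demo.2.* into demo.2.combined -- two jobs from four inputs.
demo_pipeline.collate(task_func=combine_parts,
                      input=make_parts,
                      filter=regex(r"demo\.(\d+)\..+\.pcid"),
                      output=r"demo.\1.combined")
# demo_pipeline.run(verbose=0)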
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.split(task_func=prepare_files,
                        input=None,
                        output=tempdir + '*.animal')\
        .follows(mkdir(tempdir, tempdir + "test"))\
        .posttask(lambda: do_write(tempdir + "task.done", "Task 1 Done\n"))

    test_pipeline.collate(task_func=summarise_by_grouping,
                          input=prepare_files,
                          filter=regex(r'(.*/).*\.(.*)\.animal'),
                          output=r'\1\2.results')\
        .posttask(lambda: do_write(tempdir + "task.done", "Task 2 Done\n"))

    test_pipeline.run(multiprocess=10, verbose=0)
    check_species_correct()
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.follows(setup_simulation_data,
                          mkdir(gene_data_dir, simulation_data_dir))

    test_pipeline.files(gwas_simulation, generate_simulation_params)\
        .follows(setup_simulation_data)\
        .follows(mkdir(working_dir,
                       os.path.join(working_dir, "simulation_results")))

    test_pipeline.collate(statistical_summary,
                          gwas_simulation,
                          regex(r"simulation_results/(\d+).\d+.simulation_res"),
                          r"\1.mean")\
        .posttask(lambda: sys.stdout.write("\nOK\n"))

    test_pipeline.run(multiprocess=50, verbose=0)
    for oo in "000.mean", "001.mean":
        results_file_name = os.path.join(working_dir, oo)
        if not os.path.exists(results_file_name):
            raise Exception("Missing %s" % results_file_name)
def create_pipeline(self):
    # each pipeline has a different name
    global cnt_pipelines
    cnt_pipelines = cnt_pipelines + 1
    test_pipeline = Pipeline("test %d" % cnt_pipelines)

    test_pipeline.originate(
        task_func=generate_initial_files1,
        output=[tempdir + prefix + "_name.tmp1" for prefix in "abcd"])

    test_pipeline.originate(
        task_func=generate_initial_files2,
        output=[tempdir + "e_name.tmp1", tempdir + "f_name.tmp1"])

    test_pipeline.originate(
        task_func=generate_initial_files3,
        output=[tempdir + "g_name.tmp1", tempdir + "h_name.tmp1"])

    test_pipeline.originate(task_func=generate_initial_files4,
                            output=tempdir + "i_name.tmp1")

    test_pipeline.collate(task_func=test_task2,
                          input=[generate_initial_files1,
                                 generate_initial_files2,
                                 generate_initial_files3,
                                 generate_initial_files4],
                          filter=formatter(),
                          output="{path[0]}/all.tmp2")

    test_pipeline.transform(task_func=test_task3,
                            input=test_task2,
                            filter=suffix(".tmp2"),
                            output=".tmp3")

    test_pipeline.transform(task_func=test_task4,
                            input=test_task3,
                            filter=suffix(".tmp3"),
                            output=".tmp4")
    return test_pipeline
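# ---------------------------------------------------------------------------
# Usage sketch (an assumption, not taken from the original file): each call to
# create_pipeline() returns an independent, differently-named Pipeline object,
# so several pipelines can be built, inspected and run side by side with the
# standard Ruffus entry points.
# ---------------------------------------------------------------------------
#     pipeline_a = self.create_pipeline()
#     pipeline_b = self.create_pipeline()          # gets the next "test %d" name
#     pipeline_a.run(multiprocess=4, verbose=0)    # runs only pipeline_a's tasks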
"test_active_if/a.1" -> "test_active_if/a.2" "test_active_if/a.2" -> "test_active_if/a.4" null -> "test_active_if/b.1" "test_active_if/b.1" -> "test_active_if/b.2" "test_active_if/b.2" -> "test_active_if/b.4" "test_active_if/b.4" -> "test_active_if/summary.5" """ # alternative syntax test_pipeline = Pipeline("test") test_pipeline.originate(task1, ['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter")\ .follows(mkdir("test_active_if")) test_pipeline.transform(task2, task1, suffix(".1"), ".2") test_pipeline.transform(task3, task1, suffix(".1"), ".3").active_if(lambda: pipeline_active_if) test_pipeline.collate(task4, [task2, task3], regex(r"(.+)\.[23]"), r"\1.4") test_pipeline.merge(task5, task4, "test_active_if/summary.5") class Test_ruffus(unittest.TestCase): def setUp(self): try: shutil.rmtree(tempdir) except: pass os.makedirs(tempdir) def tearDown(self): try: shutil.rmtree(tempdir) pass
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name="rnapipe")
    # Get the details of the experiment (samples, config, inputs, ...)
    experiment = Experiment(state)
    # Get reference file locations
    reference_genome = state.config.get_options("reference_genome")
    gene_ref = state.config.get_options("gene_ref")

    # Print out samples
    sample_text = [s.info() for s in experiment.sample_list]
    logging.info("Analysis samples:\n{}".format("\n".join(sample_text)))

    # Stages are dependent on the state. Experiment object is also passed so
    # we can access metadata later.
    stages = PipelineStages(state, experiment=experiment)

    # Make directories
    output_dir = get_output_paths(
        results_dir=state.config.get_options("results_dir"),
        default_paths=OUTPUT_PATHS)
    make_output_dirs(output_dir)
    logging.debug(output_dir)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.do_nothing,
                       name="original_fastqs",
                       output=experiment.R1_files)

    # Create reference index for alignment
    if not experiment.index_provided:
        pipeline.originate(task_func=stages.do_nothing,
                           name="reference_genome",
                           output=reference_genome)

        if experiment.alignment_method == "star":
            # Create reference index for STAR
            pipeline.transform(
                task_func=stages.create_star_index,
                name="create_star_index",
                input=output_from("reference_genome"),
                filter=formatter(".*"),
                add_inputs=add_inputs(gene_ref),
                output=path_list_join(
                    output_dir["star_index"],
                    ["SA", "Genome", "genomeParameters.txt"]),
                extras=[output_dir["star_index"]])

        elif experiment.alignment_method == "hisat2":
            # Create reference index for HISAT2
            hisat_basename = path.join(output_dir["hisat_index"], "genome")
            pipeline.transform(
                task_func=stages.create_hisat_index,
                name="create_hisat_index",
                input=output_from("reference_genome"),
                filter=formatter(".*"),
                add_inputs=add_inputs(gene_ref),
                output=path_list_join(output_dir["hisat_index"],
                                      ["genome.1.ht2", "genome.2.ht2"]),
                extras=[hisat_basename])
    else:
        # Don't create index if index is supplied
        if experiment.alignment_method == "star":
            output_dir["star_index"] = state.config.get_options("star_index")
            pipeline.originate(task_func=stages.do_nothing,
                               name="create_star_index",
                               output=path_list_join(
                                   output_dir["star_index"],
                                   ["SA", "Genome", "genomeParameters.txt"]))
        elif experiment.alignment_method == "hisat2":
            hisat_basename = state.config.get_options("hisat_index")
            output_dir["hisat_index"] = path.dirname(hisat_basename)
            prefix = path.basename(hisat_basename)
            pipeline.originate(
                task_func=stages.do_nothing,
                name="create_hisat_index",
                output=path_list_join(
                    output_dir["hisat_index"],
                    ["{prefix}.1.ht2".format(prefix=prefix),
                     "{prefix}.2.ht2".format(prefix=prefix)]))

    # Pre-trim FastQC
    if experiment.paired_end:
        pipeline.transform(
            task_func=stages.fastqc,
            name="fastqc",
            input=output_from("original_fastqs"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
            output=path_list_join(
                output_dir["fastqc"],
                ["{sample[0]}_R1_fastqc.zip", "{sample[0]}_R2_fastqc.zip"]),
            extras=[output_dir["fastqc"]])
    else:
        pipeline.transform(task_func=stages.fastqc,
                           name="fastqc",
                           input=output_from("original_fastqs"),
                           filter=suffix(".fastq.gz"),
                           output="_fastqc.zip",
                           output_dir=output_dir["fastqc"],
                           extras=[output_dir["fastqc"]])

    # Trimmomatic
    if experiment.trim_reads and experiment.paired_end:
        pipeline.transform(
            task_func=stages.trim_reads,
            name="trim_reads",
            input=output_from("original_fastqs"),
            # Get R1 file and the corresponding R2 file
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
            output=path_list_join(output_dir["seq"],
                                  ["{sample[0]}_R1.trimmed.fastq.gz",
                                   "{sample[0]}_R2.trimmed.fastq.gz"]),
            extras=path_list_join(output_dir["seq"],
                                  ["{sample[0]}_R1.unpaired.fastq.gz",
                                   "{sample[0]}_R2.unpaired.fastq.gz"]))
    elif experiment.trim_reads:
        pipeline.transform(
            task_func=stages.trim_reads,
            name="trim_reads",
            input=output_from("original_fastqs"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            output=path.join(output_dir["seq"],
                             "{sample[0]}_R1.trimmed.fastq.gz"))

    # Post-trim FastQC
    if experiment.paired_end and experiment.trim_reads:
        pipeline.transform(
            task_func=stages.fastqc,
            name="post_trim_fastqc",
            input=output_from("trim_reads"),
            filter=formatter(
                ".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.trimmed.fastq.gz"),
            output=path_list_join(output_dir["post_trim_fastqc"],
                                  ["{sample[0]}_R1.trimmed_fastqc.gz",
                                   "{sample[0]}_R2.trimmed_fastqc.gz"]),
            extras=["results/qc/post_trim_fastqc/"])
    elif experiment.trim_reads:
        pipeline.transform(task_func=stages.fastqc,
                           name="post_trim_fastqc",
                           input=output_from("trim_reads"),
                           filter=suffix(".trimmed.fastq.gz"),
                           output=".trimmed_fastqc.gz",
                           output_dir=output_dir["post_trim_fastqc"],
                           extras=[output_dir["post_trim_fastqc"]])

    # If there are technical replicates, each is mapped independently.
    # This is so each technical replicate maintains a separate read group.
    if experiment.alignment_method == "star":
        align_task_name = "star_align"
        if experiment.trim_reads:
            (pipeline.transform(
                task_func=stages.star_align,
                name=align_task_name,
                input=output_from("trim_reads"),
                filter=formatter(
                    ".+/(?P<sample>[a-zA-Z0-9-_]+)_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.star.Aligned.out.bam"
                       % output_dir["alignments"],
                extras=[output_dir["star_index"], "{sample[0]}"])
             ).follows("create_star_index")
        else:
            (pipeline.transform(
                task_func=stages.star_align,
                name=align_task_name,
                input=output_from("original_fastqs"),
                filter=formatter(
                    ".+/(?P<sample>[a-zA-Z0-9-_]+)_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.star.Aligned.out.bam"
                       % output_dir["alignments"],
                extras=[output_dir["star_index"], "{sample[0]}"])
             ).follows("create_star_index")

    if experiment.alignment_method == "hisat2":
        align_task_name = "hisat_align"
        if experiment.trim_reads:
            (pipeline.transform(
                task_func=stages.hisat_align,
                name="hisat_align",
                input=output_from("trim_reads"),
                filter=formatter(
                    ".+/(?P<sample>[a-zA-Z0-9-_]+)_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.hisat2.bam"
                       % output_dir["alignments"],
                extras=[hisat_basename, "{sample[0]}"])
             ).follows("create_hisat_index")
        else:
            (pipeline.transform(
                task_func=stages.hisat_align,
                name="hisat_align",
                input=output_from("original_fastqs"),
                filter=formatter(
                    ".+/(?P<sample>[a-zA-Z0-9-_]+)_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.hisat2.bam"
                       % output_dir["alignments"],
                extras=[hisat_basename, "{sample[0]}"])
             ).follows("create_hisat_index")

    # Sort BAM by coordinates
    pipeline.transform(
        task_func=stages.sort_bam_by_coordinate,
        name="sort_bam_by_coordinate",
        input=output_from(align_task_name),
        filter=formatter(
            ".+/(?P<sample>[a-zA-Z0-9-_]+)\.(?P<method>(star|hisat2))\..*bam"),
        output=["{path[0]}/{sample[0]}.{method[0]}.sorted.bam",
                "{path[0]}/{sample[0]}.{method[0]}.sorted.bam.bai"])

    # Merge files with the same sample name
    if experiment.multiple_technical_replicates:
        pipeline.collate(
            task_func=stages.merge_bams,
            name="merge_bams",
            input=output_from("sort_bam_by_coordinate"),
            filter=formatter(
                ".+/(SM_)?(?P<sm>[a-zA-Z0-9-]+)[^.]*\.(?P<method>(star|hisat2)).sorted.bam"),
            output=path_list_join(
                output_dir["alignments"],
                ["{sm[0]}.{method[0]}.bam", "{sm[0]}.{method[0]}.bam.bai"]))
    else:
        pipeline.transform(
            task_func=stages.create_symlinks,
            name="merge_bams",
            input=output_from("sort_bam_by_coordinate"),
            filter=formatter(
                ".+/(SM_)?(?P<sm>[a-zA-Z0-9-]+)[^.]*\.(?P<method>(star|hisat2)).sorted.bam"),
            output=path_list_join(
                output_dir["alignments"],
                ["{sm[0]}.{method[0]}.bam", "{sm[0]}.{method[0]}.bam.bai"]))

    # Sort BAM by name for counting features
    pipeline.transform(task_func=stages.sort_bam_by_name,
                       name="sort_bam_by_name",
                       input=output_from("merge_bams"),
                       filter=suffix(".bam"),
                       output=".nameSorted.bam")

    # Count features with HTSeq-count
    pipeline.transform(task_func=stages.htseq_count,
                       name="htseq_count",
                       input=output_from("sort_bam_by_name"),
                       filter=suffix(".nameSorted.bam"),
                       output_dir=output_dir["counts"],
                       output=".htseq.txt")

    # Count features with featureCounts
    pipeline.transform(task_func=stages.featurecounts,
                       name="featurecounts",
                       input=output_from("sort_bam_by_name"),
                       filter=suffix(".nameSorted.bam"),
                       output_dir=output_dir["counts"],
                       output=".featureCounts.txt")

    # TODO: add multiqc step

    # # Stringtie assembly
    # pipeline.transform(
    #     task_func=stages.stringtie_assembly,
    #     name="stringtie_assembly",
    #     input=output_from("merge_bams"),
    #     filter=suffix(".bam"),
    #     output_dir=output_dir["stringtie_assembly"],
    #     output=".gtf")

    # Stringtie estimates
    pipeline.transform(
        task_func=stages.stringtie_estimates,
        name="stringtie_estimates",
        input=output_from("merge_bams"),
        filter=formatter(
            ".+/(?P<sm>[a-zA-Z0-9-]+)\.(?P<method>(star|hisat2)).bam"),
        output=path_list_join(output_dir["stringtie_estimates"],
                              ["{sm[0]}/{sm[0]}.gtf", "{sm[0]}/e_data.ctab"]))

    # Stringtie counts
    pipeline.collate(
        task_func=stages.stringtie_prepDE,
        name="stringtie_prepDE",
        input=output_from("stringtie_estimates"),
        filter=formatter(".+\.gtf"),
        output=path_list_join(
            output_dir["stringtie_estimates"],
            ["gene_count_matrix.csv", "transcript_count_matrix.csv"]))
    return pipeline
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='cellfree_seq')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for
        # finding out sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.sort.hq.bam')

    pipeline.transform(task_func=stages.run_connor,
                       name='run_connor',
                       input=output_from('align_bwa'),
                       filter=suffix('.sort.hq.bam'),
                       output='.sort.hq.connor.bam')

    safe_make_dir('metrics')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/connor')

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_raw',
        input=output_from('coverage_bed_raw', 'genome_reads_raw',
                          'target_reads_raw', 'total_reads_raw'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'summary.txt'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_connor',
        input=output_from('coverage_bed_connor', 'genome_reads_connor',
                          'target_reads_connor', 'total_reads_connor'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/connor/all_sample.summary.\1.txt',
        extras=[r'\1', 'connor.summary.txt'])

    safe_make_dir('variants')
    safe_make_dir('variants/vardict')

    pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    (pipeline.merge(
        task_func=stages.concatenate_vcfs,
        name='concatenate_vcfs',
        input=output_from('sort_vcfs'),
        output='variants/vardict/combined.vcf.gz').follows('index_vcfs'))

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('vt_decompose_normalise'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf').follows('index_final_vcf'))

    return pipeline
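# ---------------------------------------------------------------------------
# The "dummy" originate stage above exists only to give the pipeline a start
# node in the dependency graph. A sketch of what such a Stages method could
# look like is below; the method name matches the task_func used above, but
# the class and body are assumptions, not the project's actual code.
# ---------------------------------------------------------------------------
# class Stages(object):
#     def original_fastqs(self, output):
#         """Dummy task: the FASTQ files already exist, so do nothing."""
#         pass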
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the FASTQ files
    # fastq_files = state.config.get_option('fastqs')
    fastq_files = glob.glob("fastqs/*.gz")
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('processed_fastqs')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/pass_samples')
    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/vardict')

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    pipeline.transform(
        task_func=stages.run_surecalltrimmer,
        name='run_surecalltrimmer',
        input=output_from('original_fastqs'),
        filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        add_inputs=add_inputs('fastqs/{sample[0]}_R2.fastq.gz'),
        # filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.fastq.gz'),
        # add_inputs=add_inputs('fastqs/{sample[0]}_R3_001.fastq.gz'),
        extras=['{sample[0]}'],
        # output only needs to know about one file to track progress of the
        # pipeline, but the second certainly exists after this step.
        output='processed_fastqs/{sample[0]}_R1.processed.fastq.gz')
        # output='processed_fastqs/{sample[0]}_R1_001.processed.fastq.gz')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('run_surecalltrimmer'),
        filter=formatter(
            'processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.processed.fastq.gz'),
        add_inputs=add_inputs(
            'processed_fastqs/{sample[0]}_R2.processed.fastq.gz'),
        # filter=formatter('processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.processed.fastq.gz'),
        # add_inputs=add_inputs('processed_fastqs/{sample[0]}_R3_001.processed.fastq.gz'),
        extras=['{sample[0]}'],
        output='alignments/{sample[0]}.bam')

    # Run LocatIt from Agilent; this should produce sorted BAM files, so no
    # sorting is needed at the next step
    pipeline.collate(task_func=stages.run_locatit,
                     name='run_locatit',
                     input=output_from('align_bwa', 'original_fastqs'),
                     filter=regex(r'.+/(.+_L\d\d\d).+'),
                     output=r'alignments/\1.locatit.bam')

    pipeline.transform(task_func=stages.sort_bam,
                       name='sort_bam',
                       input=output_from('run_locatit'),
                       filter=suffix('.locatit.bam'),
                       output='.sorted.locatit.bam')

    # # # # # Metrics stages # # # # #
    # generate mapping metrics (post locatit)
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    # Intersect the bam file with the region of interest
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    # Calculate coverage metrics from the intersected bam file
    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    # Count the number of mapped reads
    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    # Count the number of on-target reads
    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    # Count the number of total reads
    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    # Generate summary metrics from the stats files produced
    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        # filter=regex(r'.+/(.+BS\d{4,6}.+S\d+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])
    # # # # # Metrics stages end # # # # #

    # # # # # Checking metrics and calling # # # # #
    # Originate to set the location of the metrics summary file
    (pipeline.originate(
        task_func=stages.grab_summary_file,
        name='grab_summary_file',
        output='all_sample.summary.txt').follows('generate_stats'))

    # Awk command to produce a list of bam files passing filters
    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt')

    # Touch passed bams to the pass_samples folder and pass the glob of that
    # folder to HaplotypeCaller
    pipeline.subdivide(name='passed_filter_files',
                       task_func=stages.read_samples,
                       input=output_from('filter_stats'),
                       filter=formatter(),
                       output="metrics/pass_samples/*.bam")

    # Call variants using GATK
    (pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/gatk/{sample[0]}.g.vcf').follows('sort_bam'))

    # Call variants with vardict
    (pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}']).follows('sort_bam'))

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    return pipeline
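# ---------------------------------------------------------------------------
# safe_make_dir() is called repeatedly above but defined elsewhere in the
# project. A minimal sketch under the obvious assumption (create the directory
# if it does not already exist, without failing when it does):
# ---------------------------------------------------------------------------
import errno
import os


def safe_make_dir(dir_path):
    '''Create dir_path, ignoring the error if it already exists.'''
    try:
        os.makedirs(dir_path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise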
"test_active_if/a.2" -> "test_active_if/a.4" null -> "test_active_if/b.1" "test_active_if/b.1" -> "test_active_if/b.2" "test_active_if/b.2" -> "test_active_if/b.4" "test_active_if/b.4" -> "test_active_if/summary.5" """ # alternative syntax test_pipeline = Pipeline("test") test_pipeline.originate(task1, ['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter")\ .follows(mkdir("test_active_if")) test_pipeline.transform(task2, task1, suffix(".1"), ".2") test_pipeline.transform(task3, task1, suffix( ".1"), ".3").active_if(lambda: pipeline_active_if) test_pipeline.collate(task4, [task2, task3], regex(r"(.+)\.[23]"), r"\1.4") test_pipeline.merge(task5, task4, "test_active_if/summary.5") class Test_ruffus(unittest.TestCase): def setUp(self): try: shutil.rmtree(tempdir) except: pass os.makedirs(tempdir) def tearDown(self): try: shutil.rmtree(tempdir) pass
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='thepipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        # filter=formatter('(?P<path>.+)/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+)_1.fastq.gz'),
        # 1_HFYLVCCXX:2:TCCGCGAA_2_GE0343_1.fastq.gz
        # 1_HCJWFBCXX:GGACTCCT_L001_9071584415739518822-AGRF-023_R2.fastq.gz
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # e.g. C2WPF.5_Solexa-201237_5_X4311_1.fastq.gz
        add_inputs=add_inputs(
            '{path[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for
        # finding out sample specific configuration options
        extras=['{readid[0]}', '{lib[0]}', '{lane[0]}', '{sample[0]}'],
        # extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(task_func=stages.mark_duplicates_picard,
                       name='mark_duplicates_picard',
                       input=output_from('sort_bam_picard'),
                       filter=suffix('.sort.bam'),
                       # XXX should make metricsdup an extra output?
                       output=['.sort.dedup.bam', '.metricsdup'])

    # Local realignment using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(task_func=stages.realigner_target_creator,
                       name='realigner_target_creator',
                       input=output_from('mark_duplicates_picard'),
                       filter=suffix('.sort.dedup.bam'),
                       output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('realigner_target_creator'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.bam'),
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(task_func=stages.base_recalibration_gatk,
                       name='base_recalibration_gatk',
                       input=output_from('local_realignment_gatk'),
                       filter=suffix('.sort.dedup.realn.bam'),
                       output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        # filter=formatter('.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        # output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Merge lane bams to sample bams
    pipeline.collate(
        task_func=stages.merge_sample_bams,
        name='merge_sample_bams',
        filter=formatter(
            # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).sort.dedup.realn.recal.bam'),
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).sort.dedup.realn.recal.bam'),
        # inputs=add_inputs('alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        input=output_from('print_reads_gatk'),
        output='alignments/{sample[0]}/{sample[0]}.merged.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard2',
        input=output_from('merge_sample_bams'),
        # filter=formatter(
        #     '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).merged.bam'),
        filter=suffix('.merged.bam'),
        # XXX should make metricsdup an extra output?
        output=['.merged.dedup.bam', '.metricsdup'])

    # Local realignment 2 using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(task_func=stages.realigner_target_creator,
                       name='realigner_target_creator2',
                       input=output_from('mark_duplicates_picard2'),
                       filter=suffix('.dedup.bam'),
                       output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk2',
        input=output_from('realigner_target_creator2'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.intervals'),
        # filter=formatter(
        #     '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{sample[0]}.merged.dedup.bam'),
        output='alignments/{sample[0]}/{sample[0]}.merged.dedup.realn.bam')
        .follows('mark_duplicates_picard2'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('local_realignment_gatk2'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+).merged.dedup.realn.bam'),
        output='variants/{sample[0]}.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_haplotypecaller_gatk'),
                   output='variants/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # SNP recalibration using GATK
    pipeline.transform(task_func=stages.snp_recalibrate_gatk,
                       name='snp_recalibrate_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(task_func=stages.indel_recalibrate_gatk,
                       name='indel_recalibrate_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output=['.indel_recal', '.indel_tranches',
                               '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(['ALL.snp_recal', 'ALL.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(['ALL.indel_recal', 'ALL.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['ALL.recal_INDEL.vcf']),
        # output='.combined.vcf')
        output='ALL.raw.vqsr.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # # Select variants using GATK
    # pipeline.transform(
    #     task_func=stages.select_variants_gatk,
    #     name='select_variants_gatk',
    #     input=output_from('combine_variants_gatk'),
    #     filter=suffix('.combined.vcf'),
    #     output='.selected.vcf')

    return pipeline
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9_-]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2_{lib[0]}.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for
        # finding out sample specific configuration options
        extras=['{sample[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.clipped.sort.hq.bam')

    # generate mapping metrics.
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        # filter=regex(r'.+/(.+BS\d{4,6}.+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])

    summary_file = 'all_sample.summary.txt'

    (pipeline.originate(task_func=stages.grab_summary_file,
                        name='grab_summary_file',
                        output=summary_file).follows('generate_stats'))

    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt',
                       extras=['all_sample.failed.summary.txt'])

    return pipeline
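# ---------------------------------------------------------------------------
# Driver sketch (an assumption -- the surrounding framework that builds
# `state` from the config file is not part of this excerpt): once one of the
# make_pipeline*() functions above has returned a Pipeline object, the
# standard Ruffus entry points apply.
# ---------------------------------------------------------------------------
#     pipeline = make_pipeline_map(state)
#     pipeline.printout(sys.stdout, verbose=3)   # dry run: show what would be done
#     pipeline.run(multiprocess=8, verbose=1)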