def test_newstyle_ruffus(self):
    """Wire up the start/split/subdivide pipeline and render its
    flowchart in several formats, checking that an unknown file
    extension is rejected with CalledProcessError."""
    print(" Run pipeline normally...")
    pipeline = Pipeline("test")
    pipeline.originate(make_start, [tempdir + 'start'])
    pipeline.split(split_start, make_start, tempdir + '*.split')
    pipeline.subdivide(subdivide_start, split_start, formatter(),
                       tempdir + '{basename[0]}_*.subdivided',
                       tempdir + '{basename[0]}')

    if not self.graph_viz_present:
        # Without graphviz only the plain dot file can be written.
        pipeline.printout_graph(tempdir + "flowchart.dot",
                                target_tasks=[subdivide_start],
                                forcedtorun_tasks=[split_start],
                                no_key_legend=True)
        return

    pipeline.printout_graph(tempdir + "flowchart.dot")
    pipeline.printout_graph(tempdir + "flowchart.jpg",
                            target_tasks=[subdivide_start],
                            forcedtorun_tasks=[split_start],
                            no_key_legend=True)
    pipeline.printout_graph(tempdir + "flowchart.svg", no_key_legend=False)
    # An unrecognised extension must raise CalledProcessError ...
    try:
        pipeline.printout_graph(tempdir + "flowchart.unknown",
                                no_key_legend=False)
        raise Exception("Failed to throw exception for test_pipeline.printout_graph unknown extension ")
    except CalledProcessError:
        pass
    # ... unless the output format is given explicitly.
    pipeline.printout_graph(tempdir + "flowchart.unknown", "svg",
                            no_key_legend=False)
def test_newstyle_ruffus(self):
    # alternative syntax
    """Build the task1..task5 chain with the object-oriented API,
    run it, and compare the merged summary with the expected text."""
    pipeline = Pipeline("test")
    pipeline.mkdir(data_dir, work_dir)
    pipeline.originate(task_func=task1,
                       output=[os.path.join(data_dir, "%s.1" % aa)
                               for aa in "abcd"])
    pipeline.mkdir(filter=suffix(".1"), output=".dir", output_dir=work_dir)
    pipeline.transform(task_func=task2,
                       input=task1,
                       filter=suffix(".1"),
                       output=[".1", ".bak"],
                       extras=["extra.tst", 4, r"orig_dir=\1"],
                       output_dir=work_dir)
    pipeline.subdivide(task3, task2, suffix(".1"), r"\1.*.2",
                       [r"\1.a.2", r"\1.b.2"], output_dir=data_dir)
    pipeline.transform(task4, task3, suffix(".2"), ".3", output_dir=work_dir)
    pipeline.merge(task5, task4, os.path.join(data_dir, "summary.5"))
    pipeline.run(multiprocess=50, verbose=0)

    # The merged summary must match the expected text exactly.
    with open(os.path.join(data_dir, "summary.5")) as summary_file:
        active_text = summary_file.read()
    if active_text != expected_active_text:
        raise Exception("Error:\n\tExpected\n%s\nInstead\n%s\n"
                        % (expected_active_text, active_text))
def test_newstyle_collate(self):
    """
    As above but create pipeline on the fly using object orientated
    syntax rather than decorators
    """
    # Join the tasks up on the fly.
    pipeline = Pipeline("test")
    pipeline.originate(task_func=generate_initial_files,
                       output=original_files)\
        .mkdir(tempdir, tempdir + "/test")

    pipeline.subdivide(task_func=split_fasta_file,
                       input=generate_initial_files,
                       # match original files
                       filter=regex(r".*\/original_(\d+).fa"),
                       # flag file for each original file, then glob pattern
                       output=[tempdir + r"/files.split.\1.success",
                               tempdir + r"/files.split.\1.*.fa"],
                       extras=[r"\1"])\
        .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))

    pipeline.transform(task_func=align_sequences,
                       input=split_fasta_file,
                       filter=suffix(".fa"),
                       output=".aln")\
        .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

    pipeline.transform(task_func=percentage_identity,
                       # find all results from align_sequences
                       input=align_sequences,
                       # replace suffix: .pcid result plus .pcid_success flag
                       filter=suffix(".aln"),
                       output=[r".pcid", r".pcid_success"])\
        .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))

    pipeline.collate(task_func=combine_results,
                     input=percentage_identity,
                     filter=regex(r".*files.split\.(\d+)\.\d+.pcid"),
                     output=[tempdir + r"/\1.all.combine_results",
                             tempdir + r"/\1.all.combine_results_success"])\
        .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

    # Clean up, check the dry-run printout, then run for real.
    self.cleanup_tmpdir()
    stream = StringIO()
    pipeline.printout(stream, [combine_results], verbose=5, wrap_width=10000)
    self.assertTrue(re.search('Job needs update:.*Missing files.*',
                              stream.getvalue(), re.DOTALL) is not None)
    pipeline.run(verbose=0)
def test_newstyle_collate(self):
    """
    As above but create pipeline on the fly using object orientated
    syntax rather than decorators
    """
    #
    # Create pipeline on the fly, joining up tasks
    #
    test_pipeline = Pipeline("test")

    head = test_pipeline.originate(task_func=generate_initial_files,
                                   output=original_files)
    head.mkdir(tempdir, tempdir + "/test")

    splitter = test_pipeline.subdivide(
        task_func=split_fasta_file,
        input=generate_initial_files,
        filter=regex(r".*\/original_(\d+).fa"),      # match original files
        output=[tempdir + r"/files.split.\1.success",  # per-file flag
                tempdir + r"/files.split.\1.*.fa"],    # glob pattern
        extras=[r"\1"])
    splitter.posttask(
        lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))

    aligner = test_pipeline.transform(task_func=align_sequences,
                                      input=split_fasta_file,
                                      filter=suffix(".fa"),
                                      output=".aln")
    aligner.posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

    pcid = test_pipeline.transform(
        task_func=percentage_identity,
        input=align_sequences,              # results from align_sequences
        filter=suffix(".aln"),              # replace suffix with:
        output=[r".pcid",                   # .pcid for the result
                r".pcid_success"])          # .pcid_success marks completion
    pcid.posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))

    combiner = test_pipeline.collate(
        task_func=combine_results,
        input=percentage_identity,
        filter=regex(r".*files.split\.(\d+)\.\d+.pcid"),
        output=[tempdir + r"/\1.all.combine_results",
                tempdir + r"/\1.all.combine_results_success"])
    combiner.posttask(lambda: sys.stderr.write("\tResults recombined\n"))

    #
    # Cleanup, printout and run
    #
    self.cleanup_tmpdir()
    s = StringIO()
    test_pipeline.printout(s, [combine_results], verbose=5, wrap_width=10000)
    self.assertTrue(re.search('Job needs update:.*Missing files.*',
                              s.getvalue(), re.DOTALL) is not None)
    test_pipeline.run(verbose=0)
def test_newstyle_ruffus(self):
    """Run the pipeline repeatedly, forcing reruns of make_start, and
    verify exactly which output files exist after each pass."""
    pipeline = Pipeline("test")
    pipeline.originate(task_func=make_start, output=[tempdir + 'start'])
    pipeline.split(task_func=split_start, input=make_start,
                   output=tempdir + '*.split')
    pipeline.subdivide(task_func=subdivide_start, input=split_start,
                       filter=formatter(),
                       output=tempdir + '{basename[0]}_*.subdivided',
                       extras=[tempdir + '{basename[0]}'])

    # Files expected to exist (and, pairwise, NOT yet exist) after each run.
    after_1 = ["start", "0.split", "0_0.subdivided"]
    after_2 = ["1.split", "0_1.subdivided", "1_0.subdivided"]
    after_3 = ["2.split", "0_2.subdivided", "1_1.subdivided", "2_0.subdivided"]
    after_4 = ["3.split", "0_3.subdivided", "1_2.subdivided",
               "2_1.subdivided", "3_0.subdivided"]

    print(" 1 Run pipeline normally...")
    pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(after_1, after_2)

    print(" 2 Check that running again does nothing. (All up to date).")
    pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(after_1, after_2)
    time.sleep(2)

    print(" 3 Running again with forced tasks to generate more files...")
    # Force by fully qualified task name this time.
    pipeline.run(forcedtorun_tasks=["test::make_start"],
                 multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(after_1 + after_2, after_3)

    print(" 4 Check that running again does nothing. (All up to date).")
    pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(after_1 + after_2, after_3)
    time.sleep(2)

    print(" 5 Running again with forced tasks to generate even more files...")
    # Force by task function object this time.
    pipeline.run(forcedtorun_tasks=make_start,
                 multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(after_1 + after_2 + after_3,
                                              after_4)

    print(" 6 Check that running again does nothing. (All up to date).")
    pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(after_1 + after_2 + after_3,
                                              after_4)
def test_newstyle_ruffus(self):
    """Exercise forced reruns: each forced run of make_start should
    create one more .split file and its subdivided outputs."""
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=make_start,
                            output=[tempdir + 'start'])
    test_pipeline.split(task_func=split_start,
                        input=make_start,
                        output=tempdir + '*.split')
    test_pipeline.subdivide(task_func=subdivide_start,
                            input=split_start,
                            filter=formatter(),
                            output=tempdir + '{basename[0]}_*.subdivided',
                            extras=[tempdir + '{basename[0]}'])

    # Expected file sets after runs 1..4; each check asserts the current
    # set exists and the next set does not exist yet.
    expected = [
        ["start", "0.split", "0_0.subdivided"],
        ["1.split", "0_1.subdivided", "1_0.subdivided"],
        ["2.split", "0_2.subdivided", "1_1.subdivided", "2_0.subdivided"],
        ["3.split", "0_3.subdivided", "1_2.subdivided", "2_1.subdivided",
         "3_0.subdivided"],
    ]

    print(" 1 Run pipeline normally...")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected[0], expected[1])

    print(" 2 Check that running again does nothing. (All up to date).")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected[0], expected[1])
    time.sleep(2)

    print(" 3 Running again with forced tasks to generate more files...")
    test_pipeline.run(forcedtorun_tasks=["test::make_start"],
                      multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected[0] + expected[1],
                                              expected[2])

    print(" 4 Check that running again does nothing. (All up to date).")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected[0] + expected[1],
                                              expected[2])
    time.sleep(2)

    print(" 5 Running again with forced tasks to generate even more files...")
    test_pipeline.run(forcedtorun_tasks=make_start,
                      multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(
        expected[0] + expected[1] + expected[2], expected[3])

    print(" 6 Check that running again does nothing. (All up to date).")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(
        expected[0] + expected[1] + expected[2], expected[3])
def test_newstyle_ruffus(self):
    # alternative syntax
    """Drive the task1..task5 pipeline via Pipeline methods and check
    the final merged summary file against expected_active_text."""
    test_pipeline = Pipeline("test")
    test_pipeline.mkdir(data_dir, work_dir)

    initial_outputs = [os.path.join(data_dir, "%s.1" % aa) for aa in "abcd"]
    test_pipeline.originate(task_func=task1, output=initial_outputs)
    test_pipeline.mkdir(filter=suffix(".1"),
                        output=".dir",
                        output_dir=work_dir)
    test_pipeline.transform(task_func=task2,
                            input=task1,
                            filter=suffix(".1"),
                            output=[".1", ".bak"],
                            extras=["extra.tst", 4, r"orig_dir=\1"],
                            output_dir=work_dir)
    test_pipeline.subdivide(task3, task2, suffix(".1"),
                            r"\1.*.2",
                            [r"\1.a.2", r"\1.b.2"],
                            output_dir=data_dir)
    test_pipeline.transform(task4, task3, suffix(".2"), ".3",
                            output_dir=work_dir)
    test_pipeline.merge(task5, task4, os.path.join(data_dir, "summary.5"))
    test_pipeline.run(multiprocess=50, verbose=0)

    summary_path = os.path.join(data_dir, "summary.5")
    with open(summary_path) as ii:
        active_text = ii.read()
    if active_text != expected_active_text:
        raise Exception("Error:\n\tExpected\n%s\nInstead\n%s\n"
                        % (expected_active_text, active_text))
def test_newstyle_ruffus(self):
    """Build the start/split/subdivide pipeline and print its flowchart
    to dot/jpg/svg; an unknown extension must raise CalledProcessError
    unless the format is passed explicitly."""
    print(" Run pipeline normally...")
    flow = Pipeline("test")
    flow.originate(make_start, [tempdir + 'start'])
    flow.split(split_start, make_start, tempdir + '*.split')
    flow.subdivide(subdivide_start, split_start, formatter(),
                   tempdir + '{basename[0]}_*.subdivided',
                   tempdir + '{basename[0]}')
    if self.graph_viz_present:
        flow.printout_graph(tempdir + "flowchart.dot")
        flow.printout_graph(tempdir + "flowchart.jpg",
                            target_tasks=[subdivide_start],
                            forcedtorun_tasks=[split_start],
                            no_key_legend=True)
        flow.printout_graph(tempdir + "flowchart.svg", no_key_legend=False)
        # Unknown format
        try:
            flow.printout_graph(tempdir + "flowchart.unknown",
                                no_key_legend=False)
            raise Exception(
                "Failed to throw exception for test_pipeline.printout_graph unknown extension ")
        except CalledProcessError:
            pass
        # Explicit format overrides the unknown extension.
        flow.printout_graph(tempdir + "flowchart.unknown", "svg",
                            no_key_legend=False)
    else:
        flow.printout_graph(tempdir + "flowchart.dot",
                            target_tasks=[subdivide_start],
                            forcedtorun_tasks=[split_start],
                            no_key_legend=True)
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together.

    Wires FASTQ trimming, alignment, metrics collection and variant
    calling (GATK + vardict) into a single ruffus Pipeline and returns it.
    '''
    # Build an empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the FASTQ files
    #fastq_files = state.config.get_option('fastqs')
    fastq_files = glob.glob("fastqs/*.gz")
    # Stages are dependent on the state
    stages = Stages(state)
    # Pre-create every output directory the stages write into.
    safe_make_dir('alignments')
    safe_make_dir('processed_fastqs')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/pass_samples')
    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/vardict')
    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)
    # Trim reads with SureCallTrimmer, pairing R1 with its R2 mate.
    pipeline.transform(
        task_func=stages.run_surecalltrimmer,
        name='run_surecalltrimmer',
        input=output_from('original_fastqs'),
        filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        add_inputs=add_inputs('fastqs/{sample[0]}_R2.fastq.gz'),
        #filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.fastq.gz'),
        #add_inputs=add_inputs('fastqs/{sample[0]}_R3_001.fastq.gz'),
        extras=['{sample[0]}'],
        # output only needs to know about one file to track progress of the pipeline, but the second certainly exists after this step.
        output='processed_fastqs/{sample[0]}_R1.processed.fastq.gz')
        #output='processed_fastqs/{sample[0]}_R1_001.processed.fastq.gz')
    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('run_surecalltrimmer'),
        filter=formatter(
            'processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.processed.fastq.gz'),
        add_inputs=add_inputs(
            'processed_fastqs/{sample[0]}_R2.processed.fastq.gz'),
        #filter=formatter('processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.processed.fastq.gz'),
        #add_inputs=add_inputs('processed_fastqs/{sample[0]}_R3_001.processed.fastq.gz'),
        extras=['{sample[0]}'],
        output='alignments/{sample[0]}.bam')
    # Run locatit from agilent. this should produce sorted bam files, so no sorting needed at the next step
    pipeline.collate(task_func=stages.run_locatit,
                     name='run_locatit',
                     input=output_from('align_bwa', 'original_fastqs'),
                     filter=regex(r'.+/(.+_L\d\d\d).+'),
                     output=r'alignments/\1.locatit.bam')
    pipeline.transform(task_func=stages.sort_bam,
                       name='sort_bam',
                       input=output_from('run_locatit'),
                       filter=suffix('.locatit.bam'),
                       output='.sorted.locatit.bam')
    # # # # # Metrics stages # # # # #
    # generate mapping metrics (post locatit)
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])
    # Intersect the bam file with the region of interest
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')
    # Calculate coverage metrics from the intersected bam file
    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')
    # Count the number of mapped reads
    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')
    # Count the number of on-target reads
    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')
    # Count the number of total reads
    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')
    # Generate summary metrics from the stats files produces
    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        #filter=regex(r'.+/(.+BS\d{4,6}.+S\d+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])
    # # # # # Metrics stages end # # # # #
    # # # # # Checking metrics and calling # # # # #
    # Originate to set the location of the metrics summary file
    (pipeline.originate(
        task_func=stages.grab_summary_file,
        name='grab_summary_file',
        output='all_sample.summary.txt').follows('generate_stats'))
    # Awk command to produce a list of bam files passing filters
    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt')
    # Touch passed bams to the pass_samples folder and pass the glob of that folder to HaplotypeCaller
    pipeline.subdivide(name='passed_filter_files',
                       task_func=stages.read_samples,
                       input=output_from('filter_stats'),
                       filter=formatter(),
                       output="metrics/pass_samples/*.bam")
    # Call variants using GATK
    (pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/gatk/{sample[0]}.g.vcf').follows('sort_bam'))
    # Call variants with vardict
    (pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}']).follows('sort_bam'))
    # Sort then index the vardict VCFs.
    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')
    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')
    return (pipeline)