def build_pipeline():
    pipe = Pipeline("my_pipeline")
    pipe.originate(
        name="create_three_new_files",
        task_func=create_new_file,
        output=[os.path.join(WORK_DIR, f"file{i}.csv") for i in range(1, 4)],
    )
    pipe.transform(
        name="convert_csv_files_to_tsv",
        task_func=csv_to_tsv,
        input=output_from("create_three_new_files"),
        filter=suffix(".csv"),
        output=".tsv",
    )
    pipe.transform(
        name="calculate_md5",
        task_func=md5,
        input=output_from("convert_csv_files_to_tsv"),
        filter=suffix(".tsv"),
        output=".md5sum",
    )
    return pipe
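# A minimal usage sketch (added for illustration, not from the source): assuming
# WORK_DIR and the task functions referenced above (create_new_file, csv_to_tsv,
# md5) are defined, the assembled pipeline is executed with Pipeline.run().
def run_build_pipeline_example():
    pipe = build_pipeline()
    # Ruffus only re-runs jobs whose outputs are missing or out of date
    pipe.run(multiprocess=4, verbose=1)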
def test_newstyle_task(self):
    test_pipeline = Pipeline("test")
    test_pipeline.files(task1, [[None, tempdir + "a.1"], [None, tempdir + "b.1"]])\
        .follows(mkdir(tempdir))
    test_pipeline.files(task2, [[None, tempdir + "c.1"], [None, tempdir + "d.1"]])\
        .follows(mkdir(tempdir))
    test_pipeline.transform(task_func=task3,
                            input=task1,
                            filter=regex(r"(.+)"),
                            replace_inputs=ruffus.inputs(
                                ((r"\1"), task2, "test_transform_inputs.*y")),
                            output=r"\1.output")
    test_pipeline.merge(task4, (task3), tempdir + "final.output")
    test_pipeline.run([task4], multiprocess=10, verbose=0)

    correct_output = ("{tempdir}a.1.output:test_transform_inputs.py,{tempdir}a.1,{tempdir}c.1,{tempdir}d.1;"
                      "{tempdir}b.1.output:test_transform_inputs.py,{tempdir}b.1,{tempdir}c.1,{tempdir}d.1;"
                      ).format(tempdir=tempdir)
    with open(tempdir + "final.output") as ff:
        real_output = ff.read()
    self.assertEqual(correct_output, real_output)
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.files(create_random_numbers, None, tempdir + "random_numbers.list")\
        .follows(mkdir(tempdir))
    test_pipeline.split(task_func=step_4_split_numbers_into_chunks,
                        input=tempdir + "random_numbers.list",
                        output=tempdir + "*.chunks")\
        .follows(create_random_numbers)
    test_pipeline.transform(task_func=step_5_calculate_sum_of_squares,
                            input=step_4_split_numbers_into_chunks,
                            filter=suffix(".chunks"),
                            output=".sums")
    test_pipeline.merge(task_func=step_6_calculate_variance,
                        input=step_5_calculate_sum_of_squares,
                        output=os.path.join(tempdir, "variance.result"))\
        .posttask(lambda: sys.stdout.write(" hooray\n"))\
        .posttask(print_hooray_again, print_whoppee_again,
                  touch_file(os.path.join(tempdir, "done")))
    test_pipeline.run(multiprocess=50, verbose=0)

    output_file = os.path.join(tempdir, "variance.result")
    if not os.path.exists(output_file):
        raise Exception("Missing %s" % output_file)
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.split(task_func=split_fasta_file,
                        input=tempdir + "original.fa",
                        output=[tempdir + "files.split.success",
                                tempdir + "files.split.*.fa"])\
        .posttask(lambda: verbose_output.write(" Split into %d files\n" % 10))

    test_pipeline.transform(task_func=align_sequences,
                            input=split_fasta_file,
                            filter=suffix(".fa"),
                            output=".aln"  # fa -> aln
                            )\
        .posttask(lambda: verbose_output.write(" Sequences aligned\n"))

    test_pipeline.transform(task_func=percentage_identity,
                            input=align_sequences,   # find all results from align_sequences
                            # replace suffix with:
                            filter=suffix(".aln"),
                            output=[r".pcid",          # .pcid suffix for the result
                                    r".pcid_success"]  # .pcid_success to indicate job completed
                            )\
        .posttask(lambda: verbose_output.write(" %Identity calculated\n"))

    test_pipeline.merge(task_func=combine_results,
                        input=percentage_identity,
                        output=[tempdir + "all.combine_results",
                                tempdir + "all.combine_results_success"])\
        .posttask(lambda: verbose_output.write(" Results recombined\n"))

    test_pipeline.run(multiprocess=50, verbose=0)
    if not os.path.exists(tempdir + "all.combine_results"):
        raise Exception("Missing %s" % (tempdir + "all.combine_results"))
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(start_task, ["a.1", "b.1"])
    test_pipeline.transform(same_file_name_task, start_task,
                            suffix(".1"), ".1")
    test_pipeline.transform(linked_file_name_task, start_task,
                            suffix(".1"), ".linked.1")
    test_pipeline.transform(final_task,
                            [linked_file_name_task, same_file_name_task],
                            suffix(".1"), ".3")
    test_pipeline.run(log_exceptions=True, verbose=0)
def test_newstyle_mkdir_run(self):
    test_pipeline = Pipeline("test")
    test_pipeline.split(task_func=generate_initial_files1,
                        input=1,
                        output=[tempdir + "/" + prefix + "_name.tmp1"
                                for prefix in "abcd"])
    test_pipeline.transform(task_func=test_transform,
                            input=generate_initial_files1,
                            filter=formatter(),
                            output="{path[0]}/{basename[0]}.dir/{basename[0]}.tmp2")\
        .mkdir(tempdir + "/test1")\
        .mkdir(tempdir + "/test2")\
        .mkdir(generate_initial_files1, formatter(),
               ["{path[0]}/{basename[0]}.dir", 3, "{path[0]}/{basename[0]}.dir2"])
    test_pipeline.mkdir(test_transform2, tempdir + "/test3")\
        .mkdir(generate_initial_files1, formatter(),
               "{path[0]}/{basename[0]}.dir2")
    cleanup_tmpdir()
    pipeline_run([test_transform, test_transform2], verbose=0,
                 multiprocess=2, pipeline="main")
def test_transform_with_missing_formatter_args_b(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=generate_initial_files,
                            output=[os.path.join(tempdir, ff + ".tmp")
                                    for ff in "abcd"])\
        .mkdir(tempdir)
    test_pipeline.transform(task_func=transform_with_missing_formatter_args,
                            input=generate_initial_files,
                            filter=formatter(),
                            output="{path[0]}/{basename[0]}.task1",
                            extras=['echo {dynamic_message} > {some_file}'])
    s = StringIO()
    test_pipeline.printout(s, [transform_with_missing_formatter_args],
                           verbose=4, wrap_width=10000, pipeline="test")
    self.assertIn("Unmatched field {dynamic_message}", s.getvalue())

    # log to stream
    s = StringIO()
    logger = t_stream_logger(s)
    test_pipeline.run([transform_with_missing_formatter_args], verbose=5,
                      pipeline="test", logger=logger)
    self.assertIn("Unmatched field {dynamic_message}", s.getvalue())
def create_pipeline(self):
    """
    Create new pipeline on the fly without using decorators
    """
    global count_pipelines
    count_pipelines += 1
    test_pipeline = Pipeline("test %d" % count_pipelines)

    test_pipeline.transform(task_func=transform1,
                            input=input_file,
                            filter=suffix('.txt'),
                            output='.output',
                            extras=[runtime_data])
    test_pipeline.transform(task_func=transform_raise_error,
                            input=input_file,
                            filter=suffix('.txt'),
                            output='.output',
                            extras=[runtime_data])
    test_pipeline.split(task_func=split1,
                        input=input_file,
                        output=split1_outputs)
    test_pipeline.merge(task_func=merge2,
                        input=split1,
                        output=merge2_output)
    return test_pipeline
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and
    # turns them into GATK VCFs.
    pipeline = Pipeline(name='genericpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')

    stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    # ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.parallel(parallel_task, [['A', 1], ['B', 3],
                                           ['C', 3], ['D', 4],
                                           ['E', 4], ['F', 4]])
    try:
        test_pipeline.run(multiprocess=50, verbose=0)
    except ruffus.ruffus_exceptions.RethrownJobError:
        return
    raise Exception("Missing exception")
def test_newstyle_collate(self):
    """
    As above but create pipeline on the fly using object orientated syntax rather than decorators
    """

    #
    #   Create pipeline on the fly, joining up tasks
    #
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=generate_initial_files,
                            output=original_files)\
        .mkdir(tempdir, tempdir + "/test")

    test_pipeline.subdivide(task_func=split_fasta_file,
                            input=generate_initial_files,
                            filter=regex(r".*\/original_(\d+).fa"),                # match original files
                            output=[tempdir + r"/files.split.\1.success",          # flag file for each original file
                                    tempdir + r"/files.split.\1.*.fa"],            # glob pattern
                            extras=[r"\1"])\
        .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))

    test_pipeline.transform(task_func=align_sequences,
                            input=split_fasta_file,
                            filter=suffix(".fa"),
                            output=".aln")\
        .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

    test_pipeline.transform(task_func=percentage_identity,
                            input=align_sequences,     # find all results from align_sequences
                            filter=suffix(".aln"),     # replace suffix with:
                            output=[r".pcid",          # .pcid suffix for the result
                                    r".pcid_success"]  # .pcid_success to indicate job completed
                            )\
        .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))

    test_pipeline.collate(task_func=combine_results,
                          input=percentage_identity,
                          filter=regex(r".*files.split\.(\d+)\.\d+.pcid"),
                          output=[tempdir + r"/\1.all.combine_results",
                                  tempdir + r"/\1.all.combine_results_success"])\
        .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

    #
    #   Cleanup, printout and run
    #
    self.cleanup_tmpdir()
    s = StringIO()
    test_pipeline.printout(s, [combine_results], verbose=5, wrap_width=10000)
    self.assertTrue(re.search('Job needs update:.*Missing files.*',
                              s.getvalue(), re.DOTALL) is not None)
    test_pipeline.run(verbose=0)
def make_pipeline1(pipeline_name,   # Pipelines need to have a unique name
                   starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    # We can change the starting files later using
    #   set_input() for transform etc.
    #   or set_output() for originate
    # But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(
        task_func=task_m_to_1,
        name="add_input",
        # Lookup Task from function name task_originate()
        #   So long as this is unique in the pipeline
        input=task_originate,
        # requires an anchor from 3.7 onwards, see
        # https://bugs.python.org/issue34982
        filter=regex(r"^(.*)"),
        add_inputs=add_inputs(tempdir + "/testdir/whatever.txt"),
        output=r"\1.22")
    test_pipeline.transform(
        task_func=task_1_to_1,
        name="22_to_33",
        # Lookup Task from Task name
        #   Function name is not unique in the pipeline
        input=output_from("add_input"),
        filter=suffix(".22"),
        output=".33")
    tail_task = test_pipeline.transform(
        task_func=task_1_to_1,
        name="33_to_44",
        # Ask Pipeline to lookup Task from Task name
        input=test_pipeline["22_to_33"],
        filter=suffix(".33"),
        output=".44")

    # Set the tail task so that users of my sub pipeline can use it as a
    # dependency without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    # If we try to connect a Pipeline without tail tasks defined, we have to
    # specify the exact task within the Pipeline.
    # Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    # without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
def test_newstyle_task(self):
    test_pipeline = Pipeline("test")
    test_pipeline.files(task1, a)

    save_to_str_logger = t_save_to_str_logger()
    test_pipeline.run(multiprocess=10,
                      logger=save_to_str_logger,
                      verbose=1)
    self.assertTrue("@files() was empty" in save_to_str_logger.warning_str)
    print("\n Warning printed out correctly", file=sys.stderr)
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='ovarian_cancer_pipeline')

    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    human_reference_genome_file = state.config.get_option('human_reference_genome')

    # Stages are dependent on the state
    stages = PipelineStages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # The human reference genome in FASTA format
    pipeline.originate(
        task_func=stages.human_reference_genome,
        name='human_reference_genome',
        output=human_reference_genome_file)

    # Index the human reference genome with BWA, needed before we can map reads
    pipeline.transform(
        task_func=stages.index_ref_bwa,
        name='index_ref_bwa',
        input=output_from('human_reference_genome'),
        filter=suffix('.fa'),
        output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Align paired end reads in FASTQ to the reference producing a BAM file
    (pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[_a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the stage (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for
        # finding out sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')
        .follows('index_ref_bwa'))

    return pipeline
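# Worked example of the formatter/add_inputs pairing above (hypothetical file
# names, added for illustration): for an input "reads/sampleA_R1.fastq.gz",
# the formatter captures path == "reads" and sample == "sampleA"; add_inputs
# then pairs it with "reads/sampleA_R2.fastq.gz", extras receives "sampleA",
# and the output becomes "reads/sampleA.bam".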
def test_newstyle_mkdir(self):
    test_pipeline = Pipeline("test")
    test_pipeline.follows(task_which_makes_directories,
                          mkdir(directories),
                          mkdir(tempdir + 'c'),
                          mkdir(tempdir + 'd', tempdir + 'e'),
                          mkdir(tempdir + 'e'))
    test_pipeline.run(multiprocess=10, verbose=0)

    for d in 'abcde':
        fullpath = os.path.join(os.path.dirname(__file__), tempdir, d)
        self.assertTrue(os.path.exists(fullpath))
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and
    # turns them into GATK and undr_rover VCFs.
    pipeline = Pipeline(name='hiplexpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')

    stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')
    safe_make_dir('variants/undr_rover/coverdir')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('passed_filter_files'),
        # Match the clipped, sorted, high-quality BAM file and grab the
        # sample name. This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='variants/undr_rover/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    # #### concatenate undr_rover vcfs ####
    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('apply_undr_rover'),
        filter=formatter('variants/undr_rover/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/undr_rover/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    # ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).clipped.sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
def test_newstyle_no_re_match(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_1, tempdir + "a").mkdir(tempdir)
    test_pipeline.transform(task_2, task_1, regex("b"), "task_2.output")

    save_to_str_logger = t_save_to_str_logger()
    test_pipeline.run(multiprocess=10,
                      logger=save_to_str_logger,
                      verbose=1)
    print(save_to_str_logger.warning_str)
    self.assertTrue("no file names matched" in save_to_str_logger.warning_str)
    print("\n Warning printed out correctly", file=sys.stderr)
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=task1,
                            output=[tempdir + 'a.1'] + runtime_files)
    test_pipeline.transform(task2, task1, suffix(".1"), ".2")
    test_pipeline.transform(task_func=task3,
                            input=task2,
                            filter=suffix(".2"),
                            output=".3")
    test_pipeline.transform(task_func=task4,
                            input=runtime_parameter("a"),
                            filter=suffix(".3"),
                            output=".4").follows(task3)
    test_pipeline.run(verbose=0, runtime_data={"a": runtime_files})
def test_newstyle_task(self):
    """
    Same as above but construct a new pipeline on the fly without decorators
    """
    test_pipeline = Pipeline("test")

    test_pipeline.files(task1, None, tempdir + 'a.1')\
        .follows(mkdir(tempdir))
    test_pipeline.transform(task_func=task2,
                            input=task1,
                            filter=regex(r".*"),
                            output=tempdir + 'b.1')
    test_pipeline.files(task3, task2, tempdir + 'c.1')
    test_pipeline.files(task4, [[None, tempdir + 'd.1'],
                                [None, tempdir + 'e.1']])\
        .follows(task3)
    test_pipeline.files(task5, task4, tempdir + "f.1")
    test_pipeline.run(multiprocess=10, verbose=0)
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.split(task_func=prepare_files,
                        input=None,
                        output=tempdir + '*.animal')\
        .follows(mkdir(tempdir, tempdir + "test"))\
        .posttask(lambda: do_write(tempdir + "task.done", "Task 1 Done\n"))

    test_pipeline.collate(task_func=summarise_by_grouping,
                          input=prepare_files,
                          filter=regex(r'(.*/).*\.(.*)\.animal'),
                          output=r'\1\2.results')\
        .posttask(lambda: do_write(tempdir + "task.done", "Task 2 Done\n"))

    test_pipeline.run(multiprocess=10, verbose=0)
    check_species_correct()
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=make_start,
                            output=[tempdir + 'start'])
    test_pipeline.split(task_func=split_start,
                        input=make_start,
                        output=tempdir + '*.split')
    test_pipeline.subdivide(task_func=subdivide_start,
                            input=split_start,
                            filter=formatter(),
                            output=tempdir + '{basename[0]}_*.subdivided',
                            extras=[tempdir + '{basename[0]}'])

    expected_files_after_1_runs = ["start", "0.split", "0_0.subdivided"]
    expected_files_after_2_runs = ["1.split", "0_1.subdivided", "1_0.subdivided"]
    expected_files_after_3_runs = ["2.split", "0_2.subdivided",
                                   "1_1.subdivided", "2_0.subdivided"]
    expected_files_after_4_runs = ["3.split", "0_3.subdivided", "1_2.subdivided",
                                   "2_1.subdivided", "3_0.subdivided"]

    print(" 1 Run pipeline normally...")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                              expected_files_after_2_runs)
    print(" 2 Check that running again does nothing. (All up to date).")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                              expected_files_after_2_runs)
    time.sleep(2)

    print(" 3 Running again with forced tasks to generate more files...")
    test_pipeline.run(forcedtorun_tasks=["test::make_start"],
                      multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                              + expected_files_after_2_runs,
                                              expected_files_after_3_runs)
    print(" 4 Check that running again does nothing. (All up to date).")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                              + expected_files_after_2_runs,
                                              expected_files_after_3_runs)
    time.sleep(2)

    print(" 5 Running again with forced tasks to generate even more files...")
    test_pipeline.run(forcedtorun_tasks=make_start,
                      multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                              + expected_files_after_2_runs
                                              + expected_files_after_3_runs,
                                              expected_files_after_4_runs)
    print(" 6 Check that running again does nothing. (All up to date).")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                              + expected_files_after_2_runs
                                              + expected_files_after_3_runs,
                                              expected_files_after_4_runs)
def test_newstyle_mkdir(self):
    test_pipeline = Pipeline("test")
    # unicode() is Python 2 only: this test exercises unicode directory names
    test_pipeline.follows(task_which_makes_directories,
                          mkdir(directories),
                          mkdir(unicode(tempdir + "c")),
                          mkdir(unicode(tempdir + "d"),
                                unicode(tempdir + "e")),
                          mkdir(unicode(tempdir + "e")))\
        .posttask(touch_file(unicode(tempdir + "f")))
    test_pipeline.originate(task_which_makes_files,
                            [tempdir + "g", tempdir + "h"])
    test_pipeline.run(multiprocess=10, verbose=0)

    for d in 'abcdefgh':
        fullpath = os.path.join(os.path.dirname(__file__), tempdir, d)
        self.assertTrue(os.path.exists(fullpath))
def test_newstyle_no_re_match(self):
    try:
        test_pipeline = Pipeline("test")
        test_pipeline.transform(task_func=task_2,
                                input=None,
                                filter=regex(tempdir + "b"),
                                replace_inputs=inputs(tempdir + "a",
                                                      tempdir + "b"),
                                output="task_1.output")
        test_pipeline.run(multiprocess=10, verbose=0)
    except ruffus.ruffus_exceptions.error_task_transform_inputs_multiple_args:
        print("\tExpected exception thrown 1")
        return
    except ruffus.ruffus_exceptions.error_inputs_multiple_args:
        print("\tExpected exception thrown 2")
        return
    raise Exception("Inputs(...) with multiple arguments should have "
                    "thrown an exception")
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.follows(setup_simulation_data,
                          mkdir(gene_data_dir, simulation_data_dir))
    test_pipeline.files(gwas_simulation, generate_simulation_params)\
        .follows(setup_simulation_data)\
        .follows(mkdir(working_dir,
                       os.path.join(working_dir, "simulation_results")))
    test_pipeline.collate(statistical_summary,
                          gwas_simulation,
                          regex(r"simulation_results/(\d+).\d+.simulation_res"),
                          r"\1.mean")\
        .posttask(lambda: sys.stdout.write("\nOK\n"))

    test_pipeline.run(multiprocess=50, verbose=0)
    for oo in "000.mean", "001.mean":
        results_file_name = os.path.join(working_dir, oo)
        if not os.path.exists(results_file_name):
            raise Exception("Missing %s" % results_file_name)
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='fastq2bam')

    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')

    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_files,
                       name='original_files',
                       output=input_files)

    pipeline.transform(
        task_func=stages.fastq2bam,
        name='fastq2bam',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        extras=['{sample[0]}'],
        output='{path[0]}/out/{sample[0]}.bam')

    pipeline.transform(
        task_func=stages.validate_prealigned_bam,
        name='validate_prealigned_bam',
        input=output_from('fastq2bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.validation')

    pipeline.transform(
        task_func=stages.align,
        name='align',
        input=output_from('validate_prealigned_bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).validation'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.bam'),
        output='{path[0]}/{sample[0]}.mapped.bam')

    return pipeline
def create_pipeline(self):
    # each pipeline has a different name
    global cnt_pipelines
    cnt_pipelines += 1
    test_pipeline = Pipeline("test %d" % cnt_pipelines)

    test_pipeline.originate(task_func=generate_initial_files1,
                            output=[tempdir + prefix + "_name.tmp1"
                                    for prefix in "abcd"])
    test_pipeline.originate(task_func=generate_initial_files2,
                            output=[tempdir + "e_name.tmp1",
                                    tempdir + "f_name.tmp1"])
    test_pipeline.originate(task_func=generate_initial_files3,
                            output=[tempdir + "g_name.tmp1",
                                    tempdir + "h_name.tmp1"])
    test_pipeline.originate(task_func=generate_initial_files4,
                            output=tempdir + "i_name.tmp1")
    test_pipeline.collate(task_func=test_task2,
                          input=[generate_initial_files1,
                                 generate_initial_files2,
                                 generate_initial_files3,
                                 generate_initial_files4],
                          filter=formatter(),
                          output="{path[0]}/all.tmp2")
    test_pipeline.transform(task_func=test_task3,
                            input=test_task2,
                            filter=suffix(".tmp2"),
                            output=".tmp3")
    test_pipeline.transform(task_func=test_task4,
                            input=test_task3,
                            filter=suffix(".tmp3"),
                            output=".tmp4")
    return test_pipeline
def test_newstyle_simpler(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task1, input_file_names,
                            extras=[logger_proxy, logging_mutex])
    test_pipeline.transform(task2, task1, suffix(".1"), ".2",
                            extras=[logger_proxy, logging_mutex])
    test_pipeline.transform(task3, task2, suffix(".2"), ".3",
                            extras=[logger_proxy, logging_mutex])
    test_pipeline.merge(task4, task3, final_file_name,
                        extras=[logger_proxy, logging_mutex])
    # test_pipeline.merge(task4, task3, final_file_name,
    #                     extras={"logger_proxy": logger_proxy,
    #                             "logging_mutex": logging_mutex})
    test_pipeline.run(multiprocess=500, verbose=0)
def test_newstyle_graphviz_dot(self):
    test_pipeline = Pipeline("test")
    test_pipeline.check_if_uptodate(Up_to_date_task1, lambda: (False, ""))
    test_pipeline.follows(Up_to_date_task2, Up_to_date_task1)\
        .check_if_uptodate(lambda: (False, ""))\
        .graphviz(URL='"http://cnn.com"',
                  fillcolor='"#FFCCCC"',
                  color='"#FF0000"',
                  pencolor='"#FF0000"',
                  fontcolor='"#4B6000"',
                  label_suffix="???",
                  label_prefix="What is this?<BR/> ",
                  label="<What <FONT COLOR=\"red\">is</FONT>this>",
                  shape="component",
                  height=1.5,
                  peripheries=5,
                  style="dashed")
    test_pipeline.follows(Up_to_date_task3, Up_to_date_task2)\
        .check_if_uptodate(lambda: (False, ""))
    test_pipeline.follows(Up_to_date_final_target, Up_to_date_task3)\
        .check_if_uptodate(lambda: (False, ""))
    test_pipeline.follows(Explicitly_specified_task, Up_to_date_task1)\
        .check_if_uptodate(lambda: (False, ""))
    test_pipeline.follows(Task_to_run1, Explicitly_specified_task)
    test_pipeline.follows(Task_to_run2, Task_to_run1)
    test_pipeline.follows(Task_to_run3, Task_to_run2)
    test_pipeline.follows(Up_to_date_task_forced_to_rerun, Task_to_run2)\
        .check_if_uptodate(lambda: (False, ""))
    test_pipeline.follows(Final_target,
                          Up_to_date_task_forced_to_rerun,
                          Task_to_run3)
    test_pipeline.follows(Downstream_task1_ignored, Final_target)
    test_pipeline.follows(Downstream_task2_ignored, Final_target)

    if sys.hexversion >= 0x03000000:
        # everything is unicode in python3
        s = BytesIO()
    else:
        s = StringIO()

    test_pipeline.printout_graph(
        s,
        # use flowchart file name extension to decide flowchart format
        # e.g. svg, jpg etc.
        "dot",
        [Final_target, Up_to_date_final_target])
    self.assertTrue('[URL="http://cnn.com", color="#FF0000", '
                    'fillcolor="#FFCCCC", fontcolor="#4B6000", height=1.5, '
                    'label=<What is this?<BR/> What <FONT COLOR="red">is</FONT>this???>, '
                    'pencolor="#FF0000", peripheries=5, shape=component, '
                    'style=dashed]' in s.getvalue().decode())
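# Hedged variant (added for illustration, based on the comment above about the
# file name extension deciding the flowchart format): printout_graph() also
# accepts a file name instead of a stream, e.g. to write an SVG flowchart:
#   test_pipeline.printout_graph("flowchart.svg", "svg",
#                                [Final_target, Up_to_date_final_target])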
def make_pipeline2(pipeline_name="pipeline2"):
    test_pipeline2 = Pipeline(pipeline_name)
    test_pipeline2.transform(task_func=task_1_to_1,
                             # task name
                             name="44_to_55",
                             # placeholder: will be replaced later with set_input()
                             input=None,
                             filter=suffix(".44"),
                             output=".55")
    test_pipeline2.merge(task_func=task_m_to_1,
                         input=test_pipeline2["44_to_55"],
                         output=tempdir + "/final.output")

    # Set head and tail
    test_pipeline2.set_tail_tasks([test_pipeline2[task_m_to_1]])
    if not DEBUG_do_not_define_head_task:
        test_pipeline2.set_head_tasks([test_pipeline2["44_to_55"]])

    return test_pipeline2
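# A hedged sketch (assumptions: make_pipeline1 / make_pipeline2 as defined
# above, with head and tail tasks set) of how the two sub-pipelines can be
# joined: because pipeline1 declares tail tasks and pipeline2 declares a head
# task, callers never need to know the internal task names.
def join_subpipelines_example():
    pipeline1 = make_pipeline1("pipeline1", [tempdir + "/a.start"])
    pipeline2 = make_pipeline2()
    # Feed pipeline1's tail tasks into pipeline2's head task
    pipeline2.set_input(input=pipeline1)
    pipeline2.run(multiprocess=10, verbose=0)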
def build_pipeline(config):
    """
    Assemble the pipeline

    Parameters
    ----------
    config: ApusConfig object
        Holds the configuration

    Returns
    -------
    pipe: Pipeline object
        The pipeline object with tasks set up
    """
    pipe = Pipeline(name=config.jobkey)
    # mkdirs
    # t00 = {'name': 'create jobdir'}
    # dirs = config.get_dirs()
    # pipe.mkdir(dirs, name=t00['name'])
    for task in config.tlist:
        create_ruffus_task(pipe, config, task)
    return pipe