def quick(ifold):
    # sorting bam file
    pipeline = ruffus.Pipeline('BamDNaseSeq')
    bam_file = '*.bam'
    sort_bam_regex = r'(.*)\/(.*).bam$'
    sort_bam_task = pipeline.collate(tasks.sort_bam,
                                     name='sorting_bam',
                                     input=os.path.join(ifold, bam_file),
                                     filter=ruffus.regex(sort_bam_regex),
                                     output=r'\1/\2.sorted.bam')

    ## bam to bed using bam2bed
    sorted_bam_file = '*.sorted.bam'
    sorted_bam_regex = r'(.*)\/(.*).sorted.bam$'
    sorted_bam_task = pipeline.collate(tasks.bam2bed,
                                       name='bam2bed',
                                       input=os.path.join(ifold, sorted_bam_file),
                                       filter=ruffus.regex(sorted_bam_regex),
                                       output=r'\1/\2.sorted.bed')
    sorted_bam_task.follows('sorting_bam')

    full_pipe = ruffus.Pipeline('Full pipeline', input=['bam2bed'])
    full_pipe.run()
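# Illustration only (not part of the pipeline above, file name is hypothetical):
# the collate filter/output pair rewrites each input path with ordinary regex
# back-references, which is what produces the ".sorted.bam" names.
import re
print(re.sub(r'(.*)\/(.*).bam$', r'\1/\2.sorted.bam', 'data/sample1.bam'))
# prints: data/sample1.sorted.bam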
def test_newstyle_collate(self):
    """
    As above but create pipeline on the fly using object orientated syntax
    rather than decorators
    """

    #
    # Create pipeline on the fly, joining up tasks
    #
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=generate_initial_files,
                            output=original_files)\
        .mkdir(tempdir, tempdir + "/test")

    test_pipeline.subdivide(task_func=split_fasta_file,
                            input=generate_initial_files,
                            # match original files
                            filter=regex(r".*\/original_(\d+).fa"),
                            output=[tempdir + r"/files.split.\1.success",  # flag file for each original file
                                    tempdir + r"/files.split.\1.*.fa"],    # glob pattern
                            extras=[r"\1"])\
        .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))

    test_pipeline.transform(task_func=align_sequences,
                            input=split_fasta_file,
                            filter=suffix(".fa"),
                            output=".aln")\
        .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

    test_pipeline.transform(task_func=percentage_identity,
                            input=align_sequences,      # find all results from align_sequences
                            # replace suffix with:
                            filter=suffix(".aln"),
                            output=[r".pcid",           # .pcid suffix for the result
                                    r".pcid_success"]   # .pcid_success to indicate job completed
                            )\
        .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))

    test_pipeline.collate(task_func=combine_results,
                          input=percentage_identity,
                          filter=regex(r".*files.split\.(\d+)\.\d+.pcid"),
                          output=[tempdir + r"/\1.all.combine_results",
                                  tempdir + r"/\1.all.combine_results_success"])\
        .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

    #
    # Cleanup, printout and run
    #
    self.cleanup_tmpdir()
    s = StringIO()
    test_pipeline.printout(s, [combine_results], verbose=5, wrap_width=10000)
    self.assertTrue(re.search(
        'Job needs update:.*Missing files.*', s.getvalue(), re.DOTALL) is not None)
    test_pipeline.run(verbose=0)
def test_newstyle_task(self):
    test_pipeline = Pipeline("test")
    test_pipeline.files(task1, [[None, tempdir + "a.1"], [None, tempdir + "b.1"]])\
        .follows(mkdir(tempdir))
    test_pipeline.files(task2, [[None, tempdir + "c.1"], [None, tempdir + "d.1"]])\
        .follows(mkdir(tempdir))
    test_pipeline.transform(task_func=task3,
                            input=task1,
                            filter=regex(r"(.+)"),
                            replace_inputs=ruffus.inputs(
                                ((r"\1"), task2, "test_transform_inputs.*y")),
                            output=r"\1.output")
    test_pipeline.merge(task4, (task3), tempdir + "final.output")
    test_pipeline.run([task4], multiprocess=10, verbose=0)

    correct_output = "{tempdir}a.1.output:test_transform_inputs.py,{tempdir}a.1,{tempdir}c.1,{tempdir}d.1;{tempdir}b.1.output:test_transform_inputs.py,{tempdir}b.1,{tempdir}c.1,{tempdir}d.1;".format(
        tempdir=tempdir)
    with open(tempdir + "final.output") as ff:
        real_output = ff.read()
    self.assertEqual(correct_output, real_output)
def make_pipeline1(pipeline_name,   # Pipelines need to have a unique name
                   starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    # We can change the starting files later using
    #   set_input() for transform etc.
    #   or set_output() for originate
    # But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(
        task_func=task_m_to_1,
        name="add_input",
        # Lookup Task from function name task_originate()
        #   So long as this is unique in the pipeline
        input=task_originate,
        # requires an anchor from 3.7 onwards, see
        # https://bugs.python.org/issue34982
        filter=regex(r"^(.*)"),
        add_inputs=add_inputs(tempdir + "/testdir/whatever.txt"),
        output=r"\1.22")
    test_pipeline.transform(
        task_func=task_1_to_1,
        name="22_to_33",
        # Lookup Task from Task name
        #   Function name is not unique in the pipeline
        input=output_from("add_input"),
        filter=suffix(".22"),
        output=".33")
    tail_task = test_pipeline.transform(
        task_func=task_1_to_1,
        name="33_to_44",
        # Ask Pipeline to lookup Task from Task name
        input=test_pipeline["22_to_33"],
        filter=suffix(".33"),
        output=".44")

    # Set the tail task so that users of my sub pipeline can use it as a dependency
    #   without knowing the details of task names
    #
    # Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    # If we try to connect a Pipeline without tail tasks defined, we have to
    #   specify the exact task within the Pipeline.
    # Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
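# A hedged sketch of why set_tail_tasks() above is useful. The downstream
# pipeline, its task name and the starting file are assumptions, not from the
# original code: because the sub-pipeline has a defined tail task, another
# pipeline can depend on it without naming its internal tasks.
sub_pipeline = make_pipeline1("sub_pipeline", [tempdir + "/start.txt"])
downstream = Pipeline("downstream")
downstream.transform(task_func=task_1_to_1,
                     name="44_to_55",
                     input=sub_pipeline,    # resolved via the tail task set above
                     filter=suffix(".44"),
                     output=".55")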
def test_newstyle_no_re_match(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_1, tempdir + "a").mkdir(tempdir)
    test_pipeline.transform(task_2, task_1, regex("b"), "task_2.output")

    save_to_str_logger = t_save_to_str_logger()
    test_pipeline.run(
        multiprocess=10, logger=save_to_str_logger, verbose=1)
    print(save_to_str_logger.warning_str)
    self.assertTrue(
        "no file names matched" in save_to_str_logger.warning_str)
    print("\n Warning printed out correctly", file=sys.stderr)
def test_newstyle_task(self):
    """
    Same as above but construct a new pipeline on the fly without decorators
    """
    test_pipeline = Pipeline("test")
    test_pipeline.files(task1, None, tempdir + 'a.1')\
        .follows(mkdir(tempdir))
    test_pipeline.transform(task_func=task2,
                            input=task1,
                            filter=regex(r".*"),
                            output=tempdir + 'b.1')
    test_pipeline.files(task3, task2, tempdir + 'c.1')
    test_pipeline.files(task4, [[None, tempdir + 'd.1'], [None, tempdir + 'e.1']])\
        .follows(task3)
    test_pipeline.files(task5, task4, tempdir + "f.1")
    test_pipeline.run(multiprocess=10, verbose=0)
def add_export_to_pipeline(pipeline, tool_runners, suffix, config, **kwargs):
    conf = config.get("export", {})
    prefix = conf.get("prefix", "").strip()
    result_dir = "export.dir"
    filter_regex = ruffus.regex("(.*).dir/(.*).{}".format(suffix))
    output = r"{}/{}\1.{}".format(result_dir, prefix, suffix)
    export_result.__name__ = "export"
    pipeline.transform(task_func=export_result,
                       input=tool_runners,
                       filter=filter_regex,
                       output=output,
                       **kwargs).mkdir(result_dir)
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.split(task_func=prepare_files,
                        input=None,
                        output=tempdir + '*.animal')\
        .follows(mkdir(tempdir, tempdir + "test"))\
        .posttask(lambda: do_write(tempdir + "task.done", "Task 1 Done\n"))
    test_pipeline.collate(task_func=summarise_by_grouping,
                          input=prepare_files,
                          filter=regex(r'(.*/).*\.(.*)\.animal'),
                          output=r'\1\2.results')\
        .posttask(lambda: do_write(tempdir + "task.done", "Task 2 Done\n"))
    test_pipeline.run(multiprocess=10, verbose=0)
    check_species_correct()
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.follows(setup_simulation_data,
                          mkdir(gene_data_dir, simulation_data_dir))
    test_pipeline.files(gwas_simulation, generate_simulation_params)\
        .follows(setup_simulation_data)\
        .follows(mkdir(working_dir,
                       os.path.join(working_dir, "simulation_results")))
    test_pipeline.collate(statistical_summary,
                          gwas_simulation,
                          regex(r"simulation_results/(\d+).\d+.simulation_res"),
                          r"\1.mean")\
        .posttask(lambda: sys.stdout.write("\nOK\n"))
    test_pipeline.run(multiprocess=50, verbose=0)
    for oo in "000.mean", "001.mean":
        results_file_name = os.path.join(working_dir, oo)
        if not os.path.exists(results_file_name):
            raise Exception("Missing %s" % results_file_name)
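# Illustration only (hypothetical file names): how the collate regex in the
# test above groups simulation results by their leading index, so each group is
# combined into a single ".mean" summary file (hence "000.mean" and "001.mean").
import re
for f in ["simulation_results/000.0.simulation_res",
          "simulation_results/000.1.simulation_res",
          "simulation_results/001.0.simulation_res"]:
    print(re.sub(r"simulation_results/(\d+).\d+.simulation_res", r"\1.mean", f))
# prints 000.mean twice and 001.mean once: two groups, two summary files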
def test_newstyle_no_re_match(self):
    try:
        test_pipeline = Pipeline("test")
        test_pipeline.transform(task_func=task_2,
                                input=None,
                                filter=regex(tempdir + "b"),
                                replace_inputs=inputs(
                                    tempdir + "a", tempdir + "b"),
                                output="task_1.output")
        test_pipeline.run(multiprocess=10, verbose=0)
    except ruffus.ruffus_exceptions.error_task_transform_inputs_multiple_args:
        print("\tExpected exception thrown 1")
        return
    except ruffus.ruffus_exceptions.error_inputs_multiple_args:
        print("\tExpected exception thrown 2")
        return
    raise Exception(
        "Inputs(...) with multiple arguments should have thrown an exception")
REGEX_TRACK_BOTH = r"(processed.dir/)*([^/]+)\.(fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)"

SEQUENCEFILES_REGEX = r"([^/]+).(?P<suffix>fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)"


def connect():
    '''
    Setup a connection to an sqlite database
    '''
    dbh = sqlite3.connect(P.get_params()['database'])
    return dbh


@transform(P.get_params()["input_globs"].get("default", INPUT_FORMATS),
           regex("(.*)"),
           r"\1")
def unprocessReads(infiles, outfiles):
    """dummy task - no processing of reads."""


# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if P.get_params().get("preprocessors", None):
    if P.get_params()["auto_remove"]:
        # check if FastQC has been run
        for x in iotools.flatten([glob.glob(y) for y in
                                  P.get_params()["input_globs"].get("default", INPUT_FORMATS)]):
            f = "fastqc.dir/" + re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "file %s missing, "
def main():
    #########
    # SETUP #
    #########

    # catch jgi logon and password from cli
    parser = ruffus.cmdline.get_argparse(
        description='5 accessions variant calling pipeline.')
    parser.add_argument('--email', '-e',
                        help='Logon email address for JGI',
                        type=str,
                        dest='jgi_logon')
    parser.add_argument('--password', '-p',
                        help='JGI password',
                        type=str,
                        dest='jgi_password')
    options = parser.parse_args()
    jgi_logon = options.jgi_logon
    jgi_password = options.jgi_password

    ##################
    # PIPELINE STEPS #
    ##################

    # test function for checking input/output passed to job_script and parsing
    # by io_parser
    test_job_function = functions.generate_job_function(
        job_script='src/sh/io_parser',
        job_name='test')

    # initialise pipeline
    main_pipeline = ruffus.Pipeline.pipelines["main"]

    # bamfiles
    raw_files = [x.path for x in os.scandir('data/bam')
                 if x.name.endswith('.bam') and x.is_file]

    # subset the files while the pipeline is in development. Make this equal
    # to the raw_files to run the whole pipline.
    # active_raw_files = [x for x in raw_files if
    #                     'G1' in x or 'G4' in x or 'J1' in x or 'J4' in x]
    active_raw_files = raw_files

    # species short names for vcf splitting
    species_short_names = list(set(
        [os.path.basename(x)[0] for x in active_raw_files]))

    # check that the files exist
    mapped_raw = main_pipeline.originate(
        name='mapped_raw',
        task_func=os.path.isfile,
        output=active_raw_files)

    # genome fasta
    ref_fa = main_pipeline.originate(
        name='ref_fa',
        task_func=functions.generate_job_function(
            job_script='src/sh/download_genome',
            job_name='ref_fa',
            job_type='download'),
        output='data/genome/Osativa_323_v7.0.fa',
        extras=[jgi_logon, jgi_password])

    # indexes
    fa_idx = main_pipeline.transform(
        name='fa_idx',
        task_func=functions.generate_job_function(
            job_script='src/sh/fa_idx',
            job_name='fa_idx',
            job_type='transform',
            cpus_per_task=6),
        input=ref_fa,
        filter=ruffus.suffix(".fa"),
        output=['.dict', '.fa.fai'])

    # annotation
    annot = main_pipeline.originate(
        name='annot',
        task_func=functions.generate_job_function(
            job_script='src/sh/download_genome',
            job_name='annot',
            job_type='download'),
        output=('data/genome/'
                'Osativa_323_v7.0.gene_exons.gffread.rRNAremoved.gtf'),
        extras=[jgi_logon, jgi_password])

    # convert annotation to .bed
    annot_bed = main_pipeline.transform(
        name='annot_bed',
        task_func=functions.generate_job_function(
            job_script='src/sh/annot_bed',
            job_name='annot_bed',
            job_type='transform',
            cpus_per_task=7),
        input=annot,
        filter=ruffus.suffix('.gtf'),
        output='.bed')

    # mark duplicates with picard
    deduped = main_pipeline.transform(
        name='dedupe',
        task_func=functions.generate_job_function(
            job_script='src/sh/mark_duplicates_and_sort',
            job_name='dedupe',
            job_type='transform',
            cpus_per_task=2),
        input=mapped_raw,
        filter=ruffus.regex(r"data/bam/(.*).Aligned.out.bam"),
        output=(r"output/mark_duplicates_and_sort/\1.deduped.bam"))

    # Split'N'Trim and reassign mapping qualities
    split_and_trimmed = main_pipeline.transform(
        name='split_trim',
        task_func=functions.generate_job_function(
            job_script='src/sh/split_trim',
            job_name='split_trim',
            job_type='transform',
            cpus_per_task=2),
        input=deduped,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.formatter(
            "output/mark_duplicates_and_sort/(?P<LIB>.+).deduped.bam"),
        output=["{subdir[0][1]}/split_trim/{LIB[0]}.split.bam"])\
        .follows(fa_idx)

    # we're going to recycle call_variants, merge_variants, filter_variants
    # and analyze_covar so we'll get the functions in advance
    call_variants = functions.generate_queue_job_function(
        job_script='src/sh/call_variants',
        job_name='call_variants')
    merge_variants = functions.generate_job_function(
        job_script='src/sh/merge_variants',
        job_name='merge_variants',
        job_type='transform',
        cpus_per_task=8)
    filter_variants = functions.generate_job_function(
        job_script='src/sh/filter_variants',
        job_name='filter_variants',
        job_type='transform',
        cpus_per_task=1)
    analyze_covar = functions.generate_queue_job_function(
        job_script='src/sh/analyze_covar',
        job_name='analyze_covar')

    # call variants without recalibration tables
    uncalibrated_variants = main_pipeline.transform(
        name='uncalibrated_variants',
        task_func=call_variants,
        input=split_and_trimmed,
        add_inputs=ruffus.add_inputs([ref_fa, annot_bed]),
        filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'),
        output='{subdir[0][1]}/variants_uncalibrated/{LIB[0]}.g.vcf.gz')

    # merge gVCF variants
    uncalibrated_variants_merged = main_pipeline.merge(
        name='uncalibrated_variants_merged',
        task_func=merge_variants,
        input=[uncalibrated_variants, ref_fa],
        output='output/variants_uncalibrated/variants_uncalibrated.vcf.gz')

    # filter variants on un-corrected bamfiles
    uncalibrated_variants_filtered = main_pipeline.transform(
        name='uncalibrated_variants_filtered',
        task_func=filter_variants,
        input=uncalibrated_variants_merged,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('_uncalibrated.vcf.gz'),
        output='_uncalibrated_filtered.vcf.gz')

    # select variant (only recalibrate using passed SNPs)
    uncalibrated_variants_selected = main_pipeline.transform(
        name='uncalibrated_variants_selected',
        task_func=functions.generate_job_function(
            job_script='src/sh/select_variants',
            job_name='select_variants',
            job_type='transform'),
        input=uncalibrated_variants_filtered,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('_uncalibrated_filtered.vcf.gz'),
        output='_uncalibrated_selected.vcf.gz')

    # create recalibration report with filtered variants
    covar_report = main_pipeline.merge(
        name='covar_report',
        task_func=analyze_covar,
        input=[split_and_trimmed, ref_fa, annot_bed,
               uncalibrated_variants_selected],
        output="output/covar_analysis/recal_data.table")

    # second pass to analyze covariation remaining after recalibration
    second_pass_covar_report = main_pipeline.merge(
        name='second_pass_covar_report',
        task_func=analyze_covar,
        input=[split_and_trimmed, ref_fa, annot_bed,
               uncalibrated_variants_filtered, covar_report],
        output="output/covar_analysis/post_recal_data.table")

    # plot effect of base recalibration
    recal_plot = main_pipeline.transform(
        name='recal_plot',
        task_func=functions.generate_job_function(
            job_script='src/R/recal_plot.R',
            job_name='recal_plot',
            job_type='transform',
            cpus_per_task=1),
        input=second_pass_covar_report,
        filter=ruffus.suffix('post_recal_data.table'),
        add_inputs=ruffus.add_inputs(covar_report),
        output='recalibration_plots.pdf')

    # recalibrate bases using recalibration report
    recalibrated = main_pipeline.transform(
        name='recalibrate',
        task_func=functions.generate_job_function(
            job_script='src/sh/recalibrate',
            job_name='recalibrate',
            job_type='transform',
            cpus_per_task=2),
        input=split_and_trimmed,
        add_inputs=ruffus.add_inputs([ref_fa, covar_report]),
        filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'),
        output='{subdir[0][1]}/recal/{LIB[0]}.recal.bam')

    # final variant calling
    variants = main_pipeline.transform(
        name='variants',
        task_func=call_variants,
        input=recalibrated,
        add_inputs=ruffus.add_inputs(ref_fa, annot_bed),
        filter=ruffus.formatter('output/recal/(?P<LIB>.+).recal.bam'),
        output='{subdir[0][1]}/variants/{LIB[0]}.g.vcf.gz')

    # merge gVCF variants
    variants_merged = main_pipeline.merge(
        name='variants_merged',
        task_func=merge_variants,
        input=[variants, ref_fa],
        output='output/variants/variants.vcf.gz')

    # variant filtering
    variants_filtered = main_pipeline.transform(
        name='variants_filtered',
        task_func=filter_variants,
        input=variants_merged,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('.vcf.gz'),
        output='_filtered.vcf.gz')

    # variants by species
    split_variants = main_pipeline.subdivide(
        name='split_variants',
        task_func=functions.generate_job_function(
            job_script='src/sh/split_variants',
            job_name='split_variants',
            job_type='transform',
            cpus_per_task=1,
            ntasks=len(species_short_names)),
        input=variants_filtered,
        filter=ruffus.formatter(),
        add_inputs=ruffus.add_inputs(ref_fa),
        output=[('output/split_variants/' + x + '.variants_filtered.vcf.gz')
                for x in species_short_names])

    # count variants per gene per species
    cds_variants = main_pipeline.transform(
        name='cds_variants',
        task_func=functions.generate_job_function(
            job_script='src/R/cds_variants.R',
            job_name='cds_variants',
            job_type='transform'),
        input=split_variants,
        add_inputs=ruffus.add_inputs([ref_fa, annot]),
        filter=ruffus.formatter(
            'output/split_variants/(?P<LIB>.+).variants_filtered.vcf.gz'),
        output='{subdir[0][1]}/cds_variants/{LIB[0]}.cds_variants.Rds')

    # merge counted variants
    variants_per_gene = main_pipeline.merge(
        name='cds_merge',
        task_func=functions.generate_job_function(
            job_script='src/R/cds_merge.R',
            job_name='cds_merge',
            job_type='transform'),
        input=cds_variants,
        output='output/cds_variants/cds_variants.Rds')

    ###################
    # RUFFUS COMMANDS #
    ###################

    # print the flowchart
    ruffus.pipeline_printout_graph(
        "ruffus/flowchart.pdf", "pdf",
        pipeline_name="5 accessions variant calling pipeline")

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)
def mappipe(ifold, ref_file, minlen=20, rclip=0):
    ifold = os.path.join(ifold, '')
    ifile = '*.fastq.gz'  # '*.fastq.gz'
    # ref_file = '/data/index/HG19.fasta'
    trim_regex = r'(.*)\/(SRR.+).fastq.gz$'
    pipeline = ruffus.Pipeline('FastqDNaseSeq')
    trim_task = pipeline.collate(tasks.trimmer,
                                 name='TrimGalore',
                                 input=ifold + ifile,
                                 filter=ruffus.regex(trim_regex),
                                 output=r'\1/\2_trimmed.fq.gz',
                                 # extras[0]: minimum length,
                                 # [1]: right end clip size
                                 extras=[[minlen, rclip]])

    trfile = '*_trimmed.fq.gz'
    aln_regex = r'(.*)\/(.*).fq.gz$'
    align_task = pipeline.collate(tasks.bwa_aln,
                                  name='bwa_aln',
                                  input=ifold + trfile,
                                  filter=ruffus.regex(aln_regex),
                                  output=r'\1/\2.sai',
                                  extras=[ref_file])
    align_task.follows('TrimGalore')

    ## sai to sam file using bwa samse
    sai_file = '*.sai'
    samse_regex = r'(.*)\/(.*).sai$'
    samse_task = pipeline.collate(tasks.bwa_samse,
                                  name='bwa_samse',
                                  input=ifold + sai_file,
                                  filter=ruffus.regex(samse_regex),
                                  output=r'\1/\2.sam',
                                  # extras[0]: fastq required for samse,
                                  # [1]: ref indexed fasta,
                                  # [2]: max multiple mapped reads [Default=3]
                                  extras=[[r'\1/\2.fq.gz', ref_file, 10]])
    samse_task.follows('bwa_aln')

    ## sam to bam using sambamba view
    sam_file = '*.sam'
    tobam_regex = r'(.*)\/(.*).sam$'
    tobam_task = pipeline.collate(tasks.sam_to_bam,
                                  name='sam_bam',
                                  input=ifold + sam_file,
                                  filter=ruffus.regex(tobam_regex),
                                  output=r'\1/\2.bam')
    tobam_task.follows('bwa_samse')

    ## sorting bam with sambamba sort
    bam_file = '*trimmed.bam'
    sort_bam_regex = r'(.*)\/(.*).bam$'
    sort_bam_task = pipeline.collate(tasks.sort_bam,
                                     name='sorting_bam',
                                     input=ifold + bam_file,
                                     filter=ruffus.regex(sort_bam_regex),
                                     output=r'\1/\2.sorted.bam')
    sort_bam_task.follows('sam_bam')

    ## bam to bed using bam2bed
    sorted_bam_file = '*trimmed.sorted.bam'
    sorted_bam_regex = r'(.*)\/(.*).sorted.bam$'
    sorted_bam_task = pipeline.collate(tasks.bam2bed,
                                       name='bam2bed',
                                       input=ifold + sorted_bam_file,
                                       filter=ruffus.regex(sorted_bam_regex),
                                       output=r'\1/\2.sorted.bed')
    sorted_bam_task.follows('sorting_bam')

    full_pipe = ruffus.Pipeline('Full pipeline', input=['bam2bed'])
    full_pipe.run()
    if filetype == "bam":
        preamble += "samtools index %(tmpfile)s && "
        postamble += " && rm %(tmpfile)s.bai "
    elif filetype == "bed.gz":
        tmp2 = P.get_temp_filename(shared=False)
        preamble += '''
        zcat %(tmpfile)s
        | sort -k1,1 -k2,2n
        | bgzip > %(tmp2)s &&
        mv %(tmp2)s %(tmpfile)s &&
        tabix -p bed %(tmpfile)s && '''
        postamble += "&& rm %(tmpfile)s.tbi"

    return preamble % locals(), postamble % locals(), tmpfile, filetype


# ------------------------------------------------------------------------------
@subdivide("*.categories.tsv",
           regex("(.+).categories.tsv"),
           add_inputs(PARAMS["geneset"]),
           r"\1_*.gtf.gz",
           r"\1")
def split_gtf_by_category(infiles, outfiles, catname):

    catfile, gtffile = infiles
    categories = pd.read_csv(catfile, index_col=0, squeeze=True, sep="\t")

    # create output filepool
    outpool = iotools.FilePool("{}_%s.gtf.gz".format(catname), force=True)

    gtffile = iotools.open_file(gtffile)

    for gtfline in gtf.iterator(gtffile):
        try:
            transcript_id = gtfline.transcript_id
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='cellfree_seq')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        # Add one more inputs to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.sort.hq.bam')

    pipeline.transform(task_func=stages.run_connor,
                       name='run_connor',
                       input=output_from('align_bwa'),
                       filter=suffix('.sort.hq.bam'),
                       output='.sort.hq.connor.bam')

    safe_make_dir('metrics')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/connor')

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_raw',
        input=output_from('coverage_bed_raw', 'genome_reads_raw',
                          'target_reads_raw', 'total_reads_raw'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'summary.txt'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_connor',
        input=output_from('coverage_bed_connor', 'genome_reads_connor',
                          'target_reads_connor', 'total_reads_connor'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/connor/all_sample.summary.\1.txt',
        extras=[r'\1', 'connor.summary.txt'])

    safe_make_dir('variants')
    safe_make_dir('variants/vardict')

    pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    (pipeline.merge(
        task_func=stages.concatenate_vcfs,
        name='concatenate_vcfs',
        input=output_from('sort_vcfs'),
        output='variants/vardict/combined.vcf.gz').follows('index_vcfs'))

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('vt_decompose_normalise'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf').follows('index_final_vcf'))

    return pipeline
    '''Determine indel candidate intervals'''
    cmd_dict = CMD_DICT.copy()
    cmd_dict['infile'] = input_file
    cmd_dict['outfile'] = output_file
    pmsg('Interval Creation', input_file, output_file)
    gatk_cmd = '%(gatk)s --analysis_type RealignerTargetCreator ' + \
            '--reference_sequence %(reference)s ' + \
            '--DBSNP %(dbsnp)s ' + \
            '--input_file %(infile)s ' + \
            '--out %(outfile)s'
    call(gatk_cmd, cmd_dict)


# Realign around possible indels
@follows(mkdir('realigned'))
@transform(create_intervals, regex(r'^intervals/(.+)\.intervals$'),
           inputs([r'deduped/\1.deduped.bam', r'intervals/\1.intervals']),
           r'realigned/\1.realigned.bam')
def local_realignment(input_files, output_file):
    '''Realign reads around candidate indels'''
    cmd_dict = CMD_DICT.copy()
    cmd_dict['bam_file'] = input_files[0]
    cmd_dict['indel_intervals'] = input_files[1]
    cmd_dict['outfile'] = output_file
    pmsg('Local Realignment', ', '.join(input_files), output_file)
    gatk_cmd = '%(gatk)s --analysis_type IndelRealigner ' + \
            '--reference_sequence %(reference)s ' + \
            '--DBSNP %(dbsnp)s ' + \
            '--input_file %(bam_file)s ' + \
            '--targetIntervals %(indel_intervals)s ' + \
            '--out %(outfile)s'
# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
@follows(mkdir(tempdir))
@ruffus.files([[None, tempdir + "a.1"], [None, tempdir + "b.1"]])
def task1(i, o):
    touch(o)


@follows(mkdir(tempdir))
@ruffus.files([[None, tempdir + "c.1"], [None, tempdir + "d.1"]])
def task2(i, o):
    touch(o)


@transform(task1, regex(r"(.+)"),
           ruffus.inputs(((r"\1"), task2, "test_transform_inputs.*y")),
           r"\1.output")
def task3(i, o):
    names = ",".join(sorted(i))
    for f in o:
        with open(o, "w") as ff:
            ff.write(names)


@merge((task3), tempdir + "final.output")
def task4(i, o):
    with open(o, "w") as o_file:
        for f in sorted(i):
            with open(f) as ff:
                o_file.write(f + ":" + ff.read() + ";")
    try:
        _ = int(open(in_genes).readline().strip().split('\t')[0])
        s = 0
    except:
        s = -1
    with open(in_genes) as infile:
        with open(out_bed, 'w') as outfile:
            for line in infile:
                fields = line.strip().split('\t')
                chrom, start, stop = fields[s+2], fields[s+4], fields[s+5]
                name, strand = fields[s+1], fields[s+3]
                outfile.write('\t'.join([chrom, start, stop, name, '0',
                                         strand]) + '\n')


@follows(get_refseq_genes, convert_gtf_genes_to_bed)
@split('%s.*_genes' % cfg.get('DEFAULT', 'genome'), regex(r'(.*)_genes$'),
       [r'\1_genes.promoter*_ext*', r'\1_genes.down*_ext*',
        r'\1_genes.utr5', r'\1_genes.utr3',
        r'\1_genes.exon', r'\1_genes.intron',
        r'\1_genes.tss', r'\1_genes.noncoding'])
def refseq_genes_to_regions(in_genes, out_pattern):
    """make regions (promoter, downstream, 5UTR, etc) from refseq_genes"""
    args = shlex.split('''%s --promoter_size=%s --promoter_extend=%s
                       --downstream_size=%s --downstream_extend=%s
                       --with_gene_name''' % (
                            in_genes,
                            cfg.get('genes', 'promoter_size'),
                            cfg.get('genes', 'promoter_extend'),
                            cfg.get('genes', 'downstream_size'),
                            cfg.get('genes', 'downstream_extend')))
    makeGeneStructure.main(args)
REGEX_TRACK_BOTH = \
    r"(processed.dir/)*([^/]+)\.(fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)"

SEQUENCEFILES_REGEX = r"(\S+).(?P<suffix>fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)"


def connect():
    '''
    Setup a connection to an sqlite database
    '''
    dbh = sqlite3.connect(PARAMS['database'])
    return dbh


@transform(INPUT_FORMATS, regex("(.*)"), r"\1")
def unprocessReads(infiles, outfiles):
    """dummy task - no processing of reads."""


# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if PARAMS.get("preprocessors", None):
    if PARAMS["auto_remove"]:
        # check if fastqc has been run
        for x in IOTools.flatten([glob.glob(y) for y in INPUT_FORMATS]):
            f = re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "file %s missing, "
                    "you need to run the pipeline once before "
# _________________________________________________________________________________________
#
#   Step 2:
#
#       Statistical summary per gene/gwas file pair
#
#       for n_file in NNN_pairs_of_input_files:
#           working_dir/simulation_results/n.*.simulation_res
#               -> working_dir/n.mean
#
# _________________________________________________________________________________________
@collate(gwas_simulation,
         regex(r"simulation_results/(\d+).\d+.simulation_res"),
         r"\1.mean")
@posttask(lambda: sys.stdout.write("\nOK\n"))
def statistical_summary(result_files, summary_file):
    """
    Simulate statistical summary
    """
    summary_file = open(summary_file, "w")
    for f in result_files:
        with open(f) as ii:
            summary_file.write(ii.read())
    summary_file.close()


try:
    from StringIO import StringIO
# TODO Also have to run the new bag file extractor for mark2
@follows('convert_params_to_h5')
@files(None, '%s/sentinel' % LDR_DIR)
@posttask(touch_file('%s/sentinel' % LDR_DIR))
def align_ldr(dummy, sentinel):
    cmd = 'python %s/process/LidarAlign.py %s %s' % (SAIL_CAR_LOG_PATH,
                                                     DSET_DIR,
                                                     '%s%d.avi' % (DSET, CAMERA))
    print cmd
    check_call(cmd, shell=True)


@follows('align_ldr')
# @files('params.ini', '%s/sentinel' % POINTS_H5_DIR)
@transform('%s/*.ldr' % LDR_DIR,
           regex('%s/(.*?).ldr' % LDR_DIR),
           r'%s/\1.h5' % POINTS_H5_DIR)
def convert_ldr_to_h5(ldr_file, h5_file):
    exporter = '%s/mapping/pipeline/ldr_to_h5.py' % SAIL_CAR_LOG_PATH
    cmd = 'python {exporter} {fgps} {ldr_file} {h5_file}'.format(
        exporter=exporter, fgps=GPS_FILE, ldr_file=ldr_file, h5_file=h5_file)
    if NO_TRANSFORM:
        cmd += ' --no_transform'
    print cmd
    check_call(cmd, shell=True)


@follows('convert_ldr_to_h5')
@transform('%s/*.h5' % POINTS_H5_DIR,
           regex('%s/(.*?).h5' % POINTS_H5_DIR),
           r'%s/\1.pcd' % PCD_DIR)
def convert_h5_to_pcd(input_file, output_file):
    strand = fields[5] if len(fields) >= 6 else "+"
    # +:RED, -:GREEN
    color = "255,0,0" if strand == "+" else "0,255,0"
    outfile.write("\t".join(fields + [start, stop, color]) + "\n")


@transform(bed_color_strand, suffix(""), ".bigbed")
def bed_to_bigbed(in_bed, out_bigbed):
    """Convert a BED file to .bigbed for viewing on UCSC browser"""
    cmd = "bedToBigBed %s %s.chrom.sizes %s" % (in_bed, genome_path(),
                                                out_bigbed)
    sys_call(cmd)


@transform([bed_uniquefy, clip_and_sort_peaks] + mapping.all_mappers_output,
           regex("(.*mapped_reads).clipped.sorted(.unique|)"),
           # suffix('.mapped_reads'),
           add_inputs(bootstrap.get_chrom_sizes),
           r"\1\2.bedgraph",
           )  # r'.bedgraph')
def bed_to_bedgraph(in_files, out_bedgraph):
    "extend reads to the full fragment length and create a bedgraph from them"
    in_bed, in_chrom_sizes = in_files
    cmd = ("slopBed -i %s -s -r %s -l 0 -g %s | " +
           "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s") % (
               in_bed,
               cfg.getint("DEFAULT", "fragment_size") -
               cfg.getint("DEFAULT", "tag_size"),
               in_chrom_sizes,
               cfg.get("DEFAULT", "genome"),
               genome_path(),
               out_bedgraph,
    # check whether this is a illumina or sanger fastq file
    try:
        SeqIO.convert(input_file_handle, 'fastq-illumina',
                      output_file_handle, 'fastq-sanger')
    except ValueError as e:
        # check if this is a quality score problem
        if e.args != ('Invalid character in quality string',):
            raise e
        input_file_handle.seek(0)
        output_file_handle.seek(0)
        output_file_handle.writelines(input_file_handle.readlines())
    finally:
        input_file_handle.close()
        output_file_handle.close()


@follows(mkdir('sai'), mkdir('logs'))
@transform(copy_sequence, regex(r'^fastq/(.+)_sequence\.fastq\.gz$'),
           r'sai/\1.sai')
def fastq_to_sai(input_file, output_file):
    '''Convert FASTQ files to SAI files.'''
    cmd_dict = CMD_DICT.copy()
    cmd_dict['infile'] = input_file
    cmd_dict['outfile'] = output_file
    pmsg('Aligning sequences', cmd_dict['infile'], cmd_dict['outfile'])
    bwacmd = '%(bwa)s aln -t %(threads)s -f %(outfile)s %(reference)s %(infile)s'
    call(bwacmd, cmd_dict)


# Merge paired ends to SAM
@follows(mkdir('sam'))
@transform(fastq_to_sai, regex(r'^sai/(\w+)_s_(\d)(_1)?\.sai$'),
           inputs([r'sai/\1_s_\2*.sai', r'fastq/\1_s_\2*.fastq.gz']),
           r'sam/\1_s_\2.sam')
def make_sam(input_files, output_file):
from hts_waterworks.bootstrap import (genome_path, get_genome, cfg,
                                      get_chrom_sizes)
import hts_waterworks.preprocessing as preprocessing

#: the references to map against for this run (genome, transcriptome, etc)
reference_genomes = [genome_path()]
if cfg.getboolean('mapping', 'map_to_transcriptome'):
    reference_genomes.append('*_genes.transcriptome.fasta')


@follows(mkdir('mapped'))
def make_mapping_dir():
    pass


@active_if(cfg.getboolean('mapping', 'map_to_transcriptome'))
@split('*_genes', regex(r'(.*)_genes$'),
       [r'\1_genes.transcriptome.fasta', r'\1_genes.transcriptome.seqdb',
        r'\1_genes.transcriptome.msa'])
def make_transcriptome(in_genes, out_files):
    """Splice UTR's and exons from gene annotations into a transcriptome.

    Creates a fasta-file of resulting genes and a gene to genome alignment.

    """
    out_fasta, out_db, out_msa = out_files
    startCol = 1
    msa = cnestedlist.NLMSA(out_msa, mode='w', pairwiseMode=True)
    genome = get_genome(None, None, touch_file=False)
    for chrom in genome.values():
        msa += chrom
    outfile = open(out_fasta, 'w')
    P.run(statement, job_memory=PARAMS["job_highmemory"])

    statement = '''
    tabix -p bed %(outfile)s
    '''
    P.run(statement, job_memory=PARAMS["job_highmemory"])


###################################################################
# ENSEMBL gene set
###################################################################
@follows(mkdir('ensembl.dir'))
@transform(PARAMS["ensembl_filename_gtf"],
           regex("(\S+)"),
           r"%s" % PARAMS['interface_geneset_all_gtf'])
def buildUCSCGeneSet(infile, outfile):
    '''output sanitized ENSEMBL geneset.

    This method outputs an ENSEMBL gene set after some sanitizing steps:

    1. Chromosome names are changed to the UCSC convention.
    2. Chromosomes that match the regular expression specified in
       the configuration file are removed.

    Arguments
    ---------
    infiles : string
        ENSEMBL geneset in :term:`gtf` format.
        NCBI Assembly report in 'txt' format.
#
#    task3
#
@active_if(lambda: pipeline_active_if)
@transform(task1, suffix(".1"), ".3")
def task3(infile, outfile):
    """
    Third task
    """
    helper(infile, outfile)


#
#    task4
#
@collate([task2, task3], regex(r"(.+)\.[23]"), r"\1.4")
def task4(infiles, outfile):
    """
    Fourth task
    """
    helper(infiles, outfile)


#
#    task4
#
@merge(task4, "test_active_if/summary.5")
def task5(infiles, outfile):
    """
    Fifth task
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#   Tasks
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
@mkdir(tempdir)
@originate(tempdir + "a")
def task_1(o):
    open(o, 'w').close()


@transform(task_1, regex("b"), "task_2.output")
def task_2(i, o):
    for f in o:
        with open(f, 'w') as oo:
            pass


import unittest


class t_save_to_str_logger:
    """
    Everything to stderr
    """

    def __init__(self):
        self.info_str = ""
        self.warning_str = ""
def prepare_files(no_inputs, outputs):
    # cleanup previous
    for f in outputs:
        os.unlink(f)

    for grouping in species_list:
        for species_name in species_list[grouping]:
            filename = tempdir + "%s.%s.animal" % (species_name, grouping)
            with open(filename, "w") as oo:
                oo.write(species_name + "\n")


#
#    task2
#
@collate(prepare_files, regex(r'(.*/).*\.(.*)\.animal'), r'\1\2.results')
@posttask(lambda: do_write(tempdir + "task.done", "Task 2 Done\n"))
def summarise_by_grouping(infiles, outfile):
    """
    Summarise by each species group, e.g. mammals, reptiles, fish
    """
    with open(tempdir + "jobs.start", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfile]))
    with open(outfile, "w") as oo:
        for i in infiles:
            with open(i) as ii:
                oo.write(ii.read())
    with open(tempdir + "jobs.finish", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfile]))
def main():
    #########
    # SETUP #
    #########

    # test function for checking input/output passed to job_script and parsing
    # by src/sh/io_parser
    test_job_function = tompltools.generate_job_function(
        job_script='src/sh/io_parser',
        job_name='test',
        verbose=True)

    # parse email etc. here?
    parser = ruffus.cmdline.get_argparse(
        description='ASW genome assembly pipeline.')
    parser.add_argument('--blast-db-folder',
                        help='Path to BLAST db folder',
                        type=str,
                        dest='blast_db_folder')
    # parser.add_argument('--email', '-e',
    #                     help='Logon email address for JGI',
    #                     type=str,
    #                     dest='jgi_logon')
    # parser.add_argument('--password', '-p',
    #                     help='JGI password',
    #                     type=str,
    #                     dest='jgi_password')
    options = parser.parse_args()
    # jgi_logon = options.jgi_logon
    # jgi_password = options.jgi_password
    if options.blast_db_folder:
        os.environ['BLASTDB'] = options.blast_db_folder

    # initialise pipeline
    main_pipeline = ruffus.Pipeline.pipelines['main']

    # find fastq.gz files
    dir_listing = [x[0] for x in os.walk(top='data', followlinks=True)]
    fastq_file_list = []
    for directory in dir_listing:
        file_list = os.scandir(directory)
        fastq_file_list.append([x.path for x in file_list
                                if (x.name.endswith('fastq.gz')
                                    or x.name.endswith('.fastq'))
                                and x.is_file()])
    fastq_files = list(tompytools.flatten_list(fastq_file_list))

    # extract only MH gDNA fastq data, i.e.
    # 2125-06-11-1 = MH PE
    # 2125-06-06-1 = MH MP
    active_fq_files = [x for x in fastq_files
                       if ('2125-06-11-1' in x
                           or '2125-06-06-1' in x)]

    # load files into ruffus
    raw_fq_files = main_pipeline.originate(
        name='raw_fq_files',
        task_func=os.path.isfile,
        output=active_fq_files)

    # merge libraries
    merged_fq_files = main_pipeline.collate(
        name='merged_fq_files',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/merge_fq',
            job_name='merge_fq'),
        input=raw_fq_files,
        filter=ruffus.formatter(
            r'data/NZGL02125/.*/'
            '[^-]+-(?P<LIB>[^_]+).+_R(?P<RN>\d)_.*.fastq.gz'),
        output=[r'output/fq_merged/{LIB[0]}_R{RN[0]}_merged.fastq.gz'])

    # make pairs and send to cutadapt for trimming external adaptors
    trim_cutadapt = main_pipeline.collate(
        name='trim_cutadapt',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/cutadapt_pe',
            job_name='cutadapt'),
        input=merged_fq_files,
        filter=ruffus.formatter(
            r'.+/(?P<LIB>[^_]+)_R(?P<RN>\d)_merged.fastq.gz'),
        output=[['output/cutadapt/{LIB[0]}_R1_trimmed.fastq.gz',
                 'output/cutadapt/{LIB[0]}_R2_trimmed.fastq.gz']])

    # send trimmed reads to splitnextera
    mp_splitnextera = main_pipeline.subdivide(
        name='splitnextera',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/splitnextera',
            job_name='splitnextera'),
        input=trim_cutadapt,
        filter=ruffus.regex(
            r'.+?/2125-06-06-1_R(?P<RN>\d)_trimmed.fastq.gz'),
        output=['output/splitnextera/2125-06-06-1.pe.fastq.gz',
                'output/splitnextera/2125-06-06-1.se.fastq.gz',
                'output/splitnextera/2125-06-06-1.mp.fastq.gz',
                'output/splitnextera/2125-06-06-1.unknown.fastq.gz'])

    # decontaminate PhiX (other?) sequences
    decon_mp = main_pipeline.transform(
        name='decon_mp',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/decon',
            job_name='decon_mp'),
        input=mp_splitnextera,
        filter=ruffus.formatter(
            r'.+/2125-06-06-1\.(?P<VL>[^.]+)\.fastq.gz'),
        output=['output/decon/2125-06-06-1_{VL[0]}.fastq.gz'])

    decon_pe = main_pipeline.transform(
        name='decon_pe',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/decon',
            job_name='decon_pe'),
        input=trim_cutadapt,
        filter=ruffus.regex(
            r'.+?/2125-06-11-1_R(?P<RN>\d)_trimmed.fastq.gz'),
        output=[r'output/decon/2125-06-11-1.fastq.gz'])

    decon = [decon_mp, decon_pe]

    # digital normalisation and error correction w/ bbnorm
    bbnorm = main_pipeline.subdivide(
        name='bbnorm',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/bbnorm',
            job_name='bbnorm',
            mem_per_cpu=7000,
            cpus_per_task=8),
        input=decon,
        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        output=[r'output/bbnorm/{LN[0]}{VL[0]}.fastq.gz'])

    # download NCBI databases for taxonomy data
    download_taxonomy_databases = main_pipeline.originate(
        name='download_taxonomy_databases',
        task_func=tompltools.generate_job_function(
            job_script='src/r/download_taxonomy_databases.R',
            job_name='download_taxonomy_databases',
            job_type='originate'),
        output=[['data/ncbi/nucl_gb.accession2taxid.Rds',
                 'data/ncbi/nodes.dmp.Rds',
                 'data/ncbi/names.dmp.Rds']])

    # subsample reads, blast with biopython and parse results
    fq_subsample = main_pipeline.subdivide(
        name='fq_subsample',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/fq_subsample',
            job_name='fq_subsample'),
        input=bbnorm,
        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        output=[r'output/blastqc/{LN[0]}{VL[0]}_R1.fastq.gz',
                r'output/blastqc/{LN[0]}{VL[0]}_R2.fastq.gz'])

    blast_reads = main_pipeline.transform(
        name='blast_reads',
        task_func=tompltools.generate_job_function(
            job_script='src/py/blast_reads.py',
            job_name='blast_reads',
            cpus_per_task=4),
        input=fq_subsample,
        filter=ruffus.suffix('.fastq.gz'),
        output=['.xml'])

    parse_blast_results = main_pipeline.transform(
        name='parse_blast_results',
        task_func=tompltools.generate_job_function(
            job_script='src/py/parse_blast_results.py',
            job_name='parse_blast_results'),
        input=blast_reads,
        filter=ruffus.suffix('.xml'),
        output=['.table'])

    main_pipeline.merge(
        name='plot_blast_resuts',
        task_func=tompltools.generate_job_function(
            job_script='src/r/extract_blast_hits_per_taxid.R',
            job_name='plot_blast_resuts'),
        input=[parse_blast_results, download_taxonomy_databases],
        output=['output/blastqc/plots.pdf'])

    # trim reads to 100 bp for edena?
    clip_to_100b = main_pipeline.subdivide(
        name='clip_to_100b',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/clip_to_100b',
            job_name='clip_to_100b'),
        input=bbnorm,
        # filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        filter=ruffus.regex(r'.+/2125-06-11-1.fastq.gz'),
        output=[r'output/trunc_100/2125-06-11-1_R1.fastq.gz',
                r'output/trunc_100/2125-06-11-1_R2.fastq.gz'])

    # print raw and normalised kmer distribution plots
    main_pipeline.merge(
        name='kmer_distribution_plots',
        task_func=tompltools.generate_job_function(
            job_script='src/r/kmer_distribution_plots.R',
            job_name='kmer_distribution_plots'),
        input=bbnorm,
        output=['output/bbnorm/plots.pdf', 'output/bbnorm/plot_data.Rds'])

    # run fastqc on decontaminated and normalised libraries
    main_pipeline.transform(
        name='fastqc',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/fastqc',
            job_name='fastqc',
            cpus_per_task=1),
        input=bbnorm,
        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        output=[r'output/fastqc/{LN[0]}{VL[0]}_fastqc.html'])

    # overlap step with edena
    # edena_overlaps = main_pipeline.collate(
    #     name='edena_overlaps',
    #     task_func=tompltools.generate_job_function(
    #         job_script='src/sh/edena_overlaps',
    #         job_name='edena_overlaps'),
    #     input=clip_to_100b,
    #     filter=ruffus.formatter(r'.+/(?P<LN>[^_]+)_R\d.fastq.gz'),
    #     output=[r'output/edena/{LN[0]}.ovc'])

    # prepare files with velveth
    # set threads for velvet to 1 !!!
    min_kmer = 71
    max_kmer = 87
    step = 8
    kmer_lengths = [x for x in range(min_kmer, max_kmer + 1, step)]
    velveth_output = list(
        tompytools.flatten_list(
            [('output/velveth_' + str(x) + '/Sequences')
             for x in kmer_lengths]))
    # velveth = main_pipeline.merge(
    #     name='hash_files',
    #     task_func=test_job_function,
    #     # task_func=tompltools.generate_job_function(
    #     #     job_script='src/sh/hash_files',
    #     #     job_name='hash_files'),
    #     input=bbnorm,
    #     output=velveth_output)

    ###################
    # RUFFUS COMMANDS #
    ###################

    # print the flowchart
    ruffus.pipeline_printout_graph(
        "ruffus/flowchart.pdf", "pdf",
        pipeline_name="ASW genome assembly pipeline")

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)
@transform(task1, suffix(".1"), ".2")
def task2(infiles, outfiles, *extra_params):
    """
    Second task
    """
    with open(tempdir + "jobs.start", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))
    test_job_io(infiles, outfiles, extra_params)
    with open(tempdir + "jobs.finish", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))


#
#    task3
#
@transform(task2, regex('(.*).2'), inputs([r"\1.2", tempdir + "a.1"]), r'\1.3')
@posttask(lambda: do_write(test_file, "Task 3 Done\n"))
def task3(infiles, outfiles, *extra_params):
    """
    Third task
    """
    with open(tempdir + "jobs.start", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))
    test_job_io(infiles, outfiles, extra_params)
    with open(tempdir + "jobs.finish", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))


#
#    task4
#
    DATADIR = "data.dir"
else:
    DATADIR = PARAMS['data']

# --------------------------------------
FASTQ_SUFFIXES = ("*.fastq.1.gz",
                  "*.fastq.2.gz",
                  "*.fastq.gz")
FASTQ_DIR = PARAMS['fastq_dir']

# set to value for testing purposes (see regexes below)
if FASTQ_DIR == "?!":
    FASTQ_DIR = ""

FASTQ_FILES = tuple([os.path.join(FASTQ_DIR, suffix_name)
                     for suffix_name in FASTQ_SUFFIXES])
FASTQ_REGEX = regex(os.path.join(FASTQ_DIR, r"(\S+).fastq.1.gz"))
FASTQ_PAIR = os.path.join(FASTQ_DIR, r"\1.fastq.2.gz")
SE_REGEX = regex(os.path.join(FASTQ_DIR, r"(\S+).fastq.gz"))

GENESETS = [y for y in glob.glob(os.path.join("reference.dir/*.gtf.gz"))]


@follows(mkdir("transcripts.dir"))
@transform("%s" % PARAMS['annotations_geneset_gtf'],
           regex("reference.dir/(.+).gtf.gz"),
           r"transcripts.dir/\1.fa")
def makeRepTranscripts(infile, outfile):
    '''
    make a single representative transcript for each gene - put
    into a multi-fasta file
    '''
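# Illustration only (hypothetical sample name): FASTQ_REGEX and FASTQ_PAIR are
# meant to pair each read-1 file with its read-2 mate; the raw pattern below
# mirrors FASTQ_REGEX with FASTQ_DIR left empty.
import re
print(re.sub(r"(\S+).fastq.1.gz", r"\1.fastq.2.gz", "sampleA.fastq.1.gz"))
# prints: sampleA.fastq.2.gz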
def build_pipeline(options, work_folder, log, context):
    main_pipeline = Pipeline.pipelines['main']

    # Triage
    task_triage = main_pipeline.transform(
        task_func=triage,
        input=os.path.join(work_folder, 'origin'),
        filter=formatter('(?i)'),
        output=os.path.join(work_folder, 'origin.pdf'),
        extras=[log, context])

    task_repair_and_parse_pdf = main_pipeline.transform(
        task_func=repair_and_parse_pdf,
        input=task_triage,
        filter=suffix('.pdf'),
        output='.repaired.pdf',
        output_dir=work_folder,
        extras=[log, context])

    # Split (kwargs for split seems to be broken, so pass plain args)
    task_marker_pages = main_pipeline.split(
        marker_pages,
        task_repair_and_parse_pdf,
        os.path.join(work_folder, '*.marker.pdf'),
        extras=[log, context])

    task_ocr_or_skip = main_pipeline.split(
        ocr_or_skip,
        task_marker_pages,
        [os.path.join(work_folder, '*.ocr.page.pdf'),
         os.path.join(work_folder, '*.skip.page.pdf')],
        extras=[log, context])

    # Rasterize preview
    task_rasterize_preview = main_pipeline.transform(
        task_func=rasterize_preview,
        input=task_ocr_or_skip,
        filter=suffix('.page.pdf'),
        output='.preview.jpg',
        output_dir=work_folder,
        extras=[log, context])
    task_rasterize_preview.active_if(options.rotate_pages)

    # Orient
    task_orient_page = main_pipeline.collate(
        task_func=orient_page,
        input=[task_ocr_or_skip, task_rasterize_preview],
        filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
        output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
        extras=[log, context])

    # Rasterize actual
    task_rasterize_with_ghostscript = main_pipeline.transform(
        task_func=rasterize_with_ghostscript,
        input=task_orient_page,
        filter=suffix('.ocr.oriented.pdf'),
        output='.page.png',
        output_dir=work_folder,
        extras=[log, context])

    # Preprocessing subpipeline
    task_preprocess_remove_background = main_pipeline.transform(
        task_func=preprocess_remove_background,
        input=task_rasterize_with_ghostscript,
        filter=suffix(".page.png"),
        output=".pp-background.png",
        extras=[log, context])

    task_preprocess_deskew = main_pipeline.transform(
        task_func=preprocess_deskew,
        input=task_preprocess_remove_background,
        filter=suffix(".pp-background.png"),
        output=".pp-deskew.png",
        extras=[log, context])

    task_preprocess_clean = main_pipeline.transform(
        task_func=preprocess_clean,
        input=task_preprocess_deskew,
        filter=suffix(".pp-deskew.png"),
        output=".pp-clean.png",
        extras=[log, context])

    task_select_ocr_image = main_pipeline.collate(
        task_func=select_ocr_image,
        input=[task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r"\1.ocr.png"),
        extras=[log, context])

    # HOCR OCR
    task_ocr_tesseract_hocr = main_pipeline.transform(
        task_func=ocr_tesseract_hocr,
        input=task_select_ocr_image,
        filter=suffix(".ocr.png"),
        output=[".hocr", ".txt"],
        extras=[log, context])
    task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
    task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')

    task_select_visible_page_image = main_pipeline.collate(
        task_func=select_visible_page_image,
        input=[task_rasterize_with_ghostscript,
               task_preprocess_remove_background,
               task_preprocess_deskew,
               task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r'\1.image'),
        extras=[log, context])
    task_select_visible_page_image.graphviz(shape='diamond')

    task_select_image_layer = main_pipeline.collate(
        task_func=select_image_layer,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.image-layer.pdf'),
        extras=[log, context])
    task_select_image_layer.graphviz(fillcolor='"#00cc66"', shape='diamond')

    task_render_hocr_page = main_pipeline.transform(
        task_func=render_hocr_page,
        input=task_ocr_tesseract_hocr,
        filter=regex(r".*/(\d{6})(?:\.hocr)"),
        output=os.path.join(work_folder, r'\1.text.pdf'),
        extras=[log, context])
    task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')

    # Tesseract OCR + text only PDF
    task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_textonly_pdf,
        input=[task_select_ocr_image],
        filter=regex(r".*/(\d{6})(?:\.ocr.png)"),
        output=[os.path.join(work_folder, r'\1.text.pdf'),
                os.path.join(work_folder, r'\1.text.txt')],
        extras=[log, context])
    task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
    task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich')

    task_weave_layers = main_pipeline.collate(
        task_func=weave_layers,
        input=[task_repair_and_parse_pdf,
               task_render_hocr_page,
               task_ocr_tesseract_textonly_pdf,
               task_select_image_layer],
        filter=regex(
            r".*/((?:\d{6}(?:\.text\.pdf|\.image-layer\.pdf))|(?:origin\.repaired\.pdf))"),
        output=os.path.join(work_folder, r'layers.rendered.pdf'),
        extras=[log, context])
    task_weave_layers.graphviz(fillcolor='"#00cc66"')

    # PDF/A pdfmark
    task_generate_postscript_stub = main_pipeline.transform(
        task_func=generate_postscript_stub,
        input=task_repair_and_parse_pdf,
        filter=formatter(r'\.repaired\.pdf'),
        output=os.path.join(work_folder, 'pdfa.ps'),
        extras=[log, context])
    task_generate_postscript_stub.active_if(options.output_type.startswith('pdfa'))

    # PDF/A conversion
    task_convert_to_pdfa = main_pipeline.merge(
        task_func=convert_to_pdfa,
        input=[task_generate_postscript_stub, task_weave_layers],
        output=os.path.join(work_folder, 'pdfa.pdf'),
        extras=[log, context])
    task_convert_to_pdfa.active_if(options.output_type.startswith('pdfa'))

    task_metadata_fixup = main_pipeline.merge(
        task_func=metadata_fixup,
        input=[task_repair_and_parse_pdf, task_weave_layers, task_convert_to_pdfa],
        output=os.path.join(work_folder, 'metafix.pdf'),
        extras=[log, context])

    task_merge_sidecars = main_pipeline.merge(
        task_func=merge_sidecars,
        input=[task_ocr_tesseract_hocr, task_ocr_tesseract_textonly_pdf],
        output=options.sidecar,
        extras=[log, context])
    task_merge_sidecars.active_if(options.sidecar)

    # Optimize
    task_optimize_pdf = main_pipeline.transform(
        task_func=optimize_pdf,
        input=task_metadata_fixup,
        filter=suffix('.pdf'),
        output='.optimized.pdf',
        output_dir=work_folder,
        extras=[log, context])

    # Finalize
    main_pipeline.merge(
        task_func=copy_final,
        input=[task_optimize_pdf],
        output=options.output_file,
        extras=[log, context])
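# --- Hedged illustration (not part of the pipeline above) ---------------------
# How the per-page collate filters above group intermediates: ruffus applies
# the regex to every input path and bundles together all inputs whose
# substituted output strings are identical.  The paths below are made up.
import re

pattern = re.compile(r".*/(\d{6})(?:\.page|\.pp-.*)\.png")
paths = [
    "/tmp/work/000001.page.png",
    "/tmp/work/000001.pp-deskew.png",
    "/tmp/work/000002.page.png",
]
for path in paths:
    # Both variants of page 000001 expand to the same output name, so they
    # arrive at select_visible_page_image together as a single job.
    print(path, "->", pattern.match(path).expand(r"/tmp/work/\1.image"))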
import re

from ruffus import (transform, follows, collate, files, split, merge,
                    add_inputs, regex, suffix, mkdir, jobs_limit, output_from)
from ruffus.task import active_if

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                               main_mutex as log_mtx)
from hts_waterworks.bootstrap import cfg, get_chrom_sizes, genome_path
import hts_waterworks.mapping as mapping
import hts_waterworks.clip_seq as clip_seq
from hts_waterworks.utils.common import (bedCommentFilter, readBedLines,
                                         parse_ucsc_range)


@active_if(cfg.getboolean('peaks', 'run_macs'))
@collate(mapping.all_mappers_output,
         regex(r'(.+)\.treat(.*)\.mapped_reads'),
         add_inputs(r'\1.control\2.mapped_reads'),
         r'\1.treat\2.macs.peaks',
         cfg.getfloat('peaks', 'max_FDR'))
def run_macs(in_files, out_peaks, max_fdr):
    """Call peaks with MACS (v1.3). Apply a maximum FDR threshold and treat
    centers as peak summits
    """
    in_treat, in_control = in_files[0]
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs.peaks'
    max_fdr = cfg.getfloat('peaks', 'max_FDR')
    cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name,
                                             cfg.get('peaks', 'macs_params'))
    sys_call(cmd)
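# --- Hedged sketch of the treat/control pairing above (plain re, no ruffus) ---
# The collate regex captures the sample prefix from the ".treat" file, and
# add_inputs() derives the matching ".control" file from the same match.  The
# example file name is invented.
import re

treat = "sampleA.treat.bowtie.mapped_reads"
match = re.match(r"(.+)\.treat(.*)\.mapped_reads", treat)
print(match.expand(r"\1.control\2.mapped_reads"))   # sampleA.control.bowtie.mapped_reads
print(match.expand(r"\1.treat\2.macs.peaks"))       # sampleA.treat.bowtie.macs.peaks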
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@transform(mapping.all_mappers_output, suffix('.mapped_reads'),
           '.overlap.mapped_reads', cfg.getint('PAS-Seq', 'min_read_count'))
def remove_nonoverlapping_reads(in_bed, out_bed, min_read_count):
    """Remove mapped reads that don't overlap with at least *min_read_count*
    reads
    """
    cmd = "intersectBed -wa -c -a %s -b %s | awk '$(NF) >= %s' |" \
          r"cut -f 1,2,3,4,5,6 > %s" % (in_bed, in_bed, min_read_count + 1,
                                        out_bed)
    sys_call(cmd, file_log=False)


@active_if(cfg.getboolean('PAS-Seq', 'merge_adjacent_reads'))
#@split(mapping.all_mappers_output, regex('(.*).mapped_reads$'),
@split(remove_nonoverlapping_reads, regex('(.*).mapped_reads$'),
       [r'\1.merged.mapped_reads', r'\1.merged.pileup_reads'],
       cfg.getint('PAS-Seq', 'merge_window_width'),
       cfg.getint('PAS-Seq', 'merge_num_iterations'),
       r'\1.merged.mapped_reads', r'\1.merged.pileup_reads',
       cfg.getint('PAS-Seq', 'min_read_count'))
def merge_adjacent_reads(in_bed, out_pattern, window_width, iterations,
                         out_merged, out_pileup, min_read_count):
    """Reassign read ends to a weighted average of adjacent reads"""
    # helper functions for parsing bed files
    filter_lines = lambda l: l.strip() and (not l.startswith('#') or
                                            l.startswith('"'))
    read_bed_lines = lambda infile: itertools.ifilter(filter_lines, infile)
    # sort the input by chrom, stop
    tmpfile = in_bed + '.merged_adjacent_sorted'
#
# 1. pipeline_genesets: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(P.peek_parameters(
    PARAMS["annotations_dir"],
    "pipeline_genesets.py",
    "genesets",
    on_error_raise=__name__ == "__main__",
    prefix="annotations_",
    update_interface=True))


# ---------------------------------------------------
# Specific pipeline tasks
@transform(("pipeline.yml", ),
           regex(r"(.*)\.(.*)"),
           r"\1.counts")
def count_words(infile, outfile):
    '''count the number of words in the pipeline configuration files.'''

    # the command line statement we want to execute
    statement = '''awk 'BEGIN { printf("word\\tfreq\\n"); }
    {for (i = 1; i <= NF; i++) freq[$i]++}
    END { for (word in freq) printf "%%s\\t%%d\\n", word, freq[word] }'
    < %(infile)s > %(outfile)s'''

    # execute the command in the variable statement.
    #
    # The command will be sent to the cluster. The statement will be
    # interpolated with any options that are defined in the
    # configuration files or variables that are declared in the calling
    # function. For example, %(infile)s will be substituted with the
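# --- Hedged, pure-Python equivalent of the awk statement above ----------------
# Only to show what count_words produces (a two-column word/frequency table);
# the real task hands the awk command to the cluster via P.run().
import collections


def count_words_py(infile, outfile):
    counts = collections.Counter()
    with open(infile) as handle:
        for line in handle:
            counts.update(line.split())
    with open(outfile, "w") as out:
        out.write("word\tfreq\n")
        for word, freq in counts.items():
            out.write("%s\t%d\n" % (word, freq))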
    chrom, start, stop = fields[:3]
    strand = fields[5] if len(fields) >= 6 else '+'
    # +:RED, -:GREEN
    color = '255,0,0' if strand == '+' else '0,255,0'
    outfile.write('\t'.join(fields + [start, stop, color]) + '\n')


@transform(bed_color_strand, suffix(''), '.bigbed')
def bed_to_bigbed(in_bed, out_bigbed):
    """Convert a BED file to .bigbed for viewing on UCSC browser"""
    cmd = 'bedToBigBed %s %s.chrom.sizes %s' % (in_bed, genome_path(),
                                                out_bigbed)
    sys_call(cmd)


@transform([bed_uniquefy, clip_and_sort_peaks] + mapping.all_mappers_output,
           regex('(.*mapped_reads).clipped.sorted(.unique|)'),
           #suffix('.mapped_reads'),
           add_inputs(bootstrap.get_chrom_sizes),
           r'\1\2.bedgraph')
           #r'.bedgraph')
def bed_to_bedgraph(in_files, out_bedgraph):
    'extend reads to the full fragment length and create a bedgraph from them'
    in_bed, in_chrom_sizes = in_files
    cmd = ('slopBed -i %s -s -r %s -l 0 -g %s | ' +
           'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s') % (
               in_bed,
               cfg.getint('DEFAULT', 'fragment_size') -
               cfg.getint('DEFAULT', 'tag_size'),
               in_chrom_sizes,
               cfg.get('DEFAULT', 'genome'),
               genome_path(),
               out_bedgraph)
    sys_call(cmd)
#
#    task1
#
@files(None, tempdir + 'a.1')
def task1(infiles, outfiles, *extra_params):
    """
    First task
    """
    test_job_io(infiles, outfiles, extra_params)


#
#    task2
#
@transform(task1, regex(r".*"), tempdir + 'b.1')
def task2(infiles, outfiles, *extra_params):
    """
    Second task
    """
    test_job_io(infiles, outfiles, extra_params)
    assert infiles == tempdir + "a.1"


#
#    task3
#
@files(task2, tempdir + 'c.1')
def task3(infiles, outfiles, *extra_params):
    """
# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

@mkdir(tempdir)
@originate(original_files)
def generate_initial_files(out_name):
    with open(out_name, 'w') as outfile:
        pass


#
#    split_fasta_file
#
@posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))
@subdivide(generate_initial_files,
           # match original files
           regex(r".*\/original_(\d+).fa"),
           [tempdir + r"/files.split.\1.success",  # flag file for each original file
            tempdir + r"/files.split.\1.*.fa"],    # glob pattern
           r"\1")                                  # index of original file
def split_fasta_file(input_file, outputs, original_index):
    #
    # remove previous fasta files
    #
    success_flag = outputs[0]
    output_file_names = outputs[1:]
    for f in output_file_names:
        os.unlink(f)

    #
    # create as many files as we are simulating in JOBS_PER_TASK
        output_file, log, pdfinfo, pdfinfo_lock):
    ghostscript.rasterize_pdf(
        input_file=input_file,
        output_file=output_file,
        xres=200,
        yres=200,
        raster_device='jpeggray',
        log=log)


@collate(
    input=[split_pages, rasterize_preview],
    filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
    output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def orient_page(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    page_pdf = next(ii for ii in infiles if ii.endswith('.page.pdf'))

    if not options.rotate_pages:
        re_symlink(page_pdf, output_file)
        return

    preview = next(ii for ii in infiles if ii.endswith('.preview.jpg'))
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the FASTQ files
    #fastq_files = state.config.get_option('fastqs')
    fastq_files = glob.glob("fastqs/*.gz")
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('processed_fastqs')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/pass_samples')
    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/vardict')

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    pipeline.transform(
        task_func=stages.run_surecalltrimmer,
        name='run_surecalltrimmer',
        input=output_from('original_fastqs'),
        filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        add_inputs=add_inputs('fastqs/{sample[0]}_R2.fastq.gz'),
        #filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.fastq.gz'),
        #add_inputs=add_inputs('fastqs/{sample[0]}_R3_001.fastq.gz'),
        extras=['{sample[0]}'],
        # output only needs to know about one file to track progress of the
        # pipeline, but the second certainly exists after this step.
        output='processed_fastqs/{sample[0]}_R1.processed.fastq.gz')
        #output='processed_fastqs/{sample[0]}_R1_001.processed.fastq.gz')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('run_surecalltrimmer'),
        filter=formatter(
            'processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.processed.fastq.gz'),
        add_inputs=add_inputs(
            'processed_fastqs/{sample[0]}_R2.processed.fastq.gz'),
        #filter=formatter('processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.processed.fastq.gz'),
        #add_inputs=add_inputs('processed_fastqs/{sample[0]}_R3_001.processed.fastq.gz'),
        extras=['{sample[0]}'],
        output='alignments/{sample[0]}.bam')

    # Run locatit from agilent. This should produce sorted bam files, so no
    # sorting is needed at the next step.
    pipeline.collate(task_func=stages.run_locatit,
                     name='run_locatit',
                     input=output_from('align_bwa', 'original_fastqs'),
                     filter=regex(r'.+/(.+_L\d\d\d).+'),
                     output=r'alignments/\1.locatit.bam')

    pipeline.transform(task_func=stages.sort_bam,
                       name='sort_bam',
                       input=output_from('run_locatit'),
                       filter=suffix('.locatit.bam'),
                       output='.sorted.locatit.bam')

    # # # # # Metrics stages # # # # #
    # generate mapping metrics (post locatit)
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    # Intersect the bam file with the region of interest
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    # Calculate coverage metrics from the intersected bam file
    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    # Count the number of mapped reads
    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    # Count the number of on-target reads
    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    # Count the number of total reads
    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    # Generate summary metrics from the stats files produced
    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        #filter=regex(r'.+/(.+BS\d{4,6}.+S\d+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])
    # # # # # Metrics stages end # # # # #

    # # # # # Checking metrics and calling # # # # #
    # Originate to set the location of the metrics summary file
    (pipeline.originate(
        task_func=stages.grab_summary_file,
        name='grab_summary_file',
        output='all_sample.summary.txt').follows('generate_stats'))

    # Awk command to produce a list of bam files passing filters
    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt')

    # Touch passed bams to the pass_samples folder and pass the glob of that
    # folder to HaplotypeCaller
    pipeline.subdivide(name='passed_filter_files',
                       task_func=stages.read_samples,
                       input=output_from('filter_stats'),
                       filter=formatter(),
                       output="metrics/pass_samples/*.bam")

    # Call variants using GATK
    (pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/gatk/{sample[0]}.g.vcf').follows('sort_bam'))

    # Call variants with vardict
    (pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}']).follows('sort_bam'))

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    return pipeline
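# --- Hedged, self-contained miniature of the object-oriented pattern above ----
# originate -> transform -> collate wired up by task name with output_from(),
# then run.  The stage functions and file names are placeholders, not
# haloplexpipe code.
import os
import tempfile

from ruffus import Pipeline, output_from, regex, suffix

workdir = tempfile.mkdtemp()
fastqs = [os.path.join(workdir, name)
          for name in ("s1_L001.fastq", "s2_L001.fastq")]


def write_stub(output_file):
    open(output_file, "w").close()


def fastq_to_bam(input_file, output_file):
    open(output_file, "w").close()


def merge_by_lane(input_files, output_file):
    with open(output_file, "w") as out:
        out.write("\n".join(input_files))


demo = Pipeline(name="demo")
demo.originate(task_func=write_stub, name="write_stub", output=fastqs)
demo.transform(task_func=fastq_to_bam,
               name="fastq_to_bam",
               input=output_from("write_stub"),
               filter=suffix(".fastq"),
               output=".bam")
# Both lane L001 BAMs share one substituted output name, so they are merged
# in a single collate job.
demo.collate(task_func=merge_by_lane,
             name="merge_by_lane",
             input=output_from("fastq_to_bam"),
             filter=regex(r".+_(L\d\d\d)\.bam"),
             output=os.path.join(workdir, r"\1.merged.txt"))

if __name__ == "__main__":
    demo.run(verbose=1)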
# ___________________________________________________________________________
def check_regex_out_of_range_regex_reference_error_task(infiles, outfile,
                                                        prefix1, prefix2,
                                                        extension):
    raise Exception("Should blow up first")


test_pipeline = Pipeline("test")
test_pipeline.originate(task_func=generate_initial_files1,
                        output=[tempdir + "/" + prefix + "_name.tmp1"
                                for prefix in "abcdefghi"])

test_pipeline.transform(task_func=check_regex_task,
                        input=generate_initial_files1,
                        filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.tmp1)"),
                        output=r"\1/\g<PREFIX>\3.tmp2",   # output file
                        extras=[r"\2",                    # extra: prefix = \2
                                r"\g<PREFIX>",            # extra: prefix = \2
                                r"\4"])                   # extra: extension

test_pipeline.transform(task_func=check_regex_unmatched_task,
                        input=generate_initial_files1,
                        filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.xxx)"),
                        output=r"\1/\g<PREFIXA>\3.tmp2",  # output file
                        extras=[r"\2",                    # extra: prefix = \2
                                r"\g<PREFIX>",            # extra: prefix = \2
                                r"\4"])                   # extra: extension

test_pipeline.transform(task_func=check_suffix_task,
                        input=generate_initial_files1,
                        filter=suffix(".tmp1"),
import hts_waterworks.utils.get_bed_sequence as get_bed_sequence
import hts_waterworks.utils.sequence_motif as sequence_motif
import hts_waterworks.utils.sampling as sampling
import hts_waterworks.utils.motif_significance as motif_significance
from hts_waterworks.bootstrap import cfg, get_genome, genome_path
import hts_waterworks.call_peaks as call_peaks
import hts_waterworks.annotation as annotation
#from ipdb import set_trace as breakpoint


# motif setup

@transform(call_peaks.all_peak_caller_functions +
           ['*.peaks_summits.%s_around' % cfg.get('peaks', 'peak_summit_size')],
           regex(r'(.*\.peaks$|.*\..*_around$|_genes.promoter.*_ext[\d]+$)'),
           r'\1.top%s.peaks' % cfg.getint('motifs', 'motif_chunk_size'),
           cfg.getint('motifs', 'motif_chunk_size'))
def get_top_peaks(in_peaks, out_subset, num_peaks_to_keep):
    """keep only the top peaks as input to motif discovery"""
    with open(in_peaks) as infile:
        seqs = list(readBedLines(infile, dataOnly=False))
    # sort by score, highest first
    seqs.sort(key=lambda x: int(x[4]), reverse=True)
    with open(out_subset, 'w') as outfile:
        subset = seqs[:num_peaks_to_keep]
        outfile.writelines('\t'.join(map(str, s)) + '\n' for s in subset)


#@follows(get_genome)
@transform([get_top_peaks], suffix(''), '.fasta')
def get_peak_sequence(in_peaks, out_fasta):
    # with a space. Don't know if Tesseract 3.02 does the same.
    regex_nested_single_quotes = re.compile(
        r"""title='image "([^"]*)";""")
    with open(badxml, mode='r', encoding='utf-8') as f_in, \
            open(output_file, mode='w', encoding='utf-8') as f_out:
        for line in f_in:
            line = regex_nested_single_quotes.sub(
                r"""title='image " ";""", line)
            f_out.write(line)


@active_if(options.pdf_renderer == 'hocr')
@collate(
    input=[rasterize_with_ghostscript, preprocess_deskew, preprocess_clean],
    filter=regex(r".*/(\d{6})(?:\.page|\.pp-deskew|\.pp-clean)\.png"),
    output=os.path.join(work_folder, r'\1.image'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def select_image_for_pdf(infiles, output_file, log, pdfinfo, pdfinfo_lock):
    if options.clean_final:
        image_suffix = '.pp-clean.png'
    elif options.deskew:
        image_suffix = '.pp-deskew.png'
    else:
        image_suffix = '.page.png'
    image = next(ii for ii in infiles if ii.endswith(image_suffix))
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

import unittest
import json

#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks

#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

try:
    @transform(None, regex(tempdir + "b"),
               inputs(tempdir + "a", tempdir + "b"),
               "task_1.output")
    def task_1(i, o):
        for f in o:
            open(f, 'w')
except ruffus.ruffus_exceptions.error_task_transform_inputs_multiple_args:
    print("\tExpected exception thrown 1")
except ruffus.ruffus_exceptions.error_inputs_multiple_args:
    print("\tExpected exception thrown 2")


def task_2(i, o):
    for f in o:
        open(f, 'w')


class Test_task_mkdir(unittest.TestCase):
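# --- Hedged sketch of the correct form of inputs() ----------------------------
# The try/except above deliberately passes two separate arguments to inputs(),
# which ruffus rejects.  inputs() takes exactly one argument describing the
# replacement input for each job.  The paths and task names here are
# hypothetical.
import os
import tempfile

from ruffus import originate, transform, regex, inputs, pipeline_run

demo_dir = tempfile.mkdtemp()


@originate([os.path.join(demo_dir, "a"), os.path.join(demo_dir, "b")])
def make_files(output_file):
    open(output_file, "w").close()


@transform(make_files,
           regex(r".*a$"),                        # only the job for file "a" matches
           inputs(os.path.join(demo_dir, "b")),   # ...and its input is replaced by "b"
           os.path.join(demo_dir, "task_1.output"))
def replaced_input_task(input_file, output_file):
    with open(output_file, "w") as out:
        out.write(input_file)


if __name__ == "__main__":
    pipeline_run(verbose=1)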
    filter=suffix('.page.pdf'),
    output='.preview.jpg',
    output_dir=work_folder,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def rasterize_preview(input_file, output_file, log, pdfinfo, pdfinfo_lock):
    ghostscript.rasterize_pdf(
        input_file=input_file,
        output_file=output_file,
        xres=200,
        yres=200,
        raster_device='jpeggray',
        log=log)


@collate(
    input=[split_pages, rasterize_preview],
    filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
    output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def orient_page(infiles, output_file, log, pdfinfo, pdfinfo_lock):
    page_pdf = next(ii for ii in infiles if ii.endswith('.page.pdf'))

    if not options.rotate_pages:
        re_symlink(page_pdf, output_file)
        return

    preview = next(ii for ii in infiles if ii.endswith('.preview.jpg'))

    orient_conf = tesseract.get_orientation(
        preview,
        language=options.language,
        timeout=options.tesseract_timeout,
        log=log)
def checkFileExistence(infile, outfile):
    '''check whether file exists.

    Files are uncompressed before checking existence.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(
        infile,
        outfile,
        metric="file",
        suffixes=P.as_list(P.as_list(PARAMS.get('%s_regex_exist' % track, ""))))


@collate((buildCheckSums, buildLineCounts, checkFileExistence),
         regex("([^.]*).(.*)"),
         r"\1.stats")
def mergeFileStatistics(infiles, outfile):
    '''merge all file statistics.'''

    to_cluster = False
    infiles = " ".join(sorted(infiles))

    statement = '''
    %(scriptsdir)s/merge_testing_output.sh
    %(infiles)s
    > %(outfile)s'''
    P.run(statement)


@merge(mergeFileStatistics, "md5_compare.tsv")
def compareCheckSums(infiles, outfile):
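# --- Hedged illustration of the grouping performed by mergeFileStatistics -----
# Everything before the first "." is the track name, so the per-track outputs
# of buildCheckSums, buildLineCounts and checkFileExistence all collapse onto
# one "<track>.stats" target.  The file names below are invented.
import re

for name in ("track_abc.md5", "track_abc.lines", "track_abc.exist"):
    print(name, "->", re.match(r"([^.]*).(.*)", name).expand(r"\1.stats"))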
def build_pipeline(options, work_folder, log, context):
    main_pipeline = Pipeline.pipelines['main']

    # Triage
    task_triage = main_pipeline.transform(
        task_func=triage,
        input=os.path.join(work_folder, 'origin'),
        filter=formatter('(?i)'),
        output=os.path.join(work_folder, 'origin.pdf'),
        extras=[log, context])

    task_repair_pdf = main_pipeline.transform(
        task_func=repair_pdf,
        input=task_triage,
        filter=suffix('.pdf'),
        output='.repaired.pdf',
        output_dir=work_folder,
        extras=[log, context])

    # Split (kwargs for split seems to be broken, so pass plain args)
    task_split_pages = main_pipeline.split(
        split_pages,
        task_repair_pdf,
        os.path.join(work_folder, '*.page.pdf'),
        extras=[log, context])

    # Rasterize preview
    task_rasterize_preview = main_pipeline.transform(
        task_func=rasterize_preview,
        input=task_split_pages,
        filter=suffix('.page.pdf'),
        output='.preview.jpg',
        output_dir=work_folder,
        extras=[log, context])
    task_rasterize_preview.active_if(options.rotate_pages)

    # Orient
    task_orient_page = main_pipeline.collate(
        task_func=orient_page,
        input=[task_split_pages, task_rasterize_preview],
        filter=regex(
            r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
        output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
        extras=[log, context])

    # Rasterize actual
    task_rasterize_with_ghostscript = main_pipeline.transform(
        task_func=rasterize_with_ghostscript,
        input=task_orient_page,
        filter=suffix('.ocr.oriented.pdf'),
        output='.page.png',
        output_dir=work_folder,
        extras=[log, context])

    # Preprocessing subpipeline
    task_preprocess_remove_background = main_pipeline.transform(
        task_func=preprocess_remove_background,
        input=task_rasterize_with_ghostscript,
        filter=suffix(".page.png"),
        output=".pp-background.png",
        extras=[log, context])

    task_preprocess_deskew = main_pipeline.transform(
        task_func=preprocess_deskew,
        input=task_preprocess_remove_background,
        filter=suffix(".pp-background.png"),
        output=".pp-deskew.png",
        extras=[log, context])

    task_preprocess_clean = main_pipeline.transform(
        task_func=preprocess_clean,
        input=task_preprocess_deskew,
        filter=suffix(".pp-deskew.png"),
        output=".pp-clean.png",
        extras=[log, context])

    task_select_ocr_image = main_pipeline.collate(
        task_func=select_ocr_image,
        input=[task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r"\1.ocr.png"),
        extras=[log, context])

    # HOCR OCR
    task_ocr_tesseract_hocr = main_pipeline.transform(
        task_func=ocr_tesseract_hocr,
        input=task_select_ocr_image,
        filter=suffix(".ocr.png"),
        output=".hocr",
        extras=[log, context])
    task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
    task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')
    if tesseract.v4():
        task_ocr_tesseract_hocr.jobs_limit(2)  # Uses multi-core on its own

    task_select_visible_page_image = main_pipeline.collate(
        task_func=select_visible_page_image,
        input=[task_rasterize_with_ghostscript,
               task_preprocess_remove_background,
               task_preprocess_deskew,
               task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r'\1.image'),
        extras=[log, context])
    task_select_visible_page_image.graphviz(shape='diamond')

    task_select_image_layer = main_pipeline.collate(
        task_func=select_image_layer,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.image-layer.pdf'),
        extras=[log, context])
    task_select_image_layer.graphviz(fillcolor='"#00cc66"', shape='diamond')
    task_select_image_layer.active_if(
        options.pdf_renderer == 'hocr' or options.pdf_renderer == 'tess4')

    task_render_hocr_page = main_pipeline.transform(
        task_func=render_hocr_page,
        input=task_ocr_tesseract_hocr,
        filter=suffix('.hocr'),
        output='.text.pdf',
        extras=[log, context])
    task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')

    task_render_hocr_debug_page = main_pipeline.collate(
        task_func=render_hocr_debug_page,
        input=[task_select_visible_page_image, task_ocr_tesseract_hocr],
        filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
        output=os.path.join(work_folder, r'\1.debug.pdf'),
        extras=[log, context])
    task_render_hocr_debug_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_debug_page.active_if(options.pdf_renderer == 'hocr')
    task_render_hocr_debug_page.active_if(options.debug_rendering)

    # Tesseract OCR + text only PDF
    task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_textonly_pdf,
        input=[task_select_ocr_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.ocr.png|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.text.pdf'),
        extras=[log, context])
    task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
    task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'tess4')
    if tesseract.v4():
        task_ocr_tesseract_textonly_pdf.jobs_limit(2)

    task_combine_layers = main_pipeline.collate(
        task_func=combine_layers,
        input=[task_render_hocr_page,
               task_ocr_tesseract_textonly_pdf,
               task_select_image_layer],
        filter=regex(r".*/(\d{6})(?:\.text\.pdf|\.image-layer\.pdf)"),
        output=os.path.join(work_folder, r'\1.rendered.pdf'),
        extras=[log, context])
    task_combine_layers.graphviz(fillcolor='"#00cc66"')
    task_combine_layers.active_if(options.pdf_renderer == 'hocr' or
                                  options.pdf_renderer == 'tess4')

    # Tesseract OCR+PDF
    task_ocr_tesseract_and_render_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_and_render_pdf,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.rendered.pdf'),
        extras=[log, context])
    task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"')
    task_ocr_tesseract_and_render_pdf.active_if(
        options.pdf_renderer == 'tesseract')
    if tesseract.v4():
        task_ocr_tesseract_and_render_pdf.jobs_limit(2)  # Uses multi-core

    # PDF/A
    task_generate_postscript_stub = main_pipeline.transform(
        task_func=generate_postscript_stub,
        input=task_repair_pdf,
        filter=formatter(r'\.repaired\.pdf'),
        output=os.path.join(work_folder, 'pdfa.ps'),
        extras=[log, context])
    task_generate_postscript_stub.active_if(options.output_type == 'pdfa')

    # Bypass valve
    task_skip_page = main_pipeline.transform(
        task_func=skip_page,
        input=task_orient_page,
        filter=suffix('.skip.oriented.pdf'),
        output='.done.pdf',
        output_dir=work_folder,
        extras=[log, context])

    # Merge pages
    task_merge_pages_ghostscript = main_pipeline.merge(
        task_func=merge_pages_ghostscript,
        input=[task_combine_layers,
               task_render_hocr_debug_page,
               task_skip_page,
               task_ocr_tesseract_and_render_pdf,
               task_generate_postscript_stub],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_ghostscript.active_if(options.output_type == 'pdfa')

    task_merge_pages_qpdf = main_pipeline.merge(
        task_func=merge_pages_qpdf,
        input=[task_combine_layers,
               task_render_hocr_debug_page,
               task_skip_page,
               task_ocr_tesseract_and_render_pdf,
               task_repair_pdf],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_qpdf.active_if(options.output_type == 'pdf')

    # Finalize
    task_copy_final = main_pipeline.merge(
        task_func=copy_final,
        input=[task_merge_pages_ghostscript, task_merge_pages_qpdf],
        output=options.output_file,
        extras=[log, context])
r"StrandSpec.dir/\1.strand") def strandSpecificity(infile, outfile): '''This function will determine the strand specificity of your library from the bam file''' statement = ( "cgat bam2libtype " "--max-iterations 10000 " "< {infile} " "> {outfile}".format(**locals())) return P.run(statement) @follows(mkdir("BamFiles.dir")) @transform("*.bam", regex("(.*).bam$"), r"BamFiles.dir/\1.bam") def intBam(infile, outfile): '''make an intermediate bam file if there is no sequence infomation. If there is no sequence quality then make a softlink. Picard tools has an issue when quality score infomation is missing''' if PARAMS["bam_sequence_stripped"] is True: bamstats.addPseudoSequenceQuality(infile, outfile) else: bamstats.copyBamFile(infile, outfile) @follows(mkdir("Picard_stats.dir"))