Example #1
def quick(ifold):

    # sorting bam file
    pipeline = ruffus.Pipeline('BamDNaseSeq')
    bam_file = '*.bam'
    sort_bam_regex = r'(.*)\/(.*).bam$'
    sort_bam_task = pipeline.collate(tasks.sort_bam,
                                     name='sorting_bam',
                                     input=os.path.join(ifold, bam_file),
                                     filter=ruffus.regex(sort_bam_regex),
                                     output=r'\1/\2.sorted.bam')
    ## bam to bed using bam2bed
    sorted_bam_file = '*.sorted.bam'
    sorted_bam_regex = r'(.*)\/(.*).sorted.bam$'
    sorted_bam_task = pipeline.collate(tasks.bam2bed,
                                       name='bam2bed',
                                       input=os.path.join(
                                           ifold, sorted_bam_file),
                                       filter=ruffus.regex(sorted_bam_regex),
                                       output=r'\1/\2.sorted.bed')
    sorted_bam_task.follows('sorting_bam')

    full_pipe = ruffus.Pipeline('Full pipeline', input=['bam2bed'])

    full_pipe.run()
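The two collate steps above group files with a regex filter and are chained explicitly with follows(); a minimal, self-contained sketch of the same collate-by-regex pattern (assuming only that ruffus is installed, with illustrative file names) is:

# Minimal collate-by-regex sketch (assumption: ruffus installed; paths are illustrative).
import ruffus

def make_part(output_file):
    with open(output_file, 'w') as part:
        part.write(output_file + '\n')

def combine_parts(input_files, output_file):
    with open(output_file, 'w') as out:
        for name in input_files:
            with open(name) as part:
                out.write(part.read())

demo = ruffus.Pipeline('collate_demo')
demo.originate(make_part, ['demo/sample1.part1.txt',
                           'demo/sample1.part2.txt',
                           'demo/sample2.part1.txt']).mkdir('demo')
# One combine_parts job per sample: regex group \1 decides which inputs are grouped together.
demo.collate(combine_parts,
             input=make_part,
             filter=ruffus.regex(r'demo/(.+)\.part\d+\.txt'),
             output=r'demo/\1.combined.txt')

if __name__ == '__main__':
    demo.run(verbose=0)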
    def test_newstyle_collate(self):
        """
        As above but create pipeline on the fly using object orientated syntax rather than decorators
        """

        #
        # Create pipeline on the fly, joining up tasks
        #
        test_pipeline = Pipeline("test")

        test_pipeline.originate(task_func   = generate_initial_files,
                                output      = original_files)\
            .mkdir(tempdir, tempdir+"/test")


        test_pipeline.subdivide(    task_func   = split_fasta_file,
                                    input       = generate_initial_files,
                                    filter      = regex(r".*\/original_(\d+).fa"),       # match original files
                                    output      = [tempdir + r"/files.split.\1.success", # flag file for each original file
                                                   tempdir + r"/files.split.\1.*.fa"],   # glob pattern
                                    extras      = [r"\1"])\
            .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))


        test_pipeline.transform(task_func   = align_sequences,
                                input       = split_fasta_file,
                                filter      = suffix(".fa"),
                                output      = ".aln")  \
            .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

        test_pipeline.transform(task_func   = percentage_identity,
                                input       = align_sequences,             # find all results from align_sequences
                                filter      = suffix(".aln"),             # replace suffix with:
                                output      = [r".pcid",                  #   .pcid suffix for the result
                                               r".pcid_success"]         #   .pcid_success to indicate job completed
                                )\
            .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))


        test_pipeline.collate(task_func   = combine_results,
                              input       = percentage_identity,
                              filter      = regex(r".*files.split\.(\d+)\.\d+.pcid"),
                              output      = [tempdir + r"/\1.all.combine_results",
                                             tempdir + r"/\1.all.combine_results_success"])\
            .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

        #
        # Cleanup, printout and run
        #
        self.cleanup_tmpdir()
        s = StringIO()
        test_pipeline.printout(s, [combine_results],
                               verbose=5,
                               wrap_width=10000)
        self.assertTrue(
            re.search('Job needs update:.*Missing files.*', s.getvalue(),
                      re.DOTALL) is not None)
        test_pipeline.run(verbose=0)
    def test_newstyle_collate(self):
        """
        As above but create pipeline on the fly using object orientated syntax rather than decorators
        """

        #
        # Create pipeline on the fly, joining up tasks
        #
        test_pipeline = Pipeline("test")

        test_pipeline.originate(task_func=generate_initial_files,
                                output=original_files)\
            .mkdir(tempdir, tempdir+"/test")

        test_pipeline.subdivide(task_func=split_fasta_file,
                                input=generate_initial_files,
                                # match original files
                                filter=regex(r".*\/original_(\d+).fa"),
                                output=[tempdir + r"/files.split.\1.success",  # flag file for each original file
                                        tempdir + r"/files.split.\1.*.fa"],   # glob pattern
                                extras=[r"\1"])\
            .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))

        test_pipeline.transform(task_func=align_sequences,
                                input=split_fasta_file,
                                filter=suffix(".fa"),
                                output=".aln")  \
            .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

        test_pipeline.transform(task_func=percentage_identity,
                                input=align_sequences,             # find all results from align_sequences
                                # replace suffix with:
                                filter=suffix(".aln"),
                                output=[r".pcid",  # .pcid suffix for the result
                                        r".pcid_success"]  # .pcid_success to indicate job completed
                                )\
            .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))

        test_pipeline.collate(task_func=combine_results,
                              input=percentage_identity,
                              filter=regex(r".*files.split\.(\d+)\.\d+.pcid"),
                              output=[tempdir + r"/\1.all.combine_results",
                                      tempdir + r"/\1.all.combine_results_success"])\
            .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

        #
        # Cleanup, printout and run
        #
        self.cleanup_tmpdir()
        s = StringIO()
        test_pipeline.printout(s, [combine_results],
                               verbose=5, wrap_width=10000)
        self.assertTrue(re.search(
            'Job needs update:.*Missing files.*', s.getvalue(), re.DOTALL) is not None)
        test_pipeline.run(verbose=0)
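Both copies of test_newstyle_collate check the pipeline with printout() before running it; a small self-contained sketch of that dry-run pattern (assuming only that ruffus is installed; file names are illustrative) is:

# Dry-run check sketch: Pipeline.printout() writes what *would* run to a stream,
# so the output can be inspected (or asserted on) before calling run().
import re
from io import StringIO
from ruffus import Pipeline, suffix

def make_input(output_file):
    open(output_file, 'w').close()

def make_output(input_file, output_file):
    open(output_file, 'w').close()

check_pipeline = Pipeline('printout_demo')
check_pipeline.originate(make_input, ['printout_demo.start'])
check_pipeline.transform(make_output, make_input, suffix('.start'), '.done')

stream = StringIO()
check_pipeline.printout(stream, [make_output], verbose=5, wrap_width=10000)
assert re.search('Job needs update', stream.getvalue()) is not None
check_pipeline.run(verbose=0)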
Example #4
    def test_newstyle_task(self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(task1, [[None, tempdir + "a.1"], [None, tempdir + "b.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.files(task2, [[None, tempdir + "c.1"], [None, tempdir + "d.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.transform(task_func=task3,
                                input=task1,
                                filter=regex(r"(.+)"),
                                replace_inputs=ruffus.inputs(
                                    ((r"\1"), task2,
                                     "test_transform_inputs.*y")),
                                output=r"\1.output")
        test_pipeline.merge(task4, (task3), tempdir + "final.output")

        test_pipeline.run([task4], multiprocess=10, verbose=0)

        correct_output = "{tempdir}a.1.output:test_transform_inputs.py,{tempdir}a.1,{tempdir}c.1,{tempdir}d.1;{tempdir}b.1.output:test_transform_inputs.py,{tempdir}b.1,{tempdir}c.1,{tempdir}d.1;".format(
            tempdir=tempdir)
        with open(tempdir + "final.output") as ff:
            real_output = ff.read()
        self.assertEqual(correct_output, real_output)
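The transform above swaps its matched input wholesale via replace_inputs=ruffus.inputs(...); the hedged sketch below contrasts that with add_inputs(...), which keeps the matched file and appends to it (assumption: ruffus installed; file names are illustrative).

# replace_inputs=inputs(...) discards the matched file; add_inputs=add_inputs(...) keeps it.
from ruffus import Pipeline, regex, inputs, add_inputs, output_from

def touch(output_file):
    open(output_file, 'w').close()

def record_inputs(input_files, output_file):
    with open(output_file, 'w') as out:
        out.write(','.join(sorted(input_files)) + '\n')

demo = Pipeline('inputs_demo')
demo.originate(touch, ['matched.a', 'extra.b'], name='make_files')

demo.transform(task_func=record_inputs,
               name='replaced',
               input=output_from('make_files'),
               filter=regex(r'(.*)\.a'),
               replace_inputs=inputs((r'\1.a', 'extra.b')),   # matched.a must be re-listed explicitly
               output=r'\1.replaced')

demo.transform(task_func=record_inputs,
               name='added',
               input=output_from('make_files'),
               filter=regex(r'(.*)\.a'),
               add_inputs=add_inputs('extra.b'),               # matched.a is kept automatically
               output=r'\1.added')

demo.run(verbose=0)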
Example #5
def make_pipeline1(
        pipeline_name,  # Pipelines need to have a unique name
        starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(
        task_func=task_m_to_1,
        name="add_input",
        # Lookup Task from function name task_originate()
        #   So long as this is unique in the pipeline
        input=task_originate,
        # requires an anchor from 3.7 onwards, see
        # https://bugs.python.org/issue34982
        filter=regex(r"^(.*)"),
        add_inputs=add_inputs(tempdir + "/testdir/whatever.txt"),
        output=r"\1.22")
    test_pipeline.transform(
        task_func=task_1_to_1,
        name="22_to_33",
        # Lookup Task from Task name
        #   Function name is not unique in the pipeline
        input=output_from("add_input"),
        filter=suffix(".22"),
        output=".33")
    tail_task = test_pipeline.transform(
        task_func=task_1_to_1,
        name="33_to_44",
        # Ask Pipeline to lookup Task from Task name
        input=test_pipeline["22_to_33"],
        filter=suffix(".33"),
        output=".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
def make_pipeline1(pipeline_name,   # Pipelines need to have a unique name
                   starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(task_func=task_m_to_1,
                            name="add_input",
                            # Lookup Task from function name task_originate()
                            #   So long as this is unique in the pipeline
                            input=task_originate,
                            # requires an anchor from 3.7 onwards, see
                            # https://bugs.python.org/issue34982
                            filter=regex(r"^(.*)"),
                            add_inputs=add_inputs(
                                tempdir + "/testdir/whatever.txt"),
                            output=r"\1.22")
    test_pipeline.transform(task_func=task_1_to_1,
                            name="22_to_33",
                            # Lookup Task from Task name
                            #   Function name is not unique in the pipeline
                            input=output_from("add_input"),
                            filter=suffix(".22"),
                            output=".33")
    tail_task = test_pipeline.transform(task_func=task_1_to_1,
                                        name="33_to_44",
                                        # Ask Pipeline to lookup Task from Task name
                                        input=test_pipeline["22_to_33"],
                                        filter=suffix(".33"),
                                        output=".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
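make_pipeline1 publishes head and tail tasks precisely so that callers can join it to other pipelines without knowing its internal task names; a hedged wiring sketch (make_pipeline2 is a hypothetical second factory, and Pipeline.set_input is assumed to forward new inputs to the head tasks) looks like:

# Hedged joining sketch (assumptions: make_pipeline2 is another factory like
# make_pipeline1; Pipeline.set_input forwards inputs to the pipeline's head tasks).
pipeline1a = make_pipeline1('pipeline1a', [tempdir + '/a.start'])
pipeline1b = make_pipeline1('pipeline1b', [tempdir + '/b.start'])
pipeline2 = make_pipeline2('pipeline2')            # hypothetical downstream sub-pipeline
pipeline2.set_input(input=[pipeline1a, pipeline1b])
pipeline2.run(multiprocess=4, verbose=0)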
    def test_newstyle_no_re_match (self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_1, tempdir + "a").mkdir(tempdir)
        test_pipeline.transform(task_2, task_1, regex("b"), "task_2.output")


        save_to_str_logger = t_save_to_str_logger()
        test_pipeline.run(multiprocess = 10, logger = save_to_str_logger, verbose = 1)
        print(save_to_str_logger.warning_str)
        self.assertTrue("no file names matched" in save_to_str_logger.warning_str)
        print("\n    Warning printed out correctly", file=sys.stderr)
    def test_newstyle_no_re_match(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_1, tempdir + "a").mkdir(tempdir)
        test_pipeline.transform(task_2, task_1, regex("b"), "task_2.output")

        save_to_str_logger = t_save_to_str_logger()
        test_pipeline.run(
            multiprocess=10, logger=save_to_str_logger, verbose=1)
        print(save_to_str_logger.warning_str)
        self.assertTrue(
            "no file names matched" in save_to_str_logger.warning_str)
        print("\n    Warning printed out correctly", file=sys.stderr)
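The test captures ruffus warnings in memory through t_save_to_str_logger; a minimal sketch of such a logger object (names are illustrative, and it assumes ruffus only calls info/debug/warning on the logger it is given) is:

# Minimal string-capturing logger sketch (assumption: ruffus only needs
# info/debug/warning methods on the logger object passed to run()).
class StringCaptureLogger(object):
    def __init__(self):
        self.info_str = ""
        self.warning_str = ""
        self.debug_str = ""

    def info(self, message):
        self.info_str += message

    def warning(self, message):
        self.warning_str += message

    def debug(self, message):
        self.debug_str += message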
 def test_newstyle_task (self):
     """
     Same as above but construct a new pipeline on the fly without decorators
     """
     test_pipeline = Pipeline("test")
     test_pipeline.files(task1, None, tempdir + 'a.1')\
         .follows(mkdir(tempdir))
     test_pipeline.transform(task_func   = task2,
                             input       = task1,
                             filter      = regex(r".*"),
                             output      = tempdir + 'b.1')
     test_pipeline.files(task3, task2, tempdir + 'c.1')
     test_pipeline.files(task4, [[None, tempdir + 'd.1'], [None, tempdir + 'e.1']])\
         .follows(task3)
     test_pipeline.files(task5, task4, tempdir + "f.1")
     test_pipeline.run(multiprocess = 10, verbose = 0)
Example #10
 def test_newstyle_task(self):
     """
     Same as above but construct a new pipeline on the fly without decorators
     """
     test_pipeline = Pipeline("test")
     test_pipeline.files(task1, None, tempdir + 'a.1')\
         .follows(mkdir(tempdir))
     test_pipeline.transform(task_func=task2,
                             input=task1,
                             filter=regex(r".*"),
                             output=tempdir + 'b.1')
     test_pipeline.files(task3, task2, tempdir + 'c.1')
     test_pipeline.files(task4, [[None, tempdir + 'd.1'], [None, tempdir + 'e.1']])\
         .follows(task3)
     test_pipeline.files(task5, task4, tempdir + "f.1")
     test_pipeline.run(multiprocess=10, verbose=0)
Example #11
def add_export_to_pipeline(pipeline, tool_runners, suffix, config, **kwargs):

    conf = config.get("export", {})
    prefix = conf.get("prefix", "").strip()

    result_dir = "export.dir"
    filter_regex = ruffus.regex("(.*).dir/(.*).{}".format(suffix))
    output = r"{}/{}\1.{}".format(result_dir, prefix, suffix)

    export_result.__name__ = "export"

    pipeline.transform(task_func=export_result,
                       input=tool_runners,
                       filter=filter_regex,
                       output=output,
                       **kwargs).mkdir(result_dir)
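A hedged call site for add_export_to_pipeline (the pipeline object, the tool-runner task list and the config layout are assumptions inferred from the signature above):

# Hypothetical usage (assumptions: `pipeline` and `tool_runner_tasks` are built elsewhere
# in the module; config mirrors the {"export": {"prefix": ...}} layout read above).
config = {"export": {"prefix": "run1_"}}
add_export_to_pipeline(pipeline, tool_runner_tasks, "tsv", config)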
Example #12
    def test_newstyle_ruffus(self):
        test_pipeline = Pipeline("test")
        test_pipeline.split(task_func=prepare_files,
                            input=None,
                            output=tempdir + '*.animal')\
            .follows(mkdir(tempdir, tempdir + "test"))\
            .posttask(lambda: do_write(tempdir + "task.done", "Task 1 Done\n"))

        test_pipeline.collate(task_func=summarise_by_grouping,
                              input=prepare_files,
                              filter=regex(r'(.*/).*\.(.*)\.animal'),
                              output=r'\1\2.results')\
            .posttask(lambda: do_write(tempdir + "task.done", "Task 2 Done\n"))

        test_pipeline.run(multiprocess=10, verbose=0)
        check_species_correct()
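Pipeline.split above fans one job out into an unknown number of '*.animal' files; a self-contained sketch of that wildcard-output pattern (assuming only that ruffus is installed; names are illustrative) is:

# split sketch: a single job whose outputs are only known at run time via a glob.
import os
import ruffus

def split_into_parts(no_input, output_files):
    # output_files is whatever currently matches 'parts/*.part'; clear stale files first
    for stale in output_files:
        os.unlink(stale)
    for i in range(3):
        open('parts/%d.part' % i, 'w').close()

def summarise(input_files, output_file):
    with open(output_file, 'w') as out:
        out.write('%d parts\n' % len(input_files))

demo = ruffus.Pipeline('split_demo')
demo.split(task_func=split_into_parts, input=None, output='parts/*.part')\
    .follows(ruffus.mkdir('parts'))
demo.merge(task_func=summarise, input=split_into_parts, output='parts/summary.txt')
demo.run(verbose=0)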
 def test_newstyle_no_re_match (self):
     try:
         test_pipeline = Pipeline("test")
         test_pipeline.transform(task_func = task_2,
                                 input = None,
                                 filter = regex(tempdir + "b"),
                                 replace_inputs = inputs(tempdir + "a", tempdir + "b"),
                                 output = "task_1.output")
         test_pipeline.run(multiprocess = 10, verbose = 0)
     except ruffus.ruffus_exceptions.error_task_transform_inputs_multiple_args:
         print("\tExpected exception thrown 1")
         return
     except ruffus.ruffus_exceptions.error_inputs_multiple_args:
         print("\tExpected exception thrown 2")
         return
     raise Exception("Inputs(...) with multiple arguments should have thrown an exception")
Example #14
    def test_newstyle_ruffus(self):
        test_pipeline = Pipeline("test")
        test_pipeline.split(task_func=prepare_files,
                            input=None,
                            output=tempdir + '*.animal')\
            .follows(mkdir(tempdir, tempdir + "test"))\
            .posttask(lambda: do_write(tempdir + "task.done", "Task 1 Done\n"))

        test_pipeline.collate(task_func=summarise_by_grouping,
                              input=prepare_files,
                              filter=regex(r'(.*/).*\.(.*)\.animal'),
                              output=r'\1\2.results')\
            .posttask(lambda: do_write(tempdir + "task.done", "Task 2 Done\n"))

        test_pipeline.run(multiprocess=10, verbose=0)
        check_species_correct()
    def test_newstyle_ruffus (self):

        test_pipeline = Pipeline("test")

        test_pipeline.follows(setup_simulation_data, mkdir(gene_data_dir, simulation_data_dir))

        test_pipeline.files(gwas_simulation, generate_simulation_params)\
            .follows(setup_simulation_data)\
            .follows(mkdir(working_dir, os.path.join(working_dir, "simulation_results")))

        test_pipeline.collate(statistical_summary, gwas_simulation, regex(r"simulation_results/(\d+).\d+.simulation_res"), r"\1.mean")\
            .posttask(lambda : sys.stdout.write("\nOK\n"))

        test_pipeline.run(multiprocess = 50, verbose = 0)
        for oo in "000.mean", "001.mean":
            results_file_name = os.path.join(working_dir, oo)
            if not os.path.exists(results_file_name):
                raise Exception("Missing %s" % results_file_name)
Example #16
    def test_newstyle_ruffus(self):

        test_pipeline = Pipeline("test")

        test_pipeline.follows(setup_simulation_data,
                              mkdir(gene_data_dir, simulation_data_dir))

        test_pipeline.files(gwas_simulation, generate_simulation_params)\
            .follows(setup_simulation_data)\
            .follows(mkdir(working_dir, os.path.join(working_dir, "simulation_results")))

        test_pipeline.collate(statistical_summary, gwas_simulation, regex(r"simulation_results/(\d+).\d+.simulation_res"), r"\1.mean")\
            .posttask(lambda: sys.stdout.write("\nOK\n"))

        test_pipeline.run(multiprocess=50, verbose=0)
        for oo in "000.mean", "001.mean":
            results_file_name = os.path.join(working_dir, oo)
            if not os.path.exists(results_file_name):
                raise Exception("Missing %s" % results_file_name)
 def test_newstyle_no_re_match(self):
     try:
         test_pipeline = Pipeline("test")
         test_pipeline.transform(task_func=task_2,
                                 input=None,
                                 filter=regex(tempdir + "b"),
                                 replace_inputs=inputs(
                                     tempdir + "a", tempdir + "b"),
                                 output="task_1.output")
         test_pipeline.run(multiprocess=10, verbose=0)
     except ruffus.ruffus_exceptions.error_task_transform_inputs_multiple_args:
         print("\tExpected exception thrown 1")
         return
     except ruffus.ruffus_exceptions.error_inputs_multiple_args:
         print("\tExpected exception thrown 2")
         return
     raise Exception(
         "Inputs(...) with multiple arguments should have thrown an exception"
     )
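The expected exception here arises because inputs(...) accepts exactly one argument; several replacement inputs must be wrapped in a single list or tuple, as in the short sketch below (file names are illustrative).

# inputs(...) takes a single argument; wrap multiple replacements in one tuple or list.
from ruffus import inputs

ok_single = inputs("replacement.a")                    # one replacement input
ok_many = inputs(("replacement.a", "replacement.b"))   # several, wrapped in one tuple
# inputs("replacement.a", "replacement.b")             # two arguments -> ruffus raises an error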
Example #18
    def test_newstyle_task(self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(task1, [[None, tempdir + "a.1"], [None, tempdir + "b.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.files(task2, [[None, tempdir + "c.1"], [None, tempdir + "d.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.transform(task_func=task3,
                                input=task1,
                                filter=regex(r"(.+)"),
                                replace_inputs=ruffus.inputs(
                                    ((r"\1"), task2, "test_transform_inputs.*y")),
                                output=r"\1.output")
        test_pipeline.merge(task4, (task3), tempdir + "final.output")

        test_pipeline.run([task4], multiprocess=10, verbose=0)

        correct_output = "{tempdir}a.1.output:test_transform_inputs.py,{tempdir}a.1,{tempdir}c.1,{tempdir}d.1;{tempdir}b.1.output:test_transform_inputs.py,{tempdir}b.1,{tempdir}c.1,{tempdir}d.1;".format(
            tempdir=tempdir)
        with open(tempdir + "final.output") as ff:
            real_output = ff.read()
        self.assertEqual(correct_output, real_output)
Example #19
REGEX_TRACK_BOTH = r"(processed.dir/)*([^/]+)\.(fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)"

SEQUENCEFILES_REGEX = r"([^/]+).(?P<suffix>fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)"


def connect():
    '''
    Setup a connection to an sqlite database
    '''

    dbh = sqlite3.connect(P.get_params()['database'])
    return dbh


@transform(P.get_params()["input_globs"].get("default", INPUT_FORMATS),
           regex("(.*)"), r"\1")
def unprocessReads(infiles, outfiles):
    """dummy task - no processing of reads."""


# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if P.get_params().get("preprocessors", None):
    if P.get_params()["auto_remove"]:
        # check if FastQC has been run
        for x in iotools.flatten([glob.glob(y) for y in
                                  P.get_params()["input_globs"].get("default", INPUT_FORMATS)]):
            f = "fastqc.dir/" + re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "file %s missing, "
Example #20
def main():

    #########
    # SETUP #
    #########

    # read the JGI logon and password from the command line
    parser = ruffus.cmdline.get_argparse(
        description='5 accessions variant calling pipeline.')
    parser.add_argument('--email', '-e',
                        help='Logon email address for JGI',
                        type=str,
                        dest='jgi_logon')
    parser.add_argument('--password', '-p',
                        help='JGI password',
                        type=str,
                        dest='jgi_password')
    options = parser.parse_args()
    jgi_logon = options.jgi_logon
    jgi_password = options.jgi_password

    ##################
    # PIPELINE STEPS #
    ##################

    # test function for checking input/output passed to job_script and parsing
    # by io_parser
    test_job_function = functions.generate_job_function(
        job_script='src/sh/io_parser',
        job_name='test')

    # initialise pipeline
    main_pipeline = ruffus.Pipeline.pipelines["main"]

    # bamfiles
    raw_files = [x.path for x in os.scandir('data/bam') if
                 x.name.endswith('.bam') and x.is_file()]

    # subset the files while the pipeline is in development. Make this equal
    # to raw_files to run the whole pipeline.
    # active_raw_files = [x for x in raw_files if
    #                     'G1' in x or 'G4' in x or 'J1' in x or 'J4' in x]
    active_raw_files = raw_files

    # species short names for vcf splitting
    species_short_names = list(set(
        [os.path.basename(x)[0] for x in active_raw_files]))

    # check that the files exist
    mapped_raw = main_pipeline.originate(
        name='mapped_raw',
        task_func=os.path.isfile,
        output=active_raw_files)

    # genome fasta
    ref_fa = main_pipeline.originate(
        name='ref_fa',
        task_func=functions.generate_job_function(
            job_script='src/sh/download_genome',
            job_name='ref_fa',
            job_type='download'),
        output='data/genome/Osativa_323_v7.0.fa',
        extras=[jgi_logon, jgi_password])

    # indexes
    fa_idx = main_pipeline.transform(
        name='fa_idx',
        task_func=functions.generate_job_function(
            job_script='src/sh/fa_idx',
            job_name='fa_idx',
            job_type='transform',
            cpus_per_task=6),
        input=ref_fa,
        filter=ruffus.suffix(".fa"),
        output=['.dict', '.fa.fai'])

    # annotation
    annot = main_pipeline.originate(
        name='annot',
        task_func=functions.generate_job_function(
            job_script='src/sh/download_genome',
            job_name='annot',
            job_type='download'),
        output=('data/genome/'
                'Osativa_323_v7.0.gene_exons.gffread.rRNAremoved.gtf'),
        extras=[jgi_logon, jgi_password])

    # convert annotation to .bed
    annot_bed = main_pipeline.transform(
        name='annot_bed',
        task_func=functions.generate_job_function(
            job_script='src/sh/annot_bed',
            job_name='annot_bed',
            job_type='transform',
            cpus_per_task=7),
        input=annot,
        filter=ruffus.suffix('.gtf'),
        output='.bed')

    # mark duplicates with picard
    deduped = main_pipeline.transform(
        name='dedupe',
        task_func=functions.generate_job_function(
            job_script='src/sh/mark_duplicates_and_sort',
            job_name='dedupe',
            job_type='transform',
            cpus_per_task=2),
        input=mapped_raw,
        filter=ruffus.regex(r"data/bam/(.*).Aligned.out.bam"),
        output=(r"output/mark_duplicates_and_sort/\1.deduped.bam"))

    # Split'N'Trim and reassign mapping qualities
    split_and_trimmed = main_pipeline.transform(
        name='split_trim',
        task_func=functions.generate_job_function(
            job_script='src/sh/split_trim',
            job_name='split_trim',
            job_type='transform',
            cpus_per_task=2),
        input=deduped,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.formatter(
            "output/mark_duplicates_and_sort/(?P<LIB>.+).deduped.bam"),
        output=["{subdir[0][1]}/split_trim/{LIB[0]}.split.bam"])\
        .follows(fa_idx)

    # we're going to recycle call_variants, merge_variants, filter_variants
    # and analyze_covar so we'll get the functions in advance
    call_variants = functions.generate_queue_job_function(
        job_script='src/sh/call_variants',
        job_name='call_variants')
    merge_variants = functions.generate_job_function(
        job_script='src/sh/merge_variants',
        job_name='merge_variants',
        job_type='transform',
        cpus_per_task=8)
    filter_variants = functions.generate_job_function(
        job_script='src/sh/filter_variants',
        job_name='filter_variants',
        job_type='transform',
        cpus_per_task=1)
    analyze_covar = functions.generate_queue_job_function(
        job_script='src/sh/analyze_covar',
        job_name='analyze_covar')

    # call variants without recalibration tables
    uncalibrated_variants = main_pipeline.transform(
        name='uncalibrated_variants',
        task_func=call_variants,
        input=split_and_trimmed,
        add_inputs=ruffus.add_inputs([ref_fa, annot_bed]),
        filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'),
        output='{subdir[0][1]}/variants_uncalibrated/{LIB[0]}.g.vcf.gz')

    # merge gVCF variants
    uncalibrated_variants_merged = main_pipeline.merge(
        name='uncalibrated_variants_merged',
        task_func=merge_variants,
        input=[uncalibrated_variants, ref_fa],
        output='output/variants_uncalibrated/variants_uncalibrated.vcf.gz')

    # filter variants on un-corrected bamfiles
    uncalibrated_variants_filtered = main_pipeline.transform(
        name='uncalibrated_variants_filtered',
        task_func=filter_variants,
        input=uncalibrated_variants_merged,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('_uncalibrated.vcf.gz'),
        output='_uncalibrated_filtered.vcf.gz')

    # select variant (only recalibrate using passed SNPs)
    uncalibrated_variants_selected = main_pipeline.transform(
        name='uncalibrated_variants_selected',
        task_func=functions.generate_job_function(
            job_script='src/sh/select_variants',
            job_name='select_variants',
            job_type='transform'),
        input=uncalibrated_variants_filtered,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('_uncalibrated_filtered.vcf.gz'),
        output='_uncalibrated_selected.vcf.gz')

    # create recalibration report with filtered variants
    covar_report = main_pipeline.merge(
        name='covar_report',
        task_func=analyze_covar,
        input=[split_and_trimmed, ref_fa, annot_bed,
               uncalibrated_variants_selected],
        output="output/covar_analysis/recal_data.table")

    # second pass to analyze covariation remaining after recalibration
    second_pass_covar_report = main_pipeline.merge(
        name='second_pass_covar_report',
        task_func=analyze_covar,
        input=[split_and_trimmed, ref_fa, annot_bed,
               uncalibrated_variants_filtered, covar_report],
        output="output/covar_analysis/post_recal_data.table")

    # plot effect of base recalibration
    recal_plot = main_pipeline.transform(
        name='recal_plot',
        task_func=functions.generate_job_function(
            job_script='src/R/recal_plot.R',
            job_name='recal_plot',
            job_type='transform',
            cpus_per_task=1),
        input=second_pass_covar_report,
        filter=ruffus.suffix('post_recal_data.table'),
        add_inputs=ruffus.add_inputs(covar_report),
        output='recalibration_plots.pdf')

    # recalibrate bases using recalibration report
    recalibrated = main_pipeline.transform(
        name='recalibrate',
        task_func=functions.generate_job_function(
            job_script='src/sh/recalibrate',
            job_name='recalibrate',
            job_type='transform',
            cpus_per_task=2),
        input=split_and_trimmed,
        add_inputs=ruffus.add_inputs([ref_fa, covar_report]),
        filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'),
        output='{subdir[0][1]}/recal/{LIB[0]}.recal.bam')

    # final variant calling
    variants = main_pipeline.transform(
        name='variants',
        task_func=call_variants,
        input=recalibrated,
        add_inputs=ruffus.add_inputs(ref_fa, annot_bed),
        filter=ruffus.formatter('output/recal/(?P<LIB>.+).recal.bam'),
        output='{subdir[0][1]}/variants/{LIB[0]}.g.vcf.gz')

    # merge gVCF variants
    variants_merged = main_pipeline.merge(
        name='variants_merged',
        task_func=merge_variants,
        input=[variants, ref_fa],
        output='output/variants/variants.vcf.gz')

    # variant filtering
    variants_filtered = main_pipeline.transform(
        name='variants_filtered',
        task_func=filter_variants,
        input=variants_merged,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('.vcf.gz'),
        output='_filtered.vcf.gz')

    # variants by species
    split_variants = main_pipeline.subdivide(
        name='split_variants',
        task_func=functions.generate_job_function(
            job_script='src/sh/split_variants',
            job_name='split_variants',
            job_type='transform',
            cpus_per_task=1,
            ntasks=len(species_short_names)),
        input=variants_filtered,
        filter=ruffus.formatter(),
        add_inputs=ruffus.add_inputs(ref_fa),
        output=[('output/split_variants/' + x + '.variants_filtered.vcf.gz')
                for x in species_short_names])

    # count variants per gene per species
    cds_variants = main_pipeline.transform(
        name='cds_variants',
        task_func=functions.generate_job_function(
            job_script='src/R/cds_variants.R',
            job_name='cds_variants',
            job_type='transform'),
        input=split_variants,
        add_inputs=ruffus.add_inputs([ref_fa, annot]),
        filter=ruffus.formatter(
            'output/split_variants/(?P<LIB>.+).variants_filtered.vcf.gz'),
        output='{subdir[0][1]}/cds_variants/{LIB[0]}.cds_variants.Rds')

    # merge counted variants
    variants_per_gene = main_pipeline.merge(
        name='cds_merge',
        task_func=functions.generate_job_function(
            job_script='src/R/cds_merge.R',
            job_name='cds_merge',
            job_type='transform'),
        input=cds_variants,
        output='output/cds_variants/cds_variants.Rds')

    ###################
    # RUFFUS COMMANDS #
    ###################

    # print the flowchart
    ruffus.pipeline_printout_graph(
        "ruffus/flowchart.pdf", "pdf",
        pipeline_name="5 accessions variant calling pipeline")

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)
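main() is driven end to end by ruffus.cmdline; a minimal self-contained sketch of that wrapper (assuming only that ruffus is installed; file names are illustrative) is:

# Minimal ruffus.cmdline sketch: get_argparse() supplies the standard options
# (--verbose, --jobs, --target_tasks, ...) and cmdline.run() executes the "main" pipeline.
import ruffus
import ruffus.cmdline

def make_start(output_file):
    open(output_file, 'w').close()

parser = ruffus.cmdline.get_argparse(description='Minimal cmdline-driven pipeline.')
options = parser.parse_args()

main_pipeline = ruffus.Pipeline.pipelines['main']
main_pipeline.originate(task_func=make_start, output=['cmdline_demo.start'])

ruffus.cmdline.run(options)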
Example #21
def mappipe(ifold, ref_file, minlen=20, rclip=0):

    ifold = os.path.join(ifold, '')
    ifile = '*.fastq.gz'
    #ref_file = '/data/index/HG19.fasta'
    trim_regex = r'(.*)\/(SRR.+).fastq.gz$'
    pipeline = ruffus.Pipeline('FastqDNaseSeq')
    trim_task = pipeline.collate(
        tasks.trimmer,
        name='TrimGalore',
        input=ifold + ifile,
        filter=ruffus.regex(trim_regex),
        output=r'\1/\2_trimmed.fq.gz',
        # extras[0]: minimum length,
        # [1]:right end clip size
        extras=[[minlen, rclip]])
    trfile = '*_trimmed.fq.gz'
    aln_regex = r'(.*)\/(.*).fq.gz$'
    align_task = pipeline.collate(tasks.bwa_aln,
                                  name='bwa_aln',
                                  input=ifold + trfile,
                                  filter=ruffus.regex(aln_regex),
                                  output=r'\1/\2.sai',
                                  extras=[ref_file])
    align_task.follows('TrimGalore')

    ## sai to sam file using bwa samse
    sai_file = '*.sai'
    samse_regex = r'(.*)\/(.*).sai$'
    samse_task = pipeline.collate(
        tasks.bwa_samse,
        name='bwa_samse',
        input=ifold + sai_file,
        filter=ruffus.regex(samse_regex),
        output=r'\1/\2.sam',
        # extras[0]: fastq required for samse,
        # [1]: ref indexed fasta,
        # [2]: max multiple mapped reads [Default=3]
        extras=[[r'\1/\2.fq.gz', ref_file, 10]])
    samse_task.follows('bwa_aln')

    ## sam to bam using sambamba view
    sam_file = '*.sam'
    tobam_regex = r'(.*)\/(.*).sam$'
    tobam_task = pipeline.collate(tasks.sam_to_bam,
                                  name='sam_bam',
                                  input=ifold + sam_file,
                                  filter=ruffus.regex(tobam_regex),
                                  output=r'\1/\2.bam')
    tobam_task.follows('bwa_samse')

    ## sorting bam with sambamba sort
    bam_file = '*trimmed.bam'
    sort_bam_regex = r'(.*)\/(.*).bam$'
    sort_bam_task = pipeline.collate(tasks.sort_bam,
                                     name='sorting_bam',
                                     input=ifold + bam_file,
                                     filter=ruffus.regex(sort_bam_regex),
                                     output=r'\1/\2.sorted.bam')
    sort_bam_task.follows('sam_bam')

    ## bam to bed using bam2bed
    sorted_bam_file = '*trimmed.sorted.bam'
    sorted_bam_regex = r'(.*)\/(.*).sorted.bam$'
    sorted_bam_task = pipeline.collate(tasks.bam2bed,
                                       name='bam2bed',
                                       input=ifold + sorted_bam_file,
                                       filter=ruffus.regex(sorted_bam_regex),
                                       output=r'\1/\2.sorted.bed')
    sorted_bam_task.follows('sorting_bam')

    full_pipe = ruffus.Pipeline('Full pipeline', input=['bam2bed'])

    full_pipe.run()
    if filetype == "bam":
        preamble += "samtools index %(tmpfile)s && "
        postamble += " && rm %(tmpfile)s.bai "
    elif filetype == "bed.gz":
        tmp2 = P.get_temp_filename(shared=False)
        preamble += ''' zcat %(tmpfile)s | sort -k1,1 -k2,2n | bgzip > %(tmp2)s &&
                        mv %(tmp2)s %(tmpfile)s &&
                        tabix -p bed %(tmpfile)s && '''
        postamble += "&& rm %(tmpfile)s.tbi"

    return preamble % locals(), postamble % locals(), tmpfile, filetype


# ------------------------------------------------------------------------------
@subdivide("*.categories.tsv", regex("(.+).categories.tsv"),
           add_inputs(PARAMS["geneset"]), r"\1_*.gtf.gz", r"\1")
def split_gtf_by_category(infiles, outfiles, catname):

    catfile, gtffile = infiles
    categories = pd.read_csv(catfile, index_col=0, squeeze=True, sep="\t")

    # create output filepool
    outpool = iotools.FilePool("{}_%s.gtf.gz".format(catname), force=True)

    gtffile = iotools.open_file(gtffile)

    for gtfline in gtf.iterator(gtffile):

        try:
            transcript_id = gtfline.transcript_id
Example #23
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='cellfree_seq')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the stage (beyond the inputs and outputs),
        # which is the sample name. This is needed within the stage to look up
        # sample-specific configuration options.
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.sort.hq.bam')

    pipeline.transform(task_func=stages.run_connor,
                       name='run_connor',
                       input=output_from('align_bwa'),
                       filter=suffix('.sort.hq.bam'),
                       output='.sort.hq.connor.bam')

    safe_make_dir('metrics')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/connor')

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_raw',
        input=output_from('coverage_bed_raw', 'genome_reads_raw',
                          'target_reads_raw', 'total_reads_raw'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'summary.txt'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_connor',
        input=output_from('coverage_bed_connor', 'genome_reads_connor',
                          'target_reads_connor', 'total_reads_connor'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/connor/all_sample.summary.\1.txt',
        extras=[r'\1', 'connor.summary.txt'])

    safe_make_dir('variants')
    safe_make_dir('variants/vardict')

    pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    (pipeline.merge(
        task_func=stages.concatenate_vcfs,
        name='concatenate_vcfs',
        input=output_from('sort_vcfs'),
        output='variants/vardict/combined.vcf.gz').follows('index_vcfs'))

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('vt_decompose_normalise'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf').follows('index_final_vcf'))

    return pipeline
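A hedged driver for make_pipeline (the `state` object comes from the surrounding framework and is assumed here; the dry-run printout mirrors the pattern used in the test examples above):

# Hypothetical driver (assumptions: `state` is provided by the surrounding framework;
# sys is imported; the job count is illustrative).
pipeline = make_pipeline(state)
pipeline.printout(sys.stdout, [pipeline['apply_vep']], verbose=3)   # dry-run listing
pipeline.run(multiprocess=8, verbose=1)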
    '''Determine indel candidate intervals'''
    cmd_dict = CMD_DICT.copy()
    cmd_dict['infile'] = input_file
    cmd_dict['outfile'] = output_file
    pmsg('Interval Creation', input_file, output_file)
    gatk_cmd = '%(gatk)s --analysis_type RealignerTargetCreator ' + \
            '--reference_sequence %(reference)s ' + \
            '--DBSNP %(dbsnp)s ' + \
            '--input_file %(infile)s ' + \
            '--out %(outfile)s'
    call(gatk_cmd, cmd_dict)

# Realign around possible indels
@follows(mkdir('realigned'))
@transform(create_intervals,
           regex(r'^intervals/(.+)\.intervals$'),
           inputs([r'deduped/\1.deduped.bam', r'intervals/\1.intervals']),
           r'realigned/\1.realigned.bam')
def local_realignment(input_files, output_file):
    '''Realign reads around candidate indels'''
    cmd_dict = CMD_DICT.copy()
    cmd_dict['bam_file'] = input_files[0]
    cmd_dict['indel_intervals'] = input_files[1]
    cmd_dict['outfile'] = output_file
    pmsg('Local Realignment', ', '.join(input_files), output_file)
    gatk_cmd = '%(gatk)s --analysis_type IndelRealigner ' + \
            '--reference_sequence %(reference)s ' + \
            '--DBSNP %(dbsnp)s ' + \
            '--input_file %(bam_file)s ' + \
            '--targetIntervals %(indel_intervals)s ' + \
            '--out %(outfile)s'
Example #25

# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
@follows(mkdir(tempdir))
@ruffus.files([[None, tempdir + "a.1"], [None, tempdir + "b.1"]])
def task1(i, o):
    touch(o)


@follows(mkdir(tempdir))
@ruffus.files([[None, tempdir + "c.1"], [None, tempdir + "d.1"]])
def task2(i, o):
    touch(o)


@transform(task1, regex(r"(.+)"),
           ruffus.inputs(((r"\1"), task2, "test_transform_inputs.*y")),
           r"\1.output")
def task3(i, o):
    # o is a single output file name; write the sorted input file names into it
    names = ",".join(sorted(i))
    with open(o, "w") as ff:
        ff.write(names)


@merge((task3), tempdir + "final.output")
def task4(i, o):
    with open(o, "w") as o_file:
        for f in sorted(i):
            with open(f) as ff:
                o_file.write(f + ":" + ff.read() + ";")
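These decorator-style tasks are the twin of the object-orientated test_newstyle_task above and are executed with pipeline_run rather than Pipeline.run; a minimal sketch, assuming it runs in the same module:

# Run the decorated tasks above up to task4.
from ruffus import pipeline_run
pipeline_run([task4], multiprocess=10, verbose=0)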
Example #26
        s = 0
    except:
        s = -1
    with open(in_genes) as infile:
        with open(out_bed, 'w') as outfile:
            for line in infile:
                fields = line.strip().split('\t')
                chrom, start, stop = fields[s + 2], fields[s + 4], fields[s +
                                                                          5]
                name, strand = fields[s + 1], fields[s + 3]
                outfile.write(
                    '\t'.join([chrom, start, stop, name, '0', strand]) + '\n')


@follows(get_refseq_genes, convert_gtf_genes_to_bed)
@split('%s.*_genes' % cfg.get('DEFAULT', 'genome'), regex(r'(.*)_genes$'), [
    r'\1_genes.promoter*_ext*', r'\1_genes.down*_ext*', r'\1_genes.utr5',
    r'\1_genes.utr3', r'\1_genes.exon', r'\1_genes.intron', r'\1_genes.tss',
    r'\1_genes.noncoding'
])
def refseq_genes_to_regions(in_genes, out_pattern):
    """make regions (promoter, downstream, 5UTR, etc) from refseq_genes"""
    args = shlex.split('''%s --promoter_size=%s --promoter_extend=%s
                          --downstream_size=%s --downstream_extend=%s
                          --with_gene_name''' %
                       (in_genes, cfg.get('genes', 'promoter_size'),
                        cfg.get('genes', 'promoter_extend'),
                        cfg.get('genes', 'downstream_size'),
                        cfg.get('genes', 'downstream_extend')))
    makeGeneStructure.main(args)
Example #27
    try:
        _ = int(open(in_genes).readline().strip().split('\t')[0])
        s = 0
    except:
        s = -1
    with open(in_genes) as infile:
        with open(out_bed, 'w') as outfile:
            for line in infile:
                fields = line.strip().split('\t')
                chrom, start, stop = fields[s+2], fields[s+4], fields[s+5]
                name, strand = fields[s+1], fields[s+3]
                outfile.write('\t'.join([chrom, start, stop, name,
                                         '0', strand]) + '\n')

@follows(get_refseq_genes, convert_gtf_genes_to_bed)
@split('%s.*_genes' % cfg.get('DEFAULT', 'genome'), regex(r'(.*)_genes$'),
       [r'\1_genes.promoter*_ext*', r'\1_genes.down*_ext*', r'\1_genes.utr5',
        r'\1_genes.utr3', r'\1_genes.exon', r'\1_genes.intron', r'\1_genes.tss',
        r'\1_genes.noncoding'])
def refseq_genes_to_regions(in_genes, out_pattern):
    """make regions (promoter, downstream, 5UTR, etc) from refseq_genes"""
    args = shlex.split('''%s --promoter_size=%s --promoter_extend=%s
                          --downstream_size=%s --downstream_extend=%s
                          --with_gene_name''' % (
                            in_genes,
                            cfg.get('genes', 'promoter_size'),
                            cfg.get('genes', 'promoter_extend'),
                            cfg.get('genes', 'downstream_size'),
                            cfg.get('genes', 'downstream_extend')))
    makeGeneStructure.main(args)
REGEX_TRACK_BOTH = \
    r"(processed.dir/)*([^/]+)\.(fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)"

SEQUENCEFILES_REGEX = r"(\S+).(?P<suffix>fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)"


def connect():
    '''
    Setup a connection to an sqlite database
    '''

    dbh = sqlite3.connect(PARAMS['database'])
    return dbh


@transform(INPUT_FORMATS, regex("(.*)"), r"\1")
def unprocessReads(infiles, outfiles):
    """dummy task - no processing of reads."""


# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if PARAMS.get("preprocessors", None):
    if PARAMS["auto_remove"]:
        # check if fastqc has been run
        for x in IOTools.flatten([glob.glob(y) for y in INPUT_FORMATS]):
            f = re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "file %s missing, "
                    "you need to run the pipeline once before "
Example #29

# _________________________________________________________________________________________
#
#   Step 2:
#
#       Statistical summary per gene/gwas file pair
#
#        for n_file in NNN_pairs_of_input_files:
#            working_dir/simulation_results/n.*.simulation_res
#               -> working_dir/n.mean
#
# _________________________________________________________________________________________


@collate(gwas_simulation, regex(r"simulation_results/(\d+).\d+.simulation_res"), r"\1.mean")
@posttask(lambda: sys.stdout.write("\nOK\n"))
def statistical_summary(result_files, summary_file):
    """
    Simulate statistical summary
    """

    summary_file = open(summary_file, "w")
    for f in result_files:
        with open(f) as ii:
            summary_file.write(ii.read())
    summary_file.close()


try:
    from StringIO import StringIO
Example #30
# TODO Also have to run the new bag file extractor for mark2

@follows('convert_params_to_h5')
@files(None, '%s/sentinel' % LDR_DIR)
@posttask(touch_file('%s/sentinel' % LDR_DIR))
def align_ldr(dummy, sentinel):
    cmd = 'python %s/process/LidarAlign.py %s %s' % (SAIL_CAR_LOG_PATH, DSET_DIR, '%s%d.avi' % (DSET, CAMERA))
    print(cmd)
    check_call(cmd, shell=True)


@follows('align_ldr')
#@files('params.ini', '%s/sentinel' % POINTS_H5_DIR)
@transform('%s/*.ldr' % LDR_DIR,
           regex('%s/(.*?).ldr' % LDR_DIR),
           r'%s/\1.h5' % POINTS_H5_DIR)
def convert_ldr_to_h5(ldr_file, h5_file):
    exporter = '%s/mapping/pipeline/ldr_to_h5.py' % SAIL_CAR_LOG_PATH
    cmd = 'python {exporter} {fgps} {ldr_file} {h5_file}'.format(exporter=exporter, fgps=GPS_FILE, ldr_file=ldr_file, h5_file=h5_file)
    if NO_TRANSFORM:
        cmd += ' --no_transform'
    print(cmd)
    check_call(cmd, shell=True)


@follows('convert_ldr_to_h5')
@transform('%s/*.h5' % POINTS_H5_DIR,
           regex('%s/(.*?).h5' % POINTS_H5_DIR),
           r'%s/\1.pcd' % PCD_DIR)
def convert_h5_to_pcd(input_file, output_file):
Example #31
                strand = fields[5] if len(fields) >= 6 else "+"
                # +:RED, -:GREEN
                color = "255,0,0" if strand == "+" else "0,255,0"
                outfile.write("\t".join(fields + [start, stop, color]) + "\n")


@transform(bed_color_strand, suffix(""), ".bigbed")
def bed_to_bigbed(in_bed, out_bigbed):
    """Convert a BED file to .bigbed for viewing on UCSC browser"""
    cmd = "bedToBigBed %s %s.chrom.sizes %s" % (in_bed, genome_path(), out_bigbed)
    sys_call(cmd)


@transform(
    [bed_uniquefy, clip_and_sort_peaks] + mapping.all_mappers_output,
    regex("(.*mapped_reads).clipped.sorted(.unique|)"),
    # suffix('.mapped_reads'),
    add_inputs(bootstrap.get_chrom_sizes),
    r"\1\2.bedgraph",
)
# r'.bedgraph')
def bed_to_bedgraph(in_files, out_bedgraph):
    "extend reads to the full fragment length and create a bedgraph from them"
    in_bed, in_chrom_sizes = in_files
    cmd = ("slopBed -i %s -s -r %s -l 0 -g %s | " + "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s") % (
        in_bed,
        cfg.getint("DEFAULT", "fragment_size") - cfg.getint("DEFAULT", "tag_size"),
        in_chrom_sizes,
        cfg.get("DEFAULT", "genome"),
        genome_path(),
        out_bedgraph,
Example #32
    # check whether this is a illumina or sanger fastq file
    try:
        SeqIO.convert(input_file_handle, 'fastq-illumina', output_file_handle, 'fastq-sanger')
    except ValueError as e:
        # check if this is a quality score problem
        if e.args != ('Invalid character in quality string',):
            raise e
        input_file_handle.seek(0)
        output_file_handle.seek(0)
        output_file_handle.writelines(input_file_handle.readlines())
    finally:
        input_file_handle.close()
        output_file_handle.close()

@follows(mkdir('sai'), mkdir('logs'))
@transform(copy_sequence, regex(r'^fastq/(.+)_sequence\.fastq\.gz$'), r'sai/\1.sai')
def fastq_to_sai(input_file, output_file):
    '''Convert FASTQ files to SAI files.'''
    cmd_dict = CMD_DICT.copy()
    cmd_dict['infile'] = input_file
    cmd_dict['outfile'] = output_file
    pmsg('Aligning sequences', cmd_dict['infile'], cmd_dict['outfile'])
    bwacmd = '%(bwa)s aln -t %(threads)s -f %(outfile)s %(reference)s %(infile)s'
    call(bwacmd, cmd_dict)

# Merge paired ends to SAM
@follows(mkdir('sam'))
@transform(fastq_to_sai, regex(r'^sai/(\w+)_s_(\d)(_1)?\.sai$'),
           inputs([r'sai/\1_s_\2*.sai', r'fastq/\1_s_\2*.fastq.gz']),
           r'sam/\1_s_\2.sam')
def make_sam(input_files, output_file):
Example #33
0
from hts_waterworks.bootstrap import (genome_path, get_genome, cfg,
                                      get_chrom_sizes)
import hts_waterworks.preprocessing as preprocessing

#: the references to map against for this run (genome, transcriptome, etc)
reference_genomes = [genome_path()]
if cfg.getboolean('mapping', 'map_to_transcriptome'):
    reference_genomes.append('*_genes.transcriptome.fasta')

@follows(mkdir('mapped'))
def make_mapping_dir():
    pass


@active_if(cfg.getboolean('mapping', 'map_to_transcriptome'))
@split('*_genes', regex(r'(.*)_genes$'),
       [r'\1_genes.transcriptome.fasta',
        r'\1_genes.transcriptome.seqdb',
        r'\1_genes.transcriptome.msa'])
def make_transcriptome(in_genes, out_files):
    """Splice UTR's and exons from gene annotations into a transcriptome.
    Creates a fasta-file of resulting genes and a gene to genome alignment.
    
    """
    out_fasta, out_db, out_msa = out_files
    startCol = 1
    msa = cnestedlist.NLMSA(out_msa, mode='w', pairwiseMode=True)
    genome = get_genome(None, None, touch_file=False)
    for chrom in genome.values():
        msa += chrom
    outfile = open(out_fasta, 'w')
Example #34
0
    P.run(statement, job_memory=PARAMS["job_highmemory"])

    statement = '''
    tabix -p bed %(outfile)s
    '''
    P.run(statement, job_memory=PARAMS["job_highmemory"])


###################################################################
# ENSEMBL gene set
###################################################################


@follows(mkdir('ensembl.dir'))
@transform(PARAMS["ensembl_filename_gtf"], regex("(\S+)"),
           r"%s" % PARAMS['interface_geneset_all_gtf'])
def buildUCSCGeneSet(infile, outfile):
    '''output sanitized ENSEMBL geneset.

    This method outputs an ENSEMBL gene set after some sanitizing steps:

    1. Chromosome names are changed to the UCSC convention.
    2. Chromosomes that match the regular expression specified in
       the configuration file are removed.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
       NCBI Assembly report in 'txt' format.
Example #35
0
#
#    task3
#
@active_if(lambda: pipeline_active_if)
@transform(task1, suffix(".1"), ".3")
def task3(infile, outfile):
    """
    Third task
    """
    helper(infile, outfile)


#
#    task4
#
@collate([task2, task3], regex(r"(.+)\.[23]"), r"\1.4")
def task4(infiles, outfile):
    """
    Fourth task
    """
    helper(infiles, outfile)
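
# A rough sketch of what the @collate regex above does (plain re over
# hypothetical file names, not ruffus itself): every input whose "\1"
# substitution is identical is grouped into one job producing "\1.4".
def _collate_grouping_sketch():
    import re
    from collections import defaultdict
    infiles = ["run/a.2", "run/a.3", "run/b.2", "run/b.3"]   # hypothetical
    jobs = defaultdict(list)
    for name in infiles:
        match = re.match(r"(.+)\.[23]", name)
        if match:
            jobs[match.group(1) + ".4"].append(name)
    # jobs == {"run/a.4": ["run/a.2", "run/a.3"],
    #          "run/b.4": ["run/b.2", "run/b.3"]}
    return jobs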

#
#    task5
#


@merge(task4, "test_active_if/summary.5")
def task5(infiles, outfile):
    """
    Fifth task



#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
@mkdir(tempdir)
@originate(tempdir + "a")
def task_1 (o):
    open(o, 'w').close()

@transform(task_1, regex("b"), "task_2.output")
def task_2 (i, o):
    for f in o:
        with open(f, 'w') as oo:
            pass

import unittest


class t_save_to_str_logger:
    """
    Everything to stderr
    """
    def __init__ (self):
        self.info_str = ""
        self.warning_str = ""
Example #37
0
def prepare_files(no_inputs, outputs):
    # cleanup previous
    for f in outputs:
        os.unlink(f)

    for grouping in species_list:
        for species_name in species_list[grouping]:
            filename = tempdir + "%s.%s.animal" % (species_name, grouping)
            with open(filename, "w") as oo:
                oo.write(species_name + "\n")


#
#    task2
#
@collate(prepare_files, regex(r'(.*/).*\.(.*)\.animal'), r'\1\2.results')
@posttask(lambda: do_write(tempdir + "task.done", "Task 2 Done\n"))
def summarise_by_grouping(infiles, outfile):
    """
    Summarise by each species group, e.g. mammals, reptiles, fish
    """
    with open(tempdir + "jobs.start",  "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfile]))
    with open(outfile, "w") as oo:
        for i in infiles:
            with open(i) as ii:
                oo.write(ii.read())
    with open(tempdir + "jobs.finish",  "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfile]))

Example #38
0
def main():

    #########
    # SETUP #
    #########

    # test function for checking input/output passed to job_script and parsing
    # by src/sh/io_parser
    test_job_function = tompltools.generate_job_function(
        job_script='src/sh/io_parser',
        job_name='test',
        verbose=True)

    # parse email etc. here?
    parser = ruffus.cmdline.get_argparse(
        description='ASW genome assembly pipeline.')
    parser.add_argument('--blast-db-folder',
                        help='Path to BLAST db folder',
                        type=str,
                        dest='blast_db_folder')

    # parser.add_argument('--email', '-e',
    #                     help='Logon email address for JGI',
    #                     type=str,
    #                     dest='jgi_logon')
    # parser.add_argument('--password', '-p',
    #                     help='JGI password',
    #                     type=str,
    #                     dest='jgi_password')
    options = parser.parse_args()
    # jgi_logon = options.jgi_logon
    # jgi_password = options.jgi_password
    if options.blast_db_folder:
        os.environ['BLASTDB'] = options.blast_db_folder

    # initialise pipeline
    main_pipeline = ruffus.Pipeline.pipelines['main']

    # find fastq.gz files
    dir_listing = [x[0] for x in os.walk(top='data', followlinks=True)]
    fastq_file_list = []
    for directory in dir_listing:
        file_list = os.scandir(directory)
        fastq_file_list.append([x.path for x in file_list
                               if (x.name.endswith('fastq.gz')
                                   or x.name.endswith('.fastq'))
                               and x.is_file()])

    fastq_files = list(tompytools.flatten_list(fastq_file_list))

    # extract only MH gDNA fastq data, i.e.
    # 2125-06-11-1 = MH PE
    # 2125-06-06-1 = MH MP
    active_fq_files = [x for x in fastq_files
                       if ('2125-06-11-1' in x
                           or '2125-06-06-1' in x)]

    # load files into ruffus
    raw_fq_files = main_pipeline.originate(
        name='raw_fq_files',
        task_func=os.path.isfile,
        output=active_fq_files)

    # merge libraries
    merged_fq_files = main_pipeline.collate(
        name='merged_fq_files',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/merge_fq',
            job_name='merge_fq'),
        input=raw_fq_files,
        filter=ruffus.formatter(
            r'data/NZGL02125/.*/'
            r'[^-]+-(?P<LIB>[^_]+).+_R(?P<RN>\d)_.*.fastq.gz'),
        output=[r'output/fq_merged/{LIB[0]}_R{RN[0]}_merged.fastq.gz'])
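
    # Note: the {LIB[0]}/{RN[0]} placeholders above are filled from the named
    # groups of the formatter() regex. Roughly, on a hypothetical path
    # (plain-re sketch for illustration only, not part of the pipeline):
    #
    #     m = re.search(r'[^-]+-(?P<LIB>[^_]+).+_R(?P<RN>\d)_.*.fastq.gz',
    #                   'data/NZGL02125/run1/XXXX-mylib_L001_R1_001.fastq.gz')
    #     'output/fq_merged/{LIB}_R{RN}_merged.fastq.gz'.format(**m.groupdict())
    #     # -> 'output/fq_merged/mylib_R1_merged.fastq.gz'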


    # make pairs and send to cutadapt for trimming external adaptors
    trim_cutadapt = main_pipeline.collate(
        name='trim_cutadapt',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/cutadapt_pe',
            job_name='cutadapt'),
        input=merged_fq_files,
        filter=ruffus.formatter(
            r'.+/(?P<LIB>[^_]+)_R(?P<RN>\d)_merged.fastq.gz'),
        output=[['output/cutadapt/{LIB[0]}_R1_trimmed.fastq.gz',
                'output/cutadapt/{LIB[0]}_R2_trimmed.fastq.gz']])

    # send trimmed reads to splitnextera
    mp_splitnextera = main_pipeline.subdivide(
        name='splitnextera',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/splitnextera',
            job_name='splitnextera'),
        input=trim_cutadapt,
        filter=ruffus.regex(
            r'.+?/2125-06-06-1_R(?P<RN>\d)_trimmed.fastq.gz'),
        output=['output/splitnextera/2125-06-06-1.pe.fastq.gz',
                'output/splitnextera/2125-06-06-1.se.fastq.gz',
                'output/splitnextera/2125-06-06-1.mp.fastq.gz',
                'output/splitnextera/2125-06-06-1.unknown.fastq.gz'])

    # decontaminate PhiX (other?) sequences
    decon_mp = main_pipeline.transform(
        name='decon_mp',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/decon',
            job_name='decon_mp'),
        input=mp_splitnextera,
        filter=ruffus.formatter(
            r'.+/2125-06-06-1\.(?P<VL>[^.]+)\.fastq.gz'),
        output=['output/decon/2125-06-06-1_{VL[0]}.fastq.gz'])

    decon_pe = main_pipeline.transform(
        name='decon_pe',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/decon',
            job_name='decon_pe'),
        input=trim_cutadapt,
        filter=ruffus.regex(
            r'.+?/2125-06-11-1_R(?P<RN>\d)_trimmed.fastq.gz'),
        output=[r'output/decon/2125-06-11-1.fastq.gz'])

    decon = [decon_mp, decon_pe]

    # digital normalisation and error correction w/ bbnorm
    bbnorm = main_pipeline.subdivide(
        name='bbnorm',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/bbnorm',
            job_name='bbnorm',
            mem_per_cpu=7000,
            cpus_per_task=8),
        input=decon,
        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        output=[r'output/bbnorm/{LN[0]}{VL[0]}.fastq.gz'])

    # download NCBI databases for taxonomy data
    download_taxonomy_databases = main_pipeline.originate(
        name='download_taxonomy_databases',
        task_func=tompltools.generate_job_function(
            job_script='src/r/download_taxonomy_databases.R',
            job_name='download_taxonomy_databases',
            job_type='originate'),
        output=[['data/ncbi/nucl_gb.accession2taxid.Rds',
                'data/ncbi/nodes.dmp.Rds',
                'data/ncbi/names.dmp.Rds']])

    # subsample reads, blast with biopython and parse results
    fq_subsample = main_pipeline.subdivide(
        name='fq_subsample',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/fq_subsample',
            job_name='fq_subsample'),
        input=bbnorm,
        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        output=[r'output/blastqc/{LN[0]}{VL[0]}_R1.fastq.gz',
                r'output/blastqc/{LN[0]}{VL[0]}_R2.fastq.gz'])
    blast_reads = main_pipeline.transform(
        name='blast_reads',
        task_func=tompltools.generate_job_function(
            job_script='src/py/blast_reads.py',
            job_name='blast_reads',
            cpus_per_task=4),
        input=fq_subsample,
        filter=ruffus.suffix('.fastq.gz'),
        output=['.xml'])
    parse_blast_results = main_pipeline.transform(
        name='parse_blast_results',
        task_func=tompltools.generate_job_function(
            job_script='src/py/parse_blast_results.py',
            job_name='parse_blast_results'),
        input=blast_reads,
        filter=ruffus.suffix('.xml'),
        output=['.table'])
    main_pipeline.merge(
        name='plot_blast_results',
        task_func=tompltools.generate_job_function(
            job_script='src/r/extract_blast_hits_per_taxid.R',
            job_name='plot_blast_results'),
        input=[parse_blast_results, download_taxonomy_databases],
        output=['output/blastqc/plots.pdf'])
    
    # trim reads to 100 bp for edena?
    clip_to_100b = main_pipeline.subdivide(
        name='clip_to_100b',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/clip_to_100b',
            job_name='clip_to_100b'),
        input=bbnorm,
#        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        filter=ruffus.regex(r'.+/2125-06-11-1.fastq.gz'),
        output=[r'output/trunc_100/2125-06-11-1_R1.fastq.gz',
                r'output/trunc_100/2125-06-11-1_R2.fastq.gz'])

    # print raw and normalised kmer distribution plots
    main_pipeline.merge(
        name='kmer_distribution_plots',
        task_func=tompltools.generate_job_function(
            job_script='src/r/kmer_distribution_plots.R',
            job_name='kmer_distribution_plots'),
        input=bbnorm,
        output=['output/bbnorm/plots.pdf', 'output/bbnorm/plot_data.Rds'])

    # run fastqc on decontaminated and normalised libraries
    main_pipeline.transform(
        name='fastqc',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/fastqc',
            job_name='fastqc',
            cpus_per_task=1),
        input=bbnorm,
        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        output=[r'output/fastqc/{LN[0]}{VL[0]}_fastqc.html'])

    # overlap step with edena
    # edena_overlaps = main_pipeline.collate(
    #     name='edena_overlaps',
    #     task_func=tompltools.generate_job_function(
    #         job_script='src/sh/edena_overlaps',
    #         job_name='edena_overlaps'),
    #     input=clip_to_100b,
    #     filter=ruffus.formatter(r'.+/(?P<LN>[^_]+)_R\d.fastq.gz'),
    #     output=[r'output/edena/{LN[0]}.ovc'])

    # prepare files with velveth
    # set threads for velvet to 1 !!!
    min_kmer = 71
    max_kmer = 87
    step = 8
    kmer_lengths = [x for x in range(min_kmer, max_kmer + 1, step)]
    velveth_output = list(
        tompytools.flatten_list(
            [('output/velveth_' + str(x) + '/Sequences')
             for x in kmer_lengths]))
    # velveth = main_pipeline.merge(
    #     name='hash_files',
    #     task_func=test_job_function,
    #     # task_func=tompltools.generate_job_function(
    #     #     job_script='src/sh/hash_files',
    #     #     job_name='hash_files'),
    #     input=bbnorm,
    #     output=velveth_output)

    ###################
    # RUFFUS COMMANDS #
    ###################

    # print the flowchart
    ruffus.pipeline_printout_graph(
        "ruffus/flowchart.pdf", "pdf",
        pipeline_name="ASW genome assembly pipeline")

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)
@transform(task1, suffix(".1"), ".2")
def task2(infiles, outfiles, *extra_params):
    """
    Second task
    """
    with open(tempdir + "jobs.start", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))
    test_job_io(infiles, outfiles, extra_params)
    with open(tempdir + "jobs.finish", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))


#
#    task3
#
@transform(task2, regex('(.*).2'), inputs([r"\1.2", tempdir + "a.1"]), r'\1.3')
@posttask(lambda: do_write(test_file, "Task 3 Done\n"))
def task3(infiles, outfiles, *extra_params):
    """
    Third task
    """
    with open(tempdir + "jobs.start", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))
    test_job_io(infiles, outfiles, extra_params)
    with open(tempdir + "jobs.finish", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))


#
#    task4
#
Example #40
0
        DATADIR = "data.dir"
    else:
        DATADIR = PARAMS['data']

# --------------------------------------
FASTQ_SUFFIXES = ("*.fastq.1.gz",
                  "*.fastq.2.gz",
                  "*.fastq.gz")
FASTQ_DIR = PARAMS['fastq_dir']
# set to value for testing purposes (see regexes below)
if FASTQ_DIR == "?!":
    FASTQ_DIR = ""

FASTQ_FILES = tuple([os.path.join(FASTQ_DIR, suffix_name)
                     for suffix_name in FASTQ_SUFFIXES])
FASTQ_REGEX = regex(os.path.join(FASTQ_DIR, r"(\S+).fastq.1.gz"))
FASTQ_PAIR = os.path.join(FASTQ_DIR, r"\1.fastq.2.gz")
SE_REGEX = regex(os.path.join(FASTQ_DIR, r"(\S+).fastq.gz"))
GENESETS = [y for y in glob.glob(os.path.join("reference.dir/*.gtf.gz"))]


@follows(mkdir("transcripts.dir"))
@transform("%s" % PARAMS['annotations_geneset_gtf'],
           regex("reference.dir/(.+).gtf.gz"),
           r"transcripts.dir/\1.fa")
def makeRepTranscripts(infile, outfile):
    '''
    make a single representative transcript for each
    gene - put into a multi-fasta file
    '''
Example #41
0
#
#    task3
#
@active_if(lambda: pipeline_active_if)
@transform(task1, suffix(".1"), ".3")
def task3(infile, outfile):
    """
    Third task
    """
    helper(infile, outfile)


#
#    task4
#
@collate([task2, task3], regex(r"(.+)\.[23]"), r"\1.4")
def task4(infiles, outfile):
    """
    Fourth task
    """
    helper(infiles, outfile)


#
#    task5
#


@merge(task4, "test_active_if/summary.5")
def task5(infiles, outfile):
    """
Example #42
0
def build_pipeline(options, work_folder, log, context):
    main_pipeline = Pipeline.pipelines['main']

    # Triage
    task_triage = main_pipeline.transform(
        task_func=triage,
        input=os.path.join(work_folder, 'origin'),
        filter=formatter('(?i)'),
        output=os.path.join(work_folder, 'origin.pdf'),
        extras=[log, context])

    task_repair_and_parse_pdf = main_pipeline.transform(
        task_func=repair_and_parse_pdf,
        input=task_triage,
        filter=suffix('.pdf'),
        output='.repaired.pdf',
        output_dir=work_folder,
        extras=[log, context])

    # Split (kwargs for split seems to be broken, so pass plain args)
    task_marker_pages = main_pipeline.split(
        marker_pages,
        task_repair_and_parse_pdf,
        os.path.join(work_folder, '*.marker.pdf'),
        extras=[log, context])

    task_ocr_or_skip = main_pipeline.split(
        ocr_or_skip,
        task_marker_pages,
        [os.path.join(work_folder, '*.ocr.page.pdf'),
         os.path.join(work_folder, '*.skip.page.pdf')],
        extras=[log, context])

    # Rasterize preview
    task_rasterize_preview = main_pipeline.transform(
        task_func=rasterize_preview,
        input=task_ocr_or_skip,
        filter=suffix('.page.pdf'),
        output='.preview.jpg',
        output_dir=work_folder,
        extras=[log, context])
    task_rasterize_preview.active_if(options.rotate_pages)

    # Orient
    task_orient_page = main_pipeline.collate(
        task_func=orient_page,
        input=[task_ocr_or_skip, task_rasterize_preview],
        filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
        output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
        extras=[log, context])

    # Rasterize actual
    task_rasterize_with_ghostscript = main_pipeline.transform(
        task_func=rasterize_with_ghostscript,
        input=task_orient_page,
        filter=suffix('.ocr.oriented.pdf'),
        output='.page.png',
        output_dir=work_folder,
        extras=[log, context])

    # Preprocessing subpipeline
    task_preprocess_remove_background = main_pipeline.transform(
        task_func=preprocess_remove_background,
        input=task_rasterize_with_ghostscript,
        filter=suffix(".page.png"),
        output=".pp-background.png",
        extras=[log, context])

    task_preprocess_deskew = main_pipeline.transform(
        task_func=preprocess_deskew,
        input=task_preprocess_remove_background,
        filter=suffix(".pp-background.png"),
        output=".pp-deskew.png",
        extras=[log, context])

    task_preprocess_clean = main_pipeline.transform(
        task_func=preprocess_clean,
        input=task_preprocess_deskew,
        filter=suffix(".pp-deskew.png"),
        output=".pp-clean.png",
        extras=[log, context])

    task_select_ocr_image = main_pipeline.collate(
        task_func=select_ocr_image,
        input=[task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r"\1.ocr.png"),
        extras=[log, context])

    # HOCR OCR
    task_ocr_tesseract_hocr = main_pipeline.transform(
        task_func=ocr_tesseract_hocr,
        input=task_select_ocr_image,
        filter=suffix(".ocr.png"),
        output=[".hocr", ".txt"],
        extras=[log, context])
    task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
    task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')

    task_select_visible_page_image = main_pipeline.collate(
        task_func=select_visible_page_image,
        input=[task_rasterize_with_ghostscript,
               task_preprocess_remove_background,
               task_preprocess_deskew,
               task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r'\1.image'),
        extras=[log, context])
    task_select_visible_page_image.graphviz(shape='diamond')

    task_select_image_layer = main_pipeline.collate(
        task_func=select_image_layer,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.image-layer.pdf'),
        extras=[log, context])
    task_select_image_layer.graphviz(
        fillcolor='"#00cc66"', shape='diamond')

    task_render_hocr_page = main_pipeline.transform(
        task_func=render_hocr_page,
        input=task_ocr_tesseract_hocr,
        filter=regex(r".*/(\d{6})(?:\.hocr)"),
        output=os.path.join(work_folder, r'\1.text.pdf'),
        extras=[log, context])
    task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')

    # Tesseract OCR + text only PDF
    task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_textonly_pdf,
        input=[task_select_ocr_image],
        filter=regex(r".*/(\d{6})(?:\.ocr.png)"),
        output=[os.path.join(work_folder, r'\1.text.pdf'),
                os.path.join(work_folder, r'\1.text.txt')],
        extras=[log, context])
    task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
    task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich')

    task_weave_layers = main_pipeline.collate(
        task_func=weave_layers,
        input=[task_repair_and_parse_pdf,
               task_render_hocr_page,
               task_ocr_tesseract_textonly_pdf,
               task_select_image_layer],
        filter=regex(
            r".*/((?:\d{6}(?:\.text\.pdf|\.image-layer\.pdf))|(?:origin\.repaired\.pdf))"),
        output=os.path.join(work_folder, r'layers.rendered.pdf'),
        extras=[log, context])
    task_weave_layers.graphviz(fillcolor='"#00cc66"')

    # PDF/A pdfmark
    task_generate_postscript_stub = main_pipeline.transform(
        task_func=generate_postscript_stub,
        input=task_repair_and_parse_pdf,
        filter=formatter(r'\.repaired\.pdf'),
        output=os.path.join(work_folder, 'pdfa.ps'),
        extras=[log, context])
    task_generate_postscript_stub.active_if(options.output_type.startswith('pdfa'))

    # PDF/A conversion
    task_convert_to_pdfa = main_pipeline.merge(
        task_func=convert_to_pdfa,
        input=[task_generate_postscript_stub,
               task_weave_layers],
        output=os.path.join(work_folder, 'pdfa.pdf'),
        extras=[log, context]
    )
    task_convert_to_pdfa.active_if(options.output_type.startswith('pdfa'))

    task_metadata_fixup = main_pipeline.merge(
        task_func=metadata_fixup,
        input=[task_repair_and_parse_pdf,
               task_weave_layers,
               task_convert_to_pdfa],
        output=os.path.join(work_folder, 'metafix.pdf'),
        extras=[log, context]
    )

    task_merge_sidecars = main_pipeline.merge(
        task_func=merge_sidecars,
        input=[task_ocr_tesseract_hocr,
               task_ocr_tesseract_textonly_pdf],
        output=options.sidecar,
        extras=[log, context])
    task_merge_sidecars.active_if(options.sidecar)

    # Optimize
    task_optimize_pdf = main_pipeline.transform(
        task_func=optimize_pdf,
        input=task_metadata_fixup,
        filter=suffix('.pdf'),
        output='.optimized.pdf',
        output_dir=work_folder,
        extras=[log, context])

    # Finalize
    main_pipeline.merge(
        task_func=copy_final,
        input=[task_optimize_pdf],
        output=options.output_file,
        extras=[log, context])
Example #43
0
from ruffus import (transform, follows, collate, files, split, merge,
                    add_inputs, regex, suffix, mkdir, jobs_limit, output_from)
from ruffus.task import active_if

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                           main_mutex as log_mtx)
from hts_waterworks.bootstrap import cfg, get_chrom_sizes, genome_path
import hts_waterworks.mapping as mapping
import hts_waterworks.clip_seq as clip_seq
from hts_waterworks.utils.common import (bedCommentFilter, readBedLines,
                                         parse_ucsc_range)


@active_if(cfg.getboolean('peaks', 'run_macs'))
@collate(mapping.all_mappers_output, regex(r'(.+)\.treat(.*)\.mapped_reads'), 
         add_inputs(r'\1.control\2.mapped_reads'), r'\1.treat\2.macs.peaks',
         cfg.getfloat('peaks', 'max_FDR'))
def run_macs(in_files, out_peaks, max_fdr):
    """Call peak with MACS (v1.3).
    Apply a maximum FDR threshold and treat centers as peak summits
    
    """
    in_treat, in_control = in_files[0]
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs.peaks'
    max_fdr = cfg.getfloat('peaks', 'max_FDR')
    cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name,
                                               cfg.get('peaks', 'macs_params'))
    sys_call(cmd)
    
Example #44
0
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@transform(mapping.all_mappers_output, suffix('.mapped_reads'),
           '.overlap.mapped_reads', cfg.getint('PAS-Seq', 'min_read_count'))
def remove_nonoverlapping_reads(in_bed, out_bed, min_read_count):
    """
    Remove mapped reads that don't overlap with at least *min_read_count* reads
    """
    cmd = "intersectBed -wa -c -a %s -b %s | awk '$(NF) >= %s' |" \
          r"cut -f 1,2,3,4,5,6 > %s" % (in_bed, in_bed, min_read_count + 1,
                                        out_bed)
    sys_call(cmd, file_log=False)


@active_if(cfg.getboolean('PAS-Seq', 'merge_adjacent_reads'))
#@split(mapping.all_mappers_output, regex('(.*).mapped_reads$'),
@split(remove_nonoverlapping_reads, regex('(.*).mapped_reads$'),
           [r'\1.merged.mapped_reads', r'\1.merged.pileup_reads'],
           cfg.getint('PAS-Seq', 'merge_window_width'),
           cfg.getint('PAS-Seq', 'merge_num_iterations'),
           r'\1.merged.mapped_reads', r'\1.merged.pileup_reads',
           cfg.getint('PAS-Seq', 'min_read_count'))
def merge_adjacent_reads(in_bed, out_pattern, window_width, iterations,
                         out_merged, out_pileup, min_read_count):
    """Reassign read ends to a weighted average of adjacent reads"""
    # helper functions for parsing bed files
    filter_lines = lambda l: l.strip() and (not l.startswith('#') or \
                                            l.startswith('"'))
    read_bed_lines = lambda infile: itertools.ifilter(filter_lines, infile)
    
    # sort the input by chrom, stop
    tmpfile = in_bed + '.merged_adjacent_sorted'
#
# 1. pipeline_genesets: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(
    P.peek_parameters(PARAMS["annotations_dir"],
                      "pipeline_genesets.py",
                      "genesets",
                      on_error_raise=__name__ == "__main__",
                      prefix="annotations_",
                      update_interface=True))
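
# Roughly, the prefixing described in the comment above amounts to the
# following (hypothetical upstream parameter names; plain-dict sketch only):
_upstream_params_sketch = {"interface_geneset_all_gtf": "geneset_all.gtf.gz",
                           "database": "csvdb"}
_prefixed_sketch = {"annotations_" + key: value
                    for key, value in _upstream_params_sketch.items()}
# _prefixed_sketch == {"annotations_interface_geneset_all_gtf": "geneset_all.gtf.gz",
#                      "annotations_database": "csvdb"}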


# ---------------------------------------------------
# Specific pipeline tasks
@transform(("pipeline.yml", ), regex("(.*)\.(.*)"), r"\1.counts")
def count_words(infile, outfile):
    '''count the number of words in the pipeline configuration files.'''

    # the command line statement we want to execute
    statement = '''awk 'BEGIN { printf("word\\tfreq\\n"); } 
    {for (i = 1; i <= NF; i++) freq[$i]++}
    END { for (word in freq) printf "%%s\\t%%d\\n", word, freq[word] }'
    < %(infile)s > %(outfile)s'''

    # execute command in variable statement.
    #
    # The command will be sent to the cluster.  The statement will be
    # interpolated with any options that are defined in the
    # configuration files or variables that are declared in the calling
    # function.  For example, %(infile)s will be substituted with the
    # value of the infile argument above.
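
# As a rough stand-alone illustration of the %(name)s interpolation described
# in the comment above (plain Python string formatting with hypothetical
# values; P.run() itself also handles job submission, which is not shown):
def _statement_interpolation_sketch():
    statement = "awk '...' < %(infile)s > %(outfile)s"
    return statement % {"infile": "pipeline.yml",
                        "outfile": "pipeline.yml.counts"}
    # -> "awk '...' < pipeline.yml > pipeline.yml.counts"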
Example #46
0
                chrom, start, stop = fields[:3]
                strand = fields[5] if len(fields) >= 6 else '+'
                # +:RED, -:GREEN
                color = '255,0,0' if strand == '+' else '0,255,0'
                outfile.write('\t'.join(fields + [start, stop, color]) + '\n')


@transform(bed_color_strand, suffix(''), '.bigbed')
def bed_to_bigbed(in_bed, out_bigbed):
    """Convert a BED file to .bigbed for viewing on UCSC browser"""
    cmd = 'bedToBigBed %s %s.chrom.sizes %s' % (in_bed,
                                                genome_path(), out_bigbed)
    sys_call(cmd)

@transform([bed_uniquefy, clip_and_sort_peaks] + mapping.all_mappers_output,
    regex('(.*mapped_reads).clipped.sorted(.unique|)'),
    #suffix('.mapped_reads'),
    add_inputs(bootstrap.get_chrom_sizes),
    r'\1\2.bedgraph')
    #r'.bedgraph')
def bed_to_bedgraph(in_files, out_bedgraph):
    'extend reads to the full fragment length and create a bedgraph from them'
    in_bed, in_chrom_sizes = in_files
    cmd = ('slopBed -i %s -s -r %s -l 0 -g %s | ' + \
            'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s') % (
                        in_bed,
                        cfg.getint('DEFAULT','fragment_size') - \
                                            cfg.getint('DEFAULT','tag_size'),
                        in_chrom_sizes, cfg.get('DEFAULT', 'genome'),
                        genome_path(), out_bedgraph)
    sys_call(cmd)
#
#    task1
#
@files(None, tempdir + 'a.1')
def task1(infiles, outfiles, *extra_params):
    """
    First task
    """
    test_job_io(infiles, outfiles, extra_params)



#
#    task2
#
@transform(task1, regex(r".*"), tempdir + 'b.1')
def task2(infiles, outfiles, *extra_params):
    """
    Second task
    """
    test_job_io(infiles, outfiles, extra_params)
    assert(infiles == tempdir + "a.1")



#
#    task3
#
@files(task2, tempdir + 'c.1')
def task3(infiles, outfiles, *extra_params):
    """
# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
@mkdir(tempdir)
@originate(original_files)
def generate_initial_files(out_name):
    with open(out_name, 'w') as outfile:
        pass


#
#    split_fasta_file
#

@posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))
@subdivide(generate_initial_files,
           # match original files
           regex(r".*\/original_(\d+).fa"),
           [tempdir + r"/files.split.\1.success",                         # flag file for each original file
            tempdir + r"/files.split.\1.*.fa"],                           # glob pattern
           r"\1")                                                        # index of original file
def split_fasta_file(input_file, outputs, original_index):

    #
    # remove previous fasta files
    #
    success_flag = outputs[0]
    output_file_names = outputs[1:]
    for f in output_file_names:
        os.unlink(f)

    #
    # create as many files as we are simulating in JOBS_PER_TASK
Example #49
0

# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
@follows(mkdir(tempdir))
@ruffus.files([[None, tempdir + "a.1"], [None, tempdir + "b.1"]])
def task1(i, o):
    touch(o)


@follows(mkdir(tempdir))
@ruffus.files([[None, tempdir + "c.1"], [None, tempdir + "d.1"]])
def task2(i, o):
    touch(o)


@transform(task1, regex(r"(.+)"), ruffus.inputs(((r"\1"), task2, "test_transform_inputs.*y")), r"\1.output")
def task3(i, o):
    names = ",".join(sorted(i))
    with open(o, "w") as ff:
        ff.write(names)


@merge((task3), tempdir + "final.output")
def task4(i, o):
    with open(o, "w") as o_file:
        for f in sorted(i):
            with open(f) as ff:
                o_file.write(f + ":" + ff.read() + ";")

Example #50
0
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    ghostscript.rasterize_pdf(
        input_file=input_file,
        output_file=output_file,
        xres=200,
        yres=200,
        raster_device='jpeggray',
        log=log)


@collate(
    input=[split_pages, rasterize_preview],
    filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
    output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def orient_page(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):

    page_pdf = next(ii for ii in infiles if ii.endswith('.page.pdf'))

    if not options.rotate_pages:
        re_symlink(page_pdf, output_file)
        return
    preview = next(ii for ii in infiles if ii.endswith('.preview.jpg'))
Example #51
0
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the FASTQ files
    #fastq_files = state.config.get_option('fastqs')
    fastq_files = glob.glob("fastqs/*.gz")
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('processed_fastqs')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/pass_samples')
    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/vardict')

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    pipeline.transform(
        task_func=stages.run_surecalltrimmer,
        name='run_surecalltrimmer',
        input=output_from('original_fastqs'),
        filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        add_inputs=add_inputs('fastqs/{sample[0]}_R2.fastq.gz'),
        #filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.fastq.gz'),
        #add_inputs=add_inputs('fastqs/{sample[0]}_R3_001.fastq.gz'),
        extras=['{sample[0]}'],
        # output only needs to know about one file to track progress of the pipeline, but the second certainly exists after this step.
        output='processed_fastqs/{sample[0]}_R1.processed.fastq.gz')
    #output='processed_fastqs/{sample[0]}_R1_001.processed.fastq.gz')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('run_surecalltrimmer'),
        filter=formatter(
            'processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.processed.fastq.gz'
        ),
        add_inputs=add_inputs(
            'processed_fastqs/{sample[0]}_R2.processed.fastq.gz'),
        #filter=formatter('processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.processed.fastq.gz'),
        #add_inputs=add_inputs('processed_fastqs/{sample[0]}_R3_001.processed.fastq.gz'),
        extras=['{sample[0]}'],
        output='alignments/{sample[0]}.bam')

    # Run locatit from agilent.  this should produce sorted bam files, so no sorting needed at the next step
    pipeline.collate(task_func=stages.run_locatit,
                     name='run_locatit',
                     input=output_from('align_bwa', 'original_fastqs'),
                     filter=regex(r'.+/(.+_L\d\d\d).+'),
                     output=r'alignments/\1.locatit.bam')

    pipeline.transform(task_func=stages.sort_bam,
                       name='sort_bam',
                       input=output_from('run_locatit'),
                       filter=suffix('.locatit.bam'),
                       output='.sorted.locatit.bam')

    # # # # # Metrics stages # # # # #
    # generate mapping metrics (post locatit)
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    # Intersect the bam file with the region of interest
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    # Calculate coverage metrics from the intersected bam file
    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    # Count the number of mapped reads
    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    # Count the number of on-target reads
    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    # Count the number of total reads
    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    # Generate summary metrics from the stats files produces
    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        #filter=regex(r'.+/(.+BS\d{4,6}.+S\d+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])
    # # # # # Metrics stages end # # # # #

    # # # # # Checking metrics and calling # # # # #
    # Originate to set the location of the metrics summary file
    (pipeline.originate(
        task_func=stages.grab_summary_file,
        name='grab_summary_file',
        output='all_sample.summary.txt').follows('generate_stats'))

    # Awk command to produce a list of bam files passing filters
    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt')

    # Touch passed bams to the pass_samples folder and pass the glob of that folder to HaplotypeCaller
    pipeline.subdivide(name='passed_filter_files',
                       task_func=stages.read_samples,
                       input=output_from('filter_stats'),
                       filter=formatter(),
                       output="metrics/pass_samples/*.bam")

    # Call variants using GATK
    (pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/gatk/{sample[0]}.g.vcf').follows('sort_bam'))

    # Call variants with vardict
    (pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}']).follows('sort_bam'))

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    return (pipeline)
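
# A minimal, hypothetical driver for the pipeline object built above (the
# `state` argument is assumed to come from this project's own configuration
# framework; sketch only):
def _run_haloplexpipe_sketch(state):
    pipeline = make_pipeline_map(state)
    pipeline.run(multiprocess=4)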
# ___________________________________________________________________________
def check_regex_out_of_range_regex_reference_error_task(infiles, outfile,
                                                        prefix1,
                                                        prefix2,
                                                        extension):
    raise Exception("Should blow up first")


test_pipeline = Pipeline("test")

test_pipeline.originate(task_func=generate_initial_files1,
                        output=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcdefghi"])

test_pipeline.transform(task_func=check_regex_task,
                        input=generate_initial_files1,
                        filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.tmp1)"),
                        output=r"\1/\g<PREFIX>\3.tmp2",  # output file
                        extras=[r"\2",                # extra: prefix = \2
                                r"\g<PREFIX>",        # extra: prefix = \2
                                r"\4"])               # extra: extension
test_pipeline.transform(task_func=check_regex_unmatched_task,
                        input=generate_initial_files1,
                        filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.xxx)"),
                        output=r"\1/\g<PREFIXA>\3.tmp2",  # output file
                        extras=[r"\2",                 # extra: prefix = \2
                                r"\g<PREFIX>",         # extra: prefix = \2
                                r"\4"])                # extra: extension

test_pipeline.transform(task_func=check_suffix_task,
                        input=generate_initial_files1,
                        filter=suffix(".tmp1"),
Example #53
0
import hts_waterworks.utils.get_bed_sequence as get_bed_sequence
import hts_waterworks.utils.sequence_motif as sequence_motif
import hts_waterworks.utils.sampling as sampling
import hts_waterworks.utils.motif_significance as motif_significance
from hts_waterworks.bootstrap import cfg, get_genome, genome_path
import hts_waterworks.call_peaks as call_peaks
import hts_waterworks.annotation as annotation


#from ipdb import set_trace as breakpoint

# motif setup

@transform(call_peaks.all_peak_caller_functions + 
          ['*.peaks_summits.%s_around' % cfg.get('peaks', 'peak_summit_size')],
        regex(r'(.*\.peaks$|.*\..*_around$|_genes.promoter.*_ext[\d]+$)'),
        r'\1.top%s.peaks' % cfg.getint('motifs', 'motif_chunk_size'),
        cfg.getint('motifs', 'motif_chunk_size'))
def get_top_peaks(in_peaks, out_subset, num_peaks_to_keep):
    """keep only the top peaks as input to motif discovery"""
    with open(in_peaks) as infile:
        seqs = list(readBedLines(infile, dataOnly=False))
        # sort by score, highest first
        seqs.sort(key=lambda x: int(x[4]), reverse=True)
        with open(out_subset, 'w') as outfile:
            subset = seqs[:num_peaks_to_keep]
            outfile.writelines('\t'.join(map(str, s)) + '\n' for s in subset)

#@follows(get_genome)
@transform([get_top_peaks], suffix(''), '.fasta')
def get_peak_sequence(in_peaks, out_fasta):
Example #54
0
        # with a space.  Don't know if Tesseract 3.02 does the same.

        regex_nested_single_quotes = re.compile(
            r"""title='image "([^"]*)";""")
        with open(badxml, mode='r', encoding='utf-8') as f_in, \
                open(output_file, mode='w', encoding='utf-8') as f_out:
            for line in f_in:
                line = regex_nested_single_quotes.sub(
                    r"""title='image " ";""", line)
                f_out.write(line)


@active_if(options.pdf_renderer == 'hocr')
@collate(
    input=[rasterize_with_ghostscript, preprocess_deskew, preprocess_clean],
    filter=regex(r".*/(\d{6})(?:\.page|\.pp-deskew|\.pp-clean)\.png"),
    output=os.path.join(work_folder, r'\1.image'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def select_image_for_pdf(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    if options.clean_final:
        image_suffix = '.pp-clean.png'
    elif options.deskew:
        image_suffix = '.pp-deskew.png'
    else:
        image_suffix = '.page.png'
    image = next(ii for ii in infiles if ii.endswith(image_suffix))
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

import unittest

import json


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
try:
    @transform(None, regex(tempdir + "b"), inputs(tempdir + "a", tempdir + "b"), "task_1.output")
    def task_1 (i, o):
        for f in o:
            open(f, 'w')
except ruffus.ruffus_exceptions.error_task_transform_inputs_multiple_args:
    print("\tExpected exception thrown 1")
except ruffus.ruffus_exceptions.error_inputs_multiple_args:
    print("\tExpected exception thrown 2")

def task_2 (i, o):
    for f in o:
        open(f, 'w')


class Test_task_mkdir(unittest.TestCase):
Example #56
0
           filter=suffix('.page.pdf'),
           output='.preview.jpg',
           output_dir=work_folder,
           extras=[_log, _pdfinfo, _pdfinfo_lock])
def rasterize_preview(input_file, output_file, log, pdfinfo, pdfinfo_lock):
    ghostscript.rasterize_pdf(input_file=input_file,
                              output_file=output_file,
                              xres=200,
                              yres=200,
                              raster_device='jpeggray',
                              log=log)


@collate(
    input=[split_pages, rasterize_preview],
    filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
    output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def orient_page(infiles, output_file, log, pdfinfo, pdfinfo_lock):

    page_pdf = next(ii for ii in infiles if ii.endswith('.page.pdf'))

    if not options.rotate_pages:
        re_symlink(page_pdf, output_file)
        return
    preview = next(ii for ii in infiles if ii.endswith('.preview.jpg'))

    orient_conf = tesseract.get_orientation(preview,
                                            language=options.language,
                                            timeout=options.tesseract_timeout,
                                            log=log)
Example #57
0
def checkFileExistence(infile, outfile):
    '''check whether file exists.

    Files are uncompressed before checking existence.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(infile,
                         outfile,
                         metric="file",
                         suffixes=P.as_list(
                             P.as_list(PARAMS.get('%s_regex_exist' % track,
                                                  ""))))


@collate((buildCheckSums, buildLineCounts, checkFileExistence),
         regex("([^.]*).(.*)"), r"\1.stats")
def mergeFileStatistics(infiles, outfile):
    '''merge all file statistics.'''

    to_cluster = False
    infiles = " ".join(sorted(infiles))

    statement = '''
    %(scriptsdir)s/merge_testing_output.sh
    %(infiles)s
    > %(outfile)s'''
    P.run(statement)


@merge(mergeFileStatistics, "md5_compare.tsv")
def compareCheckSums(infiles, outfile):
Example #58
0
def build_pipeline(options, work_folder, log, context):
    main_pipeline = Pipeline.pipelines['main']

    # Triage
    task_triage = main_pipeline.transform(
        task_func=triage,
        input=os.path.join(work_folder, 'origin'),
        filter=formatter('(?i)'),
        output=os.path.join(work_folder, 'origin.pdf'),
        extras=[log, context])

    task_repair_pdf = main_pipeline.transform(task_func=repair_pdf,
                                              input=task_triage,
                                              filter=suffix('.pdf'),
                                              output='.repaired.pdf',
                                              output_dir=work_folder,
                                              extras=[log, context])

    # Split (kwargs for split seems to be broken, so pass plain args)
    task_split_pages = main_pipeline.split(split_pages,
                                           task_repair_pdf,
                                           os.path.join(
                                               work_folder, '*.page.pdf'),
                                           extras=[log, context])

    # Rasterize preview
    task_rasterize_preview = main_pipeline.transform(
        task_func=rasterize_preview,
        input=task_split_pages,
        filter=suffix('.page.pdf'),
        output='.preview.jpg',
        output_dir=work_folder,
        extras=[log, context])
    task_rasterize_preview.active_if(options.rotate_pages)

    # Orient
    task_orient_page = main_pipeline.collate(
        task_func=orient_page,
        input=[task_split_pages, task_rasterize_preview],
        filter=regex(
            r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
        output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
        extras=[log, context])

    # Rasterize actual
    task_rasterize_with_ghostscript = main_pipeline.transform(
        task_func=rasterize_with_ghostscript,
        input=task_orient_page,
        filter=suffix('.ocr.oriented.pdf'),
        output='.page.png',
        output_dir=work_folder,
        extras=[log, context])

    # Preprocessing subpipeline
    task_preprocess_remove_background = main_pipeline.transform(
        task_func=preprocess_remove_background,
        input=task_rasterize_with_ghostscript,
        filter=suffix(".page.png"),
        output=".pp-background.png",
        extras=[log, context])

    task_preprocess_deskew = main_pipeline.transform(
        task_func=preprocess_deskew,
        input=task_preprocess_remove_background,
        filter=suffix(".pp-background.png"),
        output=".pp-deskew.png",
        extras=[log, context])

    task_preprocess_clean = main_pipeline.transform(
        task_func=preprocess_clean,
        input=task_preprocess_deskew,
        filter=suffix(".pp-deskew.png"),
        output=".pp-clean.png",
        extras=[log, context])

    task_select_ocr_image = main_pipeline.collate(
        task_func=select_ocr_image,
        input=[task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r"\1.ocr.png"),
        extras=[log, context])

    # HOCR OCR
    task_ocr_tesseract_hocr = main_pipeline.transform(
        task_func=ocr_tesseract_hocr,
        input=task_select_ocr_image,
        filter=suffix(".ocr.png"),
        output=".hocr",
        extras=[log, context])
    task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
    task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')
    if tesseract.v4():
        task_ocr_tesseract_hocr.jobs_limit(2)  # Uses multi-core on its own

    task_select_visible_page_image = main_pipeline.collate(
        task_func=select_visible_page_image,
        input=[
            task_rasterize_with_ghostscript, task_preprocess_remove_background,
            task_preprocess_deskew, task_preprocess_clean
        ],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r'\1.image'),
        extras=[log, context])
    task_select_visible_page_image.graphviz(shape='diamond')

    task_select_image_layer = main_pipeline.collate(
        task_func=select_image_layer,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.image-layer.pdf'),
        extras=[log, context])
    task_select_image_layer.graphviz(fillcolor='"#00cc66"', shape='diamond')
    task_select_image_layer.active_if(options.pdf_renderer == 'hocr'
                                      or options.pdf_renderer == 'tess4')

    task_render_hocr_page = main_pipeline.transform(
        task_func=render_hocr_page,
        input=task_ocr_tesseract_hocr,
        filter=suffix('.hocr'),
        output='.text.pdf',
        extras=[log, context])
    task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')

    task_render_hocr_debug_page = main_pipeline.collate(
        task_func=render_hocr_debug_page,
        input=[task_select_visible_page_image, task_ocr_tesseract_hocr],
        filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
        output=os.path.join(work_folder, r'\1.debug.pdf'),
        extras=[log, context])
    task_render_hocr_debug_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_debug_page.active_if(options.pdf_renderer == 'hocr')
    task_render_hocr_debug_page.active_if(options.debug_rendering)

    # Tesseract OCR + text only PDF
    task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_textonly_pdf,
        input=[task_select_ocr_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.ocr.png|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.text.pdf'),
        extras=[log, context])
    task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
    task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'tess4')
    if tesseract.v4():
        task_ocr_tesseract_textonly_pdf.jobs_limit(2)

    task_combine_layers = main_pipeline.collate(
        task_func=combine_layers,
        input=[
            task_render_hocr_page, task_ocr_tesseract_textonly_pdf,
            task_select_image_layer
        ],
        filter=regex(r".*/(\d{6})(?:\.text\.pdf|\.image-layer\.pdf)"),
        output=os.path.join(work_folder, r'\1.rendered.pdf'),
        extras=[log, context])
    task_combine_layers.graphviz(fillcolor='"#00cc66"')
    task_combine_layers.active_if(options.pdf_renderer == 'hocr'
                                  or options.pdf_renderer == 'tess4')

    # Tesseract OCR+PDF
    task_ocr_tesseract_and_render_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_and_render_pdf,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.rendered.pdf'),
        extras=[log, context])
    task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"')
    task_ocr_tesseract_and_render_pdf.active_if(
        options.pdf_renderer == 'tesseract')
    if tesseract.v4():
        task_ocr_tesseract_and_render_pdf.jobs_limit(2)  # Uses multi-core

    # PDF/A
    task_generate_postscript_stub = main_pipeline.transform(
        task_func=generate_postscript_stub,
        input=task_repair_pdf,
        filter=formatter(r'\.repaired\.pdf'),
        output=os.path.join(work_folder, 'pdfa.ps'),
        extras=[log, context])
    task_generate_postscript_stub.active_if(options.output_type == 'pdfa')

    # Bypass valve
    task_skip_page = main_pipeline.transform(
        task_func=skip_page,
        input=task_orient_page,
        filter=suffix('.skip.oriented.pdf'),
        output='.done.pdf',
        output_dir=work_folder,
        extras=[log, context])

    # Merge pages
    task_merge_pages_ghostscript = main_pipeline.merge(
        task_func=merge_pages_ghostscript,
        input=[
            task_combine_layers, task_render_hocr_debug_page, task_skip_page,
            task_ocr_tesseract_and_render_pdf, task_generate_postscript_stub
        ],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_ghostscript.active_if(options.output_type == 'pdfa')

    task_merge_pages_qpdf = main_pipeline.merge(
        task_func=merge_pages_qpdf,
        input=[
            task_combine_layers, task_render_hocr_debug_page, task_skip_page,
            task_ocr_tesseract_and_render_pdf, task_repair_pdf
        ],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_qpdf.active_if(options.output_type == 'pdf')

    # Finalize
    task_copy_final = main_pipeline.merge(
        task_func=copy_final,
        input=[task_merge_pages_ghostscript, task_merge_pages_qpdf],
        output=options.output_file,
        extras=[log, context])
Example #59
0
           r"StrandSpec.dir/\1.strand")
def strandSpecificity(infile, outfile):
    '''This function will determine the strand specificity of your library
    from the bam file'''

    statement = (
        "cgat bam2libtype "
        "--max-iterations 10000 "
        "< {infile} "
        "> {outfile}".format(**locals()))
    return P.run(statement)


@follows(mkdir("BamFiles.dir"))
@transform("*.bam",
           regex("(.*).bam$"),
           r"BamFiles.dir/\1.bam")
def intBam(infile, outfile):
    '''make an intermediate bam file if there is no sequence information.
    If there is no sequence quality then make a softlink. Picard tools
    has an issue when quality score information is missing'''

    if PARAMS["bam_sequence_stripped"] is True:
        bamstats.addPseudoSequenceQuality(infile,
                                          outfile)
    else:
        bamstats.copyBamFile(infile,
                             outfile)


@follows(mkdir("Picard_stats.dir"))