Example #1
    def test_newstyle_ruffus (self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(create_random_numbers, None, tempdir + "random_numbers.list")\
            .follows(mkdir(tempdir))


        test_pipeline.split(task_func = step_4_split_numbers_into_chunks,
                       input = tempdir + "random_numbers.list",
                       output = tempdir + "*.chunks")\
            .follows(create_random_numbers)

        test_pipeline.transform(task_func = step_5_calculate_sum_of_squares,
                           input = step_4_split_numbers_into_chunks,
                           filter = suffix(".chunks"),
                           output = ".sums")

        test_pipeline.merge(task_func = step_6_calculate_variance,
                            input = step_5_calculate_sum_of_squares,
                            output = os.path.join(tempdir, "variance.result"))\
            .posttask(lambda: sys.stdout.write("     hooray\n"))\
            .posttask(print_hooray_again, print_whoppee_again,
                      touch_file(os.path.join(tempdir, "done")))

        test_pipeline.run(multiprocess = 50, verbose = 0)
        output_file = os.path.join(tempdir, "variance.result")
        if not os.path.exists (output_file):
            raise Exception("Missing %s" % output_file)
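The task functions wired up above (create_random_numbers, step_4_split_numbers_into_chunks, step_5_calculate_sum_of_squares, step_6_calculate_variance) are defined elsewhere in the original test module. The sketch below is only an illustration of what some of them might look like, assuming Ruffus's usual convention of passing each job's input and output paths as the leading positional arguments; the bodies, counts and chunk size are made up.

import os
import random

def create_random_numbers(input_file_name, output_file_name):
    # input is None here: the task creates its output from scratch
    with open(output_file_name, "w") as oo:
        for _ in range(200):
            oo.write("%d\n" % random.randint(0, 100))

def step_4_split_numbers_into_chunks(input_file_name, output_files):
    # split the list into *.chunks files next to the input file
    with open(input_file_name) as ii:
        numbers = ii.read().split()
    for start in range(0, len(numbers), 10):
        chunk_name = os.path.join(os.path.dirname(input_file_name),
                                  "%03d.chunks" % start)
        with open(chunk_name, "w") as oo:
            oo.write("\n".join(numbers[start:start + 10]) + "\n")

def step_5_calculate_sum_of_squares(input_file_name, output_file_name):
    # one job per .chunks file, suffix(".chunks") -> ".sums"
    with open(input_file_name) as ii:
        values = [float(vv) for vv in ii.read().split()]
    with open(output_file_name, "w") as oo:
        oo.write("%g\n%d\n" % (sum(vv * vv for vv in values), len(values)))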
Example #2
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and turns them into GATK and UNDR ROVER VCFs.
    pipeline = Pipeline(name='genericpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')

        stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
Example #3
    def create_pipeline(self):
        """
        Create new pipeline on the fly without using decorators
        """
        global count_pipelines
        count_pipelines = count_pipelines + 1
        test_pipeline = Pipeline("test %d" % count_pipelines)

        test_pipeline.transform(task_func=transform1,
                                input=input_file,
                                filter=suffix('.txt'),
                                output='.output',
                                extras=[runtime_data])

        test_pipeline.transform(task_func=transform_raise_error,
                                input=input_file,
                                filter=suffix('.txt'),
                                output='.output',
                                extras=[runtime_data])

        test_pipeline.split(task_func=split1,
                            input=input_file,
                            output=split1_outputs)

        test_pipeline.merge(task_func=merge2,
                            input=split1,
                            output=merge2_output)
        return test_pipeline
Example #4
    def test_newstyle_ruffus(self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(create_random_numbers, None, tempdir + "random_numbers.list")\
            .follows(mkdir(tempdir))


        test_pipeline.split(task_func = step_4_split_numbers_into_chunks,
                       input = tempdir + "random_numbers.list",
                       output = tempdir + "*.chunks")\
            .follows(create_random_numbers)

        test_pipeline.transform(task_func=step_5_calculate_sum_of_squares,
                                input=step_4_split_numbers_into_chunks,
                                filter=suffix(".chunks"),
                                output=".sums")

        test_pipeline.merge(task_func = step_6_calculate_variance,
                            input = step_5_calculate_sum_of_squares,
                            output = os.path.join(tempdir, "variance.result"))\
            .posttask(lambda: sys.stdout.write("     hooray\n"))\
            .posttask(print_hooray_again, print_whoppee_again,
                      touch_file(os.path.join(tempdir, "done")))

        test_pipeline.run(multiprocess=50, verbose=0)
        output_file = os.path.join(tempdir, "variance.result")
        if not os.path.exists(output_file):
            raise Exception("Missing %s" % output_file)
Example #5
def build_pipeline():

    pipe = Pipeline("my_pipeline")

    pipe.originate(
        name="create_three_new_files",
        task_func=create_new_file,
        output=[os.path.join(WORK_DIR, f"file{i}.csv") for i in range(1, 4)],
    )

    pipe.transform(
        name="convert_csv_files_to_tsv",
        task_func=csv_to_tsv,
        input=output_from("create_three_new_files"),
        filter=suffix(".csv"),
        output=".tsv",
    )

    pipe.transform(
        name="calculate_md5",
        task_func=md5,
        input=output_from("convert_csv_files_to_tsv"),
        filter=suffix(".tsv"),
        output=".md5sum",
    )

    return pipe
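The three task functions that build_pipeline() wires together are not shown in the example. A minimal sketch follows; only the names (create_new_file, csv_to_tsv, md5) and WORK_DIR come from the pipeline definition above, the bodies are assumptions.

import hashlib

def create_new_file(output_file):
    # originate jobs receive only the output path
    with open(output_file, "w") as oo:
        oo.write("a,b,c\n1,2,3\n")

def csv_to_tsv(input_file, output_file):
    with open(input_file) as ii, open(output_file, "w") as oo:
        oo.write(ii.read().replace(",", "\t"))

def md5(input_file, output_file):
    with open(input_file, "rb") as ii:
        digest = hashlib.md5(ii.read()).hexdigest()
    with open(output_file, "w") as oo:
        oo.write(digest + "\n")

The assembled pipeline can then be executed with, for example, build_pipeline().run(multiprocess=2, verbose=1).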
Example #6
    def test_newstyle_ruffus(self):
        # alternative syntax
        test_pipeline = Pipeline("test")

        test_pipeline.mkdir(data_dir, work_dir)
        test_pipeline.originate(task_func=task1,
                                output=[os.path.join(data_dir, "%s.1" % aa) for aa in "abcd"])

        test_pipeline.mkdir(filter=suffix(".1"),
                            output=".dir",
                            output_dir=work_dir)

        test_pipeline.transform(task_func=task2,
                                input=task1,
                                filter=suffix(".1"),
                                output=[".1", ".bak"],
                                extras=["extra.tst", 4, r"orig_dir=\1"],
                                output_dir=work_dir)

        test_pipeline.subdivide(task3, task2, suffix(".1"),
                                r"\1.*.2", [r"\1.a.2", r"\1.b.2"],
                                output_dir=data_dir)
        test_pipeline.transform(task4, task3, suffix(".2"), ".3",
                                output_dir=work_dir)
        test_pipeline.merge(task5, task4, os.path.join(data_dir, "summary.5"))
        test_pipeline.run(multiprocess=50, verbose=0)

        with open(os.path.join(data_dir, "summary.5")) as ii:
            active_text = ii.read()
        if active_text != expected_active_text:
            raise Exception("Error:\n\tExpected\n%s\nInstead\n%s\n" %
                            (expected_active_text, active_text))
Example #7
    def create_pipeline(self):
        """
        Create new pipeline on the fly without using decorators
        """
        global count_pipelines
        count_pipelines = count_pipelines + 1
        test_pipeline = Pipeline("test %d" % count_pipelines)

        test_pipeline.transform(task_func=transform1,
                                input=input_file,
                                filter=suffix('.txt'),
                                output='.output',
                                extras=[runtime_data])

        test_pipeline.transform(task_func=transform_raise_error,
                                input=input_file,
                                filter=suffix('.txt'),
                                output='.output',
                                extras=[runtime_data])

        test_pipeline.split(task_func=split1,
                            input=input_file,
                            output=split1_outputs)

        test_pipeline.merge(task_func=merge2,
                            input=split1,
                            output=merge2_output)
        return test_pipeline
Example #8
    def test_newstyle_mkdir_run(self):
        test_pipeline = Pipeline("test")

        test_pipeline.split(task_func=generate_initial_files1,
                            input=1,
                            output=[
                                tempdir + "/" + prefix + "_name.tmp1"
                                for prefix in "abcd"
                            ])

        test_pipeline.transform( task_func = test_transform,
                                 input     = generate_initial_files1,
                                 filter    = formatter(),
                                 output    = "{path[0]}/{basename[0]}.dir/{basename[0]}.tmp2")\
            .mkdir(tempdir + "/test1")\
            .mkdir(tempdir + "/test2")\
            .mkdir(generate_initial_files1, formatter(),
                        ["{path[0]}/{basename[0]}.dir", 3, "{path[0]}/{basename[0]}.dir2"])

        test_pipeline.mkdir(test_transform2, tempdir + "/test3")\
            .mkdir(generate_initial_files1, formatter(),
                    "{path[0]}/{basename[0]}.dir2")
        cleanup_tmpdir()
        pipeline_run([test_transform, test_transform2],
                     verbose=0,
                     multiprocess=2,
                     pipeline="main")
Example #9
    def test_transform_with_missing_formatter_args_b(self):
        test_pipeline = Pipeline("test")

        test_pipeline.originate(task_func=generate_initial_files,
                                output=[os.path.join(tempdir, ff + ".tmp") for ff in "abcd"])\
            .mkdir(tempdir)

        test_pipeline.transform(
            task_func=transform_with_missing_formatter_args,
            input=generate_initial_files,
            filter=formatter(),
            output="{path[0]}/{basename[0]}.task1",
            extras=['echo {dynamic_message} > {some_file}'])
        s = StringIO()
        test_pipeline.printout(s, [transform_with_missing_formatter_args],
                               verbose=4,
                               wrap_width=10000,
                               pipeline="test")
        self.assertIn("Unmatched field {dynamic_message}", s.getvalue())

        # log to stream
        s = StringIO()
        logger = t_stream_logger(s)
        test_pipeline.run([transform_with_missing_formatter_args],
                          verbose=5,
                          pipeline="test",
                          logger=logger)
        self.assertIn("Unmatched field {dynamic_message}", s.getvalue())
Example #10
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='md5')
    # Get a list of paths to all the input files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.md5_checksum,
        name='md5_checksum',
        input=output_from('original_files'),
        filter=suffix(''),
        output='.md5')


    return pipeline
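The Stages class referenced above lives elsewhere in the md5 pipeline's package. A hypothetical, self-contained stand-in that matches the two stage names used by the pipeline could look like this (the real implementation presumably shells out to an md5 tool via the pipeline's state and config machinery):

import hashlib

class Stages(object):
    def __init__(self, state):
        self.state = state

    def original_files(self, output):
        # dummy stage: the input files already exist, nothing to do
        pass

    def md5_checksum(self, input_file, output_file):
        with open(input_file, "rb") as ii:
            digest = hashlib.md5(ii.read()).hexdigest()
        with open(output_file, "w") as oo:
            oo.write("%s  %s\n" % (digest, input_file))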
Example #11
    def test_newstyle_ruffus(self):

        test_pipeline = Pipeline("test")

        test_pipeline.split(task_func=split_fasta_file,
                            input=tempdir + "original.fa",
                            output=[tempdir + "files.split.success",
                                    tempdir + "files.split.*.fa"])\
            .posttask(lambda: verbose_output.write("    Split into %d files\n" % 10))

        test_pipeline.transform(task_func=align_sequences,
                                input=split_fasta_file,
                                filter=suffix(".fa"),
                                output=".aln"                     # fa -> aln
                                )\
            .posttask(lambda: verbose_output.write("    Sequences aligned\n"))

        test_pipeline.transform(task_func=percentage_identity,
                                input=align_sequences,      # find all results from align_sequences
                                # replace suffix with:
                                filter=suffix(".aln"),
                                output=[r".pcid",  # .pcid suffix for the result
                                        r".pcid_success"]  # .pcid_success to indicate job completed
                                )\
            .posttask(lambda: verbose_output.write("    %Identity calculated\n"))

        test_pipeline.merge(task_func=combine_results,
                            input=percentage_identity,
                            output=[tempdir + "all.combine_results",
                                    tempdir + "all.combine_results_success"])\
            .posttask(lambda: verbose_output.write("    Results recombined\n"))

        test_pipeline.run(multiprocess=50, verbose=0)
        if not os.path.exists(tempdir + "all.combine_results"):
            raise Exception("Missing %s" % (tempdir + "all.combine_results"))
Example #12
    def test_newstyle_task(self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(task1, [[None, tempdir + "a.1"], [None, tempdir + "b.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.files(task2, [[None, tempdir + "c.1"], [None, tempdir + "d.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.transform(task_func=task3,
                                input=task1,
                                filter=regex(r"(.+)"),
                                replace_inputs=ruffus.inputs(
                                    ((r"\1"), task2,
                                     "test_transform_inputs.*y")),
                                output=r"\1.output")
        test_pipeline.merge(task4, (task3), tempdir + "final.output")

        test_pipeline.run([task4], multiprocess=10, verbose=0)

        correct_output = "{tempdir}a.1.output:test_transform_inputs.py,{tempdir}a.1,{tempdir}c.1,{tempdir}d.1;{tempdir}b.1.output:test_transform_inputs.py,{tempdir}b.1,{tempdir}c.1,{tempdir}d.1;".format(
            tempdir=tempdir)
        with open(tempdir + "final.output") as ff:
            real_output = ff.read()
        self.assertEqual(correct_output, real_output)
Example #13
 def test_newstyle_simpler (self):
     test_pipeline = Pipeline("test")
     test_pipeline.originate(task1, input_file_names, extras = [logger_proxy, logging_mutex])
     test_pipeline.transform(task2, task1, suffix(".1"), ".2", extras = [logger_proxy, logging_mutex])
     test_pipeline.transform(task3, task2, suffix(".2"), ".3", extras = [logger_proxy, logging_mutex])
     test_pipeline.merge(task4, task3, final_file_name, extras = [logger_proxy, logging_mutex])
     #test_pipeline.merge(task4, task3, final_file_name, extras = {"logger_proxy": logger_proxy, "logging_mutex": logging_mutex})
     test_pipeline.run(multiprocess = 500, verbose = 0)
Example #14
    def test_newstyle_collate(self):
        """
        As above but create pipeline on the fly using object orientated syntax rather than decorators
        """

        #
        # Create pipeline on the fly, joining up tasks
        #
        test_pipeline = Pipeline("test")

        test_pipeline.originate(task_func   = generate_initial_files,
                                output      = original_files)\
            .mkdir(tempdir, tempdir+"/test")


        test_pipeline.subdivide(    task_func   = split_fasta_file,
                                    input       = generate_initial_files,
                                    filter      = regex(r".*\/original_(\d+).fa"),       # match original files
                                    output      = [tempdir + r"/files.split.\1.success", # flag file for each original file
                                                   tempdir + r"/files.split.\1.*.fa"],   # glob pattern
                                    extras      = [r"\1"])\
            .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))


        test_pipeline.transform(task_func   = align_sequences,
                                input       = split_fasta_file,
                                filter      = suffix(".fa"),
                                output      = ".aln")  \
            .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

        test_pipeline.transform(task_func   = percentage_identity,
                                input       = align_sequences,             # find all results from align_sequences
                                filter      = suffix(".aln"),             # replace suffix with:
                                output      = [r".pcid",                  #   .pcid suffix for the result
                                               r".pcid_success"]         #   .pcid_success to indicate job completed
                                )\
            .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))


        test_pipeline.collate(task_func   = combine_results,
                              input       = percentage_identity,
                              filter      = regex(r".*files.split\.(\d+)\.\d+.pcid"),
                              output      = [tempdir + r"/\1.all.combine_results",
                                             tempdir + r"/\1.all.combine_results_success"])\
            .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

        #
        # Cleanup, printout and run
        #
        self.cleanup_tmpdir()
        s = StringIO()
        test_pipeline.printout(s, [combine_results],
                               verbose=5,
                               wrap_width=10000)
        self.assertTrue(
            re.search('Job needs update:.*Missing files.*', s.getvalue(),
                      re.DOTALL) is not None)
        test_pipeline.run(verbose=0)
Example #15
def make_pipeline1(
        pipeline_name,  # Pipelines need to have a unique name
        starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(
        task_func=task_m_to_1,
        name="add_input",
        # Lookup Task from function name task_originate()
        #   So long as this is unique in the pipeline
        input=task_originate,
        # requires an anchor from 3.7 onwards, see
        # https://bugs.python.org/issue34982
        filter=regex(r"^(.*)"),
        add_inputs=add_inputs(tempdir + "/testdir/whatever.txt"),
        output=r"\1.22")
    test_pipeline.transform(
        task_func=task_1_to_1,
        name="22_to_33",
        # Lookup Task from Task name
        #   Function name is not unique in the pipeline
        input=output_from("add_input"),
        filter=suffix(".22"),
        output=".33")
    tail_task = test_pipeline.transform(
        task_func=task_1_to_1,
        name="33_to_44",
        # Ask Pipeline to lookup Task from Task name
        input=test_pipeline["22_to_33"],
        filter=suffix(".33"),
        output=".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
Example #16
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='ovarian_cancer_pipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    human_reference_genome_file = state.config.get_option('human_reference_genome')
    # Stages are dependent on the state
    stages = PipelineStages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # The human reference genome in FASTA format
    pipeline.originate(
        task_func=stages.human_reference_genome,
        name='human_reference_genome',
        output=human_reference_genome_file)

    # Index the human reference genome with BWA, needed before we can map reads
    pipeline.transform(
        task_func=stages.index_ref_bwa,
        name='index_ref_bwa',
        input=output_from('human_reference_genome'),
        filter=suffix('.fa'),
        output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Align paired end reads in FASTQ to the reference producing a BAM file
    (pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name. 
        # This will be the first input to the stage.
        # We assume the sample name consists only of alphanumeric
        # characters and underscores.
        filter=formatter('.+/(?P<sample>[_a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the stage (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage to look up
        # sample-specific configuration options.
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')
        .follows('index_ref_bwa'))


    return pipeline
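Because of add_inputs, each align_bwa job receives the matched R1 file plus the added R2 file as its inputs, the BAM path as its output, and the sample name from extras. The sketch below only illustrates that calling signature; in the real project align_bwa is a method on PipelineStages, and the bwa/samtools command line here is an assumption, not taken from the original code.

def align_bwa(inputs, bam_out, sample):
    # inputs == (R1 fastq, R2 fastq) because of add_inputs above
    fastq_read1, fastq_read2 = inputs
    command = ("bwa mem -M -R '@RG\\tID:{sample}\\tSM:{sample}' reference.fa "
               "{r1} {r2} | samtools sort -o {bam} -").format(
                   sample=sample, r1=fastq_read1, r2=fastq_read2, bam=bam_out)
    return command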
Example #17
    def test_newstyle_collate(self):
        """
        As above but create pipeline on the fly using object orientated syntax rather than decorators
        """

        #
        # Create pipeline on the fly, joining up tasks
        #
        test_pipeline = Pipeline("test")

        test_pipeline.originate(task_func=generate_initial_files,
                                output=original_files)\
            .mkdir(tempdir, tempdir+"/test")

        test_pipeline.subdivide(task_func=split_fasta_file,
                                input=generate_initial_files,
                                # match original files
                                filter=regex(r".*\/original_(\d+).fa"),
                                output=[tempdir + r"/files.split.\1.success",  # flag file for each original file
                                        tempdir + r"/files.split.\1.*.fa"],   # glob pattern
                                extras=[r"\1"])\
            .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))

        test_pipeline.transform(task_func=align_sequences,
                                input=split_fasta_file,
                                filter=suffix(".fa"),
                                output=".aln")  \
            .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

        test_pipeline.transform(task_func=percentage_identity,
                                input=align_sequences,             # find all results from align_sequences
                                # replace suffix with:
                                filter=suffix(".aln"),
                                output=[r".pcid",  # .pcid suffix for the result
                                        r".pcid_success"]  # .pcid_success to indicate job completed
                                )\
            .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))

        test_pipeline.collate(task_func=combine_results,
                              input=percentage_identity,
                              filter=regex(r".*files.split\.(\d+)\.\d+.pcid"),
                              output=[tempdir + r"/\1.all.combine_results",
                                      tempdir + r"/\1.all.combine_results_success"])\
            .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

        #
        # Cleanup, printout and run
        #
        self.cleanup_tmpdir()
        s = StringIO()
        test_pipeline.printout(s, [combine_results],
                               verbose=5, wrap_width=10000)
        self.assertTrue(re.search(
            'Job needs update:.*Missing files.*', s.getvalue(), re.DOTALL) is not None)
        test_pipeline.run(verbose=0)
Example #18
def make_pipeline1(pipeline_name,   # Pipelines need to have a unique name
                   starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(task_func=task_m_to_1,
                            name="add_input",
                            # Lookup Task from function name task_originate()
                            #   So long as this is unique in the pipeline
                            input=task_originate,
                            # requires an anchor from 3.7 onwards, see
                            # https://bugs.python.org/issue34982
                            filter=regex(r"^(.*)"),
                            add_inputs=add_inputs(
                                tempdir + "/testdir/whatever.txt"),
                            output=r"\1.22")
    test_pipeline.transform(task_func=task_1_to_1,
                            name="22_to_33",
                            # Lookup Task from Task name
                            #   Function name is not unique in the pipeline
                            input=output_from("add_input"),
                            filter=suffix(".22"),
                            output=".33")
    tail_task = test_pipeline.transform(task_func=task_1_to_1,
                                        name="33_to_44",
                                        # Ask Pipeline to lookup Task from Task name
                                        input=test_pipeline["22_to_33"],
                                        filter=suffix(".33"),
                                        output=".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
Example #19
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='test_pipeline')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    pipeline.transform(
        task_func=stages.stage1,
        name='stage1',
        input=output_from('original_files'),
        filter=suffix('.0'),
        output='.1')

    pipeline.transform(
        task_func=stages.stage2,
        name='stage2',
        input=output_from('stage1'),
        filter=suffix('.1'),
        output='.2')

    pipeline.transform(
        task_func=stages.stage3,
        name='stage3',
        input=output_from('stage2'),
        filter=suffix('.2'),
        output='.3')

    pipeline.transform(
        task_func=stages.stage4,
        name='stage4',
        input=output_from('stage3'),
        filter=suffix('.3'),
        output='.4')

    pipeline.transform(
        task_func=stages.stage5,
        name='stage5',
        input=output_from('stage4'),
        filter=suffix('.4'),
        output='.5')

    return pipeline
Example #20
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='twin ion')
    # Get a list of paths to all the MZML files
    mzml_files = state.config.get_option('mzml')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original MZML files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_mzml,
        name='original_mzml',
        output=mzml_files)

    pipeline.transform(
        task_func=stages.resample,
        name='resample',
        input=output_from('original_mzml'),
        filter=suffix('.mzML'),
        output='.resample.mzML')

    pipeline.transform(
        task_func=stages.noise_filter_sgolay,
        name='noise_filter_sgolay',
        input=output_from('resample'),
        filter=suffix('.resample.mzML'),
        output='.denoise.mzML')

    pipeline.transform(
        task_func=stages.baseline_filter,
        name='baseline_filter',
        input=output_from('noise_filter_sgolay'),
        filter=suffix('.denoise.mzML'),
        output='.baseline.mzML')

    pipeline.transform(
        task_func=stages.peak_picker_hires,
        name='peak_picker_hires',
        input=output_from('baseline_filter'),
        filter=suffix('.baseline.mzML'),
        output='.peaks.mzML')

    pipeline.transform(
        task_func=stages.feature_finder_centroid,
        name='feature_finder_centroid',
        input=output_from('peak_picker_hires'),
        filter=suffix('.peaks.mzML'),
        output='.featureXML')

    return pipeline
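Each stage of this 'twin ion' pipeline wraps an OpenMS TOPP tool. The stage methods themselves are defined elsewhere; the function below is only a hypothetical sketch of one of them, and the tool invocation (tool name, flags) is an assumption rather than the project's actual command line.

import subprocess

def resample(input_mzml, output_mzml):
    # hypothetical: run an OpenMS TOPP tool on a single mzML file
    subprocess.check_call(["Resampler", "-in", input_mzml, "-out", output_mzml])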
Example #21
    def test_newstyle_no_re_match(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_1, tempdir + "a").mkdir(tempdir)
        test_pipeline.transform(task_2, task_1, regex("b"), "task_2.output")


        save_to_str_logger = t_save_to_str_logger()
        test_pipeline.run(multiprocess = 10, logger = save_to_str_logger, verbose = 1)
        print(save_to_str_logger.warning_str)
        self.assertTrue("no file names matched" in save_to_str_logger.warning_str)
        print("\n    Warning printed out correctly", file=sys.stderr)
Example #22
    def test_newstyle_no_re_match(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_1, tempdir + "a").mkdir(tempdir)
        test_pipeline.transform(task_2, task_1, regex("b"), "task_2.output")

        save_to_str_logger = t_save_to_str_logger()
        test_pipeline.run(
            multiprocess=10, logger=save_to_str_logger, verbose=1)
        print(save_to_str_logger.warning_str)
        self.assertTrue(
            "no file names matched" in save_to_str_logger.warning_str)
        print("\n    Warning printed out correctly", file=sys.stderr)
Example #23
 def test_newstyle_task(self):
     """
     Same as above but construct a new pipeline on the fly without decorators
     """
     test_pipeline = Pipeline("test")
     test_pipeline.files(task1, None, tempdir + 'a.1')\
         .follows(mkdir(tempdir))
     test_pipeline.transform(task_func=task2,
                             input=task1,
                             filter=regex(r".*"),
                             output=tempdir + 'b.1')
     test_pipeline.files(task3, task2, tempdir + 'c.1')
     test_pipeline.files(task4, [[None, tempdir + 'd.1'], [None, tempdir + 'e.1']])\
         .follows(task3)
     test_pipeline.files(task5, task4, tempdir + "f.1")
     test_pipeline.run(multiprocess=10, verbose=0)
Example #24
 def test_newstyle_no_re_match(self):
     try:
         test_pipeline = Pipeline("test")
         test_pipeline.transform(task_func = task_2,
                                 input = None,
                                 filter = regex(tempdir + "b"),
                                 replace_inputs = inputs(tempdir + "a", tempdir + "b"),
                                 output = "task_1.output")
         test_pipeline.run(multiprocess = 10, verbose = 0)
     except ruffus.ruffus_exceptions.error_task_transform_inputs_multiple_args:
         print("\tExpected exception thrown 1")
         return
     except ruffus.ruffus_exceptions.error_inputs_multiple_args:
         print("\tExpected exception thrown 2")
         return
     raise Exception("Inputs(...) with multiple arguments should have thrown an exception")
Example #25
 def test_newstyle_task(self):
     """
     Same as above but construct a new pipeline on the fly without decorators
     """
     test_pipeline = Pipeline("test")
     test_pipeline.files(task1, None, tempdir + 'a.1')\
         .follows(mkdir(tempdir))
     test_pipeline.transform(task_func   = task2,
                             input       = task1,
                             filter      = regex(r".*"),
                             output      = tempdir + 'b.1')
     test_pipeline.files(task3, task2, tempdir + 'c.1')
     test_pipeline.files(task4, [[None, tempdir + 'd.1'], [None, tempdir + 'e.1']])\
         .follows(task3)
     test_pipeline.files(task5, task4, tempdir + "f.1")
     test_pipeline.run(multiprocess = 10, verbose = 0)
Example #26
 def test_newstyle_no_re_match(self):
     try:
         test_pipeline = Pipeline("test")
         test_pipeline.transform(task_func=task_2,
                                 input=None,
                                 filter=regex(tempdir + "b"),
                                 replace_inputs=inputs(
                                     tempdir + "a", tempdir + "b"),
                                 output="task_1.output")
         test_pipeline.run(multiprocess=10, verbose=0)
     except ruffus.ruffus_exceptions.error_task_transform_inputs_multiple_args:
         print("\tExpected exception thrown 1")
         return
     except ruffus.ruffus_exceptions.error_inputs_multiple_args:
         print("\tExpected exception thrown 2")
         return
     raise Exception(
         "Inputs(...) with multiple arguments should have thrown an exception"
     )
Example #27
def make_pipeline2( pipeline_name = "pipeline2"):
    test_pipeline2 = Pipeline(pipeline_name)
    test_pipeline2.transform(task_func = task_1_to_1,
                             # task name
                             name      = "44_to_55",
                             # placeholder: will be replaced later with set_input()
                             input     = None,
                             filter    = suffix(".44"),
                             output    = ".55")
    test_pipeline2.merge(task_func = task_m_to_1,
                         input     = test_pipeline2["44_to_55"],
                         output    = tempdir + "/final.output")

    # Set head and tail
    test_pipeline2.set_tail_tasks([test_pipeline2[task_m_to_1]])
    if not DEBUG_do_not_define_head_task:
        test_pipeline2.set_head_tasks([test_pipeline2["44_to_55"]])

    return test_pipeline2
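make_pipeline1() and make_pipeline2() are meant to be chained: the head task of pipeline2 ("44_to_55") starts with input=None and is filled in later from the tail of a pipeline1 instance. A sketch of the join, following Ruffus's sub-pipeline syntax (the starting file names here are placeholders):

pipeline1 = make_pipeline1(pipeline_name="pipeline1",
                           starting_file_names=[tempdir + "/" + ss
                                                for ss in ("a.1", "b.1")])
pipeline2 = make_pipeline2()

# feed pipeline1's tail task into pipeline2's head task, then run pipeline2;
# Ruffus pulls in the upstream pipeline automatically
pipeline2.set_input(input=pipeline1)
pipeline2.run(multiprocess=10, verbose=0)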
Example #28
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and turns them into GATK and UNDR ROVER VCFs.
    pipeline = Pipeline(name='hiplexpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')

        stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')
    safe_make_dir('variants/undr_rover/coverdir')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('passed_filter_files'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='variants/undr_rover/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    #### concatenate undr_rover vcfs ####
    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('apply_undr_rover'),
        filter=formatter('variants/undr_rover/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/undr_rover/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).clipped.sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
Example #29
    def create_pipeline(self):
        #each pipeline has a different name
        global cnt_pipelines
        cnt_pipelines = cnt_pipelines + 1
        test_pipeline = Pipeline("test %d" % cnt_pipelines)

        test_pipeline.originate(
            task_func=generate_initial_files1,
            output=[tempdir + prefix + "_name.tmp1" for prefix in "abcd"])

        test_pipeline.originate(
            task_func=generate_initial_files2,
            output=[tempdir + "e_name.tmp1", tempdir + "f_name.tmp1"])

        test_pipeline.originate(
            task_func=generate_initial_files3,
            output=[tempdir + "g_name.tmp1", tempdir + "h_name.tmp1"])

        test_pipeline.originate(task_func=generate_initial_files4,
                                output=tempdir + "i_name.tmp1")

        test_pipeline.collate(task_func=test_task2,
                              input=[
                                  generate_initial_files1,
                                  generate_initial_files2,
                                  generate_initial_files3,
                                  generate_initial_files4
                              ],
                              filter=formatter(),
                              output="{path[0]}/all.tmp2")

        test_pipeline.transform(task_func=test_task3,
                                input=test_task2,
                                filter=suffix(".tmp2"),
                                output=".tmp3")

        test_pipeline.transform(task_func=test_task4,
                                input=test_task3,
                                filter=suffix(".tmp3"),
                                output=".tmp4")
        return test_pipeline
Example #30
 def test_newstyle_simpler(self):
     test_pipeline = Pipeline("test")
     test_pipeline.originate(task1,
                             input_file_names,
                             extras=[logger_proxy, logging_mutex])
     test_pipeline.transform(task2,
                             task1,
                             suffix(".1"),
                             ".2",
                             extras=[logger_proxy, logging_mutex])
     test_pipeline.transform(task3,
                             task2,
                             suffix(".2"),
                             ".3",
                             extras=[logger_proxy, logging_mutex])
     test_pipeline.merge(task4,
                         task3,
                         final_file_name,
                         extras=[logger_proxy, logging_mutex])
     #test_pipeline.merge(task4, task3, final_file_name, extras = {"logger_proxy": logger_proxy, "logging_mutex": logging_mutex})
     test_pipeline.run(multiprocess=500, verbose=0)
Example #31
    def test_newstyle_mkdir_run(self):
        test_pipeline = Pipeline("test")

        test_pipeline.split(task_func = generate_initial_files1,
                            input = 1,
                            output = [tempdir +  "/" + prefix + "_name.tmp1" for prefix in "abcd"])

        test_pipeline.transform( task_func = test_transform,
                                 input     = generate_initial_files1,
                                 filter    = formatter(),
                                 output    = "{path[0]}/{basename[0]}.dir/{basename[0]}.tmp2")\
            .mkdir(tempdir + "/test1")\
            .mkdir(tempdir + "/test2")\
            .mkdir(generate_initial_files1, formatter(),
                        ["{path[0]}/{basename[0]}.dir", 3, "{path[0]}/{basename[0]}.dir2"])

        test_pipeline.mkdir(test_transform2, tempdir + "/test3")\
            .mkdir(generate_initial_files1, formatter(),
                    "{path[0]}/{basename[0]}.dir2")
        cleanup_tmpdir()
        pipeline_run([test_transform, test_transform2],
                     verbose=0, multiprocess=2, pipeline="main")
Example #32
 def test_newstyle_ruffus (self):
     test_pipeline = Pipeline("test")
     test_pipeline.originate(start_task, ["a.1", "b.1"])
     test_pipeline.transform(same_file_name_task, start_task, suffix(".1"), ".1")
     test_pipeline.transform(linked_file_name_task, start_task, suffix(".1"), ".linked.1")
     test_pipeline.transform(final_task, [linked_file_name_task, same_file_name_task], suffix(".1"), ".3")
     test_pipeline.run(log_exceptions = True, verbose = 0)
Example #33
def make_pipeline2(pipeline_name="pipeline2"):
    test_pipeline2 = Pipeline(pipeline_name)
    test_pipeline2.transform(
        task_func=task_1_to_1,
        # task name
        name="44_to_55",
        # placeholder: will be replaced later with set_input()
        input=None,
        filter=suffix(".44"),
        output=".55")
    test_pipeline2.merge(
        task_func=task_m_to_1,
        input=test_pipeline2["44_to_55"],
        output=tempdir + "/final.output",
    )

    # Set head and tail
    test_pipeline2.set_tail_tasks([test_pipeline2[task_m_to_1]])
    if not DEBUG_do_not_define_head_task:
        test_pipeline2.set_head_tasks([test_pipeline2["44_to_55"]])

    return test_pipeline2
Example #34
    def test_newstyle_ruffus(self):
        # alternative syntax
        test_pipeline = Pipeline("test")

        test_pipeline.mkdir(data_dir, work_dir)
        test_pipeline.originate(
            task_func=task1,
            output=[os.path.join(data_dir, "%s.1" % aa) for aa in "abcd"])

        test_pipeline.mkdir(filter=suffix(".1"),
                            output=".dir",
                            output_dir=work_dir)

        test_pipeline.transform(task_func=task2,
                                input=task1,
                                filter=suffix(".1"),
                                output=[".1", ".bak"],
                                extras=["extra.tst", 4, r"orig_dir=\1"],
                                output_dir=work_dir)

        test_pipeline.subdivide(task3,
                                task2,
                                suffix(".1"),
                                r"\1.*.2", [r"\1.a.2", r"\1.b.2"],
                                output_dir=data_dir)
        test_pipeline.transform(task4,
                                task3,
                                suffix(".2"),
                                ".3",
                                output_dir=work_dir)
        test_pipeline.merge(task5, task4, os.path.join(data_dir, "summary.5"))
        test_pipeline.run(multiprocess=50, verbose=0)

        with open(os.path.join(data_dir, "summary.5")) as ii:
            active_text = ii.read()
        if active_text != expected_active_text:
            raise Exception("Error:\n\tExpected\n%s\nInstead\n%s\n" %
                            (expected_active_text, active_text))
Example #35
    def test_transform_with_missing_formatter_args_b(self):
        test_pipeline = Pipeline("test")


        test_pipeline.originate(task_func   = generate_initial_files,
                                output      = [os.path.join(tempdir, ff + ".tmp") for ff in "abcd"])\
            .mkdir(tempdir)


        test_pipeline.transform(task_func   = transform_with_missing_formatter_args,
                                input       = generate_initial_files,
                                filter      = formatter(),
                                output      = "{path[0]}/{basename[0]}.task1",
                                extras      =['echo {dynamic_message} > {some_file}'])
        s = StringIO()
        test_pipeline.printout(s, [transform_with_missing_formatter_args],
                               verbose=4, wrap_width=10000, pipeline="test")
        self.assertIn("Missing key = {dynamic_message}", s.getvalue())

        # log to stream
        s = StringIO()
        logger = t_stream_logger(s)
        test_pipeline.run([transform_with_missing_formatter_args],
                          verbose=5, pipeline="test", logger=logger)
        self.assertIn("Missing key = {dynamic_message}", s.getvalue())
Example #36
    def create_pipeline(self):
        #each pipeline has a different name
        global cnt_pipelines
        cnt_pipelines = cnt_pipelines + 1
        test_pipeline = Pipeline("test %d" % cnt_pipelines)

        test_pipeline.originate(task_func   = generate_initial_files1,
                                output      = [tempdir + prefix + "_name.tmp1" for prefix in "abcd"])

        test_pipeline.originate(task_func   = generate_initial_files2,
                                output      = [tempdir +  "e_name.tmp1", tempdir +  "f_name.tmp1"])

        test_pipeline.originate(task_func   = generate_initial_files3,
                                output      = [tempdir +  "g_name.tmp1", tempdir +  "h_name.tmp1"])

        test_pipeline.originate(task_func   = generate_initial_files4,
                                output      = tempdir +  "i_name.tmp1")

        test_pipeline.collate(  task_func   = test_task2,
                                input       = [generate_initial_files1,
                                               generate_initial_files2,
                                               generate_initial_files3,
                                               generate_initial_files4],
                                filter      = formatter(),
                                output      = "{path[0]}/all.tmp2")

        test_pipeline.transform(task_func   = test_task3,
                                input       = test_task2,
                                filter      = suffix(".tmp2"),
                                output      = ".tmp3")

        test_pipeline.transform(task_func   = test_task4,
                                input       = test_task3,
                                filter      = suffix(".tmp3"),
                                output      = ".tmp4")
        return test_pipeline
Example #37
    def test_newstyle_task(self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(task1, [[None, tempdir + "a.1"], [None, tempdir + "b.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.files(task2, [[None, tempdir + "c.1"], [None, tempdir + "d.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.transform(task_func=task3,
                                input=task1,
                                filter=regex(r"(.+)"),
                                replace_inputs=ruffus.inputs(
                                    ((r"\1"), task2, "test_transform_inputs.*y")),
                                output=r"\1.output")
        test_pipeline.merge(task4, (task3), tempdir + "final.output")

        test_pipeline.run([task4], multiprocess=10, verbose=0)

        correct_output = "{tempdir}a.1.output:test_transform_inputs.py,{tempdir}a.1,{tempdir}c.1,{tempdir}d.1;{tempdir}b.1.output:test_transform_inputs.py,{tempdir}b.1,{tempdir}c.1,{tempdir}d.1;".format(
            tempdir=tempdir)
        with open(tempdir + "final.output") as ff:
            real_output = ff.read()
        self.assertEqual(correct_output, real_output)
Example #38
    def test_newstyle_ruffus(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_func=task1,
                                output=[tempdir + 'a.1'] + runtime_files)
        test_pipeline.transform(task2, task1, suffix(".1"), ".2")
        test_pipeline.transform(task_func=task3,
                                input=task2,
                                filter=suffix(".2"),
                                output=".3")
        test_pipeline.transform(task_func=task4,
                                input=runtime_parameter("a"),
                                filter=suffix(".3"),
                                output=".4").follows(task3)
        test_pipeline.run(verbose=0, runtime_data={"a": runtime_files})
Example #39
    def test_newstyle_ruffus(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_func=task1,
                                output=[tempdir + 'a.1'] + runtime_files)
        test_pipeline.transform(task2, task1, suffix(".1"), ".2")
        test_pipeline.transform(task_func=task3,
                                input=task2,
                                filter=suffix(".2"),
                                output=".3")
        test_pipeline.transform(task_func=task4,
                                input=runtime_parameter("a"),
                                filter=suffix(".3"),
                                output=".4").follows(task3)
        test_pipeline.run(verbose=0, runtime_data={"a": runtime_files})
Example #40
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='fastq2bam')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_files,
                       name='original_files',
                       output=input_files)

    pipeline.transform(
        task_func=stages.fastq2bam,
        name='fastq2bam',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        extras=['{sample[0]}'],
        output='{path[0]}/out/{sample[0]}.bam')

    pipeline.transform(
        task_func=stages.validate_prealigned_bam,
        name='validate_prealigned_bam',
        input=output_from('fastq2bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.validation')

    pipeline.transform(
        task_func=stages.align,
        name='align',
        input=output_from('validate_prealigned_bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).validation'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.bam'),
        output='{path[0]}/{sample[0]}.mapped.bam')

    return pipeline
Example #41
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name="radpipe")

    # Stages are dependent on the state
    stages = PipelineStages(state)

    # Get a list of library objects.
    libraries = parse_libraries(
        libraries=state.config.get_options("libraries"))

    # Get a list of input files
    input_files = [l.files for l in libraries]
    # input_files = [item for sublist in input_files for item in sublist]
    state.logger.info("Input files: " + str(input_files))

    # Get a list of all samples for each library
    samples_dict = OrderedDict()
    for l in libraries:
        samples_dict[l.name] = l.samples
    state.logger.debug("Samples: " + str(samples_dict))

    # Make sure that there are no duplicate samples
    sample_list = [
        item for sublist in samples_dict.values() for item in sublist
    ]
    sample_counts = Counter(sample_list)
    for sample in sample_counts:
        if sample_counts[sample] > 1:
            print("Sample {} appears {} times in the barcodes files. "
                  "Sample names must be unique".format(sample,
                                                       sample_counts[sample]))
            sys.exit(radpipe.error_codes.INVALID_INPUT_FILE)

    # Define output directories
    output_dir = get_output_paths(state)
    state.logger.debug(output_dir)

    # Allow multiple comma-separated tasks
    if len(state.options.target_tasks) == 1:
        state.options.target_tasks = state.options.target_tasks[0].split(",")
    if len(state.options.forced_tasks) == 1:
        state.options.forced_tasks = state.options.forced_tasks[0].split(",")
    state.logger.debug("Target tasks: " + str(state.options.target_tasks))
    state.logger.debug("Forced tasks: " + str(state.options.forced_tasks))

    # Check if alignment_method is valid
    alignment_method = state.config.get_options(
        "alignment_method").strip().lower()
    if alignment_method not in ["bwa mem", "bowtie"]:
        print("Error: Invalid alignment_method in config file. " \
              "Valid options are ['bwa mem', 'bowtie'].")
        sys.exit(radpipe.error_codes.INVALID_ARGUMENT)
    if alignment_method == "bwa mem":
        align_task_name = "bwa_mem"
        index_task_name = "bwa_index"
    else:
        align_task_name = "bowtie"
        index_task_name = "bowtie_index"

    # TODO: Refactor this
    # If 'alignment' is in target_tasks or forced_tasks, specify which
    # type of alignment job
    if "alignment" in state.options.target_tasks:
        index = state.options.target_tasks.index("alignment")
        state.options.target_tasks[index] = align_task_name
    if "alignment" in state.options.forced_tasks:
        index = state.options.forced_tasks.index("alignment")
        state.options.forced_tasks[index] = align_task_name

    # If 'build_index' is in target_tasks or forced_tasks, specify which
    # type of index job
    if "build_index" in state.options.target_tasks:
        index = state.options.target_tasks.index("build_index")
        state.options.target_tasks[index] = index_task_name
    if "build_index" in state.options.forced_tasks:
        index = state.options.forced_tasks.index("build_index")
        state.options.forced_tasks[index] = index_task_name
    state.logger.debug(state)

    # Whether to include filter_bam stage or not
    filter_bams = False
    try:
        samtools_view_options = state.config.get_options(
            "samtools_view_options")
        if samtools_view_options:
            filter_bams = True
    except:
        pass
    state.logger.info("Filter bams: {}".format(filter_bams))

    # Population map filenames
    popmap_file = "{output_dir}/{name}_popmap.txt".format(
        output_dir=output_dir["populations"],
        name=state.config.get_options("analysis_id"))
    try:
        config_popmap_file = state.config.get_options("popmap_file")
        if config_popmap_file:
            state.logger.info(
                "Using popmap file: {}".format(config_popmap_file))
        else:
            raise Exception
    except Exception:
        config_popmap_file = None
        state.logger.info("Creating new popmap file: {}".format(popmap_file))

    # Population r values
    populations_r = state.config.get_options("populations_r")
    assert isinstance(populations_r, list)

    # Dummy stages. These do nothing except provide a node at the beginning
    # for the pipeline graph, giving the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.do_nothing,
                       name="original_fastqs",
                       output=input_files)

    pipeline.originate(task_func=stages.do_nothing,
                       name="reference_genome",
                       output=state.config.get_options("reference_genome"))

    # Create a copy of the population map file needed for stacks, or create
    # one denovo using the sample list.
    pipeline.originate(task_func=stages.create_popmap_file,
                       name="create_popmap_file",
                       output=[popmap_file],
                       extras=[config_popmap_file, sample_list])
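    # The popmap file uses the Stacks convention of one tab-separated
    # "sample<TAB>population" line per sample; when no popmap_file is
    # configured, create_popmap_file presumably assigns every sample to a
    # single default population, e.g.:
    #
    #     sampleA    1
    #     sampleB    1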

    # Create index for reference genome based on alignment method.
    if alignment_method == "bwa mem":
        pipeline.transform(
            task_func=stages.bwa_index,
            name="bwa_index",
            input=output_from("reference_genome"),
            filter=formatter(".+/(?P<ref>[^/]+).(fa|fasta)"),
            output=path_list_join(output_dir["reference"],
                                  ["reference.fa.bwt", "reference.fa.sa"]),
            extras=[output_dir["reference"]])

    if alignment_method == "bowtie":
        pipeline.transform(task_func=stages.bowtie_index,
                           name="bowtie_index",
                           input=output_from("reference_genome"),
                           filter=formatter(".+/(?P<ref>[^/]+).(fa|fasta)"),
                           output=path_list_join(
                               output_dir["reference"],
                               ["reference.1.ebwt", "reference.rev.1.ebwt"]),
                           extras=[output_dir["reference"]])

    # FastQC
    pipeline.transform(
        task_func=stages.fastqc,
        name="fastqc",
        input=output_from("original_fastqs"),
        filter=formatter(".+/(?P<lib>[^/]+)/(?P<fn>[^/]+).(fastq|fq).gz"),
        output="%s/{lib[0]}/{fn[0]}_fastqc.zip" % output_dir["fastqc"],
        extras=[output_dir["fastqc"], "{lib[0]}"])

    # MultiQC: FastQC
    pipeline.merge(task_func=stages.multiqc_fastqc,
                   name="multiqc_fastqc",
                   input=output_from("fastqc"),
                   output="%s/multiqc_fastqc_report.html" % output_dir["qc"],
                   extras=[output_dir["qc"], output_dir["fastqc"]])

    # Stacks: Process RAD-Tags
    pipeline.transform(task_func=stages.process_radtags,
                       name="process_radtags",
                       input=output_from("original_fastqs"),
                       filter=formatter(".+/(?P<lib>[^/]+)/[^/]+"),
                       output="%s/{lib[0]}/{lib[0]}.success" %
                       output_dir["process_radtags"],
                       extras=[
                           output_dir["process_radtags"], "{lib[0]}",
                           state.config.get_options("renz_1"),
                           state.config.get_options("renz_2"),
                           state.config.get_options("process_radtags_options")
                       ])

    # Create a list for alignment with the input fastq files from process_radtags
    process_radtags_outputs = []
    for l in libraries:
        for s in l.samples:
            base = "{dir}/{lib}/{sample}".format(
                dir=output_dir["process_radtags"], lib=l.lib_id, sample=s)
            process_radtags_outputs.append(
                [base + ".1.fq.gz", base + ".2.fq.gz"])
    # print(process_radtags_outputs)
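    # process_radtags_outputs is a list of [R1, R2] pairs, one per sample, e.g.
    # (hypothetical paths):
    # [["<radtags_dir>/lib1/sampleA.1.fq.gz", "<radtags_dir>/lib1/sampleA.2.fq.gz"], ...]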

    # Alignment
    if align_task_name == "bwa_mem":
        (pipeline.transform(
            task_func=stages.bwa_align,
            name=align_task_name,
            input=process_radtags_outputs,
            filter=formatter(".+/(?P<sm>[^/]+).1.fq.gz"),
            output="%s/{sm[0]}.bwa.bam" % output_dir["alignments"],
            extras=[
                os.path.join(output_dir["reference"], "reference.fa"),
                "{path[0]}", output_dir["alignments"], "{sm[0]}",
                state.config.get_options("alignment_options")
            ])).follows("bwa_index").follows("process_radtags")

    if align_task_name == "bowtie":
        (pipeline.transform(
            task_func=stages.bowtie_align,
            name=align_task_name,
            input=process_radtags_outputs,
            filter=formatter(".+/(?P<sm>[^/]+).1.fq.gz"),
            output="%s/{sm[0]}.bowtie.bam" % output_dir["alignments"],
            extras=[
                os.path.join(output_dir["reference"], "reference"),
                "{path[0]}", output_dir["alignments"], "{sm[0]}",
                state.config.get_options("alignment_options")
            ])).follows("bowtie_index").follows("process_radtags")

    # Sort BAM and index
    pipeline.transform(task_func=stages.sort_bam,
                       name="sort_bam",
                       input=output_from(align_task_name),
                       filter=suffix(".bam"),
                       output=".sorted.bam")

    if filter_bams:
        final_bam_task_name = "filter_bam"
        pipeline.transform(
            task_func=stages.filter_bam,
            name="filter_bam",
            input=output_from("sort_bam"),
            filter=suffix(".sorted.bam"),
            output=".sorted.filtered.bam",
            extras=[state.config.get_options("samtools_view_options")])
    else:
        final_bam_task_name = "sort_bam"

    # Samtools flagstat
    pipeline.transform(task_func=stages.flagstat,
                       name="flagstat",
                       input=output_from(final_bam_task_name),
                       filter=suffix(".bam"),
                       output=".flagstat.txt",
                       output_dir=output_dir["flagstat"])

    # MultiQC: flagstat
    pipeline.merge(task_func=stages.multiqc_flagstat,
                   name="multiqc_flagstat",
                   input=output_from("flagstat"),
                   output="%s/multiqc_flagstat_report.html" % output_dir["qc"],
                   extras=[output_dir["qc"], output_dir["flagstat"]])

    # Stacks: gstacks
    pipeline.merge(task_func=stages.gstacks,
                   name="gstacks",
                   input=output_from(final_bam_task_name),
                   output="%s/catalog.fa.gz" % output_dir["gstacks"],
                   extras=[
                       output_dir["alignments"], output_dir["gstacks"],
                       align_task_name, final_bam_task_name, sample_list,
                       state.config.get_options("gstacks_options")
                   ])

    # Define outputs from each run of populations
    populations_outputs = []
    for r in populations_r:
        dir_name = "{pop_dir}/{analysis_name}_r{r}".format(
            pop_dir=output_dir["populations"],
            analysis_name=state.config.get_options("analysis_id"),
            r=r)
        populations_outputs.append(
            os.path.join(dir_name, "populations.snps.vcf"))
    # print(populations_outputs)
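    # e.g. for hypothetical populations_r = [0.5, 0.75], this yields
    # <pop_dir>/<analysis_id>_r0.5/populations.snps.vcf and
    # <pop_dir>/<analysis_id>_r0.75/populations.snps.vcf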

    # Stacks: populations
    pipeline.originate(task_func=stages.populations,
                       name="popluations",
                       output=populations_outputs,
                       extras=[
                           output_dir["gstacks"], output_dir["populations"],
                           popmap_file,
                           state.config.get_options("populations_options")
                       ]).follows("gstacks").follows("create_popmap_file")

    return pipeline
Beispiel #42
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        # new sample name = OHI031002-P02F04
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)-(?P<tumor>[TN]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'),

        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}-{tumor[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),

        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}', '{tumor[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{sample[0]}_{tumor[0]}.bam')
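    # Note that the filter regex above also expects a tumour/normal tag after
    # the sample name: a hypothetical R1 file OHI031002-P02F04-T_S318_L001_R1_001.fastq
    # is captured as sample = "OHI031002-P02F04", tumor = "T", readid = "S318",
    # lane = "L001", lib = "001"; the matching R2 file is added as a second
    # input and the output becomes alignments/OHI031002-P02F04/OHI031002-P02F04_T.bam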

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # High quality and primary alignments
    pipeline.transform(
        task_func=stages.primary_bam,
        name='primary_bam',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        output='.primary.bam')

    # index bam file
    pipeline.transform(
        task_func=stages.index_sort_bam_picard,
        name='index_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.bam.bai')

    # Clip the primer_seq from BAM File
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam')
        .follows('index_bam'))

    ###### GATK VARIANT CALLING - MuTect2 ######

    # Call somatic variants using MuTect2
    pipeline.transform(
        task_func=stages.call_mutect2_gatk,
        name='call_mutect2_gatk',
        input=output_from('clip_bam'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+)_T.primary.primerclipped.bam'),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_N.primary.primerclipped.bam'),
        # extras=['{sample[0]}'],
        output='variants/mutect2/{sample[0]}.mutect2.vcf')
        # .follows('clip_bam')

    ###### GATK VARIANT CALLING - MuTect2 ######

    # -------- VEP ----------
    # Apply NORM
    (pipeline.transform(
        task_func=stages.apply_vt,
        name='apply_vt',
        input=output_from('call_mutect2_gatk'),
        filter=suffix('.mutect2.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vcf')
        .follows('call_mutect2_gatk'))
    #
    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('apply_vt'),
        filter=suffix('.mutect2.vt.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vep.vcf')
        .follows('apply_vt'))
    #
    # Apply vcfanno
    (pipeline.transform(
        task_func=stages.apply_vcfanno,
        name='apply_vcfanno',
        input=output_from('apply_vep'),
        filter=suffix('.mutect2.vt.vep.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.annotated.vcf')
        .follows('apply_vep'))

    return pipeline
Beispiel #43
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')
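    # e.g. a hypothetical reads/sample_01_R1.fastq.gz is captured as
    # sample = "sample_01"; reads/sample_01_R2.fastq.gz is added as the second
    # input and the BAM is written alongside it as reads/sample_01.bam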

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(task_func=stages.chrom_intervals_gatk,
                       name='chrom_intervals_gatk',
                       input=output_from('mark_duplicates_picard'),
                       filter=suffix('.sort.dedup.bam'),
                       output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam').follows(
            'mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(task_func=stages.base_recalibration_gatk,
                       name='base_recalibration_gatk',
                       input=output_from('local_realignment_gatk'),
                       filter=suffix('.sort.dedup.realn.bam'),
                       output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam').follows(
            'local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(task_func=stages.call_variants_gatk,
                       name='call_variants_gatk',
                       input=output_from('print_reads_gatk'),
                       filter=suffix('.sort.dedup.realn.recal.bam'),
                       output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_variants_gatk'),
                   output='COMPLEXO.mergedgvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.mergedgvcf.vcf'),
                       output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(task_func=stages.snp_recalibrate_gatk,
                       name='snp_recalibrate_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.genotyped.vcf'),
                       output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['COMPLEXO.snp_recal', 'COMPLEXO.snp_tranches']),
        output='.recal_SNP.vcf').follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(
            ['COMPLEXO.indel_recal', 'COMPLEXO.indel_tranches']),
        output='.recal_INDEL.vcf').follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['COMPLEXO.recal_INDEL.vcf']),
        output='.combined.vcf').follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(task_func=stages.select_variants_gatk,
                       name='select_variants_gatk',
                       input=output_from('combine_variants_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.selected.vcf')

    return pipeline
Beispiel #44
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name="rnapipe")

    # Get the details of the experiment (samples, config, inputs, ...)
    experiment = Experiment(state)

    # Get reference file locations
    reference_genome = state.config.get_options("reference_genome")
    gene_ref = state.config.get_options("gene_ref")

    # Print out samples
    sample_text = [s.info() for s in experiment.sample_list]
    logging.info("Analysis samples:\n{}".format("\n".join(sample_text)))

    # Stages are dependent on the state. Experiment object is also passed so
    # we can access metadata later.
    stages = PipelineStages(state, experiment=experiment)

    # Make directories
    output_dir = get_output_paths(
        results_dir=state.config.get_options("results_dir"),
        default_paths=OUTPUT_PATHS)
    make_output_dirs(output_dir)
    logging.debug(output_dir)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.do_nothing,
                       name="original_fastqs",
                       output=experiment.R1_files)

    # Create reference index for alignment
    if not experiment.index_provided:
        pipeline.originate(task_func=stages.do_nothing,
                           name="reference_genome",
                           output=reference_genome)

        if experiment.alignment_method == "star":
            # Create reference index for STAR
            pipeline.transform(task_func=stages.create_star_index,
                               name="create_star_index",
                               input=output_from("reference_genome"),
                               filter=formatter(".*"),
                               add_inputs=add_inputs(gene_ref),
                               output=path_list_join(
                                   output_dir["star_index"],
                                   ["SA", "Genome", "genomeParameters.txt"]),
                               extras=[output_dir["star_index"]])

        elif experiment.alignment_method == "hisat2":
            # Create reference index for HISAT2
            hisat_basename = path.join(output_dir["hisat_index"], "genome")
            pipeline.transform(
                task_func=stages.create_hisat_index,
                name="create_hisat_index",
                input=output_from("reference_genome"),
                filter=formatter(".*"),
                add_inputs=add_inputs(gene_ref),
                output=path_list_join(output_dir["hisat_index"],
                                      ["genome.1.ht2", "genome.2.ht2"]),
                extras=[hisat_basename])
    else:
        # Don't create index if index is supplied
        if experiment.alignment_method == "star":
            output_dir["star_index"] = state.config.get_options("star_index")
            pipeline.originate(task_func=stages.do_nothing,
                               name="create_star_index",
                               output=path_list_join(
                                   output_dir["star_index"],
                                   ["SA", "Genome", "genomeParameters.txt"]))
        elif experiment.alignment_method == "hisat2":
            hisat_basename = state.config.get_options("hisat_index")
            output_dir["hisat_index"] = path.dirname(hisat_basename)
            prefix = path.basename(hisat_basename)
            pipeline.originate(task_func=stages.do_nothing,
                               name="create_hisat_index",
                               output=path_list_join(
                                   output_dir["hisat_index"], [
                                       "{prefix}.1.ht2".format(prefix=prefix),
                                       "{prefix}.2.ht2".format(prefix=prefix)
                                   ]))

    # Pre-trim FastQC
    if experiment.paired_end:
        pipeline.transform(
            task_func=stages.fastqc,
            name="fastqc",
            input=output_from("original_fastqs"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
            output=path_list_join(
                output_dir["fastqc"],
                ["{sample[0]}_R1_fastqc.zip", "{sample[0]}_R2_fastqc.zip"]),
            extras=[output_dir["fastqc"]])
    else:
        pipeline.transform(task_func=stages.fastqc,
                           name="fastqc",
                           input=output_from("original_fastqs"),
                           filter=suffix(".fastq.gz"),
                           output="_fastqc.zip",
                           output_dir=output_dir["fastqc"],
                           extras=[output_dir["fastqc"]])

    # Trimmomatic
    if experiment.trim_reads and experiment.paired_end:
        pipeline.transform(
            task_func=stages.trim_reads,
            name="trim_reads",
            input=output_from("original_fastqs"),
            # Get R1 file and the corresponding R2 file
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
            output=path_list_join(output_dir["seq"], [
                "{sample[0]}_R1.trimmed.fastq.gz",
                "{sample[0]}_R2.trimmed.fastq.gz"
            ]),
            extras=path_list_join(output_dir["seq"], [
                "{sample[0]}_R1.unpaired.fastq.gz",
                "{sample[0]}_R2.unpaired.fastq.gz"
            ]))
    elif experiment.trim_reads:
        pipeline.transform(
            task_func=stages.trim_reads,
            name="trim_reads",
            input=output_from("original_fastqs"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            output=path.join(output_dir["seq"],
                             "{sample[0]}_R1.trimmed.fastq.gz"))

    # Post-trim FastQC
    if experiment.paired_end and experiment.trim_reads:
        pipeline.transform(
            task_func=stages.fastqc,
            name="post_trim_fastqc",
            input=output_from("trim_reads"),
            filter=formatter(
                ".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.trimmed.fastq.gz"),
            output=path_list_join(output_dir["post_trim_fastqc"], [
                "{sample[0]}_R1.trimmed_fastqc.gz",
                "{sample[0]}_R2.trimmed_fastqc.gz"
            ]),
            extras=["results/qc/post_trim_fastqc/"])
    elif experiment.trim_reads:
        pipeline.transform(task_func=stages.fastqc,
                           name="post_trim_fastqc",
                           input=output_from("trim_reads"),
                           filter=suffix(".trimmed.fastq.gz"),
                           output=".trimmed_fastqc.gz",
                           output_dir=output_dir["post_trim_fastqc"],
                           extras=[output_dir["post_trim_fastqc"]])

    # If there are technical replicates, each is mapped independently.
    # This is so each technical replicate maintains a separate read group.
    if experiment.alignment_method == "star":
        align_task_name = "star_align"
        if experiment.trim_reads:
            (pipeline.transform(
                task_func=stages.star_align,
                name=align_task_name,
                input=output_from("trim_reads"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)" \
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.star.Aligned.out.bam" \
                        % output_dir["alignments"],
                extras=[output_dir["star_index"], "{sample[0]}"])
            ).follows("create_star_index")
        else:
            (pipeline.transform(
                task_func=stages.star_align,
                name=align_task_name,
                input=output_from("original_fastqs"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)" \
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.star.Aligned.out.bam" \
                        % output_dir["alignments"],
                extras=[output_dir["star_index"], "{sample[0]}"])
            ).follows("create_star_index")

    if experiment.alignment_method == "hisat2":
        align_task_name = "hisat_align"
        if experiment.trim_reads:
            (pipeline.transform(
                task_func=stages.hisat_align,
                name="hisat_align",
                input=output_from("trim_reads"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)" \
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.hisat2.bam" \
                        % output_dir["alignments"],
                extras=[hisat_basename, "{sample[0]}"])
            ).follows("create_hisat_index")
        else:
            (pipeline.transform(
                task_func=stages.hisat_align,
                name="hisat_align",
                input=output_from("original_fastqs"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)" \
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.hisat2.bam" \
                        % output_dir["alignments"],
                extras=[hisat_basename, "{sample[0]}"])
            ).follows("create_hisat_index")

    # Sort BAM by coordinates
    pipeline.transform(
        task_func=stages.sort_bam_by_coordinate,
        name="sort_bam_by_coordinate",
        input=output_from(align_task_name),
        filter=formatter(
            ".+/(?P<sample>[a-zA-Z0-9-_]+)\.(?P<method>(star|hisat2))\..*bam"),
        output=[
            "{path[0]}/{sample[0]}.{method[0]}.sorted.bam",
            "{path[0]}/{sample[0]}.{method[0]}.sorted.bam.bai"
        ])
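    # e.g. a hypothetical alignments/sampleA/sampleA.star.Aligned.out.bam is
    # captured as sample = "sampleA" and method = "star", producing
    # sampleA.star.sorted.bam plus its .bam.bai index in the same directory.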

    # Merge files with the same sample name
    if experiment.multiple_technical_replicates:
        pipeline.collate(
            task_func=stages.merge_bams,
            name="merge_bams",
            input=output_from("sort_bam_by_coordinate"),
            filter=formatter(
                ".+/(SM_)?(?P<sm>[a-zA-Z0-9-]+)[^.]*\.(?P<method>(star|hisat2)).sorted.bam"
            ),
            output=path_list_join(
                output_dir["alignments"],
                ["{sm[0]}.{method[0]}.bam", "{sm[0]}.{method[0]}.bam.bai"]))
    else:
        pipeline.transform(
            task_func=stages.create_symlinks,
            name="merge_bams",
            input=output_from("sort_bam_by_coordinate"),
            filter=formatter(
                ".+/(SM_)?(?P<sm>[a-zA-Z0-9-]+)[^.]*\.(?P<method>(star|hisat2)).sorted.bam"
            ),
            output=path_list_join(
                output_dir["alignments"],
                ["{sm[0]}.{method[0]}.bam", "{sm[0]}.{method[0]}.bam.bai"]))

    # Sort BAM by name for counting features
    pipeline.transform(task_func=stages.sort_bam_by_name,
                       name="sort_bam_by_name",
                       input=output_from("merge_bams"),
                       filter=suffix(".bam"),
                       output=".nameSorted.bam")

    # Count features with HTSeq-count
    pipeline.transform(task_func=stages.htseq_count,
                       name="htseq_count",
                       input=output_from("sort_bam_by_name"),
                       filter=suffix(".nameSorted.bam"),
                       output_dir=output_dir["counts"],
                       output=".htseq.txt")

    # Count features with featureCounts
    pipeline.transform(task_func=stages.featurecounts,
                       name="featurecounts",
                       input=output_from("sort_bam_by_name"),
                       filter=suffix(".nameSorted.bam"),
                       output_dir=output_dir["counts"],
                       output=".featureCounts.txt")

    # TODO: add multiqc step

    #     # Stringtie assembly
    #     pipeline.transform(
    #         task_func=stages.stringtie_assembly,
    #         name="stringtie_assembly",
    #         input=output_from("merge_bams"),
    #         filter=suffix(".bam"),
    #         output_dir=output_dir["stringtie_assembly"],
    #         output=".gtf")

    # Stringtie estimates
    pipeline.transform(
        task_func=stages.stringtie_estimates,
        name="stringtie_estimates",
        input=output_from("merge_bams"),
        filter=formatter(
            ".+/(?P<sm>[a-zA-Z0-9-]+)\.(?P<method>(star|hisat2)).bam"),
        output=path_list_join(output_dir["stringtie_estimates"],
                              ["{sm[0]}/{sm[0]}.gtf", "{sm[0]}/e_data.ctab"]))

    # Stringtie counts
    pipeline.collate(
        task_func=stages.stringtie_prepDE,
        name="stringtie_prepDE",
        input=output_from("stringtie_estimates"),
        filter=formatter(".+\.gtf"),
        output=path_list_join(
            output_dir["stringtie_estimates"],
            ["gene_count_matrix.csv", "transcript_count_matrix.csv"]))
    return pipeline
Beispiel #45
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='crpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    #pipeline.originate(
    #    task_func=stages.original_reference,
    #    name='original_reference',
    #    output=reference_file)

    # Run fastQC on the FASTQ files
    pipeline.transform(task_func=stages.fastqc,
                       name='fastqc',
                       input=output_from('original_fastqs'),
                       filter=suffix('.fastq.gz'),
                       output='_fastqc')

    # Index the reference using BWA
    #pipeline.transform(
    #    task_func=stages.index_reference_bwa,
    #    name='index_reference_bwa',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #    name='index_reference_samtools',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output='.fa.fai')

    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])

    # # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort alignment with sambamba
    pipeline.transform(task_func=stages.sort_bam_sambamba,
                       name='sort_alignment',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.sorted.bam')

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name='extract_genes_bedtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.mmr.bam')

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name='extract_chromosomes_samtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.chroms.bam')

    # Index the MMR genes bam file with samtools
    pipeline.transform(task_func=stages.index_bam,
                       name='index_mmr_alignment',
                       input=output_from('extract_genes_bedtools'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).mmr.bam'),
                       output='{path[0]}/{sample[0]}.mmr.bam.bai')

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    #pipeline.transform(
    #    task_func=stages.alignment_coverage_gatk,
    #    name='alignment_coverage_gatk',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs([reference_file]),
    #    output='{path[0]}/{sample[0]}.coverage_summary',
    #    extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name='index_alignment',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.sorted.bam.bai')

    # Generate alignment stats with bamtools
    pipeline.transform(task_func=stages.bamtools_stats,
                       name='bamtools_stats',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.stats.txt')

    # Extract the discordant paired-end alignments
    pipeline.transform(task_func=stages.extract_discordant_alignments,
                       name='extract_discordant_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.discordants.unsorted.bam')

    # Extract split-read alignments
    pipeline.transform(task_func=stages.extract_split_read_alignments,
                       name='extract_split_read_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.splitters.unsorted.bam')

    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_discordants',
        input=output_from('extract_discordant_alignments'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.discordants'],
        output='{path[0]}/{sample[0]}.discordants.bam')
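    # A minimal sketch of what sort_bam presumably runs (legacy samtools
    # interface, where the second argument is an output prefix):
    #
    #     samtools sort <sample>.discordants.unsorted.bam <sample>.discordants
    #
    # which writes <sample>.discordants.bam -- hence the prefix extra plus the
    # full .bam path as the declared output.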

    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #   task_func=stages.index_bam,
    #   name='index_discordants',
    #   input=output_from('sort_discordants'),
    #   filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #   output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort split-read alignments
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_splitters',
        input=output_from('extract_split_read_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.splitters'],
        output='{path[0]}/{sample[0]}.splitters.bam')

    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #    task_func=stages.index_bam,
    #    name='index_splitters',
    #    input=output_from('sort_splitters'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #    output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (pipeline.transform(
        task_func=stages.structural_variants_lumpy,
        name='structural_variants_lumpy',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        add_inputs=add_inputs([
            '{path[0]}/{sample[0]}.splitters.bam',
            '{path[0]}/{sample[0]}.discordants.bam'
        ]),
        output='{path[0]}/{sample[0]}.lumpy.vcf')
        .follows('index_alignment')
        .follows('sort_splitters')
        .follows('sort_discordants'))

    # Call genotypes on lumpy output using SVTyper
    #(pipeline.transform(
    #    task_func=stages.genotype_svtyper,
    #    name='genotype_svtyper',
    #    input=output_from('structural_variants_lumpy'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam', '{path[0]}/{sample[0]}.splitters.bam']),
    #    output='{path[0]}/{sample[0]}.svtyper.vcf')
    #    .follows('align_bwa')
    #    .follows('sort_splitters')
    #    .follows('index_alignment')
    #    .follows('index_splitters')
    #    .follows('index_discordants'))

    # Call SVs with Socrates
    (pipeline.transform(
        task_func=stages.structural_variants_socrates,
        name='structural_variants_socrates',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        # output goes to {path[0]}/socrates/
        output='{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt',
        extras=['{path[0]}']))

    # Call DELs with DELLY
    pipeline.merge(task_func=stages.deletions_delly,
                   name='deletions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DEL.vcf')

    # Call DUPs with DELLY
    pipeline.merge(task_func=stages.duplications_delly,
                   name='duplications_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DUP.vcf')

    # Call INVs with DELLY
    pipeline.merge(task_func=stages.inversions_delly,
                   name='inversions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.INV.vcf')

    # Call TRAs with DELLY
    pipeline.merge(task_func=stages.translocations_delly,
                   name='translocations_delly',
                   input=output_from('sort_alignment'),
                   output='delly.TRA.vcf')

    # Join both read pair files using gustaf_mate_joining
    #pipeline.transform(
    #    task_func=stages.gustaf_mate_joining,
    #    name='gustaf_mate_joining',
    #    input=output_from('fastq_to_fasta'),
    #    # Match the R1 (read 1) FASTA file and grab the path and sample name.
    #    # This will be the first input to the stage.
    #    # We assume the sample name may consist of only alphanumeric
    #    # characters.
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #    # Add one more input to the stage:
    #    #    1. The corresponding R2 FASTA file
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #    output='{path[0]}/{sample[0]}.joined_mates.fasta')

    # Call structural variants with pindel
    #(pipeline.transform(
    #    task_func=stages.structural_variants_pindel,
    #    name='structural_variants_pindel',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]),
    #    output='{path[0]}/{sample[0]}.pindel')
    #    .follows('index_reference_bwa')
    #    .follows('index_reference_samtools'))

    return pipeline
Beispiel #46
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='vcf_annotation')
    # Get a list of paths to all the VCF files
    vcf_files = state.config.get_option('vcfs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original VCF files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_vcf,
        name='original_vcf',
        output=vcf_files)

    # Decompose VCF using Vt
    pipeline.transform(
        task_func=stages.decompose_vcf,
        name='decompose_vcf',
        input=output_from('original_vcf'),
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).vcf'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the VCF file name (e.g. study/family name).
        # This is needed within the stage for finding out sample specific
        # configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .decompose.normalize.vcf extension.
        output='{path[0]}/{sample[0]}.decompose.normalize.vcf')

    # FILTER COMMON VARIANTS
    # ADD FILTER COMMON VARIANTS USING VEP

    # Annotate using VEP
    pipeline.transform(
        task_func=stages.annotate_vep,
        name='annotate_vep',
        input=output_from('decompose_vcf'),
        filter=suffix('.vcf'),
        output='.vep.vcf')

    # Annotate using SnpEff
    pipeline.transform(
        task_func=stages.annotate_snpeff,
        name='annotate_snpeff',
        input=output_from('annotate_vep'),
        filter=suffix('.vcf'),
        output='.snpeff.vcf')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=suffix('.sort.dedup.realn.recal.bam'),
        output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='PCExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(
            ['PCExomes.indel_recal', 'PCExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline
Beispiel #47
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name. 
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort the BAM file using Picard 
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard 
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK 
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK 
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK 
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK 
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK 
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=suffix('.sort.dedup.realn.recal.bam'),
        output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='PCExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK 
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK  
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK  
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.indel_recal', 'PCExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK  
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK 
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline
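
# Hedged usage sketch (not part of the original source): the builder above returns
# a Ruffus Pipeline object which the caller is expected to execute. The builder
# name, state object and job count below are assumptions for illustration only.
def run_variant_calling_pipeline(state, jobs=4):
    # build the task graph from the configuration/state object (assumed builder name)
    pipeline = make_pipeline(state)
    # run up to `jobs` tasks concurrently; verbose=1 prints brief progress messages
    pipeline.run(multiprocess=jobs, verbose=1)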


#
#   Use equivalent but new style syntax
#
test_pipeline = Pipeline("test")

test_pipeline.originate(task_func = task1,
                   output    = [tempdir + d for d in ('a.1', 'b.1', 'c.1')])\
    .follows(mkdir(tempdir))\
    .posttask(lambda: do_write(test_file, "Task 1 Done\n"))

test_pipeline.transform(task_func = task2,
                   input     = task1,
                   filter    = suffix(".1"),
                   output    = ".2") \
    .posttask(lambda: do_write(test_file, "Task 2 Done\n"))

test_pipeline.transform(task3, task2, regex('(.*).2'), inputs([r"\1.2", tempdir + "a.1"]), r'\1.3')\
    .posttask(lambda: do_write(test_file, "Task 3 Done\n"))


test_pipeline.transform(task4, tempdir + "*.1", suffix(".1"), ".4")\
    .follows(task1)\
    .posttask(lambda: do_write(test_file, "Task 4 Done\n"))\
    .jobs_limit(1)

test_pipeline.files(task5, None, tempdir + 'a.5')\
    .follows(mkdir(tempdir))\
    .posttask(lambda: do_write(test_file, "Task 5 Done\n"))
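
# For comparison, a roughly equivalent decorator-style declaration of the task
# above (a sketch only; the decorators would replace, not accompany, the
# Pipeline.transform() call, so they are shown commented out):
#
# @jobs_limit(1)
# @follows(task1)
# @posttask(lambda: do_write(test_file, "Task 4 Done\n"))
# @transform(tempdir + "*.1", suffix(".1"), ".4")
# def task4(infile, outfile):
#     ...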


def check_regex_out_of_range_regex_reference_error_task(infiles, outfile,
                                                        prefix1,
                                                        prefix2,
                                                        extension):
    raise Exception("Should blow up first")


test_pipeline = Pipeline("test")

test_pipeline.originate(task_func=generate_initial_files1,
                        output=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcdefghi"])

test_pipeline.transform(task_func=check_regex_task,
                        input=generate_initial_files1,
                        filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.tmp1)"),
                        output=r"\1/\g<PREFIX>\3.tmp2",  # output file
                        extras=[r"\2",                # extra: prefix = \2
                                r"\g<PREFIX>",        # extra: prefix = \2
                                r"\4"])               # extra: extension
test_pipeline.transform(task_func=check_regex_unmatched_task,
                        input=generate_initial_files1,
                        filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.xxx)"),
                        output=r"\1/\g<PREFIXA>\3.tmp2",  # output file
                        extras=[r"\2",                 # extra: prefix = \2
                                r"\g<PREFIX>",         # extra: prefix = \2
                                r"\4"])                # extra: extension

test_pipeline.transform(task_func=check_suffix_task,
                        input=generate_initial_files1,
                        filter=suffix(".tmp1"),
                        output=r".tmp2",           # output file
Beispiel #51
0
expected_inactive_text = """null -> "test_active_if/a.1"
    "test_active_if/a.1" -> "test_active_if/a.2"
        "test_active_if/a.2" -> "test_active_if/a.4"
null -> "test_active_if/b.1"
    "test_active_if/b.1" -> "test_active_if/b.2"
        "test_active_if/b.2" -> "test_active_if/b.4"
            "test_active_if/b.4" -> "test_active_if/summary.5"
"""


# alternative syntax
test_pipeline = Pipeline("test")
test_pipeline.originate(task1, ['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter")\
    .follows(mkdir("test_active_if"))
test_pipeline.transform(task2, task1, suffix(".1"), ".2")
test_pipeline.transform(task3, task1, suffix(".1"), ".3")\
    .active_if(lambda: pipeline_active_if)
test_pipeline.collate(task4, [task2, task3], regex(r"(.+)\.[23]"), r"\1.4")
test_pipeline.merge(task5, task4, "test_active_if/summary.5")
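
# The same on/off switch in decorator syntax would use @active_if (a sketch for
# comparison only; the condition may be a value or a callable and is re-evaluated
# each time the pipeline is run):
#
# @active_if(lambda: pipeline_active_if)
# @transform(task1, suffix(".1"), ".3")
# def task3(infile, outfile):
#     ...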


class Test_ruffus(unittest.TestCase):
    def setUp(self):
        try:
            shutil.rmtree(tempdir)
        except:
            pass
        os.makedirs(tempdir)

    def tearDown(self):
Beispiel #52
0
def first_task(input_files, output_file):
    with open(output_file, "w"): pass


#---------------------------------------------------------------
#   second task
@transform(first_task, suffix(".output.1"), ".output.2")
def second_task(input_files, output_file):
    with open(output_file, "w"): pass

test_pipeline = Pipeline("test")
test_pipeline.originate(task_func = create_initial_file_pairs,
                        output    = [[tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job1.a.start', tempdir + 'job1.b.start'],
                                     [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job2.a.start', tempdir + 'job2.b.start'],
                                     [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job3.a.start', tempdir + 'job3.b.start']])
test_pipeline.transform(task_func = first_task, input = create_initial_file_pairs, filter = suffix(".start"), output = ".output.1")
test_pipeline.transform(task_func = second_task, input = first_task, filter = suffix(".output.1"), output = ".output.2")
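
# Hedged sketch (not part of the original snippet): either syntax can also be asked
# for a dry-run listing instead of being executed; verbose_abbreviated_path controls
# how much of each file path is printed:
#
# s = StringIO()
# test_pipeline.printout(s, [second_task], verbose=5, verbose_abbreviated_path=1)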


decorator_syntax = 0
oop_syntax = 1

class Test_verbosity(unittest.TestCase):
    #___________________________________________________________________________
    #
    #   test_printout_abbreviated_path1
    #___________________________________________________________________________
    def test_printout_abbreviated_path1(self):
        """Input file exists, output doesn't exist"""
        for syntax in decorator_syntax, oop_syntax:
            s = StringIO()
Beispiel #53
0
def make_pipeline(state):
    """Build the pipeline by constructing stages and connecting them together"""
    # Build an empty pipeline
    pipeline = Pipeline(name="crpipe")
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option("fastqs")
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs, name="original_fastqs", output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    # pipeline.originate(
    #    task_func=stages.original_reference,
    #    name='original_reference',
    #    output=reference_file)

    # Run fastQC on the FASTQ files
    pipeline.transform(
        task_func=stages.fastqc,
        name="fastqc",
        input=output_from("original_fastqs"),
        filter=suffix(".fastq.gz"),
        output="_fastqc",
    )

    # Index the reference using BWA
    # pipeline.transform(
    #    task_func=stages.index_reference_bwa,
    #    name='index_reference_bwa',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #    name='index_reference_samtools',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output='.fa.fai')

    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])

    # # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name="align_bwa",
        input=output_from("original_fastqs"),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz"),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=["{sample[0]}"],
        # The output file name is the sample name with a .bam extension.
        output="{path[0]}/{sample[0]}.bam",
    )

    # Sort alignment with sambamba
    pipeline.transform(
        task_func=stages.sort_bam_sambamba,
        name="sort_alignment",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.sorted.bam",
    )

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name="extract_genes_bedtools",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.mmr.bam",
    )

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name="extract_chromosomes_samtools",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.chroms.bam",
    )

    # Index the MMR genes bam file with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name="index_mmr_alignment",
        input=output_from("extract_genes_bedtools"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).mmr.bam"),
        output="{path[0]}/{sample[0]}.mmr.bam.bai",
    )

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    # pipeline.transform(
    #    task_func=stages.alignment_coverage_gatk,
    #    name='alignment_coverage_gatk',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs([reference_file]),
    #    output='{path[0]}/{sample[0]}.coverage_summary',
    #    extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name="index_alignment",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.sorted.bam.bai",
    )

    # Generate alignment stats with bamtools
    pipeline.transform(
        task_func=stages.bamtools_stats,
        name="bamtools_stats",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.stats.txt",
    )

    # Extract the discordant paired-end alignments
    pipeline.transform(
        task_func=stages.extract_discordant_alignments,
        name="extract_discordant_alignments",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.discordants.unsorted.bam",
    )

    # Extract split-read alignments
    pipeline.transform(
        task_func=stages.extract_split_read_alignments,
        name="extract_split_read_alignments",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.splitters.unsorted.bam",
    )

    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However, Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name="sort_discordants",
        input=output_from("extract_discordant_alignments"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam"),
        extras=["{path[0]}/{sample[0]}.discordants"],
        output="{path[0]}/{sample[0]}.discordants.bam",
    )
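
    # Hedged sketch (hypothetical, not the project's actual implementation) of a
    # stage method matching the call above: Ruffus passes the extra string as a
    # third positional argument, so the stage can hand samtools the output prefix
    # while Ruffus itself tracks the full output file name.
    #
    # def sort_bam(self, bam_in, sorted_bam_out, output_prefix):
    #     command = 'samtools sort {bam} {prefix}'.format(bam=bam_in, prefix=output_prefix)
    #     # ... run `command` through the project's job runner ...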

    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #   task_func=stages.index_bam,
    #   name='index_discordants',
    #   input=output_from('sort_discordants'),
    #   filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #   output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort split-read alignments
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However, Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name="sort_splitters",
        input=output_from("extract_split_read_alignments"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam"),
        extras=["{path[0]}/{sample[0]}.splitters"],
        output="{path[0]}/{sample[0]}.splitters.bam",
    )

    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #    task_func=stages.index_bam,
    #    name='index_splitters',
    #    input=output_from('sort_splitters'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #    output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (
        pipeline.transform(
            task_func=stages.structural_variants_lumpy,
            name="structural_variants_lumpy",
            input=output_from("sort_alignment"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
            add_inputs=add_inputs(["{path[0]}/{sample[0]}.splitters.bam", "{path[0]}/{sample[0]}.discordants.bam"]),
            output="{path[0]}/{sample[0]}.lumpy.vcf",
        )
        .follows("index_alignment")
        .follows("sort_splitters")
        .follows("sort_discordants")
    )

    # Call genotypes on lumpy output using SVTyper
    # (pipeline.transform(
    #    task_func=stages.genotype_svtyper,
    #    name='genotype_svtyper',
    #    input=output_from('structural_variants_lumpy'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam', '{path[0]}/{sample[0]}.splitters.bam']),
    #    output='{path[0]}/{sample[0]}.svtyper.vcf')
    #    .follows('align_bwa')
    #    .follows('sort_splitters')
    #    .follows('index_alignment')
    #    .follows('index_splitters')
    #    .follows('index_discordants'))

    # Call SVs with Socrates
    (
        pipeline.transform(
            task_func=stages.structural_variants_socrates,
            name="structural_variants_socrates",
            input=output_from("sort_alignment"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
            # output goes to {path[0]}/socrates/
            output="{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt",
            extras=["{path[0]}"],
        )
    )

    # Call DELs with DELLY
    pipeline.merge(
        task_func=stages.deletions_delly,
        name="deletions_delly",
        input=output_from("sort_alignment"),
        output="delly.DEL.vcf",
    )

    # Call DUPs with DELLY
    pipeline.merge(
        task_func=stages.duplications_delly,
        name="duplications_delly",
        input=output_from("sort_alignment"),
        output="delly.DUP.vcf",
    )

    # Call INVs with DELLY
    pipeline.merge(
        task_func=stages.inversions_delly,
        name="inversions_delly",
        input=output_from("sort_alignment"),
        output="delly.INV.vcf",
    )

    # Call TRAs with DELLY
    pipeline.merge(
        task_func=stages.translocations_delly,
        name="translocations_delly",
        input=output_from("sort_alignment"),
        output="delly.TRA.vcf",
    )

    # Join both read pair files using gustaf_mate_joining
    # pipeline.transform(
    #    task_func=stages.gustaf_mate_joining,
    #    name='gustaf_mate_joining',
    #    input=output_from('fastq_to_fasta'),
    #    # Match the R1 (read 1) FASTA file and grab the path and sample name.
    #    # This will be the first input to the stage.
    #    # We assume the sample name may consist of only alphanumeric
    #    # characters.
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #    # Add one more input to the stage:
    #    #    1. The corresponding R2 FASTA file
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #    output='{path[0]}/{sample[0]}.joined_mates.fasta')

    # Call structural variants with pindel
    # (pipeline.transform(
    #    task_func=stages.structural_variants_pindel,
    #    name='structural_variants_pindel',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]),
    #    output='{path[0]}/{sample[0]}.pindel')
    #    .follows('index_reference_bwa')
    #    .follows('index_reference_samtools'))

    return pipeline
Beispiel #54
0
"""

expected_inactive_text = """null -> "test_active_if/a.1"
    "test_active_if/a.1" -> "test_active_if/a.2"
        "test_active_if/a.2" -> "test_active_if/a.4"
null -> "test_active_if/b.1"
    "test_active_if/b.1" -> "test_active_if/b.2"
        "test_active_if/b.2" -> "test_active_if/b.4"
            "test_active_if/b.4" -> "test_active_if/summary.5"
"""

# alternative syntax
test_pipeline = Pipeline("test")
test_pipeline.originate(task1, ['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter")\
    .follows(mkdir("test_active_if"))
test_pipeline.transform(task2, task1, suffix(".1"), ".2")
test_pipeline.transform(task3, task1, suffix(".1"), ".3")\
    .active_if(lambda: pipeline_active_if)
test_pipeline.collate(task4, [task2, task3], regex(r"(.+)\.[23]"), r"\1.4")
test_pipeline.merge(task5, task4, "test_active_if/summary.5")


class Test_ruffus(unittest.TestCase):
    def setUp(self):
        try:
            shutil.rmtree(tempdir)
        except:
            pass
        os.makedirs(tempdir)

    def tearDown(self):
Beispiel #55
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='thepipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        # filter=formatter('(?P<path>.+)/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+)_1.fastq.gz'),
        # 1_HFYLVCCXX:2:TCCGCGAA_2_GE0343_1.fastq.gz
        # 1_HCJWFBCXX:GGACTCCT_L001_9071584415739518822-AGRF-023_R2.fastq.gz
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # e.g. C2WPF.5_Solexa-201237_5_X4311_1.fastq.gz
        add_inputs=add_inputs(
            '{path[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{readid[0]}', '{lib[0]}', '{lane[0]}', '{sample[0]}'],
        # extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.bam')
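
    # Illustration (using the example file name from the comment above): for
    # .../1_HCJWFBCXX:GGACTCCT_L001_9071584415739518822-AGRF-023_R1.fastq.gz the
    # formatter captures readid='1', lib='HCJWFBCXX:GGACTCCT', lane='L001' and
    # sample='9071584415739518822-AGRF-023'; the matching _R2 file is added as a
    # second input, the four fields are passed on as extras, and the BAM is written
    # to alignments/<sample>/<readid>_<lib>_<lane>_<sample>.bam.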

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Local realignment using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(
        task_func=stages.realigner_target_creator,
        name='realigner_target_creator',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('realigner_target_creator'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.bam'),
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        filter=formatter(
            # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'),
            # '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        # output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Merge lane bams to sample bams
    pipeline.collate(
        task_func=stages.merge_sample_bams,
        name='merge_sample_bams',
        filter=formatter(
            # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).sort.dedup.realn.recal.bam'),
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).sort.dedup.realn.recal.bam'),
        # inputs=add_inputs('alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        input=output_from('print_reads_gatk'),
        output='alignments/{sample[0]}/{sample[0]}.merged.bam')
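
    # collate() groups jobs rather than mapping them one-to-one: every per-lane BAM
    # whose formatter substitution yields the same output string (i.e. the same
    # {sample}) is gathered into a single job, so one merged BAM is produced per
    # sample no matter how many lanes it was sequenced across.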

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard2',
        input=output_from('merge_sample_bams'),
        # filter=formatter(
        # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).merged.bam'),
        filter=suffix('.merged.bam'),
        # XXX should make metricsup an extra output?
        output=['.merged.dedup.bam', '.metricsdup'])

    # Local realignment2 using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(
        task_func=stages.realigner_target_creator,
        name='realigner_target_creator2',
        input=output_from('mark_duplicates_picard2'),
        filter=suffix('.dedup.bam'),
        output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk2',
        input=output_from('realigner_target_creator2'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.intervals'),
        # filter=formatter(
        # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{sample[0]}.merged.dedup.bam'),
        output='alignments/{sample[0]}/{sample[0]}.merged.dedup.realn.bam')
        .follows('mark_duplicates_picard2'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('local_realignment_gatk2'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.dedup.realn.bam'),
        output='variants/{sample[0]}.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_haplotypecaller_gatk'),
        output='variants/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.combined.vcf'),
        output='.raw.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(['ALL.snp_recal', 'ALL.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(
            ['ALL.indel_recal', 'ALL.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['ALL.recal_INDEL.vcf']),
        # output='.combined.vcf')
        output='ALL.raw.vqsr.vcf')
        .follows('apply_indel_recalibrate_gatk'))
    #
    # # Select variants using GATK
    # pipeline.transform(
    #     task_func=stages.select_variants_gatk,
    #     name='select_variants_gatk',
    #     input=output_from('combine_variants_gatk'),
    #     filter=suffix('.combined.vcf'),
    #     output='.selected.vcf')


    return pipeline