Example 1
    def test_newstyle_ruffus(self):
        test_pipeline = Pipeline("test")
        test_pipeline.originate(start_task, ["a.1", "b.1"])
        test_pipeline.transform(same_file_name_task, start_task, suffix(".1"), ".1")
        test_pipeline.transform(linked_file_name_task, start_task, suffix(".1"), ".linked.1")
        test_pipeline.transform(final_task, [linked_file_name_task, same_file_name_task], suffix(".1"), ".3")
        test_pipeline.run(log_exceptions=True, verbose=0)
Example 2
def build_pipeline():

    pipe = Pipeline("my_pipeline")

    pipe.originate(
        name="create_three_new_files",
        task_func=create_new_file,
        output=[os.path.join(WORK_DIR, f"file{i}.csv") for i in range(1, 4)],
    )

    pipe.transform(
        name="convert_csv_files_to_tsv",
        task_func=csv_to_tsv,
        input=output_from("create_three_new_files"),
        filter=suffix(".csv"),
        output=".tsv",
    )

    pipe.transform(
        name="calculate_md5",
        task_func=md5,
        input=output_from("convert_csv_files_to_tsv"),
        filter=suffix(".tsv"),
        output=".md5sum",
    )

    return pipe
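
The task functions referenced above (create_new_file, csv_to_tsv, md5) and the WORK_DIR constant are not shown in this example. Below is a minimal sketch of what they might look like, together with the imports and the run() call needed to execute the pipeline; the function bodies are hypothetical placeholders, not the original project's code.

# Hypothetical support code for the example above (assumed names).
import hashlib
import os

from ruffus import Pipeline, output_from, suffix

WORK_DIR = "work"  # assumed output directory


def create_new_file(output_file):
    # originate() calls the task with the output file name only
    with open(output_file, "w") as out:
        out.write("col1,col2\n1,2\n")


def csv_to_tsv(input_file, output_file):
    # transform() calls the task with matched (input, output) pairs
    with open(input_file) as inp, open(output_file, "w") as out:
        out.write(inp.read().replace(",", "\t"))


def md5(input_file, output_file):
    with open(input_file, "rb") as inp, open(output_file, "w") as out:
        out.write(hashlib.md5(inp.read()).hexdigest() + "\n")


if __name__ == "__main__":
    os.makedirs(WORK_DIR, exist_ok=True)
    build_pipeline().run(verbose=1)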
Example 3
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='md5')
    # Get a list of paths to all the input files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.md5_checksum,
        name='md5_checksum',
        input=output_from('original_files'),
        filter=suffix(''),
        output='.md5')


    return pipeline
Example 4
    def test_newstyle_ruffus(self):
        # alternative syntax
        test_pipeline = Pipeline("test")

        test_pipeline.mkdir(data_dir, work_dir)
        test_pipeline.originate(task_func=task1,
                                output=[os.path.join(data_dir, "%s.1" % aa) for aa in "abcd"])

        test_pipeline.mkdir(filter=suffix(".1"),
                            output=".dir",
                            output_dir=work_dir)

        test_pipeline.transform(task_func=task2,
                                input=task1,
                                filter=suffix(".1"),
                                output=[".1", ".bak"],
                                extras=["extra.tst", 4, r"orig_dir=\1"],
                                output_dir=work_dir)

        test_pipeline.subdivide(task3, task2, suffix(".1"),
                                r"\1.*.2", [r"\1.a.2", r"\1.b.2"],
                                output_dir=data_dir)
        test_pipeline.transform(task4, task3, suffix(".2"), ".3",
                                output_dir=work_dir)
        test_pipeline.merge(task5, task4, os.path.join(data_dir, "summary.5"))
        test_pipeline.run(multiprocess=50, verbose=0)

        with open(os.path.join(data_dir, "summary.5")) as ii:
            active_text = ii.read()
        if active_text != expected_active_text:
            raise Exception("Error:\n\tExpected\n%s\nInstead\n%s\n" %
                            (expected_active_text, active_text))
Example 5
    def test_transform_with_missing_formatter_args_b(self):
        test_pipeline = Pipeline("test")

        test_pipeline.originate(task_func=generate_initial_files,
                                output=[os.path.join(tempdir, ff + ".tmp") for ff in "abcd"])\
            .mkdir(tempdir)

        test_pipeline.transform(
            task_func=transform_with_missing_formatter_args,
            input=generate_initial_files,
            filter=formatter(),
            output="{path[0]}/{basename[0]}.task1",
            extras=['echo {dynamic_message} > {some_file}'])
        s = StringIO()
        test_pipeline.printout(s, [transform_with_missing_formatter_args],
                               verbose=4,
                               wrap_width=10000,
                               pipeline="test")
        self.assertIn("Unmatched field {dynamic_message}", s.getvalue())

        # log to stream
        s = StringIO()
        logger = t_stream_logger(s)
        test_pipeline.run([transform_with_missing_formatter_args],
                          verbose=5,
                          pipeline="test",
                          logger=logger)
        self.assertIn("Unmatched field {dynamic_message}", s.getvalue())
Example 6
    def test_newstyle_ruffus(self):

        print("     Run pipeline normally...")
        test_pipeline = Pipeline("test")
        test_pipeline.originate(make_start, [tempdir + 'start'])

        test_pipeline.split(split_start, make_start, tempdir + '*.split')

        test_pipeline.subdivide(subdivide_start, split_start, formatter(), tempdir + '{basename[0]}_*.subdivided', tempdir + '{basename[0]}')
        if self.graph_viz_present:
            test_pipeline.printout_graph(tempdir + "flowchart.dot")
            test_pipeline.printout_graph(tempdir + "flowchart.jpg",
                                        target_tasks =[subdivide_start],
                                        forcedtorun_tasks = [split_start],
                                        no_key_legend = True)
            test_pipeline.printout_graph(tempdir + "flowchart.svg", no_key_legend = False)
            # Unknown format
            try:
                test_pipeline.printout_graph(tempdir + "flowchart.unknown", no_key_legend = False)
                raise Exception("Failed to throw exception for test_pipeline.printout_graph unknown extension ")
            except CalledProcessError as err:
                pass
            test_pipeline.printout_graph(tempdir + "flowchart.unknown", "svg", no_key_legend = False)

        else:
            test_pipeline.printout_graph(tempdir + "flowchart.dot",
                                        target_tasks =[subdivide_start],
                                        forcedtorun_tasks = [split_start],
                                        no_key_legend = True)
Example 7
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and turns them into GATK and undr_rover VCFs
    pipeline = Pipeline(name='genericpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')

        stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
Example 8
    def test_newstyle_simpler(self):
        test_pipeline = Pipeline("test")
        test_pipeline.originate(task1, input_file_names, extras=[logger_proxy, logging_mutex])
        test_pipeline.transform(task2, task1, suffix(".1"), ".2", extras=[logger_proxy, logging_mutex])
        test_pipeline.transform(task3, task2, suffix(".2"), ".3", extras=[logger_proxy, logging_mutex])
        test_pipeline.merge(task4, task3, final_file_name, extras=[logger_proxy, logging_mutex])
        # test_pipeline.merge(task4, task3, final_file_name, extras={"logger_proxy": logger_proxy, "logging_mutex": logging_mutex})
        test_pipeline.run(multiprocess=500, verbose=0)
Example 9
    def test_newstyle_collate(self):
        """
        As above, but create the pipeline on the fly using object-oriented syntax rather than decorators
        """

        #
        # Create pipeline on the fly, joining up tasks
        #
        test_pipeline = Pipeline("test")

        test_pipeline.originate(task_func   = generate_initial_files,
                                output      = original_files)\
            .mkdir(tempdir, tempdir+"/test")


        test_pipeline.subdivide(    task_func   = split_fasta_file,
                                    input       = generate_initial_files,
                                    filter      = regex(r".*\/original_(\d+).fa"),       # match original files
                                    output      = [tempdir + r"/files.split.\1.success", # flag file for each original file
                                                   tempdir + r"/files.split.\1.*.fa"],   # glob pattern
                                    extras      = [r"\1"])\
            .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))


        test_pipeline.transform(task_func   = align_sequences,
                                input       = split_fasta_file,
                                filter      = suffix(".fa"),
                                output      = ".aln")  \
            .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

        test_pipeline.transform(task_func   = percentage_identity,
                                input       = align_sequences,             # find all results from align_sequences
                                filter      = suffix(".aln"),             # replace suffix with:
                                output      = [r".pcid",                  #   .pcid suffix for the result
                                               r".pcid_success"]         #   .pcid_success to indicate job completed
                                )\
            .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))


        test_pipeline.collate(task_func   = combine_results,
                              input       = percentage_identity,
                              filter      = regex(r".*files.split\.(\d+)\.\d+.pcid"),
                              output      = [tempdir + r"/\1.all.combine_results",
                                             tempdir + r"/\1.all.combine_results_success"])\
            .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

        #
        # Cleanup, printout and run
        #
        self.cleanup_tmpdir()
        s = StringIO()
        test_pipeline.printout(s, [combine_results],
                               verbose=5,
                               wrap_width=10000)
        self.assertTrue(
            re.search('Job needs update:.*Missing files.*', s.getvalue(),
                      re.DOTALL) is not None)
        test_pipeline.run(verbose=0)
Example 10
def make_pipeline1(
        pipeline_name,  # Pipelines need to have a unique name
        starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(
        task_func=task_m_to_1,
        name="add_input",
        # Lookup Task from function name task_originate()
        #   So long as this is unique in the pipeline
        input=task_originate,
        # requires an anchor from 3.7 onwards, see
        # https://bugs.python.org/issue34982
        filter=regex(r"^(.*)"),
        add_inputs=add_inputs(tempdir + "/testdir/whatever.txt"),
        output=r"\1.22")
    test_pipeline.transform(
        task_func=task_1_to_1,
        name="22_to_33",
        # Lookup Task from Task name
        #   Function name is not unique in the pipeline
        input=output_from("add_input"),
        filter=suffix(".22"),
        output=".33")
    tail_task = test_pipeline.transform(
        task_func=task_1_to_1,
        name="33_to_44",
        # Ask Pipeline to lookup Task from Task name
        input=test_pipeline["22_to_33"],
        filter=suffix(".33"),
        output=".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
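
A brief, hypothetical usage sketch for the factory above (tempdir and the task functions task_originate and task_1_to_1 are assumed to be defined elsewhere, as in the original test module): the returned Pipeline can be run directly, or, because set_tail_tasks() was called, it can be used as the input of a task in a downstream pipeline without naming its internal tasks.

# Hypothetical driver code; file names and task names are illustrative only.
sub_pipeline = make_pipeline1(pipeline_name="pipeline1a",
                              starting_file_names=[tempdir + "/a.1",
                                                   tempdir + "/b.1"])

# Either run the sub-pipeline on its own...
sub_pipeline.run(multiprocess=4, verbose=0)

# ...or let another pipeline consume its tail task ("33_to_44", which
# produces ".44" files) by passing the Pipeline object itself as input.
main_pipeline = Pipeline("main")
main_pipeline.transform(task_func=task_1_to_1,
                        name="44_to_55",
                        input=sub_pipeline,   # resolved via the tail task
                        filter=suffix(".44"),
                        output=".55")
main_pipeline.run(multiprocess=4, verbose=0)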
Example 11
    def test_newstyle_collate(self):
        """
        As above, but create the pipeline on the fly using object-oriented syntax rather than decorators
        """

        #
        # Create pipeline on the fly, joining up tasks
        #
        test_pipeline = Pipeline("test")

        test_pipeline.originate(task_func=generate_initial_files,
                                output=original_files)\
            .mkdir(tempdir, tempdir+"/test")

        test_pipeline.subdivide(task_func=split_fasta_file,
                                input=generate_initial_files,
                                # match original files
                                filter=regex(r".*\/original_(\d+).fa"),
                                output=[tempdir + r"/files.split.\1.success",  # flag file for each original file
                                        tempdir + r"/files.split.\1.*.fa"],   # glob pattern
                                extras=[r"\1"])\
            .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))

        test_pipeline.transform(task_func=align_sequences,
                                input=split_fasta_file,
                                filter=suffix(".fa"),
                                output=".aln")  \
            .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

        test_pipeline.transform(task_func=percentage_identity,
                                input=align_sequences,             # find all results from align_sequences
                                # replace suffix with:
                                filter=suffix(".aln"),
                                output=[r".pcid",  # .pcid suffix for the result
                                        r".pcid_success"]  # .pcid_success to indicate job completed
                                )\
            .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))

        test_pipeline.collate(task_func=combine_results,
                              input=percentage_identity,
                              filter=regex(r".*files.split\.(\d+)\.\d+.pcid"),
                              output=[tempdir + r"/\1.all.combine_results",
                                      tempdir + r"/\1.all.combine_results_success"])\
            .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

        #
        # Cleanup, printout and run
        #
        self.cleanup_tmpdir()
        s = StringIO()
        test_pipeline.printout(s, [combine_results],
                               verbose=5, wrap_width=10000)
        self.assertTrue(re.search(
            'Job needs update:.*Missing files.*', s.getvalue(), re.DOTALL) is not None)
        test_pipeline.run(verbose=0)
Example 12
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='ovarian_cancer_pipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    human_reference_genome_file = state.config.get_option('human_reference_genome')
    # Stages are dependent on the state
    stages = PipelineStages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # The human reference genome in FASTA format
    pipeline.originate(
        task_func=stages.human_reference_genome,
        name='human_reference_genome',
        output=human_reference_genome_file)

    # Index the human reference genome with BWA, needed before we can map reads
    pipeline.transform(
        task_func=stages.index_ref_bwa,
        name='index_ref_bwa',
        input=output_from('human_reference_genome'),
        filter=suffix('.fa'),
        output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Align paired end reads in FASTQ to the reference producing a BAM file
    (pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name. 
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[_a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')
        .follows('index_ref_bwa'))


    return pipeline
Example 13
    def test_newstyle_ruffus(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_func=make_start,
                                output=[tempdir + 'start'])
        test_pipeline.split(task_func=split_start,
                            input=make_start, output=tempdir + '*.split')
        test_pipeline.subdivide(task_func=subdivide_start,
                                input=split_start,
                                filter=formatter(),
                                output=tempdir + '{basename[0]}_*.subdivided',
                                extras=[tempdir + '{basename[0]}'])

        expected_files_after_1_runs = ["start", "0.split", "0_0.subdivided"]
        expected_files_after_2_runs = [
            "1.split", "0_1.subdivided", "1_0.subdivided"]
        expected_files_after_3_runs = [
            "2.split", "0_2.subdivided", "1_1.subdivided", "2_0.subdivided"]
        expected_files_after_4_runs = [
            "3.split", "0_3.subdivided", "1_2.subdivided", "2_1.subdivided", "3_0.subdivided"]

        print("     1 Run pipeline normally...")
        test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                                  expected_files_after_2_runs)
        print("     2 Check that running again does nothing. (All up to date).")
        test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                                  expected_files_after_2_runs)
        time.sleep(2)

        print("     3 Running again with forced tasks to generate more files...")
        test_pipeline.run(forcedtorun_tasks=[
                          "test::make_start"], multiprocess=10, verbose=TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                  + expected_files_after_2_runs,
                                                  expected_files_after_3_runs)
        print("     4 Check that running again does nothing. (All up to date).")
        test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                  + expected_files_after_2_runs,
                                                  expected_files_after_3_runs)
        time.sleep(2)

        print("     5 Running again with forced tasks to generate even more files...")
        test_pipeline.run(forcedtorun_tasks=make_start,
                          multiprocess=10, verbose=TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                  + expected_files_after_2_runs
                                                  + expected_files_after_3_runs,
                                                  expected_files_after_4_runs)
        print("     6 Check that running again does nothing. (All up to date).")
        test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                  + expected_files_after_2_runs
                                                  + expected_files_after_3_runs,
                                                  expected_files_after_4_runs)
Example 14
def make_pipeline1(pipeline_name,   # Pipelines need to have a unique name
                   starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(task_func=task_m_to_1,
                            name="add_input",
                            # Lookup Task from function name task_originate()
                            #   So long as this is unique in the pipeline
                            input=task_originate,
                            # requires an anchor from 3.7 onwards, see
                            # https://bugs.python.org/issue34982
                            filter=regex(r"^(.*)"),
                            add_inputs=add_inputs(
                                tempdir + "/testdir/whatever.txt"),
                            output=r"\1.22")
    test_pipeline.transform(task_func=task_1_to_1,
                            name="22_to_33",
                            # Lookup Task from Task name
                            #   Function name is not unique in the pipeline
                            input=output_from("add_input"),
                            filter=suffix(".22"),
                            output=".33")
    tail_task = test_pipeline.transform(task_func=task_1_to_1,
                                        name="33_to_44",
                                        # Ask Pipeline to lookup Task from Task name
                                        input=test_pipeline["22_to_33"],
                                        filter=suffix(".33"),
                                        output=".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
Example 15
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='test_pipeline')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    pipeline.transform(
        task_func=stages.stage1,
        name='stage1',
        input=output_from('original_files'),
        filter=suffix('.0'),
        output='.1')

    pipeline.transform(
        task_func=stages.stage2,
        name='stage2',
        input=output_from('stage1'),
        filter=suffix('.1'),
        output='.2')

    pipeline.transform(
        task_func=stages.stage3,
        name='stage3',
        input=output_from('stage2'),
        filter=suffix('.2'),
        output='.3')

    pipeline.transform(
        task_func=stages.stage4,
        name='stage4',
        input=output_from('stage3'),
        filter=suffix('.3'),
        output='.4')

    pipeline.transform(
        task_func=stages.stage5,
        name='stage5',
        input=output_from('stage4'),
        filter=suffix('.4'),
        output='.5')

    return pipeline
Example 16
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='twin ion')
    # Get a list of paths to all the MZML files
    mzml_files = state.config.get_option('mzml')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original MZML files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_mzml,
        name='original_mzml',
        output=mzml_files)

    pipeline.transform(
        task_func=stages.resample,
        name='resample',
        input=output_from('original_mzml'),
        filter=suffix('.mzML'),
        output='.resample.mzML')

    pipeline.transform(
        task_func=stages.noise_filter_sgolay,
        name='noise_filter_sgolay',
        input=output_from('resample'),
        filter=suffix('.resample.mzML'),
        output='.denoise.mzML')

    pipeline.transform(
        task_func=stages.baseline_filter,
        name='baseline_filter',
        input=output_from('noise_filter_sgolay'),
        filter=suffix('.denoise.mzML'),
        output='.baseline.mzML')

    pipeline.transform(
        task_func=stages.peak_picker_hires,
        name='peak_picker_hires',
        input=output_from('baseline_filter'),
        filter=suffix('.baseline.mzML'),
        output='.peaks.mzML')

    pipeline.transform(
        task_func=stages.feature_finder_centroid,
        name='feature_finder_centroid',
        input=output_from('peak_picker_hires'),
        filter=suffix('.peaks.mzML'),
        output='.featureXML')

    return pipeline
Example 17
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and turns them into GATK and undr_rover VCFs
    pipeline = Pipeline(name='hiplexpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')

        stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')
    safe_make_dir('variants/undr_rover/coverdir')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('passed_filter_files'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='variants/undr_rover/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    #### concatenate undr_rover vcfs ####
    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('apply_undr_rover'),
        filter=formatter('variants/undr_rover/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/undr_rover/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).clipped.sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
Example 18
    def test_newstyle_no_re_match(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_1, tempdir + "a").mkdir(tempdir)
        test_pipeline.transform(task_2, task_1, regex("b"), "task_2.output")


        save_to_str_logger = t_save_to_str_logger()
        test_pipeline.run(multiprocess = 10, logger = save_to_str_logger, verbose = 1)
        print(save_to_str_logger.warning_str)
        self.assertTrue("no file names matched" in save_to_str_logger.warning_str)
        print("\n    Warning printed out correctly", file=sys.stderr)
Example 19
    def test_newstyle_no_re_match(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_1, tempdir + "a").mkdir(tempdir)
        test_pipeline.transform(task_2, task_1, regex("b"), "task_2.output")

        save_to_str_logger = t_save_to_str_logger()
        test_pipeline.run(
            multiprocess=10, logger=save_to_str_logger, verbose=1)
        print(save_to_str_logger.warning_str)
        self.assertTrue(
            "no file names matched" in save_to_str_logger.warning_str)
        print("\n    Warning printed out correctly", file=sys.stderr)
Example 20
    def test_newstyle_ruffus(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_func=task1,
                                output=[tempdir + 'a.1'] + runtime_files)
        test_pipeline.transform(task2, task1, suffix(".1"), ".2")
        test_pipeline.transform(task_func=task3,
                                input=task2,
                                filter=suffix(".2"),
                                output=".3")
        test_pipeline.transform(task_func=task4,
                                input=runtime_parameter("a"),
                                filter=suffix(".3"),
                                output=".4").follows(task3)
        test_pipeline.run(verbose=0, runtime_data={"a": runtime_files})
Example 21
    def test_newstyle_ruffus(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_func=task1,
                                output=[tempdir + 'a.1'] + runtime_files)
        test_pipeline.transform(task2, task1, suffix(".1"), ".2")
        test_pipeline.transform(task_func=task3,
                                input=task2,
                                filter=suffix(".2"),
                                output=".3")
        test_pipeline.transform(task_func=task4,
                                input=runtime_parameter("a"),
                                filter=suffix(".3"),
                                output=".4").follows(task3)
        test_pipeline.run(verbose=0, runtime_data={"a": runtime_files})
Example 22
    def test_newstyle_ruffus(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_func = make_start, output = [tempdir + 'start'])
        test_pipeline.split(task_func = split_start, input = make_start, output = tempdir + '*.split')
        test_pipeline.subdivide(task_func = subdivide_start, input = split_start, filter = formatter(), output = tempdir + '{basename[0]}_*.subdivided', extras = [tempdir + '{basename[0]}'])

        expected_files_after_1_runs = ["start", "0.split", "0_0.subdivided"]
        expected_files_after_2_runs = ["1.split", "0_1.subdivided", "1_0.subdivided"]
        expected_files_after_3_runs = ["2.split", "0_2.subdivided", "1_1.subdivided", "2_0.subdivided"]
        expected_files_after_4_runs = ["3.split", "0_3.subdivided", "1_2.subdivided", "2_1.subdivided", "3_0.subdivided"]

        print("     1 Run pipeline normally...")
        test_pipeline.run(multiprocess = 10, verbose = TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                                 expected_files_after_2_runs)
        print("     2 Check that running again does nothing. (All up to date).")
        test_pipeline.run(multiprocess = 10, verbose = TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                                 expected_files_after_2_runs)
        time.sleep(2)

        print("     3 Running again with forced tasks to generate more files...")
        test_pipeline.run(forcedtorun_tasks = ["test::make_start"], multiprocess = 10, verbose = TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                 + expected_files_after_2_runs,
                                                 expected_files_after_3_runs)
        print("     4 Check that running again does nothing. (All up to date).")
        test_pipeline.run(multiprocess = 10, verbose = TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                 + expected_files_after_2_runs,
                                                 expected_files_after_3_runs)
        time.sleep(2)


        print("     5 Running again with forced tasks to generate even more files...")
        test_pipeline.run(forcedtorun_tasks = make_start, multiprocess = 10, verbose = TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                 + expected_files_after_2_runs
                                                 + expected_files_after_3_runs,
                                                 expected_files_after_4_runs)
        print("     6 Check that running again does nothing. (All up to date).")
        test_pipeline.run(multiprocess = 10, verbose = TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                 + expected_files_after_2_runs
                                                 + expected_files_after_3_runs,
                                                 expected_files_after_4_runs)
Example 23
    def test_newstyle_mkdir (self):
        test_pipeline = Pipeline("test")

        test_pipeline.follows(task_which_makes_directories,
                         mkdir(directories),
                         mkdir(unicode(tempdir + "c")),
                         mkdir(unicode(tempdir + "d"),
                               unicode(tempdir + "e")),
                         mkdir(unicode(tempdir + "e")))\
            .posttask(touch_file(unicode(tempdir + "f")))

        test_pipeline.originate(task_which_makes_files, [tempdir + "g", tempdir + "h"])
        test_pipeline.run(multiprocess = 10, verbose = 0)

        for d in 'abcdefgh':
            fullpath = os.path.join(os.path.dirname(__file__), tempdir, d)
            self.assertTrue(os.path.exists(fullpath))
Example 24
    def test_newstyle_mkdir(self):
        test_pipeline = Pipeline("test")

        test_pipeline.follows(task_which_makes_directories,
                              mkdir(directories),
                              mkdir(unicode(tempdir + "c")),
                              mkdir(unicode(tempdir + "d"),
                                    unicode(tempdir + "e")),
                              mkdir(unicode(tempdir + "e")))\
            .posttask(touch_file(unicode(tempdir + "f")))

        test_pipeline.originate(task_which_makes_files,
                                [tempdir + "g", tempdir + "h"])
        test_pipeline.run(multiprocess=10, verbose=0)

        for d in 'abcdefgh':
            fullpath = os.path.join(os.path.dirname(__file__), tempdir, d)
            self.assertTrue(os.path.exists(fullpath))
Example 25
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='fastq2bam')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_files,
                       name='original_files',
                       output=input_files)

    pipeline.transform(
        task_func=stages.fastq2bam,
        name='fastq2bam',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        extras=['{sample[0]}'],
        output='{path[0]}/out/{sample[0]}.bam')

    pipeline.transform(
        task_func=stages.validate_prealigned_bam,
        name='validate_prealigned_bam',
        input=output_from('fastq2bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.validation')

    pipeline.transform(
        task_func=stages.align,
        name='align',
        input=output_from('validate_prealigned_bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).validation'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.bam'),
        output='{path[0]}/{sample[0]}.mapped.bam')

    return pipeline
Example 26
    def test_newstyle_simpler(self):
        test_pipeline = Pipeline("test")
        test_pipeline.originate(task1,
                                input_file_names,
                                extras=[logger_proxy, logging_mutex])
        test_pipeline.transform(task2,
                                task1,
                                suffix(".1"),
                                ".2",
                                extras=[logger_proxy, logging_mutex])
        test_pipeline.transform(task3,
                                task2,
                                suffix(".2"),
                                ".3",
                                extras=[logger_proxy, logging_mutex])
        test_pipeline.merge(task4,
                            task3,
                            final_file_name,
                            extras=[logger_proxy, logging_mutex])
        # test_pipeline.merge(task4, task3, final_file_name, extras={"logger_proxy": logger_proxy, "logging_mutex": logging_mutex})
        test_pipeline.run(multiprocess=500, verbose=0)
Example 27
    def test_newstyle_ruffus(self):
        # alternative syntax
        test_pipeline = Pipeline("test")

        test_pipeline.mkdir(data_dir, work_dir)
        test_pipeline.originate(
            task_func=task1,
            output=[os.path.join(data_dir, "%s.1" % aa) for aa in "abcd"])

        test_pipeline.mkdir(filter=suffix(".1"),
                            output=".dir",
                            output_dir=work_dir)

        test_pipeline.transform(task_func=task2,
                                input=task1,
                                filter=suffix(".1"),
                                output=[".1", ".bak"],
                                extras=["extra.tst", 4, r"orig_dir=\1"],
                                output_dir=work_dir)

        test_pipeline.subdivide(task3,
                                task2,
                                suffix(".1"),
                                r"\1.*.2", [r"\1.a.2", r"\1.b.2"],
                                output_dir=data_dir)
        test_pipeline.transform(task4,
                                task3,
                                suffix(".2"),
                                ".3",
                                output_dir=work_dir)
        test_pipeline.merge(task5, task4, os.path.join(data_dir, "summary.5"))
        test_pipeline.run(multiprocess=50, verbose=0)

        with open(os.path.join(data_dir, "summary.5")) as ii:
            active_text = ii.read()
        if active_text != expected_active_text:
            raise Exception("Error:\n\tExpected\n%s\nInstead\n%s\n" %
                            (expected_active_text, active_text))
Example 28
    def test_transform_with_missing_formatter_args_b(self):
        test_pipeline = Pipeline("test")


        test_pipeline.originate(task_func   = generate_initial_files,
                                output      = [os.path.join(tempdir, ff + ".tmp") for ff in "abcd"])\
            .mkdir(tempdir)


        test_pipeline.transform(task_func   = transform_with_missing_formatter_args,
                                input       = generate_initial_files,
                                filter      = formatter(),
                                output      = "{path[0]}/{basename[0]}.task1",
                                extras      =['echo {dynamic_message} > {some_file}'])
        s = StringIO()
        test_pipeline.printout(s, [transform_with_missing_formatter_args], verbose=4, wrap_width = 10000, pipeline= "test")
        self.assertIn("Missing key = {dynamic_message}", s.getvalue())

        #log to stream
        s = StringIO()
        logger = t_stream_logger(s)
        test_pipeline.run([transform_with_missing_formatter_args], verbose=5, pipeline= "test", logger=logger)
        self.assertIn("Missing key = {dynamic_message}", s.getvalue())
Example 29
    def test_newstyle_ruffus(self):

        print("     Run pipeline normally...")
        test_pipeline = Pipeline("test")
        test_pipeline.originate(make_start, [tempdir + 'start'])

        test_pipeline.split(split_start, make_start, tempdir + '*.split')

        test_pipeline.subdivide(subdivide_start, split_start, formatter(),
                                tempdir + '{basename[0]}_*.subdivided',
                                tempdir + '{basename[0]}')
        if self.graph_viz_present:
            test_pipeline.printout_graph(tempdir + "flowchart.dot")
            test_pipeline.printout_graph(tempdir + "flowchart.jpg",
                                         target_tasks=[subdivide_start],
                                         forcedtorun_tasks=[split_start],
                                         no_key_legend=True)
            test_pipeline.printout_graph(tempdir + "flowchart.svg",
                                         no_key_legend=False)
            # Unknown format
            try:
                test_pipeline.printout_graph(tempdir + "flowchart.unknown",
                                             no_key_legend=False)
                raise Exception(
                    "Failed to throw exception for test_pipeline.printout_graph unknown extension "
                )
            except CalledProcessError as err:
                pass
            test_pipeline.printout_graph(tempdir + "flowchart.unknown",
                                         "svg",
                                         no_key_legend=False)

        else:
            test_pipeline.printout_graph(tempdir + "flowchart.dot",
                                         target_tasks=[subdivide_start],
                                         forcedtorun_tasks=[split_start],
                                         no_key_legend=True)
Example 30
    def create_pipeline(self):
        #each pipeline has a different name
        global cnt_pipelines
        cnt_pipelines = cnt_pipelines + 1
        test_pipeline = Pipeline("test %d" % cnt_pipelines)

        test_pipeline.originate(
            task_func=generate_initial_files1,
            output=[tempdir + prefix + "_name.tmp1" for prefix in "abcd"])

        test_pipeline.originate(
            task_func=generate_initial_files2,
            output=[tempdir + "e_name.tmp1", tempdir + "f_name.tmp1"])

        test_pipeline.originate(
            task_func=generate_initial_files3,
            output=[tempdir + "g_name.tmp1", tempdir + "h_name.tmp1"])

        test_pipeline.originate(task_func=generate_initial_files4,
                                output=tempdir + "i_name.tmp1")

        test_pipeline.collate(task_func=test_task2,
                              input=[
                                  generate_initial_files1,
                                  generate_initial_files2,
                                  generate_initial_files3,
                                  generate_initial_files4
                              ],
                              filter=formatter(),
                              output="{path[0]}/all.tmp2")

        test_pipeline.transform(task_func=test_task3,
                                input=test_task2,
                                filter=suffix(".tmp2"),
                                output=".tmp3")

        test_pipeline.transform(task_func=test_task4,
                                input=test_task3,
                                filter=suffix(".tmp3"),
                                output=".tmp4")
        return test_pipeline
Example 31
    def create_pipeline(self):
        #each pipeline has a different name
        global cnt_pipelines
        cnt_pipelines = cnt_pipelines + 1
        test_pipeline = Pipeline("test %d" % cnt_pipelines)

        test_pipeline.originate(task_func   = generate_initial_files1,
                                output      = [tempdir + prefix + "_name.tmp1" for prefix in "abcd"])

        test_pipeline.originate(task_func   = generate_initial_files2,
                                output      = [tempdir +  "e_name.tmp1", tempdir +  "f_name.tmp1"])

        test_pipeline.originate(task_func   = generate_initial_files3,
                                output      = [tempdir +  "g_name.tmp1", tempdir +  "h_name.tmp1"])

        test_pipeline.originate(task_func   = generate_initial_files4,
                                output      = tempdir +  "i_name.tmp1")

        test_pipeline.collate(  task_func   = test_task2,
                                input       = [generate_initial_files1,
                                               generate_initial_files2,
                                               generate_initial_files3,
                                               generate_initial_files4],
                                filter      = formatter(),
                                output      = "{path[0]}/all.tmp2")

        test_pipeline.transform(task_func   = test_task3,
                                input       = test_task2,
                                filter      = suffix(".tmp2"),
                                output      = ".tmp3")

        test_pipeline.transform(task_func   = test_task4,
                                input       = test_task3,
                                filter      = suffix(".tmp3"),
                                output      = ".tmp4")
        return test_pipeline
Example 32
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        # new sample name = OHI031002-P02F04
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)-(?P<tumor>[TN]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'),

        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}-{tumor[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),

        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}', '{tumor[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{sample[0]}_{tumor[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # High quality and primary alignments
    pipeline.transform(
        task_func=stages.primary_bam,
        name='primary_bam',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        output='.primary.bam')

    # index bam file
    pipeline.transform(
        task_func=stages.index_sort_bam_picard,
        name='index_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.bam.bai')

    # Clip the primer_seq from BAM File
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam')
        .follows('index_bam'))

    ###### GATK VARIANT CALLING - MuTect2 ######

    # Call somatics variants using MuTect2
    pipeline.transform(
        task_func=stages.call_mutect2_gatk,
        name='call_mutect2_gatk',
        input=output_from('clip_bam'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+)_T.primary.primerclipped.bam'),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_N.primary.primerclipped.bam'),
        # extras=['{sample[0]}'],
        output='variants/mutect2/{sample[0]}.mutect2.vcf')
        # .follows('clip_bam')

    ###### GATK VARIANT CALLING - MuTect2 ######

    # -------- VEP ----------
    # Apply NORM
    (pipeline.transform(
        task_func=stages.apply_vt,
        name='apply_vt',
        input=output_from('call_mutect2_gatk'),
        filter=suffix('.mutect2.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vcf')
        .follows('call_mutect2_gatk'))
    #
    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('apply_vt'),
        filter=suffix('.mutect2.vt.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vep.vcf')
        .follows('apply_vt'))
    #
    # Apply vcfanno
    (pipeline.transform(
        task_func=stages.apply_vcfanno,
        name='apply_vcfanno',
        input=output_from('apply_vep'),
        filter=suffix('.mutect2.vt.vep.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.annotated.vcf')
        .follows('apply_vep'))

    return pipeline
Example 33
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(task_func=stages.chrom_intervals_gatk,
                       name='chrom_intervals_gatk',
                       input=output_from('mark_duplicates_picard'),
                       filter=suffix('.sort.dedup.bam'),
                       output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam').follows(
            'mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(task_func=stages.base_recalibration_gatk,
                       name='base_recalibration_gatk',
                       input=output_from('local_realignment_gatk'),
                       filter=suffix('.sort.dedup.realn.bam'),
                       output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam').follows(
            'local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(task_func=stages.call_variants_gatk,
                       name='call_variants_gatk',
                       input=output_from('print_reads_gatk'),
                       filter=suffix('.sort.dedup.realn.recal.bam'),
                       output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_variants_gatk'),
                   output='COMPLEXO.mergedgvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.mergedgvcf.vcf'),
                       output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(task_func=stages.snp_recalibrate_gatk,
                       name='snp_recalibrate_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.genotyped.vcf'),
                       output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['COMPLEXO.snp_recal', 'COMPLEXO.snp_tranches']),
        output='.recal_SNP.vcf').follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(
            ['COMPLEXO.indel_recal', 'COMPLEXO.indel_tranches']),
        output='.recal_INDEL.vcf').follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['COMPLEXO.recal_INDEL.vcf']),
        output='.combined.vcf').follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(task_func=stages.select_variants_gatk,
                       name='select_variants_gatk',
                       input=output_from('combine_variants_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.selected.vcf')

    return pipeline
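# A hedged sketch (not part of the original source): the Stages classes these
# make_pipeline functions call into are not shown in this listing. They follow a
# simple pattern -- one method per stage, receiving the Ruffus-supplied input,
# output and extras, and reading options from the shared state. The class body,
# the 'reference' option name and the bwa/samtools command below are assumptions
# for illustration only.
import subprocess

class SketchStages(object):
    def __init__(self, state):
        # Keep the state so every stage can read configuration options.
        self.state = state

    def original_fastqs(self, output):
        # Dummy stage: the FASTQ files already exist, so there is nothing to do.
        pass

    def align_bwa(self, inputs, bam_output, sample):
        # Ruffus passes [R1, R2] here because of formatter() + add_inputs() above,
        # then the output path, then the extras (the sample name).
        fastq_r1, fastq_r2 = inputs
        reference = self.state.config.get_option('reference')  # assumed option name
        command = 'bwa mem {ref} {r1} {r2} | samtools view -b - > {bam}'.format(
            ref=reference, r1=fastq_r1, r2=fastq_r2, bam=bam_output)
        subprocess.check_call(command, shell=True)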
Example No. 34
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name="rnapipe")

    # Get the details of the experiment (samples, config, inputs, ...)
    experiment = Experiment(state)

    # Get reference file locations
    reference_genome = state.config.get_options("reference_genome")
    gene_ref = state.config.get_options("gene_ref")

    # Print out samples
    sample_text = [s.info() for s in experiment.sample_list]
    logging.info("Analysis samples:\n{}".format("\n".join(sample_text)))

    # Stages are dependent on the state. Experiment object is also passed so
    # we can access metadata later.
    stages = PipelineStages(state, experiment=experiment)

    # Make directories
    output_dir = get_output_paths(
        results_dir=state.config.get_options("results_dir"),
        default_paths=OUTPUT_PATHS)
    make_output_dirs(output_dir)
    logging.debug(output_dir)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.do_nothing,
                       name="original_fastqs",
                       output=experiment.R1_files)
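    # stages.do_nothing is assumed to be a no-op (roughly "def do_nothing(self, *args): pass");
    # its only purpose is to register the existing files as nodes in the pipeline graph.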

    # Create reference index for alignment
    if not experiment.index_provided:
        pipeline.originate(task_func=stages.do_nothing,
                           name="reference_genome",
                           output=reference_genome)

        if experiment.alignment_method == "star":
            # Create reference index for STAR
            pipeline.transform(task_func=stages.create_star_index,
                               name="create_star_index",
                               input=output_from("reference_genome"),
                               filter=formatter(".*"),
                               add_inputs=add_inputs(gene_ref),
                               output=path_list_join(
                                   output_dir["star_index"],
                                   ["SA", "Genome", "genomeParameters.txt"]),
                               extras=[output_dir["star_index"]])

        elif experiment.alignment_method == "hisat2":
            # Create reference index for HISAT2
            hisat_basename = path.join(output_dir["hisat_index"], "genome")
            pipeline.transform(
                task_func=stages.create_hisat_index,
                name="create_hisat_index",
                input=output_from("reference_genome"),
                filter=formatter(".*"),
                add_inputs=add_inputs(gene_ref),
                output=path_list_join(output_dir["hisat_index"],
                                      ["genome.1.ht2", "genome.2.ht2"]),
                extras=[hisat_basename])
    else:
        # Don't create index if index is supplied
        if experiment.alignment_method == "star":
            output_dir["star_index"] = state.config.get_options("star_index")
            pipeline.originate(task_func=stages.do_nothing,
                               name="create_star_index",
                               output=path_list_join(
                                   output_dir["star_index"],
                                   ["SA", "Genome", "genomeParameters.txt"]))
        elif experiment.alignment_method == "hisat2":
            hisat_basename = state.config.get_options("hisat_index")
            output_dir["hisat_index"] = path.dirname(hisat_basename)
            prefix = path.basename(hisat_basename)
            pipeline.originate(task_func=stages.do_nothing,
                               name="create_hisat_index",
                               output=path_list_join(
                                   output_dir["hisat_index"], [
                                       "{prefix}.1.ht2".format(prefix=prefix),
                                       "{prefix}.2.ht2".format(prefix=prefix)
                                   ]))

    # Pre-trim FastQC
    if experiment.paired_end:
        pipeline.transform(
            task_func=stages.fastqc,
            name="fastqc",
            input=output_from("original_fastqs"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
            output=path_list_join(
                output_dir["fastqc"],
                ["{sample[0]}_R1_fastqc.zip", "{sample[0]}_R2_fastqc.zip"]),
            extras=[output_dir["fastqc"]])
    else:
        pipeline.transform(task_func=stages.fastqc,
                           name="fastqc",
                           input=output_from("original_fastqs"),
                           filter=suffix(".fastq.gz"),
                           output="_fastqc.zip",
                           output_dir=output_dir["fastqc"],
                           extras=[output_dir["fastqc"]])

    # Trimmomatic
    if experiment.trim_reads and experiment.paired_end:
        pipeline.transform(
            task_func=stages.trim_reads,
            name="trim_reads",
            input=output_from("original_fastqs"),
            # Get R1 file and the corresponding R2 file
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
            output=path_list_join(output_dir["seq"], [
                "{sample[0]}_R1.trimmed.fastq.gz",
                "{sample[0]}_R2.trimmed.fastq.gz"
            ]),
            extras=path_list_join(output_dir["seq"], [
                "{sample[0]}_R1.unpaired.fastq.gz",
                "{sample[0]}_R2.unpaired.fastq.gz"
            ]))
    elif experiment.trim_reads:
        pipeline.transform(
            task_func=stages.trim_reads,
            name="trim_reads",
            input=output_from("original_fastqs"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            output=path.join(output_dir["seq"],
                             "{sample[0]}_R1.trimmed.fastq.gz"))

    # Post-trim FastQC
    if experiment.paired_end and experiment.trim_reads:
        pipeline.transform(
            task_func=stages.fastqc,
            name="post_trim_fastqc",
            input=output_from("trim_reads"),
            filter=formatter(
                ".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.trimmed.fastq.gz"),
            output=path_list_join(output_dir["post_trim_fastqc"], [
                "{sample[0]}_R1.trimmed_fastqc.gz",
                "{sample[0]}_R2.trimmed_fastqc.gz"
            ]),
            extras=["results/qc/post_trim_fastqc/"])
    elif experiment.trim_reads:
        pipeline.transform(task_func=stages.fastqc,
                           name="post_trim_fastqc",
                           input=output_from("trim_reads"),
                           filter=suffix(".trimmed.fastq.gz"),
                           output=".trimmed_fastqc.gz",
                           output_dir=output_dir["post_trim_fastqc"],
                           extras=[output_dir["post_trim_fastqc"]])

    # If there are technical replicates, each is mapped independently.
    # This is so each technical replicate maintains a separate read group.
    if experiment.alignment_method == "star":
        align_task_name = "star_align"
        if experiment.trim_reads:
            (pipeline.transform(
                task_func=stages.star_align,
                name=align_task_name,
                input=output_from("trim_reads"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)" \
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.star.Aligned.out.bam" \
                        % output_dir["alignments"],
                extras=[output_dir["star_index"], "{sample[0]}"])
            ).follows("create_star_index")
        else:
            (pipeline.transform(
                task_func=stages.star_align,
                name=align_task_name,
                input=output_from("original_fastqs"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)" \
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.star.Aligned.out.bam" \
                        % output_dir["alignments"],
                extras=[output_dir["star_index"], "{sample[0]}"])
            ).follows("create_star_index")

    if experiment.alignment_method == "hisat2":
        align_task_name = "hisat_align"
        if experiment.trim_reads:
            (pipeline.transform(
                task_func=stages.hisat_align,
                name="hisat_align",
                input=output_from("trim_reads"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)" \
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.hisat2.bam" \
                        % output_dir["alignments"],
                extras=[hisat_basename, "{sample[0]}"])
            ).follows("create_hisat_index")
        else:
            (pipeline.transform(
                task_func=stages.hisat_align,
                name="hisat_align",
                input=output_from("original_fastqs"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)" \
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.hisat2.bam" \
                        % output_dir["alignments"],
                extras=[hisat_basename, "{sample[0]}"])
            ).follows("create_hisat_index")

    # Sort BAM by coordinates
    pipeline.transform(
        task_func=stages.sort_bam_by_coordinate,
        name="sort_bam_by_coordinate",
        input=output_from(align_task_name),
        filter=formatter(
            ".+/(?P<sample>[a-zA-Z0-9-_]+)\.(?P<method>(star|hisat2))\..*bam"),
        output=[
            "{path[0]}/{sample[0]}.{method[0]}.sorted.bam",
            "{path[0]}/{sample[0]}.{method[0]}.sorted.bam.bai"
        ])

    # Merge files with the same sample name
    if experiment.multiple_technical_replicates:
        pipeline.collate(
            task_func=stages.merge_bams,
            name="merge_bams",
            input=output_from("sort_bam_by_coordinate"),
            filter=formatter(
                ".+/(SM_)?(?P<sm>[a-zA-Z0-9-]+)[^.]*\.(?P<method>(star|hisat2)).sorted.bam"
            ),
            output=path_list_join(
                output_dir["alignments"],
                ["{sm[0]}.{method[0]}.bam", "{sm[0]}.{method[0]}.bam.bai"]))
    else:
        pipeline.transform(
            task_func=stages.create_symlinks,
            name="merge_bams",
            input=output_from("sort_bam_by_coordinate"),
            filter=formatter(
                ".+/(SM_)?(?P<sm>[a-zA-Z0-9-]+)[^.]*\.(?P<method>(star|hisat2)).sorted.bam"
            ),
            output=path_list_join(
                output_dir["alignments"],
                ["{sm[0]}.{method[0]}.bam", "{sm[0]}.{method[0]}.bam.bai"]))

    # Sort BAM by name for counting features
    pipeline.transform(task_func=stages.sort_bam_by_name,
                       name="sort_bam_by_name",
                       input=output_from("merge_bams"),
                       filter=suffix(".bam"),
                       output=".nameSorted.bam")

    # Count features with HTSeq-count
    pipeline.transform(task_func=stages.htseq_count,
                       name="htseq_count",
                       input=output_from("sort_bam_by_name"),
                       filter=suffix(".nameSorted.bam"),
                       output_dir=output_dir["counts"],
                       output=".htseq.txt")

    # Count features with featureCounts
    pipeline.transform(task_func=stages.featurecounts,
                       name="featurecounts",
                       input=output_from("sort_bam_by_name"),
                       filter=suffix(".nameSorted.bam"),
                       output_dir=output_dir["counts"],
                       output=".featureCounts.txt")

    # TODO: add multiqc step

    #     # Stringtie assembly
    #     pipeline.transform(
    #         task_func=stages.stringtie_assembly,
    #         name="stringtie_assembly",
    #         input=output_from("merge_bams"),
    #         filter=suffix(".bam"),
    #         output_dir=output_dir["stringtie_assembly"],
    #         output=".gtf")

    # Stringtie estimates
    pipeline.transform(
        task_func=stages.stringtie_estimates,
        name="stringtie_estimates",
        input=output_from("merge_bams"),
        filter=formatter(
            ".+/(?P<sm>[a-zA-Z0-9-]+)\.(?P<method>(star|hisat2)).bam"),
        output=path_list_join(output_dir["stringtie_estimates"],
                              ["{sm[0]}/{sm[0]}.gtf", "{sm[0]}/e_data.ctab"]))

    # Stringtie counts
    pipeline.collate(
        task_func=stages.stringtie_prepDE,
        name="stringtie_prepDE",
        input=output_from("stringtie_estimates"),
        filter=formatter(".+\.gtf"),
        output=path_list_join(
            output_dir["stringtie_estimates"],
            ["gene_count_matrix.csv", "transcript_count_matrix.csv"]))
    return pipeline
Example No. 35
0
def make_pipeline(state):
    """Build the pipeline by constructing stages and connecting them together"""
    # Build an empty pipeline
    pipeline = Pipeline(name="crpipe")
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option("fastqs")
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs, name="original_fastqs", output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    # pipeline.originate(
    #    task_func=stages.original_reference,
    #    name='original_reference',
    #    output=reference_file)

    # Run fastQC on the FASTQ files
    pipeline.transform(
        task_func=stages.fastqc,
        name="fastqc",
        input=output_from("original_fastqs"),
        filter=suffix(".fastq.gz"),
        output="_fastqc",
    )

    # Index the reference using BWA
    # pipeline.transform(
    #    task_func=stages.index_reference_bwa,
    #    name='index_reference_bwa',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #    name='index_reference_samtools',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output='.fa.fai')

    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])

    # # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name="align_bwa",
        input=output_from("original_fastqs"),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz"),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=["{sample[0]}"],
        # The output file name is the sample name with a .bam extension.
        output="{path[0]}/{sample[0]}.bam",
    )

    # Sort alignment with sambamba
    pipeline.transform(
        task_func=stages.sort_bam_sambamba,
        name="sort_alignment",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.sorted.bam",
    )

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name="extract_genes_bedtools",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.mmr.bam",
    )

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name="extract_chromosomes_samtools",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.chroms.bam",
    )

    # Index the MMR genes bam file with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name="index_mmr_alignment",
        input=output_from("extract_genes_bedtools"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).mmr.bam"),
        output="{path[0]}/{sample[0]}.mmr.bam.bai",
    )

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    # pipeline.transform(
    #    task_func=stages.alignment_coverage_gatk,
    #    name='alignment_coverage_gatk',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs([reference_file]),
    #    output='{path[0]}/{sample[0]}.coverage_summary',
    #    extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name="index_alignment",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.sorted.bam.bai",
    )

    # Generate alignment stats with bamtools
    pipeline.transform(
        task_func=stages.bamtools_stats,
        name="bamtools_stats",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.stats.txt",
    )

    # Extract the discordant paired-end alignments
    pipeline.transform(
        task_func=stages.extract_discordant_alignments,
        name="extract_discordant_alignments",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.discordants.unsorted.bam",
    )

    # Extract split-read alignments
    pipeline.transform(
        task_func=stages.extract_split_read_alignments,
        name="extract_split_read_alignments",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.splitters.unsorted.bam",
    )

    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name="sort_discordants",
        input=output_from("extract_discordant_alignments"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam"),
        extras=["{path[0]}/{sample[0]}.discordants"],
        output="{path[0]}/{sample[0]}.discordants.bam",
    )
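    # Illustration (hypothetical sample "S1"): stages.sort_bam receives
    # 'S1.discordants.unsorted.bam' as input, 'S1.discordants.bam' as the output
    # Ruffus tracks, and the prefix 'S1.discordants' as the extra argument that
    # is handed to samtools sort.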

    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #   task_func=stages.index_bam,
    #   name='index_discordants',
    #   input=output_from('sort_discordants'),
    #   filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #   output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort discordant reads
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name="sort_splitters",
        input=output_from("extract_split_read_alignments"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam"),
        extras=["{path[0]}/{sample[0]}.splitters"],
        output="{path[0]}/{sample[0]}.splitters.bam",
    )

    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #    task_func=stages.index_bam,
    #    name='index_splitters',
    #    input=output_from('sort_splitters'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #    output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (
        pipeline.transform(
            task_func=stages.structural_variants_lumpy,
            name="structural_variants_lumpy",
            input=output_from("sort_alignment"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
            add_inputs=add_inputs(["{path[0]}/{sample[0]}.splitters.bam", "{path[0]}/{sample[0]}.discordants.bam"]),
            output="{path[0]}/{sample[0]}.lumpy.vcf",
        )
        .follows("index_alignment")
        .follows("sort_splitters")
        .follows("sort_discordants")
    )

    # Call genotypes on lumpy output using SVTyper
    # (pipeline.transform(
    #    task_func=stages.genotype_svtyper,
    #    name='genotype_svtyper',
    #    input=output_from('structural_variants_lumpy'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam', '{path[0]}/{sample[0]}.splitters.bam']),
    #    output='{path[0]}/{sample[0]}.svtyper.vcf')
    #    .follows('align_bwa')
    #    .follows('sort_splitters')
    #    .follows('index_alignment')
    #    .follows('index_splitters')
    #    .follows('index_discordants'))

    # Call SVs with Socrates
    (
        pipeline.transform(
            task_func=stages.structural_variants_socrates,
            name="structural_variants_socrates",
            input=output_from("sort_alignment"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
            # output goes to {path[0]}/socrates/
            output="{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt",
            extras=["{path[0]}"],
        )
    )

    # Call DELs with DELLY
    pipeline.merge(
        task_func=stages.deletions_delly,
        name="deletions_delly",
        input=output_from("sort_alignment"),
        output="delly.DEL.vcf",
    )

    # Call DUPs with DELLY
    pipeline.merge(
        task_func=stages.duplications_delly,
        name="duplications_delly",
        input=output_from("sort_alignment"),
        output="delly.DUP.vcf",
    )

    # Call INVs with DELLY
    pipeline.merge(
        task_func=stages.inversions_delly,
        name="inversions_delly",
        input=output_from("sort_alignment"),
        output="delly.INV.vcf",
    )

    # Call TRAs with DELLY
    pipeline.merge(
        task_func=stages.translocations_delly,
        name="translocations_delly",
        input=output_from("sort_alignment"),
        output="delly.TRA.vcf",
    )

    # Join both read pair files using gustaf_mate_joining
    # pipeline.transform(
    #    task_func=stages.gustaf_mate_joining,
    #    name='gustaf_mate_joining',
    #    input=output_from('fastq_to_fasta'),
    #    # Match the R1 (read 1) FASTA file and grab the path and sample name.
    #    # This will be the first input to the stage.
    #    # We assume the sample name may consist of only alphanumeric
    #    # characters.
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #    # Add one more input to the stage:
    #    #    1. The corresponding R2 FASTA file
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #    output='{path[0]}/{sample[0]}.joined_mates.fasta')

    # Call structural variants with pindel
    # (pipeline.transform(
    #    task_func=stages.structural_variants_pindel,
    #    name='structural_variants_pindel',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]),
    #    output='{path[0]}/{sample[0]}.pindel')
    #    .follows('index_reference_bwa')
    #    .follows('index_reference_samtools'))

    return pipeline
Example No. 36
0
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the FASTQ files
    #fastq_files = state.config.get_option('fastqs')
    fastq_files = glob.glob("fastqs/*.gz")
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('processed_fastqs')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/pass_samples')
    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/vardict')

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    pipeline.transform(
        task_func=stages.run_surecalltrimmer,
        name='run_surecalltrimmer',
        input=output_from('original_fastqs'),
        filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        add_inputs=add_inputs('fastqs/{sample[0]}_R2.fastq.gz'),
        #filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.fastq.gz'),
        #add_inputs=add_inputs('fastqs/{sample[0]}_R3_001.fastq.gz'),
        extras=['{sample[0]}'],
        # output only needs to know about one file to track progress of the pipeline, but the second certainly exists after this step.
        output='processed_fastqs/{sample[0]}_R1.processed.fastq.gz')
    #output='processed_fastqs/{sample[0]}_R1_001.processed.fastq.gz')
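    # The matching R2 file (processed_fastqs/{sample}_R2.processed.fastq.gz) is
    # also produced by the trimming step; only R1 is declared above because one
    # tracked output per job is enough, and align_bwa below re-derives the R2 name
    # via add_inputs.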

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('run_surecalltrimmer'),
        filter=formatter(
            'processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.processed.fastq.gz'
        ),
        add_inputs=add_inputs(
            'processed_fastqs/{sample[0]}_R2.processed.fastq.gz'),
        #filter=formatter('processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.processed.fastq.gz'),
        #add_inputs=add_inputs('processed_fastqs/{sample[0]}_R3_001.processed.fastq.gz'),
        extras=['{sample[0]}'],
        output='alignments/{sample[0]}.bam')

    # Run LocatIt from Agilent. This should produce sorted BAM files, so no sorting is needed at the next step
    pipeline.collate(task_func=stages.run_locatit,
                     name='run_locatit',
                     input=output_from('align_bwa', 'original_fastqs'),
                     filter=regex(r'.+/(.+_L\d\d\d).+'),
                     output=r'alignments/\1.locatit.bam')

    pipeline.transform(task_func=stages.sort_bam,
                       name='sort_bam',
                       input=output_from('run_locatit'),
                       filter=suffix('.locatit.bam'),
                       output='.sorted.locatit.bam')

    # # # # # Metrics stages # # # # #
    # generate mapping metrics (post locatit)
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    # Intersect the bam file with the region of interest
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    # Calculate coverage metrics from the intersected bam file
    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    # Count the number of mapped reads
    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    # Count the number of on-target reads
    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    # Count the number of total reads
    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    # Generate summary metrics from the stats files produced
    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        #filter=regex(r'.+/(.+BS\d{4,6}.+S\d+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])
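    # Illustration (hypothetical sample "S1"): S1.bedtools_hist_all.txt,
    # S1.mapped_to_genome.txt, S1.mapped_to_target.txt and S1.total_raw_reads.txt
    # all reduce to \1='S1', so the regex above collates the four metric files
    # into one generate_stats job per sample, writing
    # metrics/summary/all_sample.summary.S1.txt.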
    # # # # # Metrics stages end # # # # #

    # # # # # Checking metrics and calling # # # # #
    # Originate to set the location of the metrics summary file
    (pipeline.originate(
        task_func=stages.grab_summary_file,
        name='grab_summary_file',
        output='all_sample.summary.txt').follows('generate_stats'))
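    # The .follows('generate_stats') is what makes this dummy originate work:
    # all_sample.summary.txt is presumably written by generate_stats (its name is
    # passed there as an extra), and this stage simply registers that file as the
    # starting point for the filtering stages below.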

    # Awk command to produce a list of bam files passing filters
    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt')

    # Touch passed bams to the pass_samples folder and pass the glob of that folder to HaplotypeCaller
    pipeline.subdivide(name='passed_filter_files',
                       task_func=stages.read_samples,
                       input=output_from('filter_stats'),
                       filter=formatter(),
                       output="metrics/pass_samples/*.bam")

    # Call variants using GATK
    (pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/gatk/{sample[0]}.g.vcf').follows('sort_bam'))

    # Call variants with vardict
    (pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}']).follows('sort_bam'))

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    return pipeline
def final_task(infiles, outfiles, *extra_params):
    # NOTE: the name, signature and opening docstring of this task were truncated
    # in the source; they are assumptions reconstructed from the body below.
    """
    final task
    """
    with open(tempdir + "jobs.start", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))
    test_job_io(infiles, outfiles, extra_params)
    with open(tempdir + "jobs.finish", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))


#
#   Use equivalent but new style syntax
#
test_pipeline = Pipeline("test")

test_pipeline.originate(task_func = task1,
                   output    = [tempdir + d for d in ('a.1', 'b.1', 'c.1')])\
    .follows(mkdir(tempdir))\
    .posttask(lambda: do_write(test_file, "Task 1 Done\n"))

test_pipeline.transform(task_func = task2,
                   input     = task1,
                   filter    = suffix(".1"),
                   output    = ".2") \
    .posttask(lambda: do_write(test_file, "Task 2 Done\n"))

test_pipeline.transform(task3, task2, regex('(.*).2'), inputs([r"\1.2", tempdir + "a.1"]), r'\1.3')\
    .posttask(lambda: do_write(test_file, "Task 3 Done\n"))


test_pipeline.transform(task4, tempdir + "*.1", suffix(".1"), ".4")\
    .follows(task1)\
    .posttask(lambda: do_write(test_file, "Task 4 Done\n"))

# ___________________________________________________________________________
#
#   check_regex_out_of_range_regex_reference_error_task
# ___________________________________________________________________________
def check_regex_out_of_range_regex_reference_error_task(infiles, outfile,
                                                        prefix1,
                                                        prefix2,
                                                        extension):
    raise Exception("Should blow up first")


test_pipeline = Pipeline("test")

test_pipeline.originate(task_func=generate_initial_files1,
                        output=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcdefghi"])

test_pipeline.transform(task_func=check_regex_task,
                        input=generate_initial_files1,
                        filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.tmp1)"),
                        output=r"\1/\g<PREFIX>\3.tmp2",  # output file
                        extras=[r"\2",                # extra: prefix = \2
                                r"\g<PREFIX>",        # extra: prefix = \2
                                r"\4"])               # extra: extension
test_pipeline.transform(task_func=check_regex_unmatched_task,
                        input=generate_initial_files1,
                        filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.xxx)"),
                        output=r"\1/\g<PREFIXA>\3.tmp2",  # output file
                        extras=[r"\2",                 # extra: prefix = \2
                                r"\g<PREFIX>",         # extra: prefix = \2
                                r"\4"])                # extra: extension
def truncated_task(infiles, outfiles, *extra_params):
    # NOTE: this example's header and the start of this task were truncated in
    # the source; the name, signature and the "jobs.start" line below are
    # assumptions mirroring the equivalent task earlier in this listing.
    with open(tempdir + "jobs.start", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))
    test_job_io(infiles, outfiles, extra_params)
    with open(tempdir + "jobs.finish",  "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))





#
#   Use equivalent but new style syntax
#
test_pipeline = Pipeline("test")

test_pipeline.originate(task_func = task1,
                   output    = [tempdir + d for d in ('a.1', 'b.1', 'c.1')])\
    .follows(mkdir(tempdir))\
    .posttask(lambda: do_write(test_file, "Task 1 Done\n"))

test_pipeline.transform(task_func = task2,
                   input     = task1,
                   filter    = suffix(".1"),
                   output    = ".2") \
    .posttask(lambda: do_write(test_file, "Task 2 Done\n"))

test_pipeline.transform(task3, task2, regex('(.*).2'), inputs([r"\1.2", tempdir + "a.1"]), r'\1.3')\
    .posttask(lambda: do_write(test_file, "Task 3 Done\n"))


test_pipeline.transform(task4, tempdir + "*.1", suffix(".1"), ".4")\
    .follows(task1)\
    .posttask(lambda: do_write(test_file, "Task 4 Done\n"))
Example No. 40
0
def check_combinations_with_replacement3_merged_task(infiles, outfile):
    with open(outfile, "w") as p:
        for infile in sorted(infiles):
            with open(infile) as ii:
                p.write(ii.read())


def cleanup_tmpdir():
    os.system('rm -f %s %s' %
              (os.path.join(tempdir, '*'), RUFFUS_HISTORY_FILE))


test_pipeline1 = Pipeline("test1")
test_pipeline2 = Pipeline("test2")
gen_task1 = test_pipeline1.originate(task_func=generate_initial_files1,
                                     name="WOWWWEEE",
                                     output=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcd"])
test_pipeline1.originate(task_func=generate_initial_files2,
                         output=[tempdir + "/e_name.tmp1", tempdir + "/f_name.tmp1"])
test_pipeline1.originate(task_func=generate_initial_files3,
                         output=[tempdir + "/g_name.tmp1", tempdir + "/h_name.tmp1"])
test_pipeline1.product(task_func=check_product_task,
                       input=[tempdir + "/" + prefix +
                              "_name.tmp1" for prefix in "abcd"],
                       filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"),
                       input2=generate_initial_files2,
                       filter2=formatter(),
                       input3=generate_initial_files3,
                       filter3=formatter(r"tmp1$"),
                       output="{path[0][0]}/{FILE_PART[0][0]}.{basename[1][0]}.{basename[2][0]}.tmp2",
                       extras=["{basename[0][0][0]}{basename[1][0][0]}{basename[2][0][0]}",       # extra: prefices only (abcd etc)
Example No. 41
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='methylation_pipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = PipelineStages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Run bismark genome preparation on the reference genome
    pipeline.originate(task_func=stages.bismark_genome_prepare,
                       name='bismark_genome_prepare',
                       output='reference/Bisulfite_Genome')

    # Run FASTQC on the input fastq files
    pipeline.transform(
        task_func=stages.fastqc,
        name='fastqc',
        input=output_from('original_fastqs'),
        filter=formatter('(?P<path>.+)/(?P<filename>.+).fastq.gz'),
        output='{path[0]}/{filename[0]}_fastqc')

    # Run bismark on the input fastq files
    (pipeline.transform(
        task_func=stages.bismark,
        name='bismark',
        input=output_from('original_fastqs'),
        filter=formatter(
            '(?P<path>.+)/(?P<filename>.+)_R1_(?P<num>.+).fastq.gz'),
        add_inputs=add_inputs('{path[0]}/{filename[0]}_R2_{num[0]}.fastq.gz'),
        extras=['{path[0]}/bismark_output/'],
        output=
        '{path[0]}/bismark_output/{filename[0]}_R1_{num[0]}_bismark_bt2_pe.sam.gz'
    )).follows('bismark_genome_prepare')

    # Run bismark methylation extractor on the bismark output
    pipeline.transform(
        task_func=stages.bismark_methylation_extractor,
        name='bismark_methylation_extractor',
        input=output_from('bismark'),
        filter=formatter(
            '(?P<path>.+)/(?P<filename>.+)_R1_(?P<num>.+)_bismark_bt2_pe.sam.gz'
        ),
        extras=['{path[0]}'],
        output=
        '{path[0]}/CpG_context_{filename[0]}_R1_{num[0]}_bismark_bt2_pe.sam.gz.txt'
    )

    # Run methpat on the bismark methylation extractor output
    pipeline.transform(
        task_func=stages.methpat,
        name='methpat',
        input=output_from('bismark_methylation_extractor'),
        filter=formatter('(?P<path>.+)/CpG_context_(?P<filename>.+)'),
        extras=['{path[0]}', '{filename[0]}'],
        output='{path[0]}/CpG_context_{filename[0]}.methpat.html')

    return pipeline
def test_combinations_with_replacement3_merged_task(infiles, outfile):
    with open(outfile, "w") as p:
        for infile in sorted(infiles):
            with open(infile) as ii:
                p.write(ii.read())


def cleanup_tmpdir():
    os.system('rm -f %s %s' %
              (os.path.join(tempdir, '*'), RUFFUS_HISTORY_FILE))


test_pipeline1 = Pipeline("test1")
test_pipeline2 = Pipeline("test2")
gen_task1 = test_pipeline1.originate(
    task_func=generate_initial_files1,
    name="WOWWWEEE",
    output=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcd"])
test_pipeline1.originate(
    task_func=generate_initial_files2,
    output=[tempdir + "/e_name.tmp1", tempdir + "/f_name.tmp1"])
test_pipeline1.originate(
    task_func=generate_initial_files3,
    output=[tempdir + "/g_name.tmp1", tempdir + "/h_name.tmp1"])
test_pipeline1.product(
    task_func=test_product_task,
    input=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcd"],
    filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"),
    input2=generate_initial_files2,
    filter2=formatter(),
    input3=generate_initial_files3,
    filter3=formatter(r"tmp1$"),
Example No. 43
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='thepipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        # filter=formatter('(?P<path>.+)/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+)_1.fastq.gz'),
        # 1_HFYLVCCXX:2:TCCGCGAA_2_GE0343_1.fastq.gz
        # 1_HCJWFBCXX:GGACTCCT_L001_9071584415739518822-AGRF-023_R2.fastq.gz
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # e.g. C2WPF.5_Solexa-201237_5_X4311_1.fastq.gz
        add_inputs=add_inputs(
            '{path[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{readid[0]}', '{lib[0]}', '{lane[0]}', '{sample[0]}'],
        # extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.bam')
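    # Worked example for the file name quoted above,
    # 1_HCJWFBCXX:GGACTCCT_L001_9071584415739518822-AGRF-023_R1.fastq.gz:
    # readid='1', lib='HCJWFBCXX:GGACTCCT', lane='L001',
    # sample='9071584415739518822-AGRF-023', so the BAM is written to
    # alignments/9071584415739518822-AGRF-023/1_HCJWFBCXX:GGACTCCT_L001_9071584415739518822-AGRF-023.bam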

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Local realignment using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(
        task_func=stages.realigner_target_creator,
        name='realigner_target_creator',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('realigner_target_creator'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.bam'),
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        filter=formatter(
            # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'),
            # '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        # output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Merge lane bams to sample bams
    pipeline.collate(
        task_func=stages.merge_sample_bams,
        name='merge_sample_bams',
        filter=formatter(
            # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).sort.dedup.realn.recal.bam'),
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).sort.dedup.realn.recal.bam'),
        # inputs=add_inputs('alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        input=output_from('print_reads_gatk'),
        output='alignments/{sample[0]}/{sample[0]}.merged.bam')
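    # The output above keys only on sample[0], so all lane-level BAMs for one
    # sample (e.g. hypothetical 1_LIB:A_L001_S1.sort.dedup.realn.recal.bam and
    # 2_LIB:A_L002_S1.sort.dedup.realn.recal.bam) share the output path
    # alignments/S1/S1.merged.bam and are merged in a single collate job.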

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard2',
        input=output_from('merge_sample_bams'),
        # filter=formatter(
        # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).merged.bam'),
        filter=suffix('.merged.bam'),
        # XXX should make metricsup an extra output?
        output=['.merged.dedup.bam', '.metricsdup'])

    # Local realignment2 using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(
        task_func=stages.realigner_target_creator,
        name='realigner_target_creator2',
        input=output_from('mark_duplicates_picard2'),
        filter=suffix('.dedup.bam'),
        output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk2',
        input=output_from('realigner_target_creator2'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.intervals'),
        # filter=formatter(
        # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{sample[0]}.merged.dedup.bam'),
        output='alignments/{sample[0]}/{sample[0]}.merged.dedup.realn.bam')
        .follows('mark_duplicates_picard2'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('local_realignment_gatk2'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.dedup.realn.bam'),
        output='variants/{sample[0]}.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_haplotypecaller_gatk'),
        output='variants/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.combined.vcf'),
        output='.raw.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(['ALL.snp_recal', 'ALL.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(
            ['ALL.indel_recal', 'ALL.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['ALL.recal_INDEL.vcf']),
        # output='.combined.vcf')
        output='ALL.raw.vqsr.vcf')
        .follows('apply_indel_recalibrate_gatk'))
    #
    # # Select variants using GATK
    # pipeline.transform(
    #     task_func=stages.select_variants_gatk,
    #     name='select_variants_gatk',
    #     input=output_from('combine_variants_gatk'),
    #     filter=suffix('.combined.vcf'),
    #     output='.selected.vcf')


    return pipeline
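
# Illustrative usage of the factory function above (not part of the original
# example, and assuming it is named make_pipeline(state) like the other
# examples): the returned Pipeline object is normally driven through Ruffus'
# run() method.
#
#   pipeline = make_pipeline(state)
#   pipeline.run(multiprocess=4, verbose=1)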
Example No. 44
0
def make_pipeline_process(state):
    '''Build the pipeline that combines and post-processes variant calls from multiple runs'''

    # Define empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the directories to be combined for variant calling
    run_directories = state.config.get_option('runs')
    # Grab files from each of the processed directories in "runs"
    gatk_files = []
    for directory in run_directories:
        gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf'))

    stages = Stages(state)

    # Dummy stage to take the globbed outputs of each run that is to be processed
    pipeline.originate(task_func=stages.glob_gatk,
                       name='glob_gatk',
                       output=gatk_files)
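    # The originate() task function is called once per output path. A
    # pass-through implementation (hypothetical; stages.glob_gatk is defined
    # elsewhere) only needs to confirm the pre-existing g.vcf is present:
    #
    #   def glob_gatk(self, output):
    #       if not os.path.exists(output):
    #           raise IOError('expected GATK g.vcf not found: %s' % output)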

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('glob_gatk'),
                   output='processed/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Apply GT filters to genotyped vcf
    pipeline.transform(task_func=stages.genotype_filter_gatk,
                       name='genotype_filter_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.gt-filter.vcf')

    # Decompose and normalise multiallelic sites
    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('genotype_filter_gatk'),
                       filter=suffix('.raw.gt-filter.vcf'),
                       output='.raw.gt-filter.decomp.norm.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.raw.gt-filter.decomp.norm.vcf'),
                       output='.raw.gt-filter.decomp.norm.annotate.vcf')

    # Filter vcf
    pipeline.transform(
        task_func=stages.gatk_filter,
        name='gatk_filter',
        input=output_from('variant_annotator_gatk'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vcf')

    # Apply VEP
    pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('gatk_filter'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf')

    ####### vardict stuff

    vardict_files = []
    for directory in run_directories:
        vardict_files.extend(
            glob.glob(directory + '/variants/vardict/*sorted.vcf.gz'))

    # Dummy stage to take the globbed outputs of each run that is to be processed
    pipeline.originate(task_func=stages.glob_vardict,
                       name='glob_vardict',
                       output=vardict_files)

    safe_make_dir('processed/vardict')

    # Concatenate all VarDict VCFs
    pipeline.merge(task_func=stages.concatenate_vcfs,
                   name='concatenate_vcfs',
                   input=output_from('glob_vardict'),
                   output='processed/vardict/combined.vcf.gz')

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise_vardict',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise_vardict'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep_vardict',
        input=output_from('vt_decompose_normalise_vardict'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf').follows('index_final_vcf'))
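    # The .follows('index_final_vcf') above is an ordering-only dependency
    # (explanatory comment, not from the original code): the .tbi index is not
    # a declared input of apply_vep_vardict, so without it Ruffus would be
    # free to run VEP before the index exists.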

    return pipeline
Example No. 45
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name="radpipe")

    # Stages are dependent on the state
    stages = PipelineStages(state)

    # Get a list of library objects.
    libraries = parse_libraries(
        libraries=state.config.get_options("libraries"))

    # Get a list of input files
    input_files = [l.files for l in libraries]
    # input_files = [item for sublist in input_files for item in sublist]
    state.logger.info("Input files: " + str(input_files))

    # Get a list of all samples for each library
    samples_dict = OrderedDict()
    for l in libraries:
        samples_dict[l.name] = l.samples
    state.logger.debug("Samples: " + str(samples_dict))

    # Make sure that there are no duplicate samples
    sample_list = [
        item for sublist in samples_dict.values() for item in sublist
    ]
    sample_counts = Counter(sample_list)
    for sample in sample_counts:
        if sample_counts[sample] > 1:
            print("Sample {} appears {} times in the barcodes files. "
                  "Sample names must be unique".format(sample,
                                                       sample_counts[sample]))
            sys.exit(radpipe.error_codes.INVALID_INPUT_FILE)

    # Define output directories
    output_dir = get_output_paths(state)
    state.logger.debug(output_dir)

    # Allow multiple comma-separated tasks
    if len(state.options.target_tasks) == 1:
        state.options.target_tasks = state.options.target_tasks[0].split(",")
    if len(state.options.forced_tasks) == 1:
        state.options.forced_tasks = state.options.forced_tasks[0].split(",")
    state.logger.debug("Target tasks: " + str(state.options.target_tasks))
    state.logger.debug("Forced tasks: " + str(state.options.forced_tasks))

    # Check if alignment_method is valid
    alignment_method = state.config.get_options(
        "alignment_method").strip().lower()
    if alignment_method not in ["bwa mem", "bowtie"]:
        print("Error: Invalid alignment_method in config file. " \
              "Valid options are ['bwa mem', 'bowtie'].")
        sys.exit(radpipe.error_codes.INVALID_ARGUMENT)
    if alignment_method == "bwa mem":
        align_task_name = "bwa_mem"
        index_task_name = "bwa_index"
    else:
        align_task_name = "bowtie"
        index_task_name = "bowtie_index"

    # TODO: Refactor this
    # If 'alignment' is in target_tasks or forced_tasks, specify which
    # type of alignment job
    if "alignment" in state.options.target_tasks:
        index = state.options.target_tasks.index("alignment")
        state.options.target_tasks[index] = align_task_name
    if "alignment" in state.options.forced_tasks:
        index = state.options.forced_tasks.index("alignment")
        state.options.forced_tasks[index] = align_task_name

    # If 'build_index' is in target_tasks or forced_tasks, specify which
    # type of index job
    if "build_index" in state.options.target_tasks:
        index = state.options.target_tasks.index("build_index")
        state.options.target_tasks[index] = index_task_name
    if "build_index" in state.options.forced_tasks:
        index = state.options.forced_tasks.index("build_index")
        state.options.forced_tasks[index] = index_task_name
    state.logger.debug(state)

    # Whether to include filter_bam stage or not
    filter_bams = False
    try:
        samtools_view_options = state.config.get_options(
            "samtools_view_options")
        if samtools_view_options:
            filter_bams = True
    except:
        pass
    state.logger.info("Filter bams: {}".format(filter_bams))

    # Population map filenames
    popmap_file = "{output_dir}/{name}_popmap.txt".format(
        output_dir=output_dir["populations"],
        name=state.config.get_options("analysis_id"))
    try:
        config_popmap_file = state.config.get_options("popmap_file")
        if config_popmap_file:
            state.logger.info(
                "Using popmap file: {}".format(config_popmap_file))
        else:
            raise Exception
    except Exception:
        config_popmap_file = None
        state.logger.info("Creating new popmap file: {}".format(popmap_file))

    # Population r values
    populations_r = state.config.get_options("populations_r")
    assert (isinstance(populations_r, list))

    # Dummy stages. These do nothing except provide a node at the beginning
    # for the pipeline graph, giving the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.do_nothing,
                       name="original_fastqs",
                       output=input_files)

    pipeline.originate(task_func=stages.do_nothing,
                       name="reference_genome",
                       output=state.config.get_options("reference_genome"))

    # Create a copy of the population map file needed for stacks, or create
    # one denovo using the sample list.
    pipeline.originate(task_func=stages.create_popmap_file,
                       name="create_popmap_file",
                       output=[popmap_file],
                       extras=[config_popmap_file, sample_list])

    # Create index for reference genome based on alignment method.
    if alignment_method == "bwa mem":
        pipeline.transform(
            task_func=stages.bwa_index,
            name="bwa_index",
            input=output_from("reference_genome"),
            filter=formatter(".+/(?P<ref>[^/]+).(fa|fasta)"),
            output=path_list_join(output_dir["reference"],
                                  ["reference.fa.bwt", "reference.fa.sa"]),
            extras=[output_dir["reference"]])

    if alignment_method == "bowtie":
        pipeline.transform(task_func=stages.bowtie_index,
                           name="bowtie_index",
                           input=output_from("reference_genome"),
                           filter=formatter(".+/(?P<ref>[^/]+).(fa|fasta)"),
                           output=path_list_join(
                               output_dir["reference"],
                               ["reference.1.ebwt", "reference.rev.1.ebwt"]),
                           extras=[output_dir["reference"]])

    # FastQC
    pipeline.transform(
        task_func=stages.fastqc,
        name="fastqc",
        input=output_from("original_fastqs"),
        filter=formatter(".+/(?P<lib>[^/]+)/(?P<fn>[^/]+).(fastq|fq).gz"),
        output="%s/{lib[0]}/{fn[0]}_fastqc.zip" % output_dir["fastqc"],
        extras=[output_dir["fastqc"], "{lib[0]}"])

    # MultiQC: FastQC
    pipeline.merge(task_func=stages.multiqc_fastqc,
                   name="multiqc_fastqc",
                   input=output_from("fastqc"),
                   output="%s/multiqc_fastqc_report.html" % output_dir["qc"],
                   extras=[output_dir["qc"], output_dir["fastqc"]])

    # Stacks: Process RAD-Tags
    pipeline.transform(task_func=stages.process_radtags,
                       name="process_radtags",
                       input=output_from("original_fastqs"),
                       filter=formatter(".+/(?P<lib>[^/]+)/[^/]+"),
                       output="%s/{lib[0]}/{lib[0]}.success" %
                       output_dir["process_radtags"],
                       extras=[
                           output_dir["process_radtags"], "{lib[0]}",
                           state.config.get_options("renz_1"),
                           state.config.get_options("renz_2"),
                           state.config.get_options("process_radtags_options")
                       ])

    # Create a list for alignment with the input fastq files from process_radtags
    process_radtags_outputs = []
    for l in libraries:
        for s in l.samples:
            base = "{dir}/{lib}/{sample}".format(
                dir=output_dir["process_radtags"], lib=l.lib_id, sample=s)
            process_radtags_outputs.append(
                [base + ".1.fq.gz", base + ".2.fq.gz"])
    # print(process_radtags_outputs)
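    # Each element of process_radtags_outputs is an [R1, R2] pair, e.g.
    # (illustrative paths):
    #   ['<process_radtags_dir>/lib1/sampleA.1.fq.gz',
    #    '<process_radtags_dir>/lib1/sampleA.2.fq.gz']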

    # Alignment
    if align_task_name == "bwa_mem":
        (pipeline.transform(
            task_func=stages.bwa_align,
            name=align_task_name,
            input=process_radtags_outputs,
            filter=formatter(".+/(?P<sm>[^/]+).1.fq.gz"),
            output="%s/{sm[0]}.bwa.bam" % output_dir["alignments"],
            extras=[
                os.path.join(output_dir["reference"], "reference.fa"),
                "{path[0]}", output_dir["alignments"], "{sm[0]}",
                state.config.get_options("alignment_options")
            ])).follows("bwa_index").follows("process_radtags")

    if align_task_name == "bowtie":
        (pipeline.transform(
            task_func=stages.bowtie_align,
            name=align_task_name,
            input=process_radtags_outputs,
            filter=formatter(".+/(?P<sm>[^/]+).1.fq.gz"),
            output="%s/{sm[0]}.bowtie.bam" % output_dir["alignments"],
            extras=[
                os.path.join(output_dir["reference"], "reference"),
                "{path[0]}", output_dir["alignments"], "{sm[0]}",
                state.config.get_options("alignment_options")
            ])).follows("bowtie_index").follows("process_radtags")

    # Sort BAM and index
    pipeline.transform(task_func=stages.sort_bam,
                       name="sort_bam",
                       input=output_from(align_task_name),
                       filter=suffix(".bam"),
                       output=".sorted.bam")

    if filter_bams:
        final_bam_task_name = "filter_bam"
        pipeline.transform(
            task_func=stages.filter_bam,
            name="filter_bam",
            input=output_from("sort_bam"),
            filter=suffix(".sorted.bam"),
            output=".sorted.filtered.bam",
            extras=[state.config.get_options("samtools_view_options")])
    else:
        final_bam_task_name = "sort_bam"

    # Samtools flagstat
    pipeline.transform(task_func=stages.flagstat,
                       name="flagstat",
                       input=output_from(final_bam_task_name),
                       filter=suffix(".bam"),
                       output=".flagstat.txt",
                       output_dir=output_dir["flagstat"])

    # MultiQC: flagstat
    pipeline.merge(task_func=stages.multiqc_flagstat,
                   name="multiqc_flagstat",
                   input=output_from("flagstat"),
                   output="%s/multiqc_flagstat_report.html" % output_dir["qc"],
                   extras=[output_dir["qc"], output_dir["flagstat"]])

    # Stacks: gstacks
    pipeline.merge(task_func=stages.gstacks,
                   name="gstacks",
                   input=output_from(final_bam_task_name),
                   output="%s/catalog.fa.gz" % output_dir["gstacks"],
                   extras=[
                       output_dir["alignments"], output_dir["gstacks"],
                       align_task_name, final_bam_task_name, sample_list,
                       state.config.get_options("gstacks_options")
                   ])

    # Define outputs from each run of populations
    populations_outputs = []
    for r in populations_r:
        dir_name = "{pop_dir}/{analysis_name}_r{r}".format(
            pop_dir=output_dir["populations"],
            analysis_name=state.config.get_options("analysis_id"),
            r=r)
        populations_outputs.append(
            os.path.join(dir_name, "populations.snps.vcf"))
    # print(populations_outputs)

    # Stacks: populations
    pipeline.originate(task_func=stages.populations,
                       name="popluations",
                       output=populations_outputs,
                       extras=[
                           output_dir["gstacks"], output_dir["populations"],
                           popmap_file,
                           state.config.get_options("populations_options")
                       ]).follows("gstacks").follows("create_popmap_file")

    return pipeline
Example No. 46
0
#---------------------------------------------------------------
#   first task
@transform(create_initial_file_pairs, suffix(".start"), ".output.1")
def first_task(input_files, output_file):
    with open(output_file, "w"): pass


#---------------------------------------------------------------
#   second task
@transform(first_task, suffix(".output.1"), ".output.2")
def second_task(input_files, output_file):
    with open(output_file, "w"): pass

test_pipeline = Pipeline("test")
test_pipeline.originate(
    task_func=create_initial_file_pairs,
    output=[[tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job1.a.start', tempdir + 'job1.b.start'],
            [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job2.a.start', tempdir + 'job2.b.start'],
            [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job3.a.start', tempdir + 'job3.b.start']])
test_pipeline.transform(task_func = first_task, input = create_initial_file_pairs, filter = suffix(".start"), output = ".output.1")
test_pipeline.transform(input = first_task, filter = suffix(".output.1"), output = ".output.2", task_func= second_task)
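
# Both the decorator form above and the Pipeline-object form define the same
# two-step chain. A dry run of either can be inspected with something like
# the following (illustrative call, requires `import sys`):
#
#   test_pipeline.printout(sys.stdout, [second_task], verbose=5)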


decorator_syntax = 0
oop_syntax = 1

class Test_verbosity(unittest.TestCase):
    #___________________________________________________________________________
    #
    #   test_printout_abbreviated_path1
    #___________________________________________________________________________
    def test_printout_abbreviated_path1(self):
        """Input file exists, output doesn't exist"""
Example No. 47
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='vcf_annotation')
    # Get a list of paths to all the VCF files
    vcf_files = state.config.get_option('vcfs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original VCF files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_vcf,
        name='original_vcf',
        output=vcf_files)

    # Decompose VCF using Vt
    pipeline.transform(
        task_func=stages.decompose_vcf,
        name='decompose_vcf',
        input=output_from('original_vcf'),
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).vcf'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the VCF file name (e.g. study/family name.
        # This is needed within the stage for finding out sample specific
        # configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.decompose.normalize.vcf')

    # FILTER COMMON VARIANTS
    # ADD FILTER COMMON VARIANTS USING VEP

    # Annotate using VEP
    pipeline.transform(
        task_func=stages.annotate_vep,
        name='annotate_vep',
        input=output_from('decompose_vcf'),
        filter=suffix('.vcf'),
        output='.vep.vcf')

    # Annotate using SnpEff
    pipeline.transform(
        task_func=stages.annotate_snpeff,
        name='annotate_snpeff',
        input=output_from('annotate_vep'),
        filter=suffix('.vcf'),
        output='.snpeff.vcf')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=suffix('.sort.dedup.realn.recal.bam'),
        output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='PCExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(
            ['PCExomes.indel_recal', 'PCExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline
Example No. 48
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name. 
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')
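    # For example (illustrative path), an input of 'reads/SAMPLE1_R1.fastq.gz'
    # gives {path[0]} == 'reads' and {sample[0]} == 'SAMPLE1', so the R2 mate
    # and the output BAM are located alongside the R1 file.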

    # Sort the BAM file using Picard 
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard 
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK 
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK 
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK 
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK 
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK 
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=suffix('.sort.dedup.realn.recal.bam'),
        output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='PCExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK 
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK  
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK  
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.indel_recal', 'PCExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK  
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK 
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline
Example No. 49
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='crpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    #pipeline.originate(
    #    task_func=stages.original_reference,
    #    name='original_reference',
    #    output=reference_file)

    # Run fastQC on the FASTQ files
    pipeline.transform(task_func=stages.fastqc,
                       name='fastqc',
                       input=output_from('original_fastqs'),
                       filter=suffix('.fastq.gz'),
                       output='_fastqc')

    # Index the reference using BWA
    #pipeline.transform(
    #    task_func=stages.index_reference_bwa,
    #    name='index_reference_bwa',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #    name='index_reference_samtools',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output='.fa.fai')

    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])

    # # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort alignment with sambamba
    pipeline.transform(task_func=stages.sort_bam_sambamba,
                       name='sort_alignment',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.sorted.bam')

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name='extract_genes_bedtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.mmr.bam')

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name='extract_chromosomes_samtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.chroms.bam')

    # Index the MMR genes bam file with samtools
    pipeline.transform(task_func=stages.index_bam,
                       name='index_mmr_alignment',
                       input=output_from('extract_genes_bedtools'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).mmr.bam'),
                       output='{path[0]}/{sample[0]}.mmr.bam.bai')

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    #pipeline.transform(
    #    task_func=stages.alignment_coverage_gatk,
    #    name='alignment_coverage_gatk',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs([reference_file]),
    #    output='{path[0]}/{sample[0]}.coverage_summary',
    #    extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name='index_alignment',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.sorted.bam.bai')

    # Generate alignment stats with bamtools
    pipeline.transform(task_func=stages.bamtools_stats,
                       name='bamtools_stats',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.stats.txt')

    # Extract the discordant paired-end alignments
    pipeline.transform(task_func=stages.extract_discordant_alignments,
                       name='extract_discordant_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.discordants.unsorted.bam')

    # Extract split-read alignments
    pipeline.transform(task_func=stages.extract_split_read_alignments,
                       name='extract_split_read_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.splitters.unsorted.bam')

    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_discordants',
        input=output_from('extract_discordant_alignments'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.discordants'],
        output='{path[0]}/{sample[0]}.discordants.bam')

    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #   task_func=stages.index_bam,
    #   name='index_discordants',
    #   input=output_from('sort_discordants'),
    #   filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #   output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort discordant reads
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_splitters',
        input=output_from('extract_split_read_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.splitters'],
        output='{path[0]}/{sample[0]}.splitters.bam')

    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #    task_func=stages.index_bam,
    #    name='index_splitters',
    #    input=output_from('sort_splitters'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #    output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (pipeline.transform(
        task_func=stages.structural_variants_lumpy,
        name='structural_variants_lumpy',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        add_inputs=add_inputs([
            '{path[0]}/{sample[0]}.splitters.bam',
            '{path[0]}/{sample[0]}.discordants.bam'
        ]),
        output='{path[0]}/{sample[0]}.lumpy.vcf')
        .follows('index_alignment')
        .follows('sort_splitters')
        .follows('sort_discordants'))

    # Call genotypes on lumpy output using SVTyper
    #(pipeline.transform(
    #    task_func=stages.genotype_svtyper,
    #    name='genotype_svtyper',
    #    input=output_from('structural_variants_lumpy'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam', '{path[0]}/{sample[0]}.splitters.bam']),
    #    output='{path[0]}/{sample[0]}.svtyper.vcf')
    #    .follows('align_bwa')
    #    .follows('sort_splitters')
    #    .follows('index_alignment')
    #    .follows('index_splitters')
    #    .follows('index_discordants'))

    # Call SVs with Socrates
    (pipeline.transform(
        task_func=stages.structural_variants_socrates,
        name='structural_variants_socrates',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        # output goes to {path[0]}/socrates/
        output='{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt',
        extras=['{path[0]}']))

    # Call DELs with DELLY
    pipeline.merge(task_func=stages.deletions_delly,
                   name='deletions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DEL.vcf')

    # Call DUPs with DELLY
    pipeline.merge(task_func=stages.duplications_delly,
                   name='duplications_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DUP.vcf')

    # Call INVs with DELLY
    pipeline.merge(task_func=stages.inversions_delly,
                   name='inversions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.INV.vcf')

    # Call TRAs with DELLY
    pipeline.merge(task_func=stages.translocations_delly,
                   name='translocations_delly',
                   input=output_from('sort_alignment'),
                   output='delly.TRA.vcf')

    # Join both read pair files using gustaf_mate_joining
    #pipeline.transform(
    #    task_func=stages.gustaf_mate_joining,
    #    name='gustaf_mate_joining',
    #    input=output_from('fastq_to_fasta'),
    #    # Match the R1 (read 1) FASTA file and grab the path and sample name.
    #    # This will be the first input to the stage.
    #    # We assume the sample name may consist of only alphanumeric
    #    # characters.
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #    # Add one more input to the stage:
    #    #    1. The corresponding R2 FASTA file
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #    output='{path[0]}/{sample[0]}.joined_mates.fasta')

    # Call structural variants with pindel
    #(pipeline.transform(
    #    task_func=stages.structural_variants_pindel,
    #    name='structural_variants_pindel',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]),
    #    output='{path[0]}/{sample[0]}.pindel')
    #    .follows('index_reference_bwa')
    #    .follows('index_reference_samtools'))

    return pipeline
Example No. 50
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='cellfree_seq')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.sort.hq.bam')

    pipeline.transform(task_func=stages.run_connor,
                       name='run_connor',
                       input=output_from('align_bwa'),
                       filter=suffix('.sort.hq.bam'),
                       output='.sort.hq.connor.bam')

    safe_make_dir('metrics')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/connor')

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_raw',
        input=output_from('coverage_bed_raw', 'genome_reads_raw',
                          'target_reads_raw', 'total_reads_raw'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'summary.txt'])
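    # collate() groups every file whose "\1" capture (the sample name) matches
    # into a single job; e.g. for an illustrative sample "sampleA", the four
    # per-metric text files above all feed one
    # metrics/summary/all_sample.summary.sampleA.txt output.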

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_connor',
        input=output_from('coverage_bed_connor', 'genome_reads_connor',
                          'target_reads_connor', 'total_reads_connor'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/connor/all_sample.summary.\1.txt',
        extras=[r'\1', 'connor.summary.txt'])

    safe_make_dir('variants')
    safe_make_dir('variants/vardict')

    pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    (pipeline.merge(
        task_func=stages.concatenate_vcfs,
        name='concatenate_vcfs',
        input=output_from('sort_vcfs'),
        output='variants/vardict/combined.vcf.gz').follows('index_vcfs'))
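    # Explanatory comment (not from the original code): .follows('index_vcfs')
    # ensures the per-sample .tbi indexes exist before concatenation; they are
    # not declared inputs, so Ruffus would not otherwise order the two tasks.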

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('vt_decompose_normalise'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf').follows('index_final_vcf'))

    return pipeline
Example No. 51
0
        "test_active_if/b.3" -> "test_active_if/b.4"
            "test_active_if/b.4" -> "test_active_if/summary.5"
"""

expected_inactive_text = """null -> "test_active_if/a.1"
    "test_active_if/a.1" -> "test_active_if/a.2"
        "test_active_if/a.2" -> "test_active_if/a.4"
null -> "test_active_if/b.1"
    "test_active_if/b.1" -> "test_active_if/b.2"
        "test_active_if/b.2" -> "test_active_if/b.4"
            "test_active_if/b.4" -> "test_active_if/summary.5"
"""

# alternative syntax
test_pipeline = Pipeline("test")
test_pipeline.originate(task1, ['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter")\
    .follows(mkdir("test_active_if"))
test_pipeline.transform(task2, task1, suffix(".1"), ".2")
test_pipeline.transform(task3, task1, suffix(".1"),
                        ".3").active_if(lambda: pipeline_active_if)
test_pipeline.collate(task4, [task2, task3], regex(r"(.+)\.[23]"), r"\1.4")
test_pipeline.merge(task5, task4, "test_active_if/summary.5")


class Test_ruffus(unittest.TestCase):
    def setUp(self):
        try:
            shutil.rmtree(tempdir)
        except:
            pass
        os.makedirs(tempdir)
Example No. 52
0
            "test_active_if/b.4" -> "test_active_if/summary.5"
"""

expected_inactive_text = """null -> "test_active_if/a.1"
    "test_active_if/a.1" -> "test_active_if/a.2"
        "test_active_if/a.2" -> "test_active_if/a.4"
null -> "test_active_if/b.1"
    "test_active_if/b.1" -> "test_active_if/b.2"
        "test_active_if/b.2" -> "test_active_if/b.4"
            "test_active_if/b.4" -> "test_active_if/summary.5"
"""


# alternative syntax
test_pipeline = Pipeline("test")
test_pipeline.originate(task1, ['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter")\
    .follows(mkdir("test_active_if"))
test_pipeline.transform(task2, task1, suffix(".1"), ".2")
test_pipeline.transform(task3, task1, suffix(".1"),
                        ".3").active_if(lambda: pipeline_active_if)
test_pipeline.collate(task4, [task2, task3], regex(r"(.+)\.[23]"), r"\1.4")
test_pipeline.merge(task5, task4, "test_active_if/summary.5")


class Test_ruffus(unittest.TestCase):
    def setUp(self):
        try:
            shutil.rmtree(tempdir)
        except:
            pass
        os.makedirs(tempdir)