    def setUp(self):

        # list of executed tasks
        manager = multiprocessing.managers.SyncManager()
        manager.start()
        global mutex_proxy
        global executed_tasks_proxy
        mutex_proxy = manager.Lock()
        executed_tasks_proxy = manager.dict()

        pipeline = Pipeline.pipelines["main"]
        pipeline.originate(task_func = start_task,
                            output = [tempdir + "a.1", tempdir + "b.1"],
                            extras = [executed_tasks_proxy, mutex_proxy])\
                .mkdir(tempdir)
        pipeline.transform(task_func = same_file_name_task,
                            input = start_task,
                            filter = suffix(".1"),
                            output = ".1",
                            extras = [executed_tasks_proxy, mutex_proxy])
        pipeline.transform( task_func = linked_file_name_task,
                            input = start_task,
                            filter = suffix(".1"),
                            output = ".linked.1",
                            extras = [executed_tasks_proxy, mutex_proxy])
        pipeline.transform(task_func = final_task,
                            input = [linked_file_name_task, same_file_name_task],
                            filter = suffix(".1"),
                            output = ".3",
                            extras = [executed_tasks_proxy, mutex_proxy])
        self.cleanUp()
Example #2
    def test_newstyle_ruffus(self):
        test_pipeline = Pipeline("test")
        test_pipeline.originate(start_task, ["a.1", "b.1"])
        test_pipeline.transform(same_file_name_task, start_task, suffix(".1"), ".1")
        test_pipeline.transform(linked_file_name_task, start_task, suffix(".1"), ".linked.1")
        test_pipeline.transform(final_task, [linked_file_name_task, same_file_name_task], suffix(".1"), ".3")
        test_pipeline.run(log_exceptions=True, verbose=0)
Example #3
def build_pipeline():

    pipe = Pipeline("my_pipeline")

    pipe.originate(
        name="create_three_new_files",
        task_func=create_new_file,
        output=[os.path.join(WORK_DIR, f"file{i}.csv") for i in range(1, 4)],
    )

    pipe.transform(
        name="convert_csv_files_to_tsv",
        task_func=csv_to_tsv,
        input=output_from("create_three_new_files"),
        filter=suffix(".csv"),
        output=".tsv",
    )

    pipe.transform(
        name="calculate_md5",
        task_func=md5,
        input=output_from("convert_csv_files_to_tsv"),
        filter=suffix(".tsv"),
        output=".md5sum",
    )

    return pipe
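
# A minimal, hedged usage sketch for build_pipeline() above (WORK_DIR and the
# task functions create_new_file, csv_to_tsv and md5 are assumed to be defined
# in the same module): build the pipeline object and run it, mirroring the
# run() calls used by the other examples on this page.
if __name__ == "__main__":
    build_pipeline().run(multiprocess=2, verbose=1)
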
    def test_newstyle_ruffus(self):
        # alternative syntax
        test_pipeline = Pipeline("test")

        test_pipeline.mkdir(data_dir, work_dir)
        test_pipeline.originate(task_func=task1,
                                output=[os.path.join(data_dir, "%s.1" % aa) for aa in "abcd"])

        test_pipeline.mkdir(filter=suffix(".1"),
                            output=".dir",
                            output_dir=work_dir)

        test_pipeline.transform(task_func=task2,
                                input=task1,
                                filter=suffix(".1"),
                                output=[".1", ".bak"],
                                extras=["extra.tst", 4, r"orig_dir=\1"],
                                output_dir=work_dir)

        test_pipeline.subdivide(task3, task2, suffix(".1"),
                                r"\1.*.2", [r"\1.a.2", r"\1.b.2"],
                                output_dir=data_dir)
        test_pipeline.transform(task4, task3, suffix(".2"), ".3",
                                output_dir=work_dir)
        test_pipeline.merge(task5, task4, os.path.join(data_dir, "summary.5"))
        test_pipeline.run(multiprocess=50, verbose=0)

        with open(os.path.join(data_dir, "summary.5")) as ii:
            active_text = ii.read()
        if active_text != expected_active_text:
            raise Exception("Error:\n\tExpected\n%s\nInstead\n%s\n" %
                            (expected_active_text, active_text))
    def create_pipeline(self):
        """
        Create new pipeline on the fly without using decorators
        """
        global count_pipelines
        count_pipelines = count_pipelines + 1
        test_pipeline = Pipeline("test %d" % count_pipelines)

        test_pipeline.transform(task_func=transform1,
                                input=input_file,
                                filter=suffix('.txt'),
                                output='.output',
                                extras=[runtime_data])

        test_pipeline.transform(task_func=transform_raise_error,
                                input=input_file,
                                filter=suffix('.txt'),
                                output='.output',
                                extras=[runtime_data])

        test_pipeline.split(task_func=split1,
                            input=input_file,
                            output=split1_outputs)

        test_pipeline.merge(task_func=merge2,
                            input=split1,
                            output=merge2_output)
        return test_pipeline
Example #7
    def setUp(self):

        # list of executed tasks
        manager = multiprocessing.managers.SyncManager()
        manager.start()
        global mutex_proxy
        global executed_tasks_proxy
        mutex_proxy = manager.Lock()
        executed_tasks_proxy = manager.dict()

        pipeline = Pipeline.pipelines["main"]
        pipeline.originate(task_func=start_task,
                           output=[tempdir + "a.1", tempdir + "b.1"],
                           extras=[executed_tasks_proxy, mutex_proxy])\
            .mkdir(tempdir)
        pipeline.transform(task_func=same_file_name_task,
                           input=start_task,
                           filter=suffix(".1"),
                           output=".1",
                           extras=[executed_tasks_proxy, mutex_proxy])
        pipeline.transform(task_func=linked_file_name_task,
                           input=start_task,
                           filter=suffix(".1"),
                           output=".linked.1",
                           extras=[executed_tasks_proxy, mutex_proxy])
        pipeline.transform(task_func=final_task,
                           input=[linked_file_name_task, same_file_name_task],
                           filter=suffix(".1"),
                           output=".3",
                           extras=[executed_tasks_proxy, mutex_proxy])
        self.cleanUp()
    def test_newstyle_ruffus(self):

        test_pipeline = Pipeline("test")

        test_pipeline.split(task_func=split_fasta_file,
                            input=tempdir + "original.fa",
                            output=[tempdir + "files.split.success",
                                    tempdir + "files.split.*.fa"])\
            .posttask(lambda: verbose_output.write("    Split into %d files\n" % 10))

        test_pipeline.transform(task_func=align_sequences,
                                input=split_fasta_file,
                                filter=suffix(".fa"),
                                output=".aln"                     # fa -> aln
                                )\
            .posttask(lambda: verbose_output.write("    Sequences aligned\n"))

        test_pipeline.transform(task_func=percentage_identity,
                                input=align_sequences,      # find all results from align_sequences
                                # replace suffix with:
                                filter=suffix(".aln"),
                                output=[r".pcid",  # .pcid suffix for the result
                                        r".pcid_success"]  # .pcid_success to indicate job completed
                                )\
            .posttask(lambda: verbose_output.write("    %Identity calculated\n"))

        test_pipeline.merge(task_func=combine_results,
                            input=percentage_identity,
                            output=[tempdir + "all.combine_results",
                                    tempdir + "all.combine_results_success"])\
            .posttask(lambda: verbose_output.write("    Results recombined\n"))

        test_pipeline.run(multiprocess=50, verbose=0)
        if not os.path.exists(tempdir + "all.combine_results"):
            raise Exception("Missing %s" % (tempdir + "all.combine_results"))
    def test_newstyle_simpler(self):
        test_pipeline = Pipeline("test")
        test_pipeline.originate(task1, input_file_names, extras=[logger_proxy, logging_mutex])
        test_pipeline.transform(task2, task1, suffix(".1"), ".2", extras=[logger_proxy, logging_mutex])
        test_pipeline.transform(task3, task2, suffix(".2"), ".3", extras=[logger_proxy, logging_mutex])
        test_pipeline.merge(task4, task3, final_file_name, extras=[logger_proxy, logging_mutex])
        # test_pipeline.merge(task4, task3, final_file_name, extras={"logger_proxy": logger_proxy, "logging_mutex": logging_mutex})
        test_pipeline.run(multiprocess=500, verbose=0)
    def test_newstyle_collate(self):
        """
        As above but create pipeline on the fly using object orientated syntax rather than decorators
        """

        #
        # Create pipeline on the fly, joining up tasks
        #
        test_pipeline = Pipeline("test")

        test_pipeline.originate(task_func   = generate_initial_files,
                                output      = original_files)\
            .mkdir(tempdir, tempdir+"/test")


        test_pipeline.subdivide(    task_func   = split_fasta_file,
                                    input       = generate_initial_files,
                                    filter      = regex(r".*\/original_(\d+).fa"),       # match original files
                                    output      = [tempdir + r"/files.split.\1.success", # flag file for each original file
                                                   tempdir + r"/files.split.\1.*.fa"],   # glob pattern
                                    extras      = [r"\1"])\
            .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))


        test_pipeline.transform(task_func   = align_sequences,
                                input       = split_fasta_file,
                                filter      = suffix(".fa"),
                                output      = ".aln")  \
            .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

        test_pipeline.transform(task_func   = percentage_identity,
                                input       = align_sequences,             # find all results from align_sequences
                                filter      = suffix(".aln"),             # replace suffix with:
                                output      = [r".pcid",                  #   .pcid suffix for the result
                                               r".pcid_success"]         #   .pcid_success to indicate job completed
                                )\
            .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))


        test_pipeline.collate(task_func   = combine_results,
                              input       = percentage_identity,
                              filter      = regex(r".*files.split\.(\d+)\.\d+.pcid"),
                              output      = [tempdir + r"/\1.all.combine_results",
                                             tempdir + r"/\1.all.combine_results_success"])\
            .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

        #
        # Cleanup, printout and run
        #
        self.cleanup_tmpdir()
        s = StringIO()
        test_pipeline.printout(s, [combine_results],
                               verbose=5,
                               wrap_width=10000)
        self.assertTrue(
            re.search('Job needs update:.*Missing files.*', s.getvalue(),
                      re.DOTALL) is not None)
        test_pipeline.run(verbose=0)
def make_pipeline1(
        pipeline_name,  # Pipelines need to have a unique name
        starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(
        task_func=task_m_to_1,
        name="add_input",
        # Lookup Task from function name task_originate()
        #   So long as this is unique in the pipeline
        input=task_originate,
        # requires an anchor from 3.7 onwards, see
        # https://bugs.python.org/issue34982
        filter=regex(r"^(.*)"),
        add_inputs=add_inputs(tempdir + "/testdir/whatever.txt"),
        output=r"\1.22")
    test_pipeline.transform(
        task_func=task_1_to_1,
        name="22_to_33",
        # Lookup Task from Task name
        #   Function name is not unique in the pipeline
        input=output_from("add_input"),
        filter=suffix(".22"),
        output=".33")
    tail_task = test_pipeline.transform(
        task_func=task_1_to_1,
        name="33_to_44",
        # Ask Pipeline to lookup Task from Task name
        input=test_pipeline["22_to_33"],
        filter=suffix(".33"),
        output=".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
    def test_newstyle_collate(self):
        """
        As above but create pipeline on the fly using object orientated syntax rather than decorators
        """

        #
        # Create pipeline on the fly, joining up tasks
        #
        test_pipeline = Pipeline("test")

        test_pipeline.originate(task_func=generate_initial_files,
                                output=original_files)\
            .mkdir(tempdir, tempdir+"/test")

        test_pipeline.subdivide(task_func=split_fasta_file,
                                input=generate_initial_files,
                                # match original files
                                filter=regex(r".*\/original_(\d+).fa"),
                                output=[tempdir + r"/files.split.\1.success",  # flag file for each original file
                                        tempdir + r"/files.split.\1.*.fa"],   # glob pattern
                                extras=[r"\1"])\
            .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))

        test_pipeline.transform(task_func=align_sequences,
                                input=split_fasta_file,
                                filter=suffix(".fa"),
                                output=".aln")  \
            .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

        test_pipeline.transform(task_func=percentage_identity,
                                input=align_sequences,             # find all results from align_sequences
                                # replace suffix with:
                                filter=suffix(".aln"),
                                output=[r".pcid",  # .pcid suffix for the result
                                        r".pcid_success"]  # .pcid_success to indicate job completed
                                )\
            .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))

        test_pipeline.collate(task_func=combine_results,
                              input=percentage_identity,
                              filter=regex(r".*files.split\.(\d+)\.\d+.pcid"),
                              output=[tempdir + r"/\1.all.combine_results",
                                      tempdir + r"/\1.all.combine_results_success"])\
            .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

        #
        # Cleanup, printout and run
        #
        self.cleanup_tmpdir()
        s = StringIO()
        test_pipeline.printout(s, [combine_results],
                               verbose=5, wrap_width=10000)
        self.assertTrue(re.search(
            'Job needs update:.*Missing files.*', s.getvalue(), re.DOTALL) is not None)
        test_pipeline.run(verbose=0)
def make_pipeline1(pipeline_name,   # Pipelines need to have a unique name
                   starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(task_func=task_m_to_1,
                            name="add_input",
                            # Lookup Task from function name task_originate()
                            #   So long as this is unique in the pipeline
                            input=task_originate,
                            # requires an anchor from 3.7 onwards, see
                            # https://bugs.python.org/issue34982
                            filter=regex(r"^(.*)"),
                            add_inputs=add_inputs(
                                tempdir + "/testdir/whatever.txt"),
                            output=r"\1.22")
    test_pipeline.transform(task_func=task_1_to_1,
                            name="22_to_33",
                            # Lookup Task from Task name
                            #   Function name is not unique in the pipeline
                            input=output_from("add_input"),
                            filter=suffix(".22"),
                            output=".33")
    tail_task = test_pipeline.transform(task_func=task_1_to_1,
                                        name="33_to_44",
                                        # Ask Pipeline to lookup Task from Task name
                                        input=test_pipeline["22_to_33"],
                                        filter=suffix(".33"),
                                        output=".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='test_pipeline')
    # Get a list of paths to all the input files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    pipeline.transform(
        task_func=stages.stage1,
        name='stage1',
        input=output_from('original_files'),
        filter=suffix('.0'),
        output='.1')

    pipeline.transform(
        task_func=stages.stage2,
        name='stage2',
        input=output_from('stage1'),
        filter=suffix('.1'),
        output='.2')

    pipeline.transform(
        task_func=stages.stage3,
        name='stage3',
        input=output_from('stage2'),
        filter=suffix('.2'),
        output='.3')

    pipeline.transform(
        task_func=stages.stage4,
        name='stage4',
        input=output_from('stage3'),
        filter=suffix('.3'),
        output='.4')

    pipeline.transform(
        task_func=stages.stage5,
        name='stage5',
        input=output_from('stage4'),
        filter=suffix('.4'),
        output='.5')

    return pipeline
Example #15
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='twin ion')
    # Get a list of paths to all the MZML files
    mzml_files = state.config.get_option('mzml')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original MZML files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_mzml,
        name='original_mzml',
        output=mzml_files)

    pipeline.transform(
        task_func=stages.resample,
        name='resample',
        input=output_from('original_mzml'),
        filter=suffix('.mzML'),
        output='.resample.mzML')

    pipeline.transform(
        task_func=stages.noise_filter_sgolay,
        name='noise_filter_sgolay',
        input=output_from('resample'),
        filter=suffix('.resample.mzML'),
        output='.denoise.mzML')

    pipeline.transform(
        task_func=stages.baseline_filter,
        name='baseline_filter',
        input=output_from('noise_filter_sgolay'),
        filter=suffix('.denoise.mzML'),
        output='.baseline.mzML')

    pipeline.transform(
        task_func=stages.peak_picker_hires,
        name='peak_picker_hires',
        input=output_from('baseline_filter'),
        filter=suffix('.baseline.mzML'),
        output='.peaks.mzML')

    pipeline.transform(
        task_func=stages.feature_finder_centroid,
        name='feature_finder_centroid',
        input=output_from('peak_picker_hires'),
        filter=suffix('.peaks.mzML'),
        output='.featureXML')

    return pipeline
Example #16
    def test_newstyle_ruffus(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_func=task1,
                                output=[tempdir + 'a.1'] + runtime_files)
        test_pipeline.transform(task2, task1, suffix(".1"), ".2")
        test_pipeline.transform(task_func=task3,
                                input=task2,
                                filter=suffix(".2"),
                                output=".3")
        test_pipeline.transform(task_func=task4,
                                input=runtime_parameter("a"),
                                filter=suffix(".3"),
                                output=".4").follows(task3)
        test_pipeline.run(verbose=0, runtime_data={"a": runtime_files})
Example #18
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='md5')
    # Get a list of paths to all the input files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original input files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    # Compute an MD5 checksum for each of the original input files
    pipeline.transform(
        task_func=stages.md5_checksum,
        name='md5_checksum',
        input=output_from('original_files'),
        filter=suffix(''),
        output='.md5')
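    # Note: suffix('') matches the end of every input file name, so the output
    # is simply the input name with ".md5" appended
    # (e.g. sample.txt -> sample.txt.md5).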


    return pipeline
Example #19
    def test_newstyle_ruffus(self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(create_random_numbers, None, tempdir + "random_numbers.list")\
            .follows(mkdir(tempdir))


        test_pipeline.split(task_func = step_4_split_numbers_into_chunks,
                       input = tempdir + "random_numbers.list",
                       output = tempdir + "*.chunks")\
            .follows(create_random_numbers)

        test_pipeline.transform(task_func=step_5_calculate_sum_of_squares,
                                input=step_4_split_numbers_into_chunks,
                                filter=suffix(".chunks"),
                                output=".sums")

        test_pipeline.merge(task_func = step_6_calculate_variance, input = step_5_calculate_sum_of_squares, output = os.path.join(tempdir, "variance.result"))\
            .posttask(lambda: sys.stdout.write("     hooray\n"))\
            .posttask(print_hooray_again, print_whoppee_again, touch_file(os.path.join(tempdir, "done")))

        test_pipeline.run(multiprocess=50, verbose=0)
        output_file = os.path.join(tempdir, "variance.result")
        if not os.path.exists(output_file):
            raise Exception("Missing %s" % output_file)
def main(argv=None):
    if argv is None:
        argv = sys.argv

    options, args = P.initialize(argv,
                                 config_file="template.yml",
                                 defaults={
                                     "min_value": 0.0,
                                     "num_samples": 1000,
                                     "mu": 0.0,
                                     "sigma": 1.0
                                 })

    pipeline = ruffus.Pipeline("template_pipeline")

    task_create_files = pipeline.originate(
        task_func=create_files,
        output=["sample_{:02}.txt".format(x) for x in range(10)])

    task_compute_mean = pipeline.transform(task_func=compute_mean,
                                           input=task_create_files,
                                           filter=ruffus.suffix(".txt"),
                                           output=".mean")

    task_combine_means = pipeline.merge(task_func=combine_means,
                                        input=task_compute_mean,
                                        output="means.txt")

    # primary targets
    pipeline.merge(task_func=P.EmptyRunner("all"),
                   input=task_combine_means,
                   output="all")

    E.debug("starting workflow")
    return P.run_workflow(options, args)
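
# Hedged entry-point sketch for the cgat-core style main() above (assumes this
# module is the pipeline script that is executed directly):
if __name__ == "__main__":
    sys.exit(main())
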
Example #21
    def test_newstyle_ruffus (self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(create_random_numbers, None, tempdir + "random_numbers.list")\
            .follows(mkdir(tempdir))


        test_pipeline.split(task_func = step_4_split_numbers_into_chunks,
                       input = tempdir + "random_numbers.list",
                       output = tempdir + "*.chunks")\
            .follows(create_random_numbers)

        test_pipeline.transform(task_func = step_5_calculate_sum_of_squares,
                           input = step_4_split_numbers_into_chunks,
                           filter = suffix(".chunks"),
                           output = ".sums")

        test_pipeline.merge(task_func = step_6_calculate_variance, input = step_5_calculate_sum_of_squares, output = os.path.join(tempdir, "variance.result"))\
            .posttask(lambda: sys.stdout.write("     hooray\n"))\
            .posttask(print_hooray_again, print_whoppee_again, touch_file(os.path.join(tempdir, "done")))

        test_pipeline.run(multiprocess = 50, verbose = 0)
        output_file = os.path.join(tempdir, "variance.result")
        if not os.path.exists (output_file):
            raise Exception("Missing %s" % output_file)
Example #22
    def build_pipeline(self, pipeline_name, **kwargs):
        # fudge: clear all previous pipelines
        ruffus.Pipeline.clear_all()
        pipeline = ruffus.Pipeline(pipeline_name)

        task_create_files = pipeline.originate(
            task_func=create_files,
            output=["sample_{:02}.txt".format(x) for x in range(10)])

        task_compute_mean = pipeline.transform(task_func=compute_mean,
                                               input=task_create_files,
                                               filter=ruffus.suffix(".txt"),
                                               output=".mean")

        task_combine_means = pipeline.merge(task_func=combine_means,
                                            input=task_compute_mean,
                                            output="means.txt")

        task_run_local_job1 = pipeline.transform(task_func=run_local_job1,
                                                 input=task_create_files,
                                                 filter=ruffus.suffix(".txt"),
                                                 output=".local1")

        # test jobs_limit with local running
        task_run_local_job2 = pipeline.transform(task_func=run_local_job2,
                                                 input=task_create_files,
                                                 filter=ruffus.suffix(".txt"),
                                                 output=".local2").jobs_limit(
                                                     NUM_CORES // 2)

        # multiprocessing and DRMAA do not work at the moment; the likely
        # cause is the shared session object.
        if not HAVE_DRMAA or (kwargs.get("multiprocess", 1) > 1):
            return

        task_run_remote_job1 = pipeline.transform(task_func=run_remote_job1,
                                                  input=task_create_files,
                                                  filter=ruffus.suffix(".txt"),
                                                  output=".remote1")

        # test jobs_limit with remote running
        task_run_remote_job2 = pipeline.transform(
            task_func=run_remote_job2,
            input=task_create_files,
            filter=ruffus.suffix(".txt"),
            output=".remote2").jobs_limit(NUM_CORES // 2)
Example #23
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='ovarian_cancer_pipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    human_reference_genome_file = state.config.get_option('human_reference_genome')
    # Stages are dependent on the state
    stages = PipelineStages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # The human reference genome in FASTA format
    pipeline.originate(
        task_func=stages.human_reference_genome,
        name='human_reference_genome',
        output=human_reference_genome_file)

    # Index the human reference genome with BWA, needed before we can map reads
    pipeline.transform(
        task_func=stages.index_ref_bwa,
        name='index_ref_bwa',
        input=output_from('human_reference_genome'),
        filter=suffix('.fa'),
        output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Align paired end reads in FASTQ to the reference producing a BAM file
    (pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name. 
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[_a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')
        .follows('index_ref_bwa'))


    return pipeline
Example #24
    def create_pipeline(self):
        #each pipeline has a different name
        global cnt_pipelines
        cnt_pipelines = cnt_pipelines + 1
        test_pipeline = Pipeline("test %d" % cnt_pipelines)

        test_pipeline.originate(
            task_func=generate_initial_files1,
            output=[tempdir + prefix + "_name.tmp1" for prefix in "abcd"])

        test_pipeline.originate(
            task_func=generate_initial_files2,
            output=[tempdir + "e_name.tmp1", tempdir + "f_name.tmp1"])

        test_pipeline.originate(
            task_func=generate_initial_files3,
            output=[tempdir + "g_name.tmp1", tempdir + "h_name.tmp1"])

        test_pipeline.originate(task_func=generate_initial_files4,
                                output=tempdir + "i_name.tmp1")

        test_pipeline.collate(task_func=test_task2,
                              input=[
                                  generate_initial_files1,
                                  generate_initial_files2,
                                  generate_initial_files3,
                                  generate_initial_files4
                              ],
                              filter=formatter(),
                              output="{path[0]}/all.tmp2")

        test_pipeline.transform(task_func=test_task3,
                                input=test_task2,
                                filter=suffix(".tmp2"),
                                output=".tmp3")

        test_pipeline.transform(task_func=test_task4,
                                input=test_task3,
                                filter=suffix(".tmp3"),
                                output=".tmp4")
        return test_pipeline
Example #25
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and turns
    # them into gatk and undr_rover VCFs.
    pipeline = Pipeline(name='hiplexpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')

        stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')
    safe_make_dir('variants/undr_rover/coverdir')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('passed_filter_files'),
        # Match the clipped, sorted BAM file and grab the sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='variants/undr_rover/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    #### concatenate undr_rover vcfs ####
    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('apply_undr_rover'),
        filter=formatter('variants/undr_rover/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/undr_rover/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).clipped.sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
Example #26
    def test_newstyle_simpler(self):
        test_pipeline = Pipeline("test")
        test_pipeline.originate(task1,
                                input_file_names,
                                extras=[logger_proxy, logging_mutex])
        test_pipeline.transform(task2,
                                task1,
                                suffix(".1"),
                                ".2",
                                extras=[logger_proxy, logging_mutex])
        test_pipeline.transform(task3,
                                task2,
                                suffix(".2"),
                                ".3",
                                extras=[logger_proxy, logging_mutex])
        test_pipeline.merge(task4,
                            task3,
                            final_file_name,
                            extras=[logger_proxy, logging_mutex])
        # test_pipeline.merge(task4, task3, final_file_name, extras={"logger_proxy": logger_proxy, "logging_mutex": logging_mutex})
        test_pipeline.run(multiprocess=500, verbose=0)
    def test_newstyle_ruffus(self):
        # alternative syntax
        test_pipeline = Pipeline("test")

        test_pipeline.mkdir(data_dir, work_dir)
        test_pipeline.originate(
            task_func=task1,
            output=[os.path.join(data_dir, "%s.1" % aa) for aa in "abcd"])

        test_pipeline.mkdir(filter=suffix(".1"),
                            output=".dir",
                            output_dir=work_dir)

        test_pipeline.transform(task_func=task2,
                                input=task1,
                                filter=suffix(".1"),
                                output=[".1", ".bak"],
                                extras=["extra.tst", 4, r"orig_dir=\1"],
                                output_dir=work_dir)

        test_pipeline.subdivide(task3,
                                task2,
                                suffix(".1"),
                                r"\1.*.2", [r"\1.a.2", r"\1.b.2"],
                                output_dir=data_dir)
        test_pipeline.transform(task4,
                                task3,
                                suffix(".2"),
                                ".3",
                                output_dir=work_dir)
        test_pipeline.merge(task5, task4, os.path.join(data_dir, "summary.5"))
        test_pipeline.run(multiprocess=50, verbose=0)

        with open(os.path.join(data_dir, "summary.5")) as ii:
            active_text = ii.read()
        if active_text != expected_active_text:
            raise Exception("Error:\n\tExpected\n%s\nInstead\n%s\n" %
                            (expected_active_text, active_text))
    def create_pipeline (self):
        #each pipeline has a different name
        global cnt_pipelines
        cnt_pipelines = cnt_pipelines + 1
        test_pipeline = Pipeline("test %d" % cnt_pipelines)

        test_pipeline.originate(task_func   = generate_initial_files1,
                                output      = [tempdir + prefix + "_name.tmp1" for prefix in "abcd"])

        test_pipeline.originate(task_func   = generate_initial_files2,
                                output      = [tempdir +  "e_name.tmp1", tempdir +  "f_name.tmp1"])

        test_pipeline.originate(task_func   = generate_initial_files3,
                                output      = [tempdir +  "g_name.tmp1", tempdir +  "h_name.tmp1"])

        test_pipeline.originate(task_func   = generate_initial_files4,
                                output      = tempdir +  "i_name.tmp1")

        test_pipeline.collate(  task_func   = test_task2,
                                input       = [generate_initial_files1,
                                               generate_initial_files2,
                                               generate_initial_files3,
                                               generate_initial_files4],
                                filter      = formatter(),
                                output      = "{path[0]}/all.tmp2")

        test_pipeline.transform(task_func   = test_task3,
                                input       = test_task2,
                                filter      = suffix(".tmp2"),
                                output      = ".tmp3")

        test_pipeline.transform(task_func   = test_task4,
                                input       = test_task3,
                                filter      = suffix(".tmp3"),
                                output      = ".tmp4")
        return test_pipeline
Example #29
    def build_pipeline(self, pipeline_name):
        # fudge: clear all previous pipelines
        ruffus.Pipeline.clear_all()
        pipeline = ruffus.Pipeline(pipeline_name)

        task_create_files = pipeline.originate(
            task_func=create_files,
            output=["sample_{:02}.txt".format(x) for x in range(10)])

        task_compute_mean = pipeline.transform(
            task_func=compute_mean,
            input=task_create_files,
            filter=ruffus.suffix(".txt"),
            output=".mean")

        task_combine_means = pipeline.merge(
            task_func=combine_means,
            input=task_compute_mean,
            output="means.txt")
def make_pipeline2( pipeline_name = "pipeline2"):
    test_pipeline2 = Pipeline(pipeline_name)
    test_pipeline2.transform(task_func   = task_1_to_1,
                             # task name
                            name        = "44_to_55",
                             # placeholder: will be replaced later with set_input()
                            input       = None,
                            filter      = suffix(".44"),
                            output      = ".55")
    test_pipeline2.merge(   task_func   = task_m_to_1,
                            input       = test_pipeline2["44_to_55"],
                            output      = tempdir + "/final.output",)

    # Set head and tail
    test_pipeline2.set_tail_tasks([test_pipeline2[task_m_to_1]])
    if not DEBUG_do_not_define_head_task:
        test_pipeline2.set_head_tasks([test_pipeline2["44_to_55"]])

    return test_pipeline2
Example #31
def make_pipeline2(pipeline_name="pipeline2"):
    test_pipeline2 = Pipeline(pipeline_name)
    test_pipeline2.transform(
        task_func=task_1_to_1,
        # task name
        name="44_to_55",
        # placeholder: will be replaced later with set_input()
        input=None,
        filter=suffix(".44"),
        output=".55")
    test_pipeline2.merge(
        task_func=task_m_to_1,
        input=test_pipeline2["44_to_55"],
        output=tempdir + "/final.output",
    )

    # Set head and tail
    test_pipeline2.set_tail_tasks([test_pipeline2[task_m_to_1]])
    if not DEBUG_do_not_define_head_task:
        test_pipeline2.set_head_tasks([test_pipeline2["44_to_55"]])

    return test_pipeline2
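
# Hedged sketch of joining the two sub-pipelines built by make_pipeline1() and
# make_pipeline2() above (tempdir, the task functions and the starting file
# names are placeholders assumed to come from the surrounding test module).
# The head task "44_to_55" of pipeline2 was created with input=None, so we
# feed it pipeline1 via set_input() and then run pipeline2, which also
# schedules the upstream pipeline1 tasks it depends on.
pipeline1 = make_pipeline1(pipeline_name="pipeline1",
                           starting_file_names=[tempdir + "/a.start",
                                                tempdir + "/b.start"])
pipeline2 = make_pipeline2()
pipeline2.set_input(input=pipeline1)
pipeline2.run(multiprocess=4, verbose=0)
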
        with iotools.open_file(fn, "w") as outf:
            df.to_csv(outf, sep="\t", index=True)


@merge(runFastQC, "fastqc_status_summary.tsv.gz")
def buildFastQCSummaryStatus(infiles, outfile):
    '''load FastQC status summaries into a single table.'''
    readqc.buildFastQCSummaryStatus(
        infiles,
        outfile,
        "fastqc.dir")


@jobs_limit(P.get_params().get("jobs_limit_db", 1), "db")
@transform((summarizeFastQC, buildFastQCSummaryStatus),
           suffix(".tsv.gz"), ".load")
def loadFastQC(infile, outfile):
    '''load FASTQC stats into database.'''

    # a check to make sure the file isn't empty
    n = 0
    with iotools.open_file(infile) as f:
        for line in f:
            n += 1
    if n > 0:
        P.load(infile, outfile, options="--add-index=track")
    else:
        table_name = infile.replace(".tsv.gz", "")
        database_sql = P.get_params()["database"]["url"]
        database_name = os.path.basename(database_sql)
        statement = """sqlite3 %(database_name)s
Example #33
        ">;)",
    ]
)
UNHAPPY_SMILIES = list(set(SMILIES) - set(HAPPY_SMILIES))


def detect_language(text):
    # details is 3x (langName, langCode, percent, score)
    lang_is_reliable, _, lang_details = cld2.detect(text)
    lang_details = lang_details[0]  # take only the first lang detected
    lang_name, lang_code, lang_percent, lang_score = lang_details

    return lang_name, lang_code, lang_score, lang_is_reliable
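
# Hedged usage note for detect_language() above (cld2 is the pycld2 binding):
#   detect_language("This is an English sentence.")
# returns a tuple along the lines of ('ENGLISH', 'en', <score>, True).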


@ruffus.transform(os.path.join(tweets_dir, "tweets_100k.json.gz"), ruffus.suffix(".json.gz"), ".english.json.gz")
def extract_english_tweets(input_file, output_file):
    tokenizer = WordPunctTokenizer()

    n_happy = 0
    n_sad = 0

    labelled_tweets = []
    with gzip.open(input_file) as input:
        for line in input:
            tweet_info = json.loads(line)

            if "limit" in tweet_info:
                continue

            # TODO: care about unicode
        m = PipelineMapping.FastQc(nogroup=PARAMS["readqc_no_group"],
                                   outdir=PARAMS["exportdir"] + "/fastqc",
                                   contaminants=PARAMS['contaminants'])
    else:
        m = PipelineMapping.FastQc(nogroup=PARAMS["readqc_no_group"],
                                   outdir=PARAMS["exportdir"] + "/fastqc")
    if PARAMS["general_reconcile"] == 1:
        infiles = infiles.replace("processed.dir/trimmed",
                                  "reconciled.dir/trimmed")

    statement = m.build((infiles,), outfile)
    P.run()


@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
@transform(runFastqc, suffix(".fastqc"), "_fastqc.load")
def loadFastqc(infile, outfile):
    '''load FASTQC stats into database.'''
    track = P.snip(infile, ".fastqc")
    filename = os.path.join(
        PARAMS["exportdir"], "fastqc", track + "*_fastqc", "fastqc_data.txt")

    PipelineReadqc.loadFastqc(filename,
                              backend=PARAMS["database_backend"],
                              database=PARAMS["database_name"],
                              host=PARAMS["database_host"],
                              username=PARAMS["database_username"],
                              password=PARAMS["database_password"],
                              port=PARAMS["database_port"])
    P.touch(outfile)
Example #35
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name="radpipe")

    # Stages are dependent on the state
    stages = PipelineStages(state)

    # Get a list of library objects.
    libraries = parse_libraries(
        libraries=state.config.get_options("libraries"))

    # Get a list of input files
    input_files = [l.files for l in libraries]
    # input_files = [item for sublist in input_files for item in sublist]
    state.logger.info("Input files: " + str(input_files))

    # Get a list of all samples for each library
    samples_dict = OrderedDict()
    for l in libraries:
        samples_dict[l.name] = l.samples
    state.logger.debug("Samples: " + str(samples_dict))

    # Make sure that there are no duplicate samples
    sample_list = [
        item for sublist in samples_dict.values() for item in sublist
    ]
    sample_counts = Counter(sample_list)
    for sample in sample_counts:
        if sample_counts[sample] > 1:
            print("Sample {} appears {} times in the barcodes files. "
                  "Sample names must be unique".format(sample,
                                                       sample_counts[sample]))
            sys.exit(radpipe.error_codes.INVALID_INPUT_FILE)

    # Define output directories
    output_dir = get_output_paths(state)
    state.logger.debug(output_dir)

    # Allow multiple comma-separated tasks
    if len(state.options.target_tasks) == 1:
        state.options.target_tasks = state.options.target_tasks[0].split(",")
    if len(state.options.forced_tasks) == 1:
        state.options.forced_tasks = state.options.forced_tasks[0].split(",")
    state.logger.debug("Target tasks: " + str(state.options.target_tasks))
    state.logger.debug("Forced tasks: " + str(state.options.forced_tasks))

    # Check if alignment_method is valid
    alignment_method = state.config.get_options(
        "alignment_method").strip().lower()
    if alignment_method not in ["bwa mem", "bowtie"]:
        print("Error: Invalid alignment_method in config file. " \
              "Valid options are ['bwa mem', 'bowtie'].")
        sys.exit(radpipe.error_codes.INVALID_ARGUMENT)
    if alignment_method == "bwa mem":
        align_task_name = "bwa_mem"
        index_task_name = "bwa_index"
    else:
        align_task_name = "bowtie"
        index_task_name = "bowtie_index"

    # TODO: Refactor this
    # If 'alignment' is in target_tasks or forced_tasks, specify which
    # type of alignment job
    if "alignment" in state.options.target_tasks:
        index = state.options.target_tasks.index("alignment")
        state.options.target_tasks[index] = align_task_name
    if "alignment" in state.options.forced_tasks:
        index = state.options.forced_tasks.index("alignment")
        state.options.forced_tasks[index] = align_task_name

    # If 'build_index' is in target_tasks or forced_tasks, specify which
    # type of index job
    if "build_index" in state.options.target_tasks:
        index = state.options.target_tasks.index("build_index")
        state.options.target_tasks[index] = index_task_name
    if "build_index" in state.options.forced_tasks:
        index = state.options.forced_tasks.index("build_index")
        state.options.forced_tasks[index] = index_task_name
    state.logger.debug(state)

    # Whether to include filter_bam stage or not
    filter_bams = False
    try:
        samtools_view_options = state.config.get_options(
            "samtools_view_options")
        if samtools_view_options:
            filter_bams = True
    except:
        pass
    state.logger.info("Filter bams: {}".format(filter_bams))

    # Population map filenames
    popmap_file = "{output_dir}/{name}_popmap.txt".format(
        output_dir=output_dir["populations"],
        name=state.config.get_options("analysis_id"))
    try:
        config_popmap_file = state.config.get_options("popmap_file")
        if config_popmap_file:
            state.logger.info(
                "Using popmap file: {}".format(config_popmap_file))
        else:
            raise (Exception)
    except Exception:
        config_popmap_file = None
        state.logger.info("Creating new popmap file: {}".format(popmap_file))

    # Population r values
    populations_r = state.config.get_options("populations_r")
    assert (isinstance(populations_r, list))

    # Dummy stages. These do nothing except provide a node at the beginning
    # for the pipeline graph, giving the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.do_nothing,
                       name="original_fastqs",
                       output=input_files)

    pipeline.originate(task_func=stages.do_nothing,
                       name="reference_genome",
                       output=state.config.get_options("reference_genome"))

    # Create a copy of the population map file needed for stacks, or create
    # one denovo using the sample list.
    pipeline.originate(task_func=stages.create_popmap_file,
                       name="create_popmap_file",
                       output=[popmap_file],
                       extras=[config_popmap_file, sample_list])

    # Create index for reference genome based on alignment method.
    if alignment_method == "bwa mem":
        pipeline.transform(
            task_func=stages.bwa_index,
            name="bwa_index",
            input=output_from("reference_genome"),
            filter=formatter(".+/(?P<ref>[^/]+).(fa|fasta)"),
            output=path_list_join(output_dir["reference"],
                                  ["reference.fa.bwt", "reference.fa.sa"]),
            extras=[output_dir["reference"]])

    if alignment_method == "bowtie":
        pipeline.transform(task_func=stages.bowtie_index,
                           name="bowtie_index",
                           input=output_from("reference_genome"),
                           filter=formatter(".+/(?P<ref>[^/]+).(fa|fasta)"),
                           output=path_list_join(
                               output_dir["reference"],
                               ["reference.1.ebwt", "reference.rev.1.ebwt"]),
                           extras=[output_dir["reference"]])

    # FastQC
    pipeline.transform(
        task_func=stages.fastqc,
        name="fastqc",
        input=output_from("original_fastqs"),
        filter=formatter(".+/(?P<lib>[^/]+)/(?P<fn>[^/]+).(fastq|fq).gz"),
        output="%s/{lib[0]}/{fn[0]}_fastqc.zip" % output_dir["fastqc"],
        extras=[output_dir["fastqc"], "{lib[0]}"])

    # MultiQC: FastQC
    pipeline.merge(task_func=stages.multiqc_fastqc,
                   name="multiqc_fastqc",
                   input=output_from("fastqc"),
                   output="%s/multiqc_fastqc_report.html" % output_dir["qc"],
                   extras=[output_dir["qc"], output_dir["fastqc"]])

    # Stacks: Process RAD-Tags
    pipeline.transform(task_func=stages.process_radtags,
                       name="process_radtags",
                       input=output_from("original_fastqs"),
                       filter=formatter(".+/(?P<lib>[^/]+)/[^/]+"),
                       output="%s/{lib[0]}/{lib[0]}.success" %
                       output_dir["process_radtags"],
                       extras=[
                           output_dir["process_radtags"], "{lib[0]}",
                           state.config.get_options("renz_1"),
                           state.config.get_options("renz_2"),
                           state.config.get_options("process_radtags_options")
                       ])

    # Build a list of the paired fastq files produced by process_radtags to use as input for alignment
    process_radtags_outputs = []
    for l in libraries:
        for s in l.samples:
            base = "{dir}/{lib}/{sample}".format(
                dir=output_dir["process_radtags"], lib=l.lib_id, sample=s)
            process_radtags_outputs.append(
                [base + ".1.fq.gz", base + ".2.fq.gz"])
    # print(process_radtags_outputs)

    # Alignment
    if align_task_name == "bwa_mem":
        (pipeline.transform(
            task_func=stages.bwa_align,
            name=align_task_name,
            input=process_radtags_outputs,
            filter=formatter(".+/(?P<sm>[^/]+).1.fq.gz"),
            output="%s/{sm[0]}.bwa.bam" % output_dir["alignments"],
            extras=[
                os.path.join(output_dir["reference"], "reference.fa"),
                "{path[0]}", output_dir["alignments"], "{sm[0]}",
                state.config.get_options("alignment_options")
            ])).follows("bwa_index").follows("process_radtags")

    if align_task_name == "bowtie":
        (pipeline.transform(
            task_func=stages.bowtie_align,
            name=align_task_name,
            input=process_radtags_outputs,
            filter=formatter(".+/(?P<sm>[^/]+).1.fq.gz"),
            output="%s/{sm[0]}.bowtie.bam" % output_dir["alignments"],
            extras=[
                os.path.join(output_dir["reference"], "reference"),
                "{path[0]}", output_dir["alignments"], "{sm[0]}",
                state.config.get_options("alignment_options")
            ])).follows("bowtie_index").follows("process_radtags")

    # Sort BAM and index
    pipeline.transform(task_func=stages.sort_bam,
                       name="sort_bam",
                       input=output_from(align_task_name),
                       filter=suffix(".bam"),
                       output=".sorted.bam")

    if filter_bams:
        final_bam_task_name = "filter_bam"
        pipeline.transform(
            task_func=stages.filter_bam,
            name="filter_bam",
            input=output_from("sort_bam"),
            filter=suffix(".sorted.bam"),
            output=".sorted.filtered.bam",
            extras=[state.config.get_options("samtools_view_options")])
    else:
        final_bam_task_name = "sort_bam"

    # Samtools flagstat
    pipeline.transform(task_func=stages.flagstat,
                       name="flagstat",
                       input=output_from(final_bam_task_name),
                       filter=suffix(".bam"),
                       output=".flagstat.txt",
                       output_dir=output_dir["flagstat"])

    # MultiQC: flagstat
    pipeline.merge(task_func=stages.multiqc_flagstat,
                   name="multiqc_flagstat",
                   input=output_from("flagstat"),
                   output="%s/multiqc_flagstat_report.html" % output_dir["qc"],
                   extras=[output_dir["qc"], output_dir["flagstat"]])

    # Stacks: gstacks
    pipeline.merge(task_func=stages.gstacks,
                   name="gstacks",
                   input=output_from(final_bam_task_name),
                   output="%s/catalog.fa.gz" % output_dir["gstacks"],
                   extras=[
                       output_dir["alignments"], output_dir["gstacks"],
                       align_task_name, final_bam_task_name, sample_list,
                       state.config.get_options("gstacks_options")
                   ])

    # Define outputs from each run of populations
    populations_outputs = []
    for r in populations_r:
        dir_name = "{pop_dir}/{analysis_name}_r{r}".format(
            pop_dir=output_dir["populations"],
            analysis_name=state.config.get_options("analysis_id"),
            r=r)
        populations_outputs.append(
            os.path.join(dir_name, "populations.snps.vcf"))
    # print(populations_outputs)

    # Stacks: populations
    pipeline.originate(task_func=stages.populations,
                       name="popluations",
                       output=populations_outputs,
                       extras=[
                           output_dir["gstacks"], output_dir["populations"],
                           popmap_file,
                           state.config.get_options("populations_options")
                       ]).follows("gstacks").follows("create_popmap_file")

    return pipeline
Exemple #36
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='crpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    #pipeline.originate(
    #    task_func=stages.original_reference,
    #    name='original_reference',
    #    output=reference_file)

    # Run fastQC on the FASTQ files
    pipeline.transform(task_func=stages.fastqc,
                       name='fastqc',
                       input=output_from('original_fastqs'),
                       filter=suffix('.fastq.gz'),
                       output='_fastqc')

    # Index the reference using BWA
    #pipeline.transform(
    #    task_func=stages.index_reference_bwa,
    #    name='index_reference_bwa',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #    name='index_reference_samtools',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output='.fa.fai')

    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])

    # # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')
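
    # A minimal sketch of what the align stage receives under the pairing
    # scheme above (hypothetical paths; the real stages.align_bwa is defined
    # elsewhere): the filter supplies the R1 file, add_inputs appends the
    # matching R2 file, and the extras list adds the captured sample name.
    def align_bwa_sketch(inputs, output_bam, sample):
        r1_fastq, r2_fastq = inputs
        # e.g. run: bwa mem ref.fa <r1_fastq> <r2_fastq> | samtools view -b -o <output_bam> -
        pass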

    # Sort alignment with sambamba
    pipeline.transform(task_func=stages.sort_bam_sambamba,
                       name='sort_alignment',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.sorted.bam')

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name='extract_genes_bedtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.mmr.bam')

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name='extract_chromosomes_samtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.chroms.bam')

    # Index the MMR genes bam file with samtools
    pipeline.transform(task_func=stages.index_bam,
                       name='index_mmr_alignment',
                       input=output_from('extract_genes_bedtools'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).mmr.bam'),
                       output='{path[0]}/{sample[0]}.mmr.bam.bai')

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    #pipeline.transform(
    #    task_func=stages.alignment_coverage_gatk,
    #    name='alignment_coverage_gatk',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs([reference_file]),
    #    output='{path[0]}/{sample[0]}.coverage_summary',
    #    extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name='index_alignment',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.sorted.bam.bai')

    # Generate alignment stats with bamtools
    pipeline.transform(task_func=stages.bamtools_stats,
                       name='bamtools_stats',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.stats.txt')

    # Extract the discordant paired-end alignments
    pipeline.transform(task_func=stages.extract_discordant_alignments,
                       name='extract_discordant_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.discordants.unsorted.bam')

    # Extract split-read alignments
    pipeline.transform(task_func=stages.extract_split_read_alignments,
                       name='extract_split_read_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.splitters.unsorted.bam')

    # Sort the discordant read alignments.
    # Samtools annoyingly takes the prefix of the output bam name as its argument,
    # so we pass the prefix as an extra argument. However, Ruffus needs to know the
    # full name of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_discordants',
        input=output_from('extract_discordant_alignments'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.discordants'],
        output='{path[0]}/{sample[0]}.discordants.bam')
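
    # A minimal sketch of the prefix/full-name split described above
    # (hypothetical helper, not the pipeline's real stages.sort_bam): Ruffus
    # tracks the complete ".discordants.bam" path, while legacy samtools sort
    # is handed only the prefix and appends ".bam" itself.
    def sort_bam_prefix_sketch(input_bam, output_bam, output_prefix):
        import subprocess
        # legacy usage: "samtools sort <in.bam> <out_prefix>" writes <out_prefix>.bam,
        # which is exactly the output_bam path that Ruffus checks for
        subprocess.check_call(["samtools", "sort", input_bam, output_prefix])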

    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #   task_func=stages.index_bam,
    #   name='index_discordants',
    #   input=output_from('sort_discordants'),
    #   filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #   output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort the split-read alignments.
    # As above, samtools takes the prefix of the output bam name as its argument,
    # so we pass the prefix as an extra argument, while Ruffus is given the full
    # name of the output bam file as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_splitters',
        input=output_from('extract_split_read_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.splitters'],
        output='{path[0]}/{sample[0]}.splitters.bam')

    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #    task_func=stages.index_bam,
    #    name='index_splitters',
    #    input=output_from('sort_splitters'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #    output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (pipeline.transform(
        task_func=stages.structural_variants_lumpy,
        name='structural_variants_lumpy',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        add_inputs=add_inputs([
            '{path[0]}/{sample[0]}.splitters.bam',
            '{path[0]}/{sample[0]}.discordants.bam'
        ]),
        output='{path[0]}/{sample[0]}.lumpy.vcf').follows('index_alignment').
     follows('sort_splitters').follows('sort_discordants'))

    # Call genotypes on lumpy output using SVTyper
    #(pipeline.transform(
    #    task_func=stages.genotype_svtyper,
    #    name='genotype_svtyper',
    #    input=output_from('structural_variants_lumpy'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam', '{path[0]}/{sample[0]}.splitters.bam']),
    #    output='{path[0]}/{sample[0]}.svtyper.vcf')
    #    .follows('align_bwa')
    #    .follows('sort_splitters')
    #    .follows('index_alignment')
    #    .follows('index_splitters')
    #    .follows('index_discordants'))

    # Call SVs with Socrates
    (pipeline.transform(
        task_func=stages.structural_variants_socrates,
        name='structural_variants_socrates',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        # output goes to {path[0]}/socrates/
        output='{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt',
        extras=['{path[0]}']))

    # Call DELs with DELLY
    pipeline.merge(task_func=stages.deletions_delly,
                   name='deletions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DEL.vcf')

    # Call DUPs with DELLY
    pipeline.merge(task_func=stages.duplications_delly,
                   name='duplications_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DUP.vcf')

    # Call INVs with DELLY
    pipeline.merge(task_func=stages.inversions_delly,
                   name='inversions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.INV.vcf')

    # Call TRAs with DELLY
    pipeline.merge(task_func=stages.translocations_delly,
                   name='translocations_delly',
                   input=output_from('sort_alignment'),
                   output='delly.TRA.vcf')

    # Join both read pair files using gustaf_mate_joining
    #pipeline.transform(
    #    task_func=stages.gustaf_mate_joining,
    #    name='gustaf_mate_joining',
    #    input=output_from('fastq_to_fasta'),
    #    # Match the R1 (read 1) FASTA file and grab the path and sample name.
    #    # This will be the first input to the stage.
    #    # We assume the sample name may consist of only alphanumeric
    #    # characters.
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #    # Add one more input to the stage:
    #    #    1. The corresponding R2 FASTA file
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #    output='{path[0]}/{sample[0]}.joined_mates.fasta')

    # Call structural variants with pindel
    #(pipeline.transform(
    #    task_func=stages.structural_variants_pindel,
    #    name='structural_variants_pindel',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]),
    #    output='{path[0]}/{sample[0]}.pindel')
    #    .follows('index_reference_bwa')
    #    .follows('index_reference_samtools'))

    return pipeline

#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#
#   First task
#
@originate(["a.1", "b.1"])
def start_task(output_file_name):
    with open(output_file_name,  "w") as f:
        pass

#
#   Forwards the file name unchanged, so it is always exactly as up to date as its input file...
#
@transform(start_task, suffix(".1"), ".1")
def same_file_name_task(input_file_name, output_file_name):
    pass

#
#   Symlinks the input file, so it is up to date as long as the link is not missing
#
@transform(start_task, suffix(".1"), ".linked.1")
def linked_file_name_task(input_file_name, output_file_name):
    try:
        os.symlink(input_file_name, output_file_name)
    except:
        print (input_file_name, output_file_name)
        raise

Exemple #38
0
    try:
        with open(input_file, 'rb') as f:
            signature = f.read(4)
            if signature == b'%PDF':
                re_symlink(input_file, output_file)
                return
    except EnvironmentError as e:
        log.error(e)
        sys.exit(ExitCode.input_file)

    triage_image_file(input_file, output_file, log)


@transform(
    input=triage,
    filter=suffix('.pdf'),
    output='.repaired.pdf',
    output_dir=work_folder,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def repair_pdf(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):

    qpdf.repair(input_file, output_file, log)
    with pdfinfo_lock:
        pdfinfo.extend(pdf_get_all_pageinfo(output_file))
        log.debug(pdfinfo)
    # This also updates timestamps.  Ruffus doesn't recognize these files as complete results unless the
    # timestamp is up to date.
    sh.mv("testdata.manual.2009.06.14.csv", "sentiment140.test.csv")
    sh.mv("training.1600000.processed.noemoticon.csv", "sentiment140.train.csv")

    # Re-encode the files as utf8.  They look like utf8 already (e.g. file thinks they're utf8)
    # but they are actually encoded as latin1.  This doesn't make a difference for the test data
    # (the utf8 and latin1 encoded test data are identical files) but the train data has some
    # byte sequences that are invalid utf8 and this makes simplejson really upset.
    for output_file in output_file_names:
        sh.mv(output_file, "temp")
        sh.iconv("-f", "latin1", "-t", "utf8", "temp", _out=output_file)
        sh.rm("temp")
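
    # A pure-Python sketch of the same re-encoding step (hypothetical helper,
    # equivalent in spirit to the iconv call above): read the file as latin1
    # and write it back out as utf8 so downstream JSON handling is happy.
    def reencode_latin1_to_utf8_sketch(path):
        import io
        with io.open(path, encoding="latin1") as src:
            text = src.read()
        with io.open(path, "w", encoding="utf8") as dst:
            dst.write(text)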


@ruffus.transform(extract_data, ruffus.suffix(".csv"), ".json")
def reformat_data(input_file_name, output_file_name):
    df = pd.io.parsers.read_csv(
        input_file_name,
        names=["polarity", "id", "date", "query", "user", "text"],
        encoding='utf8')

    # drop columns we don't care about
    df = df[["text", "polarity"]]

    # remove neutral class
    df = df[df.polarity != 2]
    assert all((df.polarity == 4) | (df.polarity == 0))

    # re-map polarity to smilies
    df.polarity = df.polarity.apply(lambda x: ':)' if x == 4 else ':(')
                                   CHECKSUM_HISTORY_TIMESTAMPS,
                                   CHECKSUM_FUNCTIONS,
                                   CHECKSUM_FUNCTIONS_AND_PARAMS)
from ruffus.ruffus_exceptions import RethrownJobError

possible_chksms = range(CHECKSUM_FUNCTIONS_AND_PARAMS + 1)
workdir = 'tmp_test_job_completion/'
input_file = os.path.join(workdir, 'input.txt')
transform1_out = input_file.replace('.txt', '.output')
split1_outputs = [ os.path.join(workdir, 'split.out1.txt'),
                   os.path.join(workdir, 'split.out2.txt')]
merge2_output =  os.path.join(workdir, 'merged.out')

runtime_data = []

@transform(input_file, suffix('.txt'), '.output', runtime_data)
def transform1(in_name, out_name, how_many):
    with open(out_name, 'w') as outfile:
        outfile.write(open(in_name).read())

@transform(input_file, suffix('.txt'), '.output', runtime_data)
def transform_raise_error(in_name, out_name, how_many):
    # raise an error unless runtime_data has 'okay' in it
    with open(out_name, 'w') as outfile:
        outfile.write(open(in_name).read())
    if 'okay' not in runtime_data:
        raise RuntimeError("'okay' wasn't in runtime_data!")

@split(input_file, split1_outputs)
def split1(in_name, out_names):
    for n in out_names:
def task1(outfile, *extra_params):
    """
    First task
    """
    with open(tempdir + "jobs.start", "a") as oo:
        oo.write('job = %s\n' % json.dumps([None, outfile]))
    test_job_io(None, outfile, extra_params)
    with open(tempdir + "jobs.finish", "a") as oo:
        oo.write('job = %s\n' % json.dumps([None, outfile]))


#
#    task2
#
@posttask(lambda: do_write(test_file, "Task 2 Done\n"))
@transform(task1, suffix(".1"), ".2")
def task2(infiles, outfiles, *extra_params):
    """
    Second task
    """
    with open(tempdir + "jobs.start", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))
    test_job_io(infiles, outfiles, extra_params)
    with open(tempdir + "jobs.finish", "a") as oo:
        oo.write('job = %s\n' % json.dumps([infiles, outfiles]))


#
#    task3
#
@transform(task2, regex('(.*).2'), inputs([r"\1.2", tempdir + "a.1"]), r'\1.3')
Exemple #42
0

@follows(mkdir("test_active_if"))
@originate(['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter")
def task1(outfile, extra):
    """
    First task
    """
    # N.B. originate works with an extra parameter
    helper(None, outfile)


#
#    task2
#
@transform(task1, suffix(".1"), ".2")
def task2(infile, outfile):
    """
    Second task
    """
    helper(infile, outfile)


#
#    task3
#
@active_if(lambda: pipeline_active_if)
@transform(task1, suffix(".1"), ".3")
def task3(infile, outfile):
    """
    Third task
    #
    for i in range(JOBS_PER_TASK):
        with open(tempdir + "/files.split.%s.%03d.fa" % (original_index, i), "w") as oo:
            pass

    with open(success_flag,  "w") as oo:
        pass


# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#
#    align_sequences
#
@posttask(lambda: sys.stderr.write("\tSequences aligned\n"))
# fa -> aln
@transform(split_fasta_file, suffix(".fa"), ".aln")
def align_sequences(input_file, output_filename):
    with open(output_filename, "w") as oo:
        oo.write("%s\n" % output_filename)


# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#
#    percentage_identity
#
@posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))
@transform(align_sequences,             # find all results from align_sequences
           suffix(".aln"),             # replace suffix with:
           [r".pcid",  # .pcid suffix for the result
            r".pcid_success"])  # .pcid_success to indicate job completed
def percentage_identity(input_file, output_files):
Exemple #44
0
#
@originate([tempdir + 'a.1'] + runtime_files)
def task1(outfile):
    """
    First task
    """
    output_text = ""
    output_text += "    -> " + json.dumps(outfile) + "\n"
    with open(outfile, "w") as oo:
        oo.write(output_text)


#
#    task2
#
@transform(task1, suffix(".1"), ".2")
def task2(infile, outfile):
    """
    Second task
    """
    if infile:
        with open(infile) as ii:
            output_text = ii.read()
    else:
        output_text = ""
    output_text += json.dumps(infile) + " -> " + json.dumps(outfile) + "\n"
    with open(outfile, "w") as oo:
        oo.write(output_text)


#
         formatter(),
         "{path[0]}/all.tmp2")
#@transform([generate_initial_files1, generate_initial_files2, generate_initial_files3,
#            generate_initial_files4],
#            formatter( ),
#            "{path[0]}/{basename[0]}.tmp2")
def test_task2( infiles, outfile):
    with open(outfile, "w") as p:
        pass
    #print >>sys.stderr, "8" * 80, "\n", "    task2 :%s %s " % (infiles, outfile)

#___________________________________________________________________________
#
#   test_task3
#___________________________________________________________________________
@transform(test_task2, suffix(".tmp2"), ".tmp3")
def test_task3( infile, outfile):
    global throw_exception
    if throw_exception is not None:
        throw_exception = not throw_exception
    if throw_exception:
        #print >>sys.stderr, "Throw exception for ", infile, outfile
        raise Exception("oops")
    else:
        #print >>sys.stderr, "No throw exception for ", infile, outfile
        pass
    with open(outfile, "w") as p: pass
    #print >>sys.stderr, "8" * 80, "\n", "    task3 :%s %s " % (infile, outfile)

#___________________________________________________________________________
#
Exemple #46
0
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name. 
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort the BAM file using Picard 
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard 
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK 
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK 
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK 
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK 
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK 
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=suffix('.sort.dedup.realn.recal.bam'),
        output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='PCExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK 
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK  
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK  
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.indel_recal', 'PCExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK  
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK 
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline
    statement = '''awk 'BEGIN { printf("word\\tfreq\\n"); } 
    {for (i = 1; i <= NF; i++) freq[$i]++}
    END { for (word in freq) printf "%%s\\t%%d\\n", word, freq[word] }'
    < %(infile)s > %(outfile)s'''

    # execute the command in the variable "statement".
    #
    # The command will be sent to the cluster.  The statement will be
    # interpolated with any options that are defined in the
    # configuration files or variables that are declared in the calling
    # function.  For example, %(infile)s will be substituted with the
    # contents of the variable "infile".
    P.run(statement)
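

# A minimal sketch of the interpolation idea described above (not CGAT's
# actual machinery, which also merges in configuration options): names such
# as "infile" are looked up and substituted into the %(infile)s placeholders
# before the command is executed.
def interpolate_statement_sketch(statement, **params):
    return statement % params

# e.g. interpolate_statement_sketch("wc -w < %(infile)s > %(outfile)s",
#                                   infile="in.txt", outfile="out.txt")
# returns "wc -w < in.txt > out.txt"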


@transform(count_words, suffix(".counts"), "_counts.load")
def load_word_counts(infile, outfile):
    '''load results of word counting into database.'''
    P.load(infile, outfile, "--add-index=word")


# ---------------------------------------------------
# Generic pipeline tasks
@follows(load_word_counts)
def full():
    pass


def main(argv=None):
    if argv is None:
        argv = sys.argv
        r"\g<PREFIX>",          # extra: prefix = \2
        r"\4")                  # extra: extension
def test_regex_unmatched_task(infiles, outfile,
                    prefix1,
                    prefix2,
                    extension):
    raise Exception("Should blow up first")


#___________________________________________________________________________
#
#   test_suffix_task
#___________________________________________________________________________
@transform(
        generate_initial_files1,
        suffix(".tmp1"),
        r".tmp2",           # output file
        r"\1")              # extra: basename
def test_suffix_task(infile, outfile,
                    basename):
    with open (outfile, "w") as f: pass


#___________________________________________________________________________
#
#   test_suffix_unmatched_task
#___________________________________________________________________________
@transform(
        generate_initial_files1,
        suffix(".tmp1"),
        r".tmp2",           # output file
                        filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.tmp1)"),
                        output=r"\1/\g<PREFIX>\3.tmp2",  # output file
                        extras=[r"\2",                # extra: prefix = \2
                                r"\g<PREFIX>",        # extra: prefix = \2
                                r"\4"])               # extra: extension
test_pipeline.transform(task_func=check_regex_unmatched_task,
                        input=generate_initial_files1,
                        filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.xxx)"),
                        output=r"\1/\g<PREFIXA>\3.tmp2",  # output file
                        extras=[r"\2",                 # extra: prefix = \2
                                r"\g<PREFIX>",         # extra: prefix = \2
                                r"\4"])                # extra: extension

test_pipeline.transform(task_func=check_suffix_task,
                        input=generate_initial_files1,
                        filter=suffix(".tmp1"),
                        output=r".tmp2",           # output file
                        extras=[r"\1"])            # extra: basename

test_pipeline.transform(task_func=check_suffix_unmatched_task,
                        input=generate_initial_files1,
                        filter=suffix(".tmp1"),
                        output=r".tmp2",           # output file
                        extras=[r"\2"])            # extra: unknown

test_pipeline.transform(task_func=check_suffix_unmatched_task2,
                        input=generate_initial_files1,
                        filter=suffix(".tmp2"),
                        output=r".tmp2")           # output file

test_pipeline.transform(task_func=check_regex_misspelt_capture_error_task,
Exemple #50
0
    with open(input_file_name) as ii:
        for i, line in enumerate(ii):
            if i % CHUNK_SIZE == 0:
                cnt_files += 1
                if output_file:
                    output_file.close()
                output_file = open(tempdir + "%d.chunks" % cnt_files, "w")
            output_file.write(line)
    if output_file:
        output_file.close()

#---------------------------------------------------------------
#
#   Calculate sum and sum of squares for each chunk file
#
@transform(step_4_split_numbers_into_chunks, suffix(".chunks"), ".sums")
def step_5_calculate_sum_of_squares (input_file_name, output_file_name):
    with open(output_file_name,  "w") as oo:
        sum_squared, sum = [0.0, 0.0]
        cnt_values = 0
        with open(input_file_name) as ii:
            for line in ii:
                cnt_values += 1
                val = float(line.rstrip())
                sum_squared += val * val
                sum += val
        oo.write("%s\n%s\n%d\n" % (repr(sum_squared), repr(sum), cnt_values))


def print_hooray_again():
    print("     hooray again")
Exemple #51
0
#
#   First task
#
@originate(["a.1", "b.1"])
def start_task(output_file_name):
    with open(output_file_name, "w") as f:
        pass


#
#   Forwards the file name unchanged, so it is always exactly as up to date as its input file...
#


@transform(start_task, suffix(".1"), ".1")
def same_file_name_task(input_file_name, output_file_name):
    pass


#
#   Symlinks the input file, so it is up to date as long as the link is not missing
#


@transform(start_task, suffix(".1"), ".linked.1")
def linked_file_name_task(input_file_name, output_file_name):
    try:
        os.symlink(input_file_name, output_file_name)
    except:
        print(input_file_name, output_file_name)
from ruffus import (transform, follows, collate, files, split, merge,
                    suffix, mkdir, jobs_limit, output_from)
from ruffus.task import active_if

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                               main_mutex as log_mtx)
from hts_waterworks.bootstrap import cfg
from hts_waterworks.utils.common import parseFastq

# filtering
original_reads = '*.fastq'
prev_output = original_reads
prev_suffix = '.fastq'

@active_if(cfg.getboolean('filtering', 'convert_sanger_to_illumina'))
@transform(prev_output, suffix(prev_suffix), '.fastq_illumina')
def convert_fastq(in_fastq, out_fastq):
    'convert sanger fastq format (phred-33) to illumina format (phred-64)'
    base_out = os.path.splitext(out_fastq)[0]
    records = SeqIO.parse(in_fastq, "fastq")
    with open(base_out, 'w') as outfile:
        SeqIO.write(records, outfile, "fastq-illumina")
    check_call('gzip %s' % base_out, shell=True)
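
# A tiny worked illustration of the phred-33 -> phred-64 offset performed by
# the conversion above (values chosen for illustration, not taken from the
# pipeline): a base with quality 30 is '?' in Sanger/phred-33 FASTQ and '^'
# in old Illumina/phred-64 FASTQ.
assert chr(33 + 30) == '?' and chr(64 + 30) == '^'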
if cfg.getboolean('filtering', 'convert_sanger_to_illumina'):
    prev_output = convert_fastq
    prev_suffix = ''



@active_if(cfg.getboolean('filtering', 'clip_adapter'))
@transform(prev_output, suffix(prev_suffix), '.noAdapter')
Exemple #53
0
def triage(input_file, output_file, log):
    try:
        with open(input_file, 'rb') as f:
            signature = f.read(4)
            if signature == b'%PDF':
                re_symlink(input_file, output_file)
                return
    except EnvironmentError as e:
        log.error(e)
        sys.exit(ExitCode.input_file)

    triage_image_file(input_file, output_file, log)


@transform(input=triage,
           filter=suffix('.pdf'),
           output='.repaired.pdf',
           output_dir=work_folder,
           extras=[_log, _pdfinfo, _pdfinfo_lock])
def repair_pdf(input_file, output_file, log, pdfinfo, pdfinfo_lock):

    qpdf.repair(input_file, output_file, log)
    with pdfinfo_lock:
        pdfinfo.extend(pdf_get_all_pageinfo(output_file))
        log.debug(pdfinfo)


def get_pageinfo(input_file, pdfinfo, pdfinfo_lock):
    pageno = int(os.path.basename(input_file)[0:6]) - 1
    with pdfinfo_lock:
        pageinfo = pdfinfo[pageno].copy()
Exemple #54
0
import gzip
import simplejson as json


data_dir = os.environ['DATA']
words_dir = os.path.join(data_dir, "words")

# /usr/share/dict/words is a text file full of words on most unix systems

@ruffus.follows(ruffus.mkdir(words_dir))
@ruffus.originate(os.path.join(words_dir, "words.txt"))
def get_words(output_file):
    sh.cp("/usr/share/dict/words", output_file)
    sh.chmod("u+w", output_file)

@ruffus.transform(get_words, ruffus.suffix(".txt"), ".alphabet.json")
def build_alphabet_dictionary(input_file, output_file):
    characters = set()
    with open(input_file) as f:
        for line in f:
            characters = characters.union(line.rstrip())

    alphabet = list(sorted(characters)) + ['PADDING', 'START', 'END']

    with open(output_file, 'w') as f:
        f.write(json.dumps(alphabet))

@ruffus.transform(build_alphabet_dictionary, ruffus.suffix(".alphabet.json"), ".alphabet.encoding.json")
def encode_alphabet_dictionary(input_file, output_file):
    alphabet = dict()
    with open(input_file) as alphabet_file:
Exemple #55
0
def build_pipeline(options, work_folder, log, context):
    main_pipeline = Pipeline.pipelines['main']

    # Triage
    task_triage = main_pipeline.transform(
        task_func=triage,
        input=os.path.join(work_folder, 'origin'),
        filter=formatter('(?i)'),
        output=os.path.join(work_folder, 'origin.pdf'),
        extras=[log, context])

    task_repair_pdf = main_pipeline.transform(task_func=repair_pdf,
                                              input=task_triage,
                                              filter=suffix('.pdf'),
                                              output='.repaired.pdf',
                                              output_dir=work_folder,
                                              extras=[log, context])

    # Split (kwargs for split seems to be broken, so pass plain args)
    task_split_pages = main_pipeline.split(split_pages,
                                           task_repair_pdf,
                                           os.path.join(
                                               work_folder, '*.page.pdf'),
                                           extras=[log, context])

    # Rasterize preview
    task_rasterize_preview = main_pipeline.transform(
        task_func=rasterize_preview,
        input=task_split_pages,
        filter=suffix('.page.pdf'),
        output='.preview.jpg',
        output_dir=work_folder,
        extras=[log, context])
    task_rasterize_preview.active_if(options.rotate_pages)

    # Orient
    task_orient_page = main_pipeline.collate(
        task_func=orient_page,
        input=[task_split_pages, task_rasterize_preview],
        filter=regex(
            r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
        output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
        extras=[log, context])

    # Rasterize actual
    task_rasterize_with_ghostscript = main_pipeline.transform(
        task_func=rasterize_with_ghostscript,
        input=task_orient_page,
        filter=suffix('.ocr.oriented.pdf'),
        output='.page.png',
        output_dir=work_folder,
        extras=[log, context])

    # Preprocessing subpipeline
    task_preprocess_remove_background = main_pipeline.transform(
        task_func=preprocess_remove_background,
        input=task_rasterize_with_ghostscript,
        filter=suffix(".page.png"),
        output=".pp-background.png",
        extras=[log, context])

    task_preprocess_deskew = main_pipeline.transform(
        task_func=preprocess_deskew,
        input=task_preprocess_remove_background,
        filter=suffix(".pp-background.png"),
        output=".pp-deskew.png",
        extras=[log, context])

    task_preprocess_clean = main_pipeline.transform(
        task_func=preprocess_clean,
        input=task_preprocess_deskew,
        filter=suffix(".pp-deskew.png"),
        output=".pp-clean.png",
        extras=[log, context])

    task_select_ocr_image = main_pipeline.collate(
        task_func=select_ocr_image,
        input=[task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r"\1.ocr.png"),
        extras=[log, context])

    # HOCR OCR
    task_ocr_tesseract_hocr = main_pipeline.transform(
        task_func=ocr_tesseract_hocr,
        input=task_select_ocr_image,
        filter=suffix(".ocr.png"),
        output=".hocr",
        extras=[log, context])
    task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
    task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')
    if tesseract.v4():
        task_ocr_tesseract_hocr.jobs_limit(2)  # Uses multi-core on its own

    task_select_visible_page_image = main_pipeline.collate(
        task_func=select_visible_page_image,
        input=[
            task_rasterize_with_ghostscript, task_preprocess_remove_background,
            task_preprocess_deskew, task_preprocess_clean
        ],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r'\1.image'),
        extras=[log, context])
    task_select_visible_page_image.graphviz(shape='diamond')

    task_select_image_layer = main_pipeline.collate(
        task_func=select_image_layer,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.image-layer.pdf'),
        extras=[log, context])
    task_select_image_layer.graphviz(fillcolor='"#00cc66"', shape='diamond')
    task_select_image_layer.active_if(options.pdf_renderer == 'hocr'
                                      or options.pdf_renderer == 'tess4')

    task_render_hocr_page = main_pipeline.transform(
        task_func=render_hocr_page,
        input=task_ocr_tesseract_hocr,
        filter=suffix('.hocr'),
        output='.text.pdf',
        extras=[log, context])
    task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')

    task_render_hocr_debug_page = main_pipeline.collate(
        task_func=render_hocr_debug_page,
        input=[task_select_visible_page_image, task_ocr_tesseract_hocr],
        filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
        output=os.path.join(work_folder, r'\1.debug.pdf'),
        extras=[log, context])
    task_render_hocr_debug_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_debug_page.active_if(options.pdf_renderer == 'hocr')
    task_render_hocr_debug_page.active_if(options.debug_rendering)

    # Tesseract OCR + text only PDF
    task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_textonly_pdf,
        input=[task_select_ocr_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.ocr.png|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.text.pdf'),
        extras=[log, context])
    task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
    task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'tess4')
    if tesseract.v4():
        task_ocr_tesseract_textonly_pdf.jobs_limit(2)

    task_combine_layers = main_pipeline.collate(
        task_func=combine_layers,
        input=[
            task_render_hocr_page, task_ocr_tesseract_textonly_pdf,
            task_select_image_layer
        ],
        filter=regex(r".*/(\d{6})(?:\.text\.pdf|\.image-layer\.pdf)"),
        output=os.path.join(work_folder, r'\1.rendered.pdf'),
        extras=[log, context])
    task_combine_layers.graphviz(fillcolor='"#00cc66"')
    task_combine_layers.active_if(options.pdf_renderer == 'hocr'
                                  or options.pdf_renderer == 'tess4')

    # Tesseract OCR+PDF
    task_ocr_tesseract_and_render_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_and_render_pdf,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.rendered.pdf'),
        extras=[log, context])
    task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"')
    task_ocr_tesseract_and_render_pdf.active_if(
        options.pdf_renderer == 'tesseract')
    if tesseract.v4():
        task_ocr_tesseract_and_render_pdf.jobs_limit(2)  # Uses multi-core

    # PDF/A
    task_generate_postscript_stub = main_pipeline.transform(
        task_func=generate_postscript_stub,
        input=task_repair_pdf,
        filter=formatter(r'\.repaired\.pdf'),
        output=os.path.join(work_folder, 'pdfa.ps'),
        extras=[log, context])
    task_generate_postscript_stub.active_if(options.output_type == 'pdfa')

    # Bypass valve
    task_skip_page = main_pipeline.transform(
        task_func=skip_page,
        input=task_orient_page,
        filter=suffix('.skip.oriented.pdf'),
        output='.done.pdf',
        output_dir=work_folder,
        extras=[log, context])

    # Merge pages
    task_merge_pages_ghostscript = main_pipeline.merge(
        task_func=merge_pages_ghostscript,
        input=[
            task_combine_layers, task_render_hocr_debug_page, task_skip_page,
            task_ocr_tesseract_and_render_pdf, task_generate_postscript_stub
        ],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_ghostscript.active_if(options.output_type == 'pdfa')

    task_merge_pages_qpdf = main_pipeline.merge(
        task_func=merge_pages_qpdf,
        input=[
            task_combine_layers, task_render_hocr_debug_page, task_skip_page,
            task_ocr_tesseract_and_render_pdf, task_repair_pdf
        ],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_qpdf.active_if(options.output_type == 'pdf')

    # Finalize
    task_copy_final = main_pipeline.merge(
        task_func=copy_final,
        input=[task_merge_pages_ghostscript, task_merge_pages_qpdf],
        output=options.output_file,
        extras=[log, context])
Exemple #56
0
#---------------------------------------------------------------
#   create initial files
#
@mkdir(tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven')
@originate([   [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job1.a.start', tempdir + 'job1.b.start'],
               [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job2.a.start', tempdir + 'job2.b.start'],
               [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job3.a.start', tempdir + 'job3.b.start']    ])
def create_initial_file_pairs(output_files):
    # create both files as necessary
    for output_file in output_files:
        with open(output_file, "w") as oo: pass

#---------------------------------------------------------------
#   first task
@transform(create_initial_file_pairs, suffix(".start"), ".output.1")
def first_task(input_files, output_file):
    with open(output_file, "w"): pass


#---------------------------------------------------------------
#   second task
@transform(first_task, suffix(".output.1"), ".output.2")
def second_task(input_files, output_file):
    with open(output_file, "w"): pass

test_pipeline = Pipeline("test")
test_pipeline.originate(output = [    [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job1.a.start',  tempdir + 'job1.b.start'],
                                       [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job2.a.start', tempdir + 'job2.b.start'],
                                       [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job3.a.start', tempdir + 'job3.b.start']    ],
                                       task_func = create_initial_file_pairs)