def test_newstyle_ruffus(self):

        print("     Run pipeline normally...")
        test_pipeline = Pipeline("test")
        test_pipeline.originate(make_start, [tempdir + 'start'])

        test_pipeline.split(split_start, make_start, tempdir + '*.split')

        test_pipeline.subdivide(subdivide_start, split_start, formatter(), tempdir + '{basename[0]}_*.subdivided', tempdir + '{basename[0]}')
        if self.graph_viz_present:
            test_pipeline.printout_graph(tempdir + "flowchart.dot")
            test_pipeline.printout_graph(tempdir + "flowchart.jpg",
                                         target_tasks=[subdivide_start],
                                         forcedtorun_tasks=[split_start],
                                         no_key_legend=True)
            test_pipeline.printout_graph(tempdir + "flowchart.svg", no_key_legend=False)
            # Unknown format
            try:
                test_pipeline.printout_graph(tempdir + "flowchart.unknown", no_key_legend=False)
                raise Exception("Failed to throw exception for test_pipeline.printout_graph unknown extension")
            except CalledProcessError:
                pass
            test_pipeline.printout_graph(tempdir + "flowchart.unknown", "svg", no_key_legend=False)

        else:
            test_pipeline.printout_graph(tempdir + "flowchart.dot",
                                         target_tasks=[subdivide_start],
                                         forcedtorun_tasks=[split_start],
                                         no_key_legend=True)
    def test_newstyle_ruffus(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_func=make_start,
                                output=[tempdir + 'start'])
        test_pipeline.split(task_func=split_start,
                            input=make_start, output=tempdir + '*.split')
        test_pipeline.subdivide(task_func=subdivide_start,
                                input=split_start,
                                filter=formatter(),
                                output=tempdir + '{basename[0]}_*.subdivided',
                                extras=[tempdir + '{basename[0]}'])

        expected_files_after_1_runs = ["start", "0.split", "0_0.subdivided"]
        expected_files_after_2_runs = [
            "1.split", "0_1.subdivided", "1_0.subdivided"]
        expected_files_after_3_runs = [
            "2.split", "0_2.subdivided", "1_1.subdivided", "2_0.subdivided"]
        expected_files_after_4_runs = [
            "3.split", "0_3.subdivided", "1_2.subdivided", "2_1.subdivided", "3_0.subdivided"]

        print("     1 Run pipeline normally...")
        test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                                  expected_files_after_2_runs)
        print("     2 Check that running again does nothing. (All up to date).")
        test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                                  expected_files_after_2_runs)
        time.sleep(2)

        print("     3 Running again with forced tasks to generate more files...")
        test_pipeline.run(forcedtorun_tasks=[
                          "test::make_start"], multiprocess=10, verbose=TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                  + expected_files_after_2_runs,
                                                  expected_files_after_3_runs)
        print("     4 Check that running again does nothing. (All up to date).")
        test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                  + expected_files_after_2_runs,
                                                  expected_files_after_3_runs)
        time.sleep(2)

        print("     5 Running again with forced tasks to generate even more files...")
        test_pipeline.run(forcedtorun_tasks=make_start,
                          multiprocess=10, verbose=TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                  + expected_files_after_2_runs
                                                  + expected_files_after_3_runs,
                                                  expected_files_after_4_runs)
        print("     6 Check that running again does nothing. (All up to date).")
        test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                  + expected_files_after_2_runs
                                                  + expected_files_after_3_runs,
                                                  expected_files_after_4_runs)
Example #3
    def test_newstyle_mkdir_run(self):
        test_pipeline = Pipeline("test")

        test_pipeline.split(task_func = generate_initial_files1,
                            input = 1,
                            output = [tempdir +  "/" + prefix + "_name.tmp1" for prefix in "abcd"])

        test_pipeline.transform( task_func = test_transform,
                                 input     = generate_initial_files1,
                                 filter    = formatter(),
                                 output    = "{path[0]}/{basename[0]}.dir/{basename[0]}.tmp2")\
            .mkdir(tempdir + "/test1")\
            .mkdir(tempdir + "/test2")\
            .mkdir(generate_initial_files1, formatter(),
                        ["{path[0]}/{basename[0]}.dir", 3, "{path[0]}/{basename[0]}.dir2"])

        test_pipeline.mkdir(test_transform2, tempdir + "/test3")\
            .mkdir(generate_initial_files1, formatter(),
                    "{path[0]}/{basename[0]}.dir2")
        cleanup_tmpdir()
        pipeline_run([test_transform, test_transform2], verbose=0, multiprocess=2, pipeline="main")
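# A hedged sketch (an assumption, not the original helper): cleanup_tmpdir, as called
# above, plausibly wipes any previous output so the run starts from an empty temporary
# directory.
import os
import shutil

def cleanup_tmpdir():
    # Remove whatever the last run left behind, then recreate the working directory.
    shutil.rmtree(tempdir, ignore_errors=True)
    os.makedirs(tempdir)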
    def test_transform_with_missing_formatter_args_b(self):
        test_pipeline = Pipeline("test")


        test_pipeline.originate(task_func   = generate_initial_files,
                                output      = [os.path.join(tempdir, ff + ".tmp") for ff in "abcd"])\
            .mkdir(tempdir)


        test_pipeline.transform(task_func   = transform_with_missing_formatter_args,
                                input       = generate_initial_files,
                                filter      = formatter(),
                                output      = "{path[0]}/{basename[0]}.task1",
                                extras      =['echo {dynamic_message} > {some_file}'])
        s = StringIO()
        test_pipeline.printout(s, [transform_with_missing_formatter_args], verbose=4, wrap_width=10000, pipeline="test")
        self.assertIn("Missing key = {dynamic_message}", s.getvalue())

        #log to stream
        s = StringIO()
        logger = t_stream_logger(s)
        test_pipeline.run([transform_with_missing_formatter_args], verbose=5, pipeline="test", logger=logger)
        self.assertIn("Missing key = {dynamic_message}", s.getvalue())
    def create_pipeline(self):
        #each pipeline has a different name
        global cnt_pipelines
        cnt_pipelines = cnt_pipelines + 1
        test_pipeline = Pipeline("test %d" % cnt_pipelines)

        test_pipeline.originate(task_func   = generate_initial_files1,
                                output      = [tempdir + prefix + "_name.tmp1" for prefix in "abcd"])

        test_pipeline.originate(task_func   = generate_initial_files2,
                                output      = [tempdir +  "e_name.tmp1", tempdir +  "f_name.tmp1"])

        test_pipeline.originate(task_func   = generate_initial_files3,
                                output      = [tempdir +  "g_name.tmp1", tempdir +  "h_name.tmp1"])

        test_pipeline.originate(task_func   = generate_initial_files4,
                                output      = tempdir +  "i_name.tmp1")

        test_pipeline.collate(  task_func   = test_task2,
                                input       = [generate_initial_files1,
                                               generate_initial_files2,
                                               generate_initial_files3,
                                               generate_initial_files4],
                                filter      = formatter(),
                                output      = "{path[0]}/all.tmp2")

        test_pipeline.transform(task_func   = test_task3,
                                input       = test_task2,
                                filter      = suffix(".tmp2"),
                                output      = ".tmp3")

        test_pipeline.transform(task_func   = test_task4,
                                input       = test_task3,
                                filter      = suffix(".tmp3"),
                                output      = ".tmp4")
        return test_pipeline
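# A hedged usage sketch for the factory above: because each call builds a Pipeline with
# a unique name, several independent pipelines can coexist and be run separately. The
# wrapper name and run() arguments are illustrative assumptions.
def run_two_independent_pipelines(test_case):
    pipeline_a = test_case.create_pipeline()
    pipeline_b = test_case.create_pipeline()
    # Running one pipeline leaves the other untouched because their names differ.
    pipeline_a.run(multiprocess=2, verbose=0)
    pipeline_b.run(multiprocess=2, verbose=0)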
Example #6
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='thepipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        # filter=formatter('(?P<path>.+)/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+)_1.fastq.gz'),
        # 1_HFYLVCCXX:2:TCCGCGAA_2_GE0343_1.fastq.gz
        # 1_HCJWFBCXX:GGACTCCT_L001_9071584415739518822-AGRF-023_R2.fastq.gz
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # e.g. C2WPF.5_Solexa-201237_5_X4311_1.fastq.gz
        add_inputs=add_inputs(
            '{path[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{readid[0]}', '{lib[0]}', '{lane[0]}', '{sample[0]}'],
        # extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Local realignment using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(
        task_func=stages.realigner_target_creator,
        name='realigner_target_creator',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('realigner_target_creator'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.bam'),
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        filter=formatter(
            # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'),
            # '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        # output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Merge lane bams to sample bams
    pipeline.collate(
        task_func=stages.merge_sample_bams,
        name='merge_sample_bams',
        filter=formatter(
            # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).sort.dedup.realn.recal.bam'),
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).sort.dedup.realn.recal.bam'),
        # inputs=add_inputs('alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        input=output_from('print_reads_gatk'),
        output='alignments/{sample[0]}/{sample[0]}.merged.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard2',
        input=output_from('merge_sample_bams'),
        # filter=formatter(
        # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).merged.bam'),
        filter=suffix('.merged.bam'),
        # XXX should make metricsup an extra output?
        output=['.merged.dedup.bam', '.metricsdup'])

    # Local realignment2 using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(
        task_func=stages.realigner_target_creator,
        name='realigner_target_creator2',
        input=output_from('mark_duplicates_picard2'),
        filter=suffix('.dedup.bam'),
        output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk2',
        input=output_from('realigner_target_creator2'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.intervals'),
        # filter=formatter(
        # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{sample[0]}.merged.dedup.bam'),
        output='alignments/{sample[0]}/{sample[0]}.merged.dedup.realn.bam')
        .follows('mark_duplicates_picard2'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('local_realignment_gatk2'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.dedup.realn.bam'),
        output='variants/{sample[0]}.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_haplotypecaller_gatk'),
        output='variants/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.combined.vcf'),
        output='.raw.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(['ALL.snp_recal', 'ALL.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(
            ['ALL.indel_recal', 'ALL.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['ALL.recal_INDEL.vcf']),
        # output='.combined.vcf')
        output='ALL.raw.vqsr.vcf')
        .follows('apply_indel_recalibrate_gatk'))
    #
    # # Select variants using GATK
    # pipeline.transform(
    #     task_func=stages.select_variants_gatk,
    #     name='select_variants_gatk',
    #     input=output_from('combine_variants_gatk'),
    #     filter=suffix('.combined.vcf'),
    #     output='.selected.vcf')


    return pipeline
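# A hedged usage sketch (names assumed, not part of the original source): build the
# pipeline from a parsed state object and execute it with Ruffus' new-style API.
def run_variant_calling_pipeline(state, jobs=4):
    pipeline = make_pipeline(state)
    # Pipeline.run() walks the task graph; multiprocess controls parallel job slots.
    pipeline.run(multiprocess=jobs, verbose=1)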
Example #7
def main():

    #########
    # SETUP #
    #########

    # catch jgi logon and password from cli
    parser = ruffus.cmdline.get_argparse(
        description='5 accessions variant calling pipeline.')
    parser.add_argument('--email', '-e',
                        help='Logon email address for JGI',
                        type=str,
                        dest='jgi_logon')
    parser.add_argument('--password', '-p',
                        help='JGI password',
                        type=str,
                        dest='jgi_password')
    options = parser.parse_args()
    jgi_logon = options.jgi_logon
    jgi_password = options.jgi_password

    ##################
    # PIPELINE STEPS #
    ##################

    # test function for checking input/output passed to job_script and parsing
    # by io_parser
    test_job_function = functions.generate_job_function(
        job_script='src/sh/io_parser',
        job_name='test')

    # initialise pipeline
    main_pipeline = ruffus.Pipeline.pipelines["main"]

    # bamfiles
    raw_files = [x.path for x in os.scandir('data/bam') if
                 x.name.endswith('.bam') and x.is_file()]

    # subset the files while the pipeline is in development. Make this equal
    # to the raw_files to run the whole pipeline.
    # active_raw_files = [x for x in raw_files if
    #                     'G1' in x or 'G4' in x or 'J1' in x or 'J4' in x]
    active_raw_files = raw_files

    # species short names for vcf splitting
    species_short_names = list(set(
        [os.path.basename(x)[0] for x in active_raw_files]))

    # check that the files exist
    mapped_raw = main_pipeline.originate(
        name='mapped_raw',
        task_func=os.path.isfile,
        output=active_raw_files)

    # genome fasta
    ref_fa = main_pipeline.originate(
        name='ref_fa',
        task_func=functions.generate_job_function(
            job_script='src/sh/download_genome',
            job_name='ref_fa',
            job_type='download'),
        output='data/genome/Osativa_323_v7.0.fa',
        extras=[jgi_logon, jgi_password])

    # indexes
    fa_idx = main_pipeline.transform(
        name='fa_idx',
        task_func=functions.generate_job_function(
            job_script='src/sh/fa_idx',
            job_name='fa_idx',
            job_type='transform',
            cpus_per_task=6),
        input=ref_fa,
        filter=ruffus.suffix(".fa"),
        output=['.dict', '.fa.fai'])

    # annotation
    annot = main_pipeline.originate(
        name='annot',
        task_func=functions.generate_job_function(
            job_script='src/sh/download_genome',
            job_name='annot',
            job_type='download'),
        output=('data/genome/'
                'Osativa_323_v7.0.gene_exons.gffread.rRNAremoved.gtf'),
        extras=[jgi_logon, jgi_password])

    # convert annotation to .bed
    annot_bed = main_pipeline.transform(
        name='annot_bed',
        task_func=functions.generate_job_function(
            job_script='src/sh/annot_bed',
            job_name='annot_bed',
            job_type='transform',
            cpus_per_task=7),
        input=annot,
        filter=ruffus.suffix('.gtf'),
        output='.bed')

    # mark duplicates with picard
    deduped = main_pipeline.transform(
        name='dedupe',
        task_func=functions.generate_job_function(
            job_script='src/sh/mark_duplicates_and_sort',
            job_name='dedupe',
            job_type='transform',
            cpus_per_task=2),
        input=mapped_raw,
        filter=ruffus.regex(r"data/bam/(.*).Aligned.out.bam"),
        output=(r"output/mark_duplicates_and_sort/\1.deduped.bam"))

    # Split'N'Trim and reassign mapping qualities
    split_and_trimmed = main_pipeline.transform(
        name='split_trim',
        task_func=functions.generate_job_function(
            job_script='src/sh/split_trim',
            job_name='split_trim',
            job_type='transform',
            cpus_per_task=2),
        input=deduped,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.formatter(
            "output/mark_duplicates_and_sort/(?P<LIB>.+).deduped.bam"),
        output=["{subdir[0][1]}/split_trim/{LIB[0]}.split.bam"])\
        .follows(fa_idx)

    # we're going to recycle call_variants, merge_variants, filter_variants
    # and analyze_covar so we'll get the functions in advance
    call_variants = functions.generate_queue_job_function(
        job_script='src/sh/call_variants',
        job_name='call_variants')
    merge_variants = functions.generate_job_function(
        job_script='src/sh/merge_variants',
        job_name='merge_variants',
        job_type='transform',
        cpus_per_task=8)
    filter_variants = functions.generate_job_function(
        job_script='src/sh/filter_variants',
        job_name='filter_variants',
        job_type='transform',
        cpus_per_task=1)
    analyze_covar = functions.generate_queue_job_function(
        job_script='src/sh/analyze_covar',
        job_name='analyze_covar')

    # call variants without recalibration tables
    uncalibrated_variants = main_pipeline.transform(
        name='uncalibrated_variants',
        task_func=call_variants,
        input=split_and_trimmed,
        add_inputs=ruffus.add_inputs([ref_fa, annot_bed]),
        filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'),
        output='{subdir[0][1]}/variants_uncalibrated/{LIB[0]}.g.vcf.gz')

    # merge gVCF variants
    uncalibrated_variants_merged = main_pipeline.merge(
        name='uncalibrated_variants_merged',
        task_func=merge_variants,
        input=[uncalibrated_variants, ref_fa],
        output='output/variants_uncalibrated/variants_uncalibrated.vcf.gz')

    # filter variants on un-corrected bamfiles
    uncalibrated_variants_filtered = main_pipeline.transform(
        name='uncalibrated_variants_filtered',
        task_func=filter_variants,
        input=uncalibrated_variants_merged,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('_uncalibrated.vcf.gz'),
        output='_uncalibrated_filtered.vcf.gz')

    # select variant (only recalibrate using passed SNPs)
    uncalibrated_variants_selected = main_pipeline.transform(
        name='uncalibrated_variants_selected',
        task_func=functions.generate_job_function(
            job_script='src/sh/select_variants',
            job_name='select_variants',
            job_type='transform'),
        input=uncalibrated_variants_filtered,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('_uncalibrated_filtered.vcf.gz'),
        output='_uncalibrated_selected.vcf.gz')

    # create recalibration report with filtered variants
    covar_report = main_pipeline.merge(
        name='covar_report',
        task_func=analyze_covar,
        input=[split_and_trimmed, ref_fa, annot_bed,
               uncalibrated_variants_selected],
        output="output/covar_analysis/recal_data.table")

    # second pass to analyze covariation remaining after recalibration
    second_pass_covar_report = main_pipeline.merge(
        name='second_pass_covar_report',
        task_func=analyze_covar,
        input=[split_and_trimmed, ref_fa, annot_bed,
               uncalibrated_variants_filtered, covar_report],
        output="output/covar_analysis/post_recal_data.table")

    # plot effect of base recalibration
    recal_plot = main_pipeline.transform(
        name='recal_plot',
        task_func=functions.generate_job_function(
            job_script='src/R/recal_plot.R',
            job_name='recal_plot',
            job_type='transform',
            cpus_per_task=1),
        input=second_pass_covar_report,
        filter=ruffus.suffix('post_recal_data.table'),
        add_inputs=ruffus.add_inputs(covar_report),
        output='recalibration_plots.pdf')

    # recalibrate bases using recalibration report
    recalibrated = main_pipeline.transform(
        name='recalibrate',
        task_func=functions.generate_job_function(
            job_script='src/sh/recalibrate',
            job_name='recalibrate',
            job_type='transform',
            cpus_per_task=2),
        input=split_and_trimmed,
        add_inputs=ruffus.add_inputs([ref_fa, covar_report]),
        filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'),
        output='{subdir[0][1]}/recal/{LIB[0]}.recal.bam')

    # final variant calling
    variants = main_pipeline.transform(
        name='variants',
        task_func=call_variants,
        input=recalibrated,
        add_inputs=ruffus.add_inputs(ref_fa, annot_bed),
        filter=ruffus.formatter('output/recal/(?P<LIB>.+).recal.bam'),
        output='{subdir[0][1]}/variants/{LIB[0]}.g.vcf.gz')

    # merge gVCF variants
    variants_merged = main_pipeline.merge(
        name='variants_merged',
        task_func=merge_variants,
        input=[variants, ref_fa],
        output='output/variants/variants.vcf.gz')

    # variant filtering
    variants_filtered = main_pipeline.transform(
        name='variants_filtered',
        task_func=filter_variants,
        input=variants_merged,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('.vcf.gz'),
        output='_filtered.vcf.gz')

    # variants by species
    split_variants = main_pipeline.subdivide(
        name='split_variants',
        task_func=functions.generate_job_function(
            job_script='src/sh/split_variants',
            job_name='split_variants',
            job_type='transform',
            cpus_per_task=1,
            ntasks=len(species_short_names)),
        input=variants_filtered,
        filter=ruffus.formatter(),
        add_inputs=ruffus.add_inputs(ref_fa),
        output=[('output/split_variants/' + x + '.variants_filtered.vcf.gz')
                for x in species_short_names])

    # count variants per gene per species
    cds_variants = main_pipeline.transform(
        name='cds_variants',
        task_func=functions.generate_job_function(
            job_script='src/R/cds_variants.R',
            job_name='cds_variants',
            job_type='transform'),
        input=split_variants,
        add_inputs=ruffus.add_inputs([ref_fa, annot]),
        filter=ruffus.formatter(
            'output/split_variants/(?P<LIB>.+).variants_filtered.vcf.gz'),
        output='{subdir[0][1]}/cds_variants/{LIB[0]}.cds_variants.Rds')

    # merge counted variants
    variants_per_gene = main_pipeline.merge(
        name='cds_merge',
        task_func=functions.generate_job_function(
            job_script='src/R/cds_merge.R',
            job_name='cds_merge',
            job_type='transform'),
        input=cds_variants,
        output='output/cds_variants/cds_variants.Rds')

    ###################
    # RUFFUS COMMANDS #
    ###################

    # print the flowchart
    ruffus.pipeline_printout_graph(
        "ruffus/flowchart.pdf", "pdf",
        pipeline_name="5 accessions variant calling pipeline")

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)
Example #8
work_folder = mkdtemp(prefix="com.github.ocrmypdf.")


@atexit.register
def cleanup_working_files(*args):
    if options.keep_temporary_files:
        print("Temporary working files saved at:")
        print(work_folder)
    else:
        with suppress(FileNotFoundError):
            shutil.rmtree(work_folder)


@transform(
    input=options.input_file,
    filter=formatter(r'(?i)\.pdf'),
    output=work_folder + '{basename[0]}.repaired.pdf',
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def repair_pdf(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    args_qpdf = [
        'qpdf', input_file, output_file
    ]
    try:
        out = check_output(args_qpdf, stderr=STDOUT, universal_newlines=True)
    except CalledProcessError as e:
        exit_with_error = True
Example #9
def build_pipeline(options, work_folder, log, context):
    main_pipeline = Pipeline.pipelines['main']

    # Triage
    task_triage = main_pipeline.transform(
        task_func=triage,
        input=os.path.join(work_folder, 'origin'),
        filter=formatter('(?i)'),
        output=os.path.join(work_folder, 'origin.pdf'),
        extras=[log, context])

    task_repair_and_parse_pdf = main_pipeline.transform(
        task_func=repair_and_parse_pdf,
        input=task_triage,
        filter=suffix('.pdf'),
        output='.repaired.pdf',
        output_dir=work_folder,
        extras=[log, context])

    # Split (kwargs for split seems to be broken, so pass plain args)
    task_pre_split_pages = main_pipeline.split(pre_split_pages,
                                               task_repair_and_parse_pdf,
                                               os.path.join(
                                                   work_folder,
                                                   '*.presplit.pdf'),
                                               extras=[log, context])

    task_split_pages = main_pipeline.transform(task_func=split_page,
                                               input=task_pre_split_pages,
                                               filter=suffix('.presplit.pdf'),
                                               output='.page.pdf',
                                               output_dir=work_folder,
                                               extras=[log, context])

    task_ocr_or_skip = main_pipeline.split(
        ocr_or_skip,
        task_split_pages, [
            os.path.join(work_folder, '*.ocr.page.pdf'),
            os.path.join(work_folder, '*.skip.page.pdf')
        ],
        extras=[log, context])

    # Rasterize preview
    task_rasterize_preview = main_pipeline.transform(
        task_func=rasterize_preview,
        input=task_ocr_or_skip,
        filter=suffix('.page.pdf'),
        output='.preview.jpg',
        output_dir=work_folder,
        extras=[log, context])
    task_rasterize_preview.active_if(options.rotate_pages)

    # Orient
    task_orient_page = main_pipeline.collate(
        task_func=orient_page,
        input=[task_ocr_or_skip, task_rasterize_preview],
        filter=regex(
            r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
        output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
        extras=[log, context])

    # Rasterize actual
    task_rasterize_with_ghostscript = main_pipeline.transform(
        task_func=rasterize_with_ghostscript,
        input=task_orient_page,
        filter=suffix('.ocr.oriented.pdf'),
        output='.page.png',
        output_dir=work_folder,
        extras=[log, context])

    # Preprocessing subpipeline
    task_preprocess_remove_background = main_pipeline.transform(
        task_func=preprocess_remove_background,
        input=task_rasterize_with_ghostscript,
        filter=suffix(".page.png"),
        output=".pp-background.png",
        extras=[log, context])

    task_preprocess_deskew = main_pipeline.transform(
        task_func=preprocess_deskew,
        input=task_preprocess_remove_background,
        filter=suffix(".pp-background.png"),
        output=".pp-deskew.png",
        extras=[log, context])

    task_preprocess_clean = main_pipeline.transform(
        task_func=preprocess_clean,
        input=task_preprocess_deskew,
        filter=suffix(".pp-deskew.png"),
        output=".pp-clean.png",
        extras=[log, context])

    task_select_ocr_image = main_pipeline.collate(
        task_func=select_ocr_image,
        input=[task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r"\1.ocr.png"),
        extras=[log, context])

    # HOCR OCR
    task_ocr_tesseract_hocr = main_pipeline.transform(
        task_func=ocr_tesseract_hocr,
        input=task_select_ocr_image,
        filter=suffix(".ocr.png"),
        output=[".hocr", ".txt"],
        extras=[log, context])
    task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
    task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')

    task_select_visible_page_image = main_pipeline.collate(
        task_func=select_visible_page_image,
        input=[
            task_rasterize_with_ghostscript, task_preprocess_remove_background,
            task_preprocess_deskew, task_preprocess_clean
        ],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r'\1.image'),
        extras=[log, context])
    task_select_visible_page_image.graphviz(shape='diamond')

    task_select_image_layer = main_pipeline.collate(
        task_func=select_image_layer,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.image-layer.pdf'),
        extras=[log, context])
    task_select_image_layer.graphviz(fillcolor='"#00cc66"', shape='diamond')
    task_select_image_layer.active_if(options.pdf_renderer == 'hocr'
                                      or options.pdf_renderer == 'sandwich')

    task_render_hocr_page = main_pipeline.transform(
        task_func=render_hocr_page,
        input=task_ocr_tesseract_hocr,
        filter=regex(r".*/(\d{6})(?:\.hocr)"),
        output=os.path.join(work_folder, r'\1.text.pdf'),
        extras=[log, context])
    task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')

    task_render_hocr_debug_page = main_pipeline.collate(
        task_func=render_hocr_debug_page,
        input=[task_select_visible_page_image, task_ocr_tesseract_hocr],
        filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
        output=os.path.join(work_folder, r'\1.debug.pdf'),
        extras=[log, context])
    task_render_hocr_debug_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_debug_page.active_if(options.pdf_renderer == 'hocr')
    task_render_hocr_debug_page.active_if(options.debug_rendering)

    # Tesseract OCR + text only PDF
    task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_textonly_pdf,
        input=[task_select_ocr_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.ocr.png|\.ocr\.oriented\.pdf)"),
        output=[
            os.path.join(work_folder, r'\1.text.pdf'),
            os.path.join(work_folder, r'\1.text.txt')
        ],
        extras=[log, context])
    task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
    task_ocr_tesseract_textonly_pdf.active_if(
        options.pdf_renderer == 'sandwich')

    task_combine_layers = main_pipeline.collate(
        task_func=combine_layers,
        input=[
            task_render_hocr_page, task_ocr_tesseract_textonly_pdf,
            task_select_image_layer
        ],
        filter=regex(r".*/(\d{6})(?:\.text\.pdf|\.image-layer\.pdf)"),
        output=os.path.join(work_folder, r'\1.rendered.pdf'),
        extras=[log, context])
    task_combine_layers.graphviz(fillcolor='"#00cc66"')
    task_combine_layers.active_if(options.pdf_renderer == 'hocr'
                                  or options.pdf_renderer == 'sandwich')

    # Tesseract OCR+PDF
    task_ocr_tesseract_and_render_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_and_render_pdf,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=[
            os.path.join(work_folder, r'\1.rendered.pdf'),
            os.path.join(work_folder, r'\1.rendered.txt')
        ],
        extras=[log, context])
    task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"')
    task_ocr_tesseract_and_render_pdf.active_if(
        options.pdf_renderer == 'tesseract')

    # PDF/A
    task_generate_postscript_stub = main_pipeline.transform(
        task_func=generate_postscript_stub,
        input=task_repair_and_parse_pdf,
        filter=formatter(r'\.repaired\.pdf'),
        output=os.path.join(work_folder, 'pdfa.ps'),
        extras=[log, context])
    task_generate_postscript_stub.active_if(
        options.output_type.startswith('pdfa'))

    # Bypass valve
    task_skip_page = main_pipeline.transform(
        task_func=skip_page,
        input=task_orient_page,
        filter=suffix('.skip.oriented.pdf'),
        output='.done.pdf',
        output_dir=work_folder,
        extras=[log, context])

    # Merge pages
    task_merge_pages_ghostscript = main_pipeline.merge(
        task_func=merge_pages_ghostscript,
        input=[
            task_combine_layers, task_render_hocr_debug_page, task_skip_page,
            task_ocr_tesseract_and_render_pdf, task_generate_postscript_stub
        ],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_ghostscript.active_if(
        options.output_type.startswith('pdfa'))

    task_merge_pages_qpdf = main_pipeline.merge(
        task_func=merge_pages_qpdf,
        input=[
            task_combine_layers, task_render_hocr_debug_page, task_skip_page,
            task_ocr_tesseract_and_render_pdf, task_repair_and_parse_pdf
        ],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_qpdf.active_if(options.output_type == 'pdf' and not fitz)

    task_merge_pages_mupdf = main_pipeline.merge(
        task_func=merge_pages_mupdf,
        input=[
            task_combine_layers, task_render_hocr_debug_page, task_skip_page,
            task_ocr_tesseract_and_render_pdf, task_repair_and_parse_pdf
        ],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_mupdf.active_if(options.output_type == 'pdf' and fitz)

    task_merge_sidecars = main_pipeline.merge(
        task_func=merge_sidecars,
        input=[
            task_ocr_tesseract_hocr, task_ocr_tesseract_and_render_pdf,
            task_ocr_tesseract_textonly_pdf
        ],
        output=options.sidecar,
        extras=[log, context])
    task_merge_sidecars.active_if(options.sidecar)

    # Finalize
    main_pipeline.merge(task_func=copy_final,
                        input=[
                            task_merge_pages_ghostscript,
                            task_merge_pages_mupdf, task_merge_pages_qpdf
                        ],
                        output=options.output_file,
                        extras=[log, context])
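# A hedged sketch of how a caller might drive the pipeline assembled above: the tasks
# are registered on the shared 'main' pipeline, which can then be run directly. The
# wrapper name and arguments are assumptions for illustration.
def run_ocr_pipeline(options, work_folder, log, context, jobs=1):
    build_pipeline(options, work_folder, log, context)
    Pipeline.pipelines['main'].run(multiprocess=jobs, verbose=1)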
Example #10
                (options.image_dpi, options.image_dpi))
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                input_file,
                layout_fun=layout_fun,
                with_pdfrw=False,
                outputstream=outf)
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        sys.exit(ExitCode.input_file)


@transform(
    input=options.input_file,
    filter=formatter('(?i)'),
    output=os.path.join(work_folder, '{basename[0]}.pdf'),
    extras=[_log])
def triage(
        input_file,
        output_file,
        log):
    try:
        with open(input_file, 'rb') as f:
            signature = f.read(4)
            if signature == b'%PDF':
                re_symlink(input_file, output_file)
                return
    except EnvironmentError as e:
        log.error(e)
        sys.exit(ExitCode.input_file)
Example #11
    cmd = config['CMD_ASCP'].format(
        log_dir=sra_outdir, url_path=sra_url_path, output_dir=sra_outdir)
    returncode = misc.execute(cmd, msg_id, flag_file, options.debug)
    if returncode != 0 or returncode is None:
        # try wget
        # cmd template looks like this:
        # wget ftp://ftp-trace.ncbi.nlm.nih.gov{url_path} -P {output_dir} -N
        cmd = config['CMD_WGET'].format(
            url_path=sra_url_path, output_dir=sra_outdir)
        misc.execute(cmd, msg_id, flag_file, options.debug)

               
@R.subdivide(
    download,
    R.formatter(r'{0}/(?P<RX>[SED]RX\d+)/(?P<RR>[SED]RR\d+)/(.*)\.sra'.format(PATH_RE)),
    ['{subpath[0][2]}/{RR[0]}_[12].fastq.gz',
     '{subpath[0][2]}/{RR[0]}.sra.sra2fastq.COMPLETE'])
def sra2fastq(inputs, outputs):
    """for meaning of [SED]RR, see
    http://www.ncbi.nlm.nih.gov/books/NBK56913/#search.the_entrez_sra_search_response_pa

    S =NCBI-SRA, E = EMBL-SRA, D = DDBJ-SRA
    SRR: SRA run accession
    ERR: ERA run accession
    DRR: DRA run accession
    """
    sra, _ = inputs             # ignore the flag file from previous task
    flag_file = outputs[-1]
    outdir = os.path.dirname(os.path.dirname(os.path.dirname(sra)))
    cmd = config['CMD_FASTQ_DUMP'].format(output_dir=outdir, accession=sra)
Example #12
        right that way.'''
        if s.endswith('.ps'):
            return 99999999
        key = int(os.path.basename(s)[0:6]) * 10
        if 'debug' in os.path.basename(s):
            key += 1
        return key

    pdf_pages = sorted(input_files, key=input_file_order)
    log.info(pdf_pages)
    ghostscript.generate_pdfa(pdf_pages, output_file, options.jobs or 1)


@transform(
    input=merge_pages,
    filter=formatter(),
    output=options.output_file,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def copy_final(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    shutil.copy(input_file, output_file)


def validate_pdfa(
        input_file,
        log):
Example #13
def main():
    # prepare the ruffus pipeline
    main_pipeline = ruffus.Pipeline.pipelines["main"]

    # catch jgi logon and password from cli
    parser = ruffus.cmdline.get_argparse(description='UV-B analysis pipeline.')
    parser.add_argument('--email', '-e',
                        help='Logon email address for JGI',
                        type=str,
                        dest='jgi_logon')
    parser.add_argument('--password', '-p',
                        help='JGI password',
                        type=str,
                        dest='jgi_password')
    options = parser.parse_args()
    jgi_logon = options.jgi_logon
    jgi_password = options.jgi_password

    # need a dictionary of species to genome URL and species to gff.
    # supply this in a text file
    fasta_urls = {}
    annotation_urls = {}
    with open('data/genomeUrls.txt') as tsv:
        genome_urls = csv.reader(tsv, delimiter='\t')
        next(genome_urls, None)
        for row in genome_urls:
            fasta_urls[row[0]] = row[1]
            annotation_urls[row[0]] = row[2]

    # iterate over fasta_urls keys to run jobs
    for species in fasta_urls.keys():
        # call download script
        main_pipeline.originate(
            name=species + "_genome",
            task_func=download_genome,
            output="data/genome/" + species + "/METADATA.csv",
            extras=[species, fasta_urls[species], annotation_urls[species],
                    jgi_logon, jgi_password])
        # generate a star genome for each species
        main_pipeline.transform(
            name=species + "_index",
            task_func=generate_index,
            input=ruffus.output_from(species + "_genome"),
            filter=ruffus.regex(r"data/genome/(.*)/METADATA.csv"),
            output=r"output/\1/star-index/METADATA.csv",
            extras=[r"\1"])
        # define the reads
        main_pipeline.originate(name=species + "_reads",
                                task_func=define_reads,
                                output="ruffus/" + species + "_reads",
                                extras=[species])
        # first mapping step
        main_pipeline.collate(
            name=species + "_mapped_reads",
            task_func=star,
            input=[[ruffus.output_from(species + "_reads"),
                    ruffus.output_from(species + "_index")]],
            filter=ruffus.formatter(),
            output=["output/{subdir[1][1]}/star/METADATA.csv"],
            extras=["{subdir[1][1]}"])
    # FOR LOOP ENDS

    # parse the mapping stats
    mapping_stats = main_pipeline.merge(
        task_func=parse_star_stats_R,
        input=ruffus.output_from(
            list(species + "_mapped_reads" for species in fasta_urls.keys())),
        output="output/mapping_stats/SessionInfo.txt")

    # generate plots for mapping
    mapping_plots = main_pipeline.transform(
        task_func=plot_reads_in_genes_R,
        input=mapping_stats,
        filter=ruffus.formatter(),
        output="{subpath[0][0]}/Figure S1.pdf")

    # use generator in the input field to collate the previous results
    deseq_results = main_pipeline.transform(
                task_func=deseq2_R,
                input=ruffus.output_from(
                        list(species + "_mapped_reads"
                             for species in fasta_urls.keys())),
                filter=ruffus.formatter(),
                output=[r"output/{subdir[0][1]}/deseq2/SessionInfo.txt"],
                extras=[r"{subdir[0][1]}"])

    # combine the deseq results
    de_lists = main_pipeline.merge(
        task_func=list_de_genes_R,
        input=deseq_results,
        output="output/merged/deseq2/SessionInfo.de_genes.txt")

    # run clustering
    mfuzz_results = main_pipeline.transform(
        task_func=mfuzz_R,
        input=deseq_results,
        filter=ruffus.formatter(),
        output='output/{subdir[0][1]}/mfuzz/SessionInfo.mfuzz.txt',
        extras=['{subdir[0][1]}'])

    # combine mfuzz_results
    mfuzz_plot = main_pipeline.merge(
        task_func=combine_mfuzz_results_R,
        input=mfuzz_results,
        output='output/merged/mfuzz/SessionInfo.mfuzz.txt')

    # compare flavonoid synthesis genes
    flavonoid_genes = main_pipeline.transform(
        task_func=compare_saito_genes_R,
        input=de_lists,
        filter=ruffus.formatter(),
        output='{path[0]}/SessionInfo.flavonoid_synthesis.txt')

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)

    # print the flowchart
    ruffus.pipeline_printout_graph("ruffus/flowchart.pdf", "pdf",
                                   pipeline_name="UV-B analysis pipeline")
Example #14
def build_pipeline(options, work_folder, log, context):
    main_pipeline = Pipeline.pipelines['main']

    # Triage
    task_triage = main_pipeline.transform(
        task_func=triage,
        input=os.path.join(work_folder, 'origin'),
        filter=formatter('(?i)'),
        output=os.path.join(work_folder, 'origin.pdf'),
        extras=[log, context],
    )

    task_repair_and_parse_pdf = main_pipeline.transform(
        task_func=repair_and_parse_pdf,
        input=task_triage,
        filter=suffix('.pdf'),
        output='.repaired.pdf',
        output_dir=work_folder,
        extras=[log, context],
    )

    # Split (kwargs for split seems to be broken, so pass plain args)
    task_marker_pages = main_pipeline.split(
        marker_pages,
        task_repair_and_parse_pdf,
        os.path.join(work_folder, '*.marker.pdf'),
        extras=[log, context],
    )

    task_ocr_or_skip = main_pipeline.split(
        ocr_or_skip,
        task_marker_pages,
        [
            os.path.join(work_folder, '*.ocr.page.pdf'),
            os.path.join(work_folder, '*.skip.page.pdf'),
        ],
        extras=[log, context],
    )

    # Rasterize preview
    task_rasterize_preview = main_pipeline.transform(
        task_func=rasterize_preview,
        input=task_ocr_or_skip,
        filter=suffix('.page.pdf'),
        output='.preview.jpg',
        output_dir=work_folder,
        extras=[log, context],
    )
    task_rasterize_preview.active_if(options.rotate_pages)

    # Orient
    task_orient_page = main_pipeline.collate(
        task_func=orient_page,
        input=[task_ocr_or_skip, task_rasterize_preview],
        filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
        output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
        extras=[log, context],
    )

    # Rasterize actual
    task_rasterize_with_ghostscript = main_pipeline.transform(
        task_func=rasterize_with_ghostscript,
        input=task_orient_page,
        filter=suffix('.ocr.oriented.pdf'),
        output='.page.png',
        output_dir=work_folder,
        extras=[log, context],
    )

    # Preprocessing subpipeline
    task_preprocess_remove_background = main_pipeline.transform(
        task_func=preprocess_remove_background,
        input=task_rasterize_with_ghostscript,
        filter=suffix(".page.png"),
        output=".pp-background.png",
        extras=[log, context],
    )

    task_preprocess_deskew = main_pipeline.transform(
        task_func=preprocess_deskew,
        input=task_preprocess_remove_background,
        filter=suffix(".pp-background.png"),
        output=".pp-deskew.png",
        extras=[log, context],
    )

    task_preprocess_clean = main_pipeline.transform(
        task_func=preprocess_clean,
        input=task_preprocess_deskew,
        filter=suffix(".pp-deskew.png"),
        output=".pp-clean.png",
        extras=[log, context],
    )

    task_select_ocr_image = main_pipeline.collate(
        task_func=select_ocr_image,
        input=[task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r"\1.ocr.png"),
        extras=[log, context],
    )

    # HOCR OCR
    task_ocr_tesseract_hocr = main_pipeline.transform(
        task_func=ocr_tesseract_hocr,
        input=task_select_ocr_image,
        filter=suffix(".ocr.png"),
        output=[".hocr", ".txt"],
        extras=[log, context],
    )
    task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
    task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')

    task_select_visible_page_image = main_pipeline.collate(
        task_func=select_visible_page_image,
        input=[
            task_rasterize_with_ghostscript,
            task_preprocess_remove_background,
            task_preprocess_deskew,
            task_preprocess_clean,
        ],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r'\1.image'),
        extras=[log, context],
    )
    task_select_visible_page_image.graphviz(shape='diamond')

    task_select_image_layer = main_pipeline.collate(
        task_func=select_image_layer,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.image-layer.pdf'),
        extras=[log, context],
    )
    task_select_image_layer.graphviz(fillcolor='"#00cc66"', shape='diamond')

    task_render_hocr_page = main_pipeline.transform(
        task_func=render_hocr_page,
        input=task_ocr_tesseract_hocr,
        filter=regex(r".*/(\d{6})(?:\.hocr)"),
        output=os.path.join(work_folder, r'\1.text.pdf'),
        extras=[log, context],
    )
    task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')

    # Tesseract OCR + text only PDF
    task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_textonly_pdf,
        input=[task_select_ocr_image],
        filter=regex(r".*/(\d{6})(?:\.ocr.png)"),
        output=[
            os.path.join(work_folder, r'\1.text.pdf'),
            os.path.join(work_folder, r'\1.text.txt'),
        ],
        extras=[log, context],
    )
    task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
    task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich')

    task_weave_layers = main_pipeline.collate(
        task_func=weave_layers,
        input=[
            task_repair_and_parse_pdf,
            task_render_hocr_page,
            task_ocr_tesseract_textonly_pdf,
            task_select_image_layer,
        ],
        filter=regex(
            r".*/((?:\d{6}(?:\.text\.pdf|\.image-layer\.pdf))|(?:origin\.repaired\.pdf))"
        ),
        output=os.path.join(work_folder, r'layers.rendered.pdf'),
        extras=[log, context],
    )
    task_weave_layers.graphviz(fillcolor='"#00cc66"')

    # PDF/A pdfmark
    task_generate_postscript_stub = main_pipeline.transform(
        task_func=generate_postscript_stub,
        input=task_repair_and_parse_pdf,
        filter=formatter(r'\.repaired\.pdf'),
        output=os.path.join(work_folder, 'pdfa.ps'),
        extras=[log, context],
    )
    task_generate_postscript_stub.active_if(options.output_type.startswith('pdfa'))

    # PDF/A conversion
    task_convert_to_pdfa = main_pipeline.merge(
        task_func=convert_to_pdfa,
        input=[task_generate_postscript_stub, task_weave_layers],
        output=os.path.join(work_folder, 'pdfa.pdf'),
        extras=[log, context],
    )
    task_convert_to_pdfa.active_if(options.output_type.startswith('pdfa'))

    task_metadata_fixup = main_pipeline.merge(
        task_func=metadata_fixup,
        input=[task_repair_and_parse_pdf, task_weave_layers, task_convert_to_pdfa],
        output=os.path.join(work_folder, 'metafix.pdf'),
        extras=[log, context],
    )

    task_merge_sidecars = main_pipeline.merge(
        task_func=merge_sidecars,
        input=[task_ocr_tesseract_hocr, task_ocr_tesseract_textonly_pdf],
        output=options.sidecar,
        extras=[log, context],
    )
    task_merge_sidecars.active_if(options.sidecar)

    # Optimize
    task_optimize_pdf = main_pipeline.transform(
        task_func=optimize_pdf,
        input=task_metadata_fixup,
        filter=suffix('.pdf'),
        output='.optimized.pdf',
        output_dir=work_folder,
        extras=[log, context],
    )

    # Finalize
    main_pipeline.merge(
        task_func=copy_final,
        input=[task_optimize_pdf],
        output=options.output_file,
        extras=[log, context],
    )
#___________________________________________________________________________
#
#   generate_initial_files4
#___________________________________________________________________________
@originate(tempdir +  "i_name.tmp1")
def generate_initial_files4(on):
    with open(on, 'w') as outfile:
        pass

#___________________________________________________________________________
#
#   test_task2
#___________________________________________________________________________
@collate([generate_initial_files1, generate_initial_files2, generate_initial_files3,
            generate_initial_files4],
         formatter(),
         "{path[0]}/all.tmp2")
#@transform([generate_initial_files1, generate_initial_files2, generate_initial_files3,
#            generate_initial_files4],
#            formatter( ),
#            "{path[0]}/{basename[0]}.tmp2")
def test_task2(infiles, outfile):
    with open(outfile, "w") as p:
        pass
    #print >>sys.stderr, "8" * 80, "\n", "    task2 :%s %s " % (infiles, outfile)

#___________________________________________________________________________
#
#   test_task3
#___________________________________________________________________________
@transform(test_task2, suffix(".tmp2"), ".tmp3")
def test_task3(infile, outfile):
    with open(outfile, "w") as p:
        pass
Example #16
def build_pipeline(options, work_folder, log, context):
    main_pipeline = Pipeline.pipelines['main']

    # Triage
    task_triage = main_pipeline.transform(
        task_func=triage,
        input=os.path.join(work_folder, 'origin'),
        filter=formatter('(?i)'),
        output=os.path.join(work_folder, 'origin.pdf'),
        extras=[log, context])

    task_repair_pdf = main_pipeline.transform(
        task_func=repair_pdf,
        input=task_triage,
        filter=suffix('.pdf'),
        output='.repaired.pdf',
        output_dir=work_folder,
        extras=[log, context])

    # Split (kwargs for split seems to be broken, so pass plain args)
    task_split_pages = main_pipeline.split(
        split_pages,
        task_repair_pdf,
        os.path.join(work_folder, '*.page.pdf'),
        extras=[log, context])

    # Rasterize preview
    task_rasterize_preview = main_pipeline.transform(
        task_func=rasterize_preview,
        input=task_split_pages,
        filter=suffix('.page.pdf'),
        output='.preview.jpg',
        output_dir=work_folder,
        extras=[log, context])
    task_rasterize_preview.active_if(options.rotate_pages)

    # Orient
    task_orient_page = main_pipeline.collate(
        task_func=orient_page,
        input=[task_split_pages, task_rasterize_preview],
        filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
        output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
        extras=[log, context])

    # Rasterize actual
    task_rasterize_with_ghostscript = main_pipeline.transform(
        task_func=rasterize_with_ghostscript,
        input=task_orient_page,
        filter=suffix('.ocr.oriented.pdf'),
        output='.page.png',
        output_dir=work_folder,
        extras=[log, context])

    # Preprocessing subpipeline
    task_preprocess_remove_background = main_pipeline.transform(
        task_func=preprocess_remove_background,
        input=task_rasterize_with_ghostscript,
        filter=suffix(".page.png"),
        output=".pp-background.png",
        extras=[log, context])

    task_preprocess_deskew = main_pipeline.transform(
        task_func=preprocess_deskew,
        input=task_preprocess_remove_background,
        filter=suffix(".pp-background.png"),
        output=".pp-deskew.png",
        extras=[log, context])

    task_preprocess_clean = main_pipeline.transform(
        task_func=preprocess_clean,
        input=task_preprocess_deskew,
        filter=suffix(".pp-deskew.png"),
        output=".pp-clean.png",
        extras=[log, context])

    task_select_ocr_image = main_pipeline.collate(
        task_func=select_ocr_image,
        input=[task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r"\1.ocr.png"),
        extras=[log, context])


    # HOCR OCR
    task_ocr_tesseract_hocr = main_pipeline.transform(
        task_func=ocr_tesseract_hocr,
        input=task_select_ocr_image,
        filter=suffix(".ocr.png"),
        output=[".hocr", ".txt"],
        extras=[log, context])
    task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
    task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')
    if tesseract.v4():
        task_ocr_tesseract_hocr.jobs_limit(2)  # Uses multi-core on its own

    task_select_visible_page_image = main_pipeline.collate(
        task_func=select_visible_page_image,
        input=[task_rasterize_with_ghostscript,
               task_preprocess_remove_background,
               task_preprocess_deskew,
               task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r'\1.image'),
        extras=[log, context])
    task_select_visible_page_image.graphviz(shape='diamond')

    task_select_image_layer = main_pipeline.collate(
        task_func=select_image_layer,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.image-layer.pdf'),
        extras=[log, context])
    task_select_image_layer.graphviz(
        fillcolor='"#00cc66"', shape='diamond')
    task_select_image_layer.active_if(
        options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich')

    task_render_hocr_page = main_pipeline.transform(
        task_func=render_hocr_page,
        input=task_ocr_tesseract_hocr,
        filter=regex(r".*/(\d{6})(?:\.hocr)"),
        output=os.path.join(work_folder, r'\1.text.pdf'),
        extras=[log, context])
    task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')

    task_render_hocr_debug_page = main_pipeline.collate(
        task_func=render_hocr_debug_page,
        input=[task_select_visible_page_image, task_ocr_tesseract_hocr],
        filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
        output=os.path.join(work_folder, r'\1.debug.pdf'),
        extras=[log, context])
    task_render_hocr_debug_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_debug_page.active_if(options.pdf_renderer == 'hocr')
    task_render_hocr_debug_page.active_if(options.debug_rendering)

    # Tesseract OCR + text only PDF
    task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_textonly_pdf,
        input=[task_select_ocr_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.ocr.png|\.ocr\.oriented\.pdf)"),
        output=[os.path.join(work_folder, r'\1.text.pdf'),
                os.path.join(work_folder, r'\1.text.txt')],
        extras=[log, context])
    task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
    task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich')
    if tesseract.v4():
        task_ocr_tesseract_textonly_pdf.jobs_limit(2)

    task_combine_layers = main_pipeline.collate(
        task_func=combine_layers,
        input=[task_render_hocr_page,
               task_ocr_tesseract_textonly_pdf,
               task_select_image_layer],
        filter=regex(r".*/(\d{6})(?:\.text\.pdf|\.image-layer\.pdf)"),
        output=os.path.join(work_folder, r'\1.rendered.pdf'),
        extras=[log, context])
    task_combine_layers.graphviz(fillcolor='"#00cc66"')
    task_combine_layers.active_if(options.pdf_renderer == 'hocr' or 
                                  options.pdf_renderer == 'sandwich')

    # Tesseract OCR+PDF
    task_ocr_tesseract_and_render_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_and_render_pdf,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=[os.path.join(work_folder, r'\1.rendered.pdf'),
                os.path.join(work_folder, r'\1.rendered.txt')],
        extras=[log, context])
    task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"')
    task_ocr_tesseract_and_render_pdf.active_if(options.pdf_renderer == 'tesseract')
    if tesseract.v4():
        task_ocr_tesseract_and_render_pdf.jobs_limit(2)  # Uses multi-core

    # PDF/A
    task_generate_postscript_stub = main_pipeline.transform(
        task_func=generate_postscript_stub,
        input=task_repair_pdf,
        filter=formatter(r'\.repaired\.pdf'),
        output=os.path.join(work_folder, 'pdfa.ps'),
        extras=[log, context])
    task_generate_postscript_stub.active_if(options.output_type.startswith('pdfa'))


    # Bypass valve
    task_skip_page = main_pipeline.transform(
        task_func=skip_page,
        input=task_orient_page,
        filter=suffix('.skip.oriented.pdf'),
        output='.done.pdf',
        output_dir=work_folder,
        extras=[log, context])

    # Merge pages
    task_merge_pages_ghostscript = main_pipeline.merge(
        task_func=merge_pages_ghostscript,
        input=[task_combine_layers,
               task_render_hocr_debug_page,
               task_skip_page,
               task_ocr_tesseract_and_render_pdf,
               task_generate_postscript_stub],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_ghostscript.active_if(options.output_type.startswith('pdfa'))

    task_merge_pages_qpdf = main_pipeline.merge(
        task_func=merge_pages_qpdf,
        input=[task_combine_layers,
               task_render_hocr_debug_page,
               task_skip_page,
               task_ocr_tesseract_and_render_pdf,
               task_repair_pdf],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_qpdf.active_if(options.output_type == 'pdf')

    task_merge_sidecars = main_pipeline.merge(
        task_func=merge_sidecars,
        input=[task_ocr_tesseract_hocr,
               task_ocr_tesseract_and_render_pdf,
               task_ocr_tesseract_textonly_pdf],
        output=options.sidecar,
        extras=[log, context])
    task_merge_sidecars.active_if(options.sidecar)

    # Finalize
    main_pipeline.merge(
        task_func=copy_final,
        input=[task_merge_pages_ghostscript, task_merge_pages_qpdf],
        output=options.output_file,
        extras=[log, context])
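
A hedged driver sketch, not taken from the original project: once build_pipeline has registered its tasks on the shared 'main' pipeline, that pipeline can be rendered and run through the usual Ruffus entry points. The multiprocess value below is a placeholder.

def run_build_pipeline_sketch(options, work_folder, log, context):
    build_pipeline(options, work_folder, log, context)
    main_pipeline = Pipeline.pipelines['main']
    # Optional: dump the task graph for debugging before executing
    main_pipeline.printout_graph(os.path.join(work_folder, 'flowchart.dot'))
    main_pipeline.run(multiprocess=4, verbose=1)
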
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='vcf_annotation')
    # Get a list of paths to all the VCF files
    vcf_files = state.config.get_option('vcfs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original VCF files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_vcf,
        name='original_vcf',
        output=vcf_files)

    # Decompose VCF using Vt
    pipeline.transform(
        task_func=stages.decompose_vcf,
        name='decompose_vcf',
        input=output_from('original_vcf'),
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).vcf'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the VCF file name (e.g. study/family name).
        # This is needed within the stage for finding out sample specific
        # configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a
        # .decompose.normalize.vcf extension.
        output='{path[0]}/{sample[0]}.decompose.normalize.vcf')

    # FILTER COMMON VARIANTS
    # ADD FILTER COMMON VARIANTS USING VEP

    # Annotate using VEP
    pipeline.transform(
        task_func=stages.annotate_vep,
        name='annotate_vep',
        input=output_from('decompose_vcf'),
        filter=suffix('.vcf'),
        output='.vep.vcf')

    # Annotate using SnpEff
    pipeline.transform(
        task_func=stages.annotate_snpeff,
        name='annotate_snpeff',
        input=output_from('annotate_vep'),
        filter=suffix('.vcf'),
        output='.snpeff.vcf')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=suffix('.sort.dedup.realn.recal.bam'),
        output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='PCExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(
            ['PCExomes.indel_recal', 'PCExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline
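
A hedged usage sketch, not part of the original example: make_pipeline returns a fully wired Pipeline object, so a driver would typically just build it from the state and run it. The multiprocess/verbose values are placeholders.

def run_vcf_annotation_sketch(state):
    pipeline = make_pipeline(state)
    pipeline.run(multiprocess=4, verbose=1)
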
Example #18
# ___________________________________________________________________________
@split(1, [tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcd"])
def generate_initial_files1(in_name, out_names):
    for on in out_names:
        with open(on, 'w') as outfile:
            pass

# ___________________________________________________________________________
#
#   check_product_task
# ___________________________________________________________________________


@mkdir(tempdir + "/test1")
@mkdir(tempdir + "/test2")
@mkdir(generate_initial_files1, formatter(),
       ["{path[0]}/{basename[0]}.dir", 3, "{path[0]}/{basename[0]}.dir2"])
@transform(generate_initial_files1,
           formatter(),
           "{path[0]}/{basename[0]}.dir/{basename[0]}.tmp2")
def check_transform(infiles, outfile):
    with open(outfile, "w") as p:
        pass


@mkdir(tempdir + "/test3")
@mkdir(generate_initial_files1, formatter(),
       "{path[0]}/{basename[0]}.dir2")
def check_transform2():
    print("    Loose cannon!", file=sys.stderr)
Example #19
        fa_flag = os.path.join(fa_dir, fa_flag_bn)
        fas.extend([fa_all, fa_mer])
        fa_flags.append(fa_flag)

    return k_sizes, bfs, bf_flags, fas, fa_flags

K_MER_SIZES, BFS, BF_FLAGS, FAS, FA_FLAGS = gen_vars(INPUT_FQS)

for __ in FAS:
    print(__)

for __ in FA_FLAGS:
    print(__)


@R.mkdir(INPUT_FQS, R.formatter(PATH_RE), ['{prefix[0]}/kon/{chr[0]}/bf'])
@R.collate(INPUT_FQS, R.formatter(), BFS + BF_FLAGS)
def abyss_bloom(input_fqs, outputs):
    fq1, fq2 = input_fqs
    for k_mer_size, bf, bf_flag in zip(K_MER_SIZES, BFS, BF_FLAGS):
        cmd = CONFIG['abyss_bloom']['cmd'].format(**locals())
        # cmd = ('abyss-bloom build -v -k {k_mer_size} -j 8 -b 3G -l 2 -q 15 - '
        #        '{fq1} {fq2} '
        #        '| gzip -c > {bf}'.format(**locals()))
        execute(cmd, flag=bf_flag)
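
The execute() helper is defined elsewhere in the original script; a minimal sketch of its likely behaviour (an assumption, not the original code): run the shell command, and only touch the flag file if the command succeeded, so finished work can be skipped on re-runs.

import subprocess

def execute_sketch(cmd, flag):
    subprocess.check_call(cmd, shell=True)  # raises CalledProcessError on failure
    open(flag, 'w').close()                 # mark this unit of work as done
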


@R.follows(abyss_bloom)
@R.mkdir(abyss_bloom, R.formatter(PATH_RE), ['{subpath[0][1]}/fafq'])
@R.collate(abyss_bloom, R.formatter(), FAS + FA_FLAGS)
def konnector(input_fqs, outputs):
Example #20
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name. 
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort the BAM file using Picard 
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard 
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK 
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK 
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK 
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK 
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK 
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=suffix('.sort.dedup.realn.recal.bam'),
        output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='PCExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK 
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK  
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK  
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.indel_recal', 'PCExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK  
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK 
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline
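
The align_bwa stages above lean on three Ruffus features at once: a formatter() regex with a named capture, add_inputs() to pull in the matching R2 file, and extras to pass the sample name through to the stage. Below is a self-contained, hedged sketch of that pattern; the demo pipeline name and file names are made up for illustration.

import os
import tempfile
from ruffus import Pipeline, formatter, add_inputs

demo_dir = tempfile.mkdtemp()
r1 = os.path.join(demo_dir, 'sampleA_R1.fastq.gz')
r2 = os.path.join(demo_dir, 'sampleA_R2.fastq.gz')

def make_fastqs(output):
    # stand-in for real FASTQ files
    open(output, 'w').close()

def fake_align(inputs, output, sample):
    # inputs is the (R1, R2) pair assembled by add_inputs()
    with open(output, 'w') as out:
        out.write('would align %s for sample %s\n' % (list(inputs), sample))

demo = Pipeline(name='formatter_demo')
demo.originate(task_func=make_fastqs, output=[r1, r2])
demo.transform(
    task_func=fake_align,
    input=make_fastqs,
    # {sample[0]} captures 'sampleA'; inputs not matching the regex are dropped
    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
    add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
    extras=['{sample[0]}'],
    output='{path[0]}/{sample[0]}.bam')

if __name__ == '__main__':
    demo.run(verbose=0)
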
Example #21
def make_pipeline(state):
    """Build the pipeline by constructing stages and connecting them together"""
    # Build an empty pipeline
    pipeline = Pipeline(name="crpipe")
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option("fastqs")
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs, name="original_fastqs", output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    # pipeline.originate(
    #    task_func=stages.original_reference,
    #    name='original_reference',
    #    output=reference_file)

    # Run fastQC on the FASTQ files
    pipeline.transform(
        task_func=stages.fastqc,
        name="fastqc",
        input=output_from("original_fastqs"),
        filter=suffix(".fastq.gz"),
        output="_fastqc",
    )

    # Index the reference using BWA
    # pipeline.transform(
    #    task_func=stages.index_reference_bwa,
    #    name='index_reference_bwa',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #    name='index_reference_samtools',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output='.fa.fai')

    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])

    # # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name="align_bwa",
        input=output_from("original_fastqs"),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz"),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=["{sample[0]}"],
        # The output file name is the sample name with a .bam extension.
        output="{path[0]}/{sample[0]}.bam",
    )

    # Sort alignment with sambamba
    pipeline.transform(
        task_func=stages.sort_bam_sambamba,
        name="sort_alignment",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.sorted.bam",
    )

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name="extract_genes_bedtools",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.mmr.bam",
    )

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name="extract_chromosomes_samtools",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.chroms.bam",
    )

    # Index the MMR genes bam file with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name="index_mmr_alignment",
        input=output_from("extract_genes_bedtools"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).mmr.bam"),
        output="{path[0]}/{sample[0]}.mmr.bam.bai",
    )

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    # pipeline.transform(
    #    task_func=stages.alignment_coverage_gatk,
    #    name='alignment_coverage_gatk',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs([reference_file]),
    #    output='{path[0]}/{sample[0]}.coverage_summary',
    #    extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name="index_alignment",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.sorted.bam.bai",
    )

    # Generate alignment stats with bamtools
    pipeline.transform(
        task_func=stages.bamtools_stats,
        name="bamtools_stats",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.stats.txt",
    )

    # Extract the discordant paired-end alignments
    pipeline.transform(
        task_func=stages.extract_discordant_alignments,
        name="extract_discordant_alignments",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.discordants.unsorted.bam",
    )

    # Extract split-read alignments
    pipeline.transform(
        task_func=stages.extract_split_read_alignments,
        name="extract_split_read_alignments",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.splitters.unsorted.bam",
    )

    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name="sort_discordants",
        input=output_from("extract_discordant_alignments"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam"),
        extras=["{path[0]}/{sample[0]}.discordants"],
        output="{path[0]}/{sample[0]}.discordants.bam",
    )

    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #   task_func=stages.index_bam,
    #   name='index_discordants',
    #   input=output_from('sort_discordants'),
    #   filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #   output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort split-read alignments.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name="sort_splitters",
        input=output_from("extract_split_read_alignments"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam"),
        extras=["{path[0]}/{sample[0]}.splitters"],
        output="{path[0]}/{sample[0]}.splitters.bam",
    )

    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #    task_func=stages.index_bam,
    #    name='index_splitters',
    #    input=output_from('sort_splitters'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #    output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (
        pipeline.transform(
            task_func=stages.structural_variants_lumpy,
            name="structural_variants_lumpy",
            input=output_from("sort_alignment"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
            add_inputs=add_inputs(["{path[0]}/{sample[0]}.splitters.bam", "{path[0]}/{sample[0]}.discordants.bam"]),
            output="{path[0]}/{sample[0]}.lumpy.vcf",
        )
        .follows("index_alignment")
        .follows("sort_splitters")
        .follows("sort_discordants")
    )

    # Call genotypes on lumpy output using SVTyper
    # (pipeline.transform(
    #    task_func=stages.genotype_svtyper,
    #    name='genotype_svtyper',
    #    input=output_from('structural_variants_lumpy'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam', '{path[0]}/{sample[0]}.splitters.bam']),
    #    output='{path[0]}/{sample[0]}.svtyper.vcf')
    #    .follows('align_bwa')
    #    .follows('sort_splitters')
    #    .follows('index_alignment')
    #    .follows('index_splitters')
    #    .follows('index_discordants'))

    # Call SVs with Socrates
    (
        pipeline.transform(
            task_func=stages.structural_variants_socrates,
            name="structural_variants_socrates",
            input=output_from("sort_alignment"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
            # output goes to {path[0]}/socrates/
            output="{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt",
            extras=["{path[0]}"],
        )
    )

    # Call DELs with DELLY
    pipeline.merge(
        task_func=stages.deletions_delly,
        name="deletions_delly",
        input=output_from("sort_alignment"),
        output="delly.DEL.vcf",
    )

    # Call DUPs with DELLY
    pipeline.merge(
        task_func=stages.duplications_delly,
        name="duplications_delly",
        input=output_from("sort_alignment"),
        output="delly.DUP.vcf",
    )

    # Call INVs with DELLY
    pipeline.merge(
        task_func=stages.inversions_delly,
        name="inversions_delly",
        input=output_from("sort_alignment"),
        output="delly.INV.vcf",
    )

    # Call TRAs with DELLY
    pipeline.merge(
        task_func=stages.translocations_delly,
        name="translocations_delly",
        input=output_from("sort_alignment"),
        output="delly.TRA.vcf",
    )

    # Join both read pair files using gustaf_mate_joining
    # pipeline.transform(
    #    task_func=stages.gustaf_mate_joining,
    #    name='gustaf_mate_joining',
    #    input=output_from('fastq_to_fasta'),
    #    # Match the R1 (read 1) FASTA file and grab the path and sample name.
    #    # This will be the first input to the stage.
    #    # We assume the sample name may consist of only alphanumeric
    #    # characters.
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #    # Add one more input to the stage:
    #    #    1. The corresponding R2 FASTA file
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #    output='{path[0]}/{sample[0]}.joined_mates.fasta')

    # Call structural variants with pindel
    # (pipeline.transform(
    #    task_func=stages.structural_variants_pindel,
    #    name='structural_variants_pindel',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]),
    #    output='{path[0]}/{sample[0]}.pindel')
    #    .follows('index_reference_bwa')
    #    .follows('index_reference_samtools'))

    return pipeline
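
The sort_discordants / sort_splitters comments above explain that older samtools sort takes an output prefix, while Ruffus needs the real .bam path as the tracked output, so the prefix travels in extras. The stages.sort_bam method itself is not shown in this snippet; here is a hedged sketch of what it plausibly does.

import subprocess

def sort_bam_sketch(input_bam, output_bam, output_prefix):
    # pre-1.3 samtools writes <output_prefix>.bam, which should equal output_bam
    subprocess.check_call(['samtools', 'sort', input_bam, output_prefix])
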
@split(make_start, tempdir + '*.split')
def split_start(infile, outfiles):
    # cleanup existing
    for f in outfiles:
        os.unlink(f)


    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    #
    #   Create more files than the previous invocation
    #
    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    n_to_produce = len(outfiles) + 1
    for i in range(n_to_produce):
        f = '{}{}.split'.format(tempdir, i)
        open(f, 'a').close()



@subdivide(split_start, formatter(), tempdir + '{basename[0]}_*.subdivided', tempdir + '{basename[0]}')
def subdivide_start(infile, outfiles, infile_basename):
    # cleanup existing
    for f in outfiles:
        os.unlink(f)

    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    #
    #   Create more files than the previous invocation
    #
    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    n_to_produce = len(outfiles) + 1
    for i in range(n_to_produce):
        open('{}_{}.subdivided'.format(infile_basename, i), 'a').close()


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
@mkdir(tempdir)
@originate([os.path.join(tempdir, ff + ".tmp") for ff in "abcd"])
def generate_initial_files(out_name):
    with open(out_name, 'w') as outfile:
        pass


@transform(input = generate_initial_files, filter=formatter(), output = "{path[0]}/{basename[0]}.task1.{whatever}",
                        extras=['echo {dynamic_message} > {some_file}'])
def transform_with_missing_formatter_args(input_file, output_files, output1):
    print ("input = %r, output = %r, extras = %r" % (input_file, output_files, output1))


class Test_ruffus(unittest.TestCase):
    #___________________________________________________________________________
    #
    #   setup and cleanup
    #___________________________________________________________________________
    def setUp(self):
        import os
        try:
            shutil.rmtree(tempdir)
        except:
            pass
Example #24
#___________________________________________________________________________
#
#   generate_initial_files3
#___________________________________________________________________________
@originate([tempdir +  "/g_name.tmp1", tempdir +  "/h_name.tmp1"])
def generate_initial_files3(out_name):
    with open(out_name, 'w') as outfile:
        pass

#___________________________________________________________________________
#
#   test_product_task
#___________________________________________________________________________
@follows(generate_initial_files1)
@product(
        [tempdir +  "/" + prefix + "_name.tmp1" for prefix in "abcd"],
        formatter(".*/(?P<FILE_PART>.+).tmp1$" ),
        generate_initial_files2,
        formatter(),
        "{path[0][0]}/{FILE_PART[0][0]}.{basename[1][0]}.{basename[2][0]}.tmp2",
        input3 = generate_initial_files3,
        filter3 = formatter(r"tmp1$" ),
        extras = [  "{basename[0][0][0]}{basename[1][0][0]}{basename[2][0][0]}",       # extra: prefices only (abcd etc)
                    "{subpath[0][0][0]}",      # extra: path for 2nd input, 1st file
                    "{subdir[0][0][0]}"])
def test_product_task(infiles, outfile, prefices, subpath, subdir):
    with open(outfile, "w") as p:
        p.write(prefices + ",")
Example #25
            "appearances": len(character["films"]),
            "species_id": species_value,
        }
        data.append(row)

    df = pd.DataFrame(data=data)
    df = df.set_index("appearances")
    df.to_csv(output_file)


clean_dir = os.path.join(DATADIR, "clean")
clean_files = [os.path.join(clean_dir, "clean.csv")]


@mkdir(clean_dir)
@transform(get_character_data, formatter(r".*?\.csv"),
           os.path.join(clean_dir, "cleaned.csv"))
def clean_data(input_file, output_file):
    """
    Remove character rows with "unknown" height.
    Take the top ten characters, sorted by appearances, descending.
    """
    df = pd.read_csv(input_file, index_col="appearances")
    # df = df.reset_index(drop=True)
    df = df.fillna("")

    remove_unknown_df = df[df['height'] != "unknown"].copy()
    df = remove_unknown_df.sort_index(ascending=False)

    df = df.head(10)
    df.to_csv(output_file)
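
A hedged sketch of how this decorator-style snippet would typically be executed: ask Ruffus to bring the final task up to date. The target task comes from the code above; the multiprocess/verbose values are placeholders.

if __name__ == '__main__':
    from ruffus import pipeline_run
    pipeline_run([clean_data], multiprocess=2, verbose=1)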