Example #1
import pypeliner.workflow
import pypeliner.managed as mgd


def create_workflow_1(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    # Read data into a managed object
    workflow.transform(name='read',
                       func=read_stuff,
                       ret=mgd.TempOutputObj('input_data'),
                       args=(mgd.InputFile(input_filename), ))

    # Extract a property of the managed object, modify it
    # and store the result in another managed object
    workflow.transform(
        name='do',
        func=do_stuff,
        ret=mgd.TempOutputObj('output_data'),
        args=(mgd.TempInputObj('input_data').prop('some_string'), ))

    # Write the object to an output file
    workflow.transform(name='write',
                       func=write_stuff,
                       args=(mgd.TempInputObj('output_data'),
                             mgd.TempOutputFile('output_file')))

    # Recursive workflow
    workflow.subworkflow(name='sub_workflow_2',
                         func=create_workflow_2,
                         args=(mgd.TempInputFile('output_file'),
                               mgd.OutputFile(output_filename)))

    return workflow
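
The task functions referenced above (read_stuff, do_stuff, write_stuff) are ordinary Python callables rather than pypeliner API: at run time, managed InputFile/OutputFile arguments are replaced with real paths, managed object arguments with their stored values, and the return value is captured for ret=. A minimal sketch of what such functions might look like, assuming that .prop('some_string') reads an attribute of the stored object (all names here are illustrative):

import collections

# Hypothetical container for the data read from the input file; the
# some_string attribute is what .prop('some_string') extracts.
Stuff = collections.namedtuple('Stuff', ['some_string'])


def read_stuff(filename):
    # Receives the resolved path of mgd.InputFile(input_filename); the
    # return value becomes the managed object 'input_data'.
    with open(filename) as f:
        return Stuff(some_string=f.read().rstrip())


def do_stuff(some_string):
    # Receives the extracted property; the return value becomes the
    # managed object 'output_data'.
    return some_string.upper()


def write_stuff(data, filename):
    # Receives the managed object 'output_data' and the resolved path of
    # the managed temporary file 'output_file'.
    with open(filename, 'w') as f:
        f.write(data)
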
Example #2
import os

import pypeliner.workflow
import pypeliner.managed as mgd

import destruct.defaultconfig
import destruct.tasks


def create_destruct_workflow(
    bam_filenames,
    breakpoint_table,
    breakpoint_library_table,
    breakpoint_read_table,
    config,
    ref_data_dir,
    raw_data_dir=None,
):
    # Optionally cache raw reads for quicker rerun
    if raw_data_dir is not None:
        mgd_stats = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_stats.txt'), 'bylibrary')
        mgd_reads_1 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_reads1.fq.gz'),
            'bylibrary')
        mgd_reads_2 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_reads2.fq.gz'),
            'bylibrary')
        mgd_sample_1 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_sample1.fq.gz'),
            'bylibrary')
        mgd_sample_2 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_sample2.fq.gz'),
            'bylibrary')

    else:
        mgd_stats = mgd.TempFile('stats.txt', 'bylibrary')
        mgd_reads_1 = mgd.TempFile('reads1.fq.gz', 'bylibrary')
        mgd_reads_2 = mgd.TempFile('reads2.fq.gz', 'bylibrary')
        mgd_sample_1 = mgd.TempFile('sample1.fq.gz', 'bylibrary')
        mgd_sample_2 = mgd.TempFile('sample2.fq.gz', 'bylibrary')

    config = destruct.defaultconfig.get_config(ref_data_dir, config)

    workflow = pypeliner.workflow.Workflow()

    # Set the library ids

    workflow.setobj(
        obj=mgd.TempOutputObj('library_id', 'bylibrary'),
        value=destruct.tasks.create_library_ids(bam_filenames.keys()),
    )

    # Retrieve discordant reads and stats from bam files

    workflow.commandline(
        name='bamdisc',
        axes=('bylibrary', ),
        ctx={
            'io': 1,
            'mem': 8
        },
        args=(
            'destruct_bamdiscordantfastq',
            '-r',
            '-c',
            config['bam_max_soft_clipped'],
            '-f',
            config['bam_max_fragment_length'],
            '-b',
            mgd.InputFile('bam', 'bylibrary', fnames=bam_filenames),
            '-s',
            mgd_stats.as_output(),
            '--fastq1',
            mgd_reads_1.as_output(),
            '--fastq2',
            mgd_reads_2.as_output(),
            '-t',
            mgd.TempSpace('bamdisc.tempspace', 'bylibrary'),
            '-n',
            config['num_read_samples'],
            '--sample1',
            mgd_sample_1.as_output(),
            '--sample2',
            mgd_sample_2.as_output(),
        ),
    )

    workflow.subworkflow(
        name='destruct_fastq',
        func=create_destruct_fastq_workflow,
        args=(
            mgd_reads_1.as_input(),
            mgd_reads_2.as_input(),
            mgd_sample_1.as_input(),
            mgd_sample_2.as_input(),
            mgd_stats.as_input(),
            mgd.OutputFile(breakpoint_table),
            mgd.OutputFile(breakpoint_library_table),
            mgd.OutputFile(breakpoint_read_table),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_data_dir,
        },
    )

    return workflow
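
A workflow factory like create_destruct_workflow is normally handed to a pypeliner scheduler to execute. The driver below is a sketch, assuming pypeliner's app helpers (pypeliner.app.add_arguments, Pypeline and Pypeline.run); the BAM paths, output table names and reference directory are placeholders:

import argparse

import pypeliner.app


if __name__ == '__main__':
    argparser = argparse.ArgumentParser()
    pypeliner.app.add_arguments(argparser)
    run_config = vars(argparser.parse_args())

    # Placeholder inputs: the dictionary keys identify the libraries and
    # seed the 'bylibrary' axis via destruct.tasks.create_library_ids.
    bam_filenames = {
        'tumour': 'tumour.bam',
        'normal': 'normal.bam',
    }

    workflow = create_destruct_workflow(
        bam_filenames,
        'breakpoints.tsv',
        'breakpoint_libraries.tsv',
        'breakpoint_reads.tsv',
        {},                    # config overrides, merged by defaultconfig.get_config
        '/path/to/ref_data',   # reference data directory
    )

    pyp = pypeliner.app.Pypeline(config=run_config)
    pyp.run(workflow)
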
Example #3
import os

import pypeliner.workflow
import pypeliner.managed as mgd

import destruct.benchmark.align.bwa.workflow
import destruct.benchmark.create_breakpoint_simulation
import destruct.benchmark.destruct_test


def generate_bam(
    simulation_params,
    chromosomes,
    include_nonchromosomal,
    simulated_bam_filename,
    genome_fasta_filename,
    simulated_table_filename,
    raw_data_dir,
):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    workflow.setobj(mgd.TempOutputObj('simulation.params'), simulation_params)
    workflow.setobj(mgd.TempOutputObj('chromosomes'), chromosomes)
    workflow.setobj(mgd.TempOutputObj('include_nonchromosomal'),
                    include_nonchromosomal)

    workflow.transform(
        name='create_genome',
        func=destruct.benchmark.destruct_test.create_genome,
        args=(
            mgd.TempInputObj('chromosomes'),
            mgd.TempInputObj('include_nonchromosomal'),
            mgd.OutputFile(genome_fasta_filename),
        ),
    )

    workflow.transform(
        name='create_sim',
        func=destruct.benchmark.create_breakpoint_simulation.create,
        args=(
            mgd.TempInputObj('simulation.params'),
            mgd.InputFile(genome_fasta_filename),
            mgd.OutputFile(os.path.join(raw_data_dir, 'simulated.fasta')),
            mgd.OutputFile(simulated_table_filename),
            mgd.TempOutputFile('concordant.1.fastq'),
            mgd.TempOutputFile('concordant.2.fastq'),
            mgd.TempOutputFile('discordant.1.fastq'),
            mgd.TempOutputFile('discordant.2.fastq'),
        ),
    )

    workflow.commandline(
        name='cat1',
        args=(
            'cat',
            mgd.TempInputFile('concordant.1.fastq'),
            mgd.TempInputFile('discordant.1.fastq'),
            '>',
            mgd.OutputFile(os.path.join(raw_data_dir, 'simulated.1.fastq')),
        ),
    )

    workflow.commandline(
        name='cat2',
        args=(
            'cat',
            mgd.TempInputFile('concordant.2.fastq'),
            mgd.TempInputFile('discordant.2.fastq'),
            '>',
            mgd.OutputFile(os.path.join(raw_data_dir, 'simulated.2.fastq')),
        ),
    )

    workflow.subworkflow(
        name='bwa_align',
        func=destruct.benchmark.align.bwa.workflow.bwa_align_workflow,
        args=(
            mgd.InputFile(genome_fasta_filename),
            mgd.InputFile(os.path.join(raw_data_dir, 'simulated.1.fastq')),
            mgd.InputFile(os.path.join(raw_data_dir, 'simulated.2.fastq')),
            mgd.TempOutputFile('simulated.unsorted.bam'),
        ),
    )

    workflow.transform(
        name='samtools_sort_index',
        func=destruct.benchmark.destruct_test.samtools_sort_index,
        args=(
            mgd.TempInputFile('simulated.unsorted.bam'),
            mgd.OutputFile(simulated_bam_filename),
        ),
    )

    return workflow
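
The sort-and-index helper used by the final transform lives in destruct's benchmark utilities and is not shown on this page. A hypothetical stand-in, using pypeliner.commandline.execute to shell out to samtools, illustrates the shape such a task function takes:

import pypeliner.commandline


def samtools_sort_index(input_bam_filename, output_bam_filename):
    # Hypothetical equivalent of destruct.benchmark.destruct_test.samtools_sort_index:
    # coordinate-sort the simulated BAM, then index it (the .bai lands next
    # to the output file).
    pypeliner.commandline.execute(
        'samtools', 'sort', '-o', output_bam_filename, input_bam_filename)
    pypeliner.commandline.execute(
        'samtools', 'index', output_bam_filename)
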
Example #4
            mgd.OutputFile(
                os.path.join(args['results_dir'], 'simulated.1.fastq')),
            '-2',
            mgd.OutputFile(
                os.path.join(args['results_dir'], 'simulated.2.fastq')),
        ),
    )

    workflow.subworkflow(
        name='bwa_align',
        func=destruct.benchmark.align.bwa.workflow.bwa_align_workflow,
        args=(
            mgd.InputFile(genome_fasta),
            mgd.InputFile(
                os.path.join(args['results_dir'], 'simulated.1.fastq')),
            mgd.InputFile(
                os.path.join(args['results_dir'], 'simulated.2.fastq')),
            mgd.TempOutputFile('simulated.unsorted.bam'),
        ),
        kwargs={
            'read_group_str': '@RG\\tID:B',
        },
    )

    workflow.transform(
        name='samtools_merge_sort_index',
        func=destruct.benchmark.destruct_test.samtools_merge_sort_index,
        args=(
            mgd.TempOutputFile('tumour.raw.bam'),
            mgd.TempInputFile('tumour.unspiked.bam'),
            mgd.TempInputFile('simulated.unsorted.bam'),
        ),
    )
Example #5
    workflow = pypeliner.workflow.Workflow(default_ctx=ctx)

    workflow.setobj(mgd.TempOutputObj('simulation.params'),
                    sim_config['simulation'])
    workflow.setobj(mgd.TempOutputObj('chromosomes'),
                    sim_config['reference']['chromosomes'])
    workflow.setobj(mgd.TempOutputObj('include_nonchromosomal'),
                    sim_config['reference']['include_nonchromosomal'])

    workflow.subworkflow(
        name='generate_bam_workflow',
        func=destruct.benchmark.generate_bam.generate_bam,
        args=(
            mgd.TempInputObj('simulation.params'),
            mgd.TempInputObj('chromosomes'),
            mgd.TempInputObj('include_nonchromosomal'),
            mgd.OutputFile(os.path.join(args['results_dir'], 'simulated.bam')),
            mgd.OutputFile(os.path.join(args['results_dir'], 'genome.fasta')),
            mgd.OutputFile(os.path.join(args['results_dir'], 'simulated.tsv')),
            os.path.join(args['results_dir'], 'raw'),
        ),
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('tool_defs', 'tool_name'),
        value=tool_defs,
    )

    workflow.subworkflow(
        name='run_tool',
        axes=('tool_name', ),