def create_workflow_1(input_filename, output_filename):
    """Build a workflow that reads, transforms and writes a managed object.

    Each step hands its result to the next via pypeliner managed
    temporaries; the final result is written out through a recursive
    sub-workflow.

    Args:
        input_filename: path of the file to read.
        output_filename: path the sub-workflow writes the final output to.

    Returns:
        A pypeliner Workflow ready to be run.
    """
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    # Read data into a managed object
    workflow.transform(
        name='read',
        func=read_stuff,
        ret=mgd.TempOutputObj('input_data'),
        args=(mgd.InputFile(input_filename),),
    )

    # Extract a property of the managed object, modify it
    # and store the result in another managed object
    workflow.transform(
        name='do',
        func=do_stuff,
        ret=mgd.TempOutputObj('output_data'),
        args=(mgd.TempInputObj('input_data').prop('some_string'),),
    )

    # Write the object to an output file
    workflow.transform(
        name='write',
        func=write_stuff,
        args=(
            mgd.TempInputObj('output_data'),
            mgd.TempOutputFile('output_file'),
        ),
    )

    # Recursive workflow
    workflow.subworkflow(
        name='sub_workflow_2',
        func=create_workflow_2,
        args=(
            mgd.TempInputFile('output_file'),
            mgd.OutputFile(output_filename),
        ),
    )

    return workflow
def create_destruct_workflow(
        bam_filenames,
        breakpoint_table,
        breakpoint_library_table,
        breakpoint_read_table,
        config,
        ref_data_dir,
        raw_data_dir=None,
):
    """Build the top-level destruct breakpoint-calling workflow.

    Args:
        bam_filenames: mapping of library id -> input bam path.
        breakpoint_table: output filename for the breakpoint table.
        breakpoint_library_table: output filename for the per-library table.
        breakpoint_read_table: output filename for the per-read table.
        config: user configuration, merged with defaults below.
        ref_data_dir: reference data directory.
        raw_data_dir: optional directory used to cache extracted reads and
            stats on disk so a rerun can skip bam extraction.

    Returns:
        A pypeliner Workflow ready to be run.
    """
    # Per-library intermediates: real cached files when raw_data_dir is
    # given, otherwise pipeline-managed temporaries.
    def per_library_file(basename):
        if raw_data_dir is not None:
            return mgd.File(
                os.path.join(raw_data_dir, '{bylibrary}_' + basename),
                'bylibrary')
        return mgd.TempFile(basename, 'bylibrary')

    mgd_stats = per_library_file('stats.txt')
    mgd_reads_1 = per_library_file('reads1.fq.gz')
    mgd_reads_2 = per_library_file('reads2.fq.gz')
    mgd_sample_1 = per_library_file('sample1.fq.gz')
    mgd_sample_2 = per_library_file('sample2.fq.gz')

    # Fill in any unspecified settings from the reference defaults.
    config = destruct.defaultconfig.get_config(ref_data_dir, config)

    workflow = pypeliner.workflow.Workflow()

    # Set the library ids
    workflow.setobj(
        obj=mgd.TempOutputObj('library_id', 'bylibrary'),
        value=destruct.tasks.create_library_ids(bam_filenames.keys()),
    )

    # Retrieve discordant reads and stats from bam files
    workflow.commandline(
        name='bamdisc',
        axes=('bylibrary',),
        ctx={'io': 1, 'mem': 8},
        args=(
            'destruct_bamdiscordantfastq',
            '-r',
            '-c', config['bam_max_soft_clipped'],
            '-f', config['bam_max_fragment_length'],
            '-b', mgd.InputFile('bam', 'bylibrary', fnames=bam_filenames),
            '-s', mgd_stats.as_output(),
            '--fastq1', mgd_reads_1.as_output(),
            '--fastq2', mgd_reads_2.as_output(),
            '-t', mgd.TempSpace('bamdisc.tempspace', 'bylibrary'),
            '-n', config['num_read_samples'],
            '--sample1', mgd_sample_1.as_output(),
            '--sample2', mgd_sample_2.as_output(),
        ),
    )

    # Hand the extracted fastqs and stats to the fastq-based workflow.
    workflow.subworkflow(
        name='destruct_fastq',
        func=create_destruct_fastq_workflow,
        args=(
            mgd_reads_1.as_input(),
            mgd_reads_2.as_input(),
            mgd_sample_1.as_input(),
            mgd_sample_2.as_input(),
            mgd_stats.as_input(),
            mgd.OutputFile(breakpoint_table),
            mgd.OutputFile(breakpoint_library_table),
            mgd.OutputFile(breakpoint_read_table),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_data_dir,
        },
    )

    return workflow
def generate_bam(
        simulation_params,
        chromosomes,
        include_nonchromosomal,
        simulated_bam_filename,
        genome_fasta_filename,
        simulated_table_filename,
        raw_data_dir,
):
    """Build a workflow simulating breakpoints and aligning the reads.

    Creates a genome from the requested chromosomes, simulates breakpoints
    plus concordant/discordant read pairs, aligns the reads with bwa, and
    sorts/indexes the resulting bam.

    Args:
        simulation_params: simulation parameter object.
        chromosomes: chromosomes to include in the generated genome.
        include_nonchromosomal: whether to include non-chromosomal contigs.
        simulated_bam_filename: output filename of the sorted, indexed bam.
        genome_fasta_filename: output filename of the generated genome fasta.
        simulated_table_filename: output filename of the simulated
            breakpoint table.
        raw_data_dir: directory for intermediate fastq/fasta files.

    Returns:
        A pypeliner Workflow ready to be run.
    """
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    workflow.setobj(mgd.TempOutputObj('simulation.params'), simulation_params)
    workflow.setobj(mgd.TempOutputObj('chromosomes'), chromosomes)
    workflow.setobj(
        mgd.TempOutputObj('include_nonchromosomal'), include_nonchromosomal)

    workflow.transform(
        name='create_genome',
        func=destruct.benchmark.destruct_test.create_genome,
        args=(
            mgd.TempInputObj('chromosomes'),
            mgd.TempInputObj('include_nonchromosomal'),
            mgd.OutputFile(genome_fasta_filename),
        ),
    )

    workflow.transform(
        name='create_sim',
        func=destruct.benchmark.create_breakpoint_simulation.create,
        args=(
            mgd.TempInputObj('simulation.params'),
            mgd.InputFile(genome_fasta_filename),
            mgd.OutputFile(os.path.join(raw_data_dir, 'simulated.fasta')),
            mgd.OutputFile(simulated_table_filename),
            mgd.TempOutputFile('concordant.1.fastq'),
            mgd.TempOutputFile('concordant.2.fastq'),
            mgd.TempOutputFile('discordant.1.fastq'),
            mgd.TempOutputFile('discordant.2.fastq'),
        ),
    )

    # Merge concordant and discordant reads, once per read end.
    for end in ('1', '2'):
        workflow.commandline(
            name='cat' + end,
            args=(
                'cat',
                mgd.TempInputFile('concordant.' + end + '.fastq'),
                mgd.TempInputFile('discordant.' + end + '.fastq'),
                '>',
                mgd.OutputFile(
                    os.path.join(raw_data_dir, 'simulated.' + end + '.fastq')),
            ),
        )

    workflow.subworkflow(
        name='bwa_align',
        func=destruct.benchmark.align.bwa.workflow.bwa_align_workflow,
        args=(
            mgd.InputFile(genome_fasta_filename),
            mgd.InputFile(os.path.join(raw_data_dir, 'simulated.1.fastq')),
            mgd.InputFile(os.path.join(raw_data_dir, 'simulated.2.fastq')),
            mgd.TempOutputFile('simulated.unsorted.bam'),
        ),
    )

    workflow.transform(
        name='samtools_sort_index',
        func=destruct.benchmark.destruct_test.samtools_sort_index,
        args=(
            mgd.TempInputFile('simulated.unsorted.bam'),
            mgd.OutputFile(simulated_bam_filename),
        ),
    )

    return workflow
mgd.OutputFile( os.path.join(args['results_dir'], 'simulated.1.fastq')), '-2', mgd.OutputFile( os.path.join(args['results_dir'], 'simulated.2.fastq')), ), ) workflow.subworkflow( name='bwa_align', func=destruct.benchmark.align.bwa.workflow.bwa_align_workflow, args=( mgd.InputFile(genome_fasta), mgd.InputFile( os.path.join(args['results_dir'], 'simulated.1.fastq')), mgd.InputFile( os.path.join(args['results_dir'], 'simulated.2.fastq')), mgd.TempOutputFile('simulated.unsorted.bam'), ), kwargs={ 'read_group_str': '@RG\\tID:B', }, ) workflow.transform( name='samtools_merge_sort_index', func=destruct.benchmark.destruct_test.samtools_merge_sort_index, args=( mgd.TempOutputFile('tumour.raw.bam'), mgd.TempInputFile('tumour.unspiked.bam'), mgd.TempInputFile('simulated.unsorted.bam'),
workflow = pypeliner.workflow.Workflow(default_ctx=ctx) workflow.setobj(mgd.TempOutputObj('simulation.params'), sim_config['simulation']) workflow.setobj(mgd.TempOutputObj('chromosomes'), sim_config['reference']['chromosomes']) workflow.setobj(mgd.TempOutputObj('include_nonchromosomal'), sim_config['reference']['include_nonchromosomal']) workflow.subworkflow( name='generate_bam_workflow', func=destruct.benchmark.generate_bam.generate_bam, args=( mgd.TempInputObj('simulation.params'), mgd.TempInputObj('chromosomes'), mgd.TempInputObj('include_nonchromosomal'), mgd.OutputFile(os.path.join(args['results_dir'], 'simulated.bam')), mgd.OutputFile(os.path.join(args['results_dir'], 'genome.fasta')), mgd.OutputFile(os.path.join(args['results_dir'], 'simulated.tsv')), os.path.join(args['results_dir'], 'raw'), ), ) workflow.setobj( obj=mgd.TempOutputObj('tool_defs', 'tool_name'), value=tool_defs, ) workflow.subworkflow( name='run_tool', axes=('tool_name', ),