def create_hla_type_workflow( normal_bam_file, hla_type_file):
    """Build a workflow that HLA-types a normal sample with OptiType.

    Chromosome 6 read pairs are pulled from the normal BAM, converted to
    a pair of fastq files, and fed to OptiType to call the HLA type.

    :param normal_bam_file: path to the indexed normal BAM.
    :param hla_type_file: path where the OptiType result is written.
    :returns: configured pypeliner Workflow.
    """
    mgd = pypeliner.managed

    workflow = Workflow()

    # Properly-paired (-f 2), mapped (-F 4) chr6 reads -> collate by name
    # -> split into R1/R2 fastq files via bam2fq, all in one shell pipe.
    workflow.commandline(
        name='extract_chr6',
        args=(
            'samtools', 'view', '-bh', '-f', '2', '-F', '4',
            mgd.InputFile(normal_bam_file), '6',
            '|',
            'samtools', 'collate', '-O', '-',
            mgd.TempSpace('chr6_collate_temp'),
            '|',
            'samtools', 'bam2fq',
            '-1', mgd.TempOutputFile('chr6_reads_1.fq'),
            '-2', mgd.TempOutputFile('chr6_reads_2.fq'),
            '-',
        ),
    )

    # OptiType is memory hungry; 24G requested up front.
    workflow.transform(
        name='optitype',
        ctx={'mem': 24},
        func=tasks.run_optitype,
        args=(
            mgd.TempInputFile('chr6_reads_1.fq'),
            mgd.TempInputFile('chr6_reads_2.fq'),
            mgd.OutputFile(hla_type_file),
            mgd.TempSpace('optitype_temp'),
        )
    )

    return workflow
def create_mappability_wig_file(config, out_file):
    """Download a mappability bigwig and window it into a wig file.

    :param config: dict providing 'mappability_url' and 'window_size'.
    :param out_file: output wig path; the intermediate bigwig is kept
        next to it with a '.bigwig' suffix.
    :returns: configured pypeliner Workflow.
    """
    mgd = pypeliner.managed
    bigwig_file = out_file + '.bigwig'

    workflow = Workflow()

    workflow.subworkflow(
        name='download_mappability_bigwig',
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            config['mappability_url'],
            mgd.OutputFile(bigwig_file),
        ))

    # mapCounter bins mappability into fixed-size windows, writing wig
    # to stdout, which the shell redirects into the managed output.
    workflow.commandline(
        name='convert_mappability_to_wig',
        ctx={'mem': 4},
        args=(
            'mapCounter',
            '-w', config['window_size'],
            mgd.InputFile(bigwig_file),
            '>', mgd.OutputFile(out_file),
        ),
    )

    return workflow
def create_setup_theta_workflow(config, databases, **kwargs):
    """Download and unpack the reference data Theta needs.

    Fetches the mappability archive plus one fasta per configured
    chromosome, extracting everything under directories derived from the
    configured path templates.

    :param config: dict with 'mappability_template', 'mappability_url',
        'chromosome_template', 'chromosome_url_template' and 'chromosomes'.
    :param databases: accepted for interface compatibility; unused here.
    :returns: configured pypeliner Workflow.
    """
    mgd = pypeliner.managed

    # The mappability archive is extracted one level above the directory
    # named by the template.
    mappability_dir = os.path.realpath(
        os.path.join(os.path.dirname(config['mappability_template']), os.pardir))
    map_extract_log = os.path.join(mappability_dir, 'mappability_extract.log')
    chromosomes_dir = os.path.dirname(config['chromosome_template'])

    utils.make_directory(mappability_dir)
    utils.make_directory(chromosomes_dir)

    workflow = Workflow()

    workflow.subworkflow(
        name='download_mappability',
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            config['mappability_url'],
            mgd.TempOutputFile('mappability.tar.gz'),
        ))

    # tar's verbose listing is captured as a log file so the extraction
    # step has a managed output pypeliner can track.
    workflow.commandline(
        name='extract_mappability',
        args=(
            'tar', '-xzvf', mgd.TempInputFile('mappability.tar.gz'),
            '-C', mappability_dir,
            '>', mgd.OutputFile(map_extract_log),
        ),
    )

    for chromosome in config['chromosomes']:
        workflow.subworkflow(
            name='download_chromosome_{}'.format(chromosome),
            func=biowrappers.components.io.download.create_download_workflow,
            args=(
                config['chromosome_url_template'].format(chromosome),
                mgd.TempOutputFile('chromosome_{}.fa.gz'.format(chromosome)),
            ))

        workflow.commandline(
            name='extract_chromosome_{}'.format(chromosome),
            args=(
                'gunzip', '-c',
                mgd.TempInputFile('chromosome_{}.fa.gz'.format(chromosome)),
                '>',
                mgd.OutputFile(config['chromosome_template'].format(chromosome)),
            ),
        )

    return workflow
def realignment_readgroups_pipeline(
        config,
        in_file,
        out_file):
    """Realign a BAM one read group at a time, then merge and mark duplicates.

    The input BAM is split by read group, each read-group BAM is realigned
    via the ``realignment_pipeline`` subworkflow, and the realigned BAMs are
    merged into ``out_file`` with duplicates marked.

    :param config: pipeline configuration passed through to realignment.
    :param in_file: input BAM path.
    :param out_file: merged, duplicate-marked output BAM path.
    :returns: configured pypeliner Workflow.
    """
    workflow = Workflow()

    # One TempObj per read group id, discovered from the input BAM.
    workflow.transform(
        name='get_read_group_configs',
        func=tasks.get_read_group_configs,
        ret=pypeliner.managed.TempOutputObj('read_group_config', 'read_group_id'),
        args=(
            pypeliner.managed.InputFile(in_file),
        )
    )

    # Split: extract the reads of each read group into its own BAM.
    workflow.commandline(
        name='create_read_group_bam',
        axes=('read_group_id',),
        args=(
            'samtools', 'view', '-b',
            '-r', pypeliner.managed.InputInstance('read_group_id'),
            pypeliner.managed.InputFile(in_file),
            '>',
            pypeliner.managed.TempOutputFile('read_group_bam', 'read_group_id'),
        )
    )

    workflow.subworkflow(
        name='realignment_pipeline',
        axes=('read_group_id',),
        func=realignment_pipeline,
        args=(
            config,
            pypeliner.managed.TempInputFile('read_group_bam', 'read_group_id'),
            pypeliner.managed.TempOutputFile('realigned_read_group_bam', 'read_group_id'),
        ),
        kwargs={
            'read_group_info': pypeliner.managed.TempInputObj('read_group_config', 'read_group_id'),
        }
    )

    # BUGFIX: this merge step must NOT carry axes=('read_group_id',).
    # With the axis set, one job per read group would run and each would
    # clobber the single out_file. Omitting axes makes pypeliner treat the
    # axis-bearing TempInputFile as a merge, handing all per-read-group
    # BAMs to a single mark_duplicates call. The markdup TempSpace loses
    # the axis for the same reason.
    workflow.transform(
        name='merge_and_markdups',
        ctx={'mem' : 48, 'num_retry' : 3, 'mem_retry_increment' : 16},
        func=bam_tasks.mark_duplicates,
        args=(
            pypeliner.managed.TempInputFile('realigned_read_group_bam', 'read_group_id'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'tmp_dir' : pypeliner.managed.TempSpace('markdup_temp')
        }
    )

    return workflow
def create_tophat_transcriptome_index_workflow(
        ref_genome_fasta_file,
        transcript_gtf_file,
        ref_genome_index_prefix,
        transcriptome_index_prefix,
        copy_ref_genome=False):
    """Build bowtie genome and tophat transcriptome indexes.

    The reference fasta is first copied or symlinked next to the index
    prefix (the index builders expect the fasta alongside the prefix),
    then the genome and transcriptome indexes are built.

    :param ref_genome_fasta_file: path to the reference genome fasta.
    :param transcript_gtf_file: path to the transcript annotation GTF.
    :param ref_genome_index_prefix: output prefix for the bowtie index.
    :param transcriptome_index_prefix: output prefix for the tophat index.
    :param copy_ref_genome: copy the fasta instead of symlinking it.
    :returns: configured pypeliner Workflow.
    """
    workflow = Workflow()

    local_ref_genome_fasta_path = ref_genome_index_prefix + '.fa'

    if copy_ref_genome:
        workflow.commandline(
            name='copy_genome',
            ctx={'local': True},
            args=(
                'cp',
                mgd.InputFile(ref_genome_fasta_file),
                mgd.OutputFile(local_ref_genome_fasta_path),
            ),
        )

    else:
        # BUGFIX: resolve the source to an absolute path before linking.
        # `ln -s` stores the given string verbatim, and a relative target
        # resolves against the link's own directory, producing a dangling
        # symlink whenever ref_genome_fasta_file is a relative path.
        workflow.commandline(
            name='link_genome',
            ctx={'local': True},
            args=(
                'ln', '-s',
                mgd.InputFile(os.path.abspath(ref_genome_fasta_file)),
                mgd.OutputFile(local_ref_genome_fasta_path),
            ),
        )

    workflow.transform(
        name='build_bowtie_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_genome_index,
        args=(
            mgd.InputFile(local_ref_genome_fasta_path),
            mgd.OutputFile(ref_genome_index_prefix),
        )
    )

    workflow.transform(
        name='build_tophat_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_transcriptome_index,
        args=(
            mgd.InputFile(ref_genome_index_prefix),
            mgd.InputFile(transcript_gtf_file),
            mgd.OutputFile(transcriptome_index_prefix),
        )
    )

    return workflow
def create_gc_wig_file(config, genome_file, out_file):
    """Compute windowed GC content for a genome as a wig file.

    :param config: dict providing 'window_size'.
    :param genome_file: reference genome fasta path.
    :param out_file: output wig path.
    :returns: configured pypeliner Workflow.
    """
    mgd = pypeliner.managed

    workflow = Workflow()

    # gcCounter emits wig on stdout; redirect into the managed output.
    workflow.commandline(
        name='create_gc',
        ctx={'mem': 4},
        args=(
            'gcCounter',
            '-w', config['window_size'],
            mgd.InputFile(genome_file),
            '>', mgd.OutputFile(out_file),
        ),
    )

    return workflow
def create_pvacseq_workflow(
        vcf_file,
        hla_type_file,
        results_file,
        config,
):
    """Annotate a VCF with VEP and run pVACseq neoantigen prediction.

    :param vcf_file: somatic variant VCF path.
    :param hla_type_file: HLA typing result (e.g. from OptiType).
    :param results_file: output path for pVACseq results.
    :param config: dict providing 'vep_dir' (VEP cache, with a 'Plugins'
        subdirectory for the Downstream/Wildtype plugins pVACseq needs).
    :returns: configured pypeliner Workflow.
    """
    mgd = pypeliner.managed

    workflow = Workflow()

    # Offline VEP annotation; Downstream and Wildtype plugins are
    # required by pVACseq's parser.
    workflow.commandline(
        name='vep',
        ctx={'mem': 16},
        args=(
            'variant_effect_predictor.pl',
            '--input_file', mgd.InputFile(vcf_file),
            '--format', 'vcf',
            '--output_file', mgd.TempOutputFile('vep_annotated.vcf'),
            '--vcf', '--symbol', '--terms', 'SO',
            '--plugin', 'Downstream',
            '--plugin', 'Wildtype',
            '--cache', '--offline', '--force_overwrite',
            '--assembly', 'GRCh37',
            '--dir', config['vep_dir'],
            '--dir_plugins', os.path.join(config['vep_dir'], 'Plugins'),
        ),
    )

    workflow.transform(
        name='run_pvacseq',
        func=tasks.run_pvacseq,
        args=(
            mgd.TempInputFile('vep_annotated.vcf'),
            mgd.InputFile(hla_type_file),
            mgd.OutputFile(results_file),
            mgd.TempSpace('pvacseq_temp'),
            config,
        ),
    )

    return workflow
def create_titan_workflow(seqdata_files, config, out_file, raw_data_dir,
                          somatic_breakpoint_file=None, normal_id=None, **kwargs):
    """Run TiTAN copy-number analysis on tumour samples against a normal.

    Prepares normal/tumour read-count and allele data, runs TiTAN over a
    grid of initialization parameters per sample, selects the best
    solution, plots per-chromosome results, and merges per-sample HDF5
    results into ``out_file``.

    :param seqdata_files: dict mapping sample id -> seqdata file path;
        must include ``normal_id``.
    :param config: TiTAN configuration dict (may provide 'chromosomes').
    :param out_file: merged HDF5 results path.
    :param raw_data_dir: directory receiving per-sample results/output files.
    :param somatic_breakpoint_file: optional breakpoints passed to
        solution selection.
    :param normal_id: id of the normal sample; required.
    :raises ValueError: if ``normal_id`` is None.
    :returns: configured pypeliner Workflow.
    """
    if normal_id is None:
        raise ValueError('Titan requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    # BUGFIX: materialize the keys view. Under Python 3, dict.keys() is a
    # live, non-serializable view tied to the dict; chunk values should be
    # a concrete list.
    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx={'mem': 16, 'num_retry': 3, 'mem_retry_increment': 4},
        func=tasks.prepare_normal_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.TempOutputFile('normal.wig'),
            pypeliner.managed.TempOutputFile('het_positions.tsv'),
            config,
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id', ),
        ctx={'mem': 20},
        func=tasks.prepare_tumour_data,
        args=(
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            pypeliner.managed.TempInputFile('het_positions.tsv'),
            pypeliner.managed.TempOutputFile('tumour.wig', 'sample_id'),
            pypeliner.managed.TempOutputFile('tumour_alleles.tsv', 'sample_id'),
            config,
        ),
    )

    # NOTE: 'intialization' spelling matches the task function's name.
    workflow.transform(
        name='create_intialization_parameters',
        axes=('sample_id', ),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=tasks.create_intialization_parameters,
        ret=pypeliner.managed.TempOutputObj('init_params', 'sample_id', 'init_param_id'),
        args=(config, ),
    )

    # One TiTAN run per (sample, initialization) pair.
    workflow.transform(
        name='run_titan',
        axes=('sample_id', 'init_param_id'),
        ctx={'mem': 16, 'num_retry': 3, 'mem_retry_increment': 4},
        func=tasks.run_titan,
        args=(
            pypeliner.managed.TempInputObj('init_params', 'sample_id', 'init_param_id'),
            pypeliner.managed.TempInputFile('normal.wig'),
            pypeliner.managed.TempInputFile('tumour.wig', 'sample_id'),
            pypeliner.managed.TempInputFile('tumour_alleles.tsv', 'sample_id'),
            pypeliner.managed.TempOutputFile('cn.tsv', 'sample_id', 'init_param_id'),
            pypeliner.managed.TempOutputFile('params.tsv', 'sample_id', 'init_param_id'),
            config,
        ),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    # Pick the best initialization per sample (merge over init_param_id).
    workflow.transform(
        name='select_solution',
        axes=('sample_id', ),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=tasks.select_solution,
        args=(
            pypeliner.managed.TempInputObj('init_params', 'sample_id', 'init_param_id'),
            pypeliner.managed.TempInputFile('cn.tsv', 'sample_id', 'init_param_id'),
            pypeliner.managed.TempInputFile('params.tsv', 'sample_id', 'init_param_id'),
            pypeliner.managed.OutputFile('results', 'sample_id', template=results_files),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_cn_loci.tsv'), 'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_cn_segments.tsv'), 'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_cn_igv.tsv'), 'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_params.tsv'), 'sample_id'),
            config,
            pypeliner.managed.Template('{sample_id}', 'sample_id'),
        ),
        kwargs={
            'breakpoints_filename': somatic_breakpoint_file,
        },
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id', 'chromosome'),
        value=config.get('chromosomes', default_chromosomes),
        axes=('sample_id', ))

    workflow.commandline(
        name='plot_chromosome',
        axes=('sample_id', 'chromosome'),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            'plot_titan_chromosome.R',
            pypeliner.managed.Instance('chromosome'),
            pypeliner.managed.InputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_cn_loci.tsv'), 'sample_id'),
            pypeliner.managed.InputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_params.tsv'), 'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_chr_{chromosome}.png'),
                'sample_id', 'chromosome'),
        ),
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 2},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results', 'sample_id', template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
def create_ref_genome_download_and_index_workflow(config, out_file):
    """Download a reference genome and build samtools/bwa indexes for it.

    Gzipped URLs are downloaded to a temp file and decompressed; plain
    URLs are downloaded directly. The dict, fai and bwa index steps log
    their stdout next to the output so each has a managed output file.

    :param config: dict providing 'url' for the reference fasta.
    :param out_file: path of the decompressed reference fasta.
    :returns: configured pypeliner Workflow.
    """
    mgd = pypeliner.managed
    retry_ctx = {'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2}

    workflow = Workflow()

    if config['url'].endswith('gz'):
        workflow.subworkflow(
            name='download',
            func=download.create_download_workflow,
            args=(
                config['url'],
                mgd.TempOutputFile('ref.fasta.gz'),
            )
        )

        workflow.commandline(
            name='gunzip',
            args=(
                'gzip', '-cd',
                mgd.TempInputFile('ref.fasta.gz'),
                '>',
                mgd.OutputFile(out_file)
            ),
        )

    else:
        workflow.subworkflow(
            name='download',
            func=download.create_download_workflow,
            args=(
                config['url'],
                mgd.OutputFile(out_file)
            )
        )

    workflow.commandline(
        name='build_dict',
        ctx=retry_ctx,
        args=(
            'samtools', 'dict',
            mgd.InputFile(out_file),
            '>',
            mgd.OutputFile(out_file + '.build_dict.log'),
        )
    )

    workflow.commandline(
        name='build_fai',
        ctx=retry_ctx,
        args=(
            'samtools', 'faidx',
            mgd.InputFile(out_file),
            '>',
            mgd.OutputFile(out_file + '.build_fai.log'),
        )
    )

    workflow.commandline(
        name='build_bwa_index',
        ctx=retry_ctx,
        args=(
            'bwa', 'index',
            mgd.InputFile(out_file),
            '>',
            mgd.OutputFile(out_file + '.build_bwa_index.log'),
        )
    )

    return workflow
def create_setup_reference_dbs_workflow(config):
    """Download the reference databases named in the configuration.

    Each supported key ('cosmic', 'dbsnp', 'mappability', 'ref_genome',
    'snpeff', 'chrom_info') is optional; only the sections present in
    ``config`` produce workflow steps.

    :param config: per-database configuration dicts keyed by database name.
    :returns: configured pypeliner Workflow.
    """
    mgd = pypeliner.managed

    workflow = Workflow()

    if 'cosmic' in config:
        # COSMIC needs authenticated download handled by a task, with a
        # persistent work dir (cleanup=None) for resumability.
        workflow.transform(
            name='cosmic',
            func=tasks.download_cosmic,
            args=(
                config['cosmic'],
                mgd.OutputFile(config['cosmic']['local_path']),
                mgd.TempSpace('cosmic_work', cleanup=None)
            )
        )

    if 'dbsnp' in config:
        workflow.subworkflow(
            name='dbsnp',
            func=create_dbsnp_download_workflow,
            args=(
                config['dbsnp'],
                mgd.OutputFile(config['dbsnp']['local_path']),
            )
        )

    if 'mappability' in config:
        workflow.subworkflow(
            name='mappability',
            func=download.create_download_workflow,
            args=(
                config['mappability']['url'],
                mgd.OutputFile(config['mappability']['local_path']),
            )
        )

    if 'ref_genome' in config and 'url' in config['ref_genome']:
        workflow.subworkflow(
            name='ref_genome',
            func=create_ref_genome_download_and_index_workflow,
            args=(
                config['ref_genome'],
                mgd.OutputFile(config['ref_genome']['local_path']),
            )
        )

    if 'snpeff' in config:
        # snpEff manages its own data directory; no managed output here.
        workflow.commandline(
            name='snpeff',
            args=(
                'snpEff', 'download',
                config['snpeff']['db']
            )
        )

    if 'chrom_info' in config:
        workflow.subworkflow(
            name='chrom_info',
            func=download.create_download_workflow,
            args=(
                config['chrom_info']['url'],
                mgd.OutputFile(config['chrom_info']['local_path']),
            )
        )

    return workflow