def build_index(index_sentinel_file, transcriptome_fasta_file, kmer_length=31, gencode=False, num_threads=1):
    """Run ``salmon index`` and mark completion with a sentinel file.

    The index is written to the directory that contains
    ``index_sentinel_file``. The sentinel itself is created (empty) after
    ``pypeliner.commandline.execute`` has returned.

    Args:
        index_sentinel_file: Path of the marker file; its parent directory is
            used as the salmon index directory (``-i``).
        transcriptome_fasta_file: Transcript FASTA passed to salmon via ``-t``.
        kmer_length: Value passed to salmon via ``-k``.
        gencode: When truthy, ``--gencode`` is added to the command line.
        num_threads: Value passed to salmon via ``-p``.
    """
    make_parent_directory(index_sentinel_file)

    cmd = [
        'salmon',
        'index',
        '-i', os.path.dirname(index_sentinel_file),
        '-k', kmer_length,
        '-p', num_threads,
        '-t', transcriptome_fasta_file,
    ]

    # The previous test was `gencode is not None`, which is true for the
    # default value False and therefore always enabled the flag.
    if gencode:
        cmd.append('--gencode')

    pypeliner.commandline.execute(*cmd)

    # Touch the sentinel last so it only appears once indexing has returned.
    with open(index_sentinel_file, 'w'):
        pass
def create_battenberg_workflow(
    seqdata_files,
    config,
    out_file,
    raw_data_dir,
    somatic_breakpoint_file=None,
    normal_id=None,
    **kwargs
):
    """Build the workflow that runs Battenberg per tumour sample and merges the results.

    Every entry of ``seqdata_files`` other than ``normal_id`` is treated as a
    tumour sample. One ``run_battenberg`` sub-workflow is registered per tumour
    sample, followed by a single ``merge_results`` job that hands the
    per-sample result files and ``out_file`` to ``hdf5_tasks.merge_hdf5``.

    Args:
        seqdata_files: Mapping of sample id to seqdata file path; must contain
            ``normal_id``.
        config: Passed through unchanged to the per-sample sub-workflow.
        out_file: Path of the merged output file.
        raw_data_dir: Directory under which ``results/sample_{sample_id}.h5``
            files are placed.
        somatic_breakpoint_file: Optional path; when given it is wrapped as a
            managed input file and forwarded to the sub-workflow.
        normal_id: Key of the normal sample in ``seqdata_files``. Required.
        **kwargs: Accepted for signature compatibility; not used here.

    Returns:
        The populated ``Workflow`` object.

    Raises:
        ValueError: If ``normal_id`` is None.
        KeyError: If ``normal_id`` is not a key of ``seqdata_files``.
    """
    if normal_id is None:
        # The message used to name cloneHD, which misdirected anyone
        # debugging a Battenberg run.
        raise ValueError('Battenberg requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    # The 'sample_id' axis ranges over tumour samples only.
    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=tumour_seqdata_files.keys(),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.subworkflow(
        name='run_battenberg',
        axes=('sample_id',),
        func=create_battenberg_single_workflow,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            normal_id,
            pypeliner.managed.InputInstance('sample_id'),
            pypeliner.managed.OutputFile('results', 'sample_id', template=results_files),
            config,
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results', 'sample_id', template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
def get_sample_out_file(cmd, ext, out_dir, variant_type):
    """Return a per-sample output path template and create its parent directory.

    The returned path is ``<out_dir>/<variant_type>/<cmd>/{tumour_sample_id}.<ext>``;
    the ``{tumour_sample_id}`` placeholder is left unformatted in the result.

    Args:
        cmd: Path component placed directly above the file name.
        ext: File extension (without the leading dot).
        out_dir: Root output directory.
        variant_type: Path component placed directly below ``out_dir``.

    Returns:
        The path template as a string.
    """
    # Doubled braces survive str.format as a literal '{tumour_sample_id}'.
    file_name = '{{tumour_sample_id}}.{0}'.format(ext)
    path_parts = (out_dir, variant_type, cmd, file_name)

    sample_path = os.path.join(*path_parts)
    make_parent_directory(sample_path)

    return sample_path
def create_ascat_workflow(seqdata_files, config, out_file, raw_data_dir, somatic_breakpoint_file=None, normal_id=None, **kwargs):
    """Build the ASCAT data-preparation workflow.

    Registers two jobs: ``prepare_normal_data`` (run once on the normal
    sample) and ``prepare_tumour_data`` (run once per tumour sample). Every
    entry of ``seqdata_files`` other than ``normal_id`` is treated as a tumour
    sample.

    NOTE(review): ``out_file``, ``somatic_breakpoint_file`` and ``kwargs`` are
    not used by any job registered here, and nothing writes to the
    ``results`` directory that is created -- confirm whether further steps are
    expected. The tumour job's temp outputs also reuse the ``Germline_*``
    file names (on the ``sample_id`` axis); confirm that naming is intended.

    Args:
        seqdata_files: Mapping of sample id to seqdata file path; must contain
            ``normal_id``.
        config: Passed unchanged to both task functions.
        out_file: Unused in this function.
        raw_data_dir: Directory under which the ``results`` directory is made.
        somatic_breakpoint_file: Unused in this function.
        normal_id: Key of the normal sample in ``seqdata_files``. Required.
        **kwargs: Unused in this function.

    Returns:
        The populated ``Workflow`` object.

    Raises:
        ValueError: If ``normal_id`` is None.
    """
    if normal_id is None:
        raise ValueError('ASCAT requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_template = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_template)

    managed = pypeliner.managed
    workflow = Workflow()

    # The 'sample_id' axis ranges over tumour samples only.
    workflow.setobj(
        obj=managed.OutputChunks('sample_id'),
        value=tumour_seqdata_files.keys(),
    )

    normal_job_ctx = dict(mem=16, num_retry=3, mem_retry_increment=4)
    normal_job_args = (
        managed.InputFile(normal_seqdata_file),
        managed.TempOutputFile('Germline_LogR.txt'),
        managed.TempOutputFile('Germline_BAF.txt'),
        config,
    )
    workflow.transform(
        name='prepare_normal_data',
        ctx=normal_job_ctx,
        func=tasks.prepare_normal_data,
        args=normal_job_args,
    )

    tumour_job_args = (
        managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
        managed.TempOutputFile('Germline_LogR.txt', 'sample_id'),
        managed.TempOutputFile('Germline_BAF.txt', 'sample_id'),
        config,
    )
    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id', ),
        ctx=dict(mem=20),
        func=tasks.prepare_tumour_data,
        args=tumour_job_args,
    )

    return workflow
def build_index(
        index_sentinel_file,
        ref_genome_fasta_file,
        transcript_gtf_file,
        overhang=100,
        num_threads=1):
    """Run STAR in ``genomeGenerate`` mode and mark completion with a sentinel file.

    The genome index is written to the directory that contains
    ``index_sentinel_file``; the empty sentinel is created after
    ``pypeliner.commandline.execute`` has returned.

    Args:
        index_sentinel_file: Path of the marker file; its parent directory is
            passed to STAR as ``--genomeDir``.
        ref_genome_fasta_file: Passed to STAR as ``--genomeFastaFiles``.
        transcript_gtf_file: Passed to STAR as ``--sjdbGTFfile``.
        overhang: Passed to STAR as ``--sjdbOverhang``.
        num_threads: Passed to STAR as ``--runThreadN``.
    """
    make_parent_directory(index_sentinel_file)

    genome_dir = os.path.dirname(index_sentinel_file)

    # (flag, value) pairs in the order they appear on the command line.
    star_options = (
        ('--runMode', 'genomeGenerate'),
        ('--runThreadN', num_threads),
        ('--genomeDir', genome_dir),
        ('--genomeFastaFiles', ref_genome_fasta_file),
        ('--sjdbGTFfile', transcript_gtf_file),
        ('--sjdbOverhang', overhang),
    )

    star_cmd = ['STAR']
    for flag, value in star_options:
        star_cmd.extend((flag, value))

    pypeliner.commandline.execute(*star_cmd)

    # Touch the sentinel last.
    sentinel = open(index_sentinel_file, 'w')
    sentinel.close()
def call_and_annotate_pipeline(
    config,
    bam_files,
    raw_data_dir,
    results_file,
    normal_id=None,
    somatic_breakpoint_file=None,
    patient_config=None,
):
    """Build the copy-number calling workflow for one set of BAM files.

    Seqdata is first extracted from every BAM. Then, for each of ``remixt``,
    ``titan``, ``clonehd`` and ``theta`` present as a key in ``config``, the
    corresponding sub-workflow is registered with its own
    ``<raw_data_dir>/<tool>/results.h5`` output. A final ``merge_results`` job
    passes all per-tool results and ``results_file`` to
    ``hdf5_tasks.merge_hdf5``.

    NOTE: the seqdata extraction step reads ``config['remixt']``
    unconditionally, so a ``remixt`` section is required even if only other
    tools are wanted.

    Args:
        config: Mapping with one section per tool. ``config['remixt']`` must
            provide ``ref_data_dir`` (and ``config`` when ReMixT is run);
            other tool sections provide ``config`` (and ``kwargs.num_clones``
            for theta).
        bam_files: Mapping of sample id to BAM path.
        raw_data_dir: Root directory for intermediate and per-tool files.
        results_file: Path of the merged output file.
        normal_id: Optional key of the normal sample in ``bam_files``; it is
            excluded from the ``tumour_id`` axis and forwarded to each tool.
        somatic_breakpoint_file: Optional path; wrapped as a managed input
            file and forwarded to each tool when given.
        patient_config: Optional mapping merged into the ReMixT config
            (in place). Skipped when None.

    Returns:
        The populated ``Workflow`` object.

    Raises:
        KeyError: If ``config`` has no ``remixt`` section.
        ValueError: If ``normal_id`` is given but is not a key of ``bam_files``.
    """
    # Materialise as independent lists: dict views on Python 3 have no
    # .remove(), and the two axes must not share one list object.
    sample_ids = list(bam_files.keys())
    tumour_ids = list(bam_files.keys())
    if normal_id is not None:
        tumour_ids.remove(normal_id)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=sample_ids,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_id'),
        value=tumour_ids,
    )

    seq_data_template = os.path.join(raw_data_dir, 'seqdata', 'sample_{sample_id}.h5')

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.subworkflow(
        name='extract_seqdata_workflow',
        axes=('sample_id', ),
        func=remixt.workflow.create_extract_seqdata_workflow,
        args=(
            pypeliner.managed.InputFile('bam', 'sample_id', fnames=bam_files),
            pypeliner.managed.OutputFile('seqdata', 'sample_id', template=seq_data_template),
            config['remixt'].get('extract_seqdata', {}),
            config['remixt']['ref_data_dir'],
        ),
    )

    # Maps a table path in the merged file to the per-tool results file.
    merge_inputs = {}

    if 'remixt' in config:
        remixt_raw_data = os.path.join(raw_data_dir, 'remixt')
        remixt_results_filename = os.path.join(remixt_raw_data, 'results.h5')
        make_parent_directory(remixt_results_filename)

        remixt_config = config['remixt']['config']
        assert 'sample_specific' not in remixt_config
        # patient_config defaults to None; dict.update(None) raises TypeError.
        if patient_config is not None:
            remixt_config.update(patient_config)

        workflow.subworkflow(
            name='remixt',
            func=biowrappers.components.copy_number_calling.remixt.create_remixt_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata', 'sample_id', template=seq_data_template),
                remixt_config,
                pypeliner.managed.OutputFile(remixt_results_filename),
                remixt_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'ref_data_dir': config['remixt']['ref_data_dir'],
                'normal_id': normal_id,
            },
        )

        merge_inputs['/copy_number/remixt'] = pypeliner.managed.InputFile(remixt_results_filename)

    if 'titan' in config:
        titan_raw_data = os.path.join(raw_data_dir, 'titan')
        titan_results_filename = os.path.join(titan_raw_data, 'results.h5')
        make_parent_directory(titan_results_filename)

        workflow.subworkflow(
            name='titan',
            func=biowrappers.components.copy_number_calling.titan.create_titan_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata', 'sample_id', template=seq_data_template),
                config['titan']['config'],
                pypeliner.managed.OutputFile(titan_results_filename),
                titan_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'normal_id': normal_id,
            },
        )

        merge_inputs['/copy_number/titan'] = pypeliner.managed.InputFile(titan_results_filename)

    if 'clonehd' in config:
        clonehd_raw_data = os.path.join(raw_data_dir, 'clonehd')
        clonehd_results_filename = os.path.join(clonehd_raw_data, 'results.h5')
        make_parent_directory(clonehd_results_filename)

        workflow.subworkflow(
            name='clonehd',
            func=biowrappers.components.copy_number_calling.clonehd.create_clonehd_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata', 'sample_id', template=seq_data_template),
                config['clonehd']['config'],
                pypeliner.managed.OutputFile(clonehd_results_filename),
                clonehd_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'normal_id': normal_id,
            },
        )

        merge_inputs['/copy_number/clonehd'] = pypeliner.managed.InputFile(clonehd_results_filename)

    if 'theta' in config:
        theta_raw_data = os.path.join(raw_data_dir, 'theta')
        theta_results_filename = os.path.join(theta_raw_data, 'results.h5')
        make_parent_directory(theta_results_filename)

        workflow.subworkflow(
            name='theta',
            func=biowrappers.components.copy_number_calling.theta.create_theta_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata', 'sample_id', template=seq_data_template),
                config['theta']['config'],
                pypeliner.managed.OutputFile(theta_results_filename),
                theta_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'normal_id': normal_id,
                'num_clones': config['theta']['kwargs']['num_clones'],
            },
        )

        merge_inputs['/copy_number/theta'] = pypeliner.managed.InputFile(theta_results_filename)

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            merge_inputs,
            pypeliner.managed.OutputFile(results_file),
        ),
    )

    return workflow
def create_theta_workflow(seqdata_files, config, out_file, raw_data_dir, somatic_breakpoint_file=None, normal_id=None, **kwargs):
    """Build the workflow that runs BIC-seq2 segmentation and THetA per tumour sample.

    Every entry of ``seqdata_files`` other than ``normal_id`` is treated as a
    tumour sample. Three jobs are registered: ``run_bicseq2`` and
    ``run_theta`` (each once per tumour sample) and a single
    ``merge_results`` job that passes the per-sample results and ``out_file``
    to ``hdf5_tasks.merge_hdf5``.

    Args:
        seqdata_files: Mapping of sample id to seqdata file path; must contain
            ``normal_id``.
        config: Passed unchanged to both task functions.
        out_file: Path of the merged output file.
        raw_data_dir: Directory under which ``results/`` and ``bicseq2/``
            per-sample files are placed.
        somatic_breakpoint_file: Optional path; when given it is wrapped as a
            managed input file and passed to ``run_theta`` as
            ``breakpoints_filename``.
        normal_id: Key of the normal sample in ``seqdata_files``. Required.
        **kwargs: Only ``num_clones`` is read (defaults to None).

    Returns:
        The populated ``Workflow`` object.

    Raises:
        ValueError: If ``normal_id`` is None.
    """
    if normal_id is None:
        raise ValueError('Theta requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_template = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    bicseq2_seg_template = os.path.join(raw_data_dir, 'bicseq2', 'bicseq2_{sample_id}.seg')
    for path_template in (results_template, bicseq2_seg_template):
        utils.make_parent_directory(path_template)

    managed = pypeliner.managed

    def normal_input():
        # Fresh managed object per job, as each job declares its own inputs.
        return managed.InputFile('normal_seqdata', template=normal_seqdata_file)

    def tumour_input():
        return managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files)

    workflow = Workflow()

    # The 'sample_id' axis ranges over tumour samples only.
    workflow.setobj(
        obj=managed.OutputChunks('sample_id'),
        value=tumour_seqdata_files.keys(),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = managed.InputFile(somatic_breakpoint_file)

    bicseq2_args = (
        managed.OutputFile('bicseq2_seg', 'sample_id', template=bicseq2_seg_template),
        normal_input(),
        tumour_input(),
        config,
        managed.TempSpace('bicseq2_work', 'sample_id', cleanup=None),
    )
    workflow.transform(
        name='run_bicseq2',
        axes=('sample_id', ),
        ctx=dict(mem=30),
        func=tasks.run_bicseq2_seg,
        args=bicseq2_args,
    )

    theta_args = (
        managed.OutputFile('results', 'sample_id', template=results_template),
        normal_input(),
        tumour_input(),
        managed.InputFile('bicseq2_seg', 'sample_id', template=bicseq2_seg_template),
        config,
        managed.TempSpace('theta_work', 'sample_id', cleanup=None),
    )
    theta_kwargs = {
        'breakpoints_filename': somatic_breakpoint_file,
        'num_clones': kwargs.get('num_clones', None),
    }
    workflow.transform(
        name='run_theta',
        axes=('sample_id', ),
        ctx=dict(mem=32),
        func=tasks.run_theta,
        args=theta_args,
        kwargs=theta_kwargs,
    )

    merge_args = (
        managed.InputFile('results', 'sample_id', template=results_template),
        managed.OutputFile(out_file),
    )
    workflow.transform(
        name='merge_results',
        ctx=dict(mem=8),
        func=hdf5_tasks.merge_hdf5,
        args=merge_args,
        kwargs={'table_names': '/sample_{}'},
    )

    return workflow
def call_and_annotate_pipeline(
    config,
    normal_bam_path,
    tumour_bam_paths,
    raw_data_dir,
    results_file,
):
    """Build the breakpoint calling workflow for one normal and several tumour BAMs.

    For each of ``destruct``, ``delly`` and ``lumpysv`` present as a key in
    ``config``, the corresponding pipeline is registered as a sub-workflow
    writing to ``<raw_data_dir>/<tool>/results.h5``. A final
    ``merge_results`` job passes all per-tool results and ``results_file`` to
    ``hdf5_tasks.merge_hdf5``.

    Args:
        config: Mapping with one optional section per tool. ``destruct``
            needs ``config`` and ``ref_data_dir``; ``delly`` needs
            ``ref_genome_fasta_file`` and ``exclude_file``; ``lumpysv`` reads
            nothing from its section.
        normal_bam_path: Path of the normal BAM.
        tumour_bam_paths: Mapping of tumour sample id to BAM path.
        raw_data_dir: Root directory for per-tool files.
        results_file: Path of the merged output file.

    Returns:
        The populated ``Workflow`` object.
    """
    managed = pypeliner.managed

    def tool_paths(tool_name):
        # Per-tool working directory and results file; the directory is created.
        tool_dir = os.path.join(raw_data_dir, tool_name)
        tool_results = os.path.join(tool_dir, 'results.h5')
        make_parent_directory(tool_results)
        return tool_dir, tool_results

    def bam_inputs():
        # Leading (normal, tumours) arguments shared by every tool pipeline.
        return (
            managed.InputFile(normal_bam_path),
            managed.InputFile('tumour_bams', 'tumour_sample_id', fnames=tumour_bam_paths),
        )

    workflow = Workflow()

    workflow.setobj(
        obj=managed.OutputChunks('tumour_sample_id'),
        value=tumour_bam_paths.keys(),
    )

    # Maps a table path in the merged file to the per-tool results file.
    merge_inputs = {}

    if 'destruct' in config:
        destruct_dir, destruct_results = tool_paths('destruct')

        workflow.subworkflow(
            name='destruct',
            func=destruct.destruct_pipeline,
            args=bam_inputs() + (
                config['destruct']['config'],
                config['destruct']['ref_data_dir'],
                managed.OutputFile(destruct_results),
                destruct_dir,
            ),
        )

        merge_inputs['/breakpoints/destruct'] = managed.InputFile(destruct_results)

    if 'delly' in config:
        delly_dir, delly_results = tool_paths('delly')

        workflow.subworkflow(
            name='delly',
            func=delly.delly_pipeline,
            args=bam_inputs() + (
                config['delly']['ref_genome_fasta_file'],
                config['delly']['exclude_file'],
                managed.OutputFile(delly_results),
                delly_dir,
            ),
        )

        merge_inputs['/breakpoints/delly'] = managed.InputFile(delly_results)

    if 'lumpysv' in config:
        lumpysv_dir, lumpysv_results = tool_paths('lumpysv')

        workflow.subworkflow(
            name='lumpysv',
            func=lumpysv.lumpysv_pipeline,
            args=bam_inputs() + (
                managed.OutputFile(lumpysv_results),
                lumpysv_dir,
            ),
        )

        merge_inputs['/breakpoints/lumpysv'] = managed.InputFile(lumpysv_results)

    workflow.transform(
        name='merge_results',
        ctx=dict(mem=8),
        func=hdf5_tasks.merge_hdf5,
        args=(
            merge_inputs,
            managed.OutputFile(results_file),
        ),
    )

    return workflow
def create_remixt_workflow(
    seqdata_files,
    config,
    out_file,
    raw_data_dir,
    ref_data_dir=None,
    somatic_breakpoint_file=None,
    normal_id=None,
):
    """Build the workflow that runs ReMixT, selects a solution per tumour, and merges.

    Three steps are registered: the ``remixt`` sub-workflow
    (``remixt.workflow.create_remixt_seqdata_workflow``), a per-tumour
    ``select_solution`` job, and a single ``merge_results`` job that passes
    the selected per-tumour files and ``out_file`` to
    ``hdf5_tasks.merge_hdf5``.

    Args:
        seqdata_files: Mapping of sample id to seqdata file path.
        config: Passed to the ReMixT sub-workflow and to ``select_solution``.
        out_file: Path of the merged output file.
        raw_data_dir: Directory under which ``results/`` and ``selected/``
            per-tumour files are placed; also passed to the sub-workflow.
        ref_data_dir: Reference data directory. Required.
        somatic_breakpoint_file: Breakpoint file path. Required.
        normal_id: Optional key of the normal sample; it is excluded from the
            ``tumour_id`` axis and forwarded to the sub-workflow.

    Returns:
        The populated ``Workflow`` object.

    Raises:
        ValueError: If ``somatic_breakpoint_file`` or ``ref_data_dir`` is
            None, or if ``normal_id`` is given but is not a key of
            ``seqdata_files``.
    """
    if somatic_breakpoint_file is None:
        raise ValueError('somatic breakpoints required')

    if ref_data_dir is None:
        raise ValueError('ref data directory required')

    # Materialise as independent lists: dict views on Python 3 have no
    # .remove(), and the two axes must not share one list object.
    sample_ids = list(seqdata_files.keys())
    tumour_ids = list(seqdata_files.keys())
    if normal_id is not None:
        tumour_ids.remove(normal_id)

    results_files = os.path.join(raw_data_dir, 'results', 'sample_{tumour_id}.h5')
    selected_files = os.path.join(raw_data_dir, 'selected', 'sample_{tumour_id}.h5')
    utils.make_parent_directory(results_files)
    utils.make_parent_directory(selected_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=sample_ids,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_id'),
        value=tumour_ids,
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.workflow.create_remixt_seqdata_workflow,
        args=(
            pypeliner.managed.InputFile(somatic_breakpoint_file),
            pypeliner.managed.InputFile('seqdata', 'sample_id', fnames=seqdata_files),
            pypeliner.managed.OutputFile('results', 'tumour_id', template=results_files, axes_origin=[]),
            raw_data_dir,
            config,
            ref_data_dir,
        ),
        kwargs={
            'normal_id': normal_id,
        },
    )

    workflow.transform(
        name='select_solution',
        ctx={
            'mem': 2,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.select_solution,
        axes=('tumour_id', ),
        args=(
            pypeliner.managed.OutputFile('selected', 'tumour_id', template=selected_files),
            pypeliner.managed.InputFile('results', 'tumour_id', template=results_files),
            config,
        ),
    )

    workflow.transform(
        name='merge_results',
        ctx={
            'mem': 8,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('selected', 'tumour_id', template=selected_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
def create_titan_workflow(seqdata_files, config, out_file, raw_data_dir, somatic_breakpoint_file=None, normal_id=None, **kwargs):
    """Build the TITAN workflow: prepare data, run per initialisation, select, plot, merge.

    Every entry of ``seqdata_files`` other than ``normal_id`` is treated as a
    tumour sample. Jobs are registered in this order:

    * ``prepare_normal_data`` -- once, on the normal sample.
    * ``prepare_tumour_data`` -- per tumour sample.
    * ``create_intialization_parameters`` -- per tumour sample; its return
      value defines the ``init_param_id`` axis.
    * ``run_titan`` -- per (sample, init_param_id).
    * ``select_solution`` -- per tumour sample; writes the per-sample results
      file and four TSV files under ``<raw_data_dir>/output``.
    * ``plot_chromosome`` -- runs ``plot_titan_chromosome.R`` per
      (sample, chromosome).
    * ``merge_results`` -- passes the per-sample results and ``out_file`` to
      ``hdf5_tasks.merge_hdf5``.

    Args:
        seqdata_files: Mapping of sample id to seqdata file path; must contain
            ``normal_id``.
        config: Passed to the task functions; its optional ``chromosomes``
            entry sets the chromosome axis (falls back to
            ``default_chromosomes``).
        out_file: Path of the merged output file.
        raw_data_dir: Directory under which ``results/`` and ``output/`` files
            are placed.
        somatic_breakpoint_file: Optional path; when given it is wrapped as a
            managed input file and passed to ``select_solution`` as
            ``breakpoints_filename``.
        normal_id: Key of the normal sample in ``seqdata_files``. Required.
        **kwargs: Unused in this function.

    Returns:
        The populated ``Workflow`` object.

    Raises:
        ValueError: If ``normal_id`` is None.
    """
    if normal_id is None:
        raise ValueError('Titan requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_template = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_template)

    # Per-sample files exchanged between select_solution and plot_chromosome.
    output_dir = os.path.join(raw_data_dir, 'output')
    loci_template = os.path.join(output_dir, '{sample_id}_cn_loci.tsv')
    segments_template = os.path.join(output_dir, '{sample_id}_cn_segments.tsv')
    igv_template = os.path.join(output_dir, '{sample_id}_cn_igv.tsv')
    params_template = os.path.join(output_dir, '{sample_id}_params.tsv')
    chrom_plot_template = os.path.join(output_dir, '{sample_id}_chr_{chromosome}.png')

    managed = pypeliner.managed

    def retry_ctx(mem, increment):
        # Job context with three retries and a per-retry memory increment.
        return {'mem': mem, 'num_retry': 3, 'mem_retry_increment': increment}

    workflow = Workflow()

    # The 'sample_id' axis ranges over tumour samples only.
    workflow.setobj(
        obj=managed.OutputChunks('sample_id'),
        value=tumour_seqdata_files.keys(),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx=retry_ctx(16, 4),
        func=tasks.prepare_normal_data,
        args=(
            managed.InputFile(normal_seqdata_file),
            managed.TempOutputFile('normal.wig'),
            managed.TempOutputFile('het_positions.tsv'),
            config,
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id', ),
        ctx={'mem': 20},
        func=tasks.prepare_tumour_data,
        args=(
            managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            managed.TempInputFile('het_positions.tsv'),
            managed.TempOutputFile('tumour.wig', 'sample_id'),
            managed.TempOutputFile('tumour_alleles.tsv', 'sample_id'),
            config,
        ),
    )

    workflow.transform(
        name='create_intialization_parameters',
        axes=('sample_id', ),
        ctx=retry_ctx(4, 2),
        func=tasks.create_intialization_parameters,
        ret=managed.TempOutputObj('init_params', 'sample_id', 'init_param_id'),
        args=(config, ),
    )

    workflow.transform(
        name='run_titan',
        axes=('sample_id', 'init_param_id'),
        ctx=retry_ctx(16, 4),
        func=tasks.run_titan,
        args=(
            managed.TempInputObj('init_params', 'sample_id', 'init_param_id'),
            managed.TempInputFile('normal.wig'),
            managed.TempInputFile('tumour.wig', 'sample_id'),
            managed.TempInputFile('tumour_alleles.tsv', 'sample_id'),
            managed.TempOutputFile('cn.tsv', 'sample_id', 'init_param_id'),
            managed.TempOutputFile('params.tsv', 'sample_id', 'init_param_id'),
            config,
        ),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = managed.InputFile(somatic_breakpoint_file)

    select_args = (
        managed.TempInputObj('init_params', 'sample_id', 'init_param_id'),
        managed.TempInputFile('cn.tsv', 'sample_id', 'init_param_id'),
        managed.TempInputFile('params.tsv', 'sample_id', 'init_param_id'),
        managed.OutputFile('results', 'sample_id', template=results_template),
        managed.OutputFile(loci_template, 'sample_id'),
        managed.OutputFile(segments_template, 'sample_id'),
        managed.OutputFile(igv_template, 'sample_id'),
        managed.OutputFile(params_template, 'sample_id'),
        config,
        managed.Template('{sample_id}', 'sample_id'),
    )
    workflow.transform(
        name='select_solution',
        axes=('sample_id', ),
        ctx=retry_ctx(4, 2),
        func=tasks.select_solution,
        args=select_args,
        kwargs={'breakpoints_filename': somatic_breakpoint_file},
    )

    # Chromosome axis nested under each sample.
    workflow.setobj(
        obj=managed.OutputChunks('sample_id', 'chromosome'),
        value=config.get('chromosomes', default_chromosomes),
        axes=('sample_id', ),
    )

    plot_args = (
        'plot_titan_chromosome.R',
        managed.Instance('chromosome'),
        managed.InputFile(loci_template, 'sample_id'),
        managed.InputFile(params_template, 'sample_id'),
        managed.OutputFile(chrom_plot_template, 'sample_id', 'chromosome'),
    )
    workflow.commandline(
        name='plot_chromosome',
        axes=('sample_id', 'chromosome'),
        ctx=retry_ctx(4, 2),
        args=plot_args,
    )

    workflow.transform(
        name='merge_results',
        ctx=retry_ctx(8, 2),
        func=hdf5_tasks.merge_hdf5,
        args=(
            managed.InputFile('results', 'sample_id', template=results_template),
            managed.OutputFile(out_file),
        ),
        kwargs={'table_names': '/sample_{}'},
    )

    return workflow