import pypeliner
import pypeliner.managed as mgd


def create_merge_bams_workflow(
        input_bams,
        merged_bams,
        regions,
        config,
):
    merged_bams = {region: merged_bams[region] for region in regions}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(input_bams.keys()),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    if config["one_split_job"]:
        workflow.transform(
            name='merge_bams',
            ctx={'mem': config['memory']['med'], 'ncpus': config['max_cores']},
            func="single_cell.workflows.merge_bams.tasks.merge_bams",
            args=(
                mgd.InputFile('bam', 'cell_id', fnames=input_bams, extensions=['.bai']),
                mgd.OutputFile('merged.bam', 'region', fnames=merged_bams, axes_origin=[], extensions=['.bai']),
                regions,
                mgd.TempSpace("merge_bams_tempdir"),
            ),
            kwargs={"ncores": config["max_cores"]},
        )
    else:
        workflow.transform(
            name='split_merge_tumour',
            func='single_cell.workflows.merge_bams.tasks.cell_region_merge_bams',
            axes=('region',),
            args=(
                mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=input_bams),
                mgd.OutputFile(
                    'tumour_regions.bam', 'region', axes_origin=[],
                    extensions=['.bai'], fnames=merged_bams),
                mgd.Instance('region'),
            ),
        )

    return workflow
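# Usage sketch (illustrative, not part of the original source): the only config
# keys create_merge_bams_workflow reads are 'one_split_job', 'memory' (its 'med'
# entry) and 'max_cores'; input_bams maps cell_id to a bam path and merged_bams
# maps region to the merged output path. All names and values below are
# placeholder assumptions.
def _example_merge_bams_usage():
    regions = ['1-1-100000000', '2-1-100000000']
    config = {'one_split_job': True, 'memory': {'med': 6}, 'max_cores': 8}
    input_bams = {'SA001-A90554A-R01-C01': '/path/to/cell1.bam'}
    merged_bams = {region: '/path/to/merged_{}.bam'.format(region) for region in regions}
    return create_merge_bams_workflow(input_bams, merged_bams, regions, config)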
def create_cell_region_merge_workflow(
        cell_bams,
        region_bams,
        regions,
        docker_image,
):
    region_bams = {region: region_bams[region] for region in regions}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(cell_bams.keys()),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    workflow.transform(
        name='split_merge_tumour',
        func='single_cell.workflows.merge_bams.tasks.cell_region_merge_bams',
        axes=('region',),
        args=(
            mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=cell_bams),
            mgd.OutputFile('tumour_regions.bam', 'region', axes_origin=[], extensions=['.bai'], fnames=region_bams),
            mgd.Instance('region'),
            docker_image,
        ),
    )

    return workflow
def create_delly_wrapper_workflow(
        bam_filenames,
        output_filename,
        raw_data_dir,
        control_id=None,
        ref_genome_fasta_file=None,
        delly_excl_chrom=None,
):
    bams = list()
    for lib_id, bam_filename in bam_filenames.items():
        bams.append(destruct.benchmark.wrappers.utils.symlink(
            bam_filename, link_name='{0}.bam'.format(lib_id), link_directory=raw_data_dir))
        destruct.benchmark.wrappers.utils.symlink(
            bam_filename + '.bai', link_name='{0}.bam.bai'.format(lib_id), link_directory=raw_data_dir)

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='get_sv_types',
        func=destruct.benchmark.wrappers.delly.tasks.get_sv_types,
        ret=pypeliner.managed.OutputChunks('sv_type'),
        args=(
            mgd.InputFile(ref_genome_fasta_file),
        ),
    )

    workflow.transform(
        name='delly_call',
        axes=('sv_type',),
        ctx={'mem': 64, 'num_retry': 2, 'mem_retry_factor': 2},
        func=destruct.benchmark.wrappers.delly.tasks.run_delly_call,
        args=(
            mgd.Instance('sv_type'),
            delly_excl_chrom,
            ref_genome_fasta_file,
            [mgd.InputFile(bam) for bam in bams],
            mgd.TempOutputFile('out.bcf', 'sv_type'),
        ),
    )

    if control_id is None:
        concat_input = mgd.TempInputFile('out.bcf', 'sv_type')
    else:
        workflow.transform(
            name='delly_filter_somatic',
            axes=('sv_type',),
            ctx={'mem': 4, 'num_retry': 2, 'mem_retry_factor': 2},
            func=destruct.benchmark.wrappers.delly.tasks.run_delly_filter,
            args=(
                mgd.Instance('sv_type'),
                bam_filenames.keys(),
                control_id,
                mgd.TempSpace('samples.tsv'),
                ref_genome_fasta_file,
                mgd.TempInputFile('out.bcf', 'sv_type'),
                mgd.TempOutputFile('somatic.bcf', 'sv_type'),
            ),
        )
        concat_input = mgd.TempInputFile('somatic.bcf', 'sv_type')

    workflow.transform(
        name='concatenate_vcf',
        func=destruct.benchmark.wrappers.tasks.concatenate_bcf,
        ctx={'mem': 4, 'num_retry': 2, 'mem_retry_factor': 2},
        args=(
            concat_input,
            mgd.TempOutputFile('somatic.bcf'),
        ),
    )

    workflow.transform(
        name='convert_vcf',
        func=destruct.benchmark.wrappers.delly.tasks.convert_vcf,
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            mgd.TempInputFile('somatic.bcf'),
            mgd.OutputFile(output_filename),
        ),
        kwargs={
            'control_id': control_id,
        },
    )

    return workflow
def create_snv_allele_counts_for_vcf_targets_workflow(
        config,
        bam_files,
        vcf_file,
        out_file,
        docker_config=None,
        chromosomes=default_chromosomes,
        count_duplicates=False,
        min_bqual=0,
        min_mqual=0,
        split_size=int(1e7),
        table_name='snv_allele_counts',
        vcf_to_bam_chrom_map=None,
):
    ctx = {
        'mem': 2,
        'num_retry': 3,
        'mem_retry_increment': 2,
        'pool_id': config['pools']['standard'],
        'ncpus': 1,
    }
    if docker_config:
        ctx.update(docker_config)

    workflow = pypeliner.workflow.Workflow(default_ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=bam_files.keys(),
    )

    workflow.transform(
        name='get_snv_allele_counts_for_vcf_targets',
        axes=('cell_id',),
        func="biowrappers.components.variant_calling.snv_allele_counts.tasks.get_snv_allele_counts_for_vcf_targets",
        args=(
            mgd.InputFile('tumour.bam', 'cell_id', fnames=bam_files),
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('counts.h5', 'cell_id'),
            table_name,
        ),
        kwargs={
            'count_duplicates': count_duplicates,
            'min_bqual': min_bqual,
            'min_mqual': min_mqual,
            'vcf_to_bam_chrom_map': vcf_to_bam_chrom_map,
            'cell_id': mgd.Instance('cell_id'),
            'report_zero_count_positions': False,
        },
    )

    workflow.transform(
        name='merge_snv_allele_counts',
        ctx={
            'mem': config["memory"]['high'],
            'pool_id': config['pools']['highmem'],
            'ncpus': 1,
        },
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        args=(
            mgd.TempInputFile('counts.h5', 'cell_id'),
            mgd.OutputFile(out_file),
        ),
        kwargs={
            'in_memory': False,
        },
    )

    return workflow
def destruct_multi_sample_workflow(
        normal_bam,
        tumour_bam_files,
        destruct_config,
        config,
        destruct_ref_data_dir,
        breakpoints_csv,
        breakpoints_library_csv,
        cell_counts_csv,
        raw_data_dir,
        normal_sample_id='normal',
):
    ctx = {'docker_image': config['docker']['destruct']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_bam_files.keys()),
    )

    keys = [(sample_id, library_id) for (sample_id, library_id, _) in list(tumour_bam_files.keys())]
    keys = sorted(set(keys))

    breakpoints_csv = {key: breakpoints_csv(*key) for key in keys}
    breakpoints_library_csv = {key: breakpoints_library_csv(*key) for key in keys}
    cell_counts_csv = {key: cell_counts_csv(*key) for key in keys}

    workflow.set_filenames('tumour_cells.bam', 'sample_id', 'library_id', 'cell_id', fnames=tumour_bam_files)
    workflow.set_filenames('breakpoints.csv', 'sample_id', 'library_id', fnames=breakpoints_csv)
    workflow.set_filenames('breakpoints_library.csv', 'sample_id', 'library_id', fnames=breakpoints_library_csv)
    workflow.set_filenames('cell_counts.csv', 'sample_id', 'library_id', fnames=cell_counts_csv)

    workflow.subworkflow(
        name='normal_preprocess_destruct',
        func='single_cell.workflows.destruct_singlecell.destruct_preprocess_workflow',
        args=(
            normal_bam,
            mgd.TempOutputFile('normal_stats'),
            mgd.TempOutputFile('normal_reads_1.fastq.gz'),
            mgd.TempOutputFile('normal_reads_2.fastq.gz'),
            mgd.TempOutputFile('normal_sample_1.fastq.gz'),
            mgd.TempOutputFile('normal_sample_2.fastq.gz'),
            destruct_ref_data_dir,
            destruct_config,
        ),
    )

    workflow.subworkflow(
        name='tumour_preprocess_destruct',
        func='single_cell.workflows.destruct_singlecell.destruct_preprocess_workflow',
        axes=('sample_id', 'library_id'),
        args=(
            mgd.InputFile('tumour_cells.bam', 'sample_id', 'library_id', 'cell_id', extensions=['.bai']),
            mgd.TempOutputFile('tumour_stats', 'sample_id', 'library_id'),
            mgd.TempOutputFile('tumour_reads_1.fastq.gz', 'sample_id', 'library_id'),
            mgd.TempOutputFile('tumour_reads_2.fastq.gz', 'sample_id', 'library_id'),
            mgd.TempOutputFile('tumour_sample_1.fastq.gz', 'sample_id', 'library_id'),
            mgd.TempOutputFile('tumour_sample_2.fastq.gz', 'sample_id', 'library_id'),
            destruct_ref_data_dir,
            destruct_config,
        ),
        kwargs={'tag': True},
    )

    workflow.subworkflow(
        name='run_destruct',
        func='single_cell.workflows.destruct_singlecell.create_destruct_workflow',
        axes=('sample_id', 'library_id'),
        args=(
            mgd.TempInputFile('normal_stats'),
            mgd.TempInputFile('normal_reads_1.fastq.gz'),
            mgd.TempInputFile('normal_reads_2.fastq.gz'),
            mgd.TempInputFile('normal_sample_1.fastq.gz'),
            mgd.TempInputFile('normal_sample_2.fastq.gz'),
            mgd.TempInputFile('tumour_stats', 'sample_id', 'library_id'),
            mgd.TempInputFile('tumour_reads_1.fastq.gz', 'sample_id', 'library_id'),
            mgd.TempInputFile('tumour_reads_2.fastq.gz', 'sample_id', 'library_id'),
            mgd.TempInputFile('tumour_sample_1.fastq.gz', 'sample_id', 'library_id'),
            mgd.TempInputFile('tumour_sample_2.fastq.gz', 'sample_id', 'library_id'),
            destruct_config,
            destruct_ref_data_dir,
            mgd.OutputFile('breakpoints.csv', 'sample_id', 'library_id'),
            mgd.OutputFile('breakpoints_library.csv', 'sample_id', 'library_id'),
            mgd.OutputFile('cell_counts.csv', 'sample_id', 'library_id'),
            mgd.Template(raw_data_dir, 'sample_id', 'library_id'),
        ),
        kwargs={
            'tumour_sample_id': mgd.Instance('sample_id'),
            'tumour_library_id': mgd.Instance('library_id'),
            'normal_sample_id': normal_sample_id,
        },
    )

    return workflow
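# Usage sketch (illustrative, not part of the original source): the tumour bam
# mapping is keyed by (sample_id, library_id, cell_id) tuples, and
# breakpoints_csv / breakpoints_library_csv / cell_counts_csv are expected to be
# callables mapping (sample_id, library_id) to an output path, since the
# workflow above evaluates e.g. breakpoints_csv(*key) per key. All names and
# paths below are placeholder assumptions.
def _example_destruct_multi_sample_inputs():
    tumour_bam_files = {
        ('SA001', 'A90554A', 'SA001-A90554A-R01-C01'): '/path/to/cell1.bam',
        ('SA001', 'A90554A', 'SA001-A90554A-R01-C02'): '/path/to/cell2.bam',
    }

    def breakpoints_csv(sample_id, library_id):
        return '/results/{}_{}_breakpoints.csv.gz'.format(sample_id, library_id)

    return tumour_bam_files, breakpoints_csv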
def create_snv_allele_counts_for_vcf_targets_workflow(
        bam_files,
        vcf_file,
        out_file,
        memory_cfg,
        count_duplicates=False,
        min_bqual=0,
        min_mqual=0,
        table_name='snv_allele_counts',
        vcf_to_bam_chrom_map=None,
):
    ctx = {
        'mem': memory_cfg['low'],
        'num_retry': 3,
        'mem_retry_increment': 2,
        'ncpus': 1,
        'disk_retry_increment': 50,
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name='get_snv_allele_counts_for_vcf_targets',
        axes=('sample_id', 'library_id', 'cell_id'),
        func="biowrappers.components.variant_calling.snv_allele_counts.tasks.get_snv_allele_counts_for_vcf_targets",
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', 'library_id', 'cell_id', fnames=bam_files, extensions=['.bai']),
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('counts.h5', 'sample_id', 'library_id', 'cell_id'),
            table_name,
        ),
        kwargs={
            'count_duplicates': count_duplicates,
            'min_bqual': min_bqual,
            'min_mqual': min_mqual,
            'vcf_to_bam_chrom_map': vcf_to_bam_chrom_map,
            'cell_id': mgd.Instance('cell_id'),
            'sample_id': mgd.Instance('sample_id'),
            'library_id': mgd.Instance('library_id'),
            'report_zero_count_positions': False,
        },
    )

    workflow.transform(
        name='merge_snv_allele_counts',
        ctx={
            'mem': memory_cfg['high'],
            'disk': 20,
        },
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        args=(
            mgd.TempInputFile('counts.h5', 'sample_id', 'library_id', 'cell_id'),
            mgd.TempOutputFile('merged_counts.h5'),
        ),
        kwargs={
            'in_memory': False,
        },
    )

    workflow.transform(
        name='convert_h5_to_csv',
        func='single_cell.utils.hdfutils.convert_hdf_to_csv',
        args=(
            mgd.TempInputFile('merged_counts.h5'),
            {
                '/snv_allele_counts': mgd.OutputFile(out_file, extensions=['.yaml']),
            },
        ),
    )

    return workflow
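# Usage sketch (illustrative assumptions, not part of the original source): this
# variant of the workflow reads memory_cfg['low'] and memory_cfg['high'] for its
# job contexts, and expects bam_files keyed by (sample_id, library_id, cell_id)
# tuples, matching the OutputChunks axes set above. Sample names and paths are
# placeholders.
def _example_snv_allele_counts_inputs():
    memory_cfg = {'low': 4, 'high': 16}  # these values feed the 'mem' ctx entries
    bam_files = {
        ('SA001', 'A90554A', 'SA001-A90554A-R01-C01'): '/path/to/cell1.bam',
    }
    return memory_cfg, bam_files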
def delly_pipeline(
        normal_bam_file,
        tumour_bam_files,
        ref_genome_fasta_file,
        delly_excl_chrom,
        out_file,
        raw_data_dir,
):
    bams = list()
    for lib_id, bam_filename in tumour_bam_files.items():
        bams.append(utils.symlink(
            bam_filename, link_name='{0}.bam'.format(lib_id), link_directory=raw_data_dir))
        utils.symlink(
            bam_filename + '.bai', link_name='{0}.bam.bai'.format(lib_id), link_directory=raw_data_dir)

    bams.append(utils.symlink(
        normal_bam_file, link_name='Normal.bam', link_directory=raw_data_dir))
    utils.symlink(
        normal_bam_file + '.bai', link_name='Normal.bam.bai', link_directory=raw_data_dir)

    sample_type = {'Normal': 'control'}
    for lib_id in tumour_bam_files.keys():
        sample_type[lib_id] = 'tumor'

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('sample_type', 'sample_id'),
        value=sample_type,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sv_type'),
        value=('DEL', 'DUP', 'INV', 'TRA', 'INS'),
    )

    workflow.transform(
        name='delly_call',
        axes=('sv_type',),
        ctx={'mem': 64, 'num_retry': 2, 'mem_retry_factor': 2},
        func=tasks.run_delly_call,
        args=(
            mgd.Instance('sv_type'),
            delly_excl_chrom,
            ref_genome_fasta_file,
            [mgd.InputFile(bam) for bam in bams],
            mgd.TempOutputFile('out.bcf', 'sv_type'),
        ),
    )

    workflow.transform(
        name='write_samples_table',
        ctx={'mem': 1},
        func=tasks.write_samples_table,
        args=(
            mgd.TempInputObj('sample_type', 'sample_id'),
            mgd.TempOutputFile('samples.tsv'),
        ),
    )

    workflow.transform(
        name='delly_filter_somatic',
        axes=('sv_type',),
        ctx={'mem': 4, 'num_retry': 2, 'mem_retry_factor': 2},
        func=tasks.run_delly_filter,
        args=(
            mgd.Instance('sv_type'),
            mgd.TempInputFile('samples.tsv'),
            ref_genome_fasta_file,
            mgd.TempInputFile('out.bcf', 'sv_type'),
            mgd.TempOutputFile('somatic.bcf', 'sv_type'),
        ),
    )

    workflow.transform(
        name='concatenate_vcf',
        func=vcf_tasks.concatenate_bcf,
        ctx={'mem': 4, 'num_retry': 2, 'mem_retry_factor': 2},
        args=(
            mgd.TempInputFile('somatic.bcf', 'sv_type'),
            mgd.TempOutputFile('somatic.bcf'),
        ),
    )

    workflow.transform(
        name='convert_vcf',
        func=tasks.convert_vcf,
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            mgd.TempInputFile('somatic.bcf'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
def create_variant_counting_workflow(args):
    """ Count variant reads for multiple sets of variants across cells. """

    vcf_files, tumour_cell_bams, sample_library = inpututils.load_variant_counting_input(
        args['input_yaml'])

    counts_template = '{sample_id}_{library_id}_counts.csv.gz'
    counts_output_template = os.path.join(args['out_dir'], counts_template)

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    config = inpututils.load_config(args)
    config = config['variant_calling']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(
        name='merge_snvs_museq',
        func='single_cell.utils.vcfutils.merge_vcf',
        args=(
            [mgd.InputFile(vcf_file) for vcf_file in vcf_files],
            mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.TempSpace("merge_vcf_temp"),
        ),
    )

    workflow.subworkflow(
        name='count_alleles',
        axes=('sample_id', 'library_id'),
        func='single_cell.workflows.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            mgd.InputFile('tumour_cells.bam', 'sample_id', 'library_id', 'cell_id',
                          extensions=['.bai'], fnames=tumour_cell_bams, axes_origin=[]),
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile('counts.csv.gz', 'sample_id', 'library_id', template=counts_output_template),
            mgd.Instance('sample_id'),
            mgd.Instance('library_id'),
            config['memory'],
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            mgd.Template('counts.csv.gz', 'sample_id', 'library_id', template=counts_output_template),
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'snv_genotyping',
                'counts': {
                    'template': counts_template,
                    'instances': sample_library,
                },
            },
        },
    )

    return workflow