def infer_haps_workflow(args):
    config = helpers.load_config(args)
    config = config['infer_haps']

    baseimage = config['docker']['single_cell_pipeline']
    ctx = dict(
        mem_retry_increment=2, disk_retry_increment=50, ncpus=1,
        docker_image=baseimage)

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    haps_dir = os.path.join(args["out_dir"], "infer_haps")
    haplotypes_filename = os.path.join(haps_dir, "results", "haplotypes.tsv")
    allele_counts_filename = os.path.join(haps_dir, "results", "allele_counts.tsv")

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_wgs = data['tumour_wgs']
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    normal_cells = data['normal_cells']

    # Haplotypes are inferred from the normal sample when --normal is set,
    # otherwise from the tumour; per-cell bams take precedence over bulk WGS.
    if args['normal']:
        bam_file = normal_cells if normal_cells else normal_wgs
    else:
        bam_file = tumour_cells if tumour_cells else tumour_wgs

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )
        bam_file = mgd.InputFile(
            'tumour.bam', 'cell_id', fnames=bam_file, extensions=['.bai'])
    else:
        bam_file = mgd.InputFile(bam_file, extensions=['.bai'])

    workflow.subworkflow(
        name='infer_haps',
        func=infer_haps,
        args=(
            bam_file,
            mgd.OutputFile(haplotypes_filename),
            mgd.OutputFile(allele_counts_filename),
            config,
        ),
        kwargs={'normal': args['normal']},
    )

    return workflow

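# Note: the return value of helpers.load_pseudowgs_input is not defined in this
# module. The sketch below is inferred from how the workflow factories in this
# file consume it; the paths and sample IDs are purely illustrative, and the
# bulk WGS entries may be either a '{region}' template string or a dict keyed
# by region.
#
#   data = {
#       'tumour_wgs': '/data/tumour_{region}.bam',            # template or dict keyed by region
#       'normal_wgs': '/data/normal_{region}.bam',
#       'tumour_cells': {'CELL-001': '/data/CELL-001.bam'},   # per-cell bams keyed by cell_id
#       'normal_cells': None,
#       'tumour_cells_id': 'SAMPLE-T',
#       'normal_wgs_id': 'SAMPLE-N',
#       'normal_cells_id': None,
#   }
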
def merge_bams_workflow(args):
    config = helpers.load_config(args)
    config = config['merge_bams']

    baseimage = config['docker']['single_cell_pipeline']
    ctx = {
        'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1,
        'mem': config["memory"]['low'], 'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_wgs = data['tumour_wgs']
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    normal_cells = data['normal_cells']

    # Merge tumour cells if present, otherwise normal cells; the merged
    # region-level bams are written to the matching bulk WGS filenames.
    bam_files = tumour_cells if tumour_cells else normal_cells
    wgs_bams = tumour_wgs if tumour_cells else normal_wgs

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    if isinstance(wgs_bams, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(wgs_bams.keys()),
        )
        workflow.set_filenames("merged.bam", "region", fnames=wgs_bams)
    else:
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))
        workflow.set_filenames('merged.bam', 'region', template=wgs_bams)

    workflow.subworkflow(
        name="wgs_merge_workflow",
        func=merge_bams.create_merge_bams_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files, extensions=['.bai']),
            mgd.OutputFile("merged.bam", "region", axes_origin=[], extensions=['.bai']),
            mgd.TempInputObj("region"),
            config,
        ))

    workflow.transform(
        name="get_files",
        ctx={'mem': config['memory']['med']},
        func='single_cell.utils.helpers.resolve_template',
        ret=pypeliner.managed.TempOutputObj('outputs'),
        args=(
            pypeliner.managed.TempInputObj('region'),
            wgs_bams,
            'region'
        ))

    return workflow

def variant_calling_workflow(args):
    config = helpers.load_config(args)
    config = config['variant_calling']

    meta_yaml = os.path.join(args['out_dir'], 'info.yaml')

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_bams = data['tumour_wgs']
    normal_bams = data['normal_wgs']
    tumour_cells = data['tumour_cells']

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'variant_calling')
    museq_vcf = os.path.join(varcalls_dir, 'museq_snv.vcf.gz')
    strelka_snv_vcf = os.path.join(varcalls_dir, 'strelka_snv.vcf.gz')
    strelka_indel_vcf = os.path.join(varcalls_dir, 'strelka_indel.vcf.gz')
    snv_h5 = os.path.join(varcalls_dir, 'snv_annotations.h5')
    raw_data_dir = os.path.join(varcalls_dir, 'raw')

    baseimage = config['docker']['single_cell_pipeline']
    ctx = {
        'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1,
        'mem': config["memory"]['low'], 'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    # Tumour and normal bams are either both dicts keyed by region, or both
    # '{region}' templates that are split over regions from the reference.
    if isinstance(normal_bams, dict) and isinstance(tumour_bams, dict):
        assert list(normal_bams.keys()) == list(tumour_bams.keys()), \
            'keys for tumour and normal bams should be the same'
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(normal_bams.keys()),
        )
        workflow.set_filenames('normal_split.bam', 'region', fnames=normal_bams)
        workflow.set_filenames('tumour_split.bam', 'region', fnames=tumour_bams)
    else:
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))
        assert '{region}' in normal_bams, 'only supports a list of files or a template on regions'
        workflow.set_filenames('normal_split.bam', 'region', template=normal_bams)
        assert '{region}' in tumour_bams, 'only supports a list of files or a template on regions'
        workflow.set_filenames('tumour_split.bam', 'region', template=tumour_bams)

    workflow.subworkflow(
        func=create_variant_calling_workflow,
        name='create_varcall',
        args=(
            tumour_cells,
            mgd.InputFile('tumour_split.bam', 'region', extensions=['.bai']),
            mgd.InputFile('normal_split.bam', 'region', extensions=['.bai']),
            mgd.OutputFile(museq_vcf),
            mgd.OutputFile(strelka_snv_vcf),
            mgd.OutputFile(strelka_indel_vcf),
            mgd.OutputFile(snv_h5),
            mgd.OutputFile(meta_yaml),
            config,
            raw_data_dir,
        ),
    )

    return workflow

def copy_number_calling_workflow(args):
    config = helpers.load_config(args)
    config = config['copy_number_calling']

    pyp = pypeliner.app.Pypeline(config=args)

    ctx = {
        'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1,
        'docker_image': config['docker']['single_cell_pipeline']
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']

    assert '{region}' in normal_wgs

    copynumber_dir = os.path.join(args["out_dir"], "copynumber")
    out_file = os.path.join(copynumber_dir, "results", "results.h5")

    cloneid = args["clone_id"]

    remixt_config = config.get('extract_seqdata', {})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.transform(
        name="get_regions",
        ctx=dict(mem=config['memory']['low']),
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=mgd.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        )
    )

    workflow.transform(
        name="get_snp_positions_filename",
        func="remixt.config.get_filename",
        ret=mgd.TempOutputObj('snp_positions_filename'),
        args=(
            remixt_config,
            config['ref_data_dir'],
            'snp_positions'
        )
    )

    workflow.transform(
        name="get_bam_max_fragment_length",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_fragment_length'),
        args=(
            remixt_config,
            'bam_max_fragment_length'
        )
    )

    workflow.transform(
        name="get_bam_max_soft_clipped",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_soft_clipped'),
        args=(
            remixt_config,
            'bam_max_soft_clipped'
        )
    )

    workflow.transform(
        name="get_bam_check_proper_pair",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_check_proper_pair'),
        args=(
            remixt_config,
            'bam_check_proper_pair'
        )
    )

    workflow.subworkflow(
        name="extract_seqdata_tumour",
        axes=('tumour_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups', 'tumour_cell_id', fnames=tumour_cells,
                extensions=['.bai']
            ),
            mgd.TempOutputFile("tumour.h5", "tumour_cell_id"),
            config.get('extract_seqdata', {}),
            config['ref_data_dir'],
            config
        )
    )

    workflow.subworkflow(
        name="extract_seqdata_normal",
        axes=('region',),
        ctx={'disk': 200},
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups', 'region', template=normal_wgs,
                extensions=['.bai']
            ),
            mgd.TempOutputFile("normal.h5", "region"),
            config.get('extract_seqdata', {}),
            config['ref_data_dir'],
            config,
        )
    )

    workflow.subworkflow(
        name='titan_workflow',
        func=titan.create_titan_workflow,
        args=(
            mgd.TempInputFile("normal.h5", "region"),
            mgd.TempInputFile("tumour.h5", "tumour_cell_id"),
            config['ref_genome'],
            copynumber_dir,
            mgd.OutputFile(out_file),
            config,
            args,
            tumour_cells.keys(),
            mgd.InputChunks('region'),
            cloneid
        ),
    )

    pyp.run(workflow)

def germline_calling_workflow(args):
    config = helpers.load_config(args)
    config = config['germline_calling']

    baseimage = config['docker']['single_cell_pipeline']
    basedocker = {'docker_image': config['docker']['single_cell_pipeline']}
    vcftoolsdocker = {'docker_image': config['docker']['vcftools']}
    samtoolsdocker = {'docker_image': config['docker']['samtools']}
    snpeffdocker = {'docker_image': config['docker']['snpeff']}

    pyp = pypeliner.app.Pypeline(config=args)

    ctx = {
        'mem_retry_increment': 2, 'ncpus': 1, 'mem': config["memory"]['low'],
        'disk_retry_increment': 50, 'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    normal_bams = data['normal_wgs']
    tumour_cells = data['tumour_cells']

    if not isinstance(normal_bams, dict):
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))
        assert '{region}' in normal_bams, 'only supports a list of files or a template on regions'
        workflow.set_filenames('normal_split.bam', 'region', template=normal_bams)
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(normal_bams.keys()),
        )
        workflow.set_filenames('normal_split.bam', 'region', fnames=normal_bams)

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'germline_calling')
    samtools_germline_vcf = os.path.join(varcalls_dir, 'raw', 'samtools_germline.vcf.gz')
    snpeff_vcf_filename = os.path.join(varcalls_dir, 'snpeff.vcf')
    normal_genotype_filename = os.path.join(varcalls_dir, 'raw', 'normal_genotype.h5')
    mappability_filename = os.path.join(varcalls_dir, 'raw', 'mappability.h5')
    counts_template = os.path.join(varcalls_dir, 'counts', 'raw', 'counts.h5')
    germline_h5_filename = os.path.join(varcalls_dir, 'germline.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.subworkflow(
        name='samtools_germline',
        func=germline.create_samtools_germline_workflow,
        args=(
            mgd.InputFile("normal_split.bam", "region", extensions=['.bai']),
            config['ref_genome'],
            mgd.OutputFile(samtools_germline_vcf, extensions=['.tbi']),
            config,
        ),
        kwargs={
            'vcftools_docker': vcftoolsdocker,
            'samtools_docker': samtoolsdocker,
        })

    workflow.subworkflow(
        name='annotate_mappability',
        func="biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow",
        args=(
            config['databases']['mappability']['local_path'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(mappability_filename),
        ),
        kwargs={
            'base_docker': basedocker,
            'chromosomes': config['chromosomes']
        })

    workflow.transform(
        name='annotate_genotype',
        func="single_cell.workflows.germline.tasks.annotate_normal_genotype",
        args=(
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(normal_genotype_filename),
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name='snpeff',
        func="biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow",
        args=(
            config['databases']['snpeff']['db'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(snpeff_vcf_filename),
        ),
        kwargs={
            'hdf5_output': False,
            'base_docker': basedocker,
            'vcftools_docker': vcftoolsdocker,
            'snpeff_docker': snpeffdocker,
        })

    workflow.subworkflow(
        name='read_counts',
        func="single_cell.variant_calling.create_snv_allele_counts_for_vcf_targets_workflow",
        args=(
            mgd.InputFile('tumour.bam', 'cell_id', fnames=tumour_cells, extensions=['.bai']),
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(counts_template),
            config['memory'],
        ),
        kwargs={
            'table_name': '/germline_allele_counts',
        },
    )

    workflow.transform(
        name='build_results_file',
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        args=(
            [
                mgd.InputFile(counts_template),
                mgd.InputFile(mappability_filename),
                mgd.InputFile(normal_genotype_filename),
            ],
            pypeliner.managed.OutputFile(germline_h5_filename),
        ),
        kwargs={
            'drop_duplicates': True,
        })

    pyp.run(workflow)

def breakpoint_calling_workflow(args):
    run_destruct = args['destruct']
    run_lumpy = args['lumpy']

    if not any((run_destruct, run_lumpy)):
        run_destruct = True
        run_lumpy = True

    config = helpers.load_config(args)
    config = config['breakpoint_calling']

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_cells = data['tumour_cells']
    tumour_cells_id = data['tumour_cells_id']
    normal_bams = data['normal_wgs'] if data['normal_wgs'] else data['normal_cells']
    normal_id = data['normal_wgs_id'] if data['normal_wgs_id'] else data['normal_cells_id']

    calls_dir = os.path.join(args['out_dir'], 'results', 'breakpoint_calling')
    raw_data_directory = os.path.join(calls_dir, 'raw')
    breakpoints_filename = os.path.join(calls_dir, 'breakpoints.h5')
    breakpoints_lib_filename = os.path.join(calls_dir, 'breakpoints_lib.h5')
    cell_counts_filename = os.path.join(calls_dir, 'cell_counts.h5')
    ref_data_directory = config['ref_data_directory']

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    if isinstance(normal_bams, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bams.keys()),
        )
        workflow.set_filenames('normal_cells.bam', 'normal_cell_id', fnames=normal_bams)
        normal_bam = mgd.InputFile('normal_cells.bam', 'normal_cell_id', extensions=['.bai'])
    else:
        normal_bam = mgd.InputFile(normal_bams, extensions=['.bai'])

    if run_destruct:
        workflow.subworkflow(
            name='destruct',
            ctx={'docker_image': config['docker']['destruct']},
            func="single_cell.workflows.destruct_singlecell.create_destruct_workflow",
            args=(
                normal_bam,
                mgd.InputFile('tumour.bam', 'tumour_cell_id', fnames=tumour_cells),
                config.get('destruct', {}),
                ref_data_directory,
                mgd.OutputFile(breakpoints_filename),
                mgd.OutputFile(breakpoints_lib_filename),
                mgd.OutputFile(cell_counts_filename),
                raw_data_directory,
            ),
        )

    if run_lumpy:
        varcalls_dir = os.path.join(args['out_dir'], 'results', 'breakpoint_calling')
        breakpoints_bed = os.path.join(varcalls_dir, 'lumpy_breakpoints.bed')
        breakpoints_csv = os.path.join(varcalls_dir, 'lumpy_breakpoints.csv.gz')
        breakpoints_evidence_csv = os.path.join(varcalls_dir, 'lumpy_breakpoints_evidence.csv.gz')

        workflow.subworkflow(
            name='lumpy',
            func="single_cell.workflows.lumpy.create_lumpy_workflow",
            args=(
                config,
                mgd.InputFile('tumour.bam', 'tumour_cell_id', fnames=tumour_cells, extensions=['.bai']),
                normal_bam,
                mgd.OutputFile(breakpoints_bed),
                mgd.OutputFile(breakpoints_csv),
                mgd.OutputFile(breakpoints_evidence_csv),
            ),
            kwargs={
                'tumour_id': tumour_cells_id,
                'normal_id': normal_id
            })

    return workflow
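
# Usage sketch (hypothetical): each factory above builds a pypeliner workflow
# from the parsed command-line args; the copy number and germline factories run
# it themselves via pyp.run, while the others return it for the caller to run.
# Only the keys these functions actually read from `args` are shown, with
# illustrative values; anything else the CLI parser supplies is omitted.
#
#   args = {
#       'input_yaml': 'inputs.yaml',
#       'out_dir': 'results',
#       'normal': False,
#       'destruct': True,
#       'lumpy': False,
#   }
#   pyp = pypeliner.app.Pypeline(config=args)
#   pyp.run(breakpoint_calling_workflow(args))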