def align_sample_no_split(fastq_1, fastq_2, out_file, samtools_flagstat, sample_id, lane_id, sample_info, refdir, picard_mem=None):
    """Build a workflow that aligns one lane of paired-end fastqs in a single pass.

    The fastqs are aligned with bwa mem, coordinate-sorted, then indexed and
    flagstat'd in one job each (no fastq splitting).

    :param fastq_1: path to read-1 fastq
    :param fastq_2: path to read-2 fastq
    :param out_file: output sorted bam path (a '.bai' index is written beside it)
    :param samtools_flagstat: output path for the samtools flagstat report
    :param sample_id: sample identifier forwarded to align_bwa_mem
    :param lane_id: lane identifier forwarded to align_bwa_mem
    :param sample_info: read-group metadata forwarded to align_bwa_mem
    :param refdir: reference data directory (resolved via config.refdir_data)
    :param picard_mem: picard heap size in GB, or None to use the task default
    :return: pypeliner workflow
    """
    ref_genome = config.refdir_data(refdir)['paths']['reference']

    out_bai = out_file + '.bai'

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='align_bwa_mem',
        ctx=helpers.get_default_ctx(memory=8, walltime='48:00', ncpus='8', disk=300),
        func='wgs.workflows.alignment.tasks.align_bwa_mem',
        args=(
            pypeliner.managed.InputFile(fastq_1),
            pypeliner.managed.InputFile(fastq_2),
            ref_genome,
            pypeliner.managed.TempOutputFile('aligned.bam'),
            '8',
            sample_info,
        ),
        kwargs={
            'sample_id': sample_id,
            'lane_id': lane_id,
        })

    # BUG FIX: the original unconditionally passed '{}G'.format(picard_mem),
    # which with the default picard_mem=None handed the bogus string 'NoneG'
    # to bam_sort. Only forward 'mem' when a value was actually supplied.
    sort_kwargs = {'threads': '8'}
    if picard_mem is not None:
        sort_kwargs['mem'] = '{}G'.format(picard_mem)

    workflow.transform(
        name='sort',
        ctx=helpers.get_default_ctx(memory=8, walltime='48:00', ncpus='8', disk=300),
        func='wgs.workflows.alignment.tasks.bam_sort',
        args=(
            pypeliner.managed.TempInputFile('aligned.bam'),
            pypeliner.managed.OutputFile(out_file),
            pypeliner.managed.TempSpace('bam_sort_tempdir'),
        ),
        kwargs=sort_kwargs)

    workflow.transform(
        name='index_and_flagstat',
        func='wgs.workflows.alignment.tasks.index_and_flagstat',
        ctx=helpers.get_default_ctx(memory=4, walltime='24:00', disk=200),
        args=(
            pypeliner.managed.InputFile(out_file),
            pypeliner.managed.OutputFile(out_bai),
            pypeliner.managed.OutputFile(samtools_flagstat),
        ),
    )

    return workflow
def breakpoint_calling_workflow(args):
    """Run the breakpoint-calling pipeline.

    Per sample: destruct and lumpy (plus svaba when requested), followed by
    breakpoint consensus calling, then metadata generation/upload.

    :param args: pipeline argument mapping; uses 'input_yaml', 'out_dir',
        'refdir', 'single_node' and 'svaba'
    """
    pyp = pypeliner.app.Pypeline(config=args)

    inputs = helpers.load_yaml(args['input_yaml'])

    meta_yaml = os.path.join(args["out_dir"], 'metadata.yaml')
    input_yaml_blob = os.path.join(args["out_dir"], 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    # output templates keyed by the 'sample_id' axis
    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(sv_outdir, '{sample_id}_destruct_breakpoints.csv.gz')
    destruct_library = os.path.join(sv_outdir, '{sample_id}_destruct_library.csv.gz')
    destruct_raw_breakpoints = os.path.join(sv_outdir, '{sample_id}_destruct_raw_breakpoints.csv.gz')
    destruct_raw_library = os.path.join(sv_outdir, '{sample_id}_destruct_raw_library.csv.gz')
    destruct_reads = os.path.join(sv_outdir, '{sample_id}_destruct_reads.csv.gz')
    lumpy_vcf = os.path.join(sv_outdir, '{sample_id}_lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir, '{sample_id}_filtered_consensus_calls.csv.gz')
    svaba_vcf = os.path.join(sv_outdir, '{sample_id}_svaba.vcf')

    single_node = args['single_node']

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name='destruct',
        func=destruct_wgs.create_destruct_wgs_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('destruct_raw_breakpoints', 'sample_id', template=destruct_raw_breakpoints),
            mgd.OutputFile('destruct_raw_library', 'sample_id', template=destruct_raw_library),
            mgd.OutputFile('destruct_breakpoints', 'sample_id', template=destruct_breakpoints),
            mgd.OutputFile('destruct_library', 'sample_id', template=destruct_library),
            mgd.OutputFile('destruct_reads', 'sample_id', template=destruct_reads),
            mgd.InputInstance('sample_id'),
            refdir_paths['reference'],
            refdir_paths['refdata_destruct'],
            refdir_paths['gtf'],
            refdir_paths['blacklist_destruct'],
        ),
        kwargs={'single_node': single_node})

    workflow.subworkflow(
        name='lumpy',
        func=lumpy.create_lumpy_workflow,
        axes=('sample_id',),
        args=(mgd.OutputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf),),
        kwargs={
            'tumour_bam': mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
            'normal_bam': mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            'single_node': single_node
        },
    )

    if args['svaba']:
        workflow.subworkflow(
            name='svaba',
            func=svaba.create_svaba_workflow,
            axes=('sample_id',),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile('svaba_vcf', 'sample_id', template=svaba_vcf),
                refdir_paths['reference'],
            ),
        )

    workflow.subworkflow(
        name="consensus_calling",
        func=breakpoint_calling_consensus.create_consensus_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('destruct_breakpoints', 'sample_id', template=destruct_breakpoints),
            mgd.InputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf),
            mgd.OutputFile('consensus_calls', 'sample_id', template=parsed_csv, extensions=['.yaml']),
            chromosomes,
        ),
    )

    filenames = [
        destruct_breakpoints,
        destruct_library,
        destruct_raw_breakpoints,
        destruct_raw_library,
        destruct_reads,
        lumpy_vcf,
        parsed_csv
    ]
    if args['svaba']:
        filenames.append(svaba_vcf)

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func=helpers.generate_and_upload_metadata,
        args=(sys.argv[0:], args["out_dir"], outputted_filenames, mgd.OutputFile(meta_yaml)),
        kwargs={
            # reuse the already-parsed yaml; the original re-read the input
            # file with a second helpers.load_yaml call here
            'input_yaml_data': inputs,
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'breakpoint_calling'}
        })

    pyp.run(workflow)
def call_germlines_only(samples, normals, museq_ss_vcf, samtools_germline_vcf, roh_calls, museq_single_pdf, refdir, single_node=False):
    """Construct a germline-only variant calling workflow.

    For each sample's normal bam: run single-sample mutationseq and samtools
    germline calling, then annotate both resulting vcfs.

    :param samples: sample ids to process
    :param normals: mapping sample id -> normal bam path
    :param museq_ss_vcf: mapping sample id -> annotated museq germline vcf output
    :param samtools_germline_vcf: mapping sample id -> annotated samtools germline vcf output
    :param roh_calls: mapping sample id -> roh calls output
    :param museq_single_pdf: mapping sample id -> museq plot pdf output
    :param refdir: reference data directory
    :param single_node: run subworkflows in single-node mode
    :return: pypeliner workflow
    """
    # keep only the entries for the samples actually being processed
    museq_ss_vcf = {s: museq_ss_vcf[s] for s in samples}
    museq_single_pdf = {s: museq_single_pdf[s] for s in samples}
    samtools_germline_vcf = {s: samtools_germline_vcf[s] for s in samples}
    roh_calls = {s: roh_calls[s] for s in samples}

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    ref_paths = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx(docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name="mutationseq_single",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempOutputFile("museq_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_single_pdf', 'sample_id', fnames=museq_single_pdf),
            ref_paths['reference'],
            chromosomes,
        ),
        kwargs={
            'tumour_bam': None,
            'normal_bam': mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            'single_node': single_node,
            'germline_refdata': ref_paths['germline_portrait_ref'],
            'thousand_genomes': ref_paths['thousand_genomes'],
            'dbsnp': ref_paths['dbsnp'],
        })

    workflow.subworkflow(
        name="samtools_germline",
        func='wgs.workflows.samtools_germline.create_samtools_germline_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempOutputFile("samtools_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile("roh_calls.csv", 'sample_id', fnames=roh_calls),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            ref_paths['reference'],
            chromosomes,
        ),
        kwargs={
            'single_node': single_node,
        })

    # NOTE(review): both docker kwargs below use the vcftools container;
    # confirm snpeff_docker should not be config.containers('snpeff').
    workflow.subworkflow(
        name="annotate_germline_museq",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("museq_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_germlines_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=museq_ss_vcf),
            ref_paths['snpeff_config'],
            ref_paths['mutation_assessor'],
            ref_paths['dbsnp'],
            ref_paths['thousand_genomes'],
            ref_paths['cosmic'],
            ref_paths['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_germline_samtools",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("samtools_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile("samtools_germlines_anno.vcf.gz", 'sample_id', extensions=['.csi', '.tbi'], fnames=samtools_germline_vcf),
            ref_paths['snpeff_config'],
            ref_paths['mutation_assessor'],
            ref_paths['dbsnp'],
            ref_paths['thousand_genomes'],
            ref_paths['cosmic'],
            ref_paths['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    return workflow
def call_variants(samples, somatic_calls, somatic_snpeff, somatic_ma, somatic_ids, indel_calls, indel_snpeff, indel_ma, indel_ids, germline_calls, germline_snpeff, germline_ma, germline_ids, tumours, normals, museq_vcf, museq_ss_vcf, samtools_germlines_vcf, roh_calls, strelka_snv_vcf, strelka_indel_vcf, museq_paired_pdf, museq_single_pdf, refdir, single_node=False, is_exome=False):
    """Construct the full somatic + germline variant calling workflow.

    Runs paired and single-sample mutationseq, samtools germline calling and
    strelka per sample, annotates every vcf, and finishes with consensus
    calling that merges the annotated call sets.

    All per-sample mappings (vcf/maf/pdf/csv outputs, tumour and normal bams)
    are keyed by sample id; only entries for ``samples`` are used.

    :param samples: sample ids to process
    :param refdir: reference data directory
    :param single_node: run subworkflows in single-node mode
    :param is_exome: forwarded to the strelka subworkflow
    :return: pypeliner workflow
    """

    def _for_samples(mapping):
        # restrict a per-sample mapping to the requested samples
        return {s: mapping[s] for s in samples}

    strelka_snv_vcf = _for_samples(strelka_snv_vcf)
    strelka_indel_vcf = _for_samples(strelka_indel_vcf)
    museq_vcf = _for_samples(museq_vcf)
    museq_ss_vcf = _for_samples(museq_ss_vcf)
    samtools_germlines_vcf = _for_samples(samtools_germlines_vcf)
    roh_calls = _for_samples(roh_calls)
    museq_paired_pdf = _for_samples(museq_paired_pdf)
    museq_single_pdf = _for_samples(museq_single_pdf)
    somatic_calls = _for_samples(somatic_calls)
    somatic_snpeff = _for_samples(somatic_snpeff)
    somatic_ma = _for_samples(somatic_ma)
    somatic_ids = _for_samples(somatic_ids)
    indel_calls = _for_samples(indel_calls)
    indel_snpeff = _for_samples(indel_snpeff)
    indel_ma = _for_samples(indel_ma)
    indel_ids = _for_samples(indel_ids)
    germline_calls = _for_samples(germline_calls)
    germline_snpeff = _for_samples(germline_snpeff)
    germline_ma = _for_samples(germline_ma)
    germline_ids = _for_samples(germline_ids)

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    ref_paths = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx(docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name="mutationseq_paired",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempOutputFile("museq_snv.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_paired_pdf', 'sample_id', fnames=museq_paired_pdf),
            ref_paths['reference'],
            chromosomes,
        ),
        kwargs={
            'tumour_bam': mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
            'normal_bam': mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            'single_node': single_node,
        })

    workflow.subworkflow(
        name="mutationseq_single",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempOutputFile("museq_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_single_pdf', 'sample_id', fnames=museq_single_pdf),
            ref_paths['reference'],
            chromosomes,
        ),
        kwargs={
            'tumour_bam': None,
            'normal_bam': mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            'single_node': single_node,
            'germline_refdata': ref_paths['germline_portrait_ref'],
            'thousand_genomes': ref_paths['thousand_genomes'],
            'dbsnp': ref_paths['dbsnp'],
        })

    workflow.subworkflow(
        name="samtools_germline",
        func='wgs.workflows.samtools_germline.create_samtools_germline_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempOutputFile("samtools_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile("roh_calls.csv.gz", 'sample_id', fnames=roh_calls),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            ref_paths['reference'],
            chromosomes,
        ),
        kwargs={
            'single_node': single_node,
        })

    workflow.subworkflow(
        name="strelka",
        func='wgs.workflows.strelka.create_strelka_workflow',
        axes=('sample_id',),
        args=(
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.TempOutputFile('strelka_indel.vcf.gz', 'sample_id'),
            mgd.TempOutputFile('strelka_snv.vcf.gz', 'sample_id'),
            ref_paths['reference'],
            chromosomes,
        ),
        kwargs={
            'single_node': single_node,
            'is_exome': is_exome
        },
    )

    # annotation steps: each takes a raw vcf and emits the annotated vcf
    # NOTE(review): snpeff_docker is given the vcftools container throughout;
    # confirm this is intentional.
    workflow.subworkflow(
        name="annotate_paired_museq",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("museq_snv.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=museq_vcf),
            ref_paths['snpeff_config'],
            ref_paths['mutation_assessor'],
            ref_paths['dbsnp'],
            ref_paths['thousand_genomes'],
            ref_paths['cosmic'],
            ref_paths['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_germline_museq",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("museq_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_germlines_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=museq_ss_vcf),
            ref_paths['snpeff_config'],
            ref_paths['mutation_assessor'],
            ref_paths['dbsnp'],
            ref_paths['thousand_genomes'],
            ref_paths['cosmic'],
            ref_paths['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_germline_samtools",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("samtools_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile("samtools_germlines_ann.vcf.gz", 'sample_id', extensions=['.csi', '.tbi'], fnames=samtools_germlines_vcf),
            ref_paths['snpeff_config'],
            ref_paths['mutation_assessor'],
            ref_paths['dbsnp'],
            ref_paths['thousand_genomes'],
            ref_paths['cosmic'],
            ref_paths['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_strelka",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("strelka_snv.vcf.gz", 'sample_id'),
            mgd.OutputFile('strelka_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=strelka_snv_vcf),
            ref_paths['snpeff_config'],
            ref_paths['mutation_assessor'],
            ref_paths['dbsnp'],
            ref_paths['thousand_genomes'],
            ref_paths['cosmic'],
            ref_paths['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_strelka_indel",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("strelka_indel.vcf.gz", 'sample_id'),
            mgd.OutputFile('strelka_indel_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=strelka_indel_vcf),
            ref_paths['snpeff_config'],
            ref_paths['mutation_assessor'],
            ref_paths['dbsnp'],
            ref_paths['thousand_genomes'],
            ref_paths['cosmic'],
            ref_paths['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="consensus_calling",
        func='wgs.workflows.variant_calling_consensus.create_consensus_workflow',
        axes=('sample_id',),
        args=(
            mgd.InputFile("museq_germlines_ann.vcf.gz", 'sample_id', fnames=museq_ss_vcf),
            mgd.InputFile("museq_snv_ann.vcf.gz", 'sample_id', fnames=museq_vcf),
            mgd.InputFile("strelka_snv_ann.vcf.gz", 'sample_id', fnames=strelka_snv_vcf),
            mgd.InputFile("strelka_indel_ann.vcf.gz", 'sample_id', fnames=strelka_indel_vcf),
            mgd.OutputFile('somatic_csv', 'sample_id', fnames=somatic_calls),
            mgd.OutputFile('somatic_snpeff', 'sample_id', fnames=somatic_snpeff),
            mgd.OutputFile('somatic_ma', 'sample_id', fnames=somatic_ma),
            mgd.OutputFile('somatic_ids', 'sample_id', fnames=somatic_ids),
            mgd.OutputFile('indel_csv', 'sample_id', fnames=indel_calls),
            mgd.OutputFile('indel_snpeff', 'sample_id', fnames=indel_snpeff),
            mgd.OutputFile('indel_ma', 'sample_id', fnames=indel_ma),
            mgd.OutputFile('indel_ids', 'sample_id', fnames=indel_ids),
            mgd.OutputFile('germline_csv', 'sample_id', fnames=germline_calls),
            mgd.OutputFile('germline_snpeff', 'sample_id', fnames=germline_snpeff),
            mgd.OutputFile('germline_ma', 'sample_id', fnames=germline_ma),
            mgd.OutputFile('germline_ids', 'sample_id', fnames=germline_ids),
            refdir,
        ),
    )

    return workflow
def create_somatic_calling_workflow(samples, tumours, normals, museq_vcf, museq_maf, museq_paired_pdf, strelka_snv_vcf, strelka_snv_maf, strelka_indel_vcf, strelka_indel_maf, mutect_vcf, mutect_maf, somatic_consensus_maf, refdir, normal_ids, tumour_ids, single_node=False, is_exome=False):
    """Construct the somatic variant calling workflow.

    Per sample: paired mutationseq, strelka and mutect are run against the
    tumour/normal pair, then the annotated call sets are merged into a
    somatic consensus maf.

    All per-sample mappings (bams and vcf/maf/pdf outputs) are keyed by
    sample id; only entries for ``samples`` are used.

    :param samples: sample ids to process
    :param refdir: reference data directory
    :param normal_ids: mapping sample id -> normal sample name
    :param tumour_ids: mapping sample id -> tumour sample name
    :param single_node: run subworkflows in single-node mode
    :param is_exome: forwarded to the strelka subworkflow
    :return: pypeliner workflow
    """
    # restrict every per-sample mapping to the requested samples
    strelka_snv_vcf = {s: strelka_snv_vcf[s] for s in samples}
    strelka_indel_vcf = {s: strelka_indel_vcf[s] for s in samples}
    strelka_snv_maf = {s: strelka_snv_maf[s] for s in samples}
    strelka_indel_maf = {s: strelka_indel_maf[s] for s in samples}
    museq_vcf = {s: museq_vcf[s] for s in samples}
    museq_maf = {s: museq_maf[s] for s in samples}
    museq_paired_pdf = {s: museq_paired_pdf[s] for s in samples}
    mutect_vcf = {s: mutect_vcf[s] for s in samples}
    mutect_maf = {s: mutect_maf[s] for s in samples}
    somatic_consensus_maf = {s: somatic_consensus_maf[s] for s in samples}

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    ref_paths = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)
    # expose per-sample normal/tumour names as temp objects on the axis
    workflow.setobj(
        obj=mgd.TempOutputObj('normal_id', 'sample_id', axes_origin=[]),
        value={v: normal_ids[v] for v in samples})
    workflow.setobj(
        obj=mgd.TempOutputObj('tumour_id', 'sample_id', axes_origin=[]),
        value={v: tumour_ids[v] for v in samples})

    workflow.subworkflow(
        name="mutationseq_paired",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id',),
        args=(
            mgd.OutputFile('museq_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=museq_vcf),
            mgd.OutputFile('museq_snv_ann.maf', 'sample_id', fnames=museq_maf),
            mgd.OutputFile('museq_paired_pdf', 'sample_id', fnames=museq_paired_pdf),
            ref_paths['reference'],
            ref_paths['reference_vep'],
            chromosomes,
        ),
        kwargs={
            'normal_id': mgd.TempInputObj('normal_id', 'sample_id'),
            'tumour_id': mgd.TempInputObj('tumour_id', 'sample_id'),
            'tumour_bam': mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
            'normal_bam': mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            'single_node': single_node,
        })

    workflow.subworkflow(
        name="strelka",
        func='wgs.workflows.strelka.create_strelka_workflow',
        axes=('sample_id',),
        args=(
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.OutputFile('strelka_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=strelka_snv_vcf),
            mgd.OutputFile('strelka_snv_ann.maf', 'sample_id', fnames=strelka_snv_maf),
            mgd.OutputFile('strelka_indel_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=strelka_indel_vcf),
            mgd.OutputFile('strelka_indel_ann.maf', 'sample_id', fnames=strelka_indel_maf),
            ref_paths['reference'],
            ref_paths['reference_vep'],
            chromosomes,
            mgd.TempInputObj('normal_id', 'sample_id'),
            mgd.TempInputObj('tumour_id', 'sample_id'),
        ),
        kwargs={
            'single_node': single_node,
            'is_exome': is_exome
        },
    )

    workflow.subworkflow(
        name="mutect",
        func='wgs.workflows.mutect.create_mutect_workflow',
        axes=('sample_id',),
        args=(
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.OutputFile('mutect_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=mutect_vcf),
            mgd.OutputFile('mutect_snv_ann.maf', 'sample_id', fnames=mutect_maf),
            ref_paths['reference'],
            ref_paths['reference_vep'],
            chromosomes,
            mgd.TempInputObj('normal_id', 'sample_id'),
            mgd.TempInputObj('tumour_id', 'sample_id'),
        ),
        kwargs={
            'single_node': single_node,
        },
    )

    workflow.subworkflow(
        name="somatic_consensus",
        func='wgs.workflows.somatic_calling_consensus.create_somatic_consensus_workflow',
        axes=('sample_id',),
        args=(
            mgd.InputFile('mutect_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=mutect_vcf),
            mgd.InputFile('strelka_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=strelka_snv_vcf),
            mgd.InputFile('strelka_indel_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=strelka_indel_vcf),
            mgd.InputFile('museq_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=museq_vcf),
            mgd.OutputFile("somatic_consensus.maf", 'sample_id', fnames=somatic_consensus_maf),
            chromosomes,
            ref_paths['reference_vep'],
            mgd.TempInputObj('normal_id', 'sample_id'),
            mgd.TempInputObj('tumour_id', 'sample_id'),
        ),
    )

    return workflow
def create_consensus_workflow(museq_germline, museq_snv, strelka_snv, strelka_indel, somatic_calls, somatic_snpeff, somatic_ma, somatic_ids, germline_calls_in_order_calls, germline_snpeff=None, germline_ma=None, germline_ids=None, refdir=None):
    """placeholder"""
def get_coverage_data(
        input_bam,
        output,
        refdir,
        chromosomes,
        mapping_qual,
        bins,
        single_node=False
):
    """Build a workflow computing per-interval coverage for a bam.

    A coverage bed is generated from the reference, samtools coverage is run
    over it, and the per-interval results are concatenated into ``output``.
    In single-node mode one bed covering all chromosomes is used; otherwise
    the work is parallelised over a 'chromosome' axis.

    :param input_bam: bam to compute coverage for
    :param output: concatenated coverage csv output path
    :param refdir: reference data directory
    :param chromosomes: chromosomes to cover
    :param mapping_qual: minimum mapping quality for samtools coverage
    :param bins: number of bins for the coverage bed
    :param single_node: disable per-chromosome parallelism
    :return: pypeliner workflow
    """
    ref_genome = config.refdir_data(refdir)['paths']['reference']

    workflow = pypeliner.workflow.Workflow()

    if single_node:
        # one bed for all chromosomes, one coverage job
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.sample_qc.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            args=(
                ref_genome,
                mgd.TempOutputFile('coverage_bed.bed'),
                chromosomes,
                bins,
            )
        )
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.sample_qc.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
                mapping_qual,
            ),
        )
    else:
        # fan out over chromosomes
        workflow.setobj(
            obj=mgd.OutputChunks('chromosome'),
            value=chromosomes
        )
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.sample_qc.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            axes=('chromosome',),
            args=(
                ref_genome,
                mgd.TempOutputFile('coverage_bed.bed', 'chromosome'),
                mgd.InputInstance('chromosome'),
                bins,
            )
        )
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.sample_qc.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            axes=('chromosome',),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed', 'chromosome'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
                mapping_qual,
            ),
        )

    workflow.transform(
        name='merge_data',
        func='wgs.utils.csvutils.concatenate_csv',
        ctx=helpers.get_default_ctx(memory=5),
        args=(
            mgd.TempInputFile('per_interval.txt', 'chromosome', axes_origin=[]),
            mgd.OutputFile(output),
        )
    )

    return workflow
def collect_bam_metrics(bam, markdups_metrics, sample_id, refdir, metrics, picard_insert_metrics, picard_insert_pdf, flagstat_metrics, picard_gc_metrics, picard_gc_summary, picard_gc_pdf, picard_wgs_metrics, bam_tdf, picard_mem=8):
    """Build a workflow collecting qc metrics for a single bam.

    Runs picard insert-size, gc-bias and wgs metrics, generates an igvtools
    tdf coverage track, and collates everything (together with the supplied
    markdups metrics) into one metrics table.

    :param bam: input bam path
    :param markdups_metrics: existing picard markdups metrics file to collate
    :param sample_id: sample identifier written into the collated table
    :param refdir: reference data directory
    :param metrics: collated output metrics csv (with .yaml sidecar)
    :param picard_insert_metrics: output picard insert-size metrics file
    :param picard_insert_pdf: output picard insert-size histogram pdf
    :param flagstat_metrics: output samtools flagstat file
    :param picard_gc_metrics: output picard gc-bias metrics file
    :param picard_gc_summary: output picard gc-bias summary file
    :param picard_gc_pdf: output picard gc-bias chart pdf
    :param picard_wgs_metrics: output picard wgs metrics file
    :param bam_tdf: output igvtools tdf track
    :param picard_mem: picard heap size in GB
    :return: pypeliner workflow
    """
    ref_genome = config.refdir_data(refdir)['paths']['reference']
    picard_wgs_params = config.default_params('alignment')['picard_wgs_params']
    reftype = config.refdir_data(refdir)['params']['reference_type']

    picard_heap = '{}G'.format(picard_mem)

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="calc_picard_insert_metrics",
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        func='wgs.workflows.alignment.tasks.bam_collect_insert_metrics',
        args=(
            mgd.InputFile(bam),
            mgd.OutputFile(flagstat_metrics),
            mgd.OutputFile(picard_insert_metrics),
            mgd.OutputFile(picard_insert_pdf),
            mgd.TempSpace('picard_insert'),
        ),
        kwargs={'mem': picard_heap})

    workflow.transform(
        name="calc_picard_gc_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_gc_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        args=(
            mgd.InputFile(bam),
            ref_genome,
            mgd.OutputFile(picard_gc_metrics),
            mgd.OutputFile(picard_gc_summary),
            mgd.OutputFile(picard_gc_pdf),
            mgd.TempSpace('picard_gc'),
        ),
        kwargs={'mem': picard_heap})

    workflow.transform(
        name="calc_picard_wgs_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_wgs_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        args=(
            mgd.InputFile(bam),
            ref_genome,
            mgd.OutputFile(picard_wgs_metrics),
            picard_wgs_params,
            mgd.TempSpace('picard_wgs'),
        ),
        kwargs={'mem': picard_heap})

    workflow.transform(
        name='igvtools_tdf',
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        func='wgs.workflows.alignment.tasks.get_igvtools_count',
        args=(
            pypeliner.managed.InputFile(bam),
            pypeliner.managed.OutputFile(bam_tdf),
            reftype,
        ),
    )

    # collate all per-tool metrics into a single table keyed by sample_id
    workflow.transform(
        name='collect_metrics',
        func='wgs.workflows.alignment.tasks.bam_collect_all_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='4:00', disk=400),
        args=(
            mgd.InputFile(flagstat_metrics),
            mgd.InputFile(picard_insert_metrics),
            mgd.InputFile(picard_wgs_metrics),
            mgd.InputFile(markdups_metrics),
            mgd.OutputFile(metrics, extensions=['.yaml']),
            sample_id,
        ),
        kwargs={
            'main_dtypes': dtypes()['metrics'],
            'insert_dtypes': dtypes()['insert_metrics']
        })

    return workflow
def align_sample_split(fastq_1, fastq_2, out_file, samtools_flagstat,
                       sample_id, lane_id, sample_info, refdir, picard_mem=2):
    '''
    Build a workflow that aligns one lane of paired fastqs by splitting the
    reads into chunks, aligning each chunk with bwa mem, sorting each chunk,
    then merging, indexing and flagstat-ing the final bam.

    :param fastq_1: read-1 fastq path
    :param fastq_2: read-2 fastq path
    :param out_file: output bam path (index written to out_file + '.bai')
    :param samtools_flagstat: output path for samtools flagstat
    :param sample_id: sample id passed through to the bwa mem task
    :param lane_id: lane id passed through to the bwa mem task
    :param sample_info: read-group metadata passed to the bwa mem task
    :param refdir: reference directory consumed by config.refdir_data
    :param picard_mem: picard JVM heap in GB
    :return: pypeliner workflow
    '''
    ref_genome = config.refdir_data(refdir)['paths']['reference']
    # number of reads per chunk along the 'split' axis
    split_size = config.default_params('alignment')['split_size']

    out_bai = out_file + '.bai'

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='split_fastq_1',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='24:00',
        ),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            pypeliner.managed.InputFile(fastq_1),
            pypeliner.managed.TempOutputFile('read_1', 'split'),
            split_size,
        ),
    )

    workflow.transform(
        name='split_fastq_2',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='24:00',
        ),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            pypeliner.managed.InputFile(fastq_2),
            # axes_origin=[]: the 'split' axis is already defined by
            # split_fastq_1's output; don't create it a second time
            pypeliner.managed.TempOutputFile('read_2', 'split',
                                             axes_origin=[]),
            split_size,
        ),
    )

    workflow.transform(name='align_bwa_mem',
                       axes=('split', ),
                       ctx=helpers.get_default_ctx(
                           memory=8,
                           walltime='16:00',
                           ncpus=8,
                       ),
                       func='wgs.workflows.alignment.tasks.align_bwa_mem',
                       args=(
                           pypeliner.managed.TempInputFile('read_1', 'split'),
                           pypeliner.managed.TempInputFile('read_2', 'split'),
                           ref_genome,
                           pypeliner.managed.TempOutputFile(
                               'aligned.bam', 'split'),
                           '8',
                           sample_info,
                       ),
                       kwargs={
                           'sample_id': sample_id,
                           'lane_id': lane_id,
                       })

    workflow.transform(
        name='sort',
        axes=('split', ),
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
        ),
        func='wgs.workflows.alignment.tasks.bam_sort',
        args=(pypeliner.managed.TempInputFile('aligned.bam', 'split'),
              pypeliner.managed.TempOutputFile('sorted.bam', 'split'),
              pypeliner.managed.TempSpace('bam_sort_by_split', 'split')),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name='merge',
        ctx=helpers.get_default_ctx(
            memory=8,
            walltime='72:00',
        ),
        func="wgs.workflows.alignment.tasks.merge_bams",
        args=(pypeliner.managed.TempInputFile('sorted.bam', 'split'),
              pypeliner.managed.OutputFile(out_file),
              pypeliner.managed.TempSpace('bam_merge_by_split')),
        # NOTE(review): the raw integer is passed here, while 'sort' above
        # (and align_sample_no_split) pass '{}G'.format(picard_mem) — confirm
        # merge_bams formats the unit itself, otherwise this looks like a bug
        kwargs={'mem': picard_mem})

    workflow.commandline(
        name='index',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
        ),
        args=('samtools', 'index',
              pypeliner.managed.InputFile(out_file),
              pypeliner.managed.OutputFile(out_bai)),
    )

    workflow.commandline(
        name='flagstat',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
        ),
        args=('samtools', 'flagstat',
              pypeliner.managed.InputFile(out_file),
              '>',
              pypeliner.managed.OutputFile(samtools_flagstat)),
    )

    return workflow
def create_postprocessing_workflow(normal_bam, tumour_bam, titan, remixt,
                                   breakpoints_consensus, roh, germline_calls,
                                   somatic_calls, circos_plot_remixt,
                                   circos_plot_titan, genome_wide_plot,
                                   refdir, sample_id, single_node=False):
    '''
    Build the per-sample postprocessing workflow: compute normal/tumour
    coverage, parse ROH calls, then render the genome-wide plot and the
    circos plots. Remixt inputs are optional; when absent the plots are
    generated without remixt annotations.

    :param normal_bam: normal bam path
    :param tumour_bam: tumour bam path
    :param titan / remixt / breakpoints_consensus / roh / germline_calls /
        somatic_calls: dicts of per-sample input paths keyed by sample id
    :param circos_plot_remixt: output path for the remixt circos pdf
    :param circos_plot_titan: output path for the titan circos pdf
    :param genome_wide_plot: output path for the genome-wide pdf
    :param refdir: reference directory consumed by config.refdir_data
    :param sample_id: key used to select entries from the per-sample dicts
    :param single_node: forwarded to the coverage subworkflows
    :return: pypeliner workflow
    '''
    refdir_paths = config.refdir_data(refdir)['paths']
    refdir_params = config.refdir_data(refdir)['params']

    ideogram = refdir_paths["ideogram"]

    # select this sample's inputs from the per-sample dicts
    titan_calls = titan[sample_id]
    remixt_calls = remixt[sample_id]
    sv_calls = breakpoints_consensus[sample_id]
    roh_calls = roh[sample_id]
    germline_vcf = germline_calls[sample_id]
    somatic_calls = somatic_calls[sample_id]
    chromosomes = refdir_params['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.subworkflow(name='coverage_normal_data',
                         func=get_coverage_data,
                         args=(
                             mgd.InputFile(normal_bam),
                             mgd.TempOutputFile('normal_coverage'),
                             refdir,
                         ),
                         kwargs={'single_node': single_node})

    workflow.subworkflow(name='coverage_tumour_data',
                         func=get_coverage_data,
                         args=(
                             mgd.InputFile(tumour_bam),
                             mgd.TempOutputFile('tumour_coverage'),
                             refdir,
                         ),
                         kwargs={'single_node': single_node})

    workflow.transform(
        name='parse_roh',
        ctx=helpers.get_default_ctx(memory=5),
        func="wgs.workflows.postprocessing.tasks.parse_roh",
        args=(
            mgd.InputFile(roh_calls),
            mgd.TempOutputFile("ROH_parsed"),
        ),
    )

    if remixt_calls:
        workflow.transform(
            name='generate_genome_wide_plot',
            ctx=helpers.get_default_ctx(memory=10, ),
            func="wgs.workflows.postprocessing.tasks.genome_wide",
            args=(
                mgd.InputFile(titan_calls),
                mgd.TempInputFile("ROH_parsed"),
                mgd.InputFile(germline_vcf),
                mgd.InputFile(somatic_calls),
                mgd.TempInputFile('tumour_coverage'),
                mgd.TempInputFile('normal_coverage'),
                mgd.InputFile(sv_calls),
                mgd.InputFile(ideogram),
                chromosomes,
                mgd.OutputFile(genome_wide_plot),
            ),
            kwargs={
                "remixt": mgd.InputFile(remixt_calls),
                "remixt_label": sample_id
            })

        workflow.transform(
            name='generate_circos_plot',
            ctx=helpers.get_default_ctx(memory=10),
            func="wgs.workflows.postprocessing.tasks.circos",
            args=(
                mgd.InputFile(titan_calls),
                sample_id,
                mgd.InputFile(sv_calls),
                # BUGFIX: these are the caller-supplied output paths; they
                # were declared as TempOutputFile, which would write them to
                # pypeliner temp space instead of the requested locations
                mgd.OutputFile(circos_plot_remixt),
                mgd.OutputFile(circos_plot_titan),
                mgd.TempSpace('circos'),
            ),
            kwargs={
                'docker_image': config.containers('circos'),
                'remixt_calls': mgd.InputFile(remixt_calls)
            },
        )
    else:
        # no remixt data: render both plots from titan-only inputs
        workflow.transform(
            name='generate_genome_wide_plot',
            ctx=helpers.get_default_ctx(memory=10, ),
            func="wgs.workflows.postprocessing.tasks.genome_wide",
            args=(
                mgd.InputFile(titan_calls),
                mgd.TempInputFile("ROH_parsed"),
                mgd.InputFile(germline_vcf),
                mgd.InputFile(somatic_calls),
                mgd.TempInputFile('tumour_coverage'),
                mgd.TempInputFile('normal_coverage'),
                mgd.InputFile(sv_calls),
                mgd.InputFile(ideogram),
                chromosomes,
                mgd.OutputFile(genome_wide_plot),
            ),
        )

        workflow.transform(
            name='generate_circos_plot',
            ctx=helpers.get_default_ctx(memory=10),
            func="wgs.workflows.postprocessing.tasks.circos",
            args=(
                mgd.InputFile(titan_calls),
                sample_id,
                mgd.InputFile(sv_calls),
                # BUGFIX: OutputFile, not TempOutputFile (see remixt branch)
                mgd.OutputFile(circos_plot_remixt),
                mgd.OutputFile(circos_plot_titan),
                mgd.TempSpace('circos'),
            ),
            kwargs={'docker_image': config.containers('circos')})

    return workflow
def get_coverage_data(input_bam, output, refdir, single_node=False):
    '''
    Build a workflow that computes per-interval coverage for a bam with
    samtools and concatenates the per-chromosome results into one file.

    :param input_bam: input bam path
    :param output: output path for the concatenated coverage table
    :param refdir: reference directory consumed by config.refdir_data
    :param single_node: when True, run a single samtools_coverage task over a
        bed covering all chromosomes; otherwise fan out one task per
        chromosome along the 'chromosome' axis
    :return: pypeliner workflow
    '''
    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    chrom_sizes = config.refdir_data(refdir)['paths']['chrom_sizes']

    workflow = pypeliner.workflow.Workflow()

    if single_node:
        # one bed over all chromosomes, one samtools_coverage invocation
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.postprocessing.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            args=(
                mgd.TempOutputFile('coverage_bed.bed'),
                chromosomes,
                mgd.InputFile(chrom_sizes),
            ))
        # NOTE(review): the output carries a 'chromosome' axis but this
        # transform has no axes and no setobj defines the chunks in this
        # branch — presumably the task itself emits the per-chromosome
        # split; confirm against samtools_coverage's implementation
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.postprocessing.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
            ),
            kwargs={'docker_image': config.containers('samtools')},
        )
    else:
        # fan out over chromosomes: one bed + one coverage task per chunk
        workflow.setobj(obj=mgd.OutputChunks('chromosome'),
                        value=chromosomes)

        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.postprocessing.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            axes=('chromosome', ),
            args=(
                mgd.TempOutputFile('coverage_bed.bed', 'chromosome'),
                mgd.InputInstance('chromosome'),
                mgd.InputFile(chrom_sizes),
            ))

        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.postprocessing.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            axes=('chromosome', ),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed', 'chromosome'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
            ),
            kwargs={'docker_image': config.containers('samtools')})

    # merge the per-chromosome tables into the final output
    workflow.transform(name='merge_data',
                       func='wgs.utils.csvutils.concatenate_csv',
                       ctx=helpers.get_default_ctx(memory=5),
                       args=(
                           mgd.TempInputFile('per_interval.txt',
                                             'chromosome',
                                             axes_origin=[]),
                           mgd.OutputFile(output),
                       ))

    return workflow
def single_sample_copynumber_calling_workflow(args):
    '''
    Entry point: run hmmcopy copynumber calling on each input bam and write
    run metadata, executing the workflow via pypeliner.

    :param args: parsed pipeline arguments; reads 'input_yaml', 'out_dir'
        and 'refdir'
    '''
    pyp = pypeliner.app.Pypeline(config=args)

    inputs = helpers.load_yaml(args['input_yaml'])

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    bams = helpers.get_values_from_input(inputs, 'bam')
    samples = list(bams.keys())

    # per-sample output templates; '{sample_id}' is filled in by pypeliner
    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    hmmcopy_raw_dir = os.path.join(cna_outdir, 'hmmcopy')
    bias_pdf = os.path.join(hmmcopy_raw_dir, 'plots', '{sample_id}_bias.pdf')
    correction_pdf = os.path.join(hmmcopy_raw_dir, 'plots',
                                  '{sample_id}_correction.pdf')
    hmmcopy_pdf = os.path.join(hmmcopy_raw_dir, 'plots',
                               '{sample_id}_hmmcopy.pdf')
    correction_table = os.path.join(hmmcopy_raw_dir,
                                    '{sample_id}_correctreads_with_state.txt')
    pygenes = os.path.join(hmmcopy_raw_dir, '{sample_id}_hmmcopy.seg.pygenes')

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    # one hmmcopy run per sample along the 'sample_id' axis
    workflow.subworkflow(
        name='hmmcopy',
        func=hmmcopy.create_hmmcopy_workflow,
        axes=('sample_id', ),
        args=(mgd.InputFile("sample.bam",
                            'sample_id',
                            fnames=bams,
                            extensions=['.bai'
                                        ]),
              mgd.InputInstance('sample_id'),
              mgd.OutputFile('bias', 'sample_id', template=bias_pdf),
              mgd.OutputFile('correction', 'sample_id',
                             template=correction_pdf),
              mgd.OutputFile('hmmcopy', 'sample_id', template=hmmcopy_pdf),
              mgd.OutputFile('correction_table', 'sample_id',
                             template=correction_table),
              mgd.OutputFile('pygenes', 'sample_id', template=pygenes),
              chromosomes,
              refdir_paths['map_wig'],
              refdir_paths['gc_wig'],
              refdir_paths['gtf']),
    )

    filenames = [
        bias_pdf,
        correction_pdf,
        hmmcopy_pdf,
        correction_table,
        pygenes,
    ]

    # expand the templates for every sample so metadata lists real paths
    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                           helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'single_sample_copynumber_calling'
                           }
                       })

    pyp.run(workflow)
def sample_qc_workflow(args):
    '''
    Entry point: run per-sample QC (coverage, genome-wide plot and — for
    tumour/normal pairs — circos plots), write run metadata, and execute
    the workflow via pypeliner.

    :param args: parsed pipeline arguments; reads 'input_yaml',
        'normal_only', 'refdir', 'out_dir', 'bins',
        'mapping_qual_threshold' and 'single_node'
    '''
    inputs = helpers.load_yaml(args['input_yaml'])
    normal_only = args['normal_only']

    samples = list(inputs.keys())

    # inputs
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']
    files = make_inputs(inputs, normal_only=normal_only)

    # output templates; '{sample_id}' is filled in by pypeliner
    out_dir = args['out_dir']

    normal_coverage = os.path.join(out_dir, '{sample_id}',
                                   '{sample_id}_normal_coverage.tsv')
    genome_wide_plot = os.path.join(out_dir, '{sample_id}',
                                    '{sample_id}_genome_wide.pdf')

    if not normal_only:
        # tumour-specific outputs only exist for paired runs
        circos_plot_remixt = os.path.join(out_dir, '{sample_id}',
                                          '{sample_id}_circos_remixt.pdf')
        circos_plot_titan = os.path.join(out_dir, '{sample_id}',
                                         '{sample_id}_circos_titan.pdf')
        tumour_coverage = os.path.join(out_dir, '{sample_id}',
                                       '{sample_id}_tumour_coverage.tsv')

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    if normal_only:
        workflow.subworkflow(
            name="normal_sample_qc",
            func=sample_qc.create_sample_qc_workflow_normal_only,
            ctx=helpers.get_default_ctx(),
            axes=('sample_id', ),
            args=(mgd.InputInstance('sample_id'), args["refdir"],
                  mgd.InputFile('normal.bam',
                                'sample_id',
                                fnames=files["normal"]),
                  mgd.InputFile('roh', 'sample_id', fnames=files["roh"]),
                  mgd.InputFile('germline_calls',
                                'sample_id',
                                fnames=files["germline"]),
                  mgd.OutputFile('genome_wide_plot.pdf',
                                 'sample_id',
                                 template=genome_wide_plot),
                  mgd.OutputFile('normcov',
                                 'sample_id',
                                 template=normal_coverage), chromosomes,
                  args['bins'], args['mapping_qual_threshold']),
            # kwargs={'single_node': args['single_node']}
        )

        outputted_filenames = helpers.expand_list(
            [normal_coverage, genome_wide_plot], samples, "sample_id")

    else:
        workflow.subworkflow(
            name="sample_qc",
            func=sample_qc.create_sample_qc_workflow,
            ctx=helpers.get_default_ctx(),
            axes=('sample_id', ),
            args=(mgd.InputInstance('sample_id'), args["refdir"],
                  mgd.InputFile('normal.bam',
                                'sample_id',
                                fnames=files["normal"]),
                  # NOTE: make_inputs uses the 'tumor' spelling for this key
                  mgd.InputFile('tumour.bam',
                                'sample_id',
                                fnames=files["tumor"]),
                  mgd.InputFile('titan', 'sample_id', fnames=files["titan"]),
                  mgd.InputFile('remixt', 'sample_id',
                                fnames=files["remixt"]),
                  mgd.InputFile('breakpoints_consensus',
                                'sample_id',
                                fnames=files["breakpoints"]),
                  mgd.InputFile('roh', 'sample_id', fnames=files["roh"]),
                  mgd.InputFile('germline_calls',
                                'sample_id',
                                fnames=files["germline"]),
                  mgd.InputFile('somatic_calls',
                                'sample_id',
                                fnames=files["somatic"]),
                  mgd.OutputFile('genome_wide_plot.pdf',
                                 'sample_id',
                                 template=genome_wide_plot),
                  mgd.OutputFile('normcov',
                                 'sample_id',
                                 template=normal_coverage),
                  mgd.OutputFile('tumcov',
                                 'sample_id',
                                 template=tumour_coverage), chromosomes,
                  args['bins'], args['mapping_qual_threshold']),
            kwargs={'single_node': args['single_node']})

        workflow.subworkflow(
            name='generate_circos_plot',
            ctx=helpers.get_default_ctx(memory=10, walltime='24:00',
                                        disk=400),
            axes=('sample_id', ),
            func=sample_qc.circos_plot,
            args=(
                mgd.InputFile('titan', 'sample_id', fnames=files["titan"]),
                mgd.InputFile('remixt', 'sample_id', fnames=files["remixt"]),
                mgd.InputInstance("sample_id"),
                mgd.InputFile('breakpoints_consensus',
                              'sample_id',
                              fnames=files["breakpoints"]),
                mgd.OutputFile('circos_remixt',
                               'sample_id',
                               template=circos_plot_remixt),
                mgd.OutputFile('circos_titan',
                               'sample_id',
                               template=circos_plot_titan),
            ),
        )

        outputted_filenames = helpers.expand_list([
            circos_plot_remixt, circos_plot_titan, normal_coverage,
            tumour_coverage, genome_wide_plot
        ], samples, "sample_id")

    meta_yaml = os.path.join(out_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(out_dir, 'input.yaml')

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                           helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'sample_qc'
                           }
                       })

    pyp.run(workflow)
def copynumber_calling_workflow(args):
    '''
    Entry point: run the requested copynumber callers (remixt, titan,
    hmmcopy) on each tumour/normal pair, write run metadata, and execute
    the workflow via pypeliner.

    :param args: parsed pipeline arguments; reads 'hmmcopy', 'titan',
        'remixt', 'input_yaml', 'out_dir', 'refdir' and 'single_node'.
        If no caller flag is set, all three callers are run.
    '''
    pyp = pypeliner.app.Pypeline(config=args)

    run_hmmcopy = args['hmmcopy']
    run_titan = args['titan']
    run_remixt = args['remixt']

    # default: no caller selected means run everything
    if not run_hmmcopy and not run_titan and not run_remixt:
        run_hmmcopy = True
        run_titan = True
        run_remixt = True

    inputs = helpers.load_yaml(args['input_yaml'])

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    samples = list(tumours.keys())

    # output templates; '{sample_id}' is filled in by pypeliner
    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')

    # titan outputs
    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    titan_outfile = os.path.join(titan_raw_dir,
                                 '{sample_id}_titan_markers.csv.gz')
    titan_params = os.path.join(titan_raw_dir,
                                '{sample_id}_titan_params.csv.gz')
    titan_segs = os.path.join(titan_raw_dir, '{sample_id}_titan_segs.csv.gz')
    titan_igv_segs = os.path.join(titan_raw_dir,
                                  '{sample_id}_titan_igv_segs.seg')
    titan_parsed = os.path.join(titan_raw_dir,
                                '{sample_id}_titan_parsed.csv.gz')
    titan_plots = os.path.join(titan_raw_dir, '{sample_id}_titan_plots.pdf')
    titan_tar_outputs = os.path.join(
        titan_raw_dir, '{sample_id}_data_all_parameters.tar.gz')
    museq_vcf = os.path.join(titan_raw_dir, '{sample_id}_museq.vcf')

    # hmmcopy outputs (normal bam)
    hmmcopy_normal_raw_dir = os.path.join(cna_outdir, 'hmmcopy_normal')
    normal_bias_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                   '{sample_id}_bias.pdf')
    normal_correction_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                         '{sample_id}_correction.pdf')
    normal_hmmcopy_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                      '{sample_id}_hmmcopy.pdf')
    normal_correction_table = os.path.join(
        hmmcopy_normal_raw_dir, '{sample_id}_correctreads_with_state.txt')
    normal_pygenes = os.path.join(hmmcopy_normal_raw_dir,
                                  '{sample_id}_hmmcopy.seg.pygenes')

    # hmmcopy outputs (tumour bam)
    hmmcopy_tumour_raw_dir = os.path.join(cna_outdir, 'hmmcopy_tumour')
    tumour_bias_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                   '{sample_id}_bias.pdf')
    tumour_correction_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                         '{sample_id}_correction.pdf')
    tumour_hmmcopy_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                      '{sample_id}_hmmcopy.pdf')
    tumour_correction_table = os.path.join(
        hmmcopy_tumour_raw_dir, '{sample_id}_correctreads_with_state.txt')
    tumour_pygenes = os.path.join(hmmcopy_tumour_raw_dir,
                                  '{sample_id}_hmmcopy.seg.pygenes')

    # remixt outputs
    remixt_outdir = os.path.join(args['out_dir'], 'remixt', '{sample_id}')
    remixt_outfile = os.path.join(remixt_outdir, '{sample_id}_remixt.h5')
    remixt_brk_cn_csv = os.path.join(remixt_outdir,
                                     '{sample_id}_remixt_brk_cn.csv.gz')
    remixt_cn_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_cn.csv.gz')
    remixt_minor_modes_csv = os.path.join(
        remixt_outdir, '{sample_id}_remixt_minor_modes.csv.gz')
    remixt_mix_csv = os.path.join(remixt_outdir,
                                  '{sample_id}_remixt_mix.csv.gz')
    remixt_read_depth_csv = os.path.join(
        remixt_outdir, '{sample_id}_remixt_read_depth.csv.gz')
    remixt_stats_csv = os.path.join(remixt_outdir,
                                    '{sample_id}_remixt_stats.csv.gz')

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    if run_remixt:
        workflow.subworkflow(
            name='remixt',
            func=remixt.create_remixt_workflow,
            axes=('sample_id', ),
            args=(
                mgd.InputFile("tumour.bam",
                              'sample_id',
                              fnames=tumours,
                              extensions=['.bai']),
                mgd.InputFile("normal.bam",
                              'sample_id',
                              fnames=normals,
                              extensions=['.bai']),
                mgd.InputFile("breakpoints", 'sample_id',
                              fnames=breakpoints),
                mgd.InputInstance('sample_id'),
                mgd.OutputFile('remixt.h5',
                               'sample_id',
                               template=remixt_outfile),
                mgd.OutputFile('remixt_brk_cn.csv',
                               'sample_id',
                               template=remixt_brk_cn_csv),
                mgd.OutputFile('remixt_cn.csv',
                               'sample_id',
                               template=remixt_cn_csv),
                mgd.OutputFile('remixt_minor_modes.csv',
                               'sample_id',
                               template=remixt_minor_modes_csv),
                mgd.OutputFile('remixt_mix.csv',
                               'sample_id',
                               template=remixt_mix_csv),
                mgd.OutputFile('remixt_read_depth.csv',
                               'sample_id',
                               template=remixt_read_depth_csv),
                mgd.OutputFile('remixt_stats.csv',
                               'sample_id',
                               template=remixt_stats_csv),
                refdir_paths['refdata_remixt'],
                refdir_paths['reference'],
            ),
            kwargs={'single_node': args['single_node']})

    if run_titan:
        workflow.subworkflow(name='titan',
                             func=titan.create_titan_workflow,
                             axes=('sample_id', ),
                             args=(
                                 mgd.InputFile("tumour.bam",
                                               'sample_id',
                                               fnames=tumours,
                                               extensions=['.bai']),
                                 mgd.InputFile("normal.bam",
                                               'sample_id',
                                               fnames=normals,
                                               extensions=['.bai']),
                                 mgd.InputFile("target_list",
                                               'sample_id',
                                               fnames=targets),
                                 mgd.OutputFile('outfile',
                                                'sample_id',
                                                template=titan_outfile),
                                 mgd.OutputFile('params',
                                                'sample_id',
                                                template=titan_params),
                                 mgd.OutputFile('segs',
                                                'sample_id',
                                                template=titan_segs),
                                 mgd.OutputFile('igv_segs',
                                                'sample_id',
                                                template=titan_igv_segs),
                                 mgd.OutputFile('parsed',
                                                'sample_id',
                                                template=titan_parsed),
                                 mgd.OutputFile('plots',
                                                'sample_id',
                                                template=titan_plots),
                                 mgd.OutputFile('tar_outputs',
                                                'sample_id',
                                                template=titan_tar_outputs),
                                 mgd.OutputFile('museq.vcf',
                                                'sample_id',
                                                template=museq_vcf),
                                 mgd.InputInstance('sample_id'),
                                 refdir_paths['reference'],
                                 chromosomes,
                                 refdir_paths['het_positions_titan'],
                                 refdir_paths['map_wig'],
                                 refdir_paths['gc_wig'],
                                 refdir_paths['gtf'],
                             ),
                             kwargs={'single_node': args['single_node']})

    if run_hmmcopy:
        # hmmcopy runs independently on the normal and the tumour bam
        workflow.subworkflow(
            name='hmmcopy_normal',
            func=hmmcopy.create_hmmcopy_workflow,
            axes=('sample_id', ),
            args=(mgd.InputFile("normal.bam",
                                'sample_id',
                                fnames=normals,
                                extensions=['.bai']),
                  mgd.InputInstance('sample_id'),
                  mgd.OutputFile('normal_bias',
                                 'sample_id',
                                 template=normal_bias_pdf),
                  mgd.OutputFile('normal_correction',
                                 'sample_id',
                                 template=normal_correction_pdf),
                  mgd.OutputFile('normal_hmmcopy',
                                 'sample_id',
                                 template=normal_hmmcopy_pdf),
                  mgd.OutputFile('normal_correction_table',
                                 'sample_id',
                                 template=normal_correction_table),
                  mgd.OutputFile('normal_pygenes',
                                 'sample_id',
                                 template=normal_pygenes), chromosomes,
                  refdir_paths['map_wig'], refdir_paths['gc_wig'],
                  refdir_paths['gtf']),
        )

        workflow.subworkflow(
            name='hmmcopy_tumour',
            func=hmmcopy.create_hmmcopy_workflow,
            axes=('sample_id', ),
            args=(mgd.InputFile("tumour.bam",
                                'sample_id',
                                fnames=tumours,
                                extensions=['.bai']),
                  mgd.InputInstance('sample_id'),
                  mgd.OutputFile('tumour_bias',
                                 'sample_id',
                                 template=tumour_bias_pdf),
                  mgd.OutputFile('tumour_correction',
                                 'sample_id',
                                 template=tumour_correction_pdf),
                  mgd.OutputFile('tumour_hmmcopy',
                                 'sample_id',
                                 template=tumour_hmmcopy_pdf),
                  mgd.OutputFile('tumour_correction_table',
                                 'sample_id',
                                 template=tumour_correction_table),
                  mgd.OutputFile('tumour_pygenes',
                                 'sample_id',
                                 template=tumour_pygenes), chromosomes,
                  refdir_paths['map_wig'], refdir_paths['gc_wig'],
                  refdir_paths['gtf']),
        )

    # collect only the outputs of the callers that actually ran
    filenames = []
    if run_titan:
        filenames += [
            titan_outfile,
            titan_params,
            titan_segs,
            titan_igv_segs,
            titan_parsed,
            titan_plots,
            titan_tar_outputs,
            museq_vcf,
        ]
    if run_hmmcopy:
        filenames += [
            normal_bias_pdf, normal_correction_pdf, normal_hmmcopy_pdf,
            normal_correction_table, normal_pygenes, tumour_bias_pdf,
            tumour_correction_pdf, tumour_hmmcopy_pdf,
            tumour_correction_table, tumour_pygenes
        ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                           helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'copynumber_calling'
                           }
                       })

    pyp.run(workflow)
def create_germline_calling_workflow(
        samples,
        normals,
        museq_ss_vcf,
        museq_ss_maf,
        museq_single_pdf,
        samtools_germline_vcf,
        samtools_germline_maf,
        roh_calls,
        freebayes_germline_vcf,
        freebayes_germline_maf,
        rtg_germline_vcf,
        rtg_germline_maf,
        consensus_germline_maf,
        refdir,
        normal_ids,
        single_node=False
):
    '''
    Build the germline calling workflow: run mutationseq, samtools,
    freebayes and rtg per sample, then build a consensus germline maf
    from the four callers' vcfs.

    :param samples: sample ids to process; the per-sample dicts below are
        filtered down to these keys
    :param normals: dict of normal bam paths keyed by sample id
    :param museq_ss_vcf .. consensus_germline_maf: dicts of per-sample
        output paths keyed by sample id
    :param refdir: reference directory consumed by config.refdir_data
    :param normal_ids: dict mapping sample id -> normal id, distributed to
        subworkflows via a TempObj on the 'sample_id' axis
    :param single_node: forwarded to the caller subworkflows
    :return: pypeliner workflow
    '''
    # restrict every per-sample dict to the requested samples
    museq_ss_vcf = {sampid: museq_ss_vcf[sampid] for sampid in samples}
    museq_ss_maf = {sampid: museq_ss_maf[sampid] for sampid in samples}
    museq_single_pdf = {sampid: museq_single_pdf[sampid] for sampid in samples}

    samtools_germline_vcf = {
        sampid: samtools_germline_vcf[sampid] for sampid in samples}
    samtools_germline_maf = {
        sampid: samtools_germline_maf[sampid] for sampid in samples}
    roh_calls = {sampid: roh_calls[sampid] for sampid in samples}

    freebayes_germline_vcf = {
        sampid: freebayes_germline_vcf[sampid] for sampid in samples}
    freebayes_germline_maf = {
        sampid: freebayes_germline_maf[sampid] for sampid in samples}

    rtg_germline_vcf = {sampid: rtg_germline_vcf[sampid] for sampid in samples}
    rtg_germline_maf = {sampid: rtg_germline_maf[sampid] for sampid in samples}

    consensus_germline_maf = {
        sampid: consensus_germline_maf[sampid] for sampid in samples}

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    paths_refdir = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    # distribute the per-sample normal id along the sample_id axis
    workflow.setobj(
        obj=mgd.TempOutputObj('normal_id', 'sample_id', axes_origin=[]),
        value={v: normal_ids[v] for v in samples})

    workflow.subworkflow(
        name="mutationseq_single",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id',),
        args=(
            mgd.OutputFile(
                'museq_germlines.vcf.gz', 'sample_id',
                extensions=['.csi', '.tbi'], fnames=museq_ss_vcf
            ),
            mgd.OutputFile(
                'museq_germlines.maf', 'sample_id', fnames=museq_ss_maf
            ),
            mgd.OutputFile('museq_single_pdf', 'sample_id',
                           fnames=museq_single_pdf),
            paths_refdir['reference'],
            paths_refdir['reference_vep'],
            chromosomes,
        ),
        kwargs={
            # germline mode: no tumour inputs
            'tumour_id': None,
            'normal_id': mgd.TempInputObj('normal_id', 'sample_id'),
            'tumour_bam': None,
            'normal_bam': mgd.InputFile("normal.bam",
                                        'sample_id',
                                        fnames=normals,
                                        extensions=['.bai'],
                                        axes_origin=[]),
            'single_node': single_node,
            'germline_refdata': paths_refdir['germline_portrait_ref'],
            'thousand_genomes': paths_refdir['thousand_genomes'],
            'dbsnp': paths_refdir['dbsnp'],
        }
    )

    workflow.subworkflow(
        name="samtools_germline",
        func='wgs.workflows.samtools_germline.create_samtools_germline_workflow',
        axes=('sample_id',),
        args=(
            mgd.OutputFile("samtools_germlines_anno.vcf.gz", 'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=samtools_germline_vcf),
            mgd.OutputFile("samtools_germlines_anno.maf", 'sample_id',
                           fnames=samtools_germline_maf),
            mgd.OutputFile("roh_calls.csv.gz", 'sample_id',
                           fnames=roh_calls, extensions=['.yaml']),
            mgd.InputFile("normal.bam",
                          'sample_id',
                          fnames=normals,
                          extensions=['.bai'],
                          axes_origin=[]),
            paths_refdir['reference'],
            paths_refdir['reference_vep'],
            chromosomes,
            # BUGFIX: the stray fnames=normal_ids kwarg is removed —
            # TempInputObj reads the values set by the setobj above, the
            # same as the three sibling subworkflows
            mgd.TempInputObj('normal_id', 'sample_id'),
        ),
        kwargs={
            'single_node': single_node,
        }
    )

    workflow.subworkflow(
        name="freebayes_germline",
        func='wgs.workflows.freebayes.create_freebayes_germline_workflow',
        axes=('sample_id',),
        args=(
            mgd.OutputFile("freebayes_germlines_anno.vcf.gz", 'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=freebayes_germline_vcf),
            # NOTE(review): vcf-index extensions on a maf output — mirrors
            # the original wiring, but confirm the subworkflow really
            # produces .csi/.tbi for the maf
            mgd.OutputFile("freebayes_germlines_anno.maf", 'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=freebayes_germline_maf),
            mgd.InputFile("normal.bam",
                          'sample_id',
                          fnames=normals,
                          extensions=['.bai'],
                          axes_origin=[]),
            paths_refdir['reference'],
            paths_refdir['reference_vep'],
            chromosomes,
            mgd.TempInputObj('normal_id', 'sample_id'),
        ),
        kwargs={
            'single_node': single_node,
        }
    )

    workflow.subworkflow(
        name="rtg_germline",
        func='wgs.workflows.rtg_germline.create_rtg_germline_workflow',
        axes=('sample_id',),
        args=(
            mgd.OutputFile("rtg_germlines_anno.vcf.gz", 'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=rtg_germline_vcf),
            # NOTE(review): same extensions question as freebayes above
            mgd.OutputFile("rtg_germlines_anno.maf", 'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=rtg_germline_maf),
            mgd.InputFile("normal.bam",
                          'sample_id',
                          fnames=normals,
                          extensions=['.bai'],
                          axes_origin=[]),
            paths_refdir['reference'],
            paths_refdir['reference_sdf'],
            paths_refdir['reference_vep'],
            chromosomes,
            mgd.TempInputObj('normal_id', 'sample_id'),
        ),
        kwargs={
            'single_node': single_node,
        }
    )

    # merge the four callers' vcfs into a consensus maf
    workflow.subworkflow(
        name="germline_consensus",
        func='wgs.workflows.germline_calling_consensus.create_germline_consensus_workflow',
        axes=('sample_id',),
        args=(
            mgd.InputFile('museq_germlines.vcf.gz', 'sample_id',
                          fnames=museq_ss_vcf),
            mgd.InputFile("samtools_germlines_anno.vcf.gz", 'sample_id',
                          fnames=samtools_germline_vcf),
            mgd.InputFile("rtg_germlines_anno.vcf.gz", 'sample_id',
                          fnames=rtg_germline_vcf),
            mgd.InputFile("freebayes_germlines_anno.vcf.gz", 'sample_id',
                          fnames=freebayes_germline_vcf),
            mgd.OutputFile("germlines_consensus.maf", 'sample_id',
                           fnames=consensus_germline_maf),
            chromosomes,
            paths_refdir['reference_vep'],
            mgd.TempInputObj('normal_id', 'sample_id')
        ),
    )

    return workflow