# Imports shared by the workflow builders below. os/sys/yaml/pypeliner are
# certain from usage; the wgs-internal module paths follow the package layout
# these functions reference and may need adjusting to the local checkout.
# realign_bams and call_germlines_only are referenced below but defined
# elsewhere in the package.
import os
import sys

import pypeliner
import pypeliner.managed as mgd
import yaml

from wgs.config import config
from wgs.utils import helpers
from wgs.workflows.alignment.dtypes import dtypes


def fastqc_workflow(fastq_r1, fastq_r2, r1_html, r1_plot, r2_html, r2_plot):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="fastqc_r1",
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        func='wgs.workflows.alignment.tasks.run_fastqc',
        args=(
            mgd.InputFile(fastq_r1),
            mgd.OutputFile(r1_html),
            mgd.OutputFile(r1_plot),
            mgd.TempSpace('fastqc_R1'),
        ),
    )

    workflow.transform(
        name="fastqc_r2",
        func='wgs.workflows.alignment.tasks.run_fastqc',
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        args=(
            mgd.InputFile(fastq_r2),
            mgd.OutputFile(r2_html),
            mgd.OutputFile(r2_plot),
            mgd.TempSpace('fastqc_R2'),
        ),
    )

    return workflow
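
# Usage sketch (hypothetical paths; the pypeliner scheduler config is
# site-specific and normally comes from the CLI args): builders in this
# module return a pypeliner Workflow, which is either run directly through
# pypeliner.app.Pypeline or attached to a parent via workflow.subworkflow,
# as align_samples does below.
def _example_run_fastqc_workflow():
    wf = fastqc_workflow(
        '/data/SA123_L001_R1.fastq.gz', '/data/SA123_L001_R2.fastq.gz',
        '/results/R1_fastqc.html', '/results/R1_fastqc.pdf',
        '/results/R2_fastqc.html', '/results/R2_fastqc.pdf',
    )
    pyp = pypeliner.app.Pypeline(config={'tmpdir': '/tmp/pipeline'})
    pyp.run(wf)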
def realign_bam_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)

    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx(docker_image=config.containers('wgs')))

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    with open(args['input_yaml']) as input_yaml:
        yamldata = yaml.safe_load(input_yaml)

    samples = list(yamldata.keys())

    input_bams = {sample: yamldata[sample]['input'] for sample in samples}

    output_bams = os.path.join(outdir, '{sample_id}', '{sample_id}.bam')
    metrics = os.path.join(outdir, '{sample_id}', '{sample_id}.txt')
    metrics_tar = os.path.join(outdir, '{sample_id}', '{sample_id}.tar')

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name="realign",
        func=realign_bams,
        ctx=helpers.get_default_ctx(),
        args=(
            samples,
            mgd.InputFile("input.bam", 'sample_id', fnames=input_bams,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile("realigned.bam", 'sample_id', template=output_bams,
                           extensions=['.bai', '.tdf'], axes_origin=[]),
            mgd.OutputFile("realigned.txt", 'sample_id', template=metrics,
                           axes_origin=[]),
            mgd.OutputFile("realigned.tar", 'sample_id', template=metrics_tar,
                           axes_origin=[]),
            args['refdir'],
        ),
        kwargs={'single_node': args['single_node']}
    )

    outputted_filenames = helpers.expand_list(
        [output_bams, metrics, metrics_tar], samples, 'sample_id')

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args["out_dir"],
            outputted_filenames,
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'realignment'},
        }
    )

    pyp.run(workflow)
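
# The input YAML parsed above is assumed to map each sample id to a dict with
# an 'input' bam path (hypothetical example):
#
#   SAMPLE_001:
#     input: /data/SAMPLE_001.bam
#   SAMPLE_002:
#     input: /data/SAMPLE_002.bam
#
# giving samples == ['SAMPLE_001', 'SAMPLE_002'] and input_bams mapping each
# sample id to its bam path.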
def align_sample_no_split(fastq_1, fastq_2, out_file, samtools_flagstat,
                          sample_id, lane_id, sample_info, refdir, picard_mem=None):
    ref_genome = config.refdir_data(refdir)['paths']['reference']

    out_bai = out_file + '.bai'

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='align_bwa_mem',
        ctx=helpers.get_default_ctx(memory=8, walltime='48:00', ncpus='8', disk=300),
        func='wgs.workflows.alignment.tasks.align_bwa_mem',
        args=(
            pypeliner.managed.InputFile(fastq_1),
            pypeliner.managed.InputFile(fastq_2),
            ref_genome,
            pypeliner.managed.TempOutputFile('aligned.bam'),
            '8',
            sample_info,
        ),
        kwargs={
            'sample_id': sample_id,
            'lane_id': lane_id,
        })

    workflow.transform(
        name='sort',
        ctx=helpers.get_default_ctx(memory=8, walltime='48:00', ncpus='8', disk=300),
        func='wgs.workflows.alignment.tasks.bam_sort',
        args=(
            pypeliner.managed.TempInputFile('aligned.bam'),
            pypeliner.managed.OutputFile(out_file),
            pypeliner.managed.TempSpace('bam_sort_tempdir'),
        ),
        kwargs={
            'threads': '8',
            'mem': '{}G'.format(picard_mem),
        })

    workflow.transform(
        name='index_and_flagstat',
        func='wgs.workflows.alignment.tasks.index_and_flagstat',
        ctx=helpers.get_default_ctx(memory=4, walltime='24:00', disk=200),
        args=(
            pypeliner.managed.InputFile(out_file),
            pypeliner.managed.OutputFile(out_bai),
            pypeliner.managed.OutputFile(samtools_flagstat),
        ),
    )

    return workflow
def create_somatic_consensus_workflow(
        mutect_snv_vcf,
        strelka_snv_vcf,
        strelka_indel_vcf,
        museq_snv_vcf,
        consensus_maf,
        chromosomes,
        reference_vep,
        normal_id,
        tumour_id,
):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='snv_consensus',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.somatic_calling_consensus.consensus.main',
        args=(
            mgd.InputFile(museq_snv_vcf),
            mgd.InputFile(strelka_snv_vcf),
            mgd.InputFile(mutect_snv_vcf),
            mgd.InputFile(strelka_indel_vcf),
            mgd.TempOutputFile('consensus.vcf'),
            mgd.TempOutputFile('counts.csv'),
            chromosomes,
        ),
    )

    workflow.subworkflow(
        name="consensus_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.TempInputFile('consensus.vcf'),
            mgd.TempOutputFile('consensus.maf'),
            reference_vep,
        ),
        kwargs={
            'normal_id': normal_id,
            'tumour_id': tumour_id,
        })

    workflow.transform(
        name='maf_counts',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.somatic_calling_consensus.tasks.update_maf_counts',
        args=(
            mgd.TempInputFile('consensus.maf'),
            mgd.TempInputFile('counts.csv'),
            mgd.OutputFile(consensus_maf),
        ))

    return workflow
def create_consensus_workflow(
        destruct_breakpoints,
        lumpy_vcf,
        output,
        chromosomes
):
    params = config.default_params('breakpoint_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='parse_lumpy',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.parse_lumpy_task',
        args=(
            mgd.InputFile(lumpy_vcf),
            mgd.TempOutputFile('lumpy.csv'),
            params["parse_lumpy"],
        ),
        kwargs={'chromosomes': chromosomes}
    )

    workflow.transform(
        name='parse_destruct',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.parse_destruct_task',
        args=(
            mgd.InputFile(destruct_breakpoints),
            mgd.TempOutputFile('destruct.csv'),
            params["parse_destruct"],
        ),
        kwargs={'chromosomes': chromosomes}
    )

    workflow.transform(
        name='consensus_breakpoint_calling',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.consensus_calls',
        args=(
            mgd.TempInputFile('destruct.csv'),
            mgd.TempInputFile('lumpy.csv'),
            mgd.OutputFile(output, extensions=['.yaml']),
            params['consensus'],
        ),
    )

    return workflow
def create_lumpy_workflow(lumpy_vcf, tumour_bam=None, normal_bam=None, single_node=False):
    workflow = pypeliner.workflow.Workflow()

    lumpy_job_name = 'run_lumpy'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam)
        normal_disc = mgd.TempInputFile('normal.discordants.sorted.bam')
        normal_split = mgd.TempInputFile('normal.splitters.sorted.bam')
        lumpy_job_name += '_normal'
    else:
        normal_disc = None
        normal_split = None

    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam)
        tumour_disc = mgd.TempInputFile('tumour.discordants.sorted.bam')
        tumour_split = mgd.TempInputFile('tumour.splitters.sorted.bam')
        lumpy_job_name += '_tumour'
    else:
        tumour_disc = None
        tumour_split = None

    if normal_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_normal',
            func=lumpy_preprocess_workflow,
            args=(
                normal_bam,
                mgd.TempOutputFile('normal.discordants.sorted.bam'),
                mgd.TempOutputFile('normal.splitters.sorted.bam'),
            ),
            kwargs={'single_node': single_node})

    if tumour_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_tumour',
            func=lumpy_preprocess_workflow,
            args=(
                tumour_bam,
                mgd.TempOutputFile('tumour.discordants.sorted.bam'),
                mgd.TempOutputFile('tumour.splitters.sorted.bam'),
            ),
            kwargs={'single_node': single_node})

    workflow.transform(
        name=lumpy_job_name,
        ctx=helpers.get_default_ctx(memory=10, disk=500, walltime='72:00'),
        func='wgs.workflows.lumpy.tasks.run_lumpyexpress',
        args=(
            mgd.OutputFile(lumpy_vcf),
            config.default_params('breakpoint_calling')['lumpy_paths'],
        ),
        kwargs={
            'tumour_bam': tumour_bam,
            'tumour_discordants': tumour_disc,
            'tumour_splitters': tumour_split,
            'normal_bam': normal_bam,
            'normal_discordants': normal_disc,
            'normal_splitters': normal_split,
            'docker_image': config.containers('lumpy'),
        })

    return workflow
def create_svaba_workflow(
        tumour_bam,
        normal_bam,
        svaba_vcf,
        reference,
):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='run_svaba',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', ncpus='8', disk=300),
        func='wgs.workflows.svaba.tasks.run_svaba',
        args=(
            mgd.InputFile(tumour_bam),
            mgd.InputFile(normal_bam),
            mgd.TempOutputFile('germline.indel.vcf.gz'),
            mgd.TempOutputFile('germline.sv.vcf.gz'),
            mgd.TempOutputFile('somatic.indel.vcf.gz'),
            mgd.OutputFile(svaba_vcf),
            mgd.TempOutputFile('unfiltered.germline.indel.vcf.gz'),
            mgd.TempOutputFile('unfiltered.germline.sv.vcf.gz'),
            mgd.TempOutputFile('unfiltered.somatic.indel.vcf.gz'),
            mgd.TempOutputFile('unfiltered.somatic.sv.vcf.gz'),
            reference,
            mgd.TempSpace('svaba_tempdir_full'),
        ),
        kwargs={
            'ncores': 8,
        })

    return workflow
def circos_plot(titan_calls, remixt_calls, sample_id, breakpoints,
                circos_plot_remixt, circos_plot_titan):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='prep_titan',
        func='wgs_qc_utils.reader.read_titan.make_for_circos',
        ctx=helpers.get_default_ctx(memory=5),
        args=(
            mgd.InputFile(titan_calls),
            mgd.TempOutputFile("titan_prepped"),
        )
    )

    workflow.transform(
        name='prep_remixt',
        func='wgs_qc_utils.reader.read_remixt.make_for_circos',
        ctx=helpers.get_default_ctx(memory=5),
        args=(
            mgd.InputFile(remixt_calls),
            sample_id,
            mgd.TempOutputFile("remixt_prepped"),
        )
    )

    workflow.transform(
        name='circos_plot',
        func='wgs.workflows.sample_qc.tasks.circos',
        ctx=helpers.get_default_ctx(memory=5),
        args=(
            mgd.TempInputFile("titan_prepped"),
            mgd.TempInputFile("remixt_prepped"),
            sample_id,
            breakpoints,
            mgd.OutputFile(circos_plot_remixt),
            mgd.OutputFile(circos_plot_titan),
            mgd.TempSpace("circos"),
        )
    )

    return workflow
def create_sample_qc_workflow_normal_only(
        sample_id,
        refdir,
        normal_bam,
        roh,
        germline_calls,
        genome_wide_plot,
        normal_coverage,
        chromosomes,
        bins,
        mapping_qual_threshold,
        single_node=False
):
    workflow = pypeliner.workflow.Workflow()

    workflow.subworkflow(
        name='coverage_normal_data',
        func=get_coverage_data,
        args=(
            mgd.InputFile(normal_bam),
            mgd.OutputFile(normal_coverage),
            refdir,
            chromosomes,
            mapping_qual_threshold,
            bins,
        ),
        kwargs={'single_node': single_node}
    )

    workflow.transform(
        name='generate_genome_wide_plot',
        ctx=helpers.get_default_ctx(memory=10),
        func="wgs.workflows.sample_qc.tasks.genome_wide",
        args=(
            sample_id,
            mgd.InputFile(roh),
            mgd.InputFile(germline_calls),
            mgd.InputFile(normal_coverage),
            chromosomes,
            mgd.OutputFile(genome_wide_plot),
        ),
        kwargs={"normal_only": True}
    )

    return workflow
def lumpy_preprocess_workflow(bamfile, discordants_sorted_bam, splitters_sorted_bam,
                              single_node=False):
    workflow = pypeliner.workflow.Workflow()

    if single_node:
        workflow.transform(
            name='run_lumpy_preprocess',
            ctx=helpers.get_default_ctx(memory=10, walltime='96:00', disk=300),
            func='wgs.workflows.lumpy.tasks.run_lumpy_preprocess',
            args=(
                mgd.InputFile(bamfile),
                mgd.OutputFile(discordants_sorted_bam),
                mgd.OutputFile(splitters_sorted_bam),
                mgd.TempSpace("lumpy_preprocess_temp"),
                config.default_params('breakpoint_calling')['lumpy_paths'],
            ),
            kwargs={
                'lumpy_docker_image': config.containers('lumpy'),
                'samtools_docker_image': config.containers('samtools'),
            })
    else:
        workflow.transform(
            name='run_samtools_view_normal',
            ctx=helpers.get_default_ctx(memory=10, walltime='24:00'),
            func='wgs.workflows.lumpy.tasks.run_samtools_view',
            args=(
                mgd.InputFile(bamfile),
                mgd.TempOutputFile('normal.discordants.unsorted.bam'),
            ),
            kwargs={'docker_image': config.containers('samtools')})

        workflow.transform(
            name='run_lumpy_extract_split_reads_bwamem_normal',
            ctx=helpers.get_default_ctx(memory=10, walltime='24:00'),
            func='wgs.workflows.lumpy.tasks.run_lumpy_extract_split_reads_bwamem',
            args=(
                mgd.InputFile(bamfile),
                mgd.TempOutputFile('normal.splitters.unsorted.bam'),
                config.default_params('breakpoint_calling')['lumpy_paths'],
            ),
            kwargs={'docker_image': config.containers('lumpy')})

        workflow.transform(
            name='run_samtools_sort_discordants_normal',
            ctx=helpers.get_default_ctx(memory=10, walltime='24:00'),
            func='wgs.workflows.lumpy.tasks.run_samtools_sort',
            args=(
                mgd.TempInputFile('normal.discordants.unsorted.bam'),
                mgd.OutputFile(discordants_sorted_bam),
            ),
            kwargs={'docker_image': config.containers('samtools')})

        workflow.transform(
            name='run_samtools_sort_splitters_normal',
            ctx=helpers.get_default_ctx(memory=10, walltime='24:00'),
            func='wgs.workflows.lumpy.tasks.run_samtools_sort',
            args=(
                mgd.TempInputFile('normal.splitters.unsorted.bam'),
                mgd.OutputFile(splitters_sorted_bam),
            ),
            kwargs={'docker_image': config.containers('samtools')})

    return workflow
def get_coverage_data(
        input_bam, output, refdir, chromosomes, mapping_qual, bins,
        single_node=False
):
    reference = config.refdir_data(refdir)['paths']['reference']

    workflow = pypeliner.workflow.Workflow()

    if single_node:
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.sample_qc.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            args=(
                reference,
                mgd.TempOutputFile('coverage_bed.bed'),
                chromosomes,
                bins,
            )
        )
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.sample_qc.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
                mapping_qual,
            ),
        )
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('chromosome'),
            value=chromosomes
        )
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.sample_qc.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            axes=('chromosome',),
            args=(
                reference,
                mgd.TempOutputFile('coverage_bed.bed', 'chromosome'),
                mgd.InputInstance('chromosome'),
                bins,
            )
        )
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.sample_qc.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            axes=('chromosome',),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed', 'chromosome'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
                mapping_qual,
            ),
        )

    workflow.transform(
        name='merge_data',
        func='wgs.utils.csvutils.concatenate_csv',
        ctx=helpers.get_default_ctx(memory=5),
        args=(
            mgd.TempInputFile('per_interval.txt', 'chromosome', axes_origin=[]),
            mgd.OutputFile(output),
        )
    )

    return workflow
# Postprocessing variant of get_coverage_data; unlike the sample_qc variant
# above, chromosomes and chrom_sizes come from the reference directory.
def get_coverage_data(input_bam, output, refdir, single_node=False):
    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    chrom_sizes = config.refdir_data(refdir)['paths']['chrom_sizes']

    workflow = pypeliner.workflow.Workflow()

    if single_node:
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.postprocessing.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            args=(
                mgd.TempOutputFile('coverage_bed.bed'),
                chromosomes,
                mgd.InputFile(chrom_sizes),
            ))
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.postprocessing.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
            ),
            kwargs={'docker_image': config.containers('samtools')},
        )
    else:
        workflow.setobj(obj=mgd.OutputChunks('chromosome'), value=chromosomes)

        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.postprocessing.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            axes=('chromosome',),
            args=(
                mgd.TempOutputFile('coverage_bed.bed', 'chromosome'),
                mgd.InputInstance('chromosome'),
                mgd.InputFile(chrom_sizes),
            ))
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.postprocessing.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            axes=('chromosome',),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed', 'chromosome'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
            ),
            kwargs={'docker_image': config.containers('samtools')})

    workflow.transform(
        name='merge_data',
        func='wgs.utils.csvutils.concatenate_csv',
        ctx=helpers.get_default_ctx(memory=5),
        args=(
            mgd.TempInputFile('per_interval.txt', 'chromosome', axes_origin=[]),
            mgd.OutputFile(output),
        ))

    return workflow
def create_titan_workflow(
        tumour_bam, normal_bam, targets, outfile, params, segs, igv_segs,
        parsed, plots, tar_outputs, museq_vcf, sample_id, reference,
        chromosomes, het_positions, map_wig, gc_wig, pygenes_gtf,
        single_node=None
):
    cn_params = config.default_params('copynumber_calling')

    chunks = [(v['num_clusters'], v['ploidy']) for v in cn_params['titan_intervals']]

    targets = mgd.InputFile(targets) if targets else None

    ctx = {'docker_image': config.containers('wgs')}

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('numclusters', 'ploidy'),
        value=chunks,
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.titan.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(memory=5, walltime='2:00'),
        ret=mgd.OutputChunks('interval'),
        args=(
            reference,
            chromosomes,
        ),
        kwargs={'size': cn_params['split_size']}
    )

    if single_node:
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(memory=15, walltime='96:00', ncpus=8),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.OutputFile(museq_vcf),
                reference,
                mgd.InputChunks('interval'),
                cn_params['museq_params'],
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'museq_docker_image': config.containers('mutationseq'),
                'vcftools_docker_image': config.containers('vcftools'),
            }
        )
    else:
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(memory=15, walltime='24:00'),
            axes=('interval',),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                cn_params['museq_params'],
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'docker_image': config.containers('mutationseq'),
            }
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(memory=15, walltime='4:00'),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.OutputFile(museq_vcf),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')}
        )

    workflow.transform(
        name='convert_museq_vcf2counts',
        ctx=helpers.get_default_ctx(memory=10, walltime='4:00'),
        func='wgs.workflows.titan.tasks.convert_museq_vcf2counts',
        args=(
            mgd.InputFile(museq_vcf),
            mgd.TempOutputFile('museq_postprocess.txt'),
            het_positions,
        ),
    )

    workflow.transform(
        name='run_readcounter_tumour',
        ctx=helpers.get_default_ctx(memory=10, walltime='16:00', disk=200),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(tumour_bam, extensions=['.bai']),
            mgd.TempOutputFile('tumour.wig'),
            chromosomes,
            cn_params['readcounter'],
        ),
    )

    workflow.transform(
        name='run_readcounter_normal',
        ctx=helpers.get_default_ctx(memory=10, walltime='16:00', disk=200),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('normal.wig'),
            chromosomes,
            cn_params['readcounter'],
        ),
    )

    workflow.transform(
        name='calc_correctreads_wig',
        ctx=helpers.get_default_ctx(memory=10, walltime='4:00'),
        func='wgs.workflows.titan.tasks.calc_correctreads_wig',
        args=(
            mgd.TempInputFile('tumour.wig'),
            mgd.TempInputFile('normal.wig'),
            targets,
            mgd.TempOutputFile('correct_reads.txt'),
            gc_wig,
            map_wig,
            cn_params['genome_type'],
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    workflow.transform(
        name='run_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(memory=15, walltime='72:00', ncpus='8'),
        func='wgs.workflows.titan.tasks.run_titan',
        args=(
            mgd.TempInputFile('museq_postprocess.txt'),
            mgd.TempInputFile('correct_reads.txt'),
            mgd.TempOutputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_params', 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy'),
            sample_id,
            map_wig,
            cn_params['titan_params'],
            cn_params['genome_type'],
        ),
        kwargs={'docker_image': config.containers('titan'), 'threads': '8'}
    )

    workflow.transform(
        name='plot_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(memory=10, walltime='16:00'),
        func='wgs.workflows.titan.tasks.plot_titan',
        args=(
            mgd.TempInputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_plots', 'numclusters', 'ploidy'),
            mgd.TempSpace("titan_plots_tempdir", 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy'),
        ),
        kwargs={
            'chromosomes': chromosomes,
            'docker_image': config.containers('titan'),
        },
    )

    workflow.transform(
        name='calc_cnsegments_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
        func='wgs.workflows.titan.tasks.calc_cnsegments_titan',
        args=(
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('segs.csv', 'numclusters', 'ploidy'),
            sample_id,
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    workflow.transform(
        name='annot_pygenes',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(memory=10, walltime='4:00'),
        func='wgs.workflows.titan.tasks.annot_pygenes',
        args=(
            mgd.TempInputFile('segs.csv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            pygenes_gtf,
        ),
    )

    workflow.transform(
        name='parse_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
        func='wgs.workflows.titan.tasks.parse_titan_data',
        args=(
            mgd.TempInputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_parsed.csv', 'numclusters', 'ploidy'),
        ),
    )

    # select optimal solution
    workflow.transform(
        name="select_optimal_solution",
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
        func="wgs.workflows.titan.tasks.select_optimal_solution",
        args=(
            chunks,
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(segs, extensions=['.yaml']),
            mgd.OutputFile(igv_segs, extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
            mgd.OutputFile(outfile, extensions=['.yaml']),
            mgd.OutputFile(parsed, extensions=['.yaml']),
            mgd.OutputFile(plots),
        )
    )

    workflow.transform(
        name='tar_all_data',
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
        func="wgs.workflows.titan.tasks.tar_all_data",
        args=(
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(tar_outputs),
            mgd.TempSpace("titan_all_parameters_data"),
            chunks,
        )
    )

    return workflow
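
# The 'numclusters'/'ploidy' axes above fan the titan jobs out over a small
# grid of candidate solutions; a sketch of how the chunks are derived,
# assuming titan_intervals entries shaped like the dicts below (the real
# values come from the copynumber_calling defaults):
def _example_titan_chunks():
    cn_params = {'titan_intervals': [
        {'num_clusters': 1, 'ploidy': 2},
        {'num_clusters': 2, 'ploidy': 2},
        {'num_clusters': 1, 'ploidy': 4},
    ]}
    chunks = [(v['num_clusters'], v['ploidy']) for v in cn_params['titan_intervals']]
    assert chunks == [(1, 2), (2, 2), (1, 4)]
    return chunks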
def create_museq_workflow(
        snv_vcf, museqportrait_pdf, reference, chromosomes,
        thousand_genomes=None, dbsnp=None, germline_refdata=None,
        tumour_bam=None, normal_bam=None, single_node=None
):
    name = 'run_museq'
    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam, extensions=['.bai'])
        name += '_tumour'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam, extensions=['.bai'])
        name += '_normal'
    single = name != 'run_museq_tumour_normal'

    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(memory=5, walltime='1:00'),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name=name,
            ctx=helpers.get_default_ctx(memory=15, walltime='48:00', ncpus='8', disk=600),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.TempOutputFile('merged.vcf'),
                reference,
                mgd.InputChunks('interval'),
                params['museq_params'],
            ),
            kwargs={
                'tumour_bam': tumour_bam,
                'normal_bam': normal_bam,
                'museq_docker_image': config.containers('mutationseq'),
                'vcftools_docker_image': config.containers('vcftools'),
            })
    else:
        workflow.transform(
            name=name,
            ctx=helpers.get_default_ctx(memory=15, walltime='24:00'),
            axes=('interval',),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                params['museq_params'],
            ),
            kwargs={
                'tumour_bam': tumour_bam,
                'normal_bam': normal_bam,
                'docker_image': config.containers('mutationseq'),
            })

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='run_museqportrait',
        ctx=helpers.get_default_ctx(memory=5, walltime='8:00'),
        func='wgs.workflows.mutationseq.tasks.run_museqportrait',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(museqportrait_pdf),
            mgd.TempOutputFile('museqportrait.txt'),
            mgd.TempOutputFile('museqportrait.log'),
            single,
        ),
        kwargs={
            'docker_image': config.containers('mutationseq'),
            'thousand_genomes': thousand_genomes,
            'dbsnp': dbsnp,
            'germline_refdata': germline_refdata,
            'germline_plot_threshold': params['germline_portrait_threshold'],
        })

    return workflow
def variant_calling_workflow(args):
    inputs = helpers.load_yaml(args['input_yaml'])

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    var_dir = os.path.join(args['out_dir'], 'variants')

    museq_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_museq_paired_annotated.vcf.gz')
    museq_ss_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_museq_single_annotated.vcf.gz')
    samtools_germline_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_samtools_germline.vcf.gz')
    samtools_roh = os.path.join(var_dir, '{sample_id}', '{sample_id}_roh.csv')
    strelka_snv_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_strelka_indel_annotated.vcf.gz')
    museq_paired_pdf = os.path.join(var_dir, '{sample_id}', '{sample_id}_paired_museqportrait.pdf')
    museq_single_pdf = os.path.join(var_dir, '{sample_id}', '{sample_id}_single_museqportrait.pdf')

    somatic_csv = os.path.join(var_dir, '{sample_id}', '{sample_id}_consensus_somatic.csv.gz')
    somatic_snpeff = os.path.join(var_dir, '{sample_id}', '{sample_id}_consensus_somatic_snpeff.csv.gz')
    somatic_ma = os.path.join(var_dir, '{sample_id}', '{sample_id}_consensus_somatic_ma.csv.gz')
    somatic_ids = os.path.join(var_dir, '{sample_id}', '{sample_id}_consensus_somatic_ids.csv.gz')

    indel_csv = os.path.join(var_dir, '{sample_id}', '{sample_id}_indel.csv.gz')
    indel_snpeff = os.path.join(var_dir, '{sample_id}', '{sample_id}_indel_snpeff.csv.gz')
    indel_ma = os.path.join(var_dir, '{sample_id}', '{sample_id}_indel_ma.csv.gz')
    indel_ids = os.path.join(var_dir, '{sample_id}', '{sample_id}_indel_ids.csv.gz')

    germline_csv = os.path.join(var_dir, '{sample_id}', '{sample_id}_germline.csv.gz')
    germline_snpeff = os.path.join(var_dir, '{sample_id}', '{sample_id}_germline_snpeff.csv.gz')
    germline_ma = os.path.join(var_dir, '{sample_id}', '{sample_id}_germline_ma.csv.gz')
    germline_ids = os.path.join(var_dir, '{sample_id}', '{sample_id}_germline_ids.csv.gz')

    pyp = pypeliner.app.Pypeline(config=args)

    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx(docker_image=config.containers('wgs')))

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    if not all(tumours.values()):
        workflow.subworkflow(
            name='variant_calling',
            func=call_germlines_only,
            args=(
                samples,
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                              extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile('museq_ss', 'sample_id', template=museq_ss_vcf, axes_origin=[]),
                mgd.OutputFile('samtools_germline', 'sample_id',
                               template=samtools_germline_vcf, axes_origin=[]),
                mgd.OutputFile('samtools_roh', 'sample_id', template=samtools_roh, axes_origin=[]),
                mgd.OutputFile('museq_single_pdf', 'sample_id',
                               template=museq_single_pdf, axes_origin=[]),
                args['refdir'],
            ),
            kwargs={'single_node': args['single_node']})
    else:
        workflow.subworkflow(
            name='variant_calling',
            func=call_variants,
            args=(
                samples,
                mgd.OutputFile('somatic_csv', 'sample_id', template=somatic_csv, axes_origin=[]),
                mgd.OutputFile('somatic_snpeff', 'sample_id', template=somatic_snpeff, axes_origin=[]),
                mgd.OutputFile('somatic_ma', 'sample_id', template=somatic_ma, axes_origin=[]),
                mgd.OutputFile('somatic_ids', 'sample_id', template=somatic_ids, axes_origin=[]),
                mgd.OutputFile('indel_csv', 'sample_id', template=indel_csv, axes_origin=[]),
                mgd.OutputFile('indel_snpeff', 'sample_id', template=indel_snpeff, axes_origin=[]),
                mgd.OutputFile('indel_ma', 'sample_id', template=indel_ma, axes_origin=[]),
                mgd.OutputFile('indel_ids', 'sample_id', template=indel_ids, axes_origin=[]),
                mgd.OutputFile('germline_csv', 'sample_id', template=germline_csv, axes_origin=[]),
                mgd.OutputFile('germline_snpeff', 'sample_id', template=germline_snpeff, axes_origin=[]),
                mgd.OutputFile('germline_ma', 'sample_id', template=germline_ma, axes_origin=[]),
                mgd.OutputFile('germline_ids', 'sample_id', template=germline_ids, axes_origin=[]),
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                              extensions=['.bai'], axes_origin=[]),
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                              extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile('museq', 'sample_id', template=museq_vcf, axes_origin=[]),
                mgd.OutputFile('museq_ss', 'sample_id', template=museq_ss_vcf, axes_origin=[]),
                mgd.OutputFile('samtools_germline', 'sample_id',
                               template=samtools_germline_vcf, axes_origin=[]),
                mgd.OutputFile('roh_calls', 'sample_id', template=samtools_roh, axes_origin=[]),
                mgd.OutputFile('strelka_snv', 'sample_id', template=strelka_snv_vcf, axes_origin=[]),
                mgd.OutputFile('strelka_indel', 'sample_id', template=strelka_indel_vcf, axes_origin=[]),
                mgd.OutputFile('museq_paired_pdf', 'sample_id', template=museq_paired_pdf, axes_origin=[]),
                mgd.OutputFile('museq_single_pdf', 'sample_id', template=museq_single_pdf, axes_origin=[]),
                args['refdir'],
            ),
            kwargs={
                'single_node': args['single_node'],
                'is_exome': args['is_exome'],
            })

    filenames = [
        somatic_csv, somatic_snpeff, somatic_ma, somatic_ids,
        indel_csv, indel_snpeff, indel_ma, indel_ids,
        germline_csv, germline_snpeff, germline_ma, germline_ids,
        museq_vcf, museq_ss_vcf, strelka_snv_vcf, strelka_indel_vcf,
        museq_paired_pdf, museq_single_pdf,
    ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            outputted_filenames,
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'variant_calling'},
        })

    pyp.run(workflow)
def create_annotation_workflow(
        input_vcf,
        annotated_vcf,
        snpeff,
        mutationassessor,
        dbsnp,
        thousand_genomes,
        cosmic,
        mappability,
):
    databases = {
        'snpeff_params': {'snpeff_config': snpeff},
        'mutation_assessor_params': {'db': mutationassessor},
        'dbsnp_params': {'db': dbsnp},
        'thousandgen_params': {'db': thousand_genomes},
        'cosmic_params': {'db': cosmic},
        'mappability_ref': mappability,
    }

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='run_snpeff',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.vcf_annotation.tasks.run_snpeff',
        args=(
            mgd.InputFile(input_vcf),
            mgd.TempOutputFile('annotSnpEff.vcf'),
            databases,
        ),
    )

    workflow.transform(
        name='run_mutation_assessor',
        ctx=helpers.get_default_ctx(memory=10, walltime='8:00'),
        func='wgs.workflows.vcf_annotation.tasks.run_mutation_assessor',
        args=(
            mgd.TempInputFile('annotSnpEff.vcf'),
            mgd.TempOutputFile('annotMA.vcf'),
            databases,
        ),
    )

    workflow.transform(
        name='run_DBSNP',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.vcf_annotation.tasks.run_DBSNP',
        args=(
            mgd.TempInputFile('annotMA.vcf'),
            mgd.TempOutputFile('flagDBsnp.vcf'),
            databases,
        ),
    )

    workflow.transform(
        name='run_1000gen',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.vcf_annotation.tasks.run_1000gen',
        args=(
            mgd.TempInputFile('flagDBsnp.vcf'),
            mgd.TempOutputFile('flag1000gen.vcf'),
            databases,
        ),
    )

    workflow.transform(
        name='run_cosmic',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.vcf_annotation.tasks.run_cosmic',
        args=(
            mgd.TempInputFile('flag1000gen.vcf'),
            mgd.TempOutputFile('cosmic.vcf'),
            databases,
        ),
    )

    workflow.transform(
        name='low_mappability_flag',
        func='wgs.workflows.vcf_annotation.tasks.flag_low_mappability',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        args=(
            mgd.TempInputFile('cosmic.vcf'),
            mgd.TempOutputFile('low_mapp.vcf'),
            databases['mappability_ref'],
        ),
    )

    workflow.transform(
        name='finalize',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('low_mapp.vcf'),
            mgd.OutputFile(annotated_vcf, extensions=['.csi', '.tbi']),
        ),
    )

    return workflow
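
# Wiring sketch (hypothetical reference paths): the annotators above run as a
# fixed chain, each reading the previous step's temp VCF: snpEff -> mutation
# assessor -> dbSNP -> 1000 genomes -> COSMIC -> low-mappability flag ->
# finalise.
def _example_annotation_workflow():
    return create_annotation_workflow(
        '/results/SA123_museq.vcf.gz',
        '/results/SA123_museq_annotated.vcf.gz',
        '/refdata/snpEff.config',
        '/refdata/mutation_assessor.vcf.gz',
        '/refdata/dbsnp.vcf.gz',
        '/refdata/thousand_genomes.vcf.gz',
        '/refdata/cosmic.vcf.gz',
        '/refdata/mask_regions.bed',
    )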
def create_postprocessing_workflow(
        normal_bam,
        tumour_bam,
        titan,
        remixt,
        breakpoints_consensus,
        roh,
        germline_calls,
        somatic_calls,
        circos_plot_remixt,
        circos_plot_titan,
        genome_wide_plot,
        refdir,
        sample_id,
        single_node=False
):
    refdir_paths = config.refdir_data(refdir)['paths']
    refdir_params = config.refdir_data(refdir)['params']

    ideogram = refdir_paths["ideogram"]

    titan_calls = titan[sample_id]
    remixt_calls = remixt[sample_id]
    sv_calls = breakpoints_consensus[sample_id]
    roh_calls = roh[sample_id]
    germline_vcf = germline_calls[sample_id]
    somatic_calls = somatic_calls[sample_id]
    chromosomes = refdir_params['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.subworkflow(
        name='coverage_normal_data',
        func=get_coverage_data,
        args=(
            mgd.InputFile(normal_bam),
            mgd.TempOutputFile('normal_coverage'),
            refdir,
        ),
        kwargs={'single_node': single_node})

    workflow.subworkflow(
        name='coverage_tumour_data',
        func=get_coverage_data,
        args=(
            mgd.InputFile(tumour_bam),
            mgd.TempOutputFile('tumour_coverage'),
            refdir,
        ),
        kwargs={'single_node': single_node})

    workflow.transform(
        name='parse_roh',
        ctx=helpers.get_default_ctx(memory=5),
        func="wgs.workflows.postprocessing.tasks.parse_roh",
        args=(
            mgd.InputFile(roh_calls),
            mgd.TempOutputFile("ROH_parsed"),
        ),
    )

    if remixt_calls:
        workflow.transform(
            name='generate_genome_wide_plot',
            ctx=helpers.get_default_ctx(memory=10),
            func="wgs.workflows.postprocessing.tasks.genome_wide",
            args=(
                mgd.InputFile(titan_calls),
                mgd.TempInputFile("ROH_parsed"),
                mgd.InputFile(germline_vcf),
                mgd.InputFile(somatic_calls),
                mgd.TempInputFile('tumour_coverage'),
                mgd.TempInputFile('normal_coverage'),
                mgd.InputFile(sv_calls),
                mgd.InputFile(ideogram),
                chromosomes,
                mgd.OutputFile(genome_wide_plot),
            ),
            kwargs={
                "remixt": mgd.InputFile(remixt_calls),
                "remixt_label": sample_id,
            })

        workflow.transform(
            name='generate_circos_plot',
            ctx=helpers.get_default_ctx(memory=10),
            func="wgs.workflows.postprocessing.tasks.circos",
            args=(
                mgd.InputFile(titan_calls),
                sample_id,
                mgd.InputFile(sv_calls),
                mgd.OutputFile(circos_plot_remixt),
                mgd.OutputFile(circos_plot_titan),
                mgd.TempSpace('circos'),
            ),
            kwargs={
                'docker_image': config.containers('circos'),
                'remixt_calls': mgd.InputFile(remixt_calls),
            },
        )
    else:
        workflow.transform(
            name='generate_genome_wide_plot',
            ctx=helpers.get_default_ctx(memory=10),
            func="wgs.workflows.postprocessing.tasks.genome_wide",
            args=(
                mgd.InputFile(titan_calls),
                mgd.TempInputFile("ROH_parsed"),
                mgd.InputFile(germline_vcf),
                mgd.InputFile(somatic_calls),
                mgd.TempInputFile('tumour_coverage'),
                mgd.TempInputFile('normal_coverage'),
                mgd.InputFile(sv_calls),
                mgd.InputFile(ideogram),
                chromosomes,
                mgd.OutputFile(genome_wide_plot),
            ),
        )

        workflow.transform(
            name='generate_circos_plot',
            ctx=helpers.get_default_ctx(memory=10),
            func="wgs.workflows.postprocessing.tasks.circos",
            args=(
                mgd.InputFile(titan_calls),
                sample_id,
                mgd.InputFile(sv_calls),
                mgd.OutputFile(circos_plot_remixt),
                mgd.OutputFile(circos_plot_titan),
                mgd.TempSpace('circos'),
            ),
            kwargs={'docker_image': config.containers('circos')})

    return workflow
def collect_bam_metrics(
        bam, markdups_metrics, sample_id, refdir, metrics,
        picard_insert_metrics, picard_insert_pdf, flagstat_metrics,
        picard_gc_metrics, picard_gc_summary, picard_gc_pdf,
        picard_wgs_metrics, bam_tdf, picard_mem=8
):
    '''
    Calculates per-bam metrics:
      1. picard insert metrics
      2. picard GC metrics
      3. picard wgs metrics
      4. samtools flagstat and an igvtools TDF coverage track

    :param bam: input bam
    :param markdups_metrics: markdups metrics produced upstream
    :param sample_id: sample id used in the collected metrics table
    :param refdir: reference directory
    :param metrics: output csv containing the collected metrics
    :param picard_mem: memory (GB) for the picard jobs
    '''
    ref_genome = config.refdir_data(refdir)['paths']['reference']

    picard_wgs_params = config.default_params('alignment')['picard_wgs_params']

    reftype = config.refdir_data(refdir)['params']['reference_type']

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="calc_picard_insert_metrics",
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        func='wgs.workflows.alignment.tasks.bam_collect_insert_metrics',
        args=(
            mgd.InputFile(bam),
            mgd.OutputFile(flagstat_metrics),
            mgd.OutputFile(picard_insert_metrics),
            mgd.OutputFile(picard_insert_pdf),
            mgd.TempSpace('picard_insert'),
        ),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name="calc_picard_gc_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_gc_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        args=(
            mgd.InputFile(bam),
            ref_genome,
            mgd.OutputFile(picard_gc_metrics),
            mgd.OutputFile(picard_gc_summary),
            mgd.OutputFile(picard_gc_pdf),
            mgd.TempSpace('picard_gc'),
        ),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name="calc_picard_wgs_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_wgs_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        args=(
            mgd.InputFile(bam),
            ref_genome,
            mgd.OutputFile(picard_wgs_metrics),
            picard_wgs_params,
            mgd.TempSpace('picard_wgs'),
        ),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name='igvtools_tdf',
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        func='wgs.workflows.alignment.tasks.get_igvtools_count',
        args=(
            pypeliner.managed.InputFile(bam),
            pypeliner.managed.OutputFile(bam_tdf),
            reftype,
        ),
    )

    workflow.transform(
        name='collect_metrics',
        func='wgs.workflows.alignment.tasks.bam_collect_all_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='4:00', disk=400),
        args=(
            mgd.InputFile(flagstat_metrics),
            mgd.InputFile(picard_insert_metrics),
            mgd.InputFile(picard_wgs_metrics),
            mgd.InputFile(markdups_metrics),
            mgd.OutputFile(metrics, extensions=['.yaml']),
            sample_id,
        ),
        kwargs={
            'main_dtypes': dtypes()['metrics'],
            'insert_dtypes': dtypes()['insert_metrics'],
        })

    return workflow
def align_sample_split(fastq_1, fastq_2, out_file, samtools_flagstat,
                       sample_id, lane_id, sample_info, refdir, picard_mem=2):
    ref_genome = config.refdir_data(refdir)['paths']['reference']

    split_size = config.default_params('alignment')['split_size']

    out_bai = out_file + '.bai'

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='split_fastq_1',
        ctx=helpers.get_default_ctx(memory=4, walltime='24:00'),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            pypeliner.managed.InputFile(fastq_1),
            pypeliner.managed.TempOutputFile('read_1', 'split'),
            split_size,
        ),
    )

    workflow.transform(
        name='split_fastq_2',
        ctx=helpers.get_default_ctx(memory=4, walltime='24:00'),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            pypeliner.managed.InputFile(fastq_2),
            pypeliner.managed.TempOutputFile('read_2', 'split', axes_origin=[]),
            split_size,
        ),
    )

    workflow.transform(
        name='align_bwa_mem',
        axes=('split',),
        ctx=helpers.get_default_ctx(memory=8, walltime='16:00', ncpus=8),
        func='wgs.workflows.alignment.tasks.align_bwa_mem',
        args=(
            pypeliner.managed.TempInputFile('read_1', 'split'),
            pypeliner.managed.TempInputFile('read_2', 'split'),
            ref_genome,
            pypeliner.managed.TempOutputFile('aligned.bam', 'split'),
            '8',
            sample_info,
        ),
        kwargs={
            'sample_id': sample_id,
            'lane_id': lane_id,
        })

    workflow.transform(
        name='sort',
        axes=('split',),
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        func='wgs.workflows.alignment.tasks.bam_sort',
        args=(
            pypeliner.managed.TempInputFile('aligned.bam', 'split'),
            pypeliner.managed.TempOutputFile('sorted.bam', 'split'),
            pypeliner.managed.TempSpace('bam_sort_by_split', 'split'),
        ),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name='merge',
        ctx=helpers.get_default_ctx(memory=8, walltime='72:00'),
        func="wgs.workflows.alignment.tasks.merge_bams",
        args=(
            pypeliner.managed.TempInputFile('sorted.bam', 'split'),
            pypeliner.managed.OutputFile(out_file),
            pypeliner.managed.TempSpace('bam_merge_by_split'),
        ),
        kwargs={'mem': picard_mem})

    workflow.commandline(
        name='index',
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        args=(
            'samtools', 'index',
            pypeliner.managed.InputFile(out_file),
            pypeliner.managed.OutputFile(out_bai),
        ),
    )

    workflow.commandline(
        name='flagstat',
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        args=(
            'samtools', 'flagstat',
            pypeliner.managed.InputFile(out_file),
            '>',
            pypeliner.managed.OutputFile(samtools_flagstat),
        ),
    )

    return workflow
def align_samples(
        fastqs_r1,
        fastqs_r2,
        bam_outputs,
        metrics_outputs,
        metrics_tar,
        bam_tdf,
        sample_info,
        refdir,
        single_node=False,
        picard_mem=8,
):
    if single_node:
        align_func = align_sample_no_split
    else:
        align_func = align_sample_split

    if not isinstance(bam_outputs, dict):
        samples = sorted(set([v[0] for v in fastqs_r1.keys()]))
        bam_outputs = {sample: bam_outputs[sample] for sample in samples}
        metrics_outputs = {sample: metrics_outputs[sample] for sample in samples}
        metrics_tar = {sample: metrics_tar[sample] for sample in samples}
        bam_tdf = {sample: bam_tdf[sample] for sample in samples}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'sample_id', axes_origin=[]),
        value=sample_info)

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'lane_id'),
        value=list(fastqs_r1.keys()),
    )

    workflow.subworkflow(
        name='fastqc_workflow',
        func=fastqc_workflow,
        axes=('sample_id', 'lane_id'),
        args=(
            mgd.InputFile('input.r1.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r1),
            mgd.InputFile('input.r2.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r2),
            mgd.TempOutputFile('fastqc_R1.html', 'sample_id', 'lane_id'),
            mgd.TempOutputFile('fastqc_R1.pdf', 'sample_id', 'lane_id'),
            mgd.TempOutputFile('fastqc_R2.html', 'sample_id', 'lane_id'),
            mgd.TempOutputFile('fastqc_R2.pdf', 'sample_id', 'lane_id'),
        ))

    workflow.subworkflow(
        name='align_samples',
        func=align_func,
        axes=('sample_id', 'lane_id'),
        args=(
            mgd.InputFile('input.r1.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r1),
            mgd.InputFile('input.r2.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r2),
            mgd.TempOutputFile('aligned_lanes.bam', 'sample_id', 'lane_id'),
            mgd.TempOutputFile('samtools_flagstat.txt', 'sample_id', 'lane_id'),
            mgd.InputInstance("sample_id"),
            mgd.InputInstance("lane_id"),
            mgd.TempInputObj('sampleinfo', 'sample_id'),
            refdir,
        ),
        kwargs={'picard_mem': picard_mem})

    workflow.transform(
        name='merge_tumour_lanes',
        ctx=helpers.get_default_ctx(memory=10, walltime='24:00', disk=400),
        func="wgs.workflows.alignment.tasks.merge_bams",
        axes=('sample_id',),
        args=(
            mgd.TempInputFile('aligned_lanes.bam', 'sample_id', 'lane_id'),
            mgd.TempOutputFile('merged_lanes.bam', 'sample_id', extensions=['.bai']),
            mgd.TempSpace('merge_tumour_lanes_tempdir'),
        ),
        kwargs={'mem': picard_mem})

    workflow.transform(
        name='markdups',
        ctx=helpers.get_default_ctx(memory=12, walltime='24:00', ncpus=1, disk=300),
        func='wgs.workflows.alignment.tasks.markdups',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile('merged_lanes.bam', 'sample_id', extensions=['.bai']),
            mgd.OutputFile('markdups.bam', 'sample_id', fnames=bam_outputs, extensions=['.bai']),
            mgd.TempOutputFile('markdups_metrics', 'sample_id'),
            pypeliner.managed.TempSpace("temp_markdups", "sample_id"),
        ),
        kwargs={
            'mem': '{}G'.format(picard_mem),
        })

    workflow.subworkflow(
        name='metrics',
        func=collect_bam_metrics,
        axes=('sample_id',),
        args=(
            mgd.InputFile('markdups.bam', 'sample_id', fnames=bam_outputs, extensions=['.bai']),
            mgd.TempInputFile('markdups_metrics', 'sample_id'),
            mgd.InputInstance('sample_id'),
            refdir,
            mgd.OutputFile('metrics_output', 'sample_id', fnames=metrics_outputs,
                           extensions=['.yaml']),
            mgd.TempOutputFile('picard_insert_metrics.txt', 'sample_id'),
            mgd.TempOutputFile('picard_insert_metrics.pdf', 'sample_id'),
            mgd.TempOutputFile('flagstat_metrics.txt', 'sample_id'),
            mgd.TempOutputFile('picard_gc_metrics.txt', 'sample_id'),
            mgd.TempOutputFile('picard_gc_summary.txt', 'sample_id'),
            mgd.TempOutputFile('picard_gc.pdf', 'sample_id'),
            mgd.TempOutputFile('picard_wgs_metrics.txt', 'sample_id'),
            mgd.OutputFile('out.bam.tdf', 'sample_id', fnames=bam_tdf),
        ))

    workflow.transform(
        name='tar',
        func='wgs.utils.helpers.make_tar_from_files',
        axes=('sample_id',),
        args=(
            mgd.OutputFile('metrics_tar', 'sample_id', fnames=metrics_tar),
            [
                mgd.TempInputFile('picard_insert_metrics.txt', 'sample_id'),
                mgd.TempInputFile('picard_insert_metrics.pdf', 'sample_id'),
                mgd.TempInputFile('flagstat_metrics.txt', 'sample_id'),
                mgd.TempInputFile('picard_gc_metrics.txt', 'sample_id'),
                mgd.TempInputFile('picard_gc_summary.txt', 'sample_id'),
                mgd.TempInputFile('picard_gc.pdf', 'sample_id'),
                mgd.TempInputFile('picard_wgs_metrics.txt', 'sample_id'),
                mgd.TempInputFile('markdups_metrics', 'sample_id'),
                mgd.TempInputFile('fastqc_R1.html', 'sample_id', 'lane_id'),
                mgd.TempInputFile('fastqc_R1.pdf', 'sample_id', 'lane_id'),
                mgd.TempInputFile('fastqc_R2.html', 'sample_id', 'lane_id'),
                mgd.TempInputFile('fastqc_R2.pdf', 'sample_id', 'lane_id'),
            ],
            mgd.TempSpace('wgs_metrics'),
        ))

    return workflow
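
# align_samples expects the fastq dicts keyed by (sample_id, lane_id) tuples
# and each per-sample output dict keyed by sample_id; a minimal sketch with
# hypothetical paths (the sample_info fields are whatever align_bwa_mem
# expects for read-group construction):
def _example_align_samples():
    fastqs_r1 = {('SA123', 'L001'): '/data/SA123_L001_R1.fastq.gz'}
    fastqs_r2 = {('SA123', 'L001'): '/data/SA123_L001_R2.fastq.gz'}
    sample_info = {'SA123': {'library_id': 'LIB01', 'center': 'CENTER'}}
    return align_samples(
        fastqs_r1, fastqs_r2,
        {'SA123': '/results/SA123.bam'},
        {'SA123': '/results/SA123_metrics.csv.gz'},
        {'SA123': '/results/SA123_metrics.tar'},
        {'SA123': '/results/SA123.bam.tdf'},
        sample_info, '/refdata',
    )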
def create_consensus_workflow(
        museq_germline,
        museq_snv,
        strelka_snv,
        strelka_indel,
        somatic_calls,
        somatic_snpeff,
        somatic_ma,
        somatic_ids,
        indel_calls,
        indel_snpeff,
        indel_ma,
        indel_ids,
        germline_calls,
        germline_snpeff,
        germline_ma,
        germline_ids,
        refdir
):
    params = config.default_params('variant_calling')
    chromosomes = config.refdir_data(refdir)['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='parse_museq_germlines',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(
            mgd.InputFile(museq_germline, extensions=['.csi', '.tbi']),
            mgd.OutputFile(germline_calls, extensions=['.yaml']),
            mgd.OutputFile(germline_snpeff, extensions=['.yaml']),
            mgd.OutputFile(germline_ma, extensions=['.yaml']),
            mgd.OutputFile(germline_ids, extensions=['.yaml']),
            params["parse_museq"],
            chromosomes,
            mgd.TempSpace("tempdir_parse_germlines"),
        ),
    )

    workflow.transform(
        name='parse_strelka_indel',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(
            mgd.InputFile(strelka_indel, extensions=['.csi', '.tbi']),
            mgd.OutputFile(indel_calls, extensions=['.yaml']),
            mgd.OutputFile(indel_snpeff, extensions=['.yaml']),
            mgd.OutputFile(indel_ma, extensions=['.yaml']),
            mgd.OutputFile(indel_ids, extensions=['.yaml']),
            params["parse_strelka"],
            chromosomes,
            mgd.TempSpace("tempdir_strelka_indel"),
        ),
    )

    workflow.transform(
        name='parse_museq_snv',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(
            mgd.InputFile(museq_snv, extensions=['.csi', '.tbi']),
            mgd.TempOutputFile('museq_snv.csv', extensions=['.yaml']),
            mgd.TempOutputFile('museq_snpeff.csv', extensions=['.yaml']),
            mgd.TempOutputFile('museq_ma.csv', extensions=['.yaml']),
            mgd.TempOutputFile('museq_ids.csv', extensions=['.yaml']),
            params["parse_museq"],
            chromosomes,
            mgd.TempSpace("tempdir_parse_museq_snv"),
        ),
    )

    workflow.transform(
        name='parse_strelka_snv',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(
            mgd.InputFile(strelka_snv, extensions=['.csi', '.tbi']),
            mgd.TempOutputFile('strelka_snv.csv', extensions=['.yaml']),
            mgd.TempOutputFile('strelka_snv_snpeff.csv', extensions=['.yaml']),
            mgd.TempOutputFile('strelka_snv_ma.csv', extensions=['.yaml']),
            mgd.TempOutputFile('strelka_snv_ids.csv', extensions=['.yaml']),
            params["parse_strelka"],
            chromosomes,
            mgd.TempSpace("tempdir_parse_strelka_snv"),
        ),
    )

    workflow.transform(
        name='merge_snvs',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_snv.csv', extensions=['.yaml']),
            ],
            mgd.OutputFile(somatic_calls, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_snpeff',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv_snpeff.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_snpeff.csv', extensions=['.yaml']),
            ],
            mgd.OutputFile(somatic_snpeff, extensions=['.yaml']),
        ),
        kwargs={'on': ['chrom', 'pos']})

    workflow.transform(
        name='merge_ma',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv_ma.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_ma.csv', extensions=['.yaml']),
            ],
            mgd.OutputFile(somatic_ma, extensions=['.yaml']),
        ),
        kwargs={'on': ['chrom', 'pos']})

    workflow.transform(
        name='merge_ids',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv_ids.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_ids.csv', extensions=['.yaml']),
            ],
            mgd.OutputFile(somatic_ids, extensions=['.yaml']),
        ),
        kwargs={'on': ['chrom', 'pos']})

    return workflow
def create_strelka_workflow(
        normal_bam_file,
        tumour_bam_file,
        snv_vcf_file,
        snv_maf_file,
        indel_vcf_file,
        indel_maf_file,
        reference,
        reference_vep,
        chromosomes,
        normal_id,
        tumour_id,
        single_node=False,
        is_exome=False
):
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'))

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    workflow.transform(
        name='count_fasta_bases',
        func="wgs.workflows.strelka.tasks.count_fasta_bases",
        args=(
            reference,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        func="wgs.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(
            pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
            chromosomes,
        ))

    if single_node:
        workflow.transform(
            name='strelka_one_node',
            func="wgs.workflows.strelka.tasks.strelka_one_node",
            args=(
                pypeliner.managed.InputFile(normal_bam_file, extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempOutputFile('snvs.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempSpace('call_genome_segment_tmp'),
                mgd.InputChunks('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': is_exome,
            })
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions',),
            func="wgs.workflows.strelka.tasks.get_chromosome_depth",
            args=(
                mgd.InputInstance('regions'),
                pypeliner.managed.InputFile(normal_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func="wgs.workflows.strelka.tasks.merge_chromosome_depths",
            args=(
                mgd.TempInputFile('chrom_depth.txt', 'regions', axes_origin=[]),
                mgd.TempOutputFile('merged_chrom_depth.txt'),
            ))

        workflow.transform(
            name='call_genome_segment',
            axes=('regions',),
            func="wgs.workflows.strelka.tasks.call_genome_segment",
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                pypeliner.managed.InputFile(normal_bam_file, extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': False,
            })

        workflow.transform(
            name='merge_indels',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(
                mgd.TempInputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('indels.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempSpace("indels_merge"),
            ),
        )

        workflow.transform(
            name='merge_snvs',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(
                mgd.TempInputFile('snvs.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempSpace("snvs_merge"),
            ),
        )

    workflow.transform(
        name='bcftools_normalize_snv',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('snvs.vcf.gz'),
            mgd.TempOutputFile('normalized_snvs.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_normalize_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs.vcf'),
            mgd.TempOutputFile('normalized_snvs_finalize.vcf.gz', extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='bcftools_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('indels.vcf.gz'),
            mgd.TempOutputFile('normalized_indels.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_indels.vcf'),
            mgd.TempOutputFile('normalized_indels_finalize.vcf.gz', extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_indels_finalize.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs_finalize.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(
        name="strelka_snv_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_maf_file),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id,
        })

    workflow.subworkflow(
        name="strelka_indel_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_maf_file),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id,
        })

    return workflow
def call_variants(
        samples, somatic_calls, somatic_snpeff, somatic_ma, somatic_ids,
        indel_calls, indel_snpeff, indel_ma, indel_ids,
        germline_calls, germline_snpeff, germline_ma, germline_ids,
        tumours, normals, museq_vcf, museq_ss_vcf, samtools_germlines_vcf,
        roh_calls, strelka_snv_vcf, strelka_indel_vcf,
        museq_paired_pdf, museq_single_pdf, refdir,
        single_node=False, is_exome=False):
    # restrict every per-sample filename map to the requested samples
    strelka_snv_vcf = {sampid: strelka_snv_vcf[sampid] for sampid in samples}
    strelka_indel_vcf = {sampid: strelka_indel_vcf[sampid] for sampid in samples}
    museq_vcf = {sampid: museq_vcf[sampid] for sampid in samples}
    museq_ss_vcf = {sampid: museq_ss_vcf[sampid] for sampid in samples}
    samtools_germlines_vcf = {sampid: samtools_germlines_vcf[sampid] for sampid in samples}
    roh_calls = {sampid: roh_calls[sampid] for sampid in samples}
    museq_paired_pdf = {sampid: museq_paired_pdf[sampid] for sampid in samples}
    museq_single_pdf = {sampid: museq_single_pdf[sampid] for sampid in samples}
    somatic_calls = {sampid: somatic_calls[sampid] for sampid in samples}
    somatic_snpeff = {sampid: somatic_snpeff[sampid] for sampid in samples}
    somatic_ma = {sampid: somatic_ma[sampid] for sampid in samples}
    somatic_ids = {sampid: somatic_ids[sampid] for sampid in samples}
    indel_calls = {sampid: indel_calls[sampid] for sampid in samples}
    indel_snpeff = {sampid: indel_snpeff[sampid] for sampid in samples}
    indel_ma = {sampid: indel_ma[sampid] for sampid in samples}
    indel_ids = {sampid: indel_ids[sampid] for sampid in samples}
    germline_calls = {sampid: germline_calls[sampid] for sampid in samples}
    germline_snpeff = {sampid: germline_snpeff[sampid] for sampid in samples}
    germline_ma = {sampid: germline_ma[sampid] for sampid in samples}
    germline_ids = {sampid: germline_ids[sampid] for sampid in samples}

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    paths_refdir = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx(docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name="mutationseq_paired",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempOutputFile("museq_snv.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_paired_pdf', 'sample_id',
                           fnames=museq_paired_pdf),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={
            'tumour_bam': mgd.InputFile(
                "tumour.bam", 'sample_id', fnames=tumours,
                extensions=['.bai'], axes_origin=[]),
            'normal_bam': mgd.InputFile(
                "normal.bam", 'sample_id', fnames=normals,
                extensions=['.bai'], axes_origin=[]),
            'single_node': single_node,
        })

    workflow.subworkflow(
        name="mutationseq_single",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempOutputFile("museq_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_single_pdf', 'sample_id',
                           fnames=museq_single_pdf),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={
            'tumour_bam': None,
            'normal_bam': mgd.InputFile(
                "normal.bam", 'sample_id', fnames=normals,
                extensions=['.bai'], axes_origin=[]),
            'single_node': single_node,
            'germline_refdata': paths_refdir['germline_portrait_ref'],
            'thousand_genomes': paths_refdir['thousand_genomes'],
            'dbsnp': paths_refdir['dbsnp'],
        })

    workflow.subworkflow(
        name="samtools_germline",
        func='wgs.workflows.samtools_germline.create_samtools_germline_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempOutputFile("samtools_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile("roh_calls.csv.gz", 'sample_id', fnames=roh_calls),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={
            'single_node': single_node,
        })

    workflow.subworkflow(
        name="strelka",
        func='wgs.workflows.strelka.create_strelka_workflow',
        axes=('sample_id',),
        args=(
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals,
                          extensions=['.bai']),
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours,
                          extensions=['.bai']),
            mgd.TempOutputFile('strelka_indel.vcf.gz', 'sample_id'),
            mgd.TempOutputFile('strelka_snv.vcf.gz', 'sample_id'),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={
            'single_node': single_node,
            'is_exome': is_exome
        },
    )

    # note: the snpeff step reuses the vcftools image in all of the
    # annotation subworkflows below
    workflow.subworkflow(
        name="annotate_paired_museq",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("museq_snv.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_snv_ann.vcf.gz', 'sample_id',
                           extensions=['.csi', '.tbi'], fnames=museq_vcf),
            paths_refdir['snpeff_config'],
            paths_refdir['mutation_assessor'],
            paths_refdir['dbsnp'],
            paths_refdir['thousand_genomes'],
            paths_refdir['cosmic'],
            paths_refdir['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_germline_museq",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("museq_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_germlines_ann.vcf.gz', 'sample_id',
                           extensions=['.csi', '.tbi'], fnames=museq_ss_vcf),
            paths_refdir['snpeff_config'],
            paths_refdir['mutation_assessor'],
            paths_refdir['dbsnp'],
            paths_refdir['thousand_genomes'],
            paths_refdir['cosmic'],
            paths_refdir['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_germline_samtools",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("samtools_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile("samtools_germlines_ann.vcf.gz", 'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=samtools_germlines_vcf),
            paths_refdir['snpeff_config'],
            paths_refdir['mutation_assessor'],
            paths_refdir['dbsnp'],
            paths_refdir['thousand_genomes'],
            paths_refdir['cosmic'],
            paths_refdir['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_strelka",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("strelka_snv.vcf.gz", 'sample_id'),
            mgd.OutputFile('strelka_snv_ann.vcf.gz', 'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=strelka_snv_vcf),
            paths_refdir['snpeff_config'],
            paths_refdir['mutation_assessor'],
            paths_refdir['dbsnp'],
            paths_refdir['thousand_genomes'],
            paths_refdir['cosmic'],
            paths_refdir['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_strelka_indel",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("strelka_indel.vcf.gz", 'sample_id'),
            mgd.OutputFile('strelka_indel_ann.vcf.gz', 'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=strelka_indel_vcf),
            paths_refdir['snpeff_config'],
            paths_refdir['mutation_assessor'],
            paths_refdir['dbsnp'],
            paths_refdir['thousand_genomes'],
            paths_refdir['cosmic'],
            paths_refdir['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="consensus_calling",
        func='wgs.workflows.variant_calling_consensus.create_consensus_workflow',
        axes=('sample_id',),
        args=(
            mgd.InputFile("museq_germlines_ann.vcf.gz", 'sample_id',
                          fnames=museq_ss_vcf),
            mgd.InputFile("museq_snv_ann.vcf.gz", 'sample_id',
                          fnames=museq_vcf),
            mgd.InputFile("strelka_snv_ann.vcf.gz", 'sample_id',
                          fnames=strelka_snv_vcf),
            mgd.InputFile("strelka_indel_ann.vcf.gz", 'sample_id',
                          fnames=strelka_indel_vcf),
            mgd.OutputFile('somatic_csv', 'sample_id', fnames=somatic_calls),
            mgd.OutputFile('somatic_snpeff', 'sample_id', fnames=somatic_snpeff),
            mgd.OutputFile('somatic_ma', 'sample_id', fnames=somatic_ma),
            mgd.OutputFile('somatic_ids', 'sample_id', fnames=somatic_ids),
            mgd.OutputFile('indel_csv', 'sample_id', fnames=indel_calls),
            mgd.OutputFile('indel_snpeff', 'sample_id', fnames=indel_snpeff),
            mgd.OutputFile('indel_ma', 'sample_id', fnames=indel_ma),
            mgd.OutputFile('indel_ids', 'sample_id', fnames=indel_ids),
            mgd.OutputFile('germline_csv', 'sample_id', fnames=germline_calls),
            mgd.OutputFile('germline_snpeff', 'sample_id', fnames=germline_snpeff),
            mgd.OutputFile('germline_ma', 'sample_id', fnames=germline_ma),
            mgd.OutputFile('germline_ids', 'sample_id', fnames=germline_ids),
            refdir,
        ),
    )

    return workflow
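
# A minimal sketch of wiring call_variants for one tumour/normal pair
# (hypothetical paths; per-sample outputs are passed as {sample_id: path}
# maps, matching the dict restriction at the top of call_variants):
def example_call_variants(refdir):
    samples = ['SA123']

    def out(name):
        # hypothetical per-sample output map
        return {'SA123': 'out/SA123/{}'.format(name)}

    return call_variants(
        samples,
        out('somatic.csv.gz'), out('somatic_snpeff.csv.gz'),
        out('somatic_ma.csv.gz'), out('somatic_ids.csv.gz'),
        out('indel.csv.gz'), out('indel_snpeff.csv.gz'),
        out('indel_ma.csv.gz'), out('indel_ids.csv.gz'),
        out('germline.csv.gz'), out('germline_snpeff.csv.gz'),
        out('germline_ma.csv.gz'), out('germline_ids.csv.gz'),
        {'SA123': 'bams/SA123_tumour.bam'},
        {'SA123': 'bams/SA123_normal.bam'},
        out('museq.vcf.gz'), out('museq_ss.vcf.gz'),
        out('samtools_germline.vcf.gz'), out('roh.csv.gz'),
        out('strelka_snv.vcf.gz'), out('strelka_indel.vcf.gz'),
        out('museq_paired.pdf'), out('museq_single.pdf'),
        refdir, single_node=False)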
def create_destruct_wgs_workflow(
        tumour_bam, normal_bam, raw_breakpoints, raw_library,
        breakpoints, library, reads,
        sample_id, reference, destruct_refdata, gtf, mappability,
        single_node=False):
    destruct_config = {
        'genome_fasta': reference,
        'genome_fai': reference + '.fai',
        'gtf_filename': gtf
    }

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    workflow.transform(
        name="get_destruct_config",
        func="destruct.defaultconfig.get_config",
        ctx=helpers.get_default_ctx(
            docker_image=config.containers('destruct'),
            walltime="48:00",
        ),
        ret=mgd.TempOutputObj("destruct_config"),
        args=(destruct_refdata, destruct_config))

    if single_node:
        workflow.transform(
            name='destruct_local',
            ctx=helpers.get_default_ctx(walltime='120:00', disk=800),
            func='wgs.workflows.destruct_wgs.tasks.run_destruct_local',
            args=(
                mgd.TempSpace("destruct_local_temp"),
                mgd.InputFile(tumour_bam),
                mgd.InputFile(normal_bam),
                sample_id,
                mgd.TempOutputFile("raw_breakpoints"),
                mgd.TempOutputFile("raw_library"),
                mgd.OutputFile(reads),
                mgd.TempInputObj("destruct_config"),
                destruct_refdata,
            ),
            kwargs={
                'ncpus': 16,
                'docker_image': config.containers('destruct')
            })
    else:
        workflow.subworkflow(
            name='destruct_parallel',
            ctx=helpers.get_default_ctx(
                docker_image=config.containers('destruct'),
                walltime="48:00",
            ),
            # refers to the separate destruct package
            func='destruct.workflow.create_destruct_workflow',
            args=(
                {
                    sample_id: mgd.InputFile(tumour_bam),
                    sample_id + 'N': mgd.InputFile(normal_bam)
                },
                mgd.TempOutputFile("raw_breakpoints"),
                mgd.TempOutputFile("raw_library"),
                mgd.OutputFile(reads),
                mgd.TempInputObj("destruct_config"),
                destruct_refdata,
            ))

    workflow.commandline(
        name='filter_annotate_breakpoints',
        ctx=helpers.get_default_ctx(
            docker_image=config.containers('destruct'),
            memory=8,
            walltime='8:00'),
        args=(
            'filter_annotate_breakpoints.py',
            '--breakpoints', mgd.TempInputFile("raw_breakpoints"),
            '--library', mgd.TempInputFile("raw_library"),
            '--control_ids', sample_id + 'N',
            '--out_breakpoints', mgd.TempOutputFile("filter_annotate_breakpoints_output"),
            '--out_library', mgd.TempOutputFile("library"),
        ))

    workflow.transform(
        name='mappability_annotate_breakpoints',
        ctx=helpers.get_default_ctx(memory=8, walltime='8:00'),
        func='wgs.workflows.destruct_wgs.flag_mappability.main',
        args=(
            mgd.TempInputFile("filter_annotate_breakpoints_output"),
            mgd.TempOutputFile("breakpoints"),
            mappability,
        ))

    workflow.transform(
        name='finalize_raw_breakpoints',
        ctx=helpers.get_default_ctx(memory=8, walltime='8:00'),
        func="wgs.utils.csvutils.finalize_csv",
        args=(
            mgd.TempInputFile("raw_breakpoints"),
            mgd.OutputFile(raw_breakpoints, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='finalize_raw_library',
        ctx=helpers.get_default_ctx(memory=8, walltime='8:00'),
        func="wgs.utils.csvutils.finalize_csv",
        args=(
            mgd.TempInputFile("raw_library"),
            mgd.OutputFile(raw_library, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='finalize_breakpoints',
        ctx=helpers.get_default_ctx(memory=8, walltime='8:00'),
        func="wgs.utils.csvutils.finalize_csv",
        args=(
            mgd.TempInputFile("breakpoints"),
            mgd.OutputFile(breakpoints, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='finalize_library',
        ctx=helpers.get_default_ctx(memory=8, walltime='8:00'),
        func="wgs.utils.csvutils.finalize_csv",
        args=(
            mgd.TempInputFile("library"),
            mgd.OutputFile(library, extensions=['.yaml']),
        ))

    return workflow
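
# A minimal sketch of invoking create_destruct_wgs_workflow (hypothetical
# paths; the refdir_paths keys used here are illustrative, not the module's
# actual reference-directory schema):
def example_destruct(refdir_paths):
    return create_destruct_wgs_workflow(
        'bams/SA123_tumour.bam', 'bams/SA123_normal.bam',
        'out/raw_breakpoints.csv.gz', 'out/raw_library.csv.gz',
        'out/breakpoints.csv.gz', 'out/library.csv.gz', 'out/reads.csv.gz',
        'SA123',
        refdir_paths['reference'], refdir_paths['destruct_ref'],
        refdir_paths['gtf'], refdir_paths['mappability'],
        single_node=False)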
def call_germlines_only(
        samples, normals, museq_ss_vcf, samtools_germline_vcf,
        roh_calls, museq_single_pdf, refdir, single_node=False):
    # restrict the per-sample filename maps to the requested samples
    museq_ss_vcf = {sampid: museq_ss_vcf[sampid] for sampid in samples}
    museq_single_pdf = {sampid: museq_single_pdf[sampid] for sampid in samples}
    samtools_germline_vcf = {sampid: samtools_germline_vcf[sampid] for sampid in samples}
    roh_calls = {sampid: roh_calls[sampid] for sampid in samples}

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    paths_refdir = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx(docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name="mutationseq_single",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempOutputFile("museq_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_single_pdf', 'sample_id',
                           fnames=museq_single_pdf),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={
            'tumour_bam': None,
            'normal_bam': mgd.InputFile(
                "normal.bam", 'sample_id', fnames=normals,
                extensions=['.bai'], axes_origin=[]),
            'single_node': single_node,
            'germline_refdata': paths_refdir['germline_portrait_ref'],
            'thousand_genomes': paths_refdir['thousand_genomes'],
            'dbsnp': paths_refdir['dbsnp'],
        })

    workflow.subworkflow(
        name="samtools_germline",
        func='wgs.workflows.samtools_germline.create_samtools_germline_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempOutputFile("samtools_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile("roh_calls.csv", 'sample_id', fnames=roh_calls),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={
            'single_node': single_node,
        })

    workflow.subworkflow(
        name="annotate_germline_museq",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("museq_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_germlines_ann.vcf.gz', 'sample_id',
                           extensions=['.csi', '.tbi'], fnames=museq_ss_vcf),
            paths_refdir['snpeff_config'],
            paths_refdir['mutation_assessor'],
            paths_refdir['dbsnp'],
            paths_refdir['thousand_genomes'],
            paths_refdir['cosmic'],
            paths_refdir['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_germline_samtools",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile("samtools_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile("samtools_germlines_ann.vcf.gz", 'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=samtools_germline_vcf),
            paths_refdir['snpeff_config'],
            paths_refdir['mutation_assessor'],
            paths_refdir['dbsnp'],
            paths_refdir['thousand_genomes'],
            paths_refdir['cosmic'],
            paths_refdir['blacklist'],
        ),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    return workflow
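
# A minimal sketch of the germline-only entry point (hypothetical paths;
# mirrors example_call_variants above but without tumour inputs):
def example_germlines_only(refdir):
    samples = ['SA123']
    return call_germlines_only(
        samples,
        {'SA123': 'bams/SA123_normal.bam'},
        {'SA123': 'out/SA123/museq_germline.vcf.gz'},
        {'SA123': 'out/SA123/samtools_germline.vcf.gz'},
        {'SA123': 'out/SA123/roh.csv.gz'},
        {'SA123': 'out/SA123/museq_single.pdf'},
        refdir, single_node=False)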
def create_hmmcopy_workflow(
        bam_file, sample_id, bias_pdf, correction_pdf, hmmcopy_pdf,
        hmmcopy_table, pygenes_table, chromosomes, map_wig, gc_wig,
        pygenes_gtf):
    cn_params = config.default_params()['copynumber_calling']

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='hmmcopy_readcounter',
        ctx=helpers.get_default_ctx(memory=5, walltime='2:00'),
        func='wgs.workflows.hmmcopy.tasks.hmmcopy_readcounter',
        args=(
            mgd.InputFile(bam_file, extensions=['.bai']),
            mgd.TempOutputFile('infile.wig'),
            chromosomes,
            cn_params['readcounter'],
        ))

    workflow.transform(
        name='calc_corr',
        func='wgs.workflows.hmmcopy.tasks.calc_corr',
        args=(
            mgd.TempInputFile('infile.wig'),
            mgd.TempOutputFile('infile_copy.txt'),
            mgd.TempOutputFile('infile_copy.obj'),
            gc_wig,
            map_wig,
            cn_params['map_cutoff'],
        ),
    )

    workflow.transform(
        name='run_hmmcopy',
        func='wgs.workflows.hmmcopy.tasks.run_hmmcopy',
        args=(
            mgd.TempInputFile('infile_copy.obj'),
            mgd.TempInputFile('infile_copy.txt'),
            mgd.TempOutputFile('hmmcopy_res.obj'),
            mgd.TempOutputFile('hmmcopy_segments.txt'),
            mgd.OutputFile(hmmcopy_table),
            sample_id,
            cn_params['hmmcopy_params'],
        ),
    )

    workflow.transform(
        name='plot_hmm',
        func='wgs.workflows.hmmcopy.tasks.plot_hmm',
        args=(
            mgd.TempInputFile('infile_copy.obj'),
            mgd.TempInputFile('hmmcopy_res.obj'),
            mgd.TempSpace('correction_plots_dir'),
            mgd.TempSpace('hmmcopy_plots_dir'),
            mgd.OutputFile(bias_pdf),
            mgd.OutputFile(correction_pdf),
            mgd.OutputFile(hmmcopy_pdf),
        ),
    )

    workflow.transform(
        name='annot_hmm',
        func='wgs.workflows.hmmcopy.tasks.annot_hmm',
        args=(
            mgd.TempInputFile('hmmcopy_segments.txt'),
            mgd.OutputFile(pygenes_table),
            pygenes_gtf,
        ))

    return workflow
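
# A minimal sketch of building the HMMcopy workflow for one bam (hypothetical
# paths; the refdir_paths keys for the gc/map wig files and the pygenes gtf
# are illustrative):
def example_hmmcopy(refdir_paths, chromosomes):
    return create_hmmcopy_workflow(
        'bams/SA123_tumour.bam', 'SA123',
        'out/bias.pdf', 'out/correction.pdf', 'out/hmmcopy.pdf',
        'out/hmmcopy_table.csv', 'out/pygenes.tsv',
        chromosomes,
        refdir_paths['map_wig'], refdir_paths['gc_wig'],
        refdir_paths['pygenes_gtf'],
    )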
def create_mutect_workflow(
        normal_bam, tumour_bam, snv_vcf, snv_maf,
        reference, reference_vep, chromosomes,
        normal_id, tumour_id, single_node=None):
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutect.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(memory=5, walltime='1:00'),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name='mutect_one_node',
            ctx=helpers.get_default_ctx(
                memory=15, walltime='48:00', ncpus=8, disk=600),
            func='wgs.workflows.mutect.tasks.run_mutect_one_job',
            args=(
                mgd.TempSpace("run_mutect_temp"),
                mgd.TempOutputFile('merged.vcf'),
                reference,
                mgd.InputChunks('interval'),
                mgd.InputFile(normal_bam),
                mgd.InputFile(tumour_bam)),
        )
    else:
        workflow.transform(
            name='mutect_caller',
            ctx=helpers.get_default_ctx(memory=15, walltime='24:00'),
            axes=('interval',),
            func='wgs.workflows.mutect.tasks.run_mutect',
            args=(
                mgd.TempOutputFile('mutect.vcf', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                mgd.InputFile(normal_bam),
                mgd.InputFile(tumour_bam),
                mgd.TempSpace('mutect_temp', 'interval')),
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
            func='wgs.workflows.mutect.tasks.merge_vcfs',
            args=(
                mgd.TempInputFile('mutect.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )

    workflow.transform(
        name='bcftools_normalize',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.TempOutputFile('normalized.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(
        name="mutect_snv_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_maf),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id
        })

    return workflow
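
# A minimal sketch of running the mutect workflow end to end under a
# pypeliner app context (hypothetical args and paths, patterned on the
# pyp.run() usage elsewhere in this codebase):
def example_run_mutect(args, refdir_paths, chromosomes):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = create_mutect_workflow(
        'bams/SA123_normal.bam', 'bams/SA123_tumour.bam',
        'out/mutect.vcf.gz', 'out/mutect.maf',
        refdir_paths['reference'], refdir_paths['vep'],
        chromosomes, 'SA123N', 'SA123',
        single_node=args['single_node'])
    pyp.run(workflow)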
def realign_bam_files(
        inputs, outputs, metrics_output, metrics_tar, refdir, samples,
        single_node=False, ignore_bamtofastq_exception=False, picard_mem=8):
    # restrict the filename maps to the requested samples
    inputs = {sample: inputs[sample] for sample in samples}
    outputs = {sample: outputs[sample] for sample in samples}
    outputs_tdf = {sample: outputs[sample] + '.tdf' for sample in samples}
    metrics_output = {sample: metrics_output[sample] for sample in samples}
    metrics_tar = {sample: metrics_tar[sample] for sample in samples}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    # fan out one fastq pair per read group; this creates the 'readgroup'
    # axis consumed by align_samples below
    workflow.transform(
        name='bam_to_fastq',
        ctx=helpers.get_default_ctx(walltime='96:00', disk=500),
        func="wgs.workflows.realignment.tasks.split_by_rg",
        axes=('sample_id',),
        args=(
            mgd.InputFile('input.bam', 'sample_id', fnames=inputs),
            mgd.TempOutputFile("inputdata_read1.fastq.gz", 'sample_id', "readgroup"),
            mgd.TempOutputFile("inputdata_read2.fastq.gz", 'sample_id', "readgroup",
                               axes_origin=[]),
            mgd.TempSpace("bamtofastq", 'sample_id'),
            ignore_bamtofastq_exception))

    workflow.transform(
        name='get_sample_info',
        func="wgs.workflows.realignment.tasks.get_read_group",
        axes=('sample_id',),
        ret=mgd.TempOutputObj('sample_info', 'sample_id'),
        args=(mgd.InputFile('input.bam', 'sample_id', fnames=inputs),))

    workflow.subworkflow(
        name='align_samples',
        func=alignment.align_samples,
        args=(
            mgd.TempInputFile("inputdata_read1.fastq.gz", "sample_id", "readgroup",
                              axes_origin=[]),
            mgd.TempInputFile("inputdata_read2.fastq.gz", "sample_id", "readgroup",
                              axes_origin=[]),
            mgd.OutputFile('output.bam', 'sample_id', fnames=outputs,
                           extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('output_metrics.csv', 'sample_id', fnames=metrics_output,
                           extensions=['.yaml'], axes_origin=[]),
            mgd.OutputFile('output_metrics.tar', 'sample_id', fnames=metrics_tar,
                           axes_origin=[]),
            mgd.OutputFile('output.bam.tdf', 'sample_id', fnames=outputs_tdf,
                           axes_origin=[]),
            mgd.TempInputObj('sample_info', 'sample_id', axes_origin=[]),
            refdir),
        kwargs={
            'single_node': single_node,
            'picard_mem': picard_mem
        })

    return workflow
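
# A minimal sketch of realigning one sample (hypothetical paths; outputs
# follow the {sample: path} convention used throughout this module):
def example_realign(refdir):
    samples = ['SA123']
    return realign_bam_files(
        {'SA123': 'bams/SA123.bam'},
        {'SA123': 'out/SA123/SA123.bam'},
        {'SA123': 'out/SA123/SA123_metrics.csv'},
        {'SA123': 'out/SA123/SA123_metrics.tar'},
        refdir, samples,
        single_node=False, picard_mem=8)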
def create_remixt_workflow(
        tumour_path, normal_path, breakpoints, sample_id,
        remixt_results_filename, remixt_brk_cn_csv, remixt_cn_csv,
        remixt_minor_modes_csv, remixt_mix_csv, remixt_read_depth_csv,
        remixt_stats_csv, remixt_refdata, reference,
        single_node=False):
    ctx = {'docker_image': config.containers('wgs')}
    params = config.default_params('copynumber_calling')['remixt']

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    remixt_config = {
        'genome_fasta': reference,
        'genome_fai': reference + '.fai',
    }

    if breakpoints is None:
        workflow.setobj(
            obj=mgd.TempOutputObj('emptybreakpoints'),
            value=[],
        )

        workflow.transform(
            name='write_empty_breakpoints',
            func='wgs.workflows.remixt.tasks.write_empty_breakpoints',
            args=(
                mgd.TempInputObj('emptybreakpoints'),
                mgd.TempOutputFile('filtered_breakpoints.csv'),
            ),
        )
    else:
        workflow.transform(
            name='filter_breakpoints',
            func='wgs.workflows.remixt.tasks.filter_destruct_breakpoints',
            ctx=helpers.get_default_ctx(memory=4, walltime='4:00'),
            args=(
                mgd.InputFile(breakpoints),
                mgd.TempOutputFile('filtered_breakpoints.csv'),
                params['min_num_reads']))

    if single_node:
        workflow.transform(
            name='remixt',
            func='wgs.workflows.remixt.tasks.run_remixt_local',
            ctx=helpers.get_default_ctx(memory=15, walltime='120:00', ncpus=8),
            args=(
                mgd.TempSpace("remixt_temp"),
                mgd.TempInputFile('filtered_breakpoints.csv'),
                mgd.InputFile(tumour_path, extensions=['.bai']),
                mgd.InputFile(normal_path, extensions=['.bai']),
                sample_id,
                mgd.OutputFile(remixt_results_filename),
                mgd.TempSpace('remixt_raw_dir'),
                remixt_config,
                remixt_refdata,
            ),
        )
    else:
        workflow.subworkflow(
            name='remixt',
            func="remixt.workflow.create_remixt_bam_workflow",
            ctx={
                'docker_image': config.containers('remixt'),
                'walltime': '48:00'
            },
            args=(
                mgd.TempInputFile('filtered_breakpoints.csv'),
                {
                    sample_id: mgd.InputFile(tumour_path, extensions=['.bai']),
                    sample_id + 'N': mgd.InputFile(normal_path, extensions=['.bai'])
                },
                {sample_id: mgd.OutputFile(remixt_results_filename)},
                mgd.TempSpace('remixt_raw_dir'),
                remixt_config,
                remixt_refdata,
            ),
            kwargs={
                'normal_id': sample_id + 'N',
            })

    workflow.transform(
        name='parse_remixt',
        func='wgs.workflows.remixt.tasks.parse_remixt_file',
        args=(
            mgd.InputFile(remixt_results_filename),
            [
                mgd.OutputFile(remixt_brk_cn_csv, extensions=['.yaml']),
                mgd.OutputFile(remixt_cn_csv, extensions=['.yaml']),
                mgd.OutputFile(remixt_minor_modes_csv, extensions=['.yaml']),
                mgd.OutputFile(remixt_mix_csv, extensions=['.yaml']),
                mgd.OutputFile(remixt_read_depth_csv, extensions=['.yaml']),
                mgd.OutputFile(remixt_stats_csv, extensions=['.yaml']),
            ],
            ['/brk_cn', '/cn', '/minor_modes', '/mix', '/read_depth', '/stats'],
            mgd.TempSpace('tempdir_parse')))

    return workflow
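
# A minimal sketch of the remixt workflow with no breakpoints supplied
# (hypothetical paths; passing breakpoints=None exercises the
# write_empty_breakpoints branch above):
def example_remixt_no_breakpoints(refdir_paths):
    return create_remixt_workflow(
        'bams/SA123_tumour.bam', 'bams/SA123_normal.bam',
        None, 'SA123',
        'out/remixt_results.h5',
        'out/remixt_brk_cn.csv.gz', 'out/remixt_cn.csv.gz',
        'out/remixt_minor_modes.csv.gz', 'out/remixt_mix.csv.gz',
        'out/remixt_read_depth.csv.gz', 'out/remixt_stats.csv.gz',
        refdir_paths['remixt_ref'], refdir_paths['reference'],
        single_node=False)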
def create_sample_qc_workflow(
        sample_id, refdir, normal_bam, tumour_bam, titan, remixt,
        breakpoints_consensus, roh, germline_calls, somatic_calls,
        genome_wide_plot, normal_coverage, tumour_coverage,
        chromosomes, bins, mapping_qual_threshold,
        single_node=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.subworkflow(
        name='coverage_normal_data',
        func=get_coverage_data,
        args=(
            mgd.InputFile(normal_bam),
            mgd.OutputFile(normal_coverage),
            refdir,
            chromosomes,
            mapping_qual_threshold,
            bins,
        ),
        kwargs={'single_node': single_node}
    )

    workflow.subworkflow(
        name='coverage_tumour_data',
        func=get_coverage_data,
        args=(
            mgd.InputFile(tumour_bam),
            mgd.OutputFile(tumour_coverage),
            refdir,
            chromosomes,
            mapping_qual_threshold,
            bins,
        ),
        kwargs={'single_node': single_node}
    )

    workflow.transform(
        name='generate_genome_wide_plot',
        ctx=helpers.get_default_ctx(memory=10),
        func="wgs.workflows.sample_qc.tasks.genome_wide",
        args=(
            sample_id,
            mgd.InputFile(roh),
            mgd.InputFile(germline_calls),
            mgd.InputFile(normal_coverage),
            chromosomes,
            mgd.OutputFile(genome_wide_plot),
        ),
        kwargs={
            "titan": mgd.InputFile(titan),
            "somatic": mgd.InputFile(somatic_calls),
            "remixt": mgd.InputFile(remixt),
            "tumour": mgd.InputFile(tumour_coverage),
            "breakpoints": mgd.InputFile(breakpoints_consensus),
        }
    )

    return workflow
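
# A minimal sketch of the per-sample QC workflow (hypothetical paths; the
# coverage bin count and mapping-quality threshold values are illustrative):
def example_sample_qc(refdir, chromosomes):
    return create_sample_qc_workflow(
        'SA123', refdir,
        'bams/SA123_normal.bam', 'bams/SA123_tumour.bam',
        'out/titan_segments.csv.gz', 'out/remixt_cn.csv.gz',
        'out/breakpoints_consensus.csv.gz', 'out/roh.csv.gz',
        'out/germline.csv.gz', 'out/somatic.csv.gz',
        'out/genome_wide.pdf',
        'out/normal_coverage.csv.gz', 'out/tumour_coverage.csv.gz',
        chromosomes, bins=2000, mapping_qual_threshold=20,
        single_node=False)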