def create_mutect_workflow(
        normal_bam, tumour_bam, snv_vcf, snv_maf, reference, reference_vep,
        chromosomes, normal_id, tumour_id, single_node=None):
    """Build a pypeliner workflow that calls SNVs with Mutect.

    The genome is split into intervals; Mutect runs per interval (or as a
    single multi-core job when ``single_node`` is truthy), the per-interval
    VCFs are merged, normalized with bcftools, finalised/indexed, and then
    annotated into a MAF via the vcf2maf subworkflow.

    :param normal_bam: path to the normal bam
    :param tumour_bam: path to the tumour bam
    :param snv_vcf: output path for the finalised, indexed SNV vcf
    :param snv_maf: output path for the annotated maf
    :param reference: reference genome fasta
    :param reference_vep: VEP reference data for vcf2maf
    :param chromosomes: chromosomes to generate calling intervals over
    :param normal_id: normal sample id recorded in the maf
    :param tumour_id: tumour sample id recorded in the maf
    :param single_node: if truthy, run all intervals in one 8-cpu job
    :return: the constructed pypeliner workflow
    """
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutect.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name='mutect_one_node',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='48:00',
                ncpus=8,
                disk=600),
            func='wgs.workflows.mutect.tasks.run_mutect_one_job',
            args=(
                mgd.TempSpace("run_mutect_temp"),
                mgd.TempOutputFile('merged.vcf'),
                reference,
                mgd.InputChunks('interval'),
                mgd.InputFile(normal_bam),
                mgd.InputFile(tumour_bam)),
        )
    else:
        workflow.transform(
            name='mutect_caller',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval',),
            func='wgs.workflows.mutect.tasks.run_mutect',
            args=(
                mgd.TempOutputFile('mutect.vcf', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                mgd.InputFile(normal_bam),
                mgd.InputFile(tumour_bam),
                mgd.TempSpace('mutect_temp', 'interval')),
        )

        # merge the per-interval calls into the same 'merged.vcf' temp file
        # that the single-node branch produces directly
        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.workflows.mutect.tasks.merge_vcfs',
            args=(
                mgd.TempInputFile('mutect.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )

    workflow.transform(
        name='bcftools_normalize',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.TempOutputFile('normalized.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    # renamed from 'strelka_indel_maf' (copy-paste from the strelka
    # workflow); this node annotates the mutect SNV vcf
    workflow.subworkflow(
        name="mutect_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_maf),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id
        })

    return workflow
def create_titan_workflow(
        tumour_bam, normal_bam, targets, outfile, params, segs, igv_segs,
        parsed, plots, tar_outputs, museq_vcf, sample_id, reference,
        chromosomes, het_positions, map_wig, gc_wig, pygenes_gtf,
        single_node=None
):
    """Build a pypeliner workflow that runs TITAN copy-number calling.

    Pipeline shape: call heterozygous positions with mutationseq, convert
    the vcf to allele counts, compute tumour/normal read-depth wigs,
    GC/mappability-correct them, run TITAN once per (num_clusters, ploidy)
    combination from the config, plot/segment/annotate/parse each solution,
    then select the optimal solution and tar all per-solution data.

    :param tumour_bam: tumour bam path (indexed; '.bai' expected)
    :param normal_bam: normal bam path (indexed; '.bai' expected)
    :param targets: optional targets file; wrapped as a managed input if set
    :param outfile: output path for the selected TITAN output file
    :param params: output path for the selected TITAN params file
    :param segs: output path for the selected segments csv
    :param igv_segs: output path for the selected IGV-format segments
    :param parsed: output path for the selected parsed csv
    :param plots: output path for the selected solution's plots
    :param tar_outputs: output tar of all per-solution data
    :param museq_vcf: output path for the merged mutationseq vcf
    :param sample_id: sample id passed through to TITAN tasks
    :param reference: reference genome fasta
    :param chromosomes: chromosomes to operate over
    :param het_positions: known heterozygous positions used for counts
    :param map_wig: mappability wig file
    :param gc_wig: gc-content wig file
    :param pygenes_gtf: gtf used to gene-annotate segments
    :param single_node: if truthy, run museq as one multi-core job
    :return: the constructed pypeliner workflow
    """
    cn_params = config.default_params('copynumber_calling')

    # one axis chunk per (num_clusters, ploidy) solution to evaluate
    chunks = [(v['num_clusters'], v['ploidy'])
              for v in cn_params['titan_intervals']]

    targets = mgd.InputFile(targets) if targets else None

    ctx = {'docker_image': config.containers('wgs')}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('numclusters', 'ploidy'),
        value=chunks,
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.titan.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='2:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(
            reference,
            chromosomes,
        ),
        kwargs={'size': cn_params['split_size']}
    )

    if single_node:
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='96:00',
                ncpus=8),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.OutputFile(museq_vcf),
                reference,
                mgd.InputChunks('interval'),
                cn_params['museq_params'],
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'museq_docker_image': config.containers('mutationseq'),
                'vcftools_docker_image': config.containers('vcftools')
            }
        )
    else:
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00'),
            axes=('interval',),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                cn_params['museq_params']
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'docker_image': config.containers('mutationseq')
            }
        )

        # merge per-interval calls into the museq_vcf that the single-node
        # branch writes directly
        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='4:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.OutputFile(museq_vcf),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')}
        )

    workflow.transform(
        name='convert_museq_vcf2counts',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00',
        ),
        func='wgs.workflows.titan.tasks.convert_museq_vcf2counts',
        args=(
            mgd.InputFile(museq_vcf),
            mgd.TempOutputFile('museq_postprocess.txt'),
            het_positions,
        ),
    )

    workflow.transform(
        name='run_readcounter_tumour',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(tumour_bam, extensions=['.bai']),
            mgd.TempOutputFile('tumour.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    workflow.transform(
        name='run_readcounter_normal',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('normal.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    workflow.transform(
        name='calc_correctreads_wig',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00',
        ),
        func='wgs.workflows.titan.tasks.calc_correctreads_wig',
        args=(
            mgd.TempInputFile('tumour.wig'),
            mgd.TempInputFile('normal.wig'),
            targets,
            mgd.TempOutputFile('correct_reads.txt'),
            gc_wig,
            map_wig,
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    # NOTE(review): ncpus and 'threads' are passed as the string '8' here,
    # while other transforms use int ncpus — confirm the scheduler and the
    # titan task both accept strings.
    workflow.transform(
        name='run_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='72:00',
            ncpus='8'),
        func='wgs.workflows.titan.tasks.run_titan',
        args=(
            mgd.TempInputFile('museq_postprocess.txt'),
            mgd.TempInputFile('correct_reads.txt'),
            mgd.TempOutputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_params', 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy'),
            sample_id,
            map_wig,
            cn_params['titan_params'],
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan'), 'threads': '8'}
    )

    workflow.transform(
        name='plot_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
        ),
        func='wgs.workflows.titan.tasks.plot_titan',
        args=(
            mgd.TempInputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_plots', 'numclusters', 'ploidy'),
            mgd.TempSpace("titan_plots_tempdir", 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy')
        ),
        kwargs={
            'chromosomes': chromosomes,
            'docker_image': config.containers('titan'),
        },
    )

    workflow.transform(
        name='calc_cnsegments_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00',
        ),
        func='wgs.workflows.titan.tasks.calc_cnsegments_titan',
        args=(
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('segs.csv', 'numclusters', 'ploidy'),
            sample_id,
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    workflow.transform(
        name='annot_pygenes',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00',
        ),
        func='wgs.workflows.titan.tasks.annot_pygenes',
        args=(
            mgd.TempInputFile('segs.csv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            pygenes_gtf,
        ),
    )

    workflow.transform(
        name='parse_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00',
        ),
        func='wgs.workflows.titan.tasks.parse_titan_data',
        args=(
            mgd.TempInputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_parsed.csv', 'numclusters', 'ploidy'),
        ),
    )

    # select optimal solution
    # NOTE(review): 'titan_igv' is the only axis input here without
    # axes_origin=[], unlike its siblings — confirm this is intentional.
    workflow.transform(
        name="select_optimal_solution",
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00',
        ),
        func="wgs.workflows.titan.tasks.select_optimal_solution",
        args=(
            chunks,
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(segs, extensions=['.yaml']),
            mgd.OutputFile(igv_segs, extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
            mgd.OutputFile(outfile, extensions=['.yaml']),
            mgd.OutputFile(parsed, extensions=['.yaml']),
            mgd.OutputFile(plots),
        )
    )

    workflow.transform(
        name='tar_all_data',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00',
        ),
        func="wgs.workflows.titan.tasks.tar_all_data",
        args=(
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(tar_outputs),
            mgd.TempSpace("titan_all_parameters_data"),
            chunks
        )
    )

    return workflow
def alignment_workflow(args):
    """Build the single-cell alignment workflow from parsed CLI args.

    Reads per-(cell_id, lane) fastq pairs from the input yaml, runs the
    alignment subworkflow to produce per-cell genome and MT bams plus
    metrics csvs/plots, then writes metadata yamls for both the results
    directory and the bams directory.

    :param args: dict of command-line options; reads 'library_id',
        'out_dir', 'bams_dir', 'trim', 'sequencing_center', 'input_yaml'
    :return: the constructed pypeliner workflow
    """
    config = inpututils.load_config(args)
    config = config['alignment']

    lib = args["library_id"]
    alignment_dir = args["out_dir"]
    bams_dir = args["bams_dir"]

    trim = args['trim']
    center = args['sequencing_center']

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])
    cellids = inpututils.get_samples(args['input_yaml'])
    fastq1_files, fastq2_files = inpututils.get_fastqs(args['input_yaml'])

    alignment_files = get_output_files(alignment_dir, lib)
    alignment_meta = os.path.join(alignment_dir, 'metadata.yaml')

    # per-cell bam outputs are templated on the cell_id axis
    bam_files_template = os.path.join(bams_dir, '{cell_id}.bam')
    mt_bam_files_template = os.path.join(bams_dir, '{cell_id}_MT.bam')
    bams_meta = os.path.join(bams_dir, 'metadata.yaml')

    # fastq keys are (cell_id, lane) tuples
    lanes = sorted(set([v[1] for v in fastq1_files.keys()]))
    cells = sorted(set([v[0] for v in fastq1_files.keys()]))

    input_yaml_blob = os.path.join(alignment_dir, 'input.yaml')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq1_files.keys()),
    )

    workflow.subworkflow(
        name='alignment_workflow',
        func=align.create_alignment_workflow,
        args=(
            mgd.InputFile('fastq_1', 'cell_id', 'lane', fnames=fastq1_files,
                          axes_origin=[]),
            mgd.InputFile('fastq_2', 'cell_id', 'lane', fnames=fastq2_files,
                          axes_origin=[]),
            mgd.OutputFile('bam_markdups', 'cell_id',
                           template=bam_files_template,
                           axes_origin=[], extensions=['.bai']),
            mgd.OutputFile('mt_bam_markdups', 'cell_id',
                           template=mt_bam_files_template,
                           axes_origin=[], extensions=['.bai']),
            mgd.OutputFile(alignment_files['alignment_metrics_csv']),
            mgd.OutputFile(alignment_files['gc_metrics_csv']),
            mgd.OutputFile(alignment_files['fastqc_metrics_csv']),
            mgd.OutputFile(alignment_files['plot_metrics_output']),
            config['ref_genome'],
            config,
            sampleinfo,
            cellids,
            mgd.OutputFile(alignment_files['alignment_metrics_tar']),
            lib,
            trim,
            center,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:],
              alignment_dir,
              list(alignment_files.values()),
              mgd.OutputFile(alignment_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'alignment'
            }
        })

    workflow.transform(
        name='generate_meta_files_bams',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:],
              bams_dir,
              mgd.Template('aligned.bam', 'cell_id',
                           template=bam_files_template),
              mgd.OutputFile(bams_meta)),
        kwargs={
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'cellbams'
            },
            'template': (mgd.InputChunks('cell_id'), bam_files_template,
                         'cell_id'),
        })

    return workflow
def create_museq_workflow(
        snv_vcf, museqportrait_pdf, reference, chromosomes,
        thousand_genomes=None, dbsnp=None, germline_refdata=None,
        tumour_bam=None, normal_bam=None, single_node=None):
    """Build a pypeliner workflow that calls variants with mutationseq.

    Runs museq per interval (or as one multi-core job with
    ``single_node``), merges and finalises the vcf, then renders the
    museqportrait report. Either or both bams may be supplied; the
    transform name encodes which inputs are present.

    :param snv_vcf: output path for the finalised, indexed vcf
    :param museqportrait_pdf: output path for the portrait pdf
    :param reference: reference genome fasta
    :param chromosomes: chromosomes to generate calling intervals over
    :param thousand_genomes: optional 1000 genomes reference for portrait
    :param dbsnp: optional dbsnp reference for portrait
    :param germline_refdata: optional germline reference data for portrait
    :param tumour_bam: optional tumour bam path
    :param normal_bam: optional normal bam path
    :param single_node: if truthy, run all intervals in one 8-cpu job
    :return: the constructed pypeliner workflow
    """
    name = 'run_museq'
    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam, extensions=['.bai'])
        name += '_tumour'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam, extensions=['.bai'])
        name += '_normal'
    # single-sample mode unless both tumour and normal were provided
    # (was: `False if name == ... else True` — same value, direct boolean)
    single = name != 'run_museq_tumour_normal'

    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name=name,
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='48:00',
                ncpus='8',
                disk=600),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.TempOutputFile('merged.vcf'),
                reference,
                mgd.InputChunks('interval'),
                params['museq_params'],
            ),
            kwargs={
                'tumour_bam': tumour_bam,
                'normal_bam': normal_bam,
                'museq_docker_image': config.containers('mutationseq'),
                'vcftools_docker_image': config.containers('vcftools')
            })
    else:
        workflow.transform(
            name=name,
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval',),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                params['museq_params'],
            ),
            kwargs={
                'tumour_bam': tumour_bam,
                'normal_bam': normal_bam,
                'docker_image': config.containers('mutationseq'),
            })

        # merge per-interval calls into the same 'merged.vcf' temp file
        # that the single-node branch produces directly
        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='run_museqportrait',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='8:00',
        ),
        func='wgs.workflows.mutationseq.tasks.run_museqportrait',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(museqportrait_pdf),
            mgd.TempOutputFile('museqportrait.txt'),
            mgd.TempOutputFile('museqportrait.log'),
            single,
        ),
        kwargs={
            'docker_image': config.containers('mutationseq'),
            'thousand_genomes': thousand_genomes,
            'dbsnp': dbsnp,
            'germline_refdata': germline_refdata,
            'germline_plot_threshold': params['germline_portrait_threshold']
        })

    return workflow
def create_strelka_workflow(
        normal_bam_file, tumour_bam_file, snv_vcf_file, snv_maf_file,
        indel_vcf_file, indel_maf_file, reference, reference_vep,
        chromosomes, normal_id, tumour_id, single_node=False,
        is_exome=False):
    """Build a pypeliner workflow that calls SNVs/indels with Strelka.

    Counts reference bases, estimates chromosome depths, calls per-region
    genome segments (or runs everything in one job with ``single_node``),
    merges, normalizes, finalises and filters the SNV and indel vcfs, and
    annotates each into a MAF via the vcf2maf subworkflow.

    Fix: the multi-node ``call_genome_segment`` transform previously
    hard-coded ``'is_exome': False``, silently ignoring the ``is_exome``
    argument that the single-node branch honors; it now passes ``is_exome``.

    :param normal_bam_file: normal bam path (indexed)
    :param tumour_bam_file: tumour bam path (indexed)
    :param snv_vcf_file: output path for the filtered SNV vcf
    :param snv_maf_file: output path for the SNV maf
    :param indel_vcf_file: output path for the filtered indel vcf
    :param indel_maf_file: output path for the indel maf
    :param reference: reference genome fasta
    :param reference_vep: VEP reference data for vcf2maf
    :param chromosomes: chromosomes to generate calling regions over
    :param normal_id: normal sample id recorded in the mafs
    :param tumour_id: tumour sample id recorded in the mafs
    :param single_node: if True, run all regions in a single job
    :param is_exome: passed through to strelka's exome mode
    :return: the constructed pypeliner workflow
    """
    params = config.default_params('variant_calling')

    workflow = Workflow(
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    workflow.transform(
        name='count_fasta_bases',
        func="wgs.workflows.strelka.tasks.count_fasta_bases",
        args=(
            reference,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        func="wgs.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    if single_node:
        workflow.transform(
            name='strelka_one_node',
            func="wgs.workflows.strelka.tasks.strelka_one_node",
            args=(
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf.gz',
                                   extensions=['.tbi', '.csi']),
                mgd.TempOutputFile('snvs.vcf.gz',
                                   extensions=['.tbi', '.csi']),
                mgd.TempSpace('call_genome_segment_tmp'),
                mgd.InputChunks('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': is_exome,
            })
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions',),
            func="wgs.workflows.strelka.tasks.get_chromosome_depth",
            args=(
                mgd.InputInstance('regions'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func="wgs.workflows.strelka.tasks.merge_chromosome_depths",
            args=(mgd.TempInputFile('chrom_depth.txt', 'regions',
                                    axes_origin=[]),
                  mgd.TempOutputFile('merged_chrom_depth.txt')))

        workflow.transform(
            name='call_genome_segment',
            axes=('regions',),
            func="wgs.workflows.strelka.tasks.call_genome_segment",
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                # BUGFIX: was hard-coded to False, ignoring the is_exome arg
                'is_exome': is_exome,
            })

        workflow.transform(
            name='merge_indels',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('indels.vcf', 'regions'),
                  mgd.TempOutputFile('indels.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("indels_merge")),
        )

        workflow.transform(
            name='merge_snvs',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('snvs.vcf', 'regions'),
                  mgd.TempOutputFile('snvs.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("snvs_merge")),
        )

    workflow.transform(
        name='bcftools_normalize_snv',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('snvs.vcf.gz'),
            mgd.TempOutputFile('normalized_snvs.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_normalize_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs.vcf'),
            mgd.TempOutputFile('normalized_snvs_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='bcftools_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('indels.vcf.gz'),
            mgd.TempOutputFile('normalized_indels.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_indels.vcf'),
            mgd.TempOutputFile('normalized_indels_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_indels_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(
        name="strelka_snv_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_maf_file),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id
        })

    workflow.subworkflow(
        name="strelka_indel_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_maf_file),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id
        })

    return workflow
def merge_bams_workflow(args):
    """Build a workflow that merges per-cell bams into per-region bams.

    Loads per-cell bams from the input yaml, strips soft-clipped reads
    above a threshold, merges cells into region-level bams via the
    wgs merge subworkflow, and writes a metadata yaml for the outputs.

    :param args: dict of command-line options; reads 'input_yaml',
        'out_dir', 'softclipped_reads_threshold'
    :return: the constructed pypeliner workflow
    """
    config = inpututils.load_config(args)
    config = config['merge_bams']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low']
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    bam_files = inpututils.load_merge_cell_bams(args['input_yaml'])

    # one merged bam per region chunk
    merge_out_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name="get_regions",
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.transform(
        name="remove_softclipped_reads",
        func="single_cell.utils.pysamutils.remove_softclipped_reads",
        axes=('cell_id',),
        args=(mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files,
                            extensions=['.bai']),
              mgd.TempOutputFile('bam_rm_softclipped.bam', 'cell_id',
                                 extensions=['.bai']),
              args['softclipped_reads_threshold']))

    workflow.subworkflow(
        name="wgs_merge_workflow",
        func=merge_bams.create_merge_bams_workflow,
        args=(
            mgd.TempInputFile('bam_rm_softclipped.bam', 'cell_id',
                              extensions=['.bai']),
            mgd.OutputFile("merged.bam", "region", axes_origin=[],
                           extensions=['.bai'], template=merge_out_template),
            mgd.InputChunks("region"),
            config,
        ))

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:],
              args['out_dir'],
              mgd.Template('bam_filenames', 'region',
                           template=merge_out_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'template': (mgd.InputChunks('region'), merge_out_template,
                         'region'),
            'metadata': {
                'type': 'pseudowgs_regionbams',
                'cell_ids': list(bam_files.keys())
            }
        })

    return workflow
def split_bam_workflow(workflow, args):
    """Add bam-splitting stages to an existing workflow.

    Splits the input wgs bam either by region (when the output template
    contains '{region}') or into a fixed number of read chunks, resolves
    the resulting filenames, and writes an info yaml describing them.

    Fixes relative to the previous version:
    - the 'get_files' transform always read ``TempInputObj('region')``
      with keyword ``'region'`` even in by-reads mode (where no 'region'
      object exists); it now uses the mode-appropriate selector/keyword.
    - the metadata 'name' said 'merge_bams' (copy-paste) — now
      'split_bams'.

    :param workflow: the pypeliner workflow to extend
    :param args: dict of command-line options; reads 'out_dir',
        'split_bam_template', 'wgs_bam'
    :return: the extended workflow
    """
    config = helpers.load_config(args)

    info_file = os.path.join(args["out_dir"], 'results', 'split_bam',
                             'info.yaml')

    split_bam_template = args["split_bam_template"]
    split_bai_template = args["split_bam_template"] + ".bai"

    # '{region}' in the template selects region mode; otherwise split by
    # read count
    by_reads = "{region}" not in split_bam_template
    splitkeyword = "region" if "{region}" in split_bam_template else "reads"

    if by_reads:
        splitnames = [str(i) for i in range(config["num_splits_byreads"])]
        workflow.setobj(
            obj=mgd.OutputChunks('reads'),
            value=splitnames,
        )
    else:
        workflow.transform(
            name="get_regions",
            ctx={
                'mem': 2,
                'num_retry': 3,
                'mem_retry_increment': 2,
                'pool_id': config['pools']['standard'],
                'ncpus': 1
            },
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.TempOutputObj('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))

    workflow.subworkflow(
        name="split_normal",
        func=split_bams.create_split_workflow,
        args=(
            mgd.InputFile(args['wgs_bam']),
            mgd.InputFile(args['wgs_bam'] + ".bai"),
            mgd.OutputFile("normal.split.bam", splitkeyword,
                           template=split_bam_template, axes_origin=[]),
            mgd.OutputFile("normal.split.bam.bai", splitkeyword,
                           template=split_bai_template, axes_origin=[]),
            pypeliner.managed.TempInputObj(splitkeyword),
            config,
        ),
        kwargs={"by_reads": by_reads})

    # mode-appropriate split selector for resolving output filenames
    regions = mgd.InputChunks(
        'reads') if by_reads else pypeliner.managed.TempInputObj('region')

    workflow.transform(
        name="get_files",
        func='single_cell.utils.helpers.resolve_template',
        ret=pypeliner.managed.TempOutputObj('outputs'),
        # BUGFIX: previously always ('region', TempInputObj('region')),
        # which is undefined in by-reads mode
        args=(regions, split_bam_template, splitkeyword))

    metadata = {
        'split_bams': {
            'name': 'split_bams',  # BUGFIX: was 'merge_bams' (copy-paste)
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': pypeliner.managed.TempInputObj('outputs'),
            'input_datasets': args['wgs_bam'],
            'results': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(
            mem=config['memory']['med'],
            pool_id=config['pools']['standard'],
        ),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(mgd.OutputFile(info_file), metadata))

    return workflow
def copy_number_calling_workflow(args):
    """Build and RUN the pseudo-wgs copy-number calling pipeline.

    Extracts seqdata from tumour cells (per cell) and normal wgs (per
    region), then runs the titan subworkflow over them. Unlike sibling
    builders in this file, this function executes the pipeline itself via
    ``pyp.run(workflow)`` and returns None.

    :param args: dict of command-line options; reads 'input_yaml',
        'out_dir', 'clone_id'
    """
    config = helpers.load_config(args)
    config = config['copy_number_calling']

    pyp = pypeliner.app.Pypeline(config=args)

    ctx = {'mem_retry_increment': 2,
           'disk_retry_increment': 50,
           'ncpus': 1,
           'docker_image': config['docker']['single_cell_pipeline']
           }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']

    # normal wgs input must be a per-region template
    assert '{region}' in normal_wgs

    copynumber_dir = os.path.join(args["out_dir"], "copynumber")

    out_file = os.path.join(copynumber_dir, "results", "results.h5")

    cloneid = args["clone_id"]

    remixt_config = config.get('extract_seqdata', {})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.transform(
        name="get_regions",
        ctx=dict(mem=config['memory']['low']),
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=mgd.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        )
    )

    # NOTE(review): the four remixt config lookups below produce temp objs
    # that no visible downstream node consumes — confirm whether they are
    # consumed inside the subworkflows or are dead nodes.
    workflow.transform(
        name="get_snp_positions_filename",
        func="remixt.config.get_filename",
        ret=mgd.TempOutputObj('snp_positions_filename'),
        args=(
            remixt_config,
            config['ref_data_dir'],
            'snp_positions'
        )
    )

    workflow.transform(
        name="get_bam_max_fragment_length",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_fragment_length'),
        args=(
            remixt_config,
            'bam_max_fragment_length'
        )
    )

    workflow.transform(
        name="get_bam_max_soft_clipped",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_soft_clipped'),
        args=(
            remixt_config,
            'bam_max_soft_clipped'
        )
    )

    workflow.transform(
        name="get_bam_check_proper_pair",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_check_proper_pair'),
        args=(
            remixt_config,
            'bam_check_proper_pair'
        )
    )

    workflow.subworkflow(
        name="extract_seqdata_tumour",
        axes=('tumour_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups',
                'tumour_cell_id',
                fnames=tumour_cells,
                extensions=['.bai']
            ),
            mgd.TempOutputFile("tumour.h5", "tumour_cell_id"),
            config.get('extract_seqdata', {}),
            config['ref_data_dir'],
            config
        )
    )

    workflow.subworkflow(
        name="extract_seqdata_normal",
        axes=('region',),
        ctx={'disk': 200},
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups',
                'region',
                template=normal_wgs,
                extensions=['.bai']
            ),
            mgd.TempOutputFile("normal.h5", "region"),
            config.get('extract_seqdata', {}),
            config['ref_data_dir'],
            config,
        )
    )

    workflow.subworkflow(
        name='titan_workflow',
        func=titan.create_titan_workflow,
        args=(
            mgd.TempInputFile("normal.h5", "region"),
            mgd.TempInputFile("tumour.h5", "tumour_cell_id"),
            config['ref_genome'],
            copynumber_dir,
            mgd.OutputFile(out_file),
            config,
            args,
            tumour_cells.keys(),
            mgd.InputChunks('region'),
            cloneid
        ),
    )

    pyp.run(workflow)
def create_freebayes_germline_workflow(
        germline_vcf, germline_maf, bam_file, reference, reference_vep,
        chromosomes, normal_id, single_node=None):
    """Build a pypeliner workflow for germline calling with freebayes.

    Runs freebayes per interval (or as one multi-core job with
    ``single_node``), merges and normalizes the vcf, finalises/indexes it,
    and annotates it into a MAF via the vcf2maf subworkflow.

    Fix: the maf output previously declared ``extensions=['.tbi', '.csi']``
    (tabix indexes), which apply to bgzipped vcfs, not MAFs; every other
    vcf2maf call in this codebase passes the maf without extensions.

    :param germline_vcf: output path for the finalised, indexed vcf
    :param germline_maf: output path for the annotated maf
    :param bam_file: input bam path
    :param reference: reference genome fasta
    :param reference_vep: VEP reference data for vcf2maf
    :param chromosomes: chromosomes to generate calling intervals over
    :param normal_id: sample id recorded in the maf
    :param single_node: if truthy, run all intervals in one 8-cpu job
    :return: the constructed pypeliner workflow
    """
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.freebayes.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name='freebayes_one_node',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='48:00',
                ncpus=8,
                disk=600),
            func='wgs.workflows.freebayes.tasks.run_freebayes_one_job',
            args=(
                mgd.TempSpace("run_freebayes_temp"),
                mgd.TempOutputFile('merged.vcf'),
                reference,
                mgd.InputChunks('interval'),
                mgd.InputFile(bam_file)))
    else:
        workflow.transform(
            name='freebayes',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval',),
            func='wgs.workflows.freebayes.tasks.run_freebayes_germline',
            args=(
                mgd.TempOutputFile('freebayes_germline.vcf', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                mgd.InputFile(bam_file),
                mgd.TempSpace('tempdir_freebayes', 'interval')),
        )

        # merge per-interval calls into the same 'merged.vcf' temp file
        # that the single-node branch produces directly
        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('freebayes_germline.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )

    workflow.transform(
        name='bcftools_normalize',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.TempOutputFile('normalized.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(germline_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(
        name="freebayes_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(germline_vcf, extensions=['.tbi', '.csi']),
            # BUGFIX: dropped extensions=['.tbi', '.csi'] — a MAF is plain
            # TSV and is not tabix-indexed (matches the other vcf2maf calls)
            mgd.OutputFile(germline_maf),
            reference_vep,
        ),
        kwargs={'normal_id': normal_id})

    return workflow
def split_bam_workflow(args):
    """Build a workflow that splits a wgs bam into per-region bams.

    NOTE(review): this function shares its name with the two-argument
    ``split_bam_workflow(workflow, args)`` earlier in this source; if both
    live in the same module the later definition shadows the earlier one —
    confirm they belong to different files.

    :param args: dict of command-line options; reads 'input_yaml',
        'out_dir'
    :return: the constructed pypeliner workflow
    """
    config = inpututils.load_config(args)
    config = config['split_bam']

    bam_file = inpututils.load_split_wgs_input(args['input_yaml'])

    baseimage = config['docker']['single_cell_pipeline']

    # one output bam per region chunk
    split_bam_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow = pypeliner.workflow.Workflow(ctx={'docker_image': baseimage})

    workflow.transform(
        name="get_regions",
        ctx={
            'mem': config['memory']['low'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.subworkflow(
        name="split_normal",
        func=split_bams.create_split_workflow,
        ctx={
            'mem': config['memory']['low'],
            'ncpus': 1
        },
        args=(
            mgd.InputFile(bam_file),
            mgd.OutputFile("normal.split.bam", 'region',
                           template=split_bam_template, axes_origin=[]),
            pypeliner.managed.InputChunks('region'),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:],
              args['out_dir'],
              mgd.Template('bam_filenames', 'region',
                           template=split_bam_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'wgs_regionbams'},
            'template': (mgd.InputChunks('region'), split_bam_template,
                         'region'),
        })

    return workflow
def create_samtools_germline_workflow(
        germline_vcf, germline_roh, bam_file, reference, chromosomes,
        single_node=None):
    """Build a pypeliner workflow for germline calling with samtools.

    Runs the samtools germline caller per interval (or as one multi-core
    job with ``single_node``), merges and finalises the vcf, then performs
    runs-of-homozygosity calling on the result.

    :param germline_vcf: output path for the finalised, indexed vcf
    :param germline_roh: output path for the ROH calling result
    :param bam_file: input bam path
    :param reference: reference genome fasta
    :param chromosomes: chromosomes to generate calling intervals over
    :param single_node: if truthy, run all intervals in one 8-cpu job
    :return: the constructed pypeliner workflow
    """
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.samtools_germline.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name='samtools_germline',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='48:00',
                ncpus=8,
                disk=600),
            func=
            'wgs.workflows.samtools_germline.tasks.run_samtools_germline_one_job',
            args=(
                mgd.TempSpace("run_samtools_temp"),
                mgd.TempOutputFile('merged.vcf'),
                reference,
                mgd.InputChunks('interval'),
                mgd.InputFile(bam_file)),
            kwargs={
                'samtools_docker_image': config.containers('samtools'),
                'vcftools_docker_image': config.containers('vcftools')
            })
    else:
        workflow.transform(
            name='samtools_germline',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval',),
            func='wgs.workflows.samtools_germline.tasks.run_samtools_germline',
            args=(
                mgd.TempOutputFile('germline.vcf.gz', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                mgd.InputFile(bam_file)),
            kwargs={'docker_image': config.containers('samtools')})

        # merge per-interval calls into the same 'merged.vcf' temp file
        # that the single-node branch produces directly
        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('germline.vcf.gz', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.OutputFile(germline_vcf, extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_image': config.containers('vcftools')})

    # NOTE(review): ROH calling runs under the 'vcftools' container here —
    # confirm that image ships the roh tool (bcftools roh is typical).
    workflow.transform(
        name='roh_calling',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.workflows.samtools_germline.tasks.roh_calling',
        args=(mgd.InputFile(germline_vcf, extensions=['.tbi', '.csi']),
              mgd.OutputFile(germline_roh)),
        kwargs={'docker_image': config.containers('vcftools')})

    return workflow