import os

import pandas as pd

import pypeliner
import pypeliner.managed as mgd

# NOTE: the module-local names used below (soil.*, tasks, config, helpers,
# remixt.*, destruct.*, lumpy_preprocess_workflow, check_chr_prefix,
# low_mem_ctx, med_mem_ctx, ...) are assumed to be provided by each builder's
# home module; these functions are collected from several pipelines and are
# not importable from a single package.


def _create_download_cosmic_file_subworkflow(host, host_path, user, password, out_file, local_download=False):
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(
        name='download',
        ctx={'local': local_download},
        func=tasks.download_from_sftp,
        args=(host, host_path, mgd.TempOutputFile('file.vcf.gz'), user, password),
    )

    workflow.transform(
        name='decompress',
        func=tasks.decompress,
        args=(mgd.TempInputFile('file.vcf.gz'), mgd.TempOutputFile('file.vcf')),
    )

    workflow.transform(
        name='bgzip',
        func=soil.wrappers.samtools.tasks.compress_vcf,
        args=(mgd.TempInputFile('file.vcf'), mgd.OutputFile(out_file)),
    )

    return workflow

def create_db_workflow(in_file, ref_proteome_fasta_file, out_file, genome_version='GRCh37', pyensembl_cache_dir=None):
    sandbox = pypeliner.sandbox.CondaSandbox(pip_packages=['varcode'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(
        name='clean_ref_fasta',
        func=tasks.clean_ref_proteome_ids,
        args=(mgd.InputFile(ref_proteome_fasta_file), mgd.TempOutputFile('ref.fasta')),
    )

    workflow.transform(
        name='build_variant_table',
        func=tasks.build_variant_table,
        args=(mgd.InputFile(in_file), mgd.TempOutputFile('variant_table.tsv.gz')),
        kwargs={
            'genome_version': genome_version,
            'pyensembl_cache_dir': pyensembl_cache_dir,
        },
    )

    workflow.transform(
        name='build_variant_fasta',
        func=tasks.build_variant_fasta,
        args=(mgd.TempInputFile('variant_table.tsv.gz'), mgd.TempOutputFile('var.fasta')),
    )

    workflow.commandline(
        name='build_db',
        args=(
            'cat',
            mgd.TempInputFile('ref.fasta'),
            mgd.TempInputFile('var.fasta'),
            '>',
            mgd.OutputFile(out_file),
        ),
    )

    return workflow

def run_MutationSeq(config, normal_bam, tumour_bam, output_file):
    workflow = pypeliner.workflow.Workflow()

    # Chromosomes 1-22 and X. Under Python 3, range() must be materialised to
    # a list before it can be concatenated with ['X'].
    workflow.setobj(
        obj=mgd.OutputChunks('interval'),
        value=[str(x) for x in list(range(1, 23)) + ['X']],
    )

    workflow.transform(
        name='run_museq_paired',
        ctx={'mem': 8, 'ncpus': 1, 'walltime': '24:00'},
        axes=('interval',),
        func=tasks.run_museq,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.InputInstance('interval'),
            mgd.TempOutputFile('museq.vcf', 'interval'),
            mgd.TempOutputFile('museq.log', 'interval'),
        ),
    )

    workflow.transform(
        name='merge_vcfs',
        func=tasks.merge_vcfs,
        args=(
            mgd.TempInputFile('museq.vcf', 'interval', axes_origin=[]),
            mgd.OutputFile(output_file),
            mgd.TempSpace('merge_vcf'),
        ),
    )

    return workflow

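# The 'interval' axis above is pypeliner's fan-out mechanism: setobj() binds a
# list of chunk names to the axis, axes=('interval',) runs a job once per
# chunk, and a downstream job declared without the axis receives all chunks at
# once. A minimal sketch of the same split/process/merge shape follows;
# count_lines and sum_counts are hypothetical task functions, not part of any
# pipeline in this module.
def _axis_fanout_sketch(in_files, out_file):
    workflow = pypeliner.workflow.Workflow()

    # One chunk per key of in_files ({sample_name: path}).
    workflow.setobj(obj=mgd.OutputChunks('sample'), value=list(in_files.keys()))

    # Fan out: one count_lines job per 'sample' chunk.
    workflow.transform(
        name='count_lines',
        axes=('sample',),
        func=count_lines,  # hypothetical: writes a per-sample line count
        args=(
            mgd.InputFile('reads', 'sample', fnames=in_files),
            mgd.TempOutputFile('count.txt', 'sample'),
        ),
    )

    # Fan in: no axes here, so sum_counts receives the 'sample'-split temp
    # file as a mapping of chunk -> filename.
    workflow.transform(
        name='sum_counts',
        func=sum_counts,  # hypothetical: merges the per-sample counts
        args=(
            mgd.TempInputFile('count.txt', 'sample'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
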
def run_Strelka(config, normal_bam, tumour_bam, snv_output_file, indel_output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='configure_bed',
        func=tasks.configure_bed,
        args=(
            mgd.TempSpace('bed_space'),
            mgd.InputFile(config['bed_file']),
            mgd.TempOutputFile('bed.gz'),
            mgd.TempOutputFile('bed.gz.tbi'),
        ),
    )

    workflow.transform(
        name='run_strelka',
        ctx={'mem': 10, 'ncpus': 1, 'walltime': '08:00'},
        func=tasks.run_strelka,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.TempInputFile('bed.gz'),
            mgd.TempInputFile('bed.gz.tbi'),
            mgd.TempSpace('strelka_workspace'),
            mgd.OutputFile(snv_output_file),
            mgd.OutputFile(indel_output_file),
        ),
    )

    return workflow

def create_cohort_qc_report(cohort_label, out_dir, filtered_cohort_maf, cna_table, report_path):
    oncoplot = os.path.join(out_dir, cohort_label, "cohort_oncoplot.png")
    somatic_interactions_plot = os.path.join(out_dir, cohort_label, "somatic_interactions.png")
    summary_plot = os.path.join(out_dir, cohort_label, "summary.png")
    burden_plot = os.path.join(out_dir, cohort_label, "mutation_burden.png")

    workflow = pypeliner.workflow.Workflow()

    non_synonymous_labels = [
        "Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site", "Translation_Start_Site",
        "Nonsense_Mutation", "Nonstop_Mutation", "In_Frame_Del", "In_Frame_Ins",
        "Missense_Mutation",
    ]

    workflow.transform(
        name='postprocess_maf',
        func='wgs.workflows.cohort_qc.tasks.prepare_maf_for_maftools',
        args=(
            cohort_label,
            mgd.InputFile(filtered_cohort_maf),
            mgd.TempOutputFile("prepared_maf"),
            non_synonymous_labels,
            mgd.TempOutputFile("vcNames"),
        ),
    )

    workflow.transform(
        name='burden_plot',
        func='wgs.workflows.cohort_qc.tasks.plot_mutation_burden',
        args=(
            mgd.InputFile(filtered_cohort_maf),
            mgd.OutputFile(burden_plot),
        ),
    )

    workflow.transform(
        name='build_gene_list',
        func='wgs.workflows.cohort_qc.tasks.build_gene_list',
        args=(
            mgd.InputFile(cna_table),
            mgd.TempOutputFile("genelist"),
        ),
    )

    workflow.transform(
        name='make_cohort_plots',
        func='wgs.workflows.cohort_qc.tasks.make_R_cohort_plots',
        args=(
            mgd.TempInputFile("prepared_maf"),
            mgd.InputFile(cna_table),
            mgd.OutputFile(oncoplot),
            mgd.OutputFile(somatic_interactions_plot),
            mgd.OutputFile(summary_plot),
            mgd.TempInputFile("vcNames"),
            mgd.TempInputFile("genelist"),
        ),
    )

    workflow.transform(
        name='make_report',
        func='wgs.workflows.cohort_qc.tasks.make_report',
        args=(
            cohort_label,
            mgd.InputFile(oncoplot),
            mgd.InputFile(somatic_interactions_plot),
            mgd.InputFile(summary_plot),
            mgd.InputFile(burden_plot),
            mgd.OutputFile(report_path),
        ),
    )

    return workflow

def create_vcf_db_annotation_workflow(db_vcf_file, target_vcf_file, out_file, docker_config=None, split_size=int(1e4)):
    # Avoid a mutable default argument; an empty config leaves ctx unchanged.
    ctx = dict(mem=2, num_retry=3, mem_retry_increment=2, **(docker_config or {}))

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='split_vcf',
        ctx=ctx,
        func='biowrappers.components.io.vcf.tasks.split_vcf',
        args=(
            mgd.InputFile(target_vcf_file),
            mgd.TempOutputFile('split.vcf', 'split'),
        ),
        kwargs={'lines_per_file': split_size},
    )

    workflow.transform(
        name='annotate_db_status',
        axes=('split',),
        ctx=ctx,
        func='biowrappers.components.variant_calling.annotated_db_status.tasks.annotate_db_status',
        args=(
            db_vcf_file,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('annotated.csv.gz', 'split', extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_tables',
        ctx=ctx,
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('annotated.csv.gz', 'split'),
            mgd.OutputFile(out_file, extensions=['.yaml']),
        ),
    )

    return workflow

def create_db_annotation_workflow(in_vcf_file, out_csv_file, db_vcf_file, split_size=int(1e4)):
    # split_size is a line count, so keep it integral (1e4 is a float).
    workflow = pypeliner.workflow.Workflow(
        ctx=dict(mem=2, num_retry=3, mem_retry_increment=2),
    )

    workflow.transform(
        name='split_vcf',
        func='single_cell.utils.vcfutils.split_vcf',
        args=(
            mgd.InputFile(in_vcf_file),
            mgd.TempOutputFile('split.vcf', 'split'),
        ),
        kwargs={'lines_per_file': split_size},
    )

    workflow.transform(
        name='annotate_db_status',
        axes=('split',),
        func='single_cell.workflows.db_annotation.tasks.annotate_db_status',
        args=(
            db_vcf_file,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('annotated.csv.gz', 'split', extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_tables',
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('annotated.csv.gz', 'split', extensions=['.yaml']),
            mgd.OutputFile(out_csv_file, extensions=['.yaml']),
        ),
    )

    return workflow

def create_lumpy_workflow(lumpy_vcf, tumour_bam=None, normal_bam=None, single_node=False):
    workflow = pypeliner.workflow.Workflow()

    lumpy_job_name = 'run_lumpy'

    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam)
        normal_disc = mgd.TempInputFile('normal.discordants.sorted.bam')
        normal_split = mgd.TempInputFile('normal.splitters.sorted.bam')
        lumpy_job_name += '_normal'
    else:
        normal_disc = None
        normal_split = None

    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam)
        tumour_disc = mgd.TempInputFile('tumour.discordants.sorted.bam')
        tumour_split = mgd.TempInputFile('tumour.splitters.sorted.bam')
        lumpy_job_name += '_tumour'
    else:
        tumour_disc = None
        tumour_split = None

    if normal_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_normal',
            func=lumpy_preprocess_workflow,
            args=(
                normal_bam,
                mgd.TempOutputFile('normal.discordants.sorted.bam'),
                mgd.TempOutputFile('normal.splitters.sorted.bam'),
            ),
            kwargs={'single_node': single_node},
        )

    if tumour_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_tumour',
            func=lumpy_preprocess_workflow,
            args=(
                tumour_bam,
                mgd.TempOutputFile('tumour.discordants.sorted.bam'),
                mgd.TempOutputFile('tumour.splitters.sorted.bam'),
            ),
            kwargs={'single_node': single_node},
        )

    workflow.transform(
        name=lumpy_job_name,
        ctx=helpers.get_default_ctx(memory=10, disk=500, walltime='72:00'),
        func='wgs.workflows.lumpy.tasks.run_lumpyexpress',
        args=(
            mgd.OutputFile(lumpy_vcf),
            config.default_params('breakpoint_calling')['lumpy_paths'],
        ),
        kwargs={
            'tumour_bam': tumour_bam,
            'tumour_discordants': tumour_disc,
            'tumour_splitters': tumour_split,
            'normal_bam': normal_bam,
            'normal_discordants': normal_disc,
            'normal_splitters': normal_split,
            'docker_image': config.containers('lumpy'),
        },
    )

    return workflow

def create_optitype_workflow(bam_file, hla_type_file, is_rna=False, threads=1):
    if check_chr_prefix(bam_file):
        chrom_str = 'chr6'
    else:
        chrom_str = '6'

    sandbox = soil.utils.workflow.get_sandbox(['optitype', 'razers3', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.commandline(
        name='extract_chr6',
        args=(
            'samtools', 'view', '-bh', '-f', '2', '-F', '4',
            mgd.InputFile(bam_file), chrom_str,
            '|',
            'samtools', 'collate', '-O', '-', mgd.TempSpace('chr6_collate_temp'),
            '|',
            'samtools', 'bam2fq',
            '-1', mgd.TempOutputFile('chr6_reads_1.fq'),
            '-2', mgd.TempOutputFile('chr6_reads_2.fq'),
            '-',
        ),
    )

    workflow.transform(
        name='optitype',
        ctx={'mem': 24, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': threads},
        func=tasks.run_optitype,
        args=(
            mgd.TempInputFile('chr6_reads_1.fq'),
            mgd.TempInputFile('chr6_reads_2.fq'),
            mgd.OutputFile(hla_type_file),
            mgd.TempSpace('optitype_temp'),
        ),
        kwargs={
            'is_rna': is_rna,
            'threads': threads,
        },
    )

    return workflow

def _create_download_cosmic_workflow(ref_data_version, out_file, user, password, host='sftp-cancer.sanger.ac.uk', local_download=False):
    host_base_path = '/files/{}/cosmic/v83/VCF'.format(ref_data_version.lower())

    coding_host_path = '/'.join([host_base_path, 'CosmicCodingMuts.vcf.gz'])

    non_coding_host_path = '/'.join([host_base_path, 'CosmicNonCodingVariants.vcf.gz'])

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('coding_host_path'), value=coding_host_path)

    workflow.setobj(obj=mgd.TempOutputObj('non_coding_host_path'), value=non_coding_host_path)

    workflow.subworkflow(
        name='download_coding',
        func=_create_download_cosmic_file_subworkflow,
        args=(
            host,
            mgd.TempInputObj('coding_host_path'),
            user,
            password,
            mgd.TempOutputFile('coding.vcf.gz'),
        ),
        kwargs={'local_download': local_download},
    )

    workflow.subworkflow(
        name='download_non_coding',
        func=_create_download_cosmic_file_subworkflow,
        args=(
            host,
            mgd.TempInputObj('non_coding_host_path'),
            user,
            password,
            mgd.TempOutputFile('non_coding.vcf.gz'),
        ),
        kwargs={'local_download': local_download},
    )

    workflow.transform(
        name='merge_files',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            [
                mgd.TempInputFile('coding.vcf.gz'),
                mgd.TempInputFile('non_coding.vcf.gz'),
            ],
            mgd.OutputFile(out_file),
        ),
        kwargs={
            'allow_overlap': True,
            'index_file': mgd.OutputFile(out_file + '.tbi'),
        },
    )

    return workflow

def create_somatic_consensus_workflow(
        mutect_snv_vcf,
        strelka_snv_vcf,
        strelka_indel_vcf,
        museq_snv_vcf,
        consensus_maf,
        chromosomes,
        reference_vep,
        normal_id,
        tumour_id,
):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='snv_consensus',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.somatic_calling_consensus.consensus.main',
        args=(
            mgd.InputFile(museq_snv_vcf),
            mgd.InputFile(strelka_snv_vcf),
            mgd.InputFile(mutect_snv_vcf),
            mgd.InputFile(strelka_indel_vcf),
            mgd.TempOutputFile('consensus.vcf'),
            mgd.TempOutputFile('counts.csv'),
            chromosomes,
        ),
    )

    workflow.subworkflow(
        name='consensus_maf',
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.TempInputFile('consensus.vcf'),
            mgd.TempOutputFile('consensus.maf'),
            reference_vep,
        ),
        kwargs={
            'normal_id': normal_id,
            'tumour_id': tumour_id,
        },
    )

    workflow.transform(
        name='maf_counts',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.somatic_calling_consensus.tasks.update_maf_counts',
        args=(
            mgd.TempInputFile('consensus.maf'),
            mgd.TempInputFile('counts.csv'),
            mgd.OutputFile(consensus_maf),
        ),
    )

    return workflow

def create_consensus_workflow(destruct_breakpoints, lumpy_vcf, output, chromosomes):
    params = config.default_params('breakpoint_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='parse_lumpy',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.parse_lumpy_task',
        args=(
            mgd.InputFile(lumpy_vcf),
            mgd.TempOutputFile('lumpy.csv'),
            params["parse_lumpy"],
        ),
        kwargs={'chromosomes': chromosomes},
    )

    workflow.transform(
        name='parse_destruct',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.parse_destruct_task',
        args=(
            mgd.InputFile(destruct_breakpoints),
            mgd.TempOutputFile('destruct.csv'),
            params["parse_destruct"],
        ),
        kwargs={'chromosomes': chromosomes},
    )

    workflow.transform(
        name='consensus_breakpoint_calling',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.consensus_calls',
        args=(
            mgd.TempInputFile('destruct.csv'),
            mgd.TempInputFile('lumpy.csv'),
            mgd.OutputFile(output, extensions=['.yaml']),
            params['consensus'],
        ),
    )

    return workflow

def create_snpeff_annotation_workflow(in_vcf_file, out_csv_file, db, data_dir, split_size=int(1e3)):
    workflow = pypeliner.workflow.Workflow(
        ctx={'num_retry': 3, 'mem_retry_increment': 2},
    )

    workflow.transform(
        name='split_vcf',
        func='single_cell.utils.vcfutils.split_vcf',
        args=(
            mgd.InputFile(in_vcf_file),
            mgd.TempOutputFile('split.vcf', 'split'),
        ),
        kwargs={'lines_per_file': split_size},
    )

    workflow.transform(
        name='run_snpeff',
        axes=('split',),
        func='single_cell.workflows.snpeff_annotation.tasks.run_snpeff',
        args=(
            db,
            data_dir,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('snpeff.vcf', 'split'),
        ),
        kwargs={
            'classic_mode': True,
        },
    )

    workflow.transform(
        name='convert_vcf_to_csv',
        axes=('split',),
        func='single_cell.workflows.snpeff_annotation.tasks.convert_vcf_to_table',
        args=(
            mgd.TempInputFile('snpeff.vcf', 'split'),
            mgd.TempOutputFile('snpeff.csv.gz', 'split', extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='concatenate_tables',
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('snpeff.csv.gz', 'split', extensions=['.yaml']),
            mgd.OutputFile(out_csv_file, extensions=['.yaml']),
        ),
    )

    return workflow

def create_pileup2snp_workflow(bam_file, ref_genome_fasta_file, out_file, chromosomes=None, split_size=int(1e7)):
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools', 'varscan'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx, default_sandbox=sandbox)

    workflow.setobj(
        obj=mgd.TempOutputObj('config', 'regions'),
        value=soil.utils.genome.get_bam_regions(bam_file, split_size, chromosomes=chromosomes),
    )

    workflow.commandline(
        name='run_mpileup',
        axes=('regions',),
        args=(
            'samtools', 'mpileup',
            '-f', mgd.InputFile(ref_genome_fasta_file),
            '-o', mgd.TempOutputFile('region.mpileup', 'regions'),
            '-r', mgd.TempInputObj('config', 'regions'),
            mgd.InputFile(bam_file),
        ),
    )

    workflow.transform(
        name='run_mpileup2snp',
        axes=('regions',),
        ctx=med_mem_ctx,
        func=tasks.mpileup2snp,
        args=(
            mgd.TempInputFile('region.mpileup', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        ),
    )

    workflow.transform(
        name='compress',
        axes=('regions',),
        func=soil.wrappers.samtools.tasks.compress_vcf,
        args=(
            mgd.TempInputFile('region.vcf', 'regions'),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
        ),
    )

    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow

def create_snpeff_annotation_workflow(db, data_dir, target_vcf_file, out_file, base_docker=None, snpeff_docker=None, classic_mode=True, split_size=int(1e3), table_name='snpeff'):
    # Avoid mutable default arguments; empty docker configs are substituted below.
    ctx = {'num_retry': 3, 'mem_retry_increment': 2}

    if base_docker:
        ctx.update(base_docker)

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='split_vcf',
        ctx=dict(mem=2, **ctx),
        func='biowrappers.components.io.vcf.tasks.split_vcf',
        args=(
            mgd.InputFile(target_vcf_file),
            mgd.TempOutputFile('split.vcf', 'split'),
        ),
        kwargs={'lines_per_file': split_size},
    )

    workflow.transform(
        name='run_snpeff',
        axes=('split',),
        ctx=dict(mem=8, **ctx),
        func='biowrappers.components.variant_calling.snpeff.tasks.run_snpeff',
        args=(
            db,
            data_dir,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('snpeff.vcf', 'split'),
        ),
        kwargs={
            'classic_mode': classic_mode,
            'docker_config': snpeff_docker or {},
        },
    )

    workflow.transform(
        name='convert_vcf_to_csv',
        axes=('split',),
        ctx=dict(mem=4, **ctx),
        func='biowrappers.components.variant_calling.snpeff.tasks.convert_vcf_to_table',
        args=(
            mgd.TempInputFile('snpeff.vcf', 'split'),
            mgd.TempOutputFile('snpeff.csv.gz', 'split', extensions=['.yaml']),
            table_name,
        ),
    )

    workflow.transform(
        name='concatenate_tables',
        ctx=dict(mem=4, **ctx),
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('snpeff.csv.gz', 'split'),
            mgd.OutputFile(out_file, extensions=['.yaml']),
        ),
    )

    return workflow

def create_vcf_tric_nucleotide_annotation_workflow(ref_genome_fasta_file, vcf_file, out_file, docker_config=None, split_size=int(1e4), table_name='tri_nucleotide_context'):
    ctx = {'num_retry': 3, 'mem_retry_increment': 2}

    if docker_config:
        ctx.update(docker_config)

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='split_vcf',
        ctx=dict(mem=2, **ctx),
        func='biowrappers.components.io.vcf.tasks.split_vcf',
        args=(
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('split.vcf', 'split'),
        ),
        kwargs={'lines_per_file': split_size},
    )

    workflow.transform(
        name='annotate_db_status',
        axes=('split',),
        ctx=dict(mem=4, **ctx),
        func='biowrappers.components.variant_calling.tri_nucleotide_context.tasks.get_tri_nucelotide_context',
        args=(
            ref_genome_fasta_file,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('tri_nucleotide_context.csv.gz', 'split', extensions=['.yaml']),
            table_name,
        ),
    )

    workflow.transform(
        name='merge_tables',
        ctx=dict(mem=2, **ctx),
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('tri_nucleotide_context.csv.gz', 'split'),
            mgd.OutputFile(out_file, extensions=['.yaml']),
        ),
    )

    return workflow

def create_eagle_ref_data_workflow(vcf_url_template, out_file, local_download=False):
    chrom_map_file = soil.utils.package_data.load_data_file('ref_data/data/GRCh37/chrom_map.tsv')

    chrom_map = pd.read_csv(chrom_map_file, sep='\t')

    # Autosomes only: chromosomes 1-22.
    chrom_map = chrom_map[chrom_map['ncbi'].isin([str(x) for x in range(1, 23)])]

    chrom_map['url'] = chrom_map['ncbi'].apply(lambda x: vcf_url_template.format(chrom=x))

    vcf_urls = chrom_map['url'].to_dict()

    sandbox = soil.utils.workflow.get_sandbox(['bcftools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('vcf_url', 'chrom'), value=vcf_urls)

    workflow.transform(
        name='download_vcf_files',
        axes=('chrom',),
        ctx={'local': local_download},
        func=soil.ref_data.tasks.download,
        args=(
            mgd.TempInputObj('vcf_url', 'chrom'),
            mgd.TempOutputFile('raw.vcf.gz', 'chrom'),
        ),
    )

    workflow.transform(
        name='write_chrom_map',
        func=tasks.write_chrom_map_file,
        args=(
            mgd.InputFile(chrom_map_file),
            mgd.TempOutputFile('chrom_map.tsv'),
        ),
    )

    workflow.transform(
        name='rename_chroms',
        axes=('chrom',),
        func=soil.wrappers.bcftools.tasks.rename_chroms,
        args=(
            mgd.TempInputFile('chrom_map.tsv'),
            mgd.TempInputFile('raw.vcf.gz', 'chrom'),
            mgd.TempOutputFile('renamed.bcf', 'chrom'),
        ),
    )

    workflow.transform(
        name='concat_vcfs',
        func=soil.wrappers.bcftools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('renamed.bcf', 'chrom'),
            mgd.OutputFile(out_file),
        ),
        kwargs={'bcf_output': True},
    )

    workflow.commandline(
        name='index',
        args=(
            'bcftools', 'index',
            mgd.InputFile(out_file),
            '-o', mgd.OutputFile(out_file + '.csi'),
        ),
    )

    return workflow

def create_variant_counting_workflow(vcfs, tumour_cell_bams, results_h5, config):
    """Count variant reads for multiple sets of variants across cells."""
    workflow = pypeliner.workflow.Workflow()

    # dict_keys is not picklable; pass a concrete list of cell ids.
    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(
        name='merge_snvs',
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        args=(
            [mgd.InputFile(vcf) for vcf in vcfs],
            mgd.TempOutputFile('all.snv.vcf'),
        ),
    )

    workflow.transform(
        name='finalise_snvs',
        func='biowrappers.components.io.vcf.tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('all.snv.vcf'),
            mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi']),
        ),
        kwargs={
            'docker_config': helpers.get_container_ctx(config['containers'], 'vcftools'),
        },
    )

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            config,
            mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=tumour_cell_bams),
            mgd.TempInputFile('all.snv.vcf.gz'),
            mgd.OutputFile(results_h5),
        ),
        kwargs={
            'docker_config': helpers.get_container_ctx(config['containers'], 'single_cell_pipeline'),
        },
    )

    return workflow

def create_vcf2maf_workflow(vcf_file, maf_file, reference, tumour_id=None, normal_id=None):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='vcf2maf',
        func='wgs.workflows.vcf2maf.tasks.run_vcf2maf',
        args=(
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('maf_file.maf'),
            mgd.TempSpace('vcf2maf_temp'),
            reference,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id,
        },
    )

    workflow.transform(
        name='update_ids',
        func='wgs.workflows.vcf2maf.tasks.update_ids',
        args=(
            mgd.TempInputFile('maf_file.maf'),
            tumour_id,
            normal_id,
            mgd.OutputFile(maf_file),
        ),
    )

    return workflow

def create_workflow_1(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    # Read data into a managed object
    workflow.transform(
        name='read',
        func=read_stuff,
        ret=mgd.TempOutputObj('input_data'),
        args=(mgd.InputFile(input_filename),),
    )

    # Extract a property of the managed object, modify it
    # and store the result in another managed object
    workflow.transform(
        name='do',
        func=do_stuff,
        ret=mgd.TempOutputObj('output_data'),
        args=(mgd.TempInputObj('input_data').prop('some_string'),),
    )

    # Write the object to an output file
    workflow.transform(
        name='write',
        func=write_stuff,
        args=(
            mgd.TempInputObj('output_data'),
            mgd.TempOutputFile('output_file'),
        ),
    )

    # Recursive workflow
    workflow.subworkflow(
        name='sub_workflow_2',
        func=create_workflow_2,
        args=(
            mgd.TempInputFile('output_file'),
            mgd.OutputFile(output_filename),
        ),
    )

    return workflow

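# How these builders are typically executed: pypeliner's documented entry
# point is pypeliner.app.Pypeline, whose run() resolves the managed inputs and
# outputs and schedules the jobs. A minimal driver sketch; mymodule and the
# file names here are placeholders, not part of the original code.
def _run_workflow_1_sketch():
    import mymodule  # hypothetical module defining read_stuff, do_stuff, write_stuff

    pyp = pypeliner.app.Pypeline(modules=(mymodule,), config={'tmpdir': './pipeline_tmp'})

    pyp.run(create_workflow_1('input.txt', 'output.txt'))
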
def _create_download_decompress_workflow(url, local_path, local_download=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.TempOutputObj('url'), value=url)

    workflow.transform(
        name='download',
        ctx={'local': local_download},
        func=tasks.download,
        args=(
            mgd.TempInputObj('url'),
            mgd.TempOutputFile('download'),
        ),
    )

    workflow.transform(
        name='decompress',
        func=tasks.decompress,
        args=(
            mgd.TempInputFile('download'),
            mgd.OutputFile(local_path),
        ),
    )

    return workflow

def create_lumpy_workflow(config, normal_bam, tumour_cell_bams, lumpy_breakpoints_csv, lumpy_breakpoints_evidence, lumpy_breakpoints_bed):
    ctx = {'docker_image': config['docker']['single_cell_pipeline']}

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.subworkflow(
        name='normal_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(
            normal_bam,
            config,
            mgd.TempOutputFile('normal.discordants.sorted.bam'),
            mgd.TempOutputFile('normal.splitters.sorted.bam'),
            mgd.TempOutputFile('hist_normal_formatted.csv'),
            mgd.TempOutputFile('normal_mean_stdev.yaml'),
        ),
    )

    workflow.subworkflow(
        name='tumour_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(
            mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=tumour_cell_bams),
            config,
            mgd.TempOutputFile('tumour.discordants.sorted.bam'),
            mgd.TempOutputFile('tumour.splitters.sorted.bam'),
            mgd.TempOutputFile('hist_tumour_formatted.csv'),
            mgd.TempOutputFile('tumour_mean_stdev.yaml'),
        ),
    )

    workflow.subworkflow(
        name='lumpy',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        func='single_cell.workflows.lumpy.lumpy_calling_workflow',
        args=(
            config,
            mgd.TempInputFile('normal.discordants.sorted.bam'),
            mgd.TempInputFile('normal.splitters.sorted.bam'),
            mgd.TempInputFile('hist_normal_formatted.csv'),
            mgd.TempInputFile('normal_mean_stdev.yaml'),
            mgd.TempInputFile('tumour.discordants.sorted.bam'),
            mgd.TempInputFile('tumour.splitters.sorted.bam'),
            mgd.TempInputFile('hist_tumour_formatted.csv'),
            mgd.TempInputFile('tumour_mean_stdev.yaml'),
            mgd.OutputFile(lumpy_breakpoints_bed),
            mgd.OutputFile(lumpy_breakpoints_csv, extensions=['.yaml']),
            mgd.OutputFile(lumpy_breakpoints_evidence, extensions=['.yaml']),
        ),
    )

    return workflow

def create_fit_model_workflow(experiment_filename, results_filename, config, ref_data_dir, tumour_id=None):
    config = remixt.config.get_sample_config(config, tumour_id)

    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 16})

    # init returns a mapping keyed by init_id; pypeliner splits the returned
    # value into chunks of the 'init_id' axis, over which the fit jobs fan out.
    workflow.transform(
        name='init',
        func=remixt.analysis.pipeline.init,
        ret=mgd.TempOutputObj('init_params', 'init_id'),
        args=(
            mgd.TempOutputFile('init_results'),
            mgd.InputFile(experiment_filename),
            config,
        ),
    )

    workflow.transform(
        name='fit',
        axes=('init_id',),
        func=remixt.analysis.pipeline.fit_task,
        args=(
            mgd.TempOutputFile('fit_results', 'init_id'),
            mgd.InputFile(experiment_filename),
            mgd.TempInputObj('init_params', 'init_id'),
            config,
        ),
    )

    workflow.transform(
        name='collate',
        func=remixt.analysis.pipeline.collate,
        args=(
            mgd.OutputFile(results_filename),
            mgd.InputFile(experiment_filename),
            mgd.TempInputFile('init_results'),
            mgd.TempInputFile('fit_results', 'init_id'),
            config,
        ),
    )

    return workflow

def circos_plot(titan_calls, remixt_calls, sample_id, breakpoints, circos_plot_remixt, circos_plot_titan):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='prep_titan',
        func='wgs_qc_utils.reader.read_titan.make_for_circos',
        ctx=helpers.get_default_ctx(memory=5),
        args=(
            mgd.InputFile(titan_calls),
            mgd.TempOutputFile("titan_prepped"),
        ),
    )

    workflow.transform(
        name='prep_remixt',
        func='wgs_qc_utils.reader.read_remixt.make_for_circos',
        ctx=helpers.get_default_ctx(memory=5),
        args=(
            mgd.InputFile(remixt_calls),
            sample_id,
            mgd.TempOutputFile("remixt_prepped"),
        ),
    )

    workflow.transform(
        name='circos_plot',
        func='wgs.workflows.sample_qc.tasks.circos',
        ctx=helpers.get_default_ctx(memory=5),
        args=(
            mgd.TempInputFile("titan_prepped"),
            mgd.TempInputFile("remixt_prepped"),
            sample_id,
            breakpoints,
            mgd.OutputFile(circos_plot_remixt),
            mgd.OutputFile(circos_plot_titan),
            mgd.TempSpace("circos"),
        ),
    )

    return workflow

def create_museq_workflow(normal_bam, tumour_bam, ref_genome, snv_vcf, config):
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    # normal_bam is a dict of region -> filename; materialise the keys into a
    # list, since dict_keys cannot be pickled under Python 3.
    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bam.keys()),
    )

    workflow.transform(
        name='run_museq',
        ctx=dict(mem=config["memory"]['med'], pool_id=config['pools']['highmem'], **ctx),
        axes=('region',),
        func='single_cell.workflows.mutationseq.tasks.run_museq',
        args=(
            mgd.InputFile('merged_bam', 'region', fnames=tumour_bam),
            mgd.InputFile('normal.split.bam', 'region', fnames=normal_bam),
            mgd.TempOutputFile('museq.vcf', 'region'),
            mgd.TempOutputFile('museq.log', 'region'),
            mgd.InputInstance('region'),
            config,
        ),
        kwargs={'docker_kwargs': helpers.get_container_ctx(config['containers'], 'mutationseq')},
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=config["memory"]['med'], pool_id=config['pools']['standard'], **ctx),
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.TempInputFile('museq.vcf', 'region'),
            mgd.OutputFile(snv_vcf),
        ),
    )

    return workflow

def create_basic_workflow(fastq_file_1, fastq_file_2, out_file, threads=1):
    sandbox = soil.utils.workflow.get_sandbox(['mixcr'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.commandline(
        name='align',
        ctx={'mem': 32, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': threads},
        args=(
            'mixcr', 'align', '-f',
            '-t', threads,
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            mgd.TempOutputFile('alignments.vdjca'),
        ),
    )

    workflow.commandline(
        name='assemble',
        ctx={'mem': 16, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': threads},
        args=(
            'mixcr', 'assemble', '-f',
            '-t', 1,
            mgd.TempInputFile('alignments.vdjca'),
            mgd.TempOutputFile('clones.clns'),
        ),
    )

    workflow.commandline(
        name='export',
        ctx={'mem': 16, 'mem_retry_increment': 8, 'num_retry': 3},
        args=(
            'mixcr', 'exportClones', '-f',
            mgd.TempInputFile('clones.clns'),
            mgd.TempOutputFile('results.tsv'),
        ),
    )

    workflow.commandline(
        name='compress',
        args=(
            'gzip', '-c',
            mgd.TempInputFile('results.tsv'),
            '>',
            mgd.OutputFile(out_file),
        ),
    )

    return workflow

def create_align_workflow(fastq_file_1, fastq_file_2, ref_genome_dir, out_bam_file, add_xs_tag=False, align_threads=1, read_group_info=None, sort_threads=1):
    sandbox = soil.utils.workflow.get_sandbox(['star', 'samtools', 'sambamba'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(
        name='star_align',
        ctx={'mem': 32, 'mem_retry_increment': 16, 'num_retry': 3, 'threads': align_threads},
        func=tasks.align,
        args=(
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            ref_genome_dir,
            mgd.TempOutputFile('aligned.bam'),
            mgd.TempSpace('align_tmp'),
        ),
        kwargs={
            'add_xs_tag': add_xs_tag,
            'read_group_info': read_group_info,
            'threads': align_threads,
        },
    )

    workflow.transform(
        name='sort',
        ctx={'mem': 32, 'mem_retry_increment': 16, 'num_retry': 3, 'threads': sort_threads},
        func=soil.wrappers.sambamba.tasks.sort,
        args=(
            mgd.TempInputFile('aligned.bam'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('sort_tmp'),
        ),
        kwargs={'threads': sort_threads},
    )

    workflow.commandline(
        name='index',
        args=(
            'samtools', 'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ),
    )

    return workflow

def create_svaba_workflow(tumour_bam, normal_bam, svaba_vcf, reference):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='run_svaba',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', ncpus='8', disk=300),
        func='wgs.workflows.svaba.tasks.run_svaba',
        args=(
            mgd.InputFile(tumour_bam),
            mgd.InputFile(normal_bam),
            mgd.TempOutputFile('germline.indel.vcf.gz'),
            mgd.TempOutputFile('germline.sv.vcf.gz'),
            mgd.TempOutputFile('somatic.indel.vcf.gz'),
            mgd.OutputFile(svaba_vcf),
            mgd.TempOutputFile('unfiltered.germline.indel.vcf.gz'),
            mgd.TempOutputFile('unfiltered.germline.sv.vcf.gz'),
            mgd.TempOutputFile('unfiltered.somatic.indel.vcf.gz'),
            mgd.TempOutputFile('unfiltered.somatic.sv.vcf.gz'),
            reference,
            mgd.TempSpace('svaba_tempdir_full'),
        ),
        kwargs={
            'ncores': 8,
        },
    )

    return workflow

def pre_alignment(fastq_r1, fastq_r2, metrics_tar):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="fastqc_r1",
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        func='alignment.workflows.pre_alignment.tasks.run_fastqc',
        args=(
            mgd.InputFile(fastq_r1),
            mgd.TempOutputFile('R1.html'),
            mgd.TempOutputFile('R1.pdf'),
            mgd.TempSpace('fastqc_R1'),
        ),
        kwargs={
            'docker_image': config.containers('fastqc'),
        },
    )

    workflow.transform(
        name="fastqc_r2",
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        func='alignment.workflows.pre_alignment.tasks.run_fastqc',
        args=(
            mgd.InputFile(fastq_r2),
            mgd.TempOutputFile('R2.html'),
            mgd.TempOutputFile('R2.pdf'),
            mgd.TempSpace('fastqc_R2'),
        ),
        kwargs={
            'docker_image': config.containers('fastqc'),
        },
    )

    # Tar the reports from both reads. No 'sample_id' axis is defined in this
    # workflow, so the tar job runs once over the four fastqc outputs.
    workflow.transform(
        name='tar',
        func='alignment.utils.helpers.make_tar_from_files',
        args=(
            mgd.OutputFile(metrics_tar),
            [
                mgd.TempInputFile('R1.html'),
                mgd.TempInputFile('R1.pdf'),
                mgd.TempInputFile('R2.html'),
                mgd.TempInputFile('R2.pdf'),
            ],
            mgd.TempSpace('wgs_metrics'),
        ),
    )

    return workflow

def create_destruct_wrapper_workflow(bam_filenames, output_filename, raw_data_dir, control_id=None, config=None, ref_data_dir=None):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=list(bam_filenames.keys()),
    )

    workflow.subworkflow(
        name='run_destruct',
        func=destruct.workflow.create_destruct_workflow,
        args=(
            mgd.InputFile('bam', 'sample_id', fnames=bam_filenames),
            mgd.TempOutputFile('breakpoint_table'),
            mgd.TempOutputFile('breakpoint_library_table'),
            mgd.TempOutputFile('breakpoint_read_table'),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_data_dir,
        },
    )

    workflow.transform(
        name='post_process',
        func=destruct.benchmark.wrappers.destruct.tasks.destruct_postprocess,
        args=(
            mgd.TempInputFile('breakpoint_table'),
            mgd.TempInputFile('breakpoint_library_table'),
            mgd.OutputFile(output_filename),
        ),
        kwargs={
            'control_id': control_id,
        },
    )

    return workflow