def _create_download_decompress_concat_workflow(urls, out_file, local_download=False):
    """Download and decompress each URL, then concatenate the pieces into out_file."""
    workflow = pypeliner.workflow.Workflow()

    # One managed temp file per URL to hold the decompressed download.
    local_files = [mgd.TempFile('file_{}'.format(idx)) for idx in range(len(urls))]

    for idx, url in enumerate(urls):
        workflow.setobj(mgd.TempOutputObj('url_{}'.format(idx)), value=url)

        workflow.subworkflow(
            name='download_file_{}'.format(idx),
            func=_create_download_decompress_workflow,
            args=(
                mgd.TempInputObj('url_{}'.format(idx)),
                local_files[idx].as_output(),
            ),
            kwargs={'local_download': local_download},
        )

    # Shell concatenation of all downloaded pieces into the final output.
    concat_args = ['cat']
    concat_args.extend(piece.as_input() for piece in local_files)
    concat_args.extend(['>', mgd.OutputFile(out_file)])

    workflow.commandline(name='concat', args=concat_args)

    return workflow
def create_db_workflow(in_file, ref_proteome_fasta_file, out_file, genome_version='GRCh37', pyensembl_cache_dir=None):
    """Build a FASTA database combining the reference proteome with variant sequences.

    :param in_file: path of input variant file.
    :param ref_proteome_fasta_file: path of reference proteome FASTA file.
    :param out_file: path where the combined database FASTA will be written.
    :param genome_version: genome version passed through to the variant table builder.
    :param pyensembl_cache_dir: optional pyensembl cache directory.
    """
    sandbox = pypeliner.sandbox.CondaSandbox(pip_packages=['varcode'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(
        name='clean_ref_fasta',
        func=tasks.clean_ref_proteome_ids,
        args=(
            mgd.InputFile(ref_proteome_fasta_file),
            mgd.TempOutputFile('ref.fasta'),
        ),
    )

    workflow.transform(
        name='build_variant_table',
        func=tasks.build_variant_table,
        args=(
            mgd.InputFile(in_file),
            mgd.TempOutputFile('variant_table.tsv.gz'),
        ),
        kwargs={
            'genome_version': genome_version,
            'pyensembl_cache_dir': pyensembl_cache_dir,
        },
    )

    workflow.transform(
        name='build_variant_fasta',
        func=tasks.build_variant_fasta,
        args=(
            mgd.TempInputFile('variant_table.tsv.gz'),
            mgd.TempOutputFile('var.fasta'),
        ),
    )

    # Final database is the cleaned reference followed by the variant sequences.
    workflow.commandline(
        name='build_db',
        args=(
            'cat',
            mgd.TempInputFile('ref.fasta'),
            mgd.TempInputFile('var.fasta'),
            '>',
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
def create_optitype_workflow(bam_file, hla_type_file, is_rna=False, threads=1):
    """HLA typing with OptiType using reads mapped to chromosome 6."""
    # Reference builds differ in chromosome naming ('chr6' vs '6').
    chrom_str = 'chr6' if check_chr_prefix(bam_file) else '6'

    sandbox = soil.utils.workflow.get_sandbox(['optitype', 'razers3', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # Pull properly paired (-f 2), mapped (-F 4) chr6 reads and convert to FASTQ.
    workflow.commandline(
        name='extract_chr6',
        args=(
            'samtools', 'view', '-bh', '-f', '2', '-F', '4',
            mgd.InputFile(bam_file), chrom_str,
            '|',
            'samtools', 'collate', '-O', '-', mgd.TempSpace('chr6_collate_temp'),
            '|',
            'samtools', 'bam2fq',
            '-1', mgd.TempOutputFile('chr6_reads_1.fq'),
            '-2', mgd.TempOutputFile('chr6_reads_2.fq'),
            '-',
        ),
    )

    workflow.transform(
        name='optitype',
        ctx={'mem': 24, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': threads},
        func=tasks.run_optitype,
        args=(
            mgd.TempInputFile('chr6_reads_1.fq'),
            mgd.TempInputFile('chr6_reads_2.fq'),
            mgd.OutputFile(hla_type_file),
            mgd.TempSpace('optitype_temp'),
        ),
        kwargs={
            'is_rna': is_rna,
            'threads': threads,
        },
    )

    return workflow
def create_align_workflow(fastq_file_1, fastq_file_2, ref_genome_dir, out_bam_file, add_xs_tag=False, align_threads=1, read_group_info=None, sort_threads=1):
    """Align paired-end FASTQs with STAR, sort the BAM and index it.

    :param fastq_file_1: path of FASTQ file for read 1.
    :param fastq_file_2: path of FASTQ file for read 2.
    :param ref_genome_dir: STAR reference genome directory.
    :param out_bam_file: path where the sorted BAM (and .bai index) will be written.
    :param add_xs_tag: forwarded to the align task.
    :param align_threads: threads used by STAR alignment.
    :param read_group_info: optional read group info forwarded to the align task.
    :param sort_threads: threads used by sambamba sort.
    """
    sandbox = soil.utils.workflow.get_sandbox(['star', 'samtools', 'sambamba'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(
        name='star_align',
        ctx={'mem': 32, 'mem_retry_increment': 16, 'num_retry': 3, 'threads': align_threads},
        func=tasks.align,
        args=(
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            ref_genome_dir,
            mgd.TempOutputFile('aligned.bam'),
            mgd.TempSpace('align_tmp'),
        ),
        kwargs={
            'add_xs_tag': add_xs_tag,
            'read_group_info': read_group_info,
            'threads': align_threads,
        })

    workflow.transform(
        name='sort',
        ctx={'mem': 32, 'mem_retry_increment': 16, 'num_retry': 3, 'threads': sort_threads},
        func=soil.wrappers.sambamba.tasks.sort,
        args=(
            mgd.TempInputFile('aligned.bam'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('sort_tmp'),
        ),
        kwargs={'threads': sort_threads})

    workflow.commandline(
        name='index',
        args=(
            'samtools', 'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ))

    return workflow
def create_pileup2snp_workflow(bam_file, ref_genome_fasta_file, out_file, chromosomes=None, split_size=int(1e7)):
    """Call SNPs with VarScan mpileup2snp, parallelised over genome regions.

    :param bam_file: path of input BAM file.
    :param ref_genome_fasta_file: path of reference genome FASTA.
    :param out_file: path where the merged compressed VCF will be written.
    :param chromosomes: optional subset of chromosomes to process.
    :param split_size: approximate region size (bases) per parallel job.
    """
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools', 'varscan'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx, default_sandbox=sandbox)

    # Each element of the 'regions' axis is one genome chunk to process in parallel.
    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=soil.utils.genome.get_bam_regions(bam_file, split_size, chromosomes=chromosomes)
    )

    workflow.commandline(
        name='run_mpileup',
        axes=('regions',),
        args=(
            'samtools', 'mpileup',
            '-f', mgd.InputFile(ref_genome_fasta_file),
            '-o', mgd.TempOutputFile('region.mpileup', 'regions'),
            '-r', mgd.TempInputObj('config', 'regions'),
            mgd.InputFile(bam_file),
        )
    )

    # VarScan needs more memory than the samtools steps.
    workflow.transform(
        name='run_mpileup2snp',
        axes=('regions',),
        ctx=med_mem_ctx,
        func=tasks.mpileup2snp,
        args=(
            mgd.TempInputFile('region.mpileup', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        )
    )

    workflow.transform(
        name='compress',
        axes=('regions',),
        func=soil.wrappers.samtools.tasks.compress_vcf,
        args=(
            mgd.TempInputFile('region.vcf', 'regions'),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
        ),
    )

    # Merge the per-region VCFs into the final output.
    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
def create_eagle_ref_data_workflow(vcf_url_template, out_file, local_download=False):
    """Download per-chromosome reference panel VCFs, rename chromosomes and merge into one BCF.

    :param vcf_url_template: URL template with a ``{chrom}`` placeholder for the NCBI chromosome name.
    :param out_file: path of the merged output file (indexed with .csi alongside).
    :param local_download: if True the download jobs run in the local context.
    """
    chrom_map_file = soil.utils.package_data.load_data_file(
        'ref_data/data/GRCh37/chrom_map.tsv')

    chrom_map = pd.read_csv(chrom_map_file, sep='\t')

    # Restrict to autosomes: NCBI names '1' through '22'.
    chrom_map = chrom_map[chrom_map['ncbi'].isin(
        [str(x) for x in range(1, 23)])]

    chrom_map['url'] = chrom_map['ncbi'].apply(
        lambda x: vcf_url_template.format(chrom=x))

    # Dict keyed by the DataFrame index; keys become the 'chrom' axis values.
    vcf_urls = chrom_map['url'].to_dict()

    sandbox = soil.utils.workflow.get_sandbox(['bcftools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('vcf_url', 'chrom'), value=vcf_urls)

    workflow.transform(
        name='download_vcf_files',
        axes=('chrom', ),
        ctx={'local': local_download},
        func=soil.ref_data.tasks.download,
        args=(mgd.TempInputObj('vcf_url', 'chrom'),
              mgd.TempOutputFile('raw.vcf.gz', 'chrom')))

    # Build the chromosome rename map consumed by `bcftools annotate --rename-chrs`-style renaming.
    workflow.transform(
        name='write_chrom_map',
        func=tasks.write_chrom_map_file,
        args=(mgd.InputFile(chrom_map_file),
              mgd.TempOutputFile('chrom_map.tsv')))

    workflow.transform(
        name='rename_chroms',
        axes=('chrom', ),
        func=soil.wrappers.bcftools.tasks.rename_chroms,
        args=(mgd.TempInputFile('chrom_map.tsv'),
              mgd.TempInputFile('raw.vcf.gz', 'chrom'),
              mgd.TempOutputFile('renamed.bcf', 'chrom')))

    workflow.transform(
        name='concat_vcfs',
        func=soil.wrappers.bcftools.tasks.concatenate_vcf,
        args=(mgd.TempInputFile('renamed.bcf', 'chrom'),
              mgd.OutputFile(out_file)),
        kwargs={'bcf_output': True})

    # NOTE(review): `bcftools index -o` requires a recent bcftools; verify the
    # sandboxed version supports it.
    workflow.commandline(
        name='index',
        args=('bcftools', 'index',
              mgd.InputFile(out_file),
              '-o', mgd.OutputFile(out_file + '.csi')))

    return workflow
def create_align_workflow(fastq_file_1, fastq_file_2, ref_genome_fasta_file, out_bam_file, threads=1):
    """Align paired FASTQs with the BWA subworkflow, mark duplicates and index the BAM."""
    sandbox = soil.utils.workflow.get_sandbox(['sambamba', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.subworkflow(
        name='align',
        func=soil.wrappers.bwa.workflows.create_align_workflow,
        args=(
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('aligned.bam'),
        ),
        kwargs={'align_threads': threads, 'sort_threads': threads},
    )

    workflow.transform(
        name='mark_dups',
        func=soil.wrappers.sambamba.tasks.markdups,
        args=(
            mgd.TempInputFile('aligned.bam'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('mark_dups_tmp'),
        ),
        kwargs={'threads': threads},
    )

    workflow.commandline(
        name='index',
        args=(
            'samtools', 'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ),
    )

    return workflow
def create_basic_workflow(fastq_file_1, fastq_file_2, out_file, threads=1):
    """MiXCR clonotype pipeline: align, assemble, export clones, gzip the table."""
    sandbox = soil.utils.workflow.get_sandbox(['mixcr'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # Per-step resource contexts.
    align_ctx = {'mem': 32, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': threads}
    assemble_ctx = {'mem': 16, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': threads}
    export_ctx = {'mem': 16, 'mem_retry_increment': 8, 'num_retry': 3}

    workflow.commandline(
        name='align',
        ctx=align_ctx,
        args=(
            'mixcr', 'align', '-f', '-t', threads,
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            mgd.TempOutputFile('alignments.vdjca'),
        ),
    )

    # Assembly is run single threaded ('-t', 1) despite the threaded ctx.
    workflow.commandline(
        name='assemble',
        ctx=assemble_ctx,
        args=(
            'mixcr', 'assemble', '-f', '-t', 1,
            mgd.TempInputFile('alignments.vdjca'),
            mgd.TempOutputFile('clones.clns'),
        ),
    )

    workflow.commandline(
        name='export',
        ctx=export_ctx,
        args=(
            'mixcr', 'exportClones', '-f',
            mgd.TempInputFile('clones.clns'),
            mgd.TempOutputFile('results.tsv'),
        ),
    )

    workflow.commandline(
        name='compress',
        args=(
            'gzip', '-c',
            mgd.TempInputFile('results.tsv'),
            '>',
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
def create_somatic_workflow(normal_bam_file, tumour_bam_file, ref_genome_fasta_file, out_file, chromosomes='default', is_exome=False, split_size=int(1e7)):
    """Somatic SNV/indel calling with Strelka, parallelised over genome regions.

    :param normal_bam_file: path of normal sample BAM file.
    :param tumour_bam_file: path of tumour sample BAM file.
    :param ref_genome_fasta_file: path of reference genome FASTA.
    :param out_file: path of the final PASS-filtered compressed VCF (indexed with .tbi).
    :param chromosomes: chromosome selection passed to region/chromosome helpers.
    :param is_exome: forwarded to the Strelka genome-segment caller.
    :param split_size: approximate region size (bases) per parallel job.
    """
    sandbox = soil.utils.workflow.get_sandbox(
        ['bcftools', 'samtools', 'strelka'])

    workflow = pypeliner.workflow.Workflow(default_ctx=med_mem_ctx,
                                           default_sandbox=sandbox)

    # Parallelisation axes: 'regions' for calling, 'chrom_axis' for depth estimation.
    workflow.setobj(obj=mgd.TempOutputObj('config', 'regions'),
                    value=soil.utils.genome.get_bam_regions(
                        normal_bam_file, split_size, chromosomes=chromosomes))

    workflow.setobj(obj=mgd.TempOutputObj('chrom_names', 'chrom_axis'),
                    value=get_chromosomes(normal_bam_file, chromosomes=chromosomes))

    workflow.transform(
        name='count_fasta_bases',
        func=soil.wrappers.strelka.tasks.count_fasta_bases,
        args=(
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    # Runs locally and outside the sandbox; result is a managed object, not a file.
    workflow.transform(
        name='get_genome_size',
        ctx={'local': True},
        func=get_known_genome_size,
        ret=mgd.TempOutputObj('genome_size'),
        args=(
            mgd.InputFile(tumour_bam_file),
            mgd.TempInputFile('ref_base_counts.tsv'),
            chromosomes,
        ),
        sandbox=None,
    )

    workflow.transform(
        name='get_chromosome_depths',
        axes=('chrom_axis', ),
        func=soil.wrappers.strelka.tasks.get_chromosome_depth,
        args=(
            mgd.TempInputObj('chrom_names', 'chrom_axis'),
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('chrom_depth.txt', 'chrom_axis'),
        ),
    )

    workflow.transform(
        name='merge_chromosome_depths',
        func=soil.wrappers.strelka.tasks.merge_chromosome_depth,
        args=(
            mgd.TempInputFile('chrom_depth.txt', 'chrom_axis'),
            mgd.TempOutputFile('chrom_depth_merged.txt'),
        ),
        sandbox=None,
    )

    # Per-region Strelka calling, producing separate indel and SNV VCFs.
    workflow.transform(
        name='call_genome_segment',
        axes=('regions', ),
        func=soil.wrappers.strelka.tasks.call_genome_segment,
        args=(
            mgd.TempInputFile('chrom_depth_merged.txt'),
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('indels.vcf', 'regions'),
            mgd.TempOutputFile('snvs.vcf', 'regions'),
            mgd.TempSpace('call_genome_segment_tmp', 'regions'),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempInputObj('genome_size'),
        ),
        kwargs={
            'is_exome': is_exome,
        })

    workflow.transform(
        name='merge_indels',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('indels.vcf', 'regions'),
            mgd.TempOutputFile('indels.vcf.gz'),
        ),
    )

    workflow.transform(
        name='merge_snvs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('snvs.vcf', 'regions'),
            mgd.TempOutputFile('snvs.vcf.gz'),
        ),
    )

    # Indels and SNVs can overlap positionally, hence allow_overlap.
    workflow.transform(
        name='merge_all',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            [
                mgd.TempInputFile('indels.vcf.gz'),
                mgd.TempInputFile('snvs.vcf.gz')
            ],
            mgd.TempOutputFile('merged.vcf.gz'),
        ),
        kwargs={
            'allow_overlap': True,
        },
    )

    # Keep only records with FILTER '.' or 'PASS'.
    workflow.commandline(
        name='filter_vcf',
        ctx=low_mem_ctx,
        args=(
            'bcftools', 'view',
            '-O', 'z',
            '-f', '.,PASS',
            '-o', mgd.OutputFile(out_file),
            mgd.TempInputFile('merged.vcf.gz'),
        ))

    workflow.transform(
        name='index_vcf',
        ctx=low_mem_ctx,
        func=soil.wrappers.samtools.tasks.index_vcf,
        args=(
            mgd.InputFile(out_file),
            mgd.OutputFile(out_file + '.tbi'),
        ))

    return workflow
def create_ref_panel_phase_workflow(genetic_map_file, ref_file, target_file, out_file):
    """ Run EAGLE using a reference panel. """
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'eagle'])

    workflow = pypeliner.workflow.Workflow(default_ctx=default_ctx, default_sandbox=sandbox)

    # One phasing job per chromosome present in the target file.
    workflow.setobj(
        obj=mgd.TempOutputObj('chrom', 'chrom'),
        value=get_chromosomes(target_file),
    )

    # Split both the reference panel and the target into per-chromosome files.
    for job_name, variant_file, chrom_file in (
            ('split_ref', ref_file, 'ref.bcf'),
            ('split_target', target_file, 'target.bcf')):
        workflow.transform(
            name=job_name,
            axes=('chrom',),
            func=tasks.get_chrom_variant_file,
            args=(
                mgd.TempInputObj('chrom', 'chrom'),
                mgd.InputFile(variant_file),
                mgd.TempOutputFile(chrom_file, 'chrom'),
            ),
        )

    workflow.transform(
        name='run_eagle',
        axes=('chrom',),
        func=tasks.run_eagle,
        args=(
            mgd.InputFile(genetic_map_file),
            mgd.TempInputFile('ref.bcf', 'chrom'),
            mgd.TempInputFile('target.bcf', 'chrom'),
            mgd.TempOutputFile('phased.bcf', 'chrom'),
            mgd.TempSpace('eagle_tmp', 'chrom'),
        ),
    )

    workflow.transform(
        name='concat_results',
        func=tasks.concat_results,
        args=(
            mgd.TempInputFile('phased.bcf', 'chrom'),
            mgd.OutputFile(out_file),
        ),
    )

    workflow.commandline(
        name='index',
        args=(
            'bcftools', 'index', '-t',
            '-o', mgd.OutputFile(out_file + '.tbi'),
            mgd.InputFile(out_file),
        ),
    )

    return workflow
def create_multiple_lane_align_workflow(fastq_files_1, fastq_files_2, ref_genome_dir, out_bam_file, add_xs_tag=False, align_threads=1, merge_threads=1, read_group_info=None, sort_threads=1):
    """Align multiple lanes of paired FASTQs, then merge with duplicate marking and index.

    :param fastq_files_1: dict mapping lane key -> read 1 FASTQ path.
    :param fastq_files_2: dict mapping lane key -> read 2 FASTQ path.
    :param ref_genome_dir: STAR reference genome directory (passed to the per-lane align workflow).
    :param out_bam_file: path of merged, duplicate-marked output BAM (with .bai index).
    :param read_group_info: optional dict mapping lane key -> read group info; defaults to
        None per lane when not supplied.
    """
    if read_group_info is None:
        read_group_info = {}

        # No explicit read groups: one None entry per lane.
        for key in fastq_files_1:
            read_group_info[key] = None

    sandbox = soil.utils.workflow.get_sandbox(['sambamba', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # Lane keys define the 'lane' axis.
    workflow.setobj(obj=mgd.TempOutputObj('read_group_info', 'lane'),
                    value=read_group_info)

    workflow.subworkflow(
        name='align',
        axes=('lane', ),
        func=create_align_workflow,
        args=(
            mgd.InputFile('R1.fq.gz', 'lane', fnames=fastq_files_1),
            mgd.InputFile('R2.fq.gz', 'lane', fnames=fastq_files_2),
            ref_genome_dir,
            mgd.TempOutputFile('lane.bam', 'lane'),
        ),
        kwargs={
            'add_xs_tag': add_xs_tag,
            'align_threads': align_threads,
            'read_group_info': mgd.TempInputObj('read_group_info', 'lane'),
            'sort_threads': sort_threads,
        })

    # axes=() collapses the 'lane' axis: all lane BAMs feed one merge/markdups job.
    workflow.transform(
        name='markdups_and_merge',
        axes=(),
        ctx={'mem': 24, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': merge_threads},
        func=soil.wrappers.sambamba.tasks.markdups,
        args=(
            mgd.TempInputFile('lane.bam', 'lane'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('markdup_tmp'),
        ),
        kwargs={
            'threads': merge_threads,
        })

    workflow.commandline(
        name='index',
        args=(
            'samtools', 'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ))

    return workflow
def create_vardict_paired_workflow(normal_bam_file, tumour_bam_file, ref_genome_fasta_file, out_file, chromosomes=None, split_size=int(5e6)):
    """Paired tumour/normal variant calling with VarDict, keeping strong somatic calls.

    :param normal_bam_file: path of normal sample BAM file.
    :param tumour_bam_file: path of tumour sample BAM file.
    :param ref_genome_fasta_file: path of reference genome FASTA.
    :param out_file: path of the final filtered compressed VCF.
    :param chromosomes: optional subset of chromosomes to process.
    :param split_size: approximate region size (bases) per parallel job.
    """
    sandbox = soil.utils.workflow.get_sandbox(
        ['bcftools', 'samtools', 'vardict', 'vardict-java'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx,
                                           default_sandbox=sandbox)

    workflow.setobj(obj=pypeliner.managed.TempOutputObj('config', 'regions'),
                    value=soil.utils.genome.get_bam_regions(
                        normal_bam_file, split_size, chromosomes=chromosomes))

    workflow.transform(
        name='run_vardict',
        axes=('regions', ),
        ctx=med_mem_ctx,
        func=tasks.run_vardict_paired,
        args=(mgd.InputFile(normal_bam_file),
              mgd.InputFile(tumour_bam_file),
              mgd.InputFile(ref_genome_fasta_file),
              mgd.TempInputObj('config', 'regions'),
              mgd.TempOutputFile('call.tsv', 'regions')))

    # VarDict post-processing: statistical somatic test, then VCF conversion.
    workflow.transform(
        name='test_somatic',
        axes=('regions', ),
        func=tasks.run_test_somatic,
        args=(mgd.TempInputFile('call.tsv', 'regions'),
              mgd.TempOutputFile('somatic.tsv', 'regions')))

    workflow.transform(
        name='write_vcf',
        axes=('regions', ),
        func=tasks.run_build_paired_vcf,
        args=(mgd.TempInputFile('somatic.tsv', 'regions'),
              mgd.TempOutputFile('region.vcf', 'regions')))

    workflow.commandline(
        name='compress_vcf',
        axes=('regions', ),
        args=('bcftools', 'view',
              '-O', 'z',
              '-o', mgd.TempOutputFile('region.vcf.gz', 'regions'),
              mgd.TempInputFile('region.vcf', 'regions')))

    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.TempOutputFile('merged.vcf.gz'),
        ))

    # Keep only records with FILTER '.' or 'PASS'.
    workflow.commandline(
        name='filter_vcf',
        args=(
            'bcftools', 'view',
            '-O', 'z',
            '-f', '.,PASS',
            '-o', mgd.TempOutputFile('filtered.vcf.gz'),
            mgd.TempInputFile('merged.vcf.gz'),
        ))

    # Final filter: keep VarDict 'StrongSomatic' status calls only.
    workflow.commandline(
        name='filter_somatics',
        args=('bcftools', 'filter',
              '-i', 'INFO/STATUS[0]="StrongSomatic"',
              '-O', 'z',
              '-o', mgd.OutputFile(out_file),
              mgd.TempInputFile('filtered.vcf.gz')))

    return workflow
def create_mutect_paired_workflow(normal_bam_file, tumour_bam_file, ref_genome_fasta_file, out_file, chromosomes=None, normal_name='normal', split_size=int(1e7), tumour_name='tumour'):
    """Paired tumour/normal somatic calling with GATK Mutect, parallelised over regions.

    :param normal_bam_file: path of normal sample BAM file.
    :param tumour_bam_file: path of tumour sample BAM file.
    :param ref_genome_fasta_file: path of reference genome FASTA.
    :param out_file: path of the final PASS-filtered compressed VCF.
    :param chromosomes: optional subset of chromosomes to process.
    :param normal_name: fallback normal sample name if not found in the BAM.
    :param split_size: approximate region size (bases) per parallel job.
    :param tumour_name: fallback tumour sample name if not found in the BAM.
    """
    # Prefer sample names recorded in the BAMs over the provided defaults.
    normal_name = get_sample(normal_bam_file, normal_name)

    tumour_name = get_sample(tumour_bam_file, tumour_name)

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'gatk', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx,
                                           default_sandbox=sandbox)

    workflow.setobj(obj=pypeliner.managed.TempOutputObj('config', 'regions'),
                    value=soil.utils.genome.get_bam_regions(
                        normal_bam_file, split_size, chromosomes=chromosomes))

    workflow.transform(
        name='run_mutect',
        axes=('regions', ),
        ctx=med_mem_ctx,
        func=tasks.run_mutect_paired,
        args=(mgd.InputFile(normal_bam_file),
              mgd.InputFile(tumour_bam_file),
              mgd.InputFile(ref_genome_fasta_file),
              mgd.TempInputObj('config', 'regions'),
              mgd.TempOutputFile('region.vcf', 'regions')),
        kwargs={
            'normal_name': normal_name,
            'tumour_name': tumour_name
        })

    workflow.transform(
        name='run_mutect_filter',
        axes=('regions', ),
        ctx=med_mem_ctx,
        func=tasks.run_filter_mutect,
        args=(mgd.TempInputFile('region.vcf', 'regions'),
              mgd.TempOutputFile('flagged.vcf', 'regions')))

    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('flagged.vcf', 'regions'),
            mgd.TempOutputFile('merged.vcf.gz'),
        ))

    # Keep only records with FILTER '.' or 'PASS'.
    workflow.commandline(
        name='filter_vcf',
        ctx=low_mem_ctx,
        args=(
            'bcftools', 'view',
            '-O', 'z',
            '-f', '.,PASS',
            '-o', mgd.OutputFile(out_file),
            mgd.TempInputFile('merged.vcf.gz'),
        ))

    return workflow
def create_index_ref_data_workflow(out_dir, cosmic=False, threads=1):
    """ Create index files for references.

    This workflow is extremely compute and memory heavy. It should be run on a cluster with large memory nodes available.

    :param out_dir: reference data directory (interpreted by SoilRefDataPaths).
    :param cosmic: if True, also index the COSMIC VCF.
    :param threads: thread count for the multi-threaded indexing steps.
    """
    ref_data_paths = soil.ref_data.paths.SoilRefDataPaths(out_dir)

    sandbox = soil.utils.workflow.get_sandbox(
        ['bwa', 'bcftools', 'kallisto', 'picard', 'samtools', 'star'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # Hard-link the genome FASTA so BWA index files live beside their own copy.
    workflow.commandline(
        name='link_bwa_ref',
        args=('ln',
              mgd.InputFile(ref_data_paths.genome_fasta_file),
              mgd.OutputFile(ref_data_paths.bwa_genome_fasta_file)))

    workflow.transform(
        name='bwa_index_ref_genome',
        ctx={'mem': 8, 'mem_retry_increment': 8, 'num_retry': 3},
        func=soil.wrappers.bwa.tasks.index,
        args=(mgd.InputFile(ref_data_paths.bwa_genome_fasta_file),
              mgd.OutputFile(ref_data_paths.bwa_genome_fasta_file + '.bwa_index.done')))

    # Mappability depends on the BWA index, hence the sentinel '.done' input.
    workflow.subworkflow(
        name='build_bwa_mappability_file',
        func=tasks.mappability_wrapper,
        args=(mgd.InputFile(ref_data_paths.bwa_genome_fasta_file + '.bwa_index.done'),
              mgd.OutputFile(ref_data_paths.genome_bwa_mappability_wig_file)),
        kwargs={
            'k': 100,
            'max_map_qual': 60,
            'threads': threads
        })

    # Separate hard link for the STAR index files.
    workflow.commandline(
        name='link_star_ref',
        args=('ln',
              mgd.InputFile(ref_data_paths.genome_fasta_file),
              mgd.OutputFile(ref_data_paths.star_genome_fasta_file)))

    workflow.transform(
        name='star_index_ref_genome',
        ctx={'mem': 32, 'mem_retry_increment': 16, 'num_retry': 3, 'threads': threads},
        func=soil.wrappers.star.tasks.index,
        args=(mgd.InputFile(ref_data_paths.star_genome_fasta_file),
              mgd.InputFile(ref_data_paths.gene_annotations_gtf_file),
              mgd.OutputFile(ref_data_paths.star_genome_fasta_file + '.star_index.done')),
        kwargs={'threads': threads})

    workflow.transform(
        name='samtools_index_ref_genome',
        func=soil.wrappers.samtools.tasks.index_fasta,
        args=(mgd.InputFile(ref_data_paths.genome_fasta_file),
              mgd.OutputFile(ref_data_paths.genome_fasta_file + '.fai')))

    # Picard sequence dictionary (FASTA extension replaced with .dict).
    workflow.commandline(
        name='build_ref_genom_dict',
        args=('picard', 'CreateSequenceDictionary',
              'R={}'.format(mgd.InputFile(ref_data_paths.genome_fasta_file)),
              'O={}'.format(mgd.OutputFile(
                  os.path.splitext(ref_data_paths.genome_fasta_file)[0] + '.dict'))))

    workflow.transform(
        name='kallisto_index',
        ctx={'mem': 4, 'mem_retry_increment': 4, 'num_retry': 3},
        func=soil.wrappers.kallisto.tasks.build_index,
        args=(mgd.InputFile(ref_data_paths.transcriptome_fasta_file),
              mgd.OutputFile(ref_data_paths.kallisto_index_file)),
        kwargs={'kmer_length': 31})

    if cosmic:
        workflow.transform(
            name='index_cosmic',
            func=soil.wrappers.samtools.tasks.index_vcf,
            args=(mgd.InputFile(ref_data_paths.cosmic_vcf_file),
                  mgd.OutputFile(ref_data_paths.cosmic_vcf_file + '.tbi')))

    workflow.transform(
        name='index_dbsnp',
        func=soil.wrappers.samtools.tasks.index_vcf,
        args=(mgd.InputFile(ref_data_paths.dbsnp_vcf_file),
              mgd.OutputFile(ref_data_paths.dbsnp_vcf_file + '.tbi')))

    return workflow
def create_allele_counts_workflow(normal_bam_file, tumour_bam_file, dbsnp_vcf_file, ref_genome_fasta_file, allele_counts_file, chromosomes='autosomes'):
    """Count tumour allele depths at heterozygous germline SNP positions.

    Calls SNPs in the normal with Platypus, keeps known (dbSNP 'rs') heterozygous
    SNPs, then counts tumour reads supporting each allele at those sites.

    :param normal_bam_file: path of normal sample BAM file.
    :param tumour_bam_file: path of tumour sample BAM file.
    :param dbsnp_vcf_file: path of dbSNP VCF used for annotation.
    :param ref_genome_fasta_file: path of reference genome FASTA.
    :param allele_counts_file: path of the merged output counts table.
    :param chromosomes: chromosome selection, resolved via load_bam_chromosome_lengths.
    """
    chromosomes = soil.utils.genome.load_bam_chromosome_lengths(
        normal_bam_file, chromosomes)

    sandbox = soil.utils.workflow.get_sandbox(['snpsift'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.subworkflow(
        name='call_snps',
        func=soil.wrappers.platypus.workflows.create_single_sample_workflow,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('normal.vcf.gz'),
        ),
        kwargs={
            'chromosomes': chromosomes,
            'split_size': int(1e7)
        })

    workflow.commandline(
        name='annotate_dbsnp_status',
        ctx={'mem': 6, 'mem_retry_increment': 4, 'num_retry': 3},
        args=('SnpSift', 'annotate',
              mgd.InputFile(dbsnp_vcf_file),
              mgd.TempInputFile('normal.vcf.gz'),
              '>',
              mgd.TempOutputFile('normal.dbsnp.vcf')))

    workflow.commandline(
        name='annotate_variant_type',
        ctx={'mem': 6, 'mem_retry_increment': 4, 'num_retry': 3},
        args=('SnpSift', 'varType',
              mgd.TempInputFile('normal.dbsnp.vcf'),
              '>',
              mgd.TempOutputFile('normal.dbsnp.vartype.vcf')))

    # Keep heterozygous, dbSNP-known ('rs' IDs) SNPs only.
    workflow.commandline(
        name='filter_het_snps',
        ctx={'mem': 6, 'mem_retry_increment': 4, 'num_retry': 3},
        args=('SnpSift', 'filter',
              "isHet(GEN[0]) & ((exists ID) & ( ID =~ 'rs' )) & (exists SNP)",
              mgd.TempInputFile('normal.dbsnp.vartype.vcf'),
              '>',
              mgd.TempOutputFile('het.snps.vcf')))

    # Split into chunks of ~1e4 records; counting runs per chunk on the 'split' axis.
    workflow.transform(
        name='split_vcf',
        ctx={'mem': 6, 'mem_retry_increment': 4, 'num_retry': 3},
        func=tasks.split_vcf,
        args=(mgd.TempInputFile('het.snps.vcf'),
              mgd.TempOutputFile('split.vcf', 'split'),
              mgd.TempSpace('split_tmp')),
        kwargs={'split_size': int(1e4)})

    workflow.transform(
        name='get_allele_counts',
        axes=('split', ),
        func=tasks.get_snv_allele_counts_for_vcf_targets,
        args=(mgd.InputFile(tumour_bam_file),
              mgd.TempInputFile('split.vcf', 'split'),
              mgd.TempOutputFile('split.tsv', 'split')))

    workflow.transform(
        name='merge_counts',
        func=tasks.merge_counts,
        args=(mgd.TempInputFile('split.tsv', 'split'),
              mgd.OutputFile(allele_counts_file)))

    return workflow
def create_rnaseq_workflow(fastq_file_1, fastq_file_2, out_file, threads=1):
    """MiXCR clonotype pipeline for RNA-seq data, with partial-alignment rescue."""
    sandbox = soil.utils.workflow.get_sandbox(['mixcr'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # Per-step resource contexts.
    align_ctx = {'mem': 32, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': threads}
    rescue_ctx = {'mem': 16, 'mem_retry_increment': 8, 'num_retry': 3}
    assemble_ctx = {'mem': 16, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': threads}

    workflow.commandline(
        name='align',
        ctx=align_ctx,
        args=(
            'mixcr', 'align', '-p', 'rna-seq', '-s', 'hsa',
            '-OallowPartialAlignments=true', '-f', '-t', threads,
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            mgd.TempOutputFile('alignments.vdjca'),
        ),
    )

    # Two rounds of partial-alignment rescue, as recommended for RNA-seq input.
    workflow.commandline(
        name='assemblePartial_1',
        ctx=rescue_ctx,
        args=(
            'mixcr', 'assemblePartial', '-f',
            mgd.TempInputFile('alignments.vdjca'),
            mgd.TempOutputFile('alignments_rescued_1.vdjca'),
        ),
    )

    workflow.commandline(
        name='assemblePartial_2',
        ctx=rescue_ctx,
        args=(
            'mixcr', 'assemblePartial', '-f',
            mgd.TempInputFile('alignments_rescued_1.vdjca'),
            mgd.TempOutputFile('alignments_rescued_2.vdjca'),
        ),
    )

    workflow.commandline(
        name='extendAlignments',
        ctx=rescue_ctx,
        args=(
            'mixcr', 'extendAlignments', '-f',
            mgd.TempInputFile('alignments_rescued_2.vdjca'),
            mgd.TempOutputFile('alignments_rescued_2_extended.vdjca'),
        ),
    )

    workflow.commandline(
        name='assemble',
        ctx=assemble_ctx,
        args=(
            'mixcr', 'assemble', '-f', '-t', threads,
            mgd.TempInputFile('alignments_rescued_2_extended.vdjca'),
            mgd.TempOutputFile('clones.clns'),
        ),
    )

    workflow.commandline(
        name='export',
        ctx=rescue_ctx,
        args=(
            'mixcr', 'exportClones', '-f',
            mgd.TempInputFile('clones.clns'),
            mgd.TempOutputFile('results.tsv'),
        ),
    )

    workflow.commandline(
        name='compress',
        args=(
            'gzip', '-c',
            mgd.TempInputFile('results.tsv'),
            '>',
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
def create_titan_workflow(normal_bam_file, tumour_bam_file, dbsnp_vcf_file, mappability_file, ref_genome_fasta_file, out_file, exome_bed_file=None, sample='Tumour', threads=1):
    """Run TITAN copy-number analysis across a sweep of initialisation parameters.

    :param normal_bam_file: path of normal sample BAM file.
    :param tumour_bam_file: path of tumour sample BAM file.
    :param dbsnp_vcf_file: path of dbSNP VCF (for allele counting).
    :param mappability_file: path of mappability track input to mapCounter.
    :param ref_genome_fasta_file: path of reference genome FASTA.
    :param out_file: path of the final results archive/file.
    :param exome_bed_file: optional exome targets BED; presence switches TITAN to exome mode.
    :param sample: sample name passed to TITAN.
    :param threads: thread count for the TITAN runs.
    """
    sandbox = soil.utils.workflow.get_sandbox(
        ['hmmcopy', 'hmmcopy_utils', 'titan'])

    sandbox.channels.append('conda-forge')

    sandbox.packages.extend(['pandas', 'rpy2'])

    # Analysis is restricted to autosomes throughout.
    chromosomes = soil.utils.genome.load_bam_chromosome_lengths(
        normal_bam_file, 'autosomes')

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # One TITAN run per initialisation parameter set ('param_idx' axis).
    workflow.setobj(obj=mgd.TempOutputObj('init_params', 'param_idx'),
                    value=tasks.create_intialization_parameters())

    workflow.subworkflow(
        name='get_allele_counts',
        func=create_allele_counts_workflow,
        args=(mgd.InputFile(normal_bam_file),
              mgd.InputFile(tumour_bam_file),
              mgd.InputFile(dbsnp_vcf_file),
              mgd.InputFile(ref_genome_fasta_file),
              mgd.TempOutputFile('allele_counts.tsv')),
        kwargs={'chromosomes': 'autosomes'})

    # HMMcopy utility tracks: read depth for both samples, GC content, mappability.
    workflow.commandline(
        name='build_normal_wig',
        args=('readCounter',
              '-c', ','.join(chromosomes),
              mgd.InputFile(normal_bam_file),
              '>',
              mgd.TempOutputFile('normal.wig')))

    workflow.commandline(
        name='build_tumour_wig',
        args=('readCounter',
              '-c', ','.join(chromosomes),
              mgd.InputFile(tumour_bam_file),
              '>',
              mgd.TempOutputFile('tumour.wig')))

    workflow.commandline(
        name='build_gc_wig',
        args=('gcCounter',
              '-c', ','.join(chromosomes),
              mgd.InputFile(ref_genome_fasta_file),
              '>',
              mgd.TempOutputFile('gc.wig')))

    workflow.commandline(
        name='build_mappability_wig',
        args=('mapCounter',
              '-c', ','.join(chromosomes),
              mgd.InputFile(mappability_file),
              '>',
              mgd.TempOutputFile('mappability.wig')))

    workflow.transform(
        name='build_coverage_file',
        func=tasks.build_coverage_file,
        args=(mgd.TempInputFile('normal.wig'),
              mgd.TempInputFile('tumour.wig'),
              mgd.TempInputFile('gc.wig'),
              mgd.TempInputFile('mappability.wig'),
              mgd.TempOutputFile('coverage.wig')),
        kwargs={'target_file': exome_bed_file})

    workflow.transform(
        name='run_titan',
        axes=('param_idx', ),
        ctx={'mem': 8, 'mem_retry_increment': 4, 'num_retry': 3, 'threads': threads},
        func=tasks.run_titan,
        args=(mgd.TempInputFile('coverage.wig'),
              mgd.TempInputFile('allele_counts.tsv'),
              mgd.TempInputObj('init_params', 'param_idx'),
              mgd.TempOutputFile('run.tar.gz', 'param_idx'),
              mgd.TempSpace('titan_tmp', 'param_idx')),
        kwargs={
            'is_exome': (exome_bed_file is not None),
            'sample': sample,
            'threads': threads
        })

    # Summarise all parameter-sweep runs into one stats table.
    workflow.transform(
        name='build_run_stats_file',
        func=tasks.build_run_stats_file,
        args=(mgd.TempInputFile('run.tar.gz', 'param_idx'),
              mgd.TempInputObj('init_params', 'param_idx'),
              mgd.TempOutputFile('stats.tsv')))

    workflow.transform(
        name='build_output',
        func=tasks.build_final_results_file,
        args=(mgd.TempInputFile('coverage.wig'),
              mgd.TempInputFile('allele_counts.tsv'),
              mgd.TempInputFile('run.tar.gz', 'param_idx'),
              mgd.TempInputFile('stats.tsv'),
              mgd.OutputFile(out_file),
              mgd.TempSpace('build_results')))

    return workflow
def create_mappability_workflow(
        ref_genome_fasta_file,
        out_file,
        k=100,
        max_map_qual=None,
        split_size=int(1e7),
        threads=1):
    """Compute a genome mappability track by realigning k-mer reads, output as BigWig.

    :param ref_genome_fasta_file: path of reference genome FASTA.
    :param out_file: path of the output BigWig mappability file.
    :param k: k-mer read length used to probe mappability.
    :param max_map_qual: optional mapping-quality cap forwarded to compute_mappability.
    :param split_size: number of k-mer reads per alignment chunk ('kmer_group' axis).
    :param threads: threads for the BWA alignment jobs.
    """
    sandbox = soil.utils.workflow.get_sandbox(['bwa', 'samtools', 'ucsc-bedgraphtobigwig'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # Work is parallelised per chromosome ('chrom') and per read chunk ('kmer_group').
    workflow.transform(
        name='split_fasta_by_chrom',
        func=tasks.split_fasta_by_chrom,
        args=(
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('chrom.fasta', 'chrom')
        )
    )

    workflow.transform(
        name='create_kmer_reads',
        axes=('chrom',),
        ctx={'mem': 4, 'mem_retry_increment': 2, 'num_retry': 3},
        func=tasks.create_kmer_reads,
        args=(
            mgd.TempInputFile('chrom.fasta', 'chrom'),
            mgd.TempOutputFile('reads.fa', 'chrom', 'kmer_group')
        ),
        kwargs={
            'k': k,
            'split_size': split_size
        }
    )

    workflow.transform(
        name='align_kmers',
        axes=('chrom', 'kmer_group'),
        ctx={'mem': 8, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': threads},
        func=tasks.bwa_mem_align,
        args=(
            mgd.TempInputFile('reads.fa', 'chrom', 'kmer_group'),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('aligned.bam', 'chrom', 'kmer_group')
        ),
        kwargs={
            'threads': threads
        }
    )

    workflow.transform(
        name='compute_mappability',
        axes=('chrom', 'kmer_group'),
        ctx={'mem': 4, 'mem_retry_increment': 2, 'num_retry': 3},
        func=tasks.compute_mappability,
        args=(
            mgd.TempInputFile('aligned.bam', 'chrom', 'kmer_group'),
            mgd.TempOutputFile('mappability.tsv', 'chrom', 'kmer_group')
        ),
        kwargs={
            'max_map_qual': max_map_qual,
        }
    )

    workflow.transform(
        name='compute_mappability_segs',
        axes=('chrom', 'kmer_group'),
        ctx={'mem': 4, 'mem_retry_increment': 2, 'num_retry': 3},
        func=tasks.compute_mappability_segs,
        args=(
            mgd.TempInputFile('mappability.tsv', 'chrom', 'kmer_group'),
            mgd.TempOutputFile('mappability_segs.tsv', 'chrom', 'kmer_group')
        )
    )

    # Collapses the 'kmer_group' axis: one mean-mappability file per chromosome.
    workflow.transform(
        name='compute_chrom_mean_mappability',
        axes=('chrom',),
        ctx={'mem': 8, 'mem_retry_increment': 8, 'num_retry': 3},
        func=tasks.compute_chrom_mean_mappability,
        args=(
            mgd.TempInputFile('mappability_segs.tsv', 'chrom', 'kmer_group'),
            mgd.TempOutputFile('mean_mappability.tsv', 'chrom')
        )
    )

    # Collapses the 'chrom' axis into a single genome-wide BED.
    workflow.transform(
        name='write_bed',
        ctx={'mem': 8, 'mem_retry_increment': 8, 'num_retry': 3},
        func=tasks.write_bed,
        args=(
            mgd.TempInputFile('mean_mappability.tsv', 'chrom'),
            mgd.TempOutputFile('mean_mappability.bed')
        )
    )

    workflow.transform(
        name='write_chrom_sizes',
        func=tasks.write_chrom_sizes,
        args=(
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('chrom_sizes.txt'),
        )
    )

    workflow.commandline(
        name='write_big_wig',
        args=(
            'bedGraphToBigWig',
            mgd.TempInputFile('mean_mappability.bed'),
            mgd.TempInputFile('chrom_sizes.txt'),
            mgd.OutputFile(out_file)
        )
    )

    return workflow