def create_transdecoder_workflow(
        in_gtf_file,
        ref_gtf_file,
        ref_genome_fasta_file,
        out_alignment_gff_file,
        out_cdna_fasta_file,
        out_cds_fasta_file,
        out_protein_fasta_file):

    sandbox = soil.utils.workflow.get_sandbox(['transdecoder'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(
        name='convert_gtf_to_cdna_fasta',
        ctx={'mem': 8, 'mem_retry_increment': 8, 'num_retry': 3},
        func=tasks.convert_gtf_to_cdna_fasta,
        args=(
            mgd.InputFile(in_gtf_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.OutputFile(out_cdna_fasta_file),
        ))

    workflow.transform(
        name='convert_gtf_to_gff',
        ctx={'mem': 8, 'mem_retry_increment': 8, 'num_retry': 3},
        func=tasks.convert_gtf_to_gff_file,
        args=(
            mgd.InputFile(in_gtf_file),
            mgd.TempOutputFile('ref.gff'),
        ))

    workflow.transform(
        name='run_transdecoder',
        ctx={'mem': 8, 'mem_retry_increment': 8, 'num_retry': 3},
        func=tasks.run_transdecoder,
        args=(
            mgd.InputFile(out_cdna_fasta_file),
            mgd.OutputFile(out_cds_fasta_file),
            mgd.OutputFile(out_protein_fasta_file),
            mgd.TempOutputFile('transdecoder.gff'),
            mgd.TempSpace('transdecoder_tmp'),
        ))

    workflow.transform(
        name='build_alignment_gff',
        ctx={'mem': 8, 'mem_retry_increment': 8, 'num_retry': 3},
        func=tasks.build_alignment_gff,
        args=(
            mgd.InputFile(out_cdna_fasta_file),
            mgd.TempInputFile('transdecoder.gff'),
            mgd.TempInputFile('ref.gff'),
            mgd.OutputFile(out_alignment_gff_file),
        ))

    return workflow
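
# A minimal usage sketch of the TransDecoder workflow. The file paths are
# hypothetical, and the pypeliner.app.Pypeline invocation follows the generic
# pypeliner execution pattern (an assumption; soil may expose its own CLI
# entry point instead):
#
#     import pypeliner.app
#
#     workflow = create_transdecoder_workflow(
#         'transcripts.gtf', 'ref_annotations.gtf', 'genome.fa',
#         'orfs_genome.gff', 'cdna.fa', 'cds.fa', 'proteins.fa')
#
#     pyp = pypeliner.app.Pypeline(config={'tmpdir': 'pipeline_tmp'})
#     pyp.run(workflow)
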
def create_topiary_workflow(
        hla_alleles,
        in_file,
        out_file,
        copy_pyensembl_cache_dir=False,
        iedb_dir=None,
        genome='GRCh37',
        predictor='netmhc',
        pyensembl_cache_dir=None):
    """ Run topiary.

    Parameters
    ----------
    hla_alleles: list
        List of HLA alleles e.g. A*02:01.
    in_file: str
        Path to VCF file with variants.
    out_file: str
        Path where output will be written in tsv format.
    copy_pyensembl_cache_dir: bool
        Whether to copy the pyensembl cache directory before running.
    iedb_dir: str
        Path to a local IEDB predictor installation.
    genome: str
        Reference genome build e.g. GRCh37.
    predictor: str
        MHC binding predictor to use e.g. netmhc.
    pyensembl_cache_dir: str
        Path to the pyensembl cache directory.
    """
    sandbox = soil.utils.workflow.get_sandbox(['topiary'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('raw_hla_alleles'), value=hla_alleles)

    workflow.setobj(obj=mgd.OutputChunks('pep_len'), value=[8, 9, 10, 11])

    workflow.transform(
        name='filter_hla_alleles',
        func=tasks.filter_hla_alleles,
        args=(mgd.TempInputObj('raw_hla_alleles'),),
        kwargs={'iedb_dir': iedb_dir, 'predictor': predictor},
        ret=mgd.TempOutputObj('hla_alleles'))

    workflow.transform(
        name='run_topiary',
        axes=('pep_len',),
        ctx={'mem': 8, 'mem_retry_increment': 4, 'num_retry': 3},
        func=tasks.run_topiary,
        args=(
            mgd.TempInputObj('hla_alleles'),
            mgd.InputFile(in_file),
            mgd.TempOutputFile('raw.tsv', 'pep_len'),
        ),
        kwargs={
            'copy_pyensembl_cache_dir': copy_pyensembl_cache_dir,
            'iedb_dir': iedb_dir,
            'genome': genome,
            'peptide_length': mgd.Template('{pep_len}', 'pep_len'),
            'predictor': predictor,
            'pyensembl_cache_dir': pyensembl_cache_dir,
        })

    workflow.transform(
        name='reformat_output',
        axes=(),
        func=tasks.reformat_output,
        args=(
            mgd.TempInputFile('raw.tsv', 'pep_len'),
            mgd.OutputFile(out_file),
        ))

    return workflow
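
# Allele names are filtered per predictor before fan-out, and topiary runs
# once per peptide length (8-11) on the 'pep_len' axis before the per-length
# outputs are merged. A hedged example call, with hypothetical paths and
# assuming a local IEDB install for the chosen predictor:
#
#     workflow = create_topiary_workflow(
#         ['A*02:01', 'B*07:02'],
#         'somatic.vcf',
#         'epitopes.tsv',
#         iedb_dir='/refs/iedb',
#         genome='GRCh37',
#         predictor='netmhc')
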
def create_multiple_lane_align_workflow(
        fastq_files_1,
        fastq_files_2,
        ref_genome_dir,
        out_bam_file,
        add_xs_tag=False,
        align_threads=1,
        merge_threads=1,
        read_group_info=None,
        sort_threads=1):

    if read_group_info is None:
        read_group_info = {}

        for key in fastq_files_1:
            read_group_info[key] = None

    sandbox = soil.utils.workflow.get_sandbox(['sambamba', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('read_group_info', 'lane'), value=read_group_info)

    workflow.subworkflow(
        name='align',
        axes=('lane',),
        func=create_align_workflow,
        args=(
            mgd.InputFile('R1.fq.gz', 'lane', fnames=fastq_files_1),
            mgd.InputFile('R2.fq.gz', 'lane', fnames=fastq_files_2),
            ref_genome_dir,
            mgd.TempOutputFile('lane.bam', 'lane'),
        ),
        kwargs={
            'add_xs_tag': add_xs_tag,
            'align_threads': align_threads,
            'read_group_info': mgd.TempInputObj('read_group_info', 'lane'),
            'sort_threads': sort_threads,
        })

    workflow.transform(
        name='markdups_and_merge',
        axes=(),
        ctx={'mem': 24, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': merge_threads},
        func=soil.wrappers.sambamba.tasks.markdups,
        args=(
            mgd.TempInputFile('lane.bam', 'lane'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('markdup_tmp'),
        ),
        kwargs={'threads': merge_threads})

    workflow.commandline(
        name='index',
        args=(
            'samtools', 'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ))

    return workflow
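
# The fastq inputs are dicts keyed by lane, and read_group_info (when given)
# must share those keys; a missing read_group_info defaults every lane to
# None. The lane ids and read group fields below are hypothetical, and the
# exact read group schema is defined by create_align_workflow, so this is
# only a sketch:
#
#     fastq_files_1 = {'L001': 'lane1_R1.fq.gz', 'L002': 'lane2_R1.fq.gz'}
#     fastq_files_2 = {'L001': 'lane1_R2.fq.gz', 'L002': 'lane2_R2.fq.gz'}
#     read_group_info = {
#         'L001': {'ID': 'L001', 'SM': 'sample1'},
#         'L002': {'ID': 'L002', 'SM': 'sample1'},
#     }
#
#     workflow = create_multiple_lane_align_workflow(
#         fastq_files_1, fastq_files_2, '/refs/genome_dir', 'sample1.bam',
#         read_group_info=read_group_info)
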
def create_mutect_paired_workflow(
        normal_bam_file,
        tumour_bam_file,
        ref_genome_fasta_file,
        out_file,
        chromosomes=None,
        normal_name='normal',
        split_size=int(1e7),
        tumour_name='tumour'):

    normal_name = get_sample(normal_bam_file, normal_name)

    tumour_name = get_sample(tumour_bam_file, tumour_name)

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'gatk', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx, default_sandbox=sandbox)

    workflow.setobj(
        obj=mgd.TempOutputObj('config', 'regions'),
        value=soil.utils.genome.get_bam_regions(normal_bam_file, split_size, chromosomes=chromosomes))

    workflow.transform(
        name='run_mutect',
        axes=('regions',),
        ctx=med_mem_ctx,
        func=tasks.run_mutect_paired,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        ),
        kwargs={'normal_name': normal_name, 'tumour_name': tumour_name})

    workflow.transform(
        name='run_mutect_filter',
        axes=('regions',),
        ctx=med_mem_ctx,
        func=tasks.run_filter_mutect,
        args=(
            mgd.TempInputFile('region.vcf', 'regions'),
            mgd.TempOutputFile('flagged.vcf', 'regions'),
        ))

    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('flagged.vcf', 'regions'),
            mgd.TempOutputFile('merged.vcf.gz'),
        ))

    workflow.commandline(
        name='filter_vcf',
        ctx=low_mem_ctx,
        args=(
            'bcftools', 'view',
            '-O', 'z',
            '-f', '.,PASS',
            '-o', mgd.OutputFile(out_file),
            mgd.TempInputFile('merged.vcf.gz'),
        ))

    return workflow
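
# Calling is scattered over genomic regions of roughly split_size bases and
# the per-region VCFs are concatenated afterwards, so split_size trades
# scheduling overhead against per-job runtime. A hedged example restricted to
# two chromosomes (paths hypothetical):
#
#     workflow = create_mutect_paired_workflow(
#         'normal.bam', 'tumour.bam', 'genome.fa', 'somatic.vcf.gz',
#         chromosomes=['21', '22'], split_size=int(1e7))
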
def create_index_ref_data_workflow(out_dir, cosmic=False, threads=1):
    """ Create index files for references.

    This workflow is extremely compute and memory heavy. It should be run on a cluster with
    large memory nodes available.
    """
    ref_data_paths = soil.ref_data.paths.SoilRefDataPaths(out_dir)

    sandbox = soil.utils.workflow.get_sandbox(['bwa', 'bcftools', 'kallisto', 'picard', 'samtools', 'star'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.commandline(
        name='link_bwa_ref',
        args=(
            'ln',
            mgd.InputFile(ref_data_paths.genome_fasta_file),
            mgd.OutputFile(ref_data_paths.bwa_genome_fasta_file),
        ))

    workflow.transform(
        name='bwa_index_ref_genome',
        ctx={'mem': 8, 'mem_retry_increment': 8, 'num_retry': 3},
        func=soil.wrappers.bwa.tasks.index,
        args=(
            mgd.InputFile(ref_data_paths.bwa_genome_fasta_file),
            mgd.OutputFile(ref_data_paths.bwa_genome_fasta_file + '.bwa_index.done'),
        ))

    workflow.subworkflow(
        name='build_bwa_mappability_file',
        func=tasks.mappability_wrapper,
        args=(
            mgd.InputFile(ref_data_paths.bwa_genome_fasta_file + '.bwa_index.done'),
            mgd.OutputFile(ref_data_paths.genome_bwa_mappability_wig_file),
        ),
        kwargs={'k': 100, 'max_map_qual': 60, 'threads': threads})

    workflow.commandline(
        name='link_star_ref',
        args=(
            'ln',
            mgd.InputFile(ref_data_paths.genome_fasta_file),
            mgd.OutputFile(ref_data_paths.star_genome_fasta_file),
        ))

    workflow.transform(
        name='star_index_ref_genome',
        ctx={'mem': 32, 'mem_retry_increment': 16, 'num_retry': 3, 'threads': threads},
        func=soil.wrappers.star.tasks.index,
        args=(
            mgd.InputFile(ref_data_paths.star_genome_fasta_file),
            mgd.InputFile(ref_data_paths.gene_annotations_gtf_file),
            mgd.OutputFile(ref_data_paths.star_genome_fasta_file + '.star_index.done'),
        ),
        kwargs={'threads': threads})

    workflow.transform(
        name='samtools_index_ref_genome',
        func=soil.wrappers.samtools.tasks.index_fasta,
        args=(
            mgd.InputFile(ref_data_paths.genome_fasta_file),
            mgd.OutputFile(ref_data_paths.genome_fasta_file + '.fai'),
        ))

    workflow.commandline(
        name='build_ref_genome_dict',
        args=(
            'picard', 'CreateSequenceDictionary',
            'R={}'.format(mgd.InputFile(ref_data_paths.genome_fasta_file)),
            'O={}'.format(mgd.OutputFile(os.path.splitext(ref_data_paths.genome_fasta_file)[0] + '.dict')),
        ))

    workflow.transform(
        name='kallisto_index',
        ctx={'mem': 4, 'mem_retry_increment': 4, 'num_retry': 3},
        func=soil.wrappers.kallisto.tasks.build_index,
        args=(
            mgd.InputFile(ref_data_paths.transcriptome_fasta_file),
            mgd.OutputFile(ref_data_paths.kallisto_index_file),
        ),
        kwargs={'kmer_length': 31})

    if cosmic:
        workflow.transform(
            name='index_cosmic',
            func=soil.wrappers.samtools.tasks.index_vcf,
            args=(
                mgd.InputFile(ref_data_paths.cosmic_vcf_file),
                mgd.OutputFile(ref_data_paths.cosmic_vcf_file + '.tbi'),
            ))

    workflow.transform(
        name='index_dbsnp',
        func=soil.wrappers.samtools.tasks.index_vcf,
        args=(
            mgd.InputFile(ref_data_paths.dbsnp_vcf_file),
            mgd.OutputFile(ref_data_paths.dbsnp_vcf_file + '.tbi'),
        ))

    return workflow
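
# This workflow assumes the download workflow below has already populated
# out_dir, since it only derives paths via SoilRefDataPaths(out_dir) and
# never fetches anything itself. A one-line sketch of the intended ordering
# (execution pattern assumed, not soil's CLI):
#
#     workflow = create_index_ref_data_workflow('/refs/soil', cosmic=False, threads=8)
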
def create_download_ref_data_workflow(config, out_dir, cosmic=False, local_download=False):
    """ Download reference files.

    This workflow mainly retrieves files from the internet. There are some light to moderately
    heavy computational tasks as well.
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    ref_data_paths = soil.ref_data.paths.SoilRefDataPaths(out_dir)

    with open(ref_data_paths.config_file, 'w') as fh:
        yaml.dump(config, fh)

    if cosmic:
        cosmic_user = click.prompt('Please enter COSMIC user ID')

        cosmic_password = click.prompt('Please enter COSMIC password', hide_input=True)

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    for key in config:
        if key.endswith('url') or key.endswith('urls'):
            workflow.setobj(obj=mgd.TempOutputObj(key), value=config[key])

    workflow.setobj(obj=mgd.TempOutputObj('snpeff_url'), value=config['snpeff']['url'])

    workflow.subworkflow(
        name='download_ref_gene_annotations',
        func=_create_download_decompress_concat_workflow,
        args=(
            mgd.TempInputObj('ref_gene_annotations_gtf_urls'),
            mgd.OutputFile(ref_data_paths.gene_annotations_gtf_file),
        ),
        kwargs={'local_download': local_download})

    workflow.subworkflow(
        name='download_ref_genome',
        func=_create_download_decompress_concat_workflow,
        args=(
            mgd.TempInputObj('ref_genome_fasta_urls'),
            mgd.TempOutputFile('raw_ref.fasta'),
        ),
        kwargs={'local_download': local_download})

    workflow.transform(
        name='lexsort_ref_genome',
        func=tasks.lex_sort_fasta,
        args=(
            mgd.TempInputFile('raw_ref.fasta'),
            mgd.OutputFile(ref_data_paths.genome_fasta_file),
        ))

    workflow.subworkflow(
        name='download_ref_proteome',
        func=_create_download_decompress_concat_workflow,
        args=(
            mgd.TempInputObj('ref_proteome_fasta_urls'),
            mgd.TempOutputFile('raw_ref_prot.fasta'),
        ),
        kwargs={'local_download': local_download})

    workflow.transform(
        name='filter_bad_proteins',
        func=tasks.filter_bad_proteins,
        args=(
            mgd.TempInputFile('raw_ref_prot.fasta'),
            mgd.OutputFile(ref_data_paths.proteome_fasta_file),
        ))

    workflow.subworkflow(
        name='download_ref_transcriptome',
        func=_create_download_decompress_concat_workflow,
        args=(
            mgd.TempInputObj('ref_transcriptome_fasta_urls'),
            mgd.OutputFile(ref_data_paths.transcriptome_fasta_file),
        ),
        kwargs={'local_download': local_download})

    workflow.transform(
        name='download_dbsnp',
        ctx={'local': local_download},
        func=tasks.download,
        args=(
            mgd.TempInputObj('dbsnp_url'),
            mgd.OutputFile(ref_data_paths.dbsnp_vcf_file),
        ))

    if cosmic:
        workflow.subworkflow(
            name='download_cosmic',
            func=_create_download_cosmic_workflow,
            args=(
                config['cosmic']['ref_genome_version'],
                mgd.OutputFile(ref_data_paths.cosmic_vcf_file),
                cosmic_user,
                cosmic_password,
            ),
            kwargs={'local_download': local_download})

    workflow.transform(
        name='download_snpeff_db',
        ctx={'local': local_download},
        func=tasks.download,
        args=(
            mgd.TempInputObj('snpeff_url'),
            mgd.TempOutputFile('snpeff.zip'),
        ))

    workflow.transform(
        name='unzip_snpeff',
        func=tasks.unzip_file,
        args=(
            mgd.TempInputFile('snpeff.zip'),
            mgd.OutputFile(os.path.join(os.path.dirname(ref_data_paths.snpeff_data_dir), 'done.txt')),
            mgd.TempSpace('snpeff_tmp'),
        ))

    workflow.transform(
        name='download_genetic_map',
        ctx={'local': local_download},
        func=tasks.download,
        args=(
            mgd.TempInputObj('genetic_map_txt_url'),
            mgd.OutputFile(ref_data_paths.genetic_map_file),
        ))

    workflow.subworkflow(
        name='ref_haplotype_panel',
        func=soil.ref_data.haplotype.workflows.create_eagle_ref_data_workflow,
        args=(
            mgd.TempInputObj('haplotype_vcf_template_url'),
            mgd.OutputFile(ref_data_paths.haplotypes_bcf),
        ),
        kwargs={'local_download': local_download})

    workflow.transform(
        name='download_iedb_mhc_one',
        ctx={'local': local_download},
        func=tasks.download,
        args=(
            mgd.TempInputObj('iedb_mhc_one_url'),
            mgd.TempOutputFile('mhc1.tar.gz'),
        ))

    workflow.transform(
        name='extract_iedb_mhc_one',
        func=tasks.extract_tar_file,
        args=(
            mgd.TempInputFile('mhc1.tar.gz'),
            mgd.OutputFile(os.path.join(ref_data_paths.iedb_mhc_one_dir, 'extract.done')),
        ))

    workflow.transform(
        name='config_iedb_mhc_one',
        func=tasks.configure_iedb_module,
        args=(
            mgd.InputFile(os.path.join(ref_data_paths.iedb_mhc_one_dir, 'extract.done')),
            mgd.OutputFile(os.path.join(ref_data_paths.iedb_mhc_one_dir, 'configure.done')),
        ))

    workflow.transform(
        name='download_vep_cache',
        ctx={'local': local_download},
        func=tasks.download,
        args=(
            mgd.TempInputObj('vep_cache_url'),
            mgd.TempOutputFile('vep.tar.gz'),
        ))

    workflow.transform(
        name='extract_vep_cache',
        func=tasks.extract_tar_file,
        args=(
            mgd.TempInputFile('vep.tar.gz'),
            mgd.OutputFile(os.path.join(ref_data_paths.vep_cache_dir, 'homo_sapiens', 'extract.done')),
        ))

    workflow.subworkflow(
        name='download_vep_plugins',
        func=_create_download_vep_plugins_workflow,
        args=(
            mgd.TempInputObj('vep_plugins_urls'),
            ref_data_paths.vep_plugins_dir,
        ),
        kwargs={'local_download': local_download})

    workflow.setobj(obj=mgd.TempOutputObj('pyensembl_version'), value=config['pyensembl']['version'])

    workflow.transform(
        name='download_pyensembl_cache',
        ctx={'local': local_download},
        func=tasks.download_pyensembl_cache,
        args=(
            mgd.TempInputObj('pyensembl_version'),
            mgd.OutputFile(os.path.join(ref_data_paths.pyensembl_cache_dir, 'download.done')),
        ),
        sandbox=soil.utils.workflow.get_sandbox(['pyensembl']))

    return workflow
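
# The config mapping drives most downloads: every top-level key ending in
# 'url'/'urls' is registered as a temp object, plus the nested 'snpeff',
# 'pyensembl' and (optionally) 'cosmic' sections referenced above. A hedged
# sketch of the expected shape, with placeholder URLs and versions:
#
#     config = {
#         'ref_gene_annotations_gtf_urls': ['http://example.org/anno.gtf.gz'],
#         'ref_genome_fasta_urls': ['http://example.org/genome.fa.gz'],
#         'ref_proteome_fasta_urls': ['http://example.org/proteome.fa.gz'],
#         'ref_transcriptome_fasta_urls': ['http://example.org/tx.fa.gz'],
#         'dbsnp_url': 'http://example.org/dbsnp.vcf.gz',
#         'genetic_map_txt_url': 'http://example.org/genetic_map.txt',
#         'haplotype_vcf_template_url': 'http://example.org/chr{chrom}.vcf.gz',
#         'iedb_mhc_one_url': 'http://example.org/mhc1.tar.gz',
#         'vep_cache_url': 'http://example.org/vep.tar.gz',
#         'vep_plugins_urls': ['http://example.org/plugin.pm'],
#         'snpeff': {'url': 'http://example.org/snpeff_db.zip'},
#         'pyensembl': {'version': 75},
#         'cosmic': {'ref_genome_version': 'GRCh37'},
#     }
#
#     workflow = create_download_ref_data_workflow(config, '/refs/soil')
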
def create_vardict_paired_workflow(
        normal_bam_file,
        tumour_bam_file,
        ref_genome_fasta_file,
        out_file,
        chromosomes=None,
        split_size=int(5e6)):

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools', 'vardict', 'vardict-java'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx, default_sandbox=sandbox)

    workflow.setobj(
        obj=mgd.TempOutputObj('config', 'regions'),
        value=soil.utils.genome.get_bam_regions(normal_bam_file, split_size, chromosomes=chromosomes))

    workflow.transform(
        name='run_vardict',
        axes=('regions',),
        ctx=med_mem_ctx,
        func=tasks.run_vardict_paired,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempOutputFile('call.tsv', 'regions'),
        ))

    workflow.transform(
        name='test_somatic',
        axes=('regions',),
        func=tasks.run_test_somatic,
        args=(
            mgd.TempInputFile('call.tsv', 'regions'),
            mgd.TempOutputFile('somatic.tsv', 'regions'),
        ))

    workflow.transform(
        name='write_vcf',
        axes=('regions',),
        func=tasks.run_build_paired_vcf,
        args=(
            mgd.TempInputFile('somatic.tsv', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        ))

    workflow.commandline(
        name='compress_vcf',
        axes=('regions',),
        args=(
            'bcftools', 'view',
            '-O', 'z',
            '-o', mgd.TempOutputFile('region.vcf.gz', 'regions'),
            mgd.TempInputFile('region.vcf', 'regions'),
        ))

    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.TempOutputFile('merged.vcf.gz'),
        ))

    workflow.commandline(
        name='filter_vcf',
        args=(
            'bcftools', 'view',
            '-O', 'z',
            '-f', '.,PASS',
            '-o', mgd.TempOutputFile('filtered.vcf.gz'),
            mgd.TempInputFile('merged.vcf.gz'),
        ))

    workflow.commandline(
        name='filter_somatics',
        args=(
            'bcftools', 'filter',
            '-i', 'INFO/STATUS[0]="StrongSomatic"',
            '-O', 'z',
            '-o', mgd.OutputFile(out_file),
            mgd.TempInputFile('filtered.vcf.gz'),
        ))

    return workflow
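
# Note the two-stage filtering at the end: 'filter_vcf' keeps only PASS (or
# unfiltered) records, then 'filter_somatics' keeps calls VarDict labels
# StrongSomatic in INFO/STATUS. A hedged example call (paths hypothetical):
#
#     workflow = create_vardict_paired_workflow(
#         'normal.bam', 'tumour.bam', 'genome.fa', 'somatic.vcf.gz',
#         chromosomes=['1'], split_size=int(5e6))
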
def create_mappability_workflow(
        ref_genome_fasta_file,
        out_file,
        k=100,
        max_map_qual=None,
        split_size=int(1e7),
        threads=1):

    sandbox = soil.utils.workflow.get_sandbox(['bwa', 'samtools', 'ucsc-bedgraphtobigwig'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(
        name='split_fasta_by_chrom',
        func=tasks.split_fasta_by_chrom,
        args=(
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('chrom.fasta', 'chrom'),
        ))

    workflow.transform(
        name='create_kmer_reads',
        axes=('chrom',),
        ctx={'mem': 4, 'mem_retry_increment': 2, 'num_retry': 3},
        func=tasks.create_kmer_reads,
        args=(
            mgd.TempInputFile('chrom.fasta', 'chrom'),
            mgd.TempOutputFile('reads.fa', 'chrom', 'kmer_group'),
        ),
        kwargs={'k': k, 'split_size': split_size})

    workflow.transform(
        name='align_kmers',
        axes=('chrom', 'kmer_group'),
        ctx={'mem': 8, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': threads},
        func=tasks.bwa_mem_align,
        args=(
            mgd.TempInputFile('reads.fa', 'chrom', 'kmer_group'),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('aligned.bam', 'chrom', 'kmer_group'),
        ),
        kwargs={'threads': threads})

    workflow.transform(
        name='compute_mappability',
        axes=('chrom', 'kmer_group'),
        ctx={'mem': 4, 'mem_retry_increment': 2, 'num_retry': 3},
        func=tasks.compute_mappability,
        args=(
            mgd.TempInputFile('aligned.bam', 'chrom', 'kmer_group'),
            mgd.TempOutputFile('mappability.tsv', 'chrom', 'kmer_group'),
        ),
        kwargs={'max_map_qual': max_map_qual})

    workflow.transform(
        name='compute_mappability_segs',
        axes=('chrom', 'kmer_group'),
        ctx={'mem': 4, 'mem_retry_increment': 2, 'num_retry': 3},
        func=tasks.compute_mappability_segs,
        args=(
            mgd.TempInputFile('mappability.tsv', 'chrom', 'kmer_group'),
            mgd.TempOutputFile('mappability_segs.tsv', 'chrom', 'kmer_group'),
        ))

    workflow.transform(
        name='compute_chrom_mean_mappability',
        axes=('chrom',),
        ctx={'mem': 8, 'mem_retry_increment': 8, 'num_retry': 3},
        func=tasks.compute_chrom_mean_mappability,
        args=(
            mgd.TempInputFile('mappability_segs.tsv', 'chrom', 'kmer_group'),
            mgd.TempOutputFile('mean_mappability.tsv', 'chrom'),
        ))

    workflow.transform(
        name='write_bed',
        ctx={'mem': 8, 'mem_retry_increment': 8, 'num_retry': 3},
        func=tasks.write_bed,
        args=(
            mgd.TempInputFile('mean_mappability.tsv', 'chrom'),
            mgd.TempOutputFile('mean_mappability.bed'),
        ))

    workflow.transform(
        name='write_chrom_sizes',
        func=tasks.write_chrom_sizes,
        args=(
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('chrom_sizes.txt'),
        ))

    workflow.commandline(
        name='write_big_wig',
        args=(
            'bedGraphToBigWig',
            mgd.TempInputFile('mean_mappability.bed'),
            mgd.TempInputFile('chrom_sizes.txt'),
            mgd.OutputFile(out_file),
        ))

    return workflow
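
# The fan-out here is two-level: per chromosome, then per kmer_group chunk of
# split_size k-mer reads, before results are reduced back to a single bigWig.
# A hedged example producing 100-mer mappability, mirroring the parameters the
# index workflow above passes to its mappability subworkflow (output path
# hypothetical):
#
#     workflow = create_mappability_workflow(
#         'genome.fa', 'mappability.bw', k=100, max_map_qual=60, threads=8)
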