def _create_download_decompress_concat_workflow(urls, out_file, local_download=False):
    """Download several compressed files, decompress each one and concatenate
    the results into a single output file.

    :param urls: iterable of URLs to fetch.
    :param out_file: path the concatenated result is written to.
    :param local_download: if True, run download jobs on the local (submit) host.
    """
    workflow = pypeliner.workflow.Workflow()

    # One managed temp file per URL; each is filled by its own sub-workflow.
    local_files = [mgd.TempFile('file_{}'.format(idx)) for idx in range(len(urls))]

    for idx, url in enumerate(urls):
        workflow.setobj(mgd.TempOutputObj('url_{}'.format(idx)), value=url)

        workflow.subworkflow(
            name='download_file_{}'.format(idx),
            func=_create_download_decompress_workflow,
            args=(
                mgd.TempInputObj('url_{}'.format(idx)),
                local_files[idx].as_output(),
            ),
            kwargs={'local_download': local_download},
        )

    # Join the decompressed pieces with `cat`, using shell output redirection.
    concat_args = ['cat']
    concat_args.extend(f.as_input() for f in local_files)
    concat_args.extend(['>', mgd.OutputFile(out_file)])

    workflow.commandline(name='concat', args=concat_args)

    return workflow
def _create_download_decompress_workflow(url, local_path, local_download=False):
    """Download one compressed file and decompress it to ``local_path``.

    :param url: URL to fetch.
    :param local_path: destination path for the decompressed file.
    :param local_download: if True, run the download on the local (submit) host.
    """
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(mgd.TempOutputObj('url'), value=url)

    # Fetch the remote file into a managed temp location.
    workflow.transform(
        name='download',
        ctx={'local': local_download},
        func=tasks.download,
        args=(
            mgd.TempInputObj('url'),
            mgd.TempOutputFile('download'),
        ),
    )

    # Decompress the downloaded file to its final destination.
    workflow.transform(
        name='decompress',
        func=tasks.decompress,
        args=(
            mgd.TempInputFile('download'),
            mgd.OutputFile(local_path),
        ),
    )

    return workflow
def _create_download_cosmic_workflow(ref_data_version, out_file, user, password, host='sftp-cancer.sanger.ac.uk', local_download=False):
    """Download the COSMIC coding and non-coding VCFs over SFTP and merge them
    into a single bgzipped VCF with a tabix index.

    :param ref_data_version: genome build string (e.g. 'GRCh37'); lower-cased
        into the remote path.
    :param out_file: path for the merged VCF; a '.tbi' index is written alongside.
    :param user: COSMIC SFTP user name.
    :param password: COSMIC SFTP password.
    :param host: COSMIC SFTP host.
    :param local_download: if True, run download jobs on the local (submit) host.
    """
    # NOTE(review): the COSMIC release is pinned to v83 in this path; only the
    # genome build portion comes from ref_data_version.
    host_base_path = '/files/{}/cosmic/v83/VCF'.format(
        ref_data_version.lower())

    coding_host_path = '/'.join([host_base_path, 'CosmicCodingMuts.vcf.gz'])

    non_coding_host_path = '/'.join(
        [host_base_path, 'CosmicNonCodingVariants.vcf.gz'])

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('coding_host_path'),
                    value=coding_host_path)

    workflow.setobj(obj=mgd.TempOutputObj('non_coding_host_path'),
                    value=non_coding_host_path)

    # Fetch the two COSMIC VCFs via independent sub-workflows.
    workflow.subworkflow(name='download_coding',
                         func=_create_download_cosmic_file_subworkflow,
                         args=(
                             host,
                             mgd.TempInputObj('coding_host_path'),
                             user,
                             password,
                             mgd.TempOutputFile('coding.vcf.gz'),
                         ),
                         kwargs={'local_download': local_download})

    workflow.subworkflow(name='download_non_coding',
                         func=_create_download_cosmic_file_subworkflow,
                         args=(
                             host,
                             mgd.TempInputObj('non_coding_host_path'),
                             user,
                             password,
                             mgd.TempOutputFile('non_coding.vcf.gz'),
                         ),
                         kwargs={'local_download': local_download})

    # Merge coding and non-coding variants; overlapping records are allowed
    # and the merged file is indexed.
    workflow.transform(name='merge_files',
                       func=soil.wrappers.samtools.tasks.concatenate_vcf,
                       args=([
                           mgd.TempInputFile('coding.vcf.gz'),
                           mgd.TempInputFile('non_coding.vcf.gz')
                       ], mgd.OutputFile(out_file)),
                       kwargs={
                           'allow_overlap': True,
                           'index_file': mgd.OutputFile(out_file + '.tbi')
                       })

    return workflow
def create_pileup2snp_workflow(bam_file, ref_genome_fasta_file, out_file, chromosomes=None, split_size=int(1e7)):
    """Call SNPs with VarScan mpileup2snp, parallelized over genome regions.

    The BAM is split into ~``split_size`` bp regions; each region is piled up
    with samtools, passed to VarScan, and the per-region VCFs are compressed
    and concatenated into ``out_file``.

    :param bam_file: input BAM to call variants from.
    :param ref_genome_fasta_file: reference genome FASTA.
    :param out_file: path for the final concatenated VCF.
    :param chromosomes: optional chromosome subset; None means all.
    :param split_size: approximate region size in bp for parallelization.
    """
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools', 'varscan'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx, default_sandbox=sandbox)

    # The region list becomes the chunks of the 'regions' axis.
    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=soil.utils.genome.get_bam_regions(bam_file, split_size, chromosomes=chromosomes)
    )

    workflow.commandline(
        name='run_mpileup',
        axes=('regions',),
        args=(
            'samtools', 'mpileup',
            '-f', mgd.InputFile(ref_genome_fasta_file),
            '-o', mgd.TempOutputFile('region.mpileup', 'regions'),
            '-r', mgd.TempInputObj('config', 'regions'),
            mgd.InputFile(bam_file),
        )
    )

    # VarScan needs more memory than the workflow's low-mem default.
    workflow.transform(
        name='run_mpileup2snp',
        axes=('regions',),
        ctx=med_mem_ctx,
        func=tasks.mpileup2snp,
        args=(
            mgd.TempInputFile('region.mpileup', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        )
    )

    workflow.transform(
        name='compress',
        axes=('regions',),
        func=soil.wrappers.samtools.tasks.compress_vcf,
        args=(
            mgd.TempInputFile('region.vcf', 'regions'),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
        ),
    )

    # Merge the per-region compressed VCFs into the final output.
    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
def create_eagle_ref_data_workflow(vcf_url_template, out_file, local_download=False):
    """Build the EAGLE phasing reference panel: download per-chromosome VCFs,
    rename chromosomes via a chromosome map, and concatenate into one BCF.

    :param vcf_url_template: URL template with a ``{chrom}`` placeholder.
    :param out_file: path for the final BCF; a '.csi' index is written alongside.
    :param local_download: if True, run download jobs on the local (submit) host.
    """
    chrom_map_file = soil.utils.package_data.load_data_file(
        'ref_data/data/GRCh37/chrom_map.tsv')

    chrom_map = pd.read_csv(chrom_map_file, sep='\t')

    # Keep autosomes only ('1'..'22').
    # NOTE(review): assumes the 'ncbi' column is read as strings (plausible if
    # it also contains 'X'/'Y'); if pandas infers ints this filter drops every
    # row — confirm against the packaged TSV.
    chrom_map = chrom_map[chrom_map['ncbi'].isin(
        [str(x) for x in range(1, 23)])]

    chrom_map['url'] = chrom_map['ncbi'].apply(
        lambda x: vcf_url_template.format(chrom=x))

    # Dict keyed by dataframe index; the keys become the 'chrom' axis chunks.
    vcf_urls = chrom_map['url'].to_dict()

    sandbox = soil.utils.workflow.get_sandbox(['bcftools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('vcf_url', 'chrom'), value=vcf_urls)

    workflow.transform(name='download_vcf_files',
                       axes=('chrom', ),
                       ctx={'local': local_download},
                       func=soil.ref_data.tasks.download,
                       args=(mgd.TempInputObj('vcf_url', 'chrom'),
                             mgd.TempOutputFile('raw.vcf.gz', 'chrom')))

    workflow.transform(name='write_chrom_map',
                       func=tasks.write_chrom_map_file,
                       args=(mgd.InputFile(chrom_map_file),
                             mgd.TempOutputFile('chrom_map.tsv')))

    # Rename chromosomes in each downloaded VCF to the target convention.
    workflow.transform(name='rename_chroms',
                       axes=('chrom', ),
                       func=soil.wrappers.bcftools.tasks.rename_chroms,
                       args=(mgd.TempInputFile('chrom_map.tsv'),
                             mgd.TempInputFile('raw.vcf.gz', 'chrom'),
                             mgd.TempOutputFile('renamed.bcf', 'chrom')))

    workflow.transform(name='concat_vcfs',
                       func=soil.wrappers.bcftools.tasks.concatenate_vcf,
                       args=(mgd.TempInputFile('renamed.bcf', 'chrom'),
                             mgd.OutputFile(out_file)),
                       kwargs={'bcf_output': True})

    workflow.commandline(name='index',
                         args=('bcftools', 'index',
                               mgd.InputFile(out_file),
                               '-o', mgd.OutputFile(out_file + '.csi')))

    return workflow
def _create_download_vep_plugins_workflow(urls, out_dir, local_download=False):
    """Download each VEP plugin file into ``out_dir``, keeping its URL basename.

    :param urls: iterable of plugin URLs.
    :param out_dir: directory the plugin files are written to.
    :param local_download: if True, run downloads on the local (submit) host.
    """
    workflow = pypeliner.workflow.Workflow()

    for idx, url in enumerate(urls):
        # Destination file name mirrors the remote file name.
        dest_file = os.path.join(out_dir, os.path.basename(url))

        workflow.setobj(mgd.TempOutputObj('url_{}'.format(idx)), value=url)

        workflow.transform(
            name='download_file_{}'.format(idx),
            ctx={'local': local_download},
            func=tasks.download,
            args=(
                mgd.TempInputObj('url_{}'.format(idx)),
                mgd.OutputFile(dest_file),
            ),
        )

    return workflow
def create_titan_workflow(normal_bam_file, tumour_bam_file, dbsnp_vcf_file, mappability_file, ref_genome_fasta_file, out_file, exome_bed_file=None, sample='Tumour', threads=1):
    """Run TITAN copy-number/clonality analysis on a tumour/normal pair.

    Builds allele counts and read-depth wig tracks, runs TITAN once per
    initialization-parameter set (the 'param_idx' axis), then merges the runs
    and their statistics into a single results file.

    :param normal_bam_file: normal sample BAM.
    :param tumour_bam_file: tumour sample BAM.
    :param dbsnp_vcf_file: dbSNP VCF used for allele counting.
    :param mappability_file: mappability track passed to mapCounter.
    :param ref_genome_fasta_file: reference genome FASTA.
    :param out_file: path for the final results file.
    :param exome_bed_file: optional exome target BED; enables exome handling.
    :param sample: sample label forwarded to TITAN.
    :param threads: threads per TITAN run.
    """
    sandbox = soil.utils.workflow.get_sandbox(
        ['hmmcopy', 'hmmcopy_utils', 'titan'])

    # Extra channel/packages required by the TITAN tasks beyond the base tools.
    sandbox.channels.append('conda-forge')

    sandbox.packages.extend(['pandas', 'rpy2'])

    # Joined with ',' for the counters' -c option below, so this presumably
    # yields autosome names — TODO confirm against the helper (its name
    # suggests lengths).
    chromosomes = soil.utils.genome.load_bam_chromosome_lengths(
        normal_bam_file, 'autosomes')

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # One chunk per parameter set on the 'param_idx' axis.
    # NOTE(review): 'create_intialization_parameters' is misspelled but must
    # match the attribute name in the tasks module — confirm before renaming.
    workflow.setobj(obj=mgd.TempOutputObj('init_params', 'param_idx'),
                    value=tasks.create_intialization_parameters())

    workflow.subworkflow(name='get_allele_counts',
                         func=create_allele_counts_workflow,
                         args=(mgd.InputFile(normal_bam_file),
                               mgd.InputFile(tumour_bam_file),
                               mgd.InputFile(dbsnp_vcf_file),
                               mgd.InputFile(ref_genome_fasta_file),
                               mgd.TempOutputFile('allele_counts.tsv')),
                         kwargs={'chromosomes': 'autosomes'})

    # Read-depth wig tracks: normal, tumour, GC content and mappability.
    workflow.commandline(name='build_normal_wig',
                         args=('readCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(normal_bam_file), '>',
                               mgd.TempOutputFile('normal.wig')))

    workflow.commandline(name='build_tumour_wig',
                         args=('readCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(tumour_bam_file), '>',
                               mgd.TempOutputFile('tumour.wig')))

    workflow.commandline(name='build_gc_wig',
                         args=('gcCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(ref_genome_fasta_file), '>',
                               mgd.TempOutputFile('gc.wig')))

    workflow.commandline(name='build_mappability_wig',
                         args=('mapCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(mappability_file), '>',
                               mgd.TempOutputFile('mappability.wig')))

    workflow.transform(name='build_coverage_file',
                       func=tasks.build_coverage_file,
                       args=(mgd.TempInputFile('normal.wig'),
                             mgd.TempInputFile('tumour.wig'),
                             mgd.TempInputFile('gc.wig'),
                             mgd.TempInputFile('mappability.wig'),
                             mgd.TempOutputFile('coverage.wig')),
                       kwargs={'target_file': exome_bed_file})

    # One TITAN run per parameter set; memory-heavy, with retry escalation.
    workflow.transform(name='run_titan',
                       axes=('param_idx', ),
                       ctx={
                           'mem': 8,
                           'mem_retry_increment': 4,
                           'num_retry': 3,
                           'threads': threads
                       },
                       func=tasks.run_titan,
                       args=(mgd.TempInputFile('coverage.wig'),
                             mgd.TempInputFile('allele_counts.tsv'),
                             mgd.TempInputObj('init_params', 'param_idx'),
                             mgd.TempOutputFile('run.tar.gz', 'param_idx'),
                             mgd.TempSpace('titan_tmp', 'param_idx')),
                       kwargs={
                           'is_exome': (exome_bed_file is not None),
                           'sample': sample,
                           'threads': threads
                       })

    # Aggregates over all 'param_idx' chunks into one stats table.
    workflow.transform(name='build_run_stats_file',
                       func=tasks.build_run_stats_file,
                       args=(mgd.TempInputFile('run.tar.gz', 'param_idx'),
                             mgd.TempInputObj('init_params', 'param_idx'),
                             mgd.TempOutputFile('stats.tsv')))

    workflow.transform(name='build_output',
                       func=tasks.build_final_results_file,
                       args=(mgd.TempInputFile('coverage.wig'),
                             mgd.TempInputFile('allele_counts.tsv'),
                             mgd.TempInputFile('run.tar.gz', 'param_idx'),
                             mgd.TempInputFile('stats.tsv'),
                             mgd.OutputFile(out_file),
                             mgd.TempSpace('build_results')))

    return workflow
def create_somatic_workflow(normal_bam_file, tumour_bam_file, ref_genome_fasta_file, out_file, chromosomes='default', is_exome=False, split_size=int(1e7)):
    """Call somatic SNVs and indels with Strelka on a tumour/normal pair.

    The genome is split into ~``split_size`` bp regions called in parallel;
    per-region indel and SNV VCFs are merged, restricted to PASS records and
    tabix-indexed.

    :param normal_bam_file: normal sample BAM.
    :param tumour_bam_file: tumour sample BAM.
    :param ref_genome_fasta_file: reference genome FASTA.
    :param out_file: path for the final bgzipped VCF ('.tbi' written alongside).
    :param chromosomes: chromosome selection passed to the region helpers.
    :param is_exome: forwarded to Strelka's segment caller.
    :param split_size: approximate region size in bp for parallelization.
    """
    sandbox = soil.utils.workflow.get_sandbox(
        ['bcftools', 'samtools', 'strelka'])

    workflow = pypeliner.workflow.Workflow(default_ctx=med_mem_ctx, default_sandbox=sandbox)

    # Region chunks form the 'regions' axis; chromosome names form 'chrom_axis'.
    workflow.setobj(obj=mgd.TempOutputObj('config', 'regions'),
                    value=soil.utils.genome.get_bam_regions(
                        normal_bam_file, split_size, chromosomes=chromosomes))

    workflow.setobj(obj=mgd.TempOutputObj('chrom_names', 'chrom_axis'),
                    value=get_chromosomes(normal_bam_file, chromosomes=chromosomes))

    workflow.transform(
        name='count_fasta_bases',
        func=soil.wrappers.strelka.tasks.count_fasta_bases,
        args=(
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    # Lightweight local computation; runs outside any sandbox.
    workflow.transform(
        name='get_genome_size',
        ctx={'local': True},
        func=get_known_genome_size,
        ret=mgd.TempOutputObj('genome_size'),
        args=(
            mgd.InputFile(tumour_bam_file),
            mgd.TempInputFile('ref_base_counts.tsv'),
            chromosomes,
        ),
        sandbox=None,
    )

    workflow.transform(
        name='get_chromosome_depths',
        axes=('chrom_axis', ),
        func=soil.wrappers.strelka.tasks.get_chromosome_depth,
        args=(
            mgd.TempInputObj('chrom_names', 'chrom_axis'),
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('chrom_depth.txt', 'chrom_axis'),
        ),
    )

    workflow.transform(
        name='merge_chromosome_depths',
        func=soil.wrappers.strelka.tasks.merge_chromosome_depth,
        args=(
            mgd.TempInputFile('chrom_depth.txt', 'chrom_axis'),
            mgd.TempOutputFile('chrom_depth_merged.txt'),
        ),
        sandbox=None,
    )

    # Per-region Strelka calling; emits separate indel and SNV VCFs.
    workflow.transform(name='call_genome_segment',
                       axes=('regions', ),
                       func=soil.wrappers.strelka.tasks.call_genome_segment,
                       args=(
                           mgd.TempInputFile('chrom_depth_merged.txt'),
                           mgd.InputFile(normal_bam_file),
                           mgd.InputFile(tumour_bam_file),
                           mgd.InputFile(ref_genome_fasta_file),
                           mgd.TempOutputFile('indels.vcf', 'regions'),
                           mgd.TempOutputFile('snvs.vcf', 'regions'),
                           mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                           mgd.TempInputObj('config', 'regions'),
                           mgd.TempInputObj('genome_size'),
                       ),
                       kwargs={
                           'is_exome': is_exome,
                       })

    workflow.transform(
        name='merge_indels',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('indels.vcf', 'regions'),
            mgd.TempOutputFile('indels.vcf.gz'),
        ),
    )

    workflow.transform(
        name='merge_snvs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('snvs.vcf', 'regions'),
            mgd.TempOutputFile('snvs.vcf.gz'),
        ),
    )

    # Combine indels and SNVs; overlapping positions are permitted.
    workflow.transform(
        name='merge_all',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            [
                mgd.TempInputFile('indels.vcf.gz'),
                mgd.TempInputFile('snvs.vcf.gz')
            ],
            mgd.TempOutputFile('merged.vcf.gz'),
        ),
        kwargs={
            'allow_overlap': True,
        },
    )

    # Keep only records whose FILTER is '.' or PASS.
    workflow.commandline(name='filter_vcf',
                         ctx=low_mem_ctx,
                         args=(
                             'bcftools', 'view',
                             '-O', 'z',
                             '-f', '.,PASS',
                             '-o', mgd.OutputFile(out_file),
                             mgd.TempInputFile('merged.vcf.gz'),
                         ))

    workflow.transform(name='index_vcf',
                       ctx=low_mem_ctx,
                       func=soil.wrappers.samtools.tasks.index_vcf,
                       args=(
                           mgd.InputFile(out_file),
                           mgd.OutputFile(out_file + '.tbi'),
                       ))

    return workflow
def create_ref_panel_phase_workflow(genetic_map_file, ref_file, target_file, out_file):
    """ Run EAGLE using a reference panel.

    Work is split per chromosome: both the reference panel and the target are
    subset to each chromosome, phased independently with EAGLE, and the phased
    pieces are concatenated and tabix-indexed.
    """
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'eagle'])

    workflow = pypeliner.workflow.Workflow(default_ctx=default_ctx, default_sandbox=sandbox)

    # The chromosomes present in the target define the 'chrom' axis.
    workflow.setobj(
        obj=mgd.TempOutputObj('chrom', 'chrom'),
        value=get_chromosomes(target_file)
    )

    # Subset both inputs per chromosome with the same helper.
    for job_name, src_file, tmp_name in (('split_ref', ref_file, 'ref.bcf'),
                                         ('split_target', target_file, 'target.bcf')):
        workflow.transform(
            name=job_name,
            axes=('chrom',),
            func=tasks.get_chrom_variant_file,
            args=(
                mgd.TempInputObj('chrom', 'chrom'),
                mgd.InputFile(src_file),
                mgd.TempOutputFile(tmp_name, 'chrom')
            )
        )

    workflow.transform(
        name='run_eagle',
        axes=('chrom',),
        func=tasks.run_eagle,
        args=(
            mgd.InputFile(genetic_map_file),
            mgd.TempInputFile('ref.bcf', 'chrom'),
            mgd.TempInputFile('target.bcf', 'chrom'),
            mgd.TempOutputFile('phased.bcf', 'chrom'),
            mgd.TempSpace('eagle_tmp', 'chrom')
        )
    )

    workflow.transform(
        name='concat_results',
        func=tasks.concat_results,
        args=(
            mgd.TempInputFile('phased.bcf', 'chrom'),
            mgd.OutputFile(out_file)
        )
    )

    workflow.commandline(
        name='index',
        args=(
            'bcftools', 'index',
            '-t',
            '-o', mgd.OutputFile(out_file + '.tbi'),
            mgd.InputFile(out_file)
        )
    )

    return workflow
def create_multiple_lane_align_workflow(fastq_files_1, fastq_files_2, ref_genome_dir, out_bam_file, add_xs_tag=False, align_threads=1, merge_threads=1, read_group_info=None, sort_threads=1):
    """Align multiple lanes of paired FASTQs, then mark duplicates and merge
    the per-lane BAMs into a single indexed BAM.
    """
    if read_group_info is None:
        # No read-group metadata supplied: map every lane key to None.
        read_group_info = dict.fromkeys(fastq_files_1)

    sandbox = soil.utils.workflow.get_sandbox(['sambamba', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('read_group_info', 'lane'), value=read_group_info)

    # One alignment sub-workflow per lane on the 'lane' axis.
    workflow.subworkflow(
        name='align',
        axes=('lane', ),
        func=create_align_workflow,
        args=(
            mgd.InputFile('R1.fq.gz', 'lane', fnames=fastq_files_1),
            mgd.InputFile('R2.fq.gz', 'lane', fnames=fastq_files_2),
            ref_genome_dir,
            mgd.TempOutputFile('lane.bam', 'lane'),
        ),
        kwargs={
            'add_xs_tag': add_xs_tag,
            'align_threads': align_threads,
            'read_group_info': mgd.TempInputObj('read_group_info', 'lane'),
            'sort_threads': sort_threads,
        },
    )

    # Merge all lanes while marking duplicates; memory-heavy with retries.
    workflow.transform(
        name='markdups_and_merge',
        axes=(),
        ctx={'mem': 24, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': merge_threads},
        func=soil.wrappers.sambamba.tasks.markdups,
        args=(
            mgd.TempInputFile('lane.bam', 'lane'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('markdup_tmp'),
        ),
        kwargs={'threads': merge_threads},
    )

    workflow.commandline(
        name='index',
        args=(
            'samtools', 'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ),
    )

    return workflow
def create_topiary_workflow(hla_alleles, in_file, out_file, copy_pyensembl_cache_dir=False, iedb_dir=None, genome='GRCh37', predictor='netmhc', pyensembl_cache_dir=None):
    """ Run topiary.

    Runs one topiary job per peptide length (8-11, the 'pep_len' axis) and
    merges the per-length tables into a single TSV.

    Parameters
    ----------
    hla_alleles: list
        List of HLA alleles i.e. A*02:01.
    in_file: str
        Path to VCF file with variants.
    out_file: str
        Path where output will be written in tsv format.
    copy_pyensembl_cache_dir: bool
        Forwarded to tasks.run_topiary.
    iedb_dir: str
        Optional local IEDB install; also passed to the allele filter.
    genome: str
        Reference genome name forwarded to topiary.
    predictor: str
        Binding predictor; used for both allele filtering and prediction.
    pyensembl_cache_dir: str
        Optional pyensembl cache directory forwarded to tasks.run_topiary.
    """
    sandbox = soil.utils.workflow.get_sandbox([
        'topiary',
    ])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('raw_hla_alleles'), value=hla_alleles)

    # Peptide lengths become the chunks of the 'pep_len' axis.
    workflow.setobj(obj=mgd.OutputChunks('pep_len'), value=[8, 9, 10, 11])

    # Filter the allele list for the chosen predictor (see
    # tasks.filter_hla_alleles for the exact criteria).
    workflow.transform(name='filter_hla_alleles',
                       func=tasks.filter_hla_alleles,
                       args=(mgd.TempInputObj('raw_hla_alleles'), ),
                       kwargs={
                           'iedb_dir': iedb_dir,
                           'predictor': predictor,
                       },
                       ret=mgd.TempOutputObj('hla_alleles'))

    workflow.transform(name='run_topiary',
                       axes=('pep_len', ),
                       ctx={
                           'mem': 8,
                           'mem_retry_increment': 4,
                           'num_retry': 3
                       },
                       func=tasks.run_topiary,
                       args=(mgd.TempInputObj('hla_alleles'),
                             mgd.InputFile(in_file),
                             mgd.TempOutputFile('raw.tsv', 'pep_len')),
                       kwargs={
                           'copy_pyensembl_cache_dir': copy_pyensembl_cache_dir,
                           'iedb_dir': iedb_dir,
                           'genome': genome,
                           'peptide_length': mgd.Template('{pep_len}', 'pep_len'),
                           'predictor': predictor,
                           'pyensembl_cache_dir': pyensembl_cache_dir
                       })

    # Merge across the 'pep_len' axis into the final table.
    workflow.transform(name='reformat_output',
                       axes=(),
                       func=tasks.reformat_output,
                       args=(mgd.TempInputFile('raw.tsv', 'pep_len'),
                             mgd.OutputFile(out_file)))

    return workflow
def create_mutect_paired_workflow(normal_bam_file, tumour_bam_file, ref_genome_fasta_file, out_file, chromosomes=None, normal_name='normal', split_size=int(1e7), tumour_name='tumour'):
    """Call somatic variants with Mutect on a tumour/normal pair, parallelized
    over genome regions.

    Per-region VCFs are passed through Mutect's filter step, concatenated, and
    reduced to records whose FILTER is '.' or PASS.

    :param chromosomes: optional chromosome subset; None means all.
    :param normal_name: fallback normal sample label (see get_sample below).
    :param split_size: approximate region size in bp for parallelization.
    :param tumour_name: fallback tumour sample label.
    """
    # Resolve the sample labels via get_sample; presumably this reads the BAM
    # header and falls back to the provided default — confirm helper semantics.
    normal_name = get_sample(normal_bam_file, normal_name)

    tumour_name = get_sample(tumour_bam_file, tumour_name)

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'gatk', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx, default_sandbox=sandbox)

    # The region list becomes the chunks of the 'regions' axis.
    workflow.setobj(obj=pypeliner.managed.TempOutputObj('config', 'regions'),
                    value=soil.utils.genome.get_bam_regions(
                        normal_bam_file, split_size, chromosomes=chromosomes))

    workflow.transform(name='run_mutect',
                       axes=('regions', ),
                       ctx=med_mem_ctx,
                       func=tasks.run_mutect_paired,
                       args=(mgd.InputFile(normal_bam_file),
                             mgd.InputFile(tumour_bam_file),
                             mgd.InputFile(ref_genome_fasta_file),
                             mgd.TempInputObj('config', 'regions'),
                             mgd.TempOutputFile('region.vcf', 'regions')),
                       kwargs={
                           'normal_name': normal_name,
                           'tumour_name': tumour_name
                       })

    workflow.transform(name='run_mutect_filter',
                       axes=('regions', ),
                       ctx=med_mem_ctx,
                       func=tasks.run_filter_mutect,
                       args=(mgd.TempInputFile('region.vcf', 'regions'),
                             mgd.TempOutputFile('flagged.vcf', 'regions')))

    workflow.transform(name='concatenate_vcfs',
                       func=soil.wrappers.samtools.tasks.concatenate_vcf,
                       args=(
                           mgd.TempInputFile('flagged.vcf', 'regions'),
                           mgd.TempOutputFile('merged.vcf.gz'),
                       ))

    # Keep only records whose FILTER is '.' or PASS.
    workflow.commandline(name='filter_vcf',
                         ctx=low_mem_ctx,
                         args=(
                             'bcftools', 'view',
                             '-O', 'z',
                             '-f', '.,PASS',
                             '-o', mgd.OutputFile(out_file),
                             mgd.TempInputFile('merged.vcf.gz'),
                         ))

    return workflow
def crete_download_ref_data_workflow(config, out_dir, cosmic=False, local_download=False):
    """ Download reference files.

    This workflow mainly retrieves files from the internet. There are some light to moderately heavy computational
    tasks as well.

    NOTE(review): the function name is misspelled ('crete'); renaming would
    break existing callers, so it is left as-is.

    :param config: reference-data config mapping; top-level keys ending in
        'url'/'urls' are exposed as workflow objects, plus nested 'snpeff',
        'cosmic' and 'pyensembl' entries are read explicitly.
    :param out_dir: root directory reference data is written under.
    :param cosmic: if True, prompt for COSMIC credentials and download COSMIC.
    :param local_download: if True, run download jobs on the local host.
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    ref_data_paths = soil.ref_data.paths.SoilRefDataPaths(out_dir)

    # Persist the config next to the data for reproducibility.
    with open(ref_data_paths.config_file, 'w') as fh:
        yaml.dump(config, fh)

    if cosmic:
        # Credentials are collected interactively and not written to disk.
        cosmic_user = click.prompt('Please enter COSMIC user ID')

        cosmic_password = click.prompt('Please enter COSMIC password', hide_input=True)

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # Expose every top-level '*url'/'*urls' config entry as a workflow object.
    for key in config:
        if key.endswith('url') or key.endswith('urls'):
            workflow.setobj(obj=mgd.TempOutputObj(key), value=config[key])

    # snpeff's URL is nested and therefore missed by the loop above.
    workflow.setobj(mgd.TempOutputObj('snpeff_url'), value=config['snpeff']['url'])

    workflow.subworkflow(
        name='download_ref_gene_annotations',
        func=_create_download_decompress_concat_workflow,
        args=(mgd.TempInputObj('ref_gene_annotations_gtf_urls'),
              mgd.OutputFile(ref_data_paths.gene_annotations_gtf_file)),
        kwargs={'local_download': local_download})

    workflow.subworkflow(name='download_ref_genome',
                         func=_create_download_decompress_concat_workflow,
                         args=(mgd.TempInputObj('ref_genome_fasta_urls'),
                               mgd.TempOutputFile('raw_ref.fasta')),
                         kwargs={'local_download': local_download})

    workflow.transform(name='lexsort_ref_genome',
                       func=tasks.lex_sort_fasta,
                       args=(mgd.TempInputFile('raw_ref.fasta'),
                             mgd.OutputFile(ref_data_paths.genome_fasta_file)))

    workflow.subworkflow(name='download_ref_proteome',
                         func=_create_download_decompress_concat_workflow,
                         args=(mgd.TempInputObj('ref_proteome_fasta_urls'),
                               mgd.TempOutputFile('raw_ref_prot.fasta')),
                         kwargs={'local_download': local_download})

    # NOTE(review): 'filter_bad_proiteins' is misspelled but must match the
    # attribute name in the tasks module — confirm before renaming.
    workflow.transform(name='filter_bad_proteins',
                       func=tasks.filter_bad_proiteins,
                       args=(mgd.TempInputFile('raw_ref_prot.fasta'),
                             mgd.OutputFile(
                                 ref_data_paths.proteome_fasta_file)))

    workflow.subworkflow(
        name='download_ref_transcriptome',
        func=_create_download_decompress_concat_workflow,
        args=(mgd.TempInputObj('ref_transcriptome_fasta_urls'),
              mgd.OutputFile(ref_data_paths.transcriptome_fasta_file)),
        kwargs={'local_download': local_download})

    workflow.transform(name='download_dbsnp',
                       ctx={'local': local_download},
                       func=tasks.download,
                       args=(mgd.TempInputObj('dbsnp_url'),
                             mgd.OutputFile(ref_data_paths.dbsnp_vcf_file)))

    if cosmic:
        workflow.subworkflow(
            name='download_cosmic',
            func=_create_download_cosmic_workflow,
            args=(config['cosmic']['ref_genome_version'],
                  mgd.OutputFile(ref_data_paths.cosmic_vcf_file),
                  cosmic_user,
                  cosmic_password),
            kwargs={'local_download': local_download})

    workflow.transform(name='download_snpeff_db',
                       ctx={'local': local_download},
                       func=tasks.download,
                       args=(mgd.TempInputObj('snpeff_url'),
                             mgd.TempOutputFile('snpeff.zip')))

    # Sentinel 'done.txt' lets downstream jobs depend on the unzip step.
    workflow.transform(
        name='unzip_snpeff',
        func=tasks.unzip_file,
        args=(mgd.TempInputFile('snpeff.zip'),
              mgd.OutputFile(
                  os.path.join(os.path.dirname(ref_data_paths.snpeff_data_dir),
                               'done.txt')),
              mgd.TempSpace('snpeff_tmp')))

    workflow.transform(name='download_genetic_map',
                       ctx={'local': local_download},
                       func=tasks.download,
                       args=(mgd.TempInputObj('genetic_map_txt_url'),
                             mgd.OutputFile(ref_data_paths.genetic_map_file)))

    workflow.subworkflow(
        name='ref_haplotype_panel',
        func=soil.ref_data.haplotype.workflows.create_eagle_ref_data_workflow,
        args=(mgd.TempInputObj('haplotype_vcf_template_url'),
              mgd.OutputFile(ref_data_paths.haplotypes_bcf)),
        kwargs={'local_download': local_download})

    workflow.transform(name='download_iedb_mhc_one',
                       ctx={'local': local_download},
                       func=tasks.download,
                       args=(mgd.TempInputObj('iedb_mhc_one_url'),
                             mgd.TempOutputFile('mhc1.tar.gz')))

    workflow.transform(name='extract_iedb_mhc_one',
                       func=tasks.extract_tar_file,
                       args=(mgd.TempInputFile('mhc1.tar.gz'),
                             mgd.OutputFile(
                                 os.path.join(ref_data_paths.iedb_mhc_one_dir,
                                              'extract.done'))))

    # Depends on the extract sentinel; writes its own configure sentinel.
    workflow.transform(name='config_iedb_mhc_one',
                       func=tasks.configure_iedb_module,
                       args=(mgd.InputFile(
                                 os.path.join(ref_data_paths.iedb_mhc_one_dir,
                                              'extract.done')),
                             mgd.OutputFile(
                                 os.path.join(ref_data_paths.iedb_mhc_one_dir,
                                              'configure.done'))))

    workflow.transform(name='download_vep_cache',
                       ctx={'local': local_download},
                       func=tasks.download,
                       args=(mgd.TempInputObj('vep_cache_url'),
                             mgd.TempOutputFile('vep.tar.gz')))

    workflow.transform(name='extract_vep_cache',
                       func=tasks.extract_tar_file,
                       args=(mgd.TempInputFile('vep.tar.gz'),
                             mgd.OutputFile(
                                 os.path.join(ref_data_paths.vep_cache_dir,
                                              'homo_sapiens',
                                              'extract.done'))))

    workflow.subworkflow(name='download_vep_plugins',
                         func=_create_download_vep_plugins_workflow,
                         args=(mgd.TempInputObj('vep_plugins_urls'),
                               ref_data_paths.vep_plugins_dir),
                         kwargs={'local_download': local_download})

    workflow.setobj(obj=mgd.TempOutputObj('pyensembl_version'),
                    value=config['pyensembl']['version'])

    # pyensembl is not in the workflow's default sandbox, so this job gets its own.
    workflow.transform(name='download_pyensembl_cache',
                       ctx={'local': local_download},
                       func=tasks.download_pyensembl_cache,
                       args=(mgd.TempInputObj('pyensembl_version'),
                             mgd.OutputFile(
                                 os.path.join(
                                     ref_data_paths.pyensembl_cache_dir,
                                     'download.done'))),
                       sandbox=soil.utils.workflow.get_sandbox(['pyensembl']))

    return workflow
def create_vardict_paired_workflow(normal_bam_file, tumour_bam_file, ref_genome_fasta_file, out_file, chromosomes=None, split_size=int(5e6)):
    """Call somatic variants with VarDict on a tumour/normal pair, parallelized
    over genome regions.

    Per region: vardict call -> somatic test -> VCF build; the per-region VCFs
    are compressed, concatenated, restricted to PASS records, and finally
    restricted to 'StrongSomatic' calls in ``out_file``.

    :param chromosomes: optional chromosome subset; None means all.
    :param split_size: approximate region size in bp for parallelization.
    """
    sandbox = soil.utils.workflow.get_sandbox(
        ['bcftools', 'samtools', 'vardict', 'vardict-java'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx, default_sandbox=sandbox)

    # The region list becomes the chunks of the 'regions' axis.
    workflow.setobj(obj=pypeliner.managed.TempOutputObj('config', 'regions'),
                    value=soil.utils.genome.get_bam_regions(
                        normal_bam_file, split_size, chromosomes=chromosomes))

    workflow.transform(name='run_vardict',
                       axes=('regions', ),
                       ctx=med_mem_ctx,
                       func=tasks.run_vardict_paired,
                       args=(mgd.InputFile(normal_bam_file),
                             mgd.InputFile(tumour_bam_file),
                             mgd.InputFile(ref_genome_fasta_file),
                             mgd.TempInputObj('config', 'regions'),
                             mgd.TempOutputFile('call.tsv', 'regions')))

    workflow.transform(name='test_somatic',
                       axes=('regions', ),
                       func=tasks.run_test_somatic,
                       args=(mgd.TempInputFile('call.tsv', 'regions'),
                             mgd.TempOutputFile('somatic.tsv', 'regions')))

    workflow.transform(name='write_vcf',
                       axes=('regions', ),
                       func=tasks.run_build_paired_vcf,
                       args=(mgd.TempInputFile('somatic.tsv', 'regions'),
                             mgd.TempOutputFile('region.vcf', 'regions')))

    workflow.commandline(name='compress_vcf',
                         axes=('regions', ),
                         args=('bcftools', 'view', '-O', 'z', '-o',
                               mgd.TempOutputFile('region.vcf.gz', 'regions'),
                               mgd.TempInputFile('region.vcf', 'regions')))

    workflow.transform(name='concatenate_vcfs',
                       func=soil.wrappers.samtools.tasks.concatenate_vcf,
                       args=(
                           mgd.TempInputFile('region.vcf.gz', 'regions'),
                           mgd.TempOutputFile('merged.vcf.gz'),
                       ))

    # Keep only records whose FILTER is '.' or PASS.
    workflow.commandline(name='filter_vcf',
                         args=(
                             'bcftools', 'view',
                             '-O', 'z',
                             '-f', '.,PASS',
                             '-o', mgd.TempOutputFile('filtered.vcf.gz'),
                             mgd.TempInputFile('merged.vcf.gz'),
                         ))

    # Restrict to VarDict's high-confidence somatic calls.
    workflow.commandline(name='filter_somatics',
                         args=('bcftools', 'filter', '-i',
                               'INFO/STATUS[0]="StrongSomatic"', '-O', 'z',
                               '-o', mgd.OutputFile(out_file),
                               mgd.TempInputFile('filtered.vcf.gz')))

    return workflow