def merge_perchrom_vcfs(job, perchrom_vcfs, tool_name, univ_options):
    """
    Merge per-chromosome vcf files into a single genome level vcf.

    Header lines (those starting with '#') are copied from the first input that has any; all
    other non-blank lines of every input are copied in the order given by `chrom_sorted`.

    :param job: Job running this function (its `fileStore` is used for logging and storage)
    :param dict perchrom_vcfs: Dictionary with chromosome name as key and jobstore ID of
           corresponding vcf as value
    :param str tool_name: Name of the tool that generated the vcfs
    :param dict univ_options: Universal options, forwarded to `export_results`
    :returns: Job Store File ID for the merged vcf
    """
    job.fileStore.logToMaster('Running merge_perchrom_vcfs for %s' % tool_name)
    work_dir = os.getcwd()
    suffix = '.vcf'
    input_files = {chrom + suffix: jsid for chrom, jsid in perchrom_vcfs.items()}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    # Remove exactly one '.vcf' suffix.  str.rstrip('.vcf') would strip any trailing run of the
    # characters '.', 'v', 'c' and 'f', mangling chromosome names that end in one of them.
    chroms = [name[:-len(suffix)] if name.endswith(suffix) else name for name in input_files]
    write_header = True
    with open(os.path.join(work_dir, 'all_merged.vcf'), 'w') as outvcf:
        for chrom in chrom_sorted(chroms):
            saw_header = False
            with open(input_files[chrom + suffix], 'r') as infile:
                for line in infile:
                    line = line.strip()
                    if not line:
                        # A blank line is neither a header nor a record
                        continue
                    if line.startswith('#'):
                        if write_header:
                            print(line, file=outvcf)
                            saw_header = True
                        continue
                    write_header = False
                    print(line, file=outvcf)
            # A header-only vcf (chromosome without calls) must also switch the header off,
            # otherwise the next file's header would be written a second time.
            if saw_header:
                write_header = False
    export_results(job, outvcf.name, univ_options, subfolder='mutations/' + tool_name)
    output_file = job.fileStore.writeGlobalFile(outvcf.name)
    return output_file
def unmerge(job, input_vcf, tool_name, chromosomes, tool_options, univ_options):
    """
    Un-merge a vcf file into per-chromosome vcfs.

    Every chromosome seen in the input gets its own `<chrom>.vcf` in the working directory,
    starting with the header lines read so far.  Requested chromosomes without any record get
    a header-only vcf.  Only the chromosomes listed in `chromosomes` are stored and exported.

    :param job: Job running this function (its `fileStore` is used for storage)
    :param str input_vcf: Input vcf
    :param str tool_name: The name of the mutation caller
    :param list chromosomes: List of chromosomes to retain
    :param dict tool_options: Options specific to the mutation caller
    :param dict univ_options: Dict of universal options used by almost all tools
    :return: dict of fsIDs, one for each chromosomal vcf
    :rtype: dict
    """
    work_dir = os.getcwd()
    input_files = {
        'input.vcf': input_vcf,
        'genome.fa.fai.tar.gz': tool_options['genome_fai']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    # NOTE(review): the extracted index is not read again in this function; kept so the
    # files fetched and extracted stay the same as before.
    input_files['genome.fa.fai'] = untargz(input_files['genome.fa.fai.tar.gz'], work_dir)

    header = []
    chrom_handles = {}

    def _open_chrom_vcf(chrom):
        """Create <chrom>.vcf, register its handle and write the header collected so far."""
        handle = open(os.path.join(work_dir, chrom + '.vcf'), 'w')
        # Register before writing so the handle is closed even if the write fails
        chrom_handles[chrom] = handle
        print(''.join(header), file=handle, end='')
        return handle

    try:
        with open(input_files['input.vcf'], 'r') as in_vcf:
            for line in in_vcf:
                if line.startswith('#'):
                    header.append(line)
                    continue
                line = line.strip()
                if not line:
                    # A blank line has no chromosome column; skip it instead of crashing
                    continue
                chrom = line.split()[0]
                handle = chrom_handles.get(chrom)
                if handle is None:
                    handle = _open_chrom_vcf(chrom)
                print(line, file=handle)
        # Process chromosomes that had no mutations
        for chrom in set(chromosomes).difference(chrom_handles):
            _open_chrom_vcf(chrom)
    finally:
        # Flush and release every per-chromosome file, also when an error occurred
        for handle in chrom_handles.values():
            handle.close()

    requested = set(chromosomes)
    outdict = {}
    for chrom, handle in chrom_handles.items():
        if chrom not in requested:
            continue
        outdict[chrom] = job.fileStore.writeGlobalFile(handle.name)
        export_results(job, outdict[chrom], handle.name, univ_options,
                       subfolder='mutations/' + tool_name)
    return outdict
def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options):
    """
    Annotate an input vcf with snpeff.

    The snpeff output is captured in `mutations.vcf` in the working directory, stored in the
    file store and exported under `mutations/snpeffed`.

    :param toil.fileStore.FileID merged_mutation_file: fsID for input vcf
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict snpeff_options: Options specific to snpeff
    :return: fsID for the snpeffed vcf
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    local_files = get_files_from_filestore(
        job,
        {'merged_mutations.vcf': merged_mutation_file,
         'snpeff_index.tar.gz': snpeff_options['index']},
        work_dir, docker=False)
    local_files['snpeff_index'] = untargz(local_files['snpeff_index.tar.gz'], work_dir)
    # The tool runs inside docker, so every path is translated to its in-container form
    docker_files = {}
    for name, path in local_files.items():
        docker_files[name] = docker_path(path)

    genome = univ_options['ref'] + '_gencode'
    index_dir = docker_files['snpeff_index']
    config_file = '/'.join([index_dir, 'snpEff_' + genome + '.config'])
    parameters = ['eff',
                  '-dataDir', index_dir,
                  '-c', config_file,
                  '-no-intergenic',
                  '-no-downstream',
                  '-no-upstream',
                  # '-canon',
                  '-noStats',
                  genome,
                  docker_files['merged_mutations.vcf']]
    # Tool specific heap size wins; fall back to the universal one when it is unset
    xmx = snpeff_options['java_Xmx'] or univ_options['java_Xmx']
    out_path = '/'.join([work_dir, 'mutations.vcf'])
    with open(out_path, 'w') as snpeff_file:
        docker_call(tool='snpeff', tool_parameters=parameters, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'], java_xmx=xmx, outfile=snpeff_file,
                    tool_version=snpeff_options['version'])
    output_file = job.fileStore.writeGlobalFile(snpeff_file.name)
    export_results(job, output_file, snpeff_file.name, univ_options,
                   subfolder='mutations/snpeffed')
    job.fileStore.logToMaster('Ran snpeff on %s successfully' % univ_options['patient'])
    return output_file
def index_bamfile(job, bamfile, sample_type, univ_options, samtools_options, sample_info=None,
                  export=True):
    """
    Index `bamfile` using samtools.

    :param toil.fileStore.FileID bamfile: fsID for the bam file
    :param str sample_type: Description of the sample to inject into the filename
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict samtools_options: Options specific to samtools
    :param str sample_info: Information regarding the sample that will be injected into the
           filename as `sample_type`_`sample_info`.bam(.bai)
    :param bool export: Should the bam and bai be exported to the output directory?
    :return: Dict containing input bam and the generated index (.bam.bai)
             output_files:
                |- '<sample_type>(_<sample_info>).bam': fsID
                +- '<sample_type>(_<sample_info>).bam.bai': fsID
    :rtype: dict
    """
    work_dir = os.getcwd()
    # File name is <sample_type>[_<sample_info>].bam
    name_parts = [sample_type]
    if sample_info is not None:
        assert isinstance(sample_info, str)
        name_parts.append(sample_info)
    bam_name = '_'.join(name_parts) + '.bam'
    bai_name = bam_name + '.bai'

    local_files = get_files_from_filestore(job, {bam_name: bamfile}, work_dir, docker=True)
    docker_call(tool='samtools', tool_parameters=['index', local_files[bam_name]],
                work_dir=work_dir, dockerhub=univ_options['dockerhub'],
                tool_version=samtools_options['version'])
    out_bai = '/'.join([work_dir, bai_name])
    bai_fsid = job.fileStore.writeGlobalFile(out_bai)
    output_files = {bam_name: bamfile, bai_name: bai_fsid}
    if export:
        # Dropping the '.bai' extension yields the path of the bam in the working directory
        out_bam = os.path.splitext(out_bai)[0]
        export_results(job, bamfile, out_bam, univ_options, subfolder='alignments')
        export_results(job, output_files[bai_name], out_bai, univ_options,
                       subfolder='alignments')
    job.fileStore.logToMaster('Ran samtools-index on %s:%s successfully'
                              % (univ_options['patient'], sample_type))
    return output_files
def merge_perchrom_vcfs(job, perchrom_vcfs, tool_name, univ_options):
    """
    Merge per-chromosome vcf files into a single genome level vcf.

    Header lines (those starting with '#') are copied from the first input that has any; all
    other non-blank lines of every input are copied in the order given by `chrom_sorted`.

    :param job: Job running this function (its `fileStore` is used for logging and storage)
    :param dict perchrom_vcfs: Dictionary with chromosome name as key and jobstore ID of
           corresponding vcf as value
    :param str tool_name: Name of the tool that generated the vcfs
    :param dict univ_options: Universal options, forwarded to `export_results`
    :returns: Job Store File ID for the merged vcf
    """
    job.fileStore.logToMaster('Running merge_perchrom_vcfs for %s' % tool_name)
    work_dir = os.getcwd()
    suffix = '.vcf'
    input_files = {chrom + suffix: jsid for chrom, jsid in perchrom_vcfs.items()}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    # Remove exactly one '.vcf' suffix.  str.rstrip('.vcf') would strip any trailing run of the
    # characters '.', 'v', 'c' and 'f', mangling chromosome names that end in one of them.
    chroms = [name[:-len(suffix)] if name.endswith(suffix) else name for name in input_files]
    write_header = True
    with open(os.path.join(work_dir, 'all_merged.vcf'), 'w') as outvcf:
        for chrom in chrom_sorted(chroms):
            saw_header = False
            with open(input_files[chrom + suffix], 'r') as infile:
                for line in infile:
                    line = line.strip()
                    if not line:
                        # A blank line is neither a header nor a record
                        continue
                    if line.startswith('#'):
                        if write_header:
                            print(line, file=outvcf)
                            saw_header = True
                        continue
                    write_header = False
                    print(line, file=outvcf)
            # A header-only vcf (chromosome without calls) must also switch the header off,
            # otherwise the next file's header would be written a second time.
            if saw_header:
                write_header = False
    output_file = job.fileStore.writeGlobalFile(outvcf.name)
    export_results(job, output_file, outvcf.name, univ_options,
                   subfolder='mutations/' + tool_name)
    return output_file
def run_muse_sump_perchrom(job, muse_output, univ_options, muse_options, chrom):
    """
    Run muse sump on the muse output for one chromosome.

    :param job: Job running this function (its `fileStore` is used for logging and storage)
    :param muse_output: File store ID of the MuSE output to process
    :param dict univ_options: Universal options ('patient' and 'dockerhub' are read here)
    :param dict muse_options: MuSE options ('dbsnp_vcf' and 'dbsnp_tbi' are read here)
    :param str chrom: Chromosome being processed; names the output `<chrom>.vcf`
    :return: File store ID of the vcf produced by muse sump
    """
    job.fileStore.logToMaster('Running muse sump on %s:%s' % (univ_options['patient'], chrom))
    work_dir = os.getcwd()
    input_files = {
        'MuSE.txt': muse_output,
        'dbsnp_coding.vcf.gz': muse_options['dbsnp_vcf'],
        'dbsnp_coding.vcf.gz.tbi.tmp': muse_options['dbsnp_tbi']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    # Dropping the '.tmp' extension gives the name of the index next to the vcf
    tbi = os.path.splitext(input_files['dbsnp_coding.vcf.gz.tbi.tmp'])[0]
    # Debug listing of the working directory before the index is put in place
    print({x: os.stat(x) for x in os.listdir(work_dir)}, file=sys.stderr)
    # NOTE(review): presumably the pause and the fresh copy make the index newer than the
    # vcf it belongs to -- confirm before removing.
    time.sleep(2)
    shutil.copy(input_files['dbsnp_coding.vcf.gz.tbi.tmp'], tbi)
    # 0o777 is the octal spelling accepted by Python 2.6+ and Python 3; the bare 0777 form
    # is a syntax error on Python 3.
    os.chmod(tbi, 0o777)
    open(tbi, 'a').close()
    input_files = {key: docker_path(path) for key, path in input_files.items()}
    # Debug listing of the working directory after the index is in place
    print({x: os.stat(x) for x in os.listdir(work_dir)}, file=sys.stderr)
    output_file = ''.join([work_dir, '/', chrom, '.vcf'])
    parameters = ['sump',
                  '-I', input_files['MuSE.txt'],
                  '-O', docker_path(output_file),
                  '-D', input_files['dbsnp_coding.vcf.gz'],
                  '-E']
    docker_call(tool='muse', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    export_results(job, output_file, univ_options, subfolder='mutations/muse')
    outfile = job.fileStore.writeGlobalFile(output_file)
    return outfile
def index_bamfile(job, bamfile, sample_type, univ_options):
    """
    Index a bam file with samtools.

    :param job: Job running this function (its `fileStore` is used for logging and storage)
    :param bamfile: File store ID of the bam file
    :param str sample_type: 'tumor_dna' or 'normal_dna'; prefix of the local file name
    :param dict univ_options: Universal options ('patient' and 'dockerhub' are read here)
    :return: Dict with the input bam and the generated index
             output_files:
                |- '<sample_type>_fix_pg_sorted.bam': file store ID (the input, unchanged)
                +- '<sample_type>_fix_pg_sorted.bam.bai': file store ID
    :rtype: dict
    """
    job.fileStore.logToMaster('Running samtools-index on %s:%s'
                              % (univ_options['patient'], sample_type))
    work_dir = os.getcwd()
    bam_name = sample_type + '_fix_pg_sorted.bam'
    bai_name = bam_name + '.bai'
    local_files = get_files_from_filestore(job, {bam_name: bamfile}, work_dir, docker=True)
    docker_call(tool='samtools', tool_parameters=['index', local_files[bam_name]],
                work_dir=work_dir, dockerhub=univ_options['dockerhub'])
    out_bai = '/'.join([work_dir, bai_name])
    bai_fsid = job.fileStore.writeGlobalFile(out_bai)
    output_files = {bam_name: bamfile, bai_name: bai_fsid}
    # Dropping the '.bai' extension yields the path of the bam in the working directory
    out_bam = os.path.splitext(out_bai)[0]
    for result in (out_bam, out_bai):
        export_results(job, result, univ_options, subfolder='alignments')
    return output_files
def run_filter_radia(job, bams, radia_file, univ_options, radia_options, chrom):
    """
    Run filterradia on the radia calls for one chromosome.

    :param job: Job running this function (its `fileStore` is used for logging and storage)
    :param dict bams: File store IDs of the bams and their indexes; the keys read here are
           'tumor_rna', 'tumor_rnai', 'tumor_dna', 'tumor_dnai', 'normal_dna', 'normal_dnai'
    :param radia_file: File store ID of the vcf generated by run_radia()
    :param dict univ_options: Universal options ('patient' and 'dockerhub' are read here)
    :param dict radia_options: Radia options ('genome_fasta' and 'genome_fai' are read here)
    :param str chrom: Chromosome to process
    :return: File store ID of the filtered vcf (`<chrom>.vcf`)
    """
    job.fileStore.logToMaster('Running filter-radia on %s:%s' % (univ_options['patient'], chrom))
    work_dir = os.getcwd()
    input_files = {
        'rna.bam': bams['tumor_rna'],
        'rna.bam.bai': bams['tumor_rnai'],
        'tumor.bam': bams['tumor_dna'],
        'tumor.bam.bai': bams['tumor_dnai'],
        'normal.bam': bams['normal_dna'],
        'normal.bam.bai': bams['normal_dnai'],
        'radia.vcf': radia_file,
        'genome.fa.tar.gz': radia_options['genome_fasta'],
        'genome.fa.fai.tar.gz': radia_options['genome_fai']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)

    for key in ('genome.fa', 'genome.fa.fai'):
        input_files[key] = untargz(input_files[key + '.tar.gz'], work_dir)
    input_files = {key: docker_path(path) for key, path in input_files.items()}

    # Drop one leading 'chr' prefix only.  str.lstrip('chr') would strip any leading run of
    # the characters 'c', 'h' and 'r' and could eat into the chromosome name itself.
    prefix = 'chr'
    short_chrom = chrom[len(prefix):] if chrom.startswith(prefix) else chrom

    filterradia_log = ''.join([work_dir, '/radia_filtered_', chrom, '_radia.log'])
    parameters = [univ_options['patient'],  # shortID
                  short_chrom,
                  input_files['radia.vcf'],
                  '/data',
                  '/home/radia/scripts',
                  '-d', '/home/radia/data/hg19/snp135',
                  '-r', '/home/radia/data/hg19/retroGenes/',
                  '-p', '/home/radia/data/hg19/pseudoGenes/',
                  '-c', '/home/radia/data/hg19/cosmic/',
                  '-t', '/home/radia/data/hg19/gaf/2_1',
                  '--noSnpEff',
                  '--noBlacklist',
                  '--noTargets',
                  '--noRnaBlacklist',
                  '-f', input_files['genome.fa'],
                  '--log=INFO',
                  '-g', docker_path(filterradia_log)]
    docker_call(tool='filterradia', tool_parameters=parameters,
                work_dir=work_dir, dockerhub=univ_options['dockerhub'])
    # The tool output is expected at <patient>_<chrom>.vcf; rename it to <chrom>.vcf
    output_file = ''.join([work_dir, '/', chrom, '.vcf'])
    os.rename(''.join([work_dir, '/', univ_options['patient'], '_', chrom, '.vcf']),
              output_file)
    export_results(job, output_file, univ_options, subfolder='mutations/radia')
    output_file = job.fileStore.writeGlobalFile(output_file)
    return output_file
def run_mutect_perchrom(job, tumor_bam, normal_bam, univ_options, mutect_options, chrom):
    """
    Call mutations with MuTect on one chromosome of the input bams.

    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict mutect_options: Options specific to MuTect
    :param str chrom: Chromosome to process
    :return: fsID for the chromosome vcf
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    tarballs = ('genome.fa', 'genome.fa.fai', 'genome.dict', 'cosmic.vcf', 'cosmic.vcf.idx',
                'dbsnp.vcf.idx')
    local_files = get_files_from_filestore(
        job,
        {'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
         'tumor.bam.bai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
         'normal.bam': normal_bam['normal_dna_fix_pg_sorted.bam'],
         'normal.bam.bai': normal_bam['normal_dna_fix_pg_sorted.bam.bai'],
         'genome.fa.tar.gz': mutect_options['genome_fasta'],
         'genome.fa.fai.tar.gz': mutect_options['genome_fai'],
         'genome.dict.tar.gz': mutect_options['genome_dict'],
         'cosmic.vcf.tar.gz': mutect_options['cosmic_vcf'],
         'cosmic.vcf.idx.tar.gz': mutect_options['cosmic_idx'],
         'dbsnp.vcf.gz': mutect_options['dbsnp_vcf'],
         'dbsnp.vcf.idx.tar.gz': mutect_options['dbsnp_idx']},
        work_dir, docker=False)
    # dbsnp.vcf should be bgzipped, but all others should be tar.gz'd
    local_files['dbsnp.vcf'] = gunzip(local_files['dbsnp.vcf.gz'])
    for name in tarballs:
        local_files[name] = untargz(local_files[name + '.tar.gz'], work_dir)
    docker_files = {}
    for name, path in local_files.items():
        docker_files[name] = docker_path(path)

    mutout = '/'.join([work_dir, chrom + '.out'])
    mutvcf = '/'.join([work_dir, chrom + '.vcf'])
    parameters = ['-R', docker_files['genome.fa'],
                  '--cosmic', docker_files['cosmic.vcf'],
                  '--dbsnp', docker_files['dbsnp.vcf'],
                  '--input_file:normal', docker_files['normal.bam'],
                  '--input_file:tumor', docker_files['tumor.bam'],
                  # '--tumor_lod', str(10),
                  # '--initial_tumor_lod', str(4.0),
                  '-L', chrom,
                  '--out', docker_path(mutout),
                  '--vcf', docker_path(mutvcf)]
    # Tool specific heap size wins; fall back to the universal one when it is unset
    java_xmx = mutect_options['java_Xmx'] or univ_options['java_Xmx']
    docker_call(tool='mutect', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'], java_xmx=java_xmx,
                tool_version=mutect_options['version'])
    output_file = job.fileStore.writeGlobalFile(mutvcf)
    export_results(job, output_file, mutvcf, univ_options, subfolder='mutations/mutect')
    job.fileStore.logToMaster('Ran MuTect on %s:%s successfully'
                              % (univ_options['patient'], chrom))
    return output_file
def merge_perchrom_mutations(job, chrom, mutations, univ_options):
    """
    Merge the snv calls of all mutation callers for a single chromosome.

    A call is written to the merged vcf when at least 2 callers report it.  The supporting
    callers are listed in the `callers` INFO field.  The 'indels' and 'fusions' entries of
    `mutations` are ignored and only the 'snvs' part of the strelka entry is used; the
    `mutations` dict passed in is left unmodified.

    :param job: job
    :param str chrom: Chromosome to process
    :param dict mutations: dict of dicts of the various mutation caller names as keys, and a dict
           of per chromosome job store ids for vcfs as value
    :param dict univ_options: Universal Options
    :returns: Job store ID of the merged vcf
    """
    work_dir = os.getcwd()
    from protect.mutation_calling.muse import process_muse_vcf
    from protect.mutation_calling.mutect import process_mutect_vcf
    from protect.mutation_calling.radia import process_radia_vcf
    from protect.mutation_calling.somaticsniper import process_somaticsniper_vcf
    from protect.mutation_calling.strelka import process_strelka_vcf
    # Work on a filtered copy: popping from / rewriting the caller's dict would make a second
    # call with the same dict fail.
    snv_calls = {caller: calls for caller, calls in mutations.items()
                 if caller not in ('indels', 'fusions')}
    snv_calls['strelka'] = snv_calls['strelka']['snvs']
    vcf_processor = {'mutect': process_mutect_vcf,
                     'muse': process_muse_vcf,
                     'radia': process_radia_vcf,
                     'somaticsniper': process_somaticsniper_vcf,
                     'strelka': process_strelka_vcf,
                     }
    # For now, let's just say 2 out of n need to call it.
    majority = 2
    # Get input files
    perchrom_mutations = {caller: vcf_processor[caller](job, snv_calls[caller][chrom],
                                                        work_dir, univ_options)
                          for caller in snv_calls}
    # Read in each file; sets make the per-position membership tests below constant time
    vcf_calls = {caller: set(read_vcf(vcf_file))
                 for caller, vcf_file in perchrom_mutations.items()}
    all_positions = set(itertools.chain(*vcf_calls.values()))
    with open(''.join([work_dir, '/', chrom, '.vcf']), 'w') as outfile:
        print('##fileformat=VCFv4.0', file=outfile)
        # The Description value has to be quoted and the line closed with '>' to be a valid
        # INFO meta-information line.
        print('##INFO=<ID=callers,Number=.,Type=String,'
              'Description="List of supporting callers.">', file=outfile)
        print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=outfile)
        for position in sorted(all_positions):
            hits = {caller: position in vcf_calls[caller] for caller in perchrom_mutations}
            if sum(hits.values()) >= majority:
                # position holds the values printed in the CHROM, POS, REF and ALT columns
                print(position[0], position[1], '.', position[2], position[3], '.', 'PASS',
                      'callers=' + ','.join([caller for caller, hit in hits.items() if hit]),
                      sep='\t', file=outfile)
    fsid = job.fileStore.writeGlobalFile(outfile.name)
    export_results(job, fsid, outfile.name, univ_options, subfolder='mutations/merged')
    return fsid
def merge_perchrom_mutations(job, chrom, mutations, univ_options):
    """
    Merge the snv calls of all mutation callers for a single chromosome.

    A call is written to the merged vcf when at least 2 callers report it.  The supporting
    callers are listed in the `callers` INFO field.  The 'indels' and 'fusions' entries of
    `mutations` are ignored and only the 'snvs' part of the strelka entry is used; the
    `mutations` dict passed in is left unmodified.

    :param job: job
    :param str chrom: Chromosome to process
    :param dict mutations: dict of dicts of the various mutation caller names as keys, and a dict
           of per chromosome job store ids for vcfs as value
    :param dict univ_options: Universal Options
    :returns: Job store ID of the merged vcf
    """
    work_dir = os.getcwd()
    from protect.mutation_calling.muse import process_muse_vcf
    from protect.mutation_calling.mutect import process_mutect_vcf
    from protect.mutation_calling.radia import process_radia_vcf
    from protect.mutation_calling.somaticsniper import process_somaticsniper_vcf
    from protect.mutation_calling.strelka import process_strelka_vcf
    # Work on a filtered copy: popping from / rewriting the caller's dict would make a second
    # call with the same dict fail.
    snv_calls = {caller: calls for caller, calls in mutations.items()
                 if caller not in ('indels', 'fusions')}
    snv_calls['strelka'] = snv_calls['strelka']['snvs']
    vcf_processor = {'mutect': process_mutect_vcf,
                     'muse': process_muse_vcf,
                     'radia': process_radia_vcf,
                     'somaticsniper': process_somaticsniper_vcf,
                     'strelka': process_strelka_vcf,
                     }
    # For now, let's just say 2 out of n need to call it.
    majority = 2
    # Get input files
    perchrom_mutations = {caller: vcf_processor[caller](job, snv_calls[caller][chrom],
                                                        work_dir, univ_options)
                          for caller in snv_calls}
    # Read in each file; sets make the per-position membership tests below constant time
    vcf_calls = {caller: set(read_vcf(vcf_file))
                 for caller, vcf_file in perchrom_mutations.items()}
    all_positions = set(itertools.chain(*vcf_calls.values()))
    with open(''.join([work_dir, '/', chrom, '.vcf']), 'w') as outfile:
        print('##fileformat=VCFv4.0', file=outfile)
        # The Description value has to be quoted and the line closed with '>' to be a valid
        # INFO meta-information line.
        print('##INFO=<ID=callers,Number=.,Type=String,'
              'Description="List of supporting callers.">', file=outfile)
        print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=outfile)
        for position in sorted(all_positions):
            hits = {caller: position in vcf_calls[caller] for caller in perchrom_mutations}
            if sum(hits.values()) >= majority:
                # position holds the values printed in the CHROM, POS, REF and ALT columns
                print(position[0], position[1], '.', position[2], position[3], '.', 'PASS',
                      'callers=' + ','.join([caller for caller, hit in hits.items() if hit]),
                      sep='\t', file=outfile)
    export_results(job, outfile.name, univ_options, subfolder='mutations/merged')
    merged_fsid = job.fileStore.writeGlobalFile(outfile.name)
    return merged_fsid
def run_rsem(job, rna_bam, univ_options, rsem_options):
    """
    Quantify expression with rsem from the input RNA bam.

    :param toil.fileStore.FileID rna_bam: fsID of a transcriptome bam generated by STAR
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict rsem_options: Options specific to rsem
    :return: Dict of gene- and isoform-level expression calls
             output_files:
                 |- 'rsem.genes.results': fsID
                 +- 'rsem.isoforms.results': fsID
    :rtype: dict
    """
    work_dir = os.getcwd()
    local_files = get_files_from_filestore(
        job,
        {'star_transcriptome.bam': rna_bam,
         'rsem_index.tar.gz': rsem_options['index']},
        work_dir, docker=False)
    local_files['rsem_index'] = untargz(local_files['rsem_index.tar.gz'], work_dir)
    # The tool runs inside docker, so every path is translated to its in-container form
    docker_files = {}
    for name, path in list(local_files.items()):
        docker_files[name] = docker_path(path)

    # The index is addressed as <index dir>/<reference name>
    index_prefix = '/'.join([docker_files['rsem_index'], univ_options['ref']])
    parameters = ['--paired-end',
                  '-p', '20',
                  '--bam',
                  docker_files['star_transcriptome.bam'],
                  '--no-bam-output',
                  index_prefix,
                  'rsem']
    docker_call(tool='rsem', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'], tool_version=rsem_options['version'])
    output_files = {}
    for filename in ('rsem.genes.results', 'rsem.isoforms.results'):
        local_path = '/'.join([work_dir, filename])
        output_files[filename] = job.fileStore.writeGlobalFile(local_path)
        export_results(job, output_files[filename], local_path, univ_options,
                       subfolder='expression')
    job.fileStore.logToMaster('Ran rsem on %s successfully' % univ_options['patient'])
    return output_files
def run_rsem(job, rna_bam, univ_options, rsem_options):
    """
    Quantify expression with rsem from the RNA bam file.

    :param rna_bam: File store ID of rnaAligned.toTranscriptome.out.bam
    :param dict univ_options: Dict of universal arguments used by almost all tools
           ('patient' and 'dockerhub' are read here)
    :param dict rsem_options: Dict of parameters specific to rsem
           rsem_options
                |- 'tool_index': file store ID of the rsem index tarball
                +- 'n': number of threads to allocate
    :return: Dict of gene- and isoform-level expression calls
             output_files:
                 |- 'rsem.genes.results': file store ID
                 +- 'rsem.isoforms.results': file store ID
    :rtype: dict
    """
    job.fileStore.logToMaster('Running rsem on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    local_files = get_files_from_filestore(
        job,
        {'star_transcriptome.bam': rna_bam,
         'rsem_index.tar.gz': rsem_options['tool_index']},
        work_dir, docker=False)
    local_files['rsem_index'] = untargz(local_files['rsem_index.tar.gz'], work_dir)
    # The tool runs inside docker, so every path is translated to its in-container form
    docker_files = {}
    for name, path in local_files.items():
        docker_files[name] = docker_path(path)

    # The index is addressed as <index dir>/hg19
    index_prefix = '/'.join([docker_files['rsem_index'], 'hg19'])
    parameters = ['--paired-end',
                  '-p', str(rsem_options['n']),
                  '--bam',
                  docker_files['star_transcriptome.bam'],
                  '--no-bam-output',
                  index_prefix,
                  'rsem']
    docker_call(tool='rsem', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    output_files = {}
    for filename in ('rsem.genes.results', 'rsem.isoforms.results'):
        local_path = '/'.join([work_dir, filename])
        output_files[filename] = job.fileStore.writeGlobalFile(local_path)
        export_results(job, output_files[filename], local_path, univ_options,
                       subfolder='expression')
    return output_files
def merge_phlat_calls(job, tumor_phlat, normal_phlat, rna_phlat, univ_options):
    """
    Merge the PHLAT results of the 3 input fastq pairs into consensus allele lists.

    MHCI alleles (HLA_A, HLA_B, HLA_C) go to `mhci_alleles.list`.  `mhcii_alleles.list`
    receives the HLA_DRB alleles followed by every DQA/DQB combination.  An allele group
    without any allele contributes no line to the lists.

    :param job: Job running this function (its `fileStore` is used for logging and storage)
    :param tumor_phlat: File store ID for tumor DNA called alleles
    :param normal_phlat: File store ID for normal DNA called alleles
    :param rna_phlat: File store ID for tumor RNA called alleles
    :param dict univ_options: Universal options, forwarded to `export_results`
    :return: Dict of file store IDs for consensus MHCI and MHCII alleles
             output_files
                |- 'mhci_alleles.list': file store ID
                +- 'mhcii_alleles.list': file store ID
    :rtype: dict
    """
    job.fileStore.logToMaster('Merging Phlat calls')
    work_dir = os.getcwd()
    input_files = {
        'tumor_dna': tumor_phlat,
        'normal_dna': normal_phlat,
        'tumor_rna': rna_phlat}
    input_files = get_files_from_filestore(job, input_files, work_dir)
    with open(input_files['tumor_dna'], 'r') as td_file, \
            open(input_files['normal_dna'], 'r') as nd_file, \
            open(input_files['tumor_rna'], 'r') as tr_file:
        # TODO: Could this be a defaultdict?
        mhc_alleles = {'HLA_A': [], 'HLA_B': [], 'HLA_C': [], 'HLA_DPA': [], 'HLA_DQA': [],
                       'HLA_DPB': [], 'HLA_DQB': [], 'HLA_DRB': []}
        for phlatfile in td_file, nd_file, tr_file:
            mhc_alleles = parse_phlat_file(phlatfile, mhc_alleles)
    # Get most probable alleles for each allele group and print to output
    with open(os.path.join(work_dir, 'mhci_alleles.list'), 'w') as mhci_file, \
            open(os.path.join(work_dir, 'mhcii_alleles.list'), 'w') as mhcii_file:
        # One allele per line.  Writing allele by allele (rather than printing a joined
        # string) keeps an empty group from adding a blank line to the list.
        for mhci_group in ['HLA_A', 'HLA_B', 'HLA_C']:
            for allele in most_probable_alleles(mhc_alleles[mhci_group]):
                print('HLA-' + allele, file=mhci_file)
        for allele in most_probable_alleles(mhc_alleles['HLA_DRB']):
            print('HLA-' + allele, file=mhcii_file)
        dqa_mpa = most_probable_alleles(mhc_alleles['HLA_DQA'])
        dqb_mpa = most_probable_alleles(mhc_alleles['HLA_DQB'])
        for dqa_allele in dqa_mpa:
            for dqb_allele in dqb_mpa:
                print(''.join(['HLA-', dqa_allele, '/', dqb_allele]), file=mhcii_file)
    # A plain dict: a defaultdict without a default factory behaves exactly like one
    output_files = {}
    for allele_file in ['mhci_alleles.list', 'mhcii_alleles.list']:
        allele_path = os.path.join(work_dir, allele_file)
        output_files[allele_file] = job.fileStore.writeGlobalFile(allele_path)
        export_results(job, allele_path, univ_options, subfolder='haplotyping')
    return output_files
def merge_phlat_calls(job, tumor_phlat, normal_phlat, rna_phlat, univ_options):
    """
    Merge the PHLAT results of the 3 input fastq pairs into consensus allele lists.

    MHCI alleles (HLA_A, HLA_B, HLA_C) go to `mhci_alleles.list`.  `mhcii_alleles.list`
    receives the HLA_DRB alleles followed by every DQA/DQB combination.  An allele group
    without any allele contributes no line to the lists.

    :param job: Job running this function (its `fileStore` is used for logging and storage)
    :param tumor_phlat: File store ID for tumor DNA called alleles
    :param normal_phlat: File store ID for normal DNA called alleles
    :param rna_phlat: File store ID for tumor RNA called alleles
    :param dict univ_options: Universal options, forwarded to `export_results`
    :return: Dict of file store IDs for consensus MHCI and MHCII alleles
             output_files
                |- 'mhci_alleles.list': file store ID
                +- 'mhcii_alleles.list': file store ID
    :rtype: dict
    """
    job.fileStore.logToMaster('Merging Phlat calls')
    work_dir = os.getcwd()
    input_files = {
        'tumor_dna': tumor_phlat,
        'normal_dna': normal_phlat,
        'tumor_rna': rna_phlat}
    input_files = get_files_from_filestore(job, input_files, work_dir)
    with open(input_files['tumor_dna'], 'r') as td_file, \
            open(input_files['normal_dna'], 'r') as nd_file, \
            open(input_files['tumor_rna'], 'r') as tr_file:
        # TODO: Could this be a defaultdict?
        mhc_alleles = {'HLA_A': [], 'HLA_B': [], 'HLA_C': [], 'HLA_DPA': [], 'HLA_DQA': [],
                       'HLA_DPB': [], 'HLA_DQB': [], 'HLA_DRB': []}
        for phlatfile in td_file, nd_file, tr_file:
            mhc_alleles = parse_phlat_file(phlatfile, mhc_alleles)
    # Get most probable alleles for each allele group and print to output
    with open(os.path.join(work_dir, 'mhci_alleles.list'), 'w') as mhci_file, \
            open(os.path.join(work_dir, 'mhcii_alleles.list'), 'w') as mhcii_file:
        # One allele per line.  Writing allele by allele (rather than printing a joined
        # string) keeps an empty group from adding a blank line to the list.
        for mhci_group in ['HLA_A', 'HLA_B', 'HLA_C']:
            for allele in most_probable_alleles(mhc_alleles[mhci_group]):
                print('HLA-' + allele, file=mhci_file)
        for allele in most_probable_alleles(mhc_alleles['HLA_DRB']):
            print('HLA-' + allele, file=mhcii_file)
        dqa_mpa = most_probable_alleles(mhc_alleles['HLA_DQA'])
        dqb_mpa = most_probable_alleles(mhc_alleles['HLA_DQB'])
        for dqa_allele in dqa_mpa:
            for dqb_allele in dqb_mpa:
                print(''.join(['HLA-', dqa_allele, '/', dqb_allele]), file=mhcii_file)
    # A plain dict: a defaultdict without a default factory behaves exactly like one
    output_files = {}
    for allele_file in ['mhci_alleles.list', 'mhcii_alleles.list']:
        allele_path = os.path.join(work_dir, allele_file)
        output_files[allele_file] = job.fileStore.writeGlobalFile(allele_path)
        export_results(job, output_files[allele_file], allele_path, univ_options,
                       subfolder='haplotyping')
    return output_files
def run_muse_sump_perchrom(job, muse_output, univ_options, muse_options, chrom):
    """
    Run MuSE sump on the MuSE call generated vcf.

    :param toil.fileStore.FileID muse_output: vcf generated by MuSE call
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict muse_options: Options specific to MuSE
    :param str chrom: Chromosome to process
    :return: fsID for the chromosome vcf
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    input_files = {
        'MuSE.txt': muse_output,
        'dbsnp_coding.vcf.gz': muse_options['dbsnp_vcf'],
        'dbsnp_coding.vcf.gz.tbi.tmp': muse_options['dbsnp_tbi']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    # Dropping the '.tmp' extension gives the name of the index next to the vcf
    tbi = os.path.splitext(input_files['dbsnp_coding.vcf.gz.tbi.tmp'])[0]
    # NOTE(review): presumably the pause and the fresh copy make the index newer than the
    # vcf it belongs to -- confirm before removing.
    time.sleep(2)
    shutil.copy(input_files['dbsnp_coding.vcf.gz.tbi.tmp'], tbi)
    # 0o777 is the octal spelling accepted by Python 2.6+ and Python 3; the bare 0777 form
    # is a syntax error on Python 3.
    os.chmod(tbi, 0o777)
    open(tbi, 'a').close()
    input_files = {key: docker_path(path) for key, path in input_files.items()}
    output_file = ''.join([work_dir, '/', chrom, '.vcf'])

    parameters = ['sump',
                  '-I', input_files['MuSE.txt'],
                  '-O', docker_path(output_file),
                  '-D', input_files['dbsnp_coding.vcf.gz'],
                  '-E']

    docker_call(tool='muse', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'], tool_version=muse_options['version'])
    outfile = job.fileStore.writeGlobalFile(output_file)
    export_results(job, outfile, output_file, univ_options, subfolder='mutations/muse')

    job.fileStore.logToMaster('Ran MuSE sump on %s:%s successfully'
                              % (univ_options['patient'], chrom))
    return outfile
def merge_phlat_calls(job, tumor_phlat, normal_phlat, rna_phlat, univ_options):
    """
    Merge tumor, normal and tumor rna Haplotypes into consensus calls.

    MHCI alleles (HLA_A, HLA_B, HLA_C) go to `mhci_alleles.list`.  `mhcii_alleles.list`
    receives the HLA_DRB alleles followed by every DQA/DQB combination.  An allele group
    without any allele contributes no line to the lists.

    :param toil.fileStore.FileID tumor_phlat: fsID for HLA haplotypes called from tumor DNA
    :param toil.fileStore.FileID normal_phlat: fsID for HLA haplotypes called from normal DNA
    :param toil.fileStore.FileID rna_phlat: fsID for HLA haplotypes called from tumor RNA
    :param dict univ_options: Dict of universal options used by almost all tools
    :return: Dict of fsIDs for consensus MHCI and MHCII alleles
             output_files
                |- 'mhci_alleles.list': fsID
                +- 'mhcii_alleles.list': fsID
    :rtype: dict
    """
    job.fileStore.logToMaster('Merging Phlat calls')
    work_dir = os.getcwd()
    input_files = {
        'tumor_dna': tumor_phlat,
        'normal_dna': normal_phlat,
        'tumor_rna': rna_phlat}
    input_files = get_files_from_filestore(job, input_files, work_dir)
    with open(input_files['tumor_dna'], 'r') as td_file, \
            open(input_files['normal_dna'], 'r') as nd_file, \
            open(input_files['tumor_rna'], 'r') as tr_file:
        # TODO: Could this be a defaultdict?
        mhc_alleles = {'HLA_A': [], 'HLA_B': [], 'HLA_C': [], 'HLA_DPA': [], 'HLA_DQA': [],
                       'HLA_DPB': [], 'HLA_DQB': [], 'HLA_DRB': []}
        for phlatfile in td_file, nd_file, tr_file:
            mhc_alleles = parse_phlat_file(phlatfile, mhc_alleles)
    # Get most probable alleles for each allele group and print to output
    with open(os.path.join(work_dir, 'mhci_alleles.list'), 'w') as mhci_file, \
            open(os.path.join(work_dir, 'mhcii_alleles.list'), 'w') as mhcii_file:
        # One allele per line.  Writing allele by allele (rather than printing a joined
        # string) keeps an empty group from adding a blank line to the list.
        for mhci_group in ['HLA_A', 'HLA_B', 'HLA_C']:
            for allele in most_probable_alleles(mhc_alleles[mhci_group]):
                print('HLA-' + allele, file=mhci_file)
        for allele in most_probable_alleles(mhc_alleles['HLA_DRB']):
            print('HLA-' + allele, file=mhcii_file)
        dqa_mpa = most_probable_alleles(mhc_alleles['HLA_DQA'])
        dqb_mpa = most_probable_alleles(mhc_alleles['HLA_DQB'])
        for dqa_allele in dqa_mpa:
            for dqb_allele in dqb_mpa:
                print(''.join(['HLA-', dqa_allele, '/', dqb_allele]), file=mhcii_file)
    # A plain dict: a defaultdict without a default factory behaves exactly like one
    output_files = {}
    for allele_file in ['mhci_alleles.list', 'mhcii_alleles.list']:
        allele_path = os.path.join(work_dir, allele_file)
        output_files[allele_file] = job.fileStore.writeGlobalFile(allele_path)
        export_results(job, output_files[allele_file], allele_path, univ_options,
                       subfolder='haplotyping')
    return output_files
def run_transgene(job, snpeffed_file, univ_options, transgene_options):
    """
    Run transgene on the snpeffed vcf and return the peptide fastas used for MHC prediction.

    :param snpeffed_file: JSid for the snpeffed vcf
    :param dict univ_options: Dict of universal arguments used by almost all tools
                univ_options
                    |- 'patient': <patient identifier used in the log message>
                    +- 'dockerhub': <dockerhub to use>
    :param dict transgene_options: Dict of parameters specific to transgene
                transgene_options
                    +- 'gencode_peptide_fasta': <JSid for the gencode protein fasta tarball>
    :return: Dict of transgened n-mer peptide fastas and their map files
                output_files
                    |- 'transgened_tumor_9_mer_snpeffed.faa': <JSid>
                    |- 'transgened_tumor_9_mer_snpeffed.faa.map': <JSid>
                    |- 'transgened_tumor_10_mer_snpeffed.faa': <JSid>
                    |- 'transgened_tumor_10_mer_snpeffed.faa.map': <JSid>
                    |- 'transgened_tumor_15_mer_snpeffed.faa': <JSid>
                    +- 'transgened_tumor_15_mer_snpeffed.faa.map': <JSid>
    :rtype: dict

    This module corresponds to node 17 on the tree
    """
    job.fileStore.logToMaster('Running transgene on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    input_files = get_files_from_filestore(
        job,
        {'snpeffed_muts.vcf': snpeffed_file,
         'pepts.fa.tar.gz': transgene_options['gencode_peptide_fasta']},
        work_dir, docker=False)
    input_files['pepts.fa'] = untargz(input_files['pepts.fa.tar.gz'], work_dir)
    # The tool receives docker-side paths rather than the local ones.
    input_files = dict((name, docker_path(local)) for name, local in input_files.items())

    parameters = ['--peptides', input_files['pepts.fa'],
                  '--snpeff', input_files['snpeffed_muts.vcf'],
                  '--prefix', 'transgened',
                  '--pep_lens', '9,10,15']
    docker_call(tool='transgene', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])

    output_files = defaultdict()
    for peplen in ('9', '10', '15'):
        peptfile = 'transgened_tumor_' + peplen + '_mer_snpeffed.faa'
        mapfile = peptfile + '.map'
        export_results(job, peptfile, univ_options, subfolder='peptides')
        export_results(job, mapfile, univ_options, subfolder='peptides')
        output_files[peptfile] = job.fileStore.writeGlobalFile(os.path.join(work_dir, peptfile))
        output_files[mapfile] = job.fileStore.writeGlobalFile(os.path.join(work_dir, mapfile))
    return output_files
def run_rsem(job, rna_bam, univ_options, rsem_options):
    """
    Run rsem on the transcriptome-aligned RNA bam.

    :param rna_bam: JSid of rnaAligned.toTranscriptome.out.bam
    :param dict univ_options: Dict of universal arguments used by almost all tools
                univ_options
                    |- 'patient': <patient identifier used in the log message>
                    +- 'dockerhub': <dockerhub to use>
    :param dict rsem_options: Dict of parameters specific to rsem
                rsem_options
                    |- 'tool_index': <JSid for the rsem index tarball>
                    +- 'n': <number of threads to allocate>
    :return: Dict of rsem results
                output_files
                    |- 'rsem.genes.results': <JSid>
                    +- 'rsem.isoforms.results': <JSid>
    :rtype: dict

    This module corresponds to node 9 on the tree
    """
    job.fileStore.logToMaster('Running rsem on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    input_files = get_files_from_filestore(
        job,
        {'star_transcriptome.bam': rna_bam,
         'rsem_index.tar.gz': rsem_options['tool_index']},
        work_dir, docker=False)
    input_files['rsem_index'] = untargz(input_files['rsem_index.tar.gz'], work_dir)
    input_files = dict((name, docker_path(local)) for name, local in input_files.items())

    # Directory listings and the parameter list are echoed to stderr around the call.
    print(os.listdir('.'), file=sys.stderr)
    parameters = ['--paired-end',
                  '-p', str(rsem_options['n']),
                  '--bam',
                  input_files['star_transcriptome.bam'],
                  '--no-bam-output',
                  input_files['rsem_index'] + '/hg19',
                  'rsem']
    print(parameters, file=sys.stderr)
    docker_call(tool='rsem', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    print(os.listdir('.'), file=sys.stderr)

    output_files = {}
    for filename in ('rsem.genes.results', 'rsem.isoforms.results'):
        result_path = work_dir + '/' + filename
        output_files[filename] = job.fileStore.writeGlobalFile(result_path)
        export_results(job, result_path, univ_options, subfolder='expression')
    return output_files
def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options):
    """
    Run snpeff on the aggregated mutation calls.

    Currently the only mutations called are SNPs hence SnpEff suffices. This node will be
    replaced in the future with another translator.

    :param merged_mutation_file: JSid for the merged vcf
    :param dict univ_options: Dict of universal arguments used by almost all tools
                univ_options
                    |- 'patient': <patient identifier used in the log message>
                    |- 'java_Xmx': <fallback java heap setting>
                    +- 'dockerhub': <dockerhub to use>
    :param dict snpeff_options: Dict of parameters specific to snpeff
                snpeff_options
                    |- 'java_Xmx': <java heap setting; the universal one is used when falsy>
                    +- 'tool_index': <JSid for the snpEff index tarball>
    :return: JSid for the snpeffed vcf

    This node corresponds to node 16 on the tree
    """
    job.fileStore.logToMaster('Running snpeff on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    input_files = get_files_from_filestore(
        job,
        {'merged_mutations.vcf': merged_mutation_file,
         'snpeff_index.tar.gz': snpeff_options['tool_index']},
        work_dir, docker=False)
    input_files['snpeff_index'] = untargz(input_files['snpeff_index.tar.gz'], work_dir)
    input_files = dict((name, docker_path(local)) for name, local in input_files.items())

    index_dir = input_files['snpeff_index']
    parameters = ['eff',
                  '-dataDir', index_dir,
                  '-c', index_dir + '/snpEff_hg19_gencode.config',
                  '-no-intergenic',
                  '-no-downstream',
                  '-no-upstream',
                  # '-canon',
                  '-noStats',
                  'hg19_gencode',
                  input_files['merged_mutations.vcf']]
    xmx = snpeff_options['java_Xmx'] or univ_options['java_Xmx']
    # The tool's output is captured through the outfile handle.
    with open(work_dir + '/mutations.vcf', 'w') as snpeff_file:
        docker_call(tool='snpeff', tool_parameters=parameters, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'], java_opts=xmx, outfile=snpeff_file)
    snpeffed_vcf = snpeff_file.name
    export_results(job, snpeffed_vcf, univ_options, subfolder='mutations/snpeffed')
    return job.fileStore.writeGlobalFile(snpeffed_vcf)
def unmerge(job, input_vcf, tool_name, tool_options, univ_options):
    """
    Un-merge a vcf file into one vcf per chromosome.

    Every chromosome seen in the input gets a vcf holding the header plus its records.  Every
    chromosome listed in the genome fai that has no record in the input gets a header-only vcf.

    :param str input_vcf: Input vcf
    :param str tool_name: The name of the mutation caller (used for the export subfolder)
    :param dict tool_options: Options specific to the mutation caller ('genome_fai' is read)
    :param dict univ_options: Universal options
    :returns: dict of jsIDs, one for each chromosomal vcf
    :rtype: dict
    """
    work_dir = os.getcwd()
    input_files = {
        'input.vcf': input_vcf,
        'genome.fa.fai.tar.gz': tool_options['genome_fai']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    input_files['genome.fa.fai'] = untargz(input_files['genome.fa.fai.tar.gz'], work_dir)
    chromosomes = chromosomes_from_fai(input_files['genome.fa.fai'])

    read_chromosomes = {}
    header = []
    try:
        with open(input_files['input.vcf'], 'r') as in_vcf:
            for line in in_vcf:
                if line.startswith('#'):
                    header.append(line)
                    continue
                line = line.strip()
                if not line:
                    # A blank line carries no record (and no chromosome field to split out).
                    continue
                chrom = line.split()[0]
                if chrom not in read_chromosomes:
                    # First record for this chromosome: create its vcf and emit the header.
                    read_chromosomes[chrom] = open(os.path.join(work_dir, chrom + '.vcf'), 'w')
                    print(''.join(header), file=read_chromosomes[chrom], end='')
                print(line, file=read_chromosomes[chrom])
        # Process chromosomes that had no mutations
        for chrom in set(chromosomes).difference(read_chromosomes):
            read_chromosomes[chrom] = open(os.path.join(work_dir, chrom + '.vcf'), 'w')
            print(''.join(header), file=read_chromosomes[chrom], end='')
    finally:
        # Close (and thereby flush) every per-chromosome handle, even if parsing failed.
        for chromvcf in read_chromosomes.values():
            chromvcf.close()

    outdict = {}
    for chrom, chromvcf in read_chromosomes.items():
        export_results(job, chromvcf.name, univ_options, subfolder='mutations/' + tool_name)
        outdict[chrom] = job.fileStore.writeGlobalFile(chromvcf.name)
    return outdict
def boost_ranks(job, isoform_expression, merged_mhc_calls, transgene_out, univ_options,
                rank_boost_options):
    """
    Run the rankboost tool once for MHCI and once for MHCII predictions.

    This is the final module in the pipeline.

    :param isoform_expression: File store ID of the expression file given to rankboost
    :param dict merged_mhc_calls: Dict holding 'mhci_merged_files.list' and
           'mhcii_merged_files.list'
    :param dict transgene_out: Dict holding 'transgened_tumor_10_mer_snpeffed.faa' and
           'transgened_tumor_15_mer_snpeffed.faa'
    :param dict univ_options: Dict of universal options ('patient' and 'dockerhub' are read)
    :param dict rank_boost_options: Dict holding 'mhci_combo' and 'mhcii_combo'
    :return: Dict keyed by result file basename with the file store ID as value, or None when
             the tool did not produce that file.  The keys 'mhci' and 'mhcii' are also present
             and map to empty dicts.
    :rtype: dict

    This module corresponds to node 21 in the tree
    """
    job.fileStore.logToMaster('Running boost_ranks on %s' % univ_options['patient'])
    # This job works in a freshly created directory named after the patient.
    work_dir = os.path.abspath(univ_options['patient'])
    os.mkdir(work_dir)
    input_files = get_files_from_filestore(
        job,
        {'rsem_quant.tsv': isoform_expression,
         'mhci_merged_files.tsv': merged_mhc_calls['mhci_merged_files.list'],
         'mhcii_merged_files.tsv': merged_mhc_calls['mhcii_merged_files.list'],
         'mhci_peptides.faa': transgene_out['transgened_tumor_10_mer_snpeffed.faa'],
         'mhcii_peptides.faa': transgene_out['transgened_tumor_15_mer_snpeffed.faa']},
        work_dir, docker=True)

    output_files = {}
    for mhc in ('mhci', 'mhcii'):
        parameters = [mhc,
                      input_files[mhc + '_merged_files.tsv'],
                      input_files['rsem_quant.tsv'],
                      input_files[mhc + '_peptides.faa'],
                      rank_boost_options[mhc + '_combo']]
        docker_call(tool='rankboost', tool_parameters=parameters, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'])
        output_files[mhc] = {}
        # Concise results first, then detailed; a missing file is recorded as None.
        for suffix in ('_merged_files_concise_results.tsv',
                       '_merged_files_detailed_results.tsv'):
            result_path = work_dir + '/' + mhc + suffix
            if os.path.exists(result_path):
                output_files[os.path.basename(result_path)] = \
                    job.fileStore.writeGlobalFile(result_path)
                export_results(job, result_path, univ_options, subfolder='rankboost')
            else:
                output_files[os.path.basename(result_path)] = None
    return output_files
def index_bamfile(job, bamfile, sample_type, univ_options):
    """
    Index a bam file with samtools.

    :param bamfile: JSid for a bam file
    :param str sample_type: Prefix for the file names, e.g. 'tumor_dna' or 'normal_dna'
    :param dict univ_options: Dict of universal arguments used by almost all tools
                univ_options
                    |- 'patient': <patient identifier used in the log message>
                    +- 'dockerhub': <dockerhub to use>
    :return: Dict holding the input bam and the new index
                output_files
                    |- '<sample_type>_fix_pg_sorted.bam': <JSid, the input bamfile>
                    +- '<sample_type>_fix_pg_sorted.bam.bai': <JSid>
    :rtype: dict
    """
    job.fileStore.logToMaster('Running samtools-index on %s:%s' % (univ_options['patient'],
                                                                   sample_type))
    work_dir = os.getcwd()
    in_bamfile = sample_type + '_fix_pg_sorted.bam'
    bai_name = in_bamfile + '.bai'
    input_files = get_files_from_filestore(job, {in_bamfile: bamfile}, work_dir, docker=True)

    docker_call(tool='samtools', tool_parameters=['index', input_files[in_bamfile]],
                work_dir=work_dir, dockerhub=univ_options['dockerhub'])

    out_bai = work_dir + '/' + bai_name
    output_files = {in_bamfile: bamfile,
                    bai_name: job.fileStore.writeGlobalFile(out_bai)}
    # Stripping the '.bai' extension from the index path yields the bam's path.
    out_bam = os.path.splitext(out_bai)[0]
    export_results(job, bamfile, out_bam, univ_options, subfolder='alignments')
    export_results(job, output_files[bai_name], out_bai, univ_options, subfolder='alignments')
    return output_files
def run_muse_sump_perchrom(job, muse_output, univ_options, muse_options, chrom):
    """
    Run `muse sump` on the MuSE call output for one chromosome.

    :param muse_output: File store ID of the MuSE output to summarize
    :param dict univ_options: Dict of universal options ('patient' and 'dockerhub' are read)
    :param dict muse_options: Options specific to MuSE ('dbsnp_vcf' and 'dbsnp_tbi' are read)
    :param str chrom: Chromosome name; the result is written to '<chrom>.vcf'
    :return: File store ID of the per-chromosome vcf
    """
    job.fileStore.logToMaster('Running muse sump on %s:%s' % (univ_options['patient'], chrom))
    work_dir = os.getcwd()
    input_files = {
        'MuSE.txt': muse_output,
        'dbsnp_coding.vcf.gz': muse_options['dbsnp_vcf'],
        'dbsnp_coding.vcf.gz.tbi.tmp': muse_options['dbsnp_tbi']
    }
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)

    # The index is fetched under a '.tmp' name, then copied to its real name after a short pause
    # and touched.
    # NOTE(review): presumably this makes the index newer than the vcf it indexes -- confirm.
    tbi = os.path.splitext(input_files['dbsnp_coding.vcf.gz.tbi.tmp'])[0]
    time.sleep(2)
    shutil.copy(input_files['dbsnp_coding.vcf.gz.tbi.tmp'], tbi)
    # `0o777` is the octal spelling accepted by both Python 2.6+ and Python 3; the bare `0777`
    # form is a syntax error on Python 3.
    os.chmod(tbi, 0o777)
    open(tbi, 'a').close()

    input_files = {key: docker_path(path) for key, path in input_files.items()}
    output_file = ''.join([work_dir, '/', chrom, '.vcf'])

    parameters = [
        'sump',
        '-I', input_files['MuSE.txt'],
        '-O', docker_path(output_file),
        '-D', input_files['dbsnp_coding.vcf.gz'],
        '-E'
    ]
    docker_call(tool='muse', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    outfile = job.fileStore.writeGlobalFile(output_file)
    export_results(job, outfile, output_file, univ_options, subfolder='mutations/muse')
    return outfile
def assess_mhc_genes(job, isoform_expression, rna_haplotype, univ_options, mhc_genes_options):
    """
    Assess the prevalence of the various genes in the MHC pathway and write a plain-text report.

    Each section of the report has the columns Gene, Threshold, Observed and Result.

    :param isoform_expression: Isoform expression from run_rsem
    :param rna_haplotype: PHLAT output from running on rna
    :param dict univ_options: Universal options for the pipeline ('patient' is read)
    :param dict mhc_genes_options: Options specific to this module ('genes_file' is read)
    :return: File store ID of 'mhc_pathway_report.txt'
    """
    job.fileStore.logToMaster('Running mhc gene assessment on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    input_files = {
        'rsem_quant.tsv': isoform_expression,
        'rna_haplotype.sum': rna_haplotype,
        'mhc_genes.json.tar.gz': mhc_genes_options['genes_file']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    input_files['mhc_genes.json'] = untargz(input_files['mhc_genes.json.tar.gz'], work_dir)

    # Read in the MHC genes
    with open(input_files['mhc_genes.json']) as mhc_file:
        mhc_genes = json.load(mhc_file)

    # Parse the rna phlat file
    with open(input_files['rna_haplotype.sum']) as rna_mhc:
        mhc_alleles = {'HLA_A': [], 'HLA_B': [], 'HLA_C': [], 'HLA_DPA': [], 'HLA_DQA': [],
                       'HLA_DPB': [], 'HLA_DQB': [], 'HLA_DRB': []}
        mhc_alleles = parse_phlat_file(rna_mhc, mhc_alleles)

    # Process the isoform expressions: sum the TPM (column 6) of all isoforms per gene (column 2)
    gene_expressions = Counter()
    with open(input_files['rsem_quant.tsv']) as rsem_file:
        header = rsem_file.readline().strip().split()
        assert header == ['transcript_id', 'gene_id', 'length', 'effective_length',
                          'expected_count', 'TPM', 'FPKM', 'IsoPct'], \
            'Unexpected header in the rsem isoform expression file'
        for line in rsem_file:
            fields = line.strip().split()
            gene_expressions[fields[1]] += float(fields[5])

    with open(os.path.join(work_dir, 'mhc_pathway_report.txt'), 'w') as mpr:
        for section in mhc_genes:
            print(section.center(48, ' '), file=mpr)
            print("{:12}{:12}{:12}{:12}".format("Gene", "Threshold", "Observed", "Result"),
                  file=mpr)
            if section == 'MHCI loading':
                for mhci_allele in 'HLA_A', 'HLA_B', 'HLA_C':
                    num_alleles = len(mhc_alleles[mhci_allele])
                    result = 'FAIL' if num_alleles == 0 else 'LOW' if num_alleles == 1 else 'PASS'
                    print("{:12}{:12}{:12}{:12}".format(mhci_allele, '2', num_alleles, result),
                          file=mpr)
            elif section == 'MHCII loading':
                # TODO DP alleles
                for mhcii_allele in ('HLA_DQA', 'HLA_DQB', 'HLA_DRA', 'HLA_DRB'):
                    if mhcii_allele != 'HLA_DRA':
                        num_alleles = len(mhc_alleles[mhcii_allele])
                        result = ('FAIL' if num_alleles == 0 else
                                  'LOW' if num_alleles == 1 else 'PASS')
                        print("{:12}{:12}{:12}{:12}".format(mhcii_allele, 2, num_alleles, result),
                              file=mpr)
                    else:
                        # HLA_DRA is assessed by expression and not by allele count.
                        # FIXME This is hardcoded for now. We need to change this.
                        dra_observed = gene_expressions['ENSG00000204287.9']
                        # Threshold goes before the observed value, matching the column header
                        # and the per-gene rows below.
                        print("{:12}{:<12}{:<12}{:12}".format(
                            'HLA_DRA', '69.37', dra_observed,
                            'LOW' if dra_observed <= 69.37 else 'PASS'), file=mpr)
            for gene, ensgene, first_quart in mhc_genes[section]:
                result = 'LOW' if gene_expressions[ensgene] <= float(first_quart) else 'PASS'
                print("{:12}{:<12}{:<12}{:12}".format(gene, float(first_quart),
                                                      gene_expressions[ensgene], result),
                      file=mpr)
            print('', file=mpr)
    export_results(job, mpr.name, univ_options, subfolder='reports')
    output_file = job.fileStore.writeGlobalFile(mpr.name)
    return output_file
def assess_itx_resistance(job, gene_expression, univ_options, reports_options):
    """
    Assess the prevalence of the various genes in various cancer pathways and return a report in
    the txt format.

    For every pathway listed for the tumor type, a gene counts towards the pathway status when
    its observed TPM compares favourably (>= if the pathway's 'up_is_good' is true, < otherwise)
    against both the GTEx and the TCGA-normal reference value.  The status text is chosen by
    whether at least 75% of the pathway's genes count.

    :param toil.fileStore.FileID gene_expression: fsID for the rsem gene expression file
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict reports_options: Options specific to reporting modules
    :return: The fsID for the itx resistance report file
    :rtype: toil.fileStore.FileID
    """
    def _at_least(observed, reference):
        return observed >= reference

    def _below(observed, reference):
        return observed < reference

    work_dir = os.getcwd()
    tumor_type = univ_options['tumor_type']

    # Get the input files
    input_files = {
        'rsem_quant.tsv': gene_expression,
        'itx_resistance.tsv.tar.gz': reports_options['itx_resistance_file'],
        'immune_resistance_pathways.json.tar.gz': reports_options['immune_resistance_pathways_file']
    }
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    input_files['itx_resistance.tsv'] = untargz(
        input_files['itx_resistance.tsv.tar.gz'], work_dir)
    input_files['immune_resistance_pathways.json'] = untargz(
        input_files['immune_resistance_pathways.json.tar.gz'], work_dir)

    full_data = pd.read_table(input_files['itx_resistance.tsv'], index_col=0)
    # Read pathways descriptions and cancer pathway data
    with open(input_files['immune_resistance_pathways.json']) as json_file:
        json_data = json.load(json_file)

    # Read patient file.  Only one of `sep`/`delimiter` may be given (newer pandas raises when
    # both are passed); the file is tab separated.
    patient_df = pd.read_csv(input_files['rsem_quant.tsv'], sep='\t', header='infer',
                             index_col=0)
    # Drop everything from the first '.' in each identifier.  Done with plain string splitting so
    # the result does not depend on pandas' `str.replace` regex default.
    patient_df.index = patient_df.index.map(lambda gene_id: gene_id.split('.', 1)[0])

    with open('immunotherapy_resistance_report.txt', 'w') as report_file:
        # Check if data exists for specified tumor type
        try:
            pathways = json_data['Cancer_to_pathway'][tumor_type]
        except KeyError:
            print('Data not available for ' + tumor_type, file=report_file)
        else:
            # If data exists, write a report
            for pathway in pathways:
                pathway_data = json_data['Pathways'][pathway]
                comp_fn = _at_least if pathway_data['up_is_good'] else _below

                # Describe pathway and genes for it
                print('Pathway: ' + pathway + '\n', file=report_file)
                print('Papers: ' + pathway_data['paper'], file=report_file)
                description = pathway_data['description']
                print('Description of pathway:\n' + textwrap.fill(description, width=100),
                      file=report_file)
                print('Pathway genes: ', file=report_file)
                print('\t{:10}{:<20}{:<20}{:<12}'.format(
                    'Gene', 'GTEX Median', 'TCGA N Median', 'Observed'), file=report_file)

                status = []
                # Write TCGA, GTEX, and observed values
                for gene in pathway_data['genes']:
                    if gene in full_data.index:
                        gtex = '{0:.2f}'.format(
                            float(full_data.loc[gene, TCGAToGTEx[tumor_type]]))
                        tcga = '{0:.2f}'.format(
                            float(full_data.loc[gene, tumor_type + ' normal']))
                    else:
                        gtex = tcga = 'NA'
                    tpm_value = '{0:.2f}'.format(float(patient_df.loc[gene, 'TPM'])) \
                        if gene in patient_df.index else 'NA'
                    ensg = pathway_data['genes'][gene]
                    print('\t{:10}{:<20}{:<20}{:<12}'.format(
                        ensg, gtex, tcga, tpm_value), file=report_file)
                    # gtex and tcga are 'NA' together, so checking gtex covers both.  The
                    # comparison uses the values as printed (two decimals).
                    if gtex != 'NA' and tpm_value != 'NA':
                        tcga_bool = comp_fn(float(tpm_value), float(tcga))
                        gtex_bool = comp_fn(float(tpm_value), float(gtex))
                        status.append(tcga_bool and gtex_bool)
                    else:
                        status.append(False)

                # Based on the number of genes with expression values above normal, assess the
                # status
                print('Status: ' + pathway_data['status'][str(
                    sum(status) >= 0.75 * len(status))] + '\n', file=report_file)

    output_file = job.fileStore.writeGlobalFile(report_file.name)
    export_results(job, output_file, report_file.name, univ_options, subfolder='reports')

    job.fileStore.logToMaster(
        'Ran create immunotherapy resistance report on %s successfully' % univ_options['patient'])
    return output_file
def boost_ranks(job, isoform_expression, merged_mhc_calls, transgene_out, univ_options,
                rankboost_options):
    """
    Boost the ranks of the predicted peptides:MHC combinations.

    :param toil.fileStore.FileID isoform_expression: fsID of rsem isoform expression file
    :param dict merged_mhc_calls: Dict of results from merging mhc peptide binding predictions
    :param dict transgene_out: Dict of results from running Transgene
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict rankboost_options: Options specific to rankboost ('mhci_args', 'mhcii_args' and
           'version' are read)
    :return: Dict of concise and detailed results for mhci and mhcii.  A result the tool did not
             produce is recorded as None.  The keys 'mhci' and 'mhcii' are also present and map
             to empty dicts.
             output_files:
                 |- 'mhcii_rankboost_concise_results.tsv': fsID
                 |- 'mhcii_rankboost_detailed_results.txt': fsID
                 |- 'mhci_rankboost_concise_results.tsv': fsID
                 +- 'mhci_rankboost_detailed_results.txt': fsID
    :rtype: dict
    """
    work_dir = os.getcwd()
    input_files = get_files_from_filestore(
        job,
        {'rsem_quant.tsv': isoform_expression,
         'mhci_merged_files.tsv': merged_mhc_calls['mhci_merged_files.list'],
         'mhcii_merged_files.tsv': merged_mhc_calls['mhcii_merged_files.list'],
         'mhci_peptides.faa': transgene_out['transgened_tumor_10_mer_snpeffed.faa'],
         'mhcii_peptides.faa': transgene_out['transgened_tumor_15_mer_snpeffed.faa']},
        work_dir, docker=True)

    output_files = {}
    for mhc in ('mhci', 'mhcii'):
        # The ratios are passed as the repr of the option value with all single quotes removed.
        ratios = repr(rankboost_options[mhc + '_args']).replace("'", '')
        parameters = ['--' + mhc,
                      '--predictions', input_files[mhc + '_merged_files.tsv'],
                      '--expression', input_files['rsem_quant.tsv'],
                      '--peptides', input_files[mhc + '_peptides.faa'],
                      '--ratios', ratios]
        docker_call(tool='rankboost', tool_parameters=parameters, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'],
                    tool_version=rankboost_options['version'])
        output_files[mhc] = {}
        # Concise results first, then detailed.
        for suffix in ('_rankboost_concise_results.tsv', '_rankboost_detailed_results.txt'):
            result_path = work_dir + '/' + mhc + suffix
            result_name = os.path.basename(result_path)
            if os.path.exists(result_path):
                output_files[result_name] = job.fileStore.writeGlobalFile(result_path)
                export_results(job, output_files[result_name], result_path, univ_options,
                               subfolder='rankboost')
            else:
                output_files[result_name] = None
    job.fileStore.logToMaster('Ran boost_ranks on %s successfully' % univ_options['patient'])
    return output_files
def run_mutect_perchrom(job, tumor_bam, normal_bam, univ_options, mutect_options, chrom):
    """
    Run mutect on the tumor and normal DNA bams for a single chromosome.

    :param dict tumor_bam: Dict holding 'tumor_dna_fix_pg_sorted.bam' and its '.bai'
    :param dict normal_bam: Dict holding 'normal_dna_fix_pg_sorted.bam' and its '.bai'
    :param dict univ_options: Dict of universal options ('patient', 'dockerhub' and 'java_Xmx'
           are read)
    :param dict mutect_options: Options specific to mutect ('genome_fasta', 'genome_fai',
           'genome_dict', 'cosmic_vcf', 'cosmic_idx', 'dbsnp_vcf', 'dbsnp_idx' and 'java_Xmx'
           are read)
    :param str chrom: Chromosome name handed to mutect via -L
    :return: File store ID of the '<chrom>.vcf' written by mutect

    This module corresponds to node 12 on the tree
    """
    job.fileStore.logToMaster('Running mutect on %s:%s' % (univ_options['patient'], chrom))
    work_dir = os.getcwd()
    input_files = get_files_from_filestore(
        job,
        {'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
         'tumor.bam.bai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
         'normal.bam': normal_bam['normal_dna_fix_pg_sorted.bam'],
         'normal.bam.bai': normal_bam['normal_dna_fix_pg_sorted.bam.bai'],
         'genome.fa.tar.gz': mutect_options['genome_fasta'],
         'genome.fa.fai.tar.gz': mutect_options['genome_fai'],
         'genome.dict.tar.gz': mutect_options['genome_dict'],
         'cosmic.vcf.tar.gz': mutect_options['cosmic_vcf'],
         'cosmic.vcf.idx.tar.gz': mutect_options['cosmic_idx'],
         'dbsnp.vcf.gz': mutect_options['dbsnp_vcf'],
         'dbsnp.vcf.idx.tar.gz': mutect_options['dbsnp_idx']},
        work_dir, docker=False)
    # dbsnp.vcf should be bgzipped, but all others should be tar.gz'd
    input_files['dbsnp.vcf'] = gunzip(input_files['dbsnp.vcf.gz'])
    for tarred in ('genome.fa', 'genome.fa.fai', 'genome.dict', 'cosmic.vcf', 'cosmic.vcf.idx',
                   'dbsnp.vcf.idx'):
        input_files[tarred] = untargz(input_files[tarred + '.tar.gz'], work_dir)
    input_files = dict((name, docker_path(local)) for name, local in input_files.items())

    mutout = work_dir + '/' + chrom + '.out'
    mutvcf = work_dir + '/' + chrom + '.vcf'
    parameters = ['-R', input_files['genome.fa'],
                  '--cosmic', input_files['cosmic.vcf'],
                  '--dbsnp', input_files['dbsnp.vcf'],
                  '--input_file:normal', input_files['normal.bam'],
                  '--input_file:tumor', input_files['tumor.bam'],
                  # '--tumor_lod', str(10),
                  # '--initial_tumor_lod', str(4.0),
                  '-L', chrom,
                  '--out', docker_path(mutout),
                  '--vcf', docker_path(mutvcf)]
    print(parameters, file=sys.stderr)
    # The tool-specific heap setting wins; the universal one is the fallback.
    java_xmx = mutect_options['java_Xmx'] or univ_options['java_Xmx']
    docker_call(tool='mutect:1.1.7', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'], java_opts=java_xmx)
    export_results(job, mutvcf, univ_options, subfolder='mutations/mutect')
    return job.fileStore.writeGlobalFile(mutvcf)
def run_transgene(job, snpeffed_file, rna_bam, univ_options, transgene_options,
                  tumor_dna_bam=None, fusion_calls=None):
    """
    Run transgene on an input snpeffed vcf file and/or fusion calls and return the peptides for
    MHC prediction.

    :param toil.fileStore.FileID snpeffed_file: fsID for snpeffed vcf
    :param dict rna_bam: The dict of bams returned by running star
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict transgene_options: Options specific to Transgene
    :param dict tumor_dna_bam: The dict of bams returned by running bwa
    :param fusion_calls: File store ID of the fusion calls passed to transgene via --fusions
    :return: A dictionary of 9 files (9-, 10-, and 15-mer peptides each for Tumor and Normal and
             the corresponding .map files for the 3 Tumor fastas), or None if an expected
             peptide file was not produced
             output_files:
                 |- 'transgened_normal_10_mer_peptides.faa': fsID
                 |- 'transgened_normal_15_mer_peptides.faa': fsID
                 |- 'transgened_normal_9_mer_peptides.faa': fsID
                 |- 'transgened_tumor_10_mer_peptides.faa': fsID
                 |- 'transgened_tumor_10_mer_peptides.faa.map': fsID
                 |- 'transgened_tumor_15_mer_peptides.faa': fsID
                 |- 'transgened_tumor_15_mer_peptides.faa.map': fsID
                 |- 'transgened_tumor_9_mer_peptides.faa': fsID
                 +- 'transgened_tumor_9_mer_peptides.faa.map': fsID
    :rtype: dict
    """
    assert snpeffed_file or fusion_calls

    work_dir = os.getcwd()
    input_files = {
        'pepts.fa.tar.gz': transgene_options['gencode_peptide_fasta'],
        'annotation.gtf.tar.gz': transgene_options['gencode_annotation_gtf'],
        'genome.fa.tar.gz': transgene_options['genome_fasta']
    }

    if snpeffed_file is not None:
        input_files.update({'snpeffed_muts.vcf': snpeffed_file})
    if rna_bam:
        input_files.update({
            'rna.bam': rna_bam['rna_genome']['rna_genome_sorted.bam'],
            'rna.bam.bai': rna_bam['rna_genome']['rna_genome_sorted.bam.bai'],
        })
    if tumor_dna_bam is not None:
        input_files.update({
            'tumor_dna.bam': tumor_dna_bam['tumor_dna_fix_pg_sorted.bam'],
            'tumor_dna.bam.bai': tumor_dna_bam['tumor_dna_fix_pg_sorted.bam.bai'],
        })
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    input_files['pepts.fa'] = untargz(input_files['pepts.fa.tar.gz'], work_dir)
    input_files['genome.fa'] = untargz(input_files['genome.fa.tar.gz'], work_dir)
    input_files['annotation.gtf'] = untargz(input_files['annotation.gtf.tar.gz'], work_dir)
    input_files = {key: docker_path(path) for key, path in input_files.items()}

    parameters = [
        '--peptides', input_files['pepts.fa'],
        '--prefix', 'transgened',
        '--pep_lens', '9,10,15',
        '--cores', str(20),
        '--genome', input_files['genome.fa'],
        '--annotation', input_files['annotation.gtf'],
        '--log_file', '/data/transgene.log'
    ]
    if snpeffed_file is not None:
        parameters.extend(['--snpeff', input_files['snpeffed_muts.vcf']])
    if rna_bam:
        parameters.extend(['--rna_file', input_files['rna.bam']])
    if tumor_dna_bam is not None:
        parameters.extend(['--dna_file', input_files['tumor_dna.bam']])

    if fusion_calls:
        fusion_files = {
            'fusion_calls': fusion_calls,
            'transcripts.fa.tar.gz': transgene_options['gencode_transcript_fasta']
        }
        fusion_files = get_files_from_filestore(job, fusion_files, work_dir, docker=False)
        fusion_files['transcripts.fa'] = untargz(fusion_files['transcripts.fa.tar.gz'], work_dir)
        fusion_files = {key: docker_path(path) for key, path in fusion_files.items()}
        parameters += [
            '--transcripts', fusion_files['transcripts.fa'],
            '--fusions', fusion_files['fusion_calls']
        ]

    try:
        docker_call(tool='transgene', tool_parameters=parameters, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'],
                    tool_version=transgene_options['version'])
    finally:
        # Export the log whether or not the tool succeeded.  If the tool died before creating
        # the log, skip the export so that the original failure is the one that propagates.
        logfile = os.path.join(work_dir, 'transgene.log')
        if os.path.exists(logfile):
            export_results(job, job.fileStore.writeGlobalFile(logfile), logfile, univ_options,
                           subfolder='mutations/transgened')

    output_files = defaultdict()
    peptides_not_found = False
    for peplen in ['9', '10', '15']:
        for tissue_type in ['tumor', 'normal']:
            pepfile = '_'.join(['transgened', tissue_type, peplen, 'mer_peptides.faa'])
            # Backwards compatibility for old transgene output
            old_pepfile = '_'.join(['transgened', tissue_type, peplen, 'mer_snpeffed.faa'])
            if os.path.exists(os.path.join(work_dir, old_pepfile)):
                os.rename(os.path.join(work_dir, old_pepfile), os.path.join(work_dir, pepfile))
                if tissue_type == 'tumor':
                    # Only tumor fastas come with a .map file
                    os.rename(os.path.join(work_dir, old_pepfile + '.map'),
                              os.path.join(work_dir, pepfile + '.map'))
            if not os.path.exists(os.path.join(work_dir, pepfile)):
                peptides_not_found = True
                break
            output_files[pepfile] = job.fileStore.writeGlobalFile(os.path.join(work_dir, pepfile))
            export_results(job, output_files[pepfile], pepfile, univ_options,
                           subfolder='peptides')
        if peptides_not_found:
            break
        mapfile = '_'.join(['transgened_tumor', peplen, 'mer_peptides.faa.map'])
        output_files[mapfile] = job.fileStore.writeGlobalFile(os.path.join(work_dir, mapfile))
        export_results(job, output_files[mapfile], mapfile, univ_options, subfolder='peptides')

    if snpeffed_file:
        # There won't be an output vcf if there's no input
        os.rename('transgened_transgened.vcf', 'mutations.vcf')
        export_results(job, job.fileStore.writeGlobalFile('mutations.vcf'), 'mutations.vcf',
                       univ_options, subfolder='mutations/transgened')
    if fusion_calls:
        # There won't be an output bedpe if there's no input
        os.rename('transgened_transgened.bedpe', 'fusions.bedpe')
        export_results(job, job.fileStore.writeGlobalFile('fusions.bedpe'), 'fusions.bedpe',
                       univ_options, subfolder='mutations/transgened')

    if peptides_not_found:
        job.fileStore.logToMaster(
            'Transgene failed to find any peptides for %s.' % univ_options['patient'])
        return None
    else:
        job.fileStore.logToMaster('Ran transgene on %s successfully' % univ_options['patient'])
        return output_files
def run_fusion(job, fastqs, junction_file, univ_options, star_fusion_options,
               fusion_inspector_options):
    """
    Run STAR-Fusion and filter the fusion calls using FusionInspector.

    :param tuple fastqs: RNA-Seq FASTQ Filestore IDs
    :param toil.fileStore.FileID junction_file: Chimeric junction file
    :param dict univ_options: universal arguments used by almost all tools
    :param dict star_fusion_options: STAR-Fusion specific parameters ('index', 'n' and 'version'
           are read)
    :param dict fusion_inspector_options: FusionInspector specific parameters ('run_trinity' and
           'version' are read)
    :return: Transgene BEDPE file (the promised return value of the reformatting child job), or
             None if no fusion passed the filters
    :rtype: toil.fileStore.FileID
    """
    work_dir = job.fileStore.getLocalTempDir()

    input_files = {
        'rna_1.fq.gz': fastqs[0],
        'rna_2.fq.gz': fastqs[1],
        'tool_index.tar.gz': star_fusion_options['index']
    }

    parameters = []
    # If there isn't a junction file, then we can run STAR-Fusion from the fastq files
    if junction_file:
        input_files['STAR.junction'] = junction_file
        parameters.extend(['--chimeric_junction', '/data/STAR.junction'])
    else:
        parameters.extend([
            '--left_fq', '/data/rna_1.fq.gz',
            '--right_fq', '/data/rna_2.fq.gz'
        ])

    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    input_files['tool_index'] = os.path.basename(
        untargz(input_files['tool_index.tar.gz'], work_dir))

    cores = star_fusion_options['n']
    parameters.extend([
        '--output_dir', '/data/fusion-output',
        '--genome_lib_dir', input_files['tool_index'],
        '--CPU', str(cores)
    ])

    docker_call(tool='star-fusion', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'],
                tool_version=star_fusion_options['version'])

    star_output = 'fusion-output/star-fusion.fusion_candidates.final.abridged'
    fusion_path = os.path.join(work_dir, star_output)

    # Export the STAR-Fusion predictions
    export_results(job, job.fileStore.writeGlobalFile(fusion_path),
                   'star-fusion-predictions.tsv', univ_options, subfolder='mutations/fusions')

    # Check for fusion prediction: anything beyond the header line counts.  `next` with a
    # default works on Python 2 and 3 and also copes with a completely empty file.
    with open(fusion_path, 'r') as f:
        next(f, None)  # Skip header
        if next(f, None) is None:
            logging.warning('%s: Did not find any fusions!' % univ_options['patient'])
            return

    parameters = [
        '--fusions', '/data/%s' % star_output,
        '--genome_lib', input_files['tool_index'],
        '--left_fq', '/data/rna_1.fq.gz',
        '--right_fq', '/data/rna_2.fq.gz',
        '--out_dir', '/data/FusionInspector',
        '--out_prefix', 'FusionInspector',
        '--CPU', str(cores)
    ]
    if fusion_inspector_options['run_trinity']:
        parameters.append('--include_Trinity')

    docker_call(tool='fusion-inspector', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'],
                tool_version=fusion_inspector_options['version'])

    found_fusion = False
    inspector_output = 'FusionInspector/FusionInspector.fusion_predictions.final.abridged.FFPM'
    fusion_path = os.path.join(work_dir, inspector_output)
    output_path = os.path.join(work_dir, 'fusion.final')

    if os.path.exists(fusion_path):
        # Export the FusionInspector predictions.  This sits inside the existence check because
        # writing a missing file to the file store would fail before the check was reached.
        export_results(job, job.fileStore.writeGlobalFile(fusion_path),
                       'fusion-inspector-predictions.tsv', univ_options,
                       subfolder='mutations/fusions')

        # Remove fusions without a large anchor sequence and at least 0.1
        # fusion fragments per million reads
        with open(fusion_path, 'r') as f, open(output_path, 'w') as g:
            g.write(next(f, ''))  # Copy the header
            for line in f:
                fields = line.strip().split()
                # Check for a large anchor support
                ldas = fields[10]
                assert ldas in {'YES', 'NO'}, 'FusionInpsector file is malformed!'
                j_ffpm, s_ffpm = fields[-2:]
                # Fusions without a larger anchor support or low read support
                # are suspicious and should not be consider for further analysis
                if ldas == 'YES' and sum([float(j_ffpm), float(s_ffpm)]) > 0.1:
                    found_fusion = True
                    g.write(line)

    if found_fusion:
        fusion_bed_f = 'FusionInspector/FusionInspector.bed'
        fusion_bed_path = os.path.join(work_dir, fusion_bed_f)
        transcript_f = 'FusionInspector/FusionInspector.gmap_trinity_GG.fusions.fasta'
        transcript_path = os.path.join(work_dir, transcript_f)
        transcript_gff_f = 'FusionInspector/FusionInspector.gmap_trinity_GG.fusions.gff3'
        transcript_gff_path = os.path.join(work_dir, transcript_gff_f)

        # The assembled transcripts and their annotation are optional outputs
        transcripts = None
        transcript_annotation = None
        if os.path.exists(transcript_path):
            transcripts = job.fileStore.writeGlobalFile(transcript_path)
            export_results(job, transcripts, transcript_path, univ_options,
                           subfolder='mutations/fusions')
        if os.path.exists(transcript_gff_path):
            transcript_annotation = job.fileStore.writeGlobalFile(transcript_gff_path)
            export_results(job, transcript_annotation, transcript_gff_path, univ_options,
                           subfolder='mutations/fusions')

        fusion_annotation = job.fileStore.writeGlobalFile(fusion_bed_path)
        filtered_fusions = job.fileStore.writeGlobalFile(output_path)
        export_results(job, filtered_fusions, output_path, univ_options,
                       subfolder='mutations/fusions')
        job.fileStore.logToMaster('Ran STAR-Fusion on %s successfully' % univ_options['patient'])
        return job.addChildJobFn(reformat_star_fusion_output,
                                 fusion_annotation,
                                 filtered_fusions,
                                 transcripts,
                                 transcript_annotation,
                                 univ_options).rv()
    else:
        job.fileStore.logToMaster('No fusions detected for %s' % univ_options['patient'])
def reformat_star_fusion_output(job, fusion_annot, fusion_file, transcript_file,
                                transcript_gff_file, univ_options):
    """
    Rewrite the filtered STAR-Fusion predictions as a Transgene-style BEDPE file.

    One BEDPE row is written per (fusion, assembled transcript) pair. When no transcript
    sequences are available for a fusion, a single row with '.' as both junction sequences
    is written instead.

    :param toil.fileStore.FileID fusion_annot: Fusion annotation
    :param toil.fileStore.FileID fusion_file: STAR-fusion prediction file
    :param toil.fileStore.FileID transcript_file: Fusion transcript FASTA file
    :param toil.fileStore.FileID transcript_gff_file: Fusion transcript GFF file
    :param dict univ_options: universal arguments used by almost all tools
    :return: Transgene BEDPE file
    :rtype: toil.fileStore.FileID
    """
    # The assembled transcripts are only usable when both the FASTA and the GFF exist.
    have_transcripts = bool(transcript_file and transcript_gff_file)
    work_dir = job.fileStore.getLocalTempDir()

    to_fetch = {'results.tsv': fusion_file,
                'fusion.bed': fusion_annot}
    if have_transcripts:
        to_fetch['transcripts.fa'] = transcript_file
        to_fetch['transcripts.gff'] = transcript_gff_file
    input_files = get_files_from_filestore(job, to_fetch, work_dir, docker=False)

    hugo_to_gene_ids = get_gene_ids(input_files['fusion.bed'])

    if have_transcripts:
        transcripts = get_transcripts(input_files['transcripts.fa'])
        five_pr_splits, three_pr_splits = split_fusion_transcript(
            input_files['transcripts.gff'], transcripts)
    else:
        five_pr_splits = collections.defaultdict(dict)
        three_pr_splits = collections.defaultdict(dict)

    # Column names of the BEDPE file
    header = ['# chr1', 'start1', 'end1',
              'chr2', 'start2', 'end2',
              'name', 'score',
              'strand1', 'strand2',
              'junctionSeq1', 'junctionSeq2',
              'hugo1', 'hugo2']

    output_path = os.path.join(work_dir, 'fusion_results.bedpe')
    with open(input_files['results.tsv'], 'r') as in_f, open(output_path, 'w') as out_f:
        writer = csv.writer(out_f, delimiter='\t')
        writer.writerow(header)
        for record in parse_star_fusion(in_f):
            left_chr, left_break, left_strand = record.LeftBreakpoint.split(':')
            right_chr, right_break, right_strand = record.RightBreakpoint.split(':')

            fusion = record.LeftGene + '--' + record.RightGene
            name = hugo_to_gene_ids[record.LeftGene] + '-' + hugo_to_gene_ids[record.RightGene]
            score = 'Junction:%s-Spanning:%s' % (record.JunctionReadCount,
                                                 record.SpanningFragCount)

            # Add placeholder sequences in case Trinity didn't output any for this fusion
            five_prime = five_pr_splits[fusion]
            if not five_prime:
                five_prime['N/A'] = '.'
            three_prime = three_pr_splits[fusion]
            if not three_prime:
                three_prime['N/A'] = '.'

            for transcript_id, five_prime_seq in five_prime.items():
                three_prime_seq = three_prime[transcript_id]
                writer.writerow([left_chr,
                                 '.',  # Donor start position is not necessary
                                 left_break,
                                 right_chr,
                                 right_break,
                                 '.',  # Acceptor end position is not necessary
                                 name,
                                 score,
                                 left_strand,
                                 right_strand,
                                 five_prime_seq,
                                 three_prime_seq,
                                 record.LeftGene,
                                 record.RightGene])

    bedpe_id = job.fileStore.writeGlobalFile(output_path)
    export_results(job, bedpe_id, 'fusion.bedpe', univ_options, subfolder='mutations/fusions')
    job.fileStore.logToMaster('Reformatted STAR-Fusion output for %s successfully'
                              % univ_options['patient'])
    return bedpe_id
def assess_mhc_genes(job, isoform_expression, rna_haplotype, univ_options, mhc_genes_options):
    """
    Assess the prevalence of the various genes in the MHC pathway and write a plain-text report
    with one fixed-width table (Gene, Threshold, Observed, Result) per pathway section.

    :param isoform_expression: Isoform expression from run_rsem
    :param rna_haplotype: PHLAT output from running on rna
    :param univ_options: Universal options for the pipeline
    :param mhc_genes_options: options specific to this module
    :return: fsID of the report written to the job store
    """
    job.fileStore.logToMaster('Running mhc gene assessment on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    input_files = {
        'rsem_quant.tsv': isoform_expression,
        'rna_haplotype.sum': rna_haplotype,
        'mhc_genes.json.tar.gz': mhc_genes_options['genes_file']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    input_files['mhc_genes.json'] = untargz(input_files['mhc_genes.json.tar.gz'], work_dir)

    # Read in the MHC genes
    with open(input_files['mhc_genes.json']) as mhc_file:
        mhc_genes = json.load(mhc_file)

    # Parse the rna phlat file
    with open(input_files['rna_haplotype.sum']) as rna_mhc:
        mhc_alleles = {'HLA_A': [], 'HLA_B': [], 'HLA_C': [], 'HLA_DPA': [], 'HLA_DQA': [],
                       'HLA_DPB': [], 'HLA_DQB': [], 'HLA_DRB': []}
        mhc_alleles = parse_phlat_file(rna_mhc, mhc_alleles)

    # Sum the TPM of every isoform into its parent gene
    gene_expressions = Counter()
    with open(input_files['rsem_quant.tsv']) as rsem_file:
        header = rsem_file.readline().strip().split()
        assert header == ['transcript_id', 'gene_id', 'length', 'effective_length',
                          'expected_count', 'TPM', 'FPKM', 'IsoPct'], \
            'Unexpected header in the rsem isoform expression file'
        for line in rsem_file:
            fields = line.strip().split()
            gene_expressions[fields[1]] += float(fields[5])

    row_format = "{:12}{:<12}{:<12}{:12}"
    # FIXME This is hardcoded for now. We need to change this.
    hla_dra_gene = 'ENSG00000204287.9'
    hla_dra_threshold = 69.37

    def _allele_row(allele):
        # Two called alleles pass, a single one is LOW and none at all is a FAIL
        num_alleles = len(mhc_alleles[allele])
        result = 'FAIL' if num_alleles == 0 else 'LOW' if num_alleles == 1 else 'PASS'
        return row_format.format(allele, 2, num_alleles, result)

    report_path = os.path.join(work_dir, 'mhc_pathway_report.txt')
    with open(report_path, 'w') as mpr:
        for section in mhc_genes:
            print(section.center(48, ' '), file=mpr)
            print(row_format.format("Gene", "Threshold", "Observed", "Result"), file=mpr)
            if section == 'MHCI loading':
                for mhci_allele in ('HLA_A', 'HLA_B', 'HLA_C'):
                    print(_allele_row(mhci_allele), file=mpr)
            elif section == 'MHCII loading':
                # TODO DP alleles
                for mhcii_allele in ('HLA_DQA', 'HLA_DQB', 'HLA_DRA', 'HLA_DRB'):
                    if mhcii_allele != 'HLA_DRA':
                        print(_allele_row(mhcii_allele), file=mpr)
                    else:
                        # HLA_DRA is judged on expression rather than on called alleles.
                        # The threshold goes in the "Threshold" column and the measured TPM
                        # in the "Observed" column, like every other row of the table.
                        observed = gene_expressions[hla_dra_gene]
                        result = 'LOW' if observed <= hla_dra_threshold else 'PASS'
                        print(row_format.format('HLA_DRA', hla_dra_threshold, observed, result),
                              file=mpr)
            for gene, ensgene, first_quart in mhc_genes[section]:
                threshold = float(first_quart)
                result = 'LOW' if gene_expressions[ensgene] <= threshold else 'PASS'
                print(row_format.format(gene, threshold, gene_expressions[ensgene], result),
                      file=mpr)
            print('', file=mpr)

    output_file = job.fileStore.writeGlobalFile(mpr.name)
    export_results(job, output_file, mpr.name, univ_options, subfolder='reports')
    return output_file
def merge_mhc_peptide_calls(job, antigen_predictions, transgened_files, univ_options):
    """
    Merge all the calls generated by spawn_antigen_predictors into one file per MHC class.

    :param dict antigen_predictions: The return value from running :meth:`spawn_antigen_predictors`
    :param dict transgened_files: The transgened peptide files
    :param dict univ_options: Universal options for ProTECT
    :return: merged binding predictions
             output_files:
                 |- 'mhcii_merged_files.list': fsID
                 +- 'mhci_merged_files.list': fsID
    :rtype: dict
    :raises RuntimeError: If neither the MHCI nor the MHCII predictions produced any call
    """
    job.fileStore.logToMaster('Merging MHC calls')
    work_dir = os.getcwd()
    pept_files = get_files_from_filestore(
        job,
        {'10_mer.faa': transgened_files['transgened_tumor_10_mer_snpeffed.faa'],
         '10_mer.faa.map': transgened_files['transgened_tumor_10_mer_snpeffed.faa.map'],
         '15_mer.faa': transgened_files['transgened_tumor_15_mer_snpeffed.faa'],
         '15_mer.faa.map': transgened_files['transgened_tumor_15_mer_snpeffed.faa.map']},
        work_dir)
    mhci_preds, mhcii_preds = antigen_predictions
    mhci_called = False
    mhcii_called = False

    # ---- MHCI: the 10-mer peptides and their map are needed to name the calls ----
    peptides = read_peptide_file(pept_files['10_mer.faa'])
    with open(pept_files['10_mer.faa.map'], 'r') as mapfile:
        pepmap = json.load(mapfile)
    mhci_path = '/'.join([work_dir, 'mhci_merged_files.list'])
    with open(mhci_path, 'w') as mhci_resfile:
        for key in mhci_preds:
            call = mhci_preds[key]
            tumor_file = job.fileStore.readGlobalFile(call['tumor'])
            with open(tumor_file) as t_f:
                # NOTE(review): eval() executes whatever the prediction file contains. This is
                # only acceptable while that file is produced by this pipeline itself.
                tumor_df = pandas.read_json(eval(t_f.read()))
            if tumor_df.empty:
                continue
            mhci_called = True
            # TODO: There must be a better way of doing this
            normal_df = _process_mhci(job.fileStore.readGlobalFile(call['normal']), normal=True)
            normal_dict = normal_df.set_index('pept')['tumor_pred']
            tumor_df['normal_pred'] = [normal_dict[pept] for pept in tumor_df['normal_pept']]
            for pred in tumor_df.itertuples():
                print_mhc_peptide(pred, peptides, pepmap, mhci_resfile)

    # ---- MHCII: same procedure with the 15-mer peptides ----
    peptides = read_peptide_file(pept_files['15_mer.faa'])
    with open(pept_files['15_mer.faa.map'], 'r') as mapfile:
        pepmap = json.load(mapfile)
    # Incorporate peptide names into the merged calls
    mhcii_path = '/'.join([work_dir, 'mhcii_merged_files.list'])
    with open(mhcii_path, 'w') as mhcii_resfile:
        for key in mhcii_preds:
            call = mhcii_preds[key]
            predictor = call['predictor']
            if predictor is None:
                continue
            mhcii_called = True
            tumor_file = job.fileStore.readGlobalFile(call['tumor'])
            with open(tumor_file) as t_f:
                # NOTE(review): see the eval() remark in the MHCI section.
                tumor_df = pandas.read_json(eval(t_f.read()))
            if tumor_df.empty:
                continue
            # TODO: There must be a better way of doing this
            # Each predictor writes its own format, so pick the matching parser.
            if predictor == 'Consensus':
                parse_normal = _process_consensus_mhcii
            elif predictor == 'Sturniolo':
                parse_normal = _process_sturniolo_mhcii
            elif predictor == 'netMHCIIpan':
                parse_normal = _process_net_mhcii
            else:
                assert False
            normal_df = parse_normal(job.fileStore.readGlobalFile(call['normal'][0]),
                                     normal=True)
            normal_dict = normal_df.set_index('pept')['tumor_pred']
            tumor_df['normal_pred'] = [normal_dict[pept] for pept in tumor_df['normal_pept']]
            is_netmhc = predictor == 'netMHCIIpan'
            for pred in tumor_df.itertuples():
                print_mhc_peptide(pred, peptides, pepmap, mhcii_resfile, netmhc=is_netmhc)

    if not mhci_called and not mhcii_called:
        raise RuntimeError('No peptides available for ranking')

    output_files = defaultdict()
    for mhc_file in (mhci_path, mhcii_path):
        file_name = os.path.basename(mhc_file)
        output_files[file_name] = job.fileStore.writeGlobalFile(mhc_file)
        export_results(job, output_files[file_name], mhc_file, univ_options,
                       subfolder='binding_predictions')
    return output_files
def run_mutect_perchrom(job, tumor_bam, normal_bam, univ_options, mutect_options, chrom):
    """
    Run MuTect on the tumor and normal DNA bams for a single chromosome.

    :param dict tumor_bam: REFER ARGUMENTS of spawn_mutect()
    :param dict normal_bam: REFER ARGUMENTS of spawn_mutect()
    :param dict univ_options: REFER ARGUMENTS of spawn_mutect()
    :param dict mutect_options: REFER ARGUMENTS of spawn_mutect()
    :param str chrom: String containing chromosome name with chr appended
    :return: Job store ID of the MuTect vcf for the chromosome (the '.out' file is written
             to the working directory but is not stored)

    This module corresponds to node 12 on the tree
    """
    job.fileStore.logToMaster('Running mutect on %s:%s' % (univ_options['patient'], chrom))
    work_dir = os.getcwd()
    to_fetch = {
        'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
        'tumor.bam.bai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
        'normal.bam': normal_bam['normal_dna_fix_pg_sorted.bam'],
        'normal.bam.bai': normal_bam['normal_dna_fix_pg_sorted.bam.bai'],
        'genome.fa.tar.gz': mutect_options['genome_fasta'],
        'genome.fa.fai.tar.gz': mutect_options['genome_fai'],
        'genome.dict.tar.gz': mutect_options['genome_dict'],
        'cosmic.vcf.tar.gz': mutect_options['cosmic_vcf'],
        'cosmic.vcf.idx.tar.gz': mutect_options['cosmic_idx'],
        'dbsnp.vcf.gz': mutect_options['dbsnp_vcf'],
        'dbsnp.vcf.idx.tar.gz': mutect_options['dbsnp_idx']}
    local_files = get_files_from_filestore(job, to_fetch, work_dir, docker=False)

    # dbsnp.vcf should be bgzipped, but all others should be tar.gz'd
    local_files['dbsnp.vcf'] = gunzip(local_files['dbsnp.vcf.gz'])
    tarred_inputs = ('genome.fa', 'genome.fa.fai', 'genome.dict', 'cosmic.vcf',
                     'cosmic.vcf.idx', 'dbsnp.vcf.idx')
    for name in tarred_inputs:
        local_files[name] = untargz(local_files[name + '.tar.gz'], work_dir)

    # From here on the paths are the ones seen from inside the container
    input_files = {name: docker_path(path) for name, path in local_files.items()}

    mutout = ''.join([work_dir, '/', chrom, '.out'])
    mutvcf = ''.join([work_dir, '/', chrom, '.vcf'])
    parameters = ['-R', input_files['genome.fa'],
                  '--cosmic', input_files['cosmic.vcf'],
                  '--dbsnp', input_files['dbsnp.vcf'],
                  '--input_file:normal', input_files['normal.bam'],
                  '--input_file:tumor', input_files['tumor.bam'],
                  # '--tumor_lod', str(10),
                  # '--initial_tumor_lod', str(4.0),
                  '-L', chrom,
                  '--out', docker_path(mutout),
                  '--vcf', docker_path(mutvcf)]

    # A tool specific java heap size takes precedence over the universal one
    java_xmx = mutect_options['java_Xmx'] or univ_options['java_Xmx']
    docker_call(tool='mutect:1.1.7', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'], java_opts=java_xmx)

    output_file = job.fileStore.writeGlobalFile(mutvcf)
    export_results(job, output_file, mutvcf, univ_options, subfolder='mutations/mutect')
    return output_file
def run_transgene(job, snpeffed_file, rna_bam, univ_options, transgene_options):
    """
    Run transgene on the snpeffed vcf from the aggregator and produce the peptides for MHC
    prediction.

    :param snpeffed_file: <JSid for snpeffed vcf>
    :param dict rna_bam: Dict holding the rna bam and bai under the key
                         'rnaAligned.sortedByCoord.out.bam'
    :param dict univ_options: Dict of universal arguments used by almost all tools
                univ_options
                    +- 'dockerhub': <dockerhub to use>
    :param dict transgene_options: Dict of parameters specific to transgene
                transgene_options
                    +- 'gencode_peptide_fasta': <JSid for the gencode protein fasta>
    :return: Dict of transgened n-mer peptide fastas and their map files
                output_files
                    |- 'transgened_tumor_9_mer_snpeffed.faa': <JSid>
                    |- 'transgened_tumor_9_mer_snpeffed.faa.map': <JSid>
                    |- 'transgened_tumor_10_mer_snpeffed.faa': <JSid>
                    |- 'transgened_tumor_10_mer_snpeffed.faa.map': <JSid>
                    |- 'transgened_tumor_15_mer_snpeffed.faa': <JSid>
                    +- 'transgened_tumor_15_mer_snpeffed.faa.map': <JSid>

    This module corresponds to node 17 on the tree
    """
    job.fileStore.logToMaster('Running transgene on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    rna_files = rna_bam['rnaAligned.sortedByCoord.out.bam']
    to_fetch = {
        'snpeffed_muts.vcf': snpeffed_file,
        'rna.bam': rna_files['rna_fix_pg_sorted.bam'],
        'rna.bam.bai': rna_files['rna_fix_pg_sorted.bam.bai'],
        'pepts.fa.tar.gz': transgene_options['gencode_peptide_fasta']}
    local_files = get_files_from_filestore(job, to_fetch, work_dir, docker=False)
    local_files['pepts.fa'] = untargz(local_files['pepts.fa.tar.gz'], work_dir)
    # Translate the host paths into the paths seen from inside the container
    input_files = {name: docker_path(path) for name, path in local_files.items()}

    parameters = ['--peptides', input_files['pepts.fa'],
                  '--snpeff', input_files['snpeffed_muts.vcf'],
                  '--rna_file', input_files['rna.bam'],
                  '--prefix', 'transgened',
                  '--pep_lens', '9,10,15']
    docker_call(tool='transgene', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])

    output_files = defaultdict()
    for peplen in ('9', '10', '15'):
        peptfile = 'transgened_tumor_' + peplen + '_mer_snpeffed.faa'
        mapfile = 'transgened_tumor_' + peplen + '_mer_snpeffed.faa.map'
        # Store both files before exporting either of them
        output_files[peptfile] = job.fileStore.writeGlobalFile(os.path.join(work_dir, peptfile))
        output_files[mapfile] = job.fileStore.writeGlobalFile(os.path.join(work_dir, mapfile))
        for produced in (peptfile, mapfile):
            export_results(job, output_files[produced], produced, univ_options,
                           subfolder='peptides')

    # The vcf written by transgene is exported under a friendlier name
    os.rename('transgened_transgened.vcf', 'mutations.vcf')
    mutations_fsid = job.fileStore.writeGlobalFile('mutations.vcf')
    export_results(job, mutations_fsid, 'mutations.vcf', univ_options,
                   subfolder='mutations/transgened')
    return output_files
def run_transgene(job, snpeffed_file, rna_bam, univ_options, transgene_options,
                  tumor_dna_bam=None, fusion_calls=None):
    """
    Run transgene on an input snpeffed vcf file and return the peptides for MHC prediction.

    :param toil.fileStore.FileID snpeffed_file: fsID for snpeffed vcf
    :param dict rna_bam: The dict of bams returned by running star
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict transgene_options: Options specific to Transgene
    :param dict tumor_dna_bam: The dict of bams returned by running bwa
    :param fusion_calls: Optional fusion calls. When given, they are passed to transgene along
           with the transcript fasta, annotation gtf and genome fasta from transgene_options
    :return: A dictionary of 9 files (9-, 10-, and 15-mer peptides each for Tumor and Normal and the
             corresponding .map files for the 3 Tumor fastas)
             output_files:
                 |- 'transgened_normal_10_mer_snpeffed.faa': fsID
                 |- 'transgened_normal_15_mer_snpeffed.faa': fsID
                 |- 'transgened_normal_9_mer_snpeffed.faa': fsID
                 |- 'transgened_tumor_10_mer_snpeffed.faa': fsID
                 |- 'transgened_tumor_10_mer_snpeffed.faa.map': fsID
                 |- 'transgened_tumor_15_mer_snpeffed.faa': fsID
                 |- 'transgened_tumor_15_mer_snpeffed.faa.map': fsID
                 |- 'transgened_tumor_9_mer_snpeffed.faa': fsID
                 +- 'transgened_tumor_9_mer_snpeffed.faa.map': fsID
    :rtype: dict
    """
    work_dir = os.getcwd()
    use_dna = tumor_dna_bam is not None

    to_fetch = {
        'snpeffed_muts.vcf': snpeffed_file,
        'rna.bam': rna_bam['rna_genome']['rna_genome_sorted.bam'],
        'rna.bam.bai': rna_bam['rna_genome']['rna_genome_sorted.bam.bai'],
        'pepts.fa.tar.gz': transgene_options['gencode_peptide_fasta']}
    if use_dna:
        to_fetch['tumor_dna.bam'] = tumor_dna_bam['tumor_dna_fix_pg_sorted.bam']
        to_fetch['tumor_dna.bam.bai'] = tumor_dna_bam['tumor_dna_fix_pg_sorted.bam.bai']
    local_files = get_files_from_filestore(job, to_fetch, work_dir, docker=False)
    local_files['pepts.fa'] = untargz(local_files['pepts.fa.tar.gz'], work_dir)
    # Translate the host paths into the paths seen from inside the container
    input_files = {name: docker_path(path) for name, path in local_files.items()}

    parameters = ['--peptides', input_files['pepts.fa'],
                  '--snpeff', input_files['snpeffed_muts.vcf'],
                  '--rna_file', input_files['rna.bam'],
                  '--prefix', 'transgened',
                  '--pep_lens', '9,10,15',
                  '--cores', str(transgene_options['n'])]
    if use_dna:
        parameters += ['--dna_file', input_files['tumor_dna.bam']]

    if fusion_calls:
        # Fusions need the transcriptome, its annotation and the genome on top of the calls
        fusion_files = get_files_from_filestore(
            job,
            {'fusion_calls': fusion_calls,
             'transcripts.fa.tar.gz': transgene_options['gencode_transcript_fasta'],
             'annotation.gtf.tar.gz': transgene_options['gencode_annotation_gtf'],
             'genome.fa.tar.gz': transgene_options['genome_fasta']},
            work_dir, docker=False)
        for untarred in ('transcripts.fa', 'genome.fa', 'annotation.gtf'):
            fusion_files[untarred] = untargz(fusion_files[untarred + '.tar.gz'], work_dir)
        fusion_files = {name: docker_path(path) for name, path in fusion_files.items()}
        parameters.extend(['--transcripts', fusion_files['transcripts.fa'],
                           '--fusions', fusion_files['fusion_calls'],
                           '--genome', fusion_files['genome.fa'],
                           '--annotation', fusion_files['annotation.gtf']])

    docker_call(tool='transgene', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'], tool_version=transgene_options['version'])

    def _store_and_export(file_name):
        # Save one transgene output in the job store and export it with the peptides
        output_files[file_name] = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, file_name))
        export_results(job, output_files[file_name], file_name, univ_options,
                       subfolder='peptides')

    output_files = defaultdict()
    for peplen in ('9', '10', '15'):
        for tissue_type in ('tumor', 'normal'):
            _store_and_export('_'.join(['transgened', tissue_type, peplen, 'mer_snpeffed.faa']))
        # Only the tumor peptides come with a map file
        _store_and_export('_'.join(['transgened_tumor', peplen, 'mer_snpeffed.faa.map']))

    # The vcf written by transgene is exported under a friendlier name
    os.rename('transgened_transgened.vcf', 'mutations.vcf')
    mutations_fsid = job.fileStore.writeGlobalFile('mutations.vcf')
    export_results(job, mutations_fsid, 'mutations.vcf', univ_options,
                   subfolder='mutations/transgened')

    job.fileStore.logToMaster('Ran transgene on %s successfully' % univ_options['patient'])
    return output_files
def run_filter_radia(job, bams, radia_file, univ_options, radia_options, chrom):
    """
    Run filterradia on the RADIA output.

    :param dict bams: Dict of bam and bai for tumor DNA-Seq, normal DNA-Seq and tumor RNA-Seq
    :param toil.fileStore.FileID radia_file: The vcf from running RADIA
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict radia_options: Options specific to RADIA
    :param str chrom: Chromosome to process
    :return: fsID for the filtered chromosome vcf
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    input_files = {
        'rna.bam': bams['tumor_rna'],
        'rna.bam.bai': bams['tumor_rnai'],
        'tumor.bam': bams['tumor_dna'],
        'tumor.bam.bai': bams['tumor_dnai'],
        'normal.bam': bams['normal_dna'],
        'normal.bam.bai': bams['normal_dnai'],
        'radia.vcf': radia_file,
        'genome.fa.tar.gz': radia_options['genome_fasta'],
        'genome.fa.fai.tar.gz': radia_options['genome_fai'],
        'cosmic_beds': radia_options['cosmic_beds'],
        'dbsnp_beds': radia_options['dbsnp_beds'],
        'retrogene_beds': radia_options['retrogene_beds'],
        'pseudogene_beds': radia_options['pseudogene_beds'],
        'gencode_beds': radia_options['gencode_beds']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)

    for key in ('genome.fa', 'genome.fa.fai'):
        input_files[key] = untargz(input_files[key + '.tar.gz'], work_dir)
    # The bed collections are fetched under their final name and untarred in place
    for key in ('cosmic_beds', 'dbsnp_beds', 'retrogene_beds', 'pseudogene_beds',
                'gencode_beds'):
        input_files[key] = untargz(input_files[key], work_dir)
    input_files = {key: docker_path(path) for key, path in input_files.items()}

    # Drop only a literal 'chr' prefix. str.lstrip('chr') would strip any run of leading
    # 'c', 'h' and 'r' characters and could therefore eat into the chromosome name itself.
    chrom_id = chrom[len('chr'):] if chrom.startswith('chr') else chrom

    filterradia_log = ''.join([work_dir, '/radia_filtered_', chrom, '_radia.log'])
    parameters = [univ_options['patient'],  # shortID
                  chrom_id,
                  input_files['radia.vcf'],
                  '/data',
                  '/home/radia/scripts',
                  '-d', input_files['dbsnp_beds'],
                  '-r', input_files['retrogene_beds'],
                  '-p', input_files['pseudogene_beds'],
                  '-c', input_files['cosmic_beds'],
                  '-t', input_files['gencode_beds'],
                  '--noSnpEff',
                  '--noBlacklist',
                  '--noTargets',
                  '--noRnaBlacklist',
                  '-f', input_files['genome.fa'],
                  '--log=INFO',
                  '-g', docker_path(filterradia_log)]
    docker_call(tool='filterradia', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'], tool_version=radia_options['version'])

    # The vcf is expected at <patient>_<chrom>.vcf; rename it to <chrom>.vcf before storing it
    output_file = ''.join([work_dir, '/', chrom, '.vcf'])
    os.rename(''.join([work_dir, '/', univ_options['patient'], '_', chrom, '.vcf']),
              output_file)
    output_fsid = job.fileStore.writeGlobalFile(output_file)
    export_results(job, output_fsid, output_file, univ_options, subfolder='mutations/radia')
    job.fileStore.logToMaster('Ran filter-radia on %s:%s successfully'
                              % (univ_options['patient'], chrom))
    return output_fsid
def assess_car_t_validity(job, gene_expression, univ_options, reports_options):
    """
    Create a report on the clinical trials and scientific literature available for the
    overexpressed genes in the specified tumor type. The report also lists clinical trials
    available for other types of cancer with the same overexpressed gene.

    :param toil.fileStore.FileID gene_expression: The rsem gene expression
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict reports_options: Options specific to reporting modules
    :return: The results of running assess_car_t_validity
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    tumor_type = univ_options['tumor_type']

    input_files = {
        'rsem_quant.tsv': gene_expression,
        'car_t_targets.tsv.tar.gz': reports_options['car_t_targets_file']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    input_files['car_t_targets.tsv'] = untargz(input_files['car_t_targets.tsv.tar.gz'],
                                               work_dir)

    target_data = pd.read_table(input_files['car_t_targets.tsv'], index_col=0)
    # Only one of `sep` and `delimiter` may be given; newer pandas releases reject both at once.
    patient_df = pd.read_csv(input_files['rsem_quant.tsv'], sep='\t', header='infer',
                             index_col=0)
    # Drop the version suffix of the gene ids (everything from the first '.'). Splitting on a
    # literal '.' does not depend on the regex default of `str.replace`.
    patient_df.index = patient_df.index.str.split('.').str[0]

    # Check if the tumor has a corresponding normal
    try:
        tissue_of_origin = TCGAToGTEx[tumor_type]
    except KeyError:
        tissue_of_origin = 'NA'

    def _describe(row):
        # Returns the formatted [target, gtex, tcga, observed, doi, trials] values of one row
        gene = row['ENSG']
        if gene in patient_df.index:
            observed = '{0:.2f}'.format(float(patient_df.loc[gene, 'TPM']))
        else:
            observed = 'NA'
        return [str.upper(row['TARGET']),
                '{0:.2f}'.format(float(row['GTEX'])),
                '{0:.2f}'.format(float(row['TCGA'])),
                observed,
                row['DOI'],
                row['Clinical trials']]

    def _observed_key(entry):
        # Genes without an observed value ('NA') cannot be converted to a float; rank them
        # below every measured gene instead of crashing the sort.
        return float(entry[3]) if entry[3] != 'NA' else float('-inf')

    overexpressed = []
    # Write the report
    with open('car_t_target_report.txt', 'w') as car_t_report:
        if tissue_of_origin in target_data.index:
            print('Available clinical trials for ' + str.lower(tissue_of_origin) +
                  ' cancer with GTEX and TCGA median values', file=car_t_report)
            print(('\t{:10}{:<10}{:<10}{:<10}{:<40}{:<12}\n'.format(
                'Gene', 'GTEX', 'TCGA N', 'Observed', 'DOI for gene papers',
                'Clinical Trials')), file=car_t_report)

            # Get the gene name, GTEX, TCGA, and observed values
            collected_values = []
            for index, row in target_data.iterrows():
                if index != tissue_of_origin:
                    continue
                collection = _describe(row)
                collected_values.append(collection)
                gtex, tcga, observed = collection[1:4]
                if observed != 'NA' and (float(gtex) <= float(observed) or
                                         float(tcga) <= float(observed)):
                    overexpressed.append(row['ENSG'])

            # Highest observed expression first
            collected_values = sorted(collected_values, key=_observed_key, reverse=True)
            for entry in collected_values:
                print(('\t{:10}{:<10}{:<10}{:<10}{:<40}{:<12}'.format(
                    entry[0], entry[1], entry[2], str(entry[3]), entry[4], entry[5])),
                    file=car_t_report)

            print('\nBased on the genes overexpressed in this cancer type, here\'s a list of '
                  'clinical trials for other types of cancer', file=car_t_report)
            if len(overexpressed) != 0:
                # Check if there are other clinical trials for other cancer types
                print(('\t{:10}{:<10}{:<10}{:<10}{:<40}{:<17}{:<20}\n'.format(
                    'Gene', 'GTEX', 'TCGA N', 'Observed', 'DOI for gene papers',
                    'Clinical Trials', 'Cancer')), file=car_t_report)
                other_trials = []
                for index, row in target_data.iterrows():
                    if row['ENSG'] in overexpressed and index != tissue_of_origin:
                        other_trials.append(_describe(row) + [index])
                other_trials = sorted(other_trials, key=lambda col: col[0])
                for entry in other_trials:
                    print(('\t{:10}{:<10}{:<10}{:<10}{:<40}{:<17}{:<20}'.format(
                        entry[0], entry[1], entry[2], entry[3], entry[4], entry[5],
                        entry[6])), file=car_t_report)
            else:
                print("Data not available", file=car_t_report)
        else:
            print('Data not available for ' + tumor_type, file=car_t_report)

    output_file = job.fileStore.writeGlobalFile(car_t_report.name)
    export_results(job, output_file, car_t_report.name, univ_options, subfolder='reports')
    job.fileStore.logToMaster('Ran car t validity assessment on %s successfully'
                              % univ_options['patient'])
    return output_file
def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options):
    """
    Run snpeff on the aggregated mutation calls. Currently the only mutations called are SNPs
    hence SnpEff suffices. This node will be replaced in the future with another translator.

    :param merged_mutation_file: <JSid for merged vcf>
    :param dict univ_options: Dict of universal arguments used by almost all tools
                univ_options
                    +- 'dockerhub': <dockerhub to use>
    :param dict snpeff_options: Dict of parameters specific to snpeff
                snpeff_options
                    +- 'tool_index': <JSid for the snpEff index tarball>
    :return: <JSid for the snpeffed vcf>

    This node corresponds to node 16 on the tree
    """
    job.fileStore.logToMaster('Running snpeff on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    local_files = get_files_from_filestore(
        job,
        {'merged_mutations.vcf': merged_mutation_file,
         'snpeff_index.tar.gz': snpeff_options['tool_index']},
        work_dir, docker=False)
    local_files['snpeff_index'] = untargz(local_files['snpeff_index.tar.gz'], work_dir)
    # Translate the host paths into the paths seen from inside the container
    input_files = {name: docker_path(path) for name, path in local_files.items()}

    snpeff_index = input_files['snpeff_index']
    snpeff_config = '/'.join([snpeff_index, 'snpEff_hg19_gencode.config'])
    parameters = ['eff',
                  '-dataDir', snpeff_index,
                  '-c', snpeff_config,
                  '-no-intergenic',
                  '-no-downstream',
                  '-no-upstream',
                  # '-canon',
                  '-noStats',
                  'hg19_gencode',
                  input_files['merged_mutations.vcf']]

    # A tool specific java heap size takes precedence over the universal one
    xmx = snpeff_options['java_Xmx'] or univ_options['java_Xmx']

    # The docker call is handed the open vcf as its `outfile`
    snpeffed_vcf = '/'.join([work_dir, 'mutations.vcf'])
    with open(snpeffed_vcf, 'w') as snpeff_file:
        docker_call(tool='snpeff', tool_parameters=parameters, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'], java_opts=xmx, outfile=snpeff_file)

    output_file = job.fileStore.writeGlobalFile(snpeffed_vcf)
    export_results(job, output_file, snpeffed_vcf, univ_options,
                   subfolder='mutations/snpeffed')
    return output_file
def boost_ranks(job, isoform_expression, merged_mhc_calls, transgene_out, univ_options,
                rank_boost_options):
    """
    Run the rankboost tool on the merged MHCI and MHCII calls. This is the final module in the
    pipeline.

    :param isoform_expression: Job store ID of the rsem expression file
    :param dict merged_mhc_calls: Dict with the keys 'mhci_merged_files.list' and
           'mhcii_merged_files.list'
    :param dict transgene_out: Dict holding the 10-mer and 15-mer tumor peptide fastas
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict rank_boost_options: Dict holding the 'mhci_combo' and 'mhcii_combo' values
    :return: Dict mapping the name of every concise/detailed result file to its job store ID,
             or to None when rankboost did not produce that file. The keys 'mhci' and 'mhcii'
             are also present and map to empty dicts.
    :rtype: dict

    This module corresponds to node 21 in the tree
    """
    job.fileStore.logToMaster('Running boost_ranks on %s' % univ_options['patient'])
    # The work is done in a fresh directory named after the patient
    work_dir = os.path.abspath(univ_options['patient'])
    os.mkdir(work_dir)
    to_fetch = {
        'rsem_quant.tsv': isoform_expression,
        'mhci_merged_files.tsv': merged_mhc_calls['mhci_merged_files.list'],
        'mhcii_merged_files.tsv': merged_mhc_calls['mhcii_merged_files.list'],
        'mhci_peptides.faa': transgene_out['transgened_tumor_10_mer_snpeffed.faa'],
        'mhcii_peptides.faa': transgene_out['transgened_tumor_15_mer_snpeffed.faa']}
    input_files = get_files_from_filestore(job, to_fetch, work_dir, docker=True)

    output_files = {}
    for mhc in ('mhci', 'mhcii'):
        parameters = [mhc,
                      input_files[mhc + '_merged_files.tsv'],
                      input_files['rsem_quant.tsv'],
                      input_files[mhc + '_peptides.faa'],
                      rank_boost_options[mhc + '_combo']]
        docker_call(tool='rankboost', tool_parameters=parameters, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'])
        output_files[mhc] = {}
        # Either result file may be missing; record None for the ones that are
        for detail in ('concise', 'detailed'):
            result_path = ''.join([work_dir, '/', mhc, '_merged_files_', detail,
                                   '_results.tsv'])
            result_name = os.path.basename(result_path)
            if not os.path.exists(result_path):
                output_files[result_name] = None
                continue
            output_files[result_name] = job.fileStore.writeGlobalFile(result_path)
            export_results(job, output_files[result_name], result_path, univ_options,
                           subfolder='rankboost')
    return output_files
def merge_perchrom_mutations(job, chrom, mutations, univ_options):
    """
    Merge the mutation calls for a single chromosome.

    A position is accepted when at least 2 callers report it as an snv, or when strelka (the
    only indel caller) reports it as an indel.

    :param str chrom: Chromosome to process
    :param dict mutations: dict of dicts of the various mutation caller names as keys, and a dict of
           per chromosome job store ids for vcfs as value. The dict is not modified.
    :param dict univ_options: Dict of universal options used by almost all tools
    :returns: fsID for vcf containing merged calls for the given chromosome
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    from protect.mutation_calling.muse import process_muse_vcf
    from protect.mutation_calling.mutect import process_mutect_vcf
    from protect.mutation_calling.radia import process_radia_vcf
    from protect.mutation_calling.somaticsniper import process_somaticsniper_vcf
    from protect.mutation_calling.strelka import process_strelka_vcf

    # Work on a shallow copy so that the caller's dict is left untouched (the original dict
    # would otherwise lose 'indels' and gain the two strelka keys below).
    mutations = {caller: calls for caller, calls in mutations.items() if caller != 'indels'}
    mutations['strelka_indels'] = mutations['strelka']['indels']
    mutations['strelka_snvs'] = mutations['strelka']['snvs']
    vcf_processor = {'snvs': {'mutect': process_mutect_vcf,
                              'muse': process_muse_vcf,
                              'radia': process_radia_vcf,
                              'somaticsniper': process_somaticsniper_vcf,
                              'strelka_snvs': process_strelka_vcf},
                     'indels': {'strelka_indels': process_strelka_vcf}}

    # For now, let's just say 2 out of n need to call it.
    majority = {'snvs': 2,
                'indels': 1}

    accepted_hits = defaultdict(dict)
    for mut_type, processors in vcf_processor.items():
        # Get input files
        perchrom_mutations = {
            caller: processor(job, mutations[caller][chrom], work_dir, univ_options)
            for caller, processor in processors.items()}
        # Report both strelka flavours under the plain caller name
        perchrom_mutations['strelka'] = perchrom_mutations.pop('strelka_' + mut_type)
        # Read in each file as a set so that the per-position membership tests are cheap
        vcf_calls = {caller: set(read_vcf(vcf_file))
                     for caller, vcf_file in perchrom_mutations.items()}
        all_positions = set().union(*vcf_calls.values())
        for position in sorted(all_positions):
            # Sorted so that the callers are listed in the same order on every run
            callers = sorted(caller for caller, calls in vcf_calls.items()
                             if position in calls)
            if len(callers) >= majority[mut_type]:
                assert position[1] not in accepted_hits[position[0]], \
                    'Position %s:%s was accepted more than once' % (position[0], position[1])
                accepted_hits[position[0]][position[1]] = (position[2], position[3],
                                                           ','.join(callers))

    merged_vcf = ''.join([work_dir, '/', chrom, '.vcf'])
    with open(merged_vcf, 'w') as outfile:
        print('##fileformat=VCFv4.0', file=outfile)
        # NOTE(review): the VCF specification quotes the Description value; left unquoted here
        # because downstream consumers of this header were not checked.
        print('##INFO=<ID=callers,Number=.,Type=String,Description=List of supporting callers.',
              file=outfile)
        print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=outfile)
        # A separate loop name keeps the `chrom` argument intact
        for hit_chrom in chrom_sorted(accepted_hits.keys()):
            chrom_hits = accepted_hits[hit_chrom]
            for position in sorted(chrom_hits):
                ref, alt, callers = chrom_hits[position]
                print(hit_chrom, position, '.', ref, alt, '.', 'PASS', 'callers=' + callers,
                      sep='\t', file=outfile)

    fsid = job.fileStore.writeGlobalFile(merged_vcf)
    export_results(job, fsid, merged_vcf, univ_options, subfolder='mutations/merged')
    return fsid
def run_star(job, fastqs, univ_options, star_options):
    """
    Align a pair of fastqs with STAR.

    :param list fastqs: The input fastqs for alignment
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict star_options: Options specific to star.  The keys read here are 'type' (one of
           'star' or 'starlong'), 'index', 'n' and 'version'.
    :return: Dict containing output transcriptome bam, genome bam and chimeric junction file
             output_files:
                 |- 'rnaAligned.toTranscriptome.out.bam': fsID
                 +- 'rnaAligned.out.bam': fsID
                 +- 'rnaChimeric.out.junction': fsID
    :rtype: dict
    """
    assert star_options['type'] in ('star', 'starlong')
    work_dir = os.getcwd()
    input_files = {
        'rna_cutadapt_1.fastq': fastqs[0],
        'rna_cutadapt_2.fastq': fastqs[1],
        'star_index.tar.gz': star_options['index']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    # Handle gzipped files: expose each fastq under a .gz name via a symlink.  The link names are
    # relative, which resolves correctly because work_dir is the current directory.
    gz = '.gz' if is_gzipfile(input_files['rna_cutadapt_1.fastq']) else ''
    if gz:
        for read_file in 'rna_cutadapt_1.fastq', 'rna_cutadapt_2.fastq':
            os.symlink(read_file, read_file + gz)
            input_files[read_file + gz] = input_files[read_file] + gz
    # Untar the index
    input_files['star_index'] = untargz(input_files['star_index.tar.gz'], work_dir)

    # Check to see if user is using a STAR-Fusion index
    star_fusion_idx = os.path.join(input_files['star_index'], 'ref_genome.fa.star.idx')
    if os.path.exists(star_fusion_idx):
        input_files['star_index'] = star_fusion_idx

    input_files = {key: docker_path(path, work_dir=work_dir) for key, path in input_files.items()}

    # Using recommended STAR-Fusion parameters:
    # https://github.com/STAR-Fusion/STAR-Fusion/wiki
    parameters = ['--runThreadN', str(star_options['n']),
                  '--genomeDir', input_files['star_index'],
                  '--twopassMode', 'Basic',
                  '--outReadsUnmapped', 'None',
                  '--chimSegmentMin', '12',
                  '--chimJunctionOverhangMin', '12',
                  '--alignSJDBoverhangMin', '10',
                  '--alignMatesGapMax', '200000',
                  '--alignIntronMax', '200000',
                  # This option takes a single integer.  A stray literal 'parameter' token used to
                  # be passed ahead of the value and has been removed.
                  '--chimSegmentReadGapMax', '3',
                  '--alignSJstitchMismatchNmax', '5', '-1', '5', '5',
                  '--outFileNamePrefix', 'rna',
                  '--readFilesIn',
                  input_files['rna_cutadapt_1.fastq' + gz],
                  input_files['rna_cutadapt_2.fastq' + gz],
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outSAMtype', 'BAM', 'Unsorted',
                  '--quantMode', 'TranscriptomeSAM']
    if gz:
        parameters.extend(['--readFilesCommand', 'zcat'])

    # Anything other than 'star' runs the 'starlong' tool, as before.
    tool = 'star' if star_options['type'] == 'star' else 'starlong'
    docker_call(tool=tool, tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'], tool_version=star_options['version'])

    output_files = defaultdict()
    for output_file in ['rnaAligned.toTranscriptome.out.bam',
                        'rnaAligned.out.bam',
                        'rnaChimeric.out.junction']:
        output_files[output_file] = job.fileStore.writeGlobalFile('/'.join([work_dir, output_file]))
    # Only the transcriptome bam and the chimeric junctions are exported.
    export_results(job, output_files['rnaAligned.toTranscriptome.out.bam'], 'rna_transcriptome.bam',
                   univ_options, subfolder='alignments')
    export_results(job, output_files['rnaChimeric.out.junction'], 'rna_chimeric.junction',
                   univ_options, subfolder='mutations/fusions')
    job.fileStore.logToMaster('Ran STAR on %s successfully' % univ_options['patient'])
    return output_files
def _consensus_core_col(method_field):
    """
    Return the column index that holds the binding core for a Consensus prediction line.

    :param str method_field: The method column of the line, e.g. 'Consensus(NN,SMM)'
    :return: The core column for the first of NN, netMHCIIpan, Sturniolo, SMM that is listed, or
             None if none of them is listed
    :rtype: int|None
    """
    # Remove the literal prefix.  str.lstrip() strips a *set of characters*, so
    # lstrip('Consensus(') also ate the leading 'ne' of 'netMHCIIpan'.
    prefix = 'Consensus('
    if method_field.startswith(prefix):
        method_field = method_field[len(prefix):]
    methods = method_field.rstrip(')').split(',')
    for method, column in (('NN', 13), ('netMHCIIpan', 17), ('Sturniolo', 19), ('SMM', 10)):
        if method in methods:
            return column
    return None


def merge_mhc_peptide_calls(job, antigen_predictions, transgened_files, univ_options):
    """
    Merge the MHCI and MHCII binding predictions into one file per MHC class.

    Calls whose prediction value is greater than 5.00 are dropped, and the peptide names from the
    transgene peptide files are incorporated into the merged calls.  This module corresponds to
    node 19 on the tree.

    :param tuple antigen_predictions: (mhci_preds, mhcii_preds).  mhci_preds is a dict of
           prediction files.  mhcii_preds is a dict whose values are indexable as
           (prediction file, predictor name), where the predictor name is one of 'Consensus',
           'Sturniolo' or 'netMHCIIpan'.
    :param dict transgened_files: Dict holding the 10-mer and 15-mer peptide files and their maps
    :param dict univ_options: Dict of universal options used by almost all tools
    :return: Dict with the keys 'mhci_merged_files.list' and 'mhcii_merged_files.list', each
             mapped to the fsID of the corresponding merged file
    :rtype: dict
    :raises RuntimeError: If an MHCII prediction comes from an unrecognized predictor
    """
    job.fileStore.logToMaster('Merging MHC calls')
    work_dir = os.getcwd()
    pept_files = {
        '10_mer.faa': transgened_files['transgened_tumor_10_mer_snpeffed.faa'],
        '10_mer.faa.map': transgened_files['transgened_tumor_10_mer_snpeffed.faa.map'],
        '15_mer.faa': transgened_files['transgened_tumor_15_mer_snpeffed.faa'],
        '15_mer.faa.map': transgened_files['transgened_tumor_15_mer_snpeffed.faa.map']}
    mhci_preds, mhcii_preds = antigen_predictions
    mhci_files = get_files_from_filestore(job, mhci_preds, work_dir)
    # First split mhcii_preds into prediction files and predictors and maintain keys so we can later
    # reference them in pairs
    mhcii_predictors = {x: y[1] for x, y in mhcii_preds.items()}
    mhcii_files = {x: y[0] for x, y in mhcii_preds.items()}
    mhcii_files = get_files_from_filestore(job, mhcii_files, work_dir)
    # Get peptide files
    pept_files = get_files_from_filestore(job, pept_files, work_dir)

    mhci_merged = '/'.join([work_dir, 'mhci_merged_files.list'])
    mhcii_merged = '/'.join([work_dir, 'mhcii_merged_files.list'])

    # Merge MHCI calls
    # Read 10-mer pepts into memory
    peptides = read_peptide_file(pept_files['10_mer.faa'])
    with open(pept_files['10_mer.faa.map'], 'r') as mapfile:
        pepmap = json.load(mapfile)
    # Incorporate peptide names into the merged calls
    with open(mhci_merged, 'w') as mhci_resfile:
        for mhcifile in mhci_files.values():
            with open(mhcifile, 'r') as mf:
                for line in mf:
                    # Skip header lines
                    if not line.startswith('HLA'):
                        continue
                    line = line.strip().split('\t')
                    allele = line[0]
                    pept = line[5]
                    pred = line[7]
                    if float(pred) > 5.00:
                        continue
                    # For MHCI the peptide itself is passed in the core slot.
                    print_mhc_peptide((allele, pept, pred, pept), peptides, pepmap, mhci_resfile)

    # Merge MHCII calls
    # Read 15-mer pepts into memory
    peptides = read_peptide_file(pept_files['15_mer.faa'])
    with open(pept_files['15_mer.faa.map'], 'r') as mapfile:
        pepmap = json.load(mapfile)
    # Incorporate peptide names into the merged calls
    with open(mhcii_merged, 'w') as mhcii_resfile:
        for mhciifile in mhcii_files.keys():
            predictor = mhcii_predictors[mhciifile]
            if predictor == 'Consensus':
                core_col = None  # Column number with the core; detected once per file
                with open(mhcii_files[mhciifile], 'r') as mf:
                    for line in mf:
                        # Skip header lines
                        if not line.startswith('HLA'):
                            continue
                        line = line.strip().split('\t')
                        allele = line[0]
                        pept = line[4]
                        pred = line[6]
                        if core_col is None:
                            # Retried on every line until a known method is seen
                            core_col = _consensus_core_col(line[5])
                        core = line[core_col] if core_col else 'NOCORE'
                        if float(pred) > 5.00:
                            continue
                        print_mhc_peptide((allele, pept, pred, core), peptides, pepmap,
                                          mhcii_resfile)
            elif predictor == 'Sturniolo':
                with open(mhcii_files[mhciifile], 'r') as mf:
                    for line in mf:
                        # Skip header lines
                        if not line.startswith('HLA'):
                            continue
                        line = line.strip().split('\t')
                        allele = line[0]
                        pept = line[5]
                        pred = line[6]
                        core = line[19]
                        if float(pred) > 5.00:
                            continue
                        print_mhc_peptide((allele, pept, pred, core), peptides, pepmap,
                                          mhcii_resfile)
            elif predictor == 'netMHCIIpan':
                with open(mhcii_files[mhciifile], 'r') as mf:
                    # Get the allele from the first line and skip the second line
                    allele = re.sub('-DQB', '/DQB', mf.readline().strip())
                    mf.readline()
                    for line in mf:
                        line = line.strip().split('\t')
                        pept = line[1]
                        pred = line[5]
                        core = 'NOCORE'
                        peptide_name = line[2]
                        if float(pred) > 5.00:
                            continue
                        # This output already carries the peptide name, so the row is written
                        # directly instead of going through print_mhc_peptide.
                        print(allele, pept, peptide_name, core, '0', pred, pepmap[peptide_name],
                              sep='\t', file=mhcii_resfile)
            else:
                raise RuntimeError('Shouldn\'t ever see this!!!')

    output_files = defaultdict()
    for mhc_file in [mhci_merged, mhcii_merged]:
        output_files[os.path.split(mhc_file)[1]] = job.fileStore.writeGlobalFile(mhc_file)
        # NOTE(review): other functions in this file call export_results with an fsID followed by
        # a file name.  Confirm which signature is current before changing this call.
        export_results(job, mhc_file, univ_options, subfolder='binding_predictions')
    return output_files
def assess_mhc_genes(job, gene_expression, rna_haplotype, univ_options, reports_options):
    """
    Assess the prevalence of the various genes in the MHC pathway and return a report as a
    fixed-width text table.

    For every role in the background file, each gene's observed TPM is compared against the GTEx
    and TCGA-normal background values.  A gene is 'LOW' if the background value (rounded to 2
    decimals) is greater than or equal to the observed value, 'PASS' otherwise, and 'NA' if there
    is no background column for the tumor type.  For the 'MHCI loading' and 'MHCII loading' roles
    the number of HLA alleles found in the RNA haplotype is reported as well (0 is 'FAIL', 1 is
    'LOW', anything else is 'PASS').

    :param toil.fileStore.FileID gene_expression: fsID for the rsem gene expression file
    :param toil.fileStore.FileID|None rna_haplotype: fsID for the RNA PHLAT file
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict reports_options: Options specific to reporting modules
    :return: The fsID for the mhc pathway report file
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    # Take file parameters for both TCGA and GTEX files
    tumor_type = univ_options['tumor_type']
    b_types = {
        'tcga': tumor_type + " normal",
        'gtex': TCGAToGTEx[tumor_type] if tumor_type in TCGAToGTEx else "NA"}
    input_files = {
        'rsem_quant.tsv': gene_expression,
        'mhc_pathways.tsv.tar.gz': reports_options['mhc_pathways_file']}
    if rna_haplotype is not None:
        input_files['rna_haplotype.sum'] = rna_haplotype
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    input_files['mhc_pathways.tsv'] = untargz(input_files['mhc_pathways.tsv.tar.gz'], work_dir)

    # Read the background file
    background_df = pd.read_table(input_files['mhc_pathways.tsv'], index_col=0, header=0)

    # Parse the rna phlat file
    mhc_alleles = None
    if rna_haplotype is not None:
        with open(input_files['rna_haplotype.sum']) as rna_mhc:
            mhc_alleles = {'HLA_A': [], 'HLA_B': [], 'HLA_C': [], 'HLA_DPA': [], 'HLA_DQA': [],
                           'HLA_DPB': [], 'HLA_DQB': [], 'HLA_DRB': []}
            mhc_alleles = parse_phlat_file(rna_mhc, mhc_alleles)

    # Read the patient gene values into a Counter so that genes missing from the expression file
    # read as 0.  Anything after the first '.' in the gene id is dropped.
    gene_expressions = pd.read_table(input_files['rsem_quant.tsv'], index_col=0, header=0)
    gene_expressions = Counter({x.split('.')[0]: y
                                for x, y in gene_expressions['TPM'].to_dict().items()})

    row_fmt = "{:12}{:<12}{:<17}{:<12}{:<20}{:<17}"
    # HLA genes that get an allele-count row ahead of the gene rows of a role
    hla_genes = {'MHCI loading': ('HLA_A', 'HLA_B', 'HLA_C'),
                 'MHCII loading': ('HLA_DQA', 'HLA_DQB', 'HLA_DRB')}

    # Print the report.  Roles are sorted so the section order is stable between runs.
    roles = sorted({x for x in background_df['Roles'].values if ',' not in x})
    with open('mhc_pathway_report.txt', 'w') as mpr:
        for role in roles:
            # Match the role literally; it is a name, not a regular expression
            role_df = background_df[background_df['Roles'].str.contains(role, regex=False)]
            print(role.center(90, ' '), file=mpr)
            print(row_fmt.format('Gene', 'Observed', 'Threshold_GTEX', 'Result',
                                 'Threshold_TCGA_N', 'Result') + "\n", file=mpr)
            for hla_gene in hla_genes.get(role, ()):
                if rna_haplotype is not None:
                    num_alleles = len(mhc_alleles[hla_gene])
                    result = ('FAIL' if num_alleles == 0 else
                              'LOW' if num_alleles == 1 else
                              'PASS')
                else:
                    result = num_alleles = 'NA'
                # The allele count is the observed value and 2 is the threshold for both
                # backgrounds.  The two used to be swapped relative to the column header.
                print(row_fmt.format(hla_gene, num_alleles, 2, result, 2, result), file=mpr)

            for ensg in role_df.index:
                # .loc replaces DataFrame.ix, which has been removed from pandas
                gene_name = background_df.loc[ensg, 'Name']
                observed = float(gene_expressions[ensg])
                b_vals = {}
                for bkg, column in b_types.items():
                    raw = role_df.loc[ensg].get(column, default='NA')
                    try:
                        val = "{0:.2f}".format(raw)
                    except (ValueError, TypeError):
                        # No background column for this tumor type: the 'NA' default cannot be
                        # formatted as a float, which used to abort the whole report.
                        val = 'NA'
                    result = ('NA' if val == 'NA' else
                              'LOW' if float(val) >= observed else
                              'PASS')
                    b_vals[bkg] = val, result
                print(row_fmt.format(gene_name, observed,
                                     b_vals['gtex'][0], b_vals['gtex'][1],
                                     b_vals['tcga'][0], b_vals['tcga'][1]), file=mpr)
            print('\n', file=mpr)

    output_file = job.fileStore.writeGlobalFile(mpr.name)
    export_results(job, output_file, mpr.name, univ_options, subfolder='reports')
    job.fileStore.logToMaster('Ran mhc gene assessment on %s successfully'
                              % univ_options['patient'])
    return output_file