Example #1
0
def merge_perchrom_vcfs(job, perchrom_vcfs, tool_name, univ_options):
    """
    This module will merge per-chromosome vcf files into a single genome level vcf.

    Chromosomes are processed in the order returned by chrom_sorted().  Header lines (those
    starting with '#') are copied from the first chromosome file only; lines from every file are
    stripped of surrounding whitespace and appended in order.

    :param job: Job object whose fileStore is used for logging and file transfer
    :param dict perchrom_vcfs: Dictionary with chromosome name as key and jobstore ID of
                               corresponding vcf as value
    :param str tool_name: Name of the tool that generated the vcfs
    :param dict univ_options: Universal options, forwarded to export_results()

    :returns: Job Store File ID for the merged vcf
    """
    job.fileStore.logToMaster('Running merge_perchrom_vcfs  for %s' % tool_name)
    work_dir = os.getcwd()
    suffix = '.vcf'
    input_files = {chrom + suffix: jsid for chrom, jsid in perchrom_vcfs.items()}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    # Remove exactly one '.vcf' suffix.  str.rstrip('.vcf') strips any trailing run of the
    # characters '.', 'v', 'c' and 'f', which mangles chromosome names ending in those characters.
    chroms = [name[:-len(suffix)] if name.endswith(suffix) else name for name in input_files]
    with open(''.join([work_dir, '/', 'all_merged.vcf']), 'w') as outvcf:
        for index, chrom in enumerate(chrom_sorted(chroms)):
            # Only the first file contributes its header, even when it holds no records;
            # otherwise the header of the next file would be written a second time.
            keep_header = index == 0
            with open(input_files[chrom + suffix], 'r') as infile:
                for line in infile:
                    line = line.strip()
                    if line.startswith('#') and not keep_header:
                        continue
                    print(line, file=outvcf)
    export_results(job, outvcf.name, univ_options, subfolder='mutations/' + tool_name)
    output_file = job.fileStore.writeGlobalFile(outvcf.name)
    return output_file
Example #2
0
def index_bamfile(job, bamfile, sample_type, univ_options):
    """
    This module indexes BAMFILE with `samtools index`.

    ARGUMENTS
    1. bamfile: <JSid for a bam file>
    2. sample_type: string prefix used to name the local bam (e.g. 'tumor_dna' or 'normal_dna')
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient id, used in the log message>
                +- 'dockerhub': <dockerhub to use>
    RETURN VALUES
    1. output_files: Dict of the input bam and the newly created index
         output_files
             |- '<sample_type>_fix_pg_sorted.bam': <the JSid passed in as bamfile>
             +- '<sample_type>_fix_pg_sorted.bam.bai': <JSid of the index>
    """
    job.fileStore.logToMaster('Running samtools-index on %s:%s' % (univ_options['patient'],
                                                                   sample_type))
    work_dir = os.getcwd()
    bam_name = sample_type + '_fix_pg_sorted.bam'
    bai_name = bam_name + '.bai'
    local_files = get_files_from_filestore(job, {bam_name: bamfile}, work_dir, docker=True)
    docker_call(tool='samtools', tool_parameters=['index', local_files[bam_name]],
                work_dir=work_dir, dockerhub=univ_options['dockerhub'])
    bam_path = '/'.join([work_dir, bam_name])
    bai_path = '/'.join([work_dir, bai_name])
    # The bam itself is not re-uploaded: the caller's jobstore ID is passed straight through.
    output_files = {bam_name: bamfile,
                    bai_name: job.fileStore.writeGlobalFile(bai_path)}
    # Export both the bam and its index.
    for exported in (bam_path, bai_path):
        export_results(job, exported, univ_options, subfolder='alignments')
    return output_files
Example #3
0
def predict_netmhcii_binding(job, peptfile, allele, univ_options):
    """
    This module will predict MHC:peptide binding for peptides in the files created in node YY to
    ALLELE.  ALLELE represents an MHCII allele.

    ARGUMENTS
    1. peptfile: <JSid for the peptide fasta>
    2. allele: MHCII allele string starting with 'HLA-DQA', 'HLA-DPA' or 'HLA-DRB'
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                +- 'dockerhub': <dockerhub to use>

    RETURN VALUES
    1. Tuple of (<JSid for predictions.tsv>, 'netMHCIIpan')

    Raises RuntimeError if ALLELE does not start with one of the supported prefixes.

    This module corresponds to node 19 on the tree
    """
    job.fileStore.logToMaster('Running netmhciipan on %s' % allele)
    work_dir = os.getcwd()
    input_files = {
        'peptfile.faa': peptfile}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=True)
    # netMHCIIpan accepts differently formatted alleles so we need to modify the input alleles
    if allele.startswith(('HLA-DQA', 'HLA-DPA')):
        # Drop '*' and ':' and join the alpha/beta chains with '-' instead of '/'
        allele = re.sub(r'[*:]', '', allele)
        allele = re.sub(r'/', '-', allele)
    elif allele.startswith('HLA-DRB'):
        allele = re.sub(r':', '', allele)
        allele = re.sub(r'\*', '_', allele)
        # Remove the literal 'HLA-' prefix.  str.lstrip('HLA-') strips a *set* of characters and
        # only happened to work because the next character is 'D'.
        allele = allele[len('HLA-'):]
    else:
        raise RuntimeError('Unknown allele seen')
    parameters = ['-a', allele,
                  '-xls', '1',
                  '-xlsfile', 'predictions.tsv',
                  '-f', input_files['peptfile.faa']]
    # netMHC writes a lot of useless stuff to sys.stdout so we open /dev/null and dump output there.
    with open(os.devnull, 'w') as output_catcher:
        docker_call(tool='netmhciipan:final', tool_parameters=parameters, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'], outfile=output_catcher)
    output_file = job.fileStore.writeGlobalFile('/'.join([work_dir, 'predictions.tsv']))
    return output_file, 'netMHCIIpan'
Example #4
0
def bam_conversion(job, samfile, sample_type, univ_options):
    """
    This module converts SAMFILE from sam to bam using `samtools view -bS`.

    ARGUMENTS
    1. samfile: <JSid for a sam file>
    2. sample_type: string prefix used to name the local files (e.g. 'tumor_dna' or 'normal_dna')
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient id, used in the log message>
                +- 'dockerhub': <dockerhub to use>
    RETURN VALUES
    1. output_file: <JSid for '<sample_type>_aligned.bam'>

    The input sam is deleted from the job store once the bam has been written.
    """
    job.fileStore.logToMaster('Running sam2bam on %s:%s' % (univ_options['patient'], sample_type))
    work_dir = os.getcwd()
    sam_name = sample_type + '_aligned.sam'
    local_files = get_files_from_filestore(job, {sam_name: samfile}, work_dir, docker=True)
    bamfile = '/'.join([work_dir, sample_type + '_aligned.bam'])
    samtools_args = ['view', '-bS', '-o', docker_path(bamfile), local_files[sam_name]]
    docker_call(tool='samtools', tool_parameters=samtools_args, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    output_file = job.fileStore.writeGlobalFile(bamfile)
    # The samfile is no longer useful so delete it
    job.fileStore.deleteGlobalFile(samfile)
    return output_file
Example #5
0
def run_muse_sump_perchrom(job, muse_output, univ_options, muse_options, chrom):
    """
    This module will run muse sump on the muse output

    ARGUMENTS
    1. muse_output: <JSid for the MuSE call output for CHROM>
    2. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient id, used in the log message>
                +- 'dockerhub': <dockerhub to use>
    3. muse_options: Dict of parameters specific to muse
         muse_options
                |- 'dbsnp_vcf': <JSid for the dbsnp vcf.gz>
                +- 'dbsnp_tbi': <JSid for the dbsnp vcf.gz.tbi>
    4. chrom: chromosome name, used to name the output '<chrom>.vcf'

    RETURN VALUES
    1. outfile: <JSid for '<chrom>.vcf'>
    """
    job.fileStore.logToMaster('Running muse sump on %s:%s' % (univ_options['patient'], chrom))
    work_dir = os.getcwd()
    input_files = {
        'MuSE.txt': muse_output,
        'dbsnp_coding.vcf.gz': muse_options['dbsnp_vcf'],
        'dbsnp_coding.vcf.gz.tbi.tmp': muse_options['dbsnp_tbi']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    # The index is fetched under a '.tmp' name and then copied to its final '.tbi' name.
    tbi = os.path.splitext(input_files['dbsnp_coding.vcf.gz.tbi.tmp'])[0]
    # Debug output: stat of every file in the working directory (before the copy)
    print({x: os.stat(x) for x in os.listdir(work_dir)}, file=sys.stderr)
    # NOTE(review): the sleep + copy + touch below presumably make the index newer than the
    # vcf so tabix-aware tools accept it -- confirm before removing.
    time.sleep(2)
    shutil.copy(input_files['dbsnp_coding.vcf.gz.tbi.tmp'], tbi)
    # 0o777 is the octal literal form accepted by both Python 2.6+ and Python 3; the legacy
    # form `0777` is a SyntaxError on Python 3.
    os.chmod(tbi, 0o777)
    open(tbi, 'a').close()
    input_files = {key: docker_path(path) for key, path in input_files.items()}
    # Debug output: stat of every file in the working directory (after the copy)
    print({x: os.stat(x) for x in os.listdir(work_dir)}, file=sys.stderr)
    output_file = ''.join([work_dir, '/', chrom, '.vcf'])

    parameters = ['sump',
                  '-I', input_files['MuSE.txt'],
                  '-O', docker_path(output_file),
                  '-D', input_files['dbsnp_coding.vcf.gz'],
                  '-E']

    docker_call(tool='muse', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    export_results(job, output_file, univ_options, subfolder='mutations/muse')
    outfile = job.fileStore.writeGlobalFile(output_file)
    return outfile
Example #6
0
def run_pileup(job, tumor_bam, univ_options, somaticsniper_options):
    """
    Runs a samtools pileup on the tumor bam.

    :param toil.Job job: job
    :param dict tumor_bam: Dict holding the jobstore IDs of the tumor bam and its index under
                           'tumor_dna_fix_pg_sorted.bam' and 'tumor_dna_fix_pg_sorted.bam.bai'
    :param dict univ_options: Universal Options ('patient' and 'dockerhub' are read)
    :param dict somaticsniper_options: Tool options ('genome_fasta' and 'genome_fai' are read)
    :returns: jsID for the pileup file
    :rtype: str
    """
    job.fileStore.logToMaster(
        'Running samtools pileup on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    local_files = get_files_from_filestore(
        job,
        {'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
         'tumor.bam.bai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
         'genome.fa.tar.gz': somaticsniper_options['genome_fasta'],
         'genome.fa.fai.tar.gz': somaticsniper_options['genome_fai']},
        work_dir, docker=False)

    # Unpack the reference and its index next to the downloaded tarballs
    local_files['genome.fa'] = untargz(local_files['genome.fa.tar.gz'], work_dir)
    local_files['genome.fa.fai'] = untargz(local_files['genome.fa.fai.tar.gz'], work_dir)
    docker_files = dict((name, docker_path(path)) for name, path in local_files.items())

    # NOTE(review): these paths were already passed through docker_path() above; the second
    # conversion is kept as-is to preserve behaviour.
    genome_arg = docker_path(docker_files['genome.fa'])
    bam_arg = docker_path(docker_files['tumor.bam'])
    samtools_args = ['pileup', '-cvi', '-f', genome_arg, bam_arg]

    # docker_call is handed the open file as `outfile` so the pileup ends up in pileup.txt
    pileup_path = os.path.join(work_dir, 'pileup.txt')
    with open(pileup_path, 'w') as pileup_file:
        docker_call(tool='samtools:0.1.8', tool_parameters=samtools_args, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'], outfile=pileup_file)
    return job.fileStore.writeGlobalFile(pileup_path)
Example #7
0
def run_filter_radia(job, bams, radia_file, univ_options, radia_options, chrom):
    """
    This module will run filterradia on the RNA and DNA bams.

    ARGUMENTS
    1. bams: REFER ARGUMENTS of run_radia()
    2. radia_file: <JSid of vcf generated by run_radia()>
    3. univ_options: REFER ARGUMENTS of run_radia()
    4. radia_options: REFER ARGUMENTS of run_radia()
    5. chrom: REFER ARGUMENTS of run_radia()

    RETURN VALUES
    1. output_file: <JSid of CHROM.vcf, the filtered vcf for this chromosome>
    """
    job.fileStore.logToMaster('Running filter-radia on %s:%s' % (univ_options['patient'], chrom))
    work_dir = os.getcwd()
    input_files = {
        'rna.bam': bams['tumor_rna'],
        'rna.bam.bai': bams['tumor_rnai'],
        'tumor.bam': bams['tumor_dna'],
        'tumor.bam.bai': bams['tumor_dnai'],
        'normal.bam': bams['normal_dna'],
        'normal.bam.bai': bams['normal_dnai'],
        'radia.vcf': radia_file,
        'genome.fa.tar.gz': radia_options['genome_fasta'],
        'genome.fa.fai.tar.gz': radia_options['genome_fai']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)

    for key in ('genome.fa', 'genome.fa.fai'):
        input_files[key] = untargz(input_files[key + '.tar.gz'], work_dir)
    input_files = {key: docker_path(path) for key, path in input_files.items()}

    # filterradia takes the chromosome without its 'chr' prefix.  Remove the literal prefix
    # rather than using str.lstrip('chr'), which strips any leading run of 'c', 'h' and 'r'.
    chrom_id = chrom[len('chr'):] if chrom.startswith('chr') else chrom

    filterradia_log = ''.join([work_dir, '/radia_filtered_', chrom, '_radia.log'])
    parameters = [univ_options['patient'],  # shortID
                  chrom_id,
                  input_files['radia.vcf'],
                  '/data',
                  '/home/radia/scripts',
                  '-d', '/home/radia/data/hg19/snp135',
                  '-r', '/home/radia/data/hg19/retroGenes/',
                  '-p', '/home/radia/data/hg19/pseudoGenes/',
                  '-c', '/home/radia/data/hg19/cosmic/',
                  '-t', '/home/radia/data/hg19/gaf/2_1',
                  '--noSnpEff',
                  '--noBlacklist',
                  '--noTargets',
                  '--noRnaBlacklist',
                  '-f', input_files['genome.fa'],
                  '--log=INFO',
                  '-g', docker_path(filterradia_log)]
    docker_call(tool='filterradia', tool_parameters=parameters,
                work_dir=work_dir, dockerhub=univ_options['dockerhub'])
    output_file = ''.join([work_dir, '/', chrom, '.vcf'])
    # The tool's result is expected at <patient>_<chrom>.vcf; rename it to <chrom>.vcf
    os.rename(''.join([work_dir, '/', univ_options['patient'], '_', chrom, '.vcf']), output_file)
    export_results(job, output_file, univ_options, subfolder='mutations/radia')
    output_file = job.fileStore.writeGlobalFile(output_file)
    return output_file
Example #8
0
def run_bwa(job, fastqs, sample_type, univ_options, bwa_options):
    """
    This module aligns the SAMPLE_TYPE dna fastqs to the reference with `bwa mem`.

    ARGUMENTS
    1. fastqs: Indexable pair of input fastqs [<JSid for 1.fastq> , <JSid for 2.fastq>]
    2. sample_type: string prefix used to name the output sam (e.g. 'tumor_dna' or 'normal_dna')
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient id, used in the log message>
                +- 'dockerhub': <dockerhub to use>
    4. bwa_options: Dict of parameters specific to bwa
         bwa_options
              |- 'tool_index': <JSid for the bwa index tarball>
              +- 'n': <number of threads to allocate>

    RETURN VALUES
    1. output_file: <JSid for '<sample_type>_aligned.sam'>

    This module corresponds to nodes 3 and 4 on the tree
    """
    job.fileStore.logToMaster('Running bwa on %s:%s' % (univ_options['patient'], sample_type))
    work_dir = os.getcwd()
    local_files = get_files_from_filestore(
        job,
        {'dna_1.fastq': fastqs[0],
         'dna_2.fastq': fastqs[1],
         'bwa_index.tar.gz': bwa_options['tool_index']},
        work_dir, docker=False)
    # Handle gzipped file: expose each read file under a '.gz' name through a symlink
    gz = ''
    if is_gzipfile(local_files['dna_1.fastq']):
        gz = '.gz'
        for fastq_name in ('dna_1.fastq', 'dna_2.fastq'):
            gz_name = fastq_name + gz
            os.symlink(fastq_name, gz_name)
            local_files[gz_name] = local_files[fastq_name] + gz
    # Untar the index
    local_files['bwa_index'] = untargz(local_files['bwa_index.tar.gz'], work_dir)
    docker_files = dict((name, docker_path(path)) for name, path in local_files.items())

    bwa_args = ['mem',
                '-t', str(bwa_options['n']),
                '-v', '1',  # Don't print INFO messages to the stderr
                '/'.join([docker_files['bwa_index'], 'hg19']),
                docker_files['dna_1.fastq' + gz],
                docker_files['dna_2.fastq' + gz]]
    # docker_call is handed the open file as `outfile` so the alignment lands in the sam
    sam_path = ''.join([work_dir, '/', sample_type, '_aligned.sam'])
    with open(sam_path, 'w') as samfile:
        docker_call(tool='bwa', tool_parameters=bwa_args, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'], outfile=samfile)
    return job.fileStore.writeGlobalFile(sam_path)
Example #9
0
def merge_phlat_calls(job, tumor_phlat, normal_phlat, rna_phlat, univ_options):
    """
    This module will merge the results from running PHLAT on the 3 input fastq
    pairs.

    ARGUMENTS
    1. tumor_phlat: <JSid for tumor DNA called alleles>
    2. normal_phlat: <JSid for normal DNA called alleles>
    3. rna_phlat: <JSid for tumor RNA called alleles>
    4. univ_options: Dict of universal arguments, forwarded to export_results()

    RETURN VALUES
    1. output_files: Dict of JSids for consensus MHCI and MHCII alleles
             output_files
                    |- 'mhci_alleles.list': <JSid>
                    +- 'mhcii_alleles.list': <JSid>

    This module corresponds to node 14 on the tree
    """
    job.fileStore.logToMaster('Merging Phlat calls')
    work_dir = os.getcwd()
    phlat_files = get_files_from_filestore(
        job,
        {'tumor_dna': tumor_phlat,
         'normal_dna': normal_phlat,
         'tumor_rna': rna_phlat},
        work_dir)
    allele_groups = ('HLA_A', 'HLA_B', 'HLA_C', 'HLA_DPA', 'HLA_DQA',
                     'HLA_DPB', 'HLA_DQB', 'HLA_DRB')
    # One (initially empty) list of calls per allele group
    mhc_alleles = dict((group, []) for group in allele_groups)
    with open(phlat_files['tumor_dna'], 'r') as td_file, \
            open(phlat_files['normal_dna'], 'r') as nd_file, \
            open(phlat_files['tumor_rna'], 'r') as tr_file:
        for phlat_handle in (td_file, nd_file, tr_file):
            mhc_alleles = parse_phlat_file(phlat_handle, mhc_alleles)
    # Get most probable alleles for each allele group and print to output
    mhci_path = os.path.join(work_dir, 'mhci_alleles.list')
    mhcii_path = os.path.join(work_dir, 'mhcii_alleles.list')
    with open(mhci_path, 'w') as mhci_file, open(mhcii_path, 'w') as mhcii_file:
        for group in ('HLA_A', 'HLA_B', 'HLA_C'):
            best = most_probable_alleles(mhc_alleles[group])
            print('\n'.join('HLA-' + allele for allele in best), file=mhci_file)
        best_drb = most_probable_alleles(mhc_alleles['HLA_DRB'])
        print('\n'.join('HLA-' + allele for allele in best_drb), file=mhcii_file)
        best_dqa = most_probable_alleles(mhc_alleles['HLA_DQA'])
        best_dqb = most_probable_alleles(mhc_alleles['HLA_DQB'])
        # Every DQA/DQB combination is written as 'HLA-<DQA>/<DQB>'
        for alpha in best_dqa:
            for beta in best_dqb:
                print('HLA-' + alpha + '/' + beta, file=mhcii_file)
    output_files = defaultdict()
    for list_name, list_path in (('mhci_alleles.list', mhci_path),
                                 ('mhcii_alleles.list', mhcii_path)):
        output_files[list_name] = job.fileStore.writeGlobalFile(list_path)
        export_results(job, list_path, univ_options, subfolder='haplotyping')
    return output_files
Example #10
0
def run_phlat(job, fastqs, sample_type, univ_options, phlat_options):
    """
    This module will run PHLAT on SAMPLE_TYPE fastqs.

    ARGUMENTS
    1. fastqs: Indexable pair of input fastqs [<JSid for 1.fastq> , <JSid for 2.fastq>]
    2. sample_type: string passed to PHLAT as its '-tag'; it also prefixes the name of the
                    '<sample_type>_HLA.sum' file that is stored
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient id, used in the log message>
                +- 'dockerhub': <dockerhub to use>
    4. phlat_options: Dict of parameters specific to phlat
         phlat_options
              |- 'tool_index': <JSid for the PHLAT index tarball>
              +- 'n': <number of threads to allocate>

    RETURN VALUES
    1. output_file: <JSid for '<sample_type>_HLA.sum'>

    This module corresponds to nodes 5, 6 and 7 on the tree
    """
    job.fileStore.logToMaster('Running phlat on %s:%s' % (univ_options['patient'], sample_type))
    # Debug output of the tool options
    print(phlat_options, file=sys.stderr)
    work_dir = os.getcwd()
    local_files = get_files_from_filestore(
        job,
        {'input_1.fastq': fastqs[0],
         'input_2.fastq': fastqs[1],
         'phlat_index.tar.gz': phlat_options['tool_index']},
        work_dir, docker=False)
    # Handle gzipped files: expose each read file under a '.gz' name through a symlink
    gz = ''
    if is_gzipfile(local_files['input_1.fastq']):
        gz = '.gz'
        for fastq_name in ('input_1.fastq', 'input_2.fastq'):
            gz_name = fastq_name + gz
            os.symlink(fastq_name, gz_name)
            local_files[gz_name] = local_files[fastq_name] + gz
    # Untar the index
    local_files['phlat_index'] = untargz(local_files['phlat_index.tar.gz'], work_dir)
    docker_files = dict((name, docker_path(path)) for name, path in local_files.items())

    phlat_args = ['-1', docker_files['input_1.fastq' + gz],
                  '-2', docker_files['input_2.fastq' + gz],
                  '-index', docker_files['phlat_index'],
                  '-b2url', '/usr/local/bin/bowtie2',
                  '-tag', sample_type,
                  '-e', '/home/phlat-1.0',  # Phlat directory home
                  '-o', '/data',  # Output directory
                  '-p', str(phlat_options['n'])]  # Number of threads
    docker_call(tool='phlat', tool_parameters=phlat_args, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    summary_path = ''.join([work_dir, '/', sample_type, '_HLA.sum'])
    return job.fileStore.writeGlobalFile(summary_path)
def run_transgene(job, snpeffed_file, rna_bam, univ_options, transgene_options):
    """
    This module will run transgene on the input vcf file from the aggregator and produce the
    peptides for MHC prediction

    ARGUMENTS
    1. snpeffed_file: <JSid for snpeffed vcf>
    2. rna_bam: Dict whose 'rnaAligned.sortedByCoord.out.bam' entry holds the JSids of the RNA
                bam and its index ('rna_fix_pg_sorted.bam' and 'rna_fix_pg_sorted.bam.bai')
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient id, used in the log message>
                +- 'dockerhub': <dockerhub to use>
    4. transgene_options: Dict of parameters specific to transgene
         transgene_options
                +- 'gencode_peptide_fasta': <JSid for the gencode protein fasta>

    RETURN VALUES
    1. output_files: Dict of transgened n-mer peptide fastas and their '.map' files
         output_files
                |- 'transgened_tumor_9_mer_snpeffed.faa': <JSid>
                |- 'transgened_tumor_9_mer_snpeffed.faa.map': <JSid>
                |- 'transgened_tumor_10_mer_snpeffed.faa': <JSid>
                |- 'transgened_tumor_10_mer_snpeffed.faa.map': <JSid>
                |- 'transgened_tumor_15_mer_snpeffed.faa': <JSid>
                +- 'transgened_tumor_15_mer_snpeffed.faa.map': <JSid>

    This module corresponds to node 17 on the tree
    """
    job.fileStore.logToMaster('Running transgene on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    rna_entry = rna_bam['rnaAligned.sortedByCoord.out.bam']
    local_files = get_files_from_filestore(
        job,
        {'snpeffed_muts.vcf': snpeffed_file,
         'rna.bam': rna_entry['rna_fix_pg_sorted.bam'],
         'rna.bam.bai': rna_entry['rna_fix_pg_sorted.bam.bai'],
         'pepts.fa.tar.gz': transgene_options['gencode_peptide_fasta']},
        work_dir, docker=False)
    local_files['pepts.fa'] = untargz(local_files['pepts.fa.tar.gz'], work_dir)
    docker_files = dict((name, docker_path(path)) for name, path in local_files.items())

    transgene_args = ['--peptides', docker_files['pepts.fa'],
                      '--snpeff', docker_files['snpeffed_muts.vcf'],
                      '--rna_file', docker_files['rna.bam'],
                      '--prefix', 'transgened',
                      '--pep_lens', '9,10,15']
    docker_call(tool='transgene', tool_parameters=transgene_args, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    output_files = defaultdict()
    for peplen in ('9', '10', '15'):
        peptfile = '_'.join(['transgened_tumor', peplen, 'mer_snpeffed.faa'])
        mapfile = peptfile + '.map'
        # Both files are exported first (by relative name), then stored in the job store
        for produced in (peptfile, mapfile):
            export_results(job, produced, univ_options, subfolder='peptides')
        for produced in (peptfile, mapfile):
            output_files[produced] = job.fileStore.writeGlobalFile(
                os.path.join(work_dir, produced))
    # The vcf written by transgene is exported under the name mutations.vcf
    os.rename('transgened_transgened.vcf', 'mutations.vcf')
    export_results(job, 'mutations.vcf', univ_options, subfolder='mutations/transgened')
    return output_files
Example #12
0
def add_readgroups(job, bamfile, sample_type, univ_options):
    """
    This module adds the appropriate read groups to the bam file
    ARGUMENTS
    1. bamfile: <JSid for a bam file>
    2. sample_type: string of 'tumor_dna' or 'normal_dna'
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient id, used for the log message and the LB tag>
                |- 'dockerhub': <dockerhub to use>
                +- 'java_Xmx': value for max heap passed to java
    RETURN VALUES
    1. output_file: <JSid for '<sample_type>_aligned_fixpg_sorted_reheader.bam'>

    The input bam is deleted from the job store once the new bam has been written.
    """
    job.fileStore.logToMaster('Running add_read_groups on %s:%s' % (univ_options['patient'],
                                                                    sample_type))
    work_dir = os.getcwd()
    input_files = {
        sample_type + '_aligned_fixpg.bam': bamfile}
    get_files_from_filestore(job, input_files, work_dir, docker=True)
    # The sample name is SAMPLE_TYPE without its '_dna' suffix.  Remove the literal suffix:
    # str.rstrip('_dna') strips any trailing run of '_', 'd', 'n' and 'a' and would, for
    # example, turn 'tumor_rna' into 'tumor_r'.
    suffix = '_dna'
    sample_name = sample_type[:-len(suffix)] if sample_type.endswith(suffix) else sample_type
    parameters = ['AddOrReplaceReadGroups',
                  'CREATE_INDEX=false',
                  'I=/data/' + sample_type + '_aligned_fixpg.bam',
                  'O=/data/' + sample_type + '_aligned_fixpg_sorted_reheader.bam',
                  'SO=coordinate',
                  'ID=1',
                  ''.join(['LB=', univ_options['patient']]),
                  'PL=ILLUMINA',
                  'PU=12345',
                  ''.join(['SM=', sample_name])]
    docker_call(tool='picard', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'], java_opts=univ_options['java_Xmx'])
    output_file = job.fileStore.writeGlobalFile(
        '/'.join([work_dir, sample_type + '_aligned_fixpg_sorted_reheader.bam']))
    # Delete the old bam file
    job.fileStore.deleteGlobalFile(bamfile)
    return output_file
Example #13
0
def run_cutadapt(job, fastqs, univ_options, cutadapt_options):
    """
    This module runs cutadapt on the input RNA fastq files.

    ARGUMENTS
    1. fastqs: List of input RNA-Seq fastqs [<JSid for 1.fastq> , <JSid for 2.fastq>]
    2. univ_options: Dict of universal arguments used by almost all tools
         univ_options
              |- 'patient': <patient id, used in the log message>
              +- 'dockerhub': <dockerhub to use>
    3. cutadapt_options: Dict of parameters specific to cutadapt
         cutadapt_options
              |- 'a': <sequence of 3' adapter to trim from fwd read>
              +- 'A': <sequence of 3' adapter to trim from rev read>
    RETURN VALUES
    1. output_files: List of cutadapted fastqs
         [<JSid for rna_cutadapt_1.fastq.gz>, <JSid for rna_cutadapt_2.fastq.gz>]

    This module corresponds to node 2 on the tree
    """
    job.fileStore.logToMaster('Running cutadapt on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    local_files = get_files_from_filestore(
        job,
        {'rna_1.fastq': fastqs[0],
         'rna_2.fastq': fastqs[1]},
        work_dir, docker=False)
    # Handle gzipped file: expose each read file under a '.gz' name through a symlink
    gz = ''
    if is_gzipfile(local_files['rna_1.fastq']):
        gz = '.gz'
        for fastq_name in ('rna_1.fastq', 'rna_2.fastq'):
            gz_name = fastq_name + gz
            os.symlink(fastq_name, gz_name)
            local_files[gz_name] = local_files[fastq_name] + gz
    docker_files = dict((name, docker_path(path)) for name, path in local_files.items())
    trimmed_names = ['rna_cutadapt_1.fastq.gz', 'rna_cutadapt_2.fastq.gz']
    cutadapt_args = ['-a', cutadapt_options['a'],  # Fwd read 3' adapter
                     '-A', cutadapt_options['A'],  # Rev read 3' adapter
                     '-m', '35',  # Minimum size of read
                     '-o', docker_path(trimmed_names[0]),  # Output for R1
                     '-p', docker_path(trimmed_names[1]),  # Output for R2
                     docker_files['rna_1.fastq' + gz],
                     docker_files['rna_2.fastq' + gz]]
    docker_call(tool='cutadapt', tool_parameters=cutadapt_args, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    return [job.fileStore.writeGlobalFile('/'.join([work_dir, trimmed]))
            for trimmed in trimmed_names]
Example #14
0
def run_rsem(job, rna_bam, univ_options, rsem_options):
    """
    This module will run rsem on the RNA Bam file.

    ARGUMENTS
    1. rna_bam: <JSid of rnaAligned.toTranscriptome.out.bam>
    2. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient id, used in the log message>
                +- 'dockerhub': <dockerhub to use>
    3. rsem_options: Dict of parameters specific to rsem
         rsem_options
              |- 'tool_index': <JSid for the rsem index tarball>
              +- 'n': <number of threads to allocate>

    RETURN VALUES
    1. output_files: Dict of rsem results
         output_files
              |- 'rsem.genes.results': <JSid>
              +- 'rsem.isoforms.results': <JSid>

    This module corresponds to node 9 on the tree
    """
    job.fileStore.logToMaster('Running rsem on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    local_files = get_files_from_filestore(
        job,
        {'star_transcriptome.bam': rna_bam,
         'rsem_index.tar.gz': rsem_options['tool_index']},
        work_dir, docker=False)

    local_files['rsem_index'] = untargz(local_files['rsem_index.tar.gz'], work_dir)
    docker_files = dict((name, docker_path(path)) for name, path in local_files.items())

    # Debug output: directory listing before the run
    print(os.listdir('.'), file=sys.stderr)
    rsem_args = ['--paired-end',
                 '-p', str(rsem_options['n']),
                 '--bam',
                 docker_files['star_transcriptome.bam'],
                 '--no-bam-output',
                 '/'.join([docker_files['rsem_index'], 'hg19']),
                 'rsem']
    print(rsem_args, file=sys.stderr)
    docker_call(tool='rsem', tool_parameters=rsem_args, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    # Debug output: directory listing after the run
    print(os.listdir('.'), file=sys.stderr)
    output_files = {}
    for result_name in ('rsem.genes.results', 'rsem.isoforms.results'):
        result_path = '/'.join([work_dir, result_name])
        output_files[result_name] = job.fileStore.writeGlobalFile(result_path)
        export_results(job, result_path, univ_options, subfolder='expression')
    return output_files
Example #15
0
def unmerge(job, input_vcf, tool_name, tool_options, univ_options):
    """
    Un-merges a vcf file into a file per chromosome.

    Every output file starts with the header lines (those starting with '#') read so far from the
    input.  Chromosomes listed in the genome fai that have no records in the input still get a
    header-only vcf.  Blank lines in the input are ignored.

    :param job: Job object whose fileStore is used for file transfer
    :param str input_vcf: jsID of the input vcf
    :param str tool_name: The name of the mutation caller (used for the export subfolder)
    :param dict tool_options: Tool options; 'genome_fai' (jsID of the fai tarball) is read
    :param dict univ_options: Universal options, forwarded to export_results()
    :returns: dict of jsIDs, one for each chromosomal vcf
    :rtype: dict
    """
    work_dir = os.getcwd()
    input_files = {
        'input.vcf': input_vcf,
        'genome.fa.fai.tar.gz': tool_options['genome_fai']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)

    input_files['genome.fa.fai'] = untargz(input_files['genome.fa.fai.tar.gz'], work_dir)

    chromosomes = chromosomes_from_fai(input_files['genome.fa.fai'])

    header = []
    chrom_vcfs = {}  # chromosome name -> open file handle of its vcf
    try:
        with open(input_files['input.vcf'], 'r') as in_vcf:
            for line in in_vcf:
                if line.startswith('#'):
                    header.append(line)
                    continue
                line = line.strip()
                if not line:
                    # A blank line has no chromosome column; skip it instead of crashing
                    continue
                chrom = line.split()[0]
                if chrom not in chrom_vcfs:
                    # First record for this chromosome: create its file and write the header
                    chrom_vcfs[chrom] = open(os.path.join(work_dir, chrom + '.vcf'), 'w')
                    print(''.join(header), file=chrom_vcfs[chrom], end='')
                print(line, file=chrom_vcfs[chrom])
        # Process chromosomes that had no mutations
        for chrom in set(chromosomes).difference(set(chrom_vcfs)):
            chrom_vcfs[chrom] = open(os.path.join(work_dir, chrom + '.vcf'), 'w')
            print(''.join(header), file=chrom_vcfs[chrom], end='')
    finally:
        # Flush and release every handle, also when parsing failed part-way through
        for chromvcf in chrom_vcfs.values():
            chromvcf.close()
    outdict = {}
    for chrom, chromvcf in chrom_vcfs.items():
        export_results(job, chromvcf.name, univ_options, subfolder='mutations/' + tool_name)
        outdict[chrom] = job.fileStore.writeGlobalFile(chromvcf.name)
    return outdict
Example #16
0
def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options):
    """
    This module will run snpeff on the aggregated mutation calls.  Currently the only mutations
    called are SNPs hence SnpEff suffices. This node will be replaced in the future with another
    translator.

    ARGUMENTS
    1. merged_mutation_file: <JSid for merged vcf>
    2. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient id, used in the log message>
                |- 'dockerhub': <dockerhub to use>
                +- 'java_Xmx': <max heap for java, used when snpeff_options has no value>
    3. snpeff_options: Dict of parameters specific to snpeff
         snpeff_options
                |- 'tool_index': <JSid for the snpEff index tarball>
                +- 'java_Xmx': <max heap for java; a falsy value defers to univ_options>

    RETURN VALUES
    1. output_file: <JSid for the snpeffed vcf>

    This node corresponds to node 16 on the tree
    """
    job.fileStore.logToMaster('Running snpeff on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    local_files = get_files_from_filestore(
        job,
        {'merged_mutations.vcf': merged_mutation_file,
         'snpeff_index.tar.gz': snpeff_options['tool_index']},
        work_dir, docker=False)
    local_files['snpeff_index'] = untargz(local_files['snpeff_index.tar.gz'], work_dir)
    docker_files = dict((name, docker_path(path)) for name, path in local_files.items())

    index_dir = docker_files['snpeff_index']
    snpeff_args = ['eff',
                   '-dataDir', index_dir,
                   '-c', '/'.join([index_dir, 'snpEff_hg19_gencode.config']),
                   '-no-intergenic',
                   '-no-downstream',
                   '-no-upstream',
                   # '-canon',
                   '-noStats',
                   'hg19_gencode',
                   docker_files['merged_mutations.vcf']]
    # The tool-specific heap size wins; fall back to the universal one when it is unset
    xmx = snpeff_options['java_Xmx'] or univ_options['java_Xmx']
    # docker_call is handed the open file as `outfile` so the annotated vcf lands in it
    vcf_path = '/'.join([work_dir, 'mutations.vcf'])
    with open(vcf_path, 'w') as snpeff_file:
        docker_call(tool='snpeff', tool_parameters=snpeff_args, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'], java_opts=xmx, outfile=snpeff_file)
    export_results(job, vcf_path, univ_options, subfolder='mutations/snpeffed')
    return job.fileStore.writeGlobalFile(vcf_path)
Example #17
0
def run_strelka_full(job, tumor_bam, normal_bam, univ_options, strelka_options):
    """
    Run strelka (through docker_call) on the tumor and normal DNA bams and store the passed
    somatic snv and indel vcfs in the job's file store.

    :param job: Job object; its `fileStore` and `cores` attributes are used
    :param dict tumor_bam: Read for 'tumor_dna_fix_pg_sorted.bam' and
                           'tumor_dna_fix_pg_sorted.bam.bai'
    :param dict normal_bam: Read for 'normal_dna_fix_pg_sorted.bam' and
                            'normal_dna_fix_pg_sorted.bam.bai'
    :param dict univ_options: Read for 'patient' (log message) and 'dockerhub'
    :param dict strelka_options: Read for 'genome_fasta', 'genome_fai' and 'strelka_config'
    :returns: dict with the keys 'snvs' and 'indels'; each value is what
              job.fileStore.writeGlobalFile returned for the matching
              strelka_out/results/passed.somatic.<type>.vcf
    :rtype: dict
    """
    job.fileStore.logToMaster('Running strelka on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    requested = {
        'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
        'tumor.bam.bai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
        'normal.bam': normal_bam['normal_dna_fix_pg_sorted.bam'],
        'normal.bam.bai': normal_bam['normal_dna_fix_pg_sorted.bam.bai'],
        'genome.fa.tar.gz': strelka_options['genome_fasta'],
        'genome.fa.fai.tar.gz': strelka_options['genome_fai'],
        'config.ini.tar.gz': strelka_options['strelka_config']
    }
    staged = get_files_from_filestore(job, requested, work_dir, docker=False)

    # The reference, its index and the config are delivered as tarballs; unpack each one and
    # register the unpacked path under the name without the '.tar.gz' suffix.
    for name in ('genome.fa', 'genome.fa.fai', 'config.ini'):
        staged[name] = untargz(staged[name + '.tar.gz'], work_dir)
    in_docker = {name: docker_path(location) for name, location in staged.items()}

    strelka_args = [in_docker['config.ini'],
                    in_docker['tumor.bam'],
                    in_docker['normal.bam'],
                    in_docker['genome.fa'],
                    str(job.cores)]
    docker_call(tool='strelka', tool_parameters=strelka_args, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])

    results_dir = os.path.join(work_dir, 'strelka_out', 'results')
    return {
        mutation_type: job.fileStore.writeGlobalFile(
            os.path.join(results_dir, 'passed.somatic.' + mutation_type + '.vcf'))
        for mutation_type in ('snvs', 'indels')}
Example #18
0
def boost_ranks(job, isoform_expression, merged_mhc_calls, transgene_out, univ_options,
                rank_boost_options):
    """
    Run the rankboost tool (through docker_call) once for MHCI and once for MHCII, then store
    and export whichever concise/detailed result files were produced.

    The work happens in a new directory named after univ_options['patient'] (resolved with
    os.path.abspath); os.mkdir raises if that directory already exists.

    :param job: Job object; its `fileStore` attribute is used
    :param isoform_expression: File store ID staged as 'rsem_quant.tsv'
    :param dict merged_mhc_calls: Read for 'mhci_merged_files.list' and
                                  'mhcii_merged_files.list'
    :param dict transgene_out: Read for 'transgened_tumor_10_mer_snpeffed.faa' and
                               'transgened_tumor_15_mer_snpeffed.faa'
    :param dict univ_options: Read for 'patient' and 'dockerhub'; also passed to export_results
    :param dict rank_boost_options: Read for 'mhci_combo' and 'mhcii_combo'
    :returns: dict holding, for each of 'mhci' and 'mhcii', an empty dict under that key plus
              '<mhc>_merged_files_concise_results.tsv' and
              '<mhc>_merged_files_detailed_results.tsv' mapped to the stored file ID, or to
              None when the file was not found
    :rtype: dict

    This module corresponds to node 21 in the tree
    """
    job.fileStore.logToMaster('Running boost_ranks on %s' % univ_options['patient'])
    work_dir = os.path.abspath(univ_options['patient'])
    os.mkdir(work_dir)
    requested = {
        'rsem_quant.tsv': isoform_expression,
        'mhci_merged_files.tsv': merged_mhc_calls['mhci_merged_files.list'],
        'mhcii_merged_files.tsv': merged_mhc_calls['mhcii_merged_files.list'],
        'mhci_peptides.faa': transgene_out['transgened_tumor_10_mer_snpeffed.faa'],
        'mhcii_peptides.faa': transgene_out['transgened_tumor_15_mer_snpeffed.faa']}
    staged = get_files_from_filestore(job, requested, work_dir, docker=True)

    output_files = {}
    for mhc in ('mhci', 'mhcii'):
        rankboost_args = [mhc,
                          staged[mhc + '_merged_files.tsv'],
                          staged['rsem_quant.tsv'],
                          staged[mhc + '_peptides.faa'],
                          rank_boost_options[mhc + '_combo']]
        docker_call(tool='rankboost', tool_parameters=rankboost_args, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'])

        result_paths = (work_dir + '/' + mhc + '_merged_files_concise_results.tsv',
                        work_dir + '/' + mhc + '_merged_files_detailed_results.tsv')
        # Kept from the original contract: an (unused) empty dict is left under the class name.
        output_files[mhc] = {}
        for result_path in result_paths:
            result_name = os.path.basename(result_path)
            if not os.path.exists(result_path):
                output_files[result_name] = None
                continue
            output_files[result_name] = job.fileStore.writeGlobalFile(result_path)
            export_results(job, result_path, univ_options, subfolder='rankboost')
    return output_files
Example #19
0
def predict_mhcii_binding(job, peptfile, allele, univ_options, mhcii_options):
    """
    Predict MHCII:peptide binding for the peptides in PEPTFILE against ALLELE using the 'mhcii'
    tool (through docker_call), then inspect the predictions to decide what to return.

    The sixth tab-separated column of the first line starting with 'HLA' names the method used.
    If that column is 'NetMHCIIpan', or if no line starts with 'HLA', a child job running
    predict_netmhcii_binding is added and its promised return value is returned instead.

    :param job: Job object; its `fileStore` and `addChildJobFn` are used
    :param peptfile: File store ID staged as 'peptfile.faa'
    :param str allele: The MHCII allele passed to the tool
    :param dict univ_options: Read for 'patient' (log message) and 'dockerhub'
    :param dict mhcii_options: Read for 'pred'
    :returns: either the child job's promised return value, or a tuple of
              (stored predictions file ID, predictor) where predictor is 'Sturniolo' or
              'Consensus'

    This module corresponds to node 19 on the tree
    """
    job.fileStore.logToMaster('Running mhcii on %s:%s' % (univ_options['patient'], allele))
    work_dir = os.getcwd()
    staged = get_files_from_filestore(job, {'peptfile.faa': peptfile}, work_dir, docker=True)
    tool_args = [mhcii_options['pred'],
                 allele,
                 staged['peptfile.faa']]
    predictions_path = '/'.join([work_dir, 'predictions.tsv'])
    with open(predictions_path, 'w') as sink:
        docker_call(tool='mhcii', tool_parameters=tool_args, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'], outfile=sink, interactive=True)

    predictor = None
    needs_netmhciipan = True
    with open(predictions_path, 'r') as predictions:
        for row in predictions:
            if not row.startswith('HLA'):
                continue
            # Only the first 'HLA' row is examined.
            method = row.strip().split('\t')[5]
            if method != 'NetMHCIIpan':
                # Sturniolo output needs to be processed differently downstream, so it is
                # reported separately; every other method is reported as Consensus.
                predictor = 'Sturniolo' if method == 'Sturniolo' else 'Consensus'
                needs_netmhciipan = False
            break

    if not needs_netmhciipan:
        return job.fileStore.writeGlobalFile(predictions_path), predictor
    netmhciipan = job.addChildJobFn(predict_netmhcii_binding, peptfile, allele, univ_options,
                                    disk='100M', memory='100M', cores=1)
    return netmhciipan.rv()
Example #20
0
def run_muse_perchrom(job, tumor_bam, normal_bam, univ_options, muse_options, chrom):
    """
    Run 'muse call' (through docker_call) on the tumor and normal DNA bams for one chromosome.

    :param job: Job object; its `fileStore` attribute is used
    :param dict tumor_bam: Read for 'tumor_dna_fix_pg_sorted.bam' and
                           'tumor_dna_fix_pg_sorted.bam.bai'
    :param dict normal_bam: Read for 'normal_dna_fix_pg_sorted.bam' and
                            'normal_dna_fix_pg_sorted.bam.bai'
    :param dict univ_options: Read for 'patient' (log message) and 'dockerhub'
    :param dict muse_options: Read for 'genome_fasta' and 'genome_fai'
    :param str chrom: Chromosome name; passed to muse as the region (-r) and used as the
                      output prefix
    :returns: Value returned by job.fileStore.writeGlobalFile for <work_dir>/<chrom>.MuSE.txt

    This module corresponds to node 12 on the tree
    """
    job.fileStore.logToMaster('Running muse on %s:%s' % (univ_options['patient'], chrom))
    work_dir = os.getcwd()
    requested = {
        'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
        'tumor.bam.bai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
        'normal.bam': normal_bam['normal_dna_fix_pg_sorted.bam'],
        'normal.bam.bai': normal_bam['normal_dna_fix_pg_sorted.bam.bai'],
        'genome.fa.tar.gz': muse_options['genome_fasta'],
        'genome.fa.fai.tar.gz': muse_options['genome_fai']}
    staged = get_files_from_filestore(job, requested, work_dir, docker=False)

    # Unpack the reference and its index, registering them without the '.tar.gz' suffix.
    for name in ('genome.fa', 'genome.fa.fai'):
        staged[name] = untargz(staged[name + '.tar.gz'], work_dir)
    in_docker = {name: docker_path(location) for name, location in staged.items()}

    output_prefix = os.path.join(work_dir, chrom)
    muse_args = ['call',
                 '-f', in_docker['genome.fa'],
                 '-r', chrom,
                 '-O', docker_path(output_prefix),
                 in_docker['tumor.bam'],
                 in_docker['normal.bam']]
    docker_call(tool='muse', tool_parameters=muse_args, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    return job.fileStore.writeGlobalFile(output_prefix + '.MuSE.txt')
Example #21
0
def run_somaticsniper_full(job, tumor_bam, normal_bam, univ_options, somaticsniper_options):
    """
    Run somaticsniper (through docker_call) on the tumor and normal DNA bams, writing a vcf.

    :param job: Job object; its `fileStore` attribute is used
    :param dict tumor_bam: Read for 'tumor_dna_fix_pg_sorted.bam' and
                           'tumor_dna_fix_pg_sorted.bam.bai'
    :param dict normal_bam: Read for 'normal_dna_fix_pg_sorted.bam' and
                            'normal_dna_fix_pg_sorted.bam.bai'
    :param dict univ_options: Read for 'patient' (log message) and 'dockerhub'
    :param dict somaticsniper_options: Read for 'genome_fasta' and 'genome_fai'
    :returns: Value returned by job.fileStore.writeGlobalFile for
              <work_dir>/somatic-sniper_full.vcf
    """
    job.fileStore.logToMaster('Running somaticsniper on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    requested = {
        'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
        'tumor.bam.bai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
        'normal.bam': normal_bam['normal_dna_fix_pg_sorted.bam'],
        'normal.bam.bai': normal_bam['normal_dna_fix_pg_sorted.bam.bai'],
        'genome.fa.tar.gz': somaticsniper_options['genome_fasta'],
        'genome.fa.fai.tar.gz': somaticsniper_options['genome_fai']}
    staged = get_files_from_filestore(job, requested, work_dir, docker=False)

    # Unpack the reference and its index, registering them without the '.tar.gz' suffix.
    for name in ('genome.fa', 'genome.fa.fai'):
        staged[name] = untargz(staged[name + '.tar.gz'], work_dir)
    in_docker = {name: docker_path(location) for name, location in staged.items()}

    vcf_path = os.path.join(work_dir, 'somatic-sniper_full.vcf')
    sniper_args = ['-f', in_docker['genome.fa'],
                   '-F', 'vcf',
                   '-G',
                   '-L',
                   '-q', '1',
                   '-Q', '15',
                   in_docker['tumor.bam'],
                   in_docker['normal.bam'],
                   docker_path(vcf_path)]
    docker_call(tool='somaticsniper', tool_parameters=sniper_args, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    return job.fileStore.writeGlobalFile(vcf_path)
Example #22
0
def fix_bam_header(job, bamfile, sample_type, univ_options):
    """
    Rewrite the header of BAMFILE so that no @PG line carries a field starting with 'CL', then
    delete the input bam from the file store.

    The header is dumped with 'samtools view -H', edited locally, and applied with
    'samtools reheader'; both samtools steps run through docker_call.

    :param job: Job object; its `fileStore` attribute is used
    :param bamfile: File store ID of the bam; it is deleted from the file store at the end
    :param str sample_type: Prefix used for every file name created here (the original
                            documentation lists 'tumor_dna' or 'normal_dna')
    :param dict univ_options: Read for 'patient' (log message) and 'dockerhub'
    :returns: Value returned by job.fileStore.writeGlobalFile for
              <work_dir>/<sample_type>_aligned_fixPG.bam
    """
    job.fileStore.logToMaster('Running reheader on %s:%s' % (univ_options['patient'], sample_type))
    work_dir = os.getcwd()
    bam_key = sample_type + '_aligned.bam'
    staged = get_files_from_filestore(job, {bam_key: bamfile}, work_dir, docker=True)

    # Step 1: dump the current header to a text file.
    dumped_header = '/'.join([work_dir, sample_type + '_aligned_bam.header'])
    dump_args = ['view',
                 '-H',
                 staged[bam_key]]
    with open(dumped_header, 'w') as sink:
        docker_call(tool='samtools', tool_parameters=dump_args, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'], outfile=sink)

    # Step 2: copy the header, dropping every 'CL...' field from the @PG lines.
    cleaned_header = '/'.join([work_dir, sample_type + '_output_bam.header'])
    with open(dumped_header, 'r') as header_in, open(cleaned_header, 'w') as header_out:
        for raw_line in header_in:
            text = raw_line.strip()
            if raw_line.startswith('@PG'):
                kept_fields = [field for field in text.split('\t')
                               if not field.startswith('CL')]
                text = '\t'.join(kept_fields).strip()
            print(text, file=header_out)

    # Step 3: apply the cleaned header to the bam.
    fixed_bam = '/'.join([work_dir, sample_type + '_aligned_fixPG.bam'])
    reheader_args = ['reheader',
                     docker_path(cleaned_header),
                     staged[bam_key]]
    with open(fixed_bam, 'w') as sink:
        docker_call(tool='samtools', tool_parameters=reheader_args, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'], outfile=sink)
    output_file = job.fileStore.writeGlobalFile(fixed_bam)
    # The input bam is superseded by the reheadered copy, so remove it from the file store.
    job.fileStore.deleteGlobalFile(bamfile)
    return output_file
Example #23
0
def predict_mhci_binding(job, peptfile, allele, peplen, univ_options,
                         mhci_options):
    """
    Predict MHCI:peptide binding for the peptides in PEPTFILE against ALLELE using the 'mhci'
    tool (through docker_call), capturing the tool's output in predictions.tsv.

    :param job: Job object; its `fileStore` attribute is used
    :param peptfile: File store ID staged as 'peptfile.faa'
    :param str allele: The MHCI allele passed to the tool
    :param peplen: Peptide length passed to the tool as its third parameter
    :param dict univ_options: Read for 'patient' (log message) and 'dockerhub'
    :param dict mhci_options: Read for 'pred'
    :returns: Value returned by job.fileStore.writeGlobalFile for <work_dir>/predictions.tsv

    This module corresponds to node 18 on the tree
    """
    job.fileStore.logToMaster('Running mhci on %s:%s:%s' % (univ_options['patient'], allele,
                                                            peplen))
    work_dir = os.getcwd()
    staged = get_files_from_filestore(job, {'peptfile.faa': peptfile}, work_dir, docker=True)
    tool_args = [mhci_options['pred'],
                 allele,
                 peplen,
                 staged['peptfile.faa']]
    predictions_path = '/'.join([work_dir, 'predictions.tsv'])
    with open(predictions_path, 'w') as sink:
        docker_call(tool='mhci', tool_parameters=tool_args, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'], outfile=sink, interactive=True)
    return job.fileStore.writeGlobalFile(predictions_path)
Example #24
0
def run_radia_perchrom(job, bams, univ_options, radia_options, chrom):
    """
    Run radia (through docker_call) on the tumor RNA, tumor DNA and normal DNA bams for one
    chromosome.

    :param job: Job object; its `fileStore` attribute is used
    :param dict bams: Bams and their indexes
        bams
         |- 'tumor_rna': <JSid>
         |- 'tumor_rnai': <JSid>
         |- 'tumor_dna': <JSid>
         |- 'tumor_dnai': <JSid>
         |- 'normal_dna': <JSid>
         +- 'normal_dnai': <JSid>
    :param dict univ_options: Read for 'patient' (log message and radia's first parameter) and
                              'dockerhub'
    :param dict radia_options: Read for 'genome_fasta' and 'genome_fai'
    :param str chrom: Chromosome name; passed to radia and used in the output file names
    :returns: Value returned by job.fileStore.writeGlobalFile for
              <work_dir>/radia_<chrom>.vcf.  The log that radia is told to write
              (radia_<chrom>_radia.log) is not stored.
    """
    job.fileStore.logToMaster('Running radia on %s:%s' % (univ_options['patient'], chrom))
    work_dir = os.getcwd()
    requested = {
        'rna.bam': bams['tumor_rna'],
        'rna.bam.bai': bams['tumor_rnai'],
        'tumor.bam': bams['tumor_dna'],
        'tumor.bam.bai': bams['tumor_dnai'],
        'normal.bam': bams['normal_dna'],
        'normal.bam.bai': bams['normal_dnai'],
        'genome.fa.tar.gz': radia_options['genome_fasta'],
        'genome.fa.fai.tar.gz': radia_options['genome_fai']}
    staged = get_files_from_filestore(job, requested, work_dir, docker=False)

    # Unpack the reference and its index, registering them without the '.tar.gz' suffix.
    for name in ('genome.fa', 'genome.fa.fai'):
        staged[name] = untargz(staged[name + '.tar.gz'], work_dir)
    in_docker = {name: docker_path(location) for name, location in staged.items()}

    radia_output = work_dir + '/radia_' + chrom + '.vcf'
    radia_log = work_dir + '/radia_' + chrom + '_radia.log'
    radia_args = [univ_options['patient'],  # shortID
                  chrom,
                  '-n', in_docker['normal.bam'],
                  '-t', in_docker['tumor.bam'],
                  '-r', in_docker['rna.bam'],
                  '--rnaTumorFasta=' + in_docker['genome.fa'],
                  '-f', in_docker['genome.fa'],
                  '-o', docker_path(radia_output),
                  '-i', 'hg19_M_rCRS',
                  '-m', in_docker['genome.fa'],
                  '-d', '*****@*****.**',
                  '-q', 'Illumina',
                  '--disease', 'CANCER',
                  '-l', 'INFO',
                  '-g', docker_path(radia_log)]
    docker_call(tool='radia', tool_parameters=radia_args, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    return job.fileStore.writeGlobalFile(radia_output)
Example #25
0
def filter_somaticsniper(job, tumor_bam, somaticsniper_output, tumor_pileup, univ_options,
                         somaticsniper_options):
    """
    Filter SomaticSniper calls by chaining the somaticsniper-addons scripts with bam-readcount:
    snpfilter.pl -> prepare_for_readcount.pl -> bam-readcount -> fpfilter.pl ->
    highconfidence.pl.  Each step runs through docker_call.

    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq; read for
                           'tumor_dna_fix_pg_sorted.bam' and 'tumor_dna_fix_pg_sorted.bam.bai'
    :param toil.fileStore.FileID somaticsniper_output: SomaticSniper output vcf
    :param toil.fileStore.FileID tumor_pileup: Pileup generated for the tumor bam
    :param dict univ_options: Read for 'dockerhub' and 'patient' (log message)
    :param dict somaticsniper_options: Read for 'genome_fasta', 'genome_fai', 'version' and
                                       ['bam_readcount']['version']
    :returns: fsID for the filtered genome-level vcf (input.vcf.SNPfilter.fp_pass.hc)
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    requested = {
        'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
        'tumor.bam.bai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
        'input.vcf': somaticsniper_output,
        'pileup.txt': tumor_pileup,
        'genome.fa.tar.gz': somaticsniper_options['genome_fasta'],
        'genome.fa.fai.tar.gz': somaticsniper_options['genome_fai']}
    staged = get_files_from_filestore(job, requested, work_dir, docker=False)

    # Unpack the reference and its index, registering them without the '.tar.gz' suffix.
    for name in ('genome.fa', 'genome.fa.fai'):
        staged[name] = untargz(staged[name + '.tar.gz'], work_dir)
    in_docker = {name: docker_path(location) for name, location in staged.items()}

    def run_addon(script_parameters):
        # Every perl step uses the same image, version and working directory.
        docker_call(tool='somaticsniper-addons', tool_parameters=script_parameters,
                    work_dir=work_dir, dockerhub=univ_options['dockerhub'],
                    tool_version=somaticsniper_options['version'])

    # snpfilter.pl creates /data/input.vcf.SNPfilter
    run_addon(['snpfilter.pl',
               '--snp-file', in_docker['input.vcf'],
               '--indel-file', in_docker['pileup.txt']])

    # prepare_for_readcount.pl creates /data/input.vcf.SNPfilter.pos
    run_addon(['prepare_for_readcount.pl',
               '--snp-file', in_docker['input.vcf'] + '.SNPfilter'])

    # bam-readcount writes the read counts to stdout, which is captured in readcounts.txt
    readcount_args = ['-b', '15',
                      '-f', in_docker['genome.fa'],
                      '-l', in_docker['input.vcf'] + '.SNPfilter.pos',
                      '-w', '1',
                      in_docker['tumor.bam']]
    readcounts_path = os.path.join(work_dir, 'readcounts.txt')
    with open(readcounts_path, 'w') as sink:
        docker_call(tool='bam-readcount', tool_parameters=readcount_args, work_dir=work_dir,
                    dockerhub=univ_options['dockerhub'], outfile=sink,
                    tool_version=somaticsniper_options['bam_readcount']['version'])

    # fpfilter.pl creates input.vcf.SNPfilter.fp_pass and input.vcf.SNPfilter.fp_fail
    run_addon(['fpfilter.pl',
               '--snp-file', in_docker['input.vcf'] + '.SNPfilter',
               '--readcount-file', docker_path(readcounts_path)])

    # highconfidence.pl creates input.vcf.SNPfilter.fp_pass.hc
    run_addon(['highconfidence.pl',
               '--snp-file', in_docker['input.vcf'] + '.SNPfilter.fp_pass'])

    # The result is looked up relative to the current working directory at this point.
    filtered_vcf = os.path.join(os.getcwd(), 'input.vcf.SNPfilter.fp_pass.hc')
    outfile = job.fileStore.writeGlobalFile(filtered_vcf)
    job.fileStore.logToMaster('Filtered SomaticSniper for %s successfully' %
                              univ_options['patient'])
    return outfile
Example #26
0
def merge_mhc_peptide_calls(job, antigen_predictions, transgened_files,
                            univ_options):
    """
    Merge the per-allele MHCI and MHCII binding predictions into one list file per MHC class
    ('mhci_merged_files.list' and 'mhcii_merged_files.list'), then store and export both.

    Only rows whose prediction value is at most 5.00 are written.  Which columns hold the
    allele, peptide, prediction value and binding core depends on the predictor that produced
    the file ('Consensus', 'Sturniolo' or 'netMHCIIpan' for MHCII).

    :param job: Job object; its `fileStore` attribute is used
    :param tuple antigen_predictions: (mhci_preds, mhcii_preds).  mhci_preds maps a name to a
                                      prediction file ID.  mhcii_preds maps a name to a
                                      (prediction file ID, predictor name) pair.
    :param dict transgened_files: Read for 'transgened_tumor_10_mer_snpeffed.faa',
                                  'transgened_tumor_15_mer_snpeffed.faa' and the matching
                                  '.faa.map' entries (the maps are loaded as json)
    :param dict univ_options: Passed through to export_results
    :returns: Mapping from each merged list's file name to the value returned by
              job.fileStore.writeGlobalFile for it
    :rtype: collections.defaultdict
    :raises RuntimeError: If an MHCII predictor name is not one of the three handled ones

    This module corresponds to node 19 on the tree
    """
    job.fileStore.logToMaster('Merging MHC calls')
    work_dir = os.getcwd()
    pept_files = {
        '10_mer.faa':
        transgened_files['transgened_tumor_10_mer_snpeffed.faa'],
        '10_mer.faa.map':
        transgened_files['transgened_tumor_10_mer_snpeffed.faa.map'],
        '15_mer.faa':
        transgened_files['transgened_tumor_15_mer_snpeffed.faa'],
        '15_mer.faa.map':
        transgened_files['transgened_tumor_15_mer_snpeffed.faa.map']
    }
    mhci_preds, mhcii_preds = antigen_predictions
    mhci_files = get_files_from_filestore(job, mhci_preds, work_dir)
    # First split mhcii_preds into prediction files and predictors and maintain keys so we can later
    # reference them in pairs
    mhcii_predictors = {x: y[1] for x, y in mhcii_preds.items()}
    mhcii_files = {x: y[0] for x, y in mhcii_preds.items()}
    mhcii_files = get_files_from_filestore(job, mhcii_files, work_dir)
    # Get peptide files
    pept_files = get_files_from_filestore(job, pept_files, work_dir)

    # Merge MHCI calls
    # Read 10-mer pepts into memory
    peptides = read_peptide_file(pept_files['10_mer.faa'])
    with open(pept_files['10_mer.faa.map'], 'r') as mapfile:
        pepmap = json.load(mapfile)
    # Incorporate peptide names into the merged calls
    with open('/'.join([work_dir, 'mhci_merged_files.list']),
              'w') as mhci_resfile:
        for mhcifile in mhci_files.values():
            with open(mhcifile, 'r') as mf:
                for line in mf:
                    # Skip header lines
                    if not line.startswith('HLA'):
                        continue
                    line = line.strip().split('\t')
                    allele = line[0]
                    pept = line[5]
                    pred = line[7]
                    if float(pred) > 5.00:
                        continue
                    # For MHCI the peptide itself is passed in the slot used for the core.
                    print_mhc_peptide((allele, pept, pred, pept), peptides,
                                      pepmap, mhci_resfile)
    # Merge MHCII calls
    # read 15-mer pepts into memory
    peptides = read_peptide_file(pept_files['15_mer.faa'])
    with open(pept_files['15_mer.faa.map'], 'r') as mapfile:
        pepmap = json.load(mapfile)
    # Incorporate peptide names into the merged calls
    consensus_prefix = 'Consensus('
    with open('/'.join([work_dir, 'mhcii_merged_files.list']), 'w') as \
            mhcii_resfile:
        for mhciifile in mhcii_files.keys():
            core_col = None  # Variable to hold the column number with the core
            if mhcii_predictors[mhciifile] == 'Consensus':
                with open(mhcii_files[mhciifile], 'r') as mf:
                    for line in mf:
                        # Skip header lines
                        if not line.startswith('HLA'):
                            continue
                        line = line.strip().split('\t')
                        allele = line[0]
                        pept = line[4]
                        pred = line[6]
                        if core_col:
                            # The core column was already identified on an earlier row.
                            core = line[core_col]
                        else:
                            # Remove the literal 'Consensus(' prefix.  str.lstrip cannot be
                            # used here: it strips a *set of characters*, which also eats the
                            # start of method names such as 'netMHCIIpan' or 'smm'.
                            methods = line[5]
                            if methods.startswith(consensus_prefix):
                                methods = methods[len(consensus_prefix):]
                            methods = methods.rstrip(')').split(',')
                            if 'NN' in methods:
                                core_col = 13
                            elif 'netMHCIIpan' in methods:
                                core_col = 17
                            elif 'Sturniolo' in methods:
                                core_col = 19
                            elif 'SMM' in methods:
                                core_col = 10
                            core = line[core_col] if core_col else 'NOCORE'
                        if float(pred) > 5.00:
                            continue
                        print_mhc_peptide((allele, pept, pred, core), peptides,
                                          pepmap, mhcii_resfile)
            elif mhcii_predictors[mhciifile] == 'Sturniolo':
                with open(mhcii_files[mhciifile], 'r') as mf:
                    for line in mf:
                        # Skip header lines
                        if not line.startswith('HLA'):
                            continue
                        line = line.strip().split('\t')
                        allele = line[0]
                        pept = line[5]
                        pred = line[6]
                        core = line[19]
                        if float(pred) > 5.00:
                            continue
                        print_mhc_peptide((allele, pept, pred, core), peptides,
                                          pepmap, mhcii_resfile)
            elif mhcii_predictors[mhciifile] == 'netMHCIIpan':
                with open(mhcii_files[mhciifile], 'r') as mf:
                    # Get the allele from the first line and skip the second line
                    allele = re.sub('-DQB', '/DQB', mf.readline().strip())
                    _ = mf.readline()
                    for line in mf:
                        line = line.strip().split('\t')
                        pept = line[1]
                        pred = line[5]
                        core = 'NOCORE'
                        peptide_name = line[2]
                        if float(pred) > 5.00:
                            continue
                        # This branch writes the row directly instead of going through
                        # print_mhc_peptide; the peptide name is read from the file itself.
                        print(allele,
                              pept,
                              peptide_name,
                              core,
                              '0',
                              pred,
                              pepmap[peptide_name],
                              sep='\t',
                              file=mhcii_resfile)
            else:
                raise RuntimeError('Shouldn\'t ever see this!!!')
    output_files = defaultdict()
    for mhc_file in [mhci_resfile.name, mhcii_resfile.name]:
        output_files[os.path.split(mhc_file)
                     [1]] = job.fileStore.writeGlobalFile(mhc_file)
        export_results(job,
                       output_files[os.path.split(mhc_file)[1]],
                       mhc_file,
                       univ_options,
                       subfolder='binding_predictions')
    return output_files
Example #27
0
def spawn_antigen_predictors(job, transgened_files, phlat_files, univ_options,
                             mhc_options):
    """
    Spawn one child job per supported allele (and peptide length, for MHCI) to predict
    MHC:peptide binding on the 9-, 10- and 15-mer peptide files, and return the promised
    results.

    An MHCI allele/length pair is skipped when the MHCI restrictions json has no truthy entry
    for it.  An MHCII allele is skipped when it is not listed in the MHCII restrictions json
    under mhcii_options['pred'].

    ARGUMENTS
    1. transgened_files: Dict read for 'transgened_tumor_9_mer_snpeffed.faa',
                         'transgened_tumor_10_mer_snpeffed.faa' and
                         'transgened_tumor_15_mer_snpeffed.faa'
    2. phlat_files: Dict read for 'mhci_alleles.list' and 'mhcii_alleles.list' (one allele per
                    line)
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': used in the log message
                +- (passed unchanged to the child jobs)
    4. mhc_options: Two-item sequence unpacked as (mhci_options, mhcii_options)
         mhci_options
              |- 'method_file': <JSid for a tarball holding the json linking alleles,
              |                  peptide lengths, and prediction methods>
              +- (passed unchanged to predict_mhci_binding)
         mhcii_options
              |- 'method_file': <JSid for a tarball holding the json linking prediction
              |                  methods and alleles>
              +- 'pred': String describing prediction method to use

    RETURN VALUES
    1. tuple of (mhci_preds, mhcii_preds)
         mhci_preds: Dict of promised return values of predict_mhci_binding
             mhci_preds
                |- <allele 1>_9_mer_mer.pred: <PromisedJobReturnValue>
                |- <allele 1>_10_mer_mer.pred: <PromisedJobReturnValue>
                |
                ..
                +- <allele n>_10_mer_mer.pred: <PromisedJobReturnValue>
         mhcii_preds: Dict of promised return values of predict_mhcii_binding
             mhcii_preds
                |- <allele 1>_15_mer.pred: <PromisedJobReturnValue>
                |
                ..
                +- <allele n>_15_mer.pred: <PromisedJobReturnValue>
       In every key, characters of the allele other than A-Z and 0-9 are replaced by '_'.

    This module corresponds to node 18 on the tree
    """
    job.fileStore.logToMaster('Running spawn_anti on %s' %
                              univ_options['patient'])
    work_dir = os.getcwd()
    mhci_options, mhcii_options = mhc_options
    pept_files = {
        '9_mer.faa': transgened_files['transgened_tumor_9_mer_snpeffed.faa'],
        '10_mer.faa': transgened_files['transgened_tumor_10_mer_snpeffed.faa'],
        '15_mer.faa': transgened_files['transgened_tumor_15_mer_snpeffed.faa']
    }
    requested = {
        'mhci_alleles.list': phlat_files['mhci_alleles.list'],
        'mhcii_alleles.list': phlat_files['mhcii_alleles.list'],
        'mhci_restrictions.json.tar.gz': mhci_options['method_file'],
        'mhcii_restrictions.json.tar.gz': mhcii_options['method_file']
    }
    local_files = get_files_from_filestore(job, requested, work_dir)
    for name in ('mhci_restrictions.json', 'mhcii_restrictions.json'):
        local_files[name] = untargz(local_files[name + '.tar.gz'], work_dir)

    # One allele per line in each list file.
    with open(local_files['mhci_alleles.list'], 'r') as allele_list:
        mhci_alleles = [entry.strip() for entry in allele_list]
    with open(local_files['mhcii_alleles.list'], 'r') as allele_list:
        mhcii_alleles = [entry.strip() for entry in allele_list]
    # The restriction files describe which allele:peptide-length combinations each prediction
    # type supports.
    with open(local_files['mhci_restrictions.json'], 'r') as restrictions:
        mhci_restrictions = json.load(restrictions)
    with open(local_files['mhcii_restrictions.json'], 'r') as restrictions:
        mhcii_restrictions = json.load(restrictions)

    # Anything that is not an upper-case letter or a digit becomes '_' in result names.
    not_allele_char = re.compile('[^A-Z0-9]')

    # MHCI: one child job per supported (allele, peptide length) pair.
    mhci_preds = {}
    for allele in mhci_alleles:
        safe_allele = not_allele_char.sub('_', allele)
        for peptfile in ('9_mer.faa', '10_mer.faa'):
            peplen = peptfile.split('_')[0]
            # Unknown alleles or lengths are silently skipped.
            try:
                supported = mhci_restrictions[allele][peplen]
            except KeyError:
                continue
            if not supported:
                continue
            # peptfile[:-4] already ends in '_mer', so the key carries '_mer' twice.
            predfile = safe_allele + '_' + peptfile[:-4] + '_mer.pred'
            child = job.addChildJobFn(predict_mhci_binding,
                                      pept_files[peptfile],
                                      allele,
                                      peplen,
                                      univ_options,
                                      mhci_options,
                                      disk='100M',
                                      memory='100M',
                                      cores=1)
            mhci_preds[predfile] = child.rv()

    # MHCII: one child job per allele supported by the chosen prediction method.
    mhcii_preds = {}
    for allele in mhcii_alleles:
        if allele not in mhcii_restrictions[mhcii_options['pred']]:
            continue
        predfile = not_allele_char.sub('_', allele) + '_15_mer.pred'
        child = job.addChildJobFn(predict_mhcii_binding,
                                  pept_files['15_mer.faa'],
                                  allele,
                                  univ_options,
                                  mhcii_options,
                                  disk='100M',
                                  memory='100M',
                                  cores=1)
        mhcii_preds[predfile] = child.rv()
    return mhci_preds, mhcii_preds
Example #28
0
def spawn_antigen_predictors(job, transgened_files, phlat_files, univ_options, mhc_options):
    """
    Spawn one child job per usable allele:peptide file combination to predict MHCI:peptide and
    MHCII:peptide binding, and hand back the promised return values of those children.

    The alleles are read, one per line, from the two allele lists in phlat_files.  An MHCI
    allele:peptide length pair is only submitted if the MHCI restrictions json holds a truthy
    entry for it.  An MHCII allele is only submitted if it is listed under the selected MHCII
    prediction method in the MHCII restrictions json.

    ARGUMENTS
    1. transgened_files: REFER RETURN VALUE of run_transgene().  Keys read here:
         transgened_files
              |- 'transgened_tumor_9_mer_snpeffed.faa'
              |- 'transgened_tumor_10_mer_snpeffed.faa'
              +- 'transgened_tumor_15_mer_snpeffed.faa'
    2. phlat_files: REFER RETURN VALUE of merge_phlat_calls().  Keys read here:
         phlat_files
              |- 'mhci_alleles.list'
              +- 'mhcii_alleles.list'
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                +- 'patient': Used in the log message
    4. mhc_options: Pair that is unpacked as (mhci_options, mhcii_options)
         mhci_options
              +- 'method_file': <JSid for tarball of the json linking alleles and peptide
                                 lengths to prediction support>
         mhcii_options
              |- 'method_file': <JSid for tarball of the json linking prediction methods
              |                  to alleles>
              +- 'pred': String naming the prediction method to look up in the json

    RETURN VALUES
    1. tuple of (mhci_preds, mhcii_preds)
         mhci_preds: Dict of promised return values of predict_mhci_binding
             mhci_preds
                |- <stripped allele 1>_9_mer_mer.pred: <PromisedJobReturnValue>
                |- <stripped allele 1>_10_mer_mer.pred: <PromisedJobReturnValue>
                |
                ..
                +- <stripped allele n>_10_mer_mer.pred: <PromisedJobReturnValue>
         mhcii_preds: Dict of promised return values of predict_mhcii_binding
             mhcii_preds
                |- <stripped allele 1>_15_mer.pred: <PromisedJobReturnValue>
                |
                ..
                +- <stripped allele n>_15_mer.pred: <PromisedJobReturnValue>

    This module corresponds to node 18 on the tree
    """
    job.fileStore.logToMaster('Running spawn_anti on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    mhci_options, mhcii_options = mhc_options
    pept_files = {
        '9_mer.faa': transgened_files['transgened_tumor_9_mer_snpeffed.faa'],
        '10_mer.faa': transgened_files['transgened_tumor_10_mer_snpeffed.faa'],
        '15_mer.faa': transgened_files['transgened_tumor_15_mer_snpeffed.faa']}
    input_files = get_files_from_filestore(
        job,
        {'mhci_alleles.list': phlat_files['mhci_alleles.list'],
         'mhcii_alleles.list': phlat_files['mhcii_alleles.list'],
         'mhci_restrictions.json.tar.gz': mhci_options['method_file'],
         'mhcii_restrictions.json.tar.gz': mhcii_options['method_file']},
        work_dir)
    for json_name in ('mhci_restrictions.json', 'mhcii_restrictions.json'):
        input_files[json_name] = untargz(input_files[json_name + '.tar.gz'], work_dir)

    # One allele per line; surrounding whitespace is dropped.
    with open(input_files['mhci_alleles.list'], 'r') as allele_fh:
        mhci_alleles = [entry.strip() for entry in allele_fh]
    with open(input_files['mhcii_alleles.list'], 'r') as allele_fh:
        mhcii_alleles = [entry.strip() for entry in allele_fh]

    # The restriction files describe which allele:peptide length combinations are supported by
    # each prediction type.
    with open(input_files['mhci_restrictions.json'], 'r') as json_fh:
        mhci_restrictions = json.load(json_fh)
    with open(input_files['mhcii_restrictions.json'], 'r') as json_fh:
        mhcii_restrictions = json.load(json_fh)

    # Anything that is not an upper case letter or a digit in an HLA name becomes an underscore
    non_alnum = re.compile('[^A-Z0-9]')

    # MHCI: one child per allele and peptide file, keyed by the name of the prediction file.
    mhci_preds = {}
    for allele in mhci_alleles:
        allele_tag = non_alnum.sub('_', allele)
        for peptfile in ('9_mer.faa', '10_mer.faa'):
            peplen = peptfile.split('_')[0]
            # Skip combinations that are missing from, or falsy in, the restrictions
            try:
                supported = mhci_restrictions[allele][peplen]
            except KeyError:
                continue
            if not supported:
                continue
            predfile = allele_tag + '_' + peptfile[:-4] + '_mer.pred'
            child = job.addChildJobFn(predict_mhci_binding, pept_files[peptfile], allele, peplen,
                                      univ_options, mhci_options, disk='100M', memory='100M',
                                      cores=1)
            mhci_preds[predfile] = child.rv()

    # MHCII: one child per allele that the chosen prediction method lists.
    mhcii_preds = {}
    for allele in mhcii_alleles:
        allele_tag = non_alnum.sub('_', allele)
        if allele not in mhcii_restrictions[mhcii_options['pred']]:
            continue
        predfile = allele_tag + '_15_mer.pred'
        child = job.addChildJobFn(predict_mhcii_binding, pept_files['15_mer.faa'], allele,
                                  univ_options, mhcii_options, disk='100M', memory='100M',
                                  cores=1)
        mhcii_preds[predfile] = child.rv()
    return mhci_preds, mhcii_preds
Example #29
0
def assess_mhc_genes(job, isoform_expression, rna_haplotype, univ_options, mhc_genes_options):
    """
    This module will assess the prevalence of the various genes in the MHC pathway and return a
    report in a fixed-width text format.

    Every section of the MHC genes json gets a header followed by one row per gene with the
    columns Gene, Threshold, Observed and Result.  The 'MHCI loading' and 'MHCII loading' sections
    additionally get one row per HLA locus, reporting how many alleles were parsed from the RNA
    haplotype (0 -> FAIL, 1 -> LOW, otherwise PASS).  Gene rows report LOW when the summed TPM of
    the gene is less than or equal to the threshold, and PASS otherwise.

    :param isoform_expression: Isoform expression from run_rsem
    :param rna_haplotype: PHLAT output from running on rna
    :param univ_options: Universal options for the pipeline ('patient' is used for logging)
    :param mhc_genes_options: options specific to this module ('genes_file' is the tarball of the
                              json describing the sections and their genes)
    :return: Value returned by job.fileStore.writeGlobalFile for mhc_pathway_report.txt
    :raises AssertionError: If the header of the rsem file is not the expected list of columns
    """
    job.fileStore.logToMaster('Running mhc gene assessment on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    input_files = {
        'rsem_quant.tsv': isoform_expression,
        'rna_haplotype.sum': rna_haplotype,
        'mhc_genes.json.tar.gz': mhc_genes_options['genes_file']}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)

    input_files['mhc_genes.json'] = untargz(input_files['mhc_genes.json.tar.gz'], work_dir)

    # Read in the MHC genes
    with open(input_files['mhc_genes.json']) as mhc_file:
        mhc_genes = json.load(mhc_file)

    # Parse the rna phlat file
    with open(input_files['rna_haplotype.sum']) as rna_mhc:
        mhc_alleles = {'HLA_A': [], 'HLA_B': [], 'HLA_C': [], 'HLA_DPA': [], 'HLA_DQA': [],
                       'HLA_DPB': [], 'HLA_DQB': [], 'HLA_DRB': []}
        mhc_alleles = parse_phlat_file(rna_mhc, mhc_alleles)

    # Process the isoform expressions: sum the TPM (column 6) of all isoforms per gene (column 2)
    gene_expressions = Counter()
    with open(input_files['rsem_quant.tsv']) as rsem_file:
        line = rsem_file.readline()
        line = line.strip().split()
        assert line == ['transcript_id', 'gene_id', 'length', 'effective_length', 'expected_count',
                        'TPM', 'FPKM', 'IsoPct']
        for line in rsem_file:
            line = line.strip().split()
            gene_expressions[line[1]] += float(line[5])

    with open(os.path.join(work_dir, 'mhc_pathway_report.txt'), 'w') as mpr:
        for section in mhc_genes:
            print(section.center(48, ' '), file=mpr)
            print("{:12}{:12}{:12}{:12}".format("Gene", "Threshold", "Observed", "Result"),
                  file=mpr)
            if section == 'MHCI loading':
                for mhci_allele in 'HLA_A', 'HLA_B', 'HLA_C':
                    num_alleles = len(mhc_alleles[mhci_allele])
                    print("{:12}{:12}{:12}{:12}".format(mhci_allele, '2', num_alleles,
                                                        'FAIL' if num_alleles == 0
                                                        else 'LOW' if num_alleles == 1
                                                        else 'PASS'), file=mpr)
            elif section == 'MHCII loading':
                # TODO DP alleles
                for mhcii_allele in ('HLA_DQA', 'HLA_DQB', 'HLA_DRA', 'HLA_DRB'):
                    if mhcii_allele != 'HLA_DRA':
                        num_alleles = len(mhc_alleles[mhcii_allele])
                        print("{:12}{:12}{:12}{:12}".format(mhcii_allele, 2, num_alleles,
                                                            'FAIL' if num_alleles == 0 else
                                                            'LOW' if num_alleles == 1 else
                                                            'PASS'), file=mpr)
                    else:
                        # FIXME This is hardcoded for now. We need to change this.
                        # HLA_DRA is judged by expression and not by allele count.  The threshold
                        # goes in the second column and the observed value in the third, like
                        # every other row under the header printed above.
                        dra_observed = gene_expressions['ENSG00000204287.9']
                        print("{:12}{:<12}{:<12}{:12}".format(
                                    'HLA_DRA', '69.37', dra_observed,
                                    'LOW' if dra_observed <= 69.37 else 'PASS'), file=mpr)
            for gene, ensgene, first_quart in mhc_genes[section]:
                result = 'LOW' if gene_expressions[ensgene] <= float(first_quart) else 'PASS'
                print("{:12}{:<12}{:<12}{:12}".format(gene, float(first_quart),
                                                      gene_expressions[ensgene], result), file=mpr)
            print('', file=mpr)
    export_results(job, mpr.name, univ_options, subfolder='reports')
    output_file = job.fileStore.writeGlobalFile(mpr.name)
    return output_file
Example #30
0
def run_mutect_perchrom(job, tumor_bam, normal_bam, univ_options,
                        mutect_options, chrom):
    """
    Call mutations with MuTect on one chromosome of the tumor and normal DNA bams.

    :param dict tumor_bam: Dict holding 'tumor_dna_fix_pg_sorted.bam' and its '.bai'
    :param dict normal_bam: Dict holding 'normal_dna_fix_pg_sorted.bam' and its '.bai'
    :param dict univ_options: Dict of universal options used by almost all tools.  'dockerhub' and
                              'patient' are read, and 'java_Xmx' is the fallback heap setting.
    :param dict mutect_options: Options specific to MuTect (reference, cosmic and dbsnp files,
                                'java_Xmx' and 'version')
    :param str chrom: Chromosome to process
    :return: fsID for the chromosome vcf
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    input_files = get_files_from_filestore(
        job,
        {'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
         'tumor.bam.bai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
         'normal.bam': normal_bam['normal_dna_fix_pg_sorted.bam'],
         'normal.bam.bai': normal_bam['normal_dna_fix_pg_sorted.bam.bai'],
         'genome.fa.tar.gz': mutect_options['genome_fasta'],
         'genome.fa.fai.tar.gz': mutect_options['genome_fai'],
         'genome.dict.tar.gz': mutect_options['genome_dict'],
         'cosmic.vcf.tar.gz': mutect_options['cosmic_vcf'],
         'cosmic.vcf.idx.tar.gz': mutect_options['cosmic_idx'],
         'dbsnp.vcf.gz': mutect_options['dbsnp_vcf'],
         'dbsnp.vcf.idx.tar.gz': mutect_options['dbsnp_idx']},
        work_dir, docker=False)
    # dbsnp.vcf should be bgzipped, but all others should be tar.gz'd
    input_files['dbsnp.vcf'] = gunzip(input_files['dbsnp.vcf.gz'])
    tarballed = ('genome.fa', 'genome.fa.fai', 'genome.dict', 'cosmic.vcf', 'cosmic.vcf.idx',
                 'dbsnp.vcf.idx')
    for name in tarballed:
        input_files[name] = untargz(input_files[name + '.tar.gz'], work_dir)
    # From here on every path is in the form returned by docker_path
    input_files = {name: docker_path(location) for name, location in input_files.items()}

    mutout = work_dir + '/' + chrom + '.out'
    mutvcf = work_dir + '/' + chrom + '.vcf'
    parameters = ['-R', input_files['genome.fa'],
                  '--cosmic', input_files['cosmic.vcf'],
                  '--dbsnp', input_files['dbsnp.vcf'],
                  '--input_file:normal', input_files['normal.bam'],
                  '--input_file:tumor', input_files['tumor.bam'],
                  # '--tumor_lod', str(10),
                  # '--initial_tumor_lod', str(4.0),
                  '-L', chrom,
                  '--out', docker_path(mutout),
                  '--vcf', docker_path(mutvcf)]
    # The tool specific heap setting wins; a falsy value falls back to the universal one
    java_xmx = mutect_options['java_Xmx'] or univ_options['java_Xmx']
    docker_call(tool='mutect', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'], java_xmx=java_xmx,
                tool_version=mutect_options['version'])
    output_file = job.fileStore.writeGlobalFile(mutvcf)
    export_results(job, output_file, mutvcf, univ_options, subfolder='mutations/mutect')
    job.fileStore.logToMaster(
        'Ran MuTect on %s:%s successfully' % (univ_options['patient'], chrom))
    return output_file
Example #31
0
def run_star(job, fastqs, univ_options, star_options):
    """
    Run STAR (or STARlong) on a pair of RNA fastqs and store the outputs in the file store.

    :param list fastqs: The input fastqs for alignment (first and second read file)
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict star_options: Options specific to star ('type', 'index', 'n' and 'version')
    :return: Dict containing output transcriptome bam, genome bam, and chimeric junctions
                 output_files:
                    |- 'rnaAligned.toTranscriptome.out.bam': fsID
                    |- 'rnaAligned.out.bam': fsID
                    +- 'rnaChimeric.out.junction': fsID
    :rtype: dict
    """
    assert star_options['type'] in ('star', 'starlong')
    work_dir = os.getcwd()
    input_files = get_files_from_filestore(
        job,
        {'rna_cutadapt_1.fastq': fastqs[0],
         'rna_cutadapt_2.fastq': fastqs[1],
         'star_index.tar.gz': star_options['index']},
        work_dir, docker=False)

    # Gzipped reads get a '.gz' suffixed symlink and a matching '.gz' suffixed entry
    gz = '.gz' if is_gzipfile(input_files['rna_cutadapt_1.fastq']) else ''
    if gz:
        for fastq_name in ('rna_cutadapt_1.fastq', 'rna_cutadapt_2.fastq'):
            os.symlink(fastq_name, fastq_name + gz)
            input_files[fastq_name + gz] = input_files[fastq_name] + gz

    # Untar the index
    input_files['star_index'] = untargz(input_files['star_index.tar.gz'], work_dir)

    # A STAR-Fusion index keeps the actual STAR index in a subdirectory; use it when present
    fusion_index = os.path.join(input_files['star_index'], 'ref_genome.fa.star.idx')
    if os.path.exists(fusion_index):
        input_files['star_index'] = fusion_index

    input_files = {name: docker_path(location, work_dir=work_dir)
                   for name, location in input_files.items()}

    # Using recommended STAR-Fusion parameters:
    # https://github.com/STAR-Fusion/STAR-Fusion/wiki
    # NOTE(review): the literal 'parameter' after --chimSegmentReadGapMax is passed through
    # unchanged from the previous implementation -- confirm STAR accepts it.
    parameters = ['--runThreadN', str(star_options['n']),
                  '--genomeDir', input_files['star_index'],
                  '--twopassMode', 'Basic',
                  '--outReadsUnmapped', 'None',
                  '--chimSegmentMin', '12',
                  '--chimJunctionOverhangMin', '12',
                  '--alignSJDBoverhangMin', '10',
                  '--alignMatesGapMax', '200000',
                  '--alignIntronMax', '200000',
                  '--chimSegmentReadGapMax', 'parameter', '3',
                  '--alignSJstitchMismatchNmax', '5', '-1', '5', '5',
                  '--outFileNamePrefix', 'rna',
                  '--readFilesIn',
                  input_files['rna_cutadapt_1.fastq' + gz],
                  input_files['rna_cutadapt_2.fastq' + gz],
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outSAMtype', 'BAM', 'Unsorted',
                  '--quantMode', 'TranscriptomeSAM']
    if gz:
        parameters.extend(['--readFilesCommand', 'zcat'])

    # Anything other than 'star' is run with the 'starlong' tool
    star_tool = 'star' if star_options['type'] == 'star' else 'starlong'
    docker_call(tool=star_tool, tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'], tool_version=star_options['version'])

    output_files = defaultdict()
    produced = ('rnaAligned.toTranscriptome.out.bam', 'rnaAligned.out.bam',
                'rnaChimeric.out.junction')
    for produced_name in produced:
        output_files[produced_name] = job.fileStore.writeGlobalFile(
            work_dir + '/' + produced_name)
    export_results(job, output_files['rnaAligned.toTranscriptome.out.bam'],
                   'rna_transcriptome.bam', univ_options, subfolder='alignments')
    export_results(job, output_files['rnaChimeric.out.junction'], 'rna_chimeric.junction',
                   univ_options, subfolder='mutations/fusions')
    job.fileStore.logToMaster('Ran STAR on %s successfully' % univ_options['patient'])
    return output_files
Example #32
0
def filter_somaticsniper(job, tumor_bam, somaticsniper_output, tumor_pileup, univ_options,
                         somaticsniper_options):
    """
    Filter the somaticsniper calls of a single chromosome.

    The calls are passed through snpfilter.pl, prepare_for_readcount.pl, bam-readcount,
    fpfilter.pl and highconfidence.pl in that order, and the final '.fp_pass.hc' file is stored.

    :param toil.Job job: Job
    :param dict tumor_bam: Tumor bam file and its bai
    :param str somaticsniper_output: jsID from somatic sniper
    :param str tumor_pileup: jsID for pileup file for this chromosome
    :param dict univ_options: Universal options ('patient' and 'dockerhub' are read)
    :param dict somaticsniper_options: Options specific to Somatic Sniper ('genome_fasta' and
                                       'genome_fai' are read)
    :returns: filtered chromosome vcf
    :rtype: str
    """
    job.fileStore.logToMaster('Filtering somaticsniper for %s' % univ_options['patient'])
    work_dir = os.getcwd()
    input_files = get_files_from_filestore(
        job,
        {'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
         'tumor.bam.bai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
         'input.vcf': somaticsniper_output,
         'pileup.txt': tumor_pileup,
         'genome.fa.tar.gz': somaticsniper_options['genome_fasta'],
         'genome.fa.fai.tar.gz': somaticsniper_options['genome_fai']},
        work_dir, docker=False)

    for name in ('genome.fa', 'genome.fa.fai'):
        input_files[name] = untargz(input_files[name + '.tar.gz'], work_dir)
    input_files = {name: docker_path(location) for name, location in input_files.items()}

    def run_addon(script_parameters):
        """Run one of the perl scripts shipped in the somaticsniper-addons tool."""
        docker_call(tool='somaticsniper-addons', tool_parameters=script_parameters,
                    work_dir=work_dir, dockerhub=univ_options['dockerhub'])

    # snpfilter.pl creates /data/input.vcf.SNPfilter
    run_addon(['snpfilter.pl',
               '--snp-file', input_files['input.vcf'],
               '--indel-file', input_files['pileup.txt']])

    # prepare_for_readcount.pl creates /data/input.vcf.SNPfilter.pos
    snpfiltered = input_files['input.vcf'] + '.SNPfilter'
    run_addon(['prepare_for_readcount.pl', '--snp-file', snpfiltered])

    # bam-readcount writes the read counts to the file handle passed in as outfile
    readcount_parameters = ['-b', '15',
                            '-f', input_files['genome.fa'],
                            '-l', input_files['input.vcf'] + '.SNPfilter.pos',
                            '-w', '1',
                            input_files['tumor.bam']]
    with open(os.path.join(work_dir, 'readcounts.txt'), 'w') as readcounts_file:
        docker_call(tool='bam-readcount', tool_parameters=readcount_parameters,
                    work_dir=work_dir, dockerhub=univ_options['dockerhub'],
                    outfile=readcounts_file)

    # fpfilter.pl creates input.vcf.SNPfilter.fp_pass and input.vcf.SNPfilter.fp_fail
    run_addon(['fpfilter.pl',
               '--snp-file', input_files['input.vcf'] + '.SNPfilter',
               '--readcount-file', docker_path(readcounts_file.name)])

    # highconfidence.pl creates input.vcf.SNPfilter.fp_pass.hc
    run_addon(['highconfidence.pl',
               '--snp-file', input_files['input.vcf'] + '.SNPfilter.fp_pass'])

    high_confidence_vcf = os.path.join(os.getcwd(), 'input.vcf.SNPfilter.fp_pass.hc')
    outfile = job.fileStore.writeGlobalFile(high_confidence_vcf)
    return outfile
Example #33
0
def run_radia_perchrom(job, bams, univ_options, radia_options, chrom):
    """
    This module will run radia on the RNA and DNA bams for a single chromosome

    ARGUMENTS
    1. bams: Dict of bams and their indexes
        bams
         |- 'tumor_rna': <JSid>
         |- 'tumor_rnai': <JSid>
         |- 'tumor_dna': <JSid>
         |- 'tumor_dnai': <JSid>
         |- 'normal_dna': <JSid>
         +- 'normal_dnai': <JSid>
    2. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                |- 'patient': <patient name, passed to radia and used for logging>
                +- 'dockerhub': <dockerhub to use>
    3. radia_options: Dict of parameters specific to radia
         radia_options
              |- 'genome_fasta': <JSid for the tarball of the genome fasta>
              +- 'genome_fai': <JSid for the tarball of the genome fasta index>
    4. chrom: String containing chromosome name with chr appended

    RETURN VALUES
    1. output_file: <JSid> of radia_CHROM.vcf.  The log (radia_CHROM_radia.log) is requested from
                    radia in the working directory but is not stored.
    """
    job.fileStore.logToMaster(
        'Running radia on %s:%s' % (univ_options['patient'], chrom))
    work_dir = os.getcwd()
    input_files = get_files_from_filestore(
        job,
        {'rna.bam': bams['tumor_rna'],
         'rna.bam.bai': bams['tumor_rnai'],
         'tumor.bam': bams['tumor_dna'],
         'tumor.bam.bai': bams['tumor_dnai'],
         'normal.bam': bams['normal_dna'],
         'normal.bam.bai': bams['normal_dnai'],
         'genome.fa.tar.gz': radia_options['genome_fasta'],
         'genome.fa.fai.tar.gz': radia_options['genome_fai']},
        work_dir, docker=False)

    for name in ('genome.fa', 'genome.fa.fai'):
        input_files[name] = untargz(input_files[name + '.tar.gz'], work_dir)
    input_files = {name: docker_path(location) for name, location in input_files.items()}

    radia_output = work_dir + '/radia_' + chrom + '.vcf'
    radia_log = work_dir + '/radia_' + chrom + '_radia.log'
    # The same genome fasta is supplied for every fasta argument that is passed to radia
    genome_fasta = input_files['genome.fa']
    parameters = [univ_options['patient'],  # shortID
                  chrom,
                  '-n', input_files['normal.bam'],
                  '-t', input_files['tumor.bam'],
                  '-r', input_files['rna.bam'],
                  '--rnaTumorFasta=' + genome_fasta,
                  '-f', genome_fasta,
                  '-o', docker_path(radia_output),
                  '-i', 'hg19_M_rCRS',
                  '-m', genome_fasta,
                  '-d', '*****@*****.**',
                  '-q', 'Illumina',
                  '--disease', 'CANCER',
                  '-l', 'INFO',
                  '-g', docker_path(radia_log)]
    docker_call(tool='radia', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])
    output_file = job.fileStore.writeGlobalFile(radia_output)
    return output_file
Example #34
0
def run_mutect_perchrom(job, tumor_bam, normal_bam, univ_options, mutect_options, chrom):
    """
    This module will run mutect (1.1.7) on one chromosome of the DNA bams

    ARGUMENTS
    1. tumor_bam: REFER ARGUMENTS of spawn_mutect()
    2. normal_bam: REFER ARGUMENTS of spawn_mutect()
    3. univ_options: REFER ARGUMENTS of spawn_mutect()
    4. mutect_options: REFER ARGUMENTS of spawn_mutect()
    5. chrom: String containing chromosome name with chr appended

    RETURN VALUES
    1. output_file: <JSid> of the vcf produced for the chromosome (CHROM.vcf).  The vcf is also
                    handed to export_results under 'mutations/mutect'.  CHROM.out is requested
                    from mutect in the working directory but is not stored.

    This module corresponds to node 12 on the tree
    """
    job.fileStore.logToMaster('Running mutect on %s:%s' % (univ_options['patient'], chrom))
    work_dir = os.getcwd()
    input_files = get_files_from_filestore(
        job,
        {'tumor.bam': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
         'tumor.bam.bai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
         'normal.bam': normal_bam['normal_dna_fix_pg_sorted.bam'],
         'normal.bam.bai': normal_bam['normal_dna_fix_pg_sorted.bam.bai'],
         'genome.fa.tar.gz': mutect_options['genome_fasta'],
         'genome.fa.fai.tar.gz': mutect_options['genome_fai'],
         'genome.dict.tar.gz': mutect_options['genome_dict'],
         'cosmic.vcf.tar.gz': mutect_options['cosmic_vcf'],
         'cosmic.vcf.idx.tar.gz': mutect_options['cosmic_idx'],
         'dbsnp.vcf.gz': mutect_options['dbsnp_vcf'],
         'dbsnp.vcf.idx.tar.gz': mutect_options['dbsnp_idx']},
        work_dir, docker=False)
    # dbsnp.vcf should be bgzipped, but all others should be tar.gz'd
    input_files['dbsnp.vcf'] = gunzip(input_files['dbsnp.vcf.gz'])
    tarballed = ('genome.fa', 'genome.fa.fai', 'genome.dict', 'cosmic.vcf', 'cosmic.vcf.idx',
                 'dbsnp.vcf.idx')
    for name in tarballed:
        input_files[name] = untargz(input_files[name + '.tar.gz'], work_dir)
    input_files = {name: docker_path(location) for name, location in input_files.items()}

    mutout = work_dir + '/' + chrom + '.out'
    mutvcf = work_dir + '/' + chrom + '.vcf'
    parameters = [
        '-R', input_files['genome.fa'],
        '--cosmic', input_files['cosmic.vcf'],
        '--dbsnp', input_files['dbsnp.vcf'],
        '--input_file:normal', input_files['normal.bam'],
        '--input_file:tumor', input_files['tumor.bam'],
        # '--tumor_lod', str(10),
        # '--initial_tumor_lod', str(4.0),
        '-L', chrom,
        '--out', docker_path(mutout),
        '--vcf', docker_path(mutvcf),
    ]
    # The full parameter list is echoed to stderr before the tool is run
    print(parameters, file=sys.stderr)
    # The tool specific heap setting wins; a falsy value falls back to the universal one
    java_xmx = mutect_options['java_Xmx'] or univ_options['java_Xmx']
    docker_call(tool='mutect:1.1.7', tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'], java_opts=java_xmx)
    export_results(job, mutvcf, univ_options, subfolder='mutations/mutect')
    output_file = job.fileStore.writeGlobalFile(mutvcf)
    return output_file
Example #35
0
def run_star(job, fastqs, univ_options, star_options):
    """
    This module uses STAR (or STARlong) to align the RNA fastqs to the reference

    ARGUMENTS
    1. fastqs: REFER RETURN VALUE of run_cutadapt().  The first two items are used as the
               read 1 and read 2 files.
    2. univ_options: Dict of universal arguments used by almost all tools
         univ_options
              |- 'patient': <patient name, used for logging>
              +- 'dockerhub': <dockerhub to use>
    3. star_options: Dict of parameters specific to STAR
         star_options
             |- 'type': 'star' or 'starlong'
             |- 'tool_index': <JSid for the STAR index tarball>
             +- 'n': <number of threads to allocate>
    RETURN VALUES
    1. output_files: Dict of aligned bams
         output_files
             |- 'rnaAligned.toTranscriptome.out.bam': <JSid>
             +- 'rnaAligned.sortedByCoord.out.bam': <JSid>

    This module corresponds to node 9 on the tree
    """
    assert star_options['type'] in ('star', 'starlong')
    job.fileStore.logToMaster('Running STAR on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    input_files = get_files_from_filestore(
        job,
        {'rna_cutadapt_1.fastq': fastqs[0],
         'rna_cutadapt_2.fastq': fastqs[1],
         'star_index.tar.gz': star_options['tool_index']},
        work_dir, docker=False)

    # Gzipped reads get a '.gz' suffixed symlink and a matching '.gz' suffixed entry
    gz = '.gz' if is_gzipfile(input_files['rna_cutadapt_1.fastq']) else ''
    if gz:
        for fastq_name in ('rna_cutadapt_1.fastq', 'rna_cutadapt_2.fastq'):
            os.symlink(fastq_name, fastq_name + gz)
            input_files[fastq_name + gz] = input_files[fastq_name] + gz

    # Untar the index
    input_files['star_index'] = untargz(input_files['star_index.tar.gz'], work_dir)
    input_files = {name: docker_path(location) for name, location in input_files.items()}

    parameters = [
        '--runThreadN', str(star_options['n']),
        '--genomeDir', input_files['star_index'],
        '--outFileNamePrefix', 'rna',
        '--readFilesIn',
        input_files['rna_cutadapt_1.fastq' + gz],
        input_files['rna_cutadapt_2.fastq' + gz],
        '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
        '--outSAMtype', 'BAM', 'SortedByCoordinate',
        '--quantMode', 'TranscriptomeSAM',
    ]
    if gz:
        parameters.extend(['--readFilesCommand', 'zcat'])

    # Anything other than 'star' is run with the 'starlong' tool
    star_tool = 'star' if star_options['type'] == 'star' else 'starlong'
    docker_call(tool=star_tool, tool_parameters=parameters, work_dir=work_dir,
                dockerhub=univ_options['dockerhub'])

    output_files = defaultdict()
    for bam_name in ('rnaAligned.toTranscriptome.out.bam', 'rnaAligned.sortedByCoord.out.bam'):
        output_files[bam_name] = job.fileStore.writeGlobalFile(work_dir + '/' + bam_name)
    return output_files
def assess_car_t_validity(job, gene_expression, univ_options, reports_options):
    """
    Write a report on the clinical trials and literature listed for the CAR-T target genes of the
    patient's tumor type, followed by the trials listed under other cancer types for every target
    gene that is overexpressed in this patient.

    A target gene counts as overexpressed when the patient's TPM (rounded to 2 decimals) is at
    least the GTEX or the TCGA value listed for it in the targets table.  If the tumor type has no
    entry in TCGAToGTEx, or its tissue is absent from the targets table, the report only states
    that no data is available.

    :param toil.fileStore.FileID gene_expression: The rsem gene expression
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict reports_options: Options specific to reporting modules
    :return: The fsID of the report (written locally as car_t_target_report.txt)
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    tumor_type = univ_options['tumor_type']

    input_files = {
        'rsem_quant.tsv': gene_expression,
        'car_t_targets.tsv.tar.gz': reports_options['car_t_targets_file']
    }
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    input_files['car_t_targets.tsv'] = untargz(input_files['car_t_targets.tsv.tar.gz'], work_dir)

    target_data = pd.read_table(input_files['car_t_targets.tsv'], index_col=0)
    # read_csv accepts only one of `sep` and `delimiter`; the rsem output is tab separated.
    patient_df = pd.read_csv(input_files['rsem_quant.tsv'],
                             sep='\t',
                             header='infer',
                             index_col=0)
    # Drop everything from the first '.' onwards (the version suffix of the gene id) so the ids
    # can be matched against the ENSG column of the targets table.  Done with plain string
    # methods so the result does not depend on pandas' regex defaults.
    patient_df.index = [str(gene_id).split('.')[0] for gene_id in patient_df.index]

    tumor_row = '\t{:10}{:<10}{:<10}{:<10}{:<40}{:<12}'
    other_row = '\t{:10}{:<10}{:<10}{:<10}{:<40}{:<17}{:<20}'

    def observed_tpm(gene):
        """Return the patient's TPM for gene formatted to 2 decimals, or 'NA' if it is absent."""
        if gene in patient_df.index:
            return '{0:.2f}'.format(float(patient_df.loc[gene, 'TPM']))
        return 'NA'

    def observed_sort_key(entry):
        """Order report rows by observed TPM; with reverse=True the 'NA' rows end up last."""
        observed = entry[3]
        if observed == 'NA':
            return (False, 0.0)
        return (True, float(observed))

    overexpressed = set()
    # Check if the tumor has a corresponding normal
    try:
        tissue_of_origin = TCGAToGTEx[tumor_type]
    except KeyError:
        tissue_of_origin = 'NA'

    # Write the report
    with open('car_t_target_report.txt', 'w') as car_t_report:
        if tissue_of_origin in target_data.index:
            print('Available clinical trials for ' +
                  tissue_of_origin.lower() +
                  ' cancer with GTEX and TCGA median values',
                  file=car_t_report)
            print((tumor_row + '\n').format('Gene', 'GTEX', 'TCGA N', 'Observed',
                                            'DOI for gene papers', 'Clinical Trials'),
                  file=car_t_report)

            tumor_entries = []
            # Get the gene name, GTEX, TCGA, and observed values
            for index, row in target_data.iterrows():
                if index != tissue_of_origin:
                    continue
                gene = row['ENSG']
                gtex = '{0:.2f}'.format(float(row['GTEX']))
                tcga = '{0:.2f}'.format(float(row['TCGA']))
                observed = observed_tpm(gene)
                tumor_entries.append([row['TARGET'].upper(), gtex, tcga, observed, row['DOI'],
                                      row['Clinical trials']])
                if observed != 'NA' and (float(gtex) <= float(observed) or
                                         float(tcga) <= float(observed)):
                    overexpressed.add(gene)

            # Highest observed expression first.  Genes missing from the patient's expression
            # file carry 'NA', which cannot be converted to a float, hence the dedicated key.
            tumor_entries.sort(key=observed_sort_key, reverse=True)
            for entry in tumor_entries:
                print(tumor_row.format(*entry), file=car_t_report)

            print(
                '\nBased on the genes overexpressed in this cancer type, here\'s a list of clinical '
                'trials for other types of cancer',
                file=car_t_report)
            if overexpressed:
                # Check if there are other clinical trials for other cancer types
                print((other_row + '\n').format('Gene', 'GTEX', 'TCGA N', 'Observed',
                                                'DOI for gene papers', 'Clinical Trials',
                                                'Cancer'),
                      file=car_t_report)
                other_trials = []
                for index, row in target_data.iterrows():
                    gene = row['ENSG']
                    if gene not in overexpressed or index == tissue_of_origin:
                        continue
                    other_trials.append([row['TARGET'].upper(),
                                         '{0:.2f}'.format(float(row['GTEX'])),
                                         '{0:.2f}'.format(float(row['TCGA'])),
                                         observed_tpm(gene),
                                         row['DOI'],
                                         row['Clinical trials'],
                                         index])

                # Alphabetical by target name
                other_trials.sort(key=lambda entry: entry[0])
                for entry in other_trials:
                    print(other_row.format(*entry), file=car_t_report)
            else:
                print("Data not available", file=car_t_report)

        else:
            print('Data not available for ' + tumor_type, file=car_t_report)

    output_file = job.fileStore.writeGlobalFile(car_t_report.name)
    export_results(job,
                   output_file,
                   car_t_report.name,
                   univ_options,
                   subfolder='reports')
    job.fileStore.logToMaster(
        'Ran car t validity assessment on %s successfully' %
        univ_options['patient'])
    return output_file
Example #37
0
def run_transgene(job, snpeffed_file, rna_bam, univ_options, transgene_options,
                  tumor_dna_bam=None, fusion_calls=None):
    """
    Run transgene on a snpeffed vcf and store the peptide fastas it produces for MHC prediction.

    Besides the returned peptide files, the vcf written by transgene is renamed to mutations.vcf
    and exported under mutations/transgened.

    :param toil.fileStore.FileID snpeffed_file: fsID for snpeffed vcf
    :param dict rna_bam: The dict of bams returned by running star
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict transgene_options: Options specific to Transgene
    :param dict tumor_dna_bam: The dict of bams returned by running bwa.  When given, the tumor
           DNA bam is passed to transgene as --dna_file
    :param fusion_calls: Fusion calls to pass to transgene as --fusions.  When truthy, the
           transcript fasta, annotation gtf and genome fasta named in transgene_options are
           staged and passed along as well
    :return: A dictionary of 9 files (9-, 10-, and 15-mer peptides each for Tumor and Normal and the
             corresponding .map files for the 3 Tumor fastas)
             output_files:
                 |- 'transgened_normal_10_mer_snpeffed.faa': fsID
                 |- 'transgened_normal_15_mer_snpeffed.faa': fsID
                 |- 'transgened_normal_9_mer_snpeffed.faa': fsID
                 |- 'transgened_tumor_10_mer_snpeffed.faa': fsID
                 |- 'transgened_tumor_10_mer_snpeffed.faa.map': fsID
                 |- 'transgened_tumor_15_mer_snpeffed.faa': fsID
                 |- 'transgened_tumor_15_mer_snpeffed.faa.map': fsID
                 |- 'transgened_tumor_9_mer_snpeffed.faa': fsID
                 +- 'transgened_tumor_9_mer_snpeffed.faa.map': fsID
    :rtype: dict
    """
    job.fileStore.logToMaster('Running transgene on %s' % univ_options['patient'])
    work_dir = os.getcwd()
    with_dna = tumor_dna_bam is not None

    # Stage the inputs in the working directory
    rna_genome = rna_bam['rna_genome']
    input_files = {
        'snpeffed_muts.vcf': snpeffed_file,
        'rna.bam': rna_genome['rna_genome_sorted.bam'],
        'rna.bam.bai': rna_genome['rna_genome_sorted.bam.bai'],
        'pepts.fa.tar.gz': transgene_options['gencode_peptide_fasta'],
    }
    if with_dna:
        input_files['tumor_dna.bam'] = tumor_dna_bam['tumor_dna_fix_pg_sorted.bam']
        input_files['tumor_dna.bam.bai'] = tumor_dna_bam['tumor_dna_fix_pg_sorted.bam.bai']
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    input_files['pepts.fa'] = untargz(input_files['pepts.fa.tar.gz'], work_dir)
    # The tool is handed paths translated by docker_path
    input_files = {name: docker_path(local) for name, local in input_files.items()}

    parameters = ['--peptides', input_files['pepts.fa'],
                  '--snpeff', input_files['snpeffed_muts.vcf'],
                  '--rna_file', input_files['rna.bam'],
                  '--prefix', 'transgened',
                  '--pep_lens', '9,10,15',
                  '--cores', str(transgene_options['n'])]
    if with_dna:
        parameters += ['--dna_file', input_files['tumor_dna.bam']]

    if fusion_calls:
        fusion_files = get_files_from_filestore(
            job,
            {'fusion_calls': fusion_calls,
             'transcripts.fa.tar.gz': transgene_options['gencode_transcript_fasta'],
             'annotation.gtf.tar.gz': transgene_options['gencode_annotation_gtf'],
             'genome.fa.tar.gz': transgene_options['genome_fasta']},
            work_dir,
            docker=False)
        # Unpack each reference next to its tarball
        for unpacked in ('transcripts.fa', 'genome.fa', 'annotation.gtf'):
            fusion_files[unpacked] = untargz(fusion_files[unpacked + '.tar.gz'], work_dir)
        fusion_files = {name: docker_path(local) for name, local in fusion_files.items()}
        parameters.extend(['--transcripts', fusion_files['transcripts.fa'],
                           '--fusions', fusion_files['fusion_calls'],
                           '--genome', fusion_files['genome.fa'],
                           '--annotation', fusion_files['annotation.gtf']])

    docker_call(tool='transgene',
                tool_parameters=parameters,
                work_dir=work_dir,
                dockerhub=univ_options['dockerhub'],
                tool_version=transgene_options['version'])

    output_files = defaultdict()

    def store_and_export(file_name):
        """Write file_name from work_dir to the file store and export it under peptides."""
        output_files[file_name] = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, file_name))
        export_results(job, output_files[file_name], file_name, univ_options,
                       subfolder='peptides')

    for peplen in ('9', '10', '15'):
        for tissue_type in ('tumor', 'normal'):
            store_and_export('transgened_%s_%s_mer_snpeffed.faa' % (tissue_type, peplen))
        # Only the tumor fastas come with a .map file
        store_and_export('transgened_tumor_%s_mer_snpeffed.faa.map' % peplen)

    os.rename('transgened_transgened.vcf', 'mutations.vcf')
    mutations_id = job.fileStore.writeGlobalFile('mutations.vcf')
    export_results(job, mutations_id, 'mutations.vcf', univ_options,
                   subfolder='mutations/transgened')
    return output_files
Example #38
0
def assess_mhc_genes(job, gene_expression, rna_haplotype, univ_options, reports_options):
    """
    Assess the prevalence of the various genes in the MHC pathway and return a report in the tsv
    format.

    For every single role found in the 'Roles' column of the background table, the report lists
    each gene with that role, the patient's TPM, and the GTEX and TCGA-normal background values
    with a verdict: 'LOW' if the background value is at least the patient's TPM, 'PASS' otherwise,
    and 'NA' if no background value is available.  The 'MHCI loading' and 'MHCII loading'
    sections additionally list the number of alleles called per HLA gene (0 alleles = 'FAIL',
    1 = 'LOW', more = 'PASS', 'NA' without a haplotype file).

    :param toil.fileStore.FileID gene_expression: fsID for the rsem gene expression file
    :param toil.fileStore.FileID|None rna_haplotype: fsID for the RNA PHLAT file
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict reports_options: Options specific to reporting modules
    :return: The fsID for the mhc pathway report file
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    # Names of the background columns to compare against, for both TCGA and GTEX
    tumor_type = univ_options['tumor_type']
    b_types = {
        'tcga': tumor_type + " normal",
        'gtex': TCGAToGTEx[tumor_type] if tumor_type in TCGAToGTEx else "NA"}

    input_files = {
        'rsem_quant.tsv': gene_expression,
        'mhc_pathways.tsv.tar.gz': reports_options['mhc_pathways_file']}
    if rna_haplotype is not None:
        input_files['rna_haplotype.sum'] = rna_haplotype
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)

    input_files['mhc_pathways.tsv'] = untargz(input_files['mhc_pathways.tsv.tar.gz'], work_dir)

    # Read the background file
    background_df = pd.read_table(input_files['mhc_pathways.tsv'], index_col=0, header=0)

    # Parse the rna phlat file
    mhc_alleles = None
    if rna_haplotype is not None:
        with open(input_files['rna_haplotype.sum']) as rna_mhc:
            mhc_alleles = {'HLA_A': [], 'HLA_B': [], 'HLA_C': [], 'HLA_DPA': [], 'HLA_DQA': [],
                           'HLA_DPB': [], 'HLA_DQB': [], 'HLA_DRB': []}
            mhc_alleles = parse_phlat_file(rna_mhc, mhc_alleles)

    # Read the patient gene values into a dictionary keyed by the gene id without its version
    # suffix.  A Counter is used so that genes missing from the file read as 0.
    gene_expressions = pd.read_table(input_files['rsem_quant.tsv'], index_col=0, header=0)
    gene_expressions = Counter({x.split('.')[0]: y
                                for x, y in gene_expressions['TPM'].to_dict().items()})

    row_format = "{:12}{:<12}{:<17}{:<12}{:<20}{:<17}"
    # HLA genes reported in the two loading sections
    hla_genes_by_role = {'MHCI loading': ('HLA_A', 'HLA_B', 'HLA_C'),
                         'MHCII loading': ('HLA_DQA', 'HLA_DQB', 'HLA_DRB')}

    # Print the report.  Entries holding several comma separated roles are not sections of their
    # own; their genes are picked up by the substring match below.  The roles are sorted so the
    # section order does not depend on set iteration order.
    roles = sorted({x for x in background_df['Roles'].values if ',' not in x})
    with open('mhc_pathway_report.txt', 'w') as mpr:
        for role in roles:
            # Literal substring match: a role name is not a regular expression
            role_df = background_df[background_df['Roles'].str.contains(role, regex=False)]
            print(role.center(90, ' '), file=mpr)
            print((row_format + "\n").format('Gene', 'Observed', 'Threshold_GTEX', 'Result',
                                             'Threshold_TCGA_N', 'Result'),
                  file=mpr)

            for hla_gene in hla_genes_by_role.get(role, ()):
                if mhc_alleles is not None:
                    num_alleles = len(mhc_alleles[hla_gene])
                    result = ('FAIL' if num_alleles == 0 else
                              'LOW' if num_alleles == 1 else
                              'PASS')
                else:
                    result = num_alleles = 'NA'
                # NOTE(review): the constant 2 lands in the 'Observed' column and the allele
                # count in the 'Threshold_GTEX' column; this looks swapped but is kept as is
                # so the report layout does not change -- confirm the intended order.
                print(row_format.format(hla_gene, 2, num_alleles, result, 2, result), file=mpr)

            for ensg in role_df.index:
                gene_name = background_df.loc[ensg, 'Name']
                observed = float(gene_expressions[ensg])
                b_vals = {}
                for bkg, column in b_types.items():
                    background = role_df.loc[ensg].get(column, default=None)
                    if background is None or pd.isnull(background):
                        # No such column for this tumor type, or an empty cell.  A string cannot
                        # go through the float format below, so report 'NA' for both fields.
                        b_vals[bkg] = 'NA', 'NA'
                        continue
                    val = "{0:.2f}".format(background)
                    b_vals[bkg] = val, ('LOW' if float(val) >= observed else 'PASS')
                print(row_format.format(gene_name, observed,
                                        b_vals['gtex'][0], b_vals['gtex'][1],
                                        b_vals['tcga'][0], b_vals['tcga'][1]),
                      file=mpr)

            print('\n', file=mpr)

    output_file = job.fileStore.writeGlobalFile(mpr.name)
    export_results(job, output_file, mpr.name, univ_options, subfolder='reports')
    job.fileStore.logToMaster('Ran mhc gene assessment on %s successfully'
                              % univ_options['patient'])
    return output_file
Example #39
0
def merge_mhc_peptide_calls(job, antigen_predictions, transgened_files, univ_options):
    """
    This module will merge all the MHCI and MHCII binding prediction calls into one list per MHC
    class, dropping every call whose prediction value is above 5.00, and annotate the calls with
    the peptide information from the transgene fastas and map files.

    This module corresponds to node 19 on the tree

    :param tuple antigen_predictions: (mhci_preds, mhcii_preds).  mhci_preds maps a name to the
           fsID of an MHCI prediction file; mhcii_preds maps a name to a (fsID, predictor) pair
           where predictor is one of 'Consensus', 'Sturniolo' or 'netMHCIIpan'
    :param dict transgened_files: The dict of peptide files returned by running transgene.  The
           10-mer and 15-mer tumor fastas and their .map files are used
    :param dict univ_options: Dict of universal options used by almost all tools
    :return: Dict of fsIDs keyed by 'mhci_merged_files.list' and 'mhcii_merged_files.list'
    :rtype: dict
    :raises RuntimeError: If an MHCII prediction names an unknown predictor
    """
    job.fileStore.logToMaster('Merging MHC calls')
    work_dir = os.getcwd()
    pept_files = {
        '10_mer.faa': transgened_files['transgened_tumor_10_mer_snpeffed.faa'],
        '10_mer.faa.map': transgened_files['transgened_tumor_10_mer_snpeffed.faa.map'],
        '15_mer.faa': transgened_files['transgened_tumor_15_mer_snpeffed.faa'],
        '15_mer.faa.map': transgened_files['transgened_tumor_15_mer_snpeffed.faa.map']}
    mhci_preds, mhcii_preds = antigen_predictions
    mhci_files = get_files_from_filestore(job, mhci_preds, work_dir)
    # First split mhcii_preds into prediction files and predictors and maintain keys so we can later
    # reference them in pairs
    mhcii_predictors = {x: y[1] for x, y in mhcii_preds.items()}
    mhcii_files = {x: y[0] for x, y in mhcii_preds.items()}
    mhcii_files = get_files_from_filestore(job, mhcii_files, work_dir)
    # Get peptide files
    pept_files = get_files_from_filestore(job, pept_files, work_dir)

    # Merge MHCI calls
    # Read 10-mer pepts into memory
    peptides = read_peptide_file(pept_files['10_mer.faa'])
    with open(pept_files['10_mer.faa.map'], 'r') as mapfile:
        pepmap = json.load(mapfile)
    # Incorporate peptide names into the merged calls
    with open('/'.join([work_dir, 'mhci_merged_files.list']), 'w') as mhci_resfile:
        for mhcifile in mhci_files.values():
            with open(mhcifile, 'r') as mf:
                for line in mf:
                    # Skip header lines
                    if not line.startswith('HLA'):
                        continue
                    line = line.strip().split('\t')
                    allele, pept, pred = line[0], line[5], line[7]
                    if float(pred) > 5.00:
                        continue
                    print_mhc_peptide((allele, pept, pred, pept), peptides, pepmap, mhci_resfile)

    # Merge MHCII calls
    # read 15-mer pepts into memory
    peptides = read_peptide_file(pept_files['15_mer.faa'])
    with open(pept_files['15_mer.faa.map'], 'r') as mapfile:
        pepmap = json.load(mapfile)
    # Column holding the core for each method a consensus prediction may be built from, in
    # order of preference
    consensus_core_columns = (('NN', 13), ('netMHCIIpan', 17), ('Sturniolo', 19), ('SMM', 10))
    consensus_prefix = 'Consensus('
    # Incorporate peptide names into the merged calls
    with open('/'.join([work_dir, 'mhcii_merged_files.list']), 'w') as mhcii_resfile:
        for mhciifile, mhcii_path in mhcii_files.items():
            predictor = mhcii_predictors[mhciifile]
            if predictor == 'Consensus':
                # Column number with the core.  Once a line yields one, it is reused for the
                # rest of the file.
                core_col = None
                with open(mhcii_path, 'r') as mf:
                    for line in mf:
                        # Skip header lines
                        if not line.startswith('HLA'):
                            continue
                        line = line.strip().split('\t')
                        allele, pept, pred = line[0], line[4], line[6]
                        if not core_col:
                            methods = line[5]
                            # Remove the wrapper as a prefix.  str.lstrip() takes a set of
                            # characters and would also eat the start of the first method
                            # name (e.g. 'netMHCIIpan' -> 'tMHCIIpan').
                            if methods.startswith(consensus_prefix):
                                methods = methods[len(consensus_prefix):]
                            methods = methods.rstrip(')').split(',')
                            for method, column in consensus_core_columns:
                                if method in methods:
                                    core_col = column
                                    break
                        core = line[core_col] if core_col else 'NOCORE'
                        if float(pred) > 5.00:
                            continue
                        print_mhc_peptide((allele, pept, pred, core), peptides, pepmap,
                                          mhcii_resfile)
            elif predictor == 'Sturniolo':
                with open(mhcii_path, 'r') as mf:
                    for line in mf:
                        # Skip header lines
                        if not line.startswith('HLA'):
                            continue
                        line = line.strip().split('\t')
                        allele, pept, pred, core = line[0], line[5], line[6], line[19]
                        if float(pred) > 5.00:
                            continue
                        print_mhc_peptide((allele, pept, pred, core), peptides, pepmap,
                                          mhcii_resfile)
            elif predictor == 'netMHCIIpan':
                with open(mhcii_path, 'r') as mf:
                    # Get the allele from the first line and skip the second line
                    allele = re.sub('-DQB', '/DQB', mf.readline().strip())
                    _ = mf.readline()
                    for line in mf:
                        line = line.strip().split('\t')
                        pept, peptide_name, pred = line[1], line[2], line[5]
                        core = 'NOCORE'
                        if float(pred) > 5.00:
                            continue
                        # These rows are written directly instead of via print_mhc_peptide
                        print(allele, pept, peptide_name, core, '0', pred, pepmap[peptide_name],
                              sep='\t', file=mhcii_resfile)
            else:
                raise RuntimeError('Shouldn\'t ever see this!!!')

    output_files = defaultdict()
    for mhc_file in [mhci_resfile.name, mhcii_resfile.name]:
        output_files[os.path.split(mhc_file)[1]] = job.fileStore.writeGlobalFile(mhc_file)
        # NOTE(review): export_results is called here with the local path and no separate file
        # name -- confirm this matches the export_results signature in use.
        export_results(job, mhc_file, univ_options, subfolder='binding_predictions')
    return output_files
Example #40
0
def reformat_star_fusion_output(job, fusion_annot, fusion_file, transcript_file,
                                transcript_gff_file, univ_options):
    """
    Convert the STAR-Fusion predictions into a Transgene BEDPE file.

    One row is written per fusion record and 5' junction sequence.  The junction sequences are
    only available when both the transcript FASTA and GFF files are given; a fusion without any
    sequence gets a single row with '.' for both junction sequences.

    :param toil.fileStore.FileID fusion_annot: Fusion annotation
    :param toil.fileStore.FileID fusion_file: STAR-fusion prediction file
    :param toil.fileStore.FileID transcript_file: Fusion transcript FASTA file
    :param toil.fileStore.FileID transcript_gff_file: Fusion transcript GFF file
    :param dict univ_options: universal arguments used by almost all tools
    :return: Transgene BEDPE file
    :rtype: toil.fileStore.FileID
    """
    have_transcripts = bool(transcript_file and transcript_gff_file)

    input_files = {'results.tsv': fusion_file,
                   'fusion.bed': fusion_annot}
    if have_transcripts:
        input_files['transcripts.fa'] = transcript_file
        input_files['transcripts.gff'] = transcript_gff_file

    work_dir = job.fileStore.getLocalTempDir()
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)

    # Gene ids used to build the name column
    hugo_to_gene_ids = get_gene_ids(input_files['fusion.bed'])

    if have_transcripts:
        # Pull in the assembled transcripts and split them into their 5' and 3' parts
        five_pr_splits, three_pr_splits = split_fusion_transcript(
            input_files['transcripts.gff'],
            get_transcripts(input_files['transcripts.fa']))
    else:
        five_pr_splits = collections.defaultdict(dict)
        three_pr_splits = collections.defaultdict(dict)

    # Header for BEDPE file
    header = ('# chr1', 'start1', 'end1',
              'chr2', 'start2', 'end2',
              'name', 'score',
              'strand1', 'strand2',
              'junctionSeq1', 'junctionSeq2',
              'hugo1', 'hugo2')

    output_path = os.path.join(work_dir, 'fusion_results.bedpe')
    with open(input_files['results.tsv'], 'r') as in_f, open(output_path, 'w') as out_f:
        writer = csv.writer(out_f, delimiter='\t')
        writer.writerow(header)
        for record in parse_star_fusion(in_f):
            left_chr, left_break, left_strand = record.LeftBreakpoint.split(':')
            right_chr, right_break, right_strand = record.RightBreakpoint.split(':')

            left_gene, right_gene = record.LeftGene, record.RightGene
            fusion = ''.join((left_gene, '--', right_gene))
            name = '-'.join((hugo_to_gene_ids[left_gene], hugo_to_gene_ids[right_gene]))
            score = 'Junction:%s-Spanning:%s' % (record.JunctionReadCount, record.SpanningFragCount)

            # Add empty sequences in case Trinity doesn't output one
            five_prime_seqs = five_pr_splits[fusion]
            if not five_prime_seqs:
                five_prime_seqs['N/A'] = '.'
            three_prime_seqs = three_pr_splits[fusion]
            if not three_prime_seqs:
                three_prime_seqs['N/A'] = '.'

            # NOTE(review): the 3' lookup presumes both tables hold the same transcript ids for
            # a fusion -- confirm against split_fusion_transcript.
            for transcript_id, five_prime_seq in five_prime_seqs.items():
                three_prime_seq = three_prime_seqs[transcript_id]
                writer.writerow((left_chr,
                                 '.',  # Donor start position is not necessary
                                 left_break,
                                 right_chr,
                                 right_break,
                                 '.',  # Acceptor end position is not necessary
                                 name,
                                 score,
                                 left_strand,
                                 right_strand,
                                 five_prime_seq,
                                 three_prime_seq,
                                 left_gene,
                                 right_gene))

    bedpe_id = job.fileStore.writeGlobalFile(output_path)
    export_results(job, bedpe_id, 'fusion.bedpe', univ_options, subfolder='mutations/fusions')

    job.fileStore.logToMaster('Reformatted STAR-Fusion output for %s successfully' % univ_options['patient'])
    return bedpe_id
Example #41
0
def run_fusion(job,
               fastqs,
               junction_file,
               univ_options,
               star_fusion_options,
               fusion_inspector_options):
    """
    Runs STAR-Fusion and filters fusion calls using FusionInspector

    STAR-Fusion predictions and FusionInspector predictions are both exported via
    export_results. FusionInspector calls are then filtered on large anchor support and
    FFPM, and if any call survives, a child job (reformat_star_fusion_output) is scheduled.

    :param tuple fastqs: RNA-Seq FASTQ Filestore IDs
    :param toil.fileStore.FileID junction_file: Chimeric junction file. If falsy, STAR-Fusion
           is run directly from the fastqs.
    :param dict univ_options: universal arguments used by almost all tools (the 'dockerhub'
           and 'patient' keys are read here)
    :param dict star_fusion_options: STAR-Fusion specific parameters (the 'index', 'n' and
           'version' keys are read here)
    :param dict fusion_inspector_options: FusionInspector specific parameters (the
           'run_trinity' and 'version' keys are read here)
    :return: Transgene BEDPE file (as the promised return value of the
             reformat_star_fusion_output child job), or None if no fusion was predicted or
             none passed the filters
    :rtype: toil.fileStore.FileID|None
    """
    work_dir = job.fileStore.getLocalTempDir()

    input_files = {'rna_1.fq.gz': fastqs[0],
                   'rna_2.fq.gz': fastqs[1],
                   'tool_index.tar.gz': star_fusion_options['index']}

    parameters = []

    # Use the chimeric junction file if one was provided; otherwise run STAR-Fusion
    # directly from the fastq files.
    if junction_file:
        input_files['STAR.junction'] = junction_file
        parameters.extend(['--chimeric_junction', '/data/STAR.junction'])

    else:
        parameters.extend(['--left_fq', '/data/rna_1.fq.gz', '--right_fq', '/data/rna_2.fq.gz'])

    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
    input_files['tool_index'] = os.path.basename(untargz(input_files['tool_index.tar.gz'],
                                                         work_dir))

    cores = star_fusion_options['n']
    parameters.extend(['--output_dir', '/data/fusion-output',
                       '--genome_lib_dir', input_files['tool_index'],
                       '--CPU', str(cores)])

    docker_call(tool='star-fusion',
                tool_parameters=parameters,
                work_dir=work_dir,
                dockerhub=univ_options['dockerhub'],
                tool_version=star_fusion_options['version'])

    star_output = 'fusion-output/star-fusion.fusion_candidates.final.abridged'
    fusion_path = os.path.join(work_dir, star_output)

    # Export the STAR-Fusion predictions
    export_results(job,
                   job.fileStore.writeGlobalFile(fusion_path),
                   'star-fusion-predictions.tsv',
                   univ_options, subfolder='mutations/fusions')

    # Check for fusion prediction.  The builtin next() with a default is used instead of the
    # Python 2-only f.next() so that an empty file (no header at all) is treated the same as
    # a header-only file rather than raising an uncaught StopIteration.
    with open(fusion_path, 'r') as f:
        next(f, None)  # Skip header
        has_prediction = next(f, None) is not None

    if not has_prediction:
        logging.warning('%s: Did not find any fusions!', univ_options['patient'])
        return

    parameters = ['--fusions', '/data/%s' % star_output,
                  '--genome_lib', input_files['tool_index'],
                  '--left_fq', '/data/rna_1.fq.gz',
                  '--right_fq', '/data/rna_2.fq.gz',
                  '--out_dir', '/data/FusionInspector',
                  '--out_prefix', 'FusionInspector',
                  '--CPU', str(cores)]

    if fusion_inspector_options['run_trinity']:
        parameters.append('--include_Trinity')

    docker_call(tool='fusion-inspector',
                tool_parameters=parameters,
                work_dir=work_dir,
                dockerhub=univ_options['dockerhub'],
                tool_version=fusion_inspector_options['version'])

    found_fusion = False
    inspector_output = 'FusionInspector/FusionInspector.fusion_predictions.final.abridged.FFPM'
    fusion_path = os.path.join(work_dir, inspector_output)
    output_path = os.path.join(work_dir, 'fusion.final')

    if os.path.exists(fusion_path):
        # Export the FusionInspector predictions.  This is done inside the existence check
        # because writing a missing file to the file store would fail before the check
        # could have any effect.
        export_results(job,
                       job.fileStore.writeGlobalFile(fusion_path),
                       'fusion-inspector-predictions.tsv',
                       univ_options, subfolder='mutations/fusions')

        # Keep only fusions with large anchor support and a combined FFPM (fusion fragments
        # per million reads) greater than 0.1
        with open(fusion_path, 'r') as f, open(output_path, 'w') as g:
            # Copy the header through (nothing is written if the file is empty)
            g.write(next(f, ''))
            for line in f:
                fields = line.strip().split()

                # Blank lines carry no prediction
                if not fields:
                    continue

                # Check for a large anchor support
                ldas = fields[10]

                assert ldas in {'YES', 'NO'}, 'FusionInpsector file is malformed!'

                # The last two columns are read as the junction and spanning FFPM values
                j_ffpm, s_ffpm = fields[-2:]

                # Fusions without a larger anchor support or low read support
                # are suspicious and should not be considered for further analysis
                if ldas == 'YES' and float(j_ffpm) + float(s_ffpm) > 0.1:
                    found_fusion = True
                    g.write(line)

    if found_fusion:
        fusion_bed_f = 'FusionInspector/FusionInspector.bed'
        fusion_bed_path = os.path.join(work_dir, fusion_bed_f)
        transcript_f = 'FusionInspector/FusionInspector.gmap_trinity_GG.fusions.fasta'
        transcript_path = os.path.join(work_dir, transcript_f)
        transcript_gff_f = 'FusionInspector/FusionInspector.gmap_trinity_GG.fusions.gff3'
        transcript_gff_path = os.path.join(work_dir, transcript_gff_f)

        # These two outputs are optional: they stay None when the corresponding file was
        # not produced, and None is what gets passed on to the child job.
        transcripts = None
        transcript_annotation = None

        if os.path.exists(transcript_path):
            transcripts = job.fileStore.writeGlobalFile(transcript_path)
            export_results(job,
                           transcripts,
                           transcript_path,
                           univ_options,
                           subfolder='mutations/fusions')

        if os.path.exists(transcript_gff_path):
            transcript_annotation = job.fileStore.writeGlobalFile(transcript_gff_path)
            export_results(job,
                           transcript_annotation,
                           transcript_gff_path,
                           univ_options,
                           subfolder='mutations/fusions')

        fusion_annotation = job.fileStore.writeGlobalFile(fusion_bed_path)
        filtered_fusions = job.fileStore.writeGlobalFile(output_path)

        export_results(job,
                       filtered_fusions,
                       output_path,
                       univ_options,
                       subfolder='mutations/fusions')

        job.fileStore.logToMaster('Ran STAR-Fusion on %s successfully' % univ_options['patient'])
        return job.addChildJobFn(reformat_star_fusion_output,
                                 fusion_annotation,
                                 filtered_fusions,
                                 transcripts,
                                 transcript_annotation,
                                 univ_options).rv()

    else:
        job.fileStore.logToMaster('No fusions detected for %s' % univ_options['patient'])