Example #1
def align_rna(job, fastqs, univ_options, star_options):
    """
    A wrapper for the entire RNA alignment subgraph.

    :param list fastqs: The input fastqs for alignment
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict star_options: Options specific to star
    :return: Dict containing input bam and the generated index (.bam.bai)
    :rtype: dict
    """
    star = job.wrapJobFn(run_star,
                         fastqs,
                         univ_options,
                         star_options,
                         cores=star_options['n'],
                         memory=PromisedRequirement(
                             lambda x: int(1.85 * x.size),
                             star_options['index']),
                         disk=PromisedRequirement(star_disk, fastqs,
                                                  star_options['index']))
    s_and_i = job.wrapJobFn(sort_and_index_star, star.rv(), univ_options,
                            star_options).encapsulate()
    job.addChild(star)
    star.addChild(s_and_i)
    return s_and_i.rv()
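
PromisedRequirement wraps either a static value or a function plus arguments, any of which may be promises (.rv() values); Toil evaluates the requirement only after those promises resolve, so a job's resources can depend on an upstream job's output. A minimal, runnable sketch of the pattern above (producer/consumer are illustrative names, not ProTECT functions):

from toil.common import Toil
from toil.job import Job, PromisedRequirement

def producer(job):
    # Stand-in for run_star: a value only known at runtime,
    # e.g. the size of a generated file.
    return 1024

def consumer(job, value):
    # By the time this job is scheduled, its disk requirement has been
    # computed as 2 * producer's return value.
    job.fileStore.logToMaster('running with disk=%s' % job.disk)

def root(job):
    p = job.wrapJobFn(producer)
    c = job.wrapJobFn(consumer, p.rv(),
                      disk=PromisedRequirement(lambda x: 2 * x, p.rv()))
    job.addChild(p)
    p.addChild(c)

if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./jobstore')
    with Toil(options) as toil:
        toil.start(Job.wrapJobFn(root))
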
Example #2
def maxConcurrency(job, cpuCount, filename, coresPerJob):
    """
    Returns the max number of concurrent tasks when using a PromisedRequirement instance
    to allocate the number of cores per job.

    :param int cpuCount: number of available cpus
    :param str filename: path to counter file
    :param int coresPerJob: number of cores assigned to each job
    :return: list of promises for the measured concurrency values
    :rtype: list
    """
    one = job.addChildFn(getOne, cores=0.1, memory='32M', disk='1M')
    thirtyTwoMb = job.addChildFn(getThirtyTwoMb,
                                 cores=0.1,
                                 memory='32M',
                                 disk='1M')

    values = []
    for _ in range(cpuCount):
        value = job.addFollowOnFn(batchSystemTest.measureConcurrency,
                                  filename,
                                  cores=PromisedRequirement(
                                      lambda x: x * coresPerJob, one.rv()),
                                  memory=PromisedRequirement(thirtyTwoMb.rv()),
                                  disk='1M').rv()
        values.append(value)
    return values
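
getOne and getThirtyTwoMb are helpers defined elsewhere in the Toil test suite. Plausible definitions consistent with the call sites above (a sketch; note that functions added via addChildFn are plain functions and receive no job argument):

def getOne():
    # Promised above as the multiplier for cores.
    return 1

def getThirtyTwoMb():
    # Promised above as the memory requirement.
    return '32M'
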
Example #3
def wrap_fusion(job, fastqs, star_output, univ_options, star_fusion_options,
                fusion_inspector_options):
    """
    A wrapper for run_fusion using the results from cutadapt and star as input.

    :param tuple fastqs: RNA-Seq FASTQ Filestore IDs
    :param dict star_output: Dictionary containing STAR output files
    :param dict univ_options: universal arguments used by almost all tools
    :param dict star_fusion_options: STAR-Fusion specific parameters
    :param dict fusion_inspector_options: FusionInspector specific parameters
    :return: Transgene BEDPE file
    :rtype: toil.fileStore.FileID
    """
    # Give user option to skip fusion calling
    if not star_fusion_options['run']:
        job.fileStore.logToMaster('Skipping STAR-Fusion on %s' %
                                  univ_options['patient'])
        return

    fusion = job.wrapJobFn(
        run_fusion,
        fastqs,
        star_output['rnaChimeric.out.junction'],
        univ_options,
        star_fusion_options,
        fusion_inspector_options,
        cores=star_fusion_options['n'],
        memory=PromisedRequirement(lambda x: int(1.85 * x.size),
                                   star_fusion_options['index']),
        disk=PromisedRequirement(fusion_disk, fastqs,
                                 star_fusion_options['index'])).encapsulate()
    job.addChild(fusion)
    return fusion.rv()
Example #4
        def testConcurrencyStatic(self):
            """
            Asserts that promised core resources are allocated properly using a static DAG
            """
            for coresPerJob in self.allocatedCores:
                tempDir = self._createTempDir('testFiles')
                counterPath = self.getCounterPath(tempDir)

                root = Job()
                one = Job.wrapFn(getOne, cores=0.1, memory='32M', disk='1M')
                thirtyTwoMb = Job.wrapFn(getThirtyTwoMb,
                                         cores=0.1,
                                         memory='32M',
                                         disk='1M')
                root.addChild(one)
                root.addChild(thirtyTwoMb)
                for _ in range(self.cpuCount):
                    root.addFollowOn(
                        Job.wrapFn(batchSystemTest.measureConcurrency,
                                   counterPath,
                                   cores=PromisedRequirement(
                                       lambda x: x * coresPerJob, one.rv()),
                                   memory=PromisedRequirement(
                                       thirtyTwoMb.rv()),
                                   disk='1M'))
                Job.Runner.startToil(root, self.getOptions(tempDir))
                _, maxValue = batchSystemTest.getCounters(counterPath)
                self.assertEqual(maxValue, old_div(self.cpuCount, coresPerJob))
Example #5
 def testPromiseRequirementRaceStatic(self):
     """
     Checks for a race condition when using promised requirements and child job functions.
     """
     A = Job.wrapJobFn(logDiskUsage, 'A', sleep=5, disk=PromisedRequirement(1024))
     B = Job.wrapJobFn(logDiskUsage, 'B', disk=PromisedRequirement(lambda x: x + 1024, A.rv()))
     A.addChild(B)
     Job.Runner.startToil(A, self.getOptions(self._createTempDir('testFiles')))
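
logDiskUsage is a helper from the same test module. A hypothetical reconstruction that fits both call sites (B's lambda adds 1024 to A's return value, so the helper must return a number):

import time

def logDiskUsage(job, jobName, sleep=0):
    # Log the disk allotted to this job and return it so downstream
    # promises can build on it; sleeping widens the window in which the
    # race under test could occur.
    disk = job.disk
    job.fileStore.logToMaster('%s: %s bytes of disk' % (jobName, disk))
    time.sleep(sleep)
    return disk
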
Example #6
def run_somaticsniper(job, tumor_bam, normal_bam, univ_options, somaticsniper_options, split=True):
    """
    Run the SomaticSniper subgraph on the DNA bams.  Optionally split the results into
    per-chromosome vcfs.

    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict somaticsniper_options: Options specific to SomaticSniper
    :param bool split: Should the results be split into perchrom vcfs?
    :return: Either the fsID to the genome-level vcf or a dict of results from running SomaticSniper
             on every chromosome
             perchrom_somaticsniper:
                 |- 'chr1': fsID
                 |- 'chr2': fsID
                 |
                 |-...
                 |
                 +- 'chrM': fsID
    :rtype: toil.fileStore.FileID|dict
    """
    # Get a list of chromosomes to handle
    if somaticsniper_options['chromosomes']:
        chromosomes = somaticsniper_options['chromosomes']
    else:
        chromosomes = sample_chromosomes(job, somaticsniper_options['genome_fai'])
    perchrom_somaticsniper = defaultdict()
    snipe = job.wrapJobFn(run_somaticsniper_full, tumor_bam, normal_bam, univ_options,
                          somaticsniper_options,
                          disk=PromisedRequirement(sniper_disk,
                                                   tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                                   normal_bam['normal_dna_fix_pg_sorted.bam'],
                                                   somaticsniper_options['genome_fasta']),
                          memory='6G')
    pileup = job.wrapJobFn(run_pileup, tumor_bam, univ_options, somaticsniper_options,
                           disk=PromisedRequirement(pileup_disk,
                                                    tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                                    somaticsniper_options['genome_fasta']),
                           memory='6G')
    filtersnipes = job.wrapJobFn(filter_somaticsniper, tumor_bam, snipe.rv(), pileup.rv(),
                                 univ_options, somaticsniper_options,
                                 disk=PromisedRequirement(sniper_filter_disk,
                                                          tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                                          somaticsniper_options['genome_fasta']),
                                 memory='6G')

    job.addChild(snipe)
    job.addChild(pileup)
    snipe.addChild(filtersnipes)
    pileup.addChild(filtersnipes)
    if split:
        unmerge_snipes = job.wrapJobFn(unmerge, filtersnipes.rv(), 'somaticsniper', chromosomes,
                                       somaticsniper_options, univ_options)
        filtersnipes.addChild(unmerge_snipes)
        return unmerge_snipes.rv()
    else:
        return filtersnipes.rv()
Example #7
def run_muse(job, tumor_bam, normal_bam, univ_options, muse_options):
    """
    Spawn a MuSE job for each chromosome on the DNA bams.

    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict muse_options: Options specific to MuSE
    :return: Dict of results from running MuSE on every chromosome
             perchrom_muse:
                 |- 'chr1': fsID
                 |- 'chr2': fsID
                 |
                 |-...
                 |
                 +- 'chrM': fsID
    :rtype: dict
    """
    # Get a list of chromosomes to handle
    if muse_options['chromosomes']:
        chromosomes = muse_options['chromosomes']
    else:
        chromosomes = sample_chromosomes(job, muse_options['genome_fai'])
    perchrom_muse = defaultdict()
    for chrom in chromosomes:
        call = job.addChildJobFn(
            run_muse_perchrom,
            tumor_bam,
            normal_bam,
            univ_options,
            muse_options,
            chrom,
            disk=PromisedRequirement(
                muse_disk, tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                normal_bam['normal_dna_fix_pg_sorted.bam'],
                muse_options['genome_fasta']),
            memory='6G')
        sump = call.addChildJobFn(run_muse_sump_perchrom,
                                  call.rv(),
                                  univ_options,
                                  muse_options,
                                  chrom,
                                  disk=PromisedRequirement(
                                      muse_sump_disk,
                                      muse_options['dbsnp_vcf']),
                                  memory='6G')
        perchrom_muse[chrom] = sump.rv()
    return perchrom_muse
Example #8
def download_and_process_tar(job, config):
    """
    Download a tarball containing fastq(s) and process it.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Expando config: Dict-like object containing workflow options as attributes
    :return: Processed fastqs
    :rtype: tuple(str, str)
    """
    # Define download and process jobs
    disk = '2G' if config.ci_test else config.max_sample_size
    download = job.wrapJobFn(download_url_job,
                             config.url,
                             s3_key_path=config.ssec,
                             disk=disk)
    process = job.wrapJobFn(process_sample,
                            config,
                            input_tar=download.rv(),
                            disk=PromisedRequirement(lambda x: x.size * 10,
                                                     download.rv()))

    # Wire jobs and return processed fastqs
    job.addChild(download)
    download.addChild(process)
    return process.rv()
Example #9
def run_strelka(job,
                tumor_bam,
                normal_bam,
                univ_options,
                strelka_options,
                split=True):
    """
    Run the strelka subgraph on the DNA bams.  Optionally split the results into per-chromosome
    vcfs.

    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict strelka_options: Options specific to strelka
    :param bool split: Should the results be split into perchrom vcfs?
    :return: Either the fsID to the genome-level vcf or a dict of results from running strelka
             on every chromosome
             perchrom_strelka:
                 |- 'chr1':
                 |      |-'snvs': fsID
                 |      +-'indels': fsID
                 |- 'chr2':
                 |      |-'snvs': fsID
                 |      +-'indels': fsID
                 |-...
                 |
                 +- 'chrM':
                        |-'snvs': fsID
                        +-'indels': fsID
    :rtype: toil.fileStore.FileID|dict
    """
    if strelka_options['chromosomes']:
        chromosomes = strelka_options['chromosomes']
    else:
        chromosomes = sample_chromosomes(job, strelka_options['genome_fai'])
    num_cores = min(len(chromosomes), univ_options['max_cores'])
    strelka = job.wrapJobFn(run_strelka_full,
                            tumor_bam,
                            normal_bam,
                            univ_options,
                            strelka_options,
                            disk=PromisedRequirement(
                                strelka_disk,
                                tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                normal_bam['normal_dna_fix_pg_sorted.bam'],
                                strelka_options['genome_fasta']),
                            memory='6G',
                            cores=num_cores)
    job.addChild(strelka)
    if split:
        unmerge_strelka = job.wrapJobFn(wrap_unmerge, strelka.rv(),
                                        chromosomes, strelka_options,
                                        univ_options).encapsulate()
        strelka.addChild(unmerge_strelka)
        return unmerge_strelka.rv()
    else:
        return strelka.rv()
Example #10
def run_mutect(job, tumor_bam, normal_bam, univ_options, mutect_options):
    """
    This module will spawn a mutect job for each chromosome on the DNA bams.

    ARGUMENTS
    1. tumor_bam: Dict of input tumor WGS/WXS bam + bai
         tumor_bam
              |- 'tumor_fix_pg_sorted.bam': <JSid>
              +- 'tumor_fix_pg_sorted.bam.bai': <JSid>
    2. normal_bam: Dict of input normal WGS/WXS bam + bai
         normal_bam
              |- 'normal_fix_pg_sorted.bam': <JSid>
              +- 'normal_fix_pg_sorted.bam.bai': <JSid>
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                +- 'dockerhub': <dockerhub to use>
    4. mutect_options: Dict of parameters specific to mutect
         mutect_options
              |- 'dbsnp_vcf': <JSid for dbsnp vcf file>
              |- 'dbsnp_idx': <JSid for dbsnp vcf index file>
              |- 'cosmic_vcf': <JSid for cosmic vcf file>
              |- 'cosmic_idx': <JSid for cosmic vcf index file>
              |- 'genome_fasta': <JSid for genome fasta file>
              |- 'genome_dict': <JSid for genome fasta dict file>
              +- 'genome_fai': <JSid for genome fasta index file>

    RETURN VALUES
    1. perchrom_mutect: Dict of results of mutect per chromosome
         perchrom_mutect
              |- 'chr1'
              |   |- 'mutect_chr1.vcf': <JSid>
              |   +- 'mutect_chr1.out': <JSid>
              |- 'chr2'
              |   |- 'mutect_chr2.vcf': <JSid>
              |   +- 'mutect_chr2.out': <JSid>
             etc...

    This module corresponds to node 11 on the tree
    """
    # Get a list of chromosomes to handle
    chromosomes = sample_chromosomes(job, mutect_options['genome_fai'])
    perchrom_mutect = defaultdict()
    for chrom in chromosomes:
        perchrom_mutect[chrom] = job.addChildJobFn(
            run_mutect_perchrom,
            tumor_bam,
            normal_bam,
            univ_options,
            mutect_options,
            chrom,
            memory='6G',
            disk=PromisedRequirement(
                mutect_disk, tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                normal_bam['normal_dna_fix_pg_sorted.bam'],
                mutect_options['genome_fasta'], mutect_options['dbsnp_vcf'],
                mutect_options['cosmic_vcf'])).rv()
    return perchrom_mutect
Example #11
def parentJob(job):
    downloadJob = Job.wrapJobFn(stageFn,
                                "file://" + os.path.realpath(__file__),
                                cores=0.1,
                                memory='32M',
                                disk='1M')
    job.addChild(downloadJob)

    analysis = Job.wrapJobFn(analysisJob,
                             fileStoreID=downloadJob.rv(0),
                             disk=PromisedRequirement(downloadJob.rv(1)))
    job.addFollowOn(analysis)
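
This fragment mirrors the promised-requirements example in the Toil documentation. The companion job functions are not shown above; reconstructed here as a sketch (details such as the fileStore method name may differ across Toil versions):

def stageFn(job, url, cores=1):
    # Import the file into the job store and return its ID and size;
    # parentJob consumes these via downloadJob.rv(0) and rv(1).
    importedFile = job.fileStore.importFile(url)
    return importedFile, importedFile.size

def analysisJob(job, fileStoreID, cores=2):
    # The promised disk requirement equals the staged file's size, so
    # reading the whole file locally is safe.
    localPath = job.fileStore.readGlobalFile(fileStoreID)
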
Example #12
def annotate_vcfs(job, vcfs, config):
    """
    Runs Oncotator for a group of VCF files. Each sample is annotated individually.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param dict vcfs: Dictionary of VCF FileStoreIDs {Sample identifier: FileStoreID}
    :param Namespace config: Input parameters and shared FileStoreIDs
        Requires the following config attributes:
        config.oncotator_db         FileStoreID to Oncotator database
        config.suffix               Suffix added to output filename
        config.output_dir           URL or local path to output directory
        config.ssec                 Path to key file for SSE-C encryption
        config.cores                Number of cores for each job
        config.xmx                  Java heap size in bytes
    """
    job.fileStore.logToMaster(
        'Running Oncotator on the following samples:\n%s' %
        '\n'.join(vcfs.keys()))
    for uuid, vcf_id in vcfs.iteritems():
        # The Oncotator disk requirement depends on the input VCF, the Oncotator database
        # and the output VCF. The annotated VCF will be significantly larger than the input VCF.
        onco_disk = PromisedRequirement(lambda vcf, db: 3 * vcf.size + db.size,
                                        vcf_id, config.oncotator_db)

        annotated_vcf = job.addChildJobFn(run_oncotator,
                                          vcf_id,
                                          config.oncotator_db,
                                          disk=onco_disk,
                                          cores=config.cores,
                                          memory=config.xmx)

        output_dir = os.path.join(config.output_dir, uuid)
        filename = '{}.oncotator{}.vcf'.format(uuid, config.suffix)
        annotated_vcf.addChildJobFn(output_file_job,
                                    filename,
                                    annotated_vcf.rv(),
                                    output_dir,
                                    s3_key_path=config.ssec,
                                    disk=PromisedRequirement(
                                        lambda x: x.size, annotated_vcf.rv()))
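
One subtlety in the loop above: the lambda takes vcf and db as parameters instead of closing over the loop variable, and PromisedRequirement binds vcf_id and config.oncotator_db at construction time. That sidesteps Python's late binding of closures, illustrated in plain Python:

# Classic late-binding pitfall: every lambda sees the final value of i.
fns = [lambda: i for i in range(3)]
print([f() for f in fns])   # [2, 2, 2]

# Binding the value as an argument pins it per iteration, which is what
# passing vcf_id as a PromisedRequirement argument achieves.
fns = [lambda i=i: i for i in range(3)]
print([f() for f in fns])   # [0, 1, 2]
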
Example #13
def sort_and_index_star(job, star_bams, univ_options, star_options):
    """
    A wrapper for sorting and indexing the genomic STAR bam generated by run_star. It is
    required since run_star returns a dict of 2 bams.

    :param dict star_bams: The bams from run_star
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict star_options: Options specific to star
    :return: Dict containing input bam and the generated index (.bam.bai)
                     output_files:
                        |- 'rna_transcriptome.bam': fsID
                        |- 'rna_genome':
                                 |- 'rna_sorted.bam': fsID
                                 +- 'rna_sorted.bam.bai': fsID
                        +- 'rnaChimeric.out.junction': fsID
    :rtype: dict
    """
    star_options['samtools']['n'] = star_options['n']
    sort = job.wrapJobFn(sort_bamfile,
                         star_bams['rnaAligned.out.bam'],
                         'rna',
                         univ_options,
                         samtools_options=star_options['samtools'],
                         disk=PromisedRequirement(
                             sort_disk, star_bams['rnaAligned.out.bam']))
    index = job.wrapJobFn(index_bamfile,
                          sort.rv(),
                          'rna',
                          univ_options,
                          samtools_options=star_options['samtools'],
                          sample_info='genome_sorted',
                          disk=PromisedRequirement(index_disk, sort.rv()))
    job.addChild(sort)
    sort.addChild(index)
    return {
        'rna_genome': index.rv(),
        'rna_transcriptome.bam':
        star_bams['rnaAligned.toTranscriptome.out.bam'],
        'rnaChimeric.out.junction': star_bams['rnaChimeric.out.junction']
    }
Example #14
def align_rna(job, fastqs, univ_options, star_options):
    """
    This is a convenience function that runs the entire RNA alignment subgraph.
    """
    star = job.wrapJobFn(run_star,
                         fastqs,
                         univ_options,
                         star_options,
                         cores=star_options['n'],
                         memory=PromisedRequirement(
                             lambda x: int(1.85 * x.size),
                             star_options['tool_index']),
                         disk=PromisedRequirement(star_disk, fastqs,
                                                  star_options['tool_index']))
    index = job.wrapJobFn(index_star,
                          star.rv(),
                          univ_options,
                          disk=PromisedRequirement(star_disk, fastqs,
                                                   star_options['tool_index']))
    job.addChild(star)
    star.addChild(index)
    return index.rv()
Example #15
def align_dna(job, fastqs, sample_type, univ_options, bwa_options):
    """
    This is a convenience function that runs the entire DNA alignment subgraph.
    """
    bwa = job.wrapJobFn(run_bwa,
                        fastqs,
                        sample_type,
                        univ_options,
                        bwa_options,
                        disk=PromisedRequirement(bwa_disk, fastqs,
                                                 bwa_options['tool_index']))
    sam2bam = job.wrapJobFn(bam_conversion,
                            bwa.rv(),
                            sample_type,
                            univ_options,
                            disk=PromisedRequirement(sam2bam_disk, bwa.rv()))
    # reheader takes the same disk as sam2bam so we can serialize this on the same worker.
    reheader = job.wrapJobFn(fix_bam_header,
                             sam2bam.rv(),
                             sample_type,
                             univ_options,
                             disk=PromisedRequirement(sam2bam_disk, bwa.rv()))
    regroup = job.wrapJobFn(add_readgroups,
                            reheader.rv(),
                            sample_type,
                            univ_options,
                            disk=PromisedRequirement(regroup_disk,
                                                     reheader.rv()))
    index = job.wrapJobFn(index_bamfile,
                          regroup.rv(),
                          sample_type,
                          univ_options,
                          disk=PromisedRequirement(index_disk, regroup.rv()))
    job.addChild(bwa)
    bwa.addChild(sam2bam)
    sam2bam.addChild(reheader)
    reheader.addChild(regroup)
    regroup.addChild(index)
    return index.rv()
Example #16
def index_star(job, star_bams, univ_options):
    """
    This is a wrapper function for index_bamfile in protect.common. It is required since
    run_star returns a dict of 2 bams.
    """
    index = job.wrapJobFn(index_bamfile,
                          star_bams['rnaAligned.sortedByCoord.out.bam'],
                          'rna',
                          univ_options,
                          disk=PromisedRequirement(
                              index_disk,
                              star_bams['rnaAligned.sortedByCoord.out.bam']))
    job.addChild(index)
    star_bams['rnaAligned.sortedByCoord.out.bam'] = index.rv()
    return star_bams
Example #17
def align_dna(job, fastqs, sample_type, univ_options, bwa_options):
    """
    A wrapper for the entire DNA alignment subgraph.

    :param list fastqs: The input fastqs for alignment
    :param str sample_type: Description of the sample to inject into the filename
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict bwa_options: Options specific to bwa
    :return: Dict containing output bam and bai
             output_files:
                 |- '<sample_type>_fix_pg_sorted.bam': fsID
                 +- '<sample_type>_fix_pg_sorted.bam.bai': fsID
    :rtype: dict
    """
    bwa = job.wrapJobFn(run_bwa, fastqs, sample_type, univ_options, bwa_options,
                        disk=PromisedRequirement(bwa_disk, fastqs, bwa_options['index']),
                        cores=bwa_options['n'])
    sam2bam = job.wrapJobFn(bam_conversion, bwa.rv(), sample_type, univ_options,
                            bwa_options['samtools'],
                            disk=PromisedRequirement(sam2bam_disk, bwa.rv()))
    # reheader takes the same disk as sam2bam so we can serialize this on the same worker.
    reheader = job.wrapJobFn(fix_bam_header, sam2bam.rv(), sample_type, univ_options,
                             bwa_options['samtools'],
                             disk=PromisedRequirement(sam2bam_disk, bwa.rv()))
    regroup = job.wrapJobFn(add_readgroups, reheader.rv(), sample_type, univ_options,
                            bwa_options['picard'],
                            disk=PromisedRequirement(regroup_disk, reheader.rv()))
    index = job.wrapJobFn(index_bamfile, regroup.rv(), sample_type, univ_options,
                          bwa_options['samtools'], sample_info='fix_pg_sorted',
                          disk=PromisedRequirement(index_disk, regroup.rv()))
    job.addChild(bwa)
    bwa.addChild(sam2bam)
    sam2bam.addChild(reheader)
    reheader.addChild(regroup)
    regroup.addChild(index)
    return index.rv()
Example #18
 def testPromisesWithJobStoreFileObjects(self):
     """
     Check whether FileID objects are being pickled properly when used as return
     values of functions.  Then ensure that lambdas of promised FileID objects can be
     used to describe the requirements of a subsequent job.  This type of operation will be
     used commonly in Toil scripts.
     :return: None
     """
     file1 = 1024
     file2 = 512
     F1 = Job.wrapJobFn(_writer, file1)
     F2 = Job.wrapJobFn(_writer, file2)
     G = Job.wrapJobFn(_follower, file1+file2,
                       disk=PromisedRequirement(lambda x, y: x.size + y.size,
                                                F1.rv(), F2.rv()))
     F1.addChild(F2)
     F2.addChild(G)
     Job.Runner.startToil(F1, self.getOptions(self._createTempDir('testFiles')))
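
_writer and _follower are private helpers of the test module. A hypothetical sketch consistent with the test (the real helpers may differ):

import os

def _writer(job, fileSize):
    # Write fileSize bytes into the job store and return the FileID;
    # its .size attribute feeds G's promised disk requirement.
    path = job.fileStore.getLocalTempFile()
    with open(path, 'wb') as f:
        f.write(os.urandom(fileSize))
    return job.fileStore.writeGlobalFile(path)

def _follower(job, expectedDisk):
    # G's disk was promised as the sum of the two upstream file sizes.
    assert job.disk >= expectedDisk
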
Example #19
def wrap_rsem(job, star_bams, univ_options, rsem_options):
    """
    A wrapper for run_rsem using the results from run_star as input.

    :param dict star_bams: dict of results from star
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict rsem_options: Options specific to rsem
    :return: Dict of gene- and isoform-level expression calls
             output_files:
                 |- 'rsem.genes.results': fsID
                 +- 'rsem.isoforms.results': fsID
    :rtype: dict
    """
    rsem = job.addChildJobFn(run_rsem, star_bams['rna_transcriptome.bam'],
                             univ_options, rsem_options, cores=rsem_options['n'],
                             disk=PromisedRequirement(rsem_disk, star_bams,
                                                      rsem_options['index']))

    return rsem.rv()
Example #20
def align_dna(job, fastqs, sample_type, univ_options, bwa_options):
    """
    A wrapper for the entire DNA alignment subgraph.

    :param list fastqs: The input fastqs for alignment
    :param str sample_type: Description of the sample to inject into the filename
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict bwa_options: Options specific to bwa
    :return: Dict containing output bam and bai
             output_files:
                 |- '<sample_type>_fix_pg_sorted.bam': fsID
                 +- '<sample_type>_fix_pg_sorted.bam.bai': fsID
    :rtype: dict
    """
    # The mkdup and regroup steps use picard, which allots heap space using the Xmx key in
    # the univ_options dictionary. This should be reflected in the job allotment. Since we
    # want all these jobs to run on the same node, we need to give them all the same memory
    # requirements.

    bwa = job.wrapJobFn(run_bwa, fastqs, sample_type, univ_options, bwa_options,
                        disk=PromisedRequirement(bwa_disk, fastqs, bwa_options['index']),
                        memory=univ_options['java_Xmx'],
                        cores=bwa_options['n'])
    sam2bam = job.wrapJobFn(bam_conversion, bwa.rv(), sample_type, univ_options,
                            bwa_options['samtools'],
                            disk=PromisedRequirement(sam2bam_disk, bwa.rv()),
                            memory=univ_options['java_Xmx'])
    # reheader takes the same disk as sam2bam so we can serialize this on the same worker.
    reheader = job.wrapJobFn(fix_bam_header, sam2bam.rv(), sample_type, univ_options,
                             bwa_options['samtools'],
                             disk=PromisedRequirement(sam2bam_disk, bwa.rv()),
                             memory=univ_options['java_Xmx'])
    regroup = job.wrapJobFn(add_readgroups, reheader.rv(), sample_type, univ_options,
                            bwa_options['picard'],
                            disk=PromisedRequirement(regroup_disk, reheader.rv()),
                            memory=univ_options['java_Xmx'])
    mkdup = job.wrapJobFn(mark_duplicates, regroup.rv(), sample_type, univ_options,
                          bwa_options['picard'],
                          disk=PromisedRequirement(mkdup_disk, regroup.rv()),
                          memory=univ_options['java_Xmx'])
    index = job.wrapJobFn(index_bamfile, mkdup.rv(), sample_type, univ_options,
                          bwa_options['samtools'], sample_info='fix_pg_sorted',
                          disk=PromisedRequirement(index_disk, mkdup.rv()),
                          memory=univ_options['java_Xmx'])
    job.addChild(bwa)
    bwa.addChild(sam2bam)
    sam2bam.addChild(reheader)
    reheader.addChild(regroup)
    regroup.addChild(mkdup)
    mkdup.addChild(index)
    return index.rv()
Example #21
def get_patient_bams(job, patient_dict, sample_type, univ_options, bwa_options, mutect_options):
    """
    Convenience function to return the bam and its index in the correct format for a sample type.

    :param dict patient_dict: dict of patient info
    :param str sample_type: 'tumor_rna', 'tumor_dna', 'normal_dna'
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict bwa_options: Options specific to bwa
    :param dict mutect_options: Options specific to mutect
    :return: formatted dict of bam and bai
    :rtype: dict
    """
    output_dict = {}
    if 'dna' in sample_type:
        sample_info = 'fix_pg_sorted'
        prefix = sample_type + '_' + sample_info
    else:
        sample_info = 'genome_sorted'
        prefix = 'rna_' + sample_info
    if sample_type + '_bam' in patient_dict['gdc_inputs']:
        output_dict[prefix + '.bam'] = patient_dict[sample_type + '_bam'][0]
        output_dict[prefix + '.bam.bai'] = patient_dict[sample_type + '_bam'][1]
    elif sample_type + '_bai' in patient_dict:
        output_dict[prefix + '.bam'] = patient_dict[sample_type + '_bam']
        output_dict[prefix + '.bam.bai'] = patient_dict[sample_type + '_bai']
    else:
        from protect.alignment.dna import index_bamfile, index_disk
        output_job = job.wrapJobFn(index_bamfile, patient_dict[sample_type + '_bam'],
                                   'rna' if sample_type == 'tumor_rna' else sample_type,
                                   univ_options, bwa_options['samtools'],
                                   sample_info=sample_info, export=False,
                                   disk=PromisedRequirement(index_disk,
                                                            patient_dict[sample_type + '_bam']))
        job.addChild(output_job)
        output_dict = output_job.rv()
    if sample_type == 'tumor_rna':
        return {'rna_genome': output_dict,
               'rna_transcriptome.bam': patient_dict['tumor_rna_transcriptome_bam']}
    else:
        return output_dict
Example #22
def download_and_process_fastqs(job, config):
    """

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Expando config: Dict-like object containing workflow options as attributes
    :return: Processed fastqs
    :rtype: tuple(str, str)
    """
    # Define download and process jobs
    disk = '2G' if config.ci_test else config.max_sample_size
    download = job.wrapJobFn(multiple_fastq_dowloading,
                             config,
                             sample_disk=disk).encapsulate()
    process = job.wrapJobFn(process_sample,
                            config,
                            fastq_ids=download.rv(),
                            disk=PromisedRequirement(
                                lambda xs: sum(x.size for x in xs) * 5,
                                download.rv()))

    # Wire jobs and return processed fastqs
    job.addChild(download)
    download.addChild(process)
    return process.rv()
Example #23
def hard_filter_pipeline(job, uuid, vcf_id, config):
    """
    Runs GATK Hard Filtering on a Genomic VCF file and uploads the results.

    0: Start                0 --> 1 --> 3 --> 5 --> 6
    1: Select SNPs                |           |
    2: Select INDELs              +-> 2 --> 4 +
    3: Apply SNP Filter
    4: Apply INDEL Filter
    5: Merge SNP and INDEL VCFs
    6: Write filtered VCF to output directory

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str uuid: Unique sample identifier
    :param str vcf_id: VCF FileStoreID
    :param Namespace config: Pipeline configuration options and shared files
        Requires the following config attributes:
        config.genome_fasta             FileStoreID for reference genome fasta file
        config.genome_fai               FileStoreID for reference genome fasta index file
        config.genome_dict              FileStoreID for reference genome sequence dictionary file
        config.snp_filter_name          Name of SNP filter for VCF header
        config.snp_filter_expression    SNP JEXL filter expression
        config.indel_filter_name        Name of INDEL filter for VCF header
        config.indel_filter_expression  INDEL JEXL filter expression
        config.xmx                      Java heap size in bytes
        config.suffix                   Suffix added to output filename
        config.output_dir               URL or local path to output directory
        config.ssec                     Path to key file for SSE-C encryption
    :return: SNP and INDEL FileStoreIDs
    :rtype: tuple
    """
    job.fileStore.logToMaster('Running Hard Filter on {}'.format(uuid))

    # Get the total size of the genome reference
    genome_ref_size = config.genome_fasta.size + config.genome_fai.size + config.genome_dict.size

    # The SelectVariants disk requirement depends on the input VCF, the genome reference files,
    # and the output VCF. The output VCF is smaller than the input VCF. The disk requirement
    # is identical for SNPs and INDELs.
    select_variants_disk = PromisedRequirement(
        lambda vcf, ref_size: 2 * vcf.size + ref_size, vcf_id, genome_ref_size)
    select_snps = job.wrapJobFn(gatk_select_variants,
                                'SNP',
                                vcf_id,
                                config.genome_fasta,
                                config.genome_fai,
                                config.genome_dict,
                                memory=config.xmx,
                                disk=select_variants_disk)

    # The VariantFiltration disk requirement depends on the input VCF, the genome reference files,
    # and the output VCF. The filtered VCF is smaller than the input VCF.
    snp_filter_disk = PromisedRequirement(
        lambda vcf, ref_size: 2 * vcf.size + ref_size, select_snps.rv(),
        genome_ref_size)

    snp_filter = job.wrapJobFn(gatk_variant_filtration,
                               select_snps.rv(),
                               config.snp_filter_name,
                               config.snp_filter_expression,
                               config.genome_fasta,
                               config.genome_fai,
                               config.genome_dict,
                               memory=config.xmx,
                               disk=snp_filter_disk)

    select_indels = job.wrapJobFn(gatk_select_variants,
                                  'INDEL',
                                  vcf_id,
                                  config.genome_fasta,
                                  config.genome_fai,
                                  config.genome_dict,
                                  memory=config.xmx,
                                  disk=select_variants_disk)

    indel_filter_disk = PromisedRequirement(
        lambda vcf, ref_size: 2 * vcf.size + ref_size, select_indels.rv(),
        genome_ref_size)

    indel_filter = job.wrapJobFn(gatk_variant_filtration,
                                 select_indels.rv(),
                                 config.indel_filter_name,
                                 config.indel_filter_expression,
                                 config.genome_fasta,
                                 config.genome_fai,
                                 config.genome_dict,
                                 memory=config.xmx,
                                 disk=indel_filter_disk)

    # The CombineVariants disk requirement depends on the SNP and INDEL input VCFs and the
    # genome reference files. The combined VCF is approximately the same size as the input files.
    combine_vcfs_disk = PromisedRequirement(
        lambda vcf1, vcf2, ref_size: 2 * (vcf1.size + vcf2.size) + ref_size,
        indel_filter.rv(), snp_filter.rv(), genome_ref_size)

    combine_vcfs = job.wrapJobFn(
        gatk_combine_variants,
        {
            'SNPs': snp_filter.rv(),
            'INDELs': indel_filter.rv()
        },
        config.genome_fasta,
        config.genome_fai,
        config.genome_dict,
        merge_option='UNSORTED',  # Merges variants from a single sample
        memory=config.xmx,
        disk=combine_vcfs_disk)

    job.addChild(select_snps)
    job.addChild(select_indels)

    select_snps.addChild(snp_filter)
    snp_filter.addChild(combine_vcfs)

    select_indels.addChild(indel_filter)
    indel_filter.addChild(combine_vcfs)

    # Output the hard filtered VCF
    output_dir = os.path.join(config.output_dir, uuid)
    output_filename = '%s.hard_filter%s.vcf' % (uuid, config.suffix)
    output_vcf = job.wrapJobFn(output_file_job,
                               output_filename,
                               combine_vcfs.rv(),
                               output_dir,
                               s3_key_path=config.ssec,
                               disk=PromisedRequirement(
                                   lambda x: x.size, combine_vcfs.rv()))
    combine_vcfs.addChild(output_vcf)
    return combine_vcfs.rv()
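
Note how genome_ref_size is an ordinary integer computed before the graph is built, while vcf_id and the .rv() calls are promises. PromisedRequirement mixes the two freely: concrete arguments are bound immediately, promises resolve at run time. A compact illustration with illustrative names:

from toil.job import PromisedRequirement

def variant_disk(vcf_promise, ref_size):
    # vcf_promise resolves when the upstream job finishes; ref_size is a
    # plain int bound right here.
    return PromisedRequirement(lambda vcf, ref: 2 * vcf.size + ref,
                               vcf_promise, ref_size)
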
Example #24
def run_strelka(job,
                tumor_bam,
                normal_bam,
                univ_options,
                strelka_options,
                split=True):
    """
    Run the strelka subgraph on the DNA bams, optionally splitting the results into
    per-chromosome vcfs.

    ARGUMENTS
    1. tumor_bam: Dict of input tumor WGS/WXS bam + bai
         tumor_bam
              |- 'tumor_fix_pg_sorted.bam': <JSid>
              +- 'tumor_fix_pg_sorted.bam.bai': <JSid>
    2. normal_bam: Dict of input normal WGS/WXS bam + bai
         normal_bam
              |- 'normal_fix_pg_sorted.bam': <JSid>
              +- 'normal_fix_pg_sorted.bam.bai': <JSid>
    3. univ_options: Dict of universal arguments used by almost all tools
         univ_options
                +- 'dockerhub': <dockerhub to use>
    4. strelka_options: Dict of parameters specific to strelka
         strelka_options
              |- 'dbsnp_vcf': <JSid for dbsnp vcf file>
              |- 'dbsnp_idx': <JSid for dbsnp vcf index file>
              |- 'cosmic_vcf': <JSid for cosmic vcf file>
              |- 'cosmic_idx': <JSid for cosmic vcf index file>
              |- 'genome_fasta': <JSid for genome fasta file>
              |- 'genome_dict': <JSid for genome fasta dict file>
              +- 'genome_fai': <JSid for genome fasta index file>

    RETURN VALUES
    1. perchrom_strelka: Dict of results of strelka per chromosome
         perchrom_strelka
              |- 'chr1'
              |   |- 'strelka_chr1.vcf': <JSid>
              |   +- 'strelka_chr1.out': <JSid>
              |- 'chr2'
              |   |- 'strelka_chr2.vcf': <JSid>
              |   +- 'strelka_chr2.out': <JSid>
             etc...

    This module corresponds to node 11 on the tree
    """
    chromosomes = sample_chromosomes(job, strelka_options['genome_fai'])
    num_cores = min(len(chromosomes), univ_options['max_cores'])
    strelka = job.wrapJobFn(run_strelka_full,
                            tumor_bam,
                            normal_bam,
                            univ_options,
                            strelka_options,
                            disk=PromisedRequirement(
                                strelka_disk,
                                tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                                normal_bam['normal_dna_fix_pg_sorted.bam'],
                                strelka_options['genome_fasta']),
                            memory='6G',
                            cores=num_cores)
    job.addChild(strelka)
    if split:
        unmerge_strelka = job.wrapJobFn(wrap_unmerge, strelka.rv(),
                                        strelka_options,
                                        univ_options).encapsulate()
        strelka.addChild(unmerge_strelka)
        return unmerge_strelka.rv()
    else:
        return strelka.rv()
Example #25
def vqsr_pipeline(job, uuid, vcf_id, config):
    """
    Runs GATK Variant Quality Score Recalibration.

    0: Start                        0 --> 1 --> 3 --> 4 --> 5
    1: Recalibrate SNPs                   |      |
    2: Recalibrate INDELS                 +-> 2 -+
    3: Apply SNP Recalibration
    4: Apply INDEL Recalibration
    5: Write VCF to output directory

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str uuid: unique sample identifier
    :param str vcf_id: VCF FileStoreID
    :param Namespace config: Pipeline configuration options and shared files
        Requires the following config attributes:
        config.genome_fasta             FileStoreID for reference genome fasta file
        config.genome_fai               FileStoreID for reference genome fasta index file
        config.genome_dict              FileStoreID for reference genome sequence dictionary file
        config.cores                    Number of cores for each job
        config.xmx                      Java heap size in bytes
        config.suffix                   Suffix for output filename
        config.output_dir               URL or local path to output directory
        config.ssec                     Path to key file for SSE-C encryption

        SNP VQSR attributes:
        config.snp_filter_annotations   List of GATK variant annotations
        config.hapmap                   FileStoreID for HapMap resource file
        config.omni                     FileStoreID for Omni resource file
        config.dbsnp                    FileStoreID for dbSNP resource file
        config.g1k_snp                  FileStoreID for 1000G SNP resource file

        INDEL VQSR attributes:
        config.indel_filter_annotations List of GATK variant annotations
        config.dbsnp                    FileStoreID for dbSNP resource file
        config.mills                    FileStoreID for Mills resource file

    :return: SNP and INDEL VQSR VCF FileStoreID
    :rtype: str
    """
    # Get the total size of the genome reference
    genome_ref_size = config.genome_fasta.size + config.genome_fai.size + config.genome_dict.size

    # The VariantRecalibrator disk requirement depends on the input VCF, the resource files,
    # the genome reference files, and the output recalibration table, tranche file, and plots.
    # The combined size of these outputs is less than that of the input VCF.
    snp_resources = ['hapmap', 'omni', 'dbsnp', 'g1k_snp']
    snp_resource_size = sum(
        getattr(config, resource).size for resource in snp_resources)
    snp_recal_disk = PromisedRequirement(
        lambda in_vcf, ref_size, resource_size: 2 * in_vcf.size + ref_size +
        resource_size, vcf_id, genome_ref_size, snp_resource_size)

    snp_recal = job.wrapJobFn(gatk_variant_recalibrator,
                              'SNP',
                              vcf_id,
                              config.genome_fasta,
                              config.genome_fai,
                              config.genome_dict,
                              get_short_annotations(
                                  config.snp_filter_annotations),
                              hapmap=config.hapmap,
                              omni=config.omni,
                              phase=config.g1k_snp,
                              dbsnp=config.dbsnp,
                              unsafe_mode=config.unsafe_mode,
                              disk=snp_recal_disk,
                              cores=config.cores,
                              memory=config.xmx)

    indel_resource_size = config.mills.size + config.dbsnp.size
    indel_recal_disk = PromisedRequirement(
        lambda in_vcf, ref_size, resource_size: 2 * in_vcf.size + ref_size +
        resource_size, vcf_id, genome_ref_size, indel_resource_size)

    indel_recal = job.wrapJobFn(gatk_variant_recalibrator,
                                'INDEL',
                                vcf_id,
                                config.genome_fasta,
                                config.genome_fai,
                                config.genome_dict,
                                get_short_annotations(
                                    config.indel_filter_annotations),
                                dbsnp=config.dbsnp,
                                mills=config.mills,
                                unsafe_mode=config.unsafe_mode,
                                disk=indel_recal_disk,
                                cores=config.cores,
                                memory=config.xmx)

    # The ApplyRecalibration disk requirement depends on the input VCF size, the variant
    # recalibration table, the tranche file, the genome reference file, and the output VCF.
    # This step labels variants as filtered, so the output VCF file should be slightly larger
    # than the input file. Estimate a 10% increase in the VCF file size.
    apply_snp_recal_disk = PromisedRequirement(
        lambda in_vcf, recal, tranche, ref_size: int(
            2.1 * in_vcf.size + recal.size + tranche.size + ref_size), vcf_id,
        snp_recal.rv(0), snp_recal.rv(1), genome_ref_size)

    apply_snp_recal = job.wrapJobFn(gatk_apply_variant_recalibration,
                                    'SNP',
                                    vcf_id,
                                    snp_recal.rv(0),
                                    snp_recal.rv(1),
                                    config.genome_fasta,
                                    config.genome_fai,
                                    config.genome_dict,
                                    unsafe_mode=config.unsafe_mode,
                                    disk=apply_snp_recal_disk,
                                    cores=config.cores,
                                    memory=config.xmx)

    apply_indel_recal_disk = PromisedRequirement(
        lambda in_vcf, recal, tranche, ref_size: int(
            2.1 * in_vcf.size + recal.size + tranche.size + ref_size), vcf_id,
        indel_recal.rv(0), indel_recal.rv(1), genome_ref_size)

    apply_indel_recal = job.wrapJobFn(gatk_apply_variant_recalibration,
                                      'INDEL',
                                      apply_snp_recal.rv(),
                                      indel_recal.rv(0),
                                      indel_recal.rv(1),
                                      config.genome_fasta,
                                      config.genome_fai,
                                      config.genome_dict,
                                      unsafe_mode=config.unsafe_mode,
                                      disk=apply_indel_recal_disk,
                                      cores=config.cores,
                                      memory=config.xmx)

    job.addChild(snp_recal)
    job.addChild(indel_recal)
    snp_recal.addChild(apply_snp_recal)
    indel_recal.addChild(apply_indel_recal)
    apply_snp_recal.addChild(apply_indel_recal)

    # Output recalibrated VCF
    output_dir = os.path.join(config.output_dir, uuid)
    vqsr_name = '%s.vqsr%s.vcf' % (uuid, config.suffix)
    output_vqsr = job.wrapJobFn(output_file_job,
                                vqsr_name,
                                apply_indel_recal.rv(),
                                output_dir,
                                s3_key_path=config.ssec,
                                disk=PromisedRequirement(
                                    lambda x: x.size, apply_indel_recal.rv()))
    apply_indel_recal.addChild(output_vqsr)
    return apply_indel_recal.rv()
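
gatk_variant_recalibrator evidently returns multiple outputs; snp_recal.rv(0) and snp_recal.rv(1) select the recalibration table and the tranches file from that return value. In general, rv() takes an index into the promised result, sketched here with illustrative names:

def recalibrate(job):
    # Stand-in returning a (recal_table, tranches) pair.
    return 'recal_table_id', 'tranches_id'

def apply_recal(job, table, tranches):
    job.fileStore.logToMaster('applying %s / %s' % (table, tranches))

def root(job):
    recal = job.addChildJobFn(recalibrate)
    # rv(0) and rv(1) resolve to the corresponding tuple elements.
    recal.addFollowOnJobFn(apply_recal, recal.rv(0), recal.rv(1))
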
Example #26
def launch_protect(job, patient_data, univ_options, tool_options):
    """
    The launchpad for ProTECT. The DAG for ProTECT can be viewed in Flowchart.txt.

    :param dict patient_data: Dict of information regarding the input sequences for the patient
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict tool_options: Options for the various tools
    """
    # Add patient id to univ_options as it is passed to every major node in the DAG and can be
    # used as a prefix for the logfile.
    univ_options['patient'] = patient_data['patient_id']
    univ_options['tumor_type'] = patient_data['tumor_type']
    # Ascertain number of cpus to use per job
    for tool in tool_options:
        tool_options[tool]['n'] = ascertain_cpu_share(univ_options['max_cores'])
    # Define the various nodes in the DAG
    # Need a logfile and a way to send it around
    sample_prep = job.wrapJobFn(prepare_samples, patient_data, univ_options, disk='40G')
    job.addChild(sample_prep)
    # Define the fastq deletion step
    fastq_deletion_1 = job.wrapJobFn(delete_fastqs, sample_prep.rv(), disk='100M', memory='100M')
    sample_prep.addChild(fastq_deletion_1)
    # Get all the input files
    haplotype_patient = get_mutations = None
    fastq_files = defaultdict(lambda: None)
    bam_files = defaultdict(lambda: None)
    delete_bam_files = defaultdict(lambda: None)
    phlat_files = defaultdict(lambda: None)
    for sample_type in 'tumor_dna', 'normal_dna', 'tumor_rna':
        if sample_type + '_fastq_1' in patient_data:
            fastq_files[sample_type] = job.wrapJobFn(get_patient_fastqs, sample_prep.rv(),
                                                     sample_type, disk='10M')
            sample_prep.addChild(fastq_files[sample_type])
            fastq_files[sample_type].addChild(fastq_deletion_1)
        elif sample_type + '_bam' in patient_data:
            bam_files[sample_type] = job.wrapJobFn(get_patient_bams, sample_prep.rv(), sample_type,
                                                   univ_options, tool_options['bwa'],
                                                   tool_options['mutect'],
                                                   disk='10M').encapsulate()
            sample_prep.addChild(bam_files[sample_type])

    # define the haplotyping subgraph of the DAG
    if 'hla_haplotype_files' in patient_data:
        haplotype_patient = job.wrapJobFn(get_patient_mhc_haplotype, sample_prep.rv())
        sample_prep.addChild(haplotype_patient)
    else:
        assert None not in fastq_files.values()
        # We are guaranteed to have fastqs here
        for sample_type in 'tumor_dna', 'normal_dna', 'tumor_rna':
            phlat_files[sample_type] = job.wrapJobFn(
                run_phlat, fastq_files[sample_type].rv(), sample_type, univ_options,
                tool_options['phlat'], cores=tool_options['phlat']['n'],
                disk=PromisedRequirement(phlat_disk, fastq_files[sample_type].rv()))
            fastq_files[sample_type].addChild(phlat_files[sample_type])
            phlat_files[sample_type].addChild(fastq_deletion_1)
        haplotype_patient = job.wrapJobFn(merge_phlat_calls,
                                          phlat_files['tumor_dna'].rv(),
                                          phlat_files['normal_dna'].rv(),
                                          phlat_files['tumor_rna'].rv(),
                                          univ_options, disk='100M', memory='100M', cores=1)
        phlat_files['tumor_dna'].addChild(haplotype_patient)
        phlat_files['normal_dna'].addChild(haplotype_patient)
        phlat_files['tumor_rna'].addChild(haplotype_patient)

    # Define the RNA-Seq Alignment subgraph if needed
    if bam_files['tumor_rna'] is None:
        assert fastq_files['tumor_rna'] is not None
        cutadapt = job.wrapJobFn(run_cutadapt, fastq_files['tumor_rna'].rv(), univ_options,
                                 tool_options['cutadapt'], cores=1,
                                 disk=PromisedRequirement(cutadapt_disk,
                                                          fastq_files['tumor_rna'].rv()))
        bam_files['tumor_rna'] = job.wrapJobFn(align_rna, cutadapt.rv(), univ_options,
                                               tool_options['star'], cores=1,
                                               disk='100M').encapsulate()
        fastq_deletion_2 = job.wrapJobFn(delete_fastqs, {'cutadapted_rnas': cutadapt.rv()},
                                         disk='100M', memory='100M')
        fastq_files['tumor_rna'].addChild(cutadapt)
        cutadapt.addChild(fastq_deletion_1)
        cutadapt.addChild(fastq_deletion_2)
        cutadapt.addChild(bam_files['tumor_rna'])
        bam_files['tumor_rna'].addChild(fastq_deletion_2)
        # Define the fusion calling node

        tool_options['star_fusion']['index'] = tool_options['star']['index']
        tool_options['fusion_inspector']['index'] = tool_options['star']['index']
        fusions = job.wrapJobFn(wrap_fusion,
                                cutadapt.rv(),
                                bam_files['tumor_rna'].rv(),
                                univ_options,
                                tool_options['star_fusion'],
                                tool_options['fusion_inspector'],
                                disk='100M', memory='100M', cores=1).encapsulate()

        bam_files['tumor_rna'].addChild(fusions)
        fusions.addChild(fastq_deletion_1)
        fusions.addChild(fastq_deletion_2)
    else:
        if tool_options['star_fusion']['run'] is True:
            job.fileStore.logToMaster('Input RNA bams were provided for sample %s. Fusion '
                                      'detection can only be run with input fastqs.'
                                      % univ_options['patient'])
        fusions = None

    # Define the Expression estimation node
    rsem = job.wrapJobFn(wrap_rsem, bam_files['tumor_rna'].rv(), univ_options, tool_options['rsem'],
                         cores=1, disk='100M').encapsulate()
    bam_files['tumor_rna'].addChild(rsem)
    # Define the bam deletion node
    delete_bam_files['tumor_rna'] = job.wrapJobFn(delete_bams,
                                                  bam_files['tumor_rna'].rv(),
                                                  univ_options['patient'], disk='100M',
                                                  memory='100M')
    bam_files['tumor_rna'].addChild(delete_bam_files['tumor_rna'])
    rsem.addChild(delete_bam_files['tumor_rna'])
    if fusions:
        fusions.addChild(delete_bam_files['tumor_rna'])
    # Define the reporting leaves
    if phlat_files['tumor_rna'] is not None:
        mhc_pathway_assessment = job.wrapJobFn(run_mhc_gene_assessment, rsem.rv(),
                                               phlat_files['tumor_rna'].rv(), univ_options,
                                               tool_options['reports'], disk='100M',
                                               memory='100M', cores=1)
        rsem.addChild(mhc_pathway_assessment)
        phlat_files['tumor_rna'].addChild(mhc_pathway_assessment)
    else:
        mhc_pathway_assessment = job.wrapJobFn(run_mhc_gene_assessment, rsem.rv(), None,
                                               univ_options, tool_options['reports'],
                                               disk='100M', memory='100M', cores=1)
        rsem.addChild(mhc_pathway_assessment)
    itx_resistance_assessment = job.wrapJobFn(run_itx_resistance_assessment, rsem.rv(),
                                              univ_options, tool_options['reports'],
                                              disk='100M', memory='100M', cores=1)
    rsem.addChild(itx_resistance_assessment)
    car_t_validity_assessment = job.wrapJobFn(run_car_t_validity_assessment, rsem.rv(),
                                              univ_options, tool_options['reports'],
                                              disk='100M', memory='100M', cores=1)
    rsem.addChild(car_t_validity_assessment)
    # Define the DNA-Seq alignment and mutation calling subgraphs if necessary
    if 'mutation_vcf' in patient_data:
        get_mutations = job.wrapJobFn(get_patient_vcf, sample_prep.rv())
        sample_prep.addChild(get_mutations)
    else:
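        # Every sample type must have either input fastqs or an input bam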
        assert (None, None) not in zip(fastq_files.values(), bam_files.values())
        for sample_type in 'tumor_dna', 'normal_dna':
            if bam_files[sample_type] is None:
                assert fastq_files[sample_type] is not None
                bam_files[sample_type] = job.wrapJobFn(align_dna, fastq_files[sample_type].rv(),
                                                       sample_type, univ_options,
                                                       tool_options['bwa'], cores=1,
                                                       disk='100M').encapsulate()
                fastq_files[sample_type].addChild(bam_files[sample_type])
                bam_files[sample_type].addChild(fastq_deletion_1)
            else:
                # We already have the bam ready to go
                pass
            delete_bam_files[sample_type] = job.wrapJobFn(delete_bams,
                                                          bam_files[sample_type].rv(),
                                                          univ_options['patient'], disk='100M',
                                                          memory='100M')
            bam_files[sample_type].addChild(delete_bam_files[sample_type])
        # Time to call mutations
        mutations = {
            'radia': job.wrapJobFn(run_radia, bam_files['tumor_rna'].rv(),
                                   bam_files['tumor_dna'].rv(), bam_files['normal_dna'].rv(),
                                   univ_options, tool_options['radia'],
                                   disk='100M').encapsulate(),
            'mutect': job.wrapJobFn(run_mutect, bam_files['tumor_dna'].rv(),
                                    bam_files['normal_dna'].rv(), univ_options,
                                    tool_options['mutect'], disk='100M').encapsulate(),
            'muse': job.wrapJobFn(run_muse, bam_files['tumor_dna'].rv(),
                                  bam_files['normal_dna'].rv(), univ_options,
                                  tool_options['muse']).encapsulate(),
            'somaticsniper': job.wrapJobFn(run_somaticsniper, bam_files['tumor_dna'].rv(),
                                           bam_files['normal_dna'].rv(), univ_options,
                                           tool_options['somaticsniper']).encapsulate(),
            'strelka': job.wrapJobFn(run_strelka, bam_files['tumor_dna'].rv(),
                                     bam_files['normal_dna'].rv(), univ_options,
                                     tool_options['strelka']).encapsulate(),
            'indels': job.wrapJobFn(run_indel_caller, bam_files['tumor_dna'].rv(),
                                    bam_files['normal_dna'].rv(), univ_options, 'indel_options',
                                    disk='100M', memory='100M', cores=1)}
        for sample_type in 'tumor_dna', 'normal_dna':
            for caller in mutations:
                bam_files[sample_type].addChild(mutations[caller])
        bam_files['tumor_rna'].addChild(mutations['radia'])
        get_mutations = job.wrapJobFn(run_mutation_aggregator,
                                      {caller: cjob.rv() for caller, cjob in mutations.items()},
                                      univ_options, disk='100M', memory='100M',
                                      cores=1).encapsulate()
        for caller in mutations:
            mutations[caller].addChild(get_mutations)
        # We don't need the normal dna bam any more
        get_mutations.addChild(delete_bam_files['normal_dna'])
        # We may need the tumor one depending on OxoG
        if not patient_data['filter_for_OxoG']:
            get_mutations.addChild(delete_bam_files['tumor_dna'])

    # The rest of the subgraph should be unchanged
    snpeff = job.wrapJobFn(run_snpeff, get_mutations.rv(), univ_options, tool_options['snpeff'],
                           disk=PromisedRequirement(snpeff_disk,
                                                    tool_options['snpeff']['index']))
    get_mutations.addChild(snpeff)
    tumor_dna_bam = bam_files['tumor_dna'].rv() if patient_data['filter_for_OxoG'] else None
    fusion_calls = fusions.rv() if fusions else None
    transgene = job.wrapJobFn(run_transgene, snpeff.rv(), bam_files['tumor_rna'].rv(), univ_options,
                              tool_options['transgene'],
                              disk=PromisedRequirement(transgene_disk, bam_files['tumor_rna'].rv()),
                              memory='100M', cores=1, tumor_dna_bam=tumor_dna_bam,
                              fusion_calls=fusion_calls)
    snpeff.addChild(transgene)
    bam_files['tumor_rna'].addChild(transgene)
    transgene.addChild(delete_bam_files['tumor_rna'])
    if patient_data['filter_for_OxoG']:
        bam_files['tumor_dna'].addChild(transgene)
        transgene.addChild(delete_bam_files['tumor_dna'])
    if fusions:
        fusions.addChild(transgene)

    spawn_mhc = job.wrapJobFn(spawn_antigen_predictors, transgene.rv(), haplotype_patient.rv(),
                              univ_options, (tool_options['mhci'], tool_options['mhcii']),
                              disk='100M', memory='100M', cores=1).encapsulate()
    haplotype_patient.addChild(spawn_mhc)
    transgene.addChild(spawn_mhc)

    merge_mhc = job.wrapJobFn(merge_mhc_peptide_calls, spawn_mhc.rv(), transgene.rv(), univ_options,
                              disk='100M', memory='100M', cores=1)
    spawn_mhc.addFollowOn(merge_mhc)
    transgene.addChild(merge_mhc)

    rankboost = job.wrapJobFn(wrap_rankboost, rsem.rv(), merge_mhc.rv(), transgene.rv(),
                              univ_options, tool_options['rankboost'], disk='100M', memory='100M',
                              cores=1)
    rsem.addChild(rankboost)
    merge_mhc.addChild(rankboost)
    transgene.addChild(rankboost)
    report_success = job.wrapJobFn(email_report, univ_options)
    rankboost.addChild(report_success)
    return None
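
A minimal sketch (not part of the pipeline above) of the PromisedRequirement
pattern used throughout this graph: a child's disk requirement is computed at
runtime from the size of a file its parent has not yet produced. The names
produce_file and consume_file are hypothetical.

from toil.job import Job, PromisedRequirement


def produce_file(job):
    # Write a file to the job store; the returned FileID carries a .size
    path = job.fileStore.getLocalTempFile()
    with open(path, 'w') as fh:
        fh.write('x' * 1024)
    return job.fileStore.writeGlobalFile(path)


def consume_file(job, file_id):
    job.fileStore.readGlobalFile(file_id)


def root(job):
    producer = job.addChildJobFn(produce_file)
    # Reserve twice the (not-yet-known) file size once producer has run
    producer.addChildJobFn(consume_file, producer.rv(),
                           disk=PromisedRequirement(lambda f: 2 * f.size,
                                                    producer.rv()))
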
Example #27
def run_radia(job, rna_bam, tumor_bam, normal_bam, univ_options,
              radia_options):
    """
    Spawn a RADIA job for each chromosome on the input bam trios.

    :param dict rna_bam: Dict of bam and bai for tumor RNA-Seq.  It can be one of two formats
           rna_bam:   # Just the genomic bam and bai
                |- 'rna_genome_sorted.bam': fsID
                +- 'rna_genome_sorted.bam.bai': fsID
           OR
           rna_bam:   # The output from run_star
               |- 'rna_transcriptome.bam': fsID
               |- 'rna_genome':     # Only this part will be used
                       |- 'rna_genome_sorted.bam': fsID
                       +- 'rna_genome_sorted.bam.bai': fsID
    :param dict tumor_bam: Dict of bam and bai for tumor DNA-Seq
    :param dict normal_bam: Dict of bam and bai for normal DNA-Seq
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict radia_options: Options specific to RADIA
    :return: Dict of results from running RADIA on every chromosome
             perchrom_radia:
                 |- 'chr1': fsID
                 |- 'chr2': fsID
                 |
                 |-...
                 |
                 +- 'chrM': fsID
    :rtype: dict
    """
    if 'rna_genome' in rna_bam:
        rna_bam = rna_bam['rna_genome']
    elif set(rna_bam) == {'rna_genome_sorted.bam',
                          'rna_genome_sorted.bam.bai'}:
        pass
    else:
        raise RuntimeError('An improperly formatted dict was passed to '
                           'rna_bam.')

    bams = {
        'tumor_rna': rna_bam['rna_genome_sorted.bam'],
        'tumor_rnai': rna_bam['rna_genome_sorted.bam.bai'],
        'tumor_dna': tumor_bam['tumor_dna_fix_pg_sorted.bam'],
        'tumor_dnai': tumor_bam['tumor_dna_fix_pg_sorted.bam.bai'],
        'normal_dna': normal_bam['normal_dna_fix_pg_sorted.bam'],
        'normal_dnai': normal_bam['normal_dna_fix_pg_sorted.bam.bai']
    }
    # Get a list of chromosomes to process
    if radia_options['chromosomes']:
        chromosomes = radia_options['chromosomes']
    else:
        chromosomes = sample_chromosomes(job, radia_options['genome_fai'])
    perchrom_radia = {}
    for chrom in chromosomes:
        radia = job.addChildJobFn(
            run_radia_perchrom,
            bams,
            univ_options,
            radia_options,
            chrom,
            memory='6G',
            disk=PromisedRequirement(
                radia_disk, tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                normal_bam['normal_dna_fix_pg_sorted.bam'],
                rna_bam['rna_genome_sorted.bam'],
                radia_options['genome_fasta']))
        filter_radia = radia.addChildJobFn(
            run_filter_radia,
            bams,
            radia.rv(),
            univ_options,
            radia_options,
            chrom,
            memory='6G',
            disk=PromisedRequirement(
                radia_disk, tumor_bam['tumor_dna_fix_pg_sorted.bam'],
                normal_bam['normal_dna_fix_pg_sorted.bam'],
                rna_bam['rna_genome_sorted.bam'],
                radia_options['genome_fasta']))
        perchrom_radia[chrom] = filter_radia.rv()
    job.fileStore.logToMaster('Ran run_radia on %s successfully' %
                              univ_options['patient'])
    return perchrom_radia
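
For reference, a hedged sketch of the two rna_bam shapes accepted above; the
fsID strings are hypothetical placeholders for real FileStoreIDs.

# Shape 1: just the genomic bam and bai
rna_bam_genome_only = {
    'rna_genome_sorted.bam': 'fsID-genome-bam',
    'rna_genome_sorted.bam.bai': 'fsID-genome-bai',
}
# Shape 2: the full run_star output; only the 'rna_genome' sub-dict is used
rna_bam_from_star = {
    'rna_transcriptome.bam': 'fsID-transcriptome-bam',
    'rna_genome': rna_bam_genome_only,
}
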
Example #28
def workflow(job, sample, config):
    """
    Creates workflow graph for each sample based on configuration options

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param list(str, str, str, str) sample: Sample information - filetype, paired/unpaired, UUID, and URL
    :param Expando config: Dict-like object containing workflow options as attributes
    """
    # Create copy of config to store sample-specific information
    config = config.copy()
    config.file_type, config.paired, config.uuid, config.url = sample
    config.paired = config.paired == 'paired'
    config.cores = min(config.maxCores, multiprocessing.cpu_count())

    # Download and process input based on file type
    # `inputs` will return the FileStoreID(s) of the R1 / R2 fastq
    if config.file_type == 'bam':
        disk = '2G' if config.ci_test else config.max_sample_size
        disk = human2bytes(disk) * 5
        inputs = job.wrapJobFn(download_and_process_bam, config, disk=disk)

    elif config.file_type == 'tar':
        inputs = job.wrapJobFn(download_and_process_tar, config).encapsulate()

    else:
        # Check whether the fastqs are gzipped
        config.gz = True if config.url.split(',')[0].endswith('gz') else None
        inputs = job.wrapJobFn(download_and_process_fastqs,
                               config).encapsulate()

    # Add inputs as first child to root job
    job.addChild(inputs)

    # Define preliminary disk and create dictionary for storing output
    disk = PromisedRequirement(
        lambda xs: sum(x.size for x in xs if x) + human2bytes('2G'),
        inputs.rv())
    # Core optimization: cap at 16 cores on machines with 32 or more
    cores = 16 if config.cores >= 32 else config.cores
    output = {}

    # DAG wiring for remainder of workflow
    # FASTQC
    if config.fastqc:
        fastqc = job.wrapJobFn(run_fastqc,
                               r1_id=inputs.rv(0),
                               r2_id=inputs.rv(1),
                               cores=2,
                               disk=disk)
        inputs.addChild(fastqc)
        output['QC/fastQC'] = fastqc.rv()

    # Kallisto
    if config.kallisto_index:
        kallisto = job.wrapJobFn(run_kallisto,
                                 r1_id=inputs.rv(0),
                                 r2_id=inputs.rv(1),
                                 kallisto_index_url=config.kallisto_index,
                                 cores=cores,
                                 disk=disk)
        inputs.addChild(kallisto)
        output['Kallisto'] = kallisto.rv()

    # Hera
    if config.hera_index:
        hera = job.wrapJobFn(run_hera,
                             r1_id=inputs.rv(0),
                             r2_id=inputs.rv(1),
                             hera_index_url=config.hera_index,
                             cores=config.cores,
                             disk=disk)
        inputs.addChild(hera)
        output['Hera'] = hera.rv()

    # STAR and RSEM
    if config.star_index and config.rsem_ref:
        if config.ci_test:
            disk = '2G'
            mem = '2G'
        else:
            disk = PromisedRequirement(
                lambda xs: sum(x.size for x in xs if x) + human2bytes('25G'),
                inputs.rv())
            mem = '40G'

        # STAR returns: transcriptome_id, star_id, aligned_id, wiggle_id
        sort = bool(config.wiggle)
        star = job.wrapJobFn(run_star,
                             inputs.rv(0),
                             inputs.rv(1),
                             star_index_url=config.star_index,
                             wiggle=config.wiggle,
                             sort=sort,
                             save_aligned_bam=config.save_bam,
                             cores=config.cores,
                             memory=mem,
                             disk=disk)
        inputs.addChild(star)
        output['QC/STAR'] = star.rv(1)

        # Handle optional files user can save
        if config.save_bam:
            star.addChildJobFn(sort_and_save_bam,
                               config,
                               bam_id=star.rv(2),
                               skip_sort=sort)
        if config.wiggle:
            star.addChildJobFn(save_wiggle, config, wiggle_id=star.rv(3))

        # RSEM returns: gene_id, isoform_id
        rsem = job.wrapJobFn(run_rsem,
                             bam_id=star.rv(0),
                             rsem_ref_url=config.rsem_ref,
                             paired=config.paired,
                             cores=cores,
                             disk=PromisedRequirement(
                                 lambda x: x.size + human2bytes('2G'),
                                 star.rv(0)))
        star.addChild(rsem)

        # RSEM postprocess returns: rsem_id, rsem_hugo_id
        rsem_postprocess = job.wrapJobFn(run_rsem_gene_mapping,
                                         rsem_gene_id=rsem.rv(0),
                                         rsem_isoform_id=rsem.rv(1))
        rsem.addChild(rsem_postprocess)
        output['RSEM'] = rsem_postprocess.rv(0)
        output['RSEM/Hugo'] = rsem_postprocess.rv(1)

        # Cleanup
        star.addFollowOnJobFn(cleanup_ids,
                              ids_to_delete=[star.rv(2),
                                             star.rv(3)])
        rsem.addChildJobFn(cleanup_ids, ids_to_delete=[star.rv(0)])
        rsem_postprocess.addChildJobFn(cleanup_ids,
                                       ids_to_delete=[rsem.rv(0),
                                                      rsem.rv(1)])

    # Cleanup and Consolidate
    job.addFollowOnJobFn(cleanup_ids, [inputs.rv(0), inputs.rv(1)])
    job.addFollowOnJobFn(consolidate_output, config, output)
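
A short sketch of the indexed-promise pattern the workflow above relies on:
rv(i) creates a promise for element i of a job's return value, so a tuple
that does not exist yet can be fanned out to several children. The function
names here are hypothetical.

from toil.job import Job


def make_pair(job):
    # Stand-ins for the R1/R2 FileStoreIDs an input job would return
    return 'r1-id', 'r2-id'


def use_r1(job, r1_id):
    job.fileStore.logToMaster('received %s' % r1_id)


def root(job):
    pair = job.addChildJobFn(make_pair)
    # pair.rv(0) resolves to 'r1-id' only once make_pair has finished
    pair.addChildJobFn(use_r1, pair.rv(0))
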
Example #29
def pipeline_launchpad(job, fastqs, univ_options, tool_options):
    """
    The precision immuno pipeline begins in this module. The DAG can be viewed in Flowchart.txt

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param dict fastqs: Dict of lists of fastq files
    :param dict univ_options: Dict of universal options used by almost all tools
    :param dict tool_options: Options for the various tools
    :return: None
    """
    # Add the patient id to univ_options as it is passed to every major node in the DAG and can be used
    # as a prefix for the logfile.
    univ_options['patient'] = fastqs['patient_id']
    # Ascertain the number of CPUs to use per job
    tool_options['star']['n'] = tool_options['bwa']['n'] = tool_options['phlat']['n'] = \
        tool_options['rsem']['n'] = ascertain_cpu_share(univ_options['max_cores'])
    # Define the various nodes in the DAG
    # Need a logfile and a way to send it around
    sample_prep = job.wrapJobFn(prepare_samples,
                                fastqs,
                                univ_options,
                                disk='40G')
    tumor_dna_fqs = job.wrapJobFn(get_fqs,
                                  sample_prep.rv(),
                                  'tumor_dna',
                                  disk='10M')
    normal_dna_fqs = job.wrapJobFn(get_fqs,
                                   sample_prep.rv(),
                                   'normal_dna',
                                   disk='10M')
    tumor_rna_fqs = job.wrapJobFn(get_fqs,
                                  sample_prep.rv(),
                                  'tumor_rna',
                                  disk='10M')
    cutadapt = job.wrapJobFn(run_cutadapt,
                             tumor_rna_fqs.rv(),
                             univ_options,
                             tool_options['cutadapt'],
                             cores=1,
                             disk=PromisedRequirement(cutadapt_disk,
                                                      tumor_rna_fqs.rv()))
    star = job.wrapJobFn(align_rna,
                         cutadapt.rv(),
                         univ_options,
                         tool_options['star'],
                         cores=1,
                         disk='100M').encapsulate()
    bwa_tumor = job.wrapJobFn(align_dna,
                              tumor_dna_fqs.rv(),
                              'tumor_dna',
                              univ_options,
                              tool_options['bwa'],
                              cores=1,
                              disk='100M').encapsulate()
    bwa_normal = job.wrapJobFn(align_dna,
                               normal_dna_fqs.rv(),
                               'normal_dna',
                               univ_options,
                               tool_options['bwa'],
                               cores=1,
                               disk='100M').encapsulate()
    phlat_tumor_dna = job.wrapJobFn(run_phlat,
                                    tumor_dna_fqs.rv(),
                                    'tumor_dna',
                                    univ_options,
                                    tool_options['phlat'],
                                    cores=tool_options['phlat']['n'],
                                    disk=PromisedRequirement(
                                        phlat_disk, tumor_dna_fqs.rv()))
    phlat_normal_dna = job.wrapJobFn(run_phlat,
                                     normal_dna_fqs.rv(),
                                     'normal_dna',
                                     univ_options,
                                     tool_options['phlat'],
                                     cores=tool_options['phlat']['n'],
                                     disk=PromisedRequirement(
                                         phlat_disk, normal_dna_fqs.rv()))
    phlat_tumor_rna = job.wrapJobFn(run_phlat,
                                    tumor_rna_fqs.rv(),
                                    'tumor_rna',
                                    univ_options,
                                    tool_options['phlat'],
                                    cores=tool_options['phlat']['n'],
                                    disk=PromisedRequirement(
                                        phlat_disk, tumor_rna_fqs.rv()))
    fastq_deletion_1 = job.wrapJobFn(delete_fastqs,
                                     sample_prep.rv(),
                                     disk='100M',
                                     memory='100M')
    fastq_deletion_2 = job.wrapJobFn(delete_fastqs,
                                     {'cutadapted_rnas': cutadapt.rv()},
                                     disk='100M',
                                     memory='100M')
    rsem = job.wrapJobFn(wrap_rsem,
                         star.rv(),
                         univ_options,
                         tool_options['rsem'],
                         cores=tool_options['rsem']['n'],
                         disk='100M').encapsulate()
    mhc_pathway_assessment = job.wrapJobFn(
        run_mhc_gene_assessment,
        rsem.rv(),
        phlat_tumor_rna.rv(),
        univ_options,
        tool_options['mhc_pathway_assessment'],
        disk='100M',
        memory='100M',
        cores=1)
    fusions = job.wrapJobFn(run_fusion_caller,
                            star.rv(),
                            univ_options,
                            'fusion_options',
                            disk='100M',
                            memory='100M',
                            cores=1)
    radia = job.wrapJobFn(run_radia,
                          star.rv(),
                          bwa_tumor.rv(),
                          bwa_normal.rv(),
                          univ_options,
                          tool_options['mut_callers'],
                          disk='100M').encapsulate()
    mutect = job.wrapJobFn(run_mutect,
                           bwa_tumor.rv(),
                           bwa_normal.rv(),
                           univ_options,
                           tool_options['mut_callers'],
                           disk='100M').encapsulate()
    muse = job.wrapJobFn(run_muse, bwa_tumor.rv(), bwa_normal.rv(),
                         univ_options,
                         tool_options['mut_callers']).encapsulate()
    somaticsniper = job.wrapJobFn(run_somaticsniper, bwa_tumor.rv(),
                                  bwa_normal.rv(), univ_options,
                                  tool_options['mut_callers']).encapsulate()
    strelka = job.wrapJobFn(run_strelka, bwa_tumor.rv(), bwa_normal.rv(),
                            univ_options,
                            tool_options['mut_callers']).encapsulate()
    indels = job.wrapJobFn(run_indel_caller,
                           bwa_tumor.rv(),
                           bwa_normal.rv(),
                           univ_options,
                           'indel_options',
                           disk='100M',
                           memory='100M',
                           cores=1)
    merge_mutations = job.wrapJobFn(run_mutation_aggregator, {
        'fusions': fusions.rv(),
        'radia': radia.rv(),
        'mutect': mutect.rv(),
        'strelka': strelka.rv(),
        'indels': indels.rv(),
        'muse': muse.rv(),
        'somaticsniper': somaticsniper.rv()
    },
                                    univ_options,
                                    disk='100M',
                                    memory='100M',
                                    cores=1).encapsulate()
    snpeff = job.wrapJobFn(run_snpeff,
                           merge_mutations.rv(),
                           univ_options,
                           tool_options['snpeff'],
                           disk=PromisedRequirement(
                               snpeff_disk,
                               tool_options['snpeff']['tool_index']))
    transgene = job.wrapJobFn(run_transgene,
                              snpeff.rv(),
                              star.rv(),
                              univ_options,
                              tool_options['transgene'],
                              disk='100M',
                              memory='100M',
                              cores=1)
    merge_phlat = job.wrapJobFn(merge_phlat_calls,
                                phlat_tumor_dna.rv(),
                                phlat_normal_dna.rv(),
                                phlat_tumor_rna.rv(),
                                univ_options,
                                disk='100M',
                                memory='100M',
                                cores=1)
    spawn_mhc = job.wrapJobFn(spawn_antigen_predictors,
                              transgene.rv(),
                              merge_phlat.rv(),
                              univ_options,
                              (tool_options['mhci'], tool_options['mhcii']),
                              disk='100M',
                              memory='100M',
                              cores=1).encapsulate()
    merge_mhc = job.wrapJobFn(merge_mhc_peptide_calls,
                              spawn_mhc.rv(),
                              transgene.rv(),
                              univ_options,
                              disk='100M',
                              memory='100M',
                              cores=1)
    rank_boost = job.wrapJobFn(wrap_rankboost,
                               rsem.rv(),
                               merge_mhc.rv(),
                               transgene.rv(),
                               univ_options,
                               tool_options['rank_boost'],
                               disk='100M',
                               memory='100M',
                               cores=1)
    # Define the DAG in a static form
    job.addChild(sample_prep)  # Edge  0->1
    # A. The first step is running the alignments and the MHC haplotypers
    sample_prep.addChild(tumor_dna_fqs)  # Edge  1->2
    sample_prep.addChild(normal_dna_fqs)  # Edge  1->2
    sample_prep.addChild(tumor_rna_fqs)  # Edge  1->2

    tumor_rna_fqs.addChild(cutadapt)  # Edge  1->2
    tumor_dna_fqs.addChild(bwa_tumor)  # Edge  1->3
    normal_dna_fqs.addChild(bwa_normal)  # Edge  1->4

    tumor_dna_fqs.addChild(phlat_tumor_dna)  # Edge  1->5
    normal_dna_fqs.addChild(phlat_normal_dna)  # Edge  1->6
    tumor_rna_fqs.addChild(phlat_tumor_rna)  # Edge  1->7
    # B. cutadapt will be followed by star
    cutadapt.addChild(star)  # Edge 2->9
    # Ci.  Gene expression and fusion detection follow star alignment
    star.addChild(rsem)  # Edge  9->10
    star.addChild(fusions)  # Edge  9->11
    # Cii.  Radia depends on all 3 alignments
    star.addChild(radia)  # Edge  9->12
    bwa_tumor.addChild(radia)  # Edge  3->12
    bwa_normal.addChild(radia)  # Edge  4->12
    # Ciii. mutect and indel calling depends on dna to have been aligned
    bwa_tumor.addChild(mutect)  # Edge  3->13
    bwa_normal.addChild(mutect)  # Edge  4->13
    bwa_tumor.addChild(muse)  # Edge  3->13
    bwa_normal.addChild(muse)  # Edge  4->13
    bwa_tumor.addChild(somaticsniper)  # Edge  3->13
    bwa_normal.addChild(somaticsniper)  # Edge  4->13
    bwa_tumor.addChild(strelka)  # Edge  3->13
    bwa_normal.addChild(strelka)  # Edge  4->13
    bwa_tumor.addChild(indels)  # Edge  3->14
    bwa_normal.addChild(indels)  # Edge  4->14
    # D. MHC haplotypes will be merged once all 3 samples have been PHLAT-ed
    phlat_tumor_dna.addChild(merge_phlat)  # Edge  5->15
    phlat_normal_dna.addChild(merge_phlat)  # Edge  6->15
    phlat_tumor_rna.addChild(merge_phlat)  # Edge  7->15
    # E. Delete the fastqs from the job store since all alignments are complete
    sample_prep.addChild(fastq_deletion_1)  # Edge 1->8
    cutadapt.addChild(fastq_deletion_1)  # Edge 2->8
    bwa_normal.addChild(fastq_deletion_1)  # Edge 4->8
    bwa_tumor.addChild(fastq_deletion_1)  # Edge 3->8
    phlat_normal_dna.addChild(fastq_deletion_1)  # Edge 6->8
    phlat_tumor_dna.addChild(fastq_deletion_1)  # Edge 5->8
    phlat_tumor_rna.addChild(fastq_deletion_1)  # Edge 7->8
    star.addChild(fastq_deletion_2)
    # F. Mutation calls need to be merged before they can be used
    # G. All mutations get aggregated when they have finished running
    fusions.addChild(merge_mutations)  # Edge 11->18
    radia.addChild(merge_mutations)  # Edge 16->18
    mutect.addChild(merge_mutations)  # Edge 17->18
    muse.addChild(merge_mutations)  # Edge 17->18
    somaticsniper.addChild(merge_mutations)  # Edge 17->18
    strelka.addChild(merge_mutations)  # Edge 17->18
    indels.addChild(merge_mutations)  # Edge 14->18
    # H. Aggregated mutations will be translated to protein space
    merge_mutations.addChild(snpeff)  # Edge 18->19
    # I. snpeffed mutations will be converted into peptides.
    # Transgene also accepts the RNA-seq bam and bai so that it can be rna-aware
    snpeff.addChild(transgene)  # Edge 19->20
    star.addChild(transgene)
    # J. Merged haplotypes and peptides will be converted into jobs and submitted for mhc:peptide
    # binding prediction
    merge_phlat.addChild(spawn_mhc)  # Edge 15->21
    transgene.addChild(spawn_mhc)  # Edge 20->21
    # K. The results from all the predictions will be merged. This is a follow-on job because
    # spawn_mhc will spawn an undetermined number of children.
    spawn_mhc.addFollowOn(merge_mhc)  # Edges 21->XX->22 and 21->YY->22
    # L. Finally, the merged mhc along with the gene expression will be used for rank boosting
    rsem.addChild(rank_boost)  # Edge  10->23
    merge_mhc.addChild(rank_boost)  # Edge 22->23
    # M. Assess the status of the MHC genes in the patient
    phlat_tumor_rna.addChild(mhc_pathway_assessment)  # Edge 7->24
    rsem.addChild(mhc_pathway_assessment)  # Edge 10->24
    return None
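
A sketch of why spawn_mhc is wired to merge_mhc with addFollowOn rather than
addChild: a follow-on runs only after the job and every successor it adds at
runtime have finished, which is what a merge step needs when the number of
children is unknown up front. spawner, worker and merge are hypothetical.

from toil.job import Job


def worker(job, i):
    return i * i


def spawner(job, n):
    # The child count is only known at runtime
    return [job.addChildJobFn(worker, i).rv() for i in range(n)]


def merge(job, results):
    # Runs after spawner *and* all of its dynamically added workers
    return sum(results)


def root(job):
    spawn = job.addChildJobFn(spawner, 5)
    spawn.addFollowOnJobFn(merge, spawn.rv())
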
Example #30
def run_gatk_preprocessing(job,
                           bam,
                           bai,
                           ref,
                           ref_dict,
                           fai,
                           g1k,
                           mills,
                           dbsnp,
                           unsafe=False):
    """
    GATK Preprocessing Pipeline
    0: Mark duplicates
    1: Create INDEL realignment intervals
    2: Realign INDELs
    3: Recalibrate base quality scores
    4: Apply base score recalibration

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :param str bai: FileStoreID for BAM index file
    :param str ref: FileStoreID for reference genome fasta file
    :param str ref_dict: FileStoreID for reference sequence dictionary file
    :param str fai: FileStoreID for reference fasta index file
    :param str g1k: FileStoreID for 1000 Genomes VCF file
    :param str mills: FileStoreID for Mills VCF file
    :param str dbsnp: FileStoreID for dbSNP VCF file
    :param bool unsafe: If True, runs GATK tools in UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: FileStoreIDs for BAM and BAI files
    :rtype: tuple(str, str)
    """
    # The MarkDuplicates disk requirement depends on the input BAM and BAI files and the output
    # BAM and BAI files. The output BAM file is approximately the same size as the input BAM file.
    mdups_disk = PromisedRequirement(
        lambda bam_, bai_: 2 * (bam_.size + bai_.size), bam, bai)
    mdups = job.wrapJobFn(picard_mark_duplicates,
                          bam,
                          bai,
                          cores=job.cores,
                          disk=mdups_disk,
                          memory=job.memory)

    # Get genome reference file sizes for calculating disk requirements
    genome_ref_size = ref.size + ref_dict.size + fai.size

    # Get INDEL resource file sizes and genome reference file sizes
    indel_ref_size = mills.size + g1k.size + genome_ref_size

    # The RealignerTargetCreator disk requirement depends on the input BAM/BAI files, the genome reference files, and
    # the output intervals file. The intervals file size is less than the reference file size, so estimate the interval
    # file size as the reference file size.
    realigner_target_disk = PromisedRequirement(
        lambda bam_, bai_, ref_size: bam_.size + bai_.size + 2 * ref_size,
        mdups.rv(0), mdups.rv(1), indel_ref_size)

    realigner_target = job.wrapJobFn(
        run_realigner_target_creator,
        mdups.rv(0),
        mdups.rv(1),
        ref,
        ref_dict,
        fai,
        g1k,
        mills,
        unsafe=unsafe,
        cores=1,  # RealignerTargetCreator is single threaded
        disk=realigner_target_disk,
        memory=job.memory)

    # The INDEL realignment disk requirement depends on the input BAM and BAI files, the intervals
    # file, the variant resource files, and the output BAM and BAI files. Here, we assume the
    # output BAM and BAI files are approximately the same size as the input BAM and BAI files.
    indel_realign_disk = PromisedRequirement(
        lambda bam_, bai_, intervals, ref_size:
            2 * (bam_.size + bai_.size) + intervals.size + ref_size,
        mdups.rv(0), mdups.rv(1), realigner_target.rv(), indel_ref_size)

    indel_realign = job.wrapJobFn(
        run_indel_realignment,
        realigner_target.rv(),
        mdups.rv(0),
        mdups.rv(1),
        ref,
        ref_dict,
        fai,
        g1k,
        mills,
        unsafe=unsafe,
        cores=1,  # IndelRealigner is single threaded
        disk=indel_realign_disk,
        memory=job.memory)

    # Get size of BQSR databases and genome reference files
    bqsr_ref_size = dbsnp.size + mills.size + genome_ref_size

    # The BQSR disk requirement depends on the input BAM and BAI files, the reference files, and the output
    # recalibration table file. The recalibration table file size is less than the reference file sizes, so use
    # the reference file sizes to estimate the recalibration table file size.
    base_recal_disk = PromisedRequirement(
        lambda bam_, bai_, ref_size: bam_.size + bai_.size + 2 * ref_size,
        indel_realign.rv(0), indel_realign.rv(1), bqsr_ref_size)

    base_recal = job.wrapJobFn(run_base_recalibration,
                               indel_realign.rv(0),
                               indel_realign.rv(1),
                               ref,
                               ref_dict,
                               fai,
                               dbsnp,
                               mills,
                               unsafe=unsafe,
                               cores=job.cores,
                               disk=base_recal_disk,
                               memory=job.memory)

    # The PrintReads disk requirement depends on the input BAM and BAI files, the recalibration table file, the
    # genome reference files, and the output BAM and BAI files. The output BAM and BAI files are approximately the
    # same size as the input BAM and BAI files.
    recalibrate_reads_disk = PromisedRequirement(
        lambda bam_, bai_, recal, ref_size:
            2 * (bam_.size + bai_.size) + recal.size + ref_size,
        indel_realign.rv(0), indel_realign.rv(1), base_recal.rv(),
        genome_ref_size)

    recalibrate_reads = job.wrapJobFn(apply_bqsr_recalibration,
                                      base_recal.rv(),
                                      indel_realign.rv(0),
                                      indel_realign.rv(1),
                                      ref,
                                      ref_dict,
                                      fai,
                                      unsafe=unsafe,
                                      cores=job.cores,
                                      disk=recalibrate_reads_disk,
                                      memory=job.memory)

    job.addChild(mdups)
    mdups.addChild(realigner_target)
    realigner_target.addChild(indel_realign)
    indel_realign.addChild(base_recal)
    base_recal.addChild(recalibrate_reads)
    return recalibrate_reads.rv(0), recalibrate_reads.rv(1)
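
A hedged usage sketch for the preprocessing pipeline above. The FileStoreID
variables are hypothetical placeholders; in a real workflow they would come
from an earlier import or download step and carry real file sizes.

from toil.job import Job

# Hypothetical FileStoreIDs from an earlier import/download step
bam_id, bai_id = 'fsID-bam', 'fsID-bai'
ref_id, ref_dict_id, fai_id = 'fsID-ref', 'fsID-dict', 'fsID-fai'
g1k_id, mills_id, dbsnp_id = 'fsID-g1k', 'fsID-mills', 'fsID-dbsnp'

root = Job.wrapJobFn(run_gatk_preprocessing,
                     bam_id, bai_id, ref_id, ref_dict_id, fai_id,
                     g1k_id, mills_id, dbsnp_id, unsafe=True,
                     cores=4, memory='10G', disk='20G')
options = Job.Runner.getDefaultOptions('./jobstore')
# startToil returns the root job's return value: the recalibrated BAM and BAI
processed_bam, processed_bai = Job.Runner.startToil(root, options)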