Example #1
0
def run_rsem(job, bam_id, rsem_ref_url, paired=True):
    """
    RNA quantification with RSEM

    Downloads and unpacks the RSEM reference tarball into the job's temp directory, derives the
    reference prefix from the reference's ".grp" file, runs RSEM through dockerCall on the
    transcriptome bam and writes the gene / isoform results to the fileStore.

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param str bam_id: FileStoreID of transcriptome bam for quantification
    :param str rsem_ref_url: URL of RSEM reference (tarball)
    :param bool paired: If True, uses parameters for paired end data
    :return: FileStoreIDs for RSEM's gene and isoform output
    :rtype: tuple(str, str)
    :raises IndexError: If the unpacked reference contains no ".grp" file
    """
    work_dir = job.tempDir

    # Retrieve RSEM reference, unpack it in place and drop the tarball
    tarball = os.path.join(work_dir, 'rsem_ref.tar.gz')
    download_url(url=rsem_ref_url, name='rsem_ref.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', tarball, '-C', work_dir])
    os.remove(tarball)

    # Determine tarball structure - based on it, ascertain folder name and rsem reference prefix.
    # "grp" is a required RSEM extension that should exist in the RSEM reference. Only the file
    # name's extension is tested: matching "grp" anywhere in the full path would select arbitrary
    # files whenever the temp directory (or a folder in the tarball) contains that substring.
    grp_names = []
    for _, _, files in os.walk(work_dir):
        grp_names.extend(name for name in files if name.endswith('.grp'))
    if not grp_names:
        # IndexError is kept so the failure type matches the previous bare "[0]" lookup
        raise IndexError(
            'No ".grp" file found in RSEM reference: {}'.format(rsem_ref_url))
    ref_prefix = os.path.splitext(grp_names[0])[0]

    # A single top-level entry means the reference lives in that folder; otherwise it was
    # unpacked directly into the working directory.
    # NOTE(review): "/data" paths presumably refer to workDir as seen inside the container -
    # confirm against dockerCall.
    top_level = os.listdir(work_dir)
    if len(top_level) == 1:
        ref_folder = os.path.join('/data', top_level[0])
    else:
        ref_folder = '/data'

    # Read bam from fileStore
    job.fileStore.readGlobalFile(
        bam_id, os.path.join(work_dir, 'transcriptome.bam'))

    # Call: RSEM
    output_prefix = 'rsem'
    parameters = [
        '--quiet', '--no-qualities', '-p',
        str(job.cores), '--forward-prob', '0.5', '--seed-length', '25',
        '--fragment-length-mean', '-1.0', '--bam', '/data/transcriptome.bam',
        os.path.join(ref_folder, ref_prefix), output_prefix
    ]
    if paired:
        parameters = ['--paired-end'] + parameters
    dockerCall(job,
               parameters=parameters,
               workDir=work_dir,
               tool=rsem_version)

    # Store output in fileStore and return
    gene_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, output_prefix + '.genes.results'))
    isoform_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, output_prefix + '.isoforms.results'))
    return gene_id, isoform_id
Example #2
0
def run_kallisto(job, r1_id, r2_id, kallisto_index_url):
    """
    Quantify RNA-seq reads with Kallisto

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2); pass None for single-end data
    :param str kallisto_index_url: URL of the Kallisto index file
    :return: FileStoreID of the tarball holding the Kallisto outputs
    :rtype: str
    """
    work_dir = job.tempDir

    # Stage the index and the first fastq in the working directory
    download_url(url=kallisto_index_url,
                 name='kallisto_hg38.idx',
                 work_dir=work_dir)
    job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))

    # Arguments shared by paired-end and single-end runs
    parameters = ['quant']
    parameters += ['-i', '/data/kallisto_hg38.idx']
    parameters += ['-t', str(job.cores)]
    parameters += ['-o', '/data/']
    parameters += ['-b', '100', '--fusion']

    # Paired-end input needs the second fastq; single-end input needs explicit
    # fragment length / standard deviation arguments instead
    if r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters += ['/data/R1.fastq', '/data/R2.fastq']
    else:
        parameters += ['--single', '-l', '200', '-s', '15', '/data/R1.fastq']

    # Call: Kallisto
    dockerCall(job,
               workDir=work_dir,
               parameters=parameters,
               tool=kallisto_version)

    # Bundle the outputs into one tarball, store it in the fileStore, and return its ID
    output_files = []
    for name in ('run_info.json', 'abundance.tsv', 'abundance.h5', 'fusion.txt'):
        output_files.append(os.path.join(work_dir, name))
    tarball_files(tar_name='kallisto.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)
    tarball_path = os.path.join(work_dir, 'kallisto.tar.gz')
    return job.fileStore.writeGlobalFile(tarball_path)
def download_and_process_bam(job, config):
    """
    Fetch a BAM and turn it into a pair of FASTQs, optionally scheduling adapter trimming

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Expando config: Dict-like object containing workflow options as attributes
    :return: R1 / R2 results of convert_bam_to_fastq, or, when config.cutadapt is set,
             the promised return value of a run_cutadapt child job
    :rtype: tuple(str, str)
    """
    # The URL scheme selects the download mechanism: "gdc" goes through the GDC helper
    # (with the configured token); anything else goes through download_url
    if urlparse(config.url).scheme == 'gdc':
        bam_path = download_bam_from_gdc(job,
                                         job.tempDir,
                                         url=config.url,
                                         token=config.gdc_token)
    else:
        bam_path = download_url(config.url,
                                work_dir=job.tempDir,
                                name='input.bam',
                                s3_key_path=config.ssec)

    # Convert to fastq pairs
    r1, r2 = convert_bam_to_fastq(job, bam_path)

    # Without adapter trimming the fastqs are returned as they are
    if not config.cutadapt:
        return r1, r2

    # Request twice the combined input size as disk for the trimming job
    combined_size = r1.size + r2.size
    trimming_job = job.addChildJobFn(run_cutadapt,
                                     r1,
                                     r2,
                                     config.fwd_3pr_adapter,
                                     config.rev_3pr_adapter,
                                     disk=2 * combined_size)
    return trimming_job.rv()
Example #4
0
def run_star(job,
             r1_id,
             r2_id,
             star_index_url,
             wiggle=False,
             sort=False,
             save_aligned_bam=False):
    """
    Performs alignment of fastqs to bam via STAR

    --limitBAMsortRAM step added to deal with memory explosion when sorting certain samples.
    The value was chosen to complement the recommended amount of memory to have when running STAR (60G)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, else pass None)
    :param str star_index_url: STAR index tarball
    :param bool wiggle: If True, will output a wiggle file and return it
    :param bool sort: If True, will sort output by coordinate
    :param bool save_aligned_bam: If True, will output an aligned BAM and save it
    :return: FileStoreIDs of the transcriptome bam, the tarball of STAR log / splice junction
             files, the aligned bam (None unless save_aligned_bam) and the wiggle file
             (None unless wiggle)
    :rtype: tuple(str, str, str|None, str|None)
    :raises AssertionError: If sort is True and the sorted bam produced by STAR is empty
    """
    work_dir = job.tempDir

    # Download and untar STAR index file
    tarball = os.path.join(work_dir, 'starIndex.tar.gz')
    download_url(url=star_index_url, name='starIndex.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', tarball, '-C', work_dir])
    os.remove(tarball)

    # A single top-level entry means the index lives in that folder; otherwise it was
    # unpacked directly into the working directory. This must be evaluated before any
    # fastq is copied into the working directory.
    # NOTE(review): "/data" paths presumably refer to workDir as seen inside the container -
    # confirm against dockerCall.
    top_level = os.listdir(work_dir)
    if len(top_level) == 1:
        star_index = os.path.join('/data', top_level[0])
    else:
        star_index = '/data'

    # Define parameters
    parameters = [
        '--runThreadN',
        str(job.cores), '--genomeDir', star_index, '--outFileNamePrefix',
        'rna', '--outSAMunmapped', 'Within', '--twopassMode', 'Basic',
        '--quantMode', 'TranscriptomeSAM', '--outFilterMultimapScoreRange',
        '1', '--outFilterMultimapNmax', '20', '--outFilterMismatchNmax', '10',
        '--alignIntronMax', '500000', '--alignMatesGapMax', '1000000',
        '--sjdbScore', '2', '--alignSJDBoverhangMin', '1', '--genomeLoad',
        'NoSharedMemory', '--outFilterMatchNminOverLread', '0.33',
        '--outFilterScoreMinOverLread', '0.33', '--sjdbOverhang', '100',
        '--outSAMstrandField', 'intronMotif', '--outSAMattributes', 'NH', 'HI',
        'NM', 'MD', 'AS', 'XS', '--outSAMheaderHD', '@HD', 'VN:1.4',
        '--alignEndsType', 'EndToEnd'
    ]

    # Modify parameters based on function arguments
    if sort:
        parameters.extend([
            '--outSAMtype', 'BAM', 'SortedByCoordinate', '--limitBAMsortRAM',
            '49268954168'
        ])
        aligned_bam = 'rnaAligned.sortedByCoord.out.bam'
    else:
        parameters.extend(['--outSAMtype', 'BAM', 'Unsorted'])
        aligned_bam = 'rnaAligned.out.bam'
    if wiggle:
        parameters.extend([
            '--outWigType', 'bedGraph', '--outWigStrand', 'Unstranded',
            '--outWigReferencesPrefix', 'chr'
        ])

    # Read in fastq(s) and modify parameters based on whether the data is paired
    job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
    if r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(
            ['--readFilesIn', '/data/R1.fastq', '/data/R2.fastq'])
    else:
        parameters.extend(['--readFilesIn', '/data/R1.fastq'])

    # Call: STAR
    dockerCall(job=job,
               tool=star_version,
               workDir=work_dir,
               parameters=parameters)

    # Check output bam isn't size zero if sorted. An explicit raise is used instead of an
    # assert statement so the check still runs under "python -O"; AssertionError is kept so
    # the failure type is unchanged.
    aligned_bam_path = os.path.join(work_dir, aligned_bam)
    if sort and os.stat(aligned_bam_path).st_size == 0:
        raise AssertionError(
            'Aligned bam failed to sort. Ensure sufficient memory is free.')

    # Write files to fileStore
    transcriptome_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rnaAligned.toTranscriptome.out.bam'))
    aligned_id = job.fileStore.writeGlobalFile(
        aligned_bam_path) if save_aligned_bam else None
    wiggle_path = os.path.join(work_dir,
                               'rnaSignal.UniqueMultiple.str1.out.bg')
    wiggle_id = job.fileStore.writeGlobalFile(wiggle_path) if wiggle else None

    # Tar output files, store in fileStore, and return FileStoreIDs
    output_files = [
        os.path.join(work_dir, x)
        for x in ['rnaLog.final.out', 'rnaSJ.out.tab']
    ]
    tarball_files('star.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)
    star_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'star.tar.gz'))

    return transcriptome_id, star_id, aligned_id, wiggle_id
Example #5
0
def run_hera(job, r1_id, r2_id, hera_index_url):
    """
    Quantify RNA-seq reads with Hera

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2); pass None for single-end data
    :param str hera_index_url: URL to hera index file (tarball)
    :return: FileStoreID of the tarball holding the Hera outputs
    :rtype: str
    """
    work_dir = job.tempDir

    # Fetch the index tarball, unpack it in place, then discard the tarball
    index_tarball = os.path.join(work_dir, 'hera-index.tar.gz')
    download_url(url=hera_index_url,
                 name='hera-index.tar.gz',
                 work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', index_tarball, '-C', work_dir])
    os.remove(index_tarball)

    # A lone top-level entry is taken to be the index folder; otherwise the index
    # is assumed to sit directly in the working directory. Evaluated before the
    # fastqs are copied in, so only the unpacked index is counted.
    unpacked = os.listdir(work_dir)
    if len(unpacked) == 1:
        hera_index = os.path.join('/data', unpacked[0])
    else:
        hera_index = '/data'

    # Define parameters
    parameters = ['quant']
    parameters += ['-i', hera_index]
    parameters += ['-t', str(job.cores)]
    parameters += ['-b', '100']  # Bootstraps
    parameters += ['-w', '1']  # Output BAM (1 = no output)
    parameters += ['/data/R1.fastq']

    # Read in fastq(s); the second one is only staged and passed for paired data
    job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters += ['/data/R2.fastq']

    # Call: Hera
    dockerCall(job,
               parameters=parameters,
               workDir=work_dir,
               tool=hera_version)

    # Bundle the outputs into one tarball, store it in the fileStore, and return its ID
    output_files = []
    for name in ('abundance.gene.tsv', 'abundance.h5', 'abundance.tsv',
                 'fusion.bedpe', 'summary'):
        output_files.append(os.path.join(work_dir, name))
    tarball_files(tar_name='hera.tar.gz',
                  file_paths=output_files,
                  output_dir=work_dir)
    tarball_path = os.path.join(work_dir, 'hera.tar.gz')
    return job.fileStore.writeGlobalFile(tarball_path)