def run_rsem(job, bam_id, rsem_ref_url, paired=True):
    """
    RNA quantification with RSEM

    Downloads and unpacks the RSEM reference tarball into the job's temp
    directory, works out the reference folder and prefix from the unpacked
    files, runs RSEM on the transcriptome BAM via dockerCall, and stores the
    gene and isoform result files in the fileStore.

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param str bam_id: FileStoreID of transcriptome bam for quantification
    :param str rsem_ref_url: URL of RSEM reference (tarball)
    :param bool paired: If True, uses parameters for paired end data
    :return: FileStoreIDs for RSEM's gene and isoform output
    :rtype: tuple(str, str)
    :raises IndexError: If the unpacked reference contains no ".grp" file
    """
    work_dir = job.tempDir

    # Retrieve RSEM reference
    tarball_path = os.path.join(work_dir, 'rsem_ref.tar.gz')
    download_url(url=rsem_ref_url, name='rsem_ref.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', tarball_path, '-C', work_dir])
    os.remove(tarball_path)

    # Determine tarball structure - based on it, ascertain folder name and rsem reference prefix.
    # "grp" is a required RSEM extension that should exist in the RSEM reference.
    # Match on the file extension only: a substring test against the full path
    # would also match any file living under a directory whose name happens to
    # contain "grp" (including the randomly named temp directory).
    grp_files = [
        os.path.join(root, name)
        for root, _, names in os.walk(work_dir)
        for name in names
        if os.path.splitext(name)[1] == '.grp'
    ]
    if not grp_files:
        # IndexError is kept (the original failed with one when indexing an
        # empty list) so the exception type stays backward-compatible.
        raise IndexError(
            'No ".grp" file found in RSEM reference extracted from {}'.format(rsem_ref_url))
    ref_prefix = os.path.basename(os.path.splitext(grp_files[0])[0])

    # This must run before the BAM is copied into the work dir, otherwise the
    # "single top-level entry" test below would never be true.
    extracted = os.listdir(work_dir)
    if len(extracted) == 1:
        ref_folder = os.path.join('/data', extracted[0])
    else:
        ref_folder = '/data'

    # Read bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'transcriptome.bam'))

    # Call: RSEM
    # NOTE(review): tool arguments address the work dir as /data - presumably
    # that is where dockerCall mounts workDir inside the container.
    output_prefix = 'rsem'
    parameters = [
        '--quiet',
        '--no-qualities',
        '-p', str(job.cores),
        '--forward-prob', '0.5',
        '--seed-length', '25',
        '--fragment-length-mean', '-1.0',
        '--bam', '/data/transcriptome.bam',
        os.path.join(ref_folder, ref_prefix),
        output_prefix,
    ]
    if paired:
        parameters = ['--paired-end'] + parameters
    dockerCall(job, parameters=parameters, workDir=work_dir, tool=rsem_version)

    # Store output in fileStore and return
    gene_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, output_prefix + '.genes.results'))
    isoform_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, output_prefix + '.isoforms.results'))
    return gene_id, isoform_id
def run_kallisto(job, r1_id, r2_id, kallisto_index_url):
    """
    RNA quantification via Kallisto

    Fetches the Kallisto index and the input fastq(s) into the job's temp
    directory, runs ``kallisto quant`` through dockerCall, and bundles the
    Kallisto output files into a single tarball stored in the fileStore.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, otherwise pass None for single-end)
    :param str kallisto_index_url: URL of the Kallisto index file (handed to download_url)
    :return: FileStoreID of the tarball holding the Kallisto output
    :rtype: str
    """
    work_dir = job.tempDir

    # Stage the index and the first fastq in the work dir
    download_url(url=kallisto_index_url, name='kallisto_hg38.idx', work_dir=work_dir)
    job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))

    kallisto_args = [
        'quant',
        '-i', '/data/kallisto_hg38.idx',
        '-t', str(job.cores),
        '-o', '/data/',
        '-b', '100',
        '--fusion',
    ]

    # Paired-end when a second fastq is supplied, single-end otherwise
    if r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        kallisto_args += ['/data/R1.fastq', '/data/R2.fastq']
    else:
        kallisto_args += ['--single', '-l', '200', '-s', '15', '/data/R1.fastq']

    # Call: Kallisto
    dockerCall(job, workDir=work_dir, parameters=kallisto_args, tool=kallisto_version)

    # Bundle the expected outputs into one tarball and hand it to the fileStore
    output_files = []
    for output_name in ('run_info.json', 'abundance.tsv', 'abundance.h5', 'fusion.txt'):
        output_files.append(os.path.join(work_dir, output_name))
    tarball_files(tar_name='kallisto.tar.gz', file_paths=output_files, output_dir=work_dir)
    tarball_path = os.path.join(work_dir, 'kallisto.tar.gz')
    return job.fileStore.writeGlobalFile(tarball_path)
def download_and_process_bam(job, config):
    """
    Download a BAM and convert it to a pair of FASTQ files

    The BAM is fetched with download_bam_from_gdc when the URL scheme is
    ``gdc`` and with download_url otherwise. If ``config.cutadapt`` is set,
    a child job running run_cutadapt on the fastq pair is scheduled and its
    promised return value is returned instead of the raw fastqs.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Expando config: Dict-like object containing workflow options as attributes
    :return: FileStoreIDs of R1 / R2 fastq files (or the promise of the cutadapt child job)
    :rtype: tuple(str, str)
    """
    scheme = urlparse(config.url).scheme
    work_dir = job.tempDir

    # Download BAM
    if scheme == 'gdc':
        bam_path = download_bam_from_gdc(job, work_dir, url=config.url,
                                         token=config.gdc_token)
    else:
        bam_path = download_url(config.url, work_dir=work_dir, name='input.bam',
                                s3_key_path=config.ssec)

    # Convert to fastq pairs
    r1, r2 = convert_bam_to_fastq(job, bam_path)

    # Without adapter trimming the fastqs are returned as they are
    if not config.cutadapt:
        return r1, r2

    # Request twice the combined fastq size as disk for the trimming job
    cutadapt_job = job.addChildJobFn(run_cutadapt, r1, r2,
                                     config.fwd_3pr_adapter,
                                     config.rev_3pr_adapter,
                                     disk=2 * (r1.size + r2.size))
    return cutadapt_job.rv()
def run_star(job, r1_id, r2_id, star_index_url, wiggle=False, sort=False, save_aligned_bam=False):
    """
    Performs alignment of fastqs to bam via STAR

    --limitBAMsortRAM step added to deal with memory explosion when sorting certain samples.
    The value was chosen to complement the recommended amount of memory to have when running STAR (60G)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, else pass None)
    :param str star_index_url: STAR index tarball
    :param bool wiggle: If True, will output a wiggle file and return it
    :param bool sort: If True, will sort output by coordinate
    :param bool save_aligned_bam: If True, will output an aligned BAM and save it
    :return: FileStoreIDs of the transcriptome BAM, the tarball of STAR log / splice-junction
             files, the aligned BAM (None unless save_aligned_bam) and the wiggle file
             (None unless wiggle)
    :rtype: tuple(str, str, str|None, str|None)
    :raises AssertionError: If sort is True and the sorted BAM written by STAR is empty
    """
    work_dir = job.tempDir

    # Download and untar STAR index file
    tarball_path = os.path.join(work_dir, 'starIndex.tar.gz')
    download_url(url=star_index_url, name='starIndex.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', tarball_path, '-C', work_dir])
    os.remove(tarball_path)

    # A tarball that unpacks to a single top-level entry is addressed through
    # that entry; otherwise the index files sit directly in the work dir.
    # This must run before the fastqs are copied into the work dir.
    extracted = os.listdir(work_dir)
    if len(extracted) == 1:
        star_index = os.path.join('/data', extracted[0])
    else:
        star_index = '/data'

    # Define parameters
    parameters = [
        '--runThreadN', str(job.cores),
        '--genomeDir', star_index,
        '--outFileNamePrefix', 'rna',
        '--outSAMunmapped', 'Within',
        '--twopassMode', 'Basic',
        '--quantMode', 'TranscriptomeSAM',
        '--outFilterMultimapScoreRange', '1',
        '--outFilterMultimapNmax', '20',
        '--outFilterMismatchNmax', '10',
        '--alignIntronMax', '500000',
        '--alignMatesGapMax', '1000000',
        '--sjdbScore', '2',
        '--alignSJDBoverhangMin', '1',
        '--genomeLoad', 'NoSharedMemory',
        '--outFilterMatchNminOverLread', '0.33',
        '--outFilterScoreMinOverLread', '0.33',
        '--sjdbOverhang', '100',
        '--outSAMstrandField', 'intronMotif',
        '--outSAMattributes', 'NH', 'HI', 'NM', 'MD', 'AS', 'XS',
        '--outSAMheaderHD', '@HD', 'VN:1.4',
        '--alignEndsType', 'EndToEnd',
    ]

    # Modify parameters based on function arguments
    if sort:
        parameters.extend([
            '--outSAMtype', 'BAM', 'SortedByCoordinate',
            '--limitBAMsortRAM', '49268954168',
        ])
        aligned_bam = 'rnaAligned.sortedByCoord.out.bam'
    else:
        parameters.extend(['--outSAMtype', 'BAM', 'Unsorted'])
        aligned_bam = 'rnaAligned.out.bam'
    if wiggle:
        parameters.extend([
            '--outWigType', 'bedGraph',
            '--outWigStrand', 'Unstranded',
            '--outWigReferencesPrefix', 'chr',
        ])

    # Read in fastq(s) and modify parameters based on whether the data is paired
    job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
    if r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq', '/data/R2.fastq'])
    else:
        parameters.extend(['--readFilesIn', '/data/R1.fastq'])

    # Call: STAR
    dockerCall(job=job, tool=star_version, workDir=work_dir, parameters=parameters)

    # Check output bam isnt size zero if sorted.
    # Raised explicitly rather than with an `assert` statement so the check is
    # not stripped when Python runs with -O; the exception type is unchanged.
    aligned_bam_path = os.path.join(work_dir, aligned_bam)
    if sort and not os.stat(aligned_bam_path).st_size > 0:
        raise AssertionError(
            'Aligned bam failed to sort. Ensure sufficient memory is free.')

    # Write files to fileStore
    transcriptome_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rnaAligned.toTranscriptome.out.bam'))
    aligned_id = job.fileStore.writeGlobalFile(aligned_bam_path) if save_aligned_bam else None
    wiggle_path = os.path.join(work_dir, 'rnaSignal.UniqueMultiple.str1.out.bg')
    wiggle_id = job.fileStore.writeGlobalFile(wiggle_path) if wiggle else None

    # Tar output files, store in fileStore, and return FileStoreIDs
    output_files = [
        os.path.join(work_dir, x) for x in ['rnaLog.final.out', 'rnaSJ.out.tab']
    ]
    tarball_files('star.tar.gz', file_paths=output_files, output_dir=work_dir)
    star_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'star.tar.gz'))
    return transcriptome_id, star_id, aligned_id, wiggle_id
def run_hera(job, r1_id, r2_id, hera_index_url):
    """
    RNA-seq quantification using Hera

    Downloads and unpacks the Hera index tarball into the job's temp
    directory, stages the fastq(s), runs ``hera quant`` through dockerCall
    and stores a tarball of the Hera output files in the fileStore.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, otherwise pass None for single-end)
    :param str hera_index_url: URL to hera index file
    :return: FileStoreID of Hera outputs
    :rtype: str
    """
    work_dir = job.tempDir

    # Download and process hera index
    index_tarball = os.path.join(work_dir, 'hera-index.tar.gz')
    download_url(url=hera_index_url, name='hera-index.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', index_tarball, '-C', work_dir])
    os.remove(index_tarball)

    # A lone top-level entry is taken as the index folder; otherwise the index
    # is addressed at /data itself. Evaluated before any fastq is staged.
    if len(os.listdir(work_dir)) == 1:
        hera_index = os.path.join('/data', os.listdir(work_dir)[0])
    else:
        hera_index = '/data'

    # Option part of the command line
    option_args = [
        'quant',
        '-i', hera_index,
        '-t', str(job.cores),
        '-b', '100',  # Bootstraps
        '-w', '1',  # Output BAM (1 = no output)
    ]

    # Read in fastq(s); the second mate is only added for paired data
    fastq_args = ['/data/R1.fastq']
    job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        fastq_args.append('/data/R2.fastq')

    # Call: Hera
    dockerCall(job, parameters=option_args + fastq_args, workDir=work_dir, tool=hera_version)

    # Tar output files, store in fileStore, and return FileStoreID
    output_files = []
    for output_name in ('abundance.gene.tsv', 'abundance.h5', 'abundance.tsv',
                        'fusion.bedpe', 'summary'):
        output_files.append(os.path.join(work_dir, output_name))
    tarball_files(tar_name='hera.tar.gz', file_paths=output_files, output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'hera.tar.gz'))