def _get_test_fastq_files(tmpdir, tarball=True):
    # path stuff
    tmpdir_str = str(tmpdir)
    tmpdir_repo = os.path.join(tmpdir_str, "single_cell")
    # get from pachterlab
    subprocess.check_call(
        "git clone --no-checkout https://github.com/pachterlab/scRNA-Seq-TCC-prep.git single_cell".split(),
        cwd=tmpdir_str)
    subprocess.check_call("git config core.sparseCheckout true".split(), cwd=tmpdir_repo)
    with open(os.path.join(tmpdir_repo, ".git", "info", "sparse-checkout"), 'w') as sparse:
        sparse.write("example_dataset/fastq_files/*ATCGCTCC*")
    # subprocess.check_call('echo "example_dataset/fastq_files/*ATCGCTCC*" > .git/info/sparse-checkout'.split(), cwd=tmpdir_repo)
    subprocess.check_call("git checkout 0469873bdadcc48e34782882dbd24c3939c0542a".split(),
                          cwd=tmpdir_repo)
    # return location if not tarballed
    fastqs_location = os.path.join(tmpdir_str, "single_cell", "example_dataset", "fastq_files")
    if not tarball:
        return fastqs_location
    # else, tarball and return that location
    tarball_files(output_dir=tmpdir_str, tar_name='test_fastq.tar.gz',
                  file_paths=[os.path.join(fastqs_location, x) for x in os.listdir(fastqs_location)])
    return os.path.join(tmpdir_str, 'test_fastq.tar.gz')
def run_fastqc(job, r1_id, r2_id):
    """
    Run FastQC on the input reads

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2
    :return: FileStoreID of FastQC output (tarball)
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
    parameters = ['/data/R1.fastq']
    output_names = ['R1_fastqc.html', 'R1_fastqc.zip']
    if r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['-t', '2', '/data/R2.fastq'])
        output_names.extend(['R2_fastqc.html', 'R2_fastqc.zip'])
    docker_call(tool='quay.io/ucsc_cgl/fastqc:0.11.5--be13567d00cd4c586edf8ae47d991815c8c72a49',
                work_dir=work_dir, parameters=parameters)
    output_files = [os.path.join(work_dir, x) for x in output_names]
    tarball_files(tar_name='fastqc.tar.gz', file_paths=output_files, output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'fastqc.tar.gz'))
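# Hedged wiring sketch (not from the original pipeline; the fastq URLs, job-store path, and
# resource values below are illustrative assumptions): run_fastqc consumes FileStoreIDs, so a
# small driver can import the reads into the job store, run the job, and export the tarball.
def _fastqc_workflow_sketch(r1_url, r2_url, output_url):
    from toil.common import Toil
    from toil.job import Job
    options = Job.Runner.getDefaultOptions('./fastqc-jobstore')  # assumed job-store location
    with Toil(options) as workflow:
        r1_id = workflow.importFile(r1_url)
        r2_id = workflow.importFile(r2_url) if r2_url else None  # None selects single-end mode
        tar_id = workflow.start(Job.wrapJobFn(run_fastqc, r1_id, r2_id, cores=2, disk='2G'))
        workflow.exportFile(tar_id, output_url)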
def variant_calling_and_qc(job, inputs, bam_id, bai_id):
    """
    Perform variant calling with samtools and QC with CheckBias

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of qc tarball
    :rtype: str
    """
    job.fileStore.logToMaster('Variant calling and QC: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    input_info = [(inputs.genome, 'genome.fa'), (inputs.positions, 'positions.tsv'),
                  (inputs.genome_index, 'genome.fa.fai'), (inputs.gtf, 'annotation.gtf'),
                  (inputs.gtf_m53, 'annotation.m53')]
    for url, fname in input_info:
        download_url(job=job, url=url, work_dir=work_dir, name=fname)
    # Part 1: Variant Calling
    variant_command = ['mpileup',
                       '-f', 'genome.fa',
                       '-l', 'positions.tsv',
                       '-v', 'alignment.bam',
                       '-t', 'DP,SP,INFO/AD,INFO/ADF,INFO/ADR,INFO/DPR,SP',
                       '-o', '/data/output.vcf.gz']
    docker_call(job=job, work_dir=work_dir, parameters=variant_command,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    # Part 2: QC
    qc_command = ['-o', 'qc',
                  '-n', 'alignment.bam',
                  '-a', 'annotation.gtf',
                  '-m', 'annotation.m53']
    docker_call(job=job, work_dir=work_dir, parameters=qc_command,
                tool='jvivian/checkbias:612f129--b08a1fb6526a620bbb0304b08356f2ae7c3c0ec3')
    # Write output to fileStore and return ids
    output_tsv = glob(os.path.join(work_dir, '*counts.tsv*'))[0]
    output_vcf = os.path.join(work_dir, 'output.vcf.gz')
    tarball_files('vcqc.tar.gz', file_paths=[output_tsv, output_vcf], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'vcqc.tar.gz'))
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    download_url(job=job, url=inputs.gtf, work_dir=work_dir, name='annotation.gtf')
    download_url(job=job, url=inputs.gtf_pickle, work_dir=work_dir, name='annotation.gtf.pickle')
    # Call: SplAdder
    command = ['--insert_ir=y', '--insert_es=y', '--insert_ni=y',
               '--remove_se=n', '--validate_sg=n',
               '-b', 'alignment.bam',
               '-o', '/data',
               '-a', 'annotation.gtf',
               '-v', 'y',
               '-c', '3',
               '-M', 'single',
               '-T', 'n',
               '-n', '50',
               '-P', 'y',
               '-p', 'n',
               '--sparse_bam', 'y']
    docker_call(job=job, work_dir=work_dir, parameters=command, sudo=inputs.sudo,
                tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, 'spladder', 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        # Fall back to searching the work directory for the genes graph pickle
        matches = []
        for root, dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    print os.listdir(work_dir)
    tarball_files('spladder.tar.gz', file_paths=[output_pickle, output_filt, output],
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'spladder.tar.gz'))
def test_tarball_files(tmpdir):
    from toil_lib.files import tarball_files
    work_dir = str(tmpdir)
    fpath = os.path.join(work_dir, 'output_file')
    with open(fpath, 'wb') as fout:
        fout.write(os.urandom(1024))
    tarball_files(output_dir=work_dir, tar_name='test.tar', file_paths=[fpath])
    assert os.path.exists(os.path.join(work_dir, 'test.tar'))
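# Hedged companion check (an assumption, not an existing test): the archive produced by
# tarball_files can be opened with the standard tarfile module to confirm the expected member
# is present by basename, which is a slightly stronger assertion than os.path.exists alone.
def _assert_tar_contains(tar_path, expected_basename):
    import tarfile
    with tarfile.open(tar_path) as tar:
        names = [os.path.basename(member.name) for member in tar.getmembers()]
        assert expected_basename in names, "%s not found in %s" % (expected_basename, names)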
def run_rsem_postprocess(job, uuid, rsem_gene_id, rsem_isoform_id):
    """
    Parses RSEM's output to produce the separate .tab files (TPM, FPKM, counts) for both gene
    and isoform. These are two-column files: Genes and Quantifications.
    HUGO files are also provided that have been mapped from Gencode/Ensembl names.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str uuid: UUID to mark the samples with
    :param str rsem_gene_id: FileStoreID of rsem_gene_ids
    :param str rsem_isoform_id: FileStoreID of rsem_isoform_ids
    :return: FileStoreIDs of the RSEM and HUGO post-process tarballs
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    # I/O
    job.fileStore.readGlobalFile(rsem_gene_id, os.path.join(work_dir, 'rsem_gene.tab'), mutable=True)
    job.fileStore.readGlobalFile(rsem_isoform_id, os.path.join(work_dir, 'rsem_isoform.tab'), mutable=True)
    # Convert RSEM files into individual .tab files.
    docker_call(tool='jvivian/rsem_postprocess', parameters=[uuid], work_dir=work_dir)
    os.rename(os.path.join(work_dir, 'rsem_gene.tab'), os.path.join(work_dir, 'rsem_genes.results'))
    os.rename(os.path.join(work_dir, 'rsem_isoform.tab'), os.path.join(work_dir, 'rsem_isoforms.results'))
    output_files = ['rsem.genes.norm_counts.tab', 'rsem.genes.raw_counts.tab',
                    'rsem.isoform.norm_counts.tab', 'rsem.isoform.raw_counts.tab',
                    'rsem_genes.results', 'rsem_isoforms.results']
    # Perform HUGO gene / isoform name mapping
    genes = [x for x in output_files if 'rsem.genes' in x]
    isoforms = [x for x in output_files if 'rsem.isoform' in x]
    command = ['-g'] + genes + ['-i'] + isoforms
    docker_call(tool='jvivian/gencode_hugo_mapping', parameters=command, work_dir=work_dir)
    hugo_files = [os.path.splitext(x)[0] + '.hugo' + os.path.splitext(x)[1] for x in genes + isoforms]
    # Create tarballs for outputs
    tarball_files('rsem.tar.gz',
                  file_paths=[os.path.join(work_dir, x) for x in output_files],
                  output_dir=work_dir)
    tarball_files('rsem_hugo.tar.gz',
                  [os.path.join(work_dir, x) for x in hugo_files],
                  output_dir=work_dir)
    rsem_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem.tar.gz'))
    hugo_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_hugo.tar.gz'))
    return rsem_id, hugo_id
def rsem_quantification(job, config, star_output): """ Unpack STAR results and run RSEM (and saving BAM from STAR) :param JobFunctionWrappingJob job: passed automatically by Toil :param Namespace config: Argparse Namespace object containing argument inputs :param tuple(FileID, FileID, FileID, FileID)|tuple(FileID, FileID, FileID) star_output: FileStoreIDs from STAR :return: FileStoreID results from RSEM postprocess and STAR log :rtype: tuple(FileID, FileID, FileID) """ work_dir = job.fileStore.getLocalTempDir() cores = min(16, config.cores) if config.wiggle: transcriptome_id, sorted_id, wiggle_id, log_id = flatten(star_output) wiggle_path = os.path.join(work_dir, config.uuid + '.wiggle.bg') job.fileStore.readGlobalFile(wiggle_id, wiggle_path) if urlparse(config.output_dir).scheme == 's3': s3am_upload(fpath=wiggle_path, s3_dir=config.output_dir, s3_key_path=config.ssec) else: copy_files(file_paths=[wiggle_path], output_dir=config.output_dir) else: transcriptome_id, sorted_id, log_id = star_output # Save sorted bam if flag is selected if config.save_bam and not config.bamqc: # if config.bamqc is selected, bam is being saved in run_bam_qc bam_path = os.path.join(work_dir, config.uuid + '.sorted.bam') job.fileStore.readGlobalFile(sorted_id, bam_path) if urlparse(config.output_dir).scheme == 's3' and config.ssec: s3am_upload(fpath=bam_path, s3_dir=config.output_dir, s3_key_path=config.ssec) elif urlparse(config.output_dir).scheme != 's3': copy_files(file_paths=[bam_path], output_dir=config.output_dir) # Declare RSEM and RSEM post-process jobs disk = 5 * transcriptome_id.size rsem_output = job.wrapJobFn(run_rsem, transcriptome_id, config.rsem_ref, paired=config.paired, cores=cores, disk=disk) rsem_postprocess = job.wrapJobFn(run_rsem_postprocess, rsem_output.rv(0), rsem_output.rv(1)) job.addChild(rsem_output) rsem_output.addChild(rsem_postprocess) # Save STAR log log_path = os.path.join(work_dir, 'Log.final.out') job.fileStore.readGlobalFile(log_id, log_path) tarball_files(tar_name='star.tar.gz', file_paths=[log_path], output_dir=work_dir) star_id = job.fileStore.writeGlobalFile( os.path.join(work_dir, 'star.tar.gz')) return rsem_postprocess.rv(), star_id
def run_bam_qc(job, aligned_bam_id, config): """ Run BAM QC as specified by California Kids Cancer Comparison (CKCC) :param JobFunctionWrappingJob job: :param str aligned_bam_id: FileStoreID of sorted bam from STAR :param Namespace config: Argparse Namespace object containing argument inputs Must contain: config.uuid str: UUID of input sample config.save_bam bool: True/False depending on whether to save bam config.output_dir str: Path to save bam config.ssec str: Path to encryption key for secure upload to S3 :return: boolean flag, FileStoreID for output bam, and FileStoreID for output tar :rtype: tuple(bool, str, str) """ work_dir = job.fileStore.getLocalTempDir() job.fileStore.readGlobalFile( aligned_bam_id, os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam')) docker_call(tool='hbeale/treehouse_bam_qc:1.0', work_dir=work_dir, parameters=['runQC.sh', str(job.cores)]) # Tar Output files output_names = [ 'readDist.txt', 'rnaAligned.out.md.sorted.geneBodyCoverage.curves.pdf', 'rnaAligned.out.md.sorted.geneBodyCoverage.txt' ] if os.path.exists(os.path.join(work_dir, 'readDist.txt_PASS_qc.txt')): output_names.append('readDist.txt_PASS_qc.txt') fail_flag = False else: output_names.append('readDist.txt_FAIL_qc.txt') fail_flag = True output_files = [os.path.join(work_dir, x) for x in output_names] tarball_files(tar_name='bam_qc.tar.gz', file_paths=output_files, output_dir=work_dir) # Save output BAM if config.save_bam: bam_path = os.path.join(work_dir, 'rnaAligned.sortedByCoord.md.bam') new_bam_path = os.path.join(work_dir, config.uuid + '.sortedByCoord.md.bam') os.rename(bam_path, new_bam_path) if urlparse(config.output_dir).scheme == 's3' and config.ssec: s3am_upload(fpath=new_bam_path, s3_dir=config.output_dir, s3_key_path=config.ssec) elif urlparse(config.output_dir).scheme != 's3': copy_files(file_paths=[new_bam_path], output_dir=config.output_dir) return fail_flag, job.fileStore.writeGlobalFile( os.path.join(work_dir, 'bam_qc.tar.gz'))
def spladder(job, inputs, bam_id, bai_id): """ Run SplAdder to detect and quantify alternative splicing events :param JobFunctionWrappingJob job: passed by Toil automatically :param Namespace inputs: Stores input arguments (see main) :param str bam_id: FileStore ID of bam :param str bai_id: FileStore ID of bam index file :return: FileStore ID of SplAdder tarball :rtype: str """ job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid)) work_dir = job.fileStore.getLocalTempDir() # Pull in alignment.bam from fileStore job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam')) job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai')) # Download input file download_url(url=inputs.gtf, work_dir=work_dir, name='annotation.gtf') download_url(url=inputs.gtf_pickle, work_dir=work_dir, name='annotation.gtf.pickle') # Call Spladder command = ['--insert_ir=y', '--insert_es=y', '--insert_ni=y', '--remove_se=n', '--validate_sg=n', '-b', 'alignment.bam', '-o ', '/data', '-a', 'annotation.gtf', '-v', 'y', '-c', '3', '-M', 'single', '-T', 'n', '-n', '50', '-P', 'y', '-p', 'n', '--sparse_bam', 'y'] docker_call(work_dir=work_dir, parameters=command, sudo=inputs.sudo, tool='jvivian/spladder:1.0') # Write output to fileStore and return ids output_pickle = os.path.join(work_dir, ' ', 'spladder', 'genes_graph_conf3.alignment.pickle') if not os.path.exists(output_pickle): matches = [] for root, dirnames, filenames in os.walk(work_dir): for filename in fnmatch.filter(filenames, '*genes_graph*'): matches.append(os.path.join(root, filename)) if matches: output_pickle = matches[0] else: raise RuntimeError("Couldn't find genes file!") output_filt = os.path.join(work_dir, 'alignment.filt.hdf5') output = os.path.join(work_dir, 'alignment.hdf5') print os.listdir(work_dir) tarball_files('spladder.tar.gz', file_paths=[output_pickle, output_filt, output], output_dir=work_dir) return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'spladder.tar.gz'))
def run_kallisto(job, r1_id, r2_id, kallisto_index_url):
    """
    RNA quantification via Kallisto

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, otherwise pass None for single-end)
    :param str kallisto_index_url: URL of the Kallisto index file
    :return: FileStoreID from Kallisto output
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=kallisto_index_url, name='kallisto_hg38.idx', work_dir=work_dir)
    # Retrieve files
    parameters = ['quant',
                  '-i', '/data/kallisto_hg38.idx',
                  '-t', str(job.cores),
                  '-o', '/data/',
                  '-b', '100']
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2_cutadapt.fastq'))
        parameters.extend(['/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        parameters.extend(['--single', '-l', '200', '-s', '15', '/data/R1_cutadapt.fastq'])
    # Call: Kallisto
    docker_call(job=job,
                tool='quay.io/ucsc_cgl/kallisto:0.42.4--35ac87df5b21a8e8e8d159f26864ac1e1db8cf86',
                work_dir=work_dir, parameters=parameters)
    # Tar output files together and store in fileStore
    output_files = [os.path.join(work_dir, x) for x in
                    ['run_info.json', 'abundance.tsv', 'abundance.h5']]
    tarball_files(tar_name='kallisto.tar.gz', file_paths=output_files, output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'kallisto.tar.gz'))
def run_pindel(job, normal_bam, normal_bai, tumor_bam, tumor_bai, ref, fai): """ Calls Pindel to compute indels / deletions :param JobFunctionWrappingJob job: Passed automatically by Toil :param str normal_bam: Normal BAM FileStoreID :param str normal_bai: Normal BAM index FileStoreID :param str tumor_bam: Tumor BAM FileStoreID :param str tumor_bai: Tumor BAM Index FileStoreID :param str ref: Reference genome FileStoreID :param str fai: Reference index FileStoreID :return: Pindel output (tarball) FileStoreID :rtype: str """ work_dir = job.fileStore.getLocalTempDir() file_ids = [normal_bam, normal_bai, tumor_bam, tumor_bai, ref, fai] file_names = [ 'normal.bam', 'normal.bai', 'tumor.bam', 'tumor.bai', 'ref.fasta', 'ref.fasta.fai' ] for file_store_id, name in zip(file_ids, file_names): job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name)) # Create Pindel config with open(os.path.join(work_dir, 'pindel-config.txt'), 'w') as f: for bam in ['normal', 'tumor']: f.write('/data/{} {} {}\n'.format( bam + '.bam', get_mean_insert_size(work_dir, bam + '.bam'), bam)) # Call: Pindel parameters = [ '-f', '/data/ref.fasta', '-i', '/data/pindel-config.txt', '--number_of_threads', str(job.cores), '--minimum_support_for_event', '3', '--report_long_insertions', 'true', '--report_breakpoints', 'true', '-o', 'pindel' ] docker_call( tool= 'quay.io/ucsc_cgl/pindel:0.2.5b6--4e8d1b31d4028f464b3409c6558fb9dfcad73f88', work_dir=work_dir, parameters=parameters) # Collect output files and write to file store output_files = glob(os.path.join(work_dir, 'pindel*')) tarball_files('pindel.tar.gz', file_paths=output_files, output_dir=work_dir) return job.fileStore.writeGlobalFile( os.path.join(work_dir, 'pindel.tar.gz'))
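# Illustrative sketch (values are made up): the pindel-config.txt written above contains one
# line per BAM in the form "<path> <mean_insert_size> <sample_label>", e.g.
#   /data/normal.bam 350 normal
#   /data/tumor.bam 362 tumor
# A minimal stand-in for get_mean_insert_size, assuming samtools is available on the PATH,
# might look like the following (it reads the whole `samtools view` output, so it is a sketch
# rather than a memory-efficient implementation):
def _mean_insert_size_sketch(work_dir, bam_name, max_reads=10000):
    import subprocess
    out = subprocess.check_output(['samtools', 'view', '-f', '0x2',
                                   os.path.join(work_dir, bam_name)])
    sizes = []
    for line in out.splitlines()[:max_reads]:
        tlen = abs(int(line.split('\t')[8]))  # SAM column 9 is the template length
        if tlen:
            sizes.append(tlen)
    return sum(sizes) / float(len(sizes)) if sizes else 0.0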
def run_muse(job, normal_bam, normal_bai, tumor_bam, tumor_bai, ref, ref_dict, fai, dbsnp):
    """
    Calls MuSE to find variants

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str normal_bam: Normal BAM FileStoreID
    :param str normal_bai: Normal BAM index FileStoreID
    :param str tumor_bam: Tumor BAM FileStoreID
    :param str tumor_bai: Tumor BAM Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference genome dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str dbsnp: DBSNP VCF FileStoreID
    :return: MuSE output (tarball) FileStoreID
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [normal_bam, normal_bai, tumor_bam, tumor_bai, ref, ref_dict, fai, dbsnp]
    file_names = ['normal.bam', 'normal.bai', 'tumor.bam', 'tumor.bai',
                  'ref.fasta', 'ref.dict', 'ref.fasta.fai', 'dbsnp.vcf']
    for file_store_id, name in zip(file_ids, file_names):
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Call: MuSE
    parameters = ['--mode', 'wxs',
                  '--dbsnp', '/data/dbsnp.vcf',
                  '--fafile', '/data/ref.fasta',
                  '--tumor-bam', '/data/tumor.bam',
                  '--tumor-bam-index', '/data/tumor.bai',
                  '--normal-bam', '/data/normal.bam',
                  '--normal-bam-index', '/data/normal.bai',
                  '--outfile', '/data/muse.vcf',
                  '--cpus', str(job.cores)]
    docker_call(tool='quay.io/ucsc_cgl/muse:1.0--6add9b0a1662d44fd13bbc1f32eac49326e48562',
                work_dir=work_dir, parameters=parameters)
    # Return fileStore ID
    tarball_files('muse.tar.gz', file_paths=[os.path.join(work_dir, 'muse.vcf')], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'muse.tar.gz'))
def variant_calling_and_qc(job, inputs, bam_id, bai_id):
    """
    Perform variant calling with samtools and QC with CheckBias

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of qc tarball
    :rtype: str
    """
    job.fileStore.logToMaster('Variant calling and QC: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    input_info = [(inputs.genome, 'genome.fa'), (inputs.positions, 'positions.tsv'),
                  (inputs.genome_index, 'genome.fa.fai'), (inputs.gtf, 'annotation.gtf'),
                  (inputs.gtf_m53, 'annotation.m53')]
    for url, fname in input_info:
        download_url(url, work_dir=work_dir, name=fname)
    # Part 1: Variant Calling
    variant_command = ['mpileup',
                       '-f', 'genome.fa',
                       '-l', 'positions.tsv',
                       '-v', 'alignment.bam',
                       '-t', 'DP,SP,INFO/AD,INFO/ADF,INFO/ADR,INFO/DPR,SP',
                       '-o', '/data/output.vcf.gz']
    docker_call(work_dir=work_dir, parameters=variant_command, sudo=inputs.sudo,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    # Part 2: QC
    qc_command = ['-o', 'qc',
                  '-n', 'alignment.bam',
                  '-a', 'annotation.gtf',
                  '-m', 'annotation.m53']
    docker_call(work_dir=work_dir, parameters=qc_command,
                tool='jvivian/checkbias:612f129--b08a1fb6526a620bbb0304b08356f2ae7c3c0ec3')
    # Write output to fileStore and return ids
    output_tsv = glob(os.path.join(work_dir, '*counts.tsv*'))[0]
    output_vcf = os.path.join(work_dir, 'output.vcf.gz')
    tarball_files('vcqc.tar.gz', file_paths=[output_tsv, output_vcf], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'vcqc.tar.gz'))
def run_rsem_postprocess(job, rsem_gene_id, rsem_isoform_id):
    """
    Parses RSEM's output to produce the separate .tab files (TPM, FPKM, counts) for both gene
    and isoform. These are two-column files: Genes and Quantifications.
    HUGO files are also provided that have been mapped from Gencode/Ensembl names.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str rsem_gene_id: FileStoreID of rsem_gene_ids
    :param str rsem_isoform_id: FileStoreID of rsem_isoform_ids
    :return: FileStoreIDs of the RSEM and HUGO post-process tarballs
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    # I/O
    genes = job.fileStore.readGlobalFile(rsem_gene_id, os.path.join(work_dir, 'rsem_genes.results'))
    iso = job.fileStore.readGlobalFile(rsem_isoform_id, os.path.join(work_dir, 'rsem_isoforms.results'))
    # Perform HUGO gene / isoform name mapping
    command = ['-g', 'rsem_genes.results', '-i', 'rsem_isoforms.results']
    docker_call(job=job,
                tool='quay.io/ucsc_cgl/gencode_hugo_mapping:1.0--cb4865d02f9199462e66410f515c4dabbd061e4d',
                parameters=command, work_dir=work_dir)
    hugo_files = [os.path.join(work_dir, x) for x in
                  ['rsem_genes.hugo.results', 'rsem_isoforms.hugo.results']]
    # Create tarballs for outputs
    tarball_files('rsem.tar.gz',
                  file_paths=[os.path.join(work_dir, x) for x in [genes, iso]],
                  output_dir=work_dir)
    tarball_files('rsem_hugo.tar.gz',
                  file_paths=[os.path.join(work_dir, x) for x in hugo_files],
                  output_dir=work_dir)
    rsem_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem.tar.gz'))
    hugo_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_hugo.tar.gz'))
    return rsem_id, hugo_id
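# Hedged wiring sketch mirroring rsem_quantification above: run_rsem (referenced there but not
# defined in this section) runs first, and its two promised return values feed
# run_rsem_postprocess. The core count and disk multiplier are illustrative assumptions.
def _rsem_chain_sketch(job, transcriptome_id, rsem_ref_url, paired=True, cores=16):
    rsem = job.wrapJobFn(run_rsem, transcriptome_id, rsem_ref_url, paired=paired,
                         cores=cores, disk=5 * transcriptome_id.size)
    postprocess = job.wrapJobFn(run_rsem_postprocess, rsem.rv(0), rsem.rv(1))
    job.addChild(rsem)
    rsem.addChild(postprocess)
    return postprocess.rv()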
def run_mutect(job, normal_bam, normal_bai, tumor_bam, tumor_bai, ref, ref_dict, fai, cosmic, dbsnp): """ Calls MuTect to perform variant analysis :param JobFunctionWrappingJob job: passed automatically by Toil :param str normal_bam: Normal BAM FileStoreID :param str normal_bai: Normal BAM index FileStoreID :param str tumor_bam: Tumor BAM FileStoreID :param str tumor_bai: Tumor BAM Index FileStoreID :param str ref: Reference genome FileStoreID :param str ref_dict: Reference dictionary FileStoreID :param str fai: Reference index FileStoreID :param str cosmic: Cosmic VCF FileStoreID :param str dbsnp: DBSNP VCF FileStoreID :return: MuTect output (tarball) FileStoreID :rtype: str """ work_dir = job.fileStore.getLocalTempDir() file_ids = [normal_bam, normal_bai, tumor_bam, tumor_bai, ref, fai, ref_dict, cosmic, dbsnp] file_names = ['normal.bam', 'normal.bai', 'tumor.bam', 'tumor.bai', 'ref.fasta', 'ref.fasta.fai', 'ref.dict', 'cosmic.vcf', 'dbsnp.vcf'] for file_store_id, name in zip(file_ids, file_names): job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name)) # Call: MuTect parameters = ['--analysis_type', 'MuTect', '--reference_sequence', 'ref.fasta', '--cosmic', '/data/cosmic.vcf', '--dbsnp', '/data/dbsnp.vcf', '--input_file:normal', '/data/normal.bam', '--input_file:tumor', '/data/tumor.bam', '--tumor_lod', str(10), # Taken from MC3 pipeline '--initial_tumor_lod', str(4.0), # Taken from MC3 pipeline '--out', 'mutect.out', '--coverage_file', 'mutect.cov', '--vcf', 'mutect.vcf'] docker_call(job=job, work_dir=work_dir, parameters=parameters, tool='quay.io/ucsc_cgl/mutect:1.1.7--e8bf09459cf0aecb9f55ee689c2b2d194754cbd3') # Write output to file store output_file_names = ['mutect.vcf', 'mutect.cov', 'mutect.out'] output_file_paths = [os.path.join(work_dir, x) for x in output_file_names] tarball_files('mutect.tar.gz', file_paths=output_file_paths, output_dir=work_dir) return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'mutect.tar.gz'))
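# Hedged orchestration sketch (an assumption about how these callers might be combined, not
# original pipeline code): MuTect, Pindel, and MuSE all take the same BAM/reference inputs, so
# they can run as sibling children of a parent job and their tarball FileStoreIDs collected.
def _run_callers_sketch(job, shared):
    # 'shared' is an assumed Namespace holding the FileStoreIDs used by the three jobs above
    mutect = job.addChildJobFn(run_mutect, shared.normal_bam, shared.normal_bai, shared.tumor_bam,
                               shared.tumor_bai, shared.ref, shared.ref_dict, shared.fai,
                               shared.cosmic, shared.dbsnp)
    pindel = job.addChildJobFn(run_pindel, shared.normal_bam, shared.normal_bai, shared.tumor_bam,
                               shared.tumor_bai, shared.ref, shared.fai)
    muse = job.addChildJobFn(run_muse, shared.normal_bam, shared.normal_bai, shared.tumor_bam,
                             shared.tumor_bai, shared.ref, shared.ref_dict, shared.fai, shared.dbsnp)
    return mutect.rv(), pindel.rv(), muse.rv()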
def run_margin_phase(job, config, chunk_file_id, chunk_info):
    # prep
    start = time.time()
    work_dir = job.fileStore.getLocalTempDir()
    chunk_idx = chunk_info[CI_CHUNK_INDEX]
    chunk_identifier = "{}.{}".format(config.uuid, chunk_idx)
    chunk_name = "{}.in.bam".format(chunk_identifier)
    chunk_location = os.path.join(work_dir, chunk_name)
    log(job, str(datetime.datetime.now()), chunk_identifier, 'run_margin_phase')

    # download bam chunk
    job.fileStore.readGlobalFile(chunk_file_id, chunk_location)
    if not os.path.isfile(chunk_location):
        raise UserError("Failed to download chunk {} from {}".format(chunk_name, chunk_file_id))

    # download references
    # ref genome
    genome_reference_name = "reference.fa"
    genome_reference_location = os.path.join(work_dir, genome_reference_name)
    job.fileStore.readGlobalFile(config.reference_genome_fileid, genome_reference_location)
    if not os.path.isfile(genome_reference_location):
        raise UserError("Failed to download genome reference {} from {}".format(
            os.path.basename(config.reference_genome), config.reference_genome_fileid))
    # params
    params_name = "params.json"
    params_location = os.path.join(work_dir, params_name)
    job.fileStore.readGlobalFile(config.params_fileid, params_location)
    if not os.path.isfile(params_location):
        raise UserError("Failed to download params {} from {}".format(
            os.path.basename(config.params), config.params_fileid))

    # do we want to run cPecan?
    cpecan_prob_location = None
    if config.cpecan_probabilities:
        cpecan_prob_location = run_margin_phase__run_cpecan_alignment(
            job, config, chunk_identifier, work_dir, chunk_name, genome_reference_name)

    # run marginPhase
    params = [os.path.join("/data", chunk_name),
              os.path.join("/data", genome_reference_name),
              os.path.join("/data", params_name),
              "-o", os.path.join("/data", "{}.out".format(chunk_identifier)),
              '--tag', "{},{}-{}".format(chunk_idx, chunk_info[CI_CHUNK_BOUNDARY_START],
                                         chunk_info[CI_CHUNK_BOUNDARY_END])]
    if cpecan_prob_location is not None:
        params.extend(['--singleNuclProbDir', os.path.join("/data", cpecan_prob_location)])
    docker_call(job, config, work_dir, params, config.margin_phase_image, config.margin_phase_tag)
    log_debug_from_docker(job, os.path.join(work_dir, DOCKER_MARGIN_PHASE_LOG), chunk_identifier,
                          'margin_phase', [chunk_location, genome_reference_location])
    log_location = os.path.join(work_dir, "marginPhase.{}.log".format(chunk_identifier))
    os.rename(os.path.join(work_dir, DOCKER_MARGIN_PHASE_LOG), log_location)

    # document output
    log(job, "Output files after marginPhase:", chunk_identifier, 'run_margin_phase')
    output_file_locations = glob.glob(os.path.join(work_dir, "{}*".format(chunk_identifier)))
    output_file_locations.append(log_location)
    found_vcf, found_sam = False, False
    for f in output_file_locations:
        log(job, "\t\t{}".format(os.path.basename(f)), chunk_identifier, 'run_margin_phase')
        if f.endswith(VCF_SUFFIX):
            found_vcf = True
        if f.endswith(SAM_UNIFIED_SUFFIX):
            found_sam = True
    if cpecan_prob_location is not None:
        cpecan_tarball = glob.glob(os.path.join(work_dir, cpecan_prob_location, "*.tar.gz"))
        if len(cpecan_tarball) == 0:
            # todo why has tarball_files failed in this location?
            log(job, "Found no cpecan output tarball! Trying alt location.", chunk_identifier,
                'run_margin_phase')
            cpecan_tarball = glob.glob(os.path.join(work_dir, "*.tar.gz"))
        if len(cpecan_tarball) == 0:
            log(job, "Found no cpecan output tarball!", chunk_identifier, 'run_margin_phase')
        elif len(cpecan_tarball) > 1:
            log(job, "Found {} cpecan output tarballs: {}".format(len(cpecan_tarball), cpecan_tarball),
                chunk_identifier, 'run_margin_phase')
        else:
            log(job, "Saving cpecan output tarball: {}".format(cpecan_tarball[0]), chunk_identifier,
                'run_margin_phase')
            output_file_locations.append(cpecan_tarball[0])

    # tarball the output and save
    tarball_name = "{}.tar.gz".format(chunk_identifier)
    tarball_files(tar_name=tarball_name, file_paths=output_file_locations, output_dir=work_dir)

    # validate output, retry if not
    if not (found_sam and found_vcf):
        if "retry_attempts" not in config:
            config.retry_attempts = 1
        else:
            config.retry_attempts += 1
            if config.retry_attempts > MAX_RETRIES:
                log(job, "", chunk_identifier, 'run_margin_phase')
                error = "Failed to generate appropriate output files {} times".format(MAX_RETRIES)
                log(job, error, chunk_identifier, 'run_margin_phase')
                # this enables us to "recover" in the face of failure during a run
                if CONTINUE_AFTER_FAILURE:
                    output_file_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, tarball_name))
                    chunk_info[CI_OUTPUT_FILE_ID] = output_file_id
                    return chunk_info
                raise UserError("{}:{}".format(chunk_identifier, error))

        log(job, "Missing output files. Attempting retry {}".format(config.retry_attempts),
            chunk_identifier, 'run_margin_phase')
        log(job, "Failed job log file:", chunk_identifier, 'run_margin_phase')
        log(job, "", chunk_identifier, 'run_margin_phase')
        with open(log_location, 'r') as input:
            for line in input:
                log(job, "\t\t{}".format(line.rstrip()), chunk_identifier, 'run_margin_phase')

        # new job
        retry_job = job.addChildJobFn(run_margin_phase, config, chunk_file_id, chunk_info,
                                      memory=str(int(config.maxMemory / 1024)) + "K",
                                      cores=job.cores, disk=job.disk)
        # save failed output
        if config.intermediate_file_location is not None:
            tarball_fail_name = "{}.FAILURE.{}.tar.gz".format(chunk_identifier, config.retry_attempts)
            os.rename(os.path.join(work_dir, tarball_name), os.path.join(work_dir, tarball_fail_name))
            copy_files(file_paths=[os.path.join(work_dir, tarball_fail_name)],
                       output_dir=config.intermediate_file_location)

        log_generic_job_debug(job, config.uuid, 'run_margin_phase', work_dir=work_dir)
        return retry_job.rv()

    # if successful, save output
    if config.intermediate_file_location is not None:
        copy_files(file_paths=[os.path.join(work_dir, tarball_name)],
                   output_dir=config.intermediate_file_location)
    output_file_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, tarball_name))
    chunk_info[CI_OUTPUT_FILE_ID] = output_file_id

    # log
    log_generic_job_debug(job, config.uuid, 'run_margin_phase', work_dir=work_dir)
    log_time(job, "run_margin_phase", start, chunk_identifier)
    return chunk_info
def run_data_analysis(job, config, tcc_matrix_id, pwise_dist_l1_id, nonzero_ec_id,
                      kallisto_matrix_id, matrix_tsv_id, matrix_cells_id):
    """
    Generates graphs and plots of results. Uploads images to savedir location.

    :param job: toil job
    :param config: toil job configuration
    :param tcc_matrix_id: jobstore location of TCC matrix (.dat)
    :param pwise_dist_l1_id: jobstore location of L1 pairwise distance (.dat)
    :param nonzero_ec_id: jobstore location of nonzero ec (.dat)
    :param kallisto_matrix_id: id of kallisto output matrix (.ec)
    :param matrix_tsv_id: id of kallisto output matrix (.tsv)
    :param matrix_cells_id: id of kallisto output matrix (.cells)
    """
    # source: https://github.com/pachterlab/scRNA-Seq-TCC-prep (/blob/master/notebooks/10xResults.ipynb)
    # extract output
    job.fileStore.logToMaster('Performing data analysis')
    # read files
    work_dir = job.fileStore.getLocalTempDir()
    tcc_matrix = job.fileStore.readGlobalFile(tcc_matrix_id, os.path.join(work_dir, "TCC_matrix.dat"))
    pwise_dist_l1 = job.fileStore.readGlobalFile(pwise_dist_l1_id,
                                                 os.path.join(work_dir, "pwise_dist_L1.dat"))
    nonzero_ec = job.fileStore.readGlobalFile(nonzero_ec_id, os.path.join(work_dir, "nonzero_ec.dat"))
    kallisto_matrix = job.fileStore.readGlobalFile(kallisto_matrix_id,
                                                   os.path.join(work_dir, 'kallisto_matrix.ec'))
    matrix_tsv = job.fileStore.readGlobalFile(matrix_tsv_id, os.path.join(work_dir, MATRIX_TSV_FILENAME))
    matrix_cells = job.fileStore.readGlobalFile(matrix_cells_id,
                                                os.path.join(work_dir, MATRIX_CELLS_FILENAME))

    ##############################################################
    # load dataset
    with open(os.path.join(work_dir, "TCC_matrix.dat"), 'rb') as f:
        tcc_matrix = pickle.load(f)
    with open(os.path.join(work_dir, "pwise_dist_L1.dat"), 'rb') as f:
        pwise_dist_l1 = pickle.load(f)
    with open(os.path.join(work_dir, "nonzero_ec.dat"), 'rb') as f:
        nonzero_ec = pickle.load(f)
    ecfile_dir = os.path.join(work_dir, 'kallisto_matrix.ec')
    eclist = np.loadtxt(ecfile_dir, dtype=str)

    tcc = tcc_matrix.T
    T_norm = normalize(tcc_matrix, norm='l1', axis=0)
    t_normt = T_norm.transpose()

    num_of_cells = np.shape(tcc_matrix)[1]
    print("NUM_OF_CELLS =", num_of_cells)
    print("NUM_OF_nonzero_EC =", np.shape(tcc_matrix)[0])

    #################################
    EC_dict = {}
    for i in range(np.shape(eclist)[0]):
        EC_dict[i] = [int(x) for x in eclist[i, 1].split(',')]

    union = set()
    for i in nonzero_ec:
        new = [tx for tx in EC_dict[i] if tx not in union]  # filter out previously seen transcripts
        union.update(new)
    NUM_OF_TX_inTCC = len(union)
    print("NUM_OF_Transcripts =", NUM_OF_TX_inTCC)  # number of distinct transcripts in nonzero eq. classes

    ##############################################################
    # inspect

    # sort eq. classes based on size
    size_of_ec = [len(EC_dict[i]) for i in nonzero_ec]
    ec_idx = [i[0] for i in sorted(enumerate(size_of_ec), key=lambda x: x[1])]
    index_ec = np.array(ec_idx)

    ec_sort_map = {}
    nonzero_ec_srt = []  # init
    for i in range(len(nonzero_ec)):
        nonzero_ec_srt += [nonzero_ec[index_ec[i]]]
        ec_sort_map[nonzero_ec[index_ec[i]]] = i

    sumi = np.array(tcc_matrix.sum(axis=1))
    sumi_sorted = sumi[index_ec]
    total_num_of_umis = int(sumi_sorted.sum())
    total_num_of_umis_per_cell = np.array(tcc_matrix.sum(axis=0))[0, :]

    print("Total number of UMIs =", total_num_of_umis)

    #################################
    fig, ax1 = plt.subplots()
    ax1.plot(sorted(total_num_of_umis_per_cell)[::-1], 'b-', linewidth=2.0)
    ax1.set_title('UMI counts per cell')
    ax1.set_xlabel('cells (sorted by UMI counts)')
    ax1.set_ylabel('UMI counts')
    ax1.set_yscale("log", nonposy='clip')
    ax1.grid(True)
    ax1.grid(True, 'minor')
    umi_counts_per_cell = os.path.join(work_dir, "UMI_counts_per_cell.png")
    plt.savefig(umi_counts_per_cell, format='png')

    fig, ax1 = plt.subplots()
    ax1.plot(sorted(sumi.reshape(np.shape(sumi)[0]))[::-1], 'r-', linewidth=2.0)
    ax1.set_title('UMI counts per eq. class')
    ax1.set_xlabel('ECs (sorted by UMI counts)')
    ax1.set_ylabel('UMI counts')
    ax1.set_yscale("log", nonposy='clip')
    ax1.grid(True)
    ax1.grid(True, 'minor')
    umi_counts_per_class = os.path.join(work_dir, "UMI_counts_per_class.png")
    plt.savefig(umi_counts_per_class, format='png')

    cell_nonzeros = np.array(((T_norm != 0)).sum(axis=0))[0]

    fig, ax1 = plt.subplots()
    ax1.plot(total_num_of_umis_per_cell, cell_nonzeros, '.g', linewidth=2.0)
    ax1.set_title('UMI counts vs nonzero ECs')
    ax1.set_xlabel('total num of umis per cell')
    ax1.set_ylabel('total num of nonzero ecs per cell')
    ax1.set_yscale("log", nonposy='clip')
    ax1.set_xscale("log", nonposy='clip')
    ax1.grid(True)
    ax1.grid(True, 'minor')
    umi_counts_vs_nonzero_ecs = os.path.join(work_dir, "UMI_counts_vs_nonzero_ECs.png")
    plt.savefig(umi_counts_vs_nonzero_ecs, format='png')

    # TCC MEAN-VARIANCE
    # todo verify this works
    TCC_var = np.var(tcc.todense(), axis=0)
    TCC_mean = np.mean(tcc.todense(), axis=0)
    TCC_mean = np.array(TCC_mean)[0]
    TCC_var = np.array(TCC_var)[0]

    fig = plt.figure()
    N = tcc.sum()
    C = tcc.shape[0]
    ax = plt.gca()
    ax.plot(TCC_mean, TCC_var, '.', c='blue', alpha=0.5, markeredgecolor='none')
    xlims = [0.0001, 10 * TCC_mean.max()]
    ax.set_xlim(xlims)
    ax.set_ylim([0.0001, 10 * TCC_var.max()])
    ax.set_yscale('symlog')
    ax.set_xscale('symlog')
    ax.plot(xlims, [(C - 1) * (xlims[0]) ** 2, (C - 1) * (xlims[1]) ** 2],
            color='g', linestyle='-', linewidth=2)
    ax.plot(xlims, [(xlims[0]), (xlims[1])], color='k', linestyle='--', linewidth=1)
    ax.set_title("TCC Mean-Variance [" + str(tcc.shape[1]) + " TCCs in " + str(C) + " Cells]")
    ax.set_xlabel("mean(TCC)")
    ax.set_ylabel("var(TCC)")
    tcc_mean_variance = os.path.join(work_dir, "TCC_mean_variance.png")
    plt.savefig(tcc_mean_variance, format='png')

    ##############################################################
    # clustering

    #################################
    # t-SNE
    x_tsne = tSNE_pairwise(2, pwise_dist_l1)

    #################################
    # spectral clustering
    n_clusters = config.n_clusters
    similarity_mat = pwise_dist_l1.max() - pwise_dist_l1
    labels_spectral = spectral(n_clusters, similarity_mat)
    spectral_clustering = stain_plot(x_tsne, labels_spectral, [],
                                     "TCC -- tSNE, spectral clustering with " + str(n_clusters) +
                                     " n_clusters",
                                     work_dir=work_dir, filename="spectral_clustering_tSNE")

    #################################
    # affinity propagation
    pref = -np.median(pwise_dist_l1) * np.ones(num_of_cells)
    labels_aff = AffinityProp(-pwise_dist_l1, pref, 0.5)
    np.unique(labels_aff)
    affinity_propagation_tsne = stain_plot(x_tsne, labels_aff, [], "TCC -- tSNE, affinity propagation",
                                           work_dir, "affinity_propagation_tSNE")

    #################################
    # pca
    pca = PCA(n_components=2)
    x_pca = pca.fit_transform(t_normt.todense())
    affinity_propagation_pca = stain_plot(x_pca, labels_aff, [], "TCC -- PCA, affinity propagation",
                                          work_dir, "affinity_propagation_PCA")

    # SC3
    outfilePath = job.fileStore.getLocalTempFile()
    SC3OutputPath = os.path.join(work_dir, SC3_OUTPUT_DIRECTORY)
    os.mkdir(SC3OutputPath)
    shouldUseSC3Output = True
    with open(outfilePath, "r+") as outfile:
        def dockerPathTo(resource):
            return os.path.join(DOCKER_WORK_DIR, resource)

        def boolForR(aBool):
            return "TRUE" if aBool else "FALSE"

        try:
            dockerCall(job, tool='rscript', workDir=work_dir,
                       parameters=map(str, [config.min_k, config.max_k,
                                            dockerPathTo(MATRIX_TSV_FILENAME),
                                            dockerPathTo(MATRIX_CELLS_FILENAME),
                                            dockerPathTo(SC3_OUTPUT_DIRECTORY),
                                            boolForR(config.use_estimated_k),
                                            boolForR(config.debug)]),
                       outfile=outfile)
        except CalledProcessError:
            outfile.seek(0, 0)
            job.fileStore.logToMaster("Docker failed with the following log: " + str(outfile.read()))
            shouldUseSC3Output = False

    # build tarfile of output plots
    output_files = [umi_counts_per_cell, umi_counts_per_class, umi_counts_vs_nonzero_ecs,
                    tcc_mean_variance, spectral_clustering, affinity_propagation_tsne,
                    affinity_propagation_pca, outfilePath] + \
                   ([os.path.join(work_dir, SC3_OUTPUT_DIRECTORY, x) for x in os.listdir(SC3OutputPath)]
                    if shouldUseSC3Output else [])
    tarball_files(tar_name='single_cell_plots.tar.gz', file_paths=output_files, output_dir=work_dir)

    # return file id for consolidation
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'single_cell_plots.tar.gz'))
def run_single_cell(job, sample, config):
    """
    Performs single cell analysis through the quay.io/ucsc_cgl/kallisto_sc image (which uses code
    from the repo: https://github.com/pachterlab/scRNA-Seq-TCC-prep).
    Output includes TCC matrix from kallisto process.

    :param job: toil job
    :param config: configuration for toil job
    :param sample: a [UUID, url(s)] pair as constructed by parse_samples
    """
    # Common logic (for handling pre- and post-Kallisto data)
    config = argparse.Namespace(**vars(config))  # why?
    config.cores = min(config.maxCores, multiprocessing.cpu_count())
    work_dir = job.fileStore.getLocalTempDir()

    # Get input files
    uuid, type, urls = sample
    config.uuid = uuid

    # Handle kallisto output file (only works w/ one file for now)
    if type == "plot":
        filename = os.path.basename(urls[0])
        download_url(job, url=urls[0], name=filename, work_dir=work_dir)
        tar = tarfile.open(name=os.path.join(work_dir, filename))
        # post, kallisto, plots folders are in this root folder, with same name as the archive
        root_dir = rstrip(os.path.basename(urls[0]), ".tar.gz")
        kallisto_output = None  # could just forward the kallisto output
        post_processing_output = None  # same with this

        # method that, given the location of the file in the tar, writes it to the global job store
        def tarToGlobal(folder, path):
            with closing(tar.extractfile(os.path.join(root_dir, folder, path))) as file:
                data = file.read()
                with job.fileStore.writeGlobalFileStream() as (stream, id):
                    stream.write(data)
                    return id

        tcc_matrix_id = tarToGlobal("post", TCC_MATRIX_FILENAME)
        pwise_dist_l1_id = tarToGlobal("post", PWISE_DIST_FILENAME)
        nonzero_ec_id = tarToGlobal("post", NONZERO_EC_FILENAME)
        kallisto_matrix_id = tarToGlobal("post", KALLISTO_MATRIX_FILENAME)
        matrix_tsv_id = tarToGlobal("kallisto", "matrix.tsv")
        matrix_cells_id = tarToGlobal("kallisto", "matrix.cells")
    # Handle fastq file(s)
    else:
        input_location = os.path.join(work_dir, "fastq_input")
        os.mkdir(input_location)
        for url in urls:
            if url.endswith('.tar') or url.endswith('.tar.gz'):
                tar_path = os.path.join(work_dir, os.path.basename(url))
                download_url(job, url=url, work_dir=work_dir)
                subprocess.check_call(['tar', '-xvf', tar_path, '-C', input_location])
                os.remove(tar_path)
            elif url.endswith('.gz'):
                download_url(job, url=url, work_dir=input_location)
                subprocess.check_call(['gunzip', os.path.join(input_location, os.path.basename(url))])
            else:
                job.fileStore.logToMaster("Download url " + str(url))
                download_url(job, url=url, work_dir=input_location)

        # Generate configuration JSON
        with open(os.path.join(work_dir, "config.json"), 'w') as config_file:
            config_file.write(build_patcherlab_config(config))
        # Get Kallisto index
        download_url(job, url=config.kallisto_index, name='kallisto_index.idx', work_dir=work_dir)
        # Create other locations for patcherlab stuff
        os.mkdir(os.path.join(work_dir, "tcc"))
        os.mkdir(os.path.join(work_dir, "output"))

        if type == "pseudo":
            # Call docker image
            dockerCall(job, tool='quay.io/ucsc_cgl/kallisto_sc:latest', workDir=work_dir,
                       parameters=["/data/config.json"])
        else:
            # quantification of quake brain-style paired end fastqs, each for a different cell
            require(type == "quant", "invalid type " + type + " found in manifest ")
            os.mkdir(os.path.join(work_dir, "quant_output"))
            # Call docker image
            dockerCall(job, tool='kallisto_sc_quant', workDir=work_dir,
                       parameters=["/data/kallisto_index.idx", "/data/quant_output",
                                   str(config.cores), "/data/fastq_input"])
            # Consolidate abundances for the various cells
            quant_output = os.path.join(work_dir, "quant_output")
            consolidated = os.path.join(work_dir, "quant_consolidated")
            os.mkdir(consolidated)
            for output_folder in os.listdir(quant_output):
                shutil.copy(os.path.join(quant_output, output_folder, "abundance.tsv"),
                            os.path.join(consolidated, output_folder + ".tsv"))
            # quant to pseudo
            quant_to_pseudo(None, consolidated, os.path.join(work_dir, "tcc"))

        # run post-processing
        save_dir = os.path.join(work_dir, "save")
        os.mkdir(save_dir)
        # save_dir should be the same as specified in build_pachterlab_config. It may be worth
        # refactoring so that these don't have to be manually synced, although there's no reason
        # for these values to ever change and thus become desynced.
        prep_tcc_matrix(job, threads=config.cores, tcc_output_dir=os.path.join(work_dir, "tcc"),
                        save_dir=save_dir)

        # Irrespective of whether quant or pseudo, because of quant-to-pseudo conversion
        # Build tarfile of output
        output_files = glob(os.path.join(work_dir, "tcc", "*"))
        tarball_files(tar_name='kallisto_output.tar.gz', file_paths=output_files, output_dir=work_dir)
        kallisto_output = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'kallisto_output.tar.gz'))

        # Consolidate post-processing output
        tcc_matrix_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'save', TCC_MATRIX_FILENAME))
        pwise_dist_l1_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'save', PWISE_DIST_FILENAME))
        nonzero_ec_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'save', NONZERO_EC_FILENAME))
        kallisto_matrix_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'tcc', 'matrix.ec'))
        post_processing_output = {
            TCC_MATRIX_FILENAME: tcc_matrix_id,
            PWISE_DIST_FILENAME: pwise_dist_l1_id,
            NONZERO_EC_FILENAME: nonzero_ec_id,
            KALLISTO_MATRIX_FILENAME: kallisto_matrix_id  # technically redundant
        }
        # Prepare files to send to plots for SC3
        matrix_tsv_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, "tcc", "matrix.tsv"))
        matrix_cells_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, "tcc", "matrix.cells"))

    # Graphing step
    if config.generate_graphs:
        graphical_output = job.addChildJobFn(run_data_analysis, config, tcc_matrix_id,
                                             pwise_dist_l1_id, nonzero_ec_id, kallisto_matrix_id,
                                             matrix_tsv_id, matrix_cells_id).rv()
        job.addFollowOnJobFn(consolidate_output, config, kallisto_output, graphical_output,
                             post_processing_output)
    else:
        # converts to UUID name scheme and transfers to output location
        consolidate_output(job, config, kallisto_output=kallisto_output, graphical_output=None,
                           post_processing_output=post_processing_output)
def archiveBatchAndUploadToFileStore(parent_job, batch, workdir):
    tarname = "%s.tmp" % uuid.uuid4().hex
    tarpath = os.path.join(workdir, tarname)
    tarball_files(tar_name=tarname, file_paths=batch, output_dir=workdir)
    require(os.path.exists(tarpath),
            "[archiveBatchAndUploadToFileStore]Didn't make smaller tar")
    return parent_job.fileStore.writeGlobalFile(tarpath)
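# Hedged usage sketch (the batch size and file list are illustrative assumptions): a parent job
# can split a long list of files into fixed-size batches and archive each batch separately,
# keeping one FileStoreID per tarball so downstream jobs can pull only what they need.
def _archive_in_batches_sketch(parent_job, file_paths, workdir, batch_size=100):
    tar_ids = []
    for start in range(0, len(file_paths), batch_size):
        batch = file_paths[start:start + batch_size]
        tar_ids.append(archiveBatchAndUploadToFileStore(parent_job, batch, workdir))
    return tar_ids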
def run_margin_phase__run_cpecan_alignment(job, config, chunk_identifier, work_dir, alignment_filename, reference_filename): # prep start = time.time() fcn_identifier = "run_margin_phase:run_cpecan_alignment" log(job, "{}".format(datetime.datetime.now()), chunk_identifier, fcn_identifier) log( job, "Running cPecan positional probabilities on {}".format( alignment_filename), chunk_identifier, fcn_identifier) # index bam _index_bam(job, config, work_dir, alignment_filename) # build cPecan args out_dir_name = "cPecan_out" params = [ '--ref', os.path.join("/data", reference_filename), '--alignment_file', os.path.join("/data", alignment_filename), '--workdir_directory', '/data/tmp', '--output_directory', os.path.join("/data", out_dir_name), '--validate', '--threads', str(job.cores) ] hmm_location = run_margin_phase__infer_cpecan_hmm_location( chunk_identifier) if hmm_location is not None: params.extend(['--realign_hmm', hmm_location]) # run cpecan docker_call(job, config, work_dir, params, config.cpecan_image, config.cpecan_tag) # document output log_debug_from_docker(job, os.path.join(work_dir, DOCKER_CPECAN_LOG), chunk_identifier, fcn_identifier, [ os.path.join(work_dir, alignment_filename), os.path.join(work_dir, reference_filename) ]) require_docker_file_output(job, config, work_dir, [os.path.join(work_dir, out_dir_name)], fcn_identifier, log_filename=DOCKER_CPECAN_LOG) output_files = glob.glob( os.path.join(work_dir, out_dir_name, "*".format(chunk_identifier))) dir_count = len(list(filter(lambda x: os.path.isdir(x), output_files))) file_count = len(list(filter(lambda x: os.path.isfile(x), output_files))) log( job, "cPecan generated {} output files ({} directory, {} file)".format( len(output_files), dir_count, file_count), chunk_identifier, fcn_identifier) if os.path.isfile(os.path.join(work_dir, DOCKER_CPECAN_LOG)): output_files.append(os.path.join(work_dir, DOCKER_CPECAN_LOG)) # tarball the output and save tarball_name = "{}.nuc_pos_prob.tar.gz".format(chunk_identifier) try: tarball_files(tar_name=tarball_name, file_paths=output_files, output_dir=os.path.join(work_dir, out_dir_name)) except Exception, e: log(job, "{} error making cPecan tarball: {}".format(type(e), e), chunk_identifier, fcn_identifier) tarball_files(tar_name=tarball_name, file_paths=output_files, output_dir=work_dir) log(job, "created tarball in work_dir: {}".format(os.path.join(work_dir)), chunk_identifier, fcn_identifier)
def merge_chunks(job, config, chunk_infos):
    # prep
    start = time.time()
    uuid = config.uuid
    work_dir = job.fileStore.getLocalTempDir()
    log(job, "{}".format(datetime.datetime.now()), uuid, 'merge_chunks')
    log(job, "Merging {} chunks".format(len(chunk_infos)), uuid, 'merge_chunks')
    if config.minimal_output:
        log(job, "Minimal output is configured, will only save full chromosome vcf and merged BAMs",
            uuid, 'merge_chunks')

    # work directory for tar management
    # output files
    merged_chunks_directory = os.path.join(work_dir, ID_MERGED)
    os.mkdir(merged_chunks_directory)
    full_merged_vcf_file = os.path.join(merged_chunks_directory, "{}.merged.vcf".format(config.uuid))
    full_merged_sam_file = os.path.join(merged_chunks_directory, "{}.merged.sam".format(config.uuid))

    # sort by chunk index and validate
    chunk_infos.sort(key=(lambda x: x[CI_CHUNK_INDEX]))
    idx = 0
    missing_indices = []
    for ci in chunk_infos:
        while ci[CI_CHUNK_INDEX] > idx:
            missing_indices.append(idx)
            idx += 1
        idx += 1
    if len(missing_indices) > 0:
        log(job, "Found {} missing indices: {}".format(len(missing_indices), missing_indices),
            uuid, 'merge_chunks')

    # prep for iteration
    merge_decisions = dict()
    prev_chunk_workdir = ""
    prev_chunk_sam_file = None
    prev_chunk_vcf_file = None
    prev_chunk = {CI_CHUNK_INDEX: "start"}
    prev_written_reads = set()
    prev_vcf_split_pos = None
    prev_vcf_phase_action = None

    # iterate over all chunks
    for chunk in chunk_infos:
        # get current chunk info/files
        chunk_idx = chunk[CI_CHUNK_INDEX]
        chunk_boundary = chunk[CI_CHUNK_BOUNDARY_START]
        merging_step_identifier = "{}:{}-{}".format(config.uuid, prev_chunk[CI_CHUNK_INDEX],
                                                    chunk[CI_CHUNK_INDEX])
        curr_chunk_workdir = os.path.join(work_dir, "tmp-{}".format(chunk_idx))
        curr_chunk_sam_file, curr_chunk_vcf_file = merge_chunks__extract_chunk_tarball(
            job, config, curr_chunk_workdir, chunk)
        log(job, "merging {} and {} across boundary {}".format(prev_chunk[CI_CHUNK_INDEX], chunk_idx,
                                                               chunk_boundary), uuid, 'merge_chunks')

        # error out if missing files
        if curr_chunk_sam_file is None or curr_chunk_vcf_file is None:
            error = "{}: Missing expected output file, sam:{}, vcf:{}, chunk_info:{}".format(
                chunk_idx, curr_chunk_sam_file, curr_chunk_vcf_file, chunk)
            log(job, error, uuid, 'merge_chunks')
            job.fileStore.logToMaster(error)
            if CONTINUE_AFTER_FAILURE:
                # prev chunk info is maintained, and will be written during next chunk
                continue
            raise UserError("{}:{}".format(uuid, error))

        # skip writing the first chunk
        if prev_chunk_sam_file is None:
            curr_written_reads = set()
            curr_vcf_split_pos = 0
            curr_vcf_phase_action = dict()
        # write the rest of the chunks
        else:
            # get chunk splitting
            prev_reads, curr_reads, curr_vcf_split_pos, curr_vcf_phase_action, decision_summary = \
                merge_chunks__determine_chunk_splitting(job, merging_step_identifier,
                                                        prev_chunk_sam_file, curr_chunk_sam_file,
                                                        chunk_boundary)
            merge_decisions[decision_summary] = \
                merge_decisions[decision_summary] + 1 if decision_summary in merge_decisions else 1

            # write sam
            curr_written_reads = merge_chunks__append_sam_reads(job, merging_step_identifier,
                                                                prev_chunk_sam_file,
                                                                full_merged_sam_file,
                                                                prev_reads, prev_written_reads)
            if len(curr_reads) > 0:
                curr_written_right_reads = merge_chunks__append_sam_reads(job, merging_step_identifier,
                                                                          curr_chunk_sam_file,
                                                                          full_merged_sam_file,
                                                                          curr_reads, curr_written_reads)
                curr_written_reads = curr_written_reads.union(curr_written_right_reads)

            # write vcf
            merge_chunks__append_vcf_calls(job, merging_step_identifier, prev_chunk_vcf_file,
                                           full_merged_vcf_file, prev_vcf_split_pos, curr_vcf_split_pos,
                                           prev_vcf_phase_action,
                                           mp_identifier=prev_chunk[CI_CHUNK_INDEX])

        # cleanup
        if os.path.isdir(prev_chunk_workdir):
            shutil.rmtree(prev_chunk_workdir)

        # iterate
        prev_chunk = chunk
        prev_chunk_workdir = curr_chunk_workdir
        prev_chunk_sam_file = curr_chunk_sam_file
        prev_chunk_vcf_file = curr_chunk_vcf_file
        prev_written_reads = curr_written_reads
        prev_vcf_split_pos = curr_vcf_split_pos
        prev_vcf_phase_action = curr_vcf_phase_action

    # write the final reads and calls
    merging_step_identifier = "{}:{}-{}".format(config.uuid, prev_chunk[CI_CHUNK_INDEX], "end")
    merge_chunks__append_sam_reads(job, merging_step_identifier, prev_chunk_sam_file,
                                   full_merged_sam_file, {None: None}, prev_written_reads)
    merge_chunks__append_vcf_calls(job, merging_step_identifier, prev_chunk_vcf_file,
                                   full_merged_vcf_file, prev_vcf_split_pos, sys.maxint,
                                   prev_vcf_phase_action, mp_identifier=prev_chunk[CI_CHUNK_INDEX])

    # loggit
    log(job, "Finished merge with following matches:", uuid, 'merge_chunks')
    job.fileStore.logToMaster("{}:merge_chunks: ".format(config.uuid))
    for decision, count in merge_decisions.items():
        log(job, "\t\t{}: \t{}".format(decision, count), uuid, 'merge_chunks')

    # tarball the output and save
    log(job, "Output files for merge:".format(), uuid, 'merge_chunks')
    output_file_locations = glob.glob(os.path.join(merged_chunks_directory, "*.*"))
    output_file_locations.sort()
    tmp = output_file_locations
    output_file_locations = list()
    for f in tmp:
        if os.path.isdir(f):
            log(job, "\t\t{} (skipped, directory)".format(os.path.basename(f)), uuid, 'merge_chunks')
        else:
            log(job, "\t\t{}".format(os.path.basename(f)), uuid, 'merge_chunks')
            output_file_locations.append(f)
    tarball_name = "{}.merged.tar.gz".format(config.uuid)
    tarball_files(tar_name=tarball_name, file_paths=output_file_locations, output_dir=work_dir)
    output_file_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, tarball_name))

    # we need to return the input list of chunk infos for consolidation
    chunk_infos.append({CI_UUID: config.uuid, CI_OUTPUT_FILE_ID: output_file_id,
                        CI_CHUNK_INDEX: ID_MERGED})

    log_generic_job_debug(job, config.uuid, "merge_chunks", work_dir=work_dir)
    log_time(job, "merge_chunks", start, config.uuid)
    return chunk_infos
def run_single_cell(job, sample, config):
    """
    Performs single cell analysis through the quay.io/ucsc_cgl/kallisto_sc image (which uses code
    from the repo: https://github.com/pachterlab/scRNA-Seq-TCC-prep).
    Output includes TCC matrix from kallisto process.

    :param job: toil job
    :param config: configuration for toil job
    :param sample: list of samples as constructed by the 'parse_samples' function
    """
    config = argparse.Namespace(**vars(config))
    config.cores = min(config.maxCores, multiprocessing.cpu_count())
    work_dir = job.fileStore.getLocalTempDir()
    # Generate configuration JSON
    with open(os.path.join(work_dir, "config.json"), 'w') as config_file:
        config_file.write(build_patcherlab_config(config))
    # Get Kallisto index
    download_url(job, url=config.kallisto_index, name='kallisto_index.idx', work_dir=work_dir)
    # Get input files
    input_location = os.path.join(work_dir, "fastq_input")
    os.mkdir(input_location)
    uuid, urls = sample
    config.uuid = uuid
    for url in urls:
        if url.endswith('.tar') or url.endswith('.tar.gz'):
            tar_path = os.path.join(work_dir, os.path.basename(url))
            download_url(job, url=url, work_dir=work_dir)
            subprocess.check_call(['tar', '-xvf', tar_path, '-C', input_location])
            os.remove(tar_path)
        else:
            download_url(job, url=url, work_dir=input_location)
    # Create other locations for patcherlab stuff
    os.mkdir(os.path.join(work_dir, "tcc"))
    os.mkdir(os.path.join(work_dir, "output"))
    # Call docker image
    dockerCall(job, tool='quay.io/ucsc_cgl/kallisto_sc:latest', workDir=work_dir,
               parameters=["/data/config.json"])
    # Build tarfile of output
    output_files = [os.path.join(work_dir, "tcc", x) for x in
                    ['run_info.json', 'matrix.tsv', 'matrix.ec', 'matrix.cells']]
    tarball_files(tar_name='kallisto_output.tar.gz', file_paths=output_files, output_dir=work_dir)
    kallisto_output = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'kallisto_output.tar.gz'))
    # Graphing step
    if config.generate_graphs:
        tcc_matrix_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'save', 'TCC_matrix.dat'))
        pwise_dist_l1_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'save', 'pwise_dist_L1.dat'))
        nonzero_ec_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'save', 'nonzero_ec.dat'))
        kallisto_matrix_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'tcc', 'matrix.ec'))
        graphical_output = job.addChildJobFn(run_data_analysis, config, tcc_matrix_id,
                                             pwise_dist_l1_id, nonzero_ec_id,
                                             kallisto_matrix_id).rv()
        job.addFollowOnJobFn(consolidate_output, config, kallisto_output, graphical_output)
    else:
        # converts to UUID name scheme and transfers to output location
        consolidate_output(job, config, kallisto_output=kallisto_output, graphical_output=None)
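# Hedged launch sketch (the job-store path, manifest entry, and config fields are illustrative
# assumptions): a (uuid, urls) pair, as parse_samples would produce, can be handed to
# run_single_cell as the root job of a workflow.
def _single_cell_launch_sketch():
    import argparse
    from toil.common import Toil
    from toil.job import Job
    options = Job.Runner.getDefaultOptions('./single-cell-jobstore')     # assumed job store
    sample = ('example-uuid', ['file:///tmp/example_fastqs.tar.gz'])     # assumed manifest entry
    cfg = argparse.Namespace(maxCores=4, generate_graphs=False,
                             kallisto_index='file:///tmp/kallisto_index.idx')  # assumed minimal config
    with Toil(options) as workflow:
        workflow.start(Job.wrapJobFn(run_single_cell, sample, cfg))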