def test_upload_and_download_with_encryption(tmpdir):
    """
    Round-trip a random file through S3 using a client-side encryption key

    A 32-byte key and a 1 KiB payload are generated in tmpdir, the payload is
    uploaded with s3am_upload, fetched again with download_url, and the two
    files are compared. The uploaded S3 key is deleted afterwards regardless
    of the test outcome.
    """
    from toil_lib.urls import download_url, s3am_upload
    from boto.s3.connection import S3Connection, Bucket, Key

    scratch = str(tmpdir)

    # Temporary 32-byte encryption key, written by dd from /dev/urandom
    key_path = os.path.join(scratch, 'foo.key')
    dd_command = ['dd', 'if=/dev/urandom', 'bs=1', 'count=32', 'of={}'.format(key_path)]
    subprocess.check_call(dd_command)

    # Random payload to upload
    source_path = os.path.join(scratch, 'upload_file')
    with open(source_path, 'wb') as payload:
        payload.write(os.urandom(1024))

    # A unique key under test/ keeps separate runs from colliding
    random_key = os.path.join('test/', str(uuid4()), 'upload_file')
    s3_url = os.path.join('s3://cgl-driver-projects/', random_key)
    try:
        s3am_upload(fpath=source_path,
                    s3_dir=os.path.split(s3_url)[0],
                    s3_key_path=key_path)

        # Fetch the object back and compare it with what was uploaded
        download_url(url=s3_url,
                     name='download_file',
                     work_dir=scratch,
                     s3_key_path=key_path)
        fetched_path = os.path.join(scratch, 'download_file')
        assert os.path.exists(fetched_path)
        assert filecmp.cmp(source_path, fetched_path)
    finally:
        # Remove the uploaded object. Per the original author's note, key
        # deletion never fails, so no exception handling is attempted here.
        with closing(S3Connection()) as connection:
            bucket = Bucket(connection, 'cgl-driver-projects')
            s3_key = Key(bucket)
            s3_key.key = random_key
            s3_key.delete()
def variant_calling_and_qc(job, inputs, bam_id, bai_id):
    """
    Run samtools mpileup variant calling and CheckBias QC on one alignment

    The BAM and its index are read from the fileStore, the reference inputs
    are downloaded, both tools are run through docker_call, and the first file
    matching '*counts.tsv*' plus output.vcf.gz are bundled into vcqc.tar.gz.

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of qc tarball
    :rtype: str
    """
    job.fileStore.logToMaster('Variant calling and QC: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()

    # Stage the alignment and its index from the fileStore
    staged = ((bam_id, 'alignment.bam'), (bai_id, 'alignment.bam.bai'))
    for file_id, local_name in staged:
        job.fileStore.readGlobalFile(file_id, os.path.join(work_dir, local_name))

    # Download every reference input under the fixed name the tools expect
    remote_inputs = [
        (inputs.genome, 'genome.fa'),
        (inputs.positions, 'positions.tsv'),
        (inputs.genome_index, 'genome.fa.fai'),
        (inputs.gtf, 'annotation.gtf'),
        (inputs.gtf_m53, 'annotation.m53'),
    ]
    for source_url, local_name in remote_inputs:
        download_url(job=job, url=source_url, work_dir=work_dir, name=local_name)

    # Part 1: variant calling with samtools mpileup
    samtools_image = 'quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c'
    mpileup_args = [
        'mpileup',
        '-f', 'genome.fa',
        '-l', 'positions.tsv',
        '-v', 'alignment.bam',
        '-t', 'DP,SP,INFO/AD,INFO/ADF,INFO/ADR,INFO/DPR,SP',
        '-o', '/data/output.vcf.gz',
    ]
    docker_call(job=job, work_dir=work_dir, parameters=mpileup_args, tool=samtools_image)

    # Part 2: QC with CheckBias
    checkbias_image = 'jvivian/checkbias:612f129--b08a1fb6526a620bbb0304b08356f2ae7c3c0ec3'
    checkbias_args = [
        '-o', 'qc',
        '-n', 'alignment.bam',
        '-a', 'annotation.gtf',
        '-m', 'annotation.m53',
    ]
    docker_call(job=job, work_dir=work_dir, parameters=checkbias_args, tool=checkbias_image)

    # Bundle the counts table and the VCF, then hand the tarball to the fileStore
    counts_tsv = glob(os.path.join(work_dir, '*counts.tsv*'))[0]
    vcf_path = os.path.join(work_dir, 'output.vcf.gz')
    tarball_files('vcqc.tar.gz', file_paths=[counts_tsv, vcf_path], output_dir=work_dir)
    tarball_path = os.path.join(work_dir, 'vcqc.tar.gz')
    return job.fileStore.writeGlobalFile(tarball_path)
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    :raises RuntimeError: if no '*genes_graph*' file can be found under the work dir
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input file
    download_url(job=job, url=inputs.gtf, work_dir=work_dir, name='annotation.gtf')
    download_url(job=job, url=inputs.gtf_pickle, work_dir=work_dir, name='annotation.gtf.pickle')
    # Call Spladder
    # NOTE(review): the '-o ' flag carries a trailing space and the expected
    # output path below contains a ' ' directory component; presumably the two
    # go together -- confirm before changing either.
    command = ['--insert_ir=y',
               '--insert_es=y',
               '--insert_ni=y',
               '--remove_se=n',
               '--validate_sg=n',
               '-b', 'alignment.bam',
               '-o ', '/data',
               '-a', 'annotation.gtf',
               '-v', 'y',
               '-c', '3',
               '-M', 'single',
               '-T', 'n',
               '-n', '50',
               '-P', 'y',
               '-p', 'n',
               '--sparse_bam', 'y']
    docker_call(job=job, work_dir=work_dir, parameters=command, sudo=inputs.sudo, tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, ' ', 'spladder', 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        # Fall back to the first '*genes_graph*' file found anywhere under work_dir
        matches = []
        for root, _dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    # Debugging aid: show what SplAdder left in the work dir. The call form of
    # print gives the same output under Python 2 and is valid Python 3 syntax.
    print(os.listdir(work_dir))
    tarball_files('spladder.tar.gz', file_paths=[output_pickle, output_filt, output], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'spladder.tar.gz'))
def run_star(job, r1_id, r2_id, star_index_url, wiggle=False):
    """
    Align single- or paired-end fastqs to a coordinate-sorted BAM with STAR

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, else pass None)
    :param str star_index_url: STAR index tarball
    :param bool wiggle: If True, will output a wiggle file and return it
    :return: FileStoreIDs of the transcriptome BAM and the sorted BAM, followed by
             the FileStoreID of the bedGraph wiggle file when wiggle is True
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()

    # Fetch and unpack the STAR index, then discard the tarball
    index_tarball = os.path.join(work_dir, 'starIndex.tar.gz')
    download_url(url=star_index_url, name='starIndex.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', index_tarball, '-C', work_dir])
    os.remove(index_tarball)

    # The index files sit either in a single sub-directory or at the tarball root
    extracted = os.listdir(work_dir)
    if len(extracted) == 1:
        star_index = os.path.join('/data', extracted[0])
    else:
        star_index = '/data'

    parameters = [
        '--runThreadN', str(job.cores),
        '--genomeDir', star_index,
        '--outFileNamePrefix', 'rna',
        '--outSAMtype', 'BAM', 'SortedByCoordinate',
        '--outSAMunmapped', 'Within',
        '--quantMode', 'TranscriptomeSAM',
        '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
        '--outFilterType', 'BySJout',
        '--outFilterMultimapNmax', '20',
        '--outFilterMismatchNmax', '999',
        '--outFilterMismatchNoverReadLmax', '0.04',
        '--alignIntronMin', '20',
        '--alignIntronMax', '1000000',
        '--alignMatesGapMax', '1000000',
        '--alignSJoverhangMin', '8',
        '--alignSJDBoverhangMin', '1',
        '--sjdbScore', '1',
    ]
    if wiggle:
        parameters += ['--outWigType', 'bedGraph',
                       '--outWigStrand', 'Unstranded',
                       '--outWigReferencesPrefix', 'chr']

    # R1 is always staged; R2 only when both IDs were supplied
    job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
    read_files = ['/data/R1.fastq']
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        read_files.append('/data/R2.fastq')
    parameters += ['--readFilesIn'] + read_files

    # Call: STAR Mapping
    docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir,
                parameters=parameters)

    # Write to fileStore
    transcriptome_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rnaAligned.toTranscriptome.out.bam'))
    sorted_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    if not wiggle:
        return transcriptome_id, sorted_id
    wiggle_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rnaSignal.UniqueMultiple.str1.out.bg'))
    return transcriptome_id, sorted_id, wiggle_id
def run_rsem(job, bam_id, rsem_ref_url, paired=True):
    """
    RNA quantification with RSEM

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param str bam_id: FileStoreID of transcriptome bam for quantification
    :param str rsem_ref_url: URL of RSEM reference (tarball)
    :param bool paired: If True, uses parameters for paired end data
    :return: FileStoreIDs for RSEM's gene and isoform output
    :rtype: tuple
    :raises RuntimeError: if the extracted reference contains no '.grp' file
    """
    work_dir = job.fileStore.getLocalTempDir()
    ref_tarball = os.path.join(work_dir, 'rsem_ref.tar.gz')
    download_url(url=rsem_ref_url, name='rsem_ref.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', ref_tarball, '-C', work_dir])
    os.remove(ref_tarball)
    # Determine tarball structure - based on it, ascertain folder name and rsem reference prefix.
    # "grp" is a required RSEM extension that should exist in the RSEM reference. Match on the
    # file extension rather than a substring of the full path, so that a temp dir or unrelated
    # file whose name merely contains "grp" cannot be mistaken for the reference.
    grp_prefixes = []
    for _root, _directories, files in os.walk(work_dir):
        for name in files:
            prefix, extension = os.path.splitext(name)
            if extension == '.grp':
                grp_prefixes.append(prefix)
    if not grp_prefixes:
        raise RuntimeError("Couldn't find a .grp file in the RSEM reference: {}".format(rsem_ref_url))
    ref_prefix = grp_prefixes[0]
    # Reference files sit either in a single sub-directory or at the tarball root
    extracted = os.listdir(work_dir)
    ref_folder = os.path.join('/data', extracted[0]) if len(extracted) == 1 else '/data'
    # I/O
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'transcriptome.bam'))
    output_prefix = 'rsem'
    # Call: RSEM
    parameters = ['--quiet',
                  '--no-qualities',
                  '-p', str(job.cores),
                  '--forward-prob', '0.5',
                  '--seed-length', '25',
                  '--fragment-length-mean', '-1.0',
                  '--bam', '/data/transcriptome.bam',
                  os.path.join(ref_folder, ref_prefix),
                  output_prefix]
    if paired:
        parameters = ['--paired-end'] + parameters
    docker_call(tool='quay.io/ucsc_cgl/rsem:1.2.25--d4275175cc8df36967db460b06337a14f40d2f21',
                parameters=parameters, work_dir=work_dir)
    # Give the results the fixed names under which they are stored
    os.rename(os.path.join(work_dir, output_prefix + '.genes.results'),
              os.path.join(work_dir, 'rsem_gene.tab'))
    os.rename(os.path.join(work_dir, output_prefix + '.isoforms.results'),
              os.path.join(work_dir, 'rsem_isoform.tab'))
    # Write to FileStore
    gene_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_gene.tab'))
    isoform_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_isoform.tab'))
    return gene_id, isoform_id
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    :raises RuntimeError: if no '*genes_graph*' file can be found under the work dir
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input file
    download_url(url=inputs.gtf, work_dir=work_dir, name='annotation.gtf')
    download_url(url=inputs.gtf_pickle, work_dir=work_dir, name='annotation.gtf.pickle')
    # Call Spladder
    # NOTE(review): the '-o ' flag carries a trailing space and the expected
    # output path below contains a ' ' directory component; presumably the two
    # go together -- confirm before changing either.
    command = ['--insert_ir=y',
               '--insert_es=y',
               '--insert_ni=y',
               '--remove_se=n',
               '--validate_sg=n',
               '-b', 'alignment.bam',
               '-o ', '/data',
               '-a', 'annotation.gtf',
               '-v', 'y',
               '-c', '3',
               '-M', 'single',
               '-T', 'n',
               '-n', '50',
               '-P', 'y',
               '-p', 'n',
               '--sparse_bam', 'y']
    docker_call(work_dir=work_dir, parameters=command, sudo=inputs.sudo, tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, ' ', 'spladder', 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        # Fall back to the first '*genes_graph*' file found anywhere under work_dir
        matches = []
        for root, _dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    # Debugging aid: show what SplAdder left in the work dir. The call form of
    # print gives the same output under Python 2 and is valid Python 3 syntax.
    print(os.listdir(work_dir))
    tarball_files('spladder.tar.gz', file_paths=[output_pickle, output_filt, output], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'spladder.tar.gz'))
def star(job, inputs, r1_cutadapt, r2_cutadapt):
    """
    Align paired fastqs with STAR, index the BAM, and schedule downstream jobs

    After alignment the sorted BAM and its index are written to the fileStore,
    the input fastqs are deleted from it, and variant calling/QC and SplAdder
    are added as child jobs with tarball consolidation as a follow-on.

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str r1_cutadapt: FileStore ID of read 1 fastq
    :param str r2_cutadapt: FileStore ID of read 2 fastq
    """
    job.fileStore.logToMaster('Aligning with STAR: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # STAR is never given more than 16 threads
    cores = min(inputs.cores, 16)

    # Retrieve files
    fastqs = ((r1_cutadapt, 'R1_cutadapt.fastq'), (r2_cutadapt, 'R2_cutadapt.fastq'))
    for file_id, local_name in fastqs:
        job.fileStore.readGlobalFile(file_id, os.path.join(work_dir, local_name))

    # Get starIndex
    download_url(inputs.star_index, work_dir, 'starIndex.tar.gz')
    index_tarball = os.path.join(work_dir, 'starIndex.tar.gz')
    subprocess.check_call(['tar', '-xvf', index_tarball, '-C', work_dir])

    # Parameters
    parameters = [
        '--runThreadN', str(cores),
        '--genomeDir', '/data/starIndex',
        '--outFileNamePrefix', 'rna',
        '--outSAMtype', 'BAM', 'SortedByCoordinate',
        '--outSAMunmapped', 'Within',
        '--quantMode', 'TranscriptomeSAM',
        '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
        '--outFilterType', 'BySJout',
        '--outFilterMultimapNmax', '20',
        '--outFilterMismatchNmax', '999',
        '--outFilterMismatchNoverReadLmax', '0.04',
        '--alignIntronMin', '20',
        '--alignIntronMax', '1000000',
        '--alignMatesGapMax', '1000000',
        '--alignSJoverhangMin', '8',
        '--alignSJDBoverhangMin', '1',
        '--sjdbScore', '1',
        '--readFilesIn', '/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq',
    ]

    # Call: STAR Map
    star_image = 'quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80'
    docker_call(tool=star_image, work_dir=work_dir, parameters=parameters)

    # Call Samtools Index
    samtools_image = 'quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c'
    index_command = ['index', '/data/rnaAligned.sortedByCoord.out.bam']
    docker_call(work_dir=work_dir, parameters=index_command, tool=samtools_image)

    # fileStore: store the outputs, then drop the fastqs that are no longer needed
    sorted_bam = os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam')
    bam_id = job.fileStore.writeGlobalFile(sorted_bam)
    bai_id = job.fileStore.writeGlobalFile(sorted_bam + '.bai')
    for stale_id in (r1_cutadapt, r2_cutadapt):
        job.fileStore.deleteGlobalFile(stale_id)

    # Launch children and follow-on
    vcqc_job = job.addChildJobFn(variant_calling_and_qc, inputs, bam_id, bai_id, cores=2, disk='30G')
    vcqc_id = vcqc_job.rv()
    spladder_job = job.addChildJobFn(spladder, inputs, bam_id, bai_id, disk='30G')
    spladder_id = spladder_job.rv()
    job.addFollowOnJobFn(consolidate_output_tarballs, inputs, vcqc_id, spladder_id, disk='30G')
def run_kallisto(job, r1_id, r2_id, kallisto_index_url):
    """
    Quantify RNA abundance with Kallisto for single- or paired-end reads

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, otherwise pass None for single-end)
    :param str kallisto_index_url: URL of the Kallisto index file
    :return: FileStoreID of the tarball holding the Kallisto output files
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=kallisto_index_url, name='kallisto_hg38.idx', work_dir=work_dir)

    quant_args = ['quant',
                  '-i', '/data/kallisto_hg38.idx',
                  '-t', str(job.cores),
                  '-o', '/data/',
                  '-b', '100']

    # R1 is always staged; R2 only when both IDs were supplied
    job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2_cutadapt.fastq'))
        quant_args += ['/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq']
    else:
        # Single-end mode is run with the fixed -l 200 / -s 15 values
        quant_args += ['--single', '-l', '200', '-s', '15', '/data/R1_cutadapt.fastq']

    # Call: Kallisto
    docker_call(job=job,
                tool='quay.io/ucsc_cgl/kallisto:0.42.4--35ac87df5b21a8e8e8d159f26864ac1e1db8cf86',
                work_dir=work_dir,
                parameters=quant_args)

    # Tar output files together and store in fileStore
    result_names = ('run_info.json', 'abundance.tsv', 'abundance.h5')
    result_paths = [os.path.join(work_dir, name) for name in result_names]
    tarball_files(tar_name='kallisto.tar.gz', file_paths=result_paths, output_dir=work_dir)
    tarball_path = os.path.join(work_dir, 'kallisto.tar.gz')
    return job.fileStore.writeGlobalFile(tarball_path)
def variant_calling_and_qc(job, inputs, bam_id, bai_id):
    """
    Perform variant calling with samtools and QC with CheckBias

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of qc tarball
    :rtype: str
    :raises RuntimeError: if no '*counts.tsv*' file is present in the work dir after QC
    """
    job.fileStore.logToMaster('Variant calling and QC: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    input_info = [(inputs.genome, 'genome.fa'),
                  (inputs.positions, 'positions.tsv'),
                  (inputs.genome_index, 'genome.fa.fai'),
                  (inputs.gtf, 'annotation.gtf'),
                  (inputs.gtf_m53, 'annotation.m53')]
    for url, fname in input_info:
        download_url(url, work_dir=work_dir, name=fname)

    # Part 1: Variant Calling
    # NOTE(review): 'SP' appears twice in the -t tag list; left untouched.
    variant_command = ['mpileup',
                       '-f', 'genome.fa',
                       '-l', 'positions.tsv',
                       '-v', 'alignment.bam',
                       '-t', 'DP,SP,INFO/AD,INFO/ADF,INFO/ADR,INFO/DPR,SP',
                       '-o', '/data/output.vcf.gz']
    docker_call(work_dir=work_dir, parameters=variant_command, sudo=inputs.sudo,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')

    # Part 2: QC
    qc_command = ['-o', 'qc',
                  '-n', 'alignment.bam',
                  '-a', 'annotation.gtf',
                  '-m', 'annotation.m53']
    # Pass the same sudo setting as the samtools call above, so both containers
    # are launched the same way.
    docker_call(work_dir=work_dir, parameters=qc_command, sudo=inputs.sudo,
                tool='jvivian/checkbias:612f129--b08a1fb6526a620bbb0304b08356f2ae7c3c0ec3')

    # Write output to fileStore and return ids
    counts_matches = glob(os.path.join(work_dir, '*counts.tsv*'))
    if not counts_matches:
        # Fail with a descriptive message instead of a bare IndexError
        raise RuntimeError("Couldn't find a counts.tsv file in {}".format(work_dir))
    output_tsv = counts_matches[0]
    output_vcf = os.path.join(work_dir, 'output.vcf.gz')
    tarball_files('vcqc.tar.gz', file_paths=[output_tsv, output_vcf], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'vcqc.tar.gz'))
def prepare_input(job, sample, config, enqueue_consolidation=True):
    """
    Stage the inputs for one sample, split its BAM into chunks, and enqueue a job per chunk

    The reference, params file and BAM are each either read from the Toil
    fileStore (when their URL starts with TOIL_JOBSTORE_PROTOCOL) or downloaded.
    The BAM is indexed and scanned for the smallest alignment start and largest
    alignment end. That range is cut into partitions of config.partition_size,
    each padded by config.partition_margin on both sides. Every partition is
    extracted with samtools view, and each non-empty chunk is handed to a
    run_margin_phase child job. A merge_chunks follow-on is always added, and a
    consolidate_output follow-on is chained after it when requested.

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param sample: five-element sequence (uuid, url, contig_name, reference_url, params_url)
    :param config: argument namespace; it is copied, so the caller's object is not modified
    :param bool enqueue_consolidation: if True, chain consolidate_output after merge_chunks
    :return: return value promise of the consolidation job when enqueue_consolidation is
             True, otherwise of the merge job
    :raises UserError: if no BAM index file exists after indexing
    """
    # job prep: work on a private copy of the config and attach the sample fields
    config = argparse.Namespace(**vars(config))
    uuid, url, contig_name, reference_url, params_url = sample
    config.uuid = uuid
    config.contig_name = contig_name
    config.reference_url = reference_url
    config.params_url = params_url
    if config.intermediate_file_location is not None:
        config.intermediate_file_location = os.path.join(config.intermediate_file_location, uuid)
        mkdir_p(config.intermediate_file_location)
    work_dir = job.fileStore.getLocalTempDir()
    start = time.time()
    log(job, "{}".format(datetime.datetime.now()), config.uuid, 'START')
    log(job,
        "Preparing input with URL:{}, contig:{}, reference_url:{}, params_url:{}"
        .format(url, contig_name, reference_url, params_url),
        uuid, 'prepare_input')

    # todo global resource estimation
    config.maxCores = min(config.maxCores, multiprocessing.cpu_count())
    config.defaultCores = min(MP_CPU, config.maxCores)
    config.maxMemory = min(config.maxMemory, int(physicalMemory() * .95))
    #config.disk

    # download references - TOIL_JOBSTORE_PROTOCOL queries are so this function can be imported

    #ref fasta
    if reference_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        ref_genome_fileid = reference_url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1)
        ref_genome_filename = "{}.reference.{}.fa".format(uuid, contig_name)
        job.fileStore.readGlobalFile(ref_genome_fileid, os.path.join(work_dir, ref_genome_filename))
    else:
        download_url(reference_url, work_dir=work_dir)
        ref_genome_filename = os.path.basename(reference_url)
        ref_genome_fileid = job.fileStore.writeGlobalFile(os.path.join(work_dir, ref_genome_filename))
    ref_genome_size = os.stat(os.path.join(work_dir, ref_genome_filename)).st_size
    config.reference_genome_fileid = ref_genome_fileid

    #params
    if params_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        params_fileid = params_url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1)
    else:
        download_url(params_url, work_dir=work_dir)
        params_filename = os.path.basename(params_url)
        params_fileid = job.fileStore.writeGlobalFile(os.path.join(work_dir, params_filename))
    config.params_fileid = params_fileid

    # download bam
    if url.startswith(TOIL_JOBSTORE_PROTOCOL):
        bam_filename = "{}.input.{}.bam".format(uuid, contig_name)
        job.fileStore.readGlobalFile(url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1),
                                     os.path.join(work_dir, bam_filename))
    else:
        download_url(url, work_dir=work_dir)
        bam_filename = os.path.basename(url)
    data_bam_location = os.path.join("/data", bam_filename)
    workdir_bam_location = os.path.join(work_dir, bam_filename)

    # index the bam
    _index_bam(job, config, work_dir, bam_filename)

    # sanity check
    workdir_bai_location = os.path.join(work_dir, bam_filename + ".bai")
    if not os.path.isfile(workdir_bai_location):
        raise UserError("BAM index file not created for {}: {}".format(bam_filename, workdir_bai_location))

    # get start and end location: smallest alignment start and largest alignment end
    start_idx = sys.maxint
    end_idx = 0
    with closing(pysam.AlignmentFile(workdir_bam_location,
                                     'rb' if bam_filename.endswith("bam") else 'r')) as aln:
        for read in aln.fetch():
            align_start = read.reference_start
            align_end = read.reference_end
            start_idx = min([start_idx, align_start])
            end_idx = max([end_idx, align_end])
    # Two fields, two arguments: the uuid is already passed to log() separately,
    # so it must not be consumed by the first placeholder.
    log(job, "start_pos:{}, end_pos:{}".format(start_idx, end_idx), uuid, 'prepare_input')

    # get reads from positions: walk the covered range in partition_size steps;
    # each chunk is widened by partition_margin on both sides of its boundaries
    chunk_infos = list()
    idx = start_idx
    while idx < end_idx:
        ci = {CI_UUID: uuid}
        ci[CI_CHUNK_BOUNDARY_START] = idx
        chunk_start = idx - config.partition_margin
        ci[CI_CHUNK_START] = chunk_start
        idx += config.partition_size
        ci[CI_CHUNK_BOUNDARY_END] = idx
        chunk_end = idx + config.partition_margin
        ci[CI_CHUNK_END] = chunk_end
        chunk_infos.append(ci)

    # enqueue jobs
    log(job, "Enqueueing {} jobs".format(len(chunk_infos)), uuid, 'prepare_input')
    idx = 0
    enqueued_jobs = 0
    returned_tarballs = list()
    for ci in chunk_infos:
        #prep
        ci[CI_CHUNK_INDEX] = idx
        chunk_start = ci[CI_CHUNK_START]
        chunk_end = ci[CI_CHUNK_END]
        chunk_position_description = "{}:{}-{}".format(config.contig_name, chunk_start, chunk_end)
        bam_split_command = ["view", "-b", data_bam_location, chunk_position_description]
        chunk_name = "{}.{}.bam".format(config.uuid, idx)

        #write chunk
        chunk_location = os.path.join(work_dir, chunk_name)
        with open(chunk_location, 'w') as out:
            docker_call(job, config, work_dir, bam_split_command, DOCKER_SAMTOOLS_IMG, DOCKER_SAMTOOLS_TAG,
                        outfile=out)

        #document read count
        chunk_size = os.stat(chunk_location).st_size
        ci[CI_CHUNK_SIZE] = chunk_size
        ci[CI_REF_FA_SIZE] = ref_genome_size
        read_count = prepare_input__get_bam_read_count(job, work_dir, chunk_name)
        ci[CI_READ_COUNT] = read_count
        log(job,
            "chunk from {} for idx {} is {}b ({}mb) and has {} reads".format(
                chunk_position_description, idx, chunk_size, int(chunk_size / 1024 / 1024), read_count),
            uuid, 'prepare_input')
        if config.intermediate_file_location is not None:
            copy_files(file_paths=[chunk_location], output_dir=config.intermediate_file_location)

        # enqueue marginPhase job (chunks without reads are skipped)
        if read_count > 0:
            chunk_fileid = job.fileStore.writeGlobalFile(chunk_location)
            mp_cores = config.defaultCores
            # Resource requests scale with chunk and reference size, capped by the configured maxima
            mp_mem = int(min(int(chunk_size * MP_MEM_BAM_FACTOR + ref_genome_size * MP_MEM_REF_FACTOR),
                             config.maxMemory))
            mp_disk = int(min(int(chunk_size * MP_DSK_BAM_FACTOR + ref_genome_size * MP_DSK_REF_FACTOR +
                                  (0 if config.cpecan_probabilities else MP_DSK_CPECAN_FACTOR) * chunk_size),
                              config.maxDisk))
            log(job,
                "requesting {} cores, {}b ({}mb) disk, {}b ({}gb) mem".format(
                    mp_cores, mp_disk, int(mp_disk / 1024 / 1024), mp_mem, int(mp_mem / 1024 / 1024 / 1024)),
                "{}.{}".format(uuid, idx), 'prepare_input')
            # Both requests are passed as whole-number "<n>K" strings; truncate after
            # dividing so the disk value can never be rendered as a float.
            mp_mem = str(int(mp_mem / 1024)) + "K"
            mp_disk = str(int(mp_disk / 1024)) + "K"
            margin_phase_job = job.addChildJobFn(run_margin_phase, config, chunk_fileid, ci,
                                                 memory=mp_mem, cores=mp_cores, disk=mp_disk)
            returned_tarballs.append(margin_phase_job.rv())
            enqueued_jobs += 1
        idx += 1

    log(job, "Enqueued {} jobs".format(enqueued_jobs), uuid, 'prepare_input')

    # enqueue merging and consolidation job
    merge_job = job.addFollowOnJobFn(merge_chunks, config, returned_tarballs)
    final_return_value = merge_job.rv()
    if enqueue_consolidation:
        consolidation_job = merge_job.addFollowOnJobFn(consolidate_output, config, merge_job.rv())
        final_return_value = consolidation_job.rv()

    # log
    log_generic_job_debug(job, config.uuid, 'prepare_input', work_dir=work_dir)
    log_time(job, "prepare_input", start, config.uuid)

    # return appropriate output
    return final_return_value
def test_download_url(tmpdir):
    """Fetch a URL with download_url and check the named file lands in tmpdir."""
    from toil_lib.urls import download_url

    target_dir = str(tmpdir)
    expected_path = os.path.join(target_dir, 'testy')
    download_url(work_dir=target_dir, url='www.google.com', name='testy')
    assert os.path.exists(expected_path)
def run_star(job, r1_id, r2_id, star_index_url, wiggle=False, sort=True):
    """
    Performs alignment of fastqs to bam via STAR

    --limitBAMsortRAM step added to deal with memory explosion when sorting certain samples.
    The value was chosen to complement the recommended amount of memory to have when running STAR (60G)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, else pass None)
    :param str star_index_url: STAR index tarball
    :param bool wiggle: If True, will output a wiggle file and return it
    :param bool sort: If True, STAR sorts the BAM by coordinate; otherwise the BAM is unsorted
    :return: FileStoreIDs in the order (transcriptome bam, aligned bam, log, splice junctions);
             when wiggle is True the wiggle FileStoreID is inserted after the aligned bam
    :rtype: tuple
    :raises RuntimeError: if sort is True and the sorted BAM produced by STAR is empty
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(job, url=star_index_url, name='starIndex.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir])
    os.remove(os.path.join(work_dir, 'starIndex.tar.gz'))
    # Determine tarball structure - star index contents are either in a subdir or in the tarball itself
    extracted = os.listdir(work_dir)
    star_index = os.path.join('/data', extracted[0]) if len(extracted) == 1 else '/data'
    # Parameter handling for paired / single-end data
    parameters = ['--runThreadN', str(job.cores),
                  '--genomeDir', star_index,
                  '--outFileNamePrefix', 'rna',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1',
                  '--limitBAMsortRAM', '49268954168']
    # Modify parameters based on function arguments
    if sort:
        parameters.extend(['--outSAMtype', 'BAM', 'SortedByCoordinate'])
        aligned_bam = 'rnaAligned.sortedByCoord.out.bam'
    else:
        parameters.extend(['--outSAMtype', 'BAM', 'Unsorted'])
        aligned_bam = 'rnaAligned.out.bam'
    if wiggle:
        parameters.extend(['--outWigType', 'bedGraph',
                           '--outWigStrand', 'Unstranded',
                           '--outWigReferencesPrefix', 'chr'])
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq', '/data/R2.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq'])
    # Call: STAR Mapping
    dockerCall(job=job, tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
               workDir=work_dir, parameters=parameters)
    # Check output bam isn't size zero if sorted. This is an explicit check rather than
    # `assert(cond, msg)`: that form asserts a non-empty tuple, which is always true, and
    # assert statements are stripped entirely under `python -O`.
    aligned_bam_path = os.path.join(work_dir, aligned_bam)
    if sort and os.stat(aligned_bam_path).st_size == 0:
        raise RuntimeError('Aligned bam failed to sort. Ensure sufficient memory is free.')
    # Write to fileStore
    aligned_id = job.fileStore.writeGlobalFile(aligned_bam_path)
    transcriptome_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.toTranscriptome.out.bam'))
    log_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaLog.final.out'))
    sj_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaSJ.out.tab'))
    if wiggle:
        wiggle_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaSignal.UniqueMultiple.str1.out.bg'))
        return transcriptome_id, aligned_id, wiggle_id, log_id, sj_id
    else:
        return transcriptome_id, aligned_id, log_id, sj_id
def star(job, inputs, r1_cutadapt, r2_cutadapt):
    """
    Align paired fastqs with STAR, index the BAM, and schedule downstream jobs

    The sorted BAM and its samtools index are written to the fileStore, the
    input fastqs are deleted from it, and variant calling/QC and SplAdder are
    added as child jobs with tarball consolidation as a follow-on.

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str r1_cutadapt: FileStore ID of read 1 fastq
    :param str r2_cutadapt: FileStore ID of read 2 fastq
    """
    job.fileStore.logToMaster('Aligning with STAR: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # STAR is never given more than 16 threads
    cores = min(inputs.cores, 16)

    # Retrieve files
    r1_local = os.path.join(work_dir, 'R1_cutadapt.fastq')
    r2_local = os.path.join(work_dir, 'R2_cutadapt.fastq')
    job.fileStore.readGlobalFile(r1_cutadapt, r1_local)
    job.fileStore.readGlobalFile(r2_cutadapt, r2_local)

    # Get starIndex
    download_url(job=job, url=inputs.star_index, work_dir=work_dir, name='starIndex.tar.gz')
    untar_command = ['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir]
    subprocess.check_call(untar_command)

    # Parameters, assembled from thematic groups into one flat argument list
    run_options = ['--runThreadN', str(cores),
                   '--genomeDir', '/data/starIndex',
                   '--outFileNamePrefix', 'rna']
    output_options = ['--outSAMtype', 'BAM', 'SortedByCoordinate',
                      '--outSAMunmapped', 'Within',
                      '--quantMode', 'TranscriptomeSAM',
                      '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD']
    filter_options = ['--outFilterType', 'BySJout',
                      '--outFilterMultimapNmax', '20',
                      '--outFilterMismatchNmax', '999',
                      '--outFilterMismatchNoverReadLmax', '0.04']
    align_options = ['--alignIntronMin', '20',
                     '--alignIntronMax', '1000000',
                     '--alignMatesGapMax', '1000000',
                     '--alignSJoverhangMin', '8',
                     '--alignSJDBoverhangMin', '1',
                     '--sjdbScore', '1']
    read_options = ['--readFilesIn', '/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq']
    parameters = run_options + output_options + filter_options + align_options + read_options

    # Call: STAR Map
    docker_call(job=job,
                tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir,
                parameters=parameters)

    # Call Samtools Index
    docker_call(job=job,
                work_dir=work_dir,
                parameters=['index', '/data/rnaAligned.sortedByCoord.out.bam'],
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')

    # fileStore: store the outputs, then drop the fastqs that are no longer needed
    bam_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    bai_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam.bai'))
    job.fileStore.deleteGlobalFile(r1_cutadapt)
    job.fileStore.deleteGlobalFile(r2_cutadapt)

    # Launch children and follow-on
    vcqc_job = job.addChildJobFn(variant_calling_and_qc, inputs, bam_id, bai_id, cores=2, disk='30G')
    vcqc_id = vcqc_job.rv()
    spladder_job = job.addChildJobFn(spladder, inputs, bam_id, bai_id, disk='30G')
    spladder_id = spladder_job.rv()
    job.addFollowOnJobFn(consolidate_output_tarballs, inputs, vcqc_id, spladder_id, disk='30G')
def run_single_cell(job, sample, config):
    """
    Performs single cell analysis through the quay.io/ucsc_cgl/kallisto_sc image (which uses
    code from the repo: https://github.com/pachterlab/scRNA-Seq-TCC-prep).
    Output includes TCC matrix from kallisto process.

    Three sample types are handled:

    * ``"plot"``   - the single URL is a ``.tar.gz`` of previous output; the post-processing
      and kallisto matrices are read out of the archive and only the graphing step is run.
    * ``"pseudo"`` - fastq input processed by the kallisto_sc image.
    * ``"quant"``  - fastq input quantified by the ``kallisto_sc_quant`` image, then converted
      to the pseudo layout and post-processed here.

    Any other type fails the ``require`` check after the inputs have been downloaded.

    :param job: toil job
    :param sample: a (UUID, type, url(s)) triple as constructed by parse_samples
    :param config: configuration for toil job
    """
    # Work on a shallow copy so that the attributes set below (cores, uuid) land on the copy
    # rather than on the namespace object that was passed in.
    config = argparse.Namespace(**vars(config))
    config.cores = min(config.maxCores, multiprocessing.cpu_count())
    work_dir = job.fileStore.getLocalTempDir()
    # Get input files
    uuid, sample_type, urls = sample
    config.uuid = uuid

    if sample_type == "plot":
        # Handle kallisto output file (only works w/ one file for now)
        filename = os.path.basename(urls[0])
        download_url(job, url=urls[0], name=filename, work_dir=work_dir)
        # The post, kallisto and plots folders live in a root folder named after the archive.
        # Remove the literal ".tar.gz" suffix; a character-set strip would also eat trailing
        # '.', 't', 'a', 'r', 'g', 'z' characters that belong to the name itself.
        archive_suffix = ".tar.gz"
        if filename.endswith(archive_suffix):
            root_dir = filename[:-len(archive_suffix)]
        else:
            root_dir = filename
        kallisto_output = None  # could just forward the kallisto output
        post_processing_output = None  # same with this

        # closing() guarantees the archive handle is released once the members are copied
        with closing(tarfile.open(name=os.path.join(work_dir, filename))) as tar:

            def tar_to_global(folder, path):
                """Copy one archive member into the global job store and return its ID."""
                member_path = os.path.join(root_dir, folder, path)
                with closing(tar.extractfile(member_path)) as member:
                    data = member.read()
                with job.fileStore.writeGlobalFileStream() as (stream, file_id):
                    stream.write(data)
                return file_id

            tcc_matrix_id = tar_to_global("post", TCC_MATRIX_FILENAME)
            pwise_dist_l1_id = tar_to_global("post", PWISE_DIST_FILENAME)
            nonzero_ec_id = tar_to_global("post", NONZERO_EC_FILENAME)
            kallisto_matrix_id = tar_to_global("post", KALLISTO_MATRIX_FILENAME)
            matrix_tsv_id = tar_to_global("kallisto", "matrix.tsv")
            matrix_cells_id = tar_to_global("kallisto", "matrix.cells")
    else:
        # Handle fastq file(s)
        input_location = os.path.join(work_dir, "fastq_input")
        os.mkdir(input_location)
        for url in urls:
            if url.endswith(('.tar', '.tar.gz')):
                # Tarballs are unpacked into the input folder and then discarded
                tar_path = os.path.join(work_dir, os.path.basename(url))
                download_url(job, url=url, work_dir=work_dir)
                subprocess.check_call(['tar', '-xvf', tar_path, '-C', input_location])
                os.remove(tar_path)
            elif url.endswith('.gz'):
                # Plain gzipped files are decompressed in place
                download_url(job, url=url, work_dir=input_location)
                subprocess.check_call(
                    ['gunzip', os.path.join(input_location, os.path.basename(url))])
            else:
                job.fileStore.logToMaster("Download url " + str(url))
                download_url(job, url=url, work_dir=input_location)
        # Generate configuration JSON
        with open(os.path.join(work_dir, "config.json"), 'w') as config_file:
            config_file.write(build_patcherlab_config(config))
        # Get Kallisto index
        download_url(job, url=config.kallisto_index, name='kallisto_index.idx',
                     work_dir=work_dir)
        # Create other locations for patcherlab stuff
        tcc_dir = os.path.join(work_dir, "tcc")
        os.mkdir(tcc_dir)
        os.mkdir(os.path.join(work_dir, "output"))
        if sample_type == "pseudo":
            # Call docker image
            dockerCall(job, tool='quay.io/ucsc_cgl/kallisto_sc:latest', workDir=work_dir,
                       parameters=["/data/config.json"])
        else:
            # quantification of quake brain-style paired end fastqs, each for a different cell
            require(sample_type == "quant",
                    "invalid type " + sample_type + " found in manifest ")
            quant_output = os.path.join(work_dir, "quant_output")
            os.mkdir(quant_output)
            # Call docker image
            dockerCall(job, tool='kallisto_sc_quant', workDir=work_dir,
                       parameters=["/data/kallisto_index.idx",
                                   "/data/quant_output",
                                   str(config.cores),
                                   "/data/fastq_input"])
            # Consolidate abundances for the various cells: one <cell>.tsv per output folder
            consolidated = os.path.join(work_dir, "quant_consolidated")
            os.mkdir(consolidated)
            for output_folder in os.listdir(quant_output):
                shutil.copy(os.path.join(quant_output, output_folder, "abundance.tsv"),
                            os.path.join(consolidated, output_folder + ".tsv"))
            # quant to pseudo
            quant_to_pseudo(None, consolidated, tcc_dir)
            # run post-processing
            # save_dir should be the same as specified in build_pachterlab_config. It may be
            # worth refactoring so that these don't have to be manually synced, although
            # there's no reason for these values to ever change and thus become desynced.
            save_dir = os.path.join(work_dir, "save")
            os.mkdir(save_dir)
            prep_tcc_matrix(job, threads=config.cores, tcc_output_dir=tcc_dir,
                            save_dir=save_dir)
        # Irrespective of whether quant or pseudo, because of quant-to-pseudo conversion
        # Build tarfile of output
        output_files = glob(os.path.join(work_dir, "tcc", "*"))
        tarball_files(tar_name='kallisto_output.tar.gz', file_paths=output_files,
                      output_dir=work_dir)
        kallisto_output = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'kallisto_output.tar.gz'))
        # Consolidate post-processing output
        tcc_matrix_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', TCC_MATRIX_FILENAME))
        pwise_dist_l1_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', PWISE_DIST_FILENAME))
        nonzero_ec_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'save', NONZERO_EC_FILENAME))
        kallisto_matrix_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'tcc', 'matrix.ec'))
        post_processing_output = {
            TCC_MATRIX_FILENAME: tcc_matrix_id,
            PWISE_DIST_FILENAME: pwise_dist_l1_id,
            NONZERO_EC_FILENAME: nonzero_ec_id,
            KALLISTO_MATRIX_FILENAME: kallisto_matrix_id  # technically redundant
        }
        # Prepare files to send to plots for SC3
        matrix_tsv_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, "tcc", "matrix.tsv"))
        matrix_cells_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, "tcc", "matrix.cells"))

    # Graphing step
    if config.generate_graphs:
        graphical_output = job.addChildJobFn(run_data_analysis, config, tcc_matrix_id,
                                             pwise_dist_l1_id, nonzero_ec_id,
                                             kallisto_matrix_id, matrix_tsv_id,
                                             matrix_cells_id).rv()
        job.addFollowOnJobFn(consolidate_output, config, kallisto_output,
                             graphical_output, post_processing_output)
    else:
        # converts to UUID name scheme and transfers to output location
        consolidate_output(job, config, kallisto_output=kallisto_output,
                           graphical_output=None,
                           post_processing_output=post_processing_output)
def download_run_and_upload(job, master_ip, inputs, spark_on_toil):
    """
    Monolithic job that calls data download, conversion, transform, upload.
    Previously, this was not monolithic; change came in due to #126/#134.

    When ``inputs.run_local`` is set the inputs are downloaded into a local temp directory
    (stored on ``inputs.local_dir``); otherwise they are staged on HDFS under a
    per-sample sub-directory. ``remove_file`` is invoked for that sub-directory both after
    a successful run and, before re-raising, when any step fails.
    """
    def url_basename(url):
        # Last path component of a URL, ignoring the scheme
        return url.split('://')[-1].split('/')[-1]

    master_ip = MasterAddress(master_ip)

    bam_name = url_basename(inputs.sample)
    # File name without its final extension
    sample_name = os.path.splitext(bam_name)[0]
    hdfs_subdir = sample_name + "-dir"

    if not inputs.run_local:
        inputs.local_dir = None
        hdfs_dir = "hdfs://{0}:{1}/{2}".format(master_ip, HDFS_MASTER_PORT, hdfs_subdir)
    else:
        inputs.local_dir = job.fileStore.getLocalTempDir()
        # Without a native ADAM install, paths are given as /data/
        hdfs_dir = "/data/" if inputs.native_adam_path is None else inputs.local_dir

    try:
        hdfs_prefix = hdfs_dir + "/" + sample_name
        hdfs_bam = hdfs_dir + "/" + bam_name
        hdfs_snps = hdfs_dir + "/" + url_basename(inputs.dbsnp)

        # Stage the sample and the known-sites file
        if inputs.run_local:
            download_url(job, inputs.sample, work_dir=inputs.local_dir)
            download_url(job, inputs.dbsnp, work_dir=inputs.local_dir)
        else:
            download_data(job, master_ip, inputs, inputs.dbsnp, inputs.sample,
                          hdfs_snps, hdfs_bam)

        # Convert to ADAM format
        adam_input = hdfs_prefix + ".adam"
        adam_snps = hdfs_dir + "/snps.var.adam"
        adam_convert(job, master_ip, inputs, hdfs_bam, hdfs_snps, adam_input, adam_snps,
                     spark_on_toil)

        # Run the transform
        adam_output = hdfs_prefix + ".processed.bam"
        adam_transform(job, master_ip, inputs, adam_input, adam_snps, hdfs_dir,
                       adam_output, spark_on_toil)

        # Ship the processed BAM to the requested output location, if any
        if inputs.output_dir:
            out_file = inputs.output_dir + "/" + sample_name + inputs.suffix + ".bam"
            if inputs.run_local:
                local_adam_output = "%s/%s.processed.bam" % (inputs.local_dir, sample_name)
                move_files([local_adam_output], inputs.output_dir)
            else:
                upload_data(job, master_ip, inputs, adam_output, out_file, spark_on_toil)

        remove_file(master_ip, hdfs_subdir, spark_on_toil)
    except:
        # Clean up the staged data on any failure, then let the original error propagate
        remove_file(master_ip, hdfs_subdir, spark_on_toil)
        raise
def run_single_cell(job, sample, config):
    """
    Performs single cell analysis through the quay.io/ucsc_cgl/kallisto_sc image (which uses
    code from the repo: https://github.com/pachterlab/scRNA-Seq-TCC-prep).
    Output includes TCC matrix from kallisto process.

    :param job: toil job
    :param sample: a (UUID, urls) pair as constructed by the 'parse_samples' function
    :param config: configuration for toil job
    """
    # Copy the namespace so attributes set here land on the copy
    config = argparse.Namespace(**vars(config))
    config.cores = min(config.maxCores, multiprocessing.cpu_count())
    file_store = job.fileStore
    work_dir = file_store.getLocalTempDir()

    # Generate configuration JSON
    config_path = os.path.join(work_dir, "config.json")
    with open(config_path, 'w') as config_file:
        config_file.write(build_patcherlab_config(config))

    # Get Kallisto index
    download_url(job, url=config.kallisto_index, name='kallisto_index.idx',
                 work_dir=work_dir)

    # Get input files
    input_location = os.path.join(work_dir, "fastq_input")
    os.mkdir(input_location)
    uuid, urls = sample
    config.uuid = uuid
    for url in urls:
        if not url.endswith(('.tar', '.tar.gz')):
            # Plain files go straight into the input folder
            download_url(job, url=url, work_dir=input_location)
            continue
        # Tarballs are fetched, unpacked into the input folder, then removed
        tar_path = os.path.join(work_dir, os.path.basename(url))
        download_url(job, url=url, work_dir=work_dir)
        subprocess.check_call(['tar', '-xvf', tar_path, '-C', input_location])
        os.remove(tar_path)

    # Create other locations for patcherlab stuff
    for subdir in ("tcc", "output"):
        os.mkdir(os.path.join(work_dir, subdir))

    # Call docker image
    dockerCall(job, tool='quay.io/ucsc_cgl/kallisto_sc:latest', workDir=work_dir,
               parameters=["/data/config.json"])

    # Build tarfile of output
    output_files = []
    for tcc_name in ('run_info.json', 'matrix.tsv', 'matrix.ec', 'matrix.cells'):
        output_files.append(os.path.join(work_dir, "tcc", tcc_name))
    tarball_files(tar_name='kallisto_output.tar.gz', file_paths=output_files,
                  output_dir=work_dir)
    kallisto_output = file_store.writeGlobalFile(
        os.path.join(work_dir, 'kallisto_output.tar.gz'))

    if not config.generate_graphs:
        # converts to UUID name scheme and transfers to output location
        consolidate_output(job, config, kallisto_output=kallisto_output,
                           graphical_output=None)
        return

    # Graphing step: publish the matrices that the analysis job consumes
    graph_sources = (
        ('save', 'TCC_matrix.dat'),
        ('save', 'pwise_dist_L1.dat'),
        ('save', 'nonzero_ec.dat'),
        ('tcc', 'matrix.ec'),
    )
    graph_ids = [file_store.writeGlobalFile(os.path.join(work_dir, folder, name))
                 for folder, name in graph_sources]
    tcc_matrix_id, pwise_dist_l1_id, nonzero_ec_id, kallisto_matrix_id = graph_ids
    analysis_job = job.addChildJobFn(run_data_analysis, config, tcc_matrix_id,
                                     pwise_dist_l1_id, nonzero_ec_id, kallisto_matrix_id)
    graphical_output = analysis_job.rv()
    job.addFollowOnJobFn(consolidate_output, config, kallisto_output, graphical_output)
def docker_call(job,
                tool,
                parameters=None,
                work_dir='.',
                rm=True,
                detached=False,
                env=None,
                outfile=None,
                inputs=None,
                outputs=None,
                docker_parameters=None,
                check_output=False,
                mock=None,
                defer=None,
                container_name=None,
                mounts=None):
    """
    Calls Docker, passing along parameters and tool.

    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools)
    :param list[str] parameters: Command line arguments to be passed to the tool
    :param str work_dir: Directory to mount into the container via `-v`. Destination
           convention is /data
    :param bool rm: Should the container be run with the --rm flag (Should it be removed upon
           container exit)? rm and detached are mutually exclusive in Docker.  This is the flag
           passed to docker and is independent of the defer flag.  If this is set to True and
           `defer` is None, `defer` takes the value `docker_call.RM`.
    :param bool detached: Should the container be run with the --detached flag (Should it be run
           in detached mode)? See `rm` above.
    :param dict[str,str] env: Environment variables to be added (e.g. dict(JAVA_OPTS='-Xmx15G'))
    :param file outfile: Pipe output of Docker call to file handle
    :param list[str] inputs: A list of the input files, relative to work_dir.
    :param dict[str,str] outputs: A dictionary containing the outputs files as keys with either
           None or a url. The value is only used if mock=True
    :param list[str] docker_parameters: Extra arguments appended to the `docker run` invocation
           before the image name
    :param bool check_output: When True (and no outfile is given), this function returns
           docker's output
    :param bool mock: Whether to run in mock mode. If this variable is unset, its value will be
           determined by the environment variable.
    :param int defer: What action should be taken on the container upon job completion?
           docker_call.FORGO will leave the container untouched.
           docker_call.STOP will attempt to stop the container with `docker stop` (useful for
           debugging).
           docker_call.RM will stop the container and then forcefully remove it from the system
           using `docker rm -f`.
           The default value is None and that shadows docker_call.FORGO, unless rm is true.
    :param str container_name: An optional name for your container.
    :param dict mounts: A dictionary of data volumes to mount into the Docker container
           containing host paths as keys and the corresponding container paths as values
    :return: docker's output when check_output is True and outfile is not set, otherwise None
    """
    if mock is None:
        mock = mock_mode()
    if parameters is None:
        parameters = []
    if inputs is None:
        inputs = []
    if outputs is None:
        outputs = {}

    # Docker does not allow the --rm flag to be used when the container is run in detached mode.
    require(not (rm and detached), "Conflicting options 'rm' and 'detached'.")
    # Ensure the user has passed a valid value for defer
    require(defer in (None, docker_call.FORGO, docker_call.STOP, docker_call.RM),
            'Please provide a valid value for defer.')

    for filename in inputs:
        assert os.path.isfile(os.path.join(work_dir, filename))

    if mock:
        # Mock mode never touches docker: each declared output is either created as a
        # placeholder or downloaded from its url, unless it already exists.
        for filename, url in outputs.items():
            file_path = os.path.join(work_dir, filename)
            if not os.path.exists(file_path):
                if url is None:
                    # create mock file
                    with open(file_path, 'w') as f:
                        f.write("contents")  # FIXME
                else:
                    # Imported here because only this branch needs it
                    from toil_lib.urls import download_url
                    download_url(job, url, work_dir=work_dir, name=filename, mock=False)
            assert os.path.exists(file_path)
        return

    if not container_name:
        container_name = _get_container_name(job)
    base_docker_call = ['docker', 'run',
                        '--log-driver=none',
                        '-v', '{}:/data'.format(os.path.abspath(work_dir))]
    if mounts:
        require(isinstance(mounts, dict), "'mounts' parameter must be a dictionary object")
        for k, v in mounts.items():
            base_docker_call.extend(['-v', k + ':' + v])

    # Defer the permission fixing function.  We call this explicitly later on in this function,
    # but we defer it as well to handle unexpected job failure.
    job.defer(_fix_permissions, base_docker_call, tool, work_dir)

    base_docker_call.extend(['--name', container_name])
    if rm:
        base_docker_call.append('--rm')
        if defer is None:
            defer = docker_call.RM
    elif detached:
        base_docker_call += ['-d']
    # Defer the container on-exit action
    job.defer(_docker_kill, container_name, action=defer)

    if env:
        for e, v in env.items():
            base_docker_call.extend(['-e', '{}={}'.format(e, v)])
    if docker_parameters:
        base_docker_call += docker_parameters

    call = base_docker_call + [tool] + parameters
    _log.debug("Calling docker with %s.", " ".join(call))

    # Capture the output instead of returning straight away, so that the permission fix and
    # the output checks below also run when check_output is requested.
    output = None
    if outfile:
        subprocess.check_call(call, stdout=outfile)
    elif check_output:
        output = subprocess.check_output(call)
    else:
        subprocess.check_call(call)

    # Fix root ownership of output files
    _fix_permissions(base_docker_call, tool, work_dir)

    for filename in outputs:
        if not os.path.isabs(filename):
            filename = os.path.join(work_dir, filename)
        assert os.path.isfile(filename)
    return output
def docker_call(tool,
                parameters=None,
                work_dir='.',
                rm=True,
                env=None,
                outfile=None,
                inputs=None,
                outputs=None,
                docker_parameters=None,
                check_output=False,
                mock=None):
    """
    Calls Docker, passing along parameters and tool.

    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools)
    :param list[str] parameters: Command line arguments to be passed to the tool
    :param str work_dir: Directory to mount into the container via `-v`. Destination
           convention is /data
    :param bool rm: Set to True to pass `--rm` flag.
    :param dict[str,str] env: Environment variables to be added (e.g. dict(JAVA_OPTS='-Xmx15G'))
    :param file outfile: Pipe output of Docker call to file handle
    :param list[str] inputs: A list of the input files, relative to work_dir.
    :param dict[str,str] outputs: A dictionary containing the outputs files as keys with either
           None or a url. The value is only used if mock=True
    :param list[str] docker_parameters: Extra arguments appended to the `docker run` invocation
           before the image name
    :param bool check_output: When True (and no outfile is given), this function returns
           docker's output
    :param bool mock: Whether to run in mock mode. If this variable is unset, its value will be
           determined by the environment variable.
    :return: docker's output when check_output is True and outfile is not set, otherwise None
    """
    if mock is None:
        mock = mock_mode()
    if parameters is None:
        parameters = []
    if inputs is None:
        inputs = []
    if outputs is None:
        outputs = {}

    for filename in inputs:
        assert os.path.isfile(os.path.join(work_dir, filename))

    if mock:
        # Mock mode never touches docker: each declared output is either created as a
        # placeholder or downloaded from its url, unless it already exists.
        for filename, url in outputs.items():
            file_path = os.path.join(work_dir, filename)
            if not os.path.exists(file_path):
                if url is None:
                    # create mock file
                    with open(file_path, 'w') as f:
                        f.write("contents")  # FIXME
                else:
                    # Imported here because only this branch needs it
                    from toil_lib.urls import download_url
                    download_url(url, work_dir=work_dir, name=filename)
            assert os.path.exists(file_path)
        return

    base_docker_call = ['docker', 'run',
                        '--log-driver=none',
                        '-v', '{}:/data'.format(os.path.abspath(work_dir))]
    if rm:
        base_docker_call.append('--rm')
    if env:
        for e, v in env.items():
            base_docker_call.extend(['-e', '{}={}'.format(e, v)])
    if docker_parameters:
        base_docker_call += docker_parameters

    command = base_docker_call + [tool] + parameters
    _log.debug("Calling docker with %s.", " ".join(command))

    # Capture the output instead of returning from inside the try block: an early return
    # would skip the `else` clause and leave the output files owned by root.
    output = None
    try:
        if outfile:
            subprocess.check_call(command, stdout=outfile)
        elif check_output:
            output = subprocess.check_output(command)
        else:
            subprocess.check_call(command)
    # Fix root ownership of output files
    except BaseException:
        # Panic avoids hiding the exception raised in the try block
        with panic():
            _fix_permissions(base_docker_call, tool, work_dir)
    else:
        _fix_permissions(base_docker_call, tool, work_dir)

    for filename in outputs:
        if not os.path.isabs(filename):
            filename = os.path.join(work_dir, filename)
        assert os.path.isfile(filename)
    return output
def docker_call(tool=None,
                tools=None,
                parameters=None,
                work_dir='.',
                rm=True,
                env=None,
                outfile=None,
                errfile=None,
                inputs=None,
                outputs=None,
                docker_parameters=None,
                check_output=False,
                return_stderr=False,
                mock=None):
    """
    Calls Docker, passing along parameters and tool.

    :param (str tool | str tools): str tool name of the Docker image to be used
           (e.g. tool='quay.io/ucsc_cgl/samtools') OR str tools of the Docker image to be used
           when piping commands to Docker (e.g. 'quay.io/ucsc_cgl/samtools').
           Both tool and tools are mutually exclusive parameters to docker_call.
    :param list[str] parameters: Command line arguments to be passed to the tool. In `tools`
           mode each element is one stage of the pipe.
    :param str work_dir: Directory to mount into the container via `-v`. Destination
           convention is /data
    :param bool rm: Set to True to pass `--rm` flag.
    :param dict[str,str] env: Environment variables to be added (e.g. dict(JAVA_OPTS='-Xmx15G'))
    :param file outfile: Pipe stdout of Docker call to file handle
    :param file errfile: Pipe stderr of Docker call to file handle. Ignored when both
           check_output and return_stderr are set, since stderr is then merged into the
           returned output.
    :param list[str] inputs: A list of the input files, relative to work_dir.
    :param dict[str,str] outputs: A dictionary containing the outputs files as keys with either
           None or a url. The value is only used if mock=True
    :param list[str] docker_parameters: Extra arguments appended to the `docker run` invocation
           before the image name
    :param bool check_output: When True (and no outfile is given), this function returns
           docker's output
    :param bool return_stderr: When True, this function includes stderr in docker's output
    :param bool mock: Whether to run in mock mode. If this variable is unset, its value will be
           determined by the environment variable.
    :return: docker's output when check_output is True and outfile is not set, otherwise None

    Pipes in docker commands:
    Running a pipe in docker in 'pipe-in-single-container' mode produces command structure
        docker '... | ... | ...' where each '...' command corresponds to each element in the
        'parameters' argument that uses a docker container. This is the most efficient method
        if you want to run a pipe of commands where each command uses the same docker container.

    Example for running command 'head -c 1M /dev/urandom | gzip | gunzip | md5sum 1>&2':
        Running 'pipe-in-single-container' mode:
            command= ['head -c 1M /dev/urandom', 'gzip', 'gunzip', 'md5sum 1>&2']
            docker_work_dir=curr_work_dir
            docker_tools='ubuntu'
            stdout = docker_call(work_dir=docker_work_dir, parameters=command,
                                 tools=docker_tools, check_output=True)
    """
    if mock is None:
        mock = mock_mode()
    if parameters is None:
        parameters = []
    if inputs is None:
        inputs = []
    if outputs is None:
        outputs = {}

    for filename in inputs:
        assert os.path.isfile(os.path.join(work_dir, filename))

    if mock:
        # Mock mode never touches docker: each declared output is either created as a
        # placeholder or downloaded from its url, unless it already exists.
        for filename, url in outputs.items():
            file_path = os.path.join(work_dir, filename)
            if not os.path.exists(file_path):
                if url is None:
                    # create mock file
                    with open(file_path, 'w') as f:
                        f.write("contents")  # FIXME
                else:
                    # Imported here because only this branch needs it
                    from toil_lib.urls import download_url
                    download_url(url, work_dir=work_dir, name=filename)
            assert os.path.exists(file_path)
        return

    base_docker_call = ['docker', 'run',
                        '--log-driver=none',
                        '-v', '{}:/data'.format(os.path.abspath(work_dir))]
    if rm:
        base_docker_call.append('--rm')
    if env:
        for e, v in env.items():
            base_docker_call.extend(['-e', '{}={}'.format(e, v)])
    if docker_parameters:
        base_docker_call += docker_parameters

    require(bool(tools) != bool(tool),
            'Either "tool" or "tools" must contain a value, but not both')

    # NOTE(review): the command is joined into one string and run with shell=True, so
    # arguments containing spaces or shell metacharacters are interpreted by the host
    # shell. Left as is because callers may rely on it - confirm before changing.
    if tools:
        # Pipe functionality: each element in the parameters list must represent a sub-pipe
        # command; format the docker call in the 'pipe-in-single-container' mode
        command = " ".join(base_docker_call + ['--entrypoint /bin/bash', tools,
                                               '-c \'{}\''.format(" | ".join(parameters))])
    else:
        command = " ".join(base_docker_call + [tool] + parameters)
    _log.debug("Calling docker with %s.", command)

    # Capture the output instead of returning from inside the try block: an early return
    # would skip the `else` clause and leave the output files owned by root.
    output = None
    try:
        if outfile:
            subprocess.check_call(command, stdout=outfile, stderr=errfile, shell=True)
        elif check_output:
            # Merging stderr into the returned output takes precedence over errfile
            stderr_target = subprocess.STDOUT if return_stderr else errfile
            output = subprocess.check_output(command, shell=True, stderr=stderr_target)
        else:
            subprocess.check_call(command, stderr=errfile, shell=True)
    # Fix root ownership of output files
    except BaseException:
        # Panic avoids hiding the exception raised in the try block
        with panic():
            _fix_permissions(base_docker_call, tool, tools, work_dir)
    else:
        _fix_permissions(base_docker_call, tool, tools, work_dir)

    for filename in outputs:
        if not os.path.isabs(filename):
            filename = os.path.join(work_dir, filename)
        assert os.path.isfile(filename)
    return output