def validate_import_bam(t, bam_path, fasta_sequences, genome):
    validate_bam_fasta_pairs(bam_path, fasta_sequences, genome)
    return [FileID.forPath(t.importFile('file://' + bam_path), bam_path),
            FileID.forPath(t.importFile('file://' + bam_path + '.bai'), bam_path + '.bai')]
def align_transcripts(args, toil_options):
    """
    Main entry function for transcript alignment toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.ref_genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.ref_genome_fasta)
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp),
                                                          args.annotation_gp)
            input_file_ids.ref_db = FileID.forPath(t.importFile('file://' + args.ref_db_path), args.ref_db_path)
            input_file_ids.modes = {}
            file_ids = [input_file_ids.ref_genome_fasta, input_file_ids.genome_fasta,
                        input_file_ids.annotation_gp, input_file_ids.ref_db]
            for mode in args.transcript_modes:
                input_file_ids.modes[mode] = t.importFile('file://' + args.transcript_modes[mode]['gp'])
                file_ids.append(input_file_ids.modes[mode])
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk=disk_usage)
            results_file_ids = t.start(job)
        else:
            results_file_ids = t.restart()
        for file_path, file_id in results_file_ids.items():
            tools.fileOps.ensure_file_dir(file_path)
            t.exportFile(file_id, 'file://' + file_path)
def augustus_pb(args, toil_options):
    """
    Main entry function for AugustusPB toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    :return:
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.chrom_sizes = FileID.forPath(t.importFile('file://' + args.chrom_sizes), args.chrom_sizes)
            input_file_ids.pb_cfg = FileID.forPath(t.importFile('file://' + args.pb_cfg), args.pb_cfg)
            input_file_ids.hints_gff = FileID.forPath(t.importFile('file://' + args.hints_gff), args.hints_gff)
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk='32G')
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.start(job)
        else:
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_pb_raw_gtf)
        t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_pb_raw_gtf)
        t.exportFile(gtf_file_id, 'file://' + args.augustus_pb_gtf)
        t.exportFile(joined_gp_file_id, 'file://' + args.augustus_pb_gp)
def hints_db(hints_args, toil_options):
    """
    Entry point for hints database Toil pipeline.
    """
    def validate_import_bam(t, bam_path, fasta_sequences, genome):
        validate_bam_fasta_pairs(bam_path, fasta_sequences, genome)
        return [FileID.forPath(t.importFile('file://' + bam_path), bam_path),
                FileID.forPath(t.importFile('file://' + bam_path + '.bai'), bam_path + '.bai')]

    fasta = pyfasta.Fasta(hints_args.fasta)
    fasta_sequences = {(x.split()[0], len(fasta[x])) for x in fasta.keys()}
    with Toil(toil_options) as t:
        if not t.options.restart:
            # load the RNA-seq data, if we have any
            bam_file_ids = {'BAM': {}, 'INTRONBAM': {}}
            for dtype in ['BAM', 'INTRONBAM']:
                if hints_args.genome not in hints_args.cfg[dtype]:
                    continue
                for bam_path in hints_args.cfg[dtype][hints_args.genome]:
                    bam_file_ids[dtype][os.path.basename(bam_path)] = validate_import_bam(t, bam_path,
                                                                                          fasta_sequences,
                                                                                          hints_args.genome)
            # load the IsoSeq data, if we have any
            iso_seq_file_ids = []
            if hints_args.genome in hints_args.cfg['ISO_SEQ_BAM']:
                for bam_path in hints_args.cfg['ISO_SEQ_BAM'][hints_args.genome]:
                    validate_bam_fasta_pairs(bam_path, fasta_sequences, hints_args.genome)
                    iso_seq_file_ids.append(validate_import_bam(t, bam_path, fasta_sequences, hints_args.genome))
            if hints_args.annotation_gp is None:
                annotation_file_id = None
            else:
                annotation_file_id = FileID.forPath(t.importFile('file://' + hints_args.annotation_gp),
                                                    hints_args.annotation_gp)
            if hints_args.protein_fasta is None:
                protein_fasta_file_id = genome_fasta_file_id = None
            else:
                protein_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.protein_fasta),
                                                       hints_args.protein_fasta)
                genome_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.fasta), hints_args.fasta)
            input_file_ids = {'bams': bam_file_ids,
                              'iso_seq_bams': iso_seq_file_ids,
                              'annotation': annotation_file_id,
                              'protein_fasta': protein_fasta_file_id,
                              'genome_fasta': genome_fasta_file_id}
            if len(input_file_ids['bams']) + len(input_file_ids['iso_seq_bams']) > 0:
                logger.info('All BAMs validated for {}. Beginning Toil hints pipeline'.format(hints_args.genome))
            disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids)
            job = Job.wrapJobFn(setup_hints, input_file_ids, disk=disk_usage)
            combined_hints = t.start(job)
        else:
            logger.info('Restarting Toil hints pipeline for {}.'.format(hints_args.genome))
            combined_hints = t.restart()
        tools.fileOps.ensure_file_dir(hints_args.hints_path)
        t.exportFile(combined_hints, 'file://' + hints_args.hints_path)
def write_fasta_to_filestore(toil, fasta_local_path):
    """
    Convenience function that loads a fasta and its associated gdx/flat file into the fileStore.
    Assumes that the paths are consistent with the requirements (i.e. $path.gdx and $path.flat)
    :param toil: Toil context manager
    :param fasta_local_path: Path to local fasta to load.
    :return: Tuple of fileStore IDs for fasta, fasta_gdx, fasta_flat
    """
    fasta_file_id = FileID.forPath(toil.importFile('file:///' + fasta_local_path), fasta_local_path)
    gdx_file_id = FileID.forPath(toil.importFile('file:///' + fasta_local_path + '.gdx'),
                                 fasta_local_path + '.gdx')
    flat_file_id = FileID.forPath(toil.importFile('file:///' + fasta_local_path + '.flat'),
                                  fasta_local_path + '.flat')
    return fasta_file_id, gdx_file_id, flat_file_id
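# Usage sketch for write_fasta_to_filestore (illustrative, not from the
# original sources): `toil_options` and the '/data/genome.fa' path are
# hypothetical, and the helper assumes 'genome.fa.gdx' and 'genome.fa.flat'
# already sit next to the FASTA on disk.
from toil.common import Toil

with Toil(toil_options) as t:
    fasta_id, gdx_id, flat_id = write_fasta_to_filestore(t, '/data/genome.fa')
    # Each returned value is a FileID carrying the on-disk size of its file,
    # which is what lets find_total_disk_usage() budget disk for later jobs.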
def writeGlobalFile(self, localFileName, cleanup=False):
    absLocalFileName = self._resolveAbsoluteLocalPath(localFileName)
    creatorID = self.jobGraph.jobStoreID
    fileStoreID = self.jobStore.writeFile(absLocalFileName, creatorID, cleanup)
    self.localFileMap[fileStoreID].append(absLocalFileName)
    return FileID.forPath(fileStoreID, absLocalFileName)
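# FileID.forPath, used throughout these snippets, pairs a raw job-store ID
# with metadata read from the local copy of the file. A conceptual sketch of
# the idea (illustrative only, not the verbatim Toil implementation):
import os
from toil.fileStores import FileID

def file_id_for_path(file_store_id, file_path):
    # Record the on-disk size so schedulers can reserve disk space for jobs
    # that read this file, without having to ask the job store.
    return FileID(file_store_id, os.stat(file_path).st_size)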
def writeGlobalFileStream(self, cleanup=False):
    """
    Similar to writeGlobalFile, but allows the writing of a stream to the job store.
    The yielded file handle does not need to and should not be closed explicitly.

    :param bool cleanup: is as in
           :func:`toil.fileStores.abstractFileStore.AbstractFileStore.writeGlobalFile`.

    :return: A context manager yielding a tuple of
              1) a file handle which can be written to and
              2) the toil.fileStores.FileID of the resulting file in the job store.
    """
    # TODO: Make this work with FileID
    with self.jobStore.writeFileStream(None if not cleanup else self.jobGraph.jobStoreID) as (backingStream, fileStoreID):
        # We have a string version of the file ID, and the backing stream.
        # We need to yield a stream the caller can write to, and a FileID
        # that accurately reflects the size of the data written to the
        # stream. We assume the stream is not seekable.

        # Make and keep a reference to the file ID, which is currently empty
        fileID = FileID(fileStoreID, 0)

        # Wrap the stream to increment the file ID's size for each byte written
        wrappedStream = WriteWatchingStream(backingStream)

        # When the stream is written to, count the bytes
        def handle(numBytes):
            fileID.size += numBytes
        wrappedStream.onWrite(handle)

        yield wrappedStream, fileID
def _importFile(self, otherCls, url, sharedFileName=None, hardlink=False, symlink=False):
    if issubclass(otherCls, FileJobStore):
        if sharedFileName is None:
            executable = os.stat(url.path).st_mode & stat.S_IXUSR != 0
            absPath = self._getUniqueFilePath(url.path)  # use this to get a valid path to write to in job store
            with self.optionalHardCopy(hardlink):
                self._copyOrLink(url, absPath, symlink=symlink)
            # TODO: os.stat(absPath).st_size consistently gives values lower than
            # getDirSizeRecursively()
            return FileID(self._getFileIdFromPath(absPath), os.stat(absPath).st_size, executable)
        else:
            self._requireValidSharedFileName(sharedFileName)
            path = self._getSharedFilePath(sharedFileName)
            with self.optionalHardCopy(hardlink):
                self._copyOrLink(url, path, symlink=symlink)
            return None
    else:
        return super(FileJobStore, self)._importFile(otherCls, url, sharedFileName=sharedFileName)
def _importFile(self, otherCls, url, sharedFileName=None, hardlink=False):
    """
    Import the file at the given URL using the given job store class to retrieve that file.
    See also :meth:`.importFile`. This method applies a generic approach to importing: it
    asks the other job store class for a stream and writes that stream as either a regular
    or a shared file.

    :param AbstractJobStore otherCls: The concrete subclass of AbstractJobStore that supports
           reading from the given URL and getting the file size from the URL.

    :param urlparse.ParseResult url: The location of the file to import.

    :param str sharedFileName: Optional name to assign to the imported file within the job store

    :return: The jobStoreFileId of the imported file or None if sharedFileName was given
    :rtype: toil.fileStores.FileID or None
    """
    if sharedFileName is None:
        with self.writeFileStream() as (writable, jobStoreFileID):
            size = otherCls._readFromUrl(url, writable)
            return FileID(jobStoreFileID, size)
    else:
        self._requireValidSharedFileName(sharedFileName)
        with self.writeSharedFileStream(sharedFileName) as writable:
            otherCls._readFromUrl(url, writable)
            return None
def writeGlobalFile(self, localFileName, cleanup=False):
    absLocalFileName = self._resolveAbsoluteLocalPath(localFileName)
    creatorID = self.jobDesc.jobStoreID
    fileStoreID = self.jobStore.writeFile(absLocalFileName, creatorID, cleanup)
    if absLocalFileName.startswith(self.localTempDir):
        # Only files in the appropriate directory should become local files
        # we can delete with deleteLocalFile
        self.localFileMap[fileStoreID].append(absLocalFileName)
    return FileID.forPath(fileStoreID, absLocalFileName)
def chaining(args, toil_options):
    """entry point to this program"""
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.hal = FileID.forPath(t.importFile('file://' + args.hal), args.hal)
            input_file_ids.query_sizes = FileID.forPath(t.importFile('file://' + args.query_sizes), args.query_sizes)
            input_file_ids.query_two_bit = FileID.forPath(t.importFile('file://' + args.query_two_bit),
                                                          args.query_two_bit)
            target_two_bit_file_ids = {genome: FileID.forPath(t.importFile('file://' + f), f)
                                       for genome, f in args.target_two_bits.items()}
            input_file_ids.target_two_bits = target_two_bit_file_ids
            job = Job.wrapJobFn(setup, args, input_file_ids)
            chain_file_ids = t.start(job)
        else:
            chain_file_ids = t.restart()
        for chain_file, chain_file_id in chain_file_ids.items():
            tools.fileOps.ensure_file_dir(chain_file)
            t.exportFile(chain_file_id, 'file://' + chain_file)
def writeGlobalFileStream(
    self,
    cleanup: bool = False,
    basename: Optional[str] = None,
    encoding: Optional[str] = None,
    errors: Optional[str] = None
) -> Iterator[Tuple[Union[BinaryIO, TextIO], FileID]]:
    """
    Similar to writeGlobalFile, but allows the writing of a stream to the job store.
    The yielded file handle does not need to and should not be closed explicitly.

    :param encoding: The name of the encoding used to decode the file. Encodings are the same
           as for decode(). Defaults to None which represents binary mode.

    :param errors: Specifies how encoding errors are to be handled. Errors are the same as for
           open(). Defaults to 'strict' when an encoding is specified.

    :param cleanup: is as in :func:`toil.fileStores.abstractFileStore.AbstractFileStore.writeGlobalFile`.

    :param basename: If supported by the backing JobStore, use the given file basename so that
           when searching the job store with a query matching that basename, the file will be
           detected.

    :return: A context manager yielding a tuple of
              1) a file handle which can be written to and
              2) the toil.fileStores.FileID of the resulting file in the job store.
    """
    with self.jobStore.writeFileStream(self.jobDesc.jobStoreID, cleanup, basename, encoding, errors) as (backingStream, fileStoreID):
        # We have a string version of the file ID, and the backing stream.
        # We need to yield a stream the caller can write to, and a FileID
        # that accurately reflects the size of the data written to the
        # stream. We assume the stream is not seekable.

        # Make and keep a reference to the file ID, which is currently empty
        fileID = FileID(fileStoreID, 0)

        # Wrap the stream to increment the file ID's size for each byte written
        wrappedStream = WriteWatchingStream(backingStream)

        # When the stream is written to, count the bytes
        def handle(numBytes: int) -> None:
            # No scope problem here, because we don't assign to a fileID local
            fileID.size += numBytes
        wrappedStream.onWrite(handle)

        yield wrappedStream, fileID
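# Usage sketch for writeGlobalFileStream (illustrative): from inside a
# running job body, assuming `file_store` is that job's file store handle.
with file_store.writeGlobalFileStream(cleanup=True, encoding='utf-8') as (stream, file_id):
    stream.write('some text destined for the job store\n')
# No explicit close is needed; on exit, file_id.size reflects every byte
# counted by the WriteWatchingStream write hook.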
def download_structure(
    file_store: AbstractFileStore,
    index: Dict[str, str],
    existing: Dict[str, str],
    dir_dict: DirectoryStructure,
    into_dir: str,
) -> None:
    """
    Download a whole nested dictionary of files and directories from the
    Toil file store to a local path.

    :param file_store: The Toil file store to download from.

    :param index: Maps from downloaded file path back to input Toil URI.

    :param existing: Maps from file_store_id URI to downloaded file path.

    :param dir_dict: a dict from string to string (for files) or dict (for
           subdirectories) describing a directory structure.

    :param into_dir: The directory to download the top-level dict's files
           into.
    """
    logger.debug("Downloading directory with %s items", len(dir_dict))

    for name, value in dir_dict.items():
        if name == ".":
            # Skip this key that isn't a real child file.
            continue
        if isinstance(value, dict):
            # This is a subdirectory, so make it and download its contents
            logger.debug("Downloading subdirectory %s", name)
            subdir = os.path.join(into_dir, name)
            os.mkdir(subdir)
            download_structure(file_store, index, existing, value, subdir)
        else:
            # This must be a file path uploaded to Toil.
            assert isinstance(value, str)
            assert value.startswith("toilfile:")
            logger.debug("Downloading contained file %s", name)
            dest_path = os.path.join(into_dir, name)
            # So download the file into place
            file_store.readGlobalFile(
                FileID.unpack(value[len("toilfile:"):]), dest_path, symlink=True
            )
            # Update the index dicts
            # TODO: why?
            index[dest_path] = value
            existing[value] = dest_path
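# The 'toilfile:' URIs consumed by download_structure are just packed
# FileIDs. A round-trip sketch with hypothetical values, mirroring how the
# test below builds its structure dict:
from toil.fileStores import FileID

fid = FileID('somejobstorefileid', 42, False)
uri = 'toilfile:' + fid.pack()                      # how dir_dict encodes files
restored = FileID.unpack(uri[len('toilfile:'):])    # how download_structure decodes them
assert restored.size == fid.size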
def augustus(args, coding_gp, toil_options):
    """
    Main entry function for Augustus toil pipeline
    :param args: dictionary of arguments from CAT
    :param coding_gp: genePred with only coding transcripts
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.tm_cfg = FileID.forPath(t.importFile('file://' + args.tm_cfg), args.tm_cfg)
            input_file_ids.coding_gp = FileID.forPath(t.importFile('file://' + coding_gp), coding_gp)
            input_file_ids.ref_psl = FileID.forPath(t.importFile('file://' + args.ref_psl), args.ref_psl)
            input_file_ids.tm_psl = FileID.forPath(t.importFile('file://' + args.filtered_tm_psl),
                                                   args.filtered_tm_psl)
            input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp),
                                                          args.annotation_gp)
            file_ids = [input_file_ids.genome_fasta, input_file_ids.coding_gp, input_file_ids.ref_psl,
                        input_file_ids.tm_psl, input_file_ids.annotation_gp]
            if args.augustus_tmr:
                input_file_ids.augustus_hints_db = FileID.forPath(t.importFile('file://' + args.augustus_hints_db),
                                                                  args.augustus_hints_db)
                input_file_ids.tmr_cfg = FileID.forPath(t.importFile('file://' + args.tmr_cfg), args.tmr_cfg)
                # append the imported FileID (not the raw path) so the hints db
                # size is counted toward the disk estimate
                file_ids.append(input_file_ids.augustus_hints_db)
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, disk_usage, disk=disk_usage)
            tm_file_id, tmr_file_id = t.start(job)
        else:
            tm_file_id, tmr_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_tm_gtf)
        t.exportFile(tm_file_id, 'file://' + args.augustus_tm_gtf)
        if tmr_file_id is not None:
            tools.fileOps.ensure_file_dir(args.augustus_tmr_gtf)
            t.exportFile(tmr_file_id, 'file://' + args.augustus_tmr_gtf)
def writeGlobalFileStream(self, cleanup=False, basename=None):
    """
    Similar to writeGlobalFile, but allows the writing of a stream to the job store.
    The yielded file handle does not need to and should not be closed explicitly.

    :param bool cleanup: is as in
           :func:`toil.fileStores.abstractFileStore.AbstractFileStore.writeGlobalFile`.

    :param str basename: If supported by the backing JobStore, use the given file basename so
           that when searching the job store with a query matching that basename, the file will
           be detected.

    :return: A context manager yielding a tuple of
              1) a file handle which can be written to and
              2) the toil.fileStores.FileID of the resulting file in the job store.
    """
    with self.jobStore.writeFileStream(self.jobGraph.jobStoreID, cleanup, basename) as (backingStream, fileStoreID):
        # We have a string version of the file ID, and the backing stream.
        # We need to yield a stream the caller can write to, and a FileID
        # that accurately reflects the size of the data written to the
        # stream. We assume the stream is not seekable.

        # Make and keep a reference to the file ID, which is currently empty
        fileID = FileID(fileStoreID, 0)

        # Wrap the stream to increment the file ID's size for each byte written
        wrappedStream = WriteWatchingStream(backingStream)

        # When the stream is written to, count the bytes
        def handle(numBytes):
            # No scope problem here, because we don't assign to a fileID local
            fileID.size += numBytes
        wrappedStream.onWrite(handle)

        yield wrappedStream, fileID
def augustus_cgp(args, toil_options):
    """
    Main entry function for AugustusCGP toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    :return:
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.hal = FileID.forPath(t.importFile('file://' + args.hal), args.hal)
            input_file_ids.chrom_sizes = FileID.forPath(t.importFile('file://' + args.query_sizes), args.query_sizes)
            input_file_ids.hints_db = FileID.forPath(t.importFile('file://' + args.hints_db), args.hints_db)
            if args.cgp_param is not None:
                input_file_ids.cgp_param = FileID.forPath(t.importFile('file://' + args.cgp_param), args.cgp_param)
            else:
                input_file_ids.cgp_param = None
                input_file_ids.gtf = FileID.forPath(t.importFile('file://' + args.gtf), args.gtf)
            input_file_ids.cgp_cfg = FileID.forPath(t.importFile('file://' + args.cgp_cfg), args.cgp_cfg)
            input_file_ids.fasta = {genome: FileID.forPath(t.importFile('file://' + fasta), fasta)
                                    for genome, fasta in args.fasta_files.items()}
            du = tools.toilInterface.find_total_disk_usage([input_file_ids.hints_db], buffer='4G')
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='8G', disk=du)
            results, stdout_file_ids, param_file_id = t.start(job)
        else:
            results, stdout_file_ids, param_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.stdout_file)
        with open(args.stdout_file, 'w') as outf, tools.fileOps.TemporaryFilePath() as tmp:
            for (chrom, start, chunksize), stdout_file in stdout_file_ids.items():
                outf.write('## BEGIN CHUNK chrom: {} start: {} chunksize: {}\n'.format(chrom, start, chunksize))
                t.exportFile(stdout_file, 'file://' + tmp)
                for l in open(tmp):
                    outf.write(l)
        for genome, (raw_gtf_file_id, joined_gtf_file_id, joined_gp_file_id) in results.items():
            tools.fileOps.ensure_file_dir(args.augustus_cgp_raw_gtf[genome])
            t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_cgp_raw_gtf[genome])
            t.exportFile(joined_gtf_file_id, 'file://' + args.augustus_cgp_gtf[genome])
            t.exportFile(joined_gp_file_id, 'file://' + args.augustus_cgp_gp[genome])
        if args.cgp_param is None:
            t.exportFile(param_file_id, 'file://' + args.param_out_path)
def test_download_structure(self) -> None:
    """
    Make sure that download_structure makes the right calls to what it
    thinks is the file store.
    """
    # Define what we would download
    fid1 = FileID('afile', 10, False)
    fid2 = FileID('adifferentfile', 1000, True)

    # And what directory structure it would be in
    structure = {
        'dir1': {
            'dir2': {
                'f1': 'toilfile:' + fid1.pack(),
                'f1again': 'toilfile:' + fid1.pack(),
                'dir2sub': {}
            },
            'dir3': {}
        },
        'anotherfile': 'toilfile:' + fid2.pack()
    }

    # Say where to put it on the filesystem
    to_dir = self._createTempDir()

    # Make a fake file store
    file_store = Mock(AbstractFileStore)

    # These will be populated.
    # TODO: This cache seems unused. Remove it?
    # This maps filesystem path to CWL URI
    index = {}
    # This maps CWL URI to filesystem path
    existing = {}

    # Do the download
    download_structure(file_store, index, existing, structure, to_dir)

    # Check the results
    # 3 files should be made
    self.assertEqual(len(index), 3)
    # From 2 unique URIs
    self.assertEqual(len(existing), 2)

    # Make sure that the index contents (path to URI) are correct
    self.assertIn(os.path.join(to_dir, 'dir1/dir2/f1'), index)
    self.assertIn(os.path.join(to_dir, 'dir1/dir2/f1again'), index)
    self.assertIn(os.path.join(to_dir, 'anotherfile'), index)
    self.assertEqual(index[os.path.join(to_dir, 'dir1/dir2/f1')], structure['dir1']['dir2']['f1'])
    self.assertEqual(index[os.path.join(to_dir, 'dir1/dir2/f1again')], structure['dir1']['dir2']['f1again'])
    self.assertEqual(index[os.path.join(to_dir, 'anotherfile')], structure['anotherfile'])

    # And the existing contents (URI to path)
    self.assertIn('toilfile:' + fid1.pack(), existing)
    self.assertIn('toilfile:' + fid2.pack(), existing)
    self.assertIn(existing['toilfile:' + fid1.pack()],
                  [os.path.join(to_dir, 'dir1/dir2/f1'), os.path.join(to_dir, 'dir1/dir2/f1again')])
    self.assertEqual(existing['toilfile:' + fid2.pack()], os.path.join(to_dir, 'anotherfile'))

    # The directory structure should be created for real
    self.assertTrue(os.path.isdir(os.path.join(to_dir, 'dir1')))
    self.assertTrue(os.path.isdir(os.path.join(to_dir, 'dir1/dir2')))
    self.assertTrue(os.path.isdir(os.path.join(to_dir, 'dir1/dir2/dir2sub')))
    self.assertTrue(os.path.isdir(os.path.join(to_dir, 'dir1/dir3')))

    # The file store should have been asked to do the download
    file_store.readGlobalFile.assert_has_calls([
        call(fid1, os.path.join(to_dir, 'dir1/dir2/f1'), symlink=True),
        call(fid1, os.path.join(to_dir, 'dir1/dir2/f1again'), symlink=True),
        call(fid2, os.path.join(to_dir, 'anotherfile'), symlink=True)
    ], any_order=True)