def run(self, fileStore):
    sequenceFiles1 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs1]
    if self.blastOptions.gpuLastz:
        # wga-gpu has a 3G limit.
        self.blastOptions.chunkSize = 3000000000
    chunks = runGetChunks(sequenceFiles=sequenceFiles1,
                          chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
                          chunkSize=self.blastOptions.chunkSize,
                          overlapSize=self.blastOptions.overlapSize)
    if len(chunks) == 0:
        raise Exception("no chunks produced for files: {}".format(sequenceFiles1))
    logger.info("Broken up the sequence files into individual 'chunk' files")
    chunkIDs = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks]
    # The all-against-all comparison splits into the self blasts (each chunk
    # against itself, the diagonal) and the off-diagonal blasts (each pair of
    # distinct chunks).
    diagonalResultsID = self.addChild(MakeSelfBlasts(self.blastOptions, chunkIDs)).rv()
    offDiagonalResultsID = self.addChild(MakeOffDiagonalBlasts(self.blastOptions, chunkIDs)).rv()
    logger.debug("Collating the blasts after blasting all-against-all")
    return self.addFollowOn(CollateBlasts(self.blastOptions,
                                          [diagonalResultsID, offDiagonalResultsID])).rv()
def run(self, fileStore):
    sequenceFiles1 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs1]
    chunks = runGetChunks(sequenceFiles=sequenceFiles1,
                          chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
                          chunkSize=self.blastOptions.chunkSize,
                          overlapSize=self.blastOptions.overlapSize)
    assert len(chunks) > 0
    logger.info("Broken up the sequence files into individual 'chunk' files")
    chunkIDs = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks]
    diagonalResultsID = self.addChild(MakeSelfBlasts(self.blastOptions, chunkIDs)).rv()
    offDiagonalResultsID = self.addChild(MakeOffDiagonalBlasts(self.blastOptions, chunkIDs)).rv()
    logger.debug("Collating the blasts after blasting all-against-all")
    return self.addFollowOn(CollateBlasts(self.blastOptions,
                                          [diagonalResultsID, offDiagonalResultsID])).rv()
def run(self, fileStore):
    logger.info("Preparing sequence for preprocessing")
    inSequence = fileStore.readGlobalFile(self.inSequenceID)
    if self.prepOptions.chunkSize <= 0:
        # In this first case we don't need to break up the sequence
        chunked = False
        inChunkList = [inSequence]
    else:
        # chunk it up
        chunked = True
        inChunkDirectory = getTempDirectory(rootDir=fileStore.getLocalTempDir())
        inChunkList = runGetChunks(sequenceFiles=[inSequence],
                                   chunksDir=inChunkDirectory,
                                   chunkSize=self.prepOptions.chunkSize,
                                   overlapSize=0)
        inChunkList = [os.path.abspath(path) for path in inChunkList]
        logger.info("Chunks = %s" % inChunkList)

    inChunkIDList = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in inChunkList]
    outChunkIDList = []
    # For each input chunk we create an output chunk; it is the output chunks
    # that get concatenated together.
    if not self.chunksToCompute:
        self.chunksToCompute = list(range(len(inChunkList)))
    for i in self.chunksToCompute:
        # Calculate the number of chunks to sample for this output chunk
        inChunkNumber = int(max(1, math.ceil(len(inChunkList) * self.prepOptions.proportionToSample)))
        assert inChunkNumber <= len(inChunkList) and inChunkNumber > 0
        # Now get the list of chunks flanking and including the current chunk
        j = max(0, i - inChunkNumber // 2)
        inChunkIDs = inChunkIDList[j:j + inChunkNumber]
        if len(inChunkIDs) < inChunkNumber:
            # This logic is like making the list circular
            inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
        assert len(inChunkIDs) == inChunkNumber
        outChunkIDList.append(self.addChild(
            self.getChunkedJobForCurrentStage(inChunkIDs,
                                              float(inChunkNumber) / len(inChunkIDList),
                                              inChunkIDList[i])).rv())
    if chunked:
        # Merge results of the chunking process back into a genome-wide file
        return self.addFollowOn(MergeChunks(self.prepOptions, outChunkIDList)).rv()
    else:
        # Didn't chunk--we have a genome-wide fasta file
        return outChunkIDList[0]
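# A minimal, standalone sketch (added for illustration; `flankingWindow` is a
# hypothetical helper, not part of this codebase) of the flanking-window
# selection used in run() above: take inChunkNumber consecutive chunks roughly
# centred on chunk i, wrapping past the end of the list as if it were circular.
def flankingWindow(chunkIDs, i, inChunkNumber):
    # Start half a window to the left of chunk i, clamped at zero.
    j = max(0, i - inChunkNumber // 2)
    window = chunkIDs[j:j + inChunkNumber]
    if len(window) < inChunkNumber:
        # Wrap around to the front of the list, as in the method above.
        window += chunkIDs[:inChunkNumber - len(window)]
    assert len(window) == inChunkNumber
    return window

# Example: a window of 3 around the last of five chunks wraps to the front.
assert flankingWindow(["a", "b", "c", "d", "e"], 4, 3) == ["d", "e", "a"]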
def run(self, fileStore):
    sequenceFiles1 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs1]
    sequenceFiles2 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs2]
    chunks1 = runGetChunks(sequenceFiles=sequenceFiles1,
                           chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
                           chunkSize=self.blastOptions.chunkSize,
                           overlapSize=self.blastOptions.overlapSize)
    chunks2 = runGetChunks(sequenceFiles=sequenceFiles2,
                           chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
                           chunkSize=self.blastOptions.chunkSize,
                           overlapSize=self.blastOptions.overlapSize)
    chunkIDs1 = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks1]
    chunkIDs2 = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks2]
    resultsIDs = []
    # TODO: Make the compression work
    self.blastOptions.compressFiles = False
    # Make the list of blast jobs: one RunBlast child per pair of chunks,
    # i.e. len(chunkIDs1) * len(chunkIDs2) jobs in total.
    for chunkID1 in chunkIDs1:
        for chunkID2 in chunkIDs2:
            resultsIDs.append(self.addChild(RunBlast(self.blastOptions, chunkID1, chunkID2)).rv())
    logger.info("Made the list of blasts")
    # Set up the job to collate all the results
    return self.addFollowOn(CollateBlasts(self.blastOptions, resultsIDs)).rv()
def run(self, fileStore):
    logger.info("Preparing sequence for preprocessing")
    # chunk it up
    inSequence = fileStore.readGlobalFile(self.inSequenceID)
    inChunkDirectory = getTempDirectory(rootDir=fileStore.getLocalTempDir())
    inChunkList = runGetChunks(sequenceFiles=[inSequence],
                               chunksDir=inChunkDirectory,
                               chunkSize=self.prepOptions.chunkSize,
                               overlapSize=0)
    inChunkList = [os.path.abspath(path) for path in inChunkList]
    logger.info("Chunks = %s" % inChunkList)
    logger.info("Chunks dir = %s" % os.listdir(inChunkDirectory))
    inChunkIDList = [fileStore.writeGlobalFile(chunk) for chunk in inChunkList]
    outChunkIDList = []
    # For each input chunk we create an output chunk; it is the output chunks
    # that get concatenated together.
    if not self.chunksToCompute:
        self.chunksToCompute = list(range(len(inChunkList)))
    for i in self.chunksToCompute:
        # Calculate the number of chunks to use
        inChunkNumber = int(max(1, math.ceil(len(inChunkList) * self.prepOptions.proportionToSample)))
        assert inChunkNumber <= len(inChunkList) and inChunkNumber > 0
        # Now get the list of chunks flanking and including the current chunk.
        # Integer division is required here: a float start index would make the
        # slice below raise a TypeError under Python 3.
        j = max(0, i - inChunkNumber // 2)
        inChunkIDs = inChunkIDList[j:j + inChunkNumber]
        if len(inChunkIDs) < inChunkNumber:
            # This logic is like making the list circular
            inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
        assert len(inChunkIDs) == inChunkNumber
        outChunkIDList.append(self.addChild(
            PreprocessChunk(self.prepOptions, inChunkIDs,
                            float(inChunkNumber) / len(inChunkIDList),
                            inChunkIDList[i])).rv())
    # Follow on to merge chunks
    return self.addFollowOn(MergeChunks(self.prepOptions, outChunkIDList)).rv()
def run(self, fileStore):
    logger.info("Preparing sequence for preprocessing")
    inSequence = fileStore.readGlobalFile(self.inSequenceID)
    if self.prepOptions.chunkSize <= 0:
        # In this first case we don't need to break up the sequence
        chunked = False
        inChunkList = [inSequence]
    else:
        # chunk it up
        chunked = True
        inChunkDirectory = getTempDirectory(rootDir=fileStore.getLocalTempDir())
        inChunkList = runGetChunks(sequenceFiles=[inSequence],
                                   chunksDir=inChunkDirectory,
                                   chunkSize=self.prepOptions.chunkSize,
                                   overlapSize=0)
        inChunkList = [os.path.abspath(path) for path in inChunkList]
        logger.info("Chunks = %s" % inChunkList)

    inChunkIDList = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in inChunkList]
    outChunkIDList = []
    # For each input chunk we create an output chunk; it is the output chunks
    # that get concatenated together.
    if not self.chunksToCompute:
        self.chunksToCompute = list(range(len(inChunkList)))
    for i in self.chunksToCompute:
        # Calculate the number of chunks to use
        inChunkNumber = int(max(1, math.ceil(len(inChunkList) * self.prepOptions.proportionToSample)))
        assert inChunkNumber <= len(inChunkList) and inChunkNumber > 0
        # Now get the list of chunks flanking and including the current chunk.
        # Use integer division so the slice index stays an int under Python 3.
        j = max(0, i - inChunkNumber // 2)
        inChunkIDs = inChunkIDList[j:j + inChunkNumber]
        if len(inChunkIDs) < inChunkNumber:
            # This logic is like making the list circular
            inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
        assert len(inChunkIDs) == inChunkNumber
        outChunkIDList.append(self.addChild(
            self.getChunkedJobForCurrentStage(inChunkIDs,
                                              float(inChunkNumber) / len(inChunkIDList),
                                              inChunkIDList[i])).rv())
    if chunked:
        # Merge results of the chunking process back into a genome-wide file
        return self.addFollowOn(MergeChunks(self.prepOptions, outChunkIDList)).rv()
    else:
        # Didn't chunk--we have a genome-wide fasta file
        return outChunkIDList[0]