Example #1
    def run(self, fileStore):
        sequenceFiles1 = [
            fileStore.readGlobalFile(fileID)
            for fileID in self.sequenceFileIDs1
        ]
        if self.blastOptions.gpuLastz:
            # wga-gpu has a 3G limit.
            self.blastOptions.chunkSize = 3000000000
        chunks = runGetChunks(
            sequenceFiles=sequenceFiles1,
            chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
            chunkSize=self.blastOptions.chunkSize,
            overlapSize=self.blastOptions.overlapSize)
        if not chunks:
            raise Exception(
                "no chunks produced for files: {}".format(sequenceFiles1))
        logger.info(
            "Broke up the sequence files into individual 'chunk' files")
        chunkIDs = [
            fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks
        ]

        diagonalResultsID = self.addChild(
            MakeSelfBlasts(self.blastOptions, chunkIDs)).rv()
        offDiagonalResultsID = self.addChild(
            MakeOffDiagonalBlasts(self.blastOptions, chunkIDs)).rv()
        logger.debug("Collating the blasts after blasting all-against-all")
        return self.addFollowOn(
            CollateBlasts(self.blastOptions,
                          [diagonalResultsID, offDiagonalResultsID])).rv()
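
The addChild(...).rv() and addFollowOn(...) calls above are Toil's promise pattern: .rv() returns a placeholder that resolves to the child job's return value once it has run, and the follow-on is only scheduled after every child has finished, so it can safely consume those values. Below is a minimal, runnable sketch of the same pattern with toy jobs, assuming Toil is installed and ./jobstore is a fresh path; the Fanout/Square/Collate names are illustrative, not part of the original code.

from toil.common import Toil
from toil.job import Job

class Fanout(Job):
    def run(self, fileStore):
        # Each .rv() is a promise for a child's return value.
        squares = [self.addChild(Square(n)).rv() for n in (1, 2, 3)]
        # The follow-on runs after all children; Toil resolves the
        # promises in the list before Collate.run is called.
        return self.addFollowOn(Collate(squares)).rv()

class Square(Job):
    def __init__(self, n):
        super().__init__()
        self.n = n

    def run(self, fileStore):
        return self.n * self.n

class Collate(Job):
    def __init__(self, values):
        super().__init__()
        self.values = values

    def run(self, fileStore):
        return sum(self.values)

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./jobstore")
    with Toil(options) as toil:
        print(toil.start(Fanout()))  # prints 14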
Example #2
    def run(self, fileStore):
        sequenceFiles1 = [
            fileStore.readGlobalFile(fileID)
            for fileID in self.sequenceFileIDs1
        ]
        chunks = runGetChunks(
            sequenceFiles=sequenceFiles1,
            chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
            chunkSize=self.blastOptions.chunkSize,
            overlapSize=self.blastOptions.overlapSize)
        assert len(chunks) > 0
        logger.info(
            "Broke up the sequence files into individual 'chunk' files")
        chunkIDs = [
            fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks
        ]

        diagonalResultsID = self.addChild(
            MakeSelfBlasts(self.blastOptions, chunkIDs)).rv()
        offDiagonalResultsID = self.addChild(
            MakeOffDiagonalBlasts(self.blastOptions, chunkIDs)).rv()
        logger.debug("Collating the blasts after blasting all-against-all")
        return self.addFollowOn(
            CollateBlasts(self.blastOptions,
                          [diagonalResultsID, offDiagonalResultsID])).rv()
Example #3
    def run(self, fileStore):
        logger.info("Preparing sequence for preprocessing")

        inSequence = fileStore.readGlobalFile(self.inSequenceID)

        if self.prepOptions.chunkSize <= 0:
            # In this first case we don't need to break up the sequence
            chunked = False
            inChunkList = [inSequence]
        else:
            # chunk it up
            chunked = True
            inChunkDirectory = getTempDirectory(
                rootDir=fileStore.getLocalTempDir())
            inChunkList = runGetChunks(sequenceFiles=[inSequence],
                                       chunksDir=inChunkDirectory,
                                       chunkSize=self.prepOptions.chunkSize,
                                       overlapSize=0)
            inChunkList = [os.path.abspath(path) for path in inChunkList]
        logger.info("Chunks = %s" % inChunkList)

        inChunkIDList = [
            fileStore.writeGlobalFile(chunk, cleanup=True)
            for chunk in inChunkList
        ]
        outChunkIDList = []
        # For each input chunk we create an output chunk; it is the output chunks that get concatenated together.
        if not self.chunksToCompute:
            self.chunksToCompute = list(range(len(inChunkList)))
        for i in self.chunksToCompute:
            # Calculate the number of chunks to use
            inChunkNumber = int(max(1, math.ceil(
                len(inChunkList) * self.prepOptions.proportionToSample)))
            assert 0 < inChunkNumber <= len(inChunkList)
            # Now get the list of chunks flanking and including the current chunk
            j = max(0, i - inChunkNumber // 2)
            inChunkIDs = inChunkIDList[j:j + inChunkNumber]
            if len(inChunkIDs) < inChunkNumber:
                # This logic is like making the list circular
                inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
            assert len(inChunkIDs) == inChunkNumber
            outChunkIDList.append(
                self.addChild(
                    self.getChunkedJobForCurrentStage(
                        inChunkIDs,
                        float(inChunkNumber) / len(inChunkIDList),
                        inChunkIDList[i])).rv())

        if chunked:
            # Merge results of the chunking process back into a genome-wide file
            return self.addFollowOn(
                MergeChunks(self.prepOptions, outChunkIDList)).rv()
        else:
            # Didn't chunk--we have a genome-wide fasta file
            return outChunkIDList[0]
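
The flanking-window selection in the loop above is the subtle part: it centres a window of inChunkNumber chunks on chunk i and, when the window runs off the end of the list, wraps around to the front. The same logic as a standalone function; circularWindow is a hypothetical name, extracted here only for illustration:

def circularWindow(items, i, width):
    # Start the window about width // 2 positions before item i, clamped at 0.
    j = max(0, i - width // 2)
    window = items[j:j + width]
    # If the window runs off the end of the list, wrap around to the front.
    if len(window) < width:
        window += items[:width - len(window)]
    assert len(window) == width
    return window

# A window of 3 centred on the last of 5 chunks wraps back to the start:
print(circularWindow([0, 1, 2, 3, 4], 4, 3))  # [3, 4, 0]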
Example #4
    def run(self, fileStore):
        sequenceFiles1 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs1]
        sequenceFiles2 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs2]
        chunks1 = runGetChunks(sequenceFiles=sequenceFiles1,
                               chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
                               chunkSize=self.blastOptions.chunkSize,
                               overlapSize=self.blastOptions.overlapSize)
        chunks2 = runGetChunks(sequenceFiles=sequenceFiles2,
                               chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
                               chunkSize=self.blastOptions.chunkSize,
                               overlapSize=self.blastOptions.overlapSize)
        chunkIDs1 = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks1]
        chunkIDs2 = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks2]
        resultsIDs = []
        # Make the list of blast jobs, one per pair of chunks.
        # TODO: Make the compression work
        self.blastOptions.compressFiles = False
        for chunkID1 in chunkIDs1:
            for chunkID2 in chunkIDs2:
                resultsIDs.append(self.addChild(RunBlast(self.blastOptions, chunkID1, chunkID2)).rv())
        logger.info("Made the list of blasts")
        # Set up the job to collate all the results
        return self.addFollowOn(CollateBlasts(self.blastOptions, resultsIDs)).rv()
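
Note the shape of the fan-out here: one RunBlast child is created for every pair of chunks, so the job count grows as len(chunkIDs1) * len(chunkIDs2), and blastOptions.chunkSize directly controls how wide the all-against-all comparison fans out. Because every child shares the same blastOptions object, the compressFiles flag only needs to be set once, ahead of the loops.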
Example #5
    def run(self, fileStore):
        logger.info("Preparing sequence for preprocessing")
        # chunk it up
        inSequence = fileStore.readGlobalFile(self.inSequenceID)
        inChunkDirectory = getTempDirectory(
            rootDir=fileStore.getLocalTempDir())
        inChunkList = runGetChunks(sequenceFiles=[inSequence],
                                   chunksDir=inChunkDirectory,
                                   chunkSize=self.prepOptions.chunkSize,
                                   overlapSize=0)
        inChunkList = [os.path.abspath(path) for path in inChunkList]
        logger.info("Chunks = %s" % inChunkList)
        logger.info("Chunks dir = %s" % os.listdir(inChunkDirectory))

        inChunkIDList = [
            fileStore.writeGlobalFile(chunk) for chunk in inChunkList
        ]
        outChunkIDList = []
        # For each input chunk we create an output chunk; it is the output chunks that get concatenated together.
        if not self.chunksToCompute:
            self.chunksToCompute = list(range(len(inChunkList)))
        for i in self.chunksToCompute:
            # Calculate the number of chunks to use
            inChunkNumber = int(max(1, math.ceil(
                len(inChunkList) * self.prepOptions.proportionToSample)))
            assert 0 < inChunkNumber <= len(inChunkList)
            # Now get the list of chunks flanking and including the current chunk
            # (note // rather than /, so the slice index stays an integer)
            j = max(0, i - inChunkNumber // 2)
            inChunkIDs = inChunkIDList[j:j + inChunkNumber]
            if len(inChunkIDs) < inChunkNumber:
                # This logic is like making the list circular
                inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
            assert len(inChunkIDs) == inChunkNumber
            outChunkIDList.append(
                self.addChild(
                    PreprocessChunk(self.prepOptions, inChunkIDs,
                                    float(inChunkNumber) / len(inChunkIDList),
                                    inChunkIDList[i])).rv())
        # follow on to merge chunks
        return self.addFollowOn(MergeChunks(self.prepOptions,
                                            outChunkIDList)).rv()
Example #6
    def run(self, fileStore):
        logger.info("Preparing sequence for preprocessing")

        inSequence = fileStore.readGlobalFile(self.inSequenceID)

        if self.prepOptions.chunkSize <= 0:
            # In this first case we don't need to break up the sequence
            chunked = False
            inChunkList = [inSequence]
        else:
            # chunk it up
            chunked = True
            inChunkDirectory = getTempDirectory(rootDir=fileStore.getLocalTempDir())
            inChunkList = runGetChunks(sequenceFiles=[inSequence], chunksDir=inChunkDirectory,
                                       chunkSize=self.prepOptions.chunkSize,
                                       overlapSize=0)
            inChunkList = [os.path.abspath(path) for path in inChunkList]
        logger.info("Chunks = %s" % inChunkList)

        inChunkIDList = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in inChunkList]
        outChunkIDList = []
        # For each input chunk we create an output chunk; it is the output chunks that get concatenated together.
        if not self.chunksToCompute:
            self.chunksToCompute = list(range(len(inChunkList)))
        for i in self.chunksToCompute:
            # Calculate the number of chunks to use
            inChunkNumber = int(max(1, math.ceil(len(inChunkList) * self.prepOptions.proportionToSample)))
            assert 0 < inChunkNumber <= len(inChunkList)
            # Now get the list of chunks flanking and including the current chunk
            j = max(0, i - inChunkNumber // 2)
            inChunkIDs = inChunkIDList[j:j + inChunkNumber]
            if len(inChunkIDs) < inChunkNumber:  # This logic is like making the list circular
                inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
            assert len(inChunkIDs) == inChunkNumber
            outChunkIDList.append(self.addChild(self.getChunkedJobForCurrentStage(inChunkIDs, float(inChunkNumber)/len(inChunkIDList), inChunkIDList[i])).rv())

        if chunked:
            # Merge results of the chunking process back into a genome-wide file
            return self.addFollowOn(MergeChunks(self.prepOptions, outChunkIDList)).rv()
        else:
            # Didn't chunk--we have a genome-wide fasta file
            return outChunkIDList[0]
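
All of these examples lean on the same Toil file-store round trip: readGlobalFile copies a file from the job store into the job's local temp space, and writeGlobalFile publishes a local file back, where cleanup=True lets Toil delete the global copy once the workflow no longer references it. A self-contained sketch of that round trip, assuming Toil is installed and ./jobstore is a fresh path; RoundTrip is an illustrative job name, not from the original code:

from toil.common import Toil
from toil.job import Job

class RoundTrip(Job):
    def run(self, fileStore):
        # Write a local file into the job store; cleanup=True tells Toil to
        # remove the global copy when it is no longer referenced.
        local = fileStore.getLocalTempFile()
        with open(local, "w") as f:
            f.write(">seq1\nACGT\n")
        fileID = fileStore.writeGlobalFile(local, cleanup=True)
        # Read it back; the copy lands in this job's local temp directory.
        localCopy = fileStore.readGlobalFile(fileID)
        with open(localCopy) as f:
            return f.read()

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./jobstore")
    with Toil(options) as toil:
        print(toil.start(RoundTrip()))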