Esempio n. 1
0
    def run(self, fileStore):
        """
        Run the configured preprocessing step on this job's chunk.

        Dispatches on ``self.prepOptions.preprocessJob``:

        * ``"checkUniqueHeaders"`` -- reads the chunk and every file in
          ``self.seqIDs`` from the file store, then calls
          ``cactus_checkUniqueHeaders.py`` on the chunk with the
          space-separated sequence paths as stdin. The chunk ID is returned
          unchanged.
        * ``"lastzRepeatMask"`` -- adds a ``LastzRepeatMaskJob`` child and
          returns the value of that child's ``rv()``.
        * ``"none"`` -- returns the chunk ID unchanged.

        :param fileStore: file store used to read the global files.
        :return: ID of the processed chunk (or the child's ``rv()``).
        :raises RuntimeError: if ``preprocessJob`` is none of the values above.
        """
        jobName = self.prepOptions.preprocessJob

        if jobName == "checkUniqueHeaders":
            inChunk = fileStore.readGlobalFile(self.inChunkID)
            seqPaths = [
                fileStore.readGlobalFile(fileID) for fileID in self.seqIDs
            ]
            args = [inChunk]
            if self.prepOptions.checkAssemblyHub:
                args.append("--checkAssemblyHub")
            # The sequence paths are handed over on stdin, not as arguments.
            cactus_call(stdin_string=" ".join(seqPaths),
                        parameters=["cactus_checkUniqueHeaders.py"] + args)
            return self.inChunkID

        if jobName == "lastzRepeatMask":
            repeatMaskOptions = RepeatMaskOptions(
                proportionSampled=self.prepOptions.proportionToSample,
                minPeriod=self.prepOptions.minPeriod)
            return self.addChild(
                LastzRepeatMaskJob(repeatMaskOptions=repeatMaskOptions,
                                   queryID=self.inChunkID,
                                   targetIDs=self.seqIDs)).rv()

        if jobName == "none":
            return self.inChunkID

        # Fail fast instead of handing a None chunk ID to whatever consumes
        # the result; same error as the other preprocess-job dispatchers.
        raise RuntimeError("Unknown preprocess job %s" % jobName)
Esempio n. 2
0
 def getChunkedJobForCurrentStage(self, seqIDs, proportionSampled,
                                  inChunkID):
     """
     Build the job that processes one chunk for the current stage.

     The job type is selected by ``self.prepOptions.preprocessJob``
     ("checkUniqueHeaders", "lastzRepeatMask" or "dna-brnn"); any other
     value raises RuntimeError.
     """
     options = self.prepOptions
     stage = options.preprocessJob

     if stage == "checkUniqueHeaders":
         return CheckUniqueHeaders(options, inChunkID)

     if stage == "lastzRepeatMask":
         maskSettings = RepeatMaskOptions(
             proportionSampled=proportionSampled,
             minPeriod=options.minPeriod,
             lastzOpts=options.lastzOptions,
             gpuLastz=options.gpuLastz,
             gpuLastzInterval=options.gpuLastzInterval)
         return LastzRepeatMaskJob(repeatMaskOptions=maskSettings,
                                   queryID=inChunkID,
                                   targetIDs=seqIDs)

     if stage == "dna-brnn":
         return DnabrnnMaskJob(inChunkID,
                               dnabrnnOpts=options.dnabrnnOpts,
                               minLength=options.dnabrnnLength,
                               mergeLength=options.dnabrnnMerge,
                               action=options.dnabrnnAction,
                               inputBedID=options.dnabrnnInputBedID,
                               eventName=options.dnabrnnEventName,
                               cpu=options.cpu)

     # No branch matched the configured job name.
     raise RuntimeError("Unknown preprocess job %s" % stage)
Esempio n. 3
0
 def getChunkedJobForCurrentStage(self, seqIDs, proportionSampled, inChunkID):
     """
     Return the job that should handle the given chunk.

     "checkUniqueHeaders" yields a CheckUniqueHeaders job and
     "lastzRepeatMask" yields a LastzRepeatMaskJob; every other configured
     preprocessJob value is rejected with a RuntimeError.
     """
     prepOptions = self.prepOptions
     requested = prepOptions.preprocessJob

     if requested not in ("checkUniqueHeaders", "lastzRepeatMask"):
         raise RuntimeError("Unknown preprocess job %s" % requested)

     if requested == "checkUniqueHeaders":
         return CheckUniqueHeaders(prepOptions, inChunkID)

     # Only the lastz repeat-masking stage is left at this point.
     maskOptions = RepeatMaskOptions(proportionSampled=proportionSampled,
                                     minPeriod=prepOptions.minPeriod,
                                     lastzOpts=prepOptions.lastzOptions)
     return LastzRepeatMaskJob(repeatMaskOptions=maskOptions,
                               queryID=inChunkID,
                               targetIDs=seqIDs)
 def testCactusPreprocessor(self):
     """
     Run the preprocessor with lastz repeat masking on the demo sequences.

     For each input file the test checks that the preprocessed output equals
     the input modulo soft-masking and prints masking statistics. It then
     runs LastzRepeatMaskJob directly on the same file and prints how the
     bases it masks overlap with the ones masked through the preprocessor.

     Every report is built as one string and printed through a single
     parenthesised argument, so the statement parses and prints the same
     text under both Python 2 and Python 3.
     """
     #Demo sequences
     sequenceNames = [ "%s.ENm001.fa" % species for species in ('human', 'hedgehog') ]
     sequenceFiles = [ os.path.join(self.encodePath, self.encodeRegion, sequenceName) for sequenceName in sequenceNames ]
     #Make config file
     configFile = os.path.join(self.tempDir, "config.xml")
     rootElem = ET.Element("preprocessor")
     #<preprocessor chunkSize="10000" proportionToSample="0.2" memory="littleMemory" preprocessorString="cactus_lastzRepeatMask.py --proportionSampled=PROPORTION_SAMPLED --minPeriod=1 --lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped' IN_FILE OUT_FILE "/>
     preprocessor = ET.SubElement(rootElem, "preprocessor")
     preprocessor.attrib["chunkSize"] = "100000"
     preprocessor.attrib["proportionToSample"] = "0.2"
     preprocessor.attrib["preprocessJob"] = "lastzRepeatMask"
     preprocessor.attrib["minPeriod"] = "1"
     preprocessor.attrib["lastzOpts"] = "--step=1 --ambiguous=iupac,100 --ungapped"
     preprocessor.attrib["fragment"] = "200"
     # The context manager closes the handle even if the write fails.
     # Binary mode: ET.tostring() returns an encoded byte string by default
     # (a plain str on Python 2, bytes on Python 3).
     with open(configFile, "wb") as fileHandle:
         fileHandle.write(ET.tostring(rootElem))
     #Run preprocessor
     tmpToil = os.path.join(self.tempDir, "toil")
     runCactusPreprocessor(outputSequenceDir=self.tempDir, configFile=configFile, inputSequences=sequenceFiles, toilDir=tmpToil)

     processedSequenceFiles = CactusPreprocessor.getOutputSequenceFiles(sequenceFiles, self.tempDir)
     for sequenceFile, processedSequenceFile in zip(sequenceFiles, processedSequenceFiles):
         print("sequenceFile: %s" % sequenceFile)
         print("output sequence file: %s" % processedSequenceFile)
         #Parse sequences into dictionary
         originalSequences = getSequences(sequenceFile)
         #Load the new sequences
         processedSequences = getSequences(processedSequenceFile)

         #Check they are the same modulo masking
         self.checkSequenceSetsEqualModuloSoftMasking(originalSequences, processedSequences)

         #Compare the proportion of bases masked by lastz with original repeat masking
         maskedBasesOriginal = getMaskedBases(originalSequences)
         maskedBasesLastzMasked = getMaskedBases(processedSequences)
         #Total bases
         totalBases = sum(len(sequence) for sequence in originalSequences.values())
         #Calculate number of hard masked bases
         totalNBases = sum(1 for (header, position, base) in maskedBasesOriginal if base.upper() == "N")

         # Joining with single spaces reproduces what a comma-separated
         # Python 2 print statement would emit for these items.
         print(" ".join(str(part) for part in (
             " For the sequence file ", sequenceFile,
             " the total number of sequences is ", len(originalSequences),
             " the total number of bases ", totalBases,
             " the number of bases originally masked was: ", len(maskedBasesOriginal),
             " the number of bases masked after running lastz repeat masking is: ", len(maskedBasesLastzMasked),
             " the intersection of these masked sets is: ", len(maskedBasesLastzMasked.intersection(maskedBasesOriginal)),
             " the total number of bases that are Ns ", totalNBases)))

         #Now compare to running lastz on its own
         toilOptions = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, "lastzRepeatMaskToil"))
         toilOptions.logLevel = "CRITICAL"
         with Toil(toilOptions) as toil:
             queryID = toil.importFile(makeURL(sequenceFile))
             targetIDs = [queryID]
             repeatMaskedID = toil.start(LastzRepeatMaskJob(queryID=queryID, targetIDs=targetIDs, repeatMaskOptions=RepeatMaskOptions(lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped --queryhsplimit=keep,nowarn:30', minPeriod=1, proportionSampled=0.2, fragment=200)))
             toil.exportFile(repeatMaskedID, makeURL(self.tempOutputFile))

         lastzSequencesFast = getSequences(self.tempOutputFile)
         maskedBasesLastzMaskedFast = getMaskedBases(lastzSequencesFast)

         # float() keeps the ratios below fractional under Python 2 division.
         overlap = float(len(maskedBasesLastzMaskedFast.intersection(maskedBasesLastzMasked)))
         print(" ".join(str(part) for part in (
             " The number of bases masked after running lastz repeat masking without the preprocessor is: ", len(maskedBasesLastzMaskedFast),
             " the recall of the fast vs. the new is: ", overlap/len(maskedBasesLastzMasked),
             " the precision of the fast vs. the new is: ", overlap/len(maskedBasesLastzMaskedFast))))
Esempio n. 5
0
    def testLastzRepeatMask(self):
        """
        Compare sensitive and heuristic lastz repeat masking on the demo files.

        Each sequence file is masked against itself twice through
        LastzRepeatMaskJob: once with the slower "--step=1 ... --ydrop=3000"
        options and once with the faster "--step=3 ... --ungapped
        --queryhsplimit" options. The test asserts that

        * the sensitive output equals the input modulo soft-masking,
        * the fast run masks more bases than the original file has masked, and
        * precision and recall of the fast run, measured against the
          sensitive run, both exceed 0.93.
        """
        #Demo sequences
        sequenceFiles = [
            os.path.join(self.encodePath, self.encodeRegion,
                         "%s.ENm001.fa" % species)
            for species in ('human', "hedgehog")
        ]
        #Max occurrences of a repeat within the sequence
        maxOccurrence = 1

        def runLastzMasking(sequenceFile, lastzOpts):
            """Mask sequenceFile against itself and export the result to self.tempOutputFile."""
            with Toil(self.toilOptions) as toil:
                sequenceID = toil.importFile(makeURL(sequenceFile))
                repeatMaskOptions = RepeatMaskOptions(
                    proportionSampled=1.0,
                    minPeriod=maxOccurrence,
                    lastzOpts=lastzOpts,
                    fragment=200)
                outputID = toil.start(
                    LastzRepeatMaskJob(repeatMaskOptions=repeatMaskOptions,
                                       queryID=sequenceID,
                                       targetIDs=[sequenceID]))
                toil.exportFile(outputID, makeURL(self.tempOutputFile))

        for sequenceFile in sequenceFiles:
            #Parse sequences into dictionary
            originalSequences = getSequences(sequenceFile)
            #Get the masked bases
            maskedBasesOriginal = getMaskedBases(originalSequences)
            #Total bases
            totalBases = sum(len(sequence)
                             for sequence in originalSequences.values())
            #Calculate number of hard masked bases
            totalNBases = sum(1
                              for (header, position, base) in maskedBasesOriginal
                              if base.upper() == "N")

            #Run lastz repeat masker
            startTime = time.time()
            runLastzMasking(
                sequenceFile,
                "--step=1 --ambiguous=iupac,100,100 --ydrop=3000")
            print("It took %s seconds to run lastzMasking" %
                  (time.time() - startTime))

            #Parse lastz masked sequences into dictionary
            lastzSequences = getSequences(self.tempOutputFile)

            #Check the sequences are the same modulo masking
            self.checkSequenceSetsEqualModuloSoftMasking(
                originalSequences, lastzSequences)

            #Compare the proportion of bases masked by lastz with original repeat masking
            maskedBasesLastzMasked = getMaskedBases(lastzSequences)
            # Pass the parts as separate arguments: wrapping them in an extra
            # pair of parentheses would print the repr of a tuple instead.
            print(" For the sequence file ", sequenceFile,
                  " the total number of sequences is ", len(originalSequences),
                  " the total number of bases ", totalBases,
                  " the number of bases originally masked was: ", len(maskedBasesOriginal),
                  " the number of bases masked after running lastz repeat masking is: ", len(maskedBasesLastzMasked),
                  " the intersection of these masked sets is: ", len(maskedBasesLastzMasked.intersection(maskedBasesOriginal)),
                  " the total number of bases that are Ns ", totalNBases,
                  " lastz was filter for max-occurrences of more than : ", maxOccurrence)
            # NOTE: no assertion is made that the sensitive run masks more
            # bases than the original file; only the fast run is checked below.

            #Run lastz repeat masker using heuristic settings for comparison with the slower settings
            startTime = time.time()
            runLastzMasking(
                sequenceFile,
                "--step=3 --ambiguous=iupac,100,100 --ungapped --queryhsplimit=keep,nowarn:%i"
                % (int(maxOccurrence) * 20))
            print("It took %s seconds to run lastzMasking fast" %
                  (time.time() - startTime))
            lastzSequencesFast = getSequences(self.tempOutputFile)
            maskedBasesLastzMaskedFast = getMaskedBases(lastzSequencesFast)

            self.assertGreater(len(maskedBasesLastzMaskedFast),
                               len(maskedBasesOriginal))
            # The sensitive run is the reference set and the fast run is the
            # prediction: recall divides the overlap by the reference size,
            # precision by the predicted size.
            overlap = float(
                len(maskedBasesLastzMaskedFast.intersection(
                    maskedBasesLastzMasked)))
            recall = overlap / len(maskedBasesLastzMasked)
            precision = overlap / len(maskedBasesLastzMaskedFast)
            self.assertGreater(recall, 0.93)
            self.assertGreater(precision, 0.93)