def run(self, fileStore):
    """Toil run hook: preprocess a single sequence chunk.

    Dispatches on self.prepOptions.preprocessJob and returns the file ID
    (or Toil promise of one) for the processed chunk.

    Raises:
        RuntimeError: for an unrecognised preprocessJob.  The previous
            behaviour was to silently return None, which only surfaced as
            a confusing failure downstream; this now fails fast, matching
            getChunkedJobForCurrentStage.
    """
    if self.prepOptions.preprocessJob == "checkUniqueHeaders":
        # Validation-only stage: the chunk content is left untouched, so the
        # input chunk ID is returned unchanged.
        inChunk = fileStore.readGlobalFile(self.inChunkID)
        seqPaths = [fileStore.readGlobalFile(fileID) for fileID in self.seqIDs]
        seqString = " ".join(seqPaths)
        args = [inChunk]
        if self.prepOptions.checkAssemblyHub:
            args += ["--checkAssemblyHub"]
        cactus_call(stdin_string=seqString,
                    parameters=["cactus_checkUniqueHeaders.py"] + args)
        return self.inChunkID
    elif self.prepOptions.preprocessJob == "lastzRepeatMask":
        repeatMaskOptions = RepeatMaskOptions(proportionSampled=self.prepOptions.proportionToSample,
                                              minPeriod=self.prepOptions.minPeriod)
        # The masking is delegated to a child job; rv() is a promise that
        # resolves to the masked chunk's file ID once the child completes.
        return self.addChild(LastzRepeatMaskJob(repeatMaskOptions=repeatMaskOptions,
                                                queryID=self.inChunkID,
                                                targetIDs=self.seqIDs)).rv()
    elif self.prepOptions.preprocessJob == "none":
        # Pass-through stage.
        return self.inChunkID
    else:
        raise RuntimeError("Unknown preprocess job %s" % self.prepOptions.preprocessJob)
def getChunkedJobForCurrentStage(self, seqIDs, proportionSampled, inChunkID):
    """Return the job instance that performs the configured preprocessing
    stage on one chunk.

    Raises RuntimeError when prepOptions.preprocessJob names no known stage.
    """
    stage = self.prepOptions.preprocessJob
    if stage == "checkUniqueHeaders":
        return CheckUniqueHeaders(self.prepOptions, inChunkID)
    if stage == "lastzRepeatMask":
        maskOptions = RepeatMaskOptions(proportionSampled=proportionSampled,
                                        minPeriod=self.prepOptions.minPeriod,
                                        lastzOpts=self.prepOptions.lastzOptions,
                                        gpuLastz=self.prepOptions.gpuLastz,
                                        gpuLastzInterval=self.prepOptions.gpuLastzInterval)
        return LastzRepeatMaskJob(repeatMaskOptions=maskOptions,
                                  queryID=inChunkID,
                                  targetIDs=seqIDs)
    if stage == "dna-brnn":
        return DnabrnnMaskJob(inChunkID,
                              dnabrnnOpts=self.prepOptions.dnabrnnOpts,
                              minLength=self.prepOptions.dnabrnnLength,
                              mergeLength=self.prepOptions.dnabrnnMerge,
                              action=self.prepOptions.dnabrnnAction,
                              inputBedID=self.prepOptions.dnabrnnInputBedID,
                              eventName=self.prepOptions.dnabrnnEventName,
                              cpu=self.prepOptions.cpu)
    raise RuntimeError("Unknown preprocess job %s" % stage)
def getChunkedJobForCurrentStage(self, seqIDs, proportionSampled, inChunkID):
    """Build and return the preprocessing job for one chunk, selected by
    prepOptions.preprocessJob.

    Raises RuntimeError for an unrecognised stage name.
    """
    stage = self.prepOptions.preprocessJob
    if stage == "checkUniqueHeaders":
        return CheckUniqueHeaders(self.prepOptions, inChunkID)
    if stage == "lastzRepeatMask":
        maskOptions = RepeatMaskOptions(proportionSampled=proportionSampled,
                                        minPeriod=self.prepOptions.minPeriod,
                                        lastzOpts=self.prepOptions.lastzOptions)
        return LastzRepeatMaskJob(repeatMaskOptions=maskOptions,
                                  queryID=inChunkID,
                                  targetIDs=seqIDs)
    raise RuntimeError("Unknown preprocess job %s" % stage)
def testCactusPreprocessor(self):
    """Run the full preprocessor pipeline on the ENCODE demo sequences and
    compare its repeat masking against running LastzRepeatMaskJob directly.

    Fixes vs. the previous version: Python 2 print statements (syntax
    errors on Python 3, which the rest of this file targets) converted to
    print() calls; the config file is written in binary mode because
    ET.tostring returns bytes on Python 3; the file handle is managed with
    a with-statement so it is always closed.
    """
    #Demo sequences
    sequenceNames = ["%s.ENm001.fa" % species for species in ['human', 'hedgehog']]
    sequenceFiles = [os.path.join(self.encodePath, self.encodeRegion, sequenceName) for sequenceName in sequenceNames]
    #Make config file
    configFile = os.path.join(self.tempDir, "config.xml")
    rootElem = ET.Element("preprocessor")
    #<preprocessor chunkSize="10000" proportionToSample="0.2" memory="littleMemory" preprocessorString="cactus_lastzRepeatMask.py --proportionSampled=PROPORTION_SAMPLED --minPeriod=1 --lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped' IN_FILE OUT_FILE "/>
    preprocessor = ET.SubElement(rootElem, "preprocessor")
    preprocessor.attrib["chunkSize"] = "100000"
    preprocessor.attrib["proportionToSample"] = "0.2"
    preprocessor.attrib["preprocessJob"] = "lastzRepeatMask"
    preprocessor.attrib["minPeriod"] = "1"
    preprocessor.attrib["lastzOpts"] = "--step=1 --ambiguous=iupac,100 --ungapped"
    preprocessor.attrib["fragment"] = "200"
    #ET.tostring returns bytes on Python 3: write in binary mode.
    with open(configFile, "wb") as fileHandle:
        fileHandle.write(ET.tostring(rootElem))
    #Run preprocessor
    tmpToil = os.path.join(self.tempDir, "toil")
    runCactusPreprocessor(outputSequenceDir=self.tempDir, configFile=configFile,
                          inputSequences=sequenceFiles, toilDir=tmpToil)
    for sequenceFile, processedSequenceFile in zip(sequenceFiles, CactusPreprocessor.getOutputSequenceFiles(sequenceFiles, self.tempDir)):
        print("sequenceFile: %s" % sequenceFile)
        print("output sequence file: %s" % processedSequenceFile)
        #Parse sequences into dictionary
        originalSequences = getSequences(sequenceFile)
        #Load the new sequences
        processedSequences = getSequences(processedSequenceFile)
        #Check they are the same modulo masking
        self.checkSequenceSetsEqualModuloSoftMasking(originalSequences, processedSequences)
        #Compare the proportion of bases masked by lastz with original repeat masking
        maskedBasesOriginal = getMaskedBases(originalSequences)
        maskedBasesLastzMasked = getMaskedBases(processedSequences)
        #Total bases
        totalBases = sum([len(i) for i in originalSequences.values()])
        #Calculate number of hard masked bases
        totalNBases = len([(header, i, base) for (header, i, base) in maskedBasesOriginal if base.upper() == "N"])
        print(" For the sequence file ", sequenceFile,
              " the total number of sequences is ", len(originalSequences),
              " the total number of bases ", totalBases,
              " the number of bases originally masked was: ", len(maskedBasesOriginal),
              " the number of bases masked after running lastz repeat masking is: ", len(maskedBasesLastzMasked),
              " the intersection of these masked sets is: ", len(maskedBasesLastzMasked.intersection(maskedBasesOriginal)),
              " the total number of bases that are Ns ", totalNBases)
        #Now compare to running lastz on its own
        toilOptions = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, "lastzRepeatMaskToil"))
        toilOptions.logLevel = "CRITICAL"
        with Toil(toilOptions) as toil:
            queryID = toil.importFile(makeURL(sequenceFile))
            targetIDs = [queryID]
            repeatMaskedID = toil.start(LastzRepeatMaskJob(queryID=queryID,
                                                           targetIDs=targetIDs,
                                                           repeatMaskOptions=RepeatMaskOptions(lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped --queryhsplimit=keep,nowarn:30',
                                                                                               minPeriod=1,
                                                                                               proportionSampled=0.2,
                                                                                               fragment=200)))
            toil.exportFile(repeatMaskedID, makeURL(self.tempOutputFile))
        lastzSequencesFast = getSequences(self.tempOutputFile)
        maskedBasesLastzMaskedFast = getMaskedBases(lastzSequencesFast)
        i = float(len(maskedBasesLastzMaskedFast.intersection(maskedBasesLastzMasked)))
        print(" The number of bases masked after running lastz repeat masking without the preprocessor is: ", len(maskedBasesLastzMaskedFast),
              " the recall of the fast vs. the new is: ", i / len(maskedBasesLastzMasked),
              " the precision of the fast vs. the new is: ", i / len(maskedBasesLastzMaskedFast))
def testLastzRepeatMask(self):
    """Mask the demo sequences with LastzRepeatMaskJob twice — once with
    slow/thorough lastz settings and once with fast/heuristic settings —
    then check the masked sequences agree with the originals modulo
    masking and that the two maskings overlap at >= 0.93 in both
    directions.
    """
    #Demo sequences
    sequenceFiles = [os.path.join(self.encodePath, self.encodeRegion, "%s.ENm001.fa" % species) for species in ('human', "hedgehog")]
    #Max occurrences of a repeat within the sequence
    maxOccurrence = 1
    for sequenceFile in sequenceFiles:
        #Parse sequences into dictionary
        originalSequences = getSequences(sequenceFile)
        #Get the masked bases
        maskedBasesOriginal = getMaskedBases(originalSequences)
        #Total bases
        totalBases = sum([len(i) for i in list(originalSequences.values())])
        #Calculate number of hard masked bases
        totalNBases = len([(header, i, base) for (header, i, base) in maskedBasesOriginal if base.upper() == "N"])
        #Run lastz repeat masker (thorough settings)
        startTime = time.time()
        with Toil(self.toilOptions) as toil:
            sequenceID = toil.importFile(makeURL(sequenceFile))
            repeatMaskOptions = RepeatMaskOptions(proportionSampled=1.0,
                                                  minPeriod=maxOccurrence,
                                                  lastzOpts="--step=1 --ambiguous=iupac,100,100 --ydrop=3000",
                                                  fragment=200)
            outputID = toil.start(LastzRepeatMaskJob(repeatMaskOptions=repeatMaskOptions,
                                                     queryID=sequenceID,
                                                     targetIDs=[sequenceID]))
            toil.exportFile(outputID, makeURL(self.tempOutputFile))
        print(("It took %s seconds to run lastzMasking" % (time.time() - startTime)))
        #Parse lastz masked sequences into dictionary
        lastzSequences = getSequences(self.tempOutputFile)
        #Check the sequences are the same modulo masking
        self.checkSequenceSetsEqualModuloSoftMasking(originalSequences, lastzSequences)
        #Compare the proportion of bases masked by lastz with original repeat masking
        maskedBasesOriginal = getMaskedBases(originalSequences)
        maskedBasesLastzMasked = getMaskedBases(lastzSequences)
        print((" For the sequence file ", sequenceFile,
               " the total number of sequences is ", len(originalSequences),
               " the total number of bases ", totalBases,
               " the number of bases originally masked was: ", len(maskedBasesOriginal),
               " the number of bases masked after running lastz repeat masking is: ", len(maskedBasesLastzMasked),
               " the intersection of these masked sets is: ", len(maskedBasesLastzMasked.intersection(maskedBasesOriginal)),
               " the total number of bases that are Ns ", totalNBases,
               " lastz was filter for max-occurrences of more than : ", maxOccurrence))
        #self.assertGreater(len(maskedBasesLastzMasked), len(maskedBasesOriginal))
        #Run lastz repeat masker using heuristic settings for comparison with the slower settings
        #(note: this reuses self.tempOutputFile, overwriting the thorough run's output)
        startTime = time.time()
        with Toil(self.toilOptions) as toil:
            sequenceID = toil.importFile(makeURL(sequenceFile))
            repeatMaskOptions = RepeatMaskOptions(proportionSampled=1.0,
                                                  minPeriod=maxOccurrence,
                                                  lastzOpts="--step=3 --ambiguous=iupac,100,100 --ungapped --queryhsplimit=keep,nowarn:%i" % (int(maxOccurrence) * 20),
                                                  fragment=200)
            outputID = toil.start(LastzRepeatMaskJob(repeatMaskOptions=repeatMaskOptions,
                                                     queryID=sequenceID,
                                                     targetIDs=[sequenceID]))
            toil.exportFile(outputID, makeURL(self.tempOutputFile))
        print(("It took %s seconds to run lastzMasking fast" % (time.time() - startTime)))
        lastzSequencesFast = getSequences(self.tempOutputFile)
        maskedBasesLastzMaskedFast = getMaskedBases(lastzSequencesFast)
        self.assertGreater(len(maskedBasesLastzMaskedFast), len(maskedBasesOriginal))
        #Overlap between the heuristic and thorough maskings.
        i = float(len(maskedBasesLastzMaskedFast.intersection(maskedBasesLastzMasked)))
        #NOTE(review): the precision/recall naming here appears swapped relative
        #to the diagnostics printed in testCactusPreprocessor (which divides
        #"recall" by the thorough mask and "precision" by the fast mask); both
        #ratios are held to the same 0.93 threshold so the assertions are
        #unaffected — confirm intended naming.
        precision = i / len(maskedBasesLastzMasked)
        recall = i / len(maskedBasesLastzMaskedFast)
        self.assertGreater(precision, 0.93)
        self.assertGreater(recall, 0.93)