# Exclude some sequences from the calculation # ------------------------------------------------------------------------------------------ # Skip sequences with partial CDS annotations #if(r.exists("CDS:taxid:%d:protid:%s:partial" % (taxIdForProcessing, protId))): # skipped += 1 # continue #if( not r.exists(nativeCdsSeqIdKey % (taxIdForProcessing, protId)) ): # skipped +=1 # continue cds = CDSHelper(taxIdForProcessing, protId) seqLength = cds.length() stopCodonPos = cds.CDSlength() if seqLength is None: print( "Warning: Could not find CDS length entry for taxid=%d, protid=%s" % (taxIdForProcessing, protId)) skipped += 1 stats['skipped-cds-length-missing'] += 1 continue # Skip sequences with length <40nt (window width) if (seqLength < windowWidth + 1): print("short seq") stats['skipped-short-seq'] += 1 skipped += 1 continue
def calculateMissingWindowsForSequence(self, taxId, protId, seqIds, requestedShuffleIds, firstWindow, lastWindowStart, windowStep, reference="begin", shuffleType=db.Sources.ShuffleCDSv2_python, debug=False):
    """Compute and store the sliding-window values still missing for one protein.

    For every requested shuffle-id, the existing result record (if any) is
    fetched through CDSHelper.getCalculationResult2(). Window starts that were
    requested but have no value yet are computed with the calculation selected
    by self._seriesSourceNumber, merged into the record's "MFE-profile", and
    the updated record is written to self._logfile and (unless
    self._debugDoneWriteResults is set) saved through the CDSHelper.

    Args:
        taxId: Taxon id of the species (formatted with %d).
        protId: Protein id identifying the CDS.
        seqIds: Sequence ids, parallel to requestedShuffleIds (same length,
            must be non-empty).
        requestedShuffleIds: Shuffle ids to process. A negative id selects
            the native sequence (cds.sequence()); other ids select
            cds.getShuffledSeq(id, shuffleType).
        firstWindow: Accepted for interface compatibility; not used.
        lastWindowStart: Interpretation depends on `reference`:
            "begin"    - highest window start to include;
            "end"      - lowest window start to include;
            "stop3utr" - window starts closer than
                         (lastWindowStart//2)*windowStep nt to the stop
                         codon position (cds.CDSlength()) are included.
        windowStep: Distance (nt) between consecutive window starts.
        reference: One of "begin", "end", "stop3utr".
        shuffleType: Shuffle source passed through to the CDSHelper calls.
        debug: When true, pretty-print each profile after it is computed.

    Returns:
        None. Returns early (after logging) when nothing is missing.

    Raises:
        Exception: unsupported `reference`; missing species temperature for
            the native-temperature series; sequence shorter than the window;
            no windows available (reference="begin"); empty sequence; or a
            sequence whose length differs from cds.length().
        AssertionError: on violated internal sanity checks.
    """
    timerForPreFolding.start()
    logging.warning("Parameters: %d %s %s %s %d %d %s %d" % (taxId, protId, seqIds, requestedShuffleIds, lastWindowStart, windowStep, reference, shuffleType))

    f = self._logfile

    assert(len(seqIds) > 0)
    assert(len(seqIds) == len(requestedShuffleIds))

    # Identifier used in messages. (The original code referenced an undefined
    # name `itemToProcess` here, turning the error path into a NameError.)
    itemDesc = "%s/%s" % (taxId, protId)

    # ------------------------------------------------------------------------
    # Obtain species-dependent properties needed for some calculations

    # ----------------
    # Optimal Temp
    optimalSpeciesGrowthTemperature = None
    if self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp:
        (numericalProp, _) = getSpeciesTemperatureInfo(taxId)
        optimalSpeciesGrowthTemperature = numericalProp[0]

        if optimalSpeciesGrowthTemperature is None:
            # Stop the timer before raising, like every other error path here
            timerForPreFolding.stop()
            raise Exception("No temperature value for taxid={}, can't calculate native-temperature folding profile...".format(taxId))

        optimalSpeciesGrowthTemperature = float(optimalSpeciesGrowthTemperature)
        assert(optimalSpeciesGrowthTemperature >= -30.0 and optimalSpeciesGrowthTemperature <= 150.0)

    # ----------------
    # Genomic translation table
    genomicTranslationTable = None
    if self._seriesSourceNumber in (db.Sources.StopCodon_content_SlidingWindow30, db.Sources.StopCodon_content_SlidingWindow40, db.Sources.StopCodon_content_SlidingWindow50):
        genomicTranslationTable = getSpeciesTranslationTable(taxId)
        assert(genomicTranslationTable > 0 and genomicTranslationTable <= 31)

    if reference not in ("begin", "end", "stop3utr"):
        timerForPreFolding.stop()
        e = "Specificed profile reference '%s' is not supported!" % reference
        logging.error(e)
        raise Exception(e)

    # We will process all listed shuffle-ids for the following protein record
    if reference == "stop3utr":
        regionOfInterest = RegionsOfInterset.CDSand3UTR
    else:  # "begin" or "end"
        regionOfInterest = RegionsOfInterset.CDSonly

    cds = CDSHelper(taxId, protId, regionOfInterest=regionOfInterest)

    if cds.length() < self._windowWidth:
        e = "Refusing to process item %s because the sequence length (%d nt) is less than the window size (%d nt)\n" % (itemDesc, cds.length(), self._windowWidth)
        f.write(e)
        logging.error(e)
        timerForPreFolding.stop()
        raise Exception(e)

    # ------------------------------------------------------------------------
    # Create the set of window starts requested for this CDS
    if reference == "begin":
        # NOTE(review): the upper bound (length - windowWidth - 1, exclusive) leaves out
        # the last two possible window starts, whereas reference=="end" goes up to
        # length - windowWidth inclusive. Kept as-is; confirm this is intentional.
        requestedWindowStarts = frozenset(range(0, min(lastWindowStart+1, cds.length()-self._windowWidth-1), windowStep))
        if len(requestedWindowStarts) == 0:
            e = "No windows exist for calculation taxid=%d, protId=%s, CDS-length=%d, lastWindowStart=%d, windowStep=%d, windowWidth=%d - Skipping...\n" % (taxId, protId, cds.length(), lastWindowStart, windowStep, self._windowWidth)
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

    elif reference == "end":
        # Window starts are aligned so that the last one is the last possible start;
        # lastWindowStart acts as a lower bound here.
        lastPossibleWindowStart = cds.length() - self._windowWidth
        requestedWindowStarts = frozenset(x for x in range(lastPossibleWindowStart % windowStep, lastPossibleWindowStart+1, windowStep) if x >= lastWindowStart)

    else:  # reference == "stop3utr"
        seqLength = cds.length()
        stopCodonPos = cds.CDSlength()
        maxDistance = (lastWindowStart//2)*windowStep
        # Select the actual nucleotide positions that are close enough to the stop codon.
        # (The original paired a per-step mask with range(seqLength) through compress(),
        # which yields step indices instead of positions whenever windowStep != 1.)
        requestedWindowStarts = frozenset(pos for pos in range(0, seqLength - self._windowWidth, windowStep) if abs(pos-stopCodonPos) < maxDistance)

    # ------------------------------------------------------------------------
    # Read available results (for all shuffle-ids).
    # The result maps shuffle-id -> record; a shuffle-id without stored results maps to None.
    logging.info("DEBUG: requestedShuffleIds (%d items): %s\n" % (len(requestedShuffleIds), requestedShuffleIds))
    existingResults = cds.getCalculationResult2(self._seriesSourceNumber, requestedShuffleIds, True, shuffleType=shuffleType)
    assert(len(existingResults) == len(requestedShuffleIds))
    logging.info("requestedShuffleIds: %s" % requestedShuffleIds)
    logging.info("existingResults.keys(): %s" % list(existingResults.keys()))
    assert(frozenset(requestedShuffleIds) == frozenset(list(existingResults.keys())))
    logging.info("DEBUG: existingResults (%d items): %s\n" % (len(existingResults), existingResults))

    # Determine, for each requested shuffle-id, which window starts are still missing
    shuffleIdsToProcess = {}
    for shuffleId, r in list(existingResults.items()):
        if r is None:
            # No existing results for this shuffle-id; if it was requested, all windows must be calculated
            if shuffleId in requestedShuffleIds:
                shuffleIdsToProcess[shuffleId] = list(requestedWindowStarts)

            # NOTE(review): the timer is stopped here although pre-folding work continues
            # (looks like a leftover from a version that aborted at this point); kept as-is.
            timerForPreFolding.stop()
            continue  # TODO - verify this line; should we abort this sequence by throwing????

        logging.info("/// shuffleId r = %d %s" % (shuffleId, r))
        logging.info("r[MFE-profile] %s" % r["MFE-profile"])

        # Indices (=window starts) of all non-None values in the stored profile
        alreadyProcessedWindowStarts = frozenset(i for i, x in enumerate(r["MFE-profile"]) if x is not None)
        missingWindows = requestedWindowStarts - alreadyProcessedWindowStarts
        if missingWindows:
            shuffleIdsToProcess[shuffleId] = missingWindows

    if not shuffleIdsToProcess:
        e = "All requested shuffle-ids in (taxId: %d, protId: %s, seqs: %s) seem to have already been processed. Skipping...\n" % (taxId, protId, str(list(zip(seqIds, requestedShuffleIds))))
        logging.warning(e)
        timerForPreFolding.stop()
        return

    logging.info("DEBUG: shuffleIdsToProcess (%d items): %s\n" % (len(shuffleIdsToProcess), shuffleIdsToProcess))
    logging.info("DEBUG: Before (%d items): %s\n" % (len(existingResults), existingResults))

    # Initialize new result records for shuffle-ids that have none yet
    for shuffleId in list(shuffleIdsToProcess.keys()):
        if existingResults[shuffleId] is None:
            logging.info(seqIds)
            logging.info(requestedShuffleIds)
            logging.info(shuffleId)

            thisSeqId = seqIds[requestedShuffleIds.index(shuffleId)]

            existingResults[shuffleId] = {"id": "%s/%s/%d/%d" % (taxId, protId, thisSeqId, shuffleId), "seq-crc": None, "MFE-profile": [], "MeanMFE": None, "v": 2, "shuffle-type": shuffleType}
    logging.info("DEBUG: existingResults (%d items): %s\n" % (len(existingResults), existingResults))
    timerForPreFolding.stop()

    # ------------------------------------------------------------------------
    # Process each shuffle-id that has missing windows
    # TODO - combine loading of multiple sequences into one DB operation
    for shuffleId, record in list(existingResults.items()):
        # Only the windows recorded as missing are computed. Recomputing windows that
        # already have values wastes the expensive folding step and would count those
        # windows twice in the running mean.
        windowsToCompute = shuffleIdsToProcess.get(shuffleId)
        if record is None or not windowsToCompute:
            logging.info("DEBUG: skipping empty results record for shuffleId={}".format(shuffleId))
            continue
        windowsToCompute = sorted(windowsToCompute)

        timerForPreFolding.start()

        # Get the sequence for this entry (negative shuffle-id = native sequence)
        if shuffleId < 0:
            seq = cds.sequence()
            annotatedSeqId = cds.seqId()
        else:
            seq = cds.getShuffledSeq(shuffleId, shuffleType)
            annotatedSeqId = cds.getShuffledSeqId(shuffleId, shuffleType)

        if seq is None or len(seq) == 0:
            # Collect diagnostics through the alternative access paths before failing
            seq2 = cds.getShuffledSeq2(annotatedSeqId)
            seq3 = cds._fetchSequence(annotatedSeqId)
            seq4 = cds._cache.get("%d:seq" % annotatedSeqId)
            if seq4 is not None:
                del cds._cache["%d:seq" % annotatedSeqId]
            seq5 = cds.getShuffledSeq2(annotatedSeqId)
            e = "Got empty sequence for shuffleId=%d, seqId=%d, taxId=%d, protId=%s, numShuffled=%d, ids[%d:%d]=%s, len(seq2)=%d, len(seq3)=%d, len(seq4)=%d, len(seq5)=%d" % (shuffleId, annotatedSeqId, taxId, protId, len(cds.shuffledSeqIds()), shuffleId-2, shuffleId+2, cds.shuffledSeqIds()[shuffleId-2:shuffleId+2], len(seq2) if seq2 is not None else -1, len(seq3) if seq3 is not None else -1, len(seq4) if seq4 is not None else -1, len(seq5) if seq5 is not None else -1)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        # (A check that annotatedSeqId appears in seqIds is deliberately not performed:
        #  the calculation needn't include the native sequence.)

        expectedSeqLength = cds.length()
        if expectedSeqLength is not None and expectedSeqLength != len(seq):
            e = "Warning: taxid=%d, protid=%s, seqid=%d - unexpected length %d (expected: %d)\n" % (taxId, protId, annotatedSeqId, len(seq), expectedSeqLength)
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        if len(seq) < self._windowWidth:
            # Sequence is shorter than the required window
            e = "Warning: skipping sequence because it is shorter than the requested window...\n"
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        logging.info("DEBUG: Processing item taxId=%d, protId=%s, shuffle=%d (length=%d, %d windows)...\n" % (taxId, protId, shuffleId, len(seq), len(windowsToCompute)))

        logging.info(seq[:50])

        MFEprofile = record["MFE-profile"]

        # Make sure the profile array has a slot for every window we are about to write
        # (and, if windows are non-contiguous, for the entries between them).
        # The highest index written is lastNeededIndex, so the length must EXCEED it;
        # the original `<` comparison left the array one short when len == max.
        lastNeededIndex = windowsToCompute[-1]
        if len(MFEprofile) <= lastNeededIndex:
            MFEprofile.extend([None] * (lastNeededIndex - len(MFEprofile) + 1))
        assert(len(MFEprofile) > lastNeededIndex)

        # Seed the running statistics with the values already present; the windows
        # computed below are disjoint from these, so each value is counted once.
        stats = RunningStats()
        stats.extend([x for x in MFEprofile if x is not None])

        timerForPreFolding.stop()
        timerForFolding.start()
        for start in windowsToCompute:
            fragment = seq[start:(start+self._windowWidth)]
            assert(len(fragment) == self._windowWidth)

            if self._seriesSourceNumber in (db.Sources.RNAfoldEnergy_SlidingWindow30_v2, db.Sources.RNAfoldEnergy_SlidingWindow40_v2, db.Sources.RNAfoldEnergy_SlidingWindow50_v2):
                # Calculate the RNA folding energy. This is the computation-heavy part.
                windowValue = RNAfold_direct(fragment)
                assert(windowValue <= 0.0)

            elif self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp:
                # Same, but at the species' optimal growth temperature obtained above
                windowValue = RNAfold_direct(fragment, explicitCalculationTemperature = optimalSpeciesGrowthTemperature)
                assert(windowValue <= 0.0)

            elif self._seriesSourceNumber == db.Sources.GC_content_SlidingWindow40:
                windowValue = calcWindowGCContent(fragment)
                assert(isnan(windowValue) or (windowValue >= 0.0 and windowValue <= 1.0))

            elif self._seriesSourceNumber == db.Sources.Purine_content_SlidingWindow40:
                windowValue = calcWindowPurineContent(fragment)
                assert(isnan(windowValue) or (windowValue >= 0.0 and windowValue <= 1.0))

            elif self._seriesSourceNumber in (db.Sources.StopCodon_content_SlidingWindow30, db.Sources.StopCodon_content_SlidingWindow40, db.Sources.StopCodon_content_SlidingWindow50):
                windowValue = calcWindowStopCodonContent(fragment, translationTable=genomicTranslationTable, phase=start%3)
                assert(windowValue >= 0.0 and windowValue <= 1.0)

            elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_BeginReferenced:
                # Synthetic series: 0 for the native sequence, a step function of the position otherwise
                if shuffleId < 0:
                    windowValue = 0
                else:
                    windowValue = start%50 - 20

            elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_EndReferenced:
                if shuffleId < 0:
                    windowValue = 0
                else:
                    windowValue = (expectedSeqLength - self._windowWidth - start)%50 - 20

            else:
                logging.error("Received unknown seriesSourceNumber {}".format(self._seriesSourceNumber))
                assert(False)

            # Store the calculation result
            stats.push(windowValue)
            MFEprofile[start] = windowValue

        if debug:
            prettyPrintProfile(MFEprofile)

        timerForFolding.stop()
        timerForPostFolding.start()

        # Format the updated record
        record["seq-crc"] = getCrc(seq)
        record["MFE-profile"] = [round4(x) for x in MFEprofile]  # Round items down to save space (these are not exact numbers anyway)
        record["MeanMFE"] = stats.mean()
        if reference == "stop3utr":
            record["stop-codon-pos"] = cds.CDSlength()

        serializedRecord = json.dumps(record)
        f.write(serializedRecord)
        f.write("\n")

        if not self._debugDoneWriteResults:
            cds.saveCalculationResult2(self._seriesSourceNumber, serializedRecord, annotatedSeqId, False)

        timerForPostFolding.stop()

    timerForPostFolding.start()

    if not self._debugDoneWriteResults:
        cds.commitChanges()

    timerForPostFolding.stop()
# NOTE(review): interior of a per-update-record loop whose header is not visible here.

# Count how many windows in this profile carry a value, and tally that count
newValues = sum([1 for x in profile1 if not x is None])
windowsAddedToProfiles.update( (newValues,) )

# Parse the record identifier; records with a malformed id are counted, remembered and skipped
rawRecordId = updateRecord[1]['id']
try:
    recordId = splitLongSequenceIdentifier(rawRecordId)
except Exception as e:
    err[ErrorTypes.UpdateRecordFormatError] += 1
    badUpdateRecords.add(updateRecord[2])
    print(e)
    continue

# The first two components of the parsed id are passed to CDSHelper
# (presumably taxId and protId, matching its other uses -- verify)
cds = CDSHelper(recordId[0], recordId[1] )
cdsLength = cds.CDSlength()

# Positions (profile indices) that hold a value
newPositions = [x[0] for x in enumerate(profile1) if not x[1] is None]
for pos in newPositions:
    # Tally each position by its distance from the start / end of the CDS,
    # and by that distance modulo 10
    windowsAddedToProfiles_DistanceFromStart.update( (pos,) )
    windowsAddedToProfiles_DistanceFromEnd.update( (cdsLength-pos,) )
    windowsAddedToProfiles_FrameRelativeToStart.update( (pos%10,) )
    windowsAddedToProfiles_FrameRelativeToEnd.update( ((cdsLength-pos)%10,) )

# Periodic progress report; rl() decides when to print
# (presumably a rate limiter -- its definition is not visible here)
if(rl()):
    print(total, err, recordsByTaxId)

# DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #
#if( total > 90000):
def randomize(self, nucleotideSeq: str, protId: str) -> (int, float, str):
    """Randomize a CDS, its 3' flanking region and the downstream CDS.

    nucleotideSeq is split into three consecutive regions using lengths
    obtained from CDSHelper(self.taxId, protId):

    Case 1 (no overlap):
                            +--------intergenic--------+
                            |                          |
        +-------CDS1--------+                          +------------CDS2-----------+
        |                   |                          |                           |
        +===================+--------------------------+===========================+
        |<---cdsLengthNt--->|<-flankingRegionLengthNt->|                           |
        |                   |          (>= 0)          |                           |
        |<---------------------------cds.totalLength()---------------------------->|

    Case 2 (overlap):
                            +--------------------------CDS2-------------------------+
                            |                                                       |
        +---------------------CDS1----------------------+                           |
        |                   |                           |                           |
        +===================+===========================+===========================+
        |                   |<-flankingRegionLengthNt-->|                           |
        |                   |          (<= 0)           |                           |
        |<----------------cdsLengthNt------------------>|                           |
        |<----------------------------cds.totalLength()---------------------------->|

    The main CDS and the downstream CDS are randomized with self.cdsRand, the
    flanking region (when its length is positive) with self.utrRand. When
    self.constantOverlaps is set and the CDSs overlap, the overlapping part of
    the main CDS (extended back to a codon boundary) is copied unchanged.

    Args:
        nucleotideSeq: The full sequence (main CDS + flanking region + next CDS).
        protId: Protein id used to look up the region lengths.

    Returns:
        (totalPerms, totalIdentity, randomizedSequence): the product of the
        three permutation counts, the length-weighted average of the three
        identity values (as reported by randomizeAmbiguousSequence), and the
        concatenated randomized sequence.

    Raises:
        Exception: if the overlap is longer than the main CDS.
        AssertionError: if a region length is not as expected (e.g. not a
            multiple of 3).
    """
    helper = CDSHelper(self.taxId, protId)

    cdsLengthNt = helper.CDSlength()
    assert (cdsLengthNt % 3 == 0)

    flankLen = helper.flankingRegion3UtrLength()
    downstreamIsRevStrand = helper.nextCDSOnOppositeStrand()

    if flankLen < 0 and cdsLengthNt < -flankLen:
        raise Exception("Next CDS is fully overlapping...")

    # True only when an overlap exists AND it must be left untouched
    freezeOverlap = self.constantOverlaps and flankLen < 0

    #-----------------------------------------------------------------------------
    # Randomize the "main" CDS
    #-----------------------------------------------------------------------------
    if freezeOverlap:
        # Exclude the overlap (rounded down to a whole codon) from the randomized region
        overlapStart = cdsLengthNt + flankLen
        assert (overlapStart < cdsLengthNt)
        randomizedEnd = overlapStart - (overlapStart % 3)
        mainCDS = nucleotideSeq[:randomizedEnd]
        assert (len(mainCDS) % 3 == 0)
    else:
        # No overlap, or the overlap is randomized along with the rest
        mainCDS = nucleotideSeq[:cdsLengthNt]
        assert (len(mainCDS) == cdsLengthNt)

    (mainPerms, mainIdentity, shuffledMain) = self.cdsRand.randomizeAmbiguousSequence(mainCDS)

    if freezeOverlap:
        # Re-attach the part of the CDS that was kept constant
        shuffledMain = shuffledMain + nucleotideSeq[randomizedEnd:cdsLengthNt]
        assert (len(shuffledMain) % 3 == 0)

    # The resulting sequence must be as long as the original CDS
    assert (len(shuffledMain) == cdsLengthNt)

    #-----------------------------------------------------------------------------
    # Randomize the 3'UTR
    #-----------------------------------------------------------------------------
    if flankLen > 0:
        utrSeq = nucleotideSeq[cdsLengthNt:cdsLengthNt + flankLen]
        assert (len(utrSeq) == flankLen)
        (utrPerms, utrIdentity, shuffledUTR) = self.utrRand.randomizeAmbiguousSequence(utrSeq)
    else:
        utrSeq, utrPerms, utrIdentity, shuffledUTR = "", 1, 1.0, ""

    #-----------------------------------------------------------------------------
    # Randomize the downstream CDS
    #-----------------------------------------------------------------------------
    # The slice start works for both positive and negative flanking lengths
    downstreamSeq = nucleotideSeq[cdsLengthNt + flankLen:]
    assert (len(downstreamSeq) % 3 == 0)

    if downstreamIsRevStrand:
        # Randomize in the CDS's own reading direction
        downstreamSeq = str(Seq(downstreamSeq, generic_dna).reverse_complement())
    assert (len(downstreamSeq) % 3 == 0)

    (downstreamPerms, downstreamIdentity, shuffledDownstream) = self.cdsRand.randomizeAmbiguousSequence(downstreamSeq)

    if downstreamIsRevStrand:
        # Reverse-complement back to the original orientation
        shuffledDownstream = str(Seq(shuffledDownstream, generic_dna).reverse_complement())

    if flankLen < 0:
        # The overlapping prefix is already part of the main CDS output
        shuffledDownstream = shuffledDownstream[-flankLen:]

    #-----------------------------------------------------------------------------
    # Combine
    #-----------------------------------------------------------------------------
    mainLen = len(mainCDS)
    utrLen = len(utrSeq)
    downstreamLen = len(downstreamSeq)

    totalPerms = mainPerms * utrPerms * downstreamPerms
    weightedIdentity = (mainIdentity * mainLen) + (utrIdentity * utrLen) + (downstreamIdentity * downstreamLen)
    totalIdentity = weightedIdentity / (mainLen + utrLen + downstreamLen)

    return (totalPerms, totalIdentity, shuffledMain + shuffledUTR + shuffledDownstream)