def calculateMissingWindowsForSequence(
        self,
        taxId,
        protId,
        seqIds,
        requestedShuffleIds,
        firstWindow,
        lastWindowStart,
        windowStep,
        reference="begin",
        shuffleType=db.Sources.ShuffleCDSv2_python):
    """Compute and store the profile windows still missing for one CDS.

    For the protein record (taxId, protId), fetch the stored results of
    every shuffle-id in requestedShuffleIds, work out which requested window
    starts have no value yet, compute only those windows, and write each
    updated record to the log file and (unless self._debugDoneWriteResults
    is set) back through the CDS helper.

    Args:
        taxId: Taxon id; passed to CDSHelper and used in record ids.
        protId: Protein id; passed to CDSHelper and used in record ids.
        seqIds: Sequence ids, parallel to requestedShuffleIds (same length,
            same order); only used to build the "id" of new records.
        requestedShuffleIds: Shuffle ids to process. Negative ids select the
            CDS's own sequence (cds.sequence()) instead of a shuffled one.
        firstWindow: Accepted for interface compatibility; not used here.
        lastWindowStart: With reference="begin", upper limit (inclusive) on
            window starts. With reference="end", lower limit (inclusive).
        windowStep: Distance between consecutive window starts.
        reference: "begin" (window grid starts at 0) or "end" (window grid
            is aligned to the last possible window start).
        shuffleType: Shuffle source identifier forwarded to the CDS helper
            and stored in new records.

    Returns:
        None. Returns early if nothing is missing for any requested id.

    Raises:
        Exception: unsupported reference, missing species temperature,
            sequence shorter than the window, no windows to compute,
            empty sequence, or sequence length mismatch.
        AssertionError: violated internal invariants (including an unknown
            self._seriesSourceNumber).
    """
    timerForPreFolding.start()
    logging.warning("Parameters: %d %s %s %s %d %d %s %d" %
                    (taxId, protId, seqIds, requestedShuffleIds,
                     lastWindowStart, windowStep, reference, shuffleType))
    f = self._logfile
    windowWidth = self._windowWidth
    seriesSourceNumber = self._seriesSourceNumber

    assert (len(seqIds) > 0)
    assert (len(seqIds) == len(requestedShuffleIds))

    # The native-temperature series folds at the species' growth temperature,
    # so that value must be available before any work is done.
    optimalSpeciesGrowthTemperature = None
    if (seriesSourceNumber ==
            db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp):
        (numericalProp, _) = getSpeciesTemperatureInfo(taxId)
        optimalSpeciesGrowthTemperature = numericalProp[0]
        if optimalSpeciesGrowthTemperature is None:
            raise Exception(
                "No temperature value for taxid={}, can't calculate native-temperature folding profile..."
                .format(taxId))
        optimalSpeciesGrowthTemperature = float(
            optimalSpeciesGrowthTemperature)
        assert (-30.0 <= optimalSpeciesGrowthTemperature <= 150.0)

    if reference not in ("begin", "end"):
        timerForPreFolding.stop()
        e = "Specificed profile reference '%s' is not supported! (" % reference
        logging.error(e)
        raise Exception(e)

    # We will process all listed shuffle-ids for the following protein record
    cds = CDSHelper(taxId, protId)
    cdsLength = cds.length()

    if (cdsLength < windowWidth):
        # The original message referenced an undefined name (itemToProcess);
        # identify the item by its taxon and protein ids instead.
        e = "Refusing to process item %s because the sequence length (%d nt) is less than the window size (%d nt)\n" % (
            "%s/%s" % (taxId, protId), cdsLength, windowWidth)
        f.write(e)
        logging.error(e)
        timerForPreFolding.stop()
        raise Exception(e)

    # Create the set of window starts requested for this CDS
    if reference == "begin":
        # NOTE(review): the upper bound (length - width - 1, exclusive)
        # leaves out the last two possible window starts, unlike the "end"
        # branch below. Kept unchanged because stored profiles may depend on
        # it - confirm whether this is intentional.
        requestedWindowStarts = frozenset(
            range(0, min(lastWindowStart + 1, cdsLength - windowWidth - 1),
                  windowStep))
    else:
        # reference == "end": offset the grid so that the last possible
        # window start lies on it, then keep starts >= lastWindowStart.
        lastPossibleWindowStart = cdsLength - windowWidth
        requestedWindowStarts = frozenset(
            x for x in range(lastPossibleWindowStart % windowStep,
                             lastPossibleWindowStart + 1, windowStep)
            if x >= lastWindowStart)

    # Applies to both reference modes: without any window there is nothing
    # to compute, and max() below would fail on an empty set.
    if not requestedWindowStarts:
        e = "No windows exist for calculation taxid=%d, protId=%s, CDS-length=%d, lastWindowStart=%d, windowStep=%d, windowWidth=%d - Skipping...\n" % (
            taxId, protId, cdsLength, lastWindowStart, windowStep,
            windowWidth)
        f.write(e)
        logging.error(e)
        timerForPreFolding.stop()
        raise Exception(e)

    # Profiles are indexed by window start, so they must hold max start + 1
    # entries.
    requiredProfileLength = max(requestedWindowStarts) + 1

    # First, read available results (for all shuffle-ids).
    # The result maps shuffle-id -> record; None marks a requested id that
    # has no stored result yet.
    logging.info("DEBUG: requestedShuffleIds (%d items): %s\n" %
                 (len(requestedShuffleIds), requestedShuffleIds))
    existingResults = cds.getCalculationResult2(seriesSourceNumber,
                                                requestedShuffleIds,
                                                True,
                                                shuffleType=shuffleType)
    assert (len(existingResults) == len(requestedShuffleIds))
    logging.info("requestedShuffleIds: %s" % requestedShuffleIds)
    logging.info("existingResults.keys(): %s" % existingResults.keys())
    assert (frozenset(requestedShuffleIds) == frozenset(
        existingResults.keys()))
    logging.info("DEBUG: existingResults (%d items): %s\n" %
                 (len(existingResults), existingResults))

    # Check for which of the requested shuffle-ids there are values missing.
    # Maps shuffle-id -> sorted list of window starts that must be computed.
    shuffleIdsToProcess = {}
    for shuffleId, r in existingResults.items():
        if r is None:
            # No stored results for this shuffle-id: if it was requested,
            # every requested window must be calculated.
            if shuffleId in requestedShuffleIds:
                shuffleIdsToProcess[shuffleId] = sorted(
                    requestedWindowStarts)
            continue

        logging.info("/// shuffleId r = %d %s" % (shuffleId, r))
        logging.info("r[MFE-profile] %s" % r["MFE-profile"])

        # Indices (= window starts) of all values already stored
        alreadyProcessedWindowStarts = frozenset(
            i for i, x in enumerate(r["MFE-profile"]) if x is not None)
        missingWindows = requestedWindowStarts - alreadyProcessedWindowStarts
        if missingWindows:
            shuffleIdsToProcess[shuffleId] = sorted(missingWindows)

    if (not shuffleIdsToProcess):
        e = "All requested shuffle-ids in (taxId: %d, protId: %s, seqs: %s) seem to have already been processed. Skipping...\n" % (
            taxId, protId, str(list(zip(seqIds, requestedShuffleIds))))
        logging.warning(e)
        timerForPreFolding.stop()
        return

    logging.info("DEBUG: shuffleIdsToProcess (%d items): %s\n" %
                 (len(shuffleIdsToProcess), shuffleIdsToProcess))
    logging.info("DEBUG: Before (%d items): %s\n" %
                 (len(existingResults), existingResults))

    # Initialize new results records
    for shuffleId in shuffleIdsToProcess:
        if existingResults[shuffleId] is None:
            logging.info(seqIds)
            logging.info(requestedShuffleIds)
            logging.info(shuffleId)
            thisSeqId = seqIds[requestedShuffleIds.index(shuffleId)]
            existingResults[shuffleId] = {
                "id": "%s/%s/%d/%d" % (taxId, protId, thisSeqId, shuffleId),
                "seq-crc": None,
                "MFE-profile": [],
                "MeanMFE": None,
                "v": 2,
                "shuffle-type": shuffleType
            }

    logging.info("DEBUG: existingResults (%d items): %s\n" %
                 (len(existingResults), existingResults))
    timerForPreFolding.stop()

    # Process only the records that have missing windows, and for each of
    # them only the missing windows. Re-folding windows that already have a
    # value is wasted work, and pushing them again would presumably count
    # them twice in the running mean.
    # TODO - combine loading of multiple sequences into one DB operation
    for shuffleId, windowsToCompute in shuffleIdsToProcess.items():
        record = existingResults[shuffleId]
        timerForPreFolding.start()

        # Get the sequence for this entry (negative ids = unshuffled CDS)
        if (shuffleId < 0):
            seq = cds.sequence()
            annotatedSeqId = cds.seqId()
        else:
            seq = cds.getShuffledSeq(shuffleId, shuffleType)
            annotatedSeqId = cds.getShuffledSeqId(shuffleId, shuffleType)

        if (seq is None or len(seq) == 0):
            # Gather diagnostics: re-read the sequence through the other
            # access paths (dropping any cached copy) to help locate where
            # the empty value comes from.
            seq2 = cds.getShuffledSeq2(annotatedSeqId)
            seq3 = cds._fetchSequence(annotatedSeqId)
            seq4 = cds._cache.get("%d:seq" % annotatedSeqId)
            if seq4 is not None:
                del cds._cache["%d:seq" % annotatedSeqId]
            seq5 = cds.getShuffledSeq2(annotatedSeqId)
            e = "Got empty sequence for shuffleId=%d, seqId=%d, taxId=%d, protId=%s, numShuffled=%d, ids[%d:%d]=%s, len(seq2)=%d, len(seq3)=%d, len(seq4)=%d, len(seq5)=%d" % (
                shuffleId, annotatedSeqId, taxId, protId,
                len(cds.shuffledSeqIds()), shuffleId - 2, shuffleId + 2,
                cds.shuffledSeqIds()[shuffleId - 2:shuffleId + 2],
                len(seq2) if seq2 is not None else -1,
                len(seq3) if seq3 is not None else -1,
                len(seq4) if seq4 is not None else -1,
                len(seq5) if seq5 is not None else -1)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        expectedSeqLength = cds.length()
        if (expectedSeqLength is not None
                and expectedSeqLength != len(seq)):
            e = "Warning: taxid=%d, protid=%s, seqid=%d - unexpected length %d (expected: %d)\n" % (
                taxId, protId, annotatedSeqId, len(seq), expectedSeqLength)
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        if (len(seq) < windowWidth):
            # Sequence is shorter than the required window
            e = "Warning: skipping sequence because it is shorter than the requested window...\n"
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        logging.info(
            "DEBUG: Processing item taxId=%d, protId=%s, shuffle=%d (length=%d, %d windows)...\n"
            % (taxId, protId, shuffleId, len(seq), len(windowsToCompute)))
        logging.info(seq[:50])

        MFEprofile = record["MFE-profile"]

        # Make sure the profile holds an entry for every requested window
        # start (and, for non-contiguous windows, the gaps between them).
        # The index max(requestedWindowStarts) must itself be valid, hence
        # the "+ 1" folded into requiredProfileLength.
        if (len(MFEprofile) < requiredProfileLength):
            MFEprofile.extend([None] *
                              (requiredProfileLength - len(MFEprofile)))
        assert (len(MFEprofile) >= requiredProfileLength)

        # Seed the statistics with the values already stored; the windows
        # computed below are disjoint from these.
        stats = RunningStats()
        stats.extend([x for x in MFEprofile if x is not None])

        timerForPreFolding.stop()
        timerForFolding.start()

        for start in windowsToCompute:
            fragment = seq[start:(start + windowWidth)]
            assert (len(fragment) == windowWidth)

            if seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2:
                # Calculate the RNA folding energy. This is the
                # computation-heavy part.
                energy = RNAfold_direct(fragment)
                assert (energy <= 0.0)

            elif seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp:
                # Same, at the species' optimal growth temperature
                energy = RNAfold_direct(
                    fragment,
                    explicitCalculationTemperature=
                    optimalSpeciesGrowthTemperature)
                assert (energy <= 0.0)

            elif seriesSourceNumber == db.Sources.TEST_StepFunction_BeginReferenced:
                # Synthetic test series: depends only on the window start
                if shuffleId < 0:
                    energy = 0
                else:
                    energy = start % 50 - 20

            elif seriesSourceNumber == db.Sources.TEST_StepFunction_EndReferenced:
                # Synthetic test series: depends on the distance of the
                # window from the end of the sequence
                if shuffleId < 0:
                    energy = 0
                else:
                    energy = (expectedSeqLength - windowWidth -
                              start) % 50 - 20

            else:
                e = "Received unknown seriesSourceNumber {}".format(
                    seriesSourceNumber)
                logging.error(e)
                # Raised explicitly (not via "assert False") so the check
                # is still enforced when Python runs with -O.
                raise AssertionError(e)

            # Store the calculation result
            stats.push(energy)
            MFEprofile[start] = energy

        print(
            "/////////////////// shuffleId={} (len={}) //////////////////////////"
            .format(shuffleId, expectedSeqLength))
        prettyPrintProfile(MFEprofile)

        timerForFolding.stop()
        timerForPostFolding.start()

        # Format the record and write it out
        record["seq-crc"] = calcCrc(seq)
        # Round items to save space (these are not exact numbers anyway)
        record["MFE-profile"] = [round4(x) for x in MFEprofile]
        record["MeanMFE"] = stats.mean()
        result = json.dumps(record)

        f.write(result)
        f.write("\n")

        if (not self._debugDoneWriteResults):
            cds.saveCalculationResult2(seriesSourceNumber, result,
                                       annotatedSeqId, False)

        timerForPostFolding.stop()

    # Commit all saved records once, after every shuffle-id was processed
    timerForPostFolding.start()
    if (not self._debugDoneWriteResults):
        cds.commitChanges()
    timerForPostFolding.stop()
warnings.update(("has-broken-codons", )) continue nativeCodons = Counter(splitCodons(nativeSeq)) hasMismatchedCodons = False allNativeSeqs[protId] = nativeSeq hashesForShuffles = set() #for i, c in enumerate(splitCodons(nativeSeq)[:maxCodons]): # nativeColumns[i].append(c) shuffledSeqs = [] for shuffleId in range(len(allIds)): shuffledSeq = cds.getShuffledSeq(shuffleId, shuffleType) shuffledCodons = Counter(splitCodons(shuffledSeq)) hashesForShuffles.add(md5(shuffledSeq).hexdigest()) if shuffledCodons != nativeCodons: warnings.update(("num-horizontal-codon-mismatch", )) hasMismatchedCodons = True shuffledSeqs.append(shuffledSeq) #for i, c in enumerate(splitCodons(shuffledSeq)[:maxCodons]): # shuffledColumns[shuffleId][i].append(c) numUniqueShuffles.update((len(hashesForShuffles), )) if len(hashesForShuffles) != len(allIds):