Ejemplo n.º 1
0
    def calculateMissingWindowsForSequence(
            self,
            taxId,
            protId,
            seqIds,
            requestedShuffleIds,
            firstWindow,
            lastWindowStart,
            windowStep,
            reference="begin",
            shuffleType=db.Sources.ShuffleCDSv2_python):
        """Compute and store the sliding-window profile for one CDS.

        For the protein record identified by (taxId, protId), the profile
        value of every requested window is computed for each requested
        shuffle-id, merged into the stored result record (key
        "MFE-profile"), written to the log file and, unless
        ``self._debugDoneWriteResults`` is set, saved through the CDS helper.

        Args:
            taxId: Taxon id of the record (passed to ``CDSHelper``).
            protId: Protein id of the record (passed to ``CDSHelper``).
            seqIds: Sequence ids, parallel to ``requestedShuffleIds``; used
                to build the "id" field of newly created result records.
            requestedShuffleIds: Shuffle ids to process. A negative id
                selects the native sequence (``cds.sequence()``).
            firstWindow: Accepted for interface compatibility; not used by
                this method.
            lastWindowStart: With ``reference="begin"`` this bounds the window
                starts from above; with ``reference="end"`` only starts
                ``>= lastWindowStart`` are kept.
            windowStep: Distance between consecutive window starts.
            reference: ``"begin"`` or ``"end"``; selects how the set of
                window starts is laid out.
            shuffleType: Shuffle source passed to the CDS helper and stored
                in new result records.

        Returns:
            None. Returns early (after logging a warning) when no requested
            shuffle-id has missing windows.

        Raises:
            Exception: unsupported ``reference``; missing species temperature
                for the native-temperature series; CDS shorter than the
                window; no windows to compute (``reference="begin"``); empty
                or unexpectedly sized sequence.
            AssertionError: on violated internal invariants (mismatched input
                lengths, positive energies, unknown series source number).
        """
        timerForPreFolding.start()
        logging.warning("Parameters: %d %s %s %s %d %d %s %d" %
                        (taxId, protId, seqIds, requestedShuffleIds,
                         lastWindowStart, windowStep, reference, shuffleType))
        f = self._logfile

        assert (len(seqIds) > 0)
        assert (len(seqIds) == len(requestedShuffleIds))

        # The native-temperature series needs the species' growth temperature.
        optimalSpeciesGrowthTemperature = None
        if (self._seriesSourceNumber ==
                db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp):
            (numericalProp, _) = getSpeciesTemperatureInfo(taxId)
            optimalSpeciesGrowthTemperature = numericalProp[0]

            if optimalSpeciesGrowthTemperature is None:
                # Stop the timer before raising, like every other error path here.
                timerForPreFolding.stop()
                raise Exception(
                    "No temperature value for taxid={}, can't calculate native-temperature folding profile..."
                    .format(taxId))

            optimalSpeciesGrowthTemperature = float(
                optimalSpeciesGrowthTemperature)
            assert (-30.0 <= optimalSpeciesGrowthTemperature <= 150.0)

        if reference not in ("begin", "end"):
            timerForPreFolding.stop()
            e = "Specificed profile reference '%s' is not supported! (" % reference
            logging.error(e)
            raise Exception(e)

        # We will process all listed shuffle-ids for the following protein record
        cds = CDSHelper(taxId, protId)

        if (cds.length() < self._windowWidth):
            # The original referenced an undefined name here (NameError);
            # identify the item by its taxon/protein ids instead.
            itemLabel = "%s/%s" % (taxId, protId)
            e = "Refusing to process item %s because the sequence length (%d nt) is less than the window size (%d nt)\n" % (
                itemLabel, cds.length(), self._windowWidth)
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        # Create the set of window starts we need to calculate for this CDS
        if reference == "begin":
            # NOTE(review): the "- 1" in the upper bound excludes the last two
            # possible window starts (length - width - 1 and length - width);
            # confirm this is intended.
            requestedWindowStarts = frozenset(
                range(
                    0,
                    min(lastWindowStart + 1,
                        cds.length() - self._windowWidth - 1), windowStep))
            if not requestedWindowStarts:
                e = "No windows exist for calculation taxid=%d, protId=%s, CDS-length=%d, lastWindowStart=%d, windowStep=%d, windowWidth=%d - Skipping...\n" % (
                    taxId, protId, cds.length(), lastWindowStart, windowStep,
                    self._windowWidth)
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)
        else:  # reference == "end" (validated above)
            # Window starts are aligned so that the last one ends exactly at
            # the end of the CDS; lastWindowStart acts as a lower bound here.
            lastPossibleWindowStart = cds.length() - self._windowWidth
            requestedWindowStarts = frozenset(
                x for x in range(lastPossibleWindowStart % windowStep,
                                 lastPossibleWindowStart + 1, windowStep)
                if x >= lastWindowStart)

        # First, read available results (for all shuffle-ids).
        # The result maps shuffle-id -> record; ids with no stored results map to None.
        logging.info("DEBUG: requestedShuffleIds (%d items): %s\n" %
                     (len(requestedShuffleIds), requestedShuffleIds))
        existingResults = cds.getCalculationResult2(self._seriesSourceNumber,
                                                    requestedShuffleIds,
                                                    True,
                                                    shuffleType=shuffleType)
        assert (len(existingResults) == len(requestedShuffleIds))
        logging.info("requestedShuffleIds: %s" % requestedShuffleIds)
        logging.info("existingResults.keys(): %s" % existingResults.keys())
        assert (frozenset(requestedShuffleIds) == frozenset(
            existingResults.keys()))
        logging.info("DEBUG: existingResults (%d items): %s\n" %
                     (len(existingResults), existingResults))

        # Check for which of the requested shuffle-ids there are values missing.
        # NOTE(review): the per-id window collections stored here are only used
        # to decide *whether* an id needs work; the folding loop below
        # recomputes every requested window for every non-empty record.
        shuffleIdsToProcess = {}
        for shuffleId, r in existingResults.items():
            if r is None:
                # There are no existing results for this shuffle-id. If it was requested, it should be calculated now (including all windows)
                if shuffleId in requestedShuffleIds:
                    shuffleIdsToProcess[shuffleId] = list(
                        requestedWindowStarts)

                # NOTE(review): this stops the timer once per missing record
                # and it is stopped again below; confirm the timer tolerates
                # repeated stop() calls.
                timerForPreFolding.stop()

                continue  # TODO - verify this line; should we abort this sequence by throwing????

            logging.info("/// shuffleId r = %d %s" % (shuffleId, r))
            logging.info("r[MFE-profile] %s" % r["MFE-profile"])

            # Indices (=window starts) of all non-None values already stored
            alreadyProcessedWindowStarts = frozenset(
                i for i, x in enumerate(r["MFE-profile"]) if x is not None)
            # Are there any requested windows that are not already computed?
            missingWindows = requestedWindowStarts - alreadyProcessedWindowStarts
            if missingWindows:
                shuffleIdsToProcess[shuffleId] = missingWindows

        if not shuffleIdsToProcess:
            e = "All requested shuffle-ids in (taxId: %d, protId: %s, seqs: %s) seem to have already been processed. Skipping...\n" % (
                taxId, protId, str(list(zip(seqIds, requestedShuffleIds))))
            logging.warning(e)
            timerForPreFolding.stop()
            return
        logging.info("DEBUG: shuffleIdsToProcess (%d items): %s\n" %
                     (len(shuffleIdsToProcess), shuffleIdsToProcess))

        logging.info("DEBUG: Before (%d items): %s\n" %
                     (len(existingResults), existingResults))
        # Initialize new results records for ids that have nothing stored yet
        for shuffleId in shuffleIdsToProcess:
            if existingResults[shuffleId] is None:
                logging.info(seqIds)
                logging.info(requestedShuffleIds)
                logging.info(shuffleId)
                thisSeqId = seqIds[requestedShuffleIds.index(shuffleId)]

                existingResults[shuffleId] = {
                    "id":
                    "%s/%s/%d/%d" % (taxId, protId, thisSeqId, shuffleId),
                    "seq-crc": None,
                    "MFE-profile": [],
                    "MeanMFE": None,
                    "v": 2,
                    "shuffle-type": shuffleType
                }
        logging.info("DEBUG: existingResults (%d items): %s\n" %
                     (len(existingResults), existingResults))
        timerForPreFolding.stop()

        # Load the sequences of all shuffle-ids we need to work on
        # TODO - combine loading of multiple sequences into one DB operation
        for shuffleId, record in existingResults.items():
            if record is None:
                logging.info(
                    "DEBUG: skipping empty results record for shuffleId={}".
                    format(shuffleId))
                continue
            timerForPreFolding.start()

            # Get the sequence for this entry (negative ids mean the native sequence)
            if (shuffleId < 0):
                seq = cds.sequence()
                annotatedSeqId = cds.seqId()
            else:
                seq = cds.getShuffledSeq(shuffleId, shuffleType)
                annotatedSeqId = cds.getShuffledSeqId(shuffleId, shuffleType)

            if seq is None or len(seq) == 0:
                # Re-fetch the sequence through several paths purely to build
                # a detailed diagnostic message before aborting.
                seq2 = cds.getShuffledSeq2(annotatedSeqId)
                seq3 = cds._fetchSequence(annotatedSeqId)
                seq4 = cds._cache.get("%d:seq" % annotatedSeqId)
                if seq4 is not None:
                    del cds._cache["%d:seq" % annotatedSeqId]
                seq5 = cds.getShuffledSeq2(annotatedSeqId)
                e = "Got empty sequence for shuffleId=%d, seqId=%d, taxId=%d, protId=%s, numShuffled=%d, ids[%d:%d]=%s, len(seq2)=%d, len(seq3)=%d, len(seq4)=%d, len(seq5)=%d" % (
                    shuffleId, annotatedSeqId, taxId, protId,
                    len(cds.shuffledSeqIds()), shuffleId - 2, shuffleId + 2,
                    cds.shuffledSeqIds()[shuffleId - 2:shuffleId + 2],
                    len(seq2) if seq2 is not None else -1,
                    len(seq3) if seq3 is not None else -1,
                    len(seq4) if seq4 is not None else -1,
                    len(seq5) if seq5 is not None else -1)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

            # Note: annotatedSeqId is deliberately not checked against seqIds,
            # since the calculation needn't include the native sequence.

            expectedSeqLength = cds.length()
            if expectedSeqLength is not None and expectedSeqLength != len(seq):
                e = "Warning: taxid=%d, protid=%s, seqid=%d - unexpected length %d (expected: %d)\n" % (
                    taxId, protId, annotatedSeqId, len(seq),
                    expectedSeqLength)
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

            if (len(seq) < self._windowWidth):
                # Sequence is shorter than required window; abort
                e = "Warning: skipping sequence because it is shorter than the requested window...\n"
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

            logging.info(
                "DEBUG: Processing item taxId=%d, protId=%s, shuffle=%d (length=%d, %d windows)...\n"
                % (taxId, protId, shuffleId, len(seq),
                   len(requestedWindowStarts)))

            # TODO - Remove any old value stored in this key?

            logging.info(seq[:50])

            MFEprofile = record["MFE-profile"]

            # Make sure the profile array contains enough entries for all new windows (and possibly, if windows are non-contiguous, entries between them that we are not going to compute right now).
            # The list must be strictly longer than the largest start index,
            # otherwise MFEprofile[maxWindowStart] below is out of range.
            maxWindowStart = max(requestedWindowStarts)
            if len(MFEprofile) <= maxWindowStart:
                entriesToAdd = maxWindowStart - len(MFEprofile) + 1
                MFEprofile.extend([None] * entriesToAdd)
            assert (len(MFEprofile) > maxWindowStart)

            # Seed the running statistics with stored values only for windows
            # that are NOT recomputed below; recomputed windows are pushed in
            # the folding loop, so seeding them too would count them twice.
            stats = RunningStats()
            stats.extend([
                x for i, x in enumerate(MFEprofile)
                if x is not None and i not in requestedWindowStarts
            ])

            timerForPreFolding.stop()
            timerForFolding.start()
            for start in requestedWindowStarts:
                fragment = seq[start:(start + self._windowWidth)]
                assert (len(fragment) == self._windowWidth)

                if self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2:
                    # Calculate the RNA folding energy. This is the computation-heavy part.
                    energy = RNAfold_direct(fragment)
                    assert (energy <= 0.0)

                elif self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp:
                    # Same calculation, at the species' growth temperature.
                    energy = RNAfold_direct(fragment,
                                            explicitCalculationTemperature=
                                            optimalSpeciesGrowthTemperature)
                    assert (energy <= 0.0)

                elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_BeginReferenced:
                    # Synthetic test series: depends only on the window start
                    if shuffleId < 0:
                        energy = 0
                    else:
                        energy = start % 50 - 20

                elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_EndReferenced:
                    # Synthetic test series: depends on distance from the last window
                    if shuffleId < 0:
                        energy = 0
                    else:
                        energy = (expectedSeqLength - self._windowWidth -
                                  start) % 50 - 20

                else:
                    logging.error(
                        "Received unknown seriesSourceNumber {}".format(
                            self._seriesSourceNumber))
                    assert (False)

                # Store the calculation result
                stats.push(energy)
                MFEprofile[start] = energy

            print(
                "///////////////////  shuffleId={} (len={}) //////////////////////////"
                .format(shuffleId, expectedSeqLength))
            prettyPrintProfile(MFEprofile)

            timerForFolding.stop()
            timerForPostFolding.start()

            # Format the record as JSON
            crc = calcCrc(seq)
            record["seq-crc"] = crc
            # Round items down to save space (these are not exact numbers anyway)
            record["MFE-profile"] = [round4(x) for x in MFEprofile]
            record["MeanMFE"] = stats.mean()
            result = json.dumps(record)

            f.write(result)
            f.write("\n")

            if (not self._debugDoneWriteResults):
                cds.saveCalculationResult2(self._seriesSourceNumber, result,
                                           annotatedSeqId, False)

            timerForPostFolding.stop()

        timerForPostFolding.start()

        if (not self._debugDoneWriteResults):
            cds.commitChanges()

        timerForPostFolding.stop()
            warnings.update(("has-broken-codons", ))
            continue

        nativeCodons = Counter(splitCodons(nativeSeq))

        hasMismatchedCodons = False
        allNativeSeqs[protId] = nativeSeq
        hashesForShuffles = set()

        #for i, c in enumerate(splitCodons(nativeSeq)[:maxCodons]):
        #    nativeColumns[i].append(c)

        shuffledSeqs = []

        for shuffleId in range(len(allIds)):
            shuffledSeq = cds.getShuffledSeq(shuffleId, shuffleType)
            shuffledCodons = Counter(splitCodons(shuffledSeq))

            hashesForShuffles.add(md5(shuffledSeq).hexdigest())

            if shuffledCodons != nativeCodons:
                warnings.update(("num-horizontal-codon-mismatch", ))
                hasMismatchedCodons = True

            shuffledSeqs.append(shuffledSeq)

            #for i, c in enumerate(splitCodons(shuffledSeq)[:maxCodons]):
            #    shuffledColumns[shuffleId][i].append(c)

        numUniqueShuffles.update((len(hashesForShuffles), ))
        if len(hashesForShuffles) != len(allIds):