Python CDSHelper.getShuffledSeq Examples

Programming Language: Python

Namespace/Package Name: data_helpers

Class/Type: CDSHelper

Method/Function: getShuffledSeq

Examples at hotexamples.com: 2

Python CDSHelper.getShuffledSeq - 2 examples found. These are the top rated real world Python examples of data_helpers.CDSHelper.getShuffledSeq extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

CDSHelper(14)

sequence(7)

shuffledSeqIds(7)

length(6)

seqId(4)

CDSlength(4)

getGeneId(3)

getCalculationResult2(2)

getShuffledSeq2(2)

getShuffledSeq(2)

dropShuffledSeqs(2)

flankingRegion3UtrLength(1)

getProtId(1)

commitChanges(1)

dropRecord(1)

getShuffledSeqId(1)

getTaxId(1)

dropNativeSeq(1)

nextCDSOnOppositeStrand(1)

saveCalculationResult2(1)

crc(1)

_fetchSequence(1)

checkCalculationResultWithWindows(1)

Example #1

Show file

    def calculateMissingWindowsForSequence(
            self,
            taxId,
            protId,
            seqIds,
            requestedShuffleIds,
            firstWindow,
            lastWindowStart,
            windowStep,
            reference="begin",
            shuffleType=db.Sources.ShuffleCDSv2_python):

        timerForPreFolding.start()
        logging.warning("Parameters: %d %s %s %s %d %d %s %d" %
                        (taxId, protId, seqIds, requestedShuffleIds,
                         lastWindowStart, windowStep, reference, shuffleType))
        f = self._logfile

        assert (len(seqIds) > 0)
        assert (len(seqIds) == len(requestedShuffleIds))

        optimalSpeciesGrowthTemperature = None
        if (self._seriesSourceNumber ==
                db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp):
            (numericalProp, _) = getSpeciesTemperatureInfo(taxId)
            optimalSpeciesGrowthTemperature = numericalProp[0]

            if optimalSpeciesGrowthTemperature is None:
                raise Exception(
                    "No temperature value for taxid={}, can't calculate native-temperature folding profile..."
                    .format(taxId))
            else:
                optimalSpeciesGrowthTemperature = float(
                    optimalSpeciesGrowthTemperature)
                assert (optimalSpeciesGrowthTemperature >= -30.0
                        and optimalSpeciesGrowthTemperature <= 150.0)

        if (reference != "begin" and reference != "end"):
            timerForPreFolding.stop()
            e = "Specificed profile reference '%s' is not supported! (" % reference
            logging.error(e)
            raise Exception(e)

        # We will process all listed shuffle-ids for the following protein record
        cds = CDSHelper(taxId, protId)

        if (cds.length() < self._windowWidth):
            e = "Refusing to process item %s because the sequence length (%d nt) is less than the window size (%d nt)\n" % (
                itemToProcess, cds.length(), self._windowWidth)
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        # Create a list of the windows we need to calculate for this CDS
        if reference == "begin":
            requestedWindowStarts = frozenset(
                range(
                    0,
                    min(lastWindowStart + 1,
                        cds.length() - self._windowWidth - 1), windowStep))
            if (len(requestedWindowStarts) == 0):
                e = "No windows exist for calculation taxid=%d, protId=%s, CDS-length=%d, lastWindowStart=%d, windowStep=%d, windowWidth=%d - Skipping...\n" % (
                    taxId, protId, cds.length(), lastWindowStart, windowStep,
                    self._windowWidth)
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)
        elif reference == "end":
            lastPossibleWindowStart = cds.length(
            ) - self._windowWidth  #+ 1  # disregard lastWindowStart when reference=="end"
            #lastWindowCodonStart = (lastPossibleWindowStart-3)-(lastPossibleWindowStart-3)%3

            #lastPossibleWindowStart = seqLength - windowWidth # + 1  # disregard lastWindowStart when reference=="end"
            requestedWindowStarts = frozenset(
                filter(
                    lambda x: x >= lastWindowStart,
                    range(lastPossibleWindowStart % windowStep,
                          lastPossibleWindowStart + 1, windowStep)))

            #requestedWindowStarts = frozenset(range(lastWindowCodonStart % windowStep, lastWindowCodonStart, windowStep))
            #pass
        else:
            assert (False)

        # First, read available results (for all shuffle-ids) in JSON format
        # Array is indexed by shuffle-id, so results not requested will be represented by None (as will requested items that have no results yet).
        logging.info("DEBUG: requestedShuffleIds (%d items): %s\n" %
                     (len(requestedShuffleIds), requestedShuffleIds))
        existingResults = cds.getCalculationResult2(self._seriesSourceNumber,
                                                    requestedShuffleIds,
                                                    True,
                                                    shuffleType=shuffleType)
        #assert(len(existingResults) >= len(requestedShuffleIds))  # The returned array must be at least as large as the requested ids list
        assert (len(existingResults) == len(requestedShuffleIds))
        logging.info("requestedShuffleIds: %s" % requestedShuffleIds)
        logging.info("existingResults.keys(): %s" % existingResults.keys())
        assert (frozenset(requestedShuffleIds) == frozenset(
            existingResults.keys()))
        #existingResults = [None] * (max(requestedShuffleIds)+1)
        logging.info("DEBUG: existingResults (%d items): %s\n" %
                     (len(existingResults), existingResults))

        # Check for which of the requested shuffle-ids there are values missing
        shuffleIdsToProcess = {}
        for shuffleId, r in existingResults.items():
            if r is None:
                # There are no existing results for shuffled-id n. If it was requested, it should be calculated now (including all windows)
                if shuffleId in requestedShuffleIds:
                    shuffleIdsToProcess[shuffleId] = list(
                        requestedWindowStarts)

                timerForPreFolding.stop()

                # ------------------------------------------------------------------------------------
                continue  # TODO - verify this line; should we abort this sequence by throwing????
                # ------------------------------------------------------------------------------------

            logging.info("/// shuffleId r = %d %s" % (shuffleId, r))
            logging.info("r[MFE-profile] %s" % r["MFE-profile"])

            # Check the existing results for this shuffle
            alreadyProcessedWindowStarts = frozenset([
                i for i, x in enumerate(r["MFE-profile"]) if x is not None
            ])  # Get the indices (=window starts) of all non-None values
            missingWindows = requestedWindowStarts - alreadyProcessedWindowStarts  # Are there any requested windows that are not already computed?
            if (missingWindows):
                shuffleIdsToProcess[shuffleId] = missingWindows

        if (not shuffleIdsToProcess):
            e = "All requested shuffle-ids in (taxId: %d, protId: %s, seqs: %s) seem to have already been processed. Skipping...\n" % (
                taxId, protId, str(list(zip(seqIds, requestedShuffleIds))))
            logging.warning(e)
            timerForPreFolding.stop()
            return
        logging.info("DEBUG: shuffleIdsToProcess (%d items): %s\n" %
                     (len(shuffleIdsToProcess), shuffleIdsToProcess))

        logging.info("DEBUG: Before (%d items): %s\n" %
                     (len(existingResults), existingResults))
        # Initialize new results records
        for shuffleId in shuffleIdsToProcess.keys():
            if existingResults[shuffleId] is None:
                logging.info(seqIds)
                logging.info(requestedShuffleIds)
                logging.info(shuffleId)
                thisSeqId = seqIds[requestedShuffleIds.index(shuffleId)]

                existingResults[shuffleId] = {
                    "id":
                    "%s/%s/%d/%d" % (taxId, protId, thisSeqId, shuffleId),
                    "seq-crc": None,
                    "MFE-profile": [],
                    "MeanMFE": None,
                    "v": 2,
                    "shuffle-type": shuffleType
                }
        logging.info("DEBUG: existingResults (%d items): %s\n" %
                     (len(existingResults), existingResults))
        timerForPreFolding.stop()

        # Load the sequences of all shuffle-ids we need to work on
        # TODO - combine loading of multiple sequences into one DB operation
        for shuffleId, record in existingResults.items():
            if record is None:
                logging.info(
                    "DEBUG: skipping empty results record for shuffleId={}".
                    format(shuffleId))
                continue
            timerForPreFolding.start()

            seq = None
            annotatedSeqId = None
            # Get the sequence for this entry
            if (shuffleId < 0):
                seq = cds.sequence()
                annotatedSeqId = cds.seqId()
            else:
                seq = cds.getShuffledSeq(shuffleId, shuffleType)
                annotatedSeqId = cds.getShuffledSeqId(shuffleId, shuffleType)

            if (seq is None or (not seq is None and len(seq) == 0)):
                seq2 = cds.getShuffledSeq2(annotatedSeqId)
                seq3 = cds._fetchSequence(annotatedSeqId)
                seq4 = cds._cache.get("%d:seq" % annotatedSeqId)
                if not seq4 is None:
                    del cds._cache["%d:seq" % annotatedSeqId]
                seq5 = cds.getShuffledSeq2(annotatedSeqId)
                e = "Got empty sequence for shuffleId=%d, seqId=%d, taxId=%d, protId=%s, numShuffled=%d, ids[%d:%d]=%s, len(seq2)=%d, len(seq3)=%d, len(seq4)=%d, len(seq5)=%d" % (
                    shuffleId, annotatedSeqId, taxId, protId,
                    len(cds.shuffledSeqIds()), shuffleId - 2, shuffleId + 2,
                    cds.shuffledSeqIds()[shuffleId - 2:shuffleId + 2],
                    len(seq2) if not seq2 is None else -1, len(seq3)
                    if not seq3 is None else -1, len(seq4) if not seq4 is None
                    else -1, len(seq5) if not seq5 is None else -1)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

            #
            # Disabled - calculation needn't include the native sequence...
            #
            #if( annotatedSeqId not in seqIds ):
            #    e = "Error: SeqId specified in queue item %s does not match annotated seq-id %d\n" % (itemToProcess, annotatedSeqId)
            #    f.write(e)
            #    f.write("Current shuffle-id: %d\n" % shuffleId)
            #    f.write("Ids in existing results:\n")
            #    for shuffleId, record in enumerate(existingResults):
            #        f.write(" %d) %s\n" % (shuffleId, record['id']))
            #    f.write("Debug info:\n")
            #    f.write("\n".join(cds.getDebugInfo()))
            #    f.write("\n")
            #    f.write("Skipping...\n")
            #    print("Skipping...")
            #    raise Exception(e)

            expectedSeqLength = cds.length()
            if (not expectedSeqLength is None):
                if (expectedSeqLength != len(seq)):
                    e = "Warning: taxid=%d, protid=%s, seqid=%d - unexpected length %d (expected: %d)\n" % (
                        taxId, protId, annotatedSeqId, len(seq),
                        expectedSeqLength)
                    f.write(e)
                    logging.error(e)
                    timerForPreFolding.stop()
                    raise Exception(e)

            if (len(seq) < self._windowWidth):
                # Sequence is shorter than required window; skip
                e = "Warning: skipping sequence because it is shorter than the requested window...\n"
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

            logging.info(
                "DEBUG: Processing item taxId=%d, protId=%s, shuffle=%d (length=%d, %d windows)...\n"
                % (taxId, protId, shuffleId, len(seq),
                   len(requestedWindowStarts)))

            # TODO - Remove any old value stored in this key?

            # Skip this for now
            # This will be made redundant by completing the "updating" implementation
            #
            #if( cds.isCalculationDone( seriesSourceNumber, shuffleId )):
            #    # Sufficient data seems to exist. Skip...
            #    f.write("Item %s appears to be already completed, skipping..." % itemToProcess)
            #    continue

            logging.info(seq[:50])
            #f.write("\n")

            MFEprofile = record["MFE-profile"]
            #f.write("Profile: %s\n" % MFEprofile)

            # Make sure the profile array contains enough entries for all new windows (and possibly, if windows are non-contiguous, entries between them that we are not going to compute right now)
            if (len(MFEprofile) < max(requestedWindowStarts)):
                entriesToAdd = max(requestedWindowStarts) - len(MFEprofile) + 1
                MFEprofile.extend([None] * entriesToAdd)
            assert (len(MFEprofile) >= max(requestedWindowStarts))

            stats = RunningStats()
            stats.extend([x for x in MFEprofile if x is not None])

            timerForPreFolding.stop()
            timerForFolding.start()
            for start in requestedWindowStarts:
                fragment = seq[start:(start + self._windowWidth)]
                assert (len(fragment) == self._windowWidth)

                if self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2:
                    # Calculate the RNA folding energy. This is the computation-heavy part.
                    #strct, energy = RNA.fold(fragment)
                    energy = RNAfold_direct(fragment)
                    assert (energy <= 0.0)

                elif self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp:
                    # Calculate the RNA folding energy. This is the computation-heavy part.
                    #strct, energy = RNA.fold(fragment)
                    energy = RNAfold_direct(fragment,
                                            explicitCalculationTemperature=
                                            optimalSpeciesGrowthTemperature)
                    assert (energy <= 0.0)

                elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_BeginReferenced:
                    if shuffleId < 0:
                        energy = 0
                    else:
                        energy = start % 50 - 20

                elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_EndReferenced:
                    if shuffleId < 0:
                        energy = 0
                    else:
                        energy = (expectedSeqLength - self._windowWidth -
                                  start) % 50 - 20

                else:
                    logging.error(
                        "Received unknown seriesSourceNumber {}".format(
                            self._seriesSourceNumber))
                    assert (False)

                # Store the calculation result
                #print("%d:%s --> %f" % (taxId, protId, energy))

                stats.push(energy)
                MFEprofile[start] = energy

            print(
                "///////////////////  shuffleId={} (len={}) //////////////////////////"
                .format(shuffleId, expectedSeqLength))
            prettyPrintProfile(MFEprofile)

            timerForFolding.stop()
            timerForPostFolding.start()

            # Format
            crc = calcCrc(seq)
            #result = """{"id":"%s","seq-crc":%d,"MFE-profile":[%s],"MeanMFE":%.6g,v:2}""" % (itemToProcess, crc, ",".join(map(lambda x: "%.3g" % x, MFEprofile)), stats.mean())
            record["seq-crc"] = crc
            record["MFE-profile"] = [
                round4(x) for x in MFEprofile
            ]  # Round items down to save space (these are not exact numbers anyway)
            record["MeanMFE"] = stats.mean()
            result = json.dumps(record)

            f.write(result)
            f.write("\n")

            if (not self._debugDoneWriteResults):
                cds.saveCalculationResult2(self._seriesSourceNumber, result,
                                           annotatedSeqId, False)

            timerForPostFolding.stop()

        timerForPostFolding.start()

        if (not self._debugDoneWriteResults):
            cds.commitChanges()

        timerForPostFolding.stop()

Example #2

Show file

File: test_randomized_sequences.py Project: michaelpeeri/rnafold-public

            warnings.update(("has-broken-codons", ))
            continue

        nativeCodons = Counter(splitCodons(nativeSeq))

        hasMismatchedCodons = False
        allNativeSeqs[protId] = nativeSeq
        hashesForShuffles = set()

        #for i, c in enumerate(splitCodons(nativeSeq)[:maxCodons]):
        #    nativeColumns[i].append(c)

        shuffledSeqs = []

        for shuffleId in range(len(allIds)):
            shuffledSeq = cds.getShuffledSeq(shuffleId, shuffleType)
            shuffledCodons = Counter(splitCodons(shuffledSeq))

            hashesForShuffles.add(md5(shuffledSeq).hexdigest())

            if shuffledCodons != nativeCodons:
                warnings.update(("num-horizontal-codon-mismatch", ))
                hasMismatchedCodons = True

            shuffledSeqs.append(shuffledSeq)

            #for i, c in enumerate(splitCodons(shuffledSeq)[:maxCodons]):
            #    shuffledColumns[shuffleId][i].append(c)

        numUniqueShuffles.update((len(hashesForShuffles), ))
        if len(hashesForShuffles) != len(allIds):