if profileReference == "begin":
            lastwin = requiredWindows[-1]
        elif profileReference == "end":
            lastwin = requiredWindows[0]
        elif profileReference == "stop3utr":
            lastwin = maxNumWindows
        else:
            assert (False)

        if (shufflesWithMissingWindows):
            #cds.enqueueForProcessing(computationTag, shufflesWithMissingWindows, lastwin, windowStep)#

            shuffleIdsToProcess = sorted(shufflesWithMissingWindows +
                                         completelyMissingShuffles)
            allSeqIds = cds.shuffledSeqIds(shuffleType)

            def shuffleIdToSeqId(shuffleId):
                if shuffleId == -1:
                    return cds.seqId()
                else:
                    return allSeqIds[shuffleId]

            requiredSeqIds = list(map(shuffleIdToSeqId, shuffleIdsToProcess))

            queueItem = "%d/%s/%s/%s/%d/%d/%s/%d" % (
                cds.getTaxId(), cds.getProtId(),
                ",".join(map(str, requiredSeqIds)), ",".join(
                    map(str, shufflesWithMissingWindows +
                        completelyMissingShuffles)), lastwin, windowStep,
                profileReference, shuffleType)
Exemple #2
0
    def calculateMissingWindowsForSequence(
            self,
            taxId,
            protId,
            seqIds,
            requestedShuffleIds,
            firstWindow,
            lastWindowStart,
            windowStep,
            reference="begin",
            shuffleType=db.Sources.ShuffleCDSv2_python):

        timerForPreFolding.start()
        logging.warning("Parameters: %d %s %s %s %d %d %s %d" %
                        (taxId, protId, seqIds, requestedShuffleIds,
                         lastWindowStart, windowStep, reference, shuffleType))
        f = self._logfile

        assert (len(seqIds) > 0)
        assert (len(seqIds) == len(requestedShuffleIds))

        optimalSpeciesGrowthTemperature = None
        if (self._seriesSourceNumber ==
                db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp):
            (numericalProp, _) = getSpeciesTemperatureInfo(taxId)
            optimalSpeciesGrowthTemperature = numericalProp[0]

            if optimalSpeciesGrowthTemperature is None:
                raise Exception(
                    "No temperature value for taxid={}, can't calculate native-temperature folding profile..."
                    .format(taxId))
            else:
                optimalSpeciesGrowthTemperature = float(
                    optimalSpeciesGrowthTemperature)
                assert (optimalSpeciesGrowthTemperature >= -30.0
                        and optimalSpeciesGrowthTemperature <= 150.0)

        if (reference != "begin" and reference != "end"):
            timerForPreFolding.stop()
            e = "Specificed profile reference '%s' is not supported! (" % reference
            logging.error(e)
            raise Exception(e)

        # We will process all listed shuffle-ids for the following protein record
        cds = CDSHelper(taxId, protId)

        if (cds.length() < self._windowWidth):
            e = "Refusing to process item %s because the sequence length (%d nt) is less than the window size (%d nt)\n" % (
                itemToProcess, cds.length(), self._windowWidth)
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        # Create a list of the windows we need to calculate for this CDS
        if reference == "begin":
            requestedWindowStarts = frozenset(
                range(
                    0,
                    min(lastWindowStart + 1,
                        cds.length() - self._windowWidth - 1), windowStep))
            if (len(requestedWindowStarts) == 0):
                e = "No windows exist for calculation taxid=%d, protId=%s, CDS-length=%d, lastWindowStart=%d, windowStep=%d, windowWidth=%d - Skipping...\n" % (
                    taxId, protId, cds.length(), lastWindowStart, windowStep,
                    self._windowWidth)
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)
        elif reference == "end":
            lastPossibleWindowStart = cds.length(
            ) - self._windowWidth  #+ 1  # disregard lastWindowStart when reference=="end"
            #lastWindowCodonStart = (lastPossibleWindowStart-3)-(lastPossibleWindowStart-3)%3

            #lastPossibleWindowStart = seqLength - windowWidth # + 1  # disregard lastWindowStart when reference=="end"
            requestedWindowStarts = frozenset(
                filter(
                    lambda x: x >= lastWindowStart,
                    range(lastPossibleWindowStart % windowStep,
                          lastPossibleWindowStart + 1, windowStep)))

            #requestedWindowStarts = frozenset(range(lastWindowCodonStart % windowStep, lastWindowCodonStart, windowStep))
            #pass
        else:
            assert (False)

        # First, read available results (for all shuffle-ids) in JSON format
        # Array is indexed by shuffle-id, so results not requested will be represented by None (as will requested items that have no results yet).
        logging.info("DEBUG: requestedShuffleIds (%d items): %s\n" %
                     (len(requestedShuffleIds), requestedShuffleIds))
        existingResults = cds.getCalculationResult2(self._seriesSourceNumber,
                                                    requestedShuffleIds,
                                                    True,
                                                    shuffleType=shuffleType)
        #assert(len(existingResults) >= len(requestedShuffleIds))  # The returned array must be at least as large as the requested ids list
        assert (len(existingResults) == len(requestedShuffleIds))
        logging.info("requestedShuffleIds: %s" % requestedShuffleIds)
        logging.info("existingResults.keys(): %s" % existingResults.keys())
        assert (frozenset(requestedShuffleIds) == frozenset(
            existingResults.keys()))
        #existingResults = [None] * (max(requestedShuffleIds)+1)
        logging.info("DEBUG: existingResults (%d items): %s\n" %
                     (len(existingResults), existingResults))

        # Check for which of the requested shuffle-ids there are values missing
        shuffleIdsToProcess = {}
        for shuffleId, r in existingResults.items():
            if r is None:
                # There are no existing results for shuffled-id n. If it was requested, it should be calculated now (including all windows)
                if shuffleId in requestedShuffleIds:
                    shuffleIdsToProcess[shuffleId] = list(
                        requestedWindowStarts)

                timerForPreFolding.stop()

                # ------------------------------------------------------------------------------------
                continue  # TODO - verify this line; should we abort this sequence by throwing????
                # ------------------------------------------------------------------------------------

            logging.info("/// shuffleId r = %d %s" % (shuffleId, r))
            logging.info("r[MFE-profile] %s" % r["MFE-profile"])

            # Check the existing results for this shuffle
            alreadyProcessedWindowStarts = frozenset([
                i for i, x in enumerate(r["MFE-profile"]) if x is not None
            ])  # Get the indices (=window starts) of all non-None values
            missingWindows = requestedWindowStarts - alreadyProcessedWindowStarts  # Are there any requested windows that are not already computed?
            if (missingWindows):
                shuffleIdsToProcess[shuffleId] = missingWindows

        if (not shuffleIdsToProcess):
            e = "All requested shuffle-ids in (taxId: %d, protId: %s, seqs: %s) seem to have already been processed. Skipping...\n" % (
                taxId, protId, str(list(zip(seqIds, requestedShuffleIds))))
            logging.warning(e)
            timerForPreFolding.stop()
            return
        logging.info("DEBUG: shuffleIdsToProcess (%d items): %s\n" %
                     (len(shuffleIdsToProcess), shuffleIdsToProcess))

        logging.info("DEBUG: Before (%d items): %s\n" %
                     (len(existingResults), existingResults))
        # Initialize new results records
        for shuffleId in shuffleIdsToProcess.keys():
            if existingResults[shuffleId] is None:
                logging.info(seqIds)
                logging.info(requestedShuffleIds)
                logging.info(shuffleId)
                thisSeqId = seqIds[requestedShuffleIds.index(shuffleId)]

                existingResults[shuffleId] = {
                    "id":
                    "%s/%s/%d/%d" % (taxId, protId, thisSeqId, shuffleId),
                    "seq-crc": None,
                    "MFE-profile": [],
                    "MeanMFE": None,
                    "v": 2,
                    "shuffle-type": shuffleType
                }
        logging.info("DEBUG: existingResults (%d items): %s\n" %
                     (len(existingResults), existingResults))
        timerForPreFolding.stop()

        # Load the sequences of all shuffle-ids we need to work on
        # TODO - combine loading of multiple sequences into one DB operation
        for shuffleId, record in existingResults.items():
            if record is None:
                logging.info(
                    "DEBUG: skipping empty results record for shuffleId={}".
                    format(shuffleId))
                continue
            timerForPreFolding.start()

            seq = None
            annotatedSeqId = None
            # Get the sequence for this entry
            if (shuffleId < 0):
                seq = cds.sequence()
                annotatedSeqId = cds.seqId()
            else:
                seq = cds.getShuffledSeq(shuffleId, shuffleType)
                annotatedSeqId = cds.getShuffledSeqId(shuffleId, shuffleType)

            if (seq is None or (not seq is None and len(seq) == 0)):
                seq2 = cds.getShuffledSeq2(annotatedSeqId)
                seq3 = cds._fetchSequence(annotatedSeqId)
                seq4 = cds._cache.get("%d:seq" % annotatedSeqId)
                if not seq4 is None:
                    del cds._cache["%d:seq" % annotatedSeqId]
                seq5 = cds.getShuffledSeq2(annotatedSeqId)
                e = "Got empty sequence for shuffleId=%d, seqId=%d, taxId=%d, protId=%s, numShuffled=%d, ids[%d:%d]=%s, len(seq2)=%d, len(seq3)=%d, len(seq4)=%d, len(seq5)=%d" % (
                    shuffleId, annotatedSeqId, taxId, protId,
                    len(cds.shuffledSeqIds()), shuffleId - 2, shuffleId + 2,
                    cds.shuffledSeqIds()[shuffleId - 2:shuffleId + 2],
                    len(seq2) if not seq2 is None else -1, len(seq3)
                    if not seq3 is None else -1, len(seq4) if not seq4 is None
                    else -1, len(seq5) if not seq5 is None else -1)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

            #
            # Disabled - calculation needn't include the native sequence...
            #
            #if( annotatedSeqId not in seqIds ):
            #    e = "Error: SeqId specified in queue item %s does not match annotated seq-id %d\n" % (itemToProcess, annotatedSeqId)
            #    f.write(e)
            #    f.write("Current shuffle-id: %d\n" % shuffleId)
            #    f.write("Ids in existing results:\n")
            #    for shuffleId, record in enumerate(existingResults):
            #        f.write(" %d) %s\n" % (shuffleId, record['id']))
            #    f.write("Debug info:\n")
            #    f.write("\n".join(cds.getDebugInfo()))
            #    f.write("\n")
            #    f.write("Skipping...\n")
            #    print("Skipping...")
            #    raise Exception(e)

            expectedSeqLength = cds.length()
            if (not expectedSeqLength is None):
                if (expectedSeqLength != len(seq)):
                    e = "Warning: taxid=%d, protid=%s, seqid=%d - unexpected length %d (expected: %d)\n" % (
                        taxId, protId, annotatedSeqId, len(seq),
                        expectedSeqLength)
                    f.write(e)
                    logging.error(e)
                    timerForPreFolding.stop()
                    raise Exception(e)

            if (len(seq) < self._windowWidth):
                # Sequence is shorter than required window; skip
                e = "Warning: skipping sequence because it is shorter than the requested window...\n"
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

            logging.info(
                "DEBUG: Processing item taxId=%d, protId=%s, shuffle=%d (length=%d, %d windows)...\n"
                % (taxId, protId, shuffleId, len(seq),
                   len(requestedWindowStarts)))

            # TODO - Remove any old value stored in this key?

            # Skip this for now
            # This will be made redundant by completing the "updating" implementation
            #
            #if( cds.isCalculationDone( seriesSourceNumber, shuffleId )):
            #    # Sufficient data seems to exist. Skip...
            #    f.write("Item %s appears to be already completed, skipping..." % itemToProcess)
            #    continue

            logging.info(seq[:50])
            #f.write("\n")

            MFEprofile = record["MFE-profile"]
            #f.write("Profile: %s\n" % MFEprofile)

            # Make sure the profile array contains enough entries for all new windows (and possibly, if windows are non-contiguous, entries between them that we are not going to compute right now)
            if (len(MFEprofile) < max(requestedWindowStarts)):
                entriesToAdd = max(requestedWindowStarts) - len(MFEprofile) + 1
                MFEprofile.extend([None] * entriesToAdd)
            assert (len(MFEprofile) >= max(requestedWindowStarts))

            stats = RunningStats()
            stats.extend([x for x in MFEprofile if x is not None])

            timerForPreFolding.stop()
            timerForFolding.start()
            for start in requestedWindowStarts:
                fragment = seq[start:(start + self._windowWidth)]
                assert (len(fragment) == self._windowWidth)

                if self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2:
                    # Calculate the RNA folding energy. This is the computation-heavy part.
                    #strct, energy = RNA.fold(fragment)
                    energy = RNAfold_direct(fragment)
                    assert (energy <= 0.0)

                elif self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp:
                    # Calculate the RNA folding energy. This is the computation-heavy part.
                    #strct, energy = RNA.fold(fragment)
                    energy = RNAfold_direct(fragment,
                                            explicitCalculationTemperature=
                                            optimalSpeciesGrowthTemperature)
                    assert (energy <= 0.0)

                elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_BeginReferenced:
                    if shuffleId < 0:
                        energy = 0
                    else:
                        energy = start % 50 - 20

                elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_EndReferenced:
                    if shuffleId < 0:
                        energy = 0
                    else:
                        energy = (expectedSeqLength - self._windowWidth -
                                  start) % 50 - 20

                else:
                    logging.error(
                        "Received unknown seriesSourceNumber {}".format(
                            self._seriesSourceNumber))
                    assert (False)

                # Store the calculation result
                #print("%d:%s --> %f" % (taxId, protId, energy))

                stats.push(energy)
                MFEprofile[start] = energy

            print(
                "///////////////////  shuffleId={} (len={}) //////////////////////////"
                .format(shuffleId, expectedSeqLength))
            prettyPrintProfile(MFEprofile)

            timerForFolding.stop()
            timerForPostFolding.start()

            # Format
            crc = calcCrc(seq)
            #result = """{"id":"%s","seq-crc":%d,"MFE-profile":[%s],"MeanMFE":%.6g,v:2}""" % (itemToProcess, crc, ",".join(map(lambda x: "%.3g" % x, MFEprofile)), stats.mean())
            record["seq-crc"] = crc
            record["MFE-profile"] = [
                round4(x) for x in MFEprofile
            ]  # Round items down to save space (these are not exact numbers anyway)
            record["MeanMFE"] = stats.mean()
            result = json.dumps(record)

            f.write(result)
            f.write("\n")

            if (not self._debugDoneWriteResults):
                cds.saveCalculationResult2(self._seriesSourceNumber, result,
                                           annotatedSeqId, False)

            timerForPostFolding.stop()

        timerForPostFolding.start()

        if (not self._debugDoneWriteResults):
            cds.commitChanges()

        timerForPostFolding.stop()
                skipped += 1
                continue
        else:
            print(
                "Warning: Could not find CDS length entry for taxid=%d, protid=%s"
                % (taxIdForProcessing, protId))
            skipped += 1
            continue

        #requiredNumWindows = seqLength - windowWidth + 1

        cdsSeqId = cds.seqId()

        match[cdsSeqId] = (protId, -1)

        shuffledIds = cds.shuffledSeqIds()

        computedShufflesCount = 0
        missingShuffles = []

        for n in range(50):
            match[shuffledIds[n]] = (protId, n)
            if shuffledIds[n] in computed:
                computedShufflesCount += 1
            else:
                missingShuffles.append(n)

        if (computedShufflesCount < 50):
            print("%s - found only %d computed results" %
                  (protId, computedShufflesCount))
            # Skip sequences that are too short
            if(seqLength < numWindows + windowWidth + 1 ):
                skipped += 1
                continue
        else:
            print("Warning: Could not find CDS length entry for taxid=%d, protid=%s" % (taxIdForProcessing, protId) )
            skipped += 1
            continue

        #requiredNumWindows = seqLength - windowWidth + 1

        cdsSeqId = cds.seqId()

        #match[cdsSeqId] = (protId,-1)

        shuffledIds = cds.shuffledSeqIds()

        computedShufflesCount = 0

        for n in range(min(numShuffledGroups, len(shuffledIds))):
            #match[shuffledIds[n]] = (protId,n)
            if shuffledIds[n] in computed:
                computedShufflesCount += 1

        if( computedShufflesCount<requiredNumShuffledGroups or (not cdsSeqId in computed) ):
            #print("%s - found only %d groups, skipping" % (protId, computedShufflesCount))
            skipped += 1
            continue

        # TODO - do something here...
        #cdsResults = cds.getCalculationResult( seriesSourceNumber, -1 )
numUniqueShuffles = Counter()

for taxId in species:
    proteinsDone = 0

    #nativeColumns = [[] for x in range(maxCodons)]
    #shuffledColumns = [[[] for x in range(maxCodons)] for y in range(maxShuffles)]
    allNativeSeqs = {}
    allShuffledSeqs = {}

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        warnings.update(("total-cds", ))

        allIds = cds.shuffledSeqIds(shuffleType=shuffleType)[:maxShuffles]

        nativeSeq = cds.sequence()
        if (len(nativeSeq) % 3 != 0):
            warnings.update(("has-broken-codons", ))
            continue

        nativeCodons = Counter(splitCodons(nativeSeq))

        hasMismatchedCodons = False
        allNativeSeqs[protId] = nativeSeq
        hashesForShuffles = set()

        #for i, c in enumerate(splitCodons(nativeSeq)[:maxCodons]):
        #    nativeColumns[i].append(c)
Exemple #6
0
def readSeriesResultsForSpecies(seriesSourceNumber,
                                species,
                                minShuffledGroups=20,
                                maxShuffledGroups=20,
                                shuffleType=db.Sources.ShuffleCDSv2_python,
                                cdsFilter=None,
                                returnCDS=True):
    if isinstance(
            species, Iterable
    ):  # usually, species will be a sequence of numeric taxid values
        if isinstance(species, basestring):
            raise Exception("species cannot be string")
        # all set - proceed...
    else:
        species = (species, )  # assume we got a single (numeric) taxid value
    assert (minShuffledGroups <= maxShuffledGroups)

    for taxIdForProcessing in species:
        print("Procesing %d sequences for tax-id %d (%s)..." %
              (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing,
               getSpeciesName(taxIdForProcessing)))

        computed = getAllComputedSeqsForSpecies(seriesSourceNumber,
                                                taxIdForProcessing,
                                                maxShuffledGroups,
                                                shuffleType=shuffleType)
        computedIds = frozenset(computed.keys())
        print("Collecting data from %d computation results..." % len(computed))

        skipped = 0
        selected = 0
        alreadyCompleted = 0

        # Iterate over all CDS entries for this species
        for protId in SpeciesCDSSource(taxIdForProcessing):
            cds = CDSHelper(taxIdForProcessing, protId)

            if (not cdsFilter is None) and (not cdsFilter(cds)):
                continue

            cdsSeqId = cds.seqId()

            shuffledIds = cds.shuffledSeqIds(shuffleType=shuffleType)

            # How many shuffles (for this cds) exist in the data we found?
            computedShufflesCount = len(
                computedIds.intersection(frozenset(shuffledIds)))

            if (computedShufflesCount < minShuffledGroups
                    or (not cdsSeqId in computedIds)):
                #print("%s - found only %d groups, skipping" % (protId, computedShufflesCount))
                skipped += 1
                continue

            # Get the computed results for this CDS
            seqIds = [cds.seqId()]
            seqIds.extend(cds.shuffledSeqIds(shuffleType=shuffleType))
            if (len(seqIds) > maxShuffledGroups + 1):
                seqIds = seqIds[:maxShuffledGroups + 1]
            results = [computed.get(x) for x in seqIds]

            if (results is None or len([() for x in results if not x is None])
                    < minShuffledGroups):
                print("Not enough results found for %s" % protId)
                skipped += 1
                continue

            # Decode the results
            results = list(
                map(
                    lambda x: decodeJsonSeriesRecord(decompressSeriesRecord(x))
                    if not x is None else None, results))
            if (returnCDS):
                yield {
                    "taxid": taxIdForProcessing,
                    "content": results,
                    "cds": cds
                }
            else:
                yield {"taxid": taxIdForProcessing, "content": results}
            del results
            del cds
            selected += 1

            if (rl()):
                print("# %s - %d records included, %d records skipped" %
                      (datetime.now().isoformat(), selected, skipped))
    if (len(cds.sequence()) != cds.length()):
        print(
            "WARNING: incorrect sequence length detected for record (taxid=%d, protId=%s); real-length=%d, recorded-length=%d."
            % (taxId, protId, len(cds.sequence()), cds.length()))
        warningsCount += 1

    recomputedCrc = getCrc(cds.sequence())
    annotatedCrc = cds.crc()
    assert (recomputedCrc == annotatedCrc)
    print(cds.sequence()[:15])

    seq1trans = Seq(cds.sequence(), generic_dna).translate()
    crc1 = getCrc(seq1trans)

    shuffles = cds.shuffledSeqIds(shuffleType=db.Sources.ShuffleCDSv2_python)
    unique = len(frozenset(shuffles))

    if (unique != len(shuffles)):
        print(
            "WARNING: duplicate shuffles found in record (taxid=%d, protId=%s); count=%d, unique=%d."
            % (taxId, protId, len(shuffles), unique))
        warningsCount += 1
    statsShuffles.push(len(shuffles))

    shuffledCrcs = set()
    shuffledTransCrcs = set()

    for shuffledSeqId in shuffles:
        shufSeq = cds.getShuffledSeq2(shuffledSeqId)