Exemple #1
0
def getRandomizedSequenceCacheForVerticalPermutations(taxId):
    """Return the vertical-permutation randomization cache for a species.

    Caches are memoized in the module-level `_caches` dict, keyed by
    (taxId, db.Sources.ShuffleCDS_vertical_permutation_1nt). On a miss, the
    native sequences of all CDSs whose length is a multiple of 3 are read and
    a new VerticalRandomizationCache is built, stored, and returned.
    """
    global _caches

    cacheKey = (taxId, db.Sources.ShuffleCDS_vertical_permutation_1nt)
    if cacheKey in _caches:
        return _caches[cacheKey]

    # Collect the native sequences (only CDSs made of whole codons)
    nativeSeqs = {}
    for protId in SpeciesCDSSource(taxId):
        helper = CDSHelper(taxId, protId)
        if helper.length() % 3 == 0:
            nativeSeqs[protId] = helper.sequence()

    translationTable = getSpeciesTranslationTable(taxId)
    permuter = SynonymousCodonPermutingRandomization(translationTable)

    def randomizer(cdss):
        return permuter.verticalPermutation(cdss)

    newCache = VerticalRandomizationCache(
        shuffleType=db.Sources.ShuffleCDS_vertical_permutation_1nt,
        taxId=taxId,
        nativeSeqsMap=nativeSeqs,
        geneticCode=translationTable,
        randomizer=randomizer)
    _caches[cacheKey] = newCache
    print(_caches.keys())

    return newCache
Exemple #2
0
def writeSequenceToTempFile(taxId):
    """Write all CDS sequences of a species, concatenated, to a temp FASTA file.

    CDSs whose length is not a multiple of 3 are left out. The remaining
    sequences are joined into a single record with id "allCDSs".

    Returns:
        A tuple (number of records written, the NamedTemporaryFile object).
        The file is deleted on close unless `debugMode` is set.
    """
    print("Fetching sequence for taxid={}".format(taxId))

    codingSeqs = []
    for protId in SpeciesCDSSource(taxId):
        helper = CDSHelper(taxId, protId)
        if helper.length() % 3 != 0:
            continue

        codingSeqs.append(helper.sequence())

        # Progress indicator
        if len(codingSeqs) % 1000 == 999:
            print(".")

    concatenated = Seq(''.join(codingSeqs), NucleotideAlphabet)
    records = [SeqRecord(concatenated, id="allCDSs", description="")]

    tempFile = NamedTemporaryFile(mode="w", delete=(not debugMode))
    # Write the full sequence into the file (addressed by name)
    SeqIO.write(records, tempFile.name, "fasta")

    return (len(records), tempFile)
def getIdentifiersConversionTableUsingGff3():
    """Return the table mapping alternative identifiers to protein ids.

    The table lives in the module-level `altIdentifiers` dict and is built
    only once (while it is still empty). For every CDS of the module-level
    `taxId`, the gene id and all of its equivalent identifiers reported by
    the genome model are mapped to the CDS's protein id.

    Returns:
        The `altIdentifiers` dict (the same object on every call).
    """
    global altIdentifiers

    if altIdentifiers:
        return altIdentifiers

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        geneId = cds.getGeneId()
        for alt in gm.findEquivalentIdentifiers(geneId):
            altIdentifiers[alt] = protId
        altIdentifiers[geneId] = protId

    # Return the freshly built table so the first call behaves like the
    # cached path above (previously the first call returned None).
    return altIdentifiers
Exemple #4
0
def storeNewShuffles(taxId, protId, newShuffleIds, shuffleType=db.Sources.ShuffleCDSv2_python, dontStore=False):
    """Create (and normally store) randomized sequences for one protein.

    Args:
        taxId: Taxonomy id of the species.
        protId: Protein id of the CDS.
        newShuffleIds: Shuffle ids for which sequences should be produced.
        shuffleType: Randomization type; ShuffleCDSv2_python and
            ShuffleCDS_vertical_permutation_1nt are supported.
        dontStore: Only honoured for vertical permutations; when true the
            generated sequences are returned without being stored.

    Returns:
        The result of storeRandomizedSequences(), or the list of generated
        sequences when `dontStore` is set (vertical permutations only).

    Raises:
        Exception: If `shuffleType` is not supported.
    """
    cds = CDSHelper(taxId, protId)
    print(protId)

    if shuffleType == db.Sources.ShuffleCDSv2_python:
        return storeRandomizedSequences(cds,
                                        createRandomizedSeqs(cds, newShuffleIds, shuffleType),
                                        newShuffleIds,
                                        shuffleType)

    elif shuffleType == db.Sources.ShuffleCDS_vertical_permutation_1nt:
        cache = getRandomizedSequenceCacheForVerticalPermutations(taxId)

        # Build a real list: on Python 3 map() is lazy, so printing it shows
        # only the map object and the caller would get a one-shot iterator.
        seqs = [cache.getShuffledSeq(protId, shuffleId) for shuffleId in newShuffleIds]
        print(seqs)

        if dontStore:
            return seqs

        return storeRandomizedSequences(cds,
                                        seqs,
                                        newShuffleIds,
                                        shuffleType)

    else:
        raise Exception("Unsupported shuffleType={}".format(shuffleType))
Exemple #5
0
def testSpecies(taxId):
    """Count how many CDS gene ids of a species appear in its PaxDB data.

    Prints a one-line summary and returns the two counters.

    Returns:
        A tuple (countFound, countNotFound).
    """
    paData = getSpeciesPaxdbData(taxId)

    countFound = 0
    countNotFound = 0

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId=taxId, protId=protId)
        geneId = cds.getGeneId()

        if geneId in paData:
            countFound += 1
        else:
            countNotFound += 1

    # A species without any CDS records would otherwise divide by zero
    total = countFound + countNotFound
    percentFound = countFound / total * 100 if total else 0.0

    print("Species: {} -> Found: {} ({:.3}%) Not found: {}".format(taxId, countFound, percentFound, countNotFound))
    return (countFound, countNotFound)
Exemple #6
0
def testCDSand3UTRRandomizationIncludingNextCDS(
        taxId: int = 511145,
        geneticCode: int = 11,
        constantOverlaps: bool = False) -> int:
    """Smoke-test CDSand3UTRRandomizationIncludingNextCDS on every CDS of a species.

    Each CDS listed by the genome model is randomized 20 times; every result
    must have the same length as the native sequence (asserted). Failures to
    load a sequence or to randomize it are counted and reported at the end.

    Returns:
        Always 0.
    """
    from data_helpers import SpeciesCDSSource
    from genome_model import getGenomeModelFromCache

    cdsRandomizer = SynonymousCodonPermutingRandomization(geneticCode=geneticCode)
    utrRandomizer = NucleotidePermutationRandomization()
    rand = CDSand3UTRRandomizationIncludingNextCDS(
        cdsRandomizer,
        utrRandomizer,
        taxId,
        constantOverlaps=constantOverlaps)

    numOK = 0
    numFailed = 0  # sequence-loading failures plus randomize() failures
    numRandomizeFailed = 0  # randomize() failures only
    numSkipped = 0

    for protId in getGenomeModelFromCache(taxId).allCDSSource():
        try:
            nativeSeq = CDSHelper(taxId, protId).sequence()
        except Exception:
            numFailed += 1
            continue

        for _ in range(20):
            try:
                ret = rand.randomize(nativeSeq, protId)
            except Exception as err:
                print(
                    "Caught exception during call to randomize(), protId={}!".
                    format(protId))
                print(err)
                numFailed += 1
                numRandomizeFailed += 1
                continue

            if ret[0] < 1e5:
                print(protId)

            if len(ret[2]) != len(nativeSeq):
                # Show the offending result and repeat the call before the
                # assertion below fails (handy when stepping in a debugger)
                print(ret)
                rand.randomize(nativeSeq, protId)
            assert (len(ret[2]) == len(nativeSeq))

        # Counted once per CDS whose sequence could be loaded
        numOK += 1

    print("OK: {}, NotOK: {}, Skipped: {}, Total: {}".format(
        numOK, numFailed, numSkipped,
        numOK + numFailed + numSkipped))
    print("randomize exception: {}".format(numRandomizeFailed))

    return 0
Exemple #7
0
    def calculateMissingWindowsForSequence(
            self,
            taxId,
            protId,
            seqIds,
            requestedShuffleIds,
            firstWindow,
            lastWindowStart,
            windowStep,
            reference="begin",
            shuffleType=db.Sources.ShuffleCDSv2_python):
        """Calculate and store the sliding-window profile values of one CDS.

        The existing results of all requested shuffle-ids are read; if any of
        them lacks a requested window, the requested windows are calculated
        (according to self._seriesSourceNumber) for every available results
        record, each updated record is written to the log file and, unless
        self._debugDoneWriteResults is set, saved through the CDS helper.

        Args:
            taxId: Numeric taxonomy id of the species.
            protId: Identifier of the protein (CDS) record.
            seqIds: Sequence ids, parallel to `requestedShuffleIds`.
            requestedShuffleIds: Shuffle ids to process; a negative id
                selects the native sequence.
            firstWindow: Not used by this method.
            lastWindowStart: With reference="begin", the largest window start
                to consider; with reference="end", the smallest one.
            windowStep: Distance between consecutive window starts.
            reference: "begin" or "end" - the sequence end to which the
                window grid is anchored.
            shuffleType: Source number identifying the randomization type.

        Returns:
            None.

        Raises:
            Exception: On an unsupported `reference`, a missing temperature
                value, a sequence shorter than the window, no windows to
                calculate (reference="begin"), an empty sequence, or a
                sequence whose length differs from the recorded length.
        """
        timerForPreFolding.start()
        logging.warning("Parameters: %d %s %s %s %d %d %s %d" %
                        (taxId, protId, seqIds, requestedShuffleIds,
                         lastWindowStart, windowStep, reference, shuffleType))
        f = self._logfile

        assert (len(seqIds) > 0)
        assert (len(seqIds) == len(requestedShuffleIds))

        # The native-temperature series needs the species' growth temperature
        optimalSpeciesGrowthTemperature = None
        if (self._seriesSourceNumber ==
                db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp):
            (numericalProp, _) = getSpeciesTemperatureInfo(taxId)
            optimalSpeciesGrowthTemperature = numericalProp[0]

            if optimalSpeciesGrowthTemperature is None:
                raise Exception(
                    "No temperature value for taxid={}, can't calculate native-temperature folding profile..."
                    .format(taxId))
            else:
                optimalSpeciesGrowthTemperature = float(
                    optimalSpeciesGrowthTemperature)
                assert (optimalSpeciesGrowthTemperature >= -30.0
                        and optimalSpeciesGrowthTemperature <= 150.0)

        if (reference != "begin" and reference != "end"):
            timerForPreFolding.stop()
            e = "Specificed profile reference '%s' is not supported! (" % reference
            logging.error(e)
            raise Exception(e)

        # We will process all listed shuffle-ids for the following protein record
        cds = CDSHelper(taxId, protId)

        if (cds.length() < self._windowWidth):
            # Fix: the message used to reference `itemToProcess`, which is not
            # defined in this method; the item is identified by protId instead.
            e = "Refusing to process item %s because the sequence length (%d nt) is less than the window size (%d nt)\n" % (
                protId, cds.length(), self._windowWidth)
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        # Create a list of the windows we need to calculate for this CDS
        if reference == "begin":
            requestedWindowStarts = frozenset(
                range(
                    0,
                    min(lastWindowStart + 1,
                        cds.length() - self._windowWidth - 1), windowStep))
            if (len(requestedWindowStarts) == 0):
                e = "No windows exist for calculation taxid=%d, protId=%s, CDS-length=%d, lastWindowStart=%d, windowStep=%d, windowWidth=%d - Skipping...\n" % (
                    taxId, protId, cds.length(), lastWindowStart, windowStep,
                    self._windowWidth)
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)
        elif reference == "end":
            # The grid is aligned so that its last window ends at the end of
            # the sequence; only starts >= lastWindowStart are kept.
            # NOTE(review): unlike the "begin" branch, an empty window set is
            # not rejected here, and max() further below fails on it.
            lastPossibleWindowStart = cds.length() - self._windowWidth
            requestedWindowStarts = frozenset(
                filter(
                    lambda x: x >= lastWindowStart,
                    range(lastPossibleWindowStart % windowStep,
                          lastPossibleWindowStart + 1, windowStep)))
        else:
            assert (False)

        # First, read available results (for all shuffle-ids) in JSON format.
        # The result is keyed by shuffle-id; requested items that have no
        # results yet are represented by None.
        logging.info("DEBUG: requestedShuffleIds (%d items): %s\n" %
                     (len(requestedShuffleIds), requestedShuffleIds))
        existingResults = cds.getCalculationResult2(self._seriesSourceNumber,
                                                    requestedShuffleIds,
                                                    True,
                                                    shuffleType=shuffleType)
        assert (len(existingResults) == len(requestedShuffleIds))
        logging.info("requestedShuffleIds: %s" % requestedShuffleIds)
        logging.info("existingResults.keys(): %s" % existingResults.keys())
        assert (frozenset(requestedShuffleIds) == frozenset(
            existingResults.keys()))
        logging.info("DEBUG: existingResults (%d items): %s\n" %
                     (len(existingResults), existingResults))

        # Check for which of the requested shuffle-ids there are values missing
        shuffleIdsToProcess = {}
        for shuffleId, r in existingResults.items():
            if r is None:
                # There are no existing results for this shuffle-id. If it was
                # requested, it should be calculated now (including all windows)
                if shuffleId in requestedShuffleIds:
                    shuffleIdsToProcess[shuffleId] = list(
                        requestedWindowStarts)

                timerForPreFolding.stop()

                # TODO - verify this line; should we abort this sequence by throwing????
                continue

            logging.info("/// shuffleId r = %d %s" % (shuffleId, r))
            logging.info("r[MFE-profile] %s" % r["MFE-profile"])

            # Indices (=window starts) of all non-None values already stored
            alreadyProcessedWindowStarts = frozenset([
                i for i, x in enumerate(r["MFE-profile"]) if x is not None
            ])
            # Are there any requested windows that are not already computed?
            missingWindows = requestedWindowStarts - alreadyProcessedWindowStarts
            if (missingWindows):
                shuffleIdsToProcess[shuffleId] = missingWindows

        if (not shuffleIdsToProcess):
            e = "All requested shuffle-ids in (taxId: %d, protId: %s, seqs: %s) seem to have already been processed. Skipping...\n" % (
                taxId, protId, str(list(zip(seqIds, requestedShuffleIds))))
            logging.warning(e)
            timerForPreFolding.stop()
            return
        logging.info("DEBUG: shuffleIdsToProcess (%d items): %s\n" %
                     (len(shuffleIdsToProcess), shuffleIdsToProcess))

        logging.info("DEBUG: Before (%d items): %s\n" %
                     (len(existingResults), existingResults))
        # Initialize new results records
        for shuffleId in shuffleIdsToProcess.keys():
            if existingResults[shuffleId] is None:
                logging.info(seqIds)
                logging.info(requestedShuffleIds)
                logging.info(shuffleId)
                thisSeqId = seqIds[requestedShuffleIds.index(shuffleId)]

                existingResults[shuffleId] = {
                    "id":
                    "%s/%s/%d/%d" % (taxId, protId, thisSeqId, shuffleId),
                    "seq-crc": None,
                    "MFE-profile": [],
                    "MeanMFE": None,
                    "v": 2,
                    "shuffle-type": shuffleType
                }
        logging.info("DEBUG: existingResults (%d items): %s\n" %
                     (len(existingResults), existingResults))
        timerForPreFolding.stop()

        # Load the sequences of all shuffle-ids we need to work on
        # TODO - combine loading of multiple sequences into one DB operation
        for shuffleId, record in existingResults.items():
            if record is None:
                logging.info(
                    "DEBUG: skipping empty results record for shuffleId={}".
                    format(shuffleId))
                continue
            timerForPreFolding.start()

            seq = None
            annotatedSeqId = None
            # Get the sequence for this entry (negative ids = native sequence)
            if (shuffleId < 0):
                seq = cds.sequence()
                annotatedSeqId = cds.seqId()
            else:
                seq = cds.getShuffledSeq(shuffleId, shuffleType)
                annotatedSeqId = cds.getShuffledSeqId(shuffleId, shuffleType)

            if (seq is None or len(seq) == 0):
                # Gather diagnostics through the alternative access paths
                # (and with the cached entry removed) before giving up
                seq2 = cds.getShuffledSeq2(annotatedSeqId)
                seq3 = cds._fetchSequence(annotatedSeqId)
                seq4 = cds._cache.get("%d:seq" % annotatedSeqId)
                if not seq4 is None:
                    del cds._cache["%d:seq" % annotatedSeqId]
                seq5 = cds.getShuffledSeq2(annotatedSeqId)
                e = "Got empty sequence for shuffleId=%d, seqId=%d, taxId=%d, protId=%s, numShuffled=%d, ids[%d:%d]=%s, len(seq2)=%d, len(seq3)=%d, len(seq4)=%d, len(seq5)=%d" % (
                    shuffleId, annotatedSeqId, taxId, protId,
                    len(cds.shuffledSeqIds()), shuffleId - 2, shuffleId + 2,
                    cds.shuffledSeqIds()[shuffleId - 2:shuffleId + 2],
                    len(seq2) if not seq2 is None else -1,
                    len(seq3) if not seq3 is None else -1,
                    len(seq4) if not seq4 is None else -1,
                    len(seq5) if not seq5 is None else -1)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

            expectedSeqLength = cds.length()
            if (not expectedSeqLength is None):
                if (expectedSeqLength != len(seq)):
                    e = "Warning: taxid=%d, protid=%s, seqid=%d - unexpected length %d (expected: %d)\n" % (
                        taxId, protId, annotatedSeqId, len(seq),
                        expectedSeqLength)
                    f.write(e)
                    logging.error(e)
                    timerForPreFolding.stop()
                    raise Exception(e)

            if (len(seq) < self._windowWidth):
                # Sequence is shorter than required window
                e = "Warning: skipping sequence because it is shorter than the requested window...\n"
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

            logging.info(
                "DEBUG: Processing item taxId=%d, protId=%s, shuffle=%d (length=%d, %d windows)...\n"
                % (taxId, protId, shuffleId, len(seq),
                   len(requestedWindowStarts)))

            # TODO - Remove any old value stored in this key?

            logging.info(seq[:50])

            MFEprofile = record["MFE-profile"]

            # Make sure the profile array contains an entry for every window
            # start we are about to write (and, if windows are non-contiguous,
            # for the positions between them).
            # Fix: the largest start is used as an index, so the profile needs
            # max+1 entries; the previous `len < max` test left index `max`
            # out of range when len == max (e.g. an empty profile and a
            # single window starting at 0).
            requiredProfileLength = max(requestedWindowStarts) + 1
            if (len(MFEprofile) < requiredProfileLength):
                MFEprofile.extend(
                    [None] * (requiredProfileLength - len(MFEprofile)))
            assert (len(MFEprofile) >= requiredProfileLength)

            stats = RunningStats()
            stats.extend([x for x in MFEprofile if x is not None])

            timerForPreFolding.stop()
            timerForFolding.start()
            for start in requestedWindowStarts:
                fragment = seq[start:(start + self._windowWidth)]
                assert (len(fragment) == self._windowWidth)

                if self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2:
                    # Calculate the RNA folding energy. This is the computation-heavy part.
                    energy = RNAfold_direct(fragment)
                    assert (energy <= 0.0)

                elif self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp:
                    # Calculate the RNA folding energy. This is the computation-heavy part.
                    energy = RNAfold_direct(fragment,
                                            explicitCalculationTemperature=
                                            optimalSpeciesGrowthTemperature)
                    assert (energy <= 0.0)

                elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_BeginReferenced:
                    # Synthetic test series that depends only on the window start
                    if shuffleId < 0:
                        energy = 0
                    else:
                        energy = start % 50 - 20

                elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_EndReferenced:
                    # Synthetic test series measured from the last possible window
                    if shuffleId < 0:
                        energy = 0
                    else:
                        energy = (expectedSeqLength - self._windowWidth -
                                  start) % 50 - 20

                else:
                    logging.error(
                        "Received unknown seriesSourceNumber {}".format(
                            self._seriesSourceNumber))
                    assert (False)

                # Store the calculation result
                stats.push(energy)
                MFEprofile[start] = energy

            print(
                "///////////////////  shuffleId={} (len={}) //////////////////////////"
                .format(shuffleId, expectedSeqLength))
            prettyPrintProfile(MFEprofile)

            timerForFolding.stop()
            timerForPostFolding.start()

            # Format the updated record
            crc = calcCrc(seq)
            record["seq-crc"] = crc
            # Round items to save space (these are not exact numbers anyway)
            record["MFE-profile"] = [round4(x) for x in MFEprofile]
            record["MeanMFE"] = stats.mean()
            result = json.dumps(record)

            f.write(result)
            f.write("\n")

            if (not self._debugDoneWriteResults):
                cds.saveCalculationResult2(self._seriesSourceNumber, result,
                                           annotatedSeqId, False)

            timerForPostFolding.stop()

        timerForPostFolding.start()

        if (not self._debugDoneWriteResults):
            cds.commitChanges()

        timerForPostFolding.stop()
            continue

        # ------------------------------------------------------------------------------------------
        # Exclude some sequences from the calculation
        # ------------------------------------------------------------------------------------------

        # Skip sequences with partial CDS annotations
        #if(r.exists("CDS:taxid:%d:protid:%s:partial" % (taxIdForProcessing, protId))):
        #    skipped += 1
        #    continue

        #if( not r.exists(nativeCdsSeqIdKey % (taxIdForProcessing, protId)) ):
        #    skipped +=1
        #    continue

        cds = CDSHelper(taxIdForProcessing, protId)

        seqLength = cds.length()
        stopCodonPos = cds.CDSlength()

        if seqLength is None:
            print(
                "Warning: Could not find CDS length entry for taxid=%d, protid=%s"
                % (taxIdForProcessing, protId))
            skipped += 1
            stats['skipped-cds-length-missing'] += 1
            continue

        # Skip sequences with length <40nt (window width)
        if (seqLength < windowWidth + 1):
            print("short seq")
                    err[ErrorTypes.UpdateProfileTooShort] += 1
                    badUpdateRecords.add(updateRecord[2])
                    if( args.verbose>2 ):
                        print("UpdateProfileTooShort")
                else:
                    #print('-------- 0 --------')
                    #print(len(profile0))
                    #print(profile0)
                    #print('-------- 1 --------')
                    #print(len(profile1))
                    #print(profile1)
                    profile0.extend([None]*(len(profile1)-len(profile0))) # Add 'None's at the end (to allow comparison of new values)
                    hasNewWindows = False
                    numNewWindows = 0

                    cds = CDSHelper(identifierFromOriginalRecord[0], identifierFromOriginalRecord[1] )
                    cdsLength = cds.length()

                    for pos, vs in enumerate(zip( profile0, profile1 )):
                        v0, v1 = vs

                        if( not v1 is None ):
                            allWindows_FrameRelativeToStart.update( (pos%10,) )
                            allWindows_FrameRelativeToEnd.update( ((cdsLength-pos)%10,) )

                        # Note: this check rejects changes to existing windows (though those might be needed at some point)
                        if( (not v0 is None) and
                            ( (v1 is None) or
                              (abs(v0-v1) >= 1e-8) )
                            ):
                            err[ErrorTypes.ExistingProfileValueCorrupted] += 1
# Configuration
taxId = args.taxId  # id of the species to process, taken from the parsed arguments

#statsShuffles = RunningStats()
# Collects the value returned by dropShuffledSeqs() for every CDS
statsShuffles = OfflineStats()

recordsCount = 0  # number of CDS records handled so far
warningsCount = 0  # never updated in this section

rl = RateLimit(30)  # gates the progress report below (presumably a 30-second interval -- TODO confirm)

total = countSpeciesCDS(taxId)  # denominator of the progress percentage

# Call dropShuffledSeqs() on every CDS of the species; the keep_first_n_shuffles
# argument is passed through as lastItemToKeep.
for protId in SpeciesCDSSource(taxId):
    cds = CDSHelper(taxId, protId)

    statsShuffles.push(
        cds.dropShuffledSeqs(lastItemToKeep=args.keep_first_n_shuffles))

    recordsCount += 1

    # Report progress only when the rate limiter allows it
    if (rl()):
        print("processed %d records (%.2g%%)" %
              (recordsCount, float(recordsCount) / total * 100))

    # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY #
    #if( recordsCount > 20 ):
    #    break
    # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY #
Exemple #11
0
def readSeriesResultsForSpecies(seriesSourceNumber,
                                species,
                                minShuffledGroups=20,
                                maxShuffledGroups=20,
                                shuffleType=db.Sources.ShuffleCDSv2_python,
                                cdsFilter=None,
                                returnCDS=True):
    """Yield the decoded computation results of every qualifying CDS.

    Generator. For each species, all computed results of the series are
    loaded, and every CDS that passes `cdsFilter`, has a result for its
    native sequence, and has at least `minShuffledGroups` results for its
    shuffled sequences is reported.

    Args:
        seriesSourceNumber: Source number of the calculated series.
        species: A single taxid or an iterable of taxids (not a string).
        minShuffledGroups: Minimum number of available results required.
        maxShuffledGroups: Maximum number of shuffled results to return.
        shuffleType: Source number identifying the randomization type.
        cdsFilter: Optional predicate called with the CDS helper; CDSs for
            which it returns a false value are ignored.
        returnCDS: When true, the CDS helper is included in each item.

    Yields:
        A dict with "taxid" and "content" (list of decoded records, native
        sequence first; None for missing results), plus "cds" when
        `returnCDS` is true.

    Raises:
        Exception: If `species` is a string.
    """
    # `basestring` only exists on Python 2; fall back to `str` so the check
    # below does not fail with NameError on Python 3.
    try:
        stringTypes = basestring
    except NameError:
        stringTypes = str

    if isinstance(species, Iterable):
        # usually, species will be a sequence of numeric taxid values
        if isinstance(species, stringTypes):
            raise Exception("species cannot be string")
    else:
        species = (species, )  # assume we got a single (numeric) taxid value
    assert (minShuffledGroups <= maxShuffledGroups)

    for taxIdForProcessing in species:
        print("Procesing %d sequences for tax-id %d (%s)..." %
              (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing,
               getSpeciesName(taxIdForProcessing)))

        computed = getAllComputedSeqsForSpecies(seriesSourceNumber,
                                                taxIdForProcessing,
                                                maxShuffledGroups,
                                                shuffleType=shuffleType)
        computedIds = frozenset(computed.keys())
        print("Collecting data from %d computation results..." % len(computed))

        skipped = 0
        selected = 0

        # Iterate over all CDS entries for this species
        for protId in SpeciesCDSSource(taxIdForProcessing):
            cds = CDSHelper(taxIdForProcessing, protId)

            if (cdsFilter is not None) and (not cdsFilter(cds)):
                continue

            cdsSeqId = cds.seqId()
            shuffledIds = cds.shuffledSeqIds(shuffleType=shuffleType)

            # How many shuffles (for this cds) exist in the data we found?
            computedShufflesCount = len(
                computedIds.intersection(frozenset(shuffledIds)))

            if (computedShufflesCount < minShuffledGroups
                    or (cdsSeqId not in computedIds)):
                skipped += 1
                continue

            # Get the computed results for this CDS (native sequence first)
            seqIds = [cdsSeqId]
            seqIds.extend(shuffledIds)
            seqIds = seqIds[:maxShuffledGroups + 1]
            results = [computed.get(x) for x in seqIds]

            availableCount = sum(1 for x in results if x is not None)
            if availableCount < minShuffledGroups:
                print("Not enough results found for %s" % protId)
                skipped += 1
                continue

            # Decode the results (missing entries stay None)
            results = [
                decodeJsonSeriesRecord(decompressSeriesRecord(x))
                if x is not None else None for x in results
            ]
            if (returnCDS):
                yield {
                    "taxid": taxIdForProcessing,
                    "content": results,
                    "cds": cds
                }
            else:
                yield {"taxid": taxIdForProcessing, "content": results}
            # Release the (potentially large) objects before the next iteration
            del results
            del cds
            selected += 1

            if (rl()):
                print("# %s - %d records included, %d records skipped" %
                      (datetime.now().isoformat(), selected, skipped))
        #protId = codecs.decode(protId)
        # Filtering

        # Skip sequences with partial CDS annotations
        #if(r.exists("CDS:taxid:%d:protid:%s:partial" % (taxIdForProcessing, protId))):
        #    skipped += 1
        #    continue

        #if( not r.exists(nativeCdsSeqIdKey % (taxIdForProcessing, protId)) ):
        #    skipped +=1
        #    continue

        if (rl()):
            print("%d %d" % (selected, skipped))

        cds = CDSHelper(taxIdForProcessing, protId)

        seqLength = cds.length()
        if (not seqLength is None):
            # Skip sequences with length <40nt (window width)
            if (seqLength < calculationWidth + windowWidth - 1):
                skipped += 1
                continue
        else:
            print(
                "Warning: Could not find CDS length entry for taxid=%d, protid=%s"
                % (taxIdForProcessing, protId))
            skipped += 1
            continue

        #requiredNumWindows = seqLength - windowWidth + 1
# Configuration
taxId = 3055  # hard-coded id of the species to check

statsLength = RunningStats()  # accumulates the recorded length of every CDS
statsShuffles = RunningStats()  # not used in this section

recordsCount = 0  # number of CDS records visited
warningsCount = 0  # number of records whose actual and recorded lengths differ

rl = RateLimit(30)  # not used in this section

total = countSpeciesCDS(taxId)  # not used in this section

# Check every CDS record of the species: compare the actual sequence length to
# the recorded one, and the recomputed CRC to the stored one.
for protId in SpeciesCDSSource(taxId):
    cds = CDSHelper(taxId, protId)
    recordsCount += 1

    statsLength.push(cds.length())

    # A length mismatch is only reported and counted; processing continues
    if (len(cds.sequence()) != cds.length()):
        print(
            "WARNING: incorrect sequence length detected for record (taxid=%d, protId=%s); real-length=%d, recorded-length=%d."
            % (taxId, protId, len(cds.sequence()), cds.length()))
        warningsCount += 1

    # A CRC mismatch aborts the script (assertion)
    recomputedCrc = calcCrc(cds.sequence())
    annotatedCrc = cds.crc()
    assert (recomputedCrc == annotatedCrc)
    # Show the first 15 characters of the sequence
    print(cds.sequence()[:15])
Exemple #14
0
# Nothing to delete when the species has no stored CDS records
if (countSpeciesCDS(taxId) == 0):
    print("Species %d (%s) doesn't have any proteins..." %
          (taxId, getSpeciesName(taxId)))
    print("Nothing left to do...")
    sys.exit(0)

print("Species %d (%s) has %d proteins stored." %
      (taxId, getSpeciesName(taxId), countSpeciesCDS(taxId)))
# Grace period: gives the operator 10 seconds to abort before anything is dropped
print("Will delete it in 10 seconds...")
sleep(10)

count = 0  # number of CDS records processed

for protId in SpeciesCDSSource(taxId):
    print(protId)
    cds = CDSHelper(taxId, protId)
    # Best effort: a failure to drop the shuffled or native sequences is only
    # printed, so that the record itself is still dropped below.
    try:
        cds.dropShuffledSeqs()
    except Exception as e:
        print(e)

    try:
        cds.dropNativeSeq()
    except Exception as e:
        print(e)

    # Not guarded: an exception raised here stops the script
    cds.dropRecord()

    count += 1

    if (rl()):

warnings = Counter()

numUniqueShuffles = Counter()

for taxId in species:
    proteinsDone = 0

    #nativeColumns = [[] for x in range(maxCodons)]
    #shuffledColumns = [[[] for x in range(maxCodons)] for y in range(maxShuffles)]
    allNativeSeqs = {}
    allShuffledSeqs = {}

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        warnings.update(("total-cds", ))

        allIds = cds.shuffledSeqIds(shuffleType=shuffleType)[:maxShuffles]

        nativeSeq = cds.sequence()
        if (len(nativeSeq) % 3 != 0):
            warnings.update(("has-broken-codons", ))
            continue

        nativeCodons = Counter(splitCodons(nativeSeq))

        hasMismatchedCodons = False
        allNativeSeqs[protId] = nativeSeq
        hashesForShuffles = set()
Exemple #16
0
    def randomize(self, nucleotideSeq: str, protId: str) -> (int, float, str):
        """Randomize a CDS, its 3' flanking region and the CDS that follows it.

        ``nucleotideSeq`` is treated as three consecutive parts: the CDS of
        ``protId`` (the first ``cds.CDSlength()`` nt), the flanking region
        (``cds.flankingRegion3UtrLength()`` nt; negative when the next CDS
        overlaps this one) and the remainder, which is handled as the next
        CDS.  The two CDS parts go through ``self.cdsRand`` and the flanking
        region through ``self.utrRand``.  When ``self.constantOverlaps`` is
        set and the next CDS overlaps this one, the overlapping tail of this
        CDS (extended backwards to a codon boundary) is copied unchanged.

        Returns:
            A tuple ``(totalPerms, totalIdentity, sequence)``: the product of
            the three permutation counts, the length-weighted mean of the
            three identity values, and the concatenated randomized sequence.

        Raises:
            Exception: the overlap with the next CDS is longer than this CDS.
            AssertionError: one of the length / reading-frame checks failed.
        """

        def _revcomp(seq):
            # Reverse-complement a nucleotide string
            return str(Seq(seq, generic_dna).reverse_complement())

        cds = CDSHelper(self.taxId, protId)

        cdsLengthNt = cds.CDSlength()
        assert (cdsLengthNt % 3 == 0)
        flankingRegionLengthNt = cds.flankingRegion3UtrLength()
        nextCDSOppositeStrand = cds.nextCDSOnOppositeStrand()

        # Case 1 (no overlap):
        #                     +--------intergenic--------+
        #                     |                          |
        # +-------CDS1--------+                          +------------CDS2-----------+
        # |                   |                          |                           |
        # +===================+--------------------------+===========================+
        # |                   |                          |                           |
        # +===================+--------------------------+===========================+
        # |<---cdsLengthNt--->|<-flankingRegionLengthNt->|                           |
        # |                              (>= 0)                                      |
        # |<---------------------------cds.totalLength()---------------------------->|

        # Case 2 (overlap):
        #                     +--------------------------CDS2-------------------------+
        #                     |                                                       |
        # +---------------------CDS1----------------------+                           |
        # |                   |                           |                           |
        # +===================+===========================+===========================+
        # |                   |                           |                           |
        # +===================+===========================+===========================+
        # |                   |<-flankingRegionLengthNt-->|                           |
        # |                              (<= 0)           |                           |
        # |<----------------cdsLengthNt------------------>|                           |
        # |<----------------------------cds.totalLength()---------------------------->|

        # An overlap longer than the CDS itself is not supported
        if flankingRegionLengthNt < 0 and cdsLengthNt < -flankingRegionLengthNt:
            raise Exception("Next CDS is fully overlapping...")

        # ---------------------------------------------------------------------
        # Part 1: the "main" CDS
        # ---------------------------------------------------------------------
        # The overlap stays untouched only when constant overlaps were
        # requested AND this CDS really overlaps the next one.
        keepOverlapFixed = self.constantOverlaps and flankingRegionLengthNt < 0

        if keepOverlapFixed:
            lastNucBeforeOverlap = cdsLengthNt + flankingRegionLengthNt
            assert (lastNucBeforeOverlap < cdsLengthNt)
            # Round down to a codon boundary so only whole codons are shuffled
            lastNucToRandomize = 3 * (lastNucBeforeOverlap // 3)
            CDSseq = nucleotideSeq[:lastNucToRandomize]
            assert (len(CDSseq) % 3 == 0)
        else:
            CDSseq = nucleotideSeq[:cdsLengthNt]
            assert (len(CDSseq) == cdsLengthNt)

        cdsResult = self.cdsRand.randomizeAmbiguousSequence(CDSseq)
        (CDSpermCount, CDSidentity, randomizedCDS) = cdsResult

        if keepOverlapFixed:
            # Re-attach the part of the CDS that was excluded from shuffling
            randomizedCDS = randomizedCDS + nucleotideSeq[
                lastNucToRandomize:cdsLengthNt]
            assert (len(randomizedCDS) % 3 == 0)

        # The result must be exactly as long as the original CDS
        assert (len(randomizedCDS) == cdsLengthNt)

        # ---------------------------------------------------------------------
        # Part 2: the 3'UTR (present only for a strictly positive length)
        # ---------------------------------------------------------------------
        downstreamStart = cdsLengthNt + flankingRegionLengthNt

        if flankingRegionLengthNt > 0:
            utrSeq = nucleotideSeq[cdsLengthNt:downstreamStart]
            assert (len(utrSeq) == flankingRegionLengthNt)
            utrResult = self.utrRand.randomizeAmbiguousSequence(utrSeq)
            (UTRpermCount, UTRidentity, randomizedUTR) = utrResult
        else:
            utrSeq, randomizedUTR = "", ""
            UTRpermCount, UTRidentity = 1, 1.0

        # ---------------------------------------------------------------------
        # Part 3: the downstream CDS
        # ---------------------------------------------------------------------
        # The start index is valid for positive and negative flanking lengths
        downstreamSeq = nucleotideSeq[downstreamStart:]
        assert (len(downstreamSeq) % 3 == 0)
        if nextCDSOppositeStrand:
            # Shuffle the next CDS in its own reading direction
            downstreamSeq = _revcomp(downstreamSeq)
        assert (len(downstreamSeq) % 3 == 0)

        downstreamResult = self.cdsRand.randomizeAmbiguousSequence(
            downstreamSeq)
        (downstreamPerms, downstreamIdentity,
         randomizedDownstream) = downstreamResult

        if nextCDSOppositeStrand:
            # Back to the orientation of the input sequence
            randomizedDownstream = _revcomp(randomizedDownstream)
        if flankingRegionLengthNt < 0:
            # The overlapping prefix is already covered by the main CDS
            randomizedDownstream = randomizedDownstream[
                -flankingRegionLengthNt:]

        # ---------------------------------------------------------------------
        # Combine the three parts
        # ---------------------------------------------------------------------
        totalPerms = CDSpermCount * UTRpermCount * downstreamPerms

        weightedIdentity = ((CDSidentity * len(CDSseq)) +
                            (UTRidentity * len(utrSeq)) +
                            (downstreamIdentity * len(downstreamSeq)))
        combinedLength = len(CDSseq) + len(utrSeq) + len(downstreamSeq)
        totalIdentity = weightedIdentity / combinedLength

        combinedSeq = randomizedCDS + randomizedUTR + randomizedDownstream
        return (totalPerms, totalIdentity, combinedSeq)
    # build a average profiles for each of the shuffled groups
    shuffleProfiles = []

    medianGCContent = []


    ##
    # Are the profiles computed for each sequence, or are they accumulated?
    ##
    
    sdfasdfasdfasdfasdfasdfadsf asdfasd fasd afsdfsd3##@2

    # Iterate over all CDS entries for this species
    for protId in SpeciesCDSSource(taxIdForProcessing):
        cds = CDSHelper(taxIdForProcessing, protId)

        seqLength = cds.length()
        profileInfo.setCDSLength(seqLength)
        
        if( not seqLength is None ):
            # Skip sequences that are too short
            if(seqLength < numWindows + windowWidth + 1 ):
                skipped += 1
                continue
        else:
            print("Warning: Could not find CDS length entry for taxid=%d, protid=%s" % (taxIdForProcessing, protId) )
            skipped += 1
            continue

        #requiredNumWindows = seqLength - windowWidth + 1
def storeNewShuffles(taxId,
                     protId,
                     newShuffleIds,
                     shuffleType=db.Sources.ShuffleCDSv2_python,
                     dontStore=False):
    """Create randomized sequences for one CDS and (optionally) store them.

    The sequences are produced by the generator matching ``shuffleType``:
    ``createRandomizedSeqs`` for ``ShuffleCDSv2_python``, the per-species
    vertical-permutation cache for ``ShuffleCDS_vertical_permutation_1nt``,
    and ``createRandomizedSeqs_CDS_with_3UTR`` for the three
    ``ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation*`` types.

    Args:
        taxId: taxonomy id of the species.
        protId: protein id of the CDS.
        newShuffleIds: ids of the shuffles to create.
        shuffleType: one of the ``db.Sources`` shuffle types listed above.
        dontStore: when true, return the generated sequences without
            storing them.  This is now honoured for every shuffle type;
            previously only the vertical-permutation branch checked it and
            all other types were stored regardless.

    Returns:
        The generated sequences when ``dontStore`` is true, otherwise the
        result of ``storeRandomizedSequences``.

    Raises:
        Exception: ``shuffleType`` is not one of the supported types.
    """
    cds = CDSHelper(taxId, protId)

    # The three 3'UTR-aware types share one generator and differ only in
    # the shuffleType value handed to it.
    shuffleTypesWith3UTR = (
        db.Sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation,
        db.Sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation_Including_Next_CDS,
        db.Sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation_Including_Next_CDS_Constant_Overlaps,
    )

    if shuffleType == db.Sources.ShuffleCDSv2_python:
        seqs = createRandomizedSeqs(cds, newShuffleIds, shuffleType)

    elif shuffleType == db.Sources.ShuffleCDS_vertical_permutation_1nt:
        cache = getRandomizedSequenceCacheForVerticalPermutations(taxId)

        seqs = [
            cache.getShuffledSeq(protId, shuffleId)
            for shuffleId in newShuffleIds
        ]
        # Diagnostic output kept from the original implementation
        print(seqs)

    elif shuffleType in shuffleTypesWith3UTR:
        seqs = createRandomizedSeqs_CDS_with_3UTR(cds,
                                                  newShuffleIds,
                                                  shuffleType=shuffleType,
                                                  taxId=taxId)

    else:
        raise Exception("Unsupported shuffleType={}".format(shuffleType))

    if dontStore:
        return seqs

    return storeRandomizedSequences(cds, seqs, newShuffleIds, shuffleType)
def processGenome(args, taxId):
    """Write per-CDS flanking-region data and aSD energies for one species.

    For every CDS of ``taxId`` the matching feature is looked up in the
    genome model, the distance to the neighbouring feature reported by
    ``find5PrimeFlankingRegion`` is computed, and a short window around the
    CDS start is fetched (20 nt outside the CDS plus its first 3 nt, per the
    coordinates below).  CDSs without a flanking region, or whose window does
    not end with 'TG' (the last two bases of a start codon), are skipped.
    The windows (without the final 3 nt) are passed to
    ``calculateaSDEnergies`` and the results are written, one CSV line per
    CDS, to ``outputData.format(taxId)``.

    Args:
        args: options forwarded unchanged to ``calculateaSDEnergies``.
        taxId: taxonomy id of the species to process.
    """
    totalProteinsProcessed = 0
    totalSkipped = 0

    seqsForWriting = []
    recordsForWriting = {}

    gm = getGenomeModelFromCache( taxId )

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper( taxId, protId )
        totalProteinsProcessed += 1

        geneId = cds.getGeneId()

        # feature is indexed as (molecule id, feature object)
        feature = gm.findFeatureById( protId )
        currFeature = feature[1]
        strand = currFeature.data['strand']

        # The lookup and its sanity check are identical for both strands
        otherFeature = gm.moleculeModels[ feature[0] ].find5PrimeFlankingRegion( currFeature )

        if otherFeature is None:
            totalSkipped += 1
            continue

        neighbour = otherFeature['downstream-feature']
        assert( neighbour.begin <= neighbour.end)

        # Third tuple element: presumably a reverse-complement flag for
        # getSequence() -- TODO confirm
        if strand=='+':
            flankingRegionLengthNt = otherFeature['curr-feature'].begin - neighbour.end
            windowCoords = (currFeature.begin-20, currFeature.begin+2, False) # include the first 3 nucleotides of the CDS
        else:
            flankingRegionLengthNt = neighbour.begin - otherFeature['curr-feature'].end
            windowCoords = (currFeature.end-3, currFeature.end+20, True) # include the first 3 nucleotides of the CDS

        startWindow = gm.moleculeModels[ feature[0] ].getSequence( *windowCoords )

        if flankingRegionLengthNt < -50:
            # Reported only; the gene is still processed
            print("Warning: found gene with apparent long overlap: {},{},{},{},{}".format( protId, geneId, strand, flankingRegionLengthNt, startWindow.seq ))

        # The window must end with the start codon (ATG/GTG/TTG -> '..TG')
        if startWindow.seq[-2:] != 'TG':
            print("Warning: skipping gene without start codon at the expected place: {},{},{},{},{}".format( protId, geneId, strand, flankingRegionLengthNt, startWindow.seq ))
            totalSkipped += 1
            continue

        # All done - keep the record and the window without the start codon
        recordsForWriting[protId] = (geneId, strand, flankingRegionLengthNt, startWindow.seq )

        seqsForWriting.append( SeqRecord( Seq(startWindow.seq[:-3], NucleotideAlphabet), id=protId) )

    aSD = calculateaSDEnergies( seqsForWriting, args, taxId )
    print(len(aSD))

    with open( outputData.format(taxId), 'wt') as fout:
        for protId, record in recordsForWriting.items():
            # CDSs missing from the aSD results get the value None
            aSDval = aSD.get(protId, None)
            vals = (protId,) + record + (aSDval,)
            fout.write("{},{},{},{},{},{}\n".format( *vals ))

    print("Processed {} coding sequences for taxid {}".format( totalProteinsProcessed, taxId ))
    print("Skipped {} coding sequences".format( totalSkipped ))
Exemple #20
0
        # Skip sequences with partial CDS annotations
        #if(r.exists("CDS:taxid:%d:protid:%s:partial" % (taxIdForProcessing, protId))):
        #    skipped += 1
        #    continue

        #if( not r.exists(nativeCdsSeqIdKey % (taxIdForProcessing, protId)) ):
        #    skipped +=1
        #    continue

        if(rl()):
            print("# %s - %d records included, %d records skipped" % (datetime.now().isoformat(), selected, skipped))
            if( nativeProfile[0].count() > 1005 and rl2()):
                printOutput()

        cds = CDSHelper(taxIdForProcessing, protId)

        seqLength = cds.length()
        if( not seqLength is None ):
            # Skip sequences that are too short
            if(seqLength < numWindows + windowWidth + 1 ):
                skipped += 1
                continue
        else:
            print("Warning: Could not find CDS length entry for taxid=%d, protid=%s" % (taxIdForProcessing, protId) )
            skipped += 1
            continue

        requiredNumWindows = seqLength - windowWidth + 1

        cdsSeqId = cds.seqId()