def getRandomizedSequenceCacheForVerticalPermutations(taxId):
    """Return the vertical-permutation randomization cache for a species.

    Caches are kept in the module-level ``_caches`` dict, keyed by
    ``(taxId, db.Sources.ShuffleCDS_vertical_permutation_1nt)``.  On a cache
    miss, the native sequence of every CDS whose length is a multiple of 3 is
    read, a new ``VerticalRandomizationCache`` is built from those sequences,
    stored under the key and returned.
    """
    global _caches

    shuffleType = db.Sources.ShuffleCDS_vertical_permutation_1nt
    cacheKey = (taxId, shuffleType)

    # Fast path: the cache for this species was already built
    if cacheKey in _caches:
        return _caches[cacheKey]

    # Read all native sequences (only those made of whole codons)
    nativeSeqs = []
    for protId in SpeciesCDSSource(taxId):
        helper = CDSHelper(taxId, protId)
        if helper.length() % 3 != 0:
            continue
        nativeSeqs.append((protId, helper.sequence()))

    geneticCode = getSpeciesTranslationTable(taxId)
    scpr = SynonymousCodonPermutingRandomization(geneticCode)

    def randomizer(cdss):
        return scpr.verticalPermutation(cdss)

    newCache = VerticalRandomizationCache(shuffleType=shuffleType,
                                          taxId=taxId,
                                          nativeSeqsMap=dict(nativeSeqs),
                                          geneticCode=geneticCode,
                                          randomizer=randomizer)
    _caches[cacheKey] = newCache
    print(_caches.keys())

    return newCache
def writeSequenceToTempFile(taxId):
    """Write the concatenated CDS sequences of a species to a temporary FASTA file.

    Every CDS whose length is a multiple of 3 is appended to one long
    sequence, which is written as a single record (id ``allCDSs``).

    Returns a tuple ``(number of records written, temporary file object)``.
    The temporary file is deleted on close unless the module-level
    ``debugMode`` flag is set.
    """
    print("Fetching sequence for taxid={}".format(taxId))

    collected = []
    for protId in SpeciesCDSSource(taxId):
        helper = CDSHelper(taxId, protId)
        if helper.length() % 3:
            # Skip sequences containing a partial codon
            continue

        collected.append(helper.sequence())

        # Progress indicator
        if len(collected) % 1000 == 999:
            print(".")

    combined = SeqRecord(Seq(''.join(collected), NucleotideAlphabet),
                         id="allCDSs",
                         description="")
    records = [combined]

    tempFile = NamedTemporaryFile(mode="w", delete=(not debugMode))
    SeqIO.write(records, tempFile.name, "fasta")  # write the full sequences into the file

    return (len(records), tempFile)
def getIdentifiersConversionTableUsingGff3():
    """Return the identifier conversion table, building it on first use.

    The table is the module-level ``altIdentifiers`` mapping.  For every CDS
    of the species given by the module-level ``taxId``, the gene id and each
    identifier reported by ``gm.findEquivalentIdentifiers(geneId)`` are mapped
    to the protein id.

    Returns:
        The populated ``altIdentifiers`` mapping.  (Previously the first call,
        which builds the table, fell off the end and returned None; only
        subsequent calls returned the table.)
    """
    global altIdentifiers

    # Already built (non-empty) - reuse it
    if altIdentifiers:
        return altIdentifiers

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        geneId = cds.getGeneId()

        for alt in gm.findEquivalentIdentifiers(geneId):
            altIdentifiers[alt] = protId

        # The gene id itself is assigned last, as in the original ordering
        altIdentifiers[geneId] = protId

    return altIdentifiers
def storeNewShuffles(taxId, protId, newShuffleIds, shuffleType=db.Sources.ShuffleCDSv2_python, dontStore=False):
    """Create randomized sequences for one protein and (optionally) store them.

    Args:
        taxId: species identifier passed to ``CDSHelper``.
        protId: protein identifier passed to ``CDSHelper``.
        newShuffleIds: shuffle ids for which sequences are created.
        shuffleType: one of the supported ``db.Sources`` shuffle types.
        dontStore: only honoured for the vertical-permutation type; when true
            the list of generated sequences is returned without storing.

    Returns:
        The result of ``storeRandomizedSequences(...)``, or the list of
        sequences when ``dontStore`` is set for vertical permutations.

    Raises:
        Exception: if ``shuffleType`` is not supported.
    """
    cds = CDSHelper(taxId, protId)
    print(protId)

    if shuffleType == db.Sources.ShuffleCDSv2_python:
        return storeRandomizedSequences(cds,
                                        createRandomizedSeqs(cds, newShuffleIds, shuffleType),
                                        newShuffleIds,
                                        shuffleType)

    elif shuffleType == db.Sources.ShuffleCDS_vertical_permutation_1nt:
        cache = getRandomizedSequenceCacheForVerticalPermutations(taxId)

        # Build a real list: on Python 3 map() is a lazy, one-shot iterator,
        # so printing it shows no sequences and callers of dontStore=True
        # would get an iterator instead of a sequence.
        seqs = [cache.getShuffledSeq(protId, shuffleId) for shuffleId in newShuffleIds]
        print(seqs)

        if dontStore:
            return seqs

        return storeRandomizedSequences(cds, seqs, newShuffleIds, shuffleType)

    else:
        raise Exception("Unsupported shuffleType={}".format(shuffleType))
def testSpecies(taxId):
    """Count how many CDS gene ids of a species appear in its PaxDB data.

    Prints a one-line summary and returns ``(countFound, countNotFound)``.
    A species without any CDS entries is reported as 0% found instead of
    raising ZeroDivisionError.
    """
    paData = getSpeciesPaxdbData(taxId)

    countFound = 0
    countNotFound = 0

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId=taxId, protId=protId)
        geneId = cds.getGeneId()

        if geneId in paData:
            countFound += 1
        else:
            countNotFound += 1

    total = countFound + countNotFound
    # float() keeps the ratio (and the {:.3} format) valid under integer
    # division as well; the guard covers species with no CDS entries.
    percentFound = (float(countFound) / total * 100) if total else 0.0

    print("Species: {} -> Found: {} ({:.3}%) Not found: {}".format(taxId, countFound, percentFound, countNotFound))
    return (countFound, countNotFound)
def testCDSand3UTRRandomizationIncludingNextCDS(
        taxId: int = 511145,
        geneticCode: int = 11,
        constantOverlaps: bool = False) -> int:
    """Smoke-test CDSand3UTRRandomizationIncludingNextCDS on every CDS of a species.

    Each sequence returned by the genome model's ``allCDSSource()`` is
    randomized 20 times; every result must have the same length as the
    input sequence.  Failures to fetch a sequence or to randomize it are
    counted and reported, not raised.  Always returns 0.
    """
    from data_helpers import SpeciesCDSSource
    from genome_model import getGenomeModelFromCache

    cdsRandomizer = SynonymousCodonPermutingRandomization(geneticCode=geneticCode)
    utrRandomizer = NucleotidePermutationRandomization()
    rand = CDSand3UTRRandomizationIncludingNextCDS(cdsRandomizer,
                                                   utrRandomizer,
                                                   taxId,
                                                   constantOverlaps=constantOverlaps)

    numOK = 0
    numFailed = 0           # all failures (sequence fetch + randomize)
    numRandomizeFailed = 0  # failures inside randomize() only
    numSkipped = 0

    for protId in getGenomeModelFromCache(taxId).allCDSSource():
        # Fetch the native sequence; any failure counts as "not OK"
        try:
            cds = CDSHelper(taxId, protId)
            seq = cds.sequence()
        except Exception:
            numFailed += 1
            continue

        for _ in range(20):
            try:
                ret = rand.randomize(seq, protId)
            except Exception as err:
                print("Caught exception during call to randomize(), protId={}!".format(protId))
                print(err)
                numFailed += 1
                numRandomizeFailed += 1
                continue

            # Report sequences with relatively few possible permutations
            if ret[0] < 1e5:
                print(protId)

            if len(ret[2]) != len(seq):
                # Dump the offending result and repeat the call (debugging aid)
                # before the assertion below fails
                print(ret)
                rand.randomize(seq, protId)

            assert (len(ret[2]) == len(seq))
            numOK += 1

    print("OK: {}, NotOK: {}, Skipped: {}, Total: {}".format(
        numOK, numFailed, numSkipped, numOK + numFailed + numSkipped))
    print("randomize exception: {}".format(numRandomizeFailed))

    return 0
def calculateMissingWindowsForSequence(
        self,
        taxId,
        protId,
        seqIds,
        requestedShuffleIds,
        firstWindow,
        lastWindowStart,
        windowStep,
        reference="begin",
        shuffleType=db.Sources.ShuffleCDSv2_python):
    """Compute and store sliding-window profile values for one protein record.

    For each requested shuffle id (negative ids denote the native sequence)
    the existing result record is loaded, missing windows are detected, and
    the per-window value selected by ``self._seriesSourceNumber`` is computed
    and saved through ``CDSHelper.saveCalculationResult2``.

    Args:
        taxId: species identifier.
        protId: protein identifier.
        seqIds: sequence ids, parallel to ``requestedShuffleIds``.
        requestedShuffleIds: shuffle ids to process.
        firstWindow: accepted for interface compatibility; not used here.
        lastWindowStart: last window start (``reference == "begin"``) or the
            minimum window start (``reference == "end"``).
        windowStep: distance between consecutive window starts.
        reference: ``"begin"`` or ``"end"``.
        shuffleType: ``db.Sources`` shuffle type of the randomized sequences.

    Returns:
        None.  Returns early if every requested window was already computed.

    Raises:
        Exception: unsupported ``reference``, missing species temperature,
            sequence shorter than the window, no windows to calculate,
            empty sequence, or unexpected sequence length.

    Fixes relative to the previous version:
        * The profile list is now extended whenever its length is <= the
          largest requested window start (previously ``<``), which avoids
          an IndexError when the two are equal.
        * The "sequence too short" error no longer references the undefined
          name ``itemToProcess``; the item is described as ``taxId/protId``.
    """
    timerForPreFolding.start()
    logging.warning(
        "Parameters: %d %s %s %s %d %d %s %d" %
        (taxId, protId, seqIds, requestedShuffleIds, lastWindowStart,
         windowStep, reference, shuffleType))

    f = self._logfile

    assert (len(seqIds) > 0)
    assert (len(seqIds) == len(requestedShuffleIds))

    # The native-temperature series needs the species' growth temperature
    optimalSpeciesGrowthTemperature = None
    if (self._seriesSourceNumber ==
            db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp):
        (numericalProp, _) = getSpeciesTemperatureInfo(taxId)
        optimalSpeciesGrowthTemperature = numericalProp[0]

        if optimalSpeciesGrowthTemperature is None:
            raise Exception(
                "No temperature value for taxid={}, can't calculate native-temperature folding profile..."
                .format(taxId))
        else:
            optimalSpeciesGrowthTemperature = float(
                optimalSpeciesGrowthTemperature)
            assert (optimalSpeciesGrowthTemperature >= -30.0
                    and optimalSpeciesGrowthTemperature <= 150.0)

    if (reference != "begin" and reference != "end"):
        timerForPreFolding.stop()
        e = "Specificed profile reference '%s' is not supported! (" % reference
        logging.error(e)
        raise Exception(e)

    # We will process all listed shuffle-ids for the following protein record
    cds = CDSHelper(taxId, protId)

    # Description of the processed item for error messages
    itemDescription = "%d/%s" % (taxId, protId)

    if (cds.length() < self._windowWidth):
        e = "Refusing to process item %s because the sequence length (%d nt) is less than the window size (%d nt)\n" % (
            itemDescription, cds.length(), self._windowWidth)
        f.write(e)
        logging.error(e)
        timerForPreFolding.stop()
        raise Exception(e)

    # Create a list of the windows we need to calculate for this CDS
    if reference == "begin":
        requestedWindowStarts = frozenset(
            range(
                0,
                min(lastWindowStart + 1,
                    cds.length() - self._windowWidth - 1), windowStep))
        if (len(requestedWindowStarts) == 0):
            e = "No windows exist for calculation taxid=%d, protId=%s, CDS-length=%d, lastWindowStart=%d, windowStep=%d, windowWidth=%d - Skipping...\n" % (
                taxId, protId, cds.length(), lastWindowStart, windowStep,
                self._windowWidth)
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)
    elif reference == "end":
        # Windows are aligned to the last possible window start;
        # lastWindowStart acts as a lower bound on the window start here
        lastPossibleWindowStart = cds.length() - self._windowWidth
        requestedWindowStarts = frozenset(
            filter(
                lambda x: x >= lastWindowStart,
                range(lastPossibleWindowStart % windowStep,
                      lastPossibleWindowStart + 1, windowStep)))
    else:
        assert (False)

    # First, read available results (for all shuffle-ids) in JSON format.
    # The mapping is keyed by shuffle-id; requested items that have no
    # results yet are represented by None.
    logging.info("DEBUG: requestedShuffleIds (%d items): %s\n" %
                 (len(requestedShuffleIds), requestedShuffleIds))
    existingResults = cds.getCalculationResult2(self._seriesSourceNumber,
                                                requestedShuffleIds,
                                                True,
                                                shuffleType=shuffleType)
    assert (len(existingResults) == len(requestedShuffleIds))
    logging.info("requestedShuffleIds: %s" % requestedShuffleIds)
    logging.info("existingResults.keys(): %s" % existingResults.keys())
    assert (frozenset(requestedShuffleIds) == frozenset(
        existingResults.keys()))
    logging.info("DEBUG: existingResults (%d items): %s\n" %
                 (len(existingResults), existingResults))

    # Check for which of the requested shuffle-ids there are values missing
    shuffleIdsToProcess = {}
    for shuffleId, r in existingResults.items():
        if r is None:
            # There are no existing results for this shuffle-id. If it was
            # requested, it should be calculated now (including all windows)
            if shuffleId in requestedShuffleIds:
                shuffleIdsToProcess[shuffleId] = list(requestedWindowStarts)
            timerForPreFolding.stop()
            continue  # TODO - verify this line; should we abort this sequence by throwing????

        logging.info("/// shuffleId r = %d %s" % (shuffleId, r))
        logging.info("r[MFE-profile] %s" % r["MFE-profile"])

        # Indices (=window starts) of all non-None values already stored
        alreadyProcessedWindowStarts = frozenset(
            [i for i, x in enumerate(r["MFE-profile"]) if x is not None])

        # Are there any requested windows that are not already computed?
        missingWindows = requestedWindowStarts - alreadyProcessedWindowStarts
        if (missingWindows):
            shuffleIdsToProcess[shuffleId] = missingWindows

    if (not shuffleIdsToProcess):
        e = "All requested shuffle-ids in (taxId: %d, protId: %s, seqs: %s) seem to have already been processed. Skipping...\n" % (
            taxId, protId, str(list(zip(seqIds, requestedShuffleIds))))
        logging.warning(e)
        timerForPreFolding.stop()
        return

    logging.info("DEBUG: shuffleIdsToProcess (%d items): %s\n" %
                 (len(shuffleIdsToProcess), shuffleIdsToProcess))
    logging.info("DEBUG: Before (%d items): %s\n" %
                 (len(existingResults), existingResults))

    # Initialize new results records
    for shuffleId in shuffleIdsToProcess.keys():
        if existingResults[shuffleId] is None:
            logging.info(seqIds)
            logging.info(requestedShuffleIds)
            logging.info(shuffleId)

            thisSeqId = seqIds[requestedShuffleIds.index(shuffleId)]

            existingResults[shuffleId] = {
                "id": "%s/%s/%d/%d" % (taxId, protId, thisSeqId, shuffleId),
                "seq-crc": None,
                "MFE-profile": [],
                "MeanMFE": None,
                "v": 2,
                "shuffle-type": shuffleType
            }
    logging.info("DEBUG: existingResults (%d items): %s\n" %
                 (len(existingResults), existingResults))
    timerForPreFolding.stop()

    # Load the sequences of all shuffle-ids we need to work on
    # TODO - combine loading of multiple sequences into one DB operation
    for shuffleId, record in existingResults.items():
        if record is None:
            logging.info(
                "DEBUG: skipping empty results record for shuffleId={}".
                format(shuffleId))
            continue
        timerForPreFolding.start()

        seq = None
        annotatedSeqId = None
        # Get the sequence for this entry (negative ids = native sequence)
        if (shuffleId < 0):
            seq = cds.sequence()
            annotatedSeqId = cds.seqId()
        else:
            seq = cds.getShuffledSeq(shuffleId, shuffleType)
            annotatedSeqId = cds.getShuffledSeqId(shuffleId, shuffleType)

        if (seq is None or len(seq) == 0):
            # Collect diagnostics through the alternative access paths
            # before giving up on this sequence
            seq2 = cds.getShuffledSeq2(annotatedSeqId)
            seq3 = cds._fetchSequence(annotatedSeqId)
            seq4 = cds._cache.get("%d:seq" % annotatedSeqId)
            if not seq4 is None:
                del cds._cache["%d:seq" % annotatedSeqId]
            seq5 = cds.getShuffledSeq2(annotatedSeqId)
            e = "Got empty sequence for shuffleId=%d, seqId=%d, taxId=%d, protId=%s, numShuffled=%d, ids[%d:%d]=%s, len(seq2)=%d, len(seq3)=%d, len(seq4)=%d, len(seq5)=%d" % (
                shuffleId, annotatedSeqId, taxId, protId,
                len(cds.shuffledSeqIds()), shuffleId - 2, shuffleId + 2,
                cds.shuffledSeqIds()[shuffleId - 2:shuffleId + 2],
                len(seq2) if not seq2 is None else -1,
                len(seq3) if not seq3 is None else -1,
                len(seq4) if not seq4 is None else -1,
                len(seq5) if not seq5 is None else -1)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        # Note: no check that annotatedSeqId appears in seqIds - the
        # calculation needn't include the native sequence.

        expectedSeqLength = cds.length()
        if (not expectedSeqLength is None):
            if (expectedSeqLength != len(seq)):
                e = "Warning: taxid=%d, protid=%s, seqid=%d - unexpected length %d (expected: %d)\n" % (
                    taxId, protId, annotatedSeqId, len(seq),
                    expectedSeqLength)
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

        if (len(seq) < self._windowWidth):
            # Sequence is shorter than required window; skip
            e = "Warning: skipping sequence because it is shorter than the requested window...\n"
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        logging.info(
            "DEBUG: Processing item taxId=%d, protId=%s, shuffle=%d (length=%d, %d windows)...\n"
            % (taxId, protId, shuffleId, len(seq),
               len(requestedWindowStarts)))

        # TODO - Remove any old value stored in this key?

        logging.info(seq[:50])

        MFEprofile = record["MFE-profile"]

        # Make sure the profile array contains enough entries for all new
        # windows (and possibly, if windows are non-contiguous, entries
        # between them that we are not going to compute right now).
        # The largest start is used as an index, so the list must be
        # strictly longer than it.
        lastRequestedStart = max(requestedWindowStarts)
        if (len(MFEprofile) <= lastRequestedStart):
            entriesToAdd = lastRequestedStart - len(MFEprofile) + 1
            MFEprofile.extend([None] * entriesToAdd)
        assert (len(MFEprofile) > lastRequestedStart)

        stats = RunningStats()
        stats.extend([x for x in MFEprofile if x is not None])

        timerForPreFolding.stop()
        timerForFolding.start()
        # NOTE(review): every requested window is (re)computed here, not
        # only the missing ones recorded in shuffleIdsToProcess - confirm
        # this is intended.
        for start in requestedWindowStarts:
            fragment = seq[start:(start + self._windowWidth)]
            assert (len(fragment) == self._windowWidth)

            if self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2:
                # Calculate the RNA folding energy. This is the computation-heavy part.
                energy = RNAfold_direct(fragment)
                assert (energy <= 0.0)

            elif self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp:
                # Calculate the RNA folding energy. This is the computation-heavy part.
                energy = RNAfold_direct(
                    fragment,
                    explicitCalculationTemperature=optimalSpeciesGrowthTemperature)
                assert (energy <= 0.0)

            elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_BeginReferenced:
                if shuffleId < 0:
                    energy = 0
                else:
                    energy = start % 50 - 20

            elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_EndReferenced:
                if shuffleId < 0:
                    energy = 0
                else:
                    energy = (expectedSeqLength - self._windowWidth -
                              start) % 50 - 20

            else:
                logging.error(
                    "Received unknown seriesSourceNumber {}".format(
                        self._seriesSourceNumber))
                assert (False)

            # Store the calculation result
            stats.push(energy)
            MFEprofile[start] = energy

        print(
            "/////////////////// shuffleId={} (len={}) //////////////////////////"
            .format(shuffleId, expectedSeqLength))
        prettyPrintProfile(MFEprofile)

        timerForFolding.stop()
        timerForPostFolding.start()

        # Format
        crc = calcCrc(seq)
        record["seq-crc"] = crc
        # Round items down to save space (these are not exact numbers anyway)
        record["MFE-profile"] = [round4(x) for x in MFEprofile]
        record["MeanMFE"] = stats.mean()
        result = json.dumps(record)

        f.write(result)
        f.write("\n")

        if (not self._debugDoneWriteResults):
            cds.saveCalculationResult2(self._seriesSourceNumber, result,
                                       annotatedSeqId, False)

        timerForPostFolding.stop()

    timerForPostFolding.start()

    if (not self._debugDoneWriteResults):
        cds.commitChanges()

    timerForPostFolding.stop()
continue # ------------------------------------------------------------------------------------------ # Exclude some sequences from the calculation # ------------------------------------------------------------------------------------------ # Skip sequences with partial CDS annotations #if(r.exists("CDS:taxid:%d:protid:%s:partial" % (taxIdForProcessing, protId))): # skipped += 1 # continue #if( not r.exists(nativeCdsSeqIdKey % (taxIdForProcessing, protId)) ): # skipped +=1 # continue cds = CDSHelper(taxIdForProcessing, protId) seqLength = cds.length() stopCodonPos = cds.CDSlength() if seqLength is None: print( "Warning: Could not find CDS length entry for taxid=%d, protid=%s" % (taxIdForProcessing, protId)) skipped += 1 stats['skipped-cds-length-missing'] += 1 continue # Skip sequences with length <40nt (window width) if (seqLength < windowWidth + 1): print("short seq")
err[ErrorTypes.UpdateProfileTooShort] += 1 badUpdateRecords.add(updateRecord[2]) if( args.verbose>2 ): print("UpdateProfileTooShort") else: #print('-------- 0 --------') #print(len(profile0)) #print(profile0) #print('-------- 1 --------') #print(len(profile1)) #print(profile1) profile0.extend([None]*(len(profile1)-len(profile0))) # Add 'None's at the end (to allow comparison of new values) hasNewWindows = False numNewWindows = 0 cds = CDSHelper(identifierFromOriginalRecord[0], identifierFromOriginalRecord[1] ) cdsLength = cds.length() for pos, vs in enumerate(zip( profile0, profile1 )): v0, v1 = vs if( not v1 is None ): allWindows_FrameRelativeToStart.update( (pos%10,) ) allWindows_FrameRelativeToEnd.update( ((cdsLength-pos)%10,) ) # Note: this check rejects changes to existing windows (though those might be needed at some point) if( (not v0 is None) and ( (v1 is None) or (abs(v0-v1) >= 1e-8) ) ): err[ErrorTypes.ExistingProfileValueCorrupted] += 1
# Configuration taxId = args.taxId #statsShuffles = RunningStats() statsShuffles = OfflineStats() recordsCount = 0 warningsCount = 0 rl = RateLimit(30) total = countSpeciesCDS(taxId) for protId in SpeciesCDSSource(taxId): cds = CDSHelper(taxId, protId) statsShuffles.push( cds.dropShuffledSeqs(lastItemToKeep=args.keep_first_n_shuffles)) recordsCount += 1 if (rl()): print("processed %d records (%.2g%%)" % (recordsCount, float(recordsCount) / total * 100)) # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # #if( recordsCount > 20 ): # break # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY #
def readSeriesResultsForSpecies(seriesSourceNumber,
                                species,
                                minShuffledGroups=20,
                                maxShuffledGroups=20,
                                shuffleType=db.Sources.ShuffleCDSv2_python,
                                cdsFilter=None,
                                returnCDS=True):
    """Yield the decoded series results of every sufficiently-computed CDS.

    ``species`` is either a single (numeric) taxid or an iterable of taxids;
    strings are rejected.  For every CDS that passes ``cdsFilter`` (if
    given), has a computed native result and at least ``minShuffledGroups``
    computed results, a dict is yielded with the keys ``"taxid"``,
    ``"content"`` (decoded records, native first, at most
    ``maxShuffledGroups + 1`` entries, None for missing ones) and - when
    ``returnCDS`` is true - ``"cds"``.
    """
    if not isinstance(species, Iterable):
        # assume we got a single (numeric) taxid value
        species = (species, )
    elif isinstance(species, basestring):
        # usually, species will be a sequence of numeric taxid values
        raise Exception("species cannot be string")

    assert (minShuffledGroups <= maxShuffledGroups)

    for taxIdForProcessing in species:
        print("Procesing %d sequences for tax-id %d (%s)..." %
              (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing,
               getSpeciesName(taxIdForProcessing)))

        computed = getAllComputedSeqsForSpecies(seriesSourceNumber,
                                                taxIdForProcessing,
                                                maxShuffledGroups,
                                                shuffleType=shuffleType)
        computedIds = frozenset(computed.keys())
        print("Collecting data from %d computation results..." %
              len(computed))

        skipped = 0
        selected = 0

        # Iterate over all CDS entries for this species
        for protId in SpeciesCDSSource(taxIdForProcessing):
            cds = CDSHelper(taxIdForProcessing, protId)

            if cdsFilter is not None and not cdsFilter(cds):
                continue

            cdsSeqId = cds.seqId()
            shuffledIds = cds.shuffledSeqIds(shuffleType=shuffleType)

            # How many shuffles (for this cds) exist in the data we found?
            computedShufflesCount = len(
                computedIds.intersection(frozenset(shuffledIds)))
            nativeIsComputed = cdsSeqId in computedIds
            if computedShufflesCount < minShuffledGroups or not nativeIsComputed:
                skipped += 1
                continue

            # Get the computed results for this CDS (native sequence first),
            # limited to the native result plus maxShuffledGroups shuffles
            seqIds = [cds.seqId()]
            seqIds.extend(cds.shuffledSeqIds(shuffleType=shuffleType))
            seqIds = seqIds[:maxShuffledGroups + 1]

            results = [computed.get(seqId) for seqId in seqIds]
            numAvailable = sum(1 for item in results if item is not None)
            if results is None or numAvailable < minShuffledGroups:
                print("Not enough results found for %s" % protId)
                skipped += 1
                continue

            # Decode the results
            results = [
                None if item is None else decodeJsonSeriesRecord(
                    decompressSeriesRecord(item)) for item in results
            ]

            if returnCDS:
                yield {
                    "taxid": taxIdForProcessing,
                    "content": results,
                    "cds": cds
                }
            else:
                yield {"taxid": taxIdForProcessing, "content": results}

            # Drop our references before moving on to the next record
            del results
            del cds

            selected += 1

            if rl():
                print("# %s - %d records included, %d records skipped" %
                      (datetime.now().isoformat(), selected, skipped))
#protId = codecs.decode(protId) # Filtering # Skip sequences with partial CDS annotations #if(r.exists("CDS:taxid:%d:protid:%s:partial" % (taxIdForProcessing, protId))): # skipped += 1 # continue #if( not r.exists(nativeCdsSeqIdKey % (taxIdForProcessing, protId)) ): # skipped +=1 # continue if (rl()): print("%d %d" % (selected, skipped)) cds = CDSHelper(taxIdForProcessing, protId) seqLength = cds.length() if (not seqLength is None): # Skip sequences with length <40nt (window width) if (seqLength < calculationWidth + windowWidth - 1): skipped += 1 continue else: print( "Warning: Could not find CDS length entry for taxid=%d, protid=%s" % (taxIdForProcessing, protId)) skipped += 1 continue #requiredNumWindows = seqLength - windowWidth + 1
# Configuration taxId = 3055 statsLength = RunningStats() statsShuffles = RunningStats() recordsCount = 0 warningsCount = 0 rl = RateLimit(30) total = countSpeciesCDS(taxId) for protId in SpeciesCDSSource(taxId): cds = CDSHelper(taxId, protId) recordsCount += 1 statsLength.push(cds.length()) if (len(cds.sequence()) != cds.length()): print( "WARNING: incorrect sequence length detected for record (taxid=%d, protId=%s); real-length=%d, recorded-length=%d." % (taxId, protId, len(cds.sequence()), cds.length())) warningsCount += 1 recomputedCrc = calcCrc(cds.sequence()) annotatedCrc = cds.crc() assert (recomputedCrc == annotatedCrc) print(cds.sequence()[:15])
if (countSpeciesCDS(taxId) == 0): print("Species %d (%s) doesn't have any proteins..." % (taxId, getSpeciesName(taxId))) print("Nothing left to do...") sys.exit(0) print("Species %d (%s) has %d proteins stored." % (taxId, getSpeciesName(taxId), countSpeciesCDS(taxId))) print("Will delete it in 10 seconds...") sleep(10) count = 0 for protId in SpeciesCDSSource(taxId): print(protId) cds = CDSHelper(taxId, protId) try: cds.dropShuffledSeqs() except Exception as e: print(e) try: cds.dropNativeSeq() except Exception as e: print(e) cds.dropRecord() count += 1 if (rl()):
warnings = Counter() numUniqueShuffles = Counter() for taxId in species: proteinsDone = 0 #nativeColumns = [[] for x in range(maxCodons)] #shuffledColumns = [[[] for x in range(maxCodons)] for y in range(maxShuffles)] allNativeSeqs = {} allShuffledSeqs = {} for protId in SpeciesCDSSource(taxId): cds = CDSHelper(taxId, protId) warnings.update(("total-cds", )) allIds = cds.shuffledSeqIds(shuffleType=shuffleType)[:maxShuffles] nativeSeq = cds.sequence() if (len(nativeSeq) % 3 != 0): warnings.update(("has-broken-codons", )) continue nativeCodons = Counter(splitCodons(nativeSeq)) hasMismatchedCodons = False allNativeSeqs[protId] = nativeSeq hashesForShuffles = set()
def randomize(self, nucleotideSeq: str, protId: str) -> (int, float, str):
    """Randomize a CDS, its 3' flanking region and the downstream CDS.

    ``nucleotideSeq`` is sliced into three consecutive parts using metadata
    obtained from ``CDSHelper(self.taxId, protId)``: the CDS itself
    (``CDSlength()`` nt), the 3' flanking region
    (``flankingRegion3UtrLength()`` nt, negative when the next CDS overlaps
    this one), and everything after it, which is treated as the next CDS.
    The CDS parts are randomized with ``self.cdsRand`` and the flanking
    region with ``self.utrRand``.

    Returns:
        A tuple ``(totalPerms, totalIdentity, randomizedSeq)``: the product
        of the three permutation counts, the length-weighted mean of the
        three identity values, and the concatenated randomized sequence.

    Raises:
        Exception: if the overlap with the next CDS is longer than this CDS.
    """
    cds = CDSHelper(self.taxId, protId)

    # Get metadata for this CDS
    cdsLengthNt = cds.CDSlength()
    assert (cdsLengthNt % 3 == 0)
    flankingRegionLengthNt = cds.flankingRegion3UtrLength()
    nextCDSOppositeStrand = cds.nextCDSOnOppositeStrand()

    # Case 1 (no overlap, flankingRegionLengthNt >= 0):
    #
    #   |<------CDS1------->|<-------intergenic------->|<----------CDS2---------->|
    #   +===================+--------------------------+==========================+
    #   |<---cdsLengthNt--->|<-flankingRegionLengthNt->|                          |
    #   |<--------------------------cds.totalLength()---------------------------->|
    #
    # Case 2 (overlap, flankingRegionLengthNt <= 0):
    #
    #                       |<-----------------------CDS2------------------------>|
    #   |<--------------------CDS1-------------------->|                          |
    #   +===================+==========================+==========================+
    #   |                   |<-flankingRegionLengthNt->|                          |
    #   |<----------------cdsLengthNt----------------->|                          |
    #   |<--------------------------cds.totalLength()---------------------------->|

    if flankingRegionLengthNt < 0 and -flankingRegionLengthNt > cdsLengthNt:
        #flankingRegionLengthNt = -cdsLengthNt
        raise Exception("Next CDS is fully overlapping...")

    #-----------------------------------------------------------------------------
    # Randomize the "main" CDS
    #-----------------------------------------------------------------------------

    # First, determine which region to randomize...
    if (not self.constantOverlaps) or (
            flankingRegionLengthNt >= 0):  # no overlap, or overlap should be randomized
        CDSseq = nucleotideSeq[:cdsLengthNt]
        assert (len(CDSseq) == cdsLengthNt)
    else:  # constant overlaps requested and this CDS is overlapping the next. Remove the overlap from the CDS (it will not be randomized):
        lastNucBeforeOverlap = cdsLengthNt + flankingRegionLengthNt
        assert (lastNucBeforeOverlap < cdsLengthNt)
        # Round down to a codon boundary so only whole codons are randomized
        lastNucToRandomize = lastNucBeforeOverlap - (lastNucBeforeOverlap % 3)
        CDSseq = nucleotideSeq[:lastNucToRandomize]
        assert (len(CDSseq) % 3 == 0)

    # Then, do the randomization...
    (CDSpermCount, CDSidentity,
     randomizedCDS) = self.cdsRand.randomizeAmbiguousSequence(CDSseq)

    # Finally, add the non-randomized part of the CDS (if any)
    if (not self.constantOverlaps) or (
            flankingRegionLengthNt >= 0):  # no overlap, or overlap should be randomized
        pass
    else:  # constant overlaps requested and this CDS is overlapping the next.
        randomizedCDS = randomizedCDS + nucleotideSeq[
            lastNucToRandomize:cdsLengthNt]

    assert (len(randomizedCDS) % 3 == 0)
    assert (
        len(randomizedCDS) == cdsLengthNt
    )  # the length of the resulting sequence matches the original CDS sequence

    #-----------------------------------------------------------------------------
    # Randomize the 3'UTR
    #-----------------------------------------------------------------------------
    if flankingRegionLengthNt > 0:
        _3UTRseq = nucleotideSeq[cdsLengthNt:cdsLengthNt +
                                 flankingRegionLengthNt]
        assert (len(_3UTRseq) == flankingRegionLengthNt)

        (UTRpermCount, UTRidentity,
         randomizedUTR) = self.utrRand.randomizeAmbiguousSequence(_3UTRseq)
    else:
        # No flanking region (adjacent or overlapping CDSs): use neutral
        # values so the totals below are unaffected
        _3UTRseq = ""
        UTRpermCount = 1
        UTRidentity = 1.0
        randomizedUTR = ""

    #-----------------------------------------------------------------------------
    # Randomize the downstream CDS
    #-----------------------------------------------------------------------------
    nextCDSseq = nucleotideSeq[
        cdsLengthNt +
        flankingRegionLengthNt:]  # Should work for positive and negative length UTRs
    assert (len(nextCDSseq) % 3 == 0)
    #nextCDSseq = nextCDSseq[(len(nextCDSseq)%3):] # remove partial codons from the start (caused due to the overlap; we can only randomize each codon as part of one CDS, although in the overlap region codons belong to two CDSs...)

    # The next CDS is randomized in its own coding orientation
    if nextCDSOppositeStrand:
        nextCDSseq = str(Seq(nextCDSseq, generic_dna).reverse_complement())
    assert (len(nextCDSseq) % 3 == 0)

    (nextCDSpermCount, nextCDSidentity, randomizedNextCDS
     ) = self.cdsRand.randomizeAmbiguousSequence(nextCDSseq)

    if nextCDSOppositeStrand:  # if the next CDS is on the opposite strand, revcomp it back to its original frame
        randomizedNextCDS = str(
            Seq(randomizedNextCDS, generic_dna).reverse_complement())

    # In the overlap case the first -flankingRegionLengthNt nucleotides are
    # already part of randomizedCDS, so they are dropped from the next CDS
    if flankingRegionLengthNt < 0:
        randomizedNextCDS = randomizedNextCDS[-flankingRegionLengthNt:]

    totalPerms = CDSpermCount * UTRpermCount * nextCDSpermCount
    # Identity of the whole result = length-weighted mean of the three parts
    totalIdentity = ((CDSidentity * len(CDSseq)) +
                     (UTRidentity * len(_3UTRseq)) +
                     (nextCDSidentity * len(nextCDSseq))) / (
                         len(CDSseq) + len(_3UTRseq) + len(nextCDSseq))
    return (totalPerms, totalIdentity,
            randomizedCDS + randomizedUTR + randomizedNextCDS)
# build a average profiles for each of the shuffled groups shuffleProfiles = [] medianGCContent = [] ## # Are the profiles computed for each sequence, or are they accumulated? ## sdfasdfasdfasdfasdfasdfadsf asdfasd fasd afsdfsd3##@2 # Iterate over all CDS entries for this species for protId in SpeciesCDSSource(taxIdForProcessing): cds = CDSHelper(taxIdForProcessing, protId) seqLength = cds.length() profileInfo.setCDSLength(seqLength) if( not seqLength is None ): # Skip sequences that are too short if(seqLength < numWindows + windowWidth + 1 ): skipped += 1 continue else: print("Warning: Could not find CDS length entry for taxid=%d, protid=%s" % (taxIdForProcessing, protId) ) skipped += 1 continue #requiredNumWindows = seqLength - windowWidth + 1
def storeNewShuffles(taxId,
                     protId,
                     newShuffleIds,
                     shuffleType=db.Sources.ShuffleCDSv2_python,
                     dontStore=False):
    """Create randomized sequences for one protein and store them.

    The randomization method is selected by ``shuffleType``.  ``dontStore``
    is only honoured for the vertical-permutation type, where the list of
    generated sequences is then returned without being stored; in all other
    cases the result of ``storeRandomizedSequences(...)`` is returned.

    Raises:
        Exception: if ``shuffleType`` is not supported.
    """
    cds = CDSHelper(taxId, protId)

    if shuffleType == db.Sources.ShuffleCDSv2_python:
        seqs = createRandomizedSeqs(cds, newShuffleIds, shuffleType)

    elif shuffleType == db.Sources.ShuffleCDS_vertical_permutation_1nt:
        cache = getRandomizedSequenceCacheForVerticalPermutations(taxId)
        seqs = []
        for shuffleId in newShuffleIds:
            seqs.append(cache.getShuffledSeq(protId, shuffleId))
        print(seqs)

        if dontStore:
            return seqs

    elif (shuffleType == db.Sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation
          or shuffleType == db.Sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation_Including_Next_CDS
          or shuffleType == db.Sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation_Including_Next_CDS_Constant_Overlaps):
        # All CDS + 3'UTR variants share one generator, which dispatches on
        # the shuffle type itself
        seqs = createRandomizedSeqs_CDS_with_3UTR(cds,
                                                  newShuffleIds,
                                                  shuffleType=shuffleType,
                                                  taxId=taxId)

    else:
        raise Exception("Unsupported shuffleType={}".format(shuffleType))

    return storeRandomizedSequences(cds, seqs, newShuffleIds, shuffleType)
def processGenome(args, taxId):
    """Extract the region around each CDS start and write it with aSD energies.

    For every CDS of ``taxId`` the neighbouring feature is looked up with
    ``find5PrimeFlankingRegion``, the distance to it is computed, and a
    23-nt window (20 nt upstream plus the first 3 nt of the CDS) is read
    from the genome model.  Genes without a neighbouring feature, or whose
    window does not end in ``TG`` (i.e. no start codon at the expected
    position), are skipped.  The remaining records are passed to
    ``calculateaSDEnergies`` and written as CSV lines to
    ``outputData.format(taxId)``.

    Args:
        args: parsed command-line arguments, forwarded to calculateaSDEnergies.
        taxId: species identifier.

    Changes relative to the previous version: the skip warning now states
    that the start codon is *missing* (the old text claimed the opposite),
    the unused ``alreadyProcessedGenes`` local was removed, and the lookup
    shared by both strand branches is performed once.
    """
    totalProteinsProcessed = 0
    totalSkipped = 0
    seqsForWriting = []
    recordsForWriting = {}

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        totalProteinsProcessed += 1

        geneId = cds.getGeneId()

        # feature is indexed as (molecule id, feature) below
        feature = gm.findFeatureById(protId)
        strand = feature[1].data['strand']
        moleculeModel = gm.moleculeModels[feature[0]]

        # The same lookup and sanity checks apply to both strands
        otherFeature = moleculeModel.find5PrimeFlankingRegion(feature[1])
        if otherFeature is None:
            totalSkipped += 1
            continue

        assert (otherFeature['downstream-feature'].begin <= otherFeature['downstream-feature'].end)

        # NOTE(review): the names below say "3'UTR" although the region is
        # obtained through find5PrimeFlankingRegion - confirm naming.
        if strand == '+':
            flanking3UTRRegionLengthNt = otherFeature['curr-feature'].begin - otherFeature['downstream-feature'].end
            threePrimeUTRCoords = (feature[1].begin - 20, feature[1].begin + 2, False)  # include the first 3 nucleotides of the CDS
        else:
            flanking3UTRRegionLengthNt = otherFeature['downstream-feature'].begin - otherFeature['curr-feature'].end
            threePrimeUTRCoords = (feature[1].end - 3, feature[1].end + 20, True)  # include the first 3 nucleotides of the CDS

        threePrimeUTR = moleculeModel.getSequence(*threePrimeUTRCoords)

        if flanking3UTRRegionLengthNt < -50:
            # Reported only; the gene is still processed
            print("Warning: found gene with apparent long overlap: {},{},{},{},{}".format(protId, geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq))

        # The window ends with the first codon of the CDS; a start codon
        # (ATG/GTG/TTG) must therefore end in 'TG'
        if threePrimeUTR.seq[-2:] != 'TG':
            print("Warning: skipping gene without start codon at the expected place: {},{},{},{},{}".format(protId, geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq))
            totalSkipped += 1
            continue

        # All done - emit the output
        recordsForWriting[protId] = (geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq)

        # The start codon itself is excluded from the sequence used for the
        # aSD calculation
        seqsForWriting.append(SeqRecord(Seq(threePrimeUTR.seq[:-3], NucleotideAlphabet), id=protId))

    aSD = calculateaSDEnergies(seqsForWriting, args, taxId)
    print(len(aSD))

    with open(outputData.format(taxId), 'wt') as fout:
        for protId, record in recordsForWriting.items():
            aSDval = aSD.get(protId, None)
            vals = (protId,) + record + (aSDval,)
            fout.write("{},{},{},{},{},{}\n".format(*vals))

    print("Processed {} coding sequences for taxid {}".format(totalProteinsProcessed, taxId))
    print("Skipped {} coding sequences".format(totalSkipped))
# Skip sequences with partial CDS annotations #if(r.exists("CDS:taxid:%d:protid:%s:partial" % (taxIdForProcessing, protId))): # skipped += 1 # continue #if( not r.exists(nativeCdsSeqIdKey % (taxIdForProcessing, protId)) ): # skipped +=1 # continue if(rl()): print("# %s - %d records included, %d records skipped" % (datetime.now().isoformat(), selected, skipped)) if( nativeProfile[0].count() > 1005 and rl2()): printOutput() cds = CDSHelper(taxIdForProcessing, protId) seqLength = cds.length() if( not seqLength is None ): # Skip sequences that are too short if(seqLength < numWindows + windowWidth + 1 ): skipped += 1 continue else: print("Warning: Could not find CDS length entry for taxid=%d, protid=%s" % (taxIdForProcessing, protId) ) skipped += 1 continue requiredNumWindows = seqLength - windowWidth + 1 cdsSeqId = cds.seqId()