def getRandomizedSequenceCacheForVerticalPermutations(taxId):
    """Return the vertical-permutation randomization cache for a species.

    The cache object is memoized in the module-level ``_caches`` dict under
    the key ``(taxId, db.Sources.ShuffleCDS_vertical_permutation_1nt)``; it is
    built on the first request for a given taxId and reused afterwards.

    Args:
        taxId: species identifier passed to SpeciesCDSSource / CDSHelper.

    Returns:
        The VerticalRandomizationCache instance for this species.
    """
    global _caches

    shuffleType = db.Sources.ShuffleCDS_vertical_permutation_1nt
    cacheKey = (taxId, shuffleType)

    # Fast path: a cache was already built for this species
    if cacheKey in _caches:
        return _caches[cacheKey]

    # Read all native sequences, keeping only CDSs whose length is a
    # multiple of 3 (i.e. a whole number of codons)
    nativeSeqsByProtId = {}
    for protId in SpeciesCDSSource(taxId):
        helper = CDSHelper(taxId, protId)
        if helper.length() % 3 != 0:
            continue
        nativeSeqsByProtId[protId] = helper.sequence()

    geneticCode = getSpeciesTranslationTable(taxId)
    permuter = SynonymousCodonPermutingRandomization(geneticCode)

    def randomizer(cdss):
        # Delegates to the permuter built for this species' genetic code
        return permuter.verticalPermutation(cdss)

    newCache = VerticalRandomizationCache(
        shuffleType=shuffleType,
        taxId=taxId,
        nativeSeqsMap=nativeSeqsByProtId,
        geneticCode=geneticCode,
        randomizer=randomizer,
    )
    _caches[cacheKey] = newCache
    print(_caches.keys())

    return newCache
def testRegionSpecificRandomization2(codon, taxId):
    """Pool the codon at one position across all native sequences and randomize the pool.

    For every native sequence of the species that is long enough to contain
    codon number `codon` (0-based), that codon is appended to a pool string.
    The pool is then passed to the synonymous-codon-permuting randomizer three
    times; the permutations count reported by the last run is returned.

    Fixes relative to the previous version:
      * The `codon` argument is no longer overwritten by the extracted codon
        string. Previously the second matching sequence failed with a
        TypeError (str + int), and the returned tuple contained a codon
        string instead of the codon index.
      * Sequences are only used if they contain the *complete* codon
        (previously a sequence ending 1 nt short passed the length check and
        tripped the length assertion).

    Args:
        codon: 0-based index of the codon position to pool.
        taxId: species identifier (int).

    Returns:
        Tuple (taxId, codon, len(pool), totalPermutationsCountForSeq).

    Raises:
        Exception: whatever shuffler.randomize() raises is printed and re-raised.
    """
    assert isinstance(taxId, int)

    shuffler = SynonymousCodonPermutingRandomization(getSpeciesTranslationTable(taxId))

    # Nucleotide span [codonStart, codonEnd) of the requested codon
    codonStart = codon * 3
    codonEnd = codonStart + 3

    poolCodons = []
    for (_seqId, seq) in nativeSequencesSource(taxId, 0, 1):
        if len(seq) < codonEnd:
            continue  # sequence does not contain the whole codon
        poolCodons.append(seq[codonStart:codonEnd])
    pool = ''.join(poolCodons)

    totalPermutationsCountForSeq = None
    # The randomization is repeated 3 times; only the last result is kept
    for _attempt in range(3):
        try:
            totalPermutationsCountForSeq, _identity, _shuffledSeq = shuffler.randomize(pool)
        except Exception as e:
            print(e)
            raise

    return (taxId, codon, len(pool), totalPermutationsCountForSeq)
def calculateENcPrimeForSpecies(taxId, orig=False):
    """Run the ENc' report pipeline for one species and return its report.

    Args:
        taxId: species identifier.
        orig: when true, the sequences are written with
            writeSequenceToTempFile_orig() instead of writeSequenceToTempFile().

    Returns:
        Whatever createEncPrimeReport() returns for the written FASTA file.
    """
    geneticCode = getSpeciesTranslationTable(taxId)

    # Pick the sequence writer variant, then dump the sequences to a file
    writeSequences = writeSequenceToTempFile_orig if orig else writeSequenceToTempFile
    cdsCount, fastaFile = writeSequences(taxId)
    # NOTE: fastaFile stays referenced until the function returns; only its
    # path is handed to the helper steps below.
    fastaPath = fastaFile.name

    createCodonCounts(fastaPath, cdsCount)
    createNucleotideCounts(fastaPath, cdsCount)

    genomicGC = getSpeciesProperty(taxId, 'gc-content')
    print("Genomic GC%: {}".format(genomicGC))

    return createEncPrimeReport(fastaPath, geneticCode)
def calcNativeSequencesStatistics(taxId, fraction, numFractions):
    """Collect basic composition and sanity statistics over native sequences.

    Iterates nativeSequencesSource(taxId, fraction, numFractions) and, for each
    sequence, counts G/C and A/C/G/T nucleotides, translates it using the
    species' translation table, and records warnings for: length not divisible
    by 3 ('cds-length'), translation not starting with 'm'
    ('translation-methionine'), and translation not ending with '*'
    ('translation-stop-codon').

    Returns:
        Tuple (taxId, fraction, cdsCount, gcCount, totalCount, cdsWarnings,
        warnings, firstAA, lastAA), where warnings/firstAA/lastAA are Counters
        and cdsWarnings is the number of sequences with at least one warning.
    """
    numSeqs = 0
    numGC = 0
    numACGT = 0
    numSeqsWithWarnings = 0
    warnings = Counter()
    firstAA = Counter()
    lastAA = Counter()

    geneticCode = getSpeciesTranslationTable(taxId)

    for _seqId, rawSeq in nativeSequencesSource(taxId, fraction, numFractions):
        seq = rawSeq.lower()

        gcInSeq = seq.count('c') + seq.count('g')
        numGC += gcInSeq
        # Only a/c/g/t are counted towards the total (don't count 'N's)
        numACGT += gcInSeq + seq.count('a') + seq.count('t')

        problems = []
        if len(seq) % 3 != 0:
            problems.append('cds-length')

        xlation = Seq(seq).translate(table=geneticCode).lower()
        # NOTE: an empty translation raises IndexError on the next two lines
        firstResidue = xlation[0]
        lastResidue = xlation[-1]

        if firstResidue != 'm':
            problems.append('translation-methionine')
        if lastResidue != '*':
            problems.append('translation-stop-codon')

        if problems:
            warnings.update(problems)
            numSeqsWithWarnings += 1

        firstAA.update(firstResidue)
        lastAA.update(lastResidue)
        numSeqs += 1

    return (taxId, fraction, numSeqs, numGC, numACGT, numSeqsWithWarnings, warnings, firstAA, lastAA)
def _init_item(self, taxId):
    """Build the GenomeModel for a species from its registered data files.

    Args:
        taxId: species identifier passed to the data_helpers lookups.

    Returns:
        A GenomeModel constructed from the species' sequence file, GFF
        annotations, annotations variant, genetic code and GenBank file.

    Raises:
        ValueError: if the genome sequence file, the annotations file or the
            translation table is None for this species.
    """
    from data_helpers import getSpeciesGenomeSequenceFile, getSpeciesGenomeAnnotationsFile, getSpeciesGenomeAnnotationsVariant, getSpeciesTranslationTable, getSpeciesGenbankAnnotationsFile

    sequenceFile = getSpeciesGenomeSequenceFile(taxId)
    annotationsFile = getSpeciesGenomeAnnotationsFile(taxId)
    annotationsVariant = getSpeciesGenomeAnnotationsVariant(taxId)
    genbankFile = getSpeciesGenbankAnnotationsFile(taxId)
    translationTable = getSpeciesTranslationTable(taxId)

    # These three are mandatory; the variant and GenBank file may be None
    mandatory = (sequenceFile, annotationsFile, translationTable)
    if any(item is None for item in mandatory):
        raise ValueError(
            "No supporting annotations for taxId={}".format(taxId))

    return GenomeModel(
        sequenceFile=sequenceFile,
        gffFile=annotationsFile,
        isLinear=False,  # TODO fix this
        variant=annotationsVariant,
        geneticCode=translationTable,
        genbankFile=genbankFile)
def testRegionSpecificRandomization(fraction, taxId, numFractions):
    """Exercise masked randomization on a random subset of native sequences.

    Each sequence yielded by nativeSequencesSource(taxId, fraction,
    numFractions) is skipped when random.randint(0, 1) returns 1. For every
    remaining sequence, shuffler.randomizeWithMask() is run three times with
    the mask from getCodonMaskForSeq(seq, 0, 22); the permutations count
    reported by the last run is recorded together with the sequence length.

    Returns:
        Tuple (taxId, fraction, numSeqsDone, data), where data is a list of
        (sequence length, total permutations count) pairs.

    Raises:
        AssertionError: if fraction is outside [0, numFractions) or taxId is
            not an int.
        Exception: whatever randomizeWithMask() raises is printed and re-raised.
    """
    assert 0 <= fraction
    assert fraction < numFractions
    assert type(taxId) is int

    startTime = time()  # retained from the original; the timeout check that used it is disabled

    shuffler = SynonymousCodonPermutingRandomization(getSpeciesTranslationTable(taxId))

    results = []
    processedCount = 0

    for (_seqId, seq) in nativeSequencesSource(taxId, fraction, numFractions):
        # Coin toss: skip this sequence when the draw is 1
        if random.randint(0, 1) > 0:
            continue

        permutationsCount = None
        # Repeat the randomization 3 times; only the last count is kept
        for _attempt in range(3):
            try:
                permutationsCount, _identity, _shuffled = shuffler.randomizeWithMask(seq, getCodonMaskForSeq(seq, 0, 22))
            except Exception as e:
                print(e)
                raise e

        results.append((len(seq), permutationsCount))
        processedCount += 1

    return (taxId, fraction, processedCount, results)
def calculateMissingWindowsForSequence(self, taxId, protId, seqIds, requestedShuffleIds, firstWindow, lastWindowStart, windowStep, reference="begin", shuffleType=db.Sources.ShuffleCDSv2_python, debug=False):
    """Compute sliding-window profile values for the requested shuffles of one protein and store them.

    The series computed (RNA folding energy, GC content, purine content,
    stop-codon content or a test step function) is selected by
    self._seriesSourceNumber; the window width is self._windowWidth.
    Existing results are read through cds.getCalculationResult2(); each
    resulting record is written as a JSON line to self._logfile and, unless
    self._debugDoneWriteResults is set, saved with cds.saveCalculationResult2()
    and finally committed with cds.commitChanges().

    Fixes relative to the previous version:
      * The "sequence shorter than window" error no longer references the
        undefined name `itemToProcess` (which raised NameError instead of the
        intended error).
      * The profile array is now extended whenever it is too short to hold the
        largest window start (previously len == max start was not extended,
        causing an IndexError when storing that window).
      * For reference == "stop3utr", the selected window starts are the actual
        stepped positions (previously the per-step flags were paired with
        consecutive positions, which is only correct for windowStep == 1).
      * MeanMFE no longer double-counts windows that already had a stored
        value and are recomputed in this call.

    Args:
        taxId: species identifier (int).
        protId: protein identifier of the CDS.
        seqIds: sequence ids, parallel to requestedShuffleIds (same length, non-empty).
        requestedShuffleIds: shuffle ids to process; a negative id selects the
            native sequence (cds.sequence()).
        firstWindow: currently unused.
        lastWindowStart: for "begin", the last window start to include; for
            "end", the minimum window start to include; for "stop3utr",
            (lastWindowStart//2)*windowStep is the half-width of the region
            around the stop codon.
        windowStep: distance between consecutive window starts.
        reference: "begin", "end" or "stop3utr".
        shuffleType: shuffle type passed to the CDSHelper lookups.
        debug: when true, the profile is pretty-printed after each record.

    Returns:
        None.

    Raises:
        Exception: unsupported reference, missing growth temperature, sequence
            shorter than the window, no windows to compute (reference "begin"),
            empty sequence, or unexpected sequence length.
    """
    timerForPreFolding.start()

    logging.warning("Parameters: %d %s %s %s %d %d %s %d" % (taxId, protId, seqIds, requestedShuffleIds, lastWindowStart, windowStep, reference, shuffleType))
    f = self._logfile

    assert(len(seqIds) > 0)
    assert(len(seqIds) == len(requestedShuffleIds))

    # ------------------------------------------------------------------------
    # Obtain species-dependent properties needed for some calculations
    # ----------------
    # Optimal Temp
    optimalSpeciesGrowthTemperature = None
    if self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp:
        (numericalProp, _) = getSpeciesTemperatureInfo(taxId)
        optimalSpeciesGrowthTemperature = numericalProp[0]

        if optimalSpeciesGrowthTemperature is None:
            raise Exception("No temperature value for taxid={}, can't calculate native-temperature folding profile...".format(taxId))
        else:
            optimalSpeciesGrowthTemperature = float(optimalSpeciesGrowthTemperature)
            assert(optimalSpeciesGrowthTemperature >= -30.0 and optimalSpeciesGrowthTemperature <= 150.0)

    # ----------------
    # Genomic translation table
    genomicTranslationTable = None
    if self._seriesSourceNumber in (db.Sources.StopCodon_content_SlidingWindow30, db.Sources.StopCodon_content_SlidingWindow40, db.Sources.StopCodon_content_SlidingWindow50):
        genomicTranslationTable = getSpeciesTranslationTable(taxId)
        assert(genomicTranslationTable > 0 and genomicTranslationTable <= 31)

    if reference != "begin" and reference != "end" and reference != "stop3utr":
        timerForPreFolding.stop()
        e = "Specificed profile reference '%s' is not supported!" % reference
        logging.error(e)
        raise Exception(e)

    # We will process all listed shuffle-ids for the following protein record
    if reference == "begin" or reference == "end":
        regionOfInterest = RegionsOfInterset.CDSonly
    elif reference == "stop3utr":
        regionOfInterest = RegionsOfInterset.CDSand3UTR
    else:
        assert(False)

    cds = CDSHelper(taxId, protId, regionOfInterest=regionOfInterest)

    if cds.length() < self._windowWidth:
        # Identify the item by taxId/protId (the first two components of the result record "id")
        itemDescription = "%s/%s" % (taxId, protId)
        e = "Refusing to process item %s because the sequence length (%d nt) is less than the window size (%d nt)\n" % (itemDescription, cds.length(), self._windowWidth)
        f.write(e)
        logging.error(e)
        timerForPreFolding.stop()
        raise Exception(e)

    # Create a list of the windows we need to calculate for this CDS
    if reference == "begin":
        # NOTE(review): the upper bound (length - width - 1) excludes the last two
        # possible window starts; kept as-is - confirm whether this is intended.
        requestedWindowStarts = frozenset(range(0, min(lastWindowStart+1, cds.length()-self._windowWidth-1), windowStep))
        if len(requestedWindowStarts) == 0:
            e = "No windows exist for calculation taxid=%d, protId=%s, CDS-length=%d, lastWindowStart=%d, windowStep=%d, windowWidth=%d - Skipping...\n" % (taxId, protId, cds.length(), lastWindowStart, windowStep, self._windowWidth)
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)
    elif reference == "end":
        # Window starts are aligned so that the last window ends at the end of the sequence
        lastPossibleWindowStart = cds.length() - self._windowWidth
        # Here lastWindowStart acts as the minimum window start to include
        requestedWindowStarts = frozenset([x for x in range(lastPossibleWindowStart % windowStep, lastPossibleWindowStart+1, windowStep) if x >= lastWindowStart])
    elif reference == "stop3utr":
        seqLength = cds.length()
        stopCodonPos = cds.CDSlength()
        maxDistanceFromStop = (lastWindowStart//2)*windowStep
        # Select the stepped positions themselves (identical to the previous
        # behavior when windowStep == 1)
        requestedWindowStarts = frozenset(pos for pos in range(0, seqLength - self._windowWidth, windowStep) if abs(pos-stopCodonPos) < maxDistanceFromStop)
    else:
        assert(False)

    # First, read available results (for all shuffle-ids) in JSON format
    # The results are keyed by shuffle-id; requested items that have no results yet are represented by None.
    logging.info("DEBUG: requestedShuffleIds (%d items): %s\n" % (len(requestedShuffleIds), requestedShuffleIds))
    existingResults = cds.getCalculationResult2(self._seriesSourceNumber, requestedShuffleIds, True, shuffleType=shuffleType)
    assert(len(existingResults) == len(requestedShuffleIds))
    # Wrapped in a 1-tuple so the message also formats when the ids are given as a tuple
    logging.info("requestedShuffleIds: %s" % (requestedShuffleIds,))
    logging.info("existingResults.keys(): %s" % list(existingResults.keys()))
    assert(frozenset(requestedShuffleIds) == frozenset(list(existingResults.keys())))
    logging.info("DEBUG: existingResults (%d items): %s\n" % (len(existingResults), existingResults))

    # Check for which of the requested shuffle-ids there are values missing
    shuffleIdsToProcess = {}
    for shuffleId, r in list(existingResults.items()):
        if r is None:
            # There are no existing results for shuffled-id n. If it was requested, it should be calculated now (including all windows)
            if shuffleId in requestedShuffleIds:
                shuffleIdsToProcess[shuffleId] = list(requestedWindowStarts)

            # NOTE(review): the timer is stopped here on every missing record and again below - kept as in the original
            timerForPreFolding.stop()

            # ------------------------------------------------------------------------------------
            continue   # TODO - verify this line; should we abort this sequence by throwing????
            # ------------------------------------------------------------------------------------

        logging.info("/// shuffleId r = %d %s" % (shuffleId, r))
        logging.info("r[MFE-profile] %s" % r["MFE-profile"])

        # Check the existing results for this shuffle
        alreadyProcessedWindowStarts = frozenset([i for i, x in enumerate(r["MFE-profile"]) if x is not None])  # Get the indices (=window starts) of all non-None values
        missingWindows = requestedWindowStarts - alreadyProcessedWindowStarts  # Are there any requested windows that are not already computed?
        if missingWindows:
            shuffleIdsToProcess[shuffleId] = missingWindows

    if not shuffleIdsToProcess:
        e = "All requested shuffle-ids in (taxId: %d, protId: %s, seqs: %s) seem to have already been processed. Skipping...\n" % (taxId, protId, str(list(zip(seqIds, requestedShuffleIds))))
        logging.warning(e)
        timerForPreFolding.stop()
        return

    logging.info("DEBUG: shuffleIdsToProcess (%d items): %s\n" % (len(shuffleIdsToProcess), shuffleIdsToProcess))
    logging.info("DEBUG: Before (%d items): %s\n" % (len(existingResults), existingResults))

    # Initialize new results records
    for shuffleId in list(shuffleIdsToProcess.keys()):
        if existingResults[shuffleId] is None:
            logging.info(seqIds)
            logging.info(requestedShuffleIds)
            logging.info(shuffleId)

            thisSeqId = seqIds[requestedShuffleIds.index(shuffleId)]

            existingResults[shuffleId] = {"id": "%s/%s/%d/%d" % (taxId, protId, thisSeqId, shuffleId), "seq-crc": None, "MFE-profile": [], "MeanMFE": None, "v": 2, "shuffle-type": shuffleType}
    logging.info("DEBUG: existingResults (%d items): %s\n" % (len(existingResults), existingResults))
    timerForPreFolding.stop()

    # Load the sequences of all shuffle-ids we need to work on
    # TODO - combine loading of multiple sequences into one DB operation
    # NOTE(review): every non-None record is processed, and all requested windows
    # are (re)computed for it - shuffleIdsToProcess is not consulted here.
    for shuffleId, record in list(existingResults.items()):
        if record is None:
            logging.info("DEBUG: skipping empty results record for shuffleId={}".format(shuffleId))
            continue

        timerForPreFolding.start()

        seq = None
        annotatedSeqId = None
        # Get the sequence for this entry
        if shuffleId < 0:
            # Negative shuffle-ids denote the native sequence
            seq = cds.sequence()
            annotatedSeqId = cds.seqId()
        else:
            seq = cds.getShuffledSeq(shuffleId, shuffleType)
            annotatedSeqId = cds.getShuffledSeqId(shuffleId, shuffleType)

        if seq is None or len(seq) == 0:
            # Collect diagnostics through the alternative access paths before failing
            seq2 = cds.getShuffledSeq2(annotatedSeqId)
            seq3 = cds._fetchSequence(annotatedSeqId)
            seq4 = cds._cache.get("%d:seq" % annotatedSeqId)
            if seq4 is not None:
                del cds._cache["%d:seq" % annotatedSeqId]
            seq5 = cds.getShuffledSeq2(annotatedSeqId)
            e = "Got empty sequence for shuffleId=%d, seqId=%d, taxId=%d, protId=%s, numShuffled=%d, ids[%d:%d]=%s, len(seq2)=%d, len(seq3)=%d, len(seq4)=%d, len(seq5)=%d" % (shuffleId, annotatedSeqId, taxId, protId, len(cds.shuffledSeqIds()), shuffleId-2, shuffleId+2, cds.shuffledSeqIds()[shuffleId-2:shuffleId+2], len(seq2) if seq2 is not None else -1, len(seq3) if seq3 is not None else -1, len(seq4) if seq4 is not None else -1, len(seq5) if seq5 is not None else -1)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        expectedSeqLength = cds.length()
        if expectedSeqLength is not None:
            if expectedSeqLength != len(seq):
                e = "Warning: taxid=%d, protid=%s, seqid=%d - unexpected length %d (expected: %d)\n" % (taxId, protId, annotatedSeqId, len(seq), expectedSeqLength)
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

        if len(seq) < self._windowWidth:
            # Sequence is shorter than required window; skip
            e = "Warning: skipping sequence because it is shorter than the requested window...\n"
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        logging.info("DEBUG: Processing item taxId=%d, protId=%s, shuffle=%d (length=%d, %d windows)...\n" % (taxId, protId, shuffleId, len(seq), len(requestedWindowStarts)))

        # TODO - Remove any old value stored in this key?

        logging.info(seq[:50])

        MFEprofile = record["MFE-profile"]

        # Make sure the profile array contains enough entries for all new windows (and possibly, if windows are non-contiguous, entries between them that we are not going to compute right now)
        requiredProfileLength = max(requestedWindowStarts) + 1  # the profile is indexed by window start
        if len(MFEprofile) < requiredProfileLength:
            MFEprofile.extend([None] * (requiredProfileLength - len(MFEprofile)))
        assert(len(MFEprofile) >= requiredProfileLength)

        stats = RunningStats()
        # Seed the statistics with stored values that will NOT be recomputed below,
        # so that each window contributes to the mean exactly once
        stats.extend([x for pos, x in enumerate(MFEprofile) if x is not None and pos not in requestedWindowStarts])

        timerForPreFolding.stop()
        timerForFolding.start()
        for start in requestedWindowStarts:
            fragment = seq[start:(start+self._windowWidth)]
            assert(len(fragment) == self._windowWidth)

            if self._seriesSourceNumber in (db.Sources.RNAfoldEnergy_SlidingWindow30_v2, db.Sources.RNAfoldEnergy_SlidingWindow40_v2, db.Sources.RNAfoldEnergy_SlidingWindow50_v2):
                # Calculate the RNA folding energy. This is the computation-heavy part.
                result = RNAfold_direct(fragment)
                assert(result <= 0.0)

            elif self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp:
                # Calculate the RNA folding energy. This is the computation-heavy part.
                result = RNAfold_direct(fragment, explicitCalculationTemperature=optimalSpeciesGrowthTemperature)
                assert(result <= 0.0)

            elif self._seriesSourceNumber == db.Sources.GC_content_SlidingWindow40:
                result = calcWindowGCContent(fragment)
                assert(isnan(result) or (result >= 0.0 and result <= 1.0))

            elif self._seriesSourceNumber == db.Sources.Purine_content_SlidingWindow40:
                result = calcWindowPurineContent(fragment)
                assert(isnan(result) or (result >= 0.0 and result <= 1.0))

            elif self._seriesSourceNumber in (db.Sources.StopCodon_content_SlidingWindow30, db.Sources.StopCodon_content_SlidingWindow40, db.Sources.StopCodon_content_SlidingWindow50):
                result = calcWindowStopCodonContent(fragment, translationTable=genomicTranslationTable, phase=start % 3)
                assert(result >= 0.0 and result <= 1.0)

            elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_BeginReferenced:
                if shuffleId < 0:
                    result = 0
                else:
                    result = start % 50 - 20

            elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_EndReferenced:
                if shuffleId < 0:
                    result = 0
                else:
                    result = (expectedSeqLength - self._windowWidth - start) % 50 - 20

            else:
                logging.error("Received unknown seriesSourceNumber {}".format(self._seriesSourceNumber))
                assert(False)

            # Store the calculation result
            stats.push(result)
            MFEprofile[start] = result

        if debug:
            prettyPrintProfile(MFEprofile)

        timerForFolding.stop()
        timerForPostFolding.start()

        # Format
        crc = getCrc(seq)
        record["seq-crc"] = crc
        record["MFE-profile"] = [round4(x) for x in MFEprofile]  # Round items down to save space (these are not exact numbers anyway)
        record["MeanMFE"] = stats.mean()

        if reference == "stop3utr":
            record["stop-codon-pos"] = cds.CDSlength()

        result = json.dumps(record)

        f.write(result)
        f.write("\n")

        if not self._debugDoneWriteResults:
            cds.saveCalculationResult2(self._seriesSourceNumber, result, annotatedSeqId, False)

        timerForPostFolding.stop()

    timerForPostFolding.start()

    if not self._debugDoneWriteResults:
        cds.commitChanges()

    timerForPostFolding.stop()
inPhyloTree = taxId in speciesInTree if inPhyloTree: stats.update(['tree']) speciesDf = speciesDf.append( pd.DataFrame({ 'TaxId': pd.Series([taxId], dtype='int'), 'Species': pd.Series([getSpeciesName(taxId)], dtype='str'), 'Nickname': pd.Series([shortNames[taxId]], dtype='str'), 'Source': pd.Series([''], dtype='str'), 'TranslationTbl': pd.Series([getSpeciesTranslationTable(taxId)], dtype='int'), 'InPhyloTree': pd.Series([inPhyloTree], dtype='bool'), 'GenomicGC%': pd.Series([genomicGC], dtype='float'), 'GenomicENc\'': pd.Series([genomicENcprime], dtype='float'), 'GrowthTempC': pd.Series([optimumTemp], dtype='float'), 'GenomeSizeMb': pd.Series([genomeSizeMb], dtype='float'), 'GrowthTimeHours': pd.Series([growthTimeHours], dtype='float'), 'IsEndosymbiont': pd.Series([isEndosymbiont(taxId)], dtype='bool'), 'EndosymbiontRef':