Ejemplo n.º 1
0
def getRandomizedSequenceCacheForVerticalPermutations(taxId):
    """Return the vertical-permutation randomization cache for a species.

    Caches are stored in the module-level ``_caches`` mapping under the key
    ``(taxId, db.Sources.ShuffleCDS_vertical_permutation_1nt)``. On the first
    request for a species the cache object is built from all of its native
    CDS sequences and registered; later requests return the stored object.

    Args:
        taxId: Species identifier used to look up sequences and the
            translation table.

    Returns:
        The VerticalRandomizationCache instance for ``taxId``.
    """
    cacheKey = (taxId, db.Sources.ShuffleCDS_vertical_permutation_1nt)

    if cacheKey in _caches:
        return _caches[cacheKey]

    # Cache miss: gather every native sequence, skipping any CDS whose length
    # is not a whole number of codons.
    nativeSeqs = {}
    for protId in SpeciesCDSSource(taxId):
        helper = CDSHelper(taxId, protId)
        if helper.length() % 3 != 0:
            continue
        nativeSeqs[protId] = helper.sequence()

    geneticCode = getSpeciesTranslationTable(taxId)
    scpr = SynonymousCodonPermutingRandomization(geneticCode)

    def randomizer(cdss):
        # Delegate to the permuter constructed for this species' genetic code.
        return scpr.verticalPermutation(cdss)

    newCache = VerticalRandomizationCache(
        shuffleType=db.Sources.ShuffleCDS_vertical_permutation_1nt,
        taxId=taxId,
        nativeSeqsMap=nativeSeqs,
        geneticCode=geneticCode,
        randomizer=randomizer)
    _caches[cacheKey] = newCache
    print(_caches.keys())

    return newCache
def testRegionSpecificRandomization2(codon, taxId):
    """Randomize the pool of codons found at one codon position of every native CDS.

    The codon at index ``codon`` (0-based) is taken from each native sequence
    of the species that is long enough to contain it; the codons are
    concatenated into a single pool string, which is then passed to
    ``SynonymousCodonPermutingRandomization.randomize`` three times.

    Args:
        codon: 0-based codon index to sample from every sequence.
        taxId: Species identifier; must be a plain ``int``.

    Returns:
        A tuple ``(taxId, codon, len(pool), totalPermutationsCountForSeq)``
        where the last item is the first value returned by the final
        ``randomize`` call.

    Raises:
        AssertionError: If ``taxId`` is not an ``int``.
        Exception: Any exception raised by ``randomize`` is printed and
            re-raised.
    """
    assert type(taxId) is int

    shuffler = SynonymousCodonPermutingRandomization(getSpeciesTranslationTable(taxId))

    # Nucleotide bounds of the requested codon. These are kept in separate
    # names so the ``codon`` index is never overwritten by the sliced codon
    # string (which would break the arithmetic for every later sequence).
    codonStart = codon * 3
    codonEnd = codonStart + 3

    poolCodons = []
    for (seqId, seq) in nativeSequencesSource(taxId, 0, 1):
        # Require the full three nucleotides of the codon to be present.
        if len(seq) < codonEnd:
            continue
        poolCodons.append(seq[codonStart:codonEnd])

    pool = ''.join(poolCodons)

    totalPermutationsCountForSeq = None

    # The randomization is repeated a fixed number of times; only the result
    # of the last attempt is reported.
    for _attempt in range(3):
        try:
            totalPermutationsCountForSeq, _identity, _shuffledSeq = shuffler.randomize(pool)
        except Exception as e:
            print(e)
            raise

    return (taxId, codon, len(pool), totalPermutationsCountForSeq)
Ejemplo n.º 3
0
def calculateENcPrimeForSpecies(taxId, orig=False):
    """Run the ENc-prime report pipeline for one species.

    Writes the species' sequences to a temporary FASTA file, generates the
    codon-count and nucleotide-count inputs from it, prints the stored
    genomic GC content, and returns the ENc-prime report.

    Args:
        taxId: Species identifier.
        orig: When true, the sequences are written with
            ``writeSequenceToTempFile_orig`` instead of
            ``writeSequenceToTempFile``.

    Returns:
        Whatever ``createEncPrimeReport`` returns for the FASTA file and the
        species' translation table.
    """
    translationTable = getSpeciesTranslationTable(taxId)

    # Pick the sequence writer requested by the caller.
    writeFasta = writeSequenceToTempFile_orig if orig else writeSequenceToTempFile
    numCds, tempFasta = writeFasta(taxId)

    createCodonCounts(tempFasta.name, numCds)
    createNucleotideCounts(tempFasta.name, numCds)

    genomicGC = getSpeciesProperty(taxId, 'gc-content')
    print("Genomic GC%: {}".format(genomicGC))

    return createEncPrimeReport(tempFasta.name, translationTable)
Ejemplo n.º 4
0
def calcNativeSequencesStatistics(taxId, fraction, numFractions):
    """Collect composition and sanity-check statistics over native sequences.

    Iterates the sequences yielded by
    ``nativeSequencesSource(taxId, fraction, numFractions)``, lower-cases each
    one, and accumulates nucleotide counts plus per-sequence warnings based
    on its length and its translation.

    Args:
        taxId: Species identifier.
        fraction: Passed through to ``nativeSequencesSource`` and echoed in
            the result.
        numFractions: Passed through to ``nativeSequencesSource``.

    Returns:
        A tuple ``(taxId, fraction, cdsCount, gcCount, totalCount,
        cdsWarnings, warnings, firstAA, lastAA)`` where ``warnings``,
        ``firstAA`` and ``lastAA`` are ``Counter`` objects keyed by warning
        name, first translated residue and last translated residue.
    """
    numCds = 0
    numGC = 0
    numACGT = 0
    numCdsWithWarnings = 0
    warnings = Counter()
    firstAA = Counter()
    lastAA = Counter()

    geneticCode = getSpeciesTranslationTable(taxId)

    for _seqId, rawSeq in nativeSequencesSource(taxId, fraction, numFractions):
        nt = rawSeq.lower()

        numGC += sum(1 for base in nt if base in ('c', 'g'))
        # Only unambiguous bases are counted in the total (e.g. 'n' is excluded).
        numACGT += sum(1 for base in nt if base in ('c', 'g', 'a', 't'))

        # Names of the checks this sequence fails, in the order they are tested.
        problems = []

        if len(nt) % 3 != 0:
            problems.append('cds-length')

        protein = Seq(nt).translate(table=geneticCode).lower()
        firstResidue = protein[0]
        lastResidue = protein[-1]

        if firstResidue != 'm':
            problems.append('translation-methionine')

        if lastResidue != '*':
            problems.append('translation-stop-codon')

        warnings.update(problems)
        if problems:
            numCdsWithWarnings += 1

        firstAA.update(firstResidue)
        lastAA.update(lastResidue)

        numCds += 1

    return (taxId, fraction, numCds, numGC, numACGT, numCdsWithWarnings,
            warnings, firstAA, lastAA)
Ejemplo n.º 5
0
    def _init_item(self, taxId):
        """Build the GenomeModel for a species from its annotation resources.

        Args:
            taxId: Species identifier.

        Returns:
            A ``GenomeModel`` constructed from the species' genome sequence
            file, annotation file and variant, translation table and GenBank
            annotation file.

        Raises:
            ValueError: If the genome sequence file, the annotation file or
                the translation table is ``None``.
        """
        # Imported here rather than at module level, as in the original code.
        from data_helpers import (
            getSpeciesGenomeSequenceFile,
            getSpeciesGenomeAnnotationsFile,
            getSpeciesGenomeAnnotationsVariant,
            getSpeciesTranslationTable,
            getSpeciesGenbankAnnotationsFile,
        )

        seqPath = getSpeciesGenomeSequenceFile(taxId)
        annotPath = getSpeciesGenomeAnnotationsFile(taxId)
        annotVariant = getSpeciesGenomeAnnotationsVariant(taxId)
        genbankPath = getSpeciesGenbankAnnotationsFile(taxId)
        translationTable = getSpeciesTranslationTable(taxId)

        # The variant and GenBank file may be None; these three are mandatory.
        mandatory = (seqPath, annotPath, translationTable)
        if any(item is None for item in mandatory):
            raise ValueError(
                "No supporting annotations for taxId={}".format(taxId))

        return GenomeModel(
            sequenceFile=seqPath,
            gffFile=annotPath,
            isLinear=False,  # TODO fix this
            variant=annotVariant,
            geneticCode=translationTable,
            genbankFile=genbankPath)
def testRegionSpecificRandomization(fraction, taxId, numFractions):
    """Exercise masked randomization on a random subset of native sequences.

    Each sequence yielded by ``nativeSequencesSource(taxId, fraction,
    numFractions)`` is skipped when ``random.randint(0, 1)`` returns 1.
    Every remaining sequence is randomized three times with
    ``randomizeWithMask`` using the mask from
    ``getCodonMaskForSeq(seq, 0, 22)``; the permutation count reported by the
    last attempt is recorded.

    Args:
        fraction: Index of the fraction to process; must satisfy
            ``0 <= fraction < numFractions``.
        taxId: Species identifier; must be a plain ``int``.
        numFractions: Total number of fractions.

    Returns:
        A tuple ``(taxId, fraction, numSeqsDone, data)`` where ``data`` is a
        list of ``(sequence length, permutation count)`` pairs.

    Raises:
        AssertionError: If the argument checks fail.
        Exception: Any exception raised during randomization is printed and
            re-raised.
    """
    assert fraction >= 0
    assert fraction < numFractions
    assert type(taxId) is int

    # Start timestamp; only the (currently disabled) timeout check would use it.
    startTime = time()

    shuffler = SynonymousCodonPermutingRandomization(getSpeciesTranslationTable(taxId))

    results = []
    processed = 0

    for (seqId, seq) in nativeSequencesSource(taxId, fraction, numFractions):
        # Coin flip: a result of 1 skips this sequence.
        if random.randint(0, 1) > 0:
            continue

        permutationsCount = None

        # Fixed number of randomization attempts; the last one wins.
        for _attempt in range(3):
            try:
                codonMask = getCodonMaskForSeq(seq, 0, 22)
                permutationsCount, _identity, _shuffled = shuffler.randomizeWithMask(seq, codonMask)
            except Exception as e:
                print(e)
                raise

        results.append((len(seq), permutationsCount))
        processed += 1

    return (taxId, fraction, processed, results)
    def calculateMissingWindowsForSequence(self, taxId, protId, seqIds, requestedShuffleIds, firstWindow, lastWindowStart, windowStep, reference="begin", shuffleType=db.Sources.ShuffleCDSv2_python, debug=False):
        """Compute and store the sliding-window profile for one protein record.

        For the CDS identified by (taxId, protId), determines the window start
        positions requested for the given profile reference, loads existing
        results for the requested shuffle-ids, and, if any shuffle-id has
        missing windows, calculates the series selected by
        ``self._seriesSourceNumber`` for every requested window of every loaded
        result record. Each record is written as a JSON line to
        ``self._logfile`` and, unless ``self._debugDoneWriteResults`` is set,
        saved through the CDS helper and committed.

        Args:
            taxId: Species identifier (int).
            protId: Protein identifier of the CDS to process.
            seqIds: Sequence ids, parallel to ``requestedShuffleIds``.
            requestedShuffleIds: Shuffle ids to process; a negative id selects
                the native sequence.
            firstWindow: Not used by this method.
            lastWindowStart: Upper bound on window starts for "begin"; lower
                bound for "end"; used to size the region around the stop codon
                for "stop3utr".
            windowStep: Distance between consecutive window starts.
            reference: One of "begin", "end" or "stop3utr".
            shuffleType: Source identifier of the shuffle type.
            debug: When true, the computed profile is pretty-printed.

        Returns:
            None. Returns early (after logging a warning) when no requested
            shuffle-id has missing windows.

        Raises:
            Exception: For an unsupported ``reference``, a missing species
                temperature, a sequence shorter than the window, an empty
                window set for "begin", an empty sequence, or a sequence of
                unexpected length.
        """
        timerForPreFolding.start()
        logging.warning("Parameters: %d %s %s %s %d %d %s %d" % (taxId, protId, seqIds, requestedShuffleIds, lastWindowStart, windowStep, reference, shuffleType))
        f = self._logfile

        assert(len(seqIds)>0)
        assert(len(seqIds)==len(requestedShuffleIds))

        # Human-readable identification of the record, used in error messages.
        itemDesc = "%d/%s" % (taxId, protId)

        # ------------------------------------------------------------------------
        # Obtain species-dependent properties needed for some calculations
        # ----------------
        # Optimal Temp
        optimalSpeciesGrowthTemperature = None
        if( self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp ):
            (numericalProp, _) = getSpeciesTemperatureInfo(taxId)
            optimalSpeciesGrowthTemperature = numericalProp[0]

            if optimalSpeciesGrowthTemperature is None:
                raise Exception("No temperature value for taxid={}, can't calculate native-temperature folding profile...".format(taxId))
            else:
                optimalSpeciesGrowthTemperature = float(optimalSpeciesGrowthTemperature)
                assert(optimalSpeciesGrowthTemperature >= -30.0 and optimalSpeciesGrowthTemperature <= 150.0)
        # ----------------
        # Genomic translation table
        genomicTranslationTable = None
        if( self._seriesSourceNumber in (db.Sources.StopCodon_content_SlidingWindow30, db.Sources.StopCodon_content_SlidingWindow40, db.Sources.StopCodon_content_SlidingWindow50 )):
            genomicTranslationTable = getSpeciesTranslationTable(taxId)
            assert(genomicTranslationTable>0 and genomicTranslationTable<=31)

        if( reference != "begin" and reference != "end" and reference != "stop3utr"):
            timerForPreFolding.stop()
            e = "Specificed profile reference '%s' is not supported!" % reference
            logging.error(e)
            raise Exception(e)

        # We will process all listed shuffle-ids for the following protein record
        if( reference == "begin" or reference == "end" ):
            regionOfInterest = RegionsOfInterset.CDSonly
        elif reference == "stop3utr":
            regionOfInterest = RegionsOfInterset.CDSand3UTR
        else:
            assert(False)

        cds = CDSHelper( taxId, protId, regionOfInterest=regionOfInterest )

        if( cds.length() < self._windowWidth ):
            e = "Refusing to process item %s because the sequence length (%d nt) is less than the window size (%d nt)\n" % (itemDesc, cds.length(), self._windowWidth)
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        # Create a list of the windows we need to calculate for this CDS
        if reference == "begin":
            requestedWindowStarts = frozenset(list(range(0, min(lastWindowStart+1, cds.length()-self._windowWidth-1), windowStep)))
            if( len(requestedWindowStarts) == 0):
                e = "No windows exist for calculation taxid=%d, protId=%s, CDS-length=%d, lastWindowStart=%d, windowStep=%d, windowWidth=%d - Skipping...\n" % (taxId, protId, cds.length(), lastWindowStart, windowStep, self._windowWidth)
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

        elif reference == "end":
            # lastWindowStart acts as a lower bound when reference=="end"
            lastPossibleWindowStart = cds.length() - self._windowWidth
            requestedWindowStarts = frozenset([x for x in range(lastPossibleWindowStart % windowStep, lastPossibleWindowStart+1, windowStep) if x>=lastWindowStart])

        elif reference == "stop3utr":
            seqLength = cds.length()
            stopCodonPos = cds.CDSlength()

            # The selector must be paired with the same stepped positions it was
            # computed from; pairing it with 0,1,2,... would pick the wrong
            # starts whenever windowStep > 1.
            candidateStarts = range(0, seqLength - self._windowWidth, windowStep)
            isRequired = [1 if abs(pos-stopCodonPos)<((lastWindowStart//2)*windowStep) else 0 for pos in candidateStarts]
            requestedWindowStarts = frozenset( compress( candidateStarts, isRequired ) )

        else:
            assert(False)

        # NOTE(review): for "end" and "stop3utr" an empty window set is not
        # rejected here; max(requestedWindowStarts) further down would then
        # raise ValueError. Confirm whether that case can occur.

        # First, read available results (for all shuffle-ids) in JSON format
        # The result is keyed by shuffle-id; requested items that have no results yet are represented by None.
        logging.info("DEBUG: requestedShuffleIds (%d items): %s\n" % (len(requestedShuffleIds), requestedShuffleIds))
        existingResults = cds.getCalculationResult2( self._seriesSourceNumber, requestedShuffleIds, True, shuffleType=shuffleType )
        assert(len(existingResults) == len(requestedShuffleIds))
        logging.info("requestedShuffleIds: %s" % requestedShuffleIds)
        logging.info("existingResults.keys(): %s" % list(existingResults.keys()))
        assert(frozenset(requestedShuffleIds)==frozenset(list(existingResults.keys())))
        logging.info("DEBUG: existingResults (%d items): %s\n" % (len(existingResults), existingResults))

        # Check for which of the requested shuffle-ids there are values missing
        shuffleIdsToProcess = {}
        for shuffleId, r in list(existingResults.items()):
            if r is None:
                # There are no existing results for shuffled-id n. If it was requested, it should be calculated now (including all windows)
                if shuffleId in requestedShuffleIds:
                    shuffleIdsToProcess[shuffleId] = list(requestedWindowStarts)

                # NOTE(review): the timer is stopped here on every empty record
                # and again after the loop - confirm repeated stop() is harmless.
                timerForPreFolding.stop()

                # ------------------------------------------------------------------------------------
                continue   # TODO - verify this line; should we abort this sequence by throwing????
                # ------------------------------------------------------------------------------------

            logging.info("/// shuffleId r = %d %s" % (shuffleId, r))
            logging.info("r[MFE-profile] %s" % r["MFE-profile"])

            # Check the existing results for this shuffle
            alreadyProcessedWindowStarts = frozenset( [i for i,x in enumerate(r["MFE-profile"] ) if x is not None] ) # Get the indices (=window starts) of all non-None values
            missingWindows = requestedWindowStarts - alreadyProcessedWindowStarts # Are there any requested windows that are not already computed?
            if( missingWindows ):
                shuffleIdsToProcess[shuffleId] = missingWindows

        if( not shuffleIdsToProcess):
            e = "All requested shuffle-ids in (taxId: %d, protId: %s, seqs: %s) seem to have already been processed. Skipping...\n" % (taxId, protId, str(list(zip(seqIds, requestedShuffleIds))) )
            logging.warning(e)
            timerForPreFolding.stop()
            return
        logging.info("DEBUG: shuffleIdsToProcess (%d items): %s\n" % (len(shuffleIdsToProcess), shuffleIdsToProcess))

        logging.info("DEBUG: Before (%d items): %s\n" % (len(existingResults), existingResults))
        # Initialize new results records
        for shuffleId in list(shuffleIdsToProcess.keys()):
            if existingResults[shuffleId] is None:
                logging.info(seqIds)
                logging.info(requestedShuffleIds)
                logging.info(shuffleId)
                thisSeqId = seqIds[ requestedShuffleIds.index(shuffleId) ]

                existingResults[shuffleId] = { "id": "%s/%s/%d/%d" % (taxId, protId, thisSeqId, shuffleId), "seq-crc": None, "MFE-profile": [], "MeanMFE": None, "v": 2, "shuffle-type":shuffleType }
        logging.info("DEBUG: existingResults (%d items): %s\n" % (len(existingResults),existingResults) )
        timerForPreFolding.stop()

        # Load the sequences of all shuffle-ids we need to work on
        # TODO - combine loading of multiple sequences into one DB operation
        # NOTE(review): every non-empty record is processed, and all requested
        # windows are recalculated (the per-shuffle missing-window sets collected
        # above are only used to decide whether there is any work at all).
        for shuffleId, record in list(existingResults.items()):
            if record is None:
                logging.info("DEBUG: skipping empty results record for shuffleId={}".format(shuffleId))
                continue
            timerForPreFolding.start()

            seq = None
            annotatedSeqId = None
            # Get the sequence for this entry (negative shuffle-id means the native sequence)
            if( shuffleId < 0 ):
                seq = cds.sequence()
                annotatedSeqId = cds.seqId()
            else:
                seq = cds.getShuffledSeq(shuffleId, shuffleType)
                annotatedSeqId = cds.getShuffledSeqId(shuffleId, shuffleType)

            if( seq is None or len(seq)==0 ):
                # Gather diagnostics by re-fetching the sequence through several paths
                seq2 = cds.getShuffledSeq2( annotatedSeqId )
                seq3 = cds._fetchSequence( annotatedSeqId )
                seq4 = cds._cache.get("%d:seq"%annotatedSeqId)
                if not seq4 is None:
                    del cds._cache["%d:seq"%annotatedSeqId]
                seq5 = cds.getShuffledSeq2( annotatedSeqId )
                e = "Got empty sequence for shuffleId=%d, seqId=%d, taxId=%d, protId=%s, numShuffled=%d, ids[%d:%d]=%s, len(seq2)=%d, len(seq3)=%d, len(seq4)=%d, len(seq5)=%d" % (shuffleId, annotatedSeqId, taxId, protId, len(cds.shuffledSeqIds()), shuffleId-2, shuffleId+2, cds.shuffledSeqIds()[shuffleId-2:shuffleId+2], len(seq2) if not seq2 is None else -1, len(seq3) if not seq3 is None else -1, len(seq4) if not seq4 is None else -1, len(seq5) if not seq5 is None else -1 )
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

            # The check that annotatedSeqId appears in seqIds is intentionally
            # disabled - the calculation needn't include the native sequence.

            expectedSeqLength = cds.length()
            if( not expectedSeqLength is None ):
                if( expectedSeqLength != len(seq) ):
                    e = "Warning: taxid=%d, protid=%s, seqid=%d - unexpected length %d (expected: %d)\n" % (taxId, protId, annotatedSeqId, len(seq), expectedSeqLength)
                    f.write(e)
                    logging.error(e)
                    timerForPreFolding.stop()
                    raise Exception(e)

            if( len(seq) < self._windowWidth ):
                # Sequence is shorter than required window; skip
                e = "Warning: skipping sequence because it is shorter than the requested window...\n"
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

            logging.info("DEBUG: Processing item taxId=%d, protId=%s, shuffle=%d (length=%d, %d windows)...\n" % (taxId, protId, shuffleId, len(seq), len(requestedWindowStarts)))

            # TODO - Remove any old value stored in this key?

            logging.info(seq[:50])

            MFEprofile = record["MFE-profile"]

            # Make sure the profile array contains enough entries for all new windows (and possibly, if windows are non-contiguous, entries between them that we are not going to compute right now)
            # The profile is indexed by window start, so it needs at least
            # (largest start + 1) entries.
            lastRequestedStart = max(requestedWindowStarts)
            if( len(MFEprofile) <= lastRequestedStart ):
                entriesToAdd = lastRequestedStart - len(MFEprofile) + 1
                MFEprofile.extend( [None] * entriesToAdd )
            assert(len(MFEprofile) > lastRequestedStart)

            stats = RunningStats()
            stats.extend([x for x in MFEprofile if x is not None])

            timerForPreFolding.stop()
            timerForFolding.start()
            for start in requestedWindowStarts:
                fragment = seq[start:(start+self._windowWidth)]
                assert(len(fragment)==self._windowWidth)

                if self._seriesSourceNumber in (db.Sources.RNAfoldEnergy_SlidingWindow30_v2, db.Sources.RNAfoldEnergy_SlidingWindow40_v2, db.Sources.RNAfoldEnergy_SlidingWindow50_v2):
                    # Calculate the RNA folding energy. This is the computation-heavy part.
                    result = RNAfold_direct(fragment)
                    assert(result <= 0.0)

                elif self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp:
                    # Calculate the RNA folding energy at the species' growth temperature. This is the computation-heavy part.
                    result = RNAfold_direct(fragment, explicitCalculationTemperature = optimalSpeciesGrowthTemperature)
                    assert(result <= 0.0)

                elif self._seriesSourceNumber == db.Sources.GC_content_SlidingWindow40:
                    result = calcWindowGCContent( fragment )
                    assert( isnan(result) or (result >= 0.0 and result <= 1.0) )

                elif self._seriesSourceNumber == db.Sources.Purine_content_SlidingWindow40:
                    result = calcWindowPurineContent( fragment )
                    assert( isnan(result) or (result >= 0.0 and result <= 1.0) )

                elif self._seriesSourceNumber in (db.Sources.StopCodon_content_SlidingWindow30, db.Sources.StopCodon_content_SlidingWindow40, db.Sources.StopCodon_content_SlidingWindow50):
                    result = calcWindowStopCodonContent( fragment, translationTable=genomicTranslationTable, phase=start%3 )
                    assert( result >= 0.0 and result <= 1.0 )

                elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_BeginReferenced:
                    if shuffleId < 0:
                        result = 0
                    else:
                        result = start%50 - 20

                elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_EndReferenced:
                    if shuffleId < 0:
                        result = 0
                    else:
                        result = (expectedSeqLength - self._windowWidth - start)%50 - 20

                else:
                    logging.error("Received unknown seriesSourceNumber {}".format(self._seriesSourceNumber))
                    assert(False)

                # Store the calculation result
                stats.push(result)
                MFEprofile[start] = result

            if debug:
                prettyPrintProfile(MFEprofile)

            timerForFolding.stop()
            timerForPostFolding.start()

            # Format
            crc = getCrc(seq)
            record["seq-crc"] = crc
            record["MFE-profile"] = [round4(x) for x in MFEprofile] # Round items down to save space (these are not exact numbers anyway)
            record["MeanMFE"] = stats.mean()

            if reference == "stop3utr":
                record["stop-codon-pos"] = cds.CDSlength()

            result = json.dumps(record)

            f.write(result)
            f.write("\n")

            if( not self._debugDoneWriteResults):
                cds.saveCalculationResult2( self._seriesSourceNumber, result, annotatedSeqId, False )

            timerForPostFolding.stop()

        timerForPostFolding.start()

        if( not self._debugDoneWriteResults):
            cds.commitChanges()

        timerForPostFolding.stop()
    inPhyloTree = taxId in speciesInTree
    if inPhyloTree:
        stats.update(['tree'])

    speciesDf = speciesDf.append(
        pd.DataFrame({
            'TaxId':
            pd.Series([taxId], dtype='int'),
            'Species':
            pd.Series([getSpeciesName(taxId)], dtype='str'),
            'Nickname':
            pd.Series([shortNames[taxId]], dtype='str'),
            'Source':
            pd.Series([''], dtype='str'),
            'TranslationTbl':
            pd.Series([getSpeciesTranslationTable(taxId)], dtype='int'),
            'InPhyloTree':
            pd.Series([inPhyloTree], dtype='bool'),
            'GenomicGC%':
            pd.Series([genomicGC], dtype='float'),
            'GenomicENc\'':
            pd.Series([genomicENcprime], dtype='float'),
            'GrowthTempC':
            pd.Series([optimumTemp], dtype='float'),
            'GenomeSizeMb':
            pd.Series([genomeSizeMb], dtype='float'),
            'GrowthTimeHours':
            pd.Series([growthTimeHours], dtype='float'),
            'IsEndosymbiont':
            pd.Series([isEndosymbiont(taxId)], dtype='bool'),
            'EndosymbiontRef':