def annotateENcPrime(taxId, overwrite=False):
    """Make sure the 'ENc' and 'ENc-prime' properties exist for a species.

    When both properties are already stored and `overwrite` is false, the
    stored values are returned untouched. Otherwise both values are computed
    with calculateENcPrimeForSpecies() and written via setSpeciesProperty().

    Returns:
        (taxId, ENc, ENc_prime, isFresh) -- isFresh is True when the values
        were computed by this call and False when the stored values (exactly
        as returned by getSpeciesProperty) are handed back.

    Raises:
        AssertionError: a freshly computed value is outside (10.0, 75.0).
    """
    storedENc = getSpeciesProperty(taxId, 'ENc')[0]
    storedENcPrime = getSpeciesProperty(taxId, 'ENc-prime')[0]

    bothStored = (storedENc is not None) and (storedENcPrime is not None)
    if bothStored and not overwrite:
        # Nothing to compute: report the old values, flagged as not fresh.
        return (taxId, storedENc, storedENcPrime, False)

    ENc, ENc_prime = calculateENcPrimeForSpecies(taxId)

    # The true extreme values of ENc are not known here; this is only a
    # sanity check on the computed numbers.
    assert 10.0 < ENc < 75.0
    assert 10.0 < ENc_prime < 75.0

    for propName, propValue in (('ENc', ENc), ('ENc-prime', ENc_prime)):
        setSpeciesProperty(taxId, propName, str(propValue), "ENCprime (custom version)")

    return (taxId, ENc, ENc_prime, True)
def getTraits(
    taxIds,
    traits=(("gc-content", "float"),
            ("ENc-prime", "float"),
            ("optimum-temperature", "float"),
            ("is-endosymbiont", "int"),
            ("is-high-temp", "int"))
):
    """Collect per-species trait values into a DataFrame.

    Args:
        taxIds: species identifiers; they become the index of the result.
        traits: sequence of (trait name, pandas dtype) pairs, one per column.

    Returns:
        A DataFrame indexed by `taxIds` with one column per trait. Cells for
        which no value could be determined are left as created (missing).

    Two trait names are derived rather than read directly:
      * "is-endosymbiont" is taken from isEndosymbiont(taxId);
      * "is-high-temp" is 1 when the stored 'optimum-temperature' exceeds
        75.0 and 0 otherwise (including when no temperature is stored).
    Every other trait is read with getSpeciesProperty() and converted to
    float.
    """
    emptyColumns = {name: pd.Series(dtype=kind) for name, kind in traits}
    df = pd.DataFrame(emptyColumns, index=taxIds)

    for trait, _ in traits:
        for taxId in taxIds:
            traitVal = None

            if trait == "is-endosymbiont":
                traitVal = isEndosymbiont(taxId)

            elif trait == "is-high-temp":
                traitVal = 0
                rawTemp = getSpeciesProperty(taxId, "optimum-temperature")[0]
                if rawTemp is not None:
                    tempVal = float(rawTemp)
                    print("{} -> {}".format(taxId, tempVal))
                    if tempVal > 75.0:
                        traitVal = 1
                        print(taxId)

            else:
                rawValue = getSpeciesProperty(taxId, trait)[0]
                if rawValue is not None:
                    traitVal = float(rawValue)

            if traitVal is None:
                continue  # nothing known for this cell; leave it missing

            print("{} {} -> {}".format(taxId, trait, traitVal))
            df.loc[taxId, trait] = traitVal

    return df
def runDistributed():
    """Compute missing 'ENc-prime' values for all species on the cluster.

    One delayed annotateENcPrime() call is queued for every species yielded
    by allSpeciesSource() that has no stored 'ENc-prime' property. The calls
    are submitted through the scheduler returned by _distributed.open(), and
    the results are gathered one future at a time so that a failing task only
    increments the error count instead of aborting the run.

    Returns:
        dict mapping taxId -> (ENc, ENc_prime) for every task that succeeded.
    """
    import _distributed
    import dask

    scheduler = _distributed.open()

    delayedCalls = []
    for taxId in allSpeciesSource():
        if getSpeciesProperty(taxId, "ENc-prime")[0] is not None:
            continue  # already annotated

        print(taxId)
        delayedCalls.append(dask.delayed(annotateENcPrime)(taxId))

    print("Starting %d calls..." % len(delayedCalls))

    # Submit all delayed calculations; futures are obtained immediately.
    futures = scheduler.compute(delayedCalls)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress display is best-effort; the original printed the undefined
        # name `E` here, which raised NameError and aborted the whole run.
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    newValuesCount = 0
    oldValuesCount = 0
    for f in futures:
        try:
            (taxId, ENc, ENc_prime, isFreshValue) = scheduler.gather(f)
        except Exception as e:
            # A failed task must not prevent collecting the remaining ones.
            print(e)
            errorsCount += 1
            continue

        results[taxId] = (ENc, ENc_prime)
        if isFreshValue:
            newValuesCount += 1
        else:
            oldValuesCount += 1

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    print("{} new values; {} old values".format(newValuesCount, oldValuesCount))
    return results
def calculateENcPrimeForSpecies(taxId, orig=False):
    """Run the ENc-prime pipeline for a single species.

    The species' sequences are written to a temporary FASTA file (using
    writeSequenceToTempFile_orig() when `orig` is true, otherwise
    writeSequenceToTempFile()), codon and nucleotide counts are created from
    that file, and the result of createEncPrimeReport() is returned.

    Returns:
        Whatever createEncPrimeReport() returns; callers in this file unpack
        it as (ENc, ENc_prime).
    """
    geneticCode = getSpeciesTranslationTable(taxId)

    writeSequences = writeSequenceToTempFile_orig if orig else writeSequenceToTempFile
    cdsCount, fastaFile = writeSequences(taxId)
    fastaPath = fastaFile.name

    createCodonCounts(fastaPath, cdsCount)
    createNucleotideCounts(fastaPath, cdsCount)

    # Informational only: show the stored genomic GC property for reference.
    print("Genomic GC%: {}".format(getSpeciesProperty(taxId, 'gc-content')))

    return createEncPrimeReport(fastaPath, geneticCode)
def runDistributed():
    """One-off correction of stored 'paired-mRNA-fraction' values.

    For every species whose 'paired-mRNA-fraction' property exists and whose
    source tag is exactly "computed", the stored value is doubled and written
    back with the source tag "computed (v2)" (overwriting the old entry).
    Values with any other source tag are left alone, so entries already
    rewritten as "computed (v2)" are not doubled again.
    """
    for taxId in allSpeciesSource():
        currentProp = getSpeciesProperty(taxId, 'paired-mRNA-fraction')

        isUncorrected = (currentProp[0] is not None) and (currentProp[1] == "computed")
        if not isUncorrected:
            continue

        origVal = float(currentProp[0])
        fixedVal = origVal * 2

        setSpeciesProperty(taxId,
                           "paired-mRNA-fraction",
                           "%.4g" % fixedVal,
                           "computed (v2)",
                           overwrite=True)
        print("Fixed %d: %.4g -> %.4g" % (taxId, origVal, fixedVal))
def annotateCUBmeasures(taxId, overwrite=False):
    """Make sure the genome-level codon-usage-bias properties are stored.

    If 'genomic-CAI' is already stored and `overwrite` is false, the stored
    value is returned. Otherwise calculateGenomeLevelCUBmeasures() is run and
    the CAI, CBI, Fop and Nc values from the first row of its result are
    range-checked and stored as 'genomic-CAI', 'genomic-CBI', 'genomic-Fop'
    and 'genomic-Nc-codonw'.

    Returns:
        (taxId, CAI, isFresh) -- isFresh is True when the value was computed
        by this call, False when the stored value is handed back.

    Raises:
        AssertionError: a computed measure is outside its expected range.
    """
    storedCAI = getSpeciesProperty(taxId, 'genomic-CAI')[0]
    if storedCAI is not None and not overwrite:
        # Old value; the trailing False tells the caller nothing was computed.
        return (taxId, storedCAI, False)

    cubDf = calculateGenomeLevelCUBmeasures(taxId)
    print(cubDf)

    genomeRow = cubDf.iloc[0]
    CAI = genomeRow.at['CAI']
    CBI = genomeRow.at['CBI']
    Fop = genomeRow.at['Fop']
    Nc = genomeRow.at['Nc']

    assert 0.0 < CAI < 1.0
    assert -0.5 < CBI < 1.0
    assert 0.0 < Fop < 1.0
    # The true extreme values of Nc are not known here; sanity check only.
    assert 10.0 < Nc < 75.0

    print(CAI, CBI, Fop, Nc)

    measures = (
        ('genomic-CAI', CAI),
        ('genomic-CBI', CBI),
        ('genomic-Fop', Fop),
        ('genomic-Nc-codonw', Nc),
    )
    for propName, value in measures:
        setSpeciesProperty(taxId, propName, "{:.4}".format(value), "codonw 1.4.4")

    return (taxId, CAI, True)
def annotateDCBS(taxId, overwrite=False):
    """Make sure the 'DCBS-geomean' property is stored for a species.

    A stored value is reused unless `overwrite` is true; otherwise the value
    is computed with calcDCBS(), converted to float and stored.

    Returns:
        (taxId, DCBS, isFresh) -- isFresh is True when the value was computed
        by this call, False when the stored value is handed back as-is.
    """
    storedDCBS = getSpeciesProperty(taxId, 'DCBS-geomean')[0]
    if not overwrite and storedDCBS is not None:
        return (taxId, storedDCBS, False)

    DCBS = float(calcDCBS(taxId))
    setSpeciesProperty(taxId, 'DCBS-geomean', str(DCBS), "DCBS (matlab, Renana)")
    return (taxId, DCBS, True)
def getSpeciesPaxdbDataFromFile(taxId):
    """Load the PaxDb data for a species from its registered file.

    The file path is read from the species' 'paxdb-path' property. Returns an
    empty dict when no path is stored, otherwise the result of
    parsePaxDbFile() for that path.
    """
    paxdbPath, _unused = getSpeciesProperty(taxId, 'paxdb-path')
    return {} if paxdbPath is None else parsePaxDbFile(paxdbPath, taxId=taxId)
#Counter({'Mesophilic': 79, 'Hyperthermophilic': 20, 'Thermophilic': 13, 'Psychrophilic': 4, 'Unknown': 1}) # Temperatures and categories for all species *that have temperatures* temperatureVsCategoryStatistics = pd.DataFrame({ 'tax_id':pd.Series(dtype='int'), 'temperature':pd.Series(dtype='float'), 'category':pd.Categorical([]) }) # Plot raw data for taxId in allSpeciesSource(): category = None temperatureRange = getSpeciesProperty( taxId, 'temperature-range') if not temperatureRange[0] is None: category = temperatureRange[0] categories.update((category,)) else: category = "Unknown" assert(not category is None) optimalTemperatureData = getSpeciesProperty( taxId, 'optimum-temperature') optimalTemperature = None if not optimalTemperatureData[0] is None: optimalTemperature = float(optimalTemperatureData[0]) temperatureVsCategoryStatistics = temperatureVsCategoryStatistics.append(pd.DataFrame({ 'tax_id':pd.Series([taxId], dtype='int'), 'temperature':pd.Series([optimalTemperature], dtype='float'),
def runDistributed():
    """Compute and store 'paired-mRNA-fraction' for a random subset of species.

    Each selected species is split into `numFractions` tasks, each a delayed
    calcNativePairedFraction(taxId, i, numFractions) call. The per-task
    results are summed per species, and the ratio of paired to total
    nucleotides is stored as 'paired-mRNA-fraction' with source
    "computed (v3)".

    Returns:
        dict mapping taxId -> (cdsCount, pairedNucleotides, totalNucleotides,
        set of fraction indices received).
    """
    import _distributed
    import dask

    scheduler = _distributed.open()

    delayedCalls = []
    fractionSize = 20  # roughly how many sequences each task handles

    for taxId in allSpeciesSource():
        # Only about 1 in 21 species is processed per invocation.
        # NOTE(review): presumably deliberate sampling -- confirm.
        if randint(0, 20) > 0:
            continue

        if getSpeciesProperty(taxId, 'paired-mRNA-fraction')[0] is not None:
            continue  # already computed

        size = countSpeciesCDS(taxId)

        # Floor division: `/` yields a float on Python 3 and range() would
        # raise TypeError. `//` gives the same integer result on Python 2.
        numFractions = size // fractionSize
        for i in range(numFractions):
            call = dask.delayed(calcNativePairedFraction)(taxId, i, numFractions)
            delayedCalls.append(call)

    print("Starting %d calls..." % len(delayedCalls))

    # Submit all delayed calculations; futures are obtained immediately.
    futures = scheduler.compute(delayedCalls)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # The original printed the undefined name `E`, turning any progress
        # failure into a NameError that aborted the run.
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    for f in futures:
        try:
            (taxId, fraction, cdsCount, countPairedNucleotides,
             countTotalNucleotides) = scheduler.gather(f)
        except Exception as e:
            # A failed task must not prevent collecting the remaining ones.
            print(e)
            errorsCount += 1
            continue

        current = results.get(taxId, (0, 0, 0, set()))
        results[taxId] = (current[0] + cdsCount,
                          current[1] + countPairedNucleotides,
                          current[2] + countTotalNucleotides,
                          current[3].union({fraction}))

    for taxId, result in results.items():
        # Completeness check: the fraction indices seen must be contiguous
        # from 0. NOTE(review): this cannot detect missing trailing fractions,
        # because the expected count is not tracked here.
        if len(result[3]) != max(result[3]) + 1:
            print("Found invalid number of items for taxId=%d" % taxId)
            continue

        if result[2] == 0:
            # Avoid ZeroDivisionError discarding every other species' result.
            print("No nucleotides counted for taxId=%d" % taxId)
            continue

        fraction = float(result[1]) / result[2]

        setSpeciesProperty(taxId,
                           "paired-mRNA-fraction",
                           "%.4g" % fraction,
                           "computed (v3)",
                           overwrite=False)

        print("TaxId: %d\t\tFraction: %.4g" % (taxId, fraction))

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    return results
ret.append(taxId) return ret # get tree (_, prunedTree) = pruneReferenceTree_Nmicrobiol201648( getSpeciesToInclude() ) # prune complete reference phylogenetic tree to include only dataset species speciesInTree = getSpeciesFromTree(prunedTree) shortNames = getSpeciesShortestUniqueNamesMapping_memoized() stats = Counter() for taxId in getSpeciesToInclude(): genomicGC = getSpeciesProperty(taxId, 'gc-content')[0] if not genomicGC is None: genomicGC = float(genomicGC) genomicENcprime = getSpeciesProperty(taxId, 'ENc-prime')[0] if not genomicENcprime is None: genomicENcprime = float(genomicENcprime) optimumTemp = getSpeciesProperty(taxId, 'optimum-temperature')[0] if not optimumTemp is None: optimumTemp = float(optimumTemp) genomeSizeMb = getSpeciesProperty(taxId, 'genome-size-mb')[0] if not genomeSizeMb is None: genomeSizeMb = float(genomeSizeMb)
def speciesStatisticsAndValidityReport(args):
    """Build the per-species statistics/validity report and write it to disk.

    For every species from allSpeciesSource() that is not in
    `speciesToExclude` (and, when `args.taxid` is non-empty, is listed in it),
    a row of annotated metadata is collected. Distributed tasks are then run
    to gather statistics over the native sequences
    (calcNativeSequencesStatistics) and the number of shuffled profiles
    (countShuffledProfiles), and the results are merged into the table.

    Args:
        args: object with a `taxid` attribute; when truthy it is used as a
            whitelist of species to include.

    Outputs (written to the current directory):
        species_report.html, species_report_simple.rst,
        species_report_simple.html, species_report.xlsx

    Returns:
        None.
    """
    import _distributed
    # Imported locally, as the sibling runDistributed() functions do, so the
    # dask.delayed() calls below do not depend on a module-level import.
    import dask

    speciesDf = pd.DataFrame({
        'TaxId': pd.Series([], dtype='int'),  # Species TaxId
        'Species': pd.Series([], dtype='str'),  # Species binomial name
        'Nickname': pd.Series([], dtype='str'),
        'Domain': pd.Categorical([]),  # Bacteria, Eukaryota, Archaea
        'Phylum': pd.Categorical([]),  # Phylum name (string)
        'NumCDSs': pd.Series([], dtype='int'),  # CDS count for this species
        'NumCDSsInProfile': pd.Series([], dtype='int'),  # Num seqs with 20 shuffled profiles for this species
        'AnnotatedNumCDSs': pd.Series([], dtype='int'),
        'CDSDifference': pd.Series([], dtype='float'),
        'NumNativeSeqs': pd.Series([], dtype='int'),
        'GCContentInCDS': pd.Series([], dtype='float'),
        'AnnotatedGCContent': pd.Series([], dtype='float'),
        'RowType': pd.Categorical([]),  # Species count or total
        'Warnings': pd.Series([], dtype='str'),
        'CDSWarnings': pd.Series([], dtype='int'),
        'CDSWarnings_': pd.Series([], dtype='str'),
        'FirstAA': pd.Series([], dtype='str'),
        'LastAA': pd.Series([], dtype='str')
    })

    scheduler = _distributed.open()

    delayedCalls_native = []

    shuffledCounts = {}
    delayedCalls_shuffledProfiles = []

    fractionSize = 1000  # How many sequences (roughly) to process in each task

    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue  # always exclude species from the blacklist
        if args.taxid and taxId not in args.taxid:
            continue  # if a whitelist is specified, skip other species

        warnings = []

        cdsCountInRedis = countSpeciesCDS(taxId)

        annotatedProteinCount = getSpeciesProperty(taxId, 'protein-count')[0]
        annotatedGCContent = getSpeciesProperty(taxId, 'gc-content')[0]

        # Percent difference between stored CDS count and annotated count.
        proteinDifference = None
        if annotatedProteinCount is not None:
            proteinDifference = (1.0 - float(cdsCountInRedis) /
                                 float(annotatedProteinCount)) * 100.0
            if abs(proteinDifference) > 9.9:
                warnings.append("CDS_count")
        else:
            warnings.append("No_CDS_count")

        # Taxonomic lineage, used for both domain and phylum.
        lineage = ncbiTaxa.get_lineage(taxId)
        names = ncbiTaxa.get_taxid_translator(lineage)
        ranks = ncbiTaxa.get_rank(lineage)

        # Determine kingdom/domain: prefer 'superkingdom', fall back to
        # 'kingdom'. If neither rank is present the domain stays empty (as
        # for phylum below) instead of raising IndexError.
        domain = ""
        kingdomTaxId = [t for t, rank in ranks.items() if rank == 'superkingdom']
        if not kingdomTaxId:
            kingdomTaxId = [t for t, rank in ranks.items() if rank == 'kingdom']
        if kingdomTaxId:
            domain = names[kingdomTaxId[0]]

        # Determine phylum
        phylumName = ""
        phylumTaxId = [t for t, rank in ranks.items() if rank == 'phylum']
        if phylumTaxId:
            phylumName = names[phylumTaxId[0]]

        # NOTE(review): DataFrame.append() was removed in pandas 2.0; this
        # code requires an older pandas. Left as-is to keep dtypes unchanged.
        speciesDf = speciesDf.append(
            pd.DataFrame({
                'TaxId': pd.Series([taxId], dtype='int'),  # Species TaxId
                'Species': pd.Series([getSpeciesName(taxId)], dtype='str'),
                'Nickname': pd.Series([shortNames[taxId]], dtype='str'),
                'Domain': pd.Categorical([domain]),  # Bacteria, Eukaryota, Archaea
                'Phylum': pd.Categorical([phylumName]),  # Phylum name (string)
                'NumCDSs': pd.Series([cdsCountInRedis], dtype='int'),  # CDS count for this species
                'NumCDSsInProfile': pd.Series([0], dtype='int'),  # Num seqs with 20 shuffled profiles
                'AnnotatedNumCDSs': pd.Series(
                    [0 if annotatedProteinCount is None else annotatedProteinCount],
                    dtype='int'),
                'CDSDifference': pd.Series([proteinDifference], dtype='float'),
                'NumNativeSeqs': pd.Series([0], dtype='int'),
                'GCContentInCDS': pd.Series([0.0], dtype='float'),
                'AnnotatedGCContent': pd.Series([annotatedGCContent], dtype='float'),
                'RowType': pd.Categorical(["species"]),  # Species count or total
                'Warnings': pd.Series([", ".join(warnings)], dtype='str'),
                'CDSWarnings': pd.Series([0], dtype='int'),
                'CDSWarnings_': pd.Series([""], dtype='str'),
                'FirstAA': pd.Series([""], dtype='str'),
                'LastAA': pd.Series([""], dtype='str'),
                'Source': pd.Series([""], dtype='str')
            }))

        # Floor division: `/` yields a float on Python 3, which breaks both
        # the `== 0` fallback and range(). `//` matches Python 2 behaviour.
        numFractions = cdsCountInRedis // fractionSize
        if numFractions == 0:
            numFractions = 1  # small genomes still get a single task

        for i in range(numFractions):
            call = dask.delayed(calcNativeSequencesStatistics)(taxId, i, numFractions)
            delayedCalls_native.append(call)

        call = dask.delayed(countShuffledProfiles)(taxId, (310, 10, "begin", 0), 102, 11)
        delayedCalls_shuffledProfiles.append(call)

    speciesDf.set_index('TaxId', inplace=True)

    print("Starting {} calls...".format(
        len(delayedCalls_native) + len(delayedCalls_shuffledProfiles)))

    # Submit all delayed calculations; futures are obtained immediately.
    futures = scheduler.compute(delayedCalls_native + delayedCalls_shuffledProfiles)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # The original printed the undefined name `E`, turning any progress
        # failure into a NameError that aborted the report.
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    for f in futures:
        try:
            ret = scheduler.gather(f)

            # The two task kinds are told apart by the length of their result.
            if len(ret) == 9:
                (taxId, fraction, cdsCount, gcCounts, totalCounts, cdsWarnings,
                 warnings, firstAA, lastAA) = ret

                current = results.get(
                    taxId, (0, 0, 0, 0, Counter(), Counter(), Counter()))
                results[taxId] = (current[0] + cdsCount,
                                  current[1] + gcCounts,
                                  current[2] + totalCounts,
                                  current[3] + cdsWarnings,
                                  current[4] + warnings,
                                  current[5] + firstAA,
                                  current[6] + lastAA)

            elif len(ret) == 2:
                (taxId, numShuffledSeqs) = ret
                shuffledCounts[taxId] = numShuffledSeqs

            else:
                # Explicit raise instead of `assert False`, which is stripped
                # under -O; still caught and counted as an error below.
                raise ValueError(
                    "Unexpected task result of length {}".format(len(ret)))

        except Exception as e:
            # A failed task must not prevent collecting the remaining ones.
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        (numNativeSeqs, gcCounts, totalCounts, cdsWarnings, warnings, firstAA,
         lastAA) = result
        speciesDf.at[taxId, 'NumNativeSeqs'] = numNativeSeqs

        if totalCounts:
            speciesDf.at[taxId, 'GCContentInCDS'] = round(
                float(gcCounts) / float(totalCounts) * 100.0, 1)
        # else: keep the 0.0 placeholder rather than fail with
        # ZeroDivisionError and lose the whole report.

        speciesDf.at[taxId, 'CDSWarnings'] = cdsWarnings
        speciesDf.at[taxId, 'CDSWarnings_'] = summarizeCounter(warnings)
        speciesDf.at[taxId, 'FirstAA'] = summarizeCounter(firstAA)
        speciesDf.at[taxId, 'LastAA'] = summarizeCounter(lastAA)

    for taxId, result in shuffledCounts.items():
        speciesDf.at[taxId, 'NumCDSsInProfile'] = result

    speciesDf = speciesDf.sort_values(by=['Domain', 'Species'])  # sort rows

    speciesDf.to_html('species_report.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                          'FirstAA', 'LastAA'
                      ])

    with open("species_report_simple.rst", "w") as rstFile:
        rstFile.write(
            speciesDf.drop([
                'RowType', 'Warnings', 'CDSWarnings', 'CDSWarnings_', 'FirstAA',
                'LastAA', 'CDSDifference'
            ], axis=1).pipe(tabulate, headers='keys', tablefmt='rst'))

    speciesDf.to_html('species_report_simple.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain'
                      ])

    speciesDf.to_excel('species_report.xlsx', sheet_name='Species summary')