def plotProfile(taxId, data):
    """Plot the mean-LFE and GC% profiles of one species and save them as PDF and SVG.

    The figure has two vertically stacked panels sharing the x axis: the upper
    one shows the 'native' and 'shuffled' columns of `data`, the lower one
    shows its 'gc' column.

    Args:
        taxId: Species identifier; passed to getSpeciesName() for the title and
            to getSpeciesFileName() for the output file names.
        data: Object supporting ``data[['native', 'shuffled']].plot(ax=...)``
            and ``data['gc'].plot(ax=...)`` -- presumably a pandas DataFrame
            indexed by window position (TODO confirm against the caller).

    Side effects:
        Reads the module-level ``args.profile`` (``ProfileReference`` for the
        x-axis label, ``ProfileId`` for the file names) and writes
        ``mfe_40nt_cds_<profileId>_<speciesFileName>.pdf`` and ``.svg``.
    """
    fig, (ax1, ax2) = plt.subplots(2, sharex=True, gridspec_kw={'height_ratios': [2, 1]})

    # try/finally guarantees the figure is released even if plotting or saving fails.
    try:
        data[['native', 'shuffled']].plot(ax=ax1)

        plt.xlabel('Position (nt, window start, from cds %s)' % args.profile.ProfileReference)

        ax1.set_title("Mean LFE for %s" % getSpeciesName(taxId))
        ax1.set_ylabel('Mean LFE')
        ax1.legend()
        ax1.grid(True)

        data['gc'].plot(ax=ax2)
        ax2.set_title("GC%")
        ax2.set_ylabel('GC% (in window)')
        ax2.grid(True)

        # ':' is replaced before the profile id is embedded in a file name.
        profileId = str(args.profile.ProfileId).replace(':', '-')
        baseName = "mfe_40nt_cds_%s_%s" % (profileId, getSpeciesFileName(taxId))

        # Save through the figure object rather than pyplot's "current figure".
        fig.savefig(baseName + ".pdf")
        fig.savefig(baseName + ".svg")
    finally:
        plt.close(fig)
def processGenome(taxId, args, outputPositions=OutputPositions, displayPositions=DisplayPositions):
    """Summarize the native LFE profiles of all genes of one species.

    All profiles yielded by getGeneNativeLFEProfiles(taxId, args) are stacked
    into one array; the NaN-ignoring mean and standard deviation are then
    taken along the first (stacking) axis.

    Args:
        taxId: Species identifier; also used as the index label of the result.
        args: Passed through unchanged to getGeneNativeLFEProfiles().
        outputPositions: Indices into the mean/std arrays to report.
        displayPositions: Labels used in the column names, paired one-to-one
            with `outputPositions`.

    Returns:
        A single-row pandas DataFrame indexed by `taxId` with columns
        'Species', 'Domain' and, for every reported position,
        'Mean at <label>' and 'Std at <label>'.

    Raises:
        ValueError: from np.stack() when no profiles are yielded.
    """
    allLFEs = np.stack(list(getGeneNativeLFEProfiles(taxId, args)))

    # NaN entries are ignored per position; an all-NaN position yields NaN.
    mean = np.nanmean(allLFEs, axis=0)
    std = np.nanstd(allLFEs, axis=0)

    # (column label, value, dtype) triples, in output order
    columns = [
        ('Species', getSpeciesName(taxId), 'str'),
        ('Domain', getDomainForSpecies(taxId), 'str'),
    ]
    for pos, displayPos in zip(outputPositions, displayPositions):
        columns.append(('Mean at {}'.format(displayPos), mean[pos], 'float'))
        columns.append(('Std at {}'.format(displayPos), std[pos], 'float'))

    return pd.DataFrame(
        {label: pd.Series([value], index=[taxId], dtype=dtype)
         for label, value, dtype in columns},
        index=[taxId])
def outputNodeExistenceInRnafoldDB():
    """Annotate the tree-node identifier table with an 'Included' flag and write it out.

    Reads the CSV at `nodeIdentifiersMappingTable_csv`, marks each row with 1
    if getSpeciesName() returns a name for its TaxId (0 otherwise), drops the
    'DBIdentifier', 'DBIdentifierType', 'TaxId' and 'Unnamed: 0' columns, and
    writes the result to `nodeIdentifiersMappingTable_with_inclusion_csv`.
    A summary of how many rows were included or excluded is printed.

    Returns:
        0 (always).

    Raises:
        KeyError: if one of the dropped columns is missing from the input.
    """
    from data_helpers import getSpeciesName

    treeNodeIdentifiersDf = pd.read_csv(
        nodeIdentifiersMappingTable_csv,
        dtype={'NodeLabel': 'string',
               'DBIdentifier': 'string',
               'DBIdentifierType': 'category',
               'TaxId': 'int32'})

    # One flag per row, in row order.
    inclusionFlags = [getSpeciesName(row.TaxId) is not None
                      for row in treeNodeIdentifiersDf.itertuples()]
    values = Counter(inclusionFlags)

    # Build the column in one go, aligned on the frame's own index, instead of
    # enlarging a Series element by element.
    treeNodeIdentifiersDf['Included'] = pd.Series(
        [1 if flag else 0 for flag in inclusionFlags],
        index=treeNodeIdentifiersDf.index,
        dtype='int32')

    # 'Unnamed: 0' is presumably an index column saved by an earlier to_csv() -- TODO confirm.
    treeNodeIdentifiersDf.drop(
        columns=['DBIdentifier', 'DBIdentifierType', 'TaxId', 'Unnamed: 0'],
        inplace=True)

    treeNodeIdentifiersDf.to_csv(nodeIdentifiersMappingTable_with_inclusion_csv, index=False)

    print("Output values summary: %s" % values)

    return 0
def taxIdToGenomeId(taxId):
    """Look up the Entrez 'genome' database ids matching a species.

    The species name (from getSpeciesName) is searched as a quoted organism
    term, e.g. "Saccharomyces cerevisiae"[ORGN]; at most 10 ids are requested.

    Args:
        taxId: Species identifier, resolved to a name with getSpeciesName().

    Returns:
        A list of ints: the ids found in the 'IdList' entry of the search result.
    """
    speciesName = getSpeciesName(taxId)

    # Pause before every request (delay configured by requestDelaySeconds).
    sleep(requestDelaySeconds)

    handle = Entrez.esearch(db="genome", retmax=10, term="\"%s\"[ORGN]" % speciesName)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()

    # A concrete list: usable more than once and indexable on Python 3 as well.
    return [int(genomeId) for genomeId in record['IdList']]
# Establish DB connections #r = redis.StrictRedis(host=config.host, port=config.port, db=config.db) #session = db.Session() skipped = 0 selected = 0 alreadyCompleted = 0 totalMissingResults = 0 queuedDelayedCalls = [] for taxIdForProcessing in species: print("Processing %d sequences for tax-id %d (%s)..." % (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing, getSpeciesName(taxIdForProcessing))) stats = Counter() # Iterate over all CDS entries for this species # TODO - preloading all sequences and results should optimize this for protId in SpeciesCDSSource(taxIdForProcessing): stats['all-sequences'] += 1 #protId = codecs.decode(protId) # Filtering # Only process 1/N of the sequences, selected randomly (N=randomFraction) # (if randomFraction==1, all sequences will be processed) if (randint(1, randomFraction) != 1):
if (encPropValue[0] is None) or (encPrimePropValue[0] is None) or overwrite: ENc, ENc_prime = calculateENcPrimeForSpecies(taxId) assert ( ENc < 75.0 and ENc > 10.0 ) # The actual extreme values for ENc are not clear to me, but let's do a sanity check assert (ENc_prime < 75.0 and ENc_prime > 10.0) setSpeciesProperty(taxId, 'ENc', str(ENc), "ENCprime (custom version)") setSpeciesProperty(taxId, 'ENc-prime', str(ENc_prime), "ENCprime (custom version)") else: return ( taxId, encPropValue[0], encPrimePropValue[0], False ) # return old values (last value indicates this value are old) return (taxId, ENc, ENc_prime, True ) # return values (last value indicates new values) if __name__ == "__main__": import sys taxId = int(sys.argv[1]) print("Name: {}".format(getSpeciesName(taxId))) print(annotateENcPrime(taxId)) print(calculateENcPrimeForSpecies(taxId, orig=True)) print(calculateENcPrimeForSpecies(taxId, orig=False))
def loadSpeciesMapping():
    """Fill the module-level `speciesMapping` with name -> taxId entries.

    Every taxId yielded by allSpeciesSource() is stored under the name that
    getSpeciesName() returns for it; a later taxId with the same name
    overwrites the earlier entry.
    """
    for currentTaxId in allSpeciesSource():
        speciesMapping[getSpeciesName(currentTaxId)] = currentTaxId
# GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. import sys from time import sleep from data_helpers import SpeciesCDSSource, CDSHelper, getSpeciesName, countSpeciesCDS, matchCDSKeyNamesSource, r from rate_limit import RateLimit taxId = int(sys.argv[1]) rl = RateLimit(10) if (countSpeciesCDS(taxId) == 0): print("Species %d (%s) doesn't have any proteins..." % (taxId, getSpeciesName(taxId))) print("Nothing left to do...") sys.exit(0) print("Species %d (%s) has %d proteins stored." % (taxId, getSpeciesName(taxId), countSpeciesCDS(taxId))) print("Will delete it in 10 seconds...") sleep(10) count = 0 for protId in SpeciesCDSSource(taxId): print(protId) cds = CDSHelper(taxId, protId) try: cds.dropShuffledSeqs()
def ingestGenome(args):
    """Register a new species in the redis DB and load its CDS sequences.

    Steps: sanity checks, optional download of the input files from the
    Ensembl FTP site, extraction of the acceptable genes from the gff3 file,
    creation of the species keys in redis, and finally a dry-run load of the
    CDS sequences followed by the real one.

    Args:
        args: Parsed options. Fields read here: short_name, taxid, full_name,
            nuclear_genetic_code, variant, fetch_ftp_files, local_name,
            remote_name, release, section, subsection, gff3, cds.  The object
            is also handed to processGff3() and loadCDSSequences().

    Returns:
        0 on success; -1 when the dry-run load returned None (nothing is
        loaded in that case, but the species keys have already been written).

    Raises:
        AssertionError: the genes-list file already exists, or a file reported
            by the FTP fetch is not an existing regular file.
        Exception: the species already exists in the DB with a CDS list, or the
            gff3 file yielded fewer than 400 genes.
    """
    # Sanity test 1 -- the genes list file must not exist yet (if it does, the
    # short name may have been reused...).  Raised explicitly so the check
    # still runs under `python -O`.
    genesListFilename = "%s/coding_genes_list.%s.list" % (ensembl_data_dir, args.short_name)
    print(genesListFilename)
    if os.path.exists(genesListFilename):
        raise AssertionError("Genes list file already exists: %s" % genesListFilename)

    # Sanity test 2 -- this species doesn't already exist in the DB.
    # A name without a CDS list is tolerated; maybe an earlier run of this
    # script was interrupted.
    if r.exists(speciesNameKey % args.taxid) and r.exists(speciesCDSList % args.taxid):
        raise Exception("Species with taxid=%d already exists as '%s'" %
                        (args.taxid, getSpeciesName(args.taxid)))

    # Sanity tests passed

    genomefn = None
    cdsfn = None
    gff3fn = None

    # Step 1 - get files from Ensembl FTP
    if args.variant == "Ensembl" and args.fetch_ftp_files:
        ftp = EnsemblFTP(args.local_name, args.remote_name, release=args.release,
                         section=args.section, subsection=args.subsection)
        try:
            (genomefn, cdsfn, gff3fn) = ftp.fetchAll()
        finally:
            # Release the connection even when the download fails.
            ftp.close()

        for fetchedPath in (genomefn, cdsfn, gff3fn):
            if not os.path.isfile(fetchedPath):
                raise AssertionError("Fetched file not found: %s" % fetchedPath)
    else:
        gff3fn = args.gff3
        cdsfn = args.cds

    # Step 2 - parse GFF3 file to yield list of acceptable CDS genes
    processGff3(gff3fn, genesListFilename, args)

    # One gene per line; count the lines without keeping them in memory.
    with open(genesListFilename, "r") as f:
        numGenesReturnedFromGff3 = sum(1 for _ in f)

    if numGenesReturnedFromGff3 < 400:
        raise Exception("Processing gff3 file only yielded %d results; aborting" %
                        numGenesReturnedFromGff3)
    print("%d protein-coding genes found in gff3" % numGenesReturnedFromGff3)

    # Step 3 - Add required annotations for this species to redis DB
    # TODO - add redis items here

    # e.g. set "species:taxid:203267:name" "Tropheryma whipplei str. Twist"
    r.set(speciesNameKey % args.taxid, args.full_name)
    # e.g. set "species:name:Tropheryma whipplei str. Twist:taxid" "203267"
    r.set(speciesTaxIdKey % args.full_name, args.taxid)
    # e.g. set "species:taxid:203267:genomic-transl-table" "11"
    r.set(speciesTranslationTableKey % args.taxid, args.nuclear_genetic_code)

    # Step 4 - Load CDS sequences to DB
    print("Doing trial run for gene loading...")
    if loadCDSSequences(cdsfn, genesListFilename, args, dryRun=True) is None:
        print("Dry-run loading may have encountered errors; aborting without actual load...")
        return -1

    print("Trial run succeeded.")
    print("Performing actual gene loading...")
    (cdsLoadedCount, cdsIds, skippedGenes) = loadCDSSequences(
        cdsfn, genesListFilename, args, dryRun=False)

    print("Loaded %d CDS genes..." % cdsLoadedCount)
    if skippedGenes:
        print("Skipped genes: %s" % skippedGenes)

    # Step 5 - Generate randomized sequences
    # TODO

    return 0
genomeSizeMb = float(genomeSizeMb) growthTimeHours = getSpeciesProperty(taxId, 'growth-time-hours-v2')[0] if not growthTimeHours is None: growthTimeHours = float(growthTimeHours) inPhyloTree = taxId in speciesInTree if inPhyloTree: stats.update(['tree']) speciesDf = speciesDf.append( pd.DataFrame({ 'TaxId': pd.Series([taxId], dtype='int'), 'Species': pd.Series([getSpeciesName(taxId)], dtype='str'), 'Nickname': pd.Series([shortNames[taxId]], dtype='str'), 'Source': pd.Series([''], dtype='str'), 'TranslationTbl': pd.Series([getSpeciesTranslationTable(taxId)], dtype='int'), 'InPhyloTree': pd.Series([inPhyloTree], dtype='bool'), 'GenomicGC%': pd.Series([genomicGC], dtype='float'), 'GenomicENc\'': pd.Series([genomicENcprime], dtype='float'), 'GrowthTempC': pd.Series([optimumTemp], dtype='float'), 'GenomeSizeMb':
def readSeriesResultsForSpecies(seriesSourceNumber, species, minShuffledGroups=20, maxShuffledGroups=20, shuffleType=db.Sources.ShuffleCDSv2_python, cdsFilter=None, returnCDS=True):
    """Yield the decoded series results of every sufficiently-computed CDS.

    For each species, the computed records are fetched once with
    getAllComputedSeqsForSpecies(); every CDS of the species is then checked
    and, when enough records exist for it, its native record followed by its
    shuffled records is decoded and yielded.

    Args:
        seriesSourceNumber: Passed to getAllComputedSeqsForSpecies().
        species: A single taxid, or an iterable of taxids (a string is rejected).
        minShuffledGroups: Minimum number of computed shuffled sequences a CDS
            needs in order to be yielded.
        maxShuffledGroups: At most this many shuffled records are returned per
            CDS (in addition to the native one).
        shuffleType: Shuffle kind passed to getAllComputedSeqsForSpecies() and
            CDSHelper.shuffledSeqIds().
        cdsFilter: Optional predicate taking a CDSHelper; a CDS is ignored
            (neither counted as selected nor as skipped) when it returns a
            false value.
        returnCDS: When true, each yielded dict also carries the CDSHelper
            under "cds".

    Yields:
        dict with "taxid" and "content" (list of decoded records, native first;
        entries are None where no record was found) and, if `returnCDS`, "cds".

    Raises:
        Exception: `species` is a string.
        AssertionError: minShuffledGroups > maxShuffledGroups.
    """
    # `basestring` only exists on Python 2; fall back to the Python 3 text/bytes types.
    try:
        stringTypes = basestring
    except NameError:
        stringTypes = (str, bytes)

    if isinstance(species, Iterable):  # usually, species will be a sequence of numeric taxid values
        if isinstance(species, stringTypes):
            raise Exception("species cannot be string")
        # all set - proceed...
    else:
        species = (species,)  # assume we got a single (numeric) taxid value

    assert (minShuffledGroups <= maxShuffledGroups)

    for taxIdForProcessing in species:
        print("Procesing %d sequences for tax-id %d (%s)..." %
              (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing,
               getSpeciesName(taxIdForProcessing)))

        computed = getAllComputedSeqsForSpecies(seriesSourceNumber, taxIdForProcessing,
                                                maxShuffledGroups, shuffleType=shuffleType)
        computedIds = frozenset(computed.keys())
        print("Collecting data from %d computation results..." % len(computed))

        skipped = 0
        selected = 0

        # Iterate over all CDS entries for this species
        for protId in SpeciesCDSSource(taxIdForProcessing):
            cds = CDSHelper(taxIdForProcessing, protId)

            if cdsFilter is not None and not cdsFilter(cds):
                continue

            # Looked up once and reused below (materialized so it can be
            # iterated more than once).
            cdsSeqId = cds.seqId()
            shuffledIds = list(cds.shuffledSeqIds(shuffleType=shuffleType))

            # How many shuffles (for this cds) exist in the data we found?
            computedShufflesCount = len(computedIds.intersection(shuffledIds))

            if computedShufflesCount < minShuffledGroups or cdsSeqId not in computedIds:
                skipped += 1
                continue

            # Native sequence first, then at most maxShuffledGroups shuffled ones.
            seqIds = ([cdsSeqId] + shuffledIds)[:maxShuffledGroups + 1]

            results = [computed.get(seqId) for seqId in seqIds]
            if sum(1 for record in results if record is not None) < minShuffledGroups:
                print("Not enough results found for %s" % protId)
                skipped += 1
                continue

            # Decode the results (missing records stay None)
            results = [
                decodeJsonSeriesRecord(decompressSeriesRecord(record))
                if record is not None else None
                for record in results
            ]

            if returnCDS:
                yield {"taxid": taxIdForProcessing, "content": results, "cds": cds}
            else:
                yield {"taxid": taxIdForProcessing, "content": results}

            del results
            del cds

            selected += 1

            # Progress line, printed only when rl() returns a true value.
            if rl():
                print("# %s - %d records included, %d records skipped" %
                      (datetime.now().isoformat(), selected, skipped))
shortNames = getSpeciesShortestUniqueNamesMapping() # Create an empty data-frame for csv output df = pd.DataFrame( { 'kingdom': pd.Series(dtype="string"), 'full.name': pd.Series(dtype="string"), 'short.name': pd.Series(dtype="string") }, index=pd.Index([], name='tax_id', dtype='int')) # Add kingdom data to the data-frame for k, v in taxidToKingdom.items(): df.loc[k, 'kingdom'] = v df.loc[k, 'full.name'] = getSpeciesName(k) df.loc[k, 'short.name'] = shortNames[k] assert (df.loc[k, 'kingdom'] == v) # Get list of large taxonomic groups (based on the lineages of all species) majorGroups = getMajorTaxonomicGroups(taxidToLineage) # Add a binary membership column for each major group for groupTaxId, _ in majorGroups: groupName = ncbiTaxa.get_taxid_translator([groupTaxId])[groupTaxId] groupName = "Member_%s_%d" % (groupName.replace(" ", "_").replace( "/", "_").replace("-", "_"), groupTaxId) groupDf = pd.DataFrame({groupName: pd.Series(dtype='bool')}, index=pd.Index(df.index.values, name='tax_id',
def speciesStatisticsAndValidityReport(args):
    """Build the per-species statistics/validity report and write it to disk.

    For every species from allSpeciesSource() (minus `speciesToExclude`, and
    restricted to `args.taxid` when that is non-empty) a row is created from
    the stored annotations and the NCBI lineage.  Per-fraction statistics of
    the native sequences and the number of shuffled profiles are computed as
    delayed tasks on the `_distributed` scheduler and merged into the rows.

    Args:
        args: Parsed options; only `args.taxid` (optional whitelist of taxids)
            is read here.

    Side effects:
        Writes 'species_report.html', 'species_report_simple.rst',
        'species_report_simple.html' and 'species_report.xlsx' into the
        current directory and prints progress/error messages.

    Returns:
        None.
    """
    import _distributed

    speciesDf = pd.DataFrame({
        'TaxId': pd.Series([], dtype='int'),  # Species TaxId
        'Species': pd.Series([], dtype='str'),  # Species binomial name
        'Nickname': pd.Series([], dtype='str'),
        'Domain': pd.Categorical([]),  # Bacteria, Eukaryota, Archaea
        'Phylum': pd.Categorical([]),  # Phylum name (string)
        'NumCDSs': pd.Series([], dtype='int'),  # CDS count for this species
        'NumCDSsInProfile': pd.Series([], dtype='int'),  # Num seqs with 20 shuffled profiles for this species
        'AnnotatedNumCDSs': pd.Series([], dtype='int'),
        'CDSDifference': pd.Series([], dtype='float'),
        'NumNativeSeqs': pd.Series([], dtype='int'),
        'GCContentInCDS': pd.Series([], dtype='float'),
        'AnnotatedGCContent': pd.Series([], dtype='float'),
        'RowType': pd.Categorical([]),  # Species count or total
        'Warnings': pd.Series([], dtype='str'),
        'CDSWarnings': pd.Series([], dtype='int'),
        'CDSWarnings_': pd.Series([], dtype='str'),
        'FirstAA': pd.Series([], dtype='str'),
        'LastAA': pd.Series([], dtype='str'),
    })

    scheduler = _distributed.open()

    speciesRows = []  # one single-row frame per species, concatenated after the loop
    delayedCalls_native = []
    delayedCalls_shuffledProfiles = []
    shuffledCounts = {}

    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue  # always exclude species from the blacklist
        if args.taxid and taxId not in args.taxid:
            continue  # if a whitelist is specified, skip other species

        warnings = []

        cdsCountInRedis = countSpeciesCDS(taxId)

        annotatedProteinCount = getSpeciesProperty(taxId, 'protein-count')[0]
        annotatedGCContent = getSpeciesProperty(taxId, 'gc-content')[0]

        # Relative difference (in percent) between stored and annotated CDS counts.
        proteinDifference = None
        if annotatedProteinCount is not None:
            proteinDifference = (1.0 - float(cdsCountInRedis) / float(annotatedProteinCount)) * 100.0
            if abs(proteinDifference) > 9.9:
                warnings.append("CDS_count")
        else:
            warnings.append("No_CDS_count")

        lineage = ncbiTaxa.get_lineage(taxId)
        names = ncbiTaxa.get_taxid_translator(lineage)
        ranks = ncbiTaxa.get_rank(lineage)

        # Determine kingdom/domain: prefer the 'superkingdom' rank, else 'kingdom'.
        kingdomTaxId = [t for t, rank in ranks.items() if rank == 'superkingdom']
        if not kingdomTaxId:
            kingdomTaxId = [t for t, rank in ranks.items() if rank == 'kingdom']
        domain = names[kingdomTaxId[0]]

        # Determine phylum (left empty when the lineage has no 'phylum' rank).
        phylumName = ""
        phylumTaxId = [t for t, rank in ranks.items() if rank == 'phylum']
        if phylumTaxId:
            phylumName = names[phylumTaxId[0]]

        speciesRows.append(pd.DataFrame({
            'TaxId': pd.Series([taxId], dtype='int'),  # Species TaxId
            'Species': pd.Series([getSpeciesName(taxId)], dtype='str'),
            'Nickname': pd.Series([shortNames[taxId]], dtype='str'),
            'Domain': pd.Categorical([domain]),  # Bacteria, Eukaryota, Archaea
            'Phylum': pd.Categorical([phylumName]),  # Phylum name (string)
            'NumCDSs': pd.Series([cdsCountInRedis], dtype='int'),  # CDS count for this species
            'NumCDSsInProfile': pd.Series([0], dtype='int'),  # Num seqs with 20 shuffled profiles
            'AnnotatedNumCDSs': pd.Series(
                [0 if annotatedProteinCount is None else annotatedProteinCount], dtype='int'),
            'CDSDifference': pd.Series([proteinDifference], dtype='float'),
            'NumNativeSeqs': pd.Series([0], dtype='int'),
            'GCContentInCDS': pd.Series([0.0], dtype='float'),
            'AnnotatedGCContent': pd.Series([annotatedGCContent], dtype='float'),
            'RowType': pd.Categorical(["species"]),  # Species count or total
            'Warnings': pd.Series([", ".join(warnings)], dtype='str'),
            'CDSWarnings': pd.Series([0], dtype='int'),
            'CDSWarnings_': pd.Series([""], dtype='str'),
            'FirstAA': pd.Series([""], dtype='str'),
            'LastAA': pd.Series([""], dtype='str'),
            'Source': pd.Series([""], dtype='str'),
        }))

        fractionSize = 1000  # How many sequences (roughly) to process in each task
        # Floor division keeps this an int on Python 3 as well (range() needs an int).
        numFractions = cdsCountInRedis // fractionSize
        if numFractions == 0:
            numFractions = 1

        for i in range(numFractions):
            call = dask.delayed(calcNativeSequencesStatistics)(taxId, i, numFractions)
            delayedCalls_native.append(call)

        # One shuffled-profile count per species (its result is stored per taxId below).
        call = dask.delayed(countShuffledProfiles)(taxId, (310, 10, "begin", 0), 102, 11)
        delayedCalls_shuffledProfiles.append(call)

    # Single concatenation instead of growing the frame row by row.
    speciesDf = pd.concat([speciesDf] + speciesRows, sort=False)
    speciesDf.set_index('TaxId', inplace=True)

    print("Starting {} calls...".format(
        len(delayedCalls_native) + len(delayedCalls_shuffledProfiles)))

    # submit all delayed calculations; obtain futures immediately
    futures = scheduler.compute(delayedCalls_native + delayedCalls_shuffledProfiles)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    # taxId -> (cdsCount, gcCounts, totalCounts, cdsWarnings, warnings, firstAA, lastAA),
    # summed over all fractions of the species
    results = {}
    errorsCount = 0
    for f in futures:
        try:
            ret = scheduler.gather(f)
            if len(ret) == 9:
                # Result of calcNativeSequencesStatistics for one fraction
                (taxId, fraction, cdsCount, gcCounts, totalCounts, cdsWarnings,
                 warnings, firstAA, lastAA) = ret

                current = results.get(
                    taxId, (0, 0, 0, 0, Counter(), Counter(), Counter()))
                increments = (cdsCount, gcCounts, totalCounts, cdsWarnings,
                              warnings, firstAA, lastAA)
                results[taxId] = tuple(
                    total + increment for total, increment in zip(current, increments))

            elif len(ret) == 2:
                # Result of countShuffledProfiles
                (taxId, numShuffledSeqs) = ret
                shuffledCounts[taxId] = numShuffledSeqs

            else:
                raise AssertionError("Unexpected task result of length %d" % len(ret))

        except Exception as e:
            # A failed task is reported and counted; the remaining results are still used.
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        (numNativeSeqs, gcCounts, totalCounts, cdsWarnings, warnings, firstAA, lastAA) = result
        speciesDf.at[taxId, 'NumNativeSeqs'] = numNativeSeqs
        speciesDf.at[taxId, 'GCContentInCDS'] = round(
            float(gcCounts) / float(totalCounts) * 100.0, 1)
        speciesDf.at[taxId, 'CDSWarnings'] = cdsWarnings
        speciesDf.at[taxId, 'CDSWarnings_'] = summarizeCounter(warnings)
        speciesDf.at[taxId, 'FirstAA'] = summarizeCounter(firstAA)
        speciesDf.at[taxId, 'LastAA'] = summarizeCounter(lastAA)

    for taxId, result in shuffledCounts.items():
        speciesDf.at[taxId, 'NumCDSsInProfile'] = result

    speciesDf = speciesDf.sort_values(by=['Domain', 'Species'])  # sort rows

    simpleColumns = [
        'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile', 'AnnotatedNumCDSs',
        'CDSDifference', 'NumNativeSeqs', 'GCContentInCDS', 'AnnotatedGCContent',
        'Phylum', 'Domain'
    ]
    fullColumns = simpleColumns + [
        'Warnings', 'CDSWarnings', 'CDSWarnings_', 'FirstAA', 'LastAA'
    ]

    speciesDf.to_html('species_report.html',
                      float_format='{0:.1f}'.format,
                      columns=fullColumns)

    with open("species_report_simple.rst", "w") as f:
        f.write(
            speciesDf.drop([
                'RowType', 'Warnings', 'CDSWarnings', 'CDSWarnings_', 'FirstAA',
                'LastAA', 'CDSDifference'
            ], axis=1).pipe(tabulate, headers='keys', tablefmt='rst'))

    speciesDf.to_html('species_report_simple.html',
                      float_format='{0:.1f}'.format,
                      columns=simpleColumns)

    speciesDf.to_excel('species_report.xlsx', sheet_name='Species summary')