def tst(noise, target, source): print '##', noise sep = Taxonomy.getRawTaxonomy('tax/skel/', 'ott') t = Taxonomy.getRawTaxonomy(target, 'target') s = Taxonomy.getRawTaxonomy(source, 'source') u = combine(sep, t, s, blustery) u.dumpChoices('/tmp/align_tests_choices.tsv') subprocess.call(['cat', '/tmp/align_tests_choices.tsv']) if False: u.dumpLog('/tmp/align_tests_log.tsv') subprocess.call(['cat', '/tmp/align_tests_log.tsv']) print
def doit(tax_path, ids_path): ott = Taxonomy.getRawTaxonomy(tax_path, 'ott') all_nodes = {} with open(ids_path, 'r') as infile: reader = csv.reader(infile, delimiter='\t') otu_count = 0 for row in reader: id = row[0] if otu_count % 50000 == 0: print otu_count, id otu_count += 1 node = ott.lookupId(id) if node != None: all_nodes[node.id] = node print 'OTT taxa assigned to OTUs:', len(all_nodes) prefix_to_count = {} ott_count = 0 for id in all_nodes: node = all_nodes[id] ott_count += 1 for qid in node.sourceIds: prefix = qid.prefix count = prefix_to_count.get(prefix, 0) prefix_to_count[prefix] = count + 1 print 'OTT ids assigned to OTUs:', otu_count for prefix in prefix_to_count: print prefix, prefix_to_count[prefix]
def report(dir, idspace):
    """Write the homonym report for the taxonomy stored under tax/<dir>/.

    dir:     name of the taxonomy subdirectory of 'tax'; also used as the
             prefix of the report file name.
    idspace: idspace passed to Taxonomy.getRawTaxonomy.

    The report goes to <report_dir>/<dir>-homonym-report.csv, where
    report_dir is a module-level name; the directory is created if absent.
    """
    taxonomy_dir = os.path.join('tax', dir, '')   # trailing '' adds the final separator
    tax = Taxonomy.getRawTaxonomy(taxonomy_dir, idspace)

    # Disabled steps, kept for reference:
    # tax.smush()
    # HomonymReport.homonymDensityReport(tax, dir + '-density-report.csv')
    # HomonymReport.homonymUncertaintyReport(tax, 'reports/' + dir + '-uncertainty-report.csv')

    if not os.path.isdir(report_dir):
        os.makedirs(report_dir)

    report_path = os.path.join(report_dir, dir + '-homonym-report.csv')
    HomonymReport.homonymReport(tax, report_path)
def report(dir, idspace):
    """Produce <report_dir>/<dir>-homonym-report.csv for one taxonomy.

    dir:     subdirectory of 'tax' holding the taxonomy to load; also the
             prefix of the output file name.
    idspace: idspace given to Taxonomy.getRawTaxonomy when loading.

    report_dir is a module-level name; it is created when it does not
    already exist as a directory.
    """
    source_dir = os.path.join('tax', dir, '')     # '' forces a trailing separator
    tax = Taxonomy.getRawTaxonomy(source_dir, idspace)

    # Currently disabled:
    # tax.smush()
    # HomonymReport.homonymDensityReport(tax, dir + '-density-report.csv')
    # HomonymReport.homonymUncertaintyReport(tax, 'reports/' + dir + '-uncertainty-report.csv')

    if not os.path.isdir(report_dir):
        os.makedirs(report_dir)

    destination = os.path.join(report_dir, dir + '-homonym-report.csv')
    HomonymReport.homonymReport(tax, destination)
def doit(ott, sep, outpath, conpath):
    """Write the overall table and the per-source breakdown for `ott`.

    ott:     taxonomy to report on.
    sep:     accepted for the caller's benefit; not used by this function.
    outpath: destination passed to overall_table.
    conpath: destination passed to source_breakdown_table.
    """
    # The Ruggiero comparison is switched off.  The test that used to
    # enable it was os.path.isdir('out/ruggiero').
    ruggiero_enabled = False
    if ruggiero_enabled:
        ruggiero = Taxonomy.getRawTaxonomy('out/ruggiero/', 'rug')
        # Prepare for conflict analysis
        # oh no, we really need a separation taxonomy to do that.
        alignment = AlignmentByName(ruggiero, ott)
        alignment.align()
        conflict = ConflictAnalysis(ruggiero, ott, alignment, True)

    overall_table(ott, outpath)
    source_breakdown_table(ott, conpath)
def retain_ids(ott, prev_path, by_qid):
    """Pin known OTT ids onto `ott`, then carry ids over from a previous version.

    ott:       taxonomy being assembled; modified in place.
    prev_path: location of the previous taxonomy, loaded with idspace 'ott'
               and used as the source of ids to re-use.
    by_qid:    passed through unchanged to retain_ids_from_list.

    Steps, in order: ad hoc NCBI-based assignments, a hand-maintained list
    of forced ids, a few one-off fixes, clean-up of the previous taxonomy,
    ott.carryOverIds, and finally retain_ids_from_list.
    """

    # ad hoc assignments specifically for NCBI taxa, based on NCBI id
    for (ncbi_id, ott_id, name) in ncbi_ott_assignments.ncbi_assignments_list:
        im = ott.lookupQid(QualifiedId('ncbi', ncbi_id))
        if im == None:
            print '* ncbi:%s not found in OTT - %s' % (ncbi_id, name)
        else:
            # A name mismatch is only reported; the id is still added.
            if im.name != name:
                print '* ncbi:%s name is %s, but expected %s' % (ncbi_id, im.name, name)
            im.addId(ott_id)

    # Force some id assignments... will try to automate this in the future.
    # Most of these come from looking at the deprecated.tsv file after a
    # series of smasher runs.
    # Each entry is (taxon name, name of a containing taxon, id to force).

    for (inf, sup, id) in [
            ('Tipuloidea', 'Diptera', '722875'),
            ('Saccharomycetes', 'Saccharomycotina', '989999'),
            ('Phaeosphaeria', 'Ascomycota', '5486272'),
            ('Synedra acus', 'Eukaryota', '992764'),
            ('Hessea', 'Archaeplastida', '600099'),
            ('Morganella', 'Arthropoda', '6400'),
            ('Rhynchonelloidea', 'Rhynchonellidae', '5316010'),
            ('Morganella', 'Fungi', '973932'),
            ('Parmeliaceae', 'Lecanorales', '305904'),
            ('Cordana', 'Ascomycota', '946160'),
            ('Pseudofusarium', 'Ascomycota', '655794'),
            ('Marssonina', 'Dermateaceae', '372158'), # ncbi:324777
            ('Marssonia', 'Lamiales', '5512668'), # gbif:7268388
            # ('Gloeosporium', 'Pezizomycotina', '75019'), # synonym for Marssonina
            ('Escherichia coli', 'Enterobacteriaceae', '474506'), # ncbi:562
            # ('Dischloridium', 'Trichocomaceae', '895423'),
            ('Exaiptasia pallida', 'Cnidaria', '135923'),
            ('Choanoflagellida', 'Holozoa', '202765'),
            ('Billardiera', 'Lamiales', '798963'),
            ('Trachelomonas grandis', 'Bacteria', '58035'), # study ot_91 Tr46259
            ('Hypomyzostoma', 'Myzostomida', '552744'), # was incorrectly in Annelida
            ('Gyromitus', 'SAR', '696946'),
            ('Pseudogymnoascus destructans', 'Pezizomycotina', '428163'),
            # ('Amycolicicoccus subflavus', 'Mycobacteriaceae', '541768'), # ncbi:639313
            # ('Pohlia', 'Foraminifera', '5325989') - NO
            ('Pohlia', 'Amphibia', '5325989'), # irmng:1311321
            ('Phyllanthus', 'Pentapetalae', '452944'), # pg_25 @josephwb = 5509975
    ]:
        tax = ott.maybeTaxon(inf, sup)
        # Entries whose taxon cannot be found are skipped silently.
        if tax != None:
            tax.setId(id)

    # Second id for Phyllanthus (see the last entry of the list above).
    ott.taxon('452944').addId('5509975')

    # ott.taxon('474506') ...

    # NOTE(review): unlike the Trichosporon case below, this result is used
    # without a None check -- confirm taxonThatContains cannot return None here.
    ott.taxonThatContains('Rhynchonelloidea', 'Sphenarina').setId('795939') # NCBI

    # Trichosporon is a mess, because it occurs 3 times in NCBI.
    trich = ott.taxonThatContains('Trichosporon', 'Trichosporon cutaneum')
    if trich != None:
        trich.setId('364222')

    #ott.image(fungi.taxon('11060')).setId('4107132') #Cryptococcus - a total mess

    # --------------------
    # Assign OTT ids to taxa that don't have them, re-using old ids when possible
    ids = Taxonomy.getRawTaxonomy(prev_path, 'ott')

    # Edit the id source taxonomy to optimize id coverage

    # Kludge to undo lossage in OTT 2.9
    # Drops a leading ncbi source id when the next source id is from silva.
    for taxon in ids.taxa():
        if (len(taxon.sourceIds) >= 2 and
            taxon.sourceIds[0].prefix == "ncbi" and
            taxon.sourceIds[1].prefix == "silva"):
            taxon.sourceIds.remove(taxon.sourceIds[0])

    # OTT 2.9 has both Glaucophyta and Glaucophyceae...
    # this creates an ambiguity when aligning.
    # Need to review this; maybe they *should* be separate taxa.
    g1 = ids.maybeTaxon('Glaucophyta')
    g2 = ids.maybeTaxon('Glaucophyceae')
    if g1 != None and g2 != None and g1 != g2:
        g1.absorb(g2)

    # Assign old ids to nodes in the new version
    ott.carryOverIds(ids) # Align & copy ids

    print '-- Checking id list'
    retain_ids_from_list(ott, by_qid)
def tst(target, source, want):
    """Build one test case and append it to the module-level `tests` list.

    target: location passed to Taxonomy.getRawTaxonomy for the target side.
    source: location passed to Taxonomy.getRawTaxonomy for the source side.
    want:   stored with the case, uninterpreted here.

    The stored tuple is (target taxonomy, source taxonomy, combined
    result, want).
    """
    global tests
    target_tax = Taxonomy.getRawTaxonomy(target, 'target')
    source_tax = Taxonomy.getRawTaxonomy(source, 'source')
    combined = combine(target_tax, source_tax, blustery)
    case = (target_tax, source_tax, combined, want)
    tests.append(case)
# Select part of a taxonomy by taxon name and write the selection out,
# either as a taxonomy directory or as a Newick (.tre) file.
#
# Arguments:
#   1. source taxonomy directory (must end in /)
#   2. name of the taxon to extract
#   3. destination: a directory (ending in /) or a file ending in .tre

import sys
import codecs

from org.opentreeoflife.taxa import Taxonomy, Newick

# Fail with a usage message rather than an IndexError traceback when
# arguments are missing.  Extra arguments are still tolerated.
if len(sys.argv) < 4:
    print >>sys.stderr, 'Usage: %s source/ taxon-name dest/|dest.tre' % sys.argv[0]
    sys.exit(1)

source = sys.argv[1]   # Name of directory containing original taxonomy (must end in /)
name = sys.argv[2]     # Name of taxon to extract
dest = sys.argv[3]     # Where to store the result: directory ending in /, or a .tre file

# Check the destination before doing the expensive taxonomy load.
if not dest.endswith(('/', '.tre')):
    print >>sys.stderr, 'Invalid taxonomy destination (need / or .tre)', dest
    sys.exit(1)

selection = Taxonomy.getRawTaxonomy(source, 'foo').select(name)

if dest.endswith('.tre'):
    # Newick output is written as UTF-8 with a trailing newline.
    with codecs.open(dest, 'w', 'utf-8') as outfile:
        outfile.write(Newick.toNewick(selection, Newick.USE_NAMES_AND_IDS))
        outfile.write('\n')
else:
    selection.dump(dest)
import sys from org.opentreeoflife.taxa import Taxonomy, Newick source = sys.argv[1] # Name of directory containing original taxonomy (must end in /) ott = Taxonomy.getRawTaxonomy(source, 'ott') count = 0 grafts = 0 non_tip_grafts = 0 # Seen = seen idspaces among ancestors. # Returns set of seen idspaces. def recur(taxon, seen): global count, grafts, non_tip_grafts count += 1 # idspace (source) of taxon space = taxon.sourceIds.get(0).prefix all = empty() seen_child = adjoin(space, seen) for child in taxon.getChildren(): under = recur(child, seen_child) child_space = child.sourceIds.get(0).prefix if child_space != space: # A graft or resolution. if intersectp(under, seen): # A resolution. print 'resolve', child, taxon, child.rank
* copied = total number of nodes originating from this source (copied) * aligned = number of source nodes aligned and copied * absorbed = number of source nodes absorbed (not copied) * conflict = number of inconsistent source nodes (not copied) """ def dump_table_as_csv(table, outfile): # Provide CSV form for Pensoft writer = csv.writer(outfile) for row in table: writer.writerow(row) def max_depth(node): m = 0 for child in node.getChildren(): d = max_depth(child) + 1 if d > m: m = d return m if __name__ == '__main__': taxpath = sys.argv[1] seppath = sys.argv[2] outpath = sys.argv[3] # general report, JSON conpath = sys.argv[4] # contributions, CSV sep = Taxonomy.getRawTaxonomy(seppath, 'ott') ott = Taxonomy.getRawTaxonomy(taxpath, 'ott') ott.inferFlags() doit(ott, sep, outpath, conpath)
import sys from org.opentreeoflife.taxa import Taxonomy, Rank ott = Taxonomy.getRawTaxonomy(sys.argv[1], 'ott') # Look for splitting: # Suppose X, Y are distinct in GBIF, but both align to X in NCBI, # because NCBI says Y a synonym of X. # Then we have X in NCBI with GBIF X and Y aligning to it, and # Y a synonym via NCBI but not via GBIF. # So GBIF X is a source for X, and GBIF Y is in sources for Y-synonym of X. for X in ott.taxa(): # Species only if X.rank != Rank.SPECIES_RANK: continue xid = X.sourceIds[0].id # Look for Y, a synonym of X... for Y in X.getSynonyms(): yids = [qid.id for qid in Y.sourceIds] # that has same source as X... if not xid in yids: continue # but, an alignment from Y for yid in yids:
* conflict = number of inconsistent source nodes (not copied) """ def dump_table_as_csv(table, outfile): # Provide CSV form for Pensoft writer = csv.writer(outfile) for row in table: writer.writerow(row) def max_depth(node): m = 0 for child in node.getChildren(): d = max_depth(child) + 1 if d > m: m = d return m if __name__ == '__main__': taxpath = sys.argv[1] seppath = sys.argv[2] outpath = sys.argv[3] # general report, JSON conpath = sys.argv[4] # contributions, CSV sep = Taxonomy.getRawTaxonomy(seppath, 'ott') ott = Taxonomy.getRawTaxonomy(taxpath, 'ott') ott.inferFlags() doit(ott, sep, outpath, conpath)
return qid return None def same_epithet(name1, name2): epi1 = epithet_stem(name1) if epi1 == None: return False epi2 = epithet_stem(name2) if epi2 == None: return False return epi1 == epi2 def epithet_stem(name): s = name.split(' ', 1) epi = s[-1] if epi.endswith('us'): return epi[0:-2] if epi.endswith('um'): return epi[0:-2] if epi.endswith('a'): return epi[0:-1] return epi if __name__ == '__main__': taxpath = sys.argv[1] inpath = sys.argv[2] outpath = sys.argv[3] reportpath = sys.argv[4] tax = Taxonomy.getRawTaxonomy(taxpath, 'ott') tax.startQidIndex() print 'ncbi:98683 =', tax.lookupQid(QualifiedId('ncbi', '98683')) (page_to_nodes, node_to_pages) = load_eol_page_ids(inpath, tax) dump_mapping(node_to_pages, outpath) report(page_to_nodes, reportpath)