#!/usr/bin/env python
"""Examines the tags (ot:tag) of every study in the phylesystem.

Tallies how often each unique tag is used, keeping one tally for tags
attached to a study and a separate tally for tags attached to a tree.
"""
from peyotl.manip import iter_trees
from peyotl.phylesystem.phylesystem_umbrella import Phylesystem
from peyotl.nexson_syntax import get_nexml_el
from collections import defaultdict
import codecs
import sys

phy = Phylesystem()
study_dict = defaultdict(int)  # tag -> number of study-level occurrences
tree_dict = defaultdict(int)  # tag -> number of tree-level occurrences
out = codecs.getwriter("utf-8")(sys.stdout)  # UTF-8 writer wrapped around stdout
for study_id, n in phy.iter_study_objs():
    nexml = get_nexml_el(n)
    t = nexml.get("^ot:tag")
    if t:
        # The study-level tag value is either a list of tags or a single tag.
        if isinstance(t, list):
            for tag in t:
                study_dict[tag] += 1
        else:
            study_dict[t] += 1
    for trees_group_id, tree_id, tree in iter_trees(n):
        t = tree.get("^ot:tag")
        if t:
            # NOTE(review): only list-valued tree tags are counted here; a
            # scalar tree tag is ignored, unlike the study-level branch above
            # -- confirm whether that is intended.
            if isinstance(t, list):
                for tag in t:
                    # Tree-level tags go into tree_dict; previously they were
                    # added to study_dict, leaving tree_dict permanently empty
                    # and inflating the study tallies.
                    tree_dict[tag] += 1
from peyotl.phylesystem.phylesystem_umbrella import Phylesystem
from peyotl.nexson_syntax import get_nexml_el
from peyotl.manip import iter_otus
from collections import defaultdict
import argparse
import codecs
import sys
import os

# Command line: one positional argument naming the file to create.
description = __doc__
prog = os.path.basename(sys.argv[0])
parser = argparse.ArgumentParser(prog=prog, description=description)
parser.add_argument('output')
args = parser.parse_args(sys.argv[1:])

# Refuse to overwrite a file that is already there.
if os.path.exists(args.output):
    sys.exit('{} already exists! Exiting...\n'.format(args.output))

phy = Phylesystem()
with codecs.open(args.output, 'w', encoding='utf-8') as out:
    num_unmapped = 0
    for study_id, n in phy.iter_study_objs():
        for og, otu_id, otu in iter_otus(n):
            # OTUs without an OTT taxon name are only counted, not written.
            if '^ot:ottTaxonName' not in otu:
                num_unmapped += 1
                continue
            # One tab-separated line per mapped OTU:
            # study id, otu id, original label, OTT taxon name.
            line = u'{s}\t{o}\t{r}\t{m}\n'.format(
                s=study_id,
                o=otu_id,
                r=otu['^ot:originalLabel'],
                m=otu['^ot:ottTaxonName'])
            out.write(line)
sys.stderr.write('{n:d} unmapped otus\n'.format(n=num_unmapped))
# Build the phylesystem wrapper; on any failure report the error and exit.
try:
    phylsys = Phylesystem()
except Exception as e:
    # NOTE(review): `e.message` exists only on Python 2 exceptions.
    sys.stderr.write('count_trees.py: Exception: {}\n'.format(e.message))
    # sys.exit() with a string prints it to stderr and exits with status 1.
    sys.exit('count_trees.py: There was a problem creating a wrapper around your phylesystem ' \
             'instance. Double check your configuration (see ' \
             'http://opentreeoflife.github.io/peyotl/configuration/ for info).')
# NOTE(review): no except/finally clause for the following `try` is visible
# in this fragment.
try:
    print_freq = 500  # progress is checked every `print_freq` studies
    num_trees = 0  # running total of trees over all studies
    num_studies = 0  # number of studies visited so far
    max_trees_per_study = 0  # largest per-study tree count seen so far
    biggest_study = None  # id of the study holding that maximum
    studies_without_trees = []  # ids of studies whose tree count is 0
    sys.stderr.write('count_trees.py: beginning loop over studies...\n')
    for study_id, nexson in phylsys.iter_study_objs():
        num_studies += 1
        try:
            # The number of trees is the length of whatever
            # extract_tree_nexson returns when no tree_id filter is given.
            nt = len(extract_tree_nexson(nexson, tree_id=None))
        except:
            # Name the offending study on stderr, then re-raise unchanged.
            sys.stderr.write(
                'Problem extracting trees from study {}'.format(study_id))
            raise
        if nt == 0:
            studies_without_trees.append(study_id)
        else:
            num_trees += nt
            # Track the study with the most trees.
            if nt > max_trees_per_study:
                biggest_study = study_id
                max_trees_per_study = nt
        # NOTE(review): the body of this branch is not visible in this fragment.
        if num_studies % print_freq == 0:
# Build the phylesystem wrapper; on any failure report the error and exit.
try:
    phylsys = Phylesystem()
except Exception as e:
    # NOTE(review): `e.message` exists only on Python 2 exceptions.
    sys.stderr.write('count_trees.py: Exception: {}\n'.format(e.message))
    # sys.exit() with a string prints it to stderr and exits with status 1.
    sys.exit('count_trees.py: There was a problem creating a wrapper around your phylesystem '
             'instance. Double check your configuration (see '
             'http://opentreeoflife.github.io/peyotl/configuration/ for info).')
# NOTE(review): no except/finally clause for the following `try` is visible
# in this fragment.
try:
    print_freq = 500  # a progress message is written every `print_freq` studies
    num_trees = 0  # running total of trees over all studies
    num_studies = 0  # number of studies visited so far
    max_trees_per_study = 0  # largest per-study tree count seen so far
    biggest_study = None  # id of the study holding that maximum
    studies_without_trees = []  # ids of studies whose tree count is 0
    sys.stderr.write('count_trees.py: beginning loop over studies...\n')
    for study_id, nexson in phylsys.iter_study_objs():
        num_studies += 1
        try:
            # The number of trees is the length of whatever
            # extract_tree_nexson returns when no tree_id filter is given.
            nt = len(extract_tree_nexson(nexson, tree_id=None))
        except:
            # Name the offending study on stderr, then re-raise unchanged.
            sys.stderr.write('Problem extracting trees from study {}'.format(study_id))
            raise
        if nt == 0:
            studies_without_trees.append(study_id)
        else:
            num_trees += nt
            # Track the study with the most trees.
            if nt > max_trees_per_study:
                biggest_study = study_id
                max_trees_per_study = nt
        # Periodic progress report on stderr.
        if num_studies % print_freq == 0:
            sys.stderr.write(' ...{d} studies read. Still going...\n'.format(d=num_studies))
def write_tree_list(outpath):
    """Score every tree in the phylesystem and write a ranked CSV table.

    Each tree of each study yielded by ``Phylesystem().iter_study_objs()``
    gets one ``Row``.  Its score is::

        new_count + resolve_count - 20 * conflict_count
            + 10 * n_ingroup + 50 * n_preferred + 100 * n_intended

    Rows are sorted best-score-first (ties broken by the secondary keys in
    the sort below) and written to ``outpath`` as UTF-8 CSV with a header
    line.  Study, tree and preferred-tree totals are printed to stdout, as
    is a progress line every 500 trees.

    :param outpath: path of the CSV file to create (overwritten if present).
    """
    conflict_analyses = read_conflict_analyses()
    trees_in_synthesis = read_synthesis_list()
    taxa_in_synthesis = read_synthesis_taxa()
    phylesystem = Phylesystem()

    # (CSV column title, Row attribute) pairs in output order; keeping them
    # together stops the header and the data rows from drifting apart.
    columns = (
        ('tree', 'id'),
        ('intended', 'n_intended'),
        ('preferred', 'n_preferred'),
        ('has ingroup', 'n_ingroup'),
        ('has method', 'n_ctype'),
        ('root confirmed', 'root_confirmed'),
        ('in synth', 'n_synth'),
        ('#tips', 'tip_count'),
        ('#mapped', 'ott_count'),
        ('#new', 'new_count'),
        ('#resolved', 'resolve_count'),
        ('#conflicts', 'conflict_count'),
        ('score', 'score'),
    )

    study_count = 0
    tree_count = 0
    preferred_count = 0
    table = []
    for study_id, nexson in phylesystem.iter_study_objs():
        study_count += 1
        nexml_el = nexson[u'nexml']

        # Per-study flag: 0 when the study is marked not intended for
        # synthesis, 2 otherwise.  The comparison is deliberately `== True`
        # so that only a value equal to True (not any truthy value such as a
        # non-empty string) disables the study.
        not_intended = nexml_el.get(u'^ot:notIntendedForSynthesis')
        n_intended = 0 if not_intended == True else 2

        candidates = nexml_el.get(u'^ot:candidateTreeForSynthesis')
        if candidates is None:
            candidates = []

        tid_tree_otug = extract_tree_nexson(nexson, tree_id=None)
        for tree_id, tree, otu_group in tid_tree_otug:
            tree_count += 1
            row = Row()
            long_id = '%s@%s' % (study_id, tree_id)
            row.id = long_id
            row.n_intended = n_intended  # per study

            if not candidates:
                # No selection(s) made.
                if len(tid_tree_otug) == 1:
                    n_preferred = 2  # Only one tree; use it
                else:
                    n_preferred = 1  # More than one tree; decision required
            elif tree_id in candidates:
                preferred_count += 1
                n_preferred = 2  # This is a preferred tree; use it
            else:
                n_preferred = 0  # Not preferred, another is; do not use
            row.n_preferred = n_preferred

            # 1 when a non-empty curated type is recorded for the tree.
            ctype = tree.get('^ot:curatedType')
            row.n_ctype = int(ctype is not None and ctype != '')

            # whether a curator has confirmed the root
            root = tree.get('^ot:specifiedRoot')
            row.root_confirmed = int(root is not None and root != '')

            row.n_synth = 1 if long_id in trees_in_synthesis else 0

            ingroup_node_id = tree.get('^ot:inGroupClade')
            row.n_ingroup = 0 if ingroup_node_id is None else 1

            row.tip_count, row.ott_count, row.new_count = examine_tree(
                tree, otu_group, ingroup_node_id, taxa_in_synthesis)

            # Conflict/resolve counts default to 0 when no analysis exists
            # for this tree; otherwise they come from fields 1 and 2 of it.
            row.conflict_count = 0
            row.resolve_count = 0
            analysis = conflict_analyses.get(long_id)
            if analysis is not None:
                row.conflict_count = int(analysis[1])
                row.resolve_count = int(analysis[2])

            row.score = ((row.new_count + row.resolve_count)
                         - (row.conflict_count * 20)
                         + (row.n_ingroup * 10)
                         + (row.n_preferred * 50)
                         + (row.n_intended * 100))
            table.append(row)

            if tree_count % 500 == 0:
                # Single-argument print(): same output under the Python 2
                # print statement and the Python 3 print function.
                print('%s %s %s' % (tree_count, long_id, ctype))

    table.sort(key=lambda row: (
        -row.score,
        row.n_intended == 0,   # whether intended for synthesis
        -row.n_preferred,      # whether preferred
        -row.n_ingroup,        # whether ingroup is designated
        row.conflict_count,    # number of synth tree conflicts
        -row.new_count,        # the '#new' count returned by examine_tree
        -row.n_ctype,          # whether there's a 'curated type'
        -row.tip_count,        # total number of tips (for comparison)
    ))

    with codecs.open(outpath, 'w', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([title for title, _ in columns])
        for row in table:
            writer.writerow([getattr(row, attr) for _, attr in columns])

    print('studies: %s' % study_count)
    print('trees: %s' % tree_count)
    print('preferred: %s' % preferred_count)
def write_tree_list(outpath):
    """Score every tree in the phylesystem and write a ranked CSV table.

    Each tree of each study yielded by ``Phylesystem().iter_study_objs()``
    gets one ``Row``.  Its score is::

        new_count + resolve_count - 20 * conflict_count
            + 10 * n_ingroup + 50 * n_preferred + 100 * n_intended

    Rows are sorted best-score-first (ties broken by the secondary keys in
    the sort below) and written to ``outpath`` as UTF-8 CSV with a header
    line.  Study, tree and preferred-tree totals are printed to stdout, as
    is a progress line every 500 trees.

    :param outpath: path of the CSV file to create (overwritten if present).
    """
    conflict_analyses = read_conflict_analyses()
    trees_in_synthesis = read_synthesis_list()
    taxa_in_synthesis = read_synthesis_taxa()
    phylesystem = Phylesystem()

    # (CSV column title, Row attribute) pairs in output order; keeping them
    # together stops the header and the data rows from drifting apart.
    columns = (
        ('tree', 'id'),
        ('intended', 'n_intended'),
        ('preferred', 'n_preferred'),
        ('has ingroup', 'n_ingroup'),
        ('has method', 'n_ctype'),
        ('root confirmed', 'root_confirmed'),
        ('in synth', 'n_synth'),
        ('#tips', 'tip_count'),
        ('#mapped', 'ott_count'),
        ('#new', 'new_count'),
        ('#resolved', 'resolve_count'),
        ('#conflicts', 'conflict_count'),
        ('score', 'score'),
    )

    study_count = 0
    tree_count = 0
    preferred_count = 0
    table = []
    for study_id, nexson in phylesystem.iter_study_objs():
        study_count += 1
        nexml_el = nexson[u'nexml']

        # Per-study flag: 0 when the study is marked not intended for
        # synthesis, 2 otherwise.  The comparison is deliberately `== True`
        # so that only a value equal to True (not any truthy value such as a
        # non-empty string) disables the study.
        not_intended = nexml_el.get(u'^ot:notIntendedForSynthesis')
        n_intended = 0 if not_intended == True else 2

        candidates = nexml_el.get(u'^ot:candidateTreeForSynthesis')
        if candidates is None:
            candidates = []

        tid_tree_otug = extract_tree_nexson(nexson, tree_id=None)
        for tree_id, tree, otu_group in tid_tree_otug:
            tree_count += 1
            row = Row()
            long_id = '%s@%s' % (study_id, tree_id)
            row.id = long_id
            row.n_intended = n_intended  # per study

            if not candidates:
                # No selection(s) made.
                if len(tid_tree_otug) == 1:
                    n_preferred = 2  # Only one tree; use it
                else:
                    n_preferred = 1  # More than one tree; decision required
            elif tree_id in candidates:
                preferred_count += 1
                n_preferred = 2  # This is a preferred tree; use it
            else:
                n_preferred = 0  # Not preferred, another is; do not use
            row.n_preferred = n_preferred

            # 1 when a non-empty curated type is recorded for the tree.
            ctype = tree.get('^ot:curatedType')
            row.n_ctype = int(ctype is not None and ctype != '')

            # whether a curator has confirmed the root
            root = tree.get('^ot:specifiedRoot')
            row.root_confirmed = int(root is not None and root != '')

            row.n_synth = 1 if long_id in trees_in_synthesis else 0

            ingroup_node_id = tree.get('^ot:inGroupClade')
            row.n_ingroup = 0 if ingroup_node_id is None else 1

            row.tip_count, row.ott_count, row.new_count = examine_tree(
                tree, otu_group, ingroup_node_id, taxa_in_synthesis)

            # Conflict/resolve counts default to 0 when no analysis exists
            # for this tree; otherwise they come from fields 1 and 2 of it.
            row.conflict_count = 0
            row.resolve_count = 0
            analysis = conflict_analyses.get(long_id)
            if analysis is not None:
                row.conflict_count = int(analysis[1])
                row.resolve_count = int(analysis[2])

            row.score = ((row.new_count + row.resolve_count)
                         - (row.conflict_count * 20)
                         + (row.n_ingroup * 10)
                         + (row.n_preferred * 50)
                         + (row.n_intended * 100))
            table.append(row)

            if tree_count % 500 == 0:
                # Single-argument print(): same output under the Python 2
                # print statement and the Python 3 print function.
                print('%s %s %s' % (tree_count, long_id, ctype))

    table.sort(key=lambda row: (
        -row.score,
        row.n_intended == 0,   # whether intended for synthesis
        -row.n_preferred,      # whether preferred
        -row.n_ingroup,        # whether ingroup is designated
        row.conflict_count,    # number of synth tree conflicts
        -row.new_count,        # the '#new' count returned by examine_tree
        -row.n_ctype,          # whether there's a 'curated type'
        -row.tip_count,        # total number of tips (for comparison)
    ))

    with codecs.open(outpath, 'w', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([title for title, _ in columns])
        for row in table:
            writer.writerow([getattr(row, attr) for _, attr in columns])

    print('studies: %s' % study_count)
    print('trees: %s' % tree_count)
    print('preferred: %s' % preferred_count)