#!/usr/bin/env python
"""Examines the tags (ot:tag) study. Prints out a list 
of each unique tag used in the studies """
from peyotl.manip import iter_trees
from peyotl.phylesystem.phylesystem_umbrella import Phylesystem
from peyotl.nexson_syntax import get_nexml_el
from collections import defaultdict
import codecs
import sys

def _tally_tags(tag_value, counts):
    """Add the tag(s) held in an ``^ot:tag`` value to the *counts* mapping.

    *tag_value* may be a list of tags or a single bare tag; a missing or
    empty value is ignored.
    """
    if not tag_value:
        return
    if isinstance(tag_value, list):
        for tag in tag_value:
            counts[tag] += 1
    else:
        counts[tag_value] += 1


phy = Phylesystem()
study_dict = defaultdict(int)  # tag -> occurrences on study-level nexml elements
tree_dict = defaultdict(int)  # tag -> occurrences on individual trees
# UTF-8 writer wrapped around stdout (not used in the statements below).
out = codecs.getwriter("utf-8")(sys.stdout)
for study_id, n in phy.iter_study_objs():
    nexml = get_nexml_el(n)
    _tally_tags(nexml.get("^ot:tag"), study_dict)
    for trees_group_id, tree_id, tree in iter_trees(n):
        # Tree-level tags get their own tally; they used to be added to
        # study_dict, and a tree carrying a single (non-list) tag was dropped.
        _tally_tags(tree.get("^ot:tag"), tree_dict)
Exemple #2
0
from peyotl.phylesystem.phylesystem_umbrella import Phylesystem
from peyotl.nexson_syntax import get_nexml_el
from peyotl.manip import iter_otus
from collections import defaultdict
import argparse
import codecs
import sys
import os

# Command line: a single positional argument naming the file to create.
description = __doc__
prog = os.path.basename(sys.argv[0])
parser = argparse.ArgumentParser(prog=prog, description=description)
parser.add_argument('output')
args = parser.parse_args(sys.argv[1:])

# Refuse to clobber an existing file.
if os.path.exists(args.output):
    sys.exit('{} already exists! Exiting...\n'.format(args.output))

phy = Phylesystem()
num_unmapped = 0
with codecs.open(args.output, 'w', encoding='utf-8') as out:
    for study_id, nexson in phy.iter_study_objs():
        for otus_group, otu_id, otu in iter_otus(nexson):
            # OTUs without an OTT taxon name are only counted, not written.
            if '^ot:ottTaxonName' not in otu:
                num_unmapped += 1
                continue
            # One tab-separated line per mapped OTU:
            # study id, OTU id, original label, OTT taxon name.
            line = u'{s}\t{o}\t{r}\t{m}\n'.format(
                s=study_id,
                o=otu_id,
                r=otu['^ot:originalLabel'],
                m=otu['^ot:ottTaxonName'])
            out.write(line)
sys.stderr.write('{n:d} unmapped otus\n'.format(n=num_unmapped))
Exemple #3
0
# Create the phylesystem wrapper; on failure report the underlying error and
# exit with a hint about the configuration.
try:
    phylsys = Phylesystem()
except Exception as e:
    # Format the exception itself instead of `e.message`: that attribute is
    # deprecated since Python 2.6, is empty for multi-argument exceptions and
    # does not exist on Python 3.
    sys.stderr.write('count_trees.py: Exception: {}\n'.format(e))
    sys.exit('count_trees.py: There was a problem creating a wrapper around your phylesystem '
             'instance. Double check your configuration (see '
             'http://opentreeoflife.github.io/peyotl/configuration/ for info).')
try:
    print_freq = 500
    num_trees = 0
    num_studies = 0
    max_trees_per_study = 0
    biggest_study = None
    studies_without_trees = []
    sys.stderr.write('count_trees.py: beginning loop over studies...\n')
    for study_id, nexson in phylsys.iter_study_objs():
        num_studies += 1
        try:
            nt = len(extract_tree_nexson(nexson, tree_id=None))
        except:
            sys.stderr.write(
                'Problem extracting trees from study {}'.format(study_id))
            raise
        if nt == 0:
            studies_without_trees.append(study_id)
        else:
            num_trees += nt
            if nt > max_trees_per_study:
                biggest_study = study_id
                max_trees_per_study = nt
        if num_studies % print_freq == 0:
Exemple #4
0
# Create the phylesystem wrapper; on failure report the underlying error and
# exit with a hint about the configuration.
try:
    phylsys = Phylesystem()
except Exception as e:
    # Format the exception itself instead of `e.message`: that attribute is
    # deprecated since Python 2.6, is empty for multi-argument exceptions and
    # does not exist on Python 3.
    sys.stderr.write('count_trees.py: Exception: {}\n'.format(e))
    sys.exit('count_trees.py: There was a problem creating a wrapper around your phylesystem '
             'instance. Double check your configuration (see '
             'http://opentreeoflife.github.io/peyotl/configuration/ for info).')
try:
    print_freq = 500
    num_trees = 0
    num_studies = 0
    max_trees_per_study = 0
    biggest_study = None
    studies_without_trees = []
    sys.stderr.write('count_trees.py: beginning loop over studies...\n')
    for study_id, nexson in phylsys.iter_study_objs():
        num_studies += 1
        try:
            nt = len(extract_tree_nexson(nexson, tree_id=None))
        except:
            sys.stderr.write('Problem extracting trees from study {}'.format(study_id))
            raise
        if nt == 0:
            studies_without_trees.append(study_id)
        else:
            num_trees += nt
            if nt > max_trees_per_study:
                biggest_study = study_id
                max_trees_per_study = nt
        if num_studies % print_freq == 0:
            sys.stderr.write('   ...{d} studies read. Still going...\n'.format(d=num_studies))
def write_tree_list(outpath):
    conflict_analyses = read_conflict_analyses()
    trees_in_synthesis = read_synthesis_list()
    taxa_in_synthesis = read_synthesis_taxa()
    phylesystem = Phylesystem()
    study_count = 0
    tree_count = 0
    preferred_count = 0
    table = []
    for study_id, nexson in phylesystem.iter_study_objs():
        study_count += 1
        nexml_el = nexson[u'nexml']
        n_intended = 1
        not_intended = nexml_el.get(u'^ot:notIntendedForSynthesis')
        if not_intended == True:
            n_intended = 0
        else:
            n_intended = 2
        candidates = nexml_el.get(u'^ot:candidateTreeForSynthesis')
        if candidates == None: candidates = []
        tid_tree_otug = extract_tree_nexson(nexson, tree_id=None)
        for (tree_id, tree, otu_group) in tid_tree_otug:
            tree_count += 1
            row = Row()

            # otu_group = otu_groups[ogi]['otuById']
            long_id = '%s@%s' % (study_id, tree_id)
            row.id = long_id

            row.n_intended = n_intended  # per study

            if len(candidates) == 0: # No selection(s) made
                if len(tid_tree_otug) == 1:
                    n_preferred = 2    # Only one tree; use it
                else:
                    n_preferred = 1    # More than one tree; decision required
            else:
                if tree_id in candidates:
                    preferred_count += 1
                    n_preferred = 2    # This is a preferred tree; use it
                else:
                    n_preferred = 0    # Not preferred, another is; do not use
            row.n_preferred = n_preferred

            ctype = tree.get('^ot:curatedType')
            n_ctype = 0
            if ctype != None and ctype != '':
                n_ctype = 1
            row.n_ctype = n_ctype

            # whether a curator has confirmed the root
            root = tree.get('^ot:specifiedRoot')
            root_confirmed = 0
            if root != None and root != '':
                root_confirmed = 1
            row.root_confirmed = root_confirmed

            row.n_synth = 1 if long_id in trees_in_synthesis else 0

            ingroup_node_id = tree.get('^ot:inGroupClade')
            row.n_ingroup = (1 if (ingroup_node_id != None) else 0)

            (row.tip_count, row.ott_count, row.new_count) = \
                examine_tree(tree, otu_group, ingroup_node_id, taxa_in_synthesis)

            row.conflict_count = 0
            row.resolve_count = 0
            analysis = conflict_analyses.get(long_id)
            if analysis != None:
                row.conflict_count = int(analysis[1])
                row.resolve_count = int(analysis[2])

            row.score = ((row.new_count + row.resolve_count) -
                         (row.conflict_count * 20) +
                         (row.n_ingroup * 10) +
                         (row.n_preferred * 50) +
                         (row.n_intended * 100))

            table.append(row)
            if tree_count % 500 == 0:
                print tree_count, long_id, ctype
    table.sort(key=lambda row:(-row.score,
                               row.n_intended == 0,   # whether intended for synthesis
                               -row.n_preferred,   # whether preferred
                               -row.n_ingroup,   # whether ingroup is designated
                               row.conflict_count,    # number of synth tree conflicts
                               -row.new_count,   # number of OTUs mapped to OTT
                               -row.n_ctype,   # whether there's a 'curated type'
                               -row.tip_count,   # total number of tips (for comparison)
                               ))
    with codecs.open(outpath, 'w', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['tree', 'intended', 'preferred', 'has ingroup',
                         'has method', 'root confirmed', 'in synth', '#tips',
                         '#mapped', '#new', '#resolved', '#conflicts',
                         'score'])
        for row in table:
            writer.writerow([row.id, row.n_intended, row.n_preferred,
                             row.n_ingroup, row.n_ctype,
                             row.root_confirmed, row.n_synth,
                             row.tip_count, row.ott_count,
                             row.new_count,
                             row.resolve_count,
                             row.conflict_count,
                             row.score])
    print 'studies:', study_count
    print 'trees:', tree_count
    print 'preferred:', preferred_count
def write_tree_list(outpath):
    conflict_analyses = read_conflict_analyses()
    trees_in_synthesis = read_synthesis_list()
    taxa_in_synthesis = read_synthesis_taxa()
    phylesystem = Phylesystem()
    study_count = 0
    tree_count = 0
    preferred_count = 0
    table = []
    for study_id, nexson in phylesystem.iter_study_objs():
        study_count += 1
        nexml_el = nexson[u'nexml']
        n_intended = 1
        not_intended = nexml_el.get(u'^ot:notIntendedForSynthesis')
        if not_intended == True:
            n_intended = 0
        else:
            n_intended = 2
        candidates = nexml_el.get(u'^ot:candidateTreeForSynthesis')
        if candidates == None: candidates = []
        tid_tree_otug = extract_tree_nexson(nexson, tree_id=None)
        for (tree_id, tree, otu_group) in tid_tree_otug:
            tree_count += 1
            row = Row()

            # otu_group = otu_groups[ogi]['otuById']
            long_id = '%s@%s' % (study_id, tree_id)
            row.id = long_id

            row.n_intended = n_intended  # per study

            if len(candidates) == 0:  # No selection(s) made
                if len(tid_tree_otug) == 1:
                    n_preferred = 2  # Only one tree; use it
                else:
                    n_preferred = 1  # More than one tree; decision required
            else:
                if tree_id in candidates:
                    preferred_count += 1
                    n_preferred = 2  # This is a preferred tree; use it
                else:
                    n_preferred = 0  # Not preferred, another is; do not use
            row.n_preferred = n_preferred

            ctype = tree.get('^ot:curatedType')
            n_ctype = 0
            if ctype != None and ctype != '':
                n_ctype = 1
            row.n_ctype = n_ctype

            # whether a curator has confirmed the root
            root = tree.get('^ot:specifiedRoot')
            root_confirmed = 0
            if root != None and root != '':
                root_confirmed = 1
            row.root_confirmed = root_confirmed

            row.n_synth = 1 if long_id in trees_in_synthesis else 0

            ingroup_node_id = tree.get('^ot:inGroupClade')
            row.n_ingroup = (1 if (ingroup_node_id != None) else 0)

            (row.tip_count, row.ott_count, row.new_count) = \
                examine_tree(tree, otu_group, ingroup_node_id, taxa_in_synthesis)

            row.conflict_count = 0
            row.resolve_count = 0
            analysis = conflict_analyses.get(long_id)
            if analysis != None:
                row.conflict_count = int(analysis[1])
                row.resolve_count = int(analysis[2])

            row.score = ((row.new_count + row.resolve_count) -
                         (row.conflict_count * 20) + (row.n_ingroup * 10) +
                         (row.n_preferred * 50) + (row.n_intended * 100))

            table.append(row)
            if tree_count % 500 == 0:
                print tree_count, long_id, ctype
    table.sort(key=lambda row: (
        -row.score,
        row.n_intended == 0,  # whether intended for synthesis
        -row.n_preferred,  # whether preferred
        -row.n_ingroup,  # whether ingroup is designated
        row.conflict_count,  # number of synth tree conflicts
        -row.new_count,  # number of OTUs mapped to OTT
        -row.n_ctype,  # whether there's a 'curated type'
        -row.tip_count,  # total number of tips (for comparison)
    ))
    with codecs.open(outpath, 'w', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([
            'tree', 'intended', 'preferred', 'has ingroup', 'has method',
            'root confirmed', 'in synth', '#tips', '#mapped', '#new',
            '#resolved', '#conflicts', 'score'
        ])
        for row in table:
            writer.writerow([
                row.id, row.n_intended, row.n_preferred, row.n_ingroup,
                row.n_ctype, row.root_confirmed, row.n_synth, row.tip_count,
                row.ott_count, row.new_count, row.resolve_count,
                row.conflict_count, row.score
            ])
    print 'studies:', study_count
    print 'trees:', tree_count
    print 'preferred:', preferred_count