Example #1
0
def iter_trees(nexson, nexson_version=None):
    '''Generator over all trees in all trees elements.

    Yields a tuple of 3 items:
        trees element ID,
        tree ID,
        the tree obj

    `nexson_version` is detected from `nexson` when it is None.

    In the by-id form, the '^ot:treesElementOrder' and
    '^ot:treeElementOrder' lists give the iteration order; if such a list
    is shorter than the mapping it describes, the sorted IDs are used.
    In the other forms, a lone 'trees' or 'tree' element that is stored as
    a dict (rather than a list of dicts) is treated as a one-element list.
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    nex = get_nexml_el(nexson)
    if _is_by_id_hbf(nexson_version):
        trees_group_by_id = nex['treesById']
        group_order = nex.get('^ot:treesElementOrder', [])
        if len(group_order) < len(trees_group_by_id):
            # The order list does not cover every group: fall back to sorted IDs.
            group_order = sorted(trees_group_by_id.keys())
        for trees_group_id in group_order:
            trees_group = trees_group_by_id[trees_group_id]
            tree_by_id = trees_group['treeById']
            ti_order = trees_group.get('^ot:treeElementOrder', [])
            if len(ti_order) < len(tree_by_id):
                ti_order = sorted(tree_by_id.keys())
            for tree_id in ti_order:
                yield trees_group_id, tree_id, tree_by_id[tree_id]
    else:
        trees_groups = nex.get('trees', [])
        if isinstance(trees_groups, dict):
            # A single trees element: iterating the dict would walk its keys.
            trees_groups = [trees_groups]
        for trees_group in trees_groups:
            trees_group_id = trees_group['@id']
            trees = trees_group.get('tree', [])
            if isinstance(trees, dict):
                trees = [trees]
            for tree in trees:
                yield trees_group_id, tree['@id'], tree
Example #2
0
def iter_otus(nexson, nexson_version=None):
    """Generate every otu found in every otus group of `nexson`.

    Each yielded item is a 3-tuple of:
        the ID of the otus group,
        the ID of the otu,
        the otu object itself

    If the version (detected when `nexson_version` is None) is not the
    by-id form, `convert_nexson_format` is called on `nexson` first.
    """
    version = nexson_version
    if version is None:
        version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(version):
        # TODO shouldn't modify...
        convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)
    nexml_el = get_nexml_el(nexson)
    groups = nexml_el['otusById']
    ordered_group_ids = nexml_el.get('^ot:otusElementOrder', [])
    if len(ordered_group_ids) < len(groups):
        # The recorded order is incomplete, so use the sorted group IDs.
        ordered_group_ids = sorted(groups.keys())
    for group_id in ordered_group_ids:
        otus = groups[group_id]['otuById']
        # Snapshot the IDs before yielding anything from this group.
        for otu_id in list(otus.keys()):
            yield group_id, otu_id, otus[otu_id]
Example #3
0
def iter_trees(nexson, nexson_version=None):
    '''Generator over all trees in all trees elements.

    Yields a tuple of 3 items:
        trees element ID,
        tree ID,
        the tree obj

    `nexson_version` is detected from `nexson` when it is None.

    By-id form: groups and trees are visited in the order given by
    '^ot:treesElementOrder' / '^ot:treeElementOrder', or in sorted-ID order
    when such a list is shorter than the mapping it describes.
    Other forms: 'trees' and 'tree' may each be a list of elements or a
    single element stored as a dict; both shapes are accepted.
    '''
    def _as_list(el):
        # A lone child element may be stored as a dict, not a 1-item list.
        return [el] if isinstance(el, dict) else el

    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    nex = get_nexml_el(nexson)
    if not _is_by_id_hbf(nexson_version):
        for trees_group in _as_list(nex.get('trees', [])):
            trees_group_id = trees_group['@id']
            for tree in _as_list(trees_group.get('tree', [])):
                yield trees_group_id, tree['@id'], tree
        return
    trees_group_by_id = nex['treesById']
    group_order = nex.get('^ot:treesElementOrder', [])
    if len(group_order) < len(trees_group_by_id):
        group_order = sorted(trees_group_by_id.keys())
    for trees_group_id in group_order:
        trees_group = trees_group_by_id[trees_group_id]
        tree_by_id = trees_group['treeById']
        ti_order = trees_group.get('^ot:treeElementOrder', [])
        if len(ti_order) < len(tree_by_id):
            ti_order = sorted(tree_by_id.keys())
        for tree_id in ti_order:
            yield trees_group_id, tree_id, tree_by_id[tree_id]
Example #4
0
 def __init__(self, filepath='', nexson=None):
     '''Store a NexSON blob, reading it from `filepath` if `nexson` is None.

     Raises ValueError when neither a `nexson` object nor a non-empty
     `filepath` is supplied. If the detected version of the blob is not
     BY_ID_HONEY_BADGERFISH, `convert_nexson_format` is called on it.
     '''
     self.filepath = filepath
     if nexson is not None:
         self._nexson = nexson
     elif filepath:
         self._nexson = read_as_json(self.filepath)
     else:
         raise ValueError('Either a filepath or nexson argument must be provided')
     detected = detect_nexson_version(self._nexson)
     if detected != BY_ID_HONEY_BADGERFISH:
         _LOG.debug('NexsonProxy converting to hbf1.2')
         convert_nexson_format(self._nexson, BY_ID_HONEY_BADGERFISH)
     self._nexml_el = get_nexml_el(self._nexson)
     # Both caches start out empty.
     self._otu_cache = {}
     self._tree_cache = {}
     self._wr = None
Example #5
0
def count_num_trees(nexson, nexson_version=None):
    '''Returns the number of trees summed across all tree groups.

    `nexson_version` is detected from `nexson` when it is None.
    In the non-by-id forms, 'trees' and 'tree' may each be a list or a
    single element; a trees group with no 'tree' entry counts as 0 trees.
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    nex = get_nexml_el(nexson)
    if _is_by_id_hbf(nexson_version):
        return sum(len(tree_group.get('treeById', {}))
                   for tree_group in nex.get('treesById', {}).values())
    trees_group = nex.get('trees', [])
    if isinstance(trees_group, dict):
        trees_group = [trees_group]
    num_trees = 0
    for tree_group in trees_group:
        t = tree_group.get('tree')
        if t is None:
            # No 'tree' entry at all: this group holds no trees.
            continue
        # A lone tree is stored as the tree object itself, not a list.
        num_trees += len(t) if isinstance(t, list) else 1
    return num_trees
Example #6
0
def count_num_trees(nexson, nexson_version=None):
    '''Returns the number of trees summed across all tree
    groups.

    `nexson_version` is detected from `nexson` when it is None.
    A trees group that has no 'tree' entry contributes 0 to the total.
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    nex = get_nexml_el(nexson)
    num_trees_by_group = []
    if _is_by_id_hbf(nexson_version):
        for tree_group in nex.get('treesById', {}).values():
            num_trees_by_group.append(len(tree_group.get('treeById', {})))
    else:
        trees_group = nex.get('trees', [])
        if isinstance(trees_group, dict):
            # A single trees element is stored as a dict, not a list.
            trees_group = [trees_group]
        for tree_group in trees_group:
            t = tree_group.get('tree')
            if t is None:
                nt = 0  # group without any tree element
            elif isinstance(t, list):
                nt = len(t)
            else:
                nt = 1  # a lone tree stored as the tree object itself
            num_trees_by_group.append(nt)
    return sum(num_trees_by_group)
Example #7
0
def iter_otus(nexson, nexson_version=None):
    '''Generator over all otus in all otus group elements.

    Yields a tuple of 3 items:
        otus group ID,
        otu ID,
        the otu obj

    `nexson_version` is detected from `nexson` when it is None. If the
    version is not the by-id form, `convert_nexson_format` is called on
    `nexson` before iterating.
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(nexson_version):
        convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)  # TODO shouldn't modify...
    # Look up the nexml element only after any conversion, so that the
    # by-id keys used below are the ones being read.
    nex = get_nexml_el(nexson)
    otus_group_by_id = nex['otusById']
    group_order = nex.get('^ot:otusElementOrder', [])
    if len(group_order) < len(otus_group_by_id):
        # The order list does not cover every group: fall back to sorted IDs.
        group_order = sorted(otus_group_by_id.keys())
    for otus_group_id in group_order:
        otus_group = otus_group_by_id[otus_group_id]
        otu_by_id = otus_group['otuById']
        for otu_id in list(otu_by_id.keys()):
            yield otus_group_id, otu_id, otu_by_id[otu_id]
Example #8
0
#!/usr/bin/env python
from peyotl.api import APIWrapper
from peyotl.utility.input_output import read_as_json, write_as_json
from peyotl.nexson_syntax import get_nexml_el

# Make the '^ot:studyId' stored in each local study file agree with the ID
# under which the phylesystem object lists that file.
wrapper = APIWrapper(phylesystem_api_kwargs={'get_from': 'local'})
phylesystem = wrapper.phylesystem_api.phylesystem_obj
for sid, fp in phylesystem.iter_study_filepaths():
    blob = read_as_json(fp)
    nexml_el = get_nexml_el(blob)
    recorded_id = nexml_el.get('^ot:studyId')
    if recorded_id == sid:
        continue
    # Mismatch: overwrite the stored ID, save the file, and report the change.
    nexml_el['^ot:studyId'] = sid
    write_as_json(blob, fp)
    print(recorded_id, sid)
Example #9
0
def merge_otus_and_trees(nexson_blob):
    '''Merge all otus groups and all trees groups into the first of each.

    Takes a nexson object and:
        1. merges trees elements 2 - # trees into the first trees element,
        2. merges otus elements 2 - # otus into the first otus element,
        3. if there is no ot:originalLabel field for any otu,
            it sets that field based on @label and deletes @label,
        4. merges otu elements using the rule:
              A. treat (ottId, originalLabel) as a key
              B. If otu objects in subsequent trees match originalLabel and
                have a matching or absent ot:ottId, then they are merged into
                the same OTUs (however see C)
              C. No two leaves of a tree may share an otu (though otu should
                be shared across different trees). It is important that
                each leaf node be mapped to a distinct OTU. Otherwise there
                will be no way of separating them during OTU mapping. We
                do this indirectly by assuring that no two otu objects in the
                same otus object get merged with each other (or to a common
                object)
        5. corrects object references to deleted entities.

    This function is used to patch up NexSONs created by multiple imports, hence the
    substitution of '@label' for 'ot:originalLabel'. Ids are arbitrary for imports from
    non-nexml tools, so matching is done based on names. This should mimic the behavior
    of the analysis tools that produced the trees (for most/all such tools unique names
    constitute unique OTUs).

    The blob is converted to the by-id form for the merge and converted
    back to its originally detected version before `nexson_blob` is returned.
    '''
    id_to_replace_id = {}
    orig_version = detect_nexson_version(nexson_blob)
    convert_nexson_format(nexson_blob, BY_ID_HONEY_BADGERFISH)
    nexson = get_nexml_el(nexson_blob)
    otus_group_order = nexson.get('^ot:otusElementOrder', [])
    # (ott, orig) -> list of (otu id, otu) pairs
    retained_mapped2otu = {}
    # orig -> list of (otu id, otu) pairs
    retained_orig2otu = {}

    def _register_retained(oid, otu, key):
        # Make a retained otu findable by its full key and by its original label.
        entry = (oid, otu)
        retained_mapped2otu.setdefault(key, []).append(entry)
        if key[1] is not None:
            retained_orig2otu.setdefault(key[1], []).append(entry)

    def _first_unused(candidates, used):
        # First (otu id, otu) pair whose id has not been claimed yet, else None.
        for candidate in candidates:
            if candidate[0] not in used:
                return candidate
        return None

    # Stays None when the blob lists no otus group, so the trees section
    # below can tell that there is no retained otus group to point at.
    retained_ogi = None
    # For the first (entirely retained) group of otus:
    #   1. assure that originalLabel is filled in
    #   2. register the otu in retained_mapped2otu and retained_orig2otu
    # otu elements that have no label, originalLabel or ottId will not
    #   be registered, so they'll never be matched.
    if len(otus_group_order) > 0:
        otus_group_by_id = nexson['otusById']
        retained_ogi = otus_group_order[0]
        retained_og = otus_group_by_id[retained_ogi]
        retained_og_otu = retained_og.setdefault('otuById', {})
        label_to_original_label_otu_by_id(retained_og_otu)
        for oid, otu in retained_og_otu.items():
            key = (otu.get('^ot:ottId'), otu.get('^ot:originalLabel'))
            if key != (None, None):
                _register_retained(oid, otu, key)
        # For each of the other otus elements, we:
        #   1. assure that originalLabel is filled in
        #   2. decide (for each otu) whether it will
        #       be added to retained_og or merged with
        #       an otu already in retained_og. In the
        #       case of the latter, we record old oid -> retained oid
        #       in id_to_replace_id.
        for ogi in otus_group_order[1:]:
            og = otus_group_by_id.pop(ogi)
            otu_by_id = og.get('otuById', {})
            label_to_original_label_otu_by_id(otu_by_id)
            # Retained otu ids that otus of THIS group may no longer merge into.
            used_matches = set()
            id_to_replace_id[ogi] = retained_ogi
            for oid, otu in otu_by_id.items():
                orig = otu.get('^ot:originalLabel')
                key = (otu.get('^ot:ottId'), orig)
                if key == (None, None):
                    # Unmatchable otu: keep it as-is among the retained otus
                    # (in 'otuById', not on the group object itself).
                    retained_og_otu[oid] = otu
                    continue
                match_otu = _first_unused(retained_mapped2otu.get(key, ()), used_matches)
                if match_otu is None:
                    # No full-key match: fall back to the original label alone.
                    match_otu = _first_unused(retained_orig2otu.get(orig, ()), used_matches)
                if match_otu is not None:
                    id_to_replace_id[oid] = match_otu[0]
                    used_matches.add(match_otu[0])
                    _merge_otu_do_not_fix_references(otu, match_otu[1])
                else:
                    assert oid not in retained_og_otu
                    retained_og_otu[oid] = otu
                    _register_retained(oid, otu, key)
                    # Rule C: a later otu of this same group must not be
                    # merged into the otu that was just carried over from it.
                    used_matches.add(oid)
        nexson['^ot:otusElementOrder'] = [retained_ogi]
    # Move all of the tree elements to the first trees group.
    trees_group_order = nexson.get('^ot:treesElementOrder', [])
    if len(trees_group_order) > 0:
        trees_group_by_id = nexson['treesById']
        retained_tgi = trees_group_order[0]
        retained_tg = trees_group_by_id[retained_tgi]
        if retained_ogi is not None:
            retained_tg['@otus'] = retained_ogi
        # setdefault (not get) so that merged trees end up inside the blob
        # even if the retained group had no 'treeById' mapping yet.
        retained_tg_tree_obj = retained_tg.setdefault('treeById', {})
        for tgi in trees_group_order[1:]:
            tg = trees_group_by_id.pop(tgi)
            id_to_replace_id[tgi] = retained_tgi
            retained_tg['^ot:treeElementOrder'].extend(tg['^ot:treeElementOrder'])
            for tid, tree_obj in tg.get('treeById', {}).items():
                retained_tg_tree_obj[tid] = tree_obj
        # Point nodes at the retained otu wherever their otu was merged away.
        for tree_obj in retained_tg_tree_obj.values():
            for node in tree_obj.get('nodeById', {}).values():
                o = node.get('@otu')
                if o is not None:
                    r = id_to_replace_id.get(o)
                    if r is not None:
                        node['@otu'] = r
        nexson['^ot:treesElementOrder'] = [retained_tgi]

    replace_entity_references_in_meta_and_annotations(nexson, id_to_replace_id)
    convert_nexson_format(nexson_blob, orig_version)
    return nexson_blob
Example #10
0
# Store for the observed values: value -> list of IDs when IDs are being
# reported, otherwise value -> running count.
v_dict = {} if report_ids else defaultdict(int)

def process_val(v, id_str):
    """Record or print one observed value `v`; a value of None is ignored.

    With `report_ids` set, `id_str` is appended to the list kept for `v`.
    Otherwise, with `summarize_as_set` set, the count for `v` is incremented.
    Otherwise the value is written to `out`, prefixed by the module-level
    `study_id`.
    """
    if v is None:
        return
    if report_ids:
        v_dict.setdefault(v, []).append(id_str)
        return
    if summarize_as_set:
        v_dict[v] += 1
        return
    out.write(u'{i}: {v}\n'.format(i=study_id, v=v))

# Feed the property named by `study_prop` to process_val for every study:
# once per tree when `check_trees` is set, otherwise once per study.
for study_id, n in phy.iter_study_objs():
    nexml = get_nexml_el(n)
    if check_trees:
        for trees_group_id, tree_id, tree in iter_trees(n):
            # The ID string names both the study and the tree.
            id_str = 'study: {s} tree: {t}'.format(s=study_id, t=tree_id)
            process_val(tree.get(study_prop), id_str)
    else:
        # Study-level property: the study ID alone serves as the ID string.
        process_val(nexml.get(study_prop), study_id)

# Summarize what process_val collected in v_dict.
if report_ids:
    # v_dict maps value -> list of ID strings; report the most frequently
    # seen values first (descending sort on the list length).
    as_list = [(len(v), k, v) for k, v in v_dict.items()]
    as_list.sort(reverse=True)
    for n, k, v in as_list:
        out.write(u'{k}\tseen {n:d} times\t{v}\n'.format(k=k, n=n, v='\t'.join(v)))
elif summarize_as_set:
    # v_dict maps value -> count; order the (count, value) pairs descending.
    as_list = [(v, k) for k, v in v_dict.items()]
    as_list.sort(reverse=True)
Example #11
0
    'nominated_study_unique_OTU_count', 'nominated_study_unmapped_OTU_count',
    'run_time'
]
for prop in report_properties:
    locals()[prop] = 0
# end locals-punching
#################################################

# Tally study and OTU counts over every study yielded by `phy`.
ott_id_set = set()
nominated_ott_id_set = set()
for study_id, n in phy.iter_study_objs():
    reported_study_count += 1
    otu_dict = gen_otu_dict(n)
    # Studies for which gen_otu_dict returns nothing are only counted as reported.
    if not bool(otu_dict):
        continue
    nex_obj = get_nexml_el(n)
    study_count += 1
    not_intended_for_synth = nex_obj.get('^ot:notIntendedForSynthesis')
    # A study counts as nominated unless the flag is set to something
    # other than None/False.
    intended_for_synth = (not_intended_for_synth is
                          None) or (not_intended_for_synth is False)
    if intended_for_synth:
        nominated_study_count += 1
        nominated_study_OTU_count += len(otu_dict)
    OTU_count += len(otu_dict)

    for oid, o in otu_dict.items():
        ott_id = o.get('^ot:ottId')
        # An otu without '^ot:ottId' is counted as unmapped.
        if ott_id is None:
            unmapped_OTU_count += 1
            if intended_for_synth:
                nominated_study_unmapped_OTU_count += 1
unmapped_OTU_count = 0
unique_OTU_count = 0
nominated_study_count = 0
nominated_study_OTU_count = 0
nominated_study_unique_OTU_count = 0
nominated_study_unmapped_OTU_count = 0
run_time = 0

ott_id_set = set()
nominated_ott_id_set = set()
for study_id, n in phy.iter_study_objs():
    reported_study_count += 1
    otu_dict = gen_otu_dict(n)
    if not bool(otu_dict):
        continue
    nex_obj = get_nexml_el(n)
    study_count += 1
    not_intended_for_synth = nex_obj.get('^ot:notIntendedForSynthesis')
    intended_for_synth = (not_intended_for_synth is None) or (not_intended_for_synth is False)
    if intended_for_synth:
        nominated_study_count += 1
        nominated_study_OTU_count += len(otu_dict)
    OTU_count += len(otu_dict)

    for oid, o in otu_dict.items():
        ott_id = o.get('^ot:ottId')
        if ott_id is None:
            unmapped_OTU_count += 1
            if intended_for_synth:
                nominated_study_unmapped_OTU_count += 1
        else:
Example #13
0
def merge_otus_and_trees(nexson_blob):
    '''Merge all otus groups and all trees groups into the first of each.

    Takes a nexson object and:
        1. merges trees elements 2 - # trees into the first trees element,
        2. merges otus elements 2 - # otus into the first otus element,
        3. if there is no ot:originalLabel field for any otu,
            it sets that field based on @label and deletes @label,
        4. merges otu elements using the rule:
              A. treat (ottId, originalLabel) as a key
              B. If otu objects in subsequent trees match originalLabel and
                have a matching or absent ot:ottId, then they are merged into
                the same OTUs (however see C)
              C. No two leaves of a tree may share an otu (though otu should
                be shared across different trees). It is important that
                each leaf node be mapped to a distinct OTU. Otherwise there
                will be no way of separating them during OTU mapping. We
                do this indirectly by assuring that no two otu objects in the
                same otus object get merged with each other (or to a common
                object)
        5. corrects object references to deleted entities.

    This function is used to patch up NexSONs created by multiple imports, hence the
    substitution of '@label' for 'ot:originalLabel'. Ids are arbitrary for imports from
    non-nexml tools, so matching is done based on names. This should mimic the behavior
    of the analysis tools that produced the trees (for most/all such tools unique names
    constitute unique OTUs).

    The blob is converted to the by-id form for the merge and converted
    back to its originally detected version before `nexson_blob` is returned.
    '''
    id_to_replace_id = {}
    orig_version = detect_nexson_version(nexson_blob)
    convert_nexson_format(nexson_blob, BY_ID_HONEY_BADGERFISH)
    nexson = get_nexml_el(nexson_blob)
    otus_group_order = nexson.get('^ot:otusElementOrder', [])
    # (ott, orig) -> list of (otu id, otu) pairs
    retained_mapped2otu = {}
    # orig -> list of (otu id, otu) pairs
    retained_orig2otu = {}

    def _register_retained(oid, otu, key):
        # Make a retained otu findable by its full key and by its original label.
        entry = (oid, otu)
        retained_mapped2otu.setdefault(key, []).append(entry)
        if key[1] is not None:
            retained_orig2otu.setdefault(key[1], []).append(entry)

    def _first_unused(candidates, used):
        # First (otu id, otu) pair whose id has not been claimed yet, else None.
        for candidate in candidates:
            if candidate[0] not in used:
                return candidate
        return None

    # Stays None when the blob lists no otus group, so the trees section
    # below can tell that there is no retained otus group to point at.
    retained_ogi = None
    # For the first (entirely retained) group of otus:
    #   1. assure that originalLabel is filled in
    #   2. register the otu in retained_mapped2otu and retained_orig2otu
    # otu elements that have no label, originalLabel or ottId will not
    #   be registered, so they'll never be matched.
    if len(otus_group_order) > 0:
        otus_group_by_id = nexson['otusById']
        retained_ogi = otus_group_order[0]
        retained_og = otus_group_by_id[retained_ogi]
        retained_og_otu = retained_og.setdefault('otuById', {})
        label_to_original_label_otu_by_id(retained_og_otu)
        for oid, otu in retained_og_otu.items():
            key = (otu.get('^ot:ottId'), otu.get('^ot:originalLabel'))
            if key != (None, None):
                _register_retained(oid, otu, key)
        # For each of the other otus elements, we:
        #   1. assure that originalLabel is filled in
        #   2. decide (for each otu) whether it will
        #       be added to retained_og or merged with
        #       an otu already in retained_og. In the
        #       case of the latter, we record old oid -> retained oid
        #       in id_to_replace_id.
        for ogi in otus_group_order[1:]:
            og = otus_group_by_id.pop(ogi)
            otu_by_id = og.get('otuById', {})
            label_to_original_label_otu_by_id(otu_by_id)
            # Retained otu ids that otus of THIS group may no longer merge into.
            used_matches = set()
            id_to_replace_id[ogi] = retained_ogi
            for oid, otu in otu_by_id.items():
                orig = otu.get('^ot:originalLabel')
                key = (otu.get('^ot:ottId'), orig)
                if key == (None, None):
                    # Unmatchable otu: keep it as-is among the retained otus
                    # (in 'otuById', not on the group object itself).
                    retained_og_otu[oid] = otu
                    continue
                match_otu = _first_unused(retained_mapped2otu.get(key, ()), used_matches)
                if match_otu is None:
                    # No full-key match: fall back to the original label alone.
                    match_otu = _first_unused(retained_orig2otu.get(orig, ()), used_matches)
                if match_otu is not None:
                    id_to_replace_id[oid] = match_otu[0]
                    used_matches.add(match_otu[0])
                    _merge_otu_do_not_fix_references(otu, match_otu[1])
                else:
                    assert oid not in retained_og_otu
                    retained_og_otu[oid] = otu
                    _register_retained(oid, otu, key)
                    # Rule C: a later otu of this same group must not be
                    # merged into the otu that was just carried over from it.
                    used_matches.add(oid)
        nexson['^ot:otusElementOrder'] = [retained_ogi]
    # Move all of the tree elements to the first trees group.
    trees_group_order = nexson.get('^ot:treesElementOrder', [])
    if len(trees_group_order) > 0:
        trees_group_by_id = nexson['treesById']
        retained_tgi = trees_group_order[0]
        retained_tg = trees_group_by_id[retained_tgi]
        if retained_ogi is not None:
            retained_tg['@otus'] = retained_ogi
        # setdefault (not get) so that merged trees end up inside the blob
        # even if the retained group had no 'treeById' mapping yet.
        retained_tg_tree_obj = retained_tg.setdefault('treeById', {})
        for tgi in trees_group_order[1:]:
            tg = trees_group_by_id.pop(tgi)
            id_to_replace_id[tgi] = retained_tgi
            retained_tg['^ot:treeElementOrder'].extend(tg['^ot:treeElementOrder'])
            for tid, tree_obj in tg.get('treeById', {}).items():
                retained_tg_tree_obj[tid] = tree_obj
        # Point nodes at the retained otu wherever their otu was merged away.
        for tree_obj in retained_tg_tree_obj.values():
            for node in tree_obj.get('nodeById', {}).values():
                o = node.get('@otu')
                if o is not None:
                    r = id_to_replace_id.get(o)
                    if r is not None:
                        node['@otu'] = r
        nexson['^ot:treesElementOrder'] = [retained_tgi]

    replace_entity_references_in_meta_and_annotations(nexson, id_to_replace_id)
    convert_nexson_format(nexson_blob, orig_version)
    return nexson_blob
    if len(study) == 1:
        study = '0' + study
    study2tree.setdefault('pg_' + study, []).append('tree' + tree)


# For every study in study2tree: add its listed trees to the study's
# '^ot:candidateTreeForSynthesis' list, set '^ot:unrootedTree' to False and
# copy '^ot:rootNodeId' into '^ot:specifiedRoot' on each extracted tree, then
# save the study file unless `dry_run` is set.
pa = PhylesystemAPI(get_from='local')
raw_phylsys = pa.phylesystem_obj
nexson_version = raw_phylsys.repo_nexml2json
for study_id, tree_list in study2tree.items():
    if verbose:
        progress_msg = 'treelist={t} for study {s}.\n'.format(t=str(tree_list), s=study_id)
        sys.stderr.write(progress_msg)
    try:
        study_path = raw_phylsys.get_filepath_for_study(study_id)
        study_blob = read_as_json(study_path)

        nexml_el = get_nexml_el(study_blob)
        candidates = nexml_el.setdefault('^ot:candidateTreeForSynthesis', [])
        for tree_id in tree_list:
            if tree_id not in candidates:
                candidates.append(tree_id)
            extracted = extract_tree_nexson(study_blob, tree_id, nexson_version)
            if not extracted:
                sys.stderr.write('tree {t} of study {s} not found !!!\n'.format(t=tree_id, s=study_id))
            for _tid, tree, _otus_group in extracted:
                tree['^ot:unrootedTree'] = False
                tree['^ot:specifiedRoot'] = tree['^ot:rootNodeId']
        if not dry_run:
            write_as_json(study_blob, study_path)

    except KeyError:
        # Any KeyError raised in the body above is reported as a missing study.
        sys.stderr.write('study {} not found !!!\n'.format(study_id))
Example #15
0
#!/usr/bin/env python
"""Examines the tags (ot:tag) study. Prints out a list 
of each unique tag used in the studies """
from peyotl.manip import iter_trees
from peyotl.phylesystem.phylesystem_umbrella import Phylesystem
from peyotl.nexson_syntax import get_nexml_el
from collections import defaultdict
import codecs
import sys

# Count how often each '^ot:tag' value occurs across all studies and trees.
phy = Phylesystem()
study_dict = defaultdict(int)
tree_dict = defaultdict(int)
# UTF-8 encoding writer wrapped around standard output.
out = codecs.getwriter("utf-8")(sys.stdout)
for study_id, n in phy.iter_study_objs():
    nexml = get_nexml_el(n)
    # Study-level tags: the value may be a list of tags or a single tag.
    t = nexml.get("^ot:tag")
    if t:
        # print study_id, t
        if isinstance(t, list):
            for tag in t:
                study_dict[tag] += 1
        else:
            study_dict[t] += 1
    # Tree-level tags of every tree in the study.
    for trees_group_id, tree_id, tree in iter_trees(n):
        t = tree.get("^ot:tag")
        if t:
            # print study_id, tree_id, t
            if isinstance(t, list):
                for tag in t:
                    # NOTE(review): tree tags are tallied in study_dict here,
                    # while tree_dict is never written in the visible code --
                    # confirm whether tree_dict was intended.
                    study_dict[tag] += 1
Example #16
0
def _main():
    '''Command-line entry point of the NeXML/NexSON converter.

    Parses the command line, reads the input file (NeXML or a JSON form),
    optionally sorts the arbitrarily ordered items, and writes the result
    in the requested export format to the output file or standard output.

    When neither --mode nor --export is given, the input format is sniffed
    from the first non-whitespace character: '<' means NeXML (exported as
    NexSON), '{' or '[' means JSON (exported as NeXML).

    Exits through sys.exit with a message when the input or output file
    cannot be opened, the input format is not recognized, or the --mode and
    --export options contradict each other.
    '''
    import sys
    import codecs
    import json
    import os
    import argparse
    _HELP_MESSAGE = '''NeXML/NexSON converter'''
    _EPILOG = '''UTF-8 encoding is used (for input and output).

Environmental variables used:
    NEXSON_INDENTATION_SETTING indentation in NexSON (default 0)
    NEXML_INDENTATION_SETTING indentation in NeXML (default is 0).
    NEXSON_LOGGING_LEVEL logging setting: NotSet, Debug, Warn, Info, Error
    NEXSON_LOGGING_FORMAT format string for logging messages.
'''
    parser = argparse.ArgumentParser(description=_HELP_MESSAGE,
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=_EPILOG)
    parser.add_argument("input", help="filepath to input")
    parser.add_argument("-o", "--output",
                        metavar="FILE",
                        required=False,
                        help="output filepath. Standard output is used if omitted.")
    parser.add_argument("-s", "--sort",
                        action="store_true",
                        default=False,
                        help="If specified, the arbitrarily ordered items will be sorted.")
    e_choices = ["nexml",
                 str(BADGER_FISH_NEXSON_VERSION),
                 str(DIRECT_HONEY_BADGERFISH),
                 str(BY_ID_HONEY_BADGERFISH),
                 "0.0",
                 "1.0",
                 "1.2",
                 "badgerfish"]
    e_choices.sort()
    e_help = 'output format. Valid choices are: "{c}". \
With "0.0" and "badgerfish" as aliases for "0.0.0", and \
"1.2" being an alias for the most recent version of honeybadgerfish \
(1.2.0). The verions "1.0.0" and its alias "1.0" refer to a \
version that uses the honeybadgefish syntax for meta elements, \
but maintained the direct object-mapping from NeXML of the \
badgerfish form of NexSON'.format(c='", "'.join(e_choices))
    parser.add_argument("-e", "--export",
                        metavar="FMT",
                        required=False,
                        choices=e_choices,
                        help=e_help)
    codes = 'xjb'
    parser.add_argument("-m", "--mode",
                        metavar="MODE",
                        required=False,
                        choices=[i + j for i in codes for j in codes],
                        help="A less precise way to specify a mapping. The \
                               m option is a two-letter code for {input}{output} \
                               The letters are x for NeXML, j for NexSON, \
                               and b for BadgerFish JSON version of NexML. \
                               The default behavior is to autodetect the format \
                               and convert JSON to NeXML or NeXML to NexSON.")
    args = parser.parse_args()
    inpfn = args.input
    outfn = args.output
    mode = args.mode
    export_format = args.export
    # Expand the short aliases to the full version strings.
    if export_format:
        if export_format.lower() in ["badgerfish", "0.0"]:
            export_format = str(BADGER_FISH_NEXSON_VERSION)
        elif export_format.lower() == "1.0":
            export_format = str(DIRECT_HONEY_BADGERFISH)
        elif export_format.lower() == "1.2":
            export_format = str(BY_ID_HONEY_BADGERFISH)
    if export_format is not None and mode is not None:
        # The output letter of the mode must agree with the export format:
        # b -> badgerfish, x -> nexml, j -> one of the honeybadgerfish forms.
        honey_badgerfish = [str(DIRECT_HONEY_BADGERFISH), str(BY_ID_HONEY_BADGERFISH)]
        if (mode.endswith('b') and (export_format != str(BADGER_FISH_NEXSON_VERSION))) \
           or (mode.endswith('x') and (export_format.lower() != "nexml")) \
           or (mode.endswith('j') and (export_format.lower() not in honey_badgerfish)):
            sys.exit('export format {e} clashes with mode {m}. The mode option is not neeeded if the export option is used.'.format(e=export_format, m=mode))
    try:
        # Plain 'r': the 'U' flag is rejected by Python 3.11+, and neither
        # the XML nor the JSON reader depends on newline translation.
        inp = codecs.open(inpfn, mode='r', encoding='utf-8')
    except (IOError, OSError, ValueError):
        sys.exit('nexson_nexml: Could not open file "{fn}"\n'.format(fn=inpfn))
    if mode is None:
        # Sniff the input format from the first non-whitespace character.
        try:
            while True:
                first_graph_char = inp.read(1)
                if not first_graph_char:
                    # End of file without any content.
                    raise ValueError('Expecting input to start with <, {, or [')
                if not first_graph_char.strip():
                    continue  # skip leading whitespace
                if first_graph_char == '<':
                    mode = 'x*'
                    break
                if first_graph_char in '{[':
                    mode = '*x'
                    break
                raise ValueError('Expecting input to start with <, {, or [')
        except (ValueError, IOError, OSError):
            # UnicodeDecodeError is a ValueError, so undecodable input lands here too.
            sys.exit('nexson_nexml: First character of "{fn}" was not <, {, or [\nInput does not appear to be NeXML or NexSON\n'.format(fn=inpfn))
        if export_format is None:
            if mode.endswith('*'):
                export_format = str(DIRECT_HONEY_BADGERFISH)
            else:
                export_format = "nexml"
        inp.seek(0)
    elif export_format is None:
        if mode.endswith('j'):
            export_format = str(DIRECT_HONEY_BADGERFISH)
        elif mode.endswith('b'):
            export_format = str(BADGER_FISH_NEXSON_VERSION)
        else:
            assert mode.endswith('x')
            export_format = "nexml"

    if export_format == "nexml":
        indentation = int(os.environ.get('NEXML_INDENTATION_SETTING', 0))
    else:
        indentation = int(os.environ.get('NEXSON_INDENTATION_SETTING', 0))

    if outfn is not None:
        try:
            out = codecs.open(outfn, mode='w', encoding='utf-8')
        except (IOError, OSError, ValueError):
            sys.exit('nexson_nexml: Could not open output filepath "{fn}"\n'.format(fn=outfn))
    else:
        # The codecs writer emits encoded bytes, so on Python 3 it must wrap
        # the binary buffer of stdout; Python 2's stdout has no such attribute.
        out = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))

    try:
        if mode.startswith('x'):
            blob = get_ot_study_info_from_nexml(inp,
                                                nexson_syntax_version=export_format)
        else:
            blob = json.load(inp)
            if mode.startswith('*'):
                try:
                    n = get_nexml_el(blob)
                except Exception:
                    n = None
                if not n or (not isinstance(n, dict)):
                    sys.exit('No top level "nex:nexml" element found. Document does not appear to be a JSON version of NeXML\n')
                # The sniffed input turned out to be a JSON form of NeXML.
                mode = 'j' + mode[1]
        if args.sort:
            sort_arbitrarily_ordered_nexson(blob)
        if export_format == "nexml":
            indent = ' ' * indentation if indentation > 0 else ''
            write_obj_as_nexml(blob,
                               out,
                               addindent=indent,
                               newl='\n')
        else:
            if not mode.startswith('x'):
                blob = convert_nexson_format(blob, export_format, sort_arbitrary=True)
            write_as_json(blob, out, indent=indentation)
    finally:
        # Release the files opened here; standard output is left open.
        inp.close()
        if outfn is not None:
            out.close()
Example #17
0
def addStudy(session, study_id):
    # get latest version of nexson
    print "adding study {s}".format(s=study_id)
    phy = PhylesystemAPI(get_from="local")
    studyobj = phy.get_study(study_id)["data"]
    nexml = get_nexml_el(studyobj)
    year = nexml.get("^ot:studyYear")
    proposedTrees = nexml.get("^ot:candidateTreeForSynthesis")
    if proposedTrees is None:
        proposedTrees = []

    # create a new Study object
    new_study = Study(id=study_id, year=year)
    session.add(new_study)
    # session.commit()

    # get curator(s), noting that ot:curators might be a
    # string or a list
    c = nexml.get("^ot:curatorName")
    print " ot:curatorName: ", c
    # create list of curator objects
    curator_list = []
    if isinstance(c, basestring):
        curator_list.append(c)
    else:
        curator_list = c
    for curator in curator_list:
        test_c = session.query(Curator).filter(Curator.name == curator).first()
        if test_c:
            print "curator {c} already exists".format(c=curator)
            # session.add(curator)
            new_study.curators.append(test_c)
        else:
            print "curator {c} does no exist".format(c=curator)
            new_study.curators.append(Curator(name=curator))

    # mapped otus in this study
    otu_dict = gen_otu_dict(studyobj)
    # iterate over the OTUs in the study, collecting the mapped
    # ones (oid to ott_id mapping held at the study level)
    mapped_otus = {}
    for oid, o in otu_dict.items():
        ottID = o.get("^ot:ottId")
        if ottID is not None:
            mapped_otus[oid] = ottID

    # iterate over trees and insert tree data
    for trees_group_id, tree_id, tree in iter_trees(studyobj):
        print " tree :", tree_id
        proposedForSynth = False
        if tree_id in proposedTrees:
            proposedForSynth = True

        treejson = json.dumps(tree)
        new_tree = Tree(tree_id=tree_id, study_id=study_id, proposed=proposedForSynth, data=treejson)

        # get otus
        ottIDs = set()  # ott ids for this tree
        ntips = 0
        for node_id, node in iter_node(tree):
            oid = node.get("@otu")
            # no @otu property on internal nodes
            if oid is not None:
                ntips += 1
                # ottID = mapped_otus[oid]
                if oid in mapped_otus:
                    ottID = mapped_otus[oid]
                    # check that this exists in the taxonomy
                    # (it might not, if the ID has been deprecated)
                    taxon = session.query(Taxonomy).filter(Taxonomy.id == ottID).first()
                    if taxon:
                        new_tree.otus.append(taxon)
                        ottIDs.add(ottID)
        new_tree.ntips = ntips
        # need to write function for recursive query of Taxonomy table
        # ottIDs = parent_closure(ottIDs,taxonomy)

        # update with treebase id, if exists
        datadeposit = nexml.get("^ot:dataDeposit")
        if datadeposit:
            url = datadeposit["@href"]
            pattern = re.compile(u".+TB2:(.+)$")
            matchobj = re.match(pattern, url)
            if matchobj:
                tb_id = matchobj.group(1)
                new_tree.treebase_id = tb_id
        session.add(new_tree)

    # now that we have added the tree info, update the study record
    # with the json data (minus the tree info)
    del nexml["treesById"]
    studyjson = json.dumps(nexml)
    new_study.data = studyjson
    session.commit()
def add_study(study_id):
    '''Add one study, and all of its trees, from phylesystem to DBSession.

    Steps, in order:
      * fetch the study through create_phylesystem_obj().get_study and add
        a Study row; the TreeBASE id is set on the study when the
        ^ot:dataDeposit URL contains "TB2:<id>";
      * link the study to its curators, re-using a Curator row whose name
        already exists and creating one otherwise;
      * for every tree yielded by iter_trees, add a Tree row holding the
        tree JSON, the synthesis-candidate flag, the tip count and the
        Taxonomy rows for every id returned by get_lineage for each mapped
        tip whose OTT id is present in the Taxonomy table;
      * store the study-level JSON (with "treesById" removed) and the
        number of trees on the Study row.

    study_id -- ID of the study, as understood by get_study

    Raises HTTPNotFound when the study cannot be retrieved.  A missing
    "treesById" element raises KeyError at the final `del`.  No commit is
    issued in this function; rows are only added to DBSession.
    '''
    _LOG.debug('adding study {s}'.format(s=study_id))

    # get latest version of nexson
    # location of repo (test vs dev) dependent on peyotl config
    phy = create_phylesystem_obj()
    try:
        studyobj = phy.get_study(study_id)['data']
    except Exception:
        # `except Exception` rather than a bare except, so that
        # KeyboardInterrupt / SystemExit are not reported as a 404
        _LOG.debug('did not find study {s} in phylesystem'.format(s=study_id))
        raise HTTPNotFound("Study {s} not found in phylesystem".format(s=study_id))
    nexml = get_nexml_el(studyobj)
    proposedTrees = nexml.get('^ot:candidateTreeForSynthesis')
    if proposedTrees is None:
        proposedTrees = []

    # create a new Study object
    new_study = Study(id=study_id)
    DBSession.add(new_study)

    # update with treebase id, if exists
    datadeposit = nexml.get('^ot:dataDeposit')
    if datadeposit:
        # .get: a deposit element without @href is treated like an empty one
        url = datadeposit.get('@href')
        if url:
            matchobj = re.match(u'.+TB2:(.+)$', url)
            if matchobj:
                new_study.treebase_id = matchobj.group(1)

    # get curator(s), noting that ot:curatorName might be missing,
    # a single string, or a list
    c = nexml.get('^ot:curatorName')
    if c is None:
        # no curator recorded: nothing to link (iterating None would raise)
        curator_list = []
    elif isinstance(c, basestring):
        curator_list = [c]
    else:
        curator_list = c
    for curator in curator_list:
        test_c = DBSession.query(Curator).filter(Curator.name == curator).first()
        if test_c:
            _LOG.debug("curator {c} already exists".format(c=curator))
            new_study.curators.append(test_c)
        else:
            _LOG.debug("curator {c} does not yet exist".format(c=curator))
            new_study.curators.append(Curator(name=curator))

    # mapped otus in this study
    otu_dict = gen_otu_dict(studyobj)
    # iterate over the OTUs in the study, collecting the mapped
    # ones (oid to ott_id mapping held at the study level)
    mapped_otus = {}
    for oid, o in otu_dict.items():
        ottID = o.get('^ot:ottId')
        if ottID is not None:
            mapped_otus[oid] = ottID

    # iterate over trees and insert tree data
    ntrees = 0
    for trees_group_id, tree_id, tree in iter_trees(studyobj):
        _LOG.debug(' tree : {t}'.format(t=tree_id))
        ntrees += 1
        proposedForSynth = tree_id in proposedTrees

        treejson = json.dumps(tree)
        new_tree = Tree(
            tree_id=tree_id,
            study_id=study_id,
            proposed=proposedForSynth,
            data=treejson
            )

        # get otus
        ottIDs = set()     # ott ids for this tree (tips plus their lineages)
        ntips = 0
        for node_id, node in iter_node(tree):
            oid = node.get('@otu')
            # no @otu property on internal nodes
            if oid is None:
                continue
            ntips += 1
            if oid not in mapped_otus:
                continue
            ottID = mapped_otus[oid]
            # check that this exists in the taxonomy
            # (it might not, if the ID has been deprecated)
            taxon = DBSession.query(Taxonomy).filter(
                Taxonomy.id == ottID
                ).first()
            if taxon:
                lineage = get_lineage(ottID)
                _LOG.debug(' lineage of {m} = {l}'.format(m=ottID, l=lineage))
                ottIDs.update(lineage)
        new_tree.ntips = ntips
        for t in ottIDs:
            # NOTE(review): assumes every id returned by get_lineage has a
            # Taxonomy row; first() returns None otherwise -- confirm
            taxon = DBSession.query(Taxonomy).filter(
                Taxonomy.id == t
                ).first()
            new_tree.otus.append(taxon)

        # add the tree
        DBSession.add(new_tree)

    # now that we have added the tree info, update the study record
    # with the json data (minus the tree info)
    del nexml['treesById']
    studyjson = json.dumps(nexml)
    new_study.data = studyjson
    new_study.ntrees = ntrees
Example #19
0
def load_nexsons(connection,cursor,phy,config_obj,nstudies=None):
    counter = 0
    study_properties = set()
    tree_properties = set()
    for study_id, studyobj in phy.iter_study_objs():
        nexml = get_nexml_el(studyobj)
        #print 'STUDY: ',study_id
        study_properties.update(nexml.keys())
        # study data for study table
        STUDYTABLE = config_obj.get('database_tables','studytable')
        year = nexml.get('^ot:studyYear')
        proposedTrees = nexml.get('^ot:candidateTreeForSynthesis')
        if proposedTrees is None:
            proposedTrees = []

        # must insert study before trees
        sqlstring = ("INSERT INTO {tablename} (id) "
            "VALUES (%s);"
            .format(tablename=STUDYTABLE)
            )
        data = (study_id,)
        #print '  SQL: ',cursor.mogrify(sqlstring)
        cursor.execute(sqlstring,data)
        connection.commit()

        # update with treebase id, if exists
        datadeposit = nexml.get('^ot:dataDeposit')
        if (datadeposit):
            url = datadeposit['@href']
            pattern = re.compile(u'.+TB2:(.+)$')
            matchobj = re.match(pattern,url)
            if (matchobj):
                tb_id = matchobj.group(1)
                sqlstring = ("UPDATE {tablename} "
                    "SET treebase_id=%s "
                    "WHERE id=%s;"
                    .format(tablename=STUDYTABLE)
                    )
                data = (tb_id,study_id)
                #print '  SQL: ',cursor.mogrify(sqlstring,data)
                cursor.execute(sqlstring,data)
                connection.commit()

        # get curator(s), noting that ot:curators might be a
        # string or a list
        c = nexml.get('^ot:curatorName')
        #print ' ot:curatorName: ',c
        curators=[]
        if (isinstance(c,basestring)):
            curators.append(c)
        else:
            curators=c
        # remove duplicates
        curators = list(set(curators))
        insert_curators(connection,cursor,config_obj,study_id,curators)

        # iterate over trees and insert tree data
        # note that OTU data done separately as COPY
        # due to size of table (see script <scriptname>)
        TREETABLE = config_obj.get('database_tables','treetable')
        ntrees = 0
        try:
            for trees_group_id, tree_id, tree in iter_trees(studyobj):
                #print ' tree :' ,tree_id
                ntrees += 1
                proposedForSynth = False
                tree_properties.update(tree.keys())
                if (tree_id in proposedTrees):
                    proposedForSynth = True
                treejson = json.dumps(tree)
                ntips = 0
                for node_id, node in iter_node(tree):
                    oid = node.get('@otu')
                    # no @otu property on internal nodes
                    if oid is not None:
                        ntips+=1

                sqlstring = ("INSERT INTO {tablename} "
                    "(tree_id,study_id,ntips,proposed,data) "
                    "VALUES (%s,%s,%s,%s,%s);"
                    .format(tablename=TREETABLE)
                    )
                data = (tree_id,study_id,ntips,proposedForSynth,treejson)
                #print '  SQL: ',cursor.mogrify(sqlstring,data)
                cursor.execute(sqlstring,data)
                connection.commit()

        except psy.Error as e:
            print e.pgerror

        # now that we have added the tree info, update the study record
        # with the json data (minus the tree info) and ntrees
        del nexml['treesById']
        studyjson = json.dumps(nexml)
        sqlstring = ("UPDATE {tablename} "
            "SET data=%s,ntrees=%s "
            "WHERE id=%s;"
            .format(tablename=STUDYTABLE)
        )
        data = (studyjson,ntrees,study_id)
        cursor.execute(sqlstring,data)
        connection.commit()

        counter+=1
        if (counter%500 == 0):
            print "loaded {n} studies".format(n=counter)

        if (nstudies and counter>=nstudies):
            print "finished inserting",nstudies,"studies"
            break

    # load the tree and study properties
    PROPERTYTABLE = config_obj.get('database_tables','propertytable')
    load_properties(
        connection,
        cursor,
        PROPERTYTABLE,
        study_properties,
        tree_properties)