def diagnose_repo_nexml2json(self):
    """Return the NexSON version string of the first indexed study.

    Optimistic check: assumes all studies in the repo share one NexSON
    version, so only the first study file found in the index is read.
    """
    with self._index_lock:
        # BUG FIX: was `self.study_index.values()[0][2]` -- dict views are
        # not indexable on Python 3; next(iter(...)) works on both 2 and 3.
        fp = next(iter(self.study_index.values()))[2]
        _LOG.debug('diagnose_repo_nexml2json with fp={}'.format(fp))
        with codecs.open(fp, mode='r', encoding='utf-8') as fo:
            fj = json.load(fo)
    return detect_nexson_version(fj)
def iter_otus(nexson, nexson_version=None):
    """Generator over every otu in every otus-group element.

    Yields (otus group ID, otu ID, otu object) triples.
    """
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(nexson_version):
        # TODO shouldn't modify the caller's blob...
        convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)
    nex = get_nexml_el(nexson)
    groups = nex['otusById']
    ordering = nex.get('^ot:otusElementOrder', [])
    if len(ordering) < len(groups):
        # declared order is incomplete; fall back to sorted group IDs
        ordering = sorted(groups.keys())
    for gid in ordering:
        contained = groups[gid]['otuById']
        for oid in list(contained.keys()):
            yield gid, oid, contained[oid]
def iter_trees(nexson, nexson_version=None):
    '''Generator over every tree in every trees element.

    Yields (trees element ID, tree ID, tree object) triples.
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    nex = get_nexml_el(nexson)
    if _is_by_id_hbf(nexson_version):
        groups = nex['treesById']
        group_ids = nex.get('^ot:treesElementOrder', [])
        if len(group_ids) < len(groups):
            # declared order is incomplete; fall back to sorted group IDs
            group_ids = sorted(groups.keys())
        for gid in group_ids:
            group = groups[gid]
            trees = group['treeById']
            tree_ids = group.get('^ot:treeElementOrder', [])
            if len(tree_ids) < len(trees):
                tree_ids = sorted(trees.keys())
            for tid in tree_ids:
                yield gid, tid, trees[tid]
    else:
        # list-based (non by-id) NexSON forms
        for group in nex.get('trees', []):
            gid = group['@id']
            for tree in group.get('tree', []):
                yield gid, tree['@id'], tree
def add_or_replace_annotation(self,  # pylint: disable=R0201
                              obj,
                              annotation,
                              agent,
                              add_agent_only=False):
    '''Install `agent` and `annotation` on obj, removing competing annotations.

    `annotation` is expected to hold a string at annotation['author']['name'].
    Annotations already on obj are removed when they:
        1. share that author/name, and
        2. have no messages flagged for preservation (truthy 'preserve').
    '''
    nex = get_nexml_el(obj)
    nvers = detect_nexson_version(obj)
    _LOG.debug('detected version as ' + nvers)
    agents_obj = find_val_literal_meta_first(nex, 'ot:agents', nvers)
    if not agents_obj:
        agents_obj = add_literal_meta(nex, 'ot:agents', {'agent': []}, nvers)
    agents_list = agents_obj.setdefault('agent', [])
    target_id = agent['@id']
    # register the agent only if it is not already listed
    already_listed = any(a.get('@id') == target_id for a in agents_list)
    if not already_listed:
        agents_list.append(agent)
    if add_agent_only:
        delete_same_agent_annotation(obj, annotation)
    else:
        replace_same_agent_annotation(obj, annotation)
def add_or_replace_annotation(self,  # pylint: disable=R0201
                              obj,
                              annotation,
                              agent,
                              add_agent_only=False):
    """Attach `agent`/`annotation` to obj after clearing rival annotations.

    `annotation` is a dict expected to carry a string value at
    annotation['author']['name'].  Existing annotations on obj are dropped
    when they (1) have the same author/name and (2) contain no messages
    whose 'preserve' value is truthy.
    """
    nex = get_nexml_el(obj)
    nvers = detect_nexson_version(obj)
    _LOG.debug('detected version as ' + nvers)
    agents_obj = find_val_literal_meta_first(nex, 'ot:agents', nvers)
    if not agents_obj:
        agents_obj = add_literal_meta(nex, 'ot:agents', {'agent': []}, nvers)
    agents_list = agents_obj.setdefault('agent', [])
    wanted = agent['@id']
    for existing in agents_list:
        if existing.get('@id') == wanted:
            break
    else:
        # no agent with this @id yet; add it
        agents_list.append(agent)
    if add_agent_only:
        delete_same_agent_annotation(obj, annotation)
    else:
        replace_same_agent_annotation(obj, annotation)
def diagnose_repo_nexml2json(shard):
    """Optimistic test for Nexson version in a shard (tests first study found)"""
    with shard._index_lock:  # reaches into the shard's private index guard
        study_path = next(iter(shard.study_index.values()))[2]
        with codecs.open(study_path, mode='r', encoding='utf-8') as inp:
            blob = json.load(inp)
    from peyotl.nexson_syntax import detect_nexson_version
    return detect_nexson_version(blob)
def delete_annotation(obj, agent_id=None, annot_id=None, nexson_version=None):
    """Remove matching annotation(s) from obj's nexml-level annotation list.

    Matching is delegated to delete_annotation_from_annot_list via the
    agent_id/annot_id filters.
    """
    if nexson_version is None:
        nexson_version = detect_nexson_version(obj)
    nexml = get_nexml_el(obj)
    annotations = get_annotation_list(nexml, nexson_version)
    delete_annotation_from_annot_list(annotations,
                                      agent_id=agent_id,
                                      annot_id=annot_id)
def __init__(self, obj, logger):
    """Validate the NexSON blob `obj`, reporting problems through `logger`.

    Sets up bookkeeping tables, warns about unrecognized top-level keys,
    locates the nexml element, then runs schema-specific validation.
    """
    self._raw = obj
    self._nexml = None
    self._pyid_to_nexson_add = {}
    self._logger = logger
    self._repeated_id = False
    # otu/ott cross-reference tables, keyed by otus-group id
    self._otuid2ottid_byogid = {}
    self._ottid2otuid_list_byogid = {}
    self._dupottid_by_ogid_tree_id = {}
    # Collect any top-level key other than the two recognized spellings
    # of the nexml element, and report them all in a single warn event.
    uk = None
    for k in obj.keys():
        if k not in ['nexml', 'nex:nexml']:
            if uk is None:
                uk = []
            uk.append(k)
    if uk:
        uk.sort()
        self._warn_event(_NEXEL.TOP_LEVEL,
                         obj=obj,
                         err_type=gen_UnrecognizedKeyWarning,
                         anc=_EMPTY_TUPLE,
                         obj_nex_id=None,
                         key_list=uk)
    self._nexml = None
    try:
        self._nexml = get_nexml_el(obj)
        assert isinstance(self._nexml, dict)
    except:  # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit
        self._error_event(_NEXEL.TOP_LEVEL,
                          obj=obj,
                          err_type=gen_MissingMandatoryKeyWarning,
                          anc=_EMPTY_TUPLE,
                          obj_nex_id=None,
                          key_list=['nexml', ])
        return  ## EARLY EXIT!!
    self._nexson_id_to_obj = {}
    self._nexson_version = detect_nexson_version(obj)
    # attrs used only during validation; cleaned up in the finally clause
    self._otu_group_by_id = {}
    self._otu_by_otug = {}
    try:
        # a little duck-punching
        vc = _ValidationContext(self, logger)
        add_schema_attributes(vc, self._nexson_version)
        assert self._nexson_version[:3] in ('0.0', '1.0', '1.2')
        self._validate_nexml_obj(self._nexml, vc, obj)
    finally:
        # NOTE(review): if _ValidationContext() itself raises, `vc` is
        # unbound here and this finally masks the error with a NameError.
        vc.adaptor = None  # delete circular ref to help gc
        del vc
        del self._otu_group_by_id
        del self._otu_by_otug
def __init__(self, obj, logger):
    """Run NexSON validation of `obj`, emitting events via `logger`.

    Initializes lookup tables, flags unknown top-level keys, fetches the
    nexml element, and dispatches to version-specific validation.
    """
    self._raw = obj
    self._nexml = None
    self._pyid_to_nexson_add = {}
    self._logger = logger
    self._repeated_id = False
    # cross-reference tables between otu IDs and OTT IDs, per otus group
    self._otuid2ottid_byogid = {}
    self._ottid2otuid_list_byogid = {}
    self._dupottid_by_ogid_tree_id = {}
    # Gather top-level keys that are not a recognized nexml spelling so
    # they can be reported together.
    uk = None
    for k in obj.keys():
        if k not in ['nexml', 'nex:nexml']:
            if uk is None:
                uk = []
            uk.append(k)
    if uk:
        uk.sort()
        self._warn_event(_NEXEL.TOP_LEVEL,
                         obj=obj,
                         err_type=gen_UnrecognizedKeyWarning,
                         anc=_EMPTY_TUPLE,
                         obj_nex_id=None,
                         key_list=uk)
    self._nexml = None
    try:
        self._nexml = get_nexml_el(obj)
        assert isinstance(self._nexml, dict)
    except:  # NOTE(review): bare except is overly broad
        self._error_event(_NEXEL.TOP_LEVEL,
                          obj=obj,
                          err_type=gen_MissingMandatoryKeyWarning,
                          anc=_EMPTY_TUPLE,
                          obj_nex_id=None,
                          key_list=['nexml', ])
        return  ## EARLY EXIT!!
    self._nexson_id_to_obj = {}
    self._nexson_version = detect_nexson_version(obj)
    # validation-only attrs; removed again in the finally clause
    self._otu_group_by_id = {}
    self._otu_by_otug = {}
    try:
        # a little duck-punching
        vc = _ValidationContext(self, logger)
        add_schema_attributes(vc, self._nexson_version)
        assert self._nexson_version[:3] in ('0.0', '1.0', '1.2')
        self._validate_nexml_obj(self._nexml, vc, obj)
    finally:
        # NOTE(review): `vc` is unbound here when _ValidationContext()
        # raises, so this cleanup would itself raise NameError.
        vc.adaptor = None  # delete circular ref to help gc
        del vc
        del self._otu_group_by_id
        del self._otu_by_otug
def testTreesCulledNonmatcingConvViaPSV0(self):
    """Verify that the culling does not break the conversion to other forms of NexSON"""
    study = pathmap.nexson_obj('9/v1.2.json')
    self.assertEqual(len(extract_tree_nexson(study, tree_id=None)), 2)
    schema = PhyloSchema('nexson',
                         content='tree',
                         content_id='tree2',
                         version='0.0.0',
                         cull_nonmatching='true')
    serialized = schema.serialize(study)
    # culling mutates the source object: only tree2 should remain
    remaining = extract_tree_nexson(study, tree_id=None)
    self.assertEqual(len(remaining), 1)
    self.assertEqual(remaining[0][0], 'tree2')
    self.assertTrue(serialized.startswith('{'))  # pylint: disable=E1103
    round_tripped = json.loads(serialized)
    self.assertEqual(detect_nexson_version(round_tripped), '0.0.0')
    remaining = extract_tree_nexson(round_tripped, tree_id=None)
    self.assertEqual(len(remaining), 1)
    self.assertEqual(remaining[0][0], 'tree2')
def testTreesCulledNonmatcingConvViaPSV0(self):
    '''Verify that the culling does not break the conversion to other forms of NexSON'''
    blob = pathmap.nexson_obj('9/v1.2.json')
    self.assertEqual(len(extract_tree_nexson(blob, tree_id=None)), 2)
    phylo_schema = PhyloSchema('nexson',
                               content='tree',
                               content_id='tree2',
                               version='0.0.0',
                               cull_nonmatching='true')
    out = phylo_schema.serialize(blob)
    # serialization with culling leaves only tree2 in the source blob
    kept = extract_tree_nexson(blob, tree_id=None)
    self.assertEqual(len(kept), 1)
    self.assertEqual(kept[0][0], 'tree2')
    self.assertTrue(out.startswith('{'))  # pylint: disable=E1103
    reparsed = json.loads(out)
    self.assertEqual(detect_nexson_version(reparsed), '0.0.0')
    kept = extract_tree_nexson(reparsed, tree_id=None)
    self.assertEqual(len(kept), 1)
    self.assertEqual(kept[0][0], 'tree2')
def create_validation_adaptor(obj, logger, **kwargs):
    """Factory: return the validation adaptor matching obj's NexSON version.

    Falls back to the (most lenient) BadgerFish adaptor when version
    detection fails; raises NotImplementedError for unsupported versions.
    """
    try:
        nexson_version = detect_nexson_version(obj)
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. The fallback itself is deliberate:
        # undetectable blobs go to the most lenient adaptor.
        return BadgerFishValidationAdaptor(obj, logger, **kwargs)
    if _is_by_id_hbf(nexson_version):
        return ByIdHBFValidationAdaptor(obj, logger, **kwargs)
    elif _is_badgerfish_version(nexson_version):
        return BadgerFishValidationAdaptor(obj, logger, **kwargs)
    elif _is_direct_hbf(nexson_version):
        return DirectHBFValidationAdaptor(obj, logger, **kwargs)
    raise NotImplementedError('nexml2json version {v}'.format(v=nexson_version))
def __init__(self, filepath='', nexson=None):
    """Wrap a NexSON blob (given directly or loaded from `filepath`).

    The blob is converted in place to HBF 1.2 when it is in any other
    NexSON version.  Raises ValueError when neither argument is supplied.
    """
    self.filepath = filepath
    if nexson is not None:
        self._nexson = nexson
    else:
        if not filepath:
            raise ValueError('Either a filepath or nexson argument must be provided')
        self._nexson = read_as_json(self.filepath)
    if detect_nexson_version(self._nexson) != BY_ID_HONEY_BADGERFISH:
        _LOG.debug('NexsonProxy converting to hbf1.2')
        convert_nexson_format(self._nexson, BY_ID_HONEY_BADGERFISH)
    self._nexml_el = get_nexml_el(self._nexson)
    self._otu_cache = {}
    self._tree_cache = {}
    self._wr = None
def gen_otu_dict(nex_obj, nexson_version=None):
    '''Takes a NexSON object and returns a dict of otu_id -> otu_obj

    Handles both the by-id (HBF 1.2) form and the list-based forms.
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(nex_obj)
    if _is_by_id_hbf(nexson_version):
        otus = nex_obj['nexml']['otusById']
        # BUG FIX: the single-group branch used `otus.value()[0]['otuById']`
        # -- `.value()` is a typo for `.values()`, and dict views are not
        # indexable on Python 3.  Merging every group into one fresh dict
        # handles 0, 1, or many otus groups uniformly.
        d = {}
        for v in otus.values():
            d.update(v['otuById'])
        return d
    o_dict = {}
    # NOTE(review): this reads nex_obj.get('otus'), not the nexml element's
    # 'otus' list -- confirm callers pass the shape this expects.
    for ob in nex_obj.get('otus', []):
        for o in ob.get('otu', []):
            oid = o['@id']
            o_dict[oid] = o
    return o_dict
def gen_otu_dict(nex_obj, nexson_version=None):
    """Takes a NexSON object and returns a dict of otu_id -> otu_obj

    Handles both the by-id (HBF 1.2) form and the list-based forms.
    """
    if nexson_version is None:
        nexson_version = detect_nexson_version(nex_obj)
    if _is_by_id_hbf(nexson_version):
        otus = nex_obj["nexml"]["otusById"]
        # BUG FIX: the single-group branch used `otus.values()[0]` which
        # raises TypeError on Python 3 (dict views are not indexable).
        # Merging every group into one fresh dict handles 0, 1, or many
        # otus groups uniformly.
        d = {}
        for v in otus.values():
            d.update(v["otuById"])
        return d
    o_dict = {}
    # NOTE(review): this reads nex_obj.get("otus"), not the nexml element's
    # "otus" list -- confirm callers pass the shape this expects.
    for ob in nex_obj.get("otus", []):
        for o in ob.get("otu", []):
            oid = o["@id"]
            o_dict[oid] = o
    return o_dict
def count_num_trees(nexson, nexson_version=None):
    """Return the total number of trees across all trees groups in nexson."""
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    nex = get_nexml_el(nexson)
    total = 0
    if _is_by_id_hbf(nexson_version):
        for group in nex.get('treesById', {}).values():
            total += len(group.get('treeById', {}))
    else:
        groups = nex.get('trees', [])
        if isinstance(groups, dict):
            # a lone trees group may appear as a bare dict
            groups = [groups]
        for group in groups:
            t = group.get('tree')
            # a non-list 'tree' value (including a missing one) counts as 1
            total += len(t) if isinstance(t, list) else 1
    return total
def count_num_trees(nexson, nexson_version=None):
    '''Returns the number of trees summed across all tree groups.
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    nexml = get_nexml_el(nexson)
    counts = []
    if _is_by_id_hbf(nexson_version):
        counts = [len(g.get('treeById', {}))
                  for g in nexml.get('treesById', {}).values()]
    else:
        raw_groups = nexml.get('trees', [])
        if isinstance(raw_groups, dict):
            # normalize a single bare group to a one-element list
            raw_groups = [raw_groups]
        for grp in raw_groups:
            trees = grp.get('tree')
            # a non-list 'tree' value (including a missing one) counts as 1
            counts.append(len(trees) if isinstance(trees, list) else 1)
    return sum(counts)
def iter_otus(nexson, nexson_version=None):
    '''generator over all otus in all otus group elements.
    yields a tuple of 3 items:
        otus group ID,
        otu ID,
        the otu obj
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(nexson_version):
        # BUG FIX: was `convert_nexson_format(nexson_blob, ...)` --
        # `nexson_blob` is not defined in this scope (NameError); the blob
        # being iterated is `nexson`.
        convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)  # TODO shouldn't modify...
    # fetch the nexml element only after any in-place conversion, so we
    # read the converted structure (matches the other iter_otus variant)
    nex = get_nexml_el(nexson)
    otus_group_by_id = nex['otusById']
    group_order = nex.get('^ot:otusElementOrder', [])
    if len(group_order) < len(otus_group_by_id):
        group_order = list(otus_group_by_id.keys())
        group_order.sort()
    for otus_group_id in group_order:
        otus_group = otus_group_by_id[otus_group_id]
        otu_by_id = otus_group['otuById']
        ti_order = list(otu_by_id.keys())
        for otu_id in ti_order:
            otu = otu_by_id[otu_id]
            yield otus_group_id, otu_id, otu
def __init__(self, obj, logger, **kwargs):
    """Validate the NexSON blob `obj`, reporting problems through `logger`.

    Accepts `max_num_trees_per_study` in kwargs; when set, studies with
    more trees than the limit produce a MaxSizeExceeded error event.
    """
    self._raw = obj
    self._nexml = None
    self._pyid_to_nexson_add = {}
    self._logger = logger
    self._repeated_id = False
    # otu/ott cross-reference tables, keyed by otus-group id
    self._otuid2ottid_byogid = {}
    self._ottid2otuid_list_byogid = {}
    self._dupottid_by_ogid_tree_id = {}
    self._max_num_trees_per_study = kwargs.get('max_num_trees_per_study')
    # Collect any top-level key other than the two recognized spellings
    # of the nexml element, and report them all in one warn event.
    uk = None
    for k in obj.keys():
        if k not in ['nexml', 'nex:nexml']:
            if uk is None:
                uk = []
            uk.append(k)
    if uk:
        uk.sort()
        self._warn_event(_NEXEL.TOP_LEVEL,
                         obj=obj,
                         err_type=gen_UnrecognizedKeyWarning,
                         anc=_EMPTY_TUPLE,
                         obj_nex_id=None,
                         key_list=uk)
    self._nexml = None
    try:
        self._nexml = get_nexml_el(obj)
        assert isinstance(self._nexml, dict)
    except:  # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit
        self._error_event(_NEXEL.TOP_LEVEL,
                          obj=obj,
                          err_type=gen_MissingMandatoryKeyWarning,
                          anc=_EMPTY_TUPLE,
                          obj_nex_id=None,
                          key_list=['nexml', ])
        return  # EARLY EXIT!!
    self._nexson_id_to_obj = {}
    self._nexson_version = detect_nexson_version(obj)
    # attrs used only during validation; cleaned up in the outer finally
    self._otu_group_by_id = {}
    self._otu_by_otug = {}
    try:
        # a little duck-punching
        vc = _ValidationContext(self, logger)
        try:
            add_schema_attributes(vc, self._nexson_version)
            assert self._nexson_version[:3] in ('0.0', '1.0', '1.2')
            self._validate_nexml_obj(self._nexml, vc, obj)
            if self._max_num_trees_per_study is not None:
                nt = count_num_trees(self._raw)
                if nt > self._max_num_trees_per_study:
                    m = '{f:d} trees found, but a limit of {m:d} trees per nexson is being enforced'
                    m = m.format(f=nt, m=self._max_num_trees_per_study)
                    self._error_event(_NEXEL.TOP_LEVEL,
                                      obj=obj,
                                      err_type=gen_MaxSizeExceededWarning,
                                      anc=_EMPTY_TUPLE,
                                      obj_nex_id=None,
                                      message=m)
                    return  # EARLY EXIT!!
        finally:
            vc.adaptor = None  # delete circular ref to help gc
            del vc
    finally:
        del self._otu_group_by_id
        del self._otu_by_otug
def __init__(self, obj, logger, **kwargs):
    """Run NexSON validation of `obj`, emitting events via `logger`.

    Honors `max_num_trees_per_study` from kwargs: exceeding it triggers a
    MaxSizeExceeded error event and an early return.
    """
    self._raw = obj
    self._nexml = None
    self._pyid_to_nexson_add = {}
    self._logger = logger
    self._repeated_id = False
    # cross-reference tables between otu IDs and OTT IDs, per otus group
    self._otuid2ottid_byogid = {}
    self._ottid2otuid_list_byogid = {}
    self._dupottid_by_ogid_tree_id = {}
    self._max_num_trees_per_study = kwargs.get('max_num_trees_per_study')
    # Gather unrecognized top-level keys so they can be reported together.
    uk = None
    for k in obj.keys():
        if k not in ['nexml', 'nex:nexml']:
            if uk is None:
                uk = []
            uk.append(k)
    if uk:
        uk.sort()
        self._warn_event(_NEXEL.TOP_LEVEL,
                         obj=obj,
                         err_type=gen_UnrecognizedKeyWarning,
                         anc=_EMPTY_TUPLE,
                         obj_nex_id=None,
                         key_list=uk)
    self._nexml = None
    try:
        self._nexml = get_nexml_el(obj)
        assert isinstance(self._nexml, dict)
    except:  # NOTE(review): bare except is overly broad
        self._error_event(_NEXEL.TOP_LEVEL,
                          obj=obj,
                          err_type=gen_MissingMandatoryKeyWarning,
                          anc=_EMPTY_TUPLE,
                          obj_nex_id=None,
                          key_list=['nexml', ])
        return  ## EARLY EXIT!!
    self._nexson_id_to_obj = {}
    self._nexson_version = detect_nexson_version(obj)
    # validation-only attrs; removed again in the finally clause
    self._otu_group_by_id = {}
    self._otu_by_otug = {}
    try:
        # a little duck-punching
        vc = _ValidationContext(self, logger)
        add_schema_attributes(vc, self._nexson_version)
        assert self._nexson_version[:3] in ('0.0', '1.0', '1.2')
        self._validate_nexml_obj(self._nexml, vc, obj)
        if self._max_num_trees_per_study is not None:
            nt = count_num_trees(self._raw)
            if nt > self._max_num_trees_per_study:
                m = '{f:d} trees found, but a limit of {m:d} trees per nexson is being enforced'
                m = m.format(f=nt, m=self._max_num_trees_per_study)
                self._error_event(_NEXEL.TOP_LEVEL,
                                  obj=obj,
                                  err_type=gen_MaxSizeExceededWarning,
                                  anc=_EMPTY_TUPLE,
                                  obj_nex_id=None,
                                  message=m)
                return  ## EARLY EXIT!!
    finally:
        # NOTE(review): `vc` is unbound here when _ValidationContext()
        # raises, so this cleanup would itself raise NameError.
        vc.adaptor = None  # delete circular ref to help gc
        del vc
        del self._otu_group_by_id
        del self._otu_by_otug
def testDetectVersion(self):
    """The bad_version fixture should report the version string it declares."""
    blob = pathmap.nexson_obj('invalid/bad_version.json.input')
    self.assertEqual(detect_nexson_version(blob), '1.3.1')
def merge_otus_and_trees(nexson_blob):
    '''Takes a nexson object:
    1. merges trees elements 2 - # trees into the first trees element.,
    2. merges otus elements 2 - # otus into the first otus element.
    3. if there is no ot:originalLabel field for any otu, it sets that
       field based on @label and deletes @label
    4. merges an otu elements using the rule:
        A. treat (ottId, originalLabel) as a key
        B. If otu objects in subsequent trees match originalLabel and have
           a matching or absent ot:ottId, then they are merged into the
           same OTUs (however see C)
        C. No two leaves of a tree may share an otu (though otu should be
           shared across different trees). It is important that each leaf
           node be mapped to a distinct OTU. Otherwise there will be no way
           of separating them during OTU mapping. We do this indirectly by
           assuring that no two otu objects in the same otus object get
           merged with each other (or to a common object)
    5. correct object references to deleted entities.

    This function is used to patch up NexSONs created by multiple imports,
    hence the substitution of '@label' for 'ot:originalLabel'. Ids are
    arbitrary for imports from non-nexml tools, so matching is done based
    on names. This should mimic the behavior of the analysis tools that
    produced the trees (for most/all such tools unique names constitute
    unique OTUs).
    '''
    id_to_replace_id = {}
    # Work in HBF 1.2 form; convert back to the original version at the end.
    orig_version = detect_nexson_version(nexson_blob)
    convert_nexson_format(nexson_blob, BY_ID_HONEY_BADGERFISH)
    nexson = get_nexml_el(nexson_blob)
    otus_group_order = nexson.get('^ot:otusElementOrder', [])
    # (ott, orig) -> list of otu elements
    retained_mapped2otu = {}
    # orig -> list of otu elements
    retained_orig2otu = {}
    # For the first (entirely retained) group of otus:
    #   1. assure that originalLabel is filled in
    #   2. register the otu in retained_mapped2otu and retained_orig2otu
    # otu elements that have no label, originalLabel or ottId will not
    # be registered, so they'll never be matched.
    if len(otus_group_order) > 0:
        otus_group_by_id = nexson['otusById']
        retained_ogi = otus_group_order[0]
        retained_og = otus_group_by_id[retained_ogi]
        retained_og_otu = retained_og.setdefault('otuById', {})
        label_to_original_label_otu_by_id(retained_og_otu)
        for oid, otu in retained_og_otu.items():
            ottid = otu.get('^ot:ottId')
            orig = otu.get('^ot:originalLabel')
            key = (ottid, orig)
            if key != (None, None):
                m = retained_mapped2otu.setdefault(key, [])
                t = (oid, otu)
                m.append(t)
                if orig is not None:
                    m = retained_orig2otu.setdefault(orig, [])
                    m.append(t)
        # For each of the other otus elements, we:
        #   1. assure that originalLabel is filled in
        #   2. decide (for each otu) whether it will be added to
        #      retained_og or merged with an otu already in retained_og.
        #      In the latter case, we record old-id -> new-id in
        #      id_to_replace_id so references can be patched afterwards.
        for ogi in otus_group_order[1:]:
            og = otus_group_by_id[ogi]
            del otus_group_by_id[ogi]
            otu_by_id = og.get('otuById', {})
            label_to_original_label_otu_by_id(otu_by_id)
            # each retained otu may absorb at most one otu from this group
            # (see rule C in the docstring)
            used_matches = set()
            id_to_replace_id[ogi] = retained_ogi
            for oid, otu in otu_by_id.items():
                ottid = otu.get('^ot:ottId')
                orig = otu.get('^ot:originalLabel')
                key = (ottid, orig)
                if key == (None, None):
                    # unmatchable otu: carried over without merging
                    # NOTE(review): this stores into retained_og, not
                    # retained_og_otu ('otuById') -- confirm intended.
                    retained_og[oid] = otu
                else:
                    match_otu = None
                    # first try the exact (ottId, originalLabel) key...
                    mlist = retained_mapped2otu.get(key)
                    if mlist is not None:
                        for m in mlist:
                            if m[0] not in used_matches:
                                match_otu = m
                                break
                    if match_otu is None:
                        # ...then fall back to originalLabel alone
                        mlist = retained_orig2otu.get(orig, [])
                        for m in mlist:
                            if m[0] not in used_matches:
                                match_otu = m
                                break
                    if match_otu is not None:
                        id_to_replace_id[oid] = match_otu[0]
                        used_matches.add(match_otu[0])
                        _merge_otu_do_not_fix_references(otu, match_otu[1])
                    else:
                        # no match: keep as a new otu in the retained group
                        assert oid not in retained_og_otu
                        retained_og_otu[oid] = otu
                        m = retained_mapped2otu.setdefault(key, [])
                        t = (oid, otu)
                        m.append(t)
                        if orig is not None:
                            m = retained_orig2otu.setdefault(orig, [])
                            m.append(t)
        nexson['^ot:otusElementOrder'] = [retained_ogi]
    # Move all of the tree elements to the first trees group.
    trees_group_order = nexson.get('^ot:treesElementOrder', [])
    if len(trees_group_order) > 0:
        trees_group_by_id = nexson['treesById']
        retained_tgi = trees_group_order[0]
        retained_tg = trees_group_by_id[retained_tgi]
        # NOTE(review): retained_ogi is only bound when there was at least
        # one otus group -- a blob with trees but no otus would NameError.
        retained_tg['@otus'] = retained_ogi
        retained_tg_tree_obj = retained_tg.get('treeById', {})
        for tgi in trees_group_order[1:]:
            tg = trees_group_by_id[tgi]
            del trees_group_by_id[tgi]
            id_to_replace_id[tgi] = retained_tgi
            retained_tg['^ot:treeElementOrder'].extend(tg['^ot:treeElementOrder'])
            for tid, tree_obj in tg.get('treeById', {}).items():
                retained_tg_tree_obj[tid] = tree_obj
        # Re-point node @otu references at the merged otu ids.
        for tree_obj in retained_tg_tree_obj.values():
            for node in tree_obj.get('nodeById', {}).values():
                o = node.get('@otu')
                if o is not None:
                    r = id_to_replace_id.get(o)
                    if r is not None:
                        node['@otu'] = r
        nexson['^ot:treesElementOrder'] = [retained_tgi]
    # Fix remaining references (meta/annotations) to the deleted entities.
    replace_entity_references_in_meta_and_annotations(nexson, id_to_replace_id)
    convert_nexson_format(nexson_blob, orig_version)
    return nexson_blob
def merge_otus_and_trees(nexson_blob):
    '''Takes a nexson object:
    1. merges trees elements 2 - # trees into the first trees element.,
    2. merges otus elements 2 - # otus into the first otus element.
    3. if there is no ot:originalLabel field for any otu, it sets that
       field based on @label and deletes @label
    4. merges an otu elements using the rule:
        A. treat (ottId, originalLabel) as a key
        B. If otu objects in subsequent trees match originalLabel and have
           a matching or absent ot:ottId, then they are merged into the
           same OTUs (however see C)
        C. No two leaves of a tree may share an otu (though otu should be
           shared across different trees). It is important that each leaf
           node be mapped to a distinct OTU. Otherwise there will be no way
           of separating them during OTU mapping. We do this indirectly by
           assuring that no two otu objects in the same otus object get
           merged with each other (or to a common object)
    5. correct object references to deleted entities.

    This function is used to patch up NexSONs created by multiple imports,
    hence the substitution of '@label' for 'ot:originalLabel'. Ids are
    arbitrary for imports from non-nexml tools, so matching is done based
    on names. This should mimic the behavior of the analysis tools that
    produced the trees (for most/all such tools unique names constitute
    unique OTUs).
    '''
    id_to_replace_id = {}
    # Normalize to HBF 1.2; the original version is restored before return.
    orig_version = detect_nexson_version(nexson_blob)
    convert_nexson_format(nexson_blob, BY_ID_HONEY_BADGERFISH)
    nexson = get_nexml_el(nexson_blob)
    otus_group_order = nexson.get('^ot:otusElementOrder', [])
    # (ott, orig) -> list of otu elements
    retained_mapped2otu = {}
    # orig -> list of otu elements
    retained_orig2otu = {}
    # For the first (entirely retained) group of otus:
    #   1. assure that originalLabel is filled in
    #   2. register the otu in retained_mapped2otu and retained_orig2otu
    # otu elements that have no label, originalLabel or ottId will not
    # be registered, so they'll never be matched.
    if len(otus_group_order) > 0:
        otus_group_by_id = nexson['otusById']
        retained_ogi = otus_group_order[0]
        retained_og = otus_group_by_id[retained_ogi]
        retained_og_otu = retained_og.setdefault('otuById', {})
        label_to_original_label_otu_by_id(retained_og_otu)
        for oid, otu in retained_og_otu.items():
            ottid = otu.get('^ot:ottId')
            orig = otu.get('^ot:originalLabel')
            key = (ottid, orig)
            if key != (None, None):
                m = retained_mapped2otu.setdefault(key, [])
                t = (oid, otu)
                m.append(t)
                if orig is not None:
                    m = retained_orig2otu.setdefault(orig, [])
                    m.append(t)
        # For each of the other otus elements:
        #   1. assure that originalLabel is filled in
        #   2. decide (for each otu) whether it is added to retained_og or
        #      merged with an otu already there; merges are recorded as
        #      old-id -> new-id in id_to_replace_id for later patching.
        for ogi in otus_group_order[1:]:
            og = otus_group_by_id[ogi]
            del otus_group_by_id[ogi]
            otu_by_id = og.get('otuById', {})
            label_to_original_label_otu_by_id(otu_by_id)
            # a retained otu may absorb only one otu from this group
            # (docstring rule C)
            used_matches = set()
            id_to_replace_id[ogi] = retained_ogi
            for oid, otu in otu_by_id.items():
                ottid = otu.get('^ot:ottId')
                orig = otu.get('^ot:originalLabel')
                key = (ottid, orig)
                if key == (None, None):
                    # unmatchable otu is carried over unmerged
                    # NOTE(review): written into retained_og rather than
                    # retained_og_otu ('otuById') -- confirm intended.
                    retained_og[oid] = otu
                else:
                    match_otu = None
                    # prefer an exact (ottId, originalLabel) match...
                    mlist = retained_mapped2otu.get(key)
                    if mlist is not None:
                        for m in mlist:
                            if m[0] not in used_matches:
                                match_otu = m
                                break
                    if match_otu is None:
                        # ...otherwise match on originalLabel alone
                        mlist = retained_orig2otu.get(orig, [])
                        for m in mlist:
                            if m[0] not in used_matches:
                                match_otu = m
                                break
                    if match_otu is not None:
                        id_to_replace_id[oid] = match_otu[0]
                        used_matches.add(match_otu[0])
                        _merge_otu_do_not_fix_references(otu, match_otu[1])
                    else:
                        # unmatched: register as a new otu of the retained group
                        assert oid not in retained_og_otu
                        retained_og_otu[oid] = otu
                        m = retained_mapped2otu.setdefault(key, [])
                        t = (oid, otu)
                        m.append(t)
                        if orig is not None:
                            m = retained_orig2otu.setdefault(orig, [])
                            m.append(t)
        nexson['^ot:otusElementOrder'] = [retained_ogi]
    # Move all of the tree elements to the first trees group.
    trees_group_order = nexson.get('^ot:treesElementOrder', [])
    if len(trees_group_order) > 0:
        trees_group_by_id = nexson['treesById']
        retained_tgi = trees_group_order[0]
        retained_tg = trees_group_by_id[retained_tgi]
        # NOTE(review): retained_ogi is unbound when no otus group exists;
        # trees-without-otus input would raise NameError here.
        retained_tg['@otus'] = retained_ogi
        retained_tg_tree_obj = retained_tg.get('treeById', {})
        for tgi in trees_group_order[1:]:
            tg = trees_group_by_id[tgi]
            del trees_group_by_id[tgi]
            id_to_replace_id[tgi] = retained_tgi
            retained_tg['^ot:treeElementOrder'].extend(tg['^ot:treeElementOrder'])
            for tid, tree_obj in tg.get('treeById', {}).items():
                retained_tg_tree_obj[tid] = tree_obj
        # Redirect node @otu references to the surviving otu ids.
        for tree_obj in retained_tg_tree_obj.values():
            for node in tree_obj.get('nodeById', {}).values():
                o = node.get('@otu')
                if o is not None:
                    r = id_to_replace_id.get(o)
                    if r is not None:
                        node['@otu'] = r
        nexson['^ot:treesElementOrder'] = [retained_tgi]
    # Patch any remaining meta/annotation references to deleted entities.
    replace_entity_references_in_meta_and_annotations(nexson, id_to_replace_id)
    convert_nexson_format(nexson_blob, orig_version)
    return nexson_blob