def testCanConvert(self):
     '''Check that merge_otus_and_trees turns the merge input fixture
     into the expected merged NexSON blob.
     '''
     raw_input_blob = pathmap.nexson_obj('merge/merge-input.v1.2.json')
     raw_expected_blob = pathmap.nexson_obj('merge/merge-expected.v1.2.json')
     expected_blob = sort_arbitrarily_ordered_nexson(raw_expected_blob)
     input_blob = sort_arbitrarily_ordered_nexson(raw_input_blob)
     # Sanity check: before merging, the fixture must differ from the
     # expected result; otherwise the final comparison proves nothing.
     self.assertNotEqual(input_blob, expected_blob)
     # The return value is ignored; the comparison below is made on
     # input_blob itself.
     merge_otus_and_trees(input_blob)
     equal_blob_check(self, '', input_blob, expected_blob)
Example #2
0
 def testConvertHBF1_2toHBF1_0(self):
     '''For every round-trip directory that provides a v1.2/v1.0 pair,
     convert the v1.2 blob to DIRECT_HONEY_BADGERFISH and compare it
     with the stored v1.0 blob.
     '''
     for rt_dir in RT_DIRS:
         source_blob, expected_blob = _get_pair(rt_dir, 'v1.2.json', 'v1.0.json')
         # _get_pair signals "no such pair" with a None first element.
         if source_blob is not None:
             converted = convert_nexson_format(source_blob, DIRECT_HONEY_BADGERFISH)
             # Sort both sides so arbitrarily ordered items compare equal.
             for blob in (expected_blob, converted):
                 sort_arbitrarily_ordered_nexson(blob)
             equal_blob_check(self, '', converted, expected_blob)
Example #3
0
 def testConvertHBF1_2toBF(self):
     '''For every round-trip directory that provides a v1.2/v0.0 pair,
     convert the v1.2 blob to BADGER_FISH_NEXSON_VERSION and compare it
     with the stored v0.0 blob.
     '''
     for rt_dir in RT_DIRS:
         source_blob, expected_blob = _get_pair(rt_dir, 'v1.2.json', 'v0.0.json')
         # _get_pair signals "no such pair" with a None first element.
         if source_blob is not None:
             converted = convert_nexson_format(source_blob, BADGER_FISH_NEXSON_VERSION)
             # Sort both sides so arbitrarily ordered items compare equal.
             for blob in (expected_blob, converted):
                 sort_arbitrarily_ordered_nexson(blob)
             equal_blob_check(self, '', converted, expected_blob)
Example #4
0
def get_ot_study_info_from_treebase_nexml(src=None,
                                          nexml_content=None,
                                          encoding=u'utf8',
                                          nexson_syntax_version=DEFAULT_NEXSON_VERSION,
                                          merge_blocks=True,
                                          sort_arbitrary=False):
    '''Normalize TreeBase-specific metadata into the locations where
    Open Tree of Life software expects it.

    See get_ot_study_info_from_nexml for the explanation of the src,
    nexml_content, encoding, and nexson_syntax_version arguments.
    If merge_blocks is True then peyotl.manip.merge_otus_and_trees is
    applied to the blob after normalization.
    If sort_arbitrary is True the arbitrarily ordered items of the result
    are sorted (via convert_nexson_format or sort_arbitrarily_ordered_nexson).

    Actions to "normalize" TreeBase objects to ot Nexson
        1. _simplify_all_meta_by_id_del is applied to every level
            (presumably dropping the meta id for any meta item that has
            only a value and an id -- the helper is not shown here)
        2. throw away rdfs:isDefinedBy
        3. otu @label -> otu ^ot:originalLabel
        4. ^tb:identifier.taxon, ^tb:identifier.taxonVariant and some skos:closeMatch
            fields to ^ot:taxonLink
        5. remove "@xml:base"
        6. coerce edge lengths to native types

    Returns the normalized blob in the requested nexson_syntax_version.
    '''
    #pylint: disable=R0915
    raw = get_ot_study_info_from_nexml(src=src,
                                       nexml_content=nexml_content,
                                       encoding=encoding,
                                       nexson_syntax_version=BY_ID_HONEY_BADGERFISH)
    nexml = raw['nexml']
    SKOS_ALT_LABEL = '^skos:altLabel'
    SKOS_CLOSE_MATCH = '^skos:closeMatch'
    # URL prefix -> key used inside ^ot:taxonLink for identifiers with that prefix
    strippable_pre = {
        'http://www.ubio.org/authority/metadata.php?lsid=urn:lsid:ubio.org:namebank:': '@ubio',
        'http://purl.uniprot.org/taxonomy/': '@uniprot',
    }
    moveable2taxon_link = {"^tb:identifier.taxon": '@tb:identifier.taxon',
                           "^tb:identifier.taxonVariant": '@tb:identifier.taxonVariant', }
    to_del = ['^rdfs:isDefinedBy', '@xml:base']

    def _drop_unwanted(obj):
        '''Remove every key listed in to_del from obj (if present).'''
        for tag in to_del:
            if tag in obj:
                del obj[tag]

    def _match_prefix(href):
        '''Return (taxon_link_key, prefix, identifier) if href starts with a
        known prefix from strippable_pre; otherwise return None.
        '''
        if not href:
            return None
        try:
            for prefix, link_key in strippable_pre.items():
                if href.startswith(prefix):
                    return link_key, prefix, href[len(prefix):]
        except (AttributeError, TypeError):
            # href is not a string-like object; treat it as "no match"
            pass
        return None

    _drop_unwanted(nexml)
    _simplify_all_meta_by_id_del(nexml)
    _otu2label = {}
    prefix_map = {}
    # compose dataDeposit
    nexid = nexml['@id']
    tb_url = 'http://purl.org/phylo/treebase/phylows/study/TB2:' + nexid
    nexml['^ot:dataDeposit'] = {'@href': tb_url}
    # compose publication reference, publication link and year
    bd = nexml.get("^dcterms:bibliographicCitation")
    if bd:
        nexml['^ot:studyPublicationReference'] = bd
    doi = nexml.get('^prism:doi')
    if doi:
        nexml['^ot:studyPublication'] = {'@href': doi}
    year = nexml.get('^prism:publicationDate')
    if year:
        try:
            nexml['^ot:studyYear'] = int(year)
        except (TypeError, ValueError):
            # best effort: a publication date that is not a plain integer
            # simply does not produce an ^ot:studyYear
            pass
    #
    for otus in nexml['otusById'].values():
        _drop_unwanted(otus)
        _simplify_all_meta_by_id_del(otus)
        for oid, otu in otus['otuById'].items():
            _drop_unwanted(otu)
            _simplify_all_meta_by_id_del(otu)
            label = otu['@label']
            _otu2label[oid] = label
            otu['^ot:originalLabel'] = label
            del otu['@label']
            alt_label = otu.get(SKOS_ALT_LABEL)
            if alt_label is not None:
                # an existing ^ot:altLabel wins over the skos one
                if otu.get('^ot:altLabel') is None:
                    otu['^ot:altLabel'] = alt_label
                del otu[SKOS_ALT_LABEL]
            tl = {}
            scm = otu.get(SKOS_CLOSE_MATCH)
            if isinstance(scm, dict):
                hit = _match_prefix(scm.get('@href'))
                if hit is not None:
                    link_key, prefix, ident = hit
                    tl[link_key] = ident
                    prefix_map[link_key] = prefix
                    del otu[SKOS_CLOSE_MATCH]
            elif isinstance(scm, list) and scm:
                # Elements that could not be moved to the taxon link.
                unmatched = []
                for el in scm:
                    if not isinstance(el, dict):
                        # Unexpected element type: keep it untouched rather
                        # than aborting the scan and losing later elements.
                        unmatched.append(el)
                        continue
                    href = el.get('@href')
                    if not href:
                        # NOTE(review): elements without an @href are dropped,
                        # as in the previous implementation -- confirm intent.
                        continue
                    hit = _match_prefix(href)
                    if hit is None:
                        unmatched.append(el)
                    else:
                        link_key, prefix, ident = hit
                        tl[link_key] = ident
                        prefix_map[link_key] = prefix
                if len(unmatched) < len(scm):
                    if len(unmatched) > 1:
                        otu[SKOS_CLOSE_MATCH] = unmatched
                    elif len(unmatched) == 1:
                        otu[SKOS_CLOSE_MATCH] = unmatched[0]
                    else:
                        del otu[SKOS_CLOSE_MATCH]
            # any other type of closeMatch value is left untouched
            for k, t in moveable2taxon_link.items():
                moved = otu.get(k)
                if moved:
                    tl[t] = moved
                    del otu[k]
            if tl:
                otu['^ot:taxonLink'] = tl
    for trees in nexml['treesById'].values():
        _drop_unwanted(trees)
        _simplify_all_meta_by_id_del(trees)
        for tree in trees['treeById'].values():
            _drop_unwanted(tree)
            _simplify_all_meta_by_id_del(tree)
            tt = tree.get('@xsi:type', 'nex:FloatTree')
            if tt.lower() == 'nex:inttree':
                e_len_coerce = int
            else:
                e_len_coerce = float
            for edge_d in tree['edgeBySourceId'].values():
                for edge in edge_d.values():
                    try:
                        edge['@length'] = e_len_coerce(edge['@length'])
                    except (KeyError, TypeError, ValueError, OverflowError):
                        # best effort: edges with a missing or unparseable
                        # length are left as they are
                        pass
            for node in tree['nodeById'].values():
                nl = node.get('@label')
                if nl:
                    no = node.get('@otu')
                    # a node label that merely repeats its otu's original
                    # label is redundant
                    if no and _otu2label[no] == nl:
                        del node['@label']

    if prefix_map:
        nexml['^ot:taxonLinkPrefixes'] = prefix_map
    if merge_blocks:
        from peyotl.manip import merge_otus_and_trees
        merge_otus_and_trees(raw)
    if nexson_syntax_version != BY_ID_HONEY_BADGERFISH:
        # Use the returned object: other call sites treat the return value
        # of convert_nexson_format as the converted blob.
        raw = convert_nexson_format(raw,
                                    nexson_syntax_version,
                                    current_format=BY_ID_HONEY_BADGERFISH,
                                    sort_arbitrary=sort_arbitrary)
    elif sort_arbitrary:
        sort_arbitrarily_ordered_nexson(raw)
    return raw
Example #5
0
def _main():
    '''Command-line entry point of the NeXML/NexSON converter.

    Parses the command line, opens the input (and optional output) file as
    UTF-8, determines the conversion direction either from the -m/--mode
    option or by inspecting the first non-whitespace character of the input,
    converts the content and writes it as NeXML or JSON.

    Exits through sys.exit with a message when the options clash, a file
    cannot be opened, or the input does not look like NeXML or JSON.
    '''
    import argparse
    import codecs
    import json
    import os
    import sys
    _HELP_MESSAGE = '''NeXML/NexSON converter'''
    _EPILOG = '''UTF-8 encoding is used (for input and output).

Environmental variables used:
    NEXSON_INDENTATION_SETTING indentation in NexSON (default 0)
    NEXML_INDENTATION_SETTING indentation in NeXML (default is 0).
    NEXSON_LOGGING_LEVEL logging setting: NotSet, Debug, Warn, Info, Error
    NEXSON_LOGGING_FORMAT format string for logging messages.
'''
    parser = argparse.ArgumentParser(description=_HELP_MESSAGE,
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=_EPILOG)
    parser.add_argument("input", help="filepath to input")
    parser.add_argument("-o", "--output",
                        metavar="FILE",
                        required=False,
                        help="output filepath. Standard output is used if omitted.")
    parser.add_argument("-s", "--sort",
                        action="store_true",
                        default=False,
                        help="If specified, the arbitrarily ordered items will be sorted.")
    e_choices = ["nexml",
                 str(BADGER_FISH_NEXSON_VERSION),
                 str(DIRECT_HONEY_BADGERFISH),
                 str(BY_ID_HONEY_BADGERFISH),
                 "0.0",
                 "1.0",
                 "1.2",
                 "badgerfish"]
    e_choices.sort()
    e_help = 'output format. Valid choices are: "{c}". \
With "0.0" and "badgerfish" as aliases for "0.0.0", and \
"1.2" being an alias for the most recent version of honeybadgerfish \
(1.2.0). The verions "1.0.0" and its alias "1.0" refer to a \
version that uses the honeybadgefish syntax for meta elements, \
but maintained the direct object-mapping from NeXML of the \
badgerfish form of NexSON'.format(c='", "'.join(e_choices))
    parser.add_argument("-e", "--export",
                        metavar="FMT",
                        required=False,
                        choices=e_choices,
                        help=e_help)
    codes = 'xjb'
    parser.add_argument("-m", "--mode",
                        metavar="MODE",
                        required=False,
                        choices=[i + j for i in codes for j in codes],
                        help="A less precise way to specify a mapping. The \
                               m option is a two-letter code for {input}{output} \
                               The letters are x for NeXML, j for NexSON, \
                               and b for BadgerFish JSON version of NexML. \
                               The default behavior is to autodetect the format \
                               and convert JSON to NeXML or NeXML to NexSON.")
    args = parser.parse_args()
    inpfn = args.input
    outfn = args.output
    mode = args.mode
    export_format = args.export
    # Resolve the aliases accepted by --export to the full version strings.
    if export_format:
        if export_format.lower() in ["badgerfish", "0.0"]:
            export_format = str(BADGER_FISH_NEXSON_VERSION)
        elif export_format.lower() == "1.0":
            export_format = str(DIRECT_HONEY_BADGERFISH)
        elif export_format.lower() == "1.2":
            export_format = str(BY_ID_HONEY_BADGERFISH)
    if export_format is not None and mode is not None:
        lowered = export_format.lower()
        honey_badgerfish = [str(DIRECT_HONEY_BADGERFISH), str(BY_ID_HONEY_BADGERFISH)]
        # Output letter of the mode vs. the explicit export format:
        # b -> badgerfish only, x -> nexml only, j -> a honeybadgerfish version.
        clash = (mode.endswith('b') and (export_format != str(BADGER_FISH_NEXSON_VERSION))) \
            or (mode.endswith('x') and (lowered != "nexml")) \
            or (mode.endswith('j') and (lowered not in honey_badgerfish))
        if clash:
            sys.exit('export format {e} clashes with mode {m}. The mode option is not neeeded if the export option is used.'.format(e=export_format, m=mode))
    try:
        # Plain 'r': the 'U' flag is rejected by Python 3.11 and later.
        inp = codecs.open(inpfn, mode='r', encoding='utf-8')
    except (IOError, OSError):
        sys.exit('nexson_nexml: Could not open file "{fn}"\n'.format(fn=inpfn))
    out = None
    close_out = False
    try:
        if mode is None:
            # "{{" is an escaped literal brace for str.format
            bad_start_msg = 'nexson_nexml: First character of "{fn}" was not <, {{, or [\nInput does not appear to be NeXML or NexSON\n'.format(fn=inpfn)
            # Autodetect from the first non-whitespace character.
            # 'x*' = NeXML input, '*x' = some JSON input to be written as NeXML.
            while True:
                try:
                    chunk = inp.read(1)
                except UnicodeDecodeError:
                    sys.exit(bad_start_msg)
                if not chunk:
                    # end of file reached without any graphical character
                    sys.exit(bad_start_msg)
                stripped = chunk.strip()
                if not stripped:
                    continue
                first_graph_char = stripped[0]
                if first_graph_char == '<':
                    mode = 'x*'
                    break
                if first_graph_char in '{[':
                    mode = '*x'
                    break
                sys.exit(bad_start_msg)
            if export_format is None:
                if mode.endswith('*'):
                    export_format = str(DIRECT_HONEY_BADGERFISH)
                else:
                    export_format = "nexml"
            inp.seek(0)
        elif export_format is None:
            if mode.endswith('j'):
                export_format = str(DIRECT_HONEY_BADGERFISH)
            elif mode.endswith('b'):
                export_format = str(BADGER_FISH_NEXSON_VERSION)
            else:
                # argparse's choices guarantee the only remaining letter is x
                export_format = "nexml"

        if export_format == "nexml":
            indent_var = 'NEXML_INDENTATION_SETTING'
        else:
            indent_var = 'NEXSON_INDENTATION_SETTING'
        try:
            indentation = int(os.environ.get(indent_var, 0))
        except ValueError:
            sys.exit('nexson_nexml: {v} must be an integer\n'.format(v=indent_var))

        if outfn is not None:
            try:
                out = codecs.open(outfn, mode='w', encoding='utf-8')
            except (IOError, OSError):
                sys.exit('nexson_nexml: Could not open output filepath "{fn}"\n'.format(fn=outfn))
            close_out = True
        else:
            # On Python 3 the encoded bytes must go to the underlying binary
            # buffer; on Python 2 sys.stdout has no "buffer" attribute.
            out = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))

        if mode.startswith('x'):
            blob = get_ot_study_info_from_nexml(inp,
                                                nexson_syntax_version=export_format)
        else:
            blob = json.load(inp)
            if mode.startswith('*'):
                try:
                    n = get_nexml_el(blob)
                except Exception: #pylint: disable=W0703
                    # any failure to locate the element is reported below
                    n = None
                if not n or (not isinstance(n, dict)):
                    sys.exit('No top level "nex:nexml" element found. Document does not appear to be a JSON version of NeXML\n')
                mode = 'j' + mode[1]
        if args.sort:
            sort_arbitrarily_ordered_nexson(blob)
        if export_format == "nexml":
            if indentation > 0:
                indent = ' '*indentation
            else:
                indent = ''
            newline = '\n'
            write_obj_as_nexml(blob,
                               out,
                               addindent=indent,
                               newl=newline)
        else:
            if not mode.startswith('x'):
                # honor the -s/--sort flag instead of always sorting
                blob = convert_nexson_format(blob, export_format, sort_arbitrary=args.sort)
            write_as_json(blob, out, indent=indentation)
    finally:
        inp.close()
        if close_out:
            out.close()
        elif out is not None:
            out.flush()