Example #1
def get_translations(node):
    """Get all translations attached directly to this node.

    """
    assert node.tag in ['snc', 'subsnc', 'drv', 'subdrv', 'ekz', 'subart', 'art', 'bld']

    # a dict that defaults to empty list if that key isn't present
    translations = defaultdict(list)

    for trd_node in node.findall('trd'):
        language_code = trd_node.attrib['lng']
        foreign_word = flatten_node(trd_node)
        translations[language_code].append(foreign_word)

    for trdgrp_node in node.findall('trdgrp'):
        language_code = trdgrp_node.attrib['lng']

        for trd_node in trdgrp_node.findall('trd'):
            foreign_word = flatten_node(trd_node)
            if foreign_word.endswith(';'):
                foreign_word = foreign_word[:-1]

            translations[language_code].append(foreign_word)

    return translations
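
# Minimal usage sketch. It assumes `from collections import defaultdict` is in
# scope for get_translations above, and uses a much-simplified, hypothetical
# stand-in for the project's flatten_node helper (a plain text-joiner).
from collections import defaultdict
from xml.etree import ElementTree as etree

def flatten_node(node, skip_tags=()):
    # hypothetical stand-in: concatenate all text content
    return ''.join(node.itertext()).strip()

snc = etree.fromstring(
    '<snc>'
    '<trd lng="en">house</trd>'
    '<trdgrp lng="de"><trd>Haus;</trd> <trd>Heim</trd></trdgrp>'
    '</snc>'
)
print(dict(get_translations(snc)))
# -> {'en': ['house'], 'de': ['Haus', 'Heim']}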
Example #2
def flatten_example(ekz_node):
    """Get the contents of an <ekz>, discarding examples sources
    (<fnt>s). Since a series of examples are often written in the form
    'foo; bar; baz.' we also discard trailing full stops or
    semicolons.

    An example:

    <ekz>
      kion vi legas mia princo? <tld/>ojn, <tld/>ojn, <tld/>ojn
      <fnt>Z</fnt>!
    </ekz>
    (from vort.xml)

    <ekz>
      <tld/>o de reno
    </ekz>;
    (from ablaci.xml)

    <ekz>
      en via decembra numero trovi&gcirc;as sub la <tld/>o <ctl>&Scirc;erco kaj
      satiro</ctl> &leftquot;publika letero&rightquot;<fnt><aut>Reinhard
      F&ouml;&szlig;meier</aut>:
      <vrk>Netrafa adreso</vrk>, <bib>Monato</bib><lok>jaro 2002a,
      numero 2a, p. 7a</lok></fnt>.
    </ekz>
    (from rubrik.xml, mixing quote types)

    """
    # klr = klarigo = clarification, ideally we'd extract this
    # and format it appropriately on the frontend (TODO)
    # <fnt> is example attribution, which we ignore
    # <uzo> indicates the topic to which this example relates
    example = flatten_node(ekz_node,
                           skip_tags=['fnt', 'klr', 'uzo',
                                      'trd', 'trdgrp'])

    # remove trailing semicolon/full stop due to the examples being
    # written as a series
    if example.endswith(';') or example.endswith('.'):
        example = example[:-1]

    # if we didn't extract anything with letters in (e.g. only
    # references that we discarded), return an empty string
    if not re.search(u'[a-zĉĝĥĵŝ]', example,
                     flags=re.UNICODE | re.IGNORECASE):
        return ""

    source = None
    # there's probably only one <fnt>, but this loop is easy and robust
    for fnt_node in ekz_node.findall('fnt'):
        source = flatten_node(fnt_node)

    return (example, source)
Example #3
def get_words_from_kap(node):
    r"""Return a list of all the terms in a <kap>. Every term in a
    <kap> is an alternative spelling of the same term. A term is not
    necessarily a single word, since ReVo includes entries such as
    'brazila nukso'.

    <kap><ofc>*</ofc><tld/>o</kap>
    <kap>brazil<tld/>arbo, <var><kap>brazila <tld/>arbo</kap></var></kap>
    (from nuks.xml)

    The heavy lifting is done in flatten_node; all we do here is
    separate out terms and remove extraneous whitespace.

    Possible formats encountered:
    'foo'
    'foo, bar'
    'foo,\n   bar'
    '(n,p)-matrico' (the only term in ReVo with an internal comma)

    """
    flat_string = flatten_node(node, skip_tags=['ofc', 'fnt'])

    if flat_string == '(n,p)-matrico':
        words = ['(n,p)-matrico']
    else:
        words = flat_string.split(',')
    if len(words) > 1:
        for i in range(len(words)):
            # remove trailing/leading space and awkward newlines
            words[i] = clean_string(words[i])

    return words
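
# Minimal usage sketch with hypothetical stand-ins for the project's
# flatten_node and clean_string helpers: flatten_node here just joins text
# while skipping the given tags, and clean_string collapses whitespace.
from xml.etree import ElementTree as etree

def flatten_node(node, skip_tags=()):
    def walk(n):
        yield n.text or ''
        for child in n:
            if child.tag not in skip_tags:
                yield from walk(child)
            yield child.tail or ''
    return ''.join(walk(node))

def clean_string(s):
    return ' '.join(s.split())

kap = etree.fromstring(
    '<kap>brazilnukso,\n   <var><kap>brazila nukso</kap></var></kap>')
print(get_words_from_kap(kap))
# -> ['brazilnukso', 'brazila nukso']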
Example #4
    def add_reference_group(self, refgrp_node):
        # dif = difino, i.e. this word is defined elsewhere
        if refgrp_node.attrib.get('tip') == 'dif':
            for ref_node in refgrp_node.findall('ref'):
                self.see.append(flatten_node(ref_node))

        # vid = vidu ankaŭ ('see also')
        elif refgrp_node.attrib.get('tip') == 'vid':
            for ref_node in refgrp_node.findall('ref'):
                self.see_also.append(flatten_node(ref_node))

        # sin = sinonimo (synonym)
        elif refgrp_node.attrib.get('tip') == 'sin':
            for ref_node in refgrp_node.findall('ref'):
                self.synonyms.append(flatten_node(ref_node))

        # ant = antonimo (antonym)
        elif refgrp_node.attrib.get('tip') == 'ant':
            for ref_node in refgrp_node.findall('ref'):
                self.antonyms.append(flatten_node(ref_node))

        # super = supernocio (broader notion, i.e. hypernym)
        elif refgrp_node.attrib.get('tip') == 'super':
            for ref_node in refgrp_node.findall('ref'):
                self.supernotions.append(flatten_node(ref_node))

        # sub = subnocio (narrower notion, i.e. hyponym)
        elif refgrp_node.attrib.get('tip') == 'sub':
            for ref_node in refgrp_node.findall('ref'):
                self.subnotions.append(flatten_node(ref_node))

        # prt = parto de (part of)
        elif refgrp_node.attrib.get('tip') == 'prt':
            for ref_node in refgrp_node.findall('ref'):
                self.meronyms.append(flatten_node(ref_node))

        # malprt = malparto de, i.e. 'konsistas el' (consists of)
        elif refgrp_node.attrib.get('tip') == 'malprt':
            for ref_node in refgrp_node.findall('ref'):
                self.holonyms.append(flatten_node(ref_node))

        # hom=homonimo
        # (we ignore homonyms since we collect all the definitions together
        # so the cross-reference is unnecessary)
        elif refgrp_node.attrib.get('tip') == 'hom':
            pass

        # ignore unlabelled references
        elif refgrp_node.attrib.get('tip') is None:
            pass

        else:
            assert False, "Found an unknown reference type: %s" % refgrp_node.attrib.get('tip')
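
# For reference, the 'tip' values handled above, expressed as a table (a
# sketch only; the project itself keeps the explicit if/elif chain):
REF_TIP_TO_ATTR = {
    'dif': 'see',             # difino: this word is defined elsewhere
    'vid': 'see_also',        # vidu ankaŭ: see also
    'sin': 'synonyms',        # sinonimo
    'ant': 'antonyms',        # antonimo
    'super': 'supernotions',  # supernocio (hypernym)
    'sub': 'subnotions',      # subnocio (hyponym)
    'prt': 'meronyms',        # parto de: part of
    'malprt': 'holonyms',     # malparto de: 'consists of'
}
# 'hom' (homonyms) and a missing 'tip' are deliberately ignored; anything else
# trips the assertion. A table-driven version could simply do:
#     getattr(self, REF_TIP_TO_ATTR[tip]).append(flatten_node(ref_node))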
Example #5
def flatten_definition(dif_node):
    """Convert a definition node to a simple unicode string (this
    requires us to flatten it), and handle any references or
    clarifications we encounter.

    An example:

    <dif>
      <klr>(de <ref cel="polino.0o">polinomo</ref>)</klr>
      <ref tip="super" cel="nul0.0iganto.de_funkcio">Nuliganto</ref>
      de la responda <ref cel="funkci.polinoma0o">polinoma funkcio</ref>.
    </dif>
    (from radik.xml)

    """
    # skip examples, they're dealt with elsewhere
    definition = flatten_node(dif_node, skip_tags=['ekz'])

    # if this definition has examples, it ends with a colon not a full stop
    # but since we format examples separately, replace the colon
    if definition.endswith(':'):
        definition = definition[:-1].strip() + '.'

    return definition
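
# Small usage sketch on the radik.xml fragment from the docstring, with a
# simplified, hypothetical stand-in for flatten_node (joins text, skips the
# given tags, collapses whitespace; the real helper also expands <tld/> and
# handles entities).
from xml.etree import ElementTree as etree

def flatten_node(node, skip_tags=()):
    def walk(n):
        yield n.text or ''
        for child in n:
            if child.tag not in skip_tags:
                yield from walk(child)
            yield child.tail or ''
    return ' '.join(''.join(walk(node)).split())

dif = etree.fromstring(
    '<dif><klr>(de <ref cel="polino.0o">polinomo</ref>)</klr> '
    '<ref tip="super" cel="nul0.0iganto.de_funkcio">Nuliganto</ref> '
    'de la responda <ref cel="funkci.polinoma0o">polinoma funkcio</ref>.</dif>'
)
print(flatten_definition(dif))
# -> (de polinomo) Nuliganto de la responda polinoma funkcio.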
Example #6
            def find_ekz_translations(ekz_dct, node, flat_translations):
                #for trd in parse_vip.iter_tags(node, "ekz/trd|trdgrp"):
                def trd_iter(ekz_name, name):
                    return parse_vip.iter_tags(
                        node, "%(ekz_name)s/%(name)s" % locals())

                def trd_iters(ekz_name):
                    return trd_iter(ekz_name,
                                    "trd"), trd_iter(ekz_name, "trdgrp")

                for trd in itertools.chain(*(trd_iters("ekz") +
                                             trd_iters("bld"))):
                    ekz = trd.getparent()

                    if ekz in ekz_node_set:
                        continue
                    else:
                        ekz_node_set.add(ekz)

                    def make_orig_txt(ind_node):
                        return ', '.join(
                            rvut_words.get_words_from_kap(ind_node))

                    ind_node = ekz.find('ind')
                    if ind_node is None:
                        # compute orig_txt ourselves, collecting all tags until a trd or trdgrp appears
                        # anim.xml:
                        # <ekz>
                        #  <tld/>ita parolado<fnt>K</fnt>,
                        #  <trd lng="hu">lelkes besz&eacute;d</trd>
                        # </ekz>
                        ind_node = etree.Element("ind")
                        ind_node.text = ekz.text
                        for child in ekz:
                            if child.tag in ["trd", "trdgrp"]:
                                break
                            else:
                                child = copy.deepcopy(child)
                                ind_node.append(child)

                        tree.append(ind_node)
                        orig_txt = make_orig_txt(ind_node)
                        ind_node.getparent().remove(ind_node)
                    else:
                        orig_txt = make_orig_txt(ind_node)

                    for lang, tr_lst in get_count_translations(ekz).items():
                        # :REFACTOR:
                        lst = ekz_dct.setdefault(lang, [])

                        tr_lst = ", ".join(tr_lst)
                        ekz_txt = "<i><b>%(orig_txt)s</b>: %(tr_lst)s</i>" % locals(
                        )
                        lst.append(ekz_txt)

                #return

                # :TRICKY: some <trd> combine the translation itself with the original name (chiefly Latin) =>
                # only a <trd> can be like that, not a <trdgrp>, since that is a visible label ('tag' in English)
                # - not always true, see hel.xml!
                rest_translations = {}
                for trd in parse_vip.iter_tags(node, "trd"):
                    if trd not in used_tr_nodes:
                        par_node = trd.getparent()
                        if par_node.tag == "trdgrp":
                            lang = par_node.get("lng")

                            used_tr_nodes[par_node] = True
                        else:
                            lang = trd.get("lng")

                        foreign_word = rvut_flatten.flatten_node(trd)
                        if foreign_word:
                            # :REFACTOR:
                            rest_translations.setdefault(
                                lang, []).append(foreign_word)
                        # :REFACTOR:
                        used_tr_nodes[trd] = True
                append_translations(flat_translations, rest_translations)
                append_translations(national_headwords, rest_translations)
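
# A much-simplified, self-contained illustration of the idea behind
# find_ekz_translations above (the real code relies on project helpers such as
# parse_vip.iter_tags, rvut_words.get_words_from_kap and
# rvut_flatten.flatten_node, and synthesises an <ind> node when one is missing):
from xml.etree import ElementTree as etree

ekz = etree.fromstring(
    '<ekz>'
    '<ind>brazila nukso</ind>, '
    '<trd lng="en">Brazil nut</trd>'
    '<trdgrp lng="de"><trd>Paranuss</trd></trdgrp>'
    '</ekz>'
)

orig_txt = ''.join(ekz.find('ind').itertext())

translations = {}
for trd in ekz.iter('trd'):
    # the language code sits on the <trd> itself or on its enclosing <trdgrp>
    lang = trd.get('lng')
    if lang is None:
        for grp in ekz.iter('trdgrp'):
            if trd in list(grp):
                lang = grp.get('lng')
    translations.setdefault(lang, []).append(''.join(trd.itertext()))

for lang, tr_lst in translations.items():
    print("<i><b>%s</b>: %s</i>" % (orig_txt, ", ".join(tr_lst)))
# -> <i><b>brazila nukso</b>: Brazil nut</i>
#    <i><b>brazila nukso</b>: Paranuss</i>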
Example #7
def get_all_definitions(drv_node):
    """For a given entry (which is a single <drv> node), get all its
    definitions. I have tested this as far as possible but bugs may
    remain given the complexity and variability of the XML.

    Generally, a primary definition is a <dif> inside a <snc> and a
    subdefinition is a <dif> inside a <subsnc> inside a <snc>.

    Some representative examples are:
    sxiling.xml and vort.xml for subsenses
    apetit.xml for notes that the term is figurative
    jakobi1.xml only <ref> inside <snc>, no <dif> node
    frakci.xml only <ref> inside <snc> but huge and complex
    ad.xml has a load of stuff, some of which is not documented by ReVo
    akusx.xml has <ref> and no <snc> on akusxigisistino

    """
    assert drv_node.tag in ['drv', 'subdrv']

    definitions = []

    # if <dif> is outside <snc>, treat <snc>s as subsenses
    # (yes, this isn't simple)
    for dif_node in drv_node.findall('dif'):
        # outside a <snc> we do not have subdefinitions
        definition_string = flatten_definition(dif_node)
        definition_string = get_definition_notes(drv_node) + definition_string
        definitions.append(Definition(definition_string))

    # the common case, get definitions on <snc>s
    for snc_node in drv_node.findall('snc'):
        definitions.append(get_definition(snc_node))

    # there may just be a <ref> (normally these are inside <snc>s)
    for ref_node in drv_node.findall('ref'):
        # ignore malprt which (e.g. saluti, pluralo) just comes in awkward places
        if ref_node.attrib.get('tip') not in ['malprt', 'sub']:
            definition_string = flatten_node(ref_node)
            definitions.append(Definition(definition_string))

    # or similarly may be just a <refgrp>
    for refgrp_node in drv_node.findall('refgrp'):
        # ignore malprt which (e.g. saluti, pluralo) just comes in awkward places
        if refgrp_node.attrib.get('tip') not in ['malprt', 'sub']:
            definition_string = flatten_node(refgrp_node)
            definitions.append(Definition(definition_string))

    # get any remarks which aren't on <dif>s and assign them
    # (arbitrarily) to the first definition. This happens so rarely
    # (e.g. abdiko) that the loss of clarity is negligible.
    rim_nodes = []
    for rim_node in drv_node.findall('rim'):
        rim_nodes.append(flatten_node(rim_node, skip_tags=['aut', 'fnt']))

    if rim_nodes and definitions:
        definitions[0].remarks = rim_nodes

    # get any examples which are just on the <drv> (rare, e.g. 'pluralo')
    examples = get_examples(drv_node)
    if examples and definitions:
        definitions[0].examples.extend(examples)

    # get any translations which are just on the <drv>
    translations = get_translations(drv_node)
    if translations and definitions:
        definitions[0].translations.update(translations)

    # get any definitions which are in a subdrv:
    # if we've already started on a definition, we add to it
    if definitions:
        for subdrv_node in drv_node.findall('subdrv'):
            subdefinitions = get_subdefinitions_from_subdrv(subdrv_node)
            definitions[0].subdefinitions.extend(subdefinitions)
    else:
        subdrv_nodes = drv_node.findall('subdrv')
        if subdrv_nodes:
            definitions.append(get_definition_from_subdrvs(subdrv_nodes))

    # remove any duplicates (happens with multiple <ref>s
    # e.g. direkt3.xml) or empty definitions (happens with example
    # only senses, such as purigi in pur.xml)
    no_duplicates = []
    for definition in definitions:
        if definition not in no_duplicates and not definition.is_empty():
            no_duplicates.append(definition)
    
    return no_duplicates
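
# A sketch of the Definition interface these snippets assume, inferred purely
# from how it is used above; the project's real class (plus its cross-reference
# helper and the equality used for de-duplication) lives elsewhere.
class Definition:
    def __init__(self, primary=None):
        self.primary = primary       # flattened <dif> text, if any
        self.examples = []           # (example, source) pairs from <ekz>
        self.remarks = []            # flattened <rim> strings
        self.translations = {}       # language code -> list of translations
        self.subdefinitions = []     # Definitions from <subsnc>/<subdrv>

    def is_empty(self):
        # a definition with no text, examples or subdefinitions is dropped
        return not (self.primary or self.examples or self.subdefinitions)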
Example #8
def get_definition(snc_node):
    """Build a Definition from this <snc> and add any subdefinitions if
    present, any examples if present and any remarks if present.

    Every <snc> contains a primary definition (a <dif>), a reference
    (i.e. a 'see foo' definition, a <ref>) or subdefinitions (<dif>s
    inside <subsnc>s).

    Worth testing pur.xml, since <snc> may have <dif> as a sibling
    rather than a child.

    An example:

    <dif>
      <ekz>
        lingva <tld/>a&jcirc;o<fnt>Z</fnt>;
      </ekz>
      <ekz>
        rimaj <tld/>a&jcirc;oj.
      </ekz>
    </dif>
    (from akroba.xml)

    <snc mrk="sekv.0i.dividi_opinion">
      <uzo tip="stl">FIG</uzo>
      <dif>
        Dividi ies opinion, morojn, konduton; alpreni kiel modelon,
        mastron:
        <ekz>
          kaj Barak vokis la Zebulunidojn kaj la Naftaliidojn al Kede&scirc;,
          kaj lin <tld/>is dek mil viroj
          <fnt><bib>MT</bib><lok>&Jug; 4:10</lok></fnt>;
        </ekz>
        <ekz>
          ne <tld/>u aliajn diojn el la dioj de la popoloj,
          kiuj estas &ccirc;irka&ubreve; vi
          <fnt><bib>MT</bib><lok>&Rea; 6:14</lok></fnt>;
        </ekz>
        <ekz>
          ne <tld/>u malbonajn homojn, kaj ne deziru esti kun ili
          <fnt><bib>MT</bib><lok>&Sen; 24:1</lok></fnt>.
        </ekz>
      </dif>
    </snc>
    (from sekv.xml)

    <snc>
      <dif>
        Neoficiala sufikso, uzata por nomi
        <ref tip="vid" cel="famili.0o.BIO">familiojn</ref>
        la&ubreve; la botanika nomenklaturo.
        La sufikso apliki&gcirc;as al genro el la familio
        por formi la familinomon:
        <ekz>
          La rozo apartenas al la familio rozacoj.
        </ekz>
      </dif>
      <rim num="1">
        Al kiu genro apliki&gcirc;as la sufikso por nomi la
        familion, estas difinite de la internacia botanika
        nomenklaturo.
      </rim>
      <rim num="2">
        Povas okazi, ke tiu genro ne plu ekzistas, &ccirc;ar
        pro novaj esploroj &gcirc;iaj specioj estas ordigitaj
        sub aliaj genroj.
        <refgrp tip="vid">
          <ref cel="fabac.0oj">fabacoj</ref>,
          <ref cel="kaprif1.0oj">kaprifoliacoj</ref> k.a.
        </refgrp>
      </rim>
      [...]
    </snc>
    (from ac.xml)

    """
    # we gradually populate the Definition
    definition = Definition()

    # get the primary definition itself
    for dif_node in snc_node.findall('dif'):
        definition.primary = flatten_definition(dif_node)

    # get examples of this definition, regardless of position
    definition.examples = get_examples(snc_node)

    # may have a <ref> that points to another word
    for ref_node in snc_node.findall('ref'):
        definition.cross_references.add_reference(ref_node)
    for refgrp_node in snc_node.findall('refgrp'):
        definition.cross_references.add_reference(refgrp_node)

    # note: may have only <subsnc>, no <dif> or <ref>
    # (e.g. sxilin.xml)

    # prepend any notes (transitivity etc)
    notes = get_definition_notes(snc_node)
    if notes and definition.primary:
        definition.primary = notes + definition.primary

    # get any subdefinitions
    for child in snc_node.findall('subsnc'):
        definition.subdefinitions.append(get_subdefinition(child))

    # get any remarks
    for rim_node in snc_node.findall('rim'):
        definition.remarks.append(flatten_node(rim_node,
                                               skip_tags=['aut', 'fnt']))

    # get all translations
    definition.translations = get_translations(snc_node)

    # final sanity check: do we have *something* for this word?
    if definition.is_empty():
        kap_node = snc_node.getparent().find('kap')
        print("Warning: no data found for " + get_words_from_kap(kap_node)[0])

    return definition