Example #1
0
def find_kap_words(kap_parent):
    """Return the non-empty words taken from the <kap> child of kap_parent.

    The <kap> child is looked up with ``kap_parent.find("kap")`` and
    handed to ``rvut_words.get_words_from_kap``; falsy items in its
    result are dropped.
    """
    kap_node = kap_parent.find("kap")
    # a comma produces an empty element, e.g.
    # disting ... <var><kap><tld/>indulo</kap></var>,
    return [
        candidate
        for candidate in rvut_words.get_words_from_kap(kap_node)
        if candidate
    ]
Example #2
0
def find_kap_words(kap_parent):
    """Collect the words of kap_parent's <kap> child, skipping empty ones.

    Returns a new list holding every truthy item produced by
    ``rvut_words.get_words_from_kap`` for that <kap> node.
    """
    all_words = rvut_words.get_words_from_kap(kap_parent.find("kap"))
    # a comma yields an empty element that has to be filtered out, e.g.
    # disting ... <var><kap><tld/>indulo</kap></var>,
    return list(filter(None, all_words))
Example #3
0
def get_entries(xml_file):
    """Return a list with one Entry per word found in the given XML file.

    Each <drv> node of the parsed tree is treated as one dictionary
    entry.  All words listed in its <kap> share the same root and the
    same definitions, so a <drv> with several words produces several
    Entry objects.

    """
    entries = []

    for drv in get_tree(xml_file).iter('drv'):
        words = get_words_from_kap(drv.find('kap'))
        root = get_word_root(drv)
        definitions = get_all_definitions(drv)
        # one Entry per headword, all sharing root and definitions
        entries.extend(Entry(word, root, definitions) for word in words)

    return entries
def get_entries(xml_file):
    """Read an XML file and build an Entry for every word it defines.

    An Entry is built from a word, its root and its definitions.  The
    file is walked <drv> by <drv>; every word in a <drv>'s <kap> gets
    its own Entry carrying that <drv>'s root and definitions.

    """
    parsed = get_tree(xml_file)
    collected = []

    # every <drv> corresponds to a single entry in the source data
    for entry_node in parsed.iter('drv'):
        headwords = get_words_from_kap(entry_node.find('kap'))
        shared_root = get_word_root(entry_node)
        shared_definitions = get_all_definitions(entry_node)
        collected += [Entry(headword, shared_root, shared_definitions)
                      for headword in headwords]

    return collected
Example #5
0
 def make_orig_txt(ind_node):
     """Return the words of ind_node joined into one ', '-separated string.

     The words are whatever ``rvut_words.get_words_from_kap`` yields
     for ind_node.
     """
     words = rvut_words.get_words_from_kap(ind_node)
     return ', '.join(words)
Example #6
0
 def make_orig_txt(ind_node):
     """Build a comma-separated text from the words found in ind_node.

     ind_node is passed straight to ``rvut_words.get_words_from_kap``.
     """
     separator = ', '
     return separator.join(rvut_words.get_words_from_kap(ind_node))
Example #7
0
def get_definition(snc_node):
    """Assemble a Definition from a single <snc> node.

    The returned Definition is filled, in this order, with: the
    primary definition text (from a <dif> child; if there are several
    the last one wins), the examples, the cross references (<ref> and
    <refgrp> children), the subdefinitions (<subsnc> children), the
    remarks (<rim> children) and the translations.  Definition notes
    (transitivity etc) are put in front of the primary text, but only
    when there is a primary text.

    A <snc> holds a primary definition (a <dif>), a reference (a
    'see foo' definition, a <ref>) or only subdefinitions (<dif>s
    inside <subsnc>s, e.g. sxilin.xml).

    pur.xml is worth testing, since there a <dif> may be a sibling of
    the <snc> rather than a child.

    Shapes seen in the data:

    A <dif> that consists only of examples (from akroba.xml):

    <dif>
      <ekz>lingva <tld/>a&jcirc;o<fnt>Z</fnt>;</ekz>
      <ekz>rimaj <tld/>a&jcirc;oj.</ekz>
    </dif>

    A usage marker, then a <dif> whose text is followed by sourced
    examples (from sekv.xml):

    <snc mrk="sekv.0i.dividi_opinion">
      <uzo tip="stl">FIG</uzo>
      <dif>
        Dividi ies opinion, morojn, konduton; alpreni kiel modelon,
        mastron:
        <ekz>
          ne <tld/>u malbonajn homojn, kaj ne deziru esti kun ili
          <fnt><bib>MT</bib><lok>&Sen; 24:1</lok></fnt>.
        </ekz>
      </dif>
    </snc>

    A <dif> with an inline <ref>, followed by numbered remarks, one
    of which carries a <refgrp> (from ac.xml):

    <snc>
      <dif>
        Neoficiala sufikso, uzata por nomi
        <ref tip="vid" cel="famili.0o.BIO">familiojn</ref>
        la&ubreve; la botanika nomenklaturo.
        <ekz>La rozo apartenas al la familio rozacoj.</ekz>
      </dif>
      <rim num="1">Al kiu genro apliki&gcirc;as la sufikso [...]</rim>
      <rim num="2">
        Povas okazi, ke tiu genro ne plu ekzistas [...]
        <refgrp tip="vid">
          <ref cel="fabac.0oj">fabacoj</ref>,
          <ref cel="kaprif1.0oj">kaprifoliacoj</ref> k.a.
        </refgrp>
      </rim>
    </snc>

    If nothing at all could be extracted, a warning naming the first
    word of the parent's <kap> is printed; the (empty) Definition is
    returned all the same.

    """
    result = Definition()

    # primary definition text; a later <dif> overwrites an earlier one
    for dif in snc_node.findall('dif'):
        result.primary = flatten_definition(dif)

    # examples belonging to this definition, wherever they are placed
    result.examples = get_examples(snc_node)

    # pointers to other words: single <ref>s first, then <refgrp>s
    for tag in ('ref', 'refgrp'):
        for reference_node in snc_node.findall(tag):
            result.cross_references.add_reference(reference_node)

    # note: there may be only <subsnc>s and no <dif> or <ref> at all
    # (e.g. sxilin.xml), in which case the primary text stays unset

    # notes (transitivity etc) go in front of the primary text
    notes = get_definition_notes(snc_node)
    if notes:
        if result.primary:
            result.primary = notes + result.primary

    # subdefinitions, one per <subsnc>
    for subsnc_node in snc_node.findall('subsnc'):
        subdefinition = get_subdefinition(subsnc_node)
        result.subdefinitions.append(subdefinition)

    # remarks, with the <aut> and <fnt> tags skipped
    for rim_node in snc_node.findall('rim'):
        remark = flatten_node(rim_node, skip_tags=['aut', 'fnt'])
        result.remarks.append(remark)

    # translations
    result.translations = get_translations(snc_node)

    if not result.is_empty():
        return result

    # sanity check failed: we found nothing whatsoever for this word
    kap_node = snc_node.getparent().find('kap')
    headword = get_words_from_kap(kap_node)[0]
    print("Warning: no data found for " + headword)
    return result
Example #8
0
def get_examples(node):
    """Get all examples from the children of a node.

    <ekz> nodes are collected from the node's <dif> children first and
    then from the node itself.  Each one is passed to flatten_example,
    which is expected to give back an (example, source) pair, or
    something falsy when there is nothing to keep.

    Returns a list of (example text, source) tuples, the text having
    gone through clean_string.  When an example ends with a comma it is
    joined with the examples that follow it, and the source of the last
    joined fragment is used.  If the very last example still ends with
    a comma, the pending text is not returned; a warning is printed
    instead.

    Examples tend to be in <dif>s, and take the following form:

    <ekz>
      simpla, kunmetita, dubsenca <tld/>o;
    </ekz><ekz>
      uzi la &gcirc;ustan, konvenan <tld/>on;
    </ekz><ekz>
      la bildoj elvokitaj de la <tld/>oj;
    </ekz><ekz>
      <tld/>ordo.
    </ekz>
    (from vort.xml)

    Sometimes (bizarrely) examples spread across several <ekz> nodes:

    <ekz>
      <tld/>i al si plezuron<fnt>Z</fnt>;
    </ekz><ekz>
      <tld/>i instruon<fnt>Z</fnt>,
    </ekz><ekz>
      amikecon<fnt>Z</fnt>,
    [...]
    (from sercx.xml)

    Sometimes only references, which we discard:

    <ekz>
      <ref tip="sub" cel="bier.0o">biero</ref>,
      <ref tip="sub" cel="brand.0o">brando</ref>,
      <ref tip="sub" cel="vin.0o">vino</ref>
    </ekz>
    (from alkoho.xml)

    <subsnc mrk="afekt.0o.sxajnigi" ref="afekt.0i.sxajnigi">
      <ekz>
        kiom a&ccirc;as la <tld/>o komplezi al duonvivul'
    (from afekt.xml)

    """
    raw_examples = []

    # examples tend to be on <dif>s
    for dif_node in node.findall('dif'):
        for ekz_node in dif_node.findall('ekz'):
            raw_example = flatten_example(ekz_node)
            if raw_example:
                raw_examples.append(raw_example)

    # but examples can also be on the <snc>/<subsnc> itself
    # (or even a <drv>!)
    for ekz_node in node.findall('ekz'):
        raw_example = flatten_example(ekz_node)
        if raw_example:
            raw_examples.append(raw_example)

    # fix examples spread over multiple <ekz>s by concatenating each
    # example that ends with a comma with the next example
    examples = []
    example_string = ""
    for (example, source) in raw_examples:
        example_string += ' ' + example

        if not example_string.endswith(','):
            examples.append((clean_string(example_string), source))
            example_string = ""

    if example_string != "":
        # Text is still pending, so raw_examples was not empty and
        # ekz_node is bound to the last <ekz> visited above.  Use the
        # built-in next() rather than the iterator's .next() method:
        # the method only exists on Python 2, the built-in works on
        # both Python 2.6+ and Python 3.
        art_node = next(ekz_node.iterancestors('art'))
        kap_node = next(art_node.iter('kap'))
        word = get_words_from_kap(kap_node)[0]
        print("Warning: example for %s ended with comma: %s" %
              (word, clean_string(example_string)))

    return examples