def find_kap_words(kap_parent):
    """Return the non-empty words found in the <kap> child of kap_parent.

    The <kap> child is looked up with kap_parent.find("kap") and handed to
    rvut_words.get_words_from_kap; falsy results are filtered out.
    """
    kap_node = kap_parent.find("kap")
    # A comma in the markup entails an empty element, e.g.
    #   disting ... <var><kap><tld/>indulo</kap></var>,
    # so empty words are dropped here.
    return [word
            for word in rvut_words.get_words_from_kap(kap_node)
            if word]
def get_entries(xml_file):
    """Collect every entry contained in the given XML file.

    One Entry is produced per word of each <drv> node; all words that
    belong to the same <drv> share that node's root and definitions.

    Returns a list of Entry objects in document order.
    """
    collected = []

    # each <drv> is one entry
    for drv in get_tree(xml_file).iter('drv'):
        words = get_words_from_kap(drv.find('kap'))
        root = get_word_root(drv)
        definitions = get_all_definitions(drv)

        collected.extend(Entry(word, root, definitions) for word in words)

    return collected
def make_orig_txt(ind_node):
    """Join the words extracted from ind_node into one string.

    The words come from rvut_words.get_words_from_kap(ind_node) and are
    separated by a comma followed by a space.
    """
    words = rvut_words.get_words_from_kap(ind_node)
    return ', '.join(words)
def make_orig_txt(ind_node):
    """Return the words of ind_node as a single ', '-separated string.

    Word extraction is delegated to rvut_words.get_words_from_kap.
    """
    separator = ', '
    extracted = rvut_words.get_words_from_kap(ind_node)
    return separator.join(extracted)
def get_definition(snc_node):
    """Assemble a Definition from a single <snc> node.

    The Definition is filled in step by step from the direct children of
    the <snc>:

    * <dif>            -> primary definition text (the last <dif> wins)
    * <ekz>            -> examples, wherever get_examples finds them
    * <ref>, <refgrp>  -> cross references ('see foo' style definitions)
    * <subsnc>         -> subdefinitions
    * <rim>            -> remarks (<aut> and <fnt> tags are skipped)
    * translations     -> whatever get_translations returns

    Every <snc> holds a primary definition, a reference, or only
    subdefinitions (<dif>s inside <subsnc>s, e.g. sxilin.xml).

    Worth testing against pur.xml, since there a <dif> may be a sibling
    of the <snc> rather than a child.

    Typical shape, with remarks following the definition (from ac.xml):

        <snc>
          <dif>
            Neoficiala sufikso, uzata por nomi
            <ref tip="vid" cel="famili.0o.BIO">familiojn</ref> ...
            <ekz>
              La rozo apartenas al la familio rozacoj.
            </ekz>
          </dif>
          <rim num="1">
            Al kiu genro aplikigxas la sufikso ...
          </rim>
        </snc>

    A <dif> may also consist of nothing but examples (akroba.xml), and
    the <snc> may carry usage notes such as <uzo tip="stl">FIG</uzo>
    in front of the <dif> (sekv.xml).

    If nothing at all could be extracted, a warning naming the first
    word of the parent's <kap> is printed. The Definition is returned
    in either case.
    """
    # the Definition is populated gradually
    result = Definition()

    # the primary definition itself
    for dif in snc_node.findall('dif'):
        result.primary = flatten_definition(dif)

    # examples of this definition, regardless of position
    result.examples = get_examples(snc_node)

    # pointers to other words: first every <ref>, then every <refgrp>
    for tag in ('ref', 'refgrp'):
        for reference in snc_node.findall(tag):
            result.cross_references.add_reference(reference)

    # note: there may be only <subsnc>s, no <dif> or <ref>
    # (e.g. sxilin.xml)

    # notes (transitivity etc) go in front of an existing primary text
    prefix = get_definition_notes(snc_node)
    if prefix and result.primary:
        result.primary = prefix + result.primary

    # subdefinitions
    for subsnc in snc_node.findall('subsnc'):
        result.subdefinitions.append(get_subdefinition(subsnc))

    # remarks
    for rim in snc_node.findall('rim'):
        remark = flatten_node(rim, skip_tags=['aut', 'fnt'])
        result.remarks.append(remark)

    # translations
    result.translations = get_translations(snc_node)

    # final sanity check: is there *something* for this word?
    if result.is_empty():
        parent_kap = snc_node.getparent().find('kap')
        first_word = get_words_from_kap(parent_kap)[0]
        print("Warning: no data found for " + first_word)

    return result
def get_examples(node):
    """Get all examples from the children of a node.

    Examples tend to live inside <dif>s and look like this
    (from vort.xml):

        <ekz>
          simpla, kunmetita, dubsenca <tld/>o;
        </ekz><ekz>
          <tld/>ordo.
        </ekz>

    Sometimes (bizarrely) one example is spread across several <ekz>
    nodes, each fragment ending with a comma (from sercx.xml):

        <ekz>
          <tld/>i instruon<fnt>Z</fnt>,
        </ekz><ekz>
          amikecon<fnt>Z</fnt>,
        [...]

    Sometimes an <ekz> holds only references, which are discarded
    (from alkoho.xml):

        <ekz>
          <ref tip="sub" cel="bier.0o">biero</ref>,
          <ref tip="sub" cel="vin.0o">vino</ref>
        </ekz>

    An <ekz> can also sit directly on a <snc>/<subsnc> (afekt.xml), or
    even on a <drv>.

    Each <ekz> is passed to flatten_example; falsy results are dropped
    and the rest are unpacked as (example, source) pairs.

    Returns a list of (example_text, source) tuples. Fragments ending
    with a comma are concatenated with the following fragment, and the
    source of the final fragment is kept. If the very last fragment
    still ends with a comma, a warning is printed and that dangling
    text is not returned.
    """
    # examples tend to be on <dif>s, but they can also be on the node
    # itself; gather the <ekz> nodes in that order
    ekz_nodes = []
    for dif_node in node.findall('dif'):
        ekz_nodes.extend(dif_node.findall('ekz'))
    ekz_nodes.extend(node.findall('ekz'))

    raw_examples = []
    for ekz_node in ekz_nodes:
        raw_example = flatten_example(ekz_node)
        if raw_example:
            raw_examples.append(raw_example)

    # fix examples spread over multiple <ekz>s by concatenating each
    # example that ends with a comma with the next example
    examples = []
    example_string = ""
    for (example, source) in raw_examples:
        example_string += ' ' + example

        if not example_string.endswith(','):
            examples.append((clean_string(example_string), source))
            example_string = ""

    if example_string != "":
        # Leftover text implies at least one <ekz> was seen, so
        # ekz_nodes is non-empty here. The builtin next() is used
        # instead of the Python 2-only .next() method so this path
        # also works on Python 3.
        art_node = next(ekz_nodes[-1].iterancestors('art'))
        kap_node = next(art_node.iter('kap'))
        word = get_words_from_kap(kap_node)[0]
        print("Warning: example for %s ended with comma: %s" %
              (word, clean_string(example_string)))

    return examples