def get_translations(node):
    """Collect the translations attached directly to this node.

    Returns a dict mapping language code -> list of translated words,
    gathered from bare <trd> children and from <trd>s nested inside
    <trdgrp> elements.
    """
    assert node.tag in ['snc', 'subsnc', 'drv', 'subdrv', 'ekz',
                        'subart', 'art', 'bld']
    # defaults to an empty list for any language not yet seen
    translations = defaultdict(list)

    # bare <trd> children each carry their own language code
    for trd_node in node.findall('trd'):
        translations[trd_node.attrib['lng']].append(flatten_node(trd_node))

    # a <trdgrp> groups several <trd>s under a single language code
    for trdgrp_node in node.findall('trdgrp'):
        language_code = trdgrp_node.attrib['lng']
        for trd_node in trdgrp_node.findall('trd'):
            foreign_word = flatten_node(trd_node)
            # grouped words are written as a ';'-separated series,
            # so strip one trailing semicolon
            if foreign_word.endswith(';'):
                foreign_word = foreign_word[:-1]
            translations[language_code].append(foreign_word)

    return translations
def flatten_example(ekz_node):
    """Get the contents of an <ekz>, discarding example sources (<fnt>s).

    Since a series of examples are often written in the form
    'foo; bar; baz.' we also discard trailing full stops or semicolons.

    An example:

    <ekz>
      kion vi legas mia princo?
      <tld/>ojn, <tld/>ojn, <tld/>ojn <fnt>Z</fnt>!
    </ekz>
    (from vort.xml)

    <ekz>
      <tld/>o de reno
    </ekz>;
    (from ablaci.xml)

    <ekz>
      en via decembra numero trovi&#285;as sub la <tld/>o
      <ctl>&#348;erco kaj satiro</ctl>
      &leftquot;publika letero&rightquot;<fnt><aut>Reinhard
      F&#246;&#223;meier</aut>: <vrk>Netrafa adreso</vrk>,
      <bib>Monato</bib><lok>jaro 2002a, numero 2a, p. 7a</lok></fnt>.
    </ekz>
    (from rubrik.xml, mixing quote types)

    Returns (example, source) where source may be None, or "" when the
    node contains no letters at all.
    """
    # klr = klarigo = clarification, ideally we'd extract this
    # and format it appropriately on the frontend (TODO)
    # <fnt> is example attribution, which we ignore
    # <uzo> indicates topic to which this example relates
    example = flatten_node(ekz_node,
                           skip_tags=['fnt', 'klr', 'uzo', 'trd', 'trdgrp'])

    # remove trailing semicolon/full stop due to the examples being
    # written as a series
    if example.endswith(';') or example.endswith('.'):
        example = example[:-1]

    # if we didn't extract anything with letters in (e.g. only
    # references that we discarded), return an empty string
    # NOTE(review): this early return yields a plain str while the normal
    # path yields a (example, source) tuple -- callers must handle both;
    # confirm before unifying the return type.
    # bug fix: re flags must be combined with bitwise | (not +, which only
    # works by accident because the flag bits happen to be disjoint)
    if not re.search(u'[a-z\u0109\u011d\u0125\u0135\u015d]', example,
                     flags=re.UNICODE | re.IGNORECASE):
        return ""

    source = None
    # there's probably only one <fnt>, but this loop is easy and robust
    for fnt_node in ekz_node.findall('fnt'):
        source = flatten_node(fnt_node)

    return (example, source)
def get_words_from_kap(node):
    r"""Return a list of all the terms in a <kap>.

    Every term in a <kap> is an alternative spelling of the same term.
    These are not necessarily single words, since ReVo includes entries
    such as 'brazila nukso'.

    <kap><ofc>*</ofc><tld/>o</kap>
    <kap>brazil<tld/>arbo, <var><kap>brazila <tld/>arbo</kap></var></kap>
    (from nuks.xml)

    The heavy lifting is done in flatten_kap; here we only split the
    flattened string into terms and tidy up whitespace.

    Possible formats encountered: 'foo', 'foo, bar', 'foo,\n bar' and
    '(n,p)-matrico' (the only term in ReVo with an internal comma).
    """
    flat_string = flatten_node(node, skip_tags=['ofc', 'fnt'])

    # the single pathological headword with a comma inside it
    if flat_string == '(n,p)-matrico':
        return ['(n,p)-matrico']

    words = flat_string.split(',')
    if len(words) > 1:
        # remove trailing/leading space and awkward newlines
        words = [clean_string(word) for word in words]
    return words
def get_words_from_kap(node):
    r"""Return a list of all the terms in a <kap>.

    NOTE(review): this re-definition shadows an identical
    get_words_from_kap defined earlier in the file -- one of the two
    copies should probably be removed.

    Every term in a <kap> is an alternative spelling of the same term,
    not necessarily a single word ('brazila nukso' is one entry).

    <kap><ofc>*</ofc><tld/>o</kap>
    <kap>brazil<tld/>arbo, <var><kap>brazila <tld/>arbo</kap></var></kap>
    (from nuks.xml)

    The heavy lifting is done in flatten_kap; all we do here is separate
    out terms and remove extraneous whitespace.

    Possible formats encountered: 'foo', 'foo, bar', 'foo,\n bar' and
    '(n,p)-matrico' (the only term in ReVo with an internal comma).
    """
    flat_string = flatten_node(node, skip_tags=["ofc", "fnt"])

    if flat_string == "(n,p)-matrico":
        # special-cased so the internal comma isn't treated as a separator
        words = ["(n,p)-matrico"]
    else:
        words = flat_string.split(",")
        if len(words) > 1:
            for index in range(len(words)):
                # remove trailing/leading space and awkward newlines
                words[index] = clean_string(words[index])

    return words
def add_reference_group(self, refgrp_node):
    """Dispatch the <ref>s inside a <refgrp> to the matching collection.

    The group's 'tip' attribute encodes how the referenced words relate
    to this one; each <ref> is flattened to a string and appended to the
    corresponding list on self.

    Raises AssertionError on an unknown 'tip' value.
    """
    # tip code -> the list attribute that collects these references
    destinations = {
        'dif': self.see,             # dif = difino: word is defined elsewhere
        'vid': self.see_also,        # vid = vidu ankau (see also)
        'sin': self.synonyms,        # sin = sinonimo
        'ant': self.antonyms,        # ant = antonimo
        'super': self.supernotions,  # super = supernocio
        'sub': self.subnotions,      # sub = subnocio
        'prt': self.meronyms,        # prt = parto de
        'malprt': self.holonyms,     # malprt = malparto de / 'konsistas el'
    }

    tip = refgrp_node.attrib.get('tip')
    if tip in destinations:
        destination = destinations[tip]
        for ref_node in refgrp_node.findall('ref'):
            destination.append(flatten_node(ref_node))
    elif tip == 'hom' or tip is None:
        # hom = homonimo: we ignore homonyms since we collect all the
        # definitions together, so the cross-reference is unnecessary;
        # unlabelled references are likewise skipped
        pass
    else:
        # bug fix: the original read ref_node.attrib here, but ref_node is
        # unbound on this path, so it raised NameError instead of the
        # intended AssertionError
        assert False, "Found an unknown reference type: %s" % tip
def flatten_definition(dif_node):
    """Convert a definition node to a simple unicode string (this
    requires us to flatten it), handling any references or
    clarifications we encounter.

    An example:

    <dif>
      <klr>(de <ref cel="polino.0o">polinomo</ref>)</klr>
      <ref tip="super" cel="nul0.0iganto.de_funkcio">Nuliganto</ref>
      de la responda
      <ref cel="funkci.polinoma0o">polinoma funkcio</ref>.
    </dif>
    (from radik.xml)
    """
    # examples are skipped here; they're dealt with elsewhere
    text = flatten_node(dif_node, skip_tags=['ekz'])

    # a definition followed by examples ends with ':' rather than '.';
    # since examples are formatted separately, swap it for a full stop
    if text[-1:] == ':':
        text = text[:-1].strip() + '.'
    return text
def find_ekz_translations(ekz_dct, node, flat_translations):
    """Collect example (<ekz>/<bld>) translations into ekz_dct.

    For every translated example we build an HTML snippet pairing the
    original text with its translations, grouped by language.
    Relies on the module-level ekz_node_set / tree.
    """
    #for trd in parse_vip.iter_tags(node, "ekz/trd|trdgrp"):
    def trd_iter(ekz_name, name):
        return parse_vip.iter_tags(node, "%(ekz_name)s/%(name)s" % locals())

    def trd_iters(ekz_name):
        return trd_iter(ekz_name, "trd"), trd_iter(ekz_name, "trdgrp")

    for trd in itertools.chain(*(trd_iters("ekz") + trd_iters("bld"))):
        ekz = trd.getparent()
        # process each <ekz>/<bld> only once, however many <trd>s it holds
        if ekz in ekz_node_set:
            continue
        else:
            ekz_node_set.add(ekz)

        def make_orig_txt(ind_node):
            return ', '.join(rvut_words.get_words_from_kap(ind_node))

        ind_node = ekz.find('ind')
        if ind_node is None:
            # compute orig_txt ourselves, collecting all child tags until a
            # trd or trdgrp appears, e.g. anim.xml:
            # <ekz>
            #   <tld/>ita parolado<fnt>K</fnt>,
            #   <trd lng="hu">lelkes beszed</trd>
            # </ekz>
            ind_node = etree.Element("ind")
            ind_node.text = ekz.text
            for child in ekz.getchildren():
                if child.tag in ["trd", "trdgrp"]:
                    break
                else:
                    child = copy.deepcopy(child)
                    ind_node.append(child)
            # temporarily attach so flattening sees a parented node
            tree.append(ind_node)
            orig_txt = make_orig_txt(ind_node)
            ind_node.getparent().remove(ind_node)
        else:
            orig_txt = make_orig_txt(ind_node)

        for lang, tr_lst in get_count_translations(ekz).items():
            # :REFACTOR:
            lst = ekz_dct.setdefault(lang, [])
            tr_lst = ", ".join(tr_lst)
            ekz_txt = "<i><b>%(orig_txt)s</b>: %(tr_lst)s</i>" % locals()
            lst.append(ekz_txt)
    #return
    # :TRICKY: some <trd> combine the translation itself with an indication
    # of the original (mostly Latin) name => only <trd> can be like that,
    # not <trdgrp>, since that is a visual tag -- wrong, see hel.xml!
# sweep up every <trd> not already consumed by the earlier passes and
# file it under its language, then merge into the running collections
rest_translations = {}
for trd in parse_vip.iter_tags(node, "trd"):
    if trd not in used_tr_nodes:
        par_node = trd.getparent()
        if par_node.tag == "trdgrp":
            # grouped translations take their language from the group
            lang = par_node.get("lng")
            used_tr_nodes[par_node] = True
        else:
            lang = trd.get("lng")
        foreign_word = rvut_flatten.flatten_node(trd)
        if foreign_word:
            # :REFACTOR:
            rest_translations.setdefault(lang, []).append(foreign_word)
        # :REFACTOR:
        # NOTE(review): original indentation was mangled; this marks the
        # node used even when it flattened to nothing -- confirm intent
        used_tr_nodes[trd] = True
append_translations(flat_translations, rest_translations)
append_translations(national_headwords, rest_translations)
def find_ekz_translations(ekz_dct, node, flat_translations):
    """Collect example (<ekz>/<bld>) translations into ekz_dct.

    NOTE(review): this re-definition shadows an identical
    find_ekz_translations defined earlier in the file -- one copy should
    probably be removed.
    """
    #for trd in parse_vip.iter_tags(node, "ekz/trd|trdgrp"):
    def trd_iter(ekz_name, name):
        return parse_vip.iter_tags(node, "%(ekz_name)s/%(name)s" % locals())

    def trd_iters(ekz_name):
        return trd_iter(ekz_name, "trd"), trd_iter(ekz_name, "trdgrp")

    for trd in itertools.chain(*(trd_iters("ekz") + trd_iters("bld"))):
        ekz = trd.getparent()
        # handle each containing <ekz>/<bld> exactly once
        if ekz in ekz_node_set:
            continue
        else:
            ekz_node_set.add(ekz)

        def make_orig_txt(ind_node):
            return ', '.join(rvut_words.get_words_from_kap(ind_node))

        ind_node = ekz.find('ind')
        if ind_node is None:
            # compute orig_txt ourselves, collecting all child tags until a
            # trd or trdgrp appears, e.g. anim.xml:
            # <ekz>
            #   <tld/>ita parolado<fnt>K</fnt>,
            #   <trd lng="hu">lelkes beszed</trd>
            # </ekz>
            ind_node = etree.Element("ind")
            ind_node.text = ekz.text
            for child in ekz.getchildren():
                if child.tag in ["trd", "trdgrp"]:
                    break
                else:
                    child = copy.deepcopy(child)
                    ind_node.append(child)
            tree.append(ind_node)
            orig_txt = make_orig_txt(ind_node)
            ind_node.getparent().remove(ind_node)
        else:
            orig_txt = make_orig_txt(ind_node)

        for lang, tr_lst in get_count_translations(ekz).items():
            # :REFACTOR:
            lst = ekz_dct.setdefault(lang, [])
            tr_lst = ", ".join(tr_lst)
            ekz_txt = "<i><b>%(orig_txt)s</b>: %(tr_lst)s</i>" % locals()
            lst.append(ekz_txt)
    #return
    # :TRICKY: some <trd> combine the translation itself with an indication
    # of the original (mostly Latin) name => only <trd> can be like that,
    # not <trdgrp>, since that is a visual tag -- wrong, see hel.xml!
# gather the leftover <trd>s that no earlier pass claimed
# (duplicate of the identical fragment earlier in the file)
rest_translations = {}
for trd in parse_vip.iter_tags(node, "trd"):
    if trd not in used_tr_nodes:
        par_node = trd.getparent()
        if par_node.tag == "trdgrp":
            # the group, not the <trd>, carries the language code
            lang = par_node.get("lng")
            used_tr_nodes[par_node] = True
        else:
            lang = trd.get("lng")
        foreign_word = rvut_flatten.flatten_node(trd)
        if foreign_word:
            # :REFACTOR:
            rest_translations.setdefault(lang, []).append(foreign_word)
        # :REFACTOR:
        # NOTE(review): indentation reconstructed from mangled source --
        # confirm this runs even when foreign_word is empty
        used_tr_nodes[trd] = True
append_translations(flat_translations, rest_translations)
append_translations(national_headwords, rest_translations)
def get_all_definitions(drv_node):
    """For a given entry (which is a single <drv> node), get all its
    definitions.

    I have tested this as far as possible but bugs may remain given the
    complexity and variability of the XML. Generally, a primary
    definition is a <dif> inside a <snc> and a subdefinition is a <dif>
    inside a <subsnc> inside a <snc>.

    Some representative examples are:
    sxiling.xml and vort.xml for subsenses
    apetit.xml for notes that the term is figurative
    jakobi1.xml only <ref> inside <snc>, no <dif> node
    frakci.xml only <ref> inside <snc> but huge and complex
    ad.xml has a load of stuff, some of which is not documented by ReVo
    akusx.xml has <ref> and no <snc> on akusxigisistino
    """
    assert drv_node.tag in ['drv', 'subdrv']
    definitions = []

    # if <dif> is outside <snc>, treat <snc>s as subsenses
    # (yes, this isn't simple)
    for dif_node in drv_node.findall('dif'):
        # outside a <snc> we do not have subdefinitions
        definition_string = flatten_definition(dif_node)
        definition_string = get_definition_notes(drv_node) + definition_string
        definitions.append(Definition(definition_string))

    # the common case, get definitions on <snc>s
    for snc_node in drv_node.findall('snc'):
        definitions.append(get_definition(snc_node))

    # there may just be a <ref> (normally these are inside <snc>s)
    for ref_node in drv_node.findall('ref'):
        # ignore malprt which (e.g. saluti, pluralo) just comes in awkward
        # places
        if ref_node.attrib.get('tip') not in ['malprt', 'sub']:
            definitions.append(Definition(flatten_node(ref_node)))

    # or similarly may be just a <refgrp>
    for refgrp_node in drv_node.findall('refgrp'):
        # ignore malprt which (e.g. saluti, pluralo) just comes in awkward
        # places
        if refgrp_node.attrib.get('tip') not in ['malprt', 'sub']:
            definitions.append(Definition(flatten_node(refgrp_node)))

    # get any remarks which aren't on <dif>s and assign them
    # (arbitrarily) to the first definition. This happens so rarely
    # (e.g. abdiko) that the loss of clarity is negligible.
    rim_nodes = []
    for rim_node in drv_node.findall('rim'):
        rim_nodes.append(flatten_node(rim_node, skip_tags=['aut', 'fnt']))
    # bug fix: guard on definitions being non-empty, as the translations
    # branch below already did -- otherwise definitions[0] raises
    # IndexError on an entry with remarks but no definitions
    if rim_nodes and definitions:
        definitions[0].remarks = rim_nodes

    # get any examples which are just on the <drv> (rare, e.g. 'pluralo')
    examples = get_examples(drv_node)
    # bug fix: same missing guard as for remarks above
    if examples and definitions:
        definitions[0].examples.extend(examples)

    # get any translations which are just on the <drv>
    translations = get_translations(drv_node)
    if translations and definitions:
        definitions[0].translations.update(translations)

    # get any definitions which are in a subdrv:
    # if we've already started on a definition, we add to it
    if definitions:
        for subdrv_node in drv_node.findall('subdrv'):
            subdefinitions = get_subdefinitions_from_subdrv(subdrv_node)
            definitions[0].subdefinitions.extend(subdefinitions)
    else:
        subdrv_nodes = drv_node.findall('subdrv')
        if subdrv_nodes:
            definitions.append(get_definition_from_subdrvs(subdrv_nodes))

    # remove any duplicates (happens with multiple <ref>s
    # e.g. direkt3.xml) or empty definitions (happens with example
    # only senses, such as purigi in pur.xml)
    no_duplicates = []
    for definition in definitions:
        if definition not in no_duplicates and not definition.is_empty():
            no_duplicates.append(definition)

    return no_duplicates
def get_definition(snc_node):
    """Build a Definition from this <snc>, adding any subdefinitions,
    examples and remarks that are present.

    Every <snc> contains a primary definition (a <dif>), a reference
    (i.e. a 'see foo' definition, a <ref>) or subdefinitions (<dif>s
    inside <subsnc>s). Worth testing pur.xml, since <snc> may have <dif>
    as a sibling rather than a child.

    Representative sources: akroba.xml (a <dif> holding only <ekz>s),
    sekv.xml (a <snc> with <uzo>, <dif> and several <ekz>s) and ac.xml
    (a <snc> with <dif>, <rim>s and a <refgrp>).
    """
    # gradually populate a fresh Definition
    definition = Definition()

    # the primary definition itself
    for dif_node in snc_node.findall('dif'):
        definition.primary = flatten_definition(dif_node)

    # examples of this definition, regardless of position
    definition.examples = get_examples(snc_node)

    # a <ref>/<refgrp> may point at another word
    for ref_node in snc_node.findall('ref'):
        definition.cross_references.add_reference(ref_node)
    for refgrp_node in snc_node.findall('refgrp'):
        definition.cross_references.add_reference(refgrp_node)

    # note: may have only <subsnc>, no <dif> or <ref> (e.g. sxilin.xml)

    # prepend any notes (transitivity etc)
    notes = get_definition_notes(snc_node)
    if notes and definition.primary:
        definition.primary = notes + definition.primary

    # subdefinitions, one per <subsnc>
    for subsnc_node in snc_node.findall('subsnc'):
        definition.subdefinitions.append(get_subdefinition(subsnc_node))

    # remarks, minus attribution tags
    for rim_node in snc_node.findall('rim'):
        definition.remarks.append(
            flatten_node(rim_node, skip_tags=['aut', 'fnt']))

    # all translations
    definition.translations = get_translations(snc_node)

    # final sanity check: do we have *something* for this word?
    if definition.is_empty():
        kap_node = snc_node.getparent().find('kap')
        print("Warning: no data found for " + get_words_from_kap(kap_node)[0])

    return definition