def get_words_from_kap(node):
    r"""Extract every term spelled out inside a <kap> node.

    All terms listed in one <kap> are alternative spellings of the same
    term. A term is not necessarily a single word, since ReVo includes
    entries such as 'brazila nukso'.

    Sample markup:

    <kap><ofc>*</ofc><tld/>o</kap>

    <kap>brazil<tld/>arbo, <var><kap>brazila <tld/>arbo</kap></var></kap>
    (from nuks.xml)

    The flat text comes from flatten_node (called with
    skip_tags=['ofc', 'fnt']); this function only splits that text on
    commas and tidies each piece with clean_string.

    Shapes of flattened text encountered:

    'foo'
    'foo, bar'
    'foo,\n bar'
    '(n,p)-matrico' (the only term in ReVo with an internal comma)

    """
    flattened = flatten_node(node, skip_tags=['ofc', 'fnt'])

    # special case: this comma belongs to the term, so never split on it
    if flattened == '(n,p)-matrico':
        return ['(n,p)-matrico']

    terms = flattened.split(',')
    if len(terms) <= 1:
        # a lone term is returned exactly as flatten_node produced it
        return terms

    # remove trailing/leading space and awkward newlines from each variant
    return [clean_string(term) for term in terms]
def get_words_from_kap(node):
    r"""Return a list of all the terms in a <kap>.

    Each term found in a <kap> is an alternative spelling of one and the
    same term. Terms may span several words, since ReVo includes entries
    such as 'brazila nukso'.

    <kap><ofc>*</ofc><tld/>o</kap>

    <kap>brazil<tld/>arbo, <var><kap>brazila <tld/>arbo</kap></var></kap>
    (from nuks.xml)

    flatten_node does the heavy lifting (it is given
    skip_tags=["ofc", "fnt"]); here the flattened text is merely split
    into separate terms and extraneous whitespace is removed.

    Possible formats encountered:

    'foo'
    'foo, bar'
    'foo,\n bar'
    '(n,p)-matrico' (the only term in ReVo with an internal comma)

    """
    text = flatten_node(node, skip_tags=["ofc", "fnt"])

    if text == "(n,p)-matrico":
        # the one known term whose comma must not act as a separator
        pieces = ["(n,p)-matrico"]
    else:
        pieces = text.split(",")

    if len(pieces) > 1:
        # only multi-term results get each piece cleaned individually
        pieces = [clean_string(piece) for piece in pieces]

    return pieces
def flatten_node(node, skip_tags=None):
    """Return a friendly string representing the contents of this node
    and its children.

    This method is generic although occasionally we need methods which
    are specific to a certain node type. skip_tags specifies node tags
    for a node which we don't recurse into (although we will collect
    its tail, since that is outside). It is passed through unchanged to
    _flatten for every direct child.

    Some examples:

    <rim>
      La tuta terminologio pri <tld/>oj, <tld/>-vektoroj kaj
      -subspacoj de endomorfio ekzistas ankaŭ por
      <frm>(<k>n</k>,<k>n</k>)</frm>-matrico, konvencie identigita
      kun la endomorfio, kies matrico rilate al la kanona bazo de
      <frm><g>K</g><sup><k>n</k></sup></frm> ĝi estas.
    </rim>
    (from ajgen.xml)

    <ekz>
      <ctl>popolo</ctl>, <ctl>foliaro</ctl>, <ctl>herbo</ctl>,
      <ctl>armeo</ctl> estas ar<tld/>oj.
    </ekz>
    (from vort.xml)

    <ekz>
      <ind>saluton!</ind> [...]
    </ekz>
    (from salut.xml)

    <klr>(de <ref cel="polino.0o">polinomo</ref>)</klr>
    (from radik.xml)

    """
    flatten_method = get_flatten_method(node)

    # The output of the node-specific flatten method comes first,
    # followed by the flattened form of each direct child.
    pieces = [flatten_method(node)]

    # Iterate over the element itself rather than calling the deprecated
    # getchildren(), which no longer exists on xml.etree elements.
    pieces.extend(_flatten(child, skip_tags) for child in node)

    return clean_string("".join(pieces))
def get_examples(node):
    """Get all examples from the children of a node.

    Returns a list of (example, source) tuples, where each example text
    has been passed through clean_string.

    Examples tend to be in <dif>s, and take the following form:

    <ekz>
      simpla, kunmetita, dubsenca <tld/>o;
    </ekz><ekz>
      uzi la ĝustan, konvenan <tld/>on;
    </ekz><ekz>
      la bildoj elvokitaj de la <tld/>oj;
    </ekz><ekz>
      <tld/>ordo.
    </ekz>
    (from vort.xml)

    Sometimes (bizarrely) examples spread across several <ekz> nodes:

    <ekz>
      <tld/>i al si plezuron<fnt>Z</fnt>;
    </ekz><ekz>
      <tld/>i instruon<fnt>Z</fnt>,
    </ekz><ekz>
      amikecon<fnt>Z</fnt>, [...]
    (from sercx.xml)

    Sometimes only references, which we discard:

    <ekz>
      <ref tip="sub" cel="bier.0o">biero</ref>,
      <ref tip="sub" cel="brand.0o">brando</ref>,
      <ref tip="sub" cel="vin.0o">vino</ref>
    </ekz>
    (from alkoho.xml)

    <subsnc mrk="afekt.0o.sxajnigi" ref="afekt.0i.sxajnigi">
      <ekz>
        kiom aĉas la <tld/>o komplezi al duonvivul'
    (from afekt.xml)

    """
    raw_examples = []
    ekz_node = None

    # examples tend to be on <dif>s, but they can also be on the
    # <snc>/<subsnc> itself (or even a <drv>!), so after every <dif>
    # child the node itself is searched too
    for parent in list(node.findall('dif')) + [node]:
        for ekz_node in parent.findall('ekz'):
            raw_example = flatten_example(ekz_node)
            if raw_example:
                raw_examples.append(raw_example)

    # fix examples spread over multiple <ekz>s by concatenating each
    # example that ends with a comma with the next example; the source
    # kept is the one belonging to the last fragment
    examples = []
    example_string = ""
    for (example, source) in raw_examples:
        example_string += ' ' + example

        if not example_string.endswith(','):
            examples.append((clean_string(example_string), source))
            example_string = ""

    if example_string:
        # The final fragment ended with a comma and was never emitted.
        # Report the headword of the enclosing <art>; ekz_node is the
        # last <ekz> visited above (there is at least one, because
        # raw_examples is non-empty here).
        # next() builtin rather than the Python-2-only .next() method.
        art_node = next(ekz_node.iterancestors('art'))
        kap_node = next(art_node.iter('kap'))
        word = get_words_from_kap(kap_node)[0]
        print("Warning: example for %s ended with comma: %s" %
              (word, clean_string(example_string)))

    return examples
def get_args(self, message, lower=True):
    """Return the cleaned text that follows this command's prefix.

    The first ``len('okuyasu <name> ')`` characters of
    ``message.clean_content`` are dropped, where ``<name>`` is
    ``self.name``. The cut is purely by length; the prefix is not
    checked against the content. The remainder is handed to
    clean_string together with ``lower``.
    """
    content = message.clean_content
    prefix_length = len(f'okuyasu {self.name} ')
    remainder = content[prefix_length:]
    return clean_string(remainder, lower=lower)