def handle_caps(element):
    """Handle small caps.

    Replace '°xxx' by '<span class="sc">xxx</span>'.

    The "val" attribute of the element is scanned for degree-sign markers.
    The text preceding the first marker becomes the element text, each marked
    word becomes a <span class="sc"> child inserted at positions 0, 1, 2...,
    and the text between/after markers becomes the tail of the preceding span.
    The element is left untouched when "val" holds no marker.

    @param element The XML element whose "val" attribute has to be converted.
    @return The same element, modified in place.
    """
    import re
    # Match on text rather than on encoded bytes: a byte-level class built from
    # the two-byte encoding of the degree sign also rejects every character
    # sharing one of these bytes. The marker is written as an escape so that
    # the pattern does not depend on the encoding of this source file.
    # DOTALL keeps the text located after a line break in the last group.
    pattern = re.compile(u"([^\u00b0]*)\u00b0([^\\s\\.,)+/:]*)(.*)", re.DOTALL)
    # Find text to display in small caps
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, sc, after = result.groups()
        # Handle previous span or element
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "sc"
        span.text = sc
        # Insert span in element
        element.insert(index, span)
        # Update result: look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remainder follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
def handle_font(element):
    """Replace '{xxx}' by '<span class="ipa">xxx</span>'.

    The "val" attribute of the element is scanned for brace-delimited parts.
    The text preceding the first part becomes the element text, each
    delimited part becomes a <span class="ipa"> child inserted at positions
    0, 1, 2..., and the text between/after parts becomes the tail of the
    preceding span. The element is left untouched when nothing matches.

    @param element The XML element whose "val" attribute has to be converted.
    @return The same element, modified in place.
    """
    import re
    # Find text to display in IPA.
    # DOTALL keeps the text located after a line break in the last group;
    # without it the remainder of a multi-line value is silently dropped.
    pattern = re.compile(r"([^{}]*)\{([^}]*)\}(.*)", re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, ipa, after = result.groups()
        # Handle previous span or element
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "ipa"
        span.text = ipa
        # Insert span in element
        element.insert(index, span)
        # Update result: look for the next delimited part in the remaining text
        result = pattern.match(after)
        if not result:
            # Nothing left to convert: the remainder follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
def handle_pinyin(element):
    """Replace '@xxx' by '<span class="pinyin">xxx</span>'.

    The "val" attribute of the element is scanned for '@' markers. The text
    preceding the first marker becomes the element text, the word following
    each marker becomes a <span class="pinyin"> child inserted at positions
    0, 1, 2..., and the text between/after markers becomes the tail of the
    preceding span. The element is left untouched when "val" holds no '@'.

    @param element The XML element whose "val" attribute has to be converted.
    @return The same element, modified in place.
    """
    import re
    # Find pinyin.
    # UNICODE makes \w accept accented letters whatever the Python version;
    # DOTALL keeps the text located after a line break in the last group.
    pattern = re.compile(r"([^@]*)@(\w*)(.*)", re.DOTALL | re.UNICODE)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, pinyin, after = result.groups()
        # Handle previous span or element
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "pinyin"
        span.text = pinyin
        # Insert span in element
        element.insert(index, span)
        # Update result: look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remainder follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
def handle_caps(element):
    """Handle small caps.

    Replace '°xxx' by '<span class="sc">xxx</span>'.

    The "val" attribute of the element is scanned for degree-sign markers.
    The text preceding the first marker becomes the element text, each marked
    word becomes a <span class="sc"> child inserted at positions 0, 1, 2...,
    and the text between/after markers becomes the tail of the preceding span.
    The element is left untouched when "val" holds no marker.

    @param element The XML element whose "val" attribute has to be converted.
    @return The same element, modified in place.
    """
    import re
    # Match on text rather than on encoded bytes: a byte-level class built from
    # the two-byte encoding of the degree sign also rejects every character
    # sharing one of these bytes. The marker is written as an escape so that
    # the pattern does not depend on the encoding of this source file.
    # DOTALL keeps the text located after a line break in the last group.
    pattern = re.compile(u"([^\u00b0]*)\u00b0([^\\s\\.,)+/:]*)(.*)", re.DOTALL)
    # Find text to display in small caps
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, sc, after = result.groups()
        # Handle previous span or element
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "sc"
        span.text = sc
        # Insert span in element
        element.insert(index, span)
        # Update result: look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remainder follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
def handle_pinyin(element):
    """Replace '@xxx' by '<span class="pinyin">xxx</span>'.

    The "val" attribute of the element is scanned for '@' markers. The text
    preceding the first marker becomes the element text, the word following
    each marker becomes a <span class="pinyin"> child inserted at positions
    0, 1, 2..., and the text between/after markers becomes the tail of the
    preceding span. The element is left untouched when "val" holds no '@'.

    @param element The XML element whose "val" attribute has to be converted.
    @return The same element, modified in place.
    """
    import re
    # Find pinyin.
    # UNICODE makes \w accept accented letters whatever the Python version;
    # DOTALL keeps the text located after a line break in the last group.
    pattern = re.compile(r"([^@]*)@(\w*)(.*)", re.DOTALL | re.UNICODE)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, pinyin, after = result.groups()
        # Handle previous span or element
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "pinyin"
        span.text = pinyin
        # Insert span in element
        element.insert(index, span)
        # Update result: look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remainder follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
def handle_font(element):
    """Replace '{xxx}' by '<span class="ipa">xxx</span>'.

    The "val" attribute of the element is scanned for brace-delimited parts.
    The text preceding the first part becomes the element text, each
    delimited part becomes a <span class="ipa"> child inserted at positions
    0, 1, 2..., and the text between/after parts becomes the tail of the
    preceding span. The element is left untouched when nothing matches.

    @param element The XML element whose "val" attribute has to be converted.
    @return The same element, modified in place.
    """
    import re
    # Find text to display in IPA.
    # DOTALL keeps the text located after a line break in the last group;
    # without it the remainder of a multi-line value is silently dropped.
    pattern = re.compile(r"([^{}]*)\{([^}]*)\}(.*)", re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, ipa, after = result.groups()
        # Handle previous span or element
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "ipa"
        span.text = ipa
        # Insert span in element
        element.insert(index, span)
        # Update result: look for the next delimited part in the remaining text
        result = pattern.match(after)
        if not result:
            # Nothing left to convert: the remainder follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
def test_add_link(self):
    """Compare the pair returned by add_link() with the given form and an element holding an <a> child."""
    from morphology.related_form import RelatedForm
    # Element and form handed over to the function under test
    given = Element("RelatedForm", targets="lx")
    form = RelatedForm()
    form.set_lexical_entry(LexicalEntry(id="lx_id"))
    # Reference element: same tag and attribute, plus the hyperlink child
    expected = Element("RelatedForm", targets="lx")
    link = SubElement(expected, "a", href="lx_id1")
    link.text = "lx"
    # Run and compare both members of the returned pair
    returned_form, returned_element = add_link(form, given)
    self.assertEqual(returned_form, form)
    self.assertEqual(tostring(returned_element), tostring(expected))
def test_handle_pinyin(self):
    """Compare the serialized result of handle_pinyin() with a hand-built element."""
    value = "@at1 atA@at2 atB"
    given = Element("name", val=unicode(value))
    # Reference element: empty leading text followed by two pinyin spans
    expected = Element("name", val=unicode(value))
    expected.text = ""
    for text, tail in (("at1", " atA"), ("at2", " atB")):
        span = SubElement(expected, "span", {"class": "pinyin"})
        span.text = text
        span.tail = tail
    self.assertEqual(tostring(handle_pinyin(given)), tostring(expected))
def test_handle_caps(self):
    """Compare the serialized result of handle_caps() with a hand-built element."""
    value = u"°trucs et°astuces"
    given = Element("name", val=value)
    # Reference element: empty leading text followed by two small caps spans
    expected = Element("name", val=value)
    expected.text = ""
    for text, tail in (("trucs", " et"), ("astuces", "")):
        span = SubElement(expected, "span", {"class": "sc"})
        span.text = text
        span.tail = tail
    self.assertEqual(tostring(handle_caps(given)), tostring(expected))
def test_prettify(self):
    """Compare the output of prettify() for a two-level element with the expected text."""
    # XML element owning a single sub-element
    element = Element("LexicalEntry")
    SubElement(element, "Lemma")
    # Expected result: every line, the last one included, ends with EOL
    eol = unicode(EOL)
    expected_lines = [
        "<?xml version=\"1.0\" encoding=\"utf-8\"?>",
        "<LexicalEntry>",
        " <Lemma/>",
        "</LexicalEntry>",
    ]
    expected_str = eol.join(expected_lines) + eol
    # Test
    self.assertEqual(prettify(element), expected_str)
    del element
def add_link(object, element):
    """Insert a hyperlink <a href=xxx>xxx</a> in XML.

    When the 'cross_references' option is set and the object gives access to
    a lexical entry identifier, an <a> child is inserted at the first position
    of the element: its "href" attribute is the identifier and its text is the
    "targets" attribute of the element. Otherwise the element is not modified.

    @param object The instance providing get_lexical_entry().
    @param element The XML element in which the link has to be inserted.
    @return A tuple (object, element).
    """
    # To access options. The 'global' statement has to precede the import
    # binding the name: the reverse order is rejected by Python 3 and only
    # tolerated with a warning by Python 2.
    global options
    from pylmflib import options
    if not options.cross_references:
        return (object, element)
    # Retrieve identifier
    try:
        entry_id = object.get_lexical_entry().get_id()
    except AttributeError:
        # Raised for instance when get_lexical_entry() returns None
        entry_id = None
    if entry_id is not None:
        # Create link
        a = Element("a")
        a.attrib["href"] = entry_id
        a.text = element.attrib["targets"]
        # Insert link in element
        element.insert(0, a)
    return (object, element)
def add_link(object, element):
    """Insert a hyperlink <a href=xxx>xxx</a> in XML.

    When the 'cross_references' option is set and the object gives access to
    a lexical entry identifier, an <a> child is inserted at the first position
    of the element: its "href" attribute is the identifier and its text is the
    "targets" attribute of the element. Otherwise the element is not modified.

    @param object The instance providing get_lexical_entry().
    @param element The XML element in which the link has to be inserted.
    @return A tuple (object, element).
    """
    # To access options. The 'global' statement has to precede the import
    # binding the name: the reverse order is rejected by Python 3 and only
    # tolerated with a warning by Python 2.
    global options
    from pylmflib import options
    if not options.cross_references:
        return (object, element)
    # Retrieve identifier
    try:
        entry_id = object.get_lexical_entry().get_id()
    except AttributeError:
        # Raised for instance when get_lexical_entry() returns None
        entry_id = None
    if entry_id is not None:
        # Create link
        a = Element("a")
        a.attrib["href"] = entry_id
        a.text = element.attrib["targets"]
        # Insert link in element
        element.insert(0, a)
    return (object, element)
def xml_lmf_write(object, filename):
    """! @brief Write an LMF instance as an XML LMF file.
    @param object The LMF instance to write as XML.
    @param filename The name of the XML LMF file to write with full path, for instance 'user/output.xml'.
    """
    # The root XML element is named after the class of the instance
    root_tag = object.__class__.__name__
    root = Element(root_tag)
    # Let build_sub_elements() populate the root from the instance
    build_sub_elements(object, root)
    # Hand the resulting tree over to write_result() together with the file name
    write_result(root, filename)
def test_handle_font(self):
    """Compare the serialized result of handle_font() with a hand-built element."""
    value = "blaA{bla1} blaB {bla2}blaC {bla3}"
    given = Element("name", val=unicode(value))
    # Reference element: leading text followed by three IPA spans
    expected = Element("name", val=unicode(value))
    expected.text = "blaA"
    for text, tail in (("bla1", " blaB "), ("bla2", "blaC "), ("bla3", "")):
        span = SubElement(expected, "span", {"class": "ipa"})
        span.text = text
        span.tail = tail
    self.assertEqual(tostring(handle_font(given)), tostring(expected))
def test_write_result(self):
    """Call write_result() on a small element, then delete the produced file."""
    import os
    import sys
    # The output file is located next to the first entry of the module search path
    xml_filename = sys.path[0] + '/' + "output.xml"
    # XML element owning a single sub-element
    root = Element("LexicalEntry")
    SubElement(root, "Lemma")
    # Write result
    write_result(root, xml_filename)
    del root
    # Remove XML file
    os.remove(xml_filename)
def test_handle_fv(self):
    """Compare the serialized result of handle_fv() with hand-built elements, for both notations."""
    # Each case: input value, then text and tail expected for the first span
    cases = (
        ("fv:something here and fv:there", "something", " here and "),
        ("|fv{something here} and fv:there", "something here", " and "),
    )
    for value, first_text, first_tail in cases:
        given = Element("name", val=unicode(value))
        # Reference element: empty leading text followed by two vernacular spans
        expected = Element("name", val=unicode(value))
        first = SubElement(expected, "span", {"class": "vernacular"})
        second = SubElement(expected, "span", {"class": "vernacular"})
        expected.text = ""
        first.text = first_text
        first.tail = first_tail
        # The second span is the same whatever the notation of the first one
        second.text = "there"
        second.tail = ""
        self.assertEqual(tostring(handle_fv(given)), tostring(expected))
def test_build_sub_elements(self):
    """Check the <Lemma> and <feat> sub-elements produced by build_sub_elements()."""
    # LMF objects to convert, and the empty XML element to fill in
    entry = LexicalEntry()
    entry.lemma = Lemma()
    entry.partOfSpeech = "toto"
    entry.status = "draft"
    entry.lemma.lexeme = "hello"
    root = Element("LexicalEntry")
    # Build sub-elements
    build_sub_elements(entry, root)
    # The lemma holds the lexeme feature
    lexeme = root.find("Lemma").find("feat")
    self.assertEqual(lexeme.attrib["att"], "lexeme")
    self.assertEqual(lexeme.attrib["val"], "hello")
    # Exactly two direct features are expected: status first, then part of speech
    status, part_of_speech = root.findall("feat")
    for feat, att, val in ((part_of_speech, "partOfSpeech", "toto"), (status, "status", "draft")):
        self.assertEqual(feat.attrib["att"], att)
        self.assertEqual(feat.attrib["val"], val)
    # Release objects
    del entry.lemma
    entry.lemma = None
    del entry, root
def test_handle_fn(self):
    """Compare the serialized result of handle_fn() with hand-built elements, for both notations."""
    # Each case: input value, then text and tail expected for the second span
    cases = (
        ("textfn:this fn:but not this", "but", " not this"),
        ("textfn:this |fn{and this}", "and this", ""),
    )
    for value, second_text, second_tail in cases:
        given = Element("name", val=unicode(value))
        # Reference element: leading text followed by two national spans
        expected = Element("name", val=unicode(value))
        first = SubElement(expected, "span", {"class": "national"})
        second = SubElement(expected, "span", {"class": "national"})
        # The leading text and the first span are the same in both cases
        expected.text = "text"
        first.text = "this"
        first.tail = " "
        second.text = second_text
        second.tail = second_tail
        self.assertEqual(tostring(handle_fn(given)), tostring(expected))
def handle_fv(element):
    """Replace 'fv:xxx' and '|fv{xxx}' by '<span class="vernacular">xxx</span>'.

    The "val" attribute of the element is scanned for both notations. The
    text preceding the first marker becomes the element text, each marked
    part becomes a <span class="vernacular"> child inserted at positions
    0, 1, 2..., and the text between/after markers becomes the tail of the
    preceding span. The element is left untouched when nothing matches.

    @param element The XML element whose "val" attribute has to be converted.
    @return The same element, modified in place.
    """
    import re
    # Find text to display in vernacular font.
    # Groups 1 to 4 describe the 'fv:xxx' notation, groups 5 to 8 the
    # '|fv{xxx}' one. DOTALL keeps the text located after a line break in the
    # trailing groups; without it the remainder of a multi-line value is lost.
    pattern = re.compile(
        r"(([^:\|]*)fv:([^\s\.,)]*)(.*))|(([^:\|]*)\|fv{([^}]*)}(.*))",
        re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        if result.group(1) is not None:
            before, vernacular, after = result.group(2, 3, 4)
        else:
            # A match always comes from one of the two alternatives
            before, vernacular, after = result.group(6, 7, 8)
        # Handle previous span or element
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "vernacular"
        span.text = vernacular
        # Insert span in element
        element.insert(index, span)
        # Update result: look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remainder follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
def handle_fv(element):
    """Replace 'fv:xxx' and '|fv{xxx}' by '<span class="vernacular">xxx</span>'.

    The "val" attribute of the element is scanned for both notations. The
    text preceding the first marker becomes the element text, each marked
    part becomes a <span class="vernacular"> child inserted at positions
    0, 1, 2..., and the text between/after markers becomes the tail of the
    preceding span. The element is left untouched when nothing matches.

    @param element The XML element whose "val" attribute has to be converted.
    @return The same element, modified in place.
    """
    import re
    # Find text to display in vernacular font.
    # Groups 1 to 4 describe the 'fv:xxx' notation, groups 5 to 8 the
    # '|fv{xxx}' one. DOTALL keeps the text located after a line break in the
    # trailing groups; without it the remainder of a multi-line value is lost.
    pattern = re.compile(
        r"(([^:\|]*)fv:([^\s\.,)]*)(.*))|(([^:\|]*)\|fv{([^}]*)}(.*))",
        re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        if result.group(1) is not None:
            before, vernacular, after = result.group(2, 3, 4)
        else:
            # A match always comes from one of the two alternatives
            before, vernacular, after = result.group(6, 7, 8)
        # Handle previous span or element
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "vernacular"
        span.text = vernacular
        # Insert span in element
        element.insert(index, span)
        # Update result: look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remainder follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
def test_get_sub_elements(self):
    """Check the values read back from an instance filled by get_sub_elements()."""
    # Instance to fill in
    entry = LexicalEntry()
    # XML element describing a lemma and two direct features
    root = Element("LexicalEntry")
    lemma = SubElement(root, "Lemma")
    SubElement(lemma, "feat", att="lexeme", val="hello")
    for att, val in (("partOfSpeech", "toto"), ("status", "draft")):
        SubElement(root, "feat", att=att, val=val)
    # Test results
    get_sub_elements(entry, root)
    self.assertEqual(entry.get_lexeme(), "hello")
    self.assertEqual(entry.get_partOfSpeech(), "toto")
    self.assertEqual(entry.get_status(), "draft")
    del entry, root, lemma
def test_parse_xml(self):
    """Write a small XML tree to a file, call parse_xml() on it, then delete the file."""
    import os
    import sys
    # The input file is located next to the first entry of the module search path
    xml_filename = sys.path[0] + '/' + "input.xml"
    # XML tree made of an element owning a single sub-element
    root = Element("LexicalEntry")
    SubElement(root, "Lemma")
    tree = ElementTree(root)
    # Write tree then parse
    tree.write(xml_filename)
    parse_xml(xml_filename)
    del tree, root
    # Remove XML file
    os.remove(xml_filename)
def handle_tones(element):
    """Replace tones subscripts by '<sub>xxx</sub>'.

    The processing depends on the "att" attribute of the element:
    - "tone": every character of "val" among 'abcd123' becomes a <sub> child,
      the other characters are appended to the element text (before the first
      subscript) or to the tail of the latest subscript;
    - "lexeme": "val" is matched against words of 1 to 5 syllables, each
      syllable being made of a body, a tone part and an optional subscript
      character; only the subscript ending the word becomes a <sub> child,
      all the preceding text goes to the element text;
    - any other value: the element is not modified.
    The element text is reset to None when it ends up identical to "val".

    @param element The XML element to convert, owning "att" and "val" attributes.
    @return The same element, modified in place.
    """
    import re
    value = element.attrib["val"]
    kind = element.attrib["att"]
    if kind == "tone":
        subscripts = frozenset("abcd123")
        # Initialize loop variables
        previous_sub = None
        if element.text is None:
            element.text = ""
        index = 0
        for c in value:
            if c in subscripts:
                # Create sub, with an empty tail ready to receive next characters
                sub = Element("sub")
                sub.text = c
                sub.tail = ""
                # Insert sub in element
                element.insert(index, sub)
                # Update loop variables
                previous_sub = sub
                index += 1
            elif previous_sub is None:
                # No subscript met yet: the character belongs to the element
                element.text += c
            else:
                previous_sub.tail += c
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    if kind != "lexeme":
        return element
    # Tone letters U+02E9, U+02E7 and U+02E5, written as escapes so that they
    # depend neither on the encoding of this source file nor on the Python
    # version
    tones = u"\u02e9\u02e7\u02e5"
    # Common end of a syllable: the tone part, then an optional subscript
    tone_part = u"(#?[" + tones + u"]{1,2}[$#]?)([abcd123]?)"
    # Monosyllabic
    current_pattern = u"([^" + tones + u"#$]+)" + tone_part
    result = re.match(u"^" + current_pattern + u"$", value)
    if result:
        element.text = result.group(1) + result.group(2)
        subscript = result.group(3)
        if subscript:
            # Create sub and insert it in element
            sub = Element("sub")
            sub.text = subscript
            element.insert(0, sub)
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    # Disyllabic: add a constraint on other syllables which must have at least
    # 2 characters (maximum 5)
    syllable = u"([^" + tones + u"#$]{2,5})" + tone_part
    # Handle words composed of 2, 3, 4, 5 syllables
    for syllable_nb in range(2, 6):
        current_pattern += syllable
        result = re.match(u"^" + current_pattern + u"$", value)
        if not result:
            continue
        if element.text is None:
            element.text = ""
        groups = result.groups()
        # Only the subscript of the last syllable is displayed as such: every
        # other group, inner subscripts included, is plain text
        element.text += u"".join(groups[:-1])
        subscript = groups[-1]
        if subscript:
            # Create sub
            sub = Element("sub")
            sub.text = subscript
            sub.tail = ""
            # Insert sub in element, at the index of the last syllable
            element.insert(syllable_nb - 1, sub)
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    # No pattern matches: hand the element back unchanged instead of None
    return element
def test_handle_tones(self):
    """Compare the serialized result of handle_tones() with hand-built elements for three "att" values."""
    ## Test "tone"
    value = u"LaM1H"
    given = Element("name", att="tone", val=value)
    # Reference element: leading text followed by two subscripts
    expected = Element("name", att="tone", val=value)
    expected.text = "L"
    for text, tail in (("a", "M"), ("1", "H")):
        sub = SubElement(expected, "sub")
        sub.text = text
        sub.tail = tail
    self.assertEqual(tostring(handle_tones(given)), tostring(expected))
    ## Test "lexeme"
    value = "aa˩abb˧bcc˥c".decode(encoding=ENCODING)
    given = Element("name", att="lexeme", val=value)
    # Reference element: a single subscript, located after the text
    expected = Element("name", att="lexeme", val=value)
    last_sub = SubElement(expected, "sub")
    expected.text = "aa˩abb˧bcc˥".decode(encoding=ENCODING)
    last_sub.text = "c"
    self.assertEqual(tostring(handle_tones(given)), tostring(expected))
    ## Test others: the element is expected to be serialized as it was built
    given = Element("name", att="other", val=value)
    expected = Element("name", att="other", val=value)
    self.assertEqual(tostring(handle_tones(given)), tostring(expected))
def handle_tones(element):
    """Replace tones subscripts by '<sub>xxx</sub>'.

    The processing depends on the "att" attribute of the element:
    - "tone": every character of "val" among 'abcd123' becomes a <sub> child,
      the other characters are appended to the element text (before the first
      subscript) or to the tail of the latest subscript;
    - "lexeme": "val" is matched against words of 1 to 5 syllables, each
      syllable being made of a body, a tone part and an optional subscript
      character; only the subscript ending the word becomes a <sub> child,
      all the preceding text goes to the element text;
    - any other value: the element is not modified.
    The element text is reset to None when it ends up identical to "val".

    @param element The XML element to convert, owning "att" and "val" attributes.
    @return The same element, modified in place.
    """
    import re
    value = element.attrib["val"]
    kind = element.attrib["att"]
    if kind == "tone":
        subscripts = frozenset("abcd123")
        # Initialize loop variables
        previous_sub = None
        if element.text is None:
            element.text = ""
        index = 0
        for c in value:
            if c in subscripts:
                # Create sub, with an empty tail ready to receive next characters
                sub = Element("sub")
                sub.text = c
                sub.tail = ""
                # Insert sub in element
                element.insert(index, sub)
                # Update loop variables
                previous_sub = sub
                index += 1
            elif previous_sub is None:
                # No subscript met yet: the character belongs to the element
                element.text += c
            else:
                previous_sub.tail += c
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    if kind != "lexeme":
        return element
    # Tone letters U+02E9, U+02E7 and U+02E5, written as escapes so that they
    # depend neither on the encoding of this source file nor on the Python
    # version
    tones = u"\u02e9\u02e7\u02e5"
    # Common end of a syllable: the tone part, then an optional subscript
    tone_part = u"(#?[" + tones + u"]{1,2}[$#]?)([abcd123]?)"
    # Monosyllabic
    current_pattern = u"([^" + tones + u"#$]+)" + tone_part
    result = re.match(u"^" + current_pattern + u"$", value)
    if result:
        element.text = result.group(1) + result.group(2)
        subscript = result.group(3)
        if subscript:
            # Create sub and insert it in element
            sub = Element("sub")
            sub.text = subscript
            element.insert(0, sub)
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    # Disyllabic: add a constraint on other syllables which must have at least
    # 2 characters (maximum 5)
    syllable = u"([^" + tones + u"#$]{2,5})" + tone_part
    # Handle words composed of 2, 3, 4, 5 syllables
    for syllable_nb in range(2, 6):
        current_pattern += syllable
        result = re.match(u"^" + current_pattern + u"$", value)
        if not result:
            continue
        if element.text is None:
            element.text = ""
        groups = result.groups()
        # Only the subscript of the last syllable is displayed as such: every
        # other group, inner subscripts included, is plain text
        element.text += u"".join(groups[:-1])
        subscript = groups[-1]
        if subscript:
            # Create sub
            sub = Element("sub")
            sub.text = subscript
            sub.tail = ""
            # Insert sub in element, at the index of the last syllable
            element.insert(syllable_nb - 1, sub)
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    # No pattern matches: hand the element back unchanged instead of None
    return element