Beispiel #1
0
def handle_caps(element):
    """Handle small caps.
    Replace '°xxx' by '<span class="sc">xxx</span>'.

    The "val" attribute of the element is scanned for '°' markers. The word
    following each marker (up to the next whitespace or one of '.,)+/:') is
    moved into a new 'span' child; the text around the markers is stored in
    the element text and in the span tails. The "val" attribute itself is not
    modified, and an element whose value has no marker is returned untouched.

    @param element The XML element to process; it must own a "val" attribute.
    @return The same element, modified in place.
    """
    import re
    # Match on text, not on encoded bytes: in a multi-byte encoding the marker
    # spans several bytes, which a character class treats as independent bytes,
    # so unrelated characters sharing one of them would block the match.
    # DOTALL lets the trailing group keep any text located after a line break.
    pattern = re.compile(u"([^\u00b0]*)\u00b0([^\\s\\.,)+/:]*)(.*)", re.DOTALL)
    # Find text to display in small caps
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, small_caps, after = result.groups()
        # Text preceding the marker belongs to the element or to the previous span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "sc"
        span.text = small_caps
        # Insert span in element, after the spans already created
        element.insert(index, span)
        # Look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remaining text follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
Beispiel #2
0
def handle_font(element):
    """Replace '{xxx}' by '<span class="ipa">xxx</span>'.

    The "val" attribute of the element is scanned for '{xxx}' groups. The
    content of each group is moved into a new 'span' child; the text around
    the groups is stored in the element text and in the span tails. The "val"
    attribute itself is not modified, and an element whose value has no group
    is returned untouched.

    @param element The XML element to process; it must own a "val" attribute.
    @return The same element, modified in place.
    """
    import re
    # Find text to display in IPA.
    # DOTALL lets the trailing group keep any text located after a line break
    # (it would otherwise be silently dropped); braces are escaped so that they
    # can never be read as a repetition.
    pattern = re.compile(r"([^{}]*)\{([^}]*)\}(.*)", re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, ipa, after = result.groups()
        # Text preceding the group belongs to the element or to the previous span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "ipa"
        span.text = ipa
        # Insert span in element, after the spans already created
        element.insert(index, span)
        # Look for the next group in the remaining text
        result = pattern.match(after)
        if not result:
            # No group left: the remaining text follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
Beispiel #3
0
def handle_pinyin(element):
    """Replace '@xxx' by '<span class="pinyin">xxx</span>'.

    The "val" attribute of the element is scanned for '@' markers. The word
    following each marker is moved into a new 'span' child; the text around
    the markers is stored in the element text and in the span tails. The "val"
    attribute itself is not modified, and an element whose value has no marker
    is returned untouched.

    @param element The XML element to process; it must own a "val" attribute.
    @return The same element, modified in place.
    """
    import re
    # Find pinyin.
    # UNICODE makes '\w' accept accented letters on every Python version, so a
    # syllable carrying a tone mark is not cut at its first accented letter.
    # DOTALL lets the trailing group keep any text located after a line break.
    pattern = re.compile(r"([^@]*)@(\w*)(.*)", re.UNICODE | re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, pinyin, after = result.groups()
        # Text preceding the marker belongs to the element or to the previous span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "pinyin"
        span.text = pinyin
        # Insert span in element, after the spans already created
        element.insert(index, span)
        # Look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remaining text follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
Beispiel #4
0
def handle_caps(element):
    """Handle small caps.
    Replace '°xxx' by '<span class="sc">xxx</span>'.

    The "val" attribute of the element is scanned for '°' markers. The word
    following each marker (up to the next whitespace or one of '.,)+/:') is
    moved into a new 'span' child; the text around the markers is stored in
    the element text and in the span tails. The "val" attribute itself is not
    modified, and an element whose value has no marker is returned untouched.

    @param element The XML element to process; it must own a "val" attribute.
    @return The same element, modified in place.
    """
    import re
    # Match on text, not on encoded bytes: in a multi-byte encoding the marker
    # spans several bytes, which a character class treats as independent bytes,
    # so unrelated characters sharing one of them would block the match.
    # DOTALL lets the trailing group keep any text located after a line break.
    pattern = re.compile(u"([^\u00b0]*)\u00b0([^\\s\\.,)+/:]*)(.*)", re.DOTALL)
    # Find text to display in small caps
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, small_caps, after = result.groups()
        # Text preceding the marker belongs to the element or to the previous span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "sc"
        span.text = small_caps
        # Insert span in element, after the spans already created
        element.insert(index, span)
        # Look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remaining text follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
Beispiel #5
0
def handle_pinyin(element):
    """Replace '@xxx' by '<span class="pinyin">xxx</span>'.

    The "val" attribute of the element is scanned for '@' markers. The word
    following each marker is moved into a new 'span' child; the text around
    the markers is stored in the element text and in the span tails. The "val"
    attribute itself is not modified, and an element whose value has no marker
    is returned untouched.

    @param element The XML element to process; it must own a "val" attribute.
    @return The same element, modified in place.
    """
    import re
    # Find pinyin.
    # UNICODE makes '\w' accept accented letters on every Python version, so a
    # syllable carrying a tone mark is not cut at its first accented letter.
    # DOTALL lets the trailing group keep any text located after a line break.
    pattern = re.compile(r"([^@]*)@(\w*)(.*)", re.UNICODE | re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, pinyin, after = result.groups()
        # Text preceding the marker belongs to the element or to the previous span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "pinyin"
        span.text = pinyin
        # Insert span in element, after the spans already created
        element.insert(index, span)
        # Look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            # No marker left: the remaining text follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
Beispiel #6
0
def handle_font(element):
    """Replace '{xxx}' by '<span class="ipa">xxx</span>'.

    The "val" attribute of the element is scanned for '{xxx}' groups. The
    content of each group is moved into a new 'span' child; the text around
    the groups is stored in the element text and in the span tails. The "val"
    attribute itself is not modified, and an element whose value has no group
    is returned untouched.

    @param element The XML element to process; it must own a "val" attribute.
    @return The same element, modified in place.
    """
    import re
    # Find text to display in IPA.
    # DOTALL lets the trailing group keep any text located after a line break
    # (it would otherwise be silently dropped); braces are escaped so that they
    # can never be read as a repetition.
    pattern = re.compile(r"([^{}]*)\{([^}]*)\}(.*)", re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, ipa, after = result.groups()
        # Text preceding the group belongs to the element or to the previous span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "ipa"
        span.text = ipa
        # Insert span in element, after the spans already created
        element.insert(index, span)
        # Look for the next group in the remaining text
        result = pattern.match(after)
        if not result:
            # No group left: the remaining text follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
 def test_add_link(self):
     """Check that add_link() returns the given form along with the element holding an <a> link."""
     from morphology.related_form import RelatedForm
     # Related form attached to a lexical entry
     related_form = RelatedForm()
     related_form.set_lexical_entry(LexicalEntry(id="lx_id"))
     # Expected XML: same element owning an <a> child
     expected = Element("RelatedForm", targets="lx")
     link = SubElement(expected, "a")
     link.text = "lx"
     link.attrib["href"] = "lx_id1"
     # Run on a fresh element and compare both members of the returned pair
     returned_form, returned_element = add_link(related_form, Element("RelatedForm", targets="lx"))
     self.assertEqual(returned_form, related_form)
     self.assertEqual(tostring(returned_element), tostring(expected))
 def test_handle_pinyin(self):
     """Check that both '@' markers of the value are turned into pinyin spans."""
     value = "@at1 atA@at2 atB"
     # Expected element: empty leading text, then one span per marker with its tail
     expected = Element("name", val=unicode(value))
     expected.text = ""
     for pinyin, following in (("at1", " atA"), ("at2", " atB")):
         span = SubElement(expected, "span")
         span.attrib["class"] = "pinyin"
         span.text = pinyin
         span.tail = following
     # Process a fresh element and compare serializations
     processed = handle_pinyin(Element("name", val=unicode(value)))
     self.assertEqual(tostring(processed), tostring(expected))
 def test_handle_caps(self):
     """Check that both '°' markers of the value are turned into small caps spans."""
     value = u"°trucs et°astuces"
     # Expected element: empty leading text, then one span per marker with its tail
     expected = Element("name", val=value)
     expected.text = ""
     for small_caps, following in (("trucs", " et"), ("astuces", "")):
         span = SubElement(expected, "span")
         span.attrib["class"] = "sc"
         span.text = small_caps
         span.tail = following
     # Process a fresh element and compare serializations
     processed = handle_caps(Element("name", val=value))
     self.assertEqual(tostring(processed), tostring(expected))
 def test_prettify(self):
     """Check the string returned by prettify() for an element owning one sub-element."""
     # XML element with a single child
     root = Element("LexicalEntry")
     SubElement(root, "Lemma")
     # Expected result: every line, the last one included, ends with the end-of-line sequence
     eol = unicode(EOL)
     expected_lines = [
         "<?xml version=\"1.0\" encoding=\"utf-8\"?>",
         "<LexicalEntry>",
         "    <Lemma/>",
         "</LexicalEntry>",
     ]
     expected_str = "".join([line + eol for line in expected_lines])
     # Test
     self.assertEqual(prettify(root), expected_str)
     del root
Beispiel #11
0
def add_link(object, element):
    """Insert a hyperlink '<a href="id">targets</a>' in XML.

    When the 'cross_references' option is set and the lexical entry of the
    object has an identifier, an 'a' element is inserted as first child of the
    XML element: its "href" attribute is the identifier and its text is the
    "targets" attribute of the XML element.

    @param object The instance whose lexical entry gives the identifier to link to.
    @param element The XML element to complete; it must own a "targets" attribute when a link is created.
    @return A tuple (object, element).
    """
    # To access options. The 'global' statement has to come before the import
    # which binds the name, otherwise Python 3 refuses to compile the function.
    global options
    from pylmflib import options
    if options.cross_references:
        # Retrieve identifier
        try:
            entry_id = object.get_lexical_entry().get_id()
        except AttributeError:
            # No lexical entry to point to (or object without such accessor)
            entry_id = None
        if entry_id is not None:
            # Create link
            link = Element("a")
            link.attrib["href"] = entry_id
            link.text = element.attrib["targets"]
            # Insert link in element
            element.insert(0, link)
    return (object, element)
Beispiel #12
0
def add_link(object, element):
    """Insert a hyperlink '<a href="id">targets</a>' in XML.

    When the 'cross_references' option is set and the lexical entry of the
    object has an identifier, an 'a' element is inserted as first child of the
    XML element: its "href" attribute is the identifier and its text is the
    "targets" attribute of the XML element.

    @param object The instance whose lexical entry gives the identifier to link to.
    @param element The XML element to complete; it must own a "targets" attribute when a link is created.
    @return A tuple (object, element).
    """
    # To access options. The 'global' statement has to come before the import
    # which binds the name, otherwise Python 3 refuses to compile the function.
    global options
    from pylmflib import options
    if options.cross_references:
        # Retrieve identifier
        try:
            entry_id = object.get_lexical_entry().get_id()
        except AttributeError:
            # No lexical entry to point to (or object without such accessor)
            entry_id = None
        if entry_id is not None:
            # Create link
            link = Element("a")
            link.attrib["href"] = entry_id
            link.text = element.attrib["targets"]
            # Insert link in element
            element.insert(0, link)
    return (object, element)
Beispiel #13
0
def xml_lmf_write(object, filename):
    """! @brief Write an LMF instance as an XML LMF file.
    @param object The LMF instance to write as XML.
    @param filename The name of the XML LMF file to write with full path, for instance 'user/output.xml'.
    """
    # The root XML element is named after the class of the LMF instance
    root_tag = object.__class__.__name__
    xml_root = Element(root_tag)
    # Create the XML sub-elements below the root, then write the whole tree in the output file
    build_sub_elements(object, xml_root)
    write_result(xml_root, filename)
 def test_handle_font(self):
     """Check that the three '{...}' groups of the value are turned into IPA spans."""
     value = "blaA{bla1} blaB {bla2}blaC {bla3}"
     # Expected element: leading text, then one span per group with its tail
     expected = Element("name", val=unicode(value))
     expected.text = "blaA"
     for ipa, following in (("bla1", " blaB "), ("bla2", "blaC "), ("bla3", "")):
         span = SubElement(expected, "span")
         span.attrib["class"] = "ipa"
         span.text = ipa
         span.tail = following
     # Process a fresh element and compare serializations
     processed = handle_font(Element("name", val=unicode(value)))
     self.assertEqual(tostring(processed), tostring(expected))
 def test_write_result(self):
     """Check that write_result() creates the requested XML file.

     The file is written next to the test script (first entry of sys.path)
     and removed afterwards, even when the write or the check fails.
     """
     import sys
     import os
     utest_path = sys.path[0] + '/'
     xml_filename = utest_path + "output.xml"
     # Create XML element with sub-element
     element = Element("LexicalEntry")
     SubElement(element, "Lemma")
     try:
         # Write result
         write_result(element, xml_filename)
         # Make the expectation explicit instead of relying on the removal to fail
         self.assertTrue(os.path.isfile(xml_filename))
     finally:
         del element
         # Remove XML file, if any, so that a failure leaves nothing behind
         if os.path.exists(xml_filename):
             os.remove(xml_filename)
 def test_handle_fv(self):
     """Check both vernacular notations, 'fv:xxx' and '|fv{xxx}', on two sample values."""
     # (value to process, text of the first span, tail of the first span)
     cases = (
         ("fv:something here and fv:there", "something", " here and "),
         ("|fv{something here} and fv:there", "something here", " and "),
     )
     for value, first_text, first_tail in cases:
         # Expected element: empty leading text followed by two spans
         expected = Element("name", val=unicode(value))
         expected.text = ""
         first = SubElement(expected, "span")
         first.attrib["class"] = "vernacular"
         first.text = first_text
         first.tail = first_tail
         # The second span is the same for both values
         second = SubElement(expected, "span")
         second.attrib["class"] = "vernacular"
         second.text = "there"
         second.tail = ""
         # Process a fresh element and compare serializations
         processed = handle_fv(Element("name", val=unicode(value)))
         self.assertEqual(tostring(processed), tostring(expected))
 def test_build_sub_elements(self):
     """Check the XML sub-elements built by build_sub_elements() from a LexicalEntry owning a Lemma."""
     # Create LMF objects and an empty XML element
     instance = LexicalEntry()
     instance.lemma = Lemma()
     instance.partOfSpeech = "toto"
     instance.status = "draft"
     instance.lemma.lexeme = "hello"
     element = Element("LexicalEntry")
     # Build sub-elements and test result
     build_sub_elements(instance, element)
     # The lexeme is expected as first <feat> of the <Lemma> sub-element
     lemma = element.find("Lemma")
     lexeme = lemma.find("feat")
     self.assertEqual(lexeme.attrib["att"], "lexeme")
     self.assertEqual(lexeme.attrib["val"], "hello")
     # Exactly two direct <feat> children are expected, status first.
     # NOTE(review): this order presumably mirrors how build_sub_elements walks the attributes of the instance -- confirm
     [status, partOfSpeech] = element.findall("feat")
     self.assertEqual(partOfSpeech.attrib["att"], "partOfSpeech")
     self.assertEqual(partOfSpeech.attrib["val"], "toto")
     self.assertEqual(status.attrib["att"], "status")
     self.assertEqual(status.attrib["val"], "draft")
     # Drop the Lemma, reset the attribute to None, then drop the remaining local objects
     del instance.lemma
     instance.lemma = None
     del instance, element
 def test_handle_fn(self):
     """Check both national notations, 'fn:xxx' and '|fn{xxx}', on two sample values."""
     # (value to process, text of the second span, tail of the second span)
     cases = (
         ("textfn:this fn:but not this", "but", " not this"),
         ("textfn:this |fn{and this}", "and this", ""),
     )
     for value, second_text, second_tail in cases:
         # Expected element: leading text followed by two spans
         expected = Element("name", val=unicode(value))
         expected.text = "text"
         # The first span is the same for both values
         first = SubElement(expected, "span")
         first.attrib["class"] = "national"
         first.text = "this"
         first.tail = " "
         second = SubElement(expected, "span")
         second.attrib["class"] = "national"
         second.text = second_text
         second.tail = second_tail
         # Process a fresh element and compare serializations
         processed = handle_fn(Element("name", val=unicode(value)))
         self.assertEqual(tostring(processed), tostring(expected))
Beispiel #19
0
def handle_fv(element):
    """Replace 'fv:xxx' and '|fv{xxx}' by '<span class="vernacular">xxx</span>'.

    The "val" attribute of the element is scanned for both notations. With
    'fv:xxx' the vernacular text stops at the next whitespace or at one of
    '.,)'; with '|fv{xxx}' it is the whole content of the braces. Each
    vernacular text is moved into a new 'span' child; the text around is
    stored in the element text and in the span tails. The "val" attribute
    itself is not modified, and an element whose value has no such notation is
    returned untouched.

    @param element The XML element to process; it must own a "val" attribute.
    @return The same element, modified in place.
    """
    import re
    # Find text to display in vernacular font.
    # DOTALL lets the trailing groups keep any text located after a line break
    # (it would otherwise be silently dropped); braces are escaped so that they
    # can never be read as a repetition.
    pattern = re.compile(r"(([^:\|]*)fv:([^\s\.,)]*)(.*))|(([^:\|]*)\|fv\{([^}]*)\}(.*))", re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        if result.group(1) is not None:
            # 'fv:xxx' notation
            before, vernacular, after = result.group(2, 3, 4)
        else:
            # '|fv{xxx}' notation: the only other way for the pattern to match
            before, vernacular, after = result.group(6, 7, 8)
        # Text preceding the notation belongs to the element or to the previous span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "vernacular"
        span.text = vernacular
        # Insert span in element, after the spans already created
        element.insert(index, span)
        # Look for the next notation in the remaining text
        result = pattern.match(after)
        if not result:
            # Nothing left to convert: the remaining text follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
Beispiel #20
0
def handle_fv(element):
    """Replace 'fv:xxx' and '|fv{xxx}' by '<span class="vernacular">xxx</span>'.

    The "val" attribute of the element is scanned for both notations. With
    'fv:xxx' the vernacular text stops at the next whitespace or at one of
    '.,)'; with '|fv{xxx}' it is the whole content of the braces. Each
    vernacular text is moved into a new 'span' child; the text around is
    stored in the element text and in the span tails. The "val" attribute
    itself is not modified, and an element whose value has no such notation is
    returned untouched.

    @param element The XML element to process; it must own a "val" attribute.
    @return The same element, modified in place.
    """
    import re
    # Find text to display in vernacular font.
    # DOTALL lets the trailing groups keep any text located after a line break
    # (it would otherwise be silently dropped); braces are escaped so that they
    # can never be read as a repetition.
    pattern = re.compile(r"(([^:\|]*)fv:([^\s\.,)]*)(.*))|(([^:\|]*)\|fv\{([^}]*)\}(.*))", re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        if result.group(1) is not None:
            # 'fv:xxx' notation
            before, vernacular, after = result.group(2, 3, 4)
        else:
            # '|fv{xxx}' notation: the only other way for the pattern to match
            before, vernacular, after = result.group(6, 7, 8)
        # Text preceding the notation belongs to the element or to the previous span
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "vernacular"
        span.text = vernacular
        # Insert span in element, after the spans already created
        element.insert(index, span)
        # Look for the next notation in the remaining text
        result = pattern.match(after)
        if not result:
            # Nothing left to convert: the remaining text follows the last span
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
Beispiel #21
0
 def test_get_sub_elements(self):
     """Check that get_sub_elements() fills a LexicalEntry from an XML element and its sub-elements."""
     # Prepare XML element: a <Lemma> owning the lexeme, then two direct <feat> children
     xml_entry = Element("LexicalEntry")
     xml_lemma = SubElement(xml_entry, "Lemma")
     SubElement(xml_lemma, "feat", att="lexeme", val="hello")
     for att, val in (("partOfSpeech", "toto"), ("status", "draft")):
         SubElement(xml_entry, "feat", att=att, val=val)
     # Declare the instance to fill in
     entry = LexicalEntry()
     # Test results
     get_sub_elements(entry, xml_entry)
     self.assertEqual(entry.get_lexeme(), "hello")
     self.assertEqual(entry.get_partOfSpeech(), "toto")
     self.assertEqual(entry.get_status(), "draft")
     del entry, xml_entry, xml_lemma
 def test_parse_xml(self):
     """Check that parse_xml() reads an XML file written beforehand.

     The file is written next to the test script (first entry of sys.path)
     and removed afterwards, even when the write or the parsing fails.
     """
     import sys
     import os
     utest_path = sys.path[0] + '/'
     xml_filename = utest_path + "input.xml"
     # Create XML tree
     element = Element("LexicalEntry")
     SubElement(element, "Lemma")
     tree = ElementTree(element)
     try:
         # Write tree then parse
         tree.write(xml_filename)
         parse_xml(xml_filename)
     finally:
         del tree, element
         # Remove XML file, if any, so that a failure leaves nothing behind
         if os.path.exists(xml_filename):
             os.remove(xml_filename)
Beispiel #23
0
def handle_tones(element):
    """Replace tones subscripts by '<sub>xxx</sub>'.

    The processing depends on the "att" attribute of the element:
    - "tone": every character of "val" among 'abcd123' becomes a 'sub' child;
      any other character is appended to the element text, or to the tail of
      the latest 'sub' once one has been created;
    - "lexeme": "val" is matched against words of 1 to 5 syllables, each one
      ending with 1 or 2 tone letters optionally followed by a subscript among
      'abcd123'; only the subscript of the last syllable becomes a 'sub' child,
      the rest of the word goes to the element text;
    - any other value: the element is returned untouched.
    In the end, an element text equal to "val" is reset to None. The "val"
    attribute itself is never modified.

    @param element The XML element to process; it must own an "att" attribute, and a "val" attribute when "att" is "tone" or "lexeme".
    @return The same element, modified in place.
    """
    import re
    if element.attrib["att"] == "tone":
        value = element.attrib["val"]
        subscripts = set("abcd123")
        # Initialize loop variables
        previous_sub = None
        if element.text is None:
            element.text = ""
        index = 0
        for c in value:
            if c in subscripts:
                # Create sub, with an empty tail ready to receive following characters
                sub = Element("sub")
                sub.text = c
                sub.tail = ""
                # Insert sub in element, after the subs already created
                element.insert(index, sub)
                # Update loop variables
                previous_sub = sub
                index += 1
            elif previous_sub is None:
                # No sub yet: the character belongs to the element text
                element.text += c
            else:
                previous_sub.tail += c
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    if element.attrib["att"] != "lexeme":
        return element
    value = element.attrib["val"]
    # Find text to display as subscript.
    # Tone letters are written as escapes (extra-low, mid and extra-high tone
    # bars) so that they depend neither on the encoding of this source file
    # nor on the Python version.
    tones = u"\u02e9\u02e7\u02e5"
    tone_and_subscript = "(#?[" + tones + "]{1,2}[$#]?)([abcd123]?)"
    # Monosyllabic
    current_pattern = "([^" + tones + "#$]+)" + tone_and_subscript
    result = re.match("^" + current_pattern + "$", value)
    if result is not None:
        stem, tone, subscript = result.groups()
        element.text = stem + tone
        if subscript:
            # Create sub
            sub = Element("sub")
            sub.text = subscript
            # Insert sub in element
            element.insert(0, sub)
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    # Disyllabic: add a constraint on other syllables which must have at least 2 characters (maximum 5)
    syllable = "([^" + tones + "#$]{2,5})" + tone_and_subscript
    # Handle words composed of 2, 3, 4, 5 syllables
    for syllable_nb in range(2, 6):
        current_pattern += syllable
        result = re.match("^" + current_pattern + "$", value)
        if result is None:
            continue
        groups = result.groups()
        # Only the subscript closing the word is displayed as subscript:
        # everything located before it is plain text
        subscript = groups[-1]
        if element.text is None:
            element.text = ""
        element.text += "".join(groups[:-1])
        if subscript:
            # Create sub, with an empty tail
            sub = Element("sub")
            sub.text = subscript
            sub.tail = ""
            # Insert sub in element, at the index of the last syllable
            element.insert(syllable_nb - 1, sub)
        # Every syllable holds exactly one run of tone letters, so a word
        # cannot match another number of syllables
        break
    if element.text == value:
        # Reset if identical
        element.text = None
    return element
 def test_handle_tones(self):
     """Check handle_tones() on a "tone" value, on a "lexeme" value and on another attribute."""
     ## Test "tone"
     tone_value = u"LaM1H"
     # Expected element: leading text, then one sub per subscript with its tail
     expected = Element("name", att="tone", val=tone_value)
     expected.text = "L"
     for subscript, following in (("a", "M"), ("1", "H")):
         sub = SubElement(expected, "sub")
         sub.text = subscript
         sub.tail = following
     processed = handle_tones(Element("name", att="tone", val=tone_value))
     self.assertEqual(tostring(processed), tostring(expected))
     ## Test "lexeme"
     lexeme_value = "aa˩abb˧bcc˥c".decode(encoding=ENCODING)
     # Expected element: the whole word as text, except its final subscript
     expected = Element("name", att="lexeme", val=lexeme_value)
     final_sub = SubElement(expected, "sub")
     expected.text = "aa˩abb˧bcc˥".decode(encoding=ENCODING)
     final_sub.text = "c"
     processed = handle_tones(Element("name", att="lexeme", val=lexeme_value))
     self.assertEqual(tostring(processed), tostring(expected))
     ## Test others
     # Any other attribute must leave the element as is
     processed = handle_tones(Element("name", att="other", val=lexeme_value))
     expected = Element("name", att="other", val=lexeme_value)
     self.assertEqual(tostring(processed), tostring(expected))
Beispiel #25
0
def handle_tones(element):
    """Replace tones subscripts by '<sub>xxx</sub>'.

    The processing depends on the "att" attribute of the element:
    - "tone": every character of "val" among 'abcd123' becomes a 'sub' child;
      any other character is appended to the element text, or to the tail of
      the latest 'sub' once one has been created;
    - "lexeme": "val" is matched against words of 1 to 5 syllables, each one
      ending with 1 or 2 tone letters optionally followed by a subscript among
      'abcd123'; only the subscript of the last syllable becomes a 'sub' child,
      the rest of the word goes to the element text;
    - any other value: the element is returned untouched.
    In the end, an element text equal to "val" is reset to None. The "val"
    attribute itself is never modified.

    @param element The XML element to process; it must own an "att" attribute, and a "val" attribute when "att" is "tone" or "lexeme".
    @return The same element, modified in place.
    """
    import re
    if element.attrib["att"] == "tone":
        value = element.attrib["val"]
        subscripts = set("abcd123")
        # Initialize loop variables
        previous_sub = None
        if element.text is None:
            element.text = ""
        index = 0
        for c in value:
            if c in subscripts:
                # Create sub, with an empty tail ready to receive following characters
                sub = Element("sub")
                sub.text = c
                sub.tail = ""
                # Insert sub in element, after the subs already created
                element.insert(index, sub)
                # Update loop variables
                previous_sub = sub
                index += 1
            elif previous_sub is None:
                # No sub yet: the character belongs to the element text
                element.text += c
            else:
                previous_sub.tail += c
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    if element.attrib["att"] != "lexeme":
        return element
    value = element.attrib["val"]
    # Find text to display as subscript.
    # Tone letters are written as escapes (extra-low, mid and extra-high tone
    # bars) so that they depend neither on the encoding of this source file
    # nor on the Python version.
    tones = u"\u02e9\u02e7\u02e5"
    tone_and_subscript = "(#?[" + tones + "]{1,2}[$#]?)([abcd123]?)"
    # Monosyllabic
    current_pattern = "([^" + tones + "#$]+)" + tone_and_subscript
    result = re.match("^" + current_pattern + "$", value)
    if result is not None:
        stem, tone, subscript = result.groups()
        element.text = stem + tone
        if subscript:
            # Create sub
            sub = Element("sub")
            sub.text = subscript
            # Insert sub in element
            element.insert(0, sub)
        if element.text == value:
            # Reset if identical
            element.text = None
        return element
    # Disyllabic: add a constraint on other syllables which must have at least 2 characters (maximum 5)
    syllable = "([^" + tones + "#$]{2,5})" + tone_and_subscript
    # Handle words composed of 2, 3, 4, 5 syllables
    for syllable_nb in range(2, 6):
        current_pattern += syllable
        result = re.match("^" + current_pattern + "$", value)
        if result is None:
            continue
        groups = result.groups()
        # Only the subscript closing the word is displayed as subscript:
        # everything located before it is plain text
        subscript = groups[-1]
        if element.text is None:
            element.text = ""
        element.text += "".join(groups[:-1])
        if subscript:
            # Create sub, with an empty tail
            sub = Element("sub")
            sub.text = subscript
            sub.tail = ""
            # Insert sub in element, at the index of the last syllable
            element.insert(syllable_nb - 1, sub)
        # Every syllable holds exactly one run of tone letters, so a word
        # cannot match another number of syllables
        break
    if element.text == value:
        # Reset if identical
        element.text = None
    return element