Python ensure_utf8 Beispiele, util.ensure_utf8 Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: hlds.py Projekt: winnerineast/pypolibox

def __determine_nom_prefix(diamond):
    """
    Determine which character is used as a prefix for a <nom> tag.

    Usually this is the first character of the corresponding <prop> tag
    (e.g. prop = "und" will turn nom = "konjunktion" into nom =
    "u1:konjunktion", iff it is the 1st "konjunktion" beginning with "u" in
    that sentence). A <prop> made up of digits only yields "n"; a missing or
    empty <prop> yields "x".

    :type diamond: ``Diamond``
    :param diamond: the structure whose "prop" entry (if present) is inspected

    :rtype: ``str``
    :return: a single character, passed through ``ensure_utf8``
    """
    if "prop" not in diamond.keys():  # there's no <prop> tag
        return ensure_utf8("x")

    # raw string literal: "\d" is not a valid escape in a plain string
    numbers_only = re.compile(r"\d+$")

    if numbers_only.match(ensure_utf8(diamond["prop"])):
        # <prop> represents a year, page count etc.
        return ensure_utf8("n")

    # The first character is taken from the original value, not from the
    # ensure_utf8() result.
    # NOTE(review): presumably so that a multi-byte first character is not
    # cut apart after encoding -- confirm against ensure_utf8.
    lowered_prop = diamond["prop"].lower()
    if not lowered_prop:
        # an empty <prop> has no first character; treat it like a missing
        # <prop> instead of failing with an IndexError
        return ensure_utf8("x")
    return ensure_utf8(lowered_prop[0])

Beispiel #2

0

Datei anzeigen

Datei: hlds.py Projekt: BigData-Tools/pypolibox

def __determine_nom_prefix(diamond):
    """
    Determine which character is used as a prefix for a <nom> tag.

    Usually this is the first character of the corresponding <prop> tag
    (e.g. prop = "und" will turn nom = "konjunktion" into nom =
    "u1:konjunktion", iff it is the 1st "konjunktion" beginning with "u" in
    that sentence). A <prop> made up of digits only yields "n"; a missing or
    empty <prop> yields "x".

    @type diamond: C{Diamond}
    @param diamond: the structure whose "prop" entry (if present) is inspected

    @rtype: C{str}
    @return: a single character, passed through C{ensure_utf8}
    """
    # raw string literal: "\d" is not a valid escape in a plain string
    numbers_only = re.compile(r"\d+$")

    if "prop" in diamond.keys():
        prop = ensure_utf8(diamond["prop"])
        if numbers_only.match(prop):
            nom_prefix_char = "n"
        else: # <prop> doesn't represent a year, page count etc.
            # The first character is taken from the original value, not
            # from the ensure_utf8() result.
            # NOTE(review): presumably so that a multi-byte first character
            # is not cut apart after encoding -- confirm against ensure_utf8.
            lowered_prop = diamond["prop"].lower()
            # an empty <prop> has no first character; fall back to the same
            # prefix as a missing <prop> instead of raising an IndexError
            nom_prefix_char = lowered_prop[0] if lowered_prop else "x"

    else: # if there's no <prop> tag
        nom_prefix_char = "x"

    return ensure_utf8(nom_prefix_char)

Beispiel #3

0

Datei anzeigen

Datei: hlds.py Projekt: BigData-Tools/pypolibox

def convert_diamond_xml2fs(etree):
    """
    Transform a HLDS XML <diamond>...</diamond> structure
    (that was parsed into an etree element) into an NLTK feature structure.

    Nested <diamond> children are converted recursively. The "name"
    attributes of the <nom> and <prop> children are used if present,
    otherwise they default to the empty string. If several <nom> or <prop>
    children exist, the last one wins.

    @type etree: C{etree._Element}
    @param etree: a diamond etree element; it must have a "mode" attribute

    @rtype: C{Diamond}
    @return: the result of C{create_diamond} for the collected values
    """
    mode = ensure_utf8(etree.attrib["mode"])

    nested_diamonds = []
    nom = "" # default value
    prop = "" # default value

    # Iterate over the element itself: getchildren() is deprecated and
    # direct iteration yields the same direct children.
    for child in etree:
        if child.tag == "diamond":
            nested_diamonds.append(convert_diamond_xml2fs(child))
        elif child.tag == "nom":
            nom = ensure_utf8(child.attrib["name"])
        elif child.tag == "prop":
            prop = ensure_utf8(child.attrib["name"])

    return create_diamond(mode, nom, prop, nested_diamonds)

Beispiel #4

0

Datei anzeigen

Datei: hlds.py Projekt: winnerineast/pypolibox

def convert_diamond_xml2fs(etree):
    """
    Transform a HLDS XML <diamond>...</diamond> structure
    (that was parsed into an etree element) into an NLTK feature structure.

    Nested <diamond> children are converted recursively. The "name"
    attributes of the <nom> and <prop> children are used if present,
    otherwise they default to the empty string. If several <nom> or <prop>
    children exist, the last one wins.

    :type etree: ``etree._Element``
    :param etree: a diamond etree element; it must have a "mode" attribute

    :rtype: ``Diamond``
    :return: the result of ``create_diamond`` for the collected values
    """
    mode = ensure_utf8(etree.attrib["mode"])

    nested_diamonds = []
    nom = ""  # default value
    prop = ""  # default value

    # Iterate over the element itself: getchildren() is deprecated and
    # direct iteration yields the same direct children.
    for child in etree:
        tag = child.tag
        if tag == "diamond":
            nested_diamonds.append(convert_diamond_xml2fs(child))
        elif tag == "nom":
            nom = ensure_utf8(child.attrib["name"])
        elif tag == "prop":
            prop = ensure_utf8(child.attrib["name"])

    return create_diamond(mode, nom, prop, nested_diamonds)

Beispiel #5

0

Datei anzeigen

Datei: hlds.py Projekt: BigData-Tools/pypolibox

def add_nom_prefixes(diamond):
    """
    Prefix the "nom" value of every C{Diamond} found while walking the given
    structure with a character and a running index (modifies the structure
    in place, returns nothing). Without this, I{ccg-realize} will only
    produce gibberish.

    Every <nom> tag has a 'name' attribute, which contains a
    category/type-like description of the corresponding <prop> tag's name
    attribute, e.g.::

        <diamond mode="PRÄP">
            <nom name="v1:zugehörigkeit"/>
            <prop name="von"/>
        </diamond>

    Here 'zugehörigkeit' is the name of a category that the preposition
    'von' belongs to. The prefix character is chosen by
    C{__determine_nom_prefix}; the index counts how often that character
    has been handed out so far, in the order in which C{diamond.walk()}
    yields the elements. In this example 'v1:zugehörigkeit' means that
    "von" is the first C{diamond} in the structure that got the prefix 'v'.

    @param diamond: an object offering C{walk()}; only elements whose type
        is exactly C{Diamond} and that have a "nom" key are changed
    """
    prefix_counts = defaultdict(int)

    # materialise the walk first, the elements are modified afterwards
    for node in list(diamond.walk()):
        if type(node) is not Diamond:
            continue
        if "nom" not in node.keys():
            continue

        prefix_char = __determine_nom_prefix(node)
        prefix_counts[prefix_char] += 1

        plain_nom = node["nom"]
        plain_nom_type = type(plain_nom)
        node["nom"] = "{0}{1}:{2}".format(ensure_utf8(prefix_char),
                                          prefix_counts[prefix_char],
                                          ensure_utf8(plain_nom))
        if plain_nom_type == unicode:
            # the nom was a unicode string before, so keep it one
            node["nom"] = ensure_unicode(node["nom"])

Beispiel #6

0

Datei anzeigen

Datei: hlds.py Projekt: winnerineast/pypolibox

def add_nom_prefixes(diamond):
    """
    Prefix the "nom" value of every ``Diamond`` found while walking the
    given structure with a character and a running index (modifies the
    structure in place, returns nothing). Without this, ``ccg-realize``
    will only produce gibberish.

    Every <nom> tag has a 'name' attribute, which contains a
    category/type-like description of the corresponding <prop> tag's name
    attribute, e.g.::

        <diamond mode="PRÄP">
            <nom name="v1:zugehörigkeit"/>
            <prop name="von"/>
        </diamond>

    Here 'zugehörigkeit' is the name of a category that the preposition
    'von' belongs to. The prefix character is chosen by
    ``__determine_nom_prefix``; the index counts how often that character
    has been handed out so far, in the order in which ``diamond.walk()``
    yields the elements. In this example 'v1:zugehörigkeit' means that
    "von" is the first ``diamond`` in the structure that got the prefix 'v'.

    :param diamond: an object offering ``walk()``; only elements whose type
        is exactly ``Diamond`` and that have a "nom" key are changed
    """
    counters = defaultdict(int)
    walked = list(diamond.walk())  # snapshot before anything is modified

    for item in walked:
        is_prefixable = type(item) is Diamond and "nom" in item.keys()
        if not is_prefixable:
            continue

        prefix = __determine_nom_prefix(item)
        counters[prefix] += 1

        bare_nom = item["nom"]
        bare_nom_type = type(bare_nom)
        item["nom"] = "{0}{1}:{2}".format(
            ensure_utf8(prefix), counters[prefix], ensure_utf8(bare_nom))
        if bare_nom_type == unicode:
            # the nom was a unicode string before, so keep it one
            item["nom"] = ensure_unicode(item["nom"])

Beispiel #7

0

Datei anzeigen

Datei: hlds.py Projekt: BigData-Tools/pypolibox

    def parse_sentence(self, sentence, single_sent=True):
        """
        Build a C{Sentence} from a parsed HLDS XML element.

        @param sentence: if C{single_sent} is true, the <satop> element
            itself; its root tree is searched for a <target> element that
            holds the sentence string. Otherwise an element (e.g.
            C{<item numOfParses="4" string="er beschreibt sie">}) with
            "string" and "numOfParses" attributes that contains the
            <satop> under the path "xml/lf/satop".
        @param single_sent: selects which of the two input layouts is used;
            evaluated by truthiness
        @return: a new C{Sentence} filled via C{create_sentence}
        """
        if single_sent:
            satop = sentence
            target_element = sentence.getroottree().find("target")

            # a missing <target> and a <target> without any text both
            # result in an empty sentence string
            sentence_string = ""
            if target_element is not None and target_element.text is not None:
                sentence_string = target_element.text

            expected_parses = 1
        else:
            item = sentence
            satop = item.find("xml/lf/satop") # root (verb) of the sentence

            # <item numOfParses="4" string="er beschreibt sie">
            sentence_string = ensure_utf8(item.attrib["string"])
            # NOTE(review): taken unconverted from the attribute, while the
            # single sentence branch uses the integer 1
            expected_parses = item.attrib["numOfParses"]

        # <satop nom="b1:handlung">
        #   <prop name="beschreiben"/>
        prop_element = satop.find("prop")
        # some HLDS satop structures don't have a <prop> tag
        root_prop = "" if prop_element is None else prop_element.attrib["name"]
        root_nom = satop.attrib["nom"]

        elements = [convert_diamond_xml2fs(element)
                    for element in satop.findall("diamond")]

        # separate name, so the ``sentence`` parameter is not rebound
        parsed_sentence = Sentence()
        parsed_sentence.create_sentence(sentence_string, expected_parses,
                                        root_nom, root_prop, elements)
        return parsed_sentence

Beispiel #8

0

Datei anzeigen

Datei: hlds.py Projekt: winnerineast/pypolibox

    def parse_sentence(self, sentence, single_sent=True):
        """
        Build a ``Sentence`` from a parsed HLDS XML element.

        :param sentence: if ``single_sent`` is true, the <satop> element
            itself; its root tree is searched for a <target> element that
            holds the sentence string. Otherwise an element (e.g.
            ``<item numOfParses="4" string="er beschreibt sie">``) with
            "string" and "numOfParses" attributes that contains the
            <satop> under the path "xml/lf/satop".
        :param single_sent: selects which of the two input layouts is used;
            evaluated by truthiness
        :return: a new ``Sentence`` filled via ``create_sentence``
        """
        if not single_sent:
            item = sentence
            satop = item.find("xml/lf/satop")  # root (verb) of the sentence

            # <item numOfParses="4" string="er beschreibt sie">
            sentence_string = ensure_utf8(item.attrib["string"])
            # NOTE(review): taken unconverted from the attribute, while the
            # single sentence branch uses the integer 1
            expected_parses = item.attrib["numOfParses"]
        else:
            satop = sentence
            root = sentence.getroottree()
            target_element = root.find("target")

            # a missing <target> and a <target> without any text both
            # result in an empty sentence string
            if target_element is None or target_element.text is None:
                sentence_string = ""
            else:
                sentence_string = target_element.text

            expected_parses = 1

        # <satop nom="b1:handlung">
        #   <prop name="beschreiben"/>
        root_prop = ""  # some HLDS satop structures don't have a <prop> tag
        prop_element = satop.find("prop")
        if prop_element is not None:
            root_prop = prop_element.attrib["name"]
        root_nom = satop.attrib["nom"]

        elements = [convert_diamond_xml2fs(element)
                    for element in satop.findall("diamond")]

        # separate name, so the ``sentence`` parameter is not rebound
        result = Sentence()
        result.create_sentence(sentence_string, expected_parses, root_nom,
                               root_prop, elements)
        return result

Beispiel #9

0

Datei anzeigen

Datei: hlds.py Projekt: BigData-Tools/pypolibox

def featstruct2avm(featstruct, mode="non-recursive"):
    """
    Convert an NLTK feature structure into an attribute-value matrix
    that can be printed with LaTeX's avm environment.

    The items are emitted in sorted order, one row per item. Nested
    C{Diamond} values are labelled with their own "mode" feature, other
    nested feature structures with their key. The "mode" and
    "expected_parses" features are not printed as rows, and "root_nom" /
    "root_prop" are printed as "nom" / "prop".

    @type featstruct: C{nltk.featstruct} or C{Diamond} or C{Sentence}
    @type mode: C{str}
    @param mode: with "non-recursive" (the default) "*" and "_" are escaped
        and the result is wrapped in C{\\begin{avm}} ... C{\\end{avm}}; the
        function calls itself with "recursive" for nested structures, which
        returns the bare bracketed matrix
    @rtype: C{str}
    """
    row_template = "{0} & {1} \\\\\n"
    # backslashes are written as "\\": "\[", "\]" and "\_" are not valid
    # escape sequences in a plain string literal
    parts = ["\\[ "]

    for key, val in sorted(featstruct.items()):

        if isinstance(val, Diamond): # handles nested Diamond structures
            diamond_key = val[Feature("mode")]
            diamond_val = featstruct2avm(val, mode="recursive")
            parts.append(row_template.format(ensure_utf8(diamond_key),
                                             ensure_utf8(diamond_val)))

        elif isinstance(val, nltk.FeatStruct):
            # every other subclass of FeatStruct incl. FeatStruct
            nested_featstruct = featstruct2avm(val, mode="recursive")
            parts.append(row_template.format(ensure_utf8(key),
                                             ensure_utf8(nested_featstruct)))

        else: # normal key, value pairs within a FeatStruct
            if key in (Feature("mode"), Feature("expected_parses")):
                continue # don't print "mode" or "expected_parses" keys
            elif key == Feature("root_nom"):
                key = Feature("nom")
            elif key == Feature("root_prop"):
                key = Feature("prop")

            parts.append("{0} & `{1}' \\\\\n".format(ensure_utf8(key),
                                                     ensure_utf8(val)))

    parts.append(" \\]\n")
    ret_str = "".join(parts)

    if mode == "non-recursive":
        # escape once at the outermost level, this covers all nested rows
        clean_ret_str = ret_str.replace("*", "$*$").replace("_", "\\_")
        ret_str = "{0}\n{1}{2}".format('\\begin{avm}', clean_ret_str,
                                       '\\end{avm}')
    return ret_str

Beispiel #10

0

Datei anzeigen

Datei: hlds.py Projekt: winnerineast/pypolibox

def featstruct2avm(featstruct, mode="non-recursive"):
    """
    Convert an NLTK feature structure into an attribute-value matrix
    that can be printed with LaTeX's avm environment.

    The items are emitted in sorted order, one row per item. Nested
    ``Diamond`` values are labelled with their own "mode" feature, other
    nested feature structures with their key. The "mode" and
    "expected_parses" features are not printed as rows, and "root_nom" /
    "root_prop" are printed as "nom" / "prop".

    :type featstruct: ``nltk.featstruct`` or ``Diamond`` or ``Sentence``
    :type mode: ``str``
    :param mode: with "non-recursive" (the default) "*" and "_" are escaped
        and the result is wrapped in an avm environment; the function calls
        itself with "recursive" for nested structures, which returns the
        bare bracketed matrix
    :rtype: ``str``
    """
    # backslashes are written as "\\": "\[", "\]" and "\_" are not valid
    # escape sequences in a plain string literal
    rows = ["\\[ "]

    for key, val in sorted(featstruct.items()):

        if isinstance(val, Diamond):  # handles nested Diamond structures
            diamond_key = val[Feature("mode")]
            diamond_val = featstruct2avm(val, mode="recursive")
            rows.append("{0} & {1} \\\\\n".format(ensure_utf8(diamond_key),
                                                  ensure_utf8(diamond_val)))

        elif isinstance(val, nltk.FeatStruct):
            # every other subclass of FeatStruct incl. FeatStruct
            nested_featstruct = featstruct2avm(val, mode="recursive")
            rows.append("{0} & {1} \\\\\n".format(
                ensure_utf8(key), ensure_utf8(nested_featstruct)))

        else:  # normal key, value pairs within a FeatStruct
            if key in (Feature("mode"), Feature("expected_parses")):
                continue  # don't print "mode" or "expected_parses" keys
            elif key == Feature("root_nom"):
                key = Feature("nom")
            elif key == Feature("root_prop"):
                key = Feature("prop")

            rows.append("{0} & `{1}' \\\\\n".format(ensure_utf8(key),
                                                    ensure_utf8(val)))

    rows.append(" \\]\n")
    ret_str = "".join(rows)

    if mode == "non-recursive":
        # escape once at the outermost level, this covers all nested rows
        clean_ret_str = ret_str.replace("*", "$*$").replace("_", "\\_")
        ret_str = "{0}\n{1}{2}".format('\\begin{avm}', clean_ret_str,
                                       '\\end{avm}')
    return ret_str