Esempio n. 1
0
def sinica_parse(s):
    """
    Parse a Sinica Treebank string and return a tree.  Trees are represented as nested brackettings,
    as shown in the following example (X represents a Chinese character):
    S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY)

    :return: A tree corresponding to the string representation.
    :rtype: Tree
    :param s: The string to be converted
    :type s: str
    """
    tokens = re.split(r"([()| ])", s)
    for i in range(len(tokens)):
        if tokens[i] == "(":
            tokens[i - 1], tokens[i] = (
                tokens[i],
                tokens[i - 1],
            )  # pull nonterminal inside parens
        elif ":" in tokens[i]:
            fields = tokens[i].split(":")
            if len(fields) == 2:  # non-terminal
                tokens[i] = fields[1]
            else:
                tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")"
        elif tokens[i] == "|":
            tokens[i] = ""

    treebank_string = " ".join(tokens)
    return Tree.fromstring(treebank_string, remove_empty_top_bracketing=True)
Esempio n. 2
0
def test():
    """Do some tree drawing tests."""
    def print_tree(n, tree, sentence=None, ansi=True, **xargs):
        print()
        print('{}: "{}"'.format(n, " ".join(sentence or tree.leaves())))
        print(tree)
        print()
        drawtree = TreePrettyPrinter(tree, sentence)
        try:
            print(drawtree.text(unicodelines=ansi, ansi=ansi, **xargs))
        except (UnicodeDecodeError, UnicodeEncodeError):
            print(drawtree.text(unicodelines=False, ansi=False, **xargs))

    from nltk.corpus import treebank

    for n in [0, 1440, 1591, 2771, 2170]:
        tree = treebank.parsed_sents()[n]
        print_tree(n, tree, nodedist=2, maxwidth=8)
    print()
    print("ASCII version:")
    print(TreePrettyPrinter(tree).text(nodedist=2))

    tree = Tree.fromstring(
        "(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) "
        "(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) "
        "(vg 10) (inf (verb 11)))))) (punct 12))",
        read_leaf=int,
    )
    sentence = ("Ze had met haar moeder kunnen gaan winkelen ,"
                " zwemmen of terrassen .".split())
    print_tree("Discontinuous tree", tree, sentence, nodedist=2)
Esempio n. 3
0
def demo():
    """
    A demonstration showing how each tree transform can be used.
    """

    from copy import deepcopy

    from nltk.draw.tree import draw_trees
    from nltk.tree.tree import Tree

    # original tree from WSJ bracketed text
    sentence = """(TOP
  (S
    (S
      (VP
        (VBN Turned)
        (ADVP (RB loose))
        (PP
          (IN in)
          (NP
            (NP (NNP Shane) (NNP Longman) (POS 's))
            (NN trading)
            (NN room)))))
    (, ,)
    (NP (DT the) (NN yuppie) (NNS dealers))
    (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))
    (. .)))"""
    t = Tree.fromstring(sentence, remove_empty_top_bracketing=True)

    # collapse subtrees with only one child
    collapsedTree = deepcopy(t)
    collapse_unary(collapsedTree)

    # convert the tree to CNF
    cnfTree = deepcopy(collapsedTree)
    chomsky_normal_form(cnfTree)

    # convert the tree to CNF with parent annotation (one level) and horizontal smoothing of order two
    parentTree = deepcopy(collapsedTree)
    chomsky_normal_form(parentTree, horzMarkov=2, vertMarkov=1)

    # convert the tree back to its original form (used to make CYK results comparable)
    original = deepcopy(parentTree)
    un_chomsky_normal_form(original)

    # convert tree back to bracketed text
    sentence2 = original.pprint()
    print(sentence)
    print(sentence2)
    print("Sentences the same? ", sentence == sentence2)

    draw_trees(t, collapsedTree, cnfTree, parentTree, original)
Esempio n. 4
0
def un_chomsky_normal_form(tree,
                           expandUnary=True,
                           childChar="|",
                           parentChar="^",
                           unaryChar="+"):
    # Traverse the tree-depth first keeping a pointer to the parent for modification purposes.
    nodeList = [(tree, [])]
    while nodeList != []:
        node, parent = nodeList.pop()
        if isinstance(node, Tree):
            # if the node contains the 'childChar' character it means that
            # it is an artificial node and can be removed, although we still need
            # to move its children to its parent
            childIndex = node.label().find(childChar)
            if childIndex != -1:
                nodeIndex = parent.index(node)
                parent.remove(parent[nodeIndex])
                # Generated node was on the left if the nodeIndex is 0 which
                # means the grammar was left factored.  We must insert the children
                # at the beginning of the parent's children
                if nodeIndex == 0:
                    parent.insert(0, node[0])
                    parent.insert(1, node[1])
                else:
                    parent.extend([node[0], node[1]])

                # parent is now the current node so the children of parent will be added to the agenda
                node = parent
            else:
                parentIndex = node.label().find(parentChar)
                if parentIndex != -1:
                    # strip the node name of the parent annotation
                    node.set_label(node.label()[:parentIndex])

                # expand collapsed unary productions
                if expandUnary == True:
                    unaryIndex = node.label().find(unaryChar)
                    if unaryIndex != -1:
                        newNode = Tree(node.label()[unaryIndex + 1:],
                                       [i for i in node])
                        node.set_label(node.label()[:unaryIndex])
                        node[0:] = [newNode]

            for child in node:
                nodeList.append((child, node))
Esempio n. 5
0
 def __init__(self, node, children=None, **prob_kwargs):
     Tree.__init__(self, node, children)
     ProbabilisticMixIn.__init__(self, **prob_kwargs)
Esempio n. 6
0
def chomsky_normal_form(tree,
                        factor="right",
                        horzMarkov=None,
                        vertMarkov=0,
                        childChar="|",
                        parentChar="^"):
    # assume all subtrees have homogeneous children
    # assume all terminals have no siblings

    # A semi-hack to have elegant looking code below.  As a result,
    # any subtree with a branching factor greater than 999 will be incorrectly truncated.
    if horzMarkov is None:
        horzMarkov = 999

    # Traverse the tree depth-first keeping a list of ancestor nodes to the root.
    # I chose not to use the tree.treepositions() method since it requires
    # two traversals of the tree (one to get the positions, one to iterate
    # over them) and node access time is proportional to the height of the node.
    # This method is 7x faster which helps when parsing 40,000 sentences.

    nodeList = [(tree, [tree.label()])]
    while nodeList != []:
        node, parent = nodeList.pop()
        if isinstance(node, Tree):

            # parent annotation
            parentString = ""
            originalNode = node.label()
            if vertMarkov != 0 and node != tree and isinstance(node[0], Tree):
                parentString = "{}<{}>".format(parentChar, "-".join(parent))
                node.set_label(node.label() + parentString)
                parent = [originalNode] + parent[:vertMarkov - 1]

            # add children to the agenda before we mess with them
            for child in node:
                nodeList.append((child, parent))

            # chomsky normal form factorization
            if len(node) > 2:
                childNodes = [child.label() for child in node]
                nodeCopy = node.copy()
                node[0:] = []  # delete the children

                curNode = node
                numChildren = len(nodeCopy)
                for i in range(1, numChildren - 1):
                    if factor == "right":
                        newHead = "{}{}<{}>{}".format(
                            originalNode,
                            childChar,
                            "-".join(
                                childNodes[i:min([i +
                                                  horzMarkov, numChildren])]),
                            parentString,
                        )  # create new head
                        newNode = Tree(newHead, [])
                        curNode[0:] = [nodeCopy.pop(0), newNode]
                    else:
                        newHead = "{}{}<{}>{}".format(
                            originalNode,
                            childChar,
                            "-".join(childNodes[
                                max([numChildren - i - horzMarkov, 0]):-i]),
                            parentString,
                        )
                        newNode = Tree(newHead, [])
                        curNode[0:] = [newNode, nodeCopy.pop()]

                    curNode = newNode

                curNode[0:] = [child for child in nodeCopy]