def sinica_parse(s):
    """
    Convert a Sinica Treebank string into a tree.

    Sinica trees are written as nested bracketings with the node label
    placed *before* the opening parenthesis and siblings separated by
    ``|``, for example (X stands for a Chinese character):

        S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY)

    The string is rewritten token by token into the usual
    ``(LABEL child ...)`` bracketing and then handed to ``Tree.fromstring``.

    :param s: The string to be converted
    :type s: str
    :return: A tree corresponding to the string representation.
    :rtype: Tree
    """
    # The capture group keeps the delimiters themselves in the token list.
    pieces = re.split(r"([()| ])", s)

    for idx, piece in enumerate(pieces):
        if piece == "(":
            # Swap with the preceding token so the nonterminal label ends
            # up inside the parentheses: "NP" "(" -> "(" "NP".
            pieces[idx - 1], pieces[idx] = piece, pieces[idx - 1]
        elif ":" in piece:
            parts = piece.split(":")
            if len(parts) == 2:
                # role:LABEL -> non-terminal, keep only the label
                pieces[idx] = parts[1]
            else:
                # role:POS:word -> preterminal "(POS word)"
                pieces[idx] = "({} {})".format(parts[-2], parts[-1])
        elif piece == "|":
            # Sibling separators carry no information once bracketed.
            pieces[idx] = ""

    return Tree.fromstring(" ".join(pieces), remove_empty_top_bracketing=True)
def test():
    """Run a few tree drawing smoke tests, printing the results to stdout."""
    from nltk.corpus import treebank

    def print_tree(n, tree, sentence=None, ansi=True, **xargs):
        """Print a caption, the bracketed tree, and its pretty-printed drawing."""
        words = sentence or tree.leaves()
        print()
        print('{}: "{}"'.format(n, " ".join(words)))
        print(tree)
        print()
        printer = TreePrettyPrinter(tree, sentence)
        try:
            rendered = printer.text(unicodelines=ansi, ansi=ansi, **xargs)
            print(rendered)
        except (UnicodeDecodeError, UnicodeEncodeError):
            # Fall back to plain ASCII output when the drawing cannot be
            # encoded or decoded.
            print(printer.text(unicodelines=False, ansi=False, **xargs))

    # A handful of Penn Treebank sentences, drawn with unicode/ANSI output.
    for n in (0, 1440, 1591, 2771, 2170):
        tree = treebank.parsed_sents()[n]
        print_tree(n, tree, nodedist=2, maxwidth=8)

    # The last treebank tree again, this time with the default (ASCII) settings.
    print()
    print("ASCII version:")
    ascii_printer = TreePrettyPrinter(tree)
    print(ascii_printer.text(nodedist=2))

    # A tree whose leaves are integer indices into a separate sentence.
    tree = Tree.fromstring(
        "(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) "
        "(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) "
        "(vg 10) (inf (verb 11)))))) (punct 12))",
        read_leaf=int,
    )
    sentence = "Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen .".split()
    print_tree("Discontinuous tree", tree, sentence, nodedist=2)
def demo():
    """
    A demonstration showing how each tree transform can be used.

    Parses a WSJ bracketed sentence, then applies in turn
    ``collapse_unary``, ``chomsky_normal_form`` (plain, and with parent
    annotation plus horizontal smoothing) and ``un_chomsky_normal_form``.
    The original text and the round-tripped bracketing are printed along
    with whether the two strings are equal, and all intermediate trees are
    drawn with ``draw_trees``.
    """
    from copy import deepcopy

    from nltk.draw.tree import draw_trees
    from nltk.tree.tree import Tree

    # original tree from WSJ bracketed text
    sentence = """(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))"""
    t = Tree.fromstring(sentence, remove_empty_top_bracketing=True)

    # collapse subtrees with only one child
    collapsedTree = deepcopy(t)
    collapse_unary(collapsedTree)

    # convert the tree to CNF
    cnfTree = deepcopy(collapsedTree)
    chomsky_normal_form(cnfTree)

    # convert the tree to CNF with parent annotation (one level) and
    # horizontal smoothing of order two
    parentTree = deepcopy(collapsedTree)
    chomsky_normal_form(parentTree, horzMarkov=2, vertMarkov=1)

    # convert the tree back to its original form (used to make CYK results
    # comparable)
    original = deepcopy(parentTree)
    un_chomsky_normal_form(original)

    # Convert the tree back to bracketed text. pformat() returns the string;
    # pprint() only prints it and returns None, which made the comparison
    # below meaningless.
    sentence2 = original.pformat()
    print(sentence)
    print(sentence2)
    print("Sentences the same? ", sentence == sentence2)

    draw_trees(t, collapsedTree, cnfTree, parentTree, original)
def un_chomsky_normal_form(tree, expandUnary=True, childChar="|", parentChar="^", unaryChar="+"):
    """
    Undo, in place, the label-encoded transformations on ``tree``.

    Three things are reversed while walking the tree:

    - a node whose label contains ``childChar`` is treated as artificial:
      it is removed and its first two children are spliced into its parent;
    - a label containing ``parentChar`` is cut just before that character,
      dropping the parent annotation;
    - when ``expandUnary`` is ``True``, a label containing ``unaryChar`` is
      split there into an outer node and a new inner node that takes over
      the children.

    :param tree: The tree to modify in place.
    :type tree: Tree
    :param expandUnary: Whether collapsed unary labels are expanded again.
    :type expandUnary: bool
    :param childChar: Marker identifying artificial nodes.
    :type childChar: str
    :param parentChar: Marker that introduces a parent annotation.
    :type parentChar: str
    :param unaryChar: Marker joining the labels of a collapsed unary chain.
    :type unaryChar: str
    """
    # Depth-first walk. Every agenda entry pairs a node with its parent so
    # that the parent's child list can be edited when the node is removed.
    agenda = [(tree, [])]
    while agenda:
        node, parent = agenda.pop()
        if not isinstance(node, Tree):
            continue

        if node.label().find(childChar) != -1:
            # Artificial node: drop it and hand its two children to the parent.
            position = parent.index(node)
            parent.remove(parent[position])
            if position == 0:
                # The artificial node was the leftmost child (left
                # factoring), so its children go at the front of the parent.
                parent.insert(0, node[0])
                parent.insert(1, node[1])
            else:
                parent.extend([node[0], node[1]])

            # Continue from the parent so that its (updated) children are
            # put on the agenda below.
            node = parent
        else:
            annotationStart = node.label().find(parentChar)
            if annotationStart != -1:
                # strip the parent annotation from the label
                node.set_label(node.label()[:annotationStart])

            # The explicit comparison with True is kept on purpose: only
            # values equal to True enable the expansion.
            if expandUnary == True:
                splitAt = node.label().find(unaryChar)
                if splitAt != -1:
                    innerNode = Tree(node.label()[splitAt + 1:], list(node))
                    node.set_label(node.label()[:splitAt])
                    node[0:] = [innerNode]

        agenda.extend((child, node) for child in node)
def __init__(self, node, children=None, **prob_kwargs):
    """
    Initialise the tree part and the probabilistic part of this object.

    :param node: Passed through to ``Tree.__init__`` as its first argument
        (presumably the node label; confirm against ``Tree``).
    :param children: Passed through to ``Tree.__init__``; defaults to None.
    :param prob_kwargs: Keyword arguments forwarded unchanged to
        ``ProbabilisticMixIn.__init__``.
    """
    # Both bases are initialised explicitly, each with its own arguments.
    Tree.__init__(self, node, children)
    ProbabilisticMixIn.__init__(self, **prob_kwargs)
def chomsky_normal_form(tree, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^"):
    """
    Binarise ``tree`` in place so that no node has more than two children.

    Nodes with more than two children are factored into a chain of
    artificial nodes labelled
    ``<original><childChar><sibling-labels><parent annotation>``.
    Optionally, non-root nodes whose first child is a subtree are annotated
    with the labels of their ancestors.

    Assumptions (not checked): all subtrees have homogeneous children, and
    terminals have no siblings, so every child of a node with more than two
    children must itself be a ``Tree``.

    :param tree: The tree to modify in place.
    :type tree: Tree
    :param factor: ``"right"`` for right factoring; any other value selects
        left factoring.
    :type factor: str
    :param horzMarkov: Maximum number of sibling labels recorded in each
        artificial node label. ``None`` (the default) records all of them,
        whatever the branching factor.
    :type horzMarkov: int or None
    :param vertMarkov: Number of ancestor labels used for parent
        annotation; ``0`` disables parent annotation.
    :type vertMarkov: int
    :param childChar: Separator placed between the original label and the
        sibling labels in artificial node labels.
    :type childChar: str
    :param parentChar: Separator placed before the parent annotation.
    :type parentChar: str
    """
    # Traverse the tree depth-first keeping a list of ancestor labels back
    # to the root. tree.treepositions() is deliberately avoided: it needs
    # two traversals and node access time proportional to node height.
    nodeList = [(tree, [tree.label()])]
    while nodeList:
        node, parent = nodeList.pop()
        if not isinstance(node, Tree):
            continue

        # parent annotation
        parentString = ""
        originalNode = node.label()
        # The root is identified by identity; it is never annotated.
        if vertMarkov != 0 and node is not tree and isinstance(node[0], Tree):
            parentString = "{}<{}>".format(parentChar, "-".join(parent))
            node.set_label(node.label() + parentString)
            parent = [originalNode] + parent[:vertMarkov - 1]

        # add children to the agenda before we mess with them
        for child in node:
            nodeList.append((child, parent))

        # chomsky normal form factorization
        if len(node) > 2:
            childNodes = [child.label() for child in node]
            nodeCopy = node.copy()
            node[0:] = []  # delete the children

            curNode = node
            numChildren = len(nodeCopy)
            # With no horizontal limit, use a window as wide as this node.
            # This replaces the former fixed limit of 999, which truncated
            # labels for nodes with more than 999 children.
            window = numChildren if horzMarkov is None else horzMarkov

            for i in range(1, numChildren - 1):
                if factor == "right":
                    siblings = childNodes[i:min(i + window, numChildren)]
                    newHead = "{}{}<{}>{}".format(
                        originalNode, childChar, "-".join(siblings), parentString
                    )  # create new head
                    newNode = Tree(newHead, [])
                    curNode[0:] = [nodeCopy.pop(0), newNode]
                else:
                    siblings = childNodes[max(numChildren - i - window, 0):-i]
                    newHead = "{}{}<{}>{}".format(
                        originalNode, childChar, "-".join(siblings), parentString
                    )
                    newNode = Tree(newHead, [])
                    curNode[0:] = [newNode, nodeCopy.pop()]

                curNode = newNode

            # The last artificial node receives the remaining two children.
            curNode[0:] = list(nodeCopy)