def _element_2_syntactic_node(cls, element):
    """Recursively convert an element into a SyntacticNode.

    Args:
        element: an object exposing ``tag``, an ``attrib`` mapping that
            contains at least ``"cat"``, and iteration over its child
            elements.
            NOTE(review): presumably an ``xml.etree.ElementTree.Element``
            -- confirm against the caller.

    Returns:
        A SyntacticNode. When ``element.tag`` is ``"lf"`` the result is a
        terminal node built from the ``cat``, ``pos``, ``lemma`` and
        ``word`` attributes. Otherwise it is a node labelled with ``cat``
        whose children are the converted child elements, in iteration
        order.
    """
    attributes = element.attrib
    category = attributes["cat"]

    if element.tag != "lf":
        # Non-terminal: convert every child element and attach it in order.
        parent = SyntacticNode(category)
        for sub_element in element:
            parent.add_child(cls._element_2_syntactic_node(sub_element))
        return parent

    # Terminal ("lf") element: the lexical information lives in attributes.
    return SyntacticNode(
        category,
        pos=attributes["pos"],
        lemma=attributes["lemma"],
        word=attributes["word"],
    )
def read_tree(cls, string):
    """Build a SyntacticTree from its bracketed string form.

    For a PSG tree the expected format resembles a Penn Treebank tree
    written on a single line, e.g.

        (S (NP (NNP Mary)) (VP (VBZ loves) (NP (NNP John))))

    Args:
        string: the textual representation of the tree.

    Returns:
        The SyntacticTree represented by the input string.

    Raises:
        ValueError: if the input is empty once surrounding whitespace is
            stripped, or if it contains fewer than two
            whitespace-separated tokens after the outer brackets are
            removed.
    """
    text = string.strip()
    if not text:
        raise ValueError("empty string cannot be a Synstactic Tree")

    # Peel off one pair of enclosing brackets, only when both are present.
    if text.startswith("(") and text.endswith(")"):
        text = text[1:-1]

    # Token count decides the shape:
    #   0 or 1 tokens -> not a tree at all
    #   2 tokens      -> a tree whose root is a terminal node
    #   3+ tokens     -> first token is the root label, the rest are branches
    tokens = text.split()
    # A separator-less split() yields no empty tokens; the second test is
    # retained purely for parity with the established check.
    if len(tokens) < 2 or tokens[1] == "":
        raise ValueError("%s cannot be a tree or subtree" % text)

    if len(tokens) > 2:
        label, remainder = tokens[0], tokens[1:]
        root = SyntacticNode(label)
        for subtree in cls._read_branches(" ".join(remainder)):
            root.add_child(subtree._root)
    else:
        # TODO: if the label comes from CCG parser, turn [] into ()
        root = cls._read_terminal_node(tokens)

    return SyntacticTree(root)