def parse_doc_ptb(doc_id, doc_tkd_toks):
    """Dirty PTB parser"""
    # get PTB trees
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None

    # use tweaked tokens
    doc_tokens = doc_tkd_toks
    tokens_iter = iter(doc_tokens)

    trees = []
    lex_heads = []
    for tree in PTB_READER.parsed_sents(ptb_name):
        # apply standard cleaning to the tree:
        # strip function tags, remove empty nodes
        tree_no_empty = prune_tree(tree, is_non_empty)
        tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                             strip_subcategory)
        # align the leaves of the cleaned tree with the document tokens
        leaves = tree_no_empty_no_gf.leaves()
        tslice = itertools.islice(tokens_iter, len(leaves))
        clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice)
        trees.append(clean_tree)

        # lexicalize the PTB tree: find the head word of each constituent;
        # constituents and their heads are designated by their Gorn address
        # ("tree position" in NLTK) in the tree
        lheads = find_lexical_heads(clean_tree)
        lex_heads.append(lheads)

    return trees  # , lex_heads
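# Usage sketch (illustrative): how the helper above might be driven,
# assuming PTB_READER is an NLTK BracketParseCorpusReader over a local
# PTB install. The corpus root, file pattern, document id, and the
# align_tokens helper are hypothetical stand-ins, not names defined by
# this module.
from nltk.corpus.reader import BracketParseCorpusReader

PTB_READER = BracketParseCorpusReader(
    '/path/to/ptb/combined',  # hypothetical PTB root
    r'.*/wsj_.*\.mrg')        # parsed WSJ files

doc_tkd_toks = align_tokens('wsj_0601')  # hypothetical alignment step
trees = parse_doc_ptb('wsj_0601', doc_tkd_toks)
if trees is not None:
    print('%d sentence trees' % len(trees))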
def parse(self, doc): """Parse a document, using the gold PTB annotation. Given a document, return a list of educified PTB parse trees (one per sentence). These are almost the same as the trees that would be returned by the `parsed_sents` method, except that each leaf/node is associated with a span within the RST DT text. Note: does nothing if there is no associated PTB corpus entry. Parameters ---------- doc: DocumentPlus Rich representation of the document. Returns ------- doc: DocumentPlus Rich representation of the document, with syntactic constituency trees. """ # get PTB trees ptb_name = _guess_ptb_name(doc.key) if ptb_name is None: return doc # get tokens from tokenized document # FIXME alignment/reconstruction should never have to deal # with the left padding token in the first place doc_tokens = doc.tkd_tokens[1:] # skip left padding token tokens_iter = iter(doc_tokens) trees = [] lex_heads = [] for tree in self.reader.parsed_sents(ptb_name): # apply standard cleaning to tree # strip function tags, remove empty nodes tree_no_empty = prune_tree(tree, is_non_empty) tree_no_empty_no_gf = transform_tree(tree_no_empty, strip_subcategory) # leaves = tree_no_empty_no_gf.leaves() tslice = itertools.islice(tokens_iter, len(leaves)) clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice) trees.append(clean_tree) # lexicalize the PTB tree: find the head word of each constituent # constituents and their heads are designated by their Gorn address # ("tree position" in NLTK) in the tree lheads = find_lexical_heads(clean_tree) lex_heads.append(lheads) # store trees in doc doc.set_syn_ctrees(trees, lex_heads=lex_heads) return doc
def parse(self, doc): """ Given a document, return a list of educified PTB parse trees (one per sentence). These are almost the same as the trees that would be returned by the `parsed_sents` method, except that each leaf/node is associated with a span within the RST DT text. Note: does nothing if there is no associated PTB corpus entry. """ # get PTB trees ptb_name = _guess_ptb_name(doc.key) if ptb_name is None: return doc # get tokens from tokenized document # FIXME alignment/reconstruction should never have to deal # with the left padding token in the first place doc_tokens = doc.tkd_tokens[1:] # skip left padding token tokens_iter = iter(doc_tokens) trees = [] lex_heads = [] for tree in self.reader.parsed_sents(ptb_name): # apply standard cleaning to tree # strip function tags, remove empty nodes tree_no_empty = prune_tree(tree, is_non_empty) tree_no_empty_no_gf = transform_tree(tree_no_empty, strip_subcategory) # leaves = tree_no_empty_no_gf.leaves() tslice = itertools.islice(tokens_iter, len(leaves)) clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice) trees.append(clean_tree) # lexicalize the PTB tree: find the head word of each constituent # constituents and their heads are designated by their Gorn address # ("tree position" in NLTK) in the tree lheads = find_lexical_heads(clean_tree) lex_heads.append(lheads) # store trees in doc doc.tkd_trees.extend(trees) # store lexical heads in doc # TODO move to DocumentPlus doc.lex_heads = [] doc.lex_heads.append(None) # end TODO doc.lex_heads.extend(lex_heads) return doc