def parse_doc_ptb(doc_id, doc_tkd_toks):
    """Dirty PTB parser"""
    # get PTB trees
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None

    # use tweaked tokens
    doc_tokens = doc_tkd_toks
    tokens_iter = iter(doc_tokens)

    trees = []
    lex_heads = []
    for tree in PTB_READER.parsed_sents(ptb_name):
        # apply standard cleaning to the tree:
        # remove empty nodes, then strip function tags
        tree_no_empty = prune_tree(tree, is_non_empty)
        tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                             strip_subcategory)
        # align the cleaned tree's leaves with the tweaked tokens
        leaves = tree_no_empty_no_gf.leaves()
        tslice = itertools.islice(tokens_iter, len(leaves))
        clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice)
        trees.append(clean_tree)

        # lexicalize the PTB tree: find the head word of each constituent;
        # constituents and their heads are designated by their Gorn address
        # ("tree position" in NLTK) in the tree
        lheads = find_lexical_heads(clean_tree)
        lex_heads.append(lheads)

    return trees  # , lex_heads
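
# Usage sketch (illustrative, not part of the original module): the
# `doc_tkd_toks` argument is assumed to be the span-aligned token list
# produced by `tokenize_doc_ptb` below, so a typical call chain is:
#
#     tokens = tokenize_doc_ptb(doc_id, doc_text)
#     if tokens is not None:
#         trees = parse_doc_ptb(doc_id, tokens)
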
def tokenize_doc_ptb(doc_id, doc_text):
    """Dirty PTB tokenizer"""
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None

    # get doc text
    # here we cheat and get it from the RST-DT tree
    # was: rst_text = doc.orig_rsttree.text()
    rst_text = doc_text
    tagged_tokens = PTB_READER.tagged_words(ptb_name)
    # drop empty-category tokens, then tweak the survivors
    # (enumeration indices still count the empty tokens)
    tweaked1, tweaked2 = itertools.tee(
        _tweak_token(ptb_name)(i, tok)
        for i, tok in enumerate(tagged_tokens)
        if not is_empty_category(tok[1]))
    spans = generic_token_spans(rst_text, tweaked1,
                                txtfn=lambda x: x.tweaked_word)
    # NB: izip is itertools.izip (Python 2); plain zip on Python 3
    result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

    return result
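
# Usage sketch (illustrative; exact token attributes depend on `_mk_token`,
# defined elsewhere in this module): each returned item pairs a tweaked
# PTB token with its character span in `doc_text`, e.g.
#
#     tokens = tokenize_doc_ptb(doc_id, doc_text)
#     if tokens is not None:
#         for tok in tokens[:3]:
#             print(tok)  # token text/tag plus (begin, end) offsets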