def tokenize(self, doc):
    """Tokenize the document text using the PTB gold annotation.

    Return a tokenized document.
    """
    # locate the matching PTB entry; no entry means nothing to do
    ptb_name = _guess_ptb_name(doc.key)
    if ptb_name is None:
        return doc
    # get doc text
    # here we cheat and get it from the RST-DT tree
    # was: rst_text = doc.orig_rsttree.text()
    rst_text = doc.text
    # tweak the gold tokens, dropping empty categories as we go;
    # materialize the result so it can be traversed twice below
    tweak = _tweak_token(ptb_name)
    tweaked = [tweak(i, tok)
               for i, tok in enumerate(self.reader.tagged_words(ptb_name))
               if not is_empty_category(tok[1])]
    # align each tweaked token with its character span in the raw text
    spans = generic_token_spans(rst_text, tweaked,
                                txtfn=lambda x: x.tweaked_word)
    result = [_mk_token(tok, span) for tok, span in izip(tweaked, spans)]
    # store in doc
    doc.tkd_tokens.extend(result)
    return doc
def tokenize_doc_ptb(doc_id, doc_text):
    """Dirty PTB tokenizer"""
    # no corresponding PTB document => no tokens
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None
    # get doc text
    # here we cheat and get it from the RST-DT tree
    # was: rst_text = doc.orig_rsttree.text()
    rst_text = doc_text
    # tweak the gold tokens, filtering out empty categories; keep a
    # concrete list so the sequence can be consumed more than once
    tweak = _tweak_token(ptb_name)
    tweaked = [tweak(i, tok)
               for i, tok in enumerate(PTB_READER.tagged_words(ptb_name))
               if not is_empty_category(tok[1])]
    # compute a character span in the raw text for every token
    spans = generic_token_spans(rst_text, tweaked,
                                txtfn=lambda x: x.tweaked_word)
    return [_mk_token(tok, span) for tok, span in izip(tweaked, spans)]
def align(corpus, k, ptb):
    """Align PTB annotations to the corpus raw text.

    Return a generator of `Token` objects.

    Note: returns None if there is no associated PTB corpus entry.

    See also `parse_tree` (which calls this function internally)
    """
    # locate the matching PTB entry for this corpus key
    ptb_name = _guess_ptb_name(k)
    if ptb_name is None:
        return None
    # raw text of the RST-DT document for key k
    rst_text = corpus[k].text()
    tagged_tokens = ptb.tagged_words(ptb_name)
    # tee: one copy feeds span computation, the other builds the tokens
    # NOTE(review): unlike the tokenize variants in this file, this does
    # NOT filter out empty categories -- confirm that is intentional
    tweaked1, tweaked2 =\
        itertools.tee(_tweak_token(ptb_name)(i, tok)
                      for i, tok in enumerate(tagged_tokens))
    # character spans of each tweaked token within the raw text
    spans = generic_token_spans(rst_text, tweaked1,
                                txtfn=lambda x: x.tweaked_word)
    return (_mk_token(t, s) for t, s in izip(tweaked2, spans))
def tokenize(self, doc):
    """Tokenize the document text using the PTB gold annotation.

    Parameters
    ----------
    doc: DocumentPlus
        Rich representation of the document.

    Returns
    -------
    doc: DocumentPlus
        Rich representation of the document, with tokenization.
    """
    # fetch tokens from the PTB; give up early if the doc has no entry
    ptb_name = _guess_ptb_name(doc.key)
    if ptb_name is None:
        return doc
    # get doc text
    # here we cheat and get it from the RST-DT tree
    # was: rst_text = doc.orig_rsttree.text()
    rst_text = doc.text
    # tweak the gold tokens, skipping empty categories; a list (rather
    # than a teed iterator) lets us iterate the sequence twice
    tweak = _tweak_token(ptb_name)
    tweaked = [tweak(i, tok)
               for i, tok in enumerate(self.reader.tagged_words(ptb_name))
               if not is_empty_category(tok[1])]
    # map each tweaked token onto its character span in the raw text
    spans = generic_token_spans(rst_text, tweaked,
                                txtfn=lambda x: x.tweaked_word)
    result = [_mk_token(tok, span) for tok, span in izip(tweaked, spans)]
    # store in doc
    doc.set_tokens(result)
    return doc