Example #1
    def tokenize(self, doc):
        """Tokenize the document text using the PTB gold annotation.

        Return a tokenized document.
        """
        # get tokens from PTB
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get doc text
        # here we cheat and get it from the RST-DT tree
        # was: rst_text = doc.orig_rsttree.text()
        rst_text = doc.text
        tagged_tokens = self.reader.tagged_words(ptb_name)
        # filter out empty categories, then tweak the remaining tokens
        # (note: `i` still indexes the full, unfiltered token stream)
        tweaked1, tweaked2 = itertools.tee(
            _tweak_token(ptb_name)(i, tok)
            for i, tok in enumerate(tagged_tokens)
            if not is_empty_category(tok[1]))
        spans = generic_token_spans(rst_text, tweaked1,
                                    txtfn=lambda x: x.tweaked_word)
        result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

        # store in doc
        doc.tkd_tokens.extend(result)

        return doc
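
This example (and those below) hinges on `itertools.tee`: the lazy stream of tweaked tokens is split into two copies, one feeding `generic_token_spans` to compute character offsets against the raw text, the other zipped back with those offsets. The snippets are excerpts and assume Python 2 (`itertools.izip`) plus educe's module-level helpers (`_guess_ptb_name`, `_tweak_token`, `_mk_token`, `is_empty_category`, `generic_token_spans`). Below is a minimal, self-contained sketch of the alignment idea; `find_spans` is a hypothetical stand-in for `generic_token_spans`:

import itertools

def find_spans(text, tokens, txtfn=lambda tok: tok):
    """Greedy left-to-right alignment: locate each token's surface
    form in `text` and yield its (start, end) character span."""
    pos = 0
    for tok in tokens:
        word = txtfn(tok)
        start = text.index(word, pos)  # ValueError if alignment fails
        pos = start + len(word)
        yield (start, pos)

text = "Mr. Smith bought 5 shares."
tagged = [("Mr.", "NNP"), ("Smith", "NNP"), ("bought", "VBD"),
          ("5", "CD"), ("shares", "NNS"), (".", ".")]
# tee the token stream: one copy drives the span computation, the
# other is zipped back with the resulting spans (cf. tweaked1/tweaked2)
toks1, toks2 = itertools.tee(tagged)
spans = find_spans(text, toks1, txtfn=lambda tok: tok[0])
aligned = [(word, tag, span) for (word, tag), span in zip(toks2, spans)]
# [('Mr.', 'NNP', (0, 3)), ('Smith', 'NNP', (4, 9)), ...]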
Example #2
def tokenize_doc_ptb(doc_id, doc_text):
    """Dirty PTB tokenizer"""
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None

    # get doc text
    # here we cheat and get it from the RST-DT tree
    # was: rst_text = doc.orig_rsttree.text()
    rst_text = doc_text
    tagged_tokens = PTB_READER.tagged_words(ptb_name)
    # filter out empty categories, then tweak the remaining tokens
    # (note: `i` still indexes the full, unfiltered token stream)
    tweaked1, tweaked2 = itertools.tee(
        _tweak_token(ptb_name)(i, tok)
        for i, tok in enumerate(tagged_tokens)
        if not is_empty_category(tok[1]))
    spans = generic_token_spans(rst_text, tweaked1,
                                txtfn=lambda x: x.tweaked_word)
    result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]
    return result
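
The `is_empty_category` filter is what makes the alignment possible: PTB trees contain empty categories (traces, null complementizers and the like) tagged `-NONE-`, which have no surface string in the raw document text and would otherwise derail the span search. A minimal sketch of that filtering step, assuming NLTK-style `(word, tag)` pairs (the real educe helper may differ in detail):

def is_empty_category(tag):
    """True for PTB empty categories, which are tagged -NONE-
    and never appear in the raw document text."""
    return tag == "-NONE-"

tagged = [("Smith", "NNP"), ("*T*-1", "-NONE-"), ("bought", "VBD")]
visible = [(word, tag) for word, tag in tagged
           if not is_empty_category(tag)]
# [('Smith', 'NNP'), ('bought', 'VBD')]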
Example #3
def align(corpus, k, ptb):
    """
    Align PTB annotations to the corpus raw text.
    Return a generator of `Token` objects

    Note: returns None if there is no associated PTB corpus entry.

    See also `parse_tree` (which calls this function internall)
    """
    ptb_name = _guess_ptb_name(k)
    if ptb_name is None:
        return None
    rst_text = corpus[k].text()
    tagged_tokens = ptb.tagged_words(ptb_name)
    tweaked1, tweaked2 = itertools.tee(
        _tweak_token(ptb_name)(i, tok)
        for i, tok in enumerate(tagged_tokens))
    spans = generic_token_spans(rst_text, tweaked1,
                                txtfn=lambda x: x.tweaked_word)
    return (_mk_token(t, s) for t, s in izip(tweaked2, spans))
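
Note the contract here: `align` returns `None` when the document has no matching PTB entry, and otherwise a lazy generator of `Token` objects, so callers must check for `None` before iterating and should materialize the result (e.g. `list(align(corpus, k, ptb))`) if they need to traverse it more than once. Unlike the `tokenize` variants above and below, this version does not filter out empty categories before aligning.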
Example #4
    def tokenize(self, doc):
        """Tokenize the document text using the PTB gold annotation.

        Parameters
        ----------
        doc: DocumentPlus
            Rich representation of the document.

        Returns
        -------
        doc: DocumentPlus
            Rich representation of the document, with tokenization.
        """
        # get tokens from PTB
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get doc text
        # here we cheat and get it from the RST-DT tree
        # was: rst_text = doc.orig_rsttree.text()
        rst_text = doc.text
        tagged_tokens = self.reader.tagged_words(ptb_name)
        # filter out empty categories, then tweak the remaining tokens
        # (note: `i` still indexes the full, unfiltered token stream)
        tweaked1, tweaked2 = itertools.tee(
            _tweak_token(ptb_name)(i, tok)
            for i, tok in enumerate(tagged_tokens)
            if not is_empty_category(tok[1]))
        spans = generic_token_spans(rst_text,
                                    tweaked1,
                                    txtfn=lambda x: x.tweaked_word)
        result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

        # store in doc
        doc.set_tokens(result)

        return doc