Example #1
    def from_nersuite(cls, text):
        """Return Document given NERsuite format file."""

        sentences = []
        lines = []
        offset = 0
        for line in split_keep_separator(text):
            if not line:
                pass
            elif not line.isspace():
                lines.append(line)
            else:
                sentences.append(Sentence.from_nersuite(lines, offset))
                if sentences[-1].tokens:
                    offset = sentences[-1].tokens[-1].end + 1  # guess
                lines = []
        if lines:
            sentences.append(Sentence.from_nersuite(lines, offset))

        # Add spaces for gaps implied by token positions but not
        # explicitly recorded in the NERsuite format
        for s, next_s in pairwise(sentences):
            if s.tokens and next_s.tokens:
                gap = next_s.tokens[0].start - s.tokens[-1].end
                s.text = s.text + ' ' * gap

        # Ensure a document-final newline (text-file convention)
        if sentences and not sentences[-1].text.endswith('\n'):
            sentences[-1].text = sentences[-1].text + '\n'

        text = ''.join(s.text for s in sentences)

        return cls(text, sentences)
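All of these examples lean on a pairwise() helper, and this one also calls split_keep_separator(); neither is defined in the listing. The sketch below shows one plausible set of definitions inferred from the call sites (in particular, include_last=True pads the final pair with None); treat the names and exact behaviour as assumptions, not the project's actual implementation.

from itertools import tee, zip_longest

def pairwise(iterable, include_last=False):
    """Yield (item, next_item) pairs over consecutive elements.
    With include_last=True the final element is paired with None.
    Sketch inferred from the call sites above."""
    a, b = tee(iterable)
    next(b, None)
    if include_last:
        return zip_longest(a, b)   # final pair is (last_item, None)
    return zip(a, b)               # the last item only appears as a second element

def split_keep_separator(text, separator='\n'):
    """Split text on separator, keeping the separator attached to each
    piece so that character offsets can be reconstructed.  Also a sketch."""
    pieces = [p + separator for p in text.split(separator)]
    # split() yields one final piece after the last separator; strip the
    # separator we just appended to it, and drop it if it is empty
    pieces[-1] = pieces[-1][:-len(separator)]
    return [p for p in pieces if p]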
Example #2
def retag_document(document, tagset):
    if tagset == IO_TAGSET:
        mapper = BIO_to_IO
    elif tagset == IOBES_TAGSET:
        mapper = BIO_to_IOBES
    else:
        raise ValueError('tagset {}'.format(tagset))
    for sentence in document.sentences:
        for t, next_t in pairwise(sentence.tokens, include_last=True):
            next_tag = next_t.tag if next_t is not None else None
            t.tag = mapper(t.tag, next_tag)
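The mappers BIO_to_IO and BIO_to_IOBES are not shown. Under the usual 'B-TYPE'/'I-TYPE'/'O' convention, a BIO-to-IOBES mapping can be decided from the current tag and the tag of the following token, which is exactly the (t.tag, next_tag) pair the loop above supplies. The sketch below is one such mapping, written against that assumed tag format rather than the project's actual code.

def BIO_to_IOBES(tag, next_tag):
    """Map one BIO tag to IOBES, given the next token's tag (None at the
    end of the sentence).  Sketch assuming 'B-TYPE'/'I-TYPE'/'O' tags."""
    if tag == 'O':
        return 'O'
    prefix, type_ = tag.split('-', 1)
    # the entity continues if the next token carries I- with the same type
    continues = next_tag == 'I-' + type_
    if prefix == 'B':
        return ('B-' if continues else 'S-') + type_
    return ('I-' if continues else 'E-') + type_

# e.g. ['B-PER', 'I-PER', 'O', 'B-LOC'] retags to
#      ['B-PER', 'E-PER', 'O', 'S-LOC']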
Example #3
def document_to_ascii(document):
    # For each sentence: store the sentence head (the text before the
    # first token), then traverse the tokens in pairs, storing each
    # token's tail (the whitespace between it and the next token).
    # Convert token text to ASCII with convert_u2a(), recompute the
    # token offsets, rebuild the sentence text from the head, token
    # texts and tails, and finally rebuild the document text from the
    # sentences.

    read_u2a_data()

    document.unicode_text = document.text
    offset = 0
    for sentence in document.sentences:
        sentence.unicode_text = sentence.text
        sent_end = sentence.base_offset + len(sentence.text)
        ascii_base_offset = offset

        if sentence.tokens:
            t_start = sentence.tokens[0].start
            sent_head = document.text[sentence.base_offset:t_start]
        else:
            sent_head = sentence.text
        assert not sent_head or sent_head.isspace()

        offset += len(sent_head)

        for t, next_t in pairwise(sentence.tokens, include_last=True):
            t.unicode_text = t.text
            tail_end = next_t.start if next_t is not None else sent_end
            t.tail = document.text[t.end:tail_end]
            assert not t.tail or t.tail.isspace()
            t.text = convert_u2a(t.text)
            t.start, t.end = offset, offset + len(t.text)
            offset += len(t.text) + len(t.tail)

        sentence.base_offset = ascii_base_offset
        sentence.text = sent_head + ''.join([t.text + t.tail 
                                             for t in sentence.tokens])

        assert sentence.is_valid()

    document.text = ''.join(s.text for s in document.sentences)
    assert document.is_valid()
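read_u2a_data() and convert_u2a() are not part of the listing; they presumably load and apply a Unicode-to-ASCII replacement table. The sketch below only illustrates the kind of per-character mapping the offset bookkeeping above has to cope with (a replacement may be longer than the original character, which is why token offsets are recomputed); the table entries are illustrative, not the project's data.

# Illustrative replacement table; in the real code the mapping is
# loaded by read_u2a_data() rather than hard-coded.
_U2A = {
    '\u2013': '-',      # en dash
    '\u00e9': 'e',      # Latin small letter e with acute
    '\u00b5': 'u',      # micro sign
    '\u03b1': 'alpha',  # Greek alpha: replacement changes token length
}

def convert_u2a(text):
    """Replace non-ASCII characters via the table, passing ASCII and
    unmapped characters through unchanged.  Sketch only."""
    return ''.join(c if ord(c) < 128 else _U2A.get(c, c) for c in text)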
Example #4
    def get_tagged(self, relative_offsets=False):
        """Return list of (type, start, end) based on Token tags.

        If relative_offsets is True, start and end offsets are
        relative to sentence beginning; otherwise, they are absolute
        offsets into the document text.
        """

        tagged = []
        first = None
        for t, next_t in pairwise(self.tokens, include_last=True):
            if is_start_tag(t.tag):
                first = t
            if first and not (next_t and is_continue_tag(next_t.tag)):
                tagged.append((first.tagged_type(), first.start, t.end))
                first = None
        if relative_offsets:
            tagged = [(t[0], t[1] - self.base_offset, t[2] - self.base_offset)
                      for t in tagged]
        return tagged
Example #5
def tagged_spans(tags):
    """Given a sequence of tags, return corresponding list TypedSpan,
    i.e. (type, start, end) triples where start and end are token
    indices.

    The end index is that of the first token occurring after a tagged
    span, i.e. the spanned tokens are tokens[start:end].
    """

    # TODO: eliminate redundancy with Sentence.get_tagged() (DRY!)
    spans = []
    first = None
    index = 0
    for t, next_t in pairwise(tags, include_last=True):
        if is_start_tag(t):
            first = index
        if first is not None and not (next_t and is_continue_tag(next_t)):
            _, type_ = parse_tag(t)
            spans.append(TypedSpan(type_, first, index + 1))
            first = None
        index += 1
    return spans
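Both span extractors depend on is_start_tag(), is_continue_tag(), parse_tag() and TypedSpan, none of which appear in the listing. The sketch below gives one plausible set of definitions for plain BIO tags and shows what tagged_spans() would then return for a small tag sequence; the helper names come from the call sites, but their exact behaviour here is an assumption.

from collections import namedtuple

# Sketch of the helpers assumed by tagged_spans() and get_tagged(),
# written for plain BIO tags of the form 'B-TYPE', 'I-TYPE' and 'O'.
TypedSpan = namedtuple('TypedSpan', 'type start end')

def parse_tag(tag):
    """Split a tag into (prefix, type), e.g. 'B-PER' -> ('B', 'PER')."""
    if tag == 'O':
        return 'O', None
    return tuple(tag.split('-', 1))

def is_start_tag(tag):
    return tag.startswith('B-')

def is_continue_tag(tag):
    return tag is not None and tag.startswith('I-')

# With these definitions:
#   tagged_spans(['B-PER', 'I-PER', 'O', 'B-LOC'])
#   == [TypedSpan(type='PER', start=0, end=2),
#       TypedSpan(type='LOC', start=3, end=4)]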