def from_nersuite(cls, text):
    """Return Document given NERsuite format file."""
    sentences = []
    lines = []
    offset = 0
    for line in split_keep_separator(text):
        if not line:
            pass
        elif not line.isspace():
            lines.append(line)
        else:
            sentences.append(Sentence.from_nersuite(lines, offset))
            if sentences[-1].tokens:
                offset = sentences[-1].tokens[-1].end + 1    # guess
            lines = []
    if lines:
        sentences.append(Sentence.from_nersuite(lines, offset))
    # Add spaces for gaps implied by token positions but not
    # explicitly recorded in NERsuite format
    for s, next_s in pairwise(sentences):
        if s.tokens and next_s.tokens:
            gap = next_s.tokens[0].start - s.tokens[-1].end
            s.text = s.text + ' ' * gap
    # Assure document-final newline (text file)
    if sentences and not sentences[-1].text.endswith('\n'):
        sentences[-1].text = sentences[-1].text + '\n'
    text = ''.join(s.text for s in sentences)
    return cls(text, sentences)
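# split_keep_separator() and pairwise() are helpers defined elsewhere in
# the repository. The sketches below are assumptions inferred from how
# they are called in this module, not the actual implementations.

from itertools import tee, zip_longest


def split_keep_separator(text, separator='\n'):
    """Split text on separator, keeping the separator attached to the
    end of each preceding piece (assumed behavior)."""
    pieces = text.split(separator)
    # re-attach the separator to all pieces except the final remainder
    return [p + separator for p in pieces[:-1]] + pieces[-1:]


def pairwise(iterable, include_last=False):
    """Yield (item, next_item) pairs; with include_last=True, also yield
    the final item paired with None (assumed behavior)."""
    first, second = tee(iterable)
    next(second, None)
    if include_last:
        return zip_longest(first, second)    # final pair is (last, None)
    return zip(first, second)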
def retag_document(document, tagset):
    if tagset == IO_TAGSET:
        mapper = BIO_to_IO
    elif tagset == IOBES_TAGSET:
        mapper = BIO_to_IOBES
    else:
        raise ValueError('unknown tagset {}'.format(tagset))
    for sentence in document.sentences:
        for t, next_t in pairwise(sentence.tokens, include_last=True):
            next_tag = next_t.tag if next_t is not None else None
            t.tag = mapper(t.tag, next_tag)
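# IO_TAGSET, IOBES_TAGSET, BIO_to_IO and BIO_to_IOBES are defined
# elsewhere in the repository. A minimal sketch of the two mappers under
# the standard BIO/IO/IOBES conventions (tags 'B-TYPE', 'I-TYPE', 'O'),
# using the same (tag, next_tag) signature as above; the actual helpers
# may differ in detail:

def BIO_to_IO(tag, next_tag):
    """Map a BIO tag to IO: drop the begin/inside distinction."""
    if tag == 'O':
        return 'O'
    return 'I-' + tag.split('-', 1)[1]


def BIO_to_IOBES(tag, next_tag):
    """Map a BIO tag to IOBES, using the following token's tag to decide
    whether the current token ends its span."""
    if tag == 'O':
        return 'O'
    prefix, type_ = tag.split('-', 1)
    continues = next_tag == 'I-' + type_
    if prefix == 'B':
        return ('B-' if continues else 'S-') + type_
    else:    # prefix == 'I'
        return ('I-' if continues else 'E-') + type_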
def document_to_ascii(document):
    # traverse sentences, store sentence head (text before the
    # first token), then traverse tokens in pairs, storing the
    # tail of each token (space between the token and the next),
    # convert token text to ASCII using convert_u2a(), redo token
    # offsets, then redo the sentence text using the head, token
    # texts and tails, and finally redo the document text using
    # the sentences.
    read_u2a_data()
    document.unicode_text = document.text
    offset = 0
    for sentence in document.sentences:
        sentence.unicode_text = sentence.text
        sent_end = sentence.base_offset + len(sentence.text)
        ascii_base_offset = offset
        if sentence.tokens:
            t_start = sentence.tokens[0].start
            sent_head = document.text[sentence.base_offset:t_start]
        else:
            sent_head = sentence.text
        assert not sent_head or sent_head.isspace()
        offset += len(sent_head)
        for t, next_t in pairwise(sentence.tokens, include_last=True):
            t.unicode_text = t.text
            tail_end = next_t.start if next_t is not None else sent_end
            t.tail = document.text[t.end:tail_end]
            assert not t.tail or t.tail.isspace()
            t.text = convert_u2a(t.text)
            t.start, t.end = offset, offset + len(t.text)
            offset += len(t.text) + len(t.tail)
        sentence.base_offset = ascii_base_offset
        sentence.text = sent_head + ''.join(
            [t.text + t.tail for t in sentence.tokens])
        assert sentence.is_valid()
    document.text = ''.join(s.text for s in document.sentences)
    assert document.is_valid()
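# read_u2a_data() and convert_u2a() are assumed to load and apply a
# Unicode-to-ASCII replacement table; the mapping data itself is not
# shown here. As a rough, clearly hypothetical stand-in (not the
# repository's actual mapping), the conversion could be approximated
# with Unicode normalization plus a per-character fallback:

import unicodedata


def convert_u2a_fallback(text, replacement='?'):
    """Approximate Unicode-to-ASCII conversion: NFKD-decompose, drop
    combining marks, and replace anything still non-ASCII."""
    out = []
    for char in unicodedata.normalize('NFKD', text):
        if ord(char) < 128:
            out.append(char)
        elif not unicodedata.combining(char):
            out.append(replacement)
    return ''.join(out)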
def get_tagged(self, relative_offsets=False):
    """Return list of (type, start, end) based on Token tags.

    If relative_offsets is True, start and end offsets are relative to
    sentence beginning; otherwise, they are absolute offsets into the
    document text.
    """
    tagged = []
    first = None
    for t, next_t in pairwise(self.tokens, include_last=True):
        if is_start_tag(t.tag):
            first = t
        if first and not (next_t and is_continue_tag(next_t.tag)):
            tagged.append((first.tagged_type(), first.start, t.end))
            first = None
    if relative_offsets:
        tagged = [(t[0], t[1] - self.base_offset, t[2] - self.base_offset)
                  for t in tagged]
    return tagged
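# is_start_tag(), is_continue_tag() and parse_tag() are defined
# elsewhere in the repository; the following sketch reflects the assumed
# BIO/IOBES convention ('B-'/'S-' start a span, 'I-'/'E-' continue one,
# 'O' is untagged) and may differ from the actual helpers:

def parse_tag(tag):
    """Split e.g. 'B-Protein' into ('B', 'Protein'); 'O' gives ('O', None)."""
    if tag == 'O':
        return 'O', None
    prefix, _, type_ = tag.partition('-')
    return prefix, type_


def is_start_tag(tag):
    return parse_tag(tag)[0] in ('B', 'S')


def is_continue_tag(tag):
    return parse_tag(tag)[0] in ('I', 'E')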
def tagged_spans(tags):
    """Given a sequence of tags, return corresponding list of TypedSpan,
    i.e. (type, start, end) triples where start and end are token
    indices. The end index is that of the first token occurring after a
    tagged span, i.e. the spanned tokens are tokens[start:end].
    """
    # TODO: eliminate redundancy with Sentence.get_tagged() (DRY!)
    spans = []
    first = None
    index = 0
    for t, next_t in pairwise(tags, include_last=True):
        if is_start_tag(t):
            first = index
        if first is not None and not (next_t and is_continue_tag(next_t)):
            _, type_ = parse_tag(t)
            spans.append(TypedSpan(type_, first, index + 1))
            first = None
        index += 1
    return spans
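# Illustrative use of tagged_spans(), assuming TypedSpan is a
# (type, start, end) named tuple and the tag predicates follow the BIO
# convention sketched above:
#
#     >>> tagged_spans(['B-Protein', 'I-Protein', 'O', 'B-Gene'])
#     [TypedSpan(type='Protein', start=0, end=2),
#      TypedSpan(type='Gene', start=3, end=4)]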