def _annotate_entities(self, text):
    """Annotate entity chunks found in `text` by NLTK's chunker.

    The text is tokenized with `self.nltk_tokenizer`, POS-tagged and passed
    through `nltk.ne_chunk`. Every chunk whose label is contained in
    `self.nltk_pos_types` becomes one annotation that spans from the first
    character of the chunk's first token to the last character of its last
    token.

    Args:
      text: String to annotate.

    Returns:
      A `data_utils.Sentence` built from `text` and the annotations sorted by
      `(begin, end)`, on which `strip_whitespaces()` has been called.
    """
    spans = list(self.nltk_tokenizer.span_tokenize(text))
    tokens = [text[begin:end] for begin, end in spans]
    annotations = []

    # Position in `spans` of the first token covered by the current node.
    token_index = 0
    for node in nltk.ne_chunk(nltk.pos_tag(tokens)):
        if not hasattr(node, 'label'):
            # Nodes without a label are counted as exactly one token.
            token_index += 1
            continue

        # A labelled node is a chunk; its length is the number of tokens it
        # groups (assumes the chunker yields list-like, un-nested chunks).
        next_index = token_index + len(node)
        if node.label() in self.nltk_pos_types:
            begin = spans[token_index][0]
            end = spans[next_index - 1][1]
            # Slice the original text instead of joining the tokens with
            # spaces: the two can differ, e.g. "Diminish'd" is tokenized
            # into "Diminish" and "'d".
            annotations.append(
                data_utils.Annotation(
                    begin=begin,
                    # Span ends are exclusive; the annotation is given the
                    # index of the last character instead.
                    end=end - 1,
                    text=text[begin:end],
                    label=1,
                    type=1))
        token_index = next_index

    annotations.sort(key=lambda a: (a.begin, a.end))
    sentence = data_utils.Sentence(text=text, annotations=annotations)
    sentence.strip_whitespaces()
    return sentence
def find_answer_annotations(text, answer_set):
    """Locate every occurrence of each answer inside `text`.

    Matching ignores case, and a space in an answer also matches a hyphen in
    the text, so the answer "TSR 2" is found in a text that only contains
    "TSR-2".

    Args:
      text: String to search in.
      answer_set: Iterable of answer strings.

    Returns:
      A sorted list with one `data_utils.Annotation` per match. `begin` is the
      offset of the first matched character, `end` the offset of the last
      matched character (inclusive), and `text` the matched substring.

    Raises:
      ValueError: If an empty or whitespace-only answer produces a match.
    """
    found = []
    for answer in answer_set:
        # The answer is escaped first because it may contain regex
        # metacharacters such as parentheses. Escaping turns a space into
        # '\\ ', which is then widened to the character class '[ -]' so that
        # a space in the answer matches either a space or a hyphen.
        pattern_source = re.escape(answer).replace('\\ ', '[ -]')
        pattern = re.compile(pattern_source, flags=re.IGNORECASE)
        for hit in pattern.finditer(text):
            stop = hit.end()
            if stop == 0 or not answer.strip():
                raise ValueError(
                    'Invalid answer string "%s" from answer set %s'
                    % (answer, answer_set))
            found.append(
                data_utils.Annotation(
                    begin=hit.start(),
                    end=stop - 1,
                    text=hit.group(0)))
    found.sort()
    return found