def _annotate_entities(self, text):
    """Tag named entities in `text` with NLTK and wrap them in a Sentence.

    The text is split with `self.nltk_tokenizer.span_tokenize`, the tokens
    are POS-tagged with `nltk.pos_tag` and chunked with `nltk.ne_chunk`.
    Every chunk whose label is in `self.nltk_pos_types` becomes one
    `data_utils.Annotation` with `label=1` and `type=1`.

    Args:
        text: The string to annotate.

    Returns:
        A `data_utils.Sentence` built from `text` and the annotations
        sorted by `(begin, end)`, after `strip_whitespaces()` has been
        called on it. Annotation offsets are character offsets into
        `text`; `end` is the index of the entity's last character
        (inclusive).
    """
    spans = list(self.nltk_tokenizer.span_tokenize(text))
    tokens = [text[b:e] for (b, e) in spans]
    annotations = []
    chunks = nltk.ne_chunk(nltk.pos_tag(tokens))
    # Index into `spans` of the first token covered by the current chunk.
    start_index = 0
    for chunk in chunks:
        if not hasattr(chunk, 'label'):
            # Not a labelled subtree: a single token outside any entity.
            start_index += 1
            continue
        # A labelled subtree covers one span per child token.
        end_index = start_index + len(chunk)
        if chunk.label() in self.nltk_pos_types:
            begin, _ = spans[start_index]
            _, end = spans[end_index - 1]
            # Slice the surface form out of the original text instead of
            # joining the tokens with spaces: the two can differ, e.g.
            # "Diminish'd" in the text versus "Diminish 'd" when joined.
            annotations.append(
                data_utils.Annotation(begin=begin,
                                      end=end - 1,
                                      text=text[begin:end],
                                      label=1,
                                      type=1))
        start_index = end_index
    annotations.sort(key=lambda a: (a.begin, a.end))
    sentence = data_utils.Sentence(text=text, annotations=annotations)
    sentence.strip_whitespaces()
    return sentence
def find_answer_annotations(text, answer_set):
    """Locate every occurrence of each answer in `text`.

    Matching is case-insensitive, and a space inside an answer also matches
    a hyphen in the text (the answer "TSR 2" is found in text that only
    contains "TSR-2").

    Args:
        text: The string to search.
        answer_set: An iterable of answer strings.

    Returns:
        A list of `data_utils.Annotation` objects, one per match, sorted
        using the annotations' own ordering. Each annotation's `end` is the
        index of the last matched character (inclusive).

    Raises:
        ValueError: If an empty or whitespace-only answer produces a match,
            or if a match ends at offset 0.
    """
    found = []
    for answer in answer_set:
        # Escape first, because an answer may contain regex metacharacters
        # such as parentheses. Escaping turns every space into '\\ ', which
        # is then widened to the class '[ -]' so that a space in the answer
        # matches either a space or a hyphen in the text.
        escaped = re.escape(answer)
        pattern = re.compile(escaped.replace('\\ ', '[ -]'),
                             flags=re.IGNORECASE)
        is_blank = not answer.strip()
        for hit in pattern.finditer(text):
            # The check runs per match, so a blank answer is only rejected
            # once it actually matches something in the text.
            if is_blank or hit.end() == 0:
                raise ValueError(
                    'Invalid answer string "%s" from answer set %s' %
                    (answer, str(answer_set)))
            found.append(
                data_utils.Annotation(begin=hit.start(),
                                      end=hit.end() - 1,
                                      text=hit.group(0)))
    found.sort()
    return found