Code example #1
from typing import Sequence, Tuple

# `tokenization` is assumed here to be the BERT-style tokenization module of
# the surrounding repository, which provides `FullTokenizer`.


def _improve_answer_span(
    doc_tokens: Sequence[str],
    unimproved_span: Tuple[int, int],
    orig_answer_text: str,
    tokenizer: tokenization.FullTokenizer,
) -> Tuple[int, int]:
  """Returns answer token spans that better match the annotated answer.

  This function is branched from the original BERT `run_squad.py` code.

  Usually, answer span annotations for question answering are character based.
  We first project them to whitespace-tokenized words (unigrams). But then,
  after WordPiece tokenization, we can often find a "better match". For
  example:

    Question: What year was John Smith born?
    Context: The leader was John Smith (1895-1943).
    Answer: 1895

  The original whitespace-tokenized answer will be "(1895-1943).". However,
  after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
  the exact answer, 1895. The purpose of this function is to find such a
  "better match".

  However, this is not always possible. Consider the following:

    Question: What country is the top exporter of electronics?
    Context: The Japanese electronics industry is the largest in the world.
    Answer: Japan

  In this case, the annotator chose "Japan" as a character sub-span of
  the word "Japanese". Since our WordPiece tokenizer does not split
  "Japanese", we just use "Japanese" as the annotation. This is expected to be
  fairly rare.

  Args:
    doc_tokens: Sequence of Text, the wordpiece tokenized tokens of the doc.
    unimproved_span: Tuple of two ints, the unimproved answer token span. In the
      first example, it is the token span covering "( 1895 - 1943 ) .".
    orig_answer_text: Text, the original answer text. In the first example, it
      is "1895".
    tokenizer: FullTokenizer, wordpiece tokenizer to tokenize the original
      answer text.

  Returns:
    Tuple of two ints, the improved answer token span. In the first example, it
    corresponds to the answer token span for "1895".
  """
  tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
  for new_begin in range(unimproved_span[0], unimproved_span[1] + 1):
    for new_end in range(unimproved_span[1], new_begin - 1, -1):
      text_span = " ".join(doc_tokens[new_begin:(new_end + 1)])
      if text_span == tok_answer_text:
        return new_begin, new_end

  return unimproved_span
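
A minimal usage sketch (not from the original repository): the stub below is
hypothetical and stands in for a real tokenization.FullTokenizer, implementing
only the tokenize() method that _improve_answer_span actually calls.

class _StubTokenizer:
  """Hypothetical stand-in for tokenization.FullTokenizer (sketch only)."""

  def tokenize(self, text):
    # In this example the answer "1895" is already a single wordpiece.
    return text.split()


# Wordpiece tokens of "The leader was John Smith (1895-1943)." (lower cased).
doc_tokens = ["the", "leader", "was", "john", "smith",
              "(", "1895", "-", "1943", ")", "."]
# The whitespace word "(1895-1943)." projects to wordpiece tokens 5..10.
improved_span = _improve_answer_span(
    doc_tokens, (5, 10), "1895", _StubTokenizer())
print(improved_span)  # (6, 6): exactly the token "1895".
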
Code example #2
from typing import List, Sequence, Tuple


def wordpiece_tokenize_with_indices(
    doc_unigrams: Sequence[str], tokenizer: tokenization.FullTokenizer
) -> Tuple[List[str], List[int], List[int]]:
  """Wordpiece tokenizes unigrams to tokens and returns indices mapping."""
  token_to_unigram_map = []  # wordpiece token index -> source unigram index
  unigram_to_token_map = []  # unigram index -> first wordpiece token index
  doc_tokens = []
  for (i, token) in enumerate(doc_unigrams):
    unigram_to_token_map.append(len(doc_tokens))
    sub_tokens = tokenizer.tokenize(token)
    token_to_unigram_map.extend([i] * len(sub_tokens))
    doc_tokens.extend(sub_tokens)
  return doc_tokens, unigram_to_token_map, token_to_unigram_map
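
A quick sketch of the two index maps this function returns, again using a
hypothetical stub in place of a real tokenization.FullTokenizer; the wordpiece
split shown is made up for illustration.

class _StubWordpieceTokenizer:
  """Hypothetical stand-in for tokenization.FullTokenizer (sketch only)."""

  _SPLITS = {"Franciscooo": ["Franc", "##isco", "##oo"]}

  def tokenize(self, word):
    return self._SPLITS.get(word, [word])


tokens, unigram_to_token, token_to_unigram = wordpiece_tokenize_with_indices(
    ["San", "Franciscooo"], _StubWordpieceTokenizer())
print(tokens)            # ['San', 'Franc', '##isco', '##oo']
print(unigram_to_token)  # [0, 1]: unigram i starts at this wordpiece index.
print(token_to_unigram)  # [0, 1, 1, 1]: wordpiece j came from this unigram.
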
Code example #3
import six

# `TokenizedText` and `sentencepiece_detokenize` are assumed to be defined
# elsewhere in the surrounding module.


def get_sentencepiece_tokenized_text(
    text: str, tokenizer: tokenization.FullTokenizer) -> TokenizedText:
  """Gets SentencePiece TokenizedText for a text with indices mapping."""
  tokens = [six.ensure_text(tk, "utf-8") for tk in tokenizer.tokenize(text)]
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  chars_to_tokens = []
  for i, token in enumerate(tokens):
    num_chars = len(token)
    if i == 0:
      # The leading SentencePiece marker ("▁") of the first token has no
      # corresponding character in the detokenized text.
      num_chars -= 1
    chars_to_tokens.extend([i] * num_chars)
  tokenized_text = TokenizedText()
  tokenized_text.text = sentencepiece_detokenize(tokens)
  tokenized_text.tokens = tokens
  tokenized_text.token_ids = token_ids
  tokenized_text.chars_to_tokens = chars_to_tokens
  return tokenized_text
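
To make the chars_to_tokens mapping above concrete, here is a small hand-worked
sketch (the pieces are made up) under the usual SentencePiece convention that
"▁" marks a word start and detokenization turns it into a space, stripping the
leading one.

# Hypothetical SentencePiece pieces for the text "San Franciscooo".
tokens = ["▁San", "▁Franc", "isc", "ooo"]
# Reproducing the mapping loop from get_sentencepiece_tokenized_text above:
chars_to_tokens = []
for i, token in enumerate(tokens):
  num_chars = len(token) - 1 if i == 0 else len(token)
  chars_to_tokens.extend([i] * num_chars)
print(chars_to_tokens)
# [0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3]
# The detokenized text is "San Franciscooo" (15 chars); e.g. index 3 (the
# space) and index 4 ("F") both map to token 1, "▁Franc".
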
Code example #4
import collections
from typing import List, Text, Tuple

# `_is_whitespace` and `tokenization.SPIECE_UNDERLINE` are assumed to be
# provided by the surrounding module.


def find_candidate_mentions(
    input_text: Text,
    candidate: Text,
    tokenizer: tokenization.FullTokenizer,
    offset: int = 0) -> Tuple[List[Text], List[Tuple[int, int]]]:
  """Finds the candidate string mentions in the sentence post tokenization.

  Args:
    input_text: The input for searching the candidate.
    candidate: The candidate to be searched for mentions in the input.
    tokenizer: The tokenizer to be used. For the BERT tokenizer, we assume an
      uncased vocab.
    offset: Offset to be added to all the span values.

  Returns:
    A tuple of (input_tokens_list, list_of_candidate_spans_in_the_list)

  Example:
  input = "Thisss is Saaan Franciscooo"
  candidate = "saan franciscooo"

  Let's say we are using the ALBERT tokenizer. Tokenizing the input would give:
  ['▁This', 'ss', '▁is', '▁Sa', 'aan', '▁Franc', 'isc', 'ooo']

  We return the tokens of the sentence.
  We also return [(3, 7)] representing the only span where the candidate
  occurs in the tokenized sentence. Note that the span is inclusive.
  """

  assert isinstance(tokenizer, tokenization.FullTokenizer)

  input_lower = input_text.lower()
  candidate_lower = candidate.lower()
  tokens = tokenizer.tokenize(input_text)
  if (not candidate_lower or not input_lower or
      candidate_lower not in input_lower):
    return (tokens, [])

  if tokenizer.sp_model is None:
    # BERT WordPiece tokenizer. We assume a tokenizer with a lower cased vocab
    # here and do a simple substring match of the candidate tokens to the
    # input text tokens.
    if not tokenizer.basic_tokenizer.do_lower_case:
      raise ValueError("BERT tokenizer should be lower cased.")
    candidate_tokens = tokenizer.tokenize(candidate.lower())
    candidate_len = len(candidate_tokens)
    candidate_spans = []
    for i in range(0, len(tokens)):
      if i + candidate_len <= len(tokens):
        if tokens[i:i + candidate_len] == candidate_tokens:
          candidate_spans.append((offset + i, offset + i + candidate_len - 1))
    return (tokens, candidate_spans)

  # SentencePiece (e.g. ALBERT) tokenizer: now that we know the candidate is
  # present in the input_text, we do a best effort character-to-token match.
  spiece_underline = tokenization.SPIECE_UNDERLINE.decode("utf-8")
  char_index_to_token_index = collections.OrderedDict()
  i = 0
  for (j, token) in enumerate(tokens):
    k = 0
    if token.startswith(spiece_underline):
      k += 1
    for c in token[k:]:
      c = c.lower()
      # Most chars in the tokens, other than the SentencePiece marker and any
      # special tokens, have a corresponding character in the input_text.
      while i < len(input_lower) and _is_whitespace(input_lower[i]):
        # Consume whitespace in the input_text; spaces, tabs etc. generally
        # don't appear in the tokens.
        char_index_to_token_index[i] = j
        i += 1
      if _is_whitespace(c):
        # This shouldn't generally happen - ALBERT tokenizer collapses
        # whitespaces.
        continue
      if i >= len(input_lower):
        # We have consumed the entire input_text; any remaining token
        # characters have nothing left to match.
        break
      if c != input_lower[i]:
        # Tokenizer probably has extra characters for this token.
        continue
      char_index_to_token_index[i] = j
      i += 1

  if i != len(input_text):
    # Our best effort matching chars to tokens failed. As a fallback, we will
    # just match the given candidate with the entire input_text and return.
    # Because we know that the candidate is already present in the input_text,
    # it's better to assign the candidate to the entire input_text (which in
    # our case is a sentence), rather than dropping it altogether.
    return (tokens, [(offset, offset + len(tokens) - 1)])

  # We now have matched every char in the input to its corresponding token
  # index successfully.
  candidate_spans = []
  cand_len = len(candidate_lower)
  # Using re.finditer for substring match can throw "nothing to repeat at
  # position 0" when the candidate starts with an unescaped regex
  # metacharacter, so we do a brute force substring match instead.
  for start in range(0, len(input_lower)):
    if (start + cand_len <= len(input_lower) and
        input_lower[start:start + cand_len] == candidate_lower):
      end = start + cand_len - 1
      assert start in char_index_to_token_index, (
          "no mapping found for index %d for candidate %s" %
          (start, candidate))
      assert end in char_index_to_token_index, (
          "no mapping found for index %d for candidate %s" %
          (end, candidate))
      token_span_start = char_index_to_token_index[start]
      token_span_end = char_index_to_token_index[end]
      candidate_spans.append(
          (offset + token_span_start, offset + token_span_end))

  return (tokens, candidate_spans)
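
A usage sketch of the docstring example, assuming `tokenizer` is an ALBERT-style
SentencePiece-backed tokenization.FullTokenizer constructed elsewhere
(constructor arguments are repository-specific, so they are not shown here).

tokens, spans = find_candidate_mentions(
    input_text="Thisss is Saaan Franciscooo",
    candidate="saaan franciscooo",
    tokenizer=tokenizer,  # assumed ALBERT-style FullTokenizer built elsewhere
    offset=0)
# Expected, mirroring the docstring example:
#   tokens ~ ['▁This', 'ss', '▁is', '▁Sa', 'aan', '▁Franc', 'isc', 'ooo']
#   spans  ~ [(3, 7)]  (inclusive token span of the candidate)

A non-zero offset simply shifts both ends of every returned span, which is
useful when the sentence's tokens are appended after tokens from earlier
sentences in a longer document.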