Code Example #1
File: tokenization.py  Project: ffaisal93/SD-QA
  def tokenize(self, text):
    """Tokenizes a piece of `text` and returns a list of `SubToken`s."""
    text = bert_tokenization.convert_to_unicode(text)

    # Begin with the entire input as a single string.
    subtokens = [SubToken(text, text, is_good=True)]
    del text  # unused after this point
    subtokens = self._clean_text(subtokens)

    # This was added on November 1st, 2018 for the multilingual and Chinese
    # models. This is also applied to the English models now, but it doesn't
    # matter since the English models were not trained on any Chinese data
    # and generally don't have any Chinese data in them (there are Chinese
    # characters in the vocabulary because Wikipedia does have some Chinese
    # words in the English Wikipedia).
    subtokens = self._tokenize_chinese_chars(subtokens)

    # Split on whitespace; punctuation is split in the loop below, which also
    # preserves special tokens.
    subtokens = whitespace_tokenize(subtokens)

    split_subtokens = []
    for subtoken, orig_subtoken, is_good in subtokens:
      assert subtoken == orig_subtoken

      if not is_good:
        split_subtokens.append(SubToken(subtoken, subtoken, is_good=False))
        continue

      if bert_tokenization.preserve_token(subtoken, self.vocab):
        split_subtokens.append(SubToken(subtoken, subtoken, is_good=True))
        continue

      split_subtokens.extend(
          self._run_split_on_punc([SubToken(subtoken, subtoken, is_good=True)]))
    return split_subtokens
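
For reference, here is a minimal self-contained sketch of the `SubToken` bookkeeping this example relies on. The field names `normalized` and `orig` are assumptions (only `is_good` appears by name above), and `whitespace_tokenize` below is a simplified stand-in for the project's helper, which presumably does additional bookkeeping (offsets, non-text spans).

  import collections

  # Assumed field names; only `is_good` is visible by name in the example above.
  SubToken = collections.namedtuple("SubToken", ["normalized", "orig", "is_good"])

  def whitespace_tokenize(subtokens):
    """Simplified stand-in: split each good SubToken on whitespace."""
    out = []
    for tok in subtokens:
      if not tok.is_good:
        # Pass non-text spans through unchanged.
        out.append(tok)
        continue
      for piece in tok.normalized.split():
        out.append(SubToken(piece, piece, is_good=True))
    return out

  text = "What is SD-QA ?"
  subtokens = [SubToken(text, text, is_good=True)]
  print(whitespace_tokenize(subtokens))
  # [SubToken(normalized='What', orig='What', is_good=True), ...]

Each stage of `tokenize` above consumes and produces such a list, so cleaning, Chinese-character splitting, whitespace splitting, and punctuation splitting can be chained without losing track of the original text.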
Code Example #2
File: tokenization.py  Project: ffaisal93/SD-QA
  def tokenize(self, text):
    """Tokenizes a piece of `text` and returns a list of `SubToken`s."""
    split_tokens = []  # list of `SubToken`s.
    for token, orig_token, is_good_token in self.basic_tokenizer.tokenize(text):
      if not is_good_token:
        split_tokens.append(SubToken(token, orig_token, is_good=False))
        continue

      # Preserve special tokens such as '[Q]' and '[SEP]'.
      if bert_tokenization.preserve_token(token, self.vocab):
        split_tokens.append(SubToken(token, orig_token, is_good=True))
        continue

      # For everything else, send the text-like tokens that have survived
      # whitespace and punctuation splitting through a wordpiece tokenizer.
      for sub_token in self.wordpiece_tokenizer.tokenize(
          [SubToken(token, orig_token, is_good_token)]):
        # `sub_token` has type `SubToken`.
        split_tokens.append(sub_token)

    return split_tokens
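
To make the wordpiece step concrete, here is a hedged sketch of a greedy longest-match-first wordpiece split over `SubToken`s. It illustrates the general algorithm only and is not the project's `wordpiece_tokenizer.tokenize`; the `SubToken` field names, the `##` continuation prefix, and the tiny vocabulary are assumptions for the example.

  import collections

  SubToken = collections.namedtuple("SubToken", ["normalized", "orig", "is_good"])

  def wordpiece_sketch(subtokens, vocab, unk="[UNK]"):
    """Greedy longest-match-first wordpiece split over SubTokens (illustration)."""
    out = []
    for tok in subtokens:
      if not tok.is_good:
        # Non-text spans are passed through, mirroring the loop above.
        out.append(tok)
        continue
      chars, start, pieces = tok.normalized, 0, []
      while start < len(chars):
        end, cur = len(chars), None
        while start < end:
          piece = chars[start:end]
          if start > 0:
            piece = "##" + piece  # mark continuation pieces
          if piece in vocab:
            cur = piece
            break
          end -= 1
        if cur is None:
          # No prefix matched: map the whole token to [UNK].
          pieces = [unk]
          break
        pieces.append(cur)
        start = end
      out.extend(SubToken(p, tok.orig, is_good=True) for p in pieces)
    return out

  vocab = {"question", "##s", "[UNK]"}
  print(wordpiece_sketch([SubToken("questions", "questions", True)], vocab))
  # [SubToken(normalized='question', ...), SubToken(normalized='##s', ...)]

The real tokenizer keeps `orig_token` on every piece for exactly the reason shown here: after wordpiece splitting, each sub-token can still be traced back to the span of original text it came from.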