Beispiel #1
0
    def tokenize_with_offsets(self, text_input):
        """Performs basic word tokenization for BERT, returning tokens and offsets.

        Normalizes the input (optional lower-casing / accent stripping or a
        configured Unicode normalization form), strips control characters,
        then splits on the delimiter regex.

        Args:
          text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.

        Returns:
          A tuple `(tokens, begin_offsets, end_offsets)` as produced by
          `regex_split_ops.regex_split_with_offsets`: the tokenized strings
          plus the byte offsets of each token within `text_input`.
        """
        # lowercase and strip accents (if option is set)
        if self._lower_case:
            text_input = case_fold_utf8(text_input)
            # NFD decomposition separates base characters from combining
            # marks so the accents can be removed below.
            text_input = normalize_utf8(text_input, "NFD")
            # \p{Mn} matches nonspacing combining marks (accents).
            text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
        else:
            # utf8 normalization
            if self._normalization_form is not None:
                text_input = normalize_utf8(text_input,
                                            self._normalization_form)

        # strip out control characters (\p{Cc}) and format characters (\p{Cf})
        text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}",
                                              " ")

        return regex_split_ops.regex_split_with_offsets(
            text_input, _DELIM_REGEX_PATTERN, self._keep_delim_regex_pattern,
            "BertBasicTokenizer")
Beispiel #2
0
    def testRegexSplitOp(self,
                         text_input,
                         delim_regex_pattern,
                         expected,
                         keep_delim_regex_pattern=r"",
                         descr="",
                         input_is_dense=False,
                         ragged_rank=None):
        """Checks regex_split_with_offsets against expected tokens and offsets."""
        # Build either a dense or a ragged tensor from the raw python input.
        if input_is_dense:
            source = constant_op.constant(text_input)
        else:
            source = ragged_factory_ops.constant(text_input,
                                                 ragged_rank=ragged_rank)

        tokens, begin, end = regex_split_ops.regex_split_with_offsets(
            input=source,
            delim_regex_pattern=delim_regex_pattern,
            keep_delim_regex_pattern=keep_delim_regex_pattern,
        )
        self.assertAllEqual(tokens, expected)

        # Cross-check the offsets: re-slice the input with (begin, length)
        # and verify the extracted substrings match the expected tokens.
        substrings = _ragged_substr(
            array_ops.expand_dims(source, -1), begin, end - begin)
        if substrings is not None:
            self.assertAllEqual(substrings, expected)
  def testRegexSplitOp(self,
                       text_input,
                       delim_regex_pattern,
                       expected,
                       keep_delim_regex_pattern=r""):
    """Checks regex_split_with_offsets tokens and offset round-tripping."""
    ragged_input = ragged_factory_ops.constant(text_input)
    tokens, begin, end = regex_split_ops.regex_split_with_offsets(
        input=ragged_input,
        delim_regex_pattern=delim_regex_pattern,
        keep_delim_regex_pattern=keep_delim_regex_pattern,
    )
    self.assertAllEqual(tokens, expected)

    # Cross-check: substrings recovered from the returned offsets must
    # reproduce the expected tokens.
    recovered = _ragged_substr(ragged_input, begin, end)
    self.assertAllEqual(recovered, expected)
Beispiel #4
0
 def break_sentences_with_offsets(self, input):  # pylint: disable=redefined-builtin
   """Splits `input` on the configured sentence-boundary regex.

   Returns whatever `regex_split_ops.regex_split_with_offsets` produces for
   `input` split by `self._new_sentence_regex` (sentence pieces with offsets).
   """
   return regex_split_ops.regex_split_with_offsets(
       input, self._new_sentence_regex)