Example #1
 def test_normalize_nfkc_ragged(self):
     txt = ragged_factory_ops.constant([[[u"\u1e9b\u0323 \ufb01"], []],
                                        [[u"\u1e9b\u0323", u"\ufb01"]]])
     expected = [[[u"ṩ fi".encode("utf-8")], []],
                 [[u"ṩ".encode("utf-8"), "fi"]]]
     self.assertRaggedEqual(expected, text.normalize_utf8(txt, "NFKC"))
     self.assertRaggedEqual(expected, text.normalize_utf8(txt, "nfkc"))
Example #2
 def test_normalize_nfd(self):
     txt = [u"\u1e9b\u0323"]
     expected = [
         u"\u017f\u0323\u0307".encode("utf-8"),
     ]
     self.assertAllEqual(expected, text.normalize_utf8(txt, "NFD"))
     self.assertAllEqual(expected, text.normalize_utf8(txt, "nfd"))
Example #3
 def test_normalize_nfkc(self):
     txt = [
         u"\u1e9b\u0323",
     ]
     expected = [
         u"ṩ".encode("utf-8"),
     ]
     self.assertAllEqual(expected, text.normalize_utf8(txt, "NFKC"))
     self.assertAllEqual(expected, text.normalize_utf8(txt, "nfkc"))
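The expected strings in Examples #2 and #3 are plain Unicode normalization results; as a sanity check (a sketch using only the standard library, not part of the original tests), the same values can be reproduced with unicodedata:

import unicodedata

# NFD decomposes U+1E9B U+0323 into LONG S + combining dot below + dot above.
assert unicodedata.normalize("NFD", u"\u1e9b\u0323") == u"\u017f\u0323\u0307"
# NFKC maps the long s to a plain "s" and recomposes everything to U+1E69 ("ṩ").
assert unicodedata.normalize("NFKC", u"\u1e9b\u0323") == u"\u1e69"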
Example #4
def tf_lower_and_split_punct(text):
  # Split accented characters.
  text = tf_text.normalize_utf8(text, 'NFKD')
  text = tf.strings.lower(text)
  # Keep space, a to z, and select punctuation.
  text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
  # Add spaces around punctuation.
  text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
  # Strip whitespace.
  text = tf.strings.strip(text)

  text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
  return text
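A minimal usage sketch (the sample sentence and the printed result are illustrative, assuming tensorflow and tensorflow_text are imported as tf and tf_text):

example = tf.constant(u'¿Todavía está en casa?')
print(tf_lower_and_split_punct(example).numpy())
# Roughly: b'[START] ¿ todavia esta en casa ? [END]' (with ¿ shown as UTF-8 bytes)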
Example #5
def basic_tokenize(text_input, lower_case=False, keep_whitespace=False):
    """Performs basic word tokenization for BERT.

    Args:
      text_input: A Tensor of untokenized strings.
      lower_case: A bool indicating whether or not to perform lowercasing.
        Default is False.
      keep_whitespace: A bool indicating whether or not whitespace tokens
        should be kept in the output. Default is False.
    """
    # case-fold to lowercase if the option is set
    if lower_case:
        text_input = tf_text.case_fold_utf8(text_input)

    # normalize by NFD
    text_input = tf_text.normalize_utf8(text_input, "NFD")

    # strip out control/format characters and combining marks (accents)
    text_input = tf.strings.regex_replace(text_input, r"\p{Cc}|\p{Cf}|\p{Mn}",
                                          "")

    # For Chinese and emoji characters, tokenize by Unicode codepoints
    script_tokenized = tf_text.unicode_script_tokenize(
        text_input, keep_whitespace=keep_whitespace, name="UTF-8")
    token_script_ids = tf.strings.unicode_script(
        tf.strings.unicode_decode(script_tokenized.flat_values, "UTF-8"))

    is_chinese = tf.equal(token_script_ids, _CHINESE_SCRIPT_ID)[:, :1].values
    is_emoji = tf_text.wordshape(script_tokenized.flat_values,
                                 tf_text.WordShape.HAS_EMOJI)
    is_punct = tf_text.wordshape(script_tokenized.flat_values,
                                 tf_text.WordShape.IS_PUNCT_OR_SYMBOL)
    split_cond = is_chinese | is_emoji | is_punct
    unicode_char_split = tf.strings.unicode_split(script_tokenized, "UTF-8")

    unicode_split_tokens = tf.where(split_cond,
                                    y=tf.expand_dims(
                                        script_tokenized.flat_values, 1),
                                    x=unicode_char_split.values)

    # Pack back into a [batch, (num_tokens), (num_unicode_chars)] RT
    chinese_mix_tokenized = tf.RaggedTensor.from_row_lengths(
        values=unicode_split_tokens,
        row_lengths=script_tokenized.row_lengths())

    # Squeeze out to a [batch, (num_tokens)] RT
    return collapse_dims(chinese_mix_tokenized)
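The function above relies on two names from its enclosing module, _CHINESE_SCRIPT_ID and collapse_dims. Rough stand-ins for experimentation could look like the sketch below (17 is the ICU script code for Han; both definitions are assumptions, not the original implementations):

_CHINESE_SCRIPT_ID = 17


def collapse_dims(rt):
    # Merge the two innermost ragged dimensions, so a token that was split
    # into characters contributes one output token per character:
    # [batch, (tokens), (chars)] -> [batch, (tokens)].
    return rt.merge_dims(-2, -1)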
Example #6
    def call(self, inputs: tf.Tensor):
        """Calls `text.SentencepieceTokenizer` on inputs.

        Args:
          inputs: A string Tensor of shape `(batch_size,)`.

        Returns:
          One or three RaggedTensors if `tokenize_with_offsets` is False or
          True, respectively. These are:
          tokens: A RaggedTensor of shape `[batch_size, (pieces)]` and type
            `int32`. `tokens[i,j]` contains the j-th piece in the i-th input.
          start_offsets, limit_offsets: If `tokenize_with_offsets` is True,
            RaggedTensors of type `int64` with the same indices as `tokens`.
            Element `[i,j]` contains the byte offset at the start, or past
            the end, respectively, for the j-th piece in the i-th input.
        """
        if self._strip_diacritics:
            if self.tokenize_with_offsets:
                raise ValueError(
                    "`tokenize_with_offsets` is not supported yet when "
                    "`strip_diacritics` is set to True (b/181866850).")
            inputs = text.normalize_utf8(inputs, "NFD")
            inputs = tf.strings.regex_replace(inputs, r"\p{Mn}", "")

        if self._lower_case:
            inputs = text.case_fold_utf8(inputs)

        # Prepare to reshape the result to work around broken shape inference.
        batch_size = tf.shape(inputs)[0]

        def _reshape(rt):
            values = rt.values
            row_splits = rt.row_splits
            row_splits = tf.reshape(row_splits, [batch_size + 1])
            return tf.RaggedTensor.from_row_splits(values, row_splits)

        # Call the tokenizer.
        if self.tokenize_with_offsets:
            tokens, start_offsets, limit_offsets = (
                self._tokenizer.tokenize_with_offsets(inputs))
            return _reshape(tokens), _reshape(start_offsets), _reshape(
                limit_offsets)
        else:
            tokens = self._tokenizer.tokenize(inputs)
            return _reshape(tokens)
Example #7
    def __call__(self, x):
        # Constrained sequence
        cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0,
                                                       10.0]])
        cs_input = np.array([cs_scores, cs_scores, cs_scores],
                            dtype=np.float32)
        cs_transition_weights = np.array(
            [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
             [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
             [0.0, 1.0, 2.0, 3.0, 0.0]],
            dtype=np.float32)
        cs_allowed_transitions = np.array([[True, True, True, True, True],
                                           [True, True, True, True, True],
                                           [True, False, True, False, False],
                                           [True, True, True, True, True],
                                           [True, False, True, True, True]])
        constrained_sequence = text.viterbi_constrained_sequence(
            cs_input, [2, 2, 2],
            allowed_transitions=cs_allowed_transitions,
            transition_weights=cs_transition_weights,
            use_log_space=True,
            use_start_and_end_states=True)
        # Max Spanning Tree
        mst_num_nodes = tf.constant([4, 3], tf.int32)
        mst_scores = tf.constant(
            [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
             [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
            tf.int32)  # pyformat: disable
        (max_spanning_tree,
         _) = text.max_spanning_tree(mst_num_nodes, mst_scores)
        # Normalize
        normalized = text.case_fold_utf8(['A String'])
        normalized = text.normalize_utf8(normalized)
        # Regex split
        regex_split = text.regex_split(input=['Yo dawg!'],
                                       delim_regex_pattern=r'\s')
        # Rouge-L
        rl_hypotheses = tf.ragged.constant(
            [['captain', 'of', 'the', 'delta', 'flight'],
             ['the', '1990', 'transcript']])
        rl_references = tf.ragged.constant(
            [['delta', 'air', 'lines', 'flight'],
             ['this', 'concludes', 'the', 'transcript']])
        (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)
        # Sentence breaking version 1 (token dependent)
        sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                         ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
        sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
        sb_token_starts = []
        sb_token_ends = []
        for sentence in sb_token_word:
            sentence_string = ''
            sentence_start = []
            sentence_end = []
            for word in sentence:
                sentence_start.append(len(sentence_string))
                sentence_string += word + ' '
                sentence_end.append(len(sentence_string))
            sb_token_starts.append(sentence_start)
            sb_token_ends.append(sentence_end)
        sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
        sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
        sb_token_properties = tf.ragged.constant(sb_token_properties,
                                                 dtype=tf.int64)
        (sentence_breaking, _, _,
         _) = text.sentence_fragments(sb_token_word, sb_token_starts,
                                      sb_token_ends, sb_token_properties)
        # Sentence breaking version 2 (StateBasedSentenceBreaker)
        sbv2_text_input = [['Welcome to the U.S.! Harry'],
                           ['Wu Tang Clan; ain\'t nothing']]
        sentence_breaker_v2 = text.StateBasedSentenceBreaker()
        sbv2_fragment_text, _, _ = (
            sentence_breaker_v2.break_sentences_with_offsets(sbv2_text_input))
        # Sentencepiece tokenizer
        sp_model_file = (
            'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model'
        )
        sp_model = open(sp_model_file, 'rb').read()
        sp_tokenizer = text.SentencepieceTokenizer(sp_model)
        sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
        sentencepiece = sp_tokenizer.detokenize(sentencepiece)
        (sentencepiece, _,
         _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
        sentencepiece_size = sp_tokenizer.vocab_size()
        sentencepiece_id = sp_tokenizer.id_to_string(1)
        # Split merge tokenizer
        sm_tokenizer = text.SplitMergeTokenizer()
        split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                            [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])
        # Split merge from logits tokenizer
        smfl_tokenizer = text.SplitMergeFromLogitsTokenizer()
        split_merge_from_logits = smfl_tokenizer.tokenize(
            b'IloveFlume!',
            # One pair of logits for each Unicode character from the text.  Each
            # pair indicates a "split" action if the first component is greater than
            # the second one, and a "merge" otherwise.
            [
                [2.7, -0.3],  # I: split
                [4.1, 0.82],  # l: split
                [-2.3, 4.3],  # o: merge
                [3.1, 12.2],  # v: merge
                [-3.0, 4.7],  # e: merge
                [2.7, -0.7],  # F: split
                [0.7, 15.0],  # l: merge
                [1.6, 23.0],  # u: merge
                [2.1, 11.0],  # m: merge
                [0.0, 20.0],  # e: merge
                [18.0, 0.7],  # !: split
            ])
        # Confirm TF unicode_script op that requires ICU works
        tf_unicode_script = tf.strings.unicode_script(
            [ord('a'), 0x0411, 0x82b8, ord(',')])
        # Unicode script tokenizer
        us_tokenizer = text.UnicodeScriptTokenizer()
        unicode_script = us_tokenizer.tokenize(['a string'])
        # Whitespace tokenizer
        ws_tokenizer = text.WhitespaceTokenizer()
        whitespace = ws_tokenizer.tokenize(['a string'])
        # Wordpiece tokenizer
        wp_initializer = tf.lookup.KeyValueTensorInitializer(
            ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
        self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                        default_value=-1)
        wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
        wordpiece = wp_tokenizer.tokenize(['i am'])
        # Wordshape
        wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                                    text.WordShape.HAS_PUNCTUATION_DASH)

        # Assertion method
        def assert_check(tensor):
            return tf.assert_equal(tensor, tf.identity(tensor))

        # Assertions
        constrained_sequence_assert = assert_check(
            constrained_sequence.to_tensor())
        max_spanning_tree_assert = assert_check(max_spanning_tree)
        normalized_assert = assert_check(normalized)
        regex_split_assert = assert_check(regex_split.to_tensor())
        rouge_l_assert = assert_check(rouge_l)
        sentence_breaking_assert = assert_check(sentence_breaking.to_tensor())
        sentence_breaking_v2_assert = assert_check(
            sbv2_fragment_text.to_tensor())
        sentencepiece_assert = assert_check(sentencepiece.to_tensor())
        sentencepiece_id_assert = assert_check(sentencepiece_id)
        sentencepiece_size_assert = assert_check(sentencepiece_size)
        split_merge_assert = assert_check(split_merge)
        split_merge_from_logits_assert = assert_check(split_merge_from_logits)
        tf_unicode_script_assert = assert_check(tf_unicode_script)
        unicode_script_assert = assert_check(unicode_script.to_tensor())
        whitespace_assert = assert_check(whitespace.to_tensor())
        wordpiece_assert = assert_check(wordpiece.to_tensor())
        wordshapes_assert = assert_check(wordshapes)

        with tf.control_dependencies([
                constrained_sequence_assert, max_spanning_tree_assert,
                normalized_assert, regex_split_assert, rouge_l_assert,
                sentence_breaking_assert, sentence_breaking_v2_assert,
                sentencepiece_assert, sentencepiece_id_assert,
                sentencepiece_size_assert, split_merge_assert,
                split_merge_from_logits_assert, tf_unicode_script_assert,
                unicode_script_assert, whitespace_assert, wordpiece_assert,
                wordshapes_assert
        ]):
            y = tf.add(x, [1])
        return {'y': y}
Example #8
 def _do_lower_case(t):
   t = tf_text.case_fold_utf8(t)
   t = tf_text.normalize_utf8(t, "NFD")
   t = tf.strings.regex_replace(t, r"\p{Mn}", "")
   return t
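A quick illustration of the effect (the input string is an arbitrary example): case folding lowercases the text, NFD splits off combining marks, and the \p{Mn} replacement removes them, so accents are stripped.

print(_do_lower_case(tf.constant(['Héllo World'])).numpy())
# Expected along the lines of: [b'hello world']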
Example #9
    def __call__(self, x):
        # Constrained sequence
        cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0,
                                                       10.0]])
        cs_input = np.array([cs_scores, cs_scores, cs_scores],
                            dtype=np.float32)
        cs_transition_weights = np.array(
            [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
             [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
             [0.0, 1.0, 2.0, 3.0, 0.0]],
            dtype=np.float32)
        cs_allowed_transitions = np.array([[True, True, True, True, True],
                                           [True, True, True, True, True],
                                           [True, False, True, False, False],
                                           [True, True, True, True, True],
                                           [True, False, True, True, True]])
        constrained_sequence = text.viterbi_constrained_sequence(
            cs_input, [2, 2, 2],
            allowed_transitions=cs_allowed_transitions,
            transition_weights=cs_transition_weights,
            use_log_space=True,
            use_start_and_end_states=True)
        # Max Spanning Tree
        mst_num_nodes = tf.constant([4, 3], tf.int32)
        mst_scores = tf.constant(
            [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
             [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
            tf.int32)  # pyformat: disable
        (max_spanning_tree,
         _) = text.max_spanning_tree(mst_num_nodes, mst_scores)
        # Normalize
        normalized = text.case_fold_utf8(['A String'])
        normalized = text.normalize_utf8(normalized)
        # Regex split
        regex_split = text.regex_split(input=['Yo dawg!'],
                                       delim_regex_pattern=r'\s')
        # Rouge-L
        rl_hypotheses = tf.ragged.constant(
            [['captain', 'of', 'the', 'delta', 'flight'],
             ['the', '1990', 'transcript']])
        rl_references = tf.ragged.constant(
            [['delta', 'air', 'lines', 'flight'],
             ['this', 'concludes', 'the', 'transcript']])
        (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)
        # Sentence breaking
        sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                         ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
        sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
        sb_token_starts = []
        sb_token_ends = []
        for sentence in sb_token_word:
            sentence_string = ''
            sentence_start = []
            sentence_end = []
            for word in sentence:
                sentence_start.append(len(sentence_string))
                sentence_string += word + ' '
                sentence_end.append(len(sentence_string))
            sb_token_starts.append(sentence_start)
            sb_token_ends.append(sentence_end)
        sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
        sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
        sb_token_properties = tf.ragged.constant(sb_token_properties,
                                                 dtype=tf.int64)
        (sentence_breaking, _, _,
         _) = text.sentence_fragments(sb_token_word, sb_token_starts,
                                      sb_token_ends, sb_token_properties)
        # Sentencepiece tokenizer
        sp_model_file = (
            'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model'
        )
        sp_model = open(sp_model_file, 'rb').read()
        sp_tokenizer = text.SentencepieceTokenizer(sp_model)
        sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
        sentencepiece = sp_tokenizer.detokenize(sentencepiece)
        (sentencepiece, _,
         _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
        sentencepiece_size = sp_tokenizer.vocab_size()
        sentencepiece_id = sp_tokenizer.id_to_string(1)
        # Split merge tokenizer - not in this version
        sm_tokenizer = text.SplitMergeTokenizer()
        split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                            [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])
        # Unicode script tokenizer
        us_tokenizer = text.UnicodeScriptTokenizer()
        unicode_script = us_tokenizer.tokenize(['a string'])
        # Whitespace tokenizer
        ws_tokenizer = text.WhitespaceTokenizer()
        whitespace = ws_tokenizer.tokenize(['a string'])
        # Wordpiece tokenizer
        wp_initializer = tf.lookup.KeyValueTensorInitializer(
            ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
        self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                        default_value=-1)
        wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
        wordpiece = wp_tokenizer.tokenize(['i am'])
        # Wordshape
        wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                                    text.WordShape.HAS_PUNCTUATION_DASH)

        with tf.control_dependencies([
                constrained_sequence, max_spanning_tree, normalized,
                regex_split, rouge_l, sentence_breaking, sentencepiece,
                sentencepiece_id, sentencepiece_size, split_merge,
                unicode_script, whitespace, wordpiece, wordshapes
        ]):
            y = tf.add(x, [1])
        return {'y': y}
Example #10
def make_model_input(x: tf.Tensor) -> tf.Tensor:
    x = text.normalize_utf8(x, "NFD")
    return tokenizer.tokenize(x)
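Here tokenizer comes from the surrounding scope of the original snippet. A hypothetical setup for trying the function out (any tensorflow_text tokenizer with a tokenize method would do; WhitespaceTokenizer is just a stand-in):

tokenizer = text.WhitespaceTokenizer()
print(make_model_input(tf.constant(['Héllo world'])).to_list())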
Example #11
 def test_unknown_normalization_form(self):
     with self.assertRaises(tf.errors.InvalidArgumentError):
         bomb = text.normalize_utf8(["cant readme", "wont read me"],
                                    "cantfindme")
         self.evaluate(bomb)
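normalize_utf8 accepts only the standard Unicode normalization forms, matched case-insensitively, so anything else raises InvalidArgumentError as tested above. A small check of the accepted forms:

for form in ('NFC', 'NFKC', 'NFD', 'NFKD'):
    text.normalize_utf8([u'\u1e9b\u0323'], form)  # all of these succeed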