Code example #1
    def test_single_input_sequence_with_implicit_lengths(self):
        use_log_space = True
        use_start_and_end_states = False
        scores = np.array([[10.0, 13.0, 6.0, 4.0], [13.0, 12.0, 11.0, 10.0],
                           [13.0, 12.0, 11.0, 10.0]])
        # pyformat: disable
        # pylint: disable=bad-whitespace
        # pylint: disable=bad-continuation
        transition_weights = np.array(
            [[-1.0, 1.0, -2.0, 2.0], [3.0, -3.0, 4.0, -4.0],
             [5.0, 1.0, 10.0, 1.0], [-7.0, 7.0, -8.0, 8.0]],
            dtype=np.float32)

        # pyformat: enable
        # pylint: enable=bad-whitespace
        # pylint: enable=bad-continuation
        sequence, _ = viterbi_decode.decode(
            scores,
            transition_weights,
            use_log_space=use_log_space,
            use_start_and_end_states=use_start_and_end_states)

        # Test a single-item batch.
        multiple_input = np.array([scores], dtype=np.float32)

        single_sequence_op = tftext.viterbi_constrained_sequence(
            multiple_input,
            transition_weights=transition_weights,
            use_log_space=use_log_space,
            use_start_and_end_states=use_start_and_end_states)
        single_sequence_result = self.evaluate(single_sequence_op)
        self.assertRaggedEqual(single_sequence_result, [sequence])
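For reference, the same call can be made outside the test harness. A minimal standalone sketch, assuming tensorflow_text is imported as tftext to match the test above (score and weight values are copied from it); with no sequence_length argument, the full length of each batch item is decoded:

import numpy as np
import tensorflow_text as tftext

# One 3-step sequence over 4 states, batched as a single-item batch.
scores = np.array([[[10.0, 13.0, 6.0, 4.0],
                    [13.0, 12.0, 11.0, 10.0],
                    [13.0, 12.0, 11.0, 10.0]]], dtype=np.float32)
transition_weights = np.array([[-1.0, 1.0, -2.0, 2.0],
                               [3.0, -3.0, 4.0, -4.0],
                               [5.0, 1.0, 10.0, 1.0],
                               [-7.0, 7.0, -8.0, 8.0]], dtype=np.float32)
# No sequence_length is passed, so each item's full length is used
# (the "implicit lengths" case exercised by the test).
decoded = tftext.viterbi_constrained_sequence(
    scores,
    transition_weights=transition_weights,
    use_log_space=True,
    use_start_and_end_states=False)
print(decoded)  # A RaggedTensor with one decoded state sequence per batch item.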
Code example #2
    def test_sequence_in_log_space_with_start_end_states_multi_input(self):
        use_log_space = True
        use_start_and_end_states = True
        scores = np.array([[10.0, 12.0, 7.0, 4.0], [13.0, 12.0, 11.0, 10.0]])
        # pyformat: disable
        # pylint: disable=bad-whitespace
        # pylint: disable=bad-continuation
        transition_weights = np.array(
            [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
             [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
             [0.0, 1.0, 2.0, 3.0, 0.0]],
            dtype=np.float32)

        allowed_transitions = np.array([[True, True, True, True, True],
                                        [True, True, True, True, True],
                                        [True, False, True, False, False],
                                        [True, True, True, True, True],
                                        [True, False, True, True, True]])
        # pyformat: enable
        # pylint: enable=bad-whitespace
        # pylint: enable=bad-continuation
        sequence, _ = viterbi_decode.decode(
            scores,
            transition_weights,
            allowed_transitions,
            use_log_space=use_log_space,
            use_start_and_end_states=use_start_and_end_states)

        # Test a multi-item batch.
        multiple_input = np.array([scores, scores, scores], dtype=np.float32)

        multiple_sequence_op = tftext.viterbi_constrained_sequence(
            multiple_input, [2, 2, 2],
            allowed_transitions=allowed_transitions,
            transition_weights=transition_weights,
            use_log_space=use_log_space,
            use_start_and_end_states=use_start_and_end_states)
        multiple_sequence_result = self.evaluate(multiple_sequence_op)
        self.assertRaggedEqual(multiple_sequence_result,
                               [sequence, sequence, sequence])
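The [2, 2, 2] positional argument supplies an explicit length for each batch item, and with use_start_and_end_states=True over four real states the transition matrices gain an extra row and column (5x5) for the implicit start/end state. A minimal sketch of that shape contract, with placeholder weights and random scores chosen only for illustration:

import numpy as np
import tensorflow_text as tftext

num_states = 4
# With use_start_and_end_states=True both matrices are
# (num_states + 1) x (num_states + 1); the extra row/column models
# transitions out of the implicit start state and into the implicit end state.
transition_weights = np.zeros((num_states + 1, num_states + 1), np.float32)
allowed_transitions = np.ones((num_states + 1, num_states + 1), dtype=bool)
scores = np.random.uniform(size=(3, 5, num_states)).astype(np.float32)
# The second positional argument gives one length per batch item; steps
# beyond each length are treated as padding and ignored.
decoded = tftext.viterbi_constrained_sequence(
    scores, [2, 4, 5],
    allowed_transitions=allowed_transitions,
    transition_weights=transition_weights,
    use_log_space=True,
    use_start_and_end_states=True)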
Code example #3
    def test_sequence_in_exp_space_with_start_end_states_multi_input(self):
        use_log_space = False
        use_start_and_end_states = True
        scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0, 10.0]])
        # pyformat: disable
        # pylint: disable=bad-whitespace
        # pylint: disable=bad-continuation
        transition_weights = np.array(
            [[.1, .2, .3, .4, .1], [.5, .6, .7, .8, .1], [.9, 1, .15, 1, .1],
             [.25, .35, .45, .55, .5], [.1, .5, .1, .1, 1]],
            dtype=np.float32)

        allowed_transitions = np.array([[True, True, True, True, True],
                                        [True, True, True, True, True],
                                        [True, False, True, False, True],
                                        [True, True, True, True, True],
                                        [True, True, True, True, False]])
        # pyformat: enable
        # pylint: enable=bad-whitespace
        # pylint: enable=bad-continuation
        sequence, _ = viterbi_decode.decode(
            scores,
            transition_weights,
            allowed_transitions,
            use_log_space=use_log_space,
            use_start_and_end_states=use_start_and_end_states)

        # Test a multi-item batch.
        multiple_input = np.array([scores, scores, scores], dtype=np.float32)

        multiple_sequence_op = tftext.viterbi_constrained_sequence(
            multiple_input, [2, 2, 2],
            allowed_transitions=allowed_transitions,
            transition_weights=transition_weights,
            use_log_space=use_log_space,
            use_start_and_end_states=use_start_and_end_states)
        multiple_sequence_result = self.evaluate(multiple_sequence_op)
        self.assertRaggedEqual(multiple_sequence_result,
                               [sequence, sequence, sequence])
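With use_log_space=False the path score combines emission scores and transition weights by multiplication rather than addition, which is why this example uses small positive multipliers instead of the signed log-space weights above. A purely illustrative bit of arithmetic (not the library's internal code) contrasting the two modes for a single step:

# Scoring one step into a state with emission score 12.0, illustrative only.
emission_score = 12.0
log_space_weight = -3.0   # added when use_log_space=True
exp_space_weight = 0.6    # multiplied when use_log_space=False
log_space_step = emission_score + log_space_weight
exp_space_step = emission_score * exp_space_weight
print(log_space_step, exp_space_step)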
Code example #4
    def test_ragged_input_sequence(self):
        use_log_space = True
        use_start_and_end_states = False
        input_1 = np.array([[10.0, 13.0, 6.0, 4.0], [13.0, 12.0, 11.0, 10.0],
                            [13.0, 12.0, 11.0, 10.0]])
        input_2 = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0, 10.0]])
        # TODO(b/122968457): Extend RT support to lists-of-ndarrays.
        scores = tf.ragged.constant([input_1.tolist(), input_2.tolist()])
        # pyformat: disable
        # pylint: disable=bad-whitespace
        # pylint: disable=bad-continuation
        transition_weights = np.array(
            [[-1.0, 1.0, -2.0, 2.0], [3.0, -3.0, 4.0, -4.0],
             [5.0, 1.0, 10.0, 1.0], [-7.0, 7.0, -8.0, 8.0]],
            dtype=np.float32)

        # pyformat: enable
        # pylint: enable=bad-whitespace
        # pylint: enable=bad-continuation
        sequence_1, _ = viterbi_decode.decode(
            input_1,
            transition_weights,
            use_log_space=use_log_space,
            use_start_and_end_states=use_start_and_end_states)
        sequence_2, _ = viterbi_decode.decode(
            input_2,
            transition_weights,
            use_log_space=use_log_space,
            use_start_and_end_states=use_start_and_end_states)
        expected_sequence = tf.ragged.constant([sequence_1, sequence_2])

        # Test a ragged batch.
        single_sequence_op = tftext.viterbi_constrained_sequence(
            scores,
            transition_weights=transition_weights,
            use_log_space=use_log_space,
            use_start_and_end_states=use_start_and_end_states)
        single_sequence_result = self.evaluate(single_sequence_op)
        self.assertRaggedEqual(single_sequence_result, expected_sequence)
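A standalone sketch of the ragged-batch path, again assuming the tftext import alias: tf.ragged.constant accepts nested Python lists, so items with different numbers of steps can be batched without padding or explicit lengths:

import numpy as np
import tensorflow as tf
import tensorflow_text as tftext

transition_weights = np.array([[-1.0, 1.0, -2.0, 2.0],
                               [3.0, -3.0, 4.0, -4.0],
                               [5.0, 1.0, 10.0, 1.0],
                               [-7.0, 7.0, -8.0, 8.0]], dtype=np.float32)
# Two items with three and two steps respectively; no padding needed.
scores = tf.ragged.constant(
    [[[10.0, 13.0, 6.0, 4.0], [13.0, 12.0, 11.0, 10.0], [13.0, 12.0, 11.0, 10.0]],
     [[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0, 10.0]]])
decoded = tftext.viterbi_constrained_sequence(
    scores,
    transition_weights=transition_weights,
    use_log_space=True,
    use_start_and_end_states=False)
print(decoded)  # One decoded state sequence per item, of lengths 3 and 2.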
Code example #5
    def __call__(self, x):
        # Constrained sequence
        cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0,
                                                       10.0]])
        cs_input = np.array([cs_scores, cs_scores, cs_scores],
                            dtype=np.float32)
        cs_transition_weights = np.array(
            [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
             [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
             [0.0, 1.0, 2.0, 3.0, 0.0]],
            dtype=np.float32)
        cs_allowed_transitions = np.array([[True, True, True, True, True],
                                           [True, True, True, True, True],
                                           [True, False, True, False, False],
                                           [True, True, True, True, True],
                                           [True, False, True, True, True]])
        constrained_sequence = text.viterbi_constrained_sequence(
            cs_input, [2, 2, 2],
            allowed_transitions=cs_allowed_transitions,
            transition_weights=cs_transition_weights,
            use_log_space=True,
            use_start_and_end_states=True)
        # Max Spanning Tree
        mst_num_nodes = tf.constant([4, 3], tf.int32)
        mst_scores = tf.constant(
            [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
             [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
            tf.int32)  # pyformat: disable
        (max_spanning_tree,
         _) = text.max_spanning_tree(mst_num_nodes, mst_scores)
        # Normalize
        normalized = text.case_fold_utf8(['A String'])
        normalized = text.normalize_utf8(normalized)
        # Regex split
        regex_split = text.regex_split(input=['Yo dawg!'],
                                       delim_regex_pattern=r'\s')
        # Rouge-L
        rl_hypotheses = tf.ragged.constant(
            [['captain', 'of', 'the', 'delta', 'flight'],
             ['the', '1990', 'transcript']])
        rl_references = tf.ragged.constant(
            [['delta', 'air', 'lines', 'flight'],
             ['this', 'concludes', 'the', 'transcript']])
        (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)
        # Sentence breaking version 1 (token dependent)
        sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                         ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
        sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
        sb_token_starts = []
        sb_token_ends = []
        for sentence in sb_token_word:
            sentence_string = ''
            sentence_start = []
            sentence_end = []
            for word in sentence:
                # Track the offsets of each word within the space-joined
                # sentence string.
                sentence_start.append(len(sentence_string))
                sentence_string += word
                sentence_end.append(len(sentence_string))
                sentence_string += ' '
            sb_token_starts.append(sentence_start)
            sb_token_ends.append(sentence_end)
        sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
        sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
        sb_token_properties = tf.ragged.constant(sb_token_properties,
                                                 dtype=tf.int64)
        (sentence_breaking, _, _,
         _) = text.sentence_fragments(sb_token_word, sb_token_starts,
                                      sb_token_ends, sb_token_properties)
        # Sentence breaking version 2 (StateBasedSentenceBreaker)
        sbv2_text_input = [['Welcome to the U.S.! Harry'],
                           ['Wu Tang Clan; ain\'t nothing']]
        sentence_breaker_v2 = text.StateBasedSentenceBreaker()
        sbv2_fragment_text, _, _ = (
            sentence_breaker_v2.break_sentences_with_offsets(sbv2_text_input))
        # Sentencepiece tokenizer
        sp_model_file = (
            'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model'
        )
        with open(sp_model_file, 'rb') as model_file:
            sp_model = model_file.read()
        sp_tokenizer = text.SentencepieceTokenizer(sp_model)
        sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
        sentencepiece = sp_tokenizer.detokenize(sentencepiece)
        (sentencepiece, _,
         _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
        sentencepiece_size = sp_tokenizer.vocab_size()
        sentencepiece_id = sp_tokenizer.id_to_string(1)
        # Split merge tokenizer
        sm_tokenizer = text.SplitMergeTokenizer()
        split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                            [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])
        # Split merge from logits tokenizer
        smfl_tokenizer = text.SplitMergeFromLogitsTokenizer()
        split_merge_from_logits = smfl_tokenizer.tokenize(
            b'IloveFlume!',
            # One pair of logits for each Unicode character from the text.  Each
            # pair indicates a "split" action if the first component is greater than
            # the second one, and a "merge" otherwise.
            [
                [2.7, -0.3],  # I: split
                [4.1, 0.82],  # l: split
                [-2.3, 4.3],  # o: merge
                [3.1, 12.2],  # v: merge
                [-3.0, 4.7],  # e: merge
                [2.7, -0.7],  # F: split
                [0.7, 15.0],  # l: merge
                [1.6, 23.0],  # u: merge
                [2.1, 11.0],  # m: merge
                [0.0, 20.0],  # e: merge
                [18.0, 0.7],  # !: split
            ])
        # Confirm that the TF unicode_script op, which requires ICU, works
        tf_unicode_script = tf.strings.unicode_script(
            [ord('a'), 0x0411, 0x82b8, ord(',')])
        # Unicode script tokenizer
        us_tokenizer = text.UnicodeScriptTokenizer()
        unicode_script = us_tokenizer.tokenize(['a string'])
        # Whitespace tokenizer
        ws_tokenizer = text.WhitespaceTokenizer()
        whitespace = ws_tokenizer.tokenize(['a string'])
        # Wordpiece tokenizer
        wp_initializer = tf.lookup.KeyValueTensorInitializer(
            ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
        self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                        default_value=-1)
        wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
        wordpiece = wp_tokenizer.tokenize(['i am'])
        # Wordshape
        wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                                    text.WordShape.HAS_PUNCTUATION_DASH)

        # Assertion method
        def assert_check(tensor):
            return tf.assert_equal(tensor, tf.identity(tensor))

        # Assertions
        constrained_sequence_assert = assert_check(
            constrained_sequence.to_tensor())
        max_spanning_tree_assert = assert_check(max_spanning_tree)
        normalized_assert = assert_check(normalized)
        regex_split_assert = assert_check(regex_split.to_tensor())
        rouge_l_assert = assert_check(rouge_l)
        sentence_breaking_assert = assert_check(sentence_breaking.to_tensor())
        sentence_breaking_v2_assert = assert_check(
            sbv2_fragment_text.to_tensor())
        sentencepiece_assert = assert_check(sentencepiece.to_tensor())
        sentencepiece_id_assert = assert_check(sentencepiece_id)
        sentencepiece_size_assert = assert_check(sentencepiece_size)
        split_merge_assert = assert_check(split_merge)
        split_merge_from_logits_assert = assert_check(split_merge_from_logits)
        tf_unicode_script_assert = assert_check(tf_unicode_script)
        unicode_script_assert = assert_check(unicode_script.to_tensor())
        whitespace_assert = assert_check(whitespace.to_tensor())
        wordpiece_assert = assert_check(wordpiece.to_tensor())
        wordshapes_assert = assert_check(wordshapes)

        with tf.control_dependencies([
                constrained_sequence_assert, max_spanning_tree_assert,
                normalized_assert, regex_split_assert, rouge_l_assert,
                sentence_breaking_assert, sentence_breaking_v2_assert,
                sentencepiece_assert, sentencepiece_id_assert,
                sentencepiece_size_assert, split_merge_assert,
                split_merge_from_logits_assert, tf_unicode_script_assert,
                unicode_script_assert, whitespace_assert, wordpiece_assert,
                wordshapes_assert
        ]):
            y = tf.add(x, [1])
        return {'y': y}
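This __call__ reads like the forward pass of a module that is exported so every TF.Text custom op ends up in a SavedModel. A minimal sketch of how such a callable might be wrapped and saved; the class name and export path are hypothetical, only tf.Module, tf.saved_model.save, and tf.saved_model.load are the real APIs:

import tensorflow as tf
import tensorflow_text as text  # registers the custom ops used by the full body

class TextOpsModule(tf.Module):  # hypothetical wrapper name
    @tf.function(input_signature=[tf.TensorSpec([None], tf.int32)])
    def __call__(self, x):
        # The body above would go here; trimmed to its final step for brevity.
        y = tf.add(x, [1])
        return {'y': y}

module = TextOpsModule()
tf.saved_model.save(module, '/tmp/text_ops_model')    # export path is illustrative
reloaded = tf.saved_model.load('/tmp/text_ops_model')
print(reloaded(tf.constant([1, 2, 3], tf.int32)))      # {'y': [2, 3, 4]}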
Code example #6
File: save_models.py  Project: sts-sadr/text
    def __call__(self, x):
        # Constrained sequence
        cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0,
                                                       10.0]])
        cs_input = np.array([cs_scores, cs_scores, cs_scores],
                            dtype=np.float32)
        cs_transition_weights = np.array(
            [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
             [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
             [0.0, 1.0, 2.0, 3.0, 0.0]],
            dtype=np.float32)
        cs_allowed_transitions = np.array([[True, True, True, True, True],
                                           [True, True, True, True, True],
                                           [True, False, True, False, False],
                                           [True, True, True, True, True],
                                           [True, False, True, True, True]])
        constrained_sequence = text.viterbi_constrained_sequence(
            cs_input, [2, 2, 2],
            allowed_transitions=cs_allowed_transitions,
            transition_weights=cs_transition_weights,
            use_log_space=True,
            use_start_and_end_states=True)
        # Max Spanning Tree
        mst_num_nodes = tf.constant([4, 3], tf.int32)
        mst_scores = tf.constant(
            [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
             [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
            tf.int32)  # pyformat: disable
        (max_spanning_tree,
         _) = text.max_spanning_tree(mst_num_nodes, mst_scores)
        # Normalize
        normalized = text.case_fold_utf8(['A String'])
        normalized = text.normalize_utf8(normalized)
        # Regex split
        regex_split = text.regex_split(input=['Yo dawg!'],
                                       delim_regex_pattern=r'\s')
        # Rouge-L
        rl_hypotheses = tf.ragged.constant(
            [['captain', 'of', 'the', 'delta', 'flight'],
             ['the', '1990', 'transcript']])
        rl_references = tf.ragged.constant(
            [['delta', 'air', 'lines', 'flight'],
             ['this', 'concludes', 'the', 'transcript']])
        (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)
        # Sentence breaking
        sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                         ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
        sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
        sb_token_starts = []
        sb_token_ends = []
        for sentence in sb_token_word:
            sentence_string = ''
            sentence_start = []
            sentence_end = []
            for word in sentence:
                # Track the offsets of each word within the space-joined
                # sentence string.
                sentence_start.append(len(sentence_string))
                sentence_string += word
                sentence_end.append(len(sentence_string))
                sentence_string += ' '
            sb_token_starts.append(sentence_start)
            sb_token_ends.append(sentence_end)
        sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
        sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
        sb_token_properties = tf.ragged.constant(sb_token_properties,
                                                 dtype=tf.int64)
        (sentence_breaking, _, _,
         _) = text.sentence_fragments(sb_token_word, sb_token_starts,
                                      sb_token_ends, sb_token_properties)
        # Sentencepiece tokenizer
        sp_model_file = (
            'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model'
        )
        with open(sp_model_file, 'rb') as model_file:
            sp_model = model_file.read()
        sp_tokenizer = text.SentencepieceTokenizer(sp_model)
        sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
        sentencepiece = sp_tokenizer.detokenize(sentencepiece)
        (sentencepiece, _,
         _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
        sentencepiece_size = sp_tokenizer.vocab_size()
        sentencepiece_id = sp_tokenizer.id_to_string(1)
        # Split merge tokenizer (the from-logits variant is not in this version)
        sm_tokenizer = text.SplitMergeTokenizer()
        split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                            [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])
        # Unicode script tokenizer
        us_tokenizer = text.UnicodeScriptTokenizer()
        unicode_script = us_tokenizer.tokenize(['a string'])
        # Whitespace tokenizer
        ws_tokenizer = text.WhitespaceTokenizer()
        whitespace = ws_tokenizer.tokenize(['a string'])
        # Wordpiece tokenizer
        wp_initializer = tf.lookup.KeyValueTensorInitializer(
            ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
        self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                        default_value=-1)
        wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
        wordpiece = wp_tokenizer.tokenize(['i am'])
        # Wordshape
        wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                                    text.WordShape.HAS_PUNCTUATION_DASH)

        with tf.control_dependencies([
                constrained_sequence, max_spanning_tree, normalized,
                regex_split, rouge_l, sentence_breaking, sentencepiece,
                sentencepiece_id, sentencepiece_size, split_merge,
                unicode_script, whitespace, wordpiece, wordshapes
        ]):
            y = tf.add(x, [1])
        return {'y': y}
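The Wordpiece section above builds a single-entry vocabulary table just to exercise the op. A minimal standalone sketch with a slightly larger, purely illustrative vocabulary, whitespace-splitting first and then breaking each word into wordpieces:

import tensorflow as tf
import tensorflow_text as text

# Illustrative four-entry vocabulary; real models load a full wordpiece vocab.
vocab = ['i', 'am', 'fin', '##e']
initializer = tf.lookup.KeyValueTensorInitializer(
    vocab, list(range(len(vocab))), key_dtype=tf.string, value_dtype=tf.int64)
vocab_table = tf.lookup.StaticHashTable(initializer, default_value=-1)
wp_tokenizer = text.WordpieceTokenizer(vocab_table, token_out_type=tf.string)
words = text.WhitespaceTokenizer().tokenize(['i am fine'])
wordpieces = wp_tokenizer.tokenize(words)
print(wordpieces)  # [[['i'], ['am'], ['fin', '##e']]]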