def test_width_3_ragged_tensor_equivalence(self, test_case):
    """Compare the TFLite width-3 ngrams model with `tf_text.ngrams` on ragged input.

    `test_case` is turned into a ragged tensor, run through `tf_text.ngrams`
    with STRING_JOIN, and through a model from `self._make_model` (built with
    `ragged_tensor=True, flex=False`) executed by an interpreter that registers
    'AddNgramsCustomOp'. Flat values and every level of row splits must match.
    """
    ragged_input = tf.ragged.constant(test_case)
    expected = tf_text.ngrams(
        ragged_input, 3, reduction_type=tf_text.Reduction.STRING_JOIN)
    rank = ragged_input.shape.rank
    split_levels = range(rank - 1)

    tflite_model = self._make_model(rank, 3, ragged_tensor=True, flex=False)
    interpreter = interpreter_wrapper.InterpreterWithCustomOps(
        model_content=tflite_model,
        custom_op_registerers=['AddNgramsCustomOp'])

    # Input 0 receives the flat values; input `level + 1` receives the row
    # splits of that nesting level.
    # NOTE(review): resize_tensor_input is given the positional numbers
    # 0..rank-1, while set_tensor below uses the 'index' field from
    # get_input_details() -- confirm both refer to the same tensors.
    interpreter.resize_tensor_input(0, ragged_input.flat_values.shape)
    for level in split_levels:
        interpreter.resize_tensor_input(
            level + 1, ragged_input.nested_row_splits[level].shape)
    interpreter.allocate_tensors()

    interpreter.set_tensor(
        interpreter.get_input_details()[0]['index'],
        ragged_input.flat_values.numpy())
    for level in split_levels:
        interpreter.set_tensor(
            interpreter.get_input_details()[level + 1]['index'],
            ragged_input.nested_row_splits[level].numpy())
    interpreter.invoke()

    # Output 0 holds the flat values; outputs 1.. hold the row splits.
    actual_values = interpreter.get_tensor(
        interpreter.get_output_details()[0]['index'])
    self.assertEqual(expected.flat_values.numpy().tolist(),
                     actual_values.tolist())
    for level in split_levels:
        actual_splits = interpreter.get_tensor(
            interpreter.get_output_details()[level + 1]['index'])
        self.assertEqual(expected.nested_row_splits[level].numpy().tolist(),
                         actual_splits.tolist())
def __init__(self, all_text):
    """Tokenize `all_text` and precompute its bigrams.

    Stores the tokens together with their start and limit offsets (from
    `UnicodeScriptTokenizer.tokenize_with_offsets`) and the width-2 n-grams
    of the tokens, reduced with STRING_JOIN.

    Args:
      all_text: Input handed unchanged to `tokenize_with_offsets`.
    """
    script_tokenizer = text.UnicodeScriptTokenizer()
    tokenized = script_tokenizer.tokenize_with_offsets(all_text)
    (self.tokens, self.offset_starts, self.offset_limits) = tokenized
    self.bigrams = text.ngrams(
        self.tokens, 2, reduction_type=text.Reduction.STRING_JOIN)
def testStringJoinReductionFailsWithImproperAxis(self):
    """STRING_JOIN requested with `axis=0` must raise `InvalidArgumentError`."""
    # `assertRaisesRegexp` is a deprecated alias that Python 3.12 removed from
    # `unittest.TestCase`; `assertRaisesRegex` is the supported spelling.
    with self.assertRaisesRegex(
            tf.errors.InvalidArgumentError,
            r".*requires that ngrams' 'axis' parameter be -1."):
        _ = text.ngrams(data=[], width=2, axis=0,
                        reduction_type=text.Reduction.STRING_JOIN)
def testMeanReduction(self):
    """MEAN over width-2 windows along axis 1 averages each adjacent pair."""
    rows = tf.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])
    pair_means = text.ngrams(
        data=rows, width=2, axis=1, reduction_type=text.Reduction.MEAN)
    self.assertRaggedEqual([[1.5, 2.5], [15.0, 25.0]], pair_means)
def testSumReduction(self):
    """SUM over width-2 windows along axis 1 adds each adjacent pair."""
    rows = tf.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])
    pair_sums = text.ngrams(
        data=rows, width=2, axis=1, reduction_type=text.Reduction.SUM)
    self.assertRaggedEqual([[3.0, 5.0], [30.0, 50.0]], pair_sums)
def testReductionOnAxisWithInsufficientValuesReturnsEmptySet(self):
    """A width of 4 on rows of only 3 values yields an empty result per row."""
    rows = tf.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])
    too_wide = text.ngrams(
        data=rows, width=4, axis=-1, reduction_type=text.Reduction.SUM)
    self.assertRaggedEqual([[], []], too_wide)
def testStringJoinReduction(self):
    """STRING_JOIN joins each adjacent pair of strings with the given separator."""
    words = tf.constant([["a", "b", "c"], ["dd", "ee", "ff"]])
    joined_pairs = text.ngrams(
        data=words,
        width=2,
        axis=-1,
        reduction_type=text.Reduction.STRING_JOIN,
        string_separator="|")
    self.assertRaggedEqual(
        [["a|b", "b|c"], ["dd|ee", "ee|ff"]], joined_pairs)
def testRaggedSumReductionAxisZero(self):
    """SUM along axis 0 of a two-row ragged input adds the rows elementwise."""
    ragged_rows = tf.ragged.constant([[1.0, 2.0, 3.0, 4.0],
                                      [10.0, 20.0, 30.0, 40.0]])
    row_sums = text.ngrams(
        data=ragged_rows, width=2, axis=0, reduction_type=text.Reduction.SUM)
    self.assertRaggedEqual([[11.0, 22.0, 33.0, 44.0]], row_sums)
def testReductionOnInnerAxis(self):
    """SUM along axis -2 of a 3-D input adds the two rows inside each batch."""
    batches = tf.constant([
        [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]],
        [[4.0, 5.0, 6.0], [40.0, 50.0, 60.0]],
    ])
    inner_sums = text.ngrams(
        data=batches, width=2, axis=-2, reduction_type=text.Reduction.SUM)
    self.assertRaggedEqual(
        [[[11.0, 22.0, 33.0]], [[44.0, 55.0, 66.0]]], inner_sums)
def testStringJoinReductionAxisZero(self):
    """STRING_JOIN on a 1-D input, where axis -1 and axis 0 coincide."""
    words = tf.constant(["a", "b", "c"])
    joined_pairs = text.ngrams(
        data=words,
        width=2,
        # The input is 1-D, so the -1 axis is the zero axis here.
        axis=-1,
        reduction_type=text.Reduction.STRING_JOIN,
        string_separator="|")
    self.assertRaggedEqual(["a|b", "b|c"], joined_pairs)
def make_data(sentences, window_size):
    """Build (context, next-word) training pairs from `sentences`.

    Each sentence is whitespace-tokenized and turned into n-grams of
    `window_size + 1` tokens (STRING_JOIN). Only the first n-gram of each
    sentence is used: its leading words form the input and its final word is
    the target.

    Relies on `word_index`, `n_class` and `to_categorical` from the
    surrounding module.

    Args:
      sentences: Input handed to `WhitespaceTokenizer.tokenize`.
      window_size: Number of context words per sample.

    Returns:
      A pair `(input_batch, target_batch)`: a list of space-joined context
      strings and the `to_categorical` encoding of `word_index[word] - 1`
      for each target word.
    """
    tokens = text.WhitespaceTokenizer().tokenize(sentences)
    ngrams = text.ngrams(
        tokens, window_size + 1, reduction_type=text.Reduction.STRING_JOIN)

    # Keep just the first n-gram of every sentence and split it back to words.
    # NOTE(review): splitting on " " assumes the default string_separator is a
    # single space -- confirm.
    first_ngrams = (per_sentence[0] for per_sentence in ngrams.to_list())
    segments = np.array(
        [ngram.decode("UTF-8").split(" ") for ngram in first_ngrams])

    context_words = segments[:, 0:-1]
    target_words = segments[:, -1]
    input_batch = [' '.join(words) for words in context_words]

    def class_id(word):
        # Shift the looked-up index down by one before encoding.
        return word_index[word] - 1

    target_batch = to_categorical(
        np.vectorize(class_id)(target_words), n_class, dtype='float32')
    return input_batch, target_batch
def test_width_3_tensor_equivalence(self, test_case):
    """Compare the TFLite width-3 ngrams model with `tf_text.ngrams` on a dense tensor.

    `test_case` is converted to a ragged constant and then to a dense tensor
    via `to_tensor()`. The model from `self._make_model` (built with
    `ragged_tensor=False, flex=False`) is executed by an interpreter that
    registers 'AddNgramsCustomOp', and its single output must equal the
    TensorFlow result.
    """
    dense_input = tf.ragged.constant(test_case).to_tensor()
    expected = tf_text.ngrams(
        dense_input, 3, reduction_type=tf_text.Reduction.STRING_JOIN)
    rank = dense_input.shape.rank

    tflite_model = self._make_model(rank, 3, ragged_tensor=False, flex=False)
    interpreter = interpreter_wrapper.InterpreterWithCustomOps(
        model_content=tflite_model,
        custom_op_registerers=['AddNgramsCustomOp'])

    # Resize the only input to the test case's shape before allocating.
    interpreter.resize_tensor_input(0, dense_input.shape)
    interpreter.allocate_tensors()

    input_index = interpreter.get_input_details()[0]['index']
    interpreter.set_tensor(input_index, dense_input.numpy())
    interpreter.invoke()

    output_index = interpreter.get_output_details()[0]['index']
    actual = interpreter.get_tensor(output_index)
    self.assertEqual(expected.numpy().tolist(), actual.tolist())
def test_width_2_ragged_tensor_equivalence(self, test_case):
    """Compare the TFLite width-2 ngrams model with `tf_text.ngrams` on ragged input.

    The model is invoked through the interpreter's signature runner: the flat
    values go in as 'values' and the row splits of level `r` as 'args_{r}'.
    Output 'output_0' carries the flat values and 'output_{i + 1}' the row
    splits of level `i`; each must match the TensorFlow result.
    """
    ragged_input = tf.ragged.constant(test_case)
    expected = tf_text.ngrams(
        ragged_input, 2, reduction_type=tf_text.Reduction.STRING_JOIN)
    rank = ragged_input.shape.rank
    split_levels = range(rank - 1)

    tflite_model = self._make_model(rank, 2, ragged_tensor=True, flex=False)
    interpreter = interpreter_wrapper.InterpreterWithCustomOps(
        model_content=tflite_model,
        custom_op_registerers=['AddNgramsCustomOp'])
    run_signature = interpreter.get_signature_runner()

    # Assemble the keyword arguments expected by the signature.
    signature_kwargs = {'values': ragged_input.flat_values.numpy()}
    signature_kwargs.update({
        f'args_{r}': ragged_input.nested_row_splits[r].numpy()
        for r in split_levels
    })
    output = run_signature(**signature_kwargs)

    actual_values = output['output_0']
    self.assertEqual(expected.flat_values.numpy().tolist(),
                     actual_values.tolist())
    for i in split_levels:
        actual_splits = output[f'output_{i + 1}']
        self.assertEqual(expected.nested_row_splits[i].numpy().tolist(),
                         actual_splits.tolist())
# Show the first two entries of `tokenized_docs` (defined earlier in the
# script, outside this excerpt).
iterator = iter(tokenized_docs)
for _ in range(2):
    print(next(iterator).to_list())

# Wordshape: compute four per-token features on whitespace-split tokens.
tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(
    ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

f1 = text.wordshape(tokens, text.WordShape.HAS_TITLE_CASE)  # Is capitalized?
f2 = text.wordshape(tokens, text.WordShape.IS_UPPERCASE)  # All uppercase?
f3 = text.wordshape(
    tokens, text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)  # Contains punctuation?
f4 = text.wordshape(tokens, text.WordShape.IS_NUMERIC_VALUE)  # Is a number?

for shape_feature in (f1, f2, f3, f4):
    print(shape_feature.to_list())

# N-grams: re-tokenize the same sentences and join adjacent token pairs
# (bi-grams, n = 2).
tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(
    ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
bigrams = text.ngrams(tokens, 2, reduction_type=text.Reduction.STRING_JOIN)
print(bigrams.to_list())
def main():
    """Run a short tour of TensorFlow Text operations, printing each result.

    Covers Unicode transcoding, tokenization (with and without offsets),
    tokenizing inside a `tf.data` pipeline, wordshape features and bi-grams.
    Takes no arguments and returns nothing; all output goes to stdout.
    """
    # Unicode: transcode UTF-16-BE documents to UTF-8 (the result is unused).
    docs = tf.constant([
        u'Everything not saved will be lost.'.encode('UTF-16-BE'),
        u'Sad☹'.encode('UTF-16-BE')
    ])
    _ = tf.strings.unicode_transcode(docs, input_encoding='UTF-16-BE',
                                     output_encoding='UTF-8')

    # Tokenization
    # UnicodeScriptTokenizer
    tokenizer = text.UnicodeScriptTokenizer()
    tokens = tokenizer.tokenize(
        ['everything not saved will be lost', u'Sad☹'.encode('UTF-8')])
    print(f'Tokens: {tokens.to_list()}')

    # Unicode split
    tokens = tf.strings.unicode_split([u"仅今年前".encode('UTF-8')], 'UTF-8')
    print(f'Tokens: {tokens.to_list()}')

    # Offsets: only the tokens and the end offsets are printed.
    tokenizer = text.UnicodeScriptTokenizer()
    (tokens, _, end_offsets) = tokenizer.tokenize_with_offsets(
        ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
    print(f'Tokens: {tokens.to_list()}')
    print(f'Offsets: {end_offsets.to_list()}')

    # TF.Data Example: tokenize each dataset element inside `map`.
    docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'],
                                               ["It's a trap!"]])
    tokenizer = text.WhitespaceTokenizer()
    tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
    iterator = iter(tokenized_docs)
    print(f'First sentence tokens: {next(iterator).to_list()}')
    print(f'Second sentence tokens: {next(iterator).to_list()}')

    # Other Text Ops
    # Wordshape
    tokenizer = text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(
        ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

    # Is capitalized?
    f1 = text.wordshape(tokens, text.WordShape.HAS_TITLE_CASE)
    # Are all letters uppercased?
    f2 = text.wordshape(tokens, text.WordShape.IS_UPPERCASE)
    # Does the token contain punctuation?
    f3 = text.wordshape(tokens, text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)
    # Is the token a number?
    f4 = text.wordshape(tokens, text.WordShape.IS_NUMERIC_VALUE)

    print(f'Is capitalized? {f1.to_list()}')
    print(f'Are all letters uppercased? {f2.to_list()}')
    print(f'Does the token contain punctuation? {f3.to_list()}')
    # Kept on one physical line: a single-quoted string literal cannot
    # contain a raw line break.
    print(f'Is the token a number? {f4.to_list()}')

    # N-grams & Sliding Window
    tokenizer = text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(
        ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

    # Ngrams, in this case bi-gram (n = 2)
    bigrams = text.ngrams(tokens, 2, reduction_type=text.Reduction.STRING_JOIN)
    print(f'Bi-grams: {bigrams.to_list()}')
def testUnspecifiedReductionTypeFails(self):
    """Calling `ngrams` without a `reduction_type` must raise `InvalidArgumentError`."""
    # `assertRaisesRegexp` is a deprecated alias that Python 3.12 removed from
    # `unittest.TestCase`; `assertRaisesRegex` is the supported spelling.
    with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                r"reduction_type must be specified."):
        _ = text.ngrams(data=[], width=2, axis=0)
def func(data):
    """Apply `tf_text.ngrams` to `data` with settings taken from the outer scope.

    `width`, `axis`, `reduction_type`, `string_separator` and `name` are not
    parameters of this function; they are read from the surrounding scope and
    forwarded positionally, in that order.
    """
    ngram_settings = (width, axis, reduction_type, string_separator, name)
    return tf_text.ngrams(data, *ngram_settings)
def ragged_func(values, *args):
    """Rebuild a ragged tensor from its parts and apply `tf_text.ngrams` to it.

    Args:
      values: Flat values of the ragged tensor.
      *args: The nested row splits, passed through as a tuple to
        `tf.RaggedTensor.from_nested_row_splits`.

    Returns:
      The result of `tf_text.ngrams` on the rebuilt tensor, using `width`,
      `axis`, `reduction_type`, `string_separator` and `name` read from the
      surrounding scope.
    """
    row_splits = args
    rebuilt = tf.RaggedTensor.from_nested_row_splits(
        flat_values=values, nested_row_splits=row_splits)
    result = tf_text.ngrams(
        rebuilt, width, axis, reduction_type, string_separator, name)
    return result
def testBadReductionTypeFails(self):
    """Passing the string "SUM" as `reduction_type` must raise `InvalidArgumentError`."""
    # `assertRaisesRegexp` is a deprecated alias that Python 3.12 removed from
    # `unittest.TestCase`; `assertRaisesRegex` is the supported spelling.
    with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                r"reduction_type must be a Reduction."):
        _ = text.ngrams(data=[], width=2, axis=0, reduction_type="SUM")