def __init__(self, all_text):
  # self.paragraphs = all_text
  tokenizer = text.UnicodeScriptTokenizer()
  (self.tokens, self.offset_starts, self.offset_limits) = (
      tokenizer.tokenize_with_offsets(all_text))
  self.bigrams = text.ngrams(
      self.tokens, 2, reduction_type=text.Reduction.STRING_JOIN)
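# Minimal usage sketch. The enclosing class is not shown in this section, so
# "TextFeatures" below is only a placeholder name for it:
#
#   features = TextFeatures(tf.constant(['Everything not saved will be lost.']))
#   features.tokens    # RaggedTensor of tokens per input string
#   features.bigrams   # RaggedTensor of space-joined token bigrams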
import tensorflow as tf
import tensorflow_text as text

docs = tf.constant([
    u'Everything not saved will be lost.'.encode('UTF-16-BE'),
    u'Sad☹'.encode('UTF-16-BE')
])
utf8_docs = tf.strings.unicode_transcode(docs,
                                         input_encoding='UTF-16-BE',
                                         output_encoding='UTF-8')

tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(
    ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())

tokenizer = text.UnicodeScriptTokenizer()
tokens = tokenizer.tokenize(
    ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())

tokens = tf.strings.unicode_split([u"仅今年前".encode('UTF-8')], 'UTF-8')
print(tokens.to_list())

tokenizer = text.UnicodeScriptTokenizer()
(tokens, offset_starts, offset_limits) = tokenizer.tokenize_with_offsets(
    ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())
print(offset_starts.to_list())
print(offset_limits.to_list())

docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'],
                                           ["It's a trap!"]])
def __call__(self, x):
  # Constrained sequence
  cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0, 10.0]])
  cs_input = np.array([cs_scores, cs_scores, cs_scores], dtype=np.float32)
  cs_transition_weights = np.array(
      [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
       [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
       [0.0, 1.0, 2.0, 3.0, 0.0]],
      dtype=np.float32)
  cs_allowed_transitions = np.array([[True, True, True, True, True],
                                     [True, True, True, True, True],
                                     [True, False, True, False, False],
                                     [True, True, True, True, True],
                                     [True, False, True, True, True]])
  constrained_sequence = text.viterbi_constrained_sequence(
      cs_input, [2, 2, 2],
      allowed_transitions=cs_allowed_transitions,
      transition_weights=cs_transition_weights,
      use_log_space=True,
      use_start_and_end_states=True)
  # Max Spanning Tree
  mst_num_nodes = tf.constant([4, 3], tf.int32)
  mst_scores = tf.constant(
      [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
       [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
      tf.int32)  # pyformat: disable
  (max_spanning_tree, _) = text.max_spanning_tree(mst_num_nodes, mst_scores)
  # Normalize
  normalized = text.case_fold_utf8(['A String'])
  normalized = text.normalize_utf8(normalized)
  # Regex split
  regex_split = text.regex_split(input=['Yo dawg!'],
                                 delim_regex_pattern=r'\s')
  # Rouge-L
  rl_hypotheses = tf.ragged.constant(
      [['captain', 'of', 'the', 'delta', 'flight'],
       ['the', '1990', 'transcript']])
  rl_references = tf.ragged.constant(
      [['delta', 'air', 'lines', 'flight'],
       ['this', 'concludes', 'the', 'transcript']])
  (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)
  # Sentence breaking version 1 (token dependent)
  sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                   ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
  sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
  sb_token_starts = []
  sb_token_ends = []
  for sentence in sb_token_word:
    sentence_string = ''
    sentence_start = []
    sentence_end = []
    for word in sentence:
      # Accumulate the sentence text and record each token's byte offsets.
      sentence_start.append(len(sentence_string))
      sentence_string += word + ' '
      sentence_end.append(len(sentence_string))
    sb_token_starts.append(sentence_start)
    sb_token_ends.append(sentence_end)
  sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
  sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
  sb_token_properties = tf.ragged.constant(sb_token_properties,
                                           dtype=tf.int64)
  (sentence_breaking, _, _, _) = text.sentence_fragments(
      sb_token_word, sb_token_starts, sb_token_ends, sb_token_properties)
  # Sentence breaking version 2 (StateBasedSentenceBreaker)
  sbv2_text_input = [['Welcome to the U.S.! Harry'],
                     ['Wu Tang Clan; ain\'t nothing']]
  sentence_breaker_v2 = text.StateBasedSentenceBreaker()
  sbv2_fragment_text, _, _ = (
      sentence_breaker_v2.break_sentences_with_offsets(sbv2_text_input))
  # Sentencepiece tokenizer
  sp_model_file = (
      'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model')
  sp_model = open(sp_model_file, 'rb').read()
  sp_tokenizer = text.SentencepieceTokenizer(sp_model)
  sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
  sentencepiece = sp_tokenizer.detokenize(sentencepiece)
  (sentencepiece, _, _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
  sentencepiece_size = sp_tokenizer.vocab_size()
  sentencepiece_id = sp_tokenizer.id_to_string(1)
  # Split merge tokenizer
  sm_tokenizer = text.SplitMergeTokenizer()
  split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                      [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])
  # Split merge from logits tokenizer
  smfl_tokenizer = text.SplitMergeFromLogitsTokenizer()
  split_merge_from_logits = smfl_tokenizer.tokenize(
      b'IloveFlume!',
      # One pair of logits for each Unicode character from the text. Each
      # pair indicates a "split" action if the first component is greater
      # than the second one, and a "merge" otherwise.
      [
          [2.7, -0.3],  # I: split
          [4.1, 0.82],  # l: split
          [-2.3, 4.3],  # o: merge
          [3.1, 12.2],  # v: merge
          [-3.0, 4.7],  # e: merge
          [2.7, -0.7],  # F: split
          [0.7, 15.0],  # l: merge
          [1.6, 23.0],  # u: merge
          [2.1, 11.0],  # m: merge
          [0.0, 20.0],  # e: merge
          [18.0, 0.7],  # !: split
      ])
  # Confirm TF unicode_script op that requires ICU works
  tf_unicode_script = tf.strings.unicode_script(
      [ord('a'), 0x0411, 0x82b8, ord(',')])
  # Unicode script tokenizer
  us_tokenizer = text.UnicodeScriptTokenizer()
  unicode_script = us_tokenizer.tokenize(['a string'])
  # Whitespace tokenizer
  ws_tokenizer = text.WhitespaceTokenizer()
  whitespace = ws_tokenizer.tokenize(['a string'])
  # Wordpiece tokenizer
  wp_initializer = tf.lookup.KeyValueTensorInitializer(
      ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
  self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                  default_value=-1)
  wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
  wordpiece = wp_tokenizer.tokenize(['i am'])
  # Wordshape
  wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                              text.WordShape.HAS_PUNCTUATION_DASH)

  # Assertion method
  def assert_check(tensor):
    return tf.assert_equal(tensor, tf.identity(tensor))

  # Assertions
  constrained_sequence_assert = assert_check(constrained_sequence.to_tensor())
  max_spanning_tree_assert = assert_check(max_spanning_tree)
  normalized_assert = assert_check(normalized)
  regex_split_assert = assert_check(regex_split.to_tensor())
  rouge_l_assert = assert_check(rouge_l)
  sentence_breaking_assert = assert_check(sentence_breaking.to_tensor())
  sentence_breaking_v2_assert = assert_check(sbv2_fragment_text.to_tensor())
  sentencepiece_assert = assert_check(sentencepiece.to_tensor())
  sentencepiece_id_assert = assert_check(sentencepiece_id)
  sentencepiece_size_assert = assert_check(sentencepiece_size)
  split_merge_assert = assert_check(split_merge)
  split_merge_from_logits_assert = assert_check(split_merge_from_logits)
  tf_unicode_script_assert = assert_check(tf_unicode_script)
  unicode_script_assert = assert_check(unicode_script.to_tensor())
  whitespace_assert = assert_check(whitespace.to_tensor())
  wordpiece_assert = assert_check(wordpiece.to_tensor())
  wordshapes_assert = assert_check(wordshapes)

  with tf.control_dependencies([
      constrained_sequence_assert, max_spanning_tree_assert,
      normalized_assert, regex_split_assert, rouge_l_assert,
      sentence_breaking_assert, sentence_breaking_v2_assert,
      sentencepiece_assert, sentencepiece_id_assert,
      sentencepiece_size_assert, split_merge_assert,
      split_merge_from_logits_assert, tf_unicode_script_assert,
      unicode_script_assert, whitespace_assert, wordpiece_assert,
      wordshapes_assert
  ]):
    y = tf.add(x, [1])
  return {'y': y}
def _create_or_get_tokenizer():
  if DEFAULT_TOKENIZER_TYPE not in _tokenizers:
    _tokenizers[DEFAULT_TOKENIZER_TYPE] = tf_text.UnicodeScriptTokenizer()
  return _tokenizers[DEFAULT_TOKENIZER_TYPE]
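# Sketch of the module-level state this helper assumes. Neither name is
# defined in this section, so the definitions below are illustrative only:
#
#   DEFAULT_TOKENIZER_TYPE = 'unicode_script'
#   _tokenizers = {}
#
# Repeated calls then reuse the same cached tokenizer instance:
#
#   tokenizer = _create_or_get_tokenizer()
#   assert _create_or_get_tokenizer() is tokenizer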
def main():
  # Unicode
  docs = tf.constant([
      u'Everything not saved will be lost.'.encode('UTF-16-BE'),
      u'Sad☹'.encode('UTF-16-BE')
  ])
  _ = tf.strings.unicode_transcode(docs,
                                   input_encoding='UTF-16-BE',
                                   output_encoding='UTF-8')

  # Tokenization
  # UnicodeScriptTokenizer
  tokenizer = text.UnicodeScriptTokenizer()
  tokens = tokenizer.tokenize(
      ['everything not saved will be lost', u'Sad☹'.encode('UTF-8')])
  print(f'Tokens: {tokens.to_list()}')

  # Unicode split
  tokens = tf.strings.unicode_split([u"仅今年前".encode('UTF-8')], 'UTF-8')
  print(f'Tokens: {tokens.to_list()}')

  # Offsets
  tokenizer = text.UnicodeScriptTokenizer()
  (tokens, _, end_offsets) = tokenizer.tokenize_with_offsets(
      ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
  print(f'Tokens: {tokens.to_list()}')
  print(f'Offsets: {end_offsets.to_list()}')

  # TF.Data Example
  docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'],
                                             ["It's a trap!"]])
  tokenizer = text.WhitespaceTokenizer()
  tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
  iterator = iter(tokenized_docs)
  print(f'First sentence tokens: {next(iterator).to_list()}')
  print(f'Second sentence tokens: {next(iterator).to_list()}')

  # Other Text Ops
  # Wordshape
  tokenizer = text.WhitespaceTokenizer()
  tokens = tokenizer.tokenize(
      ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
  # Is capitalized?
  f1 = text.wordshape(tokens, text.WordShape.HAS_TITLE_CASE)
  # Are all letters uppercased?
  f2 = text.wordshape(tokens, text.WordShape.IS_UPPERCASE)
  # Does the token contain punctuation?
  f3 = text.wordshape(tokens, text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)
  # Is the token a number?
  f4 = text.wordshape(tokens, text.WordShape.IS_NUMERIC_VALUE)
  print(f'Is capitalized? {f1.to_list()}')
  print(f'Are all letters uppercased? {f2.to_list()}')
  print(f'Does the token contain punctuation? {f3.to_list()}')
  print(f'Is the token a number? {f4.to_list()}')

  # N-grams & Sliding Window
  tokenizer = text.WhitespaceTokenizer()
  tokens = tokenizer.tokenize(
      ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
  # Ngrams, in this case bi-gram (n = 2)
  bigrams = text.ngrams(tokens, 2, reduction_type=text.Reduction.STRING_JOIN)
  print(f'Bi-grams: {bigrams.to_list()}')
def __call__(self, x):
  # Constrained sequence
  cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0, 10.0]])
  cs_input = np.array([cs_scores, cs_scores, cs_scores], dtype=np.float32)
  cs_transition_weights = np.array(
      [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
       [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
       [0.0, 1.0, 2.0, 3.0, 0.0]],
      dtype=np.float32)
  cs_allowed_transitions = np.array([[True, True, True, True, True],
                                     [True, True, True, True, True],
                                     [True, False, True, False, False],
                                     [True, True, True, True, True],
                                     [True, False, True, True, True]])
  constrained_sequence = text.viterbi_constrained_sequence(
      cs_input, [2, 2, 2],
      allowed_transitions=cs_allowed_transitions,
      transition_weights=cs_transition_weights,
      use_log_space=True,
      use_start_and_end_states=True)
  # Max Spanning Tree
  mst_num_nodes = tf.constant([4, 3], tf.int32)
  mst_scores = tf.constant(
      [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
       [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
      tf.int32)  # pyformat: disable
  (max_spanning_tree, _) = text.max_spanning_tree(mst_num_nodes, mst_scores)
  # Normalize
  normalized = text.case_fold_utf8(['A String'])
  normalized = text.normalize_utf8(normalized)
  # Regex split
  regex_split = text.regex_split(input=['Yo dawg!'],
                                 delim_regex_pattern=r'\s')
  # Rouge-L
  rl_hypotheses = tf.ragged.constant(
      [['captain', 'of', 'the', 'delta', 'flight'],
       ['the', '1990', 'transcript']])
  rl_references = tf.ragged.constant(
      [['delta', 'air', 'lines', 'flight'],
       ['this', 'concludes', 'the', 'transcript']])
  (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)
  # Sentence breaking
  sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                   ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
  sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
  sb_token_starts = []
  sb_token_ends = []
  for sentence in sb_token_word:
    sentence_string = ''
    sentence_start = []
    sentence_end = []
    for word in sentence:
      # Accumulate the sentence text and record each token's byte offsets.
      sentence_start.append(len(sentence_string))
      sentence_string += word + ' '
      sentence_end.append(len(sentence_string))
    sb_token_starts.append(sentence_start)
    sb_token_ends.append(sentence_end)
  sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
  sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
  sb_token_properties = tf.ragged.constant(sb_token_properties,
                                           dtype=tf.int64)
  (sentence_breaking, _, _, _) = text.sentence_fragments(
      sb_token_word, sb_token_starts, sb_token_ends, sb_token_properties)
  # Sentencepiece tokenizer
  sp_model_file = (
      'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model')
  sp_model = open(sp_model_file, 'rb').read()
  sp_tokenizer = text.SentencepieceTokenizer(sp_model)
  sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
  sentencepiece = sp_tokenizer.detokenize(sentencepiece)
  (sentencepiece, _, _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
  sentencepiece_size = sp_tokenizer.vocab_size()
  sentencepiece_id = sp_tokenizer.id_to_string(1)
  # Split merge tokenizer - not in this version
  sm_tokenizer = text.SplitMergeTokenizer()
  split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                      [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])
  # Unicode script tokenizer
  us_tokenizer = text.UnicodeScriptTokenizer()
  unicode_script = us_tokenizer.tokenize(['a string'])
  # Whitespace tokenizer
  ws_tokenizer = text.WhitespaceTokenizer()
  whitespace = ws_tokenizer.tokenize(['a string'])
  # Wordpiece tokenizer
  wp_initializer = tf.lookup.KeyValueTensorInitializer(
      ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
  self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                  default_value=-1)
  wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
  wordpiece = wp_tokenizer.tokenize(['i am'])
  # Wordshape
  wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                              text.WordShape.HAS_PUNCTUATION_DASH)

  with tf.control_dependencies([
      constrained_sequence, max_spanning_tree, normalized, regex_split,
      rouge_l, sentence_breaking, sentencepiece, sentencepiece_id,
      sentencepiece_size, split_merge, unicode_script, whitespace, wordpiece,
      wordshapes
  ]):
    y = tf.add(x, [1])
  return {'y': y}
def main():
  # Example 1: Predict the tag for a Stack Overflow question
  # Download and explore the dataset
  url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'
  dataset = utils.get_file('stack_overflow_16k.tar.gz',
                           url,
                           untar=True,
                           cache_dir='stack_overflow',
                           cache_subdir='')
  dataset_dir = pathlib.Path(dataset).parent
  print(f'Paths: {list(dataset_dir.iterdir())}')
  train_dir = dataset_dir / 'train'
  print(f'Train paths: {list(train_dir.iterdir())}')
  sample_file = train_dir / 'python/1755.txt'
  with open(sample_file) as f:
    print(f.read())

  # Load the dataset
  batch_size = 32
  seed = 42
  raw_train_ds = preprocessing.text_dataset_from_directory(
      train_dir,
      batch_size=batch_size,
      validation_split=0.2,
      subset='training',
      seed=seed)
  for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(10):
      print(f'Question: {text_batch.numpy()[i]}')
      print(f'Label: {label_batch.numpy()[i]}')
  for i, label in enumerate(raw_train_ds.class_names):
    print(f'Label {i} corresponds to {label}')
  raw_val_ds = preprocessing.text_dataset_from_directory(
      train_dir,
      batch_size=batch_size,
      validation_split=0.2,
      subset='validation',
      seed=seed)
  test_dir = dataset_dir / 'test'
  raw_test_ds = preprocessing.text_dataset_from_directory(
      test_dir, batch_size=batch_size)

  # Prepare the dataset for training
  vocab_size = 10000
  binary_vectorize_layer = TextVectorization(max_tokens=vocab_size,
                                             output_mode='binary')
  max_sequence_length = 250
  int_vectorize_layer = TextVectorization(
      max_tokens=vocab_size,
      output_mode='int',
      output_sequence_length=max_sequence_length)

  # Make a text-only dataset (without labels), then call adapt
  train_text = raw_train_ds.map(lambda text, labels: text)
  binary_vectorize_layer.adapt(train_text)
  int_vectorize_layer.adapt(train_text)

  def binary_vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return binary_vectorize_layer(text), label

  def int_vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return int_vectorize_layer(text), label

  # Retrieve a batch (of 32 reviews and labels) from the dataset
  text_batch, label_batch = next(iter(raw_train_ds))
  first_question, first_label = text_batch[0], label_batch[0]
  print(f'Question: {first_question}')
  print(f'Label: {first_label}')
  print(
      f'"binary" vectorized question: {binary_vectorize_text(first_question, first_label)[0]}'
  )
  print(
      f'"int" vectorized question: {int_vectorize_text(first_question, first_label)[0]}'
  )
  print(f'1289 ---> {int_vectorize_layer.get_vocabulary()[1289]}')
  print(f' 313 ---> {int_vectorize_layer.get_vocabulary()[313]}')
  print(f'Vocabulary size: {len(int_vectorize_layer.get_vocabulary())}')

  binary_train_ds = raw_train_ds.map(binary_vectorize_text)
  binary_val_ds = raw_val_ds.map(binary_vectorize_text)
  binary_test_ds = raw_test_ds.map(binary_vectorize_text)
  int_train_ds = raw_train_ds.map(int_vectorize_text)
  int_val_ds = raw_val_ds.map(int_vectorize_text)
  int_test_ds = raw_test_ds.map(int_vectorize_text)

  # Configure the dataset for performance
  AUTOTUNE = tf.data.AUTOTUNE

  def configure_dataset(dataset):
    return dataset.cache().prefetch(buffer_size=AUTOTUNE)

  binary_train_ds = configure_dataset(binary_train_ds)
  binary_val_ds = configure_dataset(binary_val_ds)
  binary_test_ds = configure_dataset(binary_test_ds)
  int_train_ds = configure_dataset(int_train_ds)
  int_val_ds = configure_dataset(int_val_ds)
  int_test_ds = configure_dataset(int_test_ds)

  # Train the model
  binary_model = tf.keras.Sequential([layers.Dense(4)])
  binary_model.compile(
      optimizer='adam',
      loss=losses.SparseCategoricalCrossentropy(from_logits=True),
      metrics=['accuracy'])
  _ = binary_model.fit(binary_train_ds,
                       validation_data=binary_val_ds,
                       epochs=10)

  def create_model(vocab_size, num_labels):
    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, 64, mask_zero=True),
        layers.Conv1D(64, 5, padding='valid', activation='relu', strides=2),
        layers.GlobalMaxPool1D(),
        layers.Dense(num_labels)
    ])
    return model

  # vocab_size is vocab_size + 1 since 0 is used additionally for padding.
  int_model = create_model(vocab_size=vocab_size + 1, num_labels=4)
  int_model.compile(
      optimizer='adam',
      loss=losses.SparseCategoricalCrossentropy(from_logits=True),
      metrics=['accuracy'])
  _ = int_model.fit(int_train_ds, validation_data=int_val_ds, epochs=5)

  print(f'Linear model on binary vectorized data: {binary_model.summary()}')
  print(f'ConvNet model on int vectorized data: {int_model.summary()}')
  _, binary_accuracy = binary_model.evaluate(binary_test_ds)
  _, int_accuracy = int_model.evaluate(int_test_ds)
  print(f'Binary model accuracy {binary_accuracy:2.2%}')
  print(f'Int model accuracy: {int_accuracy:2.2%}')

  # Export the model
  export_model = tf.keras.Sequential(
      [binary_vectorize_layer, binary_model,
       layers.Activation('sigmoid')])
  export_model.compile(
      optimizer='adam',
      # The exported model ends with a sigmoid activation, so its outputs are
      # probabilities rather than logits.
      loss=losses.SparseCategoricalCrossentropy(from_logits=False),
      metrics=['accuracy'])
  # Test it with `raw_test_ds`, which yields raw strings
  loss, accuracy = export_model.evaluate(raw_test_ds)
  print(f'Accuracy: {accuracy:2.2%}')

  def get_string_labels(predicted_scores_batch):
    predicted_int_labels = tf.argmax(predicted_scores_batch, axis=1)
    predicted_labels = tf.gather(raw_train_ds.class_names,
                                 predicted_int_labels)
    return predicted_labels

  # Run inference on new data
  inputs = [
      'how do I extract keys from a dict into a list?',  # python
      'debug public static void main(string[] args) {...}',  # java
  ]
  predicted_scores = export_model.predict(inputs)
  predicted_labels = get_string_labels(predicted_scores)
  for input, label in zip(inputs, predicted_labels):
    print(f'Question: {input}')
    print(f'Predicted label: {label.numpy()}')

  # Example 2: Predict the author of Iliad translations
  url = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
  file_names = ['cowper.txt', 'derby.txt', 'butler.txt']
  for name in file_names:
    text_dir = utils.get_file(name, origin=url + name)
  parent_dir = pathlib.Path(text_dir).parent
  print(f'Paths: {list(parent_dir.iterdir())}')

  # Load the dataset
  def labeler(example, index):
    return example, tf.cast(index, tf.int64)

  labeled_data_sets = []
  for i, file_name in enumerate(file_names):
    lines_dataset = tf.data.TextLineDataset(str(parent_dir / file_name))
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)

  buffer_size = 50000
  batch_size = 64
  validation_size = 5000
  all_labeled_data = labeled_data_sets[0]
  for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
  all_labeled_data = all_labeled_data.shuffle(buffer_size,
                                              reshuffle_each_iteration=False)
  for text, label in all_labeled_data.take(10):
    print(f'Sentence: {text.numpy()}')
    print(f'Label: {label.numpy()}')

  # Prepare the dataset for training
  tokenizer = tf_text.UnicodeScriptTokenizer()

  def tokenize(text, unused_label):
    lower_case = tf_text.case_fold_utf8(text)
    return tokenizer.tokenize(lower_case)

  tokenized_ds = all_labeled_data.map(tokenize)
  for text_batch in tokenized_ds.take(5):
    print(f'Tokens: {text_batch.numpy()}')
  tokenized_ds = configure_dataset(tokenized_ds)
  vocab_dict = collections.defaultdict(lambda: 0)
  for toks in tokenized_ds.as_numpy_iterator():
    for tok in toks:
      vocab_dict[tok] += 1

  vocab = sorted(vocab_dict.items(), key=lambda x: x[1], reverse=True)
  vocab = [token for token, count in vocab]
  vocab = vocab[:vocab_size]
  vocab_size = len(vocab)
  print(f'Vocab size: {vocab_size}')
  print(f'First five vocab entries: {vocab[:5]}')

  keys = vocab
  values = range(2, len(vocab) + 2)  # reserve 0 for padding, 1 for OOV
  init = tf.lookup.KeyValueTensorInitializer(keys,
                                             values,
                                             key_dtype=tf.string,
                                             value_dtype=tf.int64)
  num_oov_buckets = 1
  vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets)

  def preprocess_text(text, label):
    standardized = tf_text.case_fold_utf8(text)
    tokenized = tokenizer.tokenize(standardized)
    vectorized = vocab_table.lookup(tokenized)
    return vectorized, label

  example_text, example_label = next(iter(all_labeled_data))
  print(f'Sentence: {example_text.numpy()}')
  vectorized_text, example_label = preprocess_text(example_text,
                                                   example_label)
  print(f'Vectorized sentence: {vectorized_text.numpy()}')
  all_encoded_data = all_labeled_data.map(preprocess_text)

  # Split the dataset into train and test
  train_data = all_encoded_data.skip(validation_size).shuffle(buffer_size)
  validation_data = all_encoded_data.take(validation_size)
  train_data = train_data.padded_batch(batch_size)
  validation_data = validation_data.padded_batch(batch_size)
  sample_text, sample_labels = next(iter(validation_data))
  print(f'Text batch shape: {sample_text.shape}')
  print(f'Label batch shape: {sample_labels.shape}')
  print(f'First text example: {sample_text[0]}')
  print(f'First label example: {sample_labels[0]}')
  vocab_size += 2

  # Train the model
  model = create_model(vocab_size=vocab_size, num_labels=3)
  model.compile(optimizer='adam',
                loss=losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])
  _ = model.fit(train_data, validation_data=validation_data, epochs=3)
  loss, accuracy = model.evaluate(validation_data)
  print(f'Loss: {loss}')
  print(f'Accuracy: {accuracy:2.2%}')

  # Export the model
  preprocess_layer = TextVectorization(
      max_tokens=vocab_size,
      standardize=tf_text.case_fold_utf8,
      split=tokenizer.tokenize,
      output_mode='int',
      output_sequence_length=max_sequence_length)
  preprocess_layer.set_vocabulary(vocab)
  export_model = tf.keras.Sequential(
      [preprocess_layer, model, layers.Activation('sigmoid')])
  export_model.compile(
      optimizer='adam',
      # The exported model ends with a sigmoid activation, so its outputs are
      # probabilities rather than logits.
      loss=losses.SparseCategoricalCrossentropy(from_logits=False),
      metrics=['accuracy'])

  # Create a test dataset of raw strings
  test_ds = all_labeled_data.take(validation_size).batch(batch_size)
  test_ds = configure_dataset(test_ds)
  loss, accuracy = export_model.evaluate(test_ds)
  print(f'Loss: {loss}')
  print(f'Accuracy: {accuracy:2.2%}')

  # Run inference on new data
  inputs = [
      "Join'd to th' Ionians with their flowing robes,",  # Label: 1
      "the allies, and his armour flashed about him so that he seemed to all",  # Label: 2
      "And with loud clangor of his arms he fell.",  # Label: 0
  ]
  predicted_scores = export_model.predict(inputs)
  predicted_labels = tf.argmax(predicted_scores, axis=1)
  for input, label in zip(inputs, predicted_labels):
    print(f'Sentence: {input}')
    print(f'Predicted label: {label.numpy()}')

  # Download more datasets using TensorFlow Datasets (TFDS)
  train_ds = tfds.load('imdb_reviews',
                       split='train',
                       batch_size=batch_size,
                       shuffle_files=True,
                       as_supervised=True)
  val_ds = tfds.load('imdb_reviews',
                     split='train',
                     batch_size=batch_size,
                     shuffle_files=True,
                     as_supervised=True)
  for review_batch, label_batch in val_ds.take(1):
    for i in range(5):
      print(f'Review: {review_batch[i].numpy()}')
      print(f'Label: {label_batch[i].numpy()}')

  # Prepare the dataset for training
  vectorize_layer = TextVectorization(
      max_tokens=vocab_size,
      output_mode='int',
      output_sequence_length=max_sequence_length)
  # Make a text-only dataset (without labels), then call adapt
  train_text = train_ds.map(lambda text, labels: text)
  vectorize_layer.adapt(train_text)

  def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

  train_ds = train_ds.map(vectorize_text)
  val_ds = val_ds.map(vectorize_text)
  train_ds = configure_dataset(train_ds)
  val_ds = configure_dataset(val_ds)

  # Train the model
  model = create_model(vocab_size=vocab_size + 1, num_labels=1)
  model.summary()
  model.compile(optimizer='adam',
                loss=losses.BinaryCrossentropy(from_logits=True),
                metrics=['accuracy'])
  _ = model.fit(train_ds, validation_data=val_ds, epochs=3)
  loss, accuracy = model.evaluate(val_ds)
  print(f'Loss: {loss}')
  print(f'Accuracy: {accuracy:2.2%}')

  # Export the model
  export_model = tf.keras.Sequential(
      [vectorize_layer, model, layers.Activation('sigmoid')])
  export_model.compile(
      optimizer='adam',
      # The single sigmoid output is a probability of the positive class, so
      # use binary cross-entropy on probabilities rather than logits.
      loss=losses.BinaryCrossentropy(from_logits=False),
      metrics=['accuracy'])

  # 0 ---> negative review
  # 1 ---> positive review
  inputs = [
      'This is a fantastic movie.',
      'This is a bad movie.',
      'This movie was so bad that it was good.',
      'I will never say yes to watching this movie.'
  ]
  predicted_scores = export_model.predict(inputs)
  predicted_labels = [int(round(x[0])) for x in predicted_scores]
  for input, label in zip(inputs, predicted_labels):
    print(f'Review: {input}')
    print(f'Predicted label: {label}')