```python
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(['This is a sample text', 'Another sample text'])
sequences = tokenizer.texts_to_sequences(['This is a sample text', 'Another sample text'])

# Convert sequences back to text
texts = tokenizer.sequences_to_texts(sequences)
print(texts)  # ['this is a sample text', 'another sample text']
```
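To see why the round trip comes back lowercased, it can help to inspect the vocabulary the tokenizer learned during `fit_on_texts`. This is a minimal sketch assuming the `tokenizer` fitted above; `word_index` and `index_word` are attributes of a fitted `Tokenizer`, and the exact index assignments shown are illustrative.

```python
# The Tokenizer lowercases and strips punctuation by default, so the
# learned vocabulary (and any reconstructed text) is all lowercase.
print(tokenizer.word_index)
# e.g. {'sample': 1, 'text': 2, 'this': 3, 'is': 4, 'a': 5, 'another': 6}

# index_word is the reverse mapping that sequences_to_texts relies on.
print(tokenizer.index_word[1])  # e.g. 'sample'
```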
In this example, we use the same workflow as in Example 1, but we construct the `Tokenizer` with two extra arguments: `num_words=100`, which limits the vocabulary used when converting texts to the most frequently occurring words (the cap applies at conversion time, not during `fit_on_texts`), and `oov_token`, a placeholder string substituted for out-of-vocabulary words. Note that `sequences_to_texts` takes only the sequences; the OOV replacement string is configured on the tokenizer itself.

```python
from tensorflow.keras.preprocessing.text import Tokenizer

# num_words caps the vocabulary at conversion time; oov_token is the
# placeholder that replaces any word outside that vocabulary.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(['This is a sample text', 'Another sample text'])
sequences = tokenizer.texts_to_sequences(['This is a sample text', 'Another sample text'])

# Convert sequences back to text
texts = tokenizer.sequences_to_texts(sequences)
print(texts)  # ['this is a sample text', 'another sample text']
```
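The `oov_token` only becomes visible when you convert text containing words the tokenizer never saw during fitting. Below is a minimal sketch assuming the `tokenizer` from the example above; the sentence with the unseen words is made up for illustration, and the exact indices shown are illustrative.

```python
# 'an' and 'unknown' were not in the fitting corpus, so both map to the
# OOV index (index 1 when oov_token is set).
unseen = tokenizer.texts_to_sequences(['This is an unknown text'])
print(unseen)  # e.g. [[4, 5, 1, 1, 3]]

# Converting back substitutes the oov_token string for those indices.
print(tokenizer.sequences_to_texts(unseen))
# ['this is <OOV> <OOV> text']
```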