def build_classifier(text):
    MAX_VOCAB_SIZE = 20000
    encoder = TextVectorization(max_tokens=MAX_VOCAB_SIZE)
    encoder.adapt(text)
    vocabset = set(encoder.get_vocabulary())
    vocab_size = len(encoder.get_vocabulary())
    word2idx, weights = get_glove_embeddings(vocabset)

    # Build an embedding matrix aligned with the TextVectorization vocabulary
    embedding_matrix = np.zeros((vocab_size, weights.shape[1]))
    for i, word in enumerate(encoder.get_vocabulary()):
        vec = word2idx.get(word)
        if vec is not None:
            embedding_matrix[i] = weights[vec]

    model = tf.keras.Sequential([
        encoder,
        tf.keras.layers.Embedding(
            input_dim=embedding_matrix.shape[0],
            output_dim=embedding_matrix.shape[1],
            weights=[embedding_matrix],  # Keras expects a list of weight arrays
            mask_zero=True,
            trainable=True),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model
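A minimal usage sketch for `build_classifier` above, assuming hypothetical `train_texts`/`train_labels` arrays and a binary cross-entropy setup; these names and hyperparameters are illustrative assumptions, not part of the original snippet.

# Hypothetical data and hyperparameters, for illustration only.
model = build_classifier(train_texts)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(np.array(train_texts), np.array(train_labels), epochs=5, batch_size=32)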
def prepare_data_set_for_training(raw_train_ds, raw_val_ds, raw_test_ds):
    max_features = 10000
    sequence_length = 250

    # Create the vectorization layer
    vectorize_layer = TextVectorization(standardize=custom_standardization,
                                        max_tokens=max_features,
                                        output_mode='int',
                                        output_sequence_length=sequence_length)

    # Make a text-only dataset (without labels), then call adapt
    train_text = raw_train_ds.map(lambda x, y: x)
    vectorize_layer.adapt(train_text)

    def vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return vectorize_layer(text), label

    # Retrieve a batch (of 32 reviews and labels) from the dataset
    text_batch, label_batch = next(iter(raw_train_ds))
    first_review, first_label = text_batch[0], label_batch[0]
    print("Review", first_review)
    print("Label", raw_train_ds.class_names[first_label])
    print("Vectorized review", vectorize_text(first_review, first_label))

    # Explore the vocabulary
    print("1287 ---> ", vectorize_layer.get_vocabulary()[1287])
    print(" 313 ---> ", vectorize_layer.get_vocabulary()[313])
    print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

    train_ds = raw_train_ds.map(vectorize_text)
    val_ds = raw_val_ds.map(vectorize_text)
    test_ds = raw_test_ds.map(vectorize_text)
    return max_features, train_ds, val_ds, test_ds, vectorize_layer
def __init__(self, model_dir):
    # Load the artifacts
    self.artifacts = pickle.load(
        open(os.path.join(model_dir, 'model_artifacts.pkl'), 'rb'))

    # Create the vectorizers
    train_src = np.load(os.path.join(model_dir, 'train_src.npy'),
                        allow_pickle=True)
    train_tgt = np.load(os.path.join(model_dir, 'train_tgt.npy'),
                        allow_pickle=True)
    vectorizer_src = TextVectorization()
    vectorizer_src.adapt(train_src)
    train_seq = vectorizer_src(train_src)
    self.vectorizer_src = vectorizer_src
    vectorizer_tgt = TextVectorization()
    vectorizer_tgt.adapt(train_tgt)
    self.vectorizer_tgt = vectorizer_tgt

    # Load models
    vocab_src = vectorizer_src.get_vocabulary()
    self.encoder = MyEncoder(
        len(vocab_src),
        embedding_dim=self.artifacts['embedding_size'],
        enc_units=self.artifacts['bottleneck_units'],
        batch_size=self.artifacts['batch_size'])
    # Call the model first to create the variables
    sample_hidden = self.encoder.initialize_hidden_state()
    sample_output, sample_hidden = self.encoder(
        tf.zeros((self.artifacts['batch_size'], train_seq.numpy().shape[1])),
        sample_hidden)
    self.encoder.load_weights(
        os.path.join(model_dir,
                     f'encoder_weights_e{self.artifacts["epochs"]}.h5'))
    print(self.encoder.summary())

    vocab_tgt = vectorizer_tgt.get_vocabulary()
    self.decoder = MyDecoder(
        len(vocab_tgt),
        embedding_dim=self.artifacts['embedding_size'],
        dec_units=self.artifacts['bottleneck_units'],
        batch_size=self.artifacts['batch_size'])
    # Call the model first to create the variables
    _ = self.decoder(tf.random.uniform((self.artifacts['batch_size'], 1)),
                     sample_hidden, sample_output)
    self.decoder.load_weights(
        os.path.join(model_dir,
                     f'decoder_weights_e{self.artifacts["epochs"]}.h5'))
    print(self.decoder.summary())
def get_base_embedder(self, vocab):
    # Text-to-index-list layer
    encoder = TextVectorization(
        max_tokens=self.max_tokens,
        output_mode="int",
        output_sequence_length=self.max_sentence_length,
        vocabulary=vocab,
    )

    # Embedder model
    # Text input layer
    input_layer = Input(shape=(1,), dtype=tf.string)
    # Text to index layer
    vectorize_layer = encoder(input_layer)
    # Embedding layer
    embedding_layer = Embedding(input_dim=len(encoder.get_vocabulary()),
                                output_dim=32,
                                mask_zero=True)(vectorize_layer)
    # Bidirectional LSTM layer
    bi_lstm_layer = Bidirectional(
        LSTM(32, name="lstm-layer"),
        name="bidirectional-layer")(embedding_layer)
    # Normalization layer
    norm_layer = BatchNormalization()(bi_lstm_layer)
    # Final embedding layer
    embedding = Dense(self.embedding_size, name="embedding-layer")(norm_layer)
    return Model(inputs=input_layer, outputs=embedding)
def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
    """Build Text vectorization layer

    Args:
        texts (list): List of strings, i.e., input texts
        vocab_size (int): vocab size
        max_seq (int): Maximum sequence length.
        special_tokens (list, optional): List of special tokens. Defaults to ['[MASK]'].

    Returns:
        layers.Layer: Return TextVectorization Keras Layer
    """
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert mask token in vocabulary
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2:vocab_size - len(special_tokens)] + ["[mask]"]
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
def get_vectorize_layer(self, texts, special_tokens=["mask"]):
    """Build Text vectorization layer

    Args:
        texts (list): List of strings, i.e., input texts
        special_tokens (list, optional): List of special tokens. Defaults to ['mask'].
            Vocab size and maximum sequence length come from self.config.

    Returns:
        layers.Layer: Return TextVectorization Keras Layer
    """
    vectorize_layer = TextVectorization(
        max_tokens=self.config.VOCAB_SIZE,
        output_mode="int",
        ngrams=None,
        standardize="lower_and_strip_punctuation",
        output_sequence_length=self.config.MAX_LEN,
    )
    vectorize_layer.adapt(texts)

    # Insert mask token in vocabulary
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2:self.config.VOCAB_SIZE - len(special_tokens)] + ["mask"]
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
def load_dataset(self):
    raw_train_ds, raw_val_ds, raw_test_ds = self.load_raw_data()
    max_features = 10000
    sequence_length = 250
    vectorize_layer = TextVectorization(
        standardize=self.custom_standardization,
        max_tokens=self.max_tokens,
        output_mode='int',
        output_sequence_length=sequence_length)

    # Make a text-only dataset (without labels), then call adapt
    train_text = raw_train_ds.map(lambda x, y: x)
    vectorize_layer.adapt(train_text)
    self.vocab = vectorize_layer.get_vocabulary()

    train_ds = raw_train_ds.map(
        lambda x, y: (vectorize_layer(tf.expand_dims(x, -1)), y))
    val_ds = raw_val_ds.map(
        lambda x, y: (vectorize_layer(tf.expand_dims(x, -1)), y))
    test_ds = raw_test_ds.map(
        lambda x, y: (vectorize_layer(tf.expand_dims(x, -1)), y))

    train_ds = train_ds.cache().prefetch(buffer_size=self.AUTOTUNE)
    val_ds = val_ds.cache().prefetch(buffer_size=self.AUTOTUNE)
    test_ds = test_ds.cache().prefetch(buffer_size=self.AUTOTUNE)
    return train_ds, val_ds, test_ds
def get_vectorize_layer(texts, vocab_size, max_seq):
    """Build Text vectorization layer

    Args:
        texts (list): List of strings, i.e., input texts
        vocab_size (int): vocab size
        max_seq (int): Maximum sequence length.

    Returns:
        layers.Layer: Return TextVectorization Keras Layer
    """
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert mask token in vocabulary
    vocab = vectorize_layer.get_vocabulary()
    # print("len(vocab):", len(vocab))  # 177
    # vocab: ['', '[UNK]', 'the', 'and', 'a', 'of', ...] all lower-case
    # GJ20: where do the empty string and [UNK] come from?
    #       They are created by adapt() as words 0 and 1:
    #       '' is the padding token; [UNK] is the OOV token.
    vocab = vocab[2:len(vocab) - 1] + ["[mask]"]
    # print("len(vocab):", len(vocab))  # 175
    # GJ20: anyway, the first 2 words are removed and '[mask]' is added at the end
    vectorize_layer.set_vocabulary(vocab)  # '' and [UNK] are back in
    # vocab = vectorize_layer.get_vocabulary()
    # print("len(vocab):", len(vocab))  # 177
    # '[mask]' has been added as the last (least frequent) word in the vocab
    return vectorize_layer
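A short hedged follow-up (not from the original snippet) showing how the id of the inserted '[mask]' token can be recovered by passing the literal token through the returned layer; `texts` and `custom_standardization` are assumed to be defined as above, and the vocab_size/max_seq values are placeholders.

# Illustrative only; vocab_size and max_seq here are placeholder values.
vectorize_layer = get_vectorize_layer(texts, vocab_size=30000, max_seq=256)
mask_token_id = vectorize_layer(["[mask]"]).numpy()[0][0]
print("mask token id:", mask_token_id)  # '[mask]' sits at the end of the vocabulary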
class TFVectTokenizer:
    def __init__(self, seqlen, step, freq_threshold):
        self.freq_threshold = freq_threshold
        self.seqlen = seqlen
        self.step = step
        self.vocab_size = 20000
        self.vectorize_layer = TextVectorization(
            standardize=custom_standardization,
            max_tokens=self.vocab_size - 1,
            output_mode="int",
            output_sequence_length=self.seqlen + 1,
        )

    def tokenize(self, text_ds):
        # Adapt the vectorization layer to the text
        self.vectorize_layer.adapt(text_ds)
        vocab = self.vectorize_layer.get_vocabulary()  # to get words back from token indices
        reverse_token_map = {t: i for i, t in enumerate(vocab)}
        return text_ds, vocab, reverse_token_map

    def get_input_sequences(self, text_ds, reverse_token_map):
        text_ds = text_ds.map(
            lambda text: prepare_lm_inputs_labels(text, self.vectorize_layer))
        text_ds = text_ds.prefetch(tf.data.experimental.AUTOTUNE)
        return text_ds
def build_vocab(directories, batch_size, vocab_size, maxlen):
    global vectorize_layer

    # Create a list of all files
    filenames = []
    for dir in directories:
        for f in os.listdir(dir):
            filenames.append(os.path.join(dir, f))
    print(f"{len(filenames)} files")

    # Create dataset from text files
    random.shuffle(filenames)
    text_ds = tf.data.TextLineDataset(filenames)
    text_ds = text_ds.shuffle(buffer_size=256)
    text_ds = text_ds.batch(batch_size)

    # Create vectorization layer and adapt it to the text
    vectorize_layer = TextVectorization(
        standardize=custom_standardization,
        max_tokens=vocab_size - 1,
        output_mode="int",
        output_sequence_length=maxlen + 1,
    )
    vectorize_layer.adapt(text_ds)
    vocab = vectorize_layer.get_vocabulary()

    word_to_index = {}
    for index, word in enumerate(vocab):
        word_to_index[word] = index

    text_ds = text_ds.map(prepare_lm_inputs_labels)
    text_ds = text_ds.prefetch(tf.data.experimental.AUTOTUNE)
    return (text_ds, vocab, word_to_index)
def build_model(train_dataset: PrefetchDataset) -> Sequential:
    """
    Initializes a Sequential model and adds text vectorization, word embedding,
    LSTM, and densely connected layers.
    :param train_dataset: The dataset to adapt the vocabulary on.
    :return: A Sequential object.
    """
    # Initialize the TextVectorization layer which assigns integers to each token
    encoder = TextVectorization(max_tokens=VOCAB_SIZE)
    # Set the vocabulary for the encoding layer. This will be used to initialize a
    # lookup table of word embeddings.
    # The code for this and subsequent layers adapted from:
    # https://www.tensorflow.org/tutorials/text/text_classification_rnn#create_the_text_encoder
    encoder.adapt(train_dataset.map(lambda text, label: text))

    model = Sequential()
    model.add(encoder)
    # Next we add our word embedding layer which converts token indices into dense vectors
    model.add(Embedding(input_dim=len(encoder.get_vocabulary()),
                        output_dim=8,
                        activity_regularizer=l2(0.001),
                        mask_zero=True))
    # Bidirectional wrapper for LSTM allows data to be processed forwards and
    # backwards and then concatenated into one output
    model.add(Bidirectional(LSTM(8)))
    # Densely connected layers with L2 regularization to reduce over-fitting
    model.add(Dense(8, activation="relu",
                    kernel_regularizer=l2(0.001),
                    activity_regularizer=l2(0.001)))
    model.add(Dense(1, activation="sigmoid"))
    return model
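A brief usage sketch for `build_model`; the loss, optimizer, and epoch count are assumptions rather than values taken from the original code.

# Assumes train_dataset is a batched (text, label) tf.data pipeline.
model = build_model(train_dataset)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_dataset, epochs=10)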
def initialize_vectorizer_layer(text, pad_length, max_tokens=None):
    # Create vectorizer
    vectorizer = TextVectorization(output_sequence_length=pad_length,
                                   standardize=None,
                                   max_tokens=max_tokens)
    vectorizer.adapt(text)
    vocab = vectorizer.get_vocabulary()
    return vectorizer, vocab
def vectorizer(raw_train_ds):
    vectorizer = TextVectorization(standardize=custom_standardization,
                                   max_tokens=max_tokens,
                                   output_sequence_length=sequence_length)
    text_ds = raw_train_ds.map(lambda x, y: x)
    vectorizer.adapt(text_ds)
    np.savetxt('voc.out', vectorizer.get_vocabulary(), fmt='%s')
    return vectorizer
def main():
    train_dataset, test_dataset = generate_data()
    train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    encoder = TextVectorization(max_tokens=VOCAB_SIZE)
    encoder.adapt(train_dataset.map(lambda text, label: text))
    vocab = np.array(encoder.get_vocabulary())
    # print(vocab[:20])

    LSTM_model(train_dataset, test_dataset, encoder)
def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert mask token in vocabulary.
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2:vocab_size - len(special_tokens)] + ["[mask]"]
    # vocab = vocab[2:vocab_size - len(special_tokens)] + special_tokens
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
class DataPrepare:
    def __init__(self):
        vocab_size = 50000
        batch_size = 128
        maxlen = 5
        filenames = ["test.txt"]

        self.text_ds = tf.data.TextLineDataset(filenames)
        self.text_ds = self.text_ds.shuffle(buffer_size=256)
        self.text_ds = self.text_ds.batch(batch_size)

        self.vectorize_layer = TextVectorization(
            standardize=self.custom_standardization,
            max_tokens=vocab_size - 1,
            output_mode="int",
            output_sequence_length=maxlen + 1,
        )
        self.vectorize_layer.adapt(self.text_ds)
        self.vocab = self.vectorize_layer.get_vocabulary()  # to get words back from token indices

        self.text_ds = self.text_ds.map(self.prepare_lm_inputs_labels)
        self.text_ds = self.text_ds.prefetch(tf.data.experimental.AUTOTUNE)

    def custom_standardization(self, input_string):
        """Remove html line-break tags and handle punctuation"""
        lowercased = tf.strings.lower(input_string)
        stripped_html = tf.strings.regex_replace(lowercased, "<br />", " ")
        return tf.strings.regex_replace(stripped_html,
                                        f"([{string.punctuation}])", r" \1")

    def prepare_lm_inputs_labels(self, text):
        """
        Shift word sequences by 1 position so that the target for position (i)
        is the word at position (i+1). The model will use all words up till
        position (i) to predict the next word.
        """
        text = tf.expand_dims(text, -1)
        tokenized_sentences = self.vectorize_layer(text)
        x = tokenized_sentences[:, :-1]
        y = tokenized_sentences[:, 1:]
        return x, y

    def get_dataset(self):
        return self.text_ds

    def get_vocab(self):
        return self.vocab

    def get_vectorize_layer(self):
        return self.vectorize_layer
def vectorize(self):
    vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
    text_ds = tf.data.Dataset.from_tensor_slices(self.train_samples).batch(128)
    vectorizer.adapt(text_ds)

    self.train_X = vectorizer(np.array([[s] for s in self.train_samples])).numpy()
    self.test_X = vectorizer(np.array([[s] for s in self.val_samples])).numpy()
    self.train_y = np.array(self.train_labels)
    self.test_y = np.array(self.val_labels)
    return vectorizer.get_vocabulary()
def get_model() -> Tuple[Sequential, Callable[[str, bool], Optional[list]]]:
    importedModel = tf.keras.models.load_model('savedModel/data')

    max_features = 10000  # max number of distinct words to be extracted from the dataset
    sequence_length = 250  # size of output sequence, constant regardless of number of tokens in a sample
    batch_size = 32
    seed = 42

    raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
        'aclImdb/train',
        batch_size=batch_size,
        validation_split=0.2,
        subset='training',
        seed=seed)

    vectorization_layer = TextVectorization(
        standardize=lowercase_and_html_escape,
        max_tokens=max_features,
        output_mode='int',
        output_sequence_length=sequence_length)

    train_text_without_labels = raw_train_ds.map(lambda text, label: text)
    vectorization_layer.adapt(
        train_text_without_labels)  # build a vocabulary of distinct words from the training set

    weights = importedModel.layers[0].get_weights()[0]
    vocabulary = vectorization_layer.get_vocabulary()
    dense_layer_weights = importedModel.layers[4].get_weights()[0]
    get_most_influential_words = most_influential_words_factory(
        weights, vocabulary, dense_layer_weights)

    model = tf.keras.Sequential(
        [vectorization_layer, importedModel, layers.Activation('sigmoid')])
    model.compile(loss=losses.BinaryCrossentropy(from_logits=False),
                  optimizer="adam",
                  metrics=['accuracy'])
    return model, get_most_influential_words
class BiLSTMModel(BaseModel):
    def __init__(self):
        self.vocab_size = 2000
        self.encoder = TextVectorization(max_tokens=self.vocab_size)
        self.le = LabelEncoder()
        self.le.fit(['neu', 'neg', 'pos'])

    def train(self, X_train, Y_train):
        Y_train = to_categorical(self.le.transform(Y_train), 3)
        self.encoder.adapt(X_train)
        self.model = Sequential([
            self.encoder,
            Embedding(input_dim=len(self.encoder.get_vocabulary()),
                      output_dim=64,
                      mask_zero=True),
            Bidirectional(LSTM(64, return_sequences=True)),
            Bidirectional(LSTM(32)),
            Dense(64, activation='relu'),
            Dropout(0.5),
            Dense(3, activation='softmax')
        ])
        self.model.compile(loss=CategoricalCrossentropy(),
                           optimizer=Adam(1e-4),
                           metrics=['accuracy'])
        self.model.fit(x=X_train, y=Y_train, epochs=10)

    def analyze(self, X_test, Y_test):
        self.model.summary()
        Y_test = to_categorical(self.le.transform(Y_test), 3)
        test_loss, test_acc = self.model.evaluate(x=X_test, y=Y_test)
        print(f'{self.__class__.__name__} Accuracy: {test_acc} Loss: {test_loss}')

    def predict(self, texts):
        texts = [text_cleaner(text) for text in texts]
        p = self.model.predict(texts)
        y_classes = [np.argmax(y, axis=None, out=None) for y in p]
        return self.le.inverse_transform(y_classes)
def load_cnn_dailymail_experiment(batch_size=1, max_vocab=5000, max_sequence=400):
    (_, _, ds_val), ds_info = tfds.load("cnn_dailymail",
                                        split=['train', 'test', 'validation'],
                                        shuffle_files=True,
                                        as_supervised=True,
                                        with_info=True)

    int_vectorize = TextVectorization(max_tokens=max_vocab,
                                      output_mode='int',
                                      output_sequence_length=max_sequence,
                                      standardize=standardize)

    ds_val = ds_val.map(remove_newline,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # ds_val = ds_val.map(add_EOS_BOS, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # "Train" the vectorization layer on validation articles. *For debugging purposes only!!*
    int_vectorize.adapt(ds_val.map(lambda article, highlights: article))

    def int_vectorize_map(article, highlights, original_article, original_highlights):
        article = tf.expand_dims(article, -1)
        highlights = tf.expand_dims(highlights, -1)
        return (int_vectorize(article), int_vectorize(highlights),
                original_article, original_highlights)

    ds_val = ds_val.map(duplicate_originals,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.map(int_vectorize_map,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.map(standardize_map,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.shuffle(ds_info.splits['validation'].num_examples)
    ds_val = ds_val.batch(batch_size=batch_size)
    ds_val = ds_val.prefetch(tf.data.experimental.AUTOTUNE)
    return ds_val, None, None, int_vectorize.get_vocabulary()
def gen_vocab(data, max_tokens=200000):
    """
    Helper function to generate the vocab for embedding. By default this will
    limit the vocabulary to the top 200,000 tokens.

    Parameters
    ----------
    data : dataset from the pipeline.

    Returns
    -------
    vocab : vocabulary computed by the vectorizer
    vectorizer : vectorizer for encoding x_train and y_train words
    """
    vectorizer = TextVectorization(max_tokens=max_tokens,
                                   output_sequence_length=200)
    text_ds = tf.data.Dataset.from_tensor_slices(data).batch(128)
    vectorizer.adapt(text_ds)
    vocab = vectorizer.get_vocabulary()
    return vocab, vectorizer
def get_vectorize_layer(texts, max_seq=20, special_tokens=["x"]):
    """Build Text vectorization layer

    Args:
        texts (list): List of strings, i.e., input texts
        max_seq (int): Maximum sequence length.
        special_tokens (list, optional): List of special tokens. Defaults to ['x'].

    Returns:
        layers.Layer: Return TextVectorization Keras Layer
    """
    vectorize_layer = TextVectorization(
        output_mode="int",
        split=char_split,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert mask token in vocabulary
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2:] + ["x"]
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
def vectorize_text(self, text_dataset, debug=False):
    """
    Adapt a TextVectorization layer to the dataset and return the tokenized sequences.

    :param text_dataset: tf.data dataset of raw text lines
    :return: list of integer token sequences
    """
    vectorize_layer = TextVectorization(
        standardize=self.custom_standardization,
        max_tokens=self.vocab_size,
        output_mode='int',
        output_sequence_length=self.sequence_len)
    vectorize_layer.adapt(text_dataset.batch(self.batch_size))
    self.vocab = vectorize_layer.get_vocabulary()

    text_vector_ds = (text_dataset.batch(self.batch_size)
                      .prefetch(AUTOTUNE)
                      .map(vectorize_layer)
                      .unbatch())
    sequences = list(text_vector_ds.as_numpy_iterator())

    if debug:
        print(f"====>>>> length of sequences: {len(sequences)}")
        for seq in sequences[:5]:
            print(f"====>>>> {seq} => {[self.vocab[i] for i in seq]}")
    return sequences
vectorize_layer.adapt(train_text)


def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label


# Retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

print("1287 ---> ", vectorize_layer.get_vocabulary()[1287])
print(" 313 ---> ", vectorize_layer.get_vocabulary()[313])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

embedding_dim = 16
Our layer will only consider the top 20,000 words, and will truncate or pad
sequences to be exactly 200 tokens long.
"""

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)

"""
You can retrieve the computed vocabulary via `vectorizer.get_vocabulary()`.
Let's print the top 5 words:
"""

vectorizer.get_vocabulary()[:5]

"""
Let's vectorize a test sentence:
"""

output = vectorizer([["the cat sat on the mat"]])
output.numpy()[0, :6]

"""
As you can see, "the" gets represented as "2". Why not 0, given that "the" was
the first word in the vocabulary? That's because index 0 is reserved for padding
and index 1 is reserved for "out of vocabulary" tokens.

Here's a dict mapping words to their indices:
"""

voc = vectorizer.get_vocabulary()
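"""
A minimal sketch (an assumed continuation, not part of the excerpt above) of that
word-to-index dict, built from `voc`:
"""

word_index = dict(zip(voc, range(len(voc))))
# e.g. word_index["the"] should be 2, matching the vectorized sentence above.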
    return {'batch_size': self.batch_size, 'dec_units': self.dec_units}


# Test
if __name__ == '__main__':
    # Target text that we will eventually want to decode to;
    # see the previous task for the source text
    spanish_text = [
        'Pidan, y se les dará',
        'busquen, y encontrarán',
        'llamen, y se les abrirá.'
    ]
    texts_delimited = [f'{START_TOKEN} {t} {END_TOKEN}' for t in spanish_text]

    vectorizer = TextVectorization()
    vectorizer.adapt(texts_delimited)
    vocab = vectorizer.get_vocabulary()
    print('Vocabulary', vocab)
    print('Vocabulary size', len(vocab))
    print('========================')
    print('Vectorized texts')
    sequences = vectorizer(texts_delimited)
    print(sequences)

    sample_encoder_output = np.array(
        [[-0.00256194], [-0.00898881], [-0.00391034]], dtype=np.float32)
    sample_encoder_hidden = np.array(
        [[-0.00156194], [0.00020050], [-0.00095034]], dtype=np.float32)

    decoder = MyDecoder(len(vocab), embedding_dim=EMBEDDING_SIZE,
def load_cnn_dailymail_deep(batch_size=1, max_vocab=5000, max_sequence=400):
    (ds_train, ds_test, ds_val), ds_info = tfds.load(
        "cnn_dailymail",
        split=['train', 'test', 'validation'],
        shuffle_files=False,
        as_supervised=True,
        with_info=True)

    ds_train = ds_train.take(20000)
    ds_test = ds_test.take(1000)
    ds_val = ds_val.take(100)

    int_vectorize = TextVectorization(max_tokens=max_vocab,
                                      output_mode='int',
                                      output_sequence_length=max_sequence,
                                      standardize=standardize)

    ds_train = ds_train.map(remove_newline,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # "Train" the vectorization layer on training articles.
    int_vectorize.adapt(ds_train.map(lambda article, highlights: article))

    def int_vectorize_map(article, highlights, original_article, original_highlights):
        article = tf.expand_dims(article, -1)
        highlights = tf.expand_dims(highlights, -1)
        return (int_vectorize(article), int_vectorize(highlights),
                original_article, original_highlights)

    ds_train = ds_train.map(duplicate_originals,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.map(int_vectorize_map,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.map(standardize_map,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples)
    ds_train = ds_train.batch(batch_size=batch_size)
    ds_train = ds_train.prefetch(tf.data.experimental.AUTOTUNE)

    ds_test = ds_test.map(remove_newline,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_test = ds_test.map(duplicate_originals,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_test = ds_test.map(int_vectorize_map,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_test = ds_test.map(standardize_map,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_test = ds_test.batch(batch_size=batch_size)
    ds_test = ds_test.cache()
    ds_test = ds_test.prefetch(tf.data.experimental.AUTOTUNE)

    ds_val = ds_val.map(remove_newline,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.map(duplicate_originals,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.map(int_vectorize_map,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.map(standardize_map,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.shuffle(ds_info.splits['validation'].num_examples)
    ds_val = ds_val.batch(batch_size=batch_size)
    ds_val = ds_val.prefetch(tf.data.experimental.AUTOTUNE)

    return ds_train, ds_val, ds_test, int_vectorize.get_vocabulary()
caption_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss=cross_entropy)

# Fit the model
caption_model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=valid_dataset,
    callbacks=[early_stopping],
)

"""
## Check sample predictions
"""

vocab = vectorization.get_vocabulary()
index_lookup = dict(zip(range(len(vocab)), vocab))
max_decoded_sentence_length = SEQ_LENGTH - 1
valid_images = list(valid_data.keys())


def generate_caption():
    # Select a random image from the validation dataset
    sample_img = np.random.choice(valid_images)

    # Read the image from the disk
    sample_img = read_image(sample_img)
    img = sample_img.numpy().astype(np.uint8)
    plt.imshow(img)
    plt.show()
    return int_vectorize_layer(text), label


# Retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, label_batch = next(iter(raw_train_ds))
first_question, first_label = text_batch[0], label_batch[0]
print("Question", first_question)
print("Label", first_label)

print("'binary' vectorized question:",
      binary_vectorize_text(first_question, first_label)[0])
print("'int' vectorized question:",
      int_vectorize_text(first_question, first_label)[0])

print("1289 ---> ", int_vectorize_layer.get_vocabulary()[1289])
print("313 ---> ", int_vectorize_layer.get_vocabulary()[313])
print("Vocabulary size: {}".format(len(int_vectorize_layer.get_vocabulary())))

binary_train_ds = raw_train_ds.map(binary_vectorize_text)
binary_val_ds = raw_val_ds.map(binary_vectorize_text)
binary_test_ds = raw_test_ds.map(binary_vectorize_text)

int_train_ds = raw_train_ds.map(int_vectorize_text)
int_val_ds = raw_val_ds.map(int_vectorize_text)
int_test_ds = raw_test_ds.map(int_vectorize_text)

AUTOTUNE = tf.data.experimental.AUTOTUNE


def configure_dataset(dataset):
def prepare_model_input(self):
    """
    Reads and parses MIDI files, then saves the note sequences as text files in
    the train and val directories. Note that each song is treated as one long
    sentence, which has implications for the max_sequence_length argument passed
    to the transformer. In the future, it may be worth exploring how to break up
    the notes into sentence-like chunks, perhaps by using bars like MuseGAN.

    :return: TF TextLineDataset object with x and y mapped.
    """
    # If the train and val directories do not exist, create them
    cwd = os.getcwd()
    data_dir = cwd + "/maestro-v3.0.0-midi/maestro-v3.0.0/"
    Path(cwd + "/clean/train").mkdir(parents=True, exist_ok=True)
    Path(cwd + "/clean/val").mkdir(parents=True, exist_ok=True)

    # Read Maestro metadata to Pandas, shuffle, then determine the train/test split
    metadata = pd.read_csv(data_dir + "maestro-v3.0.0.csv")
    metadata = metadata.sample(frac=1.0)
    train_val_split_index = int(len(metadata) * 0.8)

    # Only parse MIDI files if the clean train or val dirs are empty
    if len(os.listdir('clean/train')) == 0 or len(os.listdir('clean/val')) == 0:
        # Use the Pandas df to determine which parsed files to store in each folder
        for file_idx, file in enumerate(metadata.midi_filename.to_list()):
            # Prepend the data directory to the file path
            file = data_dir + file
            print("Parsing MIDI file:", file_idx, "/", len(metadata))
            notes_list, offsets = self.parse_midi_file(file)
            offsets_from_prior_note = self.offsets_relative_to_prior_note(
                offsets=offsets)
            if file_idx <= train_val_split_index:
                path_prefix = "clean/train/"
            else:
                path_prefix = "clean/val/"
            # Save each parsed sequence of notes as a string in a txt file
            with open(
                    path_prefix + file[file.rindex("/") + 1:file.rindex(".")] +
                    ".txt", "w") as text_file:
                text_file.write(' '.join(notes_list))

    # Walk through the directories
    filenames = []
    directories = [
        "clean/train",
        "clean/val",
    ]
    for dir in directories:
        for f in os.listdir(dir):
            filenames.append(os.path.join(dir, f))
    print(f"{len(filenames)} files")

    # Create a dataset from the text files
    random.shuffle(filenames)
    text_ds = tf.data.TextLineDataset(filenames)
    text_ds = text_ds.shuffle(buffer_size=256)
    text_ds = text_ds.batch(self.batch_size)

    # Create a vectorization layer and adapt it to the text
    vectorize_layer = TextVectorization(
        standardize=None,  # do not perform any pre-processing or cleaning
        max_tokens=self.vocab_size - 1,
        output_mode="int",
        output_sequence_length=self.max_sequence_len + 1,
    )
    vectorize_layer.adapt(text_ds)
    self.vocab = vectorize_layer.get_vocabulary()

    def create_x_and_y(text):
        """
        Shift word sequences by 1 position so that the target for position (i)
        is the word at position (i+1). The model will use all words up till
        position (i) to predict the next word.
        """
        text = tf.expand_dims(text, -1)
        tokenized_sentences = vectorize_layer(text)
        x = tokenized_sentences[:, :-1]
        y = tokenized_sentences[:, 1:]
        return x, y

    return text_ds.map(create_x_and_y)