def load_model(path='./model'):
    """Build the Transformer with the fixed training hyper-parameters and
    restore its weights from the latest checkpoint found under ``path``.

    Args:
        path: Directory that holds the checkpoint files (default './model').

    Returns:
        The Transformer instance with restored weights.

    Raises:
        FileNotFoundError: If no checkpoint exists under ``path``.
    """
    # These hyper-parameters must match the ones used at training time,
    # otherwise the checkpoint variables will not line up with the model.
    num_layers = 4
    d_model = 128
    dff = 512
    num_heads = 8
    # +2 reserves ids for the start/end tokens added during encoding.
    # NOTE(review): OLD_VOCAB_SIZE / KOR_VOCAB_SIZE are assumed to be
    # module-level constants — confirm they are defined in this file.
    added_input_size = OLD_VOCAB_SIZE + 2
    added_target_size = KOR_VOCAB_SIZE + 2
    dropout_rate = 0.1

    transformer = Transformer(num_layers, d_model, num_heads, dff,
                              added_input_size, added_target_size,
                              pe_input=added_input_size,
                              pe_target=added_target_size,
                              rate=dropout_rate)

    # Only the model is tracked: inference does not need optimizer state.
    # (The original built an unused CustomSchedule/Adam optimizer here;
    # it was never referenced again, so it has been removed.)
    ckpt = tf.train.Checkpoint(transformer=transformer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, path, max_to_keep=5)

    # if a checkpoint exists, restore the latest checkpoint.
    if ckpt_manager.latest_checkpoint:
        # expect_partial(): silence warnings about checkpoint values (e.g.
        # optimizer slots) this model-only Checkpoint does not consume.
        ckpt.restore(ckpt_manager.latest_checkpoint).expect_partial()
        print('Model Load completed!!')
        return transformer

    print('Model Load Fail..')
    # Raise an instance with a message instead of the bare class so callers
    # see which directory was searched.
    raise FileNotFoundError('no checkpoint found in {!r}'.format(path))
def load_transformer_model(user_config, tokenizer_inp, tokenizer_tar):
    """Build the Transformer described by ``user_config`` and restore the
    newest checkpoint (model + optimizer) so training can continue.

    Returns:
        (transformer_model, optimizer, checkpoint_manager)
    """
    # Vocabulary sizes come straight from the fitted tokenizers.
    size_inp = tokenizer_inp.vocab_size
    size_tar = tokenizer_tar.vocab_size

    lang_inp = user_config["inp_language"]
    lang_tar = user_config["target_language"]

    # Optionally seed the embedding layers with pre-trained matrices.
    if user_config["use_pretrained_emb"]:
        emb_inp = np.load(
            user_config["pretrained_emb_path_{}".format(lang_inp)])
        emb_tar = np.load(
            user_config["pretrained_emb_path_{}".format(lang_tar)])
    else:
        emb_inp, emb_tar = None, None

    # custom learning schedule
    schedule = CustomSchedule(user_config["transformer_model_dimensions"])
    optimizer = tf.keras.optimizers.Adam(schedule, beta_1=0.9, beta_2=0.98,
                                         epsilon=1e-9)

    transformer_model = Transformer(
        user_config["transformer_num_layers"],
        user_config["transformer_model_dimensions"],
        user_config["transformer_num_heads"],
        user_config["transformer_dff"],
        size_inp,
        size_tar,
        en_input=size_inp,
        fr_target=size_tar,
        rate=user_config["transformer_dropout_rate"],
        weights_inp=emb_inp,
        weights_tar=emb_tar)

    # Track both the model and the optimizer so resuming is seamless.
    checkpoint = tf.train.Checkpoint(transformer=transformer_model,
                                     optimizer=optimizer)
    manager = tf.train.CheckpointManager(
        checkpoint, user_config["transformer_checkpoint_path"],
        max_to_keep=10)

    # if a checkpoint exists, restore the latest checkpoint.
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('Latest checkpoint restored from path {}'.format(
            manager.latest_checkpoint))

    return transformer_model, optimizer, manager
def restore_model(self, checkpoint_path, num_layers, d_model, num_heads, dff, dropout_rate):
    """Recreate the translation Transformer and load its newest checkpoint.

    The restored model ends up in ``self.translate_transformer``; the
    optimizer is rebuilt only so the checkpoint variables line up.
    """
    # Build the (untrained) model structure first.
    self.translate_transformer = Transformer(num_layers, d_model, num_heads,
                                             dff, self.input_vocab_size,
                                             self.target_vocab_size,
                                             dropout_rate)

    # Same optimizer configuration as at training time.
    schedule = CustomSchedule(d_model)
    adam = tf.keras.optimizers.Adam(schedule, beta_1=0.9, beta_2=0.98,
                                    epsilon=1e-9)

    checkpoint = tf.train.Checkpoint(transformer=self.translate_transformer,
                                     optimizer=adam)
    manager = tf.train.CheckpointManager(checkpoint, checkpoint_path,
                                         max_to_keep=5)

    # if a checkpoint exists, restore the latest checkpoint.
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('Latest checkpoint restored!')
# NOTE(review): this chunk begins INSIDE tokenize_and_filter(...) — its `def`
# line is above the visible region, so only the tail is shown here.
    # Pad the answer token sequences with trailing zeros up to MAX_LENGTH so
    # every row in a batch has the same width ('post' keeps tokens left-aligned).
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_outputs, maxlen=MAX_LENGTH, padding='post')

    return tokenized_inputs, tokenized_outputs


# Tokenize/filter the whole corpus once, then persist the vocabulary so the
# same subword mapping can be reloaded at inference time.
questions, answers = tokenize_and_filter(questions, answers)
tokenizer.save_to_file(os.path.join(path_work, 'vocab'))

#-----------------
# model
#-----------------
# loss and learning rate definition
learning_rate = CustomSchedule(D_MODEL)
# NOTE(review): the warm-up schedule above is computed but NOT passed to the
# optimizer — the string below is the disabled Adam call that used it;
# confirm which configuration is intended.
'''optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)'''
optimizer = tf.keras.optimizers.Adam()

# Encoder and decoder each consume a variable-length integer token sequence.
inputs_enc = Input(shape=(None, ))
inputs_dec = Input(shape=(None, ))

# vocab_size, word_emb_dim, nb_layers, nb_heads, seq_max_length, dropout, mask
outputs = Transformer(VOCAB_SIZE, D_MODEL, 4, 8, MAX_LENGTH, 0.1, True)(inputs_enc, inputs_dec, True)
model = Model(inputs=[inputs_enc, inputs_dec], outputs=outputs)

# NOTE(review): the chunk is truncated here — the compile(...) call continues
# beyond the visible region.
model.compile(optimizer=optimizer, loss=loss_function,
def train():
    """Train a de->en Transformer, optionally jointly with a reverse en->de
    model (USE_RTL) plus a round-trip consistency loss weighted by LAMBDA.

    Relies on module-level configuration defined elsewhere in this file
    (MAX_LENGTH, TRAIN_ON, DICT_SIZE, BATCH_SIZE, BUFFER_SIZE, EPOCHS,
    USE_RTL, LAMBDA, model sizes, paths, log dirs) — confirm before reuse.
    """
    # region Functions
    def loss_function(real, pred):
        # Masked cross-entropy: positions where real == 0 (padding) are
        # zeroed out before averaging.
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = loss_object(real, pred)
        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask
        # NOTE(review): the mean is taken over ALL positions (padding included
        # in the denominator); dividing by the mask sum is the common
        # alternative — confirm intended.
        return tf.reduce_mean(loss_)

    def filter_max_length(x, y, max_length=MAX_LENGTH):
        """Keep only sequence pairs where both x and y are <= max_length."""
        return tf.logical_and(tf.size(x) <= max_length,
                              tf.size(y) <= max_length)

    def encode(lang1, lang2):
        # Subword-encode both sides and wrap them in start/end token ids
        # (vocab_size and vocab_size + 1 respectively).
        lang1 = [tokenizer_de.vocab_size] + tokenizer_de.encode(
            lang1.numpy()) + [tokenizer_de.vocab_size + 1]
        lang2 = [tokenizer_en.vocab_size] + tokenizer_en.encode(
            lang2.numpy()) + [tokenizer_en.vocab_size + 1]
        return lang1, lang2

    def tf_encode(de, en):
        # Wrap the eager `encode` so it can run inside a tf.data pipeline.
        return tf.py_function(encode, [de, en], [tf.int64, tf.int64])
    # endregion

    # region Create tokenizers
    # read previously created tokenizers if they exist
    if TRAIN_ON < 100:
        print('Creating new tokenizers')
        tag_new_tok = 'to' + str(TRAIN_ON)
    else:
        tag_new_tok = ''
    # NOTE(review): `&` is a bitwise AND on two bools here — it works, but
    # `and` would be the idiomatic (short-circuiting) form.
    if (os.path.isfile(os.path.join(output_path, tag_new_tok + "tokenizer_en_"
                                    + str(DICT_SIZE) + ".subwords"))
            & os.path.isfile(os.path.join(output_path,
                                          tag_new_tok + "tokenizer_de_"
                                          + str(DICT_SIZE) + ".subwords"))):
        tokenizer_en = tfds.features.text.SubwordTextEncoder.load_from_file(
            os.path.join(output_path,
                         tag_new_tok + "tokenizer_en_" + str(DICT_SIZE)))
        tokenizer_de = tfds.features.text.SubwordTextEncoder.load_from_file(
            os.path.join(output_path,
                         tag_new_tok + "tokenizer_de_" + str(DICT_SIZE)))
    else:
        # create tokenizers from scratch
        examples, metadata = tfds.load('wmt14_translate/de-en',
                                       data_dir=data_path, with_info=True,
                                       as_supervised=True)
        train_examples, val_examples = examples['train'], examples['validation']
        # English tokenizer
        tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
            (en.numpy() for de, en in train_examples),
            target_vocab_size=DICT_SIZE)
        tokenizer_en.save_to_file(
            os.path.join(output_path,
                         tag_new_tok + "tokenizer_en_" + str(DICT_SIZE)))
        # German tokenizer
        tokenizer_de = tfds.features.text.SubwordTextEncoder.build_from_corpus(
            (de.numpy() for de, en in train_examples),
            target_vocab_size=DICT_SIZE)
        tokenizer_de.save_to_file(
            os.path.join(output_path,
                         tag_new_tok + "tokenizer_de_" + str(DICT_SIZE)))
    # +2 for the start/end tokens added in `encode`.
    input_vocab_size = tokenizer_de.vocab_size + 2
    target_vocab_size = tokenizer_en.vocab_size + 2
    # endregion

    # region Prepare Train dataset
    # Use only the first TRAIN_ON percent of the training split.
    split = tfds.Split.TRAIN.subsplit(tfds.percent[:TRAIN_ON])
    examples, metadata = tfds.load('wmt14_translate/de-en',
                                   data_dir=data_path, with_info=True,
                                   as_supervised=True,
                                   split=[split, 'validation'])
    train_examples, val_examples = examples[0], examples[1]
    # <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.string)>
    if len(include_backtrans_of_model) > 0:
        print('adding backtranslated train data sequences to training set')
        path2backtranslation = os.path.join(
            output_path,
            'results_backtrans_' + include_backtrans_of_model + '.csv')
        if not os.path.exists(path2backtranslation):
            raise Exception('First you need to create the backtranslated sequences in evaluate_transformer w option backtrans_train!')
        # NOTE(review): the file handle passed to read_csv is never closed.
        df_backtrans = pd.read_csv(open(path2backtranslation, 'r'),
                                   encoding='utf-8', engine='c', index_col=0)
        ar_backtrans_input = df_backtrans['input'].values  # Eng
        ar_backtrans_backtrans = df_backtrans['translation'].values.tolist()  # De
        # NOTE(review): `type(x) != 'str'` compares a type object against a
        # string LITERAL and is therefore always True (no filtering happens);
        # `not isinstance(x, str)` was likely intended.
        ar_backtrans_input = [str(x) for x in ar_backtrans_input
                              if type(x) != 'str']
        ar_backtrans_backtrans = [str(x) for x in ar_backtrans_backtrans
                                  if type(x) != 'str']
        train_backtrans_input = tf.data.Dataset.from_tensor_slices(
            ar_backtrans_input)  # , dtype=tf.string)  # Eng
        train_backtrans_backtrans = tf.data.Dataset.from_tensor_slices(
            ar_backtrans_backtrans)  # De
        # De, Eng
        train_backtrans = tf.data.Dataset.zip(
            (train_backtrans_backtrans, train_backtrans_input))
        # <ZipDataset shapes: ((), ()), types: (tf.string, tf.string)>
        # merge train_backtrans with train_examples
        train_examples = train_examples.concatenate(train_backtrans)
        # <ConcatenateDataset shapes: ((), ()), types: (tf.string, tf.string)>
    train_dataset = train_examples.map(tf_encode)
    # <MapDataset shapes: (<unknown>, <unknown>), types: (tf.int64, tf.int64)>
    train_dataset = train_dataset.filter(filter_max_length)
    # cache the dataset to memory to get a speedup while reading from it.
    train_dataset = train_dataset.cache()
    train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(
        BATCH_SIZE, padded_shapes=([-1], [-1]))
    train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
    # <PrefetchDataset shapes: ((None, None), (None, None)), types: (tf.int64, tf.int64)>
    # endregion

    # region Prepare Validation dataset
    val_dataset = val_examples.map(tf_encode)
    val_dataset = val_dataset.filter(filter_max_length).padded_batch(
        BATCH_SIZE, padded_shapes=([-1], [-1]))
    # <PaddedBatchDataset shapes: ((None, None), (None, None)), types: (tf.int64, tf.int64)>
    # endregion

    # region Define Modelling setup
    learning_rate = CustomSchedule(d_model, warmup_steps=WARMUP_STEPS)
    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9,
                                         beta_2=0.98, epsilon=1e-9)
    # reduction='none' keeps per-token losses so loss_function can mask padding.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')
    val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)
    val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy('val_accuracy')
    # transformer1 translates de -> en.
    transformer1 = Transformer(num_layers, d_model, num_heads, dff,
                               input_vocab_size, target_vocab_size,
                               pe_input=input_vocab_size,
                               pe_target=target_vocab_size,
                               rate=dropout_rate)
    if USE_RTL:
        # transformer2 is the reverse (en -> de) model for round-trip training.
        transformer2 = Transformer(num_layers, d_model, num_heads, dff,
                                   target_vocab_size, input_vocab_size,
                                   pe_input=target_vocab_size,
                                   pe_target=input_vocab_size,
                                   rate=dropout_rate)
        ckpt = tf.train.Checkpoint(transformer1=transformer1,
                                   transformer2=transformer2,
                                   optimizer=optimizer)
    else:
        ckpt = tf.train.Checkpoint(transformer1=transformer1,
                                   optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path,
                                              max_to_keep=5)
    # endregion

    # region Train model
    # A named pretrained checkpoint takes precedence over resuming this run.
    if pretrained_name:
        latest = tf.train.latest_checkpoint(checkpoint_path_pretrained)
        if latest:
            ckpt.restore(latest)
            print('Pretrained model loaded from ' + latest + '.')
        else:
            raise Exception('Pretrained model not found.')
    elif ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest checkpoint restored.')

    # The @tf.function trace-compiles train_step into a TF graph for faster
    # execution. The function specializes to the precise shape of the argument
    # tensors. To avoid re-tracing due to the variable sequence lengths or
    # variable batch sizes (the last batch is smaller), use input_signature to
    # specify more generic shapes.
    train_step_signature = [
        tf.TensorSpec(shape=(None, None), dtype=tf.int64),
        tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    ]

    @tf.function(input_signature=train_step_signature)
    def train_step(inp, tar):
        # Teacher forcing: decoder input drops the last token, the target
        # drops the first; same shift is prepared for the reverse direction.
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]
        inp_inp = inp[:, :-1]
        inp_real = inp[:, 1:]
        enc_padding_mask1, combined_mask1, dec_padding_mask1 = create_masks(inp, tar_inp)
        enc_padding_mask2, combined_mask2, dec_padding_mask2 = create_masks(tar, inp_inp)
        with tf.GradientTape() as tape:
            predictions1, _ = transformer1(inp, tar_inp, True,
                                           enc_padding_mask1, combined_mask1,
                                           dec_padding_mask1)
            loss1 = loss_function(tar_real, predictions1)  # this is de->en
            if USE_RTL:
                predictions2, _ = transformer2(tar, inp_inp, True,
                                               enc_padding_mask2,
                                               combined_mask2,
                                               dec_padding_mask2)
                loss2 = loss_function(inp_real, predictions2)  # this is en->de
                loss = loss1 + loss2
                if LAMBDA > 0:
                    # Round-trip term: feed each model's greedy output back
                    # through the other direction.
                    predicted_id2 = tf.argmax(predictions2, axis=-1)  # find most likely token from logits
                    inp2 = tf.concat([inp[:, 0:1], predicted_id2], axis=-1)  # add start token. inp2 is \hat{s} in the paper
                    predicted_id1 = tf.argmax(predictions1, axis=-1)  # find most likely token from logits
                    tar2 = tf.concat([tar[:, 0:1], predicted_id1], axis=-1)  # add start token. tar2 is \hat{t} in the paper
                    enc_padding_mask3, combined_mask3, dec_padding_mask3 = create_masks(inp2, tar_inp)
                    enc_padding_mask4, combined_mask4, dec_padding_mask4 = create_masks(tar2, inp_inp)
                    predictions3, _ = transformer1(inp2, tar_inp, True,
                                                   enc_padding_mask3,
                                                   combined_mask3,
                                                   dec_padding_mask3)
                    loss3 = loss_function(tar_real, predictions3)  # predictions3 is \tilde{t} in the paper
                    predictions4, _ = transformer2(tar2, inp_inp, True,
                                                   enc_padding_mask4,
                                                   combined_mask4,
                                                   dec_padding_mask4)
                    loss4 = loss_function(inp_real, predictions4)  # predictions4 is \tilde{s} in the paper
                    loss += LAMBDA * (loss3 + loss4)
            else:
                loss = loss1
        if USE_RTL:
            # One backward pass updates both directions' variables.
            gradients = tape.gradient(loss,
                                      [transformer1.trainable_variables,
                                       transformer2.trainable_variables])
            optimizer.apply_gradients(
                zip(gradients[0] + gradients[1],
                    transformer1.trainable_variables
                    + transformer2.trainable_variables))
        else:
            gradients = tape.gradient(loss, transformer1.trainable_variables)
            optimizer.apply_gradients(zip(gradients,
                                          transformer1.trainable_variables))
        train_loss(loss)
        train_accuracy(tar_real, predictions1)

    def val_step(inp, tar):
        # Mirrors train_step without gradient updates.
        # NOTE(review): the models are still called with training=True here,
        # so dropout stays active during validation — confirm intended.
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]
        inp_inp = inp[:, :-1]
        inp_real = inp[:, 1:]
        enc_padding_mask1, combined_mask1, dec_padding_mask1 = create_masks(inp, tar_inp)
        enc_padding_mask2, combined_mask2, dec_padding_mask2 = create_masks(tar, inp_inp)
        predictions1, _ = transformer1(inp, tar_inp, True, enc_padding_mask1,
                                       combined_mask1, dec_padding_mask1)
        loss1 = loss_function(tar_real, predictions1)  # this is de->en
        if USE_RTL:
            predictions2, _ = transformer2(tar, inp_inp, True,
                                           enc_padding_mask2, combined_mask2,
                                           dec_padding_mask2)
            loss2 = loss_function(inp_real, predictions2)  # this is en->de
            predicted_id2 = tf.argmax(predictions2, axis=-1)  # find most likely token from logits
            inp2 = tf.concat([inp[:, 0:1], predicted_id2], axis=-1)  # add start token. inp2 is \hat{s} in the paper
            predicted_id1 = tf.argmax(predictions1, axis=-1)  # find most likely token from logits
            tar2 = tf.concat([tar[:, 0:1], predicted_id1], axis=-1)  # add start token. tar2 is \hat{t} in the paper
            enc_padding_mask3, combined_mask3, dec_padding_mask3 = create_masks(inp2, tar_inp)
            enc_padding_mask4, combined_mask4, dec_padding_mask4 = create_masks(tar2, inp_inp)
            predictions3, _ = transformer1(inp2, tar_inp, True,
                                           enc_padding_mask3, combined_mask3,
                                           dec_padding_mask3)
            loss3 = loss_function(tar_real, predictions3)  # predictions3 is \tilde{t} in the paper
            predictions4, _ = transformer2(tar2, inp_inp, True,
                                           enc_padding_mask4, combined_mask4,
                                           dec_padding_mask4)
            loss4 = loss_function(inp_real, predictions4)  # predictions4 is \tilde{s} in the paper
            loss = loss1 + loss2 + LAMBDA * (loss3 + loss4)
        else:
            loss = loss1
        val_loss(loss)
        val_accuracy(tar_real, predictions1)

    train_summary_writer = tf.summary.create_file_writer(train_log_dir)
    val_summary_writer = tf.summary.create_file_writer(val_log_dir)
    for epoch in range(EPOCHS):
        start = time.time()
        train_loss.reset_states()
        train_accuracy.reset_states()
        val_loss.reset_states()
        val_accuracy.reset_states()
        # inp -> german, tar -> english
        for (batch, (inp, tar)) in enumerate(train_dataset):
            train_step(inp, tar)
            if batch % 50 == 0:
                print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                    epoch + 1, batch, train_loss.result(),
                    train_accuracy.result()))
        with train_summary_writer.as_default():
            tf.summary.scalar('train_loss', train_loss.result(), step=epoch)
            tf.summary.scalar('train_accuracy', train_accuracy.result(),
                              step=epoch)
        # A checkpoint is written every epoch.
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch + 1,
                                                            ckpt_save_path))
        for (batch, (inp, tar)) in enumerate(val_dataset):
            val_step(inp, tar)
        with val_summary_writer.as_default():
            tf.summary.scalar('val_loss', val_loss.result(), step=epoch)
            tf.summary.scalar('val_accuracy', val_accuracy.result(),
                              step=epoch)
        print('Epoch {} Val Loss {:.4f} Val Accuracy {:.4f}'.format(
            epoch + 1, val_loss.result(), val_accuracy.result()))
        print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
def main(BUFFER_SIZE, BATCH_SIZE, MAX_LENGTH, num_layers, d_model, dff,
         num_heads, dropout_rate, EPOCHS, checkpoint_path):
    """End-to-end pt->en training: data prep, model build, checkpoint
    restore, and the epoch loop."""
    # --- data -------------------------------------------------------------
    # prepare data and produce tokenizer
    train_examples, val_examples, tokenizer_en, tokenizer_pt = \
        portuguese2english_translation_data_and_tokenizer()
    # prepare dataset
    train_dataset, val_dataset = get_dataset(train_examples, val_examples,
                                             BATCH_SIZE, MAX_LENGTH,
                                             BUFFER_SIZE)

    # Peek at one validation batch as a sanity check.
    print("check a batch data:")
    pt_batch, en_batch = next(iter(val_dataset))
    print("pt_batch:\n", pt_batch)
    print("en_batch:\n", en_batch)

    # +2 reserves ids for the start/end tokens added during encoding.
    input_vocab_size = tokenizer_pt.vocab_size + 2
    target_vocab_size = tokenizer_en.vocab_size + 2

    # --- model / optimizer / metrics --------------------------------------
    transformer = Transformer(num_layers, d_model, num_heads, dff,
                              input_vocab_size, target_vocab_size,
                              dropout_rate)
    learning_rate = CustomSchedule(d_model)
    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9,
                                         beta_2=0.98, epsilon=1e-9)
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')

    # --- checkpointing ----------------------------------------------------
    checkpoint = tf.train.Checkpoint(transformer=transformer,
                                     optimizer=optimizer)
    manager = tf.train.CheckpointManager(checkpoint, checkpoint_path,
                                         max_to_keep=5)
    # if a checkpoint exists, restore the latest checkpoint.
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('Latest checkpoint restored!!')

    @tf.function
    def train_step(inp, tar):
        # Teacher forcing: decoder sees tar[:-1], is scored against tar[1:].
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            inp, tar_inp)
        with tf.GradientTape() as tape:
            predictions, _ = transformer(inp, tar_inp, True, enc_padding_mask,
                                         combined_mask, dec_padding_mask)
            loss = loss_function(tar_real, predictions)
        grads = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(grads, transformer.trainable_variables))
        train_loss(loss)
        train_accuracy(tar_real, predictions)

    # --- training loop ----------------------------------------------------
    for epoch in range(EPOCHS):
        tick = time.time()
        train_loss.reset_states()
        train_accuracy.reset_states()

        # inp -> portuguese, tar -> english
        for batch, (inp, tar) in enumerate(train_dataset):
            train_step(inp, tar)
            if batch % 500 == 0:
                print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                    epoch + 1, batch, train_loss.result(),
                    train_accuracy.result()))

        # Persist every fifth epoch.
        if (epoch + 1) % 5 == 0:
            save_path = manager.save()
            print('Saving checkpoint for epoch {} at {}'.format(
                epoch + 1, save_path))

        print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(
            epoch + 1, train_loss.result(), train_accuracy.result()))
        print('Time taken for 1 epoch: {} secs\n'.format(time.time() - tick))
# Token-level cross-entropy on logits; reduction='none' keeps the per-token
# losses so padding positions can be masked out in loss_function below.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
    """Masked cross-entropy: positions where `real` equals the pad id 0 are
    excluded from the loss."""
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    # NOTE(review): mean is over ALL positions (padding included in the
    # denominator); dividing by tf.reduce_sum(mask) is the common alternative.
    return tf.reduce_mean(loss_)


# Warm-up learning-rate schedule tied to model width — presumably the
# "Attention Is All You Need" schedule; confirm against CustomSchedule.
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

# Running metrics, reset per epoch by the training loop.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

# NOTE(review): the chunk is truncated here — the Transformer(...) call
# continues beyond the visible region.
transformer = Transformer(num_layers, d_model, num_heads, dff, input_vocab_size,