def create_model(params, is_train):
    """Build a Keras model for training (loss attached via add_loss) or inference."""
    with tf.name_scope('model'):
        if is_train:
            inputs = layers.Input((None,), dtype=tf.int64, name='inputs')
            targets = layers.Input((None,), dtype=tf.int64, name='targets')
            internal_model = Transformer(params, name='transformer')
            logits = internal_model([inputs, targets], training=is_train)
            vocab_size = params['vocab_size']
            label_smoothing = params['label_smoothing']
            if params['enable_metrics_in_training']:
                logits = metrics.MetricLayer(vocab_size)([logits, targets])
            logits = layers.Lambda(lambda x: x, name='logits', dtype=tf.float32)(logits)
            model = Model([inputs, targets], logits)
            # TODO: Can we do this loss in float16 instead of float32?
            loss = metrics.transformer_loss(logits, targets, label_smoothing, vocab_size)
            model.add_loss(loss)
            return model
        else:
            inputs = layers.Input((None,), dtype=tf.int64, name='inputs')
            internal_model = Transformer(params, name='transformer')
            ret = internal_model([inputs], training=is_train)
            outputs, scores = ret['outputs'], ret['scores']
            return Model(inputs, [outputs, scores])
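# A minimal usage sketch, not taken from the original source: it assumes `params`
# is a dict providing at least 'vocab_size', 'label_smoothing' and
# 'enable_metrics_in_training', and that `Transformer` and `metrics` used above
# are importable. Because the training loss is attached with add_loss(),
# compile() needs no loss argument and fit() needs no y.
def _example_create_model_usage(params, src_batch, tgt_batch):
    train_model = create_model(params, is_train=True)
    train_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3))
    train_model.fit(x=[src_batch, tgt_batch], y=None, epochs=1)

    infer_model = create_model(params, is_train=False)
    outputs, scores = infer_model.predict(src_batch)  # decoded ids and their scores
    return outputs, scores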
def generator_loss(fake_output, generated_output, targets):
    # `cross_entropy` and `self.vocab_size` are expected to come from the
    # enclosing scope (e.g. a method of the training class).
    # Adversarial term: the generator wants the discriminator to score its
    # output as real (all ones).
    discriminator_loss = cross_entropy(tf.ones_like(fake_output), fake_output)
    # Token-level term: padded cross-entropy with label smoothing 0.1.
    logits = metrics.MetricLayer(self.vocab_size)([generated_output, targets])
    logits, generator_loss = metrics.LossLayer(self.vocab_size, 0.1)([logits, targets])
    return discriminator_loss + generator_loss, generator_loss
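# Hedged sketch, not from the original source: `cross_entropy` above is assumed
# to be sigmoid cross-entropy on raw discriminator logits, as in the standard
# tf.keras GAN recipe; the matching discriminator loss would then look like this.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

def discriminator_loss(real_output, fake_output):
    # Real samples should be scored as 1, generated samples as 0.
    real_loss = cross_entropy(tf.ones_like(real_output), real_output)
    fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
    return real_loss + fake_loss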
def test_basic_simpleSeq2Seq(self):
    trace_path = "checkpoints_tl/logging/loss"
    vocabulary_size = 64
    emb_dim = 32
    model_ = Seq2seqLuongAttention(
        hidden_size=128,
        embedding_layer=tl.layers.Embedding(vocabulary_size=vocabulary_size, embedding_size=emb_dim),
        cell=tf.keras.layers.GRUCell,
        method="dot",
    )
    self.vocab_size = 64
    optimizer = tf.optimizers.Adam(learning_rate=0.01)

    for epoch in range(self.num_epochs):
        model_.train()
        t = time.time()
        trainX, trainY = shuffle(self.trainX, self.trainY)
        total_loss, n_iter = 0, 0
        for X, Y in tqdm(
                tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=self.batch_size, shuffle=False),
                total=self.n_step,
                desc='Epoch[{}/{}]'.format(epoch + 1, self.num_epochs),
                leave=False):
            with tf.GradientTape() as tape:
                # Teacher forcing: the decoder sees Y[:, :-1] and predicts Y[:, 1:].
                dec_seq = Y[:, :-1]
                targets = Y[:, 1:]
                logits = model_(inputs=[X, dec_seq])
                logits = metrics.MetricLayer(self.vocab_size)([logits, targets])
                logits, loss = metrics.LossLayer(self.vocab_size, 0.1)([logits, targets])
                # Append the per-step loss to the trace file.
                with tf.io.gfile.GFile(trace_path, "ab+") as trace_file:
                    trace_file.write(str(loss.numpy()) + '\n')
            grad = tape.gradient(loss, model_.all_weights)
            optimizer.apply_gradients(zip(grad, model_.all_weights))
            total_loss += loss
            n_iter += 1
        print(time.time() - t)

        # tl.files.save_npz(model_.all_weights, name='./model_v4.npz')
        model_.eval()
        test_sample = trainX[0:2, :]
        prediction = model_(inputs=[test_sample], seq_length=10, sos=0)
        print("Prediction: >>>>> ", prediction, "\n Target: >>>>> ", trainY[0:2, 1:], "\n\n")
        print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, self.num_epochs, total_loss / n_iter))
def train_step(inputs, targets):
    model.train()
    with tf.GradientTape() as tape:
        logits = model(inputs=inputs, targets=targets)
        logits = metrics.MetricLayer(params.vocab_size)([logits, targets])
        logits, loss = metrics.LossLayer(params.vocab_size, 0.1)([logits, targets])
    gradients = tape.gradient(loss, model.all_weights)
    optimizer_.apply_gradients(zip(gradients, model.all_weights))
    return loss
def train_step(inputs, targets):
    with tf.GradientTape() as tape:
        logits = model(inputs=[inputs, targets], training=True)
        logits = metrics.MetricLayer(params["vocab_size"])([logits, targets])
        logits, loss = metrics.LossLayer(params["vocab_size"], 0.1)([logits, targets])
    gradients = tape.gradient(loss, model.trainable_weights)
    optimizer_.apply_gradients(zip(gradients, model.trainable_weights))
    return loss
def train_step(inputs, targets):
    model.train()
    with tf.GradientTape() as tape:
        # Shift targets one step to the right and prepend 0 as the start token
        # (teacher forcing).
        decoder_inputs = tf.pad(targets, [[0, 0], [1, 0]])[:, :-1]
        logits = model(inputs=[inputs, decoder_inputs])
        logits = metrics.MetricLayer(params.vocab_size)([logits, targets])
        logits, loss = metrics.LossLayer(params.vocab_size, 0.1)([logits, targets])
    gradients = tape.gradient(loss, model.all_weights)
    optimizer_.apply_gradients(zip(gradients, model.all_weights))
    return loss
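# Illustration only, not part of the original code: the tf.pad/slice in
# train_step builds the teacher-forcing decoder input by shifting `targets` one
# step to the right and inserting 0 as the start-of-sequence token.
def _example_shift_targets():
    targets = tf.constant([[5, 6, 7], [8, 9, 10]], dtype=tf.int64)
    decoder_inputs = tf.pad(targets, [[0, 0], [1, 0]])[:, :-1]
    # decoder_inputs == [[0, 5, 6], [0, 8, 9]]:
    # the decoder is fed 0, 5, 6 and trained to predict 5, 6, 7.
    return decoder_inputs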
def test_basic_simpleSeq2Seq(self):
    model_ = Transformer(TINY_PARAMS)
    self.vocab_size = TINY_PARAMS.vocab_size
    optimizer = tf.optimizers.Adam(learning_rate=0.01)

    for epoch in range(self.num_epochs):
        model_.train()
        t = time.time()
        trainX, trainY = shuffle(self.trainX, self.trainY)
        total_loss, n_iter = 0, 0
        for X, Y in tqdm(
                tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=self.batch_size, shuffle=False),
                total=self.n_step,
                desc='Epoch[{}/{}]'.format(epoch + 1, self.num_epochs),
                leave=False):
            with tf.GradientTape() as tape:
                targets = Y
                logits = model_(inputs=X, targets=Y)
                logits = metrics.MetricLayer(self.vocab_size)([logits, targets])
                logits, loss = metrics.LossLayer(self.vocab_size, 0.1)([logits, targets])
            grad = tape.gradient(loss, model_.all_weights)
            optimizer.apply_gradients(zip(grad, model_.all_weights))
            total_loss += loss
            n_iter += 1
        print(time.time() - t)

        model_.eval()
        test_sample = trainX[0:2, :]
        prediction = model_(inputs=test_sample)
        print("Prediction: >>>>> ", prediction["outputs"], "\n Target: >>>>> ", trainY[0:2, :], "\n\n")
        print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, self.num_epochs, total_loss / n_iter))
def test_basic_simpleSeq2Seq(self):
    trace_path = "checkpoints_tl/logging/loss"
    model_ = Transformer(TINY_PARAMS)
    self.vocab_size = TINY_PARAMS.vocab_size
    optimizer = tf.optimizers.Adam(learning_rate=0.01)

    for epoch in range(self.num_epochs):
        model_.train()
        t = time.time()
        trainX, trainY = shuffle(self.trainX, self.trainY)
        total_loss, n_iter = 0, 0
        for X, Y in tqdm(
                tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=self.batch_size, shuffle=False),
                total=self.n_step,
                desc='Epoch[{}/{}]'.format(epoch + 1, self.num_epochs),
                leave=False):
            with tf.GradientTape() as tape:
                targets = Y
                logits = model_(inputs=X, targets=Y)
                logits = metrics.MetricLayer(self.vocab_size)([logits, targets])
                logits, loss = metrics.LossLayer(self.vocab_size, 0.1)([logits, targets])
                # Append the per-step loss to the trace file.
                with tf.io.gfile.GFile(trace_path, "ab+") as trace_file:
                    trace_file.write(str(loss.numpy()) + '\n')
            grad = tape.gradient(loss, model_.all_weights)
            optimizer.apply_gradients(zip(grad, model_.all_weights))
            total_loss += loss
            n_iter += 1
        print(time.time() - t)
        tl.files.save_npz(model_.all_weights, name='./model_v4.npz')

        model_.eval()
        test_sample = trainX[0:2, :]
        prediction = model_(inputs=test_sample)
        print("Prediction: >>>>> ", prediction["outputs"], "\n Target: >>>>> ", trainY[0:2, :], "\n\n")
        print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, self.num_epochs, total_loss / n_iter))
def test_basic_simpleSeq2Seq(self):
    model_ = Transformer(params)
    self.vocab_size = params["vocab_size"]
    # Note: a warmup schedule is created here but a constant learning rate is
    # what the optimizer actually uses; alternatives such as optimizer.LazyAdam
    # and Adam with beta_1=0.9, beta_2=0.98, epsilon=1e-9 were also tried.
    learning_rate = CustomSchedule(params["hidden_size"])
    optimizer_ = tf.optimizers.Adam(learning_rate=0.01)

    for epoch in range(self.num_epochs):
        trainX, trainY = shuffle(self.trainX, self.trainY)
        total_loss, n_iter = 0, 0
        for X, Y in tqdm(
                tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=self.batch_size, shuffle=False),
                total=self.n_step,
                desc='Epoch[{}/{}]'.format(epoch + 1, self.num_epochs),
                leave=False):
            with tf.GradientTape() as tape:
                targets = Y
                output = model_(inputs=[X, Y], training=True)
                logits = metrics.MetricLayer(self.vocab_size)([output, targets])
                logits, loss = metrics.LossLayer(self.vocab_size, 0.1)([logits, targets])
            grad = tape.gradient(loss, model_.trainable_weights)
            optimizer_.apply_gradients(zip(grad, model_.trainable_weights))
            total_loss += loss
            n_iter += 1

        test_sample = trainX[0:2, :]
        top_n = 1
        for i in range(top_n):
            prediction = model_(inputs=[test_sample], training=False)
            print("Prediction: >>>>> ", prediction["outputs"], "\n Target: >>>>> ", trainY[0:2, :], "\n\n")

        # Print the average loss after every epoch.
        print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, self.num_epochs, total_loss / n_iter))
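# Hedged sketch of the `CustomSchedule` referenced above, assuming it is the
# warmup / inverse-square-root schedule from "Attention Is All You Need" (as in
# the official TensorFlow transformer tutorial); the original project may define
# it differently.
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # lr = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)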