def val_step(inp, tar):
    """Validation step for the dual-translation setup.

    transformer1 translates de->en, transformer2 en->de.  When USE_RTL is
    enabled, each model's greedy output is fed back through the other model
    (round-trip) and the reconstruction losses are added, weighted by LAMBDA.
    Updates the module-level `val_loss` / `val_accuracy` metrics.

    NOTE(review): both transformers are called with the training flag set to
    True even though this is a validation step — confirm this is intentional
    (dropout would remain active during evaluation).
    """
    tar_inp = tar[:, :-1]   # decoder input for transformer1 (drop last token)
    tar_real = tar[:, 1:]   # teacher-forcing targets for transformer1
    inp_inp = inp[:, :-1]   # decoder input for transformer2
    inp_real = inp[:, 1:]   # teacher-forcing targets for transformer2
    enc_padding_mask1, combined_mask1, dec_padding_mask1 = create_masks(inp, tar_inp)
    enc_padding_mask2, combined_mask2, dec_padding_mask2 = create_masks(tar, inp_inp)
    predictions1, _ = transformer1(inp, tar_inp, True, enc_padding_mask1,
                                   combined_mask1, dec_padding_mask1)
    loss1 = loss_function(tar_real, predictions1)  # this is de->en
    if USE_RTL:
        predictions2, _ = transformer2(tar, inp_inp, True, enc_padding_mask2,
                                       combined_mask2, dec_padding_mask2)
        loss2 = loss_function(inp_real, predictions2)  # this is en->de
        # Greedy decode each direction to build the synthetic round-trip inputs.
        predicted_id2 = tf.argmax(predictions2, axis=-1)  # most likely token from logits
        inp2 = tf.concat([inp[:, 0:1], predicted_id2], axis=-1)  # add start token; \hat{s} in the paper
        predicted_id1 = tf.argmax(predictions1, axis=-1)  # most likely token from logits
        tar2 = tf.concat([tar[:, 0:1], predicted_id1], axis=-1)  # add start token; \hat{t} in the paper
        enc_padding_mask3, combined_mask3, dec_padding_mask3 = create_masks(inp2, tar_inp)
        enc_padding_mask4, combined_mask4, dec_padding_mask4 = create_masks(tar2, inp_inp)
        predictions3, _ = transformer1(inp2, tar_inp, True, enc_padding_mask3,
                                       combined_mask3, dec_padding_mask3)
        loss3 = loss_function(tar_real, predictions3)  # predictions3 is \tilde{t} in the paper
        predictions4, _ = transformer2(tar2, inp_inp, True, enc_padding_mask4,
                                       combined_mask4, dec_padding_mask4)
        loss4 = loss_function(inp_real, predictions4)  # predictions4 is \tilde{s} in the paper
        loss = loss1 + loss2 + LAMBDA * (loss3 + loss4)
    else:
        loss = loss1
    val_loss(loss)
    val_accuracy(tar_real, predictions1)
def train_step(inp, tar, grad_accum_flag):
    """One training step with mixed-precision loss scaling and gradient
    accumulation.

    Gradients are accumulated into the module-level ``accumulators`` shadow
    variables on every call; when ``grad_accum_flag`` is True the accumulated
    gradients are averaged over ``h_parms.accumulation_steps``, applied via the
    optimizer, and the accumulators are reset to zero.
    Updates the module-level ``train_loss`` / ``train_accuracy`` metrics.
    """
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, attention_weights, dec_output = model(
            inp, tar_inp, enc_padding_mask, combined_mask, dec_padding_mask,
            training=True
        )
        train_variables = model.trainable_variables
        loss = loss_function(tar_real, predictions)
        # Scale the loss so small gradients survive float16 (mixed precision).
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_gradients = tape.gradient(scaled_loss, train_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    # Initialize the shadow variables with same type as the gradients.
    if not accumulators:
        for tv in gradients:
            accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
    # Accumulate the gradients into the shadow variables.
    for accumulator, grad in zip(accumulators, gradients):
        accumulator.assign_add(grad)
    # Apply the gradients and reset them to zero if the flag is true.
    if grad_accum_flag:
        # BUG FIX: the loop variable was misspelled (`accumlator`) while the
        # body used `accumulator`, so only the LAST accumulator (left over from
        # the zip loop above) was divided — once per accumulator.  Average
        # every accumulator exactly once instead.
        for accumulator in accumulators:
            accumulator.assign(tf.math.divide(accumulator, h_parms.accumulation_steps))
        optimizer.apply_gradients(zip(accumulators, train_variables))
        for accumulator in accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
    train_loss(loss)
    train_accuracy(tar_real, predictions)
def evaluate_batch(model, inputs, tokenizer_tar, max_length):
    """Greedy batched auto-regressive decoding.

    Args:
        model: called as ``model(enc_in, dec_in, training, enc_mask,
            combined_mask, dec_mask)`` returning ``(logits, attention_weights)``.
        inputs: batch of encoder input token ids (convertible to a tensor).
        tokenizer_tar: target tokenizer exposing ``bos_token_id`` / ``eos_token_id``.
        max_length: maximum number of decoding steps.

    Returns:
        Tuple ``(output, attention_weights)`` where ``output`` includes the
        leading BOS column for every batch row.
    """
    encoder_input = tf.convert_to_tensor(inputs)
    # Seed the decoder with one BOS token per batch row: shape (batch_size, 1).
    decoder_input = tf.expand_dims([tokenizer_tar.bos_token_id] * inputs.shape[0], axis=1)
    output = decoder_input
    attention_weights = None
    for _ in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)
        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = model(encoder_input, output, False,
                                               enc_padding_mask, combined_mask,
                                               dec_padding_mask)
        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        # Stop early only when EVERY row in the batch predicted the EOS token.
        if (predicted_id == tokenizer_tar.eos_token_id).numpy().all():
            return output, attention_weights
        # concatenate the predicted_id to the output which is given to the
        # decoder as its input.
        output = tf.concat([output, predicted_id], axis=-1)
    return output, attention_weights
def val_step(inp, tar, inp_shape, tar_shape, batch):
    """Run one validation batch and update the validation loss/accuracy metrics."""
    decoder_input, gold = tar[:, :-1], tar[:, 1:]
    masks = create_masks(inp, decoder_input)
    predictions, attention_weights, dec_output = transformer(
        inp, decoder_input, False, *masks)
    if config.copy_gen:
        # Blend the vocabulary distribution with the copy distribution.
        predictions = pointer_generator(
            dec_output, predictions, attention_weights, inp,
            inp_shape, tar_shape, batch, training=False)
    validation_loss(loss_function(gold, predictions))
    validation_accuracy(gold, predictions)
def call(self, input_ids, target_ids, training):
    """Forward pass: BERT-encode the source, then produce draft and refined
    summary logits.

    Args:
        input_ids: (batch_size, seq_len) source token ids.
        target_ids: (batch_size, seq_len + 1) target token ids; the last
            position is dropped for teacher forcing.
        training: Keras training flag forwarded to the sub-modules.

    Returns:
        (draft_logits, draft_attention_dist, refine_logits, refine_attention_dist)
    """
    # (batch_size, 1, 1, seq_len), (batch_size, 1, 1, seq_len)
    _, combined_mask, dec_padding_mask = create_masks(
        input_ids, target_ids[:, :-1])
    # (batch_size, seq_len, d_bert)
    enc_output = self.bert_model(input_ids)[0]
    # BUG FIX: the `training` parameter was previously ignored — both
    # sub-calls hard-coded training=True, keeping dropout active at inference.
    # (batch_size, seq_len, vocab_len), _
    draft_logits, draft_attention_dist = self.draft_summary(
        input_ids,
        enc_output=enc_output,
        look_ahead_mask=combined_mask,
        padding_mask=dec_padding_mask,
        target_ids=target_ids[:, :-1],
        training=training)
    # (batch_size, seq_len, vocab_len), _
    refine_logits, refine_attention_dist = self.refine_summary(
        input_ids,
        enc_output=enc_output,
        target=target_ids[:, :-1],
        padding_mask=dec_padding_mask,
        training=training)
    return draft_logits, draft_attention_dist, refine_logits, refine_attention_dist
def train_step(inp, tar, inp_shape, tar_shape, batch):
    """One training step for the summarization transformer, optionally with a
    pointer-generator (copy mechanism) on top of the decoder output.

    Updates the module-level ``train_loss`` / ``train_accuracy`` metrics.
    """
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, attention_weights, dec_output = transformer(
            inp, tar_inp, True, enc_padding_mask, combined_mask, dec_padding_mask)
        train_variables = transformer.trainable_variables
        # Fail fast if the forward pass produced NaNs/Infs.
        tf.debugging.check_numerics(predictions,
                                    "Nan's in the transformer predictions")
        if config.copy_gen:
            predictions = pointer_generator(dec_output, predictions,
                                            attention_weights, inp, inp_shape,
                                            tar_shape, batch, training=True)
            tf.debugging.check_numerics(
                predictions, "Nan's in the pointer_generator predictions")
            # The pointer-generator has its own trainable parameters.
            train_variables = train_variables + pointer_generator.trainable_variables
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, train_variables)
    optimizer.apply_gradients(zip(gradients, train_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)
def val_step(inp, tar, epoch, inp_shape, tar_shape, batch, create_summ):
    """Evaluate one validation batch; optionally write a TF summary.

    Returns a ``(rouge, bert)`` pair — the real scores when ``create_summ``
    is set, otherwise the neutral placeholder ``(1.0, 1.0)``.
    """
    decoder_input, gold = tar[:, :-1], tar[:, 1:]
    masks = create_masks(inp, decoder_input)
    predictions, attention_weights, dec_output = transformer(
        inp, decoder_input, False, *masks)
    if config.copy_gen:
        predictions = pointer_generator(
            dec_output, predictions, attention_weights, inp,
            inp_shape, tar_shape, batch, training=False)
    validation_loss(loss_function(gold, predictions))
    validation_accuracy(gold, predictions)
    if create_summ:
        scores = tf_write_summary(gold, predictions, inp[:, 1:], epoch)
    else:
        scores = (1.0, 1.0)
    return scores
def decoder_query(output):
    """Return the logits for the most recently decoded position."""
    masks = create_masks(doc_input, output)
    logits, _attention, _dec_out = model(doc_input, output, *masks, True)
    return logits[:, -1:, :]
def decoder_query(output):
    """Logits for the latest decoding step; shape (batch_size, 1, target_vocab_size)."""
    masks = create_masks(encoder_input, output)
    logits, _attention, _dec_out = model(encoder_input, output, *masks, False)
    return logits[:, -1:, :]
def symbols_to_logits(output):
    """Logits of the last decoded position, with the encoder input tiled once
    per beam so its batch dimension matches `output`."""
    tiled_inputs = tf.tile(encoder_input, [beam_width, 1])
    masks = create_masks(tiled_inputs, output)
    logits, _attention = transformer2(tiled_inputs, output, False, *masks)
    return logits[:, -1, :]
def draft_summary_sampling(model, inp, enc_output, look_ahead_mask, padding_mask,
                           sampling_type='greedy', temperature=0.9, p=0.9, k=25,
                           training=False):
    """Inference call: build a draft summary auto-regressively.

    At each step the decoder output at the last position is divided by
    `temperature` and a token is drawn using the chosen `sampling_type`
    ('nucleus', 'topk', 'random_sampling', 'topktopp', or greedy argmax).

    Returns:
        (summary, attention_dist): the sampled id sequence (including the
        initial CLS column) and the attention distribution of the LAST step.
    """
    log.info(f"Building: 'Draft {sampling_type} decoder'")
    N = tf.shape(enc_output)[0]
    T = tf.shape(enc_output)[1]
    # (batch_size, 1) — decoding starts from the CLS token.
    dec_input = tf.ones([N, 1], dtype=tf.int32) * CLS_ID
    summary, dec_outputs, dec_logits, attention_dists = [], [], [], []
    summary += [dec_input]
    for i in (range(0, config.summ_length)):
        # NOTE(review): dec_padding_mask is recomputed every step but the
        # decoder below is called with the `padding_mask` argument instead —
        # confirm whether this recomputation is dead code.
        _, _, dec_padding_mask = create_masks(inp, dec_input)
        # (batch_size, i+1, d_bert)
        embeddings = model.embedding(dec_input)
        # (batch_size, i+1, vocab), (_)
        dec_output, attention_dist = model.decoder(inp, embeddings, enc_output,
                                                   training, look_ahead_mask,
                                                   padding_mask)
        # (batch_size, 1, vocab) — only the newest position matters.
        dec_output_i = dec_output[:, -1:, :]
        if sampling_type == 'nucleus':
            preds = tf.cast(
                nucleus_sampling(((dec_output_i) / temperature), p=p), tf.int32)
        elif sampling_type == 'topk':
            preds = tf.cast(
                top_k_sampling(((dec_output_i) / temperature), k=k), tf.int32)
        elif sampling_type == 'random_sampling':
            preds = tf.cast(sampling((dec_output_i) / temperature), tf.int32)
        elif sampling_type == 'topktopp':
            preds = tf.cast(
                topp_topk(((dec_output_i) / temperature), p=p, k=k), tf.int32)
        else:
            # Default: greedy decoding.
            preds = tf.cast(tf.argmax(dec_output_i, axis=-1), tf.int32)
        dec_outputs += [dec_output_i]
        summary += [preds]
        # Write the sampled token into column i+1 of the decoder input.
        dec_input = with_column(dec_input, i + 1, preds)
    summary = tf.concat(summary, axis=1)
    # (batch_size, seq_len, vocab_len), (batch_size, seq_len), (_)
    return summary, attention_dist
def beam_search_decoder(target_ids):
    """Score the next token for beam search: draft-summary logits at the
    last decoded position, shape (batch_size, 1, target_vocab_size)."""
    _, look_ahead, pad_mask = create_masks(input_ids, target_ids)
    logits, _attention = model.draft_summary(input_ids=input_ids,
                                             enc_output=enc_output,
                                             look_ahead_mask=look_ahead,
                                             padding_mask=pad_mask,
                                             target_ids=target_ids,
                                             training=False)
    return logits[:, -1:, :]
def run_inference(model, dataset, beam_sizes_to_try=config.beam_sizes):
    """Run draft+refine summarization over `dataset` for each beam size and
    print per-document ROUGE / BERT-score averages.

    For every document: beam-decode a draft summary, refine it greedily, strip
    the special ids (0 = PAD, 101 = [CLS], 102 = [SEP]), convert wordpieces to
    words and score hypothesis against reference.
    """
    for beam_size in beam_sizes_to_try:
        ref_sents = []
        hyp_sents = []
        for (doc_id, (input_ids, _, _, target_ids, _, _)) in enumerate(dataset, 1):
            start_time = time.time()
            # translated_output_temp[0]: (batch, beam_size, summ_length+1)
            translated_output_temp, enc_output = draft_decoded_summary(
                model, input_ids, target_ids[:, :-1], beam_size)
            # Keep the top beam for every batch row.
            draft_predictions = translated_output_temp[0][:, 0, :]
            _, _, dec_padding_mask = create_masks(input_ids, target_ids[:, :-1])
            refined_summary, attention_dists = refined_summary_greedy(
                model, input_ids, enc_output, draft_predictions,
                dec_padding_mask, training=False)
            # Drop PAD/[CLS]/[SEP] ids before detokenizing.
            sum_ref = tokenizer.convert_ids_to_tokens(
                [i for i in tf.squeeze(target_ids) if i not in [0, 101, 102]])
            sum_hyp = tokenizer.convert_ids_to_tokens([
                i for i in tf.squeeze(refined_summary) if i not in [0, 101, 102]
            ])
            sum_ref = convert_wordpiece_to_words(sum_ref)
            sum_hyp = convert_wordpiece_to_words(sum_hyp)
            print('Original summary: {}'.format(sum_ref))
            print('Predicted summary: {}'.format(sum_hyp))
            if sum_ref and sum_hyp:
                ref_sents.append(sum_ref)
                hyp_sents.append(sum_hyp)
            # BUG FIX: the bare `except:` also swallowed KeyboardInterrupt /
            # SystemExit; narrowed to Exception so the run stays interruptible.
            try:
                rouges = rouge_all.get_scores(ref_sents, hyp_sents)
                avg_rouge_f1 = np.mean([
                    np.mean([
                        rouge_scores['rouge-1']["f"],
                        rouge_scores['rouge-2']["f"],
                        rouge_scores['rouge-l']["f"]
                    ]) for rouge_scores in rouges
                ])
                _, _, bert_f1 = b_score(ref_sents, hyp_sents, lang='en',
                                        model_type=config.pretrained_bert_model)
                avg_bert_f1 = np.mean(bert_f1.numpy())
            except Exception:
                # Best-effort metrics: fall back to zero on scorer failure.
                avg_rouge_f1 = 0
                avg_bert_f1 = 0
            print(infer_template.format(beam_size, avg_rouge_f1, avg_bert_f1))
            print(f'time to process document {doc_id} : {time.time()-start_time}')
def train_step(inp, tar):
    """One training step for the dual-translation setup (transformer1: de->en,
    transformer2: en->de).

    When USE_RTL is set both directions are trained jointly; when LAMBDA > 0
    each model's greedy output is round-tripped through the other model and
    the reconstruction losses are added, weighted by LAMBDA.  Updates the
    module-level `train_loss` / `train_accuracy` metrics.
    """
    tar_inp = tar[:, :-1]   # decoder input for transformer1
    tar_real = tar[:, 1:]   # teacher-forcing targets for transformer1
    inp_inp = inp[:, :-1]   # decoder input for transformer2
    inp_real = inp[:, 1:]   # teacher-forcing targets for transformer2
    enc_padding_mask1, combined_mask1, dec_padding_mask1 = create_masks(inp, tar_inp)
    enc_padding_mask2, combined_mask2, dec_padding_mask2 = create_masks(tar, inp_inp)
    with tf.GradientTape() as tape:
        predictions1, _ = transformer1(inp, tar_inp, True, enc_padding_mask1,
                                       combined_mask1, dec_padding_mask1)
        loss1 = loss_function(tar_real, predictions1)  # this is de->en
        if USE_RTL:
            predictions2, _ = transformer2(tar, inp_inp, True, enc_padding_mask2,
                                           combined_mask2, dec_padding_mask2)
            loss2 = loss_function(inp_real, predictions2)  # this is en->de
            loss = loss1 + loss2
            if LAMBDA > 0:
                # Round-trip (back-translation style) consistency terms.
                predicted_id2 = tf.argmax(predictions2, axis=-1)  # most likely token from logits
                inp2 = tf.concat([inp[:, 0:1], predicted_id2], axis=-1)  # add start token; inp2 is \hat{s} in the paper
                predicted_id1 = tf.argmax(predictions1, axis=-1)  # most likely token from logits
                tar2 = tf.concat([tar[:, 0:1], predicted_id1], axis=-1)  # add start token; tar2 is \hat{t} in the paper
                enc_padding_mask3, combined_mask3, dec_padding_mask3 = create_masks(inp2, tar_inp)
                enc_padding_mask4, combined_mask4, dec_padding_mask4 = create_masks(tar2, inp_inp)
                predictions3, _ = transformer1(inp2, tar_inp, True, enc_padding_mask3,
                                               combined_mask3, dec_padding_mask3)
                loss3 = loss_function(tar_real, predictions3)  # predictions3 is \tilde{t} in the paper
                predictions4, _ = transformer2(tar2, inp_inp, True, enc_padding_mask4,
                                               combined_mask4, dec_padding_mask4)
                loss4 = loss_function(inp_real, predictions4)  # predictions4 is \tilde{s} in the paper
                loss += LAMBDA * (loss3 + loss4)
        else:
            loss = loss1
    if USE_RTL:
        # Differentiate w.r.t. both models and apply all gradients in one call.
        gradients = tape.gradient(loss, [transformer1.trainable_variables,
                                         transformer2.trainable_variables])
        optimizer.apply_gradients(zip(gradients[0] + gradients[1],
                                      transformer1.trainable_variables +
                                      transformer2.trainable_variables))
    else:
        gradients = tape.gradient(loss, transformer1.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer1.trainable_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions1)
def transformer_query(output):
    """Logits for the last decoded position; shape (batch_size, 1, target_vocab_size)."""
    masks = create_masks(encoder_input, output)
    logits, attention_weights, dec_output = transformer(
        encoder_input, output, False, *masks)
    if config.copy_gen:
        # Mix in the copy distribution from the pointer-generator.
        logits = generator(dec_output, logits, attention_weights,
                           encoder_input, inp_shape, output.shape[-1],
                           batch, False)
    # Select the last sequence position.
    return logits[:, -1:, :]
def val_step(model, loss_object, inp, tar, val_loss, val_accuracy, pad_token_id):
    """Evaluate one validation batch and update the supplied metric objects."""
    decoder_in, labels = tar[:, :-1], tar[:, 1:]
    masks = create_masks(inp, decoder_in)
    logits, _ = model(inp, decoder_in, False, *masks)
    val_loss(loss_function(labels, logits, loss_object, pad_token_id))
    val_accuracy(labels, logits)
def call(self, inp, tar, training):
    """Forward pass: BERT-encode the source, then produce draft and refined
    summary logits (optionally mixing in a pointer-generator copy distribution
    for the draft stage).

    Args:
        inp: (input_ids, input_mask, input_segment_ids), each (batch_size, seq_len).
        tar: (target_ids, target_mask, target_segment_ids), each (batch_size, seq_len + 1).
        training: Keras training flag forwarded to the sub-modules.
    """
    # (batch_size, seq_len) x3
    input_ids, input_mask, input_segment_ids = inp
    # (batch_size, seq_len + 1) x3
    target_ids, target_mask, target_segment_ids = tar
    # (batch_size, 1, 1, seq_len), (_), (batch_size, 1, 1, seq_len)
    _, combined_mask, dec_padding_mask = create_masks(
        input_ids, target_ids[:, :-1])
    # (batch_size, seq_len, d_bert)
    enc_output = self.bert((input_ids, input_mask, input_segment_ids))
    # (batch_size, seq_len, d_bert) — teacher-forced target embeddings.
    embeddings = self.embedding(target_ids[:, :-1])
    draft_logits,\
    draft_attention_dist,\
    draft_dec_outputs = self.draft_summary(
        embeddings,
        enc_output,
        combined_mask,
        dec_padding_mask,
        target_ids[:, :-1],
        training
    )
    if config.copy_gen:
        # Blend draft logits with the copy distribution over source tokens.
        draft_logits = self.pointer_generator(
            draft_dec_outputs, draft_logits, draft_attention_dist, input_ids,
            tf.shape(input_ids)[1], tf.shape(target_ids[:, :-1])[1], training)
    refine_logits,\
    refine_attention_dist,\
    refine_dec_outputs = self.refine_summary(
        enc_output,
        (target_ids[:, :-1], target_mask[:, :-1], target_segment_ids[:, :-1]),
        dec_padding_mask,
        training
    )
    return (draft_logits, draft_attention_dist, draft_dec_outputs,
            refine_logits, refine_attention_dist, refine_dec_outputs)
def train_step(inp, tar):
    """One optimization step: forward pass, loss, gradients, metric updates."""
    decoder_in, labels = tar[:, :-1], tar[:, 1:]
    masks = create_masks(inp, decoder_in)
    with tf.GradientTape() as tape:
        logits, _ = model(inp, decoder_in, True, *masks)
        loss = loss_function(labels, logits)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    train_loss(loss)
    train_accuracy(labels, logits)
def validate(model, data, mode="valid"):
    """Evaluate `model` over one pass of `data.batcher(mode)`.

    Returns:
        (accuracy, loss) as numpy scalars, averaged over all batches.
    """
    valid_loss = tf.keras.metrics.Mean(name='valid_loss')
    valid_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='valid_accuracy')
    for inp, tar in data.batcher(mode=mode):
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            inp, tar_inp)
        # BUG FIX: the training flag was True during validation, which keeps
        # dropout active and skews the reported metrics; evaluation uses False.
        predictions, _ = model(inp, tar_inp, False, enc_padding_mask,
                               combined_mask, dec_padding_mask)
        loss = loss_function(tar_real, predictions)
        valid_loss(loss)
        valid_accuracy(tar_real, predictions)
    return valid_accuracy.result().numpy(), valid_loss.result().numpy()
def call(self, input_ids, input_mask, input_segment_ids, target_ids,
         target_mask, target_segment_ids, training):
    """Forward pass: BERT-encode the source, then produce draft and refined
    summary logits (pointer-generator optional on the draft stage).

    Args:
        input_ids/input_mask/input_segment_ids: (batch_size, seq_len) each.
        target_ids/target_mask/target_segment_ids: (batch_size, seq_len + 1) each.
        training: Keras training flag forwarded to the sub-modules.
    """
    # (batch_size, 1, 1, seq_len), (_), (batch_size, 1, 1, seq_len)
    _, combined_mask, dec_padding_mask = create_masks(
        input_ids, target_ids[:, :-1])
    # (batch_size, seq_len, d_bert)
    enc_output = self.bert_model(input_ids)[0]
    # BUG FIX: the `training` parameter was previously ignored — draft and
    # refine sub-calls hard-coded training=True, keeping dropout active at
    # inference time.  The pointer-generator already used `training`.
    draft_logits, draft_attention_dist, draft_dec_outputs = self.draft_summary(
        enc_output=enc_output,
        look_ahead_mask=combined_mask,
        padding_mask=dec_padding_mask,
        target_ids=target_ids[:, :-1],
        training=training)
    if config.copy_gen:
        draft_logits = self.pointer_generator(
            draft_dec_outputs, draft_logits, draft_attention_dist, input_ids,
            tf.shape(input_ids)[1], tf.shape(target_ids[:, :-1])[1],
            training=training)
    # (batch_size, seq_len, vocab_len), (batch_size, seq_len), (_)
    refine_logits, refine_attention_dist, refine_dec_outputs = self.refine_summary(
        enc_output=enc_output,
        target=(target_ids[:, :-1], target_mask[:, :-1], target_segment_ids[:, :-1]),
        padding_mask=dec_padding_mask,
        training=training)
    return (draft_logits, draft_attention_dist, draft_dec_outputs,
            refine_logits, refine_attention_dist, refine_dec_outputs)
def train_step(model, loss_object, optimizer, inp, tar, train_loss,
               train_accuracy, pad_token_id):
    """One optimization step; updates the supplied loss/accuracy metrics."""
    decoder_in, labels = tar[:, :-1], tar[:, 1:]
    masks = create_masks(inp, decoder_in)
    with tf.GradientTape() as tape:
        # training=True enables layers that behave differently at train time
        # (e.g. Dropout).
        logits, _ = model(inp, decoder_in, True, *masks)
        loss = loss_function(labels, logits, loss_object, pad_token_id)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    train_loss(loss)
    train_accuracy(labels, logits)
def val_step(inp, tar, epoch, create_summ):
    """Evaluate one validation batch; optionally write a TF summary.

    Returns a ``(rouge, bert)`` pair, or the placeholder ``(1.0, 1.0)`` when
    ``create_summ`` is False.
    """
    decoder_in, labels = tar[:, :-1], tar[:, 1:]
    masks = create_masks(inp, decoder_in)
    logits, _attention, _dec_out = model(inp, decoder_in, *masks, training=False)
    validation_loss(loss_function(labels, logits))
    validation_accuracy(labels, logits)
    if create_summ:
        scores = tf_write_summary(labels, logits, inp[:, 1:], epoch)
    else:
        scores = (1.0, 1.0)
    return scores
def train_step(inp, tar):
    """One mixed-precision optimization step: the loss is scaled before
    differentiation and the gradients unscaled before being applied."""
    decoder_in, labels = tar[:, :-1], tar[:, 1:]
    masks = create_masks(inp, decoder_in)
    with tf.GradientTape() as tape:
        logits, _attention, _dec_out = transformer(
            inp, decoder_in, *masks, training=True)
        params = transformer.trainable_variables
        loss = loss_function(labels, logits)
        scaled = optimizer.get_scaled_loss(loss)
    scaled_grads = tape.gradient(scaled, params)
    grads = optimizer.get_unscaled_gradients(scaled_grads)
    optimizer.apply_gradients(zip(grads, params))
    train_loss(loss)
    train_accuracy(labels, logits)
def beam_search_decoder(output):
    """Next-token logits for beam search; shape (batch_size, 1, target_vocab_size)."""
    _, _, pad_mask = create_masks(input_ids, output)
    target_embeddings = model.embedding(output)
    logits, decoder_states, attention = model.decoder(
        input_ids, target_embeddings, enc_output, False, None, pad_mask)
    if config.copy_gen:
        # Blend with the pointer-generator's copy distribution.
        logits = model.decoder.pointer_generator(
            decoder_states, logits, attention, input_ids,
            tf.shape(input_ids)[1], tf.shape(output)[-1], False)
    return logits[:, -1:, :]
def train(self, examples, is_train=True):
    """Teacher-forced forward pass for the edge-aware encoder-decoder.

    Args:
        examples: dict with "src_token_ids", "tgt_token_ids" and "tgt_edges".
        is_train: training flag forwarded to encoder and decoder.

    Returns:
        (dec_output, edge_scores) where dec_output is shifted right by one
        position with a forced BOS prediction at position 0.
    """
    src_token_ids = examples["src_token_ids"]
    tgt_token_ids = examples["tgt_token_ids"]
    tgt_edges = examples["tgt_edges"]
    # enc_padding_mask: (batch_size, 1, 1, src_seq_len)
    # combined_mask:    (batch_size, 1, tgt_seq_len, tgt_seq_len)
    # dec_padding_mask: (batch_size, 1, 1, src_seq_len)
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        src_token_ids, tgt_token_ids,
        self.src_vocab.token2idx[self.src_vocab.PAD],
        self.tgt_vocab.token2idx[self.tgt_vocab.PAD])
    # (batch_size, src_seq_len, d_model)
    enc_output = self.encoder(src_token_ids, is_train, enc_padding_mask)
    # dec_output.shape == (batch_size, tgt_seq_len, tgt_vocab_size+src_seq_len)
    dec_output, _, edge_scores = self.decoder(tgt_token_ids, enc_output,
                                              is_train, combined_mask,
                                              dec_padding_mask,
                                              tgt_edges=tgt_edges)
    # Prepend the BOS token: build a logit row that is ~1 at the BOS index
    # and -1e9 everywhere else, so position 0 always predicts BOS.
    # (batch_size, 1)
    start_token = tf.expand_dims(tgt_token_ids[:, 0], axis=-1)
    # (batch_size, 1, tgt_vocab_size + src_seq_len)
    start_token_onehot = tf.one_hot(start_token,
                                    depth=(self.tgt_vocab_size + self.src_seq_len))
    start_token_logits = start_token_onehot + (start_token_onehot - 1) * 1e9
    # Shift predictions right by one; drop the final position.
    dec_output = tf.concat([start_token_logits, dec_output[:, :-1, :]], axis=1)
    # (batch_size, tgt_seq_len, tgt_vocab_size+src_seq_len)
    return dec_output, edge_scores
def evaluate(self, inp_sentence):
    """Greedily decode `inp_sentence` (Portuguese) into English token ids.

    Returns:
        (decoded id sequence without the batch dimension, attention weights
        of the final decoding step).
    """
    # The source is Portuguese, hence it gets pt start/end markers.
    bos = [self.tokenizer_pt.vocab_size]
    eos = [self.tokenizer_pt.vocab_size + 1]
    encoder_input = tf.expand_dims(
        bos + self.tokenizer_pt.encode(inp_sentence) + eos, 0)
    # As the target is English, decoding begins from the English start token.
    output = tf.expand_dims([self.tokenizer_en.vocab_size], 0)
    for _ in range(self.MAX_LENGTH):
        masks = create_masks(encoder_input, output)
        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = self.translate_transformer(
            encoder_input, output, False, *masks)
        # Only the last position matters: (batch_size, 1, vocab_size).
        next_id = tf.cast(tf.argmax(predictions[:, -1:, :], axis=-1), tf.int32)
        if tf.equal(next_id, self.tokenizer_en.vocab_size + 1):
            # English end token predicted — decoding is finished.
            return tf.squeeze(output, axis=0), attention_weights
        # Feed the prediction back in as the next decoder input.
        output = tf.concat([output, next_id], axis=-1)
    return tf.squeeze(output, axis=0), attention_weights
def evaluate(inp_sentence, transformer=model):
    """Greedily decode `inp_sentence` (a list of source token ids) for up to
    200 steps, stopping early on the Korean end token.

    Returns:
        (decoded id sequence without the batch dimension, attention weights
        of the final decoding step).
    """
    wrapped = [OLD_VOCAB_SIZE] + inp_sentence + [OLD_VOCAB_SIZE + 1]
    encoder_input = tf.expand_dims(wrapped, 0)
    output = tf.expand_dims([KOR_VOCAB_SIZE], 0)
    for _ in range(200):
        masks = create_masks(encoder_input, output)
        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input, output, False, *masks)
        # Last position only: (batch_size, 1, vocab_size).
        next_id = tf.cast(tf.argmax(predictions[:, -1:, :], axis=-1), tf.int32)
        if next_id == KOR_VOCAB_SIZE + 1:
            return tf.squeeze(output, axis=0), attention_weights
        output = tf.concat([output, next_id], axis=-1)
    return tf.squeeze(output, axis=0), attention_weights
def train_epoch(epoch, model, dataloader, optimizer, sched=None):
    """Train for one epoch over `dataloader`, printing the running average
    loss roughly 100 times per epoch and saving a checkpoint at the end.

    Args:
        epoch: zero-based epoch index (printed as epoch + 1).
        model: seq2seq model mapping SMILES to IUPAC tokens.
        dataloader: yields (smiles, iupac_in, iupac_out, smiles_lens, iupac_lens).
        optimizer: PyTorch optimizer.
        sched: optional LR scheduler, stepped after every optimizer step.
    """
    model.train()
    start = time.time()
    total_loss = 0
    # Print ~100 progress updates per epoch (at least every batch).
    print_every = max(1, int(len(dataloader) / 100.0))
    for i, (smiles, iupac_in, iupac_out, smiles_lens, iupac_lens) in enumerate(dataloader):
        smiles = smiles.to(DEVICE)
        iupac_in = iupac_in.to(DEVICE)
        iupac_out = iupac_out.to(DEVICE)
        optimizer.zero_grad()
        smiles_mask, iupac_mask = create_masks(smiles, iupac_in, device=DEVICE)
        preds = model(smiles, iupac_in, smiles_mask, iupac_mask)
        # Token-level cross entropy; padding positions are ignored.
        loss = torch.nn.functional.cross_entropy(preds.view(-1, preds.size(-1)),
                                                 iupac_out.view(-1),
                                                 ignore_index=ord(EXTRA_CHARS['pad']))
        loss.backward()
        optimizer.step()
        if sched:
            sched.step()
        total_loss += loss.item()
        if (i+1) % print_every == 0:
            avg_loss = total_loss / float(print_every)
            print_progress((time.time() - start)//60, epoch+1, i+1, avg_loss)
            total_loss = 0
    # Final partial-window average.
    # NOTE(review): when (i+1) % print_every == 0 this divides by 1, but
    # total_loss was just reset to 0, so the trailing average prints as 0 —
    # confirm whether that is the intended behavior.
    avg_loss = total_loss / max(1, (i+1) % print_every)
    print_progress((time.time() - start)//60, epoch+1, i+1, avg_loss)
    save(epoch, model, optimizer)
model.load_state_dict(checkpoint['state_dict'], strict=False) except AttributeError as e: model = nn.DataParallel(model) model.load_state_dict(checkpoint['state_dict'], strict=False) print("Pretrained weights loaded") try: encoder = model.module.encoder except AttributeError as e: encoder = model.encoder embeddings = [] with torch.no_grad(): for smiles in smiles_strings: encoded = encode_smiles(smiles) mask = create_masks(encoded) embedding = encoder(encoded, mask)[0].numpy() embeddings.append(embedding) print("embedded {0} into {1} matrix.".format(smiles, str(embedding.shape))) print("All SMILES strings embedded. Saving...") filename = os.path.splitext(os.path.basename(args.data_path))[0] out_dir = "embeddings/" out_file = os.path.join(out_dir, filename + ".npz") if not os.path.exists(out_dir): os.makedirs(out_dir) out_dict = { smiles: matrix
def call(self, inp, tar, training):
    """Two-stage summarization forward pass.

    Stage 1 (if `self.add_stage_1`): a teacher-forced decoder over the target
    embeddings, optionally mixed with a pointer-generator copy distribution.
    Stage 2 (if `self.add_stage_2`): every target position is masked in turn
    (tile-and-mask-diagonal), the masked sequences are re-encoded by BERT as
    context, decoded without a look-ahead mask, and the per-position outputs
    are gathered back into one sequence with a forced CLS row at position 0.

    Args:
        inp: (input_ids, input_mask, input_segment_ids), each (batch_size, seq_len).
        tar: (target_ids, target_mask, target_segment_ids), each (batch_size, seq_len + 1).
        training: flag forwarded to the decoder / pointer-generator.

    Returns:
        (logits, attention_dist, dec_outputs) from the last stage executed.
        NOTE(review): if neither stage flag is set, `logits`/`attention_dist`/
        `dec_outputs` are never assigned and the return raises — confirm the
        flags are guaranteed elsewhere.
    """
    # (batch_size, seq_len) x3
    input_ids, input_mask, input_segment_ids = inp
    # (batch_size, seq_len + 1) x3
    target_ids, target_mask, target_segment_ids = tar
    # (batch_size, 1, 1, seq_len), (_), (batch_size, 1, 1, seq_len)
    _, combined_mask, dec_padding_mask = create_masks(
        input_ids, target_ids[:, :-1])
    # (batch_size, seq_len, d_bert)
    enc_output = self.bert((input_ids, input_mask, input_segment_ids))
    if self.add_stage_1:
        # (batch_size, seq_len, d_bert)
        embeddings = self.embedding(target_ids[:, :-1])
        # (batch_size, seq_len, d_bert), (_)
        dec_outputs, attention_dist = self.decoder(embeddings, enc_output,
                                                   training, combined_mask,
                                                   dec_padding_mask)
        # (batch_size, seq_len, vocab_len)
        logits = self.final_layer(dec_outputs)
        if config.copy_gen:
            logits = self.pointer_generator(dec_outputs, logits, attention_dist,
                                            input_ids,
                                            tf.shape(input_ids)[1],
                                            tf.shape(target_ids[:, :-1])[1],
                                            training=training)
    if self.add_stage_2:
        N = tf.shape(enc_output)[0]
        T = self.output_seq_len
        # Teacher forcing: no autoregressive loop is needed here.
        # (batch_size x (seq_len - 1), seq_len)
        dec_inp_ids = tile_and_mask_diagonal(target_ids[:, :-1], mask_with=MASK_ID)
        # (batch_size x (seq_len - 1), seq_len)
        dec_inp_mask = tf.tile(target_mask[:, :-1], [T - 1, 1])
        # (batch_size x (seq_len - 1), seq_len)
        dec_inp_segment_ids = tf.tile(target_segment_ids[:, :-1], [T - 1, 1])
        # (batch_size x (seq_len - 1), seq_len, d_bert)
        enc_output = tf.tile(enc_output, [T - 1, 1, 1])
        # (batch_size x (seq_len - 1), 1, 1, seq_len)
        padding_mask = tf.tile(dec_padding_mask, [T - 1, 1, 1, 1])
        # (batch_size x (seq_len - 1), seq_len, d_bert)
        context_vectors = self.bert(
            (dec_inp_ids, dec_inp_mask, dec_inp_segment_ids))
        # (batch_size x (seq_len - 1), seq_len, d_bert), (_)
        dec_outputs, attention_dist = self.decoder(
            context_vectors, enc_output, training,
            look_ahead_mask=None, padding_mask=padding_mask)
        # (batch_size x (seq_len - 1), seq_len - 1, d_bert) — drop position 0.
        dec_outputs = dec_outputs[:, 1:, :]
        # Select, for each tiled copy, the output at its own masked position:
        # build a block-diagonal indicator and gather those rows.
        # (batch_size x (seq_len - 1), (seq_len - 1))
        diag = tf.linalg.set_diag(tf.zeros([T - 1, T - 1]), tf.ones([T - 1]))
        diag = tf.tile(diag, [N, 1])
        where = tf.not_equal(diag, 0)
        indices = tf.where(where)
        # (batch_size x (seq_len - 1), d_bert)
        dec_outputs = tf.gather_nd(dec_outputs, indices)
        # (batch_size, seq_len - 1, d_bert)
        dec_outputs = tf.reshape(dec_outputs, [N, T - 1, -1])
        # Prepend a one-hot CLS row: (batch_size, seq_len, d_bert).
        dec_outputs = tf.concat([
            tf.tile(
                tf.expand_dims(tf.one_hot([CLS_ID], self.d_model), axis=0),
                [N, 1, 1]),
            dec_outputs
        ], axis=1)
        # (batch_size, seq_len, vocab_len)
        logits = self.final_layer(dec_outputs)
    return logits, attention_dist, dec_outputs