Example #1
 def val_step(inp, tar):
     tar_inp = tar[:, :-1]
     tar_real = tar[:, 1:]
     inp_inp = inp[:, :-1]
     inp_real = inp[:, 1:]
     enc_padding_mask1, combined_mask1, dec_padding_mask1 = create_masks(inp, tar_inp)
     enc_padding_mask2, combined_mask2, dec_padding_mask2 = create_masks(tar, inp_inp)
     predictions1, _ = transformer1(inp, tar_inp, True, enc_padding_mask1, combined_mask1, dec_padding_mask1)
     loss1 = loss_function(tar_real, predictions1) # this is de->en
     if USE_RTL:
         predictions2, _ = transformer2(tar, inp_inp, True, enc_padding_mask2, combined_mask2, dec_padding_mask2)
         loss2 = loss_function(inp_real, predictions2) # this is en->de
         predicted_id2 = tf.argmax(predictions2, axis=-1) # find most likely token from logits
         inp2 = tf.concat([inp[:, 0:1], predicted_id2], axis=-1) # add start token. inp2 is \hat{s} in the paper
         predicted_id1 = tf.argmax(predictions1, axis=-1) # find most likely token from logits
         tar2 = tf.concat([tar[:, 0:1], predicted_id1], axis=-1) # add start token. tar2 is \hat{t} in the paper
         enc_padding_mask3, combined_mask3, dec_padding_mask3 = create_masks(inp2, tar_inp)
         enc_padding_mask4, combined_mask4, dec_padding_mask4 = create_masks(tar2, inp_inp)
         predictions3, _ = transformer1(inp2, tar_inp, True, enc_padding_mask3, combined_mask3, dec_padding_mask3)
         loss3 = loss_function(tar_real, predictions3) # predictions3 is \tilde{t} in the paper
         predictions4, _ = transformer2(tar2, inp_inp, True, enc_padding_mask4, combined_mask4, dec_padding_mask4)
         loss4 = loss_function(inp_real, predictions4) # predictions4 is \tilde{s} in the paper
         loss = loss1 + loss2 + LAMBDA * (loss3 + loss4)            
     else:
         loss = loss1
     val_loss(loss)
     val_accuracy(tar_real, predictions1)
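
Nearly every snippet on this page calls a create_masks helper that the page itself does not show. Below is a minimal sketch of the version used in the standard TensorFlow Transformer tutorial; the exact signature and padding id vary between the projects above.

import tensorflow as tf

def create_padding_mask(seq, pad_id=0):
    # 1.0 at padding positions, shaped (batch_size, 1, 1, seq_len)
    # so it broadcasts over the attention logits
    mask = tf.cast(tf.math.equal(seq, pad_id), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    # strictly upper-triangular matrix that hides future positions
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)

def create_masks(inp, tar, pad_id=0):
    # encoder self-attention and encoder-decoder attention mask the source padding
    enc_padding_mask = create_padding_mask(inp, pad_id)
    dec_padding_mask = create_padding_mask(inp, pad_id)
    # decoder self-attention combines the look-ahead mask with the target padding mask
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar, pad_id)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask
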
Example #2
def train_step(inp, tar, grad_accum_flag):
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  with tf.GradientTape() as tape:
    predictions, attention_weights, dec_output = model(
                                                       inp, 
                                                       tar_inp, 
                                                       enc_padding_mask, 
                                                       combined_mask, 
                                                       dec_padding_mask,
                                                       training=True
                                                       )
    train_variables = model.trainable_variables
    loss = loss_function(tar_real, predictions)
    scaled_loss = optimizer.get_scaled_loss(loss)
  scaled_gradients  = tape.gradient(scaled_loss, train_variables)
  gradients = optimizer.get_unscaled_gradients(scaled_gradients)
  # Initialize the shadow variables with the same shape and type as the gradients
  if not accumulators:
    for tv in gradients:
      accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
  # accumulate the gradients into the shadow variables
  for (accumulator, grad) in zip(accumulators, gradients):
    accumulator.assign_add(grad)
  # apply the gradients and reset them to zero if the flag is true
  if grad_accum_flag:
    for accumulator in accumulators:
      accumulator.assign(tf.math.divide(accumulator, h_parms.accumulation_steps))
    optimizer.apply_gradients(zip(accumulators, train_variables))
    for accumulator in (accumulators):
        accumulator.assign(tf.zeros_like(accumulator))
  train_loss(loss)
  train_accuracy(tar_real, predictions)  
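
The train_step above only applies the accumulated gradients when grad_accum_flag is true. A rough sketch of how the flag could be driven from the outer training loop, assuming a train_dataset iterator (not shown in the original) and the h_parms.accumulation_steps setting that project already uses:

accumulators = []  # shadow variables, created lazily inside train_step

for step, (inp, tar) in enumerate(train_dataset, start=1):
    # apply the averaged gradients once every accumulation_steps batches
    grad_accum_flag = (step % h_parms.accumulation_steps == 0)
    train_step(inp, tar, grad_accum_flag)
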
def evaluate_batch(model, inputs, tokenizer_tar, max_length):
    encoder_input = tf.convert_to_tensor(inputs)
    decoder_input = tf.expand_dims([tokenizer_tar.bos_token_id] *
                                   inputs.shape[0],
                                   axis=1)
    output = decoder_input
    attention_weights = None

    for _ in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = model(encoder_input, output, False,
                                               enc_padding_mask, combined_mask,
                                               dec_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if (predicted_id == tokenizer_tar.eos_token_id).numpy().all():
            return output, attention_weights
            # return tf.squeeze(output, axis=0), attention_weights

        # concatenate the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

    return output, attention_weights
def val_step(inp, tar, inp_shape, tar_shape, batch):
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  predictions, attention_weights, dec_output = transformer(
                                                           inp, 
                                                           tar_inp, 
                                                           False, 
                                                           enc_padding_mask, 
                                                           combined_mask, 
                                                           dec_padding_mask
                                                           )
  if config.copy_gen:
    predictions = pointer_generator(
                            dec_output, 
                            predictions, 
                            attention_weights, 
                            inp, 
                            inp_shape, 
                            tar_shape, 
                            batch, 
                            training=False
                            )
  loss = loss_function(tar_real, predictions)
  validation_loss(loss)
  validation_accuracy(tar_real, predictions)
    def call(self, input_ids, target_ids, training):

        # (batch_size, 1, 1, seq_len), (batch_size, 1, 1, seq_len)
        _, combined_mask, dec_padding_mask = create_masks(
            input_ids, target_ids[:, :-1])

        # (batch_size, seq_len, d_bert)
        enc_output = self.bert_model(input_ids)[0]

        # (batch_size, seq_len, vocab_len), _
        draft_logits, draft_attention_dist = self.draft_summary(
            input_ids,
            enc_output=enc_output,
            look_ahead_mask=combined_mask,
            padding_mask=dec_padding_mask,
            target_ids=target_ids[:, :-1],
            training=True)

        # (batch_size, seq_len, vocab_len), _
        refine_logits, refine_attention_dist = self.refine_summary(
            input_ids,
            enc_output=enc_output,
            target=target_ids[:, :-1],
            padding_mask=dec_padding_mask,
            training=True)

        return draft_logits, draft_attention_dist, refine_logits, refine_attention_dist
def train_step(inp, tar, inp_shape, tar_shape, batch):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, attention_weights, dec_output = transformer(
            inp, tar_inp, True, enc_padding_mask, combined_mask,
            dec_padding_mask)
        train_variables = transformer.trainable_variables
        tf.debugging.check_numerics(predictions,
                                    "Nan's in the transformer predictions")
        if config.copy_gen:
            predictions = pointer_generator(dec_output,
                                            predictions,
                                            attention_weights,
                                            inp,
                                            inp_shape,
                                            tar_shape,
                                            batch,
                                            training=True)
            tf.debugging.check_numerics(
                predictions, "Nan's in the pointer_generator predictions")
        train_variables = train_variables + pointer_generator.trainable_variables
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, train_variables)
    optimizer.apply_gradients(zip(gradients, train_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)
def val_step(inp, tar, epoch, inp_shape, tar_shape, batch, create_summ):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        inp, tar_inp)
    predictions, attention_weights, dec_output = transformer(
        inp, tar_inp, False, enc_padding_mask, combined_mask, dec_padding_mask)
    if config.copy_gen:
        predictions = pointer_generator(dec_output,
                                        predictions,
                                        attention_weights,
                                        inp,
                                        inp_shape,
                                        tar_shape,
                                        batch,
                                        training=False)
    loss = loss_function(tar_real, predictions)
    validation_loss(loss)
    validation_accuracy(tar_real, predictions)
    if create_summ:
        rouge, bert = tf_write_summary(tar_real, predictions, inp[:, 1:],
                                       epoch)
    else:
        rouge, bert = (1.0, 1.0)
    return (rouge, bert)
        def decoder_query(output):
            enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
                doc_input, output)
            predictions, attention_weights, dec_output = model(
                doc_input, output, enc_padding_mask, combined_mask,
                dec_padding_mask, True)

            return (predictions[:, -1:, :])
    def decoder_query(output):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)
        predictions, attention_weights, dec_output = model(
            encoder_input, output, enc_padding_mask, combined_mask,
            dec_padding_mask, False)

        # (batch_size, 1, target_vocab_size)
        return (predictions[:, -1:, :])
        def symbols_to_logits(output):
            batched_input = tf.tile(encoder_input, [beam_width, 1])
            enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
                batched_input, output)
            predictions, attention_weights = transformer2(
                batched_input, output, False, enc_padding_mask, combined_mask,
                dec_padding_mask)
            predictions = predictions[:, -1, :]

            return predictions
def draft_summary_sampling(model,
                           inp,
                           enc_output,
                           look_ahead_mask,
                           padding_mask,
                           sampling_type='greedy',
                           temperature=0.9,
                           p=0.9,
                           k=25,
                           training=False):
    """
    Inference call, builds a draft summary auto-regressively
    """
    log.info(f"Building: 'Draft {sampling_type} decoder'")
    N = tf.shape(enc_output)[0]
    T = tf.shape(enc_output)[1]

    # (batch_size, 1)
    dec_input = tf.ones([N, 1], dtype=tf.int32) * CLS_ID
    summary, dec_outputs, dec_logits, attention_dists = [], [], [], []
    summary += [dec_input]
    for i in (range(0, config.summ_length)):
        _, _, dec_padding_mask = create_masks(inp, dec_input)
        # (batch_size, i+1, d_bert)
        embeddings = model.embedding(dec_input)

        # (batch_size, i+1, vocab), (_)
        dec_output, attention_dist = model.decoder(inp, embeddings, enc_output,
                                                   training, look_ahead_mask,
                                                   padding_mask)

        # (batch_size, 1, vocab)
        dec_output_i = dec_output[:, -1:, :]
        if sampling_type == 'nucleus':
            preds = tf.cast(
                nucleus_sampling(((dec_output_i) / temperature), p=p),
                tf.int32)
        elif sampling_type == 'topk':
            preds = tf.cast(
                top_k_sampling(((dec_output_i) / temperature), k=k), tf.int32)
        elif sampling_type == 'random_sampling':
            preds = tf.cast(sampling((dec_output_i) / temperature), tf.int32)
        elif sampling_type == 'topktopp':
            preds = tf.cast(
                topp_topk(((dec_output_i) / temperature), p=p, k=k), tf.int32)
        else:
            preds = tf.cast(tf.argmax(dec_output_i, axis=-1), tf.int32)
        dec_outputs += [dec_output_i]
        #dec_logits_i = dec_logits_i[:, -1:, :]
        #dec_logits += [dec_logits_i]
        summary += [preds]
        dec_input = with_column(dec_input, i + 1, preds)
    summary = tf.concat(summary, axis=1)
    # (batch_size, seq_len), (_)
    return summary, attention_dist
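
draft_summary_sampling delegates to sampling helpers (top_k_sampling, nucleus_sampling, topp_topk, sampling) that are not shown on this page. As an illustration only, a minimal top-k sampler over the last-step logits of shape (batch_size, 1, vocab) could look like this; the actual helpers in the source project may differ.

import tensorflow as tf

def top_k_sampling(logits, k=25):
    # hypothetical sketch, not the original project's implementation
    logits = tf.squeeze(logits, axis=1)                    # (batch_size, vocab)
    top_values, _ = tf.math.top_k(logits, k=k)
    kth_value = top_values[:, -1, tf.newaxis]              # k-th largest per row
    # mask everything below the k-th largest logit, then sample from the rest
    masked = tf.where(logits < kth_value,
                      tf.fill(tf.shape(logits), logits.dtype.min),
                      logits)
    return tf.random.categorical(masked, num_samples=1)    # (batch_size, 1)
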
Example #12
 def beam_search_decoder(target_ids):
     _, combined_mask, dec_padding_mask = create_masks(
         input_ids, target_ids)
     draft_logits, _ = model.draft_summary(input_ids=input_ids,
                                           enc_output=enc_output,
                                           look_ahead_mask=combined_mask,
                                           padding_mask=dec_padding_mask,
                                           target_ids=target_ids,
                                           training=False)
     # (batch_size, 1, target_vocab_size)
     return (draft_logits[:, -1:, :])
Example #13
def run_inference(model, dataset, beam_sizes_to_try=config.beam_sizes):
    for beam_size in beam_sizes_to_try:
        ref_sents = []
        hyp_sents = []
        for (doc_id, (input_ids, _, _, target_ids, _,
                      _)) in enumerate(dataset, 1):
            start_time = time.time()
            # translated_output_temp[0] (batch, beam_size, summ_length+1)
            translated_output_temp, enc_output = draft_decoded_summary(
                model, input_ids, target_ids[:, :-1], beam_size)
            draft_predictions = translated_output_temp[0][:, 0, :]
            _, _, dec_padding_mask = create_masks(input_ids,
                                                  target_ids[:, :-1])
            refined_summary, attention_dists = refined_summary_greedy(
                model,
                input_ids,
                enc_output,
                draft_predictions,
                dec_padding_mask,
                training=False)
            sum_ref = tokenizer.convert_ids_to_tokens(
                [i for i in tf.squeeze(target_ids) if i not in [0, 101, 102]])
            sum_hyp = tokenizer.convert_ids_to_tokens([
                i for i in tf.squeeze(refined_summary)
                if i not in [0, 101, 102]
            ])
            sum_ref = convert_wordpiece_to_words(sum_ref)
            sum_hyp = convert_wordpiece_to_words(sum_hyp)
            print('Original summary: {}'.format(sum_ref))
            print('Predicted summary: {}'.format(sum_hyp))
            if sum_ref and sum_hyp:
                ref_sents.append(sum_ref)
                hyp_sents.append(sum_hyp)
        try:
            rouges = rouge_all.get_scores(ref_sents, hyp_sents)
            avg_rouge_f1 = np.mean([
                np.mean([
                    rouge_scores['rouge-1']["f"], rouge_scores['rouge-2']["f"],
                    rouge_scores['rouge-l']["f"]
                ]) for rouge_scores in rouges
            ])
            _, _, bert_f1 = b_score(ref_sents,
                                    hyp_sents,
                                    lang='en',
                                    model_type=config.pretrained_bert_model)
            avg_bert_f1 = np.mean(bert_f1.numpy())
        except:
            avg_rouge_f1 = 0
            avg_bert_f1 = 0
        print(infer_template.format(beam_size, avg_rouge_f1, avg_bert_f1))
        print(f'time to process document {doc_id} : {time.time()-start_time}')
Example #14
 def train_step(inp, tar):
     
     tar_inp = tar[:, :-1]
     tar_real = tar[:, 1:]
     inp_inp = inp[:, :-1]
     inp_real = inp[:, 1:]
     enc_padding_mask1, combined_mask1, dec_padding_mask1 = create_masks(inp, tar_inp)
     enc_padding_mask2, combined_mask2, dec_padding_mask2 = create_masks(tar, inp_inp)
     with tf.GradientTape() as tape:
         predictions1, _ = transformer1(inp, tar_inp, True, enc_padding_mask1, combined_mask1, dec_padding_mask1)
         loss1 = loss_function(tar_real, predictions1) # this is de->en
         if USE_RTL:
             predictions2, _ = transformer2(tar, inp_inp, True, enc_padding_mask2, combined_mask2, dec_padding_mask2)
             loss2 = loss_function(inp_real, predictions2) # this is en->de
             loss = loss1 + loss2
             if LAMBDA>0:
                 predicted_id2 = tf.argmax(predictions2, axis=-1) # find most likely token from logits
                 inp2 = tf.concat([inp[:, 0:1], predicted_id2], axis=-1) # add start token. inp2 is \hat{s} in the paper
                 predicted_id1 = tf.argmax(predictions1, axis=-1) # find most likely token from logits
                 tar2 = tf.concat([tar[:, 0:1], predicted_id1], axis=-1) # add start token. tar2 is \hat{t} in the paper
                 enc_padding_mask3, combined_mask3, dec_padding_mask3 = create_masks(inp2, tar_inp)
                 enc_padding_mask4, combined_mask4, dec_padding_mask4 = create_masks(tar2, inp_inp)
                 predictions3, _ = transformer1(inp2, tar_inp, True, enc_padding_mask3, combined_mask3, dec_padding_mask3)
                 loss3 = loss_function(tar_real, predictions3) # predictions3 is \tilde{t} in the paper
                 predictions4, _ = transformer2(tar2, inp_inp, True, enc_padding_mask4, combined_mask4, dec_padding_mask4)
                 loss4 = loss_function(inp_real, predictions4) # predictions4 is \tilde{s} in the paper
                 loss += LAMBDA * (loss3 + loss4)
         else:
             loss = loss1
     if USE_RTL:        
         gradients = tape.gradient(loss, [transformer1.trainable_variables, transformer2.trainable_variables])
         optimizer.apply_gradients(zip(gradients[0] + gradients[1], transformer1.trainable_variables + transformer2.trainable_variables))
     else:
         gradients = tape.gradient(loss, transformer1.trainable_variables)
         optimizer.apply_gradients(zip(gradients, transformer1.trainable_variables))
     
     train_loss(loss)
     train_accuracy(tar_real, predictions1)
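
The loss_function shared by most of these snippets is typically the padding-masked sparse categorical cross-entropy from the TensorFlow Transformer tutorial. A minimal sketch, assuming padding id 0 and logits as predictions; individual projects may add label smoothing or a different pad id.

import tensorflow as tf

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred, pad_id=0):
    # average the per-token loss over non-padding positions only
    mask = tf.math.logical_not(tf.math.equal(real, pad_id))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)
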
    def transformer_query(output):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)
        predictions, attention_weights, dec_output = transformer(
            encoder_input, output, False, enc_padding_mask, combined_mask,
            dec_padding_mask)

        if config.copy_gen:
            predictions = generator(dec_output, predictions, attention_weights,
                                    encoder_input, inp_shape, output.shape[-1],
                                    batch, False)

        # select the last sequence
        return (predictions[:, -1:, :])  # (batch_size, 1, target_vocab_size)
def val_step(model, loss_object, inp, tar, val_loss, val_accuracy,
             pad_token_id):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        inp, tar_inp)

    predictions, _ = model(inp, tar_inp, False, enc_padding_mask,
                           combined_mask, dec_padding_mask)
    loss = loss_function(tar_real, predictions, loss_object, pad_token_id)

    val_loss(loss)
    val_accuracy(tar_real, predictions)
    def call(self, inp, tar, training):
        # (batch_size, seq_len) x3
        input_ids, input_mask, input_segment_ids = inp

        # (batch_size, seq_len + 1) x3
        target_ids, target_mask, target_segment_ids = tar

        # (batch_size, 1, 1, seq_len), (_), (batch_size, 1, 1, seq_len)
        _, combined_mask, dec_padding_mask = create_masks(
            input_ids, target_ids[:, :-1])

        # (batch_size, seq_len, d_bert)
        enc_output = self.bert((input_ids, input_mask, input_segment_ids))

        # (batch_size, seq_len, d_bert)
        embeddings = self.embedding(target_ids[:, :-1])

        draft_logits,\
        draft_attention_dist,\
        draft_dec_outputs = self.draft_summary(
                                                embeddings,
                                                enc_output,
                                                combined_mask,
                                                dec_padding_mask,
                                                target_ids[:, :-1],
                                                training
                                              )

        if config.copy_gen:
            draft_logits = self.pointer_generator(
                draft_dec_outputs, draft_logits, draft_attention_dist,
                input_ids,
                tf.shape(input_ids)[1],
                tf.shape(target_ids[:, :-1])[1], training)


        refine_logits,\
        refine_attention_dist,\
        refine_dec_outputs = self.refine_summary(
                                                  enc_output,
                                                  (target_ids[:, :-1], target_mask[:, :-1],
                                                   target_segment_ids[:, :-1]),
                                                  dec_padding_mask,
                                                  training
                                                 )

        return (draft_logits, draft_attention_dist, draft_dec_outputs,
                refine_logits, refine_attention_dist, refine_dec_outputs)
Example #18
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, _ = model(inp, tar_inp, True, enc_padding_mask,
                               combined_mask, dec_padding_mask)

        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)
Example #19
def validate(model, data, mode="valid"):
    valid_loss = tf.keras.metrics.Mean(name='valid_loss')
    valid_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='valid_accuracy')

    for inp, tar in data.batcher(mode=mode):
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]

        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            inp, tar_inp)
        predictions, _ = model(inp, tar_inp, False, enc_padding_mask,
                               combined_mask, dec_padding_mask)
        loss = loss_function(tar_real, predictions)

        valid_loss(loss)
        valid_accuracy(tar_real, predictions)
    return valid_accuracy.result().numpy(), valid_loss.result().numpy()
    def call(self, input_ids, input_mask, input_segment_ids, target_ids,
             target_mask, target_segment_ids, training):
        # (batch_size, seq_len) x3
        #input_ids, input_mask, input_segment_ids = inp

        # (batch_size, seq_len + 1) x3
        #target_ids, target_mask, target_segment_ids = tar

        # (batch_size, 1, 1, seq_len), (_), (batch_size, 1, 1, seq_len)
        _, combined_mask, dec_padding_mask = create_masks(
            input_ids, target_ids[:, :-1])

        # (batch_size, seq_len, d_bert)
        enc_output = self.bert_model(input_ids)[
            0]  #, input_mask, input_segment_ids)

        draft_logits, draft_attention_dist, draft_dec_outputs = self.draft_summary(
            enc_output=enc_output,
            look_ahead_mask=combined_mask,
            padding_mask=dec_padding_mask,
            target_ids=target_ids[:, :-1],
            training=True)

        if config.copy_gen:
            draft_logits = self.pointer_generator(draft_dec_outputs,
                                                  draft_logits,
                                                  draft_attention_dist,
                                                  input_ids,
                                                  tf.shape(input_ids)[1],
                                                  tf.shape(
                                                      target_ids[:, :-1])[1],
                                                  training=training)

        # (batch_size, seq_len, vocab_len), (batch_size, seq_len), (_)
        refine_logits, refine_attention_dist, refine_dec_outputs = self.refine_summary(
            enc_output=enc_output,
            target=(target_ids[:, :-1], target_mask[:, :-1],
                    target_segment_ids[:, :-1]),
            padding_mask=dec_padding_mask,
            training=True)

        return draft_logits, draft_attention_dist, draft_dec_outputs, refine_logits, refine_attention_dist, refine_dec_outputs
def train_step(model, loss_object, optimizer, inp, tar, train_loss,
               train_accuracy, pad_token_id):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        inp, tar_inp)

    with tf.GradientTape() as tape:
        # training=True is only needed if there are layers with different
        # behavior during training versus inference (e.g. Dropout).
        predictions, _ = model(inp, tar_inp, True, enc_padding_mask,
                               combined_mask, dec_padding_mask)
        loss = loss_function(tar_real, predictions, loss_object, pad_token_id)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)
def val_step(inp, tar, epoch, create_summ):
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  predictions, attention_weights, dec_output = model(
                                                     inp, 
                                                     tar_inp, 
                                                     enc_padding_mask, 
                                                     combined_mask, 
                                                     dec_padding_mask,
                                                     training=False
                                                     )
  loss = loss_function(tar_real, predictions)
  validation_loss(loss)
  validation_accuracy(tar_real, predictions)
  if create_summ: 
    rouge, bert = tf_write_summary(tar_real, predictions, inp[:, 1:], epoch)  
  else: 
    rouge, bert = (1.0, 1.0)  
  return (rouge, bert)
def train_step(inp, tar):
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  with tf.GradientTape() as tape:
    predictions, attention_weights, dec_output = transformer(
                                                             inp, 
                                                             tar_inp, 
                                                             enc_padding_mask, 
                                                             combined_mask, 
                                                             dec_padding_mask,
                                                             training=True
                                                             )
    train_variables = transformer.trainable_variables
    loss = loss_function(tar_real, predictions)
    scaled_loss = optimizer.get_scaled_loss(loss)
  scaled_gradients = tape.gradient(scaled_loss, train_variables)
  gradients = optimizer.get_unscaled_gradients(scaled_gradients)
  optimizer.apply_gradients(zip(gradients, train_variables))
  train_loss(loss)
  train_accuracy(tar_real, predictions)  
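
The get_scaled_loss / get_unscaled_gradients calls in the mixed-precision train_step snippets assume the optimizer is wrapped in a LossScaleOptimizer. A minimal setup sketch for recent TensorFlow versions (older releases expose the same wrapper under tf.keras.mixed_precision.experimental):

import tensorflow as tf

# compute in float16 while keeping float32 variables
tf.keras.mixed_precision.set_global_policy('mixed_float16')

base_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
# dynamic loss scaling protects small float16 gradients from underflow
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(base_optimizer)
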
Example #24
 def beam_search_decoder(output):
   _, _, dec_padding_mask = create_masks(input_ids, output)    
   embeddings = model.embedding(output)
   predictions, dec_op, attention_weights = model.decoder(
                                                         input_ids, 
                                                         embeddings, 
                                                         enc_output, 
                                                         False, 
                                                         None, 
                                                         dec_padding_mask
                                                         )
   if config.copy_gen:
     predictions = model.decoder.pointer_generator(
                                                   dec_op, 
                                                   predictions,
                                                   attention_weights,
                                                   input_ids,
                                                   tf.shape(input_ids)[1], 
                                                   tf.shape(output)[-1], 
                                                   False
                                                  )
   # (batch_size, 1, target_vocab_size)
   return (predictions[:,-1:,:])
    def train(self, examples, is_train=True):
        src_token_ids = examples["src_token_ids"]
        tgt_token_ids = examples["tgt_token_ids"]
        tgt_edges = examples["tgt_edges"]

        # enc_padding_mask: (batch_size, 1, 1, src_seq_len)
        # combined_mask: (batch_size, 1, tgt_seq_len, tgt_seq_len)
        # dec_padding_mask: (batch_size, 1, 1, src_seq_len)
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            src_token_ids, tgt_token_ids,
            self.src_vocab.token2idx[self.src_vocab.PAD],
            self.tgt_vocab.token2idx[self.tgt_vocab.PAD])

        # (batch_size, src_seq_len, d_model)
        enc_output = self.encoder(src_token_ids, is_train, enc_padding_mask)

        # dec_output.shape == (batch_size, tgt_seq_len, tgt_vocab_size+src_seq_len)
        dec_output, _, edge_scores = self.decoder(tgt_token_ids,
                                                  enc_output,
                                                  is_train,
                                                  combined_mask,
                                                  dec_padding_mask,
                                                  tgt_edges=tgt_edges)
        # prepend the BOS token
        # (batch_size, 1)
        start_token = tf.expand_dims(tgt_token_ids[:, 0], axis=-1)
        # (batch_size, 1, tgt_vocab_size + src_seq_len)
        start_token_onehot = tf.one_hot(start_token,
                                        depth=(self.tgt_vocab_size +
                                               self.src_seq_len))
        start_token_logits = start_token_onehot + (start_token_onehot -
                                                   1) * 1e9
        dec_output = tf.concat([start_token_logits, dec_output[:, :-1, :]],
                               axis=1)

        # (batch_size, tgt_seq_len, tgt_vocab_size+src_seq_len)
        return dec_output, edge_scores
    def evaluate(self, inp_sentence):
        start_token = [self.tokenizer_pt.vocab_size]
        end_token = [self.tokenizer_pt.vocab_size + 1]

        # inp sentence is portuguese, hence adding the start and end token
        inp_sentence = start_token + self.tokenizer_pt.encode(
            inp_sentence) + end_token
        encoder_input = tf.expand_dims(inp_sentence, 0)

        # as the target is english, the first word to the transformer should be the
        # english start token.
        decoder_input = [self.tokenizer_en.vocab_size]
        output = tf.expand_dims(decoder_input, 0)

        for i in range(self.MAX_LENGTH):
            enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
                encoder_input, output)

            # predictions.shape == (batch_size, seq_len, vocab_size)
            predictions, attention_weights = self.translate_transformer(
                encoder_input, output, False, enc_padding_mask, combined_mask,
                dec_padding_mask)

            # select the last word from the seq_len dimension
            predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

            predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

            # return the result if the predicted_id is equal to the end token
            if tf.equal(predicted_id, self.tokenizer_en.vocab_size + 1):
                return tf.squeeze(output, axis=0), attention_weights

            # concatenate the predicted_id to the output which is given to the decoder
            # as its input.
            output = tf.concat([output, predicted_id], axis=-1)

        return tf.squeeze(output, axis=0), attention_weights
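
A typical way to use an evaluate method like the one above is to decode the returned ids with the target tokenizer, filtering out the added start/end tokens. A hedged usage sketch, where translator stands for a hypothetical instance of the (unnamed) class that defines evaluate:

# 'translator' is a hypothetical instance of the class shown above
result, attention_weights = translator.evaluate(
    "este é um problema que temos que resolver.")

# keep only ids inside the subword vocabulary (drops the start/end tokens)
predicted_text = translator.tokenizer_en.decode(
    [i for i in result.numpy() if i < translator.tokenizer_en.vocab_size])
print(predicted_text)
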
Example #27
def evaluate(inp_sentence, transformer=model):

    # start = time()

    start_token = OLD_VOCAB_SIZE
    end_token = OLD_VOCAB_SIZE+1
    
    inp_sentence = [start_token] + inp_sentence + [end_token]
    encoder_input = tf.expand_dims(inp_sentence, 0)

    decoder_input = [KOR_VOCAB_SIZE]
    output = tf.expand_dims(decoder_input, 0)
    
    for i in range(200):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input, 
                                                     output,
                                                     False,
                                                     enc_padding_mask,
                                                     combined_mask,
                                                     dec_padding_mask)

        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        if predicted_id == KOR_VOCAB_SIZE+1:
            # print("model predict time :", time()-start)
            return tf.squeeze(output, axis=0), attention_weights

        output = tf.concat([output, predicted_id], axis=-1)

    # print("model predict time :", time()-start)
    return tf.squeeze(output, axis=0), attention_weights
def train_epoch(epoch, model, dataloader, optimizer, sched=None):
    model.train()
    start = time.time()
    total_loss = 0
    print_every = max(1, int(len(dataloader) / 100.0))
    
    for i, (smiles, iupac_in, iupac_out, smiles_lens, iupac_lens) in enumerate(dataloader):
        smiles = smiles.to(DEVICE)
        iupac_in = iupac_in.to(DEVICE)
        iupac_out = iupac_out.to(DEVICE)
        
        optimizer.zero_grad()
        
        smiles_mask, iupac_mask = create_masks(smiles, iupac_in, device=DEVICE)
        preds = model(smiles, iupac_in, smiles_mask, iupac_mask)
        
        loss = torch.nn.functional.cross_entropy(preds.view(-1, preds.size(-1)), iupac_out.view(-1), ignore_index=ord(EXTRA_CHARS['pad']))
        #print(loss, preds)
        loss.backward()
        optimizer.step()
        if sched:
            sched.step()
            
        total_loss += loss.item()
        
        if (i+1) % print_every == 0:
            avg_loss = total_loss / float(print_every)
            print_progress((time.time() - start)//60, epoch+1, i+1, avg_loss)
            total_loss = 0
            
        #if (i+1) % SAVE_ITERS == 0:
        #    save(epoch, i+1, NAME, model, optimizer)
       
    avg_loss = total_loss / max(1, (i+1) % print_every)
    print_progress((time.time() - start)//60, epoch+1, i+1, avg_loss)
    save(epoch, model, optimizer)
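
The PyTorch snippets use a create_masks with a (src, tgt) signature that returns two masks instead of three. A rough sketch of what such a helper commonly looks like; the padding id and exact shapes are assumptions, and in the project above the pad id would be ord(EXTRA_CHARS['pad']).

import torch

def create_masks(src, tgt=None, pad_id=0, device='cpu'):
    # hypothetical sketch; the helper in the source project may differ
    # (batch, 1, src_len): hide source padding positions
    src_mask = (src != pad_id).unsqueeze(-2).to(device)
    if tgt is None:
        return src_mask
    # (batch, 1, tgt_len): hide target padding positions
    tgt_pad_mask = (tgt != pad_id).unsqueeze(-2)
    # (1, tgt_len, tgt_len): hide future positions (no peeking ahead)
    tgt_len = tgt.size(1)
    no_peek = torch.tril(torch.ones(1, tgt_len, tgt_len, dtype=torch.bool,
                                    device=tgt.device))
    tgt_mask = (tgt_pad_mask & no_peek).to(device)
    return src_mask, tgt_mask
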
try:
    model.load_state_dict(checkpoint['state_dict'], strict=False)
except AttributeError as e:
    model = nn.DataParallel(model)
    model.load_state_dict(checkpoint['state_dict'], strict=False)
print("Pretrained weights loaded")

try:
    encoder = model.module.encoder
except AttributeError as e:
    encoder = model.encoder

embeddings = []
with torch.no_grad():
    for smiles in smiles_strings:
        encoded = encode_smiles(smiles)
        mask = create_masks(encoded)
        embedding = encoder(encoded, mask)[0].numpy()
        embeddings.append(embedding)
        print("embedded {0} into {1} matrix.".format(smiles,
                                                     str(embedding.shape)))

print("All SMILES strings embedded. Saving...")
filename = os.path.splitext(os.path.basename(args.data_path))[0]
out_dir = "embeddings/"
out_file = os.path.join(out_dir, filename + ".npz")

if not os.path.exists(out_dir):
    os.makedirs(out_dir)

out_dict = {
    smiles: matrix
Example #30
    def call(self, inp, tar, training):
        # (batch_size, seq_len) x3
        input_ids, input_mask, input_segment_ids = inp

        # (batch_size, seq_len + 1) x3
        target_ids, target_mask, target_segment_ids = tar

        # (batch_size, 1, 1, seq_len), (_), (batch_size, 1, 1, seq_len)
        _, combined_mask, dec_padding_mask = create_masks(
            input_ids, target_ids[:, :-1])

        # (batch_size, seq_len, d_bert)
        enc_output = self.bert((input_ids, input_mask, input_segment_ids))

        if self.add_stage_1:
            # (batch_size, seq_len, d_bert)
            embeddings = self.embedding(target_ids[:, :-1])
            #print(tf.shape(embeddings))
            # (batch_size, seq_len, d_bert), (_)
            dec_outputs, attention_dist = self.decoder(embeddings, enc_output,
                                                       training, combined_mask,
                                                       dec_padding_mask)
            # (batch_size, seq_len, vocab_len)
            logits = self.final_layer(dec_outputs)

            if config.copy_gen:
                logits = self.pointer_generator(dec_outputs,
                                                logits,
                                                attention_dist,
                                                input_ids,
                                                tf.shape(input_ids)[1],
                                                tf.shape(
                                                    target_ids[:, :-1])[1],
                                                training=training)

        if self.add_stage_2:
            N = tf.shape(enc_output)[0]
            T = self.output_seq_len
            # since we are using teacher forcing, we do not need an autoregressive mechanism here
            # (batch_size x (seq_len - 1), seq_len)
            dec_inp_ids = tile_and_mask_diagonal(target_ids[:, :-1],
                                                 mask_with=MASK_ID)
            # (batch_size x (seq_len - 1), seq_len)
            dec_inp_mask = tf.tile(target_mask[:, :-1], [T - 1, 1])
            # (batch_size x (seq_len - 1), seq_len)
            dec_inp_segment_ids = tf.tile(target_segment_ids[:, :-1],
                                          [T - 1, 1])
            # (batch_size x (seq_len - 1), seq_len, d_bert)
            enc_output = tf.tile(enc_output, [T - 1, 1, 1])
            # (batch_size x (seq_len - 1), 1, 1, seq_len)
            padding_mask = tf.tile(dec_padding_mask, [T - 1, 1, 1, 1])
            # (batch_size x (seq_len - 1), seq_len, d_bert)
            context_vectors = self.bert(
                (dec_inp_ids, dec_inp_mask, dec_inp_segment_ids))

            # (batch_size x (seq_len - 1), seq_len, d_bert), (_)
            dec_outputs, attention_dist = self.decoder(
                context_vectors,
                enc_output,
                training,
                look_ahead_mask=None,
                padding_mask=padding_mask)
            # (batch_size x (seq_len - 1), seq_len - 1, d_bert)
            dec_outputs = dec_outputs[:, 1:, :]
            # (batch_size x (seq_len - 1), (seq_len - 1))
            diag = tf.linalg.set_diag(tf.zeros([T - 1, T - 1]),
                                      tf.ones([T - 1]))
            diag = tf.tile(diag, [N, 1])

            where = tf.not_equal(diag, 0)
            indices = tf.where(where)

            # (batch_size x (seq_len - 1), d_bert)
            dec_outputs = tf.gather_nd(dec_outputs, indices)

            # (batch_size, seq_len - 1, d_bert)
            dec_outputs = tf.reshape(dec_outputs, [N, T - 1, -1])
            # (batch_size, seq_len, d_bert)
            dec_outputs = tf.concat([
                tf.tile(
                    tf.expand_dims(tf.one_hot([CLS_ID], self.d_model), axis=0),
                    [N, 1, 1]), dec_outputs
            ],
                                    axis=1)

            # (batch_size, seq_len, vocab_len)
            logits = self.final_layer(dec_outputs)
        return logits, attention_dist, dec_outputs