Example #1
    def predict(self, input_seq):
        encoder_model, decoder_model = self.get_encoder_and_decoder_models()
        _, trg_tokenizer = load_tokenizers()

        # encode the source sequence into the decoder's initial states [h, c]
        states_value = encoder_model.predict(input_seq)

        # seed the decoder with the <BOS> token
        target_seq = np.zeros((1, 1))
        print('<BOS> index : ', trg_tokenizer.word_index['bos'])
        target_seq[0, 0] = trg_tokenizer.word_index['bos']
        eos = trg_tokenizer.word_index['eos']
        print('<EOS> index : ', eos)
        output_sentence = []

        # greedy decoding: emit one token per step until <EOS> or the length cap
        for _ in range(self.max_trg_seq_len):
            output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
            idx = np.argmax(output_tokens[0, 0, :])

            if eos == idx:
                break

            # index 0 is reserved for padding, so only map non-zero ids to words
            if idx > 0:
                output_sentence.append(trg_tokenizer.index_word[idx])

            # feed the prediction and the updated states back for the next step
            target_seq[0, 0] = idx
            states_value = [h, c]

        return ' '.join(output_sentence)
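This is the classic two-model LSTM inference loop: the encoder produces the initial states, and the decoder is called one step at a time, feeding its own argmax prediction back in. Before calling predict(), the raw sentence still has to be turned into the padded id matrix the encoder was trained on. A minimal sketch of that step, assuming a Keras Tokenizer on the source side and a fixed max_inp_seq_len (both names are assumptions, not part of the example above):

from tensorflow.keras.preprocessing.sequence import pad_sequences

def encode_source(src_tokenizer, text, max_inp_seq_len):
    # map words to ids with the (assumed) source-side tokenizer
    seq = src_tokenizer.texts_to_sequences([text])
    # pad to the fixed encoder length so shapes match the trained model
    return pad_sequences(seq, maxlen=max_inp_seq_len, padding='post')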
Example #2
    def predict(self, text):
        inp_tokenizer, trg_tokenizer = load_tokenizers()
        print(text)
        test_source_seq = inp_tokenizer.texts_to_sequences([text])
        print(test_source_seq)

        # run the encoder once to get its outputs and final (h, c) states
        en_initial_states = self.encoder.init_states(1)
        en_outputs = self.encoder(tf.constant(test_source_seq),
                                  en_initial_states)

        # seed the decoder with the <bos> token and the encoder's final states
        de_input = tf.constant([[trg_tokenizer.word_index['<bos>']]])
        de_state_h, de_state_c = en_outputs[1:]
        out_words = []

        # greedy decoding: feed the argmax prediction back in at each step
        while True:
            de_output, de_state_h, de_state_c = self.decoder(
                de_input, (de_state_h, de_state_c))
            de_input = tf.argmax(de_output, -1)
            out_words.append(trg_tokenizer.index_word[de_input.numpy()[0][0]])

            # stop at the end marker or after a hard cap of 20 tokens
            if out_words[-1] == '<eos>' or len(out_words) >= 20:
                break

        print(' '.join(out_words))
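One quirk worth noting: the loop appends a token before testing it, so '<eos>' lands in out_words and is printed along with the sentence. A purely illustrative post-processing helper that strips the marker and returns a clean string instead (finalize is an invented name):

def finalize(out_words):
    # drop the trailing end-of-sequence marker if the loop emitted one
    if out_words and out_words[-1] == '<eos>':
        out_words = out_words[:-1]
    return ' '.join(out_words)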
Example #3
    def __init__(self, config_path):

        with open(os.path.join(config_path, "config.yml")) as cf:
            config = yaml.load(cf, Loader=yaml.FullLoader)

        self.num_layers = config["num_layers"]
        self.d_model = config["d_model"]
        self.dff = config["dff"]
        self.num_heads = config["num_heads"]
        self.dropout_rate = config["dropout_rate"]
        self.max_length = config["max_length"]
        self.epochs = config["epochs"]
        self.batch_size = config["batch_size"]
        self.target_vocab_size = config["target_vocab_size"]
        self.checkpoint = config["checkpoint"]
        self.max_checkpoint = config["max_checkpoint"]
        self.custom_checkpoint = config["custom_checkpoint"]
        self.eval_limit = config["eval_limit"]
        self.exit_phrase = config["exit_phrase"]

        if config["storage_path"] != None:
            self.storage_path = config["storage_path"]
        else:
            self.storage_path = "./"

        if config["ckpt_path"] != None:
            self.ckpt_path = config["ckpt_path"]
        else:
            self.ckpt_path = "./"

        if not self.storage_path.endswith("/"):
            self.storage_path += "/"

        if not self.ckpt_path.endswith("/"):
            self.ckpt_path += "/"

        self.data_path = f"{self.storage_path}data"
        self.checkpoint_path = f"{self.ckpt_path}checkpoints/train"
        self.tokenizer_path = f"{self.storage_path}tokenizers"
        self.inputs_savepath = f"{self.tokenizer_path}/inputs_token"
        self.outputs_savepath = f"{self.tokenizer_path}/outputs_token"

        if not os.path.exists(f"{self.ckpt_path}checkpoints"):
            os.mkdir(f"{self.ckpt_path}checkpoints")
        if not os.path.exists(f"{self.ckpt_path}checkpoints/train"):
            os.mkdir(f"{self.ckpt_path}checkpoints/train")
        if not os.path.exists(f"{self.storage_path}tokenizers"):
            os.mkdir(f"{self.storage_path}tokenizers")
        if not os.path.exists(f"{self.storage_path}models"):
            os.mkdir(f"{self.storage_path}models")

        if config["mode"] in ["train", "eval"]:
            if os.path.exists(os.path.join(
                    config_path, "data/train.from")) and os.path.exists(
                        os.path.join(config_path, "data/train.to")):
                pass
            else:
                if config["reddit_data"]:
                    print("Starting to generate train data from Subreddits.")
                    get_data(config_path)
            loader(config_path)

        self.inputs, self.outputs = load_data(
            f"{self.data_path}/training_data.txt")
        try:
            self.inputs_tokenizer, self.outputs_tokenizer = load_tokenizers(
                inputs_outputs_savepaths=[
                    self.inputs_savepath, self.outputs_savepath
                ])
        except Exception:
            print(
                "No tokenizers have been created yet, creating new tokenizers..."
            )
            self.inputs_tokenizer, self.outputs_tokenizer = create_tokenizers(
                inputs_outputs=[self.inputs, self.outputs],
                inputs_outputs_savepaths=[
                    self.inputs_savepath, self.outputs_savepath
                ],
                target_vocab_size=self.target_vocab_size)

        # +2 reserves two extra ids, conventionally the <start>/<end> tokens
        # added around each encoded sequence
        self.input_vocab_size = self.inputs_tokenizer.vocab_size + 2
        self.target_vocab_size = self.outputs_tokenizer.vocab_size + 2

        self.learning_rate = CustomSchedule(self.d_model)
        self.optimizer = tf.keras.optimizers.Adam(self.learning_rate,
                                                  beta_1=0.9,
                                                  beta_2=0.98,
                                                  epsilon=1e-9)
        self.train_loss = tf.keras.metrics.Mean(name='train_loss')
        self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            name='train_accuracy')
        self.transformer = Transformer(self.num_layers,
                                       self.d_model,
                                       self.num_heads,
                                       self.dff,
                                       self.input_vocab_size,
                                       self.target_vocab_size,
                                       pe_input=self.input_vocab_size,
                                       pe_target=self.target_vocab_size,
                                       rate=self.dropout_rate)

        self.ckpt = tf.train.Checkpoint(transformer=self.transformer,
                                        optimizer=self.optimizer)
        self.ckpt_manager = tf.train.CheckpointManager(
            self.ckpt, self.checkpoint_path, max_to_keep=self.max_checkpoint)

        if self.custom_checkpoint:
            self.ckpt.restore(self.custom_checkpoint)
            print(f"Custom checkpoint restored: {self.custom_checkpoint}")
        # if a checkpoint exists, restore the latest checkpoint.
        elif self.ckpt_manager.latest_checkpoint:
            self.ckpt.restore(self.ckpt_manager.latest_checkpoint)
            print(
                f"Latest checkpoint restored: {self.ckpt_manager.latest_checkpoint}"
            )

        if config["mode"] == "train":
            print("\nMODE: train\n===========\n")
            self.train_dataset = prepare_data(
                self.batch_size, [self.inputs, self.outputs],
                [self.inputs_tokenizer, self.outputs_tokenizer],
                self.max_length)

            self.train()
            eval_indexes = random.choices(range(len(self.inputs)),
                                          k=int(len(self.inputs) * 0.01))
            for i in eval_indexes:
                predicted_sentence, attention_weights, sentence, result = self.reply(
                    self.inputs[i])
                print(f"\nInput: {self.inputs[i]}")
                print(f"Predicted: {predicted_sentence}")
                print(f"Sample output: {self.outputs[i]}")

        elif config["mode"] == "eval":
            print("\nMODE: eval\n==========\n")
            self.inputs = self.inputs[:self.eval_limit]
            self.outputs = self.outputs[:self.eval_limit]

            for (ins, outs) in zip(self.inputs, self.outputs):
                predicted_sentence, attention_weights, sentence, result = self.reply(
                    ins)
                print(f"\nInput: {ins}")
                print(f"Predicted: {predicted_sentence}")
                print(f"Sample output: {outs}")

        elif config["mode"] == "test":
            print("\nMODE: test\n==========\n")
            while True:
                usr_input = input("[USER]: ")
                if usr_input == self.exit_phrase:
                    print("Exiting test mode...")
                    break
                else:
                    predicted_sentence, _, _, _ = self.reply(usr_input)
                    print(f"[BOT]: {predicted_sentence}")
        elif config["mode"] == "script":
            print("\nMODE: script\n==========\n")
Example #4
def do_training(user_config):
    inp_language = user_config["inp_language"]
    target_language = user_config["target_language"]

    print("\n****Training model from {} to {}****\n".format(
        inp_language, target_language))

    print("****Loading tokenizers****")
    # load pre-trained tokenizer
    tokenizer_inp, tokenizer_tar = utils.load_tokenizers(
        inp_language, target_language, user_config)

    print("****Loading train dataset****")
    # train data loader
    train_aligned_path_inp = user_config["train_data_path_{}".format(
        inp_language)]
    train_aligned_path_tar = user_config["train_data_path_{}".format(
        target_language)]
    train_dataloader = DataLoader(user_config["transformer_batch_size"],
                                  train_aligned_path_inp,
                                  train_aligned_path_tar, tokenizer_inp,
                                  tokenizer_tar, inp_language, target_language,
                                  True)
    train_dataset = train_dataloader.get_data_loader()

    print("****Loading val dataset****")
    # val data loader
    val_aligned_path_inp = user_config["val_data_path_{}".format(inp_language)]
    val_aligned_path_tar = user_config["val_data_path_{}".format(
        target_language)]
    val_dataloader = DataLoader(
        user_config["transformer_batch_size"] * 2,  # larger batches speed up validation
        val_aligned_path_inp,
        val_aligned_path_tar,
        tokenizer_inp,
        tokenizer_tar,
        inp_language,
        target_language,
        False)
    val_dataset = val_dataloader.get_data_loader()

    # define loss and accuracy metrics
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')
    val_loss = tf.keras.metrics.Mean(name='val_loss')
    val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='val_accuracy')

    print("****Loading transformer model****")
    # load model and optimizer
    transformer_model, optimizer, ckpt_manager = \
        utils.load_transformer_model(user_config, tokenizer_inp, tokenizer_tar)

    epochs = user_config["transformer_epochs"]
    print("\nTraining model now...")
    for epoch in range(epochs):
        print()
        start = time.time()
        train_loss.reset_states()
        train_accuracy.reset_states()
        val_loss.reset_states()
        val_accuracy.reset_states()

        # inp -> english, tar -> french
        for (batch, (inp, tar, _)) in enumerate(train_dataset):
            train_step(transformer_model,
                       loss_object,
                       optimizer,
                       inp,
                       tar,
                       train_loss,
                       train_accuracy,
                       pad_token_id=tokenizer_tar.pad_token_id)

            if batch % 50 == 0:
                print('Train: Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.
                      format(epoch + 1, batch, train_loss.result(),
                             train_accuracy.result()))

            if (batch + 1) % 2200 == 0:
                # inp -> english, tar -> french
                for (_, (inp, tar, _)) in enumerate(val_dataset):
                    val_step(transformer_model,
                             loss_object,
                             inp,
                             tar,
                             val_loss,
                             val_accuracy,
                             pad_token_id=tokenizer_tar.pad_token_id)
                print('Batch {}: Val Loss: {:.4f}, Val Accuracy: {:.4f}\n'.
                      format(batch, val_loss.result(), val_accuracy.result()))
                if user_config["compute_bleu"]:
                    print("\nComputing BLEU at batch {}: ".format(batch))
                    compute_bleu_score(transformer_model, val_dataset,
                                       user_config, tokenizer_tar,
                                       batch * epoch + 1)

        print("After {} epochs".format(epoch + 1))
        print('Train Loss: {:.4f}, Train Accuracy: {:.4f}'.format(
            train_loss.result(), train_accuracy.result()))

        # inp -> english, tar -> french
        for (batch, (inp, tar, _)) in enumerate(val_dataset):
            val_step(transformer_model,
                     loss_object,
                     inp,
                     tar,
                     val_loss,
                     val_accuracy,
                     pad_token_id=tokenizer_tar.pad_token_id)
        print('Val Loss: {:.4f}, Val Accuracy: {:.4f}'.format(
            val_loss.result(), val_accuracy.result()))

        print('Time taken for training epoch {}: {} secs'.format(
            epoch + 1,
            time.time() - start))

        # evaluate and save model every x-epochs
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint after epoch {} at {}'.format(
            epoch + 1, ckpt_save_path))
        if user_config["compute_bleu"]:
            print("\nComputing BLEU at epoch {}: ".format(epoch + 1))
            compute_bleu_score(transformer_model, val_dataset, user_config,
                               tokenizer_tar, epoch + 1)
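train_step and val_step themselves are not shown on this page. A minimal sketch of a train_step compatible with the calls above, assuming the model is callable as model(inp, tar_inp, training=True) and returns logits; the real project very likely also builds padding and look-ahead masks before the forward pass:

import tensorflow as tf

def train_step(model, loss_object, optimizer, inp, tar,
               train_loss, train_accuracy, pad_token_id):
    tar_inp = tar[:, :-1]   # decoder input: target shifted right
    tar_real = tar[:, 1:]   # labels: the next token at each position

    with tf.GradientTape() as tape:
        predictions = model(inp, tar_inp, training=True)
        # mask out padding positions so they contribute no loss
        mask = tf.cast(tf.not_equal(tar_real, pad_token_id), tf.float32)
        loss = (tf.reduce_sum(loss_object(tar_real, predictions) * mask)
                / tf.reduce_sum(mask))

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions, sample_weight=mask)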
Example #5
def do_evaluation(user_config, input_file_path, target_file_path,
                  pred_file_path):
    inp_language = user_config["inp_language"]
    target_language = user_config["target_language"]

    print("\n****Evaluating model from {} to {}****\n".format(
        inp_language, target_language))

    print("****Loading Sub-Word Tokenizers****")
    # load pre-trained tokenizer
    tokenizer_inp, tokenizer_tar = utils.load_tokenizers(
        inp_language, target_language, user_config)

    print("****Initializing DataLoader****")
    # Dummy data loader: one pass through it builds the model's variables,
    # which is required before a checkpoint can be loaded
    dummy_dataloader = DataLoader(
        user_config["transformer_batch_size"],
        user_config["dummy_data_path_{}".format(inp_language)], None,
        tokenizer_inp, tokenizer_tar, inp_language, target_language, False)
    dummy_dataset = dummy_dataloader.get_data_loader()

    # data loader
    test_dataloader = DataLoader(user_config["transformer_batch_size"],
                                 input_file_path, target_file_path,
                                 tokenizer_inp, tokenizer_tar, inp_language,
                                 target_language, False)
    test_dataset = test_dataloader.get_data_loader()

    input_vocab_size = tokenizer_inp.vocab_size
    target_vocab_size = tokenizer_tar.vocab_size

    use_pretrained_emb = user_config["use_pretrained_emb"]
    if use_pretrained_emb:
        pretrained_weights_inp = np.load(
            user_config["pretrained_emb_path_{}".format(inp_language)])
        pretrained_weights_tar = np.load(
            user_config["pretrained_emb_path_{}".format(target_language)])
    else:
        pretrained_weights_inp = None
        pretrained_weights_tar = None

    transformer_model = Transformer(
        user_config["transformer_num_layers"],
        user_config["transformer_model_dimensions"],
        user_config["transformer_num_heads"],
        user_config["transformer_dff"],
        input_vocab_size,
        target_vocab_size,
        en_input=input_vocab_size,
        fr_target=target_vocab_size,
        rate=user_config["transformer_dropout_rate"],
        weights_inp=pretrained_weights_inp,
        weights_tar=pretrained_weights_tar)

    # Run one pass on the dummy dataset first: this builds the model's
    # variables so that load_weights below has something to restore into.
    sacrebleu_metric(transformer_model, pred_file_path, None, tokenizer_tar,
                     dummy_dataset, tokenizer_tar.MAX_LENGTH)

    print("****Loading Model****")
    # load model
    model_path = user_config["model_file"]
    transformer_model.load_weights(model_path)

    print("****Generating Translations****")
    sacrebleu_metric(transformer_model, pred_file_path, target_file_path,
                     tokenizer_tar, test_dataset, tokenizer_tar.MAX_LENGTH)
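A hypothetical invocation, with file paths invented for illustration (the english-to-french direction follows the comments in do_training):

do_evaluation(user_config,
              input_file_path="data/test.en",
              target_file_path="data/test.fr",
              pred_file_path="predictions/test.fr.pred")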