def __init__(self, config_path, store_as_file=False, add_start_end=True):
        self.config = LoadConfig(config_path).load_config()
        self.store_as_file = store_as_file
        self.add_start_end = add_start_end

        self.train_data, self.valid_data, self.test_data, self.data_info = LoadData(
            self.config['dataset_name']).get_data()

        # SentencePiece trains its model from text files on disk, so the
        # cleaned splits are always written out regardless of the caller's
        # store_as_file preference.
        if self.config['tokenizer'] == 'sentencepiece':
            self.store_as_file = True
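
        # Clean all three text splits (written to disk when store_as_file is set).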
        self.train_data, self.valid_data, self.test_data = PreprocessText(
            self.config,
            self.train_data,
            self.valid_data,
            self.test_data,
            store_as_file=self.store_as_file).clean_text()

        # Fit the tokenizer on the cleaned training split.
        self.tokenizer = TokenizeData(self.config,
                                      self.train_data,
                                      add_start_end=self.add_start_end)
Example #2
import tensorflow as tf
import tqdm
import time

from models.masking import create_padding_mask, create_combined_mask
from utils.load_config import LoadConfig
from prepare_data.create_data import CreateData



# Config dict and model are for reference
config_dict = LoadConfig('conf').load_config()


# Load Data
dataset_name = config_dict['dataset_name']
data_creator = CreateData(config_path='conf')
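# create_all() yields the train/valid/test datasets built from the cleaned,
# tokenized corpus (see the CreateData.__init__ fragment above).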
train_datasets, valid_datasets, test_datasets = data_creator.create_all()


def evaluate(inp_sentence, model, data_creator, max_length):
    inp_sentence_converted = data_creator.tokenizer.convert_to_ids([inp_sentence], [], False)
    inp_sentence_converted = inp_sentence_converted[0]
    inp_sentence_converted = tf.constant(inp_sentence_converted)

    decoder_input = [data_creator.tokenizer.lang_two_sos]
    translate_result = tf.expand_dims(decoder_input, 0)

    # Autoregressive decoding: generate one target token per step, feeding the
    # sequence produced so far back into the decoder.
    for i in range(max_length):
        enc_padding = create_padding_mask(inp_sentence_converted)
        combined_mask = create_combined_mask(translate_result)
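        # The fragment is truncated here; a minimal sketch of the rest of the
        # decoding step follows. The call signature of `model` is an assumption
        # (standard TF transformer-tutorial convention), not taken from this
        # repository.
        predictions, _ = model(inp_sentence_converted, translate_result, False,
                               enc_padding, combined_mask, enc_padding)

        # Greedy pick of the most likely token id for the last position.
        predicted_id = tf.cast(
            tf.argmax(predictions[:, -1:, :], axis=-1), tf.int32)

        # Stop once the target-language end-of-sequence token is produced.
        if predicted_id == data_creator.tokenizer.lang_two_eos:
            break

        translate_result = tf.concat([translate_result, predicted_id], axis=-1)

    return tf.squeeze(translate_result, axis=0)
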
Example #3
            # Tail of DataCreator.create_data(); the earlier steps of the
            # tf.data pipeline (building the HR/LR image pairs) are truncated
            # in this fragment. The 'if shuffle:' guard is inferred from the
            # original indentation and the 'shuffle' argument used in __main__.
            if shuffle:
                full_dataset = full_dataset.shuffle(
                    buffer_size=self.generator.num_of_imgs)

            full_dataset = full_dataset.batch(batch_size)
            full_dataset = full_dataset.prefetch(tf.data.experimental.AUTOTUNE)

            # Optionally persist the finished pipeline to disk.
            if save_tf:
                tf.data.experimental.save(full_dataset,
                                          self.config['dataset_save_path'],
                                          compression=None)
            return full_dataset


if __name__ == '__main__':
    # matplotlib is needed for the preview plots below but is not imported in
    # this truncated fragment.
    import matplotlib.pyplot as plt

    config_path = './config'
    config_dict = LoadConfig(config_path)()

    dataset = DataCreator(config_dict).create_data(batch_size=16,
                                                   shuffle=False,
                                                   check_result=False,
                                                   augmentation=True,
                                                   save_tf=False)

    for n, (hr_data, lr_data) in enumerate(dataset.take(1)):
        plt.subplot(1, 2, 1)
        plt.imshow(hr_data[0].numpy().astype('int'))
        plt.title('HR_DATA: (256, 256)')
        plt.subplot(1, 2, 2)
        plt.imshow(lr_data[0].numpy().astype('int'))
        plt.title('LR_DATA: (64, 64)')
        plt.axis('off')

    plt.show()
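
    # A pipeline persisted with tf.data.experimental.save() (save_tf=True) can
    # later be restored with its counterpart; recent TF versions infer the
    # element_spec from the saved metadata. Usage sketch:
    #
    #     restored = tf.data.experimental.load(config_dict['dataset_save_path'])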
Example #4
import tensorflow as tf
from utils.load_config import LoadConfig
from data_prep.data_processing import DataCreator
from architecture.generator import SRGenerator
from architecture.discriminator import SRDiscriminator
from architecture.load_vgg import VGGModel

from trainer.losses import pixel_wise_mse, vgg_loss

# Load Config
config_dict = LoadConfig('./config')()

# Load Dataset
dataset = DataCreator(config_dict).create_data(
    batch_size=config_dict['batch_size'],
    shuffle=False,
    check_result=False,
    augmentation=False,
    save_tf=False)

# Instantiate the generator (16 residual blocks) and the discriminator
generator = SRGenerator(n_res_layers=16)
discriminator = SRDiscriminator()

# VGG feature extractor used to compute the content (perceptual) loss
vgg_model = VGGModel(config_dict['vgg_loss_model'])

# Adversarial loss object: binary cross-entropy with label smoothing
loss_obj = tf.keras.losses.BinaryCrossentropy(from_logits=False,
                                              label_smoothing=0.1)
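

# A usage sketch (not this repo's actual training step) of how a
# BinaryCrossentropy object such as loss_obj is typically applied to the SRGAN
# adversarial losses, assuming the discriminator outputs sigmoid probabilities
# (consistent with from_logits=False above).
def discriminator_loss(real_output, fake_output):
    # Real HR patches should be scored as 1, generated SR patches as 0.
    real_loss = loss_obj(tf.ones_like(real_output), real_output)
    fake_loss = loss_obj(tf.zeros_like(fake_output), fake_output)
    return real_loss + fake_loss


def generator_adversarial_loss(fake_output):
    # The generator is rewarded when its output is scored as real.
    return loss_obj(tf.ones_like(fake_output), fake_output)
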
Example #5
        # Tail of one of TokenizeData's special-token helpers (cf. the methods
        # below); the class body above this line is truncated in this fragment.
        return sos_token, eos_token

    def _spm_add_special_token(self, tokenizer):
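        # With 'bos:eos', SentencePiece itself prepends the BOS id and appends
        # the EOS id at encode time, so this helper does not need to return
        # explicit sos/eos ids.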
        return tokenizer.SetEncodeExtraOptions('bos:eos')

    def _word_tokenizer_add_special_token(self, tokenizer):
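        # Both ids sit just past the fitted vocabulary (index_word), so they
        # cannot collide with existing word indices.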
        sos_token = len(tokenizer.index_word) + 1
        eos_token = len(tokenizer.index_word) + 2
        return sos_token, eos_token


if __name__ == '__main__':
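    # Smoke test: load the corpus, clean it, then round-trip a sentence
    # through the configured tokenizer.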
    config = LoadConfig('conf').load_config()
    train_d, valid_d, test_d, infos = LoadData(config['dataset_name']).get_data()
    t, vd, ttd = PreprocessText(config, train_d, valid_d, test_d, True).clean_text()

    # Define Tokenizer
    tokenizer = TokenizeData(config, t)

    # Tokenizer Test
    encode_one, encode_two = tokenizer.convert_to_ids(t[0], t[1], is_train=True)
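    # Decode back to text to verify the encode/decode round trip.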
    decode_one = tokenizer.convert_to_texts(encode_one[0], tokenizer.lang_one_tokenizer, tokenizer.lang_one_sos, tokenizer.lang_one_eos)
    decode_two = tokenizer.convert_to_texts(encode_two[0], tokenizer.lang_two_tokenizer, tokenizer.lang_two_sos, tokenizer.lang_two_eos)

    # Checker
    print(f'Tokenizer Method::{config["tokenizer"]}\n')
    print('Original Sentence\n')
    print('Pt\tEn\n')