    def __init__(self, config_path, store_as_file=False, add_start_end=True):
        self.config = LoadConfig(config_path).load_config()
        self.store_as_file = store_as_file
        self.add_start_end = add_start_end

        # Load the raw train/valid/test splits for the configured dataset
        self.train_data, self.valid_data, self.test_data, self.data_info = LoadData(
            self.config['dataset_name']).get_data()

        # Force file storage when the SentencePiece tokenizer is selected
        if self.config['tokenizer'] == 'sentencepiece':
            self.store_as_file = True

        # Clean the raw text before tokenization
        self.train_data, self.valid_data, self.test_data = PreprocessText(
            self.config, self.train_data, self.valid_data, self.test_data,
            store_as_file=self.store_as_file).clean_text()

        # Build the tokenizer from the cleaned training split
        self.tokenizer = TokenizeData(self.config, self.train_data,
                                      add_start_end=self.add_start_end)
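
# --- Illustrative sketch (not part of the original module) ---
# A minimal example of how tokenized id pairs could be wrapped into a batched
# tf.data pipeline, assuming the tokenizer yields Python lists of token ids per
# sentence. The names `source_ids`, `target_ids`, and `pad_id` are hypothetical
# and only illustrate the padding/batching step.
import tensorflow as tf


def build_dataset_sketch(source_ids, target_ids, batch_size=64, pad_id=0):
    # from_generator keeps variable-length sentences intact until batching
    dataset = tf.data.Dataset.from_generator(
        lambda: zip(source_ids, target_ids),
        output_signature=(
            tf.TensorSpec(shape=(None,), dtype=tf.int64),
            tf.TensorSpec(shape=(None,), dtype=tf.int64)))
    # padded_batch pads every sentence in a batch to the batch maximum
    dataset = dataset.padded_batch(
        batch_size,
        padding_values=(tf.constant(pad_id, tf.int64),
                        tf.constant(pad_id, tf.int64)))
    return dataset.prefetch(tf.data.AUTOTUNE)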
import tensorflow as tf
import tqdm
import time

from models.masking import create_padding_mask, create_combined_mask
from utils.load_config import LoadConfig
from prepare_data.create_data import CreateData

# Config dict and data creator are loaded here for reference
config_dict = LoadConfig('conf').load_config()

# Load Data
dataset_name = config_dict['dataset_name']
data_creator = CreateData(config_path='conf')
train_datasets, valid_datasets, test_datasets = data_creator.create_all()


def evaluate(inp_sentence, model, data_creator, max_length):
    # Tokenize the source sentence
    inp_sentence_converted = data_creator.tokenizer.convert_to_ids(
        [inp_sentence], [], False)
    inp_sentence_converted = inp_sentence_converted[0]
    inp_sentence_converted = tf.constant(inp_sentence_converted)

    # Start decoding from the target-language start-of-sentence token
    decoder_input = [data_creator.tokenizer.lang_two_sos]
    translate_result = tf.expand_dims(decoder_input, 0)

    for i in range(max_length):
        enc_padding = create_padding_mask(inp_sentence_converted)
        combined_mask = create_combined_mask(translate_result)
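
# --- Illustrative sketch (not part of the original script) ---
# The decoding loop in evaluate() above is truncated; the sketch below shows
# the usual shape of a greedy-decoding loop for a seq2seq Transformer. The
# model call signature and the `eos_id` argument are assumptions, not the
# repository's actual API.
import tensorflow as tf


def greedy_decode_sketch(model, encoder_ids, sos_id, eos_id, max_length):
    encoder_ids = tf.expand_dims(encoder_ids, 0)          # add a batch dimension
    output = tf.constant([[sos_id]], dtype=tf.int64)      # start with SOS
    for _ in range(max_length):
        # assumed signature: model(encoder_input, decoder_input) -> logits
        logits = model(encoder_ids, output)
        next_id = tf.argmax(logits[:, -1, :], axis=-1)    # most likely next token
        output = tf.concat([output, next_id[:, tf.newaxis]], axis=-1)
        if int(next_id[0]) == eos_id:                     # stop at EOS
            break
    return tf.squeeze(output, axis=0)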
        full_dataset = full_dataset.shuffle(
            buffer_size=self.generator.num_of_imgs)
        full_dataset = full_dataset.batch(batch_size)
        full_dataset = full_dataset.prefetch(tf.data.experimental.AUTOTUNE)

        if save_tf:
            tf.data.experimental.save(full_dataset,
                                      self.config['dataset_save_path'],
                                      compression=None)
        return full_dataset


if __name__ == '__main__':
    config_path = './config'
    config_dict = LoadConfig(config_path)()
    dataset = DataCreator(config_dict).create_data(batch_size=16,
                                                   shuffle=False,
                                                   check_result=False,
                                                   augmentation=True,
                                                   save_tf=False)

    for n, (hr_data, lr_data) in enumerate(dataset.take(1)):
        plt.subplot(1, 2, 1)
        plt.imshow(hr_data[0].numpy().astype('int'))
        plt.title('HR_DATA: (256, 256)')
        plt.subplot(1, 2, 2)
        plt.imshow(lr_data[0].numpy().astype('int'))
        plt.title('LR_DATA: (64, 64)')
        plt.axis('off')
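
# --- Illustrative sketch (not part of the original module) ---
# When `save_tf` is used above, the serialized dataset can later be restored
# with the matching experimental load API. Passing element_spec is optional on
# recent TensorFlow versions but shown here for clarity; the config key mirrors
# the one used by create_data.
import tensorflow as tf


def load_saved_dataset_sketch(save_path, element_spec=None):
    # tf.data.experimental.load is the counterpart of tf.data.experimental.save
    return tf.data.experimental.load(save_path, element_spec=element_spec)

# Example usage (hypothetical):
# restored = load_saved_dataset_sketch(config_dict['dataset_save_path'])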
import tensorflow as tf

from utils.load_config import LoadConfig
from data_prep.data_processing import DataCreator
from architecture.generator import SRGenerator
from architecture.discriminator import SRDiscriminator
from architecture.load_vgg import VGGModel
from trainer.losses import pixel_wise_mse, vgg_loss

# Load Config
config_dict = LoadConfig('./config')()

# Load Dataset
dataset = DataCreator(config_dict).create_data(
    batch_size=config_dict['batch_size'],
    shuffle=False,
    check_result=False,
    augmentation=False,
    save_tf=False)

# Instantiate the generator and discriminator architectures
generator = SRGenerator(n_res_layers=16)
discriminator = SRDiscriminator()

# VGG model used to compute the content (perceptual) loss
vgg_model = VGGModel(config_dict['vgg_loss_model'])

# Binary cross-entropy loss object with label smoothing
loss_obj = tf.keras.losses.BinaryCrossentropy(from_logits=False,
                                              label_smoothing=0.1)
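
# --- Illustrative sketch (not part of the original trainer) ---
# Standard SRGAN-style adversarial losses built on a BinaryCrossentropy object
# like `loss_obj` above. The 1e-3 weighting of the adversarial term follows the
# original SRGAN paper; how this repository actually combines it with the VGG
# content loss is an assumption.
import tensorflow as tf


def discriminator_loss_sketch(loss_obj, real_logits, fake_logits):
    # Real images should be classified as 1, generated images as 0
    real_loss = loss_obj(tf.ones_like(real_logits), real_logits)
    fake_loss = loss_obj(tf.zeros_like(fake_logits), fake_logits)
    return real_loss + fake_loss


def generator_adv_loss_sketch(loss_obj, fake_logits, adv_weight=1e-3):
    # The generator tries to make the discriminator label fakes as real
    return adv_weight * loss_obj(tf.ones_like(fake_logits), fake_logits)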
        return sos_token, eos_token

    def _spm_add_special_token(self, tokenizer):
        return tokenizer.SetEncodeExtraOptions('bos:eos')

    def _word_tokenizer_add_special_token(self, tokenizer):
        sos_token = len(tokenizer.index_word) + 1
        eos_token = len(tokenizer.index_word) + 2
        return sos_token, eos_token


if __name__ == '__main__':
    config = LoadConfig('conf').load_config()
    train_d, valid_d, test_d, infos = LoadData(config['dataset_name']).get_data()
    t, vd, ttd = PreprocessText(config, train_d, valid_d, test_d, True).clean_text()

    # Define Tokenizer
    tokenizer = TokenizeData(config, t)

    # Tokenizer Test
    encode_one, encode_two = tokenizer.convert_to_ids(t[0], t[1], is_train=True)
    decode_one = tokenizer.convert_to_texts(encode_one[0],
                                            tokenizer.lang_one_tokenizer,
                                            tokenizer.lang_one_sos,
                                            tokenizer.lang_one_eos)
    decode_two = tokenizer.convert_to_texts(encode_two[0],
                                            tokenizer.lang_two_tokenizer,
                                            tokenizer.lang_two_sos,
                                            tokenizer.lang_two_eos)

    # Checker
    print(f'Tokenizer Method::{config["tokenizer"]}\n')
    print('Original Sentence\n')
    print('Pt\tEn\n')
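
# --- Illustrative sketch (not part of the original module) ---
# Minimal standalone demonstration of the SentencePiece option used by
# _spm_add_special_token above: 'bos:eos' makes EncodeAsIds wrap every sentence
# with the BOS/EOS ids. The model path below is hypothetical.
import sentencepiece as spm


def spm_round_trip_sketch(model_path, sentence):
    sp = spm.SentencePieceProcessor()
    sp.Load(model_path)                      # e.g. 'spm_lang_one.model' (hypothetical)
    sp.SetEncodeExtraOptions('bos:eos')      # prepend BOS and append EOS on encode
    ids = sp.EncodeAsIds(sentence)           # ids[0] == sp.bos_id(), ids[-1] == sp.eos_id()
    return ids, sp.DecodeIds(ids)            # DecodeIds drops the special tokens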