import sys

# text_retrieve, preprocess_inp_tar, lines_to_text, text_save and translate are
# project helpers imported elsewhere in this file.


def main():
    # Model index and dataset split name are passed on the command line.
    model = int(sys.argv[1])
    file_name = sys.argv[2]
    val_inp = text_retrieve(file_name + '.en')
    val_tar = text_retrieve(file_name + '.phone')
    inp_lines, tar_lines, pred_lines = [], [], []
    for i in range(len(val_inp)):
        inp = str(val_inp[i])
        tar = str(val_tar[i])
        try:
            pred = translate(inp, model)
        except Exception:
            # Skip sentences the model fails to translate.
            continue
        print(i)
        print('Input sentence: ', preprocess_inp_tar(inp))
        print('Target sentence: ', preprocess_inp_tar(tar))
        print('Predict sentence: ', pred)
        print()
        inp_lines.append(preprocess_inp_tar(inp))
        tar_lines.append(preprocess_inp_tar(tar))
        pred_lines.append(pred)
    # Save the inputs, targets and predictions for later evaluation.
    inp_text = lines_to_text(inp_lines, '\n')
    tar_text = lines_to_text(tar_lines, '\n')
    pred_text = lines_to_text(pred_lines, '\n')
    text_save(inp_text, 'model_' + str(model) + '/predictions/' + file_name + '_inp.txt')
    text_save(tar_text, 'model_' + str(model) + '/predictions/' + file_name + '_tar.txt')
    text_save(pred_text, 'model_' + str(model) + '/predictions/' + file_name + '_pred.txt')
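# The prediction script above relies on a few small I/O helpers (text_retrieve,
# lines_to_text, text_save) that are defined elsewhere in the repository. The
# sketch below shows plausible minimal implementations, not the originals; the
# UTF-8 encoding and directory creation are assumptions here.

import os


def text_retrieve(file_name):
    """Reads a text file and returns its lines as a list of strings."""
    with open(file_name, 'r', encoding='utf-8') as f:
        return f.read().split('\n')


def lines_to_text(lines, separator):
    """Joins a list of sentences into a single string with the given separator."""
    return separator.join(lines)


def text_save(text, file_name):
    """Writes a string to disk, creating the parent directory if needed."""
    directory = os.path.dirname(file_name)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(text)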
import sys

import tensorflow as tf

# text_retrieve, create_new_dataset, tokenize, save_file, model_training and
# model_testing are project helpers imported elsewhere in this file.


def main():
    print()
    model = int(sys.argv[1])
    # Load the tokenized gloss (input) and English (target) splits.
    train_inp = text_retrieve('spt-tokenized/train.gloss')
    val_inp = text_retrieve('spt-tokenized/val.gloss')
    test_inp = text_retrieve('spt-tokenized/test.gloss')
    train_tar = text_retrieve('spt-tokenized/train.en')
    val_tar = text_retrieve('spt-tokenized/val.en')
    test_tar = text_retrieve('spt-tokenized/test.en')
    print('No. of original sentences in Training set: ', len(train_inp))
    print('No. of original sentences in Validation set: ', len(val_inp))
    print('No. of original sentences in Test set: ', len(test_inp))
    print()
    # Rebuild the sentence pairs with respect to the maximum allowed length.
    max_length = 40
    train_inp, train_tar = create_new_dataset(train_inp, train_tar, max_length)
    val_inp, val_tar = create_new_dataset(val_inp, val_tar, max_length)
    test_inp, test_tar = create_new_dataset(test_inp, test_tar, max_length)
    print('No. of new sentences in Training set: ', len(train_inp))
    print('No. of new sentences in Validation set: ', len(val_inp))
    print('No. of new sentences in Test set: ', len(test_inp))
    print()
    # Convert sentences to padded integer sequences.
    inp_lang, train_inp, val_inp, test_inp = tokenize(train_inp, val_inp, test_inp, max_length)
    tar_lang, train_tar, val_tar, test_tar = tokenize(train_tar, val_tar, test_tar, max_length)
    print('Input Vocabulary size: ', len(inp_lang.word_index) + 1)
    print('Target Vocabulary size: ', len(tar_lang.word_index) + 1)
    print()
    batch_size = 128
    # Save the vocabularies so inference can reuse the same token ids.
    save_file(inp_lang.word_index, 'model_' + str(model) + '/utils/inp-word-index')
    save_file(inp_lang.index_word, 'model_' + str(model) + '/utils/inp-index-word')
    save_file(tar_lang.word_index, 'model_' + str(model) + '/utils/tar-word-index')
    save_file(tar_lang.index_word, 'model_' + str(model) + '/utils/tar-index-word')
    parameters = {'inp_vocab_size': len(inp_lang.word_index) + 1,
                  'tar_vocab_size': len(tar_lang.word_index) + 1,
                  'emb_size': 512,
                  'rnn_size': 512,
                  'batch_size': batch_size,
                  'epochs': 30,
                  'train_steps_per_epoch': len(train_inp) // batch_size,
                  'rate': 0.3,
                  'val_steps_per_epoch': len(val_inp) // batch_size,
                  'test_steps': len(test_inp) // batch_size,
                  'max_length': max_length,
                  'model': model}
    save_file(parameters, 'model_' + str(model) + '/utils/parameters')
    print()
    print('No. of Training steps per epoch: ', parameters['train_steps_per_epoch'])
    print('No. of Validation steps per epoch: ', parameters['val_steps_per_epoch'])
    print('No. of Testing steps: ', parameters['test_steps'])
    print()
    # Build shuffled, fixed-size batches for training, validation and testing.
    train_dataset = tf.data.Dataset.from_tensor_slices((train_inp, train_tar)).shuffle(len(train_inp))
    train_dataset = train_dataset.batch(batch_size, drop_remainder=True)
    val_dataset = tf.data.Dataset.from_tensor_slices((val_inp, val_tar)).shuffle(len(val_inp))
    val_dataset = val_dataset.batch(batch_size, drop_remainder=True)
    test_dataset = tf.data.Dataset.from_tensor_slices((test_inp, test_tar)).shuffle(len(test_inp))
    test_dataset = test_dataset.batch(batch_size, drop_remainder=True)
    print('Model training started')
    print()
    model_training(train_dataset, val_dataset, parameters)
    model_testing(test_dataset, parameters)
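# The tokenize() helper used above is not shown in this file. Because the
# returned object exposes word_index / index_word, it is most likely a Keras
# Tokenizer; the sketch below is a minimal version under that assumption
# (fitting on the training split only and padding every split to max_length),
# not necessarily the original implementation.

import tensorflow as tf


def tokenize(train_lines, val_lines, test_lines, max_length):
    """Fits a word-level tokenizer on the training split and converts all
    splits to post-padded integer sequences of length max_length."""
    lang = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<unk>')
    lang.fit_on_texts(train_lines)
    padded_splits = []
    for lines in (train_lines, val_lines, test_lines):
        sequences = lang.texts_to_sequences(lines)
        sequences = tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=max_length, padding='post')
        padded_splits.append(sequences)
    return lang, padded_splits[0], padded_splits[1], padded_splits[2]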
def main():
    print()
    train_inp = text_retrieve('train.gloss')
    val_inp = text_retrieve('val.gloss')
    test_inp = text_retrieve('test.gloss')
    train_tar = text_retrieve('train.en')
    val_tar = text_retrieve('val.en')
    test_tar = text_retrieve('test.en')
    print('No. of original sentences in Training set: ', len(train_inp))
    print('No. of original sentences in Validation set: ', len(val_inp))
    print('No. of original sentences in Test set: ', len(test_inp))
    print()
    vocab_size = 4000
    max_length = 40
    tokenizer(train_inp, train_tar, val_inp, val_tar, test_inp, test_tar,
              vocab_size, max_length)
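# tokenizer() is defined elsewhere. Given the vocab_size argument and the
# 'spt-tokenized/' files consumed by the training script above, it presumably
# trains a SentencePiece model per language side and writes sub-word-tokenized
# splits. The sketch below is a rough guess under that assumption; file names,
# directory layout and the handling of max_length are not confirmed by the
# original code.

import os

import sentencepiece as spm


def tokenizer(train_inp, train_tar, val_inp, val_tar, test_inp, test_tar,
              vocab_size, max_length):
    """Trains one SentencePiece model per side and writes tokenized splits."""
    # max_length is accepted for signature parity; how the original uses it is
    # not shown here.
    os.makedirs('spt-tokenized', exist_ok=True)
    # Train one model for the gloss side and one for the English side.
    for side, lines in (('gloss', train_inp), ('en', train_tar)):
        corpus_path = 'spt-tokenized/%s-corpus.txt' % side
        with open(corpus_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))
        spm.SentencePieceTrainer.train(
            input=corpus_path,
            model_prefix='spt-tokenized/%s' % side,
            vocab_size=vocab_size)
    gloss_sp = spm.SentencePieceProcessor(model_file='spt-tokenized/gloss.model')
    en_sp = spm.SentencePieceProcessor(model_file='spt-tokenized/en.model')
    splits = {'train': (train_inp, train_tar),
              'val': (val_inp, val_tar),
              'test': (test_inp, test_tar)}
    # Encode every split and write one file per side, one sentence per line.
    for name, (inp, tar) in splits.items():
        inp_tok = [' '.join(gloss_sp.encode(line, out_type=str)) for line in inp]
        tar_tok = [' '.join(en_sp.encode(line, out_type=str)) for line in tar]
        with open('spt-tokenized/%s.gloss' % name, 'w', encoding='utf-8') as f:
            f.write('\n'.join(inp_tok))
        with open('spt-tokenized/%s.en' % name, 'w', encoding='utf-8') as f:
            f.write('\n'.join(tar_tok))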
from random import shuffle  # assumed: the standard-library in-place shuffle

# text_retrieve, open_file, create_batch and DecoderPreNet are project helpers
# and model classes imported elsewhere in this file.


def main():
    print()
    loc = '/home/preetham/Documents/Preetham/masters-thesis/'
    files = text_retrieve('files_list.txt')
    print('No. of files in original dataset: ', len(files))
    print()
    shuffle(files)
    # NOTE: these slices overlap; acceptable for a quick shape check, but not
    # for a real train/val/test split.
    train, val, test = files[:1000], files[:20], files[:20]
    print('No. of files in training dataset: ', len(train))
    print('No. of files in validation dataset: ', len(val))
    print('No. of files in testing dataset: ', len(test))
    print()
    inp_word_index = open_file(
        'results/grapheme-to-phoneme/luong/model_7/utils/tar-word-index.pkl')
    start_index = 0
    batch_size = 8
    # Build a single small batch and check the decoder pre-net output shapes.
    train = train[start_index:start_index + batch_size]
    train_batch_inp, train_batch_tar = create_batch(train, inp_word_index)
    dec_pre_net = DecoderPreNet(256, 0.1)
    print(train_batch_tar.shape)
    print(train_batch_tar[:, :, 0].shape)
    x = dec_pre_net(train_batch_tar[:, 0], False)
    print(x.shape)
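# DecoderPreNet is imported from the project's model code, which is not shown
# in this file. The (256, 0.1) constructor arguments and the (inputs, training)
# call signature suggest a Tacotron-style pre-net: two dense ReLU layers with
# dropout. The sketch below is one plausible implementation under that
# assumption, not the original class.

import tensorflow as tf


class DecoderPreNet(tf.keras.layers.Layer):
    """Two dense ReLU layers with dropout, applied to decoder input frames."""

    def __init__(self, units, rate):
        super(DecoderPreNet, self).__init__()
        self.dense_1 = tf.keras.layers.Dense(units, activation='relu')
        self.dense_2 = tf.keras.layers.Dense(units, activation='relu')
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training):
        # Dropout is only active when training is True, matching the False
        # flag passed in the shape-check script above.
        x = self.dropout(self.dense_1(x), training=training)
        x = self.dropout(self.dense_2(x), training=training)
        return x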