def infer():
    def process(inputs):
        inputs = clear_punc(inputs)
        x = []
        for word in jieba.cut(inputs):
            x.append(token2idx.get(word, token2idx['<UNK>']))
        x.append(token2idx['</S>'])
        x = x + [token2idx['<PAD>']] * (model_params.maxlen - len(x))
        x = [x]
        return x

    model_params = Params()
    idx2token, token2idx = load_vocab(model_params.idx2token_path, model_params.token2idx_path)
    model = Transformer(model_params)
    model.eval()
    with tf.Session() as sess:
        saver = tf.train.Saver()
        last_ckpt = tf.train.latest_checkpoint(model_params.model_save)
        saver.restore(sess, last_ckpt)
        while True:
            x = input('{}:>>'.format('笑给我看'))  # prompt label: "make me laugh"
            # TODO: classical-poetry mode
            if x == '对古诗':
                pass
            x = process(x)
            feed_dict = {model.xs: x}
            y_hat = sess.run(model.y_hat, feed_dict=feed_dict)
            result = ''
            for word in y_hat[0]:
                if word == token2idx['<UNK>']:
                    result += '*××'
                elif word != 3:  # id 3 is presumably the end-of-sequence token
                    result += idx2token[word]
                else:
                    break
            if result == '==':
                result = "= ="
            elif result == '<UNK>':
                result = '哎呀,我不知道啊!'  # fallback reply: "oops, I don't know!"
            print('傻逼一号:>>', result, '\n')  # bot persona label
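# infer() relies on a clear_punc() helper that is not shown in this snippet. A minimal
# sketch of what it might do, assuming it simply strips punctuation before jieba
# tokenization (the _CJK_PUNC set below is an assumption, not the original code):
import string

_CJK_PUNC = "。，、！？；：“”‘’（）《》【】…—～·"


def clear_punc(text):
    """Remove ASCII and common full-width punctuation from the raw user input."""
    return text.translate(str.maketrans("", "", string.punctuation + _CJK_PUNC))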
def main():
    args = get_args()
    src_vocab_size = len(pickle.load(open(args.data_bin + '/dict.' + args.src_lang, 'rb')).keys())
    tgt_vocab_size = len(pickle.load(open(args.data_bin + '/dict.' + args.tgt_lang, 'rb')).keys())
    device = 'cuda'
    model = Transformer(src_vocab_size=src_vocab_size,
                        tgt_vocab_size=tgt_vocab_size,
                        encoder_layer_num=args.encoder_layer_num,
                        decoder_layer_num=args.decoder_layer_num,
                        hidden_size=args.hidden_size,
                        feedback_size=args.feedback,
                        num_head=args.num_head,
                        dropout=args.dropout,
                        device=device)
    optim = Optim(Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9),
                  warmup_step=4000, d_model=args.hidden_size)
    # optim = Adam(model.parameters(), lr=5e-6)
    train_loader = DataLoader(Dataload(args.data_bin + '/train', args.src_lang, args.tgt_lang),
                              batch_size=args.batch_size, collate_fn=collate_fn, shuffle=True)
    test_loader = DataLoader(Dataload(args.data_bin + '/test', args.src_lang, args.tgt_lang),
                             batch_size=args.batch_size, collate_fn=collate_fn)
    valid_loader = DataLoader(Dataload(args.data_bin + '/valid', args.src_lang, args.tgt_lang),
                              batch_size=args.batch_size, collate_fn=collate_fn)
    best_loss = 1e4
    model = model.to(device)
    # model.load_state_dict(torch.load('best_model.pkl'))
    for i in range(args.epoch):
        train(i, model, data_loader=train_loader, optim=optim, device=device)
        with torch.no_grad():
            best_loss = eval(i, model, valid_loader, best_loss, device)
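# The Optim wrapper above (warmup_step=4000, d_model=args.hidden_size) presumably applies
# the inverse-square-root warmup schedule from "Attention Is All You Need":
# lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5). A minimal sketch of such a
# wrapper follows; it is an illustration under that assumption, not the project's Optim class.
class NoamOptim:
    def __init__(self, optimizer, d_model, warmup_step=4000):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_step = warmup_step
        self.step_num = 0

    def rate(self):
        return self.d_model ** -0.5 * min(self.step_num ** -0.5,
                                          self.step_num * self.warmup_step ** -1.5)

    def step(self):
        self.step_num += 1
        for group in self.optimizer.param_groups:
            group["lr"] = self.rate()
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()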
def train():
    session_conf = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    model_params = Params()
    model = Transformer(model_params)
    with tf.device('/gpu:0'):
        with tf.Session(config=session_conf) as sess:
            sess.run(tf.global_variables_initializer())
            for data, epoch_i in get_batch:
                xs, ys = data
parser = argparse.ArgumentParser()
parser.add_argument('--bin_path', type=str, required=True)
parser.add_argument('--model_path', type=str, required=True)
parser.add_argument('--src_lang', type=str, required=True)
parser.add_argument('--tgt_lang', type=str, required=True)
args = parser.parse_args()

src_dict = pickle.load(open(add_(args.bin_path) + 'dict.' + args.src_lang, 'rb'))
trg_dict = pickle.load(open(add_(args.bin_path) + 'dict.' + args.tgt_lang, 'rb'))

model = Transformer(src_vocab_size=len(src_dict.keys()),
                    tgt_vocab_size=len(trg_dict.keys()),
                    encoder_layer_num=6,
                    decoder_layer_num=6,
                    hidden_size=512,
                    feedback_size=2048,
                    num_head=8,
                    dropout=0.1,
                    device=device)
model = model.to(device)
model.load_state_dict(torch.load(args.model_path))

dataload = DataLoader(Dataload(add_(args.bin_path) + 'test', src=args.src_lang, trg=args.tgt_lang),
                      batch_size=32, collate_fn=collate_fn)
real = []
predict = []
pbtr = tqdm(total=len(dataload))
with torch.no_grad():
class Trainer():
    def __init__(self, **kwargs):
        dataset_folder = Path(kwargs["dataset_folder"]).resolve()
        check_valid_path(dataset_folder)
        result_folder = kwargs["result_folder"]

        self.initial_epoch = 1
        self.test_mode = kwargs["test"]
        self.epochs = kwargs["epochs"]
        self.hidden_size = kwargs["hidden_size"]
        self.num_heads = kwargs["heads"]
        self.use_label_smoothing = kwargs["label_smoothing"]
        self.ckpt_path = kwargs["ckpt_path"]
        self.ckpt_epoch = kwargs["ckpt_epoch"]

        # Create the folders and files the model needs
        self.log_folder, self.ckpt_folder, self.image_folder = create_folder(result_folder)
        if not self.test_mode:
            self.training_result_file = self.log_folder / "training_result.txt"
        self.test_result_file = None

        # Save the kwargs values
        msg = ""
        for k, v in list(kwargs.items()):
            msg += "{} = {}\n".format(k, v)
        msg += "new model checkpoint path = {}\n".format(self.ckpt_folder)
        with (self.log_folder / "model_settings.txt").open("w", encoding="utf-8") as fp:
            fp.write(msg)

        # Load the required data (vocabularies and datasets)
        self.src_word2id, self.src_id2word, self.src_vocab_size = load_word_dic(dataset_folder / "src_word2id.pkl")
        self.tar_word2id, self.tar_id2word, self.tar_vocab_size = load_word_dic(dataset_folder / "tar_word2id.pkl")

        if not self.test_mode:
            # encoder data : append the <END> tag
            # decoder data : 1) input = prepend only the <START> tag  2) output = append only the <END> tag
            train_src, num_train_src = get_dataset(self.src_word2id, dataset_folder / "train_src.txt", False, True, True)
            train_tar, num_train_tar = get_dataset(self.tar_word2id, dataset_folder / "train_tar.txt", True, True, True)
            if num_train_src != num_train_tar:
                raise Exception("The Korean dataset ({}) and the English dataset ({}) have different sizes.".format(
                    num_train_src, num_train_tar))

            self.num_train = num_train_src
            self.train_dataset = tf.data.Dataset.from_generator(lambda: zip(train_src, train_tar), (tf.int32, tf.int32))
            self.train_dataset = self.train_dataset.cache().shuffle(self.num_train + 1).padded_batch(
                batch_size=kwargs["batch_size"],
                padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None])),
                padding_values=(self.src_word2id["<PAD>"], self.tar_word2id["<PAD>"])).prefetch(1)

        test_src_path = dataset_folder / "test.txt"
        if test_src_path.exists():
            test_src, self.num_test = get_dataset(self.src_word2id, test_src_path, False, True, False)
            self.test_dataset = tf.data.Dataset.from_generator(lambda: test_src, tf.int32)
            self.test_dataset = self.test_dataset.cache().batch(1).prefetch(1)
            self.test_result_file = self.log_folder / "test_result.txt"
        elif self.test_mode:
            raise FileNotFoundError("The path [ {} ] does not exist.".format(test_src_path))

        self.transformer = Transformer(self.src_vocab_size, self.tar_vocab_size, self.src_word2id["<PAD>"],
                                       kwargs["num_layers"], kwargs["heads"], kwargs["embedding_size"],
                                       kwargs["hidden_size"], kwargs["dropout_rate"], kwargs["use_conv"])

    def start(self):
        if self.test_mode:
            self.test()
        else:
            self.train()

    def train(self):
        self.optimizer = tf.keras.optimizers.Adam(beta_1=0.9, beta_2=0.98, epsilon=1e-9)
        if self.use_label_smoothing:
            self.loss_function = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        else:
            self.loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        self.loss_metric = tf.keras.metrics.Mean(name="train_loss")
        self.acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(name="train_acc")

        ckpt = tf.train.Checkpoint(model=self.transformer, opt=self.optimizer)
        if self.ckpt_path is not None:
            fname, self.initial_epoch = load_checkpoint(Path(self.ckpt_path).resolve(), self.ckpt_epoch)
            print("\nCheckpoint File : {}\n".format(fname))
            ckpt.mapped = {"model": self.transformer, "opt": self.optimizer}
            ckpt.restore(fname)

        progbar = tf.keras.utils.Progbar(target=self.num_train)
        self.count = 0
        for epoch in range(self.initial_epoch, self.initial_epoch + self.epochs):
            K.set_value(self.optimizer.lr, self._get_lr(epoch))
            progbar.update(0)
            self.loss_metric.reset_states()
            self.acc_metric.reset_states()
            start_time = korea_time(None)

            for train_src, train_tar in self.train_dataset:
                num_data = K.int_shape(train_src)[0]
                logits = self.forward(train_src, train_tar)
                progbar.add(num_data)

            end_time = korea_time(None)
            epoch_loss = self.loss_metric.result()
            epoch_acc = self.acc_metric.result()

            ckpt_prefix = self.ckpt_folder / "Epoch-{}_Loss-{:.5f}_Acc-{:.5f}".format(epoch, epoch_loss, epoch_acc)
            ckpt.save(file_prefix=ckpt_prefix)
            print("Epoch = [{:5d}] Loss = [{:8.6f}] Acc = [{:8.6f}] LR = [{:.10f}]\n".format(
                epoch, epoch_loss, epoch_acc, K.get_value(self.optimizer.lr)))

            # Save the training results for this epoch
            msg = "Epoch = [{:5d}] - End Time [ {} ]\n".format(epoch, end_time.strftime("%Y/%m/%d %H:%M:%S"))
            msg += "Elapsed Time = {}\n".format(end_time - start_time)
            msg += "Learning Rate = [{:.10f}]\n".format(K.get_value(self.optimizer.lr))
            msg += "Loss : [{:8.6f}] - Acc : [{:8.6f}]\n".format(epoch_loss, epoch_acc)
            msg += " - " * 15 + "\n\n"
            with self.training_result_file.open("a+", encoding="utf-8") as fp:
                fp.write(msg)

            if self.test_result_file is not None:
                self.translate(epoch)

    def test(self):
        ckpt = tf.train.Checkpoint(model=self.transformer)
        fname, _ = load_checkpoint(Path(self.ckpt_path).resolve(), self.ckpt_epoch)
        print("\nCheckpoint File : {}\n".format(fname))
        # Restore only the model weights
        ckpt.mapped = {"model": self.transformer}
        ckpt.restore(fname).expect_partial()
        self.translate("Test")

    def _get_lr(self, step):
        # Learning-rate schedule from "Attention Is All You Need" with 4000 warmup steps
        return pow(self.hidden_size, -0.5) * min(pow(step, -0.5), step * pow(4000, -1.5))

    def get_loss(self, labels, logits):
        # labels shape : (sequence_length, 1)
        # logits shape : (sequence_length, vocab_size)
        # Exclude the decoder's PAD positions from the loss
        loss_masking = tf.math.not_equal(labels, self.tar_word2id["<PAD>"])
        if self.use_label_smoothing:
            labels = K.one_hot(labels, self.tar_vocab_size)
            labels = self.label_smoothing(labels, self.tar_vocab_size)
        loss = self.loss_function(labels, logits)
        # loss_masking contains True/False values, so cast it to numbers
        loss_masking = tf.cast(loss_masking, loss.dtype)
        loss *= loss_masking  # exclude the PAD positions from the loss
        return tf.reduce_sum(loss) / (tf.reduce_sum(loss_masking) + 1e-9)

    def label_smoothing(self, inputs, vocab_size, epsilon=0.1):
        vocab_size = K.int_shape(inputs)[-1]
        return ((1 - epsilon) * inputs) + (epsilon / vocab_size)

    @tf.function(input_signature=[tf.TensorSpec((None, None), tf.int32),
                                  tf.TensorSpec((None, None), tf.int32)])
    def forward(self, train_src, train_tar):
        # train_tar  = <START> Token Token Token ... Token <END>
        # input_tar  = <START> Token Token Token ... Token
        # output_tar = Token Token Token ... Token <END>
        enc_inputs = train_src
        dec_inputs = train_tar[:, :-1]
        dec_outputs = train_tar[:, 1:]

        with tf.GradientTape() as tape:
            logits, _ = self.transformer(enc_inputs, dec_inputs, True)
            loss = self.get_loss(dec_outputs, logits)

        grads = tape.gradient(loss, self.transformer.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.transformer.trainable_variables))
        self.loss_metric.update_state(loss)
        self.acc_metric.update_state(dec_outputs, logits)
        return logits

    def translate(self, epoch):
        results = []
        att_weights_list = []
        for test_src in self.test_dataset:
            dec_inputs = tf.expand_dims([self.tar_word2id["<START>"]], 1)  # (1, 1)
            tar_list = []
            # Compute the encoder output once in advance to avoid redundant computation
            enc_outputs, padding_mask = self.transformer.encoder(test_src, False)
            for idx in range(20):  # predict at most 20 tokens
                # logits shape : (1, sequence_length, vocab_size)
                # att_weights stores one weight matrix per head (kwargs["heads"] of them)
                # att_weights shape : (1, num_heads, target_sentence_len, source_sentence_len)
                dec_outputs, att_weights = self.transformer.decoder(dec_inputs, enc_outputs, padding_mask, False)
                logits = self.transformer.linear(dec_outputs)
                # Select only the logits of the last word. shape : (1, vocab_size)
                last_word_logits = logits[:, -1, :]
                word_id = K.get_value(K.argmax(last_word_logits, axis=-1))[0]
                word = self.tar_id2word[word_id].split("/")[0]
                tar_list.append(word)
                if word == "<END>":
                    break
                dec_inputs = tf.concat([dec_inputs, [[word_id]]], axis=-1)  # shape : (1, n)
            results.append(tar_list)
            att_weights_list.append(att_weights[0])  # drop the batch dimension
        self.save_results(results, att_weights_list, epoch)

    def save_results(self, tar_list, att_weights, epoch):
        image_epoch_folder = self.image_folder / "epoch-{}".format(epoch)
        image_epoch_folder.mkdir()
        dataset = zip(self.test_dataset, tar_list, att_weights)
        with self.test_result_file.open("a+", encoding="utf-8") as fp:
            if isinstance(epoch, int):
                fp.write("Epoch = [{:5d}]\n".format(epoch))
            else:
                fp.write("Epoch = {}\n".format(epoch))
        for idx, (src_id, tar, weights) in enumerate(dataset):
            # Exclude the <END> tag
            src = id_to_word([K.get_value(num) for num in src_id[0][:-1]], self.src_id2word)
            src_sentence = [word.split("/")[0] for word in src]
            tar_sentence = [word.split("/")[0] for word in tar if word != "<END>"]
            with self.test_result_file.open("a+", encoding="utf-8") as fp:
                msg = "Source : {}\n".format(" ".join(src_sentence))
                if len(tar_sentence):
                    msg += "Target : {}\n\n".format(" ".join(tar_sentence))
                else:
                    msg += "Target : no translation result.\n\n"
                fp.write(msg)
            if len(tar_sentence):
                self.plot_attention(weights, src_sentence, tar_sentence, image_epoch_folder, idx + 1)
        with self.test_result_file.open("a+", encoding="utf-8") as fp:
            fp.write(" - - " * 10 + "\n\n")

    def plot_attention(self, att_weights, src_sentence, tar_sentence, image_epoch_folder, idx):
        sample_src = " ".join(src_sentence[:5])
        save_folder = image_epoch_folder / sample_src
        if not save_folder.exists():
            save_folder.mkdir()
        # Exclude the last token of tar_sentence if it is "<END>"
        y_len = len(tar_sentence)
        if tar_sentence[-1] == "<END>":
            y_len -= 1
        # Draw one attention map per head (kwargs["heads"] of them)
        for head_idx in range(self.num_heads):
            fig = plt.figure(figsize=(16, 16))
            ax = fig.add_subplot(1, 1, 1)
            graph = ax.matshow(att_weights[head_idx][:y_len, :], cmap="viridis")
            fontdict = {"fontsize": 24}
            # x axis : Korean (source), y axis : English (target)
            ax.set_xticks(range(len(src_sentence) + 1))  # include the <END> tag
            ax.set_yticks(range(y_len))
            ax.set_xticklabels(src_sentence + ["<END>"], rotation=90, fontdict=fontdict)
            ax.set_yticklabels([token for token in tar_sentence if token != "<END>"], fontdict=fontdict)
            cax = fig.add_axes([ax.get_position().x1 + 0.01, ax.get_position().y0, 0.02, ax.get_position().height])
            fig.colorbar(graph, cax=cax)
            plt.savefig(save_folder / "Head-{}.png".format(head_idx + 1))
            plt.close(fig)
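# A hedged example of how Trainer might be driven. The keys mirror exactly what
# __init__ reads from kwargs above; the concrete values and this entry point are
# assumptions, not part of the original project.
if __name__ == "__main__":
    settings = {
        "dataset_folder": "./dataset",
        "result_folder": "./results",
        "test": False,
        "epochs": 20,
        "batch_size": 64,
        "num_layers": 6,
        "heads": 8,
        "embedding_size": 512,
        "hidden_size": 512,
        "dropout_rate": 0.1,
        "use_conv": False,
        "label_smoothing": True,
        "ckpt_path": None,
        "ckpt_epoch": None,
    }
    Trainer(**settings).start()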
logging.basicConfig(level=logging.INFO)

path = './data'
vocab_path = './data/vocab.txt'

logging.info('# Generate data.')
train_batches, num_train_batches, num_train_samples = get_batches(
    path, vocab_path, hp.batch_size, shuffle=True)

iter = tf.data.Iterator.from_structure(train_batches.output_types,
                                       train_batches.output_shapes)
xs, ys = iter.get_next()
train_init_op = iter.make_initializer(train_batches)

logging.info('# Load model.')
model = Transformer(hp)
loss, train_op, global_step, train_summaries = model.train(xs, ys)

logging.info('# Session')
saver = tf.train.Saver(max_to_keep=3)
with tf.Session() as sess:
    ckpt = tf.train.latest_checkpoint(hp.logdir)
    if ckpt is None:
        logging.info('# Initialize model.')
        sess.run(tf.global_variables_initializer())
        save_variable_space(os.path.join(hp.logdir, 'spaces'))
    else:
        logging.info('# Go on training the model.')
        saver.restore(sess, ckpt)
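    # The snippet stops right after the variables are restored or initialized. A typical
    # TF1 loop that could follow inside this session is sketched below; hp.num_epochs and
    # the checkpoint naming are assumptions, not the original code.
    sess.run(train_init_op)
    total_steps = hp.num_epochs * num_train_batches
    for _ in range(total_steps):
        step_loss, _, step = sess.run([loss, train_op, global_step])
        if step % num_train_batches == 0:  # roughly once per epoch
            logging.info('step {}: loss {:.4f}'.format(step, step_loss))
            saver.save(sess, os.path.join(hp.logdir, 'model'), global_step=step)
            sess.run(train_init_op)  # re-initialize the iterator for the next pass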
                       split=(96, 2, 2))
src_mask, tar_mask = get_mask(4 * CONST_LEN, random=False)
# send src_mask, tar_mask to GPU
src_mask, tar_mask = src_mask.to(device), tar_mask.to(device)
scale = torch.Tensor(dataLoader.scale)
scale = scale.to(device)

print("re-start a previous training ... ")
for k in range(epoch):
    if k and k % 25 == 0:
        checkpoint = {
            'model': Transformer(seq_len, channels, conv_k, dropout),
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        torch.save(checkpoint, str(k) + '_' + 'checkpoint_re.pth')
    loss_train = []
    dataLoader.shuffle()
    # set model training state
    model.train()
    for i, (cat, src, tar) in enumerate(dataLoader.get_training_batch()):
        # print("train mini-batch ", i)
        # send tensors to GPU
        # print("train - check input: ", check_tensor([cat, src, tar]))
        cat, src, tar = cat.to(device), src.to(device), tar.to(device)
        # print(src.size())
def train():
    model_params = Params()
    GPU_Option = tf.GPUOptions(per_process_gpu_memory_fraction=model_params.per_process_gpu_memory_fraction)
    session_conf = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        gpu_options=GPU_Option,
    )
    train_data, test_data = load_processed_data(model_params.processed_data_path)
    print("Test data size: {}".format(test_data[0].shape[0]))
    model = Transformer(model_params)
    model.train()
    with tf.device('/gpu:0'):
        with tf.Session(config=session_conf) as sess:
            if not os.path.exists(model_params.summary_path):
                os.mkdir(model_params.summary_path)
            summary_writer = tf.summary.FileWriter(model_params.summary_path, sess.graph)
            if not os.path.exists(model_params.model_save):
                os.mkdir(model_params.model_save)
            saver = tf.train.Saver(max_to_keep=4)
            latest_ckpt = tf.train.latest_checkpoint(model_params.model_save)
            if not latest_ckpt:
                sess.run(tf.global_variables_initializer())
                print('Initialize the model.')
            else:
                saver.restore(sess, latest_ckpt)
                print('Restore the model from the latest checkpoint.')
            last_loss = 10000
            for xs, decode_inputs, ys, epoch_i in get_batch(train_data,
                                                            epoch=model_params.epochs,
                                                            batch_size=model_params.batch_size):
                feed_dict = {model.xs: xs,
                             model.decode_inputs: decode_inputs,
                             model.ys: ys,
                             model.dropout_rate: model_params.dropout_rate}
                _, loss, global_step, y_hat, summary_ = sess.run(
                    [model.train_op, model.loss, model.global_step, model.y_hat, model.summary],
                    feed_dict=feed_dict)
                summary_writer.add_summary(summary_, global_step)
                if global_step % model_params.print_per_steps == 0:
                    print('{} Epoch: {}, global_step: {}, loss: {}'.format(
                        datetime.datetime.now().strftime('%Y-%m-%d %X'), epoch_i + 1, global_step, loss))
                if global_step % model_params.test_per_steps == 0:
                    temp_loss = 0
                    count = 0
                    for xs, decode_inputs, ys, _ in get_batch(test_data, epoch=1,
                                                              batch_size=model_params.batch_size,
                                                              shuffle=False):
                        feed_dict = {model.xs: xs,
                                     model.decode_inputs: decode_inputs,
                                     model.ys: ys,
                                     model.dropout_rate: 0}
                        loss = sess.run(model.loss, feed_dict=feed_dict)
                        temp_loss += loss
                        count += 1
                    loss = temp_loss / count
                    if loss < last_loss:
                        last_loss = loss
                        saver.save(sess, model_params.model_save + '/{}'.format(int(time.time())))
                        print('{} Save model with lower loss: {}.'.format(
                            datetime.datetime.now().strftime('%Y-%m-%d %X'), loss))
                        # Preview the test output
                        # (note: decode_inputs/ys here come from the last test batch,
                        #  while y_hat is from the last training step)
                        print(decode_inputs[:3])
                        print(ys[:3])
                        print(y_hat[:3])
            summary_writer.close()
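# train() iterates over a get_batch() generator that is not defined in this snippet.
# A minimal sketch that matches the (xs, decode_inputs, ys, epoch_i) signature used above,
# assuming each data split is a (xs, decode_inputs, ys) tuple of aligned numpy arrays
# (an assumption, not the project's actual helper):
import numpy as np


def get_batch(data, epoch, batch_size, shuffle=True):
    xs_all, decode_inputs_all, ys_all = data
    num_samples = xs_all.shape[0]
    for epoch_i in range(epoch):
        order = np.random.permutation(num_samples) if shuffle else np.arange(num_samples)
        for start in range(0, num_samples, batch_size):
            idx = order[start:start + batch_size]
            yield xs_all[idx], decode_inputs_all[idx], ys_all[idx], epoch_i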
    eval_accuracy = eval_corrects / eval_words
    eval_perplexity = math.exp(eval_loss / eval_words)
    return eval_accuracy, eval_perplexity


if __name__ == "__main__":
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    print("Building Dataloader ...")
    train_path = "/home/ubuntu/translation-data/train."
    traindataloader = Dataloader(train_path + "en.id", train_path + "de.id", 96, cuda=True)
    dev_path = "/home/ubuntu/translation-data/dev."
    devdataloader = Dataloader(dev_path + "en.id", dev_path + "de.id", 96, cuda=True, volatile=True)

    print("Building Model ...")
    model = Transformer(bpe_size=32000, h=8, d_model=512, p=0.1, d_ff=1024).cuda()
    nllloss_weights = torch.ones(32000)
    criterion = nn.NLLLoss(nllloss_weights, size_average=False, ignore_index=0).cuda()
    base_optim = torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09)
    optim = TransformerOptimizer(base_optim, warmup_steps=32000, d_model=512)

    print("Start Training ...")
    for epoch in range(60):
        if epoch > 0:
            traindataloader.shuffle(1024)
        if epoch == 20:
            optim.init_lr = 0.5 * optim.init_lr
        if epoch == 40:
            optim.init_lr = 0.1 * optim.init_lr
        train_acc, train_ppl = trainEpoch(epoch, model, criterion, traindataloader, optim)
        print("[Train][Epoch %2d] Accuracy: %6.2f, Perplexity: %6.2f" % (epoch + 1, train_acc, train_ppl))
devdataloader = Dataloader(valid_path1, valid_path2, batch_size, cuda=True, volatile=True)
if run_testing_during_training:
    testdataloader = Dataloader(test_path1, test_path2, 1, cuda=True, volatile=True)  # test sentences one by one

print("Building Model ...")
model = Transformer(bpe_size=vocab_size, h=8, d_model=512, p=0.1, d_ff=1024).cuda()
nllloss_weights = torch.ones(vocab_size)
criterion = nn.NLLLoss(nllloss_weights, size_average=False, ignore_index=0).cuda()
# criterion = nn.NLLLoss(size_average=False, ignore_index=0).cuda()
base_optim = torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09)
optim = TransformerOptimizer(base_optim, warmup_steps=32000, d_model=512)

print("Start Training ...")
best_eval_acc = 0
for epoch in range(num_epochs):
    if epoch > 0:
word2idx, idx2word = load_vocab(vocab_fpath)

x_1 = tf.placeholder(tf.int32, [1, 20], name='input')
x_2 = tf.placeholder(tf.int32, (), name='input')
x_3 = tf.placeholder(tf.string, (), name='input')
y_1 = tf.placeholder(tf.int32, [1, 1], name='output')
y_2 = tf.placeholder(tf.int32, [1, 19], name='output')
y_3 = tf.placeholder(tf.int32, (), name='output')
y_4 = tf.placeholder(tf.string, (), name='output')
x_input = (x_1, x_2, x_3)
y_input = (y_1, y_2, y_3, y_4)

sess = tf.Session()
m = Transformer(hp)
y_hat = m.infer(x_input, y_input)

new_saver = tf.train.Saver()
new_saver.restore(sess, tf.train.latest_checkpoint('./model'))


def generate_input(query):
    query_id = []
    for word in jieba.cut(query):
        query_id.append(word2idx.get(word, 1))
    query_id.append(word2idx.get('<S>'))
    if len(query_id) >= hp.maxlen:
        query_id = query_id[:20]
    else:
        query_id = pad(query_id, hp.maxlen, vocab_fpath)