def get_model():
    """Build the full model zoo used for style transfer training.

    Returns:
        Tuple of (ae5, ae7, discriminator, discriminator2,
        seq2seq57, seq2seq75, lm5, lm7), all moved to CUDA.

    Notes:
        - A single encoder, attention and output embedding are shared across
          the two autoencoders and both seq2seq wrappers, so the wrappers are
          tied to the same weights as ae5/ae7.
        - lm5/lm7 are pre-trained language models restored from checkpoints.
    """
    embedin = Embedin(vocab_size, embed_size)
    embedout = Embedout(vocab_size, hidden_size)
    # Shared encoder/attention; one decoder per target style (5 vs 7).
    enc = Encoder(embedin, embed_size, hidden_size, n_layers)
    dec5 = Decoder(embedin, embed_size, hidden_size, n_layers)
    dec7 = Decoder(embedin, embed_size, hidden_size, n_layers)
    atten = Attention(hidden_size)
    # Trailing constants 13 / 17 are passed straight through to Autoencoder;
    # presumably maximum decode lengths per style — TODO confirm.
    ae5 = Autoencoder(enc, dec5, atten, embedout, 13)
    ae7 = Autoencoder(enc, dec7, atten, embedout, 17)
    discriminator = Discriminator(hidden_size)
    discriminator2 = Discriminator2(vocab_size, embed_size, hidden_size)
    # Cross-style translators share all submodules with ae5/ae7.
    seq2seq57 = Autoencoder(enc, dec7, atten, embedout, 17)
    seq2seq75 = Autoencoder(enc, dec5, atten, embedout, 13)
    lm5 = Lstm(vocab_size, embed_size, hidden_size, n_layers, drop_out=0)
    lm7 = Lstm(vocab_size, embed_size, hidden_size, n_layers, drop_out=0)
    lm5.load_state_dict(torch.load('models/lm5_lstm_dropout.th'))
    lm7.load_state_dict(torch.load('models/lm7_lstm_dropout.th'))
    ae5 = ae5.cuda()
    ae7 = ae7.cuda()
    discriminator = discriminator.cuda()
    discriminator2 = discriminator2.cuda()
    # BUG FIX: the original re-constructed seq2seq57/seq2seq75 here (discarding
    # the instances built above) instead of moving them to CUDA like every
    # other model in this sequence.
    seq2seq57 = seq2seq57.cuda()
    seq2seq75 = seq2seq75.cuda()
    lm5 = lm5.cuda()
    lm7 = lm7.cuda()
    return ae5, ae7, discriminator, discriminator2, seq2seq57, seq2seq75, lm5, lm7
def __init__(self, inp_dim, out_dim, emb_dim, enc_hid, dec_hid, enc_drop, dec_drop, epoch, clip, sparse_max, tf, max_length, vocab, batch, device):
    """Configure and assemble a seq2seq trainer.

    Stores the hyper-parameters, builds the attention/encoder/decoder stack,
    wraps it in a Seq2Seq model on `device`, initializes weights, and prepares
    the Adam optimizer plus a padding-aware loss (pad index 0).
    """
    # --- hyper-parameters and environment ---
    self.inp_dim = inp_dim
    self.out_dim = out_dim
    self.emb_dim = emb_dim
    self.enc_hid = enc_hid
    self.dec_hid = dec_hid
    self.enc_drop = enc_drop
    self.dec_drop = dec_drop
    self.tf = tf                      # teacher-forcing ratio
    self.max_length = max_length
    self.batch = batch
    self.device = device
    self.vocab = vocab
    self.epoch = epoch
    self.clip = clip                  # gradient clipping threshold
    # --- model assembly: attention feeds the decoder, both wrap into Seq2Seq ---
    self.attn = Attention(enc_hid, dec_hid, sparse_max=sparse_max)
    self.enc = Encoder(inp_dim, emb_dim, enc_hid, dec_hid, enc_drop)
    self.dec = Decoder(out_dim, emb_dim, enc_hid, dec_hid, dec_drop, self.attn)
    self.model = Seq2Seq(self.enc, self.dec, device).to(device)
    self.model.apply(self.init_weights)
    self.count_parameters()
    # --- optimization: sparsemax loss mirrors a sparse-max attention setup ---
    self.optimizer = optim.Adam(self.model.parameters())
    if sparse_max:
        self.criterion = SparsemaxLoss(ignore_index=0)
    else:
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)  # pad_idx 0
# Build data iterators and the seq2seq model; these module-level names
# (model, optimizer, criterion, ...) are used by training code further down.
train_iterator, valid_iterator, test_iterator, SRC, TRG = Return_Data_Loaders()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Vocabulary sizes come from the torchtext-style fields returned above.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)
##########################################################################
model.apply(init_weights)
optimizer = optim.Adam(model.parameters())
# Padding positions are excluded from the loss via ignore_index.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
##########################################################################
import torch
import pickle
from torch.utils.data import DataLoader
from my_dataloader import *
from create_vocabulary import *
from Model import Encoder, Decoder, Seq2Seq
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#encoder = Encoder(input_dim=2999, name='emb_inspec.npy')
#decoder = Decoder(output_dim=2999, name='emb_inspec.npy')
encoder = Encoder()
decoder = Decoder()
model = Seq2Seq(encoder, decoder, device).to(device)
#model.load_state_dict(torch.load('train.pt'))

def init_weights(m):
    """Initialize every parameter of `m` from N(0, 0.01) — applied to weights
    and biases alike (no per-type distinction)."""
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)

batch = 64
tot_epoch = 100
# `np` is presumably brought in by one of the wildcard imports above — verify.
vocab = np.load('vocab_kp20k2.npy', allow_pickle=True).item()
#vocab = np.load('vocab_inspec.npy', allow_pickle=True).item()
# NOTE(review): vocab is *called* here, which only works if the pickled object
# defines __call__; if it is a plain dict this raises TypeError and should be
# vocab['<pad>'] — confirm against create_vocabulary.
TRG_PAD_IDX = vocab('<pad>')
def _create_graph(self, DECODER_TYPE):
    """Build the TF1 computation graph for a pointer-network-style VRP/TSP solver.

    DECODER_TYPE selects the decoding mode; the training ops, losses and
    summaries are only built when DECODER_TYPE == 0. Config.DIRECTION selects
    the encoder/decoder architecture (RNN encoder, attention-only, beam search,
    or the Reza/Wyatt alternative models).
    """
    # ---- placeholders & derived scalars -------------------------------------
    # raw_state: (batch, customers+1, 2) coordinates; last row is the depot /
    # current location.
    self.raw_state = tf.placeholder(tf.float32, shape=[None, Config.NUM_OF_CUSTOMERS+1, 2], name='State')
    self.current_location = self.raw_state[:, -1]
    self.sampled_cost = tf.placeholder(tf.float32, [None, 1], name='Sampled_Cost')
    if Config.SEQUENCE_COST == 1:
        # Per-step costs instead of a single tour cost.
        self.sampled_cost = tf.placeholder(tf.float32, [None, Config.NUM_OF_CUSTOMERS], name='Sampled_Cost')
    self.batch_size = tf.shape(self.raw_state)[0]
    self.keep_prob = tf.placeholder(tf.float32)
    self.global_step = tf.Variable(0, trainable=False, name='step')
    self.input_lengths = tf.convert_to_tensor([Config.NUM_OF_CUSTOMERS]*(self.batch_size))
    # OR-tools reference route/cost, used as baseline and for supervised loss.
    self.or_route = tf.placeholder(tf.int32, shape=[None, Config.NUM_OF_CUSTOMERS+1])
    self.or_cost = tf.placeholder(tf.float32, shape=[None, 1])
    self.difference_in_length = tf.reduce_mean(self.sampled_cost - self.or_cost)
    self.relative_length = tf.reduce_mean(self.sampled_cost/self.or_cost)
    if Config.SEQUENCE_COST == 1:
        self.relative_length = tf.reduce_mean(self.sampled_cost[:, 0]/self.or_cost)
    self.start_tokens = tf.placeholder(tf.int32, shape=[None])
    self.end_token = -1
    # Moving-average baseline for REINFORCE; vector-valued in sequence-cost mode.
    self.MA_baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False)
    if Config.SEQUENCE_COST == 1:
        self.MA_baseline = tf.Variable(tf.tile([0.0], [Config.NUM_OF_CUSTOMERS]), dtype=tf.float32, trainable=False)
        self.assign_init_MA = tf.assign(self.MA_baseline, tf.reduce_mean(self.sampled_cost, axis=0))
    else:
        self.assign_init_MA = tf.assign(self.MA_baseline, tf.reduce_mean(self.sampled_cost))
    # ---- optional state embedding -------------------------------------------
    if Config.STATE_EMBED == 1:
        self.with_depot_state = self.raw_state
        # NOTE(review): range(0) makes this loop dead code — only the final
        # 1x1 conv below actually runs.
        for i in range(0):
            self.with_depot_state = tf.layers.conv1d(self.with_depot_state, Config.RNN_HIDDEN_DIM, 1,
                                                     padding="SAME", activation=tf.nn.relu)
        self.with_depot_state = tf.layers.conv1d(self.with_depot_state, Config.RNN_HIDDEN_DIM, 1, padding="VALID")
    else:
        self.with_depot_state = self.raw_state
    # Drop the depot row for the encoder input.
    self.state = self.with_depot_state[:, :-1, :]
    self.old_probs = tf.placeholder(tf.float32, shape=[None, Config.NUM_OF_CUSTOMERS,
                                                       Config.NUM_OF_CUSTOMERS])
    # ---- ENCODER ------------------------------------------------------------
    if Config.DIRECTION == 4 or Config.DIRECTION == 5 or Config.DIRECTION == 6:
        # Attention-only directions: raw state passes straight to the decoder.
        self.encoder_outputs = self.state
        self.encoder_state = None
    # NOTE(review): the != 6 test is redundant given < 6.
    if Config.DIRECTION < 6 and Config.DIRECTION != 4 and Config.DIRECTION != 5 and Config.DIRECTION != 6:
        self.encoder_outputs, self.encoder_state = Encoder(self.state, self.keep_prob)
    # ---- HELPERS ------------------------------------------------------------
    # Teacher-forcing index sequence: start token followed by the OR route
    # (last step dropped so lengths match).
    self.training_index = tf.concat([tf.expand_dims(self.start_tokens, -1), self.or_route], axis=1)
    self.training_index = self.training_index[:, :-1]
    # (batch*steps, 2) indices used to gather the coordinates/embeddings of
    # each visited node.
    self.gather_ids = tf.concat([tf.expand_dims(
        tf.reshape(tf.tile(tf.reshape(tf.range(self.batch_size), [-1, 1]),
                           [1, tf.shape(self.with_depot_state)[1]]), [-1]), -1),
        tf.reshape(self.training_index, [-1, 1])], -1)
    if Config.STATE_EMBED == 0:
        self.training_inputs = tf.reshape(tf.gather_nd(self.with_depot_state, self.gather_ids),
                                          [self.batch_size, tf.shape(self.with_depot_state)[1], 2])
    else:
        self.training_inputs = tf.reshape(tf.gather_nd(self.with_depot_state, self.gather_ids),
                                          [self.batch_size, tf.shape(self.with_depot_state)[1], Config.RNN_HIDDEN_DIM])
    train_helper, pred_helper = Helper(self.with_depot_state, self.batch_size, self.training_inputs,
                                       self.start_tokens, self.end_token)
    # ---- DECODER ------------------------------------------------------------
    if Config.DIRECTION < 6:
        train_decoder, pred_decoder, critic_network_pred = Decoder(self.batch_size, self.encoder_state,
                                                                   self.encoder_outputs, train_helper, pred_helper,
                                                                   self.state, self.start_tokens, self.end_token,
                                                                   self.keep_prob, self.raw_state, DECODER_TYPE)
        self.train_final_output, self.train_final_state, train_final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            train_decoder, impute_finished=False, maximum_iterations=tf.shape(self.state)[1])
        self.train_final_action = self.train_final_output.sample_id
        self.pred_final_output, self.pred_final_state, pred_final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            pred_decoder, impute_finished=False, maximum_iterations=tf.shape(self.state)[1])
        self.pred_final_action = self.pred_final_output.sample_id
        self.base_line_est = critic_network_pred
        self.logits = self.train_final_output.rnn_output
    if Config.DIRECTION == 6:
        self.train_final_action, self.pred_final_action, self.base_line_est, self.logits = Beam_Search(
            self.batch_size, self.encoder_state, self.encoder_outputs, train_helper, pred_helper,
            self.with_depot_state, self.start_tokens, self.end_token, self.keep_prob, self.raw_state, DECODER_TYPE)
        # self.pred_final_action = tf.squeeze(self.pred_final_action)
    if Config.DIRECTION == 9:
        self.train_final_action, self.pred_final_action, self.base_line_est, self.logits = Reza_Model(
            self.batch_size, self.with_depot_state)
    if Config.DIRECTION == 10:
        self.train_final_action, self.pred_final_action, self.base_line_est, self.logits = Wyatt_Model(
            self.batch_size, self.state, self.raw_state)
    # ---- per-step probabilities (used for the PPO ratio) --------------------
    # Heavily negative (masked) logits get a penalty added back before softmax;
    # probabilities are clipped away from 0 to keep the ratio finite.
    self.probs = self.logits
    self.probs = self.probs + tf.to_float(tf.less(self.probs, -.8*Config.LOGIT_PENALTY))*Config.LOGIT_PENALTY
    self.probs = tf.clip_by_value(tf.nn.softmax(self.probs), 1e-7, 1e7)
    # (batch, step, chosen-action) index triples for gather_nd.
    gather_ind = tf.concat([
        tf.reshape(tf.tile(tf.reshape(tf.range(0, self.batch_size), [-1, 1]), [1, Config.NUM_OF_CUSTOMERS]), [-1, 1]),
        tf.tile(tf.reshape(tf.range(0, Config.NUM_OF_CUSTOMERS), [-1, 1]), [self.batch_size, 1]),
        tf.reshape(self.pred_final_action, [-1, 1])], axis=1)
    self.new_probs_with_pi = tf.reshape(tf.gather_nd(self.probs, gather_ind), [self.batch_size, Config.NUM_OF_CUSTOMERS])
    self.old_probs_with_pi = tf.reshape(tf.gather_nd(self.old_probs, gather_ind), [self.batch_size, Config.NUM_OF_CUSTOMERS])
    self.ratio = tf.divide(self.new_probs_with_pi, self.old_probs_with_pi)
    # ---- losses, training ops and summaries (training graph only) -----------
    if DECODER_TYPE == 0:
        # x = tf.range(0, 19, dtype=tf.int32)
        # x = [tf.random_shuffle(x)]
        # for i in range(499):
        #     y = tf.range(0, 19, dtype=tf.int32)
        #     y = [tf.random_shuffle(y)]
        #     x = tf.concat((x, y), axis=0)
        # self.pred_final_action = x[:self.batch_size, :]
        # Critic regresses the sampled tour cost.
        if Config.SEQUENCE_COST == 0:
            self.critic_loss = tf.losses.mean_squared_error(self.sampled_cost, self.base_line_est)
        else:
            self.critic_loss = tf.losses.mean_squared_error(tf.reshape(self.sampled_cost[:, 0], [-1, 1]),
                                                            self.base_line_est)
        if Config.LOGIT_CLIP_SCALAR != 0:
            # Bounded logits (tanh clipping, as in the pointer-network papers).
            self.logits = Config.LOGIT_CLIP_SCALAR*tf.nn.tanh(self.logits)
        if Config.REINFORCE == 0:
            # Supervised: cross-entropy against the OR-tools route.
            # self.weights = tf.to_float(tf.tile(tf.reshape(tf.range(
            #     1, tf.divide(1, tf.shape(self.state)[1]), -tf.divide(1, tf.shape(self.state)[1])),
            #     [1, -1]), [self.batch_size, 1]))
            self.actor_loss = tf.contrib.seq2seq.sequence_loss(
                logits=self.logits,
                targets=self.or_route[:, :-1],
                weights=tf.ones([self.batch_size, tf.shape(self.state)[1]])
                # weights=self.weights
            )
        else:
            # Policy gradient: log-prob of the taken actions times the
            # (baseline-subtracted) cost; cost is stopped from backprop.
            self.neg_log_prob = -1*tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits,
                                                                                  labels=self.train_final_action)
            self.R = tf.stop_gradient(self.sampled_cost)
            if Config.SEQUENCE_COST == 1 and Config.USE_PPO == 0:
                # Per-step moving-average baseline (EMA with decay .999).
                assign = tf.assign(self.MA_baseline, self.MA_baseline*.999 + tf.reduce_mean(self.R, axis=0)*.001)
                with tf.control_dependencies([assign]):
                    V = self.MA_baseline
                    self.actor_loss = tf.reduce_mean(tf.multiply(self.neg_log_prob, self.R-V))
            elif Config.USE_PPO == 1:
                # PPO clipped-surrogate objective over the step-wise ratio.
                assign = tf.assign(self.MA_baseline, self.MA_baseline*.999 + tf.reduce_mean(self.R, axis=0)*.001)
                with tf.control_dependencies([assign]):
                    V = self.MA_baseline
                    adv = self.R - V
                    epsilon = 0.1
                    self.actor_loss = -tf.reduce_mean(tf.reduce_sum(
                        tf.minimum(tf.multiply(self.ratio, adv),
                                   tf.clip_by_value(self.ratio, 1.0-epsilon, 1.0+epsilon)*adv), axis=1))
            elif Config.MOVING_AVERAGE == 1:
                # Scalar moving-average baseline.
                assign = tf.assign(self.MA_baseline, self.MA_baseline*.999 + tf.reduce_mean(self.R)*.001)
                with tf.control_dependencies([assign]):
                    V = self.MA_baseline
                    self.actor_loss = tf.reduce_mean(tf.multiply(tf.reduce_sum(self.neg_log_prob, axis=1), self.R-V))
            elif Config.USE_OR_COST == 1:
                # OR-tools cost as baseline; /5 scales the advantage.
                V = tf.stop_gradient(self.or_cost)
                self.actor_loss = tf.reduce_mean(tf.multiply(tf.reduce_sum(self.neg_log_prob, axis=1), (self.R-V)/5))
            else:
                # Learned critic as baseline.
                V = tf.stop_gradient(self.base_line_est)
                self.actor_loss = tf.reduce_mean(tf.multiply(tf.reduce_sum(self.neg_log_prob,
                                                                           axis=1), self.R-V))
        with tf.name_scope("Train"):
            if Config.GPU == 1:
                colocate = True
            else:
                colocate = False
            if Config.LR_DECAY_OFF == 0:
                self.lr = tf.train.exponential_decay(
                    Config.LEARNING_RATE, self.global_step, 200000, .9,
                    staircase=True, name="learning_rate")
            else:
                self.lr = Config.LEARNING_RATE
            self.train_critic_op = tf.train.AdamOptimizer(self.lr).minimize(self.critic_loss)
            if Config.MAX_GRAD != 0:
                # Manual gradient path so global-norm clipping can be applied.
                self.params = tf.trainable_variables()
                self.gradients = tf.gradients(self.actor_loss, self.params, colocate_gradients_with_ops=colocate)
                opt = tf.train.AdamOptimizer(self.lr)
                self.clipped_gradients, gradient_norm = tf.clip_by_global_norm(self.gradients, Config.MAX_GRAD)
                self.train_actor_op = opt.apply_gradients(zip(self.clipped_gradients, self.params),
                                                          global_step=self.global_step)
                tf.summary.scalar("grad_norm", gradient_norm)
                tf.summary.scalar("LearningRate", self.lr)
            else:
                self.train_actor_op = tf.train.AdamOptimizer(self.lr).minimize(
                    self.actor_loss, global_step=self.global_step, colocate_gradients_with_ops=colocate)
        # # for gradient clipping https://github.com/tensorflow/nmt/blob/master/nmt/model.py
        with tf.name_scope("Loss"):
            tf.summary.scalar("Loss", self.actor_loss)
            tf.summary.scalar("Critic_Loss", self.critic_loss)
        with tf.name_scope("Performace"):
            tf.summary.scalar("Relative Critic Loss", tf.reduce_mean(self.base_line_est/self.or_cost))
            tf.summary.scalar("Relative Critic Loss to Sampled", tf.reduce_mean(self.base_line_est/self.sampled_cost))
            tf.summary.scalar("difference_in_length", self.difference_in_length)
            tf.summary.scalar("relative_length", self.relative_length)
            tf.summary.scalar("Avg_or_cost", tf.reduce_mean(self.or_cost))
            if Config.SEQUENCE_COST == 0:
                tf.summary.scalar("Avg_sampled_cost", tf.reduce_mean(self.sampled_cost))
            else:
                tf.summary.scalar("Avg_sampled_cost", tf.reduce_mean(self.sampled_cost[:, 0]))
            # tf.summary.histogram("LocationStartDist", tf.transpose(self.pred_final_action, [1, 0])[0])
            # tf.summary.histogram("LocationEndDist", tf.transpose(self.pred_final_action, [1, 0])[-1])
        with tf.name_scope("Config"):
            # Record the run configuration alongside the metrics.
            tf.summary.scalar("REINFORCE", Config.REINFORCE)
            tf.summary.scalar("DIRECTION", Config.DIRECTION)
            tf.summary.scalar("NUM_OF_CUSTOMERS", Config.NUM_OF_CUSTOMERS)
            tf.summary.scalar("StateEmbed", tf.cast(Config.STATE_EMBED, tf.int32))
            tf.summary.scalar("MAX_GRAD", Config.MAX_GRAD)
            tf.summary.scalar("LogitPen", Config.LOGIT_PENALTY)
            tf.summary.scalar("batch_size", self.batch_size)
            tf.summary.scalar("Config.LAYERS_STACKED_COUNT", Config.LAYERS_STACKED_COUNT)
            tf.summary.scalar("RNN_HIDDEN_DIM", Config.RNN_HIDDEN_DIM)
            tf.summary.scalar("RUN_TIME", Config.RUN_TIME)
            tf.summary.scalar("LOGIT_CLIP_SCALAR", Config.LOGIT_CLIP_SCALAR)
            tf.summary.scalar("Droput", tf.cast(Config.DROPOUT, tf.int32))
            tf.summary.scalar("GPU", Config.GPU)
def __init__(self, **kwargs):
    """Set up a translation model run from CLI kwargs.

    Resolves dataset/result paths, persists the run settings, loads the
    source/target vocabularies, builds the train/test tf.data pipelines,
    and constructs the encoder/decoder.
    """
    dataset_folder = Path(kwargs["dataset_folder"]).resolve()
    check_valid_path(dataset_folder)
    result_folder = kwargs["result_folder"]
    self.initial_epoch = 1
    self.test_mode = kwargs["test"]
    self.epochs = kwargs["epochs"]
    self.use_label_smoothing = kwargs["label_smoothing"]
    self.ckpt_path = kwargs["ckpt_path"]
    self.ckpt_epoch = kwargs["ckpt_epoch"]
    # Create the folders and files the model needs.
    self.log_folder, self.ckpt_folder, self.image_folder = create_folder(
        result_folder)
    if not self.test_mode:
        self.training_result_file = self.log_folder / "training_result.txt"
    self.test_result_file = None
    # Persist the kwargs so the run is reproducible.
    msg = ""
    for k, v in list(kwargs.items()):
        msg += "{} = {}\n".format(k, v)
    msg += "new model checkpoint path = {}\n".format(self.ckpt_folder)
    with (self.log_folder / "model_settings.txt").open(
            "w", encoding="utf-8") as fp:
        fp.write(msg)
    # Load the required data (word <-> id dictionaries).
    self.src_word2id, self.src_id2word, self.src_vocab_size = load_word_dic(
        dataset_folder / "src_word2id.pkl")
    self.tar_word2id, self.tar_id2word, self.tar_vocab_size = load_word_dic(
        dataset_folder / "tar_word2id.pkl")
    if not self.test_mode:
        train_src, num_train_src = get_dataset(
            self.src_word2id, dataset_folder / "train_src.txt", False, True, True)
        train_tar, num_train_tar = get_dataset(
            self.tar_word2id, dataset_folder / "train_tar.txt", True, True, True)
        if num_train_src != num_train_tar:
            # Error message (Korean): source and target dataset sizes differ.
            raise Exception(
                "source 데이터셋({})과 target 데이터셋({})의 크기가 다릅니다.".format(
                    num_train_src, num_train_tar))
        self.num_train = num_train_src
        # Padded, shuffled training pipeline; pad ids come from the vocabs.
        self.train_dataset = tf.data.Dataset.from_generator(
            lambda: zip(train_src, train_tar), (tf.int32, tf.int32))
        self.train_dataset = self.train_dataset.cache().shuffle(
            self.num_train + 1).padded_batch(
            batch_size=kwargs["batch_size"],
            padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None])),
            padding_values=(self.src_word2id["<PAD>"],
                            self.tar_word2id["<PAD>"])).prefetch(1)
    test_src_path = dataset_folder / "test.txt"
    if test_src_path.exists():
        test_src, self.num_test = get_dataset(
            self.src_word2id, test_src_path, False, True, False)
        # self.test_src_max_len = max([len(sentence) for sentence in test_src])
        # padded_test_src = tf.keras.preprocessing.sequence.pad_sequences(
        #     test_src, maxlen = self.test_src_max_len, padding = 'post',
        #     dtype = 'int32', value = self.src_word2id["<PAD>"])
        # Test pipeline decodes one sentence at a time (batch size 1).
        self.test_dataset = tf.data.Dataset.from_generator(
            lambda: test_src, tf.int32)
        self.test_dataset = self.test_dataset.cache().batch(1).prefetch(1)
        self.test_result_file = self.log_folder / "test_result.txt"
    elif self.test_mode:
        # Error message (Korean): the path does not exist.
        raise FileNotFoundError(
            "[ {} ] 경로가 존재하지 않습니다.".format(test_src_path))
    self.encoder = Encoder(self.src_vocab_size, kwargs["embedding_size"],
                           kwargs["hidden_size"], kwargs["dropout_rate"],
                           kwargs["gru"], kwargs["bi"])
    self.decoder = Decoder(self.tar_vocab_size, kwargs["embedding_size"],
                           kwargs["hidden_size"], kwargs["attention_size"],
                           kwargs["dropout_rate"], kwargs["gru"], kwargs["bi"])

# The 6 lines below keep Korean text from rendering as broken glyphs in Colab
# plots; they may be skipped outside that environment.
# %config InlineBackend.figure_format = 'retina'
# !apt -qq -y install fonts-nanum fontpath
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic')
mpl.font_manager._rebuild()
def _get_default_params():
    """Default encoder cell configuration: a 32-unit BasicLSTMCell with no
    input/output dropout."""
    return {
        "cell_class": "tensorflow.contrib.rnn.BasicLSTMCell",
        "cell_params": {
            "num_units": 32
        },
        "dropout_input_keep_prob": 1.0,
        "dropout_output_keep_prob": 1.0
    }

# Build an encoder-only graph trained with a sequence loss against `targets`.
input_data, source_sequence_length, targets, target_sequence_length = _get_inputs(batch_size)
encoder_embed_input = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, encoding_embedding_size)
encoder = Encoder(tf.contrib.learn.ModeKeys.TRAIN, _get_default_params())
outputs, final_state = encoder(encoder_embed_input, source_sequence_length)
# Project encoder outputs back onto the source vocabulary.
layers = Dense(source_vocab_size)
# I think I could use outputs to decoder real output.
outputs = layers(outputs)  # b * t * source_v_size
# Mask loss contributions beyond each sequence's real length.
masks = tf.sequence_mask(source_sequence_length, max_sequence_length)  # b * t
masks = tf.cast(masks, tf.float32)
cost = sequence_loss(outputs, targets, masks)
optimizer = tf.train.AdamOptimizer(lr)
# Element-wise gradient clipping to [-5, 5]; None grads are dropped.
gradients = optimizer.compute_gradients(cost)
capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
train_op = optimizer.apply_gradients(capped_gradients)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print("start trainning")
assert args.version is not None, "wrong --version argument." #------------------------------------------------------------------------- # #------------------------------------------------------------------------- Ps = args2Ps(args) #------------------------------------------------------------------------- # Vocabulary #------------------------------------------------------------------------- vocab = Vocabulary() vocab.make(dataset="flickr8k", min_word_freq=5) #------------------------------------------------------------------------- # models #------------------------------------------------------------------------- encoder = Encoder() encoder.fine_tune(Ps["fine_tune_encoder"]) decoder = Decoder(attention_dim = Ps["attention_dim"], embed_dim = Ps["embed_dim"], decoder_dim = Ps["decoder_dim"], encoder_dim = encoder.encoder_dim, vocab_size = len(vocab), device = Ps["device"], dropout = Ps["dropout"] ) encoder = encoder.to(Ps["device"]) decoder = decoder.to(Ps["device"]) # whether to load a saved state_dict from checkpoint file if Ps["parent"] is not None: pass #-------------------------------------------------------------------------
def train(train_dataset, validation_dataset=None, iterations=150, hidden_size=64, batch_size=16):
    """Train an encoder/decoder pair on a sequence-regression task.

    Args:
        train_dataset: dataset batched through `collate`.
        validation_dataset: optional validation dataset. BUG FIX: the original
            unconditionally built DataLoader(validation_dataset), crashing
            when the default None was used; validation is now skipped instead.
        iterations: number of epochs.
        hidden_size: hidden dimension of Encoder/Decoder.
        batch_size: DataLoader batch size.

    Side effects: plots losses via showPlot and saves both models under models/.
    """
    print("Training...")
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate)
    validation_loader = None
    if validation_dataset is not None:
        validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate)
    encoder = Encoder(1, hidden_size).to(device)
    decoder = Decoder(hidden_size, 1).to(device)
    encoder_optimizer = optim.Adam(encoder.parameters())
    decoder_optimizer = optim.Adam(decoder.parameters())
    criterion = nn.MSELoss()
    train_losses = []
    validation_losses = []
    for epoch in range(iterations):  # renamed from `iter`, which shadowed the builtin
        encoder.train()
        decoder.train()
        loss_acc = 0
        for input_tensor, target_tensor, _, max_len, lens in train_loader:
            # BUG FIX: the final batch may be smaller than batch_size; using
            # the fixed value made view()/zeros() crash on it.
            cur_bs = target_tensor.size(0)
            _, encoder_hidden = encoder(input_tensor, None)
            decoder_hidden = encoder_hidden
            # Seed the decoder with the first ground-truth value.
            decoder_input = target_tensor[:, 0].view(cur_bs, 1, 1)
            outputs = torch.zeros(cur_bs, max_len)
            for di in range(1, max_len):
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden)
                outputs[:, di] = decoder_output.view(cur_bs)
                # Feed the prediction back in, detached so gradients do not
                # flow through the input path.
                decoder_input = decoder_output.detach()
            # Zero predictions past each sequence's true length so padding
            # does not contribute to the loss.
            for i in range(len(lens)):
                outputs[i, lens[i]:] = 0
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            batch_loss = criterion(outputs[:, 1:].squeeze(), target_tensor[:, 1:].squeeze())
            # retain_graph removed: the graph is backpropagated exactly once
            # per batch and rebuilt on the next forward pass.
            batch_loss.backward()
            loss_acc += batch_loss.item()
            encoder_optimizer.step()
            decoder_optimizer.step()
        train_losses.append(loss_acc)
        if validation_loader is not None:
            # Switch to eval mode so dropout/batch-norm (if any) behave
            # deterministically during validation.
            encoder.eval()
            decoder.eval()
            with torch.no_grad():
                val_loss_acc = 0
                for input_tensor, target_tensor, _, max_len, lens in validation_loader:
                    val_batch_size = target_tensor.size(0)
                    # BUG FIX: pass the initial hidden state explicitly, as in
                    # the training loop (the original omitted the argument).
                    _, encoder_hidden = encoder(input_tensor, None)
                    decoder_hidden = encoder_hidden
                    decoder_input = target_tensor[:, 0].view(val_batch_size, 1, 1)
                    outputs = torch.zeros(val_batch_size, max_len)
                    for di in range(1, max_len):
                        decoder_output, decoder_hidden = decoder(
                            decoder_input, decoder_hidden)
                        outputs[:, di] = decoder_output.view(val_batch_size)
                        decoder_input = decoder_output
                    for i in range(len(lens)):
                        outputs[i, lens[i]:] = 0
                    val_loss = criterion(outputs[:, 1:].squeeze(), target_tensor[:, 1:].squeeze())
                    val_loss_acc += val_loss.item()
                validation_losses.append(val_loss_acc)
        if epoch % 1 == 0:
            val_msg = "{0:.5f}".format(validation_losses[-1]) if validation_losses else "n/a"
            print("Iteration:", epoch,
                  " Train loss: ", "{0:.5f}".format(loss_acc / len(train_loader)),
                  " Validation loss: ", val_msg)
    showPlot(train_losses, validation_losses)
    torch.save(encoder, "models/encoder.pt")
    torch.save(decoder, "models/decoder.pt")