def train(args, data, bidaf):
    device = torch.device(
        f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    utte_encoder = EncoderRNN(args, data.WORD.vocab.vectors).to(device)
    span_encoder = EncoderRNN(args, data.WORD.vocab.vectors).to(device)
    decoder = AttnDecoderRNN(args, data.WORD.vocab.vectors).to(device)

    utte_encoder_optimizer = optim.SGD(utte_encoder.parameters(),
                                       lr=args.learning_rate)
    span_encoder_optimizer = optim.SGD(span_encoder.parameters(),
                                       lr=args.learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=args.learning_rate)
    criterion = nn.NLLLoss()

    n_iters = 10 * len(data.train.examples)
    print_every = 10000
    print_loss_total = 0
    plot_loss_total = []
    start = time.time()

    for iter in range(1, n_iters + 1):
        # Cycle through the training examples.
        i = (iter - 1) % len(data.train.examples)
        input_tensor = data.train.examples[i].q_word
        target_tensor = data.train.examples[i].ans
        span = data.train.examples[i].span

        loss = train_each(input_tensor, target_tensor, utte_encoder,
                          span_encoder, decoder, utte_encoder_optimizer,
                          span_encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100,
                                         print_loss_avg))
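# Hedged sketch: the loop above reports progress with `timeSince`, which is
# not defined in this snippet. A minimal implementation, assuming the usual
# "elapsed (- estimated remaining)" convention from the PyTorch seq2seq
# tutorial, could look like this:
import math
import time

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    elapsed = now - since
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))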
def load_model_state(self, model_file):
    print("Resuming training from a given model...")
    model = torch.load(model_file, map_location=lambda storage, loc: storage)
    epoch = model['epoch']
    encoder_state_dict = model['encoder_state_dict']
    encoder_optimizer_state_dict = model['encoder_optimizer_state_dict']
    decoder_state_dict = model['decoder_state_dict']
    decoder_optimizer_state_dict = model['decoder_optimizer_state_dict']
    loss = model['loss']

    # Rebuild the models and optimizers with the same hyperparameters used
    # for training, then restore the saved parameters into them.
    encoder = EncoderRNN(self.wm, self.embedding_size,
                         self.hidden_size, self.bidirectional)
    decoder = AttnDecoderRNN("general", self.hidden_size, 10)
    encoder.load_state_dict(encoder_state_dict)
    decoder.load_state_dict(decoder_state_dict)

    enc_optimizer = optim.Adam(encoder.parameters(), lr=self.learning_rate)
    dec_optimizer = optim.Adam(decoder.parameters(), lr=self.learning_rate)
    enc_optimizer.load_state_dict(encoder_optimizer_state_dict)
    dec_optimizer.load_state_dict(decoder_optimizer_state_dict)

    return encoder, decoder, enc_optimizer, dec_optimizer, epoch
def main():
    input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
    print(random.choice(pairs))

    device = torch.device(args.device)
    print('device : {}'.format(device))

    encoder = EncoderRNN(input_lang.n_words, args.hidden_size).to(device)
    decoder = AttnDecoderRNN(args.hidden_size, output_lang.n_words,
                             dropout_p=0.1).to(device)
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=args.lr)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=args.lr)

    model = Translator(input_lang, output_lang, encoder, decoder,
                       encoder_optimizer, decoder_optimizer)
    trainIters(model, pairs, n_iters=10000, print_every=100, plot_every=100)

    evaluateRandomly(model, pairs)
    output_words, attentions = evaluate(model, "je suis trop froid .")
    plt.matshow(attentions.numpy())
def loadmodel(model_file, wm, hidden_size, bidirectional):
    """
    Loads the trained model and returns the encoder and decoder for inference.
    We initialize 'empty' models into which we load the saved parameters.
    It is important that the hyperparameters are the same as those used for
    training.

    Keyword arguments:
    model_file - string with the model location
    wm - embedding matrix
    hidden_size - hidden size
    bidirectional - whether we use bidirectional GRU layers
    """
    model = torch.load(model_file, map_location=lambda storage, loc: storage)
    epoch = model['epoch']
    encoder_state_dict = model['encoder_state_dict']
    encoder_optimizer_state_dict = model['encoder_optimizer_state_dict']
    decoder_state_dict = model['decoder_state_dict']
    decoder_optimizer_state_dict = model['decoder_optimizer_state_dict']
    loss = model['loss']

    encoder = EncoderRNN(wm, 300, hidden_size, bidirectional)
    decoder = AttnDecoderRNN(hidden_size, 10)
    # Restore the trained weights; without this the returned models would be
    # randomly initialized.
    encoder.load_state_dict(encoder_state_dict)
    decoder.load_state_dict(decoder_state_dict)

    enc_optimizer = optim.Adam(encoder.parameters(), lr=0.0001)
    dec_optimizer = optim.Adam(decoder.parameters(), lr=0.0001)

    return encoder, decoder
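# For reference, a minimal sketch of the checkpoint layout `loadmodel`
# expects. The key names are the ones read above and written by `save_model`
# further down in this collection; the variable names are placeholders.
torch.save({
    'epoch': epoch,
    'encoder_state_dict': encoder.state_dict(),
    'encoder_optimizer_state_dict': enc_optimizer.state_dict(),
    'decoder_state_dict': decoder.state_dict(),
    'decoder_optimizer_state_dict': dec_optimizer.state_dict(),
    'loss': loss,
}, model_file)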
def main(args): global best_bleu4, epochs_since_improvement, checkpoint, start_epoch, fine_tune_encoder, data_name, word_map # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) if args.checkpoint is None: decoder = AttnDecoderRNN(attention_dim=args.attention_dim, embed_dim=args.embed_dim, decoder_dim=args.decoder_dim, vocab_size=len(vocab), dropout=args.dropout) decoder_optimizer = torch.optim.Adam(params=filter( lambda p: p.requires_grad, decoder.parameters()), lr=args.decoder_lr) encoder = EncoderCNN() encoder.fine_tune(args.fine_tune_encoder) encoder_optimizer = torch.optim.Adam( params=filter(lambda p: p.requires_grad, encoder.parameters()), lr=args.encoder_lr) if args.fine_tune_encoder else None else: checkpoint = torch.load(args.checkpoint) start_epoch = checkpoint['epoch'] + 1 epochs_since_improvement = checkpoint['epochs_since_improvement'] best_bleu4 = checkpoint['bleu-4'] decoder = checkpoint['decoder'] decoder_optimizer = checkpoint['decoder_optimizer'] encoder = checkpoint['encoder'] encoder_optimizer = checkpoint['encoder_optimizer'] if fine_tune_encoder is True and encoder_optimizer is None: encoder.fine_tune(fine_tune_encoder) encoder_optimizer = torch.optim.Adam(params=filter( lambda p: p.requires_grad, encoder.parameters()), lr=args.encoder_lr) decoder = decoder.to(device) encoder = encoder.to(device) criterion = nn.CrossEntropyLoss().to(device) # Image preprocessing, normalization for the pretrained resnet transform = transforms.Compose([ transforms.RandomCrop(args.crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Build data loader train_loader = get_loader(args.image_dir, args.caption_path, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) val_loader = get_loader(args.image_dir_val, args.caption_path_val, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) for epoch in range(args.start_epoch, args.epochs): if args.epochs_since_improvement == 20: break if args.epochs_since_improvement > 0 and args.epochs_since_improvement % 8 == 0: adjust_learning_rate(decoder_optimizer, 0.8) if args.fine_tune_encoder: adjust_learning_rate(encoder_optimizer, 0.8) train(train_loader=train_loader, encoder=encoder, decoder=decoder, criterion=criterion, encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer, epoch=epoch) recent_bleu4 = validate(val_loader=val_loader, encoder=encoder, decoder=decoder, criterion=criterion) is_best = recent_bleu4 > best_bleu4 best_bleu4 = max(recent_bleu4, best_bleu4) if not is_best: args.epochs_since_improvement += 1 print("\nEpoch since last improvement: %d\n" % (args.epochs_since_improvement, )) else: args.epochs_since_improvement = 0 save_checkpoint(args.data_name, epoch, args.epochs_since_improvement, encoder, decoder, encoder_optimizer, decoder_optimizer, recent_bleu4, is_best)
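# Hedged sketch: `adjust_learning_rate` is called in the epoch loop above but
# not defined in this snippet. A minimal version, assuming it simply scales
# every parameter group's learning rate by the given shrink factor:
def adjust_learning_rate(optimizer, shrink_factor):
    print("\nDecaying learning rate.")
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("The new learning rate is %f\n" % (optimizer.param_groups[0]['lr'],))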
def train(train_set, langs, embedding_size=600, learning_rate=0.01, iter_time=10, batch_size=32, get_loss=GET_LOSS, save_model=SAVE_MODEL, encoder_style=ENCODER_STYLE, use_model=USE_MODEL): """The training procedure.""" # Set the timer start = time.time() # Initialize the model emb = docEmbedding(langs['rt'].n_words, langs['re'].n_words, langs['rm'].n_words, embedding_size) emb.init_weights() if encoder_style == 'LIN': encoder = EncoderLIN(embedding_size, emb) elif encoder_style == 'BiLSTM': encoder = EncoderBiLSTM(embedding_size, emb) else: encoder = EncoderRNN(embedding_size, emb) decoder = AttnDecoderRNN(embedding_size, langs['summary'].n_words) if use_cuda: emb.cuda() encoder.cuda() decoder.cuda() if use_model is not None: encoder = load_model(encoder, use_model[0]) decoder = load_model(decoder, use_model[1]) # Choose optimizer loss_optimizer = optim.Adagrad(list(encoder.parameters()) + list(decoder.parameters()), lr=learning_rate, lr_decay=0, weight_decay=0) # decoder_optimizer = optim.Adagrad(decoder.parameters(), lr=learning_rate, lr_decay=0, weight_decay=0) criterion = nn.NLLLoss() total_loss = 0 iteration = 0 for epo in range(1, iter_time + 1): print("Epoch #%d" % (epo)) # Get data train_iter = data_iter(train_set, batch_size=batch_size) for dt in train_iter: iteration += 1 data, idx_data = get_batch(dt) rt, re, rm, summary = idx_data # Add paddings rt = addpaddings(rt) re = addpaddings(re) rm = addpaddings(rm) summary = addpaddings(summary) rt = Variable(torch.LongTensor(rt), requires_grad=False) re = Variable(torch.LongTensor(re), requires_grad=False) rm = Variable(torch.LongTensor(rm), requires_grad=False) # For Decoding summary = Variable(torch.LongTensor(summary), requires_grad=False) if use_cuda: rt, re, rm, summary = rt.cuda(), re.cuda(), rm.cuda( ), summary.cuda() # Get the average loss on the sentences loss = sentenceloss(rt, re, rm, summary, encoder, decoder, loss_optimizer, criterion, embedding_size, encoder_style) total_loss += loss # Print the information and save model if iteration % get_loss == 0: print("Time {}, iter {}, avg loss = {:.4f}".format( gettime(start), iteration, total_loss / get_loss)) total_loss = 0 if epo % save_model == 0: torch.save(encoder.state_dict(), "{}_encoder_{}".format(OUTPUT_FILE, iteration)) torch.save(decoder.state_dict(), "{}_decoder_{}".format(OUTPUT_FILE, iteration)) print("Save the model at iter {}".format(iteration)) return encoder, decoder
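# Hedged sketch: `addpaddings` is used above but not shown. A minimal version
# that right-pads every sequence in a batch to the longest length, assuming
# index 0 is the padding token:
def addpaddings(batch, pad_idx=0):
    max_len = max(len(seq) for seq in batch)
    return [list(seq) + [pad_idx] * (max_len - len(seq)) for seq in batch]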
                         dropout_p=config.DROPOUT)

if config.RESTORE:
    encoder_path = os.path.join(config.MODEL_DIR, "encoder.pth")
    decoder_path = os.path.join(config.MODEL_DIR, "decoder.pth")
    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))

# Move models to GPU
if config.USE_CUDA:
    encoder.cuda()
    decoder.cuda()

# Initialize optimizers and criterion
encoder_optimizer = optim.Adam(encoder.parameters(), lr=config.LR)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=config.LR)
criterion = LanguageModelCriterion()  # nn.NLLLoss(ignore_index=0)

# Keep track of time elapsed and running averages
start = time.time()
plot_losses = []
print_loss_total = 0
plot_loss_total = 0

for epoch in range(1, config.NUM_ITER + 1):
    # Get training data for this cycle
    input_index, output_index, mask_batch = next(train_dataloader.load())
    input_variable = Variable(torch.LongTensor(input_index))
    output_variable = Variable(torch.LongTensor(output_index))
    mask_variable = Variable(torch.FloatTensor(mask_batch))
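# Hedged sketch: `LanguageModelCriterion` is not defined in this fragment.
# Judging from the commented-out nn.NLLLoss(ignore_index=0) alternative, it
# is most likely a masked negative log-likelihood over padded targets; a
# minimal version under that assumption:
import torch
import torch.nn as nn

class LanguageModelCriterion(nn.Module):
    def forward(self, log_probs, target, mask):
        # log_probs: (batch, seq_len, vocab); target, mask: (batch, seq_len)
        nll = -log_probs.gather(2, target.unsqueeze(2)).squeeze(2)
        # Average the loss over non-padding positions only.
        return (nll * mask).sum() / mask.sum()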
    lang_tuple = pkl.load(f)
lang = Lang(lang_tuple)

# Prepare dataloader for training
train_dataiter = DataIter(train_pairs, lang, args.vocab_size,
                          args.batch_size, args.cuda)

# Set encoder and decoder
encoder = Encoder(args.vocab_size, args.hidden_size)
decoder = AttnDecoderRNN(args.attn, args.hidden_size, args.vocab_size,
                         args.n_layers, args.dropout, args.cuda)
if args.cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()

# Set optimizer and criterion
encoder_optimizer = optim.Adam(encoder.parameters(), lr=args.lr,
                               weight_decay=args.weight_decay)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=args.lr,
                               weight_decay=args.weight_decay)
encoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=encoder_optimizer, mode='min', factor=0.1, patience=5,
    verbose=True, min_lr=0.00001)
decoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=decoder_optimizer, mode='min', factor=0.1, patience=5,
    verbose=True, min_lr=0.00001)
criterion = nn.NLLLoss(ignore_index=PAD_token)
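# The ReduceLROnPlateau schedulers above only take effect if they are stepped
# with a monitored metric once per epoch. A minimal sketch, assuming a
# `validation_loss` computed elsewhere in the training loop:
encoder_scheduler.step(validation_loss)
decoder_scheduler.step(validation_loss)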
class EncoderDecoder(object): """EncoderDecoder""" def __init__(self, hidden_size=128, input_vocab_len=10000, output_vocab_len=10000, dropout_p=0.1, teacher_forcing_ratio=0.5, max_length=10, learning_rate=0.01, simple=False, bidirectional=False, dot=False, multi=False, num_layers=1): super(EncoderDecoder, self).__init__() self.hidden_size = hidden_size self.input_vocab_len = input_vocab_len self.output_vocab_len = output_vocab_len self.dropout_p = dropout_p self.max_length = max_length self.learning_rate = learning_rate self.simple = simple self.dot = dot self.bidirectional = bidirectional self.teacher_forcing_ratio = teacher_forcing_ratio self.multi = multi self.num_layers = num_layers if self.multi: self.encoder = code.MultiLayerBidirectionalEncoderRNN( input_vocab_len, hidden_size, num_layers=num_layers).to(device) self.decoder = code.MultiLayerAttnDecoderRNNDot( hidden_size, output_vocab_len, dropout_p=dropout_p, max_length=max_length, num_layers=num_layers).to(device) else: if self.bidirectional: self.encoder = code.define_bi_encoder(input_vocab_len, hidden_size).to(device) else: self.encoder = EncoderRNN(input_vocab_len, hidden_size).to(device) if self.simple: self.decoder = code.define_simple_decoder( hidden_size, input_vocab_len, output_vocab_len, max_length, num_layers=num_layers).to(device) else: if not self.dot: self.decoder = AttnDecoderRNN( hidden_size, output_vocab_len, dropout_p=dropout_p, max_length=self.max_length).to(device) else: self.decoder = code.AttnDecoderRNNDot( hidden_size, output_vocab_len, dropout_p=dropout_p, max_length=max_length).to(device) self.encoder_optimizer = None self.decoder_optimizer = None self.criterion = None self.input_lang = None self.output_lang = None def indexesFromSentence(self, lang, sentence, char=False): if char: return [lang.char2index[char] for char in sentence] else: return [lang.word2index[word] for word in sentence.split(' ')] def tensorFromSentence(self, lang, sentence, char=False): indexes = self.indexesFromSentence(lang, sentence, char) indexes.append(EOS_token) return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1) def tensorsFromPair(self, pair, char=False): input_tensor = self.tensorFromSentence(self.input_lang, pair[0], char) target_tensor = self.tensorFromSentence(self.output_lang, pair[1], char) return (input_tensor, target_tensor) def train(self, input_tensor, target_tensor): encoder_hidden = self.encoder.initHidden() self.encoder_optimizer.zero_grad() self.decoder_optimizer.zero_grad() input_length = input_tensor.size(0) target_length = target_tensor.size(0) encoder_outputs = torch.zeros(self.max_length, self.hidden_size, device=device) loss = 0 for ei in range(input_length): encoder_output, encoder_hidden = self.encoder( input_tensor[ei], encoder_hidden) if self.bidirectional: encoder_output = code.fix_bi_encoder_output_dim( encoder_output, self.hidden_size) if self.multi: encoder_output = code.fix_multi_bi_encoder_output_dim( encoder_output, self.hidden_size) encoder_outputs[ei] = encoder_output[0, 0] decoder_input = torch.tensor([[SOS_token]], device=device) if self.bidirectional: decoder_hidden = code.fix_bi_encoder_hidden_dim(encoder_hidden) elif self.multi: decoder_hidden = code.fix_multi_bi_encoder_hidden_dim( encoder_hidden) else: decoder_hidden = encoder_hidden use_teacher_forcing = True if random.random( ) < self.teacher_forcing_ratio else False if use_teacher_forcing: # Teacher forcing: Feed the target as the next input for di in range(target_length): if self.simple: decoder_output, 
decoder_hidden = code.run_simple_decoder( self.decoder, decoder_input, encoder_hidden, decoder_hidden, encoder_outputs) else: decoder_output, decoder_hidden, decoder_attention = self.decoder( decoder_input, decoder_hidden, encoder_outputs) loss += self.criterion(decoder_output, target_tensor[di]) decoder_input = target_tensor[di] # Teacher forcing else: # Without teacher forcing: use its own predictions as the next input for di in range(target_length): if self.simple: decoder_output, decoder_hidden = code.run_simple_decoder( self.decoder, decoder_input, encoder_hidden, decoder_hidden, encoder_outputs) else: decoder_output, decoder_hidden, decoder_attention = self.decoder( decoder_input, decoder_hidden, encoder_outputs) topv, topi = decoder_output.topk(1) decoder_input = topi.squeeze().detach( ) # detach from history as input loss += self.criterion(decoder_output, target_tensor[di]) if decoder_input.item() == EOS_token: break loss.backward() self.encoder_optimizer.step() self.decoder_optimizer.step() return loss.item() / target_length def trainIters(self, pairs, input_lang, output_lang, n_iters, print_every=1000, plot_every=100, char=False): start = time.time() plot_losses = [] print_loss_total = 0 # Reset every print_every plot_loss_total = 0 # Reset every plot_every self.input_lang = input_lang self.output_lang = output_lang self.encoder_optimizer = optim.SGD(self.encoder.parameters(), lr=self.learning_rate) self.decoder_optimizer = optim.SGD(self.decoder.parameters(), lr=self.learning_rate) selected_pairs = [random.choice(pairs) for i in range(n_iters)] training_pairs = [ self.tensorsFromPair(pair, char) for pair in selected_pairs ] self.criterion = nn.NLLLoss() for iter in range(1, n_iters + 1): training_pair = training_pairs[iter - 1] input_tensor = training_pair[0] target_tensor = training_pair[1] loss = self.train(input_tensor, target_tensor) print_loss_total += loss plot_loss_total += loss if iter % print_every == 0: print_loss_avg = print_loss_total / print_every print_loss_total = 0 print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters), iter, iter / n_iters * 100, print_loss_avg)) if iter % plot_every == 0: plot_loss_avg = plot_loss_total / plot_every plot_losses.append(plot_loss_avg) plot_loss_total = 0 showPlot(plot_losses) def evaluate(self, sentence, char=False): with torch.no_grad(): input_tensor = self.tensorFromSentence(self.input_lang, sentence, char) input_length = input_tensor.size()[0] encoder_hidden = self.encoder.initHidden() encoder_outputs = torch.zeros(self.max_length, self.encoder.hidden_size, device=device) for ei in range(input_length): encoder_output, encoder_hidden = self.encoder( input_tensor[ei], encoder_hidden) if self.bidirectional: encoder_output = code.fix_bi_encoder_output_dim( encoder_output, self.hidden_size) if self.multi: encoder_output = code.fix_multi_bi_encoder_output_dim( encoder_output, self.hidden_size) encoder_outputs[ei] += encoder_output[0, 0] decoder_input = torch.tensor([[SOS_token]], device=device) # SOS if self.bidirectional: decoder_hidden = code.fix_bi_encoder_hidden_dim(encoder_hidden) elif self.multi: decoder_hidden = code.fix_multi_bi_encoder_hidden_dim( encoder_hidden) else: decoder_hidden = encoder_hidden decoded_words = [] if not self.simple: decoder_attentions = torch.zeros(self.max_length, self.max_length) for di in range(self.max_length): if self.simple: decoder_output, decoder_hidden = code.run_simple_decoder( self.decoder, decoder_input, encoder_hidden, decoder_hidden, encoder_outputs) else: decoder_output, 
decoder_hidden, decoder_attention = self.decoder( decoder_input, decoder_hidden, encoder_outputs) decoder_attentions[di] = decoder_attention.data topv, topi = decoder_output.data.topk(1) if topi.item() == EOS_token: decoded_words.append('<EOS>') break else: if char: decoded_words.append( self.output_lang.index2char[topi.item()]) else: decoded_words.append( self.output_lang.index2word[topi.item()]) decoder_input = topi.squeeze().detach() if not self.simple: return decoded_words, decoder_attentions[:di + 1] else: return decoded_words, None @classmethod def load(cls, directory): with open(os.path.join(directory, 'args.pkl'), 'rb') as f: params = cloudpickle.load(f) model = EncoderDecoder( params['hidden_size'], params['input_vocab_len'], params['output_vocab_len'], dropout_p=params['dropout_p'], teacher_forcing_ratio=params['teacher_forcing_ratio'], max_length=params['max_length'], learning_rate=params['learning_rate'], simple=params['simple'], bidirectional=params['bidirectional'], dot=params['dot'], multi=params['multi'], num_layers=params['num_layers']) model.input_lang = params['input_lang'] model.output_lang = params['output_lang'] model.encoder.load_state_dict( torch.load(os.path.join(directory, 'encoder.pt'), map_location=lambda storage, loc: storage).state_dict()) model.decoder.load_state_dict( torch.load(os.path.join(directory, 'decoder.pt'), map_location=lambda storage, loc: storage).state_dict()) return model def save(self, directory): if not os.path.exists(directory): os.makedirs(directory) def create_save_model(model, path): return torch.save(model, path) create_save_model(self.encoder, directory + 'encoder.pt') create_save_model(self.decoder, directory + 'decoder.pt') with open(os.path.join(directory, 'args.pkl'), 'wb') as f: cloudpickle.dump( { 'input_lang': self.input_lang, 'output_lang': self.output_lang, 'dropout_p': self.dropout_p, 'teacher_forcing_ratio': self.teacher_forcing_ratio, 'max_length': self.max_length, 'learning_rate': self.learning_rate, 'hidden_size': self.hidden_size, 'input_vocab_len': self.input_vocab_len, 'output_vocab_len': self.output_vocab_len, 'simple': self.simple, 'bidirectional': self.bidirectional, 'dot': self.dot, 'multi': self.multi, 'num_layers': self.num_layers }, f) def evaluatePairs(self, pairs, rand=True, n=10, plot=False, char=False): n = n if rand else len(pairs) outputs = [] for i in range(n): if rand: pair = random.choice(pairs) else: pair = pairs[i] print('>', pair[0]) print('=', pair[1]) output_words, attentions = self.evaluate(pair[0], char) if plot and not self.simple: plt.matshow(attentions.numpy()) if char: output_sentence = ''.join(output_words[:-1]) else: output_sentence = ' '.join(output_words[:-1]) outputs.append((output_sentence, pair[1])) print('<', output_sentence) print('') return outputs def evaluateAndShowAttention(self, input_sentence, char=False): output_words, attentions = self.evaluate( normalizeString(input_sentence), char) print('input =', input_sentence) if char: print('output =', ''.join(output_words)) else: print('output =', ' '.join(output_words)) if not self.simple: showAttention(normalizeString(input_sentence), output_words, attentions[:, :len(output_words)], char=char) else: print( "Not an attention based model as per the parameter 'simple' !")
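# Hedged usage sketch for the EncoderDecoder class above, assuming `pairs`,
# `input_lang`, and `output_lang` come from the usual prepareData step. Note
# that save() concatenates the directory string, so a trailing slash is
# needed for the 'encoder.pt' / 'decoder.pt' paths to land in the directory.
model = EncoderDecoder(hidden_size=256,
                       input_vocab_len=input_lang.n_words,
                       output_vocab_len=output_lang.n_words)
model.trainIters(pairs, input_lang, output_lang, n_iters=75000,
                 print_every=5000)
model.save('checkpoints/')  # writes encoder.pt, decoder.pt, args.pkl
restored = EncoderDecoder.load('checkpoints/')
restored.evaluatePairs(pairs, n=5)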
                         nlayers, dropout_p)
else:
    encoder = MetaNetRNN(emb_size, input_size, output_size, nlayers, dropout_p)
if use_attention:
    decoder = AttnDecoderRNN(emb_size, output_size, nlayers, dropout_p)
else:
    decoder = DecoderRNN(emb_size, output_size, nlayers, dropout_p)
if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
criterion = nn.NLLLoss()

print(' Set learning rate to ' + str(adam_learning_rate))
encoder_optimizer = optim.Adam(encoder.parameters(), lr=adam_learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=adam_learning_rate)

print("")
print("Architecture options...")
print(" Decoder attention is USED") if use_attention else print(
    " Decoder attention is NOT used")
print(" External memory is USED") if not disable_memory else print(
    " External memory is NOT used")
print(" Reconstruction loss is USED") if not disable_recon_loss else print(
    " Reconstruction loss is NOT used")
print("")
describe_model(encoder)
describe_model(decoder)

# create validation episodes
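# Hedged sketch: `describe_model` is called above but not defined in this
# fragment. A minimal version that prints the module and its trainable
# parameter count:
def describe_model(net):
    print(net)
    n_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
    print('  Total trainable parameters: %d' % n_params)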
class Train(object):
    """ """

    # TODO: the dataset should not live inside the Train class; an apply
    # TODO: method that accepts a dataset parameter and trains on it would
    # TODO: be cleaner.
    def __init__(self, config, dataset):
        self.config = config
        self.n_epochs = config.n_epochs
        self.encoder = EncoderRNN(n_dict=dataset.source.n_words, config=config)
        self.decoder = AttnDecoderRNN(n_dict=dataset.target.n_words,
                                      config=config)
        self.encoder_optimizer = config.optimizier(self.encoder.parameters(),
                                                   lr=config.learning_rate)
        self.decoder_optimizer = config.optimizier(self.decoder.parameters(),
                                                   lr=config.learning_rate)
        self.criterion = nn.NLLLoss()
        self.is_plot = config.is_plot
        self.clip_value = config.clip_value
        self.losses = []
        if self.config.USE_CUDA:
            self.encoder.cuda(self.config.gpu_id)
            self.decoder.cuda(device_id=self.config.gpu_id)

    def train(self, dataset):
        if self.is_plot:
            fig, ax = plt.subplots()
            grid(True)
            plt.ion()
        for epoch in range(self.n_epochs):
            training_pair = dataset.get_sample_var()
            loss, result_output = self.step(training_pair)
            print("At Epoch : {:5}, Get loss : {:10}\n".format(epoch, loss))
            self.losses.append(loss)
            if self.is_plot:
                ax.plot(range(epoch + 1), self.losses, "b")
                plt.pause(0.0001)
                plt.show()
            if epoch % 100 == 0:
                print(''.join([
                    dataset.target.index2word[i]
                    for i in training_pair[1].squeeze(1).data.tolist()
                ]))
                print(''.join(
                    [dataset.target.index2word[i] for i in result_output]))

    def step(self, training_pair):
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        input_variable = training_pair[0]
        target_variable = training_pair[1]

        loss = 0
        input_length = input_variable.size()[0]
        target_length = target_variable.size()[0]

        encoder_hidden = self.encoder.init_hidden()
        encoder_outputs, encoder_hidden = self.encoder(input_variable,
                                                       encoder_hidden)

        decoder_input = Variable(torch.LongTensor([[self.config.SOS_token]]))
        decoder_context = Variable(torch.zeros(1, self.decoder.hidden_dim))
        decoder_hidden = encoder_hidden
        if self.config.USE_CUDA:
            decoder_input = decoder_input.cuda(device_id=self.config.gpu_id)
            decoder_context = decoder_context.cuda(
                device_id=self.config.gpu_id)
            assert type(decoder_input.data) == torch.cuda.LongTensor
            assert type(decoder_context.data) == torch.cuda.FloatTensor

        result_output = []
        for di in range(target_length):
            decoder_output, \
                decoder_context, \
                decoder_hidden, \
                decoder_attention = self.decoder(decoder_input,
                                                 decoder_context,
                                                 decoder_hidden,
                                                 encoder_outputs)
            loss += self.criterion(decoder_output[0], target_variable[di])
            topv, topi = decoder_output.data.topk(1)
            ni = topi[0][0]
            decoder_input = Variable(torch.LongTensor([[ni]]))
            if self.config.USE_CUDA:
                decoder_input = decoder_input.cuda(
                    device_id=self.config.gpu_id)
            result_output.append(ni)
            if ni == self.config.EOS_token:
                break

        loss.backward()
        # TODO: clip value
        torch.nn.utils.clip_grad_norm(self.encoder.parameters(),
                                      self.clip_value)
        torch.nn.utils.clip_grad_norm(self.decoder.parameters(),
                                      self.clip_value)
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()

        if self.config.USE_CUDA:
            return loss.cpu().data[0] / target_length, result_output
        return loss.data[0] / target_length, result_output
n_layers = 2
dropout_p = 0.05

# Initialize models
encoder = EncoderRNN(input_lang.n_words, hidden_size, n_layers)
decoder = AttnDecoderRNN(attn_model, hidden_size, output_lang.n_words,
                         n_layers, dropout_p=dropout_p)

# Move models to GPU
if USE_CUDA:
    encoder.cuda()
    decoder.cuda()

# Initialize optimizers and criterion
learning_rate = 0.0001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

# Configuring training
n_epochs = 50000
plot_every = 200
print_every = 1000

# Keep track of time elapsed and running averages
start = time.time()
plot_losses = []
print_loss_total = 0  # Reset every print_every
plot_loss_total = 0  # Reset every plot_every

# Begin!
for epoch in range(1, n_epochs + 1):
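    # Hedged sketch of the body of this epoch loop, following the common
    # practical-pytorch pattern (one random pair per iteration, running loss
    # averages). `train`, `random_training_pair`, and `time_since` are
    # assumed helpers that are not shown in this fragment.
    input_variable, target_variable = random_training_pair()
    loss = train(input_variable, target_variable, encoder, decoder,
                 encoder_optimizer, decoder_optimizer, criterion)
    print_loss_total += loss
    plot_loss_total += loss

    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print('%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs),
                                     epoch, epoch / n_epochs * 100,
                                     print_loss_avg))
    if epoch % plot_every == 0:
        plot_losses.append(plot_loss_total / plot_every)
        plot_loss_total = 0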
def main(args):
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    encoder = EncoderCNN(args.embed_size)
    decoder = AttnDecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                             args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    criterion = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) + list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    total_step = len(data_loader)
    decoder_hidden = decoder.init_hidden()
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            images = cuda_variable(images, volatile=True)
            captions = cuda_variable(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(captions, decoder_hidden, features, lengths)
            # outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path,
                                        'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path,
                                        'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
# Initialize models
embedder = EmbeddingMatrix(Vocab_Size, Embedding_Size)
encoder = EncoderRNN(Embedding_Size, Hidden_Size)
decoder = AttnDecoderRNN(Embedding_Size, Hidden_Size, Vocab_Size,
                         Extend_Vocab_Size, dropout_p=0.3)

embedder = embedder.cuda()
encoder = encoder.cuda()
decoder = decoder.cuda()

embedder_optimizer = optim.Adam(embedder.parameters(), lr=learning_rate)
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate,
                               weight_decay=0.0000001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate,
                               weight_decay=0.0000001)
criterion = nn.NLLLoss(ignore_index=0).cuda()

# Configure training
n_epochs = 40
plot_every = 2
print_every = 5

start = time.time()
plot_losses = []
print_loss = 0
print_loss_total = 0
plot_loss_total = 0

# Begin
loss_list = []
mydataset = KP20K('dataset', 'small', True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.set_num_threads(10)

# Split into training and validation set
train_set, val_set = random_split(
    recipe_step_pairs,
    [TRAIN_SET_SIZE, len(recipe_step_pairs) - TRAIN_SET_SIZE])
print(len(train_set))
print(len(val_set))

encoder = EncoderRNN(n_words, HIDDEN_DIM).to(device)
decoder = AttnDecoderRNN(HIDDEN_DIM, n_words, max_length=MAX_LENGTH).to(device)
encoder_optimizer = optim.SGD(encoder.parameters(), lr=LEARNING_RATE)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=LEARNING_RATE)
loss_function = nn.NLLLoss()

losses_per_epoch = []
for e in range(N_EPOCHS):
    print("---- epoch ", e)
    train_set = list(train_set)
    shuffle(train_set)
    loss = trainIters(encoder,
                      decoder,
                      n_iters=TRAIN_SET_SIZE,
                      max_length=MAX_LENGTH,
                      print_every=REPORT_EVERY)
    losses_per_epoch.append(loss)
    torch.save(
class Seq2Pose(): def __init__(self, wm, input_length, batch_size, hidden_size, bidirectional\ , embedding_size, n_parameter, m_parameter, learning_rate, clip,\ alpha, beta, pre_trained_file = None): self.batch_size = batch_size self.hidden_size = hidden_size self.embedding_size = embedding_size self.bidirectional = bidirectional self.n_parameter = n_parameter self.m_parameter = m_parameter self.learning_rate = learning_rate self.wm = wm self.clip = clip self.alpha = alpha self.beta = beta if pre_trained_file == None: self.encoder = EncoderRNN(self.wm, self.embedding_size,\ hidden_size, bidirectional) self.decoder = AttnDecoderRNN(self.hidden_size, 10) self.enc_optimizer = optim.Adam(self.encoder.parameters(),\ lr=self.learning_rate) self.dec_optimizer = optim.Adam(self.decoder.parameters(),\ lr=self.learning_rate) self.start = 0 else: self.resume_training = True self.encoder, self.decoder, self.enc_optimizer, self.dec_optimizer,\ self.start = self.load_model_state(pre_trained_file) self.decoder = self.decoder.to(device) self.encoder = self.encoder.to(device) def load_model_state(self, model_file): print("Resuming training from a given model...") model = torch.load(model_file, map_location=lambda storage, loc: storage) epoch = model['epoch'] encoder_state_dict = model['encoder_state_dict'] encoder_optimizer_state_dict = model['encoder_optimizer_state_dict'] decoder_state_dict = model['decoder_state_dict'] decoder_optimizer_state_dict = model['decoder_optimizer_state_dict'] loss = model['loss'] encoder = EncoderRNN(self.wm, self.embedding_size,\ self.hidden_size, self.bidirectional) decoder = AttnDecoderRNN(self.hidden_size, 10) enc_optimizer = optim.Adam(encoder.parameters(), lr=self.learning_rate) dec_optimizer = optim.Adam(decoder.parameters(), lr=self.learning_rate) return encoder, decoder, enc_optimizer, dec_optimizer, epoch def train(self, epochs, x_train, y_train): """ Training loop, trains the network for the given parameters. 
Keyword arguments: epochs - number of epochs to train for (looping over the whole dataset) x_train - training data, contains a list of integer encoded strings y_train - training data, contains a list of pose sequences """ criterion = CustomLoss(self.alpha, self.beta) training_set = Dataset(x_train, y_train) training_generator = data.DataLoader(training_set,\ batch_size=self.batch_size, shuffle=True,\ collate_fn=self.pad_and_sort_batch,\ num_workers=8, drop_last=True) decoder_fixed_previous = Variable(torch.zeros(self.n_parameter,\ self.batch_size, 10, requires_grad=False)).to(device) decoder_fixed_input = torch.FloatTensor\ ([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] *\ self.batch_size).to(device) for epoch in range(self.start, epochs): total_loss = 0 for mini_batches, max_target_length in tqdm(training_generator): #kickstart vectors self.enc_optimizer.zero_grad() self.dec_optimizer.zero_grad() loss = 0 decoder_previous_inputs = decoder_fixed_previous for z in range(self.n_parameter): decoder_previous_inputs[z] = decoder_fixed_input for i, (x, y, lengths) in enumerate(mini_batches): t1 = time.perf_counter() x = x.to(device) y = y.to(device) decoder_m = np.shape(y)[0] encoder_outputs, encoder_hidden = self.encoder(x, None) decoder_hidden = encoder_hidden[:self.decoder.n_layers] decoder_output = None for n_prev in range(self.n_parameter): decoder_output, decoder_hidden, attn_weights =\ self.decoder(decoder_previous_inputs[n_prev].float(),\ decoder_hidden, encoder_outputs) decoder_input = decoder_output.float() decoder_previous_generated = Variable(torch.zeros(decoder_m,\ self.batch_size, 10, requires_grad=False)).to(device) decoder_outputs_generated = Variable(torch.zeros(decoder_m,\ self.batch_size, 10, requires_grad=False)).to(device) for fut_pose in range(decoder_m): decoder_output, decoder_hidden, attn_weights =\ self.decoder(decoder_input,decoder_hidden, encoder_outputs) decoder_outputs_generated[fut_pose] = decoder_output decoder_input = y[fut_pose].float() decoder_previous_inputs = decoder_outputs_generated[:-10] # max_length, batch_, item # now mask generated outputs decoder_masked = torch.where(y == 0.0, y.float(),\ decoder_outputs_generated.float()) decoder_previous_generated[1:] = decoder_masked[:-1] loss += criterion(decoder_masked, decoder_previous_generated,\ y.float()) total_loss += loss.item() loss.backward() torch.nn.utils.clip_grad_norm_(self.encoder.parameters(),\ self.clip) torch.nn.utils.clip_grad_norm_(self.decoder.parameters(),\ self.clip) self.enc_optimizer.step() self.dec_optimizer.step() if epoch % 10 == 0: self.save_model(self.encoder, self.decoder, self.enc_optimizer,\ self.dec_optimizer, epoch, "./models/seq2seq_{}_{}.tar".\ format(epoch, total_loss/len(x_train)), total_loss) print("Epoch: {} Loss: {}".format(epoch, total_loss)) def pad_and_sort_batch(self, DataLoaderBatch): """ Pads and sorts the batches, provided as a collate function. Keyword arguments: DataLoaderBatch - Batch of data coming from dataloader class. 
""" batch_size = len(DataLoaderBatch) batch_split = list(zip(*DataLoaderBatch)) seqs, targs, lengths, target_lengths = batch_split[0], batch_split[1],\ batch_split[2], batch_split[3] #calculating the size for the minibatches max_length = max(lengths) #longest sequence in X max_target_length = max(target_lengths) #longest sequence in Y number_of_chunks = int(max_target_length / self.m_parameter) not_in_chunk = max_target_length % self.m_parameter words_per_chunk = int(max_length / number_of_chunks) not_in_words_per_chunk = max_length % words_per_chunk #first zeropad it all padded_seqs = np.zeros((batch_size, max_length)) for i, l in enumerate(lengths): padded_seqs[i, 0:l] = seqs[i][0:l] new_targets = np.zeros((batch_size, max([len(s) for s in targs]), 10)) for i, item in enumerate(targs): new_targets[i][:len(targs[i])] = targs[i] seq_lengths, perm_idx = torch.tensor(lengths).sort(descending=True) seq_lengths = list(seq_lengths) seq_tensor = padded_seqs[perm_idx] target_tensor = new_targets[perm_idx] #Full batch is sorted, now we are going to create minibatches. #in these batches time comes first, so: [time, batch, features] #we also add a vector with lengths, which are necessary for padding mini_batches = [] #contains x and y tensor per item seq_tensor = np.transpose(seq_tensor, (1, 0)) target_tensor = np.transpose(target_tensor, (1, 0, 2)) counter = 0 for i in range(number_of_chunks): x = seq_tensor[i * words_per_chunk:(i + 1) * words_per_chunk] y = target_tensor[i * self.m_parameter:(i + 1) * self.m_parameter] counter += words_per_chunk * i x_mini_batch_lengths = [] for j in range(batch_size): if seq_lengths[j] > counter and seq_lengths[ j] < counter + words_per_chunk: x_mini_batch_lengths.append(seq_lengths[j].item() - counter) elif seq_lengths[j] > counter + words_per_chunk: x_mini_batch_lengths.append(words_per_chunk) else: x_mini_batch_lengths.append(0) mini_batches.append([ torch.tensor(x).long(), torch.tensor(y), x_mini_batch_lengths ]) if not_in_chunk != 0: x = seq_tensor[number_of_chunks * words_per_chunk:] y = target_tensor[number_of_chunks * self.m_parameter:] x_mini_batch_lengths = [] counter = number_of_chunks * words_per_chunk for j in range(batch_size): if seq_lengths[j] > counter and seq_lengths[ j] < counter + words_per_chunk: x_mini_batch_lengths.append(seq_lengths[j].item() - counter) elif seq_lengths[j] > counter + words_per_chunk: x_mini_batch_lengths.append(words_per_chunk) else: x_mini_batch_lengths.append(0) if len(x) > 0 and len(y) > 0: mini_batches.append([ torch.tensor(x).long(), torch.tensor(y), x_mini_batch_lengths ]) return mini_batches, max_target_length def save_model(self, encoder, decoder, enc_optimizer, dec_optimizer,\ epoch, PATH, loss): torch.save( { 'epoch': epoch, 'encoder_state_dict': encoder.state_dict(), 'encoder_optimizer_state_dict': enc_optimizer.state_dict(), 'decoder_state_dict': decoder.state_dict(), 'decoder_optimizer_state_dict': dec_optimizer.state_dict(), 'loss': loss, }, PATH)