def train_procedure_test():
    config = get_config()
    load_path = config['word2idx_train_path']
    voca_path = config['caption_vocab_path']
    dataset = load_tokenized_data(load_path)
    voca = load_voca(voca_path)

    batch_size = 2
    embed_size = 10
    vocab_len = len(voca)
    hidden_layer = 1
    hidden_size = 10

    loader = make_caption_loader(dataset, batch_size,
                                 config['caption_train_image_path'])
    dataiter = iter(loader)
    images, caption, length = next(dataiter)

    # Check the shapes of one batch
    print("Checking data shapes")
    print(images.size())
    print(caption.size())

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, vocab_len, hidden_layer, hidden_size)
    # Only the encoder's linear layer is registered with the optimizer here;
    # this is what the zero_grad() check at the end of this test inspects.
    grad_params = list(encoder.linear.parameters())
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=grad_params, lr=0.001)

    # Flatten the padded captions so the loss only covers real timesteps.
    compare_target = pack_padded_sequence(caption, length, batch_first=True).data
    feature = encoder(images)
    output = decoder(caption, feature, length)
    loss = loss_function(output, compare_target)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    datestr = date2str()
    save_path = config['checkpoints_saved_path']
    mini_batch_loss = [loss.item()]
    save_config(config, "config" + datestr, save_path)
    save_loss(mini_batch_loss, "loss" + datestr, save_path)
    save_model(encoder, "encoder" + datestr, save_path)
    save_model(decoder, "decoder" + datestr, save_path)

    # (see zero_grad_demo below for a self-contained comparison)
    print("Is optimizer.zero_grad() the same as encoder.zero_grad() and decoder.zero_grad()?")
    print("Before calling optimizer.zero_grad()")
    print(encoder.linear.weight.grad)
    print("After calling optimizer.zero_grad()")
    optimizer.zero_grad()
    print(encoder.linear.weight.grad)
    print("====================")
    print(grad_params)
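
# A minimal sketch (not part of the original test) answering the question printed
# above: optimizer.zero_grad() clears the .grad of exactly the parameters passed
# to the optimizer, while Module.zero_grad() clears every parameter of that
# module. In train_procedure_test() only encoder.linear is registered, so the
# decoder's gradients survive optimizer.zero_grad() and need decoder.zero_grad()
# (or registration with the optimizer) to be cleared. The module names below are
# stand-ins, not the project's real models.
def zero_grad_demo():
    lin_a = nn.Linear(4, 4)   # stands in for encoder.linear
    lin_b = nn.Linear(4, 4)   # stands in for the decoder
    opt = optim.Adam(lin_a.parameters(), lr=0.001)

    out = lin_b(lin_a(torch.randn(2, 4))).sum()
    out.backward()

    opt.zero_grad()            # clears lin_a's grads only
    print(lin_a.weight.grad)   # None or zeros, depending on the PyTorch version
    print(lin_b.weight.grad)   # still holds gradients
    lin_b.zero_grad()          # the module-level call clears lin_b
    print(lin_b.weight.grad)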
def loader_test():
    config = get_config()
    load_path = config['word2idx_test_path']
    voca_path = config['caption_vocab_path']
    dataset = load_tokenized_data(load_path)
    print(dataset['image_list'])
    voca = load_voca(voca_path)

    loader = make_caption_loader(dataset, 10, config['train_image_path'])
    dataiter = iter(loader)
    images, padded_caption, caption_length = next(dataiter)
    print(images)
def caption_train(vocab_path, image_path, cfg, caption_path, word2idx_path=None):
    voca = load_voca(vocab_path)
    if word2idx_path is not None:
        dataset = load_tokenized_data(word2idx_path)
    else:
        dataset = tokenized_data(caption_path, voca, type="train")
        save_tokenized_data(dataset, type="train")

    batch = cfg['caption_batch']
    embed_size = cfg['caption_embed_size']
    hidden_size = cfg['caption_hidden_size']
    hidden_layer = cfg['caption_hidden_layer']
    epochs = cfg['caption_epoch']

    loader = make_caption_loader(dataset, batch, image_path)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size,
                         len(voca),
                         hidden_layers_num=hidden_layer,
                         hidden_size=hidden_size)
    encoder.to(device)
    decoder.to(device)

    learning_rate = 5e-5
    adam_epsilon = 1e-8
    loss_function = nn.CrossEntropyLoss()
    # Only the encoder's projection and batch-norm layers plus the decoder
    # are optimized; the pretrained CNN backbone stays frozen.
    param_list = list(encoder.linear.parameters()) + list(
        encoder.bn.parameters()) + list(decoder.parameters())
    optimizer = AdamW(param_list, lr=learning_rate, eps=adam_epsilon)
    num_training_steps = len(loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    global_step = 0
    epochs_trained = 0
    tr_loss = 0.0
    logging_loss = 0.0
    train_iterator = trange(epochs_trained, int(epochs), desc="Epoch")
    logging_steps = 500
    loss_record = []

    for epoch in train_iterator:
        epoch_iterator = tqdm(loader, desc="Iteration")
        for idx_of_batch, (images, word2idxes, length) in enumerate(epoch_iterator):
            images, word2idxes = images.to(device), word2idxes.to(device)
            features = encoder(images)
            # Flatten the padded captions so the loss only covers real timesteps.
            compare_targets = pack_padded_sequence(word2idxes,
                                                   length,
                                                   batch_first=True).data
            output = decoder(word2idxes, features, length)
            loss = loss_function(output, compare_targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            tr_loss += loss.item()
            global_step += 1
            if logging_steps > 0 and global_step % logging_steps == 0:
                logs = {}
                loss_scalar = (tr_loss - logging_loss) / logging_steps
                learning_rate_scalar = scheduler.get_last_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["loss"] = loss_scalar
                loss_record.append(loss_scalar)
                logging_loss = tr_loss
                epoch_iterator.write(
                    json.dumps({**logs, **{"step": global_step}}))

    return loss_record, encoder, decoder
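
# A hedged usage sketch (not part of the original module) showing one way to
# drive caption_train() and save its outputs with the helpers used in
# train_procedure_test() above. The function name and the 'caption_path' config
# key are assumptions; the other keys appear in the test code above.
def run_caption_train_sketch():
    config = get_config()
    loss_record, encoder, decoder = caption_train(
        vocab_path=config['caption_vocab_path'],
        image_path=config['caption_train_image_path'],
        cfg=config,
        caption_path=config['caption_path'],          # assumed key
        word2idx_path=config['word2idx_train_path'])

    datestr = date2str()
    save_path = config['checkpoints_saved_path']
    save_loss(loss_record, "loss" + datestr, save_path)
    save_model(encoder, "encoder" + datestr, save_path)
    save_model(decoder, "decoder" + datestr, save_path)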
def caption_test(vocab_path, encoder_path, decoder_path, caption_path,
                 image_path, config_path, batch, max_sequence_len,
                 word2idx_path=None):
    vocab = load_voca(vocab_path)
    cfg = get_config(config_path)
    embed_size = cfg['caption_embed_size']
    vocab_size = len(vocab)
    hidden_layers_num = cfg['caption_hidden_layer']
    hidden_size = cfg['caption_hidden_size']

    if word2idx_path is not None:
        dataset = load_tokenized_data(word2idx_path)
    else:
        dataset = tokenized_data(caption_path, vocab, type="test")
        save_tokenized_data(dataset, type="test")

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, vocab_size, hidden_layers_num, hidden_size)
    encoder.load_state_dict(torch.load(encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(decoder_path, map_location=device))
    encoder.to(device)
    decoder.to(device)
    encoder.eval()
    decoder.eval()

    loader = make_caption_loader(dataset, batch, image_path)
    test_data_iter = iter(loader)
    images, captions, length = next(test_data_iter)
    device_images = images.to(device)

    features = encoder(device_images)
    states = None
    # features has shape (batch, embed_size), i.e. 2-D. The LSTM used below
    # expects a 3-D input of shape (batch, seq_len, embed_size), so we add a
    # sequence dimension of length 1.
    lstm_inputs = features.unsqueeze(1)
    predicted_index = []
    for i in range(max_sequence_len):
        outputs, states = decoder.lstm(lstm_inputs, states)
        # Squeeze outputs back to 2-D so they can be fed to the score layer.
        outputs = outputs.squeeze(1)
        scores_per_batch = decoder.score_layer(outputs)
        values, predicted = scores_per_batch.max(1)
        predicted_index.append(predicted)
        lstm_inputs = decoder.embed(predicted)
        lstm_inputs = lstm_inputs.unsqueeze(1)

    # predicted_index is a plain list of max_sequence_len tensors, each of
    # shape (batch,). It has to become a (batch x max_sequence_len) matrix, e.g.
    #   [tensor([0,3,6]), tensor([1,4,7]), tensor([2,5,8])]
    # should become
    #   [0, 1, 2]
    #   [3, 4, 5]
    #   [6, 7, 8]
    # so each existing tensor runs along dim 0 (rows) and successive tensors
    # are stacked along dim 1 (columns).
    predicted_index = torch.stack(predicted_index, dim=1)
    # The tensor may live on the GPU, so move it to the CPU before converting
    # to a numpy array.
    predicted_index = predicted_index.cpu().numpy()

    result_captions = []
    for wordindices in predicted_index:
        caption = []
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            caption.append(word)
        result_captions.append(caption)

    return images, result_captions, captions
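
# A small illustrative helper (an assumption, not in the original code) that turns
# the word lists returned by caption_test() into readable sentences and prints
# them next to the ground-truth captions. It relies only on vocab.idx2word, which
# caption_test() already uses; the function name and the '<pad>' token name are
# assumptions.
def print_caption_results(result_captions, captions, vocab):
    specials = {'<start>', '<end>', '<unk>', '<pad>'}
    for generated, target in zip(result_captions, captions):
        # target is a row of padded word indices from the loader
        target_words = [
            vocab.idx2word[int(idx)] for idx in target
            if vocab.idx2word[int(idx)] not in specials
        ]
        print("generated:", " ".join(generated))
        print("reference:", " ".join(target_words))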
def attention_caption_train(vocab_path, image_path, cfg, caption_path, word2idx_path=None):
    voca = load_voca(vocab_path)
    if word2idx_path is not None:
        dataset = load_tokenized_data(word2idx_path)
    else:
        dataset = tokenized_data(caption_path, voca, type="train")
        save_tokenized_data(dataset, type="train")

    batch = cfg['caption_batch']
    emb_dim = cfg['caption_embed_size']
    decoder_dim = cfg['caption_hidden_size']
    attention_dim = cfg['caption_attention_dim']
    dropout = cfg['caption_dropout_ratio']
    epochs = cfg['caption_epoch']

    loader = make_caption_loader(dataset, batch, image_path)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    encoder = Encoder()
    encoder.fine_tune(False)
    decoder = DecoderWithAttention(attention_dim=attention_dim,
                                   embed_dim=emb_dim,
                                   decoder_dim=decoder_dim,
                                   vocab_size=len(voca),
                                   dropout=dropout)
    encoder.to(device)
    decoder.to(device)

    learning_rate = 5e-5
    adam_epsilon = 1e-8
    loss_function = nn.CrossEntropyLoss()
    param_list = list(decoder.parameters())
    optimizer = AdamW(param_list, lr=learning_rate, eps=adam_epsilon)
    num_training_steps = len(loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    global_step = 0
    epochs_trained = 0
    tr_loss = 0.0
    logging_loss = 0.0
    train_iterator = trange(epochs_trained, int(epochs), desc="Epoch")
    logging_steps = 500
    loss_record = []

    for epoch in train_iterator:
        epoch_iterator = tqdm(loader, desc="Iteration")
        encoder.train()
        decoder.train()
        for idx_of_batch, (images, word2idxes, length) in enumerate(epoch_iterator):
            length = torch.LongTensor(length).to(device)
            images, word2idxes = images.to(device), word2idxes.to(device)
            features = encoder(images)
            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(
                features, word2idxes, length)

            # Since we decoded starting with <start>, the targets are all words
            # after <start>, up to <end>.
            targets = caps_sorted[:, 1:]

            # Remove timesteps that we didn't decode at, or are pads.
            # pack_padded_sequence is an easy trick to do this
            # (see pack_padded_trick_demo below for a tiny numeric example).
            scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
            targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data

            # Calculate loss
            loss = loss_function(scores, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            tr_loss += loss.item()
            global_step += 1
            if logging_steps > 0 and global_step % logging_steps == 0:
                logs = {}
                loss_scalar = (tr_loss - logging_loss) / logging_steps
                learning_rate_scalar = scheduler.get_last_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["loss"] = loss_scalar
                loss_record.append(loss_scalar)
                logging_loss = tr_loss
                epoch_iterator.write(json.dumps({**logs, **{"step": global_step}}))

    return loss_record, encoder, decoder
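
# A minimal numeric sketch (not in the original code) of the pack_padded_sequence
# "trick" used in the training loops above: packing a padded (batch, max_len)
# tensor and taking .data keeps only the valid timesteps, so scores and targets
# line up for nn.CrossEntropyLoss without the padded positions. The function name
# is an assumption for illustration.
def pack_padded_trick_demo():
    padded = torch.tensor([[1, 2, 3],
                           [4, 5, 0]])   # second sequence padded with 0
    lengths = [3, 2]                     # true lengths, sorted descending
    packed = pack_padded_sequence(padded, lengths, batch_first=True)
    print(packed.data)                   # tensor([1, 4, 2, 5, 3]) - pads dropped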