Code example #1
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence

# Project-local helpers (get_config, load_tokenized_data, load_voca,
# make_caption_loader, EncoderCNN, DecoderRNN, date2str, save_config,
# save_loss, save_model) are assumed to be importable from the project.


def train_procedure_test():
    config = get_config()
    load_path = config['word2idx_train_path']
    voca_path = config['caption_vocab_path']
    dataset = load_tokenized_data(load_path)
    voca = load_voca(voca_path)
    batch_size = 2
    embed_size = 10
    vocab_len = len(voca)
    hidden_layer = 1
    hidden_size = 10
    loader = make_caption_loader(dataset, batch_size,
                                 config['caption_train_image_path'])

    dataiter = iter(loader)
    images, caption, length = next(dataiter)

    # check the shapes of the loaded data
    print("Checking data shapes")
    print(images.size())
    print(caption.size())

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, vocab_len, hidden_layer, hidden_size)

    grad_params = list(encoder.linear.parameters())

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=grad_params, lr=0.001)

    compare_target = pack_padded_sequence(caption, length,
                                          batch_first=True).data

    feature = encoder(images)
    output = decoder(caption, feature, length)

    loss = loss_function(output, compare_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    datestr = date2str()
    save_path = config['checkpoints_saved_path']
    mini_batch_loss = []
    mini_batch_loss.append(loss.item())
    save_config(config, "config" + datestr, save_path)
    save_loss(mini_batch_loss, "loss" + datestr, save_path)
    save_model(encoder, "encoder" + datestr, save_path)
    save_model(decoder, "decoder" + datestr, save_path)
    print(
        "Is optimizer.zero_grad() the same as encoder.zero_grad() and decoder.zero_grad()?"
    )
    print("Before calling optimizer.zero_grad()")
    print(encoder.linear.weight.grad)
    print("After calling optimizer.zero_grad()")
    optimizer.zero_grad()
    print(encoder.linear.weight.grad)
    print("====================")
    print(grad_params)
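
The question printed at the end of this test can be answered directly: optimizer.zero_grad() clears gradients only for the parameters that were registered with the optimizer (here, encoder.linear), while module.zero_grad() clears every parameter of that module. In this example grad_params covers just encoder.linear, so the decoder's gradients survive optimizer.zero_grad(). A minimal self-contained sketch, independent of the project code:

import torch
import torch.nn as nn
import torch.optim as optim

# Toy model; only the first layer's parameters are handed to the optimizer.
model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 1))
optimizer = optim.Adam(model[0].parameters(), lr=0.001)

loss = model(torch.randn(2, 4)).sum()
loss.backward()  # both layers now hold gradients

optimizer.zero_grad()        # clears only model[0]'s gradients
print(model[0].weight.grad)  # zeroed (or None, depending on PyTorch version)
print(model[1].weight.grad)  # still holds the gradients from backward()

model.zero_grad()            # clears every parameter of the module
print(model[1].weight.grad)  # now cleared as well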
Code example #2
def loader_test():
    config = get_config()
    load_path = config['word2idx_test_path']
    voca_path = config['caption_vocab_path']
    dataset = load_tokenized_data(load_path)
    print(dataset['image_list'])
    voca = load_voca(voca_path)

    loader = make_caption_loader(dataset, 10, config['train_image_path'])
    dataiter = iter(loader)
    images, padded_caption, caption_length = next(dataiter)
    print(images)
Code example #3
import json

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from tqdm import tqdm, trange
# AdamW and the warmup scheduler match the Hugging Face transformers API
# (assumed source of these names).
from transformers import AdamW, get_linear_schedule_with_warmup


def caption_train(vocab_path,
                  image_path,
                  cfg,
                  caption_path,
                  word2idx_path=None):
    voca = load_voca(vocab_path)
    if word2idx_path is not None:
        dataset = load_tokenized_data(word2idx_path)
    else:
        dataset = tokenized_data(caption_path, voca, type="train")
        save_tokenized_data(dataset, type="train")

    batch = cfg['caption_batch']
    embed_size = cfg['caption_embed_size']
    hidden_size = cfg['caption_hidden_size']
    hidden_layer = cfg['caption_hidden_layer']
    epochs = cfg['caption_epoch']
    loader = make_caption_loader(dataset, batch, image_path)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size,
                         len(voca),
                         hidden_layers_num=hidden_layer,
                         hidden_size=hidden_size)

    encoder.to(device)
    decoder.to(device)
    learning_rate = 5e-5
    adam_epsilon = 1e-8
    loss_function = nn.CrossEntropyLoss()
    param_list = list(encoder.linear.parameters()) + list(
        encoder.bn.parameters()) + list(decoder.parameters())
    optimizer = AdamW(param_list, lr=learning_rate, eps=adam_epsilon)
    num_training_steps = len(loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    global_step = 0
    epochs_trained = 0

    tr_loss = 0.0
    logging_loss = 0.0
    train_iterator = trange(epochs_trained, int(epochs), desc="Epoch")
    logging_steps = 500
    loss_record = []
    for epoch in train_iterator:
        epoch_iterator = tqdm(loader, desc="Iteration")
        for idx_of_batch, (images, word2idxes,
                           length) in enumerate(epoch_iterator):
            images, word2idxes = images.to(device), word2idxes.to(device)
            features = encoder(images)
            compare_targets = pack_padded_sequence(word2idxes,
                                                   length,
                                                   batch_first=True).data

            output = decoder(word2idxes, features, length)
            loss = loss_function(output, compare_targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            tr_loss += loss.item()
            global_step += 1
            if logging_steps > 0 and global_step % logging_steps == 0:
                logs = {}
                loss_scalar = (tr_loss - logging_loss) / logging_steps
                learning_rate_scalar = scheduler.get_last_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["loss"] = loss_scalar
                loss_record.append(loss_scalar)
                logging_loss = tr_loss
                epoch_iterator.write(
                    json.dumps({**logs, "step": global_step}))
    return loss_record, encoder, decoder
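
A hypothetical driver for this function, reusing config keys that appear in code example #1 (get_config, save_model, and date2str are the project's own helpers):

cfg = get_config()
loss_record, encoder, decoder = caption_train(
    vocab_path=cfg['caption_vocab_path'],
    image_path=cfg['caption_train_image_path'],
    cfg=cfg,
    caption_path=None,  # unused when word2idx_path is supplied
    word2idx_path=cfg['word2idx_train_path'])
save_model(encoder, "encoder" + date2str(), cfg['checkpoints_saved_path'])
save_model(decoder, "decoder" + date2str(), cfg['checkpoints_saved_path'])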
Code example #4
import torch

# Model classes and data helpers are assumed importable from the project,
# as in the previous examples.


def caption_test(vocab_path,
                 encoder_path,
                 decoder_path,
                 caption_path,
                 image_path,
                 config_path,
                 batch,
                 max_sequence_len,
                 word2idx_path=None):
    vocab = load_voca(vocab_path)
    cfg = get_config(config_path)

    embed_size = cfg['caption_embed_size']
    vocab_size = len(vocab)
    hidden_layers_num = cfg['caption_hidden_layer']
    hidden_size = cfg['caption_hidden_size']

    if word2idx_path is not None:
        dataset = load_tokenized_data(word2idx_path)
    else:
        dataset = tokenized_data(caption_path, vocab, type="test")
        save_tokenized_data(dataset, type="test")

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, vocab_size, hidden_layers_num,
                         hidden_size)

    # map_location lets checkpoints trained on GPU load on CPU-only machines
    encoder.load_state_dict(torch.load(encoder_path, map_location="cpu"))
    decoder.load_state_dict(torch.load(decoder_path, map_location="cpu"))

    encoder.eval()
    decoder.eval()

    loader = make_caption_loader(dataset, batch, image_path)

    test_data_iter = iter(loader)
    images, captions, length = next(test_data_iter)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    encoder.to(device)
    decoder.to(device)
    device_images = images.to(device)
    features = encoder(device_images)
    states = None

    # features is 2-D with shape (batch, embed_size), but the LSTM used below
    # expects a 3-D input of shape (batch, num of embeddings, embed_size),
    # so an extra dimension is inserted.
    lstm_inputs = features.unsqueeze(1)
    predicted_index = []
    for i in range(max_sequence_len):
        outputs, states = decoder.lstm(lstm_inputs, states)
        # squeeze outputs back to 2-D so it can feed the linear score layer
        outputs = outputs.squeeze(1)
        scores_per_batch = decoder.score_layer(outputs)
        values, predicted = scores_per_batch.max(1)
        predicted_index.append(predicted)
        lstm_inputs = decoder.embed(predicted)
        lstm_inputs = lstm_inputs.unsqueeze(1)

    # predicted_index is currently a flat list of max_sequence_len tensors,
    # each holding one predicted index per batch item; it has to become a
    # 2-D matrix of shape [batch x max_sequence_len].
    # e.g.
    # predicted_index = [tensor([0,3,6]), tensor([1,4,7]), tensor([2,5,8])]
    # must become
    # [0,1,2]
    # [3,4,5]
    # [6,7,8]
    # The existing list direction becomes dim 0 (vertical) and each appended
    # tensor is laid out along dim 1 (horizontal), hence stacking on dim=1.

    predicted_index = torch.stack(predicted_index, dim=1)
    # the tensor may live on the GPU, so move it to the CPU before the NumPy conversion.
    predicted_index = predicted_index.cpu().numpy()

    result_captions = []
    for wordindices in predicted_index:
        caption = []
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            caption.append(word)
        result_captions.append(caption)

    return images, result_captions, captions
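
The function returns the raw image batch, the greedy-decoded captions as token lists, and the padded ground-truth captions. Rendering the predictions is then just a join; a sketch with placeholder paths:

images, predicted, references = caption_test(
    vocab_path, encoder_path, decoder_path, caption_path,
    image_path, config_path, batch=10, max_sequence_len=20)
for tokens in predicted:
    print(" ".join(tokens))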
Code example #5
# Same imports as code example #3; Encoder and DecoderWithAttention are
# project-local model classes.


def attention_caption_train(vocab_path,
                            image_path,
                            cfg,
                            caption_path,
                            word2idx_path=None):
    voca = load_voca(vocab_path)
    if word2idx_path is not None:
        dataset = load_tokenized_data(word2idx_path)
    else:
        dataset = tokenized_data(caption_path, voca, type="train")
        save_tokenized_data(dataset, type="train")

    batch = cfg['caption_batch']
    emb_dim = cfg['caption_embed_size']
    decoder_dim = cfg['caption_hidden_size']
    attention_dim = cfg['caption_attention_dim']
    dropout = cfg['caption_dropout_ratio']
    epochs = cfg['caption_epoch']
    loader = make_caption_loader(dataset, batch, image_path)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    encoder = Encoder()
    encoder.fine_tune(False)
    decoder = DecoderWithAttention(attention_dim=attention_dim,
                                   embed_dim=emb_dim,
                                   decoder_dim=decoder_dim,
                                   vocab_size=len(voca),
                                   dropout=dropout)

    encoder.to(device)
    decoder.to(device)
    learning_rate = 5e-5
    adam_epsilon = 1e-8
    loss_function = nn.CrossEntropyLoss()
    param_list = list(decoder.parameters())
    optimizer = AdamW(param_list, lr=learning_rate, eps=adam_epsilon)
    num_training_steps = len(loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )
    global_step = 0
    epochs_trained = 0

    tr_loss = 0.0
    logging_loss = 0.0
    train_iterator = trange(epochs_trained, int(epochs), desc="Epoch")
    logging_steps = 500
    loss_record = []
    for epoch in train_iterator:
        epoch_iterator = tqdm(loader, desc="Iteration")
        encoder.train()
        decoder.train()
        for idx_of_batch, (images, word2idxes,
                           length) in enumerate(epoch_iterator):
            length = torch.LongTensor(length).to(device)
            images, word2idxes = images.to(device), word2idxes.to(device)
            features = encoder(images)
            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(
                features, word2idxes, length)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]

            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
            targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data
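            # Concretely, with batch_first=True, .data drops the pads and
            # concatenates the sequences timestep-major; a tiny illustration:
            #   pack_padded_sequence(torch.tensor([[1, 2, 3], [4, 5, 0]]),
            #                        [3, 2], batch_first=True).data
            #   -> tensor([1, 4, 2, 5, 3])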

            # Calculate loss
            loss = loss_function(scores, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            tr_loss += loss.item()
            global_step += 1
            if logging_steps > 0 and global_step % logging_steps == 0:
                logs = {}
                loss_scalar = (tr_loss - logging_loss) / logging_steps
                learning_rate_scalar = scheduler.get_last_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["loss"] = loss_scalar
                loss_record.append(loss_scalar)
                logging_loss = tr_loss
                epoch_iterator.write(
                    json.dumps({**logs, "step": global_step}))
    return loss_record, encoder, decoder
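
The decoder also returns the attention weights (alphas), which this loop ignores. The Show, Attend and Tell recipe that this decoder interface appears to follow commonly adds a doubly stochastic attention regularizer; if desired, the loss computation above could be extended as below (alpha_c is an assumed hyperparameter, not read from cfg):

# after: loss = loss_function(scores, targets)
alpha_c = 1.0  # regularization weight; not part of the original config
loss = loss + alpha_c * ((1.0 - alphas.sum(dim=1)) ** 2).mean()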