Example #1
def train_procedure_test():
    config = get_config()
    load_path = config['word2idx_train_path']
    voca_path = config['caption_vocab_path']
    dataset = load_tokenized_data(load_path)
    voca = load_voca(voca_path)
    batch_size = 2
    embed_size = 10
    vocab_len = len(voca)
    hidden_layer = 1
    hidden_size = 10
    loader = make_caption_loader(dataset, batch_size,
                                 config['caption_train_image_path'])

    dataiter = iter(loader)
    images, caption, length = next(dataiter)

    # Check the shapes of the data
    print("Checking data shapes")
    print(images.size())
    print(caption.size())

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, vocab_len, hidden_layer, hidden_size)

    grad_params = list(encoder.linear.parameters())

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=grad_params, lr=0.001)

    compare_target = pack_padded_sequence(caption, length,
                                          batch_first=True).data

    feature = encoder(images)
    output = decoder(caption, feature, length)

    loss = loss_function(output, compare_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    datestr = date2str()
    save_path = config['checkpoints_saved_path']
    mini_batch_loss = []
    mini_batch_loss.append(loss.item())
    save_config(config, "config" + datestr, save_path)
    save_loss(mini_batch_loss, "loss" + datestr, save_path)
    save_model(encoder, "encoder" + datestr, save_path)
    save_model(decoder, "decoder" + datestr, save_path)
    print(
        "Is optimizer.zero_grad() the same as encoder.zero_grad() and decoder.zero_grad()?"
    )
    print("Before calling optimizer.zero_grad()")
    print(encoder.linear.weight.grad)
    print("After calling optimizer.zero_grad()")
    optimizer.zero_grad()
    print(encoder.linear.weight.grad)
    print("====================")
    print(grad_params)
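
The question printed at the end of this test has a concrete answer: optimizer.zero_grad() only clears the gradients of the parameters that were registered with the optimizer (here, encoder.linear.parameters()), while encoder.zero_grad() and decoder.zero_grad() clear the gradients of every parameter in those modules. A minimal sketch of the difference (the stand-in module names below are illustrative, not taken from the project):

import torch
import torch.nn as nn
import torch.optim as optim

# Two small stand-in modules; only enc's parameters are handed to the optimizer.
enc = nn.Linear(4, 4)
dec = nn.Linear(4, 4)
opt = optim.Adam(enc.parameters(), lr=1e-3)

loss = dec(enc(torch.randn(2, 4))).sum()
loss.backward()

opt.zero_grad()                  # clears only the registered params (enc)
print(enc.weight.grad)           # None or zeros, depending on set_to_none / PyTorch version
print(dec.weight.grad is None)   # False: dec's gradients are untouched
dec.zero_grad()                  # module-level call clears all of dec's parameters
print(dec.weight.grad)           # None or zeros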
Example #2
def save_caption2idx_test():
    config = get_config()
    AI_DIREC = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    voca_path = AI_DIREC + config['caption_vocab_path']
    train_json_path = AI_DIREC + config['caption_train_path']
    voca = load_voca(voca_path)
    dataset = tokenized_data(train_json_path, voca)
    save_tokenized_data(dataset=dataset, AI_DIREC=AI_DIREC)
Example #3
def loader_test():
    config = get_config()
    load_path = config['word2idx_test_path']
    voca_path = config['caption_vocab_path']
    dataset = load_tokenized_data(load_path)
    print(dataset['image_list'])
    voca = load_voca(voca_path)

    loader = make_caption_loader(dataset, 10, config['train_image_path'])
    dataiter = iter(loader)
    images, padded_caption, caption_length = next(dataiter)
    print(images)
Example #4
def caption_train(vocab_path,
                  image_path,
                  cfg,
                  caption_path,
                  word2idx_path=None):
    voca = load_voca(vocab_path)
    if word2idx_path is not None:
        dataset = load_tokenized_data(word2idx_path)
    else:
        dataset = tokenized_data(caption_path, voca, type="train")
        save_tokenized_data(dataset, type="train")

    batch = cfg['caption_batch']
    embed_size = cfg['caption_embed_size']
    hidden_size = cfg['caption_hidden_size']
    hidden_layer = cfg['caption_hidden_layer']
    epochs = cfg['caption_epoch']
    loader = make_caption_loader(dataset, batch, image_path)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size,
                         len(voca),
                         hidden_layers_num=hidden_layer,
                         hidden_size=hidden_size)

    encoder.to(device)
    decoder.to(device)
    learning_rate = 5e-5
    adam_epsilon = 1e-8
    loss_function = nn.CrossEntropyLoss()
    param_list = list(encoder.linear.parameters()) + list(
        encoder.bn.parameters()) + list(decoder.parameters())
    optimizer = AdamW(param_list, lr=learning_rate, eps=adam_epsilon)
    num_training_steps = len(loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    global_step = 0
    epochs_trained = 0

    tr_loss = 0.0
    logging_loss = 0.0
    train_iterator = trange(epochs_trained, int(epochs), desc="Epoch")
    logging_steps = 500
    loss_record = []
    for epoch in train_iterator:
        epoch_iterator = tqdm(loader, desc="Iteration")
        for idx_of_batch, (images, word2idxes,
                           length) in enumerate(epoch_iterator):
            images, word2idxes = images.to(device), word2idxes.to(device)
            features = encoder(images)
            compare_targets = pack_padded_sequence(word2idxes,
                                                   length,
                                                   batch_first=True).data

            output = decoder(word2idxes, features, length)
            loss = loss_function(output, compare_targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            tr_loss += loss.item()
            global_step += 1
            if logging_steps > 0 and global_step % logging_steps == 0:
                logs = {}
                loss_scalar = (tr_loss - logging_loss) / logging_steps
                learning_rate_scalar = scheduler.get_last_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["loss"] = loss_scalar
                loss_record.append(loss_scalar)
                logging_loss = tr_loss
                epoch_iterator.write(
                    json.dumps({
                        **logs,
                        **{
                            "step": global_step
                        }
                    }))
    return loss_record, encoder, decoder
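
A note on how the targets above are built: pack_padded_sequence(word2idxes, length, batch_first=True).data flattens the padded caption batch into a single 1-D tensor of real tokens, ordered time-major with the padding dropped, which is what the packed scores coming out of the decoder are expected to line up with, so CrossEntropyLoss never sees pad positions. A self-contained sketch with made-up token ids:

import torch
from torch.nn.utils.rnn import pack_padded_sequence

# Two padded captions of lengths 4 and 2 (0 is the pad index here).
captions = torch.tensor([[1, 5, 6, 2],
                         [1, 2, 0, 0]])
lengths = [4, 2]  # must be sorted in decreasing order (or pass enforce_sorted=False)

targets = pack_padded_sequence(captions, lengths, batch_first=True).data
print(targets)        # tensor([1, 1, 5, 2, 6, 2]) -- time-major, pads removed
print(targets.shape)  # torch.Size([6]) == sum(lengths)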
Example #5
def caption_test(vocab_path,
                 encoder_path,
                 decoder_path,
                 caption_path,
                 image_path,
                 config_path,
                 batch,
                 max_sequence_len,
                 word2idx_path=None):
    vocab = load_voca(vocab_path)
    cfg = get_config(config_path)

    embed_size = cfg['caption_embed_size']
    vocab_size = len(vocab)
    hidden_layers_num = cfg['caption_hidden_layer']
    hidden_size = cfg['caption_hidden_size']

    if word2idx_path is not None:
        dataset = load_tokenized_data(word2idx_path)
    else:
        dataset = tokenized_data(caption_path, vocab, type="test")
        save_tokenized_data(dataset, type="test")

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, vocab_size, hidden_layers_num,
                         hidden_size)

    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))

    encoder.eval()
    decoder.eval()

    loader = make_caption_loader(dataset, batch, image_path)

    test_data_iter = iter(loader)
    images, captions, length = next(test_data_iter)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    encoder.to(device)
    decoder.to(device)
    device_images = images.to(device)
    features = encoder(device_images)
    states = None

    # features is 2-D with shape (batch, embed_size), but the LSTM used below
    # expects a 3-D input of shape (batch, num_embeddings, embed_size),
    # so we add an extra dimension to features.
    lstm_inputs = features.unsqueeze(1)
    predicted_index = []
    for i in range(max_sequence_len):
        outputs, states = decoder.lstm(lstm_inputs, states)
        # squeeze outputs back to 2-D so they can be fed into the linear layer
        outputs = outputs.squeeze(1)
        scores_per_batch = decoder.score_layer(outputs)
        values, predicted = scores_per_batch.max(1)
        predicted_index.append(predicted)
        lstm_inputs = decoder.embed(predicted)
        lstm_inputs = lstm_inputs.unsqueeze(1)

    # predicted_index is currently a plain Python list of [batch]-sized tensors,
    # one per timestep, and it has to become a 2-D matrix of shape
    # [batch x max_sequence_len].
    # e.g.
    # predicted_index = [tensor([0,3,6]), tensor([1,4,7]), tensor([2,5,8])]
    # should become
    # [0,1,2]
    # [3,4,5]
    # [6,7,8]
    # i.e. each existing tensor lies along dim 0 (vertically), and every newly
    # appended tensor is attached along dim 1 (horizontally).

    predicted_index = torch.stack(predicted_index, dim=1)
    # The tensor may be on the GPU, so move it to the CPU before converting it to numpy.
    predicted_index = predicted_index.cpu().numpy()

    result_captions = []
    for wordindices in predicted_index:
        caption = []
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            caption.append(word)
        result_captions.append(caption)

    return images, result_captions, captions
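
The reshaping described in the comments above can be checked in isolation. This sketch is self-contained and uses the same toy values as the comment: one [batch]-sized prediction tensor per timestep is stacked into a [batch, max_sequence_len] matrix, so each row holds the decoded word indices for one image.

import torch

step_preds = [torch.tensor([0, 3, 6]),   # predictions at t=0 for a batch of 3
              torch.tensor([1, 4, 7]),   # t=1
              torch.tensor([2, 5, 8])]   # t=2

per_image = torch.stack(step_preds, dim=1)
print(per_image)
# tensor([[0, 1, 2],
#         [3, 4, 5],
#         [6, 7, 8]])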
Example #6
def predict(images, root_path, AI_directory_path, model_type="life"):
    config = get_config()
    # 0. Extract captions from images
    vocab = load_voca(AI_directory_path + config['caption_vocab_path'])
    caption_embed_size = config['caption_embed_size']
    caption_hidden_layer = config['caption_hidden_layer']
    caption_hidden_size = config['caption_hidden_size']
    caption_encoder_path = AI_directory_path + config['caption_encoder_path']
    caption_decoder_path = AI_directory_path + config['caption_decoder_path']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    max_sequence_len = 30  # default value

    transform = torch_transform.Compose([
        torch_transform.ToTensor(),
        torch_transform.Normalize(mean=(0.4444, 0.4215, 0.3833), std=(0.2738, 0.2664, 0.2766))])

    encoder = EncoderCNN(caption_embed_size)
    decoder = DecoderRNN(caption_embed_size, len(vocab), caption_hidden_layer, caption_hidden_size)

    encoder.load_state_dict(torch.load(caption_encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(caption_decoder_path, map_location=device))
    images = load_image(images, root_path, transform)

    encoder.eval()
    decoder.eval()

    encoder.to(device)
    decoder.to(device)
    images = images.to(device)

    features = encoder(images)
    states = None
    predicted_index = []
    lstm_inputs = features.unsqueeze(1)

    for i in range(max_sequence_len):
        outputs, states = decoder.lstm(lstm_inputs, states)
        # squeeze outputs back to 2-D so they can be fed into the linear layer
        outputs = outputs.squeeze(1)
        scores_per_batch = decoder.score_layer(outputs)
        values, predicted = scores_per_batch.max(1)
        predicted_index.append(predicted)
        lstm_inputs = decoder.embed(predicted)
        lstm_inputs = lstm_inputs.unsqueeze(1)

    predicted_index = torch.stack(predicted_index, dim=1)
    predicted_index = predicted_index.cpu().numpy()

    result_captions = []
    for wordindices in predicted_index:
        text = ""
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            text += word + " "
        result_captions.append(text)

    print("result_caption : ",result_captions)
    # 1. translate captions to korean

    korean_sentences = []
    for sent in result_captions:
        translate_result = get_translate(sent)
        if translate_result != -1:
            translate_result = re.sub(r'\.', '', translate_result)
            korean_sentences.append(translate_result)
    print("result_korean : ", korean_sentences)

    kogpt2_config = get_kog_config()
    if model_type == "life":
        kogpt2_model_path = AI_directory_path + config['kogpt_life_model_path']
    elif model_type == "story":
        kogpt2_model_path = AI_directory_path + config['kogpt_story_model_path']
    else:
        kogpt2_model_path = AI_directory_path + config['kogpt_model_path']
    kogpt2_vocab_path = AI_directory_path + config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path, map_location=device))

    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                         mask_token=None,
                                                         sep_token=None,
                                                         cls_token=None,
                                                         unknown_token='<unk>',
                                                         padding_token='<pad>',
                                                         bos_token='<s>',
                                                         eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)

    korean_preprocess(korean_sentences)
    gpt_result = naive_prediction(korean_sentences, tok, vocab, device, kogpt2model, model_type)
    korean_postprocess(gpt_result)
    result = []
    make_sentence(gpt_result, "", result, 0)
    result.sort(key=lambda item: (-len(item), item))
    result_len = len(result)
    if result_len > 11:
        result_len = 11
    result = result[1:result_len]
    return result
Example #7
def attention_caption_train(vocab_path, image_path, cfg, caption_path, word2idx_path=None):
    voca = load_voca(vocab_path)
    if word2idx_path is not None:
        dataset = load_tokenized_data(word2idx_path)
    else:
        dataset = tokenized_data(caption_path, voca, type="train")
        save_tokenized_data(dataset, type="train")

    batch = cfg['caption_batch']
    emb_dim = cfg['caption_embed_size']
    decoder_dim = cfg['caption_hidden_size']
    attention_dim = cfg['caption_attention_dim']
    dropout = cfg['caption_dropout_ratio']
    epochs = cfg['caption_epoch']
    loader = make_caption_loader(dataset, batch, image_path)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    encoder = Encoder()
    encoder.fine_tune(False)
    decoder = DecoderWithAttention(attention_dim=attention_dim,
                                   embed_dim=emb_dim,
                                   decoder_dim=decoder_dim,
                                   vocab_size=len(voca),
                                   dropout=dropout)

    encoder.to(device)
    decoder.to(device)
    learning_rate = 5e-5
    adam_epsilon = 1e-8
    loss_function = nn.CrossEntropyLoss()
    param_list = list(decoder.parameters())
    optimizer = AdamW(param_list, lr=learning_rate, eps=adam_epsilon)
    num_training_steps = len(loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )
    global_step = 0
    epochs_trained = 0

    tr_loss = 0.0
    logging_loss = 0.0
    train_iterator = trange(
        epochs_trained, int(epochs), desc="Epoch"
    )
    logging_steps = 500
    loss_record = []
    for epoch in train_iterator:
        epoch_iterator = tqdm(loader, desc="Iteration")
        encoder.train()
        decoder.train()
        for idx_of_batch, (images, word2idxes, length) in enumerate(epoch_iterator):
            length = torch.LongTensor(length).to(device)
            images, word2idxes = images.to(device), word2idxes.to(device)
            features = encoder(images)
            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(features, word2idxes, length)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]

            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
            targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data

            # Calculate loss
            loss = loss_function(scores, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            tr_loss += loss.item()
            global_step += 1
            if logging_steps > 0 and global_step % logging_steps == 0:
                logs = {}
                loss_scalar = (tr_loss - logging_loss) / logging_steps
                learning_rate_scalar = scheduler.get_last_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["loss"] = loss_scalar
                loss_record.append(loss_scalar)
                logging_loss = tr_loss
                epoch_iterator.write(json.dumps({**logs, **{"step": global_step}}))
    return loss_record, encoder, decoder
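
One note on the attention outputs: the alphas returned by DecoderWithAttention are not used in the loss above. Implementations that follow "Show, Attend and Tell" commonly add a doubly stochastic attention regularizer that pushes the attention weights over each pixel to sum to roughly 1 across timesteps. Assuming alphas has shape (batch, decode_len, num_pixels), the extra term would look like the self-contained sketch below; the coefficient alpha_c = 1.0 is a conventional choice, not something taken from this code.

import torch

# Hedged sketch of the regularizer alone, with toy attention weights.
batch, decode_len, num_pixels = 2, 3, 4
alphas = torch.softmax(torch.randn(batch, decode_len, num_pixels), dim=2)
alpha_c = 1.0  # assumed regularization weight
att_reg = alpha_c * ((1.0 - alphas.sum(dim=1)) ** 2).mean()
print(att_reg)  # this term would be added to the cross-entropy loss inside the loop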
Example #8
def attention_beam_search_test(images, root_path):
    config = get_config()
    # 0. Extract captions from images
    AI_directory_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    vocab = load_voca(AI_directory_path +
                      config['caption_attention_vocab_path'])
    emb_dim = config['caption_embed_size']
    decoder_dim = config['caption_hidden_size']
    attention_dim = config['caption_attention_dim']
    dropout = config['caption_dropout_ratio']
    caption_encoder_path = AI_directory_path + config[
        'caption_attention_encoder_path']
    caption_decoder_path = AI_directory_path + config[
        'caption_attention_decoder_path']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    max_sequence_len = 50  # default value

    transform = torch_transform.Compose([
        torch_transform.ToTensor(),
        torch_transform.Normalize(mean=(0.4444, 0.4215, 0.3833),
                                  std=(0.2738, 0.2664, 0.2766))
    ])

    encoder = Encoder()
    decoder = DecoderWithAttention(attention_dim=attention_dim,
                                   embed_dim=emb_dim,
                                   decoder_dim=decoder_dim,
                                   vocab_size=len(vocab),
                                   dropout=dropout)

    encoder.load_state_dict(
        torch.load(caption_encoder_path, map_location=device))
    decoder.load_state_dict(
        torch.load(caption_decoder_path, map_location=device))
    images = load_image(images, root_path, transform)

    encoder.eval()
    decoder.eval()

    encoder.to(device)
    decoder.to(device)
    images = images.to(device)
    batch = images.shape[0]

    predicted_index = []
    encoder_out = encoder(
        images)  # (batch, enc_image_size, enc_image_size, encoder_dim)
    enc_image_size = encoder_out.size(1)
    encoder_dim = encoder_out.size(3)

    # Flatten encoding
    encoder_out = encoder_out.view(batch, -1,
                                   encoder_dim)  # (batch, num_pixels, encoder_dim)
    num_pixels = encoder_out.size(1)
    k_prev_words = torch.LongTensor([[vocab('<start>')]] * batch).to(device)
    h, c = decoder.init_hidden_state(encoder_out)
    for i in range(max_sequence_len):
        embeddings = decoder.embedding(k_prev_words).squeeze(
            1)  # (s, embed_dim)
        awe, _ = decoder.attention(encoder_out,
                                   h)  # (s, encoder_dim), (s, num_pixels)
        gate = decoder.sigmoid(
            decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
        awe = gate * awe
        h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1),
                                   (h, c))  # (s, decoder_dim)
        scores = decoder.fc(h)  # (s, vocab_size)
        _, predicted = scores.max(1)
        predicted_index.append(predicted)
        k_prev_words = predicted.unsqueeze(1)

    predicted_index = torch.stack(predicted_index, dim=1)
    predicted_index = predicted_index.cpu().numpy()

    result_captions = []
    for wordindices in predicted_index:
        text = ""
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            text += word + " "
        result_captions.append(text)

    print("result_caption : ", result_captions)