# 예제 #1 (Example #1)
def korean_gpt_long_setence_life_test():
    """Smoke-test KoGPT-2 "life" model generation from a short Korean prompt.

    Loads a fine-tuned GPT2LMHeadModel checkpoint, tokenizes a seed sentence
    with SentencePiece, generates 3 candidate continuations and prints them
    with the special tokens stripped.
    """
    config = get_config()
    kogpt2_config = get_kog_config()
    # NOTE(review): hard-coded absolute Windows path — consider moving it
    # into config like kogpt_vocab_path below.
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"
    kogpt2_vocab_path = config['kogpt_vocab_path']

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    # Fix: the original called torch.load() twice and discarded the first
    # result (the checkpoint was deserialized twice). map_location lets a
    # CUDA-trained checkpoint load on CPU-only machines.
    kogpt2model.load_state_dict(
        torch.load(kogpt2_model_path, map_location=device))
    kogpt2model.to(device)
    kogpt2model.eval()

    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)

    sent = '나는 밥을 먹었'
    toked = tok(sent)
    print(toked)

    # Prepend BOS and wrap into a (1, seq_len) batch.
    input_ids = torch.tensor([vocab[vocab.bos_token]] +
                             vocab[toked]).unsqueeze(0)
    input_ids = input_ids.to(device)

    outputs = kogpt2model.generate(input_ids=input_ids,
                                   max_length=100,
                                   min_length=50,
                                   repetition_penalty=1.2,
                                   do_sample=True,
                                   num_beams=3,
                                   bos_token_id=0,
                                   pad_token_id=3,
                                   eos_token_id=1,
                                   num_return_sequences=3)

    print("========수필===========")
    for i in range(3):  # 3 output sequences were generated
        decoded = vocab.to_tokens(outputs[i].squeeze().tolist())
        # Strip sentencepiece word-boundary marks and special tokens.
        ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '',
                     ''.join(decoded).replace('▁', ' ').strip())
        print('Generated {}: {}'.format(i, ret))
# 예제 #2 (Example #2)
def fine_tuning(config, fine_tune_num, AI_DIRECTORY):
    """Fine-tune the KoGPT-2 language model on the story dataset.

    Args:
        config: project config dict (paths, batch size, epoch count).
        fine_tune_num: number of layers to freeze; currently unused because
            the freeze_model call is commented out.
        AI_DIRECTORY: root directory prepended to config-relative paths.

    Returns:
        (model, loss_record): the fine-tuned GPT2LMHeadModel and a list of
        average losses sampled every `logging_steps` optimizer steps.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = config['kogpt_batch_size']
    train_path = AI_DIRECTORY + config['kogpt_story_train_data_path']
    num_train_epochs = config['kogpt_epoch']

    kogpt2_config = get_kog_config()
    kogpt2_model_path = AI_DIRECTORY + config['kogpt_model_path']

    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    # map_location lets a CUDA-trained checkpoint load on a CPU-only host.
    kogpt2model.load_state_dict(
        torch.load(kogpt2_model_path, map_location=device))
    kogpt2model.to(device)
    # (Removed unused vocab/tokenizer construction: training only needs the
    # pre-tokenized tensors served by make_kogpt2_loader.)

    loader = make_kogpt2_loader(train_path, batch_size)
    num_training_steps = len(loader) * num_train_epochs
    learning_rate = 5e-6
    adam_epsilon = 1e-8
    warmup_steps = 0
    no_decay = ["bias", "LayerNorm.weight"]
    #freeze_model(fine_tune_num,kogpt2model)
    # NOTE(review): both groups use weight_decay=0.0, which makes the
    # no_decay split a no-op. The first group was presumably meant to carry
    # a non-zero decay (commonly 0.01) — confirm before changing.
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in kogpt2model.named_parameters()
                if not any(nd in n for nd in no_decay) and p.requires_grad
            ],
            "weight_decay": 0.0,
        },
        {
            "params": [
                p for n, p in kogpt2model.named_parameters()
                if any(nd in n for nd in no_decay) and p.requires_grad
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=learning_rate,
                      eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_training_steps)

    global_step = 0
    tr_loss = 0.0
    logging_loss = 0.0
    logging_steps = 500
    loss_record = []
    kogpt2model.zero_grad()

    train_iterator = trange(0, int(num_train_epochs), desc="Epoch")
    for epoch in train_iterator:
        epoch_iterator = tqdm(loader, desc="Iteration")
        for step, inputs in enumerate(epoch_iterator):
            kogpt2model.train()
            # Language-model objective: labels are the inputs themselves
            # (the model shifts them internally to predict the next token).
            # Renamed from `input` (shadowed a builtin) and dropped the
            # second redundant .to(device) transfer for the label tensor.
            batch = inputs.to(device)

            outputs = kogpt2model(input_ids=batch, labels=batch)
            loss = outputs[0]  # model outputs are always tuple in transformers
            loss.backward()

            tr_loss += loss.item()

            optimizer.step()
            scheduler.step()
            kogpt2model.zero_grad()
            global_step += 1

            if logging_steps > 0 and global_step % logging_steps == 0:
                # Average loss over the last logging window.
                loss_scalar = (tr_loss - logging_loss) / logging_steps
                logs = {
                    "learning_rate": scheduler.get_last_lr()[0],
                    "loss": loss_scalar,
                }
                loss_record.append(loss_scalar)
                logging_loss = tr_loss
                epoch_iterator.write(json.dumps({**logs, "step": global_step}))

    return kogpt2model, loss_record
# 예제 #3 (Example #3)
def predict(images, root_path, AI_directory_path, model_type="life"):
    """Generate candidate Korean stories from input images.

    Pipeline: (0) caption each image with the CNN encoder / LSTM decoder,
    (1) translate the English captions to Korean, (2) feed the Korean
    sentences to a fine-tuned KoGPT-2 model and post-process the generated
    text into ranked candidate sentences.

    Args:
        images: image file names, resolved against root_path by load_image.
        root_path: directory containing the images.
        AI_directory_path: root directory prepended to config-relative paths.
        model_type: KoGPT-2 checkpoint selector — "life", "story", or
            anything else for the base model.

    Returns:
        Up to 10 generated sentences, longest first.
    """
    config = get_config()
    # 0. Extract captions from images.
    vocab = load_voca(AI_directory_path + config['caption_vocab_path'])
    caption_embed_size = config['caption_embed_size']
    caption_hidden_layer = config['caption_hidden_layer']
    caption_hidden_size = config['caption_hidden_size']
    caption_encoder_path = AI_directory_path + config['caption_encoder_path']
    caption_decoder_path = AI_directory_path + config['caption_decoder_path']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    max_sequence_len = 30  # cap on generated caption length (default value)

    transform = torch_transform.Compose([
        torch_transform.ToTensor(),
        torch_transform.Normalize(mean=(0.4444, 0.4215, 0.3833),
                                  std=(0.2738, 0.2664, 0.2766))])

    encoder = EncoderCNN(caption_embed_size)
    decoder = DecoderRNN(caption_embed_size, len(vocab),
                         caption_hidden_layer, caption_hidden_size)

    encoder.load_state_dict(torch.load(caption_encoder_path,
                                       map_location=device))
    decoder.load_state_dict(torch.load(caption_decoder_path,
                                       map_location=device))
    images = load_image(images, root_path, transform)

    encoder.eval()
    decoder.eval()

    encoder.to(device)
    decoder.to(device)
    images = images.to(device)

    # Greedy decoding: feed the image features once, then feed back the
    # embedding of the argmax token at each step.
    features = encoder(images)
    states = None
    predicted_index = []
    lstm_inputs = features.unsqueeze(1)

    for _ in range(max_sequence_len):
        outputs, states = decoder.lstm(lstm_inputs, states)
        # Squeeze to 2-D so the linear scoring layer can consume it.
        outputs = outputs.squeeze(1)
        scores_per_batch = scores = decoder.score_layer(outputs)
        _, predicted = scores_per_batch.max(1)  # max values are unused
        predicted_index.append(predicted)
        lstm_inputs = decoder.embed(predicted)
        lstm_inputs = lstm_inputs.unsqueeze(1)

    predicted_index = torch.stack(predicted_index, dim=1)
    predicted_index = predicted_index.cpu().numpy()

    # Convert token indices to text: stop at <end>, skip <unk>/<start>.
    result_captions = []
    for wordindices in predicted_index:
        text = ""
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            text += word + " "
        result_captions.append(text)

    print("result_caption : ", result_captions)

    # 1. Translate captions to Korean.
    korean_sentences = []
    for sent in result_captions:
        translate_result = get_translate(sent)
        if translate_result != -1:  # -1 signals a failed translation
            translate_result = re.sub(r'\.', '', translate_result)
            korean_sentences.append(translate_result)
    print("result_korean : ", korean_sentences)

    # 2. Generate text with the requested KoGPT-2 checkpoint.
    kogpt2_config = get_kog_config()
    if model_type == "life":
        kogpt2_model_path = AI_directory_path + config['kogpt_life_model_path']
    elif model_type == "story":
        kogpt2_model_path = AI_directory_path + config['kogpt_story_model_path']
    else:
        kogpt2_model_path = AI_directory_path + config['kogpt_model_path']
    kogpt2_vocab_path = AI_directory_path + config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path,
                                           map_location=device))

    kogpt2model.to(device)
    kogpt2model.eval()
    # From here `vocab` is rebound to the GPT-2 sentencepiece vocabulary,
    # not the caption vocabulary used above.
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)

    korean_preprocess(korean_sentences)  # mutates the list in place
    gpt_result = naive_prediction(korean_sentences, tok, vocab, device,
                                  kogpt2model, model_type)
    korean_postprocess(gpt_result)
    result = []
    make_sentence(gpt_result, "", result, 0)
    # Longest sentences first; ties broken alphabetically.
    result.sort(key=lambda item: (-len(item), item))
    result_len = min(len(result), 11)
    # NOTE(review): slicing from 1 drops the single longest candidate —
    # presumably intentional (it tends to be a degenerate concatenation of
    # everything), but confirm against the API consumers.
    result = result[1:result_len]
    return result
# 예제 #4 (Example #4)
def kogpt_life_recursive_test():
    """Recursively extend Korean caption sentences with the KoGPT-2 life model.

    Each source sentence (with its sentence-ending characters trimmed) is
    appended to every candidate accumulated so far; the model then generates
    3 continuations per candidate, so the pool grows 3x per source sentence.
    """
    config = get_config()
    AI_directory_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    kogpt2_config = get_kog_config()
    # NOTE(review): hard-coded absolute checkpoint path — consider config.
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"
    kogpt2_vocab_path = AI_directory_path + config['kogpt_vocab_path']

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    # Fix: the original called torch.load() twice and discarded the first
    # result. map_location lets a CUDA-trained checkpoint load on CPU.
    kogpt2model.load_state_dict(
        torch.load(kogpt2_model_path, map_location=device))
    kogpt2model.to(device)
    kogpt2model.eval()

    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)

    korean_sentences = [
        '신랑 신부가 결혼식 파티 앞에서 사진을 찍기 위해 포즈를 취하고 있다.', '파란 셔츠를 입은 남자가 사다리에 서 있다.',
        '두 남자가 서 있다'
    ]
    kogpt_input_sentences = []
    for korean in korean_sentences:
        # Trim the sentence ending ('다.' style) so generation continues
        # mid-clause. The same trimming applies to both branches below
        # (the original duplicated this logic).
        korean_size = len(korean)
        if korean_size > 3:
            trimmed = korean[:-2]
        elif korean_size > 1:
            trimmed = korean[:-1]
        else:
            trimmed = korean

        if not kogpt_input_sentences:
            kogpt_input_sentences.append(trimmed)
        else:
            # Append the new prompt fragment to every candidate so far.
            for i in range(len(kogpt_input_sentences)):
                kogpt_input_sentences[i] += trimmed

        kogpt_output_sentences = []
        print(kogpt_input_sentences)
        expected_length = 50  # tokens to generate beyond the prompt
        for kogpt_input_sentence in kogpt_input_sentences:
            print(kogpt_input_sentence)
            toked = tok(kogpt_input_sentence)
            input_ids = torch.tensor([vocab[vocab.bos_token]] +
                                     vocab[toked]).unsqueeze(0)
            print(input_ids)
            input_ids = input_ids.to(device)
            input_length = input_ids.shape[1]
            outputs = kogpt2model.generate(input_ids=input_ids,
                                           max_length=input_length +
                                           expected_length,
                                           repetition_penalty=1.2,
                                           do_sample=True,
                                           num_beams=3,
                                           bos_token_id=0,
                                           pad_token_id=3,
                                           eos_token_id=1,
                                           num_return_sequences=3)
            for i in range(3):  # 3 output sequences per candidate
                decoded = vocab.to_tokens(outputs[i].squeeze().tolist())
                # Fix: dropped the trailing '|' (empty alternative) from the
                # original pattern — it matched the empty string at every
                # position and did nothing but slow the substitution down.
                ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '',
                             ''.join(decoded).replace('▁', ' ').strip())
                kogpt_output_sentences.append(ret)
        # Plain list copy suffices: strings are immutable, so the original
        # copy.deepcopy was needless work.
        kogpt_input_sentences = list(kogpt_output_sentences)
    print(kogpt_input_sentences)