def korean_gpt_long_setence_life_test():
    """Smoke-test long-sentence generation with the fine-tuned 'life' KoGPT2.

    Seeds the model with a short Korean prompt and prints the three sampled
    continuations. Side effects only (prints); returns None.
    """
    config = get_config()
    kogpt2_config = get_kog_config()
    # NOTE(review): hard-coded absolute Windows path — should come from config
    # like the vocab path below.
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"
    kogpt2_vocab_path = config['kogpt_vocab_path']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    # Fix: the original called torch.load() twice (first result discarded).
    # Load once, mapped onto the target device so CPU-only hosts also work.
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path, map_location=device))
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(
        kogpt2_vocab_path,
        mask_token=None, sep_token=None, cls_token=None,
        unknown_token='<unk>', padding_token='<pad>',
        bos_token='<s>', eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)
    sent = '나는 밥을 먹었'
    toked = tok(sent)
    print(toked)
    # Prepend BOS, wrap in a batch dimension of 1.
    input_ids = torch.tensor([vocab[vocab.bos_token], ] + vocab[toked]).unsqueeze(0)
    input_ids = input_ids.to(device)
    outputs = kogpt2model.generate(
        input_ids=input_ids, max_length=100, min_length=50,
        repetition_penalty=1.2, do_sample=True, num_beams=3,
        bos_token_id=0, pad_token_id=3, eos_token_id=1,
        num_return_sequences=3)
    print("========수필===========")
    for i in range(3):  # 3 output sequences were generated
        toked = vocab.to_tokens(outputs[i].squeeze().tolist())
        # Strip special tokens and the sentencepiece word-boundary marker '▁'.
        ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '',
                     ''.join(toked).replace('▁', ' ').strip())
        print('Generated {}: {}'.format(i, ret))
def fine_tuning(config, fine_tune_num, AI_DIRECTORY):
    """Fine-tune the pretrained KoGPT2 language model on the story dataset.

    Args:
        config: dict of hyper-parameters and artifact paths
            (kogpt_batch_size, kogpt_epoch, kogpt_model_path, ...).
        fine_tune_num: number of layers to freeze; only consumed by the
            currently disabled freeze_model call below.
        AI_DIRECTORY: root directory prepended to the relative config paths.

    Returns:
        (model, loss_record): the fine-tuned GPT2LMHeadModel and a list of
        mean losses sampled every `logging_steps` optimizer steps.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = config['kogpt_batch_size']
    train_path = AI_DIRECTORY + config['kogpt_story_train_data_path']
    num_train_epochs = config['kogpt_epoch']
    kogpt2_config = get_kog_config()
    kogpt2_model_path = AI_DIRECTORY + config['kogpt_model_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    # Fix: map the checkpoint onto the target device so a GPU-saved
    # checkpoint also loads on CPU-only machines.
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path, map_location=device))
    kogpt2model.to(device)
    # (Removed unused vocab/tokenizer loading — neither was referenced below.)
    loader = make_kogpt2_loader(train_path, batch_size)
    num_training_steps = len(loader) * num_train_epochs
    learning_rate = 5e-6
    adam_epsilon = 1e-8
    warmup_steps = 0
    no_decay = ["bias", "LayerNorm.weight"]
    #freeze_model(fine_tune_num,kogpt2model)
    # NOTE(review): both groups use weight_decay=0.0, so the no_decay split is
    # currently a no-op; raise the first group's value to enable weight decay.
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in kogpt2model.named_parameters()
                       if not any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in kogpt2model.named_parameters()
                       if any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps,
        num_training_steps=num_training_steps)
    global_step = 0
    tr_loss = 0.0
    logging_loss = 0.0
    logging_steps = 500
    loss_record = []
    kogpt2model.zero_grad()
    train_iterator = trange(0, int(num_train_epochs), desc="Epoch")
    for epoch in train_iterator:
        epoch_iterator = tqdm(loader, desc="Iteration")
        for step, inputs in enumerate(epoch_iterator):
            kogpt2model.train()
            # Fix: transfer the batch to the device once (the original called
            # inputs.to(device) twice) and avoid shadowing builtin input().
            batch = inputs.to(device)
            # Causal LM objective: labels are the inputs; the model shifts
            # them internally to compute next-token loss.
            outputs = kogpt2model(input_ids=batch, labels=batch)
            loss = outputs[0]  # transformers models return a tuple; loss first
            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
            scheduler.step()
            kogpt2model.zero_grad()
            global_step += 1
            if logging_steps > 0 and global_step % logging_steps == 0:
                loss_scalar = (tr_loss - logging_loss) / logging_steps
                logs = {
                    "learning_rate": scheduler.get_last_lr()[0],
                    "loss": loss_scalar,
                }
                loss_record.append(loss_scalar)
                logging_loss = tr_loss
                epoch_iterator.write(json.dumps({**logs, "step": global_step}))
    return kogpt2model, loss_record
def predict(images,root_path,AI_directory_path,model_type="life"): config = get_config() #0. Extract captions from images vocab = load_voca(AI_directory_path+config['caption_vocab_path']) caption_embed_size = config['caption_embed_size'] caption_hidden_layer = config['caption_hidden_layer'] caption_hidden_size = config['caption_hidden_size'] caption_encoder_path = AI_directory_path+config['caption_encoder_path'] caption_decoder_path = AI_directory_path+config['caption_decoder_path'] device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") max_sequence_len = 30 #default value transform = torch_transform.Compose([ torch_transform.ToTensor(), torch_transform.Normalize(mean=(0.4444, 0.4215, 0.3833), std=(0.2738, 0.2664, 0.2766))]) encoder = EncoderCNN(caption_embed_size) decoder = DecoderRNN(caption_embed_size, len(vocab), caption_hidden_layer, caption_hidden_size) encoder.load_state_dict(torch.load(caption_encoder_path,map_location=device)) decoder.load_state_dict(torch.load(caption_decoder_path,map_location=device)) images = load_image(images, root_path, transform) encoder.eval() decoder.eval() encoder.to(device) decoder.to(device) images = images.to(device) features = encoder(images) states = None predicted_index = [] lstm_inputs = features.unsqueeze(1) for i in range(max_sequence_len): outputs,states = decoder.lstm(lstm_inputs,states) # outputs을 linear 레이어의 인풋을 위해 2차원 배열로 만들어 줘야함 outputs = outputs.squeeze(1) scores_per_batch = decoder.score_layer(outputs) values, predicted = scores_per_batch.max(1) predicted_index.append(predicted) lstm_inputs = decoder.embed(predicted) lstm_inputs = lstm_inputs.unsqueeze(1) predicted_index = torch.stack(predicted_index,dim=1) predicted_index = predicted_index.cpu().numpy() result_captions = [] for wordindices in predicted_index: text ="" for index in wordindices: word = vocab.idx2word[index] if word == '<end>': break if word == '<unk>' or word == '<start>': continue text += word + " " result_captions.append(text) 
print("result_caption : ",result_captions) # 1. translate captions to korean korean_sentences = [] for sent in result_captions: translate_result = get_translate(sent) if translate_result != -1: translate_result = re.sub(r'\.','',translate_result) korean_sentences.append(translate_result) print("result_korean : ",korean_sentences) kogpt2_config = get_kog_config() if model_type == "life": kogpt2_model_path = AI_directory_path+config['kogpt_life_model_path'] elif model_type == "story": kogpt2_model_path = AI_directory_path + config['kogpt_story_model_path'] else: kogpt2_model_path = AI_directory_path+config['kogpt_model_path'] kogpt2_vocab_path = AI_directory_path+config['kogpt_vocab_path'] kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config)) kogpt2model.load_state_dict(torch.load(kogpt2_model_path,map_location=device)) kogpt2model.to(device) kogpt2model.eval() vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path, mask_token=None, sep_token=None, cls_token=None, unknown_token='<unk>', padding_token='<pad>', bos_token='<s>', eos_token='</s>') tok = SentencepieceTokenizer(kogpt2_vocab_path) korean_preprocess(korean_sentences) gpt_result = naive_prediction(korean_sentences,tok,vocab,device,kogpt2model,model_type) korean_postprocess(gpt_result) result = [] make_sentence(gpt_result,"",result,0) result.sort(key=lambda item: (-len(item),item)) result_len = len(result) if result_len >11: result_len = 11 result = result[1:result_len] return result
def kogpt_life_recursive_test():
    """Smoke-test recursive generation with the fine-tuned 'life' KoGPT2.

    Builds prompts by trimming the endings off Korean caption sentences,
    generates 3 continuations per prompt, and prints the results.
    Side effects only (prints); returns None.
    """
    config = get_config()
    AI_directory_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    kogpt2_config = get_kog_config()
    # NOTE(review): hard-coded absolute Windows checkpoint path.
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"
    kogpt2_vocab_path = AI_directory_path + config['kogpt_vocab_path']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    # Fix: the original called torch.load() twice (first result discarded).
    # Load once, mapped onto the target device so CPU-only hosts also work.
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path, map_location=device))
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(
        kogpt2_vocab_path,
        mask_token=None, sep_token=None, cls_token=None,
        unknown_token='<unk>', padding_token='<pad>',
        bos_token='<s>', eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)
    korean_sentences = [
        '신랑 신부가 결혼식 파티 앞에서 사진을 찍기 위해 포즈를 취하고 있다.',
        '파란 셔츠를 입은 남자가 사다리에 서 있다.',
        '두 남자가 서 있다'
    ]
    # Build a single growing prompt: the first sentence seeds the list, each
    # later sentence is appended to every existing prompt. Endings are trimmed
    # (2 chars for longer sentences, 1 for short ones) so the model continues
    # them mid-phrase. (Fix: removed a duplicated len() call in the original.)
    kogpt_input_sentences = []
    for korean in korean_sentences:
        korean_size = len(korean)
        if korean_size > 3:
            trimmed = korean[:-2]
        elif korean_size > 1:
            trimmed = korean[:-1]
        else:
            trimmed = korean
        if not kogpt_input_sentences:
            kogpt_input_sentences.append(trimmed)
        else:
            for i in range(len(kogpt_input_sentences)):
                kogpt_input_sentences[i] += trimmed
    kogpt_output_sentences = []
    print(kogpt_input_sentences)
    expected_length = 50
    for kogpt_input_sentence in kogpt_input_sentences:
        print(kogpt_input_sentence)
        toked = tok(kogpt_input_sentence)
        # Prepend BOS, wrap in a batch dimension of 1.
        input_ids = torch.tensor([vocab[vocab.bos_token], ] + vocab[toked]).unsqueeze(0)
        print(input_ids)
        input_ids = input_ids.to(device)
        input_length = input_ids.shape[1]
        outputs = kogpt2model.generate(
            input_ids=input_ids,
            max_length=input_length + expected_length,
            repetition_penalty=1.2, do_sample=True, num_beams=3,
            bos_token_id=0, pad_token_id=3, eos_token_id=1,
            num_return_sequences=3)
        for i in range(3):  # 3 output sequences were generated
            toked = vocab.to_tokens(outputs[i].squeeze().tolist())
            # Fix: dropped the trailing empty alternative ('|<unk>|)') that
            # made the pattern match the empty string at every position.
            ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '',
                         ''.join(toked).replace('▁', ' ').strip())
            kogpt_output_sentences.append(ret)
    kogpt_input_sentences = copy.deepcopy(kogpt_output_sentences)
    print(kogpt_input_sentences)