def main(parser):
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    with open(data_config.token2idx_vocab, mode='rb') as io:
        token2idx_vocab = json.load(io)
        print("token2idx_vocab: ", token2idx_vocab)
    vocab = Vocabulary(token2idx=token2idx_vocab)
    tokenizer = Tokenizer(vocab=vocab, split_fn=mecab_token_pos_flat_fn,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    model_config.vocab_size = len(vocab.token2idx)

    # Model
    model = Transformer(config=model_config, vocab=vocab)
    checkpoint_manager = CheckpointManager(model_dir)  # experiments/base_model
    checkpoint = checkpoint_manager.load_checkpoint('best.tar')
    model.load_state_dict(checkpoint['model_state_dict'])

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.eval()

    while True:
        input_text = input("문장을 입력하세요: ")  # "Enter a sentence: "
        enc_input = torch.tensor(tokenizer.list_of_string_to_arr_of_pad_token_ids([input_text]))
        dec_input = torch.tensor([[vocab.token2idx[vocab.START_TOKEN]]])

        # greedy decoding: feed the growing decoder input back in until END_TOKEN or maxlen
        for i in range(model_config.maxlen):
            y_pred = model(enc_input.to(device), dec_input.to(device))
            y_pred_ids = y_pred.max(dim=-1)[1]
            if (y_pred_ids[0, -1] == vocab.token2idx[vocab.END_TOKEN]).to(torch.device('cpu')).numpy():
                decoding_from_result(enc_input=enc_input, y_pred=y_pred, tokenizer=tokenizer)
                break
            # decoding_from_result(enc_input, y_pred, tokenizer)
            dec_input = torch.cat(
                [dec_input.to(torch.device('cpu')),
                 y_pred_ids[0, -1].unsqueeze(0).unsqueeze(0).to(torch.device('cpu'))],
                dim=-1)
            if i == model_config.maxlen - 1:
                decoding_from_result(enc_input=enc_input, y_pred=y_pred, tokenizer=tokenizer)
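# A tiny standalone illustration (toy numbers, not repo code) of the greedy step
# used above: y_pred.max(dim=-1)[1] takes the argmax over the vocabulary axis,
# turning (batch, seq_len, vocab_size) logits into (batch, seq_len) token ids;
# only the last position's id gets appended to the decoder input.
import torch

y_pred = torch.tensor([[[0.1, 2.0, 0.3],    # step-1 logits over a 3-word vocab
                        [0.9, 0.2, 1.5]]])  # step-2 logits
y_pred_ids = y_pred.max(dim=-1)[1]
print(y_pred_ids)          # tensor([[1, 2]])
print(y_pred_ids[0, -1])   # tensor(2) -> the next decoder input token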
def main():
    cur_path = os.path.dirname(sys.argv[0])
    if cur_path:
        os.chdir(cur_path)
    model_dir = Path('./experiments/base_model_with_crf')
    model_config = Config(json_path=model_dir / 'config.json')

    # load vocab & tokenizer
    tok_path = "./ptr_lm_model/tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

    # load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # model
    model = KobertCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

    # load trained weights; strip the "module." prefix that DataParallel adds
    model_dict = model.state_dict()
    checkpoint = torch.load("./model.bin", map_location=torch.device('cpu'))
    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not in model_dict".format(new_key_name), file=sys.stderr)
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys, strict=False)
    model.eval()
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # model.to(device)

    decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer, index_to_ner=index_to_ner)
    try:
        while True:
            input_text = input()
            list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
            x_input = torch.tensor(list_of_input_ids).long()
            list_of_pred_ids = model(x_input)
            list_of_ner_word = decoder_from_res(list_of_input_ids=list_of_input_ids,
                                                list_of_pred_ids=list_of_pred_ids)
            if list_of_ner_word:
                print(",".join(list_of_ner_word))
            else:
                print("/")
    except EOFError:
        print("EOF", file=sys.stderr)
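# A minimal, self-contained sketch (not from the repo) of the "module." key
# fix-up above: checkpoints saved from torch.nn.DataParallel prefix every
# parameter name with "module.", so the keys must be renamed before loading
# into a bare (non-parallel) model.
import torch
import torch.nn as nn

net = nn.Linear(4, 2)
parallel_state = {"module." + k: v for k, v in net.state_dict().items()}  # simulate a DataParallel checkpoint
restored = {k.replace("module.", "", 1): v for k, v in parallel_state.items()}
net.load_state_dict(restored)  # loads cleanly once the prefix is stripped
print(sorted(restored))        # ['bias', 'weight']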
def __init__(self, config: Config, vocab: Vocabulary, state_dict=None):
    self.seq2seq = TransformerNet(config=config, vocab=vocab)
    self.config = config
    self.vocab = vocab
    self.tokenizer = Tokenizer(vocab=vocab, split_fn=mecab_token_pos_flat_fn,
                               pad_fn=keras_pad_fn, maxlen=config.maxlen)
    self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    if state_dict is not None:
        self.seq2seq.load_state_dict(state_dict)
    self.seq2seq.to(self.device)
    self.learning_rate = config.learning_rate
def post():
    value = request.form['input']
    model_dir = Path('./experiments/base_model_with_crf')
    model_config = Config(json_path=model_dir / 'config.json')

    # load vocab & tokenizer
    tok_path = "ptr_lm_model/tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

    # load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # model
    model = KobertCRFViz(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

    # load trained weights; strip the "module." prefix added by DataParallel
    model_dict = model.state_dict()
    checkpoint = torch.load(
        "./experiments/base_model_with_crf/best-epoch-16-step-1500-acc-0.993.bin",
        map_location=torch.device('cpu'))
    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not in model_dict".format(new_key_name))
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys)
    model.eval()
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer, index_to_ner=index_to_ner)

    input_text = value
    list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
    x_input = torch.tensor(list_of_input_ids).long()
    list_of_pred_ids, _ = model(x_input)
    list_of_ner_word, decoding_ner_sentence = decoder_from_res(
        list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids)
    return {'word': list_of_ner_word, 'decoding': decoding_ner_sentence}
def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):
    if vocab is None or tokenizer is None:
        tok_path = get_tokenizer()
        self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
        self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
        _, vocab_of_gluonnlp = get_pytorch_kobert_model()
        token2idx = vocab_of_gluonnlp.token_to_idx
        self.vocab = Vocabulary(token2idx=token2idx)
        self.tokenizer = Tokenizer(vocab=self.vocab, split_fn=self.ptr_tokenizer,
                                   pad_fn=keras_pad_fn, maxlen=maxlen)
    else:
        self.vocab = vocab
        self.tokenizer = tokenizer
    self.maxlen = maxlen
    self.model_dir = model_dir
def get_tokenizer(cls):
    # lazily build the tokenizer once and cache it on the class
    if cls.tokenizer is None:
        tok_path = "./tokenizer_78b3253a26.model"
        ptr_tokenizer = SentencepieceTokenizer(tok_path)
        cls.tokenizer = Tokenizer(vocab=cls.vocab, split_fn=ptr_tokenizer,
                                  pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    return cls.tokenizer
def load_generator(args):
    # load the pretrained generator
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    checkpoint_manager = CheckpointManager(model_dir)  # experiments/base_model
    checkpoint = checkpoint_manager.load_checkpoint('best.tar')

    with open(data_config.token2idx_vocab, mode='rb') as io:
        token2idx_vocab = json.load(io)
        print("token2idx_vocab: ", token2idx_vocab)
    vocab = Vocabulary(token2idx=token2idx_vocab)
    model_config.vocab_size = len(vocab.token2idx)
    tokenizer = Tokenizer(vocab=vocab, split_fn=mecab_token_pos_flat_fn,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    # loss_fn = nn.CrossEntropyLoss(ignore_index=vocab.PAD_ID)
    return Generator(model_config, vocab, checkpoint['model_state_dict']), tokenizer, vocab.PAD_ID, checkpoint_manager
def main(parser):
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    # data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    tok_path = get_tokenizer()  # ./tokenizer_78b3253a26.model
    ptr_tokenizer = SentencepieceTokenizer(tok_path)
    _, vocab_of_gluonnlp = get_pytorch_kobert_model()
    token_to_idx = vocab_of_gluonnlp.token_to_idx
    model_config.vocab_size = len(token_to_idx)
    vocab = Vocabulary(token_to_idx=token_to_idx)
    print("len(token_to_idx): ", len(token_to_idx))

    with open(model_dir / "token2idx_vocab.json", 'w', encoding='utf-8') as f:
        json.dump(token_to_idx, f, ensure_ascii=False, indent=4)

    # save vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'wb') as f:
        pickle.dump(vocab, f)
    # load vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)

    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    ner_formatter = NamedEntityRecognitionFormatter(vocab=vocab, tokenizer=tokenizer,
                                                    maxlen=model_config.maxlen, model_dir=model_dir)

    # Train & Val Datasets
    cwd = Path.cwd()
    data_in = cwd / "data_in"
    train_data_dir = data_in / "NER-master" / "말뭉치 - 형태소_개체명"
    tr_clf_ds = NamedEntityRecognitionDataset(train_data_dir=train_data_dir, model_dir=model_dir)
    tr_clf_ds.set_transform_fn(transform_source_fn=ner_formatter.transform_source_fn,
                               transform_target_fn=ner_formatter.transform_target_fn)
    tr_clf_dl = DataLoader(tr_clf_ds, batch_size=model_config.batch_size,
                           shuffle=True, num_workers=4, drop_last=False)

    # Model
    model = KobertCRF(config=model_config, num_classes=len(tr_clf_ds.ner_to_index))
    model.train()

    # optim: no weight decay for bias and LayerNorm parameters
    train_examples_len = len(tr_clf_ds)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    # num_train_optimization_steps = int(train_examples_len / model_config.batch_size / model_config.gradient_accumulation_steps) * model_config.epochs
    t_total = len(tr_clf_dl) // model_config.gradient_accumulation_steps * model_config.epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=model_config.learning_rate, eps=model_config.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=model_config.warmup_steps, t_total=t_total)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    n_gpu = torch.cuda.device_count()
    # if n_gpu > 1:
    #     model = torch.nn.DataParallel(model)
    model.to(device)

    # save
    tb_writer = SummaryWriter('{}/runs'.format(model_dir))
    checkpoint_manager = CheckpointManager(model_dir)
    summary_manager = SummaryManager(model_dir)
    best_val_loss = 1e+10
    best_train_acc = 0

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(tr_clf_ds))
    logger.info("  Num Epochs = %d", model_config.epochs)
    logger.info("  Instantaneous batch size per GPU = %d", model_config.batch_size)
    # logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
    #             args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", model_config.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_dev_acc, best_dev_loss = 0.0, 99999999999.0
    best_steps = 0
    model.zero_grad()
    set_seed()  # added here for reproducibility (even between python 2 and 3)

    # Train
    train_iterator = trange(int(model_config.epochs), desc="Epoch")
    for _epoch, _ in enumerate(train_iterator):
        epoch_iterator = tqdm(tr_clf_dl, desc="Iteration")  # , disable=args.local_rank not in [-1, 0]
        epoch = _epoch
        for step, batch in enumerate(epoch_iterator):
            model.train()
            x_input, token_type_ids, y_real = map(lambda elm: elm.to(device), batch)
            log_likelihood, sequence_of_tags = model(x_input, token_type_ids, y_real)

            # loss: negative log-likelihood
            loss = -1 * log_likelihood

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if model_config.gradient_accumulation_steps > 1:
                loss = loss / model_config.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), model_config.max_grad_norm)
            tr_loss += loss.item()

            if (step + 1) % model_config.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # update learning rate schedule
                model.zero_grad()
                global_step += 1

                with torch.no_grad():
                    sequence_of_tags = torch.tensor(sequence_of_tags)
                    print("sequence_of_tags: ", sequence_of_tags)
                    print("y_real: ", y_real)
                    print("loss: ", loss)
                    print("(sequence_of_tags == y_real): ", (sequence_of_tags == y_real))

                    # token-level accuracy, ignoring padding positions
                    mb_acc = (sequence_of_tags == y_real).float()[y_real != vocab.PAD_ID].mean()

                tr_acc = mb_acc.item()
                tr_loss_avg = tr_loss / global_step
                tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}

                # if step % 50 == 0:
                print('epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'.format(
                    epoch + 1, global_step, tr_summary['loss'], tr_summary['acc']))

                if model_config.logging_steps > 0 and global_step % model_config.logging_steps == 0:
                    # Log metrics
                    if model_config.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        pass
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / model_config.logging_steps, global_step)
                    logger.info("Average loss: %s at global step: %s",
                                str((tr_loss - logging_loss) / model_config.logging_steps), str(global_step))
                    logging_loss = tr_loss

                if model_config.save_steps > 0 and global_step % model_config.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(model_config.output_dir, 'epoch-{}'.format(epoch + 1))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)

                    state = {'global_step': global_step + 1,
                             'model_state_dict': model.state_dict(),
                             'opt_state_dict': optimizer.state_dict()}
                    summary = {'train': tr_summary}
                    summary_manager.update(summary)
                    summary_manager.save('summary.json')

                    # based on train acc (ideally this should use val_acc, not train_acc)
                    is_best = tr_acc >= best_train_acc

                    # Save
                    if is_best:
                        best_train_acc = tr_acc
                        checkpoint_manager.save_checkpoint(
                            state, 'best-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1, global_step, tr_acc))
                    else:
                        torch.save(state, os.path.join(
                            output_dir, 'model-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1, global_step, tr_acc)))

    tb_writer.close()
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss / global_step)

    return global_step, tr_loss / global_step, best_steps
class NamedEntityRecognitionFormatter():
    """ NER formatter class """

    def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):
        if vocab is None or tokenizer is None:
            tok_path = get_tokenizer()
            self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
            self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
            _, vocab_of_gluonnlp = get_pytorch_kobert_model()
            token2idx = vocab_of_gluonnlp.token_to_idx
            self.vocab = Vocabulary(token2idx=token2idx)
            self.tokenizer = Tokenizer(vocab=self.vocab, split_fn=self.ptr_tokenizer,
                                       pad_fn=keras_pad_fn, maxlen=maxlen)
        else:
            self.vocab = vocab
            self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.model_dir = model_dir

    def transform_source_fn(self, text):
        # e.g. text = "첫 회를 시작으로 13일까지 4일간 총 4회에 걸쳐 매 회 2편씩 총 8편이 공개될 예정이다."
        #      label_text = "첫 회를 시작으로 <13일:DAT>까지 <4일간:DUR> 총 <4회:NOH>에 걸쳐 매 회 <2편:NOH>씩 총 <8편:NOH>이 공개될 예정이다."
        # e.g. text = "트래버 모리슨 학장은 로스쿨 학생과 교직원이 바라라 전 검사의 사법정의에 대한 깊이 있는 지식과 경험으로부터 많은 것을 배울 수 있을 것이라고 말했다."
        #      label_text = "<트래버 모리슨:PER> 학장은 로스쿨 학생과 교직원이 <바라라:PER> 전 검사의 사법정의에 대한 깊이 있는 지식과 경험으로부터 많은 것을 배울 수 있을 것이라고 말했다."
        tokens = self.tokenizer.split(text)
        token_ids_with_cls_sep = self.tokenizer.list_of_string_to_arr_of_cls_sep_pad_token_ids([text])

        # save the character start index of each token so entity labels can be
        # matched to positions in the token sequence
        prefix_sum_of_token_start_index = []
        sum = 0
        for i, token in enumerate(tokens):
            if i == 0:
                prefix_sum_of_token_start_index.append(0)
                sum += len(token) - 1
            else:
                prefix_sum_of_token_start_index.append(sum)
                sum += len(token)
        return token_ids_with_cls_sep, tokens, prefix_sum_of_token_start_index

    def transform_target_fn(self, label_text, tokens, prefix_sum_of_token_start_index):
        regex_ner = re.compile('<(.+?):[A-Z]{3}>')
        regex_filter_res = regex_ner.finditer(label_text)

        list_of_ner_tag = []
        list_of_ner_text = []
        list_of_tuple_ner_start_end = []

        count_of_match = 0
        for match_item in regex_filter_res:
            ner_tag = match_item[0][-4:-1]  # <4일간:DUR> -> DUR
            ner_text = match_item[1]        # <4일간:DUR> -> 4일간
            # subtract the length of the preceding '<', ':', 3-letter tag name, '>' markup
            start_index = match_item.start() - 6 * count_of_match
            end_index = match_item.end() - 6 - 6 * count_of_match

            list_of_ner_tag.append(ner_tag)
            list_of_ner_text.append(ner_text)
            list_of_tuple_ner_start_end.append((start_index, end_index))
            count_of_match += 1

        list_of_ner_label = []
        entity_index = 0
        is_entity_still_B = True
        for tup in zip(tokens, prefix_sum_of_token_start_index):
            token, index = tup

            # beware: '▁' (the SentencePiece word-boundary marker) is a
            # different character from the ordinary underscore '_'
            if '▁' in token:
                index += 1  # token carries a leading space marker, so pull the index forward by one
                # ('▁13', 9) -> ('13', 10)

            if entity_index < len(list_of_tuple_ner_start_end):
                start, end = list_of_tuple_ner_start_end[entity_index]

                if end < index:  # current position is past this entity's span, so advance to the next entity
                    is_entity_still_B = True
                    entity_index = entity_index + 1 if entity_index + 1 < len(list_of_tuple_ner_start_end) else entity_index
                    start, end = list_of_tuple_ner_start_end[entity_index]

                # e.g. <13일:DAT>까지 -> ('▁13', 10, 'B-DAT') ('일까지', 12, 'I-DAT') is included;
                # to exclude the trailing characters, token lengths would also have to be considered
                if start <= index and index < end:
                    entity_tag = list_of_ner_tag[entity_index]
                    if is_entity_still_B is True:
                        entity_tag = 'B-' + entity_tag
                        list_of_ner_label.append(entity_tag)
                        is_entity_still_B = False
                    else:
                        entity_tag = 'I-' + entity_tag
                        list_of_ner_label.append(entity_tag)
                else:
                    is_entity_still_B = True
                    entity_tag = 'O'
                    list_of_ner_label.append(entity_tag)
            else:
                entity_tag = 'O'
                list_of_ner_label.append(entity_tag)

            # print((token, index, entity_tag), end=' ')

        with open(self.model_dir / "ner_to_index.json", 'rb') as f:
            self.ner_to_index = json.load(f)
        # ner_str -> ner_ids -> cls + ner_ids + sep -> cls + ner_ids + sep + pad + pad .. + pad
        list_of_ner_ids = [self.ner_to_index['[CLS]']] + [
            self.ner_to_index[ner_tag] for ner_tag in list_of_ner_label
        ] + [self.ner_to_index['[SEP]']]
        list_of_ner_ids = self.tokenizer._pad([list_of_ner_ids], pad_id=self.vocab.PAD_ID, maxlen=self.maxlen)[0]

        return list_of_ner_ids, list_of_ner_label
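# A small standalone illustration (hypothetical tokens, not repo code) of how
# prefix_sum_of_token_start_index lines tokens up with character offsets in the
# original sentence: the '▁' marker of the first token is not counted, and each
# later token starts where the previous one ended. Combined with the +1
# adjustment for '▁'-prefixed tokens above, ('▁13', 9) maps to ('13', 10).
tokens = ['▁첫', '▁회를', '▁시작으로', '▁13', '일까지']
prefix_sum, total = [], 0
for i, token in enumerate(tokens):
    if i == 0:
        prefix_sum.append(0)
        total += len(token) - 1  # drop the '▁' marker of the first token
    else:
        prefix_sum.append(total)
        total += len(token)
print(list(zip(tokens, prefix_sum)))
# [('▁첫', 0), ('▁회를', 1), ('▁시작으로', 4), ('▁13', 9), ('일까지', 12)]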
def transformation():
    # Do an inference on a single batch of data
    data = None

    # 1) INPUT: convert Korean text input to NER code array
    if flask.request.content_type == 'text/plain':
        '''CHECK file locations'''
        model_config = Config(json_path="config.json")
        tok_path = "./tokenizer_78b3253a26.model"
        ptr_tokenizer = SentencepieceTokenizer(tok_path)
        with open("vocab.pkl", 'rb') as f:
            vocab = pickle.load(f)
        tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                              pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
        with open("ner_to_index.json", 'rb') as f:
            ner_to_index = json.load(f)
            index_to_ner = {v: k for k, v in ner_to_index.items()}
        decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer, index_to_ner=index_to_ner)

        f = flask.request.get_data()
        # ftype = str(type(f))
        string_f = f.decode("utf-8")
        lines = string_f.splitlines(True)

        with open("result.txt", 'w', encoding='utf-8-sig') as w:
            # w.write('start\n')
            # w.write(ftype)
            # w.write('\nand\n')
            # w.write(string_f)
            # w.write('\nend\n')
            index = 0
            for i in range(len(lines)):
                input_text = ''
                if i % 4 == 1:
                    input_text = lines[i][3:]
                    addInfo = lines[i + 1][3:]
                if input_text == '':
                    continue
                index += 1
                # print("\n## " + str(index) + "\n")

                list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
                x_input = torch.tensor(list_of_input_ids).long()
                w.write('## ' + str(index) + '\n')
                w.write(addInfo)
                # w.write('\n' + str(list_of_input_ids))
                predictions = run_inference_for_single_data(list_of_input_ids[0], ModelHandler.get_model())

                # 2) OUTPUT: convert NER code to Korean text (FILE)
                emission = torch.tensor(predictions['output'])
                num_classes = len(ner_to_index)
                crf = CRF(num_tags=num_classes, batch_first=True)

                # rearrange tag sequences
                list_of_pred_ids = crf.decode(emission)

                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids, unkTokenList=False)
                unkTokenList = makeUNKTokenList(input_text, input_token)
                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids, unkTokenList=unkTokenList)

                w.write(str(list_of_ner_word) + '\n')
                w.write(str(decoding_ner_sentence[6:-5]) + '\n')

        return flask.Response(response=open("result.txt", 'r', encoding='utf-8-sig'),
                              status=200, mimetype='text/plain')
    else:
        return flask.Response(response='This predictor only supports TEXT data',
                              status=415, mimetype='text/plain')
def main(parser):
    args = parser.parse_args()
    model_dir = Path(args.model_dir)
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    # tok_path = get_tokenizer()  # ./tokenizer_78b3253a26.model
    tok_path = "./tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    # load vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

    # load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # Model
    # model = KobertSequenceFeatureExtractor(config=model_config, num_classes=len(ner_to_index))
    model = KobertCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)
    # model = KobertBiLSTMCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)
    # model = KobertBiGRUCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

    # load trained weights; strip the "module." prefix added by DataParallel
    model_dict = model.state_dict()
    # checkpoint = torch.load("./experiments/base_model/best-epoch-9-step-600-acc-0.845.bin", map_location=torch.device('cpu'))
    checkpoint = torch.load("./experiments/base_model_with_crf/best-epoch-16-step-1500-acc-0.993.bin",
                            map_location=torch.device('cpu'))
    # checkpoint = torch.load("./experiments/base_model_with_bilstm_crf/best-epoch-15-step-2750-acc-0.992.bin", map_location=torch.device('cpu'))
    # checkpoint = torch.load("./experiments/base_model_with_bigru_crf/model-epoch-18-step-3250-acc-0.997.bin", map_location=torch.device('cpu'))

    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not in model_dict".format(new_key_name))
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys)
    model.eval()
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # n_gpu = torch.cuda.device_count()
    # if n_gpu > 1:
    #     model = torch.nn.DataParallel(model)
    model.to(device)

    decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer, index_to_ner=index_to_ner)

    while True:
        input_text = input("문장을 입력하세요: ")  # "Enter a sentence: "
        list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
        x_input = torch.tensor(list_of_input_ids).long()

        ## for bert alone
        # y_pred = model(x_input)
        # list_of_pred_ids = y_pred.max(dim=-1)[1].tolist()
        ## for bert crf
        list_of_pred_ids = model(x_input)
        ## for bert bilstm crf & bert bigru crf
        # list_of_pred_ids = model(x_input, using_pack_sequence=False)

        list_of_ner_word, decoding_ner_sentence = decoder_from_res(
            list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids)
        print("list_of_ner_word:", list_of_ner_word)
        print("decoding_ner_sentence:", decoding_ner_sentence)
class Generator():
    def __init__(self, config: Config, vocab: Vocabulary, state_dict=None):
        self.seq2seq = TransformerNet(config=config, vocab=vocab)
        self.config = config
        self.vocab = vocab
        self.tokenizer = Tokenizer(vocab=vocab, split_fn=mecab_token_pos_flat_fn,
                                   pad_fn=keras_pad_fn, maxlen=config.maxlen)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        if state_dict is not None:
            self.seq2seq.load_state_dict(state_dict)
        self.seq2seq.to(self.device)
        self.learning_rate = config.learning_rate

    def switch_mode(self, mode='eval'):
        if mode == 'eval':
            self.seq2seq.eval()
        else:
            self.seq2seq.train()

    def parameters(self):
        return self.seq2seq.parameters()

    def get_state_dict(self):
        state_dict = self.seq2seq.to(torch.device('cpu')).state_dict()
        self.seq2seq.to(self.device)
        return state_dict

    def eval(self):
        self.seq2seq.eval()

    def train(self):
        self.seq2seq.train()

    def gen_output_with_ids(self, input_ids):
        # batched greedy decoding: start every row with START_TOKEN
        # dec_input = torch.tensor([[self.vocab.token2idx[self.vocab.START_TOKEN]]])
        dec_input = torch.full((input_ids.shape[0], 1), self.vocab.token2idx[self.vocab.START_TOKEN]).long()
        # print(dec_input)
        for i in range(self.config.maxlen):
            # y_pred = self.seq2seq(input_ids, dec_input)
            y_pred = self.seq2seq(input_ids.to(self.device), dec_input.to(self.device))
            y_pred_ids = y_pred.max(dim=-1)[1]

            # if (y_pred_ids[0, -1] == self.vocab.token2idx[self.vocab.END_TOKEN]).to(torch.device('cpu')).numpy():
            #     # fill the remaining positions with PAD_ID
            #     fill_pad = torch.full((input_ids.shape[0], self.config.maxlen - dec_input.shape[1]),
            #                           self.vocab.PAD_ID).long().to(self.device)
            #     y_pred_ids = torch.cat([y_pred_ids, fill_pad], dim=1)
            #     break

            # fill positions after self.vocab.END_TOKEN with pad_id
            # end_indices = (y_pred_ids == self.vocab.token2idx[self.vocab.END_TOKEN]).nonzero()
            # for val in end_indices:
            #     fill_pad = torch.full((1, self.config.maxlen - val[1]), self.vocab.PAD_ID).long().to(self.device)
            #     y_pred_ids[val[0]] = torch.cat([y_pred_ids[val[0], -1], fill_pad], dim=1)

            # dec_input = torch.cat((dec_input.to(torch.device('cpu')), y_pred_ids[:, -1].view(-1, 1).to(torch.device('cpu'))), dim=1)
            dec_input = torch.cat((dec_input.to(self.device), y_pred_ids[:, -1].view(-1, 1)), dim=1)
            if i == self.config.maxlen - 1:
                # output_str = decoding_from_result(enc_input=enc_input, y_pred=y_pred, tokenizer=self.tokenizer)
                break

        # fill positions after self.vocab.END_TOKEN with pad_id
        # end_indices = (y_pred_ids == self.vocab.token2idx[self.vocab.END_TOKEN]).nonzero()
        # 1. find the earliest end token in each row
        # end_tokens = []
        # last_r = -1
        # last_col = -1
        # for val in end_indices:
        #     if val[0] > last_r:
        #         last_r = val[0]
        #         last_col = 500
        #     if val[1] < last_col:
        #         last_col = val[1]
        #         end_tokens.append([last_r.cpu().tolist(), last_col.cpu().tolist()])
        #     continue
        # for item in end_tokens:
        #     fill_pad = torch.full((1, self.config.maxlen - item[1]), self.vocab.PAD_ID).long().to(self.device)
        #     y_pred_ids[item[0]] = torch.cat([y_pred_ids[item[0]][0:item[1]], fill_pad], dim=1)
        return y_pred_ids, y_pred

    def is_end_token(self, token):
        if (self.vocab.token2idx[self.vocab.END_TOKEN] == token).cpu().numpy():
            return True
        return False

    def gen_output(self, input_text):
        enc_input = torch.tensor(self.tokenizer.list_of_string_to_arr_of_pad_token_ids([input_text]))
        dec_input = torch.tensor([[self.vocab.token2idx[self.vocab.START_TOKEN]]])
        output_str = ''
        for i in range(self.config.maxlen):
            y_pred = self.seq2seq(enc_input.to(self.device), dec_input.to(self.device))
            y_pred_ids = y_pred.max(dim=-1)[1]
            if (y_pred_ids[0, -1] == self.vocab.token2idx[self.vocab.END_TOKEN]).to(torch.device('cpu')).numpy():
                output_str = decoding_from_result(enc_input=enc_input, y_pred=y_pred, tokenizer=self.tokenizer)
                break
            dec_input = torch.cat([dec_input.to(torch.device('cpu')),
                                   y_pred_ids[0, -1].unsqueeze(0).unsqueeze(0).to(torch.device('cpu'))], dim=-1)
            if i == self.config.maxlen - 1:
                output_str = decoding_from_result(enc_input=enc_input, y_pred=y_pred, tokenizer=self.tokenizer)
                break
        output_str = output_str.replace('\n', '').replace('\r', '')
        return output_str

    def sample(self, dataset):
        # generate a response dataset from the input dataset:
        # generated answers are labeled 0, real answers are labeled 1
        data_enc_input = []
        data_dec_input = []
        data_dec_output = []
        question = []
        answer = []
        labels = []
        self.seq2seq.eval()
        for item in tqdm(dataset, desc='sampling'):
            enc_input, dec_input, dec_output = map(lambda elm: elm, item)
            pred_ids, _ = self.gen_output_with_ids(enc_input)
            output_str = decoding_to_str(pred_ids, self.tokenizer)
            input_str = decoding_to_str(enc_input, self.tokenizer)

            # generated samples, labeled 0
            data_enc_input += enc_input
            data_dec_input += dec_input
            data_dec_output += dec_output
            question += input_str
            answer += output_str
            labels += [0 for _ in range(len(output_str))]

            # real samples, labeled 1
            data_enc_input += enc_input
            data_dec_input += dec_input
            data_dec_output += dec_output
            question += input_str
            answer_str = decoding_to_str(dec_input, self.tokenizer)
            answer += answer_str
            labels += [1 for _ in range(len(output_str))]
            # break

        df = pd.DataFrame({'enc_input': data_enc_input, 'dec_input': data_dec_input,
                           'dec_output': data_dec_output, 'question': question,
                           'answer': answer, 'label': labels})
        return df
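# A minimal vectorized sketch (hypothetical ids, not repo code) of what the
# commented-out drafts in gen_output_with_ids try to do: once a row has emitted
# END_TOKEN, overwrite every later position with PAD_ID via a cumulative mask.
import torch

END_ID, PAD_ID = 3, 0
y_pred_ids = torch.tensor([[5, 7, 3, 9, 2],
                           [4, 4, 4, 4, 4]])  # second row never emits END
end_seen = (y_pred_ids == END_ID).cumsum(dim=1)
strictly_after_end = (end_seen - (y_pred_ids == END_ID).long()) > 0
y_pred_ids = y_pred_ids.masked_fill(strictly_after_end, PAD_ID)
print(y_pred_ids)  # tensor([[5, 7, 3, 0, 0], [4, 4, 4, 4, 4]])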
def transformation():
    # Do an inference on a single batch of data
    data = None

    # 1) INPUT: convert Korean text input to NER code array
    if flask.request.content_type == 'text/plain':
        '''CHECK file locations'''
        model_config = Config(json_path="config.json")
        tok_path = "./tokenizer_78b3253a26.model"
        ptr_tokenizer = SentencepieceTokenizer(tok_path)
        with open("vocab.pkl", 'rb') as f:
            vocab = pickle.load(f)
        tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                              pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
        with open("ner_to_index.json", 'rb') as f:
            ner_to_index = json.load(f)
            index_to_ner = {v: k for k, v in ner_to_index.items()}
        decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer, index_to_ner=index_to_ner)

        '''
        Assuming request.data is a string: name of txt file
        > NER_OY_data.txt as an example
        > currently under /opt/program (product-tags) HERE:?
        '''
        f = flask.request.data.decode("utf-8")
        lines = f.splitlines(True)
        index = 0
        with open("NER_OY_result.txt", 'w', encoding='utf-8-sig') as w:
            for i in range(len(lines)):
                input_text = ''
                if i % 4 == 1:
                    input_text = lines[i][3:]
                    addInfo = lines[i + 1][3:]
                if input_text == '':
                    continue
                index += 1
                # print("\n## " + str(index) + "\n")

                list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
                x_input = torch.tensor(list_of_input_ids).long()
                # print(list_of_input_ids)
                # print(x_input)
                data = {"instances": list_of_input_ids}
                predictions = ScoringService.predict(data)

                # 2) OUTPUT: convert NER code to Korean text (FILE)
                emission = torch.tensor(predictions['predictions'])
                num_classes = len(ner_to_index)
                crf = CRF(num_tags=num_classes, batch_first=True)
                list_of_pred_ids = crf.decode(emission)

                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids, unkTokenList=False)
                unkTokenList = makeUNKTokenList(input_text, input_token)
                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids, unkTokenList=unkTokenList)

                w.write('## ' + str(index) + '\n')
                w.write(addInfo)
                w.write(str(list_of_ner_word) + '\n')
                w.write(str(decoding_ner_sentence[6:-5]) + '\n')

        '''RETURN a file: NER_OY_result.txt'''
        return flask.Response(response=open("NER_OY_result.txt", 'r'),
                              status=200, mimetype='text/plain')
    else:
        return flask.Response(response='This predictor only supports TEXT data',
                              status=415, mimetype='text/plain')
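# A small self-contained sketch of the pytorch-crf decode call used above
# (toy sizes and random emissions; assumes the `pytorch-crf` package). Note a
# freshly constructed CRF has randomly initialized transition parameters; the
# endpoints above presumably rely on transitions learned during training.
import torch
from torchcrf import CRF

num_tags, batch, seq_len = 5, 1, 7
crf = CRF(num_tags=num_tags, batch_first=True)
emissions = torch.randn(batch, seq_len, num_tags)  # per-token tag scores
best_paths = crf.decode(emissions)  # List[List[int]], one tag id per token
print(best_paths)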
def main(parser):
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    with open(data_config.token2idx_vocab, mode='rb') as io:
        token2idx_vocab = json.load(io)
        print("token2idx_vocab: ", token2idx_vocab)
    vocab = Vocabulary(token2idx=token2idx_vocab)
    tokenizer = Tokenizer(vocab=vocab, split_fn=mecab_token_pos_flat_fn,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    model_config.vocab_size = len(vocab.token2idx)

    # Model & Model Params
    model = Transformer(config=model_config, vocab=vocab)

    # Train & Val Datasets
    tr_ds = ChatbotDataset(data_config.train, tokenizer.list_of_string_to_arr_of_pad_token_ids)
    tr_dl = DataLoader(tr_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4, drop_last=False)
    val_ds = ChatbotDataset(data_config.validation, tokenizer.list_of_string_to_arr_of_pad_token_ids)
    val_dl = DataLoader(val_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4, drop_last=False)

    # loss
    loss_fn = nn.CrossEntropyLoss(ignore_index=vocab.PAD_ID)  # nn.NLLLoss()

    # optim
    opt = optim.Adam(params=model.parameters(), lr=model_config.learning_rate)
    # torch.optim.SGD(params=model.parameters(), lr=model_config.learning_rate)
    # scheduler = ReduceLROnPlateau(opt, patience=5)  # Check
    scheduler = GradualWarmupScheduler(opt, multiplier=8, total_epoch=model_config.epochs)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    # save
    # writer = SummaryWriter('{}/runs'.format(model_dir))
    checkpoint_manager = CheckpointManager(model_dir)
    summary_manager = SummaryManager(model_dir)
    best_val_loss = 1e+10
    best_train_acc = 0

    # load
    if (model_dir / 'best.tar').exists():
        print("pretrained model exists")
        checkpoint = checkpoint_manager.load_checkpoint('best.tar')
        model.load_state_dict(checkpoint['model_state_dict'])

    # Train
    for epoch in tqdm(range(model_config.epochs), desc='epoch', total=model_config.epochs):
        scheduler.step(epoch)
        print("epoch : {}, lr: {}".format(epoch, opt.param_groups[0]['lr']))
        tr_loss = 0
        tr_acc = 0
        model.train()

        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            opt.zero_grad()
            enc_input, dec_input, dec_output = map(lambda elm: elm.to(device), mb)
            y_pred = model(enc_input, dec_input)
            y_pred_copy = y_pred.detach()
            dec_output_copy = dec_output.detach()

            # reshape for the loss computation
            y_pred = y_pred.reshape(-1, y_pred.size(-1))
            dec_output = dec_output.view(-1).long()

            # indices of non-padding values; padding is excluded from the loss
            real_value_index = [dec_output != 0]

            mb_loss = loss_fn(y_pred[real_value_index], dec_output[real_value_index])  # Input: (N, C) Target: (N)
            mb_loss.backward()
            opt.step()

            with torch.no_grad():
                mb_acc = acc(y_pred, dec_output)

            tr_loss += mb_loss.item()
            tr_acc = mb_acc.item()
            tr_loss_avg = tr_loss / (step + 1)
            tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}
            total_step = epoch * len(tr_dl) + step

            # Eval
            if total_step % model_config.summary_step == 0 and total_step != 0:
                print("train: ")
                decoding_from_result(enc_input, y_pred_copy, dec_output_copy, tokenizer)

                model.eval()
                print("eval: ")
                val_summary = evaluate(model, val_dl, {'loss': loss_fn, 'acc': acc}, device, tokenizer)
                val_loss = val_summary['loss']
                # writer.add_scalars('loss', {'train': tr_loss_avg, 'val': val_loss}, epoch * len(tr_dl) + step)
                tqdm.write('epoch : {}, step : {}, '
                           'tr_loss: {:.3f}, val_loss: {:.3f}, tr_acc: {:.2%}, val_acc: {:.2%}'
                           .format(epoch + 1, total_step,
                                   tr_summary['loss'], val_summary['loss'],
                                   tr_summary['acc'], val_summary['acc']))

                val_loss = val_summary['loss']
                # is_best = val_loss < best_val_loss  # based on loss
                is_best = tr_acc > best_train_acc  # based on acc (ideally this should use val_acc, not train_acc)

                # Save
                if is_best:
                    print("[Best model Save] train_acc: {}, train_loss: {}, val_loss: {}".format(
                        tr_summary['acc'], tr_summary['loss'], val_loss))
                    # move weights to CPU before saving so the checkpoint also loads on CPU-only machines
                    state = {'epoch': epoch + 1,
                             'model_state_dict': model.to(torch.device('cpu')).state_dict(),
                             'opt_state_dict': opt.state_dict()}
                    summary = {'train': tr_summary, 'validation': val_summary}
                    summary_manager.update(summary)
                    summary_manager.save('summary.json')
                    checkpoint_manager.save_checkpoint(state, 'best.tar')
                    best_val_loss = val_loss

                model.to(device)
                model.train()
            else:
                if step % 50 == 0:
                    print('epoch : {}, step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'.format(
                        epoch + 1, total_step, tr_summary['loss'], tr_summary['acc']))
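# A quick standalone check (hypothetical values) of a redundancy in the loop
# above: with PAD_ID == 0, CrossEntropyLoss(ignore_index=0) already skips
# padding targets, so filtering by `dec_output != 0` computes the same mean.
import torch
import torch.nn as nn

logits = torch.randn(6, 10)                 # (N, C)
targets = torch.tensor([4, 0, 7, 0, 2, 9])  # 0 = PAD
loss_ignore = nn.CrossEntropyLoss(ignore_index=0)(logits, targets)
mask = targets != 0
loss_masked = nn.CrossEntropyLoss()(logits[mask], targets[mask])
print(torch.allclose(loss_ignore, loss_masked))  # True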
ABS_PATH = os.environ.get('BASEDIR')
model_dir = Path(f'{ABS_PATH}/experiments/base_model_with_crf_val')
model_config = Config(json_path=model_dir / 'config.json')

# Vocab & Tokenizer
tok_path = f"{ABS_PATH}/tokenizer_78b3253a26.model"
ptr_tokenizer = SentencepieceTokenizer(tok_path)

# load vocab & tokenizer
with open(model_dir / "vocab.pkl", 'rb') as f:
    vocab = pickle.load(f)
tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=None)

# load ner_to_index.json
with open(model_dir / "ner_to_index.json", 'rb') as f:
    ner_to_index = json.load(f)
    index_to_ner = {v: k for k, v in ner_to_index.items()}

# Model
model = KobertCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)
# model = KobertBiGRUCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

# load
model_dict = model.state_dict()
class NamedEntityRecognitionFormatter():
    def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):
        if vocab is None or tokenizer is None:
            tok_path = get_tokenizer()
            self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
            self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
            _, vocab_of_gluonnlp = get_pytorch_kobert_model()
            token2idx = vocab_of_gluonnlp.token_to_idx
            self.vocab = Vocabulary(token2idx=token2idx)
            self.tokenizer = Tokenizer(vocab=self.vocab, split_fn=self.ptr_tokenizer,
                                       pad_fn=keras_pad_fn, maxlen=maxlen)
        else:
            self.vocab = vocab
            self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.model_dir = model_dir

    def transform_source_fn(self, text):
        tokens = self.tokenizer.split(text)
        token_ids_with_cls_sep = self.tokenizer.list_of_string_to_arr_of_cls_sep_pad_token_ids([text])

        prefix_sum_of_token_start_index = []
        sum = 0
        for i, token in enumerate(tokens):
            if i == 0:
                prefix_sum_of_token_start_index.append(0)
                sum += len(token) - 1
            else:
                prefix_sum_of_token_start_index.append(sum)
                sum += len(token)
        return token_ids_with_cls_sep, tokens, prefix_sum_of_token_start_index

    def transform_target_fn(self, label_text, tokens, prefix_sum_of_token_start_index):
        # if NER tags are 2 characters, change {3} -> {2} (e.g. LOC -> LC)
        regex_ner = re.compile('<(.+?):[A-Z]{3}>')
        regex_filter_res = regex_ner.finditer(label_text)

        list_of_ner_tag = []
        list_of_ner_text = []
        list_of_tuple_ner_start_end = []

        count_of_match = 0
        for match_item in regex_filter_res:
            ner_tag = match_item[0][-4:-1]
            ner_text = match_item[1]
            # subtract the length of the preceding '<', ':', 3-letter tag name, '>'
            # markup so offsets refer to the tag-free text
            start_index = match_item.start() - 6 * count_of_match
            end_index = match_item.end() - 6 - 6 * count_of_match

            list_of_ner_tag.append(ner_tag)
            list_of_ner_text.append(ner_text)
            list_of_tuple_ner_start_end.append((start_index, end_index))
            count_of_match += 1

        list_of_ner_label = []
        entity_index = 0
        is_entity_still_B = True
        for tup in zip(tokens, prefix_sum_of_token_start_index):
            token, index = tup

            if '▁' in token:
                index += 1

            if entity_index < len(list_of_tuple_ner_start_end):
                start, end = list_of_tuple_ner_start_end[entity_index]

                if end < index:
                    is_entity_still_B = True
                    entity_index = entity_index + 1 if entity_index + 1 < len(list_of_tuple_ner_start_end) else entity_index
                    start, end = list_of_tuple_ner_start_end[entity_index]

                if start <= index and index < end:
                    entity_tag = list_of_ner_tag[entity_index]
                    if is_entity_still_B is True:
                        entity_tag = 'B-' + entity_tag
                        is_entity_still_B = False
                    else:
                        entity_tag = 'I-' + entity_tag
                    list_of_ner_label.append(entity_tag)
                else:
                    is_entity_still_B = True
                    entity_tag = 'O'
                    list_of_ner_label.append(entity_tag)
            else:
                entity_tag = 'O'
                list_of_ner_label.append(entity_tag)

        with open(self.model_dir / "ner_to_index.json", 'rb') as f:
            self.ner_to_index = json.load(f)
        list_of_ner_ids = ([self.ner_to_index['[CLS]']]
                           + [self.ner_to_index[ner_tag] for ner_tag in list_of_ner_label]
                           + [self.ner_to_index['[SEP]']])
        list_of_ner_ids = self.tokenizer._pad([list_of_ner_ids], pad_id=self.vocab.PAD_ID, maxlen=self.maxlen)[0]
        return list_of_ner_ids, list_of_ner_label