import json
import re
from pathlib import Path

from gluonnlp.data import SentencepieceTokenizer, SentencepieceDetokenizer
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

# Project-local helpers; the module path below is assumed from the project layout.
from data_utils.vocab_tokenizer import Tokenizer, Vocabulary, keras_pad_fn


class NamedEntityRecognitionFormatter:
    """NER formatter class."""

    def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):
        if vocab is None or tokenizer is None:
            tok_path = get_tokenizer()
            self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
            self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
            _, vocab_of_gluonnlp = get_pytorch_kobert_model()
            token2idx = vocab_of_gluonnlp.token_to_idx
            self.vocab = Vocabulary(token2idx=token2idx)
            self.tokenizer = Tokenizer(vocab=self.vocab, split_fn=self.ptr_tokenizer,
                                       pad_fn=keras_pad_fn, maxlen=maxlen)
        else:
            self.vocab = vocab
            self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.model_dir = model_dir

    def transform_source_fn(self, text):
        # text = "첫 회를 시작으로 13일까지 4일간 총 4회에 걸쳐 매 회 2편씩 총 8편이 공개될 예정이다."
        # label_text = "첫 회를 시작으로 <13일:DAT>까지 <4일간:DUR> 총 <4회:NOH>에 걸쳐 매 회 <2편:NOH>씩 총 <8편:NOH>이 공개될 예정이다."
        # text = "트래버 모리슨 학장은 로스쿨 학생과 교직원이 바라라 전 검사의 사법정의에 대한 깊이 있는 지식과 경험으로부터 많은 것을 배울 수 있을 것이라고 말했다."
        # label_text = "<트래버 모리슨:PER> 학장은 로스쿨 학생과 교직원이 <바라라:PER> 전 검사의 사법정의에 대한 깊이 있는 지식과 경험으로부터 많은 것을 배울 수 있을 것이라고 말했다."
        tokens = self.tokenizer.split(text)
        token_ids_with_cls_sep = self.tokenizer.list_of_string_to_arr_of_cls_sep_pad_token_ids([text])

        # Save the character offset at which each token starts, so that entity spans
        # in the label text can be matched to token positions later.
        prefix_sum_of_token_start_index = []
        sum = 0
        for i, token in enumerate(tokens):
            if i == 0:
                prefix_sum_of_token_start_index.append(0)
                sum += len(token) - 1
            else:
                prefix_sum_of_token_start_index.append(sum)
                sum += len(token)
        return token_ids_with_cls_sep, tokens, prefix_sum_of_token_start_index

    def transform_target_fn(self, label_text, tokens, prefix_sum_of_token_start_index):
        # If the NER tags are two characters long (e.g. LOC -> LC), change {3} to {2}.
        regex_ner = re.compile('<(.+?):[A-Z]{3}>')
        regex_filter_res = regex_ner.finditer(label_text)

        list_of_ner_tag = []
        list_of_ner_text = []
        list_of_tuple_ner_start_end = []

        count_of_match = 0
        for match_item in regex_filter_res:
            ner_tag = match_item[0][-4:-1]  # <4일간:DUR> -> DUR
            ner_text = match_item[1]        # <4일간:DUR> -> 4일간
            # Subtract the markup of previous matches: '<', ':', the 3-letter tag and '>' (6 chars).
            start_index = match_item.start() - 6 * count_of_match
            end_index = match_item.end() - 6 - 6 * count_of_match

            list_of_ner_tag.append(ner_tag)
            list_of_ner_text.append(ner_text)
            list_of_tuple_ner_start_end.append((start_index, end_index))
            count_of_match += 1

        list_of_ner_label = []
        entity_index = 0
        is_entity_still_B = True
        for tup in zip(tokens, prefix_sum_of_token_start_index):
            token, index = tup

            # Note: '▁' (the SentencePiece space marker) is not the ordinary underscore '_'.
            if '▁' in token:
                index += 1  # the token carries a leading space marker, so shift the offset by one, e.g. ('▁13', 9) -> ('13', 10)

            if entity_index < len(list_of_tuple_ner_start_end):
                start, end = list_of_tuple_ner_start_end[entity_index]

                if end < index:  # the current position is past the entity span, so move on to the next entity
                    is_entity_still_B = True
                    entity_index = entity_index + 1 if entity_index + 1 < len(list_of_tuple_ner_start_end) else entity_index
                    start, end = list_of_tuple_ner_start_end[entity_index]

                if start <= index and index < end:
                    # e.g. <13일:DAT>까지 -> ('▁13', 10, 'B-DAT') ('일까지', 12, 'I-DAT');
                    # trailing characters are included here; excluding them would require checking token length as well.
                    entity_tag = list_of_ner_tag[entity_index]
                    if is_entity_still_B is True:
                        entity_tag = 'B-' + entity_tag
                        list_of_ner_label.append(entity_tag)
                        is_entity_still_B = False
                    else:
                        entity_tag = 'I-' + entity_tag
                        list_of_ner_label.append(entity_tag)
                else:
                    is_entity_still_B = True
                    entity_tag = 'O'
                    list_of_ner_label.append(entity_tag)
            else:
                entity_tag = 'O'
                list_of_ner_label.append(entity_tag)
            # print((token, index, entity_tag), end=' ')

        with open(self.model_dir / "ner_to_index.json", 'rb') as f:
            self.ner_to_index = json.load(f)
        # ner_str -> ner_ids -> cls + ner_ids + sep -> cls + ner_ids + sep + pad + pad ... + pad
        list_of_ner_ids = [self.ner_to_index['[CLS]']] \
            + [self.ner_to_index[ner_tag] for ner_tag in list_of_ner_label] \
            + [self.ner_to_index['[SEP]']]
        list_of_ner_ids = self.tokenizer._pad([list_of_ner_ids], pad_id=self.vocab.PAD_ID,
                                              maxlen=self.maxlen)[0]

        return list_of_ner_ids, list_of_ner_label
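

# Hedged usage sketch (not part of the original source): the lines below assume the
# default constructor path, i.e. that the KoBERT tokenizer/vocab can be fetched via
# get_tokenizer()/get_pytorch_kobert_model(), and that data_in/ner_to_index.json exists
# with entries for '[CLS]', '[SEP]', 'O' and the B-/I- tags used in the label text.
if __name__ == "__main__":
    formatter = NamedEntityRecognitionFormatter(maxlen=30)

    text = "첫 회를 시작으로 13일까지 4일간 총 4회에 걸쳐 매 회 2편씩 총 8편이 공개될 예정이다."
    label_text = "첫 회를 시작으로 <13일:DAT>까지 <4일간:DUR> 총 <4회:NOH>에 걸쳐 매 회 <2편:NOH>씩 총 <8편:NOH>이 공개될 예정이다."

    # Source side: padded token ids (with [CLS]/[SEP]), the raw tokens, and the
    # character offset at which each token starts.
    token_ids, tokens, prefix_sums = formatter.transform_source_fn(text)

    # Target side: padded NER label ids and the per-token BIO labels.
    ner_ids, ner_labels = formatter.transform_target_fn(label_text, tokens, prefix_sums)

    # Inspect the token / BIO-tag alignment.
    for token, label in zip(tokens, ner_labels):
        print(token, label)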