Code example #1
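The snippet is an excerpt and omits its imports. A minimal import block that would satisfy it is sketched below; the kobert and gluonnlp imports are standard for that toolkit, while the module path of the project-local helpers (Vocabulary, Tokenizer, keras_pad_fn) is an assumption.

import json
import re
from pathlib import Path

from gluonnlp.data import SentencepieceTokenizer, SentencepieceDetokenizer
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

# project-local helpers; this module path is an assumption
from data_utils.vocab_tokenizer import Vocabulary, Tokenizer, keras_pad_fn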
class NamedEntityRecognitionFormatter():
    """ NER formatter class """
    def __init__(self,
                 vocab=None,
                 tokenizer=None,
                 maxlen=30,
                 model_dir=Path('data_in')):

        if vocab is None or tokenizer is None:
            tok_path = get_tokenizer()
            self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
            self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
            _, vocab_of_gluonnlp = get_pytorch_kobert_model()
            token2idx = vocab_of_gluonnlp.token_to_idx
            self.vocab = Vocabulary(token2idx=token2idx)
            self.tokenizer = Tokenizer(vocab=self.vocab,
                                       split_fn=self.ptr_tokenizer,
                                       pad_fn=keras_pad_fn,
                                       maxlen=maxlen)
        else:
            self.vocab = vocab
            self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.model_dir = model_dir

    def transform_source_fn(self, text):
        # text = "첫 회를 시작으로 13일까지 4일간 총 4회에 걸쳐 매 회 2편씩 총 8편이 공개될 예정이다."
        # label_text = "첫 회를 시작으로 <13일:DAT>까지 <4일간:DUR> 총 <4회:NOH>에 걸쳐 매 회 <2편:NOH>씩 총 <8편:NOH>이 공개될 예정이다."
        # text = "트래버 모리슨 학장은 로스쿨 학생과 교직원이 바라라 전 검사의 사법정의에 대한 깊이 있는 지식과 경험으로부터 많은 것을 배울 수 있을 것이라고 말했다."
        # label_text = "<트래버 모리슨:PER> 학장은 로스쿨 학생과 교직원이 <바라라:PER> 전 검사의 사법정의에 대한 깊이 있는 지식과 경험으로부터 많은 것을 배울 수 있을 것이라고 말했다."
        tokens = self.tokenizer.split(text)
        token_ids_with_cls_sep = self.tokenizer.list_of_string_to_arr_of_cls_sep_pad_token_ids(
            [text])

        # record, for each token, the character offset at which it starts in the
        # original text; these offsets are used to align entity spans with tokens
        prefix_sum_of_token_start_index = []
        sum = 0
        for i, token in enumerate(tokens):
            if i == 0:
                prefix_sum_of_token_start_index.append(0)
                sum += len(token) - 1  # the leading '▁' of the first token has no counterpart in the text
            else:
                prefix_sum_of_token_start_index.append(sum)
                sum += len(token)
        return token_ids_with_cls_sep, tokens, prefix_sum_of_token_start_index
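    # Worked example of the offsets computed above (added for clarity; the
    # tokenization shown is illustrative, actual SentencePiece output may differ):
    #   text    = "첫 회를 시작으로 13일까지 ..."
    #   tokens  = ['▁첫', '▁회를', '▁시작으로', '▁13', '일까지', ...]
    #   offsets = [0, 1, 4, 9, 12, ...]
    # In transform_target_fn the leading '▁' of '▁13' shifts its offset from 9 to 10,
    # which is where '13' starts in the original text.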

    def transform_target_fn(self, label_text, tokens,
                            prefix_sum_of_token_start_index):
        regex_ner = re.compile('<(.+?):[A-Z]{3}>')
        regex_filter_res = regex_ner.finditer(label_text)

        list_of_ner_tag = []
        list_of_ner_text = []
        list_of_tuple_ner_start_end = []

        count_of_match = 0
        for match_item in regex_filter_res:
            ner_tag = match_item[0][-4:-1]  # <4일간:DUR> -> DUR
            ner_text = match_item[1]  # <4일간:DUR> -> 4일간
            # every earlier match contributed 6 markup characters ('<', ':', the
            # 3-character tag and '>') that do not exist in the plain text, so
            # shift the offsets back by 6 for each previous match
            start_index = match_item.start() - 6 * count_of_match
            end_index = match_item.end() - 6 - 6 * count_of_match

            list_of_ner_tag.append(ner_tag)
            list_of_ner_text.append(ner_text)
            list_of_tuple_ner_start_end.append((start_index, end_index))
            count_of_match += 1

        list_of_ner_label = []
        entity_index = 0
        is_entity_still_B = True
        for tup in zip(tokens, prefix_sum_of_token_start_index):
            token, index = tup

            if '▁' in token:  # note: SentencePiece's '▁' (U+2581) is a different character from the ASCII underscore '_'
                index += 1  # if the token carries the leading space marker, shift its start index forward by one: ('▁13', 9) -> ('13', 10)

            if entity_index < len(list_of_tuple_ner_start_end):
                start, end = list_of_tuple_ner_start_end[entity_index]

                if end < index:  # the current position is already past this entity span, so advance to the next entity and re-check
                    is_entity_still_B = True
                    entity_index = entity_index + 1 if entity_index + 1 < len(
                        list_of_tuple_ner_start_end) else entity_index
                    start, end = list_of_tuple_ner_start_end[entity_index]

                if start <= index and index < end:  # e.g. <13일:DAT>까지 -> ('▁13', 10, 'B-DAT'), ('일까지', 12, 'I-DAT'); such partial overlaps are included here, and excluding them would also require checking the token length
                    entity_tag = list_of_ner_tag[entity_index]
                    if is_entity_still_B is True:
                        entity_tag = 'B-' + entity_tag
                        list_of_ner_label.append(entity_tag)
                        is_entity_still_B = False
                    else:
                        entity_tag = 'I-' + entity_tag
                        list_of_ner_label.append(entity_tag)
                else:
                    is_entity_still_B = True
                    entity_tag = 'O'
                    list_of_ner_label.append(entity_tag)

            else:
                entity_tag = 'O'
                list_of_ner_label.append(entity_tag)

            # print((token, index, entity_tag), end=' ')

        with open(self.model_dir / "ner_to_index.json", 'rb') as f:
            self.ner_to_index = json.load(f)
        # ner_str -> ner_ids -> cls + ner_ids + sep -> cls + ner_ids + sep + pad + pad .. + pad
        list_of_ner_ids = [self.ner_to_index['[CLS]']] + [
            self.ner_to_index[ner_tag] for ner_tag in list_of_ner_label
        ] + [self.ner_to_index['[SEP]']]
        list_of_ner_ids = self.tokenizer._pad([list_of_ner_ids],
                                              pad_id=self.vocab.PAD_ID,
                                              maxlen=self.maxlen)[0]

        return list_of_ner_ids, list_of_ner_label
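As a usage sketch, the formatter above could be driven as follows. The sentence/label pair is taken from the comments in transform_source_fn; the call sequence is an assumption about how the class is used, and it requires a data_in/ner_to_index.json file to exist.

formatter = NamedEntityRecognitionFormatter(maxlen=30)
text = "첫 회를 시작으로 13일까지 4일간 총 4회에 걸쳐 매 회 2편씩 총 8편이 공개될 예정이다."
label_text = "첫 회를 시작으로 <13일:DAT>까지 <4일간:DUR> 총 <4회:NOH>에 걸쳐 매 회 <2편:NOH>씩 총 <8편:NOH>이 공개될 예정이다."

token_ids, tokens, start_offsets = formatter.transform_source_fn(text)
ner_ids, ner_labels = formatter.transform_target_fn(label_text, tokens, start_offsets)
# ner_labels is a BIO tag sequence aligned with tokens, e.g. ['O', ..., 'B-DAT', 'I-DAT', ...]
# ner_ids is the same sequence mapped through ner_to_index, wrapped in [CLS]/[SEP] and padded to maxlen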
Code example #2
class NamedEntityRecognitionFormatter():
    def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):

        if vocab is None or tokenizer is None:
            tok_path = get_tokenizer()
            self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
            self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
            _, vocab_of_gluonnlp = get_pytorch_kobert_model()
            token2idx = vocab_of_gluonnlp.token_to_idx
            self.vocab = Vocabulary(token2idx=token2idx)
            self.tokenizer = Tokenizer(vocab=self.vocab, split_fn=self.ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=maxlen)
        else:
            self.vocab = vocab
            self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.model_dir = model_dir

    def transform_source_fn(self, text):
        tokens = self.tokenizer.split(text)
        token_ids_with_cls_sep = self.tokenizer.list_of_string_to_arr_of_cls_sep_pad_token_ids([text])

        prefix_sum_of_token_start_index = []
        sum = 0
        for i, token in enumerate(tokens):
            if i == 0:
                prefix_sum_of_token_start_index.append(0)
                sum += len(token) - 1
            else:
                prefix_sum_of_token_start_index.append(sum)
                sum += len(token)
        return token_ids_with_cls_sep, tokens, prefix_sum_of_token_start_index

    def transform_target_fn(self, label_text, tokens, prefix_sum_of_token_start_index):
        regex_ner = re.compile('<(.+?):[A-Z]{3}>')  # if the NER tags are two characters long (e.g. LOC -> LC), change {3} to {2}
        regex_filter_res = regex_ner.finditer(label_text)

        list_of_ner_tag = []
        list_of_ner_text = []
        list_of_tuple_ner_start_end = []

        count_of_match = 0
        for match_item in regex_filter_res:
            ner_tag = match_item[0][-4:-1]
            ner_text = match_item[1]
            start_index = match_item.start() - 6 * count_of_match  # compensate for the 6 markup characters removed per previous match
            end_index = match_item.end() - 6 - 6 * count_of_match

            list_of_ner_tag.append(ner_tag)
            list_of_ner_text.append(ner_text)
            list_of_tuple_ner_start_end.append((start_index, end_index))
            count_of_match += 1

        list_of_ner_label = []
        entity_index = 0
        is_entity_still_B = True
        for tup in zip(tokens, prefix_sum_of_token_start_index):
            token, index = tup

            if '▁' in token:
                index += 1

            if entity_index < len(list_of_tuple_ner_start_end):
                start, end = list_of_tuple_ner_start_end[entity_index]

                if end < index:
                    is_entity_still_B = True
                    entity_index = entity_index + 1 if entity_index + 1 < len(list_of_tuple_ner_start_end) else entity_index
                    start, end = list_of_tuple_ner_start_end[entity_index]

                if start <= index and index < end:
                    entity_tag = list_of_ner_tag[entity_index]
                    if is_entity_still_B is True:
                        entity_tag = 'B-' + entity_tag
                        list_of_ner_label.append(entity_tag)
                        is_entity_still_B = False
                    else:
                        entity_tag = 'I-' + entity_tag
                        list_of_ner_label.append(entity_tag)
                else:
                    is_entity_still_B = True
                    entity_tag = 'O'
                    list_of_ner_label.append(entity_tag)

            else:
                entity_tag = 'O'
                list_of_ner_label.append(entity_tag)

        with open(self.model_dir / "ner_to_index.json", 'rb') as f:
            self.ner_to_index = json.load(f)

        list_of_ner_ids = [self.ner_to_index['[CLS]']] + [self.ner_to_index[ner_tag] for ner_tag in list_of_ner_label] + [self.ner_to_index['[SEP]']]
        list_of_ner_ids = self.tokenizer._pad([list_of_ner_ids], pad_id=self.vocab.PAD_ID, maxlen=self.maxlen)[0]

        return list_of_ner_ids, list_of_ner_label
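Both examples load a tag-to-index mapping from ner_to_index.json inside transform_target_fn; the file must at least map '[CLS]', '[SEP]' and every B-/I-/O tag that can be produced. A minimal way to build such a file is sketched below; the tag set only covers the entity types that appear in the commented examples (DAT, DUR, NOH, PER), and the index values are arbitrary placeholders rather than the ones used by any particular corpus.

# Illustrative construction of data_in/ner_to_index.json
import json
from pathlib import Path

tags = ['[CLS]', '[SEP]', 'O']
for entity_type in ['DAT', 'DUR', 'NOH', 'PER']:
    tags += ['B-' + entity_type, 'I-' + entity_type]
ner_to_index = {tag: idx for idx, tag in enumerate(tags)}

with open(Path('data_in') / 'ner_to_index.json', 'w', encoding='utf-8') as f:
    json.dump(ner_to_index, f, ensure_ascii=False, indent=2)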