def _decode_bio(
    text: str,
    tokens: List[str],
    mask: List[int],
    labels: List[str],
) -> Doc:
    """Create a `Doc` from transformers output.  Only supports the BIO scheme.

    Args:
        text: Original input text.
        tokens: Tokenizer output aligned one-to-one with ``labels``.
        mask: Special-token mask, passed to ``_norm_tokens`` so special
            tokens are blanked out before span alignment.
        labels: One BIO tag (``B-X`` / ``I-X`` / ``O``) per token.

    Returns:
        A ``Doc`` over ``text`` with ``doc.ents`` populated from the tags.
    """
    assert len(labels) == len(tokens)
    doc = Doc(text)
    # get Ent
    ents: List[Ent] = []
    cur_ent: Optional[Ent] = None
    tokens = _norm_tokens(tokens, mask)
    for span_lists, label in zip(textspan.get_original_spans(tokens, text), labels):
        if not span_lists:
            # special tokens should hit here
            continue
        l = span_lists[0][0]
        r = span_lists[-1][1]
        if label.startswith("I-") and cur_ent and cur_ent.label == label[2:]:
            # expand previous entity
            cur_ent.end_char = r
        elif label.startswith("I-") or label.startswith("B-"):
            # new entity (a dangling I- also starts one)
            if cur_ent:
                ents.append(cur_ent)
            cur_ent = Ent(l, r, doc, label=label[2:])
        elif cur_ent:
            # BUGFIX: an "O" (or any non-B/I) tag terminates the current
            # entity.  Previously the entity stayed open, so a later matching
            # "I-" tag could extend it across an "O" gap (e.g.
            # B-PER, O, I-PER collapsed into a single entity).
            ents.append(cur_ent)
            cur_ent = None
    if cur_ent:
        ents.append(cur_ent)
    doc.ents = ents
    return doc
def custom_split(
        self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
    """Split *normalized_string* into one piece per aligned token span.

    Tokenizes the underlying text, maps each token back to its character
    span(s) in the original string, and returns the corresponding slices.
    """
    text = str(normalized_string)
    span_groups = textspan.get_original_spans(self.tokenize(text), text)
    pieces: List[NormalizedString] = []
    for spans in span_groups:
        for start, end in spans:
            pieces.append(normalized_string[start:end])
    return pieces
def custom_split(
        self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
    """Split *normalized_string* into per-token slices.

    See. https://github.com/huggingface/tokenizers/blob/b24a2fc/bindings/python/examples/custom_components.py
    """
    text = str(normalized_string)
    spans_per_token = textspan.get_original_spans(self.tokenize(text), text)
    result: List[NormalizedString] = []
    for spans in spans_per_token:
        result.extend(normalized_string[start:end] for start, end in spans)
    return result
def test_random_get_original_spans(tokens, text, expected):
    """get_original_spans must return the expected alignment."""
    actual = textspan.get_original_spans(tokens, text)
    assert actual == expected, (tokens, text)
def test_random_get_original_spans(tokens, text):
    """Property test: alignment never fails on self-concatenated tokens."""
    # Smoke check: must not raise on arbitrary (tokens, text) pairs.
    textspan.get_original_spans(tokens, text)
    # When the text is exactly the concatenation of the tokens, every
    # token must be matched (no None entries).
    joined = "".join(tokens)
    spans = textspan.get_original_spans(tokens, joined)
    assert all(span is not None for span in spans)
def convert_feature_line(self, text: str) -> List[Dict[str, str]]:
    """Attach word-level information (POS etc.) to each character.

    Parses *text* with ``self.parse_full``, aligns the tokens back to the
    original characters, and emits one dict per character carrying
    BIO-style word/POS tags plus the word surface/lemma ("midasi").

    Returns:
        One feature dict per character of ``text``; an empty list when
        the parse yields no tokens.
    """
    # parse and align tokens with characters
    tfs = self.parse_full(text)
    if len(tfs) == 0:
        return []
    ts, fs = zip(*tfs)
    featuremap = {k: v for k, v in enumerate(fs)}
    # -1 marks characters not covered by any token; give them "*" features.
    featuremap[-1] = "*,*,*,*,*,*,*,*,*"
    ts_spans = textspan.get_original_spans(ts, text)
    char2token_index = {
        char_ix: token_ix
        for token_ix, char_spans in enumerate(ts_spans)
        for st, ed in char_spans
        for char_ix in range(st, ed)
    }
    feature_lines = [{
        "index": i,
        "word_index": char2token_index.get(i, -1),
        "text": c,
        "features": featuremap[char2token_index.get(i, -1)].split(","),
    } for i, c in enumerate(text)]

    # convert feature_lines
    nonemap = {"*": None}
    prev_word_index = -1
    # make word_index2midasi dict
    # NOTE: features[6] (the lemma / "midasi") may be absent; in that case
    # fall back to the word surface accumulated from the characters.
    word_index2midasi = {-1: ""}
    word_surface = ""
    for line_feature in feature_lines:
        word_index = line_feature["word_index"]
        midasi = line_feature["features"][6]
        midasi = nonemap[midasi] if midasi in nonemap else midasi
        if midasi is not None:
            word_surface = midasi
        elif word_index != prev_word_index:
            word_surface = line_feature["text"]
        else:
            word_surface += line_feature["text"]
        prev_word_index = word_index
        # Last character of each word leaves the final surface in place.
        word_index2midasi[word_index] = word_surface

    # make word-BIO feature tags from 'word_index' and 'features'
    # BUGFIX: reset the previous-word tracker before the second pass.  It
    # previously carried over the last word index from the loop above, so
    # the first character could wrongly receive an "I-" prefix (e.g. for
    # single-word text) instead of "B-".  None can never equal a real
    # word_index, so the first character always starts a "B-" tag.
    prev_word_index = None
    new_features = []
    for line_feature in feature_lines:
        word_index = line_feature["word_index"]
        if word_index != prev_word_index:
            bioprefix = "B-"
        else:
            bioprefix = "I-"
        prev_word_index = word_index
        pos = line_feature["features"][0]
        pos = nonemap[pos] if pos in nonemap else pos
        finepos = line_feature["features"][1]
        finepos = nonemap[finepos] if finepos in nonemap else finepos
        bunrui = line_feature["features"][2]
        bunrui = nonemap[bunrui] if bunrui in nonemap else bunrui
        new_features.append({
            "index": line_feature["index"],
            "word_index": line_feature["word_index"],
            "text": line_feature["text"],
            "word_f": bioprefix + "word",
            "pos_f": bioprefix + pos if pos is not None else "O",
            "finepos_f": bioprefix + finepos if finepos is not None else "O",
            "bunrui_f": bioprefix + bunrui if bunrui is not None else "O",
            "midasi_f": word_index2midasi[word_index],
            "phrase_f": "O",
        })
    return new_features