Code Example #1
from typing import List, Optional

import textspan

# `Doc`, `Ent`, and `_norm_tokens` are defined elsewhere in the module this
# example is taken from.


def _decode_bio(
    text: str,
    tokens: List[str],
    mask: List[int],
    labels: List[str],
) -> Doc:
    """Create `Doc` from transformers output. Only supports the BIO label scheme."""
    assert len(labels) == len(tokens)
    doc = Doc(text)

    # Map each token back to its character span(s) in `text` and walk the
    # BIO labels in parallel to collect entities.
    ents: List[Ent] = []
    cur_ent: Optional[Ent] = None
    tokens = _norm_tokens(tokens, mask)
    for span_lists, label in zip(textspan.get_original_spans(tokens, text),
                                 labels):
        if not span_lists:
            # Special tokens (e.g. [CLS], [SEP]) have no span in the original text.
            continue
        start = span_lists[0][0]
        end = span_lists[-1][1]
        if label.startswith("I-") and cur_ent and cur_ent.label == label[2:]:
            # Continuation token: extend the current entity.
            cur_ent.end_char = end
        elif label.startswith("I-") or label.startswith("B-"):
            # New entity; a bare "I-" also opens one, tolerating noisy predictions.
            if cur_ent:
                ents.append(cur_ent)
            cur_ent = Ent(start, end, doc, label=label[2:])
        else:
            # Outside any entity ("O"): close the current entity, if any.
            if cur_ent:
                ents.append(cur_ent)
                cur_ent = None
    if cur_ent:
        ents.append(cur_ent)
    doc.ents = ents
    return doc
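
The decoder above depends on the return shape of `textspan.get_original_spans`: one list of `(start, end)` character spans per token, and an empty list for tokens that cannot be aligned back to the text (such as special tokens). A minimal sketch with assumed inputs:

import textspan

tokens = ["hello", "world"]
text = "hello  world"
spans = textspan.get_original_spans(tokens, text)
# One entry per token: spans[0] covers "hello", spans[1] covers "world";
# a token absent from `text` would come back as an empty list, which is
# exactly the case `_decode_bio` skips over.
print(spans)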
Code Example #2
from typing import List

import textspan
from tokenizers import NormalizedString


# Method of a custom pre-tokenizer class that provides `self.tokenize`.
def custom_split(
        self, i: int,
        normalized_string: NormalizedString) -> List[NormalizedString]:
    """See https://github.com/huggingface/tokenizers/blob/b24a2fc/bindings/python/examples/custom_components.py"""
    text = str(normalized_string)
    tokens = self.tokenize(text)
    tokens_spans = textspan.get_original_spans(tokens, text)
    # Slice the NormalizedString at the original character offsets so that
    # offset/alignment information is preserved.
    return [
        normalized_string[st:ed] for char_spans in tokens_spans
        for st, ed in char_spans
    ]
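
A sketch of how such a split method is typically wired into a tokenizers pipeline, following the custom_components.py example linked in the docstring; the `CustomPreTokenizer` class and its whitespace `tokenize` are placeholders for illustration:

from typing import List

import textspan
from tokenizers import NormalizedString, PreTokenizedString
from tokenizers.pre_tokenizers import PreTokenizer


class CustomPreTokenizer:
    def tokenize(self, text: str) -> List[str]:
        # Placeholder: any tokenizer that returns surface tokens works here.
        return text.split()

    def custom_split(
            self, i: int,
            normalized_string: NormalizedString) -> List[NormalizedString]:
        text = str(normalized_string)
        tokens = self.tokenize(text)
        tokens_spans = textspan.get_original_spans(tokens, text)
        return [
            normalized_string[st:ed] for char_spans in tokens_spans
            for st, ed in char_spans
        ]

    def pre_tokenize(self, pretok: PreTokenizedString) -> None:
        # tokenizers calls back into custom_split for each piece.
        pretok.split(self.custom_split)


# tokenizer.pre_tokenizer = PreTokenizer.custom(CustomPreTokenizer())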
Code Example #3
File: test_main.py Project: KoichiYasuoka/textspan
def test_random_get_original_spans(tokens, text, expected):
    # `tokens`, `text`, and `expected` come from the test harness (the
    # parametrization is not shown in this excerpt).
    ret = textspan.get_original_spans(tokens, text)
    assert ret == expected, (tokens, text)
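
A concrete triple of the kind this test consumes (assumed values, relying on exact-match alignment):

import textspan

tokens = ["a", "b"]
text = "a b"
expected = [[(0, 1)], [(2, 3)]]  # per-token (start, end) character spans
assert textspan.get_original_spans(tokens, text) == expected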
Code Example #4
File: test_main.py Project: KoichiYasuoka/textspan
def test_random_get_original_spans(tokens, text):
    # Must not raise on arbitrary (tokens, text) pairs.
    textspan.get_original_spans(tokens, text)
    # Round trip: concatenating the tokens guarantees every token is found.
    ret = textspan.get_original_spans(tokens, "".join(tokens))
    assert all(x is not None for x in ret)
Code Example #5
    # Excerpt of a method: assumes `from typing import Dict, List` and
    # `import textspan` at module level, and a `parse_full` method that
    # returns (token, feature-string) pairs.
    def convert_feature_line(self, text: str) -> List[Dict[str, str]]:
        """Attach word-level information such as POS tags to each character,
        as BIO-style tags and word surface forms."""
        # Parse, then align tokens with characters.
        tfs = self.parse_full(text)
        if len(tfs) == 0:
            return []
        ts, fs = zip(*tfs)
        featuremap = dict(enumerate(fs))
        # Sentinel features for characters not covered by any token.
        featuremap[-1] = "*,*,*,*,*,*,*,*,*"
        ts_spans = textspan.get_original_spans(ts, text)
        # Map each character offset to the index of the token covering it.
        char2token_index = {
            char_ix: token_ix
            for token_ix, char_spans in enumerate(ts_spans)
            for st, ed in char_spans for char_ix in range(st, ed)
        }
        feature_lines = [{
            "index": i,
            "word_index": char2token_index.get(i, -1),
            "text": c,
            "features": featuremap[char2token_index.get(i, -1)].split(","),
        } for i, c in enumerate(text)]

        # Convert feature_lines.
        nonemap = {"*": None}
        prev_word_index = -1

        # Build a word_index -> midasi (lemma/surface) dict.
        # NOTE: line_feature['features'][6] may lack a midasi entry.
        word_index2midasi = {-1: ""}
        word_surface = ""
        for line_feature in feature_lines:
            word_index = line_feature["word_index"]
            midasi = line_feature["features"][6]
            midasi = nonemap.get(midasi, midasi)
            # If the midasi is missing, fall back to the word's surface form.
            if midasi is not None:
                word_surface = midasi
            elif word_index != prev_word_index:
                word_surface = line_feature["text"]
            else:
                word_surface += line_feature["text"]
            prev_word_index = word_index
            word_index2midasi[word_index] = word_surface

        # Make word-BIO feature tags from 'word_index' and 'features'.
        prev_word_index = None  # reset so the first character starts a "B-" tag
        new_features = []
        for line_feature in feature_lines:
            word_index = line_feature["word_index"]
            if word_index != prev_word_index:
                bioprefix = "B-"
            else:
                bioprefix = "I-"
            prev_word_index = word_index
            pos = line_feature["features"][0]
            pos = nonemap.get(pos, pos)
            finepos = line_feature["features"][1]
            finepos = nonemap.get(finepos, finepos)
            bunrui = line_feature["features"][2]
            bunrui = nonemap.get(bunrui, bunrui)

            new_features.append({
                "index": line_feature["index"],
                "word_index": line_feature["word_index"],
                "text": line_feature["text"],
                "word_f": bioprefix + "word",
                "pos_f": bioprefix + pos if pos is not None else "O",
                "finepos_f": bioprefix + finepos if finepos is not None else "O",
                "bunrui_f": bioprefix + bunrui if bunrui is not None else "O",
                "midasi_f": word_index2midasi[word_index],
                "phrase_f": "O",
            })

        return new_features
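
The core textspan usage in this example is the character-to-token index built from `get_original_spans`. A minimal standalone sketch of that mapping, with assumed inputs:

import textspan

tokens = ["今日", "は", "晴れ"]
text = "今日は晴れ"
spans = textspan.get_original_spans(tokens, text)
# Invert token -> character spans into character offset -> token index.
char2token_index = {
    char_ix: token_ix
    for token_ix, char_spans in enumerate(spans)
    for st, ed in char_spans for char_ix in range(st, ed)
}
# Characters not covered by any token are simply absent and can default
# to -1 via char2token_index.get(i, -1), as done above.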