                    # CLUE marks end_char inclusively; spaCy expects an exclusive end offset
                    start_char, end_char = ent_arrays[0]
                    label_ent_array.append((start_char, end_char + 1, l))
                ents.append(label_ent_array[0])

            if diff_contain_overlapping(ents):
                i += 1

                doc = nlp(text)
                # Character offsets -> BILUO tags -> entity spans on the Doc
                tags = biluo_tags_from_offsets(doc, ents)
                doc.ents = spans_from_biluo_tags(doc, tags)

                line = docs_to_json([doc])
                f.write(json_dumps(line) + "\n")

    msg.good(f"Finished {file_path} :: {i} rows")
    if print_label:
        msg.info(f"{labels}")


if __name__ == "__main__":
    # Chinese.Defaults.use_jieba = True
    nlp = Chinese()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    dev_data = read_jsonl(Path("./cluener2020/dev.json"))
    train_data = read_jsonl(Path("./cluener2020/train.json"))

    format_data_to_jsonl(dev_data, Path("./clue_spacy_dev.jsonl"))
    format_data_to_jsonl(train_data,
                         Path("./clue_spacy_train.jsonl"),
                         print_label=True)
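
# Quick sanity check of the converted file: a sketch, not part of the original
# script. It assumes srsly (bundled with spaCy 2.x) and the docs_to_json
# record layout {"id": ..., "paragraphs": [{"raw": ..., "sentences": [...]}]}.
import srsly

first = next(srsly.read_jsonl("./clue_spacy_dev.jsonl"))  # first converted record
print(first["paragraphs"][0]["raw"])  # the raw text should round-trip intact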
Example #2
with open("exercises/zh/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Chinese()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *nlp.pipe(COUNTRIES))


def countries_component(doc):
    # Create an entity Span with the label "GPE" for every match
    # (note: assigning to doc.ents raises a ValueError if spans overlap)
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="GPE") for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)
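# In spaCy 2.x a plain function component is registered under its function
# name, so this should print ['countries_component']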

# Getter that looks up the span's text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)

# Register the Span extension attribute "capital" with the getter
Span.set_extension("capital", getter=get_capital, force=True)

# Process the text and print the entity text, label, and capital attribute
doc = nlp("新加坡可能会和马来西亚一起建造高铁。")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])
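
# Assuming the course's capitals.json (not shown here) maps Singapore to
# itself and Malaysia to Kuala Lumpur, this should print something like:
# [('新加坡', 'GPE', '新加坡'), ('马来西亚', 'GPE', '吉隆坡')]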