import json
from spacy.matcher import Matcher
from spacy.lang.zh import Chinese

# Exercise template: each `____` below is an unfilled placeholder name, so
# this script raises NameError until the blanks are replaced.

# Read the example texts from the JSON file.
with open("exercises/zh/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

# Chinese language object and a token Matcher that shares its vocabulary.
nlp = Chinese()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{____: ____}, {____: ____}]

# A token whose lowercase form matches "iphone", followed by a digit
pattern2 = [{____: ____}, {____: ____}]

# Add the patterns to the matcher and check the results
matcher.add("GADGET", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    # Print the matched span of every (match_id, start, end) hit in this doc.
    print([doc[start:end] for match_id, start, end in matcher(doc)])
import json
from spacy.lang.zh import Chinese

# Exercise template: each `____` below is an unfilled placeholder, so this
# script cannot run until the blanks are replaced.

# Read the list of country names from the JSON file.
with open("exercises/zh/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

# Chinese language object and the test document to search.
nlp = Chinese()
doc = nlp("智利可能会从斯洛伐克进口货物")

# Import the PhraseMatcher and instantiate it
from spacy.____ import ____

matcher = ____(____)

# Create Doc-object patterns and add them to the matcher
# The code below is faster than: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Call the matcher on the test document and print the result
matches = ____(____)
print([doc[start:end] for match_id, start, end in matches])
# Example #3
import json
from spacy.lang.zh import Chinese
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

# Load the country names used as phrase patterns.
with open("exercises/zh/countries.json", encoding="utf8") as countries_file:
    COUNTRIES = json.load(countries_file)

# Load the capitals lookup that is queried by span text further down.
with open("exercises/zh/capitals.json", encoding="utf8") as capitals_file:
    CAPITALS = json.load(capitals_file)

# Chinese language object plus a PhraseMatcher that shares its vocabulary;
# every country name is run through nlp.pipe and registered under "COUNTRY".
nlp = Chinese()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *nlp.pipe(COUNTRIES))


def countries_component(doc):
    """Mark every phrase-matcher hit in ``doc`` as a "GPE" entity.

    Runs the module-level ``matcher`` over ``doc``, wraps each match in a
    ``Span`` labelled "GPE", assigns the resulting list to ``doc.ents``
    and returns the same ``doc``.
    """
    gpe_spans = []
    # The match id is not needed; only the token boundaries are used.
    for _, first, last in matcher(doc):
        gpe_spans.append(Span(doc, first, last, label="GPE"))
    doc.ents = gpe_spans
    return doc


# Add this component to the pipeline and show the resulting component names.
nlp.add_pipe(countries_component)
print(nlp.pipe_names)


def get_capital(span):
    """Getter that looks up the span's text in the capitals dictionary.

    Args:
        span: Object exposing a ``text`` attribute (a spaCy ``Span`` here).

    Returns:
        Whatever ``CAPITALS.get`` yields for ``span.text``; for a dict that
        is the stored value, or ``None`` when the text has no entry.
    """
    # Defined with ``def`` rather than a name-bound lambda (PEP 8) so the
    # callable has a real name in tracebacks and can carry a docstring.
    return CAPITALS.get(span.text)