import json

from spacy.matcher import Matcher
from spacy.lang.zh import Chinese

# Load the example texts (a JSON list of strings) to run the matcher over.
with open("exercises/zh/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = Chinese()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# Token whose lowercase form matches "iphone", followed by a digit token
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# Add the patterns to the matcher and check the results.
# (spaCy v2 API: the second argument is an optional on-match callback.)
matcher.add("GADGET", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    # Each match is (match_id, start, end); slice the doc to get the span.
    print([doc[start:end] for match_id, start, end in matcher(doc)])
import json

from spacy.lang.zh import Chinese

# Load the list of country names (a JSON list of strings).
with open("exercises/zh/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

nlp = Chinese()
doc = nlp("智利可能会从斯洛伐克进口货物")

# Import the PhraseMatcher and instantiate it with the shared vocab.
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

# Create Doc patterns and add them to the matcher.
# nlp.pipe is faster than: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Call the matcher on the test document and print the result.
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])
import json

from spacy.lang.zh import Chinese
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

# Load country names and a country -> capital lookup table.
with open("exercises/zh/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/zh/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Chinese()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    """Pipeline component: label every country match as a "GPE" entity.

    Runs the shared PhraseMatcher over the doc and overwrites doc.ents
    with one Span per match.
    """
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="GPE") for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline (spaCy v2 API).
nlp.add_pipe(countries_component)
print(nlp.pipe_names)


# Getter: look up the span's text in the country-capital dictionary.
# (PEP 8 E731: use a def instead of assigning a lambda to a name.)
def get_capital(span):
    """Return the capital for a country span, or None if unknown."""
    return CAPITALS.get(span.text)