def test_phrase_matcher_deprecated(en_vocab):
    """Consuming PhraseMatcher.pipe must emit a DeprecationWarning naming spaCy v3.0."""
    phrase_matcher = PhraseMatcher(en_vocab)
    phrase_matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
    target = Doc(en_vocab, words=["hello", "world"])
    with pytest.warns(DeprecationWarning) as warning_records:
        # pipe is a lazy generator; consume it so the warning actually fires
        list(phrase_matcher.pipe([target]))
    assert warning_records.list
    assert "spaCy v3.0" in str(warning_records.list[0].message)
def test_issue3410():
    """Every .pipe API must raise a deprecation warning when n_threads is passed."""
    sample_texts = ["Hello world", "This is a test"]
    pipeline = English()
    token_matcher = Matcher(pipeline.vocab)
    phrase_matcher = PhraseMatcher(pipeline.vocab)
    with pytest.deprecated_call():
        parsed = list(pipeline.pipe(sample_texts, n_threads=4))
    with pytest.deprecated_call():
        parsed = list(pipeline.tokenizer.pipe(sample_texts, n_threads=4))
    # Matcher first, then PhraseMatcher — same order as the original report.
    for deprecated_pipe in (token_matcher.pipe, phrase_matcher.pipe):
        with pytest.deprecated_call():
            list(deprecated_pipe(parsed, n_threads=4))
def trim_segments(all_doc_per):
    """Trim each person's text segment to the sentences before a repeat mention.

    Relies on a module-level spaCy pipeline ``nlp`` (not passed in) for the
    vocab, tokenization, and sentence boundaries.

    Parameters
    ----------
    all_doc_per : dict
        Maps person names (str) to their raw text segments (str).
        NOTE: this dict is modified IN PLACE — entries whose segment never
        mentions the name are deleted — and the same object is returned
        (the original docstring's claim of "a new dictionary" was wrong).

    Returns
    -------
    dict
        The same ``all_doc_per`` object with trimmed/removed segments.
    """
    matcher = PhraseMatcher(nlp.vocab)
    # One pattern per person name; pattern order mirrors dict insertion
    # order, so patterns[i] is the key for the i-th segment below.
    patterns = [nlp.make_doc(name) for name in all_doc_per]
    # spaCy v3 signature; the old form was add('Name', None, *patterns).
    matcher.add('Name', patterns)
    # n_threads is deprecated (nlp.pipe batches internally), and wrapping
    # the stream in matcher.pipe(...) was a deprecated no-op: it only
    # yields the docs unchanged while matcher(doc) does the real matching.
    all_segments = nlp.pipe(list(all_doc_per.values()))
    for doc_i, doc in enumerate(all_segments):
        key = patterns[doc_i].text
        matches = matcher(doc)
        persons = [(doc[start:end], start) for _match_id, start, end in matches]
        if not persons:
            # The name never appears in its own segment: drop the entry.
            del all_doc_per[key]
        elif len(persons) == 1:
            # Single mention: keep the whole segment untouched.
            all_doc_per[key] = doc.text
        else:
            sentences = list(doc.sents)
            # Mentions that start after the first sentence ends.
            later_mentions = [per for per in persons if per[1] > sentences[0].end]
            if not later_mentions:
                all_doc_per[key] = doc.text
            else:
                # Walk forward to the sentence containing the earliest
                # later mention, then keep only the sentences before it.
                # (later_mentions is non-empty and its token offsets lie
                # inside doc, so this terminates within bounds.)
                i = 1
                while later_mentions[0][1] > sentences[i].end:
                    i += 1
                all_doc_per[key] = ' '.join(s.text for s in sentences[:i])
    return all_doc_per