Example #1
import pytest

from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc


# `en_vocab` is a pytest fixture providing an English Vocab, as in spaCy's own test suite.
def test_phrase_matcher_deprecated(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
    doc = Doc(en_vocab, words=["hello", "world"])
    # PhraseMatcher.pipe is deprecated; iterating it should emit a DeprecationWarning.
    with pytest.warns(DeprecationWarning) as record:
        for _ in matcher.pipe([doc]):
            pass
        assert record.list
        assert "spaCy v3.0" in str(record.list[0].message)
Example #2
import pytest

from spacy.lang.en import English
from spacy.matcher import Matcher, PhraseMatcher


def test_issue3410():
    texts = ["Hello world", "This is a test"]
    nlp = English()
    matcher = Matcher(nlp.vocab)
    phrasematcher = PhraseMatcher(nlp.vocab)
    # The n_threads keyword argument is deprecated; each call below should warn.
    with pytest.deprecated_call():
        docs = list(nlp.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        list(matcher.pipe(docs, n_threads=4))
    with pytest.deprecated_call():
        list(phrasematcher.pipe(docs, n_threads=4))
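This test only asserts that the n_threads keyword triggers deprecation warnings. In current spaCy, parallelism on Language.pipe is requested with n_process instead; a minimal sketch (the n_process and batch_size values are arbitrary):

import spacy

nlp = spacy.blank("en")
texts = ["Hello world", "This is a test"]

# Parallelism is controlled via n_process; the old n_threads keyword has no effect.
docs = list(nlp.pipe(texts, n_process=1, batch_size=50))
print(len(docs))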
Example #4
from spacy.matcher import PhraseMatcher

# Assumes a module-level spaCy pipeline is available, e.g. nlp = spacy.load("en_core_web_sm")
# (doc.sents below requires a parser or sentencizer in the pipeline).


def trim_segments(all_doc_per):
    '''Assumes all_doc_per is a dictionary of person names and segments.
    Returns the dictionary with trimmed segments; entries whose name is not
    matched in their segment are dropped.'''

    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp.make_doc(name) for name in all_doc_per.keys()]
    matcher.add('Name', None, *patterns)  # spaCy v2-style signature; in v3 use matcher.add('Name', patterns)

    all_segments = nlp.pipe(list(all_doc_per.values()), n_threads=4)  # n_threads is deprecated and ignored
    for doc_i, doc in enumerate(matcher.pipe(all_segments)):
        matches = matcher(doc)
        persons = [(doc[start:end], start) for matcher_id, start, end in matches]

        if len(persons) == 0:
            # No tracked name found in this segment: drop the entry.
            del all_doc_per[patterns[doc_i].text]

        elif len(persons) == 1:
            # Exactly one mention: keep the whole segment.
            seg5 = doc.text
            all_doc_per[patterns[doc_i].text] = seg5

        else:
            # Several mentions: look for mentions that occur after the first sentence.
            sentences = list(doc.sents)
            persons_remove = [per for per in persons if per[1] > sentences[0].end]
            if not persons_remove:
                seg5 = doc.text
                all_doc_per[patterns[doc_i].text] = seg5
            else:
                # Walk forward to the sentence containing the first such mention...
                i = 1
                sent = sentences[i]
                while persons_remove[0][1] > sent.end:
                    i += 1
                    sent = sentences[i]
                # ...and keep only the sentences before it.
                seg5 = ' '.join([s.text for s in sentences[:i]])
                all_doc_per[patterns[doc_i].text] = seg5

    return all_doc_per
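A hypothetical driver for the function above, assuming it lives in the same module as a loaded nlp pipeline; the names and texts are invented for illustration:

import spacy

nlp = spacy.load("en_core_web_sm")  # provides a parser, so doc.sents works

segments = {
    "Ada Lovelace": "Ada Lovelace wrote the first program. Charles Babbage designed the engine.",
    "Charles Babbage": "Charles Babbage designed the analytical engine.",
}
trimmed = trim_segments(segments)
for name, seg in trimmed.items():
    print(name, "->", seg)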