def test_dependency_matcher_ops(en_vocab, doc, left, right, op, num_matches):
    right_id = right
    if left == right:
        right_id = right + "2"
    pattern = [
        {"RIGHT_ID": left, "RIGHT_ATTRS": {"LOWER": left}},
        {
            "LEFT_ID": left,
            "REL_OP": op,
            "RIGHT_ID": right_id,
            "RIGHT_ATTRS": {"LOWER": right},
        },
    ]

    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", [pattern])
    matches = matcher(doc)
    assert len(matches) == num_matches

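# The test above receives left/right/op/num_matches from a pytest
# parametrization that is not part of this excerpt. A minimal sketch of what
# it could look like -- the tuples below are illustrative assumptions, and
# the expected counts depend on the actual `doc` fixture:
import pytest


@pytest.mark.parametrize(
    "left,right,op,num_matches",
    [
        ("fox", "jumped", "<", 1),   # fox is an immediate dependent of jumped
        ("jumped", "fox", ">", 1),   # jumped is the immediate head of a fox token
        ("fox", "jumped", "<<", 1),  # fox is a descendant of jumped
    ],
)
def test_dependency_matcher_ops_sketch(en_vocab, doc, left, right, op, num_matches):
    ...  # same body as test_dependency_matcher_ops above
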
def test_dependency_matcher_span_user_data(en_tokenizer):
    doc = en_tokenizer("a b c d e")
    for token in doc:
        token.head = doc[0]
        token.dep_ = "a"
    get_is_c = lambda token: token.text in ("c",)
    Token.set_extension("is_c", default=False)
    doc[2]._.is_c = True
    pattern = [
        {"RIGHT_ID": "c", "RIGHT_ATTRS": {"_": {"is_c": True}}},
    ]
    matcher = DependencyMatcher(en_tokenizer.vocab)
    matcher.add("C", [pattern])
    doc_matches = matcher(doc)
    offset = 1
    span_matches = matcher(doc[offset:])
    for doc_match, span_match in zip(sorted(doc_matches), sorted(span_matches)):
        assert doc_match[0] == span_match[0]
        for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]):
            assert doc_t_i == span_t_i + offset

def test_issue4590(en_vocab):
    """Test that the matches passed to an on_match callback are the same as
    the matches returned when no on_match callback is set."""
    pattern = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]
    on_match = Mock()
    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", on_match, pattern)
    text = "The quick brown fox jumped over the lazy fox"
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
    matches = matcher(doc)
    on_match_args = on_match.call_args
    assert on_match_args[0][3] == matches

def match_prep_pattern(doc, nlp, tag_matches, notag_matches):
    """
    Update and return the ('word preposition word.tag': count) dictionary and
    the set of untagged trigrams, both built from dependency-parse syntactic
    relations.
    """
    matcher = DependencyMatcher(nlp.vocab)
    matcher.add("prep", [prep_pattern])
    count = 0
    # Iterate through all of the matches and add 1 to each trigram's count
    for match in matcher(doc):
        if count % 1000 == 0:
            print(f"Processing match # {count}")
        indices = match[1]
        # Write the named-entity label of the head token instead of the word
        # itself when a named entity exists
        if doc[indices[0]].ent_type_:
            head_word = doc[indices[0]].ent_type_
        else:
            head_word = doc[indices[0]].text.lower()
        tag_trigram = ' '.join(
            (head_word, doc[indices[1]].text.lower(), doc[indices[2]].tag_))
        if doc[indices[2]].ent_type_:
            dependent_notag = doc[indices[2]].ent_type_
        else:
            dependent_notag = doc[indices[2]].text.lower()
        notag_trigram = ' '.join(
            (head_word, doc[indices[1]].text.lower(), dependent_notag))
        notag_matches.add(notag_trigram)
        tag_matches[tag_trigram] = tag_matches.get(tag_trigram, 0) + 1
        count += 1
    return tag_matches, notag_matches

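# match_prep_pattern above and preposition_check further below both rely on a
# module-level `prep_pattern` whose definition is not included in these
# excerpts. A minimal sketch, assuming a head -> preposition -> object chain;
# the exact RIGHT_ATTRS are an assumption, but the node order must stay
# head, preposition, dependent to match the match[1] index usage above:
prep_pattern = [
    {"RIGHT_ID": "head", "RIGHT_ATTRS": {"POS": {"IN": ["NOUN", "VERB", "ADJ"]}}},
    {
        "LEFT_ID": "head",
        "REL_OP": ">",
        "RIGHT_ID": "preposition",
        "RIGHT_ATTRS": {"DEP": "prep"},
    },
    {
        "LEFT_ID": "preposition",
        "REL_OP": ">",
        "RIGHT_ID": "dependent",
        "RIGHT_ATTRS": {"DEP": "pobj"},
    },
]
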
def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
    # two sentences to test that all matches are within the same sentence
    doc = Doc(
        en_vocab,
        words=["a", "b", "c", "d", "e"] * 2,
        heads=[0, 0, 0, 0, 0, 5, 5, 5, 5, 5],
        deps=["dep"] * 10,
    )
    match_count = 0
    for text in ["a", "b", "c", "d", "e"]:
        pattern = [
            {"RIGHT_ID": "1", "RIGHT_ATTRS": {"ORTH": text}},
            {"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {}},
        ]
        matcher = DependencyMatcher(en_vocab)
        matcher.add("A", [pattern])
        matches = matcher(doc)
        match_count += len(matches)
        for match in matches:
            match_id, token_ids = match
            # token_ids[0] op token_ids[1]
            if op == ".":
                assert token_ids[0] == token_ids[1] - 1
            elif op == ";":
                assert token_ids[0] == token_ids[1] + 1
            elif op == ".*":
                assert token_ids[0] < token_ids[1]
            elif op == ";*":
                assert token_ids[0] > token_ids[1]
            # all tokens are within the same sentence
            assert doc[token_ids[0]].sent == doc[token_ids[1]].sent
    assert match_count == num_matches

def add_matches(
    vocab,
    patterns: List[str],
    lemmas=True,
    start_ents=None,
    end_ents=None,
    print_patterns=False,
):  # FIXME: passing these vars down is an awkward pattern (decorator?)
    """Converts "prevented|nsubj|START_ENTITY prevented|dobj|END_ENTITY" into
    a pattern that the DependencyMatcher class can use"""
    matcher = DependencyMatcher(vocab)
    for p in patterns:
        pattern = construct_pattern(
            p, lemmatize=lemmas, start_ents=start_ents, end_ents=end_ents
        )
        if print_patterns:
            print(pattern, p)
        try:
            # "object of type 'NoneType' has no len()" is raised, oddly, on
            # some of the 20k dep paths
            matcher.add(p, None, pattern)
        except TypeError:
            print("error with pattern", p, "-->", pattern)
            continue
    return matcher

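# For reference, the v2-style pattern construct_pattern would produce for the
# docstring's example can be inferred from the construct_pattern excerpt
# later in this collection; the PATTERN attrs here are an assumption:
example_pattern = [
    {"SPEC": {"NODE_NAME": "prevented"}, "PATTERN": {"ORTH": "prevented"}},
    {
        "SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "prevented"},
        "PATTERN": {"DEP": "nsubj"},
    },
    {
        "SPEC": {"NODE_NAME": "END_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "prevented"},
        "PATTERN": {"DEP": "dobj"},
    },
]
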
def test_dependency_matcher_long_matches(en_vocab, doc):
    pattern = [
        {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"DEP": "amod", "OP": "+"}},
    ]

    matcher = DependencyMatcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("pattern", [pattern])

def test_dependency_matcher_order_issue(en_tokenizer):
    # issue from #9263
    doc = en_tokenizer("I like text")
    doc[2].head = doc[1]

    # this matches on attrs but not rel op
    pattern1 = [
        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
        {
            "LEFT_ID": "root",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {"ORTH": "text"},
            "REL_OP": "<",
        },
    ]

    # this matches on rel op but not attrs
    pattern2 = [
        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
        {
            "LEFT_ID": "root",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {"ORTH": "fish"},
            "REL_OP": ">",
        },
    ]

    matcher = DependencyMatcher(en_tokenizer.vocab)

    # This should behave the same as the next pattern
    matcher.add("check", [pattern1, pattern2])
    matches = matcher(doc)
    assert matches == []

    # use a new matcher
    matcher = DependencyMatcher(en_tokenizer.vocab)
    # before the fix for #9263, adding the patterns one at a time under the
    # same label incorrectly produced a match
    matcher.add("check", [pattern1])
    matcher.add("check", [pattern2])
    matches = matcher(doc)
    assert matches == []

def dependency_matcher(en_vocab, patterns, doc):
    matcher = DependencyMatcher(en_vocab)
    mock = Mock()
    for i in range(1, len(patterns) + 1):
        if i == 1:
            matcher.add("pattern1", [patterns[0]], on_match=mock)
        else:
            matcher.add("pattern" + str(i), [patterns[i - 1]])
    return matcher

def get_dep_matcher(nlp, patterns, pattern_names=None) -> DependencyMatcher:
    """Add patterns with pattern_names to a new dependency matcher."""
    if pattern_names is None:
        pattern_names = ["pattern" + str(pi) for pi in range(len(patterns))]
    else:
        pattern_names = list(pattern_names)
    matcher = DependencyMatcher(nlp.vocab)
    for pi, pattern in enumerate(patterns):
        # print("pattern names: ", pattern_names[pi], pattern)
        matcher.add(pattern_names[pi], None, pattern)
    return matcher

def test_dependency_matcher_remove(en_tokenizer):
    # issue from #9263
    doc = en_tokenizer("The red book")
    doc[1].head = doc[2]

    # this matches
    pattern1 = [
        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "book"}},
        {
            "LEFT_ID": "root",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {"ORTH": "red"},
            "REL_OP": ">",
        },
    ]

    # add and then remove it
    matcher = DependencyMatcher(en_tokenizer.vocab)
    matcher.add("check", [pattern1])
    matcher.remove("check")

    # this pattern does not match the doc at all
    pattern2 = [
        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "flag"}},
        {
            "LEFT_ID": "root",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {"ORTH": "blue"},
            "REL_OP": ">",
        },
    ]

    # add the new pattern under the same label; it should not match
    matcher.add("check", [pattern2])
    matches = matcher(doc)
    assert matches == []

def dependency_matcher(en_vocab):
    def is_brown_yellow(text):
        return bool(re.compile(r"brown|yellow|over").match(text))

    IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)

    pattern1 = [
        {"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}},
        {
            "SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
            "PATTERN": {"ORTH": "quick", "DEP": "amod"},
        },
        {
            "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
            "PATTERN": {IS_BROWN_YELLOW: True},
        },
    ]

    pattern2 = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    pattern3 = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"},
            "PATTERN": {"ORTH": "brown"},
        },
    ]

    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern1", [pattern1])
    matcher.add("pattern2", [pattern2])
    matcher.add("pattern3", [pattern3])
    return matcher

def spacyDep(doc):
    matcher = DependencyMatcher(nlp.vocab, validate=True)
    pattern = [
        {
            "RIGHT_ID": "anchor_AUX",  # unique name
            "RIGHT_ATTRS": {"POS": "AUX"},  # token pattern for disaster
        },
        {
            "LEFT_ID": "anchor_AUX",
            "REL_OP": ">",
            "RIGHT_ID": "anchor_disaster",
            # Also need to check that it really is a disaster
            "RIGHT_ATTRS": {"DEP": "attr"},
        },
        {
            "LEFT_ID": "anchor_AUX",
            "REL_OP": ">",
            "RIGHT_ID": "AUX_prep",
            "RIGHT_ATTRS": {"DEP": "prep", "POS": "ADP"},
        },
        {
            "LEFT_ID": "AUX_prep",
            "REL_OP": ">",
            "RIGHT_ID": "pobj_prep",
            # Normally this is a city, so it needs to be retrieved
            "RIGHT_ATTRS": {"DEP": "pobj"},
        },
    ]
    matcher.add("DISASTER", [pattern])
    # displacy.serve(doc)
    matches = matcher(doc)
    if matches:  # guard against an IndexError when there is no match
        print(doc[matches[0][1][-1]].text)
    return bool(matches)

def spacyDep(doc):
    matcher = DependencyMatcher(nlp.vocab, validate=True)
    pattern = [
        {
            "RIGHT_ID": "anchor_AUX",  # unique name for the anchor label
            "RIGHT_ATTRS": {"POS": "AUX"},
        },
        {
            "LEFT_ID": "anchor_AUX",
            "REL_OP": ">",
            "RIGHT_ID": "anchor_disaster",
            "RIGHT_ATTRS": {"DEP": "attr"},
        },
        {
            "LEFT_ID": "anchor_AUX",
            "REL_OP": ">",
            "RIGHT_ID": "AUX_prep",
            "RIGHT_ATTRS": {"DEP": "prep", "POS": "ADP"},
        },
        {
            "LEFT_ID": "AUX_prep",
            "REL_OP": ">",
            "RIGHT_ID": "pobj_prep",
            "RIGHT_ATTRS": {"DEP": "pobj"},
        },
    ]
    matcher.add("DISASTER", [pattern])
    # displacy.serve(doc)
    matches = matcher(doc)
    if matches:
        return [True, doc[matches[0][1][-1]].text]
    else:
        return [False, []]

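# Hypothetical usage of the variant above, assuming an English pipeline is
# loaded as `nlp` (the example sentence is illustrative: "is" is tagged AUX,
# "hurricane" is its attr, and "in" > "Florida" is the prep > pobj chain):
found, place = spacyDep(nlp("There is a hurricane in Florida"))
if found:
    print(place)  # should print the pobj token, e.g. "Florida"
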
def test_dependency_matcher_callback(en_vocab, doc):
    pattern = [
        {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "quick"}},
    ]
    nomatch_pattern = [
        {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "NOMATCH"}},
    ]

    matcher = DependencyMatcher(en_vocab)
    mock = Mock()
    matcher.add("pattern", [pattern], on_match=mock)
    matcher.add("nomatch_pattern", [nomatch_pattern], on_match=mock)
    matches = matcher(doc)
    assert len(matches) == 1
    mock.assert_called_once_with(matcher, doc, 0, matches)

    # check that matches with and without callback are the same (#4590)
    matcher2 = DependencyMatcher(en_vocab)
    matcher2.add("pattern", [pattern])
    matches2 = matcher2(doc)
    assert matches == matches2

def test_dependency_matcher_pickle(en_vocab, patterns, doc):
    matcher = DependencyMatcher(en_vocab)
    for i in range(1, len(patterns) + 1):
        matcher.add("pattern" + str(i), [patterns[i - 1]])
    matches = matcher(doc)
    assert matches[0][1] == [3, 1, 2]
    assert matches[1][1] == [4, 3, 5]
    assert matches[2][1] == [4, 3, 2]
    assert matches[3][1] == [4, 3]
    assert matches[4][1] == [4, 3]
    assert matches[5][1] == [4, 8]

    b = pickle.dumps(matcher)
    matcher_r = pickle.loads(b)

    assert len(matcher) == len(matcher_r)
    matches = matcher_r(doc)
    assert matches[0][1] == [3, 1, 2]
    assert matches[1][1] == [4, 3, 5]
    assert matches[2][1] == [4, 3, 2]
    assert matches[3][1] == [4, 3]
    assert matches[4][1] == [4, 3]
    assert matches[5][1] == [4, 8]

    }
]

# a rare condition in which a *brain* develops without the cerebellum
inflected_verb_noun = [
    {
        'SPEC': {'NODE_NAME': 'inflected_verb'},
        'PATTERN': {'_': {'change': 'inflected'}}
    },
    {
        'SPEC': {'NBOR_NAME': 'inflected_verb', 'NBOR_RELOP': '>', 'NODE_NAME': 'node0'},
        'PATTERN': {'DEP': 'nsubj', 'TAG': {'IN': ['NN', 'NNP']}}
    }
]

noun_to_plural_matcher.add('noun_to_plural', None, root_noun)
noun_to_plural_matcher.add('root_subj', None, root_subj)
noun_to_plural_matcher.add('noun_conjunction', None, noun_conjunction)
noun_to_plural_matcher.add('nsubjpass', None, nsubjpass)
# noun_to_plural_matcher.add('inflected_verb_noun', None, inflected_verb_noun)

verb_inflection_matcher = DependencyMatcher(nlp.vocab)

# a rocket is -> rockets *are* [todo]
verb_inflection = [
    {
        'SPEC': {'NODE_NAME': 'pluralized_noun'},
        'PATTERN': {'_': {'change': 'pluralized_noun'}}
    },
    {
        'SPEC': {'NBOR_NAME': 'pluralized_noun', 'NBOR_RELOP': '>', 'NODE_NAME': 'node0'},

node["PATTERN"] = token_pattern pattern.append(node) add_node(child, pattern) pattern = [{"SPEC": {"NODE_NAME": root}, "PATTERN": {"ORTH": root}}] add_node(root, pattern) return pattern from spacy.matcher import DependencyMatcher example = [["founded", "nsubj", "START_ENTITY"], ["founded", "dobj", "END_ENTITY"]] pattern = construct_pattern(example) matcher = DependencyMatcher(nlp.vocab) matcher.add("pattern1", None, pattern) doc1 = nlp("Bill Gates founded Microsoft.") doc2 = nlp("Bill Gates, the Seattle Seahawks owner, founded Microsoft.") match = matcher(doc1)[0] subtree = match[1][0] visualise_subtrees(doc1, subtree) match = matcher(doc2)[0] subtree = match[1][0] visualise_subtrees(doc2, subtree)
def build_matcher(vocab, pattern_dict):
    matcher = DependencyMatcher(vocab)
    for name, pattern in pattern_dict.items():
        dep_pattern = pattern['spacy_dep_pattern']
        matcher.add(name, None, dep_pattern)
    return matcher

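# Hypothetical usage of build_matcher: each pattern_dict value is a dict
# holding a 'spacy_dep_pattern' entry (the name and the v2-style pattern
# content below are assumptions for illustration):
pattern_dict = {
    "verb_subject": {
        "spacy_dep_pattern": [
            {"SPEC": {"NODE_NAME": "verb"}, "PATTERN": {"POS": "VERB"}},
            {
                "SPEC": {"NODE_NAME": "subj", "NBOR_RELOP": ">", "NBOR_NAME": "verb"},
                "PATTERN": {"DEP": "nsubj"},
            },
        ]
    }
}
matcher = build_matcher(nlp.vocab, pattern_dict)
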
def search_out(doc, nlp):
    """Search for noun phrases with their modifiers

    Args:
        doc (spacy.tokens.Doc): doc to be analyzed
        nlp (spacy.language.Language): context language

    Returns:
        list: list of spacy.tokens.Span
    """
    result = []

    token_matcher = Matcher(nlp.vocab)
    dep_matcher = DependencyMatcher(nlp.vocab)

    token_patterns = [
        [{"POS": "NOUN"}],
        [{"POS": "PRON"}],
        [{"POS": "PROPN"}],
        [
            {"POS": "PROPN"},
            {"POS": "CCONJ", "DEP": "cd"},
            {"POS": "PROPN", "DEP": "cj"},
        ],
    ]
    token_matcher.add("token_NP", token_patterns)
    token_matches = token_matcher(doc)
    token_refined_matches = merge([(start, end) for _, start, end in token_matches])

    dep_patterns = [
        [
            {"RIGHT_ID": "noun", "RIGHT_ATTRS": {"POS": "NOUN"}},
            {
                "LEFT_ID": "noun",
                "REL_OP": ">",
                "RIGHT_ID": "noun_mod",
                "RIGHT_ATTRS": {
                    "DEP": {"IN": ["ag", "nk", "pg", "op"]},
                    "POS": {"NOT_IN": ["PART", "ADV", "PRON", "CCONJ"]},
                },
            },
        ],
        [
            {"RIGHT_ID": "pron", "RIGHT_ATTRS": {"POS": "PRON"}},
            {
                "LEFT_ID": "pron",
                "REL_OP": ">",
                "RIGHT_ID": "pron_mod",
                "RIGHT_ATTRS": {"DEP": {"IN": ["ag"]}, "POS": {"IN": ["NOUN"]}},
            },
        ],
        [
            {"RIGHT_ID": "noun", "RIGHT_ATTRS": {"POS": "NOUN"}},
            {
                "LEFT_ID": "noun",
                "REL_OP": ">",
                "RIGHT_ID": "noun_mod",
                "RIGHT_ATTRS": {
                    "DEP": "mnr",
                    "LOWER": {
                        "IN": ["an", "nach", "vom", "von", "auf", "zu", "zur", "zwischen", "aus"]
                    },
                },
            },
        ],
        [
            {"RIGHT_ID": "propn", "RIGHT_ATTRS": {"POS": "PROPN"}},
            {
                "LEFT_ID": "propn",
                "REL_OP": ">",
                "RIGHT_ID": "propn_mod",
                "RIGHT_ATTRS": {
                    "DEP": {"IN": ["pnc", "nk"]},
                    "POS": {"IN": ["NOUN", "DET", "PROPN"]},
                },
            },
        ],
    ]
    dep_matcher.add("dep_NP", dep_patterns)
    dep_matches = dep_matcher(doc)
    dep_refined_matches = []
    for _, (noun, desp) in dep_matches:
        desp_tree = [e.i for e in doc[desp].subtree]
        # the modifier's subtree must be contiguous
        length_valid = len(desp_tree) == max(desp_tree) - min(desp_tree) + 1
        # the head noun must fall inside one of the token-level matches
        noun_valid = any(
            start <= noun <= end for start, end in token_refined_matches
        )
        if length_valid and noun_valid:
            desp_tree.append(noun)
            desp_tree.sort()
            dep_refined_matches.append((min(desp_tree), max(desp_tree) + 1))

    matches = token_refined_matches + dep_refined_matches
    refined_matches = merge(matches)
    for start, end in refined_matches:
        if end - start > 1 and all(e.pos_ != "PUNCT" for e in doc[start:end]):
            result.append(doc[start:end])
    return result

def dependency_matcher(en_vocab):
    def is_brown_yellow(text):
        return bool(re.compile(r"brown|yellow|over").match(text))

    IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)

    pattern1 = [
        {"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}},
        {
            "SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
            "PATTERN": {"ORTH": "quick", "DEP": "amod"},
        },
        {
            "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
            "PATTERN": {IS_BROWN_YELLOW: True},
        },
    ]

    pattern2 = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    pattern3 = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"},
            "PATTERN": {"ORTH": "brown"},
        },
    ]

    # pattern that doesn't match
    pattern4 = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "NOMATCH"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"},
            "PATTERN": {"ORTH": "brown"},
        },
    ]

    matcher = DependencyMatcher(en_vocab)
    on_match = Mock()
    matcher.add("pattern1", [pattern1], on_match=on_match)
    matcher.add("pattern2", [pattern2], on_match=on_match)
    matcher.add("pattern3", [pattern3], on_match=on_match)
    matcher.add("pattern4", [pattern4], on_match=on_match)
    assert len(matcher) == 4

    text = "The quick brown fox jumped over the lazy fox"
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"]
    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
    matches = matcher(doc)
    assert len(matches) == 3
    assert matches[0][1] == [[3, 1, 2]]
    assert matches[1][1] == [[4, 3, 3]]
    assert matches[2][1] == [[4, 3, 2]]
    assert on_match.call_count == 3

def test_dependency_matcher_pattern_validation(en_vocab):
    pattern = [
        {"RIGHT_ID": "fox", "RIGHT_ATTRS": {"ORTH": "fox"}},
        {
            "LEFT_ID": "fox",
            "REL_OP": ">",
            "RIGHT_ID": "q",
            "RIGHT_ATTRS": {"ORTH": "quick", "DEP": "amod"},
        },
        {
            "LEFT_ID": "fox",
            "REL_OP": ">",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {"ORTH": "brown"},
        },
    ]

    matcher = DependencyMatcher(en_vocab)
    # original pattern is valid
    matcher.add("FOUNDED", [pattern])
    # individual pattern not wrapped in a list
    with pytest.raises(ValueError):
        matcher.add("FOUNDED", pattern)
    # no anchor node
    with pytest.raises(ValueError):
        matcher.add("FOUNDED", [pattern[1:]])
    # required keys missing
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        del pattern2[0]["RIGHT_ID"]
        matcher.add("FOUNDED", [pattern2])
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        del pattern2[1]["RIGHT_ID"]
        matcher.add("FOUNDED", [pattern2])
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        del pattern2[1]["RIGHT_ATTRS"]
        matcher.add("FOUNDED", [pattern2])
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        del pattern2[1]["LEFT_ID"]
        matcher.add("FOUNDED", [pattern2])
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        del pattern2[1]["REL_OP"]
        matcher.add("FOUNDED", [pattern2])
    # invalid operator
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        pattern2[1]["REL_OP"] = "!!!"
        matcher.add("FOUNDED", [pattern2])
    # duplicate node name
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        pattern2[1]["RIGHT_ID"] = "fox"
        matcher.add("FOUNDED", [pattern2])

)
model_load_state = st.info(f"Loading model '{spacy_model}'...")
nlp = load_model(spacy_model)
model_load_state.empty()

st.sidebar.subheader("spaCy pipeline:")
desc = f"""<p style="font-size: 0.85em; line-height: 1.5"><strong>{spacy_model}:</strong> <code>v{nlp.meta['version']}</code></p>"""
st.sidebar.markdown(desc, unsafe_allow_html=True)

# Initialize Matcher Generator
matcher = DependencyMatcher(nlp.vocab)
with open('matcherPatterns.pickle', 'rb') as fp:
    pattern_dict = pickle.load(fp)
for coltype, pattern in pattern_dict.items():
    matcher.add(coltype, pattern)

# Text Box
default_text = "You can enter some sentences here to see their dependency relationships. In the sidebar, you can choose which spaCy pipeline to use. Hit ctrl + enter to give it a whirl (and check out how each parser handles the first phrase of this sentence)."
st.title("Collocation Extractor")
text = st.text_area("Text to analyze", default_text, height=200)

# Process text, then retokenize with collapsed punctuation, then split into sentence docs
doc = process_text(spacy_model, text)


def my_spans(doc):
    spans = []
    for word in doc[:-1]:
        if word.is_punct or not word.nbor(1).is_punct:
            continue

        'REL_OP': '>',
        'RIGHT_ID': "update_object",
        'RIGHT_ATTRS': {"DEP": "dobj"},
    },
    {
        "LEFT_ID": "update_object",
        "REL_OP": ">",
        "RIGHT_ID": "update_object_modifier",
        "RIGHT_ATTRS": {"DEP": {"IN": ["amod", "compound"]}},
    },
]

dep_matcher.add('ADD_Pattern', [dep_pattern_add])
dep_matcher.add('AMEND_Pattern', [dep_pattern_amend])
dep_matcher.add('UPDATE_Pattern', [dep_pattern_update])

dep_matches_add = dep_matcher(doc_dep)
dep_matches_title = dep_matcher(doc_dep_title)

for match_id, match in dep_matches_add:
    for word in match:
        dependency_word = doc_dep[word]
        if str(dependency_word) != 'add' and str(dependency_word) != 'adding':
            df_dependency = df_dependency.append(
                {
                    'words': str(dependency_word),
                    'Ticket_id': row['Key'],
                    'Business Function': row['Business Function'],
                    'Summary': row['Summary'],

def preposition_check(self, sent_pack):
    """
    Detect and correct errors in the choice of preposition, appending M2
    corrections to sent_pack. This is done with the help of two datasets:
    1) trigram counts with tags in them, in the following form:
       'word/entity + preposition + word.tag'
    2) trigrams without tags ('notag'), as follows:
       'word/entity + preposition + word/entity'
    """
    tag_matches = load_tag_ngram_file(
        os.path.join(self.data_folder, self.prep_tag_trigrams_file))
    notag_matches = load_notag_ngram_filename(
        os.path.join(self.data_folder, self.prep_notag_trigrams_file))
    matcher = DependencyMatcher(self.nlp.vocab)
    matcher.add("prep", [prep_pattern])
    sent = sent_pack['corrected']
    for match in matcher(sent):
        if sent[match[1][0]].ent_type_:
            head = sent[match[1][0]].ent_type_
        else:
            head = sent[match[1][0]].text.lower()
        preposition = sent[match[1][1]]
        dependent = sent[match[1][2]]
        tag_key = ' '.join([head, preposition.text.lower(), dependent.tag_])
        # Get the entity tag of a token if possible (to generalise)
        if dependent.ent_type_:
            dependent_text = dependent.ent_type_
        else:
            dependent_text = dependent.text.lower()
        notag_key = ' '.join((head, preposition.text.lower(), dependent_text))
        count = tag_matches.get(tag_key, 0)
        # If the given preposition with its context is found neither in
        # tagged nor non-tagged data
        if count == 0 and notag_key not in notag_matches:
            tag_keys = tag_matches.keys()
            regex_notag_prep_finder = re.compile(
                f"{head} .* {dependent_text}")
            relevant_notag_keys = list(
                filter(regex_notag_prep_finder.match, notag_matches))
            if not relevant_notag_keys:
                regex_tag_prep_finder = re.compile(
                    f'{head} .* {dependent.tag_}')
                relevant_tag_keys = list(
                    filter(regex_tag_prep_finder.match, tag_keys))
                relevant_tag_keys.sort(key=lambda x: tag_matches[x])
                if not relevant_tag_keys:
                    continue
                # FOR FUTURE: could actually suggest several prepositions (TODO)
                most_relevant = relevant_tag_keys[-1]
            else:
                # Taking the first trigram arbitrarily. Could instead choose
                # the one with the largest count (TODO)
                most_relevant = relevant_notag_keys[0]
            correct_prep = most_relevant.split(' ')[1]
            correction = (
                f'A {preposition.i} {preposition.i + 1}|||Prep|||'
                f'{correct_prep}|||REQUIRED|||-NONE-|||0'
            )
            sent_pack['corrections'].append(correction)
            sent_pack['corrected'] = self.token_replace(
                sent_pack['corrected'], correct_prep, preposition.i)
    return sent_pack

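# Illustrative shape of the two trigram datasets described in the docstring
# above; the entries are made-up examples (not real counts), matching how
# match_prep_pattern builds these structures earlier in this collection:
tag_matches = {"interested in NN": 42, "arrive in GPE": 17}  # trigram -> count
notag_matches = {"interested in football", "arrive in GPE"}  # set of trigrams
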
def search_out(doc, nlp):
    """Search for noun phrases with their modifiers

    Args:
        doc (spacy.tokens.Doc): doc to be analyzed
        nlp (spacy.language.Language): context language

    Returns:
        list: list of spacy.tokens.Span
    """
    result = []

    token_matcher = Matcher(nlp.vocab)
    dep_matcher = DependencyMatcher(nlp.vocab)

    token_patterns = [
        [{"LOWER": {"IN": ["más", "menos"]}}, {"POS": "NOUN"}, {"LOWER": "que"}, {}],
        [{"POS": "NOUN"}, {"POS": "PRON", "DEP": "compound"}],
        [
            {"POS": "DET", "DEP": {"IN": ["det", "amod"]}, "OP": "*"},
            {"POS": "NUM", "DEP": "nummod", "OP": "?"},
            {"POS": "ADJ", "DEP": "amod", "OP": "*"},
            {"POS": "NOUN"},
        ],
        [
            {"POS": "DET", "DEP": {"IN": ["det", "amod"]}, "OP": "*"},
            {"POS": "ADV", "DEP": "advmod", "OP": "?"},
            {"POS": "ADJ", "DEP": "amod", "OP": "+"},
            {"POS": "NOUN"},
        ],
    ]
    token_matcher.add("token_NP", token_patterns)
    token_matches = token_matcher(doc)
    token_refined_matches = merge([(start, end) for _, start, end in token_matches])

    dep_patterns = [
        [
            {"RIGHT_ID": "noun", "RIGHT_ATTRS": {"POS": "NOUN"}},
            {
                "LEFT_ID": "noun",
                "REL_OP": ">",
                "RIGHT_ID": "mod",
                "RIGHT_ATTRS": {"DEP": {"IN": ["nmod", "amod"]}},
            },
        ],
    ]
    dep_matcher.add("dep_NP", dep_patterns)
    dep_matches = dep_matcher(doc)
    dep_refined_matches = []
    for _, (noun, desp) in dep_matches:
        desp_tree = [e.i for e in doc[desp].subtree]
        # the modifier's subtree must be contiguous
        length_valid = len(desp_tree) == max(desp_tree) - min(desp_tree) + 1
        # the head noun must fall inside one of the token-level matches
        noun_valid = any(
            start <= noun <= end for start, end in token_refined_matches
        )
        if length_valid and noun_valid and noun < desp:
            desp_tree.append(noun)
            desp_tree.sort()
            dep_refined_matches.append((min(desp_tree), max(desp_tree) + 1))

    matches = token_refined_matches + dep_refined_matches
    refined_matches = merge(matches)
    for start, end in refined_matches:
        if end - start > 1 and all(e.pos_ != "PUNCT" for e in doc[start:end]):
            result.append(doc[start:end])
    return result

def build_matcher(vocab, pattern):
    matcher = DependencyMatcher(vocab)
    matcher.add('pattern', None, pattern)
    return matcher

            }
        }
    },
    # subject should be 'I'
    {
        'LEFT_ID': 'family_member',
        'REL_OP': '>',
        'RIGHT_ID': 'proper_name',
        'RIGHT_ATTRS': {'DEP': 'appos', 'POS': 'PROPN'}
    }
]

matcher.add("born_date", [born_date_pattern])
matcher.add("born_place", [born_place_pattern])
matcher.add("family_member_name", [family_member_name_pattern])


def get_birth_family_details(narrative: str) -> (list, list, dict):
    """
    Use spaCy Dependency Parsing and rules-based matching to get the birth
    date and place details, and family member names from a narrative.

    :param narrative: String holding the narrative
    :return: Two lists, where the first contains the tokens related to date
        and the second contains the tokens related to location, and a
        dictionary containing the names of family members and their
        relationship to the narrator/subject
    """
    logging.info('Getting birth and family data from narrative')

# INIT SPACY MODEL
nlp = spacy.load("fr_core_news_lg")
nlp.add_pipe("merge_noun_chunks")

# DATA
df_data = pd.read_csv(args.dataset_fn, dtype={"authorZipCode": str}).fillna("")

# PREPARE MATCHER
matcher = DependencyMatcher(nlp.vocab)
prop = enjeux.Proposition()
observ = enjeux.ObservationObjectif()
sit_perso = SituationPersonnelle()
for pat_key, pat_value in observ.get_patterns().items():
    matcher.add("OBSERVATION|{0}".format(pat_key), [pat_value])
for pat_key, pat_value in prop.get_patterns().items():
    matcher.add("PROPOSITION|{0}".format(pat_key), [pat_value])
for pat_key, pat_value in sit_perso.get_patterns().items():
    matcher.add("SITUATIONPERSONNELLE|{0}".format(pat_key), [pat_value])

for question, pattern_idx in question_patterns[args.dataset_code].items():
    if not {1, 2, 3}.intersection(pattern_idx):
        # questions 4 and 5 have no pattern attached
        continue
    results = []
    id_ = df_data["id"].values
    zipCode = df_data["authorZipCode"].values
    data_question = df_data[question].values