def test_dependency_matcher_ops(en_vocab, doc, left, right, op, num_matches):
    right_id = right
    if left == right:
        right_id = right + "2"
    pattern = [
        {
            "RIGHT_ID": left,
            "RIGHT_ATTRS": {
                "LOWER": left
            }
        },
        {
            "LEFT_ID": left,
            "REL_OP": op,
            "RIGHT_ID": right_id,
            "RIGHT_ATTRS": {
                "LOWER": right
            },
        },
    ]

    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", [pattern])
    matches = matcher(doc)
    assert len(matches) == num_matches
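# The fixtures and parametrization driving the test above are not shown in this
# snippet. A minimal sketch of what they might look like; the parse and the
# expected counts below are illustrative assumptions, not taken from the source.
import pytest
from spacy.tokens import Doc


@pytest.fixture
def doc(en_vocab):
    # Hypothetical parse of a simple sentence, with absolute head indices.
    words = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "fox"]
    heads = [3, 3, 3, 4, 4, 4, 8, 8, 5]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
    return Doc(en_vocab, words=words, heads=heads, deps=deps)


# Would be applied as @pytest.mark.parametrize("left,right,op,num_matches", PARAMS);
# the counts are computed for the illustrative parse above.
PARAMS = [
    ("fox", "jumped", "<", 1),   # only the nsubj "fox" is a direct dependent of "jumped"
    ("jumped", "fox", ">", 1),   # "jumped" directly governs just one "fox"
    ("jumped", "fox", ">>", 2),  # both "fox" tokens are descendants of "jumped"
]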
def test_dependency_matcher_span_user_data(en_tokenizer):
    doc = en_tokenizer("a b c d e")
    for token in doc:
        token.head = doc[0]
        token.dep_ = "a"
    get_is_c = lambda token: token.text in ("c", )
    Token.set_extension("is_c", default=False)
    doc[2]._.is_c = True
    pattern = [
        {
            "RIGHT_ID": "c",
            "RIGHT_ATTRS": {
                "_": {
                    "is_c": True
                }
            }
        },
    ]
    matcher = DependencyMatcher(en_tokenizer.vocab)
    matcher.add("C", [pattern])
    doc_matches = matcher(doc)
    offset = 1
    span_matches = matcher(doc[offset:])
    for doc_match, span_match in zip(sorted(doc_matches),
                                     sorted(span_matches)):
        assert doc_match[0] == span_match[0]
        for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]):
            assert doc_t_i == span_t_i + offset
Example 3
def test_issue4590(en_vocab):
    """Test that matches param in on_match method are the same as matches run with no on_match method"""
    pattern = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    on_match = Mock()

    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", on_match, pattern)

    text = "The quick brown fox jumped over the lazy fox"
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]

    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)

    matches = matcher(doc)

    on_match_args = on_match.call_args

    assert on_match_args[0][3] == matches
Example 4
def match_prep_pattern(doc, nlp, tag_matches, notag_matches):
    """ Returns a a dictionary of ('word preposition wordtag'): count.
        The dictionary is created from dependency parse tree syntactic relations.
    """

    matcher = DependencyMatcher(nlp.vocab)

    matcher.add("prep", [prep_pattern])
    count = 0
    for match in matcher(doc):  # iterate over all matches and count each trigram
        if count % 1000 == 0:
            print(f"Processing match # {count}")
        indices = match[1]

        # Use the head token's entity label instead of its text if it is part of a named entity
        if doc[indices[0]].ent_type_:
            head_word = doc[indices[0]].ent_type_
        else:
            head_word = doc[indices[0]].text.lower()

        tag_trigram = ' '.join(
            (head_word, doc[indices[1]].text.lower(), doc[indices[2]].tag_))
        if doc[indices[2]].ent_type_:
            dependent_notag = doc[indices[2]].ent_type_
        else:
            dependent_notag = doc[indices[2]].text.lower()
        notag_trigram = ' '.join(
            (head_word, doc[indices[1]].text.lower(), dependent_notag))
        notag_matches.add(notag_trigram)
        tag_matches[tag_trigram] = tag_matches.get(tag_trigram, 0) + 1
        count += 1

    return tag_matches, notag_matches
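# The prep_pattern referenced above is defined elsewhere in the project; a minimal
# sketch of what such a three-node head > preposition > object pattern might look
# like (node names and attributes here are assumptions, not from the source).
prep_pattern = [
    {"RIGHT_ID": "head", "RIGHT_ATTRS": {"POS": {"IN": ["NOUN", "VERB", "ADJ"]}}},
    {
        "LEFT_ID": "head",
        "REL_OP": ">",
        "RIGHT_ID": "prep",
        "RIGHT_ATTRS": {"DEP": "prep"},
    },
    {
        "LEFT_ID": "prep",
        "REL_OP": ">",
        "RIGHT_ID": "dependent",
        "RIGHT_ATTRS": {"DEP": "pobj"},
    },
]
# With this ordering, match[1] yields [head, preposition, dependent] token indices,
# which is how the indices are consumed above.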
def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
    # two sentences to test that all matches are within the same sentence
    doc = Doc(
        en_vocab,
        words=["a", "b", "c", "d", "e"] * 2,
        heads=[0, 0, 0, 0, 0, 5, 5, 5, 5, 5],
        deps=["dep"] * 10,
    )
    match_count = 0
    for text in ["a", "b", "c", "d", "e"]:
        pattern = [
            {"RIGHT_ID": "1", "RIGHT_ATTRS": {"ORTH": text}},
            {"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {}},
        ]
        matcher = DependencyMatcher(en_vocab)
        matcher.add("A", [pattern])
        matches = matcher(doc)
        match_count += len(matches)
        for match in matches:
            match_id, token_ids = match
            # token_ids[0] op token_ids[1]
            if op == ".":
                assert token_ids[0] == token_ids[1] - 1
            elif op == ";":
                assert token_ids[0] == token_ids[1] + 1
            elif op == ".*":
                assert token_ids[0] < token_ids[1]
            elif op == ";*":
                assert token_ids[0] > token_ids[1]
            # all tokens are within the same sentence
            assert doc[token_ids[0]].sent == doc[token_ids[1]].sent
    assert match_count == num_matches
Example 6
def add_matches(
    vocab,
    patterns: List[str],
    lemmas=True,
    start_ents=None,
    end_ents=None,
    print_patterns=False,
):
    # BAAAAD PATTERN! PASSING DOWN VARS (DECORATOR?)
    """Converts "prevented|nsubj|START_ENTITY prevented|dobj|END_ENTITY"
    into a pattern that DependencyMatcher class can use"""
    matcher = DependencyMatcher(vocab)
    for p in patterns:
        pattern = construct_pattern(p,
                                    lemmatize=lemmas,
                                    start_ents=start_ents,
                                    end_ents=end_ents)
        if print_patterns:
            print(pattern, p)
        try:  # object of type 'NoneType' has no len() weirdly on some of the 20k dep paths
            matcher.add(p, None, pattern)
        except Exception:
            print("error with pattern", p, "-->", pattern)
            continue
    return matcher
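# construct_pattern is not shown here. Given that matcher.add(p, None, pattern) is
# the spaCy v2 signature, a pattern built for "prevented|nsubj|START_ENTITY
# prevented|dobj|END_ENTITY" might look roughly like this (an illustrative
# assumption, not the original implementation):
example_pattern = [
    {"SPEC": {"NODE_NAME": "prevented"}, "PATTERN": {"ORTH": "prevented"}},
    {
        "SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "prevented"},
        "PATTERN": {"DEP": "nsubj"},
    },
    {
        "SPEC": {"NODE_NAME": "END_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "prevented"},
        "PATTERN": {"DEP": "dobj"},
    },
]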
def test_dependency_matcher_long_matches(en_vocab, doc):
    pattern = [
        {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"DEP": "amod", "OP": "+"}},
    ]

    matcher = DependencyMatcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("pattern", [pattern])
Example 8
def test_dependency_matcher_order_issue(en_tokenizer):
    # issue from #9263
    doc = en_tokenizer("I like text")
    doc[2].head = doc[1]

    # this matches on attrs but not rel op
    pattern1 = [
        {
            "RIGHT_ID": "root",
            "RIGHT_ATTRS": {
                "ORTH": "like"
            }
        },
        {
            "LEFT_ID": "root",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {
                "ORTH": "text"
            },
            "REL_OP": "<",
        },
    ]

    # this matches on rel op but not attrs
    pattern2 = [
        {
            "RIGHT_ID": "root",
            "RIGHT_ATTRS": {
                "ORTH": "like"
            }
        },
        {
            "LEFT_ID": "root",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {
                "ORTH": "fish"
            },
            "REL_OP": ">",
        },
    ]

    matcher = DependencyMatcher(en_tokenizer.vocab)

    # Adding both patterns together should behave the same as adding them one at a time below
    matcher.add("check", [pattern1, pattern2])
    matches = matcher(doc)

    assert matches == []

    # use a new matcher
    matcher = DependencyMatcher(en_tokenizer.vocab)
    # adding them one at a time under the same label should also produce no matches
    matcher.add("check", [pattern1])
    matcher.add("check", [pattern2])
    matches = matcher(doc)

    assert matches == []
def dependency_matcher(en_vocab, patterns, doc):
    matcher = DependencyMatcher(en_vocab)
    mock = Mock()
    for i in range(1, len(patterns) + 1):
        if i == 1:
            matcher.add("pattern1", [patterns[0]], on_match=mock)
        else:
            matcher.add("pattern" + str(i), [patterns[i - 1]])

    return matcher
def get_dep_matcher(nlp, patterns, pattern_names=None) -> object:
    """ Add patterns with pattern_names to the dependency matcher """
    if pattern_names is None:
        pattern_names = ["pattern" + str(pi) for pi in range(len(patterns))]
    else:
        pattern_names = list(pattern_names)
    matcher = DependencyMatcher(nlp.vocab)
    for pi, pattern in enumerate(patterns):
        #         print("pattern names: ", pattern_names[pi], pattern)
        matcher.add(pattern_names[pi], None, pattern)
    return matcher
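A hypothetical call with a single spaCy v2-style pattern, assuming a loaded pipeline in nlp (the matcher.add(name, None, pattern) signature used above is the v2 API; the pattern content below is an assumption):

verb_subject = [
    {"SPEC": {"NODE_NAME": "verb"}, "PATTERN": {"POS": "VERB"}},
    {
        "SPEC": {"NODE_NAME": "subj", "NBOR_RELOP": ">", "NBOR_NAME": "verb"},
        "PATTERN": {"DEP": "nsubj"},
    },
]
matcher = get_dep_matcher(nlp, [verb_subject], pattern_names=["verb_subject"])
matches = matcher(nlp("She reads books."))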
Example 11
def test_dependency_matcher_remove(en_tokenizer):
    # issue from #9263
    doc = en_tokenizer("The red book")
    doc[1].head = doc[2]

    # this matches
    pattern1 = [
        {
            "RIGHT_ID": "root",
            "RIGHT_ATTRS": {
                "ORTH": "book"
            }
        },
        {
            "LEFT_ID": "root",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {
                "ORTH": "red"
            },
            "REL_OP": ">",
        },
    ]

    # add and then remove it
    matcher = DependencyMatcher(en_tokenizer.vocab)
    matcher.add("check", [pattern1])
    matcher.remove("check")

    # this pattern does not match the doc at all
    pattern2 = [
        {
            "RIGHT_ID": "root",
            "RIGHT_ATTRS": {
                "ORTH": "flag"
            }
        },
        {
            "LEFT_ID": "root",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {
                "ORTH": "blue"
            },
            "REL_OP": ">",
        },
    ]

    # Add the new pattern under the same label; it should not match
    matcher.add("check", [pattern2])
    matches = matcher(doc)

    assert matches == []
def dependency_matcher(en_vocab):
    def is_brown_yellow(text):
        return bool(re.compile(r"brown|yellow|over").match(text))

    IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)

    pattern1 = [
        {"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}},
        {
            "SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
            "PATTERN": {"ORTH": "quick", "DEP": "amod"},
        },
        {
            "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
            "PATTERN": {IS_BROWN_YELLOW: True},
        },
    ]

    pattern2 = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    pattern3 = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"},
            "PATTERN": {"ORTH": "brown"},
        },
    ]

    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern1", [pattern1])
    matcher.add("pattern2", [pattern2])
    matcher.add("pattern3", [pattern3])

    return matcher
def spacyDep(doc):
    matcher = DependencyMatcher(nlp.vocab, validate=True)
    pattern = [
        {
            "RIGHT_ID": "anchor_AUX",  #unique name
            "RIGHT_ATTRS": {
                "POS": "AUX"
            }  #token pattern for disaster
        },
        {
            "LEFT_ID": "anchor_AUX",
            "REL_OP": ">",
            "RIGHT_ID":
            "anchor_disaster",  ##Il faut aussi vérifier que c'est bien un désastre
            "RIGHT_ATTRS": {
                "DEP": "attr"
            }
        },
        {
            "LEFT_ID": "anchor_AUX",
            "REL_OP": ">",
            "RIGHT_ID": "AUX_prep",
            "RIGHT_ATTRS": {
                "DEP": "prep",
                "POS": "ADP"
            }
        },
        {
            "LEFT_ID": "AUX_prep",
            "REL_OP": ">",
            "RIGHT_ID": "pobj_prep",
            "RIGHT_ATTRS": {
                "DEP": "pobj"
            }  ## Normally this is a city, so we need to retrieve it
        }
    ]
    matcher.add("DISASTER", [pattern])
    #displacy.serve(doc)
    matches = matcher(doc)
    if matches:
        print(doc[matches[0][1][-1]].text)
    #print(doc[3].dep_)
    #([print(str(name)+" : "+str(thing)) for name,thing in inspect.getmembers(doc[
    return bool(matches)
Example 14
def spacyDep(doc):
    matcher = DependencyMatcher(nlp.vocab, validate=True)
    pattern = [
        {
            "RIGHT_ID": "anchor_AUX",  #Nom unique pour le label d'ancrage
            "RIGHT_ATTRS": {
                "POS": "AUX"
            }
        },
        {
            "LEFT_ID": "anchor_AUX",
            "REL_OP": ">",
            "RIGHT_ID": "anchor_disaster",
            "RIGHT_ATTRS": {
                "DEP": "attr"
            }
        },
        {
            "LEFT_ID": "anchor_AUX",
            "REL_OP": ">",
            "RIGHT_ID": "AUX_prep",
            "RIGHT_ATTRS": {
                "DEP": "prep",
                "POS": "ADP"
            }
        },
        {
            "LEFT_ID": "AUX_prep",
            "REL_OP": ">",
            "RIGHT_ID": "pobj_prep",
            "RIGHT_ATTRS": {
                "DEP": "pobj"
            }
        }
    ]
    matcher.add("DISASTER", [pattern])
    #displacy.serve(doc)
    matches = matcher(doc)
    if matches:
        return [True, doc[matches[0][1][-1]].text]
    else:
        return [False, []]
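A hypothetical call, assuming an English pipeline has been loaded into the module-level nlp that both spacyDep variants rely on; the sentence is illustrative and whether a match fires depends entirely on its parse:

import spacy

nlp = spacy.load("en_core_web_sm")
found, place = spacyDep(nlp("There was an earthquake in Japan."))
if found:
    print("extracted location candidate:", place)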
Example 15
def test_dependency_matcher_callback(en_vocab, doc):
    pattern = [
        {
            "RIGHT_ID": "quick",
            "RIGHT_ATTRS": {
                "ORTH": "quick"
            }
        },
    ]
    nomatch_pattern = [
        {
            "RIGHT_ID": "quick",
            "RIGHT_ATTRS": {
                "ORTH": "NOMATCH"
            }
        },
    ]

    matcher = DependencyMatcher(en_vocab)
    mock = Mock()
    matcher.add("pattern", [pattern], on_match=mock)
    matcher.add("nomatch_pattern", [nomatch_pattern], on_match=mock)
    matches = matcher(doc)
    assert len(matches) == 1
    mock.assert_called_once_with(matcher, doc, 0, matches)

    # check that matches with and without callback are the same (#4590)
    matcher2 = DependencyMatcher(en_vocab)
    matcher2.add("pattern", [pattern])
    matches2 = matcher2(doc)
    assert matches == matches2
Example 16
def test_dependency_matcher_pickle(en_vocab, patterns, doc):
    matcher = DependencyMatcher(en_vocab)
    for i in range(1, len(patterns) + 1):
        matcher.add("pattern" + str(i), [patterns[i - 1]])

    matches = matcher(doc)
    assert matches[0][1] == [3, 1, 2]
    assert matches[1][1] == [4, 3, 5]
    assert matches[2][1] == [4, 3, 2]
    assert matches[3][1] == [4, 3]
    assert matches[4][1] == [4, 3]
    assert matches[5][1] == [4, 8]

    b = pickle.dumps(matcher)
    matcher_r = pickle.loads(b)

    assert len(matcher) == len(matcher_r)
    matches = matcher_r(doc)
    assert matches[0][1] == [3, 1, 2]
    assert matches[1][1] == [4, 3, 5]
    assert matches[2][1] == [4, 3, 2]
    assert matches[3][1] == [4, 3]
    assert matches[4][1] == [4, 3]
    assert matches[5][1] == [4, 8]
Example 17
    }
]

# a rare condition in which a *brain* develops without the cerebellum 
inflected_verb_noun = [
    {
        'SPEC': {'NODE_NAME': 'inflected_verb'}, 
        'PATTERN': {'_': {'change': 'inflected'}}
    },
    {
        'SPEC': {'NBOR_NAME': 'inflected_verb', 'NBOR_RELOP': '>', 'NODE_NAME': 'node0'},
        'PATTERN': {'DEP': 'nsubj', 'TAG': {'IN': ['NN', 'NNP']}}
    }
]

noun_to_plural_matcher.add('noun_to_plural', None, root_noun)
noun_to_plural_matcher.add('root_subj', None, root_subj)
noun_to_plural_matcher.add('noun_conjunction', None, noun_conjunction)
noun_to_plural_matcher.add('nsubjpass', None, nsubjpass)
# noun_to_plural_matcher.add('inflected_verb_noun', None, inflected_verb_noun)

verb_inflection_matcher = DependencyMatcher(nlp.vocab)

# a rocket is -> rockets *are* [todo]
verb_inflection = [
    {
        'SPEC': {'NODE_NAME': 'pluralized_noun'}, 
        'PATTERN': {'_': {'change': 'pluralized_noun'}}
    },
    {
        'SPEC': {'NBOR_NAME': 'pluralized_noun', 'NBOR_RELOP': '>', 'NODE_NAME': 'node0'},
            node["PATTERN"] = token_pattern

            pattern.append(node)
            add_node(child, pattern)

    pattern = [{"SPEC": {"NODE_NAME": root}, "PATTERN": {"ORTH": root}}]
    add_node(root, pattern)

    return pattern


from spacy.matcher import DependencyMatcher


example = [["founded", "nsubj", "START_ENTITY"], ["founded", "dobj", "END_ENTITY"]]

pattern = construct_pattern(example)
matcher = DependencyMatcher(nlp.vocab)
matcher.add("pattern1", None, pattern)

doc1 = nlp("Bill Gates founded Microsoft.")
doc2 = nlp("Bill Gates, the Seattle Seahawks owner, founded Microsoft.")

match = matcher(doc1)[0]
subtree = match[1][0]
visualise_subtrees(doc1, subtree)

match = matcher(doc2)[0]
subtree = match[1][0]
visualise_subtrees(doc2, subtree)
Example 19
def build_matcher(vocab, pattern_dict):
    matcher = DependencyMatcher(vocab)
    for name, pattern in pattern_dict.items():
        dep_pattern = pattern['spacy_dep_pattern']
        matcher.add(name, None, dep_pattern)
    return matcher
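The pattern_dict this helper expects maps a name to a dict with a 'spacy_dep_pattern' key holding a v2-style pattern; a minimal illustrative entry (contents are assumptions, not from the source):

pattern_dict = {
    "verb_object": {
        "spacy_dep_pattern": [
            {"SPEC": {"NODE_NAME": "verb"}, "PATTERN": {"POS": "VERB"}},
            {
                "SPEC": {"NODE_NAME": "obj", "NBOR_RELOP": ">", "NBOR_NAME": "verb"},
                "PATTERN": {"DEP": "dobj"},
            },
        ]
    }
}
matcher = build_matcher(nlp.vocab, pattern_dict)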
Example 20
def search_out(doc, nlp):
    """Search for prepositions with verb 

  Args:
    doc (spacy.tokens.Doc): doc to be analyzed
    nlp (spacy.language.Language): context language

  Returns:
    list: list of spacy.tokens.Span
  """
    result = []

    token_matcher = Matcher(nlp.vocab)
    dep_matcher = DependencyMatcher(nlp.vocab)

    token_patterns = [
        [{
            "POS": "NOUN"
        }],
        [{
            "POS": "PRON"
        }],
        [{
            "POS": "PROPN"
        }],
        [{
            "POS": "PROPN"
        }, {
            "POS": "CCONJ",
            "DEP": "cd"
        }, {
            "POS": "PROPN",
            "DEP": "cj"
        }],
    ]
    token_matcher.add("token_NP", token_patterns)
    token_matches = token_matcher(doc)

    token_refined_matches = merge([(start, end)
                                   for _, start, end in token_matches])

    dep_patterns = [
        [
            {
                "RIGHT_ID": "noun",
                "RIGHT_ATTRS": {
                    "POS": "NOUN"
                }
            },
            {
                "LEFT_ID": "noun",
                "REL_OP": ">",
                "RIGHT_ID": "noun_mod",
                "RIGHT_ATTRS": {
                    "DEP": {
                        "IN": ["ag", "nk", "pg", "op"]
                    },
                    "POS": {
                        "NOT_IN": ["PART", "ADV", "PRON", "CCONJ"]
                    }
                }
            },
        ],
        [
            {
                "RIGHT_ID": "pron",
                "RIGHT_ATTRS": {
                    "POS": "PRON"
                }
            },
            {
                "LEFT_ID": "pron",
                "REL_OP": ">",
                "RIGHT_ID": "pron_mod",
                "RIGHT_ATTRS": {
                    "DEP": {
                        "IN": ["ag"]
                    },
                    "POS": {
                        "IN": ["NOUN"]
                    }
                }
            },
        ],
        [
            {
                "RIGHT_ID": "noun",
                "RIGHT_ATTRS": {
                    "POS": "NOUN"
                }
            },
            {
                "LEFT_ID": "noun",
                "REL_OP": ">",
                "RIGHT_ID": "noun_mod",
                "RIGHT_ATTRS": {
                    "DEP": "mnr",
                    "LOWER": {
                        "IN": [
                            "an", "nach", "vom", "von", "auf", "zu", "zur",
                            "zwischen", "aus"
                        ]
                    }
                }
            },
        ],
        [
            {
                "RIGHT_ID": "propn",
                "RIGHT_ATTRS": {
                    "POS": "PROPN"
                }
            },
            {
                "LEFT_ID": "propn",
                "REL_OP": ">",
                "RIGHT_ID": "propn_mod",
                "RIGHT_ATTRS": {
                    "DEP": {
                        "IN": ["pnc", "nk"]
                    },
                    "POS": {
                        "IN": ["NOUN", "DET", "PROPN"]
                    }
                }
            },
        ],
    ]

    dep_matcher.add("dep_NP", dep_patterns)
    dep_matches = dep_matcher(doc)
    dep_refined_matches = []
    for _, (noun, desp) in dep_matches:
        desp_tree = [e.i for e in doc[desp].subtree]
        length_valid = len(desp_tree) == max(desp_tree) - min(desp_tree) + 1
        noun_valid = any([
            noun <= end and noun >= start
            for start, end in token_refined_matches
        ])
        if length_valid and noun_valid:
            desp_tree.append(noun)
            desp_tree.sort()
            dep_refined_matches.append((min(desp_tree), max(desp_tree) + 1))

    matches = token_refined_matches + dep_refined_matches
    refined_matches = merge(matches)

    for start, end in refined_matches:
        if end - start > 1 and all([e.pos_ != "PUNCT"
                                    for e in doc[start:end]]):
            np = doc[start:end]
            result.append(np)

    return result
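The merge helper used above is not shown; a minimal sketch of an interval-merging function with the behaviour the code appears to assume (an assumption, not the original implementation):

def merge(spans):
    # Merge overlapping or touching (start, end) token spans.
    merged = []
    for start, end in sorted(spans):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged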
Example 21
def dependency_matcher(en_vocab):
    def is_brown_yellow(text):
        return bool(re.compile(r"brown|yellow|over").match(text))

    IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)

    pattern1 = [
        {
            "SPEC": {
                "NODE_NAME": "fox"
            },
            "PATTERN": {
                "ORTH": "fox"
            }
        },
        {
            "SPEC": {
                "NODE_NAME": "q",
                "NBOR_RELOP": ">",
                "NBOR_NAME": "fox"
            },
            "PATTERN": {
                "ORTH": "quick",
                "DEP": "amod"
            },
        },
        {
            "SPEC": {
                "NODE_NAME": "r",
                "NBOR_RELOP": ">",
                "NBOR_NAME": "fox"
            },
            "PATTERN": {
                IS_BROWN_YELLOW: True
            },
        },
    ]

    pattern2 = [
        {
            "SPEC": {
                "NODE_NAME": "jumped"
            },
            "PATTERN": {
                "ORTH": "jumped"
            }
        },
        {
            "SPEC": {
                "NODE_NAME": "fox",
                "NBOR_RELOP": ">",
                "NBOR_NAME": "jumped"
            },
            "PATTERN": {
                "ORTH": "fox"
            },
        },
        {
            "SPEC": {
                "NODE_NAME": "quick",
                "NBOR_RELOP": ".",
                "NBOR_NAME": "jumped"
            },
            "PATTERN": {
                "ORTH": "fox"
            },
        },
    ]

    pattern3 = [
        {
            "SPEC": {
                "NODE_NAME": "jumped"
            },
            "PATTERN": {
                "ORTH": "jumped"
            }
        },
        {
            "SPEC": {
                "NODE_NAME": "fox",
                "NBOR_RELOP": ">",
                "NBOR_NAME": "jumped"
            },
            "PATTERN": {
                "ORTH": "fox"
            },
        },
        {
            "SPEC": {
                "NODE_NAME": "r",
                "NBOR_RELOP": ">>",
                "NBOR_NAME": "fox"
            },
            "PATTERN": {
                "ORTH": "brown"
            },
        },
    ]

    # pattern that doesn't match
    pattern4 = [
        {
            "SPEC": {
                "NODE_NAME": "jumped"
            },
            "PATTERN": {
                "ORTH": "NOMATCH"
            }
        },
        {
            "SPEC": {
                "NODE_NAME": "fox",
                "NBOR_RELOP": ">",
                "NBOR_NAME": "jumped"
            },
            "PATTERN": {
                "ORTH": "fox"
            },
        },
        {
            "SPEC": {
                "NODE_NAME": "r",
                "NBOR_RELOP": ">>",
                "NBOR_NAME": "fox"
            },
            "PATTERN": {
                "ORTH": "brown"
            },
        },
    ]

    matcher = DependencyMatcher(en_vocab)
    on_match = Mock()
    matcher.add("pattern1", [pattern1], on_match=on_match)
    matcher.add("pattern2", [pattern2], on_match=on_match)
    matcher.add("pattern3", [pattern3], on_match=on_match)
    matcher.add("pattern4", [pattern4], on_match=on_match)

    assert len(matcher) == 4

    text = "The quick brown fox jumped over the lazy fox"
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    deps = [
        "det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"
    ]

    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
    matches = matcher(doc)

    assert len(matches) == 3
    assert matches[0][1] == [[3, 1, 2]]
    assert matches[1][1] == [[4, 3, 3]]
    assert matches[2][1] == [[4, 3, 2]]
    assert on_match.call_count == 3
Example 22
def test_dependency_matcher_pattern_validation(en_vocab):
    pattern = [
        {
            "RIGHT_ID": "fox",
            "RIGHT_ATTRS": {
                "ORTH": "fox"
            }
        },
        {
            "LEFT_ID": "fox",
            "REL_OP": ">",
            "RIGHT_ID": "q",
            "RIGHT_ATTRS": {
                "ORTH": "quick",
                "DEP": "amod"
            },
        },
        {
            "LEFT_ID": "fox",
            "REL_OP": ">",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {
                "ORTH": "brown"
            },
        },
    ]

    matcher = DependencyMatcher(en_vocab)
    # original pattern is valid
    matcher.add("FOUNDED", [pattern])
    # individual pattern not wrapped in a list
    with pytest.raises(ValueError):
        matcher.add("FOUNDED", pattern)
    # no anchor node
    with pytest.raises(ValueError):
        matcher.add("FOUNDED", [pattern[1:]])
    # required keys missing
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        del pattern2[0]["RIGHT_ID"]
        matcher.add("FOUNDED", [pattern2])
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        del pattern2[1]["RIGHT_ID"]
        matcher.add("FOUNDED", [pattern2])
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        del pattern2[1]["RIGHT_ATTRS"]
        matcher.add("FOUNDED", [pattern2])
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        del pattern2[1]["LEFT_ID"]
        matcher.add("FOUNDED", [pattern2])
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        del pattern2[1]["REL_OP"]
        matcher.add("FOUNDED", [pattern2])
    # invalid operator
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        pattern2[1]["REL_OP"] = "!!!"
        matcher.add("FOUNDED", [pattern2])
    # duplicate node name
    with pytest.raises(ValueError):
        pattern2 = copy.deepcopy(pattern)
        pattern2[1]["RIGHT_ID"] = "fox"
        matcher.add("FOUNDED", [pattern2])
Example 23
)
model_load_state = st.info(f"Loading model '{spacy_model}'...")
nlp = load_model(spacy_model)
model_load_state.empty()

st.sidebar.subheader("spaCy pipeline:")
desc = f"""<p style="font-size: 0.85em; line-height: 1.5"><strong>{spacy_model}:</strong> <code>v{nlp.meta['version']}</code></p>"""
st.sidebar.markdown(desc, unsafe_allow_html=True)

# Initialize Matcher Generator

matcher = DependencyMatcher(nlp.vocab)
with open('matcherPatterns.pickle', 'rb') as fp:
    pattern_dict = pickle.load(fp)
for coltype, pattern in pattern_dict.items():
    matcher.add(coltype, pattern)

# Text Box
default_text = "You can enter some sentences here to see their dependency relationships. In the sidebar, you can choose which spaCy pipeline to use. Hit ctrl + enter to give it a whirl (and check out how each parser handles the first phrase of this sentence)."
st.title("Collocation Extractor")
text = st.text_area("Text to analyze", default_text, height=200)

# Process Text, then retokenize with collapsed punctuation, then split into sentence docs
doc = process_text(spacy_model, text)


def my_spans(doc):
    spans = []
    for word in doc[:-1]:
        if word.is_punct or not word.nbor(1).is_punct:
            continue
Example 24
     'REL_OP': '>',
     'RIGHT_ID': "update_object",
     'RIGHT_ATTRS': {
         "DEP": "dobj"
     },
 }, {
     "LEFT_ID": "update_object",
     "REL_OP": ">",
     "RIGHT_ID": "update_object_modifier",
     "RIGHT_ATTRS": {
         "DEP": {
             "IN": ["amod", "compound"]
         }
     },
 }]
 dep_matcher.add('ADD_Pattern', [dep_pattern_add])
 dep_matcher.add('AMEND_Pattern', [dep_pattern_amend])
 dep_matcher.add('UPDATE_Pattern', [dep_pattern_update])
 dep_matches_add = dep_matcher(doc_dep)
 dep_matches_title = dep_matcher(doc_dep_title)
 for id, match in dep_matches_add:
     for word in match:
         dependency_word = (doc_dep[word])
         if str(dependency_word) != 'add' and str(
                 dependency_word) != 'adding':
             df_dependency = df_dependency.append(
                 {
                     'words': str(dependency_word),
                     'Ticket_id': row['Key'],
                     'Business Function': row['Business Function'],
                     'Summary': row['Summary'],
Example 25
    def preposition_check(self, sent_pack):
        """ Return a list of m2 corrections.
            Detects and corrects errors connected with the choice of preposition.
            It is done with the help of two datasets:
            1) trigram counts with tags in them. These are in the following form:
            'word/entity + preposition + word.tag'
            2) trigrams without tags in them ('notag'). These are as follows:
            'word/entity + preposition + word/entity'
        """
        tag_matches = load_tag_ngram_file(
            os.path.join(self.data_folder, self.prep_tag_trigrams_file))

        notag_matches = load_notag_ngram_filename(
            os.path.join(self.data_folder, self.prep_notag_trigrams_file))

        matcher = DependencyMatcher(self.nlp.vocab)

        matcher.add("prep", [prep_pattern])

        sent = sent_pack['corrected']

        for match in matcher(sent):
            if sent[match[1][0]].ent_type_:
                head = sent[match[1][0]].ent_type_
            else:
                head = sent[match[1][0]].text.lower()

            preposition = sent[match[1][1]]
            dependent = sent[match[1][2]]

            tag_key = ' '.join(
                [head, preposition.text.lower(), dependent.tag_])

            # Get the entity tag of a token if possible (to generalise)
            if dependent.ent_type_:
                dependent_text = dependent.ent_type_
            else:
                dependent_text = dependent.text.lower()

            notag_key = ' '.join(
                (head, preposition.text.lower(), dependent_text))

            count = tag_matches.get(tag_key, 0)

            # If the given preposition with its context is found neither in
            # tagged nor non-tagged data
            if count == 0 and notag_key not in notag_matches:
                tag_keys = tag_matches.keys()

                regex_notag_prep_finder = re.compile(
                    f"{head} .* {dependent_text}")
                relevant_notag_keys = list(
                    filter(regex_notag_prep_finder.match, notag_matches))
                if not relevant_notag_keys:
                    regex_tag_prep_finder = re.compile(
                        f'{head} .* {dependent.tag_}')
                    relevant_tag_keys = list(
                        filter(regex_tag_prep_finder.match, tag_keys))
                    relevant_tag_keys.sort(key=lambda x: tag_matches[x])
                    if not relevant_tag_keys:
                        continue
                    # FOR FUTURE: can suggest a few prepositions, actually (TODO)
                    most_relevant = relevant_tag_keys[-1]
                else:
                    # Taking a random first trigram. Possible to change by choosing
                    # the one with the largest count (TODO)
                    most_relevant = relevant_notag_keys[0]

                correct_prep = most_relevant.split(' ')[1]
                correction = f'A {preposition.i} {preposition.i+1}|||Prep|||{correct_prep}|||REQUIRED|||-NONE-|||0'
                sent_pack['corrections'].append(correction)

                sent_pack['corrected'] = self.token_replace(
                    sent_pack['corrected'], correct_prep, preposition.i)

        return sent_pack
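The correction string assembled above follows the M2 annotation format used in grammatical error correction: start and end token offsets, an error type, the suggested replacement, and annotator fields. A small illustrative example (the index and preposition are made up):

prep_index = 4
correction = f"A {prep_index} {prep_index + 1}|||Prep|||of|||REQUIRED|||-NONE-|||0"
print(correction)  # A 4 5|||Prep|||of|||REQUIRED|||-NONE-|||0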
Example 26
def search_out(doc, nlp):
    """Search for prepositions with verb 

  Args:
    doc (spacy.tokens.Doc): doc to be analyzed
    nlp (spacy.language.Language): context language

  Returns:
    list: list of spacy.tokens.Span
  """
    result = []

    token_matcher = Matcher(nlp.vocab)
    dep_matcher = DependencyMatcher(nlp.vocab)

    token_patterns = [
        [{
            "LOWER": {
                "IN": ["más", "mneos"]
            }
        }, {
            "POS": "NOUN"
        }, {
            "LOWER": "que"
        }, {}],
        [{
            "POS": "NOUN"
        }, {
            "POS": "PRON",
            "DEP": "compound"
        }],
        [{
            "POS": "DET",
            "DEP": {
                "IN": ["det", "amod"]
            },
            "OP": "*"
        }, {
            "POS": "NUM",
            "DEP": "nummod",
            "OP": "?"
        }, {
            "POS": "ADJ",
            "DEP": "amod",
            "OP": "*"
        }, {
            "POS": "NOUN"
        }],
        [{
            "POS": "DET",
            "DEP": {
                "IN": ["det", "amod"]
            },
            "OP": "*"
        }, {
            "POS": "ADV",
            "DEP": "advmod",
            "OP": "?"
        }, {
            "POS": "ADJ",
            "DEP": "amod",
            "OP": "+"
        }, {
            "POS": "NOUN"
        }],
    ]
    token_matcher.add("token_NP", token_patterns)
    token_matches = token_matcher(doc)

    token_refined_matches = merge([(start, end)
                                   for _, start, end in token_matches])

    dep_patterns = [
        [
            {
                "RIGHT_ID": "noun",
                "RIGHT_ATTRS": {
                    "POS": "NOUN"
                }
            },
            {
                "LEFT_ID": "noun",
                "REL_OP": ">",
                "RIGHT_ID": "mod",
                "RIGHT_ATTRS": {
                    "DEP": {
                        "IN": ["nmod", "amod"]
                    }
                }
            },
        ],
    ]

    dep_matcher.add("dep_NP", dep_patterns)
    dep_matches = dep_matcher(doc)
    dep_refined_matches = []
    for _, (noun, desp) in dep_matches:
        desp_tree = [e.i for e in doc[desp].subtree]
        length_valid = len(desp_tree) == max(desp_tree) - min(desp_tree) + 1
        noun_valid = any([
            noun <= end and noun >= start
            for start, end in token_refined_matches
        ])
        if length_valid and noun_valid and noun < desp:
            desp_tree.append(noun)
            desp_tree.sort()
            dep_refined_matches.append((min(desp_tree), max(desp_tree) + 1))

    matches = token_refined_matches + dep_refined_matches
    refined_matches = merge(matches)

    for start, end in refined_matches:
        if end - start > 1 and all([e.pos_ != "PUNCT"
                                    for e in doc[start:end]]):
            np = doc[start:end]
            result.append(np)

    return result
Example 27
def build_matcher(vocab, pattern):
    matcher = DependencyMatcher(vocab)
    matcher.add('pattern', None, pattern)
    return matcher
Example 28
            }
        }
    },
    # subject should be 'I'
    {
        'LEFT_ID': 'family_member',
        'REL_OP': '>',
        'RIGHT_ID': 'proper_name',
        'RIGHT_ATTRS': {
            'DEP': 'appos',
            'POS': 'PROPN'
        }
    }
]

matcher.add("born_date", [born_date_pattern])
matcher.add("born_place", [born_place_pattern])
matcher.add("family_member_name", [family_member_name_pattern])


def get_birth_family_details(narrative: str) -> (list, list, dict):
    """
    Use spaCy Dependency Parsing and rules-based matching to get the birth date and place details,
    and family member names from a narrative.

    :param narrative: String holding the narrative
    :return: Two lists where the first contains the tokens related to date and the second
             contains the tokens related to location, and a dictionary containing the names
             of family members and their relationship to the narrator/subject
    """
    logging.info('Getting birth and family data from narrative')
# INIT SPACY MODEL
nlp = spacy.load("fr_core_news_lg")
nlp.add_pipe("merge_noun_chunks")

# DATA
df_data = pd.read_csv(args.dataset_fn, dtype={"authorZipCode": str}).fillna("")

# PREPARE MATCHER
matcher = DependencyMatcher(nlp.vocab)
prop = enjeux.Proposition()
observ = enjeux.ObservationObjectif()
sit_perso = SituationPersonnelle()

for pat_key, pat_value in observ.get_patterns().items():
    matcher.add("OBSERVATION|{0}".format(pat_key), [pat_value])
for pat_key, pat_value in prop.get_patterns().items():
    matcher.add("PROPOSITION|{0}".format(pat_key), [pat_value])

for pat_key, pat_value in sit_perso.get_patterns().items():
    matcher.add("SITUATIONPERSONNELLE|{0}".format(pat_key), [pat_value])

for question, pattern_idx in question_patterns[args.dataset_code].items():
    if not {1, 2, 3}.intersection(pattern_idx):  # questions 4 and 5 have no pattern attached
        continue
    results = []
    id_ = df_data["id"].values
    zipCode = df_data["authorZipCode"].values
    data_question = df_data[question].values