Code example #1
File: test_matcher_logic.py Project: spacy-io/spaCy
def test_operator_combos(en_vocab):
    cases = [
        ("aaab", "a a a b", True),
        ("aaab", "a+ b", True),
        ("aaab", "a+ a+ b", True),
        ("aaab", "a+ a+ a b", True),
        ("aaab", "a+ a+ a+ b", True),
        ("aaab", "a+ a a b", True),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaa", "a+ b", False),
        ("aaa", "a+ a+ b", False),
        ("aaa", "a+ a+ a+ b", False),
        ("aaa", "a+ a b", False),
        ("aaa", "a+ a a b", False),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaab", "a+ a b", True),
    ]
    for string, pattern_str, result in cases:
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(string))
        pattern = []
        for part in pattern_str.split():
            if part.endswith("+"):
                pattern.append({"ORTH": part[0], "OP": "+"})
            else:
                pattern.append({"ORTH": part})
        matcher.add("PATTERN", None, pattern)
        matches = matcher(doc)
        if result:
            assert matches, (string, pattern_str)
        else:
            assert not matches, (string, pattern_str)
Code example #2
File: test_issue1-1000.py Project: spacy-io/spaCy
def test_issue615(en_tokenizer):
    def merge_phrases(matcher, doc, i, matches):
        """Merge a phrase. We have to be careful here because we'll change the
        token indices. To avoid problems, merge all the phrases once we're called
        on the last match."""
        if i != len(matches) - 1:
            return None
        spans = [Span(doc, start, end, label=label) for label, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                tag = "NNP" if span.label_ else span.root.tag_
                attrs = {"tag": tag, "lemma": span.text}
                retokenizer.merge(span, attrs=attrs)
                doc.ents = doc.ents + (span,)

    text = "The golf club is broken"
    pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
    label = "Sport_Equipment"
    doc = en_tokenizer(text)
    matcher = Matcher(doc.vocab)
    matcher.add(label, merge_phrases, pattern)
    matcher(doc)
    entities = list(doc.ents)
    assert entities != []
    assert entities[0].label != 0
Code example #3
class RussianTokenizer(object):
    name = 'russian_tokenizer'

    def __init__(self, nlp, merge_patterns=None, terminal_patterns=None):
        self.matcher = Matcher(nlp.vocab)
        self.token_merge = nlp.vocab.strings['pattern']
        self.sentence_terminal = nlp.vocab.strings['sentence_terminal']
        if merge_patterns:
            self.matcher.add(self.token_merge, None, *merge_patterns)
        if terminal_patterns:
            self.matcher.add(self.sentence_terminal, None, *terminal_patterns)

    def __call__(self, doc):
        spans = []
        for id, start, end in self.matcher(doc):
            if id == self.token_merge:
                spans.append(doc[start:end])
            elif id == self.sentence_terminal:
                # remove all sentence-start marks from the span that matched the pattern
                for token in doc[start:end]:
                    if token.sent_start:
                        token.sent_start = False
        if spans:
            for span in spans:
                span.merge()
        return doc
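
Span.merge(), used above, is deprecated in spaCy v2.1+ in favour of the retokenizer. A minimal sketch (hypothetical helper name, not part of the original snippet) of the same merge step written against doc.retokenize():

def merge_spans(doc, spans):
    # Merge each matched span into a single token in one retokenization pass
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
    return doc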
Code example #4
File: ud_run_test.py Project: spacy-io/spaCy
def write_conllu(docs, file_):
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = merger(doc)
        spans = [doc[start : end + 1] for _, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        # TODO: This shouldn't be necessary? Should be handled in merge
        for word in doc:
            if word.i == word.head.i:
                word.dep_ = "ROOT"
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, token in enumerate(sent):
                file_.write(_get_token_conllu(token, k, len(sent)) + "\n")
            file_.write("\n")
            for word in sent:
                if word.head.i == word.i and word.dep_ == "ROOT":
                    break
            else:
                print("Rootless sentence!")
                print(sent)
                print(i)
                for w in sent:
                    print(w.i, w.text, w.head.text, w.head.i, w.dep_)
                raise ValueError
Code example #5
File: test_matcher_api.py Project: spacy-io/spaCy
def test_matcher_match_zero_plus(matcher):
    words = 'He said , " some words " ...'.split()
    pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}]
    matcher = Matcher(matcher.vocab)
    matcher.add("Quote", None, pattern)
    doc = Doc(matcher.vocab, words=words)
    assert len(matcher(doc)) == 1
Code example #6
File: ud_train.py Project: spacy-io/spaCy
def write_conllu(docs, file_):
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = merger(doc)
        spans = [doc[start : end + 1] for _, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, token in enumerate(sent):
                if token.head.i > sent[-1].i or token.head.i < sent[0].i:
                    for word in doc[sent[0].i - 10 : sent[0].i]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    for word in sent:
                        print(word.i, word.head.i, word.text, word.dep_)
                    for word in doc[sent[-1].i : sent[-1].i + 10]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    raise ValueError(
                        "Invalid parse: head outside sentence (%s)" % token.text
                    )
                file_.write(token._.get_conllu_lines(k) + "\n")
            file_.write("\n")
Code example #7
File: test_matcher_api.py Project: spacy-io/spaCy
def test_matcher_operator_shadow(en_vocab):
    matcher = Matcher(en_vocab)
    doc = Doc(matcher.vocab, words=["a", "b", "c"])
    pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}]
    matcher.add("A.C", None, pattern)
    matches = matcher(doc)
    assert len(matches) == 1
    assert matches[0][1:] == (0, 3)
Code example #8
File: test_issue1501-2000.py Project: spacy-io/spaCy
def test_issue_1971_2(en_vocab):
    matcher = Matcher(en_vocab)
    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, pattern1, pattern2)
    matches = matcher(doc)
    assert len(matches) == 2
Code example #9
File: test_matcher_logic.py Project: spacy-io/spaCy
def test_match_consuming(doc, text, pattern, re_pattern):
    """Test that matcher.__call__ consumes tokens on a match similar to
    re.findall."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    assert len(matches) == len(re_matches)
Code example #10
File: test_issue3555.py Project: spacy-io/spaCy
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)
Code example #11
File: test_issue1501-2000.py Project: spacy-io/spaCy
def test_issue1883():
    matcher = Matcher(Vocab())
    matcher.add("pat1", None, [{"orth": "hello"}])
    doc = Doc(matcher.vocab, words=["hello"])
    assert len(matcher(doc)) == 1
    new_matcher = copy.deepcopy(matcher)
    new_doc = Doc(new_matcher.vocab, words=["hello"])
    assert len(new_matcher(new_doc)) == 1
Code example #12
File: test_matcher_logic.py Project: spacy-io/spaCy
def test_greedy_matching(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior of the * op is consistant with
    other re implementations."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    for match, re_match in zip(matches, re_matches):
        assert match[1:] == re_match
Code example #13
File: test_issue1501-2000.py Project: spacy-io/spaCy
def test_issue1945():
    """Test regression in Matcher introduced in v2.0.6."""
    matcher = Matcher(Vocab())
    matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}])
    doc = Doc(matcher.vocab, words=["a", "a", "a"])
    matches = matcher(doc)  # we should see two overlapping matches here
    assert len(matches) == 2
    assert matches[0][1:] == (0, 2)
    assert matches[1][1:] == (1, 3)
Code example #14
File: test_matcher_api.py Project: spacy-io/spaCy
def test_matcher_compare_length(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"LENGTH": {">=": 2}}]
    matcher.add("LENGTH_COMPARE", None, pattern)
    doc = Doc(en_vocab, words=["a", "aa", "aaa"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["a"])
    matches = matcher(doc)
    assert len(matches) == 0
Code example #15
File: test_matcher_api.py Project: spacy-io/spaCy
def matcher(en_vocab):
    rules = {
        "JS": [[{"ORTH": "JavaScript"}]],
        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
        "Java": [[{"LOWER": "java"}]],
    }
    matcher = Matcher(en_vocab)
    for key, patterns in rules.items():
        matcher.add(key, None, *patterns)
    return matcher
Code example #16
File: test_matcher_api.py Project: spacy-io/spaCy
def test_matcher_regex_shape(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]
    matcher.add("NON_ALPHA", None, pattern)
    doc = Doc(en_vocab, words=["99", "problems", "!"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["bye"])
    matches = matcher(doc)
    assert len(matches) == 0
Code example #17
File: test_matcher_api.py Project: spacy-io/spaCy
def test_matcher_any_token_operator(en_vocab):
    """Test that patterns with "any token" {} work with operators."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}])
    doc = Doc(en_vocab, words=["test", "hello", "world"])
    matches = [doc[start:end].text for _, start, end in matcher(doc)]
    assert len(matches) == 3
    assert matches[0] == "test"
    assert matches[1] == "test hello"
    assert matches[2] == "test hello world"
Code example #18
File: test_matcher_api.py Project: spacy-io/spaCy
def test_matcher_set_value(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"IN": ["an", "a"]}}]
    matcher.add("A_OR_AN", None, pattern)
    doc = Doc(en_vocab, words=["an", "a", "apple"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Code example #19
File: test_matcher_api.py Project: spacy-io/spaCy
def test_matcher_set_value_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"IN": ["a", "the"]}, "OP": "?"}, {"ORTH": "house"}]
    matcher.add("DET_HOUSE", None, pattern)
    doc = Doc(en_vocab, words=["In", "a", "house"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["my", "house"])
    matches = matcher(doc)
    assert len(matches) == 1
Code example #20
File: test_matcher_api.py Project: spacy-io/spaCy
def test_matcher_regex(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
    matcher.add("A_OR_AN", None, pattern)
    doc = Doc(en_vocab, words=["an", "a", "hi"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["bye"])
    matches = matcher(doc)
    assert len(matches) == 0
Code example #21
File: test_issue1501-2000.py Project: spacy-io/spaCy
def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
    assert len(matches) == 4
    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
Code example #22
File: test_issue1001-1500.py Project: spacy-io/spaCy
def test_issue1450(string, start, end):
    """Test matcher works when patterns end with * operator."""
    pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
    matcher = Matcher(Vocab())
    matcher.add("TSTEND", None, pattern)
    doc = Doc(Vocab(), words=string.split())
    matches = matcher(doc)
    if start is None or end is None:
        assert matches == []
    assert matches[-1][1] == start
    assert matches[-1][2] == end
Code example #23
File: test_issue3328.py Project: spacy-io/spaCy
def test_issue3328(en_vocab):
    doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
    matcher = Matcher(en_vocab)
    patterns = [
        [{"LOWER": {"IN": ["hello", "how"]}}],
        [{"LOWER": {"IN": ["you", "doing"]}}],
    ]
    matcher.add("TEST", None, *patterns)
    matches = matcher(doc)
    assert len(matches) == 4
    matched_texts = [doc[start:end].text for _, start, end in matches]
    assert matched_texts == ["Hello", "how", "you", "doing"]
Code example #24
File: test_matcher_logic.py Project: spacy-io/spaCy
def test_matcher_end_zero_plus(en_vocab):
    """Test matcher works when patterns end with * operator. (issue 1450)"""
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
    matcher.add("TSTEND", None, pattern)
    nlp = lambda string: Doc(matcher.vocab, words=string.split())
    assert len(matcher(nlp("a"))) == 1
    assert len(matcher(nlp("a b"))) == 2
    assert len(matcher(nlp("a c"))) == 1
    assert len(matcher(nlp("a b c"))) == 2
    assert len(matcher(nlp("a b b c"))) == 3
    assert len(matcher(nlp("a b b"))) == 3
Code example #25
File: test_matcher_logic.py Project: spacy-io/spaCy
def test_matcher_sets_return_correct_tokens(en_vocab):
    matcher = Matcher(en_vocab)
    patterns = [
        [{'LOWER': {'IN': ["zero"]}}],
        [{'LOWER': {'IN': ["one"]}}],
        [{'LOWER': {'IN': ["two"]}}],
    ]
    matcher.add('TEST', None, *patterns)
    doc = Doc(en_vocab, words="zero one two three".split())
    matches = matcher(doc)
    texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
    assert texts == ['zero', 'one', 'two']
Code example #26
File: test_matcher_api.py Project: spacy-io/spaCy
def test_matcher_extension_attribute(en_vocab):
    matcher = Matcher(en_vocab)
    get_is_fruit = lambda token: token.text in ("apple", "banana")
    Token.set_extension("is_fruit", getter=get_is_fruit, force=True)
    pattern = [{"ORTH": "an"}, {"_": {"is_fruit": True}}]
    matcher.add("HAVING_FRUIT", None, pattern)
    doc = Doc(en_vocab, words=["an", "apple"])
    matches = matcher(doc)
    assert len(matches) == 1
    doc = Doc(en_vocab, words=["an", "aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Code example #27
File: test_issue1-1000.py Project: spacy-io/spaCy
def test_issue850_basic():
    """Test Matcher matches with '*' operator and Boolean flag"""
    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
    matcher = Matcher(vocab)
    pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
    matcher.add("FarAway", None, pattern)
    doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
    match = matcher(doc)
    assert len(match) == 1
    ent_id, start, end = match[0]
    assert start == 0
    assert end == 4
Code example #28
File: test_issue1001-1500.py Project: spacy-io/spaCy
def test_issue1434():
    """Test matches occur when optional element at end of short doc."""
    pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]
    vocab = Vocab(lex_attr_getters=LEX_ATTRS)
    hello_world = Doc(vocab, words=["Hello", "World"])
    hello = Doc(vocab, words=["Hello"])
    matcher = Matcher(vocab)
    matcher.add("MyMatcher", None, pattern)
    matches = matcher(hello_world)
    assert matches
    matches = matcher(hello)
    assert matches
Code example #29
File: test_issue1-1000.py Project: spacy-io/spaCy
def test_issue590(en_vocab):
    """Test overlapping matches"""
    doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
    matcher = Matcher(en_vocab)
    matcher.add(
        "ab",
        None,
        [{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}],
    )
    matcher.add("ab", None, [{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}])
    matches = matcher(doc)
    assert len(matches) == 2
Code example #30
File: test_matcher_api.py Project: spacy-io/spaCy
def test_matcher_extension_set_membership(en_vocab):
    matcher = Matcher(en_vocab)
    get_reversed = lambda token: "".join(reversed(token.text))
    Token.set_extension("reversed", getter=get_reversed, force=True)
    pattern = [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}]
    matcher.add("REVERSED", None, pattern)
    doc = Doc(en_vocab, words=["hi", "bye", "hello"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Code example #31
def test_matcher_subset_value_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc)) == 3
    doc[0].set_morph("Feat=Val")
    assert len(matcher(doc)) == 3
    doc[0].set_morph("Feat=Val|Feat2=Val2")
    assert len(matcher(doc)) == 3
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
    assert len(matcher(doc)) == 2
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
    assert len(matcher(doc)) == 2

    # IS_SUBSET acts like "IN" for attrs other than MORPH
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"IS_SUBSET": ["A", "B"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 1

    # IS_SUBSET with an empty list matches nothing
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"IS_SUBSET": []}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 0
Code example #32
def test_matcher_superset_value_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"MORPH": {"IS_SUPERSET": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc)) == 0
    doc[0].set_morph("Feat=Val|Feat2=Val2")
    assert len(matcher(doc)) == 0
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
    assert len(matcher(doc)) == 1
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
    assert len(matcher(doc)) == 1

    # IS_SUPERSET with more than one value only matches for MORPH
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"IS_SUPERSET": ["A", "B"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 0

    # IS_SUPERSET with one value is the same as ==
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"IS_SUPERSET": ["A"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 1

    # IS_SUPERSET with an empty value matches everything
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"IS_SUPERSET": []}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 3
Code example #33
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("ja_core_news_sm")
matcher = Matcher(nlp.vocab)

doc = nlp("松島、天橋立、宮島は日本三景として知られています。" "松島は宮城県、天橋立は京都府、宮島は広島県にそれぞれあります。")

# As of v2.3, the Japanese model does not set doc.is_tagged correctly,
# so set it explicitly
# See: https://github.com/explosion/spaCy/issues/5802
doc.is_tagged = True

# Write a pattern consisting of "proper noun + 県 (prefecture)"
pattern = [{"POS": "PROPN"}, {"LEMMA": "県"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("PREFECTURE_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
Code example #34
import pandas as pd
from spacy.matcher import Matcher
from spacy.tokens import Token

def custom_tokenizer_to_df(nlp, doc):
    # Initialize the Matcher with a vocab
    matcher = Matcher(nlp.vocab)

    ###############################################################
    # Add pattern for a valid hashtag, i.e. '#' plus an alphabetic token
    matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ALPHA": True}])

    # Register token extension for hashtag
    Token.set_extension("is_hashtag", default=False, force=True)

    # Run the matcher over the doc
    matches = matcher(doc)

    # Find hashtag and merge, assign hashtag label
    hashtags = []
    for match_id, start, end in matches:
        if doc.vocab.strings[match_id] == "HASHTAG":
            hashtags.append(doc[start:end])
    with doc.retokenize() as retokenizer:
        for span in hashtags:
            retokenizer.merge(span)
            for token in span:
                token._.is_hashtag = True
    ##############################################################

    ##############################################################
    # Find number and merge, assign number label
    # Add patterns for long numbers, i.e. digits separated by ',' or '.'
    matcher.add("LONG_NUMBER", None, [{
        "IS_DIGIT": True
    }, {
        "ORTH": ','
    }, {
        "IS_DIGIT": True
    }])
    matcher.add("LONG_NUMBER", None, [{
        "IS_DIGIT": True
    }, {
        "ORTH": '.'
    }, {
        "IS_DIGIT": True
    }])

    # Register token extension for long number
    Token.set_extension("is_long_number", default=False, force=True)

    # Run the matcher over the doc again with the long-number patterns
    matches = matcher(doc)

    long_number = []
    for match_id, start, end in matches:
        if doc.vocab.strings[match_id] == "LONG_NUMBER":
            long_number.append(doc[start:end])
    with doc.retokenize() as retokenizer:
        for span in long_number:
            retokenizer.merge(span)
            for token in span:
                token._.is_long_number = True
    ##############################################################

    for i, token in enumerate(doc):
        if token._.is_hashtag:
            token.tag_ = 'Hashtag'
        if token.like_url:
            token.tag_ = 'URL'
        if token.like_email:
            token.tag_ = 'Email'
        if token.is_stop:
            token.tag_ = 'Stop Word'
        if token.like_num:
            token.tag_ = 'Number'
        if token._.is_long_number:
            token.tag_ = 'Number'
        if token.is_punct:
            token.tag_ = 'Punctuation'

    # Write the tokens to data frame
    df = pd.DataFrame()
    df['Token'] = [token.text for token in doc]
    df['POS'] = [token.pos_ for token in doc]
    df['NE'] = [token.ent_iob_ for token in doc]
    df['Lemma'] = [token.lemma_ for token in doc]
    df['Tag'] = [token.tag_ for token in doc]
    return df
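
A minimal usage sketch for the helper above (hypothetical input text; assumes pandas and an English pipeline are installed):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I paid 1,299 dollars for the #awesome trip, see https://example.com")
df = custom_tokenizer_to_df(nlp, doc)
print(df.head())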
Code example #35
import spacy

# Import the Matcher
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

# Add the pattern to the matcher
matcher.add("IPHONE_X_PATTERN", None, pattern)

# Use the matcher on the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])
Code example #36
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

# Debug/REPL scaffolding from the original script, commented out so the example runs:
# from shanepy import *
# os.environ["EDITOR"] = "sp"
# from ptpython.repl import embed
# embed(globals(), locals())

# Match sequences of tokens, based on pattern rules
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses.")

# Write a pattern for adjective plus one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
Code example #37
#!/usr/bin/python
""" WRITING MORE COMPLEX MATCH PATTERNS """

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?")

# Write a pattern that matches a form of "download" plus proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
Code example #38
from spacy.lang.en import English
from spacy.matcher import Matcher

def identify_build_date_in_text(text):
    nlp = English()
    doc = nlp(text)
    matcher = Matcher(nlp.vocab)

    #
    # START - spaCy patterns
    #

    # WATER_VESSEL
    water_vessel_pattern = [{"LOWER": {"IN": ["vessels"]}}]
    matcher.add("WATER_VESSEL", None, water_vessel_pattern)

    # DATE
    matcher.add("DATE", None, [{'IS_DIGIT': True, 'LENGTH': 4}])

    # CONSTRUCT
    matcher.add("CONSTRUCT", None, [{"LOWER": {"IN": ["constructed"]}}])

    #
    # END - spaCy patterns
    #

    result = []

    for match_id, token_start, token_end in matcher(doc):

        match_id_as_string = nlp.vocab.strings[match_id]
        final_token_start = token_start
        final_token_end = token_end

        if match_id_as_string == "DATE" and token_start > 0:

            # At this point, DATE is just a year string. Example: 2021

            # Expand DATE?
            prev_word_1_token_number = token_start - 1
            prev_word_1_token = doc[prev_word_1_token_number]
            if prev_word_1_token.text.lower() in ("january", "february",
                                                  "march", "april", "may",
                                                  "june", "july", "august",
                                                  "september", "october",
                                                  "november", "december"):
                final_token_start = prev_word_1_token_number  # expanding
                # Expand more?
                prev_word_2_token_number = token_start - 2
                prev_word_2_token = doc[prev_word_2_token_number]
                if is_int(prev_word_2_token.text):
                    final_token_start = prev_word_2_token_number  # expanding

            prev_word_on_date_token_number = final_token_start - 1
            prev_word_on_date_token = doc[prev_word_on_date_token_number]

            # Does the DATE have a DATE_SEPARATOR?
            if prev_word_on_date_token.text in ("and", "to"):
                prev_word_on_date_char_span_start_number = prev_word_on_date_token.idx
                prev_word_on_date_char_span_end_number = prev_word_on_date_char_span_start_number + len(
                    prev_word_on_date_token.text)
                identified_entity = {
                    'start': prev_word_on_date_char_span_start_number,
                    'end': prev_word_on_date_char_span_end_number,
                    'label': "DATE_SEPARATOR"
                }
                result.append(identified_entity)

            # Does the DATE have a DATE_PREFIX?
            elif prev_word_on_date_token.text in ("between", "before",
                                                  "after"):
                # DATE_PREFIX detected
                prev_word_on_date_char_span_start_number = prev_word_on_date_token.idx
                prev_word_on_date_char_span_end_number = prev_word_on_date_char_span_start_number + len(
                    prev_word_on_date_token.text)
                identified_entity = {
                    'start': prev_word_on_date_char_span_start_number,
                    'end': prev_word_on_date_char_span_end_number,
                    'label': "DATE_PREFIX"
                }
                result.append(identified_entity)

        #
        # convert token_span to char_span.
        # char_span is needed to display correctly with displacy.render().
        #
        span = doc[final_token_start:final_token_end]
        span_char_start = span[0].idx
        span_char_end = span[-1].idx + len(span[-1].text)

        # record the matched entity
        identified_entity = {
            'start': span_char_start,
            'end': span_char_end,
            'label': match_id_as_string
        }
        result.append(identified_entity)

    return result
Code example #39
File: movement_extractor.py Project: szeke/indicator
def load_movement_matcher(nlp):
    matcher = Matcher(nlp.vocab)

    place = ['area', 'place', 'city', 'town']
    girl = [
        'gal', 'girl', 's**t', 'cutie', 'hottie', 'lady', 'teen', 'teenager',
        'chick', 'staff', 'gf', 'she'
    ]

    add_to_vocab(nlp, place)
    add_to_vocab(nlp, girl)

    is_place = FLAG18
    is_girl = FLAG19
    upper_start = FLAG20

    for lexeme in nlp.vocab:
        if lexeme.lower_ in place:
            lexeme.set_flag(is_place, True)
        if lexeme.lower_ in girl:
            lexeme.set_flag(is_girl, True)
        if lexeme.prefix_.isupper():
            lexeme.set_flag(upper_start, True)

    # Positive Matcher Patterns
    matcher.add_entity(1)
    matcher.add_pattern(1, [{
        LEMMA: "last"
    }, {
        LEMMA: "night"
    }, {
        LEMMA: "in"
    }, {
        LEMMA: "town"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "leave"
    }, {
        IS_ASCII: True,
        ENT_TYPE: "DATE"
    }])
    matcher.add_pattern(1, [{LEMMA: "leave"}, {DEP: "partmod"}])
    matcher.add_pattern(1, [{LEMMA: "leave"}, {DEP: "quantmod"}])
    matcher.add_pattern(1, [{
        LEMMA: "leave"
    }, {
        IS_ASCII: True,
        ENT_TYPE: "TIME"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "leave"
    }, {
        LEMMA: "in"
    }, {
        IS_ASCII: True,
        ENT_TYPE: "DATE"
    }])
    matcher.add_pattern(1, [{LEMMA: "leave"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{LEMMA: "out"}, {LEMMA: "of"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{LOWER: "outta"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{
        LEMMA: "lastnight"
    }, {
        LEMMA: "in"
    }, {
        LEMMA: "town"
    }])
    matcher.add_pattern(1, [{LEMMA: "back"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{LEMMA: "just"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{LEMMA: "day"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{
        LEMMA: "in"
    }, {
        LEMMA: "town"
    }, {
        LEMMA: "tonight"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "in"
    }, {
        LEMMA: "town"
    }, {
        LEMMA: "through"
    }])
    matcher.add_pattern(1, [{LEMMA: "in"}, {LEMMA: "town"}, {LEMMA: "until"}])
    matcher.add_pattern(1, [{
        LEMMA: "in"
    }, {
        LEMMA: "town"
    }, {
        LEMMA: "for"
    }, {
        LEMMA: "one"
    }, {
        LEMMA: "night"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "in"
    }, {
        LEMMA: "town"
    }, {
        LEMMA: "for"
    }, {
        IS_DIGIT: True
    }, {
        LEMMA: "night"
    }])
    matcher.add_pattern(1, [{LEMMA: "town"}, {LEMMA: "stay", DEP: "nmod"}])
    matcher.add_pattern(1, [{
        LEMMA: "town"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "stay",
        DEP: "nmod"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "new"
    }, {
        LEMMA: "girl"
    }, {
        LEMMA: "in"
    }, {
        LEMMA: "town"
    }])
    matcher.add_pattern(1, [{LEMMA: "recent"}, {LEMMA: "move"}])
    matcher.add_pattern(1, [{LEMMA: "recently"}, {LEMMA: "move"}])
    matcher.add_pattern(1, [{LEMMA: "relocate"}])
    matcher.add_pattern(1, [{
        LEMMA: "new",
        DEP: "amod"
    }, {
        LEMMA: "city"
    }, {
        LEMMA: "to",
        DEP: "dep"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "new",
        DEP: "amod"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "city"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "to",
        DEP: "dep"
    }])
    matcher.add_pattern(1, [{LEMMA: "new"}, {LEMMA: "to"}, {LEMMA: "area"}])
    matcher.add_pattern(1, [{
        LEMMA: "new"
    }, {
        LEMMA: "to"
    }, {
        upper_start: True
    }])
    matcher.add_pattern(1, [{LEMMA: "first"}, {LEMMA: "visit"}, {LEMMA: "to"}])
    matcher.add_pattern(1, [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "arrive"}])
    matcher.add_pattern(1, [{
        LEMMA: "girl",
        DEP: "nsubj"
    }, {
        LEMMA: "arrive"
    }, {
        DEP: "partmod"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "girl",
        DEP: "nsubj"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "arrive"
    }, {
        IS_ASCII: True
    }, {
        DEP: "partmod"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "girl",
        DEP: "nsubj"
    }, {
        LEMMA: "arrive"
    }, {
        DEP: "quantmod"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "girl",
        DEP: "nsubj"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "arrive"
    }, {
        IS_ASCII: True
    }, {
        DEP: "quantmod"
    }])
    matcher.add_pattern(1, [{LEMMA: "just"}, {LEMMA: "arrive"}])
    matcher.add_pattern(1, [{
        LEMMA: "on"
    }, {
        LEMMA: "my"
    }, {
        LEMMA: "way"
    }, {
        LEMMA: "to"
    }, {
        TAG: "NNP"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "on"
    }, {
        LEMMA: "my"
    }, {
        LEMMA: "way"
    }, {
        LEMMA: "to"
    }, {
        TAG: "NN"
    }])
    matcher.add_pattern(1, [{LEMMA: "on"}, {LEMMA: "the"}, {LEMMA: "way"}])
    matcher.add_pattern(1, [{LEMMA: "just"}, {LEMMA: "get"}, {LEMMA: "here"}])
    matcher.add_pattern(1, [{LEMMA: "get"}, {LEMMA: "here"}, {LEMMA: "today"}])
    matcher.add_pattern(1, [{
        LEMMA: "get"
    }, {
        LEMMA: "here"
    }, {
        LEMMA: "yesterday"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "get"
    }, {
        LEMMA: "here"
    }, {
        LEMMA: "last"
    }, {
        LEMMA: "night"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "i",
        DEP: "nsubj"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "visit"
    }, {
        IS_ASCII: True
    }, {
        is_place: True,
        DEP: "dobj"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "i",
        DEP: "nsubj"
    }, {
        LEMMA: "visit"
    }, {
        is_place: True,
        DEP: "dobj"
    }])

    # Strong Positive Matcher Patterns
    matcher.add_entity(2)
    matcher.add_pattern(2, [{
        LEMMA: "new"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "in"
    }, {
        is_place: True
    }])
    matcher.add_pattern(2, [{
        LEMMA: "new"
    }, {
        IS_ASCII: True
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "in"
    }, {
        is_place: True
    }])
    matcher.add_pattern(2, [{
        LEMMA: "im"
    }, {
        LEMMA: "new"
    }, {
        LEMMA: "in"
    }, {
        LEMMA: "town"
    }])
    matcher.add_pattern(2, [{LEMMA: "new"}, {LEMMA: "in"}, {is_place: True}])
    matcher.add_pattern(2, [{LEMMA: "new"}, {LEMMA: "to"}, {is_place: True}])
    matcher.add_pattern(2, [{
        LEMMA: "new"
    }, {
        is_girl: True
    }, {
        LEMMA: "in"
    }, {
        LEMMA: "town"
    }])
    matcher.add_pattern(2, [{
        LEMMA: "new"
    }, {
        LEMMA: "to"
    }, {
        upper_start: True
    }, {
        LEMMA: "area"
    }])

    # Negative Matcher Patterns
    matcher.add_entity(3)
    matcher.add_pattern(3, [{LEMMA: "new"}])
    matcher.add_pattern(3, [{LEMMA: "girl"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(3, [{LEMMA: "grand"}, {LEMMA: "new"}])
    matcher.add_pattern(3, [{LEMMA: "new"}, {LEMMA: "at"}])
    matcher.add_pattern(3, [{
        LEMMA: "new"
    }, {
        LEMMA: "to"
    }, {
        LEMMA: "business"
    }])
    matcher.add_pattern(3, [{
        LEMMA: "new"
    }, {
        LEMMA: "to"
    }, {
        LEMMA: "industry"
    }])
    matcher.add_pattern(3, [{
        LEMMA: "new"
    }, {
        LEMMA: "to"
    }, {
        LEMMA: "scenario"
    }])
    matcher.add_pattern(3, [{LEMMA: "dream", DEP: "nsubj"}, {LEMMA: "arrive"}])
    matcher.add_pattern(3, [{
        LEMMA: "fantasy",
        DEP: "nsubj"
    }, {
        LEMMA: "arrive"
    }])
    matcher.add_pattern(3, [{LEMMA: "you", DEP: "nsubj"}, {LEMMA: "arrive"}])
    matcher.add_pattern(3, [{LEMMA: "area"}, {LEMMA: "only"}])
    matcher.add_pattern(3, [{upper_start: True}, {LEMMA: "area"}])
    matcher.add_pattern(3, [{LEMMA: "you", DEP: "nsubj"}, {LEMMA: "leave"}])
    matcher.add_pattern(3, [{
        LEMMA: "it",
        DEP: "dobj"
    }, {
        LEMMA: "leave"
    }, {
        IS_ASCII: True,
        DEP: "nmod",
        TAG: "TO"
    }])
    matcher.add_pattern(3, [{
        LEMMA: "that",
        DEP: "dobj"
    }, {
        LEMMA: "leave"
    }, {
        IS_ASCII: True,
        DEP: "nmod",
        TAG: "TO"
    }])
    matcher.add_pattern(3, [{LEMMA: "best"}, {LEMMA: "move"}])
    matcher.add_pattern(3, [{LEMMA: "next"}, {LEMMA: "move"}])
    matcher.add_pattern(3, [{
        LEMMA: "arrive"
    }, {
        IS_ASCII: True
    }, {
        IS_ASCII: True,
        DEP: "xcomp"
    }])
    matcher.add_pattern(3, [{LEMMA: "arrive"}, {IS_ASCII: True, DEP: "xcomp"}])
    matcher.add_pattern(3, [{LEMMA: "visit"}, {LEMMA: "sister", DEP: "dobj"}])
    matcher.add_pattern(3, [{
        LEMMA: "visit"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "sister",
        DEP: "dobj"
    }])
    matcher.add_pattern(3, [{LEMMA: "visit"}, {LEMMA: "family", DEP: "dobj"}])
    matcher.add_pattern(3, [{
        LEMMA: "visit"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "family",
        DEP: "dobj"
    }])
    matcher.add_pattern(3, [{LEMMA: "we", DEP: "poss"}, {LEMMA: "visit"}])

    # Strong Negative Matcher Patterns
    matcher.add_entity(4)
    matcher.add_pattern(4, [{LEMMA: "town"}, {LEMMA: "girl"}])
    matcher.add_pattern(4, [{LEMMA: "on"}, {LEMMA: "the"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "near"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "down"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "town"}, {LEMMA: "hall"}])
    matcher.add_pattern(4, [{LEMMA: "best"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{
        LEMMA: "best"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "in"
    }, {
        LEMMA: "town"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "best"
    }, {
        IS_ASCII: True
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "in"
    }, {
        LEMMA: "town"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "best"
    }, {
        LEMMA: "in"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "town"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "best"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "in"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "town"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "best"
    }, {
        IS_ASCII: True
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "in"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "town"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "not"
    }, {
        LEMMA: "new"
    }, {
        LEMMA: "in"
    }, {
        LEMMA: "town"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "not"
    }, {
        LEMMA: "new"
    }, {
        LEMMA: "to"
    }, {
        LEMMA: "town"
    }])
    matcher.add_pattern(4, [{LEMMA: "not"}, {LEMMA: "leave"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{
        LEMMA: "i",
        DEP: "nsubj"
    }, {
        LEMMA: "leave"
    }, {
        LEMMA: "you",
        DEP: "dobj"
    }])
    matcher.add_pattern(4, [{LEMMA: "new"}, {LEMMA: "but"}])
    matcher.add_pattern(4, [{
        LEMMA: "new"
    }, {
        LEMMA: "backpage",
        DEP: "nmod",
        TAG: "TO"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "new"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "backpage",
        DEP: "nmod",
        TAG: "TO"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "new"
    }, {
        LEMMA: "bp",
        DEP: "nmod",
        TAG: "TO"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "new"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "bp",
        DEP: "nmod",
        TAG: "TO"
    }])
    #DS
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "message", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "msg", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "txt", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "text", DEP: "dobj"}])
    matcher.add_pattern(4, [{
        LEMMA: "leave"
    }, {
        LEMMA: "impression",
        DEP: "dobj"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "leave"
    }, {
        LEMMA: "voicemail",
        DEP: "dobj"
    }])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "smile", DEP: "dobj"}])
    matcher.add_pattern(4, [{
        LEMMA: "leave"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "message",
        DEP: "dobj"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "leave"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "msg",
        DEP: "dobj"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "leave"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "txt",
        DEP: "dobj"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "leave"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "text",
        DEP: "dobj"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "leave"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "impression",
        DEP: "dobj"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "leave"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "voicemail",
        DEP: "dobj"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "leave"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "smile",
        DEP: "dobj"
    }])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "satisfied"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "memory", DEP: "dobj"}])
    matcher.add_pattern(4, [{
        LEMMA: "leave"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "memory",
        DEP: "dobj"
    }])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "you"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "u"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "with"}])
    matcher.add_pattern(4, [{
        LEMMA: "leave"
    }, {
        LEMMA: "a"
    }, {
        LEMMA: "gentleman"
    }])
    matcher.add_pattern(4, [{LEMMA: "or"}, {LEMMA: "leave"}])
    matcher.add_pattern(4, [{LEMMA: "or"}, {LEMMA: "i"}, {LEMMA: "leave"}])
    matcher.add_pattern(4, [{LEMMA: "move"}, {LEMMA: "on"}])
    matcher.add_pattern(4, [{LEMMA: "i"}, {LEMMA: "move"}, {LEMMA: "like"}])
    matcher.add_pattern(4, [{LEMMA: "arrive"}, {LEMMA: "on"}, {LEMMA: "time"}])
    matcher.add_pattern(4, [{LEMMA: "can"}, {LEMMA: "move"}])
    matcher.add_pattern(4, [{LEMMA: "new"}, {LEMMA: "but"}])
    matcher.add_pattern(4, [{
        LEMMA: "on"
    }, {
        LEMMA: "my"
    }, {
        LEMMA: "way"
    }, {
        LEMMA: "to"
    }, {
        TAG: "PRP"
    }])
    matcher.add_pattern(4, [{LEMMA: "u"}, {LEMMA: "get"}, {LEMMA: "here"}])
    matcher.add_pattern(4, [{LEMMA: "you"}, {LEMMA: "get"}, {LEMMA: "here"}])
    matcher.add_pattern(4, [{LEMMA: "go"}, {LEMMA: "to"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "new"}, {LEMMA: "management"}])

    return matcher
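
The add_entity/add_pattern calls above use the spaCy 1.x Matcher API. As a rough sketch only (hypothetical key name, assuming spaCy 2.x where token attributes are written as strings), the same kind of patterns would be registered with one add() call per rule group:

matcher = Matcher(nlp.vocab)
matcher.add(
    "MOVEMENT_POSITIVE",
    None,
    [{"LEMMA": "last"}, {"LEMMA": "night"}, {"LEMMA": "in"}, {"LEMMA": "town"}],
    [{"LEMMA": "leave"}, {"LEMMA": "town"}],
)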
Code example #40
def test_issue4120(en_vocab):
    """Test that matches without a final {OP: ?} token are returned."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
    doc1 = Doc(en_vocab, words=["a"])
    assert len(matcher(doc1)) == 1  # works
    doc2 = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc2)) == 2  # fixed
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
    doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc3)) == 2  # works
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
    doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc4)) == 3  # fixed
Code example #41
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)


def set_sentiment(matcher, doc, i, matches):
    doc.sentiment += 0.1


pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
matcher.add("GoogleIO", None, pattern1)  # Match "Google I/O" or "Google i/o"
matcher.add("HAPPY", set_sentiment, *pattern2)  # Match one or more happy emoji

doc = nlp(u"A text about Google I/O 😀😀😀")
matches = matcher(doc)

for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(string_id, span.text)
print("Sentiment", doc.sentiment)
Code example #42
        return False
    if token_.is_stop:
        return False
    if len(token_.orth_) < 3:
        return False

    return True

path_to_db = "/media/norpheo/mySQL/db/ssorc"
path_to_annotations = os.path.join(path_to_db, "annotations_ner")
pandas_path = os.path.join(path_to_db, "pandas")
path_to_ner = os.path.join(path_to_db, "NER")

nlp = spacy.load(os.path.join(path_to_db, "models", "en_core_web_sm_nertrained"))
vocab = nlp.vocab.from_disk(os.path.join(path_to_db, "dictionaries", "spacy.vocab"))
matcher = Matcher(vocab)

with open(os.path.join(path_to_ner, "ml_algos.txt"), "r") as handle:
    ml_algos = set()
    ml_algos_list = list()
    for line in handle:
        algo = line.strip().lower()
        if algo not in ml_algos:
            ml_algos.add(algo)
            ml_algos_list.append(algo.split(" "))

for i in range(len(ml_algos_list)):
    for j in range(len(ml_algos_list)):
        if i != j:
            algo1 = ml_algos_list[i]
            algo2 = ml_algos_list[j]
Code example #43
def run_prdualrank(T_0, unranked_patterns, unranked_phrases, file):
    global final_patterns, final_keywords, pattern_to_score_map, keyword_to_score_map, ngram_prob_map, phrase_seg_score, removed_phrases, wiki_score_cache, error_count, total_ngram_counts
    phrase2id = {}
    for i in range(len(unranked_phrases)):
        phrase2id[unranked_phrases[i]] = i

    id2phrase = {}
    for i in range(len(unranked_phrases)):
        id2phrase[i] = unranked_phrases[i]

    id2pattern = {}
    for i in range(len(unranked_patterns)):
        id2pattern[i] = unranked_patterns[i]

    seedIdwConfidence = {}
    for key, val in phrase2id.items():
        if key in T_0:
            seedIdwConfidence[val] = 0.0

    id2patterns = defaultdict(set)
    pattern2ids = defaultdict(set)

    context_matrix = np.zeros((len(unranked_phrases), len(unranked_patterns)))
    # find c (t, p)
    with open(file, 'r') as f:
        file_chunk = partition(f)
        matcher = Matcher(nlp.vocab)
        for t in file_chunk:
            doc = nlp(t)
            for i in range(len(unranked_patterns)):
                offset = 0
                for pattern_dict in unranked_patterns[i]:
                    if 'POS' in pattern_dict:
                        break
                    offset += 1
                matcher.add("extraction", None, unranked_patterns[i])
                matches = matcher(doc)
                for match_id, start, end in matches:
                    span = doc[start + offset:end].text
                    j = unranked_phrases.index(
                        span) if span in unranked_phrases else -1
                    if j == -1:
                        continue
                    context_matrix[j, i] += 1
                    id2patterns[j].add(i)
                    pattern2ids[i].add(j)
                matcher.remove("extraction")

    id2sup = {}
    for i in range(len(unranked_phrases)):
        id2sup[i] = 0

    pattern2sup = {}
    for i in range(len(unranked_patterns)):
        pattern2sup[i] = 0

    for id in id2patterns.keys():
        sum = 0
        for col in range(len(unranked_patterns)):
            sum += context_matrix[id, col]
        id2sup[id] = sum

    for pattern in pattern2ids.keys():
        sum = 0
        for row in range(len(unranked_phrases)):
            sum += context_matrix[row, pattern]
        pattern2sup[pattern] = sum

    l1, l2, l3, l4, m1, m2, m3, m4 = prDualRank(seedIdwConfidence, [],
                                                id2patterns,
                                                pattern2ids, {}, {}, {}, {},
                                                id2phrase,
                                                context_matrix.tolist(),
                                                id2sup,
                                                pattern2sup,
                                                FLAGS_VERBOSE=False,
                                                FLAGS_DEBUG=False)

    return l1, l2, l3, l4, m1, m2, m3, m4
Code example #44
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)


def remove_overlapping_matches(matches):
    remove = []
    for m1 in range(len(matches) - 1):
        if m1 in remove:
            continue
        for m2 in range(m1 + 1, len(matches)):
            if m2 in remove:
                continue
            _, s1, e1 = matches[m1]
            _, s2, e2 = matches[m2]
            if s1 >= s2 and e1 <= e2:
                remove.append(m1)
                break
            if s2 >= s1 and e2 <= e1:
                remove.append(m2)
                continue

    return [matches[m] for m in range(len(matches)) if m not in remove]


def markup_timex(doc, matches):
    matches = remove_overlapping_matches(matches)
    out = ""
    prev = 0
Code example #45
def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires is_parsed
    matcher = Matcher(en_vocab)
    matcher.add("TEST", None, [{"DEP": "a"}])
    matcher(doc1)
    with pytest.raises(ValueError):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
    # TAG, POS, LEMMA require is_tagged
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
        matcher.add("TEST", None, [{attr: "a"}])
        matcher(doc2)
        with pytest.raises(ValueError):
            matcher(doc1)
        with pytest.raises(ValueError):
            matcher(doc3)
    # TEXT/ORTH only require tokens
    matcher = Matcher(en_vocab)
    matcher.add("TEST", None, [{"ORTH": "a"}])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
    matcher = Matcher(en_vocab)
    matcher.add("TEST", None, [{"TEXT": "a"}])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
Code example #46
#!/usr/bin/python
""" WRITING MATCH PATTERNS """
# Write one pattern that only matches mentions of the
#  full iOS versions: “iOS 7”, “iOS 11” and “iOS 10”.

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper.")

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Code example #47
import json
from spacy.matcher import Matcher
from spacy.lang.es import Spanish

with open("exercises/es/adidas.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = Spanish()
matcher = Matcher(nlp.vocab)
pattern1 = [{"LOWER": "adidas"}, {"LOWER": "zx"}]
pattern2 = [{"LOWER": "adidas"}, {"IS_DIGIT": True}]
matcher.add("ROPA", None, pattern1, pattern2)

TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of the resulting spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get the resulting (start character, end character, label) tuples
    entities = [(span.start_char, span.end_char, "ROPA") for span in spans]
    # Format the results as (doc.text, entities) tuples
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

print(*TRAINING_DATA, sep="\n")
Code example #48
All the reservations will be made by the wedding planner.\
For the bake sale, two dozen cookies will be baked by Susan.\
The comet was viewed by the science class.\
The video was posted on Facebook by Alex.\
Instructions will be given to you by the director.\
The Grand Canyon is viewed by thousands of tourists every year.\
The house was remodeled by the homeowners to help it sell.\
The victory will be celebrated by the team tomorrow.\
The metal beams were eventually corroded by the saltwater.\
The baby was carried by the kangaroo in her pouch.\
The last cookie was eaten by whom?"
try:
    nlp = spacy.load('en_core_web_sm')
except:
    nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
def is_passive(sentence):
    doc = nlp(sentence)
    passive_rule = [{'DEP': 'nsubjpass'}, {'DEP': 'aux', 'OP': '*'}, {'DEP': 'auxpass'}, {'TAG': 'VBN'}]
    matcher.add('Passive', None, passive_rule)
    matches = matcher(doc)
    if matches:
        return "Passive"
    else:
        return "Active"

if __name__=='__main__':
    #nlp = spacy.load('en_core_web_sm')
    #matcher = Matcher(nlp.vocab)
    text = Passive
    doc = nlp(text)
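
A minimal usage sketch for is_passive above (hypothetical sentences; note that the rule is re-added to the matcher on every call, so in practice matcher.add would normally be hoisted out of the function):

print(is_passive("The comet was viewed by the science class."))  # expected: Passive
print(is_passive("The science class viewed the comet."))  # expected: Active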
Code example #49
import json
from spacy.matcher import Matcher
from spacy.lang.en import English

with open("exercises/iphone.json") as f:
    TEXTS = json.loads(f.read())

nlp = English()
matcher = Matcher(nlp.vocab)
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
matcher.add("GADGET", None, pattern1, pattern2)

TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

print(*TRAINING_DATA, sep="\n")
コード例 #50
0
def test_matcher_no_zero_length(en_vocab):
    doc = Doc(en_vocab, words=["a", "b"], tags=["A", "B"])
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
    assert len(matcher(doc)) == 0
コード例 #51
0
slack_bot_token = os.environ['SLACK_BOT_TOKEN']
slack_client = WebClient(slack_bot_token)

slack_bot_id = os.environ['SLACK_BOT_USER_ID']

####==== SKYSCANNER ====####
sky_url = os.environ['SKYSCAN_URL']
rapid_host = os.environ['RAPID_HOST']
rapid_key = os.environ['RAPID_KEY']

####===== spaCy ====####
# Load spaCy object
nlp = spacy.load('en_core_web_sm')

# Create Matcher object for phrase matching
matcher = Matcher(nlp.vocab)

# Starting location
pattern_start = [
    {
        'LOWER': 'from',
    },
    {
        "ENT_TYPE": "GPE",
        "OP": "+"
    },
]

# Ending/Destination location
pattern_end = [{'LOWER': 'to'}, {"ENT_TYPE": "GPE", "OP": "+"}]
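# The snippet is truncated here. A hedged sketch (not part of the original
# source) of how these patterns might be registered and applied; the label
# names and the example sentence are illustrative only. On spaCy 3 the call
# would be matcher.add('START_LOC', [pattern_start]) instead.
matcher.add('START_LOC', None, pattern_start)
matcher.add('END_LOC', None, pattern_end)
doc = nlp("I want to fly from Berlin to Tokyo next week")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], '->', doc[start:end].text)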
コード例 #52
0
def test_matcher_valid_callback(en_vocab):
    """Test that on_match can only be None or callable."""
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("TEST", [[{"TEXT": "test"}]], on_match=[])
    matcher(Doc(en_vocab, words=["test"]))
コード例 #53
0
"""
@author: Josh
"""

#Using the Matcher (1)

import spacy

#Import the Matcher
from spacy.matcher import Matcher

#Load a model and create the nlp object
nlp = spacy.load('en_core_web_sm')

#Initialise the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

#Add the pattern to the matcher
pattern = [{"ORTH": "iPhone"}, {"ORTH": "X"}]
matcher.add('IPHONE_PATTERN', None, pattern)

#Process some text
doc = nlp("New iPhone X release date leaked")

#Call the matcher on the doc
matches = matcher(doc)

#Using the Matcher (2)

#Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span and print its text
    matched_span = doc[start:end]
    print(matched_span.text)
コード例 #54
0
def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    doc2[0].set_morph("Feat=Val")
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires DEP
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"DEP": "a"}]])
    matcher(doc1)
    with pytest.raises(ValueError):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
    # errors can be suppressed if desired
    matcher(doc2, allow_missing=True)
    matcher(doc3, allow_missing=True)
    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{attr: "a"}]])
        matcher(doc2)
        with pytest.raises(ValueError):
            matcher(doc1)
        with pytest.raises(ValueError):
            matcher(doc3)
    # TEXT/ORTH only require tokens
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TEXT": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
コード例 #55
0
ファイル: exc_02_13.py プロジェクト: datakime/spacy-course
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# Create the match patterns ("LOWER" values must be lowercase, and
# "ad-free" is tokenized into three tokens: "ad", "-", "free")
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
pattern2 = [{"LOWER": "ad"}, {"TEXT": "-"}, {"LOWER": "free"}, {"POS": "NOUN"}]

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", [pattern1])
matcher.add("PATTERN2", [pattern2])

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)
コード例 #56
0
def test_matcher_basic_check(en_vocab):
    matcher = Matcher(en_vocab)
    # Potential mistake: pass in pattern instead of list of patterns
    pattern = [{"TEXT": "hello"}, {"TEXT": "world"}]
    with pytest.raises(ValueError):
        matcher.add("TEST", pattern)
コード例 #57
0
def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
コード例 #58
0
def find_matches(text, keywords, countries):
    all_matches = []
    country_reg_string = "((?:"
    for c in countries:
        country_reg_string += c.lower() + "|"
    country_reg_string = country_reg_string.rstrip("|")
    country_reg_string += ")(?:\\')?\w+)"
    doc_sent = nlp(text)
    for sent in doc_sent.sents:
        doc = nlp(sent.text)

        def on_match(matcher, doc, id, matches):
            country = None
            location = None
            date = None
            for m in matches:
                num, start, end = m
            keyword = nlp.vocab.strings[num]
            for ent in doc[start:end].ents:
                if ent.label_ == 'GPE' or ent.label_ == 'NORP':
                    for coun in countries:
                        if coun.lower() in ent.text.lower():
                            country = coun
                            break
                if ent.label_ == 'LOC':
                    location = re.sub(r'^the ',
                                      '',
                                      ent.text.strip(),
                                      flags=re.IGNORECASE)

                if ent.label_ == 'DATE':
                    date = ent.text.strip()
            if country != None and location != None:
                value = {
                    "country": country,
                    "event": keyword,
                    "location": location,
                    "sentence": str(sent.text).strip()
                }
                if date != None:
                    value['date'] = date
                if value not in all_matches:
                    all_matches.append(value)

                    print(doc[start:end])

        matcher = Matcher(nlp.vocab)

        for c in countries:
            for entity_i in keywords:
                if " " in entity_i:
                    pre_build = [{
                        "NORM": {
                            "REGEX": country_reg_string
                        }
                    }, {
                        "OP": "*"
                    }]
                    for i in entity_i.split(" "):
                        pre_build.append({"NORM": i})
                    pre_build.append({"OP": "*"})
                    pre_build.append({"TAG": "IN"})
                    pre_build.append({"OP": "*"})
                    pre_build.append({"ENT_TYPE": "LOC"})
                    matcher.add(entity_i, on_match, pre_build)
                else:
                    matcher.add(entity_i, on_match, [{
                        "NORM": {
                            "REGEX": country_reg_string
                        }
                    }, {
                        "OP": "*"
                    }, {
                        "NORM": entity_i
                    }, {
                        "OP": "*"
                    }, {
                        "TAG": "IN"
                    }, {
                        "OP": "*"
                    }, {
                        "ENT_TYPE": "LOC"
                    }])

        matches = matcher(doc)
    return list(all_matches)
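# Hedged usage note (not part of the original source): find_matches expects
# raw text plus keyword and country lists; the arguments below are purely
# illustrative.
#   results = find_matches(article_text, ["flood", "outbreak"], ["Kenya", "Fiji"])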
コード例 #59
0
import spacy
import os
from flask import Flask, request, jsonify, render_template
from flask_pymongo import PyMongo
from pymongo import MongoClient
from celery import Celery, current_app
from spacy import displacy
from spacy.matcher import Matcher
import en_core_web_sm
nlp = spacy.load('en_core_web_sm')

m_tool = Matcher(nlp.vocab)
p1 = [{
    'LOWER': 'bootstrap'
}, {
    'LOWER': 'oracle'
}, {
    'LOWER': 'python'
}, {
    'LOWER': 'mysql'
}, {
    'LOWER': 'django'
}, {
    'LOWER': 'web development'
}, {
    'LOWER': 'unix'
}, {
    'LOWER': 'sql'
}, {
    'LOWER': 'selenium'
}, {
コード例 #60
0
html_doc = soup.prettify()
print(html_doc[:15])

# Extract the text content from the HTML.
text_from_html_document = u''
for x in soup.findAll('body'):
    text_from_html_document += x.text

# Import the Matcher.
from spacy.matcher import Matcher

# Load the language model into the nlp object.
nlp = spacy.load('en_core_web_md')

# Initialize the matcher.
matcher = Matcher(nlp.vocab)

# Add the patterns to search for.

pattern = [{'LIKE_NUM': True}]
matcher.add('SimpleNumeric_PATTERN', None, pattern)

# Process the document.
doc = nlp(text_from_html_document)

# Check the document for matches.
matches = matcher(doc)

# Build an index over the body of the HTML
matches_container = []
html_body_index = {}