def test_operator_combos(en_vocab):
    """Check literal tokens combined with "+" operators against short docs.

    Each case is (characters of the doc, whitespace-separated pattern spec,
    whether at least one match is expected). A spec part ending in "+" becomes
    a one-or-more token on its first character; any other part is matched by
    its exact ORTH.
    """
    expectations = (
        ("aaab", "a a a b", True),
        ("aaab", "a+ b", True),
        ("aaab", "a+ a+ b", True),
        ("aaab", "a+ a+ a b", True),
        ("aaab", "a+ a+ a+ b", True),
        ("aaab", "a+ a a b", True),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaa", "a+ b", False),
        ("aaa", "a+ a+ b", False),
        ("aaa", "a+ a+ a+ b", False),
        ("aaa", "a+ a b", False),
        ("aaa", "a+ a a b", False),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaab", "a+ a b", True),
    )
    for text, spec, should_match in expectations:
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(text))
        pattern = [
            {"ORTH": part[0], "OP": "+"} if part.endswith("+") else {"ORTH": part}
            for part in spec.split()
        ]
        matcher.add("PATTERN", None, pattern)
        found = matcher(doc)
        # The failure message identifies the offending case.
        if should_match:
            assert found, (text, spec)
        else:
            assert not found, (text, spec)
def test_issue615(en_tokenizer):
    """Merging matched phrases from an on-match callback leaves labelled entities."""

    def merge_phrases(matcher, doc, i, matches):
        """Merge every matched phrase, but only when called for the last match.

        Merging changes the token indices, so earlier invocations do nothing
        and all spans are merged together on the final callback.
        """
        is_last_match = i == len(matches) - 1
        if not is_last_match:
            return None
        spans = []
        for match_label, start, end in matches:
            spans.append(Span(doc, start, end, label=match_label))
        with doc.retokenize() as retokenizer:
            for span in spans:
                if span.label_:
                    tag = "NNP"
                else:
                    tag = span.root.tag_
                retokenizer.merge(span, attrs={"tag": tag, "lemma": span.text})
                doc.ents = doc.ents + (span,)

    label = "Sport_Equipment"
    doc = en_tokenizer("The golf club is broken")
    matcher = Matcher(doc.vocab)
    matcher.add(label, merge_phrases, [{"ORTH": "golf"}, {"ORTH": "club"}])
    matcher(doc)
    entities = list(doc.ents)
    assert entities != []
    assert entities[0].label != 0
class RussianTokenizer(object):
    """Callable that post-processes a doc using two groups of Matcher patterns.

    Matches registered under the 'pattern' key are merged into single tokens.
    Tokens inside matches registered under the 'sentence_terminal' key have a
    truthy ``sent_start`` reset to False.
    """

    name = 'russian_tokenizer'

    def __init__(self, nlp, merge_patterns=None, terminal_patterns=None):
        """Create the matcher and register whichever pattern groups were given.

        nlp: object exposing the ``vocab`` shared with the matcher.
        merge_patterns: patterns whose matches get merged; skipped when falsy.
        terminal_patterns: patterns whose matched tokens lose their
            sentence-start mark; skipped when falsy.
        """
        self.matcher = Matcher(nlp.vocab)
        strings = nlp.vocab.strings
        self.token_merge = strings['pattern']
        self.sentence_terminal = strings['sentence_terminal']
        registrations = (
            (self.token_merge, merge_patterns),
            (self.sentence_terminal, terminal_patterns),
        )
        for key, patterns in registrations:
            if patterns:
                self.matcher.add(key, None, *patterns)

    def __call__(self, doc):
        """Apply both pattern groups to ``doc`` and return the same doc."""
        to_merge = []
        for match_id, start, end in self.matcher(doc):
            if match_id == self.token_merge:
                to_merge.append(doc[start:end])
            elif match_id == self.sentence_terminal:
                # remove all sentence start marks from span that match pattern
                for token in doc[start:end]:
                    if token.sent_start:
                        token.sent_start = False
        # Merging happens only after every match has been collected.
        for span in to_merge:
            span.merge()
        return doc
def write_conllu(docs, file_):
    """Write ``docs`` to ``file_`` as CoNLL-U text.

    Matches of the "subtok" rule are merged first (each merged span reaches
    one token past the match end), and every self-headed token is relabelled
    "ROOT". Each sentence is then written with its ``sent_id`` and ``text``
    comment lines followed by one line per token.

    docs: indexable, non-empty collection of docs; the vocab of the first
        one is used for the matcher.
    file_: object with a ``write`` method.

    Raises ValueError (after printing diagnostics) when a sentence contains
    no self-headed token labelled "ROOT".
    """
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for doc_index, doc in enumerate(docs):
        subtok_spans = [doc[start : end + 1] for _, start, end in merger(doc)]
        with doc.retokenize() as retokenizer:
            for span in subtok_spans:
                retokenizer.merge(span)
        # TODO: This shouldn't be necessary? Should be handled in merge
        for token in doc:
            if token.head.i == token.i:
                token.dep_ = "ROOT"
        file_.write("# newdoc id = {i}\n".format(i=doc_index))
        for sent_index, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=doc_index, j=sent_index))
            file_.write("# text = {text}\n".format(text=sent.text))
            sent_length = len(sent)
            for position, token in enumerate(sent):
                file_.write(_get_token_conllu(token, position, sent_length) + "\n")
            file_.write("\n")
            has_root = any(
                word.head.i == word.i and word.dep_ == "ROOT" for word in sent
            )
            if has_root:
                continue
            print("Rootless sentence!")
            print(sent)
            print(doc_index)
            for w in sent:
                print(w.i, w.text, w.head.text, w.head.i, w.dep_)
            raise ValueError
def test_matcher_match_zero_plus(matcher):
    """A quote, any run of non-punctuation tokens, and a quote match exactly once."""
    vocab = matcher.vocab
    # Only the vocab of the incoming matcher is reused; its rules are not.
    quote_matcher = Matcher(vocab)
    quote_matcher.add(
        "Quote",
        None,
        [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}],
    )
    doc = Doc(vocab, words='He said , " some words " ...'.split())
    assert len(quote_matcher(doc)) == 1
def write_conllu(docs, file_):
    """Write ``docs`` to ``file_`` as CoNLL-U text, validating token heads.

    Matches of the "subtok" rule are merged first (each merged span reaches
    one token past the match end). Each sentence is then written with its
    ``sent_id`` and ``text`` comment lines followed by one entry per token,
    produced by the ``get_conllu_lines`` token extension.

    docs: indexable, non-empty collection of docs; the vocab of the first
        one is used for the matcher.
    file_: object with a ``write`` method.

    Raises ValueError when a token's head lies outside its own sentence.
    Before raising, the sentence and neighbouring tokens are printed for
    debugging.
    """
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = merger(doc)
        spans = [doc[start : end + 1] for _, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            first_i = sent[0].i
            last_i = sent[-1].i
            for k, token in enumerate(sent):
                if token.head.i > last_i or token.head.i < first_i:
                    # Clamp the start of the leading context: a negative slice
                    # start would be counted from the end of the doc, so for a
                    # sentence within the first ten tokens the wrong (or no)
                    # context would be printed.
                    for word in doc[max(0, first_i - 10) : first_i]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    for word in sent:
                        print(word.i, word.head.i, word.text, word.dep_)
                    # Trailing context starts at the sentence's last token.
                    for word in doc[last_i : last_i + 10]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    raise ValueError(
                        "Invalid parse: head outside sentence (%s)" % token.text
                    )
                file_.write(token._.get_conllu_lines(k) + "\n")
            file_.write("\n")
def test_matcher_operator_shadow(en_vocab):
    """An "a", one-or-more alphabetic tokens, then "c" yields one match over the doc."""
    matcher = Matcher(en_vocab)
    matcher.add(
        "A.C",
        None,
        [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}],
    )
    doc = Doc(matcher.vocab, words=["a", "b", "c"])
    found = matcher(doc)
    assert len(found) == 1
    first_match = found[0]
    # Compare only the (start, end) part; the match id is not checked.
    assert first_match[1:] == (0, 3)
def test_issue_1971_2(en_vocab):
    """Two patterns under one key should each match once in the sample doc."""
    currency_then_number = [
        {"ORTH": "EUR", "LOWER": {"IN": ["eur"]}},
        {"LIKE_NUM": True},
    ]
    number_then_currency = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, currency_then_number, number_then_currency)
    found = matcher(doc)
    assert len(found) == 2
def test_match_consuming(doc, text, pattern, re_pattern):
    """Test that the matcher finds as many matches as ``re.finditer`` does.

    The token pattern is run over ``doc`` and the regular expression over
    ``text``; only the number of matches is compared.
    """
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    token_matches = matcher(doc)
    regex_spans = []
    for regex_match in re.finditer(re_pattern, text):
        regex_spans.append(regex_match.span())
    assert len(token_matches) == len(regex_spans)
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    # force=True keeps the test re-runnable in one process: registering an
    # extension name that already exists on Token raises without it. The
    # other extension-based tests in this file pass force=True as well.
    Token.set_extension("issue3555", default=None, force=True)
    matcher = Matcher(en_vocab)
    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["have", "apple"])
    # Only checks that matching completes without raising.
    matcher(doc)
def test_issue1883():
    """A deep-copied Matcher should match the same way as the original."""

    def count_hello_matches(candidate):
        # Build the doc on the candidate's own vocab, as the copy carries one.
        hello_doc = Doc(candidate.vocab, words=["hello"])
        return len(candidate(hello_doc))

    original = Matcher(Vocab())
    original.add("pat1", None, [{"orth": "hello"}])
    assert count_hello_matches(original) == 1
    clone = copy.deepcopy(original)
    assert count_hello_matches(clone) == 1
def test_greedy_matching(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior of the * op is consistent with
    other re implementations.

    Token-level (start, end) offsets from the matcher are compared pairwise
    with the spans ``re.finditer`` reports for ``text``.
    """
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    token_matches = matcher(doc)
    regex_spans = [found.span() for found in re.finditer(re_pattern, text)]
    # zip stops at the shorter sequence, so only paired matches are compared.
    for token_match, regex_span in zip(token_matches, regex_spans):
        offsets = token_match[1:]
        assert offsets == regex_span
def test_issue1945():
    """Test regression in Matcher introduced in v2.0.6."""
    matcher = Matcher(Vocab())
    matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}])
    doc = Doc(matcher.vocab, words=["a", "a", "a"])
    found = matcher(doc)
    # we should see two overlapping matches here
    assert len(found) == 2
    first, second = found[0], found[1]
    assert first[1:] == (0, 2)
    assert second[1:] == (1, 3)
def test_matcher_compare_length(en_vocab):
    """A LENGTH >= 2 comparison matches only tokens of at least two characters."""
    matcher = Matcher(en_vocab)
    matcher.add("LENGTH_COMPARE", None, [{"LENGTH": {">=": 2}}])
    expected_counts = (
        (["a", "aa", "aaa"], 2),
        (["a"], 0),
    )
    for words, expected in expected_counts:
        doc = Doc(en_vocab, words=words)
        assert len(matcher(doc)) == expected
def matcher(en_vocab):
    """Return a Matcher loaded with the "JS", "GoogleNow" and "Java" rules."""
    rules = {
        "JS": [[{"ORTH": "JavaScript"}]],
        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
        "Java": [[{"LOWER": "java"}]],
    }
    rule_matcher = Matcher(en_vocab)
    for key in rules:
        # Each rule maps to a list of patterns, passed as separate arguments.
        rule_matcher.add(key, None, *rules[key])
    return rule_matcher
def test_matcher_regex_shape(en_vocab):
    """A REGEX on SHAPE matches tokens whose shape contains no "x"."""
    matcher = Matcher(en_vocab)
    matcher.add("NON_ALPHA", None, [{"SHAPE": {"REGEX": r"^[^x]+$"}}])
    expected_counts = (
        (["99", "problems", "!"], 2),
        (["bye"], 0),
    )
    for words, expected in expected_counts:
        doc = Doc(en_vocab, words=words)
        assert len(matcher(doc)) == expected
def test_matcher_any_token_operator(en_vocab):
    """Test that patterns with "any token" {} work with operators."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}])
    doc = Doc(en_vocab, words=["test", "hello", "world"])
    matched_texts = []
    for _, start, end in matcher(doc):
        matched_texts.append(doc[start:end].text)
    assert len(matched_texts) == 3
    first, second, third = matched_texts[0], matched_texts[1], matched_texts[2]
    assert first == "test"
    assert second == "test hello"
    assert third == "test hello world"
def test_matcher_set_value(en_vocab):
    """An IN set on ORTH matches each token whose text is in the set."""
    matcher = Matcher(en_vocab)
    matcher.add("A_OR_AN", None, [{"ORTH": {"IN": ["an", "a"]}}])
    expected_counts = (
        (["an", "a", "apple"], 2),
        (["aardvark"], 0),
    )
    for words, expected in expected_counts:
        doc = Doc(en_vocab, words=words)
        assert len(matcher(doc)) == expected
def test_matcher_set_value_operator(en_vocab):
    """An optional IN-set token followed by "house" matches with and without it."""
    matcher = Matcher(en_vocab)
    matcher.add(
        "DET_HOUSE",
        None,
        [{"ORTH": {"IN": ["a", "the"]}, "OP": "?"}, {"ORTH": "house"}],
    )
    expected_counts = (
        (["In", "a", "house"], 2),
        (["my", "house"], 1),
    )
    for words, expected in expected_counts:
        doc = Doc(en_vocab, words=words)
        assert len(matcher(doc)) == expected
def test_matcher_regex(en_vocab):
    """A REGEX on ORTH matches the tokens "an" and "a" but not "hi" or "bye"."""
    matcher = Matcher(en_vocab)
    matcher.add("A_OR_AN", None, [{"ORTH": {"REGEX": r"(?:a|an)"}}])
    expected_counts = (
        (["an", "a", "hi"], 2),
        (["bye"], 0),
    )
    for words, expected in expected_counts:
        doc = Doc(en_vocab, words=words)
        assert len(matcher(doc)) == expected
def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    extension_defaults = (("a", 1), ("b", 2))
    for extension_name, default in extension_defaults:
        Token.set_extension(extension_name, default=default, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    # Resolve match ids to their string keys so results compare by name.
    named_matches = []
    for match_id, start, end in matcher(doc):
        named_matches.append((en_vocab.strings[match_id], start, end))
    named_matches.sort()
    assert len(named_matches) == 4
    expected = [("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)]
    assert named_matches == sorted(expected)
def test_issue1450(string, start, end):
    """Test matcher works when patterns end with * operator.

    string: whitespace-separated tokens of the doc to match against.
    start, end: expected token offsets of the last match, or None when no
        match is expected at all.
    """
    pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
    matcher = Matcher(Vocab())
    matcher.add("TSTEND", None, pattern)
    doc = Doc(Vocab(), words=string.split())
    matches = matcher(doc)
    if start is None or end is None:
        # Nothing should match; stop here, since indexing the empty list
        # below would raise IndexError instead of reporting a test result.
        assert matches == []
        return
    assert matches[-1][1] == start
    assert matches[-1][2] == end
def test_issue3328(en_vocab):
    """Two IN-set patterns under one key return their matches in doc order."""
    doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
    matcher = Matcher(en_vocab)
    word_groups = (["hello", "how"], ["you", "doing"])
    patterns = [[{"LOWER": {"IN": group}}] for group in word_groups]
    matcher.add("TEST", None, *patterns)
    found = matcher(doc)
    assert len(found) == 4
    matched_texts = []
    for _, start, end in found:
        matched_texts.append(doc[start:end].text)
    assert matched_texts == ["Hello", "how", "you", "doing"]
def test_matcher_end_zero_plus(en_vocab):
    """Test matcher works when patterns end with * operator. (issue 1450)"""
    matcher = Matcher(en_vocab)
    matcher.add("TSTEND", None, [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}])

    def make_doc(string):
        # Whitespace-split the string into the words of a fresh doc.
        return Doc(matcher.vocab, words=string.split())

    expected_counts = (
        ("a", 1),
        ("a b", 2),
        ("a c", 1),
        ("a b c", 2),
        ("a b b c", 3),
        ("a b b", 3),
    )
    for string, expected in expected_counts:
        assert len(matcher(make_doc(string))) == expected
def test_matcher_sets_return_correct_tokens(en_vocab):
    """Single-word IN-set patterns should each return the token they name."""
    matcher = Matcher(en_vocab)
    patterns = [[{'LOWER': {'IN': [word]}}] for word in ("zero", "one", "two")]
    matcher.add('TEST', None, *patterns)
    doc = Doc(en_vocab, words="zero one two three".split())
    texts = []
    for match_label, start, end in matcher(doc):
        texts.append(Span(doc, start, end, label=match_label).text)
    assert texts == ['zero', 'one', 'two']
def test_matcher_extension_attribute(en_vocab):
    """A pattern can test a getter-backed custom token extension."""

    def get_is_fruit(token):
        return token.text in ("apple", "banana")

    matcher = Matcher(en_vocab)
    Token.set_extension("is_fruit", getter=get_is_fruit, force=True)
    matcher.add("HAVING_FRUIT", None, [{"ORTH": "an"}, {"_": {"is_fruit": True}}])
    expected_counts = (
        (["an", "apple"], 1),
        (["an", "aardvark"], 0),
    )
    for words, expected in expected_counts:
        doc = Doc(en_vocab, words=words)
        assert len(matcher(doc)) == expected
def test_issue850_basic():
    """Test Matcher matches with '*' operator and Boolean flag"""

    def lowercase(string):
        return string.lower()

    vocab = Vocab(lex_attr_getters={LOWER: lowercase})
    matcher = Matcher(vocab)
    matcher.add(
        "FarAway",
        None,
        [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}],
    )
    doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
    found = matcher(doc)
    assert len(found) == 1
    # The single match should span the whole four-token doc.
    _, start, end = found[0]
    assert start == 0
    assert end == 4
def test_issue1434():
    """Test matches occur when optional element at end of short doc."""
    vocab = Vocab(lex_attr_getters=LEX_ATTRS)
    # Both docs are created before the matcher, one with and one without a
    # token for the optional element to consume.
    docs = [
        Doc(vocab, words=["Hello", "World"]),
        Doc(vocab, words=["Hello"]),
    ]
    matcher = Matcher(vocab)
    matcher.add(
        "MyMatcher",
        None,
        [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}],
    )
    for doc in docs:
        found = matcher(doc)
        assert found
def test_issue590(en_vocab):
    """Test overlapping matches"""
    doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
    matcher = Matcher(en_vocab)
    percent_pattern = [
        {"IS_ALPHA": True},
        {"ORTH": ":"},
        {"LIKE_NUM": True},
        {"ORTH": "%"},
    ]
    assignment_pattern = [{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]
    # Both patterns are registered under the same key, in two separate calls.
    for pattern in (percent_pattern, assignment_pattern):
        matcher.add("ab", None, pattern)
    found = matcher(doc)
    assert len(found) == 2
def test_matcher_extension_set_membership(en_vocab):
    """An IN set can be applied to the value of a custom token extension."""

    def get_reversed(token):
        return "".join(reversed(token.text))

    matcher = Matcher(en_vocab)
    Token.set_extension("reversed", getter=get_reversed, force=True)
    matcher.add("REVERSED", None, [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}])
    expected_counts = (
        (["hi", "bye", "hello"], 2),
        (["aardvark"], 0),
    )
    for words, expected in expected_counts:
        doc = Doc(en_vocab, words=words)
        assert len(matcher(doc)) == expected
def test_matcher_subset_value_operator(en_vocab):
    """Exercise IS_SUBSET on MORPH and on TAG, including an empty value list."""

    def subset_matcher(attr, values):
        built = Matcher(en_vocab)
        built.add("M", [[{attr: {"IS_SUBSET": values}}]])
        return built

    def abc_doc():
        return Doc(en_vocab, words=["a", "b", "c"])

    matcher = subset_matcher("MORPH", ["Feat=Val", "Feat2=Val2"])
    doc = abc_doc()
    assert len(matcher(doc)) == 3
    # The first token's morphology grows step by step on the same doc.
    morph_steps = (
        ("Feat=Val", 3),
        ("Feat=Val|Feat2=Val2", 3),
        ("Feat=Val|Feat2=Val2|Feat3=Val3", 2),
        ("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4", 2),
    )
    for morph, expected in morph_steps:
        doc[0].set_morph(morph)
        assert len(matcher(doc)) == expected

    tag_cases = (
        # IS_SUBSET acts like "IN" for attrs other than MORPH
        (["A", "B"], 1),
        # IS_SUBSET with an empty list matches nothing
        ([], 0),
    )
    for values, expected in tag_cases:
        matcher = subset_matcher("TAG", values)
        doc = abc_doc()
        doc[0].tag_ = "A"
        assert len(matcher(doc)) == expected
def test_matcher_superset_value_operator(en_vocab):
    """Exercise IS_SUPERSET on MORPH and on TAG, including an empty value list."""

    def superset_matcher(attr, values):
        built = Matcher(en_vocab)
        built.add("M", [[{attr: {"IS_SUPERSET": values}}]])
        return built

    def abc_doc():
        return Doc(en_vocab, words=["a", "b", "c"])

    matcher = superset_matcher("MORPH", ["Feat=Val", "Feat2=Val2", "Feat3=Val3"])
    doc = abc_doc()
    assert len(matcher(doc)) == 0
    # The first token's morphology grows step by step on the same doc.
    morph_steps = (
        ("Feat=Val|Feat2=Val2", 0),
        ("Feat=Val|Feat2=Val2|Feat3=Val3", 1),
        ("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4", 1),
    )
    for morph, expected in morph_steps:
        doc[0].set_morph(morph)
        assert len(matcher(doc)) == expected

    tag_cases = (
        # IS_SUPERSET with more than one value only matches for MORPH
        (["A", "B"], 0),
        # IS_SUPERSET with one value is the same as ==
        (["A"], 1),
        # IS_SUPERSET with an empty value matches everything
        ([], 3),
    )
    for values, expected in tag_cases:
        matcher = superset_matcher("TAG", values)
        doc = abc_doc()
        doc[0].tag_ = "A"
        assert len(matcher(doc)) == expected
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("ja_core_news_sm")
doc = nlp(
    "松島、天橋立、宮島は日本三景として知られています。"
    "松島は宮城県、天橋立は京都府、宮島は広島県にそれぞれあります。"
)

# As of v2.3 the Japanese models do not set doc.is_tagged correctly,
# so set it explicitly.
# See: https://github.com/explosion/spaCy/issues/5802
doc.is_tagged = True

# Pattern made of a proper noun followed by the lemma "県"
pattern = [{"POS": "PROPN"}, {"LEMMA": "県"}]

# Create the matcher, register the pattern and run it over the doc
matcher = Matcher(nlp.vocab)
matcher.add("PREFECTURE_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Walk through the results and print the text of each matched span
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print("Match found:", matched_span.text)
def custom_tokenizer_to_df(nlp, doc):
    """Merge hashtags and long numbers in ``doc``, retag tokens, and tabulate them.

    nlp: object exposing the ``vocab`` used to build the matcher.
    doc: the doc to process; it is retokenized and retagged in place.

    Returns a pandas DataFrame with one row per token and the columns
    'Token', 'POS', 'NE', 'Lemma' and 'Tag'.
    """
    matcher = Matcher(nlp.vocab)

    def merge_and_flag(label, extension_name):
        # Collect the spans matched under `label`, merge each into a single
        # token and set the named extension to True on the span's tokens.
        spans = []
        for match_id, start, end in matcher(doc):
            if doc.vocab.strings[match_id] == label:
                spans.append(doc[start:end])
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    setattr(token._, extension_name, True)

    # Hashtags: '#' followed by an alphabetic token
    matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ALPHA": True}])
    Token.set_extension("is_hashtag", default=False, force=True)
    merge_and_flag("HASHTAG", "is_hashtag")

    # Long numbers: digits, then ',' or '.', then digits
    matcher.add("LONG_NUMBER", None,
                [{"IS_DIGIT": True}, {"ORTH": ','}, {"IS_DIGIT": True}])
    matcher.add("LONG_NUMBER", None,
                [{"IS_DIGIT": True}, {"ORTH": '.'}, {"IS_DIGIT": True}])
    Token.set_extension("is_long_number", default=False, force=True)
    merge_and_flag("LONG_NUMBER", "is_long_number")

    # Rules are applied in order, so a later rule that applies overrides the
    # tag set by an earlier one.
    tag_rules = (
        (lambda token: token._.is_hashtag, 'Hashtag'),
        (lambda token: token.like_url, 'URL'),
        (lambda token: token.like_email, 'Email'),
        (lambda token: token.is_stop, 'Stop Word'),
        (lambda token: token.like_num, 'Number'),
        (lambda token: token._.is_long_number, 'Number'),
        (lambda token: token.is_punct, 'Punctuation'),
    )
    for token in doc:
        for applies, tag in tag_rules:
            if applies(token):
                token.tag_ = tag

    # Write the tokens to data frame, one column per token attribute
    df = pd.DataFrame()
    columns = (
        ('Token', 'text'),
        ('POS', 'pos_'),
        ('NE', 'ent_iob_'),
        ('Lemma', 'lemma_'),
        ('Tag', 'tag_'),
    )
    for column, attribute in columns:
        df[column] = [getattr(token, attribute) for token in doc]
    return df
import spacy

# The Matcher lives in spacy.matcher
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

# The matcher shares the pipeline's vocabulary
matcher = Matcher(nlp.vocab)

# Two consecutive tokens with the exact texts "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

# Register the pattern under a name
matcher.add("IPHONE_X_PATTERN", None, pattern)

# Run the matcher over the doc and print the text of every matched span
matches = matcher(doc)
matched_texts = []
for match_id, start, end in matches:
    matched_texts.append(doc[start:end].text)
print("Matches:", matched_texts)
# Imported explicitly: os.environ is used below, and relying on the star
# import to provide `os` would fail with NameError if it does not export it.
import os

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

# NOTE(review): the star import stays after `nlp` is assigned, as in the
# original, because it could rebind names defined above — confirm it is
# still needed.
from shanepy import *

os.environ["EDITOR"] = "sp"

from ptpython.repl import embed

# NOTE(review): this opens an interactive REPL before the matcher code runs;
# it looks like a debugging aid — confirm it should stay.
embed(globals(), locals())

# Match sequences of tokens, based on pattern rules
matcher = Matcher(nlp.vocab)
doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses.")

# Write a pattern for adjective plus one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
#!/usr/bin/python
""" WRITING MORE COMPLEX MATCH PATTERNS """
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?")

# A token with the lemma "download" followed by a proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]

# Build the matcher, register the pattern and run it over the doc
matcher = Matcher(nlp.vocab)
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Print the text of every matched span
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print("Match found:", matched_span.text)
def identify_build_date_in_text(text):
    """Find vessel, construction and date mentions in ``text``.

    Three token patterns are applied: the word "vessels" (WATER_VESSEL), the
    word "constructed" (CONSTRUCT) and any four-digit token (DATE). A DATE is
    widened to the left over a preceding month name and, before that, an
    integer token. The word right before the (widened) date is reported too,
    as DATE_SEPARATOR for "and"/"to" and as DATE_PREFIX for
    "between"/"before"/"after".

    Returns a list of dicts with the keys 'start' and 'end' (character
    offsets into ``text``) and 'label'.
    """
    nlp = English()
    doc = nlp(text)
    matcher = Matcher(nlp.vocab)

    #
    # START - spaCy patterns
    #
    matcher.add("WATER_VESSEL", None, [{"LOWER": {"IN": ["vessels"]}}])
    matcher.add("DATE", None, [{'IS_DIGIT': True, 'LENGTH': 4}])
    matcher.add("CONSTRUCT", None, [{"LOWER": {"IN": ["constructed"]}}])
    #
    # END - spaCy patterns
    #

    month_names = ("january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november",
                   "december")
    # Label reported for the word directly in front of a date.
    context_labels = {
        "and": "DATE_SEPARATOR",
        "to": "DATE_SEPARATOR",
        "between": "DATE_PREFIX",
        "before": "DATE_PREFIX",
        "after": "DATE_PREFIX",
    }

    result = []
    for match_id, token_start, token_end in matcher(doc):
        match_id_as_string = nlp.vocab.strings[match_id]
        final_token_start = token_start
        if match_id_as_string == "DATE" and token_start > 0:
            # At this point, DATE is just a year string. Example: 2021
            # Widen it over a preceding month name ...
            if doc[token_start - 1].text.lower() in month_names:
                final_token_start = token_start - 1
                # ... and over an integer before the month. The bounds check
                # matters: a negative index would wrap to the end of the doc
                # and could pick up an unrelated token (even the year itself).
                if token_start >= 2 and is_int(doc[token_start - 2].text):
                    final_token_start = token_start - 2

            # Inspect the word in front of the date only if there is one;
            # index -1 would silently read the last token of the doc.
            if final_token_start > 0:
                previous_token = doc[final_token_start - 1]
                context_label = context_labels.get(previous_token.text)
                if context_label is not None:
                    result.append({
                        'start': previous_token.idx,
                        'end': previous_token.idx + len(previous_token.text),
                        'label': context_label
                    })

        #
        # convert token_span to char_span.
        # char_span is needed to display correctly with displacy.render().
        #
        span = doc[final_token_start:token_end]
        result.append({
            'start': span[0].idx,
            'end': span[-1].idx + len(span[-1].text),
            'label': match_id_as_string
        })
    return result
def load_movement_matcher(nlp):
    """Build a Matcher holding the movement-related pattern groups.

    Place and "girl" words are added to the vocab, three custom lexeme flags
    are set (place word, "girl" word, upper-case prefix), and four entities
    are registered with their patterns: 1 = positive, 2 = strong positive,
    3 = negative, 4 = strong negative.

    nlp: object exposing the ``vocab`` the matcher and the flags work on.

    Returns the populated Matcher.
    """
    matcher = Matcher(nlp.vocab)

    place = ['area', 'place', 'city', 'town']
    girl = [
        'gal', 'girl', 's**t', 'cutie', 'hottie', 'lady', 'teen', 'teenager',
        'chick', 'staff', 'gf', 'she'
    ]
    add_to_vocab(nlp, place)
    add_to_vocab(nlp, girl)

    is_place = FLAG18
    is_girl = FLAG19
    upper_start = FLAG20

    for lexeme in nlp.vocab:
        lowered = lexeme.lower_
        if lowered in place:
            lexeme.set_flag(is_place, True)
        if lowered in girl:
            lexeme.set_flag(is_girl, True)
        if lexeme.prefix_.isupper():
            lexeme.set_flag(upper_start, True)

    def lemmas(*words):
        # Pattern matching the given lemmas as consecutive tokens.
        return [{LEMMA: word} for word in words]

    # Positive Matcher Patterns
    positive = [
        lemmas("last", "night", "in", "town"),
        [{LEMMA: "leave"}, {IS_ASCII: True, ENT_TYPE: "DATE"}],
        [{LEMMA: "leave"}, {DEP: "partmod"}],
        [{LEMMA: "leave"}, {DEP: "quantmod"}],
        [{LEMMA: "leave"}, {IS_ASCII: True, ENT_TYPE: "TIME"}],
        [{LEMMA: "leave"}, {LEMMA: "in"}, {IS_ASCII: True, ENT_TYPE: "DATE"}],
        lemmas("leave", "town"),
        lemmas("out", "of", "town"),
        [{LOWER: "outta"}, {LEMMA: "town"}],
        lemmas("lastnight", "in", "town"),
        lemmas("back", "in", "town"),
        lemmas("just", "in", "town"),
        lemmas("day", "in", "town"),
        lemmas("in", "town", "tonight"),
        lemmas("in", "town", "through"),
        lemmas("in", "town", "until"),
        lemmas("in", "town", "for", "one", "night"),
        [{LEMMA: "in"}, {LEMMA: "town"}, {LEMMA: "for"}, {IS_DIGIT: True},
         {LEMMA: "night"}],
        [{LEMMA: "town"}, {LEMMA: "stay", DEP: "nmod"}],
        [{LEMMA: "town"}, {IS_ASCII: True}, {LEMMA: "stay", DEP: "nmod"}],
        lemmas("new", "girl", "in", "town"),
        lemmas("recent", "move"),
        lemmas("recently", "move"),
        lemmas("relocate"),
        [{LEMMA: "new", DEP: "amod"}, {LEMMA: "city"},
         {LEMMA: "to", DEP: "dep"}],
        [{LEMMA: "new", DEP: "amod"}, {IS_ASCII: True}, {LEMMA: "city"},
         {IS_ASCII: True}, {LEMMA: "to", DEP: "dep"}],
        lemmas("new", "to", "area"),
        [{LEMMA: "new"}, {LEMMA: "to"}, {upper_start: True}],
        lemmas("first", "visit", "to"),
        [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "arrive"}],
        [{LEMMA: "girl", DEP: "nsubj"}, {LEMMA: "arrive"}, {DEP: "partmod"}],
        [{LEMMA: "girl", DEP: "nsubj"}, {IS_ASCII: True}, {LEMMA: "arrive"},
         {IS_ASCII: True}, {DEP: "partmod"}],
        [{LEMMA: "girl", DEP: "nsubj"}, {LEMMA: "arrive"}, {DEP: "quantmod"}],
        [{LEMMA: "girl", DEP: "nsubj"}, {IS_ASCII: True}, {LEMMA: "arrive"},
         {IS_ASCII: True}, {DEP: "quantmod"}],
        lemmas("just", "arrive"),
        [{LEMMA: "on"}, {LEMMA: "my"}, {LEMMA: "way"}, {LEMMA: "to"},
         {TAG: "NNP"}],
        [{LEMMA: "on"}, {LEMMA: "my"}, {LEMMA: "way"}, {LEMMA: "to"},
         {TAG: "NN"}],
        lemmas("on", "the", "way"),
        lemmas("just", "get", "here"),
        lemmas("get", "here", "today"),
        lemmas("get", "here", "yesterday"),
        lemmas("get", "here", "last", "night"),
        [{LEMMA: "i", DEP: "nsubj"}, {IS_ASCII: True}, {LEMMA: "visit"},
         {IS_ASCII: True}, {is_place: True, DEP: "dobj"}],
        [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "visit"},
         {is_place: True, DEP: "dobj"}],
    ]

    # Strong Positive Matcher Patterns
    strong_positive = [
        [{LEMMA: "new"}, {IS_ASCII: True}, {LEMMA: "in"}, {is_place: True}],
        [{LEMMA: "new"}, {IS_ASCII: True}, {IS_ASCII: True}, {LEMMA: "in"},
         {is_place: True}],
        lemmas("im", "new", "in", "town"),
        [{LEMMA: "new"}, {LEMMA: "in"}, {is_place: True}],
        [{LEMMA: "new"}, {LEMMA: "to"}, {is_place: True}],
        [{LEMMA: "new"}, {is_girl: True}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "new"}, {LEMMA: "to"}, {upper_start: True}, {LEMMA: "area"}],
    ]

    # Negative Matcher Patterns
    negative = [
        lemmas("new"),
        lemmas("girl", "in", "town"),
        lemmas("grand", "new"),
        lemmas("new", "at"),
        lemmas("new", "to", "business"),
        lemmas("new", "to", "industry"),
        lemmas("new", "to", "scenario"),
        [{LEMMA: "dream", DEP: "nsubj"}, {LEMMA: "arrive"}],
        [{LEMMA: "fantasy", DEP: "nsubj"}, {LEMMA: "arrive"}],
        [{LEMMA: "you", DEP: "nsubj"}, {LEMMA: "arrive"}],
        lemmas("area", "only"),
        [{upper_start: True}, {LEMMA: "area"}],
        [{LEMMA: "you", DEP: "nsubj"}, {LEMMA: "leave"}],
        [{LEMMA: "it", DEP: "dobj"}, {LEMMA: "leave"},
         {IS_ASCII: True, DEP: "nmod", TAG: "TO"}],
        [{LEMMA: "that", DEP: "dobj"}, {LEMMA: "leave"},
         {IS_ASCII: True, DEP: "nmod", TAG: "TO"}],
        lemmas("best", "move"),
        lemmas("next", "move"),
        [{LEMMA: "arrive"}, {IS_ASCII: True}, {IS_ASCII: True, DEP: "xcomp"}],
        [{LEMMA: "arrive"}, {IS_ASCII: True, DEP: "xcomp"}],
        [{LEMMA: "visit"}, {LEMMA: "sister", DEP: "dobj"}],
        [{LEMMA: "visit"}, {IS_ASCII: True}, {LEMMA: "sister", DEP: "dobj"}],
        [{LEMMA: "visit"}, {LEMMA: "family", DEP: "dobj"}],
        [{LEMMA: "visit"}, {IS_ASCII: True}, {LEMMA: "family", DEP: "dobj"}],
        [{LEMMA: "we", DEP: "poss"}, {LEMMA: "visit"}],
    ]

    # Strong Negative Matcher Patterns
    strong_negative = [
        lemmas("town", "girl"),
        lemmas("on", "the", "town"),
        lemmas("near", "town"),
        lemmas("down", "town"),
        lemmas("town", "hall"),
        lemmas("best", "in", "town"),
        [{LEMMA: "best"}, {IS_ASCII: True}, {LEMMA: "in"}, {LEMMA: "town"}],
        [{LEMMA: "best"}, {IS_ASCII: True}, {IS_ASCII: True}, {LEMMA: "in"},
         {LEMMA: "town"}],
        [{LEMMA: "best"}, {LEMMA: "in"}, {IS_ASCII: True}, {LEMMA: "town"}],
        [{LEMMA: "best"}, {IS_ASCII: True}, {LEMMA: "in"}, {IS_ASCII: True},
         {LEMMA: "town"}],
        [{LEMMA: "best"}, {IS_ASCII: True}, {IS_ASCII: True}, {LEMMA: "in"},
         {IS_ASCII: True}, {LEMMA: "town"}],
        lemmas("not", "new", "in", "town"),
        lemmas("not", "new", "to", "town"),
        lemmas("not", "leave", "town"),
        [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "leave"},
         {LEMMA: "you", DEP: "dobj"}],
        lemmas("new", "but"),
        [{LEMMA: "new"}, {LEMMA: "backpage", DEP: "nmod", TAG: "TO"}],
        [{LEMMA: "new"}, {IS_ASCII: True},
         {LEMMA: "backpage", DEP: "nmod", TAG: "TO"}],
        [{LEMMA: "new"}, {LEMMA: "bp", DEP: "nmod", TAG: "TO"}],
        [{LEMMA: "new"}, {IS_ASCII: True},
         {LEMMA: "bp", DEP: "nmod", TAG: "TO"}],
        #DS
        [{LEMMA: "leave"}, {LEMMA: "message", DEP: "dobj"}],
        [{LEMMA: "leave"}, {LEMMA: "msg", DEP: "dobj"}],
        [{LEMMA: "leave"}, {LEMMA: "txt", DEP: "dobj"}],
        [{LEMMA: "leave"}, {LEMMA: "text", DEP: "dobj"}],
        [{LEMMA: "leave"}, {LEMMA: "impression", DEP: "dobj"}],
        [{LEMMA: "leave"}, {LEMMA: "voicemail", DEP: "dobj"}],
        [{LEMMA: "leave"}, {LEMMA: "smile", DEP: "dobj"}],
        [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "message", DEP: "dobj"}],
        [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "msg", DEP: "dobj"}],
        [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "txt", DEP: "dobj"}],
        [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "text", DEP: "dobj"}],
        [{LEMMA: "leave"}, {IS_ASCII: True},
         {LEMMA: "impression", DEP: "dobj"}],
        [{LEMMA: "leave"}, {IS_ASCII: True},
         {LEMMA: "voicemail", DEP: "dobj"}],
        [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "smile", DEP: "dobj"}],
        lemmas("leave", "satisfied"),
        [{LEMMA: "leave"}, {LEMMA: "memory", DEP: "dobj"}],
        [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "memory", DEP: "dobj"}],
        lemmas("leave", "you"),
        lemmas("leave", "u"),
        lemmas("leave", "with"),
        lemmas("leave", "a", "gentleman"),
        lemmas("or", "leave"),
        lemmas("or", "i", "leave"),
        lemmas("move", "on"),
        lemmas("i", "move", "like"),
        lemmas("arrive", "on", "time"),
        lemmas("can", "move"),
        # Registered a second time, exactly as in the original sequence.
        lemmas("new", "but"),
        [{LEMMA: "on"}, {LEMMA: "my"}, {LEMMA: "way"}, {LEMMA: "to"},
         {TAG: "PRP"}],
        lemmas("u", "get", "here"),
        lemmas("you", "get", "here"),
        lemmas("go", "to", "town"),
        lemmas("new", "management"),
    ]

    # Entities are registered in ascending id order, each followed by its
    # patterns in the order listed above.
    pattern_groups = (
        (1, positive),
        (2, strong_positive),
        (3, negative),
        (4, strong_negative),
    )
    for entity_id, patterns in pattern_groups:
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)

    return matcher
def test_issue4120(en_vocab):
    """Test that matches without a final {OP: ?} token are returned."""
    # Each scenario is (token pattern, [(document words, expected match count)]).
    # Documents of one scenario share a single Matcher instance.
    scenarios = [
        (
            [{"ORTH": "a"}, {"OP": "?"}],
            [
                (["a"], 1),  # works
                (["a", "b", "c"], 2),  # fixed
            ],
        ),
        (
            [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}],
            [(["a", "b", "b", "c"], 2)],  # works
        ),
        (
            [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}],
            [(["a", "b", "b", "c"], 3)],  # fixed
        ),
    ]
    for token_specs, documents in scenarios:
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [token_specs])
        for words, expected_count in documents:
            doc = Doc(en_vocab, words=words)
            assert len(matcher(doc)) == expected_count
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)


def set_sentiment(matcher, doc, i, matches):
    """Match callback registered for "HAPPY": add 0.1 to ``doc.sentiment``."""
    doc.sentiment += 0.1


pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
# One single-token pattern per emoji; "+" lets each pattern span repeated emoji.
pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
# ORTH compares the exact token text, so this matches "Google I/O" only
# (a lowercase "Google i/o" would need LOWER-based token specs).
matcher.add("GoogleIO", None, pattern1)
matcher.add("HAPPY", set_sentiment, *pattern2)  # Match one or more happy emoji

doc = nlp(u"A text about Google I/O 😀😀😀")
matches = matcher(doc)

# Print the pattern name and the matched text of every match.
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(string_id, span.text)
print("Sentiment", doc.sentiment)
return False if token_.is_stop: return False if len(token_.orth_) < 3: return False return True path_to_db = "/media/norpheo/mySQL/db/ssorc" path_to_annotations = os.path.join(path_to_db, "annotations_ner") pandas_path = os.path.join(path_to_db, "pandas") path_to_ner = os.path.join(path_to_db, "NER") nlp = spacy.load(os.path.join(path_to_db, "models", "en_core_web_sm_nertrained")) vocab = nlp.vocab.from_disk(os.path.join(path_to_db, "dictionaries", "spacy.vocab")) matcher = Matcher(vocab) with open(os.path.join(path_to_ner, "ml_algos.txt"), "r") as handle: ml_algos = set() ml_algos_list = list() for line in handle: algo = line.strip().lower() if algo not in ml_algos: ml_algos.add(algo) ml_algos_list.append(algo.split(" ")) for i in range(len(ml_algos_list)): for j in range(len(ml_algos_list)): if i != j: algo1 = ml_algos_list[i] algo2 = ml_algos_list[j]
def run_prdualrank(T_0, unranked_patterns, unranked_phrases, file):
    """Count phrase/pattern co-occurrences in ``file`` and rank them.

    Every chunk yielded by ``partition`` is parsed with ``nlp`` and matched
    against each pattern on its own. For a match, the span text that follows
    the pattern's leading tokens (those before the first token spec holding a
    ``'POS'`` key) is looked up among ``unranked_phrases``; a hit increments
    the (phrase, pattern) cell of the context matrix. The collected counts
    are then handed to ``prDualRank``.

    Args:
        T_0: Container of seed phrases; every phrase found in it is seeded
            with confidence ``0.0``.
        unranked_patterns: Sequence of spaCy ``Matcher`` token patterns
            (each a list of token-spec dicts).
        unranked_phrases: Sequence of phrase strings; position is the
            phrase id / matrix row.
        file: Path of the text file to scan.

    Returns:
        The 8-tuple ``(l1, l2, l3, l4, m1, m2, m3, m4)`` returned by
        ``prDualRank``.
    """
    phrase2id = {phrase: idx for idx, phrase in enumerate(unranked_phrases)}
    id2phrase = dict(enumerate(unranked_phrases))

    seedIdwConfidence = {
        idx: 0.0 for phrase, idx in phrase2id.items() if phrase in T_0
    }

    # Row credited for a matched span text. The first occurrence wins, which
    # is what ``list.index`` returns, but the lookup is O(1) per match.
    first_row_of = {}
    for idx, phrase in enumerate(unranked_phrases):
        first_row_of.setdefault(phrase, idx)

    # Number of leading token specs before the first one with a 'POS' key
    # (the whole pattern length when there is none). It depends on the
    # pattern only, so it is computed once instead of once per document.
    leading_offsets = [
        next(
            (pos for pos, token_spec in enumerate(pattern) if 'POS' in token_spec),
            len(pattern),
        )
        for pattern in unranked_patterns
    ]

    id2patterns = defaultdict(set)
    pattern2ids = defaultdict(set)
    context_matrix = np.zeros((len(unranked_phrases), len(unranked_patterns)))

    # find c (t, p)
    with open(file, 'r') as f:
        # The file stays open for the whole scan in case partition() reads
        # from it lazily.
        matcher = Matcher(nlp.vocab)
        for t in partition(f):
            doc = nlp(t)
            for i, pattern in enumerate(unranked_patterns):
                offset = leading_offsets[i]
                # Register one pattern at a time so that every match can be
                # attributed to pattern ``i``.
                matcher.add("extraction", None, pattern)
                for _match_id, start, end in matcher(doc):
                    span = doc[start + offset:end].text
                    j = first_row_of.get(span, -1)
                    if j == -1:
                        continue
                    context_matrix[j, i] += 1
                    id2patterns[j].add(i)
                    pattern2ids[i].add(j)
                matcher.remove("extraction")

    # Support of a phrase / pattern = total count in its row / column.
    # Entries that never matched keep the plain integer 0.
    id2sup = dict.fromkeys(range(len(unranked_phrases)), 0)
    pattern2sup = dict.fromkeys(range(len(unranked_patterns)), 0)
    for phrase_id in id2patterns:
        id2sup[phrase_id] = sum(context_matrix[phrase_id, :])
    for pattern_id in pattern2ids:
        pattern2sup[pattern_id] = sum(context_matrix[:, pattern_id])

    l1, l2, l3, l4, m1, m2, m3, m4 = prDualRank(seedIdwConfidence, [],
                                                id2patterns, pattern2ids,
                                                {}, {}, {}, {},
                                                id2phrase,
                                                context_matrix.tolist(),
                                                id2sup, pattern2sup,
                                                FLAGS_VERBOSE=False,
                                                FLAGS_DEBUG=False)
    return l1, l2, l3, l4, m1, m2, m3, m4
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)


def remove_overlapping_matches(matches):
    """Return ``matches`` without the ones nested inside another match.

    ``matches`` is a sequence of ``(match_id, start, end)`` triples. A match
    is dropped when its ``[start, end]`` range lies within the range of
    another match that is still kept; of two identical ranges the earlier
    one is dropped. The relative order of the survivors is preserved.
    """
    dropped = set()
    total = len(matches)
    for first in range(total - 1):
        if first in dropped:
            continue
        for second in range(first + 1, total):
            if second in dropped:
                continue
            _, first_start, first_end = matches[first]
            _, second_start, second_end = matches[second]
            if second_start <= first_start and first_end <= second_end:
                # ``first`` is covered by ``second``: nothing left to compare.
                dropped.add(first)
                break
            if first_start <= second_start and second_end <= first_end:
                dropped.add(second)
    return [
        match for index, match in enumerate(matches) if index not in dropped
    ]


def markup_timex(doc, matches):
    matches = remove_overlapping_matches(matches)
    out = ""
    prev = 0
def test_attr_pipeline_checks(en_vocab):
    """Check which Doc annotation flags each matcher attribute requires."""
    parsed_doc = Doc(en_vocab, words=["Test"])
    parsed_doc.is_parsed = True
    tagged_doc = Doc(en_vocab, words=["Test"])
    tagged_doc.is_tagged = True
    plain_doc = Doc(en_vocab, words=["Test"])

    def matcher_for(attr_name):
        # Fresh matcher with a single one-token pattern on ``attr_name``.
        built = Matcher(en_vocab)
        built.add("TEST", None, [{attr_name: "a"}])
        return built

    def expect_rejected(built, doc):
        with pytest.raises(ValueError):
            built(doc)

    # DEP requires is_parsed
    dep_matcher = matcher_for("DEP")
    dep_matcher(parsed_doc)
    expect_rejected(dep_matcher, tagged_doc)
    expect_rejected(dep_matcher, plain_doc)

    # TAG, POS, LEMMA require is_tagged
    for attr_name in ("TAG", "POS", "LEMMA"):
        tag_matcher = matcher_for(attr_name)
        tag_matcher(tagged_doc)
        expect_rejected(tag_matcher, parsed_doc)
        expect_rejected(tag_matcher, plain_doc)

    # TEXT/ORTH only require tokens
    for attr_name in ("ORTH", "TEXT"):
        text_matcher = matcher_for(attr_name)
        for doc in (parsed_doc, tagged_doc, plain_doc):
            text_matcher(doc)
#!/usr/bin/python """ WRITING MATCH PATTERNS """ # Write one pattern that only matches mentions of the # full iOS versions: “iOS 7”, “iOS 11” and “iOS 10”. import spacy from spacy.matcher import Matcher nlp = spacy.load("en_core_web_sm") matcher = Matcher(nlp.vocab) doc = nlp( "After making the iOS update you won't notice a radical system-wide " "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of " "iOS 11's furniture remains the same as in iOS 10. But you will discover " "some tweaks once you delve a little deeper.") # Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10") pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}] # Add the pattern to the matcher and apply the matcher to the doc matcher.add("IOS_VERSION_PATTERN", None, pattern) matches = matcher(doc) print("Total matches found:", len(matches)) # Iterate over the matches and print the span text for match_id, start, end in matches: print("Match found:", doc[start:end].text) import spacy from spacy.matcher import Matcher
import json
from spacy.matcher import Matcher
from spacy.lang.es import Spanish

# Load the raw texts that the training examples are generated from.
with open("exercises/es/adidas.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = Spanish()
matcher = Matcher(nlp.vocab)
# "adidas" followed by "zx", or by a token made up of digits; both patterns
# compare lowercased token text.
pattern1 = [{"LOWER": "adidas"}, {"LOWER": "zx"}]
pattern2 = [{"LOWER": "adidas"}, {"IS_DIGIT": True}]
matcher.add("ROPA", None, pattern1, pattern2)

TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of the resulting spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get the resulting (start character, end character, label) tuples
    entities = [(span.start_char, span.end_char, "ROPA") for span in spans]
    # Format the results as (doc.text, entities) tuples
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

print(*TRAINING_DATA, sep="\n")
All the reservations will be made by the wedding planner.\ For the bake sale, two dozen cookies will be baked by Susan.\ The comet was viewed by the science class.\ The video was posted on Facebook by Alex.\ Instructions will be given to you by the director.\ The Grand Canyon is viewed by thousands of tourists every year.\ The house was remodeled by the homeowners to help it sell.\ The victory will be celebrated by the team tomorrow.\ The metal beams were eventually corroded by the saltwater.\ The baby was carried by the kangaroo in her pouch.\ The last cookie was eaten by whom?" try: nlp = spacy.load('en_core_web_sm') except: nlp = spacy.load('en') matcher = Matcher(nlp.vocab) def is_passive(sentence): doc = nlp(sentence) passive_rule = [{'DEP': 'nsubjpass'}, {'DEP': 'aux', 'OP': '*'}, {'DEP': 'auxpass'}, {'TAG': 'VBN'}] matcher.add('Passive', None, passive_rule) matches = matcher(doc) if matches: return "Passive" else: return "Active" if __name__=='__main__': #nlp = spacy.load('en_core_web_sm') #matcher = Matcher(nlp.vocab) text = Passive doc = nlp(text)
import json
from spacy.matcher import Matcher
from spacy.lang.en import English

# Read the JSON as UTF-8 explicitly: without ``encoding`` the platform's
# default encoding is used, which is not UTF-8 everywhere.
with open("exercises/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = English()
matcher = Matcher(nlp.vocab)
# "iphone" followed by "x", or "iphone" optionally followed by a token made
# up of digits; both patterns compare lowercased token text.
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
matcher.add("GADGET", None, pattern1, pattern2)

TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

print(*TRAINING_DATA, sep="\n")
def test_matcher_no_zero_length(en_vocab):
    """A lone optional token that matches nothing must yield no matches."""
    words = ["a", "b"]
    tags = ["A", "B"]
    doc = Doc(en_vocab, words=words, tags=tags)
    # No token carries tag "C", so the optional token can only match emptily.
    optional_token_pattern = [{"TAG": "C", "OP": "?"}]
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [optional_token_pattern])
    found = matcher(doc)
    assert len(found) == 0
# Slack credentials come from the environment; a missing variable raises
# KeyError at import time.
slack_bot_token = os.environ['SLACK_BOT_TOKEN']
slack_client = WebClient(slack_bot_token)
slack_bot_id = os.environ['SLACK_BOT_USER_ID']

####==== SKYSCANNER ====####
sky_url = os.environ['SKYSCAN_URL']
rapid_host = os.environ['RAPID_HOST']
rapid_key = os.environ['RAPID_KEY']

####===== spaCy ====####
# Load spaCy object
nlp = spacy.load('en_core_web_sm')

# Create the token-based Matcher that the location patterns are meant for
matcher = Matcher(nlp.vocab)

# Starting location: the word "from" (any casing) followed by one or more
# tokens whose entity type is GPE
pattern_start = [
    {
        'LOWER': 'from',
    },
    {
        "ENT_TYPE": "GPE",
        "OP": "+"
    },
]

# Ending/Destination location: the word "to" (any casing) followed by one or
# more tokens whose entity type is GPE
pattern_end = [{'LOWER': 'to'}, {"ENT_TYPE": "GPE", "OP": "+"}]
def test_matcher_valid_callback(en_vocab):
    """Test that on_match can only be None or callable."""
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        # A list is neither None nor callable, so registration must fail.
        matcher.add("TEST", [[{"TEXT": "test"}]], on_match=[])
    # The matcher must remain usable after the rejected registration.
    doc = Doc(en_vocab, words=["test"])
    matcher(doc)
@author: Josh """ #Using the Matcher (1) import spacy #Import the Matcher from spacy.matcher import Matcher #Load a model and create the nlp object nlp = spacy.load('en_core_web_sm') #Initialise the matcher with the shared vocab matcher = Matcher(nlp.vocab) #Add the pattern to the matcher pattern = [{"ORTH": "iPhone"}, {"ORTH": "X"}] matcher.add('IPHONE_PATTERN', None, pattern) #Process some text doc = nlp("New iPhone X release date leaked") #Call the matcher on the doc matches = matcher(doc) #Using the Matcher (2) #Iterate over the matches for match_id, start, end in matches:
def test_attr_pipeline_checks(en_vocab):
    """Check which token annotations each matcher attribute requires."""
    dep_doc = Doc(en_vocab, words=["Test"])
    dep_doc[0].dep_ = "ROOT"
    tagged_doc = Doc(en_vocab, words=["Test"])
    tagged_doc[0].tag_ = "TAG"
    tagged_doc[0].pos_ = "X"
    tagged_doc[0].set_morph("Feat=Val")
    tagged_doc[0].lemma_ = "LEMMA"
    plain_doc = Doc(en_vocab, words=["Test"])

    def matcher_for(attr_name):
        # Fresh matcher with a single one-token pattern on ``attr_name``.
        built = Matcher(en_vocab)
        built.add("TEST", [[{attr_name: "a"}]])
        return built

    def expect_rejected(built, doc):
        with pytest.raises(ValueError):
            built(doc)

    # DEP requires DEP
    dep_matcher = matcher_for("DEP")
    dep_matcher(dep_doc)
    expect_rejected(dep_matcher, tagged_doc)
    expect_rejected(dep_matcher, plain_doc)
    # errors can be suppressed if desired
    for doc in (tagged_doc, plain_doc):
        dep_matcher(doc, allow_missing=True)

    # TAG, POS, LEMMA require those values
    for attr_name in ("TAG", "POS", "LEMMA"):
        value_matcher = matcher_for(attr_name)
        value_matcher(tagged_doc)
        expect_rejected(value_matcher, dep_doc)
        expect_rejected(value_matcher, plain_doc)

    # TEXT/ORTH only require tokens
    for attr_name in ("ORTH", "TEXT"):
        text_matcher = matcher_for(attr_name)
        for doc in (dep_doc, tagged_doc, plain_doc):
            text_matcher(doc)
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# Create the match patterns
# LOWER is compared against the lowercased token text, so its value must be
# lowercase itself: "Amazon" could never match.
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
# Every dict describes exactly one token, and the tokenizer splits "ad-free"
# at the hyphen, so the phrase has to be spelled out as three token specs.
pattern2 = [{"LOWER": "ad"}, {"TEXT": "-"}, {"LOWER": "free"}, {"POS": "NOUN"}]

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", [pattern1])
matcher.add("PATTERN2", [pattern2])

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)
def test_matcher_basic_check(en_vocab):
    """Adding a bare pattern instead of a list of patterns raises ValueError."""
    matcher = Matcher(en_vocab)
    # Potential mistake: pass in pattern instead of list of patterns
    single_pattern = [{"TEXT": word} for word in ("hello", "world")]
    with pytest.raises(ValueError):
        matcher.add("TEST", single_pattern)
def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    # Matcher is checked first, then PhraseMatcher, each on a fresh Vocab.
    for matcher_cls in (Matcher, PhraseMatcher):
        matcher = matcher_cls(Vocab())
        assert isinstance(matcher.vocab, Vocab)
def find_matches(text, keywords, countries):
    """Extract country / event / location records from ``text``.

    The text is split into sentences with ``nlp`` and each sentence is
    re-parsed and matched on its own. For every keyword a pattern of the
    form ``<country-like token> ... <keyword words> ... <preposition (TAG
    "IN")> ... <LOC entity token>`` is registered. The match callback reads
    the entities inside the matched spans and records a result once both a
    country and a location have been seen.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source. In particular, confirm that the debug ``print`` belongs to the
    "new record" branch.

    Args:
        text: Raw text to analyse.
        keywords: Iterable of event keywords; a keyword may consist of
            several space-separated words.
        countries: Iterable of country names. They are matched
            case-insensitively, as substrings of GPE/NORP entity texts.

    Returns:
        A list of unique dicts with the keys ``"country"``, ``"event"``,
        ``"location"`` and ``"sentence"``, plus ``"date"`` when a DATE
        entity was seen.
    """
    all_matches = []

    # Token-level regex: a country name followed by more word characters,
    # optionally after an apostrophe. Names are escaped because they are
    # literal text, and the tail is a raw string so that ``\w`` is not an
    # invalid string escape.
    alternation = "|".join(re.escape(c.lower()) for c in countries)
    country_reg_string = "((?:" + alternation + r")(?:\')?\w+)"

    # The patterns depend on the keywords only, so they are built once. A
    # single-word keyword is simply the one-element case of split(" ").
    # Without any country there is nothing to anchor a pattern on, so none
    # is registered.
    keyword_patterns = []
    if countries:
        for entity_i in keywords:
            token_specs = [{"NORM": {"REGEX": country_reg_string}}, {"OP": "*"}]
            token_specs.extend({"NORM": word} for word in entity_i.split(" "))
            token_specs.extend([
                {"OP": "*"},
                {"TAG": "IN"},
                {"OP": "*"},
                {"ENT_TYPE": "LOC"},
            ])
            keyword_patterns.append((entity_i, token_specs))

    for sent in nlp(text).sents:
        doc = nlp(sent.text)
        sentence_text = str(sent.text).strip()

        def on_match(matcher, doc, i, matches):
            # country/location/date deliberately carry over from one match
            # to the next within a single callback invocation.
            country = None
            location = None
            date = None
            for match_id, start, end in matches:
                keyword = nlp.vocab.strings[match_id]
                for ent in doc[start:end].ents:
                    if ent.label_ in ('GPE', 'NORP'):
                        ent_text = ent.text.lower()
                        for coun in countries:
                            if coun.lower() in ent_text:
                                country = coun
                                break
                    if ent.label_ == 'LOC':
                        # Drop a leading article: "the Alps" -> "Alps".
                        location = re.sub(r'^the ', '', ent.text.strip(),
                                          flags=re.IGNORECASE)
                    if ent.label_ == 'DATE':
                        date = ent.text.strip()
                if country is not None and location is not None:
                    value = {
                        "country": country,
                        "event": keyword,
                        "location": location,
                        "sentence": sentence_text,
                    }
                    if date is not None:
                        value['date'] = date
                    if value not in all_matches:
                        all_matches.append(value)
                        print(doc[start:end])

        matcher = Matcher(nlp.vocab)
        for entity_i, token_specs in keyword_patterns:
            matcher.add(entity_i, on_match, token_specs)
        # Results are gathered by the callback; the return value is unused.
        matcher(doc)

    return list(all_matches)
import spacy import os from flask import Flask, request, jsonify, render_template from flask_pymongo import PyMongo from pymongo import MongoClient from celery import Celery, current_app from spacy import displacy from spacy.matcher import Matcher import en_core_web_sm nlp = spacy.load('en_core_web_sm') m_tool = Matcher(nlp.vocab) p1 = [{ 'LOWER': 'bootstrap' }, { 'LOWER': 'oracle' }, { 'LOWER': 'python' }, { 'LOWER': 'mysql' }, { 'LOWER': 'django' }, { 'LOWER': 'web development' }, { 'LOWER': 'unix' }, { 'LOWER': 'sql' }, { 'LOWER': 'selenium' }, {
html_doc = soup.prettify()
print(html_doc[:15])

# Extract the text content of the HTML.
text_from_html_document = u''
for x in soup.findAll('body'):
    text_from_html_document += x.text

# Import the pattern matcher.
from spacy.matcher import Matcher

# Load the language model into the NLP object.
nlp = spacy.load('en_core_web_md')

# Initialise the pattern matcher.
matcher = Matcher(nlp.vocab)

# Add the patterns to search for: a single number-like token.
pattern = [{'LIKE_NUM': True}]
matcher.add('SimpleNumeric_PATTERN', None, pattern)

# Process the document.
doc = nlp(text_from_html_document)

# Search the document for matches.
matches = matcher(doc)

# Create an index over the body of the HTML.
matches_container = []
html_body_index = {}