def test_operator_combos(en_vocab):
    """Check that mixes of plain and "+" token specs match (or don't) as expected."""
    expectations = [
        ("aaab", "a a a b", True),
        ("aaab", "a+ b", True),
        ("aaab", "a+ a+ b", True),
        ("aaab", "a+ a+ a b", True),
        ("aaab", "a+ a+ a+ b", True),
        ("aaab", "a+ a a b", True),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaa", "a+ b", False),
        ("aaa", "a+ a+ b", False),
        ("aaa", "a+ a+ a+ b", False),
        ("aaa", "a+ a b", False),
        ("aaa", "a+ a a b", False),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaab", "a+ a b", True),
    ]

    def build_pattern(spec):
        # "x+" becomes a one-or-more spec on "x"; anything else is a literal.
        return [
            {"ORTH": piece[0], "OP": "+"} if piece.endswith("+") else {"ORTH": piece}
            for piece in spec.split()
        ]

    for text, pattern_str, should_match in expectations:
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(text))
        matcher.add("PATTERN", None, build_pattern(pattern_str))
        found = matcher(doc)
        assert bool(found) == should_match, (text, pattern_str)
def write_conllu(docs, file_):
    """Write parsed docs to ``file_`` in CoNLL-U format.

    Runs of tokens whose dependency label is "subtok" are merged first, then
    each sentence is written with ``# sent_id`` / ``# text`` headers followed
    by one line per token (from the ``get_conllu_lines`` token extension).

    docs: sequence of parsed docs; ``docs[0].vocab`` is used for the matcher.
    file_: writable text file-like object.

    Raises ValueError if a token's head lies outside its own sentence; the
    surrounding tokens are printed first to help debugging.
    """
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = merger(doc)
        # NOTE(review): `end + 1` extends each span one token past the matched
        # "subtok" run -- presumably to include the token the run attaches to.
        spans = [doc[start : end + 1] for _, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, token in enumerate(sent):
                if token.head.i > sent[-1].i or token.head.i < sent[0].i:
                    # Clamp the left context at 0: a negative slice start would
                    # wrap around to the end of the doc and print the wrong
                    # (or no) context for sentences near the beginning.
                    context_before = doc[max(0, sent[0].i - 10) : sent[0].i]
                    context_after = doc[sent[-1].i : sent[-1].i + 10]
                    for words in (context_before, sent, context_after):
                        for word in words:
                            print(word.i, word.head.i, word.text, word.dep_)
                    raise ValueError(
                        "Invalid parse: head outside sentence (%s)" % token.text
                    )
                file_.write(token._.get_conllu_lines(k) + "\n")
            file_.write("\n")
def test_issue615(en_tokenizer):
    """Merging matched phrases from an on_match callback should yield labelled entities."""

    def merge_phrases(matcher, doc, i, matches):
        """Merge every matched phrase, but only when called for the final match.

        Merging changes token indices, so all merges are deferred until the
        callback fires for the last entry in ``matches``.
        """
        is_last_match = i == len(matches) - 1
        if not is_last_match:
            return None
        phrase_spans = []
        for label, start, end in matches:
            phrase_spans.append(Span(doc, start, end, label=label))
        with doc.retokenize() as retokenizer:
            for phrase in phrase_spans:
                if phrase.label_:
                    tag = "NNP"
                else:
                    tag = phrase.root.tag_
                retokenizer.merge(phrase, attrs={"tag": tag, "lemma": phrase.text})
                doc.ents = doc.ents + (phrase,)

    label = "Sport_Equipment"
    pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
    text = "The golf club is broken"
    doc = en_tokenizer(text)
    matcher = Matcher(doc.vocab)
    matcher.add(label, merge_phrases, pattern)
    matcher(doc)
    entities = list(doc.ents)
    assert entities != []
    assert entities[0].label != 0
def test_matcher_match_zero_plus(matcher):
    """A "*" spec between two quote tokens should give exactly one match."""
    vocab = matcher.vocab
    quote_pattern = [
        {"ORTH": '"'},
        {"OP": "*", "IS_PUNCT": False},
        {"ORTH": '"'},
    ]
    tokens = 'He said , " some words " ...'.split()
    # Use a fresh matcher on the fixture's vocab rather than the fixture itself.
    quote_matcher = Matcher(vocab)
    quote_matcher.add("Quote", None, quote_pattern)
    doc = Doc(vocab, words=tokens)
    found = quote_matcher(doc)
    assert len(found) == 1
def write_conllu(docs, file_):
    """Write docs to ``file_`` as CoNLL-U, failing on any sentence without a root.

    "subtok" runs are merged first, self-headed tokens are relabelled "ROOT",
    and each sentence is written via ``_get_token_conllu``. A sentence with no
    self-headed "ROOT" token is printed for debugging and raises ValueError.
    """
    subtok_merger = Matcher(docs[0].vocab)
    subtok_merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for doc_index, doc in enumerate(docs):
        merge_spans = []
        for _, start, end in subtok_merger(doc):
            merge_spans.append(doc[start : end + 1])
        with doc.retokenize() as retokenizer:
            for merge_span in merge_spans:
                retokenizer.merge(merge_span)
        # TODO: This shouldn't be necessary? Should be handled in merge
        for token in doc:
            if token.head.i == token.i:
                token.dep_ = "ROOT"
        file_.write("# newdoc id = {i}\n".format(i=doc_index))
        for sent_index, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=doc_index, j=sent_index))
            file_.write("# text = {text}\n".format(text=sent.text))
            sent_length = len(sent)
            for position, token in enumerate(sent):
                file_.write(_get_token_conllu(token, position, sent_length) + "\n")
            file_.write("\n")
            has_root = any(w.head.i == w.i and w.dep_ == "ROOT" for w in sent)
            if has_root:
                continue
            print("Rootless sentence!")
            print(sent)
            print(doc_index)
            for w in sent:
                print(w.i, w.text, w.head.text, w.head.i, w.dep_)
            raise ValueError
class RussianTokenizer(object):
    """Pipeline component that merges matched token runs and clears sentence starts.

    Spans matching ``merge_patterns`` are merged into single tokens; tokens
    inside spans matching ``terminal_patterns`` lose their sentence-start mark.
    """

    name = 'russian_tokenizer'

    def __init__(self, nlp, merge_patterns=None, terminal_patterns=None):
        self.matcher = Matcher(nlp.vocab)
        self.token_merge = nlp.vocab.strings['pattern']
        self.sentence_terminal = nlp.vocab.strings['sentence_terminal']
        registrations = (
            (self.token_merge, merge_patterns),
            (self.sentence_terminal, terminal_patterns),
        )
        for key, patterns in registrations:
            # Empty or missing pattern lists are simply not registered.
            if patterns:
                self.matcher.add(key, None, *patterns)

    def __call__(self, doc):
        to_merge = []
        for match_key, start, end in self.matcher(doc):
            if match_key == self.token_merge:
                to_merge.append(doc[start:end])
            elif match_key == self.sentence_terminal:
                # remove all sentence start marks from span that match pattern
                for token in doc[start:end]:
                    if token.sent_start:
                        token.sent_start = False
        # Merge only after matching is complete.
        for span in to_merge:
            span.merge()
        return doc
def test_issue3555(en_vocab):
    """Custom extensions whose default is None must not break the matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    matcher.add(
        "TEST",
        None,
        [{"LEMMA": "have"}, {"_": {"issue3555": True}}],
    )
    words = ["have", "apple"]
    doc = Doc(en_vocab, words=words)
    # Passing means the call below does not raise.
    matcher(doc)
def test_issue1883():
    """A deep-copied Matcher should match the same way as the original."""
    original = Matcher(Vocab())
    original.add("pat1", None, [{"orth": "hello"}])
    original_doc = Doc(original.vocab, words=["hello"])
    assert len(original(original_doc)) == 1

    clone = copy.deepcopy(original)
    clone_doc = Doc(clone.vocab, words=["hello"])
    assert len(clone(clone_doc)) == 1
def test_matcher_operator_shadow(en_vocab):
    """A "+" spec that could also consume the final token must still allow the match."""
    matcher = Matcher(en_vocab)
    matcher.add(
        "A.C",
        None,
        [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}],
    )
    doc = Doc(matcher.vocab, words=["a", "b", "c"])
    found = matcher(doc)
    assert len(found) == 1
    first = found[0]
    assert first[1:] == (0, 3)
def test_issue_1971_2(en_vocab):
    """Two patterns under one key should each produce their own match."""
    currency_then_number = [
        {"ORTH": "EUR", "LOWER": {"IN": ["eur"]}},
        {"LIKE_NUM": True},
    ]
    number_then_currency = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, currency_then_number, number_then_currency)
    found = matcher(doc)
    assert len(found) == 2
def test_match_consuming(doc, text, pattern, re_pattern):
    """The matcher should yield as many matches as ``re.finditer`` does on the text."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    found = matcher(doc)
    expected_count = sum(1 for _ in re.finditer(re_pattern, text))
    assert len(found) == expected_count
def test_greedy_matching(doc, text, pattern, re_pattern):
    """The greedy behaviour of the * op should be consistent with ``re``.

    Each matcher (start, end) pair is compared with the corresponding regex
    span, in order.
    """
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    found = matcher(doc)
    regex_spans = [hit.span() for hit in re.finditer(re_pattern, text)]
    for found_match, regex_span in zip(found, regex_spans):
        token_span = found_match[1:]
        assert token_span == regex_span
def test_issue1945():
    """Regression test for a Matcher bug introduced in v2.0.6: overlapping matches."""
    matcher = Matcher(Vocab())
    two_a = [{"orth": "a"}, {"orth": "a"}]
    matcher.add("MWE", None, two_a)
    doc = Doc(matcher.vocab, words=["a", "a", "a"])
    found = matcher(doc)
    # "a a a" contains two overlapping "a a" matches.
    assert len(found) == 2
    first, second = found[0], found[1]
    assert first[1:] == (0, 2)
    assert second[1:] == (1, 3)
def test_matcher_compare_length(en_vocab):
    """A ``LENGTH`` spec with ``>=`` should only match tokens that are long enough."""
    matcher = Matcher(en_vocab)
    matcher.add("LENGTH_COMPARE", None, [{"LENGTH": {">=": 2}}])

    mixed_doc = Doc(en_vocab, words=["a", "aa", "aaa"])
    assert len(matcher(mixed_doc)) == 2

    short_doc = Doc(en_vocab, words=["a"])
    assert len(matcher(short_doc)) == 0
def test_matcher_regex(en_vocab):
    """A ``REGEX`` predicate on ORTH should match "a" and "an" only."""
    matcher = Matcher(en_vocab)
    matcher.add("A_OR_AN", None, [{"ORTH": {"REGEX": r"(?:a|an)"}}])

    with_articles = Doc(en_vocab, words=["an", "a", "hi"])
    assert len(matcher(with_articles)) == 2

    without_articles = Doc(en_vocab, words=["bye"])
    assert len(matcher(without_articles)) == 0
def test_matcher_set_value_operator(en_vocab):
    """An ``IN`` predicate combined with the "?" operator."""
    optional_det = {"ORTH": {"IN": ["a", "the"]}, "OP": "?"}
    house = {"ORTH": "house"}
    matcher = Matcher(en_vocab)
    matcher.add("DET_HOUSE", None, [optional_det, house])

    # Matches both with and without the optional determiner.
    with_det = Doc(en_vocab, words=["In", "a", "house"])
    assert len(matcher(with_det)) == 2

    without_det = Doc(en_vocab, words=["my", "house"])
    assert len(matcher(without_det)) == 1
def test_matcher_set_value(en_vocab):
    """An ``IN`` predicate should match any token from the listed values."""
    matcher = Matcher(en_vocab)
    matcher.add("A_OR_AN", None, [{"ORTH": {"IN": ["an", "a"]}}])

    with_articles = Doc(en_vocab, words=["an", "a", "apple"])
    assert len(matcher(with_articles)) == 2

    without_articles = Doc(en_vocab, words=["aardvark"])
    assert len(matcher(without_articles)) == 0
def test_matcher_any_token_operator(en_vocab):
    """Patterns with an "any token" spec ({}) should work with operators."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}])
    doc = Doc(en_vocab, words=["test", "hello", "world"])
    matched_texts = []
    for _, start, end in matcher(doc):
        matched_texts.append(doc[start:end].text)
    assert len(matched_texts) == 3
    first, second, third = matched_texts
    assert first == "test"
    assert second == "test hello"
    assert third == "test hello world"
def test_matcher_regex_shape(en_vocab):
    """A ``REGEX`` predicate on SHAPE should match tokens whose shape has no "x"."""
    matcher = Matcher(en_vocab)
    matcher.add("NON_ALPHA", None, [{"SHAPE": {"REGEX": r"^[^x]+$"}}])

    mixed_doc = Doc(en_vocab, words=["99", "problems", "!"])
    assert len(matcher(mixed_doc)) == 2

    alpha_doc = Doc(en_vocab, words=["bye"])
    assert len(matcher(alpha_doc)) == 0
def matcher(en_vocab):
    """Build a Matcher preloaded with the "JS", "GoogleNow" and "Java" rules."""
    rule_table = (
        ("JS", [[{"ORTH": "JavaScript"}]]),
        ("GoogleNow", [[{"ORTH": "Google"}, {"ORTH": "Now"}]]),
        ("Java", [[{"LOWER": "java"}]]),
    )
    instance = Matcher(en_vocab)
    for rule_name, rule_patterns in rule_table:
        instance.add(rule_name, None, *rule_patterns)
    return instance
def test_issue_1971_3(en_vocab):
    """Patterns on several extension attributes should each match correctly."""
    for attr_name, attr_default in (("a", 1), ("b", 2)):
        Token.set_extension(attr_name, default=attr_default, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    named_matches = []
    for m_id, start, end in matcher(doc):
        named_matches.append((en_vocab.strings[m_id], start, end))
    named_matches.sort()
    expected = sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
    assert len(named_matches) == 4
    assert named_matches == expected
def test_issue1450(string, start, end):
    """Test matcher works when patterns end with * operator.

    string: whitespace-separated tokens to build the doc from.
    start, end: expected token offsets of the last match, or None when no
        match is expected.
    """
    pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
    matcher = Matcher(Vocab())
    matcher.add("TSTEND", None, pattern)
    doc = Doc(Vocab(), words=string.split())
    matches = matcher(doc)
    if start is None or end is None:
        assert matches == []
        # Nothing more to check; indexing matches[-1] on the empty list below
        # would raise IndexError rather than report a meaningful failure.
        return
    assert matches[-1][1] == start
    assert matches[-1][2] == end
def test_issue850_basic():
    """Test Matcher matches with '*' operator and Boolean flag"""
    lower_getter = {LOWER: lambda string: string.lower()}
    vocab = Vocab(lex_attr_getters=lower_getter)
    matcher = Matcher(vocab)
    matcher.add(
        "FarAway",
        None,
        [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}],
    )
    doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
    found = matcher(doc)
    assert len(found) == 1
    _, first_start, first_end = found[0]
    assert first_start == 0
    assert first_end == 4
def test_matcher_end_zero_plus(en_vocab):
    """Test matcher works when patterns end with * operator. (issue 1450)"""
    matcher = Matcher(en_vocab)
    matcher.add("TSTEND", None, [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}])

    def count_matches(string):
        doc = Doc(matcher.vocab, words=string.split())
        return len(matcher(doc))

    assert count_matches("a") == 1
    assert count_matches("a b") == 2
    assert count_matches("a c") == 1
    assert count_matches("a b c") == 2
    assert count_matches("a b b c") == 3
    assert count_matches("a b b") == 3
def test_matcher_from_api_docs(en_vocab):
    """Exercise add / remove / contains / get / len on a Matcher."""
    rule_name = "Rule"
    pattern = [{"ORTH": "test"}]
    matcher = Matcher(en_vocab)
    assert len(matcher) == 0

    matcher.add(rule_name, None, pattern)
    assert len(matcher) == 1

    matcher.remove(rule_name)
    assert rule_name not in matcher

    matcher.add(rule_name, None, pattern)
    assert rule_name in matcher

    _, stored_patterns = matcher.get(rule_name)
    assert len(stored_patterns[0])
def test_matcher_sets_return_correct_tokens(en_vocab):
    """Each ``IN`` pattern should match exactly its own token, in document order."""
    matcher = Matcher(en_vocab)
    number_patterns = [
        [{'LOWER': {'IN': [number_word]}}]
        for number_word in ("zero", "one", "two")
    ]
    matcher.add('TEST', None, *number_patterns)
    doc = Doc(en_vocab, words="zero one two three".split())
    matched_texts = []
    for label, start, end in matcher(doc):
        matched_texts.append(Span(doc, start, end, label=label).text)
    assert matched_texts == ['zero', 'one', 'two']
def test_issue590(en_vocab):
    """Test overlapping matches"""
    words = ["n", "=", "1", ";", "a", ":", "5", "%"]
    doc = Doc(en_vocab, words=words)
    percent_pattern = [
        {"IS_ALPHA": True},
        {"ORTH": ":"},
        {"LIKE_NUM": True},
        {"ORTH": "%"},
    ]
    assignment_pattern = [{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]
    matcher = Matcher(en_vocab)
    # Both patterns are registered under the same key, in two separate calls.
    matcher.add("ab", None, percent_pattern)
    matcher.add("ab", None, assignment_pattern)
    found = matcher(doc)
    assert len(found) == 2
def test_matcher_empty_dict(en_vocab):
    """Test matcher allows empty token specs, meaning match on any token."""
    any_token = {}
    first_matcher = Matcher(en_vocab)
    doc = Doc(first_matcher.vocab, words=["a", "b", "c"])
    first_matcher.add("A.C", None, [{"ORTH": "a"}, any_token, {"ORTH": "c"}])
    found = first_matcher(doc)
    assert len(found) == 1
    assert found[0][1:] == (0, 3)

    # Same doc, fresh matcher: the wildcard as the final spec.
    second_matcher = Matcher(en_vocab)
    second_matcher.add("A.", None, [{"ORTH": "a"}, any_token])
    found = second_matcher(doc)
    assert found[0][1:] == (0, 2)
def test_matcher_extension_attribute(en_vocab):
    """A getter-backed extension attribute should be usable in a pattern."""

    def get_is_fruit(token):
        return token.text in ("apple", "banana")

    matcher = Matcher(en_vocab)
    Token.set_extension("is_fruit", getter=get_is_fruit, force=True)
    matcher.add(
        "HAVING_FRUIT",
        None,
        [{"ORTH": "an"}, {"_": {"is_fruit": True}}],
    )

    fruit_doc = Doc(en_vocab, words=["an", "apple"])
    assert len(matcher(fruit_doc)) == 1

    animal_doc = Doc(en_vocab, words=["an", "aardvark"])
    assert len(matcher(animal_doc)) == 0
def test_matcher_extension_set_membership(en_vocab):
    """An ``IN`` predicate should work on a getter-backed extension attribute."""

    def get_reversed(token):
        return "".join(reversed(token.text))

    matcher = Matcher(en_vocab)
    Token.set_extension("reversed", getter=get_reversed, force=True)
    matcher.add(
        "REVERSED",
        None,
        [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}],
    )

    greetings = Doc(en_vocab, words=["hi", "bye", "hello"])
    assert len(matcher(greetings)) == 2

    other = Doc(en_vocab, words=["aardvark"])
    assert len(matcher(other)) == 0
def rules(obj, qus, question):
    """Classify a question by combining lemma rules with part-of-speech rules.

    ``qus`` is matched against the lemma rules ("rule ...") and ``question``
    against the "question rule..." patterns. The first combination that fits
    decides the result.

    obj: sequence whose first item is returned for the "q4" case.
    qus: text the lemma rules are run on.
    question: text the question rules are run on.

    Returns a two-item list ``[question_id, payload]`` where question_id is
    one of "q1".."q5", or None when no rule combination applies.

    Raises IndexError for the "q3" case if the literal substring "cover" is
    not present in ``question``.
    """
    matcher = Matcher(nlp.vocab)
    matcher.add("rule description", None, [{"Lemma": "about"}], [{"Lemma": "detail"}])
    matcher.add("rule student_course", None, [{"Lemma": "take"}])
    matcher.add("rule topic_course", None, [{"Lemma": "cover"}])
    matcher.add("rule topic_student", None, [{"Lemma": "familiar"}, {"Lemma": "with"}])
    matcher.add("rule student_topic", None, [{"Lemma": "know"}])
    doc = nlp(qus)

    matcher_ques = Matcher(nlp.vocab)
    matcher_ques.add("question rule1", None, [{"POS": "PROPN"}, {"POS": "NUM"}])
    matcher_ques.add(
        "question rule2",
        None,
        [{"POS": "PROPN"}, {"POS": "PROPN"}, {"POS": "PROPN"}],
        [{"POS": "PROPN"}, {"POS": "PROPN"}],
    )
    matcher_ques.add("question rule3", None, [{"Lemma": "course"}, {"Lemma": "cover"}])
    doc2 = nlp(question)

    # The question-side matches do not depend on the outer loop, so resolve
    # them once instead of re-running the matcher for every lemma match.
    question_matches = [
        (nlp.vocab.strings[match_id2], doc2[start2:end2])
        for match_id2, start2, end2 in matcher_ques(doc2)
    ]

    for match_id, _start, _end in matcher(doc):
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        if string_id == "rule topic_student":
            return ["q4", obj[0]]
        for string_id2, span2 in question_matches:
            if string_id == "rule description" and string_id2 == "question rule1":
                return ["q1", span2.text]
            elif string_id == "rule student_course" and string_id2 == "question rule2":
                return ["q2", span2.text]
            elif string_id == "rule topic_course" and string_id2 == "question rule3":
                # Everything after the first literal "cover" in the question.
                return ["q3", question.split("cover", 1)[1]]
            elif string_id == "rule student_topic" and string_id2 == "question rule2":
                return ["q5", span2.text]
    return None
match_id, start, end = matches[i] # indices of matched term span = doc[start:end] # extract matched term print('span: {} | start_ind:{:5} | end_ind:{:5} | id:{}'.format( span, start, end, match_id)) # set a pattern of text to collect # find all mentions of the word fees pattern = [{'LOWER':'fees'}] # LOWER coverts words to lowercase before matching # instantiate matcher matcher = Matcher(nlp.vocab) # add pattern to the matcher (one matcher can look for many unique patterns) # provice a pattern name, function to apply to matches, pattern to identify matcher.add('fee', collect_sents, pattern) # pass the doc to the matcher to run the collect_sents function matcher(doc) # change the function to print the sentence of the matched term (span) def collect_sents(matcher, doc, i, matches): match_id, start, end = matches[i] span = doc[start:end] print('SPAN: {}'.format(span)) # span.sent provides the sentence that contains the span print('SENT: {}'.format(span.sent)) print() # update the pattern to look for any noun preceeding the term 'fees'
import spacy
from spacy.matcher import Matcher

# Load the pipeline and build a matcher on its vocabulary.
nlp = spacy.load("en")
matcher = Matcher(nlp.vocab)

# Three consecutive tokens: nominal subject, auxiliary, root.
pattern = [{"DEP": dep_label} for dep_label in ("nsubj", "aux", "ROOT")]
matcher.add("NsubjAuxRoot", None, pattern)

doc = nlp(u"We can overtake them.")
matches = matcher(doc)

# Report each matched span and its token offsets.
for match_id, start, end in matches:
    span = doc[start:end]
    print("Span: ", span.text)
    print("The positions in the doc are: ", start, "-", end)
class CrazyTokenizer(object):
    """
    Tokenizer with Reddit- and Twitter-specific options

    Parameters
    ----------
    lowercase : bool, optional
        If True, lowercase all tokens. Defaults to True.

    keepcaps: bool, optional
        If True, keep ALL CAPS WORDS uppercased. Defaults to False.

    normalize: int or bool, optional
        If not False, perform normalization of repeated characters
        ("awesoooooome" -> "awesooome"). The value of parameter
        determines the number of occurrences to keep. Defaults to 3.

    ignore_quotes: bool, optional
        If True, ignore tokens contained within double quotes.
        Defaults to False.

    ignore_reddit_quotes: bool, optional
        If True, remove quotes from the Reddit comments. Defaults to False.

    ignore_stopwords: str, list, or boolean, optional
        Whether to ignore stopwords

        - str: language to get a list of stopwords for from NLTK package
        - list: list of stopwords to remove
        - True: use built-in list of the english stop words
        - False: keep all tokens

        Defaults to False

    stem: {False, 'stem', 'lemm'}, optional
        Whether to perform word stemming

        - False: do not perform word stemming
        - 'stem': use PorterStemmer from NLTK package
        - 'lemm': use WordNetLemmatizer from NLTK package

    remove_punct: bool, optional
        If True, remove punctuation tokens. Defaults to True.

    remove_breaks: bool, optional
        If True, remove linebreak tokens. Defaults to True.

    decontract: bool, optional
        If True, attempt to expand certain contractions. Defaults to False.
        Example: "'ll" -> " will"

    numbers, subreddits, reddit_usernames, emails: False or str, optional
        Replacement of the different types of tokens

        - False: leaves these tokens intact
        - str: replacement token
        - '': removes all occurrences of these tokens

    twitter_handles: False, 'realname' or str, optional
        Processing of twitter handles

        - False: do nothing
        - str: replacement token
        - 'realname': replace with the real screen name of Twitter account
        - 'split': split handles using Viterbi algorithm

        Example: "#vladimirputinisthebest" -> "vladimir putin is the best"

    hashtags: False or str, optional
        Processing of hashtags

        - False: do nothing
        - str: replacement token
        - 'split': split hashtags according using Viterbi algorithm

    urls: False or str, optional
        Replacement of parsed URLs

        - False: leave URL intact
        - str: replacement token
        - dict: replace all URLs stored in keys with the corresponding values
        - '': removes all occurrences of these tokens
        - 'domain': extract domain ("http://cnn.com" -> "cnn")
        - 'domain_unwrap_fast': extract domain after unwraping links
          for a list of URL shorteners (goo.gl, t.co, bit.ly, tinyurl.com)
        - 'domain_unwrap': extract domain after unwraping all links
        - 'title': extract and tokenize title of each link after unwraping it

        Defaults to False.

    extra_patterns: None or list of tuples, optional
        Replacement of any user-supplied extra patterns.
        Tuples must have the following form: (name, re_pattern, replacement_token):

        - name (str): name of the pattern
        - re_pattern (_sre.SRE_Pattern): compiled re pattern
        - replacement_token (str): replacement token

        Defaults to None

    keep_untokenized: None or list, optional
        List of expressions to keep untokenized

        Example: ["New York", "Los Angeles", "San Francisco"]

    whitespaces_to_underscores: boolean, optional
        If True, replace all whitespace characters with
        underscores in the final tokens. Defaults to True.

    remove_nonunicode: boolean, optional
        If True, remove all non-unicode characters. Defaults to False.

    pos_emojis, neg_emojis, neutral_emojis: None, True, or list, optional
        Replace positive, negative, and neutral emojis with the special tokens

        - None: do not perform replacement
        - True: perform replacement of the default lists of emojis
        - list: list of emojis to replace

    print_url_warnings: bool, optional
        If True, print URL-related warnings. Defaults to False.

    latin_chars_fix: bool, optional
        Try applying this fix if you have a lot of \\xe2\\x80\\x99-like
        or U+1F601-like strings in your data. Defaults to False.

    ngrams: int, optional
        Add ngrams of tokens after tokenizing
    """

    def __init__(self,
                 lowercase=True,
                 keepcaps=False,
                 normalize=3,
                 ignore_quotes=False,
                 ignore_reddit_quotes=False,
                 ignore_stopwords=False,
                 stem=False,
                 remove_punct=True,
                 remove_breaks=True,
                 decontract=False,
                 twitter_handles=False,
                 urls=False,
                 hashtags=False,
                 numbers=False,
                 subreddits=False,
                 reddit_usernames=False,
                 emails=False,
                 extra_patterns=None,
                 keep_untokenized=None,
                 whitespaces_to_underscores=True,
                 remove_nonunicode=False,
                 pos_emojis=None,
                 neg_emojis=None,
                 neutral_emojis=None,
                 print_url_warnings=False,
                 latin_chars_fix=False,
                 ngrams=1):
        # Must stay the first statement: captures the constructor arguments
        # (plus `self`) so the other methods can look options up by name.
        self.params = locals()

        self._nlp = English()
        # Two matchers: one only collects spans to merge into single tokens,
        # the other runs the per-token callbacks on the merged doc.
        self._merging_matcher = Matcher(self._nlp.vocab)
        self._matcher = Matcher(self._nlp.vocab)

        self._replacements = {}
        self._domains = {}
        self._realnames = {}
        self._stopwords = None

        alpha_digits_flag = self._nlp.vocab.add_flag(alpha_digits_check)
        hashtag_flag = self._nlp.vocab.add_flag(hashtag_check)
        twitter_handle_flag = self._nlp.vocab.add_flag(twitter_handle_check)

        self._merging_matcher.add('HASHTAG', None, [{
            'ORTH': '#'
        }, {
            'IS_ASCII': True
        }])
        self._merging_matcher.add('SUBREDDIT', None, [{
            'ORTH': '/r'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }], [{
            'ORTH': 'r'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }])
        self._merging_matcher.add('REDDIT_USERNAME', None, [{
            'ORTH': '/u'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }], [{
            'ORTH': 'u'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }])

        if isinstance(ignore_stopwords, str) and ('nltk' in sys.modules):
            try:
                self._stopwords = stopwords.words(ignore_stopwords)
            except OSError:
                raise ValueError('Language {} was not found by NLTK'.format(
                    ignore_stopwords))
        elif ignore_stopwords is True:
            self._matcher.add('STOPWORDS', self._remove_token, [{
                'IS_STOP': True
            }])
        elif isinstance(ignore_stopwords, list):
            self._stopwords = [word.lower() for word in ignore_stopwords]
        elif ignore_stopwords is not False:
            raise TypeError(
                'Type {} is not supported by ignore_stopwords parameter or NLTK is not installed'
                .format(type(ignore_stopwords)))

        if lowercase and (not keepcaps):
            self._matcher.add('LOWERCASE', self._lowercase, [{
                'IS_LOWER': False
            }])
        elif lowercase and keepcaps:
            self._matcher.add('LOWERCASE', self._lowercase, [{
                'IS_LOWER': False,
                'IS_UPPER': False
            }])

        if remove_punct:
            self._matcher.add('PUNCTUATION', self._remove_token, [{
                'IS_PUNCT': True
            }])

        if remove_breaks:

            def break_check(text):
                return bool(BREAKS_RE.fullmatch(text))

            break_flag = self._nlp.vocab.add_flag(break_check)
            self._matcher.add('BREAK', self._remove_token, [{
                break_flag: True
            }])

        if normalize:

            def normalize_check(text):
                return bool(NORMALIZE_RE.search(text))

            normalize_flag = self._nlp.vocab.add_flag(normalize_check)
            self._matcher.add('NORMALIZE', self._normalize, [{
                normalize_flag: True
            }])

        if numbers is not False:
            self._matcher.add('NUMBER', self._replace_token, [{
                'LIKE_NUM': True
            }])
            self._replacements['NUMBER'] = numbers

        if urls is not False:
            if urls in [
                    'domain', 'domain_unwrap_fast', 'domain_unwrap', 'title'
            ]:
                self._urls = urls
                self._matcher.add('URL', self._process_url, [{
                    'LIKE_URL': True
                }])
            elif isinstance(urls, dict):
                # A dict acts as a pre-filled URL -> replacement cache.
                self._domains = urls
                self._urls = 'domain_unwrap_fast'
                self._matcher.add('URL', self._process_url, [{
                    'LIKE_URL': True
                }])
            else:
                self._matcher.add('URL', self._replace_token, [{
                    'LIKE_URL': True
                }])
                self._replacements['URL'] = urls

        if emails is not False:
            self._matcher.add('EMAIL', self._replace_token, [{
                'LIKE_EMAIL': True
            }])
            self._replacements['EMAIL'] = emails

        if reddit_usernames is not False:

            def reddit_username_check(text):
                return bool(REDDITORS_RE.fullmatch(text))

            reddit_username_flag = self._nlp.vocab.add_flag(
                reddit_username_check)
            self._matcher.add('REDDIT_USERNAME', self._replace_token, [{
                reddit_username_flag: True
            }])
            self._replacements['REDDIT_USERNAME'] = reddit_usernames

        if subreddits is not False:

            def subreddit_check(text):
                return bool(SUBREDDITS_RE.fullmatch(text))

            subreddit_flag = self._nlp.vocab.add_flag(subreddit_check)
            self._matcher.add('SUBREDDIT', self._replace_token, [{
                subreddit_flag: True
            }])
            self._replacements['SUBREDDIT'] = subreddits

        if twitter_handles is not False:
            self._matcher.add('TWITTER_HANDLE', self._handles_postprocess,
                              [{
                                  twitter_handle_flag: True
                              }])

        if hashtags is not False:
            self._matcher.add('HASHTAG', self._hashtag_postprocess, [{
                hashtag_flag: True
            }])

        if hashtags == 'split' or twitter_handles == 'split':
            # Word list used by _infer_spaces; a word's cost grows with its
            # position in the file.
            words_path = os.path.join(DATA_PATH, 'wordsfreq_wiki2.txt')
            with open(words_path) as f:
                self._words = f.read().split()
            self._wordcost = dict((k, log((i + 1) * log(len(self._words))))
                                  for i, k in enumerate(self._words))
            self._maxword = max(len(x) for x in self._words)

        if twitter_handles == 'realname':
            with open(os.path.join(DATA_PATH, 'realnames.json')) as f:
                self._realnames = json.load(f)

        if ignore_quotes:
            self._merging_matcher.add('QUOTE', None, [{
                'ORTH': '"'
            }, {
                'OP': '*',
                'IS_ASCII': True
            }, {
                'ORTH': '"'
            }])

            def doublequote_check(text):
                return bool(QUOTES_RE.fullmatch(text))

            doublequote_flag = self._nlp.vocab.add_flag(doublequote_check)
            self._matcher.add('DOUBLE_QUOTES', self._remove_token, [{
                doublequote_flag: True
            }])

        if self._stopwords:

            def stopword_check(text):
                return bool(text.lower() in self._stopwords)

            stopword_flag = self._nlp.vocab.add_flag(stopword_check)
            self._matcher.add('STOPWORD', self._remove_token, [{
                stopword_flag: True
            }])

        if keep_untokenized is not None:
            if not isinstance(keep_untokenized, list):
                raise ValueError(
                    "keep_untokenized has to be either None or a list")
            for i, phrase in enumerate(keep_untokenized):
                rule = [{'LOWER': token.lower()}
                        for token in phrase.split(' ')]
                self._merging_matcher.add('RULE_' + str(i), None, rule)

        if pos_emojis:
            if not isinstance(pos_emojis, list):
                pos_emojis = POS_EMOJIS
            pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emojis]
            self._matcher.add('HAPPY', self._replace_token, *pos_patterns)
            self._replacements['HAPPY'] = 'POS_EMOJI'

        if neg_emojis:
            if not isinstance(neg_emojis, list):
                neg_emojis = NEG_EMOJIS
            neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emojis]
            self._matcher.add('SAD', self._replace_token, *neg_patterns)
            self._replacements['SAD'] = 'NEG_EMOJI'

        if neutral_emojis:
            if not isinstance(neutral_emojis, list):
                neutral_emojis = NEUTRAL_EMOJIS
            neutral_patterns = [[{'ORTH': emoji}] for emoji in neutral_emojis]
            self._matcher.add('NEUTRAL', self._replace_token,
                              *neutral_patterns)
            self._replacements['NEUTRAL'] = 'NEUTRAL_EMOJI'

        if isinstance(extra_patterns, list):
            self._flags = {}
            for name, re_pattern, replacement_token in extra_patterns:

                # Bind the pattern as a default argument: the vocab keeps the
                # getter and calls it later for new words, by which time the
                # loop variable would point at the LAST pattern in the list.
                def flag(text, re_pattern=re_pattern):
                    return bool(re_pattern.match(text))

                self._flags[name] = self._nlp.vocab.add_flag(flag)
                self._matcher.add(name, self._replace_token, [{
                    self._flags[name]: True
                }])
                self._replacements[name] = replacement_token

        if stem and ('nltk' in sys.modules):
            if stem == 'stem':
                self._stemmer = PorterStemmer()
            elif stem == 'lemm':
                self._stemmer = WordNetLemmatizer()
            else:
                raise ValueError(
                    'Stemming method {} is not supported'.format(stem))
            self._matcher.add('WORD_TO_STEM', self._stem_word, [{
                'IS_ALPHA': True
            }])

        retokenize_flag = self._nlp.vocab.add_flag(retokenize_check)
        self._matcher.add('RETOKENIZE', self._retokenize, [{
            retokenize_flag: True,
            'IS_PUNCT': False,
            'LIKE_URL': False,
            'LIKE_EMAIL': False,
            'LIKE_NUM': False,
            hashtag_flag: False,
            twitter_handle_flag: False
        }])

        # Pipeline order matters: merge spans, run callbacks, collect tokens.
        self._nlp.add_pipe(self._merge_doc, name='merge_doc', last=True)
        self._nlp.add_pipe(self._match_doc, name='match_doc', last=True)
        self._nlp.add_pipe(self._postproc_doc, name='postproc_doc', last=True)

    @staticmethod
    def _lowercase(__, doc, i, matches):
        # Lowercase tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = tok._.transformed_text.lower()

    def _stem_word(self, __, doc, i, matches):
        # Stem tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['stem'] == 'stem':
                tok._.transformed_text = self._stemmer.stem(
                    tok._.transformed_text)
            elif self.params['stem'] == 'lemm':
                tok._.transformed_text = self._stemmer.lemmatize(
                    tok._.transformed_text)

    def _normalize(self, __, doc, i, matches):
        # Normalize repeating symbols
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = NORMALIZE_RE.sub(
                r"\1" * self.params['normalize'], tok._.transformed_text)

    def _process_url(self, __, doc, i, matches):
        # Process found URLs; results are cached in self._domains per URL
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            found_urls = URLS_RE.findall(tok.text)
            if found_urls:
                if found_urls[0] in self._domains:
                    tok._.transformed_text = self._domains[found_urls[0]]
                elif self._urls == 'domain':
                    tok._.transformed_text = tldextract.extract(
                        found_urls[0]).domain
                elif self._urls != 'title':
                    if self._urls == 'domain_unwrap':
                        domain = unshorten_url(
                            found_urls[0], None,
                            self.params['print_url_warnings'])
                    else:
                        domain = unshorten_url(
                            found_urls[0], URL_SHORTENERS,
                            self.params['print_url_warnings'])
                    self._domains[found_urls[0]] = domain
                    tok._.transformed_text = domain
                elif self._urls == 'title':
                    domain = unshorten_url(found_urls[0], URL_SHORTENERS)
                    if domain != 'twitter':
                        title = get_url_title(
                            found_urls[0], self.params['print_url_warnings'])
                        title = self.tokenize(URLS_RE.sub('', title))
                    else:
                        title = ''
                    tok._.transformed_text = title
                    self._domains[found_urls[0]] = title

    def _replace_token(self, __, doc, i, matches):
        # Replace tokens with the replacement registered for the match key
        match_id, start, end = matches[i]
        span = doc[start:end]
        replacement_token = self._replacements[doc.vocab.strings[match_id]]
        for tok in span:
            tok._.transformed_text = replacement_token

    @staticmethod
    def _remove_token(__, doc, i, matches):
        # Remove tokens (empty text is dropped in _postproc_doc)
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = ''

    def _retokenize(self, __, doc, i, matches):
        # Retokenize: put a space before every '#'/'@' and tokenize again
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            text = tok.text
            text = re.sub(r'([#@])', r' \1', text)
            text = re.sub(r'\s{2,}', ' ', text).strip()
            tok._.transformed_text = self.tokenize(text)

    def _infer_spaces(self, text):
        # Infer location of spaces in hashtags.
        # Returns the list of words with the lowest total cost according to
        # self._wordcost; unknown substrings cost infinity.
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)

        def best_match(i):
            # Find the best match for the first i characters
            # assuming costs has been built for the first (i-1) characters
            candidates = enumerate(reversed(cost[max(0, i - self._maxword):i]))
            return min(
                (c + self._wordcost.get(text[i - k - 1:i], 9e999), k + 1)
                for k, c in candidates)

        cost = [0]
        for i in range(1, len(text) + 1):
            cur_cost, k = best_match(i)
            cost.append(cur_cost)

        # Walk back from the end, recovering the word chosen at each position.
        out = []
        i = len(text)
        while i > 0:
            cur_cost, k = best_match(i)
            assert cur_cost == cost[i]
            out.append(text[i - k:i])
            i -= k

        return list(reversed(out))

    def _handles_postprocess(self, __, doc, i, matches):
        # Process twitter handles
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['twitter_handles'] == 'realname':
                if tok.text in self._realnames:
                    tok._.transformed_text = self._realnames[tok.text]
                else:
                    handle = get_twitter_realname(tok.text)
                    realname = self.tokenize(TWITTER_HANDLES_RE.sub(
                        '', handle))
                    tok._.transformed_text = realname
                    self._realnames[tok.text] = realname
            elif self.params['twitter_handles'] == 'split':
                # [1:] drops the leading marker character before splitting
                poss = self._infer_spaces(tok._.transformed_text[1:])
                if poss:
                    tok._.transformed_text = poss
            else:
                tok._.transformed_text = self.params['twitter_handles']

    def _hashtag_postprocess(self, __, doc, i, matches):
        # Process hashtags
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['hashtags'] == 'split':
                # [1:] drops the leading marker character before splitting
                poss = self._infer_spaces(tok._.transformed_text[1:])
                if poss:
                    tok._.transformed_text = poss
            else:
                tok._.transformed_text = self.params['hashtags']

    @staticmethod
    def _decontract(text):
        # Expand contractions
        for contraction, decontraction in DECONTRACTIONS.items():
            text = re.sub(contraction, decontraction, text)
        return text

    def _preprocess_text(self, text):
        # Do some preprocessing on the raw string before it reaches spaCy
        text = re.sub("’", "'", text)
        if self.params['remove_nonunicode']:
            try:
                text = text.encode('utf-8').decode('unicode-escape')
                text = ''.join(filter(lambda x: x in string.printable,
                                      text)).strip()
            except UnicodeDecodeError:
                warnings.warn(
                    '(UnicodeDecodeError while trying to remove non-unicode characters'
                )
        if self.params['decontract']:
            text = self._decontract(text)
        text = html.unescape(text)

        if self.params['latin_chars_fix']:
            if EMOJIS_UTF_RE.findall(text):
                text = EMOJIS_UTF_NOSPACE_RE.sub(r' \1', text)
                for utf_code, emoji in EMOJIS_UTF.items():
                    text = EMOJIS_UTF_PATS[utf_code].sub(emoji, text)

            if EMOJIS_UNICODE_RE.findall(text):
                text = EMOJIS_UNICODE_NOSPACE_RE.sub(r'\1 \2', text)
                for utf_code, emoji in EMOJIS_UNICODE.items():
                    text = EMOJIS_UNICODE_PATS[utf_code].sub(emoji, text)

            if LATIN_CHARS_RE.findall(text):
                for _hex, _char in LATIN_CHARS.items():
                    text = LATIN_CHARS_PATS[_hex].sub(_char, text)

        if self.params['ignore_reddit_quotes']:
            # Pattern.sub takes (repl, string). The arguments used to be
            # swapped, which replaced the whole document with ' '.
            text = REDDIT_QUOTES_RE.sub(' ', text)

        text = text.replace('.@', '. @')
        # Detach listed punctuation from the preceding word, then collapse
        # runs of whitespace.
        text = re.sub(r'([*;,!?\(\)\[\]])', r' \1', text)
        text = re.sub(r'\s{2,}', ' ', text)

        return text.strip()

    def _merge_doc(self, doc):
        # Perform merging for certain types of tokens
        matches = self._merging_matcher(doc)
        spans = [doc[start:end] for __, start, end in matches]
        for span in spans:
            span.merge()
        # Every token starts out with its own text as the transformed text.
        for tok in doc:
            tok._.transformed_text = tok.text

        return doc

    def _match_doc(self, doc):
        # Perform all additional processing
        self._matcher(doc)
        return doc

    def _postproc_doc(self, doc):
        # Perform postprocessing: collect the final token list on doc._.tokens
        doc._.tokens = []
        for tok in doc:
            if isinstance(tok._.transformed_text, list):
                doc._.tokens.extend(tok._.transformed_text)
            elif tok._.transformed_text.strip() != '':
                if self.params['whitespaces_to_underscores']:
                    tok._.transformed_text = "_".join(
                        tok._.transformed_text.split())
                doc._.tokens.append(tok._.transformed_text.strip())
        return doc

    def tokenize(self, text):
        """
        Tokenize document

        Parameters
        ----------
        text : str
            Document to tokenize

        Returns
        -------
        list
            List of tokens. An empty list (with a warning) is returned
            when ``text`` is not a string.

        Examples
        --------
        >>> from redditscore.tokenizer import CrazyTokenizer
        >>> tokenizer = CrazyTokenizer(hashtags='split')
        >>> tokenizer.tokenize("#makeamericagreatagain")
        ["make", "america", "great", "again"]
        """
        if not isinstance(text, str):
            warnings.warn('Document {} is not a string'.format(text))
            return []
        text = self._preprocess_text(text)
        doc = self._nlp(text)
        tokens = doc._.tokens
        if self.params['ngrams'] > 1:
            if self.params['whitespaces_to_underscores']:
                tokens = word_ngrams(tokens, (1, self.params['ngrams']),
                                     separator='_')
            else:
                tokens = word_ngrams(tokens, (1, self.params['ngrams']))
        return tokens
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

# A form of the lemma "download" immediately followed by a proper noun.
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]

# Register the rule (no on_match callback) before parsing the sample text.
matcher = Matcher(nlp.vocab)
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

matches = matcher(doc)
print("Total matches found:", len(matches))

# Each match is (match_id, start_token, end_token); show the matched text.
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
class PatternMatcher:
    """Find comparative constructions in POS-tag sequences.

    A sentence is POS-tagged, the tags of the technology names are replaced
    by the pseudo-tag ``TECH``, and the resulting tag sequence is run through
    a spaCy ``Matcher`` whose rules match on the tag text itself (``ORTH``).

    Attributes:
        count: number of hits per rule id, keyed by the id as a string
            ("0" .. "10").
        compa_sent_count: number of sentences with at least one hit.
    """

    def __init__(self):
        self.count = {str(rule_id): 0 for rule_id in range(11)}
        self.compa_sent_count = 0
        self.nlp = spacy.load('en')
        self.matcher = Matcher(self.nlp.vocab)

        # Rule sets that are currently disabled, kept for reference.
        # "_" marks the positions where a variant with one arbitrary extra
        # token ({}) was also registered:
        #    0: JJR _ CIN _ TECH
        #    1: RB JJ _ CIN TECH
        #    8: RBR JJ _ CIN TECH
        #    2: CV _ CIN TECH
        #    3: CV VBG TECH
        #    4: CV TECH
        #    6: TECH _ VBZ _ JJS
        #   10: TECH _ VBZ _ RBR
        #    9: TECH _ VBZ _ RBS

        # 2: VB VBN [any] TECH
        self.matcher.add(
            2, None,
            [{'ORTH': 'VB'}, {'ORTH': 'VBN'}, {'ORTH': 'TECH'}],
            [{'ORTH': 'VB'}, {'ORTH': 'VBN'}, {}, {'ORTH': 'TECH'}])

        # 0: TECH [any] VBZ [any] JJR
        self.matcher.add(
            0, None,
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJR'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJR'}])

        # 1: TECH [any] VBZ [any] JJ
        self.matcher.add(
            1, None,
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJ'}],
            [{'ORTH': 'TECH'}, {}, {'ORTH': 'VBZ'}, {}, {'ORTH': 'JJ'}])

    def add_pos_tag(self, words, tech_pair):
        """Return the POS tag of every word, with tech names tagged ``TECH``.

        Args:
            words: tokens of one sentence.
            tech_pair: tab-separated technology names.

        Returns:
            A list of tags, one per tagged word. Tagging is delegated to a
            CoreNLP server at ``http://localhost:9000``.
        """
        tagged_words = CoreNLPParser(url='http://localhost:9000',
                                     tagtype='pos').tag(words)
        techs = tech_pair.split("\t")  # split once, not once per word
        return ["TECH" if word in techs else tag
                for (word, tag) in tagged_words]

    def match_pattern(self, pre, words, post, current_id, tech_pair):
        """Match the sentence's tag sequence and log any hits to disk.

        When at least one rule matches, the sentence is appended to
        ``../outnew/pattern/sentences.txt`` and the matched rule ids to
        ``../outnew/pattern/output.txt``; ``self.count`` and
        ``self.compa_sent_count`` are updated. ``pre`` and ``post`` are
        accepted but not used.
        """
        tag_list = self.add_pos_tag(words, tech_pair)
        patterns = self.matcher(self.nlp(u'{}'.format(" ".join(tag_list))))
        if not patterns:
            return

        self.compa_sent_count += 1
        print("yes")
        out_dir = os.path.join(os.pardir, "outnew", "pattern")
        sentence = " ".join(words)

        with open(os.path.join(out_dir, "sentences.txt"), "a") as out_file:
            # NOTE(review): the id is written twice and a "Pattern(s): "
            # label precedes the sentence here - kept as-is so the file
            # format does not change; confirm whether this is intended.
            out_file.write("{}\n".format(current_id))
            out_file.write("{}\n".format(current_id))
            out_file.write("{}\nPattern(s): ".format(tech_pair))
            out_file.write(sentence)
            out_file.write("\n")

        with open(os.path.join(out_dir, "output.txt"), "a") as data:
            data.write("{}\n".format(current_id))
            data.write("{}\nPattern(s): ".format(tech_pair))
            for pattern in patterns:
                # pattern[0] is the id of the rule that matched.
                rule_id = str(pattern[0])
                self.count[rule_id] += 1
                data.write(rule_id + "\t")
            data.write("\n")
            data.write(sentence)
            data.write("\n\n\n")
matcher = Matcher(nlp.vocab)

text = Active
doc = nlp(text)

# Dump every sentence followed by a per-token table of
# text / coarse POS / fine tag / dependency / tag explanation.
sents = list(doc.sents)
print("Number of Sentences = ", len(sents))
for sent in doc.sents:
    print(sent)
    for i in sent:
        # print(token.dep_,token.tag_, end = " ")
        print(
            f' {i.text:{10}} {i.pos_:{8}} {i.tag_:{6}} {i.dep_:{10}} {spacy.explain(i.tag_)}'
        )
    # Blank line, rule, blank line between sentences.
    print(
        " ",
        "-----------------------------------------------------------------------------------------",
        " ",
        sep="\n")

# Passive voice: passive subject, any number of auxiliaries, a passive
# auxiliary, then a past participle.
passive_rule = [
    {'DEP': 'nsubjpass'},
    {'DEP': 'aux', 'OP': '*'},
    {'DEP': 'auxpass'},
    {'TAG': 'VBN'},
]
matcher.add('Passive', None, passive_rule)

matches = matcher(doc)
print(len(matches))
class hearst_patterns(object):
    """Detect hypernym/hyponym relations in text using Hearst patterns.

    Each Hearst pattern is expressed as a spaCy ``Matcher`` token pattern.
    ``find_hyponyms`` takes a parsed ``Doc`` and returns a list of
    ``(hyponym_lemma, hypernym_lemma, predicate)`` tuples, where
    ``predicate`` is the label of the pattern that matched.
    """

    def __init__(self, nlp, extended=False, predicatematch="basic"):
        """Build the pattern table and register it with a ``Matcher``.

        Args:
            nlp: the spaCy pipeline whose vocab the matcher is built on.
            extended: also register the extended pattern set.
            predicatematch: ``"bronze"``, ``"silver"`` or ``"gold"`` - the
                strategy used to strip predicate words from noun chunks.
                Any other value (including the default) makes the
                constructor prompt on stdin until 1, 2 or 3 is entered.
        """
        # Each pattern entry has the form
        # {
        #    "label"   : predicate,
        #    "pattern" : spaCy token pattern,
        #    "posn"    : "first"/"last" - whether the hypernym comes before
        #                or after its hyponym
        # }
        # The quoted regex above each pattern is the original Hearst regex
        # the token pattern was adapted from.
        self.nlp = nlp

        options = ["bronze", "silver", "gold"]
        if predicatematch not in options:
            entry = ""
            while entry not in ["1", "2", "3"]:
                entry = input(f"1. {options[0]}, 2. {options[1]}, 3. {options[2]}")
            self.predicatematch = options[int(entry) - 1]
        else:
            self.predicatematch = predicatematch

        hypernym = {"POS": {"IN": ["NOUN", "PROPN"]}}
        hyponym = {"POS": {"IN": ["NOUN", "PROPN"]}}
        punct = {"IS_PUNCT": True, "OP": "?"}

        def lem(word):
            # Token spec matching a single lemma.
            return {"LEMMA": word}

        self.patterns = [
            # '(NP_\\w+ (, )?such as (NP_\\w+ ?(, )?(and |or )?)+)'
            {"label": "such_as",
             "pattern": [hypernym, punct, lem("such"), lem("as"), hyponym],
             "posn": "first"},
            # '(NP_\\w+ (, )?know as (NP_\\w+ ?(, )?(and |or )?)+)'
            # (added for this experiment)
            {"label": "know_as",
             "pattern": [hypernym, punct, lem("know"), lem("as"), hyponym],
             "posn": "first"},
            # '(such NP_\\w+ (, )?as (NP_\\w+ ?(, )?(and |or )?)+)'
            {"label": "such",
             "pattern": [lem("such"), hypernym, punct, lem("as"), hyponym],
             "posn": "first"},
            # '(NP_\\w+ (, )?include (NP_\\w+ ?(, )?(and |or )?)+)'
            {"label": "include",
             "pattern": [hypernym, punct, lem("include"), hyponym],
             "posn": "first"},
            # '(NP_\\w+ (, )?especially (NP_\\w+ ?(, )?(and |or )?)+)'
            # problem - "especially" is merged as a modifier into a noun phrase
            {"label": "especially",
             "pattern": [hypernym, punct, lem("especially"), hyponym],
             "posn": "first"},
            # '((NP_\\w+ ?(, )?)+(and |or )?other NP_\\w+)'
            # problem: the noun_chunk 'others' clashes with this rule to
            # create a zero length chunk when the predicate is removed, e.g.
            # "There were bruises, lacerations, or other injuries were not
            # prevalent."
            {"label": "other",
             "pattern": [hyponym, punct, {"LEMMA": {"IN": ["and", "or"]}},
                         lem("other"), hypernym],
             "posn": "last"},
        ]

        if extended:
            self.patterns.extend([
                # '(NP_\\w+ (, )?which may include (NP_\\w+ ?(, )?(and |or )?)+)'
                {"label": "which_may_include",
                 "pattern": [hypernym, punct, lem("which"), lem("may"),
                             lem("include"), hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?which be similar to (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "which_be_similar_to",
                 "pattern": [hypernym, punct, lem("which"), lem("be"),
                             lem("similar"), lem("to"), hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?example of this be (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "example_of_this_be",
                 "pattern": [hypernym, punct, lem("example"), lem("of"),
                             lem("this"), lem("be"), hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?type (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": ",type",
                 "pattern": [hypernym, punct, lem("type"), punct, hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?mainly (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "mainly",
                 "pattern": [hypernym, punct, lem("mainly"), hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?mostly (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "mostly",
                 "pattern": [hypernym, punct, lem("mostly"), hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?notably (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "notably",
                 "pattern": [hypernym, punct, lem("notably"), hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?particularly (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "particularly",
                 "pattern": [hypernym, punct, lem("particularly"), hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?principally (NP_\\w+ ? (, )?(and |or )?)+)'
                # - fuses in a noun phrase
                {"label": "principally",
                 "pattern": [hypernym, punct, lem("principally"), hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?in particular (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "in_particular",
                 "pattern": [hypernym, punct, lem("in"), lem("particular"),
                             hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?except (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "except",
                 "pattern": [hypernym, punct, lem("except"), hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?other than (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "other_than",
                 "pattern": [hypernym, punct, lem("other"), lem("than"),
                             hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?e.g. (, )?(NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "eg",
                 "pattern": [hypernym, punct,
                             {"LEMMA": {"IN": ["e.g.", "eg"]}}, hyponym],
                 "posn": "first"},
                # Not ported yet ("eg-ie" - need to understand this pattern
                # better):
                # '(NP_\\w+ \\( (e.g.|i.e.) (, )?(NP_\\w+ ? (, )?(and |or )?)+(\\. )?\\))'
                # '(NP_\\w+ (, )?i.e. (, )?(NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "ie",
                 "pattern": [hypernym, punct,
                             {"LEMMA": {"IN": ["i.e.", "ie"]}}, hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?for example (, )?(NP_\\w+ ?(, )?(and |or )?)+)'
                {"label": "for_example",
                 "pattern": [hypernym, punct, lem("for"), lem("example"),
                             punct, hyponym],
                 "posn": "first"},
                # 'example of (NP_\\w+ (, )?be (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "example_of_be",
                 "pattern": [lem("example"), lem("of"), hypernym, punct,
                             lem("be"), hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?like (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "like",
                 "pattern": [hypernym, punct, lem("like"), hyponym],
                 "posn": "first"},
                # Skipped - repeat of the "such" pattern in the primary set?
                # 'such (NP_\\w+ (, )?as (NP_\\w+ ? (, )?(and |or )?)+)'
                # '(NP_\\w+ (, )?whether (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "whether",
                 "pattern": [hypernym, punct, lem("whether"), hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?compare to (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "compare_to",
                 "pattern": [hypernym, punct, lem("compare"), lem("to"),
                             hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )?among -PRON- (NP_\\w+ ? (, )?(and |or )?)+)'
                {"label": "among_-PRON-",
                 "pattern": [hypernym, punct, lem("among"), lem("-PRON-"),
                             hyponym],
                 "posn": "first"},
                # '(NP_\\w+ (, )? (NP_\\w+ ? (, )?(and |or )?)+ for instance)'
                {"label": "for_instance",
                 "pattern": [hypernym, punct, hyponym, lem("for"),
                             lem("instance")],
                 "posn": "first"},
                # '((NP_\\w+ ?(, )?)+(and |or )?any other NP_\\w+)'
                {"label": "and-or_any_other",
                 "pattern": [hyponym, punct, {"DEP": "cc"}, lem("any"),
                             lem("other"), hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and |or )?some other NP_\\w+)'
                {"label": "some_other",
                 "pattern": [hyponym, punct, {"DEP": "cc", "OP": "?"},
                             lem("some"), lem("other"), hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and |or )?be a NP_\\w+)'
                {"label": "be_a",
                 "pattern": [hyponym, punct, lem("be"), lem("a"), hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and |or )?like other NP_\\w+)'
                {"label": "like_other",
                 "pattern": [hyponym, punct, lem("like"), lem("other"),
                             hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and |or )?one of the NP_\\w+)'
                {"label": "one_of_the",
                 "pattern": [hyponym, punct, lem("one"), lem("of"),
                             lem("the"), hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and |or )?one of these NP_\\w+)'
                {"label": "one_of_these",
                 "pattern": [hyponym, punct, lem("one"), lem("of"),
                             lem("these"), hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and |or )?one of those NP_\\w+)'
                {"label": "one_of_those",
                 "pattern": [hyponym, punct, {"DEP": "cc", "OP": "?"},
                             lem("one"), lem("of"), lem("those"), hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and |or )?be example of NP_\\w+)'
                # optional "an" added for singular vs. plural
                {"label": "be_example_of",
                 "pattern": [hyponym, punct, lem("be"),
                             {"LEMMA": "an", "OP": "?"}, lem("example"),
                             lem("of"), hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and |or )?which be call NP_\\w+)'
                {"label": "which_be_call",
                 "pattern": [hyponym, punct, lem("which"), lem("be"),
                             lem("call"), hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and |or )?which be name NP_\\w+)'
                {"label": "which_be_name",
                 "pattern": [hyponym, punct, lem("which"), lem("be"),
                             lem("name"), hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and|or)? a kind of NP_\\w+)'
                # ('... kind of NP' is combined with this one)
                {"label": "a_kind_of",
                 "pattern": [hyponym, punct, lem("a"), lem("kind"),
                             lem("of"), hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and|or)? form of NP_\\w+)'
                {"label": "form_of",
                 "pattern": [hyponym, punct, {"LEMMA": "a", "OP": "?"},
                             lem("form"), lem("of"), hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and |or )?which look like NP_\\w+)'
                {"label": "which_look_like",
                 "pattern": [hyponym, punct, lem("which"), lem("look"),
                             lem("like"), hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and |or )?which sound like NP_\\w+)'
                {"label": "which_sound_like",
                 "pattern": [hyponym, punct, lem("which"), lem("sound"),
                             lem("like"), hypernym],
                 "posn": "last"},
                # '((NP_\\w+ ?(, )?)+(and |or )? NP_\\w+ type)'
                {"label": "type",
                 "pattern": [hyponym, punct, lem("type"), hypernym],
                 "posn": "last"},
                # '(compare (NP_\\w+ ?(, )?)+(and |or )?with NP_\\w+)'
                {"label": "compare_with",
                 "pattern": [lem("compare"), hyponym, punct, lem("with"),
                             hypernym],
                 "posn": "last"},
                # Disabled ("as"):
                # '((NP_\\w+ ?(, )?)+(and |or )?as NP_\\w+)'
                # '((NP_\\w+ ?(, )?)+(and|or)? sort of NP_\\w+)'
                {"label": "sort_of",
                 "pattern": [hyponym, punct, lem("sort"), lem("of"),
                             hypernym],
                 "posn": "last"},
            ])

        ## initiate matcher
        from spacy.matcher import Matcher
        self.matcher = Matcher(self.nlp.vocab, validate=True)

        # Modifier lemmas stripped by the "bronze" strategy
        # ("some" was added to the original list).
        self.predicate_list = [
            'able', 'available', 'brief', 'certain', 'different', 'due',
            'enough', 'especially', 'few', 'fifth', 'former', 'his',
            'howbeit', 'immediate', 'important', 'inc', 'its', 'last',
            'latter', 'least', 'less', 'likely', 'little', 'many', 'ml',
            'more', 'most', 'much', 'my', 'necessary', 'new', 'next', 'non',
            'old', 'other', 'our', 'ours', 'own', 'particular', 'past',
            'possible', 'present', 'proud', 'recent', 'same', 'several',
            'significant', 'similar', 'some', 'such', 'sup', 'sure'
        ]

        self.predicates = []  # label split on "_", used by the "gold" strategy
        self.first = []       # labels whose hypernym precedes the hyponym
        self.last = []        # labels whose hypernym follows the hyponym

        for pattern in self.patterns:
            self.matcher.add(pattern["label"], None, pattern["pattern"])
            self.predicates.append(pattern["label"].split('_'))
            if pattern["posn"] == "first":
                self.first.append(pattern["label"])
            elif pattern["posn"] == "last":
                self.last.append(pattern["label"])

    def isPredicateMatch_bronze(self, noun_chunknoun_chunk, predicates):
        """Strip leading modifier words from a noun chunk (bronze strategy).

        Args:
            noun_chunknoun_chunk: the chunk to check.
            predicates: lemmas to strip from the front of the chunk.

        Returns:
            The chunk without its leading predicate words. A chunk made up
            only of predicate words (e.g. "others") is returned unchanged
            rather than as a zero-length span.
        """
        chunk = noun_chunknoun_chunk
        counter = 0
        # Bound the scan so an all-predicate chunk cannot index past the end.
        while counter < len(chunk) and chunk[counter].lemma_ in predicates:
            counter += 1
        if counter >= len(chunk):
            counter = 0
        return chunk[counter:]

    def isPredicateMatch_silver(self, noun_chunk):
        """Strip leading stop words from a noun chunk (silver strategy).

        NOTE(review): as written the loop condition is ``not A and A`` on
        entry (``counter`` starts at 0), so the loop body never runs and the
        chunk is returned unchanged. Behavior is kept as-is; confirm the
        intended condition before changing it.
        """
        counter = 0
        while not noun_chunk[0].is_stop and noun_chunk[counter].is_stop:
            counter += 1
        return noun_chunk[counter:]

    def isPredicateMatch_gold(self, noun_chunk, predicates):
        """Strip a leading predicate phrase from a noun chunk (gold strategy).

        Args:
            noun_chunk: the chunk to check.
            predicates: predicate phrases, each a list of lemmas (the pattern
                labels split on "_").

        Returns:
            The chunk without the first predicate phrase that matches its
            leading tokens in full, or the whole chunk when none matches or
            when stripping would leave nothing.
        """
        for predicate in predicates:
            count = 0
            # Walk chunk and predicate in lockstep; stop at the end of either
            # so a chunk shorter than the predicate cannot raise IndexError.
            while (count < len(predicate) and count < len(noun_chunk)
                   and noun_chunk[count].lemma_ == predicate[count]):
                count += 1
            # Never produce a zero-length span (e.g. the chunk "others").
            if count >= len(noun_chunk):
                count = 0
            if count == len(predicate):
                return noun_chunk[count:]
        return noun_chunk[0:]

    def find_hyponyms(self, doc):
        """Return the hypernym/hyponym pairs found in ``doc``.

        Steps:
          1. merge each noun chunk into one token, after stripping predicate
             words with the configured strategy - otherwise words such as
             "particularly" or "some other" end up inside the merged chunk
             and the matcher cannot see them;
          2. run the matcher;
          3. for every match emit ``(hyponym_lemma, hypernym_lemma,
             predicate)``, plus one tuple per conjunct of the hyponym.

        Note that ``doc`` is retokenized in place.

        Returns:
            A list of tuples; empty when nothing matched.
        """
        with doc.retokenize() as retokenizer:
            for chunk in doc.noun_chunks:
                attrs = {"tag": chunk.root.tag, "dep": chunk.root.dep}
                if self.predicatematch == "bronze":
                    retokenizer.merge(
                        self.isPredicateMatch_bronze(chunk, self.predicate_list),
                        attrs=attrs)
                elif self.predicatematch == "silver":
                    retokenizer.merge(
                        self.isPredicateMatch_silver(chunk), attrs=attrs)
                elif self.predicatematch == "gold":
                    retokenizer.merge(
                        self.isPredicateMatch_gold(chunk, self.predicates),
                        attrs=attrs)

        pairs = []
        for match_id, start, end in self.matcher(doc):
            predicate = self.nlp.vocab.strings[match_id]

            if predicate in self.last:
                hypernym = doc[end - 1]
                hyponym = doc[start]
            else:
                # In the "such NOUN as" pattern the first token is "such",
                # not the hypernym - skip over it.
                if doc[start].lemma_ == "such":
                    start += 1
                hypernym = doc[start]
                hyponym = doc[end - 1]

            pairs.append((hyponym.lemma_, hypernym.lemma_, predicate))
            # Tokens conjoined with the hyponym ("and" / "or") share its
            # hypernym.
            for token in hyponym.conjuncts:
                if token is not None and token != hypernym:
                    pairs.append((token.lemma_, hypernym.lemma_, predicate))

        return pairs
class AbbreviationDetector:
    """
    Detects abbreviations using the algorithm in "A simple algorithm for identifying
    abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003).

    This class sets the `._.abbreviations` attribute on spaCy Doc.

    The abbreviations attribute is a `List[Span]` where each Span has the `Span._.long_form`
    attribute set to the long form definition of the abbreviation (it stays `None` for
    abbreviation-looking tokens for which no definition was found).

    Note that this class does not replace the spans, or merge them.
    """

    def __init__(self, nlp) -> None:
        Doc.set_extension("abbreviations", default=[], force=True)
        Span.set_extension("long_form", default=None, force=True)

        self.matcher = Matcher(nlp.vocab)
        # Anything inside a pair of parentheses is a definition candidate.
        self.matcher.add(
            "parenthesis", None, [{"ORTH": "("}, {"OP": "+"}, {"ORTH": ")"}]
        )
        # Proper nouns that look like an acronym count as abbreviations even
        # without a definition.
        abbreviation_regex = r'(([A-Z0-9-]+){2,})|(([A-Z0-9-]\.){2,})'
        acronym_rule = [{'TAG': 'NNP', 'TEXT': {'REGEX': abbreviation_regex}}]
        self.matcher.add("only_abbreviation", None, acronym_rule)

        # Holds one temporary rule per detected short form while a document
        # is being processed.
        self.global_matcher = Matcher(nlp.vocab)
        self.nlp = nlp

    def find(self, span: Span, doc: Doc) -> Tuple[Span, Set[Span]]:
        """
        Functional version of calling the matcher for a single span.
        This method is helpful if you already have an abbreviation which
        you want to find a definition for.

        Returns the first (long form, short forms) pair found, or
        `(span, set())` when there is none.
        """
        dummy_matches = [(-1, int(span.start), int(span.end))]
        filtered = filter_matches(dummy_matches, doc)
        abbreviations = self.find_matches_for(filtered, doc)

        if not abbreviations:
            return span, set()
        return abbreviations[0]

    def __call__(self, doc: Doc) -> Doc:
        """Populate `doc._.abbreviations` and return `doc`.

        Short forms with a detected definition come first (with
        `._.long_form` set); acronym-like tokens not already listed are
        appended afterwards.
        """
        strings = self.nlp.vocab.strings
        matches = self.matcher(doc)

        all_shorts = [
            doc[start:end]
            for match_id, start, end in matches
            if strings[match_id] == 'only_abbreviation'
        ]
        # Shrink each parenthesis match by one token on both sides to drop
        # the brackets themselves.
        matches_no_brackets = [
            (match_id, start + 1, end - 1)
            for match_id, start, end in matches
            if strings[match_id] == 'parenthesis'
        ]

        filtered = filter_matches(matches_no_brackets, doc)
        occurences = self.find_matches_for(filtered, doc)

        for (long_form, short_forms) in occurences:
            for short in short_forms:
                short._.long_form = long_form
                doc._.abbreviations.append(short)

        # Also report abbreviations that have no definition.
        for short in all_shorts:
            if short not in doc._.abbreviations:
                doc._.abbreviations.append(short)
        return doc

    def find_matches_for(
        self, filtered: List[Tuple[Span, Span]], doc: Doc
    ) -> List[Tuple[Span, Set[Span]]]:
        """Resolve (long, short) candidates and collect every short-form occurrence.

        For each candidate pair accepted by `find_abbreviation`, a temporary
        rule matching the short form's exact tokens is added to the global
        matcher, which is then run over `doc` to find all other occurrences.
        Every temporary rule is removed again before returning, including
        when an error is raised.

        Returns a list of `(long form, {short form spans})` pairs.
        """
        rules = {}
        all_occurences: Dict[Span, Set[Span]] = defaultdict(set)
        already_seen_long: Set[str] = set()
        already_seen_short: Set[str] = set()

        try:
            for (long_candidate, short_candidate) in filtered:
                short, long = find_abbreviation(long_candidate, short_candidate)
                # We need the long and short form definitions to be unique, because we need
                # to store them so we can look them up later. This is a bit of a
                # pathalogical case also, as it would mean an abbreviation had been
                # defined twice in a document. There's not much we can do about this,
                # but at least the case which is discarded will be picked up below by
                # the global matcher. So it's likely that things will work out ok most of the time.
                new_long = long.string not in already_seen_long if long else False
                new_short = short.string not in already_seen_short
                if long is not None and new_long and new_short:
                    already_seen_long.add(long.string)
                    already_seen_short.add(short.string)
                    all_occurences[long].add(short)
                    # Add a rule to a matcher to find exactly this substring.
                    self.global_matcher.add(
                        long.string, None, [{"ORTH": x.text} for x in short]
                    )
                    # Record the rule only once it is registered, so the
                    # clean-up below never removes a rule that was not added.
                    rules[long.string] = long

            for match, start, end in self.global_matcher(doc):
                string_key = self.global_matcher.vocab.strings[match]
                all_occurences[rules[string_key]].add(doc[start:end])
        finally:
            # Clean up the global matcher: drop every rule added above so
            # nothing leaks into the next document.
            for key in rules:
                self.global_matcher.remove(key)

        return list(all_occurences.items())
def test_matcher_valid_callback(en_vocab):
    """Check that a non-callable, non-None ``on_match`` raises ValueError."""
    token_patterns = [[{"TEXT": "test"}]]
    not_a_callback = []
    target = Matcher(en_vocab)
    with pytest.raises(ValueError):
        target.add("TEST", token_patterns, on_match=not_a_callback)
        target(Doc(en_vocab, words=["test"]))
def test_matcher_no_zero_length(en_vocab):
    """Check that a purely optional token that matches nothing yields no match."""
    doc = Doc(en_vocab, words=["a", "b"], tags=["A", "B"])
    # Tag "C" occurs nowhere in the doc, so only an empty match is possible.
    optional_only = [{"TAG": "C", "OP": "?"}]
    target = Matcher(en_vocab)
    target.add("TEST", [optional_only])
    found = target(doc)
    assert len(found) == 0
def test_matcher_basic_check(en_vocab):
    """Check that passing one pattern instead of a list of patterns raises ValueError."""
    # Potential mistake: a single pattern (list of token specs) is handed
    # over where a list of patterns is expected.
    single_pattern = [{"TEXT": "hello"}, {"TEXT": "world"}]
    target = Matcher(en_vocab)
    with pytest.raises(ValueError):
        target.add("TEST", single_pattern)
def test_matcher_intersect_value_operator(en_vocab):
    """Exercise the INTERSECTS operator on MORPH, TAG and a custom extension."""

    def matcher_for(token_spec):
        # Fresh matcher holding a single one-token pattern under key "M".
        built = Matcher(en_vocab)
        built.add("M", [[token_spec]])
        return built

    def abc_doc():
        return Doc(en_vocab, words=["a", "b", "c"])

    # MORPH: no match without features, a match as soon as one is shared.
    matcher = matcher_for(
        {"MORPH": {"INTERSECTS": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}})
    doc = abc_doc()
    assert len(matcher(doc)) == 0
    for morph in (
        "Feat=Val",
        "Feat=Val|Feat2=Val2",
        "Feat=Val|Feat2=Val2|Feat3=Val3",
        "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4",
    ):
        doc[0].set_morph(morph)
        assert len(matcher(doc)) == 1

    # INTERSECTS with a single value is the same as IN
    matcher = matcher_for({"TAG": {"INTERSECTS": ["A", "B"]}})
    doc = abc_doc()
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 1

    # INTERSECTS with an empty pattern list matches nothing
    matcher = matcher_for({"TAG": {"INTERSECTS": []}})
    doc = abc_doc()
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 0

    # INTERSECTS with a list value
    Token.set_extension("ext", default=[])
    matcher = matcher_for({"_": {"ext": {"INTERSECTS": ["A", "C"]}}})
    doc = abc_doc()
    doc[0]._.ext = ["A", "B"]
    assert len(matcher(doc)) == 1

    # INTERSECTS with an empty pattern list matches nothing
    matcher = matcher_for({"_": {"ext": {"INTERSECTS": []}}})
    doc = abc_doc()
    doc[0]._.ext = ["A", "B"]
    assert len(matcher(doc)) == 0

    # INTERSECTS with an empty value matches nothing
    matcher = matcher_for({"_": {"ext": {"INTERSECTS": ["A", "B"]}}})
    doc = abc_doc()
    doc[0]._.ext = []
    assert len(matcher(doc)) == 0
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("fr_core_news_sm")

# A noun followed by one adjective and, optionally, a second one.
pattern = [{"POS": "NOUN"}, {"POS": "ADJ"}, {"POS": "ADJ", "OP": "?"}]

# Register the rule (no on_match callback) before parsing the sample text.
matcher = Matcher(nlp.vocab)
matcher.add("ADJ_NOUN_PATTERN", None, pattern)

doc = nlp(
    "L'appli se distingue par une interface magnifique, la recherche "
    "intelligente, la labellisation automatique et les réponses "
    "vocales fluides."
)

matches = matcher(doc)
print("Nombre de correspondances trouvées :", len(matches))

# Each match is (match_id, start_token, end_token); show the matched text.
for match_id, start, end in matches:
    print("Correspondance trouvée :", doc[start:end].text)
def feature_extraction(df, ft_model, nlp):
    """Group the most frequent nouns of a review corpus into product features.

    The function works in stages:

    1. Count every noun and every two-noun phrase (two adjacent ``NN`` tokens).
    2. Keep the top 5% most frequent nouns as candidate features.
    3. Association step: when two candidates form a known phrase (e.g.
       "battery life") and the phrase accounts for more than 40% of the rarer
       noun's occurrences, file the rarer noun under the more frequent one.
    4. Add up to 20 top nouns that were not grouped and are not similar
       (cosine similarity above 0.64) to an existing feature or to each other.
    5. Attach every remaining candidate to its most similar feature
       (similarity in ``(0.62, 0.99]``).

    Wherever the result depends on iteration order, candidates are visited in
    descending frequency order so that the grouping is reproducible.

    Args:
        df: Table whose ``'spacyObj'`` column holds the parsed reviews; each
            entry is iterated token by token and passed to a spaCy Matcher.
        ft_model: Embedding model exposing ``get_word_vector(word)``; the
            result is reshaped to ``(1, -1)`` for ``cosine_similarity``.
        nlp: spaCy pipeline; only ``nlp.vocab`` is used.

    Returns:
        OrderedDict mapping each feature noun to the list of nouns grouped
        under it, ordered by descending frequency of the feature noun.
    """

    def similarity(word_a, word_b):
        """Return the cosine similarity of the two words' embedding vectors."""
        return cosine_similarity(
            ft_model.get_word_vector(word_a).reshape(1, -1),
            ft_model.get_word_vector(word_b).reshape(1, -1))[0][0]

    reviews = df['spacyObj']

    # Frequency table of all single nouns, sorted by descending count.
    unique_nouns = pd.Series(
        [token.text for review in reviews for token in review
         if token.pos_ == "NOUN"]).value_counts()

    # Frequency table of two-noun phrases, i.e. two nouns occurring together.
    matcher = Matcher(nlp.vocab)
    matcher.add('NounPhrasees', [[{'TAG': 'NN'}, {'TAG': 'NN'}]])
    noun_phrases = []
    for review in reviews:
        for _match_id, start, end in matcher(review):
            noun_phrases.append(review[start:end].text)
    unique_noun_phrases = pd.Series(noun_phrases).value_counts()

    # A noun is invalid when it is shorter than three characters or contains
    # a digit. Filtering with a mask avoids deleting from the series while
    # walking its index.
    valid = [len(noun) >= 3 and re.match(r".*[0-9].*", noun) is None
             for noun in unique_nouns.index]
    unique_nouns = unique_nouns[valid]

    # Candidate features: the top 5% of the remaining nouns.
    top_count = int(len(unique_nouns) * 0.05)
    top_features_list = list(unique_nouns.iloc[:top_count].keys())
    top_features_set = set(top_features_list)
    unique_noun_phrases_set = set(unique_noun_phrases.keys())

    # This will contain all the final features.
    features_bucket = OrderedDict()

    # Association rule mining: group nouns that occur together.
    for feature1 in top_features_list:
        for feature2 in top_features_list:
            feature_phrase = feature1 + ' ' + feature2
            if not (feature1 in top_features_set
                    and feature2 in top_features_set
                    and feature_phrase in unique_noun_phrases_set):
                continue
            # Both nouns are candidates and appear together as a phrase, so
            # one of them may only be part of that phrase. Example: if
            # "battery life" is found, "life" is probably not a feature on
            # its own.
            if unique_nouns[feature1] < unique_nouns[feature2]:
                lesser_occurring_noun, often_occurring_noun = feature1, feature2
            else:
                lesser_occurring_noun, often_occurring_noun = feature2, feature1
            # Confidence threshold of 40%: in the example, "battery" must
            # precede "life" in more than 40% of the occurrences of "life".
            if (unique_noun_phrases[feature_phrase]
                    / unique_nouns[lesser_occurring_noun]) > 0.4:
                features_bucket.setdefault(
                    often_occurring_noun, []).append(lesser_occurring_noun)
                top_features_set.discard(lesser_occurring_noun)

    main_features = list(features_bucket.keys())
    top_twenty = top_features_list[:20]
    top_features_to_add = set(top_twenty)

    # Manually add the top 20 nouns that the association step did not group,
    # unless they are similar to a feature already in the bucket (e.g. only
    # one of "display" / "screen" should survive).
    for feature1 in top_twenty:
        ungrouped = (feature1 not in features_bucket
                     and feature1 in top_features_set)
        for feature2 in main_features:
            if not ungrouped or similarity(feature1, feature2) > 0.64:
                top_features_to_add.discard(feature1)
                break

    # Eliminate nouns that are similar to one another among the additions,
    # keeping the more frequent noun of each similar pair. The 0.99 upper
    # bound skips a noun compared with itself.
    candidates = [f for f in top_twenty if f in top_features_to_add]
    for feature1 in candidates:
        for feature2 in candidates:
            if (feature1 in top_features_to_add
                    and feature2 in top_features_to_add):
                score = similarity(feature1, feature2)
                if 0.64 < score < 0.99:
                    feature_to_remove = min(
                        (unique_nouns[feature1], feature1),
                        (unique_nouns[feature2], feature2))[1]
                    top_features_to_add.remove(feature_to_remove)
    for feature in candidates:
        if feature in top_features_to_add:
            features_bucket[feature] = []

    # Bucket keys are features in their own right now. discard() is required
    # here: a key may already have been dropped from the candidate set as the
    # "lesser" noun of another phrase (a repeated noun such as "bye bye", or
    # two nouns with equal counts), and remove() would raise KeyError.
    for main_noun in features_bucket:
        top_features_set.discard(main_noun)

    # Attach every remaining candidate to the most similar main feature.
    remaining = [f for f in top_features_list if f in top_features_set]
    for feature2 in remaining:
        best_similarity = 0
        most_matching_main_feature = ""
        for feature1 in features_bucket:
            score = similarity(feature1, feature2)
            if 0.62 < score <= 0.99 and score > best_similarity:
                best_similarity = score
                most_matching_main_feature = feature1
        if best_similarity != 0 and most_matching_main_feature != "":
            features_bucket[most_matching_main_feature].append(feature2)
            top_features_set.remove(feature2)

    # Finally sort the features in descending order of how often they occur
    # (the sort is stable, so ties keep their insertion order).
    ranked = sorted(features_bucket.items(),
                    key=lambda item: unique_nouns[item[0]],
                    reverse=True)
    return OrderedDict(ranked)
{'POS':'NOUN'}, {'LOWER':'such'}, {'LOWER':'as'}, {'POS':'PROPN'}] # Extract the pattern from the text # Create a Matcher object. Pass in the vocab for the loaded SpaCy model # (the vocab stores the vocabularly and other data shared across a language) matcher = Matcher(nlp.vocab) # See https://stackoverflow.com/questions/66164156/problem-with-using-spacy-matcher-matcher-matcher-add-method # Define the Matcher object # (matcher now only takes 2 positional arguments in Spacy 3 - an ID for the # matcher, and a list of patterns (which must be passed as a list, even if # there is only 1 pattern) matcher.add("matching_1", [pattern]) # Apply the matcher to the SpaCy document matches = matcher(doc) # The matcher returns a list of three-element tuples, in which each tuple is : # (match_id, start, end). match_id is the hash value of the ID of the matcher # ("matching_1" in this case). start and end represent the token positions of # where the identified match starts and ends (So the first token is 0, second # token is 1 etc). We'll get a tuple returned for every match identified. # Here, we know we only have one match, so we can just refer to matches[0], # which refers to the first (and only) tuple. We can then specify the "span" # (the matched text) using the start and end token positions stored in # the second and third elements of the tuple (so matches[0][1] and # matches[0][2] respectively) span = doc[matches[0][1]:matches[0][2]]
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')

# --- Token matching for rule-based matching example ---
matcher = Matcher(nlp.vocab)

# Three spellings of the same term: one token, two tokens separated by a
# punctuation token, and two consecutive tokens.
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]

# Patterns are passed as one list in second position; the older
# matcher.add(key, None, *patterns) form is rejected by spaCy v3.
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

doc = nlp(u"The Solar Power industry continues to grow a solarpower increases. Solar-Power is amazing.")
found_matches = matcher(doc)
print(found_matches)

# Results printed by the loop below, as recorded by the original author:
# 8656102463236116519 SolarPower 1 3 Solar Power
# 8656102463236116519 SolarPower 8 9 solarpower
# 8656102463236116519 SolarPower 12 15 Solar-Power
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # hash back to the rule name
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

# --- Phrase matcher ---
class UnitComponent(BaseComponent):
    """
    A pipeline component that tags units

    Begins by first tagging all mass, volume, time, and form units, then
    tags the compound patterns (units of measurement, measurements) whose
    token specs refer to those entity types.

    Every stage sets a boolean ``feature_is_*`` extension on the matched
    tokens, merges multi-token matches into one token and appends the match
    to ``doc.ents`` under the stage's label.
    """
    # TODO Split into files mass_annotator_component, volume_annotator_component
    # TODO time_annotator_component, etc.

    name = "unit_annotator"
    dependencies = []

    # (entity label, Token extension flag), in registration order.
    _LABELS = (
        ('mass_unit', 'feature_is_mass_unit'),
        ('volume_unit', 'feature_is_volume_unit'),
        ('time_unit', 'feature_is_time_unit'),
        ('route_type', 'feature_is_route_type'),
        ('form_unit', 'feature_is_form_unit'),
        ('frequency_indicator', 'feature_is_frequency_indicator'),
        ('measurement_unit', 'feature_is_measurement_unit'),
        ('measurement', 'feature_is_measurement'),
        ('duration_pattern', 'feature_is_duration_pattern'),
    )

    # (matcher attribute, entity label, Token extension flag), in the order
    # the stages run. Order matters: later matchers use ENT_TYPE specs that
    # refer to labels assigned by earlier stages.
    _STAGES = (
        ('mass_matcher', 'mass_unit', 'feature_is_mass_unit'),
        ('volume_matcher', 'volume_unit', 'feature_is_volume_unit'),
        ('time_matcher', 'time_unit', 'feature_is_time_unit'),
        ('duration_matcher', 'duration_pattern', 'feature_is_duration_pattern'),
        ('frequency_matcher', 'frequency_indicator',
         'feature_is_frequency_indicator'),
        ('form_matcher', 'form_unit', 'feature_is_form_unit'),
        ('route_matcher', 'route_type', 'feature_is_route_type'),
        ('unit_of_measurement_matcher', 'measurement_unit',
         'feature_is_measurement_unit'),
        ('measurement_matcher', 'measurement', 'feature_is_measurement'),
    )

    def __init__(self, nlp):
        """Register the extensions and entity labels and build the matchers.

        Args:
            nlp: The spaCy pipeline; its vocab backs every matcher and its
                entity recognizer (``nlp.entity``) receives the labels.
        """
        self.nlp = nlp
        for label, flag in self._LABELS:
            # force=True lets the component be constructed more than once in
            # the same process without failing on the existing extension.
            Token.set_extension(flag, default=False, force=True)
            nlp.entity.add_label(label)

        self.mass_matcher = Matcher(nlp.vocab)
        self.volume_matcher = Matcher(nlp.vocab)
        self.time_matcher = Matcher(nlp.vocab)
        self.route_matcher = Matcher(nlp.vocab)
        self.form_matcher = Matcher(nlp.vocab)
        self.unit_of_measurement_matcher = Matcher(nlp.vocab)
        self.measurement_matcher = Matcher(nlp.vocab)
        self.frequency_matcher = Matcher(nlp.vocab)
        self.duration_matcher = Matcher(nlp.vocab)

        self.mass_matcher.add(
            'UNIT_OF_MASS', None,
            [{'LOWER': 'mcg'}], [{'LOWER': 'microgram'}],
            [{'LOWER': 'micrograms'}], [{'ORTH': 'mg'}],
            [{'LOWER': 'milligram'}], [{'LOWER': 'g'}], [{'LOWER': 'kg'}],
            [{'ORTH': 'mEq'}])

        self.volume_matcher.add(
            'UNIT_OF_VOLUME', None,
            [{'LOWER': 'ml'}], [{'ORTH': 'dL'}], [{'LOWER': 'cc'}],
            [{'ORTH': 'L'}])

        self.time_matcher.add(
            'UNIT_OF_TIME', None,
            [{'LOWER': 'sec'}], [{'LOWER': 'second'}], [{'LOWER': 'seconds'}],
            [{'LOWER': 'min'}], [{'LOWER': 'minute'}], [{'LOWER': 'minutes'}],
            [{'LOWER': 'hr'}], [{'LOWER': 'hour'}],
            [{'LOWER': 'day'}], [{'LOWER': 'days'}],
            [{'LOWER': 'week'}], [{'LOWER': 'weeks'}],
            [{'LOWER': 'month'}], [{'LOWER': 'months'}],
            [{'LOWER': 'year'}], [{'LOWER': 'years'}], [{'LOWER': 'yrs'}])

        self.frequency_matcher.add(
            'FREQUENCY_MATCHER', None,
            [{'LOWER': 'bid'}], [{'LOWER': 'prn'}], [{'LOWER': 'qid'}],
            [{'LOWER': 'tid'}], [{'LOWER': 'qd'}], [{'LOWER': 'daily'}],
            [{'LOWER': 'hs'}],
            [{'LOWER': 'as'}, {'LOWER': 'needed'}],
            [{'LOWER': 'once'}, {'LOWER': 'a'}, {'LOWER': 'day'}],
            [{'LOWER': 'twice'}, {'LOWER': 'a'}, {'LOWER': 'day'}])

        self.form_matcher.add(
            'UNIT_OF_FORM', None,
            [{'ORTH': 'dose'}], [{'ORTH': 'doses'}], [{'LEMMA': 'pill'}],
            [{'LEMMA': 'tablet'}], [{'LEMMA': 'unit'}], [{'LEMMA': 'u'}],
            [{'LEMMA': 'patch'}], [{'LEMMA': 'unit'}], [{'ORTH': 'lotion'}],
            [{'ORTH': 'powder'}], [{'ORTH': 'amps'}],
            [{'LOWER': 'actuation'}], [{'LEMMA': 'suspension'}],
            [{'LEMMA': 'syringe'}], [{'LEMMA': 'puff'}],
            [{'LEMMA': 'liquid'}], [{'LEMMA': 'aerosol'}], [{'LEMMA': 'cap'}])

        self.route_matcher.add(
            'TYPE_OF_ROUTE', None,
            # LOWER is compared against the lowercased token text, so the
            # value must itself be lowercase ('IV' could never match).
            [{'LOWER': 'iv'}], [{'ORTH': 'intravenous'}], [{'LOWER': 'po'}],
            [{'ORTH': 'gtt'}], [{'LOWER': 'drip'}],
            [{'LOWER': 'inhalation'}],
            [{'LOWER': 'by'}, {'LOWER': 'mouth'}],
            [{'LOWER': 'topical'}], [{'LOWER': 'subcutaneous'}],
            [{'LOWER': 'ophthalmic'}], [{'LEMMA': 'injection'}],
            [{'LOWER': 'mucous'}, {'LOWER': 'membrane'}],
            [{'LOWER': 'oral'}], [{'LOWER': 'nebs'}],
            [{'LOWER': 'transdermal'}], [{'LOWER': 'nasal'}])

        self.unit_of_measurement_matcher.add(
            'UNIT_OF_MEASUREMENT', None,
            [{'ENT_TYPE': 'mass_unit'}, {'ORTH': '/'},
             {'ENT_TYPE': 'volume_unit'}],
            [{'ENT_TYPE': 'volume_unit'}, {'ORTH': '/'},
             {'ENT_TYPE': 'time_unit'}],
            [{'ENT_TYPE': 'form_unit'}, {'ORTH': '/'},
             {'ENT_TYPE': 'volume_unit'}])

        self.measurement_matcher.add(
            'MEASUREMENT', None,
            [{'LIKE_NUM': True}, {'ORTH': '%'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'measurement_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'mass_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'volume_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'form_unit'}],
            [{'LIKE_NUM': True}, {'LOWER': 'x'}, {'ENT_TYPE': 'form_unit'}])

        self.duration_matcher.add(
            'DURATION', None,
            # NOTE(review): 'PREP' does not look like a Universal POS tag
            # (prepositions are tagged 'ADP'), so this first pattern
            # presumably never matches - confirm the intended tag.
            [{'POS': 'PREP'}, {'LIKE_NUM': True}, {'ENT_TYPE': 'time_unit'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'time_unit'}],
            [{'LOWER': 'in'}, {'LIKE_NUM': True}, {'ENT_TYPE': 'time_unit'}],
            [{'LOWER': 'prn'}])

    def __call__(self, doc):
        """Run every tagging stage over ``doc`` and return the same doc."""
        logging.debug("Called UnitAnnotator Component")
        for matcher_attr, label, flag in self._STAGES:
            self._annotate(doc, getattr(self, matcher_attr), label, flag)
        return doc

    def _annotate(self, doc, matcher, label, flag):
        """Flag, merge and label every match of ``matcher`` in ``doc``.

        Args:
            doc: The document being annotated (modified in place).
            matcher: The Matcher whose matches are tagged.
            label: Entity label given to each matched span.
            flag: Name of the boolean Token extension set on matched tokens.
        """
        label_id = self.nlp.vocab.strings[label]
        # Merges are queued on the retokenizer and applied when the context
        # exits, so token indices stay valid while the matches are processed.
        with doc.retokenize() as retokenizer:
            for _match_id, start, end in matcher(doc):
                span = Span(doc, start, end, label=label_id)
                for token in span:
                    token._.set(flag, True)
                if len(span) > 1:
                    try:
                        retokenizer.merge(span)
                    except ValueError as err:
                        # Best effort: a span that cannot be merged keeps its
                        # separate tokens but is still flagged and labelled.
                        logging.debug("Could not merge %r: %s", span.text, err)
                doc.ents = list(doc.ents) + [span]
class PatternMatcher:
    """Find comparative sentence patterns around a pair of technology names.

    Text is reduced to a sequence of POS-like tags (with ``TECH``, ``CIN``
    and ``CV`` substituted for special words), the tag sequence is run
    through a spaCy Matcher, and neighbouring text segments that mention the
    two technologies with opposite sentiment are appended to an output file.
    """

    def __init__(self):
        # Number of matches seen per pattern id, keyed by the id as a string.
        self.count = {str(pattern_id): 0 for pattern_id in range(11)}
        self.compa_sent_count = 0
        self.nlp = spacy.load("en")
        self.matcher = Matcher(self.nlp.vocab)

        # Disabled pattern sets, kept for reference. Tag sequences are listed
        # with '*' standing for a wildcard token:
        #   6: JJR CIN TECH | JJR * CIN TECH | JJR CIN * TECH
        #      | JJR * CIN * TECH
        #   7: RB JJ CIN TECH | RB JJ * CIN TECH
        #   8: RBR JJ CIN TECH | RBR JJ * CIN TECH
        #   4: NN IN TECH VBZ * RB | NN IN TECH * VBZ | NN IN TECH VBZ RB
        #      | NN IN * TECH VBZ RB | NN IN * TECH VBZ * RB
        #   1: TECH VBZ * * * JJ | TECH VBD * * * JJ
        #   3: TECH VBZ * * * RB | TECH VBD * * * RB
        self.matcher.add(5, None, *self._tag_patterns(
            ('TECH', 'VBP', 'NN'),
            ('TECH', None, 'VBP', 'NN'),
            ('TECH', 'VBP', None, 'NN'),
            ('TECH', None, 'VBP', None, 'NN'),
            ('TECH', 'VBZ', 'NN'),
            ('TECH', None, 'VBZ', 'NN'),
            ('TECH', 'VBZ', None, 'NN'),
            ('TECH', None, 'VBZ', None, 'NN'),
        ))
        self.matcher.add(1, None, *self._tag_patterns(
            ('TECH', 'VBZ', 'JJ'),
            ('TECH', None, 'VBZ', 'JJ'),
            ('TECH', 'VBZ', None, 'JJ'),
            ('TECH', 'VBZ', None, 'JJ'),
            ('TECH', None, 'VBZ', None, 'JJ'),
            ('TECH', 'VBD', 'JJ'),
            ('TECH', None, 'VBD', 'JJ'),
            ('TECH', 'VBD', None, 'JJ'),
            ('TECH', 'VBD', None, 'JJ'),
            ('TECH', None, 'VBD', None, 'JJ'),
        ))
        self.matcher.add(3, None, *self._tag_patterns(
            ('TECH', 'VBZ', 'RB'),
            ('TECH', None, 'VBZ', 'RB'),
            ('TECH', 'VBZ', None, 'RB'),
            ('TECH', 'VBZ', None, 'RB'),
            ('TECH', None, 'VBZ', None, 'RB'),
            ('TECH', 'VBD', 'RB'),
            ('TECH', None, 'VBD', 'RB'),
            ('TECH', 'VBD', None, 'RB'),
            ('TECH', 'VBD', None, 'RB'),
            ('TECH', None, 'VBD', None, 'RB'),
        ))

    @staticmethod
    def _tag_patterns(*tag_sequences):
        """Build Matcher patterns from tag sequences; ``None`` is a wildcard."""
        return [[{} if tag is None else {'ORTH': tag} for tag in sequence]
                for sequence in tag_sequences]

    def add_pos_tag(self, words, tech_pair):
        """Return one tag per whitespace-separated word of ``words``.

        Words are tagged by the CoreNLP server at ``localhost:9000``; if it
        returns a different number of tokens, ``pos_tag`` is used instead.
        The tag is replaced by ``CIN`` for an ``IN`` word listed in ``cin``,
        by ``CV`` for a verb listed in ``cv``, and by ``TECH`` for either of
        the first two words of ``tech_pair``.

        Args:
            words: Text to tag.
            tech_pair: The two technology names separated by whitespace.

        Returns:
            List of tag strings; empty when ``words`` has no tokens.
        """
        tokens = words.split()
        if not tokens:
            # Covers both the empty string and whitespace-only text, so the
            # tagger is never called with an empty token list.
            return []
        tagged_words = CoreNLPParser(url='http://localhost:9000',
                                     tagtype='pos').tag(tokens)
        if len(tokens) != len(tagged_words):
            tagged_words = pos_tag(tokens)
        tech_words = tech_pair.split()[:2]
        tag_list = []
        for word, tag in tagged_words:
            if tag == "IN" and word in cin:
                tag_list.append("CIN")
            elif tag[:2] == "VB" and word in cv:
                tag_list.append("CV")
            elif word in tech_words:
                tag_list.append("TECH")
            else:
                tag_list.append(tag)
        return tag_list

    def match_pattern(self, pre, words, post, current_id, tech_pair):
        """Log ``words`` with a neighbour when they compare the two techs.

        A segment pair (``pre``/``words`` or ``words``/``post``) is recorded
        when a pattern matched in ``words`` or ``post``, both segments
        contain a ``TECH`` tag, their sentiment compound scores have
        opposite polarity, and each segment mentions a different member of
        ``tech_pair``.

        Args:
            pre: Text preceding ``words``.
            words: The text segment under inspection.
            post: Text following ``words``.
            current_id: Identifier written to the output file.
            tech_pair: The two technology names separated by whitespace.
        """
        pre_rm, words_rm, post_rm = pre, words, post
        for w in remove_word:
            pre_rm = pre_rm.replace(w, ' ')
            words_rm = words_rm.replace(w, ' ')
            post_rm = post_rm.replace(w, ' ')

        tag_list = self.add_pos_tag(words_rm, tech_pair)
        pre_tag_list = self.add_pos_tag(pre_rm, tech_pair)
        post_tag_list = self.add_pos_tag(post_rm, tech_pair)

        words_patterns = self._match_tags(tag_list)
        pre_patterns = self._match_tags(pre_tag_list)
        post_patterns = self._match_tags(post_tag_list)
        patterns = pre_patterns + words_patterns + post_patterns

        if not words_patterns and not post_patterns:
            return
        if 'TECH' not in tag_list:
            # Both comparisons below require a tech mention in ``words``.
            return

        words_score = sid.polarity_scores("{}".format(words))['compound']
        neighbours = (
            (pre, pre_tag_list, (pre, words)),
            (post, post_tag_list, (words, post)),
        )
        for context, context_tags, (first_text, second_text) in neighbours:
            if 'TECH' not in context_tags:
                continue
            context_score = sid.polarity_scores(
                "{}".format(context))['compound']
            if not self._opposite_polarity(context_score, words_score):
                continue
            if not self._pair_split_across(tech_pair, context, words):
                continue
            self.compa_sent_count += 1
            self._record(current_id, tech_pair, patterns,
                         first_text, second_text)

    def _match_tags(self, tags):
        """Run the matcher over a tag sequence; no tags means no matches."""
        if not tags:
            return []
        return self.matcher(self.nlp(" ".join(tags)))

    @staticmethod
    def _opposite_polarity(score_a, score_b):
        """True when one compound score is >= 0.05 and the other <= -0.05."""
        return ((score_a >= 0.05 and score_b <= -0.05)
                or (score_b >= 0.05 and score_a <= -0.05))

    @staticmethod
    def _pair_split_across(tech_pair, text_a, text_b):
        """True when each text mentions a different member of the pair.

        ``tech_pair`` is one whitespace-separated string, so its members are
        obtained with ``split()``; indexing the string would compare single
        characters instead of the technology names.
        """
        techs = tech_pair.split()
        if len(techs) < 2:
            return False
        first, second = techs[0], techs[1]
        return ((first in text_a and second in text_b)
                or (first in text_b and second in text_a))

    def _record(self, current_id, tech_pair, patterns, first_text, second_text):
        """Append one comparison to the per-process output file and count it."""
        path = os.path.join(os.pardir, "outnew", "pattern_v4",
                            "test_output_{}.txt".format(os.getpid()))
        # The context manager closes the file even if a write or the counter
        # update raises.
        with open(path, "a") as data:
            data.write("{}\n".format(current_id))
            data.write("{}\nPattern(s): ".format(tech_pair))
            for pattern in patterns:
                self.count[str(pattern[0])] += 1
                data.write(str(pattern[0]) + "\t")
            data.write("\n")
            data.write("{}\n".format(first_text))
            data.write("{}\n".format(second_text))
            data.write("\n\n\n")
import json

from spacy.matcher import Matcher
from spacy.lang.es import Spanish

with open("exercises/es/adidas.json", encoding="utf8") as f:
    TEXTS = json.load(f)

nlp = Spanish()
matcher = Matcher(nlp.vocab)

# Exercise template: replace every ____ placeholder below.

# Two tokens whose lowercase forms match "adidas" and "zx"
pattern1 = [{____: ____}, {____: ____}]

# Token whose lowercase form matches "adidas", followed by a digit
pattern2 = [{____: ____}, {____: ____}]

# Add the patterns to the matcher and check the result. The patterns are
# passed as one list in second position; the older
# matcher.add(key, None, *patterns) form is rejected by spaCy v3.
matcher.add("ROPA", [pattern1, pattern2])
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])
import json

from spacy.matcher import Matcher
from spacy.lang.en import English

with open("exercises/en/iphone.json", encoding="utf8") as f:
    TEXTS = json.load(f)

nlp = English()
matcher = Matcher(nlp.vocab)

# Exercise template: replace every ____ placeholder below.

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{____: ____}, {____: ____}]

# Token whose lowercase form matches "iphone" and a digit
pattern2 = [{____: ____}, {____: ____}]

# Add patterns to the matcher and check the result. The patterns are passed
# as one list in second position; the older
# matcher.add(key, None, *patterns) form is rejected by spaCy v3.
matcher.add("GADGET", [pattern1, pattern2])
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])
def _merge_matches(doc, matcher, key, flag):
    """Merge each match of rule ``key`` into one token and set ``flag`` on it.

    Overlapping matches are first combined into their union, because the
    retokenizer refuses to merge spans that share a token (for example the
    two three-token matches inside ``1 , 000 , 000``).
    """
    bounds = sorted(
        (start, end) for match_id, start, end in matcher(doc)
        if doc.vocab.strings[match_id] == key)
    merged = []
    for start, end in bounds:
        if merged and start < merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    with doc.retokenize() as retokenizer:
        for start, end in merged:
            span = doc[start:end]
            retokenizer.merge(span)
            # The merge is applied when the context exits; the flag is set on
            # the tokens that make up the span at this point.
            for token in span:
                token._.set(flag, True)


def custom_tokenizer_to_df(nlp, doc):
    """Merge hashtags and long numbers in ``doc`` and tabulate its tokens.

    Args:
        nlp: spaCy pipeline; only ``nlp.vocab`` is used.
        doc: Parsed document; it is retokenized and re-tagged in place.

    Returns:
        DataFrame with one row per token and the columns ``Token``, ``POS``,
        ``NE``, ``Lemma``, ``Tag``, ``Language``, ``Candidate`` and
        ``Anglicism``.
    """
    # Initialize the Matcher with a vocab
    matcher = Matcher(nlp.vocab)

    # Hashtag: a '#' token followed by an alphabetic token.
    matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ALPHA": True}]])
    Token.set_extension("is_hashtag", default=False, force=True)
    _merge_matches(doc, matcher, "HASHTAG", "is_hashtag")

    # Long number: digits, a ',' or '.' separator, digits.
    matcher.add("LONG_NUMBER", [
        [{"IS_DIGIT": True}, {"ORTH": ','}, {"IS_DIGIT": True}],
        [{"IS_DIGIT": True}, {"ORTH": '.'}, {"IS_DIGIT": True}],
    ])
    Token.set_extension("is_long_number", default=False, force=True)
    _merge_matches(doc, matcher, "LONG_NUMBER", "is_long_number")

    # Several checks can apply to the same token; the last one that holds
    # determines the final tag.
    for token in doc:
        if token._.is_hashtag:
            token.tag_ = 'Hashtag'
        if token.like_url:
            token.tag_ = 'URL'
        if token.like_email:
            token.tag_ = 'Email'
        if token.is_stop:
            token.tag_ = 'Stop Word'
        if token.like_num:
            token.tag_ = 'Number'
        if token._.is_long_number:
            token.tag_ = 'Number'
        if token.is_punct:
            token.tag_ = 'Punctuation'

    # Write the tokens to data frame
    df = pd.DataFrame()
    df['Token'] = [token.text for token in doc]
    df['POS'] = [token.pos_ for token in doc]
    df['NE'] = [token.ent_iob_ for token in doc]
    df['Lemma'] = [token.lemma_ for token in doc]
    df['Tag'] = [token.tag_ for token in doc]
    df['Language'] = np.nan
    df['Candidate'] = True
    df['Anglicism'] = np.nan
    return df
import spacy
from spacy.matcher import Matcher

text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# Load a model
nlp = spacy.load("en_core_web_sm")

# Initialize the matcher with the model's vocab
matcher = Matcher(nlp.vocab)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"ORTH": "iPhone"}, {"ORTH": "X"}]

# Add the pattern to the matcher. Patterns are passed as a list in second
# position; the older matcher.add(key, None, pattern) form is rejected by
# spaCy v3.
matcher.add("IPHONE_X_PATTERN", [pattern])

doc = nlp(text)

# Use the matcher on the doc
matches = matcher(doc)
print("matches: ", [doc[start:end].text for match_id, start, end in matches])

# Writing match patterns
## Mentions of full iOS versions - iOS 7, iOS 11, etc.
doc = nlp(
    "After making the iOS update you won't notice a radical system-wide redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of iOS 11's furniture remains the same as in iOS 10. But you will discover some tweaks once you delve a little deeper."
)

# The token text "iOS" followed by a token made of digits. The pattern is
# only defined here; it is not added to the matcher in this excerpt.
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]
class TextSum(object):
    """Load a spaCy model, clean article text and match patterns in it.

    ``load_dictionary`` must be called before any method that parses text or
    matches patterns; it creates ``nlp``, ``matcher`` and ``phrase_matcher``.
    """

    def __init__(self):
        self.nlp = None
        self.matcher = None
        # Defined here so the attribute exists before load_dictionary() runs.
        self.phrase_matcher = None
        self.matched_sents = []
        self.phrases_w_bacteria = None
        self.phrases_w_food = None
        self.phrases_w_pregnancy = None
        self.phrases_w_diseases = None
        self.sent_bounds = []
        self.unit_vector = []
        self.phrase_id = 0
        self.text_for_summary = ""

    def load_dictionary(self, name='en_core_sci_lg'):
        """Load a spaCy model and create the token and phrase matchers.

        Args:
            name (str, optional): Name of the model. Defaults to
                'en_core_sci_lg'.

        Returns:
            The loaded spaCy pipeline (also stored as ``self.nlp``).
        """
        print('\n\nLoading dictionary {}...'.format(name))
        self.nlp = spacy.load(name)
        self.matcher = Matcher(self.nlp.vocab)
        self.phrase_matcher = PhraseMatcher(self.nlp.vocab, attr="LOWER")
        print('Dictionary loaded successfully.')
        return self.nlp

    def _get_lemmas(self, doc):
        """Collect the lemmas of the content words of a document.

        A token is kept when it is alphabetic, is not a space, punctuation,
        stop word or number-like token, and its lemma is longer than one
        character.

        Args:
            doc (spacy.doc): Parsed spaCy document.

        Returns:
            list: Lemmas of the kept tokens, in document order.
        """
        result = []
        for token in doc:
            if not token.is_alpha:
                continue
            if (token.is_space or token.is_punct or token.is_stop
                    or token.like_num):
                continue
            if len(str(token.lemma_)) > 1:
                result.append(token.lemma_)
        return result

    def get_text(self, text_data, remove_references=True):
        """Clean the input text and parse it with the loaded pipeline.

        Args:
            text_data (str): Document text body.
            remove_references (bool, optional): Drop everything after the
                last occurrence of the word "references" (any case) and
                remove bracketed citation markers such as ``[12]``.
                Defaults to True.

        Returns:
            The object returned by ``self.nlp`` for the cleaned text, or
            None when no pipeline is loaded.
        """
        if not self.nlp:
            print('Vocabulary is not loaded.')
            return None
        print('\n\nProcessing the document...')
        if remove_references:
            # Remove article references: keep everything before the last
            # "references" heading. Only do so when the word occurs at all;
            # otherwise there is a single split and dropping it would
            # discard the entire document.
            splits = re.split("references", text_data, flags=re.IGNORECASE)
            if len(splits) > 1:
                text_data = ' '.join(splits[:-1])
            # Remove numbers (any text) in square brackets.
            text_data = re.sub(r"\[.*?\]", "", text_data)
        doc = self.nlp(text_data)  # Process document with spaCy
        print('Finished processing the document.')
        return doc

    def add_pipe(self, pipe):
        """Add a named component to the pipeline.

        Recognised names (case-insensitive): 'abbreviation', 'entitylinker',
        'segmenter', 'tokenizer' and 'textrank'. Any other name adds nothing.

        Args:
            pipe (str): Pipe name.
        """
        print('Loading Spacy pipe: {}'.format(pipe))
        pipe = pipe.lower()
        if pipe == 'abbreviation':
            # Abbreviation extraction
            abbreviation_pipe = AbbreviationDetector(self.nlp)
            self.nlp.add_pipe(abbreviation_pipe)
        elif pipe == 'entitylinker':
            # Entity linker
            linker = UmlsEntityLinker(resolve_abbreviations=True)
            self.nlp.add_pipe(linker)
        elif pipe == 'segmenter':
            # Rule segmenter
            self.nlp.add_pipe(combined_rule_sentence_segmenter, first=True)
        elif pipe == 'tokenizer':
            # Tokenizer
            self.nlp.tokenizer = combined_rule_tokenizer(self.nlp)
        elif pipe == 'textrank':
            # Textrank
            tr = pytextrank.TextRank()
            self.nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
        print('Pipe loaded.')

    def get_text_rank_summary(self, doc, limit_sentences=20, verbose=True):
        """Get an extractive summary from textrank.

        Args:
            doc (spacy.doc): Parsed spaCy document with the textrank
                extension available as ``doc._.textrank``.
            limit_sentences (int, optional): Length of summary. Defaults
                to 20.
            verbose (bool, optional): Whether to print each sentence.
                Defaults to True.

        Returns:
            str: The summary sentences, each followed by a single space.
        """
        parts = []
        for sent in doc._.textrank.summary(limit_sentences=limit_sentences):
            parts.append('{} '.format(sent))
            if verbose:
                print(sent)
        return ''.join(parts)

    def create_patterns(self, unique_keys, phrase_pattern=False):
        """Turn keywords into matcher patterns.

        Args:
            unique_keys (iterable of str): Keywords to match.
            phrase_pattern (bool, optional): When True, return one tokenized
                doc per lowercased keyword (for the phrase matcher);
                otherwise one single-token ``LOWER`` pattern per keyword.

        Returns:
            list: The patterns, in the order of ``unique_keys``.
        """
        if phrase_pattern:
            return [self.nlp.make_doc(item.lower()) for item in unique_keys]
        return [[{"LOWER": item.lower()}] for item in unique_keys]

    def collect_sents(self, matcher, doc, i, matches):
        """Record the sentence containing match ``i``.

        Used as the ``on_match`` callback of the matchers.

        Args:
            matcher (spacy.matcher): Matcher instance that found the match.
            doc (spacy.doc): spaCy document.
            i (int): Index of the current match in ``matches``.
            matches (list): All ``(match_id, start, end)`` matches.

        Returns:
            list: All matched sentences collected so far; each entry holds
            the sentence text and the match offsets relative to it.
        """
        _match_id, start, end = matches[i]
        span = doc[start:end]  # Matched span
        sent = span.sent  # Sentence containing matched span
        match_ents = [{
            "start": span.start_char - sent.start_char,
            "end": span.end_char - sent.start_char,
            "label": "MATCH",
        }]
        self.matched_sents.append({"text": sent.text, "ents": match_ents})
        return self.matched_sents

    def match_token_patterns(self, doc, pattern=None):
        """Match a list of token patterns with the document body.

        NOTE(review): the patterns are added under the same key on every
        call; whether patterns from earlier calls are meant to stay active
        should be confirmed against the callers.

        Args:
            doc (spacy.doc): spaCy document.
            pattern (list, optional): List of token patterns. Defaults to
                no patterns.

        Returns:
            list: The matches returned by the matcher.
        """
        self.matched_sents = []
        self.matcher.add("PDFTokens", self.collect_sents, *(pattern or []))
        return self.matcher(doc)

    def match_phrase_patterns(self, doc, pattern=None):
        """Match a list of phrase patterns with the document body.

        Args:
            doc (spacy.doc): spaCy document.
            pattern (list, optional): List of phrase patterns. Defaults to
                no patterns.

        Returns:
            list: The matches returned by the phrase matcher.
        """
        self.matched_sents = []
        self.phrase_matcher.add("PDFPhrases", self.collect_sents,
                                *(pattern or []))
        return self.phrase_matcher(doc)

    def get_matched_sents(self):
        """Return the sentences collected by the last matching call.

        Returns:
            list: List of matched sentences.
        """
        return self.matched_sents
import spacy
from spacy.matcher import Matcher

# Load the large English pipeline (tokenizer, tagger, parser, NER and word
# vectors) once; both demos below share the same instance.
nlp = spacy.load('en_core_web_lg')

# --- Demo 1: rule-based token matching ------------------------------------
# Two-token pattern: "corner" followed by "of", compared on the lowercase form
# so that "Corner of" matches as well.
matcher = Matcher(nlp.vocab)
cornerofpattern = [
    {'LOWER': 'corner'},
    {'LOWER': 'of'},
]
# Second positional argument is the on-match callback slot; none is used here.
matcher.add('CORNER_OF_PATTERN', None, cornerofpattern)

text = 'meet you are the Corner of chapin and 14th st. See you there!'
doc = nlp(text)
matches = matcher(doc)
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

# --- Demo 2: syntax and named entities --------------------------------------
# Process a whole document.
text = ("Meet us at 1417 Chapin St NW")
doc = nlp(text)

# Analyze syntax.
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts.
for entity in doc.ents:
    print(entity.text, entity.label_)
# Lowercased street-type words that may follow a proper noun in an address.
_STREET_SUFFIXES = [
    'ave', 'avenue', 'st', 'street', 'rd', 'road', 'dr', 'drive',
    'pkwy', 'parkway', 'bend', 'bnd', 'boulevard', 'blvd', 'court', 'ct',
    'expressway', 'expy', 'freeway', 'fwy', 'highway', 'hwy',
    'junction', 'jct', 'lane', 'ln', 'loop', 'motorway', 'mtwy',
    'point', 'pt', 'ramp', 'turnpike', 'tpke', 'tunnel', 'tunl', 'underpass',
]

# Clock-time alternatives, tried in this order inside one capturing group.
_TIME_PATTERN = "(" + "|".join((
    r"([0-9][0-9]*:[0-5][0-9]:[0-5][0-9])([\s]*[AaPp]*[Mm]*)",
    r"(24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]:[0-5][0-9])",
    r"24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]([\s]*[AaPp][Mm])",
    r"24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]",
    r"([0-9][0-9]?:[0-5][0-9]|[0-1][0-9]:[0-5][0-9])([\s]*[AaPp]*[Mm]*)",
    r"(24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]:[0-5][0-9])([\s]*[AaPp][Mm])",
    r"([0-9][0-9]*:[0-5][0-9])",
)) + ")"

# A number followed by one of these is a clock time, not a street number.
_MERIDIEM_WORDS = ("AM", "am", "pm", "PM")

# Holds the loaded spaCy pipeline so it is read from disk only once.
_NLP_CACHE = {}


def _get_nlp():
    """Return the shared ``en_core_web_sm`` pipeline, loading it on first use."""
    if "nlp" not in _NLP_CACHE:
        _NLP_CACHE["nlp"] = spacy.load("en_core_web_sm")
    return _NLP_CACHE["nlp"]


def _take_proper_nouns(tagged, pos, found):
    """Append the run of NNP words starting at ``pos`` to ``found``; return the index after it."""
    while pos < len(tagged) and tagged[pos][1] == "NNP":
        if tagged[pos][0] not in found:
            found.append(tagged[pos][0])
        pos += 1
    return pos


def _collect_after_keyword(keyword, tagged, found):
    """Collect location words that follow the first occurrence of ``keyword``.

    Takes the proper nouns right after the keyword, then an optional
    "<number> <proper noun>" pair (skipped when the proper noun is AM/PM)
    together with the proper nouns that follow that pair.
    """
    words = [word for word, _ in tagged]
    if keyword not in words:
        return
    pos = _take_proper_nouns(tagged, words.index(keyword) + 1, found)
    if pos + 1 >= len(tagged):
        return
    number, following = tagged[pos], tagged[pos + 1]
    if (number[1] == "CD" and following[1] == "NNP"
            and following[0] not in _MERIDIEM_WORDS):
        for word in (number[0], following[0]):
            if word not in found:
                found.append(word)
        _take_proper_nouns(tagged, pos + 2, found)


def task2(sentence, timestamp):
    """Extract location, time and severity from a tweet and index it.

    Prints the sentence, and for a sentence not seen before: records it in
    ``processedTweets``, plays ``Air.wav``, derives a location string (spaCy
    street pattern + Stanford NER LOCATION tokens + proper nouns following
    "on"/"at"/"AT"), a time (first clock-like substring, else the given
    ``timestamp``) and severity keywords, then indexes the record through ``es``.

    NOTE(review): assumes the duplicate check guards the entire pipeline, so
    an already-processed sentence is only printed — confirm against callers.

    Args:
        sentence: Tweet text.
        timestamp: Fallback time value used when the text contains no time.

    Returns:
        None.
    """
    print(sentence)
    if isAlredyPresent(sentence):
        return
    processedTweets.append(sentence)
    call(["aplay", "Air.wav"])

    nlp = _get_nlp()
    doc = nlp(sentence)
    tokens = nltk.word_tokenize(sentence)

    # Street name: a proper noun followed by a street-type word; the last
    # match in the sentence wins.
    matcher = Matcher(nlp.vocab)
    matcher.add("matching", None,
                [{'POS': 'PROPN'}, {'LOWER': {'IN': _STREET_SUFFIXES}}])
    span = ""
    for _, start, end in matcher(doc):
        span = doc[start:end]
    span = str(span)

    # Tokens the Stanford tagger labels as LOCATION.
    st = StanfordNERTagger('stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
                           "stanford-ner-2018-10-16/stanford-ner.jar",
                           encoding='utf-8')
    locationMatches = [word for word, label in st.tag(tokens)
                       if "LOCATION" in label]

    # Proper nouns (and street numbers) that follow a locative preposition.
    pos_sen = nltk.pos_tag(tokens)
    for keyword in ("on", "at", "AT"):
        _collect_after_keyword(keyword, pos_sen, locationMatches)

    # Drop words already contained in the street span to avoid repeating them.
    if span:
        locationMatches = [match for match in locationMatches
                           if match not in span]
    location = (span + " " + " ".join(locationMatches)).strip()

    # Time: first clock-like substring, otherwise keep the caller's timestamp.
    found_time = re.search(_TIME_PATTERN, sentence)
    if found_time:
        timestamp = found_time.group(1)
        print("Time: " + timestamp)
    else:
        print("Time : " + str(timestamp))

    severity = severity_classifier.severity_finder(sentence)
    severityStr = "".join(str(keyword) + " " for keyword in severity)
    print("Severity: " + severityStr)

    e2 = {"predictedClassLabel": "Accidental", "tweet": sentence,
          "timestamp": timestamp}
    if location:
        print("Location: " + location)
        e2["location"] = location
    e2["severity"] = severityStr
    es.index(index=indexName2, doc_type=typeName2, body=e2)
def run_prdualrank(T_0, unranked_patterns, unranked_phrases, file):
    """Build the phrase/pattern co-occurrence data from ``file`` and run ``prDualRank``.

    Every chunk produced by ``partition`` is parsed with ``nlp`` and matched
    against each pattern. For a match, the text after the pattern's leading
    non-'POS' tokens is looked up in ``unranked_phrases``; hits increment
    ``context_matrix[phrase, pattern]``.

    Args:
        T_0: Seed phrases; each one present in ``unranked_phrases`` gets an
            initial confidence of 0.0.
        unranked_patterns: Token patterns (lists of dicts) accepted by ``Matcher``.
        unranked_phrases: Candidate phrase strings.
        file: Path of the text corpus to scan.

    Returns:
        The eight values returned by ``prDualRank``, unchanged.
    """
    num_phrases = len(unranked_phrases)
    num_patterns = len(unranked_patterns)

    # With duplicate phrases, phrase2id keeps the last index (used for seeds)
    # while match lookup uses the first index, like ``list.index``.
    phrase2id = {phrase: idx for idx, phrase in enumerate(unranked_phrases)}
    id2phrase = dict(enumerate(unranked_phrases))
    first_index = {}
    for idx, phrase in enumerate(unranked_phrases):
        first_index.setdefault(phrase, idx)

    seedIdwConfidence = {idx: 0.0 for phrase, idx in phrase2id.items()
                         if phrase in T_0}

    # Number of leading tokens before the first 'POS' token of each pattern;
    # these are cut off the front of a match to obtain the phrase text.
    offsets = [
        next((k for k, token_spec in enumerate(pattern) if 'POS' in token_spec),
             len(pattern))
        for pattern in unranked_patterns
    ]

    id2patterns = defaultdict(set)
    pattern2ids = defaultdict(set)
    context_matrix = np.zeros((num_phrases, num_patterns))

    # find c (t, p)
    # The file stays open for the whole scan in case ``partition`` is lazy.
    with open(file, 'r') as f:
        matcher = Matcher(nlp.vocab)
        for chunk in partition(f):
            doc = nlp(chunk)
            for pattern_id, pattern in enumerate(unranked_patterns):
                matcher.add("extraction", None, pattern)
                for _, start, end in matcher(doc):
                    phrase_text = doc[start + offsets[pattern_id]:end].text
                    phrase_id = first_index.get(phrase_text)
                    if phrase_id is None:
                        continue
                    context_matrix[phrase_id, pattern_id] += 1
                    id2patterns[phrase_id].add(pattern_id)
                    pattern2ids[pattern_id].add(phrase_id)
                matcher.remove("extraction")

    # Support = total co-occurrence count per phrase (row) / pattern (column);
    # entries that never matched keep the integer 0.
    id2sup = {idx: 0 for idx in range(num_phrases)}
    pattern2sup = {idx: 0 for idx in range(num_patterns)}
    for phrase_id in id2patterns:
        id2sup[phrase_id] = context_matrix[phrase_id, :].sum()
    for pattern_id in pattern2ids:
        pattern2sup[pattern_id] = context_matrix[:, pattern_id].sum()

    l1, l2, l3, l4, m1, m2, m3, m4 = prDualRank(
        seedIdwConfidence, [], id2patterns, pattern2ids, {}, {}, {}, {},
        id2phrase, context_matrix.tolist(), id2sup, pattern2sup,
        FLAGS_VERBOSE=False, FLAGS_DEBUG=False)
    return l1, l2, l3, l4, m1, m2, m3, m4
# Pair every token of the document with its part-of-speech tag.
result = [(w.text, w.pos_) for w in doc]

# Phone number extraction
phone_matcher = Matcher(nlp.vocab)

# Each pattern is a list of per-token constraints; "SHAPE" compares the digit
# layout of a token and "ORTH" its exact text.
# ddd-ddd-dddd
pattern1 = [
    {"SHAPE": "ddd"},
    {"ORTH": "-"},
    {"SHAPE": "ddd"},
    {"ORTH": "-"},
    {"SHAPE": "dddd"},
]
# (ddd) ddd-dddd
pattern2 = [
    {"ORTH": "("},
    {"SHAPE": "ddd"},
    {"ORTH": ")"},
    {"SHAPE": "ddd"},
    {"ORTH": "-"},
    {"SHAPE": "dddd"},
]
# (ddd) ddd dddd
pattern3 = [
    {"ORTH": "("},
    {"SHAPE": "ddd"},
    {"ORTH": ")"},
    {"SHAPE": "ddd"},
    {"SHAPE": "dddd"},
]

# All alternatives are registered under one rule name as a list of patterns.
patterns = [pattern1, pattern2, pattern3]
phone_matcher.add("PHONE_NUMBER", patterns)

# Collect the text of every matched span, in match order.
matches = phone_matcher(doc)
phone_numbers = [doc[start:end].text for _, start, end in matches]
print(phone_numbers)
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("ja_core_news_sm")
matcher = Matcher(nlp.vocab)

doc = nlp("私には年の離れた小さい弟がいます。彼は、甘い卵焼きが好きです")

# Pattern: an adjective followed by one or two nouns, i.e. one required noun
# and an optional second noun ("OP": "?" means zero or one occurrence).
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# Add the pattern to the matcher and apply the matcher to the doc.
matcher.add("ADJ_NOUN_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the text of each span.
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
class OldPatternMatcher:
    """Rule set for spotting comparative constructions in tag sequences.

    ``add_pos_tag`` rewrites a sentence into symbols such as ``TECH``,
    ``CIN``, ``CV``, ``VB``, ``RBR`` or a raw POS tag, and ``self.matcher``
    holds token patterns whose ``ORTH`` values are those same symbols.
    NOTE(review): presumably the matcher is run on a Doc built from the
    symbol list — confirm against the caller.
    """

    # Verbs treated as comparative verbs (symbol "CV").
    cv = {
        "beat", "beats", "prefer", "prefers", "recommend", "recommends",
        "defeat", "defeats", "kill", "kills", "lead", "leads",
        "obliterate", "obliterates", "outclass", "outclasses",
        "outdo", "outdoes", "outperform", "outperforms",
        "outplay", "outplays", "overtake", "overtakes", "smack", "smacks",
        "subdue", "subdues", "surpass", "surpasses", "trump", "trumps",
        "win", "wins", "blow", "blows", "decimate", "decimates",
        "destroy", "destroys", "buy", "buys", "choose", "chooses",
        "favor", "favors", "grab", "grabs", "pick", "picks",
        "purchase", "purchases", "select", "selects", "race", "races",
        "compete", "competes", "match", "matches", "compare", "compares",
        "lose", "loses", "suck", "sucks",
    }

    # Prepositions treated as comparative prepositions (symbol "CIN").
    cin = {
        "than", "over", "beyond", "upon", "as", "against", "out", "behind",
        "under", "between", "after", "unlike", "with", "by", "opposite",
    }

    @staticmethod
    def _tag_sequence(*tags):
        """Build a token pattern from tag symbols; ``None`` is an any-token wildcard."""
        return [{} if tag is None else {'ORTH': tag} for tag in tags]

    def __init__(self):
        """Create the per-rule counters and register every rule with the matcher."""
        # One counter per rule id "0".."10", stored under string keys.
        self.count = {str(rule_id): 0 for rule_id in range(11)}
        self.compa_sent_count = 0
        self.matcher = Matcher(nlp.vocab)

        seq = self._tag_sequence
        ANY = None  # wildcard slot: matches any single token

        # NOTE(review): several sequences below are registered more than once
        # under the same rule id; they are kept exactly as they were written.
        self.matcher.add(
            0, None,
            seq('JJR', 'CIN', 'TECH'),
            seq('JJR', ANY, 'CIN', 'TECH'),
            seq('JJR', 'CIN', ANY, 'TECH'),
            seq('JJR', ANY, 'CIN', ANY, 'TECH'),
            seq('JJ', 'CIN', 'TECH'),
            seq('JJ', ANY, 'CIN', 'TECH'),
            seq('JJ', 'CIN', ANY, 'TECH'),
            seq('JJ', ANY, 'CIN', ANY, 'TECH'),
        )
        self.matcher.add(
            1, None,
            seq('VB', 'TECH', 'TO', 'VB'),
            seq('VB', 'TECH', ANY, 'TO', 'VB'),
        )
        self.matcher.add(
            8, None,
            seq('RBR', 'JJ', 'CIN', 'TECH'),
            seq('RBR', 'JJ', ANY, 'CIN', 'TECH'),
        )
        self.matcher.add(
            2, None,
            seq('CV', 'CIN', 'TECH'),
            seq('CV', ANY, 'CIN', 'TECH'),
        )
        self.matcher.add(3, None, seq('CV', 'VBG', 'TECH'))
        self.matcher.add(
            5, None,
            seq('TECH', 'VB', 'NN'),
            seq('TECH', ANY, 'VB', 'NN'),
            seq('TECH', 'VB', ANY, 'NN'),
            seq('TECH', ANY, 'VB', ANY, 'NN'),
            seq('TECH', 'VB', 'NN'),
            seq('TECH', ANY, 'VB', 'NN'),
            seq('TECH', 'VB', ANY, 'NN'),
            seq('TECH', ANY, 'VB', ANY, 'NN'),
        )
        # Rule 6 (TECH [ANY] VBZ [ANY] JJS) is not registered.
        self.matcher.add(
            7, None,
            seq('TECH', 'VB', 'JJR'),
            seq('TECH', ANY, 'VB', 'JJR'),
            seq('TECH', 'VB', ANY, 'JJR'),
            seq('TECH', 'VB', ANY, 'JJR'),
            seq('TECH', 'VBZ', ANY, ANY, 'JJR'),
            seq('TECH', ANY, 'VB', ANY, 'JJR'),
            seq('TECH', 'VB', 'JJR'),
            seq('TECH', ANY, 'VB', 'JJR'),
            seq('TECH', 'VB', ANY, 'JJR'),
            seq('TECH', 'VB', ANY, 'JJR'),
            seq('TECH', 'VB', ANY, ANY, ANY, 'JJR'),
            seq('TECH', ANY, 'VB', ANY, 'JJR'),
            seq('TECH', ANY, 'JJR'),
        )
        self.matcher.add(
            10, None,
            seq('TECH', 'VB', 'RBR'),
            seq('TECH', ANY, 'VB', 'RBR'),
            seq('TECH', 'VB', ANY, 'RBR'),
            seq('TECH', 'VB', ANY, 'RBR'),
            seq('TECH', 'VB', ANY, ANY, ANY, 'RBR'),
            seq('TECH', ANY, 'VB', ANY, 'RBR'),
            seq('TECH', 'VB', 'RBR'),
            seq('TECH', ANY, 'VB', 'RBR'),
            seq('TECH', 'VB', ANY, 'RBR'),
            seq('TECH', 'VB', ANY, 'RBR'),
            seq('TECH', 'VB', ANY, ANY, 'RBR'),
            seq('TECH', ANY, 'VB', ANY, 'RBR'),
            seq('TECH', ANY, 'RBR'),
        )
        # Rule 9 (TECH [ANY] VBZ [ANY] RBS) is not registered.
        self.matcher.add(
            11, None,
            seq('TECH', 'NP'),
            seq('TECH', ANY, 'NP'),
            seq('TECH', ANY, 'NP'),
        )

    def add_pos_tag(self, words, tech_pair):
        """Map each whitespace-separated word of ``words`` to a rule symbol.

        Checks are applied in order, first hit wins:
        preposition (tag ``IN``) listed in ``cin`` -> ``"CIN"``;
        one of the two names in ``tech_pair`` -> ``"TECH"``;
        member of the module-level ``np`` collection -> ``"NP"``;
        verb listed in ``cv`` -> ``"CV"``; any other verb -> ``"VB"``;
        any adverb tag -> ``"RBR"``; otherwise the NLTK tag itself.

        Args:
            words: Sentence text.
            tech_pair: Two whitespace-separated technology names.

        Returns:
            list: One symbol per word, in sentence order.
        """
        tagged_words = nltk.pos_tag(words.split())
        # Split once instead of once per word; only the first two names count.
        tech_names = tech_pair.split()[:2]

        tag_list = []
        for word, tag in tagged_words:
            if tag == "IN" and word in self.cin:
                symbol = "CIN"
            elif word in tech_names:
                symbol = "TECH"
            elif word in np:
                symbol = "NP"
            elif tag.startswith("VB"):
                # ``cv`` is a class attribute, so it must be read via ``self``.
                symbol = "CV" if word in self.cv else "VB"
            elif tag.startswith("RB"):
                symbol = "RBR"
            else:
                symbol = tag
            tag_list.append(symbol)
        return tag_list