def get_cause(self):
    """Extract causal sentences from ``self.news_story``.

    Registers a LEMMA pattern for each causal cue word on a spaCy 1.x
    Matcher, scans every sentence of the story, and returns the
    sentences that contain at least one cue, concatenated into one
    string (empty string when none match).

    Returns:
        str: concatenation of all matching sentences.
    """
    cause_matcher = Matcher(nlp.vocab)
    # Cue words whose lemma signals a cause/effect relation.
    # The original list contained duplicates ("as a result", "thus");
    # they only registered redundant identical patterns, so removing
    # them does not change the match results.
    # NOTE(review): multi-word entries ({LEMMA: "as a result"}) can
    # never match a single token's lemma — confirm whether these were
    # meant to be token sequences.
    CAUSUAL_WORDS = [
        "consequently", "as a result", "therefore", "as a consequence",
        "for these reason", "thus", "due", "for all these reasons",
        "because of", "because", "since", "cause", "occur", "accord",
        "after", "off", "all of a sudden",
        "coming from the opposite direction", "fell", "hit",
    ]
    for word in CAUSUAL_WORDS:
        cause_matcher.add_pattern("Causual sentence", [{LEMMA: word}])
    # Python 2 text handling: spaCy 1.x requires unicode input.
    DOCUMENT = unicode(self.news_story.decode('utf8'))
    DOC = nlp(DOCUMENT)
    CAUSUAL_SENTENCES = []
    for sent in DOC.sents:
        # Re-parse each sentence in isolation so the matcher receives
        # a standalone Doc object.
        new_sent = nlp(unicode(str(sent).decode('utf8')))
        if len(cause_matcher(new_sent)) > 0:
            CAUSUAL_SENTENCES.append(sent)
    # join() avoids the quadratic cost of repeated "+" concatenation;
    # the original appended "" for non-matches, which contributed
    # nothing to the result, so only matches are collected above.
    return "".join(str(sent) for sent in CAUSUAL_SENTENCES)
def test_get_entity_via_match(en_vocab):
    """An entity with no patterns yields no matches; once a pattern is
    added, matching returns (ent_id, label, start, end) and the stored
    entity attributes are retrievable by id."""
    matcher = Matcher(en_vocab)
    matcher.add_entity('TestEntity', attrs={u'Hello': u'World'})
    words = [u'Test', u'Entity']
    # No pattern registered yet: nothing can match.
    assert matcher.n_patterns == 0
    assert matcher(Doc(en_vocab, words=words)) == []
    matcher.add_pattern(u'TestEntity', [{ORTH: u'Test'}, {ORTH: u'Entity'}])
    assert matcher.n_patterns == 1
    results = matcher(Doc(en_vocab, words=words))
    assert len(results) == 1
    match = results[0]
    assert len(match) == 4
    ent_id, label, start, end = match
    assert ent_id == matcher.vocab.strings[u'TestEntity']
    assert (label, start, end) == (0, 0, 2)
    assert matcher.get_entity(ent_id) == {u'Hello': u'World'}
def __init__(self, nlp):
    """Build a Matcher with subject/object IOB patterns and hand it to
    the base class."""
    matcher = Matcher(nlp.vocab)
    # Shared token pattern: an entity-initial token (ENT_IOB == 3) that
    # is not number-like, any number of entity-continuation tokens
    # (ENT_IOB == 1), and an optional trailing number-like token.
    iob_pattern = [
        {a.LIKE_NUM: False, a.ENT_IOB: 3},
        {'OP': '*', a.ENT_IOB: 1},
        {'OP': '?', a.LIKE_NUM: True},
    ]
    # Entity names associate matches with their acceptor callbacks
    # (make_intersect_ar is defined elsewhere on this class).
    for name, acceptor, label in (
            ('object', self.make_intersect_ar(2, 2), Part.OBJ),
            ('subject', self.make_intersect_ar(), Part.SUBJ)):
        matcher.add_entity(name, acceptor=acceptor)
        matcher.add_pattern(name, iob_pattern, label=label)
    self.entity_rules = ['subject', 'object']
    super().__init__(matcher)
def load_age_matcher(nlp):
    """Build a Matcher that recognises age mentions.

    Handles forms such as:
        Age : 22 years / age : 22 yrs / Age 22-40
        22 yrs / 23yrs / 22-40 years / About me 22
    """
    matcher = Matcher(nlp.vocab)
    # Custom lexeme flag marking "year"-like tokens.
    is_year = FLAG63
    year_ids = {nlp.vocab.strings[w.lower()] for w in ('years', 'yrs', 'year')}
    for lexeme in nlp.vocab:
        if lexeme.lower in year_ids:
            lexeme.set_flag(is_year, True)
    # New entity type "Age" filtered by the get_age acceptor
    # (defined elsewhere in this module).
    matcher.add_entity("Age", acceptor=get_age)
    age_patterns = [
        [{LOWER: "age"}, {IS_PUNCT: True}, {IS_DIGIT: True, LENGTH: 2}],
        [{LOWER: "age"}, {IS_DIGIT: True, LENGTH: 2}],
        [{LOWER: "age"}, {IS_PUNCT: True}, {IS_DIGIT: True, LENGTH: 2},
         {IS_PUNCT: True}, {IS_DIGIT: True, LENGTH: 2}],
        [{LOWER: "age"}, {IS_DIGIT: True, LENGTH: 2},
         {IS_PUNCT: True}, {IS_DIGIT: True, LENGTH: 2}],
        [{IS_DIGIT: True, LENGTH: 2}, {is_year: True}],
        [{SUFFIX: "yrs", LENGTH: 5}],
        [{IS_DIGIT: True, LENGTH: 2}, {IS_PUNCT: True, 'OP': '?'},
         {IS_DIGIT: True, LENGTH: 2}, {is_year: True}],
        [{IS_DIGIT: True, LENGTH: 2}, {IS_ASCII: True, 'OP': '?'},
         {IS_DIGIT: True, LENGTH: 2}, {is_year: True}],
        [{LOWER: 'about'}, {LOWER: 'me', 'OP': '?'}, {IS_DIGIT: True}],
    ]
    for pattern in age_patterns:
        matcher.add_pattern("Age", pattern)
    return matcher
def load_date_matcher(nlp):
    """Build a Matcher that recognises date expressions.

    Sets four custom lexeme flags (month names, ordinal suffixes, date
    delimiters, date-digit tokens) over the whole vocab, then registers
    DATE patterns for the common orderings (month-day-year,
    day-month-year, slash-delimited, ISO-like, relative "N days ago",
    and short day/month forms). Pattern labels (1-15) distinguish the
    matched form.

    ``months_dict``, ``ordinals``, ``date_delimiters``, ``date_digits``
    and ``add_to_vocab`` are defined elsewhere in this module.
    """
    # Create matcher object with list of rules and return
    matcher = Matcher(nlp.vocab)
    # Add to vocab so the trigger tokens have lexeme entries to flag.
    add_to_vocab(nlp, months_dict.keys())
    add_to_vocab(nlp, ordinals)
    add_to_vocab(nlp, date_delimiters)
    add_to_vocab(nlp, date_digits)
    # Create flag for MONTH
    is_month = FLAG62
    month_target_ids = {
        nlp.vocab.strings[s.lower()]
        for s in months_dict.keys()
    }
    # Create flag for ORDINALS
    is_ordinal = FLAG61
    ordinal_target_ids = {nlp.vocab.strings[s.lower()] for s in ordinals}
    # Create flag for DATE_DELIMITER
    is_date_delimiter = FLAG60
    date_delimiter_target_ids = {
        nlp.vocab.strings[s.lower()]
        for s in date_delimiters
    }
    # Create flag for DIGIT
    is_date_digit = FLAG59
    date_digit_target_ids = {nlp.vocab.strings[s.lower()] for s in date_digits}
    # Add the flags: one pass over the vocab, setting each flag whose
    # target-id set contains the lexeme's lowercase string id.
    for lexeme in nlp.vocab:
        if lexeme.lower in month_target_ids:
            lexeme.set_flag(is_month, True)
        if lexeme.lower in ordinal_target_ids:
            lexeme.set_flag(is_ordinal, True)
        if lexeme.lower in date_delimiter_target_ids:
            lexeme.set_flag(is_date_delimiter, True)
        if lexeme.lower in date_digit_target_ids:
            lexeme.set_flag(is_date_digit, True)
        # Any digit token also counts as a date digit.
        if lexeme.is_digit == True:
            lexeme.set_flag(is_date_digit, True)
        # if is_date_digit_with_ordinal(lexeme.lower_):
        #     lexeme.set_flag(is_date_digit, True)
    # Add rules
    # March 25, 2017
    # March 25th, 2017
    # March 25th 2017
    # March 25 2017
    matcher.add_pattern('DATE', [
        {is_month: True},
        {is_date_digit: True},
        {is_ordinal: True, 'OP': '?'},
        {ORTH: ',', 'OP': '?'},
        {IS_DIGIT: True, LENGTH: 4},
    ], label=1)
    # 25 March, 2017
    # 25th March, 2017
    # 25th March 2017
    # 25 March 2017
    matcher.add_pattern('DATE', [
        {is_date_digit: True},
        {is_date_delimiter: True, 'OP': '?'},
        {is_month: True},
        {is_ordinal: True, 'OP': '?'},
        {ORTH: ',', 'OP': '?'},
        {IS_DIGIT: True, LENGTH: 4},
    ], label=2)
    # 25/05/2016
    matcher.add_pattern('DATE', [
        {is_date_digit: True},
        {is_date_delimiter: True, 'OP': '+'},
        {is_month: True},
        {is_date_delimiter: True, 'OP': '+'},
        {IS_DIGIT: True, LENGTH: 4},
    ], label=3)
    # 05/25/2016
    matcher.add_pattern('DATE', [
        {is_month: True},
        {is_date_delimiter: True, 'OP': '+'},
        {is_date_digit: True},
        {is_date_delimiter: True, 'OP': '+'},
        {IS_DIGIT: True, LENGTH: 4},
    ], label=4)
    # Diciembre, 2009
    # December 2009
    # (is_date_digit: False excludes numeric "months" here)
    matcher.add_pattern('DATE', [
        {is_month: True, is_date_digit: False},
        {ORTH: ','},
        {IS_DIGIT: True, LENGTH: 4},
    ], label=9)
    matcher.add_pattern('DATE', [
        {is_month: True, is_date_digit: False},
        {IS_DIGIT: True, LENGTH: 4},
    ], label=9)
    # 2013-12-04
    matcher.add_pattern('DATE', [
        {IS_DIGIT: True, LENGTH: 4},
        {is_date_delimiter: True, 'OP': '+'},
        {is_month: True},
        {is_date_delimiter: True, 'OP': '+'},
        {is_date_digit: True},
    ], label=10)
    # 9 days ago
    matcher.add_pattern('DATE', [
        {IS_DIGIT: True},
        {POS: 'NOUN'},
        {LOWER: 'ago'},
    ], label=12)
    # 1 Jul
    # 1. Jul
    # Four variants: with/without ordinal, with/without delimiter.
    matcher.add_pattern('DATE', [
        {is_date_digit: True},
        {is_ordinal: True},
        {is_date_delimiter: True},
        {is_month: True, is_date_digit: False},
    ], label=13)
    matcher.add_pattern('DATE', [
        {is_date_digit: True},
        {is_ordinal: True},
        {is_month: True, is_date_digit: False},
    ], label=13)
    matcher.add_pattern('DATE', [
        {is_date_digit: True},
        {is_date_delimiter: True},
        {is_month: True, is_date_digit: False},
    ], label=13)
    matcher.add_pattern('DATE', [
        {is_date_digit: True},
        {is_month: True, is_date_digit: False},
    ], label=13)
    # Jul 2nd
    # Four variants: with/without delimiter, with/without ordinal.
    matcher.add_pattern('DATE', [
        {is_month: True, is_date_digit: False},
        {is_date_delimiter: True},
        {is_date_digit: True},
        {is_ordinal: True},
    ], label=15)
    matcher.add_pattern('DATE', [
        {is_month: True, is_date_digit: False},
        {is_date_delimiter: True},
        {is_date_digit: True},
    ], label=15)
    matcher.add_pattern('DATE', [
        {is_month: True, is_date_digit: False},
        {is_date_digit: True},
        {is_ordinal: True},
    ], label=15)
    matcher.add_pattern('DATE', [
        {is_month: True, is_date_digit: False},
        {is_date_digit: True},
    ], label=15)
    return matcher
def load_risky_activities_matcher(nlp):
    """Build a Matcher for risky-activity mentions.

    Entities 1-4 group the patterns; `set_flag` (defined elsewhere in
    this module) marks the trigger words with custom lexeme flags.
    """
    matcher = Matcher(nlp.vocab)
    risky_activities = [
        'bareback', 'uncovered', 'bbbjtcim', 'bbbj', 'bbbjtc', 'bbbjtcws',
        'bbbjwf', 'bbfs', 'anal', 'greek', 'rca', 'swallow', 'cim',
        'choke', 'bdsm', 'bondage', 'g******g', 'hardcore',
    ]
    provider = [
        'girl', 'girls', 'model', 'models', 'staff', 'staffs', 'latina',
        'latinas', 'talent', 'talents', 'supermodel', 'supermodels',
        'princess', 'princesses',
    ]
    is_risky_activities = FLAG40
    is_provider = FLAG41
    set_flag(nlp, risky_activities, is_risky_activities)
    set_flag(nlp, provider, is_provider)
    # entity id -> token patterns, registered in the original order
    patterns_by_entity = [
        (1, [[{is_risky_activities: True}]]),
        (2, [[{LEMMA: "hardcore"}, {LEMMA: "sex"}],
             [{LEMMA: "hardcore"}, {LEMMA: "service"}]]),
        (3, [[{LEMMA: "hardcore"}]]),
        (4, [[{LEMMA: "greek"}, {IS_DIGIT: True}],
             [{LEMMA: "greek"}, {is_provider: True}],
             [{LEMMA: "if", DEP: "mark"},
              {IS_ALPHA: True, DEP: "ROOT"},
              {is_risky_activities: True}],
             [{is_risky_activities: True}, {LEMMA: "sorry"}]]),
    ]
    for entity_id, patterns in patterns_by_entity:
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)
    return matcher
def load_movement_matcher(nlp):
    """Build a Matcher that detects movement/relocation language.

    Four entities grade the evidence:
      1 - positive     2 - strong positive
      3 - negative     4 - strong negative
    ``add_to_vocab`` is defined elsewhere in this module.
    """
    matcher = Matcher(nlp.vocab)
    place = ['area', 'place', 'city', 'town']
    girl = [
        'gal', 'girl', 's**t', 'cutie', 'hottie', 'lady', 'teen',
        'teenager', 'chick', 'staff', 'gf', 'she'
    ]
    add_to_vocab(nlp, place)
    add_to_vocab(nlp, girl)
    # Custom lexeme flags for place words, girl words, and
    # capitalised tokens (prefix is an uppercase letter).
    is_place = FLAG18
    is_girl = FLAG19
    upper_start = FLAG20
    for lexeme in nlp.vocab:
        if lexeme.lower_ in place:
            lexeme.set_flag(is_place, True)
        if lexeme.lower_ in girl:
            lexeme.set_flag(is_girl, True)
        if lexeme.prefix_.isupper():
            lexeme.set_flag(upper_start, True)
    # Positive Matcher Patterns
    matcher.add_entity(1)
    matcher.add_pattern(1, [{LEMMA: "last"}, {LEMMA: "night"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{LEMMA: "leave"}, {IS_ASCII: True, ENT_TYPE: "DATE"}])
    matcher.add_pattern(1, [{LEMMA: "leave"}, {DEP: "partmod"}])
    matcher.add_pattern(1, [{LEMMA: "leave"}, {DEP: "quantmod"}])
    matcher.add_pattern(1, [{LEMMA: "leave"}, {IS_ASCII: True, ENT_TYPE: "TIME"}])
    matcher.add_pattern(1, [{LEMMA: "leave"}, {LEMMA: "in"}, {IS_ASCII: True, ENT_TYPE: "DATE"}])
    matcher.add_pattern(1, [{LEMMA: "leave"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{LEMMA: "out"}, {LEMMA: "of"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{LOWER: "outta"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{LEMMA: "lastnight"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{LEMMA: "back"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{LEMMA: "just"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{LEMMA: "day"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{LEMMA: "in"}, {LEMMA: "town"}, {LEMMA: "tonight"}])
    matcher.add_pattern(1, [{LEMMA: "in"}, {LEMMA: "town"}, {LEMMA: "through"}])
    matcher.add_pattern(1, [{LEMMA: "in"}, {LEMMA: "town"}, {LEMMA: "until"}])
    matcher.add_pattern(1, [{LEMMA: "in"}, {LEMMA: "town"}, {LEMMA: "for"}, {LEMMA: "one"}, {LEMMA: "night"}])
    matcher.add_pattern(1, [{LEMMA: "in"}, {LEMMA: "town"}, {LEMMA: "for"}, {IS_DIGIT: True}, {LEMMA: "night"}])
    matcher.add_pattern(1, [{LEMMA: "town"}, {LEMMA: "stay", DEP: "nmod"}])
    matcher.add_pattern(1, [{LEMMA: "town"}, {IS_ASCII: True}, {LEMMA: "stay", DEP: "nmod"}])
    matcher.add_pattern(1, [{LEMMA: "new"}, {LEMMA: "girl"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(1, [{LEMMA: "recent"}, {LEMMA: "move"}])
    matcher.add_pattern(1, [{LEMMA: "recently"}, {LEMMA: "move"}])
    matcher.add_pattern(1, [{LEMMA: "relocate"}])
    matcher.add_pattern(1, [{LEMMA: "new", DEP: "amod"}, {LEMMA: "city"}, {LEMMA: "to", DEP: "dep"}])
    matcher.add_pattern(1, [{LEMMA: "new", DEP: "amod"}, {IS_ASCII: True}, {LEMMA: "city"}, {IS_ASCII: True}, {LEMMA: "to", DEP: "dep"}])
    matcher.add_pattern(1, [{LEMMA: "new"}, {LEMMA: "to"}, {LEMMA: "area"}])
    matcher.add_pattern(1, [{LEMMA: "new"}, {LEMMA: "to"}, {upper_start: True}])
    matcher.add_pattern(1, [{LEMMA: "first"}, {LEMMA: "visit"}, {LEMMA: "to"}])
    matcher.add_pattern(1, [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "arrive"}])
    matcher.add_pattern(1, [{LEMMA: "girl", DEP: "nsubj"}, {LEMMA: "arrive"}, {DEP: "partmod"}])
    matcher.add_pattern(1, [{LEMMA: "girl", DEP: "nsubj"}, {IS_ASCII: True}, {LEMMA: "arrive"}, {IS_ASCII: True}, {DEP: "partmod"}])
    matcher.add_pattern(1, [{LEMMA: "girl", DEP: "nsubj"}, {LEMMA: "arrive"}, {DEP: "quantmod"}])
    matcher.add_pattern(1, [{LEMMA: "girl", DEP: "nsubj"}, {IS_ASCII: True}, {LEMMA: "arrive"}, {IS_ASCII: True}, {DEP: "quantmod"}])
    matcher.add_pattern(1, [{LEMMA: "just"}, {LEMMA: "arrive"}])
    matcher.add_pattern(1, [{LEMMA: "on"}, {LEMMA: "my"}, {LEMMA: "way"}, {LEMMA: "to"}, {TAG: "NNP"}])
    matcher.add_pattern(1, [{LEMMA: "on"}, {LEMMA: "my"}, {LEMMA: "way"}, {LEMMA: "to"}, {TAG: "NN"}])
    matcher.add_pattern(1, [{LEMMA: "on"}, {LEMMA: "the"}, {LEMMA: "way"}])
    matcher.add_pattern(1, [{LEMMA: "just"}, {LEMMA: "get"}, {LEMMA: "here"}])
    matcher.add_pattern(1, [{LEMMA: "get"}, {LEMMA: "here"}, {LEMMA: "today"}])
    matcher.add_pattern(1, [{LEMMA: "get"}, {LEMMA: "here"}, {LEMMA: "yesterday"}])
    matcher.add_pattern(1, [{LEMMA: "get"}, {LEMMA: "here"}, {LEMMA: "last"}, {LEMMA: "night"}])
    matcher.add_pattern(1, [{LEMMA: "i", DEP: "nsubj"}, {IS_ASCII: True}, {LEMMA: "visit"}, {IS_ASCII: True}, {is_place: True, DEP: "dobj"}])
    matcher.add_pattern(1, [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "visit"}, {is_place: True, DEP: "dobj"}])
    # Strong Positive Matcher Patterns
    matcher.add_entity(2)
    matcher.add_pattern(2, [{LEMMA: "new"}, {IS_ASCII: True}, {LEMMA: "in"}, {is_place: True}])
    matcher.add_pattern(2, [{LEMMA: "new"}, {IS_ASCII: True}, {IS_ASCII: True}, {LEMMA: "in"}, {is_place: True}])
    matcher.add_pattern(2, [{LEMMA: "im"}, {LEMMA: "new"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(2, [{LEMMA: "new"}, {LEMMA: "in"}, {is_place: True}])
    matcher.add_pattern(2, [{LEMMA: "new"}, {LEMMA: "to"}, {is_place: True}])
    matcher.add_pattern(2, [{LEMMA: "new"}, {is_girl: True}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(2, [{LEMMA: "new"}, {LEMMA: "to"}, {upper_start: True}, {LEMMA: "area"}])
    # Negative Matcher Patterns
    matcher.add_entity(3)
    matcher.add_pattern(3, [{LEMMA: "new"}])
    matcher.add_pattern(3, [{LEMMA: "girl"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(3, [{LEMMA: "grand"}, {LEMMA: "new"}])
    matcher.add_pattern(3, [{LEMMA: "new"}, {LEMMA: "at"}])
    matcher.add_pattern(3, [{LEMMA: "new"}, {LEMMA: "to"}, {LEMMA: "business"}])
    matcher.add_pattern(3, [{LEMMA: "new"}, {LEMMA: "to"}, {LEMMA: "industry"}])
    matcher.add_pattern(3, [{LEMMA: "new"}, {LEMMA: "to"}, {LEMMA: "scenario"}])
    matcher.add_pattern(3, [{LEMMA: "dream", DEP: "nsubj"}, {LEMMA: "arrive"}])
    matcher.add_pattern(3, [{LEMMA: "fantasy", DEP: "nsubj"}, {LEMMA: "arrive"}])
    matcher.add_pattern(3, [{LEMMA: "you", DEP: "nsubj"}, {LEMMA: "arrive"}])
    matcher.add_pattern(3, [{LEMMA: "area"}, {LEMMA: "only"}])
    matcher.add_pattern(3, [{upper_start: True}, {LEMMA: "area"}])
    matcher.add_pattern(3, [{LEMMA: "you", DEP: "nsubj"}, {LEMMA: "leave"}])
    matcher.add_pattern(3, [{LEMMA: "it", DEP: "dobj"}, {LEMMA: "leave"}, {IS_ASCII: True, DEP: "nmod", TAG: "TO"}])
    matcher.add_pattern(3, [{LEMMA: "that", DEP: "dobj"}, {LEMMA: "leave"}, {IS_ASCII: True, DEP: "nmod", TAG: "TO"}])
    matcher.add_pattern(3, [{LEMMA: "best"}, {LEMMA: "move"}])
    matcher.add_pattern(3, [{LEMMA: "next"}, {LEMMA: "move"}])
    matcher.add_pattern(3, [{LEMMA: "arrive"}, {IS_ASCII: True}, {IS_ASCII: True, DEP: "xcomp"}])
    matcher.add_pattern(3, [{LEMMA: "arrive"}, {IS_ASCII: True, DEP: "xcomp"}])
    matcher.add_pattern(3, [{LEMMA: "visit"}, {LEMMA: "sister", DEP: "dobj"}])
    matcher.add_pattern(3, [{LEMMA: "visit"}, {IS_ASCII: True}, {LEMMA: "sister", DEP: "dobj"}])
    matcher.add_pattern(3, [{LEMMA: "visit"}, {LEMMA: "family", DEP: "dobj"}])
    matcher.add_pattern(3, [{LEMMA: "visit"}, {IS_ASCII: True}, {LEMMA: "family", DEP: "dobj"}])
    matcher.add_pattern(3, [{LEMMA: "we", DEP: "poss"}, {LEMMA: "visit"}])
    # Strong Negative Matcher Patterns
    matcher.add_entity(4)
    matcher.add_pattern(4, [{LEMMA: "town"}, {LEMMA: "girl"}])
    matcher.add_pattern(4, [{LEMMA: "on"}, {LEMMA: "the"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "near"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "down"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "town"}, {LEMMA: "hall"}])
    # "best ... in ... town" with 0-2 filler tokens on each side.
    matcher.add_pattern(4, [{LEMMA: "best"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "best"}, {IS_ASCII: True}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "best"}, {IS_ASCII: True}, {IS_ASCII: True}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "best"}, {LEMMA: "in"}, {IS_ASCII: True}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "best"}, {IS_ASCII: True}, {LEMMA: "in"}, {IS_ASCII: True}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "best"}, {IS_ASCII: True}, {IS_ASCII: True}, {LEMMA: "in"}, {IS_ASCII: True}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "not"}, {LEMMA: "new"}, {LEMMA: "in"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "not"}, {LEMMA: "new"}, {LEMMA: "to"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "not"}, {LEMMA: "leave"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "leave"}, {LEMMA: "you", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "new"}, {LEMMA: "but"}])
    matcher.add_pattern(4, [{LEMMA: "new"}, {LEMMA: "backpage", DEP: "nmod", TAG: "TO"}])
    matcher.add_pattern(4, [{LEMMA: "new"}, {IS_ASCII: True}, {LEMMA: "backpage", DEP: "nmod", TAG: "TO"}])
    matcher.add_pattern(4, [{LEMMA: "new"}, {LEMMA: "bp", DEP: "nmod", TAG: "TO"}])
    matcher.add_pattern(4, [{LEMMA: "new"}, {IS_ASCII: True}, {LEMMA: "bp", DEP: "nmod", TAG: "TO"}])
    #DS
    # "leave <something>" idioms that do not indicate movement.
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "message", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "msg", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "txt", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "text", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "impression", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "voicemail", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "smile", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "message", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "msg", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "txt", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "text", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "impression", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "voicemail", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "smile", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "satisfied"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "memory", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {IS_ASCII: True}, {LEMMA: "memory", DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "you"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "u"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "with"}])
    matcher.add_pattern(4, [{LEMMA: "leave"}, {LEMMA: "a"}, {LEMMA: "gentleman"}])
    matcher.add_pattern(4, [{LEMMA: "or"}, {LEMMA: "leave"}])
    matcher.add_pattern(4, [{LEMMA: "or"}, {LEMMA: "i"}, {LEMMA: "leave"}])
    matcher.add_pattern(4, [{LEMMA: "move"}, {LEMMA: "on"}])
    matcher.add_pattern(4, [{LEMMA: "i"}, {LEMMA: "move"}, {LEMMA: "like"}])
    matcher.add_pattern(4, [{LEMMA: "arrive"}, {LEMMA: "on"}, {LEMMA: "time"}])
    matcher.add_pattern(4, [{LEMMA: "can"}, {LEMMA: "move"}])
    # NOTE(review): duplicate of the "new"/"but" pattern registered
    # above for entity 4 — harmless but redundant.
    matcher.add_pattern(4, [{LEMMA: "new"}, {LEMMA: "but"}])
    matcher.add_pattern(4, [{LEMMA: "on"}, {LEMMA: "my"}, {LEMMA: "way"}, {LEMMA: "to"}, {TAG: "PRP"}])
    matcher.add_pattern(4, [{LEMMA: "u"}, {LEMMA: "get"}, {LEMMA: "here"}])
    matcher.add_pattern(4, [{LEMMA: "you"}, {LEMMA: "get"}, {LEMMA: "here"}])
    matcher.add_pattern(4, [{LEMMA: "go"}, {LEMMA: "to"}, {LEMMA: "town"}])
    matcher.add_pattern(4, [{LEMMA: "new"}, {LEMMA: "management"}])
    return matcher
def load_multi_girl_matcher(nlp):
    """Build a Matcher for multi-girl offer language.

    Entity 1 holds positive patterns; entity 4 holds negative
    (single-girl / irrelevant) patterns. `set_flag` is defined
    elsewhere in this module.
    """
    matcher = Matcher(nlp.vocab)
    multi_num = [
        'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
        'ten', 'double', 'triple'
    ] + [str(x) for x in range(2, 11)]
    girl = [
        'gal', 'girl', 's**t', 'cutie', 'hottie', 'lady', 'teen',
        'teenager', 'chick', 'staff', 'gf', 'she'
    ]
    show = ['show', 'special', 'session', 'fantasy']
    dict_and = ['and', 'an', 'n', '&']
    is_multi_num = FLAG30
    is_girl = FLAG31
    is_show = FLAG33
    is_and = FLAG34
    set_flag(nlp, multi_num, is_multi_num)
    set_flag(nlp, girl, is_girl)
    set_flag(nlp, show, is_show)
    set_flag(nlp, dict_and, is_and)
    positive_patterns = [
        [{is_multi_num: True}, {is_girl: True, TAG: "NNS"}],
        [{is_multi_num: True}, {is_girl: True, TAG: "NNPS"}],
        [{LOWER: "duo"}],
        [{LOWER: "2"}, {ORTH: "-"}, {LOWER: "for"}, {ORTH: "-"}, {LOWER: "1"}],
        [{LEMMA: "double"}, {ORTH: "-"}, {LEMMA: "session"}],
        [{LEMMA: "three"}, {ORTH: "-"}, {LEMMA: "way"}],
        [{is_multi_num: True}, {LOWER: "for"}, {ORTH: "1"}],
        [{is_multi_num: True}, {LOWER: "for"}, {ORTH: "one"}],
        [{LEMMA: "double"}, {is_show: True}],
        [{is_multi_num: True}, {LOWER: "way"}],
    ]
    negative_patterns = [
        [{LOWER: "a"}, {is_girl: True}],
        [{LOWER: "how"}, {is_girl: True}],
        [{LOWER: "for"}, {is_girl: True}],
        [{IS_ALPHA: True, DEP: "nmod"}, {is_girl: True}],
        [{is_girl: True}, {is_and: True}, {LEMMA: "gentleman"}],
        [{is_girl: True}, {ORTH: "&"}, {ORTH: "&"}, {LEMMA: "gentleman"}],
        [{is_girl: True}, {is_and: True}, {LEMMA: "guy"}],
        [{is_girl: True}, {ORTH: "&"}, {ORTH: "&"}, {LEMMA: "guy"}],
        [{LEMMA: "gentleman"}, {is_and: True}, {is_girl: True}],
        [{LEMMA: "gentleman"}, {ORTH: "&"}, {ORTH: "&"}, {is_girl: True}],
        [{LEMMA: "guy"}, {is_and: True}, {is_girl: True}],
        [{LEMMA: "guy"}, {ORTH: "&"}, {ORTH: "&"}, {is_girl: True}],
        [{LOWER: "she"}],
    ]
    matcher.add_entity(1)
    for pattern in positive_patterns:
        matcher.add_pattern(1, pattern)
    matcher.add_entity(4)
    for pattern in negative_patterns:
        matcher.add_pattern(4, pattern)
    return matcher
def load_credit_card_matcher(nlp):
    """Build a Matcher for credit-card / payment-method mentions.

    Entities:
      1 - payment-method lists ("visa/mc", "american express", ...)
      2 - explicit acceptance ("accept visa", "payment: mc")
      3 - weak/ambiguous context ("at visa", "visa versa")
      4 - negations and non-payment "visa" senses (travel visas)
    ``set_flag`` is defined elsewhere in this module.
    """
    matcher = Matcher(nlp.vocab)
    payment = [
        'visa', 'mastercard', 'masterc', 'mc', 'mcard', 'cash', 'csh',
        'discover', 'amex', 'interac', 'jcb'
    ]
    visa_type = [
        'us', 'american', 'canadian', 'student', 'online', 'transit',
        'need', 'make', 'f1', 'temp', 'temporary', 'permanent', 'visitor',
        'visit', 'visiting'
    ]
    is_payment = FLAG40
    is_visa_type = FLAG41
    set_flag(nlp, payment, is_payment)
    set_flag(nlp, visa_type, is_visa_type)
    matcher.add_entity(1)
    # Two payment methods joined by "/", "," or "and".
    matcher.add_pattern(1, [{is_payment: True}, {ORTH: "/"}, {is_payment: True}])
    matcher.add_pattern(1, [{is_payment: True}, {ORTH: ","}, {is_payment: True}])
    matcher.add_pattern(1, [{is_payment: True}, {LEMMA: "and"}, {is_payment: True}])
    # Payment method joined with "american express".
    matcher.add_pattern(1, [{is_payment: True}, {ORTH: "/"}, {LOWER: "american"}, {LOWER: "express"}])
    matcher.add_pattern(1, [{is_payment: True}, {ORTH: ","}, {LOWER: "american"}, {LOWER: "express"}])
    matcher.add_pattern(1, [{is_payment: True}, {ORTH: "&"}, {LOWER: "american"}, {LOWER: "express"}])
    matcher.add_pattern(1, [{is_payment: True}, {LEMMA: "and"}, {LOWER: "american"}, {LOWER: "express"}])
    # "m/card" spellings of MasterCard, in the same combinations.
    matcher.add_pattern(1, [{LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}, {ORTH: "/"}, {LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}])
    matcher.add_pattern(1, [{LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}, {ORTH: ","}, {LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}])
    matcher.add_pattern(1, [{LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}, {LEMMA: "and"}, {LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}])
    matcher.add_pattern(1, [{LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}, {ORTH: "/"}, {LOWER: "american"}, {LOWER: "express"}])
    matcher.add_pattern(1, [{LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}, {ORTH: ","}, {LOWER: "american"}, {LOWER: "express"}])
    matcher.add_pattern(1, [{LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}, {ORTH: "&"}, {LOWER: "american"}, {LOWER: "express"}])
    matcher.add_pattern(1, [{LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}, {LEMMA: "and"}, {LOWER: "american"}, {LOWER: "express"}])
    matcher.add_pattern(1, [{LOWER: "american"}, {LOWER: "express"}])
    matcher.add_pattern(1, [{LEMMA: "diners"}, {LEMMA: "club"}, {LEMMA: "internacional"}])
    matcher.add_pattern(1, [{LOWER: "union"}, {LOWER: "pay"}])
    matcher.add_pattern(1, [{LEMMA: "credit"}, {LEMMA: "card"}])
    matcher.add_pattern(1, [{LEMMA: "creditcard"}])
    matcher.add_entity(2)
    matcher.add_pattern(2, [{LEMMA: "accept"}, {LEMMA: "card"}])
    matcher.add_pattern(2, [{LEMMA: "accept"}, {is_payment: True}])
    matcher.add_pattern(2, [{LEMMA: "accept"}, {LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}])
    matcher.add_pattern(2, [{LEMMA: "accept"}, {ORTH: ":"}, {is_payment: True}])
    matcher.add_pattern(2, [{LEMMA: "payment"}, {ORTH: ":"}, {is_payment: True}])
    matcher.add_pattern(2, [{LEMMA: "accept"}, {ORTH: ":"}, {LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}])
    matcher.add_pattern(2, [{LEMMA: "payment"}, {ORTH: ":"}, {LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}])
    matcher.add_entity(3)
    matcher.add_pattern(3, [{LEMMA: "at"}, {is_payment: True}])
    matcher.add_pattern(3, [{LEMMA: "at"}, {LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}])
    matcher.add_pattern(3, [{LEMMA: "visa"}, {LEMMA: "versa"}])
    matcher.add_entity(4)
    # Negations of card acceptance.
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "credit"}, {LEMMA: "card"}])
    matcher.add_pattern(4, [{LEMMA: "credit"}, {LEMMA: "card"}, {DEP: "neg"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "creditcard"}])
    matcher.add_pattern(4, [{LEMMA: "no"}, {LEMMA: "credit"}, {LEMMA: "card"}])
    matcher.add_pattern(4, [{LEMMA: "no"}, {LEMMA: "creditcard"}])
    matcher.add_pattern(4, [{LEMMA: "not"}, {IS_ASCII: True}, {LEMMA: "credit"}, {LEMMA: "card"}])
    matcher.add_pattern(4, [{LEMMA: "not"}, {IS_ASCII: True}, {LEMMA: "creditcard"}])
    # "visa" in a travel/immigration sense rather than the card brand.
    matcher.add_pattern(4, [{is_visa_type: True}, {LEMMA: "visa"}])
    matcher.add_pattern(4, [{LEMMA: "visa"}, {LEMMA: "student"}])
    matcher.add_pattern(4, [{LEMMA: "rent"}, {LEMMA: "and"}, {LEMMA: "visa"}])
    matcher.add_pattern(4, [{LEMMA: "rent"}, {LEMMA: "and"}, {LEMMA: "credit"}])
    matcher.add_pattern(4, [{LEMMA: "visa"}, {LEMMA: "and"}, {LEMMA: "rent"}])
    matcher.add_pattern(4, [{LEMMA: "card"}, {LEMMA: "and"}, {LEMMA: "rent"}])
    matcher.add_pattern(4, [{LEMMA: "apply", DEP: "ROOT"}, {LEMMA: "for", DEP: "prep"}, {LEMMA: "visa"}])
    matcher.add_pattern(4, [{LEMMA: "apply", DEP: "ROOT"}, {LEMMA: "visa"}])
    matcher.add_pattern(4, [{LEMMA: "arrival", DEP: "ROOT"}, {LEMMA: "visa"}])
    return matcher
def load_outcall_matcher(nlp):
    """Build a Matcher for outcall-service mentions.

    Entities:
      1 - positive ("outcall", "come to you", "will travel", ...)
      2 - strong positive ("outcall only", "your place", ...)
      3 - weak location context
      4 - negations and non-outcall senses ("no outcall", "my place")
    ``add_to_vocab`` is defined elsewhere in this module.
    """
    matcher = Matcher(nlp.vocab)
    location = ['location', 'place', 'studio', 'apartment', 'home', 'house', 'hotel']
    add_to_vocab(nlp, location)
    location_ids = {nlp.vocab.strings[s.lower()] for s in location}
    hyphen_id = nlp.vocab.strings['-']
    ampersand_id = nlp.vocab.strings['&']
    # Custom lexeme flags for hyphen, ampersand and location tokens.
    is_hyphen = FLAG23
    is_ampersand = FLAG24
    is_location = FLAG25
    for lexeme in nlp.vocab:
        if lexeme.lower == hyphen_id:
            lexeme.set_flag(is_hyphen, True)
        if lexeme.lower == ampersand_id:
            lexeme.set_flag(is_ampersand, True)
        if lexeme.lower in location_ids:
            lexeme.set_flag(is_location, True)
    matcher.add_entity(1)
    matcher.add_pattern(1, [{LEMMA: "outcall"}])
    matcher.add_pattern(1, [{LEMMA: "out"}, {LEMMA: "call"}])
    matcher.add_pattern(1, [{LEMMA: "out"}, {is_hyphen: True}, {LEMMA: "call"}])
    matcher.add_pattern(1, [{LEMMA: "your"}, {is_location: True}])
    matcher.add_pattern(1, [{LEMMA: "out"}, {LEMMA: "and"}, {LEMMA: "in"}, {LEMMA: "call"}])
    matcher.add_pattern(1, [{LEMMA: "out"}, {is_ampersand: True}, {LEMMA: "in"}, {LEMMA: "call"}])
    matcher.add_pattern(1, [{LEMMA: "visit"}, {LEMMA: "you"}])
    matcher.add_pattern(1, [{LEMMA: "mind"}, {LEMMA: "travel"}])
    matcher.add_pattern(1, [{LEMMA: "anywhere"}, {LEMMA: "and"}, {LEMMA: "everywhere"}])
    matcher.add_pattern(1, [{LEMMA: "prefer"}, {LEMMA: "residence"}])
    matcher.add_pattern(1, [{LEMMA: "prefer"}, {LEMMA: "hotel"}])
    matcher.add_pattern(1, [{LEMMA: "come"}, {LEMMA: "to"}, {LEMMA: "you"}])
    matcher.add_pattern(1, [{LEMMA: "will"}, {LEMMA: "travel"}])
    matcher.add_entity(2)
    matcher.add_pattern(2, [{LEMMA: "outcall"}, {LEMMA: "only"}])
    matcher.add_pattern(2, [{LEMMA: "out"}, {LEMMA: "call"}, {LEMMA: "only"}])
    matcher.add_pattern(2, [{LEMMA: "out"}, {is_hyphen: True}, {LEMMA: "call"}, {LEMMA: "only"}])
    # "your <place>" with the possessive parsed as amod or poss,
    # optionally with one filler token between.
    matcher.add_pattern(2, [{LEMMA: "your", DEP: "amod"}, {is_location: True}])
    matcher.add_pattern(2, [{LEMMA: "your", DEP: "amod"}, {IS_ASCII: True}, {is_location: True}])
    matcher.add_pattern(2, [{LEMMA: "your", DEP: "poss"}, {is_location: True}])
    matcher.add_pattern(2, [{LEMMA: "your", DEP: "poss"}, {IS_ASCII: True}, {is_location: True}])
    matcher.add_entity(3)
    matcher.add_pattern(3, [{is_location: True}])
    matcher.add_pattern(3, [{LEMMA: "place"}])
    matcher.add_pattern(3, [{LEMMA: "be"}, {LEMMA: "place"}])
    matcher.add_pattern(3, [{LEMMA: "is"}, {LEMMA: "place"}])
    matcher.add_entity(4)
    matcher.add_pattern(4, [{LEMMA: "house"}, {LEMMA: "wives"}])
    # Conditional "if ... have <object>" phrasing (incall invitation).
    matcher.add_pattern(4, [{LEMMA: "if", DEP: "mark"}, {LEMMA: "have"}, {IS_ASCII: True, DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "if", DEP: "mark"}, {IS_ASCII: True}, {LEMMA: "have"}, {IS_ASCII: True}, {IS_ASCII: True, DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "if", DEP: "mark"}, {LEMMA: "have"}, {IS_ASCII: True}, {IS_ASCII: True, DEP: "dobj"}])
    matcher.add_pattern(4, [{LEMMA: "my", DEP: "poss"}, {is_location: True}])
    matcher.add_pattern(4, [{LEMMA: "my", DEP: "poss"}, {IS_ASCII: True}, {is_location: True}])
    matcher.add_pattern(4, [{LEMMA: "no"}, {LEMMA: "outcall"}])
    matcher.add_pattern(4, [{LEMMA: "no"}, {LEMMA: "out"}, {LEMMA: "call"}])
    matcher.add_pattern(4, [{LEMMA: "no"}, {LEMMA: "out"}, {is_hyphen: True}, {LEMMA: "call"}])
    matcher.add_pattern(4, [{IS_ASCII: True, DEP: "neg"}, {LEMMA: "outcall"}])
    matcher.add_pattern(4, [{IS_ASCII: True, DEP: "neg"}, {LEMMA: "out"}, {LEMMA: "call"}])
    matcher.add_pattern(4, [{IS_ASCII: True, DEP: "neg"}, {LEMMA: "out"}, {is_hyphen: True}, {LEMMA: "call"}])
    matcher.add_pattern(4, [{IS_ASCII: True, DEP: "neg"}, {IS_ASCII: True}, {LEMMA: "outcall"}])
    matcher.add_pattern(4, [{IS_ASCII: True, DEP: "neg"}, {IS_ASCII: True}, {LEMMA: "out"}, {LEMMA: "call"}])
    matcher.add_pattern(4, [{IS_ASCII: True, DEP: "neg"}, {IS_ASCII: True}, {LEMMA: "out"}, {is_hyphen: True}, {LEMMA: "call"}])
    matcher.add_pattern(4, [{LEMMA: "visit"}, {LEMMA: "your"}, {LEMMA: "city"}])
    matcher.add_pattern(4, [{IS_ASCII: True}, {LEMMA: "miss"}, {LEMMA: "out"}])
    # NOTE(review): duplicate of the "no out call" pattern registered
    # above for entity 4 — harmless but redundant.
    matcher.add_pattern(4, [{LEMMA: "no"}, {LEMMA: "out"}, {LEMMA: "call"}])
    matcher.add_pattern(4, [{IS_ASCII: True, DEP: "dep"}, {LEMMA: "no"}])
    return matcher
def load_address_matcher(nlp):
    """Build a Matcher that recognises street addresses.

    Flags street-type words (``street``), separators (``separator``)
    and plausible street-name tokens (alphabetic, numeric, or listed in
    ``street_name``), then generates ADDRESS patterns for
    "<number> <name...> <street-type>" with 0-2 filler tokens and for
    "<addr> <sep> <addr>" intersections.

    NOTE(review): ``separator`` is flagged below but never passed to
    ``add_to_vocab`` (unlike ``street`` and ``street_name``) — confirm
    separator tokens are already in the vocab, otherwise the flag may
    never be set for them.
    """
    # Create matcher object with list of rules and return
    matcher = Matcher(nlp.vocab)
    # Add to vocab
    add_to_vocab(nlp, street)
    add_to_vocab(nlp, street_name)
    # Create flag for street-type words (e.g. "st", "ave").
    is_street = FLAG58
    street_ids = {nlp.vocab.strings[s.lower()] for s in street}
    is_separator = FLAG57
    separator_ids = {nlp.vocab.strings[s.lower()] for s in separator}
    is_street_name = FLAG56
    street_name_ids = {nlp.vocab.strings[s.lower()] for s in street_name}
    # Add the flags: any alphabetic or number-like token counts as a
    # possible street-name token, in addition to the explicit list.
    for lexeme in nlp.vocab:
        if lexeme.lower in street_ids:
            lexeme.set_flag(is_street, True)
        if lexeme.lower in separator_ids:
            lexeme.set_flag(is_separator, True)
        if lexeme.is_alpha:
            lexeme.set_flag(is_street_name, True)
        if lexeme.like_num:
            lexeme.set_flag(is_street_name, True)
        if lexeme.lower in street_name_ids:
            lexeme.set_flag(is_street_name, True)
    # Add rules: two interchangeable "name token" specs.
    street_name_rules = [{is_street_name: True}, {IS_ALPHA: True}]
    for street_name_rule in street_name_rules:
        # House numbers of length 1-5 digits.
        for length in range(1, 6):
            # direct address: number (+0-2 filler words) name street-type
            matcher.add_pattern('ADDRESS', [
                {LIKE_NUM: True, LENGTH: length},
                street_name_rule,
                {is_street: True},
            ])
            matcher.add_pattern('ADDRESS', [
                {LIKE_NUM: True, LENGTH: length},
                {IS_ALPHA: True},
                street_name_rule,
                {is_street: True},
            ])
            matcher.add_pattern('ADDRESS', [
                {LIKE_NUM: True, LENGTH: length},
                {IS_ALPHA: True},
                {IS_ALPHA: True},
                street_name_rule,
                {is_street: True},
            ])
        # Add and filter out matches to return longest match
        matcher.add_pattern('ADDRESS', [street_name_rule, {is_street: True}])
    # two street rules: "X street <sep> Y street" intersections,
    # with and without leading house numbers.
    for street_name_rule1 in street_name_rules:
        for street_name_rule2 in street_name_rules:
            matcher.add_pattern('ADDRESS', [
                {LIKE_NUM: True},
                street_name_rule1,
                {is_street: True},
                {is_separator: True},
                {LIKE_NUM: True},
                street_name_rule2,
                {is_street: True},
            ])
            matcher.add_pattern('ADDRESS', [
                street_name_rule1,
                {is_street: True},
                {is_separator: True},
                street_name_rule2,
                {is_street: True},
            ])
    return matcher
def load_hotel_matcher(nlp):
    """Build a spaCy v1 Matcher for hotel mentions.

    Entity 1 fires on any hotel-word token; entity 4 fires on
    contexts where a hotel word is a false positive (negations,
    "inn & out" call phrasing, "come inn", etc.).
    """
    matcher = Matcher(nlp.vocab)

    hotel_words = ['hotel', 'motel', 'inn', 'hotels', 'motels', 'inns']
    and_words = ['and', 'n', 'an', 'nd', '&', '/']
    is_hotel = FLAG29
    is_and = FLAG30
    set_flag(nlp, hotel_words, is_hotel)
    set_flag(nlp, and_words, is_and)

    # Entity 1: a plain hotel mention.
    matcher.add_entity(1)
    matcher.add_pattern(1, [{is_hotel: True}])

    # Entity 4: contexts that should NOT count as hotel mentions.
    matcher.add_entity(4)
    false_positive_patterns = [
        [{LEMMA: "inn"}, {is_and: True}, {LOWER: "out"}],
        [{LEMMA: "inn"}, {is_and: True}, {LOWER: "outcall"}],
        [{LEMMA: "inn"}, {LOWER: "call"}],
        [{LEMMA: "inn"}, {is_and: True}, {LOWER: "outcalls"}],
        [{LEMMA: "inn"}, {LOWER: "calls"}],
        [{LEMMA: "inn"}, {IS_DIGIT: True}],
        [{LEMMA: "come"}, {LEMMA: "inn"}],
        [{LEMMA: "inn"}, {LEMMA: "town"}],
        [{LOWER: "no"}, {is_hotel: True}],
        [{DEP: "neg"}, {is_hotel: True}],
        [{DEP: "neg"}, {IS_ALPHA: True}, {is_hotel: True}],
    ]
    for pattern in false_positive_patterns:
        matcher.add_pattern(4, pattern)

    return matcher
def load_incall_matcher(nlp):
    """Build a spaCy v1 Matcher for "incall" mentions.

    Entities:
      1 - generic incall mention ("incall", "in call", "in-call", ...)
      2 - strong incall signal ("incall only", "private location", ...)
      3 - weak location mention ("location", "place", ...)
      4 - false-positive / negated contexts ("no incall", "your place", ...)
    """
    matcher = Matcher(nlp.vocab)

    location = ['location', 'place', 'studio', 'apartment', 'home', 'house']
    private = ['private', 'discreet', 'discrete']
    clean = ['clean', 'nice', 'lovely']
    # Ensure the keywords have lexeme entries so the flag loop below sees them.
    add_to_vocab(nlp, location)
    add_to_vocab(nlp, private)
    add_to_vocab(nlp, clean)

    location_ids = {nlp.vocab.strings[s.lower()] for s in location}
    private_ids = {nlp.vocab.strings[s.lower()] for s in private}
    clean_ids = {nlp.vocab.strings[s.lower()] for s in clean}
    hyphen_id = nlp.vocab.strings['-']
    ampersand_id = nlp.vocab.strings['&']

    # Binary lexeme flags used inside the token patterns.
    is_hyphen = FLAG23
    is_ampersand = FLAG24
    is_location = FLAG25
    is_private = FLAG26
    is_clean = FLAG27
    for lexeme in nlp.vocab:
        if lexeme.lower == hyphen_id:
            lexeme.set_flag(is_hyphen, True)
        if lexeme.lower == ampersand_id:
            lexeme.set_flag(is_ampersand, True)
        if lexeme.lower in location_ids:
            lexeme.set_flag(is_location, True)
        if lexeme.lower in private_ids:
            lexeme.set_flag(is_private, True)
        if lexeme.lower in clean_ids:
            lexeme.set_flag(is_clean, True)

    # Entity 1: any surface form of "incall".
    matcher.add_entity(1)
    matcher.add_pattern(1, [{LEMMA: "incall"}])
    matcher.add_pattern(1, [{LEMMA: "in"}, {LEMMA: "call"}])
    matcher.add_pattern(1, [{LEMMA: "in"}, {is_hyphen: True}, {LEMMA: "call"}])
    matcher.add_pattern(1, [{
        LEMMA: "in"
    }, {
        LEMMA: "and"
    }, {
        LEMMA: "out"
    }, {
        LEMMA: "call"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "in"
    }, {
        is_ampersand: True
    }, {
        LEMMA: "out"
    }, {
        LEMMA: "call"
    }])
    # "visit me" — LEMMA "i" covers me/my; presumably targets "visit me".
    matcher.add_pattern(1, [{LEMMA: "visit"}, {LEMMA: "i"}])

    # Entity 2: strong incall evidence ("... only", private/clean/my location).
    matcher.add_entity(2)
    matcher.add_pattern(2, [{LEMMA: "incall"}, {LEMMA: "only"}])
    matcher.add_pattern(2, [{LEMMA: "in"}, {LEMMA: "call"}, {LEMMA: "only"}])
    matcher.add_pattern(2, [{
        LEMMA: "in"
    }, {
        is_hyphen: True
    }, {
        LEMMA: "call"
    }, {
        LEMMA: "only"
    }])
    matcher.add_pattern(2, [{
        is_private: True, DEP: "amod"
    }, {
        is_location: True
    }])
    matcher.add_pattern(2, [{
        is_private: True, DEP: "amod"
    }, {
        IS_ASCII: True
    }, {
        is_location: True
    }])
    matcher.add_pattern(2, [{is_clean: True}, {is_location: True}])
    matcher.add_pattern(2, [{LEMMA: "my", DEP: "poss"}, {is_location: True}])
    matcher.add_pattern(2, [{
        LEMMA: "my", DEP: "poss"
    }, {
        IS_ASCII: True
    }, {
        is_location: True
    }])

    # Entity 3: weak location-only evidence.
    matcher.add_entity(3)
    matcher.add_pattern(3, [{LEMMA: "location"}])
    matcher.add_pattern(3, [{LEMMA: "place"}])
    matcher.add_pattern(3, [{LEMMA: "be"}, {LEMMA: "place"}])
    # NOTE(review): "is" lemmatises to "be", so this pattern looks
    # redundant with the previous one — confirm before relying on it.
    matcher.add_pattern(3, [{LEMMA: "is"}, {LEMMA: "place"}])

    # Entity 4: negations and false-positive contexts.
    matcher.add_entity(4)
    matcher.add_pattern(4, [{LEMMA: "house"}, {LEMMA: "wife"}])
    matcher.add_pattern(4, [{LOWER: "your", DEP: "poss"}, {is_location: True}])
    matcher.add_pattern(4, [{
        LOWER: "your", DEP: "poss"
    }, {
        IS_ASCII: True
    }, {
        is_location: True
    }])
    matcher.add_pattern(4, [{LEMMA: "no"}, {LEMMA: "incall"}])
    matcher.add_pattern(4, [{LEMMA: "no"}, {LEMMA: "in"}, {LEMMA: "call"}])
    matcher.add_pattern(4, [{
        LEMMA: "no"
    }, {
        LEMMA: "in"
    }, {
        is_hyphen: True
    }, {
        LEMMA: "call"
    }])
    # Negated forms via the dependency label "neg", with up to one
    # intervening token.
    matcher.add_pattern(4, [{IS_ASCII: True, DEP: "neg"}, {LEMMA: "incall"}])
    matcher.add_pattern(4, [{
        IS_ASCII: True, DEP: "neg"
    }, {
        LEMMA: "in"
    }, {
        LEMMA: "call"
    }])
    matcher.add_pattern(4, [{
        IS_ASCII: True, DEP: "neg"
    }, {
        LEMMA: "in"
    }, {
        is_hyphen: True
    }, {
        LEMMA: "call"
    }])
    matcher.add_pattern(4, [{
        IS_ASCII: True, DEP: "neg"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "incall"
    }])
    matcher.add_pattern(4, [{
        IS_ASCII: True, DEP: "neg"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "in"
    }, {
        LEMMA: "call"
    }])
    matcher.add_pattern(4, [{
        IS_ASCII: True, DEP: "neg"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "in"
    }, {
        is_hyphen: True
    }, {
        LEMMA: "call"
    }])
    # "don't have <obj>" / "if (you) have <obj>" style contexts.
    matcher.add_pattern(4, [{
        IS_ASCII: True, DEP: "neg"
    }, {
        LEMMA: "have"
    }, {
        IS_ASCII: True, DEP: "dobj"
    }])
    matcher.add_pattern(4, [{
        IS_ASCII: True, DEP: "neg"
    }, {
        LEMMA: "have"
    }, {
        IS_ASCII: True
    }, {
        IS_ASCII: True, DEP: "dobj"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "if", DEP: "mark"
    }, {
        LEMMA: "have"
    }, {
        IS_ASCII: True, DEP: "dobj"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "if", DEP: "mark"
    }, {
        IS_ASCII: True
    }, {
        LEMMA: "have"
    }, {
        IS_ASCII: True
    }, {
        IS_ASCII: True, DEP: "dobj"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "if", DEP: "mark"
    }, {
        LEMMA: "have"
    }, {
        IS_ASCII: True
    }, {
        IS_ASCII: True, DEP: "dobj"
    }])
    return matcher
def load_derogatory_mentions_matcher(nlp):
    """Build a spaCy v1 Matcher for derogatory self-referential mentions.

    Entities:
      1 - positive evidence (derogatory word, "expose/violate me", ...)
      3 - verb-sense / simile uses ("like a ...", "bitch" as a VERB)
      4 - negated or false-positive contexts

    Fix vs. original: six byte-identical duplicate ``add_pattern``
    registrations (one on entity 1, five on entity 4) were removed;
    re-registering the same pattern on the same entity only adds
    redundant work.
    """
    matcher = Matcher(nlp.vocab)

    bitch = ['w***e', 'bitch', 'c**t', 'psycho', 's**t']
    your = ['your', 'ur']
    is_bitch = FLAG29
    is_your = FLAG30
    set_flag(nlp, bitch, is_bitch)
    set_flag(nlp, your, is_your)

    # Entity 1: direct derogatory evidence.
    matcher.add_entity(1)
    matcher.add_pattern(1, [{is_bitch: True}])
    matcher.add_pattern(1, [{LEMMA: "expose"}, {LOWER: "i"}])
    matcher.add_pattern(1, [{LEMMA: "expose"}, {LOWER: "me"}])
    matcher.add_pattern(1, [{LEMMA: "violate"}, {LOWER: "i"}])
    matcher.add_pattern(1, [{LEMMA: "violate"}, {LOWER: "me"}])
    matcher.add_pattern(1, [{LEMMA: "f**k"}, {LOWER: "i"}])
    matcher.add_pattern(1, [{LEMMA: "f**k"}, {LOWER: "me"}])
    matcher.add_pattern(1, [{
        LOWER: "i", DEP: "nsubj"
    }, {
        IS_ALPHA: True
    }, {
        LEMMA: "violate", DEP: "xcomp"
    }])
    matcher.add_pattern(1, [{LEMMA: "piece"}, {LOWER: "of"}, {LEMMA: "shit"}])
    matcher.add_pattern(1, [{LOWER: "hardcore"}])
    matcher.add_pattern(1, [{is_your: True}, {is_bitch: True}])
    matcher.add_pattern(1, [{is_your: True}, {LEMMA: "slave"}])
    matcher.add_pattern(1, [{is_your: True, DEP: "poss"}, {is_bitch: True}])
    matcher.add_pattern(1, [{LOWER: "i", DEP: "nsubj"}, {is_bitch: True}])
    matcher.add_pattern(1, [{
        LOWER: "i", DEP: "nsubj"
    }, {
        IS_ALPHA: True
    }, {
        is_bitch: True, DEP: "xcomp"
    }])

    # Entity 3: simile / verb uses that need separate handling.
    matcher.add_entity(3)
    matcher.add_pattern(3, [{LOWER: "like"}, {is_bitch: True}])
    matcher.add_pattern(3, [{LEMMA: "bitch", POS: "VERB"}])

    # Entity 4: negations and false-positive contexts.
    matcher.add_entity(4)
    matcher.add_pattern(4, [{LOWER: "to"}, {is_bitch: True}])
    matcher.add_pattern(4, [{LOWER: "nor"}, {is_bitch: True}])
    matcher.add_pattern(4, [{DEP: "neg"}, {is_bitch: True}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "slave"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "expose"}, {LOWER: "i"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "expose"}, {LOWER: "me"}])
    matcher.add_pattern(4, [{
        DEP: "neg"
    }, {
        LEMMA: "piece"
    }, {
        LOWER: "of"
    }, {
        LEMMA: "shit"
    }])
    matcher.add_pattern(4, [{DEP: "neg"}, {is_your: True}, {is_bitch: True}])
    # Negation with one intervening token.
    matcher.add_pattern(4, [{DEP: "neg"}, {IS_ALPHA: True}, {is_bitch: True}])
    matcher.add_pattern(4, [{DEP: "neg"}, {IS_ALPHA: True}, {LEMMA: "slave"}])
    matcher.add_pattern(4, [{
        DEP: "neg"
    }, {
        IS_ALPHA: True
    }, {
        LEMMA: "expose"
    }, {
        LOWER: "i"
    }])
    matcher.add_pattern(4, [{
        DEP: "neg"
    }, {
        IS_ALPHA: True
    }, {
        LEMMA: "expose"
    }, {
        LOWER: "me"
    }])
    matcher.add_pattern(4, [{
        DEP: "neg"
    }, {
        IS_ALPHA: True
    }, {
        LEMMA: "piece"
    }, {
        LOWER: "of"
    }, {
        LEMMA: "shit"
    }])
    matcher.add_pattern(4, [{
        DEP: "neg"
    }, {
        IS_ALPHA: True
    }, {
        is_your: True
    }, {
        is_bitch: True
    }])
    matcher.add_pattern(4, [{LEMMA: "girl"}, {LOWER: "next"}, {LEMMA: "door"}])
    matcher.add_pattern(4, [{LOWER: "with"}, {LOWER: "my"}, {LEMMA: "girl"}])
    matcher.add_pattern(4, [{LOWER: "no"}, {is_bitch: True}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LOWER: "like"}, {is_bitch: True}])
    matcher.add_pattern(4, [{LEMMA: "look"}, {LEMMA: "slave", DEP: "prep"}])
    matcher.add_pattern(4, [{LOWER: "you"}, {is_bitch: True}])
    matcher.add_pattern(4, [{LOWER: "you"}, {POS: "ADJ"}, {is_bitch: True}])
    return matcher
def load_social_media_matcher(nlp):
    """Build a spaCy v1 Matcher for social-media handle mentions.

    A single entity "social_media" is registered; the per-pattern
    ``label`` (1-6) distinguishes which surface form matched
    (e.g. "twitter: handle", "snapchat me @ handle", "wechat id is x",
    "add me on instagram x").

    Fix vs. original: a dead commented-out pattern that was left as a
    bare triple-quoted string expression (a no-op statement) has been
    removed.
    """
    social_media = ['twitter', 'facebook', 'instagram', 'wechat', 'line',
                    'snapchat']
    separators = [':', '-', '@']
    # Ensure the keywords have lexeme entries so the flag loop sees them.
    add_to_vocab(nlp, social_media)
    add_to_vocab(nlp, separators)

    is_separator = FLAG55
    is_social_media = FLAG54
    social_media_ids = {nlp.vocab.strings[s.lower()] for s in social_media}
    separators_ids = {nlp.vocab.strings[s.lower()] for s in separators}
    for lexeme in nlp.vocab:
        if lexeme.lower in social_media_ids:
            lexeme.set_flag(is_social_media, True)
        if lexeme.lower in separators_ids:
            lexeme.set_flag(is_separator, True)

    matcher = Matcher(nlp.vocab)
    matcher.add_entity("social_media")
    # "<platform> [:|-|@] [:|-|@] <handle>"
    matcher.add_pattern("social_media", [
        {is_social_media: True},
        {is_separator: True},
        {is_separator: True, 'OP': '?'},
        {IS_ASCII: True}
    ], label=1)
    # "<platform> me [sep] <handle>"
    matcher.add_pattern("social_media", [
        {is_social_media: True},
        {LOWER: "me", TAG: "PRP"},
        {is_separator: True, "OP": '?'},
        {IS_ASCII: True, TAG: 'NN'}
    ], label=2)
    # "<platform> id [is] [sep] <handle>"
    matcher.add_pattern("social_media", [
        {is_social_media: True},
        {LOWER: 'id'},
        {LOWER: 'is', 'OP': '?'},
        {is_separator: True, 'OP': '?'},
        {IS_ASCII: True}
    ], label=4)
    # "<platform> <handle> [is] to <verb> me"
    matcher.add_pattern("social_media", [
        {is_social_media: True},
        {TAG: 'NN'},
        {LOWER: 'is', 'OP': '?'},
        {LOWER: 'to'},
        {TAG: 'VB'},
        {LOWER: 'me'}
    ], label=5)
    # "add me on <platform> <handle>"
    matcher.add_pattern("social_media", [
        {LOWER: 'add'},
        {TAG: 'PRP'},
        {LOWER: 'on'},
        {is_social_media: True},
        {TAG: 'NN'}
    ], label=6)
    return matcher
def load_agency_matcher(nlp):
    """Build a spaCy v1 Matcher for escort-agency mentions.

    Entity 1 fires on any agency-word token; entity 3 fires on
    contexts where the agency word is negated or refers to law
    enforcement rather than a commercial agency.
    """
    matcher = Matcher(nlp.vocab)

    is_agency = FLAG29
    set_flag(nlp, ['agency', 'agncy', 'agenc', 'agencies'], is_agency)

    # Entity 1: a bare agency mention.
    matcher.add_entity(1)
    matcher.add_pattern(1, [{is_agency: True}])

    # Entity 3: negated / law-enforcement contexts.
    matcher.add_entity(3)
    context_patterns = [
        [{LOWER: "or"}, {is_agency: True}],
        [{LOWER: "le"}, {is_agency: True}],
        [{LOWER: "law"}, {LOWER: "enforcement"}, {is_agency: True}],
        [{LOWER: "no"}, {is_agency: True}],
        [{DEP: "neg"}, {is_agency: True}],
        [{LOWER: "ad"}, {is_agency: True}],
        [{LOWER: "not"}, {LOWER: "a"}, {is_agency: True}],
        [{LOWER: "tire"}, {LOWER: "of"}, {is_agency: True}],
    ]
    for pattern in context_patterns:
        matcher.add_pattern(3, pattern)

    return matcher
def main():
    """Read text from stdin and print one lemma per line.

    Multi-token machine-learning terms registered in the matcher are
    merged and printed as a single lowercased lemma; named entities
    (other than date/number-like types) are printed as one unit; all
    other alphabetic, non-stopword tokens are printed individually.
    """
    nlp = spacy.load('en')
    matcher = Matcher(nlp.vocab)
    matcher.add_pattern("deep_learning", [{
        LOWER: "deep"
    }, {
        LOWER: "learning"
    }])
    matcher.add_pattern("artificial_intelligence", [{
        LOWER: "artificial"
    }, {
        LOWER: "intelligence"
    }])
    matcher.add_pattern("machine_learning", [{
        LOWER: "machine"
    }, {
        LOWER: "learning"
    }])
    matcher.add_pattern("reinforcement_learning", [{
        LOWER: "reinforcement"
    }, {
        LOWER: "learning"
    }])
    # BUG FIX: was {LOWER: "parttern"} — the misspelling could never
    # match the actual term "pattern recognition".
    matcher.add_pattern("pattern_recognition", [{
        LOWER: "pattern"
    }, {
        LOWER: "recognition"
    }])
    matcher.add_pattern("computer_vision", [{
        LOWER: "computer"
    }, {
        LOWER: "vision"
    }])
    matcher.add_pattern("machine_vision", [{
        LOWER: "machine"
    }, {
        LOWER: "vision"
    }])
    # BUG FIX: was a copy-paste of the machine_vision pattern
    # ({LOWER: "machine"}, {LOWER: "vision"}).
    matcher.add_pattern("machine_translation", [{
        LOWER: "machine"
    }, {
        LOWER: "translation"
    }])

    # Collapse all whitespace so the tokenizer sees a single clean line.
    text = re.sub(r'\s+', ' ', sys.stdin.read())
    doc = nlp(text)
    entities = list(doc.ents)
    matches = matcher(doc)

    skip_until = -1
    for i, token in enumerate(doc):
        # Skip tokens consumed by a previous multi-token match.
        if i < skip_until:
            continue
        if matches:
            # Assumes matcher results are ordered by start offset.
            _, _, start, end = matches[0]
            if i == start:
                print(doc[start:end].lemma_.lower())
                skip_until = end
                matches.pop(0)
        if token.is_alpha and not token.is_stop and token.ent_iob_ == 'O':
            print(token.lemma_.lower())
        if (token.ent_iob_ == 'B' and token.ent_type_ not in [
                'DATE', 'TIME', 'MONEY', 'PERCENT', 'QUANTITY', 'ORDINAL',
                'CARDINAL'
        ]):
            entity = entities.pop(0)
            print(entity.lemma_.lower())
def load_date_matcher(nlp):
    """Build a spaCy v1 Matcher recognising many date surface forms.

    Registers lexeme flags for month names (FLAG62), ordinal suffixes
    (FLAG61) and date delimiters (FLAG60), then adds 'DATE' patterns;
    the per-pattern ``label`` (1-16) identifies which format matched.
    Relies on module-level ``months_dict``, ``ordinals`` and
    ``date_delimiters``.
    """
    matcher = Matcher(nlp.vocab)
    # Ensure all keywords have lexeme entries so the flag loops see them.
    add_to_vocab(nlp, months_dict.keys())
    add_to_vocab(nlp, ordinals)
    add_to_vocab(nlp, date_delimiters)

    # Flag for MONTH.
    is_month = FLAG62
    target_ids = {nlp.vocab.strings[s.lower()] for s in months_dict.keys()}
    for lexeme in nlp.vocab:
        if lexeme.lower in target_ids:
            lexeme.set_flag(is_month, True)
    # Flag for ORDINALS.
    is_ordinal = FLAG61
    target_ids = {nlp.vocab.strings[s.lower()] for s in ordinals}
    for lexeme in nlp.vocab:
        if lexeme.lower in target_ids:
            lexeme.set_flag(is_ordinal, True)
    # Flag for DATE_DELIMITER.
    is_date_delimiter = FLAG60
    target_ids = {nlp.vocab.strings[s.lower()] for s in date_delimiters}
    for lexeme in nlp.vocab:
        if lexeme.lower in target_ids:
            lexeme.set_flag(is_date_delimiter, True)

    # "March 5[th][,] 2017" (labels 1-2: 1- and 2-digit day).
    matcher.add_pattern('DATE', [{
        is_month: True
    }, {
        IS_DIGIT: True, LENGTH: 1
    }, {
        is_ordinal: True, 'OP': '?'
    }, {
        ORTH: ',', 'OP': '?'
    }, {
        IS_DIGIT: True, LENGTH: 4
    }], label=1)
    matcher.add_pattern('DATE', [{
        is_month: True
    }, {
        IS_DIGIT: True, LENGTH: 2
    }, {
        is_ordinal: True, 'OP': '?'
    }, {
        ORTH: ',', 'OP': '?'
    }, {
        IS_DIGIT: True, LENGTH: 4
    }], label=2)
    # "25[th] [/] March[,] 2017" (labels 3-4).
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 1
    }, {
        is_date_delimiter: True, 'OP': '?'
    }, {
        is_month: True
    }, {
        is_ordinal: True, 'OP': '?'
    }, {
        ORTH: ',', 'OP': '?'
    }, {
        IS_DIGIT: True, LENGTH: 4
    }], label=3)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 2
    }, {
        is_date_delimiter: True, 'OP': '?'
    }, {
        is_month: True
    }, {
        is_ordinal: True, 'OP': '?'
    }, {
        ORTH: ',', 'OP': '?'
    }, {
        IS_DIGIT: True, LENGTH: 4
    }], label=4)
    # "25/05/2016" — day first (labels 5-6).
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 1
    }, {
        is_date_delimiter: True, 'OP': '+'
    }, {
        is_month: True
    }, {
        is_date_delimiter: True, 'OP': '+'
    }, {
        IS_DIGIT: True, LENGTH: 4
    }], label=5)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 2
    }, {
        is_date_delimiter: True, 'OP': '+'
    }, {
        is_month: True
    }, {
        is_date_delimiter: True, 'OP': '+'
    }, {
        IS_DIGIT: True, LENGTH: 4
    }], label=6)
    # "05/25/2016" — month first (labels 7-8).
    matcher.add_pattern('DATE', [{
        is_month: True
    }, {
        is_date_delimiter: True, 'OP': '+'
    }, {
        IS_DIGIT: True, LENGTH: 1
    }, {
        is_date_delimiter: True, 'OP': '+'
    }, {
        IS_DIGIT: True, LENGTH: 4
    }], label=7)
    matcher.add_pattern('DATE', [{
        is_month: True
    }, {
        is_date_delimiter: True, 'OP': '+'
    }, {
        IS_DIGIT: True, LENGTH: 2
    }, {
        is_date_delimiter: True, 'OP': '+'
    }, {
        IS_DIGIT: True, LENGTH: 4
    }], label=8)
    # "December[,] 2009" — month word + year; both variants share label 9.
    matcher.add_pattern('DATE', [{
        is_month: True, IS_DIGIT: False
    }, {
        ORTH: ','
    }, {
        IS_DIGIT: True, LENGTH: 4
    }], label=9)
    matcher.add_pattern('DATE', [{
        is_month: True, IS_DIGIT: False
    }, {
        IS_DIGIT: True, LENGTH: 4
    }], label=9)
    # "2013-12-04" — ISO-style, year first (labels 10-11).
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 4
    }, {
        is_date_delimiter: True, 'OP': '+'
    }, {
        is_month: True
    }, {
        is_date_delimiter: True, 'OP': '+'
    }, {
        IS_DIGIT: True, LENGTH: 2
    }], label=10)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 4
    }, {
        is_date_delimiter: True, 'OP': '+'
    }, {
        is_month: True
    }, {
        is_date_delimiter: True, 'OP': '+'
    }, {
        IS_DIGIT: True, LENGTH: 1
    }], label=11)
    # "9 days ago" (label 12).
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True
    }, {
        POS: 'NOUN'
    }, {
        LOWER: 'ago'
    }], label=12)
    # "1[st][.] Jul" — day before month word, 2-digit day (label 13).
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 2
    }, {
        is_ordinal: True
    }, {
        is_date_delimiter: True
    }, {
        is_month: True, IS_DIGIT: False
    }], label=13)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 2
    }, {
        is_ordinal: True
    }, {
        is_month: True, IS_DIGIT: False
    }], label=13)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 2
    }, {
        is_date_delimiter: True
    }, {
        is_month: True, IS_DIGIT: False
    }], label=13)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 2
    }, {
        is_month: True, IS_DIGIT: False
    }], label=13)
    # Same forms with a 1-digit day (label 14).
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 1
    }, {
        is_ordinal: True
    }, {
        is_date_delimiter: True
    }, {
        is_month: True, IS_DIGIT: False
    }], label=14)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 1
    }, {
        is_ordinal: True
    }, {
        is_month: True, IS_DIGIT: False
    }], label=14)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 1
    }, {
        is_date_delimiter: True
    }, {
        is_month: True, IS_DIGIT: False
    }], label=14)
    matcher.add_pattern('DATE', [{
        IS_DIGIT: True, LENGTH: 1
    }, {
        is_month: True, IS_DIGIT: False
    }], label=14)
    # "Jul 2nd" — month word before day, 2-digit day (label 15).
    matcher.add_pattern('DATE', [{
        is_month: True, IS_DIGIT: False
    }, {
        is_date_delimiter: True
    }, {
        IS_DIGIT: True, LENGTH: 2
    }, {
        is_ordinal: True
    }], label=15)
    matcher.add_pattern('DATE', [{
        is_month: True, IS_DIGIT: False
    }, {
        is_date_delimiter: True
    }, {
        IS_DIGIT: True, LENGTH: 2
    }], label=15)
    matcher.add_pattern('DATE', [{
        is_month: True, IS_DIGIT: False
    }, {
        IS_DIGIT: True, LENGTH: 2
    }, {
        is_ordinal: True
    }], label=15)
    matcher.add_pattern('DATE', [{
        is_month: True, IS_DIGIT: False
    }, {
        IS_DIGIT: True, LENGTH: 2
    }], label=15)
    # Same forms with a 1-digit day (label 16).
    matcher.add_pattern('DATE', [{
        is_month: True, IS_DIGIT: False
    }, {
        is_date_delimiter: True
    }, {
        IS_DIGIT: True, LENGTH: 1
    }, {
        is_ordinal: True
    }], label=16)
    matcher.add_pattern('DATE', [{
        is_month: True, IS_DIGIT: False
    }, {
        is_date_delimiter: True
    }, {
        IS_DIGIT: True, LENGTH: 1
    }], label=16)
    matcher.add_pattern('DATE', [{
        is_month: True, IS_DIGIT: False
    }, {
        IS_DIGIT: True, LENGTH: 1
    }, {
        is_ordinal: True
    }], label=16)
    matcher.add_pattern('DATE', [{
        is_month: True, IS_DIGIT: False
    }, {
        IS_DIGIT: True, LENGTH: 1
    }], label=16)
    return matcher
def load_webcam_matcher(nlp):
    """Build a spaCy v1 Matcher for webcam/camshow mentions.

    Entity 1 fires on webcam evidence (platform names, "live show",
    "see me on http/www", ...); entity 4 fires on negated or
    false-positive contexts ("no cam", "my name is cam", ...).
    """
    matcher = Matcher(nlp.vocab)

    cam = ['cam', 'skype', 'facetime', 'webcam', 'mfc', 'iml']
    provider = [
        'girls', 'girl', 'models', 'model', 'staffs', 'staff', 'latinas',
        'latina', 'talent', 'supermodels', 'supermodel', 'princesses',
        'princess'
    ]
    is_cam = FLAG29
    is_provider = FLAG30
    set_flag(nlp, cam, is_cam)
    set_flag(nlp, provider, is_provider)

    # Entity 1: positive webcam evidence.
    matcher.add_entity(1)
    matcher.add_pattern(1, [{is_cam: True}])
    matcher.add_pattern(1, [{LOWER: "live"}, {LEMMA: "show"}])
    matcher.add_pattern(1, [{LEMMA: "video"}, {ORTH: "@"}])
    matcher.add_pattern(1, [{LOWER: "free", DEP: "amod"}, {LEMMA: "video"}])
    matcher.add_pattern(1, [{LOWER: "porno"}, {is_provider: True}])
    matcher.add_pattern(1, [{LEMMA: "add"}, {LOWER: "i"}])
    matcher.add_pattern(1, [{LOWER: "chaturbate"}])
    # "see me/i on http(s)|www." — URL-introducing phrases.
    matcher.add_pattern(1, [{
        LEMMA: "see"
    }, {
        LOWER: "i"
    }, {
        LOWER: "on"
    }, {
        LOWER: "http"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "see"
    }, {
        LOWER: "me"
    }, {
        LOWER: "on"
    }, {
        LOWER: "http"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "see"
    }, {
        LOWER: "i"
    }, {
        LOWER: "on"
    }, {
        LOWER: "https"
    }])
    # NOTE(review): duplicate of the "see me on http" pattern above —
    # presumably this one was meant to be "https"; confirm intent.
    matcher.add_pattern(1, [{
        LEMMA: "see"
    }, {
        LOWER: "me"
    }, {
        LOWER: "on"
    }, {
        LOWER: "http"
    }])
    matcher.add_pattern(1, [{
        LEMMA: "see"
    }, {
        LOWER: "i"
    }, {
        LOWER: "on"
    }, {
        LOWER: "www"
    }, {
        ORTH: "."
    }])
    matcher.add_pattern(1, [{
        LEMMA: "see"
    }, {
        LOWER: "me"
    }, {
        LOWER: "on"
    }, {
        LOWER: "www"
    }, {
        ORTH: "."
    }])

    # Entity 4: negations and false-positive contexts.
    matcher.add_entity(4)
    matcher.add_pattern(4, [{LOWER: "i"}, {LOWER: "cam"}])
    matcher.add_pattern(4, [{LOWER: "cam"}, {LOWER: "to"}])
    matcher.add_pattern(4, [{LOWER: "you"}, {LOWER: "cam"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {is_cam: True}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "camshow"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "liveshow"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "skypeshow"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "livshow"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LOWER: "paypal"}, {LEMMA: "show"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "show"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "chaturbate"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "live"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "video"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "porno"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "girl"}, {LEMMA: "cam"}])
    matcher.add_pattern(4, [{DEP: "neg"}, {LEMMA: "girl"}, {LEMMA: "webcam"}])
    # Subject/conjunct followed later by a negation token.
    matcher.add_pattern(4, [{
        is_cam: True, DEP: "nsubj"
    }, {
        IS_ALPHA: True
    }, {
        IS_ALPHA: True, DEP: "neg"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "video", DEP: "nsubj"
    }, {
        IS_ALPHA: True
    }, {
        IS_ALPHA: True, DEP: "neg"
    }])
    matcher.add_pattern(4, [{
        is_cam: True, DEP: "conj"
    }, {
        IS_ALPHA: True
    }, {
        IS_ALPHA: True, DEP: "neg"
    }])
    matcher.add_pattern(4, [{
        LEMMA: "video", DEP: "conj"
    }, {
        IS_ALPHA: True
    }, {
        IS_ALPHA: True, DEP: "neg"
    }])
    # "no <cam word>" variants.
    matcher.add_pattern(4, [{LOWER: "no"}, {is_cam: True}])
    matcher.add_pattern(4, [{LOWER: "no"}, {LEMMA: "live"}])
    matcher.add_pattern(4, [{LOWER: "no"}, {LEMMA: "video"}])
    matcher.add_pattern(4, [{LOWER: "no"}, {LEMMA: "free"}])
    matcher.add_pattern(4, [{LOWER: "no"}, {LEMMA: "porno"}])
    matcher.add_pattern(4, [{LOWER: "no"}, {LEMMA: "chaturbate"}])
    matcher.add_pattern(4, [{LOWER: "no"}, {LEMMA: "camshow"}])
    matcher.add_pattern(4, [{LOWER: "no"}, {LEMMA: "liveshow"}])
    matcher.add_pattern(4, [{LOWER: "no"}, {LEMMA: "skypeshow"}])
    matcher.add_pattern(4, [{LOWER: "no"}, {LOWER: "paypal"}, {LEMMA: "show"}])
    # "Cam" used as a person's name, not a webcam.
    matcher.add_pattern(4, [{LOWER: "it"}, {LEMMA: "be"}, {LEMMA: "cam"}])
    matcher.add_pattern(4, [{LOWER: "its"}, {LEMMA: "cam"}])
    matcher.add_pattern(4, [{LOWER: "im"}, {LEMMA: "cam"}])
    matcher.add_pattern(4, [{LOWER: "i"}, {LEMMA: "be"}, {LEMMA: "cam"}])
    matcher.add_pattern(4, [{
        LOWER: "my"
    }, {
        LOWER: "name"
    }, {
        LEMMA: "be"
    }, {
        LEMMA: "cam"
    }])
    matcher.add_pattern(4, [{LEMMA: "cam"}, {LOWER: "here"}])
    return matcher
def load_like_email_matcher(nlp):
    """Build a spaCy v1 Matcher that flags any email-looking token.

    Fix vs. original: entity 1 is now registered via ``add_entity``
    before the pattern is added, consistent with every other
    ``load_*_matcher`` helper in this module.
    """
    matcher = Matcher(nlp.vocab)
    matcher.add_entity(1)
    matcher.add_pattern(1, [{LIKE_EMAIL: True}])
    return matcher