def load_hotel_matcher(nlp):
    """Build the rule-based matcher for hotel / motel / inn mentions.

    Two custom lexeme flags are registered through ``set_flag`` (defined
    elsewhere): one for the lodging vocabulary and one for "and"-like
    connectors.

    * Entity ``1`` fires on any single token carrying the lodging flag.
    * Entity ``4`` fires on contexts around "inn" ("inn & out", "inn call",
      "inn <digits>", "come inn", "inn town") and on "no" / negated lodging
      words.

    NOTE(review): what entity ids 1 and 4 mean is decided by whoever consumes
    the matches, which is not visible here -- entity 4 looks like the
    "ignore this mention" bucket; confirm against the caller.

    :param nlp: object exposing ``vocab`` (presumably a loaded spaCy pipeline).
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    is_hotel, is_and = FLAG29, FLAG30
    set_flag(nlp, ['hotel', 'motel', 'inn', 'hotels', 'motels', 'inns'],
             is_hotel)
    set_flag(nlp, ['and', 'n', 'an', 'nd', '&', '/'], is_and)

    # (entity id, token patterns) in registration order.
    rules = (
        (1, [
            [{is_hotel: True}],
        ]),
        (4, [
            [{LEMMA: "inn"}, {is_and: True}, {LOWER: "out"}],
            [{LEMMA: "inn"}, {is_and: True}, {LOWER: "outcall"}],
            [{LEMMA: "inn"}, {LOWER: "call"}],
            [{LEMMA: "inn"}, {is_and: True}, {LOWER: "outcalls"}],
            [{LEMMA: "inn"}, {LOWER: "calls"}],
            [{LEMMA: "inn"}, {IS_DIGIT: True}],
            [{LEMMA: "come"}, {LEMMA: "inn"}],
            [{LEMMA: "inn"}, {LEMMA: "town"}],
            [{LOWER: "no"}, {is_hotel: True}],
            [{DEP: "neg"}, {is_hotel: True}],
            [{DEP: "neg"}, {IS_ALPHA: True}, {is_hotel: True}],
        ]),
    )
    for entity_id, patterns in rules:
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)
    return matcher
def load_agency_matcher(nlp):
    """Build the rule-based matcher for "agency" mentions.

    A custom lexeme flag is registered through ``set_flag`` (defined
    elsewhere) for the agency spellings listed below.

    * Entity ``1`` fires on any single flagged token.
    * Entity ``3`` fires on a flagged token preceded by one of a fixed set of
      lead-ins ("or", "le", "law enforcement", "no", a ``neg`` dependency,
      "ad", "not a", "tire of").

    NOTE(review): the meaning of the entity ids is defined by the consumer of
    the matches, not here.

    :param nlp: object exposing ``vocab`` (presumably a loaded spaCy pipeline).
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    is_agency = FLAG29
    set_flag(nlp, ['agency', 'agncy', 'agenc', 'agencies'], is_agency)

    matcher.add_entity(1)
    matcher.add_pattern(1, [{is_agency: True}])

    # Every entity-3 pattern is "<lead-in tokens> + <agency token>".
    lead_ins = (
        [{LOWER: "or"}],
        [{LOWER: "le"}],
        [{LOWER: "law"}, {LOWER: "enforcement"}],
        [{LOWER: "no"}],
        [{DEP: "neg"}],
        [{LOWER: "ad"}],
        [{LOWER: "not"}, {LOWER: "a"}],
        [{LOWER: "tire"}, {LOWER: "of"}],
    )
    matcher.add_entity(3)
    for lead_in in lead_ins:
        matcher.add_pattern(3, lead_in + [{is_agency: True}])
    return matcher
def test_get_entity_attrs(en_vocab):
    """``get_entity`` returns the attrs an entity was registered with.

    An entity added without attrs reports ``{}``, one added with attrs
    reports exactly those attrs, and adding the second entity leaves the
    first one's (empty) attrs untouched.

    ``en_vocab`` is presumably a pytest fixture supplying a vocab.
    """
    matcher = Matcher(en_vocab)

    matcher.add_entity('TestEntity')
    bare_attrs = matcher.get_entity('TestEntity')
    assert bare_attrs == {}

    matcher.add_entity('TestEntity2', attrs={'Hello': 'World'})
    stored_attrs = matcher.get_entity('TestEntity2')
    assert stored_attrs == {'Hello': 'World'}

    # The first entity must be unaffected by the second registration.
    assert matcher.get_entity('TestEntity') == {}
def load_age_matcher(nlp):
    """Build the matcher that picks up stated ages.

    Shapes the patterns are written for::

        Age : 22 years
        age : 22 yrs
        Age 22-40
        22 yrs
        23yrs
        22-40 years
        About me 22

    All patterns belong to the single entity ``"Age"``, which is registered
    with ``get_age`` (defined elsewhere) as its acceptor.

    :param nlp: object exposing ``vocab`` (presumably a loaded spaCy pipeline).
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    # Custom lexeme flag marking the "year" words.
    is_year = FLAG63
    year_ids = {
        nlp.vocab.strings[word.lower()] for word in ['years', 'yrs', 'year']
    }
    for lexeme in nlp.vocab:
        if lexeme.lower not in year_ids:
            continue
        lexeme.set_flag(is_year, True)

    # Small token factories so every pattern gets its own dict objects.
    def age_word():
        return {LOWER: "age"}

    def punct():
        return {IS_PUNCT: True}

    def two_digits():
        return {IS_DIGIT: True, LENGTH: 2}

    def year_word():
        return {is_year: True}

    age_patterns = [
        # "age : 22" / "age 22"
        [age_word(), punct(), two_digits()],
        [age_word(), two_digits()],
        # "age : 22-40" / "age 22-40"
        [age_word(), punct(), two_digits(), punct(), two_digits()],
        [age_word(), two_digits(), punct(), two_digits()],
        # "22 yrs"
        [two_digits(), year_word()],
        # five-character token ending in "yrs", e.g. "23yrs"
        [{SUFFIX: "yrs", LENGTH: 5}],
        # "22-40 years", with the separator token optional
        [two_digits(), {IS_PUNCT: True, 'OP': '?'}, two_digits(), year_word()],
        [two_digits(), {IS_ASCII: True, 'OP': '?'}, two_digits(), year_word()],
        # "about me 22" / "about 22"
        [{LOWER: 'about'}, {LOWER: 'me', 'OP': '?'}, {IS_DIGIT: True}],
    ]

    matcher.add_entity("Age", acceptor=get_age)
    for pattern in age_patterns:
        matcher.add_pattern("Age", pattern)
    return matcher
def test_get_entity_via_match(en_vocab):
    """An entity's attrs can be fetched with the id reported by a match.

    Before any pattern is added the matcher has no patterns and matches
    nothing.  After one two-token pattern is added, matching the same two
    words yields a single 4-tuple ``(ent_id, label, start, end)`` whose
    ``ent_id`` is the string id of the entity name and can be passed to
    ``get_entity``.

    ``en_vocab`` is presumably a pytest fixture supplying a vocab.
    """
    def fresh_doc():
        # A new Doc for every matcher call, as in the original test.
        return Doc(en_vocab, words=[u'Test', u'Entity'])

    matcher = Matcher(en_vocab)
    matcher.add_entity('TestEntity', attrs={u'Hello': u'World'})

    # Entity registered, but no pattern yet: nothing can match.
    assert matcher.n_patterns == 0
    assert matcher(fresh_doc()) == []

    matcher.add_pattern(u'TestEntity', [{ORTH: u'Test'}, {ORTH: u'Entity'}])
    assert matcher.n_patterns == 1

    found = matcher(fresh_doc())
    assert len(found) == 1
    only_match = found[0]
    assert len(only_match) == 4

    ent_id, label, start, end = only_match
    assert ent_id == matcher.vocab.strings[u'TestEntity']
    assert label == 0
    assert start == 0
    assert end == 2

    assert matcher.get_entity(ent_id) == {u'Hello': u'World'}
def __init__(self, nlp):
    """Set up the 'object' and 'subject' matcher rules and hand the matcher
    to the parent class.

    Both rules share one token sequence: a non-number token with
    ``ENT_IOB == 3``, any number of tokens with ``ENT_IOB == 1``, then an
    optional number-like token (in spaCy's IOB encoding 3 is "B" and 1 is
    "I").  The rules differ in their acceptor -- built by
    ``self.make_intersect_ar`` with arguments ``(2, 2)`` for 'object' and
    none for 'subject' -- and in the ``Part`` label attached to the pattern.

    :param nlp: object exposing ``vocab`` (presumably a loaded spaCy pipeline).
    """
    matcher = Matcher(nlp.vocab)

    span_tokens = [
        {a.LIKE_NUM: False, a.ENT_IOB: 3},
        {'OP': '*', a.ENT_IOB: 1},
        {'OP': '?', a.LIKE_NUM: True},
    ]

    def register(rule_name, acceptor, part):
        # The entity name is what associates a match with its pattern.
        matcher.add_entity(rule_name, acceptor=acceptor)
        matcher.add_pattern(rule_name, span_tokens, label=part)

    register('object', self.make_intersect_ar(2, 2), Part.OBJ)
    register('subject', self.make_intersect_ar(), Part.SUBJ)

    # Disabled rules, kept for reference:
    # conjugation_pattern = iob_pattern + [{a.POS: 'CONJ'}]
    # entity_name = 'version'
    # ver_pattern1 = [{a.LEMMA: 'version'}, {a.LIKE_NUM: True}]
    # matcher.add_entity(entity_name, acceptor=self.make_intersect_ar(1,1))
    # matcher.add_pattern(entity_name, ver_pattern1, label=Part.SUBJ)
    # matcher.add_pattern(entity_name, ver_pattern1, label=Part.OBJ)
    # entity_name = 'location'
    # entity_name = 'date'
    # matcher.add_entity(entity_name, acceptor=self.make_inclusion_ar(1,1))
    # matcher.add_pattern(entity_name, [{a.ENT_TYPE: 'DATE'}], label=Part.OBJ)

    self.entity_rules = ['subject', 'object']
    super().__init__(matcher)
def load_risky_activities_matcher(nlp):
    """Build the rule-based matcher for the "risky activities" vocabulary.

    Two custom lexeme flags are registered through ``set_flag`` (defined
    elsewhere): one for the activity terms and one for provider nouns.

    * Entity ``1``: any single token carrying the activity flag.
    * Entity ``2``: "hardcore" followed by "sex" or "service" (by lemma).
    * Entity ``3``: "hardcore" on its own (by lemma).
    * Entity ``4``: "greek" followed by a digit token or a provider noun, an
      "if <root verb> <activity>" construction, and "<activity> sorry".

    NOTE(review): the meaning of the entity ids is defined by the consumer of
    the matches, not here.

    :param nlp: object exposing ``vocab`` (presumably a loaded spaCy pipeline).
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    activity_terms = [
        'bareback', 'uncovered', 'bbbjtcim', 'bbbj', 'bbbjtc', 'bbbjtcws',
        'bbbjwf', 'bbfs', 'anal', 'greek', 'rca', 'swallow', 'cim', 'choke',
        'bdsm', 'bondage', 'g******g', 'hardcore',
    ]
    provider_terms = [
        'girl', 'girls', 'model', 'models', 'staff', 'staffs', 'latina',
        'latinas', 'talent', 'talents', 'supermodel', 'supermodels',
        'princess', 'princesses',
    ]

    is_risky_activities, is_provider = FLAG40, FLAG41
    set_flag(nlp, activity_terms, is_risky_activities)
    set_flag(nlp, provider_terms, is_provider)

    # (entity id, token patterns) in registration order.
    rules = (
        (1, [
            [{is_risky_activities: True}],
        ]),
        (2, [
            [{LEMMA: "hardcore"}, {LEMMA: "sex"}],
            [{LEMMA: "hardcore"}, {LEMMA: "service"}],
        ]),
        (3, [
            [{LEMMA: "hardcore"}],
        ]),
        (4, [
            [{LEMMA: "greek"}, {IS_DIGIT: True}],
            [{LEMMA: "greek"}, {is_provider: True}],
            [{LEMMA: "if", DEP: "mark"},
             {IS_ALPHA: True, DEP: "ROOT"},
             {is_risky_activities: True}],
            [{is_risky_activities: True}, {LEMMA: "sorry"}],
        ]),
    )
    for entity_id, patterns in rules:
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)
    return matcher
def load_webcam_matcher(nlp):
    """Build the rule-based matcher for webcam / online-show mentions.

    Two custom lexeme flags are registered through ``set_flag`` (defined
    elsewhere): one for camera/streaming words and one for provider nouns.

    * Entity ``1``: a flagged cam word, "live show", "video @",
      "free video", "porno <provider>", "add i", "chaturbate", and
      "see i/me on <link>" where the link starts with ``http``, ``https`` or
      ``www .``.
    * Entity ``4``: contexts around those words -- "i cam", "cam to",
      "you cam", negated or "no ..." forms, and phrases where "cam" follows
      "it is" / "its" / "im" / "i am" / "my name is" or precedes "here".

    The "see ... on <link>" patterns are generated so that both pronouns are
    covered for every link form, including "see me on https".

    NOTE(review): the meaning of the entity ids is defined by the consumer of
    the matches, not here.

    :param nlp: object exposing ``vocab`` (presumably a loaded spaCy pipeline).
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    cam = ['cam', 'skype', 'facetime', 'webcam', 'mfc', 'iml']
    provider = [
        'girls', 'girl', 'models', 'model', 'staffs', 'staff', 'latinas',
        'latina', 'talent', 'supermodels', 'supermodel', 'princesses',
        'princess',
    ]
    is_cam = FLAG29
    is_provider = FLAG30
    set_flag(nlp, cam, is_cam)
    set_flag(nlp, provider, is_provider)

    def link_start(kind):
        # "www" links are tokenised as "www" followed by "."
        if kind == "www":
            return [{LOWER: "www"}, {ORTH: "."}]
        return [{LOWER: kind}]

    positive = [
        [{is_cam: True}],
        [{LOWER: "live"}, {LEMMA: "show"}],
        [{LEMMA: "video"}, {ORTH: "@"}],
        [{LOWER: "free", DEP: "amod"}, {LEMMA: "video"}],
        [{LOWER: "porno"}, {is_provider: True}],
        [{LEMMA: "add"}, {LOWER: "i"}],
        [{LOWER: "chaturbate"}],
    ]
    # "see i on http", "see me on http", "see i on https", "see me on https",
    # "see i on www .", "see me on www ."
    for kind in ("http", "https", "www"):
        for pronoun in ("i", "me"):
            positive.append(
                [{LEMMA: "see"}, {LOWER: pronoun}, {LOWER: "on"}]
                + link_start(kind))

    excluded = [
        [{LOWER: "i"}, {LOWER: "cam"}],
        [{LOWER: "cam"}, {LOWER: "to"}],
        [{LOWER: "you"}, {LOWER: "cam"}],
        # Preceded by a token with a "neg" dependency.
        [{DEP: "neg"}, {is_cam: True}],
        [{DEP: "neg"}, {LEMMA: "camshow"}],
        [{DEP: "neg"}, {LEMMA: "liveshow"}],
        [{DEP: "neg"}, {LEMMA: "skypeshow"}],
        [{DEP: "neg"}, {LEMMA: "livshow"}],
        [{DEP: "neg"}, {LOWER: "paypal"}, {LEMMA: "show"}],
        [{DEP: "neg"}, {LEMMA: "show"}],
        [{DEP: "neg"}, {LEMMA: "chaturbate"}],
        [{DEP: "neg"}, {LEMMA: "live"}],
        [{DEP: "neg"}, {LEMMA: "video"}],
        [{DEP: "neg"}, {LEMMA: "porno"}],
        [{DEP: "neg"}, {LEMMA: "girl"}, {LEMMA: "cam"}],
        [{DEP: "neg"}, {LEMMA: "girl"}, {LEMMA: "webcam"}],
        # Subject / conjunct, one token, then the negation.
        [{is_cam: True, DEP: "nsubj"},
         {IS_ALPHA: True},
         {IS_ALPHA: True, DEP: "neg"}],
        [{LEMMA: "video", DEP: "nsubj"},
         {IS_ALPHA: True},
         {IS_ALPHA: True, DEP: "neg"}],
        [{is_cam: True, DEP: "conj"},
         {IS_ALPHA: True},
         {IS_ALPHA: True, DEP: "neg"}],
        [{LEMMA: "video", DEP: "conj"},
         {IS_ALPHA: True},
         {IS_ALPHA: True, DEP: "neg"}],
        # Preceded by a literal "no".
        [{LOWER: "no"}, {is_cam: True}],
        [{LOWER: "no"}, {LEMMA: "live"}],
        [{LOWER: "no"}, {LEMMA: "video"}],
        [{LOWER: "no"}, {LEMMA: "free"}],
        [{LOWER: "no"}, {LEMMA: "porno"}],
        [{LOWER: "no"}, {LEMMA: "chaturbate"}],
        [{LOWER: "no"}, {LEMMA: "camshow"}],
        [{LOWER: "no"}, {LEMMA: "liveshow"}],
        [{LOWER: "no"}, {LEMMA: "skypeshow"}],
        [{LOWER: "no"}, {LOWER: "paypal"}, {LEMMA: "show"}],
        # "cam" after "it is" / "its" / "im" / "i am" / "my name is",
        # or before "here".
        [{LOWER: "it"}, {LEMMA: "be"}, {LEMMA: "cam"}],
        [{LOWER: "its"}, {LEMMA: "cam"}],
        [{LOWER: "im"}, {LEMMA: "cam"}],
        [{LOWER: "i"}, {LEMMA: "be"}, {LEMMA: "cam"}],
        [{LOWER: "my"}, {LOWER: "name"}, {LEMMA: "be"}, {LEMMA: "cam"}],
        [{LEMMA: "cam"}, {LOWER: "here"}],
    ]

    for entity_id, patterns in ((1, positive), (4, excluded)):
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)
    return matcher
def load_multi_girl_matcher(nlp):
    """Build the rule-based matcher for mentions of more than one provider.

    Four custom lexeme flags are registered through ``set_flag`` (defined
    elsewhere): counting words ("two" .. "ten", "double", "triple" and the
    digit strings "2" .. "10"), person nouns, show-type nouns and "and"-like
    connectors.

    * Entity ``1``: a counting word followed by a plural-tagged person noun,
      "duo", "2-for-1", "double-session", "three-way", "<count> for 1/one",
      "double <show>" and "<count> way".
    * Entity ``4``: a person noun after "a" / "how" / "for" / an ``nmod``
      token, a person noun joined to "gentleman" / "guy" in either order, and
      the bare word "she".

    NOTE(review): the meaning of the entity ids is defined by the consumer of
    the matches, not here.

    :param nlp: object exposing ``vocab`` (presumably a loaded spaCy pipeline).
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    count_words = [
        'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
        'ten', 'double', 'triple',
    ] + list(map(str, range(2, 11)))
    person_words = [
        'gal', 'girl', 's**t', 'cutie', 'hottie', 'lady', 'teen', 'teenager',
        'chick', 'staff', 'gf', 'she',
    ]
    show_words = ['show', 'special', 'session', 'fantasy']
    and_words = ['and', 'an', 'n', '&']

    is_multi_num, is_girl, is_show, is_and = FLAG30, FLAG31, FLAG33, FLAG34
    for words, flag in ((count_words, is_multi_num),
                        (person_words, is_girl),
                        (show_words, is_show),
                        (and_words, is_and)):
        set_flag(nlp, words, flag)

    multiples = [
        [{is_multi_num: True}, {is_girl: True, TAG: "NNS"}],
        [{is_multi_num: True}, {is_girl: True, TAG: "NNPS"}],
        [{LOWER: "duo"}],
        [{LOWER: "2"}, {ORTH: "-"}, {LOWER: "for"}, {ORTH: "-"},
         {LOWER: "1"}],
        [{LEMMA: "double"}, {ORTH: "-"}, {LEMMA: "session"}],
        [{LEMMA: "three"}, {ORTH: "-"}, {LEMMA: "way"}],
        [{is_multi_num: True}, {LOWER: "for"}, {ORTH: "1"}],
        [{is_multi_num: True}, {LOWER: "for"}, {ORTH: "one"}],
        [{LEMMA: "double"}, {is_show: True}],
        [{is_multi_num: True}, {LOWER: "way"}],
    ]

    excluded = [
        [{LOWER: "a"}, {is_girl: True}],
        [{LOWER: "how"}, {is_girl: True}],
        [{LOWER: "for"}, {is_girl: True}],
        [{IS_ALPHA: True, DEP: "nmod"}, {is_girl: True}],
    ]
    # "<person> and <man>" plus the two-token "& &" spelling of the joiner ...
    for man in ("gentleman", "guy"):
        excluded.append([{is_girl: True}, {is_and: True}, {LEMMA: man}])
        excluded.append(
            [{is_girl: True}, {ORTH: "&"}, {ORTH: "&"}, {LEMMA: man}])
    # ... and the same with the man first.
    for man in ("gentleman", "guy"):
        excluded.append([{LEMMA: man}, {is_and: True}, {is_girl: True}])
        excluded.append(
            [{LEMMA: man}, {ORTH: "&"}, {ORTH: "&"}, {is_girl: True}])
    excluded.append([{LOWER: "she"}])

    for entity_id, patterns in ((1, multiples), (4, excluded)):
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)
    return matcher
def load_social_media_matcher(nlp):
    """Build the matcher for social-media handles.

    The platform names and separator characters are first pushed into the
    vocab with ``add_to_vocab`` (defined elsewhere); every lexeme whose
    lowercase id matches one of them then gets the corresponding custom flag.
    All patterns belong to the single entity ``"social_media"`` and are told
    apart by their integer ``label`` (1, 2, 4, 5, 6 -- label 3 is disabled).

    :param nlp: object exposing ``vocab`` (presumably a loaded spaCy pipeline).
    :return: the populated ``Matcher``.
    """
    platforms = ['twitter', 'facebook', 'instagram', 'wechat', 'line',
                 'snapchat']
    separators = [':', '-', '@']
    add_to_vocab(nlp, platforms)
    add_to_vocab(nlp, separators)

    is_separator = FLAG55
    is_social_media = FLAG54

    platform_ids = {nlp.vocab.strings[name.lower()] for name in platforms}
    separator_ids = {nlp.vocab.strings[mark.lower()] for mark in separators}
    for lexeme in nlp.vocab:
        lowered_id = lexeme.lower
        if lowered_id in platform_ids:
            lexeme.set_flag(is_social_media, True)
        if lowered_id in separator_ids:
            lexeme.set_flag(is_separator, True)

    # (label, token pattern) in registration order.
    labelled_patterns = (
        # "<platform> : [:] <handle>"
        (1, [{is_social_media: True},
             {is_separator: True},
             {is_separator: True, 'OP': '?'},
             {IS_ASCII: True}]),
        # "<platform> me [:] <noun>"
        (2, [{is_social_media: True},
             {LOWER: "me", TAG: "PRP"},
             {is_separator: True, "OP": '?'},
             {IS_ASCII: True, TAG: 'NN'}]),
        # Disabled in the original (was label 3):
        #   {is_social_media: True}, {LOWER: "me", TAG: "PRP"},
        #   {is_separator: True, "OP": '?'}, {IS_ASCII: False, "OP": "?"}
        # "<platform> id [is] [:] <handle>"
        (4, [{is_social_media: True},
             {LOWER: 'id'},
             {LOWER: 'is', 'OP': '?'},
             {is_separator: True, 'OP': '?'},
             {IS_ASCII: True}]),
        # "<platform> <noun> [is] to <verb> me"
        (5, [{is_social_media: True},
             {TAG: 'NN'},
             {LOWER: 'is', 'OP': '?'},
             {LOWER: 'to'},
             {TAG: 'VB'},
             {LOWER: 'me'}]),
        # "add <pronoun> on <platform> <noun>"
        (6, [{LOWER: 'add'},
             {TAG: 'PRP'},
             {LOWER: 'on'},
             {is_social_media: True},
             {TAG: 'NN'}]),
    )

    matcher = Matcher(nlp.vocab)
    matcher.add_entity("social_media")
    for label, tokens in labelled_patterns:
        matcher.add_pattern("social_media", tokens, label=label)
    return matcher
def test_add_empty_entity(en_vocab):
    """An entity registered without patterns adds no patterns and no matches.

    ``en_vocab`` is presumably a pytest fixture supplying a vocab.
    """
    matcher = Matcher(en_vocab)
    matcher.add_entity('TestEntity')

    assert matcher.n_patterns == 0

    doc = Doc(en_vocab, words=[u'Test', u'Entity'])
    assert matcher(doc) == []
def load_credit_card_matcher(nlp):
    """Build the rule-based matcher for payment-card mentions.

    Two custom lexeme flags are registered through ``set_flag`` (defined
    elsewhere): one for payment-method words and one for words that qualify
    "visa" in its travel-document sense.

    * Entity ``1``: two payment names joined by "/", ",", "&" or "and"
      (including the "m / card" and "american express" spellings), plus
      "american express", "diners club internacional", "union pay",
      "credit card" and "creditcard".
    * Entity ``2``: "accept ..." / "payment : ..." followed by a card name.
    * Entity ``3``: "at <payment>", "at m / card", "visa versa".
    * Entity ``4``: negated / "no" / "not ..." credit-card phrases,
      "<qualifier> visa", "visa student", rent-and-card phrases and
      "apply (for) visa" / "arrival visa".

    NOTE(review): the meaning of the entity ids is defined by the consumer of
    the matches, not here.

    :param nlp: object exposing ``vocab`` (presumably a loaded spaCy pipeline).
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    payment_words = [
        'visa', 'mastercard', 'masterc', 'mc', 'mcard', 'cash', 'csh',
        'discover', 'amex', 'interac', 'jcb',
    ]
    visa_qualifiers = [
        'us', 'american', 'canadian', 'student', 'online', 'transit', 'need',
        'make', 'f1', 'temp', 'temporary', 'permanent', 'visitor', 'visit',
        'visiting',
    ]
    is_payment, is_visa_type = FLAG40, FLAG41
    set_flag(nlp, payment_words, is_payment)
    set_flag(nlp, visa_qualifiers, is_visa_type)

    # Fragment factories: each call returns fresh token dicts.
    def paid():
        return [{is_payment: True}]

    def m_card():
        return [{LEMMA: "m"}, {ORTH: "/"}, {LEMMA: "card"}]

    def amex():
        return [{LOWER: "american"}, {LOWER: "express"}]

    def joiner(symbol):
        # "and" is matched by lemma, punctuation by exact text.
        if symbol == "and":
            return [{LEMMA: "and"}]
        return [{ORTH: symbol}]

    card_names = []
    for brand in (paid, m_card):
        # "<brand> / , and <brand>"
        card_names += [
            brand() + joiner(symbol) + brand()
            for symbol in ("/", ",", "and")
        ]
        # "<brand> / , & and american express"
        card_names += [
            brand() + joiner(symbol) + amex()
            for symbol in ("/", ",", "&", "and")
        ]
    card_names += [
        amex(),
        [{LEMMA: "diners"}, {LEMMA: "club"}, {LEMMA: "internacional"}],
        [{LOWER: "union"}, {LOWER: "pay"}],
        [{LEMMA: "credit"}, {LEMMA: "card"}],
        [{LEMMA: "creditcard"}],
    ]

    accepted = [
        [{LEMMA: "accept"}, {LEMMA: "card"}],
        [{LEMMA: "accept"}] + paid(),
        [{LEMMA: "accept"}] + m_card(),
        [{LEMMA: "accept"}, {ORTH: ":"}] + paid(),
        [{LEMMA: "payment"}, {ORTH: ":"}] + paid(),
        [{LEMMA: "accept"}, {ORTH: ":"}] + m_card(),
        [{LEMMA: "payment"}, {ORTH: ":"}] + m_card(),
    ]

    after_at = [
        [{LEMMA: "at"}] + paid(),
        [{LEMMA: "at"}] + m_card(),
        [{LEMMA: "visa"}, {LEMMA: "versa"}],
    ]

    excluded = [
        [{DEP: "neg"}, {LEMMA: "credit"}, {LEMMA: "card"}],
        [{LEMMA: "credit"}, {LEMMA: "card"}, {DEP: "neg"}],
        [{DEP: "neg"}, {LEMMA: "creditcard"}],
        [{LEMMA: "no"}, {LEMMA: "credit"}, {LEMMA: "card"}],
        [{LEMMA: "no"}, {LEMMA: "creditcard"}],
        [{LEMMA: "not"}, {IS_ASCII: True}, {LEMMA: "credit"},
         {LEMMA: "card"}],
        [{LEMMA: "not"}, {IS_ASCII: True}, {LEMMA: "creditcard"}],
        [{is_visa_type: True}, {LEMMA: "visa"}],
        [{LEMMA: "visa"}, {LEMMA: "student"}],
        [{LEMMA: "rent"}, {LEMMA: "and"}, {LEMMA: "visa"}],
        [{LEMMA: "rent"}, {LEMMA: "and"}, {LEMMA: "credit"}],
        [{LEMMA: "visa"}, {LEMMA: "and"}, {LEMMA: "rent"}],
        [{LEMMA: "card"}, {LEMMA: "and"}, {LEMMA: "rent"}],
        [{LEMMA: "apply", DEP: "ROOT"},
         {LEMMA: "for", DEP: "prep"},
         {LEMMA: "visa"}],
        [{LEMMA: "apply", DEP: "ROOT"}, {LEMMA: "visa"}],
        [{LEMMA: "arrival", DEP: "ROOT"}, {LEMMA: "visa"}],
    ]

    rules = ((1, card_names), (2, accepted), (3, after_at), (4, excluded))
    for entity_id, patterns in rules:
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)
    return matcher
def load_outcall_matcher(nlp):
    """Build the rule-based matcher for "outcall" mentions.

    The location words are pushed into the vocab with ``add_to_vocab``
    (defined elsewhere); lexemes whose lowercase id is "-", "&" or one of the
    location words then receive a custom flag.

    * Entity ``1``: "outcall" / "out call" / "out-call", "your <location>",
      "out and/& in call" and travel phrases ("visit you", "will travel"...).
    * Entity ``2``: the outcall forms followed by "only", and
      "your [<token>] <location>" with an ``amod`` or ``poss`` dependency.
    * Entity ``3``: a bare location word, "place", "be place", "is place".
    * Entity ``4``: "house wife/wives", "if ... have ..." constructions,
      "my [<token>] <location>", "no" / negated outcall forms,
      "visit your city", "<token> miss out" and "<dep token> no".

    The "house wife" exclusion is registered under both the lemma ``wife``
    (what the sibling incall matcher uses) and the original ``wives``
    spelling, and each pattern is registered exactly once.

    NOTE(review): the meaning of the entity ids is defined by the consumer of
    the matches, not here.

    :param nlp: object exposing ``vocab`` (presumably a loaded spaCy pipeline).
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    location = ['location', 'place', 'studio', 'apartment', 'home', 'house',
                'hotel']
    add_to_vocab(nlp, location)
    location_ids = {nlp.vocab.strings[s.lower()] for s in location}
    hyphen_id = nlp.vocab.strings['-']
    ampersand_id = nlp.vocab.strings['&']

    is_hyphen = FLAG23
    is_ampersand = FLAG24
    is_location = FLAG25
    for lexeme in nlp.vocab:
        if lexeme.lower == hyphen_id:
            lexeme.set_flag(is_hyphen, True)
        if lexeme.lower == ampersand_id:
            lexeme.set_flag(is_ampersand, True)
        if lexeme.lower in location_ids:
            lexeme.set_flag(is_location, True)

    # Fragment factories: each call returns fresh token dicts.
    def outcall_forms():
        return [
            [{LEMMA: "outcall"}],
            [{LEMMA: "out"}, {LEMMA: "call"}],
            [{LEMMA: "out"}, {is_hyphen: True}, {LEMMA: "call"}],
        ]

    def negation():
        return {IS_ASCII: True, DEP: "neg"}

    mentions = outcall_forms() + [
        [{LEMMA: "your"}, {is_location: True}],
        [{LEMMA: "out"}, {LEMMA: "and"}, {LEMMA: "in"}, {LEMMA: "call"}],
        [{LEMMA: "out"}, {is_ampersand: True}, {LEMMA: "in"},
         {LEMMA: "call"}],
        [{LEMMA: "visit"}, {LEMMA: "you"}],
        [{LEMMA: "mind"}, {LEMMA: "travel"}],
        [{LEMMA: "anywhere"}, {LEMMA: "and"}, {LEMMA: "everywhere"}],
        [{LEMMA: "prefer"}, {LEMMA: "residence"}],
        [{LEMMA: "prefer"}, {LEMMA: "hotel"}],
        [{LEMMA: "come"}, {LEMMA: "to"}, {LEMMA: "you"}],
        [{LEMMA: "will"}, {LEMMA: "travel"}],
    ]

    outcall_only = [form + [{LEMMA: "only"}] for form in outcall_forms()] + [
        [{LEMMA: "your", DEP: "amod"}, {is_location: True}],
        [{LEMMA: "your", DEP: "amod"}, {IS_ASCII: True},
         {is_location: True}],
        [{LEMMA: "your", DEP: "poss"}, {is_location: True}],
        [{LEMMA: "your", DEP: "poss"}, {IS_ASCII: True},
         {is_location: True}],
    ]

    bare_location = [
        [{is_location: True}],
        [{LEMMA: "place"}],
        [{LEMMA: "be"}, {LEMMA: "place"}],
        [{LEMMA: "is"}, {LEMMA: "place"}],
    ]

    excluded = [
        [{LEMMA: "house"}, {LEMMA: "wives"}],
        # Same exclusion under the regular lemma, as in the incall matcher.
        [{LEMMA: "house"}, {LEMMA: "wife"}],
        [{LEMMA: "if", DEP: "mark"}, {LEMMA: "have"},
         {IS_ASCII: True, DEP: "dobj"}],
        [{LEMMA: "if", DEP: "mark"}, {IS_ASCII: True}, {LEMMA: "have"},
         {IS_ASCII: True}, {IS_ASCII: True, DEP: "dobj"}],
        [{LEMMA: "if", DEP: "mark"}, {LEMMA: "have"}, {IS_ASCII: True},
         {IS_ASCII: True, DEP: "dobj"}],
        [{LEMMA: "my", DEP: "poss"}, {is_location: True}],
        [{LEMMA: "my", DEP: "poss"}, {IS_ASCII: True}, {is_location: True}],
    ]
    # "no <outcall>", "<neg> <outcall>", "<neg> <token> <outcall>"
    excluded += [[{LEMMA: "no"}] + form for form in outcall_forms()]
    excluded += [[negation()] + form for form in outcall_forms()]
    excluded += [
        [negation(), {IS_ASCII: True}] + form for form in outcall_forms()
    ]
    excluded += [
        [{LEMMA: "visit"}, {LEMMA: "your"}, {LEMMA: "city"}],
        [{IS_ASCII: True}, {LEMMA: "miss"}, {LEMMA: "out"}],
        [{IS_ASCII: True, DEP: "dep"}, {LEMMA: "no"}],
    ]

    rules = ((1, mentions), (2, outcall_only), (3, bare_location),
             (4, excluded))
    for entity_id, patterns in rules:
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)
    return matcher
def load_incall_matcher(nlp):
    """Build the rule-based matcher for "incall" mentions.

    Three word lists (locations, privacy adjectives, cleanliness adjectives)
    are pushed into the vocab with ``add_to_vocab`` (defined elsewhere);
    lexemes whose lowercase id is "-", "&" or one of those words then receive
    the corresponding custom flag.

    * Entity ``1``: "incall" / "in call" / "in-call", "in and/& out call",
      "visit i".
    * Entity ``2``: the incall forms followed by "only",
      "<private> [<token>] <location>", "<clean> <location>" and
      "my [<token>] <location>".
    * Entity ``3``: "location", "place", "be place", "is place".
    * Entity ``4``: "house wife", "your [<token>] <location>", "no" /
      negated incall forms, negated "have ..." and "if ... have ..."
      constructions.

    NOTE(review): the meaning of the entity ids is defined by the consumer of
    the matches, not here.

    :param nlp: object exposing ``vocab`` (presumably a loaded spaCy pipeline).
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    location = ['location', 'place', 'studio', 'apartment', 'home', 'house']
    private = ['private', 'discreet', 'discrete']
    clean = ['clean', 'nice', 'lovely']
    for word_list in (location, private, clean):
        add_to_vocab(nlp, word_list)

    def ids_of(words):
        return {nlp.vocab.strings[word.lower()] for word in words}

    location_ids = ids_of(location)
    private_ids = ids_of(private)
    clean_ids = ids_of(clean)
    hyphen_id = nlp.vocab.strings['-']
    ampersand_id = nlp.vocab.strings['&']

    is_hyphen = FLAG23
    is_ampersand = FLAG24
    is_location = FLAG25
    is_private = FLAG26
    is_clean = FLAG27

    # (lowercase ids, flag to raise) -- checked in this order per lexeme.
    flag_table = (
        ({hyphen_id}, is_hyphen),
        ({ampersand_id}, is_ampersand),
        (location_ids, is_location),
        (private_ids, is_private),
        (clean_ids, is_clean),
    )
    for lexeme in nlp.vocab:
        for lowered_ids, flag in flag_table:
            if lexeme.lower in lowered_ids:
                lexeme.set_flag(flag, True)

    # Fragment factories: each call returns fresh token dicts.
    def incall_forms():
        return [
            [{LEMMA: "incall"}],
            [{LEMMA: "in"}, {LEMMA: "call"}],
            [{LEMMA: "in"}, {is_hyphen: True}, {LEMMA: "call"}],
        ]

    def negation():
        return {IS_ASCII: True, DEP: "neg"}

    def filler():
        return {IS_ASCII: True}

    def place():
        return {is_location: True}

    def direct_object():
        return {IS_ASCII: True, DEP: "dobj"}

    mentions = incall_forms() + [
        [{LEMMA: "in"}, {LEMMA: "and"}, {LEMMA: "out"}, {LEMMA: "call"}],
        [{LEMMA: "in"}, {is_ampersand: True}, {LEMMA: "out"},
         {LEMMA: "call"}],
        [{LEMMA: "visit"}, {LEMMA: "i"}],
    ]

    incall_only = [form + [{LEMMA: "only"}] for form in incall_forms()] + [
        [{is_private: True, DEP: "amod"}, place()],
        [{is_private: True, DEP: "amod"}, filler(), place()],
        [{is_clean: True}, place()],
        [{LEMMA: "my", DEP: "poss"}, place()],
        [{LEMMA: "my", DEP: "poss"}, filler(), place()],
    ]

    bare_location = [
        [{LEMMA: "location"}],
        [{LEMMA: "place"}],
        [{LEMMA: "be"}, {LEMMA: "place"}],
        [{LEMMA: "is"}, {LEMMA: "place"}],
    ]

    excluded = [
        [{LEMMA: "house"}, {LEMMA: "wife"}],
        [{LOWER: "your", DEP: "poss"}, place()],
        [{LOWER: "your", DEP: "poss"}, filler(), place()],
    ]
    # "no <incall>", "<neg> <incall>", "<neg> <token> <incall>"
    excluded += [[{LEMMA: "no"}] + form for form in incall_forms()]
    excluded += [[negation()] + form for form in incall_forms()]
    excluded += [[negation(), filler()] + form for form in incall_forms()]
    excluded += [
        [negation(), {LEMMA: "have"}, direct_object()],
        [negation(), {LEMMA: "have"}, filler(), direct_object()],
        [{LEMMA: "if", DEP: "mark"}, {LEMMA: "have"}, direct_object()],
        [{LEMMA: "if", DEP: "mark"}, filler(), {LEMMA: "have"}, filler(),
         direct_object()],
        [{LEMMA: "if", DEP: "mark"}, {LEMMA: "have"}, filler(),
         direct_object()],
    ]

    rules = ((1, mentions), (2, incall_only), (3, bare_location),
             (4, excluded))
    for entity_id, patterns in rules:
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)
    return matcher
def load_derogatory_mentions_matcher(nlp):
    """Build the rule-based matcher for derogatory / abusive mentions.

    Two custom lexeme flags are registered through ``set_flag`` (defined
    elsewhere): one for the insult vocabulary and one for "your" / "ur".

    * Entity ``1``: a flagged insult, "expose/violate/... i|me" phrases,
      "piece of shit", "hardcore", "your <insult>", "your slave" and
      "i <insult>" constructions.
    * Entity ``3``: "like <insult>" and "bitch" used as a verb.
    * Entity ``4``: "to"/"nor"/"no"/"you" + insult, negated forms of the
      entity-1 phrases (with or without one intervening alphabetic token),
      "girl next door", "with my girl" and "look slave".

    Each pattern is registered exactly once.

    NOTE(review): the meaning of the entity ids is defined by the consumer of
    the matches, not here.

    :param nlp: object exposing ``vocab`` (presumably a loaded spaCy pipeline).
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    bitch = ['w***e', 'bitch', 'c**t', 'psycho', 's**t']
    your = ['your', 'ur']
    is_bitch = FLAG29
    is_your = FLAG30
    set_flag(nlp, bitch, is_bitch)
    set_flag(nlp, your, is_your)

    def negatable_targets():
        # Phrases that entity 4 recognises after a negation; fresh dicts on
        # every call.
        return [
            [{is_bitch: True}],
            [{LEMMA: "slave"}],
            [{LEMMA: "expose"}, {LOWER: "i"}],
            [{LEMMA: "expose"}, {LOWER: "me"}],
            [{LEMMA: "piece"}, {LOWER: "of"}, {LEMMA: "shit"}],
            [{is_your: True}, {is_bitch: True}],
        ]

    mentions = [
        [{is_bitch: True}],
        [{LEMMA: "expose"}, {LOWER: "i"}],
        [{LEMMA: "expose"}, {LOWER: "me"}],
        [{LEMMA: "violate"}, {LOWER: "i"}],
        [{LEMMA: "violate"}, {LOWER: "me"}],
        [{LEMMA: "f**k"}, {LOWER: "i"}],
        [{LEMMA: "f**k"}, {LOWER: "me"}],
        [{LOWER: "i", DEP: "nsubj"},
         {IS_ALPHA: True},
         {LEMMA: "violate", DEP: "xcomp"}],
        [{LEMMA: "piece"}, {LOWER: "of"}, {LEMMA: "shit"}],
        [{LOWER: "hardcore"}],
        [{is_your: True}, {is_bitch: True}],
        [{is_your: True}, {LEMMA: "slave"}],
        [{is_your: True, DEP: "poss"}, {is_bitch: True}],
        [{LOWER: "i", DEP: "nsubj"}, {is_bitch: True}],
        [{LOWER: "i", DEP: "nsubj"},
         {IS_ALPHA: True},
         {is_bitch: True, DEP: "xcomp"}],
    ]

    comparisons = [
        [{LOWER: "like"}, {is_bitch: True}],
        [{LEMMA: "bitch", POS: "VERB"}],
    ]

    excluded = [
        [{LOWER: "to"}, {is_bitch: True}],
        [{LOWER: "nor"}, {is_bitch: True}],
    ]
    # "<neg> <target>" followed by "<neg> <alpha token> <target>".
    excluded += [[{DEP: "neg"}] + target for target in negatable_targets()]
    excluded += [
        [{DEP: "neg"}, {IS_ALPHA: True}] + target
        for target in negatable_targets()
    ]
    excluded += [
        [{LEMMA: "girl"}, {LOWER: "next"}, {LEMMA: "door"}],
        [{LOWER: "with"}, {LOWER: "my"}, {LEMMA: "girl"}],
        [{LOWER: "no"}, {is_bitch: True}],
        [{DEP: "neg"}, {LOWER: "like"}, {is_bitch: True}],
        [{LEMMA: "look"}, {LEMMA: "slave", DEP: "prep"}],
        [{LOWER: "you"}, {is_bitch: True}],
        [{LOWER: "you"}, {POS: "ADJ"}, {is_bitch: True}],
    ]

    rules = ((1, mentions), (3, comparisons), (4, excluded))
    for entity_id, patterns in rules:
        matcher.add_entity(entity_id)
        for pattern in patterns:
            matcher.add_pattern(entity_id, pattern)
    return matcher
def load_movement_matcher(nlp):
    """Build the rule-based Matcher for movement / relocation phrases.

    Three lexeme flags are set while walking ``nlp.vocab``:

    * FLAG18 -- lower-cased form is one of the place words,
    * FLAG19 -- lower-cased form is one of the "girl" words,
    * FLAG20 -- the lexeme's ``prefix_`` string is uppercase.

    Patterns are then registered under four entity ids:
    1 positive, 2 strong positive, 3 negative, 4 strong negative.

    :param nlp: spaCy pipeline object; its ``vocab`` backs the matcher.
    :return: the populated ``Matcher``.
    """
    matcher = Matcher(nlp.vocab)

    place_words = ['area', 'place', 'city', 'town']
    girl_words = [
        'gal', 'girl', 's**t', 'cutie', 'hottie', 'lady', 'teen',
        'teenager', 'chick', 'staff', 'gf', 'she'
    ]
    # presumably makes sure these words have lexemes before the flag loop
    # below runs -- verify against add_to_vocab.
    add_to_vocab(nlp, place_words)
    add_to_vocab(nlp, girl_words)

    place_flag = FLAG18
    girl_flag = FLAG19
    upper_flag = FLAG20

    for lexeme in nlp.vocab:
        lowered = lexeme.lower_
        if lowered in place_words:
            lexeme.set_flag(place_flag, True)
        if lowered in girl_words:
            lexeme.set_flag(girl_flag, True)
        if lexeme.prefix_.isupper():
            lexeme.set_flag(upper_flag, True)

    def lemmas(*words):
        # One fresh {LEMMA: word} token spec per word, in order.
        return [{LEMMA: word} for word in words]

    # Positive Matcher Patterns
    positive = [
        lemmas("last", "night", "in", "town"),
        lemmas("leave") + [{IS_ASCII: True, ENT_TYPE: "DATE"}],
        lemmas("leave") + [{DEP: "partmod"}],
        lemmas("leave") + [{DEP: "quantmod"}],
        lemmas("leave") + [{IS_ASCII: True, ENT_TYPE: "TIME"}],
        lemmas("leave", "in") + [{IS_ASCII: True, ENT_TYPE: "DATE"}],
        lemmas("leave", "town"),
        lemmas("out", "of", "town"),
        [{LOWER: "outta"}] + lemmas("town"),
        lemmas("lastnight", "in", "town"),
        lemmas("back", "in", "town"),
        lemmas("just", "in", "town"),
        lemmas("day", "in", "town"),
        lemmas("in", "town", "tonight"),
        lemmas("in", "town", "through"),
        lemmas("in", "town", "until"),
        lemmas("in", "town", "for", "one", "night"),
        lemmas("in", "town", "for") + [{IS_DIGIT: True}] + lemmas("night"),
        lemmas("town") + [{LEMMA: "stay", DEP: "nmod"}],
        lemmas("town") + [{IS_ASCII: True}, {LEMMA: "stay", DEP: "nmod"}],
        lemmas("new", "girl", "in", "town"),
        lemmas("recent", "move"),
        lemmas("recently", "move"),
        lemmas("relocate"),
        [
            {LEMMA: "new", DEP: "amod"},
            {LEMMA: "city"},
            {LEMMA: "to", DEP: "dep"},
        ],
        [
            {LEMMA: "new", DEP: "amod"},
            {IS_ASCII: True},
            {LEMMA: "city"},
            {IS_ASCII: True},
            {LEMMA: "to", DEP: "dep"},
        ],
        lemmas("new", "to", "area"),
        lemmas("new", "to") + [{upper_flag: True}],
        lemmas("first", "visit", "to"),
        [{LEMMA: "i", DEP: "nsubj"}, {LEMMA: "arrive"}],
        [
            {LEMMA: "girl", DEP: "nsubj"},
            {LEMMA: "arrive"},
            {DEP: "partmod"},
        ],
        [
            {LEMMA: "girl", DEP: "nsubj"},
            {IS_ASCII: True},
            {LEMMA: "arrive"},
            {IS_ASCII: True},
            {DEP: "partmod"},
        ],
        [
            {LEMMA: "girl", DEP: "nsubj"},
            {LEMMA: "arrive"},
            {DEP: "quantmod"},
        ],
        [
            {LEMMA: "girl", DEP: "nsubj"},
            {IS_ASCII: True},
            {LEMMA: "arrive"},
            {IS_ASCII: True},
            {DEP: "quantmod"},
        ],
        lemmas("just", "arrive"),
        lemmas("on", "my", "way", "to") + [{TAG: "NNP"}],
        lemmas("on", "my", "way", "to") + [{TAG: "NN"}],
        lemmas("on", "the", "way"),
        lemmas("just", "get", "here"),
        lemmas("get", "here", "today"),
        lemmas("get", "here", "yesterday"),
        lemmas("get", "here", "last", "night"),
        [
            {LEMMA: "i", DEP: "nsubj"},
            {IS_ASCII: True},
            {LEMMA: "visit"},
            {IS_ASCII: True},
            {place_flag: True, DEP: "dobj"},
        ],
        [
            {LEMMA: "i", DEP: "nsubj"},
            {LEMMA: "visit"},
            {place_flag: True, DEP: "dobj"},
        ],
    ]

    # Strong Positive Matcher Patterns
    strong_positive = [
        lemmas("new") + [{IS_ASCII: True}] + lemmas("in")
        + [{place_flag: True}],
        lemmas("new") + [{IS_ASCII: True}, {IS_ASCII: True}] + lemmas("in")
        + [{place_flag: True}],
        lemmas("im", "new", "in", "town"),
        lemmas("new", "in") + [{place_flag: True}],
        lemmas("new", "to") + [{place_flag: True}],
        lemmas("new") + [{girl_flag: True}] + lemmas("in", "town"),
        lemmas("new", "to") + [{upper_flag: True}] + lemmas("area"),
    ]

    # Negative Matcher Patterns
    negative = [
        lemmas("new"),
        lemmas("girl", "in", "town"),
        lemmas("grand", "new"),
        lemmas("new", "at"),
        lemmas("new", "to", "business"),
        lemmas("new", "to", "industry"),
        lemmas("new", "to", "scenario"),
        [{LEMMA: "dream", DEP: "nsubj"}, {LEMMA: "arrive"}],
        [{LEMMA: "fantasy", DEP: "nsubj"}, {LEMMA: "arrive"}],
        [{LEMMA: "you", DEP: "nsubj"}, {LEMMA: "arrive"}],
        lemmas("area", "only"),
        [{upper_flag: True}] + lemmas("area"),
        [{LEMMA: "you", DEP: "nsubj"}, {LEMMA: "leave"}],
        [
            {LEMMA: "it", DEP: "dobj"},
            {LEMMA: "leave"},
            {IS_ASCII: True, DEP: "nmod", TAG: "TO"},
        ],
        [
            {LEMMA: "that", DEP: "dobj"},
            {LEMMA: "leave"},
            {IS_ASCII: True, DEP: "nmod", TAG: "TO"},
        ],
        lemmas("best", "move"),
        lemmas("next", "move"),
        lemmas("arrive") + [{IS_ASCII: True}, {IS_ASCII: True, DEP: "xcomp"}],
        lemmas("arrive") + [{IS_ASCII: True, DEP: "xcomp"}],
        lemmas("visit") + [{LEMMA: "sister", DEP: "dobj"}],
        lemmas("visit") + [{IS_ASCII: True}, {LEMMA: "sister", DEP: "dobj"}],
        lemmas("visit") + [{LEMMA: "family", DEP: "dobj"}],
        lemmas("visit") + [{IS_ASCII: True}, {LEMMA: "family", DEP: "dobj"}],
        [{LEMMA: "we", DEP: "poss"}, {LEMMA: "visit"}],
    ]

    # Strong Negative Matcher Patterns
    strong_negative = [
        lemmas("town", "girl"),
        lemmas("on", "the", "town"),
        lemmas("near", "town"),
        lemmas("down", "town"),
        lemmas("town", "hall"),
        lemmas("best", "in", "town"),
        lemmas("best") + [{IS_ASCII: True}] + lemmas("in", "town"),
        lemmas("best") + [{IS_ASCII: True}, {IS_ASCII: True}]
        + lemmas("in", "town"),
        lemmas("best", "in") + [{IS_ASCII: True}] + lemmas("town"),
        lemmas("best") + [{IS_ASCII: True}] + lemmas("in")
        + [{IS_ASCII: True}] + lemmas("town"),
        lemmas("best") + [{IS_ASCII: True}, {IS_ASCII: True}] + lemmas("in")
        + [{IS_ASCII: True}] + lemmas("town"),
        lemmas("not", "new", "in", "town"),
        lemmas("not", "new", "to", "town"),
        lemmas("not", "leave", "town"),
        [
            {LEMMA: "i", DEP: "nsubj"},
            {LEMMA: "leave"},
            {LEMMA: "you", DEP: "dobj"},
        ],
        lemmas("new", "but"),
        lemmas("new") + [{LEMMA: "backpage", DEP: "nmod", TAG: "TO"}],
        lemmas("new")
        + [{IS_ASCII: True}, {LEMMA: "backpage", DEP: "nmod", TAG: "TO"}],
        lemmas("new") + [{LEMMA: "bp", DEP: "nmod", TAG: "TO"}],
        lemmas("new")
        + [{IS_ASCII: True}, {LEMMA: "bp", DEP: "nmod", TAG: "TO"}],
        # DS
        lemmas("leave") + [{LEMMA: "message", DEP: "dobj"}],
        lemmas("leave") + [{LEMMA: "msg", DEP: "dobj"}],
        lemmas("leave") + [{LEMMA: "txt", DEP: "dobj"}],
        lemmas("leave") + [{LEMMA: "text", DEP: "dobj"}],
        lemmas("leave") + [{LEMMA: "impression", DEP: "dobj"}],
        lemmas("leave") + [{LEMMA: "voicemail", DEP: "dobj"}],
        lemmas("leave") + [{LEMMA: "smile", DEP: "dobj"}],
        lemmas("leave") + [{IS_ASCII: True}, {LEMMA: "message", DEP: "dobj"}],
        lemmas("leave") + [{IS_ASCII: True}, {LEMMA: "msg", DEP: "dobj"}],
        lemmas("leave") + [{IS_ASCII: True}, {LEMMA: "txt", DEP: "dobj"}],
        lemmas("leave") + [{IS_ASCII: True}, {LEMMA: "text", DEP: "dobj"}],
        lemmas("leave")
        + [{IS_ASCII: True}, {LEMMA: "impression", DEP: "dobj"}],
        lemmas("leave")
        + [{IS_ASCII: True}, {LEMMA: "voicemail", DEP: "dobj"}],
        lemmas("leave") + [{IS_ASCII: True}, {LEMMA: "smile", DEP: "dobj"}],
        lemmas("leave", "satisfied"),
        lemmas("leave") + [{LEMMA: "memory", DEP: "dobj"}],
        lemmas("leave") + [{IS_ASCII: True}, {LEMMA: "memory", DEP: "dobj"}],
        lemmas("leave", "you"),
        lemmas("leave", "u"),
        lemmas("leave", "with"),
        lemmas("leave", "a", "gentleman"),
        lemmas("or", "leave"),
        lemmas("or", "i", "leave"),
        lemmas("move", "on"),
        lemmas("i", "move", "like"),
        lemmas("arrive", "on", "time"),
        lemmas("can", "move"),
        # NOTE(review): "new but" is already registered earlier in this
        # list; the repeat is kept so match output is unchanged.
        lemmas("new", "but"),
        lemmas("on", "my", "way", "to") + [{TAG: "PRP"}],
        lemmas("u", "get", "here"),
        lemmas("you", "get", "here"),
        lemmas("go", "to", "town"),
        lemmas("new", "management"),
    ]

    # Register each group in its original order: entity first, then its
    # patterns.
    pattern_groups = (
        (1, positive),
        (2, strong_positive),
        (3, negative),
        (4, strong_negative),
    )
    for entity_id, patterns in pattern_groups:
        matcher.add_entity(entity_id)
        for token_specs in patterns:
            matcher.add_pattern(entity_id, token_specs)

    return matcher