class KlingonDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "klingon"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
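# A minimal sketch (not part of the snippet above) of how a Defaults class like
# this is typically wired into a spaCy v2 Language subclass; the class name
# "Klingon" is illustrative.
class Klingon(Language):
    lang = "klingon"
    Defaults = KlingonDefaults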
class TurkishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    # lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "tr"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
class PortugueseDefaults(spacy.lang.pt.Portuguese.Defaults):
    lex_attr_getters = dict(spacy.lang.pt.Portuguese.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'pt'  # language ISO code
    # optional: replace flags with custom functions, e.g. like_num()
    lex_attr_getters.update(LEX_ATTRS)
    # merge base exceptions and custom tokenizer exceptions
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
class ChineseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'zh'  # for pickling
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
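# A minimal sketch (an assumption, not shown in the snippet above) of how these
# Defaults are usually paired with a make_doc() override in the spaCy v2 style,
# delegating word segmentation to the jieba library (assumed installed).
from spacy.tokens import Doc

class Chinese(Language):
    lang = "zh"
    Defaults = ChineseDefaults

    def make_doc(self, text):
        import jieba  # third-party segmenter; assumed available
        words = [w for w in jieba.cut(text, cut_all=False) if w]
        return Doc(self.vocab, words=words, spaces=[False] * len(words))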
class SerbianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "srp"
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB" }], "isn't": [{ ORTH: "is", LEMMA: "be" }, { ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB" }] } TOKENIZER_EXCEPTIONS = update_exc(TOKENIZER_EXCEPTIONS) # updating the stopset calfresh_stopwords = { "Calfresh", "CalFresh", "calfresh", "CALFRESH", "foodstamps", "sar7", "sar", "sr7", "sr", "SAR7", "SR7", "SAR", "SR", "Sar", "Sar7", "ebt" } calfresh_placeholders = { "PERSON", "ORG", "GPE", "LOC", "DATE", "MONEY", "CARDINAL" } stopset = STOP_WORDS.update(calfresh_stopwords, calfresh_placeholders) regex = re.compile(r'\W|\d', flags=re.UNICODE) def clean_words(text):
def __init__(self, batch_size, n_cpus, n_threads, mode):
    print('loading model...', end=' ')
    self.nlp = english_model.load()
    self.nlp.remove_pipe('tagger')
    self.nlp.remove_pipe('ner')

    # Split on all punctuation except '.', plus the '[**' and '**]' markers.
    punct = list(string.punctuation)
    punct.remove('.')
    punct.append('[**')
    punct.append('**]')
    punct = [re.escape(p) for p in punct]
    prefixes_custom = tuple(punct)
    infixes_custom = tuple(punct)
    suffixes_custom = tuple(punct)
    # prefixes_custom = tuple([r'\[\*\*', r'('])
    # suffixes_custom = tuple([r'\*\*\]', r')'])
    # infixes_custom = tuple([r'\[\*\*', r'\*\*\]', r'(', r')', r'>', r'<', r'->', r'-->', r'--->'])

    # Merge the default tokenizer exceptions with the custom patterns.
    exceptions_custom = {
        id: pattern for id, pattern in tokenizer_utils.generate_matcher_pattern1()
    }
    exceptions = update_exc(self.nlp.Defaults.tokenizer_exceptions, exceptions_custom)

    prefix_re = compile_prefix_regex(self.nlp.Defaults.prefixes + prefixes_custom)
    infix_re = compile_infix_regex(infixes_custom + self.nlp.Defaults.infixes)
    suffix_re = compile_suffix_regex(self.nlp.Defaults.suffixes + suffixes_custom)

    tokenizer = SpacyTokenizer(self.nlp.vocab, rules=exceptions,
                               prefix_search=prefix_re.search,
                               suffix_search=suffix_re.search,
                               infix_finditer=infix_re.finditer,
                               token_match=self.nlp.Defaults.token_match)
    self.nlp.tokenizer = tokenizer

    matcher = Matcher(self.nlp.vocab)

    def on_match_pattern(matcher, doc, id, matches):
        # Merge the matched span(s) into single tokens, then shift the offsets
        # of the remaining matches to account for the tokens removed by merge().
        match_id, start, end = matches[id]
        if self.nlp.vocab.strings[match_id].startswith('p3'):
            span = doc[start + 1:end]
            span.merge()
            for i in range(id, len(matches)):
                matches[i] = (matches[i][0], matches[i][1] - 1, matches[i][2] - 1)
        elif self.nlp.vocab.strings[match_id].startswith('p2.1'):
            span1 = doc[start:start + 2]
            span2 = doc[start + 2:end]
            span1.merge()
            span2.merge()
            for i in range(id, len(matches)):
                matches[i] = (matches[i][0], matches[i][1] - 2, matches[i][2] - 2)
        elif self.nlp.vocab.strings[match_id].startswith('p2.2'):
            span2 = doc[start + 1:end]
            span2.merge()
            for i in range(id, len(matches)):
                matches[i] = (matches[i][0], matches[i][1] - 1, matches[i][2] - 1)
        elif self.nlp.vocab.strings[match_id].startswith('p2.3'):
            span1 = doc[start:start + 2]
            span1.merge()
            for i in range(id, len(matches)):
                matches[i] = (matches[i][0], matches[i][1] - 1, matches[i][2] - 1)

    for id, pattern in tokenizer_utils.generate_matcher_pattern2():
        matcher.add(id, on_match_pattern, pattern)
    for id, pattern in tokenizer_utils.generate_matcher_pattern3():
        matcher.add(id, on_match_pattern, pattern)
    self.nlp.add_pipe(matcher, before='parser')
    print('done')

    self.batch_size = batch_size
    self.n_cpus = n_cpus
    self.n_threads = n_threads
    self.mode = mode
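# A minimal usage sketch, assuming the __init__ above belongs to a pipeline
# wrapper class (here called "Pipeline" for illustration; the constructor
# arguments and the mode value are placeholders, not taken from the source).
pipeline = Pipeline(batch_size=32, n_cpus=4, n_threads=8, mode='default')
doc = pipeline.nlp("Pt seen in [**Hospital**] on [**2019-01-01**].")
print([token.text for token in doc])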