def custom_tokenizer(nlp):
    inf = list(nlp.Defaults.infixes)
    # remove the hyphen-between-letters pattern from the default infix patterns
    # (note: the explicit list below supersedes this filtered copy)
    inf = [x for x in inf if '-|–|—|--|---|——|~' not in x]
    infix_re = compile_infix_regex(tuple(inf))

    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r'(?<=[0-9])[+\-\*^](?=[0-9-])',
            r'(?<=[{al}{q}])\.(?=[{au}{q}])'.format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
            ),
            # REMOVE: commented out regex that splits on hyphens between letters:
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            # EDIT: remove split on slash between letters, and add comma
            # r'(?<=[{a}0-9])[:<>=/](?=[{a}])'.format(a=ALPHA),
            r'(?<=[{a}0-9])[:<>=,](?=[{a}])'.format(a=ALPHA),
            # ADD: ampersand as an infix character except for the dual-upper FOO&FOO variant
            r'(?<=[{a}0-9])[&](?=[{al}0-9])'.format(a=ALPHA, al=ALPHA_LOWER),
            r'(?<=[{al}0-9])[&](?=[{a}0-9])'.format(a=ALPHA, al=ALPHA_LOWER),
        ]
    )
    infix_re = spacy.util.compile_infix_regex(infixes)
    return Tokenizer(nlp.vocab,
                     prefix_search=nlp.tokenizer.prefix_search,
                     suffix_search=nlp.tokenizer.suffix_search,
                     infix_finditer=infix_re.finditer,
                     token_match=nlp.tokenizer.token_match,
                     rules=nlp.Defaults.tokenizer_exceptions)
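# Usage sketch (an assumption, not part of the original snippet): attach the custom
# tokenizer and check how '&' is handled. With the rules above, '&' between lowercase
# letters becomes an infix, while the all-uppercase FOO&FOO variant is left alone and
# hyphenated words stay whole because the hyphen infix is commented out.
import spacy
nlp = spacy.load("en_core_web_sm")   # assumes the small English model is installed
nlp.tokenizer = custom_tokenizer(nlp)
print([t.text for t in nlp("salt&pepper and AT&T, well-known brands")])
# expected: 'salt&pepper' splits around '&'; 'AT&T' and 'well-known' remain single tokens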
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None, use_regex: bool = True) -> None:
    super().__init__(lazy=False)
    self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self.nlp = spacy.load('en_core_web_sm')
    if use_regex:
        # Add the extra patterns as whole regexes wrapped in tuples;
        # tuple(r'[...]') would split the pattern string into single characters.
        infix_re = compile_infix_regex(self.nlp.Defaults.infixes + (r'-', r'[/+=\(\)\[\]]'))
        prefix_re = compile_prefix_regex(self.nlp.Defaults.prefixes + (r'[\'\(\[]',))
        suffix_re = compile_suffix_regex(self.nlp.Defaults.suffixes + (r'[\.\+\)\]]',))
        self.nlp.tokenizer = Tokenizer(self.nlp.vocab,
                                       prefix_search=prefix_re.search,
                                       suffix_search=suffix_re.search,
                                       infix_finditer=infix_re.finditer,
                                       token_match=self.nlp.tokenizer.token_match)
def clean_text(txt):
    nlp = French()
    # Department codes are kept as strings so they can be compared with token text.
    listcode = [str(x + 45) for x in range(99)]
    postalcod = lambda dd, liscode: str(int(dd) * 1000) if dd in liscode else dd

    customize_remove_PUNCT = ['%']
    for w in customize_remove_PUNCT:
        nlp.vocab[w].is_punct = False
    customize_add_PUNCT = ['>', '=', '$', '™', 'eee', 'ee', 'e', 'EE', 'EEE', 'E', ':']
    for w in customize_add_PUNCT:
        nlp.vocab[w].is_punct = True

    reg = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
    list_infixes_defaults = list(nlp.Defaults.infixes)
    if reg in list_infixes_defaults:
        list_infixes_defaults.remove(reg)
    # modify the infix patterns so dd-dd-dd stays a single token
    infixes = list_infixes_defaults + [r"(?<=[0-9])[\+\*^](?=[0-9-])"]
    infix_re = compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer

    doc = nlp(txt)
    tokens = [
        postalcod(w.text.lower(), listcode) for w in doc
        if w.text != 'n'
        and not w.is_punct
        and not w.is_space
        and not (w.like_num and len(w.text) > 5)
        and not len(w.text) > 11
        and not w.is_quote
    ]
    return ' '.join(map(str, tokens))
def custom_tokenizer(self):
    """Define the tokenizer to be used.

    Uses the spaCy object loaded in self.nlp.

    Returns
    -------
    Tokenizer
        The prepared tokenizer.
    """
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ])
    infix_re = compile_infix_regex(infixes)
    return Tokenizer(self.nlp.vocab,
                     prefix_search=self.nlp.tokenizer.prefix_search,
                     suffix_search=self.nlp.tokenizer.suffix_search,
                     infix_finditer=infix_re.finditer,
                     token_match=self.nlp.tokenizer.token_match,
                     rules=self.nlp.Defaults.tokenizer_exceptions)
def custom_tokenizer(nlp):
    # add '\.|-|~' and remove '#' (default prefixes list)
    hashtag_index = nlp.Defaults.prefixes.index('#')
    _prefixes = list(nlp.Defaults.prefixes) + [r'^\.|^~|^-(?=\S)']
    del _prefixes[hashtag_index]

    # add '\.' and remove '#' (default suffixes list);
    # add the _api_invoc and _var regexes
    hashtag_index = nlp.Defaults.suffixes.index('#')
    _suffixes = list(nlp.Defaults.suffixes) + _api_invoc + _var + [r'\.$']
    del _suffixes[hashtag_index]

    # add '\(|\[' to split nested api calls, arrays etc. (default infixes list);
    # add the _hashtags regex
    _infixes = list(nlp.Defaults.infixes) + _hashtags + \
        [r'\(|\)|\[|\]|\{|\}|<|>|,|=|\+|-|:|;|\'|\"|\/|&|\?']

    # set up each regex using native spaCy util functions
    prefix_re = util.compile_prefix_regex(_prefixes)
    suffix_re = util.compile_suffix_regex(_suffixes)
    infix_re = util.compile_infix_regex(_infixes)

    _tokenizer_exceptions = nlp.Defaults.tokenizer_exceptions
    return Tokenizer(nlp.vocab, _tokenizer_exceptions,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=_protect.match)
def setup_tokenizer():
    '''Set up a tokenizer with specific rules so that words or numbers
    containing hyphens are not split.'''
    nlp = spacy.load('en_core_web_sm')
    # Default infixes
    inf = list(nlp.Defaults.infixes)
    # Remove the generic op between numbers or between a number and a hyphen
    inf.remove(r"(?<=[0-9])[+\-\*^](?=[0-9-])")
    # Convert inf to tuple
    inf = tuple(inf)
    # Add the removed rule back after subtracting the (?<=[0-9])-(?=[0-9]) pattern
    infixes = inf + tuple([r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"])
    # Remove the hyphen-between-letters rule
    infixes = [x for x in infixes if '-|–|—|--|---|——|~' not in x]
    infix_re = compile_infix_regex(infixes)
    nlp.tokenizer = Tokenizer(nlp.vocab,
                              prefix_search=nlp.tokenizer.prefix_search,
                              suffix_search=nlp.tokenizer.suffix_search,
                              infix_finditer=infix_re.finditer,
                              token_match=nlp.tokenizer.token_match,
                              rules=nlp.Defaults.tokenizer_exceptions)
    return nlp
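# Usage sketch (an assumption, not part of the original snippet): with the hyphen
# rules removed, hyphenated words and number ranges come out as single tokens.
nlp = setup_tokenizer()
print([t.text for t in nlp("A state-of-the-art model scored 3-5 points higher.")])
# expected: 'state-of-the-art' and '3-5' each remain one token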
def create_custom_tokenizer(nlp):
    from spacy import util
    from spacy.tokenizer import Tokenizer
    from spacy.lang.tokenizer_exceptions import TOKEN_MATCH

    prefixes = nlp.Defaults.prefixes + ('^<i>',)
    suffixes = nlp.Defaults.suffixes + ('</i>$',)

    # remove the tag symbols from prefixes and suffixes
    prefixes = list(prefixes)
    prefixes.remove('<')
    prefixes = tuple(prefixes)
    suffixes = list(suffixes)
    suffixes.remove('>')
    suffixes = tuple(suffixes)

    infixes = nlp.Defaults.infixes
    rules = nlp.Defaults.tokenizer_exceptions
    token_match = TOKEN_MATCH
    prefix_search = util.compile_prefix_regex(prefixes).search
    suffix_search = util.compile_suffix_regex(suffixes).search
    infix_finditer = util.compile_infix_regex(infixes).finditer
    return Tokenizer(nlp.vocab, rules=rules,
                     prefix_search=prefix_search,
                     suffix_search=suffix_search,
                     infix_finditer=infix_finditer,
                     token_match=token_match)
def custom_tokenizer(nlp):
    # Escape regex metacharacters: compile_infix_regex joins the entries with '|',
    # so an unescaped '(' or ')' would produce an invalid pattern.
    prefix_re = compile_prefix_regex(Language.Defaults.prefixes + (';', r'\*'))
    suffix_re = compile_suffix_regex(Language.Defaults.suffixes + (';', r'\*'))
    infix_re = compile_infix_regex(Language.Defaults.infixes +
                                   (r'\(', r'\)', '/', '-', ';', r'\*'))
    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
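# Usage sketch (assumption; relies on the same module-level spaCy imports as the
# snippet above): replace the pipeline's tokenizer so that '/', '-', ';', '*' and
# parentheses act as infixes.
import spacy
nlp = spacy.load("en_core_web_sm")   # any English pipeline should do
nlp.tokenizer = custom_tokenizer(nlp)
print([t.text for t in nlp("high/low risk(a)-b")])
# expected: the extra infixes split around '/', '(', ')' and '-'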
def _get_infix_regex(self): """ Custom infix tokenization rules :return: """ custom_infixes = [r'\[\]', r'(?<=[0-9])-(?=[0-9])', r'[!&:,()\*/-><]'] infix_re = compile_infix_regex( tuple(list(self.nlp.Defaults.infixes) + custom_infixes)) return infix_re
def _get_infix_regex(self): """ Custom infix tokenization rules :return: """ custom_infixes = ['.'] infix_re = compile_infix_regex( tuple(list(self.nlp.Defaults.infixes) + custom_infixes)) return infix_re
def _get_infix_regex(self):
    # Customize spaCy tokenization to NOT split words with hyphens
    # Source: https://spacy.io/usage/linguistic-features#native-tokenizers
    return compile_infix_regex(
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            # EDIT: commented out regex that splits on hyphens between letters:
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]).finditer
def customize_infixes():
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>/](?=[{a}])".format(a=ALPHA),
        ])
    infix_re = compile_infix_regex(infixes)
    return infix_re
def create_custom_tokenizer(nlp):
    prefixes = compile_prefix_regex(nlp.Defaults.prefixes)
    infixes = compile_infix_regex(nlp.Defaults.infixes)
    # custom_suffixes is expected to be defined at module level
    suffixes = compile_suffix_regex(
        tuple(list(nlp.Defaults.suffixes) + custom_suffixes))
    return Tokenizer(nlp.vocab,
                     rules=nlp.Defaults.tokenizer_exceptions,
                     prefix_search=prefixes.search,
                     infix_finditer=infixes.finditer,
                     suffix_search=suffixes.search,
                     token_match=None)
def my_nlp(model):
    nlp = spacy.load(model)
    # reg and patterns are expected to be defined at module level
    list_infixes_defaults = list(nlp.Defaults.infixes)
    if reg in list_infixes_defaults:
        list_infixes_defaults.remove(reg)
    # modify tokenizer infix patterns so dd-dd-dd stays a single token
    infixes = list_infixes_defaults + [r"(?<=[0-9])[\+\*^](?=[0-9-])"]
    infix_re = compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, before='ner')
    return nlp
def __init__(self): self.nlp = spacy.load("en_core_web_lg") infixes = ( LIST_ELLIPSES + LIST_ICONS + [ r"(?<=[0-9])[+\-\*^](?=[0-9-])", r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), # EDIT: commented out regex that splits on hyphens between letters: # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), ]) infix_re = compile_infix_regex(infixes) self.nlp.tokenizer.infix_finditer = infix_re.finditer
def extend_tokenizer(nlp, pref, inf, suf):
    pref = tuple(pref + list(nlp.Defaults.prefixes)) if pref else nlp.Defaults.prefixes
    suf = tuple(suf + list(nlp.Defaults.suffixes)) if suf else nlp.Defaults.suffixes
    inf = tuple(inf + list(nlp.Defaults.infixes)) if inf else nlp.Defaults.infixes
    tok = "^(?:" + "|".join([RE[r]["str"] for r in RE['tok_patterns']]) + ")$"
    return Tokenizer(
        nlp.vocab,
        rules=nlp.Defaults.tokenizer_exceptions,
        prefix_search=spacyUtil.compile_prefix_regex(pref).search,
        suffix_search=spacyUtil.compile_suffix_regex(suf).search,
        infix_finditer=spacyUtil.compile_infix_regex(inf).finditer,
        token_match=re.compile(tok).match)
def test_tokenizer_infix_prefix(en_vocab):
    # the prefix and suffix matches overlap in the suffix lookbehind
    infixes = ["±"]
    suffixes = ["%"]
    infix_re = compile_infix_regex(infixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
    )
    tokens = [t.text for t in tokenizer("±10%")]
    assert tokens == ["±10", "%"]
    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
    assert tokens == explain_tokens
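# The test above is written for pytest, where en_vocab is presumably a fixture that
# provides an English Vocab. Outside pytest it can be exercised directly with a blank
# pipeline (a sketch, assuming spaCy is installed):
import spacy
test_tokenizer_infix_prefix(spacy.blank("en").vocab)   # passes silently if the assertions hold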
def custom_tokenizer(nlp): infixes = (LIST_ELLIPSES + LIST_ICONS + [ r"(?<=[0-9])[+\-\*^](?=[0-9-])", r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA) ]) infix_re = compile_infix_regex(infixes) return Tokenizer(nlp.vocab, prefix_search=nlp.tokenizer.prefix_search, suffix_search=nlp.tokenizer.suffix_search, infix_finditer=infix_re.finditer, token_match=nlp.tokenizer.token_match, rules=nlp.Defaults.tokenizer_exceptions)
def spacy_nlp(nlp):
    customize_add_PUNCT = ['/', '=', '$', '|', '\\', '-']
    for w in customize_add_PUNCT:
        nlp.vocab[w].is_punct = True
    # modify tokenizer prefix and infix patterns
    prefixes = list(nlp.Defaults.prefixes) + ['/']
    prefixes_regex = compile_prefix_regex(prefixes)
    nlp.tokenizer.prefix_search = prefixes_regex.search
    infixes = list(nlp.Defaults.infixes) + [r'(?<=[0-9])[|/+\-\*^](?=[0-9-])']
    infix_re = compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer
    return nlp
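# Usage sketch (assumption): the extra '/' prefix and the digit-operator infix make
# leading slashes and operators between digits into split points.
import spacy
nlp = spacy_nlp(spacy.load("en_core_web_sm"))   # assumed English model
print([t.text for t in nlp("12/24 versus /path")])
# expected: '12/24' -> ['12', '/', '24'] and the leading '/' of '/path' becomes its own token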
def spacy_tokenizer(self, text_spacy):
    # modify tokenizer infix patterns
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            # EDIT: commented out regex that splits on hyphens between letters:
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ])
    infix_re = compile_infix_regex(infixes)
    self.nlp.tokenizer.infix_finditer = infix_re.finditer
    doc = self.nlp(text_spacy)
    return [(t.text, t.pos_) for t in doc]
def create_medspacy_tokenizer(nlp):
    """Generate a custom tokenizer that augments the default spaCy tokenizer
    for situations commonly seen in clinical text. This includes:
        * Punctuation infixes. For example, this allows the following examples
          to be tokenized more aggressively:
          "Patient complains of c/o" -> [..., 'c', '/', 'o']
          "chf+cp" -> ['chf', '+', 'cp']
    @param nlp: spaCy language model
    """
    # Augmenting the defaults with r'''[^a-z0-9]''' is not quite correct: we do not
    # want to break on uppercase letters, nor on all punctuation (periods).
    # infixes = nlp.Defaults.infixes + (r'''[^a-z0-9]''',)

    # Escape all the punctuation we want to allow to break up tokens.
    # Start from all Python punctuation...
    punctuation_chars = string.punctuation
    # ...and remove periods so that '1.5 mg' is not broken up into '1 . 5 mg'
    punctuation_chars = punctuation_chars.replace('.', '')

    infixes = nlp.Defaults.infixes + (r'''[{}]'''.format(re.escape(punctuation_chars)),)
    prefixes = nlp.Defaults.prefixes
    suffixes = nlp.Defaults.suffixes

    # compile
    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)

    # Default exceptions could be extended later
    tokenizer_exceptions = nlp.Defaults.tokenizer_exceptions.copy()

    # now create the tokenizer
    tokenizer = Tokenizer(
        nlp.vocab,
        tokenizer_exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    return tokenizer
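# Usage sketch (assumption): wire the tokenizer into a pipeline and check the clinical
# abbreviations mentioned in the docstring.
import spacy
nlp = spacy.load("en_core_web_sm")   # assumed base model
nlp.tokenizer = create_medspacy_tokenizer(nlp)
print([t.text for t in nlp("Patient c/o chf+cp, taking 1.5 mg daily")])
# expected: 'c/o' -> ['c', '/', 'o'], 'chf+cp' -> ['chf', '+', 'cp'], while '1.5' stays intact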
def custom_en_tokenizer(en_vocab):
    prefix_re = compile_prefix_regex(English.Defaults.prefixes)
    suffix_re = compile_suffix_regex(English.Defaults.suffixes)
    custom_infixes = [
        r"\.\.\.+",
        r"(?<=[0-9])-(?=[0-9])",
        r"[0-9]+(,[0-9]+)+",
        r"[\[\]!&:,()\*—–\/-]",
    ]
    infix_re = compile_infix_regex(custom_infixes)
    return Tokenizer(
        en_vocab,
        English.Defaults.tokenizer_exceptions,
        prefix_re.search,
        suffix_re.search,
        infix_re.finditer,
        token_match=None,
    )
def setup_spacy_parser():
    # disable the parser and NER components; only the tokenizer is customized here
    SPACY_PARSER = spacy.load('en', disable=['parser', 'ner'])
    # prefix_re = re.compile(r'''^[]''')
    # suffix_re = re.compile(r'''[]$''')
    # infix_re = re.compile(r'''''')
    # modify tokenizer infix patterns
    infixes = LIST_ELLIPSES + LIST_ICONS
    infix_re = compile_infix_regex(infixes)
    SPACY_PARSER.tokenizer.infix_finditer = infix_re.finditer
    SPACY_PARSER.tokenizer.add_special_case("``", [{"ORTH": "``"}])
    SPACY_PARSER.tokenizer.add_special_case("´´", [{"ORTH": "´´"}])
    # SPACY_PARSER.tokenizer.prefix_search = prefix_re.search
    # SPACY_PARSER.tokenizer.suffix_search = suffix_re.search
    return SPACY_PARSER
def _custom_tokenizer(nlp):
    inf = list(nlp.Defaults.infixes)  # Default infixes
    # Remove the generic op between numbers or between a number and a hyphen
    inf.remove(r"(?<=[0-9])[+\-\*^](?=[0-9-])")
    inf = tuple(inf)  # Convert inf to tuple
    # Add the removed rule back after subtracting the (?<=[0-9])-(?=[0-9]) pattern
    infixes = inf + tuple([r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"])
    # Remove the hyphen-between-letters rule
    infixes = [x for x in infixes if '-|–|—|--|---|——|~' not in x]
    infix_re = compile_infix_regex(infixes)
    return Tokenizer(nlp.vocab,
                     prefix_search=nlp.tokenizer.prefix_search,
                     suffix_search=nlp.tokenizer.suffix_search,
                     infix_finditer=infix_re.finditer,
                     token_match=nlp.tokenizer.token_match,
                     rules=nlp.Defaults.tokenizer_exceptions)
def custom_tokenizer(nlp):
    infixes = list(nlp.Defaults.infixes)
    # add custom tokenization cases:
    # for the case <word>-<word> (probably better left out for German?)
    # infixes.append(r'(?<=[{a}"])[-](?=[{a}])'.format(a=ALPHA))
    # for the case <number>-<word>
    infixes.append(r'(?<=[0-9])[-](?=[{a}])'.format(a=ALPHA))
    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    return Tokenizer(nlp.vocab,
                     rules=nlp.Defaults.tokenizer_exceptions,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
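# Usage sketch (assumption): the extra <number>-<word> infix is aimed at German text,
# e.g. with a pipeline like de_core_news_sm.
import spacy
nlp = spacy.load("de_core_news_sm")   # assumed German model
nlp.tokenizer = custom_tokenizer(nlp)
print([t.text for t in nlp("eine 3-jährige Garantie")])
# expected: '3-jährige' -> ['3', '-', 'jährige'], while word-word hyphens are untouched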
def custom_tokenizer(nlp, never_split):
    cls = nlp.Defaults
    rules = cls.tokenizer_exceptions
    token_match = cls.token_match
    prefix_search = (util.compile_prefix_regex(cls.prefixes).search
                     if cls.prefixes else None)
    suffix_search = (util.compile_suffix_regex(cls.suffixes).search
                     if cls.suffixes else None)
    infix_finditer = (util.compile_infix_regex(cls.infixes).finditer
                      if cls.infixes else None)
    vocab = nlp.vocab
    return Tokenizer(
        vocab,
        rules=rules,
        prefix_search=prefix_search,
        suffix_search=suffix_search,
        infix_finditer=infix_finditer,
        token_match=lambda x: token_match(x) or x in never_split,
    )
def add_special_tokenizer_cases(nlp: Language) -> Language: infix_re = compile_infix_regex( tuple(TOKENIZER_INFIXES + [ r"(?<=[{a}0-9])([()#\.]+|(-)+([->])+)(?=[{a}0-9])".format(a=ALPHA) ])) prefix_re = compile_prefix_regex(tuple(TOKENIZER_PREFIXES + [r'^[.-]+'])) suffix_re = compile_suffix_regex(tuple(TOKENIZER_SUFFIXES + [r'[.-]+$'])) nlp.tokenizer = Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer, token_match=None) for tok in [ '==', '+=', '-=', '*=', '/=', '%=', '!=', '<>', '->', '-->', '--', '---', TOK_VERSION ]: nlp.tokenizer.add_special_case(tok, [{ORTH: tok, NORM: tok, POS: X}]) return nlp
def customize_tokenizer(nlp):
    prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
    # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {
        k: v
        for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
        if not (len(k) == 2 and k[1] == ".")
    }
    new_tokenizer = Tokenizer(
        nlp.vocab,
        exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    nlp.tokenizer = new_tokenizer
def custom_tokenizer(nlp): infixes = (LIST_ELLIPSES + LIST_ICONS + [ r"(?<=[0-9])[+\-\*^](?=[0-9-])", r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}0-9\.])[:<>=()+—](?=[{a}0-9\.])".format(a=ALPHA), r"(?<=[A-Za-z]{2})/(?=[A-Za-z]{2})", r"(?:[{a}]\.)+ [{a}0-9]".format(a=ALPHA), ]) infix_re = compile_infix_regex(infixes) prefix_re = compile_prefix_regex(nlp.Defaults.prefixes + ('-', )) suffix_re = compile_suffix_regex(nlp.Defaults.suffixes + ('-', )) return Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer, token_match=nlp.tokenizer.token_match, rules=nlp.Defaults.tokenizer_exceptions)
def custom_tokenizer(self):
    '''Set up the custom tokenizer'''
    default_infix = self.nlp.Defaults.infixes
    default_prefix = self.nlp.Defaults.prefixes
    default_suffix = self.nlp.Defaults.suffixes

    prefix_list = ['mr', 'dr', 'mrs', 'prof', 'ms', 'mx']
    prefix_re_list = self._make_prefix_cases(prefix_list)

    all_infix_re = compile_infix_regex(default_infix)
    all_prefix_re = spacy.util.compile_prefix_regex(
        tuple(list(default_prefix) + prefix_re_list))
    all_suffix_re = compile_suffix_regex(default_suffix)

    return Tokenizer(self.nlp.vocab,
                     prefix_search=all_prefix_re.search,
                     suffix_search=all_suffix_re.search,
                     infix_finditer=all_infix_re.finditer,
                     token_match=None)