def custom_tokenizer(self, nlp):
    # Rebuild the default tokenizer from the language defaults
    # (note: the infix patterns must go through compile_infix_regex, not compile_prefix_regex).
    infix_re = compile_infix_regex(nlp.Defaults.infixes)
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             use_regex: bool = True) -> None:
    super().__init__(lazy=False)
    self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self.nlp = spacy.load('en_core_web_sm')
    if use_regex:
        # Each extra pattern is added as a one-element tuple; tuple(r'...') would
        # split the pattern string into individual characters.
        infix_re = compile_infix_regex(self.nlp.Defaults.infixes
                                       + (r'-',) + (r'[/+=\(\)\[\]]',))
        prefix_re = compile_prefix_regex(self.nlp.Defaults.prefixes + (r'[\'\(\[]',))
        suffix_re = compile_suffix_regex(self.nlp.Defaults.suffixes + (r'[\.\+\)\]]',))
        self.nlp.tokenizer = Tokenizer(
            self.nlp.vocab,
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=self.nlp.tokenizer.token_match)
def create_custom_tokenizer(nlp):
    from spacy import util
    from spacy.tokenizer import Tokenizer
    from spacy.lang.tokenizer_exceptions import TOKEN_MATCH
    prefixes = nlp.Defaults.prefixes + ('^<i>', )
    suffixes = nlp.Defaults.suffixes + ('</i>$', )
    # remove the bare tag symbols from prefixes and suffixes
    prefixes = list(prefixes)
    prefixes.remove('<')
    prefixes = tuple(prefixes)
    suffixes = list(suffixes)
    suffixes.remove('>')
    suffixes = tuple(suffixes)
    infixes = nlp.Defaults.infixes
    rules = nlp.Defaults.tokenizer_exceptions
    token_match = TOKEN_MATCH
    prefix_search = util.compile_prefix_regex(prefixes).search
    suffix_search = util.compile_suffix_regex(suffixes).search
    infix_finditer = util.compile_infix_regex(infixes).finditer
    return Tokenizer(nlp.vocab,
                     rules=rules,
                     prefix_search=prefix_search,
                     suffix_search=suffix_search,
                     infix_finditer=infix_finditer,
                     token_match=token_match)
def custom_tokenizer(nlp):
    # add '\.|-|~' and remove '#' (default prefixes list)
    hashtag_index = nlp.Defaults.prefixes.index('#')
    _prefixes = list(nlp.Defaults.prefixes) + [r'^\.|^~|^-(?=\S)']
    del _prefixes[hashtag_index]
    # add '\.' and remove '#' (default suffixes list)
    # add the _api_invoc and _var regexes
    hashtag_index = nlp.Defaults.suffixes.index('#')
    _suffixes = list(nlp.Defaults.suffixes) + _api_invoc + _var + [r'\.$']
    del _suffixes[hashtag_index]
    # add '\(|\[' etc. to split nested api calls, arrays and so on (default infixes list)
    # add the _hashtags regex
    _infixes = list(nlp.Defaults.infixes) + _hashtags + \
        [r'\(|\)|\[|\]|\{|\}|<|>|,|=|\+|-|:|;|\'|\"|\/|&|\?']
    # set up each regex using native spaCy util functions
    prefix_re = util.compile_prefix_regex(_prefixes)
    suffix_re = util.compile_suffix_regex(_suffixes)
    infix_re = util.compile_infix_regex(_infixes)
    _tokenizer_exceptions = nlp.Defaults.tokenizer_exceptions
    return Tokenizer(nlp.vocab,
                     _tokenizer_exceptions,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=_protect.match)
def __init__(self, nlp):
    punctnquotes = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
    infix_re = re.compile(punctnquotes)
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    nlp.tokenizer = Tokenizer(nlp.vocab,
                              prefix_search=prefix_re.search,
                              suffix_search=suffix_re.search,
                              infix_finditer=infix_re.finditer,
                              token_match=None)
    self.nlp = nlp
    self.d = 1  # WordNet distance
    self.estimators = 10
    self.crit = 'gini'
    self.max_f = 'auto'
    self.max_d = 1
    self.threshold = 0.3
    # set paths
    self.wordListCSVFile = 'essay_evaluation/Corpora/BritishWords_COCA_AmericanCounterparts.csv'
    self.fnameBritAmerWords = "essay_evaluation/Corpora/BritishWords_COCA_AmericanCounterparts.csv"
    self.word_embeddings_file = "essay_evaluation/Corpora/lexsub_word_embeddings"
    self.context_embeddings_file = "essay_evaluation/Corpora/lexsub_context_embeddings"
    self.fnameModel = "essay_evaluation/resources/subevalmodel"
def custom_tokenizer(nlp):
    infix_re = re.compile(r'''[?;‘’`“”"'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer)
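# A minimal usage sketch (illustrative, not part of the original source): the
# returned Tokenizer only takes effect once it is assigned to nlp.tokenizer.
# Assumes spaCy and the en_core_web_sm model are installed, and that re,
# spacy.tokenizer.Tokenizer and the spacy.util compile_*_regex helpers are
# imported at module level as the functions above expect.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp)
# With only quote/?;~ characters as infixes, hyphens and slashes are no longer
# split inside words.
print([t.text for t in nlp("A state-of-the-art tokenizer, isn't it?")])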
def _get_prefix_regex(self): """ Custom prefix tokenization rules :return: """ custom_prefixes = [r"""^[\[\("'\\/@]"""] all_prefixes_re = compile_prefix_regex( tuple(list(self.nlp.Defaults.prefixes) + custom_prefixes)) return all_prefixes_re
def custom_tokenizer(nlp):
    infix_re = re.compile(r'''[.\,\(\)\?\:\;\...\‘\’\`\“\”\"\'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
def _get_prefix_regex(self): """ Custom prefix tokenization rules :return: """ prefix = r""".""" all_prefixes_re = compile_prefix_regex( tuple(list(self.nlp.Defaults.prefixes) + [prefix])) return all_prefixes_re
def custom_tokenizer(nlp):
    # regex metacharacters are escaped so they match literally when joined with '|'
    prefix_re = compile_prefix_regex(Language.Defaults.prefixes + (';', r'\*'))
    suffix_re = compile_suffix_regex(Language.Defaults.suffixes + (';', r'\*'))
    infix_re = compile_infix_regex(Language.Defaults.infixes +
                                   (r'\(', r'\)', '/', '-', ';', r'\*'))
    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
def custom_tokenizer(nlp):
    # We create our own tokenizer to avoid splitting hyphenated words.
    infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
def custom_tokenizer(nlp):
    infix_re = re.compile(r'''[?;‘’`“”"'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    tokenizer = Tokenizer(nlp.vocab)
    tokenizer.prefix_search = prefix_re.search
    tokenizer.suffix_search = suffix_re.search
    tokenizer.infix_finditer = infix_re.finditer
    tokenizer.token_match = None
    return tokenizer
def replace_infix_rules(nlp):
    """
    Returns a tokenizer with empty infix rules and no tokenizer exceptions, so
    the pipeline no longer splits apart most tokens, e.g. contractions,
    hyphenations, honorifics, etc.
    """
    return Tokenizer(
        nlp.vocab,
        prefix_search=compile_prefix_regex(nlp.Defaults.prefixes).search,
        suffix_search=compile_suffix_regex(nlp.Defaults.suffixes).search,
        infix_finditer=lambda x: iter(()),
        rules={})
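# Illustrative sketch (not from the original source): with the exception rules and
# infixes removed, a contraction such as "don't" is expected to stay as one token
# instead of being split into "do" + "n't". Assumes spaCy, an installed
# en_core_web_sm model, and the Tokenizer/compile_*_regex imports used above.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = replace_infix_rules(nlp)
print([t.text for t in nlp("I don't think so.")])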
def create_custom_tokenizer(nlp):
    prefixes = compile_prefix_regex(nlp.Defaults.prefixes)
    infixes = compile_infix_regex(nlp.Defaults.infixes)
    suffixes = compile_suffix_regex(
        tuple(list(nlp.Defaults.suffixes) + custom_suffixes))
    return Tokenizer(nlp.vocab,
                     rules=nlp.Defaults.tokenizer_exceptions,
                     prefix_search=prefixes.search,
                     infix_finditer=infixes.finditer,
                     suffix_search=suffixes.search,
                     token_match=None)
def custom_tokenizer(nlp):
    infix_re = re.compile(
        r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'\(\)\[\]\{\}\*\%\^\+\-\=\<\>\|\!(//)(\n)(\t)~]'''
    )
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
def keep_hyphen_tokenizer(nlp: Language) -> Tokenizer:
    infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    return Tokenizer(
        nlp.vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=None,
    )
def custom_tokenizer(self, nlp): """ Custom tokeniser that does not split on dashes. Useful for names (e.g. Hennis-Plasschaert). """ infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''') prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) return Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer, token_match=None)
def custom_tokenizer(nlp): """ custom spacy tokenizer for maintaining hyphenated words """ infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''') prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) return Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer, token_match=None)
def extend_tokenizer(nlp, pref, inf, suf):
    pref = tuple(pref + list(nlp.Defaults.prefixes)) if pref else nlp.Defaults.prefixes
    suf = tuple(suf + list(nlp.Defaults.suffixes)) if suf else nlp.Defaults.suffixes
    inf = tuple(inf + list(nlp.Defaults.infixes)) if inf else nlp.Defaults.infixes
    tok = "^(?:" + "|".join([RE[r]["str"] for r in RE['tok_patterns']]) + ")$"
    return Tokenizer(
        nlp.vocab,
        rules=nlp.Defaults.tokenizer_exceptions,
        prefix_search=spacyUtil.compile_prefix_regex(pref).search,
        suffix_search=spacyUtil.compile_suffix_regex(suf).search,
        infix_finditer=spacyUtil.compile_infix_regex(inf).finditer,
        token_match=re.compile(tok).match)
def create_custom_tokenizer(nlp):
    infixes = (r"\<[\w\/]*\>",) + nlp.Defaults.infixes
    prefixes = (r"\<[\w\/]*\>",) + nlp.Defaults.prefixes
    suffixes = (r"\<[\w\/]*\>",) + nlp.Defaults.suffixes
    infix_re = spacy.util.compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)
    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=nlp.tokenizer.token_match,
                     rules=nlp.Defaults.tokenizer_exceptions)
def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
    # the prefix and suffix matches overlap in the suffix lookbehind
    prefixes = ["a(?=.)"]
    suffixes = [r"(?<=\w)\.", r"(?<=a)\d+\."]
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
    )
    tokens = [t.text for t in tokenizer("a10.")]
    assert tokens == ["a", "10", "."]
    explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
    assert tokens == explain_tokens
def spacy_nlp(nlp):
    customize_add_PUNCT = ['/', '=', '$', '|', '\\', "-"]
    for w in customize_add_PUNCT:
        nlp.vocab[w].is_punct = True
    # extend the tokenizer prefix patterns with '/'
    prefixes = list(nlp.Defaults.prefixes) + ['/']
    prefixes_regex = compile_prefix_regex(prefixes)
    nlp.tokenizer.prefix_search = prefixes_regex.search
    # extend the infix patterns to split operators between digits
    infixes = list(nlp.Defaults.infixes) + ['(?<=[0-9])[|\/+\\-\\*^](?=[0-9-])']
    infix_re = compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer
    return nlp
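# Illustrative sketch (not from the original source): after spacy_nlp() adds '/' as
# a prefix and as an infix between digits, an expression like "12/42" should be
# split into '12', '/', '42' instead of staying one token. Assumes spaCy and
# en_core_web_sm are installed and the compile_*_regex helpers are imported.
import spacy

nlp = spacy_nlp(spacy.load("en_core_web_sm"))
print([t.text for t in nlp("The score was 12/42 overall.")])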
def custom_tokenizer_modified(nlp):
    # spaCy defaults: when the standard behaviour is required, they
    # need to be included when subclassing the tokenizer
    infix_re = re.compile(r'''[.\,\?\!\:\...\‘\’\`\“\”\"\'\/~]''')
    extended_prefixes = tuple(list(nlp.Defaults.prefixes) + ["-"])
    prefix_re = compile_prefix_regex(extended_prefixes)
    extended_suffixes = tuple(list(nlp.Defaults.suffixes) + ["-"])
    suffix_re = compile_suffix_regex(extended_suffixes)
    # use the default URL regex as the token_match so URLs stay single tokens
    url = URL_PATTERN
    url_re = re.compile(url)
    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=url_re.match)
def create_medspacy_tokenizer(nlp):
    """Generates a custom tokenizer to augment the default spacy tokenizer
    for situations commonly seen in clinical text. This includes:
        * Punctuation infixes.
          For example, this allows the following examples to be more
          aggressively tokenized as:
            "Patient complains of c/o" -> [..., 'c', '/', 'o']
            "chf+cp" -> ['chf', '+', 'cp']
    @param nlp: Spacy language model
    """
    # augment the defaults
    # this is not quite correct: we do not want to break on uppercase and we do
    # not want to break on all punctuation (periods)
    # infixes = nlp.Defaults.infixes + (r'''[^a-z0-9]''',)
    # escape all the punctuation we want to allow to break up tokens;
    # get all python punctuation
    punctuation_chars = string.punctuation
    # remove periods so that we do not break up '1.5 mg' into '1 . 5 mg'
    punctuation_chars = punctuation_chars.replace('.', '')
    infixes = nlp.Defaults.infixes + (r'''[{}]'''.format(re.escape(punctuation_chars)), )
    prefixes = nlp.Defaults.prefixes
    suffixes = nlp.Defaults.suffixes
    # compile
    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)
    # default exceptions could be extended later
    tokenizer_exceptions = nlp.Defaults.tokenizer_exceptions.copy()
    # now create the tokenizer
    tokenizer = Tokenizer(
        nlp.vocab,
        tokenizer_exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    return tokenizer
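# Illustrative sketch (not from the original source): check the docstring's
# "chf+cp" example with the clinical tokenizer attached. Assumes spaCy and
# en_core_web_sm are installed and that string, re and the spacy.util
# compile_*_regex helpers are imported as the function expects.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = create_medspacy_tokenizer(nlp)
print([t.text for t in nlp("chf+cp")])  # expected along the lines of ['chf', '+', 'cp']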
def custom_en_tokenizer(en_vocab):
    prefix_re = compile_prefix_regex(English.Defaults.prefixes)
    suffix_re = compile_suffix_regex(English.Defaults.suffixes)
    custom_infixes = [
        r"\.\.\.+",
        r"(?<=[0-9])-(?=[0-9])",
        r"[0-9]+(,[0-9]+)+",
        r"[\[\]!&:,()\*—–\/-]",
    ]
    infix_re = compile_infix_regex(custom_infixes)
    return Tokenizer(
        en_vocab,
        English.Defaults.tokenizer_exceptions,
        prefix_re.search,
        suffix_re.search,
        infix_re.finditer,
        token_match=None,
    )
def add_special_tokenizer_cases(nlp: Language) -> Language:
    infix_re = compile_infix_regex(
        tuple(TOKENIZER_INFIXES + [
            r"(?<=[{a}0-9])([()#\.]+|(-)+([->])+)(?=[{a}0-9])".format(a=ALPHA)
        ]))
    prefix_re = compile_prefix_regex(tuple(TOKENIZER_PREFIXES + [r'^[.-]+']))
    suffix_re = compile_suffix_regex(tuple(TOKENIZER_SUFFIXES + [r'[.-]+$']))
    nlp.tokenizer = Tokenizer(nlp.vocab,
                              prefix_search=prefix_re.search,
                              suffix_search=suffix_re.search,
                              infix_finditer=infix_re.finditer,
                              token_match=None)
    for tok in ['==', '+=', '-=', '*=', '/=', '%=', '!=', '<>', '->', '-->',
                '--', '---', TOK_VERSION]:
        nlp.tokenizer.add_special_case(tok, [{ORTH: tok, NORM: tok, POS: X}])
    return nlp
def customize_tokenizer(nlp):
    prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
    # remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {
        k: v
        for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
        if not (len(k) == 2 and k[1] == ".")
    }
    new_tokenizer = Tokenizer(
        nlp.vocab,
        exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    nlp.tokenizer = new_tokenizer
def custom_tokenizer(nlp, never_split):
    cls = nlp.Defaults
    rules = cls.tokenizer_exceptions
    token_match = cls.token_match
    prefix_search = (util.compile_prefix_regex(cls.prefixes).search
                     if cls.prefixes else None)
    suffix_search = (util.compile_suffix_regex(cls.suffixes).search
                     if cls.suffixes else None)
    infix_finditer = (util.compile_infix_regex(cls.infixes).finditer
                      if cls.infixes else None)
    vocab = nlp.vocab
    return Tokenizer(
        vocab,
        rules=rules,
        prefix_search=prefix_search,
        suffix_search=suffix_search,
        infix_finditer=infix_finditer,
        token_match=lambda x: token_match(x) or x in never_split,
    )
def custom_tokenizer(nlp):
    infixes = list(nlp.Defaults.infixes)
    # add custom tokenization cases:
    # for the case <word>-<word> --> probably better to leave out for German
    # infixes.append(r'(?<=[{a}"])[-](?=[{a}])'.format(a=ALPHA))
    # for the case <number>-<word>
    infixes.append(r'(?<=[0-9])[-](?=[{a}])'.format(a=ALPHA))
    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    return Tokenizer(nlp.vocab,
                     rules=nlp.Defaults.tokenizer_exceptions,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
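# Illustrative sketch (not from the original source): with the <number>-<word>
# infix added, a compound such as "3-fach" should be split into '3', '-', 'fach'.
# Assumes spaCy with the German de_core_news_sm model installed and ALPHA imported
# from spacy.lang.char_classes, as the function above expects.
import spacy

nlp = spacy.load("de_core_news_sm")
nlp.tokenizer = custom_tokenizer(nlp)
print([t.text for t in nlp("Eine 3-fach gesicherte Tür.")])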
def custom_tokenizer(self, nlp): """ Custom tokenizer: - won't -> won't before: - won't -> wo + n't Afterwards all words like won't are compared to a dictionary containing all possibilities. the option before (won't -> wo + n't) cannot detect if "wo" was an actual word or a part from won't. :param nlp: :return: """ infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"~]''') prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) return Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer, token_match=None)
def test_en_tokenizer_splits_pre_punct_regex(text, punct):
    en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
    match = en_search_prefixes(text)
    assert match.group() == punct