Example No. 1
    def custom_tokenizer(self, nlp):
        infix_re = compile_infix_regex(nlp.Defaults.infixes)
        prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
        suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

        return Tokenizer(nlp.vocab,
                         prefix_search=prefix_re.search,
                         suffix_search=suffix_re.search,
                         infix_finditer=infix_re.finditer,
                         token_match=None)
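Since this is a method on a wrapper class, a minimal self-contained sketch of the same idea (assuming the en_core_web_sm model is installed) simply rebuilds the default affix rules and attaches the resulting Tokenizer to the pipeline:

import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex

nlp = spacy.load("en_core_web_sm")  # assumed model; any pipeline works

# rebuild the default prefix/suffix/infix rules, dropping tokenizer
# exceptions and token_match, just as the method above does
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = compile_infix_regex(nlp.Defaults.infixes)

nlp.tokenizer = Tokenizer(nlp.vocab,
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer,
                          token_match=None)

print([t.text for t in nlp("A quick (sanity) check.")])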
Example No. 2
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 use_regex: bool = True) -> None:
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }

        self.nlp = spacy.load('en_core_web_sm')

        if use_regex:
            # each custom pattern is added as a one-element tuple so the whole
            # character class stays intact (tuple(str) would split it into chars)
            infix_re = compile_infix_regex(self.nlp.Defaults.infixes +
                                           (r'-', r'[/+=\(\)\[\]]'))
            prefix_re = compile_prefix_regex(self.nlp.Defaults.prefixes +
                                             (r'[\'\(\[]', ))
            suffix_re = compile_suffix_regex(self.nlp.Defaults.suffixes +
                                             (r'[\.\+\)\]]', ))

            self.nlp.tokenizer = Tokenizer(
                self.nlp.vocab,
                prefix_search=prefix_re.search,
                suffix_search=suffix_re.search,
                infix_finditer=infix_re.finditer,
                token_match=self.nlp.tokenizer.token_match)
Example No. 3
def create_custom_tokenizer(nlp):
    from spacy import util
    from spacy.tokenizer import Tokenizer
    from spacy.lang.tokenizer_exceptions import TOKEN_MATCH
    prefixes = nlp.Defaults.prefixes + ('^<i>', )
    suffixes = nlp.Defaults.suffixes + ('</i>$', )
    # remove the tag symbols from prefixes and suffixes
    prefixes = list(prefixes)
    prefixes.remove('<')
    prefixes = tuple(prefixes)
    suffixes = list(suffixes)
    suffixes.remove('>')
    suffixes = tuple(suffixes)
    infixes = nlp.Defaults.infixes
    rules = nlp.Defaults.tokenizer_exceptions
    token_match = TOKEN_MATCH
    prefix_search = util.compile_prefix_regex(prefixes).search
    suffix_search = util.compile_suffix_regex(suffixes).search
    infix_finditer = util.compile_infix_regex(infixes).finditer
    return Tokenizer(nlp.vocab,
                     rules=rules,
                     prefix_search=prefix_search,
                     suffix_search=suffix_search,
                     infix_finditer=infix_finditer,
                     token_match=token_match)
Example No. 4
def custom_tokenizer(nlp):
    # add '\.|-|~' and remove '#' (default prefixes list)
    hashtag_index = nlp.Defaults.prefixes.index('#')
    _prefixes = list(nlp.Defaults.prefixes) + [r'^\.|^~|^-(?=\S)']
    del _prefixes[hashtag_index]
    # add '\.' and remove '#' (default suffixes list)
    # add the _api_invoc and _var regexes
    hashtag_index = nlp.Defaults.suffixes.index('#')
    _suffixes = list(nlp.Defaults.suffixes) + _api_invoc + _var + [r'\.$']
    del _suffixes[hashtag_index]
    # add '\(|\[' to split nested api calls, arrays etc (default infixes list)
    # add _hashtags regex
    _infixes = list(nlp.Defaults.infixes) + _hashtags + \
        [r'\(|\)|\[|\]|\{|\}|<|>|,|=|\+|-|:|;|\'|\"|\/|&|\?']
    # setup each regex using native spaCy util functions
    prefix_re = util.compile_prefix_regex(_prefixes)
    suffix_re = util.compile_suffix_regex(_suffixes)
    infix_re = util.compile_infix_regex(_infixes)
    _tokenizer_exceptions = nlp.Defaults.tokenizer_exceptions
    return Tokenizer(nlp.vocab,
                     _tokenizer_exceptions,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=_protect.match)
Example No. 5
    def __init__(self, nlp):
        punctnquotes = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
        # join the patterns with '|' so the regex matches any one of the marks
        # (compiling the space-separated string directly would only match that
        #  exact literal sequence)
        infix_re = re.compile('|'.join(punctnquotes.split()))
        prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
        suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

        nlp.tokenizer = Tokenizer(nlp.vocab,
                                  prefix_search=prefix_re.search,
                                  suffix_search=suffix_re.search,
                                  infix_finditer=infix_re.finditer,
                                  token_match=None)

        self.nlp = nlp
        self.d = 1  #Wordnet distance
        self.estimators = 10
        self.crit = 'gini'
        self.max_f = 'auto'
        self.max_d = 1
        self.threshold = 0.3

        #set paths
        self.wordListCSVFile = 'essay_evaluation/Corpora/BritishWords_COCA_AmericanCounterparts.csv'
        self.fnameBritAmerWords = "essay_evaluation/Corpora/BritishWords_COCA_AmericanCounterparts.csv"
        self.word_embeddings_file = "essay_evaluation/Corpora/lexsub_word_embeddings"
        self.context_embeddings_file = "essay_evaluation/Corpora/lexsub_context_embeddings"
        self.fnameModel = "essay_evaluation/resources/subevalmodel"
Example No. 6
    def custom_tokenizer(nlp):
        infix_re = re.compile(r'''[?;‘’`“”"'~]''')
        prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
        suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

        return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                         suffix_search=suffix_re.search,
                         infix_finditer=infix_re.finditer)
Example No. 7
    def _get_prefix_regex(self):
        """
        Custom prefix tokenization rules
        :return:
        """
        custom_prefixes = [r"""^[\[\("'\\/@]"""]
        all_prefixes_re = compile_prefix_regex(
            tuple(list(self.nlp.Defaults.prefixes) + custom_prefixes))
        return all_prefixes_re
Example No. 8
def custom_tokenizer(nlp):
    infix_re = re.compile(r'''[.\,\(\)\?\:\;\...\‘\’\`\“\”\"\'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                                suffix_search=suffix_re.search,
                                infix_finditer=infix_re.finditer,
                                token_match=None)
Example No. 9
    def _get_prefix_regex(self):
        """
        Custom prefix tokenization rules
        :return:
        """
        # escape the dot so it matches a literal period, not any character
        prefix = r"""\."""
        all_prefixes_re = compile_prefix_regex(
            tuple(list(self.nlp.Defaults.prefixes) + [prefix]))
        return all_prefixes_re
Example No. 10
def custom_tokenizer(nlp):
    prefix_re = compile_prefix_regex(Language.Defaults.prefixes + (';', r'\*'))
    suffix_re = compile_suffix_regex(Language.Defaults.suffixes + (';', r'\*'))
    # escape the brackets so the joined infix pattern compiles cleanly
    infix_re = compile_infix_regex(Language.Defaults.infixes +
                                   (r'\(', r'\)', '/', '-', ';', r'\*'))
    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
Example No. 11
def custom_tokenizer(nlp):
    # We create our own tokenizer to avoid splitting hyphenated words.
    infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
Example No. 12
    def custom_tokenizer(nlp):
        infix_re = re.compile(r'''[?;‘’`“”"'~]''')
        prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
        suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
        tokenizer = Tokenizer(nlp.vocab)
        tokenizer.prefix_search = prefix_re.search
        tokenizer.suffix_search = suffix_re.search
        tokenizer.infix_finditer = infix_re.finditer
        tokenizer.token_match = None
        return tokenizer
Example No. 13
def replace_infix_rules(nlp):
    """
    This converts a spacy pipeline such that its tokeniser no longer separates pretty much any token. E.g. contractions,
    hyphenations, honorifics, etc.
    """
    return Tokenizer(
        nlp.vocab,
        prefix_search=compile_prefix_regex(nlp.Defaults.prefixes).search,
        suffix_search=compile_suffix_regex(nlp.Defaults.suffixes).search,
        infix_finditer=lambda x: iter(()),
        rules={})
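A rough usage sketch (assuming the function above is in scope and en_core_web_sm is installed); with the infix rules and exceptions removed, hyphenated words and contractions are expected to stay whole:

import spacy

nlp = spacy.load("en_core_web_sm")        # assumed model
nlp.tokenizer = replace_infix_rules(nlp)  # function defined above

# hyphens and apostrophes are infixes, so these words should no longer be split;
# prefix/suffix punctuation such as the final period is still stripped
print([t.text for t in nlp("She won't use state-of-the-art methods.")])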
Example No. 14
def create_custom_tokenizer(nlp):
    prefixes = compile_prefix_regex(nlp.Defaults.prefixes)
    infixes = compile_infix_regex(nlp.Defaults.infixes)
    suffixes = compile_suffix_regex(
        tuple(list(nlp.Defaults.suffixes) + custom_suffixes))

    return Tokenizer(nlp.vocab,
                     rules=nlp.Defaults.tokenizer_exceptions,
                     prefix_search=prefixes.search,
                     infix_finditer=infixes.finditer,
                     suffix_search=suffixes.search,
                     token_match=None)
Example No. 15
def custom_tokenizer(nlp):
    infix_re = re.compile(
        r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'\(\)\[\]\{\}\*\%\^\+\-\=\<\>\|\!(//)(\n)(\t)~]'''
    )
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
Example No. 16
def keep_hyphen_tokenizer(nlp: Language) -> Tokenizer:
    infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    return Tokenizer(
        nlp.vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=None,
    )
Example No. 17
	def custom_tokenizer(self, nlp):
		"""
		Custom tokeniser that does not split on dashes.
		Useful for names (e.g. Hennis-Plasschaert).
		"""
		infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')
		prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
		suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

		return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
						 suffix_search=suffix_re.search,
						 infix_finditer=infix_re.finditer,
						 token_match=None)
Example No. 18
def custom_tokenizer(nlp):
    """
    custom spacy tokenizer for maintaining hyphenated words
    """
    infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
Example No. 19
def extend_tokenizer(nlp, pref, inf, suf):
    pref = tuple(
        pref + list(nlp.Defaults.prefixes)) if pref else nlp.Defaults.prefixes
    suf = tuple(suf +
                list(nlp.Defaults.suffixes)) if suf else nlp.Defaults.suffixes
    inf = tuple(inf +
                list(nlp.Defaults.infixes)) if inf else nlp.Defaults.infixes
    tok = "^(?:" + "|".join([RE[r]["str"] for r in RE['tok_patterns']]) + ")$"
    return Tokenizer(
        nlp.vocab,
        rules=nlp.Defaults.tokenizer_exceptions,
        prefix_search=spacyUtil.compile_prefix_regex(pref).search,
        suffix_search=spacyUtil.compile_suffix_regex(suf).search,
        infix_finditer=spacyUtil.compile_infix_regex(inf).finditer,
        token_match=re.compile(tok).match)
Example No. 20
def create_custom_tokenizer(nlp):

    infixes = tuple([r"\<[\w\/]*\>"]) + nlp.Defaults.infixes
    prefixes = tuple([r"\<[\w\/]*\>"]) + nlp.Defaults.prefixes
    suffixes = tuple([r"\<[\w\/]*\>"]) + nlp.Defaults.suffixes
    infix_re = spacy.util.compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)

    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=nlp.tokenizer.token_match,
                     rules=nlp.Defaults.tokenizer_exceptions)
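A usage sketch under the same assumptions as the snippet (the function and its imports in scope, en_core_web_sm installed); the added tag pattern should be split off as its own token wherever it appears:

import spacy

nlp = spacy.load("en_core_web_sm")             # assumed model
nlp.tokenizer = create_custom_tokenizer(nlp)   # function defined above

# the pattern r"\<[\w\/]*\>" covers simple tags such as <i> and </i>,
# peeling them off as prefixes, infixes or suffixes of the surrounding text
print([t.text for t in nlp("an <i>italic</i> word")])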
Example No. 21
def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
    # the prefix and suffix matches overlap in the suffix lookbehind
    prefixes = ["a(?=.)"]
    suffixes = [r"(?<=\w)\.", r"(?<=a)\d+\."]
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
    )
    tokens = [t.text for t in tokenizer("a10.")]
    assert tokens == ["a", "10", "."]
    explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
    assert tokens == explain_tokens
Example No. 22
def spacy_nlp(nlp):
    customize_add_PUNCT = ['/', '=', '$', '|', '\\', "-"]
    for w in customize_add_PUNCT:
        nlp.vocab[w].is_punct = True

    # modify tokenizer prefix and infix patterns
    prefixes = (list(nlp.Defaults.prefixes) + ['/'])
    prefixes_regex = compile_prefix_regex(prefixes)
    nlp.tokenizer.prefix_search = prefixes_regex.search

    infixes = (list(nlp.Defaults.infixes) +
               ['(?<=[0-9])[|\/+\\-\\*^](?=[0-9-])'])
    infix_re = compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer

    return nlp
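Unlike most of the other examples, this one mutates the existing tokenizer in place (prefix_search and infix_finditer are writable attributes) rather than constructing a new Tokenizer. A quick sketch of how it might be exercised, assuming the function above is in scope and en_core_web_sm is installed:

import spacy

nlp = spacy_nlp(spacy.load("en_core_web_sm"))  # function defined above

# '/' is now punctuation and a prefix, and the added infix rule splits
# operator characters sandwiched between digits, e.g. 120/80 or 2+2
print([t.text for t in nlp("BP 120/80, dose 2+2 mg")])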
Example No. 23
def custom_tokenizer_modified(nlp):
    # spaCy defaults: when the standard behaviour is still required, the default
    # patterns need to be passed in when building the custom tokenizer
    infix_re = re.compile(r'''[.\,\?\!\:\...\‘\’\`\“\”\"\'\/~]''')
    extended_prefixes = tuple(list(nlp.Defaults.prefixes) + ["-"])
    prefix_re = compile_prefix_regex(extended_prefixes)
    extended_suffixes = tuple(list(nlp.Defaults.suffixes) + ["-"])
    suffix_re = compile_suffix_regex(extended_suffixes)

    # extending the default url regex
    url = URL_PATTERN
    url_re = re.compile(url)
    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=url_re.match)
Example No. 24
def create_medspacy_tokenizer(nlp):
    """Generates a custom tokenizer to augment the default spacy tokenizer 
        for situations commonly seen in clinical text.
        This includes:
            * Punctuation infixes.  
                For example, this allows the following examples to be more aggressively tokenized as:
                    "Patient complains of c/o" -> [..., 'c', '/', 'o']
                    "chf+cp" -> ['chf', '+', 'cp']
       @param nlp: Spacy language model
    """

    # augment the defaults
    # this is not quite correct.  We do not want to break on uppercase and we do not
    # want to break on all punctuation (periods)
    # infixes = nlp.Defaults.infixes + (r'''[^a-z0-9]''',)
    # escape all the punctuation we want to allow to break up tokens

    # get all python punctuation
    punctuation_chars = string.punctuation
    # remove periods so that we do not break up '1.5 mg' into '1 . 5 mg'
    punctuation_chars = punctuation_chars.replace('.', '')

    infixes = nlp.Defaults.infixes + (r'''[{}]'''.format(
        re.escape(punctuation_chars)), )
    prefixes = nlp.Defaults.prefixes
    suffixes = nlp.Defaults.suffixes

    # compile
    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)

    # Default exceptions could be extended later
    tokenizer_exceptions = nlp.Defaults.tokenizer_exceptions.copy()

    # now create this
    tokenizer = Tokenizer(
        nlp.vocab,
        tokenizer_exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )

    return tokenizer
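A short sketch exercising the docstring's own examples (function above in scope, en_core_web_sm assumed); the extra punctuation infixes break up clinical shorthand while the excluded period keeps doses intact:

import spacy

nlp = spacy.load("en_core_web_sm")               # assumed model
nlp.tokenizer = create_medspacy_tokenizer(nlp)   # function defined above

# "c/o" and "chf+cp" should now be split on the punctuation, while "1.5"
# stays together because '.' was removed from the new infix character class
print([t.text for t in nlp("Patient c/o chf+cp, taking 1.5 mg daily")])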
Example No. 25
def custom_en_tokenizer(en_vocab):
    prefix_re = compile_prefix_regex(English.Defaults.prefixes)
    suffix_re = compile_suffix_regex(English.Defaults.suffixes)
    custom_infixes = [
        r"\.\.\.+",
        r"(?<=[0-9])-(?=[0-9])",
        r"[0-9]+(,[0-9]+)+",
        r"[\[\]!&:,()\*—–\/-]",
    ]
    infix_re = compile_infix_regex(custom_infixes)
    return Tokenizer(
        en_vocab,
        English.Defaults.tokenizer_exceptions,
        prefix_re.search,
        suffix_re.search,
        infix_re.finditer,
        token_match=None,
    )
Example No. 26
def add_special_tokenizer_cases(nlp: Language) -> Language:
    infix_re = compile_infix_regex(
        tuple(TOKENIZER_INFIXES + [
            r"(?<=[{a}0-9])([()#\.]+|(-)+([->])+)(?=[{a}0-9])".format(a=ALPHA)
        ]))
    prefix_re = compile_prefix_regex(tuple(TOKENIZER_PREFIXES + [r'^[.-]+']))
    suffix_re = compile_suffix_regex(tuple(TOKENIZER_SUFFIXES + [r'[.-]+$']))
    nlp.tokenizer = Tokenizer(nlp.vocab,
                              prefix_search=prefix_re.search,
                              suffix_search=suffix_re.search,
                              infix_finditer=infix_re.finditer,
                              token_match=None)

    for tok in [
            '==', '+=', '-=', '*=', '/=', '%=', '!=', '<>', '->', '-->', '--',
            '---', TOK_VERSION
    ]:
        nlp.tokenizer.add_special_case(tok, [{ORTH: tok, NORM: tok, POS: X}])
    return nlp
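The add_special_case step can be illustrated in isolation; note that recent spaCy versions only accept ORTH and NORM in tokenizer exceptions, so attributes like POS may need to be set elsewhere in the pipeline. A minimal sketch, assuming en_core_web_sm:

import spacy
from spacy.symbols import ORTH, NORM

nlp = spacy.load("en_core_web_sm")  # assumed model
# register the arrow as an atomic token; ORTH must match the text exactly
nlp.tokenizer.add_special_case("-->", [{ORTH: "-->", NORM: "-->"}])
print([t.text for t in nlp("state A --> state B")])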
Example No. 27
def customize_tokenizer(nlp):
    prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
    # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {
        k: v
        for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
        if not (len(k) == 2 and k[1] == ".")
    }
    new_tokenizer = Tokenizer(
        nlp.vocab,
        exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    nlp.tokenizer = new_tokenizer
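A sketch of the effect, assuming the function above is in scope and en_core_web_sm is installed; once the single-letter abbreviation exceptions are gone, the ordinary suffix rule takes over for strings like 'h.':

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model
customize_tokenizer(nlp)            # function defined above

# "h." is no longer a tokenizer exception, so the trailing period should
# now be split off by the regular suffix rule
print([t.text for t in nlp("Repeat the test after 4 h.")])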
Example No. 28
def custom_tokenizer(nlp, never_split):
    cls = nlp.Defaults
    rules = cls.tokenizer_exceptions
    token_match = cls.token_match
    prefix_search = (util.compile_prefix_regex(cls.prefixes).search
                     if cls.prefixes else None)
    suffix_search = (util.compile_suffix_regex(cls.suffixes).search
                     if cls.suffixes else None)
    infix_finditer = (util.compile_infix_regex(cls.infixes).finditer
                      if cls.infixes else None)
    vocab = nlp.vocab
    return Tokenizer(
        vocab,
        rules=rules,
        prefix_search=prefix_search,
        suffix_search=suffix_search,
        infix_finditer=infix_finditer,
        token_match=lambda x: token_match(x) or x in never_split,
    )
Example No. 29
def custom_tokenizer(nlp):
    infixes = list(nlp.Defaults.infixes)

    # add custom tokenize cases:
    # for case: <word>-<word> --> probably better left out for German?
    #infixes.append(r'(?<=[{a}"])[-](?=[{a}])'.format(a=ALPHA))
    # for case: <number>-<word>
    infixes.append(r'(?<=[0-9])[-](?=[{a}])'.format(a=ALPHA))

    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    return Tokenizer(nlp.vocab,
                     rules=nlp.Defaults.tokenizer_exceptions,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
Example No. 30
    def custom_tokenizer(self, nlp):
        """
        Custom tokenizer:
            - won't -> won't
        before:
            - won't -> wo + n't

        All words like "won't" are afterwards compared against a dictionary containing all
        possible forms; with the default behaviour (won't -> wo + n't) it is impossible to
        tell whether "wo" was an actual word or just a fragment of "won't".
        :param nlp:
        :return:
        """
        infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"~]''')
        prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
        suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

        return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                         suffix_search=suffix_re.search,
                         infix_finditer=infix_re.finditer,
                         token_match=None)
Example No. 31
def test_en_tokenizer_splits_pre_punct_regex(text, punct):
    en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
    match = en_search_prefixes(text)
    assert match.group() == punct