Example #1
class KlingonDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "klingon"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
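A Defaults class like this only takes effect once it is attached to a Language subclass, which builds its vocab and tokenizer from it. A minimal sketch of that wiring, assuming spaCy 2.x and the definitions above (the Klingon class name and sample text are illustrative):

from spacy.language import Language

class Klingon(Language):
    lang = "klingon"            # must match lex_attr_getters[LANG] above
    Defaults = KlingonDefaults

nlp = Klingon()                 # tokenizer built from the exceptions and stop words above
doc = nlp("tlhIngan Hol vIjatlh")
print([token.text for token in doc])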
Example #2
class TurkishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    # lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "tr"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
Example #3
class PortugueseDefaults(spacy.lang.pt.Portuguese.Defaults):
    lex_attr_getters = dict(spacy.lang.pt.Portuguese.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'pt' # language ISO code

    # optional: replace flags with custom functions, e.g. like_num()
    lex_attr_getters.update(LEX_ATTRS)

    # merge base exceptions and custom tokenizer exceptions
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
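Because this example extends the stock Portuguese defaults rather than the bare Language.Defaults, it is usually paired with a subclass of the stock Portuguese class. A minimal sketch under that assumption (the CustomPortuguese name is illustrative, not from the source):

import spacy.lang.pt

class CustomPortuguese(spacy.lang.pt.Portuguese):
    Defaults = PortugueseDefaults

nlp = CustomPortuguese()
doc = nlp("Isto é um exemplo.")
print([token.text for token in doc])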
Example #4
class ChineseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'zh'  # for pickling
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
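Whitespace tokenization is not enough for Chinese, so in spaCy 2.x these defaults are typically paired with a Language subclass whose make_doc delegates word segmentation to jieba; the sketch below mirrors that pattern and assumes jieba is installed:

import jieba
from spacy.language import Language
from spacy.tokens import Doc

class Chinese(Language):
    lang = 'zh'
    Defaults = ChineseDefaults

    def make_doc(self, text):
        # segment with jieba, drop empty strings, build a Doc without trailing spaces
        words = [w for w in jieba.cut(text, cut_all=False) if w]
        return Doc(self.vocab, words=words, spaces=[False] * len(words))

nlp = Chinese()
print([t.text for t in nlp('我爱自然语言处理')])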
Example #5
class SerbianpDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "srp"
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
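Since "srp" is not a language code spaCy ships with, a class built on these defaults is normally registered explicitly so it can be looked up like a built-in. A minimal sketch assuming spaCy 2.x (the Serbianp class name simply mirrors the Defaults above):

import spacy
from spacy.language import Language
from spacy.util import set_lang_class

class Serbianp(Language):
    lang = "srp"
    Defaults = SerbianpDefaults

set_lang_class("srp", Serbianp)
nlp = spacy.blank("srp")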
Example #6
        ORTH: "n't",
        LEMMA: "not",
        NORM: "not",
        TAG: "RB"
    }],
    "isn't": [{
        ORTH: "is",
        LEMMA: "be"
    }, {
        ORTH: "n't",
        LEMMA: "not",
        NORM: "not",
        TAG: "RB"
    }]
}
TOKENIZER_EXCEPTIONS = update_exc(TOKENIZER_EXCEPTIONS)

# updating the stopset
calfresh_stopwords = {
    "Calfresh", "CalFresh", "calfresh", "CALFRESH", "foodstamps", "sar7",
    "sar", "sr7", "sr", "SAR7", "SR7", "SAR", "SR", "Sar", "Sar7", "ebt"
}
calfresh_placeholders = {
    "PERSON", "ORG", "GPE", "LOC", "DATE", "MONEY", "CARDINAL"
}
STOP_WORDS.update(calfresh_stopwords, calfresh_placeholders)  # set.update() returns None
stopset = STOP_WORDS

regex = re.compile(r'\W|\d', flags=re.UNICODE)


    def __init__(self, batch_size, n_cpus, n_threads, mode):

        print('loading model...', end=' ')
        self.nlp = english_model.load()
        self.nlp.remove_pipe('tagger')
        self.nlp.remove_pipe('ner')

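        # Treat every punctuation mark except '.' as its own affix rule, and add the
        # "[**" / "**]" bracket markers that appear in the source text.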
        punct = list(string.punctuation)
        punct.remove('.')
        punct.append('[**')
        punct.append('**]')
        punct = [re.escape(p) for p in punct]

        prefixes_custom = tuple(punct)
        infixes_custom = tuple(punct)
        suffixes_custom = tuple(punct)

        #prefixes_custom = tuple([r'\[\*\*', r'('])
        #suffixes_custom = tuple([r'\*\*\]', r')'])
        #infixes_custom = tuple([r'\[\*\*', r'\*\*\]', r'(', r')', r'>', r'<', r'->', r'-->', r'--->'])

        exceptions_custom = {
            id: pattern
            for id, pattern in tokenizer_utils.generate_matcher_pattern1()
        }
        exceptions = update_exc(self.nlp.Defaults.tokenizer_exceptions,
                                exceptions_custom)

        prefix_re = compile_prefix_regex(self.nlp.Defaults.prefixes +
                                         prefixes_custom)
        infix_re = compile_infix_regex(infixes_custom +
                                       self.nlp.Defaults.infixes)
        suffix_re = compile_suffix_regex(self.nlp.Defaults.suffixes +
                                         suffixes_custom)

        tokenizer = SpacyTokenizer(self.nlp.vocab,
                                   rules=exceptions,
                                   prefix_search=prefix_re.search,
                                   suffix_search=suffix_re.search,
                                   infix_finditer=infix_re.finditer,
                                   token_match=self.nlp.Defaults.token_match)

        self.nlp.tokenizer = tokenizer

        matcher = Matcher(self.nlp.vocab)

        def on_match_pattern(matcher, doc, id, matches):
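            # Merge the tokens covered by the matched pattern, then shift the
            # start/end offsets of the remaining matches to account for the
            # tokens removed by merge().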

            match_id, start, end = matches[id]

            if self.nlp.vocab.strings[match_id].startswith('p3'):
                span = doc[start + 1:end]
                span.merge()
                for i in range(id, len(matches)):
                    matches[i] = (matches[i][0], matches[i][1] - 1,
                                  matches[i][2] - 1)

            elif self.nlp.vocab.strings[match_id].startswith('p2.1'):
                span1 = doc[start:start + 2]
                span2 = doc[start + 2:end]
                span1.merge()
                span2.merge()
                for i in range(id, len(matches)):
                    matches[i] = (matches[i][0], matches[i][1] - 2,
                                  matches[i][2] - 2)

            elif self.nlp.vocab.strings[match_id].startswith('p2.2'):
                span2 = doc[start + 1:end]
                span2.merge()
                for i in range(id, len(matches)):
                    matches[i] = (matches[i][0], matches[i][1] - 1,
                                  matches[i][2] - 1)

            elif self.nlp.vocab.strings[match_id].startswith('p2.3'):
                span1 = doc[start:start + 2]
                span1.merge()
                for i in range(id, len(matches)):
                    matches[i] = (matches[i][0], matches[i][1] - 1,
                                  matches[i][2] - 1)

        for id, pattern in tokenizer_utils.generate_matcher_pattern2():
            matcher.add(id, on_match_pattern, pattern)

        for id, pattern in tokenizer_utils.generate_matcher_pattern3():
            matcher.add(id, on_match_pattern, pattern)

        # A Matcher returns its list of matches rather than the Doc, so it cannot be
        # added to the pipeline directly; wrap it so the on_match callbacks run and
        # the Doc is passed along to the parser.
        def run_matcher(doc):
            matcher(doc)
            return doc

        self.nlp.add_pipe(run_matcher, before='parser')

        print('done')

        self.batch_size = batch_size
        self.n_cpus = n_cpus
        self.n_threads = n_threads
        self.mode = mode
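The same affix-customization technique works in isolation on a stock pipeline. A minimal, self-contained sketch assuming spaCy 2.x; the added punctuation rules here are illustrative, not the ones used above:

import re
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex

nlp = spacy.blank('en')
prefixes = nlp.Defaults.prefixes + (re.escape('[**'),)
suffixes = nlp.Defaults.suffixes + (re.escape('**]'),)
infixes = nlp.Defaults.infixes + (re.escape('-->'),)

nlp.tokenizer = Tokenizer(
    nlp.vocab,
    rules=dict(nlp.Defaults.tokenizer_exceptions),
    prefix_search=compile_prefix_regex(prefixes).search,
    suffix_search=compile_suffix_regex(suffixes).search,
    infix_finditer=compile_infix_regex(infixes).finditer,
    token_match=nlp.Defaults.token_match,
)
print([t.text for t in nlp('[**2101-1-1**] note --> discharged')])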