def pipeline(merge_patterns=None, terminal_patterns=None):
    # Regex for an uppercase Cyrillic letter, used to split abbreviations like "А.Пушкин".
    CYRILLIC_UPPER = r'[\p{Lu}&&\p{Cyrillic}]'
    Language = get_lang_class('ru')
    Language.Defaults.infixes += ('«»',)
    Language.Defaults.infixes += ('-',)
    Language.Defaults.infixes += (r'"\/',)
    Language.Defaults.infixes += ('/',)
    Language.Defaults.infixes += (r'(?<=[{au}])\.(?=\w+)'.format(au=CYRILLIC_UPPER),)
    # Token.set_extension('is_adjective', default=False, force=True)
    nlp = Language()
    russian_tokenizer = RussianTokenizer(nlp, merge_patterns=merge_patterns,
                                         terminal_patterns=terminal_patterns)
    nlp.add_pipe(detect_sentence_boundaries, name='detect_sentence_boundaries', first=True)
    # nlp.add_pipe(match_adjective, name='match_adjective', after='detect_sentence_boundaries')
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer', after='detect_sentence_boundaries')
    # Keep special-cased strings as single tokens.
    for case in SPECIAL_CASES:
        nlp.tokenizer.add_special_case(case, [{'ORTH': case}])
    for case in DOT_SPECIAL_CASES:
        nlp.tokenizer.add_special_case(case, [{'ORTH': case}])
    nlp.tokenizer.add_special_case('--', [{'ORTH': '—'}])
    nlp.tokenizer.add_special_case(' ', [{'ORTH': ' '}])
    return nlp
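# Usage sketch (illustrative, not part of the original module): build the pipeline
# with no extra patterns and inspect how the added infixes and special cases
# tokenize a sentence containing a Cyrillic initial, a slash and a double hyphen.
# merge_patterns/terminal_patterns are assumed to be spaCy Matcher-style token
# patterns understood by RussianTokenizer.
nlp = pipeline()
doc = nlp('А.Пушкин писал стихи и/или прозу -- много прозы.')
print([token.text for token in doc])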
def main(lang_id, lang_data_dir, corpora_dir, model_dir):
    model_dir = Path(model_dir)
    lang_data_dir = Path(lang_data_dir) / lang_id
    corpora_dir = Path(corpora_dir) / lang_id
    assert corpora_dir.exists()
    assert lang_data_dir.exists()
    if not model_dir.exists():
        model_dir.mkdir()
    # Build the tokenizer and vocab components of the model directory.
    tag_map = json.load((lang_data_dir / 'tag_map.json').open())
    setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
    setup_vocab(get_lang_class(lang_id).Defaults.lex_attr_getters, tag_map,
                corpora_dir, model_dir / 'vocab')
    # Copy required and optional language data files into the vocab directory.
    if (lang_data_dir / 'gazetteer.json').exists():
        copyfile((lang_data_dir / 'gazetteer.json').as_posix(),
                 (model_dir / 'vocab' / 'gazetteer.json').as_posix())
    copyfile((lang_data_dir / 'tag_map.json').as_posix(),
             (model_dir / 'vocab' / 'tag_map.json').as_posix())
    if (lang_data_dir / 'lemma_rules.json').exists():
        copyfile((lang_data_dir / 'lemma_rules.json').as_posix(),
                 (model_dir / 'vocab' / 'lemma_rules.json').as_posix())
    if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists():
        copytree((corpora_dir / 'wordnet' / 'dict').as_posix(),
                 (model_dir / 'wordnet').as_posix())
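# Illustrative invocation (directory names are assumptions for the example):
#   lang_data/ru/tag_map.json        -- required
#   lang_data/ru/lemma_rules.json    -- optional
#   corpora/ru/                      -- required, may contain wordnet/dict/
# The assembled model is written to ru_model/{tokenizer,vocab,wordnet}.
main('ru', 'lang_data', 'corpora', 'ru_model')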
def test_lang_initialize(lang, capfd):
    """Test that languages can be initialized."""
    nlp = get_lang_class(lang)()
    # Check for stray print statements (see #3342)
    doc = nlp("test")  # noqa: F841
    captured = capfd.readouterr()
    assert not captured.out
def load_default_model_sentencizer(lang):
    """Load a generic spaCy model and add the sentencizer for sentence tokenization."""
    loading_start = time.time()
    lang_class = get_lang_class(lang)
    nlp = lang_class()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    loading_end = time.time()
    loading_time = loading_end - loading_start
    return nlp, loading_time, lang + "_default_" + "sentencizer"
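# Example use (illustrative; the text and language code are assumptions): time the
# load of a blank English pipeline with the rule-based sentencizer and segment a
# short text into sentences.
nlp, loading_time, description = load_default_model_sentencizer("en")
print(description, "loaded in %.3fs" % loading_time)
doc = nlp("This is one sentence. This is another.")
print([sent.text for sent in doc.sents])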
def pipeline(merge_patterns=None, terminal_patterns=None):
    def rules_matcher(doc):
        spans = []
        # Match IDs are the hard-coded StringStore hashes of the two keys
        # registered with the matcher below.
        for match_id, start, end in matcher(doc):
            if match_id == 15329811787164753587:
                # Collect matched spans so they can be merged into single tokens.
                spans.append(doc[start:end])
            elif match_id == 7038656598907266222:
                # Sentence-terminal match: no token inside it may start a new sentence.
                for token in doc[start:end]:
                    if token.sent_start:
                        token.sent_start = False
        for span in spans:
            # try:
            #     if span.text not in EXCLUSIONS:
            #         span.merge()
            # except IndexError as error:
            #     # print(doc)
            #     # error occurs when there is more than one hyphen within the span;
            #     # basically it can be ignored
            span.merge()
        return doc

    # Regex for an uppercase Cyrillic letter, used to split abbreviations like "А.Пушкин".
    CYRILLIC_UPPER = r'[\p{Lu}&&\p{Cyrillic}]'
    Language = get_lang_class('ru')
    Language.Defaults.infixes += ('«»',)
    Language.Defaults.infixes += ('-',)
    Language.Defaults.infixes += (r'"\/',)
    Language.Defaults.infixes += (r'(?<=[{au}])\.(?=\w+)'.format(au=CYRILLIC_UPPER),)
    # Token.set_extension('is_adjective', default=False, force=True)
    nlp = Language()
    matcher = Matcher(nlp.vocab)
    pattern = nlp.vocab.strings['pattern']
    sentence_terminal = nlp.vocab.strings['sentence_terminal']
    if merge_patterns:
        matcher.add(pattern, None, *merge_patterns)
    if terminal_patterns:
        matcher.add(sentence_terminal, None, *terminal_patterns)
    # nlp.add_pipe(match_adjective, name='match_adjective', last=True)
    nlp.add_pipe(detect_sentence_boundaries, name='detect_sentence_boundaries', first=True)
    nlp.add_pipe(rules_matcher, name='rules_matcher', after='detect_sentence_boundaries')
    # Keep special-cased strings as single tokens.
    for case in HYPHEN_SPICIAL_CASES:
        nlp.tokenizer.add_special_case(case, [{'ORTH': case}])
    for case in DOT_SPECIAL_CASES:
        nlp.tokenizer.add_special_case(case, [{'ORTH': case}])
    nlp.tokenizer.add_special_case('--', [{'ORTH': '—'}])
    nlp.tokenizer.add_special_case(' ', [{'ORTH': ' '}])
    return nlp
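# Usage sketch (illustrative; the concrete patterns below are assumptions, not the
# project's shipped pattern sets): merge_patterns and terminal_patterns are
# spaCy v2-style Matcher token patterns. Spans matched by merge_patterns are merged
# into single tokens; tokens inside spans matched by terminal_patterns are prevented
# from starting a new sentence.
nlp = pipeline(
    merge_patterns=[[{'IS_ALPHA': True}, {'ORTH': '-'}, {'IS_ALPHA': True}]],
    terminal_patterns=[[{'ORTH': '?'}, {'ORTH': '!'}]],
)
doc = nlp('Кто-нибудь придёт?! Думаю, да.')
print([token.text for token in doc])
print([sent.text for sent in doc.sents])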
def ru_tokenizer():
    pytest.importorskip("pymorphy2")
    return get_lang_class("ru").Defaults.create_tokenizer()
def pt_tokenizer():
    return get_lang_class("pt")().tokenizer

def sr_tokenizer():
    return get_lang_class("sr")().tokenizer

def pl_tokenizer():
    return get_lang_class("pl")().tokenizer

def de_vocab():
    return get_lang_class("de")().vocab

def de_tokenizer():
    return get_lang_class("de").Defaults.create_tokenizer()

def fr_tokenizer():
    return get_lang_class("fr").Defaults.create_tokenizer()

def tt_tokenizer():
    return get_lang_class("tt")().tokenizer

def ky_tokenizer():
    return get_lang_class("ky")().tokenizer

def tokenizer():
    return get_lang_class("xx")().tokenizer

def tr_tokenizer():
    return get_lang_class("tr")().tokenizer

def ti_tokenizer():
    return get_lang_class("ti")().tokenizer
def th_tokenizer():
    pytest.importorskip("pythainlp")
    return get_lang_class("th")().tokenizer

def sv_tokenizer():
    return get_lang_class("sv")().tokenizer

def th_tokenizer():
    pytest.importorskip("pythainlp")
    return get_lang_class("th").Defaults.create_tokenizer()

def uk_tokenizer():
    pytest.importorskip("pymorphy2")
    return get_lang_class("uk")().tokenizer

def uk_tokenizer():
    pytest.importorskip("pymorphy2")
    pytest.importorskip("pymorphy2.lang")
    return get_lang_class("uk").Defaults.create_tokenizer()

def uk_lemmatizer():
    pytest.importorskip("pymorphy2")
    pytest.importorskip("pymorphy2_dicts_uk")
    return get_lang_class("uk")().add_pipe("lemmatizer")

def en_parser(en_vocab):
    nlp = get_lang_class("en")(en_vocab)
    return nlp.create_pipe("parser")

def load_tokenizer(b):
    tok = get_lang_class("en").Defaults.create_tokenizer()
    tok.from_bytes(b)
    return tok
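# Round-trip sketch (illustrative, not a test from the suite): serialize a freshly
# created English tokenizer to bytes and restore it with load_tokenizer.
tok_bytes = get_lang_class("en").Defaults.create_tokenizer().to_bytes()
restored = load_tokenizer(tok_bytes)
print([t.text for t in restored("Don't serialize me, bro!")])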
def ru_lemmatizer():
    pytest.importorskip("pymorphy2")
    return get_lang_class("ru")().add_pipe("lemmatizer")

def id_tokenizer():
    return get_lang_class("id").Defaults.create_tokenizer()

def de_tokenizer():
    return get_lang_class("de")().tokenizer

def nb_tokenizer():
    return get_lang_class("nb").Defaults.create_tokenizer()

def ro_tokenizer():
    return get_lang_class("ro")().tokenizer

def pl_tokenizer():
    return get_lang_class("pl").Defaults.create_tokenizer()

def ur_tokenizer():
    return get_lang_class("ur")().tokenizer

def sa_tokenizer():
    return get_lang_class("sa")().tokenizer

def hu_tokenizer():
    return get_lang_class("hu").Defaults.create_tokenizer()

def vi_tokenizer():
    pytest.importorskip("pyvi")
    return get_lang_class("vi")().tokenizer

def ja_tokenizer():
    pytest.importorskip("MeCab")
    return get_lang_class("ja").Defaults.create_tokenizer()

def yo_tokenizer():
    return get_lang_class("yo")().tokenizer

def nl_lemmatizer(scope="session"):
    return get_lang_class("nl").Defaults.create_lemmatizer()

def zh_tokenizer_char():
    nlp = get_lang_class("zh")()
    return nlp.tokenizer
def ro_tokenizer():
    return get_lang_class("ro").Defaults.create_tokenizer()

def am_tokenizer():
    return get_lang_class("am")().tokenizer

def sv_tokenizer():
    return get_lang_class("sv").Defaults.create_tokenizer()

def hy_tokenizer():
    return get_lang_class("hy")().tokenizer

def tt_tokenizer():
    return get_lang_class("tt").Defaults.create_tokenizer()

def ar_tokenizer():
    return get_lang_class("ar")().tokenizer

def bn_tokenizer():
    return get_lang_class("bn").Defaults.create_tokenizer()

def bg_tokenizer():
    return get_lang_class("bg")().tokenizer

def en_vocab():
    return get_lang_class("en").Defaults.create_vocab()

def bn_tokenizer():
    return get_lang_class("bn")().tokenizer

def es_tokenizer():
    return get_lang_class("es").Defaults.create_tokenizer()

def ca_tokenizer():
    return get_lang_class("ca")().tokenizer

def ga_tokenizer():
    return get_lang_class("ga").Defaults.create_tokenizer()

def cs_tokenizer():
    return get_lang_class("cs")().tokenizer

def da_tokenizer():
    return get_lang_class("da")().tokenizer

def nl_tokenizer():
    return get_lang_class("nl")().tokenizer
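# Sketch of how such fixtures are typically consumed in a test (illustrative; the
# example text and assertion are assumptions, not a test from the suite): pytest
# injects the tokenizer built above by matching the argument name to the fixture.
def test_nl_tokenizer_splits_sentence_final_punct(nl_tokenizer):
    tokens = nl_tokenizer("Dit is een zin.")
    assert tokens[-1].text == "."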