def init_syl_tokenizers(main, lang, syl_tokenizer):
    # Pyphen
    if syl_tokenizer.startswith('pyphen_'):
        if f'pyphen_syl_tokenizer_{lang}' not in main.__dict__:
            lang_pyphen = wl_conversion.to_iso_639_1(main, lang)

            # Suffixed language codes (e.g. en_us) need an upper-case region code for Pyphen
            if lang.find('_') > -1:
                lang_pyphen = f"{lang_pyphen.split('_')[0]}_{lang_pyphen.split('_')[1].upper()}"

            main.__dict__[f'pyphen_syl_tokenizer_{lang}'] = pyphen.Pyphen(lang = lang_pyphen)
def init_spacy_models(main, lang):
    # Chinese, English, German, Portuguese
    if not lang.startswith('srp_'):
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

    if f'spacy_nlp_{lang}' not in main.__dict__:
        # Languages with models
        if lang in SPACY_LANGS:
            model = importlib.import_module(SPACY_LANGS[lang])

            main.__dict__[f'spacy_nlp_{lang}'] = model.load(disable = ['parser', 'ner'])
            # Add senter
            main.__dict__[f'spacy_nlp_{lang}'].enable_pipe('senter')
        # Languages without models
        else:
            # Serbian
            if lang == 'srp_cyrl':
                main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')
            elif lang == 'srp_latn':
                main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
            else:
                main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(wl_conversion.to_iso_639_1(main, lang))

            # Add sentencizer and lemmatizer
            main.__dict__[f'spacy_nlp_{lang}'].add_pipe('sentencizer')

            if lang in SPACY_LANGS_LEMMATIZERS:
                main.__dict__[f'spacy_nlp_{lang}'].add_pipe('lemmatizer')

                main.__dict__[f'spacy_nlp_{lang}'].initialize()
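# A minimal, standalone sketch (not part of Wordless) of the fallback path above: for
# languages without a packaged model, a blank spaCy pipeline plus a sentencizer still
# provides tokenization and sentence boundaries. No Wordless `main` object is involved;
# only the underlying spaCy calls are shown.
import spacy

nlp = spacy.blank('en')
nlp.add_pipe('sentencizer')

doc = nlp('This is a sentence. This is another sentence.')

# Two sentences, each a list of token texts
for sent in doc.sents:
    print([token.text for token in sent])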
def init_word_detokenizers(main, lang):
    if lang not in ['zho_cn', 'zho_tw', 'jpn', 'tha', 'bod']:
        # Sacremoses
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_detokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_detokenizer_{lang}'] = sacremoses.MosesDetokenizer(lang = lang_sacremoses)
def check_spacy_models(main, lang, pipeline):
    spacy_langs = {
        'dan': 'da_core_news_sm',
        'nld': 'nl_core_news_sm',
        'eng': 'en_core_web_sm',
        'fra': 'fr_core_news_sm',
        'deu': 'de_core_news_sm',
        'ell': 'el_core_news_sm',
        'ita': 'it_core_news_sm',
        'lit': 'lt_core_news_sm',
        'nob': 'nb_core_news_sm',
        'pol': 'pl_core_news_sm',
        'por': 'pt_core_news_sm',
        'ron': 'ro_core_news_sm',
        'spa': 'es_core_news_sm',
        'other': 'en_core_web_sm'
    }

    # Remove unused pipelines to boost speed
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in spacy_langs:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            model = importlib.import_module(spacy_langs[lang])

            main.__dict__[f'spacy_nlp_{lang}'] = model.load(disable = nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin)
        if lang in ['srp_cyrl', 'srp_latn']:
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(wl_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
def init_word_tokenizers(main, lang, word_tokenizer = 'default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang = lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
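# Standalone illustration (no Wordless `main` object) of two of the NLTK tokenizers
# initialized above; constructing them is cheap but not free, which is why the function
# caches one instance per tokenizer on main.__dict__ and reuses it.
import nltk

toktok_tokenizer = nltk.ToktokTokenizer()
tweet_tokenizer = nltk.TweetTokenizer()

print(toktok_tokenizer.tokenize("Don't tokenize me, bro!"))
print(tweet_tokenizer.tokenize("Don't tokenize me, bro! :-)"))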
def check_missing_extra_langs(langs_supported, langs_global, msg):
    global lang_missing
    global lang_extra

    for lang_code in langs_supported:
        lang_code_639_3 = wl_conversion.to_iso_639_3(main, lang_code)

        if lang_code_639_3 not in langs_global:
            print(f'''Missing language code "{lang_code_639_3}/{lang_code}" found for {msg}!''')

            lang_missing = True

    for lang_code in langs_global:
        lang_code_639_1 = wl_conversion.to_iso_639_1(main, lang_code)

        if lang_code_639_1 not in langs_supported:
            print(f'''Extra language code "{lang_code}/{lang_code_639_1}" found for {msg}!''')

            lang_extra = True
def wl_word_detokenize(main, tokens, lang, word_detokenizer = 'default'):
    sentence_start = 0
    sentences = []
    text = ''

    if lang not in main.settings_global['word_detokenizers']:
        lang = 'other'

    if word_detokenizer == 'default':
        word_detokenizer = main.settings_custom['word_detokenization']['word_detokenizers'][lang]

    for i, token in enumerate(tokens):
        if type(token) == wl_text.Wl_Token and token.sentence_ending:
            sentences.append(tokens[sentence_start:i + 1])

            sentence_start = i + 1
        elif i == len(tokens) - 1:
            sentences.append(tokens[sentence_start:])

    # English & Other Languages
    if word_detokenizer == main.tr('NLTK - Penn Treebank Detokenizer'):
        treebank_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()

        for sentence in sentences:
            text += treebank_detokenizer.detokenize(sentence)
    elif word_detokenizer == main.tr('Sacremoses - Moses Detokenizer'):
        moses_detokenizer = sacremoses.MosesDetokenizer(lang = wl_conversion.to_iso_639_1(main, lang))

        for sentence in sentences:
            text += moses_detokenizer.detokenize(sentence)
    # Chinese
    elif word_detokenizer == main.tr('Wordless - Chinese Word Detokenizer'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i >= non_cjk_start:
                if wl_checking_unicode.has_han(token) or all(map(str.isnumeric, token)):
                    text += token

                    non_cjk_start += 1
                else:
                    # English
                    if wl_checking_unicode.is_eng_token(token):
                        for j, token in enumerate(tokens[i:]):
                            if i + j + 1 == len(tokens) or not wl_checking_unicode.is_eng_token(tokens[i + j + 1]):
                                text += wl_word_detokenize(main, tokens[non_cjk_start:i + j + 1], lang = 'eng')

                                non_cjk_start = i + j + 1

                                break
                    # Other Languages
                    else:
                        for j, token in enumerate(tokens[i:]):
                            if i + j + 1 == len(tokens) or wl_checking_unicode.has_han(tokens[i + j + 1]):
                                text += wl_word_detokenize(main, tokens[non_cjk_start:i + j + 1], lang = 'other')

                                non_cjk_start = i + j + 1

                                break
    # Japanese
    elif word_detokenizer == main.tr('Wordless - Japanese Word Detokenizer'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i < non_cjk_start:
                continue

            if (
                wl_checking_unicode.has_han(token)
                or wl_checking_unicode.has_kana(token)
                or all(map(str.isnumeric, token))
            ):
                text += token

                non_cjk_start = i + 1
            else:
                # English
                if wl_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(tokens) or not wl_checking_unicode.is_eng_token(tokens[i + j + 1]):
                            text += wl_word_detokenize(main, tokens[non_cjk_start:i + j + 1], lang = 'eng')

                            non_cjk_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (
                            i + j + 1 == len(tokens)
                            or wl_checking_unicode.has_han(tokens[i + j + 1])
                            or wl_checking_unicode.has_kana(tokens[i + j + 1])
                        ):
                            text += wl_word_detokenize(main, tokens[non_cjk_start:i + j + 1], lang = 'other')

                            non_cjk_start = i + j + 1

                            break
    # Thai
    elif word_detokenizer == main.tr('Wordless - Thai Word Detokenizer'):
        non_thai_start = 0

        for i, token in enumerate(tokens):
            if i < non_thai_start:
                continue

            if wl_checking_unicode.has_thai(token):
                if type(token) == wl_text.Wl_Token:
                    text += token + token.boundary
                else:
                    text += token

                non_thai_start = i + 1
            else:
                # English
                if wl_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(tokens) or not wl_checking_unicode.is_eng_token(tokens[i + j + 1]):
                            text += wl_word_detokenize(main, tokens[non_thai_start:i + j + 1], lang = 'eng')

                            non_thai_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(tokens) or wl_checking_unicode.has_thai(tokens[i + j + 1]):
                            text += wl_word_detokenize(main, tokens[non_thai_start:i + j + 1], lang = 'other')

                            non_thai_start = i + j + 1

                            break
    # Tibetan
    elif word_detokenizer == main.tr('Wordless - Tibetan Word Detokenizer'):
        non_tibetan_start = 0

        for i, token in enumerate(tokens):
            if i < non_tibetan_start:
                continue

            if wl_checking_unicode.has_tibetan(token):
                # Check for Tibetan Mark Shad
                # See: https://w3c.github.io/tlreq/#section_breaks
                if i > 0 and token[0] == '།':
                    text += token
                else:
                    text += token

                non_tibetan_start = i + 1
            else:
                # English
                if wl_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(tokens) or not wl_checking_unicode.is_eng_token(tokens[i + j + 1]):
                            text += wl_word_detokenize(main, tokens[non_tibetan_start:i + j + 1], lang = 'eng')

                            non_tibetan_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(tokens) or wl_checking_unicode.has_tibetan(tokens[i + j + 1]):
                            text += wl_word_detokenize(main, tokens[non_tibetan_start:i + j + 1], lang = 'other')

                            non_tibetan_start = i + j + 1

                            break

    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()
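# Hedged, standalone sketch of the Moses detokenization branch above, calling sacremoses
# directly instead of going through the Wordless wrapper and its `main` object.
import sacremoses

detokenizer = sacremoses.MosesDetokenizer(lang = 'en')

tokens = ['Hello', ',', 'world', '!']
print(detokenizer.detokenize(tokens))  # -> 'Hello, world!'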
def wl_get_stop_word_list(main, lang, stop_word_list = 'default'):
    if stop_word_list == 'default':
        stop_word_list = main.settings_custom['stop_word_lists']['stop_word_lists'][lang]

    if stop_word_list == main.tr('Custom List'):
        stop_word_list = main.settings_custom['stop_word_lists']['custom_lists'][lang]
    else:
        lang_639_1 = wl_conversion.to_iso_639_1(main, lang)

        # Chinese (Simplified)
        if lang_639_1 == 'zh_cn':
            lang_639_1 = 'zh'

        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            cc = opencc.OpenCC('s2tw')

            stop_word_list_zho_cn = wl_get_stop_word_list(
                main,
                lang = 'zho_cn',
                stop_word_list = stop_word_list.replace('Chinese (Traditional)', 'Chinese (Simplified)')
            )
            stop_word_list = [cc.convert(stop_word) for stop_word in stop_word_list_zho_cn]
        # extra-stopwords
        elif 'extra-stopwords' in stop_word_list:
            LANG_TEXTS = {
                'sqi': 'albanian', 'ara': 'arabic', 'hye': 'armenian', 'eus': 'basque', 'bel': 'belarusian',
                'ben': 'bengali', 'bul': 'bulgarian', 'cat': 'catalan', 'zho_cn': 'chinese', 'hrv': 'croatian',
                'ces': 'czech', 'dan': 'danish', 'nld': 'dutch', 'eng': 'english', 'est': 'estonian',
                'fin': 'finnish', 'fra': 'french', 'glg': 'galician', 'deu': 'german', 'ell': 'greek',
                'hau': 'hausa', 'heb': 'hebrew', 'hin': 'hindi', 'hun': 'hungarian', 'isl': 'icelandic',
                'ind': 'indonesian', 'gle': 'irish', 'ita': 'italian', 'jpn': 'japanese', 'kor': 'korean',
                'kur': 'kurdish', 'lav': 'latvian', 'lit': 'lithuanian', 'msa': 'malay', 'mar': 'marathi',
                'mon': 'mongolian', 'nep': 'nepali',
                # Norwegian Bokmål & Norwegian Nynorsk
                'nob': 'norwegian', 'nno': 'norwegian',
                'fas': 'persian', 'pol': 'polish', 'por': 'portuguese', 'ron': 'romanian', 'rus': 'russian',
                'srp_cyrl': 'serbian-cyrillic', 'srp_latn': 'serbian', 'slk': 'slovak', 'slv': 'slovenian',
                'spa': 'spanish', 'swa': 'swahili', 'swe': 'swedish', 'tgl': 'tagalog', 'tel': 'telugu',
                'tha': 'thai', 'tur': 'turkish', 'ukr': 'ukranian', 'urd': 'urdu', 'vie': 'vietnamese',
                'yor': 'yoruba'
            }

            with open(wl_misc.get_normalized_path(f'stop_word_lists/extra-stopwords/{LANG_TEXTS[lang]}'), 'r', encoding = 'utf_8') as f:
                stop_word_list = [line.rstrip() for line in f if not line.startswith('#')]
        # NLTK
        elif 'NLTK' in stop_word_list:
            LANG_TEXTS = {
                'ara': 'arabic', 'aze': 'azerbaijani', 'dan': 'danish', 'nld': 'dutch', 'eng': 'english',
                'fin': 'finnish', 'fra': 'french', 'deu': 'german', 'ell': 'greek', 'hun': 'hungarian',
                'ind': 'indonesian', 'ita': 'italian', 'kaz': 'kazakh', 'nep': 'nepali',
                # Norwegian Bokmål & Norwegian Nynorsk
                'nob': 'norwegian', 'nno': 'norwegian',
                'por': 'portuguese', 'ron': 'romanian', 'rus': 'russian', 'slv': 'slovene', 'spa': 'spanish',
                'swe': 'swedish', 'tgk': 'tajik', 'tur': 'turkish'
            }

            stop_word_list = nltk.corpus.stopwords.words(LANG_TEXTS[lang])
        # spaCy
        elif 'spaCy' in stop_word_list:
            # Serbian (Cyrillic) & Serbian (Latin)
            if lang_639_1 == 'sr_cyrl':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_word_list = spacy_lang.STOP_WORDS
            elif lang_639_1 == 'sr_latn':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_word_list = spacy_lang.STOP_WORDS
                stop_word_list = wl_text_utils.to_srp_latn(stop_word_list)
            else:
                spacy_lang = importlib.import_module(f'spacy.lang.{lang_639_1}')

                stop_word_list = spacy_lang.STOP_WORDS
        # Stopwords ISO
        elif 'Stopwords ISO' in stop_word_list:
            # Greek (Ancient)
            if lang_639_1 == 'grc':
                lang_639_1 = 'el'

            # Norwegian Bokmål & Norwegian Nynorsk
            if lang_639_1 in ['nb', 'nn']:
                lang_639_1 = 'no'

            with open(wl_misc.get_normalized_path('stop_word_lists/Stopwords ISO/stopwords_iso.json'), 'r', encoding = 'utf_8') as f:
                stop_word_list = json.load(f)[lang_639_1]
        # Thai
        elif stop_word_list == main.tr('PyThaiNLP - Thai Stop Word List'):
            stop_word_list = pythainlp.corpus.common.thai_stopwords()

    # Remove empty tokens
    stop_word_list = [stop_word for stop_word in stop_word_list if stop_word]

    return sorted(set(stop_word_list))
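# Standalone sketch of the NLTK branch above and of how a returned stop word list is
# typically applied to a list of tokens; assumes the 'stopwords' corpus has been
# downloaded beforehand (nltk.download('stopwords')).
import nltk

stop_words = set(nltk.corpus.stopwords.words('english'))

tokens = ['this', 'is', 'a', 'short', 'example']
content_tokens = [token for token in tokens if token not in stop_words]

print(content_tokens)  # -> ['short', 'example']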
def wl_lemmatize(main, tokens, lang, text_type = ('untokenized', 'untagged'), lemmatizer = 'default'):
    empty_offsets = []
    mapping_lemmas = {}
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags_all = wl_matching.get_re_tags(main, tags = 'all')
    re_tags_pos = wl_matching.get_re_tags(main, tags = 'pos')
    re_tags_non_pos = wl_matching.get_re_tags(main, tags = 'non_pos')

    if text_type[1] == 'tagged_both':
        tags = [''.join(re.findall(re_tags_all, token)) for token in tokens]
        tokens = [re.sub(re_tags_all, '', token) for token in tokens]
    elif text_type[1] == 'tagged_pos':
        tags = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_pos, '', token) for token in tokens]
    elif text_type[1] == 'tagged_non_pos':
        tags = [''.join(re.findall(re_tags_non_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_non_pos, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record empty tokens
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            empty_offsets.append(i)

            tokens.remove(token)

    wl_text_utils.check_lemmatizers(main, lang)

    if tokens and lang in main.settings_global['lemmatizers']:
        if lemmatizer == 'default':
            lemmatizer = main.settings_custom['lemmatization']['lemmatizers'][lang]

        # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
        if 'spaCy' in lemmatizer:
            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = spacy.tokens.Doc(nlp.vocab, words = tokens)
            nlp.tagger(doc)

            lemmas = [token.lemma_ for token in doc]
        # English
        elif lemmatizer == main.tr('NLTK - WordNet Lemmatizer'):
            word_net_lemmatizer = nltk.WordNetLemmatizer()

            for token, pos in wl_pos_tagging.wl_pos_tag(
                main, tokens,
                lang = 'eng',
                pos_tagger = 'NLTK - Perceptron POS Tagger',
                tagset = 'universal'
            ):
                if pos == 'ADJ':
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
                elif pos in ['NOUN', 'PROPN']:
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
                elif pos == 'ADV':
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
                elif pos in ['VERB', 'AUX']:
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
                else:
                    lemmas.append(word_net_lemmatizer.lemmatize(token))
        # Greek (Ancient)
        elif lemmatizer == main.tr('lemmalist-greek - Greek (Ancient) Lemma List'):
            with open(wl_misc.get_normalized_path('lemmatization/lemmalist-greek/lemmalist-greek.txt'), 'r', encoding = 'utf_8') as f:
                for line in f.readlines():
                    line = line.rstrip()

                    if line:
                        lemma, *words = line.split()

                        for word in words:
                            mapping_lemmas[word] = lemma
        # Russian & Ukrainian
        elif lemmatizer == main.tr('pymorphy2 - Morphological Analyzer'):
            if lang == 'rus':
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang = 'ru')
            else:
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang = 'uk')

            for token in tokens:
                lemmas.append(morphological_analyzer.parse(token)[0].normal_form)
        # Tibetan
        elif lemmatizer == main.tr('botok - Tibetan Lemmatizer'):
            wl_text_utils.check_word_tokenizers(main, lang = 'bod')

            tokens = main.botok_word_tokenizer.tokenize(' '.join(tokens))

            for token in tokens:
                if token.lemma:
                    lemmas.append(token.lemma)
                else:
                    lemmas.append(token.text)
        # Other Languages
        elif 'Lemmatization Lists' in lemmatizer:
            lang = wl_conversion.to_iso_639_1(main, lang)

            with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
                for line in f:
                    try:
                        lemma, word = line.rstrip().split('\t')

                        mapping_lemmas[word] = lemma
                    except ValueError:
                        pass
    else:
        lemmas = tokens

    if mapping_lemmas:
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Insert empty lemmas
    for empty_offset in sorted(empty_offsets):
        lemmas.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
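# Standalone sketch of the WordNet branch above: the universal POS tag decides which
# WordNet POS constant is passed to the lemmatizer. Assumes the 'wordnet' corpus has
# been downloaded beforehand (nltk.download('wordnet')).
import nltk

lemmatizer = nltk.WordNetLemmatizer()

print(lemmatizer.lemmatize('better', pos = nltk.corpus.wordnet.ADJ))   # -> 'good'
print(lemmatizer.lemmatize('running', pos = nltk.corpus.wordnet.VERB)) # -> 'run'
print(lemmatizer.lemmatize('cars', pos = nltk.corpus.wordnet.NOUN))    # -> 'car'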
def test_to_iso_639_1(lang_code):
    len_iso_639_3 = max([len(lang_code) for lang_code in main.settings_global['lang_codes']])

    iso_639_1 = wl_conversion.to_iso_639_1(main, lang_code)

    assert iso_639_1 == main.settings_global['lang_codes'][lang_code]
def wl_word_tokenize(main, text, lang, word_tokenizer = 'default', flat_tokens = True):
    tokens_multilevel = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # Check initialization status of word (and sentence) tokenizers
    if flat_tokens:
        wl_text_utils.check_word_tokenizers(main, lang = lang, word_tokenizer = word_tokenizer)
    else:
        wl_text_utils.check_tokenizers(main, lang = lang, word_tokenizer = word_tokenizer)

    # NLTK
    if 'NLTK' in word_tokenizer:
        sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NLTK Tokenizer'):
            nltk_tokenizer = nltk.NLTKWordTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(nltk_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(toktok_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(tweet_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, text, lang)

        moses_tokenizer = sacremoses.MosesTokenizer(lang = wl_conversion.to_iso_639_1(main, lang))

        for sentence in sentences:
            tokens_multilevel.append(moses_tokenizer.tokenize(sentence, escape = False))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_multilevel.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_multilevel.append([token.text for token in sentence.as_doc()])
    # syntok
    elif word_tokenizer == 'syntok - Word Tokenizer':
        syntok_tokenizer = syntok.tokenizer.Tokenizer()

        if flat_tokens:
            tokens_multilevel.append([token.value for token in syntok_tokenizer.tokenize(text)])
        else:
            for para in syntok.segmenter.analyze(text):
                for sentence in para:
                    tokens_multilevel.append([token.value for token in sentence])
    # Chinese & Japanese
    elif 'jieba' in word_tokenizer or 'nagisa' in word_tokenizer or 'Wordless' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, text, lang = lang)

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_multilevel.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr('Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wl_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wl_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wl_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wl_word_tokenize(main, sentence[non_han_start:i + j + 1], lang = 'eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wl_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wl_word_tokenize(main, sentence[non_han_start:i + j + 1], lang = 'other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_multilevel.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_multilevel.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wl_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wl_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wl_checking_unicode.is_kana(sentence[i + j + 1]):
                                        tokens.extend(wl_word_tokenize(main, sentence[non_han_start:i + j + 1], lang = 'jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wl_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wl_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wl_word_tokenize(main, sentence[non_han_start:i + j + 1], lang = 'eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wl_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wl_word_tokenize(main, sentence[non_han_start:i + j + 1], lang = 'other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_multilevel.append(tokens)
    # Russian
    elif word_tokenizer == 'razdel - Russian Word Tokenizer':
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, text, lang = 'rus')

        for sentence in sentences:
            tokens_multilevel.append([token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
        sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, text, lang = 'tha')

        if word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_multilevel.append(pythainlp.word_tokenize(sentence, engine = 'longest'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching'):
            for sentence in sentences:
                tokens_multilevel.append(pythainlp.word_tokenize(sentence, engine = 'mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching + TCC'):
            for sentence in sentences:
                tokens_multilevel.append(pythainlp.word_tokenize(sentence, engine = 'newmm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching + TCC (Safe Mode)'):
            for sentence in sentences:
                tokens_multilevel.append(pythainlp.word_tokenize(sentence, engine = 'newmm-safe'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, text, lang = 'bod')

        for sentence in sentences:
            tokens_multilevel.append([token.text for token in main.botok_word_tokenizer.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main, text,
                lang = 'vie',
                sentence_tokenizer = 'Underthesea - Vietnamese Sentence Tokenizer'
            )

        for sentence in sentences:
            tokens_multilevel.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_multilevel):
        tokens_multilevel[i] = [token.strip() for token in sentence if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_multilevel:
            if sentence:
                sentence[-1] = wl_text.Wl_Token(sentence[-1], boundary = '', sentence_ending = True)
    else:
        for sentence in tokens_multilevel:
            if sentence:
                sentence[-1] = wl_text.Wl_Token(sentence[-1], boundary = ' ', sentence_ending = True)

    # Clause tokenization
    if not flat_tokens:
        for i, sentence in enumerate(tokens_multilevel):
            tokens_multilevel[i] = wl_sentence_tokenization.wl_clause_tokenize(main, sentence, lang)

    # Flatten tokens
    tokens_flat = list(wl_misc.flatten_list(tokens_multilevel))

    if flat_tokens:
        return tokens_flat
    else:
        return tokens_multilevel
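# Standalone sketch of the Sacremoses branch above, contrasting the flat output with the
# per-sentence (multilevel) output. The manual sentence split here is a naive stand-in
# for wl_sentence_tokenize; no Wordless `main` object is involved.
import sacremoses

tokenizer = sacremoses.MosesTokenizer(lang = 'en')
text = 'Hello, world! How are you?'

# flat_tokens = True: tokenize the whole text at once
tokens_flat = tokenizer.tokenize(text, escape = False)

# flat_tokens = False: tokenize sentence by sentence
sentences = ['Hello, world!', 'How are you?']
tokens_multilevel = [tokenizer.tokenize(sentence, escape = False) for sentence in sentences]

print(tokens_flat)
print(tokens_multilevel)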
def test_to_iso_639_1():
    for lang_code in TO_ISO_639_1.keys():
        lang_code_639_1 = wl_conversion.to_iso_639_1(main, lang_code)

        assert lang_code_639_1 == TO_ISO_639_1[lang_code]
def wl_lemmatize_text(main, text, lang, tokenized, tagged, lemmatizer):
    lemmas = []

    # spaCy
    if lemmatizer.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)

        lemmas = [token.lemma_ for token in doc]
    # English
    elif lemmatizer == 'nltk_wordnet':
        word_net_lemmatizer = nltk.WordNetLemmatizer()

        for token, pos in wl_pos_tagging.wl_pos_tag(
            main, text,
            lang = 'eng_us',
            pos_tagger = 'nltk_perceptron',
            tagset = 'universal'
        ):
            if pos == 'ADJ':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
            elif pos in ['NOUN', 'PROPN']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
            elif pos == 'ADV':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
            elif pos in ['VERB', 'AUX']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
            else:
                lemmas.append(word_net_lemmatizer.lemmatize(token))
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        lemmas = [
            token.dictionary_form()
            for token in main.sudachipy_word_tokenizer.tokenize(text)
        ]
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)

        for token in tokens:
            lemmas.append(morphological_analyzer.parse(token)[0].normal_form)
    # Tibetan
    elif lemmatizer == 'botok_bod':
        tokens = main.botok_word_tokenizer.tokenize(text)

        for token in tokens:
            if token.lemma:
                lemmas.append(token.lemma)
            else:
                lemmas.append(token.text)
    # Lemmatization Lists
    elif lemmatizer.startswith('lemmatization_lists_'):
        mapping_lemmas = {}

        lang = wl_conversion.to_iso_639_1(main, lang)
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
            for line in f:
                try:
                    lemma, word = line.rstrip().split('\t')

                    mapping_lemmas[word] = lemma
                except ValueError:
                    pass

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Remove empty lemmas and strip whitespace in tokens
    lemmas = [
        str(lemma).strip()
        for lemma in lemmas
        if str(lemma).strip()
    ]

    return lemmas
def wl_lemmatize_tokens(main, tokens, lang, tokenized, tagged, lemmatizer):
    empty_offsets = []
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags = wl_matching.get_re_tags(main, tag_type = 'body')

    if tagged == _tr('wl_lemmatize_tokens', 'Yes'):
        tags = [''.join(re.findall(re_tags, token)) for token in tokens]
        tokens = [re.sub(re_tags, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record empty tokens with their tags
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            empty_offsets.append(i)

            del tokens[i]
            del tags[i]

    # spaCy
    if 'spacy' in lemmatizer:
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if lang != 'jpn':
            doc = spacy.tokens.Doc(nlp.vocab, words = tokens, spaces = [False] * len(tokens))

            for pipe_name in nlp.pipe_names:
                nlp.get_pipe(pipe_name)(doc)
        # The Japanese model does not have a lemmatizer component and Japanese lemmas are taken directly from SudachiPy
        # See: https://github.com/explosion/spaCy/discussions/9983#discussioncomment-1923647
        else:
            doc = nlp(''.join(tokens))

        lemma_tokens = [token.text for token in doc]
        lemmas = [token.lemma_ for token in doc]
    # English
    elif lemmatizer == 'nltk_wordnet':
        word_net_lemmatizer = nltk.WordNetLemmatizer()

        for token, pos in wl_pos_tagging.wl_pos_tag(
            main, tokens,
            lang = 'eng_us',
            pos_tagger = 'nltk_perceptron',
            tagset = 'universal'
        ):
            if pos == 'ADJ':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
            elif pos in ['NOUN', 'PROPN']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
            elif pos == 'ADV':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
            elif pos in ['VERB', 'AUX']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
            else:
                lemmas.append(word_net_lemmatizer.lemmatize(token))

        lemma_tokens = tokens.copy()
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        tokens_retokenized = main.sudachipy_word_tokenizer.tokenize(''.join(tokens))

        lemma_tokens = [token.surface() for token in tokens_retokenized]
        lemmas = [token.dictionary_form() for token in tokens_retokenized]
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        for token in tokens:
            lemmas.append(morphological_analyzer.parse(token)[0].normal_form)

        lemma_tokens = tokens.copy()
    # Tibetan
    elif lemmatizer == 'botok_bod':
        lemma_tokens = []
        tokens_retokenized = main.botok_word_tokenizer.tokenize(''.join(tokens))

        for token in tokens_retokenized:
            if token.lemma:
                lemmas.append(token.lemma)
            else:
                lemmas.append(token.text)

            lemma_tokens.append(token.text)
    # Lemmatization Lists
    elif 'lemmatization_lists' in lemmatizer:
        mapping_lemmas = {}

        lang = wl_conversion.to_iso_639_1(main, lang)
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
            for line in f:
                try:
                    lemma, word = line.rstrip().split('\t')

                    mapping_lemmas[word] = lemma
                except ValueError:
                    pass

        lemma_tokens = tokens.copy()
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Remove empty lemmas and strip whitespace in tokens
    for i, lemma in reversed(list(enumerate(lemmas))):
        lemma_tokens[i] = lemma_tokens[i].strip()
        lemmas[i] = lemma.strip()

        if not lemmas[i]:
            del lemmas[i]
            del lemma_tokens[i]

    # Make sure that tokenization is not modified during lemmatization
    i_tokens = 0
    i_lemmas = 0

    len_tokens = len(tokens)
    len_lemmas = len(lemmas)

    if len_tokens != len_lemmas:
        tags_modified = []
        lemmas_modified = []

        while i_tokens < len_tokens and i_lemmas < len_lemmas:
            # Different token
            if len(tokens[i_tokens]) != len(lemma_tokens[i_lemmas]):
                tokens_temp = [tokens[i_tokens]]
                tags_temp = [tags[i_tokens]]
                lemma_tokens_temp = [lemma_tokens[i_lemmas]]
                lemmas_temp = [lemmas[i_lemmas]]

                # Align tokens
                while i_tokens < len_tokens - 1 or i_lemmas < len_lemmas - 1:
                    len_tokens_temp = sum([len(token) for token in tokens_temp])
                    len_lemma_tokens_temp = sum([len(token) for token in lemma_tokens_temp])

                    if len_tokens_temp > len_lemma_tokens_temp:
                        lemma_tokens_temp.append(lemma_tokens[i_lemmas + 1])
                        lemmas_temp.append(lemmas[i_lemmas + 1])

                        i_lemmas += 1
                    elif len_tokens_temp < len_lemma_tokens_temp:
                        tokens_temp.append(tokens[i_tokens + 1])
                        tags_temp.append(tags[i_tokens + 1])

                        i_tokens += 1
                    else:
                        # Use lemmas in one-to-one alignments
                        if len(tokens_temp) == len(lemma_tokens_temp):
                            tags_modified.extend(tags_temp)
                            lemmas_modified.extend(lemmas_temp)
                        # Use original tokens in many-to-one or one-to-many alignments
                        else:
                            tags_modified.extend(tags)
                            lemmas_modified.extend(tokens_temp)

                        tokens_temp = []
                        tags_temp = []
                        lemma_tokens_temp = []
                        lemmas_temp = []

                        break

                if tokens_temp:
                    # Use lemmas in one-to-one alignments
                    if len(tokens_temp) == len(lemma_tokens_temp):
                        tags_modified.extend(tags_temp)
                        lemmas_modified.extend(lemmas_temp)
                    # Use original tokens in many-to-one or one-to-many alignments
                    else:
                        tags_modified.extend(tags)
                        lemmas_modified.extend(tokens_temp)
            else:
                tags_modified.extend(tags[i_tokens])
                lemmas_modified.append(lemmas[i_lemmas])

            i_tokens += 1
            i_lemmas += 1

        len_lemmas_modified = len(lemmas_modified)

        if len_tokens < len_lemmas_modified:
            tags = tags_modified[:len_tokens]
            lemmas = lemmas_modified[:len_tokens]
        elif len_tokens > len_lemmas_modified:
            tags = tags_modified + [tags_modified[-1]] * (len_tokens - len_lemmas_modified)
            lemmas = lemmas_modified + [lemmas_modified[-1]] * (len_tokens - len_lemmas_modified)
        else:
            tags = tags_modified.copy()
            lemmas = lemmas_modified.copy()

    # Insert empty lemmas and their tags after alignment of input and output
    for empty_offset in sorted(empty_offsets):
        lemmas.insert(empty_offset, '')
        tags.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
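# Toy, standalone illustration of the final step above (the values are hypothetical):
# positions of empty tokens recorded earlier are re-inserted as empty strings, then each
# lemma is rejoined with the tag that was stripped from its original token.
lemmas = ['go', 'home']
tags = ['_VERB', '_NOUN']
empty_offsets = [1]  # position of a token that was empty or whitespace-only

for empty_offset in sorted(empty_offsets):
    lemmas.insert(empty_offset, '')
    tags.insert(empty_offset, '')

print([lemma + tag for lemma, tag in zip(lemmas, tags)])
# -> ['go_VERB', '', 'home_NOUN']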