def test_to_iso_639_3(lang_code):
    """Check ``wl_conversion.to_iso_639_3`` against the global language table.

    ``main.settings_global['lang_codes']`` is iterated as
    ``(ISO 639-3, ISO 639-1)`` pairs and inverted, so that the expected
    ISO 639-3 code can be looked up from the ISO 639-1 code under test.

    Args:
        lang_code: The code to convert. It must be one of the values of
            ``main.settings_global['lang_codes']``, otherwise the lookup of
            the expected value raises ``KeyError``.
            NOTE(review): presumably supplied by pytest parametrization or a
            fixture -- confirm against the rest of the test module.
    """
    converted = wl_conversion.to_iso_639_3(main, lang_code)

    # Invert the table: ISO 639-1 code -> ISO 639-3 code
    expected_by_639_1 = {
        code_639_1: code_639_3
        for code_639_3, code_639_1 in main.settings_global['lang_codes'].items()
    }

    assert converted == expected_by_639_1[lang_code]
def detect_lang(main, file):
    """Detect the language of a text file.

    The file is read either completely or up to a configured number of lines,
    classified with ``langid``, and the resulting code is converted with
    ``wl_conversion.to_iso_639_3``.

    Args:
        main: Object exposing ``settings_custom``. The keys read are
            ``['auto_detection']['detection_settings']`` (``number_lines_no_limit``,
            ``number_lines``) and
            ``['auto_detection']['default_settings']['default_lang']``.
        file: Mapping with the keys ``'path'`` and ``'encoding'`` used to open
            the file.

    Returns:
        A ``(lang, success)`` tuple. On success, ``lang`` is the value returned
        by ``wl_conversion.to_iso_639_3`` and ``success`` is ``True``. If
        reading, detection or conversion raises, ``lang`` is the configured
        default language and ``success`` is ``False``.
    """
    try:
        with open(file['path'], 'r', encoding=file['encoding']) as f:
            detection_settings = main.settings_custom['auto_detection']['detection_settings']

            if detection_settings['number_lines_no_limit']:
                text = f.read()
            else:
                max_lines = detection_settings['number_lines']
                lines = []

                for i, line in enumerate(f):
                    if i >= max_lines:
                        break

                    lines.append(line)

                text = ''.join(lines)

        lang_code_639_1 = langid.classify(text)[0]

        # Chinese (Simplified) & Chinese (Traditional)
        if lang_code_639_1 == 'zh':
            # Fall back to Simplified Chinese unless langdetect reports a variant
            lang_code_639_1 = 'zh_cn'

            for candidate in sorted(langdetect.detect_langs(text), key=lambda item: -item.prob):
                if candidate.lang in ['zh-cn', 'zh-tw']:
                    lang_code_639_1 = candidate.lang.replace('-', '_')

                    break
        # Norwegian Bokmål
        elif lang_code_639_1 == 'no':
            lang_code_639_1 = 'nb'
        # Serbian (Cyrillic)
        elif lang_code_639_1 == 'sr':
            lang_code_639_1 = 'sr_cyrl'

        lang = wl_conversion.to_iso_639_3(main, lang_code_639_1)
        success = True
    # Detection is best-effort, so any ordinary failure falls back to the
    # default language; KeyboardInterrupt and SystemExit are allowed to propagate
    except Exception:
        lang = main.settings_custom['auto_detection']['default_settings']['default_lang']
        success = False

    return lang, success
def check_missing_extra_langs(langs_supported, langs_global, msg):
    """Report language codes that are missing from or extra in ``langs_global``.

    Each code in ``langs_supported`` is converted with
    ``wl_conversion.to_iso_639_3``; if the result is not in ``langs_global``,
    a "Missing language code" message is printed and the module-level flag
    ``lang_missing`` is set to ``True``. Each code in ``langs_global`` is
    converted with ``wl_conversion.to_iso_639_1``; if the result is not in
    ``langs_supported``, an "Extra language code" message is printed and the
    module-level flag ``lang_extra`` is set to ``True``.

    The flags are only ever set to ``True`` here, never reset.

    Args:
        langs_supported: Iterable/container of language codes to check.
        langs_global: Iterable/container of language codes to check against.
        msg: Text appended to each printed message to identify what is being
            checked.
    """
    global lang_missing
    global lang_extra

    # Supported codes that have no counterpart in the global list
    for code in langs_supported:
        code_639_3 = wl_conversion.to_iso_639_3(main, code)

        if code_639_3 in langs_global:
            continue

        print(f'Missing language code "{code_639_3}/{code}" found for {msg}!')

        lang_missing = True

    # Global codes that have no counterpart in the supported list
    for code in langs_global:
        code_639_1 = wl_conversion.to_iso_639_1(main, code)

        if code_639_1 in langs_supported:
            continue

        print(f'Extra language code "{code}/{code_639_1}" found for {msg}!')

        lang_extra = True
def detect_lang_text(main, text):
    """Detect the language of a string.

    The text is classified with ``langid``. Some of the returned codes are
    remapped to the variant codes used here before conversion with
    ``wl_conversion.to_iso_639_3``. For Chinese, ``langdetect`` is consulted to
    choose between ``zh_cn`` and ``zh_tw``, defaulting to ``zh_cn``.

    Args:
        main: Passed through to ``wl_conversion.to_iso_639_3``.
        text: The text to classify.

    Returns:
        The value returned by ``wl_conversion.to_iso_639_3``, or ``'other'``
        if that value is ``None``.
    """
    # Codes reported by langid that map directly onto a specific variant
    variant_codes = {
        # English
        'en': 'en_us',
        # German
        'de': 'de_de',
        # Norwegian Bokmål
        'no': 'nb',
        # Portuguese
        'pt': 'pt_pt',
        # Serbian (Cyrillic)
        'sr': 'sr_cyrl'
    }

    detected = langid.classify(text)[0]

    # Chinese (Simplified) & Chinese (Traditional)
    if detected == 'zh':
        guesses = sorted(langdetect.detect_langs(text), key=lambda guess: -guess.prob)

        # Take the most probable Chinese variant, or Simplified Chinese if none is reported
        detected = next(
            (
                guess.lang.replace('-', '_')
                for guess in guesses
                if guess.lang in ['zh-cn', 'zh-tw']
            ),
            'zh_cn'
        )
    else:
        detected = variant_codes.get(detected, detected)

    converted = wl_conversion.to_iso_639_3(main, detected)

    # Other Languages
    return 'other' if converted is None else converted
def test_to_iso_639_3():
    """Check ``wl_conversion.to_iso_639_3`` against every entry of ``TO_ISO_639_3``.

    Each key of ``TO_ISO_639_3`` is converted and the result is compared with
    the value stored under that key.
    """
    for code, expected_639_3 in TO_ISO_639_3.items():
        converted = wl_conversion.to_iso_639_3(main, code)

        assert converted == expected_639_3