def to_posix_string(lang_code):
    """Convert a language code such as 'fra-ca' to POSIX form ('fr_CA').

    :param lang_code: ISO-style code, optionally with a country part.
    :return: POSIX locale string, or None for empty input.
    :raises ValueError: if the language part cannot be recognized.
    """
    # Normalize fra-ca to fr_CA
    lang_code = use_underscore(lang_code)
    if not lang_code:
        return None
    if '_' in lang_code:
        # ISO format, must convert to POSIX format
        lang, country = lang_code.split('_')[:2]
    else:
        lang, country = lang_code, None
    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        # Prefer the 2-letter code; keep the 3-letter one if no mapping exists.
        temp = to_iso639_1(lang)
        posix_lang = temp if temp else lang
    else:
        # Last resort: the language may have been given as a full English name.
        full_name = lang.lower().capitalize()
        if is_valid639_2(full_name):
            posix_lang = to_iso639_1(full_name)
        else:
            # BUG FIX: the original put the % operator *inside* the string
            # literal, so the message was never formatted; also "in not" typo.
            raise ValueError(
                "The input %s is not a valid code to convert to posix format"
                % (full_name,))
    if country:
        return '_'.join([posix_lang.lower(), country.upper()])
    return posix_lang.lower()
def normalize(document, language=None):
    """Normalize *document* with the normalizer for its language.

    If *language* is None the language is auto-detected; unsupported
    languages fall back to the English normalizer.

    :raises ValueError: if *language* is given but is not a valid
        ISO 639-1 code.
    """
    if language is not None and not iso639.is_valid639_1(language):
        raise ValueError(
            '"{}" is not a valid ISO 639-1 code.'.format(language))
    normalizers = {
        'en': normalize_english,
        'ja': normalize_japanese,
    }
    lang = language or detect_language(document)
    normalizer = normalizers.get(lang, normalize_english)
    return normalizer(document)
def is_valid_lang(lang):
    """Return True if *lang* is a valid ISO 639-1 (2-letter) or
    ISO 639-2 (3-letter) language code, else False.

    BUG FIX: the original fell off the end and returned None (instead of
    False) for invalid 2- or 3-letter codes; both are falsy, so this is
    backward-compatible, but the return type is now consistently bool.
    """
    if len(lang) == 2:
        return iso639.is_valid639_1(lang)
    if len(lang) == 3:
        return iso639.is_valid639_2(lang)
    return False
def to_3_letter_lang(lang):
    """Return the ISO 639-2 (3-letter) form of *lang*.

    A valid 2-letter code is converted; a valid 3-letter code is returned
    unchanged; anything else yields False.
    """
    is_two_letter = len(lang) == 2 and iso639.is_valid639_1(lang)
    if is_two_letter:
        return iso639.to_iso639_2(lang)
    if len(lang) == 3 and iso639.is_valid639_2(lang):
        return lang
    return False
def ensure_locale_has_country(locale):
    """Return a POSIX locale that includes a country component.

    A bare language code is resolved against the configured
    'available_languages'; a valid ISO 639-1 code with no configured
    match is returned as-is, and anything else yields None.
    """
    # assuming a posix locale
    if '_' in locale:
        # TODO: Default countries for languages. Look in pycountry?
        return locale
    # first look in config
    settings = get_config()
    available = settings.get('available_languages', 'en_CA fr_CA').split()
    # Reversed so that earlier entries in the config win on collision.
    avail_langs = {get_language(loc): loc
                   for loc in reversed(available) if '_' in loc}
    match = avail_langs.get(locale)
    if match:
        return match
    return locale if is_valid639_1(locale) else None
def ensure_locale_has_country(locale):
    """Ensure *locale* carries a country part.

    Locales that already contain '_' are returned unchanged.  Otherwise
    the configured 'available_languages' are searched for a full locale
    with the same language; failing that, a valid ISO 639-1 code is
    returned bare and invalid input yields None.
    """
    # assuming a posix locale
    if '_' in locale:
        # TODO: Default countries for languages. Look in pycountry?
        return locale
    # first look in config
    settings = get_config()
    configured = settings.get('available_languages', 'en_CA fr_CA').split()
    by_language = {}
    # Walk the list backwards so earlier config entries take precedence.
    for loc in reversed(configured):
        if '_' in loc:
            by_language[get_language(loc)] = loc
    found = by_language.get(locale)
    if found:
        return found
    if is_valid639_1(locale):
        return locale
    return None
def to_posix_string(locale_code):
    """Normalize a locale code (e.g. 'fra-ca') to POSIX form ('fr_CA').

    Handles up to lang_script_country (a 4th, dialect part is dropped).

    :param locale_code: locale in ISO or POSIX-ish form.
    :return: normalized POSIX locale string, or None for empty input.
    :raises ValueError: if the language part is unrecognized, the locale
        has more than four parts, or the last part is neither a 2-letter
        country nor a 4-letter script.
    """
    if not locale_code:
        return None
    # Normalize fra-ca to fr_CA
    locale_code = use_underscore(locale_code)
    locale_parts = locale_code.split("_")
    # Normalize first component (the language)
    lang = locale_parts[0]
    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        # Prefer the 2-letter code; keep the 3-letter one if no mapping.
        temp = to_iso639_1(lang)
        posix_lang = temp if temp else lang
    else:
        # Aryan, not sure what case is being covered here
        full_name = lang.lower().capitalize()
        if is_valid639_2(full_name):
            posix_lang = to_iso639_1(full_name)
        else:
            # BUG FIX: "in not" -> "is not" in the error message.
            raise ValueError(
                "The input %s is not a valid code to convert to posix format"
                % (locale_code, ))
    locale_parts[0] = posix_lang
    if len(locale_parts) > 4:
        raise ValueError("This locale has too many parts: " + locale_code)
    elif len(locale_parts) == 4:
        # Drop dialect. Sorry.
        locale_parts.pop()
    if len(locale_parts) > 1:
        # Normalize Country (2 letters, upper-cased)
        if len(locale_parts[-1]) == 2:
            locale_parts[-1] = locale_parts[-1].upper()
        elif len(locale_parts[-1]) != 4:
            raise ValueError("The last part is not a script or country: "
                             + locale_code)
        # Normalize script (4-letter parts, e.g. 'Hans')
        if len(locale_parts[1]) == 4:
            locale_parts[1] = locale_parts[1].capitalize()
    return "_".join(locale_parts)
def to_posix_string(locale_code):
    """Convert *locale_code* (e.g. 'fra-ca') into a POSIX locale ('fr_CA').

    Supports lang, lang_COUNTRY, lang_Script_COUNTRY; a 4th (dialect)
    component is discarded.

    :return: normalized POSIX locale, or None for empty input.
    :raises ValueError: for an unrecognized language part, too many
        parts, or a trailing part that is neither country nor script.
    """
    if not locale_code:
        return None
    # Normalize fra-ca to fr_CA
    locale_code = use_underscore(locale_code)
    locale_parts = locale_code.split("_")
    # Normalize first component (the language)
    lang = locale_parts[0]
    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        # Map 3-letter to 2-letter when possible; otherwise keep as-is.
        temp = to_iso639_1(lang)
        posix_lang = temp if temp else lang
    else:
        # Aryan, not sure what case is being covered here
        full_name = lang.lower().capitalize()
        if is_valid639_2(full_name):
            posix_lang = to_iso639_1(full_name)
        else:
            # BUG FIX: "in not" -> "is not" in the error message.
            raise ValueError(
                "The input %s is not a valid code to convert to posix format"
                % (locale_code,))
    locale_parts[0] = posix_lang
    if len(locale_parts) > 4:
        raise ValueError("This locale has too many parts: "+locale_code)
    elif len(locale_parts) == 4:
        # Drop dialect. Sorry.
        locale_parts.pop()
    if len(locale_parts) > 1:
        # Normalize Country (2-letter parts are countries, upper-cased)
        if len(locale_parts[-1]) == 2:
            locale_parts[-1] = locale_parts[-1].upper()
        elif len(locale_parts[-1]) != 4:
            raise ValueError(
                "The last part is not a script or country: "+locale_code)
        # Normalize script (4-letter parts, e.g. 'Hans')
        if len(locale_parts[1]) == 4:
            locale_parts[1] = locale_parts[1].capitalize()
    return "_".join(locale_parts)
def to_posix_format(lang_code):
    """Convert a language code such as 'fra-ca' to POSIX form ('fr_CA').

    Unlike the sibling to_posix_string(), this variant returns None
    (rather than raising) when the language cannot be recognized.

    :return: POSIX locale string, or None for empty/unrecognized input.
    """
    # Normalize fra-ca to fr_CA
    lang_code = use_underscore(lang_code)
    if not lang_code:
        return None
    if '_' in lang_code:
        # ISO format, must convert to POSIX format
        lang, country = lang_code.split('_')[:2]
    else:
        lang, country = lang_code, None
    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        # BUG FIX: fall back to the 3-letter code when to_iso639_1 yields
        # nothing; the sibling implementations all guard this, and a None
        # here would crash on .lower() below.
        temp = to_iso639_1(lang)
        posix_lang = temp if temp else lang
    else:
        # Maybe the language was given as a full English name.
        full_name = lang.lower().capitalize()
        if is_valid639_2(full_name):
            posix_lang = to_iso639_1(full_name)
        else:
            return None  # explicit: unrecognized language code
    if country:
        return '_'.join([posix_lang.lower(), country.upper()])
    return posix_lang.lower()
def get_filename_language(full_path):
    """Extract subtitle language info from a filename.

    Expected shapes: name.lang.ext, name.lang.forced.ext,
    name.lang.N.ext, name.lang.N.forced.ext.

    :param full_path: path to the subtitle file.
    :return: tuple (language, forced, numbered) where language is the
        lower-cased ISO 639-1/2 code or "Unknown".
    """
    parts = os.path.basename(full_path).split(".")
    forced = False
    numbered = False
    # BUG FIX: the original indexed parts[-2]/[-3]/[-4] unconditionally
    # and raised IndexError for filenames with too few dot-separated
    # parts (e.g. no extension). Guard each access instead.
    if len(parts) < 2:
        return ("Unknown", forced, numbered)
    sub_lang = parts[-2].lower()
    if sub_lang == "forced":
        forced = True
        if len(parts) < 3:
            return ("Unknown", forced, numbered)
        sub_lang = parts[-3].lower()
        if sub_lang.isnumeric():
            numbered = True
            if len(parts) < 4:
                return ("Unknown", forced, numbered)
            sub_lang = parts[-4].lower()
    elif sub_lang.isnumeric():
        numbered = True
        if len(parts) < 3:
            return ("Unknown", forced, numbered)
        sub_lang = parts[-3].lower()
    if len(sub_lang) == 2 or len(sub_lang) == 3:
        if not iso639.is_valid639_1(sub_lang) and not iso639.is_valid639_2(sub_lang):
            sub_lang = "Unknown"
    else:
        sub_lang = "Unknown"
    return (sub_lang, forced, numbered)
def validate_lang(lang):
    """Validate *lang* as an ISO 639-1 language code.

    :raises ValueError: if the code is unknown.
    """
    import iso639
    if not iso639.is_valid639_1(lang):
        # BUG FIX: raise ValueError instead of bare Exception — more
        # specific, and still caught by callers handling Exception.
        raise ValueError("Unknown language: {}".format(lang))
def make_translations(cast, crew, target_lang, source_lang='auto'):
    '''
    Translate cast and crew fields in place via the Yandex Translate API.

    :param cast: sequence of cast objects with .character and .name
    :param crew: sequence of crew objects with .department, .job and .name
    :param target_lang: ISO 639-1 code to translate into
    :param source_lang: ISO 639-1 code to translate from, or 'auto' to
        let the API detect the source language
    :return: nothing. Modifies credits inline.
    :raises Exception: on invalid language codes or a failed HTTP request.
    '''
    # Nothing to do: English is assumed to be the stored language, and a
    # no-op translation (source == target) is skipped as well.
    if target_lang == 'en' or source_lang == target_lang:
        log.debug('No translation required')
        return
    # Reject codes that are not valid ISO 639-1 (except the 'auto' sentinel).
    if (source_lang != 'auto' and not iso639.is_valid639_1(source_lang)) or not iso639.is_valid639_1(target_lang):
        raise Exception('Wrong source or target language for translation')
    # --- Translate cast ---------------------------------------------------
    if cast:
        cast_characters = [cast.character for cast in cast]
        cast_names = [cast.name for cast in cast]
        # One batched request: all characters first, then all names, so the
        # response can be split back by offset below.
        cast_to_translate = [*cast_characters, *cast_names]
        # Yandex expects 'src-dst' or just 'dst' when the source is detected.
        params_cast = {
            'key': YANDEX_API_KEY,
            'text': cast_to_translate,
            'lang': f'{source_lang}-{target_lang}' if source_lang != 'auto' else f'{target_lang}'
        }
        response_cast = requests.post(YANDEX_BASE_URL, params=params_cast)
        if response_cast.status_code != 200:
            log.error('Translation request failed')
            raise Exception("Translation request failed")
        cast_translations = loads(response_cast.text)['text']
        n_cast = len(cast)
        i = 0
        while i < n_cast:
            # Translations come back as character, character, ..., name,
            # name, ... — i.e. twice the length of the cast list, so the
            # name of entry i lives at offset i + n_cast.
            cast[i].character = cast_translations[i]
            cast[i].name = cast_translations[i + n_cast]
            i += 1
    # --- Translate crew ---------------------------------------------------
    if crew:
        crew_departments = [crew.department for crew in crew]
        crew_jobs = [crew.job for crew in crew]
        crew_names = [crew.name for crew in crew]
        # Same batching trick: departments, then jobs, then names.
        crew_info_to_translate = [*crew_departments, *crew_jobs, *crew_names]
        params_crew = {
            'key': YANDEX_API_KEY,
            'text': crew_info_to_translate,
            'lang': f'{source_lang}-{target_lang}' if source_lang != 'auto' else f'{target_lang}'
        }
        response_crew = requests.post(YANDEX_BASE_URL, params=params_crew)
        if response_crew.status_code != 200:
            log.error('Translation request failed')
            raise Exception(f"Translation request failed")
        crew_translations = loads(response_crew.text)['text']
        n_crew = len(crew)
        i = 0
        while i < n_crew:
            # Response is three blocks of length n_crew: departments,
            # jobs, names — indexed by offsets 0, n_crew and 2*n_crew.
            crew[i].department = crew_translations[i]
            crew[i].job = crew_translations[i + n_crew]
            crew[i].name = crew_translations[i + 2 * n_crew]
            i += 1
def is_valid_iso639_code(value):
    """Return whether *value* is a valid ISO 639-1 or ISO 639-2 code."""
    result = is_valid639_1(value)
    if not result:
        result = is_valid639_2(value)
    return result
]
# NOTE(review): this fragment starts mid-file — the ']' above closes a list
# (presumably `flairs`) defined before this chunk; left untouched.
# Two-letter subdomain ids that are already taken and must be skipped.
inuse = ["nd", "ft", "zz", "hk", "dh", "ng", "zy"]
combinations = []
tables = []
# Build every combination of 2..len(flairs) flairs.
for i in range(2, len(flairs)+1):
    for f in itertools.combinations(flairs, i):
        combinations.append(f)
current_id = "ac"
for f in combinations:
    # Emit a CSS rule hiding the selectors of this flair combination for
    # the page tagged with the current subdomain id.
    print(", ".join(["html:lang(" + current_id + ") " + s[1] for s in f]) + " {")
    print(" display: none;")
    print("}")
    # Record which flairs are shown (table_true) vs hidden (table_false).
    table = []
    for z in flairs:
        if z in f:
            table.append(table_true)
        else:
            table.append(table_false)
    tables.append([table, current_id])
    # Advance to the next id, skipping valid ISO 639-1 codes and reserved
    # ids — presumably because html:lang() would treat those as real
    # language tags. TODO confirm.
    current_id = nextid(current_id)
    while is_valid639_1(current_id) or current_id in inuse:
        current_id = nextid(current_id)
# Blank separator lines before the markdown summary table.
for f in range(3):
    print()
print("Subdomain | " + " | ".join(f[0] for f in flairs))
print("-|" + "|".join([":-:" for i in range(len(flairs))]))
# One markdown row per generated subdomain, linking to the subreddit.
for table in tables:
    print("[`" + table[1] + "`](http://"+table[1]+".reddit.com/r/"+subreddit+") | " + " | ".join(table[0]))
def findSubtitlesNoneIso639(scanfolder, isoMode, disablelangdetect):
    """
    Detect subtitles that do not comply with ISO-639.

    Walks *scanfolder* recursively, checks each subtitle file's language
    suffix against the desired ISO 639 mode ("1" or "2"), warns about
    mismatches and optionally tries to detect the subtitle's language
    from its content.

    TODO: Add more subtitle extensions (and read/parse them correctly for
          language detection)
    TODO: Seperate language detection better in different functions
    TODO: Add percentage of certainty and possible other languages when
          low certainty
    TODO: Handle unicode better to detect languages like German and Dutch
          better
    TODO: Use table
    """
    subtitleExts = ['.srt', '.sub', '.ass']
    total = 0
    incorrect = 0
    detectedlang = 0
    for subdir, dirnames, filenames in os.walk(scanfolder):
        for filename in filenames:
            incorrectSubtitle = False
            extension = os.path.splitext(filename)[1].lower()
            # subdirName = os.path.basename(os.path.normpath(subdir))
            if extension not in subtitleExts:
                continue
            total = total + 1
            langcodeFromFilename = getIsoLanguageCodeFromFilename(filename)
            detectedLanguage = ""
            detectedIsoMode = False
            if is_valid639_1(langcodeFromFilename):
                detectedIsoMode = "1"
                detectedLanguage = iso639_to_name(langcodeFromFilename)
            if is_valid639_2(langcodeFromFilename):
                detectedIsoMode = "2"
                detectedLanguage = iso639_to_name(langcodeFromFilename)
            # BUG FIX: the original used "is not", which compares string
            # identity, not equality — it only appeared to work because
            # CPython interns short strings.
            if detectedIsoMode != isoMode:
                isoShouldBe = ""
                if isoMode == "1" and detectedIsoMode == "2":
                    isoShouldBe = to_iso639_1(langcodeFromFilename)
                if isoMode == "2" and detectedIsoMode == "1":
                    isoShouldBe = to_iso639_2(langcodeFromFilename)
                filepath = subdir + os.sep + filename
                incorrectSubtitle = True
                incorrect = incorrect + 1
                warning = "Incorrectly named subtitle found at "
                warning += bold(filepath)
                printNotificationWarning(warning)
                # Only suggest a rename when a code was actually detected.
                if detectedIsoMode is not False:
                    info = "\t\tLang code " + bold(langcodeFromFilename)
                    info += " (ISO 639-" + str(detectedIsoMode) + ") "
                    info += "detected. The ISO 639-" + isoMode + " code"
                    info += " for " + detectedLanguage + " is "
                    info += bold(isoShouldBe) + "."
                    printNotificationInfo(info)
            if incorrectSubtitle and not disablelangdetect:
                filepath = subdir + os.sep + filename
                try:
                    with io.open(filepath, "r", encoding="utf-8") as mfile:
                        my_unicode_string = mfile.read()
                    possibleLanguage = "\tDetected language is likely to "
                    possibleLanguage += "be \"" + detect(my_unicode_string)
                    possibleLanguage += "\"\n"
                    detectedlang = detectedlang + 1
                except Exception:
                    possibleLanguage = "\tLanguage detection failed\n"
                # NOTE(review): possibleLanguage (and the detectedlang
                # counter) are built but never displayed — presumably an
                # unfinished feature; left as-is to preserve behavior.
    info = "Found subtitle files " + bold(str(total)) + " of which "
    info += bold(str(incorrect)) + " are incorrectly named!"
    printNotificationInfo(info)