def to_posix_string(lang_code):
    """Convert a language code such as ``fra-ca`` to POSIX form (``fr_CA``).

    The code is first passed through ``use_underscore``. Only the first two
    underscore-separated components (language, country) are considered; any
    further components are ignored.

    Returns:
        ``None`` when the normalized code is empty. Otherwise the lower-cased
        language, followed by ``_`` and the upper-cased country when a country
        component is present.

    Raises:
        ValueError: if the language component is accepted by neither
            ``is_valid639_1`` nor ``is_valid639_2`` (the latter also tried on
            the capitalized form of the input).
    """
    # Normalize fra-ca to fr_CA
    lang_code = use_underscore(lang_code)
    if not lang_code:
        return None

    if '_' in lang_code:
        # ISO format, must convert to POSIX format
        lang, country = lang_code.split('_')[:2]
    else:
        lang, country = lang_code, None

    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        # Keep the original code when no 639-1 equivalent is returned.
        temp = to_iso639_1(lang)
        posix_lang = temp if temp else lang
    else:
        # Last attempt: retry the lookup with the capitalized spelling.
        full_name = lang.lower().capitalize()
        if is_valid639_2(full_name):
            posix_lang = to_iso639_1(full_name)
        else:
            # The format operator must be applied to the string, not be part
            # of it, otherwise the message shows a literal "%s".
            raise ValueError(
                "The input %s in not a valid code to convert to posix format"
                % (full_name,))

    if country:
        return '_'.join([posix_lang.lower(), country.upper()])
    return posix_lang.lower()
def is_valid_lang(lang):
    """Tell whether *lang* is an accepted 2- or 3-letter language code.

    Two-character inputs are checked with ``iso639.is_valid639_1`` and
    three-character inputs with ``iso639.is_valid639_2``. Any other length is
    rejected.

    Returns:
        bool: always ``True`` or ``False`` (never an implicit ``None`` for a
        code of the right length that fails validation).
    """
    if len(lang) == 2:
        return bool(iso639.is_valid639_1(lang))
    if len(lang) == 3:
        return bool(iso639.is_valid639_2(lang))
    return False
def to_3_letter_lang(lang):
    """Return the 3-letter form of *lang*, or ``False`` when it is not valid.

    A 2-character input accepted by ``iso639.is_valid639_1`` is converted with
    ``iso639.to_iso639_2``; a 3-character input accepted by
    ``iso639.is_valid639_2`` is returned unchanged.
    """
    size = len(lang)
    if size == 2 and iso639.is_valid639_1(lang):
        return iso639.to_iso639_2(lang)
    if size == 3 and iso639.is_valid639_2(lang):
        return lang
    return False
def to_posix_string(locale_code):
    """Normalize a locale code (e.g. ``fra-ca``) to POSIX form (``fr_CA``).

    The code is split on ``_`` after going through ``use_underscore``. The
    first component is the language; up to three more components are
    accepted, and a fourth extra component (dialect) is dropped.

    Returns:
        ``None`` for an empty input, otherwise the components re-joined with
        ``_``: a 2-character last component upper-cased and a 4-character
        second component capitalized.

    Raises:
        ValueError: if the language is not recognized, if there are more than
            four components, or if the last component is neither 2 nor 4
            characters long.
    """
    if not locale_code:
        return None

    # Normalize fra-ca to fr_CA
    locale_code = use_underscore(locale_code)
    pieces = locale_code.split("_")

    # Normalize first component
    raw_lang = pieces[0]
    if is_valid639_1(raw_lang):
        pieces[0] = raw_lang
    elif is_valid639_2(raw_lang):
        # Fall back to the given code when no 639-1 form is returned.
        pieces[0] = to_iso639_1(raw_lang) or raw_lang
    else:
        # Retry with the capitalized spelling of the input.
        capitalized = raw_lang.lower().capitalize()
        if not is_valid639_2(capitalized):
            raise ValueError(
                "The input %s in not a valid code to convert to posix format"
                % (locale_code, ))
        pieces[0] = to_iso639_1(capitalized)

    piece_count = len(pieces)
    if piece_count > 4:
        raise ValueError("This locale has too many parts: " + locale_code)
    if piece_count == 4:
        # Drop dialect. Sorry.
        pieces.pop()

    if len(pieces) > 1:
        # Normalize Country
        tail_length = len(pieces[-1])
        if tail_length == 2:
            pieces[-1] = pieces[-1].upper()
        elif tail_length != 4:
            raise ValueError("The last part is not a script or country: "
                             + locale_code)
        # Normalize script
        if len(pieces[1]) == 4:
            pieces[1] = pieces[1].capitalize()

    return "_".join(pieces)
def to_posix_string(locale_code):
    """Turn a locale code such as ``fra-ca`` into its POSIX spelling ``fr_CA``.

    Steps: run the code through ``use_underscore``, split it on ``_``,
    resolve the language (first part), then tidy up the remaining parts.

    Returns:
        ``None`` when *locale_code* is empty; otherwise the parts joined with
        ``_``. A 2-character final part is upper-cased, a 4-character second
        part is capitalized, and the fourth extra part (dialect) is discarded.

    Raises:
        ValueError: for an unrecognized language, for more than four parts, or
            when the final part is neither 2 nor 4 characters long.
    """
    if not locale_code:
        return None

    # Normalize fra-ca to fr_CA
    locale_code = use_underscore(locale_code)
    parts = locale_code.split("_")

    # Normalize first component
    language = parts[0]
    if is_valid639_1(language):
        resolved = language
    elif is_valid639_2(language):
        two_letter = to_iso639_1(language)
        if two_letter:
            resolved = two_letter
        else:
            # No 639-1 form came back: keep what we were given.
            resolved = language
    else:
        # Second chance using the capitalized form of the input.
        candidate = language.lower().capitalize()
        if not is_valid639_2(candidate):
            raise ValueError(
                "The input %s in not a valid code to convert to posix format"
                % (locale_code,))
        resolved = to_iso639_1(candidate)
    parts[0] = resolved

    total_parts = len(parts)
    if total_parts > 4:
        raise ValueError("This locale has too many parts: "+locale_code)
    if total_parts == 4:
        # Drop dialect. Sorry.
        del parts[-1]

    # A lone language needs no further normalization.
    if len(parts) <= 1:
        return "_".join(parts)

    # Normalize Country
    last = parts[-1]
    if len(last) == 2:
        parts[-1] = last.upper()
    elif len(last) != 4:
        raise ValueError(
            "The last part is not a script or country: "+locale_code)

    # Normalize script
    script = parts[1]
    if len(script) == 4:
        parts[1] = script.capitalize()

    return "_".join(parts)
def to_posix_format(lang_code):
    """Convert a language code such as ``fra-ca`` to POSIX form (``fr_CA``).

    The code is first passed through ``use_underscore``; only the first two
    underscore-separated components (language, country) are used.

    Returns:
        ``None`` when the normalized code is empty or the language component
        is not recognized. Otherwise the lower-cased language, followed by
        ``_`` and the upper-cased country when a country is present.
    """
    # Normalize fra-ca to fr_CA
    lang_code = use_underscore(lang_code)
    if not lang_code:
        return None

    if '_' in lang_code:
        # ISO format, must convert to POSIX format
        lang, country = lang_code.split('_')[:2]
    else:
        lang, country = lang_code, None

    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        # Same fallback as the to_posix_string variants: when no 639-1 form
        # is returned, keep the given code instead of failing on `.lower()`.
        temp = to_iso639_1(lang)
        posix_lang = temp if temp else lang
    else:
        # Last attempt: retry the lookup with the capitalized spelling.
        full_name = lang.lower().capitalize()
        if is_valid639_2(full_name):
            posix_lang = to_iso639_1(full_name)
        else:
            return None

    if country:
        return '_'.join([posix_lang.lower(), country.upper()])
    return posix_lang.lower()
def get_filename_language(full_path):
    """Extract the language tag and flags from a subtitle-style file name.

    The base name is split on ``.`` and read from the right, skipping the
    final component (the extension): an optional ``forced`` marker, then an
    optional numeric component, then the language code, e.g.
    ``name.<lang>.<number>.forced.<ext>``.

    Args:
        full_path: path of the file; only its base name is inspected.

    Returns:
        tuple: ``(sub_lang, forced, numbered)``. ``sub_lang`` is the
        lower-cased code, or ``"Unknown"`` when the component is missing, is
        not 2 or 3 characters long, or is rejected by both
        ``iso639.is_valid639_1`` and ``iso639.is_valid639_2``.
    """
    parts = os.path.basename(full_path).split(".")

    def component(offset):
        # Lower-cased part counted from the end; "" when the name has too few
        # parts (previously this raised IndexError).
        return parts[-offset].lower() if offset <= len(parts) else ""

    forced = False
    numbered = False
    sub_lang = component(2)
    if sub_lang == "forced":
        forced = True
        sub_lang = component(3)
        if sub_lang.isnumeric():
            numbered = True
            sub_lang = component(4)
    elif sub_lang.isnumeric():
        numbered = True
        sub_lang = component(3)

    if len(sub_lang) in (2, 3):
        if (not iso639.is_valid639_1(sub_lang)
                and not iso639.is_valid639_2(sub_lang)):
            sub_lang = "Unknown"
    else:
        sub_lang = "Unknown"
    return (sub_lang, forced, numbered)
def clean_language(self):
    """Validate the ``language`` entry of ``self.cleaned_data``.

    Returns the whitespace-stripped value when ``is_valid639_2`` accepts it.

    Raises:
        forms.ValidationError: when the stripped value is rejected.
    """
    code = self.cleaned_data['language'].strip()
    if is_valid639_2(code):
        return code
    raise forms.ValidationError(
        _("Language must be valid a ISO-639-2 code"))
def is_valid_iso639_code(value):
    """Check *value* with ``is_valid639_1``, then with ``is_valid639_2``.

    The second validator is only consulted when the first one rejects the
    value; the result of whichever validator decided is returned as-is.
    """
    first_verdict = is_valid639_1(value)
    if first_verdict:
        return first_verdict
    return is_valid639_2(value)
def valid_lang(lang):
    """Return the verdict of ``is_valid639_2`` for *lang*, unchanged."""
    verdict = is_valid639_2(lang)
    return verdict
def findSubtitlesNoneIso639(scanfolder, isoMode, disablelangdetect):
    """
    Detect subtitles that do not comply with ISO-639.

    Walks *scanfolder* recursively and, for every ``.srt``/``.sub``/``.ass``
    file, compares the language code found in the file name against the
    requested ISO 639 flavour. A warning is printed for every mismatch and a
    summary is printed at the end.

    Args:
        scanfolder: root directory to walk.
        isoMode: expected flavour as a string, ``"1"`` or ``"2"``.
        disablelangdetect: when truthy, skip content-based language detection
            for incorrectly named files.

    TODO: Add more subtitle extensions (and read/parse them correctly for
          language detection)
    TODO: Separate language detection better in different functions
    TODO: Add percentage of certainty and possible other languages when low
          certainty
    TODO: Handle unicode better to detect languages like German and Dutch
          better
    TODO: Use table
    """
    subtitleExts = ['.srt', '.sub', '.ass']
    total = 0
    incorrect = 0
    detectedlang = 0
    for subdir, dirnames, filenames in os.walk(scanfolder):
        for filename in filenames:
            extension = os.path.splitext(filename)[1].lower()
            if extension not in subtitleExts:
                continue

            total += 1
            filepath = subdir + os.sep + filename
            incorrectSubtitle = False
            langcodeFromFilename = getIsoLanguageCodeFromFilename(filename)

            # Work out which flavour (if any) the code in the name belongs to.
            detectedLanguage = ""
            detectedIsoMode = False
            if is_valid639_1(langcodeFromFilename):
                detectedIsoMode = "1"
                detectedLanguage = iso639_to_name(langcodeFromFilename)
            if is_valid639_2(langcodeFromFilename):
                detectedIsoMode = "2"
                detectedLanguage = iso639_to_name(langcodeFromFilename)

            # Compare by value: `is not` on strings tests object identity and
            # only works by accident of interning.
            if detectedIsoMode != isoMode:
                isoShouldBe = ""
                if isoMode == "1" and detectedIsoMode == "2":
                    isoShouldBe = to_iso639_1(langcodeFromFilename)
                if isoMode == "2" and detectedIsoMode == "1":
                    isoShouldBe = to_iso639_2(langcodeFromFilename)

                incorrectSubtitle = True
                incorrect += 1
                warning = "Incorrectly named subtitle found at "
                warning += bold(filepath)
                printNotificationWarning(warning)

                if detectedIsoMode is not False:
                    info = "\t\tLang code " + bold(langcodeFromFilename)
                    info += " (ISO 639-" + str(detectedIsoMode) + ") "
                    info += "detected. The ISO 639-" + isoMode + " code"
                    info += " for " + detectedLanguage + " is "
                    info += bold(isoShouldBe) + "."
                    printNotificationInfo(info)

            if incorrectSubtitle and not disablelangdetect:
                # Best effort: any failure to read or analyse the file is
                # reported as a failed detection rather than aborting the scan.
                # NOTE(review): possibleLanguage and detectedlang are computed
                # but never printed or returned - confirm whether the message
                # was meant to be shown.
                try:
                    with io.open(filepath, "r", encoding="utf-8") as mfile:
                        my_unicode_string = mfile.read()
                    possibleLanguage = "\tDetected language is likely to "
                    possibleLanguage += "be \"" + detect(my_unicode_string)
                    possibleLanguage += "\"\n"
                    detectedlang += 1
                except Exception:
                    possibleLanguage = "\tLanguage detection failed\n"

    info = "Found subtitle files " + bold(str(total)) + " of which "
    info += bold(str(incorrect)) + " are incorrectly named!"
    printNotificationInfo(info)