Exemple #1
0
def to_posix_string(lang_code):
    """Convert a language/locale code to a POSIX-style string.

    The input is first passed through ``use_underscore``. Only the first two
    underscore-separated components (language, country) are considered.

    :param lang_code: language or locale code (the original comment gives
        ``fra-ca`` -> ``fr_CA`` as the intended normalization).
    :return: the lower-cased language, joined with ``_`` to the upper-cased
        country when a country is present; ``None`` when the normalized
        input is empty.
    :raises ValueError: if the language component is not recognised.
    """
    # Normalize fra-ca to fr_CA
    lang_code = use_underscore(lang_code)
    if not lang_code:
        return None
    if '_' in lang_code:
        # ISO format, must convert to POSIX format
        lang, country = lang_code.split('_')[:2]
    else:
        lang, country = lang_code, None
    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        # Keep the original code when no ISO 639-1 equivalent comes back.
        posix_lang = to_iso639_1(lang) or lang
    else:
        # Last resort: retry with the capitalized form of the input.
        full_name = lang.lower().capitalize()
        if not is_valid639_2(full_name):
            # The format operator must be applied to the string; it used to
            # be part of the literal, so the message was never interpolated.
            raise ValueError(
                "The input %s is not a valid code to convert to posix format"
                % (full_name,))
        posix_lang = to_iso639_1(full_name)
    if country:
        return '_'.join([posix_lang.lower(), country.upper()])
    return posix_lang.lower()
def normalize(document, language=None):
    """Normalize *document* with the normalizer registered for its language.

    :param document: the document to normalize.
    :param language: optional ISO 639-1 code. When it is ``None`` (or
        otherwise falsy) the language comes from
        ``detect_language(document)``.
    :return: the result of the selected normalizer. ``normalize_english`` is
        used for every language that has no dedicated normalizer.
    :raises ValueError: if *language* is given but is not a valid
        ISO 639-1 code.
    """
    if language is not None:
        if not iso639.is_valid639_1(language):
            raise ValueError(
                '"{}" is not a valid ISO 639-1 code.'.format(language))

    normalizers = {
        'en': normalize_english,
        'ja': normalize_japanese,
    }
    effective_language = language or detect_language(document)
    normalizer = normalizers.get(effective_language, normalize_english)
    return normalizer(document)
def is_valid_lang(lang):
    """Tell whether *lang* is a valid 2- or 3-letter ISO 639 code.

    Two-character inputs are checked with ``iso639.is_valid639_1`` and
    three-character inputs with ``iso639.is_valid639_2``; any other length
    is rejected.

    :param lang: candidate language code (anything supporting ``len``).
    :return: ``True`` or ``False``. Invalid codes of length 2 or 3 used to
        fall off the end of the function and yield ``None``.
    """
    if len(lang) == 2:
        return bool(iso639.is_valid639_1(lang))
    if len(lang) == 3:
        return bool(iso639.is_valid639_2(lang))
    return False
def to_3_letter_lang(lang):
    """Return the 3-letter code for *lang*, or ``False`` if it is not valid.

    :param lang: a 2-letter or 3-letter language code.
    :return: ``iso639.to_iso639_2(lang)`` for a valid 2-letter code, *lang*
        itself for a valid 3-letter code, ``False`` otherwise.
    """
    size = len(lang)
    if size == 2 and iso639.is_valid639_1(lang):
        return iso639.to_iso639_2(lang)
    if size == 3 and iso639.is_valid639_2(lang):
        return lang
    return False
Exemple #5
0
def ensure_locale_has_country(locale):
    """Return *locale* completed with a country when one is configured.

    Assumes a posix locale. A locale that already contains ``_`` is returned
    unchanged. Otherwise the ``available_languages`` setting (default
    ``'en_CA fr_CA'``) is searched for an entry whose language matches.

    :param locale: posix locale string, with or without a country.
    :return: the matching configured locale; else *locale* itself when it is
        a valid ISO 639-1 code; else ``None``.
    """
    if '_' in locale:
        return locale

    # first look in config
    configured = get_config().get('available_languages', 'en_CA fr_CA')
    by_language = {}
    # Walk the list backwards so that earlier entries win on conflicts.
    for candidate in reversed(configured.split()):
        if '_' in candidate:
            by_language[get_language(candidate)] = candidate

    match = by_language.get(locale)
    if match:
        return match
    # TODO: Default countries for languages. Look in pycountry?
    return locale if is_valid639_1(locale) else None
Exemple #6
0
def ensure_locale_has_country(locale):
    """Add a country to a bare-language posix locale when possible.

    A locale that already contains ``_`` is returned as is. For a bare
    language, the ``available_languages`` setting (default
    ``'en_CA fr_CA'``) supplies the country-qualified form.

    :param locale: posix locale string, with or without a country.
    :return: the configured locale for that language if there is one;
        otherwise *locale* when it is a valid ISO 639-1 code, else ``None``.
    """
    if '_' in locale:
        return locale

    # first look in config
    raw_setting = get_config().get('available_languages', 'en_CA fr_CA')
    qualified = {}
    # Reverse iteration makes the first listed locale of a language prevail.
    for entry in reversed(raw_setting.split()):
        if '_' in entry:
            qualified[get_language(entry)] = entry

    found = qualified.get(locale)
    if found:
        return found
    # TODO: Default countries for languages. Look in pycountry?
    if is_valid639_1(locale):
        return locale
    return None
Exemple #7
0
def to_posix_string(locale_code):
    """Normalize a locale code into ``lang[_Script][_COUNTRY]`` form.

    The code is passed through ``use_underscore`` and split on ``_``. The
    language part is mapped to ISO 639-1 where possible, a 4-character
    second part is capitalized as a script, and a 2-character last part is
    upper-cased as a country. A fourth part (dialect) is dropped.

    :param locale_code: locale code such as ``fra-ca``; falsy values yield
        ``None``.
    :return: the normalized locale string, or ``None`` for falsy input.
    :raises ValueError: if the language is not recognised, if there are more
        than four parts, or if the last part is neither 2 nor 4 characters.
    """
    if not locale_code:
        return None
    # Normalize fra-ca to fr_CA
    locale_code = use_underscore(locale_code)
    parts = locale_code.split("_")

    # Normalize first component
    language = parts[0]
    if is_valid639_1(language):
        normalized = language
    elif is_valid639_2(language):
        # Fall back to the given code when there is no 639-1 equivalent.
        normalized = to_iso639_1(language) or language
    else:
        # The original author was unsure which inputs this branch serves;
        # it retries the lookup with the capitalized form.
        candidate = language.lower().capitalize()
        if not is_valid639_2(candidate):
            raise ValueError(
                "The input %s in not a valid code to convert to posix format" %
                (locale_code, ))
        normalized = to_iso639_1(candidate)
    parts[0] = normalized

    part_count = len(parts)
    if part_count > 4:
        raise ValueError("This locale has too many parts: " + locale_code)
    if part_count == 4:
        # Drop dialect. Sorry.
        del parts[-1]

    if len(parts) > 1:
        # Normalize Country
        tail = parts[-1]
        if len(tail) == 2:
            parts[-1] = tail.upper()
        elif len(tail) != 4:
            raise ValueError("The last part is not a script or country: " +
                             locale_code)
        # Normalize script
        if len(parts[1]) == 4:
            parts[1] = parts[1].capitalize()
    return "_".join(parts)
Exemple #8
0
def to_posix_string(locale_code):
    """Return *locale_code* normalized as ``lang[_Script][_COUNTRY]``.

    After ``use_underscore`` the code is split on ``_``. The language is
    converted to ISO 639-1 when an equivalent exists, a 4-character second
    component is capitalized (script), a 2-character final component is
    upper-cased (country), and a fourth component (dialect) is discarded.

    :param locale_code: locale code such as ``fra-ca``; falsy values yield
        ``None``.
    :return: the normalized locale string, or ``None`` for falsy input.
    :raises ValueError: for an unrecognised language, more than four
        components, or a final component that is neither 2 nor 4 characters.
    """
    if not locale_code:
        return None
    # Normalize fra-ca to fr_CA
    locale_code = use_underscore(locale_code)
    components = locale_code.split("_")

    # Normalize first component
    head = components[0]
    if is_valid639_1(head):
        posix_head = head
    elif is_valid639_2(head):
        # Keep the 639-2 code if no 639-1 equivalent is returned.
        posix_head = to_iso639_1(head) or head
    else:
        # Purpose of this branch was unclear to the original author: the
        # capitalized form of the input is tried as a last resort.
        capitalized = head.lower().capitalize()
        if not is_valid639_2(capitalized):
            raise ValueError(
                "The input %s in not a valid code to convert to posix format" %
                (locale_code,))
        posix_head = to_iso639_1(capitalized)
    components[0] = posix_head

    n_components = len(components)
    if n_components > 4:
        raise ValueError("This locale has too many parts: "+locale_code)
    if n_components == 4:
        # Drop dialect. Sorry.
        del components[-1]

    if len(components) > 1:
        # Normalize Country
        last = components[-1]
        if len(last) == 2:
            components[-1] = last.upper()
        elif len(last) != 4:
            raise ValueError(
                "The last part is not a script or country: "+locale_code)
        # Normalize script
        if len(components[1]) == 4:
            components[1] = components[1].capitalize()
    return "_".join(components)
Exemple #9
0
def to_posix_format(lang_code):
    """Convert a language/locale code to a POSIX-style string.

    The input is first passed through ``use_underscore``. Only the first two
    underscore-separated components (language, country) are considered.

    :param lang_code: language or locale code (the original comment gives
        ``fra-ca`` -> ``fr_CA`` as the intended normalization).
    :return: the lower-cased language, joined with ``_`` to the upper-cased
        country when a country is present; ``None`` when the normalized
        input is empty or the language is not recognised.
    """
    # Normalize fra-ca to fr_CA
    lang_code = use_underscore(lang_code)
    if not lang_code:
        return None
    if '_' in lang_code:
        # ISO format, must convert to POSIX format
        lang, country = lang_code.split('_')[:2]
    else:
        lang, country = lang_code, None
    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        # Same fallback as to_posix_string: keep the ISO 639-2 code when no
        # ISO 639-1 equivalent comes back, instead of failing on ``.lower()``
        # or emitting an empty language.
        posix_lang = to_iso639_1(lang) or lang
    else:
        # Last resort: retry with the capitalized form of the input.
        full_name = lang.lower().capitalize()
        if not is_valid639_2(full_name):
            return None
        posix_lang = to_iso639_1(full_name)
    if country:
        return '_'.join([posix_lang.lower(), country.upper()])
    return posix_lang.lower()
Exemple #10
0
def to_posix_format(lang_code):
    """Convert a language/locale code to a POSIX-style string.

    The input is first passed through ``use_underscore``. Only the first two
    underscore-separated components (language, country) are considered.

    :param lang_code: language or locale code (the original comment gives
        ``fra-ca`` -> ``fr_CA`` as the intended normalization).
    :return: the lower-cased language, joined with ``_`` to the upper-cased
        country when a country is present; ``None`` when the normalized
        input is empty or the language is not recognised.
    """
    # Normalize fra-ca to fr_CA
    lang_code = use_underscore(lang_code)
    if not lang_code:
        return None
    if '_' in lang_code:
        # ISO format, must convert to POSIX format
        lang, country = lang_code.split('_')[:2]
    else:
        lang, country = lang_code, None
    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        # Keep the ISO 639-2 code when no ISO 639-1 equivalent comes back;
        # without this a falsy result either fails on ``.lower()`` or
        # produces an empty language.
        posix_lang = to_iso639_1(lang) or lang
    else:
        # Last resort: retry with the capitalized form of the input.
        full_name = lang.lower().capitalize()
        if not is_valid639_2(full_name):
            return None
        posix_lang = to_iso639_1(full_name)
    if country:
        return '_'.join([posix_lang.lower(), country.upper()])
    return posix_lang.lower()
def get_filename_language(full_path):
    """Extract the language tag and flags from a subtitle file name.

    The base name is split on ``.`` and read from right to left, skipping
    the extension: an optional ``forced`` marker, then an optional numeric
    token, then the language code (for example ``movie.en.2.forced.srt``).

    :param full_path: path of the subtitle file.
    :return: tuple ``(sub_lang, forced, numbered)``. ``sub_lang`` is the
        lower-cased code when it is a valid 2-letter ISO 639-1 or 3-letter
        ISO 639-2 code, otherwise ``"Unknown"``. Names with too few
        dot-separated parts also give ``"Unknown"`` instead of raising
        ``IndexError``.
    """
    parts = os.path.basename(full_path).split(".")
    # Everything except the extension, consumed from the right-hand side.
    tokens = [part.lower() for part in parts[:-1]]

    def next_token():
        # An empty string stands for "no more parts"; it is neither
        # "forced", numeric, nor a valid language code.
        return tokens.pop() if tokens else ""

    forced = False
    numbered = False
    sub_lang = next_token()

    if sub_lang == "forced":
        forced = True
        sub_lang = next_token()
        if sub_lang.isnumeric():
            numbered = True
            sub_lang = next_token()
    elif sub_lang.isnumeric():
        numbered = True
        sub_lang = next_token()

    # Only 2- and 3-character tokens are looked up, as before.
    is_known = len(sub_lang) in (2, 3) and (
        iso639.is_valid639_1(sub_lang) or iso639.is_valid639_2(sub_lang))
    if not is_known:
        sub_lang = "Unknown"

    return (sub_lang, forced, numbered)
Exemple #12
0
 def validate_lang(lang):
     """Raise if *lang* is not a valid ISO 639-1 language code.

     :param lang: language code to check.
     :raises Exception: with message ``Unknown language: <lang>`` when
         ``iso639.is_valid639_1`` rejects the code.
     """
     import iso639
     if iso639.is_valid639_1(lang):
         return
     raise Exception("Unknown language: {}".format(lang))
def make_translations(cast, crew, target_lang, source_lang='auto'):
    '''
    Translate the text fields of cast and crew entries in place.

    Nothing is translated when the target language is 'en' or equals the
    source language. Each non-empty collection is sent in one POST request
    to YANDEX_BASE_URL, and the translated texts are written back onto the
    entries.

    :param cast: entries with ``character`` and ``name`` attributes
    :param crew: entries with ``department``, ``job`` and ``name`` attributes
    :param target_lang: ISO 639-1 code of the language to translate to
    :param source_lang: ISO 639-1 code of the language to translate from,
        or 'auto' (the default)
    :return: nothing. Modifies the entries inline
    :raises Exception: on an invalid language code or a non-200 response
    '''
    if target_lang == 'en' or source_lang == target_lang:
        log.debug('No translation required')
        return

    # Reject anything that is not a valid ISO 639-1 code ('auto' is allowed
    # for the source only).
    source_ok = source_lang == 'auto' or iso639.is_valid639_1(source_lang)
    if not source_ok or not iso639.is_valid639_1(target_lang):
        raise Exception('Wrong source or target language for translation')

    # Translating cast
    if cast:
        # All characters first, then all names, in one request.
        cast_texts = ([member.character for member in cast]
                      + [member.name for member in cast])
        # Source lang is 'auto' by default
        cast_request = {
            'key': YANDEX_API_KEY,
            'text': cast_texts,
            'lang': f'{source_lang}-{target_lang}' if source_lang != 'auto' else f'{target_lang}'
        }
        cast_reply = requests.post(YANDEX_BASE_URL, params=cast_request)
        if cast_reply.status_code != 200:
            log.error('Translation request failed')
            raise Exception("Translation request failed")
        cast_result = loads(cast_reply.text)['text']
        cast_count = len(cast)
        # The reply mirrors the request layout: character, character, ...,
        # name, name, ... so it is twice as long as the cast list.
        for position, member in enumerate(cast):
            member.character = cast_result[position]
            member.name = cast_result[position + cast_count]

    # Translating crew
    if crew:
        # Departments, then jobs, then names, in one request.
        crew_texts = ([person.department for person in crew]
                      + [person.job for person in crew]
                      + [person.name for person in crew])
        crew_request = {
            'key': YANDEX_API_KEY,
            'text': crew_texts,
            'lang': f'{source_lang}-{target_lang}' if source_lang != 'auto' else f'{target_lang}'
        }
        crew_reply = requests.post(YANDEX_BASE_URL, params=crew_request)
        if crew_reply.status_code != 200:
            log.error('Translation request failed')
            raise Exception("Translation request failed")
        crew_result = loads(crew_reply.text)['text']
        crew_count = len(crew)
        for position, person in enumerate(crew):
            person.department = crew_result[position]
            person.job = crew_result[position + crew_count]
            person.name = crew_result[position + 2 * crew_count]
Exemple #14
0
def is_valid_iso639_code(value):
    """Check *value* against ISO 639-1 first, then ISO 639-2.

    :param value: candidate language code.
    :return: the result of ``is_valid639_1(value)`` when it is truthy,
        otherwise the result of ``is_valid639_2(value)``.
    """
    two_letter_result = is_valid639_1(value)
    return two_letter_result if two_letter_result else is_valid639_2(value)
]
# Ids that must be skipped when handing out new two-letter ids.
inuse = ["nd", "ft", "zz", "hk", "dh", "ng", "zy"]
tables = []
# Every combination of two or more flairs, smallest combinations first.
combinations = [
    combo
    for size in range(2, len(flairs) + 1)
    for combo in itertools.combinations(flairs, size)
]
current_id = "ac"
for combo in combinations:
    # CSS rule hiding the selected flairs under this id.
    selectors = ["html:lang(" + current_id + ") " + flair[1] for flair in combo]
    print(", ".join(selectors) + " {")
    print("    display: none;")
    print("}")
    # One cell per flair: is it part of this combination or not.
    row = [table_true if flair in combo else table_false for flair in flairs]
    tables.append([row, current_id])
    # Advance to the next id that is neither a valid ISO 639-1 code nor in use.
    current_id = nextid(current_id)
    while is_valid639_1(current_id) or current_id in inuse:
        current_id = nextid(current_id)

# Three blank lines between the CSS rules and the table.
for _ in range(3):
    print()
print("Subdomain | " + " | ".join(flair[0] for flair in flairs))
print("-|" + "|".join([":-:"] * len(flairs)))
for row, subdomain in tables:
    print("[`" + subdomain + "`](http://" + subdomain + ".reddit.com/r/" + subreddit + ") | " + " | ".join(row))

def findSubtitlesNoneIso639(scanfolder, isoMode, disablelangdetect):
    """
    Detect subtitles that do not comply with ISO-639.

    Walks ``scanfolder`` recursively. For every file with a known subtitle
    extension, the language code taken from the file name is compared with
    the requested ISO 639 variant. A warning is printed for each mismatch
    and a summary is printed at the end.

    :param scanfolder: root directory to scan.
    :param isoMode: expected ISO 639 variant as a string, "1" or "2".
    :param disablelangdetect: when true, skip content-based language
        detection for incorrectly named files.

    TODO: Add more subtitle extensions (and read/parse them correctly for
          language detection)
    TODO: Separate language detection better in different functions
    TODO: Add percentage of certainty and possible other languages when
          low certainty
    TODO: Handle unicode better to detect languages like German and Dutch
          better
    TODO: Use table
    """
    subtitleExts = ['.srt', '.sub', '.ass']
    total = 0
    incorrect = 0
    # NOTE(review): counted below but never reported - confirm intent.
    detectedlang = 0
    for subdir, _dirnames, filenames in os.walk(scanfolder):
        for filename in filenames:
            extension = os.path.splitext(filename)[1].lower()
            # subdirName = os.path.basename(os.path.normpath(subdir))
            if extension not in subtitleExts:
                continue
            total = total + 1
            filepath = subdir + os.sep + filename
            incorrectSubtitle = False
            langcodeFromFilename = getIsoLanguageCodeFromFilename(filename)
            detectedLanguage = ""
            # False means "no valid ISO 639 code found in the file name".
            detectedIsoMode = False
            if is_valid639_1(langcodeFromFilename):
                detectedIsoMode = "1"
                detectedLanguage = iso639_to_name(langcodeFromFilename)
            if is_valid639_2(langcodeFromFilename):
                detectedIsoMode = "2"
                detectedLanguage = iso639_to_name(langcodeFromFilename)
            # Compare by value: identity (`is not`) on strings only works
            # by accident of interning.
            if detectedIsoMode != isoMode:
                isoShouldBe = ""
                if isoMode == "1" and detectedIsoMode == "2":
                    isoShouldBe = to_iso639_1(langcodeFromFilename)
                if isoMode == "2" and detectedIsoMode == "1":
                    isoShouldBe = to_iso639_2(langcodeFromFilename)
                incorrectSubtitle = True
                incorrect = incorrect + 1
                warning = "Incorrectly named subtitle found at "
                warning += bold(filepath)
                printNotificationWarning(warning)
                if detectedIsoMode is not False:
                    info = "\t\tLang code " + bold(langcodeFromFilename)
                    info += " (ISO 639-" + str(detectedIsoMode) + ") "
                    info += "detected. The ISO 639-" + isoMode + " code"
                    info += " for " + detectedLanguage + " is "
                    info += bold(isoShouldBe) + "."
                    printNotificationInfo(info)
            if incorrectSubtitle and not disablelangdetect:
                # Best effort: any failure to read or detect is reduced to a
                # "detection failed" message.
                try:
                    with io.open(filepath, "r", encoding="utf-8") as mfile:
                        my_unicode_string = mfile.read()
                    possibleLanguage = "\tDetected language is likely to "
                    possibleLanguage += "be \"" + detect(my_unicode_string)
                    possibleLanguage += "\"\n"
                    detectedlang = detectedlang + 1
                except Exception:
                    possibleLanguage = "\tLanguage detection failed\n"
                # NOTE(review): possibleLanguage is built but never printed
                # in this function - confirm whether it should be reported.
    info = "Found subtitle files " + bold(str(total)) + " of which "
    info += bold(str(incorrect)) + " are incorrectly named!"
    printNotificationInfo(info)