def reverse(self, name):
    """Turn a guessit language string into babelfish language components.

    Resolution is attempted in this order:

    1. ``name`` is matched (as given, before lowercasing) against the two
       "language with country" patterns held by ``GuessitConverter``; on a
       match the two captured groups are resolved separately.
    2. The lowercased name is looked up in ``self.guessit_exceptions``.
    3. Each babelfish ``Language`` constructor is tried in turn.

    :param name: the language string to resolve
    :return: tuple ``(alpha3, country, script)``; in the with-country case
        the country is its alpha2 code (or None) and the script falls back
        to None, otherwise the values come straight from the resolved
        language or from the exceptions table
    :raise babelfish.LanguageReverseError: when no step recognises ``name``
    """
    country_match = GuessitConverter._with_country_regexp.match(name)
    if not country_match:
        country_match = GuessitConverter._with_country_regexp2.match(name)

    name = u(name.lower())

    if country_match:
        lang = Language.fromguessit(country_match.group(1).strip())
        lang.country = babelfish.Country.fromguessit(
            country_match.group(2).strip())
        alpha3 = lang.alpha3
        if lang.country:
            country_code = lang.country.alpha2
        else:
            country_code = None
        return alpha3, country_code, lang.script or None

    # The exceptions table is consulted before the generic constructors
    # because its entries must win over anything those would match.
    try:
        return self.guessit_exceptions[name]
    except KeyError:
        pass

    constructors = (
        babelfish.Language,
        babelfish.Language.fromalpha3b,
        babelfish.Language.fromalpha2,
        babelfish.Language.fromname,
        babelfish.Language.fromopensubtitles,
    )
    for construct in constructors:
        try:
            found = construct(name)
            return found.alpha3, found.country, found.script
        except (ValueError, babelfish.LanguageReverseError):
            continue

    raise babelfish.LanguageReverseError(name)
def reverse(self, name):
    """Resolve ``name`` to an ``(alpha3, country, script)`` tuple.

    The name is first tested, unmodified, against the two
    "language with country" patterns of ``GuessitConverter``. If neither
    matches, the lowercased name is looked up in ``self.guessit_exceptions``
    and then offered to a fixed sequence of babelfish ``Language``
    constructors.

    :param name: the language string to resolve
    :return: tuple ``(alpha3, country, script)``
    :raise babelfish.LanguageReverseError: if ``name`` cannot be resolved
    """
    matched = GuessitConverter._with_country_regexp.match(name)
    if not matched:
        matched = GuessitConverter._with_country_regexp2.match(name)

    name = u(name.lower())

    if matched:
        # Group 1 carries the language, group 2 the country.
        language = Language.fromguessit(matched.group(1).strip())
        language.country = babelfish.Country.fromguessit(
            matched.group(2).strip())
        return (
            language.alpha3,
            language.country.alpha2 if language.country else None,
            language.script or None,
        )

    # Exceptions are looked up first so that they override a potential
    # match by any of the constructors below.
    try:
        return self.guessit_exceptions[name]
    except KeyError:
        pass

    candidates = (
        babelfish.Language,
        babelfish.Language.fromalpha3b,
        babelfish.Language.fromalpha2,
        babelfish.Language.fromname,
        babelfish.Language.fromopensubtitles,
    )
    for make_language in candidates:
        try:
            result = make_language(name)
            return result.alpha3, result.country, result.script
        except (ValueError, babelfish.LanguageReverseError):
            continue

    raise babelfish.LanguageReverseError(name)
def find_possible_languages(string):
    """Find possible languages in the string

    Every word returned by ``find_words(string)`` is lowercased, stripped of
    any subtitle prefix, subtitle suffix and language prefix, and the
    remainder is handed to ``Language.fromguessit``. A word that carried a
    subtitle prefix or suffix is reported under the ``'subtitleLanguage'``
    property, any other word under ``'language'``.

    :param string: the text to scan
    :return: list of tuple (property, Language, lang_word, word), where
        ``lang_word`` is the stripped, lowercased form of ``word``
    """
    valid_words = []
    for word in find_words(string):
        lang_word = word.lower()
        key = 'language'
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitleLanguage'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                # Remove the trailing suffix. The slice end is computed from
                # the word length so the *head* of the word is kept; slicing
                # with [:len(suffix)] would keep the wrong characters.
                lang_word = lang_word[:len(lang_word) - len(suffix)]
                key = 'subtitleLanguage'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        if lang_word not in LNG_COMMON_WORDS:
            try:
                lang = Language.fromguessit(lang_word)
                # Keep language with alpha2 equivalent. Others are probably
                # uncommon languages.
                if lang == 'mul' or hasattr(lang, 'alpha2'):
                    valid_words.append((key, lang, lang_word, word))
            except babelfish.Error:
                # Not a recognisable language: skip this word.
                pass
    return valid_words
def find_possible_languages(string):
    """Find possible languages in the string

    Each word from ``find_words(string)`` is lowercased and has subtitle
    prefixes, subtitle suffixes and language prefixes removed before being
    passed to ``Language.fromguessit``. Words that had a subtitle prefix or
    suffix are tagged ``'subtitleLanguage'``; all others ``'language'``.

    :param string: the text to scan
    :return: list of tuple (property, Language, lang_word, word), where
        ``lang_word`` is the stripped, lowercased form of ``word``
    """
    valid_words = []
    for word in find_words(string):
        lang_word = word.lower()
        key = 'language'
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitleLanguage'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                # Strip the suffix from the end of the word; the previous
                # [:len(suffix)] slice kept a suffix-sized head instead.
                lang_word = lang_word[:len(lang_word) - len(suffix)]
                key = 'subtitleLanguage'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        if lang_word not in LNG_COMMON_WORDS:
            try:
                lang = Language.fromguessit(lang_word)
                # Keep language with alpha2 equivalent. Others are probably
                # uncommon languages.
                if lang == 'mul' or hasattr(lang, 'alpha2'):
                    valid_words.append((key, lang, lang_word, word))
            except babelfish.Error:
                # The remainder is not a known language: ignore the word.
                pass
    return valid_words
def guess_language(text):  # pragma: no cover
    """Detect the language a body of text is written in.

    Detection is delegated to the optional ``guess_language`` module. When
    that module cannot be imported, two error messages are logged and
    ``UNDETERMINED`` is returned instead.

    :param text: the text whose language should be detected
    :return: the detected language, or ``UNDETERMINED`` when the
        ``guess_language`` module is unavailable
    """
    try:
        from guess_language import guessLanguage as detect
        detected = detect(text)
        return Language.fromguessit(detected)
    except ImportError:
        for message in (
            'Cannot detect the language of the given text body, missing dependency: guess-language',
            'Please install it from PyPI, by doing eg: pip install guess-language',
        ):
            log.error(message)
        return UNDETERMINED
def search_language(string, lang_filter=None):
    """Look for a language pattern in ``string``.

    The first candidate from ``find_possible_languages`` that passes the
    optional filter is returned as a ``Guess`` carrying the language, the
    span of the matching word and a confidence.

    ``lang_filter`` is a list of allowed languages, as in
    lang_filter = [ 'fr', 'eng', 'spanish' ]

    >>> search_language('movie [en].avi')['language']
    <Language [en]>

    >>> search_language('the zen fat cat and the gay mad men got a new fan', lang_filter = ['en', 'fr', 'es'])

    :return: a ``Guess``, or None when no candidate is accepted
    """
    allowed = None
    if lang_filter:
        allowed = set(Language.fromguessit(item) for item in lang_filter)

    # Confidence keyed by the length of the matched language token:
    # 2-letter and 3-letter codes.
    by_code_length = {2: 0.8, 3: 0.9}

    for prop, language, lang, word in find_possible_languages(string):
        start = string.find(word)
        stop = start + len(word)

        if allowed and language not in allowed:
            continue

        confidence = by_code_length.get(len(lang))
        if confidence is None:
            if prop == 'subtitleLanguage':
                # Subtitle prefix found with language
                confidence = 0.6
            else:
                # Full language names are common words too, so they get the
                # low-confidence treatment.
                confidence = 0.3

        return Guess({prop: language}, confidence=confidence,
                     input=string, span=(start, stop))

    return None
def guess_language(text):  # pragma: no cover
    """Guess which language ``text`` is written in.

    Uses the external ``guess_language`` python module. If importing it
    fails, the problem is logged as two error messages and
    ``UNDETERMINED`` is returned.

    :param text: the body of text to analyse
    :return: the guessed language, or ``UNDETERMINED`` when the
        dependency is missing
    """
    missing_dependency_messages = (
        'Cannot detect the language of the given text body, missing dependency: guess-language',
        'Please install it from PyPI, by doing eg: pip install guess-language',
    )
    try:
        from guess_language import guessLanguage
        guessed = guessLanguage(text)
        return Language.fromguessit(guessed)
    except ImportError:
        for line in missing_dependency_messages:
            log.error(line)
        return UNDETERMINED
def search_language(string, lang_filter=None):
    """Search ``string`` for a language and return it as a ``Guess``.

    Candidates come from ``find_possible_languages``; the first one that is
    not rejected by the filter wins. The returned ``Guess`` holds the
    language under its property name, the span of the word it was found in
    and an associated confidence.

    A list of allowed languages can be given through ``lang_filter``, as in
    lang_filter = [ 'fr', 'eng', 'spanish' ]

    >>> search_language('movie [en].avi')['language']
    <Language [en]>

    >>> search_language('the zen fat cat and the gay mad men got a new fan', lang_filter = ['en', 'fr', 'es'])

    :return: a ``Guess``, or None if nothing acceptable was found
    """
    accepted = None
    if lang_filter:
        accepted = set(Language.fromguessit(entry) for entry in lang_filter)

    for prop, language, lang, word in find_possible_languages(string):
        begin = string.find(word)
        finish = begin + len(word)

        if accepted and language not in accepted:
            continue

        # Confidence depends on what kind of token matched: a 2-letter
        # code, a 3-letter code, a subtitle-marked word, or anything else.
        token_length = len(lang)
        if token_length in (2, 3):
            confidence = 0.8 if token_length == 2 else 0.9
        else:
            # Longer tokens (full language names) may be ordinary words, so
            # they are trusted less unless a subtitle marker was present.
            confidence = 0.6 if prop == 'subtitleLanguage' else 0.3

        return Guess({prop: language}, confidence=confidence,
                     input=string, span=(begin, finish))

    return None
def find_possible_languages(string, allowed_languages=None):
    """Find possible languages in the string

    Every word returned by ``find_words(string)`` is lowercased, stripped of
    any subtitle prefix, subtitle suffix and language prefix, and the
    remainder is handed to ``Language.fromguessit``. A word that carried a
    subtitle prefix or suffix is reported under ``'subtitleLanguage'``, any
    other word under ``'language'``.

    :param string: the text to scan
    :param allowed_languages: optional collection of lowercase language
        names / alpha2 / alpha3 codes. When given (and non-empty), only
        languages whose name, alpha2 or alpha3 is in it are kept, and
        ``LNG_COMMON_WORDS_STRICT`` is used as the common-word list instead
        of ``LNG_COMMON_WORDS``.
    :return: list of tuple (property, Language, lang_word, word), where
        ``lang_word`` is the stripped, lowercased form of ``word``
    """
    if allowed_languages:
        common_words = LNG_COMMON_WORDS_STRICT
    else:
        common_words = LNG_COMMON_WORDS

    valid_words = []
    for word in find_words(string):
        lang_word = word.lower()
        key = 'language'
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitleLanguage'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                # Remove the trailing suffix. The slice end is computed from
                # the word length so the *head* of the word is kept; slicing
                # with [:len(suffix)] would keep the wrong characters.
                lang_word = lang_word[:len(lang_word) - len(suffix)]
                key = 'subtitleLanguage'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        # Both the stripped form and the full lowercased word must be
        # absent from the common-word list.
        if lang_word not in common_words and word.lower() not in common_words:
            try:
                lang = Language.fromguessit(lang_word)
                if allowed_languages:
                    if (lang.name.lower() in allowed_languages
                            or lang.alpha2.lower() in allowed_languages
                            or lang.alpha3.lower() in allowed_languages):
                        valid_words.append((key, lang, lang_word, word))
                # Keep language with alpha2 equivalent. Others are probably
                # uncommon languages.
                elif lang == 'mul' or hasattr(lang, 'alpha2'):
                    valid_words.append((key, lang, lang_word, word))
            except babelfish.Error:
                # Not a recognisable language: skip this word.
                pass
    return valid_words
def find_possible_languages(string, allowed_languages=None):
    """Find possible languages in the string

    Each word from ``find_words(string)`` is lowercased and has subtitle
    prefixes, subtitle suffixes and language prefixes removed before being
    passed to ``Language.fromguessit``. Words that had a subtitle prefix or
    suffix are tagged ``'subtitleLanguage'``; all others ``'language'``.

    :param string: the text to scan
    :param allowed_languages: optional collection of lowercase language
        names / alpha2 / alpha3 codes. When given (and non-empty), a
        language is kept only if its name, alpha2 or alpha3 is in it, and
        the common-word list is ``LNG_COMMON_WORDS_STRICT`` rather than
        ``LNG_COMMON_WORDS``.
    :return: list of tuple (property, Language, lang_word, word), where
        ``lang_word`` is the stripped, lowercased form of ``word``
    """
    common_words = (LNG_COMMON_WORDS_STRICT if allowed_languages
                    else LNG_COMMON_WORDS)

    valid_words = []
    for word in find_words(string):
        lang_word = word.lower()
        key = 'language'
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitleLanguage'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                # Strip the suffix from the end of the word; the previous
                # [:len(suffix)] slice kept a suffix-sized head instead.
                lang_word = lang_word[:len(lang_word) - len(suffix)]
                key = 'subtitleLanguage'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        # Reject the word if either its stripped form or its full lowercased
        # form is a known common word.
        if lang_word not in common_words and word.lower() not in common_words:
            try:
                lang = Language.fromguessit(lang_word)
                if allowed_languages:
                    if (lang.name.lower() in allowed_languages
                            or lang.alpha2.lower() in allowed_languages
                            or lang.alpha3.lower() in allowed_languages):
                        valid_words.append((key, lang, lang_word, word))
                # Keep language with alpha2 equivalent. Others are probably
                # uncommon languages.
                elif lang == 'mul' or hasattr(lang, 'alpha2'):
                    valid_words.append((key, lang, lang_word, word))
            except babelfish.Error:
                # The remainder is not a known language: ignore the word.
                pass
    return valid_words