def get_language(samples, is_html=False, hint_language: str = None) -> LocaleInfo:
    """Detect the language of a document from sample text.

    Detection is delegated to ``detect_language`` (a heuristic based on
    Google's CLD2). English is used as the fallback whenever the detector
    reports failure or raises ``CLD2Error``.

    Args:
        samples: The text passed to the detector as ``utf8Bytes``.
        is_html: When true, the detector is told the input is not plain text.
        hint_language: Optional language hint forwarded to the detector.

    Returns:
        A ``LocaleInfo`` for the detected language, or ``LocaleInfo("en")``.
    """
    try:
        # Only the reliability flag and the code of the first reported
        # language are used; every other field of the result is discarded.
        detected, _, ((_, language_code, _, _), *_) = detect_language(
            utf8Bytes=samples,
            isPlainText=not is_html,
            hintLanguage=hint_language,
        )
    except CLD2Error:
        log.error("Failed to recognize document language", exc_info=True)
        detected = False
    if not detected:
        return LocaleInfo("en")
    return LocaleInfo(language_code)
def as_document_info(self) -> DocumentInfo:
    """Build a ``DocumentInfo`` from this object's stored metadata.

    The stored ``metadata`` mapping supplies the keyword arguments. ``uri``
    and ``cover_image`` are taken from this object, the ``language`` entry is
    wrapped in a ``LocaleInfo``, and ``data["database_id"]`` is set to
    ``self.get_id()``.

    Returns:
        The ``DocumentInfo`` constructed from those keyword arguments.

    Raises:
        KeyError: If the stored metadata has no ``"language"`` entry.
    """
    kwargs = self.metadata.copy()
    kwargs["uri"] = self.uri
    kwargs["cover_image"] = self.cover_image
    kwargs["language"] = LocaleInfo(kwargs["language"])
    # ``dict.copy()`` is shallow: an existing ``data`` dict is still shared
    # with ``self.metadata``. Build a fresh dict so adding ``database_id``
    # never leaks back into the stored metadata.
    data = dict(kwargs.get("data") or {})
    data["database_id"] = self.get_id()
    kwargs["data"] = data
    return DocumentInfo(**kwargs)
def get_recognition_languages(cls) -> t.List[LocaleInfo]:
    """Return the locales of the languages reported by ``pytesseract``.

    Entries that ``LocaleInfo.from_three_letter_code`` rejects with
    ``ValueError`` are skipped.
    """
    recognized = []
    for code in pytesseract.get_languages():
        try:
            locale = LocaleInfo.from_three_letter_code(code)
        except ValueError:
            # Not a code LocaleInfo can resolve; leave it out of the result.
            continue
        recognized.append(locale)
    return recognized
def language(self):
    """Return the language declared in the EPUB metadata, if usable.

    Returns:
        A ``LocaleInfo`` built from the ``"language"`` metadata entry, or
        ``None`` when the entry is missing or cannot be parsed (the failure
        is logged with its traceback).
    """
    epub_lang = self.epub_metadata.get("language")
    if epub_lang is None:
        return None
    try:
        return LocaleInfo(epub_lang)
    except Exception:
        # ``log.exception`` already attaches the active traceback. The
        # f-prefix is required so the offending value appears in the message.
        log.exception(f"Failed to parse epub language `{epub_lang}`")
        return None
def get_voices(self):
    """Return a ``VoiceInfo`` for each voice reported by ``self.synth.GetVoices()``.

    The underlying voice object is kept in each entry's ``data`` under the
    ``"voice_obj"`` key.
    """
    return [
        VoiceInfo(
            id=voice.Id,
            name=voice.Name,
            desc=voice.Description,
            language=LocaleInfo(voice.Language),
            data={"voice_obj": voice},
        )
        for voice in self.synth.GetVoices()
    ]
def populate_list(self, set_focus=True):
    """Fill the Tesseract language list and update the button states.

    In offline mode the list shows the installed recognition languages,
    sorted by their English name. Otherwise it shows each entry of
    ``self.online_languages`` that is not already installed, without
    duplicates.

    Args:
        set_focus: When true, the first item of the list receives focus;
            otherwise the list is filled without taking focus.
    """
    installed_infos = sorted(
        TesseractOcrEngine.get_recognition_languages(),
        key=lambda info: info.english_name,
    )
    installed_languages = [(True, info) for info in installed_infos]
    if self.is_offline:
        languages = installed_languages
    else:
        languages = []
        # Seeded with the installed locales so they are never offered again.
        seen_locales = {info for _, info in installed_languages}
        for code in self.online_languages:
            loc_info = LocaleInfo.from_three_letter_code(code)
            if loc_info not in seen_locales:
                languages.append((False, loc_info))
                seen_locales.add(loc_info)
    # Each row is an ``(is_installed, locale_info)`` pair.
    columns = [
        ColumnDefn(
            # Translators: the title of a column in the Tesseract language list
            _("Language"),
            "left",
            450,
            lambda entry: entry[1].description,
        ),
        ColumnDefn(
            # Translators: the title of a column in the Tesseract language list
            _("Installed"),
            "center",
            100,
            lambda entry: _("Yes") if entry[0] else _("No"),
        ),
    ]
    self.tesseractLanguageList.set_columns(columns)
    focus_options = {"focus_item": 0} if set_focus else {"set_focus": False}
    self.tesseractLanguageList.set_objects(languages, **focus_options)
    # Maintain the state of the list: nothing below applies when it has rows.
    if languages:
        return
    if self.is_offline:
        self.removeButton.Enable(False)
    else:
        self.addBestButton.Enable(False)
        self.addFastButton.Enable(False)
    # NOTE(review): the button panel is disabled in both modes when the list
    # is empty -- confirm this matches the intended nesting.
    self.btnPanel.Enable(False)
def get_voices(self, language=None):
    """Return a ``VoiceInfo`` for every enabled installed voice.

    Voices that are disabled, or that expose no culture information, are
    skipped; the latter are reported in the log.

    Args:
        language: Accepted for interface compatibility; not used here.

    Returns:
        A list of ``VoiceInfo`` objects (empty when no voice qualifies).
    """
    rv = []
    for voice in self.synth.GetInstalledVoices():
        if not voice.Enabled:
            continue
        info = voice.VoiceInfo
        voice_culture = info.Culture
        if voice_culture is None:
            # No exception is active at this point, so ``log.exception``
            # would append a meaningless ``NoneType: None`` traceback.
            log.error(
                f"Failed to obtain culture information for voice {info.Name}"
            )
            continue
        rv.append(
            VoiceInfo(
                id=info.Id,
                name=info.Name,
                desc=info.Description,
                language=LocaleInfo(voice_culture.IetfLanguageTag),
                gender=info.Gender,
                age=info.Age,
            )
        )
    # The collected voices were previously built and then discarded.
    return rv
def language(self):
    """Return the presentation's declared language as a ``LocaleInfo``.

    Returns ``None`` when the core properties declare no language, or when
    ``LocaleInfo`` rejects the declared value with ``ValueError``.
    """
    declared = self.presentation.core_properties.language
    if not declared:
        return None
    try:
        return LocaleInfo(declared)
    except ValueError:
        return None
def get_recognition_languages(cls) -> t.List[LocaleInfo]:
    """Return a ``LocaleInfo`` for each language ``Win10DocrEngine`` supports."""
    locales = []
    for tag in Win10DocrEngine.get_supported_languages():
        locales.append(LocaleInfo(tag))
    return locales
def get_user_locale():
    """Return a ``LocaleInfo`` for the current UI culture's IETF language tag."""
    ui_language_tag = CultureInfo.CurrentUICulture.IetfLanguageTag
    return LocaleInfo(ui_language_tag)
def get_user_locale():
    """Return a ``LocaleInfo`` built from Babel's default locale."""
    default_locale = Locale.default()
    return LocaleInfo.from_babel_locale(default_locale)
author=author.removeprefix("By ").strip(), description=desc, publication_year=publish_date, publisher=info.get("publisher", ""), ) @cached_property def language(self): if (epub_lang := self.epub_metadata.get("language")) is not None: try: return LocaleInfo(epub_lang) except: log.exception( "Failed to parse epub language `{epub_lang}`", exc_info=True ) return self.get_language(self.html_content, is_html=True) or LocaleInfo("en") def get_content(self): return self.structure.get_text() def get_document_semantic_structure(self): return self.structure.semantic_elements def get_document_style_info(self): return self.structure.styled_elements def resolve_link(self, link_range) -> LinkTarget: href = urllib_parse.unquote(self.structure.link_targets[link_range]) if is_external_url(href): return LinkTarget(url=href, is_external=True) else: