Example #1
0
 def get_language(samples, is_html=False, hint_language: str = None) -> LocaleInfo:
     """Return the language of this document.

     By default we use a heuristic based on Google's CLD2.

     Args:
         samples: The content handed to ``detect_language`` as ``utf8Bytes``.
         is_html: When true, the detector is told the input is not plain text.
         hint_language: Optional hint forwarded to the detector as ``hintLanguage``.

     Returns:
         A ``LocaleInfo`` for the first detected language when the detector
         reports success; ``LocaleInfo("en")`` when it reports failure or
         raises ``CLD2Error``.
     """
     try:
         # Only the success flag and the language code of the first
         # detection entry are needed; everything else is discarded.
         (success, _, ((_, lang, _, _), *_)) = detect_language(
             utf8Bytes=samples, isPlainText=not is_html, hintLanguage=hint_language
         )
     except CLD2Error:
         # log.exception records the active traceback, so no exc_info is needed.
         log.exception("Failed to recognize document language")
         return LocaleInfo("en")
     return LocaleInfo(lang) if success else LocaleInfo("en")
Example #2
0
 def as_document_info(self) -> DocumentInfo:
     """Build a ``DocumentInfo`` from a shallow copy of ``self.metadata``.

     The copy is extended with this object's ``uri`` and ``cover_image``,
     its ``language`` entry is wrapped in a ``LocaleInfo``, and the result of
     ``self.get_id()`` is stored under ``data["database_id"]``.
     """
     info_fields = self.metadata.copy()
     info_fields.update(uri=self.uri, cover_image=self.cover_image)
     info_fields["language"] = LocaleInfo(info_fields["language"])
     # Create the "data" mapping on demand, then record the database id in it.
     extra_data = info_fields.setdefault("data", {})
     extra_data.update(database_id=self.get_id())
     return DocumentInfo(**info_fields)
Example #3
0
 def get_recognition_languages(cls) -> t.List[LocaleInfo]:
     """Return a ``LocaleInfo`` for each language reported by pytesseract.

     Codes that ``LocaleInfo.from_three_letter_code`` rejects with
     ``ValueError`` are left out of the result.
     """
     recognized = []
     for code in pytesseract.get_languages():
         try:
             locale = LocaleInfo.from_three_letter_code(code)
         except ValueError:
             # Not convertible to a locale; leave it out.
             continue
         recognized.append(locale)
     return recognized
Example #4
0
 def language(self):
     """Return the ``LocaleInfo`` for the epub's declared language, if any.

     Returns:
         ``LocaleInfo(epub_lang)`` when ``self.epub_metadata`` has a
         ``"language"`` entry that can be parsed; otherwise ``None``.
     """
     epub_lang = self.epub_metadata.get("language")
     if epub_lang is None:
         return None
     try:
         return LocaleInfo(epub_lang)
     except Exception:
         # Best effort: a malformed language tag must not break the document.
         # The f-prefix is required so the offending value appears in the log;
         # log.exception already attaches the active traceback.
         log.exception(f"Failed to parse epub language `{epub_lang}`")
         return None
Example #5
0
 def get_voices(self):
     """Return a ``VoiceInfo`` for every voice from ``self.synth.GetVoices()``.

     Each entry carries the voice's id, name, description and language (as a
     ``LocaleInfo``), and keeps the voice object itself under
     ``data["voice_obj"]``.
     """
     return [
         VoiceInfo(
             id=voice.Id,
             name=voice.Name,
             desc=voice.Description,
             language=LocaleInfo(voice.Language),
             data={"voice_obj": voice},
         )
         for voice in self.synth.GetVoices()
     ]
Example #6
0
 def populate_list(self, set_focus=True):
     """Fill the Tesseract language list and update the related buttons.

     Rows are ``(installed, locale_info)`` tuples. In offline mode the list
     shows the installed languages sorted by English name; otherwise it shows
     the entries of ``self.online_languages`` that are not already installed,
     without duplicates. When there are no rows, the buttons that act on the
     list and the button panel are disabled.

     Args:
         set_focus: When true, the list is populated with ``focus_item=0``;
             otherwise it is populated with ``set_focus=False``.
     """
     installed_infos = sorted(
         TesseractOcrEngine.get_recognition_languages(),
         key=lambda info: info.english_name,
     )
     installed_rows = [(True, info) for info in installed_infos]
     if self.is_offline:
         rows = installed_rows
     else:
         rows = []
         # Track every locale already accounted for so that installed
         # languages and repeated online entries are listed only once.
         seen_infos = set(installed_infos)
         for code in self.online_languages:
             candidate = LocaleInfo.from_three_letter_code(code)
             if candidate not in seen_infos:
                 rows.append((False, candidate))
                 seen_infos.add(candidate)

     language_column = ColumnDefn(
         # Translators: the title of a column in the Tesseract language list
         _("Language"),
         "left",
         450,
         lambda row: row[1].description,
     )
     installed_column = ColumnDefn(
         # Translators: the title of a column in the Tesseract language list
         _("Installed"),
         "center",
         100,
         lambda row: _("Yes") if row[0] else _("No"),
     )
     self.tesseractLanguageList.set_columns([language_column, installed_column])

     focus_options = {"focus_item": 0} if set_focus else {"set_focus": False}
     self.tesseractLanguageList.set_objects(rows, **focus_options)

     # Maintain the state of the list
     if rows:
         return
     if self.is_offline:
         self.removeButton.Enable(False)
     else:
         self.addBestButton.Enable(False)
         self.addFastButton.Enable(False)
     self.btnPanel.Enable(False)
Example #7
0
 def get_voices(self, language=None):
     """Collect a ``VoiceInfo`` for each enabled installed voice.

     Voices that are disabled, or whose ``VoiceInfo.Culture`` is ``None``,
     are skipped; the latter are reported in the log.

     Args:
         language: Not used by the code visible here.
     """
     # NOTE(review): the visible code neither returns ``rv`` nor uses
     # ``language`` -- confirm against the complete source.
     rv = []
     for voice in self.synth.GetInstalledVoices():
         if not voice.Enabled:
             continue
         info = voice.VoiceInfo
         voice_culture = info.Culture
         if voice_culture is None:
             # No exception is being handled here, so log.error is used:
             # log.exception would append a meaningless "NoneType: None"
             # traceback to the record.
             log.error(
                 f"Failed to obtain culture information for voice {info.Name}"
             )
             continue
         voice_language = LocaleInfo(voice_culture.IetfLanguageTag)
         rv.append(
             VoiceInfo(
                 id=info.Id,
                 name=info.Name,
                 desc=info.Description,
                 language=voice_language,
                 gender=info.Gender,
                 age=info.Age,
             ))
Example #8
0
 def language(self):
     """Return a ``LocaleInfo`` for the presentation's language property.

     Returns ``None`` when the property is empty or when ``LocaleInfo``
     rejects it with ``ValueError``.
     """
     lang = self.presentation.core_properties.language
     if not lang:
         return None
     try:
         return LocaleInfo(lang)
     except ValueError:
         # Unparseable language value: report no language.
         return None
Example #9
0
 def get_recognition_languages(cls) -> t.List[LocaleInfo]:
     """Return a ``LocaleInfo`` for each language ``Win10DocrEngine`` supports."""
     supported = Win10DocrEngine.get_supported_languages()
     return list(map(LocaleInfo, supported))
Example #10
0
def get_user_locale():
    """Return a ``LocaleInfo`` built from the current UI culture's IETF language tag."""
    ui_language_tag = CultureInfo.CurrentUICulture.IetfLanguageTag
    return LocaleInfo(ui_language_tag)
Example #11
0
def get_user_locale():
    """Return a ``LocaleInfo`` built from the default babel ``Locale``."""
    default_locale = Locale.default()
    return LocaleInfo.from_babel_locale(default_locale)
Example #12
0
            author=author.removeprefix("By ").strip(),
            description=desc,
            publication_year=publish_date,
            publisher=info.get("publisher", ""),
        )

    @cached_property
    def language(self):
        """Return the document language as a ``LocaleInfo``.

        The ``"language"`` entry of ``self.epub_metadata`` is tried first. If
        it is missing or cannot be parsed, the result of
        ``self.get_language`` on the HTML content is used, and
        ``LocaleInfo("en")`` when that result is falsy.
        """
        epub_lang = self.epub_metadata.get("language")
        if epub_lang is not None:
            try:
                return LocaleInfo(epub_lang)
            except Exception:
                # Best effort: fall through to content-based detection.
                # The f-prefix is required so the offending value appears in
                # the log; log.exception already attaches the traceback.
                log.exception(f"Failed to parse epub language `{epub_lang}`")
        return self.get_language(self.html_content, is_html=True) or LocaleInfo("en")

    def get_content(self):
        """Return the text obtained from ``self.structure.get_text()``."""
        text = self.structure.get_text()
        return text

    def get_document_semantic_structure(self):
        """Return ``self.structure.semantic_elements`` unchanged."""
        semantic_elements = self.structure.semantic_elements
        return semantic_elements

    def get_document_style_info(self):
        """Return ``self.structure.styled_elements`` unchanged."""
        styled_elements = self.structure.styled_elements
        return styled_elements

    def resolve_link(self, link_range) -> LinkTarget:
        href = urllib_parse.unquote(self.structure.link_targets[link_range])
        if is_external_url(href):
            return LinkTarget(url=href, is_external=True)
        else: