コード例 #1
0
def test_lang_details(query, expected):
    """Check `get_language_details` against the expected result for `query`.

    When `expected` is None the query is treated as unresolvable: the
    failsafe call must return None and the strict call must raise `NotFound`.
    """
    if expected is not None:
        assert get_language_details(query) == expected
        return

    # unresolvable query: failsafe mode returns None, strict mode raises
    assert get_language_details(query, failsafe=True) == expected
    with pytest.raises(NotFound):
        get_language_details(query)
コード例 #2
0
    def update_zim_metadata(self):
        """Derive the ZIM language, title and description for this run.

        Language (`self.zim_lang`):
            - "eng" when no language was requested, or when none of the
              requested languages resolved to a source language,
            - "mul" when several source languages are selected,
            - otherwise the ISO-639-3 code of the single source language
              ("eng" if its details cannot be found).

        Title and description are only filled in when not already set: from
        the playlist when one is used, otherwise from the topic(s).
        """
        if not self.languages or not self.source_languages:
            # guard: indexing an empty `source_languages` below would raise
            # IndexError when every requested language failed to resolve
            self.zim_lang = "eng"
        elif len(self.source_languages) > 1:
            self.zim_lang = "mul"
        else:
            lang_info = get_language_details(
                self.source_languages[0], failsafe=True
            )
            self.zim_lang = lang_info["iso-639-3"] if lang_info else "eng"

        if self.playlist:
            # playlist attributes are only read when a default is needed
            if not self.title:
                self.title = self.playlist_title.strip()
            if not self.description:
                self.description = self.playlist_description.strip()
            return

        if len(self.topics) > 1:
            default_title = "TED Collection"
            default_description = "A selection of TED videos from several topics"
        else:
            # NOTE(review): assumes at least one topic when no playlist is
            # set; an empty `topics` raises IndexError here - confirm callers
            topic_str = self.topics[0].replace("+", " ")
            default_title = f"{topic_str.capitalize()} from TED"
            default_description = f"A selection of {topic_str} videos from TED"

        if not self.title:
            self.title = default_title
        if not self.description:
            self.description = default_description
コード例 #3
0
ファイル: constants.py プロジェクト: openzim/sotoki
def lang_for_domain(domain):
    """Return the (ISO-639-1, ISO-639-3) pair for a StackExchange-style domain.

    The sub-domain of `<sub>.stackexchange.com` / `<sub>.stackoverflow.com`
    is looked up as a language code. ("en", "eng") is returned when the
    domain does not match, the sub-domain is in the exclusion list, the
    lookup raises `NotFound`, or either ISO code is missing.
    """
    fallback = ("en", "eng")

    found = re.match(
        r"^(?P<lang>[a-z]+)\.(stackexchange|stackoverflow)\.com$", domain
    )
    if found is None:
        return fallback

    so_code = found.group("lang")
    # sub-domains never treated as languages (presumably site abbreviations)
    excluded = (
        "meta",
        "diy",
        "sqa",
        "tor",
        "dba",
        "tex",
        "law",
        "ham",
        "gis",
        "ell",
        "or",
        "vi",
    )
    if so_code in excluded:
        return fallback

    try:
        lang = get_language_details(so_code)
        if not (lang["iso-639-1"] and lang["iso-639-3"]):
            raise NotFound("Might be an abbreviation")
        return lang["iso-639-1"], lang["iso-639-3"]
    except NotFound:
        return fallback
コード例 #4
0
    def get_display_name(self, lang_code, lang_name):
        """Build the label shown for a language.

        English ("en") and codes whose details cannot be found keep the bare
        `lang_name`; any other language is prefixed with its native name.
        """
        details = get_language_details(lang_code, failsafe=True)
        if lang_code == "en" or not details:
            return lang_name

        native_name = details["native"]
        return native_name + " - " + lang_name
コード例 #5
0
        def get_subtitles(video_id):
            """Return language details for each `.vtt` file of a video.

            The language query is the second dot-separated part of the file
            stem, for every regular file ending in `.vtt` inside the video's
            folder under `self.videos_dir`.
            """
            codes = []
            for entry in self.videos_dir.joinpath(video_id).iterdir():
                if entry.is_file() and entry.name.endswith(".vtt"):
                    codes.append(entry.stem.split(".")[1])

            # resolve only once every file name has been parsed
            return list(map(get_language_details, codes))
コード例 #6
0
 def to_jinja_subtitle(lang):
     """Describe a subtitle language as a `code`/`name` dict for templates.

     `lang` is translated through `YOUTUBE_LANG_MAP` (falling back to `lang`
     itself) before its details are looked up.
     """
     details = get_language_details(YOUTUBE_LANG_MAP.get(lang, lang))
     # Youtube.com uses `English - code` format.
     # Note: videojs displays it lowercased anyway
     label = f"{details['english'].title()} - {details['query']}"
     return {"code": lang, "name": label}
コード例 #7
0
 def to_jinja_subtitle(lang):
     """Describe a subtitle language as a `code`/`name` dict for templates.

     `lang` is translated through `YOUTUBE_LANG_MAP` (falling back to `lang`
     itself) before its details are looked up. A failed lookup is logged and
     the exception is re-raised unchanged.
     """
     try:
         details = get_language_details(YOUTUBE_LANG_MAP.get(lang, lang))
     except Exception:
         logger.error(f"Failed to get language details for {lang}")
         raise

     # Youtube.com uses `English - code` format.
     # Note: videojs displays it lowercased anyway
     label = f"{details['english'].title()} - {details['query']}"
     return {"code": lang, "name": label}
コード例 #8
0
    def to_ted_langcodes(self, languages):
        """Convert language queries into TED language codes.

        Each query is resolved with `get_language_details(..., failsafe=True)`
        and skipped when it cannot be resolved. A locale query whose
        hyphenated form is listed in `TEDLANGS["locales"]` is kept as that
        locale; every other resolved query is handed to
        `append_part1_or_part3`, which adds the matching code(s).

        Args:
            languages: iterable of language queries. A single string is
                treated as one query instead of being iterated character by
                character.

        Returns:
            List of unique codes in first-seen order.

        Examples (illustrative, carried over from the previous docstring):
            ["English", "fr", "hin"] => ["en", "fr", "hi"]
            ["chi", "fake"] => ["zh", "zh-cn", "zh-tw"]
        """
        if isinstance(languages, str):
            languages = [languages]

        lang_code_list = []
        for lang in languages:
            lang_info = get_language_details(lang, failsafe=True)
            if not lang_info:
                continue

            if lang_info["querytype"] == "locale":
                query = lang_info["query"].replace("_", "-")
                if query in TEDLANGS["locales"]:
                    lang_code_list.append(query)
                    continue

            # pure codes, other query types and locales TED does not list
            self.append_part1_or_part3(lang_code_list, lang_info)

        # dict.fromkeys dedupes while keeping a deterministic order
        return list(dict.fromkeys(lang_code_list))
コード例 #9
0
    def __init__(
        self,
        topics,
        debug,
        name,
        video_format,
        low_quality,
        output_dir,
        no_zim,
        fname,
        languages,
        locale_name,
        title,
        description,
        creator,
        publisher,
        tags,
        keep_build_dir,
        autoplay,
        use_any_optimized_version,
        s3_url_with_credentials,
        playlist,
        subtitles_enough,
        subtitles_setting,
        tmp_dir,
        threads,
    ):
        """Store the scraper options and prepare directories and locale.

        Most arguments are stored unchanged on the instance. The ones that
        are transformed:

        Args:
            topics: comma-separated topics; each is stripped and its spaces
                replaced by "+". Falsy means no topics.
            languages: comma-separated language queries, or None.
            tags: comma-separated tags, or None.
            output_dir: output folder; expanded and resolved.
            tmp_dir: optional parent folder (created if missing) in which
                the temporary build directory is made.
            subtitles_setting: one of `ALL`, `MATCHING`, `NONE`, or a
                comma-separated list of language queries, which is converted
                with `to_ted_langcodes`.
            locale_name: language query used to pick the translation locale.

        Raises:
            Whatever `get_language_details(locale_name)` raises for an
            unknown locale query (the call is not failsafe).
        """

        # video-encoding info
        self.video_format = video_format
        self.low_quality = low_quality

        # zim params
        self.fname = fname
        self.languages = (
            []
            if languages is None
            else [lang.strip() for lang in languages.split(",")]
        )
        self.tags = [] if tags is None else [t.strip() for t in tags.split(",")]
        self.title = title
        self.description = description
        self.creator = creator
        self.publisher = publisher
        self.name = name

        # directory setup
        self.output_dir = pathlib.Path(output_dir).expanduser().resolve()
        if tmp_dir:
            pathlib.Path(tmp_dir).mkdir(parents=True, exist_ok=True)
        self.build_dir = pathlib.Path(tempfile.mkdtemp(dir=tmp_dir))

        # scraper options
        self.topics = (
            []
            if not topics
            else [c.strip().replace(" ", "+") for c in topics.split(",")]
        )
        self.autoplay = autoplay
        self.playlist = playlist
        self.subtitles_enough = subtitles_enough
        # keyword settings are kept verbatim; anything else is a language list
        self.subtitles_setting = (
            subtitles_setting
            if subtitles_setting in (ALL, MATCHING, NONE)
            else self.to_ted_langcodes(
                [lang.strip() for lang in subtitles_setting.split(",")]
            )
        )
        self.threads = threads
        self.yt_downloader = None

        # optimization cache
        self.s3_url_with_credentials = s3_url_with_credentials
        self.use_any_optimized_version = use_any_optimized_version
        self.s3_storage = None
        self.video_quality = "low" if self.low_quality else "high"

        # debug/developer options
        self.no_zim = no_zim
        self.keep_build_dir = keep_build_dir
        self.debug = debug

        # class members
        self.videos = []
        self.playlist_title = None
        self.playlist_description = None
        self.source_languages = (
            [] if not self.languages else self.to_ted_langcodes(self.languages)
        )
        self.zim_lang = None
        self.already_visited = []

        # set and record locale for translations
        locale_details = get_language_details(locale_name)
        if locale_details["querytype"] != "locale":
            locale_name = locale_details["iso-639-1"]
        try:
            self.locale = setlocale(ROOT_DIR, locale_name)
        except locale.Error:
            logger.error(
                f"No locale for {locale_name}. Use --locale to specify it. "
                "defaulting to en_US"
            )
            self.locale = setlocale(ROOT_DIR, "en")
        # locale's language code(s): the name is wrapped in a list so that it
        # is converted as ONE query; passing the bare string would make
        # `to_ted_langcodes` iterate over its individual characters
        self.locale_name = self.to_ted_langcodes([locale_name])
コード例 #10
0
    def __init__(
        self,
        archive,
        collection,
        nb_items_per_page,
        show_description,
        output_dir,
        no_zim,
        fname,
        debug,
        keep_build_dir,
        skip_download,
        language,
        locale_name,
        tags,
        name=None,
        title=None,
        description=None,
        creator=None,
        publisher=None,
        favicon=None,
        main_logo=None,
        secondary_logo=None,
        main_color=None,
        secondary_color=None,
    ):
        """Record the scraper options, ZIM metadata and translation locale.

        `collection` is passed through `handle_user_provided_file` (with
        `nocopy=True`), `tags` is split on commas, `output_dir` is expanded
        and resolved, and the build directory is `<output_dir>/build`. When
        `locale_name` is falsy, the ISO-639-1 code of `language` is used; if
        no such locale can be set, the "en" locale is used instead.
        """
        # --- options & zim params ---
        self.archive = archive
        self.collection = handle_user_provided_file(
            source=collection, nocopy=True
        )
        self.nb_items_per_page = nb_items_per_page
        self.show_author = True
        self.show_description = show_description
        self.fname = fname
        self.language = language
        self.tags = [tag.strip() for tag in tags.split(",")]
        self.title = title
        self.description = description
        self.creator = creator
        self.publisher = publisher
        self.name = name
        self.favicon = favicon
        self.main_logo = main_logo
        self.secondary_logo = secondary_logo
        self.main_color = main_color
        self.secondary_color = secondary_color

        # --- process-related ---
        expanded_output = Path(output_dir).expanduser()
        self.output_dir = expanded_output.resolve()
        # year-month stamp of the moment the scraper is instantiated
        self.period = f"{datetime.datetime.now():%Y-%m}"

        # --- debug/devel options ---
        self.no_zim = no_zim
        self.debug = debug
        self.keep_build_dir = keep_build_dir
        self.skip_download = skip_download

        self.build_dir = self.output_dir / "build"

        # --- ZIM-related info (receives the raw, unsplit `tags`) ---
        zim_params = {
            "language": language,
            "tags": tags,
            "title": title,
            "description": description,
            "creator": creator,
            "publisher": publisher,
            "name": name,
            "scraper": SCRAPER,
        }
        self.zim_info = ZimInfo(**zim_params)

        # --- locale for translations ---
        if not locale_name:
            locale_name = get_language_details(self.language)["iso-639-1"]
        try:
            self.locale = setlocale(ROOT_DIR, locale_name)
        except locale.Error:
            logger.error(
                f"No locale for {locale_name}. Use --locale to specify it. "
                "defaulting to en_US"
            )
            self.locale = setlocale(ROOT_DIR, "en")
コード例 #11
0
    def __init__(
        self,
        collection_type,
        youtube_id,
        api_key,
        video_format,
        low_quality,
        nb_videos_per_page,
        all_subtitles,
        autoplay,
        output_dir,
        no_zim,
        fname,
        debug,
        keep_build_dir,
        skip_download,
        max_concurrency,
        youtube_store,
        language,
        locale_name,
        tags,
        title=None,
        description=None,
        creator=None,
        publisher=None,
        name=None,
        profile_image=None,
        banner_image=None,
        main_color=None,
        secondary_color=None,
        only_test_branding=None,
    ):
        """Record the scraper options, ZIM metadata and translation locale.

        `tags` is split on commas, `output_dir` is expanded and resolved, and
        the build directory is `<output_dir>/build`. `youtube_store` is
        updated with the build directory, API key and cache directory. When
        `locale_name` is falsy, the ISO-639-1 code of `language` is used; if
        no such locale can be set, the "en" locale is used instead.
        """
        # --- data-retrieval info ---
        self.collection_type = collection_type
        self.youtube_id = youtube_id
        self.api_key = api_key

        # --- video-encoding info ---
        self.video_format = video_format
        self.low_quality = low_quality

        # --- options & zim params ---
        self.nb_videos_per_page = nb_videos_per_page
        self.all_subtitles = all_subtitles
        self.autoplay = autoplay
        self.fname = fname
        self.language = language
        self.tags = [tag.strip() for tag in tags.split(",")]
        self.title = title
        self.description = description
        self.creator = creator
        self.publisher = publisher
        self.name = name
        self.profile_image = profile_image
        self.banner_image = banner_image
        self.main_color = main_color
        self.secondary_color = secondary_color

        # --- process-related ---
        expanded_output = Path(output_dir).expanduser()
        self.output_dir = expanded_output.resolve()
        self.playlists = []
        self.uploads_playlist_id = None
        self.videos_ids = []
        self.main_channel_id = None  # use for branding
        self.only_test_branding = only_test_branding

        # --- debug/devel options ---
        self.no_zim = no_zim
        self.debug = debug
        self.keep_build_dir = keep_build_dir
        self.skip_download = skip_download
        self.max_concurrency = max_concurrency

        self.build_dir = self.output_dir / "build"

        # --- ZIM-related info (receives the raw, unsplit `tags`) ---
        zim_params = {
            "language": language,
            "tags": tags,
            "title": title,
            "description": description,
            "creator": creator,
            "publisher": publisher,
            "name": name,
            "scraper": SCRAPER,
            "favicon": "favicon.jpg",
        }
        self.zim_info = ZimInfo(**zim_params)

        # --- youtube credentials store ---
        # NOTE(review): `cache_dir` is not assigned in this constructor;
        # presumably provided by the class (e.g. a property) - confirm
        youtube_store.update(
            build_dir=self.build_dir,
            api_key=self.api_key,
            cache_dir=self.cache_dir,
        )

        # --- locale for translations ---
        if not locale_name:
            locale_name = get_language_details(self.language)["iso-639-1"]
        try:
            self.locale = setlocale(ROOT_DIR, locale_name)
        except locale.Error:
            logger.error(
                f"No locale for {locale_name}. Use --locale to specify it. "
                "defaulting to en_US"
            )
            self.locale = setlocale(ROOT_DIR, "en")
コード例 #12
0
    def run(self):
        """Build the ZIM file from the configured WARC inputs.

        Steps: resolve the language and set up translations, prepare the
        template environment and head/CSS inserts, start the ZIM creator,
        add the replayer and static templates, add an item for every WARC
        record, add headers-only items for revisits not yet indexed, add the
        illustration if a favicon URL is known, then finish the creator.

        Returns:
            100 when there are no inputs to process, otherwise None.
        """
        if not self.inputs:
            logger.info(
                "Arguments valid, no inputs to process. Exiting with error code 100"
            )
            return 100

        self.find_main_page_metadata()

        # make sure Language metadata is ISO-639-3
        try:
            lang_data = get_language_details(self.language)
            self.language = lang_data["iso-639-3"]
        except Exception:
            logger.error(
                f"Invalid language setting `{self.language}`. Using `eng`.")
            # actually apply the announced fallback: the invalid value would
            # otherwise be passed to the Creator below, since `or "eng"`
            # only covers an empty language
            self.language = "eng"
        else:
            # setup translations; kept separate so that a locale failure
            # does not discard a correctly resolved language
            try:
                setlocale(
                    pathlib.Path(__file__).parent, lang_data.get("iso-639-1"))
            except Exception:
                logger.error(
                    f"Unable to set locale for language `{self.language}`. "
                    "Translations may be missing.")

        self.env = self.init_env()

        # init head insert
        template = self.env.get_template(HEAD_INSERT_FILE)
        self.head_insert = ("<head>" + template.render()).encode("utf-8")
        if self.custom_css:
            self.css_insert = (
                f'\n<link type="text/css" href="{CUSTOM_CSS_URL}" '
                'rel="Stylesheet" />\n</head>').encode("utf-8")
        else:
            self.css_insert = None

        self.creator = Creator(
            self.full_filename,
            main_path="A/index.html",
            language=self.language or "eng",
            title=self.title,
            date=datetime.date.today(),
            **self.metadata,
        ).start()

        self.add_replayer()

        for filename in pkg_resources.resource_listdir("warc2zim",
                                                       "templates"):
            # these two templates are not added as static articles
            if filename in (HEAD_INSERT_FILE, SW_JS):
                continue

            self.creator.add_item(
                StaticArticle(self.env, filename, self.main_url))

        for record in self.iter_all_warc_records():
            self.add_items_for_warc_record(record)

        # process revisits, headers only
        for url, record in self.revisits.items():
            if url not in self.indexed_urls:
                target = record.rec_headers["WARC-Refers-To-Target-URI"]
                logger.debug(f"Adding revisit {url} -> {target}")
                self.creator.add_item(WARCHeadersItem(record))
                self.indexed_urls.add(url)

        if self.favicon_url:
            self.add_illustration()

        logger.debug(f"Found {self.total_records} records in WARCs")

        self.creator.finish()
コード例 #13
0
    def __init__(
        self,
        collection_type,
        youtube_id,
        api_key,
        video_format,
        low_quality,
        nb_videos_per_page,
        all_subtitles,
        autoplay,
        output_dir,
        no_zim,
        fname,
        debug,
        tmp_dir,
        keep_build_dir,
        max_concurrency,
        youtube_store,
        language,
        locale_name,
        tags,
        dateafter,
        use_any_optimized_version,
        s3_url_with_credentials,
        title=None,
        description=None,
        creator=None,
        publisher=None,
        name=None,
        profile_image=None,
        banner_image=None,
        main_color=None,
        secondary_color=None,
    ):
        """Record the scraper options, build directory and translation locale.

        `tags` is split on commas and `output_dir` is expanded and resolved.
        The build directory is a fresh temporary directory, created inside
        `tmp_dir` when one is given (that folder is created if missing).
        `youtube_store` is updated with the build directory, API key and
        cache directory. When `locale_name` is falsy, the ISO-639-1 code of
        `language` is used; if no such locale can be set, the "en" locale is
        used instead.
        """
        # --- data-retrieval info ---
        self.collection_type = collection_type
        self.youtube_id = youtube_id
        self.api_key = api_key
        self.dateafter = dateafter

        # --- video-encoding info ---
        self.video_format = video_format
        self.low_quality = low_quality

        # --- options & zim params ---
        self.nb_videos_per_page = nb_videos_per_page
        self.all_subtitles = all_subtitles
        self.autoplay = autoplay
        self.fname = fname
        self.language = language
        self.tags = [tag.strip() for tag in tags.split(",")]
        self.title = title
        self.description = description
        self.creator = creator
        self.publisher = publisher
        self.name = name
        self.profile_image = profile_image
        self.banner_image = banner_image
        self.main_color = main_color
        self.secondary_color = secondary_color

        # --- directory setup ---
        expanded_output = Path(output_dir).expanduser()
        self.output_dir = expanded_output.resolve()
        if tmp_dir:
            tmp_dir = Path(tmp_dir).expanduser().resolve()
            tmp_dir.mkdir(parents=True, exist_ok=True)
        build_path = tempfile.mkdtemp(dir=tmp_dir)
        self.build_dir = Path(build_path)

        # --- process-related ---
        self.playlists = []
        self.uploads_playlist_id = None
        self.videos_ids = []
        self.main_channel_id = None  # use for branding

        # --- debug/devel options ---
        self.no_zim = no_zim
        self.debug = debug
        self.keep_build_dir = keep_build_dir
        self.max_concurrency = max_concurrency

        # --- youtube credentials store ---
        # NOTE(review): `cache_dir` is not assigned in this constructor;
        # presumably provided by the class (e.g. a property) - confirm
        youtube_store.update(
            build_dir=self.build_dir,
            api_key=self.api_key,
            cache_dir=self.cache_dir,
        )

        # --- optimization cache ---
        self.s3_url_with_credentials = s3_url_with_credentials
        self.use_any_optimized_version = use_any_optimized_version
        if self.low_quality:
            self.video_quality = "low"
        else:
            self.video_quality = "high"
        self.s3_storage = None

        # --- locale for translations ---
        if not locale_name:
            locale_name = get_language_details(self.language)["iso-639-1"]
        try:
            self.locale = setlocale(ROOT_DIR, locale_name)
        except locale.Error:
            logger.error(
                f"No locale for {locale_name}. Use --locale to specify it. "
                "defaulting to en_US"
            )
            self.locale = setlocale(ROOT_DIR, "en")
コード例 #14
0
def test_lang_details(iso_639_3, expected):
    """Check that the details returned for `iso_639_3` equal `expected`."""
    details = get_language_details(iso_639_3)
    assert details == expected