def test_lang_details(query, expected):
    """Check `get_language_details` against an expected result for `query`.

    When `expected` is None the query is treated as unknown: the failsafe
    lookup must compare equal to `expected`, and the strict lookup must raise
    `NotFound`. Otherwise the strict lookup must equal `expected`.
    """
    if expected is not None:
        assert get_language_details(query) == expected
        return

    # unknown language: failsafe mode returns the sentinel, strict mode raises
    assert get_language_details(query, failsafe=True) == expected
    with pytest.raises(NotFound):
        get_language_details(query)
def update_zim_metadata(self):
    """Fill in the ZIM language, title and description that were not provided.

    Language (`self.zim_lang`):
      - "eng" when no language was requested, or when none of the requested
        languages resolved to a source language (previously this second case
        raised IndexError on `self.source_languages[0]`);
      - "mul" when there are several source languages;
      - otherwise the ISO-639-3 code of the single source language, falling
        back to "eng" if its details cannot be found.

    Title/description are only set when currently falsy: from the playlist
    title/description when `self.playlist` is set, otherwise from the topics.
    """
    if not self.languages or not self.source_languages:
        self.zim_lang = "eng"
    elif len(self.source_languages) > 1:
        self.zim_lang = "mul"
    else:
        lang_info = get_language_details(self.source_languages[0], failsafe=True)
        self.zim_lang = lang_info["iso-639-3"] if lang_info else "eng"

    if self.playlist:
        # playlist values are only read (and stripped) when actually needed
        if not self.title:
            self.title = self.playlist_title.strip()
        if not self.description:
            self.description = self.playlist_description.strip()
        return

    if len(self.topics) > 1:
        default_title = "TED Collection"
        default_description = "A selection of TED videos from several topics"
    else:
        # topics are stored with "+" in place of spaces
        topic_str = self.topics[0].replace("+", " ")
        default_title = f"{topic_str.capitalize()} from TED"
        default_description = f"A selection of {topic_str} videos from TED"

    if not self.title:
        self.title = default_title
    if not self.description:
        self.description = default_description
def lang_for_domain(domain):
    """Return the (iso-639-1, iso-639-3) pair guessed from a site domain.

    Only `<prefix>.stackexchange.com` / `<prefix>.stackoverflow.com` domains
    are inspected. The prefix is looked up as a language unless it is one of
    the prefixes listed below, which are excluded from the lookup. Anything
    that cannot be resolved to both codes yields ("en", "eng").
    """
    fallback = ("en", "eng")

    found = re.match(
        r"^(?P<lang>[a-z]+)\.(stackexchange|stackoverflow)\.com$", domain
    )
    if not found:
        return fallback

    so_code = found.group("lang")
    # prefixes that are never looked up as a language
    excluded_prefixes = {
        "meta",
        "diy",
        "sqa",
        "tor",
        "dba",
        "tex",
        "law",
        "ham",
        "gis",
        "ell",
        "or",
        "vi",
    }
    if so_code in excluded_prefixes:
        return fallback

    try:
        details = get_language_details(so_code)
    except NotFound:
        return fallback

    # a match lacking either code might be an abbreviation, not a language
    if not details["iso-639-1"] or not details["iso-639-3"]:
        return fallback
    return details["iso-639-1"], details["iso-639-3"]
def get_display_name(self, lang_code, lang_name):
    """Return the label to display for a language.

    For "en", or when no details are found for `lang_code`, this is just
    `lang_name`; otherwise it is "<native name> - <lang_name>".
    """
    details = get_language_details(lang_code, failsafe=True)
    if lang_code == "en" or not details:
        return lang_name
    return details["native"] + " - " + lang_name
def get_subtitles(video_id):
    """Return language details for every `.vtt` file of a video.

    Looks into `self.videos_dir/<video_id>` (`self` comes from the enclosing
    scope) and takes the second dot-separated component of each file stem as
    the language to look up.
    """
    video_dir = self.videos_dir.joinpath(video_id)

    # collect every language code first, then resolve them all
    codes = []
    for entry in video_dir.iterdir():
        if entry.is_file() and entry.name.endswith(".vtt"):
            codes.append(entry.stem.split(".")[1])

    return list(map(get_language_details, codes))
def to_jinja_subtitle(lang):
    """Build the template entry (`code` and `name`) for a subtitle language.

    `lang` is first translated through `YOUTUBE_LANG_MAP` (left unchanged
    when it has no entry) before its details are looked up.
    """
    lookup_code = YOUTUBE_LANG_MAP.get(lang, lang)
    details = get_language_details(lookup_code)

    # Youtube.com uses `English - code` format.
    # Note: videojs displays it lowercased anyway
    display_name = f"{details['english'].title()} - {details['query']}"
    return {"code": lang, "name": display_name}
def to_jinja_subtitle(lang):
    """Build the template entry (`code` and `name`) for a subtitle language.

    `lang` is first translated through `YOUTUBE_LANG_MAP` (left unchanged
    when it has no entry) before its details are looked up. A failed lookup
    is logged and the exception re-raised.
    """
    try:
        lookup_code = YOUTUBE_LANG_MAP.get(lang, lang)
        details = get_language_details(lookup_code)
    except Exception:
        logger.error(f"Failed to get language details for {lang}")
        raise

    # Youtube.com uses `English - code` format.
    # Note: videojs displays it lowercased anyway
    display_name = f"{details['english'].title()} - {details['query']}"
    return {"code": lang, "name": display_name}
def to_ted_langcodes(self, languages):
    """Converts languages queries into TED language codes

    Queries with no language details are ignored. A locale query whose
    dash-separated form is listed in TEDLANGS["locales"] is kept as that
    locale; every other query is delegated to `append_part1_or_part3`.
    Duplicates are removed, so the order of the result is not guaranteed.

    Examples:
        ["English", "fr", "hin"] => ["en", "fr", "hi"]
        ["chi", "fake"] => ["zh", "zh-cn", "zh-tw"]
    """
    collected = []
    for query in languages:
        details = get_language_details(query, failsafe=True)
        if not details:
            continue

        if details["querytype"] == "locale":
            locale_code = details["query"].replace("_", "-")
            if locale_code in TEDLANGS["locales"]:
                collected.append(locale_code)
                continue

        # pure codes, language names and locales unknown to TED
        self.append_part1_or_part3(collected, details)

    return list(set(collected))
def __init__(
    self,
    topics,
    debug,
    name,
    video_format,
    low_quality,
    output_dir,
    no_zim,
    fname,
    languages,
    locale_name,
    title,
    description,
    creator,
    publisher,
    tags,
    keep_build_dir,
    autoplay,
    use_any_optimized_version,
    s3_url_with_credentials,
    playlist,
    subtitles_enough,
    subtitles_setting,
    tmp_dir,
    threads,
):
    """Store the scraper options and prepare directories and locale.

    Notable conversions (all other arguments are stored unchanged):
      - `languages` and `tags`: comma-separated strings (or None) split into
        stripped lists;
      - `topics`: comma-separated string (or falsy) split into a list, with
        spaces inside a topic replaced by "+";
      - `subtitles_setting`: kept as-is when it is ALL, MATCHING or NONE,
        otherwise parsed as comma-separated language queries and converted
        to TED language codes;
      - `tmp_dir`: created if set, and used as parent of a fresh temporary
        build directory;
      - `locale_name`: reduced to its ISO-639-1 code unless it is a locale
        query; falls back to "en" when the locale cannot be set.
    """
    # video-encoding info
    self.video_format = video_format
    self.low_quality = low_quality

    # zim params
    self.fname = fname
    self.languages = (
        [] if languages is None else [lang.strip() for lang in languages.split(",")]
    )
    self.tags = [] if tags is None else [tag.strip() for tag in tags.split(",")]
    self.title = title
    self.description = description
    self.creator = creator
    self.publisher = publisher
    self.name = name

    # directory setup
    self.output_dir = pathlib.Path(output_dir).expanduser().resolve()
    if tmp_dir:
        pathlib.Path(tmp_dir).mkdir(parents=True, exist_ok=True)
    self.build_dir = pathlib.Path(tempfile.mkdtemp(dir=tmp_dir))

    # scraper options
    self.topics = (
        []
        if not topics
        else [topic.strip().replace(" ", "+") for topic in topics.split(",")]
    )
    self.autoplay = autoplay
    self.playlist = playlist
    self.subtitles_enough = subtitles_enough
    if subtitles_setting in (ALL, MATCHING, NONE):
        self.subtitles_setting = subtitles_setting
    else:
        self.subtitles_setting = self.to_ted_langcodes(
            [lang.strip() for lang in subtitles_setting.split(",")]
        )
    self.threads = threads
    self.yt_downloader = None

    # optimization cache
    self.s3_url_with_credentials = s3_url_with_credentials
    self.use_any_optimized_version = use_any_optimized_version
    self.s3_storage = None
    self.video_quality = "low" if self.low_quality else "high"

    # debug/developer options
    self.no_zim = no_zim
    self.keep_build_dir = keep_build_dir
    self.debug = debug

    # class members
    self.videos = []
    self.playlist_title = None
    self.playlist_description = None
    self.source_languages = (
        [] if not self.languages else self.to_ted_langcodes(self.languages)
    )
    self.zim_lang = None
    self.already_visited = []

    # set and record locale for translations
    locale_details = get_language_details(locale_name)
    if locale_details["querytype"] != "locale":
        locale_name = locale_details["iso-639-1"]

    try:
        self.locale = setlocale(ROOT_DIR, locale_name)
    except locale.Error:
        logger.error(
            f"No locale for {locale_name}. Use --locale to specify it. "
            "defaulting to en_US"
        )
        self.locale = setlocale(ROOT_DIR, "en")

    # locale's language code
    # to_ted_langcodes iterates over its argument: a bare string would be
    # looked up one character at a time, so wrap the single query in a list
    self.locale_name = self.to_ted_langcodes([locale_name])
def __init__(
    self,
    archive,
    collection,
    nb_items_per_page,
    show_description,
    output_dir,
    no_zim,
    fname,
    debug,
    keep_build_dir,
    skip_download,
    language,
    locale_name,
    tags,
    name=None,
    title=None,
    description=None,
    creator=None,
    publisher=None,
    favicon=None,
    main_logo=None,
    secondary_logo=None,
    main_color=None,
    secondary_color=None,
):
    """Store the scraper options, ZIM information and translation locale.

    `collection` goes through `handle_user_provided_file`, `tags` is a
    comma-separated string split into a stripped list, and the build
    directory is `<output_dir>/build`. When `locale_name` is falsy, the
    ISO-639-1 code of `language` is used instead; if that locale cannot be
    set, "en" is used.
    """
    # source of the content
    self.archive = archive
    self.collection = handle_user_provided_file(source=collection, nocopy=True)

    # display options
    self.nb_items_per_page = nb_items_per_page
    self.show_author = True
    self.show_description = show_description

    # zim params
    self.fname = fname
    self.language = language
    self.tags = [tag.strip() for tag in tags.split(",")]
    self.title = title
    self.description = description
    self.creator = creator
    self.publisher = publisher
    self.name = name

    # branding
    self.favicon = favicon
    self.main_logo = main_logo
    self.secondary_logo = secondary_logo
    self.main_color = main_color
    self.secondary_color = secondary_color

    # process-related
    self.output_dir = Path(output_dir).expanduser().resolve()
    self.period = datetime.datetime.now().strftime("%Y-%m")

    # debug/devel options
    self.no_zim = no_zim
    self.debug = debug
    self.keep_build_dir = keep_build_dir
    self.skip_download = skip_download
    self.build_dir = self.output_dir.joinpath("build")

    # store ZIM-related info (receives the raw, unsplit `tags` string)
    self.zim_info = ZimInfo(
        language=language,
        tags=tags,
        title=title,
        description=description,
        creator=creator,
        publisher=publisher,
        name=name,
        scraper=SCRAPER,
    )

    # set and record locale for translations
    if not locale_name:
        locale_name = get_language_details(self.language)["iso-639-1"]
    try:
        self.locale = setlocale(ROOT_DIR, locale_name)
    except locale.Error:
        logger.error(
            f"No locale for {locale_name}. Use --locale to specify it. defaulting to en_US"
        )
        self.locale = setlocale(ROOT_DIR, "en")
def __init__(
    self,
    collection_type,
    youtube_id,
    api_key,
    video_format,
    low_quality,
    nb_videos_per_page,
    all_subtitles,
    autoplay,
    output_dir,
    no_zim,
    fname,
    debug,
    keep_build_dir,
    skip_download,
    max_concurrency,
    youtube_store,
    language,
    locale_name,
    tags,
    title=None,
    description=None,
    creator=None,
    publisher=None,
    name=None,
    profile_image=None,
    banner_image=None,
    main_color=None,
    secondary_color=None,
    only_test_branding=None,
):
    """Store the scraper options, ZIM information and translation locale.

    `tags` is a comma-separated string split into a stripped list and the
    build directory is `<output_dir>/build`. `youtube_store` is updated with
    the build directory, API key and cache directory. When `locale_name` is
    falsy, the ISO-639-1 code of `language` is used instead; if that locale
    cannot be set, "en" is used.
    """
    # data-retrieval info
    self.collection_type = collection_type
    self.youtube_id = youtube_id
    self.api_key = api_key

    # video-encoding info
    self.video_format = video_format
    self.low_quality = low_quality

    # display options
    self.nb_videos_per_page = nb_videos_per_page
    self.all_subtitles = all_subtitles
    self.autoplay = autoplay

    # zim params
    self.fname = fname
    self.language = language
    self.tags = [tag.strip() for tag in tags.split(",")]
    self.title = title
    self.description = description
    self.creator = creator
    self.publisher = publisher
    self.name = name

    # branding
    self.profile_image = profile_image
    self.banner_image = banner_image
    self.main_color = main_color
    self.secondary_color = secondary_color

    # process-related
    self.output_dir = Path(output_dir).expanduser().resolve()
    self.playlists = []
    self.uploads_playlist_id = None
    self.videos_ids = []
    self.main_channel_id = None  # use for branding
    self.only_test_branding = only_test_branding

    # debug/devel options
    self.no_zim = no_zim
    self.debug = debug
    self.keep_build_dir = keep_build_dir
    self.skip_download = skip_download
    self.max_concurrency = max_concurrency
    self.build_dir = self.output_dir.joinpath("build")

    # store ZIM-related info (receives the raw, unsplit `tags` string)
    self.zim_info = ZimInfo(
        language=language,
        tags=tags,
        title=title,
        description=description,
        creator=creator,
        publisher=publisher,
        name=name,
        scraper=SCRAPER,
        favicon="favicon.jpg",
    )

    # update youtube credentials store (must follow the build_dir assignment)
    youtube_store.update(
        build_dir=self.build_dir,
        api_key=self.api_key,
        cache_dir=self.cache_dir,
    )

    # set and record locale for translations
    if not locale_name:
        locale_name = get_language_details(self.language)["iso-639-1"]
    try:
        self.locale = setlocale(ROOT_DIR, locale_name)
    except locale.Error:
        logger.error(
            f"No locale for {locale_name}. Use --locale to specify it. defaulting to en_US"
        )
        self.locale = setlocale(ROOT_DIR, "en")
def run(self):
    """Convert the WARC inputs into a ZIM file.

    Returns 100 without doing anything when there is no input to process;
    otherwise builds the ZIM (replayer, static templates, WARC records,
    revisits, optional illustration) and returns None.

    The language is normalized to ISO-639-3. If it cannot be resolved, it is
    logged and actually replaced by `eng` (the original only logged it and
    kept the invalid value). A failure to set up the translation locale is
    reported separately and does not discard a valid language.
    """
    if not self.inputs:
        logger.info(
            "Arguments valid, no inputs to process. Exiting with error code 100"
        )
        return 100

    self.find_main_page_metadata()

    # make sure Language metadata is ISO-639-3 and setup translations
    try:
        lang_data = get_language_details(self.language)
        self.language = lang_data["iso-639-3"]
    except Exception:
        # best-effort: an unusable language setting must not abort the run
        logger.error(f"Invalid language setting `{self.language}`. Using `eng`.")
        self.language = "eng"
    else:
        try:
            setlocale(pathlib.Path(__file__).parent, lang_data.get("iso-639-1"))
        except Exception:
            # the language itself is valid; only translations are affected
            logger.error(
                f"Unable to set locale for language `{self.language}`. "
                "Translations may be unavailable."
            )

    self.env = self.init_env()

    # init head insert
    template = self.env.get_template(HEAD_INSERT_FILE)
    self.head_insert = ("<head>" + template.render()).encode("utf-8")
    if self.custom_css:
        self.css_insert = (
            f'\n<link type="text/css" href="{CUSTOM_CSS_URL}" '
            'rel="Stylesheet" />\n</head>'
        ).encode("utf-8")
    else:
        self.css_insert = None

    self.creator = Creator(
        self.full_filename,
        main_path="A/index.html",
        # still guards against a resolved language without ISO-639-3 code
        language=self.language or "eng",
        title=self.title,
        date=datetime.date.today(),
        **self.metadata,
    ).start()

    self.add_replayer()

    # every bundled template is added as a static article, except the head
    # insert and the service worker script
    for filename in pkg_resources.resource_listdir("warc2zim", "templates"):
        if filename in (HEAD_INSERT_FILE, SW_JS):
            continue
        self.creator.add_item(StaticArticle(self.env, filename, self.main_url))

    for record in self.iter_all_warc_records():
        self.add_items_for_warc_record(record)

    # process revisits, headers only
    for url, record in self.revisits.items():
        if url not in self.indexed_urls:
            logger.debug(
                "Adding revisit {0} -> {1}".format(
                    url, record.rec_headers["WARC-Refers-To-Target-URI"]
                )
            )
            self.creator.add_item(WARCHeadersItem(record))
            self.indexed_urls.add(url)

    if self.favicon_url:
        self.add_illustration()

    logger.debug(f"Found {self.total_records} records in WARCs")

    self.creator.finish()
def __init__(
    self,
    collection_type,
    youtube_id,
    api_key,
    video_format,
    low_quality,
    nb_videos_per_page,
    all_subtitles,
    autoplay,
    output_dir,
    no_zim,
    fname,
    debug,
    tmp_dir,
    keep_build_dir,
    max_concurrency,
    youtube_store,
    language,
    locale_name,
    tags,
    dateafter,
    use_any_optimized_version,
    s3_url_with_credentials,
    title=None,
    description=None,
    creator=None,
    publisher=None,
    name=None,
    profile_image=None,
    banner_image=None,
    main_color=None,
    secondary_color=None,
):
    """Store the scraper options, prepare directories and translation locale.

    `tags` is a comma-separated string split into a stripped list. When
    `tmp_dir` is set it is resolved and created, and a fresh temporary build
    directory is made inside it. `youtube_store` is updated with the build
    directory, API key and cache directory. When `locale_name` is falsy, the
    ISO-639-1 code of `language` is used instead; if that locale cannot be
    set, "en" is used.
    """
    # data-retrieval info
    self.collection_type = collection_type
    self.youtube_id = youtube_id
    self.api_key = api_key
    self.dateafter = dateafter

    # video-encoding info
    self.video_format = video_format
    self.low_quality = low_quality

    # display options
    self.nb_videos_per_page = nb_videos_per_page
    self.all_subtitles = all_subtitles
    self.autoplay = autoplay

    # zim params
    self.fname = fname
    self.language = language
    self.tags = [tag.strip() for tag in tags.split(",")]
    self.title = title
    self.description = description
    self.creator = creator
    self.publisher = publisher
    self.name = name

    # branding
    self.profile_image = profile_image
    self.banner_image = banner_image
    self.main_color = main_color
    self.secondary_color = secondary_color

    # directory setup
    self.output_dir = Path(output_dir).expanduser().resolve()
    build_parent = tmp_dir
    if build_parent:
        build_parent = Path(build_parent).expanduser().resolve()
        build_parent.mkdir(parents=True, exist_ok=True)
    self.build_dir = Path(tempfile.mkdtemp(dir=build_parent))

    # process-related
    self.playlists = []
    self.uploads_playlist_id = None
    self.videos_ids = []
    self.main_channel_id = None  # use for branding

    # debug/devel options
    self.no_zim = no_zim
    self.debug = debug
    self.keep_build_dir = keep_build_dir
    self.max_concurrency = max_concurrency

    # update youtube credentials store (must follow the build_dir assignment)
    youtube_store.update(
        build_dir=self.build_dir,
        api_key=self.api_key,
        cache_dir=self.cache_dir,
    )

    # Optimization-cache
    self.s3_url_with_credentials = s3_url_with_credentials
    self.use_any_optimized_version = use_any_optimized_version
    self.video_quality = "low" if self.low_quality else "high"
    self.s3_storage = None

    # set and record locale for translations
    if not locale_name:
        locale_name = get_language_details(self.language)["iso-639-1"]
    try:
        self.locale = setlocale(ROOT_DIR, locale_name)
    except locale.Error:
        logger.error(
            f"No locale for {locale_name}. Use --locale to specify it. defaulting to en_US"
        )
        self.locale = setlocale(ROOT_DIR, "en")
def test_lang_details(iso_639_3, expected):
    """Check that the details returned for `iso_639_3` equal `expected`."""
    details = get_language_details(iso_639_3)
    assert details == expected