Example #1
0
def do_upload_file(url: str, fpath: pathlib.Path, key: str = None):
    if not fpath.exists():
        raise IOError(f"{fpath} missing.")
    fsize = fpath.stat().st_size
    if not key:
        key = fpath.name

    s3 = KiwixStorage(url)
    dest = f"s3://{s3.url.netloc}/{s3.bucket_name}"

    if s3.has_object(key):
        raise ValueError(f"Key `{key}` already exists at {dest}. Specify another one.")
    print(f"Uploading {fpath.name} ({format_size(fsize)}) to {dest}/{key}")

    progress = CustomProgressBar(fsize).callback if progressbar else True
    s3.upload_file(fpath=fpath, key=key, progress=progress)
Example #2
0
class Ted2Zim:
    def __init__(
        self,
        topics,
        debug,
        name,
        video_format,
        low_quality,
        output_dir,
        no_zim,
        fname,
        languages,
        locale_name,
        title,
        description,
        creator,
        publisher,
        tags,
        keep_build_dir,
        autoplay,
        use_any_optimized_version,
        s3_url_with_credentials,
        playlist,
        subtitles_enough,
        subtitles_setting,
        tmp_dir,
        threads,
    ):

        # video-encoding info
        self.video_format = video_format
        self.low_quality = low_quality

        # zim params
        self.fname = fname
        self.languages = (
            [] if languages is None else [l.strip() for l in languages.split(",")]
        )
        self.tags = [] if tags is None else [t.strip() for t in tags.split(",")]
        self.title = title
        self.description = description
        self.creator = creator
        self.publisher = publisher
        self.name = name

        # directory setup
        self.output_dir = pathlib.Path(output_dir).expanduser().resolve()
        if tmp_dir:
            pathlib.Path(tmp_dir).mkdir(parents=True, exist_ok=True)
        self.build_dir = pathlib.Path(tempfile.mkdtemp(dir=tmp_dir))

        # scraper options
        self.topics = (
            []
            if not topics
            else [c.strip().replace(" ", "+") for c in topics.split(",")]
        )
        self.autoplay = autoplay
        self.playlist = playlist
        self.subtitles_enough = subtitles_enough
        self.subtitles_setting = (
            subtitles_setting
            if subtitles_setting == ALL
            or subtitles_setting == MATCHING
            or subtitles_setting == NONE
            else self.to_ted_langcodes(
                [lang.strip() for lang in subtitles_setting.split(",")]
            )
        )
        self.threads = threads
        self.yt_downloader = None

        # optimization cache
        self.s3_url_with_credentials = s3_url_with_credentials
        self.use_any_optimized_version = use_any_optimized_version
        self.s3_storage = None
        self.video_quality = "low" if self.low_quality else "high"

        # debug/developer options
        self.no_zim = no_zim
        self.keep_build_dir = keep_build_dir
        self.debug = debug

        # class members
        self.videos = []
        self.playlist_title = None
        self.playlist_description = None
        self.source_languages = (
            [] if not self.languages else self.to_ted_langcodes(self.languages)
        )
        self.zim_lang = None
        self.already_visited = []

        # set and record locale for translations
        locale_details = get_language_details(locale_name)
        if locale_details["querytype"] != "locale":
            locale_name = locale_details["iso-639-1"]
        try:
            self.locale = setlocale(ROOT_DIR, locale_name)
        except locale.Error:
            logger.error(
                f"No locale for {locale_name}. Use --locale to specify it. "
                "defaulting to en_US"
            )
            self.locale = setlocale(ROOT_DIR, "en")
        # locale's language code
        self.locale_name = self.to_ted_langcodes(locale_name)

    @property
    def templates_dir(self):
        return ROOT_DIR.joinpath("templates")

    @property
    def videos_dir(self):
        return self.build_dir.joinpath("videos")

    @property
    def ted_videos_json(self):
        return self.build_dir.joinpath("ted_videos.json")

    @property
    def ted_topics_json(self):
        return self.build_dir.joinpath("ted_topics.json")

    @property
    def talks_base_url(self):
        return BASE_URL + "talks"

    @property
    def playlists_base_url(self):
        return BASE_URL + "playlists"

    def append_part1_or_part3(self, lang_code_list, lang_info):
        """Fills missing ISO languages codes for all in list

        lang_code_list: list og lang codes
        lang_info: see zimscraperlib.i18n"""

        # ignore extra language mappings if supplied query was an iso-639-1 code
        if "part1" in lang_info["iso_types"]:
            lang_code_list.append(lang_info["iso-639-1"])

        # supplied query was not iso-639-1
        else:
            if lang_info["iso-639-1"]:
                lang_code_list.append(lang_info["iso-639-1"])
                # check for extra language codes to include
                if lang_info["iso-639-1"] in TEDLANGS["mappings"]:
                    for code in TEDLANGS["mappings"][lang_info["iso-639-1"]]:
                        lang_code_list.append(code)
            elif lang_info["iso-639-3"]:
                lang_code_list.append(lang_info["iso-639-3"])
            else:
                supplied_lang = lang_info["query"]
                logger.error(f"Language {supplied_lang} is not supported by TED")

    def to_ted_langcodes(self, languages):
        """Converts languages queries into TED language codes

        Examples:
            ["English", "fr", "hin"] => ["en", "fr", "hi"]
            ["chi", "fake"] => ["zh", "zh-cn", "zh-tw"]
        """

        lang_code_list = []
        for lang in languages:
            lang_info = get_language_details(lang, failsafe=True)
            if lang_info:
                if lang_info["querytype"] == "purecode":
                    self.append_part1_or_part3(lang_code_list, lang_info)
                elif lang_info["querytype"] == "locale":
                    query = lang_info["query"].replace("_", "-")
                    if query in TEDLANGS["locales"]:
                        lang_code_list.append(query)
                    else:
                        self.append_part1_or_part3(lang_code_list, lang_info)
                else:
                    self.append_part1_or_part3(lang_code_list, lang_info)
        return list(set(lang_code_list))

    def extract_videos_from_playlist(self, playlist):
        """extracts metadata for all videos in the given playlist

        calls extract_video_info on all links to get this data
        """

        playlist_url = f"{self.playlists_base_url}/{playlist}"
        logger.debug(f"extract_videos_from_playlist: {playlist_url}")
        soup = BeautifulSoup(download_link(playlist_url).text, features="html.parser")
        video_elements = soup.find_all("a", attrs={"class": "group"})
        self.playlist_title = soup.find("h1").string
        self.playlist_description = soup.find("p", attrs={"class": "text-base"}).string

        for element in video_elements:
            relative_path = element.get("href")
            url = urllib.parse.urljoin(self.talks_base_url, relative_path)
            if self.extract_info_from_video_page(url):
                if self.source_languages and len(self.source_languages) > 1:
                    other_lang_urls = self.generate_urls_for_other_languages(url)
                    logger.debug(
                        f"Searching info for the video in other {len(other_lang_urls)} language(s)"
                    )
                    for lang_url in other_lang_urls:
                        self.extract_info_from_video_page(lang_url)
                    self.already_visited.append(urllib.parse.urlparse(url)[2])
            logger.debug(f"Seen {relative_path}")
        logger.debug(f"Total videos found on playlist: {len(video_elements)}")
        if not video_elements:
            raise ValueError("Wrong playlist ID supplied. No videos found")

    def generate_search_result_and_scrape(self, topic_url, total_videos_scraped):
        """generates a search result and returns the total number of videos scraped"""

        page = 1
        while True:
            logger.debug(f"generate_search_result_and_scrape: {topic_url}&page={page}")
            html = download_link(f"{topic_url}&page={page}").text
            nb_videos_extracted, nb_videos_on_page = self.extract_videos_on_topic_page(
                html
            )
            if nb_videos_on_page == 0:
                break
            total_videos_scraped += nb_videos_extracted
            page += 1
        return total_videos_scraped

    def extract_videos_from_topics(self, topic):
        """extracts metadata for required number of videos on different topics"""

        logger.debug(f"Fetching video links for topic: {topic}")
        topic_url = f"{self.talks_base_url}?topics%5B%5D={topic}"
        total_videos_scraped = 0

        if self.source_languages:
            for lang in self.source_languages:
                topic_url = topic_url + f"&language={lang}"
                total_videos_scraped = self.generate_search_result_and_scrape(
                    topic_url, total_videos_scraped
                )

        else:
            total_videos_scraped = self.generate_search_result_and_scrape(
                topic_url, total_videos_scraped
            )

        logger.info(f"Total video links found in {topic}: {total_videos_scraped}")
        if total_videos_scraped == 0:
            return False
        return True

    def update_zim_metadata(self):

        if not self.languages:
            self.zim_lang = "eng"
        else:
            if len(self.source_languages) > 1:
                self.zim_lang = "mul"
            else:
                lang_info = get_language_details(
                    self.source_languages[0], failsafe=True
                )
                if lang_info:
                    self.zim_lang = lang_info["iso-639-3"]
                else:
                    self.zim_lang = "eng"

        if self.playlist:
            if not self.title:
                self.title = self.playlist_title.strip()
            if not self.description:
                self.description = self.playlist_description.strip()
        else:
            if len(self.topics) > 1:
                if not self.title:
                    self.title = "TED Collection"
                if not self.description:
                    self.description = "A selection of TED videos from several topics"
            else:
                topic_str = self.topics[0].replace("+", " ")
                if not self.title:
                    self.title = f"{topic_str.capitalize()} from TED"
                if not self.description:
                    self.description = f"A selection of {topic_str} videos from TED"

    def get_display_name(self, lang_code, lang_name):
        """Display name for language"""

        lang_info = get_language_details(lang_code, failsafe=True)
        if lang_code != "en" and lang_info:

            return lang_info["native"] + " - " + lang_name
        return lang_name

    def get_subtitle_dict(self, lang):
        """dict of language name and code from a larger dict lang

        Example:
        {
            'languageCode': 'en',
            'languageName': 'English'
        }
        """

        return {
            "languageName": self.get_display_name(
                lang["languageCode"], lang["languageName"]
            ),
            "languageCode": lang["languageCode"],
        }

    def generate_subtitle_list(self, video_id, langs, page_lang, audio_lang):
        """List of all subtitle languages with link to their pages"""

        subtitles = []
        if self.subtitles_setting == ALL or (
            not self.source_languages and self.topics and self.subtitles_setting != NONE
        ):
            subtitles = [self.get_subtitle_dict(lang) for lang in langs]
        elif self.subtitles_setting == MATCHING or (
            self.subtitles_enough
            and self.subtitles_setting == NONE
            and page_lang != audio_lang
        ):
            subtitles = [
                self.get_subtitle_dict(lang)
                for lang in langs
                if lang["languageCode"] == page_lang
            ]
        elif self.subtitles_setting and self.subtitles_setting != NONE:
            if not self.subtitles_enough and self.topics:
                subtitles = [
                    self.get_subtitle_dict(lang)
                    for lang in langs
                    if lang["languageCode"] in self.subtitles_setting
                ]
            else:
                subtitles = [
                    self.get_subtitle_dict(lang)
                    for lang in langs
                    if lang["languageCode"] in self.subtitles_setting
                    or lang["languageCode"] in self.source_languages
                ]

        return update_subtitles_list(video_id, subtitles)

    def generate_urls_for_other_languages(self, url):
        """Possible URLs for other requested languages based on a video url"""

        urls = []
        page_lang, query = self.get_lang_code_from_url(url, with_full_query=True)
        url_parts = list(urllib.parse.urlparse(url))

        # update the language query field value with other languages and form URLs
        for language in self.source_languages:
            if language != page_lang:
                query.update({"language": language})
                url_parts[4] = urllib.parse.urlencode(query)
                urls.append(urllib.parse.urlunparse(url_parts))
        return urls

    def extract_videos_on_topic_page(self, page_html):

        # all videos are embedded in a <div> with the class name 'row'.
        # we are searching for the div inside this div, that has an <a>-tag
        # with the class name 'media__image', because this is the relative
        # link to the representative TED talk. It turns this relative link to
        # an absolute link and calls extract_video_info for them
        soup = BeautifulSoup(page_html, features="html.parser")
        video_links = soup.select("div.row div.media__image a")
        nb_extracted = 0
        nb_listed = len(video_links)
        logger.debug(f"{nb_listed} video(s) found on current page")
        for video_link in video_links:
            url = urllib.parse.urljoin(self.talks_base_url, video_link["href"])
            if self.extract_info_from_video_page(url):
                nb_extracted += 1
                if self.source_languages and len(self.source_languages) > 1:
                    other_lang_urls = self.generate_urls_for_other_languages(url)
                    logger.debug(
                        f"Searching info for video in other {len(other_lang_urls)} language(s)"
                    )
                    for lang_url in other_lang_urls:
                        self.extract_info_from_video_page(lang_url)
                    self.already_visited.append(urllib.parse.urlparse(url)[2])
            logger.debug(f"Seen {video_link['href']}")
        return nb_extracted, nb_listed

    def get_lang_code_from_url(self, url, with_full_query=False):
        """gets the queried language code from a ted talk url"""

        # sample - https://www.ted.com/talks/alex_rosenthal_the_gauntlet_think_like_a_coder_ep_8?language=ja
        url_parts = list(urllib.parse.urlparse(url))

        # explode url to extract `language` query field value
        query = dict(urllib.parse.parse_qsl(url_parts[4]))
        current_lang = query.get("language")
        if with_full_query:
            return current_lang, query
        return current_lang

    def extract_download_link(self, talk_data):
        """Returns download link / youtube video ID for a TED video"""

        if (
            isinstance(talk_data.get("resources", {}).get("h264"), list)
            and len(talk_data["resources"]["h264"])
            and talk_data["resources"]["h264"][0].get("file")
        ):
            logger.debug(
                "Using h264 resource link for bitrate="
                f"{talk_data['resources']['h264'][0].get('bitrate')}"
            )
            return talk_data["resources"]["h264"][0]["file"]

        logger.error("No download link found for the video")
        return None

    def update_videos_list(
        self,
        video_id,
        lang_code,
        lang_name,
        title,
        description,
        speaker,
        speaker_profession,
        speaker_bio,
        speaker_picture,
        date,
        thumbnail,
        video_link,
        length,
        subtitles,
    ):
        # append to self.videos and return if not present
        if not [video for video in self.videos if video.get("id", None) == video_id]:
            self.videos.append(
                {
                    "id": video_id,
                    "languages": [
                        {
                            "languageCode": lang_code,
                            "languageName": self.get_display_name(lang_code, lang_name),
                        }
                    ],
                    "title": [{"lang": lang_code, "text": title}],
                    "description": [{"lang": lang_code, "text": description}],
                    "speaker": speaker,
                    "speaker_profession": speaker_profession,
                    "speaker_bio": speaker_bio,
                    "speaker_picture": speaker_picture,
                    "date": date,
                    "thumbnail": thumbnail,
                    "video_link": video_link,
                    "length": length,
                    "subtitles": subtitles,
                }
            )
            logger.debug(f"Successfully inserted video {video_id} into video list")
            return True

        # update localized meta for video if already in self.videos
        # based on --subtitles=matching
        logger.debug(f"Video {video_id} already present in video list")
        for index, video in enumerate(self.videos):
            if video.get("id", None) == video_id:
                if {"lang": lang_code, "text": title} not in video["title"]:
                    self.videos[index]["title"].append(
                        {"lang": lang_code, "text": title}
                    )
                    self.videos[index]["description"].append(
                        {"lang": lang_code, "text": description}
                    )
                    self.videos[index]["languages"].append(
                        {
                            "languageCode": lang_code,
                            "languageName": self.get_display_name(lang_code, lang_name),
                        }
                    )
                if self.subtitles_setting == MATCHING or self.subtitles_setting == NONE:
                    self.videos[index]["subtitles"] += subtitles
        return False

    def extract_video_info_from_json(self, json_data):
        player_data = json.loads(json_data["playerData"])
        lang_code = json_data["language"]
        lang_name = [
            lang["languageName"]
            for lang in player_data["languages"]
            if lang["languageCode"] == lang_code
        ][-1]
        # talk_info = json_data["talks"][0]
        native_talk_language = player_data["nativeLanguage"]
        if (
            not self.subtitles_enough
            and self.source_languages
            and native_talk_language != lang_code
            and self.topics
        ):
            return False

        # Extract the speaker of the TED talk
        if len(json_data["speakers"]):
            speaker_info = json_data["speakers"][0]
            speaker = " ".join(
                [
                    speaker_info.get("firstName"),
                    speaker_info.get("middleName"),
                    speaker_info.get("lastName"),
                ]
            )
        else:
            speaker_info = {
                "description": "None",
                "whotheyare": "None",
                "photo_url": "",
            }
            if "presenterDisplayName" in json_data:
                speaker = json_data["presenterDisplayName"]
            else:
                speaker = "None"

        # Extract the ted talk details from json
        video_id = json_data["id"]
        speaker_profession = speaker_info["description"]
        speaker_bio = speaker_info.get("whoTheyAre", "-")
        speaker_picture = speaker_info.get("avatar", "-")
        title = json_data["title"]
        description = json_data["description"]
        date = dateutil.parser.parse(json_data["recordedOn"]).strftime("%d %B %Y")
        length = int(json_data["duration"]) // 60
        thumbnail = player_data["thumb"]
        video_link = self.extract_download_link(player_data)
        if not video_link:
            logger.error("No suitable download link found. Skipping video")
            return False

        langs = player_data["languages"]
        subtitles = self.generate_subtitle_list(
            video_id, langs, lang_code, native_talk_language
        )
        return self.update_videos_list(
            video_id=video_id,
            lang_code=lang_code,
            lang_name=lang_name,
            title=title,
            description=description,
            speaker=speaker,
            speaker_profession=speaker_profession,
            speaker_bio=speaker_bio,
            speaker_picture=speaker_picture,
            date=date,
            thumbnail=thumbnail,
            video_link=video_link,
            length=length,
            subtitles=subtitles,
        )

    def extract_info_from_video_page(self, url, retry_count=0):
        """extract all info from a TED video page url and update self.videos"""

        # Every TED video page has a <script>-tag with a Javascript
        # object with JSON in it. We will just stip away the object
        # signature and load the json to extract meta-data out of it.
        # returns True if successfully scraped new video

        # don't scrape if URL already visited
        if urllib.parse.urlparse(url)[2] in self.already_visited:
            return False

        # don't scrape if maximum retry count is reached
        if retry_count > 5:
            logger.error("Max retries exceeded. Skipping video")
            return False

        logger.debug(f"extract_info_from_video_page: {url}")
        soup = BeautifulSoup(download_link(url).text, features="html.parser")

        json_data = json.loads(
            soup.find("script", attrs={"id": "__NEXT_DATA__"}).string
        )["props"]["pageProps"]["videoData"]

        requested_lang_code = self.get_lang_code_from_url(url)
        if requested_lang_code and json_data["language"] != requested_lang_code:
            logger.error(
                f"Video has not yet been translated into {requested_lang_code}"
            )
            return False
        return self.extract_video_info_from_json(json_data)

    def add_default_language(self):
        """add metatada in default language (english or first avail) on all videos"""

        for video in self.videos:
            en_found = False
            for index, lang in enumerate(video["languages"]):
                if lang["languageCode"] == "en":
                    en_found = True
                    video["title"] = [
                        {"lang": "default", "text": video["title"][index]["text"]}
                    ] + video["title"]
                    video["description"] = [
                        {"lang": "default", "text": video["description"][index]["text"]}
                    ] + video["description"]

            if not en_found:
                video["title"] = [
                    {"lang": "default", "text": video["title"][0]["text"]}
                ] + video["title"]
                video["description"] = [
                    {"lang": "default", "text": video["description"][0]["text"]}
                ] + video["description"]

            # update video slug
            video["slug"] = slugify(video["title"][0]["text"], separator="-")

    def render_video_pages(self):

        # Render static html pages from the scraped video data and
        # save the pages in build_dir/<video-id>/index.html
        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(str(self.templates_dir)), autoescape=True
        )
        for video in self.videos:
            titles = video["title"]
            html = env.get_template("article.html").render(
                speaker=video["speaker"],
                languages=video["subtitles"],
                speaker_bio=video["speaker_bio"].replace("Full bio", ""),
                speaker_img=video["speaker_picture"],
                date=video["date"],
                profession=video["speaker_profession"],
                video_format=self.video_format,
                autoplay=self.autoplay,
                video_id=str(video["id"]),
                title=get_main_title(titles, self.locale_name),
                titles=titles,
                descriptions=video["description"],
                back_to_list=_("Back to the list"),
            )
            html_path = self.build_dir.joinpath(video["slug"])
            with open(html_path, "w", encoding="utf-8") as html_page:
                html_page.write(html)

    def render_home_page(self):

        # Render the homepage
        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(str(self.templates_dir)), autoescape=True
        )
        all_langs = {
            language["languageCode"]: language["languageName"]
            for video in self.videos
            for language in video["subtitles"] + video["languages"]
        }
        languages = [
            {"languageName": value, "languageCode": key}
            for key, value in all_langs.items()
        ]
        languages = sorted(languages, key=lambda x: x["languageName"])
        html = env.get_template("home.html").render(
            languages=languages,
            page_title=_("TED Talks"),
            language_filter_text=_("Filter by language"),
            back_to_top=_("Back to the top"),
            pagination_text=_("Page"),
        )
        home_page_path = self.build_dir.joinpath("index")
        with open(home_page_path, "w", encoding="utf-8") as html_page:
            html_page.write(html)

    def copy_files_to_build_directory(self):

        # Copy files from template_dir to build_dir
        assets_dir = self.templates_dir.joinpath("assets")
        if assets_dir.exists():
            shutil.copytree(
                assets_dir, self.build_dir.joinpath("assets"), dirs_exist_ok=True
            )
        shutil.copy(
            self.templates_dir.joinpath("favicon.png"),
            self.build_dir.joinpath("favicon.png"),
        )

    def generate_datafile(self):
        """Generate data.js inside assets folder"""

        video_list = []
        for video in self.videos:
            lang_codes = [lang["languageCode"] for lang in video["subtitles"]] + [
                lang["languageCode"] for lang in video["languages"]
            ]
            json_data = {
                "languages": [lang_code for lang_code in set(lang_codes)],
                "id": video["id"],
                "description": video["description"],
                "title": video["title"],
                "speaker": video["speaker"],
                "slug": video["slug"],
            }
            video_list.append(json_data)
        assets_path = self.build_dir.joinpath("assets")
        if not assets_path.exists():
            assets_path.mkdir(parents=True)

        with open(assets_path.joinpath("data.js"), "w") as data_file:
            data_file.write("json_data = " + json.dumps(video_list, indent=4))

    def download_jpeg_image_and_convert(
        self, url, fpath, preset_options={}, resize=None
    ):
        """downloads a JPEG image and converts and optimizes it into desired format detected from fpath"""

        org_jpeg_path = pathlib.Path(
            tempfile.NamedTemporaryFile(delete=False, suffix=".jpg").name
        )
        save_large_file(url, org_jpeg_path)
        if resize is not None:
            resize_image(
                org_jpeg_path,
                width=resize[0],
                height=resize[1],
                method="cover",
            )
        optimize_image(
            org_jpeg_path, fpath, convert=True, delete_src=True, **preset_options
        )
        logger.debug(f"Converted {org_jpeg_path} to {fpath} and optimized ")

    def download_speaker_image(
        self, video_id, video_title, video_speaker, speaker_path
    ):
        """downloads the speaker image"""

        downloaded_from_cache = False
        preset = WebpMedium()
        if self.s3_storage:
            s3_key = f"speaker_image/{video_id}"
            downloaded_from_cache = self.download_from_cache(
                s3_key, speaker_path, preset.VERSION
            )
        if not downloaded_from_cache:
            try:
                # download an image of the speaker
                if not video_speaker:
                    logger.debug("Speaker doesn't have an image")
                else:
                    logger.debug(f"Downloading Speaker image for {video_title}")
                    self.download_jpeg_image_and_convert(
                        video_speaker, speaker_path, preset_options=preset.options
                    )
            except Exception:
                logger.error(f"Could not download speaker image for {video_title}")
            else:
                if self.s3_storage and video_speaker:
                    self.upload_to_cache(s3_key, speaker_path, preset.VERSION)

    def download_thumbnail(
        self, video_id, video_title, video_thumbnail, thumbnail_path
    ):
        """download the thumbnail"""

        downloaded_from_cache = False
        preset = WebpMedium()
        if self.s3_storage:
            s3_key = f"thumbnail/{video_id}"
            downloaded_from_cache = self.download_from_cache(
                s3_key, thumbnail_path, preset.VERSION
            )
        if not downloaded_from_cache:
            try:
                # download the thumbnail of the video
                logger.debug(f"Downloading thumbnail for {video_title}")
                self.download_jpeg_image_and_convert(
                    video_thumbnail,
                    thumbnail_path,
                    preset_options=preset.options,
                    resize=(248, 187),
                )
            except Exception:
                logger.error(f"Could not download thumbnail for {video_title}")
            else:
                if self.s3_storage:
                    self.upload_to_cache(s3_key, thumbnail_path, preset.VERSION)

    def download_video_files(self, video):
        """download all video files (video, thumbnail, speaker)"""

        # Download all the TED talk videos and the meta-data for it.
        # Save the videos in build_dir/{video id}/video.mp4.
        # Save the thumbnail for the video in build_dir/{video id}/thumbnail.jpg.
        # Save the image of the speaker in build_dir/{video id}/speaker.jpg.

        # set up variables
        video_id = str(video["id"])
        # Take the english version of title or else whatever language it's available in
        video_title = video["title"][0]["text"]
        video_link = video["video_link"]
        video_speaker = video["speaker_picture"]
        video_thumbnail = video["thumbnail"]
        video_dir = self.videos_dir.joinpath(video_id)
        org_video_file_path = video_dir.joinpath("video.mp4")
        req_video_file_path = video_dir.joinpath(f"video.{self.video_format}")
        speaker_path = video_dir.joinpath("speaker.webp")
        thumbnail_path = video_dir.joinpath("thumbnail.webp")

        # ensure that video directory exists
        if not video_dir.exists():
            video_dir.mkdir(parents=True)

        # set preset
        preset = {"mp4": VideoMp4Low}.get(self.video_format, VideoWebmLow)()

        # download video
        downloaded_from_cache = False
        logger.debug(f"Downloading {video_title}")
        if self.s3_storage:
            s3_key = f"{self.video_format}/{self.video_quality}/{video_id}"
            downloaded_from_cache = self.download_from_cache(
                s3_key, req_video_file_path, preset.VERSION
            )
        if not downloaded_from_cache:
            try:
                if "https://" not in video_link:
                    options = (
                        BestWebm if self.video_format == "webm" else BestMp4
                    ).get_options(
                        target_dir=video_dir, filepath=pathlib.Path("video.%(ext)s")
                    )
                    self.yt_downloader.download(video_link, options)
                else:
                    save_large_file(video_link, org_video_file_path)
            except Exception:
                logger.error(f"Could not download {org_video_file_path}")

        # download speaker and thumbnail images
        self.download_speaker_image(video_id, video_title, video_speaker, speaker_path)
        self.download_thumbnail(video_id, video_title, video_thumbnail, thumbnail_path)

        # recompress if necessary
        try:
            if not downloaded_from_cache:
                post_process_video(
                    video_dir,
                    video_id,
                    preset,
                    self.video_format,
                    self.low_quality,
                )
        except Exception as e:
            logger.error(f"Failed to post process video {video_id}")
            logger.debug(e)
        else:
            # upload to cache only if recompress was successful
            if self.s3_storage and not downloaded_from_cache:
                self.upload_to_cache(s3_key, req_video_file_path, preset.VERSION)

    def download_video_files_parallel(self):
        """download videos and images parallely"""

        self.yt_downloader = YoutubeDownloader(threads=1)
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.threads
        ) as executor:
            fs = [
                executor.submit(self.download_video_files, video)
                for video in self.videos
            ]
            concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED)
        self.yt_downloader.shutdown()

    def download_subtitles(self, index, video):
        """download, converts and writes VTT subtitles for a video at a specific index in self.videos"""

        # Download the subtitle files, generate a WebVTT file
        # and save the subtitles in
        # build_dir/{video id}/subs/subs_{language code}.vtt
        if not video["subtitles"]:
            return
        video_dir = self.videos_dir.joinpath(video["id"])
        subs_dir = video_dir.joinpath("subs")
        if not subs_dir.exists():
            subs_dir.mkdir(parents=True)
        else:
            logger.debug(f"Subs dir exists already")

        # download subtitles
        logger.debug(f"Downloading subtitles for {video['title'][0]['text']}")
        valid_subs = []
        for subtitle in video["subtitles"]:
            time.sleep(0.5)  # throttling
            vtt_subtitle = WebVTT(subtitle["link"]).convert()
            if not vtt_subtitle:
                logger.error(
                    f"Subtitle file for {subtitle['languageCode']} could not be created"
                )
                continue
            valid_subs.append(subtitle)
            vtt_path = subs_dir.joinpath(f"subs_{subtitle['languageCode']}.vtt")
            with open(vtt_path, "w", encoding="utf-8") as sub_file:
                sub_file.write(vtt_subtitle)
        self.videos[index]["subtitles"] = valid_subs

    def download_subtitles_parallel(self):
        """download subtitles for all videos parallely"""

        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.threads
        ) as executor:
            fs = [
                executor.submit(self.download_subtitles, index, video)
                for index, video in enumerate(self.videos)
            ]
            concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED)

    def s3_credentials_ok(self):
        logger.info("Testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        if not self.s3_storage.check_credentials(
            list_buckets=True, bucket=True, write=True, read=True, failsafe=True
        ):
            logger.error("S3 cache connection error testing permissions.")
            logger.error(f"  Server: {self.s3_storage.url.netloc}")
            logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
            logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
            logger.error(f"  Public IP: {get_public_ip()}")
            return False
        return True

    def download_from_cache(self, key, object_path, encoder_version):
        """whether it downloaded from S3 cache"""

        if self.use_any_optimized_version:
            if not self.s3_storage.has_object(key, self.s3_storage.bucket_name):
                return False
        else:
            if not self.s3_storage.has_object_matching_meta(
                key, tag="encoder_version", value=f"v{encoder_version}"
            ):
                return False
        object_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.s3_storage.download_file(key, object_path)
        except Exception as exc:
            logger.error(f"{key} failed to download from cache: {exc}")
            return False
        logger.info(f"downloaded {object_path} from cache at {key}")
        return True

    def upload_to_cache(self, key, object_path, encoder_version):
        """whether it uploaded from S3 cache"""

        try:
            self.s3_storage.upload_file(
                object_path, key, meta={"encoder_version": f"v{encoder_version}"}
            )
        except Exception as exc:
            logger.error(f"{key} failed to upload to cache: {exc}")
            return False
        logger.info(f"uploaded {object_path} to cache at {key}")
        return True

    def remove_failed_topics_and_check_extraction(self, failed_topics):
        """removes failed topics from topics list and raises error if scraper cannot continue"""

        for topic in failed_topics:
            self.topics.remove(topic)
        if not self.topics:
            if self.source_languages:
                raise ValueError(
                    "No videos found for any topic in the language(s) requested. Check topic(s) and/or language code supplied to --languages"
                )
            raise ValueError("Wrong topic(s) were supplied. No videos found")

    def run(self):
        logger.info(
            f"Starting scraper with:\n"
            f"  langs: {', '.join(self.source_languages)}\n"
            f"  subtitles : {', '.join(self.subtitles_setting) if isinstance(self.subtitles_setting, list) else self.subtitles_setting}\n"
            f"  video format : {self.video_format}"
        )

        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError("Unable to connect to Optimization Cache. Check its URL.")
        if self.s3_storage:
            logger.info(
                f"Using cache: {self.s3_storage.url.netloc} with bucket: {self.s3_storage.bucket_name}"
            )

        # playlist mode requested
        if self.playlist:
            self.extract_videos_from_playlist(self.playlist)
        # topic(s) mode requested
        else:
            failed = []
            for topic in self.topics:
                if not self.extract_videos_from_topics(topic):
                    failed.append(topic)
                else:
                    logger.debug(f"Successfully scraped {topic}")
            self.remove_failed_topics_and_check_extraction(failed)

        self.add_default_language()
        self.update_zim_metadata()
        self.download_video_files_parallel()
        self.download_subtitles_parallel()
        self.render_home_page()
        self.render_video_pages()
        self.copy_files_to_build_directory()
        self.generate_datafile()

        # zim creation and cleanup
        if not self.no_zim:
            self.fname = (
                self.fname or f"{self.name.replace(' ', '-')}_{{period}}.zim"
            ).format(period=datetime.datetime.now().strftime("%Y-%m"))
            logger.info("building ZIM file")
            if not self.output_dir.exists():
                self.output_dir.mkdir(parents=True)
            make_zim_file(
                build_dir=self.build_dir,
                fpath=self.output_dir.joinpath(self.fname),
                name=self.name,
                main_page="index",
                favicon="favicon.png",
                title=self.title,
                description=self.description,
                language=self.zim_lang,
                creator=self.creator,
                publisher=self.publisher,
                tags=self.tags + ["_category:ted", "ted", "_videos:yes"],
                scraper=SCRAPER,
            )
            if not self.keep_build_dir:
                logger.info("removing temp folder")
                shutil.rmtree(self.build_dir, ignore_errors=True)

        logger.info("Done Everything")
Example #3
0
def s3_upload_file(
    src_path,
    upload_uri,
    filesize,
    private_key,  # not used
    resume=False,  # not supported
    move=False,  # not relevant
    delete=False,
    compress=False,  # not relevant
    bandwidth=None,  # not supported
    cipher=None,  # not relevant
    delete_after=None,  # nb of days to expire upload file after
):
    started_on = now()
    s3_storage = KiwixStorage(rebuild_uri(upload_uri, scheme="https").geturl())
    logger.debug(f"S3 initialized for {s3_storage.url.netloc}/{s3_storage.bucket_name}")

    key = upload_uri.path[1:]
    if upload_uri.path.endswith("/"):
        key += src_path.name

    try:
        logger.info(f"Uploading to {key}")
        hook = FileTransferHook(filename=src_path)
        s3_storage.upload_file(fpath=str(src_path), key=key, Callback=hook)
        print("", flush=True)
    except Exception as exc:
        # as there is no resume, uploading to existing URL will result in DELETE+UPLOAD
        # if credentials doesn't allow DELETE or if there is an unsatisfied
        # retention, will raise PermissionError
        logger.error(f"uploader failed: {exc}")
        logger.exception(exc)
        return 1
    ended_on = now()
    logger.info("uploader ran successfuly.")

    # setting autodelete
    if delete_after is not None:
        try:
            # set expiration after bucket's min retention.
            # bucket retention is 1d minumum.
            # can be configured to loger value.
            # if expiration before bucket min retention, raises 400 Bad Request
            # on compliance
            expire_on = (
                datetime.datetime.now()
                + datetime.timedelta(days=delete_after or 1)
                # adding 1mn to prevent clash with bucket's equivalent min retention
                + datetime.timedelta(seconds=60)
            )
            logger.info(f"Setting autodelete to {expire_on}")
            s3_storage.set_object_autodelete_on(key=key, on=expire_on)
        except Exception as exc:
            logger.error(f"Failed to set autodelete: {exc}")
            logger.exception(exc)

    if delete:
        remove_source_file(src_path)
    display_stats(filesize, started_on, ended_on)

    return 0
Example #4
0
class Youtube2Zim:
    def __init__(
        self,
        collection_type,
        youtube_id,
        api_key,
        video_format,
        low_quality,
        nb_videos_per_page,
        all_subtitles,
        autoplay,
        output_dir,
        no_zim,
        fname,
        debug,
        tmp_dir,
        keep_build_dir,
        max_concurrency,
        youtube_store,
        language,
        locale_name,
        tags,
        dateafter,
        use_any_optimized_version,
        s3_url_with_credentials,
        title=None,
        description=None,
        creator=None,
        publisher=None,
        name=None,
        profile_image=None,
        banner_image=None,
        main_color=None,
        secondary_color=None,
    ):
        # data-retrieval info
        self.collection_type = collection_type
        self.youtube_id = youtube_id
        self.api_key = api_key
        self.dateafter = dateafter

        # video-encoding info
        self.video_format = video_format
        self.low_quality = low_quality

        # options & zim params
        self.nb_videos_per_page = nb_videos_per_page
        self.all_subtitles = all_subtitles
        self.autoplay = autoplay
        self.fname = fname
        self.language = language
        self.tags = [t.strip() for t in tags.split(",")]
        self.title = title
        self.description = description
        self.creator = creator
        self.publisher = publisher
        self.name = name
        self.profile_image = profile_image
        self.banner_image = banner_image
        self.main_color = main_color
        self.secondary_color = secondary_color

        # directory setup
        self.output_dir = Path(output_dir).expanduser().resolve()
        if tmp_dir:
            tmp_dir = Path(tmp_dir).expanduser().resolve()
            tmp_dir.mkdir(parents=True, exist_ok=True)
        self.build_dir = Path(tempfile.mkdtemp(dir=tmp_dir))

        # process-related
        self.playlists = []
        self.uploads_playlist_id = None
        self.videos_ids = []
        self.main_channel_id = None  # use for branding

        # debug/devel options
        self.no_zim = no_zim
        self.debug = debug
        self.keep_build_dir = keep_build_dir
        self.max_concurrency = max_concurrency

        # update youtube credentials store
        youtube_store.update(build_dir=self.build_dir,
                             api_key=self.api_key,
                             cache_dir=self.cache_dir)

        # Optimization-cache
        self.s3_url_with_credentials = s3_url_with_credentials
        self.use_any_optimized_version = use_any_optimized_version
        self.video_quality = "low" if self.low_quality else "high"
        self.s3_storage = None

        # set and record locale for translations
        locale_name = locale_name or get_language_details(
            self.language)["iso-639-1"]
        try:
            self.locale = setlocale(ROOT_DIR, locale_name)
        except locale.Error:
            logger.error(
                f"No locale for {locale_name}. Use --locale to specify it. defaulting to en_US"
            )
            self.locale = setlocale(ROOT_DIR, "en")

    @property
    def root_dir(self):
        return ROOT_DIR

    @property
    def templates_dir(self):
        return self.root_dir.joinpath("templates")

    @property
    def assets_src_dir(self):
        return self.templates_dir.joinpath("assets")

    @property
    def assets_dir(self):
        return self.build_dir.joinpath("assets")

    @property
    def channels_dir(self):
        return self.build_dir.joinpath("channels")

    @property
    def cache_dir(self):
        return self.build_dir.joinpath("cache")

    @property
    def videos_dir(self):
        return self.build_dir.joinpath("videos")

    @property
    def profile_path(self):
        return self.build_dir.joinpath("profile.jpg")

    @property
    def banner_path(self):
        return self.build_dir.joinpath("banner.jpg")

    @property
    def is_user(self):
        return self.collection_type == USER

    @property
    def is_channel(self):
        return self.collection_type == CHANNEL

    @property
    def is_playlist(self):
        return self.collection_type == PLAYLIST

    @property
    def is_single_channel(self):
        if self.is_channel or self.is_user:
            return True
        return len(list(set([pl.creator_id for pl in self.playlists]))) == 1

    @property
    def sorted_playlists(self):
        """ sorted list of playlists (by title) but with Uploads one at first if any """
        if len(self.playlists) < 2:
            return self.playlists

        sorted_playlists = sorted(self.playlists, key=lambda x: x.title)
        index = 0
        # make sure our Uploads, special playlist is first
        if self.uploads_playlist_id:
            try:
                index = [
                    index for index, p in enumerate(sorted_playlists)
                    if p.playlist_id == self.uploads_playlist_id
                ][-1]
            except Exception:
                index = 0
        return ([sorted_playlists[index]] + sorted_playlists[0:index] +
                sorted_playlists[index + 1:])

    def run(self):
        """ execute the scraper step by step """

        self.validate_id()

        # validate dateafter input
        self.validate_dateafter_input()

        logger.info(
            f"starting youtube scraper for {self.collection_type}#{self.youtube_id}"
        )
        logger.info("preparing build folder at {}".format(
            self.build_dir.resolve()))
        self.prepare_build_folder()

        logger.info("testing Youtube credentials")
        if not credentials_ok():
            raise ValueError(
                "Unable to connect to Youtube API v3. check `API_KEY`.")

        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError(
                "Unable to connect to Optimization Cache. Check its URL.")

        # fail early if supplied branding files are missing
        self.check_branding_values()

        logger.info("compute playlists list to retrieve")
        self.extract_playlists()

        logger.info(".. {} playlists:\n   {}".format(
            len(self.playlists),
            "\n   ".join([p.playlist_id for p in self.playlists]),
        ))

        logger.info("compute list of videos")
        self.extract_videos_list()

        nb_videos_msg = f".. {len(self.videos_ids)} videos"
        if self.dateafter.start.year != 1:
            nb_videos_msg += (
                f" in date range: {self.dateafter.start} - {datetime.date.today()}"
            )
        logger.info(f"{nb_videos_msg}.")

        # download videos (and recompress)
        logger.info(
            f"downloading all videos, subtitles and thumbnails (concurrency={self.max_concurrency})"
        )
        logger.info(f"  format: {self.video_format}")
        logger.info(f"  quality: {self.video_quality}")
        logger.info(f"  generated-subtitles: {self.all_subtitles}")
        if self.s3_storage:
            logger.info(
                f"  using cache: {self.s3_storage.url.netloc} with bucket: {self.s3_storage.bucket_name}"
            )
        succeeded, failed = self.download_video_files(
            max_concurrency=self.max_concurrency)
        if failed:
            logger.error(
                f"{len(failed)} video(s) failed to download: {failed}")
            if len(failed) >= len(succeeded):
                logger.critical("More than half of videos failed. exiting")
                raise IOError("Too much videos failed to download")

        logger.info("retrieve channel-info for all videos (author details)")
        get_videos_authors_info(succeeded)

        logger.info("download all author's profile pictures")
        self.download_authors_branding()

        logger.info("update general metadata")
        self.update_metadata()

        logger.info("creating HTML files")
        self.make_html_files(succeeded)

        # make zim file
        os.makedirs(self.output_dir, exist_ok=True)
        if not self.no_zim:
            period = datetime.datetime.now().strftime("%Y-%m")
            self.fname = (self.fname.format(
                period=period) if self.fname else f"{self.name}_{period}.zim")
            logger.info("building ZIM file")
            make_zim_file(
                build_dir=self.build_dir,
                fpath=self.output_dir / self.fname,
                name=self.name,
                main_page="home.html",
                favicon="favicon.jpg",
                title=self.title,
                description=self.description,
                language=self.language,
                creator=self.creator,
                publisher="Kiwix",
                tags=self.tags,
                scraper=SCRAPER,
            )

            if not self.keep_build_dir:
                logger.info("removing temp folder")
                shutil.rmtree(self.build_dir, ignore_errors=True)

        logger.info("all done!")

    def s3_credentials_ok(self):
        logger.info("testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        if not self.s3_storage.check_credentials(list_buckets=True,
                                                 bucket=True,
                                                 write=True,
                                                 read=True,
                                                 failsafe=True):
            logger.error("S3 cache connection error testing permissions.")
            logger.error(f"  Server: {self.s3_storage.url.netloc}")
            logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
            logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
            logger.error(f"  Public IP: {get_public_ip()}")
            return False
        return True

    def validate_dateafter_input(self):
        try:
            self.dateafter = youtube_dl.DateRange(self.dateafter)
        except Exception as exc:
            logger.error(
                f"Invalid dateafter input. Valid dateafter format: "
                f"YYYYMMDD or (now|today)[+-][0-9](day|week|month|year)(s).")
            raise ValueError(f"Invalid dateafter input: {exc}")

    def validate_id(self):
        # space not allowed in youtube-ID
        self.youtube_id = self.youtube_id.replace(" ", "")
        if self.collection_type == "channel" and len(self.youtube_id) > 24:
            raise ValueError("Invalid ChannelId")
        if "," in self.youtube_id and self.collection_type != "playlist":
            raise ValueError("Invalid YoutubeId")

    def prepare_build_folder(self):
        """ prepare build folder before we start downloading data """

        # copy assets
        shutil.copytree(self.assets_src_dir, self.assets_dir)

        fix_source_dir(self.assets_dir, "assets")

        # cache folder to store youtube-api results
        self.cache_dir.mkdir(exist_ok=True)

        # make videos placeholder
        self.videos_dir.mkdir(exist_ok=True)

        # make channels placeholder (profile files)
        self.channels_dir.mkdir(exist_ok=True)

    def check_branding_values(self):
        """checks that user-supplied images and colors are valid (so to fail early)

        Images are checked for existence or downloaded then resized
        Colors are check for validity"""

        # skip this step if none of related values were supplied
        if not sum([
                bool(x) for x in (
                    self.profile_image,
                    self.banner_image,
                    self.main_color,
                    self.secondary_color,
                )
        ]):
            return
        logger.info("checking your branding files and values")
        if self.profile_image:
            if self.profile_image.startswith("http"):
                stream_file(self.profile_image, self.profile_path)
            else:
                if not self.profile_image.exists():
                    raise IOError(
                        f"--profile image could not be found: {self.profile_image}"
                    )
                shutil.move(self.profile_image, self.profile_path)
            resize_image(self.profile_path,
                         width=100,
                         height=100,
                         method="thumbnail")
        if self.banner_image:
            if self.banner_image.startswith("http"):
                stream_file(self.banner_image, self.banner_path)
            else:
                if not self.banner_image.exists():
                    raise IOError(
                        f"--banner image could not be found: {self.banner_image}"
                    )
                shutil.move(self.banner_image, self.banner_path)
            resize_image(self.banner_path,
                         width=1060,
                         height=175,
                         method="thumbnail")

        if self.main_color and not is_hex_color(self.main_color):
            raise ValueError(
                f"--main-color is not a valid hex color: {self.main_color}")

        if self.secondary_color and not is_hex_color(self.secondary_color):
            raise ValueError(
                f"--secondary_color-color is not a valid hex color: {self.secondary_color}"
            )

    def extract_playlists(self):
        """prepare a list of Playlist from user request

        USER: we fetch the hidden channel associate to it
        CHANNEL (and USER): we grab all playlists + `uploads` playlist
        PLAYLIST: we retrieve from the playlist Id(s)"""

        (
            self.playlists,
            self.main_channel_id,
            self.uploads_playlist_id,
        ) = extract_playlists_details_from(self.collection_type,
                                           self.youtube_id)

    def extract_videos_list(self):
        all_videos = load_json(self.cache_dir, "videos")
        if all_videos is None:
            all_videos = {}

            # we only return video_ids that we'll use later on. per-playlist JSON stored
            for playlist in self.playlists:
                videos_json = get_videos_json(playlist.playlist_id)
                # filter in videos within date range and filter away deleted videos
                skip_outofrange = functools.partial(skip_outofrange_videos,
                                                    self.dateafter)
                filter_videos = filter(skip_outofrange, videos_json)
                filter_videos = filter(skip_deleted_videos, filter_videos)
                all_videos.update(
                    {v["contentDetails"]["videoId"]: v
                     for v in filter_videos})
            save_json(self.cache_dir, "videos", all_videos)
        self.videos_ids = [*all_videos.keys()
                           ]  # unpacking so it's subscriptable

    def download_video_files(self, max_concurrency):

        audext, vidext = {
            "webm": ("webm", "webm"),
            "mp4": ("m4a", "mp4")
        }[self.video_format]

        # prepare options which are shared with every downloader
        options = {
            "cachedir": self.videos_dir,
            "writethumbnail": True,
            "write_all_thumbnails": False,
            "writesubtitles": True,
            "allsubtitles": True,
            "subtitlesformat": "vtt",
            "keepvideo": False,
            "ignoreerrors": False,
            "retries": 20,
            "fragment-retries": 50,
            "skip-unavailable-fragments": True,
            # "external_downloader": "aria2c",
            # "external_downloader_args": ["--max-tries=20", "--retry-wait=30"],
            "outtmpl": str(self.videos_dir.joinpath("%(id)s",
                                                    "video.%(ext)s")),
            "preferredcodec": self.video_format,
            "format":
            f"best[ext={vidext}]/bestvideo[ext={vidext}]+bestaudio[ext={audext}]/best",
            "y2z_videos_dir": self.videos_dir,
        }
        if self.all_subtitles:
            options.update({"writeautomaticsub": True})

        # find number of actuall parallel workers
        nb_videos = len(self.videos_ids)
        concurrency = nb_videos if nb_videos < max_concurrency else max_concurrency

        # short-circuit concurency if we have only one thread (can help debug)
        if concurrency <= 1:
            return self.download_video_files_batch(options, self.videos_ids)

        # prepare out videos_ids batches
        def get_slot():
            n = 0
            while True:
                yield n
                n += 1
                if n >= concurrency:
                    n = 0

        batches = [[] for _ in range(0, concurrency)]
        slot = get_slot()
        for video_id in self.videos_ids:
            batches[next(slot)].append(video_id)

        overall_succeeded = []
        overall_failed = []
        # execute the batches concurrently
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=concurrency) as executor:
            fs = [
                executor.submit(self.download_video_files_batch, options,
                                videos_ids) for videos_ids in batches
            ]
            done, not_done = concurrent.futures.wait(
                fs, return_when=concurrent.futures.ALL_COMPLETED)

            # we have some `not_done` batches, indicating errors within
            if not_done:
                logger.critical(
                    "Not all video-processing batches completed. Cancelling…")
                for future in not_done:
                    exc = future.exception()
                    if exc:
                        logger.exception(exc)
                        raise exc

            # retrieve our list of successful/failed video_ids
            for future in done:
                succeeded, failed = future.result()
                overall_succeeded += succeeded
                overall_failed += failed

        # remove left-over files for failed downloads
        logger.debug(
            f"removing left-over files of {len(overall_failed)} failed videos")
        for video_id in overall_failed:
            shutil.rmtree(self.videos_dir.joinpath(video_id),
                          ignore_errors=True)

        return overall_succeeded, overall_failed

    def download_from_cache(self, key, video_path, encoder_version):
        """ whether it successfully downloaded from cache """
        if self.use_any_optimized_version:
            if not self.s3_storage.has_object(key,
                                              self.s3_storage.bucket_name):
                return False
        else:
            if not self.s3_storage.has_object_matching_meta(
                    key, tag="encoder_version", value=f"v{encoder_version}"):
                return False
        video_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.s3_storage.download_file(key, video_path)
        except Exception as exc:
            logger.error(f"{key} failed to download from cache: {exc}")
            return False
        logger.info(f"downloaded {video_path} from cache at {key}")
        return True

    def upload_to_cache(self, key, video_path, encoder_version):
        """ whether it successfully uploaded to cache """
        try:
            self.s3_storage.upload_file(
                video_path,
                key,
                meta={"encoder_version": f"v{encoder_version}"})
        except Exception as exc:
            logger.error(f"{key} failed to upload to cache: {exc}")
            return False
        logger.info(f"uploaded {video_path} to cache at {key}")
        return True

    def download_video(self, video_id, options):
        """ download the video from cache/youtube and return True if successful """

        preset = {"mp4": VideoMp4Low}.get(self.video_format, VideoWebmLow)()
        options_copy = options.copy()
        video_location = options_copy["y2z_videos_dir"].joinpath(video_id)
        video_path = video_location.joinpath(f"video.{self.video_format}")

        if self.s3_storage:
            s3_key = f"{self.video_format}/{self.video_quality}/{video_id}"
            logger.debug(
                f"Attempting to download video file for {video_id} from cache..."
            )
            if self.download_from_cache(s3_key, video_path, preset.VERSION):
                return True

        try:
            # skip downloading the thumbnails
            options_copy.update({
                "writethumbnail": False,
                "writesubtitles": False,
                "allsubtitles": False,
                "writeautomaticsub": False,
            })
            with youtube_dl.YoutubeDL(options_copy) as ydl:
                ydl.download([video_id])
            post_process_video(
                video_location,
                video_id,
                preset,
                self.video_format,
                self.low_quality,
            )
        except (
                youtube_dl.utils.DownloadError,
                FileNotFoundError,
                subprocess.CalledProcessError,
        ) as exc:
            logger.error(f"Video file for {video_id} could not be downloaded")
            logger.debug(exc)
            return False
        else:  # upload to cache only if everything went well
            if self.s3_storage:
                logger.debug(
                    f"Uploading video file for {video_id} to cache ...")
                self.upload_to_cache(s3_key, video_path, preset.VERSION)
            return True

    def download_thumbnail(self, video_id, options):
        """ download the thumbnail from cache/youtube and return True if successful """

        preset = WebpHigh()
        options_copy = options.copy()
        video_location = options_copy["y2z_videos_dir"].joinpath(video_id)
        thumbnail_path = video_location.joinpath("video.webp")

        if self.s3_storage:
            s3_key = f"thumbnails/high/{video_id}"
            logger.debug(
                f"Attempting to download thumbnail for {video_id} from cache..."
            )
            if self.download_from_cache(s3_key, thumbnail_path,
                                        preset.VERSION):
                return True

        try:
            # skip downloading the video
            options_copy.update({
                "skip_download": True,
                "writesubtitles": False,
                "allsubtitles": False,
                "writeautomaticsub": False,
            })
            with youtube_dl.YoutubeDL(options_copy) as ydl:
                ydl.download([video_id])
            process_thumbnail(thumbnail_path, preset)
        except (
                youtube_dl.utils.DownloadError,
                FileNotFoundError,
                subprocess.CalledProcessError,
        ) as exc:
            logger.error(f"Thumbnail for {video_id} could not be downloaded")
            logger.debug(exc)
            return False
        else:  # upload to cache only if everything went well
            if self.s3_storage:
                logger.debug(
                    f"Uploading thumbnail for {video_id} to cache ...")
                self.upload_to_cache(s3_key, thumbnail_path, preset.VERSION)
            return True

    def download_subtitles(self, video_id, options):
        """ download subtitles for a video """

        options_copy = options.copy()
        options_copy.update({"skip_download": True, "writethumbnail": False})
        try:
            with youtube_dl.YoutubeDL(options_copy) as ydl:
                ydl.download([video_id])
        except Exception:
            logger.error(f"Could not download subtitles for {video_id}")

    def download_video_files_batch(self, options, videos_ids):
        """ download video file and thumbnail for all videos in batch and return succeeded and failed video ids """

        succeeded = []
        failed = []
        for video_id in videos_ids:
            if self.download_video(video_id,
                                   options) and self.download_thumbnail(
                                       video_id, options):
                self.download_subtitles(video_id, options)
                succeeded.append(video_id)
            else:
                failed.append(video_id)
        return succeeded, failed

    def download_authors_branding(self):
        videos_channels_json = load_json(self.cache_dir, "videos_channels")
        uniq_channel_ids = list(
            set([chan["channelId"] for chan in videos_channels_json.values()]))
        for channel_id in uniq_channel_ids:
            save_channel_branding(self.channels_dir,
                                  channel_id,
                                  save_banner=False)
            self.copy_default_banner(channel_id)

    def copy_default_banner(self, channel_id):
        banner_path = self.channels_dir / channel_id / "banner.jpg"
        if not banner_path.exists():
            shutil.copy(
                self.templates_dir / "assets" / "banner.jpg",
                self.channels_dir / channel_id / "banner.jpg",
            )

    def update_metadata(self):
        # we use title, description, profile and banner of channel/user
        # or channel of first playlist
        try:
            main_channel_json = get_channel_json(self.main_channel_id)
        except KeyError:
            main_channel_json = {
                "snippet": {
                    "title": "Unknown",
                    "description": ""
                }
            }
        else:
            save_channel_branding(self.channels_dir,
                                  self.main_channel_id,
                                  save_banner=True)
        self.copy_default_banner(self.main_channel_id)

        # if a single playlist was requested, use if for names;
        # otherwise, use main_channel's details.
        auto_title = (self.playlists[0].title
                      if self.is_playlist and len(self.playlists) == 1 else
                      main_channel_json["snippet"]["title"].strip())
        auto_description = (clean_text(self.playlists[0].description)
                            if self.is_playlist and len(self.playlists) == 1
                            else clean_text(
                                main_channel_json["snippet"]["description"]))
        self.title = self.title or auto_title or "-"
        self.description = self.description or auto_description or "-"

        if self.creator is None:
            if self.is_single_channel:
                self.creator = _("Youtube Channel “{title}”").format(
                    title=main_channel_json["snippet"]["title"])
            else:
                self.creator = _("Youtube Channels")
        self.publisher = self.publisher or "Kiwix"

        self.tags = self.tags or ["youtube"]
        if "_videos:yes" not in self.tags:
            self.tags.append("_videos:yes")

        # copy our main_channel branding into /(profile|banner).jpg if not supplied
        if not self.profile_path.exists():
            shutil.copy(
                self.channels_dir.joinpath(self.main_channel_id,
                                           "profile.jpg"),
                self.profile_path,
            )
        if not self.banner_path.exists():
            shutil.copy(
                self.channels_dir.joinpath(self.main_channel_id, "banner.jpg"),
                self.banner_path,
            )

        # set colors from images if not supplied
        if self.main_color is None or self.secondary_color is None:
            profile_main, profile_secondary = get_colors(self.profile_path)
        self.main_color = self.main_color or profile_main
        self.secondary_color = self.secondary_color or profile_secondary

        resize_image(
            self.profile_path,
            width=48,
            height=48,
            method="thumbnail",
            dst=self.build_dir.joinpath("favicon.jpg"),
        )

    def make_html_files(self, actual_videos_ids):
        """make up HTML structure to read the content

        /home.html                                  Homepage

        for each video:
            - <slug-title>.html                     HTML article
            - videos/<videoId>/video.<ext>          video file
            - videos/<videoId>/video.<lang>.vtt     subtititle(s)
            - videos/<videoId>/video.webp            template
        """
        def remove_unused_videos(videos):
            video_ids = [
                video["contentDetails"]["videoId"] for video in videos
            ]
            for path in self.videos_dir.iterdir():
                if path.is_dir() and path.name not in video_ids:
                    logger.debug(f"Removing unused video {path.name}")
                    shutil.rmtree(path, ignore_errors=True)

        def is_present(video):
            """ whether this video has actually been succeffuly downloaded """
            return video["contentDetails"]["videoId"] in actual_videos_ids

        def video_has_channel(videos_channels, video):
            return video["contentDetails"]["videoId"] in videos_channels

        def get_subtitles(video_id):
            video_dir = self.videos_dir.joinpath(video_id)
            languages = [
                x.stem.split(".")[1] for x in video_dir.iterdir()
                if x.is_file() and x.name.endswith(".vtt")
            ]

            def to_jinja_subtitle(lang):
                try:
                    subtitle = get_language_details(
                        YOUTUBE_LANG_MAP.get(lang, lang))
                except Exception:
                    logger.error(f"Failed to get language details for {lang}")
                    raise
                return {
                    "code": lang,
                    # Youtube.com uses `English - code` format.
                    # Note: videojs displays it lowercased anyway
                    "name":
                    f"{subtitle['english'].title()} - {subtitle['query']}",
                }

            # Youtube.com sorts subtitles by English name
            return sorted(map(to_jinja_subtitle, languages),
                          key=lambda x: x["name"])

        env = jinja2.Environment(loader=jinja2.FileSystemLoader(
            str(self.templates_dir)),
                                 autoescape=True)

        videos = load_json(self.cache_dir, "videos").values()
        # filter videos so we only include the ones we could retrieve
        videos = list(filter(is_present, videos))
        videos_channels = load_json(self.cache_dir, "videos_channels")
        has_channel = functools.partial(video_has_channel, videos_channels)
        # filter videos to exclude those for which we have no channel (#76)
        videos = list(filter(has_channel, videos))
        for video in videos:
            video_id = video["contentDetails"]["videoId"]
            title = video["snippet"]["title"]
            slug = get_slug(title)
            description = video["snippet"]["description"]
            publication_date = dt_parser.parse(
                video["contentDetails"]["videoPublishedAt"])
            author = videos_channels[video_id]
            subtitles = get_subtitles(video_id)
            video_url = f"https://www.youtube.com/watch?v={video_id}"

            html = env.get_template("article.html").render(
                video_id=video_id,
                video_format=self.video_format,
                author=author,
                title=title,
                description=description,
                date=format_date(publication_date,
                                 format="medium",
                                 locale=self.locale),
                subtitles=subtitles,
                url=video_url,
                channel_id=video["snippet"]["channelId"],
                color=self.main_color,
                background_color=self.secondary_color,
                autoplay=self.autoplay,
            )
            with open(self.build_dir.joinpath(f"{slug}.html"),
                      "w",
                      encoding="utf-8") as fp:
                fp.write(html)

        # build homepage
        html = env.get_template("home.html").render(
            playlists=self.playlists,
            video_format=self.video_format,
            title=self.title,
            description=self.description,
            color=self.main_color,
            background_color=self.secondary_color,
            page_label=_("Page {current}/{total}"),
            back_label=_("Back to top"),
        )
        with open(self.build_dir.joinpath("home.html"), "w",
                  encoding="utf-8") as fp:
            fp.write(html)

        # rewrite app.js including `format`
        with open(self.assets_dir.joinpath("app.js"), "w",
                  encoding="utf-8") as fp:
            fp.write(
                env.get_template("assets/app.js").render(
                    video_format=self.video_format))

        # rewrite app.js including `pagination`
        with open(self.assets_dir.joinpath("db.js"), "w",
                  encoding="utf-8") as fp:
            fp.write(
                env.get_template("assets/db.js").render(
                    NB_VIDEOS_PER_PAGE=self.nb_videos_per_page))

        # write list of videos in data.js
        def to_data_js(video):
            return {
                "id":
                video["contentDetails"]["videoId"],
                "title":
                video["snippet"]["title"],
                "slug":
                get_slug(video["snippet"]["title"]),
                "description":
                video["snippet"]["description"],
                "subtitles":
                get_subtitles(video["contentDetails"]["videoId"]),
                "thumbnail":
                str(
                    Path("videos").joinpath(video["contentDetails"]["videoId"],
                                            "video.webp")),
            }

        with open(self.assets_dir.joinpath("data.js"), "w",
                  encoding="utf-8") as fp:
            # write all playlists as they are
            for playlist in self.playlists:
                # retrieve list of videos for PL
                playlist_videos = load_json(
                    self.cache_dir, f"playlist_{playlist.playlist_id}_videos")
                # filtering-out missing ones (deleted or not downloaded)
                playlist_videos = list(
                    filter(skip_deleted_videos, playlist_videos))
                playlist_videos = list(filter(is_present, playlist_videos))
                playlist_videos = list(filter(has_channel, playlist_videos))
                # sorting them based on playlist
                playlist_videos.sort(key=lambda v: v["snippet"]["position"])

                fp.write("var json_{slug} = {json_str};\n".format(
                    slug=playlist.slug,
                    json_str=json.dumps(list(map(to_data_js, playlist_videos)),
                                        indent=4),
                ))

        # write a metadata.json file with some content-related data
        with open(self.build_dir.joinpath("metadata.json"),
                  "w",
                  encoding="utf-8") as fp:
            json.dump({"video_format": self.video_format}, fp, indent=4)

        # clean videos left out in videos directory
        remove_unused_videos(videos)
Example #5
0
    def upload_image_s3(self):
        self.logger.info("Starting S3 upload")

        # add credentials to URL
        url = urllib.parse.urlparse(self.task["upload_uri"])
        qs = urllib.parse.parse_qs(url.query)
        qs["keyId"] = Setting.s3_access_key
        qs["secretAccessKey"] = Setting.s3_secret_key

        # setup upload logging
        uploader_log = io.StringIO()
        uploader_logger = logging.getLogger("uploader_log")
        uploader_logger.propagate = True
        uploader_logger.setLevel(logging.DEBUG)
        uploader_logger.addHandler(logging.StreamHandler(stream=uploader_log))

        # init and test storage
        uploader_logger.info("initializing S3")
        s3_storage = KiwixStorage(
            urllib.parse.SplitResult(
                "https",
                url.netloc,
                url.path,
                urllib.parse.urlencode(qs, doseq=True),
                url.fragment,
            ).geturl())
        uploader_logger.debug(
            f"S3 initialized for {s3_storage.url.netloc}/{s3_storage.bucket_name}"
        )

        # torrent
        dl_url = urllib.parse.urlparse(self.task["download_uri"])
        upload_torrent = "torrent" in dl_url.scheme

        if upload_torrent:
            parts = list(urllib.parse.urlsplit(dl_url.geturl()))
            parts[0] = parts[0].replace("+torrent", "")
            dl_url = urllib.parse.urlparse(urllib.parse.urlunsplit(parts))

            uploader_logger.info(
                f"Creating torrent file for {self.img_path.name}")
            torrent_path = self.img_path.with_suffix(
                f"{self.img_path.suffix}.torrent")
            download_url = f"{dl_url.geturl()}/{self.img_path.name}"
            torrent = torf.Torrent(
                path=self.img_path,
                trackers=[
                    "https://opentracker.xyz:443/announce",
                    "http://torrent.nwps.ws:80/announce",
                    "udp://tracker.open-internet.nl:6969/announce",
                    "udp://tracker.coppersurfer.tk:6969/announce",
                    "udp://tracker.openbittorrent.com:80/announce",
                ],
                webseeds=[download_url],
            )
            torrent.generate()
            torrent.write(torrent_path)
            uploader_logger.info(f".. created {torrent_path.name}")

            uploader_logger.info(f"Uploading {torrent_path.name}")
            s3_storage.upload_file(fpath=str(torrent_path),
                                   key=torrent_path.name)
            uploader_logger.info(".. uploaded")
            torrent_path.unlink()

        # upload
        uploader_logger.info(f"Uploading to {self.img_path.name}")
        try:
            hook = ImageTransferHook(output=uploader_log, fpath=self.img_path)
            s3_storage.upload_file(fpath=str(self.img_path),
                                   key=self.img_path.name,
                                   Callback=hook)
            uploaded = True
        except Exception as exc:
            uploaded = False
            uploader_logger.error(f"uploader failed: {exc}")
            uploader_logger.exception(exc)
        else:
            uploader_logger.info("uploader ran successfuly.")

        # setting autodelete
        try:
            # make sure autodelete is above bucket's min retention (should be 1d)
            expire_on = datetime.datetime.now() + datetime.timedelta(
                days=max([2, self.task["media_duration"]]))
            uploader_logger.info(f"Setting autodelete to {expire_on}")
            s3_storage.set_object_autodelete_on(key=self.img_path.name,
                                                on=expire_on)
        except Exception as exc:
            uploader_logger.error("Failed to set autodelete")
            uploader_logger.exception(exc)

        if upload_torrent:
            try:
                uploader_logger.info(
                    f"Setting torrent autodelete to {expire_on}")
                s3_storage.set_object_autodelete_on(key=torrent_path.name,
                                                    on=expire_on)
            except Exception as exc:
                uploader_logger.error("Failed to set autodelete on torrent")
                uploader_logger.exception(exc)

        self.logger.info("collecting uploader log")
        try:
            self.logs["uploader_log"] = uploader_log.getvalue()
            uploader_log.close()
        except Exception as exc:
            self.logger.error(f"Failed to collect logs: {exc}")

        # remove image
        try:
            self.logger.info("removing image file: {}".format(
                self.img_path.name))
            self.img_path.unlink()
        except Exception as exp:
            self.logger.error("Unable to remove image file: {}".format(exp))
            self.logger.exception(exp)

        if not uploaded:
            raise subprocess.SubprocessError("S3 upload failed")