Example #1
def storage_for_key(key):

    # init and test storage
    s3_storage = KiwixStorage(get_url_for())

    for wh in ("download", "warehouse"):
        bucket_name = f"org-kiwix-hotspot-cardshop-{wh}"
        print(f"testing {bucket_name}")
        if s3_storage.has_object(key, bucket_name=bucket_name):
            return KiwixStorage(get_url_for(bucket_name))

    raise ValueError(f"Couldn't find a bucket with key `{key}`")
Example #2
def test_url_access(url):
    s3 = KiwixStorage(url)
    print(f"testing: {s3.url.geturl()}")

    for test in ("list_buckets", "bucket", "write", "write_and_read"):
        params = {test: True}
        if test == "write_and_read":
            params = {"write": True, "read": True}
        result = s3.check_credentials(failsafe=True, **params)
        print(f"can {test}: {result}")
        if test == "list_buckets":
            print(f"{s3.bucket_names=}")
Example #3
    def s3_credentials_ok(self):
        logger.info("Testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        if not self.s3_storage.check_credentials(
            list_buckets=True, bucket=True, write=True, read=True, failsafe=True
        ):
            logger.error("S3 cache connection error testing permissions.")
            logger.error(f"  Server: {self.s3_storage.url.netloc}")
            logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
            logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
            logger.error(f"  Public IP: {get_public_ip()}")
            return False
        return True
Example #4
def setup_s3_and_check_credentials(s3_url_with_credentials):
    logger.info("testing S3 Optimization Cache credentials")
    s3_storage = KiwixStorage(s3_url_with_credentials)
    if not s3_storage.check_credentials(
            list_buckets=True, bucket=True, write=True, read=True,
            failsafe=True):
        logger.error("S3 cache connection error testing permissions.")
        logger.error(f"  Server: {s3_storage.url.netloc}")
        logger.error(f"  Bucket: {s3_storage.bucket_name}")
        logger.error(f"  Key ID: {s3_storage.params.get('keyid')}")
        logger.error(f"  Public IP: {get_public_ip()}")
        raise ValueError(
            "Unable to connect to Optimization Cache. Check its URL.")
    return s3_storage
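An illustrative caller for the function above (URL and credentials are placeholders); the ValueError raised on failure can be caught to disable the cache gracefully:

try:
    s3_storage = setup_s3_and_check_credentials(
        "https://s3.example.com/?keyId=KEY&secretAccessKey=SECRET&bucketName=cache"
    )
except ValueError:
    s3_storage = None  # a scraper could also abort here instead of running without a cache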
Example #5
def do_upload_file(url: str, fpath: pathlib.Path, key: str = None):
    if not fpath.exists():
        raise IOError(f"{fpath} missing.")
    fsize = fpath.stat().st_size
    if not key:
        key = fpath.name

    s3 = KiwixStorage(url)
    dest = f"s3://{s3.url.netloc}/{s3.bucket_name}"

    if s3.has_object(key):
        raise ValueError(f"Key `{key}` already exists at {dest}. Specify another one.")
    print(f"Uploading {fpath.name} ({format_size(fsize)}) to {dest}/{key}")

    progress = CustomProgressBar(fsize).callback if progressbar else True
    s3.upload_file(fpath=fpath, key=key, progress=progress)
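An illustrative call (file path, URL and credentials are placeholders); when key is omitted, the file name is used as the object key:

import pathlib

do_upload_file(
    url="https://s3.example.com/?keyId=KEY&secretAccessKey=SECRET&bucketName=my-bucket",
    fpath=pathlib.Path("/tmp/backup.zim"),  # key defaults to "backup.zim"
)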
Example #6
    def get_list_of_all(self, mode):
        """returns a list of topics or playlists"""
        # get all topics
        topics_list = json.loads(
            download_link(
                "https://www.ted.com/topics/combo?models=Talks").text)
        if mode == "topic":
            return topics_list

        # mode is playlist
        s3_url_with_credentials = None
        s3_arg = "--optimization-cache"
        for index, arg in enumerate(self.extra_args):
            if arg.startswith(s3_arg):
                s3_url_with_credentials = (arg[len(s3_arg) + 1:] if "=" in arg
                                           else self.extra_args[index + 1])
                break

        if s3_url_with_credentials:
            s3_storage = KiwixStorage(s3_url_with_credentials)
            if not s3_storage.check_credentials(list_buckets=True,
                                                bucket=True,
                                                write=True,
                                                read=True,
                                                failsafe=True):
                logger.error(
                    "S3 credential check failed. Continuing without S3")
                return self.download_playlists_list_from_site(topics_list)

            key = "playlists_list.json"
            playlists_list = self.download_playlists_list_from_cache(
                key, s3_storage)
            if not playlists_list:
                logger.debug("Attempting to retrieve playlists list from TED")
                playlists_list = self.download_playlists_list_from_site(
                    topics_list)
                self.upload_playlists_list_to_cache(playlists_list, key,
                                                    s3_storage)
            return playlists_list
        return self.download_playlists_list_from_site(topics_list)
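The argument-scanning loop above accepts the --optimization-cache value either fused with "=" or as the following argument; both of the illustrative forms below (placeholder credentials) would be picked up:

extra_args = ["--optimization-cache=https://s3.example.com/?keyId=K&secretAccessKey=S&bucketName=cache"]
# or, as two separate arguments:
extra_args = ["--optimization-cache", "https://s3.example.com/?keyId=K&secretAccessKey=S&bucketName=cache"]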
Example #7
    def process_image(self, url: str, path, is_profile: bool = False) -> str:
        """download image from url or S3 and add to Zim at path. Upload if req."""

        if self.aborted:
            return

        # setup resizing based on request
        resize_args = ({
            "width": PROFILE_IMAGE_SIZE,
            "height": PROFILE_IMAGE_SIZE,
            "method": "thumbnail",
        } if is_profile else {
            "width": POSTS_IMAGE_SIZE
        })

        # just download, optimize and add to ZIM if not using S3
        if not Global.conf.s3_url:
            with Global.lock:
                Global.creator.add_item_for(
                    path=path,
                    content=self.get_image_data(url.geturl(),
                                                **resize_args).getvalue(),
                    mimetype="image/webp",
                    is_front=False,
                    callback=self.once_done,
                )
            return path

        # we are using S3 cache
        ident = self.get_version_ident_for(url.geturl())
        if ident is None:
            logger.error(f"Unable to query {url.geturl()}. Skipping")
            return path

        key = self.get_s3_key_for(url.geturl())
        s3_storage = KiwixStorage(Global.conf.s3_url)
        meta = {"ident": ident, "encoder_version": str(IMAGES_ENCODER_VERSION)}

        download_failed = False  # useful to trigger reupload or not
        try:
            logger.debug(f"Attempting download of S3::{key} into ZIM::{path}")
            fileobj = io.BytesIO()
            s3_storage.download_matching_fileobj(key, fileobj, meta=meta)
        except NotFoundError:
            # don't have it, not a download error. we'll upload after processing
            pass
        except Exception as exc:
            logger.error(f"failed to download {key} from cache: {exc}")
            logger.exception(exc)
            download_failed = True
        else:
            with Global.lock:
                Global.creator.add_item_for(
                    path=path,
                    content=fileobj.getvalue(),
                    mimetype="image/webp",
                    is_front=False,
                    callback=self.once_done,
                )
            return path

        # we're using S3 but don't have it or failed to download
        try:
            fileobj = self.get_image_data(url.geturl(), **resize_args)
        except Exception as exc:
            logger.error(
                f"Failed to download/convert/optimize source at {url.geturl()}")
            logger.exception(exc)
            return path

        with Global.lock:
            Global.creator.add_item_for(
                path=path,
                content=fileobj.getvalue(),
                mimetype="image/webp",
                is_front=False,
                callback=self.once_done,
            )

        # only upload it if we didn't have it in cache
        if not download_failed:
            logger.debug(f"Uploading {url.geturl()} to S3::{key} with {meta}")
            try:
                s3_storage.upload_fileobj(fileobj=fileobj, key=key, meta=meta)
            except Exception as exc:
                logger.error(f"{key} failed to upload to cache: {exc}")

        return path
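A condensed sketch (not from the source) of the cache-then-upload flow process_image implements: try to fetch an object whose metadata matches, otherwise process locally and upload the result. The kiwixstorage import mirrors what the scraper above appears to use and is an assumption:

import io
import logging

from kiwixstorage import KiwixStorage, NotFoundError  # assumed import, mirroring the scraper above

logger = logging.getLogger(__name__)


def cached_or_process(s3_url, key, meta, process):
    """return a BytesIO either fetched from the S3 cache or freshly produced by process()"""
    s3 = KiwixStorage(s3_url)
    fileobj = io.BytesIO()
    try:
        # only succeeds if the stored object carries matching meta (ident / encoder_version)
        s3.download_matching_fileobj(key, fileobj, meta=meta)
        return fileobj
    except NotFoundError:
        pass  # not cached yet (or meta mismatch): process locally, then upload for next time
    fileobj = process()
    try:
        s3.upload_fileobj(fileobj=fileobj, key=key, meta=meta)
    except Exception as exc:
        logger.error(f"{key} failed to upload to cache: {exc}")
    return fileobj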
Example #8
class Ted2Zim:
    def __init__(
        self,
        topics,
        debug,
        name,
        video_format,
        low_quality,
        output_dir,
        no_zim,
        fname,
        languages,
        locale_name,
        title,
        description,
        creator,
        publisher,
        tags,
        keep_build_dir,
        autoplay,
        use_any_optimized_version,
        s3_url_with_credentials,
        playlist,
        subtitles_enough,
        subtitles_setting,
        tmp_dir,
        threads,
    ):

        # video-encoding info
        self.video_format = video_format
        self.low_quality = low_quality

        # zim params
        self.fname = fname
        self.languages = (
            [] if languages is None else [l.strip() for l in languages.split(",")]
        )
        self.tags = [] if tags is None else [t.strip() for t in tags.split(",")]
        self.title = title
        self.description = description
        self.creator = creator
        self.publisher = publisher
        self.name = name

        # directory setup
        self.output_dir = pathlib.Path(output_dir).expanduser().resolve()
        if tmp_dir:
            pathlib.Path(tmp_dir).mkdir(parents=True, exist_ok=True)
        self.build_dir = pathlib.Path(tempfile.mkdtemp(dir=tmp_dir))

        # scraper options
        self.topics = (
            []
            if not topics
            else [c.strip().replace(" ", "+") for c in topics.split(",")]
        )
        self.autoplay = autoplay
        self.playlist = playlist
        self.subtitles_enough = subtitles_enough
        self.subtitles_setting = (
            subtitles_setting
            if subtitles_setting == ALL
            or subtitles_setting == MATCHING
            or subtitles_setting == NONE
            else self.to_ted_langcodes(
                [lang.strip() for lang in subtitles_setting.split(",")]
            )
        )
        self.threads = threads
        self.yt_downloader = None

        # optimization cache
        self.s3_url_with_credentials = s3_url_with_credentials
        self.use_any_optimized_version = use_any_optimized_version
        self.s3_storage = None
        self.video_quality = "low" if self.low_quality else "high"

        # debug/developer options
        self.no_zim = no_zim
        self.keep_build_dir = keep_build_dir
        self.debug = debug

        # class members
        self.videos = []
        self.playlist_title = None
        self.playlist_description = None
        self.source_languages = (
            [] if not self.languages else self.to_ted_langcodes(self.languages)
        )
        self.zim_lang = None
        self.already_visited = []

        # set and record locale for translations
        locale_details = get_language_details(locale_name)
        if locale_details["querytype"] != "locale":
            locale_name = locale_details["iso-639-1"]
        try:
            self.locale = setlocale(ROOT_DIR, locale_name)
        except locale.Error:
            logger.error(
                f"No locale for {locale_name}. Use --locale to specify it. "
                "defaulting to en_US"
            )
            self.locale = setlocale(ROOT_DIR, "en")
        # locale's language code
        self.locale_name = self.to_ted_langcodes(locale_name)

    @property
    def templates_dir(self):
        return ROOT_DIR.joinpath("templates")

    @property
    def videos_dir(self):
        return self.build_dir.joinpath("videos")

    @property
    def ted_videos_json(self):
        return self.build_dir.joinpath("ted_videos.json")

    @property
    def ted_topics_json(self):
        return self.build_dir.joinpath("ted_topics.json")

    @property
    def talks_base_url(self):
        return BASE_URL + "talks"

    @property
    def playlists_base_url(self):
        return BASE_URL + "playlists"

    def append_part1_or_part3(self, lang_code_list, lang_info):
        """Fills missing ISO languages codes for all in list

        lang_code_list: list og lang codes
        lang_info: see zimscraperlib.i18n"""

        # ignore extra language mappings if supplied query was an iso-639-1 code
        if "part1" in lang_info["iso_types"]:
            lang_code_list.append(lang_info["iso-639-1"])

        # supplied query was not iso-639-1
        else:
            if lang_info["iso-639-1"]:
                lang_code_list.append(lang_info["iso-639-1"])
                # check for extra language codes to include
                if lang_info["iso-639-1"] in TEDLANGS["mappings"]:
                    for code in TEDLANGS["mappings"][lang_info["iso-639-1"]]:
                        lang_code_list.append(code)
            elif lang_info["iso-639-3"]:
                lang_code_list.append(lang_info["iso-639-3"])
            else:
                supplied_lang = lang_info["query"]
                logger.error(f"Language {supplied_lang} is not supported by TED")

    def to_ted_langcodes(self, languages):
        """Converts languages queries into TED language codes

        Examples:
            ["English", "fr", "hin"] => ["en", "fr", "hi"]
            ["chi", "fake"] => ["zh", "zh-cn", "zh-tw"]
        """

        lang_code_list = []
        for lang in languages:
            lang_info = get_language_details(lang, failsafe=True)
            if lang_info:
                if lang_info["querytype"] == "purecode":
                    self.append_part1_or_part3(lang_code_list, lang_info)
                elif lang_info["querytype"] == "locale":
                    query = lang_info["query"].replace("_", "-")
                    if query in TEDLANGS["locales"]:
                        lang_code_list.append(query)
                    else:
                        self.append_part1_or_part3(lang_code_list, lang_info)
                else:
                    self.append_part1_or_part3(lang_code_list, lang_info)
        return list(set(lang_code_list))

    def extract_videos_from_playlist(self, playlist):
        """extracts metadata for all videos in the given playlist

        calls extract_video_info on all links to get this data
        """

        playlist_url = f"{self.playlists_base_url}/{playlist}"
        logger.debug(f"extract_videos_from_playlist: {playlist_url}")
        soup = BeautifulSoup(download_link(playlist_url).text, features="html.parser")
        video_elements = soup.find_all("a", attrs={"class": "group"})
        self.playlist_title = soup.find("h1").string
        self.playlist_description = soup.find("p", attrs={"class": "text-base"}).string

        for element in video_elements:
            relative_path = element.get("href")
            url = urllib.parse.urljoin(self.talks_base_url, relative_path)
            if self.extract_info_from_video_page(url):
                if self.source_languages and len(self.source_languages) > 1:
                    other_lang_urls = self.generate_urls_for_other_languages(url)
                    logger.debug(
                        f"Searching info for the video in other {len(other_lang_urls)} language(s)"
                    )
                    for lang_url in other_lang_urls:
                        self.extract_info_from_video_page(lang_url)
                    self.already_visited.append(urllib.parse.urlparse(url)[2])
            logger.debug(f"Seen {relative_path}")
        logger.debug(f"Total videos found on playlist: {len(video_elements)}")
        if not video_elements:
            raise ValueError("Wrong playlist ID supplied. No videos found")

    def generate_search_result_and_scrape(self, topic_url, total_videos_scraped):
        """generates a search result and returns the total number of videos scraped"""

        page = 1
        while True:
            logger.debug(f"generate_search_result_and_scrape: {topic_url}&page={page}")
            html = download_link(f"{topic_url}&page={page}").text
            nb_videos_extracted, nb_videos_on_page = self.extract_videos_on_topic_page(
                html
            )
            if nb_videos_on_page == 0:
                break
            total_videos_scraped += nb_videos_extracted
            page += 1
        return total_videos_scraped

    def extract_videos_from_topics(self, topic):
        """extracts metadata for required number of videos on different topics"""

        logger.debug(f"Fetching video links for topic: {topic}")
        topic_url = f"{self.talks_base_url}?topics%5B%5D={topic}"
        total_videos_scraped = 0

        if self.source_languages:
            for lang in self.source_languages:
                topic_url = topic_url + f"&language={lang}"
                total_videos_scraped = self.generate_search_result_and_scrape(
                    topic_url, total_videos_scraped
                )

        else:
            total_videos_scraped = self.generate_search_result_and_scrape(
                topic_url, total_videos_scraped
            )

        logger.info(f"Total video links found in {topic}: {total_videos_scraped}")
        if total_videos_scraped == 0:
            return False
        return True

    def update_zim_metadata(self):

        if not self.languages:
            self.zim_lang = "eng"
        else:
            if len(self.source_languages) > 1:
                self.zim_lang = "mul"
            else:
                lang_info = get_language_details(
                    self.source_languages[0], failsafe=True
                )
                if lang_info:
                    self.zim_lang = lang_info["iso-639-3"]
                else:
                    self.zim_lang = "eng"

        if self.playlist:
            if not self.title:
                self.title = self.playlist_title.strip()
            if not self.description:
                self.description = self.playlist_description.strip()
        else:
            if len(self.topics) > 1:
                if not self.title:
                    self.title = "TED Collection"
                if not self.description:
                    self.description = "A selection of TED videos from several topics"
            else:
                topic_str = self.topics[0].replace("+", " ")
                if not self.title:
                    self.title = f"{topic_str.capitalize()} from TED"
                if not self.description:
                    self.description = f"A selection of {topic_str} videos from TED"

    def get_display_name(self, lang_code, lang_name):
        """Display name for language"""

        lang_info = get_language_details(lang_code, failsafe=True)
        if lang_code != "en" and lang_info:

            return lang_info["native"] + " - " + lang_name
        return lang_name

    def get_subtitle_dict(self, lang):
        """dict of language name and code from a larger dict lang

        Example:
        {
            'languageCode': 'en',
            'languageName': 'English'
        }
        """

        return {
            "languageName": self.get_display_name(
                lang["languageCode"], lang["languageName"]
            ),
            "languageCode": lang["languageCode"],
        }

    def generate_subtitle_list(self, video_id, langs, page_lang, audio_lang):
        """List of all subtitle languages with link to their pages"""

        subtitles = []
        if self.subtitles_setting == ALL or (
            not self.source_languages and self.topics and self.subtitles_setting != NONE
        ):
            subtitles = [self.get_subtitle_dict(lang) for lang in langs]
        elif self.subtitles_setting == MATCHING or (
            self.subtitles_enough
            and self.subtitles_setting == NONE
            and page_lang != audio_lang
        ):
            subtitles = [
                self.get_subtitle_dict(lang)
                for lang in langs
                if lang["languageCode"] == page_lang
            ]
        elif self.subtitles_setting and self.subtitles_setting != NONE:
            if not self.subtitles_enough and self.topics:
                subtitles = [
                    self.get_subtitle_dict(lang)
                    for lang in langs
                    if lang["languageCode"] in self.subtitles_setting
                ]
            else:
                subtitles = [
                    self.get_subtitle_dict(lang)
                    for lang in langs
                    if lang["languageCode"] in self.subtitles_setting
                    or lang["languageCode"] in self.source_languages
                ]

        return update_subtitles_list(video_id, subtitles)

    def generate_urls_for_other_languages(self, url):
        """Possible URLs for other requested languages based on a video url"""

        urls = []
        page_lang, query = self.get_lang_code_from_url(url, with_full_query=True)
        url_parts = list(urllib.parse.urlparse(url))

        # update the language query field value with other languages and form URLs
        for language in self.source_languages:
            if language != page_lang:
                query.update({"language": language})
                url_parts[4] = urllib.parse.urlencode(query)
                urls.append(urllib.parse.urlunparse(url_parts))
        return urls

    def extract_videos_on_topic_page(self, page_html):

        # all videos are embedded in a <div> with the class name 'row'.
        # we are searching for the div inside this div, that has an <a>-tag
        # with the class name 'media__image', because this is the relative
        # link to the representative TED talk. We turn each relative link
        # into an absolute link and call extract_info_from_video_page on it
        soup = BeautifulSoup(page_html, features="html.parser")
        video_links = soup.select("div.row div.media__image a")
        nb_extracted = 0
        nb_listed = len(video_links)
        logger.debug(f"{nb_listed} video(s) found on current page")
        for video_link in video_links:
            url = urllib.parse.urljoin(self.talks_base_url, video_link["href"])
            if self.extract_info_from_video_page(url):
                nb_extracted += 1
                if self.source_languages and len(self.source_languages) > 1:
                    other_lang_urls = self.generate_urls_for_other_languages(url)
                    logger.debug(
                        f"Searching info for video in other {len(other_lang_urls)} language(s)"
                    )
                    for lang_url in other_lang_urls:
                        self.extract_info_from_video_page(lang_url)
                    self.already_visited.append(urllib.parse.urlparse(url)[2])
            logger.debug(f"Seen {video_link['href']}")
        return nb_extracted, nb_listed

    def get_lang_code_from_url(self, url, with_full_query=False):
        """gets the queried language code from a ted talk url"""

        # sample - https://www.ted.com/talks/alex_rosenthal_the_gauntlet_think_like_a_coder_ep_8?language=ja
        url_parts = list(urllib.parse.urlparse(url))

        # explode url to extract `language` query field value
        query = dict(urllib.parse.parse_qsl(url_parts[4]))
        current_lang = query.get("language")
        if with_full_query:
            return current_lang, query
        return current_lang

    def extract_download_link(self, talk_data):
        """Returns download link / youtube video ID for a TED video"""

        if (
            isinstance(talk_data.get("resources", {}).get("h264"), list)
            and len(talk_data["resources"]["h264"])
            and talk_data["resources"]["h264"][0].get("file")
        ):
            logger.debug(
                "Using h264 resource link for bitrate="
                f"{talk_data['resources']['h264'][0].get('bitrate')}"
            )
            return talk_data["resources"]["h264"][0]["file"]

        logger.error("No download link found for the video")
        return None

    def update_videos_list(
        self,
        video_id,
        lang_code,
        lang_name,
        title,
        description,
        speaker,
        speaker_profession,
        speaker_bio,
        speaker_picture,
        date,
        thumbnail,
        video_link,
        length,
        subtitles,
    ):
        # append to self.videos and return True if not already present
        if not [video for video in self.videos if video.get("id", None) == video_id]:
            self.videos.append(
                {
                    "id": video_id,
                    "languages": [
                        {
                            "languageCode": lang_code,
                            "languageName": self.get_display_name(lang_code, lang_name),
                        }
                    ],
                    "title": [{"lang": lang_code, "text": title}],
                    "description": [{"lang": lang_code, "text": description}],
                    "speaker": speaker,
                    "speaker_profession": speaker_profession,
                    "speaker_bio": speaker_bio,
                    "speaker_picture": speaker_picture,
                    "date": date,
                    "thumbnail": thumbnail,
                    "video_link": video_link,
                    "length": length,
                    "subtitles": subtitles,
                }
            )
            logger.debug(f"Successfully inserted video {video_id} into video list")
            return True

        # update localized meta for video if already in self.videos
        # based on --subtitles=matching
        logger.debug(f"Video {video_id} already present in video list")
        for index, video in enumerate(self.videos):
            if video.get("id", None) == video_id:
                if {"lang": lang_code, "text": title} not in video["title"]:
                    self.videos[index]["title"].append(
                        {"lang": lang_code, "text": title}
                    )
                    self.videos[index]["description"].append(
                        {"lang": lang_code, "text": description}
                    )
                    self.videos[index]["languages"].append(
                        {
                            "languageCode": lang_code,
                            "languageName": self.get_display_name(lang_code, lang_name),
                        }
                    )
                if self.subtitles_setting == MATCHING or self.subtitles_setting == NONE:
                    self.videos[index]["subtitles"] += subtitles
        return False

    def extract_video_info_from_json(self, json_data):
        player_data = json.loads(json_data["playerData"])
        lang_code = json_data["language"]
        lang_name = [
            lang["languageName"]
            for lang in player_data["languages"]
            if lang["languageCode"] == lang_code
        ][-1]
        # talk_info = json_data["talks"][0]
        native_talk_language = player_data["nativeLanguage"]
        if (
            not self.subtitles_enough
            and self.source_languages
            and native_talk_language != lang_code
            and self.topics
        ):
            return False

        # Extract the speaker of the TED talk
        if len(json_data["speakers"]):
            speaker_info = json_data["speakers"][0]
            speaker = " ".join(
                [
                    speaker_info.get("firstName"),
                    speaker_info.get("middleName"),
                    speaker_info.get("lastName"),
                ]
            )
        else:
            speaker_info = {
                "description": "None",
                "whotheyare": "None",
                "photo_url": "",
            }
            if "presenterDisplayName" in json_data:
                speaker = json_data["presenterDisplayName"]
            else:
                speaker = "None"

        # Extract the ted talk details from json
        video_id = json_data["id"]
        speaker_profession = speaker_info["description"]
        speaker_bio = speaker_info.get("whoTheyAre", "-")
        speaker_picture = speaker_info.get("avatar", "-")
        title = json_data["title"]
        description = json_data["description"]
        date = dateutil.parser.parse(json_data["recordedOn"]).strftime("%d %B %Y")
        length = int(json_data["duration"]) // 60
        thumbnail = player_data["thumb"]
        video_link = self.extract_download_link(player_data)
        if not video_link:
            logger.error("No suitable download link found. Skipping video")
            return False

        langs = player_data["languages"]
        subtitles = self.generate_subtitle_list(
            video_id, langs, lang_code, native_talk_language
        )
        return self.update_videos_list(
            video_id=video_id,
            lang_code=lang_code,
            lang_name=lang_name,
            title=title,
            description=description,
            speaker=speaker,
            speaker_profession=speaker_profession,
            speaker_bio=speaker_bio,
            speaker_picture=speaker_picture,
            date=date,
            thumbnail=thumbnail,
            video_link=video_link,
            length=length,
            subtitles=subtitles,
        )

    def extract_info_from_video_page(self, url, retry_count=0):
        """extract all info from a TED video page url and update self.videos"""

        # Every TED video page has a <script>-tag with a Javascript
        # object with JSON in it. We will just strip away the object
        # signature and load the json to extract meta-data out of it.
        # returns True if successfully scraped new video

        # don't scrape if URL already visited
        if urllib.parse.urlparse(url)[2] in self.already_visited:
            return False

        # don't scrape if maximum retry count is reached
        if retry_count > 5:
            logger.error("Max retries exceeded. Skipping video")
            return False

        logger.debug(f"extract_info_from_video_page: {url}")
        soup = BeautifulSoup(download_link(url).text, features="html.parser")

        json_data = json.loads(
            soup.find("script", attrs={"id": "__NEXT_DATA__"}).string
        )["props"]["pageProps"]["videoData"]

        requested_lang_code = self.get_lang_code_from_url(url)
        if requested_lang_code and json_data["language"] != requested_lang_code:
            logger.error(
                f"Video has not yet been translated into {requested_lang_code}"
            )
            return False
        return self.extract_video_info_from_json(json_data)

    def add_default_language(self):
        """add metatada in default language (english or first avail) on all videos"""

        for video in self.videos:
            en_found = False
            for index, lang in enumerate(video["languages"]):
                if lang["languageCode"] == "en":
                    en_found = True
                    video["title"] = [
                        {"lang": "default", "text": video["title"][index]["text"]}
                    ] + video["title"]
                    video["description"] = [
                        {"lang": "default", "text": video["description"][index]["text"]}
                    ] + video["description"]

            if not en_found:
                video["title"] = [
                    {"lang": "default", "text": video["title"][0]["text"]}
                ] + video["title"]
                video["description"] = [
                    {"lang": "default", "text": video["description"][0]["text"]}
                ] + video["description"]

            # update video slug
            video["slug"] = slugify(video["title"][0]["text"], separator="-")

    def render_video_pages(self):

        # Render static html pages from the scraped video data and
        # save the pages in build_dir/<video-id>/index.html
        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(str(self.templates_dir)), autoescape=True
        )
        for video in self.videos:
            titles = video["title"]
            html = env.get_template("article.html").render(
                speaker=video["speaker"],
                languages=video["subtitles"],
                speaker_bio=video["speaker_bio"].replace("Full bio", ""),
                speaker_img=video["speaker_picture"],
                date=video["date"],
                profession=video["speaker_profession"],
                video_format=self.video_format,
                autoplay=self.autoplay,
                video_id=str(video["id"]),
                title=get_main_title(titles, self.locale_name),
                titles=titles,
                descriptions=video["description"],
                back_to_list=_("Back to the list"),
            )
            html_path = self.build_dir.joinpath(video["slug"])
            with open(html_path, "w", encoding="utf-8") as html_page:
                html_page.write(html)

    def render_home_page(self):

        # Render the homepage
        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(str(self.templates_dir)), autoescape=True
        )
        all_langs = {
            language["languageCode"]: language["languageName"]
            for video in self.videos
            for language in video["subtitles"] + video["languages"]
        }
        languages = [
            {"languageName": value, "languageCode": key}
            for key, value in all_langs.items()
        ]
        languages = sorted(languages, key=lambda x: x["languageName"])
        html = env.get_template("home.html").render(
            languages=languages,
            page_title=_("TED Talks"),
            language_filter_text=_("Filter by language"),
            back_to_top=_("Back to the top"),
            pagination_text=_("Page"),
        )
        home_page_path = self.build_dir.joinpath("index")
        with open(home_page_path, "w", encoding="utf-8") as html_page:
            html_page.write(html)

    def copy_files_to_build_directory(self):

        # Copy files from template_dir to build_dir
        assets_dir = self.templates_dir.joinpath("assets")
        if assets_dir.exists():
            shutil.copytree(
                assets_dir, self.build_dir.joinpath("assets"), dirs_exist_ok=True
            )
        shutil.copy(
            self.templates_dir.joinpath("favicon.png"),
            self.build_dir.joinpath("favicon.png"),
        )

    def generate_datafile(self):
        """Generate data.js inside assets folder"""

        video_list = []
        for video in self.videos:
            lang_codes = [lang["languageCode"] for lang in video["subtitles"]] + [
                lang["languageCode"] for lang in video["languages"]
            ]
            json_data = {
                "languages": [lang_code for lang_code in set(lang_codes)],
                "id": video["id"],
                "description": video["description"],
                "title": video["title"],
                "speaker": video["speaker"],
                "slug": video["slug"],
            }
            video_list.append(json_data)
        assets_path = self.build_dir.joinpath("assets")
        if not assets_path.exists():
            assets_path.mkdir(parents=True)

        with open(assets_path.joinpath("data.js"), "w") as data_file:
            data_file.write("json_data = " + json.dumps(video_list, indent=4))

    def download_jpeg_image_and_convert(
        self, url, fpath, preset_options={}, resize=None
    ):
        """downloads a JPEG image and converts and optimizes it into desired format detected from fpath"""

        org_jpeg_path = pathlib.Path(
            tempfile.NamedTemporaryFile(delete=False, suffix=".jpg").name
        )
        save_large_file(url, org_jpeg_path)
        if resize is not None:
            resize_image(
                org_jpeg_path,
                width=resize[0],
                height=resize[1],
                method="cover",
            )
        optimize_image(
            org_jpeg_path, fpath, convert=True, delete_src=True, **preset_options
        )
        logger.debug(f"Converted {org_jpeg_path} to {fpath} and optimized ")

    def download_speaker_image(
        self, video_id, video_title, video_speaker, speaker_path
    ):
        """downloads the speaker image"""

        downloaded_from_cache = False
        preset = WebpMedium()
        if self.s3_storage:
            s3_key = f"speaker_image/{video_id}"
            downloaded_from_cache = self.download_from_cache(
                s3_key, speaker_path, preset.VERSION
            )
        if not downloaded_from_cache:
            try:
                # download an image of the speaker
                if not video_speaker:
                    logger.debug("Speaker doesn't have an image")
                else:
                    logger.debug(f"Downloading Speaker image for {video_title}")
                    self.download_jpeg_image_and_convert(
                        video_speaker, speaker_path, preset_options=preset.options
                    )
            except Exception:
                logger.error(f"Could not download speaker image for {video_title}")
            else:
                if self.s3_storage and video_speaker:
                    self.upload_to_cache(s3_key, speaker_path, preset.VERSION)

    def download_thumbnail(
        self, video_id, video_title, video_thumbnail, thumbnail_path
    ):
        """download the thumbnail"""

        downloaded_from_cache = False
        preset = WebpMedium()
        if self.s3_storage:
            s3_key = f"thumbnail/{video_id}"
            downloaded_from_cache = self.download_from_cache(
                s3_key, thumbnail_path, preset.VERSION
            )
        if not downloaded_from_cache:
            try:
                # download the thumbnail of the video
                logger.debug(f"Downloading thumbnail for {video_title}")
                self.download_jpeg_image_and_convert(
                    video_thumbnail,
                    thumbnail_path,
                    preset_options=preset.options,
                    resize=(248, 187),
                )
            except Exception:
                logger.error(f"Could not download thumbnail for {video_title}")
            else:
                if self.s3_storage:
                    self.upload_to_cache(s3_key, thumbnail_path, preset.VERSION)

    def download_video_files(self, video):
        """download all video files (video, thumbnail, speaker)"""

        # Download all the TED talk videos and the meta-data for it.
        # Save the videos in build_dir/{video id}/video.mp4.
        # Save the thumbnail for the video in build_dir/{video id}/thumbnail.jpg.
        # Save the image of the speaker in build_dir/{video id}/speaker.jpg.

        # set up variables
        video_id = str(video["id"])
        # Take the English version of the title or else whatever language it's available in
        video_title = video["title"][0]["text"]
        video_link = video["video_link"]
        video_speaker = video["speaker_picture"]
        video_thumbnail = video["thumbnail"]
        video_dir = self.videos_dir.joinpath(video_id)
        org_video_file_path = video_dir.joinpath("video.mp4")
        req_video_file_path = video_dir.joinpath(f"video.{self.video_format}")
        speaker_path = video_dir.joinpath("speaker.webp")
        thumbnail_path = video_dir.joinpath("thumbnail.webp")

        # ensure that video directory exists
        if not video_dir.exists():
            video_dir.mkdir(parents=True)

        # set preset
        preset = {"mp4": VideoMp4Low}.get(self.video_format, VideoWebmLow)()

        # download video
        downloaded_from_cache = False
        logger.debug(f"Downloading {video_title}")
        if self.s3_storage:
            s3_key = f"{self.video_format}/{self.video_quality}/{video_id}"
            downloaded_from_cache = self.download_from_cache(
                s3_key, req_video_file_path, preset.VERSION
            )
        if not downloaded_from_cache:
            try:
                if "https://" not in video_link:
                    options = (
                        BestWebm if self.video_format == "webm" else BestMp4
                    ).get_options(
                        target_dir=video_dir, filepath=pathlib.Path("video.%(ext)s")
                    )
                    self.yt_downloader.download(video_link, options)
                else:
                    save_large_file(video_link, org_video_file_path)
            except Exception:
                logger.error(f"Could not download {org_video_file_path}")

        # download speaker and thumbnail images
        self.download_speaker_image(video_id, video_title, video_speaker, speaker_path)
        self.download_thumbnail(video_id, video_title, video_thumbnail, thumbnail_path)

        # recompress if necessary
        try:
            if not downloaded_from_cache:
                post_process_video(
                    video_dir,
                    video_id,
                    preset,
                    self.video_format,
                    self.low_quality,
                )
        except Exception as e:
            logger.error(f"Failed to post process video {video_id}")
            logger.debug(e)
        else:
            # upload to cache only if recompress was successful
            if self.s3_storage and not downloaded_from_cache:
                self.upload_to_cache(s3_key, req_video_file_path, preset.VERSION)

    def download_video_files_parallel(self):
        """download videos and images parallely"""

        self.yt_downloader = YoutubeDownloader(threads=1)
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.threads
        ) as executor:
            fs = [
                executor.submit(self.download_video_files, video)
                for video in self.videos
            ]
            concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED)
        self.yt_downloader.shutdown()

    def download_subtitles(self, index, video):
        """download, converts and writes VTT subtitles for a video at a specific index in self.videos"""

        # Download the subtitle files, generate a WebVTT file
        # and save the subtitles in
        # build_dir/{video id}/subs/subs_{language code}.vtt
        if not video["subtitles"]:
            return
        video_dir = self.videos_dir.joinpath(video["id"])
        subs_dir = video_dir.joinpath("subs")
        if not subs_dir.exists():
            subs_dir.mkdir(parents=True)
        else:
            logger.debug(f"Subs dir exists already")

        # download subtitles
        logger.debug(f"Downloading subtitles for {video['title'][0]['text']}")
        valid_subs = []
        for subtitle in video["subtitles"]:
            time.sleep(0.5)  # throttling
            vtt_subtitle = WebVTT(subtitle["link"]).convert()
            if not vtt_subtitle:
                logger.error(
                    f"Subtitle file for {subtitle['languageCode']} could not be created"
                )
                continue
            valid_subs.append(subtitle)
            vtt_path = subs_dir.joinpath(f"subs_{subtitle['languageCode']}.vtt")
            with open(vtt_path, "w", encoding="utf-8") as sub_file:
                sub_file.write(vtt_subtitle)
        self.videos[index]["subtitles"] = valid_subs

    def download_subtitles_parallel(self):
        """download subtitles for all videos parallely"""

        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.threads
        ) as executor:
            fs = [
                executor.submit(self.download_subtitles, index, video)
                for index, video in enumerate(self.videos)
            ]
            concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED)

    def s3_credentials_ok(self):
        logger.info("Testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        if not self.s3_storage.check_credentials(
            list_buckets=True, bucket=True, write=True, read=True, failsafe=True
        ):
            logger.error("S3 cache connection error testing permissions.")
            logger.error(f"  Server: {self.s3_storage.url.netloc}")
            logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
            logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
            logger.error(f"  Public IP: {get_public_ip()}")
            return False
        return True

    def download_from_cache(self, key, object_path, encoder_version):
        """whether it downloaded from S3 cache"""

        if self.use_any_optimized_version:
            if not self.s3_storage.has_object(key, self.s3_storage.bucket_name):
                return False
        else:
            if not self.s3_storage.has_object_matching_meta(
                key, tag="encoder_version", value=f"v{encoder_version}"
            ):
                return False
        object_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.s3_storage.download_file(key, object_path)
        except Exception as exc:
            logger.error(f"{key} failed to download from cache: {exc}")
            return False
        logger.info(f"downloaded {object_path} from cache at {key}")
        return True

    def upload_to_cache(self, key, object_path, encoder_version):
        """whether it uploaded from S3 cache"""

        try:
            self.s3_storage.upload_file(
                object_path, key, meta={"encoder_version": f"v{encoder_version}"}
            )
        except Exception as exc:
            logger.error(f"{key} failed to upload to cache: {exc}")
            return False
        logger.info(f"uploaded {object_path} to cache at {key}")
        return True

    def remove_failed_topics_and_check_extraction(self, failed_topics):
        """removes failed topics from topics list and raises error if scraper cannot continue"""

        for topic in failed_topics:
            self.topics.remove(topic)
        if not self.topics:
            if self.source_languages:
                raise ValueError(
                    "No videos found for any topic in the language(s) requested. Check topic(s) and/or language code supplied to --languages"
                )
            raise ValueError("Wrong topic(s) were supplied. No videos found")

    def run(self):
        logger.info(
            f"Starting scraper with:\n"
            f"  langs: {', '.join(self.source_languages)}\n"
            f"  subtitles : {', '.join(self.subtitles_setting) if isinstance(self.subtitles_setting, list) else self.subtitles_setting}\n"
            f"  video format : {self.video_format}"
        )

        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError("Unable to connect to Optimization Cache. Check its URL.")
        if self.s3_storage:
            logger.info(
                f"Using cache: {self.s3_storage.url.netloc} with bucket: {self.s3_storage.bucket_name}"
            )

        # playlist mode requested
        if self.playlist:
            self.extract_videos_from_playlist(self.playlist)
        # topic(s) mode requested
        else:
            failed = []
            for topic in self.topics:
                if not self.extract_videos_from_topics(topic):
                    failed.append(topic)
                else:
                    logger.debug(f"Successfully scraped {topic}")
            self.remove_failed_topics_and_check_extraction(failed)

        self.add_default_language()
        self.update_zim_metadata()
        self.download_video_files_parallel()
        self.download_subtitles_parallel()
        self.render_home_page()
        self.render_video_pages()
        self.copy_files_to_build_directory()
        self.generate_datafile()

        # zim creation and cleanup
        if not self.no_zim:
            self.fname = (
                self.fname or f"{self.name.replace(' ', '-')}_{{period}}.zim"
            ).format(period=datetime.datetime.now().strftime("%Y-%m"))
            logger.info("building ZIM file")
            if not self.output_dir.exists():
                self.output_dir.mkdir(parents=True)
            make_zim_file(
                build_dir=self.build_dir,
                fpath=self.output_dir.joinpath(self.fname),
                name=self.name,
                main_page="index",
                favicon="favicon.png",
                title=self.title,
                description=self.description,
                language=self.zim_lang,
                creator=self.creator,
                publisher=self.publisher,
                tags=self.tags + ["_category:ted", "ted", "_videos:yes"],
                scraper=SCRAPER,
            )
            if not self.keep_build_dir:
                logger.info("removing temp folder")
                shutil.rmtree(self.build_dir, ignore_errors=True)

        logger.info("Done Everything")
Example #9
class Kolibri2Zim:
    def __init__(self, **kwargs):

        for option in options:
            if option not in kwargs:
                raise ValueError(f"Missing parameter `{option}`")

        def go(option):
            return kwargs.get(option)

        self.channel_id = go("channel_id")
        self.root_id = go("root_id")

        # video-encoding info
        self.use_webm = go("use_webm")
        self.low_quality = go("low_quality")
        self.autoplay = go("autoplay")

        # zim params
        self.fname = go("fname")
        self.tags = ([] if go("tags") is None else
                     [t.strip() for t in go("tags").split(",")])
        self.title = go("title")
        self.description = go("description")
        self.author = go("creator")
        self.publisher = go("publisher")
        self.name = go("name")

        # customization
        self.favicon = go("favicon")
        self.about = go("about")
        self.css = go("css")

        # directory setup
        self.output_dir = Path(go("output_dir")).expanduser().resolve()
        if go("tmp_dir"):
            Path(go("tmp_dir")).mkdir(parents=True, exist_ok=True)
        self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir")))

        # performance options
        self.nb_threads = go("threads")
        self.nb_processes = go("processes")
        self.s3_url_with_credentials = go("s3_url_with_credentials")
        self.s3_storage = None

        # debug/developer options
        self.keep_build_dir = go("keep_build_dir")
        self.debug = go("debug")
        self.only_topics = go("only_topics")

        # jinja2 environment setup
        self.jinja2_env = jinja2.Environment(loader=jinja2.FileSystemLoader(
            str(self.templates_dir)),
                                             autoescape=True)

    @property
    def templates_dir(self):
        return ROOT_DIR.joinpath("templates")

    def add_local_files(self, root_path, folder):
        """ recursively add local files from {folder} starting at {path} """
        non_front = ("viewer.html", "epub_embed.html")
        for fpath in folder.iterdir():
            path = "/".join([root_path, fpath.name])
            if fpath.is_file():
                mimetype = "text/html;raw=true" if fpath.name in non_front else None
                self.creator.add_item_for(path=path,
                                          title="",
                                          fpath=fpath,
                                          mimetype=mimetype)
                logger.debug(f"Adding {path}")
            else:
                self.add_local_files(path, fpath)

    def populate_nodes_executor(self):
        """Loop on content nodes to create zim entries from kolibri DB"""
        def schedule_node(item):
            future = self.nodes_executor.submit(self.add_node, item=item)
            self.nodes_futures.update({future: item[0]})

        # schedule root-id
        schedule_node((self.db.root["id"], self.db.root["kind"]))

        # fill queue with (node_id, kind) tuples for all root node's descendants
        for node in self.db.get_node_descendants(self.root_id):
            schedule_node((node["id"], node["kind"]))

    def add_node(self, item):
        """ process a content node from the tuple in queue """
        node_id, kind = item
        # check if we have a handler for this {kind} of node
        handler = getattr(self, f"add_{kind}_node", None)

        # debug escape: in only-topics mode, skip every other kind of node
        if self.only_topics and kind != "topic":
            return

        if handler:
            # add thumbnail to zim if there's one for this node
            thumbnail = self.db.get_node_thumbnail(node_id)
            if thumbnail:
                self.funnel_file(thumbnail["id"], thumbnail["ext"])
            # fire the add_{kind}_node() method which will actually process it
            handler(node_id)

    def funnel_file(self, fid, fext):
        """ directly add a Kolibri file to the ZIM using same name """
        url, fname = get_kolibri_url_for(fid, fext)
        with self.creator_lock:
            self.creator.add_item(URLItem(url=url, path=fname))
        logger.debug(f"Added {fname} from Studio")

    def download_to_disk(self, file_id, ext):
        """ download a Kolibri file to the build-dir using its filename """
        url, fname = get_kolibri_url_for(file_id, ext)
        fpath = self.build_dir / fname
        stream_file(url, fpath)
        return fpath

    def funnel_from_s3(self, file_id, path, checksum, preset):
        """whether it could fetch and add the file from S3 cache

        - checks if a cache is configured
        - checks if file is present
        - checks if file is valid (corresponds to same original file)
        - downloads and add to zim

        returns True if all this succeeded, False otherwise"""
        if not self.s3_storage:
            return False

        key = self.s3_key_for(file_id, preset)

        # exit early if we don't have this object in bucket
        if not self.s3_storage.has_object_matching(
                key,
                meta={
                    "checksum": checksum,
                    "encoder_version": str(preset.VERSION)
                }):
            return False

        # download file into memory
        fileobj = io.BytesIO()
        try:
            self.s3_storage.download_fileobj(key, fileobj)
        except Exception as exc:
            logger.error(f"failed to download {key} from cache: {exc}")
            logger.exception(exc)
            # make sure we fallback to re-encode
            return False

        # add to zim
        with self.creator_lock:
            self.creator.add_item(
                StaticItem(
                    path=path,
                    fileobj=fileobj,
                    mimetype=preset.mimetype,
                ))
        logger.debug(f"Added {path} from S3::{key}")
        return True

    def s3_key_for(self, file_id, preset):
        """compute in-bucket key for file"""
        return f"{file_id[0]}/{file_id[1]}/{file_id}/{type(preset).__name__.lower()}"

    def upload_to_s3(self, key, fpath, **meta):
        """ whether it successfully uploaded to cache """
        if not self.s3_storage:
            return

        logger.debug(f"Uploading {fpath.name} to S3::{key} with {meta}")
        try:
            self.s3_storage.upload_file(fpath, key, meta=meta)
        except Exception as exc:
            logger.error(f"{key} failed to upload to cache: {exc}")
            return False
        return True

    def add_topic_node(self, node_id):
        """Build and add the HTML page for a single topic node

        Topic nodes are used only for hierarchy and solely contains metadata"""

        # fetch details including parents for breadcrumb and children to link to
        node = self.db.get_node(node_id, with_parents=True, with_children=True)

        html = self.jinja2_env.get_template("topic.html").render(
            node_id=node_id, **node)
        with self.creator_lock:
            self.creator.add_item_for(path=node_id,
                                      title=node["title"],
                                      content=html,
                                      mimetype="text/html")
        logger.debug(f"Added topic #{node_id}")

    def add_video_node(self, node_id):
        """Add content from this `video` node to zim

        video node is composed of one (1) or two (2) video files and optional subtitle files
        video files are at most one of each `high_res_video` or `low_res_video`
        subtitle files (`video_subtitle`) are VTT files and are only limited by the
        number of languages to select from in kolibri studio"""

        files = self.db.get_node_files(node_id, thumbnail=False)
        if not files:
            return
        files = sorted(files, key=lambda f: f["prio"])
        it = filter(lambda f: f["supp"] == 0, files)

        try:
            # find main video file
            video_file = next(it)
        except StopIteration:
            # we have no video file
            return

        try:
            alt_video_file = next(it)
        except StopIteration:
            # we have no supplementary video file (which is OK)
            alt_video_file = None

        # now decide which file to keep and what to do with it

        # content_file has a 1:1 rel with content_localfile which is the
        # *implementation* of the file. We use that local file ID (its checksum)
        # everywhere BUT as S3 cache ID, as we want to overwrite the same key
        # should a new version of the localfile for the same file arrive.
        vid = video_file["id"]  # the local file ID (current version)
        vfid = video_file["fid"]  # the file ID in DB (version agnostic)
        vchk = video_file["checksum"]

        # we'll reencode, using the best file with appropriate preset
        if self.use_webm:
            preset = VideoWebmLow() if self.low_quality else VideoWebmHigh()
            src_fname = Path(filename_for(video_file))
            path = str(src_fname.with_suffix(f".{preset.ext}"))
            video_filename_ext = preset.ext
            video_filename = src_fname.with_suffix(
                f".{video_filename_ext}").name

            # funnel from S3 cache if it is present there
            if not self.funnel_from_s3(vfid, path, vchk, preset):

                # download original video
                src = self.download_to_disk(vid, video_file["ext"])
                dst = src.with_suffix(".webm")

                # request conversion
                self.convert_and_add_video_aside(vfid, src, vchk, dst, path,
                                                 preset)

        # we want low quality but not webm, yet have no low_res file: let's re-encode
        elif self.low_quality and alt_video_file is None:
            preset = VideoMp4Low()
            src_fname = Path(filename_for(video_file))
            path = str(src_fname.with_suffix(f".{preset.ext}"))
            video_filename_ext = preset.ext
            video_filename = src_fname.with_suffix(
                f".{video_filename_ext}").name

            # funnel from S3 cache if it is present there
            if not self.funnel_from_s3(vfid, path, vchk, preset):

                # download original video
                src = self.download_to_disk(vid, video_file["ext"])

                # move source file to a new name and swap variables so our target
                # will be the previous source path
                src_ = src.with_suffix(f"{src.suffix}.orig")
                shutil.move(src, src_)
                dst = src
                src = src_

                # request conversion
                self.convert_and_add_video_aside(vfid, src, vchk, dst, path,
                                                 preset)

        # we want mp4, either in high-q or we have a low_res file to use
        else:
            video_file = alt_video_file if self.low_quality else video_file
            self.funnel_file(video_file["id"], video_file["ext"])
            video_filename = filename_for(video_file)
            video_filename_ext = video_file["ext"]

        # prepare list of subtitles for template
        subtitles = []
        for file in filter(lambda f: f["preset"] == "video_subtitle", files):
            self.funnel_file(file["id"], file["ext"])
            try:
                local, english = find_language_names(file["lang"])
            except Exception:
                english = file["lang"]
            finally:
                subtitles.append({
                    "code": file["lang"],
                    "name": english,
                    "filename": filename_for(file),
                })

        node = self.db.get_node(node_id, with_parents=True)
        html = self.jinja2_env.get_template("video.html").render(
            node_id=node_id,
            video_filename=video_filename,
            video_filename_ext=video_filename_ext,
            subtitles=sorted(subtitles, key=lambda i: i["code"]),
            thumbnail=self.db.get_thumbnail_name(node_id),
            autoplay=self.autoplay,
            **node,
        )
        with self.creator_lock:
            self.creator.add_item_for(
                path=node_id,
                title=node["title"],
                content=html,
                mimetype="text/html",
            )

    def add_video_upon_completion(self, future):
        """adds the converted video inside this future to the zim

        logs error in case of failure"""
        if future.cancelled():
            return
        src_fname, dst_fpath, path = self.videos_futures.get(future)

        try:
            future.result()
        except Exception as exc:
            logger.error(f"Error re-encoding {src_fname}: {exc}")
            logger.exception(exc)
            return

        logger.debug(f"Re-encoded {src_fname} successfuly")

        kwargs = {
            "path": path,
            "filepath": dst_fpath,
            "mimetype": get_file_mimetype(dst_fpath),
        }
        # we shall request s3 upload on the threads pool, only once item has been
        # added to ZIM so it can be removed altogether
        if self.s3_storage:
            kwargs.update({"callback": self.request_s3_upload_and_removal})

        # simply add the item, autodeleting the file
        else:
            kwargs.update({"remove": True})

        with self.creator_lock:
            self.creator.add_item(StaticItem(**kwargs))
        logger.debug(f"Added {path} from re-encoded file")

    def convert_and_add_video_aside(self, file_id, src_fpath, src_checksum,
                                    dest_fpath, path, preset):
        """add video to the process-based convertion queue"""

        future = self.videos_executor.submit(
            reencode,
            src_path=src_fpath,
            dst_path=dest_fpath,
            ffmpeg_args=preset.to_ffmpeg_args(),
            delete_src=True,
            with_process=False,
            failsafe=False,
        )
        self.videos_futures.update(
            {future: (src_fpath.name, dest_fpath, path)})
        self.pending_upload.update({
            path: (
                dest_fpath,
                self.s3_key_for(file_id, preset),
                {
                    "checksum": src_checksum,
                    "encoder_version": str(preset.VERSION)
                },
            )
        })
        future.add_done_callback(self.add_video_upon_completion)

    def request_s3_upload_and_removal(self, item):
        """ add file from item to uploads list """
        path = item.path
        del item
        dest_fpath, key, meta = self.pending_upload.get(path)
        # TODO: submit to a thread executor (to be created) instead;
        # this is currently called on the main thread.
        self.upload_to_s3(key, dest_fpath, **meta)
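
        # A minimal sketch of the TODO above (assuming a dedicated
        # `self.uploads_executor = cf.ThreadPoolExecutor(max_workers=1)` were
        # created alongside the other executors in run()):
        #     self.uploads_executor.submit(self.upload_to_s3, key, dest_fpath, **meta)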

    def add_audio_node(self, node_id):
        """Add content from this `audio` node to zim

        an audio node is composed of a single mp3 file"""
        file = self.db.get_node_file(node_id, thumbnail=False)
        if not file:
            return
        self.funnel_file(file["id"], file["ext"])

        node = self.db.get_node(node_id, with_parents=True)
        html = self.jinja2_env.get_template("audio.html").render(
            node_id=node_id,
            filename=filename_for(file),
            ext=file["ext"],
            thumbnail=self.db.get_thumbnail_name(node_id),
            autoplay=self.autoplay,
            **node,
        )
        with self.creator_lock:
            self.creator.add_item_for(
                path=node_id,
                title=node["title"],
                content=html,
                mimetype="text/html",
            )

    def add_exercise_node(self, node_id):
        """Add content from this `exercise` node to zim

        exercise node is composed of a single perseus file

        a perseus file is a ZIP containing an exercise.json entrypoint and other files

        we'd solely add the perseus file in the ZIM along with the perseus reader from
        https://github.com/Khan/perseus"""
        logger.warning(f"[NOT SUPPORTED] not adding exercice node {node_id}")

    def add_document_node(self, node_id):
        """Add content from this `document` node to zim

        a document node is composed of one main (`priority` 1) file and
        an optional (`priority` 2) file
        Format for each is either `pdf` (`document` preset) or `epub` (`epub` preset)


        - add the actual PDF/epub files to zim at /{node_id}.{ext} (files' IDs)
        - add an HTML page linking to files for download
        - includes an iframe with the appropriate viewer
         - using pdf.js for PDF
         - using epub.js for EPUB
        - add an additional page for the alternate document with its viewer
        """
        def target_for(file):
            filename = filename_for(file)
            if file["ext"] == "pdf":
                return f"./assets/pdfjs/web/viewer.html?file=../../../{filename}"
            if file["ext"] == "epub":
                return f"./assets/epub_embed.html?url=../{filename}"

        # record the actual document
        files = self.db.get_node_files(node_id, thumbnail=False)
        if not files:
            return
        files = sorted(filter(lambda f: f["supp"] == 0, files),
                       key=lambda f: f["prio"])
        it = iter(files)

        try:
            main_document = next(it)
        except StopIteration:
            return

        try:
            alt_document = next(it)
        except StopIteration:
            alt_document = None

        for file in files:
            self.funnel_file(file["id"], file["ext"])
            file["target"] = target_for(file)

        node = self.db.get_node(node_id, with_parents=True)
        # convert generator to list as we might read it twice
        node["parents"] = list(node["parents"])

        # generate page once for each document, changing only `is_alt`
        if alt_document:
            options = [False, True]
        else:
            options = [False]  # main_document only

        for is_alt in options:
            html = self.jinja2_env.get_template("document.html").render(
                node_id=node_id,
                main_document=filename_for(main_document),
                main_document_ext=main_document["ext"],
                alt_document=filename_for(alt_document)
                if alt_document else None,
                alt_document_ext=alt_document["ext"] if alt_document else None,
                target=target_for(alt_document if is_alt else main_document),
                is_alt=is_alt,
                **node,
            )
            with self.creator_lock:
                path = node_id
                if is_alt:
                    path += "_alt"
                self.creator.add_item_for(
                    path=path,
                    title=node["title"],
                    content=html,
                    mimetype="text/html",
                )

    def add_html5_node(self, node_id):
        """Add content from this `html5` node to zim

        an html5 node is a single ZIP file containing a standalone HTML app
        whose entrypoint is a file named index.html

        we extract and add each file from the ZIP to /{node_id}/

        Note: Studio doesn't enforce the mandatory index.html, thus allowing invalid
        (unreachable) html5 apps"""

        file = self.db.get_node_file(node_id, thumbnail=False)
        if not file:
            return

        # download ZIP file to memory
        ark_url, ark_name = get_kolibri_url_for(file["id"], file["ext"])
        ark_data = io.BytesIO()
        stream_file(url=ark_url, byte_stream=ark_data)

        # loop over zip members and create an entry for each
        zip_ark = zipfile.ZipFile(ark_data)
        for ark_member in zip_ark.namelist():
            with self.creator_lock:
                self.creator.add_item_for(
                    path=f"{node_id}/{ark_member}",
                    content=zip_ark.open(ark_member).read(),
                )

    def run(self):
        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError(
                "Unable to connect to Optimization Cache. Check its URL.")

        s3_msg = (f"  using cache: {self.s3_storage.url.netloc} "
                  f"with bucket: {self.s3_storage.bucket_name}"
                  if self.s3_storage else "")
        logger.info(f"Starting scraper with:\n"
                    f"  channel_id: {self.channel_id}\n"
                    f"  build_dir: {self.build_dir}\n"
                    f"  output_dir: {self.output_dir}\n"
                    f"  using webm : {self.use_webm}\n"
                    f"  low_quality : {self.low_quality}\n"
                    f"{s3_msg}")

        logger.info("Download database")
        self.download_db()

        self.sanitize_inputs()
        # display basic stats
        logger.info(f"  Starting ZIM creation with:\n"
                    f"  filename: {self.fname}\n"
                    f"  title: {self.title}\n"
                    f"  description: {self.description}\n"
                    f"  creator: {self.author}\n"
                    f"  publisher: {self.publisher}\n"
                    f"  tags: {';'.join(self.tags)}")

        logger.info("Setup Zim Creator")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.creator_lock = threading.Lock()
        self.creator = Creator(
            filename=self.output_dir.joinpath(self.fname),
            main_path=self.root_id,
            favicon_path="favicon.png",
            language="eng",
            title=self.title,
            description=self.description,
            creator=self.author,
            publisher=self.publisher,
            name=self.name,
            tags=";".join(self.tags),
        ).start()

        succeeded = False
        try:
            self.add_favicon()
            self.add_custom_about_and_css()

            # add static files
            logger.info("Adding local files (assets)")
            self.add_local_files("assets",
                                 self.templates_dir.joinpath("assets"))

            # setup queue for nodes processing
            self.nodes_futures = {}  # future: node_id
            self.nodes_executor = cf.ThreadPoolExecutor(
                max_workers=self.nb_threads)

            # setup a dedicated queue for videos to convert
            self.videos_futures = {}  # future: src_fname, dst_fpath, path
            self.pending_upload = {}  # path: filepath, key, checksum
            self.videos_executor = cf.ProcessPoolExecutor(
                max_workers=self.nb_processes)

            logger.info("Starting nodes processing")
            self.populate_nodes_executor()

            # await completion of all futures (nodes and videos)
            result = cf.wait(
                self.videos_futures.keys() | self.nodes_futures.keys(),
                return_when=cf.FIRST_EXCEPTION,
            )
            self.nodes_executor.shutdown()
            # properly shutting down the executor should allow processing
            # futures' callbacks (zim addition) as the wait() function
            # only awaits future completion and doesn't include callbacks
            self.videos_executor.shutdown()

            succeeded = not result.not_done
        except KeyboardInterrupt:
            self.creator.can_finish = False
            logger.error("KeyboardInterrupt, exiting.")
        except Exception as exc:
            # request Creator not to create a ZIM file on finish
            self.creator.can_finish = False
            logger.error("Interrupting process due to error: {exc}")
            logger.exception(exc)
        finally:
            if succeeded:
                logger.info("Finishing ZIM file…")
            # we need to release libzim's resources.
            # currently, finish() does nothing but crash if can_finish=False;
            # proper handling of that case is awaiting impl. at libkiwix level
            with self.creator_lock:
                self.creator.finish()

        if not self.keep_build_dir:
            logger.info("Removing build folder")
            shutil.rmtree(self.build_dir, ignore_errors=True)

        return 0 if succeeded else 1

    def s3_credentials_ok(self):
        logger.info("testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        if not self.s3_storage.check_credentials(list_buckets=True,
                                                 bucket=True,
                                                 write=True,
                                                 read=True,
                                                 failsafe=True):
            logger.error("S3 cache connection error testing permissions.")
            logger.error(f"  Server: {self.s3_storage.url.netloc}")
            logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
            logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
            logger.error(f"  Public IP: {get_public_ip()}")
            return False
        return True

    def download_db(self):
        """download channel DB from kolibri and initialize DB

        Also sets the root_id with the DB-computed value"""
        # download database
        fpath = self.build_dir.joinpath("db.sqlite3")
        logger.debug(f"Downloading database into {fpath.name}…")
        stream_file(
            f"{STUDIO_URL}/content/databases/{self.channel_id}.sqlite3",
            fpath,
        )
        self.db = KolibriDB(fpath, self.root_id)
        self.root_id = self.db.root_id

    def sanitize_inputs(self):
        channel_meta = self.db.get_channel_metadata(self.channel_id)

        # input & metadata sanitization
        period = datetime.datetime.now().strftime("%Y-%m")
        if self.fname:
            # make sure we were given a filename and not a path
            self.fname = Path(self.fname.format(period=period))
            if Path(self.fname.name) != self.fname:
                raise ValueError(f"filename is not a filename: {self.fname}")
        else:
            self.fname = f"{self.name}_{period}.zim"

        if not self.title:
            self.title = channel_meta["name"]
        self.title = self.title.strip()

        if not self.description:
            self.description = channel_meta["description"]
        self.description = self.description.strip()

        if not self.author:
            self.author = channel_meta["author"] or "Kolibri"
        self.author = self.author.strip()

        if not self.publisher:
            self.publisher = "Openzim"
        self.publisher = self.publisher.strip()

        self.tags = list(
            set(self.tags + ["_category:other", "kolibri", "_videos:yes"]))

    def add_favicon(self):
        favicon_orig = self.build_dir / "favicon"
        # if user provided a custom favicon, retrieve that
        if self.favicon:
            handle_user_provided_file(source=self.favicon, dest=favicon_orig)
        # otherwise, get thumbnail from database
        else:
            # add channel thumbnail as favicon
            try:
                favicon_prefix, favicon_data = self.db.get_channel_metadata(
                    self.channel_id)["thumbnail"].split(";base64,", 1)
                favicon_data = base64.standard_b64decode(favicon_data)
                # favicon_mime = favicon_prefix.replace("data:", "")
                with open(favicon_orig, "wb") as fh:
                    fh.write(favicon_data)
                del favicon_data
            except Exception as exc:
                logger.warning(
                    "Unable to extract favicon from DB; using default")
                logger.exception(exc)

                # use a default favicon
                handle_user_provided_file(source=self.templates_dir /
                                          "kolibri-logo.png",
                                          dest=favicon_orig)

        # convert to PNG (might already be PNG but it's OK)
        favicon_fpath = favicon_orig.with_suffix(".png")
        convert_image(favicon_orig, favicon_fpath)

        # resize to an appropriate size (ZIM uses 48x48; we keep a 96x96 source here)
        resize_image(favicon_fpath, width=96, height=96, method="thumbnail")

        # generate favicon
        favicon_ico_path = favicon_fpath.with_suffix(".ico")
        create_favicon(src=favicon_fpath, dst=favicon_ico_path)

        self.creator.add_item_for("favicon.png", fpath=favicon_fpath)
        self.creator.add_item_for("favicon.ico", fpath=favicon_ico_path)

    def add_custom_about_and_css(self):
        channel_meta = self.db.get_channel_metadata(self.channel_id)

        if self.about:
            # if user provided a custom about page, use it
            with open(
                    handle_user_provided_file(source=self.about,
                                              in_dir=self.build_dir,
                                              nocopy=True),
                    "r",
            ) as fh:
                soup = BeautifulSoup(fh.read(), "lxml")
                title = soup.find("title").text
                content = soup.select("body > .container")
                # we're only interested in the first one
                if isinstance(content, list):
                    content = content[0]
        else:
            title = channel_meta["name"]
            content = None

        html = self.jinja2_env.get_template("about.html").render(
            title=title, content=content, **channel_meta)
        with self.creator_lock:
            self.creator.add_item_for(
                path="about",
                title=title,
                content=html,
                mimetype="text/html",
            )
        del html

        # if user provided a custom CSS file, use it
        if self.css:
            with open(
                    handle_user_provided_file(source=self.css,
                                              in_dir=self.build_dir,
                                              nocopy=True),
                    "r",
            ) as fh:
                content = fh.read()
        # otherwise, create a blank one
        else:
            content = ""

        self.creator.add_item_for("custom.css",
                                  content=content,
                                  mimetype="text/css")
        logger.debug("Added about page and custom CSS")
Ejemplo n.º 10
0
def s3_upload_file(
    src_path,
    upload_uri,
    filesize,
    private_key,  # not used
    resume=False,  # not supported
    move=False,  # not relevant
    delete=False,
    compress=False,  # not relevant
    bandwidth=None,  # not supported
    cipher=None,  # not relevant
    delete_after=None,  # nb of days to expire upload file after
):
    started_on = now()
    s3_storage = KiwixStorage(rebuild_uri(upload_uri, scheme="https").geturl())
    logger.debug(f"S3 initialized for {s3_storage.url.netloc}/{s3_storage.bucket_name}")

    key = upload_uri.path[1:]
    if upload_uri.path.endswith("/"):
        key += src_path.name
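    # e.g. (hypothetical) an upload_uri path of "/zims/" with a local file
    # "foo.zim" yields key "zims/foo.zim"; a path of "/zims/foo.zim" is used as-is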

    try:
        logger.info(f"Uploading to {key}")
        hook = FileTransferHook(filename=src_path)
        s3_storage.upload_file(fpath=str(src_path), key=key, Callback=hook)
        print("", flush=True)
    except Exception as exc:
        # as there is no resume, uploading to an existing URL results in DELETE+UPLOAD;
        # if credentials don't allow DELETE or if there is an unsatisfied
        # retention, this will raise a PermissionError
        logger.error(f"uploader failed: {exc}")
        logger.exception(exc)
        return 1
    ended_on = now()
    logger.info("uploader ran successfuly.")

    # setting autodelete
    if delete_after is not None:
        try:
            # set expiration after bucket's min retention.
            # bucket retention is 1d minimum and can be configured to a longer value.
            # if the expiration falls before the bucket's min retention, the request
            # raises 400 Bad Request on compliance
            expire_on = (
                datetime.datetime.now()
                + datetime.timedelta(days=delete_after or 1)
                # adding 1 minute to prevent a clash with the bucket's equivalent min retention
                + datetime.timedelta(seconds=60)
            )
            logger.info(f"Setting autodelete to {expire_on}")
            s3_storage.set_object_autodelete_on(key=key, on=expire_on)
        except Exception as exc:
            logger.error(f"Failed to set autodelete: {exc}")
            logger.exception(exc)

    if delete:
        remove_source_file(src_path)
    display_stats(filesize, started_on, ended_on)

    return 0
Ejemplo n.º 11
0
def main(url=None):
    if url is None:
        url = os.getenv("S3URL")
    if not url:
        print("Missing URL.")
        sys.exit(1)

    s3 = KiwixStorage(url=url)
    if not s3.check_credentials(list_buckets=True, failsafe=False):
        print("credentials not OK")
        sys.exit(1)

    print("credentials OK")

    iam = s3.get_service("iam")

    versions = iam.list_policy_versions(PolicyArn=POLICY_ARN).get(
        "Versions", [])
    print(f"We have {len(versions)} for {POLICY_ARN}")
    version_id = None
    for version in versions:
        if version["IsDefaultVersion"]:
            version_id = version["VersionId"]

    print(f"Default version is {version_id}")

    # delete all other versions
    if len(versions) == 5:
        print("Deleting all other versions…")
        for version in versions:
            if version["VersionId"] == version_id:
                continue
            print(f"Deleting version {version['VersionId']}")
            iam.delete_policy_version(PolicyArn=POLICY_ARN,
                                      VersionId=version["VersionId"])

    if not version_id:
        print("Existing policy doesn't exist. probably a mistake?")
        sys.exit(1)

    pv = (iam.get_policy_version(PolicyArn=POLICY_ARN,
                                 VersionId=version_id).get(
                                     "PolicyVersion", {}).get("Document"))
    if not pv:
        print("We don't have a policy document.")
        sys.exit(1)

    print(f"Current Policy:")
    from pprint import pprint as pp

    pp(pv)

    statement = get_statement()  # gen new statement

    try:
        stmt_index = [s["Sid"] for s in pv["Statement"]].index(STATEMENT_ID)
        pv["Statement"][stmt_index] = statement
    except ValueError:
        pv["Statement"].append(statement)

    new_policy = json.dumps(pv, indent=4)
    print(f"New Policy:\n{new_policy}")

    print("Overwriting policy…")
    iam.create_policy_version(PolicyArn=POLICY_ARN,
                              PolicyDocument=new_policy,
                              SetAsDefault=True)
Ejemplo n.º 12
0
class Youtube2Zim:
    def __init__(
        self,
        collection_type,
        youtube_id,
        api_key,
        video_format,
        low_quality,
        nb_videos_per_page,
        all_subtitles,
        autoplay,
        output_dir,
        no_zim,
        fname,
        debug,
        tmp_dir,
        keep_build_dir,
        max_concurrency,
        youtube_store,
        language,
        locale_name,
        tags,
        dateafter,
        use_any_optimized_version,
        s3_url_with_credentials,
        title=None,
        description=None,
        creator=None,
        publisher=None,
        name=None,
        profile_image=None,
        banner_image=None,
        main_color=None,
        secondary_color=None,
    ):
        # data-retrieval info
        self.collection_type = collection_type
        self.youtube_id = youtube_id
        self.api_key = api_key
        self.dateafter = dateafter

        # video-encoding info
        self.video_format = video_format
        self.low_quality = low_quality

        # options & zim params
        self.nb_videos_per_page = nb_videos_per_page
        self.all_subtitles = all_subtitles
        self.autoplay = autoplay
        self.fname = fname
        self.language = language
        self.tags = [t.strip() for t in tags.split(",")]
        self.title = title
        self.description = description
        self.creator = creator
        self.publisher = publisher
        self.name = name
        self.profile_image = profile_image
        self.banner_image = banner_image
        self.main_color = main_color
        self.secondary_color = secondary_color

        # directory setup
        self.output_dir = Path(output_dir).expanduser().resolve()
        if tmp_dir:
            tmp_dir = Path(tmp_dir).expanduser().resolve()
            tmp_dir.mkdir(parents=True, exist_ok=True)
        self.build_dir = Path(tempfile.mkdtemp(dir=tmp_dir))

        # process-related
        self.playlists = []
        self.uploads_playlist_id = None
        self.videos_ids = []
        self.main_channel_id = None  # use for branding

        # debug/devel options
        self.no_zim = no_zim
        self.debug = debug
        self.keep_build_dir = keep_build_dir
        self.max_concurrency = max_concurrency

        # update youtube credentials store
        youtube_store.update(build_dir=self.build_dir,
                             api_key=self.api_key,
                             cache_dir=self.cache_dir)

        # Optimization-cache
        self.s3_url_with_credentials = s3_url_with_credentials
        self.use_any_optimized_version = use_any_optimized_version
        self.video_quality = "low" if self.low_quality else "high"
        self.s3_storage = None

        # set and record locale for translations
        locale_name = locale_name or get_language_details(
            self.language)["iso-639-1"]
        try:
            self.locale = setlocale(ROOT_DIR, locale_name)
        except locale.Error:
            logger.error(
                f"No locale for {locale_name}. Use --locale to specify it. defaulting to en_US"
            )
            self.locale = setlocale(ROOT_DIR, "en")

    @property
    def root_dir(self):
        return ROOT_DIR

    @property
    def templates_dir(self):
        return self.root_dir.joinpath("templates")

    @property
    def assets_src_dir(self):
        return self.templates_dir.joinpath("assets")

    @property
    def assets_dir(self):
        return self.build_dir.joinpath("assets")

    @property
    def channels_dir(self):
        return self.build_dir.joinpath("channels")

    @property
    def cache_dir(self):
        return self.build_dir.joinpath("cache")

    @property
    def videos_dir(self):
        return self.build_dir.joinpath("videos")

    @property
    def profile_path(self):
        return self.build_dir.joinpath("profile.jpg")

    @property
    def banner_path(self):
        return self.build_dir.joinpath("banner.jpg")

    @property
    def is_user(self):
        return self.collection_type == USER

    @property
    def is_channel(self):
        return self.collection_type == CHANNEL

    @property
    def is_playlist(self):
        return self.collection_type == PLAYLIST

    @property
    def is_single_channel(self):
        if self.is_channel or self.is_user:
            return True
        return len(list(set([pl.creator_id for pl in self.playlists]))) == 1

    @property
    def sorted_playlists(self):
        """ sorted list of playlists (by title) but with Uploads one at first if any """
        if len(self.playlists) < 2:
            return self.playlists

        sorted_playlists = sorted(self.playlists, key=lambda x: x.title)
        index = 0
        # make sure our special Uploads playlist is first
        if self.uploads_playlist_id:
            try:
                index = [
                    index for index, p in enumerate(sorted_playlists)
                    if p.playlist_id == self.uploads_playlist_id
                ][-1]
            except Exception:
                index = 0
        return ([sorted_playlists[index]] + sorted_playlists[0:index] +
                sorted_playlists[index + 1:])

    def run(self):
        """ execute the scraper step by step """

        self.validate_id()

        # validate dateafter input
        self.validate_dateafter_input()

        logger.info(
            f"starting youtube scraper for {self.collection_type}#{self.youtube_id}"
        )
        logger.info("preparing build folder at {}".format(
            self.build_dir.resolve()))
        self.prepare_build_folder()

        logger.info("testing Youtube credentials")
        if not credentials_ok():
            raise ValueError(
                "Unable to connect to Youtube API v3. check `API_KEY`.")

        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError(
                "Unable to connect to Optimization Cache. Check its URL.")

        # fail early if supplied branding files are missing
        self.check_branding_values()

        logger.info("compute playlists list to retrieve")
        self.extract_playlists()

        logger.info(".. {} playlists:\n   {}".format(
            len(self.playlists),
            "\n   ".join([p.playlist_id for p in self.playlists]),
        ))

        logger.info("compute list of videos")
        self.extract_videos_list()

        nb_videos_msg = f".. {len(self.videos_ids)} videos"
        if self.dateafter.start.year != 1:
            nb_videos_msg += (
                f" in date range: {self.dateafter.start} - {datetime.date.today()}"
            )
        logger.info(f"{nb_videos_msg}.")

        # download videos (and recompress)
        logger.info(
            f"downloading all videos, subtitles and thumbnails (concurrency={self.max_concurrency})"
        )
        logger.info(f"  format: {self.video_format}")
        logger.info(f"  quality: {self.video_quality}")
        logger.info(f"  generated-subtitles: {self.all_subtitles}")
        if self.s3_storage:
            logger.info(
                f"  using cache: {self.s3_storage.url.netloc} with bucket: {self.s3_storage.bucket_name}"
            )
        succeeded, failed = self.download_video_files(
            max_concurrency=self.max_concurrency)
        if failed:
            logger.error(
                f"{len(failed)} video(s) failed to download: {failed}")
            if len(failed) >= len(succeeded):
                logger.critical("More than half of videos failed. exiting")
                raise IOError("Too much videos failed to download")

        logger.info("retrieve channel-info for all videos (author details)")
        get_videos_authors_info(succeeded)

        logger.info("download all author's profile pictures")
        self.download_authors_branding()

        logger.info("update general metadata")
        self.update_metadata()

        logger.info("creating HTML files")
        self.make_html_files(succeeded)

        # make zim file
        os.makedirs(self.output_dir, exist_ok=True)
        if not self.no_zim:
            period = datetime.datetime.now().strftime("%Y-%m")
            self.fname = (self.fname.format(
                period=period) if self.fname else f"{self.name}_{period}.zim")
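            # e.g. (hypothetical) a name of "my_channel" run in 2021-03 defaults
            # to "my_channel_2021-03.zim" when no custom fname was supplied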
            logger.info("building ZIM file")
            make_zim_file(
                build_dir=self.build_dir,
                fpath=self.output_dir / self.fname,
                name=self.name,
                main_page="home.html",
                favicon="favicon.jpg",
                title=self.title,
                description=self.description,
                language=self.language,
                creator=self.creator,
                publisher="Kiwix",
                tags=self.tags,
                scraper=SCRAPER,
            )

            if not self.keep_build_dir:
                logger.info("removing temp folder")
                shutil.rmtree(self.build_dir, ignore_errors=True)

        logger.info("all done!")

    def s3_credentials_ok(self):
        logger.info("testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        if not self.s3_storage.check_credentials(list_buckets=True,
                                                 bucket=True,
                                                 write=True,
                                                 read=True,
                                                 failsafe=True):
            logger.error("S3 cache connection error testing permissions.")
            logger.error(f"  Server: {self.s3_storage.url.netloc}")
            logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
            logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
            logger.error(f"  Public IP: {get_public_ip()}")
            return False
        return True

    def validate_dateafter_input(self):
        try:
            self.dateafter = youtube_dl.DateRange(self.dateafter)
        except Exception as exc:
            logger.error(
                f"Invalid dateafter input. Valid dateafter format: "
                f"YYYYMMDD or (now|today)[+-][0-9](day|week|month|year)(s).")
            raise ValueError(f"Invalid dateafter input: {exc}")

    def validate_id(self):
        # space not allowed in youtube-ID
        self.youtube_id = self.youtube_id.replace(" ", "")
        if self.collection_type == "channel" and len(self.youtube_id) > 24:
            raise ValueError("Invalid ChannelId")
        if "," in self.youtube_id and self.collection_type != "playlist":
            raise ValueError("Invalid YoutubeId")

    def prepare_build_folder(self):
        """ prepare build folder before we start downloading data """

        # copy assets
        shutil.copytree(self.assets_src_dir, self.assets_dir)

        fix_source_dir(self.assets_dir, "assets")

        # cache folder to store youtube-api results
        self.cache_dir.mkdir(exist_ok=True)

        # make videos placeholder
        self.videos_dir.mkdir(exist_ok=True)

        # make channels placeholder (profile files)
        self.channels_dir.mkdir(exist_ok=True)

    def check_branding_values(self):
        """checks that user-supplied images and colors are valid (so to fail early)

        Images are checked for existence or downloaded then resized
        Colors are check for validity"""

        # skip this step if none of related values were supplied
        if not sum([
                bool(x) for x in (
                    self.profile_image,
                    self.banner_image,
                    self.main_color,
                    self.secondary_color,
                )
        ]):
            return
        logger.info("checking your branding files and values")
        if self.profile_image:
            if self.profile_image.startswith("http"):
                stream_file(self.profile_image, self.profile_path)
            else:
                if not self.profile_image.exists():
                    raise IOError(
                        f"--profile image could not be found: {self.profile_image}"
                    )
                shutil.move(self.profile_image, self.profile_path)
            resize_image(self.profile_path,
                         width=100,
                         height=100,
                         method="thumbnail")
        if self.banner_image:
            if self.banner_image.startswith("http"):
                stream_file(self.banner_image, self.banner_path)
            else:
                if not self.banner_image.exists():
                    raise IOError(
                        f"--banner image could not be found: {self.banner_image}"
                    )
                shutil.move(self.banner_image, self.banner_path)
            resize_image(self.banner_path,
                         width=1060,
                         height=175,
                         method="thumbnail")

        if self.main_color and not is_hex_color(self.main_color):
            raise ValueError(
                f"--main-color is not a valid hex color: {self.main_color}")

        if self.secondary_color and not is_hex_color(self.secondary_color):
            raise ValueError(
                f"--secondary_color-color is not a valid hex color: {self.secondary_color}"
            )

    def extract_playlists(self):
        """prepare a list of Playlist from user request

        USER: we fetch the hidden channel associated with it
        CHANNEL (and USER): we grab all playlists + `uploads` playlist
        PLAYLIST: we retrieve from the playlist Id(s)"""

        (
            self.playlists,
            self.main_channel_id,
            self.uploads_playlist_id,
        ) = extract_playlists_details_from(self.collection_type,
                                           self.youtube_id)

    def extract_videos_list(self):
        all_videos = load_json(self.cache_dir, "videos")
        if all_videos is None:
            all_videos = {}

            # we only keep video_ids that we'll use later on; per-playlist JSON is stored
            for playlist in self.playlists:
                videos_json = get_videos_json(playlist.playlist_id)
                # filter in videos within date range and filter away deleted videos
                skip_outofrange = functools.partial(skip_outofrange_videos,
                                                    self.dateafter)
                filter_videos = filter(skip_outofrange, videos_json)
                filter_videos = filter(skip_deleted_videos, filter_videos)
                all_videos.update(
                    {v["contentDetails"]["videoId"]: v
                     for v in filter_videos})
            save_json(self.cache_dir, "videos", all_videos)
        self.videos_ids = [*all_videos.keys()
                           ]  # unpacking so it's subscriptable

    def download_video_files(self, max_concurrency):

        audext, vidext = {
            "webm": ("webm", "webm"),
            "mp4": ("m4a", "mp4")
        }[self.video_format]

        # prepare options which are shared with every downloader
        options = {
            "cachedir": self.videos_dir,
            "writethumbnail": True,
            "write_all_thumbnails": False,
            "writesubtitles": True,
            "allsubtitles": True,
            "subtitlesformat": "vtt",
            "keepvideo": False,
            "ignoreerrors": False,
            "retries": 20,
            "fragment-retries": 50,
            "skip-unavailable-fragments": True,
            # "external_downloader": "aria2c",
            # "external_downloader_args": ["--max-tries=20", "--retry-wait=30"],
            "outtmpl": str(self.videos_dir.joinpath("%(id)s",
                                                    "video.%(ext)s")),
            "preferredcodec": self.video_format,
            "format":
            f"best[ext={vidext}]/bestvideo[ext={vidext}]+bestaudio[ext={audext}]/best",
            "y2z_videos_dir": self.videos_dir,
        }
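        # e.g. with video_format "mp4" the format selector above expands to
        # "best[ext=mp4]/bestvideo[ext=mp4]+bestaudio[ext=m4a]/best"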
        if self.all_subtitles:
            options.update({"writeautomaticsub": True})

        # find number of actual parallel workers
        nb_videos = len(self.videos_ids)
        concurrency = nb_videos if nb_videos < max_concurrency else max_concurrency

        # short-circuit concurrency if we have only one thread (can help debugging)
        if concurrency <= 1:
            return self.download_video_files_batch(options, self.videos_ids)

        # prepare out videos_ids batches
        def get_slot():
            n = 0
            while True:
                yield n
                n += 1
                if n >= concurrency:
                    n = 0

        batches = [[] for _ in range(0, concurrency)]
        slot = get_slot()
        for video_id in self.videos_ids:
            batches[next(slot)].append(video_id)
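
        # e.g. (hypothetical) 5 videos over 2 workers are spread round-robin:
        # batches == [[v1, v3, v5], [v2, v4]]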

        overall_succeeded = []
        overall_failed = []
        # execute the batches concurrently
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=concurrency) as executor:
            fs = [
                executor.submit(self.download_video_files_batch, options,
                                videos_ids) for videos_ids in batches
            ]
            done, not_done = concurrent.futures.wait(
                fs, return_when=concurrent.futures.ALL_COMPLETED)

            # we have some `not_done` batches, indicating errors within
            if not_done:
                logger.critical(
                    "Not all video-processing batches completed. Cancelling…")
                for future in not_done:
                    exc = future.exception()
                    if exc:
                        logger.exception(exc)
                        raise exc

            # retrieve our list of successful/failed video_ids
            for future in done:
                succeeded, failed = future.result()
                overall_succeeded += succeeded
                overall_failed += failed

        # remove left-over files for failed downloads
        logger.debug(
            f"removing left-over files of {len(overall_failed)} failed videos")
        for video_id in overall_failed:
            shutil.rmtree(self.videos_dir.joinpath(video_id),
                          ignore_errors=True)

        return overall_succeeded, overall_failed

    def download_from_cache(self, key, video_path, encoder_version):
        """ whether it successfully downloaded from cache """
        if self.use_any_optimized_version:
            if not self.s3_storage.has_object(key,
                                              self.s3_storage.bucket_name):
                return False
        else:
            if not self.s3_storage.has_object_matching_meta(
                    key, tag="encoder_version", value=f"v{encoder_version}"):
                return False
        video_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.s3_storage.download_file(key, video_path)
        except Exception as exc:
            logger.error(f"{key} failed to download from cache: {exc}")
            return False
        logger.info(f"downloaded {video_path} from cache at {key}")
        return True

    def upload_to_cache(self, key, video_path, encoder_version):
        """ whether it successfully uploaded to cache """
        try:
            self.s3_storage.upload_file(
                video_path,
                key,
                meta={"encoder_version": f"v{encoder_version}"})
        except Exception as exc:
            logger.error(f"{key} failed to upload to cache: {exc}")
            return False
        logger.info(f"uploaded {video_path} to cache at {key}")
        return True

    def download_video(self, video_id, options):
        """ download the video from cache/youtube and return True if successful """

        preset = {"mp4": VideoMp4Low}.get(self.video_format, VideoWebmLow)()
        options_copy = options.copy()
        video_location = options_copy["y2z_videos_dir"].joinpath(video_id)
        video_path = video_location.joinpath(f"video.{self.video_format}")

        if self.s3_storage:
            s3_key = f"{self.video_format}/{self.video_quality}/{video_id}"
            logger.debug(
                f"Attempting to download video file for {video_id} from cache..."
            )
            if self.download_from_cache(s3_key, video_path, preset.VERSION):
                return True

        try:
            # skip downloading the thumbnails
            options_copy.update({
                "writethumbnail": False,
                "writesubtitles": False,
                "allsubtitles": False,
                "writeautomaticsub": False,
            })
            with youtube_dl.YoutubeDL(options_copy) as ydl:
                ydl.download([video_id])
            post_process_video(
                video_location,
                video_id,
                preset,
                self.video_format,
                self.low_quality,
            )
        except (
                youtube_dl.utils.DownloadError,
                FileNotFoundError,
                subprocess.CalledProcessError,
        ) as exc:
            logger.error(f"Video file for {video_id} could not be downloaded")
            logger.debug(exc)
            return False
        else:  # upload to cache only if everything went well
            if self.s3_storage:
                logger.debug(
                    f"Uploading video file for {video_id} to cache ...")
                self.upload_to_cache(s3_key, video_path, preset.VERSION)
            return True

    def download_thumbnail(self, video_id, options):
        """ download the thumbnail from cache/youtube and return True if successful """

        preset = WebpHigh()
        options_copy = options.copy()
        video_location = options_copy["y2z_videos_dir"].joinpath(video_id)
        thumbnail_path = video_location.joinpath("video.webp")

        if self.s3_storage:
            s3_key = f"thumbnails/high/{video_id}"
            logger.debug(
                f"Attempting to download thumbnail for {video_id} from cache..."
            )
            if self.download_from_cache(s3_key, thumbnail_path,
                                        preset.VERSION):
                return True

        try:
            # skip downloading the video
            options_copy.update({
                "skip_download": True,
                "writesubtitles": False,
                "allsubtitles": False,
                "writeautomaticsub": False,
            })
            with youtube_dl.YoutubeDL(options_copy) as ydl:
                ydl.download([video_id])
            process_thumbnail(thumbnail_path, preset)
        except (
                youtube_dl.utils.DownloadError,
                FileNotFoundError,
                subprocess.CalledProcessError,
        ) as exc:
            logger.error(f"Thumbnail for {video_id} could not be downloaded")
            logger.debug(exc)
            return False
        else:  # upload to cache only if everything went well
            if self.s3_storage:
                logger.debug(
                    f"Uploading thumbnail for {video_id} to cache ...")
                self.upload_to_cache(s3_key, thumbnail_path, preset.VERSION)
            return True

    def download_subtitles(self, video_id, options):
        """ download subtitles for a video """

        options_copy = options.copy()
        options_copy.update({"skip_download": True, "writethumbnail": False})
        try:
            with youtube_dl.YoutubeDL(options_copy) as ydl:
                ydl.download([video_id])
        except Exception:
            logger.error(f"Could not download subtitles for {video_id}")

    def download_video_files_batch(self, options, videos_ids):
        """ download video file and thumbnail for all videos in batch and return succeeded and failed video ids """

        succeeded = []
        failed = []
        for video_id in videos_ids:
            if self.download_video(video_id,
                                   options) and self.download_thumbnail(
                                       video_id, options):
                self.download_subtitles(video_id, options)
                succeeded.append(video_id)
            else:
                failed.append(video_id)
        return succeeded, failed

    def download_authors_branding(self):
        videos_channels_json = load_json(self.cache_dir, "videos_channels")
        uniq_channel_ids = list(
            set([chan["channelId"] for chan in videos_channels_json.values()]))
        for channel_id in uniq_channel_ids:
            save_channel_branding(self.channels_dir,
                                  channel_id,
                                  save_banner=False)
            self.copy_default_banner(channel_id)

    def copy_default_banner(self, channel_id):
        banner_path = self.channels_dir / channel_id / "banner.jpg"
        if not banner_path.exists():
            shutil.copy(
                self.templates_dir / "assets" / "banner.jpg",
                self.channels_dir / channel_id / "banner.jpg",
            )

    def update_metadata(self):
        # we use title, description, profile and banner of channel/user
        # or channel of first playlist
        try:
            main_channel_json = get_channel_json(self.main_channel_id)
        except KeyError:
            main_channel_json = {
                "snippet": {
                    "title": "Unknown",
                    "description": ""
                }
            }
        else:
            save_channel_branding(self.channels_dir,
                                  self.main_channel_id,
                                  save_banner=True)
        self.copy_default_banner(self.main_channel_id)

        # if a single playlist was requested, use it for names;
        # otherwise, use main_channel's details.
        auto_title = (self.playlists[0].title
                      if self.is_playlist and len(self.playlists) == 1 else
                      main_channel_json["snippet"]["title"].strip())
        auto_description = (clean_text(self.playlists[0].description)
                            if self.is_playlist and len(self.playlists) == 1
                            else clean_text(
                                main_channel_json["snippet"]["description"]))
        self.title = self.title or auto_title or "-"
        self.description = self.description or auto_description or "-"

        if self.creator is None:
            if self.is_single_channel:
                self.creator = _("Youtube Channel “{title}”").format(
                    title=main_channel_json["snippet"]["title"])
            else:
                self.creator = _("Youtube Channels")
        self.publisher = self.publisher or "Kiwix"

        self.tags = self.tags or ["youtube"]
        if "_videos:yes" not in self.tags:
            self.tags.append("_videos:yes")

        # copy our main_channel branding into /(profile|banner).jpg if not supplied
        if not self.profile_path.exists():
            shutil.copy(
                self.channels_dir.joinpath(self.main_channel_id,
                                           "profile.jpg"),
                self.profile_path,
            )
        if not self.banner_path.exists():
            shutil.copy(
                self.channels_dir.joinpath(self.main_channel_id, "banner.jpg"),
                self.banner_path,
            )

        # set colors from images if not supplied
        if self.main_color is None or self.secondary_color is None:
            profile_main, profile_secondary = get_colors(self.profile_path)
        self.main_color = self.main_color or profile_main
        self.secondary_color = self.secondary_color or profile_secondary

        resize_image(
            self.profile_path,
            width=48,
            height=48,
            method="thumbnail",
            dst=self.build_dir.joinpath("favicon.jpg"),
        )

    def make_html_files(self, actual_videos_ids):
        """make up HTML structure to read the content

        /home.html                                  Homepage

        for each video:
            - <slug-title>.html                     HTML article
            - videos/<videoId>/video.<ext>          video file
            - videos/<videoId>/video.<lang>.vtt     subtitle(s)
            - videos/<videoId>/video.webp           thumbnail
        """
        def remove_unused_videos(videos):
            video_ids = [
                video["contentDetails"]["videoId"] for video in videos
            ]
            for path in self.videos_dir.iterdir():
                if path.is_dir() and path.name not in video_ids:
                    logger.debug(f"Removing unused video {path.name}")
                    shutil.rmtree(path, ignore_errors=True)

        def is_present(video):
            """ whether this video has actually been succeffuly downloaded """
            return video["contentDetails"]["videoId"] in actual_videos_ids

        def video_has_channel(videos_channels, video):
            return video["contentDetails"]["videoId"] in videos_channels

        def get_subtitles(video_id):
            video_dir = self.videos_dir.joinpath(video_id)
            languages = [
                x.stem.split(".")[1] for x in video_dir.iterdir()
                if x.is_file() and x.name.endswith(".vtt")
            ]

            def to_jinja_subtitle(lang):
                try:
                    subtitle = get_language_details(
                        YOUTUBE_LANG_MAP.get(lang, lang))
                except Exception:
                    logger.error(f"Failed to get language details for {lang}")
                    raise
                return {
                    "code": lang,
                    # Youtube.com uses `English - code` format.
                    # Note: videojs displays it lowercased anyway
                    "name":
                    f"{subtitle['english'].title()} - {subtitle['query']}",
                }

            # Youtube.com sorts subtitles by English name
            return sorted(map(to_jinja_subtitle, languages),
                          key=lambda x: x["name"])
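        # Illustrative return value of get_subtitles(), assuming `video.en.vtt`
        # and `video.fr.vtt` exist for this video:
        #   [{"code": "en", "name": "English - en"},
        #    {"code": "fr", "name": "French - fr"}]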

        env = jinja2.Environment(loader=jinja2.FileSystemLoader(
            str(self.templates_dir)),
                                 autoescape=True)

        videos = load_json(self.cache_dir, "videos").values()
        # filter videos so we only include the ones we could retrieve
        videos = list(filter(is_present, videos))
        videos_channels = load_json(self.cache_dir, "videos_channels")
        has_channel = functools.partial(video_has_channel, videos_channels)
        # filter videos to exclude those for which we have no channel (#76)
        videos = list(filter(has_channel, videos))
        for video in videos:
            video_id = video["contentDetails"]["videoId"]
            title = video["snippet"]["title"]
            slug = get_slug(title)
            description = video["snippet"]["description"]
            publication_date = dt_parser.parse(
                video["contentDetails"]["videoPublishedAt"])
            author = videos_channels[video_id]
            subtitles = get_subtitles(video_id)
            video_url = f"https://www.youtube.com/watch?v={video_id}"

            html = env.get_template("article.html").render(
                video_id=video_id,
                video_format=self.video_format,
                author=author,
                title=title,
                description=description,
                date=format_date(publication_date,
                                 format="medium",
                                 locale=self.locale),
                subtitles=subtitles,
                url=video_url,
                channel_id=video["snippet"]["channelId"],
                color=self.main_color,
                background_color=self.secondary_color,
                autoplay=self.autoplay,
            )
            with open(self.build_dir.joinpath(f"{slug}.html"),
                      "w",
                      encoding="utf-8") as fp:
                fp.write(html)

        # build homepage
        html = env.get_template("home.html").render(
            playlists=self.playlists,
            video_format=self.video_format,
            title=self.title,
            description=self.description,
            color=self.main_color,
            background_color=self.secondary_color,
            page_label=_("Page {current}/{total}"),
            back_label=_("Back to top"),
        )
        with open(self.build_dir.joinpath("home.html"), "w",
                  encoding="utf-8") as fp:
            fp.write(html)

        # rewrite app.js including `format`
        with open(self.assets_dir.joinpath("app.js"), "w",
                  encoding="utf-8") as fp:
            fp.write(
                env.get_template("assets/app.js").render(
                    video_format=self.video_format))

        # rewrite db.js including `pagination`
        with open(self.assets_dir.joinpath("db.js"), "w",
                  encoding="utf-8") as fp:
            fp.write(
                env.get_template("assets/db.js").render(
                    NB_VIDEOS_PER_PAGE=self.nb_videos_per_page))

        # write list of videos in data.js
        def to_data_js(video):
            return {
                "id": video["contentDetails"]["videoId"],
                "title": video["snippet"]["title"],
                "slug": get_slug(video["snippet"]["title"]),
                "description": video["snippet"]["description"],
                "subtitles": get_subtitles(video["contentDetails"]["videoId"]),
                "thumbnail": str(
                    Path("videos").joinpath(
                        video["contentDetails"]["videoId"], "video.webp"
                    )
                ),
            }

        with open(self.assets_dir.joinpath("data.js"), "w",
                  encoding="utf-8") as fp:
            # write all playlists as they are
            for playlist in self.playlists:
                # retrieve the list of videos for this playlist
                playlist_videos = load_json(
                    self.cache_dir, f"playlist_{playlist.playlist_id}_videos")
                # filtering out missing ones (deleted or not downloaded)
                playlist_videos = list(
                    filter(skip_deleted_videos, playlist_videos))
                playlist_videos = list(filter(is_present, playlist_videos))
                playlist_videos = list(filter(has_channel, playlist_videos))
                # sorting them by their position in the playlist
                playlist_videos.sort(key=lambda v: v["snippet"]["position"])

                fp.write("var json_{slug} = {json_str};\n".format(
                    slug=playlist.slug,
                    json_str=json.dumps(list(map(to_data_js, playlist_videos)),
                                        indent=4),
                ))
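        # Illustrative shape of the resulting data.js (placeholder slug and values):
        #   var json_uploads = [
        #       {"id": "abc123", "title": "Some title", "slug": "some-title",
        #        "description": "Some description", "subtitles": [],
        #        "thumbnail": "videos/abc123/video.webp"}
        #   ];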

        # write a metadata.json file with some content-related data
        with open(self.build_dir.joinpath("metadata.json"),
                  "w",
                  encoding="utf-8") as fp:
            json.dump({"video_format": self.video_format}, fp, indent=4)

        # clean videos left out in videos directory
        remove_unused_videos(videos)
Ejemplo n.º 13
0
    def upload_image_s3(self):
        self.logger.info("Starting S3 upload")

        # add credentials to URL
        url = urllib.parse.urlparse(self.task["upload_uri"])
        qs = urllib.parse.parse_qs(url.query)
        qs["keyId"] = Setting.s3_access_key
        qs["secretAccessKey"] = Setting.s3_secret_key

        # setup upload logging
        uploader_log = io.StringIO()
        uploader_logger = logging.getLogger("uploader_log")
        uploader_logger.propagate = True
        uploader_logger.setLevel(logging.DEBUG)
        uploader_logger.addHandler(logging.StreamHandler(stream=uploader_log))

        # init and test storage
        uploader_logger.info("initializing S3")
        s3_storage = KiwixStorage(
            urllib.parse.SplitResult(
                "https",
                url.netloc,
                url.path,
                urllib.parse.urlencode(qs, doseq=True),
                url.fragment,
            ).geturl())
        uploader_logger.debug(
            f"S3 initialized for {s3_storage.url.netloc}/{s3_storage.bucket_name}"
        )

        # torrent
        dl_url = urllib.parse.urlparse(self.task["download_uri"])
        upload_torrent = "torrent" in dl_url.scheme
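        # By (assumed) convention, a download_uri scheme such as `https+torrent`
        # requests a companion .torrent file; the `+torrent` suffix is stripped
        # below so the plain HTTPS URL can serve as the torrent's webseed.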

        if upload_torrent:
            parts = list(urllib.parse.urlsplit(dl_url.geturl()))
            parts[0] = parts[0].replace("+torrent", "")
            dl_url = urllib.parse.urlparse(urllib.parse.urlunsplit(parts))

            uploader_logger.info(
                f"Creating torrent file for {self.img_path.name}")
            torrent_path = self.img_path.with_suffix(
                f"{self.img_path.suffix}.torrent")
            download_url = f"{dl_url.geturl()}/{self.img_path.name}"
            torrent = torf.Torrent(
                path=self.img_path,
                trackers=[
                    "https://opentracker.xyz:443/announce",
                    "http://torrent.nwps.ws:80/announce",
                    "udp://tracker.open-internet.nl:6969/announce",
                    "udp://tracker.coppersurfer.tk:6969/announce",
                    "udp://tracker.openbittorrent.com:80/announce",
                ],
                webseeds=[download_url],
            )
            torrent.generate()
            torrent.write(torrent_path)
            uploader_logger.info(f".. created {torrent_path.name}")

            uploader_logger.info(f"Uploading {torrent_path.name}")
            s3_storage.upload_file(fpath=str(torrent_path),
                                   key=torrent_path.name)
            uploader_logger.info(".. uploaded")
            torrent_path.unlink()

        # upload
        uploader_logger.info(f"Uploading to {self.img_path.name}")
        try:
            hook = ImageTransferHook(output=uploader_log, fpath=self.img_path)
            s3_storage.upload_file(fpath=str(self.img_path),
                                   key=self.img_path.name,
                                   Callback=hook)
            uploaded = True
        except Exception as exc:
            uploaded = False
            uploader_logger.error(f"uploader failed: {exc}")
            uploader_logger.exception(exc)
        else:
            uploader_logger.info("uploader ran successfuly.")

        # setting autodelete
        try:
            # make sure autodelete is above bucket's min retention (should be 1d)
            expire_on = datetime.datetime.now() + datetime.timedelta(
                days=max([2, self.task["media_duration"]]))
            uploader_logger.info(f"Setting autodelete to {expire_on}")
            s3_storage.set_object_autodelete_on(key=self.img_path.name,
                                                on=expire_on)
        except Exception as exc:
            uploader_logger.error("Failed to set autodelete")
            uploader_logger.exception(exc)

        if upload_torrent:
            try:
                uploader_logger.info(
                    f"Setting torrent autodelete to {expire_on}")
                s3_storage.set_object_autodelete_on(key=torrent_path.name,
                                                    on=expire_on)
            except Exception as exc:
                uploader_logger.error("Failed to set autodelete on torrent")
                uploader_logger.exception(exc)

        self.logger.info("collecting uploader log")
        try:
            self.logs["uploader_log"] = uploader_log.getvalue()
            uploader_log.close()
        except Exception as exc:
            self.logger.error(f"Failed to collect logs: {exc}")

        # remove image
        try:
            self.logger.info("removing image file: {}".format(
                self.img_path.name))
            self.img_path.unlink()
        except Exception as exp:
            self.logger.error("Unable to remove image file: {}".format(exp))
            self.logger.exception(exp)

        if not uploaded:
            raise subprocess.SubprocessError("S3 upload failed")
Ejemplo n.º 14
0
    def download_image_s3(self):
        self.logger.info("Starting s3 image download")

        # add credentials to URL
        url = urllib.parse.urlparse(self.task["download_uri"])
        qs = urllib.parse.parse_qs(url.query)
        qs["keyId"] = Setting.s3_access_key
        qs["secretAccessKey"] = Setting.s3_secret_key

        # setup download logging
        downloader_log = io.StringIO()
        downloader_logger = logging.getLogger("downloader_log")
        downloader_logger.propagate = True
        downloader_logger.setLevel(logging.DEBUG)
        downloader_logger.addHandler(logging.StreamHandler(stream=downloader_log))

        # init and test storage
        downloader_logger.info("initializing S3")
        s3_storage = KiwixStorage(
            urllib.parse.SplitResult(
                "https",
                url.netloc,
                url.path,
                urllib.parse.urlencode(qs, doseq=True),
                url.fragment,
            ).geturl()
        )
        downloader_logger.debug(
            f"S3 initialized for {s3_storage.url.netloc}/{s3_storage.bucket_name}"
        )

        # download
        downloader_logger.info(f"Downloading from {self.img_path.name}")
        try:
            hook = ImageTransferHook(
                output=downloader_log,
                size=s3_storage.get_object_stat(key=self.img_path.name).size,
                name=self.img_path.name,
            )
            s3_storage.download_file(
                key=self.img_path.name, fpath=str(self.img_path), Callback=hook
            )
            downloaded = True
        except Exception as exc:
            downloaded = False
            downloader_logger.error(f"downloader failed: {exc}")
            downloader_logger.exception(exc)
        else:
            downloader_logger.info("downloader ran successfuly.")

        if downloaded:
            # image downloaded, mark for autodeletion
            try:
                autodelete_on = datetime.datetime.now() + datetime.timedelta(days=1)
                downloader_logger.info(f"Setting autodelete to now ({autodelete_on})")
                s3_storage.set_object_autodelete_on(
                    key=self.img_path.name, on=autodelete_on
                )
            except Exception as exc:
                downloader_logger.error(
                    "Failed to set autodelete (normal if before bucket retention)"
                )
                downloader_logger.exception(exc)

        self.logger.info("collecting downloader log")
        try:
            self.logs["downloader_log"] = downloader_log.getvalue()
            downloader_log.close()
        except Exception as exc:
            self.logger.error(f"Failed to collect logs: {exc}")

        if not downloaded:
            raise subprocess.SubprocessError("S3 download failed")