Python KiwixStorage Examples

Programming Language: Python

Namespace/Package Name: kiwixstorage

Class/Type: KiwixStorage

Examples at hotexamples.com: 14

Python KiwixStorage - 14 examples found. These are the top rated real world Python examples of kiwixstorage.KiwixStorage extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

KiwixStorage(11)

check_credentials(7)

upload_file(5)

has_object(4)

download_file(3)

set_object_autodelete_on(3)

has_object_matching_meta(2)

download_matching_fileobj(1)

get_object_stat(1)

get_service(1)

upload_fileobj(1)

Example #1

Show file

def storage_for_key(key):

    # init and test storage
    s3_storage = KiwixStorage(get_url_for())

    for wh in ("download", "warehouse"):
        bucket_name = f"org-kiwix-hotspot-cardshop-{wh}"
        print(f"testing {bucket_name}")
        if s3_storage.has_object(key, bucket_name=bucket_name):
            return KiwixStorage(get_url_for(bucket_name))

    raise ValueError(f"Couldn't find a bucket with key `{key}`")

Example #2

Show file

def test_url_access(url):
    s3 = KiwixStorage(url)
    print(f"testing: {s3.url.geturl()}")

    for test in ("list_buckets", "bucket", "write", "write_and_read"):
        params = {test: True}
        if test == "write_and_read":
            params = {"write": True, "read": True}
        result = s3.check_credentials(failsafe=True, **params)
        print(f"can {test}: {result}")
        if test == "list_buckets":
            print(f"{s3.bucket_names=}")

Example #3

Show file

 def s3_credentials_ok(self):
     logger.info("Testing S3 Optimization Cache credentials")
     self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
     if not self.s3_storage.check_credentials(
         list_buckets=True, bucket=True, write=True, read=True, failsafe=True
     ):
         logger.error("S3 cache connection error testing permissions.")
         logger.error(f"  Server: {self.s3_storage.url.netloc}")
         logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
         logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
         logger.error(f"  Public IP: {get_public_ip()}")
         return False
     return True

Example #4

Show file

def setup_s3_and_check_credentials(s3_url_with_credentials):
    logger.info("testing S3 Optimization Cache credentials")
    s3_storage = KiwixStorage(s3_url_with_credentials)
    if not s3_storage.check_credentials(
            list_buckets=True, bucket=True, write=True, read=True,
            failsafe=True):
        logger.error("S3 cache connection error testing permissions.")
        logger.error(f"  Server: {s3_storage.url.netloc}")
        logger.error(f"  Bucket: {s3_storage.bucket_name}")
        logger.error(f"  Key ID: {s3_storage.params.get('keyid')}")
        logger.error(f"  Public IP: {get_public_ip()}")
        raise ValueError(
            "Unable to connect to Optimization Cache. Check its URL.")
    return s3_storage

Example #5

Show file

File: upload.py Project: kiwix/python-storagelib

def do_upload_file(url: str, fpath: pathlib.Path, key: str = None):
    if not fpath.exists():
        raise IOError(f"{fpath} missing.")
    fsize = fpath.stat().st_size
    if not key:
        key = fpath.name

    s3 = KiwixStorage(url)
    dest = f"s3://{s3.url.netloc}/{s3.bucket_name}"

    if s3.has_object(key):
        raise ValueError(f"Key `{key}` already exists at {dest}. Specify another one.")
    print(f"Uploading {fpath.name} ({format_size(fsize)}) to {dest}/{key}")

    progress = CustomProgressBar(fsize).callback if progressbar else True
    s3.upload_file(fpath=fpath, key=key, progress=progress)

Example #6

Show file

    def get_list_of_all(self, mode):
        """returns a list of topics or playlists"""
        # get all topics
        topics_list = json.loads(
            download_link(
                "https://www.ted.com/topics/combo?models=Talks").text)
        if mode == "topic":
            return topics_list

        # mode is playlist
        s3_url_with_credentials = None
        s3_arg = "--optimization-cache"
        for index, arg in enumerate(self.extra_args):
            if arg.startswith(s3_arg):
                s3_url_with_credentials = (arg[len(s3_arg) + 1:] if "=" in arg
                                           else self.extra_args[index + 1])
                break

        if s3_url_with_credentials:
            s3_storage = KiwixStorage(s3_url_with_credentials)
            if not s3_storage.check_credentials(list_buckets=True,
                                                bucket=True,
                                                write=True,
                                                read=True,
                                                failsafe=True):
                logger.error(
                    "S3 credential check failed. Continuing without S3")
                return self.download_playlists_list_from_site(topics_list)

            key = "playlists_list.json"
            playlists_list = self.download_playlists_list_from_cache(
                key, s3_storage)
            if not playlists_list:
                logger.debug("Attempting to retrieve playlists list from TED")
                playlists_list = self.download_playlists_list_from_site(
                    topics_list)
                self.upload_playlists_list_to_cache(playlists_list, key,
                                                    s3_storage)
            return playlists_list
        return self.download_playlists_list_from_site(topics_list)

Example #7

Show file

    def process_image(self, url: str, path, is_profile: bool = False) -> str:
        """download image from url or S3 and add to Zim at path. Upload if req."""

        if self.aborted:
            return

        # setup resizing based on request
        resize_args = ({
            "width": PROFILE_IMAGE_SIZE,
            "height": PROFILE_IMAGE_SIZE,
            "method": "thumbnail",
        } if is_profile else {
            "width": POSTS_IMAGE_SIZE
        })

        # just download, optimize and add to ZIM if not using S3
        if not Global.conf.s3_url:
            with Global.lock:
                Global.creator.add_item_for(
                    path=path,
                    content=self.get_image_data(url.geturl(),
                                                **resize_args).getvalue(),
                    mimetype="image/webp",
                    is_front=False,
                    callback=self.once_done,
                )
            return path

        # we are using S3 cache
        ident = self.get_version_ident_for(url.geturl())
        if ident is None:
            logger.error(f"Unable to query {url.geturl()}. Skipping")
            return path

        key = self.get_s3_key_for(url.geturl())
        s3_storage = KiwixStorage(Global.conf.s3_url)
        meta = {"ident": ident, "encoder_version": str(IMAGES_ENCODER_VERSION)}

        download_failed = False  # useful to trigger reupload or not
        try:
            logger.debug(f"Attempting download of S3::{key} into ZIM::{path}")
            fileobj = io.BytesIO()
            s3_storage.download_matching_fileobj(key, fileobj, meta=meta)
        except NotFoundError:
            # don't have it, not a donwload error. we'll upload after processing
            pass
        except Exception as exc:
            logger.error(f"failed to download {key} from cache: {exc}")
            logger.exception(exc)
            download_failed = True
        else:
            with Global.lock:
                Global.creator.add_item_for(
                    path=path,
                    content=fileobj.getvalue(),
                    mimetype="image/webp",
                    is_front=False,
                    callback=self.once_done,
                )
            return path

        # we're using S3 but don't have it or failed to download
        try:
            fileobj = self.get_image_data(url.geturl(), **resize_args)
        except Exception as exc:
            logger.error(
                f"Failed to download/convert/optim source  at {url.geturl()}")
            logger.exception(exc)
            return path

        with Global.lock:
            Global.creator.add_item_for(
                path=path,
                content=fileobj.getvalue(),
                mimetype="image/webp",
                is_front=False,
                callback=self.once_done,
            )

        # only upload it if we didn't have it in cache
        if not download_failed:
            logger.debug(f"Uploading {url.geturl()} to S3::{key} with {meta}")
            try:
                s3_storage.upload_fileobj(fileobj=fileobj, key=key, meta=meta)
            except Exception as exc:
                logger.error(f"{key} failed to upload to cache: {exc}")

        return path

Example #8

Show file

class Ted2Zim:
    def __init__(
        self,
        topics,
        debug,
        name,
        video_format,
        low_quality,
        output_dir,
        no_zim,
        fname,
        languages,
        locale_name,
        title,
        description,
        creator,
        publisher,
        tags,
        keep_build_dir,
        autoplay,
        use_any_optimized_version,
        s3_url_with_credentials,
        playlist,
        subtitles_enough,
        subtitles_setting,
        tmp_dir,
        threads,
    ):

        # video-encoding info
        self.video_format = video_format
        self.low_quality = low_quality

        # zim params
        self.fname = fname
        self.languages = (
            [] if languages is None else [l.strip() for l in languages.split(",")]
        )
        self.tags = [] if tags is None else [t.strip() for t in tags.split(",")]
        self.title = title
        self.description = description
        self.creator = creator
        self.publisher = publisher
        self.name = name

        # directory setup
        self.output_dir = pathlib.Path(output_dir).expanduser().resolve()
        if tmp_dir:
            pathlib.Path(tmp_dir).mkdir(parents=True, exist_ok=True)
        self.build_dir = pathlib.Path(tempfile.mkdtemp(dir=tmp_dir))

        # scraper options
        self.topics = (
            []
            if not topics
            else [c.strip().replace(" ", "+") for c in topics.split(",")]
        )
        self.autoplay = autoplay
        self.playlist = playlist
        self.subtitles_enough = subtitles_enough
        self.subtitles_setting = (
            subtitles_setting
            if subtitles_setting == ALL
            or subtitles_setting == MATCHING
            or subtitles_setting == NONE
            else self.to_ted_langcodes(
                [lang.strip() for lang in subtitles_setting.split(",")]
            )
        )
        self.threads = threads
        self.yt_downloader = None

        # optimization cache
        self.s3_url_with_credentials = s3_url_with_credentials
        self.use_any_optimized_version = use_any_optimized_version
        self.s3_storage = None
        self.video_quality = "low" if self.low_quality else "high"

        # debug/developer options
        self.no_zim = no_zim
        self.keep_build_dir = keep_build_dir
        self.debug = debug

        # class members
        self.videos = []
        self.playlist_title = None
        self.playlist_description = None
        self.source_languages = (
            [] if not self.languages else self.to_ted_langcodes(self.languages)
        )
        self.zim_lang = None
        self.already_visited = []

        # set and record locale for translations
        locale_details = get_language_details(locale_name)
        if locale_details["querytype"] != "locale":
            locale_name = locale_details["iso-639-1"]
        try:
            self.locale = setlocale(ROOT_DIR, locale_name)
        except locale.Error:
            logger.error(
                f"No locale for {locale_name}. Use --locale to specify it. "
                "defaulting to en_US"
            )
            self.locale = setlocale(ROOT_DIR, "en")
        # locale's language code
        self.locale_name = self.to_ted_langcodes(locale_name)

    @property
    def templates_dir(self):
        return ROOT_DIR.joinpath("templates")

    @property
    def videos_dir(self):
        return self.build_dir.joinpath("videos")

    @property
    def ted_videos_json(self):
        return self.build_dir.joinpath("ted_videos.json")

    @property
    def ted_topics_json(self):
        return self.build_dir.joinpath("ted_topics.json")

    @property
    def talks_base_url(self):
        return BASE_URL + "talks"

    @property
    def playlists_base_url(self):
        return BASE_URL + "playlists"

    def append_part1_or_part3(self, lang_code_list, lang_info):
        """Fills missing ISO languages codes for all in list

        lang_code_list: list og lang codes
        lang_info: see zimscraperlib.i18n"""

        # ignore extra language mappings if supplied query was an iso-639-1 code
        if "part1" in lang_info["iso_types"]:
            lang_code_list.append(lang_info["iso-639-1"])

        # supplied query was not iso-639-1
        else:
            if lang_info["iso-639-1"]:
                lang_code_list.append(lang_info["iso-639-1"])
                # check for extra language codes to include
                if lang_info["iso-639-1"] in TEDLANGS["mappings"]:
                    for code in TEDLANGS["mappings"][lang_info["iso-639-1"]]:
                        lang_code_list.append(code)
            elif lang_info["iso-639-3"]:
                lang_code_list.append(lang_info["iso-639-3"])
            else:
                supplied_lang = lang_info["query"]
                logger.error(f"Language {supplied_lang} is not supported by TED")

    def to_ted_langcodes(self, languages):
        """Converts languages queries into TED language codes

        Examples:
            ["English", "fr", "hin"] => ["en", "fr", "hi"]
            ["chi", "fake"] => ["zh", "zh-cn", "zh-tw"]
        """

        lang_code_list = []
        for lang in languages:
            lang_info = get_language_details(lang, failsafe=True)
            if lang_info:
                if lang_info["querytype"] == "purecode":
                    self.append_part1_or_part3(lang_code_list, lang_info)
                elif lang_info["querytype"] == "locale":
                    query = lang_info["query"].replace("_", "-")
                    if query in TEDLANGS["locales"]:
                        lang_code_list.append(query)
                    else:
                        self.append_part1_or_part3(lang_code_list, lang_info)
                else:
                    self.append_part1_or_part3(lang_code_list, lang_info)
        return list(set(lang_code_list))

    def extract_videos_from_playlist(self, playlist):
        """extracts metadata for all videos in the given playlist

        calls extract_video_info on all links to get this data
        """

        playlist_url = f"{self.playlists_base_url}/{playlist}"
        logger.debug(f"extract_videos_from_playlist: {playlist_url}")
        soup = BeautifulSoup(download_link(playlist_url).text, features="html.parser")
        video_elements = soup.find_all("a", attrs={"class": "group"})
        self.playlist_title = soup.find("h1").string
        self.playlist_description = soup.find("p", attrs={"class": "text-base"}).string

        for element in video_elements:
            relative_path = element.get("href")
            url = urllib.parse.urljoin(self.talks_base_url, relative_path)
            if self.extract_info_from_video_page(url):
                if self.source_languages and len(self.source_languages) > 1:
                    other_lang_urls = self.generate_urls_for_other_languages(url)
                    logger.debug(
                        f"Searching info for the video in other {len(other_lang_urls)} language(s)"
                    )
                    for lang_url in other_lang_urls:
                        self.extract_info_from_video_page(lang_url)
                    self.already_visited.append(urllib.parse.urlparse(url)[2])
            logger.debug(f"Seen {relative_path}")
        logger.debug(f"Total videos found on playlist: {len(video_elements)}")
        if not video_elements:
            raise ValueError("Wrong playlist ID supplied. No videos found")

    def generate_search_result_and_scrape(self, topic_url, total_videos_scraped):
        """generates a search result and returns the total number of videos scraped"""

        page = 1
        while True:
            logger.debug(f"generate_search_result_and_scrape: {topic_url}&page={page}")
            html = download_link(f"{topic_url}&page={page}").text
            nb_videos_extracted, nb_videos_on_page = self.extract_videos_on_topic_page(
                html
            )
            if nb_videos_on_page == 0:
                break
            total_videos_scraped += nb_videos_extracted
            page += 1
        return total_videos_scraped

    def extract_videos_from_topics(self, topic):
        """extracts metadata for required number of videos on different topics"""

        logger.debug(f"Fetching video links for topic: {topic}")
        topic_url = f"{self.talks_base_url}?topics%5B%5D={topic}"
        total_videos_scraped = 0

        if self.source_languages:
            for lang in self.source_languages:
                topic_url = topic_url + f"&language={lang}"
                total_videos_scraped = self.generate_search_result_and_scrape(
                    topic_url, total_videos_scraped
                )

        else:
            total_videos_scraped = self.generate_search_result_and_scrape(
                topic_url, total_videos_scraped
            )

        logger.info(f"Total video links found in {topic}: {total_videos_scraped}")
        if total_videos_scraped == 0:
            return False
        return True

    def update_zim_metadata(self):

        if not self.languages:
            self.zim_lang = "eng"
        else:
            if len(self.source_languages) > 1:
                self.zim_lang = "mul"
            else:
                lang_info = get_language_details(
                    self.source_languages[0], failsafe=True
                )
                if lang_info:
                    self.zim_lang = lang_info["iso-639-3"]
                else:
                    self.zim_lang = "eng"

        if self.playlist:
            if not self.title:
                self.title = self.playlist_title.strip()
            if not self.description:
                self.description = self.playlist_description.strip()
        else:
            if len(self.topics) > 1:
                if not self.title:
                    self.title = "TED Collection"
                if not self.description:
                    self.description = "A selection of TED videos from several topics"
            else:
                topic_str = self.topics[0].replace("+", " ")
                if not self.title:
                    self.title = f"{topic_str.capitalize()} from TED"
                if not self.description:
                    self.description = f"A selection of {topic_str} videos from TED"

    def get_display_name(self, lang_code, lang_name):
        """Display name for language"""

        lang_info = get_language_details(lang_code, failsafe=True)
        if lang_code != "en" and lang_info:

            return lang_info["native"] + " - " + lang_name
        return lang_name

    def get_subtitle_dict(self, lang):
        """dict of language name and code from a larger dict lang

        Example:
        {
            'languageCode': 'en',
            'languageName': 'English'
        }
        """

        return {
            "languageName": self.get_display_name(
                lang["languageCode"], lang["languageName"]
            ),
            "languageCode": lang["languageCode"],
        }

    def generate_subtitle_list(self, video_id, langs, page_lang, audio_lang):
        """List of all subtitle languages with link to their pages"""

        subtitles = []
        if self.subtitles_setting == ALL or (
            not self.source_languages and self.topics and self.subtitles_setting != NONE
        ):
            subtitles = [self.get_subtitle_dict(lang) for lang in langs]
        elif self.subtitles_setting == MATCHING or (
            self.subtitles_enough
            and self.subtitles_setting == NONE
            and page_lang != audio_lang
        ):
            subtitles = [
                self.get_subtitle_dict(lang)
                for lang in langs
                if lang["languageCode"] == page_lang
            ]
        elif self.subtitles_setting and self.subtitles_setting != NONE:
            if not self.subtitles_enough and self.topics:
                subtitles = [
                    self.get_subtitle_dict(lang)
                    for lang in langs
                    if lang["languageCode"] in self.subtitles_setting
                ]
            else:
                subtitles = [
                    self.get_subtitle_dict(lang)
                    for lang in langs
                    if lang["languageCode"] in self.subtitles_setting
                    or lang["languageCode"] in self.source_languages
                ]

        return update_subtitles_list(video_id, subtitles)

    def generate_urls_for_other_languages(self, url):
        """Possible URLs for other requested languages based on a video url"""

        urls = []
        page_lang, query = self.get_lang_code_from_url(url, with_full_query=True)
        url_parts = list(urllib.parse.urlparse(url))

        # update the language query field value with other languages and form URLs
        for language in self.source_languages:
            if language != page_lang:
                query.update({"language": language})
                url_parts[4] = urllib.parse.urlencode(query)
                urls.append(urllib.parse.urlunparse(url_parts))
        return urls

    def extract_videos_on_topic_page(self, page_html):

        # all videos are embedded in a <div> with the class name 'row'.
        # we are searching for the div inside this div, that has an <a>-tag
        # with the class name 'media__image', because this is the relative
        # link to the representative TED talk. It turns this relative link to
        # an absolute link and calls extract_video_info for them
        soup = BeautifulSoup(page_html, features="html.parser")
        video_links = soup.select("div.row div.media__image a")
        nb_extracted = 0
        nb_listed = len(video_links)
        logger.debug(f"{nb_listed} video(s) found on current page")
        for video_link in video_links:
            url = urllib.parse.urljoin(self.talks_base_url, video_link["href"])
            if self.extract_info_from_video_page(url):
                nb_extracted += 1
                if self.source_languages and len(self.source_languages) > 1:
                    other_lang_urls = self.generate_urls_for_other_languages(url)
                    logger.debug(
                        f"Searching info for video in other {len(other_lang_urls)} language(s)"
                    )
                    for lang_url in other_lang_urls:
                        self.extract_info_from_video_page(lang_url)
                    self.already_visited.append(urllib.parse.urlparse(url)[2])
            logger.debug(f"Seen {video_link['href']}")
        return nb_extracted, nb_listed

    def get_lang_code_from_url(self, url, with_full_query=False):
        """gets the queried language code from a ted talk url"""

        # sample - https://www.ted.com/talks/alex_rosenthal_the_gauntlet_think_like_a_coder_ep_8?language=ja
        url_parts = list(urllib.parse.urlparse(url))

        # explode url to extract `language` query field value
        query = dict(urllib.parse.parse_qsl(url_parts[4]))
        current_lang = query.get("language")
        if with_full_query:
            return current_lang, query
        return current_lang

    def extract_download_link(self, talk_data):
        """Returns download link / youtube video ID for a TED video"""

        if (
            isinstance(talk_data.get("resources", {}).get("h264"), list)
            and len(talk_data["resources"]["h264"])
            and talk_data["resources"]["h264"][0].get("file")
        ):
            logger.debug(
                "Using h264 resource link for bitrate="
                f"{talk_data['resources']['h264'][0].get('bitrate')}"
            )
            return talk_data["resources"]["h264"][0]["file"]

        logger.error("No download link found for the video")
        return None

    def update_videos_list(
        self,
        video_id,
        lang_code,
        lang_name,
        title,
        description,
        speaker,
        speaker_profession,
        speaker_bio,
        speaker_picture,
        date,
        thumbnail,
        video_link,
        length,
        subtitles,
    ):
        # append to self.videos and return if not present
        if not [video for video in self.videos if video.get("id", None) == video_id]:
            self.videos.append(
                {
                    "id": video_id,
                    "languages": [
                        {
                            "languageCode": lang_code,
                            "languageName": self.get_display_name(lang_code, lang_name),
                        }
                    ],
                    "title": [{"lang": lang_code, "text": title}],
                    "description": [{"lang": lang_code, "text": description}],
                    "speaker": speaker,
                    "speaker_profession": speaker_profession,
                    "speaker_bio": speaker_bio,
                    "speaker_picture": speaker_picture,
                    "date": date,
                    "thumbnail": thumbnail,
                    "video_link": video_link,
                    "length": length,
                    "subtitles": subtitles,
                }
            )
            logger.debug(f"Successfully inserted video {video_id} into video list")
            return True

        # update localized meta for video if already in self.videos
        # based on --subtitles=matching
        logger.debug(f"Video {video_id} already present in video list")
        for index, video in enumerate(self.videos):
            if video.get("id", None) == video_id:
                if {"lang": lang_code, "text": title} not in video["title"]:
                    self.videos[index]["title"].append(
                        {"lang": lang_code, "text": title}
                    )
                    self.videos[index]["description"].append(
                        {"lang": lang_code, "text": description}
                    )
                    self.videos[index]["languages"].append(
                        {
                            "languageCode": lang_code,
                            "languageName": self.get_display_name(lang_code, lang_name),
                        }
                    )
                if self.subtitles_setting == MATCHING or self.subtitles_setting == NONE:
                    self.videos[index]["subtitles"] += subtitles
        return False

    def extract_video_info_from_json(self, json_data):
        player_data = json.loads(json_data["playerData"])
        lang_code = json_data["language"]
        lang_name = [
            lang["languageName"]
            for lang in player_data["languages"]
            if lang["languageCode"] == lang_code
        ][-1]
        # talk_info = json_data["talks"][0]
        native_talk_language = player_data["nativeLanguage"]
        if (
            not self.subtitles_enough
            and self.source_languages
            and native_talk_language != lang_code
            and self.topics
        ):
            return False

        # Extract the speaker of the TED talk
        if len(json_data["speakers"]):
            speaker_info = json_data["speakers"][0]
            speaker = " ".join(
                [
                    speaker_info.get("firstName"),
                    speaker_info.get("middleName"),
                    speaker_info.get("lastName"),
                ]
            )
        else:
            speaker_info = {
                "description": "None",
                "whotheyare": "None",
                "photo_url": "",
            }
            if "presenterDisplayName" in json_data:
                speaker = json_data["presenterDisplayName"]
            else:
                speaker = "None"

        # Extract the ted talk details from json
        video_id = json_data["id"]
        speaker_profession = speaker_info["description"]
        speaker_bio = speaker_info.get("whoTheyAre", "-")
        speaker_picture = speaker_info.get("avatar", "-")
        title = json_data["title"]
        description = json_data["description"]
        date = dateutil.parser.parse(json_data["recordedOn"]).strftime("%d %B %Y")
        length = int(json_data["duration"]) // 60
        thumbnail = player_data["thumb"]
        video_link = self.extract_download_link(player_data)
        if not video_link:
            logger.error("No suitable download link found. Skipping video")
            return False

        langs = player_data["languages"]
        subtitles = self.generate_subtitle_list(
            video_id, langs, lang_code, native_talk_language
        )
        return self.update_videos_list(
            video_id=video_id,
            lang_code=lang_code,
            lang_name=lang_name,
            title=title,
            description=description,
            speaker=speaker,
            speaker_profession=speaker_profession,
            speaker_bio=speaker_bio,
            speaker_picture=speaker_picture,
            date=date,
            thumbnail=thumbnail,
            video_link=video_link,
            length=length,
            subtitles=subtitles,
        )

    def extract_info_from_video_page(self, url, retry_count=0):
        """extract all info from a TED video page url and update self.videos"""

        # Every TED video page has a <script>-tag with a Javascript
        # object with JSON in it. We will just stip away the object
        # signature and load the json to extract meta-data out of it.
        # returns True if successfully scraped new video

        # don't scrape if URL already visited
        if urllib.parse.urlparse(url)[2] in self.already_visited:
            return False

        # don't scrape if maximum retry count is reached
        if retry_count > 5:
            logger.error("Max retries exceeded. Skipping video")
            return False

        logger.debug(f"extract_info_from_video_page: {url}")
        soup = BeautifulSoup(download_link(url).text, features="html.parser")

        json_data = json.loads(
            soup.find("script", attrs={"id": "__NEXT_DATA__"}).string
        )["props"]["pageProps"]["videoData"]

        requested_lang_code = self.get_lang_code_from_url(url)
        if requested_lang_code and json_data["language"] != requested_lang_code:
            logger.error(
                f"Video has not yet been translated into {requested_lang_code}"
            )
            return False
        return self.extract_video_info_from_json(json_data)

    def add_default_language(self):
        """add metatada in default language (english or first avail) on all videos"""

        for video in self.videos:
            en_found = False
            for index, lang in enumerate(video["languages"]):
                if lang["languageCode"] == "en":
                    en_found = True
                    video["title"] = [
                        {"lang": "default", "text": video["title"][index]["text"]}
                    ] + video["title"]
                    video["description"] = [
                        {"lang": "default", "text": video["description"][index]["text"]}
                    ] + video["description"]

            if not en_found:
                video["title"] = [
                    {"lang": "default", "text": video["title"][0]["text"]}
                ] + video["title"]
                video["description"] = [
                    {"lang": "default", "text": video["description"][0]["text"]}
                ] + video["description"]

            # update video slug
            video["slug"] = slugify(video["title"][0]["text"], separator="-")

    def render_video_pages(self):

        # Render static html pages from the scraped video data and
        # save the pages in build_dir/<video-id>/index.html
        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(str(self.templates_dir)), autoescape=True
        )
        for video in self.videos:
            titles = video["title"]
            html = env.get_template("article.html").render(
                speaker=video["speaker"],
                languages=video["subtitles"],
                speaker_bio=video["speaker_bio"].replace("Full bio", ""),
                speaker_img=video["speaker_picture"],
                date=video["date"],
                profession=video["speaker_profession"],
                video_format=self.video_format,
                autoplay=self.autoplay,
                video_id=str(video["id"]),
                title=get_main_title(titles, self.locale_name),
                titles=titles,
                descriptions=video["description"],
                back_to_list=_("Back to the list"),
            )
            html_path = self.build_dir.joinpath(video["slug"])
            with open(html_path, "w", encoding="utf-8") as html_page:
                html_page.write(html)

    def render_home_page(self):

        # Render the homepage
        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(str(self.templates_dir)), autoescape=True
        )
        all_langs = {
            language["languageCode"]: language["languageName"]
            for video in self.videos
            for language in video["subtitles"] + video["languages"]
        }
        languages = [
            {"languageName": value, "languageCode": key}
            for key, value in all_langs.items()
        ]
        languages = sorted(languages, key=lambda x: x["languageName"])
        html = env.get_template("home.html").render(
            languages=languages,
            page_title=_("TED Talks"),
            language_filter_text=_("Filter by language"),
            back_to_top=_("Back to the top"),
            pagination_text=_("Page"),
        )
        home_page_path = self.build_dir.joinpath("index")
        with open(home_page_path, "w", encoding="utf-8") as html_page:
            html_page.write(html)

    def copy_files_to_build_directory(self):

        # Copy files from template_dir to build_dir
        assets_dir = self.templates_dir.joinpath("assets")
        if assets_dir.exists():
            shutil.copytree(
                assets_dir, self.build_dir.joinpath("assets"), dirs_exist_ok=True
            )
        shutil.copy(
            self.templates_dir.joinpath("favicon.png"),
            self.build_dir.joinpath("favicon.png"),
        )

    def generate_datafile(self):
        """Generate data.js inside assets folder"""

        video_list = []
        for video in self.videos:
            lang_codes = [lang["languageCode"] for lang in video["subtitles"]] + [
                lang["languageCode"] for lang in video["languages"]
            ]
            json_data = {
                "languages": [lang_code for lang_code in set(lang_codes)],
                "id": video["id"],
                "description": video["description"],
                "title": video["title"],
                "speaker": video["speaker"],
                "slug": video["slug"],
            }
            video_list.append(json_data)
        assets_path = self.build_dir.joinpath("assets")
        if not assets_path.exists():
            assets_path.mkdir(parents=True)

        with open(assets_path.joinpath("data.js"), "w") as data_file:
            data_file.write("json_data = " + json.dumps(video_list, indent=4))

    def download_jpeg_image_and_convert(
        self, url, fpath, preset_options={}, resize=None
    ):
        """downloads a JPEG image and converts and optimizes it into desired format detected from fpath"""

        org_jpeg_path = pathlib.Path(
            tempfile.NamedTemporaryFile(delete=False, suffix=".jpg").name
        )
        save_large_file(url, org_jpeg_path)
        if resize is not None:
            resize_image(
                org_jpeg_path,
                width=resize[0],
                height=resize[1],
                method="cover",
            )
        optimize_image(
            org_jpeg_path, fpath, convert=True, delete_src=True, **preset_options
        )
        logger.debug(f"Converted {org_jpeg_path} to {fpath} and optimized ")

    def download_speaker_image(
        self, video_id, video_title, video_speaker, speaker_path
    ):
        """downloads the speaker image"""

        downloaded_from_cache = False
        preset = WebpMedium()
        if self.s3_storage:
            s3_key = f"speaker_image/{video_id}"
            downloaded_from_cache = self.download_from_cache(
                s3_key, speaker_path, preset.VERSION
            )
        if not downloaded_from_cache:
            try:
                # download an image of the speaker
                if not video_speaker:
                    logger.debug("Speaker doesn't have an image")
                else:
                    logger.debug(f"Downloading Speaker image for {video_title}")
                    self.download_jpeg_image_and_convert(
                        video_speaker, speaker_path, preset_options=preset.options
                    )
            except Exception:
                logger.error(f"Could not download speaker image for {video_title}")
            else:
                if self.s3_storage and video_speaker:
                    self.upload_to_cache(s3_key, speaker_path, preset.VERSION)

    def download_thumbnail(
        self, video_id, video_title, video_thumbnail, thumbnail_path
    ):
        """download the thumbnail"""

        downloaded_from_cache = False
        preset = WebpMedium()
        if self.s3_storage:
            s3_key = f"thumbnail/{video_id}"
            downloaded_from_cache = self.download_from_cache(
                s3_key, thumbnail_path, preset.VERSION
            )
        if not downloaded_from_cache:
            try:
                # download the thumbnail of the video
                logger.debug(f"Downloading thumbnail for {video_title}")
                self.download_jpeg_image_and_convert(
                    video_thumbnail,
                    thumbnail_path,
                    preset_options=preset.options,
                    resize=(248, 187),
                )
            except Exception:
                logger.error(f"Could not download thumbnail for {video_title}")
            else:
                if self.s3_storage:
                    self.upload_to_cache(s3_key, thumbnail_path, preset.VERSION)

    def download_video_files(self, video):
        """download all video files (video, thumbnail, speaker)"""

        # Download all the TED talk videos and the meta-data for it.
        # Save the videos in build_dir/{video id}/video.mp4.
        # Save the thumbnail for the video in build_dir/{video id}/thumbnail.jpg.
        # Save the image of the speaker in build_dir/{video id}/speaker.jpg.

        # set up variables
        video_id = str(video["id"])
        # Take the english version of title or else whatever language it's available in
        video_title = video["title"][0]["text"]
        video_link = video["video_link"]
        video_speaker = video["speaker_picture"]
        video_thumbnail = video["thumbnail"]
        video_dir = self.videos_dir.joinpath(video_id)
        org_video_file_path = video_dir.joinpath("video.mp4")
        req_video_file_path = video_dir.joinpath(f"video.{self.video_format}")
        speaker_path = video_dir.joinpath("speaker.webp")
        thumbnail_path = video_dir.joinpath("thumbnail.webp")

        # ensure that video directory exists
        if not video_dir.exists():
            video_dir.mkdir(parents=True)

        # set preset
        preset = {"mp4": VideoMp4Low}.get(self.video_format, VideoWebmLow)()

        # download video
        downloaded_from_cache = False
        logger.debug(f"Downloading {video_title}")
        if self.s3_storage:
            s3_key = f"{self.video_format}/{self.video_quality}/{video_id}"
            downloaded_from_cache = self.download_from_cache(
                s3_key, req_video_file_path, preset.VERSION
            )
        if not downloaded_from_cache:
            try:
                if "https://" not in video_link:
                    options = (
                        BestWebm if self.video_format == "webm" else BestMp4
                    ).get_options(
                        target_dir=video_dir, filepath=pathlib.Path("video.%(ext)s")
                    )
                    self.yt_downloader.download(video_link, options)
                else:
                    save_large_file(video_link, org_video_file_path)
            except Exception:
                logger.error(f"Could not download {org_video_file_path}")

        # download speaker and thumbnail images
        self.download_speaker_image(video_id, video_title, video_speaker, speaker_path)
        self.download_thumbnail(video_id, video_title, video_thumbnail, thumbnail_path)

        # recompress if necessary
        try:
            if not downloaded_from_cache:
                post_process_video(
                    video_dir,
                    video_id,
                    preset,
                    self.video_format,
                    self.low_quality,
                )
        except Exception as e:
            logger.error(f"Failed to post process video {video_id}")
            logger.debug(e)
        else:
            # upload to cache only if recompress was successful
            if self.s3_storage and not downloaded_from_cache:
                self.upload_to_cache(s3_key, req_video_file_path, preset.VERSION)

    def download_video_files_parallel(self):
        """download videos and images parallely"""

        self.yt_downloader = YoutubeDownloader(threads=1)
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.threads
        ) as executor:
            fs = [
                executor.submit(self.download_video_files, video)
                for video in self.videos
            ]
            concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED)
        self.yt_downloader.shutdown()

    def download_subtitles(self, index, video):
        """download, converts and writes VTT subtitles for a video at a specific index in self.videos"""

        # Download the subtitle files, generate a WebVTT file
        # and save the subtitles in
        # build_dir/{video id}/subs/subs_{language code}.vtt
        if not video["subtitles"]:
            return
        video_dir = self.videos_dir.joinpath(video["id"])
        subs_dir = video_dir.joinpath("subs")
        if not subs_dir.exists():
            subs_dir.mkdir(parents=True)
        else:
            logger.debug(f"Subs dir exists already")

        # download subtitles
        logger.debug(f"Downloading subtitles for {video['title'][0]['text']}")
        valid_subs = []
        for subtitle in video["subtitles"]:
            time.sleep(0.5)  # throttling
            vtt_subtitle = WebVTT(subtitle["link"]).convert()
            if not vtt_subtitle:
                logger.error(
                    f"Subtitle file for {subtitle['languageCode']} could not be created"
                )
                continue
            valid_subs.append(subtitle)
            vtt_path = subs_dir.joinpath(f"subs_{subtitle['languageCode']}.vtt")
            with open(vtt_path, "w", encoding="utf-8") as sub_file:
                sub_file.write(vtt_subtitle)
        self.videos[index]["subtitles"] = valid_subs

    def download_subtitles_parallel(self):
        """download subtitles for all videos parallely"""

        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.threads
        ) as executor:
            fs = [
                executor.submit(self.download_subtitles, index, video)
                for index, video in enumerate(self.videos)
            ]
            concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED)

    def s3_credentials_ok(self):
        logger.info("Testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        if not self.s3_storage.check_credentials(
            list_buckets=True, bucket=True, write=True, read=True, failsafe=True
        ):
            logger.error("S3 cache connection error testing permissions.")
            logger.error(f"  Server: {self.s3_storage.url.netloc}")
            logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
            logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
            logger.error(f"  Public IP: {get_public_ip()}")
            return False
        return True

    def download_from_cache(self, key, object_path, encoder_version):
        """whether it downloaded from S3 cache"""

        if self.use_any_optimized_version:
            if not self.s3_storage.has_object(key, self.s3_storage.bucket_name):
                return False
        else:
            if not self.s3_storage.has_object_matching_meta(
                key, tag="encoder_version", value=f"v{encoder_version}"
            ):
                return False
        object_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.s3_storage.download_file(key, object_path)
        except Exception as exc:
            logger.error(f"{key} failed to download from cache: {exc}")
            return False
        logger.info(f"downloaded {object_path} from cache at {key}")
        return True

    def upload_to_cache(self, key, object_path, encoder_version):
        """whether it uploaded from S3 cache"""

        try:
            self.s3_storage.upload_file(
                object_path, key, meta={"encoder_version": f"v{encoder_version}"}
            )
        except Exception as exc:
            logger.error(f"{key} failed to upload to cache: {exc}")
            return False
        logger.info(f"uploaded {object_path} to cache at {key}")
        return True

    def remove_failed_topics_and_check_extraction(self, failed_topics):
        """removes failed topics from topics list and raises error if scraper cannot continue"""

        for topic in failed_topics:
            self.topics.remove(topic)
        if not self.topics:
            if self.source_languages:
                raise ValueError(
                    "No videos found for any topic in the language(s) requested. Check topic(s) and/or language code supplied to --languages"
                )
            raise ValueError("Wrong topic(s) were supplied. No videos found")

    def run(self):
        logger.info(
            f"Starting scraper with:\n"
            f"  langs: {', '.join(self.source_languages)}\n"
            f"  subtitles : {', '.join(self.subtitles_setting) if isinstance(self.subtitles_setting, list) else self.subtitles_setting}\n"
            f"  video format : {self.video_format}"
        )

        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError("Unable to connect to Optimization Cache. Check its URL.")
        if self.s3_storage:
            logger.info(
                f"Using cache: {self.s3_storage.url.netloc} with bucket: {self.s3_storage.bucket_name}"
            )

        # playlist mode requested
        if self.playlist:
            self.extract_videos_from_playlist(self.playlist)
        # topic(s) mode requested
        else:
            failed = []
            for topic in self.topics:
                if not self.extract_videos_from_topics(topic):
                    failed.append(topic)
                else:
                    logger.debug(f"Successfully scraped {topic}")
            self.remove_failed_topics_and_check_extraction(failed)

        self.add_default_language()
        self.update_zim_metadata()
        self.download_video_files_parallel()
        self.download_subtitles_parallel()
        self.render_home_page()
        self.render_video_pages()
        self.copy_files_to_build_directory()
        self.generate_datafile()

        # zim creation and cleanup
        if not self.no_zim:
            self.fname = (
                self.fname or f"{self.name.replace(' ', '-')}_{{period}}.zim"
            ).format(period=datetime.datetime.now().strftime("%Y-%m"))
            logger.info("building ZIM file")
            if not self.output_dir.exists():
                self.output_dir.mkdir(parents=True)
            make_zim_file(
                build_dir=self.build_dir,
                fpath=self.output_dir.joinpath(self.fname),
                name=self.name,
                main_page="index",
                favicon="favicon.png",
                title=self.title,
                description=self.description,
                language=self.zim_lang,
                creator=self.creator,
                publisher=self.publisher,
                tags=self.tags + ["_category:ted", "ted", "_videos:yes"],
                scraper=SCRAPER,
            )
            if not self.keep_build_dir:
                logger.info("removing temp folder")
                shutil.rmtree(self.build_dir, ignore_errors=True)

        logger.info("Done Everything")

Example #9

Show file

class Kolibri2Zim:
    def __init__(self, **kwargs):

        for option in options:
            if option not in kwargs:
                raise ValueError(f"Missing parameter `{option}`")

        def go(option):
            return kwargs.get(option)

        self.channel_id = go("channel_id")
        self.root_id = go("root_id")

        # video-encoding info
        self.use_webm = go("use_webm")
        self.low_quality = go("low_quality")
        self.autoplay = go("autoplay")

        # zim params
        self.fname = go("fname")
        self.tags = ([] if go("tags") is None else
                     [t.strip() for t in go("tags").split(",")])
        self.title = go("title")
        self.description = go("description")
        self.author = go("creator")
        self.publisher = go("publisher")
        self.name = go("name")

        # customization
        self.favicon = go("favicon")
        self.about = go("about")
        self.css = go("css")

        # directory setup
        self.output_dir = Path(go("output_dir")).expanduser().resolve()
        if go("tmp_dir"):
            Path(go("tmp_dir")).mkdir(parents=True, exist_ok=True)
        self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir")))

        # performances options
        self.nb_threads = go("threads")
        self.nb_processes = go("processes")
        self.s3_url_with_credentials = go("s3_url_with_credentials")
        self.s3_storage = None

        # debug/developer options
        self.keep_build_dir = go("keep_build_dir")
        self.debug = go("debug")
        self.only_topics = go("only_topics")

        # jinja2 environment setup
        self.jinja2_env = jinja2.Environment(loader=jinja2.FileSystemLoader(
            str(self.templates_dir)),
                                             autoescape=True)

    @property
    def templates_dir(self):
        return ROOT_DIR.joinpath("templates")

    def add_local_files(self, root_path, folder):
        """ recursively add local files from {folder} starting at {path} """
        non_front = ("viewer.html", "epub_embed.html")
        for fpath in folder.iterdir():
            path = "/".join([root_path, fpath.name])
            if fpath.is_file():
                mimetype = "text/html;raw=true" if fpath.name in non_front else None
                self.creator.add_item_for(path=path,
                                          title="",
                                          fpath=fpath,
                                          mimetype=mimetype)
                logger.debug(f"Adding {path}")
            else:
                self.add_local_files(path, fpath)

    def populate_nodes_executor(self):
        """Loop on content nodes to create zim entries from kolibri DB"""
        def schedule_node(item):
            future = self.nodes_executor.submit(self.add_node, item=item)
            self.nodes_futures.update({future: item[0]})

        # schedule root-id
        schedule_node((self.db.root["id"], self.db.root["kind"]))

        # fill queue with (node_id, kind) tuples for all root node's descendants
        for node in self.db.get_node_descendants(self.root_id):
            schedule_node((node["id"], node["kind"]))

    def add_node(self, item):
        """ process a content node from the tuple in queue """
        node_id, kind = item
        # check if we have a handler for this {kind} of node
        handler = getattr(self, f"add_{kind}_node", None)

        # debug espace
        if self.only_topics and kind != "topic":
            return

        if handler:
            # add thumbnail to zim if there's one for this node
            thumbnail = self.db.get_node_thumbnail(node_id)
            if thumbnail:
                self.funnel_file(thumbnail["id"], thumbnail["ext"])
            # fire the add_{kind}_node() method which will actually process it
            handler(node_id)

    def funnel_file(self, fid, fext):
        """ directly add a Kolibri file to the ZIM using same name """
        url, fname = get_kolibri_url_for(fid, fext)
        with self.creator_lock:
            self.creator.add_item(URLItem(url=url, path=fname))
        logger.debug(f"Added {fname} from Studio")

    def download_to_disk(self, file_id, ext):
        """ download a Kolibri file to the build-dir using its filename """
        url, fname = get_kolibri_url_for(file_id, ext)
        fpath = self.build_dir / fname
        stream_file(url, fpath)
        return fpath

    def funnel_from_s3(self, file_id, path, checksum, preset):
        """whether it could fetch and add the file from S3 cache

        - checks if a cache is configured
        - checks if file is present
        - checks if file is valid (corresponds to same original file)
        - downloads and add to zim

        returns True is all this succeeded, False otherwise"""
        if not self.s3_storage:
            return False

        key = self.s3_key_for(file_id, preset)

        # exit early if we don't have this object in bucket
        if not self.s3_storage.has_object_matching(
                key,
                meta={
                    "checksum": checksum,
                    "encoder_version": str(preset.VERSION)
                }):
            return False

        # download file into memory
        fileobj = io.BytesIO()
        try:
            self.s3_storage.download_fileobj(key, fileobj)
        except Exception as exc:
            logger.error(f"failed to download {key} from cache: {exc}")
            logger.exception(exc)
            # make sure we fallback to re-encode
            return False

        # add to zim
        with self.creator_lock:
            self.creator.add_item(
                StaticItem(
                    path=path,
                    fileobj=fileobj,
                    mimetype=preset.mimetype,
                ))
        logger.debug(f"Added {path} from S3::{key}")
        return True

    def s3_key_for(self, file_id, preset):
        """compute in-bucket key for file"""
        return f"{file_id[0]}/{file_id[1]}/{file_id}/{type(preset).__name__.lower()}"

    def upload_to_s3(self, key, fpath, **meta):
        """ whether it successfully uploaded to cache """
        if not self.s3_storage:
            return

        logger.debug(f"Uploading {fpath.name} to S3::{key} with {meta}")
        try:
            self.s3_storage.upload_file(fpath, key, meta=meta)
        except Exception as exc:
            logger.error(f"{key} failed to upload to cache: {exc}")
            return False
        return True

    def add_topic_node(self, node_id):
        """Build and add the HTML page for a single topic node

        Topic nodes are used only for hierarchy and solely contains metadata"""

        # fetch details including parents for breadcrumb and children to link to
        node = self.db.get_node(node_id, with_parents=True, with_children=True)

        html = self.jinja2_env.get_template("topic.html").render(
            node_id=node_id, **node)
        with self.creator_lock:
            self.creator.add_item_for(path=node_id,
                                      title=node["title"],
                                      content=html,
                                      mimetype="text/html")
        logger.debug(f"Added topic #{node_id}")

    def add_video_node(self, node_id):
        """Add content from this `video` node to zim

        video node is composed of (1) or (2) videos files and optional subtitle files
        video files are at most one of each `high_res_video` or `low_res_video`
        subtitle files (`video_subtitle`) are VTT files and are only limited by the
        number of language to select from in kolibri studio"""

        files = self.db.get_node_files(node_id, thumbnail=False)
        if not files:
            return
        files = sorted(files, key=lambda f: f["prio"])
        it = filter(lambda f: f["supp"] == 0, files)

        try:
            # find main video file
            video_file = next(it)
        except StopIteration:
            # we have no video file
            return

        try:
            alt_video_file = next(it)
        except StopIteration:
            # we have no supplementary video file (which is OK)
            alt_video_file = None

        # now decide which file to keep and what to do with it

        # content_file has a 1:1 rel with content_localfile which is thre
        # *implementation* of the file. We use that local file ID (its checksum)
        # everywhere BUT as S3 cache ID as we want to overwrite the same key
        # should a new version of the localfile for the same file arrives.
        vid = video_file["id"]  # the local file ID (current version)
        vfid = video_file["fid"]  # the file ID in DB (version agnostic)
        vchk = video_file["checksum"]

        # we'll reencode, using the best file with appropriate preset
        if self.use_webm:
            preset = VideoWebmLow() if self.low_quality else VideoWebmHigh()
            src_fname = Path(filename_for(video_file))
            path = str(src_fname.with_suffix(f".{preset.ext}"))
            video_filename_ext = preset.ext
            video_filename = src_fname.with_suffix(
                f".{video_filename_ext}").name

            # funnel from S3 cache if it is present there
            if not self.funnel_from_s3(vfid, path, vchk, preset):

                # download original video
                src = self.download_to_disk(vid, video_file["ext"])
                dst = src.with_suffix(".webm")

                # request conversion
                self.convert_and_add_video_aside(vfid, src, vchk, dst, path,
                                                 preset)

        # we want low-q but no webm yet don't have low_res file, let's reencode
        elif self.low_quality and alt_video_file is None:
            preset = VideoMp4Low()
            src_fname = Path(filename_for(video_file))
            path = str(src_fname.with_suffix(f".{preset.ext}"))
            video_filename_ext = preset.ext
            video_filename = src_fname.with_suffix(
                f".{video_filename_ext}").name

            # funnel from S3 cache if it is present there
            if not self.funnel_from_s3(vfid, path, vchk, preset):

                # download original video
                src = self.download_to_disk(vid, video_file["ext"])

                # move source file to a new name and swap variables so our target will
                # be the previously source one
                src_ = src.with_suffix(f"{src.suffix}.orig")
                shutil.move(src, src_)
                dst = src
                src = src_

                # request conversion
                self.convert_and_add_video_aside(vfid, src, vchk, dst, path,
                                                 preset)

        # we want mp4, either in high-q or we have a low_res file to use
        else:
            video_file = alt_video_file if self.low_quality else video_file
            self.funnel_file(video_file["id"], video_file["ext"])
            video_filename = filename_for(video_file)
            video_filename_ext = video_file["ext"]

        # prepare list of subtitles for template
        subtitles = []
        for file in filter(lambda f: f["preset"] == "video_subtitle", files):
            self.funnel_file(file["id"], file["ext"])
            try:
                local, english = find_language_names(file["lang"])
            except Exception:
                english = file["lang"]
            finally:
                subtitles.append({
                    "code": file["lang"],
                    "name": english,
                    "filename": filename_for(file),
                })

        node = self.db.get_node(node_id, with_parents=True)
        html = self.jinja2_env.get_template("video.html").render(
            node_id=node_id,
            video_filename=video_filename,
            video_filename_ext=video_filename_ext,
            subtitles=sorted(subtitles, key=lambda i: i["code"]),
            thumbnail=self.db.get_thumbnail_name(node_id),
            autoplay=self.autoplay,
            **node,
        )
        with self.creator_lock:
            self.creator.add_item_for(
                path=node_id,
                title=node["title"],
                content=html,
                mimetype="text/html",
            )

    def add_video_upon_completion(self, future):
        """adds the converted video inside this future to the zim

        logs error in case of failure"""
        if future.cancelled():
            return
        src_fname, dst_fpath, path = self.videos_futures.get(future)

        try:
            future.result()
        except Exception as exc:
            logger.error(f"Error re-encoding {src_fname}: {exc}")
            logger.exception(exc)
            return

        logger.debug(f"Re-encoded {src_fname} successfuly")

        kwargs = {
            "path": path,
            "filepath": dst_fpath,
            "mimetype": get_file_mimetype(dst_fpath),
        }
        # we shall request s3 upload on the threads pool, only once item has been
        # added to ZIM so it can be removed altogether
        if self.s3_storage:
            kwargs.update({"callback": self.request_s3_upload_and_removal})

        # simply add the item, autodeleting the file
        else:
            kwargs.update({"remove": True})

        with self.creator_lock:
            self.creator.add_item(StaticItem(**kwargs))
        logger.debug(f"Added {path} from re-encoded file")

    def convert_and_add_video_aside(self, file_id, src_fpath, src_checksum,
                                    dest_fpath, path, preset):
        """add video to the process-based convertion queue"""

        future = self.videos_executor.submit(
            reencode,
            src_path=src_fpath,
            dst_path=dest_fpath,
            ffmpeg_args=preset.to_ffmpeg_args(),
            delete_src=True,
            with_process=False,
            failsafe=False,
        )
        self.videos_futures.update(
            {future: (src_fpath.name, dest_fpath, path)})
        self.pending_upload.update({
            path: (
                dest_fpath,
                self.s3_key_for(file_id, preset),
                {
                    "checksum": src_checksum,
                    "encoder_version": str(preset.VERSION)
                },
            )
        })
        future.add_done_callback(self.add_video_upon_completion)

    def request_s3_upload_and_removal(self, item):
        """ add file from item to uploads list """
        path = item.path
        del item
        dest_fpath, key, meta = self.pending_upload.get(path)
        # TODO: submit to a thread executor (to create) instead
        # this is currently called on main-tread.
        self.upload_to_s3(key, dest_fpath, **meta)

    def add_audio_node(self, node_id):
        """Add content from this `audio` node to zim

        audio node are composed of a single mp3 file"""
        file = self.db.get_node_file(node_id, thumbnail=False)
        if not file:
            return
        self.funnel_file(file["id"], file["ext"])

        node = self.db.get_node(node_id, with_parents=True)
        html = self.jinja2_env.get_template("audio.html").render(
            node_id=node_id,
            filename=filename_for(file),
            ext=file["ext"],
            thumbnail=self.db.get_thumbnail_name(node_id),
            autoplay=self.autoplay,
            **node,
        )
        with self.creator_lock:
            self.creator.add_item_for(
                path=node_id,
                title=node["title"],
                content=html,
                mimetype="text/html",
            )

    def add_exercise_node(self, node_id):
        """Add content from this `exercise` node to zim

        exercise node is composed of a single perseus file

        a perseus file is a ZIP containing an exercise.json entrypoint and other files

        we'd solely add the perseus file in the ZIM along with the perseus reader from
        https://github.com/Khan/perseus"""
        logger.warning(f"[NOT SUPPORTED] not adding exercice node {node_id}")

    def add_document_node(self, node_id):
        """Add content from this `document` node to zim

        document node is composed of one main (`priority` 1) file and
        an optionnal (`priority` 2) file
        Format for each is either `pdf` (`document` preset) or `epub` (`epub` preset)


        - add the actual PDF/epub files to zim at /{node_id}.{ext} (files' IDs)
        - add an HTML page linking to files for download
        - includes an iframe with the appropriate viewer
         - using pdf.js for PDF
         - using epub.js for EPUB
        - add an additional page for the alternate document with its viewer
        """
        def target_for(file):
            filename = filename_for(file)
            if file["ext"] == "pdf":
                return f"./assets/pdfjs/web/viewer.html?file=../../../{filename}"
            if file["ext"] == "epub":
                return f"./assets/epub_embed.html?url=../{filename}"

        # record the actual document
        files = self.db.get_node_files(node_id, thumbnail=False)
        if not files:
            return
        files = sorted(filter(lambda f: f["supp"] == 0, files),
                       key=lambda f: f["prio"])
        it = iter(files)

        try:
            main_document = next(it)
        except StopIteration:
            return

        try:
            alt_document = next(it)
        except StopIteration:
            alt_document = None

        for file in files:
            self.funnel_file(file["id"], file["ext"])
            file["target"] = target_for(file)

        node = self.db.get_node(node_id, with_parents=True)
        # convert generator to list as we might read it twice
        node["parents"] = list(node["parents"])

        # generate page once for each document, changing only `is_alt`
        if alt_document:
            options = [False, True]
        else:
            options = [False]  # main_document only

        for is_alt in options:
            html = self.jinja2_env.get_template("document.html").render(
                node_id=node_id,
                main_document=filename_for(main_document),
                main_document_ext=main_document["ext"],
                alt_document=filename_for(alt_document)
                if alt_document else None,
                alt_document_ext=alt_document["ext"] if alt_document else None,
                target=target_for(alt_document if is_alt else main_document),
                is_alt=is_alt,
                **node,
            )
            with self.creator_lock:
                path = node_id
                if is_alt:
                    path += "_alt"
                self.creator.add_item_for(
                    path=path,
                    title=node["title"],
                    content=html,
                    mimetype="text/html",
                )

    def add_html5_node(self, node_id):
        """Add content from this `html5` node to zim

        html5 node is single ZIP file containing a standalone HTML app
        which entrypoint is a file named index.html

        we extract and add each file from the ZIP to /{node_id}/

        Note: Studio doesn't enforce the mandatory index.html, thus allowing invalid
        html5 app (unreachable)"""

        file = self.db.get_node_file(node_id, thumbnail=False)
        if not file:
            return

        # download ZIP file to memory
        ark_url, ark_name = get_kolibri_url_for(file["id"], file["ext"])
        ark_data = io.BytesIO()
        stream_file(url=ark_url, byte_stream=ark_data)

        # loop over zip members and create an entry for each
        zip_ark = zipfile.ZipFile(ark_data)
        for ark_member in zip_ark.namelist():
            with self.creator_lock:
                self.creator.add_item_for(
                    path=f"{node_id}/{ark_member}",
                    content=zip_ark.open(ark_member).read(),
                )

    def run(self):
        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError(
                "Unable to connect to Optimization Cache. Check its URL.")

        s3_msg = (f"  using cache: {self.s3_storage.url.netloc} "
                  f"with bucket: {self.s3_storage.bucket_name}"
                  if self.s3_storage else "")
        logger.info(f"Starting scraper with:\n"
                    f"  channel_id: {self.channel_id}\n"
                    f"  build_dir: {self.build_dir}\n"
                    f"  output_dir: {self.output_dir}\n"
                    f"  using webm : {self.use_webm}\n"
                    f"  low_quality : {self.low_quality}\n"
                    f"{s3_msg}")

        logger.info("Download database")
        self.download_db()

        self.sanitize_inputs()
        # display basic stats
        logger.info(f"  Starting ZIM creation with:\n"
                    f"  filename: {self.fname}\n"
                    f"  title: {self.title}\n"
                    f"  description: {self.description}\n"
                    f"  creator: {self.author}\n"
                    f"  publisher: {self.publisher}\n"
                    f"  tags: {';'.join(self.tags)}")

        logger.info("Setup Zim Creator")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.creator_lock = threading.Lock()
        self.creator = Creator(
            filename=self.output_dir.joinpath(self.fname),
            main_path=self.root_id,
            favicon_path="favicon.png",
            language="eng",
            title=self.title,
            description=self.description,
            creator=self.author,
            publisher=self.publisher,
            name=self.name,
            tags=";".join(self.tags),
        ).start()

        succeeded = False
        try:
            self.add_favicon()
            self.add_custom_about_and_css()

            # add static files
            logger.info("Adding local files (assets)")
            self.add_local_files("assets",
                                 self.templates_dir.joinpath("assets"))

            # setup queue for nodes processing
            self.nodes_futures = {}  # future: node_id
            self.nodes_executor = cf.ThreadPoolExecutor(
                max_workers=self.nb_threads)

            # setup a dedicated queue for videos to convert
            self.videos_futures = {}  # future: src_fname, dst_fpath, path
            self.pending_upload = {}  # path: filepath, key, checksum
            self.videos_executor = cf.ProcessPoolExecutor(
                max_workers=self.nb_processes)

            logger.info("Starting nodes processing")
            self.populate_nodes_executor()

            # await completion of all futures (nodes and videos)
            result = cf.wait(
                self.videos_futures.keys() | self.nodes_futures.keys(),
                return_when=cf.FIRST_EXCEPTION,
            )
            self.nodes_executor.shutdown()
            # properly shutting down the executor should allow processing
            # futures's callbacks (zim addition) as the wait() function
            # only awaits future completion and doesn't include callbacks
            self.videos_executor.shutdown()

            succeeded = not result.not_done
        except KeyboardInterrupt:
            self.creator.can_finish = False
            logger.error("KeyboardInterrupt, exiting.")
        except Exception as exc:
            # request Creator not to create a ZIM file on finish
            self.creator.can_finish = False
            logger.error("Interrupting process due to error: {exc}")
            logger.exception(exc)
        finally:
            if succeeded:
                logger.info("Finishing ZIM file…")
            # we need to release libzim's resources.
            # currently does nothing but crash if can_finish=False but that's awaiting
            # impl. at libkiwix level
            with self.creator_lock:
                self.creator.finish()

        if not self.keep_build_dir:
            logger.info("Removing build folder")
            shutil.rmtree(self.build_dir, ignore_errors=True)

        return 0 if succeeded else 1

    def s3_credentials_ok(self):
        logger.info("testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        if not self.s3_storage.check_credentials(list_buckets=True,
                                                 bucket=True,
                                                 write=True,
                                                 read=True,
                                                 failsafe=True):
            logger.error("S3 cache connection error testing permissions.")
            logger.error(f"  Server: {self.s3_storage.url.netloc}")
            logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
            logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
            logger.error(f"  Public IP: {get_public_ip()}")
            return False
        return True

    def download_db(self):
        """download channel DB from kolibri and initialize DB

        Also sets the root_id with DB-computer value"""
        # download database
        fpath = self.build_dir.joinpath("db.sqlite3")
        logger.debug(f"Downloading database into {fpath.name}…")
        stream_file(
            f"{STUDIO_URL}/content/databases/{self.channel_id}.sqlite3",
            fpath,
        )
        self.db = KolibriDB(fpath, self.root_id)
        self.root_id = self.db.root_id

    def sanitize_inputs(self):
        channel_meta = self.db.get_channel_metadata(self.channel_id)

        # input  & metadata sanitation
        period = datetime.datetime.now().strftime("%Y-%m")
        if self.fname:
            # make sure we were given a filename and not a path
            self.fname = Path(self.fname.format(period=period))
            if Path(self.fname.name) != self.fname:
                raise ValueError(f"filename is not a filename: {self.fname}")
        else:
            self.fname = f"{self.name}_{period}.zim"

        if not self.title:
            self.title = channel_meta["name"]
        self.title = self.title.strip()

        if not self.description:
            self.description = channel_meta["description"]
        self.description = self.description.strip()

        if not self.author:
            self.author = channel_meta["author"] or "Kolibri"
        self.author = self.author.strip()

        if not self.publisher:
            self.publisher = "Openzim"
        self.publisher = self.publisher.strip()

        self.tags = list(
            set(self.tags + ["_category:other", "kolibri", "_videos:yes"]))

    def add_favicon(self):
        favicon_orig = self.build_dir / "favicon"
        # if user provided a custom favicon, retrieve that
        if self.favicon:
            handle_user_provided_file(source=self.favicon, dest=favicon_orig)
        # otherwise, get thumbnail from database
        else:
            # add channel thumbnail as favicon
            try:
                favicon_prefix, favicon_data = self.db.get_channel_metadata(
                    self.channel_id)["thumbnail"].split(";base64,", 1)
                favicon_data = base64.standard_b64decode(favicon_data)
                # favicon_mime = favicon_prefix.replace("data:", "")
                with open(favicon_orig, "wb") as fh:
                    fh.write(favicon_data)
                del favicon_data
            except Exception as exc:
                logger.warning(
                    "Unable to extract favicon from DB; using default")
                logger.exception(exc)

                # use a default favicon
                handle_user_provided_file(source=self.templates_dir /
                                          "kolibri-logo.png",
                                          dest=favicon_orig)

        # convert to PNG (might already be PNG but it's OK)
        favicon_fpath = favicon_orig.with_suffix(".png")
        convert_image(favicon_orig, favicon_fpath)

        # resize to appropriate size (ZIM uses 48x48)
        resize_image(favicon_fpath, width=96, height=96, method="thumbnail")

        # generate favicon
        favicon_ico_path = favicon_fpath.with_suffix(".ico")
        create_favicon(src=favicon_fpath, dst=favicon_ico_path)

        self.creator.add_item_for("favicon.png", fpath=favicon_fpath)
        self.creator.add_item_for("favicon.ico", fpath=favicon_ico_path)

    def add_custom_about_and_css(self):
        channel_meta = self.db.get_channel_metadata(self.channel_id)

        if self.about:
            # if user provided a custom about page, use it
            with open(
                    handle_user_provided_file(source=self.about,
                                              in_dir=self.build_dir,
                                              nocopy=True),
                    "r",
            ) as fh:
                soup = BeautifulSoup(fh.read(), "lxml")
                title = soup.find("title").text
                content = soup.select("body > .container")
                # we're only interested in the first one
                if isinstance(content, list):
                    content = content[0]
        else:
            title = channel_meta["name"]
            content = None

        html = self.jinja2_env.get_template("about.html").render(
            title=title, content=content, **channel_meta)
        with self.creator_lock:
            self.creator.add_item_for(
                path="about",
                title=title,
                content=html,
                mimetype="text/html",
            )
        del html

        # if user provided a custom CSS file, use it
        if self.css:
            with open(
                    handle_user_provided_file(source=self.css,
                                              in_dir=self.build_dir,
                                              nocopy=True),
                    "r",
            ) as fh:
                content = fh.read()
        # otherwise, create a blank one
        else:
            content = ""

        self.creator.add_item_for("custom.css",
                                  content=content,
                                  mimetype="text/css")
        logger.debug("Added about page and custom CSS")

Example #10

Show file

File: uploader.py Project: zux-hidden/zimfarm

def s3_upload_file(
    src_path,
    upload_uri,
    filesize,
    private_key,  # not used
    resume=False,  # not supported
    move=False,  # not relevant
    delete=False,
    compress=False,  # not relevant
    bandwidth=None,  # not supported
    cipher=None,  # not relevant
    delete_after=None,  # nb of days to expire upload file after
):
    started_on = now()
    s3_storage = KiwixStorage(rebuild_uri(upload_uri, scheme="https").geturl())
    logger.debug(f"S3 initialized for {s3_storage.url.netloc}/{s3_storage.bucket_name}")

    key = upload_uri.path[1:]
    if upload_uri.path.endswith("/"):
        key += src_path.name

    try:
        logger.info(f"Uploading to {key}")
        hook = FileTransferHook(filename=src_path)
        s3_storage.upload_file(fpath=str(src_path), key=key, Callback=hook)
        print("", flush=True)
    except Exception as exc:
        # as there is no resume, uploading to existing URL will result in DELETE+UPLOAD
        # if credentials doesn't allow DELETE or if there is an unsatisfied
        # retention, will raise PermissionError
        logger.error(f"uploader failed: {exc}")
        logger.exception(exc)
        return 1
    ended_on = now()
    logger.info("uploader ran successfuly.")

    # setting autodelete
    if delete_after is not None:
        try:
            # set expiration after bucket's min retention.
            # bucket retention is 1d minumum.
            # can be configured to loger value.
            # if expiration before bucket min retention, raises 400 Bad Request
            # on compliance
            expire_on = (
                datetime.datetime.now()
                + datetime.timedelta(days=delete_after or 1)
                # adding 1mn to prevent clash with bucket's equivalent min retention
                + datetime.timedelta(seconds=60)
            )
            logger.info(f"Setting autodelete to {expire_on}")
            s3_storage.set_object_autodelete_on(key=key, on=expire_on)
        except Exception as exc:
            logger.error(f"Failed to set autodelete: {exc}")
            logger.exception(exc)

    if delete:
        remove_source_file(src_path)
    display_stats(filesize, started_on, ended_on)

    return 0

Example #11

Show file

def main(url=None):
    if url is None:
        url = os.getenv("S3URL")
    if not url:
        print("Missing URL.")
        sys.exit(1)

    s3 = KiwixStorage(url=url)
    if not s3.check_credentials(list_buckets=True, failsafe=False):
        print("credentials not OK")
        sys.exit(1)

    print("credentials OK")

    iam = s3.get_service("iam")

    versions = iam.list_policy_versions(PolicyArn=POLICY_ARN).get(
        "Versions", [])
    print(f"We have {len(versions)} for {POLICY_ARN}")
    version_id = None
    for version in versions:
        if version["IsDefaultVersion"]:
            version_id = version["VersionId"]

    print(f"Default version is {version_id}")

    # delete all other versions
    if len(versions) == 5:
        print("Deleting all other versions…")
        for version in versions:
            if version["VersionId"] == version_id:
                continue
            print(f"Deleting version {version['VersionId']}")
            iam.delete_policy_version(PolicyArn=POLICY_ARN,
                                      VersionId=version["VersionId"])

    if not version_id:
        print("Existing policy doesn't exist. probably a mistake?")
        sys.exit(1)

    pv = (iam.get_policy_version(PolicyArn=POLICY_ARN,
                                 VersionId=version_id).get(
                                     "PolicyVersion", {}).get("Document"))
    if not pv:
        print("We don't have a policy document.")
        sys.exit(1)

    print(f"Current Policy:")
    from pprint import pprint as pp

    pp(pv)

    statement = get_statement()  # gen new statement

    try:
        stmt_index = [s["Sid"] for s in pv["Statement"]].index(STATEMENT_ID)
        pv["Statement"][stmt_index] = statement
    except ValueError:
        pv["Statement"].append(statement)

    new_policy = json.dumps(pv, indent=4)
    print(f"New Policy:\n{new_policy}")

    print("Overwriting policy…")
    iam.create_policy_version(PolicyArn=POLICY_ARN,
                              PolicyDocument=new_policy,
                              SetAsDefault=True)

Example #12

Show file

class Youtube2Zim:
    def __init__(
        self,
        collection_type,
        youtube_id,
        api_key,
        video_format,
        low_quality,
        nb_videos_per_page,
        all_subtitles,
        autoplay,
        output_dir,
        no_zim,
        fname,
        debug,
        tmp_dir,
        keep_build_dir,
        max_concurrency,
        youtube_store,
        language,
        locale_name,
        tags,
        dateafter,
        use_any_optimized_version,
        s3_url_with_credentials,
        title=None,
        description=None,
        creator=None,
        publisher=None,
        name=None,
        profile_image=None,
        banner_image=None,
        main_color=None,
        secondary_color=None,
    ):
        # data-retrieval info
        self.collection_type = collection_type
        self.youtube_id = youtube_id
        self.api_key = api_key
        self.dateafter = dateafter

        # video-encoding info
        self.video_format = video_format
        self.low_quality = low_quality

        # options & zim params
        self.nb_videos_per_page = nb_videos_per_page
        self.all_subtitles = all_subtitles
        self.autoplay = autoplay
        self.fname = fname
        self.language = language
        self.tags = [t.strip() for t in tags.split(",")]
        self.title = title
        self.description = description
        self.creator = creator
        self.publisher = publisher
        self.name = name
        self.profile_image = profile_image
        self.banner_image = banner_image
        self.main_color = main_color
        self.secondary_color = secondary_color

        # directory setup
        self.output_dir = Path(output_dir).expanduser().resolve()
        if tmp_dir:
            tmp_dir = Path(tmp_dir).expanduser().resolve()
            tmp_dir.mkdir(parents=True, exist_ok=True)
        self.build_dir = Path(tempfile.mkdtemp(dir=tmp_dir))

        # process-related
        self.playlists = []
        self.uploads_playlist_id = None
        self.videos_ids = []
        self.main_channel_id = None  # use for branding

        # debug/devel options
        self.no_zim = no_zim
        self.debug = debug
        self.keep_build_dir = keep_build_dir
        self.max_concurrency = max_concurrency

        # update youtube credentials store
        youtube_store.update(build_dir=self.build_dir,
                             api_key=self.api_key,
                             cache_dir=self.cache_dir)

        # Optimization-cache
        self.s3_url_with_credentials = s3_url_with_credentials
        self.use_any_optimized_version = use_any_optimized_version
        self.video_quality = "low" if self.low_quality else "high"
        self.s3_storage = None

        # set and record locale for translations
        locale_name = locale_name or get_language_details(
            self.language)["iso-639-1"]
        try:
            self.locale = setlocale(ROOT_DIR, locale_name)
        except locale.Error:
            logger.error(
                f"No locale for {locale_name}. Use --locale to specify it. defaulting to en_US"
            )
            self.locale = setlocale(ROOT_DIR, "en")

    @property
    def root_dir(self):
        return ROOT_DIR

    @property
    def templates_dir(self):
        return self.root_dir.joinpath("templates")

    @property
    def assets_src_dir(self):
        return self.templates_dir.joinpath("assets")

    @property
    def assets_dir(self):
        return self.build_dir.joinpath("assets")

    @property
    def channels_dir(self):
        return self.build_dir.joinpath("channels")

    @property
    def cache_dir(self):
        return self.build_dir.joinpath("cache")

    @property
    def videos_dir(self):
        return self.build_dir.joinpath("videos")

    @property
    def profile_path(self):
        return self.build_dir.joinpath("profile.jpg")

    @property
    def banner_path(self):
        return self.build_dir.joinpath("banner.jpg")

    @property
    def is_user(self):
        return self.collection_type == USER

    @property
    def is_channel(self):
        return self.collection_type == CHANNEL

    @property
    def is_playlist(self):
        return self.collection_type == PLAYLIST

    @property
    def is_single_channel(self):
        if self.is_channel or self.is_user:
            return True
        return len(list(set([pl.creator_id for pl in self.playlists]))) == 1

    @property
    def sorted_playlists(self):
        """ sorted list of playlists (by title) but with Uploads one at first if any """
        if len(self.playlists) < 2:
            return self.playlists

        sorted_playlists = sorted(self.playlists, key=lambda x: x.title)
        index = 0
        # make sure our Uploads, special playlist is first
        if self.uploads_playlist_id:
            try:
                index = [
                    index for index, p in enumerate(sorted_playlists)
                    if p.playlist_id == self.uploads_playlist_id
                ][-1]
            except Exception:
                index = 0
        return ([sorted_playlists[index]] + sorted_playlists[0:index] +
                sorted_playlists[index + 1:])

    def run(self):
        """ execute the scraper step by step """

        self.validate_id()

        # validate dateafter input
        self.validate_dateafter_input()

        logger.info(
            f"starting youtube scraper for {self.collection_type}#{self.youtube_id}"
        )
        logger.info("preparing build folder at {}".format(
            self.build_dir.resolve()))
        self.prepare_build_folder()

        logger.info("testing Youtube credentials")
        if not credentials_ok():
            raise ValueError(
                "Unable to connect to Youtube API v3. check `API_KEY`.")

        if self.s3_url_with_credentials and not self.s3_credentials_ok():
            raise ValueError(
                "Unable to connect to Optimization Cache. Check its URL.")

        # fail early if supplied branding files are missing
        self.check_branding_values()

        logger.info("compute playlists list to retrieve")
        self.extract_playlists()

        logger.info(".. {} playlists:\n   {}".format(
            len(self.playlists),
            "\n   ".join([p.playlist_id for p in self.playlists]),
        ))

        logger.info("compute list of videos")
        self.extract_videos_list()

        nb_videos_msg = f".. {len(self.videos_ids)} videos"
        if self.dateafter.start.year != 1:
            nb_videos_msg += (
                f" in date range: {self.dateafter.start} - {datetime.date.today()}"
            )
        logger.info(f"{nb_videos_msg}.")

        # download videos (and recompress)
        logger.info(
            f"downloading all videos, subtitles and thumbnails (concurrency={self.max_concurrency})"
        )
        logger.info(f"  format: {self.video_format}")
        logger.info(f"  quality: {self.video_quality}")
        logger.info(f"  generated-subtitles: {self.all_subtitles}")
        if self.s3_storage:
            logger.info(
                f"  using cache: {self.s3_storage.url.netloc} with bucket: {self.s3_storage.bucket_name}"
            )
        succeeded, failed = self.download_video_files(
            max_concurrency=self.max_concurrency)
        if failed:
            logger.error(
                f"{len(failed)} video(s) failed to download: {failed}")
            if len(failed) >= len(succeeded):
                logger.critical("More than half of videos failed. exiting")
                raise IOError("Too much videos failed to download")

        logger.info("retrieve channel-info for all videos (author details)")
        get_videos_authors_info(succeeded)

        logger.info("download all author's profile pictures")
        self.download_authors_branding()

        logger.info("update general metadata")
        self.update_metadata()

        logger.info("creating HTML files")
        self.make_html_files(succeeded)

        # make zim file
        os.makedirs(self.output_dir, exist_ok=True)
        if not self.no_zim:
            period = datetime.datetime.now().strftime("%Y-%m")
            self.fname = (self.fname.format(
                period=period) if self.fname else f"{self.name}_{period}.zim")
            logger.info("building ZIM file")
            make_zim_file(
                build_dir=self.build_dir,
                fpath=self.output_dir / self.fname,
                name=self.name,
                main_page="home.html",
                favicon="favicon.jpg",
                title=self.title,
                description=self.description,
                language=self.language,
                creator=self.creator,
                publisher="Kiwix",
                tags=self.tags,
                scraper=SCRAPER,
            )

            if not self.keep_build_dir:
                logger.info("removing temp folder")
                shutil.rmtree(self.build_dir, ignore_errors=True)

        logger.info("all done!")

    def s3_credentials_ok(self):
        logger.info("testing S3 Optimization Cache credentials")
        self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
        if not self.s3_storage.check_credentials(list_buckets=True,
                                                 bucket=True,
                                                 write=True,
                                                 read=True,
                                                 failsafe=True):
            logger.error("S3 cache connection error testing permissions.")
            logger.error(f"  Server: {self.s3_storage.url.netloc}")
            logger.error(f"  Bucket: {self.s3_storage.bucket_name}")
            logger.error(f"  Key ID: {self.s3_storage.params.get('keyid')}")
            logger.error(f"  Public IP: {get_public_ip()}")
            return False
        return True

    def validate_dateafter_input(self):
        try:
            self.dateafter = youtube_dl.DateRange(self.dateafter)
        except Exception as exc:
            logger.error(
                f"Invalid dateafter input. Valid dateafter format: "
                f"YYYYMMDD or (now|today)[+-][0-9](day|week|month|year)(s).")
            raise ValueError(f"Invalid dateafter input: {exc}")

    def validate_id(self):
        # space not allowed in youtube-ID
        self.youtube_id = self.youtube_id.replace(" ", "")
        if self.collection_type == "channel" and len(self.youtube_id) > 24:
            raise ValueError("Invalid ChannelId")
        if "," in self.youtube_id and self.collection_type != "playlist":
            raise ValueError("Invalid YoutubeId")

    def prepare_build_folder(self):
        """ prepare build folder before we start downloading data """

        # copy assets
        shutil.copytree(self.assets_src_dir, self.assets_dir)

        fix_source_dir(self.assets_dir, "assets")

        # cache folder to store youtube-api results
        self.cache_dir.mkdir(exist_ok=True)

        # make videos placeholder
        self.videos_dir.mkdir(exist_ok=True)

        # make channels placeholder (profile files)
        self.channels_dir.mkdir(exist_ok=True)

    def check_branding_values(self):
        """checks that user-supplied images and colors are valid (so to fail early)

        Images are checked for existence or downloaded then resized
        Colors are check for validity"""

        # skip this step if none of related values were supplied
        if not sum([
                bool(x) for x in (
                    self.profile_image,
                    self.banner_image,
                    self.main_color,
                    self.secondary_color,
                )
        ]):
            return
        logger.info("checking your branding files and values")
        if self.profile_image:
            if self.profile_image.startswith("http"):
                stream_file(self.profile_image, self.profile_path)
            else:
                if not self.profile_image.exists():
                    raise IOError(
                        f"--profile image could not be found: {self.profile_image}"
                    )
                shutil.move(self.profile_image, self.profile_path)
            resize_image(self.profile_path,
                         width=100,
                         height=100,
                         method="thumbnail")
        if self.banner_image:
            if self.banner_image.startswith("http"):
                stream_file(self.banner_image, self.banner_path)
            else:
                if not self.banner_image.exists():
                    raise IOError(
                        f"--banner image could not be found: {self.banner_image}"
                    )
                shutil.move(self.banner_image, self.banner_path)
            resize_image(self.banner_path,
                         width=1060,
                         height=175,
                         method="thumbnail")

        if self.main_color and not is_hex_color(self.main_color):
            raise ValueError(
                f"--main-color is not a valid hex color: {self.main_color}")

        if self.secondary_color and not is_hex_color(self.secondary_color):
            raise ValueError(
                f"--secondary_color-color is not a valid hex color: {self.secondary_color}"
            )

    def extract_playlists(self):
        """prepare a list of Playlist from user request

        USER: we fetch the hidden channel associate to it
        CHANNEL (and USER): we grab all playlists + `uploads` playlist
        PLAYLIST: we retrieve from the playlist Id(s)"""

        (
            self.playlists,
            self.main_channel_id,
            self.uploads_playlist_id,
        ) = extract_playlists_details_from(self.collection_type,
                                           self.youtube_id)

    def extract_videos_list(self):
        all_videos = load_json(self.cache_dir, "videos")
        if all_videos is None:
            all_videos = {}

            # we only return video_ids that we'll use later on. per-playlist JSON stored
            for playlist in self.playlists:
                videos_json = get_videos_json(playlist.playlist_id)
                # filter in videos within date range and filter away deleted videos
                skip_outofrange = functools.partial(skip_outofrange_videos,
                                                    self.dateafter)
                filter_videos = filter(skip_outofrange, videos_json)
                filter_videos = filter(skip_deleted_videos, filter_videos)
                all_videos.update(
                    {v["contentDetails"]["videoId"]: v
                     for v in filter_videos})
            save_json(self.cache_dir, "videos", all_videos)
        self.videos_ids = [*all_videos.keys()
                           ]  # unpacking so it's subscriptable

    def download_video_files(self, max_concurrency):

        audext, vidext = {
            "webm": ("webm", "webm"),
            "mp4": ("m4a", "mp4")
        }[self.video_format]

        # prepare options which are shared with every downloader
        options = {
            "cachedir": self.videos_dir,
            "writethumbnail": True,
            "write_all_thumbnails": False,
            "writesubtitles": True,
            "allsubtitles": True,
            "subtitlesformat": "vtt",
            "keepvideo": False,
            "ignoreerrors": False,
            "retries": 20,
            "fragment-retries": 50,
            "skip-unavailable-fragments": True,
            # "external_downloader": "aria2c",
            # "external_downloader_args": ["--max-tries=20", "--retry-wait=30"],
            "outtmpl": str(self.videos_dir.joinpath("%(id)s",
                                                    "video.%(ext)s")),
            "preferredcodec": self.video_format,
            "format":
            f"best[ext={vidext}]/bestvideo[ext={vidext}]+bestaudio[ext={audext}]/best",
            "y2z_videos_dir": self.videos_dir,
        }
        if self.all_subtitles:
            options.update({"writeautomaticsub": True})

        # find number of actuall parallel workers
        nb_videos = len(self.videos_ids)
        concurrency = nb_videos if nb_videos < max_concurrency else max_concurrency

        # short-circuit concurency if we have only one thread (can help debug)
        if concurrency <= 1:
            return self.download_video_files_batch(options, self.videos_ids)

        # prepare out videos_ids batches
        def get_slot():
            n = 0
            while True:
                yield n
                n += 1
                if n >= concurrency:
                    n = 0

        batches = [[] for _ in range(0, concurrency)]
        slot = get_slot()
        for video_id in self.videos_ids:
            batches[next(slot)].append(video_id)

        overall_succeeded = []
        overall_failed = []
        # execute the batches concurrently
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=concurrency) as executor:
            fs = [
                executor.submit(self.download_video_files_batch, options,
                                videos_ids) for videos_ids in batches
            ]
            done, not_done = concurrent.futures.wait(
                fs, return_when=concurrent.futures.ALL_COMPLETED)

            # we have some `not_done` batches, indicating errors within
            if not_done:
                logger.critical(
                    "Not all video-processing batches completed. Cancelling…")
                for future in not_done:
                    exc = future.exception()
                    if exc:
                        logger.exception(exc)
                        raise exc

            # retrieve our list of successful/failed video_ids
            for future in done:
                succeeded, failed = future.result()
                overall_succeeded += succeeded
                overall_failed += failed

        # remove left-over files for failed downloads
        logger.debug(
            f"removing left-over files of {len(overall_failed)} failed videos")
        for video_id in overall_failed:
            shutil.rmtree(self.videos_dir.joinpath(video_id),
                          ignore_errors=True)

        return overall_succeeded, overall_failed

    def download_from_cache(self, key, video_path, encoder_version):
        """ whether it successfully downloaded from cache """
        if self.use_any_optimized_version:
            if not self.s3_storage.has_object(key,
                                              self.s3_storage.bucket_name):
                return False
        else:
            if not self.s3_storage.has_object_matching_meta(
                    key, tag="encoder_version", value=f"v{encoder_version}"):
                return False
        video_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.s3_storage.download_file(key, video_path)
        except Exception as exc:
            logger.error(f"{key} failed to download from cache: {exc}")
            return False
        logger.info(f"downloaded {video_path} from cache at {key}")
        return True

    def upload_to_cache(self, key, video_path, encoder_version):
        """ whether it successfully uploaded to cache """
        try:
            self.s3_storage.upload_file(
                video_path,
                key,
                meta={"encoder_version": f"v{encoder_version}"})
        except Exception as exc:
            logger.error(f"{key} failed to upload to cache: {exc}")
            return False
        logger.info(f"uploaded {video_path} to cache at {key}")
        return True

    def download_video(self, video_id, options):
        """ download the video from cache/youtube and return True if successful """

        preset = {"mp4": VideoMp4Low}.get(self.video_format, VideoWebmLow)()
        options_copy = options.copy()
        video_location = options_copy["y2z_videos_dir"].joinpath(video_id)
        video_path = video_location.joinpath(f"video.{self.video_format}")

        if self.s3_storage:
            s3_key = f"{self.video_format}/{self.video_quality}/{video_id}"
            logger.debug(
                f"Attempting to download video file for {video_id} from cache..."
            )
            if self.download_from_cache(s3_key, video_path, preset.VERSION):
                return True

        try:
            # skip downloading the thumbnails
            options_copy.update({
                "writethumbnail": False,
                "writesubtitles": False,
                "allsubtitles": False,
                "writeautomaticsub": False,
            })
            with youtube_dl.YoutubeDL(options_copy) as ydl:
                ydl.download([video_id])
            post_process_video(
                video_location,
                video_id,
                preset,
                self.video_format,
                self.low_quality,
            )
        except (
                youtube_dl.utils.DownloadError,
                FileNotFoundError,
                subprocess.CalledProcessError,
        ) as exc:
            logger.error(f"Video file for {video_id} could not be downloaded")
            logger.debug(exc)
            return False
        else:  # upload to cache only if everything went well
            if self.s3_storage:
                logger.debug(
                    f"Uploading video file for {video_id} to cache ...")
                self.upload_to_cache(s3_key, video_path, preset.VERSION)
            return True

    def download_thumbnail(self, video_id, options):
        """ download the thumbnail from cache/youtube and return True if successful """

        preset = WebpHigh()
        options_copy = options.copy()
        video_location = options_copy["y2z_videos_dir"].joinpath(video_id)
        thumbnail_path = video_location.joinpath("video.webp")

        if self.s3_storage:
            s3_key = f"thumbnails/high/{video_id}"
            logger.debug(
                f"Attempting to download thumbnail for {video_id} from cache..."
            )
            if self.download_from_cache(s3_key, thumbnail_path,
                                        preset.VERSION):
                return True

        try:
            # skip downloading the video
            options_copy.update({
                "skip_download": True,
                "writesubtitles": False,
                "allsubtitles": False,
                "writeautomaticsub": False,
            })
            with youtube_dl.YoutubeDL(options_copy) as ydl:
                ydl.download([video_id])
            process_thumbnail(thumbnail_path, preset)
        except (
                youtube_dl.utils.DownloadError,
                FileNotFoundError,
                subprocess.CalledProcessError,
        ) as exc:
            logger.error(f"Thumbnail for {video_id} could not be downloaded")
            logger.debug(exc)
            return False
        else:  # upload to cache only if everything went well
            if self.s3_storage:
                logger.debug(
                    f"Uploading thumbnail for {video_id} to cache ...")
                self.upload_to_cache(s3_key, thumbnail_path, preset.VERSION)
            return True

    def download_subtitles(self, video_id, options):
        """ download subtitles for a video """

        options_copy = options.copy()
        options_copy.update({"skip_download": True, "writethumbnail": False})
        try:
            with youtube_dl.YoutubeDL(options_copy) as ydl:
                ydl.download([video_id])
        except Exception:
            logger.error(f"Could not download subtitles for {video_id}")

    def download_video_files_batch(self, options, videos_ids):
        """ download video file and thumbnail for all videos in batch and return succeeded and failed video ids """

        succeeded = []
        failed = []
        for video_id in videos_ids:
            if self.download_video(video_id,
                                   options) and self.download_thumbnail(
                                       video_id, options):
                self.download_subtitles(video_id, options)
                succeeded.append(video_id)
            else:
                failed.append(video_id)
        return succeeded, failed

    def download_authors_branding(self):
        videos_channels_json = load_json(self.cache_dir, "videos_channels")
        uniq_channel_ids = list(
            set([chan["channelId"] for chan in videos_channels_json.values()]))
        for channel_id in uniq_channel_ids:
            save_channel_branding(self.channels_dir,
                                  channel_id,
                                  save_banner=False)
            self.copy_default_banner(channel_id)

    def copy_default_banner(self, channel_id):
        banner_path = self.channels_dir / channel_id / "banner.jpg"
        if not banner_path.exists():
            shutil.copy(
                self.templates_dir / "assets" / "banner.jpg",
                self.channels_dir / channel_id / "banner.jpg",
            )

    def update_metadata(self):
        # we use title, description, profile and banner of channel/user
        # or channel of first playlist
        try:
            main_channel_json = get_channel_json(self.main_channel_id)
        except KeyError:
            main_channel_json = {
                "snippet": {
                    "title": "Unknown",
                    "description": ""
                }
            }
        else:
            save_channel_branding(self.channels_dir,
                                  self.main_channel_id,
                                  save_banner=True)
        self.copy_default_banner(self.main_channel_id)

        # if a single playlist was requested, use if for names;
        # otherwise, use main_channel's details.
        auto_title = (self.playlists[0].title
                      if self.is_playlist and len(self.playlists) == 1 else
                      main_channel_json["snippet"]["title"].strip())
        auto_description = (clean_text(self.playlists[0].description)
                            if self.is_playlist and len(self.playlists) == 1
                            else clean_text(
                                main_channel_json["snippet"]["description"]))
        self.title = self.title or auto_title or "-"
        self.description = self.description or auto_description or "-"

        if self.creator is None:
            if self.is_single_channel:
                self.creator = _("Youtube Channel “{title}”").format(
                    title=main_channel_json["snippet"]["title"])
            else:
                self.creator = _("Youtube Channels")
        self.publisher = self.publisher or "Kiwix"

        self.tags = self.tags or ["youtube"]
        if "_videos:yes" not in self.tags:
            self.tags.append("_videos:yes")

        # copy our main_channel branding into /(profile|banner).jpg if not supplied
        if not self.profile_path.exists():
            shutil.copy(
                self.channels_dir.joinpath(self.main_channel_id,
                                           "profile.jpg"),
                self.profile_path,
            )
        if not self.banner_path.exists():
            shutil.copy(
                self.channels_dir.joinpath(self.main_channel_id, "banner.jpg"),
                self.banner_path,
            )

        # set colors from images if not supplied
        if self.main_color is None or self.secondary_color is None:
            profile_main, profile_secondary = get_colors(self.profile_path)
        self.main_color = self.main_color or profile_main
        self.secondary_color = self.secondary_color or profile_secondary

        resize_image(
            self.profile_path,
            width=48,
            height=48,
            method="thumbnail",
            dst=self.build_dir.joinpath("favicon.jpg"),
        )

    def make_html_files(self, actual_videos_ids):
        """make up HTML structure to read the content

        /home.html                                  Homepage

        for each video:
            - <slug-title>.html                     HTML article
            - videos/<videoId>/video.<ext>          video file
            - videos/<videoId>/video.<lang>.vtt     subtititle(s)
            - videos/<videoId>/video.webp            template
        """
        def remove_unused_videos(videos):
            video_ids = [
                video["contentDetails"]["videoId"] for video in videos
            ]
            for path in self.videos_dir.iterdir():
                if path.is_dir() and path.name not in video_ids:
                    logger.debug(f"Removing unused video {path.name}")
                    shutil.rmtree(path, ignore_errors=True)

        def is_present(video):
            """ whether this video has actually been succeffuly downloaded """
            return video["contentDetails"]["videoId"] in actual_videos_ids

        def video_has_channel(videos_channels, video):
            return video["contentDetails"]["videoId"] in videos_channels

        def get_subtitles(video_id):
            video_dir = self.videos_dir.joinpath(video_id)
            languages = [
                x.stem.split(".")[1] for x in video_dir.iterdir()
                if x.is_file() and x.name.endswith(".vtt")
            ]

            def to_jinja_subtitle(lang):
                try:
                    subtitle = get_language_details(
                        YOUTUBE_LANG_MAP.get(lang, lang))
                except Exception:
                    logger.error(f"Failed to get language details for {lang}")
                    raise
                return {
                    "code": lang,
                    # Youtube.com uses `English - code` format.
                    # Note: videojs displays it lowercased anyway
                    "name":
                    f"{subtitle['english'].title()} - {subtitle['query']}",
                }

            # Youtube.com sorts subtitles by English name
            return sorted(map(to_jinja_subtitle, languages),
                          key=lambda x: x["name"])

        env = jinja2.Environment(loader=jinja2.FileSystemLoader(
            str(self.templates_dir)),
                                 autoescape=True)

        videos = load_json(self.cache_dir, "videos").values()
        # filter videos so we only include the ones we could retrieve
        videos = list(filter(is_present, videos))
        videos_channels = load_json(self.cache_dir, "videos_channels")
        has_channel = functools.partial(video_has_channel, videos_channels)
        # filter videos to exclude those for which we have no channel (#76)
        videos = list(filter(has_channel, videos))
        for video in videos:
            video_id = video["contentDetails"]["videoId"]
            title = video["snippet"]["title"]
            slug = get_slug(title)
            description = video["snippet"]["description"]
            publication_date = dt_parser.parse(
                video["contentDetails"]["videoPublishedAt"])
            author = videos_channels[video_id]
            subtitles = get_subtitles(video_id)
            video_url = f"https://www.youtube.com/watch?v={video_id}"

            html = env.get_template("article.html").render(
                video_id=video_id,
                video_format=self.video_format,
                author=author,
                title=title,
                description=description,
                date=format_date(publication_date,
                                 format="medium",
                                 locale=self.locale),
                subtitles=subtitles,
                url=video_url,
                channel_id=video["snippet"]["channelId"],
                color=self.main_color,
                background_color=self.secondary_color,
                autoplay=self.autoplay,
            )
            with open(self.build_dir.joinpath(f"{slug}.html"),
                      "w",
                      encoding="utf-8") as fp:
                fp.write(html)

        # build homepage
        html = env.get_template("home.html").render(
            playlists=self.playlists,
            video_format=self.video_format,
            title=self.title,
            description=self.description,
            color=self.main_color,
            background_color=self.secondary_color,
            page_label=_("Page {current}/{total}"),
            back_label=_("Back to top"),
        )
        with open(self.build_dir.joinpath("home.html"), "w",
                  encoding="utf-8") as fp:
            fp.write(html)

        # rewrite app.js including `format`
        with open(self.assets_dir.joinpath("app.js"), "w",
                  encoding="utf-8") as fp:
            fp.write(
                env.get_template("assets/app.js").render(
                    video_format=self.video_format))

        # rewrite app.js including `pagination`
        with open(self.assets_dir.joinpath("db.js"), "w",
                  encoding="utf-8") as fp:
            fp.write(
                env.get_template("assets/db.js").render(
                    NB_VIDEOS_PER_PAGE=self.nb_videos_per_page))

        # write list of videos in data.js
        def to_data_js(video):
            return {
                "id":
                video["contentDetails"]["videoId"],
                "title":
                video["snippet"]["title"],
                "slug":
                get_slug(video["snippet"]["title"]),
                "description":
                video["snippet"]["description"],
                "subtitles":
                get_subtitles(video["contentDetails"]["videoId"]),
                "thumbnail":
                str(
                    Path("videos").joinpath(video["contentDetails"]["videoId"],
                                            "video.webp")),
            }

        with open(self.assets_dir.joinpath("data.js"), "w",
                  encoding="utf-8") as fp:
            # write all playlists as they are
            for playlist in self.playlists:
                # retrieve list of videos for PL
                playlist_videos = load_json(
                    self.cache_dir, f"playlist_{playlist.playlist_id}_videos")
                # filtering-out missing ones (deleted or not downloaded)
                playlist_videos = list(
                    filter(skip_deleted_videos, playlist_videos))
                playlist_videos = list(filter(is_present, playlist_videos))
                playlist_videos = list(filter(has_channel, playlist_videos))
                # sorting them based on playlist
                playlist_videos.sort(key=lambda v: v["snippet"]["position"])

                fp.write("var json_{slug} = {json_str};\n".format(
                    slug=playlist.slug,
                    json_str=json.dumps(list(map(to_data_js, playlist_videos)),
                                        indent=4),
                ))

        # write a metadata.json file with some content-related data
        with open(self.build_dir.joinpath("metadata.json"),
                  "w",
                  encoding="utf-8") as fp:
            json.dump({"video_format": self.video_format}, fp, indent=4)

        # clean videos left out in videos directory
        remove_unused_videos(videos)

Example #13

Show file

    def upload_image_s3(self):
        self.logger.info("Starting S3 upload")

        # add credentials to URL
        url = urllib.parse.urlparse(self.task["upload_uri"])
        qs = urllib.parse.parse_qs(url.query)
        qs["keyId"] = Setting.s3_access_key
        qs["secretAccessKey"] = Setting.s3_secret_key

        # setup upload logging
        uploader_log = io.StringIO()
        uploader_logger = logging.getLogger("uploader_log")
        uploader_logger.propagate = True
        uploader_logger.setLevel(logging.DEBUG)
        uploader_logger.addHandler(logging.StreamHandler(stream=uploader_log))

        # init and test storage
        uploader_logger.info("initializing S3")
        s3_storage = KiwixStorage(
            urllib.parse.SplitResult(
                "https",
                url.netloc,
                url.path,
                urllib.parse.urlencode(qs, doseq=True),
                url.fragment,
            ).geturl())
        uploader_logger.debug(
            f"S3 initialized for {s3_storage.url.netloc}/{s3_storage.bucket_name}"
        )

        # torrent
        dl_url = urllib.parse.urlparse(self.task["download_uri"])
        upload_torrent = "torrent" in dl_url.scheme

        if upload_torrent:
            parts = list(urllib.parse.urlsplit(dl_url.geturl()))
            parts[0] = parts[0].replace("+torrent", "")
            dl_url = urllib.parse.urlparse(urllib.parse.urlunsplit(parts))

            uploader_logger.info(
                f"Creating torrent file for {self.img_path.name}")
            torrent_path = self.img_path.with_suffix(
                f"{self.img_path.suffix}.torrent")
            download_url = f"{dl_url.geturl()}/{self.img_path.name}"
            torrent = torf.Torrent(
                path=self.img_path,
                trackers=[
                    "https://opentracker.xyz:443/announce",
                    "http://torrent.nwps.ws:80/announce",
                    "udp://tracker.open-internet.nl:6969/announce",
                    "udp://tracker.coppersurfer.tk:6969/announce",
                    "udp://tracker.openbittorrent.com:80/announce",
                ],
                webseeds=[download_url],
            )
            torrent.generate()
            torrent.write(torrent_path)
            uploader_logger.info(f".. created {torrent_path.name}")

            uploader_logger.info(f"Uploading {torrent_path.name}")
            s3_storage.upload_file(fpath=str(torrent_path),
                                   key=torrent_path.name)
            uploader_logger.info(".. uploaded")
            torrent_path.unlink()

        # upload
        uploader_logger.info(f"Uploading to {self.img_path.name}")
        try:
            hook = ImageTransferHook(output=uploader_log, fpath=self.img_path)
            s3_storage.upload_file(fpath=str(self.img_path),
                                   key=self.img_path.name,
                                   Callback=hook)
            uploaded = True
        except Exception as exc:
            uploaded = False
            uploader_logger.error(f"uploader failed: {exc}")
            uploader_logger.exception(exc)
        else:
            uploader_logger.info("uploader ran successfuly.")

        # setting autodelete
        try:
            # make sure autodelete is above bucket's min retention (should be 1d)
            expire_on = datetime.datetime.now() + datetime.timedelta(
                days=max([2, self.task["media_duration"]]))
            uploader_logger.info(f"Setting autodelete to {expire_on}")
            s3_storage.set_object_autodelete_on(key=self.img_path.name,
                                                on=expire_on)
        except Exception as exc:
            uploader_logger.error("Failed to set autodelete")
            uploader_logger.exception(exc)

        if upload_torrent:
            try:
                uploader_logger.info(
                    f"Setting torrent autodelete to {expire_on}")
                s3_storage.set_object_autodelete_on(key=torrent_path.name,
                                                    on=expire_on)
            except Exception as exc:
                uploader_logger.error("Failed to set autodelete on torrent")
                uploader_logger.exception(exc)

        self.logger.info("collecting uploader log")
        try:
            self.logs["uploader_log"] = uploader_log.getvalue()
            uploader_log.close()
        except Exception as exc:
            self.logger.error(f"Failed to collect logs: {exc}")

        # remove image
        try:
            self.logger.info("removing image file: {}".format(
                self.img_path.name))
            self.img_path.unlink()
        except Exception as exp:
            self.logger.error("Unable to remove image file: {}".format(exp))
            self.logger.exception(exp)

        if not uploaded:
            raise subprocess.SubprocessError("S3 upload failed")

Example #14

Show file

File: download.py Project: palash-cj/cardshop

    def download_image_s3(self):
        self.logger.info("Starting s3 image download")

        # add credentials to URL
        url = urllib.parse.urlparse(self.task["download_uri"])
        qs = urllib.parse.parse_qs(url.query)
        qs["keyId"] = Setting.s3_access_key
        qs["secretAccessKey"] = Setting.s3_secret_key

        # setup download logging
        downloader_log = io.StringIO()
        downloader_logger = logging.getLogger("downloader_log")
        downloader_logger.propagate = True
        downloader_logger.setLevel(logging.DEBUG)
        downloader_logger.addHandler(logging.StreamHandler(stream=downloader_log))

        # init and test storage
        downloader_logger.info("initializing S3")
        s3_storage = KiwixStorage(
            urllib.parse.SplitResult(
                "https",
                url.netloc,
                url.path,
                urllib.parse.urlencode(qs, doseq=True),
                url.fragment,
            ).geturl()
        )
        downloader_logger.debug(
            f"S3 initialized for {s3_storage.url.netloc}/{s3_storage.bucket_name}"
        )

        # download
        downloader_logger.info(f"Downloading from {self.img_path.name}")
        try:
            hook = ImageTransferHook(
                output=downloader_log,
                size=s3_storage.get_object_stat(key=self.img_path.name).size,
                name=self.img_path.name,
            )
            s3_storage.download_file(
                key=self.img_path.name, fpath=str(self.img_path), Callback=hook
            )
            downloaded = True
        except Exception as exc:
            downloaded = False
            downloader_logger.error(f"downloader failed: {exc}")
            downloader_logger.exception(exc)
        else:
            downloader_logger.info("downloader ran successfuly.")

        if downloaded:
            # image downloaded, mark for autodeletion
            try:
                autodelete_on = datetime.datetime.now() + datetime.timedelta(days=1)
                downloader_logger.info(f"Setting autodelete to now ({autodelete_on})")
                s3_storage.set_object_autodelete_on(
                    key=self.img_path.name, on=autodelete_on
                )
            except Exception as exc:
                downloader_logger.error(
                    "Failed to set autodelete (normal if before bucket retention)"
                )
                downloader_logger.exception(exc)

        self.logger.info("collecting downloader log")
        try:
            self.logs["downloader_log"] = downloader_log.getvalue()
            downloader_log.close()
        except Exception as exc:
            self.logger.error(f"Failed to collect logs: {exc}")

        if not downloaded:
            raise subprocess.SubprocessError("S3 download failed")