def __init__(self,
             source_id,
             name=None,
             type_name="Youtube",
             lang="ar",
             embeded=False,
             section_title=None,
             author=None,
             license=None):
    """
    Set up a YouTube resource node for the given video URL.

    Args:
        source_id: URL of the video; it is normalised before being stored.
        name: optional display name kept on the node.
        type_name: label logged as the resource type and kept on the node.
        lang: language code forwarded to the Node initialiser.
        embeded: pass the literal ``True`` when `source_id` is an embed URL
            that must go through `transform_embed`; any other value sends
            the URL through `self.clean_url` instead.
        section_title: optional title of the section this video belongs to.
        author: forwarded to the Node initialiser.
        license: forwarded to the Node initialiser.
    """
    # Identity test on purpose: only the literal True selects the embed path.
    normalise = (YouTubeResourceNode.transform_embed
                 if embeded is True else self.clean_url)
    source_id = normalise(source_id)

    # Run both initialisers explicitly on this instance.
    YouTubeResource.__init__(self, source_id)
    Node.__init__(self,
                  title=None,
                  source_id=source_id,
                  lang=lang,
                  author=author,
                  license=license)

    LOGGER.info("    + Resource Type: {}".format(type_name))
    LOGGER.info("    - URL: {}".format(self.source_id))

    # Descriptive metadata supplied by the caller.
    self.name = name
    self.type_name = type_name
    self.section_title = section_title

    # Download state: nothing has been fetched or validated yet.
    self.filename = None
    self.filepath = None
    self.file_format = file_formats.MP4
    self.is_valid = False
Exemple #2
0
    def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None):
        """
        Return the YouTube info for `self.url`, using a JSON cache file.

        The cache file at `self.cache_path` is consulted first (when
        `use_cache` is true and the file exists). If it yields nothing, the
        info is requested through `YouTubeResource` and, when non-empty,
        written back to the cache file.

        Args:
            use_proxy: forwarded to `YouTubeResource` as `useproxy`.
            use_cache: when false, the cache file is not read.
            options: forwarded to `get_resource_info`.

        Returns:
            The info loaded from cache or returned by `get_resource_info`,
            or None when the resource could not be created or queried.
        """
        youtube_info = None
        # 1. Try to get from cache if allowed:
        if os.path.exists(self.cache_path) and use_cache:
            LOGGER.info("==> [%s] Retrieving cached information...", self)
            with open(self.cache_path) as cache_file:
                youtube_info = json.load(cache_file)
        if youtube_info:
            return youtube_info

        # 2. Fetch info from youtube_dl
        LOGGER.info("==> [%s] Requesting info from youtube...", self)
        os.makedirs(self.cache_dir, exist_ok=True)
        try:
            youtube_resource = YouTubeResource(self.url, useproxy=use_proxy)
        except youtube_dl.utils.ExtractorError as e:
            if "unavailable" in str(e):
                LOGGER.error("==> [%s] Resource unavailable for URL: %s", self, self.url)
            else:
                # Previously this case fell through to an unbound local and
                # crashed; report it like any other fetch failure instead.
                LOGGER.error("==> [%s] Failed to get YouTube info: %s", self, e)
            return None

        if not youtube_resource:
            return youtube_info

        try:
            youtube_info = youtube_resource.get_resource_info(options)
            if youtube_info:
                # Save YouTube info to JSON cache file
                with open(self.cache_path, 'w') as cache_file:
                    json.dump(youtube_info,
                              cache_file,
                              indent=4,
                              ensure_ascii=False,
                              sort_keys=True)
            else:
                LOGGER.error("==> [%s] Failed to extract YouTube info", self)
        except Exception as e:
            LOGGER.error("==> [%s] Failed to get YouTube info: %s", self, e)
            return None
        return youtube_info
Exemple #3
0
 def build_video_nodes(self, base_path, content):
     """
     Yield a downloaded YouTubeResourceNode for each video URL in `content`.

     URLs come from `self.get_videos_urls(content)`; only those that
     `YouTubeResource` recognises as YouTube URLs and not as channels are
     turned into nodes. This is a generator: nothing is downloaded until
     it is iterated.

     Args:
         base_path: currently ignored -- it is replaced by
             `build_path([DATA_DIR])` before use.
             NOTE(review): confirm whether callers expect it to be honoured.
         content: parsed page content handed to `get_videos_urls`.
     """
     videos_url = self.get_videos_urls(content)
     base_path = build_path([DATA_DIR])
     for video_url in videos_url:
         if not YouTubeResource.is_youtube(video_url):
             continue
         if YouTubeResource.is_channel(video_url):
             continue
         video = YouTubeResourceNode(video_url, lang=self.lang)
         video.download(download=DOWNLOAD_VIDEOS, base_path=base_path)
         yield video
    def download_info(self, use_cache=True):
        """
        Load video info (from the JSON cache or YouTube) and populate
        `uid`, `title`, `description` and the license attributes.

        The info is cached as `<youtube_id>.json` inside YOUTUBE_CACHE_DIR.

        Args:
            use_cache: when false, an existing cache file is not read and
                the info is always requested again.

        Returns:
            True when the info was obtained and the attributes were set,
            False when the URL does not match YOUTUBE_ID_REGEX, the video is
            unavailable, or fetching the info failed.
        """
        match = YOUTUBE_ID_REGEX.match(self.url)
        if not match:
            LOGGER.error('==> URL %s does not match YOUTUBE_ID_REGEX',
                         self.url)
            return False
        youtube_id = match.group('youtube_id')
        os.makedirs(YOUTUBE_CACHE_DIR, exist_ok=True)
        vinfo_json_path = os.path.join(YOUTUBE_CACHE_DIR, youtube_id + '.json')

        # First try to get from cache:
        vinfo = None
        if os.path.exists(vinfo_json_path) and use_cache:
            with open(vinfo_json_path) as cache_file:
                vinfo = json.load(cache_file)
            LOGGER.info("Retrieving cached video information...")

        # else get using youtube_dl:
        if not vinfo:
            LOGGER.info("Downloading %s from youtube...", self.url)
            try:
                video = YouTubeResource(self.url)
            except youtube_dl.utils.ExtractorError as e:
                if "unavailable" in str(e):
                    LOGGER.error("Video not found at URL: %s", self.url)
                else:
                    LOGGER.error("Failed to get video info: %s", e)
                return False

            if not video:
                return False

            try:
                vinfo = video.get_resource_info()
                with open(vinfo_json_path, 'w') as cache_file:
                    json.dump(vinfo,
                              cache_file,
                              indent=4,
                              ensure_ascii=False,
                              sort_keys=True)
            except Exception as e:
                LOGGER.error("Failed to get video info: %s", e)
                return False

            if not vinfo:
                # Nothing usable came back, so there is no metadata to copy.
                LOGGER.error("No video info returned for URL: %s", self.url)
                return False

        # Both the cached and the freshly fetched info end up here, so the
        # attributes below are always set before True is returned.
        self.uid = vinfo[
            'id']  # video must have id because required to set youtube_id later
        self.title = vinfo.get('title', '')
        self.description = vinfo.get('description', '')
        video_license = vinfo.get('license')
        if not video_license:
            self.license = "Licensed not available"
        elif "Creative Commons" in video_license:
            self.license_common = True
        else:
            self.license = video_license
        return True
Exemple #5
0
def test_proxy_playlist_download(tmp_path):
    """Downloading the test playlist leaves one mp4 and one jpg per video."""
    YouTubeResource(YOUTUBE_TEST_PLAYLIST).download(tmp_path)

    playlist_dir = os.path.join(tmp_path, 'Playlist')
    downloaded = set(os.listdir(playlist_dir))

    # Order on disk is irrelevant, so compare as sets.
    assert downloaded == {
        'zbkizy-Y3qw.jpg',
        'zbkizy-Y3qw.mp4',
        'oXnzstpBEOg.jpg',
        'oXnzstpBEOg.mp4',
    }
    def download_info(self):
        """
        Load video info (from the JSON cache or YouTube) and populate
        `uid`, `title`, `description` and the license attributes.

        The info is cached as `<youtube_id>.json` inside YOUTUBE_CACHE_DIR.
        A cache file containing `null` marks a video as unavailable and
        makes this method return False without contacting YouTube again.

        Returns:
            True when the info was obtained and the attributes were set,
            False when the URL does not match YOUTUBE_ID_REGEX, the video is
            unavailable, or fetching the info failed.
        """
        match = YOUTUBE_ID_REGEX.match(self.url)
        if not match:
            print('==> URL ' + self.url + ' does not match YOUTUBE_ID_REGEX')
            return False
        youtube_id = match.group('youtube_id')
        os.makedirs(YOUTUBE_CACHE_DIR, exist_ok=True)
        vinfo_json_path = os.path.join(YOUTUBE_CACHE_DIR, youtube_id+'.json')

        # First try to get from cache:
        vinfo = None
        if os.path.exists(vinfo_json_path):
            with open(vinfo_json_path) as cache_file:
                vinfo = json.load(cache_file)
            if not vinfo:
                # the json data for "Video unavailable" is `null` so can skip them
                return False
            print("Using cached video info for youtube_id", youtube_id)

        # else get using YouTubeResource
        if not vinfo:
            print("Downloading {} from youtube...".format(self.url))
            try:
                video = YouTubeResource(self.url)
            except youtube_dl.utils.ExtractorError as e:
                if "unavailable" in str(e):
                    print("Video not found at URL: {}".format(self.url))
                else:
                    # Any other extractor failure used to fall through to an
                    # unbound `video`; report it and give up instead.
                    print(e)
                return False

            if not video:
                return False

            try:
                vinfo = video.get_resource_info()
                # Save the info (possibly `null`) so the next run can reuse it.
                with open(vinfo_json_path, 'w') as cache_file:
                    json.dump(vinfo, cache_file, indent=4, ensure_ascii=False, sort_keys=True)
            except Exception as e:
                print(e)
                return False

            if not vinfo:
                # Same outcome as reading a cached `null`: nothing to populate.
                return False

        self.uid = vinfo['id']  # video must have id because required to set youtube_id later
        self.title = vinfo.get('title', '')
        self.description = vinfo.get('description', '')
        video_license = vinfo.get('license')
        if not video_license:
            self.license = "Licensed not available"
        elif "Creative Commons" in video_license:
            self.license_common = True
        else:
            self.license = video_license

        return True
Exemple #7
0
def test_proxy_download(tmp_path):
    """After refreshing the proxy list, a video download yields an mp4."""
    proxy.get_proxies(refresh=True)
    assert len(proxy.PROXY_LIST) > 1

    YouTubeResource(YOUTUBE_TEST_VIDEO).download(tmp_path)

    watch_dir = os.path.join(tmp_path, 'Watch')
    has_video = any(
        entry.endswith('.mp4') for entry in os.listdir(watch_dir))

    assert has_video, 'Video file not found'
Exemple #8
0
def test_bad_proxies_get_banned(tmp_path):
    """Proxies that cannot work end up in proxy.BROKEN_PROXIES."""
    # Addresses that are not expected to be usable proxies.
    bad_proxies = [
        '123.123.123.123:1234',
        '142.123.1.234:123345',
        '156.245.233.211:12323',
        '11.22.33.44:123',
    ]
    # Seed the proxy pool with a separate list so the original stays intact
    # for the final comparison.
    proxy.PROXY_LIST = list(bad_proxies)

    YouTubeResource(YOUTUBE_TEST_VIDEO).download(tmp_path)

    # Every seeded proxy must have been recorded as broken.
    assert set(bad_proxies) <= set(proxy.BROKEN_PROXIES)
Exemple #9
0
    def get_videos_urls(self, content):
        """
        Collect the YouTube video URLs referenced by `content`.

        Two sources are scanned:
        * `<a>` tags whose href contains "youtube" or "youtu.be", or whose
          text is "youtube" (case-insensitive);
        * `<iframe>` tags whose `src` is a YouTube URL that is not a channel;
          these are converted with `YouTubeResource.transform_embed`.

        Args:
            content: parsed document exposing `find_all` (or None).

        Returns:
            A set of URL strings; empty when `content` is None.
        """
        urls = set()
        if content is None:
            return urls

        def is_youtube_anchor(tag):
            # The anchor check must guard every alternative; without the
            # grouping, non-anchor tags matched on href/text alone.
            if tag.name != "a":
                return False
            href = tag.attrs.get("href", "")
            return ("youtube" in href or "youtu.be" in href
                    or tag.text.lower() == "youtube")

        for anchor in content.find_all(is_youtube_anchor):
            href = anchor.get("href", "")
            if href:  # an anchor labelled "YouTube" may have no target
                urls.add(href)

        for iframe in content.find_all("iframe"):
            url = iframe.get("src", "")
            if not url:
                # iframes without a src attribute carry no video
                continue
            if YouTubeResource.is_youtube(
                    url) and not YouTubeResource.is_channel(url):
                urls.add(YouTubeResource.transform_embed(url))
        return urls
    def get_playlist_info(self):
        """
        Get playlist info from either local json cache or URL.

        When `self.use_cache` is set and the cache file at
        `self.playlist_info_json_path` holds non-empty data, that data is
        returned. Otherwise the playlist is requested through
        `YouTubeResource`, duplicate videos (same 'id') are dropped from its
        'children' list keeping the first occurrence, and the result is
        written to the cache file.

        Returns:
            The playlist info, or None when requesting or saving it failed.
        """
        os.makedirs(YOUTUBE_CACHE_DIR, exist_ok=True)

        playlist_info = None
        if os.path.exists(self.playlist_info_json_path) and self.use_cache:
            LOGGER.info(
                "[Playlist %s] Retrieving cached playlist information...",
                self.playlist_id)
            with open(self.playlist_info_json_path) as cache_file:
                playlist_info = json.load(cache_file)

        if playlist_info:
            return playlist_info

        playlist_url = YOUTUBE_PLAYLIST_URL_FORMAT.format(self.playlist_id)
        playlist_resource = YouTubeResource(playlist_url)
        if not playlist_resource:
            return playlist_info

        try:
            playlist_info = playlist_resource.get_resource_info(
                dict(ignoreerrors=True, skip_download=True))

            # Build a fresh de-duplicated list: removing items from the list
            # while iterating over it skips elements and leaves duplicates.
            seen_ids = set()
            unique_videos = []
            for video in playlist_info.get('children'):
                if video['id'] in seen_ids:
                    continue
                seen_ids.add(video['id'])
                unique_videos.append(video)
            playlist_info['children'] = unique_videos

            with open(self.playlist_info_json_path, 'w') as cache_file:
                json.dump(playlist_info,
                          cache_file,
                          indent=4,
                          ensure_ascii=False,
                          sort_keys=False)
            LOGGER.info("[Playlist %s] Successfully get playlist info",
                        self.playlist_id)
            return playlist_info
        except Exception as e:
            LOGGER.error(
                "[Playlist %s] Failed to get playlist info: %s",
                self.playlist_id, e)
            return None
Exemple #11
0
def get_subtitles_using_youtube_dl(youtube_id):
    """
    Return the language codes for which the video has a 'vtt' subtitle track.

    The subtitle listing is obtained from
    `YouTubeResource.get_resource_subtitles()`. Any error raised while
    querying or reading the listing is logged, and the codes collected up to
    that point are returned.

    Args:
        youtube_id: id appended to 'https://youtube.com/watch?v='.

    Returns:
        A list of language codes, each appearing once.
    """
    youtube_url = 'https://youtube.com/watch?v=' + youtube_id
    yt_resource = YouTubeResource(youtube_url)
    lang_codes = []
    try:
        result = yt_resource.get_resource_subtitles()
        # TODO(ivan) Consider including auto-generated subtitles to increase
        #       coverage and handle edge cases of videos that are translated
        #       but have no metadata: https://www.youtube.com/watch?v=qlGjA9p1UAM
        subtitles_by_lang = result['subtitles'] if result else {}
        for code, tracks in subtitles_by_lang.items():
            for track in tracks:
                is_vtt = 'ext' in track and track['ext'] == 'vtt'
                if is_vtt and code not in lang_codes:
                    lang_codes.append(code)
    except Exception as e:
        LOGGER.error('get_subtitles_using_youtube_dl failed for ' +
                     youtube_url)
        LOGGER.error(str(e))
    return lang_codes
Exemple #12
0
def download_from_web(web_url,
                      download_settings,
                      file_format=file_formats.MP4,
                      ext="",
                      download_ext=""):
    """
    Download `web_url` using YoutubeDL using `download_settings` options.

    Args:
        web_url (str): URL of the video or subtitles to download
        download_settings (dict): options to pass onto YoutubeDL; the dict
            passed in is not modified
        file_format (str): one of "mp4" or "vtt"
        ext (str): extensions to use as part of `outtmpl` given to YoutubeDL
        download_ext (str): extensions to append to `outtmpl` after downloading

    This function operates differently when downloading videos and subtitles.
    For videos we set the `outtmpl` to the actual filename that will be downloaded,
    and the function must be called with ext = ".mp4" and download_ext="".
    For subtitles we set the `outtmpl` to extension-less string, and YoutubeDL
    automatically appends the language code and vtt extension, so the function
    must be called with ext="" and download_ext=".{youtube_lang}.vtt"

    Raises:
        youtube_dl.utils.DownloadError: when the resource info cannot be
            obtained or the expected file is missing after the download.

    :return: filename derived from hash of file contents {md5hash(file)}.ext
    """
    key = generate_key("DOWNLOADED", web_url, settings=download_settings)
    cache_file = get_cache_filename(key)
    if cache_file:
        return cache_file

    # Work on a private copy: the options added below ("outtmpl",
    # "writethumbnail") must not leak into the caller's dict, otherwise a
    # reused settings dict would produce a different cache key next time.
    download_settings = dict(download_settings)

    # Get hash of web_url to act as temporary storage name
    url_hash = hashlib.md5(web_url.encode('utf-8')).hexdigest()
    tempfilename = "{}{ext}".format(url_hash, ext=ext)
    outtmpl_path = os.path.join(tempfile.gettempdir(), tempfilename)
    download_settings["outtmpl"] = outtmpl_path
    destination_path = outtmpl_path + download_ext  # file dest. after download

    # Delete files in case previously downloaded
    for stale_path in (outtmpl_path, destination_path):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    # Download the web_url which can be either a video or subtitles
    if not config.USEPROXY:
        # Connect to YouTube directly
        with youtube_dl.YoutubeDL(download_settings) as ydl:
            ydl.download([web_url])
            if not os.path.exists(destination_path):
                raise youtube_dl.utils.DownloadError('Failed to download ' +
                                                     web_url)
    else:
        # Connect to YouTube via an HTTP proxy
        yt_resource = YouTubeResource(web_url,
                                      useproxy=True,
                                      options=download_settings)
        resource_info = yt_resource.get_resource_info()
        if resource_info is None:
            raise youtube_dl.utils.DownloadError('Failed to get resource info')
        download_settings[
            "writethumbnail"] = False  # overwrite default behaviour
        if file_format == file_formats.VTT:
            # We need to use the proxy when downloading subtitles
            download_result = yt_resource.download(options=download_settings,
                                                   useproxy=True)
        else:
            # For video files we can skip the proxy for faster download speed
            download_result = yt_resource.download(options=download_settings)
        if download_result is None or not os.path.exists(destination_path):
            raise youtube_dl.utils.DownloadError(
                'Failed to download resource ' + web_url)

    # Write file to local storage
    filename = "{}.{}".format(get_hash(destination_path), file_format)
    with open(destination_path,
              "rb") as dlf, open(config.get_storage_path(filename),
                                 'wb') as destf:
        shutil.copyfileobj(dlf, destf)

    FILECACHE.set(key, bytes(filename, "utf-8"))
    return filename