def fetch_video(video):
    """Build a VideoNode for one video, attaching every recognised subtitle.

    `video` is a mapping that must provide the keys 'id', 'title',
    'description', 'webpage_url' and 'subtitles'. The keys of
    `video['subtitles']` are used as subtitle language codes.

    Returns the VideoNode, holding the YouTube video file plus one subtitle
    file per language code that the `languages` lookups recognise.
    """
    video_id = video['id']
    video_title = video['title']
    video_description = video['description']
    page_url = video['webpage_url']
    available_languages = video['subtitles'].keys()

    print("    Fetching video data: %s (%s)" % (video_title, page_url))

    node = nodes.VideoNode(
        source_id=video_id,
        title=truncate_metadata(video_title),
        license=LICENSE,
        description=truncate_description(video_description),
        derive_thumbnail=True,
        language="en",
        files=[files.YouTubeVideoFile(youtube_id=video_id)],
    )

    # Attach a subtitle file for each available language we can resolve.
    for lang_code in available_languages:
        # TODO(david): Should catch exception thrown by
        # files.YouTubeSubtitleFile rather than breaking abstraction.
        recognised = (languages.getlang(lang_code)
                      or languages.getlang_by_alpha2(lang_code))
        if not recognised:
            print("WARNING: Subtitle language %s not found in languages file" %
                  lang_code)
            continue

        subtitle = files.YouTubeSubtitleFile(youtube_id=video_id,
                                             language=lang_code)
        node.add_file(subtitle)

    return node
Example #2
0
def add_files(node, file_list):
    """Create the matching file object for each entry and add it to `node`.

    Each entry of `file_list` is a dict. Its optional 'path' is resolved
    with `get_abspath`, and `guess_file_type` decides which file class to
    build from the node kind, the resolved path, 'youtube_id', 'web_url'
    and 'encoding'.

    A YouTube video entry also gets an English ('en') YouTube subtitle file.

    Raises:
        UnknownFileTypeError: if the guessed type is not one handled here.
        KeyError: if an entry lacks a key its type requires ('language' for
            subtitles, 'encoding' for base64 images, 'web_url' or
            'youtube_id' for web and YouTube videos).
    """
    for f in file_list:
        path = f.get('path')
        # NEW: expand  content://  -->  ./content/  in file paths
        abspath = get_abspath(path) if path is not None else None

        file_type = guess_file_type(
            node.kind,
            filepath=abspath,
            youtube_id=f.get('youtube_id'),
            web_url=f.get('web_url'),
            encoding=f.get('encoding'),
        )

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(path=abspath, language=f.get('language'), ffmpeg_settings=f.get('ffmpeg_settings')))
        elif file_type == FileTypes.SUBTITLE_FILE:
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(web_url=f['web_url'], high_resolution=f.get('high_resolution')))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(youtube_id=f['youtube_id'], high_resolution=f.get('high_resolution')))
            node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'], language='en'))
        else:
            # Entries without a 'path' key must still surface as
            # UnknownFileTypeError; indexing f['path'] here would mask the
            # real problem behind a KeyError. Fall back to the whole entry so
            # the message stays informative.
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(
                path if path is not None else f))
Example #3
0
def fetch_video(video):
    """Build a VideoNode for one video using the patched language lookup.

    `video` is a mapping that must provide the keys 'id', 'title',
    'description', 'webpage_url' and 'subtitles'. Each key of
    `video['subtitles']` for which `getlang_patched` returns a truthy value
    gets a LanguagePatchedYouTubeSubtitleFile; other codes are skipped
    without any message.

    Returns the populated VideoNode.
    """
    video_id = video['id']
    video_title = video['title']
    video_description = video['description']
    page_url = video['webpage_url']
    available_languages = video['subtitles'].keys()

    print("    Fetching video data: %s (%s)" % (video_title, page_url))

    node = nodes.VideoNode(
        source_id=video_id,
        title=truncate_metadata(video_title),
        license=LICENSE,
        description=truncate_description(video_description),
        derive_thumbnail=True,
        language="en",
        files=[files.YouTubeVideoFile(youtube_id=video_id)],
    )

    # Attach subtitles for every language the patched lookup can resolve.
    for lang_code in available_languages:
        if not getlang_patched(lang_code):
            continue
        subtitle = LanguagePatchedYouTubeSubtitleFile(
            youtube_id=video_id, youtube_language=lang_code)
        node.add_file(subtitle)

    return node
def download_video_topics(topic_node,
                          playlist_item,
                          lang_obj,
                          use_cache=True,
                          to_sheet=False):
    """
    Add a VideoNode to `topic_node` for each playlist video that has an
    entry in VIDEO_DESCRIPTION_MAP.

    Args:
      - topic_node: node that receives the video nodes via `add_child`.
      - playlist_item: passed through to RefugeeResponsePlaylist.
      - lang_obj: language object; its `name` goes into the source id and
        its `code` is used as the node and file language.
      - use_cache: passed through to RefugeeResponsePlaylist.
      - to_sheet: accepted for interface compatibility; not used here.

    Videos whose id is missing from VIDEO_DESCRIPTION_MAP are skipped.
    A failure while building or attaching one video is logged and does not
    stop the remaining videos from being processed.
    """
    playlist_obj = RefugeeResponsePlaylist(playlist_item, use_cache)
    playlist_info = playlist_obj.get_playlist_info()

    # A playlist without a 'children' entry is treated as empty rather than
    # crashing with a TypeError when iterating over None.
    for video in playlist_info.get('children') or []:
        video_id = video['id']
        if video_id not in VIDEO_DESCRIPTION_MAP:
            # Exclude videos
            continue
        video_description = VIDEO_DESCRIPTION_MAP[video_id]
        video_source_id = 'refugee-response-{0}-{1}'.format(
            lang_obj.name, video_id)
        LOGGER.info("Video Description: '%s'", video_description)
        try:
            video_node = nodes.VideoNode(
                source_id=video_source_id,
                title=video['title'],
                description=video_description,
                author=REFUGEE_RESPONSE,
                language=lang_obj.code,
                provider=REFUGEE_RESPONSE,
                thumbnail=video['thumbnail'],
                license=licenses.get_license(
                    "CC BY-NC-ND", copyright_holder=REFUGEE_RESPONSE),
                files=[
                    files.YouTubeVideoFile(youtube_id=video_id,
                                           language=lang_obj.code)
                ])
            topic_node.add_child(video_node)
        except Exception as e:
            # Best effort: one bad video must not abort the whole playlist.
            LOGGER.error('Error downloading this video: %s', e)
Example #5
0
def add_files(node, file_list):
    """Create the matching file object for each entry and add it to `node`.

    Each entry of `file_list` is a dict whose 'file_type' must be one of the
    expected type constants. The remaining keys select the concrete class:

      - video: 'youtube_id' -> YouTubeVideoFile, else 'web_url' ->
        WebVideoFile, else a VideoFile built from the required 'path'.
      - audio: AudioFile built from the required 'path'.
      - document / html5: DocumentFile / HTMLZipFile from the optional 'path'.
      - thumbnail: Base64ImageFile when 'encoding' is present, otherwise
        ThumbnailFile.
      - subtitles: YouTubeSubtitleFile when 'youtube_id' is present, otherwise
        SubtitleFile; 'language' is required in both cases.

    Raises:
        NotImplementedError: if an entry's 'file_type' is not expected.
        KeyError: if an entry lacks a key its type requires.
        UnknownFileTypeError: defensive fallback, see the final branch.
    """
    EXPECTED_FILE_TYPES = [
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE,
        SUBTITLES_FILE
    ]

    for f in file_list:
        file_type = f.get('file_type')
        if file_type not in EXPECTED_FILE_TYPES:
            LOGGER.critical(file_type)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        path = f.get('path')  # path can be an URL or a local path (or None)
        language = f.get('language', None)

        # handle different types of files
        if file_type == VIDEO_FILE:
            # handle three types of video files
            if 'youtube_id' in f:
                video_file = files.YouTubeVideoFile(
                    youtube_id=f['youtube_id'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=language,
                )
            elif 'web_url' in f:
                video_file = files.WebVideoFile(
                    web_url=f['web_url'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=language,
                )
            else:
                video_file = files.VideoFile(
                    path=f['path'],
                    language=language,
                    ffmpeg_settings=f.get('ffmpeg_settings'),
                )
            node.add_file(video_file)

        elif file_type == AUDIO_FILE:
            node.add_file(files.AudioFile(path=f['path'], language=language))

        elif file_type == DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=path, language=language))

        elif file_type == HTML5_FILE:
            node.add_file(files.HTMLZipFile(path=path, language=language))

        elif file_type == THUMBNAIL_FILE:
            if 'encoding' in f:
                node.add_file(files.Base64ImageFile(encoding=f['encoding'], ))
            else:
                node.add_file(
                    files.ThumbnailFile(
                        path=path,
                        language=language,
                    ))

        elif file_type == SUBTITLES_FILE:
            if 'youtube_id' in f:
                node.add_file(
                    files.YouTubeSubtitleFile(youtube_id=f['youtube_id'],
                                              language=f['language']))
            else:
                node.add_file(
                    files.SubtitleFile(path=path, language=f['language']))

        else:
            # Not reachable while every member of EXPECTED_FILE_TYPES has a
            # branch above; kept as a guard in case a type is added to the
            # list without a handler. Uses `path` (may be None) rather than
            # f['path'] so the guard cannot itself fail with a KeyError.
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(
                path))
    def add_video_nodes_from_playlist(
            self,
            youtube_client,
            playlist_id,
            subtitle_languages=SUBTITLE_LANGUAGES,
            copyright_holder=COPYRIGHT_HOLDER,
            only_creative_commons=ONLY_CREATIVE_COMMONS):
        """Add one VideoNode child per unique video of a YouTube playlist.

        Args:
          - youtube_client: client object exposing `playlistItems().list()`
            and `videos().list()`.
          - playlist_id: id of the playlist to walk.
          - subtitle_languages: language codes for which a YouTube subtitle
            file is attached when the code is supported.
          - copyright_holder: copyright holder passed to `get_license`.
          - only_creative_commons: when true, videos whose license is not
            'creativeCommon' are reported with a print and skipped; when
            false they are added with NON_CREATIVE_COMMONS_LICENSE_DEFAULT.

        The playlist is read page by page (up to 50 items per request) until
        the response carries no 'nextPageToken'. Exceptions raised while
        building a node propagate to the caller.
        """
        playlist_request_kwargs = {
            'part': 'contentDetails',
            'maxResults': 50,
            'playlistId': playlist_id,
        }

        # Apparently the same video is in one of the playlists twice!
        # This is used to keep track of videos that have already been added.
        videos_added = {}
        while True:
            playlist_info = youtube_client.playlistItems().list(
                **playlist_request_kwargs).execute()
            playlist_items = playlist_info['items']

            video_ids = [
                vid['contentDetails']['videoId'] for vid in playlist_items
            ]
            videos = youtube_client.videos().list(
                part='status,snippet',
                id=','.join(video_ids)).execute()['items']

            for video in videos:
                video_id = video['id']
                if video_id in videos_added:
                    continue

                is_creative_commons = (
                    video['status']['license'] == 'creativeCommon')
                if only_creative_commons and not is_creative_commons:
                    print(
                        "The video '{}' is not licensed as Creative Commons... it is licensed as {}"
                        .format(video['snippet']['title'],
                                video['status']['license']))
                    continue

                video_license = (licenses.CC_BY if is_creative_commons else
                                 NON_CREATIVE_COMMONS_LICENSE_DEFAULT)
                video_node = nodes.VideoNode(
                    source_id=video_id,
                    title=video['snippet']['title'],
                    language=CHANNEL_LANGUAGE,
                    license=get_license(video_license,
                                        copyright_holder=copyright_holder),
                    thumbnail=get_largest_thumbnail(
                        video['snippet']['thumbnails']).get('url'),
                    files=[
                        files.YouTubeVideoFile(video_id),
                    ])

                # Get subtitles for languages designated in SUBTITLE_LANGUAGES
                for lang_code in subtitle_languages:
                    if files.is_youtube_subtitle_file_supported_language(
                            lang_code):
                        video_node.add_file(
                            files.YouTubeSubtitleFile(youtube_id=video_id,
                                                      language=lang_code))
                    else:
                        print('Unsupported subtitle language code:',
                              lang_code)

                self.add_child(video_node)
                videos_added[video_id] = video_node

            # set up the next page, if there is one
            next_page_token = playlist_info.get('nextPageToken')
            if not next_page_token:
                break
            playlist_request_kwargs['pageToken'] = next_page_token
Example #7
0
    def construct_channel(self, *args, **kwargs):
        """
        Creates ChannelNode and build topic tree
        Args:
          - args: arguments passed in during upload_channel (currently None)
          - kwargs: extra arguments and options not handled by `uploadchannel`.
            For example, add the command line option   lang="fr"  and the string
            "fr" will be passed along to `construct_channel` as kwargs['lang'].
            The key '--youtube-api-token' is required here and is used as the
            YouTube Data API developer key.
        Returns: ChannelNode

        One TopicNode is created per playlist of YOUTUBE_CHANNEL_ID, and each
        playlist video whose license is 'creativeCommon' becomes a VideoNode
        under it; videos with any other license are skipped. Exceptions raised
        while building a node propagate to the caller.
        """
        channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

        from apiclient.discovery import build
        # instantiate a YouTube Data API v3 client
        youtube = build('youtube', 'v3', developerKey=kwargs['--youtube-api-token'])
        # list the YouTube channel's playlists
        # NOTE(review): only a single page (maxResults=50) of playlists is
        # requested here; confirm the channel never has more than that.
        playlists = youtube.playlists().list(
            part='snippet',
            channelId=YOUTUBE_CHANNEL_ID,
            maxResults=50
        ).execute()['items']

        # For getting the thumbnail automatically

        # youtube_channel = youtube.channels().list(
        #     id=YOUTUBE_CHANNEL_ID,
        #     part='snippet'
        # ).execute()['items'][0]

        # channel.thumbnail = get_largest_thumbnail(youtube_channel['snippet']['thumbnails']).get('url')

        for playlist in playlists:
            topic = nodes.TopicNode(title=playlist['snippet']['title'], source_id=playlist['id'])
            playlist_request_kwargs = {
                'part': 'contentDetails',
                'maxResults': 50,
                'playlistId': playlist['id'],
            }

            # Walk the playlist one page at a time until no next page is offered.
            while True:
                playlist_info = youtube.playlistItems().list(**playlist_request_kwargs).execute()
                playlist_items = playlist_info['items']

                video_ids = [vid['contentDetails']['videoId'] for vid in playlist_items]
                videos = youtube.videos().list(
                    part='status,snippet',
                    id=','.join(video_ids)
                ).execute()['items']

                for video in videos:
                    if video['status']['license'] != 'creativeCommon':
                        continue

                    video_node = nodes.VideoNode(
                        source_id=video['id'],
                        title=video['snippet']['title'],
                        language=CHANNEL_LANGUAGE,
                        license=get_license(licenses.CC_BY, copyright_holder='Espresso English'),
                        thumbnail=get_largest_thumbnail(video['snippet']['thumbnails']).get('url'),
                        files=[
                            files.YouTubeVideoFile(video['id']),
                        ]
                    )

                    topic.add_child(video_node)

                    # Get subtitles for languages designated in SUBTITLE_LANGUAGES
                    for lang_code in SUBTITLE_LANGUAGES:
                        if files.is_youtube_subtitle_file_supported_language(lang_code):
                            video_node.add_file(
                                files.YouTubeSubtitleFile(
                                    youtube_id=video['id'],
                                    language=lang_code
                                )
                            )
                        else:
                            print('Unsupported subtitle language code:', lang_code)

                # set up the next page, if there is one
                next_page_token = playlist_info.get('nextPageToken')
                if not next_page_token:
                    break
                playlist_request_kwargs['pageToken'] = next_page_token

            channel.add_child(topic)

        raise_for_invalid_channel(channel)  # Check for errors in channel construction

        return channel
Example #8
0
def download_content_node(category_node,
                          url,
                          title,
                          thumbnail=None,
                          description=None):
    """Scrape one page into an HTML5 app node under `category_node`.

    The page at `url` is parsed, its static assets are downloaded into a
    fresh temporary directory, a fixed list of elements is removed, and the
    result is written as index.html, zipped and attached as an HTML5AppNode.
    If the page contains an '.embedded-video iframe', a VideoNode for that
    video is added to `category_node` as well (before the app node).

    Args:
        category_node: node receiving the created children via `add_child`.
        url: page address; also used as the app node's source id.
        title: app node title (passed through `truncate_metadata`).
        thumbnail: optional thumbnail URL, downloaded into the temporary
            directory and referenced by local path.
        description: optional app node description.
    """
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc,
                                 destination,
                                 'https://k12.thoughtfullearning.com',
                                 request_fn=make_request,
                                 url_blacklist=url_blacklist)

    selectors_to_remove = (
        '#header',
        '.subMenuBarContainer',
        '.breadbookmarkcontainer',
        '.resourcePageTypeTitle',
        '.sharethis-wrapper',
        '.ccBlock',
        # NOTE(review): this selector was applied twice in the original code;
        # both calls are kept in case remove_node only drops the first
        # match -- confirm and deduplicate.
        '#block-views-resource-info-block-block-1',
        '#block-views-resource-info-block-block-1',
        '#block-views-resource-info-block-block',
        '.productSuggestionContainer',
        'footer',
        # For minilessons
        '.field-name-field-minilesson-downloadables',
        # For writing assessments
        '.assessmentTGLink',
        '.assessmentModelRubrics',
        '.view-display-id-attachment_1',
    )
    for selector in selectors_to_remove:
        remove_node(doc, selector)

    # Write out the HTML source. The encoding is explicit so pages containing
    # non-ASCII text do not depend on (or fail under) the platform default.
    with open(os.path.join(destination, "index.html"), "w",
              encoding="utf-8") as f:
        f.write(str(doc))

    print("    ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    thumbnail_path = None
    if thumbnail:
        # Manually download the thumbnail and use it so we can lowercase the
        # extension to be accepted by Ricecooker.
        thumbnail_filename = derive_filename(thumbnail)
        thumbnail_path = os.path.join(destination, thumbnail_filename)
        download_file(thumbnail,
                      destination,
                      request_fn=make_request,
                      filename=thumbnail_filename)

    # If there is an embedded video in the page source grab it as a video node.
    iframe = doc.select_one('.embedded-video iframe')
    if iframe:
        youtube_url = iframe['src']
        youtube_id = get_youtube_id_from_url(youtube_url)
        info = ydl.extract_info(youtube_url, download=False)
        video_title = info['title']
        print(
            "    ... and with video titled %s from www.youtube.com/watch?v=%s"
            % (video_title, youtube_id))
        video_node = nodes.VideoNode(
            source_id=youtube_id,
            title=truncate_metadata(video_title),
            license=licenses.CC_BY_NC_SALicense(
                copyright_holder=truncate_metadata('Thoughtful Learning')),
            description=info['description'],
            language="en",
            derive_thumbnail=True,
            files=[files.YouTubeVideoFile(youtube_id)],
        )
        category_node.add_child(video_node)

    zip_path = create_predictable_zip(destination)
    app_node = nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        description=description,
        thumbnail=thumbnail_path,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )

    category_node.add_child(app_node)
Example #9
0
    def construct_channel(self, *args, **kwargs):
        """
        Creates ChannelNode and build topic tree
        Args:
          - args: arguments passed in on the command line
          - kwargs: extra options passed in as key="value" pairs on the command line
            For example, add the command line option   lang="fr"  and the value
            "fr" will be passed along to `construct_channel` as kwargs['lang'].
        Returns: ChannelNode

        Recognised option keys (compared against module constants):
          - NO_CACHE_KEYNAME: sets self.use_cache to False.
          - EXTRACT_VIDEO_INFO: comma-separated value stored in self.video_list.
          - EXTRACT_VIDEO_PLAYLIST_INFO: value stored in self.to_playlist.
          - DOWNLOAD_TO_CSV: with value "true", runs create_csv() and exits
            the process with status 0.

        One TopicNode is created per id in PLAYLIST_MAP, with one VideoNode
        per distinct video. Thumbnails whose image format is not JPEG/PNG are
        converted to a 400x225 JPEG under chefdata/thumbnails.
        """
        # editing metadata
        for key, value in kwargs.items():
            if key == NO_CACHE_KEYNAME:
                self.use_cache = False
                LOGGER.info("use_cache = '%d'", self.use_cache)
            if key == EXTRACT_VIDEO_INFO:
                self.insert_video_info = True
                self.video_list = value.split(",")
            if key == EXTRACT_VIDEO_PLAYLIST_INFO:
                self.insert_video_info = True
                self.to_playlist = value
            if key == DOWNLOAD_TO_CSV and value == "true":
                print('csv = true')
                create_csv()
                # The `exit` builtin is only injected by the `site` module;
                # raising SystemExit has the same effect and always works.
                raise SystemExit(0)

        channel = self.get_channel(
            *args,
            **kwargs)  # Create ChannelNode from data in self.channel_info
        # Get Channel Topics

        # Create thumbnails folder in chefdata if not exists
        thumbnails_dir = os.path.join('chefdata', 'thumbnails')
        os.makedirs(thumbnails_dir, exist_ok=True)

        for playlist_id in PLAYLIST_MAP:

            playlist = YouTubePlaylistUtils(id=playlist_id,
                                            cache_dir=YOUTUBE_CACHE_DIR)

            playlist_info = playlist.get_playlist_info(use_proxy=False)

            # Use the playlist description if there is one, else its title
            playlist_description = (playlist_info["description"]
                                    or playlist_info["title"])

            topic_source_id = 'aimhi-child-topic-{0}'.format(
                playlist_info["title"])
            topic_node = nodes.TopicNode(title=playlist_info["title"],
                                         source_id=topic_source_id,
                                         author="AimHi",
                                         provider="AimHi",
                                         description=playlist_description,
                                         language="en")

            # ids of the videos already added to this topic
            video_ids = set()

            # insert videos into playlist topic after creation
            for child in playlist_info["children"]:
                # skip duplicate videos
                if child["id"] in video_ids:
                    continue

                video = YouTubeVideoUtils(id=child["id"], cache_dir=False)
                video_details = video.get_video_info(use_proxy=False)
                video_source_id = "AimHi-{0}-{1}".format(
                    playlist_info["title"], video_details["id"])

                # Check youtube thumbnail extension as some are not supported formats
                print(video_details["thumbnail"])
                image_response = requests.get("{0}".format(
                    video_details["thumbnail"]))

                img = Image.open(BytesIO(image_response.content))
                if img.format not in ['JPG', 'PNG', 'JPEG']:
                    # if not in correct format, convert image and download to files folder
                    print(video_details["thumbnail"])
                    print("{0}'s thumbnail not supported ({1}).".format(
                        video_details["id"], img.format))
                    img_file_name = '{}_thumbnail.jpg'.format(
                        video_details["id"])
                    thumbnail_link = os.path.join(thumbnails_dir,
                                                  img_file_name)

                    jpg_img = img.convert("RGB")

                    # Resize image to thumbnail dimensions. Image.ANTIALIAS
                    # was an alias of LANCZOS and was removed in Pillow 10,
                    # so the filter is named directly.
                    jpg_img = jpg_img.resize((400, 225), Image.LANCZOS)
                    jpg_img.save(thumbnail_link)
                else:
                    thumbnail_link = video_details["thumbnail"]

                print(thumbnail_link)
                video_node = nodes.VideoNode(
                    source_id=video_source_id,
                    title=video_details["title"],
                    description=video_details["description"],
                    author="AimHi",
                    language="en",
                    provider="AimHi",
                    thumbnail=thumbnail_link,
                    license=licenses.get_license("CC BY-NC-ND",
                                                 copyright_holder="AimHi"),
                    files=[
                        files.YouTubeVideoFile(
                            youtube_id=video_details["id"], language="en")
                    ])
                # add video to topic
                print(video_details["id"] + " has been added!")
                # remember the id so a repeated playlist entry is skipped
                video_ids.add(video_details["id"])
                topic_node.add_child(video_node)

            # add topic to channel
            channel.add_child(topic_node)

        return channel