def add_files(node, file_list):
    """
    Create a file object for every entry of `file_list` and attach it to `node`.

    Each entry is a dict-like file description. Its type is determined by
    `guess_file_type` from `node.kind` and the entry's optional 'path',
    'youtube_id', 'web_url' and 'encoding' values; the matching `files.*`
    object is then handed to `node.add_file`.

    Args:
        node: node receiving the files; must provide `kind` and `add_file`.
        file_list: iterable of file descriptions. Besides the keys above,
            'language', 'ffmpeg_settings' and 'high_resolution' are read
            where the file type uses them.

    Raises:
        UnknownFileTypeError: if the guessed type is none of the handled ones.
        KeyError: if an entry lacks a key its file type requires
            (e.g. 'language' for a subtitle file).
    """
    for f in file_list:
        path = f.get('path')
        # Expand  content://  -->  ./content/  in file paths
        abspath = get_abspath(path) if path is not None else None

        file_type = guess_file_type(
            node.kind,
            filepath=abspath,
            youtube_id=f.get('youtube_id'),
            web_url=f.get('web_url'),
            encoding=f.get('encoding'),
        )

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(path=abspath, language=f.get('language'), ffmpeg_settings=f.get('ffmpeg_settings')))
        elif file_type == FileTypes.SUBTITLE_FILE:
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(web_url=f['web_url'], high_resolution=f.get('high_resolution')))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(youtube_id=f['youtube_id'], high_resolution=f.get('high_resolution')))
            # An English subtitle file is always added alongside a YouTube video.
            node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'], language='en'))
        else:
            # Use the value fetched with .get(): an entry without a 'path' key
            # must still produce UnknownFileTypeError, not a KeyError.
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(path))
    def construct_channel(self, *args, **kwargs):
        """
        Creates ChannelNode and build topic tree
        Args:
          - args: arguments passed in during upload_channel (currently None)
          - kwargs: extra arguments and options not handled by `uploadchannel`.
            For example, add the command line option   lang="fr"  and the string
            "fr" will be passed along to `construct_channel` as kwargs['lang'].
        Returns: ChannelNode

        Healing Classrooms is organized with the following hierarchy:
            Playlist (TopicNode)
            |   Youtube Video (VideoNode)
            |   Youtube Video (VideoNode)

        """
        channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

        # Download the playlist/video information
        with youtube_dl.YoutubeDL({'skip_download': True}) as ydl:
            info_dict = ydl.extract_info(PLAYLISTS_URL, download=False)

            # Generate topics based off playlist entries in dict
            for playlist in info_dict['entries']:

                # Get language of playlist (hack): guessed from the playlist
                # title, falling back to French.
                if "English" in playlist['title']:
                    language = "en"
                elif "Arabic" in playlist['title']:
                    language = "ar"
                else:
                    language = "fr"

                playlist_topic = nodes.TopicNode(title=playlist['title'], source_id=playlist['id'], language=language)
                channel.add_child(playlist_topic)

                # Generate videos based off video entries in dict
                for video in playlist['entries']:
                    # Use the first listed thumbnail; None (not the integer 0
                    # that `len(...) and ...` produced) when there is none.
                    thumbnails = video.get('thumbnails') or []
                    thumbnail_url = thumbnails[0]['url'] if thumbnails else None

                    playlist_topic.add_child(nodes.VideoNode(
                        title = video['title'],
                        source_id = video['id'],
                        license = licenses.PublicDomainLicense(),
                        description = video['description'],
                        derive_thumbnail = not thumbnail_url,  # only derive one when none was found
                        files = [files.WebVideoFile(video['webpage_url'])],
                        thumbnail = thumbnail_url,
                        author = AUTHOR,
                        # tags = video['categories'] + video['tags'], # TODO: uncomment this when added
                    ))

        raise_for_invalid_channel(channel)  # Check for errors in channel construction

        return channel
def scrape_video_collection(url, topic):
    """ Add a video node to `topic` for every video listed on a collection page.

        Every search result on the page is opened, one VideoNode is built per
        entry of its brightcove mapping, and the node is added unless a child
        with the same source_id is already present. Following pages are
        processed recursively. An HTTP error is logged and ends the scrape
        of this page.

        Args:
            url (str): url to video page (e.g. https://www.exploratorium.edu/video/inflatable-jimmy-kuehnle)
            topic (TopicNode): topic to add video nodes to
    """
    try:
        page = BeautifulSoup(read(url), 'html5lib')
        results = page.find_all('div', {'class': 'search-result'})
        for entry in results:
            heading = entry.find('div', {'class': 'views-field-field-html-title'})
            LOGGER.info("            {}".format(heading.text.strip()))

            # Fetch the page of the individual video
            summary = entry.find('div', {'class': 'search-description'})
            video_url = heading.find('a')['href']
            video_page = BeautifulSoup(read(video_url), 'html.parser')

            # Text shared by every video found on that page
            node_title = heading.text.strip().replace("’", "'")
            if summary:
                node_description = summary.text.strip()
            else:
                node_description = ""

            mapping = get_brightcove_mapping(video_page)
            for video_id, details in mapping.items():
                candidate = nodes.VideoNode(
                    source_id=video_id,
                    title=node_title,
                    description=node_description,
                    license=LICENSE,
                    copyright_holder=COPYRIGHT_HOLDER,
                    author=details.get('author') or "",
                    files=[files.WebVideoFile(details['url'], high_resolution=False)],
                    thumbnail=get_thumbnail_url(entry.find('img')['src']),
                )

                # Only add the video when the topic does not hold it yet
                already_there = next(
                    (child for child in topic.children
                     if child.source_id == candidate.source_id),
                    None)
                if not already_there:
                    topic.add_child(candidate)

        # Continue with the following page, when there is one
        following_url = get_next_page_url(page)
        if following_url:
            scrape_video_collection(following_url, topic)

    except requests.exceptions.HTTPError:
        LOGGER.error("Could not read collection at {}".format(url))
Beispiel #4
0
def add_files(node, file_list):
    """
    Build a file object for each description in `file_list` and add it to `node`.

    Every description is a dict whose 'file_type' must be one of the video,
    audio, document, HTML5, thumbnail or subtitles constants; anything else
    is logged as critical and rejected with NotImplementedError.
    """
    supported_kinds = (
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE,
        SUBTITLES_FILE,
    )

    for entry in file_list:
        kind = entry.get('file_type')
        if kind not in supported_kinds:
            LOGGER.critical(kind)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        # The path can be an URL or a local path (or None)
        location = entry.get('path')
        language = entry.get('language')

        if kind == VIDEO_FILE:
            # Three flavours of video: YouTube, other web video, plain file
            if 'youtube_id' in entry or 'web_url' in entry:
                remote_options = dict(
                    download_settings=entry.get('download_settings'),
                    high_resolution=entry.get('high_resolution', True),
                    maxheight=entry.get('maxheight'),
                    language=language,
                )
                if 'youtube_id' in entry:
                    new_file = files.YouTubeVideoFile(
                        youtube_id=entry['youtube_id'], **remote_options)
                else:
                    new_file = files.WebVideoFile(
                        web_url=entry['web_url'], **remote_options)
            else:
                new_file = files.VideoFile(
                    path=entry['path'],
                    language=language,
                    ffmpeg_settings=entry.get('ffmpeg_settings'),
                )

        elif kind == AUDIO_FILE:
            new_file = files.AudioFile(path=entry['path'], language=language)

        elif kind == DOCUMENT_FILE:
            new_file = files.DocumentFile(path=location, language=language)

        elif kind == HTML5_FILE:
            new_file = files.HTMLZipFile(path=location, language=language)

        elif kind == THUMBNAIL_FILE:
            # A thumbnail is either inline base64 data or a path
            if 'encoding' in entry:
                new_file = files.Base64ImageFile(encoding=entry['encoding'])
            else:
                new_file = files.ThumbnailFile(path=location, language=language)

        elif kind == SUBTITLES_FILE:
            # Subtitles require an explicit 'language' entry
            if 'youtube_id' in entry:
                new_file = files.YouTubeSubtitleFile(
                    youtube_id=entry['youtube_id'],
                    language=entry['language'])
            else:
                new_file = files.SubtitleFile(
                    path=location, language=entry['language'])

        else:
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(
                entry['path']))

        node.add_file(new_file)
Beispiel #5
0
    def construct_channel(self, *args, **kwargs):
        """
        Creates ChannelNode and adds one VideoNode per entry of the playlist.
        Args:
          - args: arguments passed in during upload_channel (currently None)
          - kwargs: extra arguments and options not handled by `uploadchannel`.
            For example, add the command line option   lang="fr"  and the string
            "fr" will be passed along to `construct_channel` as kwargs['lang'].
        Returns: ChannelNode

        The videos are attached directly to the channel, ordered by the first
        number found in each title. Titles without any number come after the
        numbered ones, in their original relative order.
        """
        channel = self.get_channel(
            *args,
            **kwargs)  # Create ChannelNode from data in self.channel_info

        def title_order(video):
            # Sort key: (0, n) for a title containing the number n, (1, 0) for
            # a title without digits, so the latter no longer crash the sort.
            found = re.search(r"\d+", video['title'])
            return (0, int(found.group())) if found else (1, 0)

        # Download the playlist/video information
        try:
            with youtube_dl.YoutubeDL({'skip_download': True}) as ydl:
                info_dict = ydl.extract_info(PLAYLISTS_URL, download=False)

                # Generate videos based off video entries in dict
                for video in sorted(info_dict['entries'], key=title_order):
                    # Use the first listed thumbnail; None (not the integer 0
                    # that `len(...) and ...` produced) when there is none.
                    thumbnails = video.get('thumbnails') or []
                    thumbnail_url = thumbnails[0]['url'] if thumbnails else None

                    channel.add_child(
                        nodes.VideoNode(
                            title=video['title'],
                            source_id=video['id'],
                            license=licenses.PublicDomainLicense(),
                            description=video['description'],
                            derive_thumbnail=not thumbnail_url,
                            files=[files.WebVideoFile(video['webpage_url'])],
                            thumbnail=thumbnail_url,
                            author=AUTHOR,
                            # tags = video['categories'] + video['tags'], # TODO: uncomment this when added
                        ))
        except Exception:
            # Print the traceback to stdout, then let the error propagate.
            import sys
            import traceback
            traceback.print_exc(file=sys.stdout)
            raise

        raise_for_invalid_channel(
            channel)  # Check for errors in channel construction

        return channel