def add_files(node, file_list):
    """Attach every file described in ``file_list`` to ``node``.

    The type of each entry is inferred by ``guess_file_type`` from the node
    kind and from the entry's ``path``, ``youtube_id``, ``web_url`` and
    ``encoding`` values; the matching ``files.*`` object is then added to
    the node with ``node.add_file``.

    Args:
        node: node receiving the files; must expose ``kind`` and ``add_file``.
        file_list: iterable of dicts, one per file to attach.

    Raises:
        UnknownFileTypeError: if the inferred type of an entry is not one of
            the types handled below.
        KeyError: if a subtitle entry has no ``language`` key.
    """
    for f in file_list:
        path = f.get('path')
        # Resolve the path through get_abspath (expands content:// --> ./content/
        # in file paths); entries without a path (e.g. YouTube videos) keep None.
        abspath = get_abspath(path) if path is not None else None

        file_type = guess_file_type(
            node.kind,
            filepath=abspath,
            youtube_id=f.get('youtube_id'),
            web_url=f.get('web_url'),
            encoding=f.get('encoding'),
        )

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(
                path=abspath,
                language=f.get('language'),
                ffmpeg_settings=f.get('ffmpeg_settings'),
            ))
        elif file_type == FileTypes.SUBTITLE_FILE:
            # Subtitles are the only type for which a language is mandatory.
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(
                web_url=f['web_url'],
                high_resolution=f.get('high_resolution'),
            ))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(
                youtube_id=f['youtube_id'],
                high_resolution=f.get('high_resolution'),
            ))
            # Every YouTube video also gets its English subtitles attached.
            node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'], language='en'))
        else:
            # Use .get(): an unrecognized entry may have no 'path' at all, and
            # indexing would mask the real problem behind a KeyError.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(f.get('path')))
def construct_channel(self, *args, **kwargs):
    """
    Creates ChannelNode and build topic tree

    Args:
      - args: arguments passed in during upload_channel (currently None)
      - kwargs: extra arguments and options not handled by `uploadchannel`.
        For example, add the command line option lang="fr" and the string
        "fr" will be passed along to `construct_channel` as kwargs['lang'].

    Returns: ChannelNode

    Healing Classrooms is organized with the following hierarchy:
        Playlist (TopicNode)
        |   Youtube Video (VideoNode)
        |   Youtube Video (VideoNode)
    """
    channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

    # Download the playlist/video information (metadata only, no media)
    with youtube_dl.YoutubeDL({'skip_download': True}) as ydl:
        info_dict = ydl.extract_info(PLAYLISTS_URL, download=False)

        # Generate topics based off playlist entries in dict
        for playlist in info_dict['entries']:

            # Get language of playlist (hack): guessed from the playlist
            # title, defaulting to French when no keyword matches.
            language = "fr"
            if "English" in playlist['title']:
                language = "en"
            elif "Arabic" in playlist['title']:
                language = "ar"

            playlist_topic = nodes.TopicNode(
                title=playlist['title'],
                source_id=playlist['id'],
                language=language,
            )
            channel.add_child(playlist_topic)

            # Generate videos based off video entries in dict
            for video in playlist['entries']:
                # Use the first listed thumbnail, or None when there is none.
                # (The previous `len(...) and ...` form produced the integer 0
                # here, which then got passed on as the node's thumbnail.)
                thumbnails = video.get('thumbnails') or []
                thumbnail_url = thumbnails[0]['url'] if thumbnails else None

                playlist_topic.add_child(nodes.VideoNode(
                    title=video['title'],
                    source_id=video['id'],
                    license=licenses.PublicDomainLicense(),
                    description=video['description'],
                    # Only derive a thumbnail when the source provides none.
                    derive_thumbnail=not thumbnail_url,
                    files=[files.WebVideoFile(video['webpage_url'])],
                    thumbnail=thumbnail_url,
                    author=AUTHOR,
                    # tags = video['categories'] + video['tags'], # TODO: uncomment this when added
                ))

    raise_for_invalid_channel(channel)  # Check for errors in channel construction

    return channel
def scrape_video_collection(url, topic):
    """ Walk a video collection page and attach its videos to ``topic``.

        Every ``search-result`` entry on the page is opened, the videos found
        by ``get_brightcove_mapping`` on the linked page are turned into
        VideoNodes, and nodes whose ``source_id`` is not yet among the topic's
        children are added. Follow-up pages are handled by recursing on the
        url returned by ``get_next_page_url``.

        Args:
            url (str): url to video page (e.g. https://www.exploratorium.edu/video/inflatable-jimmy-kuehnle)
            topic (TopicNode): topic to add video nodes to
    """
    try:
        page = BeautifulSoup(read(url), 'html5lib')

        for entry in page.find_all('div', {'class': 'search-result'}):
            heading = entry.find('div', {'class': 'views-field-field-html-title'})
            heading_text = heading.text.strip()
            LOGGER.info(" {}".format(heading_text))

            # Title and description are shared by every video of this entry
            summary = entry.find('div', {'class': 'search-description'})
            if summary:
                description_text = summary.text.strip()
            else:
                description_text = ""
            node_title = heading_text.replace("’", "'")

            # Get video from given url
            link = heading.find('a')['href']
            video_page = BeautifulSoup(read(link), 'html.parser')

            for video_id, info in get_brightcove_mapping(video_page).items():
                candidate = nodes.VideoNode(
                    source_id=video_id,
                    title=node_title,
                    description=description_text,
                    license=LICENSE,
                    copyright_holder=COPYRIGHT_HOLDER,
                    author=info.get('author') or "",
                    files=[files.WebVideoFile(info['url'], high_resolution=False)],
                    thumbnail=get_thumbnail_url(entry.find('img')['src']),
                )

                # If video doesn't already exist here, add to topic
                already_there = any(
                    child.source_id == candidate.source_id
                    for child in topic.children
                )
                if not already_there:
                    topic.add_child(candidate)

        # Scrape next page (if any)
        following_url = get_next_page_url(page)
        if following_url:
            scrape_video_collection(following_url, topic)

    except requests.exceptions.HTTPError:
        LOGGER.error("Could not read collection at {}".format(url))
def add_files(node, file_list):
    """Attach every file described in ``file_list`` to ``node``.

    Each entry is a dict whose ``file_type`` value selects the ``files.*``
    class to build; the remaining keys (``path``, ``youtube_id``,
    ``web_url``, ``encoding``, ``language``, ...) supply its arguments.

    Args:
        node: node receiving the files; must expose ``add_file``.
        file_list: iterable of dicts, one per file to attach.

    Raises:
        NotImplementedError: if an entry's ``file_type`` is not one of the
            expected types.
        KeyError: if a local video or audio entry has no ``path``, or a
            subtitle entry has no ``language``.
    """
    EXPECTED_FILE_TYPES = [
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE,
        THUMBNAIL_FILE, SUBTITLES_FILE,
    ]

    for f in file_list:
        file_type = f.get('file_type')
        if file_type not in EXPECTED_FILE_TYPES:
            LOGGER.critical(file_type)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        path = f.get('path')  # path can be an URL or a local path (or None)

        # handle different types of files
        if file_type == VIDEO_FILE:
            # handle three types of video files
            if 'youtube_id' in f:
                video_file = files.YouTubeVideoFile(
                    youtube_id=f['youtube_id'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            elif 'web_url' in f:
                video_file = files.WebVideoFile(
                    web_url=f['web_url'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            else:
                # Local or downloadable video: a path is mandatory here.
                video_file = files.VideoFile(
                    path=f['path'],
                    language=f.get('language', None),
                    ffmpeg_settings=f.get('ffmpeg_settings'),
                )
            node.add_file(video_file)

        elif file_type == AUDIO_FILE:
            node.add_file(
                files.AudioFile(path=f['path'], language=f.get('language', None)))

        elif file_type == DOCUMENT_FILE:
            node.add_file(
                files.DocumentFile(path=path, language=f.get('language', None)))

        elif file_type == HTML5_FILE:
            node.add_file(
                files.HTMLZipFile(path=path, language=f.get('language', None)))

        elif file_type == THUMBNAIL_FILE:
            # A thumbnail is either inline base64 data or a path/URL.
            if 'encoding' in f:
                node.add_file(files.Base64ImageFile(encoding=f['encoding']))
            else:
                node.add_file(
                    files.ThumbnailFile(
                        path=path,
                        language=f.get('language', None),
                    ))

        elif file_type == SUBTITLES_FILE:
            if 'youtube_id' in f:
                node.add_file(
                    files.YouTubeSubtitleFile(youtube_id=f['youtube_id'],
                                              language=f['language']))
            else:
                node.add_file(
                    files.SubtitleFile(path=path, language=f['language']))

        else:
            # Defensive guard: only reachable if EXPECTED_FILE_TYPES gains a
            # type that the chain above does not handle. `path` may be None,
            # so it is formatted rather than indexed out of the entry.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(path))
def construct_channel(self, *args, **kwargs):
    """
    Creates ChannelNode and adds the videos to it

    Args:
      - args: arguments passed in during upload_channel (currently None)
      - kwargs: extra arguments and options not handled by `uploadchannel`.
        For example, add the command line option lang="fr" and the string
        "fr" will be passed along to `construct_channel` as kwargs['lang'].

    Returns: ChannelNode

    The entries found at PLAYLISTS_URL are added directly under the channel,
    ordered by the first number appearing in each title:
        Channel (ChannelNode)
        |   Youtube Video (VideoNode)
        |   Youtube Video (VideoNode)
    """
    channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

    def title_number(video):
        """Sort key: first number in the title; un-numbered titles go last."""
        found = re.search(r"\d+", video['title'])
        if found is None:
            # No number at all: keep such videos after the numbered ones
            # (sorted() is stable, so they retain their source order).
            return (1, 0)
        return (0, int(found.group()))

    try:
        # Download the playlist/video information (metadata only, no media)
        with youtube_dl.YoutubeDL({'skip_download': True}) as ydl:
            info_dict = ydl.extract_info(PLAYLISTS_URL, download=False)

            # Generate videos based off video entries in dict
            videos = sorted(info_dict['entries'], key=title_number)
            print([v['title'] for v in videos])

            for video in videos:
                # Use the first listed thumbnail, or None when there is none.
                # (The previous `len(...) and ...` form produced the integer 0
                # here, which then got passed on as the node's thumbnail.)
                thumbnails = video.get('thumbnails') or []
                thumbnail_url = thumbnails[0]['url'] if thumbnails else None

                channel.add_child(
                    nodes.VideoNode(
                        title=video['title'],
                        source_id=video['id'],
                        license=licenses.PublicDomainLicense(),
                        description=video['description'],
                        # Only derive a thumbnail when the source provides none.
                        derive_thumbnail=not thumbnail_url,
                        files=[files.WebVideoFile(video['webpage_url'])],
                        thumbnail=thumbnail_url,
                        author=AUTHOR,
                        # tags = video['categories'] + video['tags'], # TODO: uncomment this when added
                    ))
    except Exception:
        # Show the full traceback on stdout before propagating the error.
        import sys
        import traceback
        traceback.print_exc(file=sys.stdout)
        raise

    raise_for_invalid_channel(channel)  # Check for errors in channel construction

    return channel