def add_files(node, file_list):
    """Attach one file object to ``node`` for every entry in ``file_list``.

    The type of each entry is decided by ``guess_file_type`` from the node's
    ``kind`` plus the entry's ``path``, ``youtube_id``, ``web_url`` and
    ``encoding`` values; the matching ``files.*`` object is then passed to
    ``node.add_file``.

    Args:
        node: object exposing ``kind`` and ``add_file``.
        file_list: iterable of dicts describing the files to attach.

    Raises:
        UnknownFileTypeError: the guessed type is not one of the handled types.
        KeyError: a key that is mandatory for the guessed type is missing
            (``language`` for subtitles, ``encoding`` for base64 images,
            ``web_url`` / ``youtube_id`` for web and YouTube videos).
    """
    for f in file_list:
        path = f.get('path')
        if path is not None:
            # NEW: expand content:// --> ./content/ in file paths
            abspath = get_abspath(path)
        else:
            abspath = None

        file_type = guess_file_type(
            node.kind,
            filepath=abspath,
            youtube_id=f.get('youtube_id'),
            web_url=f.get('web_url'),
            encoding=f.get('encoding'),
        )

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(
                path=abspath,
                language=f.get('language'),
                ffmpeg_settings=f.get('ffmpeg_settings'),
            ))
        elif file_type == FileTypes.SUBTITLE_FILE:
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(
                web_url=f['web_url'],
                high_resolution=f.get('high_resolution'),
            ))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(
                youtube_id=f['youtube_id'],
                high_resolution=f.get('high_resolution'),
            ))
            # A YouTube video entry also gets an English subtitle file.
            node.add_file(files.YouTubeSubtitleFile(
                youtube_id=f['youtube_id'],
                language='en',
            ))
        else:
            # Use the value fetched with .get() above: indexing f['path'] here
            # would raise KeyError (masking the real problem) for entries
            # that carry no 'path' key at all.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(path))
def fetch_video(video):
    """Build a ``nodes.VideoNode`` for one YouTube video description.

    Args:
        video: mapping that must provide the keys ``id``, ``title``,
            ``description``, ``webpage_url`` and ``subtitles`` (the latter a
            mapping whose keys are subtitle language codes).

    Returns:
        The video node, carrying a ``YouTubeVideoFile`` plus one
        ``YouTubeSubtitleFile`` per subtitle language that the ``languages``
        module recognises.
    """
    youtube_id = video['id']
    title = video['title']
    description = video['description']
    youtube_url = video['webpage_url']
    available_codes = video['subtitles'].keys()

    print(" Fetching video data: %s (%s)" % (title, youtube_url))

    node = nodes.VideoNode(
        source_id=youtube_id,
        title=truncate_metadata(title),
        license=LICENSE,
        description=truncate_description(description),
        derive_thumbnail=True,
        language="en",
        files=[files.YouTubeVideoFile(youtube_id=youtube_id)],
    )

    # Add subtitles in whichever languages are available.
    for code in available_codes:
        # TODO(david): Should catch exception thrown by
        # files.YouTubeSubtitleFile rather than breaking abstraction.
        known = languages.getlang(code) or languages.getlang_by_alpha2(code)
        if not known:
            print("WARNING: Subtitle language %s not found in languages file"
                  % code)
            continue
        node.add_file(
            files.YouTubeSubtitleFile(youtube_id=youtube_id, language=code))

    return node
def add_files(node, file_list):
    """Attach one file object to ``node`` for every entry in ``file_list``.

    Each entry is a dict whose ``file_type`` value selects the ``files.*``
    class to instantiate. Video entries are further split into YouTube
    (``youtube_id`` present), web (``web_url`` present) and plain file
    videos; thumbnails may be base64-encoded (``encoding`` present);
    subtitles may come from YouTube (``youtube_id`` present).

    Args:
        node: object exposing ``add_file``.
        file_list: iterable of dicts describing the files to attach.

    Raises:
        NotImplementedError: an entry's ``file_type`` is not one of the
            expected types (the offending value is logged first).
        KeyError: a key that is mandatory for the entry's type is missing
            (``path`` for plain video and audio files, ``language`` for
            subtitles).
    """
    EXPECTED_FILE_TYPES = [
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE,
        SUBTITLES_FILE
    ]

    for f in file_list:
        file_type = f.get('file_type')
        if file_type not in EXPECTED_FILE_TYPES:
            LOGGER.critical(file_type)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        path = f.get('path')  # path can be an URL or a local path (or None)

        # handle different types of files
        if file_type == VIDEO_FILE:
            # handle three types of video files
            if 'youtube_id' in f:
                video_file = files.YouTubeVideoFile(
                    youtube_id=f['youtube_id'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            elif 'web_url' in f:
                video_file = files.WebVideoFile(
                    web_url=f['web_url'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            else:
                video_file = files.VideoFile(
                    path=f['path'],
                    language=f.get('language', None),
                    ffmpeg_settings=f.get('ffmpeg_settings'),
                )
            node.add_file(video_file)

        elif file_type == AUDIO_FILE:
            node.add_file(
                files.AudioFile(path=f['path'], language=f.get('language', None)))

        elif file_type == DOCUMENT_FILE:
            node.add_file(
                files.DocumentFile(path=path, language=f.get('language', None)))

        elif file_type == HTML5_FILE:
            node.add_file(
                files.HTMLZipFile(path=path, language=f.get('language', None)))

        elif file_type == THUMBNAIL_FILE:
            if 'encoding' in f:
                node.add_file(files.Base64ImageFile(encoding=f['encoding']))
            else:
                node.add_file(
                    files.ThumbnailFile(
                        path=path,
                        language=f.get('language', None),
                    ))

        elif file_type == SUBTITLES_FILE:
            if 'youtube_id' in f:
                node.add_file(
                    files.YouTubeSubtitleFile(youtube_id=f['youtube_id'],
                                              language=f['language']))
            else:
                node.add_file(
                    files.SubtitleFile(path=path, language=f['language']))

        else:
            # Defensive only: the membership check above already rejects
            # every type not handled by this chain. Format the .get() result
            # so that an entry without 'path' cannot turn this into KeyError.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(path))
def add_video_nodes_from_playlist(
        self,
        youtube_client,
        playlist_id,
        subtitle_languages=SUBTITLE_LANGUAGES,
        copyright_holder=COPYRIGHT_HOLDER,
        only_creative_commons=ONLY_CREATIVE_COMMONS):
    """Add one ``VideoNode`` child to ``self`` per video of a playlist.

    The playlist is read page by page (50 items per request) through
    ``youtube_client``; for every page the videos' ``status`` and
    ``snippet`` parts are fetched and turned into nodes.

    Args:
        youtube_client: client object exposing ``playlistItems().list`` and
            ``videos().list`` request builders.
        playlist_id: id of the playlist to read.
        subtitle_languages: language codes for which a YouTube subtitle file
            is attached to each video (unsupported codes are reported and
            skipped).
        copyright_holder: passed to ``get_license`` for every video.
        only_creative_commons: when true, videos whose license is not
            ``creativeCommon`` are reported and skipped.
    """
    playlist_request_kwargs = {
        'part': 'contentDetails',
        'maxResults': 50,
        'playlistId': playlist_id,
    }

    # Apparently the same video is in one of the playlists twice!
    # This is used to keep track of videos that have already been added.
    videos_added = {}

    while True:
        playlist_info = youtube_client.playlistItems().list(
            **playlist_request_kwargs).execute()
        playlist_items = playlist_info['items']
        video_ids = [
            vid['contentDetails']['videoId'] for vid in playlist_items
        ]
        videos = youtube_client.videos().list(
            part='status,snippet', id=','.join(video_ids)).execute()['items']

        for video in videos:
            video_id = video['id']
            if video_id in videos_added:
                continue

            video_status_license = video['status']['license']
            is_creative_commons = video_status_license == 'creativeCommon'

            if only_creative_commons and not is_creative_commons:
                print(
                    "The video '{}' is not licensed as Creative Commons... it is licensed as {}"
                    .format(video['snippet']['title'], video_status_license))
                continue

            video_license = (licenses.CC_BY if is_creative_commons
                             else NON_CREATIVE_COMMONS_LICENSE_DEFAULT)
            video_node = nodes.VideoNode(
                # source_id="{}__{}".format(video['id'], playlist_id),
                source_id=video_id,
                title=video['snippet']['title'],
                language=CHANNEL_LANGUAGE,
                license=get_license(
                    video_license, copyright_holder=copyright_holder),
                thumbnail=get_largest_thumbnail(
                    video['snippet']['thumbnails']).get('url'),
                files=[
                    files.YouTubeVideoFile(video_id),
                ])

            # Get subtitles for languages designated in SUBTITLE_LANGUAGES
            for lang_code in subtitle_languages:
                if files.is_youtube_subtitle_file_supported_language(
                        lang_code):
                    video_node.add_file(
                        files.YouTubeSubtitleFile(
                            youtube_id=video_id, language=lang_code))
                else:
                    print('Unsupported subtitle language code:', lang_code)

            self.add_child(video_node)
            videos_added[video_id] = video_node

        # set up the next page, if there is one
        next_page_token = playlist_info.get('nextPageToken')
        if not next_page_token:
            # Last page reached; the request kwargs are local to this call,
            # so there is nothing to clean up before leaving the loop.
            break
        playlist_request_kwargs['pageToken'] = next_page_token
def scrape_content(title, content_url):
    """Scrape one content page and turn it into a content node.

    Example arguments::

        title: Boys' clothing
        content_url: http://www.touchableearth.org/china-culture-boys-clothing/

    A page with an embedded YouTube iframe becomes a ``nodes.VideoNode``
    (with one subtitle file per language reported for the video); otherwise
    a page with a main image becomes a ``nodes.HTML5AppNode`` wrapping a tiny
    generated HTML page that shows that image.

    Args:
        title: title to give the resulting node.
        content_url: URL of the page to scrape.

    Returns:
        The created node, or ``None`` when the page cannot be fetched, the
        YouTube id cannot be extracted, the video information cannot be
        retrieved, or the page has neither a video nor an image.
    """
    print(" Scraping content node: %s (%s)" % (title, content_url))

    doc = get_parsed_html_from_url(content_url)
    if not doc:  # 404
        return None

    description = create_description(doc)
    source_id = doc.select_one(".current_post.active .post_id")["value"]

    base_node_attributes = {
        "source_id": source_id,
        "title": title,
        "license": TE_LICENSE,
        "description": description,
    }

    youtube_iframe = doc.select_one(".video-container iframe")
    if youtube_iframe:
        # Reuse the element found above instead of selecting it again.
        youtube_url = youtube_iframe["src"]
        youtube_id = get_youtube_id_from_url(youtube_url)

        if not youtube_id:
            print(" *** WARNING: youtube_id not found for content url",
                  content_url)
            print(" Skipping.")
            return None

        try:
            info = ydl.extract_info(youtube_url, download=False)
            subtitles = info.get("subtitles")
            subtitle_languages = subtitles.keys() if subtitles else []
            print(" ... with subtitles in languages:", subtitle_languages)
        except youtube_dl.DownloadError as e:
            # Some of the videos have been removed from the YouTube channel --
            # skip creating content nodes for them entirely so they don't show up
            # as non-loadable videos in Kolibri.
            print(" NOTE: Skipping video download due to error: ", e)
            return None

        video_node = nodes.VideoNode(
            **base_node_attributes,
            derive_thumbnail=True,
            files=[WatermarkedYouTubeVideoFile(youtube_id=youtube_id)],
        )

        # Add subtitles in whichever languages are available.
        for language in subtitle_languages:
            video_node.add_file(
                files.YouTubeSubtitleFile(youtube_id=youtube_id,
                                          language=language))

        return video_node

    img = doc.select_one(".uncode-single-media-wrapper img")
    if img:
        # Subscripting a missing attribute raises KeyError (assuming a
        # BeautifulSoup-style tag -- confirm), so "data-guid" must be read
        # with .get() for the fallback to "src" to ever take effect.
        img_src = img.get("data-guid") or img["src"]

        destination = tempfile.mkdtemp()
        download_file(img_src, destination, request_fn=make_request,
                      filename="image.jpg")

        # Minimal page that displays the downloaded image.
        index_html = (
            ' <!doctype html> <html> <head></head> <body> '
            '<img src="image.jpg" style="width: 100%; max-width: 1200px;" /> '
            '</body> </html> '
        )
        with open(os.path.join(destination, "index.html"), "w",
                  encoding="utf-8") as f:
            f.write(index_html)

        zip_path = create_predictable_zip(destination)

        return nodes.HTML5AppNode(
            **base_node_attributes,
            files=[files.HTMLZipFile(zip_path)],
            thumbnail=img_src,
        )

    return None
def construct_channel(self, *args, **kwargs):
    """
    Creates ChannelNode and build topic tree

    One ``TopicNode`` is created per playlist of the YouTube channel, and one
    ``VideoNode`` per Creative Commons licensed video of that playlist. Only
    the first page (up to 50 entries) of the channel's playlists is
    requested; playlist items themselves are read page by page.

    Args:
      - args: arguments passed in during upload_channel (currently None)
      - kwargs: extra arguments and options not handled by `uploadchannel`.
        For example, add the command line option   lang="fr"  and the string
        "fr" will be passed along to `construct_channel` as kwargs['lang'].
        ``kwargs['--youtube-api-token']`` is required and used as the
        YouTube Data API developer key.
    Returns: ChannelNode
    """
    channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

    from apiclient.discovery import build

    # instantiate a YouTube Data API v3 client
    youtube = build('youtube', 'v3', developerKey=kwargs['--youtube-api-token'])
    playlists = youtube.playlists().list(  # list all of the YouTube channel's playlists
        part='snippet',
        channelId=YOUTUBE_CHANNEL_ID,
        maxResults=50
    ).execute()['items']

    # For getting the thumbnail automatically
    # youtube_channel = youtube.channels().list(
    #     id=YOUTUBE_CHANNEL_ID,
    #     part='snippet'
    # ).execute()['items'][0]
    # channel.thumbnail = get_largest_thumbnail(youtube_channel['snippet']['thumbnails']).get('url')

    for playlist in playlists:
        topic = nodes.TopicNode(title=playlist['snippet']['title'], source_id=playlist['id'])
        playlist_request_kwargs = {
            'part': 'contentDetails',
            'maxResults': 50,
            'playlistId': playlist['id'],
        }

        while True:
            playlist_info = youtube.playlistItems().list(**playlist_request_kwargs).execute()
            playlist_items = playlist_info['items']

            video_ids = [vid['contentDetails']['videoId'] for vid in playlist_items]
            videos = youtube.videos().list(
                part='status,snippet',
                id=','.join(video_ids)
            ).execute()['items']

            for video in videos:
                # Only Creative Commons licensed videos are imported.
                if video['status']['license'] != 'creativeCommon':
                    continue

                video_node = nodes.VideoNode(
                    source_id=video['id'],
                    title=video['snippet']['title'],
                    language=CHANNEL_LANGUAGE,
                    license=get_license(licenses.CC_BY, copyright_holder='Espresso English'),
                    thumbnail=get_largest_thumbnail(video['snippet']['thumbnails']).get('url'),
                    files=[
                        files.YouTubeVideoFile(video['id']),
                    ]
                )
                topic.add_child(video_node)

                # Get subtitles for languages designated in SUBTITLE_LANGUAGES
                for lang_code in SUBTITLE_LANGUAGES:
                    if files.is_youtube_subtitle_file_supported_language(lang_code):
                        video_node.add_file(
                            files.YouTubeSubtitleFile(
                                youtube_id=video['id'],
                                language=lang_code
                            )
                        )
                    else:
                        print('Unsupported subtitle language code:', lang_code)

            # set up the next page, if there is one
            next_page_token = playlist_info.get('nextPageToken')
            if not next_page_token:
                # Last page of this playlist; the request kwargs are rebuilt
                # for the next playlist, so no cleanup is needed here.
                break
            playlist_request_kwargs['pageToken'] = next_page_token

        channel.add_child(topic)

    raise_for_invalid_channel(channel)  # Check for errors in channel construction

    return channel