def add_files(node, file_list):
    """Attach a file object to ``node`` for every entry in ``file_list``.

    For each entry the ``path`` (when present) is resolved with
    ``get_abspath`` and the file type is determined by ``guess_file_type``
    from the node kind, the resolved path and the entry's ``youtube_id``,
    ``web_url`` and ``encoding`` values. A matching ``files.*`` object is
    then added to the node.

    Args:
        node: Object exposing a ``kind`` attribute and an ``add_file`` method.
        file_list: Iterable of dicts describing files. Keys read here are
            ``path``, ``youtube_id``, ``web_url``, ``encoding``,
            ``language``, ``ffmpeg_settings`` and ``high_resolution``.

    Raises:
        UnknownFileTypeError: If the guessed type is none of the handled ones.
        KeyError: If an entry lacks a key that its file type requires
            (for example ``language`` on a subtitle entry).
    """
    for f in file_list:
        path = f.get('path')
        # NEW: expand content:// --> ./content/ in file paths
        abspath = get_abspath(path) if path is not None else None

        file_type = guess_file_type(
            node.kind,
            filepath=abspath,
            youtube_id=f.get('youtube_id'),
            web_url=f.get('web_url'),
            encoding=f.get('encoding'),
        )

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(
                path=abspath,
                language=f.get('language'),
                ffmpeg_settings=f.get('ffmpeg_settings'),
            ))
        elif file_type == FileTypes.SUBTITLE_FILE:
            # Subtitles require an explicit language, hence the hard lookup.
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(
                web_url=f['web_url'],
                high_resolution=f.get('high_resolution'),
            ))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(
                youtube_id=f['youtube_id'],
                high_resolution=f.get('high_resolution'),
            ))
            # An English subtitle file is always requested alongside the video.
            node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'], language='en'))
        else:
            # Use the value fetched with .get(): indexing f['path'] here would
            # raise KeyError for entries without a path and hide the real error.
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(path))
def scrape_iversity(channel):
    """Scrape the iversity lesson-unit listing and add its chapters to ``channel``.

    Every ``chapter-units-wrapper`` element on the listing page becomes a
    ``TopicNode``. Each ``unit-wrapper`` link inside it that carries a
    ``unit_video`` icon is fetched and turned into a ``VideoNode`` holding
    the 480 ``source`` video and the ``subtitles`` track of the page's
    ``<video>`` element; other units are only logged.

    Args:
        channel: Node receiving one child topic per chapter via ``add_child``.
    """
    url = "{}/en/my/courses/rethinking-us-them-integration-and-diversity-in-europe/lesson_units".format(
        BASE_URL)
    LOGGER.info(" Scraping Migration Matters at {}".format(url))
    listing = read_source(url)

    for chapter in listing.find_all('div', {'class': 'chapter-units-wrapper'}):
        title = str(chapter.find('div', {'class': 'chapter-title'}).string)
        topic = nodes.TopicNode(
            source_id=title.strip().replace(" ", "_"),
            title=title)

        for lesson in chapter.find_all('a', {'class': 'unit-wrapper'}):
            has_video = lesson.find('i', {'class': 'unit_video'})
            lesson_title = str(
                lesson.find('span', {'class': 'unit-title'}).string).strip()

            # Units without a video icon are skipped with a log entry.
            if not has_video:
                LOGGER.info(
                    "Format of the file is not supported by the sushi chef : {}"
                    .format(lesson_title))
                continue

            lesson_page = read_source(
                "{}{}".format(BASE_URL, lesson.attrs["href"]))
            player = lesson_page.find('video')

            subtitle_src = player.find(
                'track', {'kind': 'subtitles'}).attrs["src"]
            subtitle_file = files.SubtitleFile(
                path=subtitle_src,
                language=languages.getlang('en').code)

            video_src = player.find('source', {'res': '480'}).attrs["src"]
            video_file = files.VideoFile(
                path=video_src,
                language=languages.getlang('en').code)

            video_node = nodes.VideoNode(
                source_id=lesson_title.replace(" ", "_"),
                title=lesson_title,
                files=[video_file, subtitle_file],
                license=CHANNEL_LICENSE,
                copyright_holder=COPYRIGHT_HOLDER)
            # lesson_title is already stripped, so it is logged as-is.
            LOGGER.info(" Uploading video - {}".format(lesson_title))
            topic.add_child(video_node)

        channel.add_child(topic)
def add_files(node, file_list):
    """Attach a file object to ``node`` for every entry in ``file_list``.

    Each entry is a dict whose ``file_type`` selects which ``files.*``
    class is created. Video entries are built from ``youtube_id``,
    ``web_url`` or ``path`` (checked in that order); thumbnails may be
    given as a base64 ``encoding`` instead of a path; subtitle entries may
    reference a ``youtube_id`` instead of a path.

    Args:
        node: Object exposing an ``add_file`` method.
        file_list: Iterable of dicts describing files.

    Raises:
        NotImplementedError: If an entry's ``file_type`` is not one of the
            expected types.
        KeyError: If an entry lacks a key that its branch requires
            (``path`` for local video/audio, ``language`` for subtitles).
        UnknownFileTypeError: Defensive fallback if an expected type has no
            handling branch.
    """
    EXPECTED_FILE_TYPES = [
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE,
        SUBTITLES_FILE
    ]
    for f in file_list:
        file_type = f.get('file_type')
        if file_type not in EXPECTED_FILE_TYPES:
            LOGGER.critical(file_type)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        path = f.get('path')  # path can be an URL or a local path (or None)

        # handle different types of files
        if file_type == VIDEO_FILE:
            # handle three types of video files
            if 'youtube_id' in f:
                video_file = files.YouTubeVideoFile(
                    youtube_id=f['youtube_id'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            elif 'web_url' in f:
                video_file = files.WebVideoFile(
                    web_url=f['web_url'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            else:
                # A plain video entry must carry a path; a missing key fails
                # loudly here, as in the original code.
                video_file = files.VideoFile(
                    path=f['path'],
                    language=f.get('language', None),
                    ffmpeg_settings=f.get('ffmpeg_settings'),
                )
            node.add_file(video_file)

        elif file_type == AUDIO_FILE:
            node.add_file(
                files.AudioFile(path=f['path'], language=f.get('language', None)))

        elif file_type == DOCUMENT_FILE:
            node.add_file(
                files.DocumentFile(path=path, language=f.get('language', None)))

        elif file_type == HTML5_FILE:
            node.add_file(
                files.HTMLZipFile(path=path, language=f.get('language', None)))

        elif file_type == THUMBNAIL_FILE:
            if 'encoding' in f:
                node.add_file(files.Base64ImageFile(encoding=f['encoding']))
            else:
                node.add_file(
                    files.ThumbnailFile(
                        path=path,
                        language=f.get('language', None),
                    ))

        elif file_type == SUBTITLES_FILE:
            if 'youtube_id' in f:
                node.add_file(
                    files.YouTubeSubtitleFile(youtube_id=f['youtube_id'],
                                              language=f['language']))
            else:
                node.add_file(
                    files.SubtitleFile(path=path, language=f['language']))

        else:
            # Only reachable if EXPECTED_FILE_TYPES gains a type without a
            # branch above. Format the .get() value so a missing 'path' key
            # cannot turn this into a KeyError.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(path))
def crawl_each_post(post_url):
    """Download the videos embedded in one post and file them under its course.

    The post's heading is split on " OF " / " FROM " and the second part is
    used as the course name. Every ``wpb_video_wrapper`` iframe is
    downloaded through youtube_dl (with automatic subtitles requested) and
    wrapped in a ``VideoNode`` that is appended to ``EPISODE_DICT[course]``.
    Posts without such wrappers are only logged.

    Args:
        post_url: URL of the post page to crawl.
    """
    response = requests.get(post_url, headers=HEADERS)
    page = BeautifulSoup(response.content, "html.parser")
    content = page.find('div', {'class': 'wpb_wrapper'})
    heading = content.find('div', {
        'class': 'vc_custom_heading'
    }).getText().strip()

    # The course name is whatever follows the first " OF " / " FROM ".
    splitter = '|'.join(re.escape(token) for token in (" OF ", " FROM "))
    course = re.split(splitter, heading)[1]

    video_wrappers = content.find_all('div', {'class': 'wpb_video_wrapper'})
    if not video_wrappers:
        LOGGER.info(
            "Format of the file is not supported by the sushi chef : {}".
            format(heading))
        return

    for wrapper in video_wrappers:
        embed_url = wrapper.find('iframe').attrs["src"].split("?feature")[0]
        video_id = embed_url.split("/")[-1]

        downloader = youtube_dl.YoutubeDL({
            'outtmpl': './downloads/%(id)s.%(ext)s',
            'writeautomaticsub': True,
            'logger': LOGGER
        })
        with downloader:
            info = downloader.extract_info(
                "http://www.youtube.com/watch?v={}".format(video_id),
                download=True)
        # When the result has 'entries', use the first entry.
        video = info['entries'][0] if 'entries' in info else info

        video_title = video["title"]
        lang_code = languages.getlang('en').code
        video_file = files.VideoFile(
            path="{}/{}.mp4".format(DOWNLOAD_DIRECTORY, video_id),
            language=lang_code)
        subtitle_file = files.SubtitleFile(
            path="{}/{}.en.vtt".format(DOWNLOAD_DIRECTORY, video_id),
            language=languages.getlang('en').code)

        video_node = nodes.VideoNode(
            source_id=video_title.strip().replace(" ", "_"),
            title=video_title,
            files=[video_file, subtitle_file],
            license=CHANNEL_LICENSE,
            copyright_holder=COPYRIGHT_HOLDER,
        )
        EPISODE_DICT.setdefault(course, []).append(video_node)
        LOGGER.info(" Uploading video - {}".format(video_title.strip()))