def add_files(node, file_list):
    """
    Add a file object to `node` for each file description in `file_list`.

    Args:
      - node: object exposing `kind` and `add_file`; receives the created files
      - file_list: iterable of dicts. 'path', 'youtube_id', 'web_url' and
        'encoding' are handed to `guess_file_type` to decide the file type;
        'language', 'ffmpeg_settings' and 'high_resolution' are forwarded to
        the file classes that take them.
    Returns: None
    Raises:
      - UnknownFileTypeError when the guessed type has no branch below
      - KeyError when a subtitle entry has no 'language' key
    """
    for f in file_list:
        path = f.get('path')
        if path is not None:
            abspath = get_abspath(path)  # NEW: expand content:// --> ./content/ in file paths
        else:
            abspath = None

        file_type = guess_file_type(node.kind,
                                    filepath=abspath,
                                    youtube_id=f.get('youtube_id'),
                                    web_url=f.get('web_url'),
                                    encoding=f.get('encoding'))

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(path=abspath,
                                          language=f.get('language'),
                                          ffmpeg_settings=f.get('ffmpeg_settings')))
        elif file_type == FileTypes.SUBTITLE_FILE:
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(web_url=f['web_url'],
                                             high_resolution=f.get('high_resolution')))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(youtube_id=f['youtube_id'],
                                                 high_resolution=f.get('high_resolution')))
            # Every YouTube video also gets its English subtitles attached.
            node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'],
                                                    language='en'))
        else:
            # Report the optional `path` value: subscripting f['path'] here
            # would raise KeyError for entries without a path and hide the
            # real problem (the unrecognised file type).
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(path))
def create_document_node(path, title, target_node, source_id, **details):
    """
    Create a DocumentNode for the file at `path` and add it to `target_node`.

    Args:
      - path: location handed to files.DocumentFile
      - title (str): node title; lower-cased with spaces turned into dashes
        it also forms the second half of the node's source_id
      - target_node: node that receives the new document via add_child
      - source_id: prefix of the new node's source_id
      - details: extra keyword arguments passed through to nodes.DocumentNode
    Returns: None
    """
    attachment = files.DocumentFile(path)
    slug = title.replace(" ", "-").lower()
    node = nodes.DocumentNode(
        source_id="{}-{}".format(source_id, slug),
        title=title,
        files=[attachment],
        **details
    )
    target_node.add_child(node)
def construct_channel(self, *args, **kwargs):
    """
    Creates ChannelNode and builds the topic tree: one topic per entry of
    JSON_FILE and, below it, one PDF document per chapter returned by the
    PDF parser.

    Args:
      - args: arguments passed in during upload_channel (currently None)
      - kwargs: extra arguments and options not handled by `uploadchannel`.
        For example, add the command line option lang="fr" and the string
        "fr" will be passed along to `construct_channel` as kwargs['lang'].
    Returns: ChannelNode
    """

    def new_tags():
        # A new list on every call, so no two nodes share one list object.
        return [
            "Teacher facing",
            "Professional development",
            "Life skills",
            "Intercultural skills",
            "Mentorship",
            "Formal contexts",
        ]

    # Create ChannelNode from data in self.channel_info
    channel = self.get_channel(*args, **kwargs)

    for book in load_json_from_file(JSON_FILE):
        book_title = book['book_title']
        book_source_id = book_title.replace(" ", "_")
        book_location = book['path_or_url']

        book_topic = nodes.TopicNode(source_id=book_source_id,
                                     title=book_title,
                                     tags=new_tags())
        channel.add_child(book_topic)

        # Split the book into one PDF per table-of-contents entry
        splitter = pdf.PDFParser(book_location, toc=book['chapters'])
        splitter.open()
        for part in splitter.split_chapters():
            part_title = part['title']
            part_file = files.DocumentFile(part['path'])
            book_topic.add_child(
                nodes.DocumentNode(
                    source_id="{} {}".format(book_title, part_title),
                    title=part_title,
                    author="INTO",
                    tags=new_tags(),
                    files=[part_file],
                    license=licenses.get_license(CHANNEL_LICENSE, "INTO",
                                                 LICENSE_DESCRIPTION),
                    copyright_holder="INTO",
                ))

    # Check for errors in channel construction
    raise_for_invalid_channel(channel)
    return channel
def add_file_node(target_node, url, title, **details):
    """
    Creates a document node for `url` under the target topic node.

    The new node's source_id is the target node's source_id followed by a
    dash and the title, lower-cased with spaces replaced by dashes. Any
    extra keyword arguments are passed through to nodes.DocumentNode.
    """
    attachment = files.DocumentFile(path=url)
    slug = title.replace(" ", "-").lower()
    node_id = "{}-{}".format(target_node.source_id, slug)
    target_node.add_child(
        nodes.DocumentNode(source_id=node_id,
                           title=title,
                           files=[attachment],
                           **details))
def save_book(book_detail, channel):
    """
    Create a document node for one book and file it in the channel tree.

    The node gets the first non-empty PDF url (portrait, then landscape,
    then booklet) and the EPUB url, whichever are available. It is added
    below the language topic and reading-level topic; when the book has
    tags it goes into one tag topic per tag instead of directly into the
    level topic.

    Args:
      - book_detail (dict): reads the keys "id", "name", "readingLevel",
        "language", "tags", "epubUrl" and "pdfUrl"
      - channel: passed to get_or_create_language_topic
    Returns: None
    Raises: NoFileAvailableError when the book has neither a PDF nor an EPUB url
    """
    book_id = book_detail["id"]
    book_source_id = get_book_source_id(book_id)
    book_title = book_detail["name"]
    level_id = book_detail["readingLevel"]
    language = book_detail["language"]
    language_id = language["id"]
    tags = book_detail["tags"]
    epub_url = book_detail["epubUrl"]

    # "pdfUrl" may be empty/None; treat that as "no PDF variants at all"
    pdf_variants = book_detail["pdfUrl"] or {}
    pdf_url = ""
    for variant in ("portraitUrl", "landscapeUrl", "bookletUrl"):
        candidate = pdf_variants.get(variant, "")
        if candidate:
            pdf_url = candidate
            break

    if not (pdf_url or epub_url):
        LOGGER.error("No file found for \n {}".format(book_source_id))
        raise NoFileAvailableError()

    book_files = []
    if pdf_url:
        book_files.append(files.DocumentFile(path=pdf_url))
    if epub_url:
        book_files.append(files.EPubFile(path=epub_url))

    book = nodes.DocumentNode(
        source_id=book_source_id,
        title=book_title,
        license=licenses.PUBLIC_DOMAIN,  # TODO: get a real license and copyright holder
        files=book_files)

    language_topic = get_or_create_language_topic(language, channel)
    level_topic = get_or_create_level_topic(level_id, language_id,
                                            language_topic)

    if tags:
        for tag in tags:
            get_or_create_tag_topic(tag, language_id, level_id,
                                    level_topic).add_child(book)
    else:
        level_topic.add_child(book)
def generate_pdf_nodes(data, topic, source=""):
    """
    Generates nodes related to pdfs

    Entries with a 'header' become sub-topics whose 'chapters' are processed
    recursively; entries with a 'chapter' become a document node followed by
    one exercise node per item in 'exercises'. Other entries are ignored.

    Args:
      - data (list of dict) pdf details (split pdfs, file paths, exercises, etc.)
      - topic (TopicNode) node to add sub nodes to
      - source (str) unique string associated with this pdf
    Returns None
    """
    for entry in data:
        # Sections: create a topic and descend into it
        section_title = entry.get('header')
        if section_title:
            section_id = "{}-{}".format(source, section_title)
            section = nodes.TopicNode(title=section_title,
                                      source_id=section_id)
            topic.add_child(section)
            generate_pdf_nodes(entry['chapters'], section, source=section_id)
            continue

        chapter_title = entry.get("chapter")
        if not chapter_title:
            continue

        # Documents: create the document node ...
        document_id = "{}-{}".format(source, chapter_title)
        document = nodes.DocumentNode(
            title=chapter_title,
            source_id=document_id,
            copyright_holder=COPYRIGHT_HOLDER,
            license=LICENSE,
            files=[files.DocumentFile(entry['path'])])
        topic.add_child(document)

        # ... followed by its exercise nodes
        exercises = entry.get("exercises") or []
        for position, exercise in enumerate(exercises):
            exercise_node = nodes.ExerciseNode(
                title=chapter_title,
                source_id="{} Exercise {}".format(document_id, position),
                description=exercise.get('description'),
                copyright_holder=COPYRIGHT_HOLDER,
                license=LICENSE,
            )
            topic.add_child(exercise_node)
            create_exercise_questions(exercise_node,
                                      exercise.get('questions') or [])
def create_slideshow(images, source_id, title, language_name):
    """
    Build a node presenting `images`, using the first image as thumbnail.

    With '--slides' on the command line a SlideshowNode is returned (one
    slide per image). Otherwise the images are downloaded and saved as a
    single multi-page PDF, which is returned wrapped in a DocumentNode. The
    PDF file name is the md5 hex digest of `source_id`, and an existing file
    at that path is reused.

    images: list of {url: str, caption: str}
    """
    thumbnail = files.ThumbnailFile(images[0]['url'])

    if '--slides' in sys.argv:
        node_files = [thumbnail]
        for image in images:
            node_files.append(
                files.SlideImageFile(image['url'],
                                     caption=image.get('caption', '')))
        return nodes.SlideshowNode(source_id=source_id,
                                   title=title,
                                   license=LICENSE,
                                   language=LANGUAGE_MAP[language_name],
                                   files=node_files)

    # Create PDF
    digest = hashlib.md5(source_id.encode('utf-8')).hexdigest()
    pdfpath = '{}{}{}.pdf'.format(DOCUMENT_DOWNLOAD_DIR, os.path.sep, digest)
    if not os.path.exists(pdfpath):
        pages = []
        for image in images:
            page = Image.open(BytesIO(downloader.read(image['url'])))
            # RGBA pages are converted to RGB before being written to the PDF
            pages.append(page.convert('RGB') if page.mode == 'RGBA' else page)
        cover, remaining = pages[0], pages[1:]
        cover.save(pdfpath, save_all=True, append_images=remaining)

    document = files.DocumentFile(pdfpath)
    return nodes.DocumentNode(source_id=source_id,
                              title=title,
                              license=LICENSE,
                              language=LANGUAGE_MAP[language_name],
                              files=[thumbnail, document])
def make_content_node(kind, source_id, title, license, filepath, optionals):
    """
    Create the ContentNode subclass matching `kind` from the required
    arguments plus the optional "author", "description" and "thumbnail"
    entries of `optionals`.

    Video nodes derive their thumbnail instead of reading it from
    `optionals`. Returns None when `kind` is not video, audio or document.
    """

    def shared_fields():
        # Arguments every supported node kind takes
        return {
            "source_id": source_id,
            "title": title,
            "license": license,
            "author": optionals.get("author"),
            "description": optionals.get("description"),
        }

    if kind == content_kinds.VIDEO:
        fields = shared_fields()
        fields["derive_thumbnail"] = True  # video-specific data
        fields["files"] = [files.VideoFile(path=filepath)]
        return nodes.VideoNode(**fields)

    if kind == content_kinds.AUDIO:
        fields = shared_fields()
        fields["thumbnail"] = optionals.get("thumbnail")
        fields["files"] = [files.AudioFile(path=filepath)]
        return nodes.AudioNode(**fields)

    if kind == content_kinds.DOCUMENT:
        fields = shared_fields()
        fields["thumbnail"] = optionals.get("thumbnail")
        fields["files"] = [files.DocumentFile(path=filepath)]
        return nodes.DocumentNode(**fields)

    return None
def add_files(node, file_list):
    """
    Add an audio, document or video file to `node` for each path in `file_list`.

    Args:
      - node: object exposing `add_file`
      - file_list: iterable of paths; each one is passed to `parse_file_name`,
        whose second return value selects the file class
        ('mp3' -> AudioFile, 'pdf' -> DocumentFile, 'mp4' -> VideoFile)
    Returns: None
    Raises: UnknownFileTypeError for any other file type
    """
    for f in file_list:
        # Only the type part of the parsed name is needed here.
        _, file_type = parse_file_name(f)
        print(f)

        if file_type == 'mp3':
            node.add_file(files.AudioFile(path=f))
        elif file_type == 'pdf':
            node.add_file(files.DocumentFile(path=f))
        elif file_type == 'mp4':
            node.add_file(files.VideoFile(path=f))
        else:
            # `f` is the path itself (it is passed as `path=` above), so it is
            # reported directly; subscripting it with ['path'] would fail on a
            # string and mask this error.
            raise UnknownFileTypeError("Unrecognised file type '{0}'".format(f))
def create_leaf_node(self, module, subject_node, subject_id):
    """
    Add the content of `module` to `subject_node` and apply the module role.

    A '.zip' file is handed to self.get_scorm_topic_tree, a '.pdf' file
    becomes a DocumentNode child of `subject_node`; other extensions add
    nothing. Afterwards the role (module['role'] when present, otherwise
    roles.LEARNER) is set on `subject_node` and on every node below it.
    """
    # zips are always SCORMs in this case.
    assert 'file' in module, "Invalid module: {}".format(module)

    if 'file' in module:
        source_file = module['file']
        extension = os.path.splitext(source_file)[1].lower()
        if extension == '.zip':
            self.get_scorm_topic_tree(subject_node, source_file)
        elif extension == '.pdf':
            license = licenses.SpecialPermissionsLicense(
                copyright_holder="ProFuturo",
                description="FIXME: Get license info")
            doc_id = '{}-{}'.format(subject_id, module['id'])
            attachment = files.DocumentFile(path=source_file)
            subject_node.add_child(
                nodes.DocumentNode(title=module['title'],
                                   source_id=doc_id,
                                   files=[attachment],
                                   license=license))

    role = roles.LEARNER
    if 'role' in module:
        role = module['role']

    # Depth-first, parent-before-children walk over the whole subject tree.
    pending = [subject_node]
    while pending:
        current = pending.pop()
        current.role = role
        # Reversed so that children are popped in their original order.
        pending.extend(reversed(list(current.children)))
def add_files(node, file_list):
    """
    Add a file object to `node` for each file description in `file_list`.

    Args:
      - node: object exposing `add_file`
      - file_list: iterable of dicts. 'file_type' selects the branch; the
        remaining keys read depend on it: 'path', 'youtube_id', 'web_url',
        'encoding', 'language', 'download_settings', 'high_resolution',
        'maxheight' and 'ffmpeg_settings'.
    Returns: None
    Raises:
      - NotImplementedError when 'file_type' is not one of the expected types
      - KeyError when a required key is missing ('path' for local video and
        audio files, 'language' for subtitles)
    """
    EXPECTED_FILE_TYPES = [
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE,
        SUBTITLES_FILE
    ]

    for f in file_list:
        file_type = f.get('file_type')
        if file_type not in EXPECTED_FILE_TYPES:
            LOGGER.critical(file_type)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        path = f.get('path')  # path can be an URL or a local path (or None)

        # handle different types of files
        if file_type == VIDEO_FILE:
            # handle three types of video files
            if 'youtube_id' in f:
                video_file = files.YouTubeVideoFile(
                    youtube_id=f['youtube_id'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            elif 'web_url' in f:
                video_file = files.WebVideoFile(
                    web_url=f['web_url'],
                    download_settings=f.get('download_settings', None),
                    high_resolution=f.get('high_resolution', True),
                    maxheight=f.get('maxheight', None),
                    language=f.get('language', None),
                )
            else:
                video_file = files.VideoFile(
                    path=f['path'],
                    language=f.get('language', None),
                    ffmpeg_settings=f.get('ffmpeg_settings'),
                )
            node.add_file(video_file)

        elif file_type == AUDIO_FILE:
            node.add_file(
                files.AudioFile(path=f['path'],
                                language=f.get('language', None)))

        elif file_type == DOCUMENT_FILE:
            node.add_file(
                files.DocumentFile(path=path,
                                   language=f.get('language', None)))

        elif file_type == HTML5_FILE:
            node.add_file(
                files.HTMLZipFile(path=path,
                                  language=f.get('language', None)))

        elif file_type == THUMBNAIL_FILE:
            # A thumbnail is either inline base64 data or a file at `path`
            if 'encoding' in f:
                node.add_file(files.Base64ImageFile(encoding=f['encoding']))
            else:
                node.add_file(
                    files.ThumbnailFile(
                        path=path,
                        language=f.get('language', None),
                    ))

        elif file_type == SUBTITLES_FILE:
            if 'youtube_id' in f:
                node.add_file(
                    files.YouTubeSubtitleFile(youtube_id=f['youtube_id'],
                                              language=f['language']))
            else:
                node.add_file(
                    files.SubtitleFile(path=path, language=f['language']))

        else:
            # Defensive fallback: only reachable if a type is added to
            # EXPECTED_FILE_TYPES without a branch above. Uses the optional
            # `path` so a missing key cannot turn this into a KeyError.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(path))
def scrape_page(channel):
    """
    Read main page for Saylor (https://www.saylor.org/books/)

    Every <h3> inside the main content row becomes a topic under `channel`;
    the books of the <ul> following it are added to that topic, either as a
    PDF document node or as the node returned by `scrape_book`.

    Whether or not scraping succeeds, VIDEO_MAPPING is written to
    VIDEO_MAP_JSON afterwards. Returns None.
    """
    try:
        page = BeautifulSoup(read_source(BASE_URL, loadjs=True), 'html.parser')
        # Attribute filters must be dicts; the second filter used to be the
        # set literal {'class', 'row'}.
        contents = page.find('div', {
            'class': 'main-content'
        }).find('div', {'class': 'row'})

        # Site doesn't have special designation for subjects, so get headers
        for subject in contents.find_all('h3'):

            # Create subject topic
            title = subject.text.replace(u'\xa0', u' ').replace('\n', '')
            source_id = generate_id(title)
            category_topic = nodes.TopicNode(source_id=source_id, title=title)
            channel.add_child(category_topic)
            LOGGER.info(title)

            # Get list from subject
            book_list = subject.findNext('ul')
            for book in book_list.find_all('li'):
                license = LICENSE
                format_note = book.find('small')

                # Some books have subsections for different formats/licenses
                # e.g. See Business-General/Miscellaneous > Information Systems for Business and Beyond
                if format_note:
                    # Determine what license to use
                    for choice in licenses.choices:
                        if choice[0] in format_note.text:
                            license = choice[0]
                            break

                    booktitle = book.contents[0]
                    LOGGER.info(" " + booktitle)

                    # Download one of the sublinks
                    for sublink in book.find_all('a'):
                        if not sublink.get('href'):
                            continue
                        elif "PDF" in sublink.text:
                            category_topic.add_child(
                                nodes.DocumentNode(
                                    source_id=source_id +
                                    os.path.basename(sublink['href']),
                                    title=booktitle,
                                    license=license,
                                    copyright_holder=COPYRIGHT_HOLDER,
                                    files=[
                                        files.DocumentFile(
                                            path=sublink['href'])
                                    ]))
                            break  # only need to download one format of the book
                        elif "HTML" in sublink.text:
                            html_node = scrape_book(sublink['href'],
                                                    license=license)
                            if html_node:
                                category_topic.add_child(html_node)
                            break  # only need to download one format of the book

                # Most book links go straight to an html page
                else:
                    book_url = book.find('a')['href']
                    html_node = scrape_book(book_url, license)
                    if html_node:
                        category_topic.add_child(html_node)
    finally:
        # No matter what, add link to video mapping for future runs
        with open(VIDEO_MAP_JSON, "w") as videojson:
            json.dump(VIDEO_MAPPING, videojson)