def scrape_resource(url, topic):
    """Scrape a single Ceibal resource page and add it to `topic`.

    Args:
        url (str): url of the resource page to scrape
        topic (TopicNode): topic to add the scraped HTML5 app node to
    """
    resource = BeautifulSoup(downloader.read(url), 'html5lib')
    LOGGER.info(' {}'.format(resource.find('h2').text))

    filepath = download_resource(resource.find('div', {'class': 'decargas'}).find('a')['href'])
    license = None
    author = ''
    for data_section in resource.find('div', {'class': 'datos_generales'}).find_all('h4'):
        if 'Licencia' in data_section.text:
            try:
                license = LICENSE_MAP[data_section.find_next_sibling('p').text](copyright_holder="Ceibal")
            except KeyError as e:
                LOGGER.error(str(e))
                # BUGFIX: the fallback previously assigned the bare class
                # (licenses.CC_BYLicense) instead of an instance, unlike the
                # LICENSE_MAP path above; instantiate it with the same holder.
                license = licenses.CC_BYLicense(copyright_holder="Ceibal")
        elif 'Autor' in data_section.text:
            author = data_section.find_next_sibling('p').text

    if filepath:
        thumbnail = resource.find('div', {'class': 'img-recurso'}).find('img')['src']
        if thumbnail.endswith('.gif'):
            # .gif thumbnails aren't accepted; save the image locally under a
            # .png filename instead and use the local path.
            thumbnail = os.path.sep.join([DOWNLOAD_DIRECTORY, thumbnail.split('/')[-1].replace('.gif', '.png')])
            with open(thumbnail, 'wb') as fobj:
                fobj.write(downloader.read(resource.find('div', {'class': 'img-recurso'}).find('img')['src']))

        topic.add_child(nodes.HTML5AppNode(
            title=resource.find('h2').text,
            source_id=url,
            license=license,
            author=author,
            description=resource.find('form').find_all('p')[1].text,
            thumbnail=thumbnail,
            tags=[tag.text[:30] for tag in resource.find_all('a', {'class': 'tags'})],
            files=[files.HTMLZipFile(path=filepath)],
        ))
def scrape_snack_subject(slug, topic):
    """ Scrape snack subject page
        Args:
            slug (str): url slug to scrape from (e.g. /subject/arts)
            topic (TopicNode): topic to add html nodes to
    """
    page = BeautifulSoup(read(slug), 'html5lib')

    for activity in page.find_all('div', {'class': 'activity'}):
        activity_title = activity.find('h5').text.strip()
        LOGGER.info(" {}".format(activity_title))

        activity_href = activity.find('a')['href']

        # Scrape snack pages into zips
        write_to_path, tags = scrape_snack_page(activity_href)
        if not write_to_path:
            continue

        # Create html node
        description = activity.find('div', {'class': 'pod-description'})
        description_text = description.text.strip() if description else ""
        topic.add_child(
            nodes.HTML5AppNode(
                source_id=activity_href,
                title=activity_title.replace("’", "'"),
                description=description_text,
                license=LICENSE,
                copyright_holder=COPYRIGHT_HOLDER,
                files=[files.HTMLZipFile(path=write_to_path)],
                thumbnail=get_thumbnail_url(activity.find('img')['src']),
                tags=tags,
            ))

    # Scrape next page (if any)
    next_page_url = get_next_page_url(page)
    if next_page_url:
        scrape_snack_subject(next_page_url, topic)
def download_wikipedia_page(url, thumbnail, title):
    """ Create zip file to use for html pages """
    # Temp directory to house our downloaded files
    dest_dir = tempfile.mkdtemp()

    # Download the main wikipedia page, apply a middleware processor,
    # and call it index.html
    download_file(
        url,
        dest_dir,
        filename="index.html",
        middleware_callbacks=process_wikipedia_page,
    )

    # Turn the temp folder into a zip file
    zip_path = create_predictable_zip(dest_dir)

    # Create and return an HTML5 app node wrapping the zip
    return nodes.HTML5AppNode(
        files=[files.HTMLZipFile(zip_path)],
        title=title,
        thumbnail=thumbnail,
        source_id=url.split("/")[-1],
        license=CHANNEL_LICENSE,
    )
def download_puzzle(puzzle_url, title, description, thumbnail,
                    le_language_code, blockly_language_code):
    """Download a single puzzle and return an HTML5 app node.

    Args:
        puzzle_url (str): puzzle path relative to blockly-games.appspot.com;
            may carry a querystring, which is stripped for asset lookup.
        title (str): node title (truncated to metadata limits).
        description (str): node description.
        thumbnail: thumbnail for the node.
        le_language_code (str): language code assigned to the node.
        blockly_language_code (str): language code injected into the page
            for the Blockly Games runtime.
    """
    # Render via a real browser so the JS app finishes building the DOM
    # before we capture the page source (delay=1000 ms).
    with WebDriver("https://blockly-games.appspot.com/%s" % puzzle_url,
                   delay=1000) as driver:
        doc = BeautifulSoup(driver.page_source, "html.parser")

        # Create a temporary folder to download all the files for a puzzle.
        destination = tempfile.mkdtemp()

        # Download all the JS/CSS/images/audio/etc we can get from scraping the
        # page source.
        doc = download_static_assets(doc, destination,
                'https://blockly-games.appspot.com', request_fn=make_request,
                url_blacklist=['analytics.js'])

        # Download other files not picked up by the above generic assets
        # fetching, e.g. from GitHub.
        puzzle_name = puzzle_url.split('?')[0]
        download_additional_assets(destination, puzzle_name)

        # Make some modifications to the HTML source -- hide some elements.
        remove_node(doc, '#languageMenu')
        remove_node(doc, '#title')

        # Copy over some of our own JS/CSS files and then add links to them in
        # the page source.
        copy_tree("static", os.path.join(destination, "static"))

        chef_body_script = doc.new_tag("script", src="static/chef_end_of_body.js")
        doc.select_one('body').append(chef_body_script)

        # Tell the injected runtime which language to load.
        chef_head_script = doc.new_tag("script")
        chef_head_script.string = 'window["BlocklyGamesLang"] = "%s";' % blockly_language_code
        doc.select_one('head').insert(0, chef_head_script)

        # Write out the HTML source.
        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write(str(doc))

        print(" Downloaded puzzle %s titled \"%s\" (thumbnail %s) to destination %s" % (
            puzzle_url, title, thumbnail, destination))
        # preview_in_browser(destination)

        zip_path = create_predictable_zip(destination)

        return nodes.HTML5AppNode(
            source_id=puzzle_url,
            title=truncate_metadata(title),
            description=description,
            license=licenses.PublicDomainLicense(copyright_holder='Google'),
            thumbnail=thumbnail,
            files=[files.HTMLZipFile(zip_path)],
            language=le_language_code,
        )
def download_writing_topic_category(category_doc, title, level_id): destination = tempfile.mkdtemp() # Download a font font_url = make_fully_qualified_url( '//fonts.googleapis.com/css?family=Roboto:400,300,300italic,400italic,700,700italic' ) download_file(font_url, destination, request_fn=make_request, filename='roboto.css') # Write out the HTML source, based on CSS formatting from # https://k12.thoughtfullearning.com/resources/writingtopics topics = (("<li>%s</li>" % topic.text) for topic in category_doc.select('.views-row')) html_source = """ <!DOCTYPE html> <head> <link href='roboto.css' rel='stylesheet' type='text/css'> <style> ul { margin: 0 0 0 40px; padding: 0; } li { font-family: "Roboto", sans-serif; font-weight: 300; font-size: 19.2px; line-height: 24.96px; color: #202020; margin-top: 10px; } </style> </head> <body> <ul>%s</ul> </body> """ % ''.join(topics) with open(os.path.join(destination, "index.html"), "w") as f: f.write(html_source) print(" ... downloaded to %s" % destination) #preview_in_browser(destination) zip_path = create_predictable_zip(destination) return nodes.HTML5AppNode( source_id="%s|%s" % (level_id, title), title=truncate_metadata(title), license=licenses.CC_BY_NC_SALicense( copyright_holder=truncate_metadata('Thoughtful Learning')), files=[files.HTMLZipFile(zip_path)], language="en", thumbnail=writing_topic_thumbnail, )
def construct_channel(self, *args, **kwargs): channel = self.get_channel( *args, **kwargs) # Create ChannelNode from data in self.channel_info lang_names = list(self.data.keys()) lang_names.sort() for lang_name in lang_names: lang_data = self.data[lang_name] LOGGER.info("Creating app for language: {}".format(lang_name)) lang = languages.getlang_by_native_name(lang_name) zip_dir = self.client.create_zip_dir_for_page(lang_data['url']) soup = self.client.get_page_soup(lang_data['url']) # Remove the translation list if found translations = soup.find('div', {'id': 'translations'}) if translations: translations.extract() # Grab the localized title title = soup.find('span', {'id': 'share_title'}).text # Save the modified index.html page thumbnail = None for resource in lang_data['resources']: if 'dp3t.png' in resource: thumbnail = os.path.join(zip_dir, resource) break with open(os.path.join(zip_dir, 'index.html'), 'wb') as f: f.write(soup.prettify(encoding='utf-8')) # create_predictable_zip ensures that the ZIP file does not change each time it's created. This # ensures that the zip doesn't get re-uploaded just because zip metadata changed. zip_file = zip.create_predictable_zip(zip_dir) zip_name = lang.primary_code if lang else lang_name zip_filename = os.path.join(self.ZIP_DIR, "{}.zip".format(zip_name)) os.makedirs(os.path.dirname(zip_filename), exist_ok=True) os.rename(zip_file, zip_filename) topic = nodes.TopicNode(source_id=lang_name, title=lang_name) zip_node = nodes.HTML5AppNode( source_id="covid19-sim-{}".format(lang_name), title=title, files=[files.HTMLZipFile(zip_filename)], license=licenses.PublicDomainLicense( "Marcel Salathé & Nicky Case"), language=lang, thumbnail=thumbnail) topic.add_child(zip_node) channel.add_child(topic) return channel
def to_contentnode(self, title, directory=None, *args, **kwargs):
    """Convert this resource to a content node based on its kind.

    Returns an HTML5AppNode or VideoNode, or None for any other kind.
    """
    # Materialize the resource on disk first.
    filepath = self.to_file(directory=directory)

    if self.kind == content_kinds.HTML5:
        zip_file = files.HTMLZipFile(filepath)
        return nodes.HTML5AppNode(
            source_id=self.url,
            title=title,
            files=[zip_file],
            **kwargs
        )

    if self.kind == content_kinds.VIDEO:
        video_file = files.VideoFile(filepath)
        return nodes.VideoNode(
            source_id=self.url,
            title=title,
            files=[video_file],
            **kwargs
        )
def create_topic_nodes_recursive(self, topic_info):
    """
    Create nodes for all the content items in the tree. Currently supports
    HTML5 app node and topic node creation.

    :param topic_info: Dictionary with information about the current topic to
        use for generating nodes.
    :return: A TopicNode of the node topic_info along with all child topics
        and nodes, or None when the subtree holds no content at all.
    """
    topic_node = nodes.TopicNode(source_id=str(topic_info['id']),
                                 title=topic_info['text'])
    has_content = False
    if 'nodes' in topic_info:
        has_content = True
        topic_nodes = topic_info['nodes']
        for anode in topic_nodes:
            node_files = [files.HTMLZipFile(anode['html5_zip'])]
            # Some items additionally need the shared dependency zip.
            if 'needs_dep_zip' in anode and anode['needs_dep_zip']:
                print("Needs dep zip: {}".format(anode))
                node_files.append(self.dep_zip_file)
            html_node = nodes.HTML5AppNode(files=node_files,
                                           title=anode['title'],
                                           source_id=anode['dir'],
                                           license=licenses.CC_BY_NC,
                                           copyright_holder="ekShiksha")
            if 'description' in anode:
                html_node.description = anode['description']

            # One possible way to store metadata about each content node on Studio.
            # extra_fields = {'metadata' : {}}
            # metadata = extra_fields['metadata']
            # metadata['grades'] = [{'curriculum': 'CBSE', 'grades': [int(anode['standard'])] }]
            # metadata['subject'] = topic_info['text']
            # TODO: Add the topic tree as 'categories'

            topic_node.add_child(html_node)

    if 'subtopics' in topic_info:
        for subtopic in topic_info['subtopics']:
            child = self.create_topic_nodes_recursive(subtopic)
            # Recursive call returns None for empty subtrees; skip those.
            if child:
                has_content = True
                topic_node.add_child(child)

    # This shouldn't happen, so output a warning if it does.
    if not has_content:
        print("Node {} has no content".format(topic_info))
        return None

    return topic_node
def create_html5_app_node(license, content_dict, ims_dir, scraper_class=None,
        temp_dir=None, needs_scorm_support=False):
    """Build an HTML5 app node for one IMSCP content item.

    Args:
        license: license to apply to the node.
        content_dict (dict): item info with 'index_file', 'files',
            'identifier', 'title' and optionally 'scormtype'.
        ims_dir (str): directory the IMSCP package was extracted into.
        scraper_class (optional): webmixer scraper class; when given, the
            scraper builds the HTML zip instead of a plain file copy.
        temp_dir (str, optional): where to place the scraper's output zip.
        needs_scorm_support (bool): inject SCORM shims for 'sco' items.
    """
    if scraper_class:
        index_path = os.path.join(ims_dir, content_dict['index_file'])
        # Entrypoints may carry querystrings/fragments; strip them to get a
        # real file path on disk.
        if '?' in index_path:
            index_path = index_path.split('?')[0]
        if '#' in index_path:
            index_path = index_path.split('#')[0]
        if content_dict['scormtype'] == 'sco' and needs_scorm_support:
            add_scorm_support(index_path, ims_dir)

        index_uri = pathlib.Path(os.path.abspath(index_path)).as_uri()
        # Hash the URI so repeated runs produce a stable zip filename.
        zip_name = '%s.zip' % hashlib.md5(
            index_uri.encode('utf-8')).hexdigest()
        temp_dir = temp_dir if temp_dir else tempfile.gettempdir()
        zip_path = os.path.join(temp_dir, zip_name)
        scraper = scraper_class(index_uri)
        scraper.download_file(zip_path)
        logging.info('Webmixer scraper outputted HTML app to %s' % zip_path)
    else:
        with tempfile.TemporaryDirectory() as destination:
            index_src_path = os.path.join(ims_dir, content_dict['index_file'])
            index_dest_path = os.path.join(destination, 'index.html')
            shutil.copyfile(index_src_path, index_dest_path)
            for file_path in content_dict['files']:
                shutil.copy(os.path.join(ims_dir, file_path), destination)
            if content_dict.get('scormtype') == 'sco' and needs_scorm_support:
                add_scorm_support(index_dest_path, destination)
            #preview_in_browser(destination)
            # Must run inside the with-block, before the temp dir is removed.
            zip_path = create_predictable_zip(destination)

    return nodes.HTML5AppNode(
        source_id=content_dict['identifier'],
        title=content_dict.get('title'),
        license=license,
        files=[files.HTMLZipFile(zip_path)],
    )
def create_html5_app_node(license, content_dict):
    """Package an index file plus its supporting files into an HTML5 app node."""
    with tempfile.TemporaryDirectory() as workdir:
        # Entrypoint must be named index.html inside the zip.
        shutil.copyfile(content_dict['index_file'],
                        os.path.join(workdir, 'index.html'))
        for supporting_file in content_dict['files']:
            shutil.copy(supporting_file, workdir)
        #preview_in_browser(workdir)
        zip_path = create_predictable_zip(workdir)

    return nodes.HTML5AppNode(
        source_id=content_dict['identifier'],
        title=content_dict.get('title'),
        license=license,
        files=[files.HTMLZipFile(zip_path)],
    )
def scrape_book(url, license):
    """ Scrape book and return html node
        e.g. https://saylordotorg.github.io/text_financial-accounting/

        Returns None for books whose content lives on another site.
    """
    page = BeautifulSoup(read_source(url), 'html.parser')

    # Skip books that link to other websites
    if not page.find('div', {'id': 'book-content'}):
        return

    # Get fields for new html node
    title = page.find('h1').text.replace(u'\xa0', u' ').replace('\n', '')
    source_id = generate_id(title)
    write_to_path = "{}{}{}.zip".format(DOWNLOAD_DIRECTORY, os.path.sep, source_id)
    LOGGER.info(" " + title)

    # Write to html zip
    # if not os.path.isfile(write_to_path):
    with html.HTMLWriter(write_to_path) as zipper:
        # Parse table of contents (fresh copy, mutated by parse_page_links)
        contents = BeautifulSoup(read_source(url), 'html.parser')
        parse_page_links(url, contents, zipper)

        # Parse all links in the table of contents
        for link in contents.find_all('a'):
            if link.get('href'):
                # Get page content and write to zip
                chapter_contents = BeautifulSoup(
                    read_source(url, endpoint=link['href']), 'html.parser')
                parse_page_links(url, chapter_contents, zipper, link['href'])
                zipper.write_contents(link['href'], chapter_contents.prettify())

        # Write main index.html file and all shared files
        zipper.write_index_contents(contents.prettify())
        write_shared_library_to_zip(zipper)

    return nodes.HTML5AppNode(source_id=source_id,
                              title=title,
                              license=license,
                              copyright_holder=COPYRIGHT_HOLDER,
                              files=[files.HTMLZipFile(path=write_to_path)])
def download_content_node(url, title):
    """Download a page into a standalone zip and return an HTML5 app node."""
    doc = get_parsed_html_from_url(url)

    dest_dir = tempfile.mkdtemp()
    doc = download_static_assets(doc, dest_dir, 'http://migranthealth.eu/',
            request_fn=make_request, url_blacklist=url_blacklist,
            derive_filename=derive_filename)

    # Strip site chrome: header, sidebars, navigation and footer elements.
    selectors_to_drop = (
        'header',
        '#page-top-header',
        '#block-region-side-pre',
        '#region-main .row-fluid .span4.heading-rts',
        '.readmoreLinks',
        '.courseSectionNext',
        'img[alt="next"]',
        '.modified',
        '.footer-rts',
        '#page-footer',
        '.back-to-top',
        '.skiplinks',
        '.linkicon',
        '.generalbox table tr:nth-of-type(2)',
    )
    for css_selector in selectors_to_drop:
        for matched in doc.select(css_selector):
            matched.decompose()

    # Write out the HTML source.
    with open(os.path.join(dest_dir, "index.html"), "w") as f:
        f.write(str(doc))

    print(" ... downloaded to %s" % dest_dir)
    #preview_in_browser(dest_dir)

    zip_path = create_predictable_zip(dest_dir)
    return nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=MEET_LICENSE,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )
def make_topic_tree_with_entrypoints(license, imscp_zip, imscp_dict, ims_dir,
        temp_dir=None, parent_id=None, node_options=None):
    """Return a TopicTree node from a dict of some subset of an IMSCP manifest.

    The actual IMSCP zip is marked as a dependency, and the zip loaded by
    Kolibri only contains an index.html file that redirects to the entrypoint
    defined in the manifest. This minimizes the additional content generated
    for Kolibri, and also allows us to support content where multiple content
    nodes have entrypoints defined by parameters, e.g. index.html#chapter2,
    index.html#chapter3, etc.

    Ready to be uploaded via Ricecooker to Studio or used in Kolibri.

    Args:
        license - License to apply to content nodes.
        imscp_dict - Dict of IMSCP from extract_from_zip or extract_from_dir.
        ims_dir (string) - Path of directory of IMSCP
        temp_dir (string, optional) - Full path of temporary directory to
            output HTML zip files to.
        parent_id (string, optional) - Parent ID string to concatenate to
            source ID.
        node_options (dict, optional) - Options to pass to content renderer
            in Kolibri.
    """
    if not temp_dir:
        # BUGFIX: tempfile.tempdir is None until tempfile lazily resolves it,
        # so os.path.join(None, ...) could crash; gettempdir() always returns
        # a usable directory path.
        temp_dir = tempfile.gettempdir()

    source_id = imscp_dict['identifier']
    assert source_id, "{} has no identifier, parent id = {}".format(
        os.path.basename(imscp_zip), parent_id)
    if parent_id:
        source_id = '{}-{}'.format(parent_id, source_id)

    if imscp_dict.get('children'):
        topic_node = nodes.TopicNode(source_id=source_id,
                                     title=imscp_dict['title'])
        counter = 1
        for child in imscp_dict['children']:
            # We will get duplicate IDs if we don't have any ID set.
            if not child['identifier']:
                child['identifier'] = 'item{}'.format(counter)
            child_node = make_topic_tree_with_entrypoints(
                license, imscp_zip, child, ims_dir, temp_dir=temp_dir,
                parent_id=source_id, node_options=node_options)
            # BUGFIX: unsupported child types return None; skip them instead
            # of adding a None child to the topic node.
            if child_node is not None:
                topic_node.add_child(child_node)
            counter += 1
        return topic_node
    else:
        if imscp_dict['type'] == 'webcontent':
            entrypoint_dir = os.path.join(temp_dir, 'entrypoint')
            if os.path.exists(entrypoint_dir):
                shutil.rmtree(entrypoint_dir)
            os.makedirs(entrypoint_dir)

            # Write a tiny redirect page pointing into the dependency zip.
            index = os.path.join(entrypoint_dir, "index.html")
            entrypoint_url = '/zipcontent/{}/{}'.format(
                os.path.basename(imscp_zip), imscp_dict['href'])
            with open(index, "w", encoding="utf-8") as f:
                f.write(ENTRYPOINT_TEMPLATE.format(entrypoint_url))

            zip_path = create_predictable_zip(entrypoint_dir)
            html5_node = nodes.HTML5AppNode(
                source_id=source_id,
                title=imscp_dict.get('title'),
                license=license,
                files=[
                    files.HTMLZipFile(zip_path),
                    files.HTMLZipFile(
                        imscp_zip,
                        preset=format_presets.HTML5_DEPENDENCY_ZIP)
                ],
            )
            if node_options is not None:
                extra_data = {'options': node_options}
                html5_node.extra_fields.update(extra_data)
            return html5_node
        else:
            logging.warning('Content type %s not supported yet.'
                            % imscp_dict['type'])
def construct_channel(self, **kwargs):
    """Build the channel: one topic node per grade, each holding the teacher
    course guide and the first student lesson as HTML5 app nodes.

    Returns:
        The populated ChannelNode.
    """
    # create channel
    channel = self.get_channel(**kwargs)

    # create a topic and add it to channel
    for grade in GRADES:
        grade_node = nodes.TopicNode(
            source_id=str(grade),
            title="Grade {grade}".format(grade=grade),
            description="",
        )
        channel.add_child(grade_node)

        # Teacher course guide page, mirrored locally and zipped.
        filename = localise.make_local(
            BASE_URL.format(grade=grade, target='teachers') +
            "/teacher_course_guide.html")
        print(filename)
        file = HTMLZipFile(filename)
        course_guide_node = nodes.HTML5AppNode(
            source_id="{grade}-teachers-teacher_course_guide".format(
                grade=grade),
            title="Grade {grade} Teacher Course Guide".format(grade=grade),
            license=licenses.CC_BY_NC_SA,
            copyright_holder="Open Up Resources",
            #author="Open Up Resources",
            #description="",
            #thumbnail="",
            #extra_fields={},
            #domain_ns="",
            files=[file],
        )
        grade_node.add_child(course_guide_node)

        # First student lesson page (unit 1, lesson 1).
        filename = localise.make_local(
            BASE_URL.format(grade=grade, target="students") + "/1/1.html")
        print(filename)
        file = HTMLZipFile(filename)
        course_guide_node = nodes.HTML5AppNode(
            source_id="{grade}-students-1-1".format(grade=grade),
            title="Grade {grade} 1-1".format(grade=grade),
            license=licenses.CC_BY_NC_SA,
            copyright_holder="Open Up Resources",
            #author="Open Up Resources",
            #description="",
            #thumbnail="",
            #extra_fields={},
            #domain_ns="",
            files=[file],
        )
        grade_node.add_child(course_guide_node)

    # NOTE(review): free-standing site-structure notes kept from the original
    # author; this string is a no-op statement.
    """6/teachers/1.html -- has description of this topic; has drop down list of lessons within it
    6/teachers/1/1.html -- Is a lesson plan.
    6/teachers/1/assessments/unit_assessments.html -- broken
    6/teachers/1/practice_problems.html -- practice problems for all lessons w/solutons
    6/teachers/1/downloads.html -- 7x links to pdfs/zips of pdfs
    6/teachers/1/family_materials.html -- same as family? (YES) topicwide
    6/teachers/teacher_course_guide.html -- single page per year
    6/families/1.html -- same as teachers / family materials
    6/students/1/1.html -- is student resources.
    6/students/1/practice_problems.html - nothing complex
    6/students/1/glossary.html - nothing complex
    6/students/1/my_reflections.html - nothing complex
    """
    return channel
def scrape_page(exp_id, language, subject_node):
    """Scrape one Sciensation experiment page, package it as a zip, and attach
    it to subject_node as an HTML5 app node.

    Args:
        exp_id: experiment identifier used to build the page url.
        language (str): language code for the localized page and the node.
        subject_node (TopicNode): node the new HTML5 app node is added to.

    Returns:
        subject_node, with the new child appended.
    """
    # format to appropriate url depending on language
    my_downloader = downloader.ArchiveDownloader(EXPERIMENTS_FOLDER)
    url = format_url(exp_id, language)
    # page = downloader.archive_page(url, EXPERIMENTS_FOLDER)
    page = my_downloader.get_page(url, refresh=True)
    my_zip_dir = my_downloader.create_zip_dir_for_page(url)
    index_file = os.path.join(my_zip_dir, 'index.html')
    # entry = page['index_path']
    zip_path_entry = os.path.relpath(
        index_file, os.path.join('chefdata', 'experiments'))

    soup = BeautifulSoup(open(index_file, encoding='utf-8'), 'html.parser')

    # get title
    visible_SRAtitle = soup.find('h1', {'class': 'SRAtitle'})
    title = visible_SRAtitle.get_text(strip=True)

    # get tags (the last SRAtd div holds the tag links)
    visible_SRAtd = soup.findAll('div', {'class': 'SRAtd'})
    visible_tags = visible_SRAtd[-1]
    tags_arr = []
    for a_tags in visible_tags.findAll('a'):
        tag = a_tags.get_text(strip=True)
        # remove special characters
        tag = re.sub(r"[^a-zA-Z0-9]+", ' ', tag)
        # removing ending whitespace
        tag = tag.rstrip()
        tags_arr.append(tag)

    # remove navbar
    navbar = soup.find('nav')
    navbar.decompose()

    # remove footer
    footer = soup.find('footer')
    footer.decompose()

    # remove all hrefs so the offline page has no dead links
    for a_tag in soup.findAll('a'):
        del a_tag['href']
        # move all children of a tag to parent
        a_tag.replaceWithChildren()

    # write updated soup back over the archived html file
    soup_str = str(soup)
    # html_file = open(entry, 'w', encoding = 'utf-8')
    html_file = open(index_file, 'w', encoding='utf-8')
    html_file.write(soup_str)
    html_file.close()

    # zippath = zip.create_predictable_zip(EXPERIMENTS_FOLDER, zip_path_entry)
    zippath = zip.create_predictable_zip(my_zip_dir)
    # copy zippath to temp folder here if necessary
    shutil.copy(zippath, TEMP_FOLDER)

    html5_node = nodes.HTML5AppNode(
        source_id='{0}_{1}'.format(language, url),
        files=[files.HTMLZipFile(zippath)],
        title=title,
        description='',
        license=licenses.CC_BYLicense('Sciensation'),
        language=language,
        thumbnail=None,
        author='Sciensation',
        tags=tags_arr)
    subject_node.add_child(html5_node)
    return subject_node
def build_tree_from_json(parent_node, sourcetree):
    """
    Recursively parse nodes in the list `sourcetree` and add them as children
    to the `parent_node`. Usually called with `parent_node` being a
    `ChannelNode`.

    Raises:
        NotImplementedError: if a node's 'kind' is not one of the expected
            node types.
    """
    EXPECTED_NODE_TYPES = [
        TOPIC_NODE, VIDEO_NODE, AUDIO_NODE, EXERCISE_NODE, DOCUMENT_NODE,
        HTML5_NODE
    ]

    for source_node in sourcetree:
        kind = source_node['kind']
        if kind not in EXPECTED_NODE_TYPES:
            LOGGER.critical('Unexpected node type found: ' + kind)
            raise NotImplementedError(
                'Unexpected node type found in json data.')

        if kind == TOPIC_NODE:
            # Topics have no license/files; recurse into their children.
            child_node = nodes.TopicNode(
                source_id=source_node.get("source_id", None),
                title=source_node["title"],
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
            )
            parent_node.add_child(child_node)
            source_tree_children = source_node.get("children", [])
            build_tree_from_json(child_node, source_tree_children)

        elif kind == VIDEO_NODE:
            child_node = nodes.VideoNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                derive_thumbnail=source_node.get(
                    'derive_thumbnail', True),  # video-specific option
                thumbnail=source_node.get('thumbnail'),
            )
            add_files(child_node, source_node.get("files") or [])
            parent_node.add_child(child_node)

        elif kind == AUDIO_NODE:
            child_node = nodes.AudioNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get('thumbnail'),
            )
            add_files(child_node, source_node.get("files") or [])
            parent_node.add_child(child_node)

        elif kind == EXERCISE_NODE:
            child_node = nodes.ExerciseNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
                questions=[],
            )
            add_questions(child_node, source_node.get("questions") or [])
            parent_node.add_child(child_node)

        elif kind == DOCUMENT_NODE:
            child_node = nodes.DocumentNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
            )
            add_files(child_node, source_node.get("files") or [])
            parent_node.add_child(child_node)

        elif kind == HTML5_NODE:
            child_node = nodes.HTML5AppNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
            )
            add_files(child_node, source_node.get("files") or [])
            parent_node.add_child(child_node)

        else:
            # Defensive: unreachable because kinds were validated above.
            LOGGER.critical("Encountered an unknown kind: " + str(source_node))
            continue

    return parent_node
def process_node_from_doc(doc, book_id, title, thumbnail):
    """
    Create a Ricecooker HTML5AppNode instance given the HTML source and metadata.

    Args:
        doc: BeautifulSoup document of the book reader page.
        book_id: identifier used as the node's source_id.
        title (str): book title (truncated to metadata limits).
        thumbnail (str): thumbnail src; replaced by the first slide (or None)
            when not jpg/jpeg/png.
    """
    if DOWNLOAD_ONE_TO_webroot:
        # Save the book's contents to the folder `webroot` in the chef root dir.
        # Use the script ./ricecooker/utils/kolibripreview.py to preview in K
        destination = './webroot'
        if os.path.exists(destination):
            shutil.rmtree(destination)
        os.mkdir(destination)
    else:
        # Create a temporary folder to download all the files for a book
        destination = tempfile.mkdtemp()

    # Ensure the thumbnail is in a format Ricecooker can accept, and if not,
    # use the first slide as the thumbnail.
    thumbnail_extensions = ('jpg', 'jpeg', 'png')
    if not thumbnail.lower().endswith(thumbnail_extensions):
        print("Thumbnail src (%s) doesn't end in any of %s."
              " Will use the first slide as the source." % (
                  thumbnail, thumbnail_extensions))
        first_slide_src = doc.select_one('#slide-container .slide img')['src']
        thumbnail = make_fully_qualified_url(first_slide_src)
        # Give up on a thumbnail entirely if the slide isn't usable either.
        if not thumbnail.lower().endswith(thumbnail_extensions):
            thumbnail = None

    # Download all the JS/CSS/images/audio/et needed to make a standalone app
    doc = download_static_assets(doc, destination)

    # Remove a bunch of HTML that we don't want showing in our standalone app
    doc.select_one('base')['href'] = ''
    remove_node(doc, '#loading')
    remove_node(doc, '#finishedActions')
    remove_node(doc, '.bookmarkbtn')
    remove_node(doc, '.reader-expand')
    remove_node(doc, '#progressBar')
    remove_node(doc, '#androidNotification')
    remove_node(doc, '#exit')
    remove_node(doc, '#ttmenu')

    # Remove unnecessary scripts in the head
    for pat in tag_content_patterns_to_remove_in_head:
        remove_nodes_containing_pattern(doc, pat, parent_tag_name='head')
    for pat in tag_content_patterns_to_remove_in_body:
        remove_nodes_containing_pattern(doc, pat, parent_tag_name='body')
    for pat_start, pat_end in cut_start_end_patterns:
        remove_nodes_between_comments(doc, pat_start, pat_end,
                                      parent_tag_name='body')

    # Write out the HTML source
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print("Downloaded book %s titled \"%s\" (thumbnail %s) to destination %s" % (
        book_id, title, thumbnail, destination))
    #preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)

    return nodes.HTML5AppNode(
        source_id=book_id,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(copyright_holder='3asafeer.com'),
        thumbnail=thumbnail,
        files=[files.HTMLZipFile(zip_path)],
        language="ar",
    )
def scrape_content(title, content_url):
    """
    title: Boys' clothing
    content_url: http://www.touchableearth.org/china-culture-boys-clothing/

    Returns a VideoNode for video pages, an HTML5AppNode for image pages, or
    None when the page is missing or the media can't be used.
    """
    print(" Scraping content node: %s (%s)" % (title, content_url))

    doc = get_parsed_html_from_url(content_url)
    if not doc:  # 404
        return None

    description = create_description(doc)
    source_id = doc.select_one(".current_post.active .post_id")["value"]

    base_node_attributes = {
        "source_id": source_id,
        "title": title,
        "license": TE_LICENSE,
        "description": description,
    }

    youtube_iframe = doc.select_one(".video-container iframe")
    if youtube_iframe:
        youtube_url = youtube_iframe["src"]
        youtube_id = get_youtube_id_from_url(youtube_url)
        if not youtube_id:
            print(" *** WARNING: youtube_id not found for content url", content_url)
            print(" Skipping.")
            return None

        try:
            info = ydl.extract_info(youtube_url, download=False)
            subtitles = info.get("subtitles")
            subtitle_languages = subtitles.keys() if subtitles else []
            print(" ... with subtitles in languages:", subtitle_languages)
        except youtube_dl.DownloadError as e:
            # Some of the videos have been removed from the YouTube channel --
            # skip creating content nodes for them entirely so they don't show up
            # as non-loadable videos in Kolibri.
            print(" NOTE: Skipping video download due to error: ", e)
            return None

        video_node = nodes.VideoNode(
            **base_node_attributes,
            derive_thumbnail=True,
            files=[WatermarkedYouTubeVideoFile(youtube_id=youtube_id)],
        )

        # Add subtitles in whichever languages are available.
        for language in subtitle_languages:
            video_node.add_file(
                files.YouTubeSubtitleFile(youtube_id=youtube_id,
                                          language=language))

        return video_node

    img = doc.select_one(".uncode-single-media-wrapper img")
    if img:
        # BUGFIX: subscript access raises KeyError when the data-guid
        # attribute is absent; use .get() so we fall back to src instead.
        img_src = img.get("data-guid") or img["src"]

        destination = tempfile.mkdtemp()
        download_file(img_src, destination, request_fn=make_request,
                      filename="image.jpg")

        # Wrap the image in a minimal standalone page for the HTML5 app zip.
        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write("""
                <!doctype html>
                <html>
                <head></head>
                <body>
                    <img src="image.jpg" style="width: 100%; max-width: 1200px;" />
                </body>
                </html>
            """)

        zip_path = create_predictable_zip(destination)
        return nodes.HTML5AppNode(
            **base_node_attributes,
            files=[files.HTMLZipFile(zip_path)],
            thumbnail=img_src,
        )

    return None
def download_content_node(category_node, url, title, thumbnail=None,
                          description=None):
    """Download a resource page into an HTML5 app node under category_node;
    if the page embeds a YouTube video, also add a sibling VideoNode.

    Args:
        category_node (TopicNode): parent node the new children are added to.
        url (str): resource page to download.
        title (str): node title (truncated to metadata limits).
        thumbnail (str, optional): thumbnail url, downloaded locally.
        description (str, optional): node description.
    """
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc, destination,
            'https://k12.thoughtfullearning.com', request_fn=make_request,
            url_blacklist=url_blacklist)

    # Strip site chrome so only the resource content remains.
    remove_node(doc, '#header')
    remove_node(doc, '.subMenuBarContainer')
    remove_node(doc, '.breadbookmarkcontainer')
    remove_node(doc, '.resourcePageTypeTitle')
    remove_node(doc, '.sharethis-wrapper')
    remove_node(doc, '.ccBlock')
    remove_node(doc, '#block-views-resource-info-block-block-1')
    remove_node(doc, '#block-views-resource-info-block-block-1')
    remove_node(doc, '#block-views-resource-info-block-block')
    remove_node(doc, '.productSuggestionContainer')
    remove_node(doc, 'footer')

    # For minilessons
    remove_node(doc, '.field-name-field-minilesson-downloadables')

    # For writing assessments
    remove_node(doc, '.assessmentTGLink')
    remove_node(doc, '.assessmentModelRubrics')
    remove_node(doc, '.view-display-id-attachment_1')

    # Write out the HTML source.
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print(" ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    thumbnail_path = None
    if thumbnail:
        # Manually download the thumbnail and use it so we can lowercase the
        # extension to be accepted by Ricecooker.
        thumbnail_filename = derive_filename(thumbnail)
        thumbnail_path = os.path.join(destination, thumbnail_filename)
        download_file(thumbnail, destination, request_fn=make_request,
                      filename=thumbnail_filename)

    # If there is an embedded video in the page source grab it as a video node.
    video_node = None
    iframe = doc.select_one('.embedded-video iframe')
    if iframe:
        youtube_url = iframe['src']
        youtube_id = get_youtube_id_from_url(youtube_url)
        info = ydl.extract_info(youtube_url, download=False)
        video_title = info['title']
        print(" ... and with video titled %s from www.youtube.com/watch?v=%s" % (
            video_title, youtube_id))
        video_node = nodes.VideoNode(
            source_id=youtube_id,
            title=truncate_metadata(info['title']),
            license=licenses.CC_BY_NC_SALicense(
                copyright_holder=truncate_metadata('Thoughtful Learning')),
            description=info['description'],
            language="en",
            derive_thumbnail=True,
            files=[files.YouTubeVideoFile(youtube_id)],
        )
        category_node.add_child(video_node)

    zip_path = create_predictable_zip(destination)
    app_node = nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        description=description,
        thumbnail=thumbnail_path,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )
    category_node.add_child(app_node)
def _build_tree(node, sourcetree):
    """
    Parse nodes given in `sourcetree` and add as children of `node`.

    Each entry in `sourcetree` is a dict describing one content item; its kind
    (topic/video/audio/document/exercise/html5) is inferred from its first
    file's attributes via `guess_content_kind`. Topics recurse into their
    `children` list. Entries whose kind cannot be determined are skipped.

    Args:
        node: parent node (TopicNode or ChannelNode) to attach children to.
        sourcetree (list): list of source node dicts.

    Returns:
        The same `node`, with children attached.
    """
    for child_source_node in sourcetree:
        try:
            main_file = child_source_node['files'][0] if 'files' in child_source_node else {}
            kind = guess_content_kind(path=main_file.get('path'),
                                      web_video_data=main_file.get('youtube_id') or main_file.get('web_url'),
                                      questions=child_source_node.get("questions"))
        except UnknownContentKindError:
            # Unrecognized entries are skipped rather than aborting the tree build.
            continue

        if kind == content_kinds.TOPIC:
            child_node = nodes.TopicNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            node.add_child(child_node)
            source_tree_children = child_source_node.get("children", [])
            _build_tree(child_node, source_tree_children)

        elif kind == content_kinds.VIDEO:
            child_node = nodes.VideoNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=get_license(child_source_node.get("license"), description="Description of license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                derive_thumbnail=True,  # video-specific data
                thumbnail=child_source_node.get('thumbnail'),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        elif kind == content_kinds.AUDIO:
            child_node = nodes.AudioNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        elif kind == content_kinds.DOCUMENT:
            child_node = nodes.DocumentNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        elif kind == content_kinds.EXERCISE:
            child_node = nodes.ExerciseNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                exercise_data={},  # Just set to default
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            # `or []` guards against a missing/None "questions" key, matching
            # the treatment of "files" above (was a TypeError before).
            for q in child_source_node.get("questions") or []:
                question = create_question(q)
                child_node.add_question(question)
            node.add_child(child_node)

        elif kind == content_kinds.HTML5:
            child_node = nodes.HTML5AppNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        else:  # unknown content file format
            continue

    return node
def build_tree_from_json(parent_node, sourcetree):
    """
    Recursively parse nodes in the list `sourcetree` and add them as children
    to the `parent_node`. Usually called with `parent_node` being a `ChannelNode`.

    Each source node must carry an explicit 'kind' key; an unexpected kind is
    logged and raises NotImplementedError. Topic nodes recurse into their
    'children' list; all other kinds are leaves built from the shared metadata
    kwargs (see `_leaf_node_kwargs`) plus any kind-specific options.

    Returns:
        The same `parent_node`, with children attached.
    """
    # Constructors for the leaf kinds that share the same metadata kwargs.
    leaf_constructors = {
        VIDEO_NODE: nodes.VideoNode,
        AUDIO_NODE: nodes.AudioNode,
        DOCUMENT_NODE: nodes.DocumentNode,
        HTML5_NODE: nodes.HTML5AppNode,
        SLIDESHOW_NODE: nodes.SlideshowNode,
    }

    for source_node in sourcetree:
        kind = source_node['kind']

        if kind == TOPIC_NODE:
            # Topics have no license/role (role is computed dynamically from
            # descendants) and tolerate a missing source_id.
            child_node = nodes.TopicNode(
                source_id=source_node.get('source_id', None),
                title=source_node['title'],
                description=source_node.get('description'),
                author=source_node.get('author'),
                aggregator=source_node.get('aggregator'),
                provider=source_node.get('provider'),
                language=source_node.get('language'),
                thumbnail=source_node.get('thumbnail'),
                tags=source_node.get('tags'),
            )
            parent_node.add_child(child_node)
            build_tree_from_json(child_node, source_node.get('children', []))

        elif kind == EXERCISE_NODE:
            child_node = nodes.ExerciseNode(
                exercise_data=source_node.get('exercise_data'),
                questions=[],  # populated below via add_questions
                **_leaf_node_kwargs(source_node)
            )
            add_questions(child_node, source_node.get('questions') or [])
            parent_node.add_child(child_node)

        elif kind in leaf_constructors:
            extra_kwargs = {}
            if kind == VIDEO_NODE:
                # video-specific option
                extra_kwargs['derive_thumbnail'] = source_node.get('derive_thumbnail', True)
            child_node = leaf_constructors[kind](
                **_leaf_node_kwargs(source_node), **extra_kwargs
            )
            add_files(child_node, source_node.get('files') or [])
            parent_node.add_child(child_node)

        else:
            LOGGER.critical('Unexpected node kind found: ' + kind)
            raise NotImplementedError('Unexpected node kind found in json data.')

    return parent_node


def _leaf_node_kwargs(source_node):
    """Shared constructor kwargs for every non-topic node kind.

    Note: unlike topics, leaf nodes require 'source_id' and 'license' keys
    (KeyError if absent), matching the original per-kind constructors.
    """
    return dict(
        source_id=source_node['source_id'],
        title=source_node['title'],
        description=source_node.get('description'),
        license=get_license(**source_node['license']),
        author=source_node.get('author'),
        aggregator=source_node.get('aggregator'),
        provider=source_node.get('provider'),
        role=source_node.get('role', roles.LEARNER),
        language=source_node.get('language'),
        thumbnail=source_node.get('thumbnail'),
        tags=source_node.get('tags'),
    )