def add_questions(exercise_node, question_list):
    EXPECTED_QUESTION_TYPES = [INPUT_QUESTION, MULTIPLE_SELECTION, SINGLE_SELECTION,
                               FREE_RESPONSE, PERSEUS_QUESTION]
    for q in question_list:
        question_type = q.get('question_type')
        if question_type not in EXPECTED_QUESTION_TYPES:
            LOGGER.critical(question_type)
            raise NotImplementedError('Unexpected question type found in channel json.')

        question_text = q.get('question')
        hints = q.get('hints')
        hints = hints if isinstance(hints, str) else [hint for hint in hints or []]

        if question_type == exercises.MULTIPLE_SELECTION:
            q_obj = questions.MultipleSelectQuestion(
                id=q['id'],
                question=question_text,
                correct_answers=[answer for answer in q['correct_answers']],
                all_answers=[answer for answer in q['all_answers']],
                hints=hints,
            )
            exercise_node.add_question(q_obj)
        elif question_type == exercises.SINGLE_SELECTION:
            q_obj = questions.SingleSelectQuestion(
                id=q['id'],
                question=question_text,
                correct_answer=q['correct_answer'],
                all_answers=[answer for answer in q['all_answers']],
                hints=hints,
            )
            exercise_node.add_question(q_obj)
        elif question_type == exercises.INPUT_QUESTION:
            q_obj = questions.InputQuestion(
                id=q['id'],
                question=question_text,
                answers=[answer for answer in q['answers']],
                hints=hints,
            )
            exercise_node.add_question(q_obj)
        elif question_type == exercises.PERSEUS_QUESTION:
            q_obj = questions.PerseusQuestion(
                id=q['id'],
                raw_data=q.get('item_data'),
                source_url="https://www.khanacademy.org/",
            )
            exercise_node.add_question(q_obj)
        else:
            raise UnknownQuestionTypeError(
                "Unrecognized question type '{0}': accepted types are {1}".format(
                    question_type, [key for key, value in exercises.question_choices]))
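# Hedged usage sketch for add_questions(): a minimal question_list as it might appear
# in a ricecooker json tree. The dict keys mirror what the function above reads; the
# 'single_selection' string is assumed to match exercises.SINGLE_SELECTION, and
# `exercise_node` stands for an already-created ExerciseNode-like object.
sample_question_list = [
    {
        'question_type': 'single_selection',  # assumption: value of exercises.SINGLE_SELECTION
        'id': 'sample-q-1',
        'question': 'What is 2 + 2?',
        'correct_answer': '4',
        'all_answers': ['3', '4', '5'],
        'hints': ['Count two pairs.'],
    },
]
# add_questions(exercise_node, sample_question_list)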
def get_html5_app_zip_path(slug):
    resp = session.get(READ_URL.format(slug))
    if resp.status_code == 200:
        resp = resp.json()
    else:
        LOGGER.info('The story {} is not available.\n'.format(slug))
        return None

    content = ""
    for page in resp['data']['pages']:
        soup = BeautifulSoup(page['html'], 'html.parser')
        if page.get('coverImage', None):
            img_src = page['coverImage']['sizes'][-1]['url']
            soup.img['src'] = img_src
        content = content + "\n" + str(soup)
    context = {'content': content}

    handle, destination = tempfile.mkstemp(suffix=".zip")
    os.close(handle)
    htmlwriter = HTMLWriter(destination)
    with htmlwriter as f:
        index_html = TEMPLATE_ENVIRONMENT.get_template('indexfile').render(context)
        f.write_index_contents(index_html)

    LOGGER.info(destination)
    return destination
def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None):
    youtube_info = None
    # 1. Try to get from cache if allowed:
    if os.path.exists(self.cache_path) and use_cache:
        LOGGER.info("==> [%s] Retrieving cached information...", self.__str__())
        youtube_info = json.load(open(self.cache_path))
    # 2. Fetch info from youtube_dl
    if not youtube_info:
        LOGGER.info("==> [%s] Requesting info from youtube...", self.__str__())
        os.makedirs(self.cache_dir, exist_ok=True)
        youtube_resource = None
        try:
            youtube_resource = YouTubeResource(self.url, useproxy=use_proxy)
        except youtube_dl.utils.ExtractorError as e:
            if "unavailable" in str(e):
                LOGGER.error("==> [%s] Resource unavailable for URL: %s",
                             self.__str__(), self.url)
                return None
        if youtube_resource:
            try:
                # Save YouTube info to JSON cache file
                youtube_info = youtube_resource.get_resource_info(options)
                if youtube_info:
                    json.dump(youtube_info,
                              open(self.cache_path, 'w'),
                              indent=4,
                              ensure_ascii=False,
                              sort_keys=True)
                else:
                    LOGGER.error("==> [%s] Failed to extract YouTube info", self.__str__())
            except Exception as e:
                LOGGER.error("==> [%s] Failed to get YouTube info: %s", self.__str__(), e)
                return None
    return youtube_info
def process(self):
    if 'fonts' in self.link:  # Omit google fonts
        self.tag.decompose()
        return

    # Parse urls in css (using parseString because it is much faster than parseUrl)
    style_sheet = downloader.read(self.link).decode('utf-8-sig', errors='ignore')
    sheet = cssutils.parseString(style_sheet)
    for css_url in cssutils.getUrls(sheet):
        if not css_url.startswith('data:image') and not css_url.startswith('data:application'):
            try:
                style_sheet = style_sheet.replace(
                    css_url,
                    os.path.basename(self.write_url(css_url, url=self.link, default_ext='.png')))
            except BROKEN_EXCEPTIONS as e:
                LOGGER.warn('Unable to download stylesheet url at {} ({})'.format(self.url, str(e)))

    self.tag[self.attribute] = self.format_url(
        self.write_contents(self.get_filename(self.link), style_sheet))
    return self.tag[self.attribute]
def is_media_file(self, url):
    """
    Makes a HEAD request for `url` and returns (verdict, head_response), where
    verdict is True if `url` points to a media file (.pdf, .docx, etc.)
    """
    head_response = self.make_request(url, method='HEAD')
    if head_response:
        content_type = head_response.headers.get('content-type', None)
        if not content_type:
            LOGGER.warning('HEAD response does not have `content-type` header. url = ' + url)
            return (False, None)
        if content_type in self.MEDIA_CONTENT_TYPES:
            return (True, head_response)
        else:
            return (False, head_response)
    else:
        LOGGER.warning('HEAD request failed for url ' + url)
        # Fallback strategy: try to guess if media link based on extension
        for media_ext in self.MEDIA_FILE_FORMATS:
            if url.endswith('.' + media_ext):
                return (True, None)
        # if all else fails, assume False
        return (False, None)
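# Hedged usage sketch for is_media_file(): callers unpack the (verdict, head_response)
# tuple and may reuse the HEAD response when one was obtained. `crawler` stands in for
# whatever crawler instance defines is_media_file() above; the URL is illustrative only.
# verdict, head_response = crawler.is_media_file('https://example.org/files/report.pdf')
# if verdict:
#     size = head_response.headers.get('content-length') if head_response else None
#     LOGGER.info('Media file detected, size=' + str(size))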
def scrape_video_menu(url):
    """ Scrape videos from url

        Args:
            url (str): url to scrape from (e.g. https://www.exploratorium.edu/video/subjects)
        Returns TopicNode containing all videos
    """
    LOGGER.info("SCRAPING VIDEOS...")
    video_topic = nodes.TopicNode(title="Videos", source_id="main-topic-videos")

    contents = BeautifulSoup(read(url), 'html5lib')
    for subject in contents.find_all('div', {'class': 'subject'}):
        title = subject.find('div', {'class': 'name'}).text.strip().replace("’", "'")
        LOGGER.info("    {}".format(title))
        topic = nodes.TopicNode(
            title=title,
            source_id="videos-{}".format(title),
            thumbnail=get_thumbnail_url(subject.find('img')['src']),
        )
        video_topic.add_child(topic)
        scrape_video_subject(subject.find('a')['href'], topic)

    return video_topic
def get_subtitle_languages(youtube_id):
    """
    Returns a list of the subtitle language codes available for a given video.
    We try to get the list using two approaches:
      1. The YouTube API (works for public videos when YOUTUBE_API_KEY is defined)
      2. Slow fallback using YouTubeResource, which in turn calls youtube_dl
    """
    # Check if we already have the lang_codes list for this youtube_id cached...
    cache_filename = '{}__lang_codes.json'.format(youtube_id)
    cache_filepath = os.path.join(SUBTITLE_LANGUAGES_CACHE_DIR, cache_filename)
    if os.path.exists(cache_filepath):  # Cache hit!
        with open(cache_filepath) as jsonf:
            cache_data = json.load(jsonf)
            return cache_data['lang_codes']

    if YOUTUBE_API_KEY:
        try:
            lang_codes = get_subtitles_using_api(youtube_id)
            return lang_codes
        except HttpError as e:
            LOGGER.info("Can't access API for video {} ...".format(youtube_id))
    lang_codes = get_subtitles_using_youtube_dl(youtube_id)

    # Cache the results in chefdata/sublangscache/{youtube_id}__lang_codes.json
    cache_data = {"lang_codes": lang_codes}
    with open(cache_filepath, 'w') as jsonf:
        json.dump(cache_data, jsonf, ensure_ascii=True)

    return lang_codes
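# Hedged usage sketch for get_subtitle_languages(): the returned values are whatever
# language codes YouTube reports (e.g. 'en', 'es', 'pt-BR'); mapping them to le_utils
# language objects is left to the caller. The video id below is illustrative only.
# lang_codes = get_subtitle_languages('dQw4w9WgXcQ')
# for lang_code in lang_codes:
#     LOGGER.info('Subtitles available for language code ' + lang_code)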
def recursive_extract_website_games(subtree):
    """
    Processes all child nodes of the subtree then calls itself on any folder-like
    child nodes. Weird, I know, but it works so I'm not touching it.
    """
    if 'children' in subtree:
        # do processing
        new_children = []
        for child in subtree['children']:
            child_url = child['url']
            if child['kind'] == 'PrathamZipResource':
                if is_website_game(child_url):
                    # extract all game names referenced in manual curation Excel file to process separately...
                    child_url = child_url.replace('https://www.prathamopenschool.org/CourseContent/Games/', '')
                    child_url = child_url.replace('http://www.prathamopenschool.org/CourseContent/Games/', '')
                    child['title_en'] = child_url.replace('.zip', '')
                    print('EXTRACTED game name', child['title_en'], 'from url', child['url'])
                    website_games.append(child)
                else:
                    # leave other games where they are
                    LOGGER.info('Undocumented game-like web resource ' + child['url'])
                    new_children.append(child)
            else:
                # leave other content as is
                new_children.append(child)
        #
        # recurse
        for child in subtree['children']:
            recursive_extract_website_games(child)
def pre_run(self, args, options):
    """
    Build the ricecooker json tree for the channel.
    The code here is similar to the code in `ricecooker_channel/chef.py`, but
    the channel hierarchy is built using dictionary objects instead of classes.
    """
    LOGGER.info('In pre_run...')

    # 1. Create the channel tree
    ricecooker_json_tree = dict(
        title='Sample JSON channel',
        source_domain='source.org',
        source_id='sample-json-channel',
        description='This channel was created from the files in the content/ '
                    + 'directory and the metadata in sample_ricecooker_json_tree.json',
        thumbnail='./content/sample-json-channel-files/channel_thumbnail.jpg',
        language='en',
        children=[],
    )
    # The root object of the ricecooker json tree contains the channel info;
    # add topic and content nodes to the children list to build the tree.

    # 2. Add topic nodes and content nodes to the tree
    self.create_content_nodes(ricecooker_json_tree)
    self.create_exercise_nodes(ricecooker_json_tree)

    # 3. Save the tree to chefdata/trees/sample_ricecooker_json_tree.json
    json_tree_path = self.get_json_tree_path()
    write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
    LOGGER.info('Finished writing ricecooker json tree.')
def website_game_webresouce_to_ricecooker_node(lang, web_resource):
    """
    Create Ricecooker Json structure for game from web resource dict `web_resource`.
    """
    game_node = dict(
        kind=content_kinds.HTML5,
        source_id=web_resource['source_id'],
        language=lang,
        title=web_resource['title'],
        description='source_url=' + web_resource['url'] if DEBUG_MODE else '',
        license=PRADIGI_LICENSE,
        thumbnail=web_resource.get('thumbnail_url'),
        files=[],
    )
    zip_tmp_path = get_zip_file(web_resource['url'], web_resource['main_file'])
    if zip_tmp_path:
        zip_file = dict(
            file_type=file_types.HTML5,
            path=zip_tmp_path,
            language=lang,
        )
        game_node['files'].append(zip_file)
        LOGGER.debug('Created HTML5AppNode for game ' + web_resource['title'])
        return game_node
    else:
        LOGGER.error('Failed to create zip for game at url=' + web_resource['url'])
        return None
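# Hedged usage sketch for website_game_webresouce_to_ricecooker_node(): the returned
# dict is a ricecooker json-tree node, so it is normally appended to a parent topic's
# 'children' list. The keys shown are the ones the function above reads; the values
# (and the existence of `parent_topic`) are assumptions for illustration.
sample_web_resource = {
    'source_id': 'CountingGame',
    'title': 'Counting Game',
    'url': 'https://www.prathamopenschool.org/CourseContent/Games/CountingGame.zip',
    'main_file': 'index.html',
    'thumbnail_url': None,
}
# game_node = website_game_webresouce_to_ricecooker_node('hi', sample_web_resource)
# if game_node:
#     parent_topic['children'].append(game_node)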
def make_request(url, *args, **kwargs):
    response = sess.get(url, *args, **kwargs)
    if response.status_code != 200:
        LOGGER.debug("NOT FOUND:" + url)
    elif not response.from_cache:
        LOGGER.debug("NOT CACHED:" + url)
    return response
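# Hedged usage sketch for make_request(): `sess` is assumed to be a module-level
# requests_cache-style session, which is why `response.from_cache` is available.
# Extra args/kwargs are passed straight through to sess.get().
# response = make_request('https://example.org/page.html', timeout=30)
# if response.status_code == 200:
#     page = BeautifulSoup(response.content, 'html.parser')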
def on_page(self, url, page, context):
    """
    Basic handler that appends current page to parent's children list and adds
    all links on current page to the crawling queue.
    """
    LOGGER.debug('in on_page ' + url)
    page_dict = dict(
        kind='PageWebResource',
        url=url,
        children=[],
    )
    page_dict.update(context)

    # attach this page as another child in parent page
    context['parent']['children'].append(page_dict)

    links = page.find_all('a')
    for i, link in enumerate(links):
        if link.has_attr('href'):
            link_url = urljoin(url, link['href'])
            if self.should_ignore_url(link_url):
                pass
                # Uncomment three lines below for debugging to record ignored links
                # ignored_rsrc_dict = self.create_ignored_url_dict(link_url)
                # ignored_rsrc_dict['parent'] = page_dict
                # page_dict['children'].append(ignored_rsrc_dict)
            else:
                self.enqueue_url_and_context(link_url, {'parent': page_dict})
        else:
            pass
def load_pradigi_structure(which=None):
    csv_path = download_structure_csv(which=which)
    struct_list = []
    with open(csv_path, 'r') as csvfile:
        reader = csv.DictReader(csvfile, fieldnames=PRADIGI_SHEET_CSV_FILEDNAMES)
        next(reader)  # Skip Headers row
        next(reader)  # Skip info line
        for row in reader:
            clean_row = _clean_dict(row)
            if clean_row[SUBJECT_KEY] is None:
                continue  # skip blank lines (identified by missing subject col)
            if clean_row[AGE_GROUP_KEY] in PRADIGI_AGE_GROUPS and clean_row[SUBJECT_KEY] in PRADIGI_SUBJECTS:
                resource_type = clean_row[RESOURCE_TYPE_KEY]
                if resource_type == 'Game' and clean_row[GAMENAME_KEY]:
                    # make sure Game Name is present when specifying a game
                    struct_list.append(clean_row)
                elif resource_type == 'Website Resources':
                    struct_list.append(clean_row)
                else:
                    LOGGER.warning('Problem with structure row {}'.format(str(clean_row)))
            else:
                LOGGER.warning('Unrecognized structure row {}'.format(str(clean_row)))
    return struct_list
def add_subpages_from_wikipedia_list(topic, list_url):
    """ add_subpages_from_wikipedia_list: Parses wiki pages and creates corresponding files
    To understand how the following parsing works, look at:
        1. the source of the page (e.g. https://en.wikipedia.org/wiki/List_of_citrus_fruits),
           or inspect in chrome dev tools
        2. the documentation for BeautifulSoup version 4: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
    """
    page = read_source(list_url)    # Parse the page into BeautifulSoup format, so we can loop through and manipulate it
    table = page.find("table")      # Extract the main table from the page

    # Loop through all the rows in the table
    for row in table.find_all("tr"):
        columns = row.find_all("td")    # Extract the columns (cells, really) within the current row
        if not columns:                 # Some rows are empty, so just skip
            continue

        link = columns[0].find("a")     # Get the link to the subpage
        if not link:                    # Some rows don't have links, so skip
            continue

        # Extract the URL and title for the subpage
        url = make_fully_qualified_url(link["href"])
        title = link.text
        LOGGER.info("      Writing {}...".format(title))

        # Attempt to extract a thumbnail for the subpage, from the second column in the table
        image = columns[1].find("img")
        thumbnail_url = make_fully_qualified_url(image["src"]) if image else None
        if thumbnail_url and not (thumbnail_url.endswith("jpg") or thumbnail_url.endswith("png")):
            thumbnail_url = None

        # Download the wikipedia page into an HTML5 app node
        html5app = download_wikipedia_page(url, thumbnail=thumbnail_url, title=title)

        # Add the downloaded HTML5 app node into the topic
        topic.add_child(html5app)
def apply_corrections_by_node_id(api, channel_tree, channel_id, corrections_by_node_id):
    """
    Given a dict `corrections_by_node_id` of the form,
        {
            'nodes_modified': {
                '<node_id (str)>': { modification dict1 },
                '<node_id (str)>': { modification dict2 },
            },
            'nodes_added': {
                '<node_id (str)>': {'new_parent': (str), 'attributes': {...}},
            },
            'nodes_deleted': {
                '<node_id (str)>': {'old_parent': (str), 'attributes': {...}},
            },
            'nodes_moved': {
                '<node_id (str)>': {'old_parent': (str), 'new_parent': (str), 'attributes': {...}},
            },
        }
    this function will make the appropriate Studio API calls to apply the patch.
    """
    LOGGER.debug('Applying corrections...')
    #
    # Modifications
    for node_id, modifications_dict in corrections_by_node_id['nodes_modified'].items():
        apply_modifications_for_node_id(api, channel_tree, node_id, modifications_dict)
    #
    # Deletions
    for node_id, deletion_dict in corrections_by_node_id['nodes_deleted'].items():
        apply_deletion_for_node_id(api, channel_tree, channel_id, node_id, deletion_dict)
def construct_channel(self, *args, **kwargs):
    """ construct_channel: Creates ChannelNode and builds topic tree

        Solar Spell is organized with the following hierarchy (sample):
            Creative Arts (source_id = dir-creative-arts)
            |--- Culinary Arts (source_id = dir-culinary-arts)
            |--- |--- Real Pasifik 2 introducing Chef Alexis Tahiapuhe of Tahiti (source_id = file-real pasifik 2 introducing chef lela bolobolo of fiji.mp4)
            |--- Pacific Islands Arts and Culture (source_id = dir_pacific_islands_arts_and_culture)
            |--- |--- Cook Islands National Cultural Policy 10 July 2017_final english (File)
            |--- Teaching Resources and Classroom Activities
            Environment (source_id = dir-environment)
            |--- Adapting to Climate Change
            |--- |--- Action Against Climate Change Tuvalu Water and climate change
            |--- Climate Change Info
            |--- |--- Animated Pacific Island Climate Change Videos
            ...

        Returns: ChannelNode
    """
    LOGGER.info("Constructing channel from {}...".format(BASE_URL))
    channel = self.get_channel(*args, **kwargs)  # Creates ChannelNode from data in self.channel_info
    LOGGER.info('   Writing {} Folder...'.format(CHANNEL_NAME))
    endpoint = BASE_URL + "content/"
    scrape_content(endpoint, channel)
    raise_for_invalid_channel(channel)  # Check for errors in channel construction
    return channel
def keep_folder(raw_path):
    keep = True
    for pattern in DIR_EXCLUDE_PATTERNS:
        if pattern in raw_path:
            LOGGER.debug('rejecting %s', raw_path)
            keep = False
    return keep
def scrape_snack_subject(slug, topic):
    """ Scrape snack subject page

        Args:
            slug (str): url slug to scrape from (e.g. /subject/arts)
            topic (TopicNode): topic to add html nodes to
    """
    contents = BeautifulSoup(read(slug), 'html5lib')

    for activity in contents.find_all('div', {'class': 'activity'}):
        LOGGER.info("    {}".format(activity.find('h5').text.strip()))
        # Scrape snack pages into zips
        write_to_path, tags = scrape_snack_page(activity.find('a')['href'])
        if not write_to_path:
            continue

        # Create html node
        description = activity.find('div', {'class': 'pod-description'})
        topic.add_child(nodes.HTML5AppNode(
            source_id=activity.find('a')['href'],
            title=activity.find('h5').text.strip().replace("’", "'"),
            description=description.text.strip() if description else "",
            license=LICENSE,
            copyright_holder=COPYRIGHT_HOLDER,
            files=[files.HTMLZipFile(path=write_to_path)],
            thumbnail=get_thumbnail_url(activity.find('img')['src']),
            tags=tags,
        ))

    # Scrape next page (if any)
    next_page_url = get_next_page_url(contents)
    if next_page_url:
        scrape_snack_subject(next_page_url, topic)
def get_nodes_by_ids_complete(self, studio_id):
    headers = {"Authorization": "Token {0}".format(self.token)}
    url = NODES_ENDPOINT + studio_id
    LOGGER.info('  GET ' + url)
    response = requests.get(url, headers=headers)
    studio_node = response.json()[0]
    return studio_node
def construct_channel(self, *args, **kwargs):
    """ construct_channel: Creates ChannelNode and builds topic tree

        Wikipedia is organized with the following hierarchy:
            Citrus (Folder)
            |   Citrus Page HTML Zip (File)
            Potatoes (Folder)
            |   Potatoes Page HTML Zip (File)

        Returns: ChannelNode
    """
    LOGGER.info("Constructing channel from {}...".format(BASE_URL))
    channel = self.get_channel(*args, **kwargs)  # Creates ChannelNode from data in self.channel_info
    create_topic(channel, "Citrus!", "List_of_citrus_fruits")         # Add Citrus folder
    create_topic(channel, "Potatoes!", "List_of_potato_cultivars")    # Add Potatoes folder
    raise_for_invalid_channel(channel)  # Check for errors in channel construction
    return channel
def write_question_row_from_question_dict(self, source_id, question_dict):
    file_path = get_metadata_file_path(self.channeldir, self.questionsinfo)
    with open(file_path, 'a') as csv_file:
        csvwriter = csv.DictWriter(csv_file, EXERCISE_QUESTIONS_INFO_HEADER)

        def _safe_list_get(l, idx, default):
            try:
                return l[idx]
            except IndexError:
                return default

        # change image links to local
        question_dict = self._make_local_question_images(question_dict)

        type_lookup = {
            'single_selection': exercises.SINGLE_SELECTION,
            'true_false': exercises.SINGLE_SELECTION,
            'multiple_selection': exercises.MULTIPLE_SELECTION,
            'input_question': exercises.INPUT_QUESTION,
        }

        # ANSWERS
        answers = json.loads(question_dict['answers'])
        options = []  # all options
        correct = []  # correct answers
        for ans in answers:
            options.append(ans['answer'])
            if ans['correct']:
                correct.append(ans['answer'])
        extra_options = DEFAULT_EXTRA_ITEMS_SEPARATOR.join(options[5:])

        # HINTS
        hints_raw = json.loads(question_dict['hints'])
        if hints_raw:
            raise ValueError('Found hints but not handled..')

        LOGGER.info('     - writing question with studio_id=' + question_dict['assessment_id'])
        question_row = {}
        question_row[EXERCISE_SOURCEID_KEY] = source_id
        question_row[EXERCISE_QUESTIONS_QUESTIONID_KEY] = question_dict['assessment_id']
        question_row[EXERCISE_QUESTIONS_TYPE_KEY] = type_lookup[question_dict['type']]
        question_row[EXERCISE_QUESTIONS_QUESTION_KEY] = question_dict['question']
        question_row[EXERCISE_QUESTIONS_OPTION_A_KEY] = _safe_list_get(options, 0, None)
        question_row[EXERCISE_QUESTIONS_OPTION_B_KEY] = _safe_list_get(options, 1, None)
        question_row[EXERCISE_QUESTIONS_OPTION_C_KEY] = _safe_list_get(options, 2, None)
        question_row[EXERCISE_QUESTIONS_OPTION_D_KEY] = _safe_list_get(options, 3, None)
        question_row[EXERCISE_QUESTIONS_OPTION_E_KEY] = _safe_list_get(options, 4, None)
        question_row[EXERCISE_QUESTIONS_OPTION_FGHI_KEY] = extra_options
        question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER_KEY] = _safe_list_get(correct, 0, None)
        question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER2_KEY] = _safe_list_get(correct, 1, None)
        question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER3_KEY] = _safe_list_get(correct, 2, None)
        question_row[EXERCISE_QUESTIONS_HINT_1_KEY] = None  # TODO
        question_row[EXERCISE_QUESTIONS_HINT_2_KEY] = None  # TODO
        question_row[EXERCISE_QUESTIONS_HINT_3_KEY] = None  # TODO
        question_row[EXERCISE_QUESTIONS_HINT_4_KEY] = None  # TODO
        question_row[EXERCISE_QUESTIONS_HINT_5_KEY] = None  # TODO
        question_row[EXERCISE_QUESTIONS_HINT_6789_KEY] = None  # TODO
        # WRITE QUESTION ROW
        csvwriter.writerow(question_row)
def process_folder(channel, rel_path, filenames, metadata_provider):
    """
    Create `ContentNode`s from each file in this folder and add them to `channel`
    under the path `rel_path`.
    """
    LOGGER.debug('IN process_folder ' + str(rel_path) + '  ' + str(filenames))
    if not keep_folder(rel_path):
        return

    chan_path = chan_path_from_rel_path(rel_path, metadata_provider.channeldir)
    chan_path_tuple = path_to_tuple(chan_path)
    chan_path_list = list(chan_path_tuple)
    LOGGER.debug('chan_path_list=' + str(chan_path_list))

    # FIND THE CONTAINING NODE (channel or topic)
    if len(chan_path_list) == 1:
        # CASE CHANNEL ROOT: `rel_path` points to `channeldir`
        # No need to create a topic node here since channel already exists
        containing_node = channel  # attach content nodes in filenames directly to channel
    else:
        # CASE TOPIC FOLDER: `rel_path` points to a channelroot subfolder (a.k.a. TopicNode)
        dirname = chan_path_list.pop()  # name of the folder (used as ID for internal lookup)
        topic_parent_node = get_topic_for_path(channel, chan_path_list)

        # read topic metadata to get title and description for the TopicNode
        topic_metadata = metadata_provider.get(chan_path_tuple)
        thumbnail_chan_path = topic_metadata.get('thumbnail_chan_path', None)
        if thumbnail_chan_path:
            thumbnail_rel_path = rel_path_from_chan_path(thumbnail_chan_path, metadata_provider.channeldir)
        else:
            thumbnail_rel_path = None

        # create TopicNode for this folder
        topic = dict(
            kind=TOPIC_NODE,
            dirname=dirname,
            source_id='sourceid:' + rel_path,
            title=topic_metadata.get('title', dirname),
            description=topic_metadata.get('description', None),
            author=topic_metadata.get('author', None),
            language=topic_metadata.get('language', None),
            license=topic_metadata.get('license', None),
            thumbnail=thumbnail_rel_path,
            children=[],
        )
        topic_parent_node['children'].append(topic)
        containing_node = topic  # attach content nodes in filenames to the newly created topic

    # filter filenames
    filenames_cleaned = filter_filenames(filenames)
    filenames_cleaned2 = filter_thumbnail_files(chan_path, filenames_cleaned, metadata_provider)

    # PROCESS FILES
    for filename in filenames_cleaned2:
        chan_filepath = os.path.join(chan_path, filename)
        chan_filepath_tuple = path_to_tuple(chan_filepath)
        metadata = metadata_provider.get(chan_filepath_tuple)
        node = make_content_node(metadata_provider.channeldir, rel_path, filename, metadata)
        containing_node['children'].append(node)  # attach content node to containing_node
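# Hedged usage sketch for process_folder(): in the ricecooker "linecook" workflow the
# directory walk is normally driven by the framework, but a manual walk would look
# roughly like this. `channel` (a json-tree dict with a 'children' list) and
# `metadata_provider` are assumed to be set up elsewhere; the rel_path convention is
# inferred from chan_path_from_rel_path() above, not separately verified.
# for root, _dirs, filenames in os.walk(metadata_provider.channeldir):
#     rel_path = os.path.relpath(root, os.path.dirname(metadata_provider.channeldir))
#     process_folder(channel, rel_path, filenames, metadata_provider)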
def to_tag(self, filename=None):
    try:
        img = self.create_tag('img')
        img['src'] = self.to_zip(filename=filename)
        return img
    except BROKEN_EXCEPTIONS as e:
        LOGGER.error(str(e))
        return self.create_broken_link_message(self.url)
def construct_channel(self, *args, **kwargs):
    channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

    lang_names = list(self.data.keys())
    lang_names.sort()

    for lang_name in lang_names:
        lang_data = self.data[lang_name]
        LOGGER.info("Creating app for language: {}".format(lang_name))
        lang = languages.getlang_by_native_name(lang_name)

        zip_dir = self.client.create_zip_dir_for_page(lang_data['url'])

        soup = self.client.get_page_soup(lang_data['url'])

        # Remove the translation list if found
        translations = soup.find('div', {'id': 'translations'})
        if translations:
            translations.extract()

        # Grab the localized title
        title = soup.find('span', {'id': 'share_title'}).text

        # Save the modified index.html page
        thumbnail = None
        for resource in lang_data['resources']:
            if 'dp3t.png' in resource:
                thumbnail = os.path.join(zip_dir, resource)
                break

        with open(os.path.join(zip_dir, 'index.html'), 'wb') as f:
            f.write(soup.prettify(encoding='utf-8'))

        # create_predictable_zip ensures that the ZIP file does not change each time it's
        # created. This ensures that the zip doesn't get re-uploaded just because zip
        # metadata changed.
        zip_file = zip.create_predictable_zip(zip_dir)
        zip_name = lang.primary_code if lang else lang_name
        zip_filename = os.path.join(self.ZIP_DIR, "{}.zip".format(zip_name))
        os.makedirs(os.path.dirname(zip_filename), exist_ok=True)
        os.rename(zip_file, zip_filename)

        topic = nodes.TopicNode(source_id=lang_name, title=lang_name)
        zip_node = nodes.HTML5AppNode(
            source_id="covid19-sim-{}".format(lang_name),
            title=title,
            files=[files.HTMLZipFile(zip_filename)],
            license=licenses.PublicDomainLicense("Marcel Salathé & Nicky Case"),
            language=lang,
            thumbnail=thumbnail,
        )
        topic.add_child(zip_node)
        channel.add_child(topic)

    return channel
def download_resource(endpoint):
    try:
        url = '{}{}'.format(BASE_URL, endpoint.lstrip('/'))
        filename, ext = os.path.splitext(endpoint)
        filename = '{}.zip'.format(filename.lstrip('/').replace('/', '-'))
        write_to_path = CeibalPageScraper(url, locale='es').to_file(
            filename=filename, directory=DOWNLOAD_DIRECTORY)
        return write_to_path
    except Exception as e:
        LOGGER.error(str(e))
def keep_folder(raw_path):
    """
    Keep only folders that don't contain patterns in `DIR_EXCLUDE_PATTERNS`.
    """
    keep = True
    for pattern in DIR_EXCLUDE_PATTERNS:
        if pattern in raw_path:
            LOGGER.debug('rejecting %s', raw_path)
            keep = False
    return keep
def to_tag(self, filename=None):
    try:
        embed = self.create_tag('embed')
        embed['src'] = self.to_zip(filename=filename)
        embed['width'] = '100%'
        embed['style'] = 'height: 500px;max-height: 100vh;'
        return embed
    except BROKEN_EXCEPTIONS as e:
        LOGGER.error(str(e))
        return self.create_broken_link_message(self.url)
def make_fully_qualified_url(url):
    if url.startswith("//"):
        return "https:" + url
    if url.startswith("/"):
        return "https://en.wikipedia.org" + url
    if not url.startswith("http"):
        LOGGER.warning("Skipping bad URL (relative to unknown location): " + url)
        return None
    return url
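# Hedged usage sketch for make_fully_qualified_url(): expected results for the URL
# shapes handled above (inferred from the branches, not separately verified).
# make_fully_qualified_url('//upload.wikimedia.org/citrus.jpg')     # -> 'https://upload.wikimedia.org/citrus.jpg'
# make_fully_qualified_url('/wiki/Citrus')                          # -> 'https://en.wikipedia.org/wiki/Citrus'
# make_fully_qualified_url('https://en.wikipedia.org/wiki/Citrus')  # -> unchanged
# make_fully_qualified_url('wiki/Citrus')                           # -> None (warning logged)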
def _download_file(self, write_to_path):
    with html_writer.HTMLWriter(write_to_path) as zipper:
        try:
            self.zipper = zipper
            self.to_zip(filename='index.html')
        except Exception as e:
            # Any errors here will just say index.html file does not exist, so
            # print out error for more descriptive debugging
            LOGGER.error(str(e))
def transform_video_vertical(vertical, parent_title=None):
    if 'children' not in vertical:
        return None, []

    # 1. LOOK FOR AN OPTIONAL html PREFIX TO USE AS DESCRIPTION
    description = ''
    # Extract an optional description from the first html node
    first_child = vertical['children'][0]
    if first_child['kind'] == 'html':
        description = extract_text_from_html_item(first_child, translate_from='ar')

    if parent_title:
        video_title = parent_title + ' ' + vertical['display_name']
    else:
        video_title = vertical['display_name']

    # 2. GET THE VIDEO
    videos = [ch for ch in vertical['children'] if ch['kind'] == 'video']
    assert len(videos) == 1, 'multiple videos found'
    video = videos[0]
    video_dict = dict(
        kind=content_kinds.VIDEO,
        source_id=video.get('youtube_id') or video.get('path'),
        title=video_title,
        author='Edraak',
        description=description,
        language=getlang('ar').code,
        license=EDRAAK_LICENSE,
        files=[],
    )
    if 'youtube_id' in video:
        file_dict = dict(
            file_type=content_kinds.VIDEO,
            youtube_id=video['youtube_id'],
            language=getlang('ar').code,
            high_resolution=False,
        )
        video_dict['files'].append(file_dict)
    elif 'path' in video:
        file_dict = dict(
            file_type=content_kinds.VIDEO,
            path=video['path'],
            language=getlang('ar').code,
            ffmpeg_settings={"crf": 24},
        )
        video_dict['files'].append(file_dict)
    else:
        LOGGER.error('Video does not have youtube_id or path ' + str(video))

    # 3. LOOK FOR AN OPTIONAL RESOURCES html
    downloadable_resources = []
    htmls = [ch for ch in vertical['children'] if ch['kind'] == 'html']
    for html in htmls:
        if 'downloadable_resources' in html:
            downloadable_resources.extend(html['downloadable_resources'])

    return video_dict, downloadable_resources
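# Hedged usage sketch for transform_video_vertical(): a minimal vertical dict with a
# single video child, mirroring the keys the function above reads. The youtube_id
# value is illustrative only.
sample_vertical = {
    'display_name': 'Lesson video',
    'children': [
        {'kind': 'video', 'youtube_id': 'abc123xyz45'},
    ],
}
# video_dict, downloadable_resources = transform_video_vertical(sample_vertical, parent_title='Unit 1')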