def render_topic_pages(node):
    """
    Recursively render one HTML page per topic-tree node into ``tmp_dir``.

    The breadcrumb trail (``parents``) is built by walking the ``parent``
    links that the annotation pass added to each node, and the page counter
    is kept on the function object itself
    (``render_topic_pages.pages_rendered``).
    """
    parents = [node] if node.get("children") else []
    parent = node["parent"]
    while parent:
        parents.append(parent)
        parent = parent["parent"]
    # Finally, render templates into the destination
    template_context = {
        "topic_tree": topic_tree,
        "topic": node,
        "parents": parents
    }
    with i18n.translate_block(language):
        topic_html = render_to_string("kalite_zim/topic.html", template_context)
    # Replace absolute references to '/static' with relative
    topic_html = topic_html.replace("/static", "static")
    dest_html = os.path.join(tmp_dir, node["id"] + ".html")
    logger.info("Rendering {}".format(dest_html))
    # BUG FIX: the original used open(...).write(...) without ever closing
    # the handle; use a context manager so the file is flushed and closed.
    with open(dest_html, "w") as topic_file:
        topic_file.write(topic_html)
    render_topic_pages.pages_rendered += 1
    for child in node.get('children', []):
        render_topic_pages(child)
def recurse_nodes(node):
    """
    Recursively mark each node in the tree as available/unavailable and
    translate its title and description in-place.

    A node with children is available when any child is; a leaf falls back
    to the availability recorded in the exercise/content cache (charitably
    defaulting to available when nothing is annotated).
    """
    children = node.get("children", [])
    # Recurse first so every child carries an "available" flag.
    for child in children:
        recurse_nodes(child)

    if children:
        node["available"] = any(c.get("available", False) for c in children)
    else:
        # Leaf node: look up availability in the appropriate cache.
        cache = exercise_cache if node.get("kind") == "Exercise" else content_cache
        node["available"] = cache.get(node.get("id"), {}).get("available", True)

    # Translate everything for good measure
    with i18n.translate_block(language):
        node["title"] = _(node.get("title", ""))
        description = node.get("description")
        node["description"] = _(description) if description else ""
def annotate_tree(topic, depth=0, parent=None):
    """
    Recursively annotate each topic-tree node with exercise/content cache
    data, translated strings, and navigation metadata (url, parent, depth).

    Only "Video" and "Topic" children are kept; all other kinds are dropped
    from the tree.
    """
    # Keep (and recurse into) only the child kinds the exporter renders.
    kept_children = []
    for child in topic.get('children', []):
        if child.get("kind") not in ("Video", "Topic"):
            continue
        annotate_tree(child, depth=depth + 1, parent=topic)
        kept_children.append(child)
    topic["children"] = kept_children

    kind = topic.get("kind")
    node_id = topic.get("id")
    if kind == "Exercise":
        topic['exercise'] = exercise_cache.get(node_id, {})
        exercise_json_output[node_id] = topic['exercise']
    elif kind == "Topic":
        # Topics carry no payload of their own.
        pass
    else:
        # Anything else (e.g. Video) gets content data instead.
        topic['exercise'] = None
        topic['content'] = content_cache.get(node_id, {})
        content_json_output[node_id] = topic['content']
        if not topic['content']:
            logger.error('No content!?, id is: {}'.format(topic.get('id')))

    # Translate everything for good measure
    with i18n.translate_block(language):
        topic["title"] = _(topic.get("title", ""))
        description = topic.get("description")
        topic["description"] = _(description) if description else ""

    topic["url"] = topic["id"] + ".html"
    topic["parent"] = parent
    topic["depth"] = depth
    # Strip bulky fields that the exporter never uses.
    for key in ("child_data", "keywords", "hide", "contains"):
        topic.pop(key, None)
def generate_flat_topic_tree(node_cache=None, lang_code=settings.LANGUAGE_CODE, alldata=False):
    """
    Flatten the node cache into ``{category_name: {node_name: data}}``.

    When ``alldata`` is set the full node dict is passed through untouched;
    otherwise only a small, translated subset of fields is kept per node.
    """
    with i18n.translate_block(lang_code):
        categories = node_cache or get_node_cache(language=i18n.lcode_to_django_lang(lang_code))
        result = {}
        # make sure that we only get the slug of child of a topic
        # to avoid redundancy
        for category_name, category in categories.iteritems():
            flat_category = result.setdefault(category_name, {})
            for node_name, node in category.iteritems():
                if alldata:
                    flat_category[node_name] = node
                else:
                    flat_category[node_name] = {
                        'title': _(node['title']),
                        'path': node['path'],
                        'kind': node['kind'],
                        'available': node.get('available', True),
                        'keywords': node.get('keywords', []),
                    }
    return result
def annotate_tree(topic, depth=0, parent=None):
    """
    We need to recurse into the tree in order to annotate elements
    with topic data and exercise data
    """
    # Keep (and recurse into) only "Video" and "Topic" children; every
    # other kind is silently removed from the tree.
    children = topic.get('children', [])
    new_children = []
    for child_topic in children:
        if child_topic.get("kind") in ("Video", "Topic"):
            annotate_tree(child_topic, depth=depth + 1, parent=topic)
            new_children.append(child_topic)
    topic["children"] = new_children
    if topic.get("kind") == "Exercise":
        # Attach cached exercise data and record it for the JSON output dump.
        topic['exercise'] = exercise_cache.get(topic.get("id"), {})
        exercise_json_output[topic.get("id")] = topic['exercise']
    elif topic.get("kind") == "Topic":
        # Topic nodes carry no exercise/content payload of their own.
        pass
    else:
        # Anything else (e.g. Video) gets content-cache data instead.
        topic['exercise'] = None
        topic['content'] = content_cache.get(topic.get("id"), {})
        content_json_output[topic.get("id")] = topic['content']
        if not topic['content']:
            logger.error('No content!?, id is: {}'.format(
                topic.get('id')))
    # Translate everything for good measure
    with i18n.translate_block(language):
        topic["title"] = _(topic.get("title", ""))
        topic["description"] = _(topic.get(
            "description", "")) if topic.get("description") else ""
    # Navigation metadata consumed by the page templates.
    topic["url"] = topic["id"] + ".html"
    topic["parent"] = parent
    topic["depth"] = depth
    # Drop bulky fields that the exporter never uses.
    for key in ("child_data", "keywords", "hide", "contains"):
        topic.pop(key, None)
def handle(self, *args, **options):
    """
    Export the KA Lite topic tree for one language as a ZIM file.

    Pipeline: load caches -> annotate the topic tree -> hard-link (or
    transcode) media into a tmp directory -> render HTML pages -> invoke
    zimwriterfs on the tmp directory.

    Takes exactly one positional argument: the destination ``.zim`` path.
    Raises CommandError on bad arguments, missing tools, or a failed
    transcode.
    """
    if len(args) != 1:
        raise CommandError("Takes exactly 1 argument")

    dest_file = os.path.abspath(args[0])

    logger.info("Starting up KA Lite export2zim command")
    beginning = datetime.now()
    logger.info("Begin: {}".format(beginning))

    language = options.get('language')
    if not language:
        raise CommandError("Must specify a language!")

    if not options.get('tmp_dir'):
        tmp_dir = os.path.join(tempfile.gettempdir(), 'ka-lite-zim_{}'.format(language))
    else:
        tmp_dir = options.get('tmp_dir')
    tmp_dir = os.path.abspath(tmp_dir)

    # Refuse to work in a dirty tmp dir unless told to clear or resume.
    if os.path.exists(tmp_dir) and os.listdir(tmp_dir):
        if options['clear']:
            logger.info("Clearing directory {}".format(tmp_dir))
            shutil.rmtree(tmp_dir)
        elif options['resume']:
            logger.info("Resuming in dirty tmp directory {}".format(tmp_dir))
        else:
            raise CommandError(
                "{} not empty, use the -c option to clean it, -r to resume, or use an empty destination directory.".format(
                    tmp_dir
                )
            )

    zimwriterfs = options.get("zimwriterfs", None)
    publisher = options.get("publisher")
    transcode2webm = options.get("transcode2webm")

    # ffmpeg is optional: without it we skip thumbnails and webm transcoding.
    ffmpeg = find_executable("ffmpeg")
    if not ffmpeg:
        logger.warning("FFMpeg not found in your path, you won't be able to create missing thumbnails or transcode to webm.")

    if not zimwriterfs:
        zimwriterfs = find_executable("zimwriterfs")
        if not zimwriterfs:
            raise CommandError("Could not find zimwriterfs in your path, try specifying --zimwriterfs=/path")
    if not os.path.exists(zimwriterfs):
        raise CommandError("Invalid --zimwriterfs")

    from kalite_zim import __name__ as base_path
    base_path = os.path.abspath(base_path)
    data_path = os.path.join(base_path, 'data')

    # Where subtitles are found in KA Lite
    subtitle_src_dir = i18n.get_srt_path(language)

    logger.info("Will export videos for language: {}".format(language))
    logger.info("Preparing KA Lite topic tree...")

    # Use live data
    if not options.get('test'):
        # This way of doing things will be deprecated in KA Lite 0.16
        topic_tree_json_path = topic_tools_settings.TOPICS_FILEPATHS.get('khan')
        content_cache = get_content_cache(language=language, annotate=True)
        exercise_cache = get_exercise_cache(language=language)
    # Use test data
    else:
        topic_tree_json_path = os.path.join(data_path, 'test_topics.json')
        # BUG FIX: close the fixture files after loading (the original used
        # json.load(open(...)) and leaked the handles).
        with open(os.path.join(data_path, 'test_content.json')) as content_f:
            content_cache = json.load(content_f)
        with open(os.path.join(data_path, 'test_exercise.json')) as exercise_f:
            exercise_cache = json.load(exercise_f)

    topic_tree = softload_json(topic_tree_json_path, logger=logger.debug, raises=False)

    # Per-run JSON dumps of everything annotated onto the tree.
    content_json_output = {}
    exercise_json_output = {}

    def annotate_tree(topic, depth=0, parent=None):
        """
        We need to recurse into the tree in order to annotate elements
        with topic data and exercise data
        """
        # Keep only Video/Topic children; other kinds are dropped.
        children = topic.get('children', [])
        new_children = []
        for child_topic in children:
            if child_topic.get("kind") in ("Video", "Topic"):
                annotate_tree(child_topic, depth=depth + 1, parent=topic)
                new_children.append(child_topic)
        topic["children"] = new_children
        if topic.get("kind") == "Exercise":
            topic['exercise'] = exercise_cache.get(topic.get("id"), {})
            exercise_json_output[topic.get("id")] = topic['exercise']
        elif topic.get("kind") == "Topic":
            pass
        else:
            topic['exercise'] = None
            topic['content'] = content_cache.get(topic.get("id"), {})
            content_json_output[topic.get("id")] = topic['content']
            if not topic['content']:
                logger.error('No content!?, id is: {}'.format(topic.get('id')))
        # Translate everything for good measure
        with i18n.translate_block(language):
            topic["title"] = _(topic.get("title", ""))
            topic["description"] = _(topic.get("description", "")) if topic.get("description") else ""
        topic["url"] = topic["id"] + ".html"
        topic["parent"] = parent
        topic["depth"] = depth
        for key in ("child_data", "keywords", "hide", "contains"):
            topic.pop(key, None)

    # 1. Annotate a topic tree
    annotate_tree(topic_tree)

    # 2. Now go through the tree and copy each element into the destination
    # zim file system
    def copy_media(node):
        """Hard-link (or transcode) each video + thumbnail + subtitle into
        tmp_dir, pruning empty topics and unavailable videos on the way."""
        if node['kind'] == 'Topic':
            # Don't do anything if it's a topic
            pass
        elif node['kind'] == 'Exercise':
            # Exercises cannot be displayed
            node["content"]["available"] = False
        elif node['kind'] == 'Video':
            if node['content']['format'] == "webm":
                logger.warning("Found a duplicate ID for {}, re-downloading".format(node['id']))
                node['content']['format'] = "mp4"
            # Available is False by default until we locate the file
            node["content"]["available"] = False
            node_dir = os.path.join(tmp_dir, node["path"])
            if not os.path.exists(node_dir):
                os.makedirs(node_dir)
            video_file_name = node['id'] + '.' + node['content']['format']
            thumb_file_name = node['id'] + '.png'
            video_file_src = os.path.join(CONTENT_ROOT, video_file_name)
            video_file_dest = os.path.join(node_dir, video_file_name)
            thumb_file_src = os.path.join(CONTENT_ROOT, thumb_file_name)
            thumb_file_dest = os.path.join(node_dir, thumb_file_name)
            if options['download'] and not os.path.exists(video_file_src):
                logger.info("Video file being downloaded to: {}".format(video_file_src))
                download_video(
                    node['content']['youtube_id'],
                    node['content']['format'],
                    CONTENT_ROOT,
                )
            if os.path.exists(video_file_src):
                if transcode2webm:
                    # Two-pass VP8/Vorbis transcode via a shared pass log.
                    ffmpeg_pass_log = "/tmp/logfile_vp8.fpf"
                    if os.path.isfile(ffmpeg_pass_log):
                        os.unlink(ffmpeg_pass_log)
                    video_file_name = node['id'] + '.webm'
                    video_file_dest = os.path.join(node_dir, video_file_name)
                    if os.path.isfile(video_file_dest):
                        logger.info("Already encoded: {}".format(video_file_dest))
                    else:
                        ffmpeg_base_args = [
                            ffmpeg,
                            "-i", video_file_src,
                            "-codec:v", "libvpx",
                            "-quality", "best",
                            "-cpu-used", "0",
                            "-b:v", "300k",
                            "-qmin", "10",  # 10=lowest value
                            "-qmax", "35",  # 42=highest value
                            "-maxrate", "300k",
                            "-bufsize", "600k",
                            "-threads", "8",
                            # "-vf", "scale=-1",
                            "-codec:a", "libvorbis",
                            # "-b:a", "128k",
                            "-aq", "5",
                            "-f", "webm",
                        ]
                        ffmpeg_pass1 = ffmpeg_base_args + [
                            "-an",  # Disables audio, no effect first pass
                            "-pass", "1",
                            "-passlogfile", ffmpeg_pass_log,
                            video_file_dest,
                        ]
                        ffmpeg_pass2 = ffmpeg_base_args + [
                            "-pass", "2",
                            "-y",
                            "-passlogfile", ffmpeg_pass_log,
                            video_file_dest,
                        ]
                        for cmd in (ffmpeg_pass1, ffmpeg_pass2):
                            process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
                            stdout_data, _stderr_data = process.communicate()
                            if process.returncode != 0:
                                logger.error("Error invoking ffmpeg: {}".format((_stderr_data or "") + (stdout_data or "")))
                                logger.error("Command was: {}".format(" ".join(cmd)))
                                raise CommandError("Could not complete transcoding")
                        node['content']['format'] = "webm"
                else:
                    # If not transcoding, just link the original file
                    os.link(video_file_src, video_file_dest)
                node["video_url"] = os.path.join(
                    node["path"],
                    video_file_name
                )
                copy_media.videos_found += 1
                logger.info("Videos processed: {}".format(copy_media.videos_found))
                node["content"]["available"] = True
                # Create thumbnail if it wasn't downloaded
                if not os.path.exists(thumb_file_src):
                    fp = create_thumbnail(video_file_src, output_format="png")
                    if fp is None:
                        logger.error("Failed to create thumbnail for {}".format(video_file_src))
                    else:
                        logger.info("Successfully created thumbnail for {}".format(video_file_src))
                        # BUG FIX: use open() in a context manager instead of
                        # the leaked file() handle.
                        with open(thumb_file_src, 'wb') as thumb_f:
                            thumb_f.write(fp.read())
                # Handle thumbnail
                if os.path.exists(thumb_file_src):
                    node["thumbnail_url"] = os.path.join(
                        node["path"],
                        node['id'] + '.png'
                    )
                    if not os.path.exists(thumb_file_dest):
                        os.link(thumb_file_src, thumb_file_dest)
                else:
                    node["thumbnail_url"] = None
                subtitle_srt = os.path.join(
                    subtitle_src_dir,
                    node['id'] + '.srt'
                )
                if os.path.isfile(subtitle_srt):
                    subtitle_vtt = os.path.join(
                        node_dir,
                        node['id'] + '.vtt'
                    )
                    # Convert to .vtt because this format is understood
                    # by latest video.js and the old ones that read
                    # .srt don't work with newer jquery etc.
                    submarine_parser(subtitle_srt, subtitle_vtt)
                    if not os.path.exists(subtitle_vtt):
                        logger.warning("Subtitle not converted: {}".format(subtitle_srt))
                    else:
                        logger.info("Subtitle convert from SRT to VTT: {}".format(subtitle_vtt))
                        node["subtitle_url"] = os.path.join(
                            node["path"],
                            node['id'] + '.vtt'
                        )
            else:
                if options['download']:
                    logger.error("File not found or downloaded: {}".format(video_file_src))
        else:
            logger.error("Invalid node, kind: {}".format(node.get("kind", None)))
            # Exercises cannot be displayed
            node["content"] = {"available": False}

        # Prune empty topics and videos that never became available.
        new_children = []
        for child in node.get('children', []):
            copy_media(child)
            empty_topic = child["kind"] == "Topic" and not child.get("children", [])
            unavailable_video = child["kind"] == "Video" and not child.get("content", {}).get("available", False)
            if not (empty_topic or unavailable_video):
                new_children.append(child)
        node['children'] = new_children
    copy_media.videos_found = 0

    def render_topic_pages(node):
        """Render one HTML page per node, recursing through the tree."""
        parents = [node] if node.get("children") else []
        parent = node["parent"]
        while parent:
            parents.append(parent)
            parent = parent["parent"]
        # Finally, render templates into the destination
        template_context = {
            "topic_tree": topic_tree,
            "topic": node,
            "parents": parents
        }
        with i18n.translate_block(language):
            topic_html = render_to_string("kalite_zim/topic.html", template_context)
        # Replace absolute references to '/static' with relative
        topic_html = topic_html.replace("/static", "static")
        dest_html = os.path.join(tmp_dir, node["id"] + ".html")
        logger.info("Rendering {}".format(dest_html))
        # BUG FIX: close the page file instead of leaking the handle.
        with open(dest_html, "w") as topic_f:
            topic_f.write(topic_html)
        render_topic_pages.pages_rendered += 1
        for child in node.get('children', []):
            render_topic_pages(child)
    render_topic_pages.pages_rendered = 0

    logger.info("Hard linking video files from KA Lite...")
    copy_media(topic_tree)

    sys.stderr.write("\n")
    logger.info("Done!")

    # Configure django-compressor
    compressor_init(os.path.join(base_path, 'static'))

    # Finally, render templates into the destination
    template_context = {
        "topic_tree": topic_tree,
        "welcome": True,
    }
    with i18n.translate_block(language):
        welcome_html = render_to_string("kalite_zim/welcome.html", template_context)
        about_html = render_to_string("kalite_zim/about.html", template_context)
    # Replace absolute references to '/static' with relative
    welcome_html = welcome_html.replace("/static", "static")
    about_html = about_html.replace("/static", "static")

    # Write the welcome.html file
    # BUG FIX: both writes below leaked file handles in the original.
    with open(os.path.join(tmp_dir, 'welcome.html'), 'w') as welcome_f:
        welcome_f.write(welcome_html)
    with open(os.path.join(tmp_dir, 'about.html'), 'w') as about_f:
        about_f.write(about_html)

    # Render all topic html files
    render_topic_pages(topic_tree)

    # Copy in static data after it's been handled by django compressor
    # (this happens during template rendering)
    shutil.copytree(os.path.join(base_path, 'static'), os.path.join(tmp_dir, 'static'))

    ending = datetime.now()
    duration = int((ending - beginning).total_seconds())

    logger.info("Total number of videos found: {}".format(copy_media.videos_found))
    logger.info("Total number of topic pages created: {}".format(render_topic_pages.pages_rendered))
    logger.info("Invoking zimwriterfs, writing to: {}".format(dest_file))

    # BUG FIX: "--description" was passed twice; keep a single description.
    zimwriterfs_args = (
        zimwriterfs,
        "--welcome", "welcome.html",
        "--favicon", "static/img/ka_leaf.png",
        "--publisher", publisher,
        "--creator", "KhanAcademy.org",
        "--description", "Khan Academy ({})".format(language),
        "--language", language,
        tmp_dir,
        dest_file,
    )

    process = subprocess.Popen(zimwriterfs_args, stdout=subprocess.PIPE)
    stdout_data, _stderr_data = process.communicate()
    if process.returncode != 0:
        # BUG FIX: the original called .format() on logger.error's return
        # value (so the message was never formatted), and _stderr_data is
        # None because stderr was not piped.
        logger.error("Error invoking zimwriterfs: {}".format((_stderr_data or "") + (stdout_data or "")))

    logger.info(
        "Duration: {h:} hours, {m:} minutes, {s:} seconds".format(
            h=duration // 3600,
            m=(duration % 3600) // 60,
            s=duration % 60,
        )
    )
def get_content_cache(force=False, annotate=False, language=None):
    """
    Return (and lazily build) the per-language content cache.

    When DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP is set and ``force`` is not,
    a pre-annotated sqlite cache is preferred; otherwise the cache is
    (re)built and annotated in-process with stream/thumbnail/subtitle URLs
    and availability flags.
    """
    if not language:
        language = django_settings.LANGUAGE_CODE
    global CONTENT
    if CONTENT is None:
        CONTENT = {}
    if CONTENT.get(language) is None:
        content = None
        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP and not force:
            content = softload_sqlite_cache(settings.CONTENT_CACHE_FILEPATH)
        if content:
            # Pre-annotated cache found on disk: use it as-is.
            CONTENT[language] = content
            return CONTENT[language]
        else:
            if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP:
                call_command("create_content_db")
                content = softload_sqlite_cache(settings.CONTENT_CACHE_FILEPATH)
            else:
                content = softload_json(settings.CONTENT_FILEPATH, logger=logging.debug, raises=False)
            CONTENT[language] = content
            # Freshly loaded content must be annotated below.
            annotate = True
    if annotate:
        # Loop through all content items and put thumbnail urls, content urls,
        # and subtitle urls on the content dictionary, and list all languages
        # that the content is available in.
        try:
            contents_folder = os.listdir(django_settings.CONTENT_ROOT)
        except OSError:
            contents_folder = []

        # Map "<id>.srt" -> [language codes] by walking the srt directory once.
        subtitle_langs = {}
        if os.path.exists(i18n.get_srt_path()):
            for (dirpath, dirnames, filenames) in os.walk(i18n.get_srt_path()):
                # Only bother looking at files that are inside a 'subtitles' directory
                if os.path.basename(dirpath) == "subtitles":
                    lc = os.path.basename(os.path.dirname(dirpath))
                    for filename in filenames:
                        if filename in subtitle_langs:
                            subtitle_langs[filename].append(lc)
                        else:
                            subtitle_langs[filename] = [lc]

        for key, content in CONTENT[language].iteritems():
            default_thumbnail = create_thumbnail_url(content.get("id"))
            dubmap = i18n.get_id2oklang_map(content.get("id"))
            # BUG FIX: content_lang was only bound inside `if dubmap:`, so
            # the translate_block below could raise NameError (or reuse the
            # previous item's language) for items without a dub map.
            content_lang = ""
            if dubmap:
                content_lang = i18n.select_best_available_language(language, available_codes=dubmap.keys()) or ""
                if content_lang:
                    dubbed_id = dubmap.get(content_lang)
                    format = content.get("format", "")
                    if (dubbed_id + "." + format) in contents_folder:
                        # File is on disk: serve it locally.
                        content["available"] = True
                        thumbnail = create_thumbnail_url(dubbed_id) or default_thumbnail
                        content["content_urls"] = {
                            "stream": django_settings.CONTENT_URL + dubmap.get(content_lang) + "." + format,
                            "stream_type": "{kind}/{format}".format(kind=content.get("kind", "").lower(), format=format),
                            "thumbnail": thumbnail,
                        }
                    elif django_settings.BACKUP_VIDEO_SOURCE:
                        # Fall back to a remote source when configured.
                        content["available"] = True
                        content["content_urls"] = {
                            "stream": django_settings.BACKUP_VIDEO_SOURCE.format(youtube_id=dubbed_id, video_format=format),
                            "stream_type": "{kind}/{format}".format(kind=content.get("kind", "").lower(), format=format),
                            "thumbnail": django_settings.BACKUP_VIDEO_SOURCE.format(youtube_id=dubbed_id, video_format="png"),
                        }
                    else:
                        content["available"] = False
                else:
                    content["available"] = False
            else:
                content["available"] = False

            # Get list of subtitle language codes currently available
            subtitle_lang_codes = subtitle_langs.get("{id}.srt".format(id=content.get("id")), [])
            # Generate subtitle URLs for any subtitles that do exist for this content item
            subtitle_urls = [{
                "code": lc,
                "url": django_settings.STATIC_URL + "srt/{code}/subtitles/{id}.srt".format(code=lc, id=content.get("id")),
                "name": i18n.get_language_name(lc)
            } for lc in subtitle_lang_codes]
            # Sort all subtitle URLs by language code
            content["subtitle_urls"] = sorted(subtitle_urls, key=lambda x: x.get("code", ""))

            with i18n.translate_block(language):
                content["selected_language"] = content_lang
                content["title"] = _(content["title"])
                content["description"] = _(content.get("description")) if content.get("description") else ""

            CONTENT[language][key] = content

        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP:
            try:
                CONTENT[language].commit()
            except IOError as e:
                logging.warn("Annotated content cache file failed in saving with error {e}".format(e=e))

    return CONTENT[language]
def get_exercise_cache(force=False, language=None):
    """
    Return (and lazily build) the per-language exercise cache.

    Tries a pre-annotated on-disk JSON cache first (unless ``force``);
    otherwise loads the raw exercise data and annotates each exercise with
    availability, template path, language, and translated strings, then
    writes the annotated cache back to disk.
    """
    if not language:
        language = django_settings.LANGUAGE_CODE
    global EXERCISES
    if EXERCISES is None:
        EXERCISES = {}
    if EXERCISES.get(language) is None:
        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP and not force:
            # Fast path: a previously annotated cache file on disk.
            exercises = softload_json(
                cache_file_path("exercises_{0}.json".format(language)),
                logger=logging.debug,
                raises=False
            )
            if exercises:
                EXERCISES[language] = exercises
                return EXERCISES[language]
        EXERCISES[language] = softload_json(settings.EXERCISES_FILEPATH, logger=logging.debug, raises=False)
        # English-language exercises live in application space, translations in user space
        if language == "en":
            exercise_root = os.path.join(settings.KHAN_EXERCISES_DIRPATH, "exercises")
        else:
            exercise_root = i18n.get_localized_exercise_dirpath(language)
        if os.path.exists(exercise_root):
            try:
                exercise_templates = os.listdir(exercise_root)
            except OSError:
                exercise_templates = []
        else:
            exercise_templates = []
        for exercise in EXERCISES[language].values():
            exercise_file = exercise["name"] + ".html"
            exercise_template = exercise_file
            exercise_lang = "en"
            # The central server doesn't have an assessment item database
            if django_settings.CENTRAL_SERVER:
                available = False
            elif exercise.get("uses_assessment_items", False):
                # Assessment-item exercise: available iff at least one of
                # its items resolves in the local assessment-item store.
                available = False
                items = []
                for item in exercise.get("all_assessment_items", []):
                    item = json.loads(item)
                    if get_assessment_item_data(request=None, assessment_item_id=item.get("id")):
                        items.append(item)
                        available = True
                exercise["all_assessment_items"] = items
            else:
                # Template-based exercise: available iff the template exists.
                available = exercise_template in exercise_templates
            # Get the language codes for exercise templates that exist
            # Try to minimize the number of os.path.exists calls (since they're a bottleneck) by using the same
            # precedence rules in i18n.select_best_available_languages
            available_langs = set(["en"] + [language] * available)
            # Return the best available exercise template
            exercise_lang = i18n.select_best_available_language(language, available_codes=available_langs)
            if exercise_lang == "en":
                exercise_template = exercise_file
            else:
                exercise_template = os.path.join(exercise_lang, exercise_file)
            with i18n.translate_block(language):
                exercise["available"] = available
                exercise["lang"] = exercise_lang
                exercise["template"] = exercise_template
                exercise["title"] = _(exercise.get("title", ""))
                exercise["description"] = _(exercise.get("description", "")) if exercise.get("description") else ""
        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP:
            # Persist the annotated cache; best-effort only.
            try:
                with open(cache_file_path("exercises_{0}.json".format(language)), "w") as f:
                    json.dump(EXERCISES[language], f)
            except IOError as e:
                logging.warn("Annotated exercise cache file failed in saving with error {e}".format(e=e))
    return EXERCISES[language]
def get_content_cache(force=False, annotate=False, language=settings.LANGUAGE_CODE):
    """
    Return (and lazily build) the per-language content cache (JSON variant).

    Annotation adds availability, stream/thumbnail URLs, and subtitle URLs
    to every content item, then persists the annotated cache to a
    language-suffixed ``.cache`` file.
    """
    global CONTENT, CONTENT_FILEPATH

    if CONTENT is None:
        CONTENT = {}
    if CONTENT.get(language) is None:
        CONTENT[language] = softload_json(CONTENT_FILEPATH, logger=logging.debug, raises=False)
        # Freshly loaded content must be annotated below.
        annotate = True

    if annotate:
        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP and not force:
            content = softload_json(CONTENT_FILEPATH + "_" + language + ".cache", logger=logging.debug, raises=False)
            if content:
                CONTENT[language] = content
                return CONTENT[language]

        # Loop through all content items and put thumbnail urls, content urls,
        # and subtitle urls on the content dictionary, and list all languages
        # that the content is available in.
        for content in CONTENT[language].values():
            default_thumbnail = create_thumbnail_url(content.get("id"))
            dubmap = i18n.get_id2oklang_map(content.get("id"))
            # BUG FIX: content_lang was only bound inside `if dubmap:`, so
            # the translate_block below could raise NameError (or reuse the
            # previous item's language) for items without a dub map.
            content_lang = ""
            if dubmap:
                content_lang = i18n.select_best_available_language(language, available_codes=dubmap.keys()) or ""
                if content_lang:
                    dubbed_id = dubmap.get(content_lang)
                    format = content.get("format", "")
                    if is_content_on_disk(dubbed_id, format):
                        content["available"] = True
                        thumbnail = create_thumbnail_url(dubbed_id) or default_thumbnail
                        content["content_urls"] = {
                            "stream": settings.CONTENT_URL + dubmap.get(content_lang) + "." + format,
                            "stream_type": "{kind}/{format}".format(kind=content.get("kind", "").lower(), format=format),
                            "thumbnail": thumbnail,
                        }
                    else:
                        content["available"] = False
                else:
                    content["available"] = False
            else:
                content["available"] = False

            # Get list of subtitle language codes currently available.
            # (Perf: the original re-checked os.path.exists for each code a
            # second time in the URL comprehension; the first filter already
            # guarantees existence.)
            subtitle_lang_codes = [] if not os.path.exists(i18n.get_srt_path()) else [
                lc for lc in os.listdir(i18n.get_srt_path())
                if os.path.exists(i18n.get_srt_path(lc, content.get("id")))
            ]

            # Generate subtitle URLs for any subtitles that do exist for this content item
            subtitle_urls = [{
                "code": lc,
                "url": settings.STATIC_URL + "srt/{code}/subtitles/{id}.srt".format(code=lc, id=content.get("id")),
                "name": i18n.get_language_name(lc)
            } for lc in subtitle_lang_codes]

            # Sort all subtitle URLs by language code
            content["subtitle_urls"] = sorted(subtitle_urls, key=lambda x: x.get("code", ""))

            with i18n.translate_block(content_lang):
                content["selected_language"] = content_lang
                content["title"] = _(content["title"])
                content["description"] = _(content.get("description", "")) if content.get("description") else ""

        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP:
            # Persist the annotated cache; best-effort only.
            try:
                with open(CONTENT_FILEPATH + "_" + language + ".cache", "w") as f:
                    json.dump(CONTENT[language], f)
            except IOError as e:
                logging.warn("Annotated content cache file failed in saving with error {e}".format(e=e))

    return CONTENT[language]
def get_exercise_cache(force=False, language=settings.LANGUAGE_CODE):
    """
    Return (and lazily build) the per-language exercise cache (JSON variant).

    Annotates each exercise with availability (via assessment items or a
    template file on disk), the best-matching localized template, and
    translated strings, then persists the annotated cache to a
    language-suffixed ``.cache`` file.
    """
    global EXERCISES, EXERCISES_FILEPATH

    if EXERCISES is None:
        EXERCISES = {}
    if EXERCISES.get(language) is None:
        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP and not force:
            # Fast path: a previously annotated cache file on disk.
            exercises = softload_json(EXERCISES_FILEPATH + "_" + language + ".cache", logger=logging.debug, raises=False)
            if exercises:
                EXERCISES[language] = exercises
                return EXERCISES[language]
        EXERCISES[language] = softload_json(EXERCISES_FILEPATH, logger=logging.debug, raises=False)

        exercise_root = os.path.join(settings.KHAN_EXERCISES_DIRPATH, "exercises")
        if os.path.exists(exercise_root):
            exercise_templates = os.listdir(exercise_root)
        else:
            exercise_templates = []

        assessmentitems = get_assessment_item_cache()
        TEMPLATE_FILE_PATH = os.path.join(settings.KHAN_EXERCISES_DIRPATH, "exercises", "%s")

        for exercise in EXERCISES[language].values():
            exercise_file = exercise["name"] + ".html"
            exercise_template = exercise_file
            exercise_lang = "en"

            if exercise.get("uses_assessment_items", False):
                # Available iff at least one assessment item resolves.
                available = False
                items = []
                # BUG FIX: the default was the *string* "[]"; iterating it
                # yields the characters '[' and ']' and json.loads('[')
                # raises ValueError. The default must be an empty list.
                for item in exercise.get("all_assessment_items", []):
                    item = json.loads(item)
                    if assessmentitems.get(item.get("id")):
                        items.append(item)
                        available = True
                exercise["all_assessment_items"] = items
            else:
                # Template-based exercise: available iff the file exists.
                available = os.path.isfile(TEMPLATE_FILE_PATH % exercise_template)

            # Get the language codes for exercise templates that exist
            available_langs = set(
                ["en"] + [lang_code for lang_code in exercise_templates
                          if os.path.exists(os.path.join(exercise_root, lang_code, exercise_file))]
            )

            # Return the best available exercise template
            exercise_lang = i18n.select_best_available_language(language, available_codes=available_langs)
            if exercise_lang == "en":
                exercise_template = exercise_file
            else:
                exercise_template = os.path.join(exercise_lang, exercise_file)

            with i18n.translate_block(language):
                exercise["available"] = available
                exercise["lang"] = exercise_lang
                exercise["template"] = exercise_template
                exercise["title"] = _(exercise.get("title", ""))
                exercise["description"] = _(exercise.get("description", "")) if exercise.get("description") else ""

        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP:
            # Persist the annotated cache; best-effort only.
            try:
                with open(EXERCISES_FILEPATH + "_" + language + ".cache", "w") as f:
                    json.dump(EXERCISES[language], f)
            except IOError as e:
                logging.warn("Annotated exercise cache file failed in saving with error {e}".format(e=e))

    return EXERCISES[language]
def handle(self, *args, **options):
    """Export the KA Lite topic tree for one language as a .zim file.

    Pipeline: annotate the topic tree, hard-link/transcode videos and
    subtitles into a temp directory, render the HTML pages, then invoke
    ``zimwriterfs`` to package the directory into ``args[0]``.

    :param args: exactly one positional argument, the destination .zim path.
    :param options: command options (language, tmp_dir, clear, resume,
        download, test, zimwriterfs, publisher, transcode2webm).
    :raises CommandError: on bad arguments, a dirty tmp dir without
        -c/-r, missing zimwriterfs, or a failed transcode.
    """
    if len(args) != 1:
        raise CommandError("Takes exactly 1 argument")

    dest_file = os.path.abspath(args[0])

    logger.info("Starting up KA Lite export2zim command")
    beginning = datetime.now()
    logger.info("Begin: {}".format(beginning))

    language = options.get('language')
    if not language:
        raise CommandError("Must specify a language!")

    if not options.get('tmp_dir'):
        tmp_dir = os.path.join(tempfile.gettempdir(), 'ka-lite-zim_{}'.format(language))
    else:
        tmp_dir = options.get('tmp_dir')

    tmp_dir = os.path.abspath(tmp_dir)

    if os.path.exists(tmp_dir) and os.listdir(tmp_dir):
        if options['clear']:
            logger.info("Clearing directory {}".format(tmp_dir))
            shutil.rmtree(tmp_dir)
        elif options['resume']:
            logger.info("Resuming in dirty tmp directory {}".format(tmp_dir))
        else:
            raise CommandError(
                "{} not empty, use the -c option to clean it, -r to resume, or use an empty destination directory."
                .format(tmp_dir))

    zimwriterfs = options.get("zimwriterfs", None)
    publisher = options.get("publisher")
    transcode2webm = options.get("transcode2webm")
    ffmpeg = find_executable("ffmpeg")

    if not ffmpeg:
        logger.warning("FFMpeg not found in your path, you won't be able to create missing thumbnails or transcode to webm.")

    if not zimwriterfs:
        zimwriterfs = find_executable("zimwriterfs")
        if not zimwriterfs:
            raise CommandError("Could not find zimwriterfs in your path, try specifying --zimwriterfs=/path")
    if not os.path.exists(zimwriterfs):
        raise CommandError("Invalid --zimwriterfs")

    from kalite_zim import __name__ as base_path
    base_path = os.path.abspath(base_path)
    data_path = os.path.join(base_path, 'data')

    # Where subtitles are found in KA Lite
    subtitle_src_dir = i18n.get_srt_path(language)

    logger.info("Will export videos for language: {}".format(language))
    logger.info("Preparing KA Lite topic tree...")

    # Use live data
    if not options.get('test'):
        # This way of doing things will be deprecated in KA Lite 0.16
        topic_tree_json_path = topic_tools_settings.TOPICS_FILEPATHS.get('khan')
        content_cache = get_content_cache(language=language, annotate=True)
        exercise_cache = get_exercise_cache(language=language)
    # Use test data
    else:
        topic_tree_json_path = os.path.join(data_path, 'test_topics.json')
        content_cache = json.load(open(os.path.join(data_path, 'test_content.json')))
        exercise_cache = json.load(open(os.path.join(data_path, 'test_exercise.json')))

    topic_tree = softload_json(topic_tree_json_path, logger=logger.debug, raises=False)

    content_json_output = {}
    exercise_json_output = {}

    def annotate_tree(topic, depth=0, parent=None):
        """
        We need to recurse into the tree in order to annotate elements
        with topic data and exercise data
        """
        children = topic.get('children', [])
        new_children = []
        # Only Videos and Topics are kept in the exported tree.
        for child_topic in children:
            if child_topic.get("kind") in ("Video", "Topic"):
                annotate_tree(child_topic, depth=depth + 1, parent=topic)
                new_children.append(child_topic)
        topic["children"] = new_children
        if topic.get("kind") == "Exercise":
            topic['exercise'] = exercise_cache.get(topic.get("id"), {})
            exercise_json_output[topic.get("id")] = topic['exercise']
        elif topic.get("kind") == "Topic":
            pass
        else:
            topic['exercise'] = None
            topic['content'] = content_cache.get(topic.get("id"), {})
            content_json_output[topic.get("id")] = topic['content']
            if not topic['content']:
                logger.error('No content!?, id is: {}'.format(topic.get('id')))

        # Translate everything for good measure
        with i18n.translate_block(language):
            topic["title"] = _(topic.get("title", ""))
            topic["description"] = _(topic.get("description", "")) if topic.get("description") else ""

        topic["url"] = topic["id"] + ".html"
        topic["parent"] = parent
        topic["depth"] = depth
        # Drop bulky keys that the templates never use.
        for key in ("child_data", "keywords", "hide", "contains"):
            topic.pop(key, None)

    # 1. Annotate a topic tree
    annotate_tree(topic_tree)

    # 2. Now go through the tree and copy each element into the destination
    # zim file system
    def copy_media(node):
        """Hard-link (or transcode) video/thumbnail/subtitle files for
        ``node`` into tmp_dir, then recurse and prune unavailable children."""
        if node['kind'] == 'Topic':
            # Don't do anything if it's a topic
            pass
        elif node['kind'] == 'Exercise':
            # Exercises cannot be displayed
            node["content"]["available"] = False
        elif node['kind'] == 'Video':
            if node['content']['format'] == "webm":
                logger.warning("Found a duplicate ID for {}, re-downloading".format(node['id']))
                node['content']['format'] = "mp4"
            # Available is False by default until we locate the file
            node["content"]["available"] = False
            node_dir = os.path.join(tmp_dir, node["path"])
            if not os.path.exists(node_dir):
                os.makedirs(node_dir)
            video_file_name = node['id'] + '.' + node['content']['format']
            thumb_file_name = node['id'] + '.png'
            video_file_src = os.path.join(CONTENT_ROOT, video_file_name)
            video_file_dest = os.path.join(node_dir, video_file_name)
            thumb_file_src = os.path.join(CONTENT_ROOT, thumb_file_name)
            thumb_file_dest = os.path.join(node_dir, thumb_file_name)

            if options['download'] and not os.path.exists(video_file_src):
                logger.info("Video file being downloaded to: {}".format(video_file_src))
                download_video(
                    node['content']['youtube_id'],
                    node['content']['format'],
                    CONTENT_ROOT,
                )

            if os.path.exists(video_file_src):
                if transcode2webm:
                    # Two-pass VP8 encode; pass log must be removed first.
                    ffmpeg_pass_log = "/tmp/logfile_vp8.fpf"
                    if os.path.isfile(ffmpeg_pass_log):
                        os.unlink(ffmpeg_pass_log)
                    video_file_name = node['id'] + '.webm'
                    video_file_dest = os.path.join(node_dir, video_file_name)
                    if os.path.isfile(video_file_dest):
                        logger.info("Already encoded: {}".format(video_file_dest))
                    else:
                        ffmpeg_base_args = [
                            ffmpeg,
                            "-i", video_file_src,
                            "-codec:v", "libvpx",
                            "-quality", "best",
                            "-cpu-used", "0",
                            "-b:v", "300k",
                            "-qmin", "10",  # 10=lowest value
                            "-qmax", "35",  # 42=highest value
                            "-maxrate", "300k",
                            "-bufsize", "600k",
                            "-threads", "8",
                            # "-vf", "scale=-1",
                            "-codec:a", "libvorbis",
                            # "-b:a", "128k",
                            "-aq", "5",
                            "-f", "webm",
                        ]
                        ffmpeg_pass1 = ffmpeg_base_args + [
                            "-an",  # Disables audio, no effect first pass
                            "-pass", "1",
                            "-passlogfile", ffmpeg_pass_log,
                            video_file_dest,
                        ]
                        ffmpeg_pass2 = ffmpeg_base_args + [
                            "-pass", "2",
                            "-y",
                            "-passlogfile", ffmpeg_pass_log,
                            video_file_dest,
                        ]
                        for cmd in (ffmpeg_pass1, ffmpeg_pass2):
                            process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
                            stdout_data, _stderr_data = process.communicate()
                            if process.returncode != 0:
                                logger.error("Error invoking ffmpeg: {}".format((_stderr_data or "") + (stdout_data or "")))
                                logger.error("Command was: {}".format(" ".join(cmd)))
                                raise CommandError("Could not complete transcoding")
                    node['content']['format'] = "webm"
                else:
                    # If not transcoding, just link the original file
                    os.link(video_file_src, video_file_dest)
                node["video_url"] = os.path.join(node["path"], video_file_name)
                copy_media.videos_found += 1
                logger.info("Videos processed: {}".format(copy_media.videos_found))
                node["content"]["available"] = True

                # Create thumbnail if it wasn't downloaded
                if not os.path.exists(thumb_file_src):
                    fp = create_thumbnail(video_file_src, output_format="png")
                    if fp is None:
                        logger.error("Failed to create thumbnail for {}".format(video_file_src))
                    else:
                        logger.info("Successfully created thumbnail for {}".format(video_file_src))
                        # BUGFIX: was the Python-2-only ``file()`` builtin and
                        # leaked the handle; use open() in a context manager.
                        with open(thumb_file_src, 'wb') as thumb_f:
                            thumb_f.write(fp.read())

                # Handle thumbnail
                if os.path.exists(thumb_file_src):
                    node["thumbnail_url"] = os.path.join(node["path"], node['id'] + '.png')
                    if not os.path.exists(thumb_file_dest):
                        os.link(thumb_file_src, thumb_file_dest)
                else:
                    node["thumbnail_url"] = None

                subtitle_srt = os.path.join(subtitle_src_dir, node['id'] + '.srt')
                if os.path.isfile(subtitle_srt):
                    subtitle_vtt = os.path.join(node_dir, node['id'] + '.vtt')
                    # Convert to .vtt because this format is understood
                    # by latest video.js and the old ones that read
                    # .srt don't work with newer jquery etc.
                    submarine_parser(subtitle_srt, subtitle_vtt)
                    if not os.path.exists(subtitle_vtt):
                        logger.warning("Subtitle not converted: {}".format(subtitle_srt))
                    else:
                        logger.info("Subtitle convert from SRT to VTT: {}".format(subtitle_vtt))
                        node["subtitle_url"] = os.path.join(node["path"], node['id'] + '.vtt')
            else:
                if options['download']:
                    logger.error("File not found or downloaded: {}".format(video_file_src))
        else:
            logger.error("Invalid node, kind: {}".format(node.get("kind", None)))
            # Exercises cannot be displayed
            node["content"] = {"available": False}

        # Recurse, pruning empty topics and videos we couldn't make available.
        new_children = []
        for child in node.get('children', []):
            copy_media(child)
            empty_topic = child["kind"] == "Topic" and not child.get("children", [])
            unavailable_video = child["kind"] == "Video" and not child.get("content", {}).get("available", False)
            if not (empty_topic or unavailable_video):
                new_children.append(child)
        node['children'] = new_children
    copy_media.videos_found = 0

    def render_topic_pages(node):
        """Render one HTML page per topic node, depth-first."""
        parents = [node] if node.get("children") else []
        parent = node["parent"]
        while parent:
            parents.append(parent)
            parent = parent["parent"]

        # Finally, render templates into the destination
        template_context = {
            "topic_tree": topic_tree,
            "topic": node,
            "parents": parents
        }
        with i18n.translate_block(language):
            topic_html = render_to_string("kalite_zim/topic.html", template_context)
        # Replace absolute references to '/static' with relative
        topic_html = topic_html.replace("/static", "static")

        dest_html = os.path.join(tmp_dir, node["id"] + ".html")
        logger.info("Rendering {}".format(dest_html))
        open(dest_html, "w").write(topic_html)
        render_topic_pages.pages_rendered += 1
        for child in node.get('children', []):
            render_topic_pages(child)
    render_topic_pages.pages_rendered = 0

    logger.info("Hard linking video files from KA Lite...")
    copy_media(topic_tree)

    sys.stderr.write("\n")
    logger.info("Done!")

    # Configure django-compressor
    compressor_init(os.path.join(base_path, 'static'))

    # Finally, render templates into the destination
    template_context = {
        "topic_tree": topic_tree,
        "welcome": True,
    }
    with i18n.translate_block(language):
        welcome_html = render_to_string("kalite_zim/welcome.html", template_context)
        about_html = render_to_string("kalite_zim/about.html", template_context)
    # Replace absolute references to '/static' with relative
    welcome_html = welcome_html.replace("/static", "static")
    about_html = about_html.replace("/static", "static")

    # Write the welcome.html file
    open(os.path.join(tmp_dir, 'welcome.html'), 'w').write(welcome_html)
    open(os.path.join(tmp_dir, 'about.html'), 'w').write(about_html)

    # Render all topic html files
    render_topic_pages(topic_tree)

    # Copy in static data after it's been handled by django compressor
    # (this happens during template rendering)
    shutil.copytree(os.path.join(base_path, 'static'), os.path.join(tmp_dir, 'static'))

    ending = datetime.now()
    duration = int((ending - beginning).total_seconds())

    logger.info("Total number of videos found: {}".format(copy_media.videos_found))
    logger.info("Total number of topic pages created: {}".format(render_topic_pages.pages_rendered))
    logger.info("Invoking zimwriterfs, writing to: {}".format(dest_file))

    zimwriterfs_args = (
        zimwriterfs,
        "--welcome", "welcome.html",
        "--favicon", "static/img/ka_leaf.png",
        "--publisher", publisher,
        "--creator", "KhanAcademy.org",
        # BUGFIX: "--description" was passed twice; zimwriterfs expects a
        # mandatory --title, which the first value clearly was meant to be.
        "--title", "Khan Academy ({})".format(language),
        "--description", "Videos from Khan Academy",
        "--language", language,
        tmp_dir,
        dest_file,
    )

    process = subprocess.Popen(zimwriterfs_args, stdout=subprocess.PIPE)
    stdout_data, _stderr_data = process.communicate()
    if process.returncode != 0:
        # BUGFIX: ``.format`` was chained onto logger.error(...)'s None
        # return value, and _stderr_data is None (stderr is not piped).
        logger.error("Error invoking zimwriterfs: {}".format((_stderr_data or "") + (stdout_data or "")))

    logger.info("Duration: {h:} hours, {m:} minutes, {s:} seconds".format(
        h=duration // 3600,
        m=(duration % 3600) // 60,
        s=duration % 60,
    ))
def get_content_cache(force=False, annotate=False, language=settings.LANGUAGE_CODE):
    """Build (and memoize) the per-language content cache.

    When ``annotate`` is requested (or the cache is loaded fresh), each
    content item is annotated with availability, stream/thumbnail URLs,
    subtitle URLs, and translated title/description.  With
    ``settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP`` set, the annotated
    result is persisted to a ``.cache`` JSON file and reused unless
    ``force`` is True.

    :param force: ignore an existing on-disk ``.cache`` file and rebuild.
    :param annotate: annotate items with URLs/availability/translations.
    :param language: language code to annotate/translate for.
    :returns: dict mapping content id -> annotated content dict.
    """
    global CONTENT, CONTENT_FILEPATH

    if CONTENT is None:
        CONTENT = {}
    if CONTENT.get(language) is None:
        CONTENT[language] = softload_json(CONTENT_FILEPATH, logger=logging.debug, raises=False)
        # A freshly loaded cache must always be annotated.
        annotate = True

    if annotate:
        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP and not force:
            content = softload_json(CONTENT_FILEPATH + "_" + language + ".cache", logger=logging.debug, raises=False)
            if content:
                CONTENT[language] = content
                return CONTENT[language]

        # Loop through all content items and put thumbnail urls, content urls,
        # and subtitle urls on the content dictionary, and list all languages
        # that the content is available in.
        for content in CONTENT[language].values():
            default_thumbnail = create_thumbnail_url(content.get("id"))
            dubmap = i18n.get_id2oklang_map(content.get("id"))
            # BUGFIX: content_lang was only assigned inside ``if dubmap:``
            # but read unconditionally below -- a NameError on the first
            # item without a dubmap, or a stale value leaked from the
            # previous iteration.  Initialize it per item.
            content_lang = ""
            if dubmap:
                content_lang = i18n.select_best_available_language(language, available_codes=dubmap.keys()) or ""
                if content_lang:
                    dubbed_id = dubmap.get(content_lang)
                    # ``file_format`` renamed from ``format`` to avoid
                    # shadowing the builtin.
                    file_format = content.get("format", "")
                    if is_content_on_disk(dubbed_id, file_format):
                        content["available"] = True
                        thumbnail = create_thumbnail_url(dubbed_id) or default_thumbnail
                        content["content_urls"] = {
                            "stream": settings.CONTENT_URL + dubmap.get(content_lang) + "." + file_format,
                            "stream_type": "{kind}/{format}".format(kind=content.get("kind", "").lower(), format=file_format),
                            "thumbnail": thumbnail,
                        }
                    elif settings.BACKUP_VIDEO_SOURCE:
                        content["available"] = True
                        content["content_urls"] = {
                            "stream": settings.BACKUP_VIDEO_SOURCE.format(youtube_id=dubbed_id, video_format=file_format),
                            "stream_type": "{kind}/{format}".format(kind=content.get("kind", "").lower(), format=file_format),
                            "thumbnail": settings.BACKUP_VIDEO_SOURCE.format(youtube_id=dubbed_id, video_format="png"),
                        }
                    else:
                        content["available"] = False
                else:
                    content["available"] = False
            else:
                content["available"] = False

            # Get list of subtitle language codes currently available
            subtitle_lang_codes = [] if not os.path.exists(i18n.get_srt_path()) else [lc for lc in os.listdir(i18n.get_srt_path()) if os.path.exists(i18n.get_srt_path(lc, content.get("id")))]

            # Generate subtitle URLs for any subtitles that do exist for this content item
            subtitle_urls = [{
                "code": lc,
                "url": settings.STATIC_URL + "srt/{code}/subtitles/{id}.srt".format(code=lc, id=content.get("id")),
                "name": i18n.get_language_name(lc)
            } for lc in subtitle_lang_codes if os.path.exists(i18n.get_srt_path(lc, content.get("id")))]

            # Sort all subtitle URLs by language code
            content["subtitle_urls"] = sorted(subtitle_urls, key=lambda x: x.get("code", ""))

            with i18n.translate_block(content_lang):
                content["selected_language"] = content_lang
                content["title"] = _(content["title"])
                content["description"] = _(content.get("description", "")) if content.get("description") else ""

        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP:
            # Best-effort persistence: a failed write only logs a warning.
            try:
                with open(CONTENT_FILEPATH + "_" + language + ".cache", "w") as f:
                    json.dump(CONTENT[language], f)
            except IOError as e:
                logging.warning("Annotated content cache file failed in saving with error {e}".format(e=e))

    return CONTENT[language]