def move_srts(lang_code):
    """
    Srts live in the locale directory, but that's not exposed at any URL.
    So instead, we have to move the srts out to /static/subtitles/[lang_code]/

    Moves every *.srt file for `lang_code` from the locale tree into the
    static subtitles directory, replacing any srt already there, then
    removes the (now empty) source directory.

    :param lang_code: language code in any supported format (converted to
        the django directory form internally).
    """
    # NOTE: removed two dead locals from the original version:
    # `lang_code_ietf = lcode_to_ietf(lang_code)` and
    # `subtitles_static_dir = os.path.join(settings.STATIC_ROOT, "subtitles")`
    # — neither was ever read.
    lang_code_django = lcode_to_django_dir(lang_code)
    src_dir = os.path.join(LOCALE_ROOT, lang_code_django, "subtitles")
    dest_dir = get_srt_path(lang_code_django)
    ensure_dir(dest_dir)

    lang_subtitles = glob.glob(os.path.join(src_dir, "*.srt"))
    logging.info("Moving %d subtitles from %s to %s" % (len(lang_subtitles), src_dir, dest_dir))

    for fil in lang_subtitles:
        srt_dest_path = os.path.join(dest_dir, os.path.basename(fil))
        if os.path.exists(srt_dest_path):
            os.remove(srt_dest_path)  # we're going to replace any srt with a newer version
        shutil.move(fil, srt_dest_path)

    if not os.path.exists(src_dir):
        logging.info("No subtitles for language pack %s" % lang_code)
    elif os.listdir(src_dir):
        # Something unexpected is still in there (non-srt files?); leave it
        # for a human to inspect rather than deleting blindly.
        # (logging.warning replaces the deprecated logging.warn alias.)
        logging.warning("%s is not empty; will not remove. Please check that all subtitles were moved." % src_dir)
    else:
        logging.info("Removing empty source directory (%s)." % src_dir)
        shutil.rmtree(src_dir)
def move_srts(lang_code):
    """
    Srts live in the locale directory, but that's not exposed at any URL.
    So instead, we have to move the srts out to /static/subtitles/[lang_code]/

    Moves every *.srt file for `lang_code` out of the user-writable locale
    directory into the static subtitles directory, replacing any srt already
    there, then removes the (now empty) source directory.

    :param lang_code: language code in any supported format (converted to
        the django directory form internally).
    """
    # NOTE: removed two dead locals from the original version:
    # `lang_code_ietf = lcode_to_ietf(lang_code)` and
    # `subtitles_static_dir = os.path.join(settings.USER_STATIC_FILES, "subtitles")`
    # — neither was ever read.
    lang_code_django = lcode_to_django_dir(lang_code)
    src_dir = os.path.join(settings.USER_WRITABLE_LOCALE_DIR, lang_code_django, "subtitles")
    dest_dir = get_srt_path(lang_code_django)
    ensure_dir(dest_dir)

    lang_subtitles = glob.glob(os.path.join(src_dir, "*.srt"))
    logging.info("Moving %d subtitles from %s to %s" % (len(lang_subtitles), src_dir, dest_dir))

    for fil in lang_subtitles:
        srt_dest_path = os.path.join(dest_dir, os.path.basename(fil))
        if os.path.exists(srt_dest_path):
            os.remove(srt_dest_path)  # we're going to replace any srt with a newer version
        shutil.move(fil, srt_dest_path)

    if not os.path.exists(src_dir):
        logging.info("No subtitles for language pack %s" % lang_code)
    elif os.listdir(src_dir):
        # Something unexpected is still in there (non-srt files?); leave it
        # for a human to inspect rather than deleting blindly.
        # (logging.warning replaces the deprecated logging.warn alias.)
        logging.warning("%s is not empty; will not remove. Please check that all subtitles were moved." % src_dir)
    else:
        logging.info("Removing empty source directory (%s)." % src_dir)
        shutil.rmtree(src_dir)
def delete_language(lang_code):
    """
    Delete every on-disk resource belonging to a language pack.

    Removes the localized exercise directory, the subtitles (srt) directory,
    and the locale directory for `lang_code`, then invalidates the web cache
    so stale pages referencing the deleted pack are not served.

    Missing paths are ignored (deleting a partially-installed pack is fine);
    any other OSError is re-raised.
    """
    import errno  # local import: ENOENT instead of the magic number 2

    langpack_resource_paths = [
        get_localized_exercise_dirpath(lang_code),
        get_srt_path(lang_code),
        get_locale_path(lang_code),
    ]

    for langpack_resource_path in langpack_resource_paths:
        try:
            shutil.rmtree(langpack_resource_path)
            logging.info("Deleted language pack resource path: %s" % langpack_resource_path)
        except OSError as e:
            if e.errno != errno.ENOENT:  # Only ignore error: No Such File or Directory
                raise
            logging.debug("Not deleting missing language pack resource path: %s" % langpack_resource_path)

    invalidate_web_cache()
def stamp_availability_on_video(video, format="mp4", force=False, stamp_urls=True, videos_path=None):
    """ Stamp all relevant urls and availability onto a video object (if necessary), including:
    * whether the video is available (on disk or online)

    Mutates `video` in place and also returns it.

    video       -- dict with at least an "id" key; gains/updates the keys
                   "availability", "on_disk" and "available".
    format      -- video container extension used to locate files ("mp4").
    force       -- recompute availability even if already stamped.
    stamp_urls  -- also compute stream/thumbnail/subtitle URLs per language.
    videos_path -- directory holding the video files; defaults to
                   settings.CONTENT_ROOT.
    """
    videos_path = videos_path or settings.CONTENT_ROOT

    # --- helpers, closed over format/videos_path defaults ---

    def compute_video_availability(youtube_id, format, videos_path=videos_path):
        # Availability here means: the file exists on local disk.
        return {"on_disk": is_video_on_disk(youtube_id, format, videos_path=videos_path)}

    def compute_video_metadata(youtube_id, format):
        return {"stream_type": "video/%s" % format}

    def compute_video_urls(youtube_id, format, lang_code, on_disk=None, thumb_formats=["png", "jpg"], videos_path=videos_path):
        # NOTE: mutable default `thumb_formats` is safe here — it is only
        # iterated and indexed, never mutated.
        if on_disk is None:
            on_disk = is_video_on_disk(youtube_id, format, videos_path=videos_path)

        if on_disk:
            # Serve from local content directory.
            video_base_url = settings.CONTENT_URL + youtube_id
            stream_url = video_base_url + ".%s" % format
            thumbnail_url = None  # default to None now, so we know when no thumbnail is available.
            for thumb_format in thumb_formats:  # find the thumbnail on disk
                thumb_filename = '%s.%s' % (youtube_id, thumb_format)
                thumb_filepath = os.path.join(videos_path, thumb_filename)
                if os.path.exists(thumb_filepath):
                    thumbnail_url = video_base_url + "." + thumb_format  # default
                    break
        elif settings.BACKUP_VIDEO_SOURCE and lang_code == "en":
            # Not on disk, but an online fallback source is configured
            # (English only).  The settings values are %-format templates.
            dict_vals = {"youtube_id": youtube_id, "video_format": format, "thumb_format": thumb_formats[0]}
            stream_url = settings.BACKUP_VIDEO_SOURCE % dict_vals
            thumbnail_url = settings.BACKUP_THUMBNAIL_SOURCE % dict_vals if settings.BACKUP_THUMBNAIL_SOURCE else None
        else:
            return {}  # no URLs
        return {"stream": stream_url, "thumbnail": thumbnail_url}

    # Start from the existing stamp unless a recompute was forced.
    video_availability = video.get("availability", {}) if not force else {}
    en_youtube_id = get_youtube_id(video["id"], lang_code=None)  # get base ID
    video_map = get_id2oklang_map(video["id"]) or {}

    # "on_disk" absent means availability was never computed for this video.
    if not "on_disk" in video_availability:
        for lang_code in video_map.keys():
            # encode: youtube ids come back as unicode (Python 2 codebase).
            youtube_id = video_map[lang_code].encode('utf-8')
            video_availability[lang_code] = compute_video_availability(youtube_id, format=format, videos_path=videos_path)
        video_availability["en"] = video_availability.get("en", {"on_disk": False})  # en should always be defined

    # Summarize status
    any_on_disk = any([lang_avail["on_disk"] for lang_avail in video_availability.values()])
    any_available = any_on_disk or bool(settings.BACKUP_VIDEO_SOURCE)

    if stamp_urls:
        # Loop over all known dubbed videos
        for lang_code, youtube_id in video_map.iteritems():
            urls = compute_video_urls(youtube_id, format, lang_code, on_disk=video_availability[lang_code]["on_disk"], videos_path=videos_path)
            if urls:
                # Only add properties if anything is available.
                video_availability[lang_code].update(urls)
                video_availability[lang_code].update(compute_video_metadata(youtube_id, format))

        # Get the (english) subtitle urls
        subtitle_lang_codes = get_langs_with_subtitle(en_youtube_id)
        subtitles_tuple = [(lc, get_srt_url(en_youtube_id, lc)) for lc in subtitle_lang_codes if os.path.exists(get_srt_path(lc, en_youtube_id))]
        subtitles_urls = dict(subtitles_tuple)
        video_availability["en"]["subtitles"] = subtitles_urls

    # now scrub any values that don't actually exist
    # NOTE(review): deleting while iterating .keys() is safe on Python 2
    # (keys() is a list there); would need list(...) under Python 3.
    for lang_code in video_availability.keys():
        # len == 1 means only the "on_disk" flag is present, i.e. no URLs
        # were stamped — drop the entry entirely.
        if not video_availability[lang_code]["on_disk"] and len(video_availability[lang_code]) == 1:
            del video_availability[lang_code]

    # Now summarize some availability onto the video itself
    video["availability"] = video_availability
    video["on_disk"] = any_on_disk
    video["available"] = any_available

    return video
def handle(self, *args, **options):
    """
    Export the KA Lite topic tree for one language as a ZIM file.

    Takes exactly one positional argument: the destination .zim path.
    Builds the site (topic pages, videos, thumbnails, subtitles, static
    assets) in a temp directory, then invokes the external `zimwriterfs`
    tool on it.

    Raises CommandError on bad arguments, missing external tools, or a
    failed transcode.
    """
    if len(args) != 1:
        raise CommandError("Takes exactly 1 argument")

    dest_file = os.path.abspath(args[0])

    logger.info("Starting up KA Lite export2zim command")
    beginning = datetime.now()
    logger.info("Begin: {}".format(beginning))

    language = options.get('language')
    if not language:
        raise CommandError("Must specify a language!")

    if not options.get('tmp_dir'):
        tmp_dir = os.path.join(tempfile.gettempdir(), 'ka-lite-zim_{}'.format(language))
    else:
        tmp_dir = options.get('tmp_dir')

    tmp_dir = os.path.abspath(tmp_dir)

    # A non-empty tmp dir is only OK if the caller asked to clear or resume.
    if os.path.exists(tmp_dir) and os.listdir(tmp_dir):
        if options['clear']:
            logger.info("Clearing directory {}".format(tmp_dir))
            shutil.rmtree(tmp_dir)
        elif options['resume']:
            logger.info("Resuming in dirty tmp directory {}".format(tmp_dir))
        else:
            raise CommandError(
                "{} not empty, use the -c option to clean it, -r to resume, or use an empty destination directory.".format(
                    tmp_dir
                )
            )

    zimwriterfs = options.get("zimwriterfs", None)
    publisher = options.get("publisher")
    transcode2webm = options.get("transcode2webm")
    ffmpeg = find_executable("ffmpeg")

    if not ffmpeg:
        logger.warning("FFMpeg not found in your path, you won't be able to create missing thumbnails or transcode to webm.")

    if not zimwriterfs:
        zimwriterfs = find_executable("zimwriterfs")
        if not zimwriterfs:
            raise CommandError("Could not find zimwriterfs in your path, try specifying --zimwriterfs=/path")
    if not os.path.exists(zimwriterfs):
        raise CommandError("Invalid --zimwriterfs")

    from kalite_zim import __name__ as base_path
    base_path = os.path.abspath(base_path)
    data_path = os.path.join(base_path, 'data')

    # Where subtitles are found in KA Lite
    subtitle_src_dir = i18n.get_srt_path(language)

    logger.info("Will export videos for language: {}".format(language))
    logger.info("Preparing KA Lite topic tree...")

    # Use live data
    if not options.get('test'):
        # This way of doing things will be deprecated in KA Lite 0.16
        topic_tree_json_path = topic_tools_settings.TOPICS_FILEPATHS.get('khan')
        content_cache = get_content_cache(language=language, annotate=True)
        exercise_cache = get_exercise_cache(language=language)
    # Use test data
    else:
        topic_tree_json_path = os.path.join(data_path, 'test_topics.json')
        content_cache = json.load(
            open(os.path.join(data_path, 'test_content.json'))
        )
        exercise_cache = json.load(
            open(os.path.join(data_path, 'test_exercise.json'))
        )

    topic_tree = softload_json(topic_tree_json_path, logger=logger.debug, raises=False)

    content_json_output = {}
    exercise_json_output = {}

    def annotate_tree(topic, depth=0, parent=None):
        """
        We need to recurse into the tree in order to annotate elements
        with topic data and exercise data
        """
        children = topic.get('children', [])
        new_children = []
        for child_topic in children:
            # Only Video and Topic nodes are kept in the exported tree.
            if child_topic.get("kind") in ("Video", "Topic"):
                annotate_tree(child_topic, depth=depth + 1, parent=topic)
                new_children.append(child_topic)
        topic["children"] = new_children
        if topic.get("kind") == "Exercise":
            topic['exercise'] = exercise_cache.get(topic.get("id"), {})
            exercise_json_output[topic.get("id")] = topic['exercise']
        elif topic.get("kind") == "Topic":
            pass
        else:
            topic['exercise'] = None
            topic['content'] = content_cache.get(topic.get("id"), {})
            content_json_output[topic.get("id")] = topic['content']
            if not topic['content']:
                logger.error('No content!?, id is: {}'.format(topic.get('id')))

        # Translate everything for good measure
        with i18n.translate_block(language):
            topic["title"] = _(topic.get("title", ""))
            topic["description"] = _(topic.get("description", "")) if topic.get("description") else ""

        topic["url"] = topic["id"] + ".html"
        topic["parent"] = parent
        topic["depth"] = depth
        for key in ("child_data", "keywords", "hide", "contains"):
            topic.pop(key, None)

    # 1. Annotate a topic tree
    annotate_tree(topic_tree)

    # 2. Now go through the tree and copy each element into the destination
    # zim file system
    def copy_media(node):
        if node['kind'] == 'Topic':
            # Don't do anything if it's a topic
            pass
        elif node['kind'] == 'Exercise':
            # Exercises cannot be displayed
            node["content"]["available"] = False
        elif node['kind'] == 'Video':
            if node['content']['format'] == "webm":
                logger.warning("Found a duplicate ID for {}, re-downloading".format(node['id']))
                node['content']['format'] = "mp4"
            # Available is False by default until we locate the file
            node["content"]["available"] = False
            node_dir = os.path.join(tmp_dir, node["path"])
            if not os.path.exists(node_dir):
                os.makedirs(node_dir)
            video_file_name = node['id'] + '.' + node['content']['format']
            thumb_file_name = node['id'] + '.png'
            video_file_src = os.path.join(CONTENT_ROOT, video_file_name)
            video_file_dest = os.path.join(node_dir, video_file_name)
            thumb_file_src = os.path.join(CONTENT_ROOT, thumb_file_name)
            thumb_file_dest = os.path.join(node_dir, thumb_file_name)

            if options['download'] and not os.path.exists(video_file_src):
                logger.info("Video file being downloaded to: {}".format(video_file_src))
                download_video(
                    node['content']['youtube_id'],
                    node['content']['format'],
                    CONTENT_ROOT,
                )

            if os.path.exists(video_file_src):
                if transcode2webm:
                    # Two-pass VP8 encode; the pass log must be fresh.
                    ffmpeg_pass_log = "/tmp/logfile_vp8.fpf"
                    if os.path.isfile(ffmpeg_pass_log):
                        os.unlink(ffmpeg_pass_log)
                    video_file_name = node['id'] + '.webm'
                    video_file_dest = os.path.join(node_dir, video_file_name)
                    if os.path.isfile(video_file_dest):
                        logger.info("Already encoded: {}".format(video_file_dest))
                    else:
                        ffmpeg_base_args = [
                            ffmpeg,
                            "-i", video_file_src,
                            "-codec:v", "libvpx",
                            "-quality", "best",
                            "-cpu-used", "0",
                            "-b:v", "300k",
                            "-qmin", "10",  # 10=lowest value
                            "-qmax", "35",  # 42=highest value
                            "-maxrate", "300k",
                            "-bufsize", "600k",
                            "-threads", "8",
                            # "-vf", "scale=-1",
                            "-codec:a", "libvorbis",
                            # "-b:a", "128k",
                            "-aq", "5",
                            "-f", "webm",
                        ]
                        ffmpeg_pass1 = ffmpeg_base_args + [
                            "-an",  # Disables audio, no effect first pass
                            "-pass", "1",
                            "-passlogfile", ffmpeg_pass_log,
                            video_file_dest,
                        ]
                        ffmpeg_pass2 = ffmpeg_base_args + [
                            "-pass", "2",
                            "-y",
                            "-passlogfile", ffmpeg_pass_log,
                            video_file_dest,
                        ]
                        for cmd in (ffmpeg_pass1, ffmpeg_pass2):
                            process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
                            stdout_data, _stderr_data = process.communicate()
                            if process.returncode != 0:
                                # stderr is not piped, so _stderr_data is None — guard both.
                                logger.error("Error invoking ffmpeg: {}".format((_stderr_data or "") + (stdout_data or "")))
                                logger.error("Command was: {}".format(" ".join(cmd)))
                                raise CommandError("Could not complete transcoding")
                        node['content']['format'] = "webm"
                else:
                    # If not transcoding, just link the original file
                    os.link(video_file_src, video_file_dest)
                node["video_url"] = os.path.join(
                    node["path"],
                    video_file_name
                )
                copy_media.videos_found += 1
                logger.info("Videos processed: {}".format(copy_media.videos_found))
                node["content"]["available"] = True

                # Create thumbnail if it wasn't downloaded
                if not os.path.exists(thumb_file_src):
                    fp = create_thumbnail(video_file_src, output_format="png")
                    if fp is None:
                        logger.error("Failed to create thumbnail for {}".format(video_file_src))
                    else:
                        logger.info("Successfully created thumbnail for {}".format(video_file_src))
                        file(thumb_file_src, 'wb').write(fp.read())

                # Handle thumbnail
                if os.path.exists(thumb_file_src):
                    node["thumbnail_url"] = os.path.join(
                        node["path"],
                        node['id'] + '.png'
                    )
                    if not os.path.exists(thumb_file_dest):
                        os.link(thumb_file_src, thumb_file_dest)
                else:
                    node["thumbnail_url"] = None

                subtitle_srt = os.path.join(
                    subtitle_src_dir,
                    node['id'] + '.srt'
                )

                if os.path.isfile(subtitle_srt):
                    subtitle_vtt = os.path.join(
                        node_dir,
                        node['id'] + '.vtt'
                    )
                    # Convert to .vtt because this format is understood
                    # by latest video.js and the old ones that read
                    # .srt don't work with newer jquery etc.
                    submarine_parser(subtitle_srt, subtitle_vtt)
                    if not os.path.exists(subtitle_vtt):
                        logger.warning("Subtitle not converted: {}".format(subtitle_srt))
                    else:
                        logger.info("Subtitle convert from SRT to VTT: {}".format(subtitle_vtt))
                        node["subtitle_url"] = os.path.join(
                            node["path"],
                            node['id'] + '.vtt'
                        )
            else:
                if options['download']:
                    logger.error("File not found or downloaded: {}".format(video_file_src))
        else:
            logger.error("Invalid node, kind: {}".format(node.get("kind", None)))
            # Exercises cannot be displayed
            node["content"] = {"available": False}

        # Prune empty topics and unavailable videos from the exported tree.
        new_children = []
        for child in node.get('children', []):
            copy_media(child)
            empty_topic = child["kind"] == "Topic" and not child.get("children", [])
            unavailable_video = child["kind"] == "Video" and not child.get("content", {}).get("available", False)
            if not (empty_topic or unavailable_video):
                new_children.append(child)
        node['children'] = new_children
    copy_media.videos_found = 0

    def render_topic_pages(node):
        # Collect the chain of ancestors for breadcrumb-style context.
        parents = [node] if node.get("children") else []
        parent = node["parent"]
        while parent:
            parents.append(parent)
            parent = parent["parent"]

        # Finally, render templates into the destination
        template_context = {
            "topic_tree": topic_tree,
            "topic": node,
            "parents": parents
        }
        with i18n.translate_block(language):
            topic_html = render_to_string("kalite_zim/topic.html", template_context)
        # Replace absolute references to '/static' with relative
        topic_html = topic_html.replace("/static", "static")

        dest_html = os.path.join(tmp_dir, node["id"] + ".html")
        logger.info("Rendering {}".format(dest_html))

        open(dest_html, "w").write(topic_html)

        render_topic_pages.pages_rendered += 1

        for child in node.get('children', []):
            render_topic_pages(child)
    render_topic_pages.pages_rendered = 0

    logger.info("Hard linking video files from KA Lite...")
    copy_media(topic_tree)

    sys.stderr.write("\n")
    logger.info("Done!")

    # Configure django-compressor
    compressor_init(os.path.join(base_path, 'static'))

    # Finally, render templates into the destination
    template_context = {
        "topic_tree": topic_tree,
        "welcome": True,
    }

    with i18n.translate_block(language):
        welcome_html = render_to_string("kalite_zim/welcome.html", template_context)
        about_html = render_to_string("kalite_zim/about.html", template_context)
    # Replace absolute references to '/static' with relative
    welcome_html = welcome_html.replace("/static", "static")
    about_html = about_html.replace("/static", "static")

    # Write the welcome.html file
    open(os.path.join(tmp_dir, 'welcome.html'), 'w').write(welcome_html)
    open(os.path.join(tmp_dir, 'about.html'), 'w').write(about_html)

    # Render all topic html files
    render_topic_pages(topic_tree)

    # Copy in static data after it's been handled by django compressor
    # (this happens during template rendering)
    shutil.copytree(os.path.join(base_path, 'static'), os.path.join(tmp_dir, 'static'))

    ending = datetime.now()
    duration = int((ending - beginning).total_seconds())

    logger.info("Total number of videos found: {}".format(copy_media.videos_found))
    logger.info("Total number of topic pages created: {}".format(render_topic_pages.pages_rendered))
    logger.info("Invoking zimwriterfs, writing to: {}".format(dest_file))

    zimwriterfs_args = (
        zimwriterfs,
        "--welcome", "welcome.html",
        "--favicon", "static/img/ka_leaf.png",
        "--publisher", publisher,
        "--creator", "KhanAcademy.org",
        "--description", "Khan Academy ({})".format(language),
        "--description", "Videos from Khan Academy",
        "--language", language,
        tmp_dir,
        dest_file,
    )

    process = subprocess.Popen(zimwriterfs_args, stdout=subprocess.PIPE)
    stdout_data, _stderr_data = process.communicate()
    if process.returncode != 0:
        # BUGFIX: the original called `.format(...)` on logger.error's return
        # value (None) — an AttributeError — and added None + stdout since
        # stderr is not piped.  Match the ffmpeg error handling above.
        logger.error("Error invoking zimwriterfs: {}".format((_stderr_data or "") + (stdout_data or "")))

    logger.info(
        "Duration: {h:} hours, {m:} minutes, {s:} seconds".format(
            h=duration // 3600,
            m=(duration % 3600) // 60,
            s=duration % 60,
        )
    )
def get_content_cache(force=False, annotate=False, language=None):
    """
    Return the (lazily-loaded, per-language) content cache.

    force    -- rebuild the sqlite-backed cache instead of reusing it.
    annotate -- stamp availability, content URLs and subtitle URLs onto
                every item (forced when the cache is loaded fresh).
    language -- language code; defaults to django's LANGUAGE_CODE.
    """
    if not language:
        language = django_settings.LANGUAGE_CODE

    global CONTENT
    if CONTENT is None:
        CONTENT = {}

    if CONTENT.get(language) is None:
        content = None
        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP and not force:
            content = softload_sqlite_cache(settings.CONTENT_CACHE_FILEPATH)
        if content:
            # Pre-annotated cache on disk: use as-is.
            CONTENT[language] = content
            return CONTENT[language]
        else:
            if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP:
                call_command("create_content_db")
                content = softload_sqlite_cache(settings.CONTENT_CACHE_FILEPATH)
            else:
                content = softload_json(settings.CONTENT_FILEPATH, logger=logging.debug, raises=False)
            CONTENT[language] = content
            annotate = True  # freshly loaded data must be annotated

    if annotate:
        # Loop through all content items and put thumbnail urls, content urls,
        # and subtitle urls on the content dictionary, and list all languages
        # that the content is available in.
        try:
            contents_folder = os.listdir(django_settings.CONTENT_ROOT)
        except OSError:
            contents_folder = []

        # Map "<id>.srt" -> list of language codes that have that subtitle.
        subtitle_langs = {}
        if os.path.exists(i18n.get_srt_path()):
            for (dirpath, dirnames, filenames) in os.walk(i18n.get_srt_path()):
                # Only both looking at files that are inside a 'subtitles' directory
                if os.path.basename(dirpath) == "subtitles":
                    lc = os.path.basename(os.path.dirname(dirpath))
                    for filename in filenames:
                        if filename in subtitle_langs:
                            subtitle_langs[filename].append(lc)
                        else:
                            subtitle_langs[filename] = [lc]

        for key, content in CONTENT[language].iteritems():
            # BUGFIX: reset per item.  Previously `content_lang` was unbound
            # on the first iteration (NameError) — or stale from the prior
            # item — whenever `dubmap` was falsy, breaking the
            # "selected_language" stamp below.
            content_lang = ""
            default_thumbnail = create_thumbnail_url(content.get("id"))
            dubmap = i18n.get_id2oklang_map(content.get("id"))
            if dubmap:
                content_lang = i18n.select_best_available_language(language, available_codes=dubmap.keys()) or ""
                if content_lang:
                    dubbed_id = dubmap.get(content_lang)
                    format = content.get("format", "")
                    if (dubbed_id + "." + format) in contents_folder:
                        # File is on local disk.
                        content["available"] = True
                        thumbnail = create_thumbnail_url(dubbed_id) or default_thumbnail
                        content["content_urls"] = {
                            "stream": django_settings.CONTENT_URL + dubmap.get(content_lang) + "." + format,
                            "stream_type": "{kind}/{format}".format(kind=content.get("kind", "").lower(), format=format),
                            "thumbnail": thumbnail,
                        }
                    elif django_settings.BACKUP_VIDEO_SOURCE:
                        # Fall back to the configured online source.
                        content["available"] = True
                        content["content_urls"] = {
                            "stream": django_settings.BACKUP_VIDEO_SOURCE.format(youtube_id=dubbed_id, video_format=format),
                            "stream_type": "{kind}/{format}".format(kind=content.get("kind", "").lower(), format=format),
                            "thumbnail": django_settings.BACKUP_VIDEO_SOURCE.format(youtube_id=dubbed_id, video_format="png"),
                        }
                    else:
                        content["available"] = False
                else:
                    content["available"] = False
            else:
                content["available"] = False

            # Get list of subtitle language codes currently available
            subtitle_lang_codes = subtitle_langs.get("{id}.srt".format(id=content.get("id")), [])

            # Generate subtitle URLs for any subtitles that do exist for this content item
            subtitle_urls = [{
                "code": lc,
                "url": django_settings.STATIC_URL + "srt/{code}/subtitles/{id}.srt".format(code=lc, id=content.get("id")),
                "name": i18n.get_language_name(lc)
            } for lc in subtitle_lang_codes]

            # Sort all subtitle URLs by language code
            content["subtitle_urls"] = sorted(subtitle_urls, key=lambda x: x.get("code", ""))

            with i18n.translate_block(language):
                content["selected_language"] = content_lang
                content["title"] = _(content["title"])
                content["description"] = _(content.get("description")) if content.get("description") else ""

            CONTENT[language][key] = content

        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP:
            try:
                CONTENT[language].commit()
            except IOError as e:
                # Best-effort persistence: keep the in-memory cache usable.
                # (logging.warning replaces the deprecated logging.warn alias.)
                logging.warning("Annotated content cache file failed in saving with error {e}".format(e=e))

    return CONTENT[language]
def get_content_cache(force=False, annotate=False, language=settings.LANGUAGE_CODE):
    """
    Return the (lazily-loaded, per-language) content cache.

    force    -- ignore any annotated on-disk cache file and re-annotate.
    annotate -- stamp availability, content URLs and subtitle URLs onto
                every item (forced when the cache is loaded fresh).
    language -- language code.  NOTE: the default is evaluated once at
                import time (settings.LANGUAGE_CODE then).
    """
    global CONTENT, CONTENT_FILEPATH

    if CONTENT is None:
        CONTENT = {}

    if CONTENT.get(language) is None:
        CONTENT[language] = softload_json(CONTENT_FILEPATH, logger=logging.debug, raises=False)
        annotate = True  # freshly loaded data must be annotated

    if annotate:
        # Reuse a previously annotated cache file if allowed.
        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP and not force:
            content = softload_json(CONTENT_FILEPATH + "_" + language + ".cache", logger=logging.debug, raises=False)
            if content:
                CONTENT[language] = content
                return CONTENT[language]

        # Loop through all content items and put thumbnail urls, content urls,
        # and subtitle urls on the content dictionary, and list all languages
        # that the content is available in.
        for content in CONTENT[language].values():
            # BUGFIX: reset per item.  Previously `content_lang` was unbound
            # on the first iteration (NameError) — or stale from the prior
            # item — whenever `dubmap` was falsy, breaking the
            # translate_block / "selected_language" stamp below.
            content_lang = ""
            default_thumbnail = create_thumbnail_url(content.get("id"))
            dubmap = i18n.get_id2oklang_map(content.get("id"))
            if dubmap:
                content_lang = i18n.select_best_available_language(language, available_codes=dubmap.keys()) or ""
                if content_lang:
                    dubbed_id = dubmap.get(content_lang)
                    format = content.get("format", "")
                    if is_content_on_disk(dubbed_id, format):
                        content["available"] = True
                        thumbnail = create_thumbnail_url(dubbed_id) or default_thumbnail
                        content["content_urls"] = {
                            "stream": settings.CONTENT_URL + dubmap.get(content_lang) + "." + format,
                            "stream_type": "{kind}/{format}".format(kind=content.get("kind", "").lower(), format=format),
                            "thumbnail": thumbnail,
                        }
                    else:
                        content["available"] = False
                else:
                    content["available"] = False
            else:
                content["available"] = False

            # Get list of subtitle language codes currently available
            subtitle_lang_codes = [] if not os.path.exists(i18n.get_srt_path()) else [lc for lc in os.listdir(i18n.get_srt_path()) if os.path.exists(i18n.get_srt_path(lc, content.get("id")))]

            # Generate subtitle URLs for any subtitles that do exist for this content item
            subtitle_urls = [{
                "code": lc,
                "url": settings.STATIC_URL + "srt/{code}/subtitles/{id}.srt".format(code=lc, id=content.get("id")),
                "name": i18n.get_language_name(lc)
            } for lc in subtitle_lang_codes if os.path.exists(i18n.get_srt_path(lc, content.get("id")))]

            # Sort all subtitle URLs by language code
            content["subtitle_urls"] = sorted(subtitle_urls, key=lambda x: x.get("code", ""))

            with i18n.translate_block(content_lang):
                content["selected_language"] = content_lang
                content["title"] = _(content["title"])
                content["description"] = _(content.get("description", "")) if content.get("description") else ""

        if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP:
            try:
                with open(CONTENT_FILEPATH + "_" + language + ".cache", "w") as f:
                    json.dump(CONTENT[language], f)
            except IOError as e:
                # Best-effort persistence: keep the in-memory cache usable.
                # (logging.warning replaces the deprecated logging.warn alias.)
                logging.warning("Annotated content cache file failed in saving with error {e}".format(e=e))

    return CONTENT[language]
def handle(self, *args, **options):
    """
    Export the KA Lite topic tree for one language as a ZIM file.

    Takes exactly one positional argument: the destination .zim path.
    Builds the site (topic pages, videos, thumbnails, subtitles, static
    assets) in a temp directory, then invokes the external `zimwriterfs`
    tool on it.

    Raises CommandError on bad arguments, missing external tools, or a
    failed transcode.
    """
    if len(args) != 1:
        raise CommandError("Takes exactly 1 argument")

    dest_file = os.path.abspath(args[0])

    logger.info("Starting up KA Lite export2zim command")
    beginning = datetime.now()
    logger.info("Begin: {}".format(beginning))

    language = options.get('language')
    if not language:
        raise CommandError("Must specify a language!")

    if not options.get('tmp_dir'):
        tmp_dir = os.path.join(tempfile.gettempdir(), 'ka-lite-zim_{}'.format(language))
    else:
        tmp_dir = options.get('tmp_dir')

    tmp_dir = os.path.abspath(tmp_dir)

    # A non-empty tmp dir is only OK if the caller asked to clear or resume.
    if os.path.exists(tmp_dir) and os.listdir(tmp_dir):
        if options['clear']:
            logger.info("Clearing directory {}".format(tmp_dir))
            shutil.rmtree(tmp_dir)
        elif options['resume']:
            logger.info("Resuming in dirty tmp directory {}".format(tmp_dir))
        else:
            raise CommandError(
                "{} not empty, use the -c option to clean it, -r to resume, or use an empty destination directory."
                .format(tmp_dir))

    zimwriterfs = options.get("zimwriterfs", None)
    publisher = options.get("publisher")
    transcode2webm = options.get("transcode2webm")
    ffmpeg = find_executable("ffmpeg")

    if not ffmpeg:
        logger.warning(
            "FFMpeg not found in your path, you won't be able to create missing thumbnails or transcode to webm."
        )

    if not zimwriterfs:
        zimwriterfs = find_executable("zimwriterfs")
        if not zimwriterfs:
            raise CommandError(
                "Could not find zimwriterfs in your path, try specifying --zimwriterfs=/path"
            )
    if not os.path.exists(zimwriterfs):
        raise CommandError("Invalid --zimwriterfs")

    from kalite_zim import __name__ as base_path
    base_path = os.path.abspath(base_path)
    data_path = os.path.join(base_path, 'data')

    # Where subtitles are found in KA Lite
    subtitle_src_dir = i18n.get_srt_path(language)

    logger.info("Will export videos for language: {}".format(language))
    logger.info("Preparing KA Lite topic tree...")

    # Use live data
    if not options.get('test'):
        # This way of doing things will be deprecated in KA Lite 0.16
        topic_tree_json_path = topic_tools_settings.TOPICS_FILEPATHS.get('khan')
        content_cache = get_content_cache(language=language, annotate=True)
        exercise_cache = get_exercise_cache(language=language)
    # Use test data
    else:
        topic_tree_json_path = os.path.join(data_path, 'test_topics.json')
        content_cache = json.load(
            open(os.path.join(data_path, 'test_content.json')))
        exercise_cache = json.load(
            open(os.path.join(data_path, 'test_exercise.json')))

    topic_tree = softload_json(topic_tree_json_path, logger=logger.debug, raises=False)

    content_json_output = {}
    exercise_json_output = {}

    def annotate_tree(topic, depth=0, parent=None):
        """
        We need to recurse into the tree in order to annotate elements
        with topic data and exercise data
        """
        children = topic.get('children', [])
        new_children = []
        for child_topic in children:
            # Only Video and Topic nodes are kept in the exported tree.
            if child_topic.get("kind") in ("Video", "Topic"):
                annotate_tree(child_topic, depth=depth + 1, parent=topic)
                new_children.append(child_topic)
        topic["children"] = new_children
        if topic.get("kind") == "Exercise":
            topic['exercise'] = exercise_cache.get(topic.get("id"), {})
            exercise_json_output[topic.get("id")] = topic['exercise']
        elif topic.get("kind") == "Topic":
            pass
        else:
            topic['exercise'] = None
            topic['content'] = content_cache.get(topic.get("id"), {})
            content_json_output[topic.get("id")] = topic['content']
            if not topic['content']:
                logger.error('No content!?, id is: {}'.format(topic.get('id')))

        # Translate everything for good measure
        with i18n.translate_block(language):
            topic["title"] = _(topic.get("title", ""))
            topic["description"] = _(topic.get("description", "")) if topic.get("description") else ""

        topic["url"] = topic["id"] + ".html"
        topic["parent"] = parent
        topic["depth"] = depth
        for key in ("child_data", "keywords", "hide", "contains"):
            topic.pop(key, None)

    # 1. Annotate a topic tree
    annotate_tree(topic_tree)

    # 2. Now go through the tree and copy each element into the destination
    # zim file system
    def copy_media(node):
        if node['kind'] == 'Topic':
            # Don't do anything if it's a topic
            pass
        elif node['kind'] == 'Exercise':
            # Exercises cannot be displayed
            node["content"]["available"] = False
        elif node['kind'] == 'Video':
            if node['content']['format'] == "webm":
                logger.warning("Found a duplicate ID for {}, re-downloading".format(node['id']))
                node['content']['format'] = "mp4"
            # Available is False by default until we locate the file
            node["content"]["available"] = False
            node_dir = os.path.join(tmp_dir, node["path"])
            if not os.path.exists(node_dir):
                os.makedirs(node_dir)
            video_file_name = node['id'] + '.' + node['content']['format']
            thumb_file_name = node['id'] + '.png'
            video_file_src = os.path.join(CONTENT_ROOT, video_file_name)
            video_file_dest = os.path.join(node_dir, video_file_name)
            thumb_file_src = os.path.join(CONTENT_ROOT, thumb_file_name)
            thumb_file_dest = os.path.join(node_dir, thumb_file_name)

            if options['download'] and not os.path.exists(video_file_src):
                logger.info("Video file being downloaded to: {}".format(video_file_src))
                download_video(
                    node['content']['youtube_id'],
                    node['content']['format'],
                    CONTENT_ROOT,
                )

            if os.path.exists(video_file_src):
                if transcode2webm:
                    # Two-pass VP8 encode; the pass log must be fresh.
                    ffmpeg_pass_log = "/tmp/logfile_vp8.fpf"
                    if os.path.isfile(ffmpeg_pass_log):
                        os.unlink(ffmpeg_pass_log)
                    video_file_name = node['id'] + '.webm'
                    video_file_dest = os.path.join(node_dir, video_file_name)
                    if os.path.isfile(video_file_dest):
                        logger.info("Already encoded: {}".format(video_file_dest))
                    else:
                        ffmpeg_base_args = [
                            ffmpeg,
                            "-i", video_file_src,
                            "-codec:v", "libvpx",
                            "-quality", "best",
                            "-cpu-used", "0",
                            "-b:v", "300k",
                            "-qmin", "10",  # 10=lowest value
                            "-qmax", "35",  # 42=highest value
                            "-maxrate", "300k",
                            "-bufsize", "600k",
                            "-threads", "8",
                            # "-vf", "scale=-1",
                            "-codec:a", "libvorbis",
                            # "-b:a", "128k",
                            "-aq", "5",
                            "-f", "webm",
                        ]
                        ffmpeg_pass1 = ffmpeg_base_args + [
                            "-an",  # Disables audio, no effect first pass
                            "-pass", "1",
                            "-passlogfile", ffmpeg_pass_log,
                            video_file_dest,
                        ]
                        ffmpeg_pass2 = ffmpeg_base_args + [
                            "-pass", "2",
                            "-y",
                            "-passlogfile", ffmpeg_pass_log,
                            video_file_dest,
                        ]
                        for cmd in (ffmpeg_pass1, ffmpeg_pass2):
                            process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
                            stdout_data, _stderr_data = process.communicate()
                            if process.returncode != 0:
                                # stderr is not piped, so _stderr_data is None — guard both.
                                logger.error(
                                    "Error invoking ffmpeg: {}".format(
                                        (_stderr_data or "") + (stdout_data or "")))
                                logger.error("Command was: {}".format(" ".join(cmd)))
                                raise CommandError("Could not complete transcoding")
                        node['content']['format'] = "webm"
                else:
                    # If not transcoding, just link the original file
                    os.link(video_file_src, video_file_dest)
                node["video_url"] = os.path.join(node["path"], video_file_name)
                copy_media.videos_found += 1
                logger.info("Videos processed: {}".format(copy_media.videos_found))
                node["content"]["available"] = True

                # Create thumbnail if it wasn't downloaded
                if not os.path.exists(thumb_file_src):
                    fp = create_thumbnail(video_file_src, output_format="png")
                    if fp is None:
                        logger.error("Failed to create thumbnail for {}".format(video_file_src))
                    else:
                        logger.info("Successfully created thumbnail for {}".format(video_file_src))
                        file(thumb_file_src, 'wb').write(fp.read())

                # Handle thumbnail
                if os.path.exists(thumb_file_src):
                    node["thumbnail_url"] = os.path.join(node["path"], node['id'] + '.png')
                    if not os.path.exists(thumb_file_dest):
                        os.link(thumb_file_src, thumb_file_dest)
                else:
                    node["thumbnail_url"] = None

                subtitle_srt = os.path.join(subtitle_src_dir, node['id'] + '.srt')

                if os.path.isfile(subtitle_srt):
                    subtitle_vtt = os.path.join(node_dir, node['id'] + '.vtt')
                    # Convert to .vtt because this format is understood
                    # by latest video.js and the old ones that read
                    # .srt don't work with newer jquery etc.
                    submarine_parser(subtitle_srt, subtitle_vtt)
                    if not os.path.exists(subtitle_vtt):
                        logger.warning("Subtitle not converted: {}".format(subtitle_srt))
                    else:
                        logger.info("Subtitle convert from SRT to VTT: {}".format(subtitle_vtt))
                        node["subtitle_url"] = os.path.join(node["path"], node['id'] + '.vtt')
            else:
                if options['download']:
                    logger.error("File not found or downloaded: {}".format(video_file_src))
        else:
            logger.error("Invalid node, kind: {}".format(node.get("kind", None)))
            # Exercises cannot be displayed
            node["content"] = {"available": False}

        # Prune empty topics and unavailable videos from the exported tree.
        new_children = []
        for child in node.get('children', []):
            copy_media(child)
            empty_topic = child["kind"] == "Topic" and not child.get("children", [])
            unavailable_video = child["kind"] == "Video" and not child.get("content", {}).get("available", False)
            if not (empty_topic or unavailable_video):
                new_children.append(child)
        node['children'] = new_children
    copy_media.videos_found = 0

    def render_topic_pages(node):
        # Collect the chain of ancestors for breadcrumb-style context.
        parents = [node] if node.get("children") else []
        parent = node["parent"]
        while parent:
            parents.append(parent)
            parent = parent["parent"]

        # Finally, render templates into the destination
        template_context = {
            "topic_tree": topic_tree,
            "topic": node,
            "parents": parents
        }
        with i18n.translate_block(language):
            topic_html = render_to_string("kalite_zim/topic.html", template_context)
        # Replace absolute references to '/static' with relative
        topic_html = topic_html.replace("/static", "static")

        dest_html = os.path.join(tmp_dir, node["id"] + ".html")
        logger.info("Rendering {}".format(dest_html))

        open(dest_html, "w").write(topic_html)

        render_topic_pages.pages_rendered += 1

        for child in node.get('children', []):
            render_topic_pages(child)
    render_topic_pages.pages_rendered = 0

    logger.info("Hard linking video files from KA Lite...")
    copy_media(topic_tree)

    sys.stderr.write("\n")
    logger.info("Done!")

    # Configure django-compressor
    compressor_init(os.path.join(base_path, 'static'))

    # Finally, render templates into the destination
    template_context = {
        "topic_tree": topic_tree,
        "welcome": True,
    }

    with i18n.translate_block(language):
        welcome_html = render_to_string("kalite_zim/welcome.html", template_context)
        about_html = render_to_string("kalite_zim/about.html", template_context)
    # Replace absolute references to '/static' with relative
    welcome_html = welcome_html.replace("/static", "static")
    about_html = about_html.replace("/static", "static")

    # Write the welcome.html file
    open(os.path.join(tmp_dir, 'welcome.html'), 'w').write(welcome_html)
    open(os.path.join(tmp_dir, 'about.html'), 'w').write(about_html)

    # Render all topic html files
    render_topic_pages(topic_tree)

    # Copy in static data after it's been handled by django compressor
    # (this happens during template rendering)
    shutil.copytree(os.path.join(base_path, 'static'), os.path.join(tmp_dir, 'static'))

    ending = datetime.now()
    duration = int((ending - beginning).total_seconds())

    logger.info("Total number of videos found: {}".format(copy_media.videos_found))
    logger.info("Total number of topic pages created: {}".format(render_topic_pages.pages_rendered))
    logger.info("Invoking zimwriterfs, writing to: {}".format(dest_file))

    zimwriterfs_args = (
        zimwriterfs,
        "--welcome", "welcome.html",
        "--favicon", "static/img/ka_leaf.png",
        "--publisher", publisher,
        "--creator", "KhanAcademy.org",
        "--description", "Khan Academy ({})".format(language),
        "--description", "Videos from Khan Academy",
        "--language", language,
        tmp_dir,
        dest_file,
    )

    process = subprocess.Popen(zimwriterfs_args, stdout=subprocess.PIPE)
    stdout_data, _stderr_data = process.communicate()
    if process.returncode != 0:
        # BUGFIX: the original called `.format(...)` on logger.error's return
        # value (None) — an AttributeError — and added None + stdout since
        # stderr is not piped.  Match the ffmpeg error handling above.
        logger.error("Error invoking zimwriterfs: {}".format(
            (_stderr_data or "") + (stdout_data or "")))

    logger.info("Duration: {h:} hours, {m:} minutes, {s:} seconds".format(
        h=duration // 3600,
        m=(duration % 3600) // 60,
        s=duration % 60,
    ))
def get_content_cache(force=False, annotate=False, language=settings.LANGUAGE_CODE):
    """Return the (lazily loaded) content cache for ``language``.

    The raw cache is read from ``CONTENT_FILEPATH`` on first access for each
    language.  When ``annotate`` is true (implied whenever the cache was just
    loaded), every content item is decorated with an availability flag,
    stream/thumbnail URLs, subtitle URLs, and a translated title/description.
    When ``settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP`` is set, the
    annotated result is persisted to a per-language ``.cache`` file and
    re-used on later calls unless ``force`` is true.

    Args:
        force: skip the on-disk annotated cache and re-annotate from scratch.
        annotate: decorate items with URLs/availability/translations.
        language: language code whose cache should be returned.

    Returns:
        dict: mapping of content id -> content item dict for ``language``.
    """
    global CONTENT, CONTENT_FILEPATH

    if CONTENT is None:
        CONTENT = {}
    if CONTENT.get(language) is None:
        CONTENT[language] = softload_json(CONTENT_FILEPATH, logger=logging.debug, raises=False)
        annotate = True  # a freshly loaded cache has not been annotated yet

    if not annotate:
        return CONTENT[language]

    # Prefer a previously saved annotated cache when allowed.
    if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP and not force:
        content = softload_json(CONTENT_FILEPATH + "_" + language + ".cache", logger=logging.debug, raises=False)
        if content:
            CONTENT[language] = content
            return CONTENT[language]

    # Loop through all content items and put thumbnail urls, content urls,
    # and subtitle urls on the content dictionary, and list all languages
    # that the content is available in.
    for content in CONTENT[language].values():
        default_thumbnail = create_thumbnail_url(content.get("id"))
        dubmap = i18n.get_id2oklang_map(content.get("id"))

        # BUG FIX: content_lang was previously assigned only inside
        # "if dubmap:", yet it is read unconditionally further down
        # (translate_block / selected_language).  An item without a dub map
        # therefore raised NameError on the first iteration, or silently
        # reused the previous item's language on later ones.
        content_lang = ""
        if dubmap:
            content_lang = i18n.select_best_available_language(language, available_codes=dubmap.keys()) or ""

        if content_lang:
            dubbed_id = dubmap.get(content_lang)
            fmt = content.get("format", "")  # renamed from "format": avoid shadowing the builtin
            if is_content_on_disk(dubbed_id, fmt):
                content["available"] = True
                thumbnail = create_thumbnail_url(dubbed_id) or default_thumbnail
                content["content_urls"] = {
                    "stream": settings.CONTENT_URL + dubmap.get(content_lang) + "." + fmt,
                    "stream_type": "{kind}/{format}".format(kind=content.get("kind", "").lower(), format=fmt),
                    "thumbnail": thumbnail,
                }
            elif settings.BACKUP_VIDEO_SOURCE:
                # No local copy, but a remote fallback source is configured.
                content["available"] = True
                content["content_urls"] = {
                    "stream": settings.BACKUP_VIDEO_SOURCE.format(youtube_id=dubbed_id, video_format=fmt),
                    "stream_type": "{kind}/{format}".format(kind=content.get("kind", "").lower(), format=fmt),
                    "thumbnail": settings.BACKUP_VIDEO_SOURCE.format(youtube_id=dubbed_id, video_format="png"),
                }
            else:
                content["available"] = False
        else:
            content["available"] = False

        # Get list of subtitle language codes currently available.
        # (get_srt_path() hoisted out of the comprehension: it is a pure path
        # builder per its use as an os.listdir argument.)
        srt_root = i18n.get_srt_path()
        subtitle_lang_codes = [] if not os.path.exists(srt_root) else [
            lc for lc in os.listdir(srt_root)
            if os.path.exists(i18n.get_srt_path(lc, content.get("id")))
        ]

        # Generate subtitle URLs for the subtitles found above.  The original
        # re-checked os.path.exists() per code here, but subtitle_lang_codes
        # was already filtered on exactly that predicate.
        subtitle_urls = [{
            "code": lc,
            "url": settings.STATIC_URL + "srt/{code}/subtitles/{id}.srt".format(code=lc, id=content.get("id")),
            "name": i18n.get_language_name(lc),
        } for lc in subtitle_lang_codes]

        # Sort all subtitle URLs by language code
        content["subtitle_urls"] = sorted(subtitle_urls, key=lambda x: x.get("code", ""))

        with i18n.translate_block(content_lang):
            content["selected_language"] = content_lang
            content["title"] = _(content["title"])
            content["description"] = _(content.get("description", "")) if content.get("description") else ""

    if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP:
        try:
            with open(CONTENT_FILEPATH + "_" + language + ".cache", "w") as f:
                json.dump(CONTENT[language], f)
        except IOError as e:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("Annotated content cache file failed in saving with error {e}".format(e=e))

    return CONTENT[language]
def get_content_cache(force=False, annotate=False):
    """Return the global content cache, loading and optionally annotating it.

    The cache is read from ``CONTENT_FILEPATH`` on first use.  With
    ``annotate`` (implied whenever the cache was just loaded), each item gains
    per-language stream data (``lang_data_<lang>``), the list of languages it
    is available in, and any subtitle URLs found on disk.  When
    ``settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP`` is set, an annotated
    copy is persisted to a ``.cache`` file and re-used unless ``force`` is
    true.
    """
    global CONTENT, CONTENT_FILEPATH

    if CONTENT is None:
        CONTENT = softload_json(CONTENT_FILEPATH, logger=logging.debug, raises=False)
        annotate = True  # freshly loaded data still needs decoration

    if not annotate:
        return CONTENT

    # Re-use a previously annotated cache file when permitted.
    if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP and not force:
        cached = softload_json(CONTENT_FILEPATH + ".cache", logger=logging.debug, raises=False)
        if cached:
            CONTENT = cached
            return CONTENT

    # Decorate every item with per-language stream data, the languages it is
    # available in, and the subtitle URLs present on disk.
    for item in CONTENT.values():
        fallback_thumb = create_thumbnail_url(item.get("id"))
        dub_map = i18n.get_id2oklang_map(item.get("id"))

        available_langs = []
        for lang, dubbed_id in dub_map.items():
            fmt = item.get("format", "")
            if not is_content_on_disk(dubbed_id, fmt):
                continue
            available_langs.append(lang)
            item["lang_data_" + lang] = {
                "stream": settings.CONTENT_URL + dub_map.get(lang) + "." + fmt,
                "stream_type": "{kind}/{format}".format(kind=item.get("kind", "").lower(), format=fmt),
                "thumbnail": create_thumbnail_url(dubbed_id) or fallback_thumb,
            }
        item["languages"] = available_langs

        # Subtitle language codes that actually have an .srt for this item.
        if os.path.exists(i18n.get_srt_path()):
            codes = [
                lc for lc in os.listdir(i18n.get_srt_path())
                if os.path.exists(i18n.get_srt_path(lc, item.get("id")))
            ]
        else:
            codes = []

        # Build a subtitle URL entry for each code whose .srt file exists.
        urls = [
            {
                "code": lc,
                "url": settings.STATIC_URL + "srt/{code}/subtitles/{id}.srt".format(code=lc, id=item.get("id")),
                "name": i18n.get_language_name(lc),
            }
            for lc in codes
            if os.path.exists(i18n.get_srt_path(lc, item.get("id")))
        ]

        # Keep subtitle URLs ordered by language code.
        item["subtitle_urls"] = sorted(urls, key=lambda entry: entry.get("code", ""))

    if settings.DO_NOT_RELOAD_CONTENT_CACHE_AT_STARTUP:
        try:
            with open(CONTENT_FILEPATH + ".cache", "w") as f:
                json.dump(CONTENT, f)
        except IOError as e:
            logging.warn("Annotated content cache file failed in saving with error {e}".format(e=e))

    return CONTENT