def handle(self, *args, **options):
    """Download localized exercises for the requested language code."""
    if not options["lang_code"]:
        raise CommandError("You must specify a language code.")

    lang_code = lcode_to_ietf(options["lang_code"])
    if lang_code not in AVAILABLE_EXERCISE_LANGUAGE_CODES:
        logging.info("No exercises available for language %s" % lang_code)
    else:
        # Resolve the exercise list, most specific source first:
        # explicit --exercise_ids, then --topic_id, then every known exercise.
        exercise_ids = None
        if options["exercise_ids"]:
            exercise_ids = options["exercise_ids"].split(",")
        if not exercise_ids and options["topic_id"]:
            exercise_ids = [ex["id"] for ex in get_topic_exercises(topic_id=options["topic_id"])]
        if not exercise_ids:
            exercise_ids = get_node_cache("Exercise").keys()

        # Download the exercises
        for exercise_id in exercise_ids:
            scrape_exercise(exercise_id=exercise_id, lang_code=lang_code, force=options["force"])

    logging.info("Process complete.")
def get_all_prepped_lang_codes():
    """Pre-prepped language codes, for downloading srts"""
    # Status files are named like "<lang_code>_...", so the code is the
    # first underscore-separated piece of the basename.
    return [
        lcode_to_ietf(os.path.basename(status_file).split("_")[0])
        for status_file in get_all_download_status_files()
    ]
def handle(self, *args, **options):
    """Central-server command: download (or clear) cached subtitles."""
    if not settings.CENTRAL_SERVER:
        raise CommandError("This must only be run on the central server.")

    # None represents all
    lang_codes = [lcode_to_ietf(options["lang_code"])] if options["lang_code"] else None
    del options["lang_code"]

    nargs = len(args)
    if nargs > 1:
        raise CommandError("Max 1 arg")
    if nargs == 1:
        if args[0] != "clear":
            raise CommandError("Unknown argument: %s" % args[0])
        logging.info("Clearing subtitles...")
        clear_subtitles_cache(lang_codes)
    else:
        validate_language_map(lang_codes)
        logging.info("Downloading...")
        download_srt_from_3rd_party(lang_codes=lang_codes, **options)
        validate_language_map(lang_codes)  # again at the end, so output is visible

    # for compatibility with KA Lite versions less than 0.10.3
    for lang in (lang_codes or get_langs_with_subtitles()):
        generate_srt_availability_file(lang)

    logging.info("Process complete.")
def move_srts(lang_code):
    """
    Srts live in the locale directory, but that's not exposed at any URL.  So instead,
    we have to move the srts out to /static/subtitles/[lang_code]/

    Removed unused locals (lang_code_ietf, subtitles_static_dir) that were
    computed but never read.
    """
    lang_code_django = lcode_to_django_dir(lang_code)

    src_dir = os.path.join(LOCALE_ROOT, lang_code_django, "subtitles")
    dest_dir = get_srt_path(lang_code_django)
    ensure_dir(dest_dir)

    lang_subtitles = glob.glob(os.path.join(src_dir, "*.srt"))
    logging.info("Moving %d subtitles from %s to %s" % (len(lang_subtitles), src_dir, dest_dir))

    for fil in lang_subtitles:
        srt_dest_path = os.path.join(dest_dir, os.path.basename(fil))
        if os.path.exists(srt_dest_path):
            os.remove(srt_dest_path)  # we're going to replace any srt with a newer version
        shutil.move(fil, srt_dest_path)

    if not os.path.exists(src_dir):
        logging.info("No subtitles for language pack %s" % lang_code)
    elif os.listdir(src_dir):
        logging.warn("%s is not empty; will not remove. Please check that all subtitles were moved." % src_dir)
    else:
        logging.info("Removing empty source directory (%s)." % src_dir)
        shutil.rmtree(src_dir)
def move_srts(lang_code):
    """
    Srts live in the locale directory, but that's not exposed at any URL.  So instead,
    we have to move the srts out to /static/subtitles/[lang_code]/

    Removed unused locals (lang_code_ietf, subtitles_static_dir) that were
    computed but never read.
    """
    lang_code_django = lcode_to_django_dir(lang_code)

    src_dir = os.path.join(LOCALE_ROOT, lang_code_django, "subtitles")
    dest_dir = get_srt_path(lang_code_django)
    ensure_dir(dest_dir)

    lang_subtitles = glob.glob(os.path.join(src_dir, "*.srt"))
    logging.info("Moving %d subtitles from %s to %s" % (len(lang_subtitles), src_dir, dest_dir))

    for fil in lang_subtitles:
        srt_dest_path = os.path.join(dest_dir, os.path.basename(fil))
        if os.path.exists(srt_dest_path):
            os.remove(srt_dest_path)  # we're going to replace any srt with a newer version
        shutil.move(fil, srt_dest_path)

    if not os.path.exists(src_dir):
        logging.info("No subtitles for language pack %s" % lang_code)
    elif os.listdir(src_dir):
        logging.warn("%s is not empty; will not remove. Please check that all subtitles were moved." % src_dir)
    else:
        logging.info("Removing empty source directory (%s)." % src_dir)
        shutil.rmtree(src_dir)
def get_language_pack(lang_code, software_version, callback):
    """Download language pack for specified language"""
    ietf_code = lcode_to_ietf(lang_code)
    logging.info("Retrieving language pack: %s" % ietf_code)
    url = get_language_pack_url(ietf_code, software_version)
    # download_file also returns the HTTP response, which we don't need here.
    zip_path, _response = download_file(url, callback=callback_percent_proxy(callback))
    return zip_path
def handle(self, *args, **options):
    """Download, unpack, and install a language pack on a distributed server.

    Stages: fetch (or use local) zip -> unpack into locale dir -> regenerate
    jsi18n static files -> move per-language assets into place -> invalidate caches.
    """
    if settings.CENTRAL_SERVER:
        # Fixed garbled message (was "distributed servers server.").
        raise CommandError("This must only be run on distributed servers.")

    lang_code = lcode_to_ietf(options["lang_code"])
    software_version = options["software_version"]
    logging.info("Downloading language pack for lang_code=%s, software_version=%s" % (lang_code, software_version))

    # Download the language pack
    try:
        if options['file']:
            self.start(_("Using local language pack '%(filepath)s'") % {"filepath": options['file']})
            zip_filepath = options['file']
        else:
            self.start(_("Downloading language pack '%(lang_code)s'") % {"lang_code": lang_code})
            zip_filepath = get_language_pack(lang_code, software_version, callback=self.cb)

        # Unpack into locale directory
        self.next_stage(_("Unpacking language pack '%(lang_code)s'") % {"lang_code": lang_code})
        unpack_language(lang_code, zip_filepath=zip_filepath)

        # Regenerate javascript translation catalog for this language.
        self.next_stage(_("Creating static files for language pack '%(lang_code)s'") % {"lang_code": lang_code})
        update_jsi18n_file(lang_code)

        self.next_stage(_("Moving files to their appropriate local disk locations."))
        move_dubbed_video_map(lang_code)
        move_exercises(lang_code)
        move_srts(lang_code)
        move_video_sizes_file(lang_code)

        self.next_stage(_("Invalidate caches"))
        caching.invalidate_all_caches()
        self.complete(_("Finished processing language pack %(lang_code)s") % {"lang_code": lang_code})
    except Exception as e:
        # Mark the job as failed before propagating, so UI status is accurate.
        self.cancel(stage_status="error", notes=_("Error: %(error_msg)s") % {"error_msg": unicode(e)})
        raise
def update_videos(request, max_to_show=4):
    """Build the context for the video-update page."""
    # Copy so that popping the default language doesn't mutate the cached original.
    installed_languages = get_installed_language_packs(force=True).copy()
    default_lang = lcode_to_ietf(request.session["default_language"])
    default_language_name = lang_best_name(installed_languages.pop(default_lang))
    languages_to_show = [lang_best_name(pack) for pack in installed_languages.values()[:max_to_show]]
    other_languages_count = max(0, len(installed_languages) - max_to_show)

    context = update_context(request)
    context.update({
        "video_count": VideoFile.objects.filter(percent_complete=100).count(),
        "languages": languages_to_show,
        "default_language_name": default_language_name,
        "other_languages_count": other_languages_count,
    })
    return context
def clear_subtitles_cache(lang_codes=None, locale_root=LOCALE_ROOT):
    """
    Reset subtitle download status and delete srt files for the given
    language codes (all languages with subtitles when None).
    Codes are normalized via lcode_to_ietf before use.
    """
    for lang_code in (lang_codes or get_langs_with_subtitles()):
        lang_code = lcode_to_ietf(lang_code)

        # Reset every video entry in the per-language status file.
        lm_file = get_lang_map_filepath(lang_code)
        download_status = softload_json(lm_file, raises=True)
        for key in download_status:
            download_status[key] = {u'downloaded': False, u'last_success': u'', u'last_attempt': u'', u'api_response': u''}
        with open(lm_file, "w") as fp:
            json.dump(download_status, fp)

        # Delete all srt files
        srt_path = get_srt_path(lang_code)
        if os.path.exists(srt_path):
            shutil.rmtree(srt_path)
def handle(self, *args, **options):
    """Scrape localized exercises for one language."""
    if not options["lang_code"]:
        raise CommandError("You must specify a language code.")

    lang_code = lcode_to_ietf(options["lang_code"])
    if lang_code not in AVAILABLE_EXERCISE_LANGUAGE_CODES:
        logging.info("No exercises available for language %s" % lang_code)
    else:
        # Choose exercise ids: CLI list, else topic contents, else everything.
        requested = options["exercise_ids"]
        topic = options["topic_id"]
        exercise_ids = requested.split(",") if requested else None
        if not exercise_ids and topic:
            exercise_ids = [ex["id"] for ex in get_topic_exercises(topic_id=topic)]
        if not exercise_ids:
            exercise_ids = get_node_cache("Exercise").keys()

        # Download the exercises
        for exercise_id in exercise_ids:
            scrape_exercise(exercise_id=exercise_id, lang_code=lang_code, force=options["force"])

    logging.info("Process complete.")
def validate_language_map(lang_codes):
    """
    This function will tell you any blockers that you'll hit while
    running this command.

    All srt languages must exist in the language map; missing languages
    will cause errors during command running (which can be long).
    This function avoids that problem by doing the above consistency check.
    """
    missing_langs = []
    for lang_code in (lang_codes or get_all_prepped_lang_codes()):
        try:
            get_language_name(lcode_to_ietf(lang_code), error_on_missing=True)
        except LanguageNotFoundError:
            missing_langs.append(lang_code)

    if missing_langs:
        logging.warn("Please add the following language codes to %s:\n\t%s" % (
            LANG_LOOKUP_FILEPATH, missing_langs,
        ))
def scrape_exercise(exercise_id, lang_code, force=False):
    """Fetch one localized exercise HTML file, skipping files already on disk unless force=True."""
    ietf_lang_code = lcode_to_ietf(lang_code)

    dest_filepath = get_exercise_filepath(exercise_id, lang_code=lang_code)
    localized_root = os.path.dirname(dest_filepath)

    # Already cached and not forced: nothing to do.
    if os.path.exists(dest_filepath) and not force:
        return

    exercise_url = "https://es.khanacademy.org/khan-exercises/exercises/%s.html?lang=%s" % (exercise_id, ietf_lang_code)
    logging.info("Retrieving exercise %s from %s" % (exercise_id, exercise_url))

    try:
        ensure_dir(localized_root)
        resp = requests.get(exercise_url)
        resp.raise_for_status()
        with open(dest_filepath, "wb") as fp:
            fp.write(resp.content)
    except Exception as e:
        # Best-effort scrape: log and continue rather than aborting the batch.
        logging.error("Failed to download %s: %s" % (exercise_url, e))
def handle(self, *args, **options):
    """Download, unpack, and install a language pack on a distributed server.

    Stages: fetch (or use local) zip -> unpack into locale dir -> regenerate
    jsi18n static files -> move per-language assets into place -> invalidate caches.
    """
    if settings.CENTRAL_SERVER:
        # Fixed garbled message (was "distributed servers server.").
        raise CommandError("This must only be run on distributed servers.")

    lang_code = lcode_to_ietf(options["lang_code"])
    software_version = options["software_version"]
    logging.info("Downloading language pack for lang_code=%s, software_version=%s" % (lang_code, software_version))

    # Download the language pack
    try:
        if options['file']:
            self.start(_("Using local language pack '%(filepath)s'") % {"filepath": options['file']})
            zip_filepath = options['file']
        else:
            self.start(_("Downloading language pack '%(lang_code)s'") % {"lang_code": lang_code})
            zip_filepath = get_language_pack(lang_code, software_version, callback=self.cb)

        # Unpack into locale directory
        self.next_stage(_("Unpacking language pack '%(lang_code)s'") % {"lang_code": lang_code})
        unpack_language(lang_code, zip_filepath=zip_filepath)

        # Regenerate javascript translation catalog for this language.
        self.next_stage(_("Creating static files for language pack '%(lang_code)s'") % {"lang_code": lang_code})
        update_jsi18n_file(lang_code)

        self.next_stage(_("Moving files to their appropriate local disk locations."))
        move_dubbed_video_map(lang_code)
        move_exercises(lang_code)
        move_srts(lang_code)
        move_video_sizes_file(lang_code)

        self.next_stage(_("Invalidate caches"))
        caching.invalidate_all_caches()
        self.complete(_("Finished processing language pack %(lang_code)s") % {"lang_code": lang_code})
    except Exception as e:
        # Mark the job as failed before propagating, so UI status is accurate.
        self.cancel(stage_status="error", notes=_("Error: %(error_msg)s") % {"error_msg": unicode(e)})
        raise
def update_videos(request, max_to_show=4):
    """Assemble context data for the video-update page."""
    # we copy to avoid changing the original installed language list
    installed_languages = get_installed_language_packs(force=True).copy()
    default_language_name = lang_best_name(
        installed_languages.pop(lcode_to_ietf(request.session["default_language"])))

    shown = installed_languages.values()[:max_to_show]
    languages_to_show = [lang_best_name(pack) for pack in shown]
    other_languages_count = max(0, len(installed_languages) - max_to_show)

    context = update_context(request)
    context.update({
        "video_count": VideoFile.objects.filter(percent_complete=100).count(),
        "languages": languages_to_show,
        "default_language_name": default_language_name,
        "other_languages_count": other_languages_count,
    })
    return context
def handle(self, *args, **options):
    """Download dubbed videos for a language, by id list, by topic, or from the full dubbed-video map."""
    if settings.CENTRAL_SERVER:
        raise CommandError("This must only be run on the distributed server.")
    if not options["lang_code"]:
        raise CommandError("You must specify a language code.")

    # ensure_dir(settings.CONTENT_ROOT)

    # Get list of videos
    lang_code = lcode_to_ietf(options["lang_code"])
    video_map = get_dubbed_video_map(lang_code) or {}
    video_ids = options["video_ids"].split(",") if options["video_ids"] else None
    if not video_ids and options["topic_id"]:
        video_ids = [vid["id"] for vid in get_topic_videos(topic_id=options["topic_id"])]
    if not video_ids:
        video_ids = video_map.keys()

    # Download the videos
    for video_id in video_ids:
        if video_id in video_map:
            youtube_id = video_map[video_id]
        elif video_id in video_map.values():
            # Perhaps they sent in a youtube ID?  We can handle that!
            youtube_id = video_id
        else:
            logging.error("No mapping for video_id=%s; skipping" % video_id)
            continue

        try:
            scrape_video(youtube_id=youtube_id, format=options["format"], force=options["force"])
            # scrape_thumbnail(youtube_id=youtube_id)
            logging.info("Access video %s at %s" % (youtube_id, get_node_cache("Video")[video_id][0]["path"]))
        except Exception as e:
            logging.error("Failed to download video %s: %s" % (youtube_id, e))

    logging.info("Process complete.")
def download_srt_from_3rd_party(lang_codes=None, **kwargs):
    """Download subtitles specified by command line args"""
    bad_languages = {}

    for lang_code in (lang_codes or get_all_prepped_lang_codes()):
        # Normalize to IETF, then map to the code amara actually serves.
        lang_code = get_supported_language_map(lcode_to_ietf(lang_code))['amara']

        # Load this language's video metadata map.
        try:
            lang_map_filepath = get_lang_map_filepath(lang_code)
            if os.path.exists(lang_map_filepath):
                with open(lang_map_filepath, "r") as fp:
                    videos = json.load(fp)
            else:
                videos = {}  # happens if an unknown set for subtitles.
        except Exception as e:
            error_msg = "Error in subtitles metadata file for %s: %s" % (lang_code, e)
            logging.error(error_msg)
            bad_languages[lang_code] = error_msg
            continue

        try:
            download_if_criteria_met(videos, lang_code=lang_code, **kwargs)
        except Exception as e:
            error_msg = "Error downloading subtitles for %s: %s" % (lang_code, e)
            logging.error(error_msg)
            bad_languages[lang_code] = error_msg
            continue

    # now report final results
    if bad_languages:
        outstr = "Failed to download subtitles for the following languages: %s" % (bad_languages.keys())
        outstr += "\n" + str(bad_languages)
        logging.error(outstr)
def scrape_exercise(exercise_id, lang_code, force=False):
    """Download one localized exercise HTML page to its destination path; no-op if cached and not forced."""
    ietf_lang_code = lcode_to_ietf(lang_code)

    exercise_dest_filepath = get_exercise_filepath(exercise_id, lang_code=lang_code)
    already_cached = os.path.exists(exercise_dest_filepath)
    if already_cached and not force:
        return

    exercise_url = "https://es.khanacademy.org/khan-exercises/exercises/%s.html?lang=%s" % (exercise_id, ietf_lang_code)
    logging.info("Retrieving exercise %s from %s" % (exercise_id, exercise_url))

    try:
        ensure_dir(os.path.dirname(exercise_dest_filepath))
        resp = requests.get(exercise_url)
        resp.raise_for_status()
        with open(exercise_dest_filepath, "wb") as fp:
            fp.write(resp.content)
    except Exception as e:
        # Log and swallow so a single failure doesn't stop a batch scrape.
        logging.error("Failed to download %s: %s" % (exercise_url, e))
def update_language_srt_map(map_file=SRTS_JSON_FILEPATH):
    """
    Translate the srts_remote_availability dictionary into language specific
    files that can be used by the cache_subtitles command.

    Note: srt map deals with amara, so uses ietf codes (e.g. en-us)

    Fixes: removed unused (and mis-named) old_yt_ids/new_yt_ids locals,
    removed duplicated `lang_code = lang_code =` assignment, and
    initialized lang_map_filepath so an empty availability map can't
    raise NameError in the cleanup pass.
    """
    # Load the current download status
    api_info_map = softload_json(map_file, logger=logging.warn)

    # Build one big availability dictionary organized by language code.
    remote_availability_map = {}
    for youtube_id, data in api_info_map.items():
        for lang_code in data.get("language_codes", []):
            lang_code = lcode_to_ietf(lang_code)
            if lang_code not in remote_availability_map:
                remote_availability_map[lang_code] = {}
            # This entry will be valid if it's new, otherwise it will be overwritten later
            remote_availability_map[lang_code][youtube_id] = {
                "downloaded": False,
                "api_response": "",
                "last_attempt": "",
                "last_success": "",
            }

    # Write one file per language code, merging with any existing status file.
    lang_map_filepath = None  # stays None when there is nothing to write
    for lang_code, new_data in remote_availability_map.items():
        lang_map_filepath = get_lang_map_filepath(lang_code)
        if os.path.exists(lang_map_filepath):
            lang_map = softload_json(lang_map_filepath, logger=logging.error)
        else:
            lang_map = {}

        # Empty means no subtitles remain for any video in this language.
        if not new_data:
            logging.info("Subtitle support for %s has been terminated; removing." % lang_code)
            if os.path.exists(lang_map_filepath):
                os.remove(lang_map_filepath)
            continue

        yt_ids_to_add = set(new_data.keys()) - set(lang_map.keys())
        yt_ids_to_delete = set(lang_map.keys()) - set(new_data.keys())

        if yt_ids_to_add:
            logging.info("Adding %d new YouTube IDs to language (%s)" % (len(yt_ids_to_add), lang_code))
            for yt_id in yt_ids_to_add:
                lang_map[yt_id] = new_data.get(yt_id)

        if yt_ids_to_delete:
            logging.info("Deleting %d old YouTube IDs from language (%s) because they are no longer supported." % (len(yt_ids_to_delete), lang_code))
            for yt_id in yt_ids_to_delete:
                lang_map.pop(yt_id, None)

        # Write the new file to the correct location
        logging.debug("Writing %s" % lang_map_filepath)
        ensure_dir(os.path.dirname(lang_map_filepath))
        with open(lang_map_filepath, "w") as outfile:
            json.dump(lang_map, outfile)

        # Update the big mapping with the most accurate numbers
        remote_availability_map[lang_code].update(lang_map)

    # Finally, remove status files for languages absent from the current map.
    # All status files live in one directory, so any lang_map_filepath works.
    if lang_map_filepath:
        for filename in os.listdir(os.path.dirname(lang_map_filepath)):
            file_lang_code = filename.split("_")[0]
            if file_lang_code not in remote_availability_map:
                file_to_remove = get_lang_map_filepath(file_lang_code)
                logging.info("Subtitle support for %s has been terminated; removing." % file_lang_code)
                if os.path.exists(file_to_remove):
                    os.remove(file_to_remove)
                else:
                    logging.warn("Subtitles metadata for %s not found; skipping deletion of non-existent file %s." % (file_lang_code, file_to_remove))

    return remote_availability_map
def update_language_srt_map(map_file=SRTS_JSON_FILEPATH):
    """
    Translate the srts_remote_availability dictionary into language specific
    files that can be used by the cache_subtitles command.

    Note: srt map deals with amara, so uses ietf codes (e.g. en-us)

    Fixes: removed unused (and mis-named) old_yt_ids/new_yt_ids locals,
    removed duplicated `lang_code = lang_code =` assignment, and
    initialized lang_map_filepath so an empty availability map can't
    raise NameError in the cleanup pass.
    """
    # Load the current download status
    api_info_map = softload_json(map_file, logger=logging.warn)

    # Build one big availability dictionary organized by language code.
    remote_availability_map = {}
    for youtube_id, data in api_info_map.items():
        for lang_code in data.get("language_codes", []):
            lang_code = lcode_to_ietf(lang_code)
            if lang_code not in remote_availability_map:
                remote_availability_map[lang_code] = {}
            # This entry will be valid if it's new, otherwise it will be overwritten later
            remote_availability_map[lang_code][youtube_id] = {
                "downloaded": False,
                "api_response": "",
                "last_attempt": "",
                "last_success": "",
            }

    # Write one file per language code, merging with any existing status file.
    lang_map_filepath = None  # stays None when there is nothing to write
    for lang_code, new_data in remote_availability_map.items():
        lang_map_filepath = get_lang_map_filepath(lang_code)
        if os.path.exists(lang_map_filepath):
            lang_map = softload_json(lang_map_filepath, logger=logging.error)
        else:
            lang_map = {}

        # Empty means no subtitles remain for any video in this language.
        if not new_data:
            logging.info("Subtitle support for %s has been terminated; removing." % lang_code)
            if os.path.exists(lang_map_filepath):
                os.remove(lang_map_filepath)
            continue

        yt_ids_to_add = set(new_data.keys()) - set(lang_map.keys())
        yt_ids_to_delete = set(lang_map.keys()) - set(new_data.keys())

        if yt_ids_to_add:
            logging.info("Adding %d new YouTube IDs to language (%s)" % (len(yt_ids_to_add), lang_code))
            for yt_id in yt_ids_to_add:
                lang_map[yt_id] = new_data.get(yt_id)

        if yt_ids_to_delete:
            logging.info("Deleting %d old YouTube IDs from language (%s) because they are no longer supported." % (len(yt_ids_to_delete), lang_code))
            for yt_id in yt_ids_to_delete:
                lang_map.pop(yt_id, None)

        # Write the new file to the correct location
        logging.debug("Writing %s" % lang_map_filepath)
        ensure_dir(os.path.dirname(lang_map_filepath))
        with open(lang_map_filepath, 'w') as outfile:
            json.dump(lang_map, outfile)

        # Update the big mapping with the most accurate numbers
        remote_availability_map[lang_code].update(lang_map)

    # Finally, remove status files for languages absent from the current map.
    # All status files live in one directory, so any lang_map_filepath works.
    if lang_map_filepath:
        for filename in os.listdir(os.path.dirname(lang_map_filepath)):
            file_lang_code = filename.split("_")[0]
            if file_lang_code not in remote_availability_map:
                file_to_remove = get_lang_map_filepath(file_lang_code)
                logging.info("Subtitle support for %s has been terminated; removing." % file_lang_code)
                if os.path.exists(file_to_remove):
                    os.remove(file_to_remove)
                else:
                    logging.warn("Subtitles metadata for %s not found; skipping deletion of non-existent file %s." % (file_lang_code, file_to_remove))

    return remote_availability_map