def move_srts(lang_code):
    """
    Srts live in the locale directory, but that's not exposed at any URL.  So instead,
    we have to move the srts out to /static/subtitles/[lang_code]/
    """
    lang_code_ietf = lcode_to_ietf(lang_code)
    lang_code_django = lcode_to_django_dir(lang_code)

    subtitles_static_dir = os.path.join(settings.STATIC_ROOT, "subtitles")
    src_dir = os.path.join(LOCALE_ROOT, lang_code_django, "subtitles")
    dest_dir = get_srt_path(lang_code_django)
    ensure_dir(dest_dir)

    lang_subtitles = glob.glob(os.path.join(src_dir, "*.srt"))
    logging.info("Moving %d subtitles from %s to %s" % (len(lang_subtitles), src_dir, dest_dir))

    for fil in lang_subtitles:
        srt_dest_path = os.path.join(dest_dir, os.path.basename(fil))
        if os.path.exists(srt_dest_path):
            os.remove(srt_dest_path)  # we're going to replace any srt with a newer version
        shutil.move(fil, srt_dest_path)

    if not os.path.exists(src_dir):
        logging.info("No subtitles for language pack %s" % lang_code)
    elif os.listdir(src_dir):
        logging.warn("%s is not empty; will not remove.  Please check that all subtitles were moved." % src_dir)
    else:
        logging.info("Removing empty source directory (%s)." % src_dir)
        shutil.rmtree(src_dir)

def recurse_nodes_to_clean_related_videos(node):
    """
    Internal function for recursing the topic tree and marking related exercises.
    Requires rebranding of metadata done by recurse_nodes function.
    """
    def get_video_node(video_slug, node):
        if node["kind"] == "Topic":
            for child in node.get("children", []):
                video_node = get_video_node(video_slug, child)
                if video_node:
                    return video_node
        elif node["kind"] == "Video" and node["slug"] == video_slug:
            return node

        return None

    if node["kind"] == "Exercise":
        videos_to_delete = []
        for vi, video_slug in enumerate(node["related_video_slugs"]):
            if not get_video_node(video_slug, topic_tree):
                videos_to_delete.append(vi)
        for vi in reversed(videos_to_delete):
            logging.warn("Deleting unknown video %s" % node["related_video_slugs"][vi])
            del node["related_video_slugs"][vi]

    for child in node.get("children", []):
        recurse_nodes_to_clean_related_videos(child)

def end_user_activity(cls, user, activity_type="login", end_datetime=None, suppress_save=False):  # don't accept language--we're just closing previous activity.
    """Helper function to complete an existing user activity log entry."""

    # Do nothing if the max # of records is zero
    # (i.e. this functionality is disabled)
    if not cls.is_enabled():
        return

    if not user:
        raise ValidationError("A valid user must always be specified.")
    if not end_datetime:  # must be done outside the function header (else becomes static)
        end_datetime = datetime.now()
    activity_type = cls.get_activity_int(activity_type)

    cur_log = cls.get_latest_open_log_or_None(user=user, activity_type=activity_type)
    if cur_log:
        # How could you start after you ended??
        if cur_log.start_datetime > end_datetime:
            raise ValidationError("Update time must always be later than the login time.")
    else:
        # No unstopped starts.  Start should have been called first!
        logging.warn("%s: Had to BEGIN a user log entry, but ENDING(%d)! @ %s" % (user.username, activity_type, end_datetime))
        cur_log = cls.begin_user_activity(user=user, activity_type=activity_type, start_datetime=end_datetime, suppress_save=True)

    logging.debug("%s: Logging LOGOUT activity @ %s" % (user.username, end_datetime))
    cur_log.end_datetime = end_datetime
    if not suppress_save:
        cur_log.save()  # total-seconds will be computed here.
    return cur_log

def update_user_activity(cls, user, activity_type="login", update_datetime=None, language=None, suppress_save=False):
    """Helper function to update an existing user activity log entry."""

    # Do nothing if the max # of records is zero
    # (i.e. this functionality is disabled)
    if not cls.is_enabled():
        return

    if not user:
        raise ValidationError("A valid user must always be specified.")
    if not update_datetime:  # must be done outside the function header (else becomes static)
        update_datetime = datetime.now()
    activity_type = cls.get_activity_int(activity_type)

    cur_log = cls.get_latest_open_log_or_None(user=user, activity_type=activity_type)
    if cur_log:
        # How could you start after you updated??
        if cur_log.start_datetime > update_datetime:
            raise ValidationError("Update time must always be later than the login time.")
    else:
        # No unstopped starts.  Start should have been called first!
        logging.warn("%s: Had to create a user log entry on an UPDATE(%d)! @ %s" % (user.username, activity_type, update_datetime))
        cur_log = cls.begin_user_activity(user=user, activity_type=activity_type, start_datetime=update_datetime, suppress_save=True)

    logging.debug("%s: UPDATE activity (%d) @ %s" % (user.username, activity_type, update_datetime))
    cur_log.last_active_datetime = update_datetime
    cur_log.language = language or cur_log.language  # set the language to the current language, if there is one.
    if not suppress_save:
        cur_log.save()
    return cur_log

def begin_user_activity(cls, user, activity_type="login", start_datetime=None, language=None, suppress_save=False):
    """Helper function to create a user activity log entry."""

    # Do nothing if the max # of records is zero
    # (i.e. this functionality is disabled)
    if not cls.is_enabled():
        return

    if not user:
        raise ValidationError("A valid user must always be specified.")
    if not start_datetime:  # must be done outside the function header (else becomes static)
        start_datetime = datetime.now()
    activity_type = cls.get_activity_int(activity_type)

    cur_log = cls.get_latest_open_log_or_None(user=user, activity_type=activity_type)
    if cur_log:
        # Seems we're logging in without logging out of the previous.
        #   Best thing to do is simulate a login
        #   at the previous last update time.
        #
        # Note: this can be a recursive call
        logging.warn("%s: had to END activity on a begin(%d) @ %s" % (user.username, activity_type, start_datetime))
        # Don't mark current language when closing an old one
        cls.end_user_activity(user=user, activity_type=activity_type, end_datetime=cur_log.last_active_datetime)  # can't suppress save
        cur_log = None

    # Create a new entry
    logging.debug("%s: BEGIN activity(%d) @ %s" % (user.username, activity_type, start_datetime))
    cur_log = cls(user=user, activity_type=activity_type, start_datetime=start_datetime, last_active_datetime=start_datetime, language=language)
    if not suppress_save:
        cur_log.save()
    return cur_log

def recurse_nodes_to_extract_knowledge_map(node, node_cache):
    """
    Internal function for recursing the topic tree and building the knowledge map.
    Requires rebranding of metadata done by recurse_nodes function.
    """
    assert node["kind"] == "Topic"

    if node.get("in_knowledge_map", None):
        if node["slug"] not in knowledge_map["topics"]:
            logging.debug("Not in knowledge map: %s" % node["slug"])
            node["in_knowledge_map"] = False
            for node in node_cache["Topic"][node["slug"]]:
                node["in_knowledge_map"] = False

        knowledge_topics[node["slug"]] = topic_tools.get_all_leaves(node, leaf_type="Exercise")

        if not knowledge_topics[node["slug"]]:
            sys.stderr.write("Removing topic from topic tree: no exercises. %s" % node["slug"])
            del knowledge_topics[node["slug"]]
            del knowledge_map["topics"][node["slug"]]
            node["in_knowledge_map"] = False
            for node in node_cache["Topic"][node["slug"]]:
                node["in_knowledge_map"] = False
    else:
        if node["slug"] in knowledge_map["topics"]:
            sys.stderr.write("Removing topic from topic tree; does not belong. '%s'" % node["slug"])
            logging.warn("Removing from knowledge map: %s" % node["slug"])
            del knowledge_map["topics"][node["slug"]]

    for child in [n for n in node.get("children", []) if n["kind"] == "Topic"]:
        recurse_nodes_to_extract_knowledge_map(child, node_cache)

def download_kmap_icons(knowledge_map):
    for key, value in knowledge_map["topics"].items():
        # Note: id here is retrieved from knowledge_map, so we're OK
        # that we blew away ID in the topic tree earlier.
        if "icon_url" not in value:
            logging.warn("No icon URL for %s" % key)

        value["icon_url"] = iconfilepath + value["id"] + iconextension
        knowledge_map["topics"][key] = value

        out_path = data_path + "../" + value["icon_url"]
        if os.path.exists(out_path) and not force_icons:
            continue

        icon_khan_url = "http://www.khanacademy.org" + value["icon_url"]
        sys.stdout.write("Downloading icon %s from %s..." % (value["id"], icon_khan_url))
        sys.stdout.flush()
        try:
            icon = requests.get(icon_khan_url)
        except Exception as e:
            sys.stdout.write("\n")  # complete the "downloading" output
            sys.stderr.write("Failed to download %-80s: %s\n" % (icon_khan_url, e))
            continue

        if icon.status_code == 200:
            iconfile = file(data_path + "../" + value["icon_url"], "w")
            iconfile.write(icon.content)
        else:
            sys.stdout.write(" [NOT FOUND]")
            value["icon_url"] = iconfilepath + defaulticon + iconextension
        sys.stdout.write(" done.\n")  # complete the "downloading" output

def update_all_distributed_callback(request):
    """Receive a user's video and exercise logs (POSTed back from the central server) and save them locally."""

    if request.method != "POST":
        raise PermissionDenied("Only POST allowed to this URL endpoint.")

    videos = json.loads(request.POST["video_logs"])
    exercises = json.loads(request.POST["exercise_logs"])
    user = FacilityUser.objects.get(id=request.POST["user_id"])
    node_cache = get_node_cache()

    # Save videos
    n_videos_uploaded = 0
    for video in videos:
        video_id = video['video_id']
        youtube_id = video['youtube_id']

        # Only save video logs for videos that we recognize.
        if video_id not in node_cache["Video"]:
            logging.warn("Skipping unknown video %s" % video_id)
            continue

        try:
            (vl, _) = VideoLog.get_or_initialize(user=user, video_id=video_id, youtube_id=youtube_id)
            for key, val in video.iteritems():
                setattr(vl, key, val)
            logging.debug("Saving video log for %s: %s" % (video_id, vl))
            vl.save()
            n_videos_uploaded += 1
        except KeyError:
            logging.error("Could not save video log for data with missing values: %s" % video)
        except Exception as e:
            error_message = "Unexpected error importing videos: %s" % e
            return JsonResponseMessageError(error_message)

    # Save exercises
    n_exercises_uploaded = 0
    for exercise in exercises:
        # Only save exercise logs for exercises that we recognize.
        if exercise['exercise_id'] not in node_cache['Exercise']:
            logging.warn("Skipping unknown exercise %s" % exercise['exercise_id'])
            continue

        try:
            (el, _) = ExerciseLog.get_or_initialize(user=user, exercise_id=exercise["exercise_id"])
            for key, val in exercise.iteritems():
                setattr(el, key, val)
            logging.debug("Saving exercise log for %s: %s" % (exercise['exercise_id'], el))
            el.save()
            n_exercises_uploaded += 1
        except KeyError:
            logging.error("Could not save exercise log for data with missing values: %s" % exercise)
        except Exception as e:
            error_message = "Unexpected error importing exercises: %s" % e
            return JsonResponseMessageError(error_message)

    return JsonResponse({"success": "Uploaded %d exercises and %d videos" % (n_exercises_uploaded, n_videos_uploaded)})

def get_file2lang_map(force=False):
    """Map from youtube_id to language code"""
    global YT2LANG_MAP
    if YT2LANG_MAP is None or force:
        YT2LANG_MAP = {}
        for lang_code, dic in get_dubbed_video_map().iteritems():
            for dubbed_youtube_id in dic.values():
                if dubbed_youtube_id in YT2LANG_MAP:
                    # Sanity check, but must be failsafe, since we don't control these data
                    if YT2LANG_MAP[dubbed_youtube_id] == lang_code:
                        logging.warn("Duplicate entry found in %s language map for dubbed video %s" % (lang_code, dubbed_youtube_id))
                    else:
                        logging.error("Conflicting entry found in language map for video %s; overwriting previous entry of %s to %s." % (dubbed_youtube_id, YT2LANG_MAP[dubbed_youtube_id], lang_code))
                YT2LANG_MAP[dubbed_youtube_id] = lang_code
    return YT2LANG_MAP

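# --- Illustrative example (not part of the module) ---
# A minimal standalone sketch of the inversion get_file2lang_map() performs, using
# made-up language codes and youtube IDs instead of the real get_dubbed_video_map() output.
dubbed_map = {  # shaped like get_dubbed_video_map(): {lang_code: {video_id: dubbed_youtube_id}}
    "es": {"addition_1": "abc123es", "subtraction_1": "def456es"},
    "pt": {"addition_1": "abc123pt"},
}
yt2lang = {}
for lang_code, dic in dubbed_map.items():
    for dubbed_youtube_id in dic.values():
        yt2lang[dubbed_youtube_id] = lang_code  # last writer wins on conflicting entries
print(yt2lang)  # {'abc123es': 'es', 'def456es': 'es', 'abc123pt': 'pt'} (key order may vary)
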
def get_shell_script(self, cmd_glob, location=None):
    if not location:
        location = self.working_dir + '/kalite'
    cmd_glob += system_script_extension()

    # Find the command
    cmd = glob.glob(location + "/" + cmd_glob)
    if len(cmd) > 1:
        raise CommandError("Multiple commands found (%s)?  Should choose based on platform, but ... how to do in Python?  Contact us to implement this!" % cmd_glob)
    elif len(cmd) == 1:
        cmd = cmd[0]
    else:
        cmd = None
        logging.warn("No command found: (%s in %s)" % (cmd_glob, location))
    return cmd

def clean_orphaned_polylines(knowledge_map):
    """
    We remove some topics (without leaves); need to remove polylines associated with these topics.
    """
    all_topic_points = [(km["x"], km["y"]) for km in knowledge_map["topics"].values()]

    polylines_to_delete = []
    for li, polyline in enumerate(knowledge_map["polylines"]):
        # If any point on the polyline no longer corresponds to a topic, the line is orphaned.
        if any((pt["x"], pt["y"]) not in all_topic_points for pt in polyline["path"]):
            polylines_to_delete.append(li)

    logging.warn("Removing %s of %s polylines in top-level knowledge map" % (len(polylines_to_delete), len(knowledge_map["polylines"])))
    for i in reversed(polylines_to_delete):
        del knowledge_map["polylines"][i]

    return knowledge_map

def get_dubbed_video_map(lang_code=None, force=False):
    """
    Returns a dictionary with one key per language; each value maps video_id to the (dubbed) youtube_id.
    """
    global DUBBED_VIDEO_MAP, DUBBED_VIDEO_MAP_RAW, DUBBED_VIDEOS_MAPPING_FILEPATH

    if DUBBED_VIDEO_MAP is None or force:
        try:
            if not os.path.exists(DUBBED_VIDEOS_MAPPING_FILEPATH) or force:
                try:
                    if settings.CENTRAL_SERVER:
                        # Never call commands that could fail from the distributed server.
                        #   Always create a central server API to abstract things (see below)
                        logging.debug("Generating dubbed video mappings.")
                        call_command("generate_dubbed_video_mappings", force=force)
                    else:
                        # Fetch the pre-generated map from the central server API.
                        response = requests.get("http://%s/api/i18n/videos/dubbed_video_map" % (settings.CENTRAL_SERVER_HOST))
                        response.raise_for_status()
                        with open(DUBBED_VIDEOS_MAPPING_FILEPATH, "wb") as fp:
                            fp.write(response.content.decode('utf-8'))  # wait until content has been confirmed before opening file.
                except Exception as e:
                    if not os.path.exists(DUBBED_VIDEOS_MAPPING_FILEPATH):
                        # Unrecoverable error, so raise
                        raise
                    elif DUBBED_VIDEO_MAP:
                        # No need to recover--allow the downstream dude to catch the error.
                        raise
                    else:
                        # We can recover by NOT forcing reload.
                        logging.warn("%s" % e)

            DUBBED_VIDEO_MAP_RAW = softload_json(DUBBED_VIDEOS_MAPPING_FILEPATH, raises=True)
        except Exception as e:
            logging.info("Failed to get dubbed video mappings; defaulting to empty.")
            DUBBED_VIDEO_MAP_RAW = {}  # setting this will avoid triggering reload on every call

        DUBBED_VIDEO_MAP = {}
        for lang_name, video_map in DUBBED_VIDEO_MAP_RAW.iteritems():
            logging.debug("Adding dubbed video map entry for %s (name=%s)" % (get_langcode_map(lang_name), lang_name))
            DUBBED_VIDEO_MAP[get_langcode_map(lang_name)] = video_map

    return DUBBED_VIDEO_MAP.get(lang_code, {}) if lang_code else DUBBED_VIDEO_MAP

def recurse_nodes_to_remove_childless_nodes(node):
    """
    When we remove exercises, we remove dead-end topics.
    Khan just sends us dead-end topics, too.
    Let's remove those too.
    """
    children_to_delete = []
    for ci, child in enumerate(node.get("children", [])):
        # Only topic nodes can be dead ends; leave leaf nodes alone.
        if child["kind"] != "Topic":
            continue

        recurse_nodes_to_remove_childless_nodes(child)

        if not child.get("children"):
            children_to_delete.append(ci)
            logging.warn("Removing KA childless topic: %s" % child["slug"])

    for ci in reversed(children_to_delete):
        del node["children"][ci]

def validate_language_map(lang_codes):
    """
    This function will tell you any blockers that you'll hit while running this command.

    All srt languages must exist in the language map; missing languages will cause errors
    during command running (which can be long).  This function avoids that problem by
    doing the above consistency check.
    """
    lang_codes = lang_codes or get_all_prepped_lang_codes()
    missing_langs = []
    for lang_code in lang_codes:
        try:
            get_language_name(lcode_to_ietf(lang_code), error_on_missing=True)
        except LanguageNotFoundError:
            missing_langs.append(lang_code)

    if missing_langs:
        logging.warn("Please add the following language codes to %s:\n\t%s" % (
            LANG_LOOKUP_FILEPATH, missing_langs,
        ))

def move_exercises(lang_code):
    lang_pack_location = os.path.join(LOCALE_ROOT, lang_code)
    src_exercise_dir = os.path.join(lang_pack_location, "exercises")
    dest_exercise_dir = get_localized_exercise_dirpath(lang_code, is_central_server=False)

    if not os.path.exists(src_exercise_dir):
        logging.warn("Could not find downloaded exercises; skipping: %s" % src_exercise_dir)
    else:
        # Move over one at a time, to combine with any other resources that were there before.
        ensure_dir(dest_exercise_dir)
        all_exercise_files = glob.glob(os.path.join(src_exercise_dir, "*.html"))
        logging.info("Moving %d downloaded exercises to %s" % (len(all_exercise_files), dest_exercise_dir))

        for exercise_file in all_exercise_files:
            shutil.move(exercise_file, os.path.join(dest_exercise_dir, os.path.basename(exercise_file)))

        logging.debug("Removing empty directory")
        try:
            shutil.rmtree(src_exercise_dir)
        except Exception as e:
            logging.error("Error removing downloaded exercise directory (%s): %s" % (src_exercise_dir, e))

def verify_inner_zip(self, zip_file):
    """
    Extract contents of outer zip, verify the inner zip
    """
    zip = ZipFile(zip_file, "r")
    nfiles = len(zip.namelist())
    for fi, afile in enumerate(zip.namelist()):
        zip.extract(afile, path=self.working_dir)

    self.signature_file = os.path.join(self.working_dir, Command.signature_filename)
    self.inner_zip_file = os.path.join(self.working_dir, Command.inner_zip_filename)

    central_server = Device.get_central_server()
    lines = open(self.signature_file, "r").read().split("\n")
    chunk_size = int(lines.pop(0))
    if not central_server:
        logging.warn("No central server device object found; trusting zip file because you asked me to...")
    elif central_server.key.verify_large_file(self.inner_zip_file, signature=lines, chunk_size=chunk_size):
        logging.info("Verified file!")
    else:
        raise Exception("Failed to verify inner zip file.")
    return self.inner_zip_file

def recurse_nodes_to_delete_exercise(node):
    """
    Internal function for recursing the topic tree and removing new exercises.
    Requires rebranding of metadata done by recurse_nodes function.

    Returns a list of exercise slugs for the exercises that were deleted.
    """
    # Stop recursing when we hit leaves
    if node["kind"] != "Topic":
        return []

    slugs_deleted = []

    children_to_delete = []
    for ci, child in enumerate(node.get("children", [])):
        # Mark all unrecognized exercises for deletion
        if child["kind"] == "Exercise":
            if not os.path.exists(exercise_path % child["slug"]):
                children_to_delete.append(ci)
                slugs_deleted.append(child["slug"])  # record the slug so the return value lists the deleted exercises

        # Recurse over children to delete
        elif child.get("children", None):
            slugs_deleted += recurse_nodes_to_delete_exercise(child)

            if not child.get("children", None):
                # Delete children without children (all their children were removed)
                logging.warn("Removing now-childless topic node '%s'" % child["slug"])
                children_to_delete.append(ci)
            elif not any([ch["kind"] == "Exercise" or "Exercise" in ch.get("contains", []) for ch in child["children"]]):
                # If there are no longer exercises, be honest about it
                child["contains"] = list(set(child["contains"]) - set(["Exercise"]))

    # Do the actual deletion
    for i in reversed(children_to_delete):
        logging.warn("Deleting unknown exercise %s" % node["children"][i]["slug"])
        del node["children"][i]

    return slugs_deleted

def scrub_knowledge_map(knowledge_map, node_cache):
    """
    Some topics in the knowledge map, we don't keep in our topic tree / node cache.
    Eliminate them from the knowledge map here.
    """
    for slug in knowledge_map["topics"].keys():
        nodecache_node = node_cache["Topic"].get(slug, [{}])[0]
        topictree_node = topic_tools.get_topic_by_path(nodecache_node.get("path"), root_node=topic_tree)

        if not nodecache_node or not topictree_node:
            logging.warn("Removing unrecognized knowledge_map topic '%s'" % slug)
        elif not topictree_node.get("children"):
            logging.warn("Removing knowledge_map topic '%s' with no children." % slug)
        elif not "Exercise" in topictree_node.get("contains"):
            logging.warn("Removing knowledge_map topic '%s' with no exercises." % slug)
        else:
            continue

        del knowledge_map["topics"][slug]
        if topictree_node:  # only mark the topic-tree node when we actually found one
            topictree_node["in_knowledge_map"] = False

def update_all_central_callback(request):
    """
    Callback after authentication.

    Parses out the request token verification.
    Then finishes the request by getting an auth token.
    """
    if "ACCESS_TOKEN" not in request.session:
        finish_auth(request)

    exercises = get_api_resource(request, "/api/v1/user/exercises")
    videos = get_api_resource(request, "/api/v1/user/videos")
    node_cache = get_node_cache()

    # Collate videos
    video_logs = []
    for video in videos:
        # Assume that KA videos are all english-language, not dubbed (for now)
        video_id = youtube_id = video.get('video', {}).get('youtube_id', "")

        # Only save videos with progress
        if not video.get('seconds_watched', None):
            continue

        # Only save video logs for videos that we recognize.
        if video_id not in node_cache["Video"]:
            logging.warn("Skipping unknown video %s" % video_id)
            continue

        try:
            video_logs.append({
                "video_id": video_id,
                "youtube_id": youtube_id,
                "total_seconds_watched": video['seconds_watched'],
                "points": VideoLog.calc_points(video['seconds_watched'], video['duration']),
                "complete": video['completed'],
                "completion_timestamp": convert_ka_date(video['last_watched']) if video['completed'] else None,
            })
            logging.debug("Got video log for %s: %s" % (video_id, video_logs[-1]))
        except KeyError:
            logging.error("Could not save video log for data with missing values: %s" % video)

    # Collate exercises
    exercise_logs = []
    for exercise in exercises:
        # Only save exercises that have any progress.
        if not exercise.get('last_done', None):
            continue

        # Only save exercise logs for exercises that we recognize.
        slug = exercise.get('exercise', "")
        if slug not in node_cache['Exercise']:
            logging.warn("Skipping unknown exercise %s" % slug)
            continue

        try:
            completed = exercise['streak'] >= 10
            basepoints = node_cache['Exercise'][slug][0]['basepoints']
            exercise_logs.append({
                "exercise_id": slug,
                "streak_progress": min(100, 100 * exercise['streak'] / 10),  # duplicates logic elsewhere
                "attempts": exercise['total_done'],
                "points": ExerciseLog.calc_points(basepoints, ncorrect=exercise['streak'], add_randomness=False),  # no randomness when importing from KA
                "complete": completed,
                "attempts_before_completion": exercise['total_done'] if not exercise['practiced'] else None,  # can't figure this out if they practiced after mastery.
                "completion_timestamp": convert_ka_date(exercise['proficient_date']) if completed else None,
            })
            logging.debug("Got exercise log for %s: %s" % (slug, exercise_logs[-1]))
        except KeyError:
            logging.error("Could not save exercise log for data with missing values: %s" % exercise)

    # POST the data back to the distributed server
    try:
        dthandler = lambda obj: obj.isoformat() if isinstance(obj, datetime.datetime) else None
        logging.debug("POST'ing to %s" % request.session["distributed_callback_url"])
        response = requests.post(
            request.session["distributed_callback_url"],
            cookies={"csrftoken": request.session["distributed_csrf_token"]},
            data={
                "csrfmiddlewaretoken": request.session["distributed_csrf_token"],
                "video_logs": json.dumps(video_logs, default=dthandler),
                "exercise_logs": json.dumps(exercise_logs, default=dthandler),
                "user_id": request.session["distributed_user_id"],
            }
        )
        logging.debug("Response (%d): %s" % (response.status_code, response.content))
    except requests.exceptions.ConnectionError as e:
        return HttpResponseRedirect(set_query_params(request.session["distributed_redirect_url"], {
            "message_type": "error",
            "message": _("Could not connect to your KA Lite installation to share Khan Academy data."),
            "message_id": "id_khanload",
        }))
    except Exception as e:
        return HttpResponseRedirect(set_query_params(request.session["distributed_redirect_url"], {
            "message_type": "error",
            "message": _("Failure to send data to your KA Lite installation: %s") % e,
            "message_id": "id_khanload",
        }))

    try:
        json_response = json.loads(response.content)
        if not isinstance(json_response, dict) or len(json_response) != 1:
            # Could not validate the message is a single key-value pair
            raise Exception(_("Unexpected response format from your KA Lite installation."))
        message_type = json_response.keys()[0]
        message = json_response.values()[0]
    except ValueError as e:
        message_type = "error"
        message = unicode(e)
    except Exception as e:
        message_type = "error"
        message = _("Loading json object: %s") % e

    # If something broke on the distributed server, we are SCREWED.
    #   For now, just show the error to users.
    #
    # Ultimately, we have a message, would like to share with the distributed server.
    # if response.status_code != 200:
    #     return HttpResponseServerError(response.content)

    return HttpResponseRedirect(set_query_params(request.session["distributed_redirect_url"], {
        "message_type": message_type,
        "message": message,
        "message_id": "id_khanload",
    }))

def generate_dubbed_video_mappings(download_url=None, csv_data=None):
    """
    Function to do the heavy lifting in getting the dubbed videos map.

    Could be moved into utils
    """
    if not download_url:
        download_url = SPREADSHEET_BASE_URL
        params = {'key': SPREADSHEET_ID, 'gid': SPREADSHEET_GID, 'output': SPREADSHEET_EXPORT_FORMAT}
    else:
        params = {}

    if not csv_data:
        logging.info("Downloading dubbed video data from %s" % download_url)
        response = requests.get(download_url, params=params)
        if response.status_code != 200:
            raise CommandError("Failed to download dubbed video CSV data: status=%s" % response.status_code)
        csv_data = response.content

    # This CSV file is in standard format: separated by ",", quoted by '"'
    logging.info("Parsing csv file.")
    reader = csv.reader(StringIO(csv_data))

    # Build a two-level video map.
    #   First key: language name
    #   Second key: english youtube ID
    #   Value: corresponding youtube ID in the new language.
    video_map = {}

    row_num = -1
    try:
        # Loop through each row in the spreadsheet.
        while True:
            row_num += 1
            row = reader.next()

            if row_num < 4:
                # Rows 1-4 are crap.
                continue

            elif row_num == 4:
                # Row 5 is the header row.
                header_row = [v.lower() for v in row]  # lcase all header row values (including language names)
                slug_idx = header_row.index("titled id")
                english_idx = header_row.index("english")
                assert slug_idx != -1, "Video slug column header should be found."
                assert english_idx != -1, "English video column header should be found."

            else:
                # Rows 6 and beyond are data.
                assert len(row) == len(header_row), "Values line length equals headers line length"

                # Grab the slug and english video ID.
                video_slug = row[slug_idx]
                english_video_id = row[english_idx]
                assert english_video_id, "English Video ID should not be empty"
                assert video_slug, "Slug should not be empty"

                # English video is the first video ID column,
                #   and following columns (until the end) are other languages.
                # Loop through those columns and, if a video exists,
                #   add it to the dictionary.
                for idx in range(english_idx, len(row)):
                    if not row[idx]:  # make sure there's a dubbed video
                        continue

                    lang = header_row[idx]
                    if lang not in video_map:  # add the first level if it doesn't exist
                        video_map[lang] = {}

                    dubbed_youtube_id = row[idx]
                    if english_video_id == dubbed_youtube_id and lang != "english":
                        logging.error("Removing entry for (%s, %s): dubbed and english youtube ID are the same." % (lang, english_video_id))
                    #elif dubbed_youtube_id in video_map[lang].values():
                        # Talked to Bilal, and this is actually supposed to be OK.  Would throw us for a loop!
                        #   For now, just keep one.
                        #for key in video_map[lang].keys():
                        #    if video_map[lang][key] == dubbed_youtube_id:
                        #        del video_map[lang][key]
                        #        break
                        #logging.error("Removing entry for (%s, %s): the same dubbed video ID is used in two places, and we can only keep one in our current system." % (lang, english_video_id))
                    else:
                        video_map[lang][english_video_id] = row[idx]  # add the corresponding video id for the video, in this language.

    except StopIteration:
        # The loop ends when the CSV file hits the end and throws a StopIteration
        pass

    # Now, validate the mappings with our topic data
    known_videos = get_node_cache("Video").keys()
    missing_videos = set(known_videos) - set(video_map["english"].keys())
    extra_videos = set(video_map["english"].keys()) - set(known_videos)
    if missing_videos:
        logging.warn("There are %d known videos not in the list of dubbed videos" % len(missing_videos))
        logging.warn("Adding missing English videos to English dubbed video map")
        for video in missing_videos:
            video_map["english"][video] = video
    if extra_videos:
        logging.warn("There are %d videos in the list of dubbed videos that we have never heard of." % len(extra_videos))

    return (video_map, csv_data)

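# --- Illustrative example (not part of the module) ---
# A minimal sketch of the two-level map the parser above builds, fed by a hypothetical
# three-column CSV instead of the real spreadsheet (which has four junk rows before its header).
import csv
from StringIO import StringIO  # Python 2, matching the surrounding code

example_csv = '"titled id","english","spanish"\n"addition-1","abc123","abc123es"\n"subtraction-1","def456",""\n'
example_reader = csv.reader(StringIO(example_csv))

example_header = [v.lower() for v in example_reader.next()]
english_col = example_header.index("english")

example_map = {}
for example_row in example_reader:
    english_video_id = example_row[english_col]
    for idx in range(english_col, len(example_row)):
        if not example_row[idx]:  # skip languages with no dubbed video
            continue
        example_map.setdefault(example_header[idx], {})[english_video_id] = example_row[idx]

print(example_map)  # {'english': {'abc123': 'abc123', 'def456': 'def456'}, 'spanish': {'abc123': 'abc123es'}}
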
def zip_language_packs(lang_codes=None, version=VERSION):
    """Zip up and expose all language packs

    converts all into ietf
    """
    sizes = {}
    lang_codes = lang_codes or os.listdir(LANGUAGE_PACK_BUILD_DIR)
    lang_codes = [lcode_to_ietf(lc) for lc in lang_codes]
    logging.info("Zipping up %d language pack(s)" % len(lang_codes))

    for lang_code_ietf in lang_codes:
        lang_code_map = get_supported_language_map(lang_code_ietf)

        # Initialize values
        sizes[lang_code_ietf] = {"package_size": 0, "zip_size": 0}

        lang_locale_path = get_lp_build_dir(lang_code_ietf, version=version)
        if not os.path.exists(lang_locale_path):
            logging.warn("Unexpectedly skipping missing directory: %s" % lang_code_ietf)
            continue
        elif not os.path.isdir(lang_locale_path):
            logging.error("Skipping language where a file exists where a directory was expected: %s" % lang_code_ietf)
            continue

        # Create a zipfile for this language
        zip_filepath = get_language_pack_filepath(lang_code_ietf, version=version)
        ensure_dir(os.path.dirname(zip_filepath))
        logging.info("Creating zip file in %s" % zip_filepath)
        z = zipfile.ZipFile(zip_filepath, 'w', zipfile.ZIP_DEFLATED)

        # Get metadata from the versioned directory
        for metadata_file in glob.glob('%s/*.json' % get_lp_build_dir(lang_code_ietf, version=version)):
            # Get every single file in the directory and zip it up
            filepath = os.path.join(lang_locale_path, metadata_file)
            z.write(filepath, arcname=os.path.basename(metadata_file))
            sizes[lang_code_ietf]["package_size"] += os.path.getsize(filepath)

        # Get mo files from the directory
        lang_code_crowdin = lang_code_map["crowdin"]
        mo_files = glob.glob('%s/*.mo' % get_lp_build_dir(lcode_to_ietf(lang_code_crowdin), version=version)) if lang_code_crowdin else []
        for mo_file in mo_files:
            # Get every single compiled language file
            filepath = os.path.join(lang_locale_path, mo_file)
            z.write(filepath, arcname=os.path.join("LC_MESSAGES", os.path.basename(mo_file)))
            sizes[lang_code_ietf]["package_size"] += os.path.getsize(filepath)

        # include video file sizes
        remote_video_size_list = get_all_remote_video_sizes()
        z.writestr('video_file_sizes.json', str(remote_video_size_list))

        srt_dirpath = get_srt_path(lcode_to_django_dir(lang_code_map["amara"]))
        for srt_file in glob.glob(os.path.join(srt_dirpath, "*.srt")):
            z.write(srt_file, arcname=os.path.join("subtitles", os.path.basename(srt_file)))
            sizes[lang_code_ietf]["package_size"] += os.path.getsize(srt_file)

        if version_diff(version, "0.10.3") > 0:
            # since these are globally available, need to check version.
            exercises_dirpath = get_localized_exercise_dirpath(lang_code_map["exercises"])
            for exercise_file in glob.glob(os.path.join(exercises_dirpath, "*.html")):
                # Get every single localized exercise file
                filepath = os.path.join(exercises_dirpath, exercise_file)
                z.write(filepath, arcname=os.path.join("exercises", os.path.basename(exercise_file)))
                sizes[lang_code_ietf]["package_size"] += os.path.getsize(filepath)

        # Add dubbed video map
        z.write(DUBBED_VIDEOS_MAPPING_FILEPATH, arcname=os.path.join("dubbed_videos", os.path.basename(DUBBED_VIDEOS_MAPPING_FILEPATH)))
        sizes[lang_code_ietf]["package_size"] += os.path.getsize(DUBBED_VIDEOS_MAPPING_FILEPATH)

        z.close()
        sizes[lang_code_ietf]["zip_size"] = os.path.getsize(zip_filepath)

    logging.info("Done.")
    return sizes

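# --- Illustrative example (not part of the module) ---
# A tiny standalone demo of the zipfile pattern used in zip_language_packs() above:
# writing a file under a different arcname inside the archive, and adding in-memory
# content with writestr().  The paths here are throwaway temp files, not real pack content.
import os
import tempfile
import zipfile

tmpdir = tempfile.mkdtemp()
mo_path = os.path.join(tmpdir, "django.mo")
with open(mo_path, "wb") as f:
    f.write(b"fake mo contents")

demo_zip_path = os.path.join(tmpdir, "es.zip")
demo_zip = zipfile.ZipFile(demo_zip_path, "w", zipfile.ZIP_DEFLATED)
demo_zip.write(mo_path, arcname=os.path.join("LC_MESSAGES", os.path.basename(mo_path)))  # stored under LC_MESSAGES/
demo_zip.writestr("video_file_sizes.json", "{}")  # add in-memory content directly
demo_zip.close()

print(zipfile.ZipFile(demo_zip_path).namelist())  # ['LC_MESSAGES/django.mo', 'video_file_sizes.json'] on POSIX
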
def recurse_nodes(node, path="", ancestor_ids=[]):
    """
    Internal function for recursing over the topic tree, marking relevant metadata,
    and removing undesired attributes and children.
    """
    kind = node["kind"]

    # Only keep key data we can use
    for key in node.keys():
        if key not in attribute_whitelists[kind]:
            del node[key]

    # Fix up data
    if slug_key[kind] not in node:
        logging.warn("Could not find expected slug key (%s) on node: %s" % (slug_key[kind], node))
        node[slug_key[kind]] = node["id"]  # put it SOMEWHERE.
    node["slug"] = node[slug_key[kind]] if node[slug_key[kind]] != "root" else ""
    node["id"] = node[id_key[kind]]  # these used to be the same; now not. Easier if they stay the same (issue #233)
    node["path"] = path + khanload.kind_slugs[kind] + node["slug"] + "/"
    node["title"] = node[title_key[kind]].strip()

    # Add some attributes that should have been on there to start with.
    node["parent_id"] = ancestor_ids[-1] if ancestor_ids else None
    node["ancestor_ids"] = ancestor_ids

    if kind == "Exercise":
        # For each exercise, need to set the exercise_id,
        #   get related videos,
        #   and compute base points
        node["exercise_id"] = node["slug"]

        # Compute base points, and paste them onto the exercise
        node["basepoints"] = ceil(7 * log(node["seconds_per_fast_problem"]))

        # Related videos
        related_video_slugs = [vid["readable_id"] for vid in download_khan_data("http://www.khanacademy.org/api/v1/exercises/%s/videos" % node["name"], node["name"] + ".json")]
        node["related_video_slugs"] = related_video_slugs

        related_exercise_metadata = {
            "id": node["id"],
            "slug": node["slug"],
            "title": node["title"],
            "path": node["path"],
        }
        for video_slug in node.get("related_video_slugs", []):
            related_exercise[video_slug] = related_exercise_metadata

    # Recurse through children, remove any blacklisted items
    children_to_delete = []
    child_kinds = set()
    for i, child in enumerate(node.get("children", [])):
        child_kind = child.get("kind", None)

        # Blacklisted--remove
        if child_kind in kind_blacklist:
            children_to_delete.append(i)
            continue
        elif child[slug_key[child_kind]] in slug_blacklist:
            children_to_delete.append(i)
            continue
        elif not child.get("live", True) and remove_disabled_topics:
            # node is not live
            logging.debug("Removing non-live child: %s" % child[slug_key[child_kind]])
            children_to_delete.append(i)
            continue
        elif child.get("hide", False) and remove_disabled_topics:
            # node is hidden. Note that root is hidden, and we're implicitly skipping that.
            children_to_delete.append(i)
            logging.debug("Removing hidden child: %s" % child[slug_key[child_kind]])
            continue
        elif child_kind == "Video" and set(["mp4", "png"]) - set(child.get("download_urls", {}).keys()):
            # No download link for the expected formats (we expect the missing videos
            #   to be filled in soon); warn and remove the video node for now.
            sys.stderr.write("WARNING: No download link for video: %s: authors='%s'\n" % (child["youtube_id"], child["author_names"]))
            children_to_delete.append(i)
            continue

        child_kinds = child_kinds.union(set([child_kind]))
        child_kinds = child_kinds.union(recurse_nodes(child, path=node["path"], ancestor_ids=ancestor_ids + [node["id"]]))

    # Delete those marked for deletion
    for i in reversed(children_to_delete):
        del node["children"][i]

    # Mark on topics whether they contain Videos, Exercises, or both
    if kind == "Topic":
        node["contains"] = list(child_kinds)

    return child_kinds

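# --- Illustrative example (not part of the module) ---
# A quick sanity check of the basepoints formula used in recurse_nodes() above.
# The seconds_per_fast_problem values are made up; real values come from the KA exercise data.
from math import ceil, log

for secs in (4, 10, 30, 60):
    print("%d seconds -> %d basepoints" % (secs, ceil(7 * log(secs))))  # natural log, as above
# Longer expected solve times earn more base points: 10, 17, 24, and 29 here.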