def get_exercise_page_paths(video_id=None, video_slug=None):
    """Return the list of page paths of all exercises related to one video.

    Exactly one of video_id / video_slug must be given (same contract as
    get_video_page_paths).  Best-effort: returns [] if the lookup fails.
    """
    assert (video_id or video_slug) and not (video_id and video_slug), "One arg, not two"

    try:
        # BUG FIX: the original indexed the node cache with video_slug even when
        # only video_id was supplied (so the slug was None, the lookup raised
        # KeyError, and the bare except silently returned []).  Resolve the id
        # to a slug first, mirroring get_video_page_paths.
        if not video_slug:
            video_slug = topic_tools.get_id2slug_map()[video_id]

        # Union the path lists of every related exercise (de-duplicated via set).
        exercise_paths = set()
        for exercise in get_related_exercises(video=topic_tools.get_node_cache("Video")[video_slug]):
            exercise_paths = exercise_paths.union(set(exercise["paths"]))
        return list(exercise_paths)

    except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt/SystemExit
        return []
def get_video_page_paths(video_id=None, video_slug=None):
    """Return the list of page paths for one video node.

    Exactly one of video_id / video_slug must be given; an id is resolved to a
    slug via the id2slug map.  Best-effort: returns [] if the lookup fails.
    """
    assert (video_id or video_slug) and not (video_id and video_slug), "One arg, not two"

    try:
        if not video_slug:
            video_slug = topic_tools.get_id2slug_map()[video_id]

        return topic_tools.get_node_cache("Video")[video_slug]['paths']

    except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt/SystemExit
        return []
def create_youtube_id_to_slug_map(node_cache=None, data_path=settings.PROJECT_PATH + "/static/data/"):
    """ Go through all videos, and make a map of youtube_id to slug, for fast look-up later """

    if not node_cache:
        node_cache = topic_tools.get_node_cache(force=True)

    # Build the youtube_id -> slug mapping, asserting uniqueness as we go.
    id2slug_map = dict()
    for video_node in node_cache['Video'].values():
        youtube_id = video_node["youtube_id"]
        assert youtube_id not in id2slug_map, "Make sure there's a 1-to-1 mapping between youtube_id and slug"
        id2slug_map[youtube_id] = video_node['slug']

    # Persist the mapping as pretty-printed JSON next to the other static data.
    map_file = os.path.join(data_path, topic_tools.video_remap_file)
    with open(map_file, "w") as fp:
        fp.write(json.dumps(id2slug_map, indent=2))
# NOTE(review): this is an exact duplicate of create_youtube_id_to_slug_map
# defined earlier in this file; at import time this later definition silently
# shadows the earlier one.  Consider deleting one of the two copies.
def create_youtube_id_to_slug_map(node_cache=None, data_path=settings.PROJECT_PATH + "/static/data/"):
    """ Go through all videos, and make a map of youtube_id to slug, for fast look-up later """

    if not node_cache:
        node_cache = topic_tools.get_node_cache(force=True)

    # Destination file for the persisted map (lives with the other static data).
    map_file = os.path.join(data_path, topic_tools.video_remap_file)
    id2slug_map = dict()

    # Make a map from youtube ID to video slug
    for v in node_cache['Video'].values():
        # The assert enforces uniqueness of youtube_id across all video nodes.
        assert v["youtube_id"] not in id2slug_map, "Make sure there's a 1-to-1 mapping between youtube_id and slug"
        id2slug_map[v['youtube_id']] = v['slug']

    # Save the map!
    with open(map_file, "w") as fp:
        fp.write(json.dumps(id2slug_map, indent=2))
def create_all_mappings(force=False, frequency_to_save=100, response_to_check=None, date_to_check=None):
    """Write or update JSON file that maps from YouTube ID to Amara code and languages available

    force -- refresh every video regardless of cache state.
    frequency_to_save -- dump the JSON file to disk every N new entries.
    response_to_check -- only refresh cached entries whose stored "api_response"
        matches this value ("all" matches any response).
    date_to_check -- only refresh cached entries whose "last_attempt" is older
        than this date.
    """
    videos = get_node_cache("Video")

    # Initialize the data
    out_file = settings.SUBTITLES_DATA_ROOT + SRTS_JSON_FILENAME
    if not os.path.exists(out_file):
        srts_dict = {}
    else:
        # Open the file, read, and clean out old videos.
        try:
            with open(out_file, "r") as fp:
                srts_dict = json.load(fp)
        except Exception as e:
            # Corrupt/unreadable cache is non-fatal: start over from empty.
            logging.debug("JSON file corrupted, using empty json and starting from scratch.\n%s" % e)
            srts_dict = {}
        else:
            logging.info("Loaded %d mappings." % (len(srts_dict)))

        # Set of videos no longer used by KA Lite
        removed_videos = set(srts_dict.keys()) - set([v["youtube_id"] for v in videos.values()])
        if removed_videos:
            logging.info("Removing subtitle information for %d videos (no longer used)." % len(removed_videos))
            for vid in removed_videos:
                del srts_dict[vid]

    logging.info("Querying %d mappings." % (len(videos) - (0 if (force or date_to_check) else len(srts_dict))))

    # Once we have the current mapping, proceed through logic to update the mapping
    n_new_entries = 0
    n_failures = 0
    for video, data in videos.iteritems():  # Python 2 dict iteration
        # Decide whether or not to update this video based on the arguments provided at the command line
        youtube_id = data["youtube_id"]
        cached = youtube_id in srts_dict
        if not force and cached:
            # First, check against date
            flag_for_refresh = True  # not (response_code or last_attempt)
            last_attempt = srts_dict[youtube_id].get("last_attempt")
            last_attempt = None if not last_attempt else datetime.datetime.strptime(last_attempt, "%Y-%m-%d")
            flag_for_refresh = flag_for_refresh and (not date_to_check or date_to_check > last_attempt)
            if not flag_for_refresh:
                logging.debug("Skipping %s for date-check" % youtube_id)
                continue
            # Second, check against response code
            response_code = srts_dict[youtube_id].get("api_response")
            flag_for_refresh = flag_for_refresh and (
                not response_to_check
                or response_to_check == "all"
                or response_to_check == response_code
            )
            if not (flag_for_refresh):
                logging.debug("Skipping %s for response-code" % youtube_id)
                continue
            if not response_to_check and not date_to_check and cached:
                # no flags specified and already cached - skip
                logging.debug("Skipping %s for already-cached and no flags specified" % youtube_id)
                continue
        else:
            # Reaching here means the entry will be refreshed; log why.
            if force and not cached:
                logging.debug("Updating %s because force flag (-f) given and video not cached." % youtube_id)
            elif force and cached:
                logging.debug("Updating %s because force flag (-f) given. Video was previously cached." % youtube_id)
            else:
                logging.debug("Updating %s because video not yet cached." % youtube_id)

        # If it makes it to here without hitting a continue, then update the entry
        try:
            srts_dict[youtube_id] = update_video_entry(youtube_id, entry=srts_dict.get(youtube_id, {}))
        except Exception as e:
            # A single bad video shouldn't abort the whole run: count and move on.
            logging.warn("Error updating video %s: %s" % (youtube_id, e))
            n_failures += 1
            continue

        # Periodic checkpoint so a crash doesn't lose all progress.
        # NOTE(review): since n_new_entries starts at 0, this also dumps on the
        # very first successful update — presumably intentional; confirm.
        if n_new_entries % frequency_to_save == 0:
            logging.info("On loop %d dumping dictionary into %s" % (n_new_entries, out_file))
            with open(out_file, "wb") as fp:
                json.dump(srts_dict, fp)
        n_new_entries += 1

    # Finished the loop: save and report
    with open(out_file, "wb") as fp:
        json.dump(srts_dict, fp)
    if n_failures == 0:
        logging.info("Great success! Stored %d fresh entries, %d total." % (n_new_entries, len(srts_dict)))
    else:
        logging.warn("Stored %s fresh entries, but with %s failures." % (n_new_entries, n_failures))
import json
import os

import settings
from utils import topic_tools

# Module-level caches, built once at import time from the topic-tools helpers.
TOPICS = topic_tools.get_topic_tree()
NODE_CACHE = topic_tools.get_node_cache()
ID2SLUG_MAP = topic_tools.get_id2slug_map()
import json
import os

import settings
from utils import topic_tools

# Module-level caches, built once at import time from the topic-tools helpers.
TOPICS = topic_tools.get_topic_tree()
NODE_CACHE = topic_tools.get_node_cache()
EXERCISE_TOPICS = topic_tools.get_exercise_topics()
ID2SLUG_MAP = topic_tools.get_id2slug_map()

# Language metadata, loaded from static JSON files.
# FIX: use context managers + json.load instead of json.loads(open(...).read()),
# which left the file handles open (leaked until GC).
with open(os.path.join(settings.DATA_PATH, "languages.json")) as fp:
    LANGUAGE_LOOKUP = json.load(fp)
with open(os.path.join(settings.SUBTITLES_DATA_ROOT, "listedlanguages.json")) as fp:
    LANGUAGE_LIST = json.load(fp)
# NOTE(review): near-duplicate of the create_all_mappings defined earlier in
# this file (only quoting/whitespace differ).  The later definition shadows the
# earlier one at import time — consider deleting one copy.
def create_all_mappings(force=False, frequency_to_save=100, response_to_check=None, date_to_check=None):
    """Write or update JSON file that maps from YouTube ID to Amara code and languages available

    force -- refresh every video regardless of cache state.
    frequency_to_save -- dump the JSON file to disk every N new entries.
    response_to_check -- only refresh cached entries whose stored "api_response"
        matches this value ("all" matches any response).
    date_to_check -- only refresh cached entries whose "last_attempt" is older
        than this date.
    """
    videos = get_node_cache('Video')

    # Initialize the data
    out_file = settings.SUBTITLES_DATA_ROOT + SRTS_JSON_FILENAME
    if not os.path.exists(out_file):
        srts_dict = {}
    else:
        # Open the file, read, and clean out old videos.
        try:
            with open(out_file, "r") as fp:
                srts_dict = json.load(fp)
        except Exception as e:
            # Corrupt/unreadable cache is non-fatal: start over from empty.
            logging.debug("JSON file corrupted, using empty json and starting from scratch.\n%s" % e)
            srts_dict = {}
        else:
            logging.info("Loaded %d mappings." % (len(srts_dict)))

        # Set of videos no longer used by KA Lite
        removed_videos = set(srts_dict.keys()) - set([v["youtube_id"] for v in videos.values()])
        if removed_videos:
            logging.info("Removing subtitle information for %d videos (no longer used)." % len(removed_videos))
            for vid in removed_videos:
                del srts_dict[vid]

    logging.info("Querying %d mappings." % (len(videos) - (0 if (force or date_to_check) else len(srts_dict))))

    # Once we have the current mapping, proceed through logic to update the mapping
    n_new_entries = 0
    n_failures = 0
    for video, data in videos.iteritems():  # Python 2 dict iteration
        # Decide whether or not to update this video based on the arguments provided at the command line
        youtube_id = data['youtube_id']
        cached = youtube_id in srts_dict
        if not force and cached:
            # First, check against date
            flag_for_refresh = True  # not (response_code or last_attempt)
            last_attempt = srts_dict[youtube_id].get("last_attempt")
            last_attempt = None if not last_attempt else datetime.datetime.strptime(last_attempt, '%Y-%m-%d')
            flag_for_refresh = flag_for_refresh and (not date_to_check or date_to_check > last_attempt)
            if not flag_for_refresh:
                logging.debug("Skipping %s for date-check" % youtube_id)
                continue
            # Second, check against response code
            response_code = srts_dict[youtube_id].get("api_response")
            flag_for_refresh = flag_for_refresh and (not response_to_check or response_to_check == "all" or response_to_check == response_code)
            if not (flag_for_refresh):
                logging.debug("Skipping %s for response-code" % youtube_id)
                continue
            if not response_to_check and not date_to_check and cached:
                # no flags specified and already cached - skip
                logging.debug("Skipping %s for already-cached and no flags specified" % youtube_id)
                continue
        else:
            # Reaching here means the entry will be refreshed; log why.
            if force and not cached:
                logging.debug("Updating %s because force flag (-f) given and video not cached." % youtube_id)
            elif force and cached:
                logging.debug("Updating %s because force flag (-f) given. Video was previously cached." % youtube_id)
            else:
                logging.debug("Updating %s because video not yet cached." % youtube_id)

        # If it makes it to here without hitting a continue, then update the entry
        try:
            srts_dict[youtube_id] = update_video_entry(youtube_id, entry=srts_dict.get(youtube_id, {}))
        except Exception as e:
            # A single bad video shouldn't abort the whole run: count and move on.
            logging.warn("Error updating video %s: %s" % (youtube_id, e))
            n_failures += 1
            continue

        # Periodic checkpoint so a crash doesn't lose all progress.
        # NOTE(review): since n_new_entries starts at 0, this also dumps on the
        # very first successful update — presumably intentional; confirm.
        if n_new_entries % frequency_to_save == 0:
            logging.info("On loop %d dumping dictionary into %s" % (n_new_entries, out_file))
            with open(out_file, 'wb') as fp:
                json.dump(srts_dict, fp)
        n_new_entries += 1

    # Finished the loop: save and report
    with open(out_file, 'wb') as fp:
        json.dump(srts_dict, fp)
    if n_failures == 0:
        logging.info("Great success! Stored %d fresh entries, %d total." % (n_new_entries, len(srts_dict)))
    else:
        logging.warn("Stored %s fresh entries, but with %s failures." % (n_new_entries, n_failures))
def rebuild_topictree(data_path=settings.PROJECT_PATH + "/static/data/", remove_unknown_exercises=False):
    """ Downloads topictree (and supporting) data from Khan Academy and uses it to
    rebuild the KA Lite topictree cache (topics.json).

    data_path -- directory where the rebuilt topics file is written.
    remove_unknown_exercises -- when True, prune exercises not present in the
        existing local node cache (and any topics left childless by the prune).
    Returns the rebuilt topictree dict (also written to disk as a side effect).
    """
    topictree = download_khan_data("http://www.khanacademy.org/api/v1/topictree")

    related_exercise = {}  # Temp variable to save exercises related to particular videos

    def recurse_nodes(node, path=""):
        """
        Internal function for recursing over the topic tree, marking relevant metadata,
        and removing undesired attributes and children.
        Mutates `node` in place and returns the set of kinds found in its subtree.
        """
        kind = node["kind"]

        # Only keep key data we can use
        for key in node.keys():
            if key not in attribute_whitelists[kind]:
                del node[key]

        # Fix up data
        if slug_key[kind] not in node:
            logging.warn("Could not find expected slug key (%s) on node: %s" % (slug_key[kind], node))
            node[slug_key[kind]] = node["id"]  # put it SOMEWHERE.
        node["slug"] = node[slug_key[kind]] if node[slug_key[kind]] != "root" else ""
        node["id"] = node["slug"]  # these used to be the same; now not. Easier if they stay the same (issue #233)
        node["path"] = path + topic_tools.kind_slugs[kind] + node["slug"] + "/"
        node["title"] = node[title_key[kind]]

        kinds = set([kind])

        # For each exercise, need to get related videos
        if kind == "Exercise":
            related_video_readable_ids = [vid["readable_id"] for vid in download_khan_data("http://www.khanacademy.org/api/v1/exercises/%s/videos" % node["name"], node["name"] + ".json")]
            node["related_video_readable_ids"] = related_video_readable_ids
            exercise = {
                "slug": node[slug_key[kind]],
                "title": node[title_key[kind]],
                "path": node["path"],
            }
            # Record this exercise against every video it relates to, so videos
            # can be annotated with a related exercise later.
            for video_id in node.get("related_video_readable_ids", []):
                related_exercise[video_id] = exercise

        # Recurse through children, remove any blacklisted items
        children_to_delete = []
        for i, child in enumerate(node.get("children", [])):
            child_kind = child.get("kind", None)
            if child_kind in kind_blacklist:
                children_to_delete.append(i)
                continue
            if child[slug_key[child_kind]] in slug_blacklist:
                children_to_delete.append(i)
                continue
            kinds = kinds.union(recurse_nodes(child, node["path"]))
        # Delete in reverse index order so earlier indices stay valid.
        for i in reversed(children_to_delete):
            del node["children"][i]

        # Mark on topics whether they contain Videos, Exercises, or both
        if kind == "Topic":
            node["contains"] = list(kinds)

        return kinds
    recurse_nodes(topictree)

    # Limit exercises to only the previous list
    def recurse_nodes_to_delete_exercise(node, OLD_NODE_CACHE):
        """
        Internal function for recursing the topic tree and removing new exercises.
        Requires rebranding of metadata done by recurse_nodes function.
        """
        # Stop recursing when we hit leaves
        if node["kind"] != "Topic":
            return

        children_to_delete = []
        for ci, child in enumerate(node.get("children", [])):
            # Mark all unrecognized exercises for deletion
            if child["kind"] == "Exercise":
                if not child["slug"] in OLD_NODE_CACHE["Exercise"].keys():
                    children_to_delete.append(ci)
            # Recurse over children to delete
            elif child.get("children", None):
                recurse_nodes_to_delete_exercise(child, OLD_NODE_CACHE)
                # Delete children without children (all their children were removed)
                if not child.get("children", None):
                    logging.debug("Removing now-childless topic node '%s'" % child["slug"])
                    children_to_delete.append(ci)
                # If there are no longer exercises, be honest about it
                elif not any([ch["kind"] == "Exercise" or "Exercise" in ch.get("contains", []) for ch in child["children"]]):
                    child["contains"] = list(set(child["contains"]) - set(["Exercise"]))
        # Do the actual deletion
        for i in reversed(children_to_delete):
            logging.debug("Deleting unknown exercise %s" % node["children"][i]["slug"])
            del node["children"][i]

    if remove_unknown_exercises:
        OLD_NODE_CACHE = topic_tools.get_node_cache()
        recurse_nodes_to_delete_exercise(topictree, OLD_NODE_CACHE)  # do this before [add related]
        # Drop related-exercise entries that point at exercises we just pruned.
        for vid, ex in related_exercise.items():
            if ex and ex["slug"] not in OLD_NODE_CACHE["Exercise"].keys():
                related_exercise[vid] = None

    def recurse_nodes_to_add_related_exercise(node):
        """
        Internal function for recursing the topic tree and marking related exercises.
        Requires rebranding of metadata done by recurse_nodes function.
        """
        if node["kind"] == "Video":
            node["related_exercise"] = related_exercise.get(node["slug"], None)
        for child in node.get("children", []):
            recurse_nodes_to_add_related_exercise(child)
    recurse_nodes_to_add_related_exercise(topictree)

    def recurse_nodes_to_remove_childless_nodes(node):
        """
        When we remove exercises, we remove dead-end topics.
        Khan just sends us dead-end topics, too.
        Let's remove those too.
        """
        children_to_delete = []
        for ci, child in enumerate(node.get("children", [])):
            # Mark all unrecognized exercises for deletion
            if child["kind"] != "Topic":
                continue
            # Depth-first: prune grandchildren before deciding this child's fate.
            recurse_nodes_to_remove_childless_nodes(child)
            if not child.get("children"):
                children_to_delete.append(ci)
                logging.debug("Removing KA childless topic: %s" % child["slug"])
        for ci in reversed(children_to_delete):
            del node["children"][ci]
    recurse_nodes_to_remove_childless_nodes(topictree)  # Do the actual deletion

    # Persist the rebuilt tree as pretty-printed JSON.
    with open(os.path.join(data_path, topic_tools.topics_file), "w") as fp:
        fp.write(json.dumps(topictree, indent=2))

    return topictree