def retrieve_html_exercises(exercises: "list[str]", lang: str, force=False) -> "tuple[str, list[str]]":
    """Download translated html exercises and report which differ from English.

    Args:
        exercises: exercise ids to fetch.
        lang: target language code (expanded to all equivalent codes).
        force: when True, bypass the download cache.

    Returns:
        A 2-tuple: (path where the exercise files are stored, list of
        exercise ids whose translated html differs from the English version).
    """
    BUILD_DIR = os.path.join(os.getcwd(), "build", lang)
    EN_BUILD_DIR = os.path.join(os.getcwd(), "build", EN_LANG_CODE)
    EXERCISE_DOWNLOAD_URL_TEMPLATE = ("https://es.khanacademy.org/"
                                      "khan-exercises/exercises/{id}.html?lang={lang}")
    lang_codes = get_lang_code_list(lang)

    def _download_html_exercise(exercise_id):
        """Return *exercise_id* if any candidate language code's download
        differs from the English version, else None.
        """
        # The English URL does not depend on the language loop: hoist it.
        en_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang=EN_LANG_CODE)
        # Use a distinct loop name so we don't shadow the outer `lang` argument.
        for lang_code in lang_codes:
            lang_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang=lang_code)
            try:
                lang_file = download_and_cache_file(lang_url, cachedir=BUILD_DIR, ignorecache=force)
                en_file = download_and_cache_file(en_url, cachedir=EN_BUILD_DIR, ignorecache=force)
                # shallow=False: compare actual file contents, not just os.stat.
                if not filecmp.cmp(lang_file, en_file, shallow=False):
                    return exercise_id
            except requests.exceptions.HTTPError as e:
                logging.warning("Failed to fetch html for lang: {}, exercise {}, exception: {}".format(lang_code, exercise_id, e))
        # NOTE: the original also wrapped this loop in a second, unreachable
        # `except HTTPError` (the inner handler already catches it) -- removed.
        return None

    pool = ThreadPool(processes=NUM_PROCESSES)
    translated_exercises = pool.map(_download_html_exercise, exercises)
    # Drop Nones: download errors, or exercises identical to the English version.
    result = [e for e in translated_exercises if e]
    return (BUILD_DIR, result)
def retrieve_exercise_dict(lang=None, force=False) -> dict:
    """Download exercise data for every code of *lang* and map id -> exercise.

    Fixes the original (wrong) ``-> str`` return annotation: the function
    returns a dict. When several language codes yield the same exercise id,
    the last one processed wins.
    """
    lang_codes = get_lang_code_list(lang)
    exercise_data = []
    # Distinct loop name: the original rebound the `lang` parameter here.
    for lang_code in lang_codes:
        url = "https://www.khanacademy.org/api/internal/exercises" + (
            "?lang={lang}".format(lang=lang_code) if lang_code else "")
        # NOTE(review): every iteration caches to the same "exercises.json"
        # filename -- presumably download_exercise_data namespaces by URL or
        # lang; confirm, or later codes may reuse the first code's cache.
        exercise_data_path = download_exercise_data(url, ignorecache=force, filename="exercises.json")
        with open(exercise_data_path, "r") as f:
            exercise_data += ujson.load(f)
    return {ex.get("id"): ex for ex in exercise_data}
def retrieve_exercise_dict(lang=None, force=False) -> dict:
    """Fetch exercise data for all codes of *lang*; return {id: exercise}.

    The original annotated the return as ``str``, but the value is a dict
    comprehension -- corrected here. Ids appearing under multiple language
    codes are overwritten by the last code processed.
    """
    lang_codes = get_lang_code_list(lang)
    exercise_data = []
    # Renamed from `lang` to avoid shadowing the parameter.
    for lang_code in lang_codes:
        url = "https://www.khanacademy.org/api/internal/exercises" + (
            "?lang={lang}".format(lang=lang_code) if lang_code else "")
        # NOTE(review): the cache filename "exercises.json" is shared across
        # language codes -- verify download_exercise_data keys its cache on
        # the URL/lang, otherwise stale data may be reused.
        exercise_data_path = download_exercise_data(url, ignorecache=force, filename="exercises.json")
        with open(exercise_data_path, 'r') as f:
            exercise_data += ujson.load(f)
    return {ex.get("id"): ex for ex in exercise_data}
def retrieve_kalite_data(lang=EN_LANG_CODE, force=False, ka_domain=KA_DOMAIN, no_dubbed_videos=False) -> list:
    """Retrieve the KA content data direct from KA.

    Note: use the same language code in the video, topic and exercises node
    data to prevent issues.

    Args:
        lang: target language (may expand to several language codes).
        force: when True, bypass the download cache.
        ka_domain: KA domain used to build the API URL.
        no_dubbed_videos: when True, skip the dubbed-video mapping step.

    Returns:
        A list of node dicts aggregated over all language codes.
    """
    node_data = []
    # MUST: get all possible language codes for the language, because one
    # language may have multiple codes/names. E.g. Swahili is both "sw" and
    # "swa"; Somali is both "som" and "so".
    # (The original also declared topic_path_list/exercise_ids/youtube_ids
    # here, but this variant never used them -- removed.)
    lang_codes = get_lang_code_list(lang)
    logging.info("Found %s language codes for the language %s." % (lang_codes, lang,))
    for lang_code in lang_codes:
        logging.info(" Processing language code %s..." % lang_code)
        projection = json.dumps(PROJECTION_KEYS)
        url = API_URL.format(projection=projection, lang=lang_code, ka_domain=ka_domain)
        node_data_path = download_and_clean_kalite_data(url, lang=lang_code, ignorecache=force, filename="nodes.json")
        with open(node_data_path, 'r') as f:
            # extend replaces the original one-at-a-time append loop.
            node_data.extend(ujson.load(f))
    if lang != EN_LANG_CODE and not no_dubbed_videos:
        node_data = add_dubbed_video_mappings(node_data, lang)
    return node_data
def retrieve_kalite_data(lang=EN_LANG_CODE, force=False, ka_domain=KA_DOMAIN, no_dubbed_videos=False) -> list:
    """Retrieve the KA content data direct from KA, deduplicated per kind.

    Note: use the same language code in the video, topic and exercises node
    data to prevent issues.

    Args:
        lang: target language (may expand to several language codes).
        force: when True, bypass the download cache.
        ka_domain: KA domain used to build the API URL.
        no_dubbed_videos: when True, skip the dubbed-video mapping step.

    Returns:
        A list of topic/exercise/video node dicts, deduplicated across all
        language codes by path / id / youtube_id respectively.
    """
    node_data = []
    topic_path_list = []  # topic "path" values already collected
    exercise_ids = []     # exercise "id" values already collected
    youtube_ids = []      # video "youtube_id" values already collected
    # MUST: get all possible language codes for the language, because one
    # language may have multiple codes/names. E.g. Swahili is both "sw" and
    # "swa"; Somali is both "som" and "so".
    lang_codes = get_lang_code_list(lang)
    # Loop thru all lang codes and populate the topic, exercise and video
    # lists while skipping duplicates.
    logging.info("Found %s language codes for the language %s." % (lang_codes, lang,))
    for lang_code in lang_codes:
        logging.info(" Processing language code %s..." % lang_code)
        projection = json.dumps(PROJECTION_KEYS)
        url = API_URL.format(projection=projection, lang=lang_code, ka_domain=ka_domain)
        node_data_path = download_and_clean_kalite_data(url, lang=lang_code, ignorecache=force, filename="nodes.json")
        with open(node_data_path, 'r') as f:
            node_data_temp = ujson.load(f)
        for node_temp in node_data_temp:
            node_kind = node_temp.get("kind")
            # A node has exactly one kind, so elif is equivalent to the
            # original's three independent ifs.
            if node_kind == NodeType.topic:
                if node_temp["path"] not in topic_path_list:
                    topic_path_list.append(node_temp["path"])
                    node_data.append(node_temp)
            elif node_kind == NodeType.exercise:
                if node_temp["id"] not in exercise_ids:
                    exercise_ids.append(node_temp["id"])
                    node_data.append(node_temp)
            elif node_kind == NodeType.video:
                if node_temp["youtube_id"] not in youtube_ids:
                    youtube_lang = node_temp["translated_youtube_lang"]
                    if youtube_lang == lang:
                        youtube_ids.append(node_temp["youtube_id"])
                        node_data.append(node_temp)
                    elif youtube_lang != EN_LANG_CODE:
                        # Some translated_youtube_lang values returned from
                        # the KHAN API do not match the requested language
                        # code (e.g. requesting pt-BR returns pt). Override
                        # it so all nodes use the same language code.
                        # (This was a no-op string literal in the original.)
                        youtube_ids.append(node_temp["youtube_id"])
                        node_temp["translated_youtube_lang"] = lang
                        node_data.append(node_temp)
    if lang != EN_LANG_CODE and not no_dubbed_videos:
        node_data = add_dubbed_video_mappings(node_data, lang)
    return node_data
def retrieve_kalite_data(lang=EN_LANG_CODE, force=False, ka_domain=KA_DOMAIN, no_dubbed_videos=False) -> list:
    """Retrieve the KA content data direct from KA, deduplicated per kind.

    Note: use the same language code in the video, topic and exercises node
    data to prevent issues.

    Args:
        lang: target language (may expand to several language codes).
        force: when True, bypass the download cache.
        ka_domain: KA domain used to build the API URL.
        no_dubbed_videos: when True, skip the dubbed-video mapping step.

    Returns:
        A list of topic/exercise/video node dicts, deduplicated across all
        language codes by path / id / youtube_id respectively.
    """
    node_data = []
    topic_path_list = []  # topic "path" values already collected
    exercise_ids = []     # exercise "id" values already collected
    youtube_ids = []      # video "youtube_id" values already collected
    # MUST: get all possible language codes for the language, because one
    # language may have multiple codes/names. E.g. Swahili is both "sw" and
    # "swa"; Somali is both "som" and "so".
    lang_codes = get_lang_code_list(lang)
    # Loop thru all lang codes and populate the topic, exercise and video
    # lists while skipping duplicates.
    logging.info("Found %s language codes for the language %s." % (lang_codes, lang))
    for lang_code in lang_codes:
        logging.info(" Processing language code %s..." % lang_code)
        projection = json.dumps(PROJECTION_KEYS)
        url = API_URL.format(projection=projection, lang=lang_code, ka_domain=ka_domain)
        node_data_path = download_and_clean_kalite_data(url, lang=lang_code, ignorecache=force, filename="nodes.json")
        with open(node_data_path, "r") as f:
            node_data_temp = ujson.load(f)
        for node_temp in node_data_temp:
            node_kind = node_temp.get("kind")
            # A node has exactly one kind, so elif is equivalent to the
            # original's three independent ifs.
            if node_kind == NodeType.topic:
                if node_temp["path"] not in topic_path_list:
                    topic_path_list.append(node_temp["path"])
                    node_data.append(node_temp)
            elif node_kind == NodeType.exercise:
                if node_temp["id"] not in exercise_ids:
                    exercise_ids.append(node_temp["id"])
                    node_data.append(node_temp)
            elif node_kind == NodeType.video:
                if node_temp["youtube_id"] not in youtube_ids:
                    youtube_lang = node_temp["translated_youtube_lang"]
                    if youtube_lang == lang:
                        youtube_ids.append(node_temp["youtube_id"])
                        node_data.append(node_temp)
                    elif youtube_lang != EN_LANG_CODE:
                        # Some translated_youtube_lang values returned from
                        # the KHAN API do not match the requested language
                        # code (e.g. requesting pt-BR returns pt). Override
                        # it so all nodes use the same language code.
                        # (This was a no-op string literal in the original.)
                        youtube_ids.append(node_temp["youtube_id"])
                        node_temp["translated_youtube_lang"] = lang
                        node_data.append(node_temp)
    if lang != EN_LANG_CODE and not no_dubbed_videos:
        node_data = add_dubbed_video_mappings(node_data, lang)
    return node_data