Example #1
0
def retrieve_html_exercises(exercises: [str], lang: str, force=False) -> (str, [str]):
    """
    Download the html exercises and report which of them are translated.

    Every exercise is downloaded once per language code returned by
    ``get_lang_code_list(lang)`` and once for English. An exercise counts as
    translated as soon as one of the language downloads differs (byte
    comparison) from the English download.

    :param exercises: the exercise ids to download.
    :param lang: the language to look up codes for; it also names the
        sub-directory of ``build`` used as the download cache directory.
    :param force: forwarded to ``download_and_cache_file`` as ``ignorecache``.
    :return: a 2-tuple with the first element pointing to the path the
        exercise files are stored, and the second element a list of exercise
        ids that have html exercises.
    """
    build_root = os.path.join(os.getcwd(), "build")
    build_dir = os.path.join(build_root, lang)
    en_build_dir = os.path.join(build_root, EN_LANG_CODE)
    # NOTE(review): the host is fixed to the "es" subdomain while the language
    # is passed as a query parameter -- confirm this is intentional.
    url_template = ("https://es.khanacademy.org/"
                    "khan-exercises/exercises/{id}.html?lang={lang}")
    lang_codes = get_lang_code_list(lang)

    def _download_html_exercise(exercise_id):
        """
        Download an exercise and return its exercise id *if* the
        downloaded url from the selected language is different from the english version.

        Returns None when no language code produced a differing file. HTTP
        errors are logged and the next language code is tried.
        """
        # The English url does not depend on the language code: build it once.
        en_url = url_template.format(id=exercise_id, lang=EN_LANG_CODE)
        for lang_code in lang_codes:
            lang_url = url_template.format(id=exercise_id, lang=lang_code)
            try:
                lang_file = download_and_cache_file(lang_url, cachedir=build_dir, ignorecache=force)
                en_file = download_and_cache_file(en_url, cachedir=en_build_dir, ignorecache=force)
                if not filecmp.cmp(lang_file, en_file, shallow=False):
                    return exercise_id
            except requests.exceptions.HTTPError as e:
                logging.warning("Failed to fetch html for lang: {}, exercise {}, exception: {}".format(lang_code, exercise_id, e))
        return None

    # The context manager releases the worker threads once the (blocking)
    # map call has returned.
    with ThreadPool(processes=NUM_PROCESSES) as pool:
        translated_exercises = pool.map(_download_html_exercise, exercises)

    # filter out Nones, since it means the exercise was not translated or
    # could not be downloaded
    result = [e for e in translated_exercises if e]
    return (build_dir, result)
def retrieve_exercise_dict(lang=None, force=False) -> dict:
    """
    Download the exercise listing for a language and index it by exercise id.

    The listing is fetched once for every language code returned by
    ``get_lang_code_list(lang)`` and the results are concatenated.

    :param lang: the language handed to ``get_lang_code_list``.
    :param force: forwarded to ``download_exercise_data`` as ``ignorecache``.
    :return: a dict mapping each entry's ``"id"`` value to the entry itself.
        When an id occurs more than once the entry loaded last wins.
    """
    base_url = "https://www.khanacademy.org/api/internal/exercises"
    exercise_data = []
    for lang_code in get_lang_code_list(lang):
        # A falsy language code requests the listing without a lang parameter.
        query = "?lang={lang}".format(lang=lang_code) if lang_code else ""
        # NOTE(review): every language code uses the same filename; if
        # download_exercise_data caches by filename alone, later codes may
        # reuse the first download -- confirm.
        data_path = download_exercise_data(base_url + query, ignorecache=force, filename="exercises.json")
        with open(data_path, "r") as f:
            exercise_data += ujson.load(f)

    return {ex.get("id"): ex for ex in exercise_data}
Example #3
0
def retrieve_exercise_dict(lang=None, force=False) -> dict:
    """
    Build an id -> exercise mapping from the downloaded exercise listings.

    One listing is downloaded per language code returned by
    ``get_lang_code_list(lang)``; all listings are merged into one mapping.

    :param lang: the language handed to ``get_lang_code_list``.
    :param force: forwarded to ``download_exercise_data`` as ``ignorecache``.
    :return: a dict keyed by each entry's ``"id"`` value. If several entries
        share an id, the one loaded last is kept.
    """
    exercises_by_id = {}
    for lang_code in get_lang_code_list(lang):
        url = "https://www.khanacademy.org/api/internal/exercises"
        if lang_code:
            # Only a truthy language code is added to the query string.
            url += "?lang={lang}".format(lang=lang_code)
        # NOTE(review): the filename is the same for every language code; if
        # download_exercise_data caches by filename alone, later codes may
        # get the first download back -- confirm.
        exercise_data_path = download_exercise_data(url, ignorecache=force, filename="exercises.json")
        with open(exercise_data_path, "r") as f:
            for ex in ujson.load(f):
                exercises_by_id[ex.get("id")] = ex

    return exercises_by_id
Example #4
0
def retrieve_kalite_data(lang=EN_LANG_CODE,
                         force=False,
                         ka_domain=KA_DOMAIN,
                         no_dubbed_videos=False) -> list:
    """
    Retrieve the KA content data direct from KA.
    Note: use the same language code in the video, topic and exercises node data to prevent issues.

    One language may have multiple language codes or names, for example
    Swahili is listed under both "sw" and "swa", and Somali under both "som"
    and "so". The nodes of every code returned by ``get_lang_code_list(lang)``
    are therefore downloaded and concatenated. This function does not remove
    duplicates between the codes.

    :param lang: the language to retrieve the nodes for.
    :param force: forwarded to ``download_and_clean_kalite_data`` as
        ``ignorecache``.
    :param ka_domain: substituted into ``API_URL``.
    :param no_dubbed_videos: when true, ``add_dubbed_video_mappings`` is not
        applied.
    :return: the list of nodes of all language codes in download order. Unless
        ``lang`` is English or ``no_dubbed_videos`` is set, the list is the
        result of ``add_dubbed_video_mappings``.
    """
    # MUST: Get all possible language codes for the language.
    lang_codes = get_lang_code_list(lang)
    logging.info("Found %s language codes for the language %s.", lang_codes, lang)

    # The projection is the same for every language code: serialize it once.
    projection = json.dumps(PROJECTION_KEYS)

    node_data = []
    for lang_code in lang_codes:
        logging.info("  Processing language code %s...", lang_code)
        url = API_URL.format(projection=projection,
                             lang=lang_code,
                             ka_domain=ka_domain)
        node_data_path = download_and_clean_kalite_data(url,
                                                        lang=lang_code,
                                                        ignorecache=force,
                                                        filename="nodes.json")
        with open(node_data_path, 'r') as f:
            node_data.extend(ujson.load(f))

    if lang != EN_LANG_CODE and not no_dubbed_videos:
        node_data = add_dubbed_video_mappings(node_data, lang)
    return node_data
Example #5
0
def retrieve_kalite_data(lang=EN_LANG_CODE,
                         force=False,
                         ka_domain=KA_DOMAIN,
                         no_dubbed_videos=False) -> list:
    """
    Retrieve the KA content data direct from KA.
    Note: use the same language code in the video, topic and exercises node data to prevent issues.

    One language may have multiple language codes or names, for example
    Swahili is listed under both "sw" and "swa", and Somali under both "som"
    and "so". The nodes of every code returned by ``get_lang_code_list(lang)``
    are downloaded and merged; a topic path, exercise id or youtube id that
    was already collected is not added a second time.

    :param lang: the language to retrieve the nodes for.
    :param force: forwarded to ``download_and_clean_kalite_data`` as
        ``ignorecache``.
    :param ka_domain: substituted into ``API_URL``.
    :param no_dubbed_videos: when true, ``add_dubbed_video_mappings`` is not
        applied.
    :return: the merged list of topic, exercise and video nodes. Unless
        ``lang`` is English or ``no_dubbed_videos`` is set, the list is the
        result of ``add_dubbed_video_mappings``.
    """
    node_data = []
    # Sets give constant-time "already collected?" checks.
    seen_topic_paths = set()
    seen_exercise_ids = set()
    seen_youtube_ids = set()

    # MUST: Get all possible language codes for the language.
    lang_codes = get_lang_code_list(lang)
    logging.info("Found %s language codes for the language %s.", lang_codes, lang)

    # The projection is the same for every language code: serialize it once.
    projection = json.dumps(PROJECTION_KEYS)

    # Loop-thru all lang codes and populate the node list while checking for duplicates.
    for lang_code in lang_codes:
        logging.info("  Processing language code %s...", lang_code)
        url = API_URL.format(projection=projection,
                             lang=lang_code,
                             ka_domain=ka_domain)
        node_data_path = download_and_clean_kalite_data(url,
                                                        lang=lang_code,
                                                        ignorecache=force,
                                                        filename="nodes.json")
        with open(node_data_path, 'r') as f:
            nodes = ujson.load(f)

        for node in nodes:
            kind = node.get("kind")
            if kind == NodeType.topic:
                if node["path"] not in seen_topic_paths:
                    seen_topic_paths.add(node["path"])
                    node_data.append(node)
            elif kind == NodeType.exercise:
                if node["id"] not in seen_exercise_ids:
                    seen_exercise_ids.add(node["id"])
                    node_data.append(node)
            elif kind == NodeType.video:
                youtube_id = node["youtube_id"]
                if youtube_id in seen_youtube_ids:
                    continue
                youtube_lang = node["translated_youtube_lang"]
                if youtube_lang != lang:
                    if youtube_lang == EN_LANG_CODE:
                        # English-tagged video while another language was
                        # requested: it is left out.
                        continue
                    # Some translated_youtube_lang values returned by the
                    # Khan API do not match the specified language code, so
                    # override them to use the same language code.
                    # Example: using the pt-BR language code in the Khan API
                    # returns "pt" as translated_youtube_lang.
                    node["translated_youtube_lang"] = lang
                seen_youtube_ids.add(youtube_id)
                node_data.append(node)

    if lang != EN_LANG_CODE and not no_dubbed_videos:
        node_data = add_dubbed_video_mappings(node_data, lang)
    return node_data
def retrieve_kalite_data(lang=EN_LANG_CODE, force=False, ka_domain=KA_DOMAIN, no_dubbed_videos=False) -> list:
    """
    Retrieve the KA content data direct from KA.
    Note: use the same language code in the video, topic and exercises node data to prevent issues.

    Get all possible language codes for the language because one language may
    have multiple language codes or names:

    Example 1: Swahili is listed under "sw" and under "swa".
    Example 2: Somali is listed under "som" and under "so".

    The nodes of all those codes are downloaded and merged. Topics are
    de-duplicated by "path", exercises by "id" and videos by "youtube_id".

    :param lang: the language to retrieve the nodes for.
    :param force: forwarded to ``download_and_clean_kalite_data`` as
        ``ignorecache``.
    :param ka_domain: substituted into ``API_URL``.
    :param no_dubbed_videos: when true, ``add_dubbed_video_mappings`` is not
        applied.
    :return: the merged list of nodes. Unless ``lang`` is English or
        ``no_dubbed_videos`` is set, the list is the result of
        ``add_dubbed_video_mappings``.
    """
    node_data = []
    # Keys collected so far; sets keep the duplicate checks constant-time.
    topic_paths = set()
    exercise_ids = set()
    youtube_ids = set()

    # MUST: Get all possible language codes for the language.
    lang_codes = get_lang_code_list(lang)
    logging.info("Found %s language codes for the language %s.", lang_codes, lang)

    # Identical for every request, so it is serialized outside the loop.
    projection = json.dumps(PROJECTION_KEYS)

    # Loop-thru all lang codes and populate the node list while checking for duplicates.
    for lang_code in lang_codes:
        logging.info("  Processing language code %s...", lang_code)
        url = API_URL.format(projection=projection, lang=lang_code, ka_domain=ka_domain)
        node_data_path = download_and_clean_kalite_data(url, lang=lang_code, ignorecache=force, filename="nodes.json")
        with open(node_data_path, "r") as f:
            node_data_temp = ujson.load(f)

        for node_temp in node_data_temp:
            node_kind = node_temp.get("kind")
            if node_kind == NodeType.topic:
                if node_temp["path"] not in topic_paths:
                    topic_paths.add(node_temp["path"])
                    node_data.append(node_temp)
            elif node_kind == NodeType.exercise:
                if node_temp["id"] not in exercise_ids:
                    exercise_ids.add(node_temp["id"])
                    node_data.append(node_temp)
            elif node_kind == NodeType.video:
                if node_temp["youtube_id"] not in youtube_ids:
                    youtube_lang = node_temp["translated_youtube_lang"]
                    if youtube_lang == lang:
                        youtube_ids.add(node_temp["youtube_id"])
                        node_data.append(node_temp)
                    elif youtube_lang != EN_LANG_CODE:
                        # Some translated_youtube_lang values returned by the
                        # Khan API do not match the specified language code.
                        # Override them to use the same language code.
                        # Example: using the pt-BR language code in the Khan
                        # API returns "pt" as translated_youtube_lang.
                        youtube_ids.add(node_temp["youtube_id"])
                        node_temp["translated_youtube_lang"] = lang
                        node_data.append(node_temp)
                    # Remaining case: an English-tagged video while another
                    # language was requested -- it is not collected.

    if lang != EN_LANG_CODE and not no_dubbed_videos:
        node_data = add_dubbed_video_mappings(node_data, lang)
    return node_data