def dubbed_video_data_from_api(lang_code):
    """Map original YouTube ids to their translated counterparts.

    Builds a ``Khan`` client for ``lang_code``, lists its videos and keeps
    only the entries whose ``translated_youtube_id`` differs from
    ``youtube_id``.

    Args:
        lang_code: Language code passed to ``Khan(lang=...)``.

    Returns:
        A dict of ``youtube_id -> translated_youtube_id``.
    """
    client = Khan(lang=lang_code)
    dubbed = {}
    for video in client.get_videos():
        original_id = video["youtube_id"]
        translated_id = video["translated_youtube_id"]
        if original_id != translated_id:
            dubbed[original_id] = translated_id
    return dubbed
def retrieve_API_data(channel=None):
    """Fetch the Khan topic tree, exercises, assessment items and videos.

    As a side effect, the JSON file at ``REMOTE_VIDEO_SIZE_FILEPATH`` is
    read (if possible), merged with freshly queried remote file sizes and
    written back sorted by key.

    Args:
        channel: Not used by this function.

    Returns:
        A tuple ``(topic_tree, exercises, assessment_items, content)``.
        ``assessment_items`` only contains items whose response carried a
        truthy ``"item_data"``; items that keep failing are dropped after
        more than 5 failed attempts.
    """
    # TODO(jamalex): See how much of what we do here can be replaced by KA's new projection-based API
    # http://www.khanacademy.org/api/v2/topics/topictree?projection={"topics":[{"slug":1,"childData":[{"id":1}]}]}

    khan = Khan()

    logging.info("Fetching Khan topic tree")
    topic_tree = khan.get_topic_tree()

    logging.info("Fetching Khan exercises")
    exercises = khan.get_exercises()
    # NOTE(review): a second, separate fetch is used to drive the assessment
    # item download below; presumably so the returned ``exercises`` objects
    # are left untouched -- confirm before merging the two.
    exercises_dummy = khan.get_exercises()

    logging.info("Fetching Khan videos")
    content = khan.get_videos()

    # Hack to hardcode the mp4 format flag on Videos.
    for con in content:
        con["format"] = "mp4"

    # Compute and save file sizes
    logging.info("Checking remote content file sizes...")
    try:
        with open(REMOTE_VIDEO_SIZE_FILEPATH, "r") as fp:
            old_sizes = json.load(fp)
    except (IOError, OSError, ValueError):
        # Missing/unreadable file or invalid JSON: start from an empty cache.
        # (Narrowed from a bare ``except`` so that e.g. KeyboardInterrupt is
        # no longer swallowed.)
        old_sizes = {}

    # exclude any we already know about
    blacklist = [key for key, val in old_sizes.items() if val > 0]
    sizes_by_id, _ = query_remote_content_file_sizes(content, blacklist=blacklist)
    ensure_dir(os.path.dirname(REMOTE_VIDEO_SIZE_FILEPATH))
    old_sizes.update(sizes_by_id)
    sizes = OrderedDict(sorted(old_sizes.items()))
    with open(REMOTE_VIDEO_SIZE_FILEPATH, "w") as fp:
        json.dump(sizes, fp, indent=2)
    logging.info("Finished checking remote content file sizes...")

    assessment_items = []

    # Limit number of simultaneous requests
    semaphore = threading.BoundedSemaphore(100)

    def fetch_assessment_data(exercise):
        """Download every assessment item of ``exercise`` with retries."""
        logging.info("Fetching Assessment Item Data for {exercise}".format(exercise=exercise.display_name))
        for assessment_item in exercise.all_assessment_items:
            counter = 0
            wait = 5  # seconds; doubled after each failed attempt, 0 means done
            while wait:
                try:
                    # ``with`` guarantees the permit is returned even when the
                    # request raises something other than the errors handled
                    # below, so a crashing thread cannot starve the others.
                    with semaphore:
                        logging.info("Fetching assessment item {assessment}".format(assessment=assessment_item["id"]))
                        assessment_data = khan.get_assessment_item(assessment_item["id"])
                except (requests.ConnectionError, requests.Timeout):
                    time.sleep(wait)
                    wait = wait * 2
                    counter += 1
                else:
                    if assessment_data.get("item_data"):
                        wait = 0
                        assessment_items.append(assessment_data)
                    else:
                        logging.info("Fetching assessment item {assessment} failed retrying in {wait}".format(assessment=assessment_item["id"], wait=wait))
                        time.sleep(wait)
                        wait = wait * 2
                        counter += 1
                if counter > 5:
                    logging.info("Fetching assessment item {assessment} failed more than 5 times, aborting".format(assessment=assessment_item["id"]))
                    break

    # One thread per exercise; actual request concurrency is capped by the
    # semaphore above.
    threads = [threading.Thread(target=fetch_assessment_data, args=(exercise,)) for exercise in exercises_dummy]

    for thread in threads:
        thread.start()

    for thread in threads:
        thread.join()

    return topic_tree, exercises, assessment_items, content
def retrieve_API_data(channel=None):
    """Download Khan topic tree, exercises, assessment items and videos.

    The function also updates the remote video size cache: the JSON file
    at ``REMOTE_VIDEO_SIZE_FILEPATH`` is loaded when readable, extended
    with the result of ``query_remote_content_file_sizes`` and rewritten
    with its keys sorted.

    Args:
        channel: Not used by this function.

    Returns:
        A tuple ``(topic_tree, exercises, assessment_items, content)``.
        Only assessment items whose response has a truthy ``"item_data"``
        are collected; an item is abandoned after more than 5 failed
        attempts.
    """
    # TODO(jamalex): See how much of what we do here can be replaced by KA's new projection-based API
    # http://www.khanacademy.org/api/v2/topics/topictree?projection={"topics":[{"slug":1,"childData":[{"id":1}]}]}

    khan = Khan()

    logging.info("Fetching Khan topic tree")
    topic_tree = khan.get_topic_tree()

    logging.info("Fetching Khan exercises")
    exercises = khan.get_exercises()
    # NOTE(review): fetched a second time only to iterate assessment items;
    # presumably keeps the returned ``exercises`` unmodified -- confirm.
    exercises_dummy = khan.get_exercises()

    logging.info("Fetching Khan videos")
    content = khan.get_videos()

    # Hack to hardcode the mp4 format flag on Videos.
    for con in content:
        con["format"] = "mp4"

    # Compute and save file sizes
    logging.info("Checking remote content file sizes...")
    try:
        with open(REMOTE_VIDEO_SIZE_FILEPATH, "r") as fp:
            old_sizes = json.load(fp)
    except (IOError, OSError, ValueError):
        # No usable cache on disk (missing file, I/O error or bad JSON).
        # A bare ``except`` here used to hide unrelated errors as well.
        old_sizes = {}

    # exclude any we already know about
    blacklist = [key for key, val in old_sizes.items() if val > 0]
    sizes_by_id, _ = query_remote_content_file_sizes(
        content, blacklist=blacklist)
    ensure_dir(os.path.dirname(REMOTE_VIDEO_SIZE_FILEPATH))
    old_sizes.update(sizes_by_id)
    sizes = OrderedDict(sorted(old_sizes.items()))
    with open(REMOTE_VIDEO_SIZE_FILEPATH, "w") as fp:
        json.dump(sizes, fp, indent=2)
    logging.info("Finished checking remote content file sizes...")

    assessment_items = []

    # Limit number of simultaneous requests
    semaphore = threading.BoundedSemaphore(100)

    def fetch_assessment_data(exercise):
        """Fetch all assessment items of ``exercise``, retrying with backoff."""
        logging.info("Fetching Assessment Item Data for {exercise}".format(
            exercise=exercise.display_name))
        for assessment_item in exercise.all_assessment_items:
            counter = 0
            wait = 5  # backoff in seconds; set to 0 once the item is stored
            while wait:
                try:
                    # Acquire/release through the context manager so that an
                    # unexpected exception cannot leak a semaphore permit.
                    with semaphore:
                        logging.info(
                            "Fetching assessment item {assessment}".format(
                                assessment=assessment_item["id"]))
                        assessment_data = khan.get_assessment_item(
                            assessment_item["id"])
                except (requests.ConnectionError, requests.Timeout):
                    time.sleep(wait)
                    wait = wait * 2
                    counter += 1
                else:
                    if assessment_data.get("item_data"):
                        wait = 0
                        assessment_items.append(assessment_data)
                    else:
                        logging.info(
                            "Fetching assessment item {assessment} failed retrying in {wait}"
                            .format(assessment=assessment_item["id"], wait=wait))
                        time.sleep(wait)
                        wait = wait * 2
                        counter += 1
                if counter > 5:
                    logging.info(
                        "Fetching assessment item {assessment} failed more than 5 times, aborting"
                        .format(assessment=assessment_item["id"]))
                    break

    # One worker thread per exercise; the semaphore bounds how many of them
    # talk to the API at the same time.
    threads = [
        threading.Thread(target=fetch_assessment_data, args=(exercise, ))
        for exercise in exercises_dummy
    ]

    for thread in threads:
        thread.start()

    for thread in threads:
        thread.join()

    return topic_tree, exercises, assessment_items, content
def dubbed_video_data_from_api(lang_code):
    """Return the dubbed-video id mapping for ``lang_code``.

    Queries ``Khan(lang=lang_code).get_videos()`` and collects, for every
    video whose ``translated_youtube_id`` is not equal to its
    ``youtube_id``, the pair ``youtube_id -> translated_youtube_id``.

    Args:
        lang_code: Language code handed to the ``Khan`` client.

    Returns:
        A dict keyed by ``youtube_id`` with ``translated_youtube_id`` values.
    """
    api = Khan(lang=lang_code)
    mapping = {}
    for entry in api.get_videos():
        # Skip videos that have no distinct translated version.
        if entry["youtube_id"] == entry["translated_youtube_id"]:
            continue
        mapping[entry["youtube_id"]] = entry["translated_youtube_id"]
    return mapping