def _download_html_exercise(exercise_id):
    """
    Download an exercise and return its exercise id *if* the downloaded url
    from the selected language is different from the english version.
    """
    try:
        for lang in lang_codes:
            lang_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang=lang)
            en_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang=EN_LANG_CODE)
            try:
                lang_file = download_and_cache_file(lang_url, cachedir=BUILD_DIR,
                                                    ignorecache=force)
                en_file = download_and_cache_file(en_url, cachedir=EN_BUILD_DIR,
                                                  ignorecache=force)
                if not filecmp.cmp(lang_file, en_file, shallow=False):
                    return exercise_id
            except requests.exceptions.HTTPError as e:
                logging.warning(
                    "Failed to fetch html for lang: {}, exercise {}, exception: {}"
                    .format(lang, exercise_id, e))
    except requests.exceptions.HTTPError as e:
        logging.warning(
            "Failed to fetch exercise for lang_codes: {}, exception: {}"
            .format(lang_codes, e))
    return None
def retrieve_subtitles(videos: list, lang="en", force=False) -> list:
    """Return the list of youtube ids whose subtitles were downloaded."""
    # videos contains a list of youtube ids
    downloaded_videos = []
    not_downloaded_videos = []
    for youtube_id in videos:
        request_url = ("https://www.amara.org/api2/partners/videos/"
                       "?format=json&video_url=http://www.youtube.com/watch?v=%s"
                       % youtube_id)
        try:
            response = requests.get(request_url)
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            print("Skipping {}".format(youtube_id))
            continue
        content = ujson.loads(response.content)
        if not content["objects"]:
            not_downloaded_videos.append(youtube_id)
            continue
        amara_id = content["objects"][0]["id"]
        subtitle_download_uri = ("https://www.amara.org/api/videos/%s/languages/%s/"
                                 "subtitles/?format=vtt" % (amara_id, lang))
        # probe the subtitle url first; skip videos with no track in this language
        try:
            urllib.request.urlopen(subtitle_download_uri)
        except urllib.error.HTTPError:
            continue
        file_dir = os.path.join(os.getcwd(), "build", "subtitles", lang)
        filename = "{}.vtt".format(youtube_id)
        download_and_cache_file(subtitle_download_uri, file_dir,
                                filename=filename, ignorecache=force)
        downloaded_videos.append(youtube_id)
    return downloaded_videos
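# A brief usage sketch for retrieve_subtitles (hypothetical; the youtube id
# below is a placeholder, not a real catalog entry):
def example_retrieve_subtitles():
    videos = ["placeholder0"]  # placeholder youtube ids
    fetched = retrieve_subtitles(videos, lang="pt", force=False)
    # fetched holds only the ids that had an Amara record and a downloadable
    # vtt track for the requested language
    return fetched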
def _download_html_exercise(exercise_id):
    """
    Download an exercise and return its exercise id *if* the downloaded url
    from the selected language is different from the english version.
    """
    lang_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang=lang)
    en_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang="en")
    try:
        lang_file = download_and_cache_file(lang_url, cachedir=BUILD_DIR,
                                            ignorecache=force)
        en_file = download_and_cache_file(en_url, cachedir=EN_BUILD_DIR,
                                          ignorecache=force)
        if not filecmp.cmp(lang_file, en_file, shallow=False):
            return exercise_id
    except urllib.error.HTTPError:
        return None
def _download_html_exercise(exercise_id):
    """
    Download an exercise and return its exercise id *if* the downloaded url
    from the selected language is different from the english version.
    """
    lang_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang=lang)
    en_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang="en")
    try:
        lang_file = download_and_cache_file(lang_url, cachedir=BUILD_DIR,
                                            ignorecache=force)
        en_file = download_and_cache_file(en_url, cachedir=EN_BUILD_DIR,
                                          ignorecache=force)
        if not filecmp.cmp(lang_file, en_file, shallow=False):
            return exercise_id
    except requests.exceptions.HTTPError as e:
        logging.warning("Failed to fetch html for exercise {}, exception: {}"
                        .format(exercise_id, e))
    return None
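# The _download_html_exercise variants above read lang, lang_codes, force,
# BUILD_DIR, EN_BUILD_DIR, EXERCISE_DOWNLOAD_URL_TEMPLATE and EN_LANG_CODE
# from an enclosing scope. A minimal sketch of that wiring, assuming a
# thread-pool driver; the wrapper name, pool size, and template value are
# illustrative assumptions, not taken from the source.
EN_LANG_CODE = "en"
EXERCISE_DOWNLOAD_URL_TEMPLATE = (
    "https://www.khanacademy.org/khan-exercises/exercises/"
    "{id}.html?lang={lang}")  # assumed endpoint shape

def download_html_exercises(exercise_ids, lang, force=False):
    from multiprocessing.pool import ThreadPool

    BUILD_DIR = os.path.join(os.getcwd(), "build", lang)
    EN_BUILD_DIR = os.path.join(os.getcwd(), "build", EN_LANG_CODE)

    def _download_html_exercise(exercise_id):
        # same body as the variant directly above, closing over lang,
        # force, BUILD_DIR and EN_BUILD_DIR
        lang_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang=lang)
        en_url = EXERCISE_DOWNLOAD_URL_TEMPLATE.format(id=exercise_id, lang="en")
        try:
            lang_file = download_and_cache_file(lang_url, cachedir=BUILD_DIR,
                                                ignorecache=force)
            en_file = download_and_cache_file(en_url, cachedir=EN_BUILD_DIR,
                                              ignorecache=force)
            if not filecmp.cmp(lang_file, en_file, shallow=False):
                return exercise_id
        except requests.exceptions.HTTPError:
            return None

    pool = ThreadPool(processes=8)
    results = pool.map(_download_html_exercise, exercise_ids)
    # keep only the ids whose localized html differs from the english source
    return [eid for eid in results if eid]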
def retrieve_translations(crowdin_project_name, crowdin_secret_key,
                          lang_code=EN_LANG_CODE, force=False,
                          includes="*.po") -> Catalog:
    request_url_template = ("https://api.crowdin.com/api/"
                            "project/{project_id}/download/"
                            "{lang_code}.zip?key={key}")
    export_url_template = ("https://api.crowdin.com/api/"
                           "project/{project_id}/export/"
                           "{lang_code}.zip?key={key}")
    request_url = request_url_template.format(
        project_id=crowdin_project_name,
        lang_code=lang_code,
        key=crowdin_secret_key,
    )
    export_url = export_url_template.format(
        project_id=crowdin_project_name,
        lang_code=lang_code,
        key=crowdin_secret_key,
    )

    logging.info("requesting CrowdIn to rebuild latest translations.")
    try:
        requests.get(export_url)
    except requests.exceptions.RequestException as e:
        logging.warning(
            "Got exception when building CrowdIn translations: {}".format(e))

    logging.debug("Retrieving translations from {}".format(request_url))
    zip_path = download_and_cache_file(request_url, ignorecache=force)
    zip_extraction_path = tempfile.mkdtemp()

    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(zip_extraction_path)

    all_filenames = glob.iglob(os.path.join(zip_extraction_path, "**"),
                               recursive=True)
    filenames = fnmatch.filter(all_filenames, includes)

    # use the polib library, since it's much faster at concatenating
    # po files. it doesn't have a dict interface though, so we'll
    # reread the file using babel.Catalog.
    with tempfile.NamedTemporaryFile() as f:
        main_pofile = polib.POFile(fpath=f.name)

        for filename in filenames:
            pofile = polib.pofile(filename)
            main_pofile.merge(pofile)

        for entry in main_pofile:
            entry.obsolete = False

        main_pofile.save()

    shutil.rmtree(zip_extraction_path)

    msgid_mapping = Catalog(main_pofile)
    return msgid_mapping
def _download_subtitle_data(youtube_id):
    logging.info("trying to download subtitle for %s" % youtube_id)
    request_url = ("https://www.amara.org/api2/partners/videos/"
                   "?format=json&video_url=http://www.youtube.com/watch?v=%s"
                   % youtube_id)
    try:
        amara_id_file = retrieve_subtitle_meta_data(
            request_url,
            filename="subtitles/meta_data/{youtube_id}".format(
                youtube_id=youtube_id))
        with open(amara_id_file, 'r') as f:
            amara_id = f.read()
        subtitle_download_uri = ("https://www.amara.org/api/videos/%s/languages/%s/"
                                 "subtitles/?format=vtt" % (amara_id, lang))
        filename = "subtitles/{lang}/{youtube_id}.vtt".format(
            lang=lang, youtube_id=youtube_id)
        subtitle_path = download_and_cache_file(subtitle_download_uri,
                                                filename=filename,
                                                ignorecache=False)
        logging.info("subtitle path: {}".format(subtitle_path))
        return youtube_id, subtitle_path
    except (requests.exceptions.RequestException, KeyError,
            urllib.error.HTTPError, urllib.error.URLError) as e:
        logging.info("got error while downloading subtitles: {}".format(e))
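# retrieve_subtitle_meta_data is referenced above but not defined here. A
# plausible minimal sketch, assuming it fetches the Amara lookup json,
# extracts the first object's id, and caches it under the given filename
# (the field names mirror the response shape used in retrieve_subtitles
# above; the build-directory layout is an assumption):
def retrieve_subtitle_meta_data_sketch(request_url, filename):
    response = requests.get(request_url)
    response.raise_for_status()
    content = ujson.loads(response.content)
    amara_id = content["objects"][0]["id"]  # raises if no Amara record exists
    path = os.path.join(os.getcwd(), "build", filename)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as f:
        f.write(amara_id)
    return path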
def retrieve_kalite_topic_data(url=None, force=False):
    """
    Retrieve the KA Lite topics.json file in the master branch.
    If url is given, download from that url instead.
    """
    if not url:
        url = ("https://raw.githubusercontent.com/learningequality/"
               "ka-lite/master/data/khan/topics.json")
    path = download_and_cache_file(url, ignorecache=force)
    with open(path) as f:
        return ujson.load(f)
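# topics.json is a nested tree of topic nodes. A small hypothetical walker
# (the "id" and "children" keys are assumptions based on the KA Lite topic
# tree layout):
def iter_topic_ids(topic_node):
    yield topic_node.get("id")
    for child in topic_node.get("children", []):
        yield from iter_topic_ids(child)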
def retrieve_translations(crowdin_project_name, crowdin_secret_key,
                          lang_code="en", force=False,
                          includes="*.po") -> polib.POFile:
    request_url_template = ("https://api.crowdin.com/api/"
                            "project/{project_id}/download/"
                            "{lang_code}.zip?key={key}")
    request_url = request_url_template.format(
        project_id=crowdin_project_name,
        lang_code=lang_code,
        key=crowdin_secret_key,
    )

    zip_path = download_and_cache_file(request_url, ignorecache=force)
    zip_extraction_path = tempfile.mkdtemp()

    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(zip_extraction_path)

    all_filenames = glob.iglob(os.path.join(zip_extraction_path, "**"),
                               recursive=True)
    filenames = fnmatch.filter(all_filenames, includes)

    # use the polib library, since it's much faster at concatenating po files
    with tempfile.NamedTemporaryFile() as f:
        main_pofile = polib.POFile(fpath=f.name)

        for filename in filenames:
            pofile = polib.pofile(filename)
            main_pofile.merge(pofile)

        for entry in main_pofile:
            entry.obsolete = False

        main_pofile.save()

    shutil.rmtree(zip_extraction_path)

    # add convenience dict for mapping a msgid to msgstr
    main_pofile.msgid_mapping = {m.msgid: m.msgstr
                                 for m in main_pofile if m.translated()}

    return main_pofile
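# Usage sketch for the polib-based variant above: msgid_mapping is a plain
# dict, so lookups can fall back to the untranslated source string. The
# project name and key are placeholders.
def example_translate(msgid):
    pofile = retrieve_translations("placeholder-project", "placeholder-key",
                                   lang_code="es")
    return pofile.msgid_mapping.get(msgid, msgid)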
def retrieve_assessment_item_data(assessment_item, lang=None, force=False) -> (dict, [str]):
    """
    Retrieve assessment item data and images for a single assessment item.

    :param assessment_item: id of assessment item
    :param lang: language to retrieve data in
    :param force: refetch assessment item and images even if it exists on disk
    :return: tuple of dict of assessment item data and list of paths to files
    """
    if lang:
        # escape {assessment_item} so formatting with lang alone doesn't
        # raise a KeyError; it is filled in below
        url = ("http://www.khanacademy.org/api/v1/assessment_items/"
               "{{assessment_item}}?lang={lang}").format(lang=lang)
        filename = "assessment_items/{{assessment_item}}_{lang}.json".format(lang=lang)
    else:
        url = "http://www.khanacademy.org/api/v1/assessment_items/{assessment_item}"
        filename = "assessment_items/{assessment_item}.json"

    try:
        url = url.format(assessment_item=assessment_item)
        filename = filename.format(assessment_item=assessment_item)
        path = download_assessment_item_data(url, filename=filename, lang=lang, force=force)
    except requests.RequestException:
        logging.error("Download failure for assessment item: {assessment_item}".format(
            assessment_item=assessment_item))
        raise

    with open(path, "r") as f:
        item_data = json.load(f)

    image_urls = find_all_image_urls(item_data)
    graphie_urls = find_all_graphie_urls(item_data)

    file_paths = []
    for url in itertools.chain(image_urls, graphie_urls):
        filename = MANUAL_IMAGE_URL_TO_FILENAME_MAPPING.get(url, os.path.basename(url))
        filepath = _get_subpath_from_filename(filename)
        file_paths.append(download_and_cache_file(url, filename=filepath))

    item_data = localize_image_urls(item_data)
    item_data = localize_content_links(item_data)
    item_data = localize_graphie_urls(item_data)

    return item_data, file_paths
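# _get_subpath_from_filename is referenced above but not shown. A minimal
# sketch of what such a helper might do (fan cached images out into
# subdirectories so one directory does not accumulate thousands of files);
# the prefix scheme is an assumption, not the source's actual layout:
def _get_subpath_from_filename_sketch(filename):
    prefix = filename[:2] if len(filename) >= 2 else filename
    return os.path.join("assessment_resources", prefix, filename)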
def test_returns_existing_file(self):
    url = "https://google.com"
    path = download_and_cache_file(url)
    assert os.path.exists(path)
def _download_image_urls(url):
    filename = MANUAL_IMAGE_URL_TO_FILENAME_MAPPING.get(url, os.path.basename(url))
    filepath = _get_subpath_from_filename(filename)
    return download_and_cache_file(url, filename=filepath)
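# download_and_cache_file is the shared helper every function above depends
# on, but it is not shown here. A minimal sketch, assuming the signature
# implied by the call sites (url, optional cachedir, optional relative
# filename, ignorecache flag) and a simple exists-check cache; the real
# helper may differ (retries, atomic writes, checksums):
def download_and_cache_file_sketch(url, cachedir=None, filename=None,
                                   ignorecache=False):
    cachedir = cachedir or os.path.join(os.getcwd(), "build")
    filename = filename or os.path.basename(url.split("?")[0])
    path = os.path.join(cachedir, filename)
    if os.path.exists(path) and not ignorecache:
        return path  # cache hit
    os.makedirs(os.path.dirname(path), exist_ok=True)
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    return path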