def move_srts(lang_code):
    """
    Move a language pack's srt files out of the locale directory.

    Srts live in the locale directory, but that's not exposed at any URL.  So instead,
    we have to move the srts out to /static/subtitles/[lang_code]/
    (concretely: into the directory returned by ``get_srt_path``).

    Every ``*.srt`` file found in ``LOCALE_ROOT/<django lang dir>/subtitles`` is moved;
    a file that already exists at the destination is replaced.  Afterwards the
    source directory is removed, but only if it is empty.

    Args:
        lang_code: language code of the pack; converted with
            ``lcode_to_django_dir`` to locate source and destination directories.
    """
    lang_code_django = lcode_to_django_dir(lang_code)

    src_dir = os.path.join(LOCALE_ROOT, lang_code_django, "subtitles")
    dest_dir = get_srt_path(lang_code_django)
    ensure_dir(dest_dir)

    lang_subtitles = glob.glob(os.path.join(src_dir, "*.srt"))
    logging.info("Moving %d subtitles from %s to %s" % (len(lang_subtitles), src_dir, dest_dir))

    for fil in lang_subtitles:
        srt_dest_path = os.path.join(dest_dir, os.path.basename(fil))
        if os.path.exists(srt_dest_path):
            os.remove(srt_dest_path)  # we're going to replace any srt with a newer version
        shutil.move(fil, srt_dest_path)

    # Clean up the source directory, but never delete anything that was left behind.
    if not os.path.exists(src_dir):
        logging.info("No subtitles for language pack %s" % lang_code)
    elif os.listdir(src_dir):
        # logging.warn is a deprecated alias; warning() is the supported spelling.
        logging.warning("%s is not empty; will not remove. Please check that all subtitles were moved." % src_dir)
    else:
        logging.info("Removing empty source directory (%s)." % src_dir)
        shutil.rmtree(src_dir)
def move_srts(lang_code):
    """
    Move a language pack's srt files out of the locale directory.

    Srts live in the locale directory, but that's not exposed at any URL.  So instead,
    we have to move the srts out to /static/subtitles/[lang_code]/
    (concretely: into the directory returned by ``get_srt_path``).

    Every ``*.srt`` file found in ``LOCALE_ROOT/<django lang dir>/subtitles`` is moved;
    a file that already exists at the destination is replaced.  Afterwards the
    source directory is removed, but only if it is empty.

    Args:
        lang_code: language code of the pack; converted with
            ``lcode_to_django_dir`` to locate source and destination directories.
    """
    lang_code_django = lcode_to_django_dir(lang_code)

    src_dir = os.path.join(LOCALE_ROOT, lang_code_django, "subtitles")
    dest_dir = get_srt_path(lang_code_django)
    ensure_dir(dest_dir)

    lang_subtitles = glob.glob(os.path.join(src_dir, "*.srt"))
    logging.info("Moving %d subtitles from %s to %s" % (len(lang_subtitles), src_dir, dest_dir))

    for fil in lang_subtitles:
        srt_dest_path = os.path.join(dest_dir, os.path.basename(fil))
        if os.path.exists(srt_dest_path):
            os.remove(srt_dest_path)  # we're going to replace any srt with a newer version
        shutil.move(fil, srt_dest_path)

    # Clean up the source directory, but never delete anything that was left behind.
    if not os.path.exists(src_dir):
        logging.info("No subtitles for language pack %s" % lang_code)
    elif os.listdir(src_dir):
        # logging.warn is a deprecated alias; warning() is the supported spelling.
        logging.warning("%s is not empty; will not remove. Please check that all subtitles were moved." % src_dir)
    else:
        logging.info("Removing empty source directory (%s)." % src_dir)
        shutil.rmtree(src_dir)
def download_subtitle(request, lang_code, youtube_id):
    """Dummy function for capturing a video download request and logging to output, so we can collect stats.

    Logs one "sd;<ip>;<lang_code>;<youtube_id>" line to the "subtitles" stats
    logger, then returns the matching srt file as a plain-text attachment.

    Args:
        request: the incoming request; only used to obtain the client IP for the log line.
        lang_code: language code of the requested subtitle.
        youtube_id: id of the video whose subtitle is requested.

    Returns:
        An HttpResponse whose body is the content of the srt file, sent as an attachment.

    Raises:
        Http404: if nothing exists at the path returned by ``get_srt_path``.
    """
    # Log the info
    stats_logger("subtitles").info("sd;%s;%s;%s" % (get_request_ip(request), lang_code, youtube_id))

    # Find the file to return
    srt_filepath = get_srt_path(lang_code, youtube_id=youtube_id)
    if not os.path.exists(srt_filepath):
        raise Http404

    # Read the file under a context manager so the handle is always closed,
    # instead of handing an open file object to the response.
    with open(srt_filepath, "rb") as srt_file:
        srt_content = srt_file.read()

    # Send it back to the user.  Only content_type is passed: the mimetype
    # keyword was a redundant alias for it and no longer exists in newer Django.
    response = HttpResponse(content=srt_content, content_type="text/plain")
    response["Content-Disposition"] = 'attachment; filename="%s"' % os.path.basename(srt_filepath)
    return response
def clear_subtitles_cache(lang_codes=None, locale_root=LOCALE_ROOT):
    """
    Reset the subtitle download status and delete the srt files of some languages.

    Each language code is normalized with ``lcode_to_ietf`` before it is used.
    For every language, all entries of its language-map (status) file are reset
    to "not downloaded" and the file is rewritten; then the language's srt
    directory is removed if it exists.

    Args:
        lang_codes: iterable of language codes; when empty or None, the result
            of ``get_langs_with_subtitles()`` is used.
        locale_root: not used by this function.
    """
    if not lang_codes:
        lang_codes = get_langs_with_subtitles()

    for raw_code in lang_codes:
        ietf_code = lcode_to_ietf(raw_code)

        # Clear the status file: every known entry goes back to a blank record
        status_filepath = get_lang_map_filepath(ietf_code)
        statuses = softload_json(status_filepath, raises=True)
        for entry_key in statuses:
            statuses[entry_key] = {
                u'downloaded': False,
                u'last_success': u'',
                u'last_attempt': u'',
                u'api_response': u'',
            }
        with open(status_filepath, "w") as status_file:
            json.dump(statuses, status_file)

        # Delete all srt files
        srt_dir = get_srt_path(ietf_code)
        if not os.path.exists(srt_dir):
            continue
        shutil.rmtree(srt_dir)
def store_new_counts(lang_code, data_path=SUBTITLES_DATA_ROOT, locale_root=LOCALE_ROOT):
    """Write a new dictionary of srt file counts in respective download folders.

    Counts the ``*.srt`` files in the directory returned by
    ``get_srt_path(lang_code)`` and passes
    ``{<language name>: {"count": <n>, "code": lang_code}}`` to
    ``write_count_to_json``.  The write always happens, even when counting
    failed (in which case the dictionary is empty).

    Args:
        lang_code: language code whose subtitles are counted.
        data_path: forwarded unchanged to ``write_count_to_json``.
        locale_root: not used by this function.

    Returns:
        The number of srt files found, or 0 if the language name could not be
        resolved or counting failed.
    """
    language_subtitle_count = {}
    subtitles_path = get_srt_path(lang_code)
    lang_name = lang_code  # fallback label for the log message if the name lookup fails

    try:
        # The lookup lives inside the try so that the LanguageNameDoesNotExist
        # handler below can actually catch its failure.
        lang_name = get_language_name(lang_code)
        count = len(glob.glob("%s/*.srt" % subtitles_path))
        language_subtitle_count[lang_name] = {"count": count, "code": lang_code}
    except LanguageNameDoesNotExist as ldne:
        count = 0
        logging.debug(ldne)
    except Exception:
        # Deliberately best-effort, but no longer swallows KeyboardInterrupt / SystemExit.
        count = 0
        logging.info("%-4s subtitles for %-20s" % ("No", lang_name))

    # Always write to disk.
    write_count_to_json(language_subtitle_count, data_path)

    return count
def generate_srt_availability_file(lang_code):
    '''
    For compatibility with versions less than 0.10.3, we need to generate this
    json file that contains the srts for the videos.

    The file is written to
    STATIC_ROOT/data/subtitles/languages/<lang_code>_available_srts.json and has
    the shape {"srt_files": [<name>, ...]}, one name per entry of the
    language's srt directory with a trailing ".srt" extension removed.

    Args:
        lang_code: language code used for the file name and the srt directory lookup.

    Returns:
        The list of names that was written under "srt_files" (empty if the srt
        directory is missing or unreadable).
    '''
    # this path is a direct copy of the path found in the old function that generated this file.
    # Only the file name is %-formatted, so a "%" elsewhere in the path cannot break it.
    srts_file_dest_path = os.path.join(
        settings.STATIC_ROOT, 'data', 'subtitles', 'languages',
        "%s_available_srts.json" % lang_code,
    )
    ensure_dir(os.path.dirname(srts_file_dest_path))

    srts_path = get_srt_path(lang_code)  # not sure yet about this; change once command is complete
    try:
        files = os.listdir(srts_path)
    except OSError:  # directory doesnt exist or we cant read it
        files = []

    # Remove the ".srt" extension as a suffix.  str.rstrip(".srt") would strip
    # any trailing run of the characters ".", "s", "r", "t" and so mangle ids
    # that happen to end in one of those characters.
    srt_suffix = ".srt"
    yt_ids = [f[:-len(srt_suffix)] if f.endswith(srt_suffix) else f for f in files]
    srts_dict = {'srt_files': yt_ids}

    with open(srts_file_dest_path, 'wb') as fp:
        logging.debug('Creating %s', srts_file_dest_path)
        json.dump(srts_dict, fp)

    return yt_ids
def download_subtitle(request, lang_code, youtube_id):
    """Dummy function for capturing a video download request and logging to output, so we can collect stats.

    Logs one "sd;<ip>;<lang_code>;<youtube_id>" line to the "subtitles" stats
    logger, then returns the matching srt file as a plain-text attachment.

    Args:
        request: the incoming request; only used to obtain the client IP for the log line.
        lang_code: language code of the requested subtitle.
        youtube_id: id of the video whose subtitle is requested.

    Returns:
        An HttpResponse whose body is the content of the srt file, sent as an attachment.

    Raises:
        Http404: if nothing exists at the path returned by ``get_srt_path``.
    """
    # Log the info
    stats_logger("subtitles").info("sd;%s;%s;%s" % (get_request_ip(request), lang_code, youtube_id))

    # Find the file to return
    srt_filepath = get_srt_path(lang_code, youtube_id=youtube_id)
    if not os.path.exists(srt_filepath):
        raise Http404

    # Read the file under a context manager so the handle is always closed,
    # instead of handing an open file object to the response.
    with open(srt_filepath, "rb") as srt_file:
        srt_content = srt_file.read()

    # Send it back to the user.  Only content_type is passed: the mimetype
    # keyword was a redundant alias for it and no longer exists in newer Django.
    response = HttpResponse(content=srt_content, content_type='text/plain')
    response['Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(srt_filepath)
    return response
def stamp_availability_on_video(video, format="mp4", force=False, stamp_urls=True, videos_path=settings.CONTENT_ROOT):
    """
    Stamp all relevant urls and availability onto a video object (if necessary), including:
    * whether the video is available (on disk or online)

    The per-language availability dict is stored under video["availability"];
    the summary flags are stored under video["on_disk"] and video["available"].

    Arguments:
        video: dict-like video node. Its "id" is read, and an existing
            "availability" entry is reused unless `force` is set.
        format: video file extension; used for the on-disk check, the stream
            url and the "stream_type" value.
        force: if true, ignore any "availability" already on the video and
            start from an empty dict.
        stamp_urls: if true, also add stream/thumbnail urls and the stream type
            per language, plus the subtitle urls under the "en" entry.
        videos_path: directory passed to is_video_on_disk and searched for
            thumbnail files. The default is evaluated once, at definition time.

    Returns:
        The same `video` object, with the keys above set on it.
    """

    # Availability entry for one youtube id: just the "on_disk" flag.
    def compute_video_availability(youtube_id, format, videos_path=settings.CONTENT_ROOT):
        return {
            "on_disk": is_video_on_disk(youtube_id, format, videos_path=videos_path)
        }

    # Metadata entry for one youtube id: the stream type derived from the format.
    def compute_video_metadata(youtube_id, format):
        return {"stream_type": "video/%s" % format}

    # Stream/thumbnail urls for one youtube id; returns {} when no url can be
    # offered. thumb_formats is only read here, never mutated.
    def compute_video_urls(youtube_id, format, lang_code, on_disk=None, thumb_formats=["png", "jpg"], videos_path=videos_path):
        if on_disk is None:
            on_disk = is_video_on_disk(youtube_id, format, videos_path=videos_path)

        if on_disk:
            # Local copy: urls point below settings.CONTENT_URL
            video_base_url = settings.CONTENT_URL + youtube_id
            stream_url = video_base_url + ".%s" % format
            thumbnail_url = None  # default to None now, so we know when no thumbnail is available.

            for thumb_format in thumb_formats:  # find the thumbnail on disk
                thumb_filename = '%s.%s' % (youtube_id, thumb_format)
                thumb_filepath = os.path.join(videos_path, thumb_filename)
                if os.path.exists(thumb_filepath):
                    thumbnail_url = video_base_url + "." + thumb_format  # default
                    break  # the first thumbnail format found on disk wins

        elif settings.BACKUP_VIDEO_SOURCE and lang_code == "en":
            # Not on disk: fall back to the backup source, for "en" only.
            # Both settings are used as %-templates keyed by the names in dict_vals.
            dict_vals = {
                "youtube_id": youtube_id,
                "video_format": format,
                "thumb_format": thumb_formats[0]
            }
            stream_url = settings.BACKUP_VIDEO_SOURCE % dict_vals
            thumbnail_url = settings.BACKUP_THUMBNAIL_SOURCE % dict_vals if settings.BACKUP_THUMBNAIL_SOURCE else None

        else:
            return {}  # no URLs

        return {"stream": stream_url, "thumbnail": thumbnail_url}

    # With force, any availability already stored on the video is discarded.
    video_availability = video.get("availability", {}) if not force else {}
    en_youtube_id = get_youtube_id(video["id"], None)
    # video_map is iterated below as lang_code -> youtube_id
    video_map = get_id2oklang_map(video["id"]) or {}

    # NOTE(review): the entries written below are keyed by language code, so this
    # test looks for a *top-level* "on_disk" key. any_on_disk / any_available are
    # bound only when this branch runs but are read unconditionally at the end of
    # the function -- confirm "on_disk" can never be a top-level key here.
    if not "on_disk" in video_availability:
        for lang_code in video_map.keys():
            youtube_id = video_map[lang_code].encode('utf-8')
            video_availability[lang_code] = compute_video_availability(
                youtube_id, format=format, videos_path=videos_path)
        video_availability["en"] = video_availability.get(
            "en", {"on_disk": False})  # en should always be defined

        # Summarize status
        any_on_disk = any([
            lang_avail["on_disk"] for lang_avail in video_availability.values()
        ])
        # A configured backup source makes the video count as available even if nothing is on disk.
        any_available = any_on_disk or bool(settings.BACKUP_VIDEO_SOURCE)

    if stamp_urls:
        # Loop over all known dubbed videos
        for lang_code, youtube_id in video_map.iteritems():
            urls = compute_video_urls(
                youtube_id, format, lang_code,
                on_disk=video_availability[lang_code]["on_disk"],
                videos_path=videos_path)
            if urls:
                # Only add properties if anything is available.
                video_availability[lang_code].update(urls)
                video_availability[lang_code].update(
                    compute_video_metadata(youtube_id, format))

        # Get the (english) subtitle urls
        # Only languages whose srt file exists on disk get a url.
        subtitle_lang_codes = get_langs_with_subtitle(en_youtube_id)
        subtitles_tuple = [(lc, get_srt_url(en_youtube_id, lc)) for lc in subtitle_lang_codes if os.path.exists(get_srt_path(lc, en_youtube_id))]
        subtitles_urls = dict(subtitles_tuple)
        video_availability["en"]["subtitles"] = subtitles_urls

    # now scrub any values that don't actually exist
    # (entries that hold nothing but a falsy "on_disk" flag are dropped)
    for lang_code in video_availability.keys():
        if not video_availability[lang_code]["on_disk"] and len(
                video_availability[lang_code]) == 1:
            del video_availability[lang_code]

    # Now summarize some availability onto the video itself
    video["availability"] = video_availability
    video["on_disk"] = any_on_disk
    video["available"] = any_available

    return video
def stamp_availability_on_video(video, format="mp4", force=False, stamp_urls=True, videos_path=settings.CONTENT_ROOT):
    """
    Stamp all relevant urls and availability onto a video object (if necessary), including:
    * whether the video is available (on disk or online)

    The per-language availability dict is stored under video["availability"];
    the summary flags are stored under video["on_disk"] and video["available"].

    Arguments:
        video: dict-like video node. Its "id" is read, and an existing
            "availability" entry is reused unless `force` is set.
        format: video file extension; used for the on-disk check, the stream
            url and the "stream_type" value.
        force: if true, ignore any "availability" already on the video and
            start from an empty dict.
        stamp_urls: if true, also add stream/thumbnail urls and the stream type
            per language, plus the subtitle urls under the "en" entry.
        videos_path: directory passed to is_video_on_disk and searched for
            thumbnail files. The default is evaluated once, at definition time.

    Returns:
        The same `video` object, with the keys above set on it.
    """

    # Availability entry for one youtube id: just the "on_disk" flag.
    def compute_video_availability(youtube_id, format, videos_path=settings.CONTENT_ROOT):
        return {"on_disk": is_video_on_disk(youtube_id, format, videos_path=videos_path)}

    # Metadata entry for one youtube id: the stream type derived from the format.
    def compute_video_metadata(youtube_id, format):
        return {"stream_type": "video/%s" % format}

    # Stream/thumbnail urls for one youtube id; returns {} when no url can be
    # offered. thumb_formats is only read here, never mutated.
    def compute_video_urls(youtube_id, format, lang_code, on_disk=None, thumb_formats=["png", "jpg"], videos_path=videos_path):
        if on_disk is None:
            on_disk = is_video_on_disk(youtube_id, format, videos_path=videos_path)

        if on_disk:
            # Local copy: urls point below settings.CONTENT_URL
            video_base_url = settings.CONTENT_URL + youtube_id
            stream_url = video_base_url + ".%s" % format
            thumbnail_url = None  # default to None now, so we know when no thumbnail is available.

            for thumb_format in thumb_formats:  # find the thumbnail on disk
                thumb_filename = '%s.%s' % (youtube_id, thumb_format)
                thumb_filepath = os.path.join(videos_path, thumb_filename)
                if os.path.exists(thumb_filepath):
                    thumbnail_url = video_base_url + "." + thumb_format  # default
                    break  # the first thumbnail format found on disk wins

        elif settings.BACKUP_VIDEO_SOURCE and lang_code == "en":
            # Not on disk: fall back to the backup source, for "en" only.
            # Both settings are used as %-templates keyed by the names in dict_vals.
            dict_vals = {"youtube_id": youtube_id, "video_format": format, "thumb_format": thumb_formats[0] }
            stream_url = settings.BACKUP_VIDEO_SOURCE % dict_vals
            thumbnail_url = settings.BACKUP_THUMBNAIL_SOURCE % dict_vals if settings.BACKUP_THUMBNAIL_SOURCE else None

        else:
            return {}  # no URLs

        return {"stream": stream_url, "thumbnail": thumbnail_url}

    # With force, any availability already stored on the video is discarded.
    video_availability = video.get("availability", {}) if not force else {}
    en_youtube_id = get_youtube_id(video["id"], None)
    # video_map is iterated below as lang_code -> youtube_id
    video_map = get_id2oklang_map(video["id"]) or {}

    # NOTE(review): the entries written below are keyed by language code, so this
    # test looks for a *top-level* "on_disk" key. any_on_disk / any_available are
    # bound only when this branch runs but are read unconditionally at the end of
    # the function -- confirm "on_disk" can never be a top-level key here.
    if not "on_disk" in video_availability:
        for lang_code in video_map.keys():
            youtube_id = video_map[lang_code].encode('utf-8')
            video_availability[lang_code] = compute_video_availability(youtube_id, format=format, videos_path=videos_path)
        video_availability["en"] = video_availability.get("en", {"on_disk": False})  # en should always be defined

        # Summarize status
        any_on_disk = any([lang_avail["on_disk"] for lang_avail in video_availability.values()])
        # A configured backup source makes the video count as available even if nothing is on disk.
        any_available = any_on_disk or bool(settings.BACKUP_VIDEO_SOURCE)

    if stamp_urls:
        # Loop over all known dubbed videos
        for lang_code, youtube_id in video_map.iteritems():
            urls = compute_video_urls(youtube_id, format, lang_code, on_disk=video_availability[lang_code]["on_disk"], videos_path=videos_path)
            if urls:
                # Only add properties if anything is available.
                video_availability[lang_code].update(urls)
                video_availability[lang_code].update(compute_video_metadata(youtube_id, format))

        # Get the (english) subtitle urls
        # Only languages whose srt file exists on disk get a url.
        subtitle_lang_codes = get_langs_with_subtitle(en_youtube_id)
        subtitles_tuple = [(lc, get_srt_url(en_youtube_id, lc)) for lc in subtitle_lang_codes if os.path.exists(get_srt_path(lc, en_youtube_id))]
        subtitles_urls = dict(subtitles_tuple)
        video_availability["en"]["subtitles"] = subtitles_urls

    # now scrub any values that don't actually exist
    # (entries that hold nothing but a falsy "on_disk" flag are dropped)
    for lang_code in video_availability.keys():
        if not video_availability[lang_code]["on_disk"] and len(video_availability[lang_code]) == 1:
            del video_availability[lang_code]

    # Now summarize some availability onto the video itself
    video["availability"] = video_availability
    video["on_disk"] = any_on_disk
    video["available"] = any_available

    return video
def download_if_criteria_met(videos, lang_code, force, response_code, date_since_attempt, frequency_to_save, *args, **kwargs):
    """Execute download of subtitle if it meets the criteria specified by the command line args

    Note: videos are a dict; keys=youtube_id, values=data
    Note: lang_code is in IETF format.

    The videos are first filtered (by last API response code, then by date of
    the last attempt), then each remaining video is downloaded unless it was
    already downloaded and `force` is not set.  The outcome of every attempt is
    recorded through ``update_json``; subtitle counts are stored periodically
    through ``store_new_counts`` and once more at the end if attempts were made
    since the last store.

    Args:
        videos: dict of youtube_id -> status entry.  Entries are read for
            "api_response", "last_attempt" and "downloaded".  The dict passed
            in is not modified; filtering builds new dicts.
        lang_code: language of the subtitles to download.
        force: if true, re-download subtitles already marked as downloaded.
        response_code: keep only videos whose "api_response" equals this value;
            a falsy value or "all" disables the filter.
        date_since_attempt: passed to ``convert_date_input``; when the result is
            truthy, keep only videos never attempted or last attempted before
            that date.
        frequency_to_save: store the subtitle counts every this many download
            attempts.  A falsy value means "only store once, at the end".
        *args, **kwargs: accepted and ignored.
    """
    date_specified = convert_date_input(date_since_attempt)

    # Filter up front, for efficiency (& reporting's sake)
    n_videos = len(videos)

    logging.info("There are (up to) %s total videos with subtitles for language '%s'. Let's go get them!" % (
        n_videos, lang_code,
    ))

    # Filter based on response code
    if response_code and response_code != "all":
        logging.info("Filtering based on response code (%s)..." % response_code)
        videos = dict(
            (youtube_id, entry) for youtube_id, entry in videos.items()
            if response_code == entry["api_response"]
        )
        logging.info("%4d of %4d videos match your specified response code (%s)" % (
            len(videos), n_videos, response_code,
        ))

    if date_specified:
        logging.info("Filtering based on date...")

        def needs_refresh(entry):
            # Keep videos that were never attempted, or last attempted before the given date.
            # TODO(bcipolli): check output filename exists, as per # 1359
            last_attempt = entry["last_attempt"]
            if not last_attempt:
                return True
            return datetime.datetime.strptime(last_attempt, '%Y-%m-%d') < date_specified

        # Build a new dict instead of deleting entries from the caller's dict.
        videos = dict(
            (youtube_id, entry) for youtube_id, entry in videos.items()
            if needs_refresh(entry)
        )

        logging.info("%4d of %4d videos need refreshing (last refresh more recent than %s)" % (
            len(videos), n_videos, date_specified,
        ))

    def store_counts(loop_number):
        # Persist the current srt count, log the progress, and hand the count back.
        count = store_new_counts(lang_code=lang_code)
        logging.info("%s: On loop %d / %d, stored: subtitle count = %d." % (
            lang_code, loop_number, len(videos), count,
        ))
        return count

    # Loop over videos needing refreshing
    n_loops = 0         # number of download attempts made so far
    last_stored_at = 0  # value of n_loops when the counts were last stored
    srt_count = None
    for youtube_id, entry in videos.items():
        previously_downloaded = entry.get("downloaded")

        if previously_downloaded and not force:
            logging.info("Already downloaded %s/%s. To redownload, run again with -f." % (
                lang_code, youtube_id,
            ))
            continue

        logging.debug("Attempting to download subtitle for lang: %s and YouTube ID: %s" % (
            lang_code, youtube_id,
        ))
        # NOTE(review): this call expects a downloader taking
        # (youtube_id, lang_code, format=...) and returning either one of the
        # error-code strings below or the subtitle text -- confirm which
        # download_subtitle is in scope here.
        response = download_subtitle(youtube_id, lang_code, format="srt")
        time_of_attempt = unicode(datetime.datetime.now().date())

        if response in ["client-error", "server-error", "unexpected_error"]:
            # Couldn't download
            logging.info("%s/%s.srt: Updating JSON file to record error (%s)." % (
                lang_code, youtube_id, response,
            ))
            update_json(youtube_id, lang_code, previously_downloaded, response, time_of_attempt)

        else:
            dirpath = get_srt_path(lang_code)
            fullpath = os.path.join(dirpath, youtube_id + ".srt")
            ensure_dir(dirpath)

            logging.debug("Writing file to %s" % fullpath)
            with open(fullpath, 'w') as fp:
                fp.write(response.encode('UTF-8'))

            logging.info("%s/%s.srt: Updating JSON file to record success." % (
                lang_code, youtube_id,
            ))
            update_json(youtube_id, lang_code, True, "success", time_of_attempt)

        # Update srt availability mapping
        n_loops += 1
        if frequency_to_save and n_loops % frequency_to_save == 0:
            srt_count = store_counts(n_loops)
            last_stored_at = n_loops

    # Store whatever was attempted since the last periodic store.  Skipped
    # videos never increment n_loops, so comparing n_loops against len(videos)
    # inside the loop would miss the final batch whenever anything was skipped.
    if n_loops != last_stored_at:
        srt_count = store_counts(n_loops)

    # Summarize output
    if srt_count is None:
        # only none if nothing was done.
        logging.info("Nothing was done.")
    else:
        logging.info("We now have %d subtitles (amara thought they had %d) for language '%s'!" % (
            srt_count, n_videos, lang_code,
        ))