Example #1
def move_srts(lang_code):
    """
    Srts live in the locale directory, but that's not exposed at any URL.  So instead,
    we have to move the srts out to /static/subtitles/[lang_code]/
    """
    lang_code_ietf = lcode_to_ietf(lang_code)
    lang_code_django = lcode_to_django_dir(lang_code)

    subtitles_static_dir = os.path.join(settings.STATIC_ROOT, "subtitles")
    src_dir = os.path.join(LOCALE_ROOT, lang_code_django, "subtitles")
    dest_dir = get_srt_path(lang_code_django)
    ensure_dir(dest_dir)

    lang_subtitles = glob.glob(os.path.join(src_dir, "*.srt"))
    logging.info("Moving %d subtitles from %s to %s" % (len(lang_subtitles), src_dir, dest_dir))

    for fil in lang_subtitles:
        srt_dest_path = os.path.join(dest_dir, os.path.basename(fil))
        if os.path.exists(srt_dest_path):
            os.remove(srt_dest_path)  # we're going to replace any srt with a newer version
        shutil.move(fil, srt_dest_path)

    if not os.path.exists(src_dir):
        logging.info("No subtitles for language pack %s" % lang_code)
    elif os.listdir(src_dir):
        logging.warn("%s is not empty; will not remove.  Please check that all subtitles were moved." % src_dir)
    else:
        logging.info("Removing empty source directory (%s)." % src_dir)
        shutil.rmtree(src_dir)
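Every example on this page leans on get_srt_path, whose definition is not shown here. Judging from the docstring above (subtitles are served out of /static/subtitles/[lang_code]/) and the call sites in the other examples, a minimal sketch might look like the following; the signature, the path layout, and the lack of any language-code normalization are assumptions, not the project's actual code:

def get_srt_path(lang_code, youtube_id=None):
    # Hypothetical reconstruction: the directory of .srt files for a language,
    # or the path to a single subtitle file when youtube_id is given.
    srt_dir = os.path.join(settings.STATIC_ROOT, "subtitles", lang_code)
    if youtube_id:
        return os.path.join(srt_dir, youtube_id + ".srt")
    return srt_dir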
Example #3
def download_subtitle(request, lang_code, youtube_id):
    """Dummy function for capturing a video download request and logging
    to output, so we can collect stats."""

    # Log the info
    stats_logger("subtitles").info("sd;%s;%s;%s" % (get_request_ip(request), lang_code, youtube_id))

    # Find the file to return
    srt_filepath = get_srt_path(lang_code, youtube_id=youtube_id)
    if not os.path.exists(srt_filepath):
        raise Http404

    # Stream it back to the user
    zh = open(srt_filepath, "rb")
    response = HttpResponse(content=zh, mimetype="text/plain", content_type="text/plain")
    response["Content-Disposition"] = 'attachment; filename="%s"' % os.path.basename(srt_filepath)

    return response
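The URL pattern that routes requests to this view is not part of the example. A hypothetical old-style (Django 1.x) entry, with a made-up regex and route, might be:

from django.conf.urls import patterns, url

urlpatterns = patterns('',
    # Hypothetical route only: captures lang_code and youtube_id for download_subtitle.
    url(r'^subtitles/(?P<lang_code>[\w\-]+)/(?P<youtube_id>[\w\-]+)/$',
        download_subtitle, name='download_subtitle'),
)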
def clear_subtitles_cache(lang_codes=None, locale_root=LOCALE_ROOT):
    """
    Language codes will be converted to IETF format (e.g. en-US)
    """
    lang_codes = lang_codes or get_langs_with_subtitles()
    for lang_code in lang_codes:
        lang_code = lcode_to_ietf(lang_code)

        # Clear the status file
        lm_file = get_lang_map_filepath(lang_code)
        download_status = softload_json(lm_file, raises=True)
        for key in download_status:
            download_status[key] = {u'downloaded': False, u'last_success': u'', u'last_attempt': u'', u'api_response': u''}
        with open(lm_file, "w") as fp:
            json.dump(download_status, fp)

        # Delete all srt files
        srt_path = get_srt_path(lang_code)
        if os.path.exists(srt_path):
            shutil.rmtree(srt_path)
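For reference, the per-language status file being reset here maps each video to its download record (the other examples suggest the keys are YouTube IDs). After this loop the file holds only blank records, e.g. (IDs made up for illustration):

{
    "abc123XYZ00": {"downloaded": false, "last_success": "", "last_attempt": "", "api_response": ""},
    "def456UVW11": {"downloaded": false, "last_success": "", "last_attempt": "", "api_response": ""}
}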
Example #5
def store_new_counts(lang_code, data_path=SUBTITLES_DATA_ROOT, locale_root=LOCALE_ROOT):
    """Write a new dictionary of srt file counts in respective download folders"""
    language_subtitle_count = {}
    subtitles_path = get_srt_path(lang_code)
    lang_name = get_language_name(lang_code)

    try:
        count = len(glob.glob("%s/*.srt" % subtitles_path))

        language_subtitle_count[lang_name] = {}
        language_subtitle_count[lang_name]["count"] = count
        language_subtitle_count[lang_name]["code"] = lang_code
    except LanguageNameDoesNotExist as ldne:
        count = 0
        logging.debug(ldne)
    except:
        count = 0
        logging.info("%-4s subtitles for %-20s" % ("No", lang_name))

    # Always write to disk.
    write_count_to_json(language_subtitle_count, data_path)

    return count
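Assuming write_count_to_json simply serializes the dict built above (its implementation is not shown), the stored entry for a language would look roughly like this, with an illustrative count:

{"Spanish": {"count": 1523, "code": "es"}}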
Example #7
def generate_srt_availability_file(lang_code):
    '''
    For compatibility with versions less than 0.10.3, we need to generate this
    json file that contains the srts for the videos.
    '''

    # this path is a direct copy of the path found in the old function that generated this file
    srts_file_dest_path = os.path.join(settings.STATIC_ROOT, 'data', 'subtitles', 'languages', "%s_available_srts.json") % lang_code
    ensure_dir(os.path.dirname(srts_file_dest_path))

    srts_path = get_srt_path(lang_code) # not sure yet about this; change once command is complete
    try:
        files = os.listdir(srts_path)
    except OSError:             # directory doesn't exist or we can't read it
        files = []

    yt_ids = [os.path.splitext(f)[0] for f in files]  # strip the ".srt" extension (rstrip(".srt") would also eat trailing s/r/t characters)
    srts_dict = { 'srt_files': yt_ids }

    with open(srts_file_dest_path, 'wb') as fp:
        logging.debug('Creating %s', srts_file_dest_path)
        json.dump(srts_dict, fp)

    return yt_ids
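For an illustrative lang_code of "es", this writes STATIC_ROOT/data/subtitles/languages/es_available_srts.json with contents along the lines of (IDs made up):

{"srt_files": ["abc123XYZ00", "def456UVW11"]}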
Example #12
def stamp_availability_on_video(video, format="mp4", force=False, stamp_urls=True, videos_path=settings.CONTENT_ROOT):
    """
    Stamp all relevant urls and availability onto a video object (if necessary), including:
    * whether the video is available (on disk or online)
    """
    def compute_video_availability(youtube_id, format, videos_path=settings.CONTENT_ROOT):
        return {"on_disk": is_video_on_disk(youtube_id, format, videos_path=videos_path)}

    def compute_video_metadata(youtube_id, format):
        return {"stream_type": "video/%s" % format}

    def compute_video_urls(youtube_id, format, lang_code, on_disk=None, thumb_formats=["png", "jpg"], videos_path=videos_path):
        if on_disk is None:
            on_disk = is_video_on_disk(youtube_id, format, videos_path=videos_path)

        if on_disk:
            video_base_url = settings.CONTENT_URL + youtube_id
            stream_url = video_base_url + ".%s" % format
            thumbnail_url = None  # default to None now, so we know when no thumbnail is available.

            for thumb_format in thumb_formats:  # find the thumbnail on disk
                thumb_filename = '%s.%s' % (youtube_id, thumb_format)
                thumb_filepath = os.path.join(videos_path, thumb_filename)
                if os.path.exists(thumb_filepath):
                    thumbnail_url = video_base_url + "." + thumb_format  # default
                    break

        elif settings.BACKUP_VIDEO_SOURCE and lang_code == "en":
            dict_vals = {"youtube_id": youtube_id, "video_format": format, "thumb_format": thumb_formats[0] }
            stream_url = settings.BACKUP_VIDEO_SOURCE % dict_vals
            thumbnail_url = settings.BACKUP_THUMBNAIL_SOURCE % dict_vals if settings.BACKUP_THUMBNAIL_SOURCE else None

        else:
            return {}  # no URLs
        return {"stream": stream_url, "thumbnail": thumbnail_url}

    video_availability = video.get("availability", {}) if not force else {}
    en_youtube_id = get_youtube_id(video["id"], None)
    video_map = get_id2oklang_map(video["id"]) or {}

    if not "on_disk" in video_availability:
        for lang_code in video_map.keys():
            youtube_id = video_map[lang_code].encode('utf-8')
            video_availability[lang_code] = compute_video_availability(youtube_id, format=format, videos_path=videos_path)
        video_availability["en"] = video_availability.get("en", {"on_disk": False})  # en should always be defined

        # Summarize status
        any_on_disk = any([lang_avail["on_disk"] for lang_avail in video_availability.values()])
        any_available = any_on_disk or bool(settings.BACKUP_VIDEO_SOURCE)

    if stamp_urls:
        # Loop over all known dubbed videos
        for lang_code, youtube_id in video_map.iteritems():
            urls = compute_video_urls(youtube_id, format, lang_code, on_disk=video_availability[lang_code]["on_disk"], videos_path=videos_path)
            if urls:
                # Only add properties if anything is available.
                video_availability[lang_code].update(urls)
                video_availability[lang_code].update(compute_video_metadata(youtube_id, format))

        # Get the (english) subtitle urls
        subtitle_lang_codes = get_langs_with_subtitle(en_youtube_id)
        subtitles_tuple = [(lc, get_srt_url(en_youtube_id, lc)) for lc in subtitle_lang_codes if os.path.exists(get_srt_path(lc, en_youtube_id))]
        subtitles_urls = dict(subtitles_tuple)
        video_availability["en"]["subtitles"] = subtitles_urls

    # now scrub any values that don't actually exist
    for lang_code in video_availability.keys():
        if not video_availability[lang_code]["on_disk"] and len(video_availability[lang_code]) == 1:
            del video_availability[lang_code]

    # Now summarize some availability onto the video itself
    video["availability"] = video_availability
    video["on_disk"]   = any_on_disk
    video["available"] = any_available

    return video
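Taken together, the stamping above leaves an "availability" dict keyed by language code plus two summary flags on the video itself. A sketch of the result, with illustrative values and field names taken from the code above (the subtitle URL shape comes from get_srt_url, which is not shown, so it is assumed):

video["availability"] = {
    "en": {
        "on_disk": True,
        "stream": "/content/abc123XYZ00.mp4",     # settings.CONTENT_URL + youtube_id + ".mp4"
        "thumbnail": "/content/abc123XYZ00.png",  # first thumb format found on disk, else None
        "stream_type": "video/mp4",               # from compute_video_metadata
        "subtitles": {"es": "/static/subtitles/es/abc123XYZ00.srt"},  # assumed URL shape
    },
    # ...one entry of the same shape (minus "subtitles") per dubbed language;
    # entries that are not on disk and carry no URLs are scrubbed out.
}
video["on_disk"] = True      # True if any language's file exists on disk
video["available"] = True    # on disk, or settings.BACKUP_VIDEO_SOURCE is configured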
def download_if_criteria_met(videos, lang_code, force, response_code, date_since_attempt, frequency_to_save, *args, **kwargs):
    """Execute download of subtitle if it meets the criteria specified by the command line args

    Note: videos are a dict; keys=youtube_id, values=data
    Note: lang_code is in IETF format.
    """
    date_specified = convert_date_input(date_since_attempt)

    # Filter up front, for efficiency (& reporting's sake)
    n_videos = len(videos)

    logging.info("There are (up to) %s total videos with subtitles for language '%s'.  Let's go get them!" % (
        n_videos, lang_code,
    ))

    # Filter based on response code
    if response_code and response_code != "all":
        logging.info("Filtering based on response code (%s)..." %
                     response_code)
        response_code_filter = partial(
            lambda vid, rcode: rcode == vid["api_response"], rcode=response_code)
        videos = dict([(k, v) for k, v in videos.iteritems() if response_code_filter(v)])
        logging.info("%4d of %4d videos match your specified response code (%s)" % (
            len(videos), n_videos, response_code,
        ))

    if date_specified:
        logging.info("Filtering based on date...")
        for k in videos.keys():
            if not videos[k]["last_attempt"]:
                continue
            elif datetime.datetime.strptime(videos[k]["last_attempt"], '%Y-%m-%d') < date_specified:
                continue
            elif False:  # TODO(bcipolli): check output filename exists, as per # 1359
                continue
            else:
                del videos[k]

        logging.info("%4d of %4d videos need refreshing (last refresh more recent than %s)" % (
            len(videos), n_videos, date_specified,
        ))

    # Loop over videos needing refreshing
    n_loops = 0
    srt_count = None
    for youtube_id, entry in videos.items():
        previously_downloaded = entry.get("downloaded")

        if previously_downloaded and not force:
            logging.info("Already downloaded %s/%s. To redownload, run again with -f." % (
                lang_code, youtube_id,
            ))
            continue

        logging.debug("Attempting to download subtitle for lang: %s and YouTube ID: %s" % (
            lang_code, youtube_id,
        ))
        response = download_subtitle(youtube_id, lang_code, format="srt")
        time_of_attempt = unicode(datetime.datetime.now().date())

        if response in ["client-error", "server-error", "unexpected_error"]:
            # Couldn't download
            logging.info("%s/%s.srt: Updating JSON file to record error (%s)." % (
                lang_code, youtube_id, response,
            ))
            update_json(youtube_id, lang_code, previously_downloaded, response, time_of_attempt)

        else:
            dirpath = get_srt_path(lang_code)
            fullpath = os.path.join(dirpath, youtube_id + ".srt")
            ensure_dir(dirpath)

            logging.debug("Writing file to %s" % fullpath)
            with open(fullpath, 'w') as fp:
                fp.write(response.encode('UTF-8'))

            logging.info("%s/%s.srt: Updating JSON file to record success." % (
                lang_code, youtube_id,
            ))
            update_json(youtube_id, lang_code, True, "success", time_of_attempt)

        # Update srt availability mapping
        n_loops += 1
        if n_loops % frequency_to_save == 0 or n_loops == len(videos.keys()):
            srt_count = store_new_counts(lang_code=lang_code)
            logging.info("%s: On loop %d / %d, stored: subtitle count = %d." % (
                lang_code, n_loops, len(videos), srt_count,
            ))

    # Summarize output
    if srt_count is None:
        # only none if nothing was done.
        logging.info("Nothing was done.")
    else:
        logging.info("We now have %d subtitles (amara thought they had %d) for language '%s'!" % (
            srt_count, n_videos, lang_code,
        ))
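The management command that drives this function is not included in the listing. A hedged sketch of a caller, assuming the per-language status file shown earlier is what get_lang_map_filepath() points at:

# Hypothetical driver, not the project's actual management command.
lang_code = lcode_to_ietf("es")
videos = softload_json(get_lang_map_filepath(lang_code))  # youtube_id -> download record
download_if_criteria_met(
    videos,
    lang_code=lang_code,
    force=False,
    response_code="all",              # keep every video regardless of its last API response
    date_since_attempt="2015-01-01",  # whatever format convert_date_input expects (assumed)
    frequency_to_save=5,              # call store_new_counts every 5 downloads
)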