def update_metadata(package_metadata, version=VERSION):
    """
    We've zipped the packages, and now have unzipped & zipped sizes.
    Update this info in the local metadata (but not inside the zip)
    """
    master_filepath = get_language_pack_availability_filepath(version=version)
    master_metadata = softload_json(master_filepath, logger=logging.warn, errmsg="Error opening master language pack metadata")

    for lc, updated_meta in package_metadata.iteritems():
        lang_code_ietf = lcode_to_ietf(lc)

        # Gather existing metadata
        metadata_filepath = get_language_pack_metadata_filepath(lang_code_ietf, version=version)
        stored_meta = softload_json(metadata_filepath, logger=logging.warn, errmsg="Error opening %s language pack metadata" % lc)

        stored_meta.update(updated_meta)

        # Write locally (this is used on download by the distributed server to update its database)
        with open(metadata_filepath, 'w') as output:
            json.dump(stored_meta, output)

        # Update master (this is used for central server to handle API requests for data)
        master_metadata[lang_code_ietf] = stored_meta

    # Save updated master
    ensure_dir(os.path.dirname(master_filepath))
    with open(master_filepath, 'w') as output:
        json.dump(master_metadata, output)
    logging.info("Local record of translations updated")
Example #2
def update_json(youtube_id, lang_code, downloaded, api_response, time_of_attempt):
    """Update language_srt_map to reflect download status

    lang_code in IETF format
    """
    # Open JSON file
    filepath = get_lang_map_filepath(lang_code)
    language_srt_map = softload_json(filepath, logger=logging.error)
    if not language_srt_map:
        return False

    # Update the entry for this video; `entry` is a reference into
    # language_srt_map, so the map itself is updated in place.
    entry = language_srt_map[youtube_id]
    entry["downloaded"] = downloaded
    entry["api_response"] = api_response
    entry["last_attempt"] = time_of_attempt
    if api_response == "success":
        entry["last_success"] = time_of_attempt

    # write it to file
    with open(filepath, "wb") as json_file:
        json.dump(language_srt_map, json_file)
    logging.debug("File updated.")

    return True
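
A hedged example of recording one download attempt; the YouTube ID is made up,
and the date format mirrors the one used by create_all_mappings further below.

    import datetime

    updated = update_json(
        youtube_id="abc123XYZ",    # hypothetical video ID
        lang_code="en-us",         # IETF code, as the docstring requires
        downloaded=True,
        api_response="success",
        time_of_attempt=datetime.datetime.now().strftime("%Y-%m-%d"),
    )
    if not updated:
        logging.error("Could not record the download attempt")
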
Example #3
    def handle(self, *args, **options):
        if settings.CENTRAL_SERVER:
            raise CommandError(
                "Run this command on the distributed server only.")

        # Load videos
        video_sizes = softload_json(REMOTE_VIDEO_SIZE_FILEPATH,
                                    logger=logging.debug)

        # Query current files
        all_video_filepaths = glob.glob(
            os.path.join(settings.CONTENT_ROOT, "*.mp4"))
        logging.info("Querying sizes for %d video(s)." %
                     len(all_video_filepaths))

        # Get all current sizes
        for video_filepath in all_video_filepaths:
            youtube_id = os.path.splitext(os.path.basename(video_filepath))[0]
            # Set to max, so that local compressed videos will not affect things.
            video_sizes[youtube_id] = max(video_sizes.get(youtube_id, 0),
                                          os.path.getsize(video_filepath))

        # Sort results
        video_sizes = OrderedDict([(key, video_sizes[key])
                                   for key in sorted(video_sizes.keys())])

        logging.info("Saving results to disk.")
        ensure_dir(os.path.dirname(REMOTE_VIDEO_SIZE_FILEPATH))
        with open(REMOTE_VIDEO_SIZE_FILEPATH, "w") as fp:
            json.dump(video_sizes, fp, indent=2)
Example #4
def write_count_to_json(subtitle_counts, data_path):
    """Write JSON to file in static/data/subtitles/"""
    current_counts = softload_json(SUBTITLE_COUNTS_FILEPATH, logger=logging.error)
    current_counts.update(subtitle_counts)

    logging.debug("Writing fresh srt counts to %s" % SUBTITLE_COUNTS_FILEPATH)
    with open(SUBTITLE_COUNTS_FILEPATH, 'wb') as fp:
        # sort here, so we don't have to sort later when serving to clients
        json.dump(current_counts, fp, sort_keys=True)
Example #5
def get_language_names(lang_code=None):
    """
    Returns dictionary of names (English name, "Native" name)
    for a given language code.
    """
    global LANG_NAMES_MAP
    lang_code = lcode_to_ietf(lang_code)
    if not LANG_NAMES_MAP:
        LANG_NAMES_MAP = softload_json(LANG_LOOKUP_FILEPATH)
    return LANG_NAMES_MAP.get(lang_code) if lang_code else LANG_NAMES_MAP
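
A small lookup sketch; the "name"/"native_name" keys are an assumption based on
how the metadata is used elsewhere in these examples, and "pt-BR" is illustrative.

    names = get_language_names("pt-BR") or {}
    print("%s / %s" % (names.get("name"), names.get("native_name")))
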
Example #6
def get_code2lang_map(lang_code=None, force=False):
    """
    """
    global LANG_LOOKUP_FILEPATH, CODE2LANG_MAP

    if force or not CODE2LANG_MAP:
        lmap = softload_json(LANG_LOOKUP_FILEPATH, logger=logging.debug)

        CODE2LANG_MAP = {}
        for lc, entry in lmap.iteritems():
            CODE2LANG_MAP[lcode_to_ietf(lc)] = entry

    return CODE2LANG_MAP.get(lang_code) if lang_code else CODE2LANG_MAP
def update_metadata(package_metadata, version=VERSION):
    """
    We've zipped the packages, and now have unzipped & zipped sizes.
    Update this info in the local metadata (but not inside the zip)
    """
    master_filepath = get_language_pack_availability_filepath(version=version)
    master_metadata = softload_json(
        master_filepath,
        logger=logging.warn,
        errmsg="Error opening master language pack metadata")

    for lc, updated_meta in package_metadata.iteritems():
        lang_code_ietf = lcode_to_ietf(lc)

        # Gather existing metadata
        metadata_filepath = get_language_pack_metadata_filepath(
            lang_code_ietf, version=version)
        stored_meta = softload_json(
            metadata_filepath,
            logger=logging.warn,
            errmsg="Error opening %s language pack metadata" % lc)

        stored_meta.update(updated_meta)

        # Write locally (this is used on download by the distributed server to update its database)
        with open(metadata_filepath, 'w') as output:
            json.dump(stored_meta, output)

        # Update master (this is used for central server to handle API requests for data)
        master_metadata[lang_code_ietf] = stored_meta

    # Save updated master
    ensure_dir(os.path.dirname(master_filepath))
    with open(master_filepath, 'w') as output:
        json.dump(master_metadata, output)
    logging.info("Local record of translations updated")
Example #8
def get_topic_tree(force=False, props=None):
    global TOPICS, TOPICS_FILEPATH
    if TOPICS is None or force:
        TOPICS = softload_json(TOPICS_FILEPATH, logger=logging.debug)
        validate_ancestor_ids(TOPICS)  # make sure ancestor_ids are set properly

        # Limit the memory footprint by unloading particular values
        if props:
            node_cache = get_node_cache()
            for kind, list_by_kind in node_cache.iteritems():
                for node_list in list_by_kind.values():
                    for node in node_list:
                        for att in node.keys():
                            if att not in props:
                                del node[att]
    return TOPICS
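
Trimming the cached tree to a few node properties might look like this; the
property names are assumptions, not a guaranteed part of the node schema.

    topics = get_topic_tree(props=["id", "title", "kind"])   # drop other node attributes
    topics_full = get_topic_tree(force=True)                 # reload everything from disk
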
Example #9
def get_topic_tree(force=False, props=None):
    global TOPICS, TOPICS_FILEPATH
    if TOPICS is None or force:
        TOPICS = softload_json(TOPICS_FILEPATH, logger=logging.debug)
        validate_ancestor_ids(
            TOPICS)  # make sure ancestor_ids are set properly

        # Limit the memory footprint by unloading particular values
        if props:
            node_cache = get_node_cache()
            for kind, list_by_kind in node_cache.iteritems():
                for node_list in list_by_kind.values():
                    for node in node_list:
                        for att in node.keys():
                            if att not in props:
                                del node[att]
    return TOPICS
Example #10
def get_dubbed_video_map(lang_code=None, force=False):
    """
    Stores a key per language.  Value is a dictionary between video_id and (dubbed) youtube_id
    """
    global DUBBED_VIDEO_MAP, DUBBED_VIDEO_MAP_RAW, DUBBED_VIDEOS_MAPPING_FILEPATH

    if DUBBED_VIDEO_MAP is None or force:
        try:
            if not os.path.exists(DUBBED_VIDEOS_MAPPING_FILEPATH) or force:
                try:
                    if settings.CENTRAL_SERVER:
                        # Never call commands that could fail from the distributed server.
                        #   Always create a central server API to abstract things (see below)
                        logging.debug("Generating dubbed video mappings.")
                        call_command("generate_dubbed_video_mappings", force=force)
                    else:
                        # Generate from the spreadsheet
                        response = requests.get("http://%s/api/i18n/videos/dubbed_video_map" % (settings.CENTRAL_SERVER_HOST))
                        response.raise_for_status()
                        with open(DUBBED_VIDEOS_MAPPING_FILEPATH, "wb") as fp:
                            fp.write(response.content)  # wait until content has been confirmed before opening the file.
                except Exception as e:
                    if not os.path.exists(DUBBED_VIDEOS_MAPPING_FILEPATH):
                        # Unrecoverable error, so raise
                        raise
                    elif DUBBED_VIDEO_MAP:
                        # No need to recover--allow the caller downstream to catch the error.
                        raise
                    else:
                        # We can recover by NOT forcing reload.
                        logging.warn("%s" % e)

            DUBBED_VIDEO_MAP_RAW = softload_json(DUBBED_VIDEOS_MAPPING_FILEPATH, raises=True)
        except Exception as e:
            logging.info("Failed to get dubbed video mappings; defaulting to empty.")
            DUBBED_VIDEO_MAP_RAW = {}  # setting this will avoid triggering reload on every call

        DUBBED_VIDEO_MAP = {}
        for lang_name, video_map in DUBBED_VIDEO_MAP_RAW.iteritems():
            logging.debug("Adding dubbed video map entry for %s (name=%s)" % (get_langcode_map(lang_name), lang_name))
            DUBBED_VIDEO_MAP[get_langcode_map(lang_name)] = video_map

    return DUBBED_VIDEO_MAP.get(lang_code, {}) if lang_code else DUBBED_VIDEO_MAP
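
For example (language code illustrative); an unknown language simply yields an
empty mapping rather than an error.

    dubbed_map = get_dubbed_video_map("es")       # {video_id: dubbed_youtube_id, ...} or {}
    all_maps = get_dubbed_video_map(force=True)   # refresh from the central server / command
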
Example #11
def _get_installed_language_packs():
    """
    On-disk method to show currently installed languages and meta data.
    """

    # There's always English...
    installed_language_packs = [{
        'code': 'en',
        'software_version': VERSION,
        'language_pack_version': 0,
        'percent_translated': 100,
        'subtitle_count': 0,
        'name': 'English',
        'native_name': 'English',
    }]

    # Loop through locale folders
    for locale_dir in settings.LOCALE_PATHS:
        if not os.path.exists(locale_dir):
            continue

        # Loop through folders in each locale dir
        for django_disk_code in os.listdir(locale_dir):

            # Inside each folder, read from the JSON file - language name, % UI trans, version number
            try:
                # Get the metadata
                metadata_filepath = os.path.join(locale_dir, django_disk_code, "%s_metadata.json" % lcode_to_ietf(django_disk_code))
                lang_meta = softload_json(metadata_filepath, raises=True)

                logging.debug("Found language pack %s" % (django_disk_code))
            except Exception as e:
                if isinstance(e, IOError) and e.errno == 2:
                    logging.info("Ignoring non-language pack %s in %s" % (django_disk_code, locale_dir))
                else:
                    logging.error("Error reading %s metadata (%s): %s" % (django_disk_code, metadata_filepath, e))
                continue

            installed_language_packs.append(lang_meta)

    sorted_list = sorted(installed_language_packs, key=lambda m: m['name'].lower())
    return OrderedDict([(lcode_to_ietf(val["code"]), val) for val in sorted_list])
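
A quick sketch of consuming the result; every entry carries at least the keys
shown for the built-in English pack above.

    for code, meta in _get_installed_language_packs().items():
        print("%s: %s (pack v%s, %s%% translated)" % (
            code, meta["name"], meta.get("language_pack_version"),
            meta.get("percent_translated")))
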
Example #12
def clear_subtitles_cache(lang_codes=None, locale_root=LOCALE_ROOT):
    """
    Language codes will be converted to IETF format (e.g. en-US)
    """
    lang_codes = lang_codes or get_langs_with_subtitles()
    for lang_code in lang_codes:
        lang_code = lcode_to_ietf(lang_code)

        # Clear the status file
        lm_file = get_lang_map_filepath(lang_code)
        download_status = softload_json(lm_file, raises=True)
        for key in download_status:
            download_status[key] = {u'downloaded': False, u'last_success': u'', u'last_attempt': u'', u'api_response': u''}
        with open(lm_file, "w") as fp:
            json.dump(download_status, fp)

        # Delete all srt files
        srt_path = get_srt_path(lang_code)
        if os.path.exists(srt_path):
            shutil.rmtree(srt_path)
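
For example, resetting the download status (and deleting cached srt files) for a
couple of languages; the codes are illustrative.

    clear_subtitles_cache(lang_codes=["es", "pt-br"])
    clear_subtitles_cache()   # or reset every language that currently has subtitles
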
Example #13
    def handle(self, *args, **options):
        if settings.CENTRAL_SERVER:
            raise CommandError("Run this command on the distributed server only.")

        # Load videos
        video_sizes = softload_json(REMOTE_VIDEO_SIZE_FILEPATH, logger=logging.debug)

        # Query current files
        all_video_filepaths = glob.glob(os.path.join(settings.CONTENT_ROOT, "*.mp4"))
        logging.info("Querying sizes for %d video(s)." % len(all_video_filepaths))

        # Get all current sizes
        for video_filepath in all_video_filepaths:
            youtube_id = os.path.splitext(os.path.basename(video_filepath))[0]
            # Set to max, so that local compressed videos will not affect things.
            video_sizes[youtube_id] = max(video_sizes.get(youtube_id, 0), os.path.getsize(video_filepath))

        # Sort results
        video_sizes = OrderedDict([(key, video_sizes[key]) for key in sorted(video_sizes.keys())])

        logging.info("Saving results to disk.")
        ensure_dir(os.path.dirname(REMOTE_VIDEO_SIZE_FILEPATH))
        with open(REMOTE_VIDEO_SIZE_FILEPATH, "w") as fp:
            json.dump(video_sizes, fp, indent=2)
Example #14
def download_subtitle(youtube_id, lang_code, format="srt"):
    """
    Return subtitles for YouTube ID in language specified. Return False if they do not exist. Update local JSON accordingly.

    Note: srt map deals with amara, so uses lower-cased ietf codes (e.g. en-us)
    """
    assert format == "srt", "We only support srt download at the moment."

    # srt map deals with amara, so uses lower-cased ietf codes (e.g. en-us)
    api_info_map = softload_json(SRTS_JSON_FILEPATH, raises=True)

    # get amara id
    amara_code = api_info_map.get(youtube_id, {}).get("amara_code")

    # make request
    # Please see http://amara.readthedocs.org/en/latest/api.html
    base_url = "https://amara.org/api2/partners/videos"

    resp = make_request(AMARA_HEADERS, "%s/%s/languages/%s/subtitles/?format=srt" % (
        base_url, amara_code, lang_code.lower(),
    ))
    if isinstance(resp, basestring):
        return resp
    else:
        # return the subtitle text, replacing empty subtitle lines with
        # spaces to make the FLV player happy
        try:
            resp.encoding = "UTF-8"
            response = (resp.text or u"") \
                .replace("\n\n\n", "\n   \n\n") \
                .replace("\r\n\r\n\r\n", "\r\n   \r\n\r\n")
        except Exception as e:
            logging.error(e)
            response = "client-error"
        return response
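
A hedged sketch of fetching and storing one subtitle file; the YouTube ID and
destination path are made up, and the various error-string returns are glossed over.

    srt_text = download_subtitle("abc123XYZ", "en-us")
    if srt_text and srt_text != "client-error":
        with open("/tmp/abc123XYZ.srt", "wb") as fp:   # illustrative path only
            fp.write(srt_text.encode("utf-8"))
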
def generate_metadata(package_metadata=None, version=VERSION, force_version_update=False):
    """Loop through locale folder, create or update language specific meta
    and create or update master file, skipping broken languages
    """
    logging.info("Generating new language pack metadata")

    lang_codes = package_metadata.keys() if package_metadata else os.listdir(LOCALE_ROOT)
    broken_langs = [lc for lc, md in package_metadata.iteritems() if md.get("broken")] if package_metadata else []

    master_filepath = get_language_pack_availability_filepath(version=version)
    master_metadata = softload_json(master_filepath, logger=logging.warn, errmsg="Error opening master language pack metadata")

    # loop through all languages in locale, update master file
    crowdin_meta_dict = download_crowdin_metadata()

    for lc in lang_codes:
        lang_code_django = lcode_to_django_dir(lc)
        lang_code_ietf = lcode_to_ietf(lc)
        lang_name = get_language_name(lang_code_ietf)
        metadata_filepath = get_language_pack_metadata_filepath(lang_code_ietf, version=version)
        ensure_dir(os.path.dirname(metadata_filepath))

        if broken_langs and lang_code_django in broken_langs:  # broken_langs is django format
            logging.info("Skipping directory %s because it did not compile." % lang_code_django)
            continue

        # Gather existing metadata
        crowdin_meta = next((meta for meta in crowdin_meta_dict if meta["code"] == lang_code_ietf), {})
        stored_meta = softload_json(metadata_filepath, logger=logging.info, errmsg="Could not open %s language pack metadata" % lc)

        updated_meta = package_metadata.get(lang_code_ietf) or {}
        updated_meta.update({
            "code": lang_code_ietf,  # user-facing code
            "name": lang_name,
            "software_version": version,
        })

        try:
            # Augment the metadata
            updated_meta.update(get_language_names(lang_code_django))
        except LanguageNotFoundError:
            logging.warning("Unrecognized language; unable to add extra naming metadata %s" % lang_code_django)
            continue

        if force_version_update:
            language_pack_version = 1 + stored_meta.get("language_pack_version", 0)  # will increment to one
        else:
            language_pack_version = increment_language_pack_version(stored_meta, updated_meta)

        updated_meta["language_pack_version"] = language_pack_version
        stored_meta.update(updated_meta)

        # Write locally (this is used on download by the distributed server to update its database)
        with open(metadata_filepath, 'w') as output:
            json.dump(stored_meta, output)

        # Update master (this is used for central server to handle API requests for data)
        master_metadata[lang_code_ietf] = stored_meta

    # Save updated master
    ensure_dir(os.path.dirname(master_filepath))
    with open(master_filepath, 'w') as fp:
        json.dump(master_metadata, fp)
    logging.info("Local record of translations updated")
def update_language_srt_map(map_file=SRTS_JSON_FILEPATH):
    """
    Translate the srts_remote_availability dictionary into language specific files
    that can be used by the cache_subtitles command.

    Note: srt map deals with amara, so uses ietf codes (e.g. en-us)
    """
    # Load the current download status
    api_info_map = softload_json(map_file, logger=logging.warn)

    # Next we want to iterate through those and create a big srt dictionary organized by language code
    remote_availability_map = {}
    for youtube_id, data in api_info_map.items():
        languages = data.get("language_codes", [])
        for lang_code in languages:
            lang_code = lcode_to_ietf(lang_code)
            if lang_code not in remote_availability_map:
                # logging.info("Creating language section '%s'" % lang_code)
                remote_availability_map[lang_code] = {}
            # This entry will be valid if it's new, otherwise it will be overwritten later
            remote_availability_map[lang_code][youtube_id] = {
                "downloaded": False,
                "api_response": "",
                "last_attempt": "",
                "last_success": "",
            }

    # Finally we need to iterate through that dictionary and create individual files for each language code
    for lang_code, new_data in remote_availability_map.items():

        # Try to open previous language file
        lang_map_filepath = get_lang_map_filepath(lang_code)
        if not os.path.exists(lang_map_filepath):
            lang_map = {}
        else:
            lang_map = softload_json(lang_map_filepath, logger=logging.error)

        # First, check to see if it's empty (e.g. no subtitles available for any videos)
        if not new_data:
            logging.info("Subtitle support for %s has been terminated; removing." % lang_code)
            if os.path.exists(lang_map_filepath):
                os.remove(lang_map_filepath)
            continue

        # Work out which YouTube IDs need to be added to, or dropped from, this language's map
        new_yt_ids = set(new_data.keys())
        old_yt_ids = set(lang_map.keys())
        yt_ids_to_add = new_yt_ids - old_yt_ids
        yt_ids_to_delete = old_yt_ids - new_yt_ids

        if yt_ids_to_add:
            logging.info("Adding %d new YouTube IDs to language (%s)" % (len(yt_ids_to_add), lang_code))
            for yt_id in yt_ids_to_add:
                lang_map[yt_id] = new_data.get(yt_id)

        if yt_ids_to_delete:
            logging.info(
                "Deleting %d old YouTube IDs from language (%s) because they are no longer supported."
                % (len(yt_ids_to_delete), lang_code)
            )
            for yt_id in yt_ids_to_delete:
                lang_map.pop(yt_id, None)

        # Write the new file to the correct location
        logging.debug("Writing %s" % lang_map_filepath)
        ensure_dir(os.path.dirname(lang_map_filepath))
        with open(lang_map_filepath, "w") as outfile:
            json.dump(lang_map, outfile)

        # Update the big mapping with the most accurate numbers
        remote_availability_map[lang_code].update(lang_map)

    # Finally, remove any files not found in the current map at all.
    if remote_availability_map:  # lang_map_filepath is only defined if the loop above ran
        for filename in os.listdir(os.path.dirname(lang_map_filepath)):
            lang_code = filename.split("_")[0]
            if lang_code not in remote_availability_map:
                file_to_remove = get_lang_map_filepath(lang_code)
                logging.info("Subtitle support for %s has been terminated; removing." % lang_code)
                if os.path.exists(file_to_remove):
                    os.remove(file_to_remove)
                else:
                    logging.warn(
                        "Subtitles metadata for %s not found; skipping deletion of non-existent file %s."
                        % (lang_code, file_to_remove)
                    )

    return remote_availability_map
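
The map file is typically rebuilt right after create_all_mappings (below) has
refreshed the Amara availability data, e.g.:

    availability = update_language_srt_map()   # uses SRTS_JSON_FILEPATH by default
    logging.info("Languages with remote subtitles: %d" % len(availability))
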
def create_all_mappings(
    force=False, frequency_to_save=100, response_to_check=None, date_to_check=None, map_file=SRTS_JSON_FILEPATH
):
    """
    Write or update JSON file that maps from YouTube ID to Amara code and languages available.

    This command updates the json file that records what languages videos have been subtitled in.
    It loops through all video ids, records a list of which languages Amara says it has been subtitled in
    and meta data about the request (e.g. date, response code).

    See the schema in the docstring for fcn update_video_entry.
    """
    youtube_ids = get_slug2id_map().values()

    # Initialize the data
    if not os.path.exists(map_file):
        ensure_dir(os.path.dirname(map_file))
        if not settings.DEBUG:
            raise CommandError("TRUE central server's srts dict should never be empty; where is your %s?" % map_file)
        else:
            # Pull it from the central server
            try:
                logging.debug("Fetching central server's srt availability file.")
                resp = requests.get(
                    "http://kalite.learningequality.org:7007/media/testing/%s" % (os.path.basename(map_file))
                )
                resp.raise_for_status()
                with open(map_file, "w") as fp:
                    fp.write(resp.content)
                srts_dict = json.loads(resp.content)
            except Exception as e:
                logging.error("Failed to download TRUE central server's srts availability file: %s" % e)
                srts_dict = {}

    else:
        # Open the file, read, and clean out old videos.
        #   only handle the error if force=True.
        #   Otherwise, these data are too valuable to lose, so just assume a temp problem.
        srts_dict = softload_json(map_file, raises=not force, logger=logging.error)
        if srts_dict:
            logging.info("Loaded %d mappings." % (len(srts_dict)))

        # Set of videos no longer used by KA Lite
        removed_videos = set(srts_dict.keys()) - set(youtube_ids)
        if removed_videos:
            logging.info("Removing subtitle information for %d videos (no longer used)." % len(removed_videos))
            for vid in removed_videos:
                del srts_dict[vid]
    logging.info("Querying %d mappings." % (len(youtube_ids) - (0 if (force or date_to_check) else len(srts_dict))))

    # Once we have the current mapping, proceed through logic to update the mapping
    n_refreshed = 0  # keep track to avoid writing if nothing's been refreshed.
    n_new_entries = 0  # keep track for reporting
    n_failures = 0  # keep track for reporting
    for youtube_id in youtube_ids:

        # Decide whether or not to update this video based on the arguments provided at the command line
        cached = youtube_id in srts_dict
        if not force and cached:

            # First, check against date
            flag_for_refresh = True  # not (response_code or last_attempt)
            last_attempt = srts_dict[youtube_id].get("last_attempt")
            last_attempt = None if not last_attempt else datetime.datetime.strptime(last_attempt, "%Y-%m-%d")
            flag_for_refresh = flag_for_refresh and (not date_to_check or date_to_check > last_attempt)
            if not flag_for_refresh:
                logging.debug("Skipping %s for date-check" % youtube_id)
                continue

            # Second, check against response code
            response_code = srts_dict[youtube_id].get("api_response")
            flag_for_refresh = flag_for_refresh and (
                not response_to_check or response_to_check == "all" or response_to_check == response_code
            )
            if not flag_for_refresh:
                logging.debug("Skipping %s for response-code" % youtube_id)
                continue
            if not response_to_check and not date_to_check and cached:  # no flags specified and already cached - skip
                logging.debug("Skipping %s for already-cached and no flags specified" % youtube_id)
                continue

        # We're gonna check; just report the reason why.
        if force and not cached:
            logging.debug("Updating %s because force flag (-f) given and video not cached." % youtube_id)
        elif force and cached:
            logging.debug("Updating %s because force flag (-f) given. Video was previously cached." % youtube_id)
        else:
            logging.debug("Updating %s because video subtitles metadata not yet cached." % youtube_id)

        # If it makes it to here without hitting a continue, then update the entry

        try:
            srts_dict[youtube_id] = update_video_entry(youtube_id, entry=srts_dict.get(youtube_id, {}))
            n_refreshed += 1
        except Exception as e:
            logging.warn("Error updating video %s: %s" % (youtube_id, e))
            n_failures += 1
            continue

        if n_new_entries % frequency_to_save == 0:
            logging.info("On loop %d dumping dictionary into %s" % (n_new_entries, map_file))
            with open(map_file, "wb") as fp:
                json.dump(srts_dict, fp)
        n_new_entries += 1

    # Finished the loop: save and report
    if n_refreshed > 0:
        with open(map_file, "wb") as fp:
            json.dump(srts_dict, fp)
    if n_failures == 0:
        logging.info(
            "Great success! Added %d entries, updated %d entries, of %d total."
            % (n_new_entries, n_refreshed, len(srts_dict))
        )
    else:
        logging.warn(
            "Stored %d new entries, refreshed %d entries, but with %s failures, of %d total."
            % (n_new_entries, n_refreshed, n_failures, len(srts_dict))
        )

    return n_refreshed != 0
Example #18
def get_remote_video_size(youtube_id, default=AVERAGE_VIDEO_SIZE, force=False):
    global REMOTE_VIDEO_SIZES
    if REMOTE_VIDEO_SIZES is None:
        REMOTE_VIDEO_SIZES = softload_json(REMOTE_VIDEO_SIZE_FILEPATH, logger=logging.debug)
    return REMOTE_VIDEO_SIZES.get(youtube_id, default)
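
For example (the ID is illustrative); an unknown video falls back to the
AVERAGE_VIDEO_SIZE default.

    size_bytes = get_remote_video_size("abc123XYZ")
    logging.debug("Estimated download size: %.1f MB" % (size_bytes / 1048576.0))
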
def generate_metadata(package_metadata=None,
                      version=VERSION,
                      force_version_update=False):
    """Loop through locale folder, create or update language specific meta
    and create or update master file, skipping broken languages
    """
    logging.info("Generating new language pack metadata")

    lang_codes = package_metadata.keys() if package_metadata else os.listdir(
        LOCALE_ROOT)
    broken_langs = [
        lc for lc, md in package_metadata.iteritems() if md.get("broken")
    ] if package_metadata else []

    master_filepath = get_language_pack_availability_filepath(version=version)
    master_metadata = softload_json(
        master_filepath,
        logger=logging.warn,
        errmsg="Error opening master language pack metadata")

    # loop through all languages in locale, update master file
    crowdin_meta_dict = download_crowdin_metadata()

    for lc in lang_codes:
        lang_code_django = lcode_to_django_dir(lc)
        lang_code_ietf = lcode_to_ietf(lc)
        lang_name = get_language_name(lang_code_ietf)
        metadata_filepath = get_language_pack_metadata_filepath(
            lang_code_ietf, version=version)
        ensure_dir(os.path.dirname(metadata_filepath))

        if broken_langs and lang_code_django in broken_langs:  # broken_langs is django format
            logging.info("Skipping directory %s because it did not compile." %
                         lang_code_django)
            continue

        # Gather existing metadata
        crowdin_meta = next(
            (meta
             for meta in crowdin_meta_dict if meta["code"] == lang_code_ietf),
            {})
        stored_meta = softload_json(
            metadata_filepath,
            logger=logging.info,
            errmsg="Could not open %s language pack metadata" % lc)

        updated_meta = package_metadata.get(lang_code_ietf) or {}
        updated_meta.update({
            "code": lang_code_ietf,  # user-facing code
            "name": lang_name,
            "software_version": version,
        })

        try:
            # Augment the metadata
            updated_meta.update(get_language_names(lang_code_django))
        except LanguageNotFoundError:
            logging.warning(
                "Unrecognized language; unable to add extra naming metadata %s"
                % lang_code_django)
            continue

        if force_version_update:
            language_pack_version = 1 + stored_meta.get(
                "language_pack_version", 0)  # will increment to one
        else:
            language_pack_version = increment_language_pack_version(
                stored_meta, updated_meta)

        updated_meta["language_pack_version"] = language_pack_version
        stored_meta.update(updated_meta)

        # Write locally (this is used on download by the distributed server to update its database)
        with open(metadata_filepath, 'w') as output:
            json.dump(stored_meta, output)

        # Update master (this is used for central server to handle API requests for data)
        master_metadata[lang_code_ietf] = stored_meta

    # Save updated master
    ensure_dir(os.path.dirname(master_filepath))
    with open(master_filepath, 'w') as fp:
        json.dump(master_metadata, fp)
    logging.info("Local record of translations updated")
Example #20
def get_remote_video_size(youtube_id, default=AVERAGE_VIDEO_SIZE, force=False):
    global REMOTE_VIDEO_SIZES
    if REMOTE_VIDEO_SIZES is None:
        REMOTE_VIDEO_SIZES = softload_json(REMOTE_VIDEO_SIZE_FILEPATH,
                                           logger=logging.debug)
    return REMOTE_VIDEO_SIZES.get(youtube_id, default)
Example #21
def get_all_remote_video_sizes():
    global REMOTE_VIDEO_SIZES
    if REMOTE_VIDEO_SIZES is None:
        REMOTE_VIDEO_SIZES = softload_json(REMOTE_VIDEO_SIZE_FILEPATH, logger=logging.debug)
    return REMOTE_VIDEO_SIZES
Example #22
def update_language_srt_map(map_file=SRTS_JSON_FILEPATH):
    """
    Translate the srts_remote_availability dictionary into language specific files
    that can be used by the cache_subtitles command.

    Note: srt map deals with amara, so uses ietf codes (e.g. en-us)
    """
    # Load the current download status
    api_info_map = softload_json(map_file, logger=logging.warn)

    # Next we want to iterate through those and create a big srt dictionary organized by language code
    remote_availability_map = {}
    for youtube_id, data in api_info_map.items():
        languages = data.get("language_codes", [])
        for lang_code in languages:
            lang_code = lcode_to_ietf(lang_code)
            if lang_code not in remote_availability_map:
                #logging.info("Creating language section '%s'" % lang_code)
                remote_availability_map[lang_code] = {}
            # This entry will be valid if it's new, otherwise it will be overwritten later
            remote_availability_map[lang_code][youtube_id] = {
                "downloaded": False,
                "api_response": "",
                "last_attempt": "",
                "last_success": "",
            }

    # Finally we need to iterate through that dictionary and create individual files for each language code
    for lang_code, new_data in remote_availability_map.items():

        # Try to open previous language file
        lang_map_filepath = get_lang_map_filepath(lang_code)
        if not os.path.exists(lang_map_filepath):
            lang_map = {}
        else:
            lang_map = softload_json(lang_map_filepath, logger=logging.error)

        # First, check to see if it's empty (e.g. no subtitles available for any videos)
        if not new_data:
            logging.info(
                "Subtitle support for %s has been terminated; removing." %
                lang_code)
            if os.path.exists(lang_map_filepath):
                os.remove(lang_map_filepath)
            continue

        # Work out which YouTube IDs need to be added to, or dropped from, this language's map
        new_yt_ids = set(new_data.keys())
        old_yt_ids = set(lang_map.keys())
        yt_ids_to_add = new_yt_ids - old_yt_ids
        yt_ids_to_delete = old_yt_ids - new_yt_ids

        if yt_ids_to_add:
            logging.info("Adding %d new YouTube IDs to language (%s)" %
                         (len(yt_ids_to_add), lang_code))
            for yt_id in yt_ids_to_add:
                lang_map[yt_id] = new_data.get(yt_id)

        if yt_ids_to_delete:
            logging.info(
                "Deleting %d old YouTube IDs from language (%s) because they are no longer supported."
                % (len(yt_ids_to_delete), lang_code))
            for yt_id in yt_ids_to_delete:
                lang_map.pop(yt_id, None)

        # Write the new file to the correct location
        logging.debug("Writing %s" % lang_map_filepath)
        ensure_dir(os.path.dirname(lang_map_filepath))
        with open(lang_map_filepath, 'w') as outfile:
            json.dump(lang_map, outfile)

        # Update the big mapping with the most accurate numbers
        remote_availability_map[lang_code].update(lang_map)

    # Finally, remove any files not found in the current map at all.
    if remote_availability_map:  # lang_map_filepath is only defined if the loop above ran
        for filename in os.listdir(os.path.dirname(lang_map_filepath)):
            lang_code = filename.split("_")[0]
            if lang_code not in remote_availability_map:
                file_to_remove = get_lang_map_filepath(lang_code)
                logging.info(
                    "Subtitle support for %s has been terminated; removing." %
                    lang_code)
                if os.path.exists(file_to_remove):
                    os.remove(file_to_remove)
                else:
                    logging.warn(
                        "Subtitles metadata for %s not found; skipping deletion of non-existent file %s."
                        % (lang_code, file_to_remove))

    return remote_availability_map
Example #23
def get_all_remote_video_sizes():
    global REMOTE_VIDEO_SIZES
    if REMOTE_VIDEO_SIZES is None:
        REMOTE_VIDEO_SIZES = softload_json(REMOTE_VIDEO_SIZE_FILEPATH,
                                           logger=logging.debug)
    return REMOTE_VIDEO_SIZES
Example #24
def create_all_mappings(force=False,
                        frequency_to_save=100,
                        response_to_check=None,
                        date_to_check=None,
                        map_file=SRTS_JSON_FILEPATH):
    """
    Write or update JSON file that maps from YouTube ID to Amara code and languages available.

    This command updates the json file that records what languages videos have been subtitled in.
    It loops through all video ids, records a list of which languages Amara says it has been subtitled in
    and meta data about the request (e.g. date, response code).

    See the schema in the docstring for fcn update_video_entry.
    """
    youtube_ids = get_slug2id_map().values()

    # Initialize the data
    if not os.path.exists(map_file):
        ensure_dir(os.path.dirname(map_file))
        if not settings.DEBUG:
            raise CommandError(
                "TRUE central server's srts dict should never be empty; where is your %s?"
                % map_file)
        else:
            # Pull it from the central server
            try:
                logging.debug(
                    "Fetching central server's srt availability file.")
                resp = requests.get(
                    "http://kalite.learningequality.org:7007/media/testing/%s"
                    % (os.path.basename(map_file)))
                resp.raise_for_status()
                with open(map_file, "w") as fp:
                    fp.write(resp.content)
                srts_dict = json.loads(resp.content)
            except Exception as e:
                logging.error(
                    "Failed to download TRUE central server's srts availability file: %s"
                    % e)
                srts_dict = {}

    else:
        # Open the file, read, and clean out old videos.
        #   only handle the error if force=True.
        #   Otherwise, these data are too valuable to lose, so just assume a temp problem.
        srts_dict = softload_json(map_file,
                                  raises=not force,
                                  logger=logging.error)
        if srts_dict:
            logging.info("Loaded %d mappings." % (len(srts_dict)))

        # Set of videos no longer used by KA Lite
        removed_videos = set(srts_dict.keys()) - set(youtube_ids)
        if removed_videos:
            logging.info(
                "Removing subtitle information for %d videos (no longer used)."
                % len(removed_videos))
            for vid in removed_videos:
                del srts_dict[vid]
    logging.info("Querying %d mappings." %
                 (len(youtube_ids) -
                  (0 if (force or date_to_check) else len(srts_dict))))

    # Once we have the current mapping, proceed through logic to update the mapping
    n_refreshed = 0  # keep track to avoid writing if nothing's been refreshed.
    n_new_entries = 0  # keep track for reporting
    n_failures = 0  # keep track for reporting
    for youtube_id in youtube_ids:

        # Decide whether or not to update this video based on the arguments provided at the command line
        cached = youtube_id in srts_dict
        if not force and cached:

            # First, check against date
            flag_for_refresh = True  # not (response_code or last_attempt)
            last_attempt = srts_dict[youtube_id].get("last_attempt")
            last_attempt = None if not last_attempt else datetime.datetime.strptime(
                last_attempt, '%Y-%m-%d')
            flag_for_refresh = flag_for_refresh and (
                not date_to_check or date_to_check > last_attempt)
            if not flag_for_refresh:
                logging.debug("Skipping %s for date-check" % youtube_id)
                continue

            # Second, check against response code
            response_code = srts_dict[youtube_id].get("api_response")
            flag_for_refresh = flag_for_refresh and (
                not response_to_check or response_to_check == "all"
                or response_to_check == response_code)
            if not flag_for_refresh:
                logging.debug("Skipping %s for response-code" % youtube_id)
                continue
            if not response_to_check and not date_to_check and cached:  # no flags specified and already cached - skip
                logging.debug(
                    "Skipping %s for already-cached and no flags specified" %
                    youtube_id)
                continue

        # We're gonna check; just report the reason why.
        if force and not cached:
            logging.debug(
                "Updating %s because force flag (-f) given and video not cached."
                % youtube_id)
        elif force and cached:
            logging.debug(
                "Updating %s because force flag (-f) given. Video was previously cached."
                % youtube_id)
        else:
            logging.debug(
                "Updating %s because video subtitles metadata not yet cached."
                % youtube_id)

        # If it makes it to here without hitting a continue, then update the entry

        try:
            srts_dict[youtube_id] = update_video_entry(youtube_id,
                                                       entry=srts_dict.get(
                                                           youtube_id, {}))
            n_refreshed += 1
        except Exception as e:
            logging.warn("Error updating video %s: %s" % (youtube_id, e))
            n_failures += 1
            continue

        if n_new_entries % frequency_to_save == 0:
            logging.info("On loop %d dumping dictionary into %s" %
                         (n_new_entries, map_file))
            with open(map_file, 'wb') as fp:
                json.dump(srts_dict, fp)
        n_new_entries += 1

    # Finished the loop: save and report
    if n_refreshed > 0:
        with open(map_file, 'wb') as fp:
            json.dump(srts_dict, fp)
    if n_failures == 0:
        logging.info(
            "Great success! Added %d entries, updated %d entries, of %d total."
            % (n_new_entries, n_refreshed, len(srts_dict)))
    else:
        logging.warn(
            "Stored %d new entries, refreshed %d entries, but with %s failures, of %d total."
            % (n_new_entries, n_refreshed, n_failures, len(srts_dict)))

    return n_refreshed != 0