def create_all_mappings(force=False, frequency_to_save=100, response_to_check=None, date_to_check=None, map_file=SRTS_JSON_FILEPATH):
    """
    Write or update JSON file that maps from YouTube ID to Amara code and languages available.

    This command updates the json file that records what languages videos have been subtitled in.
    It loops through all video ids, records a list of which languages Amara says it has been subtitled in
    and meta data about the request (e.g. date, response code).

    See the schema in the docstring for fcn update_video_entry.

    Args:
        force: if True, refresh every video regardless of what is cached, and
            start from an empty mapping if the existing file cannot be parsed.
        frequency_to_save: write the mapping to disk after every this-many
            successful refreshes; a falsy value disables the intermediate saves.
        response_to_check: only refresh cached entries whose stored
            "api_response" equals this value; the string "all" matches any entry.
        date_to_check: only refresh cached entries whose stored "last_attempt"
            (a '%Y-%m-%d' string) is earlier than this value, which is compared
            against a datetime.datetime.  Entries without a recorded attempt
            are always refreshed when this flag is given.
        map_file: path of the JSON mapping file to read and write.

    Returns:
        True if at least one entry was refreshed, False otherwise.

    Raises:
        Exception: whatever reading/parsing map_file raised, when force is False.
    """
    youtube_ids = get_slug2id_map().values()

    # Initialize the data
    if not os.path.exists(map_file):
        ensure_dir(os.path.dirname(map_file))
        srts_dict = {}
    else:
        # Open the file, read, and clean out old videos.
        try:
            with open(map_file, "r") as fp:
                srts_dict = json.load(fp)
        except Exception as e:
            # Only discard unreadable data when force=True.  Otherwise these data
            # are too valuable to lose, so assume a temporary problem and bail out.
            if not force:
                raise
            logging.error("JSON file corrupted, using empty json and starting from scratch (%s)" % e)
            srts_dict = {}
        else:
            logging.info("Loaded %d mappings." % (len(srts_dict)))

        # Set of videos no longer used by KA Lite
        removed_videos = set(srts_dict.keys()) - set(youtube_ids)
        if removed_videos:
            logging.info("Removing subtitle information for %d videos (no longer used)." % len(removed_videos))
            for vid in removed_videos:
                del srts_dict[vid]
    logging.info("Querying %d mappings." % (len(youtube_ids) - (0 if (force or date_to_check) else len(srts_dict))))

    # Once we have the current mapping, proceed through logic to update the mapping
    n_refreshed = 0     # every successful update; nothing is written if this stays 0
    n_new_entries = 0   # successful updates of videos that were not cached before
    n_failures = 0      # keep track for reporting
    for youtube_id in youtube_ids:

        # Decide whether or not to update this video based on the arguments provided at the command line
        cached = youtube_id in srts_dict
        if cached and not force:
            entry = srts_dict[youtube_id]

            # No flags specified and already cached - nothing to do.
            if not response_to_check and not date_to_check:
                logging.debug("Skipping %s for already-cached and no flags specified" % youtube_id)
                continue

            # First, check against date.  An entry with no recorded attempt is
            # treated as stale (comparing a datetime against None would raise).
            if date_to_check:
                last_attempt = entry.get("last_attempt")
                if last_attempt:
                    last_attempt = datetime.datetime.strptime(last_attempt, '%Y-%m-%d')
                    if not date_to_check > last_attempt:
                        logging.debug("Skipping %s for date-check" % youtube_id)
                        continue

            # Second, check against response code
            if response_to_check and response_to_check != "all" and response_to_check != entry.get("api_response"):
                logging.debug("Skipping %s for response-code" % youtube_id)
                continue

        # We're gonna check; just report the reason why.
        if not cached:
            if force:
                logging.debug("Updating %s because force flag (-f) given and video not cached." % youtube_id)
            else:
                logging.debug("Updating %s because video not yet cached." % youtube_id)
        elif force:
            logging.debug("Updating %s because force flag (-f) given. Video was previously cached." % youtube_id)
        else:
            logging.debug("Updating %s because cached entry matched the date/response-code flags." % youtube_id)

        # If it makes it to here without hitting a continue, then update the entry
        try:
            srts_dict[youtube_id] = update_video_entry(youtube_id, entry=srts_dict.get(youtube_id, {}))
        except Exception as e:
            # Best effort: one failing video must not abort the whole run.
            logging.warning("Error updating video %s: %s" % (youtube_id, e))
            n_failures += 1
            continue

        n_refreshed += 1
        if not cached:
            n_new_entries += 1

        # Periodically checkpoint so a crash mid-run does not lose all progress.
        if frequency_to_save and n_refreshed % frequency_to_save == 0:
            logging.info("On loop %d dumping dictionary into %s" % (n_refreshed, map_file))
            _dump_mappings(srts_dict, map_file)

    # Finished the loop: save and report
    if n_refreshed > 0:
        _dump_mappings(srts_dict, map_file)
    n_updated = n_refreshed - n_new_entries  # refreshed entries that already existed
    if n_failures == 0:
        logging.info("Great success! Added %d entries, updated %d entries, of %d total." % (n_new_entries, n_updated, len(srts_dict)))
    else:
        logging.warning("Stored %d new entries, refreshed %d entries, but with %s failures, of %d total." % (n_new_entries, n_updated, n_failures, len(srts_dict)))

    return n_refreshed != 0


def _dump_mappings(srts_dict, map_file):
    """Serialize the mapping dict to map_file as JSON, replacing its contents."""
    # Text mode: json.dump writes str objects, which a binary-mode file rejects on Python 3.
    with open(map_file, "w") as fp:
        json.dump(srts_dict, fp)
# Example #2
# 0
from shared import topic_tools

# Module-level values obtained from topic_tools once, when this module is imported.
TOPICS = topic_tools.get_topic_tree()
NODE_CACHE = topic_tools.get_node_cache()
SLUG2ID_MAP = topic_tools.get_slug2id_map()
from shared import topic_tools

# Module-level values obtained from topic_tools when this module is imported.
# NOTE(review): the same three names are already assigned from the same calls
# earlier in this file, so each helper is invoked a second time and the names
# are rebound here -- confirm whether the duplication is intentional.
TOPICS          = topic_tools.get_topic_tree()
NODE_CACHE      = topic_tools.get_node_cache()
SLUG2ID_MAP     = topic_tools.get_slug2id_map()