def create_all_mappings(force=False, frequency_to_save=100, response_to_check=None, date_to_check=None, map_file=SRTS_JSON_FILEPATH):
    """Write or update the JSON file mapping YouTube IDs to Amara codes and available languages.

    Loops through all video ids, records the list of languages Amara says
    each video has been subtitled in, plus metadata about the request
    (e.g. date, response code).  See the docstring of update_video_entry
    for the schema of each entry.

    Arguments:
        force: re-query every video, even ones already cached; also allows
            starting over from an empty mapping if the JSON file is corrupt.
        frequency_to_save: checkpoint the mapping to disk every N successful updates.
        response_to_check: only re-query cached videos whose stored
            "api_response" equals this value ("all" matches any response).
        date_to_check: only re-query cached videos last attempted before this date.
        map_file: path of the JSON mapping file to read and write.

    Returns:
        True if at least one entry was refreshed (and the file rewritten).
    """
    youtube_ids = get_slug2id_map().values()

    # Initialize the data: load the existing mapping, or start fresh.
    if not os.path.exists(map_file):
        ensure_dir(os.path.dirname(map_file))
        srts_dict = {}
    else:
        try:
            with open(map_file, "r") as fp:
                srts_dict = json.load(fp)
        except Exception as e:
            if not force:
                # Without force=True these data are too valuable to lose;
                # assume a temporary problem and propagate the error.
                raise
            logging.error("JSON file corrupted, using empty json and starting from scratch (%s)" % e)
            srts_dict = {}
        else:
            logging.info("Loaded %d mappings." % (len(srts_dict)))

    # Drop videos no longer used by KA Lite.
    removed_videos = set(srts_dict.keys()) - set(youtube_ids)
    if removed_videos:
        logging.info("Removing subtitle information for %d videos (no longer used)." % len(removed_videos))
        for vid in removed_videos:
            del srts_dict[vid]

    logging.info("Querying %d mappings." % (len(youtube_ids) - (0 if (force or date_to_check) else len(srts_dict))))

    # Once we have the current mapping, proceed through logic to update it.
    n_refreshed = 0    # entries successfully updated; avoid rewriting the file if 0
    n_new_entries = 0  # for reporting and checkpoint cadence
    n_failures = 0     # for reporting

    for youtube_id in youtube_ids:
        # Decide whether or not to update this video based on the flags provided.
        cached = youtube_id in srts_dict
        if not force and cached:
            # First, check against the date flag.
            flag_for_refresh = True
            last_attempt = srts_dict[youtube_id].get("last_attempt")
            last_attempt = None if not last_attempt else datetime.datetime.strptime(last_attempt, '%Y-%m-%d')
            # BUGFIX: comparing datetime > None raises TypeError on Python 3;
            # an entry with no recorded attempt should always be refreshed.
            flag_for_refresh = flag_for_refresh and (not date_to_check or last_attempt is None or date_to_check > last_attempt)
            if not flag_for_refresh:
                logging.debug("Skipping %s for date-check" % youtube_id)
                continue

            # Second, check against the response-code flag.
            response_code = srts_dict[youtube_id].get("api_response")
            flag_for_refresh = flag_for_refresh and (not response_to_check or response_to_check == "all" or response_to_check == response_code)
            if not flag_for_refresh:
                logging.debug("Skipping %s for response-code" % youtube_id)
                continue

            if not response_to_check and not date_to_check and cached:
                # No flags specified and already cached - skip.
                logging.debug("Skipping %s for already-cached and no flags specified" % youtube_id)
                continue

        # We're gonna check; just report the reason why.
        if force and not cached:
            logging.debug("Updating %s because force flag (-f) given and video not cached." % youtube_id)
        elif force and cached:
            logging.debug("Updating %s because force flag (-f) given. Video was previously cached." % youtube_id)
        else:
            logging.debug("Updating %s because video not yet cached." % youtube_id)

        # If it makes it to here without hitting a continue, update the entry.
        try:
            srts_dict[youtube_id] = update_video_entry(youtube_id, entry=srts_dict.get(youtube_id, {}))
            n_refreshed += 1
        except Exception as e:
            logging.warning("Error updating video %s: %s" % (youtube_id, e))
            n_failures += 1
            continue

        # Periodic checkpoint.  BUGFIX: increment before the modulo test so
        # the first successful update (counter == 0) no longer triggers an
        # immediate, pointless dump of the whole file.
        n_new_entries += 1
        if n_new_entries % frequency_to_save == 0:
            logging.info("On loop %d dumping dictionary into %s" % (n_new_entries, map_file))
            # BUGFIX: json.dump requires a text-mode file on Python 3 ('wb' -> 'w';
            # 'w' is also valid on Python 2).
            with open(map_file, 'w') as fp:
                json.dump(srts_dict, fp)

    # Finished the loop: save and report.
    if n_refreshed > 0:
        with open(map_file, 'w') as fp:
            json.dump(srts_dict, fp)

    # NOTE(review): n_new_entries counts every successful update (new or
    # cached), so "Added" below overstates genuinely-new entries — kept
    # as-is to preserve the original reporting semantics.
    if n_failures == 0:
        logging.info("Great success! Added %d entries, updated %d entries, of %d total." % (n_new_entries, n_refreshed, len(srts_dict)))
    else:
        logging.warning("Stored %d new entries, refreshed %d entries, but with %s failures, of %d total." % (n_new_entries, n_refreshed, n_failures, len(srts_dict)))

    return n_refreshed != 0
from shared import topic_tools

# Module-level caches computed once at import time via project helpers.
# NOTE(review): these run I/O-ish work on import; semantics below are
# inferred from the helper names — confirm against shared.topic_tools.
TOPICS = topic_tools.get_topic_tree()  # presumably the full topic tree structure
NODE_CACHE = topic_tools.get_node_cache()  # presumably a lookup cache of topic-tree nodes
SLUG2ID_MAP = topic_tools.get_slug2id_map()  # presumably maps video slug -> YouTube id (used by create_all_mappings)