def divide_videos_by_language(youtube_ids):
    """Utility function for separating a list of youtube ids into a dictionary of lists,
    separated by video language (as determined by the current dubbed video map).
    """
    buckets_by_lang = defaultdict(list)
    for y_id in youtube_ids:
        buckets_by_lang[get_video_language(y_id)].append(y_id)
    return buckets_by_lang
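
# Illustrative usage sketch (not part of the original module): assuming the
# dubbed video map reports "xvnpSRO9IDM" as English and the hypothetical id
# "abc123" as Spanish, calling
#
#     divide_videos_by_language(["xvnpSRO9IDM", "abc123"])
#
# would return {"en": ["xvnpSRO9IDM"], "es": ["abc123"]}.
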
def show_logs(request, ndays=None):
    """Show file-based logging info for video downloads, language packs, and subtitles."""
    ndays = ndays or int(request.GET.get("days", 7))

    def get_logger_filename(logger_type):
        return stats_logger(logger_type).handlers[0].baseFilename

    def parse_data(logger_type, data_fields, windowsize=128, ndays=None):
        parsed_data = {}
        nparts = len(data_fields)
        summary_data = dict([(fld, {}) for fld in (data_fields + ["date"])])

        filepath = get_logger_filename(logger_type)
        if not os.path.exists(filepath):
            return (parsed_data, summary_data)

        # Group by ip, date, and youtube_id.
        # Read the log file backwards in windowsize-byte chunks, so parsing can
        # stop as soon as an entry older than ndays is reached.
        old_data = ""
        first_loop = True
        last_loop = False
        with open(filepath, "r") as fp:
            fp.seek(0, 2)  # go to the end of the stream

            while True:
                # Read the next chunk of data
                try:
                    try:
                        if first_loop:
                            fp.seek(-windowsize, 1)  # go backwards by one window
                            first_loop = False
                        else:
                            fp.seek(-2 * windowsize, 1)  # go backwards by one window, undoing the last read
                        cur_data = fp.read(windowsize) + old_data
                    except:  # seeking before the start of the file fails; read the remaining head instead
                        if last_loop and not old_data:
                            raise
                        elif last_loop:
                            cur_data = old_data
                            old_data = ""
                        else:
                            last_loop = True
                            fp.seek(0)
                            cur_data = fp.read(windowsize) + old_data  # could be some overlap...

                    if not cur_data:
                        break
                except:
                    break

                # Parse the data; the first (partial) line is carried over to the next chunk
                lines = cur_data.split("\n")
                old_data = lines[0] if len(lines) > 1 else ""
                new_data = lines[1:] if len(lines) > 1 else lines

                for l in new_data:
                    if not l:
                        continue

                    # All lines start with a date/time stamp, separated by " - "
                    parts = l.split(" - ", 2)
                    if len(parts) != 2:
                        continue
                    tim = parts[0]
                    dat = tim.split(" ")[0]

                    # Validate that this date is within the accepted range
                    parsed_date = datetime.datetime.strptime(dat, "%Y-%m-%d")
                    if ndays is not None:
                        cutoff = datetime.datetime.now() - timedelta(days=ndays)
                        logging.debug("%s %s" % (parsed_date, cutoff))
                        if cutoff > parsed_date:
                            last_loop = True
                            old_data = ""
                            break

                    # The rest is semicolon-delimited
                    parts = parts[1].split(";")  # e.g. vd;127.0.0.1;xvnpSRO9IDM

                    # Now save things off
                    parsed_data[tim] = dict([(data_fields[idx], parts[idx]) for idx in range(nparts)])
                    summary_data["date"][dat] = 1 + summary_data["date"].get(dat, 0)
                    for idx in range(nparts):
                        summary_data[data_fields[idx]][parts[idx]] = 1 + summary_data[data_fields[idx]].get(parts[idx], 0)

        for key, val in summary_data.iteritems():
            summary_data[key] = sorted_dict(val, key=lambda t: t[0])

        return (parsed_data, summary_data)

    (video_raw_data, video_summary_data) = parse_data("videos", ["task_id", "ip_address", "youtube_id"], ndays=ndays)
    (lp_raw_data, lp_summary_data) = parse_data("language_packs", ["task_id", "ip_address", "lang_code", "version"], ndays=ndays)
    (srt_raw_data, srt_summary_data) = parse_data("subtitles", ["task_id", "ip_address", "lang_code", "youtube_id"], ndays=ndays)

    return {
        "ndays": ndays,
        "videos": {
            "raw": video_raw_data,
            "dates": video_summary_data["date"],
            "ips": video_summary_data["ip_address"],
            "slugs": sum_counter(video_summary_data["youtube_id"], fn=lambda yid: get_id2slug_map().get(get_video_id(yid))),
            "lang_codes": sum_counter(video_summary_data["youtube_id"], fn=lambda yid: get_video_language(yid)),
        },
        "language_packs": {
            "raw": lp_raw_data,
            "dates": lp_summary_data["date"],
            "ips": lp_summary_data["ip_address"],
            "lang_codes": lp_summary_data["lang_code"],
            "versions": lp_summary_data["version"],
        },
        "subtitles": {
            "raw": srt_raw_data,
            "dates": srt_summary_data["date"],
            "ips": srt_summary_data["ip_address"],
            "lang_codes": srt_summary_data["lang_code"],
        },
    }
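
# Illustrative sketch of the log-line format that parse_data() above expects
# (the exact format is produced by the stats_logger() handlers, configured
# elsewhere in the codebase; the timestamp below is a made-up example):
#
#     2014-01-01 12:34:56 - vd;127.0.0.1;xvnpSRO9IDM
#
# The timestamp before " - " becomes the parsed_data key, and the semicolon-
# delimited remainder is mapped onto data_fields, e.g. for the "videos" logger:
#     {"task_id": "vd", "ip_address": "127.0.0.1", "youtube_id": "xvnpSRO9IDM"}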