Example #1
from collections import defaultdict


def divide_videos_by_language(youtube_ids):
    """Utility function for separating a list of youtube ids
    into a dictionary of lists, separated by video language
    (as determined by the current dubbed video map)
    """

    buckets_by_lang = defaultdict(list)
    for y_id in youtube_ids:
        buckets_by_lang[get_video_language(y_id)].append(y_id)
    return buckets_by_lang
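
A quick usage sketch. get_video_language is assumed to come from the surrounding codebase; it is stubbed here, with hypothetical ids, purely for illustration:

# Hypothetical stub standing in for the real dubbed-video-map lookup.
def get_video_language(youtube_id):
    return {"xvnpSRO9IDM": "en", "abc123def45": "es"}.get(youtube_id, "en")

buckets = divide_videos_by_language(["xvnpSRO9IDM", "abc123def45"])
# buckets is a defaultdict holding {"en": ["xvnpSRO9IDM"], "es": ["abc123def45"]}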
Example #2
import datetime
import logging
import os
from datetime import timedelta

# stats_logger, sorted_dict, sum_counter, get_id2slug_map, get_video_id, and
# get_video_language are helpers assumed to come from the surrounding codebase.


def show_logs(request, ndays=None):
    """Show file-based logging info for video downloads, language packs, and subtitles"""
    ndays = ndays or int(request.GET.get("days", 7))

    def get_logger_filename(logger_type):
        return stats_logger(logger_type).handlers[0].baseFilename

    def parse_data(logger_type, data_fields, windowsize=128, ndays=None):
        parsed_data = {}
        nparts = len(data_fields)
        summary_data = {fld: {} for fld in data_fields + ["date"]}

        filepath = get_logger_filename(logger_type)
        if not os.path.exists(filepath):
            return (parsed_data, summary_data)

        # Read the log backwards, tallying counts per field value and per date
        old_data = ""
        first_loop = True
        last_loop = False
        with open(filepath, "r") as fp:
            fp.seek(0, 2)  # go to the end of the stream
            while True:
                # Read the next chunk of data
                try:
                    # Get the data
                    try:
                        if first_loop:
                            fp.seek(-windowsize, 1)  # back up one window from the end
                            first_loop = False
                        else:
                            fp.seek(-2 * windowsize, 1)  # back up two windows: past the chunk just read, then one more

                        cur_data = fp.read(windowsize) + old_data
                    except (IOError, OSError):  # tried to seek before the start of the file
                        if last_loop and not old_data:
                            raise
                        elif last_loop:
                            cur_data = old_data
                            old_data = ""
                        else:
                            last_loop = True
                            fp.seek(0)
                            cur_data = fp.read(windowsize) + old_data  # final partial window; may overlap data already read

                    if not cur_data:
                        break
                except (IOError, OSError):
                    break

                # Parse the data
                lines = cur_data.split("\n")
                old_data = lines[0] if len(lines) > 1 else ""
                new_data = lines[1:] if len(lines) > 1 else lines
                for line in new_data:
                    if not line:
                        continue

                    # Every line starts with a date-prefixed timestamp
                    parts = line.split(" - ", 1)
                    if len(parts) != 2:
                        continue
                    tim = parts[0]
                    dat = tim.split(" ")[0]

                    # Stop once we reach entries older than the accepted range
                    parsed_date = datetime.datetime.strptime(dat, "%Y-%m-%d")
                    if ndays is not None:
                        cutoff = datetime.datetime.now() - timedelta(days=ndays)
                        logging.debug("%s %s" % (parsed_date, cutoff))
                        if cutoff > parsed_date:
                            last_loop = True
                            old_data = ""
                            break

                    # The rest is semicolon-delimited
                    parts = parts[1].split(";")  # vd;127.0.0.1;xvnpSRO9IDM

                    # Now save things off
                    parsed_data[tim] = {data_fields[idx]: parts[idx] for idx in range(nparts)}
                    summary_data["date"][dat] = 1 + summary_data["date"].get(dat, 0)
                    for idx in range(nparts):
                        summary_data[data_fields[idx]][parts[idx]] = 1 + summary_data[data_fields[idx]].get(
                            parts[idx], 0
                        )

        for key, val in summary_data.items():
            summary_data[key] = sorted_dict(val, key=lambda t: t[0])

        return (parsed_data, summary_data)

    (video_raw_data, video_summary_data) = parse_data("videos", ["task_id", "ip_address", "youtube_id"], ndays=ndays)
    (lp_raw_data, lp_summary_data) = parse_data(
        "language_packs", ["task_id", "ip_address", "lang_code", "version"], ndays=ndays
    )
    (srt_raw_data, srt_summary_data) = parse_data(
        "subtitles", ["task_id", "ip_address", "lang_code", "youtube_id"], ndays=ndays
    )

    return {
        "ndays": ndays,
        "videos": {
            "raw": video_raw_data,
            "dates": video_summary_data["date"],
            "ips": video_summary_data["ip_address"],
            "slugs": sum_counter(
                video_summary_data["youtube_id"], fn=lambda yid: get_id2slug_map().get(get_video_id(yid))
            ),
            "lang_codes": sum_counter(video_summary_data["youtube_id"], fn=lambda yid: get_video_language(yid)),
        },
        "language_packs": {
            "raw": lp_raw_data,
            "dates": lp_summary_data["date"],
            "ips": lp_summary_data["ip_address"],
            "lang_codes": lp_summary_data["lang_code"],
            "versions": lp_summary_data["version"],
        },
        "subtitles": {
            "raw": srt_raw_data,
            "dates": srt_summary_data["date"],
            "ips": srt_summary_data["ip_address"],
            "lang_codes": srt_summary_data["lang_code"],
        },
    }
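
The chunked backwards read inside parse_data is easy to get lost in, so here is a minimal, self-contained sketch of the same idea: read fixed-size windows from the end of the file, carrying any partial line into the next window. read_lines_backwards is a hypothetical helper written for illustration, not part of the original code:

import os


def read_lines_backwards(filepath, windowsize=128):
    """Yield complete lines of a file, newest first, by reading
    fixed-size windows backwards from the end (the pattern that
    parse_data implements above)."""
    with open(filepath, "rb") as fp:
        fp.seek(0, os.SEEK_END)
        pos = fp.tell()
        tail = b""
        while pos > 0:
            step = min(windowsize, pos)
            pos -= step
            fp.seek(pos)
            chunk = fp.read(step) + tail
            lines = chunk.split(b"\n")
            if pos > 0:
                # The first piece may be a partial line; carry it into the next window.
                tail = lines[0]
                lines = lines[1:]
            for line in reversed(lines):
                if line:
                    yield line.decode("utf-8")

Reading backwards this way is what lets the ndays cutoff stop the scan early (via last_loop) instead of parsing the whole log file.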