Ejemplo n.º 1
0
def main():
    parser = ArgumentParser(
        description='Convert local Telemetry pings to server storage structure'
    )
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--schema",
                        type=file,
                        default='./telemetry/telemetry_schema.json')
    parser.add_argument("--histogram-cache-dir",
                        default='/tmp/telemetry_histogram_cache')
    args = parser.parse_args()

    print "Getting pings from", args.input_dir, "converting them and storing them in", args.output_dir
    schema = TelemetrySchema(json.load(args.schema))
    cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org')
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, 500000000)
    ping_dir = args.input_dir
    ping_files = get_pings(ping_dir)
    if len(ping_files) == 0:
        # Try the usual ping dir (if the user just gave the Profile Dir)
        ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings")
        ping_files = get_pings(ping_dir)

    print "found", len(ping_files), "pings"
    for ping_file in ping_files:
        with open(os.path.join(ping_dir, ping_file), "r") as f:
            ping = json.load(f)
            reason = ping['reason']
            key = ping['slug']
            payload = ping['payload']
            submission_date = date.today().strftime("%Y%m%d")
            dims = schema.dimensions_from(payload, submission_date)
            try:
                parsed_data, dims = converter.convert_obj(payload, dims[-1])
                serialized_data = converter.serialize(parsed_data)
                data_version = Converter.VERSION_CONVERTED
                try:
                    # Write to persistent storage
                    n = storage.write(key, serialized_data, dims, data_version)
                    print "Successfully saved ping", key, "to", n
                except Exception, e:
                    traceback.print_exc()
            except BadPayloadError, e:
                print "Bad Payload:", e.msg
            except Exception, e:
                traceback.print_exc()
def main():
    parser = ArgumentParser(description='Convert local Telemetry pings to server storage structure')
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--schema", type=file, default='./telemetry/telemetry_schema.json')
    parser.add_argument("--histogram-cache-dir", default='/tmp/telemetry_histogram_cache')
    args = parser.parse_args()

    print "Getting pings from", args.input_dir, "converting them and storing them in", args.output_dir
    schema = TelemetrySchema(json.load(args.schema))
    cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org')
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, 500000000)
    ping_dir = args.input_dir
    ping_files = get_pings(ping_dir)
    if len(ping_files) == 0:
        # Try the usual ping dir (if the user just gave the Profile Dir)
        ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings")
        ping_files = get_pings(ping_dir)

    print "found", len(ping_files), "pings"
    for ping_file in ping_files:
        with open(os.path.join(ping_dir, ping_file), "r") as f:
            ping = json.load(f)
            reason = ping['reason']
            key = ping['slug']
            payload = ping['payload']
            submission_date = date.today().strftime("%Y%m%d")
            dims = schema.dimensions_from(payload, submission_date)
            try:
                parsed_data, dims = converter.convert_obj(payload, dims[-1])
                serialized_data = converter.serialize(parsed_data)
                data_version = Converter.VERSION_CONVERTED
                try:
                    # Write to persistent storage
                    n = storage.write(key, serialized_data, dims, data_version)
                    print "Successfully saved ping", key, "to", n
                except Exception, e:
                    traceback.print_exc()
            except BadPayloadError, e:
                print "Bad Payload:", e.msg
            except Exception, e:
                traceback.print_exc()
Ejemplo n.º 3
0
def main():
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        help="Rotate output files after N bytes",
                        type=int,
                        default=500000000)
    parser.add_argument("-i",
                        "--input-file",
                        help="Filename to read from",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Filename of telemetry schema spec",
                        required=True)
    parser.add_argument("-f",
                        "--file-version",
                        help="Log file version (if omitted, we'll guess)")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    file_version = args.file_version
    if not file_version:
        file_version = fileutil.detect_file_version(args.input_file)
    for r in fileutil.unpack(args.input_file, file_version=file_version):
        record_count += 1
        if r.error:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(r.timestamp /
                                             1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(r.path, errors="replace")
        data = unicode(r.data, errors="replace")

        bytes_read += r.len_ip + r.len_path + r.len_data + fileutil.RECORD_PREAMBLE_LENGTH[
            file_version]
        #print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0
Ejemplo n.º 4
0
def main():
    parser = argparse.ArgumentParser(
        description="Split raw logs into partitioned files.", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000
    )
    parser.add_argument("-i", "--input-file", help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True)
    parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    for len_path, len_data, timestamp, path, data, err in fileutil.unpack(args.input_file):
        record_count += 1
        if err:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(path, errors="replace")
        data = unicode(data, errors="replace")

        bytes_read += len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH
        # print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        # print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read,
        duration,
        mb_read / duration,
        bad_record_count,
        record_count,
    )
    return 0