    def handle(self, raw_file):
        self.log("Reading " + raw_file)
        try:
            record_count = 0
            bytes_read = 0
            start = now()
            file_version = fileutil.detect_file_version(raw_file, simple_detection=True)
            self.log("Detected version {0} for file {1}".format(file_version,
                     raw_file))
            for unpacked in fileutil.unpack(raw_file, file_version=file_version):
                record_count += 1
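                # Byte accounting: unpacked.len_data is the size as stored
                # (possibly compressed), while len(unpacked.data) is the size
                # after unpacking; the stats below track both separately.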
                common_bytes = unpacked.len_path + fileutil.RECORD_PREAMBLE_LENGTH[file_version]
                current_bytes = common_bytes + unpacked.len_data
                current_bytes_uncompressed = common_bytes + len(unpacked.data)
                bytes_read += current_bytes
                if unpacked.error:
                    self.log("ERROR: Found corrupted data for record {0} in " \
                             "{1} path: {2} Error: {3}".format(record_count,
                                 raw_file, unpacked.path, unpacked.error))
                    self.stats.increment(records_read=1,
                            bytes_read=current_bytes,
                            bytes_uncompressed=current_bytes_uncompressed,
                            bad_records=1, bad_record_type="corrupted_data")
                    continue
                if len(unpacked.data) == 0:
                    self.log("WARN: Found empty data for record {0} in " \
                             "{2} path: {2}".format(record_count, raw_file,
                                 unpacked.path))
                    self.stats.increment(records_read=1,
                            bytes_read=current_bytes,
                            bytes_uncompressed=current_bytes_uncompressed,
                            bad_records=1, bad_record_type="empty_data")
                    continue

                submission_date = ts_to_yyyymmdd(unpacked.timestamp)
                path = fileutil.to_unicode(unpacked.path)

                if unpacked.data[0] != "{":
                    # Data looks weird, should be JSON.
                    self.log("Warning: Found unexpected data for record {0}" \
                             " in {1} path: {2} data:\n{3}".format(record_count,
                                 raw_file, path, unpacked.data))
                else:
                    # Raw JSON, make sure we treat it as unicode.
                    unpacked.data = fileutil.to_unicode(unpacked.data)

                path_components = path.split("/")
                if len(path_components) != self.expected_dim_count:
                    # We're going to pop the ID off, but we'll also add the
                    # submission date, so it evens out.
                    bad_record_type = "invalid_path"
                    if ReadRawStep.UUID_ONLY_PATH.match(path):
                        bad_record_type = "uuid_only_path"
                    else:
                        self.log("Found an invalid path in record {0}: " \
                             "{1}".format(record_count, path))
                    self.stats.increment(records_read=1,
                            bytes_read=current_bytes,
                            bytes_uncompressed=current_bytes_uncompressed,
                            bad_records=1, bad_record_type=bad_record_type)
                    continue

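                # Path layout: <id>/<reason>/<appName>/<appVersion>/
                #              <appUpdateChannel>/<appBuildID>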
                key = path_components.pop(0)
                info = {}
                info["reason"] = path_components.pop(0)
                info["appName"] = path_components.pop(0)
                info["appVersion"] = path_components.pop(0)
                info["appUpdateChannel"] = path_components.pop(0)
                info["appBuildID"] = path_components.pop(0)
                dims = self.schema.dimensions_from(info, submission_date)
                channel = self.schema.get_field(dims, "appUpdateChannel",
                        True, True)

                self.stats.increment(channel=channel, records_read=1,
                        bytes_read=current_bytes,
                        bytes_uncompressed=current_bytes_uncompressed)

                try:
                    # Convert data:
                    if self.converter is None:
                        serialized_data = unpacked.data
                        # TODO: Converter.VERSION_UNCONVERTED
                        data_version = 1
                    else:
                        parsed_data, parsed_dims = self.converter.convert_json(
                                unpacked.data, dims[-1], unpacked.ip)
                        # TODO: take this out if it's too slow
                        for i in range(len(dims)):
                            if dims[i] != parsed_dims[i]:
                                self.log("Record {0} mismatched dimension " \
                                         "{1}: '{2}' != '{3}'".format(
                                            record_count, i, dims[i],
                                            parsed_dims[i]))
                        serialized_data = self.converter.serialize(parsed_data)
                        dims = parsed_dims
                        # TODO: Converter.VERSION_CONVERTED
                        data_version = 2
                    try:
                        # Write to persistent storage
                        n = self.storage.write(key, serialized_data, dims,
                                data_version)
                        self.stats.increment(channel=channel, records_written=1,
                            bytes_written=len(key) + len(serialized_data) + 2)
                        # Compress rotated files as we generate them
                        if n.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX):
                            self.q_out.put(n)
                    except Exception, e:
                        self.write_bad_record(key, dims, serialized_data,
                                str(e), "ERROR Writing to output file:",
                                "write_failed")
                except BadPayloadError, e:
                    self.write_bad_record(key, dims, unpacked.data, e.msg,
                            "Bad Payload:", "bad_payload")
                except Exception, e:
                    err_message = str(e)
                    if err_message == "Missing in payload: info.revision":
                        # We don't need to write these bad records out - we know
                        # why they are being skipped.
                        self.stats.increment(channel=channel, bad_records=1,
                                bad_record_type="missing_revision")
                    elif err_message == "Invalid revision URL: /rev/":
                        # We do want to log these payloads, but we don't want
                        # the full stack trace.
                        self.write_bad_record(key, dims, unpacked.data, err_message,
                                "Conversion Error", "missing_revision_repo")
                    # Don't split this long string - we want to be able to find it in the code
                    elif err_message.startswith("JSONDecodeError: Invalid control character"):
                        self.write_bad_record(key, dims, unpacked.data, err_message,
                                "Conversion Error", "invalid_control_char")
                    else:
                        # TODO: Recognize other common failure modes and handle
                        #       them gracefully.
                        self.write_bad_record(key, dims, unpacked.data, err_message,
                                "Conversion Error", "conversion_error")
                        self.log(traceback.format_exc())
        except Exception, e:
            # Assumption: any failure while unpacking is fatal for the whole
            # file; log it and let the caller move on.
            self.log("Error reading raw data from {0}: {1}\n{2}".format(
                raw_file, e, traceback.format_exc()))
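
handle() leans on two fileutil helpers that this listing does not show. The sketches below are minimal stand-ins, assuming the same millisecond-timestamp handling and lenient byte decoding that main() in Example #2 performs inline; they are not the actual fileutil implementations:

from datetime import date

def ts_to_yyyymmdd(timestamp):
    # Timestamps arrive in milliseconds (see the comment in main() below),
    # so convert to seconds before formatting the YYYYMMDD submission date.
    return date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")

def to_unicode(s):
    # Decode leniently instead of failing on bad bytes.
    if isinstance(s, unicode):
        return s
    return unicode(s, errors="replace")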
Example #2
def main():
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m",
                        "--max-output-size",
                        metavar="N",
                        help="Rotate output files after N bytes",
                        type=int,
                        default=500000000)
    parser.add_argument("-i",
                        "--input-file",
                        help="Filename to read from",
                        required=True)
    parser.add_argument("-o",
                        "--output-dir",
                        help="Base directory to store split files",
                        required=True)
    parser.add_argument("-t",
                        "--telemetry-schema",
                        help="Filename of telemetry schema spec",
                        required=True)
    parser.add_argument("-f",
                        "--file-version",
                        help="Log file version (if omitted, we'll guess)")
    args = parser.parse_args()

    with open(args.telemetry_schema) as schema_data:
        schema = TelemetrySchema(json.load(schema_data))

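    # StorageLayout partitions records into per-dimension output files and
    # rotates a file once it exceeds max_output_size bytes (see -m above).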
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    file_version = args.file_version
    if not file_version:
        file_version = fileutil.detect_file_version(args.input_file)
    for r in fileutil.unpack(args.input_file, file_version=file_version):
        record_count += 1
        if r.error:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to seconds
        # (i.e. POSIX time) first.
        submission_date = date.fromtimestamp(r.timestamp /
                                             1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(r.path, errors="replace")
        data = unicode(r.data, errors="replace")

        bytes_read += r.len_ip + r.len_path + r.len_data + fileutil.RECORD_PREAMBLE_LENGTH[
            file_version]
        #print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print "  Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0
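
main() returns a process exit code, so a typical entry point for this script would be the standard guard below (assuming sys is imported alongside argparse and the other modules used above):

if __name__ == "__main__":
    sys.exit(main())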