def handle(self, raw_file):
    """Read packed records from raw_file, partition them by path dimensions,
    optionally convert the JSON payload, and write each record to storage.

    Counters updated: self.records_read, self.bad_records, self.bytes_read,
    self.records_written, self.bytes_written.
    NOTE(review): the matching except/finally for the outer `try:` below is
    not visible in this chunk - the function appears truncated here.
    """
    print self.label, "reading", raw_file
    try:
        record_count = 0
        bytes_read = 0
        start = datetime.now()
        # Each unpacked record is (len_path, len_data, timestamp, path, data, err).
        for len_path, len_data, timestamp, path, data, err in fileutil.unpack(raw_file):
            record_count += 1
            self.records_read += 1
            if err:
                # Unpacker flagged this record as corrupted; count and skip it.
                print self.label, "ERROR: Found corrupted data for record", record_count, "in", raw_file, "path:", path, "Error:", err
                self.bad_records += 1
                continue
            if len(data) == 0:
                # Payload is empty; nothing to convert or store.
                print self.label, "ERROR: Found empty data for record", record_count, "in", raw_file, "path:", path
                self.bad_records += 1
                continue
            # Incoming timestamps are in milliseconds, so convert to POSIX first
            # (ie. seconds)
            submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")
            # NOTE(review): errors="replace" decodes byte-by-byte and mangles
            # multi-byte UTF-8 into U+FFFD (see test_utf8 elsewhere in this
            # file); later versions use fileutil.to_unicode - confirm intent.
            path = unicode(path, errors="replace")
            #print "Path for record", record_count, path, "length of data:", len_data
            if data[0] != "{":
                # Data looks weird, should be JSON.
                print self.label, "Warning: Found unexpected data for record", record_count, "in", raw_file, "path:", path, "data:"
                print data
            else:
                # Raw JSON, make sure we treat it as unicode.
                data = unicode(data, errors="replace")
            # Account for the on-disk size of this record (path + data + header).
            current_bytes = len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH
            bytes_read += current_bytes
            self.bytes_read += current_bytes
            path_components = path.split("/")
            if len(path_components) != self.expected_dim_count:
                # We're going to pop the ID off, but we'll also add the
                # submission date, so it evens out.
                print self.label, "Found an invalid path in record", record_count, path
                continue
            # Path layout: <id>/<reason>/<appName>/<appVersion>/<appUpdateChannel>/<appBuildID>
            key = path_components.pop(0)
            info = {}
            info["reason"] = path_components.pop(0)
            info["appName"] = path_components.pop(0)
            info["appVersion"] = path_components.pop(0)
            info["appUpdateChannel"] = path_components.pop(0)
            info["appBuildID"] = path_components.pop(0)
            dims = self.schema.dimensions_from(info, submission_date)
            try:
                # Convert data:
                if self.converter is None:
                    # No converter configured: store the raw payload as version 1.
                    serialized_data = data
                    data_version = 1
                else:
                    parsed_data, parsed_dims = self.converter.convert_json(data, dims[-1])
                    # TODO: take this out if it's too slow
                    for i in range(len(dims)):
                        if dims[i] != parsed_dims[i]:
                            print self.label, "Record", self.records_read, "mismatched dimension", i, dims[i], "!=", parsed_dims[i]
                    serialized_data = self.converter.serialize(parsed_data)
                    # Trust the converter's dimensions over the path-derived ones.
                    dims = parsed_dims
                    data_version = 2
                try:
                    # Write to persistent storage
                    n = self.storage.write(key, serialized_data, dims, data_version)
                    self.bytes_written += len(key) + len(serialized_data) + 1
                    self.records_written += 1
                    # Compress rotated files as we generate them
                    if n.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX):
                        self.q_out.put(n)
                except Exception, e:
                    # Storage failure: preserve the record for later inspection.
                    self.write_bad_record(key, dims, serialized_data, str(e), "ERROR Writing to output file:")
            except BadPayloadError, e:
                self.write_bad_record(key, dims, data, e.msg, "Bad Payload:")
            except Exception, e:
                err_message = str(e)
                # We don't need to write these bad records out - we know
                # why they are being skipped.
                if err_message != "Missing in payload: info.revision":
                    # TODO: recognize other common failure modes and handle them gracefully.
                    self.write_bad_record(key, dims, data, err_message, "Conversion Error:")
                    traceback.print_exc()
def handle(self, raw_file): self.log("Reading " + raw_file) try: record_count = 0 bytes_read = 0 start = now() file_version = fileutil.detect_file_version(raw_file, simple_detection=True) self.log("Detected version {0} for file {1}".format( file_version, raw_file)) for unpacked in fileutil.unpack(raw_file, file_version=file_version): record_count += 1 common_bytes = unpacked.len_path + fileutil.RECORD_PREAMBLE_LENGTH[ file_version] current_bytes = common_bytes + unpacked.len_data current_bytes_uncompressed = common_bytes + len(unpacked.data) bytes_read += current_bytes if unpacked.error: self.log("ERROR: Found corrupted data for record {0} in " \ "{1} path: {2} Error: {3}".format(record_count, raw_file, unpacked.path, unpacked.error)) self.stats.increment( records_read=1, bytes_read=current_bytes, bytes_uncompressed=current_bytes_uncompressed, bad_records=1, bad_record_type="corrupted_data") continue if len(unpacked.data) == 0: self.log("WARN: Found empty data for record {0} in " \ "{2} path: {2}".format(record_count, raw_file, unpacked.path)) self.stats.increment( records_read=1, bytes_read=current_bytes, bytes_uncompressed=current_bytes_uncompressed, bad_records=1, bad_record_type="empty_data") continue submission_date = ts_to_yyyymmdd(unpacked.timestamp) path = unicode(unpacked.path, errors="replace") if unpacked.data[0] != "{": # Data looks weird, should be JSON. self.log("Warning: Found unexpected data for record {0}" \ " in {1} path: {2} data:\n{3}".format(record_count, raw_file, path, unpacked.data)) else: # Raw JSON, make sure we treat it as unicode. unpacked.data = unicode(unpacked.data, errors="replace") path_components = path.split("/") if len(path_components) != self.expected_dim_count: # We're going to pop the ID off, but we'll also add the # submission date, so it evens out. 
bad_record_type = "invalid_path" if ReadRawStep.UUID_ONLY_PATH.match(path): bad_record_type = "uuid_only_path" else: self.log("Found an invalid path in record {0}: " \ "{1}".format(record_count, path)) self.stats.increment( records_read=1, bytes_read=current_bytes, bytes_uncompressed=current_bytes_uncompressed, bad_records=1, bad_record_type=bad_record_type) continue key = path_components.pop(0) info = {} info["reason"] = path_components.pop(0) info["appName"] = path_components.pop(0) info["appVersion"] = path_components.pop(0) info["appUpdateChannel"] = path_components.pop(0) info["appBuildID"] = path_components.pop(0) dims = self.schema.dimensions_from(info, submission_date) channel = self.schema.get_field(dims, "appUpdateChannel", True, True) self.stats.increment( channel=channel, records_read=1, bytes_read=current_bytes, bytes_uncompressed=current_bytes_uncompressed) try: # Convert data: if self.converter is None: serialized_data = unpacked.data # TODO: Converter.VERSION_UNCONVERTED data_version = 1 else: parsed_data, parsed_dims = self.converter.convert_json( unpacked.data, dims[-1], unpacked.ip) # TODO: take this out if it's too slow for i in range(len(dims)): if dims[i] != parsed_dims[i]: self.log("Record {0} mismatched dimension " \ "{1}: '{2}' != '{3}'".format( record_count, i, dims[1], parsed_dims[i])) serialized_data = self.converter.serialize(parsed_data) dims = parsed_dims # TODO: Converter.VERSION_CONVERTED data_version = 2 try: # Write to persistent storage n = self.storage.write(key, serialized_data, dims, data_version) self.stats.increment(channel=channel, records_written=1, bytes_written=len(key) + len(serialized_data) + 2) # Compress rotated files as we generate them if n.endswith( StorageLayout.PENDING_COMPRESSION_SUFFIX): self.q_out.put(n) except Exception, e: self.write_bad_record(key, dims, serialized_data, str(e), "ERROR Writing to output file:", "write_failed") except BadPayloadError, e: self.write_bad_record(key, dims, unpacked.data, 
e.msg, "Bad Payload:", "bad_payload") except Exception, e: err_message = str(e) if err_message == "Missing in payload: info.revision": # We don't need to write these bad records out - we know # why they are being skipped. self.stats.increment( channel=channel, bad_records=1, bad_record_type="missing_revision") elif err_message == "Invalid revision URL: /rev/": # We do want to log these payloads, but we don't want # the full stack trace. self.write_bad_record(key, dims, unpacked.data, err_message, "Conversion Error", "missing_revision_repo") # Don't split this long string - we want to be able to find it in the code elif err_message.startswith( "JSONDecodeError: Invalid control character"): self.write_bad_record(key, dims, unpacked.data, err_message, "Conversion Error", "invalid_control_char") else: # TODO: Recognize other common failure modes and handle # them gracefully. self.write_bad_record(key, dims, unpacked.data, err_message, "Conversion Error", "conversion_error") self.log(traceback.format_exc())
def handle(self, raw_file): self.log("Reading " + raw_file) try: record_count = 0 bytes_read = 0 start = now() file_version = fileutil.detect_file_version(raw_file, simple_detection=True) self.log("Detected version {0} for file {1}".format(file_version, raw_file)) for unpacked in fileutil.unpack(raw_file, file_version=file_version): record_count += 1 common_bytes = unpacked.len_path + fileutil.RECORD_PREAMBLE_LENGTH[file_version] current_bytes = common_bytes + unpacked.len_data current_bytes_uncompressed = common_bytes + len(unpacked.data) bytes_read += current_bytes if unpacked.error: self.log("ERROR: Found corrupted data for record {0} in " \ "{1} path: {2} Error: {3}".format(record_count, raw_file, unpacked.path, unpacked.error)) self.stats.increment(records_read=1, bytes_read=current_bytes, bytes_uncompressed=current_bytes_uncompressed, bad_records=1, bad_record_type="corrupted_data") continue if len(unpacked.data) == 0: self.log("WARN: Found empty data for record {0} in " \ "{2} path: {2}".format(record_count, raw_file, unpacked.path)) self.stats.increment(records_read=1, bytes_read=current_bytes, bytes_uncompressed=current_bytes_uncompressed, bad_records=1, bad_record_type="empty_data") continue submission_date = ts_to_yyyymmdd(unpacked.timestamp) path = fileutil.to_unicode(unpacked.path) if unpacked.data[0] != "{": # Data looks weird, should be JSON. self.log("Warning: Found unexpected data for record {0}" \ " in {1} path: {2} data:\n{3}".format(record_count, raw_file, path, unpacked.data)) else: # Raw JSON, make sure we treat it as unicode. unpacked.data = fileutil.to_unicode(unpacked.data) path_components = path.split("/") if len(path_components) != self.expected_dim_count: # We're going to pop the ID off, but we'll also add the # submission date, so it evens out. 
bad_record_type = "invalid_path" if ReadRawStep.UUID_ONLY_PATH.match(path): bad_record_type = "uuid_only_path" else: self.log("Found an invalid path in record {0}: " \ "{1}".format(record_count, path)) self.stats.increment(records_read=1, bytes_read=current_bytes, bytes_uncompressed=current_bytes_uncompressed, bad_records=1, bad_record_type=bad_record_type) continue key = path_components.pop(0) info = {} info["reason"] = path_components.pop(0) info["appName"] = path_components.pop(0) info["appVersion"] = path_components.pop(0) info["appUpdateChannel"] = path_components.pop(0) info["appBuildID"] = path_components.pop(0) dims = self.schema.dimensions_from(info, submission_date) channel = self.schema.get_field(dims, "appUpdateChannel", True, True) self.stats.increment(channel=channel, records_read=1, bytes_read=current_bytes, bytes_uncompressed=current_bytes_uncompressed) try: # Convert data: if self.converter is None: serialized_data = unpacked.data # TODO: Converter.VERSION_UNCONVERTED data_version = 1 else: parsed_data, parsed_dims = self.converter.convert_json( unpacked.data, dims[-1], unpacked.ip) # TODO: take this out if it's too slow for i in range(len(dims)): if dims[i] != parsed_dims[i]: self.log("Record {0} mismatched dimension " \ "{1}: '{2}' != '{3}'".format( record_count, i, dims[i], parsed_dims[i])) serialized_data = self.converter.serialize(parsed_data) dims = parsed_dims # TODO: Converter.VERSION_CONVERTED data_version = 2 try: # Write to persistent storage n = self.storage.write(key, serialized_data, dims, data_version) self.stats.increment(channel=channel, records_written=1, bytes_written=len(key) + len(serialized_data) + 2) # Compress rotated files as we generate them if n.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX): self.q_out.put(n) except Exception, e: self.write_bad_record(key, dims, serialized_data, str(e), "ERROR Writing to output file:", "write_failed") except BadPayloadError, e: self.write_bad_record(key, dims, unpacked.data, e.msg, 
"Bad Payload:", "bad_payload") except Exception, e: err_message = str(e) if err_message == "Missing in payload: info.revision": # We don't need to write these bad records out - we know # why they are being skipped. self.stats.increment(channel=channel, bad_records=1, bad_record_type="missing_revision") elif err_message == "Invalid revision URL: /rev/": # We do want to log these payloads, but we don't want # the full stack trace. self.write_bad_record(key, dims, unpacked.data, err_message, "Conversion Error", "missing_revision_repo") # Don't split this long string - we want to be able to find it in the code elif err_message.startswith("JSONDecodeError: Invalid control character"): self.write_bad_record(key, dims, unpacked.data, err_message, "Conversion Error", "invalid_control_char") else: # TODO: Recognize other common failure modes and handle # them gracefully. self.write_bad_record(key, dims, unpacked.data, err_message, "Conversion Error", "conversion_error") self.log(traceback.format_exc())
def test_utf8(self):
    """Verify UTF-8 handling end-to-end: raw bytes -> fu.to_unicode ->
    converter -> serialized escaped-unicode JSON.

    Also demonstrates that unicode(data, errors="replace") is the WRONG way
    to decode UTF-8 bytes (each byte of a multi-byte char becomes U+FFFD).
    Byte offsets below are fixed positions of "Wikipédia" in the fixture.
    """
    count = 0
    # This packed file contains the same record twice, once gzipped, once
    # raw. The record contains the UTF-8 encoded string "Wikipédia"
    for r in fu.unpack('test/unicode.v1.packed', file_version="v1"):
        self.assertIs(r.error, None)
        self.assertTrue(len(r.data) > 0)
        self.assertEqual(r.data[0], '{')
        count += 1
        # self.print_byte_range(r.data, 3323, 3362)
        # Make sure that the raw bytes are correct:
        # Incoming is UTF-8, so we expect
        #   W i k i p 0xc3 0xa9 d i a
        self.assertEqual(ord(r.data[0xd16]), 0x70)
        self.assertEqual(ord(r.data[0xd17]), 0xc3)
        self.assertEqual(ord(r.data[0xd18]), 0xa9)
        self.assertEqual(ord(r.data[0xd19]), 0x64)

        # Convert the data the wrong way:
        bad = unicode(r.data, errors="replace")
        # self.print_byte_range(bad, 3323, 3362)
        # Verify that we see the replacement chars in the expected places:
        #   W i k i p 0xfffd 0xfffd d i a
        self.assertEqual(ord(bad[0xd16]), 0x70)
        self.assertEqual(ord(bad[0xd17]), 0xfffd)
        self.assertEqual(ord(bad[0xd18]), 0xfffd)
        self.assertEqual(ord(bad[0xd19]), 0x64)

        # Now convert properly:
        good = fu.to_unicode(r.data)
        # self.print_byte_range(good, 3322, 3360)
        # Now we have unicode, so we expect
        #   W i k i p 0xe9 d i a
        # (one code point fewer than the byte form, hence the shifted offsets)
        self.assertEqual(ord(good[0xd15]), 0x70)
        self.assertEqual(ord(good[0xd16]), 0xe9)
        self.assertEqual(ord(good[0xd17]), 0x64)

        # The converter must preserve the decoded character.
        converted, dimensions = ConvertTest.converter.convert_json(
                good, "20131114", None)
        engine = converted["simpleMeasurements"]["UITelemetry"][
                "toolbars"]["currentSearchEngine"]
        # print engine
        self.assertEqual(ord(engine[4]), 0x70)
        self.assertEqual(ord(engine[5]), 0xe9)
        self.assertEqual(ord(engine[6]), 0x64)

        serialized = ConvertTest.converter.serialize(converted,
                sort_keys=True)
        # self.print_byte_range(serialized, 4007, 4049)
        # Now we have escaped unicode, so we expect
        #   W i k i p \ u 0 0 e 9 d i a
        self.assertEqual(ord(serialized[0xfc2]), 0x70)
        self.assertEqual(ord(serialized[0xfc3]), 0x5c)
        self.assertEqual(ord(serialized[0xfc4]), 0x75)
        self.assertEqual(ord(serialized[0xfc5]), 0x30)
        self.assertEqual(ord(serialized[0xfc6]), 0x30)
        self.assertEqual(ord(serialized[0xfc7]), 0x65)
        self.assertEqual(ord(serialized[0xfc8]), 0x39)
        self.assertEqual(ord(serialized[0xfc9]), 0x64)
    # Both copies of the record (gzipped and raw) must have been seen.
    self.assertEqual(count, 2)
def main(): parser = argparse.ArgumentParser( description='Split raw logs into partitioned files.', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000) parser.add_argument("-i", "--input-file", help="Filename to read from", required=True) parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True) parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True) parser.add_argument("-f", "--file-version", help="Log file version (if omitted, we'll guess)") args = parser.parse_args() schema_data = open(args.telemetry_schema) schema = TelemetrySchema(json.load(schema_data)) schema_data.close() storage = StorageLayout(schema, args.output_dir, args.max_output_size) expected_dim_count = len(schema._dimensions) if not os.path.isdir(args.output_dir): os.makedirs(args.output_dir) record_count = 0 bad_record_count = 0 bytes_read = 0 start = datetime.now() file_version = args.file_version if not file_version: file_version = fileutil.detect_file_version(args.input_file) for r in fileutil.unpack(args.input_file, file_version=file_version): record_count += 1 if r.error: bad_record_count += 1 continue # Incoming timestamps are in milliseconds, so convert to POSIX first # (ie. seconds) submission_date = date.fromtimestamp(r.timestamp / 1000).strftime("%Y%m%d") # Deal with unicode path = unicode(r.path, errors="replace") data = unicode(r.data, errors="replace") bytes_read += r.len_ip + r.len_path + r.len_data + fileutil.RECORD_PREAMBLE_LENGTH[ file_version] #print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..." path_components = path.split("/") if len(path_components) != expected_dim_count: # We're going to pop the ID off, but we'll also add the submission # date, so it evens out. 
print "Found an invalid path in record", record_count, path bad_record_count += 1 continue key = path_components.pop(0) info = {} info["reason"] = path_components.pop(0) info["appName"] = path_components.pop(0) info["appVersion"] = path_components.pop(0) info["appUpdateChannel"] = path_components.pop(0) info["appBuildID"] = path_components.pop(0) dimensions = schema.dimensions_from(info, submission_date) #print " Converted path to filename", schema.get_filename(args.output_dir, dimensions) storage.write(key, data, dimensions) duration = timer.delta_sec(start) mb_read = bytes_read / 1024.0 / 1024.0 print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % ( mb_read, duration, mb_read / duration, bad_record_count, record_count) return 0
def handle(self, raw_file):
    """Read packed v1 records from raw_file, partition them by path
    dimensions, optionally convert the JSON payload, and write each record
    to storage.

    Counters updated: self.records_read, self.bad_records, self.bytes_read,
    self.records_written, self.bytes_written.
    NOTE(review): the matching except/finally for the outer `try:` below is
    not visible in this chunk - the function appears truncated here.
    """
    print self.label, "reading", raw_file
    try:
        record_count = 0
        bytes_read = 0
        start = datetime.now()
        # Each unpacked record is (len_path, len_data, timestamp, path, data, err).
        for len_path, len_data, timestamp, path, data, err in fileutil.unpack(
                raw_file):
            record_count += 1
            self.records_read += 1
            if err:
                # Unpacker flagged this record as corrupted; count and skip it.
                print self.label, "ERROR: Found corrupted data for record", record_count, "in", raw_file, "path:", path, "Error:", err
                self.bad_records += 1
                continue
            if len(data) == 0:
                # Payload is empty; nothing to convert or store.
                print self.label, "ERROR: Found empty data for record", record_count, "in", raw_file, "path:", path
                self.bad_records += 1
                continue
            # Incoming timestamps are in milliseconds, so convert to POSIX first
            # (ie. seconds)
            submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")
            # to_unicode decodes UTF-8 correctly (see test_utf8 elsewhere in
            # this file).
            path = fileutil.to_unicode(path)
            #print "Path for record", record_count, path, "length of data:", len_data
            if data[0] != "{":
                # Data looks weird, should be JSON.
                print self.label, "Warning: Found unexpected data for record", record_count, "in", raw_file, "path:", path, "data:"
                print data
            else:
                # Raw JSON, make sure we treat it as unicode.
                data = fileutil.to_unicode(data)
            # Account for the on-disk size of this record; this path only
            # handles "v1" packed files, hence the hard-coded preamble key.
            current_bytes = len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH[
                    "v1"]
            bytes_read += current_bytes
            self.bytes_read += current_bytes
            path_components = path.split("/")
            if len(path_components) != self.expected_dim_count:
                # We're going to pop the ID off, but we'll also add the
                # submission date, so it evens out.
                print self.label, "Found an invalid path in record", record_count, path
                continue
            # Path layout: <id>/<reason>/<appName>/<appVersion>/<appUpdateChannel>/<appBuildID>
            key = path_components.pop(0)
            info = {}
            info["reason"] = path_components.pop(0)
            info["appName"] = path_components.pop(0)
            info["appVersion"] = path_components.pop(0)
            info["appUpdateChannel"] = path_components.pop(0)
            info["appBuildID"] = path_components.pop(0)
            dims = self.schema.dimensions_from(info, submission_date)
            try:
                # Convert data:
                if self.converter is None:
                    # No converter configured: store the raw payload as version 1.
                    serialized_data = data
                    data_version = 1
                else:
                    parsed_data, parsed_dims = self.converter.convert_json(
                            data, dims[-1])
                    # TODO: take this out if it's too slow
                    for i in range(len(dims)):
                        if dims[i] != parsed_dims[i]:
                            print self.label, "Record", self.records_read, "mismatched dimension", i, dims[
                                    i], "!=", parsed_dims[i]
                    serialized_data = self.converter.serialize(parsed_data)
                    # Trust the converter's dimensions over the path-derived ones.
                    dims = parsed_dims
                    data_version = 2
                try:
                    # Write to persistent storage
                    n = self.storage.write(key, serialized_data, dims,
                            data_version)
                    self.bytes_written += len(key) + len(
                            serialized_data) + 1
                    self.records_written += 1
                    # Compress rotated files as we generate them
                    if n.endswith(
                            StorageLayout.PENDING_COMPRESSION_SUFFIX):
                        self.q_out.put(n)
                except Exception, e:
                    # Storage failure: preserve the record for later inspection.
                    self.write_bad_record(key, dims, serialized_data, str(e),
                            "ERROR Writing to output file:")
            except BadPayloadError, e:
                self.write_bad_record(key, dims, data, e.msg, "Bad Payload:")
            except Exception, e:
                err_message = str(e)
                # We don't need to write these bad records out - we know
                # why they are being skipped.
                if err_message != "Missing in payload: info.revision":
                    # TODO: recognize other common failure modes and handle them gracefully.
                    self.write_bad_record(key, dims, data, err_message,
                            "Conversion Error:")
                    traceback.print_exc()
def test_utf8(self):
    """Verify UTF-8 handling end-to-end: raw bytes -> fu.to_unicode ->
    converter -> serialized escaped-unicode JSON.

    Also demonstrates that unicode(data, errors="replace") is the WRONG way
    to decode UTF-8 bytes (each byte of a multi-byte char becomes U+FFFD).
    Byte offsets below are fixed positions of "Wikipédia" in the fixture.
    """
    count = 0
    # This packed file contains the same record twice, once gzipped, once
    # raw. The record contains the UTF-8 encoded string "Wikipédia"
    for r in fu.unpack('test/unicode.v1.packed', file_version="v1"):
        self.assertIs(r.error, None)
        self.assertTrue(len(r.data) > 0)
        self.assertEqual(r.data[0], '{')
        count += 1
        # self.print_byte_range(r.data, 3323, 3362)
        # Make sure that the raw bytes are correct:
        # Incoming is UTF-8, so we expect
        #   W i k i p 0xc3 0xa9 d i a
        self.assertEqual(ord(r.data[0xd16]), 0x70)
        self.assertEqual(ord(r.data[0xd17]), 0xc3)
        self.assertEqual(ord(r.data[0xd18]), 0xa9)
        self.assertEqual(ord(r.data[0xd19]), 0x64)

        # Convert the data the wrong way:
        bad = unicode(r.data, errors="replace")
        # self.print_byte_range(bad, 3323, 3362)
        # Verify that we see the replacement chars in the expected places:
        #   W i k i p 0xfffd 0xfffd d i a
        self.assertEqual(ord(bad[0xd16]), 0x70)
        self.assertEqual(ord(bad[0xd17]), 0xfffd)
        self.assertEqual(ord(bad[0xd18]), 0xfffd)
        self.assertEqual(ord(bad[0xd19]), 0x64)

        # Now convert properly:
        good = fu.to_unicode(r.data)
        # self.print_byte_range(good, 3322, 3360)
        # Now we have unicode, so we expect
        #   W i k i p 0xe9 d i a
        # (one code point fewer than the byte form, hence the shifted offsets)
        self.assertEqual(ord(good[0xd15]), 0x70)
        self.assertEqual(ord(good[0xd16]), 0xe9)
        self.assertEqual(ord(good[0xd17]), 0x64)

        # The converter must preserve the decoded character.
        converted, dimensions = ConvertTest.converter.convert_json(good,
                "20131114", None)
        engine = converted["simpleMeasurements"]["UITelemetry"]["toolbars"]["currentSearchEngine"]
        # print engine
        self.assertEqual(ord(engine[4]), 0x70)
        self.assertEqual(ord(engine[5]), 0xe9)
        self.assertEqual(ord(engine[6]), 0x64)

        serialized = ConvertTest.converter.serialize(converted,
                sort_keys=True)
        # self.print_byte_range(serialized, 4007, 4049)
        # Now we have escaped unicode, so we expect
        #   W i k i p \ u 0 0 e 9 d i a
        self.assertEqual(ord(serialized[0xfc2]), 0x70)
        self.assertEqual(ord(serialized[0xfc3]), 0x5c)
        self.assertEqual(ord(serialized[0xfc4]), 0x75)
        self.assertEqual(ord(serialized[0xfc5]), 0x30)
        self.assertEqual(ord(serialized[0xfc6]), 0x30)
        self.assertEqual(ord(serialized[0xfc7]), 0x65)
        self.assertEqual(ord(serialized[0xfc8]), 0x39)
        self.assertEqual(ord(serialized[0xfc9]), 0x64)
    # Both copies of the record (gzipped and raw) must have been seen.
    self.assertEqual(count, 2)
def main(): parser = argparse.ArgumentParser( description="Split raw logs into partitioned files.", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument( "-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000 ) parser.add_argument("-i", "--input-file", help="Filename to read from", required=True) parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True) parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True) parser.add_argument("-b", "--bucket", help="S3 Bucket name") parser.add_argument("-k", "--aws-key", help="AWS Key") parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key") args = parser.parse_args() schema_data = open(args.telemetry_schema) schema = TelemetrySchema(json.load(schema_data)) schema_data.close() storage = StorageLayout(schema, args.output_dir, args.max_output_size) expected_dim_count = len(schema._dimensions) if not os.path.isdir(args.output_dir): os.makedirs(args.output_dir) record_count = 0 bad_record_count = 0 bytes_read = 0 start = datetime.now() for len_path, len_data, timestamp, path, data, err in fileutil.unpack(args.input_file): record_count += 1 if err: bad_record_count += 1 continue # Incoming timestamps are in milliseconds, so convert to POSIX first # (ie. seconds) submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d") # Deal with unicode path = unicode(path, errors="replace") data = unicode(data, errors="replace") bytes_read += len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH # print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..." path_components = path.split("/") if len(path_components) != expected_dim_count: # We're going to pop the ID off, but we'll also add the submission # date, so it evens out. 
print "Found an invalid path in record", record_count, path bad_record_count += 1 continue key = path_components.pop(0) info = {} info["reason"] = path_components.pop(0) info["appName"] = path_components.pop(0) info["appVersion"] = path_components.pop(0) info["appUpdateChannel"] = path_components.pop(0) info["appBuildID"] = path_components.pop(0) dimensions = schema.dimensions_from(info, submission_date) # print " Converted path to filename", schema.get_filename(args.output_dir, dimensions) storage.write(key, data, dimensions) duration = timer.delta_sec(start) mb_read = bytes_read / 1024.0 / 1024.0 print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % ( mb_read, duration, mb_read / duration, bad_record_count, record_count, ) return 0