def main(): parser = ArgumentParser( description='Convert local Telemetry pings to server storage structure' ) parser.add_argument("--input-dir", required=True) parser.add_argument("--output-dir", required=True) parser.add_argument("--schema", type=file, default='./telemetry/telemetry_schema.json') parser.add_argument("--histogram-cache-dir", default='/tmp/telemetry_histogram_cache') args = parser.parse_args() print "Getting pings from", args.input_dir, "converting them and storing them in", args.output_dir schema = TelemetrySchema(json.load(args.schema)) cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org') converter = Converter(cache, schema) storage = StorageLayout(schema, args.output_dir, 500000000) ping_dir = args.input_dir ping_files = get_pings(ping_dir) if len(ping_files) == 0: # Try the usual ping dir (if the user just gave the Profile Dir) ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings") ping_files = get_pings(ping_dir) print "found", len(ping_files), "pings" for ping_file in ping_files: with open(os.path.join(ping_dir, ping_file), "r") as f: ping = json.load(f) reason = ping['reason'] key = ping['slug'] payload = ping['payload'] submission_date = date.today().strftime("%Y%m%d") dims = schema.dimensions_from(payload, submission_date) try: parsed_data, dims = converter.convert_obj(payload, dims[-1]) serialized_data = converter.serialize(parsed_data) data_version = Converter.VERSION_CONVERTED try: # Write to persistent storage n = storage.write(key, serialized_data, dims, data_version) print "Successfully saved ping", key, "to", n except Exception, e: traceback.print_exc() except BadPayloadError, e: print "Bad Payload:", e.msg except Exception, e: traceback.print_exc()
def main(): parser = ArgumentParser(description='Convert local Telemetry pings to server storage structure') parser.add_argument("--input-dir", required=True) parser.add_argument("--output-dir", required=True) parser.add_argument("--schema", type=file, default='./telemetry/telemetry_schema.json') parser.add_argument("--histogram-cache-dir", default='/tmp/telemetry_histogram_cache') args = parser.parse_args() print "Getting pings from", args.input_dir, "converting them and storing them in", args.output_dir schema = TelemetrySchema(json.load(args.schema)) cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org') converter = Converter(cache, schema) storage = StorageLayout(schema, args.output_dir, 500000000) ping_dir = args.input_dir ping_files = get_pings(ping_dir) if len(ping_files) == 0: # Try the usual ping dir (if the user just gave the Profile Dir) ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings") ping_files = get_pings(ping_dir) print "found", len(ping_files), "pings" for ping_file in ping_files: with open(os.path.join(ping_dir, ping_file), "r") as f: ping = json.load(f) reason = ping['reason'] key = ping['slug'] payload = ping['payload'] submission_date = date.today().strftime("%Y%m%d") dims = schema.dimensions_from(payload, submission_date) try: parsed_data, dims = converter.convert_obj(payload, dims[-1]) serialized_data = converter.serialize(parsed_data) data_version = Converter.VERSION_CONVERTED try: # Write to persistent storage n = storage.write(key, serialized_data, dims, data_version) print "Successfully saved ping", key, "to", n except Exception, e: traceback.print_exc() except BadPayloadError, e: print "Bad Payload:", e.msg except Exception, e: traceback.print_exc()
def main(): parser = argparse.ArgumentParser( description='Split raw logs into partitioned files.', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000) parser.add_argument("-i", "--input-file", help="Filename to read from", required=True) parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True) parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True) parser.add_argument("-f", "--file-version", help="Log file version (if omitted, we'll guess)") args = parser.parse_args() schema_data = open(args.telemetry_schema) schema = TelemetrySchema(json.load(schema_data)) schema_data.close() storage = StorageLayout(schema, args.output_dir, args.max_output_size) expected_dim_count = len(schema._dimensions) if not os.path.isdir(args.output_dir): os.makedirs(args.output_dir) record_count = 0 bad_record_count = 0 bytes_read = 0 start = datetime.now() file_version = args.file_version if not file_version: file_version = fileutil.detect_file_version(args.input_file) for r in fileutil.unpack(args.input_file, file_version=file_version): record_count += 1 if r.error: bad_record_count += 1 continue # Incoming timestamps are in milliseconds, so convert to POSIX first # (ie. seconds) submission_date = date.fromtimestamp(r.timestamp / 1000).strftime("%Y%m%d") # Deal with unicode path = unicode(r.path, errors="replace") data = unicode(r.data, errors="replace") bytes_read += r.len_ip + r.len_path + r.len_data + fileutil.RECORD_PREAMBLE_LENGTH[ file_version] #print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..." path_components = path.split("/") if len(path_components) != expected_dim_count: # We're going to pop the ID off, but we'll also add the submission # date, so it evens out. print "Found an invalid path in record", record_count, path bad_record_count += 1 continue key = path_components.pop(0) info = {} info["reason"] = path_components.pop(0) info["appName"] = path_components.pop(0) info["appVersion"] = path_components.pop(0) info["appUpdateChannel"] = path_components.pop(0) info["appBuildID"] = path_components.pop(0) dimensions = schema.dimensions_from(info, submission_date) #print " Converted path to filename", schema.get_filename(args.output_dir, dimensions) storage.write(key, data, dimensions) duration = timer.delta_sec(start) mb_read = bytes_read / 1024.0 / 1024.0 print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % ( mb_read, duration, mb_read / duration, bad_record_count, record_count) return 0
def main(): parser = argparse.ArgumentParser( description="Split raw logs into partitioned files.", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument( "-m", "--max-output-size", metavar="N", help="Rotate output files after N bytes", type=int, default=500000000 ) parser.add_argument("-i", "--input-file", help="Filename to read from", required=True) parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True) parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True) parser.add_argument("-b", "--bucket", help="S3 Bucket name") parser.add_argument("-k", "--aws-key", help="AWS Key") parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key") args = parser.parse_args() schema_data = open(args.telemetry_schema) schema = TelemetrySchema(json.load(schema_data)) schema_data.close() storage = StorageLayout(schema, args.output_dir, args.max_output_size) expected_dim_count = len(schema._dimensions) if not os.path.isdir(args.output_dir): os.makedirs(args.output_dir) record_count = 0 bad_record_count = 0 bytes_read = 0 start = datetime.now() for len_path, len_data, timestamp, path, data, err in fileutil.unpack(args.input_file): record_count += 1 if err: bad_record_count += 1 continue # Incoming timestamps are in milliseconds, so convert to POSIX first # (ie. seconds) submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d") # Deal with unicode path = unicode(path, errors="replace") data = unicode(data, errors="replace") bytes_read += len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH # print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..." path_components = path.split("/") if len(path_components) != expected_dim_count: # We're going to pop the ID off, but we'll also add the submission # date, so it evens out. print "Found an invalid path in record", record_count, path bad_record_count += 1 continue key = path_components.pop(0) info = {} info["reason"] = path_components.pop(0) info["appName"] = path_components.pop(0) info["appVersion"] = path_components.pop(0) info["appUpdateChannel"] = path_components.pop(0) info["appBuildID"] = path_components.pop(0) dimensions = schema.dimensions_from(info, submission_date) # print " Converted path to filename", schema.get_filename(args.output_dir, dimensions) storage.write(key, data, dimensions) duration = timer.delta_sec(start) mb_read = bytes_read / 1024.0 / 1024.0 print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % ( mb_read, duration, mb_read / duration, bad_record_count, record_count, ) return 0