def main(argv=None):
    parser = argparse.ArgumentParser(description="Convert Telemetry data")
    parser.add_argument("-c", "--config-file",
                        help="Read configuration from this file",
                        default="./telemetry_server_config.json")
    parser.add_argument("-d", "--date",
                        help="Use specified date for dimensions")
    args = parser.parse_args()

    try:
        server_config = open(args.config_file, "r")
        config = json.load(server_config)
        server_config.close()
    except IOError:
        config = {}

    cache_dir = config.get("revision_cache_path", "./histogram_cache")
    server = config.get("revision_cache_server", "hg.mozilla.org")
    schema_filename = config.get("schema_filename", "./telemetry_schema.json")

    schema_data = open(schema_filename)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    cache = revision_cache.RevisionCache(cache_dir, server)
    converter = Converter(cache, schema)
    process(converter, args.date)
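# For reference, a minimal telemetry_server_config.json accepted by this entry
# point might look like the following. The keys come from the config.get()
# calls above; the values shown are simply the documented defaults, so the
# file is optional and this example is illustrative only.
#
# {
#   "revision_cache_path": "./histogram_cache",
#   "revision_cache_server": "hg.mozilla.org",
#   "schema_filename": "./telemetry_schema.json"
# }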
def setUpClass(cls):
    cls.cache_dir = "/tmp/histogram_revision_cache"
    cls.schema_filename = "./telemetry/telemetry_schema.json"
    assert not os.path.exists(cls.cache_dir)

    schema_file = open(cls.schema_filename, "r")
    cls.schema = TelemetrySchema(json.load(schema_file))
    schema_file.close()

    cls.cache = revision_cache.RevisionCache(cls.cache_dir, 'hg.mozilla.org')
    cls.converter = Converter(cls.cache, cls.schema)
class Job:
    """A class for orchestrating a Telemetry MapReduce job"""
    DOWNLOAD_BATCH_SIZE = 100
    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        # Sanity check args.
        if config.num_mappers <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.num_reducers <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.data_dir):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.work_dir):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.job_script):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.input_filter):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.data_dir
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.work_dir
        self._input_filter = TelemetrySchema(json.load(open(config.input_filter)))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.output
        self._num_mappers = config.num_mappers
        self._num_reducers = config.num_reducers
        self._local_only = config.local_only
        self._bucket_name = config.bucket
        self._aws_key = config.aws_key
        self._aws_secret_key = config.aws_secret_key
        modulefd = open(config.job_script)
        ## Lifted from FileDriver.py in jydoop.
        self._job_module = imp.load_module("telemetry_job", modulefd,
                                           config.job_script, ('.py', 'U', 1))

    def dump_stats(self, partitions):
        total = sum(partitions)
        avg = total / len(partitions)
        for i in range(len(partitions)):
            print "Partition %d contained %d (%+d)" % (i, partitions[i],
                                                       float(partitions[i]) - avg)

    def fetch_remotes(self, remotes):
        # TODO: download remotes in groups of size DOWNLOAD_BATCH_SIZE
        remote_names = [r["name"] for r in remotes if r["type"] == "remote"]

        # TODO: check cache first.
        result = 0
        fetch_cwd = os.path.join(self._work_dir, "cache")
        if len(remote_names) > 0:
            if not os.path.isdir(fetch_cwd):
                os.makedirs(fetch_cwd)
            fetch_cmd = ["/usr/local/bin/s3funnel"]
            fetch_cmd.append(self._bucket_name)
            fetch_cmd.append("get")
            fetch_cmd.append("-a")
            fetch_cmd.append(self._aws_key)
            fetch_cmd.append("-s")
            fetch_cmd.append(self._aws_secret_key)
            fetch_cmd.append("-t")
            fetch_cmd.append("8")
            start = datetime.now()
            result = subprocess.call(fetch_cmd + remote_names, cwd=fetch_cwd)
            delta = (datetime.now() - start)
            duration_sec = float(delta.seconds) + float(delta.microseconds) / 1000000
            downloaded_bytes = sum([r["size"] for r in remotes if r["type"] == "remote"])
            downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
            print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (
                downloaded_mb, duration_sec, downloaded_mb / duration_sec)
        return result

    def mapreduce(self):
        # Find files matching specified input filter
        files = self.local_files()
        remote_files = self.get_filtered_files_s3()
        file_count = len(files) + len(remote_files)

        # Not useful to have more mappers than input files.
        if file_count < self._num_mappers:
            print "There are only", file_count, "input files. Reducing number of mappers accordingly."
            self._num_mappers = file_count

        # Partition files into reasonably equal groups for use by mappers
        print "Partitioning input data..."
        partitions = self.partition(files, remote_files)
        print "Done"

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if len(partitions[i]) > 0:
                # Fetch the files we need for each mapper
                print "Fetching remotes for partition", i
                self.fetch_remotes(partitions[i])
                print "Done"
                p = Process(target=Mapper,
                            args=(i, partitions[i], self._work_dir,
                                  self._job_module, self._num_reducers))
                mappers.append(p)
                p.start()
            else:
                print "Skipping mapper", i, "- no input files to process"
        for m in mappers:
            m.join()

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(target=Reducer,
                        args=(i, self._work_dir, self._job_module,
                              self._num_mappers))
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()

        # Reducers are done. Output results.
        os.rename(os.path.join(self._work_dir, "reducer_0"), self._output_file)
        if self._num_reducers > 1:
            out = open(self._output_file, "a")
            for i in range(1, self._num_reducers):
                # FIXME: this reads the entire reducer output into memory
                reducer_filename = os.path.join(self._work_dir, "reducer_" + str(i))
                reducer_output = open(reducer_filename, "r")
                out.write(reducer_output.read())
                reducer_output.close()
                os.remove(reducer_filename)

        # TODO: clean up downloaded files?

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print "Warning: Could not find", mfile

    def local_files(self):
        out_files = self.get_filtered_files(self._input_dir)
        if self._input_filter._include_invalid:
            invalid_dir = os.path.join(self._input_dir, TelemetrySchema.INVALID_DIR)
            #print "Looking for invalid data in", invalid_dir
            out_files += self.get_filtered_files(invalid_dir)
        return out_files

    # Split up the input files into groups of approximately-equal on-disk size.
    def partition(self, files, remote_files):
        namesize = [{"type": "local",
                     "name": files[i],
                     "size": os.stat(files[i]).st_size,
                     "dimensions": self._input_filter.get_dimensions(self._input_dir, files[i])}
                    for i in range(0, len(files))]
        partitions = []
        sums = []
        for p in range(self._num_mappers):
            partitions.append([])
            sums.append(0)
        min_idx = 0

        # Greedily assign the largest file to the smallest partition
        while len(namesize) > 0:
            current = namesize.pop()
            #print "putting", current, "into partition", min_idx
            partitions[min_idx].append(current)
            sums[min_idx] += current["size"]
            min_idx = find_min_idx(sums)

        # And now do the same with the remote files.
        # TODO: if this is too slow, just distribute remote files round-robin.
        if len(remote_files) > 0:
            conn = S3Connection(self._aws_key, self._aws_secret_key)
            bucket = conn.get_bucket(self._bucket_name)
            for r in remote_files:
                key = bucket.lookup(r)
                size = key.size
                dims = self._input_filter.get_dimensions(".", r)
                remote = {"type": "remote", "name": r, "size": size, "dimensions": dims}
                #print "putting", remote, "into partition", min_idx
                partitions[min_idx].append(remote)
                sums[min_idx] += size
                min_idx = find_min_idx(sums)

        # Print out some info to see how balanced the partitions were:
        self.dump_stats(sums)
        return partitions

    def get_filtered_files(self, searchdir):
        level_offset = searchdir.count(os.path.sep)
        out_files = []
        for root, dirs, files in os.walk(searchdir):
            level = root.count(os.path.sep) - level_offset
            dirs[:] = [i for i in dirs if self.filter_includes(level, i)]
            for f in files:
                full_filename = os.path.join(root, f)
                dims = self._input_filter.get_dimensions(searchdir, full_filename)
                include = True
                for l in range(level, len(self._allowed_values)):
                    if not self.filter_includes(l, dims[l]):
                        include = False
                        break
                if include:
                    out_files.append(full_filename)
        return out_files

    def get_filtered_files_s3(self):
        out_files = []
        if not self._local_only:
            print "Fetching file list from S3..."
            # Plain boto should be fast enough to list bucket contents.
            conn = S3Connection(self._aws_key, self._aws_secret_key)
            bucket = conn.get_bucket(self._bucket_name)
            # TODO: potential optimization - if our input filter is reasonably
            #       restrictive and/or our list of keys is very long, it may be
            #       a win to use the "prefix" and "delimiter" params.
            for f in bucket.list():
                dims = self._input_filter.get_dimensions(".", f.name)
                #print f.name, "->", ",".join(dims)
                include = True
                for i in range(len(self._allowed_values)):
                    if not self.filter_includes(i, dims[i]):
                        include = False
                        break
                if include:
                    out_files.append(f.name)
            conn.close()
            print "Done!"
        return out_files

    def filter_includes(self, level, value):
        # Filter out 'invalid' data. It is included explicitly if needed.
        if level == 0 and value == TelemetrySchema.INVALID_DIR:
            return False
        allowed_values = self._allowed_values[level]
        return self._input_filter.is_allowed(value, allowed_values)
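# Note: partition() above relies on a find_min_idx() helper that is not part
# of this excerpt. A minimal sketch of its assumed behavior (return the index
# of the smallest running partition size) follows; the helper actually used by
# the module may differ.
def find_min_idx(stuff):
    # Return the index of the smallest element in a list of numbers.
    min_idx = 0
    for i in range(len(stuff)):
        if stuff[i] < stuff[min_idx]:
            min_idx = i
    return min_idx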
def main():
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m", "--max-output-size", metavar="N",
                        help="Rotate output files after N bytes",
                        type=int, default=500000000)
    parser.add_argument("-i", "--input-file",
                        help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir",
                        help="Base directory to store split files", required=True)
    parser.add_argument("-t", "--telemetry-schema",
                        help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    storage = StorageLayout(schema, args.output_dir, args.max_output_size)
    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    fin = open(args.input_file, "rb")
    bytes_read = 0
    start = datetime.now()
    while True:
        record_count += 1
        # Read two 4-byte values and one 8-byte value
        lengths = fin.read(16)
        if lengths == '':
            break
        len_path, len_data, timestamp = struct.unpack("<IIQ", lengths)

        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")
        path = unicode(fin.read(len_path), errors="replace")
        #print "Path for record", record_count, path, "length of data:", len_data

        # Detect and handle gzipped data.
        data = fin.read(len_data)
        try:
            # Note: from brief testing, cStringIO doesn't appear to be any
            #       faster. In fact, it seems slightly slower than StringIO.
            data_reader = StringIO.StringIO(data)
            uncompressor = gzip.GzipFile(fileobj=data_reader, mode="r")
            data = unicode(uncompressor.read(), errors="replace")
            uncompressor.close()
            data_reader.close()
        except Exception, e:
            #print e
            # Use the string as-is
            data = unicode(data, errors="replace")

        bytes_read += 8 + len_path + len_data
        #print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission,
            # so it evens out.
            print "Found an invalid path in record", record_count, path
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print " Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)
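# For reference, each record consumed by the loop above is assumed to be laid
# out as a 16-byte header (two little-endian uint32 lengths and a uint64
# millisecond timestamp) followed by the path bytes and the raw, possibly
# gzip-compressed, payload bytes. A hypothetical helper for producing such a
# record (e.g. to build test input) could look like this; it is a sketch, not
# part of the original module, and reuses the struct import assumed above.
def write_raw_record(fout, path, data, timestamp_ms):
    # path and data are byte strings; timestamp_ms is milliseconds since epoch.
    fout.write(struct.pack("<IIQ", len(path), len(data), timestamp_ms))
    fout.write(path)
    fout.write(data)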
class TelemetrySchemaTest(unittest.TestCase):
    def setUp(self):
        self.schema = TelemetrySchema(self.get_schema_spec())
        self.allowed_values = self.schema.sanitize_allowed_values()

    def get_schema_spec(self):
        return {
            "version": 1,
            "dimensions": [
                {"field_name": "reason", "allowed_values": ["saved-session"]},
                {"field_name": "appName", "allowed_values": "*"},
                {"field_name": "appUpdateChannel", "allowed_values": ["nightly"]},
                {"field_name": "appVersion", "allowed_values": "*"},
                {"field_name": "appBuildID", "allowed_values": "*"},
                {"field_name": "submission_date", "allowed_values": ["20130908"]}
            ]
        }

    def get_file_list(self):
        return [
            "/foo/bar/baz/bla.txt",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130826074752.20130903.log.e0c7ff434e474c8aa745763eed408b9c.lzma",
            "garbage/idle_daily/Firefox/nightly/26.0a1/20130826074752.20130903.log.e0c7ff434e474c8aa745763eed408b9c.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130901030218.20130903.log.17ff07fda8994e23baf983550246a94b.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130819030205.20130907.log.ec6f22acd37349b3b5ef03da1cc150da.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130807161117.20130903.log.64478ac84a734677bc14cbcf6cc114b7.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130814030204.20130906.log.a382f1337d1f47ef8aad08f8fb14a79a.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130830030205.20130903.log.c86e2c3f31c043ac8fc311d5dd1abc28.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130823151250.20130907.log.939bec39c3d24c89a09834463b220d9a.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130830030205.20130906.log.0bf2c1edf2634ca5bdc865a54957a690.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130826074752.20130903.log.8e33cc0f130849dfbb8afe7331123be3.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130826074752.20130902.log.2349f0434be64c6684f91eccabf9b3e6.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130826030203.20130902.log.57a017a3378b420cbbfb666532606b16.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130902030220.20130908.log.7e9556a5e32b4990a9d378eea65f57a9.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130829030201.20130909.log.c227775e57e24854b1aac7c21c59f85c.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130826030203.20130906.log.88620da62e77482285f28d5ea69beb1e.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130826030203.20130908.log.cc1b0c52365947c38ac2636f3384503c.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130830030205.20130906.log.77905bb7503a4a98aa7231b10073f47e.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130823030204.20130908.log.55f4ab6ada3c4e1d939f24b5da7f8dc2.lzma",
            "processed/saved_session/Firefox/nightly/26.0a1/20130902030220.20130908.log.f213918c08804d449d30e1aaec70089a.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130813030205.20130907.log.24c445d3d2c241bcb5001a63a78e98fa.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130831030224.20130902.log.ebe3cd20fa264cd19aab02b8ffe8cbf1.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130821030213.20130906.log.778737ad596d43e4a5e9e59c38428b61.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130809030203.20130903.log.43ae292120ca475589b20be24fa70171.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130814141812.20130907.log.7c6c5d65b702443cac2768eb6f0e3c91.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130823030204.20130903.log.f73682ebc57a4661a6f48a2a5cf2629c.lzma",
            "processed/idle_daily/Firefox/nightly/26.0a1/20130821050136.20130904.log.2d423ec779e04113996914ce81e27bfe.lzma"
        ]

    def test_filtering(self):
        all_files = self.get_file_list()
        error_files = []
        included_files = []
        excluded_files = []
        for f in all_files:
            include = True
            try:
                dims = self.schema.get_dimensions("processed", f)
                for i in range(len(self.allowed_values)):
                    if not self.schema.is_allowed(dims[i], self.allowed_values[i]):
                        include = False
                        break
            except ValueError:
                include = False
                error_files.append(f)
            if include:
                included_files.append(f)
            else:
                excluded_files.append(f)
        #print "Found", len(excluded_files), "excluded files:"
        #for f in excluded_files:
        #    print " - ", f
        #print "Found", len(included_files), "included files:"
        #for f in included_files:
        #    print " + ", f
        #print "Found", len(error_files), "invalid files"
        #for f in error_files:
        #    print " x ", f
        self.assertEqual(len(included_files), 4)
        self.assertEqual(len(error_files), 2)
        self.assertEqual(len(all_files), (len(excluded_files) + len(included_files)))

    def test_safe_filename(self):
        tests = {
            "Hello World!": "Hello_World_",
            "what\nam\ni": "what_am_i",
            "saved-session": "saved_session"
        }
        for key, value in tests.iteritems():
            self.assertEqual(self.schema.safe_filename(key), value)

    def test_sanitize_allowed_values(self):
        self.assertEqual(self.allowed_values[0][0], "saved_session")

    def test_allowed_values(self):
        allowed = "saved_session"
        not_allowed = "anything_else"
        self.assertEqual(self.schema.get_allowed_value(allowed, self.allowed_values[0]),
                         allowed)
        self.assertEqual(self.schema.get_allowed_value(not_allowed, self.allowed_values[0]),
                         TelemetrySchema.DISALLOWED_VALUE)

    def test_apply_schema(self):
        test_inputs = []
        expected_ot = []  # <-- bad name, convenient indenting.
        other = TelemetrySchema.DISALLOWED_VALUE
        # fields:  reason         appName      channel  appVersion      appBuildID   submission_date
        #          -------------  -----------  -------  --------------  -----------  ---------------
        # allowed: saved-session  *            nightly  *               *            20130908
        test_inputs.append(["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"])
        expected_ot.append(["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"])

        test_inputs.append(["saved-session", "another", "nightly", "anything is ok", "wooooo", "20130908"])
        expected_ot.append(["saved-session", "another", "nightly", "anything is ok", "wooooo", "20130908"])

        test_inputs.append(["bogus", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"])
        expected_ot.append([other, "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"])

        test_inputs.append(["bogus", "someAppName", "aurora", "someAppVersion", "someBuildID", "20140428"])
        expected_ot.append([other, "someAppName", other, "someAppVersion", "someBuildID", other])

        test_inputs.append(["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908", "more", "bonus", "dimensions!"])
        expected_ot.append(["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"])

        for i in range(len(test_inputs)):
            actual = self.schema.apply_schema(test_inputs[i])
            self.assertEqual(actual, expected_ot[i])

    def test_get_current_file(self):
        # everything but "submission_date":
        dims = ["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID"]
        filename = self.schema.get_current_file("foo", dims, "20130908", 1)
        self.assertEqual(filename, "foo/saved_session/someAppName/nightly/someAppVersion/someBuildID.20130908.v1.log")

    def test_get_filename(self):
        dims = ["saved-session", "someAppName", "nightly", "someAppVersion", "someBuildID", "20130908"]
        filename = self.schema.get_filename("foo", dims, 99)
        self.assertEqual(filename, "foo/saved_session/someAppName/nightly/someAppVersion/someBuildID.20130908.v99.log")

    def test_dimensions_from(self):
        test_inputs = []
        expected_ot = []
        test_inputs.append({"reason": "saved-session", "appName": "Firefox", "appUpdateChannel": "release",
                            "appVersion": "28.0", "appBuildID": "20140401001122"})
        expected_ot.append(["saved-session", "Firefox", "release", "28.0", "20140401001122", "20130908"])

        test_inputs.append({"reason": "idle-daily", "appUpdateChannel": "release",
                            "appVersion": "28.0", "appBuildID": "20140401001122"})
        expected_ot.append(["idle-daily", "UNKNOWN", "release", "28.0", "20140401001122", "20130908"])

        test_inputs.append({})
        expected_ot.append(["UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN", "20130908"])

        for i in range(len(test_inputs)):
            actual = self.schema.dimensions_from(test_inputs[i], "20130908")
            self.assertEqual(actual, expected_ot[i])

    def test_get_field(self):
        dims = ["saved-session", "Firefox", "release", "28.0", "20130908010101", "20130908"]

        # Basic functionality
        self.assertEqual(self.schema.get_field(dims, "reason"), "saved-session")
        self.assertEqual(self.schema.get_field(dims, "appName"), "Firefox")
        self.assertEqual(self.schema.get_field(dims, "appUpdateChannel"), "release")
        self.assertEqual(self.schema.get_field(dims, "appVersion"), "28.0")
        self.assertEqual(self.schema.get_field(dims, "appBuildID"), "20130908010101")
        self.assertEqual(self.schema.get_field(dims, "submission_date"), "20130908")

        other = TelemetrySchema.DISALLOWED_VALUE
        allowed = True
        sanitize = True
        # T, T
        self.assertEqual(self.schema.get_field(dims, "reason", limit_to_allowed=allowed, sanitize=sanitize), "saved_session")
        self.assertEqual(self.schema.get_field(dims, "appName", limit_to_allowed=allowed, sanitize=sanitize), "Firefox")
        self.assertEqual(self.schema.get_field(dims, "appUpdateChannel", limit_to_allowed=allowed, sanitize=sanitize), other)
        self.assertEqual(self.schema.get_field(dims, "appVersion", limit_to_allowed=allowed, sanitize=sanitize), "28.0")
        self.assertEqual(self.schema.get_field(dims, "appBuildID", limit_to_allowed=allowed, sanitize=sanitize), "20130908010101")
        self.assertEqual(self.schema.get_field(dims, "submission_date", limit_to_allowed=allowed, sanitize=sanitize), "20130908")

        sanitize = False
        # T, F
        self.assertEqual(self.schema.get_field(dims, "reason", limit_to_allowed=allowed, sanitize=sanitize), "saved-session")
        self.assertEqual(self.schema.get_field(dims, "appName", limit_to_allowed=allowed, sanitize=sanitize), "Firefox")
        self.assertEqual(self.schema.get_field(dims, "appUpdateChannel", limit_to_allowed=allowed, sanitize=sanitize), other)
        self.assertEqual(self.schema.get_field(dims, "appVersion", limit_to_allowed=allowed, sanitize=sanitize), "28.0")
        self.assertEqual(self.schema.get_field(dims, "appBuildID", limit_to_allowed=allowed, sanitize=sanitize), "20130908010101")
        self.assertEqual(self.schema.get_field(dims, "submission_date", limit_to_allowed=allowed, sanitize=sanitize), "20130908")

        allowed = False
        sanitize = True
        # F, T
        self.assertEqual(self.schema.get_field(dims, "reason", limit_to_allowed=allowed, sanitize=sanitize), "saved_session")
        self.assertEqual(self.schema.get_field(dims, "appName", limit_to_allowed=allowed, sanitize=sanitize), "Firefox")
        self.assertEqual(self.schema.get_field(dims, "appUpdateChannel", limit_to_allowed=allowed, sanitize=sanitize), "release")
        self.assertEqual(self.schema.get_field(dims, "appVersion", limit_to_allowed=allowed, sanitize=sanitize), "28.0")
        self.assertEqual(self.schema.get_field(dims, "appBuildID", limit_to_allowed=allowed, sanitize=sanitize), "20130908010101")
        self.assertEqual(self.schema.get_field(dims, "submission_date", limit_to_allowed=allowed, sanitize=sanitize), "20130908")

        sanitize = False
        # F, F
        self.assertEqual(self.schema.get_field(dims, "reason", limit_to_allowed=allowed, sanitize=sanitize), "saved-session")
        self.assertEqual(self.schema.get_field(dims, "appName", limit_to_allowed=allowed, sanitize=sanitize), "Firefox")
        self.assertEqual(self.schema.get_field(dims, "appUpdateChannel", limit_to_allowed=allowed, sanitize=sanitize), "release")
        self.assertEqual(self.schema.get_field(dims, "appVersion", limit_to_allowed=allowed, sanitize=sanitize), "28.0")
        self.assertEqual(self.schema.get_field(dims, "appBuildID", limit_to_allowed=allowed, sanitize=sanitize), "20130908010101")
        self.assertEqual(self.schema.get_field(dims, "submission_date", limit_to_allowed=allowed, sanitize=sanitize), "20130908")

        with self.assertRaises(ValueError):
            v = self.schema.get_field(dims, "oranges")

        # Remove the last dimension:
        dims.pop()
        with self.assertRaises(ValueError):
            v = self.schema.get_field(dims, "submission_date")

    def test_more_allowed(self):
        spec = {
            "version": 1,
            "dimensions": [
                {"field_name": "reason", "allowed_values": ["saved-session"]},
                {"field_name": "appName", "allowed_values": "*"},
                {"field_name": "appUpdateChannel", "allowed_values": ["nightly"]},
                {"field_name": "appVersion", "allowed_values": "*"},
                {"field_name": "appBuildID", "allowed_values": "one_specific_build"},
                {"field_name": "submission_date", "allowed_values": {"min": "20130908", "max": "20140401"}}
            ]
        }
        schema = TelemetrySchema(spec)
        allowed = schema.sanitize_allowed_values()
        self.assertTrue(schema.is_allowed("20130908", allowed[5]))
        self.assertTrue(schema.is_allowed("20140401", allowed[5]))
        self.assertTrue(schema.is_allowed("20130909", allowed[5]))
        self.assertTrue(schema.is_allowed("20140101", allowed[5]))
        self.assertFalse(schema.is_allowed("20130907", allowed[5]))
        self.assertFalse(schema.is_allowed("20000000", allowed[5]))
        self.assertFalse(schema.is_allowed("20140402", allowed[5]))
        self.assertFalse(schema.is_allowed("99999999", allowed[5]))

        self.assertTrue(schema.is_allowed("one_specific_build", allowed[4]))
        self.assertFalse(schema.is_allowed("two_specific_build", allowed[4]))
        self.assertFalse(schema.is_allowed("*", allowed[4]))
        self.assertFalse(schema.is_allowed("one_specific_build ", allowed[4]))
        self.assertFalse(schema.is_allowed("one-specific-build", allowed[4]))