def update_published_v2_files(sdb, from_submission_date=None,
                              to_submission_date=None, limit=None):
    s3 = S3Connection()
    bucket_name = "telemetry-published-v2"
    bucket = s3.get_bucket(bucket_name)
    schema_key = bucket.get_key("telemetry_schema.json")
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))

    termination_requested = [False]
    def keyboard_interrupt_handler(signal, frame):
        termination_requested[0] = True
    signal.signal(signal.SIGINT, keyboard_interrupt_handler)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    batch = BatchPut(sdb)
    while not done:
        try:
            for key in bucket.list(marker=last_key):
                last_key = key.name
                if total_count % 1e5 == 0:
                    print("Looked at {} total records in {} seconds, added {}".format(
                        total_count, delta_sec(start_time), added_count))
                dims = schema.get_dimension_map(schema.get_dimensions(".", key.name))
                if (from_submission_date is None or dims["submission_date"] >= from_submission_date) and \
                   (to_submission_date is None or dims["submission_date"] <= to_submission_date) and \
                   dims["submission_date"][:-2] in sdb and \
                   dims["reason"] != "idle_daily":
                    attributes = {"reason": dims.get("reason"),
                                  "appName": dims.get("appName"),
                                  "appUpdateChannel": dims.get("appUpdateChannel"),
                                  "appVersion": dims.get("appVersion"),
                                  "appBuildID": dims.get("appBuildID"),
                                  "submissionDate": dims.get("submission_date")}
                    batch.put(dims["submission_date"][:-2], key.name, attributes)
                    added_count += 1
                total_count += 1
                if total_count == limit or termination_requested[0]:
                    done = True
                    break
        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue
        break
    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(
        added_count, total_count, delta_sec(start_time)))
def main():
    parser = ArgumentParser(description='Convert local Telemetry pings to server storage structure')
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--schema", type=file, default='./telemetry/telemetry_schema.json')
    parser.add_argument("--histogram-cache-dir", default='/tmp/telemetry_histogram_cache')
    args = parser.parse_args()

    print "Getting pings from", args.input_dir, "converting them and storing them in", args.output_dir
    schema = TelemetrySchema(json.load(args.schema))
    cache = RevisionCache(args.histogram_cache_dir, 'hg.mozilla.org')
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, 500000000)

    ping_dir = args.input_dir
    ping_files = get_pings(ping_dir)
    if len(ping_files) == 0:
        # Try the usual ping dir (if the user just gave the Profile Dir)
        ping_dir = os.path.join(args.input_dir, "saved-telemetry-pings")
        ping_files = get_pings(ping_dir)
    print "found", len(ping_files), "pings"

    for ping_file in ping_files:
        with open(os.path.join(ping_dir, ping_file), "r") as f:
            ping = json.load(f)
            reason = ping['reason']
            key = ping['slug']
            payload = ping['payload']
            submission_date = date.today().strftime("%Y%m%d")
            dims = schema.dimensions_from(payload, submission_date)
            try:
                parsed_data, dims = converter.convert_obj(payload, dims[-1])
                serialized_data = converter.serialize(parsed_data)
                data_version = Converter.VERSION_CONVERTED
                try:
                    # Write to persistent storage
                    n = storage.write(key, serialized_data, dims, data_version)
                    print "Successfully saved ping", key, "to", n
                except Exception, e:
                    traceback.print_exc()
            except BadPayloadError, e:
                print "Bad Payload:", e.msg
            except Exception, e:
                traceback.print_exc()
def update_published_v4_files(sdb, bucket, bucket_prefix, submission_date, limit=None):
    s3 = S3Connection()
    metadata = s3.get_bucket(METADATA_BUCKET)
    schema_key = metadata.get_key("{}/schema.json".format(bucket_prefix))
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))
    bucket = s3.get_bucket(bucket)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    batch = BatchPut(sdb)
    prefix = "{}/{}".format(bucket_prefix, submission_date) if submission_date else bucket_prefix
    print "Bucket: {} - Prefix: {} - Date: {}".format(bucket.name, bucket_prefix, submission_date)
    while not done:
        try:
            for key in bucket.list(marker=last_key, prefix=prefix):
                last_key = key.name
                if total_count % 1e5 == 0:
                    print("Looked at {} total records in {} seconds, added {}".format(
                        total_count, delta_sec(start_time), added_count))
                dims = schema.get_dimension_map(
                    schema.get_dimensions(".", key.name[len(bucket_prefix) + 1:], dirs_only=True))
                if (dims["submissionDate"] == submission_date) and dims["submissionDate"][:-2] in sdb:
                    batch.put(dims["submissionDate"][:-2], key.name, dims)
                    added_count += 1
                total_count += 1
                if total_count == limit:
                    done = True
                    break
        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue
        break
    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(
        added_count, total_count, delta_sec(start_time)))
def _filter_to_schema(schema, filter_args):
    new_schema = {"version": 1, "dimensions": []}
    for i, dim in enumerate(schema["dimensions"]):
        new_filter = {
            "field_name": schema["dimensions"][i].get("field_name", "field{}".format(i)),
            "allowed_values": "*"
        }
        if dim["field_name"] in filter_args:
            new_filter["allowed_values"] = filter_args[dim["field_name"]]
        new_schema["dimensions"].append(new_filter)
    return TelemetrySchema(new_schema)
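# A minimal usage sketch for _filter_to_schema above: given a schema dict and a
# filter_args mapping, dimensions named in filter_args keep only those values,
# while every other dimension falls back to the "*" wildcard. The base_schema
# and filter_args values below are hypothetical examples for illustration, not
# values taken from this codebase.
base_schema = {
    "version": 1,
    "dimensions": [{"field_name": "reason", "allowed_values": "*"},
                   {"field_name": "appName", "allowed_values": "*"},
                   {"field_name": "submission_date", "allowed_values": "*"}]
}
filter_args = {"appName": ["Firefox"], "submission_date": ["20131003"]}
filtered = _filter_to_schema(base_schema, filter_args)
# 'filtered' is a TelemetrySchema whose appName and submission_date dimensions
# are restricted to the requested values, while reason still allows anything.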
def test_v4execschema():
    schema_spec = {
        "version": 2,
        "dimensions": [{
            "field_name": "submissionDate",
            "allowed_values": {"max": "20150901"}
        }]
    }
    schema = TelemetrySchema(schema_spec)
    found = set()
    for f in s3util.list_heka_partitions(v4execbucket, schema=schema):
        found.add(f.name)
    assert (len(found) == 3)
    assert ("20150901/20150901221519.541_ip-172-31-16-184" in found)
    assert ("20150901/20150901223019.579_ip-172-31-16-184" in found)
    assert ("20150901/20150901224519.623_ip-172-31-16-184" in found)

    # Test with a prefix:
    found = set()
    for f in s3util.list_heka_partitions(v4prefixbucket,
                                         prefix="telemetry-executive-summary-2",
                                         schema=schema):
        found.add(f.name)
    assert (len(found) == 3)
    assert ("telemetry-executive-summary-2/20150901/20150901221519.541_ip-172-31-16-184" in found)
    assert ("telemetry-executive-summary-2/20150901/20150901223019.579_ip-172-31-16-184" in found)
    assert ("telemetry-executive-summary-2/20150901/20150901224519.623_ip-172-31-16-184" in found)

    # Test with a bunch of prefixes:
    found = set()
    for f in s3util.list_heka_partitions(multiprefixbucket, prefix="a/b/c/d", schema=schema):
        found.add(f.name)
    assert (len(found) == 3)
    assert ("a/b/c/d/20150901/20150901221519.541_ip-172-31-16-184" in found)
    assert ("a/b/c/d/20150901/20150901223019.579_ip-172-31-16-184" in found)
    assert ("a/b/c/d/20150901/20150901224519.623_ip-172-31-16-184" in found)
def test_v4schema():
    schema_spec = {
        "version": 2,
        "dimensions": [
            {"field_name": "submissionDate", "allowed_values": "20150903"},
            {"field_name": "sourceName", "allowed_values": "*"},
            {"field_name": "sourceVersion", "allowed_values": "4"},
            {"field_name": "docType", "allowed_values": ["saved-session"]},
            {"field_name": "appName", "allowed_values": ["Firefox"]},
            {"field_name": "appUpdateChannel", "allowed_values": ["release"]},
            {"field_name": "appVersion", "allowed_values": "24.0"},
            {"field_name": "appBuildId", "allowed_values": "20130910160258"}
        ]
    }
    schema = TelemetrySchema(schema_spec)
    found = set()
    for f in s3util.list_heka_partitions(v4bucket, schema=schema):
        found.add(f.name)
    assert (len(found) == 3)
    assert ("20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051633.482_ip-172-31-16-184" in found)
    assert ("20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051644.482_ip-172-31-16-184" in found)
    assert ("20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051655.482_ip-172-31-16-184" in found)
def test_v2schema():
    schema_spec = {
        "version": 1,
        "dimensions": [
            {"field_name": "reason", "allowed_values": ["saved-session"]},
            {"field_name": "appName", "allowed_values": ["Firefox"]},
            {"field_name": "appUpdateChannel", "allowed_values": ["release"]},
            {"field_name": "appVersion", "allowed_values": ["24.0"]},
            {"field_name": "appBuildID", "allowed_values": ["20130910160258"]},
            {"field_name": "submission_date", "allowed_values": ["20131003", "20131004"]}
        ]
    }
    schema = TelemetrySchema(schema_spec)
    found = set()
    for f in s3util.list_partitions(v2bucket, schema=schema, include_keys=True):
        found.add(f.name)
    assert (len(found) == 2)
    assert ("saved_session/Firefox/release/24.0/20130910160258.20131003.v2.log.25b53e7042c74188b08d71ce32e87237.lzma" in found)
    assert ("saved_session/Firefox/release/24.0/20130910160258.20131004.v2.log.29afd7a250154729bd53c20253f8af78.lzma" in found)
def test_schema(d):
    schema_spec = {
        "version": 1,
        "dimensions": [
            {"field_name": "reason", "allowed_values": ["saved-session"]},
            {"field_name": "appName", "allowed_values": ["Firefox"]},
            {"field_name": "appUpdateChannel", "allowed_values": ["nightly"]},
            {"field_name": "appVersion", "allowed_values": ["27.0a1"]},
            {"field_name": "appBuildID", "allowed_values": ["20130918030202"]},
            {"field_name": "submission_date", "allowed_values": ["20131001"]}
        ]
    }
    schema = TelemetrySchema(schema_spec)
    successfully_downloaded = []
    failfully_downloaded = []
    for f, r, err in d.get_schema(schema):
        if err is not None:
            print err
            failfully_downloaded.append(f)
        else:
            print "Downloaded", f
            successfully_downloaded.append(f)
    assert len(failfully_downloaded) == 0
    print "Successfully downloaded", len(successfully_downloaded)
    assert len(successfully_downloaded) == 20
class TestPersist(unittest.TestCase):
    def setUp(self):
        test_dir = self.get_test_dir()
        self.schema = TelemetrySchema(self.get_schema_spec())
        self.storage = StorageLayout(self.schema, test_dir, 10000)
        assert not os.path.exists(test_dir)
        os.makedirs(test_dir)

    def tearDown(self):
        shutil.rmtree(self.get_test_dir())

    def get_test_dir(self):
        return "/tmp/test_telemetry_persist"

    def get_schema_spec(self):
        return {
            "version": 1,
            "dimensions": [
                {"field_name": "reason", "allowed_values": ["r1", "r2"]},
                {"field_name": "appName", "allowed_values": ["a1"]},
                {"field_name": "appUpdateChannel", "allowed_values": ["c1", "c2", "c3"]},
                {"field_name": "appVersion", "allowed_values": "*"},
                {"field_name": "appBuildID", "allowed_values": "*"},
                {"field_name": "submission_date", "allowed_values": {"min": "20130101", "max": "20131231"}}
            ]
        }

    def test_write_filename(self):
        test_file = os.path.join(self.get_test_dir(), "test.log")
        self.storage.write_filename("foo", '{"bar":"baz"}', test_file)
        test_file_md5, test_file_size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

        test_file = os.path.join(self.get_test_dir(), "test2.log")
        # Now test writing an object
        self.storage.write_filename("foo", {"bar": "baz"}, test_file)
        test_file_md5, test_file_size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_write(self):
        dims = ["r1", "a1", "c1", "v1", "b1", "20130102"]
        test_dir = self.get_test_dir()
        test_file = self.schema.get_filename(test_dir, dims)
        self.assertEquals(test_file, test_dir + "/r1/a1/c1/v1/b1.20130102.v1.log")
        self.storage.write("foo", '{"bar":"baz"}', dims)
        md5, size = fileutil.md5file(test_file)
        self.assertEqual(md5, "0ea91df239ea79ed2ebab34b46d455fc")

    def test_clean_newlines(self):
        self.assertEqual(self.storage.clean_newlines("ab\n\ncd\r\n"), "ab cd ")

    def test_rotate(self):
        test_file = os.path.join(self.get_test_dir(), "test.log")
        key = "01234567890123456789012345678901234567890123456789"
        value = '{"some filler stuff here":"fffffffffffffffffff"}'
        # each iteration should be 100 bytes.
        for i in range(99):
            result = self.storage.write_filename(key, value, test_file)
            self.assertEquals(result, test_file)

        # The 100th iteration should cause the file to rotate
        rolled = self.storage.write_filename(key, value, test_file)
        # rolled should be <test_dir>/test.log.<pid>.<timestamp><suffix>
        self.assertNotEqual(rolled, test_file)
        self.assertTrue(rolled.startswith(test_file))
        self.assertTrue(rolled.endswith(StorageLayout.PENDING_COMPRESSION_SUFFIX))
class Job:
    """A class for orchestrating a Heka MapReduce job"""

    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        # Sanity check args.
        if config.get("num_mappers") <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.get("num_reducers") <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.get("data_dir")):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.get("work_dir")):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.get("job_script", "")):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.get("input_filter")):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.get("data_dir")
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.get("work_dir")
        with open(config.get("input_filter")) as filter_file:
            self._input_filter = TelemetrySchema(json.load(filter_file))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.get("output")
        self._num_mappers = config.get("num_mappers")
        self._num_reducers = config.get("num_reducers")
        self._local_only = config.get("local_only")
        self._bucket_name = config.get("bucket")
        self._aws_key = config.get("aws_key")
        self._aws_secret_key = config.get("aws_secret_key")
        self._profile = config.get("profile")
        self._delete_data = config.get("delete_data")
        with open(config.get("job_script")) as modulefd:
            # let the job script import additional modules under its path
            sys.path.append(os.path.dirname(config.get("job_script")))
            ## Lifted from FileDriver.py in jydoop.
            self._job_module = imp.load_module("telemetry_job", modulefd,
                                               config.get("job_script"), ('.py', 'U', 1))

    def dump_stats(self, partitions):
        total = sum(partitions)
        avg = total / len(partitions)
        for i in range(len(partitions)):
            print "Partition %d contained %d (%+d)" % (i, partitions[i], float(partitions[i]) - avg)

    def dedupe_remotes(self, remote_files, local_files):
        return (r for r in remote_files
                if os.path.join(self._input_dir, r.name) not in local_files)

    def mapreduce(self):
        # Find files matching specified input filter
        files = set(self.get_filtered_files(self._input_dir))
        remote_files = self.get_filtered_files_s3()

        # If we're using the cache dir as the data dir, we will end up reading
        # each already-downloaded file twice, so we should skip any remote files
        # that exist in the data dir.
        remote_files = self.dedupe_remotes(remote_files, files)

        # Partition files into reasonably equal groups for use by mappers
        print "Partitioning input data..."
        partitions = self.partition(files, remote_files)
        print "Done"

        if not any(part for part in partitions):
            print "Filter didn't match any files... nothing to do"
            return

        partitions = [part for part in partitions if part]

        # Not useful to have more mappers than partitions.
        if len(partitions) < self._num_mappers:
            print "Filter matched only %d input files. Reducing number of mappers accordingly." % (len(partitions),)
            self._num_mappers = len(partitions)

        # Free up our set of names. We want to minimize
        # our memory usage prior to forking map jobs.
        # files = None
        gc.collect()

        def checkExitCode(proc):
            # If process was terminated by a signal, exitcode is the negative signal value
            if proc.exitcode == -signal.SIGKILL:
                # SIGKILL is most likely an OOM kill
                raise MemoryError("%s ran out of memory" % proc.name)
            elif proc.exitcode:
                raise OSError("%s exited with code %d" % (proc.name, proc.exitcode))

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if len(partitions[i]) > 0:
                p = Process(target=Mapper,
                            name=("Mapper-%d" % i),
                            args=(i, self._profile, partitions[i], self._work_dir,
                                  self._job_module, self._num_reducers,
                                  self._delete_data, self._aws_key,
                                  self._aws_secret_key, self._bucket_name))
                mappers.append(p)
                p.start()
            else:
                print "Skipping mapper", i, "- no input files to process"
        for m in mappers:
            m.join()
            checkExitCode(m)

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(target=Reducer,
                        name=("Reducer-%d" % i),
                        args=(i, self._profile, self._work_dir, self._job_module,
                              self._num_mappers))
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()
            checkExitCode(r)

        # Reducers are done. Output results.
        to_combine = 1
        try:
            os.rename(os.path.join(self._work_dir, "reducer_0"), self._output_file)
        except OSError, e:
            if e.errno != errno.EXDEV:
                raise
            else:
                # OSError: [Errno 18] Invalid cross-device link (EXDEV == 18)
                # We can't rename across devices :( Copy / delete instead.
                to_combine = 0

        # TODO: If _output_file ends with a compressed suffix (.gz, .xz, .bz2, etc),
        # try to compress it after writing.
        if self._num_reducers > to_combine:
            with open(self._output_file, "a") as out:
                for i in range(to_combine, self._num_reducers):
                    # FIXME: this reads the entire reducer output into memory
                    reducer_filename = os.path.join(self._work_dir, "reducer_" + str(i))
                    with open(reducer_filename, "r") as reducer_output:
                        out.write(reducer_output.read())
                    os.remove(reducer_filename)

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print "Warning: Could not find", mfile
class Job:
    """A class for orchestrating a Telemetry MapReduce job"""

    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        # Sanity check args.
        if config.num_mappers <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.num_reducers <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.data_dir):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.work_dir):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.job_script):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.input_filter):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.data_dir
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.work_dir
        self._input_filter = TelemetrySchema(json.load(open(config.input_filter)))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.output
        self._num_mappers = config.num_mappers
        self._num_reducers = config.num_reducers
        self._local_only = config.local_only
        self._bucket_name = config.bucket
        self._aws_key = config.aws_key
        self._aws_secret_key = config.aws_secret_key
        modulefd = open(config.job_script)
        # let the job script import additional modules under its path
        sys.path.append(os.path.dirname(config.job_script))
        ## Lifted from FileDriver.py in jydoop.
        self._job_module = imp.load_module("telemetry_job", modulefd,
                                           config.job_script, ('.py', 'U', 1))

    def dump_stats(self, partitions):
        total = sum(partitions)
        avg = total / len(partitions)
        for i in range(len(partitions)):
            print "Partition %d contained %d (%+d)" % (i, partitions[i], float(partitions[i]) - avg)

    def fetch_remotes(self, remotes):
        # TODO: fetch remotes inside Mappers, and process each one as it becomes available.
        remote_names = [r["name"] for r in remotes if r["type"] == "remote"]

        # TODO: check cache first.
        result = 0
        if len(remote_names) == 0:
            return result

        fetch_cwd = os.path.join(self._work_dir, "cache")
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)
        loader = s3util.Loader(fetch_cwd, self._bucket_name,
                               aws_key=self._aws_key,
                               aws_secret_key=self._aws_secret_key)
        start = datetime.now()
        downloaded_bytes = 0
        for local, remote, err in loader.get_list(remote_names):
            if err is None:
                print "Downloaded", remote
                downloaded_bytes += os.path.getsize(local)
            else:
                print "Failed to download", remote
                result += 1
        duration_sec = timer.delta_sec(start)
        downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
        print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (downloaded_mb, duration_sec, downloaded_mb / duration_sec)
        return result

    def dedupe_remotes(self, remote_files, local_files):
        return [r for r in remote_files
                if os.path.join(self._input_dir, r.name) not in local_files]

    def mapreduce(self):
        # Find files matching specified input filter
        files = self.get_filtered_files(self._input_dir)
        remote_files = self.get_filtered_files_s3()

        # If we're using the cache dir as the data dir, we will end up reading
        # each already-downloaded file twice, so we should skip any remote files
        # that exist in the data dir.
        remote_files = self.dedupe_remotes(remote_files, files)

        file_count = len(files) + len(remote_files)
        if file_count == 0:
            print "Filter didn't match any files... nothing to do"
            return

        # Not useful to have more mappers than input files.
        if file_count < self._num_mappers:
            print "Filter matched only %s input files (%s local in %s and %s " \
                  "remote from %s). Reducing number of mappers accordingly." \
                  % (file_count, len(files), self._input_dir, len(remote_files),
                     self._bucket_name)
            self._num_mappers = file_count

        # Partition files into reasonably equal groups for use by mappers
        print "Partitioning input data..."
        partitions = self.partition(files, remote_files)
        print "Done"

        def checkExitCode(proc):
            # If process was terminated by a signal, exitcode is the negative signal value
            if proc.exitcode == -signal.SIGKILL:
                # SIGKILL is most likely an OOM kill
                raise MemoryError("%s ran out of memory" % proc.name)
            elif proc.exitcode:
                raise OSError("%s exited with code %d" % (proc.name, proc.exitcode))

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if len(partitions[i]) > 0:
                # Fetch the files we need for each mapper
                print "Fetching remotes for partition", i
                fetch_result = self.fetch_remotes(partitions[i])
                if fetch_result == 0:
                    print "Remote files fetched successfully"
                else:
                    print "ERROR: Failed to fetch", fetch_result, "files."
                    # TODO: Bail, since results will be unreliable?
                p = Process(target=Mapper,
                            name=("Mapper-%d" % i),
                            args=(i, partitions[i], self._work_dir,
                                  self._job_module, self._num_reducers))
                mappers.append(p)
                p.start()
            else:
                print "Skipping mapper", i, "- no input files to process"
        for m in mappers:
            m.join()
            checkExitCode(m)

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(target=Reducer,
                        name=("Reducer-%d" % i),
                        args=(i, self._work_dir, self._job_module, self._num_mappers))
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()
            checkExitCode(r)

        # Reducers are done. Output results.
        to_combine = 1
        try:
            os.rename(os.path.join(self._work_dir, "reducer_0"), self._output_file)
        except OSError, e:
            if e.errno != errno.EXDEV:
                raise
            else:
                # OSError: [Errno 18] Invalid cross-device link (EXDEV == 18)
                # We can't rename across devices :( Copy / delete instead.
                to_combine = 0

        # TODO: If _output_file ends with a compressed suffix (.gz, .xz, .bz2, etc),
        # try to compress it after writing.
        if self._num_reducers > to_combine:
            out = open(self._output_file, "a")
            for i in range(to_combine, self._num_reducers):
                # FIXME: this reads the entire reducer output into memory
                reducer_filename = os.path.join(self._work_dir, "reducer_" + str(i))
                reducer_output = open(reducer_filename, "r")
                out.write(reducer_output.read())
                reducer_output.close()
                os.remove(reducer_filename)

        # TODO: clean up downloaded files?

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print "Warning: Could not find", mfile
class Job:
    """A class for orchestrating a Telemetry MapReduce job"""

    # 1. read input filter
    # 2. generate filtered list of local input files
    # 2a. generate filtered list of remote input files
    # 3. load mapper
    # 4. spawn N processes
    # 5. distribute files among processes
    # 6. map(key, value, dims) each line in the file
    # 7. combine map output for each file
    # 8. reduce combine output overall

    def __init__(self, config):
        # Sanity check args.
        if config.get("num_mappers") <= 0:
            raise ValueError("Number of mappers must be greater than zero")
        if config.get("num_reducers") <= 0:
            raise ValueError("Number of reducers must be greater than zero")
        if not os.path.isdir(config.get("data_dir")):
            raise ValueError("Data dir must be a valid directory")
        if not os.path.isdir(config.get("work_dir")):
            raise ValueError("Work dir must be a valid directory")
        if not os.path.isfile(config.get("job_script", "")):
            raise ValueError("Job script must be a valid python file")
        if not os.path.isfile(config.get("input_filter")):
            raise ValueError("Input filter must be a valid json file")

        self._input_dir = config.get("data_dir")
        if self._input_dir[-1] == os.path.sep:
            self._input_dir = self._input_dir[0:-1]
        self._work_dir = config.get("work_dir")
        self._input_filter = TelemetrySchema(json.load(open(config.get("input_filter"))))
        self._allowed_values = self._input_filter.sanitize_allowed_values()
        self._output_file = config.get("output")
        self._num_mappers = config.get("num_mappers")
        self._num_reducers = config.get("num_reducers")
        self._local_only = config.get("local_only")
        self._bucket_name = config.get("bucket")
        self._aws_key = config.get("aws_key")
        self._aws_secret_key = config.get("aws_secret_key")
        self._profile = config.get("profile")
        modulefd = open(config.get("job_script"))
        # let the job script import additional modules under its path
        sys.path.append(os.path.dirname(config.get("job_script")))
        ## Lifted from FileDriver.py in jydoop.
        self._job_module = imp.load_module("telemetry_job", modulefd,
                                           config.get("job_script"), ('.py', 'U', 1))

    def dump_stats(self, partitions):
        total = sum(partitions)
        avg = total / len(partitions)
        for i in range(len(partitions)):
            print "Partition %d contained %d (%+d)" % (i, partitions[i], float(partitions[i]) - avg)

    def fetch_remotes(self, remotes):
        # TODO: fetch remotes inside Mappers, and process each one as it becomes available.
        remote_names = [r["name"] for r in remotes if r["type"] == "remote"]

        # TODO: check cache first.
        result = 0
        if len(remote_names) == 0:
            return result

        fetch_cwd = os.path.join(self._work_dir, "cache")
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)
        loader = s3util.Loader(fetch_cwd, self._bucket_name,
                               aws_key=self._aws_key,
                               aws_secret_key=self._aws_secret_key)
        start = datetime.now()
        downloaded_bytes = 0
        for local, remote, err in loader.get_list(remote_names):
            if err is None:
                print "Downloaded", remote
                downloaded_bytes += os.path.getsize(local)
            else:
                print "Failed to download", remote
                result += 1
        duration_sec = timer.delta_sec(start)
        downloaded_mb = float(downloaded_bytes) / 1024.0 / 1024.0
        print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (downloaded_mb, duration_sec, downloaded_mb / duration_sec)
        return result

    def dedupe_remotes(self, remote_files, local_files):
        return [r for r in remote_files
                if os.path.join(self._input_dir, r.name) not in local_files]

    def mapreduce(self):
        # Find files matching specified input filter
        files = self.get_filtered_files(self._input_dir)
        remote_files = self.get_filtered_files_s3()

        # If we're using the cache dir as the data dir, we will end up reading
        # each already-downloaded file twice, so we should skip any remote files
        # that exist in the data dir.
        remote_files = self.dedupe_remotes(remote_files, files)

        file_count = len(files) + len(remote_files)
        if file_count == 0:
            print "Filter didn't match any files... nothing to do"
            return

        # Not useful to have more mappers than input files.
        if file_count < self._num_mappers:
            print "Filter matched only %s input files (%s local in %s and %s " \
                  "remote from %s). Reducing number of mappers accordingly." \
                  % (file_count, len(files), self._input_dir, len(remote_files),
                     self._bucket_name)
            self._num_mappers = file_count

        # Partition files into reasonably equal groups for use by mappers
        print "Partitioning input data..."
        partitions = self.partition(files, remote_files)
        print "Done"

        def checkExitCode(proc):
            # If process was terminated by a signal, exitcode is the negative signal value
            if proc.exitcode == -signal.SIGKILL:
                # SIGKILL is most likely an OOM kill
                raise MemoryError("%s ran out of memory" % proc.name)
            elif proc.exitcode:
                raise OSError("%s exited with code %d" % (proc.name, proc.exitcode))

        # Partitions are ready. Map.
        mappers = []
        for i in range(self._num_mappers):
            if len(partitions[i]) > 0:
                # Fetch the files we need for each mapper
                print "Fetching remotes for partition", i
                fetch_result = self.fetch_remotes(partitions[i])
                if fetch_result == 0:
                    print "Remote files fetched successfully"
                else:
                    print "ERROR: Failed to fetch", fetch_result, "files."
                    # TODO: Bail, since results will be unreliable?
                p = Process(target=Mapper,
                            name=("Mapper-%d" % i),
                            args=(i, self._profile, partitions[i], self._work_dir,
                                  self._job_module, self._num_reducers))
                mappers.append(p)
                p.start()
            else:
                print "Skipping mapper", i, "- no input files to process"
        for m in mappers:
            m.join()
            checkExitCode(m)

        # Mappers are done. Reduce.
        reducers = []
        for i in range(self._num_reducers):
            p = Process(target=Reducer,
                        name=("Reducer-%d" % i),
                        args=(i, self._profile, self._work_dir, self._job_module,
                              self._num_mappers))
            reducers.append(p)
            p.start()
        for r in reducers:
            r.join()
            checkExitCode(r)

        # Reducers are done. Output results.
        to_combine = 1
        try:
            os.rename(os.path.join(self._work_dir, "reducer_0"), self._output_file)
        except OSError, e:
            if e.errno != errno.EXDEV:
                raise
            else:
                # OSError: [Errno 18] Invalid cross-device link (EXDEV == 18)
                # We can't rename across devices :( Copy / delete instead.
                to_combine = 0

        # TODO: If _output_file ends with a compressed suffix (.gz, .xz, .bz2, etc),
        # try to compress it after writing.
        if self._num_reducers > to_combine:
            out = open(self._output_file, "a")
            for i in range(to_combine, self._num_reducers):
                # FIXME: this reads the entire reducer output into memory
                reducer_filename = os.path.join(self._work_dir, "reducer_" + str(i))
                reducer_output = open(reducer_filename, "r")
                out.write(reducer_output.read())
                reducer_output.close()
                os.remove(reducer_filename)

        # TODO: clean up downloaded files?

        # Clean up mapper outputs
        for m in range(self._num_mappers):
            for r in range(self._num_reducers):
                mfile = os.path.join(self._work_dir, "mapper_%d_%d" % (m, r))
                if os.path.exists(mfile):
                    os.remove(mfile)
                else:
                    print "Warning: Could not find", mfile
def main():
    parser = argparse.ArgumentParser(
        description="Split raw logs into partitioned files.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m", "--max-output-size", metavar="N",
                        help="Rotate output files after N bytes",
                        type=int, default=500000000)
    parser.add_argument("-i", "--input-file", help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True)
    parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-b", "--bucket", help="S3 Bucket name")
    parser.add_argument("-k", "--aws-key", help="AWS Key")
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)
    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    for len_path, len_data, timestamp, path, data, err in fileutil.unpack(args.input_file):
        record_count += 1
        if err:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(timestamp / 1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(path, errors="replace")
        data = unicode(data, errors="replace")
        bytes_read += len_path + len_data + fileutil.RECORD_PREAMBLE_LENGTH
        # print "Path for record", record_count, path, "length of data:", len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        # print " Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)

    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0
class AnalysisJob:
    def __init__(self, cfg):
        self.job_bundle = cfg.job_bundle
        if cfg.input_filter:
            self.input_filter = TelemetrySchema(json.load(open(cfg.input_filter)))
        else:
            self.input_filter = None
        if cfg.input_list_file:
            self.input_list = cfg.input_list_file
        else:
            self.input_list = None
        self.job_id = str(uuid4())
        self.target_queue = cfg.target_queue
        self.aws_key = cfg.aws_key
        self.aws_secret_key = cfg.aws_secret_key
        self.input_bucket = "telemetry-published-v1"
        self.job_name = cfg.name
        self.job_owner = cfg.owner
        self.date_limit = cfg.date_limit

        # Bucket with intermediate data for this analysis job
        self.analysis_bucket = "jonasfj-telemetry-analysis"
        self.s3_code_path = "batch-jobs/" + self.job_id + ".tar.gz"

        # S3 region of operation
        self.aws_region = "us-west-2"
        self.task_size_limit = 400 * 1024 * 1024
        self.sqs_input_name = cfg.sqs_queue

    def get_filtered_files(self):
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        # date_limit is a hack that makes it easy to launch everything before
        # a given date... say to back-process all we have in the bucket...
        if self.date_limit != None:
            print "Launching limiting to before " + self.date_limit
            for k, s in self.list_partitions(bucket):
                if k.split('/')[-1].split('.')[1] < self.date_limit:
                    yield (k, s)
        else:
            for k, s in self.list_partitions(bucket):
                yield (k, s)

    def get_filtered_files_old(self):
        """ Get tuples of name and size for all input files """
        # Setup some auxiliary functions
        allowed_values = self.input_filter.sanitize_allowed_values()
        nb_dims = len(allowed_values)

        def filter_includes(level, value):
            return self.input_filter.is_allowed(value, allowed_values[level])

        # Iterate over all files in the bucket. This is very slow; we should be
        # able to do something much smarter using prefix listing and ordering
        # to break off listing early.
        count = 0
        selected = 0
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        for f in bucket.list():
            count += 1
            dims = self.input_filter.get_dimensions(".", f.key)
            include = True
            for i in xrange(nb_dims):
                if not filter_includes(i, dims[i]):
                    include = False
                    break
            if include:
                selected += 1
                yield (f.key, f.size)
            if count % 5000 == 0:
                print "%i files listed with %i selected" % (count, selected)
        conn.close()

    def list_partitions(self, bucket, prefix='', level=0):
        if self.input_filter:
            #print "Listing...", prefix, level
            allowed_values = self.input_filter.sanitize_allowed_values()
            delimiter = '/'
            if level > 3:
                delimiter = '.'
            for k in bucket.list(prefix=prefix, delimiter=delimiter):
                partitions = k.name.split("/")
                if level > 3:
                    # split the last couple of partition components by "." instead of "/"
                    partitions.extend(partitions.pop().split(".", 2))
                if self.input_filter.is_allowed(partitions[level], allowed_values[level]):
                    if level >= 5:
                        for f in bucket.list(prefix=k.name):
                            yield (f.key, f.size)
                    else:
                        for k, s in self.list_partitions(bucket, k.name, level + 1):
                            yield (k, s)
        elif self.input_list:
            print "Using input list..."
            for line in self.input_list:
                key_name = line.strip()
                k = bucket.get_key(key_name)
                yield (k.key, k.size)
        else:
            print "Don't know how to list partitions without a filter or list :("
            raise ValueError("Missing both input_filter and input_list")

    def generate_tasks(self):
        """ Generate SQS tasks; we batch small files into a single task """
        taskid = 1
        taskfiles = []
        tasksize = 0
        total_size_of_all = 0
        for key, size in self.get_filtered_files():
            # If the task has reached the desired size we yield it.
            # Note: as SQS messages are limited to 65 KiB, we also cap the
            # number of filenames per task, for simplicity.
            # boto only uses signature version 4, hence, we're limited to 65 KiB
            if 0 < len(taskfiles) and (tasksize + size > self.task_size_limit or
                                       len(taskfiles) > 200):
                # Reduce to only filenames, sort by size... smallest first, as they
                # are faster to download when handling the job
                taskfiles = [f for f, s in sorted(taskfiles, key=lambda (f, s): s)]
                yield {
                    'id': self.job_id + "/" + str(taskid),
                    'name': self.job_name,
                    'owner': self.job_owner,
                    'code': self.s3_code_path,
                    'target-queue': self.target_queue,
                    'files': taskfiles,
                    'size': tasksize
                }
                total_size_of_all += tasksize
                print "%i tasks created acc. size: %s" % (taskid, total_size_of_all)
                taskid += 1
                taskfiles = []
                tasksize = 0
            tasksize += size
            taskfiles.append((key, size))
        if len(taskfiles) > 0:
            taskfiles = [f for f, s in sorted(taskfiles, key=lambda (f, s): s)]
            yield {
                'id': self.job_id + "/" + str(taskid),
                'name': self.job_name,
                'owner': self.job_owner,
                'code': self.s3_code_path,
                'target-queue': self.target_queue,
                'files': taskfiles,
                'size': tasksize
            }
        print "Finished:"
        print "%i tasks created total size: %s" % (taskid, total_size_of_all + tasksize)

    def put_sqs_tasks(self):
        """ Create SQS tasks for this analysis job """
        print "Populate SQS input queue with tasks"
        # Connect to SQS in the desired region
        conn = sqs.connect_to_region(self.aws_region,
                                     aws_access_key_id=self.aws_key,
                                     aws_secret_access_key=self.aws_secret_key)
        # Create queue
        queue = conn.get_queue(self.sqs_input_name)
        queue.set_message_class(JSONMessage)
        # Populate queue with tasks
        for task in self.generate_tasks():
            #print "enqueueing", task["id"], "size:", task["size"]
            msg = queue.new_message(body=task)
            queue.write(msg)
        conn.close()

    def setup(self):
        self.upload_job_bundle()
        self.put_sqs_tasks()
        print "Uploaded with job_id: %s" % self.job_id

    def upload_job_bundle(self):
        """ Upload job bundle to S3 """
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.analysis_bucket)
        k = Key(bucket)
        k.key = self.s3_code_path
        k.set_contents_from_filename(self.job_bundle)
        conn.close()
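# A minimal driving sketch for the AnalysisJob class above. It only requires a
# config object exposing the attributes read in __init__, so an
# argparse.Namespace works. All values below (bundle path, filter file, queue
# names, credentials, owner) are hypothetical placeholders, not values from
# this codebase.
from argparse import Namespace

cfg = Namespace(job_bundle="my-job.tar.gz",       # hypothetical job bundle path
                input_filter="filter.json",       # hypothetical TelemetrySchema filter spec
                input_list_file=None,
                target_queue="my-output-queue",   # hypothetical SQS queue names
                sqs_queue="my-input-queue",
                aws_key="...",
                aws_secret_key="...",
                name="example-job",
                owner="someone@example.com",
                date_limit=None)
job = AnalysisJob(cfg)
# setup() uploads the bundle to S3 and enqueues one SQS task per batch of input files.
job.setup()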
def main():
    parser = argparse.ArgumentParser(
        description='Split raw logs into partitioned files.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m", "--max-output-size", metavar="N",
                        help="Rotate output files after N bytes",
                        type=int, default=500000000)
    parser.add_argument("-i", "--input-file", help="Filename to read from", required=True)
    parser.add_argument("-o", "--output-dir", help="Base directory to store split files", required=True)
    parser.add_argument("-t", "--telemetry-schema", help="Filename of telemetry schema spec", required=True)
    parser.add_argument("-f", "--file-version", help="Log file version (if omitted, we'll guess)")
    args = parser.parse_args()

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)
    expected_dim_count = len(schema._dimensions)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    record_count = 0
    bad_record_count = 0
    bytes_read = 0
    start = datetime.now()
    file_version = args.file_version
    if not file_version:
        file_version = fileutil.detect_file_version(args.input_file)
    for r in fileutil.unpack(args.input_file, file_version=file_version):
        record_count += 1
        if r.error:
            bad_record_count += 1
            continue
        # Incoming timestamps are in milliseconds, so convert to POSIX first
        # (ie. seconds)
        submission_date = date.fromtimestamp(r.timestamp / 1000).strftime("%Y%m%d")
        # Deal with unicode
        path = unicode(r.path, errors="replace")
        data = unicode(r.data, errors="replace")
        bytes_read += r.len_ip + r.len_path + r.len_data + fileutil.RECORD_PREAMBLE_LENGTH[file_version]
        #print "Path for record", record_count, path, "length of data:", r.len_data, "data:", data[0:5] + "..."

        path_components = path.split("/")
        if len(path_components) != expected_dim_count:
            # We're going to pop the ID off, but we'll also add the submission
            # date, so it evens out.
            print "Found an invalid path in record", record_count, path
            bad_record_count += 1
            continue

        key = path_components.pop(0)
        info = {}
        info["reason"] = path_components.pop(0)
        info["appName"] = path_components.pop(0)
        info["appVersion"] = path_components.pop(0)
        info["appUpdateChannel"] = path_components.pop(0)
        info["appBuildID"] = path_components.pop(0)
        dimensions = schema.dimensions_from(info, submission_date)
        #print " Converted path to filename", schema.get_filename(args.output_dir, dimensions)
        storage.write(key, data, dimensions)

    duration = timer.delta_sec(start)
    mb_read = bytes_read / 1024.0 / 1024.0
    print "Read %.2fMB in %.2fs (%.2fMB/s), %d of %d records were bad" % (
        mb_read, duration, mb_read / duration, bad_record_count, record_count)
    return 0
def update_published_v4_files(sdb, bucket, bucket_prefix, submission_date, limit=None):
    conn = boto.connect_s3(host=S3_DEFAULT_ENDPOINT)
    metadata = conn.get_bucket(METADATA_BUCKET, validate=False)
    schema_key = metadata.get_key("{}/schema.json".format(bucket_prefix))
    schema_string = schema_key.get_contents_as_string()
    schema = TelemetrySchema(json.loads(schema_string))
    bucket = conn.get_bucket(bucket, validate=False)

    added_count = 0
    total_count = 0
    start_time = datetime.now()
    done = False
    last_key = ''
    batch = BatchPut(sdb)
    prefix = "{}/{}".format(bucket_prefix, submission_date) if submission_date else bucket_prefix
    print "Bucket: {} - Prefix: {} - Date: {}".format(bucket.name, bucket_prefix, submission_date)

    while not done:
        try:
            for key in bucket.list(marker=last_key, prefix=prefix):
                last_key = key.name
                if total_count % 1e5 == 0:
                    print("Looked at {} total records in {} seconds, added {}".format(
                        total_count, delta_sec(start_time), added_count))
                dims = schema.get_dimension_map(
                    schema.get_dimensions(".", key.name[len(bucket_prefix) + 1:], dirs_only=True))
                if (dims["submissionDate"] == submission_date) and \
                        dims["submissionDate"][:-2] in sdb:
                    batch.put(dims["submissionDate"][:-2], key.name, dims)
                    added_count += 1
                total_count += 1
                if total_count == limit:
                    done = True
                    break
        except Exception as e:
            print("Error listing keys: {}".format(e))
            traceback.print_exc()
            print("Continuing from last seen key: {}".format(last_key))
            continue
        break
    batch.flush()
    print("Overall, added {} of {} in {} seconds".format(
        added_count, total_count, delta_sec(start_time)))

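# --- Hypothetical sketch of a BatchPut-style buffer ----------------------------
# The BatchPut class used above is defined elsewhere in the repo and is not
# shown in this listing. The stand-in below is only meant to illustrate the
# put()/flush() contract as it is called here, assuming `sdb` maps a monthly
# domain name (e.g. "201403") to a boto SimpleDB Domain object. SimpleDB
# accepts at most 25 items per batch_put_attributes call, so we flush in
# chunks of that size.
class BatchPutSketch:
    MAX_BATCH = 25

    def __init__(self, sdb):
        self.sdb = sdb
        self.pending = {}  # domain name -> {item_name: attributes}

    def put(self, domain_name, item_name, attributes):
        items = self.pending.setdefault(domain_name, {})
        items[item_name] = attributes
        if len(items) >= self.MAX_BATCH:
            self._write(domain_name)

    def _write(self, domain_name):
        items = self.pending.pop(domain_name, {})
        if items:
            self.sdb[domain_name].batch_put_attributes(items)

    def flush(self):
        for domain_name in list(self.pending.keys()):
            self._write(domain_name)
# -------------------------------------------------------------------------------
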
def main():
    signal.signal(signal.SIGINT, handle_sigint)
    parser = argparse.ArgumentParser(
        description='Process incoming Telemetry data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-c", "--config", required=True, type=file,
                        help="AWS Configuration file (json)")
    parser.add_argument("-w", "--work-dir", required=True,
                        help="Location to cache downloaded files")
    parser.add_argument("-o", "--output-dir", required=True,
                        help="Base dir to store processed data")
    parser.add_argument("-i", "--input-files", type=file,
                        help="File containing a list of keys to process")
    parser.add_argument("-b", "--bad-data-log",
                        help="Save bad records to this file")
    parser.add_argument("-l", "--log-file",
                        help="Log output to this file")
    parser.add_argument("-s", "--stats-file",
                        help="Log statistics to this file")
    parser.add_argument("--histogram-cache-path", default="./histogram_cache",
                        help="Path to store a local cache of histograms")
    parser.add_argument("-t", "--telemetry-schema", required=True,
                        help="Location of the desired telemetry schema")
    parser.add_argument("-m", "--max-output-size", metavar="N", type=int,
                        default=500000000,
                        help="Rotate output files after N bytes")
    parser.add_argument("-D", "--dry-run", action="store_true",
                        help="Don't modify remote files")
    parser.add_argument("-n", "--no-clean", action="store_true",
                        help="Don't clean out the output-dir before beginning")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print more detailed output")
    args = parser.parse_args()

    if args.verbose:
        # Turn on multiprocessing logging
        multiprocessing.log_to_stderr(logging.DEBUG)

    config = json.load(args.config)
    # TODO: allow commandline args to override config values.

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)
    logger = Log(args.log_file, "Master")
    num_cpus = multiprocessing.cpu_count()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    s3downloader = None
    raw_readers = None
    compressors = None
    exporters = None
    done = False

    if args.no_clean:
        logger.log("Not removing log files in {}".format(args.output_dir))
    else:
        # Remove existing log files from output_dir (to clean up after an
        # incomplete previous run, for example).
        logger.log("Removing log files in {}".format(args.output_dir))
        for root, dirs, files in os.walk(args.output_dir):
            for f in files:
                if f.endswith(".log"):
                    full = os.path.join(root, f)
                    if args.dry_run:
                        logger.log("Would be deleting {}, except it's a "
                                   "dry run".format(full))
                    else:
                        try:
                            logger.log("Removing existing file: " + full)
                            os.remove(full)
                        except Exception, e:
                            logger.log("Error removing existing "
                                       "file {}: {}".format(full, e))

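# --- Hypothetical stand-in for the Log helper -----------------------------------
# The Log class constructed above is defined elsewhere and not shown in this
# listing. A minimal stand-in consistent with how it is called here --
# Log(filename_or_None, label).log(message) -- might look like the following;
# the timestamped output format is an assumption.
import sys
from datetime import datetime

class LogSketch:
    def __init__(self, filename, label):
        self.label = label
        self.fout = open(filename, "a") if filename else sys.stderr

    def log(self, message):
        self.fout.write("{} {}: {}\n".format(
            datetime.now().isoformat(), self.label, message))
        self.fout.flush()
# ---------------------------------------------------------------------------------
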
def main():
    parser = argparse.ArgumentParser(
        description='Process incoming Telemetry data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("incoming_bucket",
                        help="The S3 bucket containing incoming files")
    parser.add_argument("publish_bucket",
                        help="The S3 bucket to save processed files")
    parser.add_argument("-k", "--aws-key", help="AWS Key", required=True)
    parser.add_argument("-s", "--aws-secret-key", help="AWS Secret Key",
                        required=True)
    parser.add_argument("-r", "--aws-region", help="AWS Region",
                        default="us-west-2")
    parser.add_argument("-w", "--work-dir",
                        help="Location to cache downloaded files",
                        required=True)
    parser.add_argument("-o", "--output-dir",
                        help="Base dir to store processed data",
                        required=True)
    parser.add_argument("-i", "--input-files",
                        help="File containing a list of keys to process",
                        type=file)
    parser.add_argument("-b", "--bad-data-log",
                        help="Save bad records to this file")
    parser.add_argument("-q", "--queue",
                        help="SQS Queue name to poll for incoming data")
    parser.add_argument("-c", "--histogram-cache-path",
                        help="Path to store a local cache of histograms",
                        default="./histogram_cache")
    parser.add_argument("-t", "--telemetry-schema",
                        help="Location of the desired telemetry schema",
                        required=True)
    parser.add_argument("-m", "--max-output-size", metavar="N",
                        help="Rotate output files after N bytes",
                        type=int, default=500000000)
    parser.add_argument("-D", "--dry-run",
                        help="Don't modify remote files",
                        action="store_true")
    parser.add_argument("-C", "--skip-conversion",
                        help="Skip validation/conversion of payloads",
                        action="store_true")
    args = parser.parse_args()

    if not os.path.isfile(S3FUNNEL_PATH):
        print "ERROR: s3funnel not found at", S3FUNNEL_PATH
        print "You can get it from github: https://github.com/sstoiana/s3funnel"
        return -1

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    schema_data = open(args.telemetry_schema)
    schema = TelemetrySchema(json.load(schema_data))
    schema_data.close()

    cache = RevisionCache(args.histogram_cache_path, "hg.mozilla.org")
    if args.skip_conversion:
        converter = None
    else:
        converter = Converter(cache, schema)
    storage = StorageLayout(schema, args.output_dir, args.max_output_size)

    num_cpus = multiprocessing.cpu_count()

    start = datetime.now()
    conn = None
    incoming_bucket = None
    incoming_queue = None
    incoming_queue_messages = []

    if not args.dry_run:
        conn = S3Connection(args.aws_key, args.aws_secret_key)
        incoming_bucket = conn.get_bucket(args.incoming_bucket)

    incoming_filenames = []
    if args.queue is not None:
        print "Fetching file list from queue", args.queue
        if args.dry_run:
            print "Dry run mode... can't read from the queue without messing things up..."
        else:
            q_conn = boto.sqs.connect_to_region(
                args.aws_region,
                aws_access_key_id=args.aws_key,
                aws_secret_access_key=args.aws_secret_key)
            incoming_queue = q_conn.get_queue(args.queue)
            if incoming_queue is None:
                print "Error: could not get queue", args.queue
                return -2
            # Sometimes we don't get all the messages, even if more are
            # available, so keep trying until we have enough (or there aren't
            # any left)
            for i in range(num_cpus):
                messages = incoming_queue.get_messages(num_cpus - len(incoming_filenames))
                for m in messages:
                    # TODO: Make sure this file exists in S3 first?
                    possible_filename = m.get_body()
                    key = incoming_bucket.get_key(possible_filename)
                    if key is None:
                        print "Could not find queued filename in bucket", args.incoming_bucket, ":", possible_filename
                        # try to delete it:
                        incoming_queue.delete_message(m)
                    else:
                        incoming_filenames.append(possible_filename)
                        incoming_queue_messages.append(m)
                if len(messages) == 0 or len(incoming_filenames) >= num_cpus:
                    break
    elif args.input_files:
        print "Fetching file list from file", args.input_files
        incoming_filenames = [l.strip() for l in args.input_files.readlines()]
    else:
        print "Fetching file list from S3..."
        for f in incoming_bucket.list():
            incoming_filenames.append(f.name)
    print "Done"

    if len(incoming_filenames) == 0:
        print "Nothing to do!"
        return 0

    for f in incoming_filenames:
        print "  ", f

    print "Verifying that we can write to", args.publish_bucket
    if args.dry_run:
        print "Dry run mode: don't care!"
    else:
        try:
            publish_bucket = conn.get_bucket(args.publish_bucket)
            print "Looks good!"
        except S3ResponseError:
            print "Bucket", args.publish_bucket, "not found. Attempting to create it."
            publish_bucket = conn.create_bucket(args.publish_bucket)

    result = 0
    print "Downloading", len(incoming_filenames), "files..."
    if args.dry_run:
        print "Dry run mode: skipping download from S3"
    else:
        result = fetch_s3_files(incoming_filenames, args.work_dir,
                                incoming_bucket, args.aws_key,
                                args.aws_secret_key)

    if result != 0:
        print "Error downloading files. Return code of s3funnel was", result
        return result
    print "Done"

    after_download = datetime.now()

    local_filenames = [os.path.join(args.work_dir, f) for f in incoming_filenames]

    # TODO: try a SimpleQueue
    raw_files = Queue()
    for l in local_filenames:
        raw_files.put(l)

    completed_files = Queue()
    compressed_files = Queue()

    # Begin reading raw input
    raw_readers = start_workers(num_cpus, "Reader", ReadRawStep, raw_files,
                                (completed_files, schema, converter, storage,
                                 args.bad_data_log))

    # Tell readers when to stop:
    for i in range(num_cpus):
        raw_files.put(PipeStep.SENTINEL)

    # Compress completed files.
    compressors = start_workers(num_cpus, "Compressor", CompressCompletedStep,
                                completed_files, (compressed_files,))

    # Export compressed files to S3.
    exporters = start_workers(num_cpus, "Exporter", ExportCompressedStep,
                              compressed_files,
                              (args.output_dir, args.aws_key,
                               args.aws_secret_key, args.publish_bucket,
                               args.dry_run))

    wait_for(raw_readers, "Raw Readers")

    # `find <out_dir> -type f -not -name ".compressme"`
    # Add them to completed_files
    for root, dirs, files in os.walk(args.output_dir):
        for f in files:
            if f.endswith(".log"):
                completed_files.put(os.path.join(root, f))

    for i in range(num_cpus):
        completed_files.put(PipeStep.SENTINEL)

    wait_for(compressors, "Compressors")
    for i in range(num_cpus):
        compressed_files.put(PipeStep.SENTINEL)

    wait_for(exporters, "Exporters")

    print "Removing processed logs from S3..."
    for f in incoming_filenames:
        if args.dry_run:
            print "  Dry run, so not really deleting", f
        else:
            print "  Deleting", f
            incoming_bucket.delete_key(f)
            # Delete file locally too.
            os.remove(os.path.join(args.work_dir, f))
    print "Done"

    if len(incoming_queue_messages) > 0:
        print "Removing processed messages from SQS..."
        for m in incoming_queue_messages:
            if args.dry_run:
                print "  Dry run, so not really deleting", m.get_body()
            else:
                print "  Deleting", m.get_body()
                if incoming_queue.delete_message(m):
                    print "    Message deleted successfully"
                else:
                    print "    Failed to delete message :("
        print "Done"

    duration = timer.delta_sec(start)
    print "All done in %.2fs (%.2fs excluding download time)" % (
        duration, timer.delta_sec(after_download))
    return 0

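# --- Assumed timing helper --------------------------------------------------------
# The delta_sec / timer.delta_sec helper used for elapsed-time reporting in
# these scripts is not shown in this listing; something equivalent to the
# following is assumed.
from datetime import datetime

def delta_sec(start, end=None):
    """Return elapsed wall-clock seconds since `start` (a datetime)."""
    if end is None:
        end = datetime.now()
    return (end - start).total_seconds()
# -----------------------------------------------------------------------------------
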
class AnalysisJob:
    def __init__(self, cfg):
        self.job_bundle = cfg.job_bundle
        if cfg.input_filter:
            self.input_filter = TelemetrySchema(json.load(open(cfg.input_filter)))
        else:
            self.input_filter = None
        if cfg.input_list_file:
            self.input_list = cfg.input_list_file
        else:
            self.input_list = None
        self.job_id = str(uuid4())
        self.target_queue = cfg.target_queue
        self.aws_key = cfg.aws_key
        self.aws_secret_key = cfg.aws_secret_key
        self.input_bucket = "telemetry-published-v1"
        self.job_name = cfg.name
        self.job_owner = cfg.owner
        self.date_limit = cfg.date_limit
        # Bucket with intermediate data for this analysis job
        self.analysis_bucket = "jonasfj-telemetry-analysis"
        self.s3_code_path = "batch-jobs/" + self.job_id + ".tar.gz"
        # S3 region of operation
        self.aws_region = "us-west-2"
        self.task_size_limit = 400 * 1024 * 1024
        self.sqs_input_name = cfg.sqs_queue

    def get_filtered_files(self):
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        # date_limit is a hack that makes it easy to launch everything before
        # a given date... say, to back-process everything already in the bucket.
        if self.date_limit is not None:
            print "Limiting to files before " + self.date_limit
            for k, s in self.list_partitions(bucket):
                if k.split('/')[-1].split('.')[1] < self.date_limit:
                    yield (k, s)
        else:
            for k, s in self.list_partitions(bucket):
                yield (k, s)

    def get_filtered_files_old(self):
        """ Get tuples of name and size for all input files """
        # Set up some auxiliary functions
        allowed_values = self.input_filter.sanitize_allowed_values()
        nb_dims = len(allowed_values)

        def filter_includes(level, value):
            return self.input_filter.is_allowed(value, allowed_values[level])

        # Iterate over all files in the bucket. This is very slow; we should
        # be able to do something much smarter using prefix listing and
        # ordering to cut the listing short.
        count = 0
        selected = 0
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.input_bucket)
        for f in bucket.list():
            count += 1
            dims = self.input_filter.get_dimensions(".", f.key)
            include = True
            for i in xrange(nb_dims):
                if not filter_includes(i, dims[i]):
                    include = False
                    break
            if include:
                selected += 1
                yield (f.key, f.size)
            if count % 5000 == 0:
                print "%i files listed with %i selected" % (count, selected)
        conn.close()

    def list_partitions(self, bucket, prefix='', level=0):
        if self.input_filter:
            #print "Listing...", prefix, level
            allowed_values = self.input_filter.sanitize_allowed_values()
            delimiter = '/'
            if level > 3:
                delimiter = '.'
            for k in bucket.list(prefix=prefix, delimiter=delimiter):
                partitions = k.name.split("/")
                if level > 3:
                    # Split the last couple of partition components by "."
                    # instead of "/"
                    partitions.extend(partitions.pop().split(".", 2))
                if self.input_filter.is_allowed(partitions[level], allowed_values[level]):
                    if level >= 5:
                        for f in bucket.list(prefix=k.name):
                            yield (f.key, f.size)
                    else:
                        for k, s in self.list_partitions(bucket, k.name, level + 1):
                            yield (k, s)
        elif self.input_list:
            print "Using input list..."
            for line in self.input_list:
                key_name = line.strip()
                k = bucket.get_key(key_name)
                yield (k.key, k.size)
        else:
            print "Don't know how to list partitions without a filter or list :("
            raise ValueError("Missing both input_filter and input_list")

    def generate_tasks(self):
        """ Generate SQS tasks; small files are batched into a single task """
        taskid = 1
        taskfiles = []
        tasksize = 0
        total_size_of_all = 0
        for key, size in self.get_filtered_files():
            # If the task has reached the desired size we yield it.
            # Note: as SQS messages are limited to 64 KiB, we also cap the
            # number of filenames per task, for simplicity.
            if 0 < len(taskfiles) and (tasksize + size > self.task_size_limit or
                                       len(taskfiles) > 200):
                # Reduce to only filenames, sorted by size... smallest first,
                # as they are faster to download when handling the job
                taskfiles = [f for f, s in sorted(taskfiles, key=lambda (f, s): s)]
                yield {
                    'id': self.job_id + "/" + str(taskid),
                    'name': self.job_name,
                    'owner': self.job_owner,
                    'code': self.s3_code_path,
                    'target-queue': self.target_queue,
                    'files': taskfiles,
                    'size': tasksize
                }
                total_size_of_all += tasksize
                print "%i tasks created acc. size: %s" % (taskid, total_size_of_all)
                taskid += 1
                taskfiles = []
                tasksize = 0
            tasksize += size
            taskfiles.append((key, size))
        if len(taskfiles) > 0:
            taskfiles = [f for f, s in sorted(taskfiles, key=lambda (f, s): s)]
            yield {
                'id': self.job_id + "/" + str(taskid),
                'name': self.job_name,
                'owner': self.job_owner,
                'code': self.s3_code_path,
                'target-queue': self.target_queue,
                'files': taskfiles,
                'size': tasksize
            }
        print "Finished:"
        print "%i tasks created, total size: %s" % (taskid, total_size_of_all + tasksize)

    def put_sqs_tasks(self):
        """ Create SQS tasks for this analysis job """
        print "Populate SQS input queue with tasks"
        # Connect to SQS in the desired region
        conn = sqs.connect_to_region(
            self.aws_region,
            aws_access_key_id=self.aws_key,
            aws_secret_access_key=self.aws_secret_key)
        # Get the input queue
        queue = conn.get_queue(self.sqs_input_name)
        queue.set_message_class(JSONMessage)
        # Populate queue with tasks
        for task in self.generate_tasks():
            #print "enqueueing", task["id"], "size:", task["size"]
            msg = queue.new_message(body=task)
            queue.write(msg)
        conn.close()

    def setup(self):
        self.upload_job_bundle()
        self.put_sqs_tasks()
        print "Uploaded with job_id: %s" % self.job_id

    def upload_job_bundle(self):
        """ Upload job bundle to S3 """
        conn = S3Connection(self.aws_key, self.aws_secret_key)
        bucket = conn.get_bucket(self.analysis_bucket)
        k = Key(bucket)
        k.key = self.s3_code_path
        k.set_contents_from_filename(self.job_bundle)
        conn.close()
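
# --- Illustrative driver for AnalysisJob -------------------------------------------
# A minimal sketch of how AnalysisJob might be driven, assuming a cfg object
# that exposes the fields read in __init__ above. The bundle, filter, queue,
# and credential values below are placeholders, not real configuration.
from argparse import Namespace

cfg = Namespace(
    job_bundle="my-job.tar.gz",        # local tarball with the analysis code
    input_filter="filter.json",        # TelemetrySchema-style filter file, or None
    input_list_file=None,              # open file listing S3 keys, or None
    target_queue="my-output-queue",
    aws_key="...",                     # placeholder
    aws_secret_key="...",              # placeholder
    name="example-analysis",
    owner="someone@example.com",
    date_limit=None,
    sqs_queue="telemetry-analysis-input",
)

job = AnalysisJob(cfg)
job.setup()  # uploads the bundle to S3 and enqueues one SQS task per batch of files
# -------------------------------------------------------------------------------------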