def __init__(self, aws_region, s3_bucket, s3_prefix, invoker,
             runtime_s3_bucket, runtime_s3_key, job_max_runtime):
    """Set up an executor bound to an S3-hosted pywren runtime.

    Args:
        aws_region: AWS region used for the S3 client.
        s3_bucket: bucket used for job storage.
        s3_prefix: key prefix for job storage.
        invoker: object used to invoke lambda jobs.
        runtime_s3_bucket: bucket holding the runtime tarball.
        runtime_s3_key: key of the runtime tarball.
        job_max_runtime: per-job runtime limit (seconds).

    Raises:
        Exception: if the indicated runtime is not valid for the
            running Python version.
    """
    self.aws_region = aws_region
    self.s3_bucket = s3_bucket
    self.s3_prefix = s3_prefix
    self.session = botocore.session.get_session()
    self.invoker = invoker
    self.s3client = self.session.create_client('s3', region_name=aws_region)
    self.job_max_runtime = job_max_runtime
    self.runtime_bucket = runtime_s3_bucket
    self.runtime_key = runtime_s3_key
    self.runtime_meta_info = runtime.get_runtime_info(
        runtime_s3_bucket, runtime_s3_key)
    if not runtime.runtime_key_valid(self.runtime_meta_info):
        # BUG FIX: original message misspelled "appropriate" ("approprite").
        raise Exception(
            "The indicated runtime: s3://{}/{} is not appropriate for this python version"
            .format(runtime_s3_bucket, runtime_s3_key))
    if 'preinstalls' in self.runtime_meta_info:
        # The runtime metadata tells us which packages are already
        # installed, so the serializer can skip shipping them.
        logger.info("using serializer with meta-supplied preinstalls")
        self.serializer = serialize.SerializeIndependent(
            self.runtime_meta_info['preinstalls'])
    else:
        self.serializer = serialize.SerializeIndependent()
def __init__(self, invoker, config, job_max_runtime):
    """Set up an executor from a pywren config dict.

    Args:
        invoker: object used to invoke lambda jobs.
        config: pywren configuration dict; must contain 'runtime',
            may contain a 'scheduler' section with 'map_item_limit'.
        job_max_runtime: per-job runtime limit (seconds).
    """
    self.invoker = invoker
    self.job_max_runtime = job_max_runtime
    self.config = config
    self.storage_config = wrenconfig.extract_storage_config(self.config)
    self.storage = storage.Storage(self.storage_config)
    self.runtime_meta_info = runtime.get_runtime_info(config['runtime'])
    # BUG FIX: the original appended to
    # self.runtime_meta_info['preinstalls'] BEFORE checking whether the
    # key existed, so the fallback `else` branch below was unreachable —
    # a runtime without 'preinstalls' raised KeyError instead of getting
    # a plain serializer. Guard the appends on the key being present.
    preinstalls = self.runtime_meta_info.get('preinstalls')
    if preinstalls is not None:
        # These packages are known to be present in the runtime even when
        # its metadata does not list them (both spellings of thrift are
        # registered because package-name casing varies).
        preinstalls.append(['pandas', True])
        preinstalls.append(['thrift', True])
        preinstalls.append(['Thrift', True])
        logger.info("using serializer with meta-supplied preinstalls")
        self.serializer = serialize.SerializeIndependent(preinstalls)
    else:
        self.serializer = serialize.SerializeIndependent()
    # Optional cap on the number of items a single map() may carry.
    self.map_item_limit = None
    if 'scheduler' in self.config:
        if 'map_item_limit' in config['scheduler']:
            self.map_item_limit = config['scheduler']['map_item_limit']
def __init__(self, invoker, config, job_max_runtime):
    """Set up an executor from a pywren config dict.

    Args:
        invoker: object used to invoke lambda jobs.
        config: pywren configuration dict; must contain 'runtime'.
        job_max_runtime: per-job runtime limit (seconds).
    """
    self.invoker = invoker
    self.job_max_runtime = job_max_runtime
    self.config = config
    self.storage_config = wrenconfig.extract_storage_config(self.config)
    self.storage = storage.Storage(self.storage_config)
    self.runtime_meta_info = runtime.get_runtime_info(config['runtime'])
    # Prefer a serializer seeded with the runtime's preinstalled packages
    # when the metadata supplies them; otherwise fall back to a plain one.
    if 'preinstalls' in self.runtime_meta_info:
        logger.info("using serializer with meta-supplied preinstalls")
        chosen_serializer = serialize.SerializeIndependent(
            self.runtime_meta_info['preinstalls'])
    else:
        chosen_serializer = serialize.SerializeIndependent()
    self.serializer = chosen_serializer
def run_experiment(problem_size, shard_size, pipeline, priority, lru, eager,
                   truncate, max_cores, start_cores, trial,
                   launch_granularity, timeout, log_granularity,
                   autoscale_policy, failure_percentage, max_failure_events,
                   failure_time):
    """Run one Cholesky failure-injection experiment and pickle the results.

    Builds a PSD matrix via pywren, runs the LambdaPack Cholesky program
    while polling progress, autoscaling workers ("dynamic" or
    "constant_timeout" policy), and periodically killing a random fraction
    of workers through redis failure keys. All collected metrics are
    dumped to failure_experiments/<arg-hash>.pickle.

    Raises:
        Exception: if `autoscale_policy` is not a known policy name.
    """
    # --- logging setup ----------------------------------------------------
    logger = logging.getLogger()
    # Silence every pre-existing logger; only our handlers should emit.
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    # Hash the full argument tuple so repeated runs with identical
    # parameters share log/pickle file names.
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, priority, lru, eager, truncate,
         max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy, failure_percentage,
         max_failure_events, failure_time))
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    # BUG FIX: create the output directory BEFORE opening the log file.
    # The original only mkdir'ed at the very end, so the FileHandler below
    # raised FileNotFoundError on a fresh checkout.
    try:
        os.mkdir("failure_experiments/")
    except FileExistsError:
        pass
    log_file = "failure_experiments/{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))

    # --- problem construction ---------------------------------------------
    X = np.random.randn(problem_size, 1)
    pwex = pywren.default_executor()
    shard_sizes = [shard_size, 1]
    X_sharded = BigMatrix("cholesky_test_{0}_{1}".format(
        problem_size, shard_size),
        shape=X.shape,
        shard_sizes=shard_sizes,
        write_header=True)
    shard_matrix(X_sharded, X)
    print("Generating PSD matrix...")
    # X @ X.T is PSD; bumping lambdav makes it strictly positive definite.
    XXT_sharded = binops.gemm(pwex, X_sharded, X_sharded.T, overwrite=False)
    XXT_sharded.lambdav = problem_size * 10
    instructions, L_sharded, trailing = lp._chol(XXT_sharded)
    # BUG FIX: the original read `args.pipeline` (a module-level CLI
    # namespace), silently ignoring the `pipeline` parameter passed in.
    pipeline_width = pipeline
    num_priorities = 5 if priority else 1
    cache_size = 5 if lru else 0
    REDIS_CLIENT = redis.StrictRedis(REDIS_ADDR, port=REDIS_PORT,
                                     password=REDIS_PASS, db=0,
                                     socket_timeout=5)
    if truncate is not None:
        instructions = instructions[:truncate]
    config = pwex.config
    program = lp.LambdaPackProgram(instructions,
                                   executor=pywren.lambda_executor,
                                   pywren_config=config,
                                   num_priorities=num_priorities,
                                   eager=eager)
    redis_env = {
        "REDIS_ADDR": os.environ.get("REDIS_ADDR", ""),
        "REDIS_PASS": os.environ.get("REDIS_PASS", "")
    }

    # --- metric accumulators (shared into `exp` by reference) -------------
    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    times = []
    flops = []
    reads = []
    writes = []
    failure_times = []
    exp = {
        "redis_done_counts": done_counts,
        "redis_ready_counts": ready_counts,
        "redis_post_op_counts": post_op_counts,
        "redis_not_ready_counts": not_ready_counts,
        "redis_running_counts": running_counts,
        "sqs_invis_counts": sqs_invis_counts,
        "sqs_vis_counts": sqs_vis_counts,
        "busy_workers": busy_workers_counts,
        "up_workers": up_workers_counts,
        "times": times,
        "lru": lru,
        "priority": priority,
        "eager": eager,
        "truncate": truncate,
        "max_cores": max_cores,
        "problem_size": problem_size,
        "shard_size": shard_size,
        "pipeline": pipeline,
        "flops": flops,
        "reads": reads,
        "writes": writes,
        "trial": trial,
        "launch_granularity": launch_granularity,
        "log_granularity": log_granularity,
        "autoscale_policy": autoscale_policy,
        "failure_times": failure_times,
    }

    # --- launch and monitor -----------------------------------------------
    logger.info("Longest Path: {0}".format(program.longest_path))
    program.start()
    t = time.time()
    logger.info("Starting with {0} cores".format(start_cores))
    # Each worker watches its own redis failure key; setting a key kills
    # that worker (failure injection happens further down).
    failure_keys = [
        "{0}_failure_{1}_{2}".format(program.hash, i, 0)
        for i in range(start_cores)
    ]
    all_futures = pwex.map(
        lambda x: job_runner.lambdapack_run_with_failures(
            failure_keys[x], program, pipeline_width=pipeline_width,
            cache_size=cache_size, timeout=timeout),
        range(start_cores), extra_env=redis_env)
    start_time = time.time()
    last_run_time = start_time
    last_failure = time.time()
    num_failure_events = 0
    # One SQS client reused across poll iterations (the original created a
    # fresh boto3 client for every queue on every iteration).
    sqs_client = boto3.client('sqs')
    while program.program_status() == lp.PS.RUNNING:
        curr_time = int(time.time() - start_time)
        max_pc = program.get_max_pc()
        times.append(int(time.time()))
        time.sleep(log_granularity)
        waiting = 0
        running = 0
        for queue_url in program.queue_urls:
            attrs = sqs_client.get_queue_attributes(
                QueueUrl=queue_url,
                AttributeNames=[
                    'ApproximateNumberOfMessages',
                    'ApproximateNumberOfMessagesNotVisible'
                ])['Attributes']
            waiting += int(attrs["ApproximateNumberOfMessages"])
            running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
        sqs_invis_counts.append(running)
        sqs_vis_counts.append(waiting)
        busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))
        busy_workers = 0 if busy_workers is None else int(busy_workers)
        up_workers = program.get_up()
        up_workers = 0 if up_workers is None else int(up_workers)
        up_workers_counts.append(up_workers)
        busy_workers_counts.append(busy_workers)
        logger.debug("Waiting: {0}, Currently Processing: {1}".format(
            waiting, running))
        logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
            up_workers, busy_workers, curr_time))
        if (curr_time % INFO_FREQ) == 0:
            logger.info("Max PC is {0}".format(max_pc))
            logger.info("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
        # Cumulative counters come back as raw bytes (or None before the
        # first worker reports); normalize to G-units.
        current_gflops = program.get_flops()
        current_gflops = 0 if current_gflops is None else int(current_gflops) / 1e9
        flops.append(current_gflops)
        current_gbytes_read = program.get_read()
        current_gbytes_read = 0 if current_gbytes_read is None else int(current_gbytes_read) / 1e9
        reads.append(current_gbytes_read)
        current_gbytes_write = program.get_write()
        current_gbytes_write = 0 if current_gbytes_write is None else int(current_gbytes_write) / 1e9
        writes.append(current_gbytes_write)

        # --- autoscaling ---------------------------------------------------
        time_since_launch = time.time() - last_run_time
        if autoscale_policy == "dynamic":
            if (time_since_launch > launch_granularity and
                    up_workers < np.ceil(waiting * 0.5 / pipeline_width) and
                    up_workers < max_cores):
                cores_to_launch = int(
                    min(np.ceil(waiting / pipeline_width) - up_workers,
                        max_cores - up_workers))
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                _failure_keys = [
                    "{0}_failure_{1}_{2}".format(program.hash, i, curr_time)
                    for i in range(cores_to_launch)
                ]
                new_futures = pwex.map(
                    lambda x: job_runner.lambdapack_run_with_failures(
                        _failure_keys[x], program,
                        pipeline_width=pipeline_width,
                        cache_size=cache_size, timeout=timeout),
                    range(cores_to_launch), extra_env=redis_env)
                last_run_time = time.time()
                # BUG FIX: the original never recorded _failure_keys here
                # (the constant_timeout branch did), so workers launched by
                # the dynamic policy could never be chosen for failure
                # injection below.
                failure_keys += _failure_keys
                all_futures.extend(new_futures)
        elif autoscale_policy == "constant_timeout":
            if time_since_launch > (0.75 * timeout):
                cores_to_launch = max_cores
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                _failure_keys = [
                    "{0}_failure_{1}_{2}".format(program.hash, i, curr_time)
                    for i in range(cores_to_launch)
                ]
                new_futures = pwex.map(
                    lambda x: job_runner.lambdapack_run_with_failures(
                        _failure_keys[x], program,
                        pipeline_width=pipeline_width,
                        cache_size=cache_size, timeout=timeout),
                    range(cores_to_launch), extra_env=redis_env)
                last_run_time = time.time()
                failure_keys += _failure_keys
                all_futures.extend(new_futures)
        else:
            raise Exception("unknown autoscale policy")

        # --- failure injection --------------------------------------------
        if ((time.time() - last_failure) > failure_time and
                num_failure_events < max_failure_events):
            logging.info("Killing some jobs")
            idxs = np.random.choice(len(failure_keys),
                                    int(failure_percentage * len(failure_keys)),
                                    replace=False)
            num_failure_events += 1
            last_failure = time.time()
            failure_times.append(last_failure)
            for i in idxs:
                logging.info("Killing: job {0}".format(i))
                REDIS_CLIENT.set(failure_keys[i], 1)

    # --- post-run accounting ----------------------------------------------
    exp["all_futures"] = all_futures
    # Every instruction block should have started exactly once; log the
    # exceptions (restarts due to injected failures, or never-ran blocks).
    for pc in range(program.num_inst_blocks):
        run_count = REDIS_CLIENT.get("{0}_{1}_start".format(program.hash, pc))
        run_count = 0 if run_count is None else int(run_count)
        if run_count != 1:
            logger.info("PC: {0}, Run Count: {1}".format(pc, run_count))
    e = time.time()
    logger.info(program.program_status())
    logger.info("PROGRAM STATUS " + str(program.program_status()))
    logger.info("PROGRAM HASH " + str(program.hash))
    logger.info("Took {0} seconds".format(e - t))
    exp["total_runtime"] = e - t
    exp["num_failure_events"] = num_failure_events

    # Collect per-instruction profiling info in parallel.
    executor = fs.ThreadPoolExecutor(72)
    futures = [executor.submit(program.get_profiling_info, i)
               for i in range(program.num_inst_blocks)]
    fs.wait(futures)
    profiled_blocks = [f.result() for f in futures]
    serializer = serialize.SerializeIndependent()
    byte_string = serializer([profiled_blocks])[0][0]
    exp["profiled_block_pickle_bytes"] = byte_string
    read, write, total_flops, bins, instructions, runtimes = lp.perf_profile(
        profiled_blocks, num_bins=100)
    flop_rate = sum(total_flops) / max(bins)
    exp["flop_rate"] = flop_rate
    print("Average Flop rate of {0}".format(flop_rate))

    # --- persist the experiment record (directory created at the top) -----
    exp_bytes = pickle.dumps(exp)
    dump_path = "failure_experiments/{0}.pickle".format(arg_hash)
    print("Dumping experiment pickle to {0}".format(dump_path))
    with open(dump_path, "wb+") as f:
        f.write(exp_bytes)