def gemm(A, B):
    """Compile a LambdaPack program for the blocked product of ``A`` and ``B``.

    The inner dimension and the inner shard size of the two operands must
    agree (checked with assertions). Two BigMatrix objects are created as a
    side effect: a four-dimensional scratch matrix keyed on both operands
    (built with ``constant_zeros`` as its ``parent_fn``) and the result
    matrix ``matmul_test_C``.

    Returns:
        A ``(program, info)`` pair. ``info`` is a dict with the keys
        ``"outputs"``, ``"intermediates"`` and ``"compile_time"`` (seconds
        spent compiling and binding the program).
    """
    branching = 4
    assert A.shape[1] == B.shape[0]
    assert A.shard_sizes[1] == B.shard_sizes[0]

    out_rows, out_cols = A.shape[0], B.shape[1]
    row_shard, col_shard = A.shard_sizes[0], B.shard_sizes[1]

    # Level count derived from the number of blocks along A's second axis
    # and the branching factor; never fewer than one level.
    depth = int(np.ceil(np.log2(A.num_blocks(1)) / np.log2(branching)))
    levels = max(depth, 1)

    scratch = BigMatrix(
        f"matmul_test_Temp({A.key},{B.key})",
        shape=(out_rows, out_cols, B.shape[0], levels),
        shard_sizes=[row_shard, col_shard, 1, 1],
        write_header=True,
        safe=False,
        parent_fn=constant_zeros,
    )
    result = BigMatrix(
        "matmul_test_C",
        shape=(out_rows, out_cols),
        shard_sizes=(row_shard, col_shard),
        write_header=True,
    )
    config = npw.config.default()

    started = time.time()
    compiled = lpcompile_for_execution(GEMM, inputs=["A", "B"],
                                       outputs=["Out"])
    print("tree depth", np.ceil(np.log(B.num_blocks(1)) / np.log(4)))
    bound = compiled(A, B, A.num_blocks(0), A.num_blocks(1),
                     B.num_blocks(1), scratch, result)
    compile_seconds = time.time() - started

    program = lp.LambdaPackProgram(bound, config=config)
    info = {
        "outputs": [result],
        "intermediates": [scratch],
        "compile_time": compile_seconds,
    }
    return program, info
def test_if_run(self):
    """Execute the compiled ``f1_if`` program and check its output blocks.

    A 64-element random vector is sharded into eight blocks, the program is
    run by a single worker process, and every output block is compared with
    its input block: blocks with an even index must equal the input, blocks
    with an odd index must equal twice the input.
    """
    data = np.random.randn(64)
    blocks = (int(data.shape[0]/8),)
    X_sharded = BigMatrix("if_test", shape=data.shape, shard_sizes=blocks,
                          write_header=True)
    O_sharded = BigMatrix("if_test_output", shape=data.shape,
                          shard_sizes=blocks, write_header=True)
    X_sharded.free()
    shard_matrix(X_sharded, data)

    compiled = frontend.lpcompile(f1_if)
    bound = compiled(X_sharded, O_sharded, X_sharded.num_blocks(0))

    worker_count = 1
    pool = fs.ProcessPoolExecutor(worker_count)
    config = npw.config.default()
    runnable = lp.LambdaPackProgram(bound, config=config)
    runnable.start()
    all_futures = [
        pool.submit(job_runner.lambdapack_run, runnable, pipeline_width=1,
                    idle_timeout=5, timeout=60)
        for _ in range(worker_count)
    ]
    runnable.wait()
    time.sleep(5)
    runnable.free()

    for idx in range(X_sharded.num_blocks(0)):
        produced = O_sharded.get_block(idx)
        source = X_sharded.get_block(idx)
        factor = 1 if (idx % 2) == 0 else 2
        assert np.allclose(produced, factor*source)
def cholesky(X, truncate=0):
    """Compile a LambdaPack Cholesky program for the sharded matrix ``X``.

    Creates a three-dimensional intermediate BigMatrix (in ``X.bucket``) and
    a square output BigMatrix, both built with ``constant_zeros`` as their
    ``parent_fn``, then compiles ``CHOLESKY`` and binds it to them.

    Args:
        X: input BigMatrix; its first-axis size and shard size are used for
            both axes of the output.
        truncate: forwarded unchanged as the last argument of the compiled
            program.

    Returns:
        A ``(program, info)`` pair. ``info`` is a dict with the keys
        ``"outputs"``, ``"intermediates"`` and ``"compile_time"``.
    """
    n = X.shape[0]
    shard = X.shard_sizes[0]

    scratch = BigMatrix(
        "Cholesky.Intermediate({0})".format(X.key),
        shape=(X.num_blocks(1) + 1, n, n),
        shard_sizes=(1, shard, shard),
        bucket=X.bucket,
        write_header=True,
        parent_fn=constant_zeros,
    )
    # NOTE(review): unlike the intermediate, the output is not given
    # bucket=X.bucket -- confirm that this is intentional.
    factor = BigMatrix(
        "Cholesky({0})".format(X.key),
        shape=(n, n),
        shard_sizes=(shard, shard),
        write_header=True,
        parent_fn=constant_zeros,
    )

    started = time.time()
    compiled = lpcompile_for_execution(CHOLESKY, inputs=["I"], outputs=["O"])
    block_count = int(np.ceil(n / shard))
    bound = compiled(factor, X, scratch, block_count, truncate)
    compile_seconds = time.time() - started

    config = npw.config.default()
    program = lp.LambdaPackProgram(bound, config=config)
    info = {
        "outputs": [factor],
        "intermediates": [scratch],
        "compile_time": compile_seconds,
    }
    return program, info
def test_matmul(self):
    """Multiply two 4x4 random matrices with the compiled ``matmul`` program.

    Both operands are uploaded as 2x2-sharded BigMatrix objects, the program
    is run once in this process and once more from a single-thread pool, and
    the downloaded result is compared with ``np.dot``.
    """
    dim = 4
    block = 2
    np.random.seed(0)
    A = np.random.randn(dim, dim)
    B = np.random.randn(dim, dim)
    expected = np.dot(A, B)
    blocks = (block, block)

    def _upload(name, dense):
        # Create the remote matrix, clear any stale data, then shard it.
        remote = BigMatrix(name, shape=dense.shape, shard_sizes=blocks,
                           write_header=True)
        remote.free()
        shard_matrix(remote, dense)
        return remote

    A_sharded = _upload("matmul_test_A", A)
    B_sharded = _upload("matmul_test_B", B)

    scratch = BigMatrix(
        "matmul_test_Temp",
        shape=[A.shape[0], B.shape[1], B.shape[0], 100],
        shard_sizes=[A_sharded.shard_sizes[0], B_sharded.shard_sizes[1],
                     1, 1],
        write_header=True,
    )
    C_sharded = BigMatrix("matmul_test_C", shape=expected.shape,
                          shard_sizes=blocks, write_header=True)

    branching = 2
    config = npw.config.default()
    compiled = frontend.lpcompile(matmul)
    bound = compiled(A_sharded, B_sharded, A_sharded.num_blocks(0),
                     A_sharded.num_blocks(1), B_sharded.num_blocks(1),
                     branching, scratch, C_sharded)
    runnable = lp.LambdaPackProgram(bound, config=config)
    runnable.start()

    worker_options = dict(pipeline_width=1, idle_timeout=5, timeout=60)
    job_runner.lambdapack_run(runnable, **worker_options)
    pool = fs.ThreadPoolExecutor(1)
    all_futures = [
        pool.submit(job_runner.lambdapack_run, runnable, **worker_options)
    ]
    runnable.wait()
    runnable.free()

    downloaded = C_sharded.numpy()
    assert np.allclose(expected, downloaded)
def qr(A):
    """Compile a LambdaPack QR program for the sharded matrix ``A``.

    Four BigMatrix objects (``Vs``, ``Ts``, ``Rs`` and ``Ss``) are created
    with ``constant_zeros`` as their ``parent_fn``; their shapes are printed
    before compilation.

    Returns:
        A ``(program, info)`` pair. ``info`` lists ``Rs``, ``Vs`` and ``Ts``
        under ``"outputs"``, ``Ss`` under ``"intermediates"`` and the compile
        time in seconds under ``"compile_time"``.
    """
    branching = 2
    n = A.shape[0]
    block_count = A.num_blocks(0)
    shard = A.shard_sizes[0]
    # One extra level on top of the (at least one) computed level count.
    levels = max(
        int(np.ceil(np.log2(block_count) / np.log2(branching))), 1) + 1

    def _allocate(name, shape, shards):
        # Every matrix here shares the same header/parent/safety options.
        return BigMatrix(name, shape=shape, shard_sizes=shards,
                         write_header=True, parent_fn=constant_zeros,
                         safe=False)

    cube_shape = (2 * n, 2 * n, levels)
    cube_shards = (shard, shard, 1)
    Vs = _allocate("Vs", cube_shape, cube_shards)
    Ts = _allocate("Ts", cube_shape, cube_shards)
    Rs = _allocate("Rs", cube_shape, cube_shards)
    Ss = _allocate("Ss", (2 * n, 2 * n, 2 * n, levels * shard),
                   (shard, shard, 1, 1))

    for label, matrix in (("Rs", Rs), ("Ss", Ss), ("Ts", Ts), ("Vs", Vs)):
        print(label, matrix.shape)

    started = time.time()
    compiled = lpcompile_for_execution(QR, inputs=["I"], outputs=["Rs"])
    bound = compiled(A, Vs, Ts, Rs, Ss, block_count, 0)
    compile_seconds = time.time() - started

    config = npw.config.default()
    program = lp.LambdaPackProgram(bound, config=config)
    info = {
        "outputs": [Rs, Vs, Ts],
        "intermediates": [Ss],
        "compile_time": compile_seconds,
    }
    return program, info
def tsqr(X, truncate=0):
    """Compile a LambdaPack TSQR program for the sharded matrix ``X``.

    ``X`` must be sharded along its first axis only (its second shard size
    has to equal its second dimension). Three BigMatrix objects are created
    for the R, T and V factors.

    Args:
        X: input BigMatrix.
        truncate: accepted for signature compatibility; not used here.

    Returns:
        A ``(program, info)`` pair. ``info`` lists R, V and T under
        ``"outputs"``, has an empty ``"intermediates"`` list and the compile
        time in seconds under ``"compile_time"``.
    """
    branching = 2
    assert X.shard_sizes[1] == X.shape[1]

    shard = X.shard_sizes[0]
    total_rows = X.shape[0]
    levels = max(
        int(np.ceil(np.log2(X.num_blocks(0)) / np.log2(branching))), 1)

    R_sharded = BigMatrix(
        "tsqr_R({0})".format(X.key),
        shape=(levels * shard, total_rows),
        shard_sizes=X.shard_sizes,
        write_header=True,
        safe=False,
    )

    # T and V share one layout: `branching` shards stacked per level.
    stacked_shape = (levels * shard * branching, total_rows)
    stacked_shards = (shard * branching, shard)
    T_sharded = BigMatrix("tsqr_T({0})".format(X.key), shape=stacked_shape,
                          shard_sizes=stacked_shards, write_header=True,
                          safe=False)
    V_sharded = BigMatrix("tsqr_V({0})".format(X.key), shape=stacked_shape,
                          shard_sizes=stacked_shards, write_header=True,
                          safe=False)

    started = time.time()
    compiled = lpcompile_for_execution(TSQR, inputs=["A"], outputs=["Rs"])
    config = npw.config.default()
    block_count = X.num_blocks(0)
    bound = compiled(X, V_sharded, T_sharded, R_sharded, block_count)
    compile_seconds = time.time() - started

    program = lp.LambdaPackProgram(bound, config=config)
    info = {
        "outputs": [R_sharded, V_sharded, T_sharded],
        "intermediates": [],
        "compile_time": compile_seconds,
    }
    return program, info
def _int_or_zero(value):
    """Return ``int(value)``, treating a missing (``None``) counter as 0."""
    return 0 if value is None else int(value)


def _ratio_or_zero(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 for a zero denominator."""
    return numerator / denominator if denominator else 0.0


def run_experiment(problem_size, shard_size, pipeline, num_priorities, lru,
                   eager, truncate, max_cores, start_cores, trial,
                   launch_granularity, timeout, log_granularity,
                   autoscale_policy, standalone, warmup, verify,
                   matrix_exists, read_limit, write_limit):
    """Run one distributed Cholesky benchmark and upload its statistics.

    The function builds (or re-opens) a sharded ``X X^T`` matrix, compiles a
    Cholesky LambdaPack program for it, launches workers through pywren and
    then polls SQS/Redis every ``log_granularity`` seconds until the program
    stops running. The collected time series are pickled with ``dill`` and
    stored in S3 under ``lambdapack/<program hash>/runtime.pickle``; the
    program hash is also written to ``/tmp/last_run``.

    Args:
        problem_size: number of rows of the random ``X`` (the factored matrix
            is ``problem_size x problem_size``).
        shard_size: shard size along the first axis of ``X``.
        pipeline: pipeline width handed to every worker.
        num_priorities: forwarded to ``LambdaPackProgram``.
        lru: when truthy workers get ``cache_size=5``, otherwise 0.
        eager: forwarded to ``LambdaPackProgram``.
        truncate: forwarded to ``compiler._chol``.
        max_cores: upper bound on workers (also the warm-up fan-out).
        start_cores: number of workers launched initially.
        trial: recorded in the experiment dict and the log-file hash.
        launch_granularity: minimum seconds between launches for the
            ``"dynamic"`` policy.
        timeout: worker timeout; also paces the ``"constant_timeout"``
            policy.
        log_granularity: seconds slept between two polls.
        autoscale_policy: ``"dynamic"`` or ``"constant_timeout"``.
        standalone: use the pywren standalone executor and pass AWS
            credentials to the workers.
        warmup: spin up ``max_cores`` sleeping workers before starting.
        verify: compute a local Cholesky and print the max difference.
        matrix_exists: re-open previously generated matrices instead of
            generating them.
        read_limit: forwarded to ``LambdaPackProgram``.
        write_limit: forwarded to ``LambdaPackProgram``.

    Raises:
        Exception: for an unknown ``autoscale_policy``; any exception raised
            while monitoring is re-raised after the program is stopped.
    """
    # set up logging
    logger = logging.getLogger()
    region = wc.default()["account"]["aws_region"]
    print("REGION", region)
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, num_priorities, lru, eager,
         truncate, max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy, read_limit, write_limit))
    # The digest only names the log file; it is not used for security.
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    log_file = "{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))

    if standalone:
        extra_env = {
            "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
            # Fixed: this used to forward the access key id as the secret.
            "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
            "OMP_NUM_THREADS": "1",
            "AWS_DEFAULT_REGION": region
        }
        config = wc.default()
        config['runtime']['s3_bucket'] = 'numpywrenpublic'
        key = "pywren.runtime/pywren_runtime-3.6-numpywren-standalone.tar.gz"
        config['runtime']['s3_key'] = key
        pwex = pywren.standalone_executor(config=config)
    else:
        extra_env = {"AWS_DEFAULT_REGION": region}
        config = wc.default()
        config['runtime']['s3_bucket'] = 'numpywrenpublic-us-east-1'
        key = "pywren.runtime/pywren_runtime-3.6-numpywren-08-25-2018.tar.gz"
        config['runtime']['s3_key'] = key
        pwex = pywren.default_executor(config=config)

    if not matrix_exists:
        X = np.random.randn(problem_size, 1)
        shard_sizes = [shard_size, 1]
        X_sharded = BigMatrix("cholesky_test_{0}_{1}".format(
            problem_size, shard_size),
                              shape=X.shape,
                              shard_sizes=shard_sizes,
                              write_header=True,
                              autosqueeze=False,
                              bucket="numpywrentop500test",
                              hash_keys=False)
        shard_matrix(X_sharded, X)
        print("Generating PSD matrix...")
        t = time.time()
        print(X_sharded.shape)
        XXT_sharded = binops.gemm(pwex, X_sharded, X_sharded.T,
                                  overwrite=False)
        e = time.time()
        print("GEMM took {0}".format(e - t))
    else:
        X_sharded = BigMatrix("cholesky_test_{0}_{1}".format(
            problem_size, shard_size),
                              autosqueeze=False,
                              hash_keys=False,
                              bucket="numpywrentop500test")
        key_name = binops.generate_key_name_binop(X_sharded, X_sharded.T,
                                                  "gemm")
        XXT_sharded = BigMatrix(key_name,
                                hash_keys=False,
                                bucket="numpywrentop500test")
    XXT_sharded.lambdav = problem_size * 10

    if verify:
        A = XXT_sharded.numpy()
        print("Computing local cholesky")
        L = np.linalg.cholesky(A)

    t = time.time()
    instructions, trailing, L_sharded = compiler._chol(XXT_sharded,
                                                       truncate=truncate)
    # Use the parameter rather than a module-level ``args`` namespace so the
    # function also works when it is called programmatically.
    pipeline_width = pipeline
    cache_size = 5 if lru else 0
    pywren_config = pwex.config
    config = npw.config.default()
    program = lp.LambdaPackProgram(instructions,
                                   executor=pywren.lambda_executor,
                                   pywren_config=pywren_config,
                                   num_priorities=num_priorities,
                                   eager=eager,
                                   config=config,
                                   write_limit=write_limit,
                                   read_limit=read_limit)

    warmup_start = time.time()
    if warmup:
        warmup_sleep = 170

        def warmup_fn(x):
            program.incr_up(1)
            time.sleep(warmup_sleep)
            program.decr_up(1)

        print("Warming up...")
        pwex.map(warmup_fn, range(max_cores))
        last_spinup = time.time()
        while True:
            # Re-invoke before the previous batch of sleepers expires.
            if (time.time() - last_spinup) > 0.75 * warmup_sleep:
                print("Calling pwex.map..")
                pwex.map(warmup_fn, range(max_cores))
                last_spinup = time.time()
            time.sleep(2)
            up_workers = _int_or_zero(program.get_up())
            print("{0} workers alive".format(up_workers))
            if up_workers >= max_cores:
                time.sleep(warmup_sleep)
                break
    warmup_end = time.time()
    print("Warmup took {0} seconds".format(warmup_end - warmup_start))
    e = time.time()
    print("Program compile took {0} seconds".format(e - t))
    print("program.hash", program.hash)
    REDIS_CLIENT = program.control_plane.client

    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    read_objects = []
    write_objects = []
    all_read_timeouts = []
    all_write_timeouts = []
    all_redis_timeouts = []
    times = [time.time()]
    flops = [0]
    reads = [0]
    writes = [0]
    print("LRU", lru)
    print("eager", eager)
    exp = {
        "redis_done_counts": done_counts,
        "redis_ready_counts": ready_counts,
        "redis_post_op_counts": post_op_counts,
        "redis_not_ready_counts": not_ready_counts,
        "redis_running_counts": running_counts,
        "sqs_invis_counts": sqs_invis_counts,
        "sqs_vis_counts": sqs_vis_counts,
        "busy_workers": busy_workers_counts,
        "up_workers": up_workers_counts,
        "times": times,
        "lru": lru,
        "priority": num_priorities,
        "eager": eager,
        "truncate": truncate,
        "max_cores": max_cores,
        "problem_size": problem_size,
        "shard_size": shard_size,
        "pipeline": pipeline,
        "flops": flops,
        "reads": reads,
        "writes": writes,
        "read_objects": read_objects,
        "write_objects": write_objects,
        "read_timeouts": all_read_timeouts,
        "write_timeouts": all_write_timeouts,
        "redis_timeouts": all_redis_timeouts,
        "trial": trial,
        "launch_granularity": launch_granularity,
        "log_granularity": log_granularity,
        "autoscale_policy": autoscale_policy,
        "standalone": standalone,
        "program": program,
        "time_steps": 1,
        "failed": False,
    }

    program.start()
    logger.info("Starting with {0} cores".format(start_cores))
    invoker = fs.ThreadPoolExecutor(1)

    def _launch_workers(num_workers):
        # ``submit`` binds ``num_workers`` now, so a later change of the
        # caller's loop variables cannot alter an already queued launch.
        return invoker.submit(
            pwex.map,
            lambda x: job_runner.lambdapack_run(program,
                                                pipeline_width=pipeline_width,
                                                cache_size=cache_size,
                                                timeout=timeout),
            range(num_workers),
            extra_env=extra_env)

    # Each entry is a Future whose result is the list returned by pwex.map.
    all_futures = [_launch_workers(start_cores)]
    start_time = time.time()
    last_run_time = start_time
    print(program.program_status())
    print("QUEUE URLS", len(program.queue_urls))

    def _gigas(raw):
        # Counters are raw totals (or None before the first update).
        return 0 if raw is None else int(raw) / 1e9

    try:
        sqs_client = boto3.client('sqs')
        while program.program_status() == lp.PS.RUNNING:
            time.sleep(log_granularity)
            curr_time = int(time.time() - start_time)
            progress = program.get_progress()
            if progress is None:
                print("no progress...")
                continue
            max_pc = int(progress)
            times.append(int(time.time()))

            waiting = 0
            running = 0
            for queue_url in program.queue_urls:
                attrs = sqs_client.get_queue_attributes(
                    QueueUrl=queue_url,
                    AttributeNames=[
                        'ApproximateNumberOfMessages',
                        'ApproximateNumberOfMessagesNotVisible'
                    ])['Attributes']
                waiting += int(attrs["ApproximateNumberOfMessages"])
                running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
            sqs_invis_counts.append(running)
            sqs_vis_counts.append(waiting)

            busy_workers = _int_or_zero(
                REDIS_CLIENT.get("{0}_busy".format(program.hash)))
            up_workers = _int_or_zero(program.get_up())
            up_workers_counts.append(up_workers)
            busy_workers_counts.append(busy_workers)
            logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            if (curr_time % INFO_FREQ) == 0:
                logger.info("Waiting: {0}, Currently Processing: {1}".format(
                    waiting, running))
                logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                    up_workers, busy_workers, curr_time))

            current_gflops = _gigas(program.get_flops())
            flops.append(current_gflops)
            current_gbytes_read = _gigas(program.get_read())
            reads.append(current_gbytes_read)
            current_gbytes_write = _gigas(program.get_write())
            writes.append(current_gbytes_write)

            elapsed = times[-1] - times[0]
            gflops_rate = _ratio_or_zero(flops[-1], elapsed)
            greads_rate = _ratio_or_zero(reads[-1], elapsed)
            gwrites_rate = _ratio_or_zero(writes[-1], elapsed)

            # Convert byte totals to object counts: one b x b block of
            # 8-byte values per object.
            b = XXT_sharded.shard_sizes[0]
            current_objects_read = (current_gbytes_read * 1e9) / (b * b * 8)
            current_objects_write = (current_gbytes_write * 1e9) / (b * b * 8)
            read_objects.append(current_objects_read)
            write_objects.append(current_objects_write)
            read_rate = _ratio_or_zero(read_objects[-1], elapsed)
            write_rate = _ratio_or_zero(write_objects[-1], elapsed)
            avg_workers = np.mean(up_workers_counts)

            smooth_len = 10
            if len(flops) > smooth_len + 5:
                window = times[-1] - times[-smooth_len]
                smoothed_gflops_rate = _ratio_or_zero(
                    flops[-1] - flops[-smooth_len], window)
                smoothed_gread_rate = _ratio_or_zero(
                    reads[-1] - reads[-smooth_len], window)
                smoothed_gwrite_rate = _ratio_or_zero(
                    writes[-1] - writes[-smooth_len], window)
                smoothed_read_rate = _ratio_or_zero(
                    read_objects[-1] - read_objects[-smooth_len], window)
                smoothed_write_rate = _ratio_or_zero(
                    write_objects[-1] - write_objects[-smooth_len], window)
                smoothed_workers = np.mean(up_workers_counts[-smooth_len:])
            else:
                smoothed_gflops_rate = "N/A"
                smoothed_gread_rate = "N/A"
                smoothed_gwrite_rate = "N/A"
                smoothed_workers = "N/A"
                smoothed_read_rate = "N/A"
                smoothed_write_rate = "N/A"

            # A counter that was never incremented may not exist in Redis.
            read_timeouts = _int_or_zero(REDIS_CLIENT.get("s3.timeouts.read"))
            write_timeouts = _int_or_zero(
                REDIS_CLIENT.get("s3.timeouts.write"))
            redis_timeouts = _int_or_zero(REDIS_CLIENT.get("redis.timeouts"))
            all_read_timeouts.append(read_timeouts)
            all_write_timeouts.append(write_timeouts)
            all_redis_timeouts.append(redis_timeouts)
            # Nothing may have been read or written yet on early polls.
            read_timeouts_fraction = _ratio_or_zero(read_timeouts,
                                                    current_objects_read)
            write_timeouts_fraction = _ratio_or_zero(write_timeouts,
                                                     current_objects_write)

            print("=======================================")
            print("Max PC is {0}".format(max_pc))
            print("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            print("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            print(
                "{0}: Total GFLOPS {1}, Total GBytes Read {2}, Total GBytes Write {3}"
                .format(curr_time, current_gflops, current_gbytes_read,
                        current_gbytes_write))
            print(
                "{0}: Average GFLOPS rate {1}, Average GBytes Read rate {2}, Average GBytes Write rate {3}, Average Worker Count {4}"
                .format(curr_time, gflops_rate, greads_rate, gwrites_rate,
                        avg_workers))
            print("{0}: Average read txns/s {1}, Average write txns/s {2}".
                  format(curr_time, read_rate, write_rate))
            print(
                "{0}: smoothed GFLOPS rate {1}, smoothed GBytes Read rate {2}, smoothed GBytes Write rate {3}, smoothed Worker Count {4}"
                .format(curr_time, smoothed_gflops_rate, smoothed_gread_rate,
                        smoothed_gwrite_rate, smoothed_workers))
            print("{0}: smoothed read txns/s {1}, smoothed write txns/s {2}".
                  format(curr_time, smoothed_read_rate, smoothed_write_rate))
            print(
                "{0}: Read timeouts: {1}, Write timeouts: {2}, Redis timeouts: {3} "
                .format(curr_time, read_timeouts, write_timeouts,
                        redis_timeouts))
            print(
                "{0}: Read timeouts fraction: {1}, Write timeouts fraction: {2}"
                .format(curr_time, read_timeouts_fraction,
                        write_timeouts_fraction))
            print("=======================================")

            time_since_launch = time.time() - last_run_time
            if autoscale_policy == "dynamic":
                if (time_since_launch > launch_granularity
                        and up_workers < np.ceil(
                            waiting * 0.5 / pipeline_width)
                        and up_workers < max_cores):
                    cores_to_launch = int(
                        min(
                            np.ceil(waiting / pipeline_width) - up_workers,
                            max_cores - up_workers))
                    logger.info(
                        "launching {0} new tasks....".format(cores_to_launch))
                    # A single Future is returned, so it is appended (the
                    # previous ``extend`` raised TypeError).
                    all_futures.append(_launch_workers(cores_to_launch))
                    last_run_time = time.time()
            elif autoscale_policy == "constant_timeout":
                if time_since_launch > (0.85 * timeout):
                    cores_to_launch = max_cores
                    logger.info(
                        "launching {0} new tasks....".format(cores_to_launch))
                    all_futures.append(_launch_workers(cores_to_launch))
                    last_run_time = time.time()
            else:
                raise Exception("unknown autoscale policy")
            exp["time_steps"] += 1

        if verify:
            L_sharded_local = L_sharded.numpy()
            print("max diff", np.max(np.abs(L_sharded_local - L)))
    except KeyboardInterrupt:
        # Manual abort: stop the program but still upload what was recorded.
        exp["failed"] = True
        program.stop()
    except Exception:
        traceback.print_exc()
        exp["failed"] = True
        program.stop()
        raise

    print(program.program_status())
    exp["all_futures"] = all_futures
    exp_bytes = dill.dumps(exp)
    client = boto3.client('s3')
    client.put_object(Key="lambdapack/{0}/runtime.pickle".format(program.hash),
                      Body=exp_bytes,
                      Bucket=program.bucket)
    total_seconds = times[-1] - times[0]
    print("=======================")
    print("=======================")
    print("Execution Summary:")
    print("Executed Program ID: {0}".format(program.hash))
    print("Program Success: {0}".format((not exp["failed"])))
    print("Problem Size: {0}".format(exp["problem_size"]))
    print("Shard Size: {0}".format(exp["shard_size"]))
    print("Total Execution time: {0}".format(total_seconds))
    # ``total_seconds`` is 0 when the monitoring loop never recorded a sample.
    print("Average Flop Rate (GFlop/s): {0}".format(
        _ratio_or_zero(exp["flops"][-1], total_seconds)))
    with open("/tmp/last_run", "w+") as f:
        f.write(program.hash)
def test_cholesky_multi_repeats(self):
    """Check that re-enqueued instructions do not corrupt a Cholesky run.

    While the program is running, nodes that are at or below the current
    progress counter are pushed onto the first priority queue again (at most
    ``total_repeats - 1`` times overall). After the program stops, every node
    whose Redis edge sum differs from its in-degree is printed for
    diagnosis, and the downloaded factor is compared with
    ``np.linalg.cholesky``.
    """
    import traceback

    print("RUNNING MULTI")
    size = 256
    shard_size = 30
    repeats = 15
    total_repeats = 150
    np.random.seed(2)
    print("Generating X")
    X = np.random.randn(size, 128)
    print("Generating A")
    A = X.dot(X.T) + size*np.eye(X.shape[0])
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("cholesky_test_A_{0}".format(
        int(time.time())), shape=A.shape, shard_sizes=shard_sizes,
        write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    instructions, trailing, L_sharded = compiler._chol(A_sharded)
    all_nodes = instructions.unroll_program()
    L_sharded.free()
    pwex = pywren.default_executor()
    executor = pywren.lambda_executor
    config = npw.config.default()
    pywren_config = pwex.config
    program = lp.LambdaPackProgram(
        instructions, executor=executor, pywren_config=pywren_config,
        config=config, eager=True)
    print("PROGRAM HASH", program.hash)
    cores = 1
    program.start()
    jobs = []
    for _ in range(cores):
        worker = mp.Process(target=job_runner.lambdapack_run,
                            args=(program,),
                            kwargs={'timeout': 3600, 'pipeline_width': 5})
        jobs.append(worker)
        worker.start()

    np.random.seed(0)
    try:
        # Built once: neither depends on the state of the running program.
        sqs = boto3.resource(
            'sqs', region_name=program.control_plane.region)
        client = boto3.client('sqs')
        while program.program_status() == lp.PS.RUNNING:
            time.sleep(0.5)
            waiting = 0
            running = 0
            for i, queue_url in enumerate(program.queue_urls):
                print("Priority {0}".format(i))
                attrs = client.get_queue_attributes(
                    QueueUrl=queue_url,
                    AttributeNames=[
                        'ApproximateNumberOfMessages',
                        'ApproximateNumberOfMessagesNotVisible'
                    ])['Attributes']
                print(attrs)
                waiting += int(attrs["ApproximateNumberOfMessages"])
                running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
            print("SQS QUEUE STATUS Waiting {0}, Running {1}".format(
                waiting, running))
            for _ in range(repeats):
                progress = program.get_progress()
                if progress is None:
                    continue
                candidates = min(int(progress), len(all_nodes))
                if candidates < 1:
                    # np.random.choice rejects an empty population; this
                    # used to raise and silently end the monitoring loop.
                    continue
                pc = int(np.random.choice(candidates, 1)[0])
                node = all_nodes[pc]
                queue = sqs.Queue(program.queue_urls[0])
                total_repeats -= 1
                if total_repeats > 0:
                    print("Malicilously enqueueing node ", pc, node,
                          total_repeats)
                    queue.send_message(MessageBody=json.dumps(node))
            time.sleep(1)
    except Exception:
        # Monitoring stays best-effort, but the failure is now reported
        # instead of being swallowed silently.
        traceback.print_exc()

    print("Program status")
    print(program.program_status())
    for node in all_nodes:
        edge_sum = lp.get(program.control_plane.client,
                          program._node_edge_sum_key(*node))
        edge_sum = 0 if edge_sum is None else int(edge_sum)
        parents = program.program.get_parents(*node)
        indegree = len(parents)
        node_status = program.get_node_status(*node)
        redis_str = "Node: {0}, Edge Sum: {1}, Indegree: {2}, Node Status {3}".format(
            node, edge_sum, indegree, node_status)
        if edge_sum != indegree:
            print(redis_str)
            for parent in parents:
                p_status = program.get_node_status(*parent)
                edge_key = program._edge_key(parent[0], parent[1],
                                             node[0], node[1])
                edge_value = lp.get(program.control_plane.client, edge_key)
                child_str = "Parent Node: {0}, Parent Status: {1}, Edge Key: {2}".format(
                    parent, p_status, edge_value)
                print(child_str)
    program.free()
    L_npw = L_sharded.numpy()
    L = np.linalg.cholesky(A)
    assert np.allclose(L_npw, L)
def test_cholesky_multi_failures(self):
    """Check that a Cholesky run survives workers being killed at random.

    Sixteen local worker processes execute the program. Every ten seconds
    the SQS queue depths are printed and, with probability 0.35, ``failures``
    randomly chosen worker slots are terminated and replaced by fresh
    processes. Once the program stops running all workers are joined and the
    downloaded factor is compared with ``np.linalg.cholesky``.
    """
    print("RUNNING MULTI")
    size = 256
    shard_size = 64
    failures = 4
    np.random.seed(1)
    print("Generating X")
    X = np.random.randn(size, 128)
    print("Generating A")
    A = X.dot(X.T) + size*np.eye(X.shape[0])
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("cholesky_test_A", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    instructions, trailing, L_sharded = compiler._chol(A_sharded)
    pwex = pywren.default_executor()
    executor = pywren.lambda_executor
    pywren_config = pwex.config
    config = npw.config.default()
    program = lp.LambdaPackProgram(
        instructions, executor=executor, pywren_config=pywren_config,
        config=config, eager=False)
    cores = 16
    program.start()

    def _spawn_worker():
        # Start one local worker process attached to the program.
        worker = mp.Process(target=job_runner.lambdapack_run,
                            args=(program,),
                            kwargs={'timeout': 3600, 'pipeline_width': 4})
        worker.start()
        return worker

    jobs = [_spawn_worker() for _ in range(cores)]

    np.random.seed(0)
    # One client serves every poll; it does not depend on program state.
    client = boto3.client('sqs')
    while program.program_status() == lp.PS.RUNNING:
        waiting = 0
        running = 0
        for i, queue_url in enumerate(program.queue_urls):
            print("Priority {0}".format(i))
            attrs = client.get_queue_attributes(
                QueueUrl=queue_url,
                AttributeNames=[
                    'ApproximateNumberOfMessages',
                    'ApproximateNumberOfMessagesNotVisible'
                ])['Attributes']
            print(attrs)
            waiting += int(attrs["ApproximateNumberOfMessages"])
            running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
        print("SQS QUEUE STATUS Waiting {0}, Running {1}".format(
            waiting, running))
        time.sleep(10)
        if np.random.random() > 0.65:
            for _ in range(failures):
                core = int(np.random.choice(cores, 1)[0])
                print("Maliciously Killing a job!")
                jobs[core].terminate()
                jobs[core] = _spawn_worker()

    for worker in jobs:
        worker.join()
    print("Program status")
    print(program.program_status())
    program.free()
    L_npw = L_sharded.numpy()
    L = np.linalg.cholesky(A)
    print(L_npw)
    print(L)
    print("MAX ", np.max(np.abs(L - L_npw)))
    assert np.allclose(L_npw, L)
def run_experiment(problem_size, shard_size, pipeline, priority, lru, eager,
                   truncate, max_cores, start_cores, trial,
                   launch_granularity, timeout, log_granularity,
                   autoscale_policy, failure_percentage, max_failure_events,
                   failure_time):
    """Run one Cholesky benchmark while injecting worker failures.

    A sharded ``X X^T`` matrix is generated, a Cholesky LambdaPack program is
    started on pywren workers, and the run is polled until the program stops.
    Every worker is launched with its own Redis "failure key"; at most
    ``max_failure_events`` times, and no more often than every
    ``failure_time`` seconds, a random ``failure_percentage`` fraction of all
    keys issued so far is set to 1. Statistics and profiling data are
    pickled to ``failure_experiments/<argument hash>.pickle``; the log goes
    to the matching ``.log`` file.

    Args:
        problem_size: number of rows of the random ``X``.
        shard_size: shard size along the first axis of ``X``.
        pipeline: pipeline width handed to every worker.
        priority: when truthy the program uses 5 priorities, otherwise 1.
        lru: when truthy workers get ``cache_size=5``, otherwise 0.
        eager: forwarded to ``LambdaPackProgram``.
        truncate: when not ``None`` only ``instructions[:truncate]`` run.
        max_cores: upper bound on workers.
        start_cores: number of workers launched initially.
        trial: recorded in the experiment dict and the argument hash.
        launch_granularity: minimum seconds between launches for the
            ``"dynamic"`` policy.
        timeout: worker timeout; also paces the ``"constant_timeout"``
            policy.
        log_granularity: seconds slept per poll.
        autoscale_policy: ``"dynamic"`` or ``"constant_timeout"``.
        failure_percentage: fraction of failure keys set per event.
        max_failure_events: maximum number of failure events.
        failure_time: minimum seconds between two failure events.

    Raises:
        Exception: for an unknown ``autoscale_policy``.
    """
    # set up logging
    logger = logging.getLogger()
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, priority, lru, eager, truncate,
         max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy, failure_percentage,
         max_failure_events, failure_time))
    # The digest only names the output files; it is not used for security.
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    # FileHandler opens its file immediately, so the directory has to exist
    # before the handler is created (it used to be created only at the end).
    os.makedirs("failure_experiments/", exist_ok=True)
    log_file = "failure_experiments/{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))

    X = np.random.randn(problem_size, 1)
    pwex = pywren.default_executor()
    shard_sizes = [shard_size, 1]
    X_sharded = BigMatrix("cholesky_test_{0}_{1}".format(
        problem_size, shard_size),
                          shape=X.shape,
                          shard_sizes=shard_sizes,
                          write_header=True)
    shard_matrix(X_sharded, X)
    print("Generating PSD matrix...")
    XXT_sharded = binops.gemm(pwex, X_sharded, X_sharded.T, overwrite=False)
    XXT_sharded.lambdav = problem_size * 10
    instructions, L_sharded, trailing = lp._chol(XXT_sharded)
    # Use the parameter rather than a module-level ``args`` namespace so the
    # function also works when it is called programmatically.
    pipeline_width = pipeline
    num_priorities = 5 if priority else 1
    cache_size = 5 if lru else 0

    REDIS_CLIENT = redis.StrictRedis(REDIS_ADDR,
                                     port=REDIS_PORT,
                                     password=REDIS_PASS,
                                     db=0,
                                     socket_timeout=5)

    if truncate is not None:
        instructions = instructions[:truncate]
    config = pwex.config

    program = lp.LambdaPackProgram(instructions,
                                   executor=pywren.lambda_executor,
                                   pywren_config=config,
                                   num_priorities=num_priorities,
                                   eager=eager)
    redis_env = {
        "REDIS_ADDR": os.environ.get("REDIS_ADDR", ""),
        "REDIS_PASS": os.environ.get("REDIS_PASS", "")
    }

    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    times = []
    flops = []
    reads = []
    writes = []
    failure_times = []
    exp = {
        "redis_done_counts": done_counts,
        "redis_ready_counts": ready_counts,
        "redis_post_op_counts": post_op_counts,
        "redis_not_ready_counts": not_ready_counts,
        "redis_running_counts": running_counts,
        "sqs_invis_counts": sqs_invis_counts,
        "sqs_vis_counts": sqs_vis_counts,
        "busy_workers": busy_workers_counts,
        "up_workers": up_workers_counts,
        "times": times,
        "lru": lru,
        "priority": priority,
        "eager": eager,
        "truncate": truncate,
        "max_cores": max_cores,
        "problem_size": problem_size,
        "shard_size": shard_size,
        "pipeline": pipeline,
        "flops": flops,
        "reads": reads,
        "writes": writes,
        "trial": trial,
        "launch_granularity": launch_granularity,
        "log_granularity": log_granularity,
        "autoscale_policy": autoscale_policy,
        "failure_times": failure_times,
    }

    logger.info("Longest Path: {0}".format(program.longest_path))
    program.start()
    t = time.time()
    logger.info("Starting with {0} cores".format(start_cores))

    def _launch_workers(count, tag):
        # Start ``count`` workers, each with a fresh failure key, and return
        # ``(futures, keys)``. ``tag`` keeps keys of different launches
        # distinct.
        keys = [
            "{0}_failure_{1}_{2}".format(program.hash, i, tag)
            for i in range(count)
        ]
        launched = pwex.map(
            lambda x: job_runner.lambdapack_run_with_failures(
                keys[x],
                program,
                pipeline_width=pipeline_width,
                cache_size=cache_size,
                timeout=timeout),
            range(count),
            extra_env=redis_env)
        return launched, keys

    def _gigas(raw):
        # Counters are raw totals (or None before the first update).
        return 0 if raw is None else int(raw) / 1e9

    all_futures, failure_keys = _launch_workers(start_cores, 0)
    start_time = time.time()
    last_run_time = start_time
    last_failure = time.time()
    num_failure_events = 0
    sqs_client = boto3.client('sqs')

    while program.program_status() == lp.PS.RUNNING:
        curr_time = int(time.time() - start_time)
        max_pc = program.get_max_pc()
        times.append(int(time.time()))
        time.sleep(log_granularity)

        waiting = 0
        running = 0
        for queue_url in program.queue_urls:
            attrs = sqs_client.get_queue_attributes(
                QueueUrl=queue_url,
                AttributeNames=[
                    'ApproximateNumberOfMessages',
                    'ApproximateNumberOfMessagesNotVisible'
                ])['Attributes']
            waiting += int(attrs["ApproximateNumberOfMessages"])
            running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
        sqs_invis_counts.append(running)
        sqs_vis_counts.append(waiting)

        busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))
        busy_workers = 0 if busy_workers is None else int(busy_workers)
        up_workers = program.get_up()
        up_workers = 0 if up_workers is None else int(up_workers)
        up_workers_counts.append(up_workers)
        busy_workers_counts.append(busy_workers)

        logger.debug("Waiting: {0}, Currently Processing: {1}".format(
            waiting, running))
        logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
            up_workers, busy_workers, curr_time))
        if (curr_time % INFO_FREQ) == 0:
            logger.info("Max PC is {0}".format(max_pc))
            logger.info("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))

        flops.append(_gigas(program.get_flops()))
        reads.append(_gigas(program.get_read()))
        writes.append(_gigas(program.get_write()))

        time_since_launch = time.time() - last_run_time
        if autoscale_policy == "dynamic":
            if (time_since_launch > launch_granularity
                    and up_workers < np.ceil(waiting * 0.5 / pipeline_width)
                    and up_workers < max_cores):
                cores_to_launch = int(
                    min(
                        np.ceil(waiting / pipeline_width) - up_workers,
                        max_cores - up_workers))
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                new_futures, new_keys = _launch_workers(cores_to_launch,
                                                        curr_time)
                last_run_time = time.time()
                # Track the new keys so these workers can be killed too;
                # only the constant_timeout branch used to do this.
                failure_keys += new_keys
                all_futures.extend(new_futures)
        elif autoscale_policy == "constant_timeout":
            if time_since_launch > (0.75 * timeout):
                cores_to_launch = max_cores
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                new_futures, new_keys = _launch_workers(cores_to_launch,
                                                        curr_time)
                last_run_time = time.time()
                failure_keys += new_keys
                all_futures.extend(new_futures)
        else:
            raise Exception("unknown autoscale policy")

        if ((time.time() - last_failure) > failure_time
                and num_failure_events < max_failure_events):
            logger.info("Killing some jobs")
            idxs = np.random.choice(len(failure_keys),
                                    int(failure_percentage *
                                        len(failure_keys)),
                                    replace=False)
            num_failure_events += 1
            last_failure = time.time()
            failure_times.append(last_failure)
            for i in idxs:
                logger.info("Killing: job {0}".format(i))
                REDIS_CLIENT.set(failure_keys[i], 1)

    exp["all_futures"] = all_futures
    # Log every instruction block whose start counter is not exactly one.
    for pc in range(program.num_inst_blocks):
        run_count = REDIS_CLIENT.get("{0}_{1}_start".format(program.hash, pc))
        run_count = 0 if run_count is None else int(run_count)
        if run_count != 1:
            logger.info("PC: {0}, Run Count: {1}".format(pc, run_count))

    e = time.time()
    logger.info(program.program_status())
    logger.info("PROGRAM STATUS " + str(program.program_status()))
    logger.info("PROGRAM HASH " + str(program.hash))
    logger.info("Took {0} seconds".format(e - t))
    exp["total_runtime"] = e - t
    exp["num_failure_events"] = num_failure_events

    # Collect profiling info in parallel; the pool is shut down afterwards.
    with fs.ThreadPoolExecutor(72) as executor:
        profiled_blocks = list(
            executor.map(program.get_profiling_info,
                         range(program.num_inst_blocks)))
    serializer = serialize.SerializeIndependent()
    byte_string = serializer([profiled_blocks])[0][0]
    exp["profiled_block_pickle_bytes"] = byte_string
    read, write, total_flops, bins, _, runtimes = lp.perf_profile(
        profiled_blocks, num_bins=100)
    flop_rate = sum(total_flops) / max(bins)
    exp["flop_rate"] = flop_rate
    print("Average Flop rate of {0}".format(flop_rate))

    # save other stuff
    exp_bytes = pickle.dumps(exp)
    dump_path = "failure_experiments/{0}.pickle".format(arg_hash)
    print("Dumping experiment pickle to {0}".format(dump_path))
    with open(dump_path, "wb+") as f:
        f.write(exp_bytes)
def bdfac(A, truncate=0):
    """Compile the LambdaPack ``BDFAC`` program for the sharded matrix ``A``.

    Eight BigMatrix objects are created: V, T, R and S for the QR side and
    V, T, L and S for the LQ side. ``R_QR`` and ``S_QR`` use
    ``constant_zeros`` as their ``parent_fn``; ``L_LQ`` and ``S_LQ`` use
    ``constant_zeros_ext``; the V and T matrices are created without one.

    Args:
        A: input BigMatrix.
        truncate: forwarded unchanged as the last argument of the compiled
            program.

    Returns:
        A ``(program, info)`` pair. ``info`` lists ``L_LQ`` and ``R_QR``
        under ``"outputs"``, the remaining six matrices under
        ``"intermediates"`` and the compile time in seconds under
        ``"compile_time"``.
    """
    branching = 2
    n = A.shape[0]
    block_count = A.num_blocks(0)
    shard = A.shard_sizes[0]
    # One extra level on top of the (at least one) computed level count.
    levels = max(
        int(np.ceil(np.log2(block_count) / np.log2(branching))), 1) + 1

    def _allocate(name, shape, shards, **extra):
        # ``extra`` carries parent_fn only for the matrices that define one.
        return BigMatrix(name, shape=shape, shard_sizes=shards,
                         write_header=True, safe=False, **extra)

    shape3 = (2 * n, levels, 2 * n)
    shape4 = (2 * n, levels, 2 * n, 2 * n)
    shards3 = (1, 1, shard)
    shards4 = (1, 1, shard, shard)

    V_QR = _allocate("V_QR", shape3, shards3)
    T_QR = _allocate("T_QR", shape3, shards3)
    # R_QR is the only matrix sharded along its first axis as well.
    R_QR = _allocate("R_QR", shape3, (shard, 1, shard),
                     parent_fn=constant_zeros)
    S_QR = _allocate("S_QR", shape4, shards4, parent_fn=constant_zeros)
    V_LQ = _allocate("V_LQ", shape3, shards3)
    T_LQ = _allocate("T_LQ", shape3, shards3)
    L_LQ = _allocate("L_LQ", shape3, shards3, parent_fn=constant_zeros_ext)
    S_LQ = _allocate("S_LQ", shape4, shards4, parent_fn=constant_zeros_ext)

    started = time.time()
    compiled = lpcompile_for_execution(BDFAC, inputs=["I"],
                                       outputs=["R_QR", "L_LQ"])
    bound = compiled(A, V_QR, T_QR, S_QR, R_QR, V_LQ, T_LQ, S_LQ, L_LQ,
                     block_count, truncate)
    compile_seconds = time.time() - started

    config = npw.config.default()
    program = lp.LambdaPackProgram(bound, config=config)
    info = {
        "outputs": [L_LQ, R_QR],
        "intermediates": [S_LQ, S_QR, T_QR, V_QR, V_LQ, T_LQ],
        "compile_time": compile_seconds,
    }
    return program, info