def make_constraints(self, budget):
    """Build the cvxpy constraint list for the LP relaxation of the checkpointing problem.

    Args:
        budget: memory budget upper-bounding every entry of the usage matrix ``self.U``.

    Returns:
        list of cvxpy constraints covering variable bounds, schedule correctness,
        deallocation (Free_E) validity, and the memory-usage recurrence.

    NOTE(review): assumes self.R/Sram/Ssd/Min/Mout are (T, T) cvxpy variables and
    self.Free_E is (T, num_edges) — confirm against the enclosing class.
    """
    constraints = []
    T = self.T
    ram_costs = self.g.cost_ram
    # dense vector of per-node RAM costs, used in the initial-memory constraint below
    ram_cost_vec = np.asarray([ram_costs[i] for i in range(T)])
    with Timer("Var bounds"):
        # relax all boolean indicators to [0, 1]; memory usage bounded by the budget
        constraints.extend([self.R >= 0, self.R <= 1])
        constraints.extend([self.Sram >= 0, self.Sram <= 1])
        constraints.extend([self.Ssd >= 0, self.Ssd <= 1])
        constraints.extend([self.Min >= 0, self.Min <= 1])
        constraints.extend([self.Mout >= 0, self.Mout <= 1])
        constraints.extend([self.Free_E >= 0, self.Free_E <= 1])
        constraints.extend([self.U >= 0, self.U <= budget])
        # stage t must compute node t; nothing in the future (upper triangle) is touched
        constraints.append(cp.diag(self.R) == 1)
        constraints.append(cp.upper_tri(self.R) == 0)
        # a node cannot be checkpointed (RAM or secondary storage) in its own stage
        constraints.append(cp.diag(self.Sram) == 0)
        constraints.append(cp.upper_tri(self.Sram) == 0)
        constraints.append(cp.diag(self.Ssd) == 0)
        constraints.append(cp.upper_tri(self.Ssd) == 0)
        constraints.append(cp.upper_tri(self.Min) == 0)
        constraints.append(cp.upper_tri(self.Mout) == 0)
    with Timer("Correctness constraints"):
        # ensure all computations are possible
        for (u, v) in self.g.edge_list:
            constraints.append(self.R[:, v] <= self.R[:, u] + self.Sram[:, u])
        # ensure all checkpoints are in memory
        constraints.append(self.Sram[1:, :] <= self.R[:-1, :] + self.Sram[:-1, :] + self.Min[:-1, :])
        constraints.append(self.Ssd[1:, :] <= self.Ssd[:-1, :] + self.Mout[:-1, :])
        # paging in requires the value on secondary storage; paging out requires it in RAM
        constraints.append(self.Min <= self.Ssd)
        constraints.append(self.Mout <= self.Sram)
    with Timer("Free_E constraints"):
        # Constraint: sum_k Free_{t,i,k} <= 1 (free each value at most once per stage)
        for i in range(T):
            frees = [self.Free_E[:, eidx] for eidx, (j, _) in enumerate(self.g.edge_list) if i == j]
            if frees:
                constraints.append(cp.sum(frees, axis=0) <= 1)
        # Constraint: Free_{t,i,k} <= 1 - S_{t+1, i} (cannot free a value checkpointed into t+1)
        for eidx, (i, k) in enumerate(self.g.edge_list):
            constraints.append(self.Free_E[:-1, eidx] + self.Sram[1:, i] <= 1)
        # Constraint: Free_{t,i,k} <= 1 - R_{t, j} (cannot free while a later consumer j > k still runs)
        for eidx, (i, k) in enumerate(self.g.edge_list):
            for j in self.g.successors(i):
                if j > k:
                    constraints.append(self.Free_E[:, eidx] + self.R[:, j] <= 1)
    with Timer("U constraints"):
        # initial memory per stage: node 0 (if computed) plus all RAM checkpoints held
        constraints.append(self.U[:, 0] == self.R[:, 0] * ram_costs[0] + ram_cost_vec @ self.Sram.T)
        # recurrence: memory after computing k+1 = previous usage + new output - freed inputs
        for k in range(T - 1):
            mem_freed = cp.sum([ram_costs[i] * self.Free_E[:, eidx] for (eidx, i) in self.g.predecessors_indexed(k)])
            constraints.append(self.U[:, k + 1] == self.U[:, k] + self.R[:, k + 1] * ram_costs[k + 1] - mem_freed)
    return constraints
def solve_chen_greedy(g: DFGraph, segment_mem_B: int, use_actuation_points: bool = True):
    """Chen et al.'s greedy checkpointing heuristic.

    Walks the forward graph in topological order, accumulating activation RAM;
    whenever the running segment exceeds ``segment_mem_B`` at a candidate node,
    that node becomes a checkpoint and the accumulator resets.

    Args:
        g: dataflow graph to schedule.
        segment_mem_B: per-segment memory threshold in bytes.
        use_actuation_points: restrict checkpoint candidates to articulation
            points when True, otherwise allow any vertex.

    Returns:
        ScheduledResult with the greedy schedule and timing metadata.
    """
    with Timer("solve_chen_greedy") as timer_solve:
        C = g.articulation_points if use_actuation_points else g.v
        temp = 0  # RAM accumulated in the current segment
        # BUGFIX: removed dead local `x` — it accumulated checkpoint costs but was never read.
        checkpoints = set()
        for v in g.topological_order_fwd:
            temp += g.cost_ram[v]
            if v in C and temp > segment_mem_B:
                temp = 0
                checkpoints.add(v)
        S = gen_s_matrix_fixed_checkpoints(g, checkpoints)
        R = solve_r_opt(g, S)
        schedule, aux_data = schedule_from_rs(g, R, S)
    return ScheduledResult(
        solve_strategy=SolveStrategy.CHEN_GREEDY if use_actuation_points else SolveStrategy.CHEN_GREEDY_NOAP,
        solver_budget=segment_mem_B,
        feasible=True,
        schedule=schedule,
        schedule_aux_data=aux_data,
        solve_time_s=timer_solve.elapsed,
    )
def solve(self):
    """Optimize the Gurobi ILP and convert the incumbent into a schedule.

    Returns:
        Tuple ``(ScheduledResult, batch_size)`` on success, or ``(None, None)``
        if variable values could not be extracted after optimization.

    Raises:
        ValueError: if the model is infeasible (e.g. insufficient memory budget).
    """
    T = self.g.size
    with self.profiler("Gurobi model optimization", extra_data={"T": str(T), "budget": str(self.budget)}):
        with Timer("ILPSolve") as solve_ilp:
            self.m.optimize()
        self.solve_time = solve_ilp.elapsed

    infeasible = self.m.status == GRB.INFEASIBLE
    try:
        # Gurobi raises AttributeError on .X when no solution is loaded;
        # probe one variable of each family to detect that case.
        _ = self.R[0, 0].X
        _ = self.S[0, 0].X
        _ = self.U[0, 0].X
        _ = self.batch_size.X
    except AttributeError:
        infeasible = True

    if infeasible:
        raise ValueError("Infeasible model, check constraints carefully. Insufficient memory?")

    Rout = np.zeros((T, T), dtype=SOLVER_DTYPE)
    Sout = np.zeros((T, T), dtype=SOLVER_DTYPE)
    Uout = np.zeros((T, T), dtype=SOLVER_DTYPE)
    Free_Eout = np.zeros((T, len(self.g.edge_list)), dtype=SOLVER_DTYPE)
    batch_size = self.batch_size.X
    try:
        for t in range(T):
            for i in range(T):
                Rout[t][i] = int(self.R[t, i].X)
                Sout[t][i] = int(self.S[t, i].X)
                Uout[t][i] = self.U[t, i].X * self.ram_gcd  # undo GCD scaling of memory
            for e in range(len(self.g.edge_list)):
                Free_Eout[t][e] = int(self.Free_E[t, e].X)
    except AttributeError as e:
        logging.exception(e)
        # BUGFIX: previously returned a 4-tuple here while the success path
        # returns a 2-tuple; callers unpacking two values would crash.
        return None, None

    Rout = solve_r_opt(self.g, Sout)  # prune R using optimal recomputation solver
    ilp_aux_data = ILPAuxData(
        U=Uout,
        Free_E=Free_Eout,
        ilp_approx=False,
        ilp_time_limit=0,
        ilp_eps_noise=0,
        ilp_num_constraints=self.m.numConstrs,
        ilp_num_variables=self.m.numVars,
    )
    schedule, aux_data = schedule_from_rs(self.g, Rout, Sout)
    return (
        ScheduledResult(
            solve_strategy=SolveStrategy.OPTIMAL_ILP_GC,
            solver_budget=self.budget,
            feasible=True,
            schedule=schedule,
            schedule_aux_data=aux_data,
            solve_time_s=self.solve_time,
            ilp_aux_data=ilp_aux_data,
        ),
        batch_size,
    )
def schedule_from_rs(
    g: DFGraph, r: np.ndarray, s: np.ndarray
) -> Tuple[Optional[Schedule], Optional[SchedulerAuxData]]:
    """Convert recomputation (R) and checkpoint (S) matrices into an executable schedule.

    Args:
        g: dataflow graph the matrices were solved over.
        r: (T, T) 0/1 matrix; r[t, i] == 1 means node i is (re)computed in stage t.
        s: (T, T) 0/1 matrix; s[t, i] == 1 means node i is checkpointed entering stage t.

    Returns:
        (schedule, aux_data) where aux_data carries RAM/CPU accounting, or
        (None, None) if either input matrix is None (infeasible solve).
    """
    # detailed per-op RAM tracking is expensive; only enabled via env flag
    debug_collect_ram_usage = "DEBUG_SCHEDULER_RAM" in active_env_var_flags
    if r is None or s is None:
        return None, None  # infeasible
    T = g.size

    def _used_after(t_, u_, i_):
        """Returns True if v_u is used after v_i in stage t"""
        is_retained_snapshot = t_ < T - 1 and s[t_ + 1, u_] == 1
        is_used_by_successor = not all([r[t_, v] == 0 or v <= i_ for v in g.successors(u_)])
        return is_retained_snapshot or is_used_by_successor

    with Timer("schedule_rs_matrix") as schedule_timer:
        # compute last usage to determine whether to update auxiliary variables
        # last_used = {i: max([t for t in range(T) if r[t, i] == 1]) for i in range(T)}
        # BUGFIX: np.int was removed in NumPy 1.24; use the builtin int dtype.
        mem_usage = np.zeros((T, T), dtype=int)
        sb = ScheduleBuilder(g, verbosity=1)
        for t in range(T):
            # Free unused checkpoints
            if debug_collect_ram_usage:
                for i in filter(lambda x: sb.is_op_cached(x), range(T)):
                    if not _used_after(t, i, i):
                        sb.deallocate_register(i)
            for i in range(T):
                if r[t, i] == 1:
                    # sb.run_operator(i, last_used[i] == t)
                    sb.run_operator(i, False)  # todo(paras) prune away last_used in favor of recompute blacklist
                if debug_collect_ram_usage:
                    mem_usage[t, i] = sb.current_ram + g.cost_ram_fixed
                # Free memory
                if debug_collect_ram_usage:
                    for u in filter(lambda x: sb.is_op_cached(x), itertools.chain(g.predecessors(i), [i])):
                        if not _used_after(t, u, i):
                            sb.deallocate_register(u)
    total_ram = sb.max_ram + g.cost_ram_fixed
    ram_timeline = [mem + g.cost_ram_fixed for mem in sb.ram_timeline]
    return (
        sb.schedule,
        SchedulerAuxData(
            R=r,
            S=s,
            cpu=sb.total_cpu,
            peak_ram=total_ram,
            activation_ram=sb.max_ram,
            mem_grid=mem_usage,
            mem_timeline=ram_timeline,
            schedule_time_s=schedule_timer.elapsed,
        ),
    )
def solve_checkpoint_all(g: DFGraph):
    """Baseline strategy: checkpoint every forward node, recompute nothing extra."""
    with Timer("solve_checkpoint_all") as timer_solve:
        s_matrix = gen_s_matrix_fixed_checkpoints(g, g.vfwd)
        r_matrix = solve_r_opt(g, s_matrix)
        schedule, aux_data = schedule_from_rs(g, r_matrix, s_matrix)
    return ScheduledResult(
        solve_strategy=SolveStrategy.CHECKPOINT_ALL,
        solver_budget=0,
        feasible=True,
        schedule=schedule,
        schedule_aux_data=aux_data,
        solve_time_s=timer_solve.elapsed,
    )
def solve_checkpoint_last_node(g: DFGraph):
    """Checkpoint only one node between stages"""
    with Timer("solve_checkpoint_last_node") as timer_solve:
        n = g.size
        s = np.zeros((n, n), dtype=SOLVER_DTYPE)
        # mark only the immediately-preceding node as checkpointed in each stage
        # (equivalent to filling the sub-diagonal: s[t, t-1] = 1 for t >= 1)
        rows = np.arange(1, n)
        s[rows, rows - 1] = 1
        r = solve_r_opt(g, s)
        schedule, aux_data = schedule_from_rs(g, r, s)
    return ScheduledResult(
        solve_strategy=SolveStrategy.CHECKPOINT_LAST_NODE,
        solver_budget=0,
        feasible=True,
        schedule=schedule,
        schedule_aux_data=aux_data,
        solve_time_s=timer_solve.elapsed,
    )
def solve(self, solver_override=None, verbose=False, num_threads=os.cpu_count()):
    """Solve the cvxpy problem, preferring MOSEK, then GUROBI, then CBC.

    Returns the solution values of the R, S, U and Free_E variables.
    Raises ValueError if the problem is infeasible or unbounded.
    """
    installed_solvers = cp.installed_solvers()
    # assemble solver arguments up front, then issue a single solve call
    solve_kwargs = {"verbose": verbose}
    if solver_override is not None:
        solve_kwargs["solver"] = solver_override
    elif "MOSEK" in installed_solvers:
        solve_kwargs["solver"] = cp.MOSEK
    elif "GUROBI" in installed_solvers:
        solve_kwargs["solver"] = cp.GUROBI
    elif "CBC" in installed_solvers:
        solve_kwargs["solver"] = cp.CBC
        solve_kwargs["numberThreads"] = num_threads
    with Timer("Solve", print_results=verbose) as solve_timer:
        self.problem.solve(**solve_kwargs)
    self.solve_time = solve_timer.elapsed
    if self.problem.status in ["infeasible", "unbounded"]:
        raise ValueError("Model infeasible")
    return self.R.value, self.S.value, self.U.value, self.Free_E.value
def solve_chen_sqrtn(g: DFGraph, use_actuation_points: bool = True) -> ScheduledResult:
    """Chen et al.'s sqrt(n) heuristic: checkpoint every k-th candidate node."""
    with Timer("solve_chen_sqrtn") as timer_solve:
        candidates = g.articulation_points if use_actuation_points else g.v
        k = int(math.sqrt(len(candidates)))
        # keep every k-th candidate (1-indexed) as a checkpoint
        checkpoints = {v for idx, v in enumerate(candidates) if (idx + 1) % k == 0}
        S = gen_s_matrix_fixed_checkpoints(g, checkpoints)
        R = solve_r_opt(g, S)
        schedule, aux_data = schedule_from_rs(g, R, S)
    strategy = SolveStrategy.CHEN_SQRTN if use_actuation_points else SolveStrategy.CHEN_SQRTN_NOAP
    return ScheduledResult(
        solve_strategy=strategy,
        solver_budget=0,
        feasible=True,
        schedule=schedule,
        schedule_aux_data=aux_data,
        solve_time_s=timer_solve.elapsed,
    )
def _load_griewank(graph_size: int) -> pd.DataFrame:
    """Load the precomputed Griewank solution table for a given graph size.

    Tries the local cache first; on a miss (or a corrupt cached file) the
    pickle is re-downloaded from S3 and then read from disk.
    """
    fname = "{}.pkl.gz".format(graph_size)
    cache_dir = checkmate_cache_dir() / "griewank_solutions"
    cached_file = cache_dir / fname
    remote_path = "https://optimalcheckpointing.s3.amazonaws.com/griewank_solutions/pickle/{}".format(fname)
    if cached_file.exists():
        try:
            return pd.read_pickle(cached_file)
        except Exception as e:
            # best-effort cache: log the corruption and fall through to re-download
            logging.exception(e)
            logging.warning("Error loading cached griewank solution, corrupt file? Reloading from S3")
    with Timer("griewank_dl") as dl_timer:
        cache_dir.mkdir(parents=True, exist_ok=True)
        urllib.request.urlretrieve(remote_path, cached_file)
    logging.info("Loaded graph from {} and saving to {} in {:.2f}s".format(remote_path, cached_file, dl_timer.elapsed))
    return pd.read_pickle(cached_file)
import tensorflow as tf
import logging
from checkmate.core.solvers.strategy_chen import solve_chen_sqrtn
from checkmate.core.utils.timer import Timer
from checkmate.tf2.extraction import dfgraph_from_tf_function
from checkmate.tf2.util.load_keras_model import get_keras_model

# batch size used when tracing the concrete gradient function below
BS = 128

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    logging.info("building graph")
    with Timer("build_graph", print_results=True):
        model = get_keras_model("ResNet50")

        def grads(images, labels):
            """Forward pass plus gradient of a simple surrogate loss w.r.t. model variables."""
            with tf.GradientTape() as tape:
                pred = model(images)
                # NOTE(review): surrogate loss (mean of pred - labels), presumably
                # chosen only to force a full backward trace — confirm intent.
                loss = tf.reduce_mean(pred - labels)
            gradient = tape.gradient(loss, model.trainable_variables)
            return loss, gradient

        # trace grads() into a concrete TF function at a fixed input signature
        grad_fn = tf.function(grads).get_concrete_function(
            tf.TensorSpec(shape=(BS, 224, 224, 3)), tf.TensorSpec(shape=(BS, 1000))
        )
    logging.info("tracing graph")
    with Timer("trace_graph", print_results=True):
        # convert the traced TF function into the project's dataflow-graph representation
        g = dfgraph_from_tf_function(grad_fn)
    # sched_result = solve_ilp_gurobi(g, budget=platform_memory("p2xlarge"), approx=False, eps_noise=0.0)
    # sched_result = solve_approx_lp_deterministic_05_threshold(g, budget=platform_memory("p2xlarge"))
def build_model(self):
    """Construct the Gurobi ILP: objective, schedule-shape constraints,
    correctness constraints, Free_E linearization, and the memory recurrence.

    Returns:
        None (return value ensures a ray remote call can be chained).
    """
    T = self.g.size
    # scale costs by their GCDs so solver coefficients stay small
    dict_val_div = lambda cost_dict, divisor: {k: v / divisor for k, v in cost_dict.items()}
    permute_ram = dict_val_div(self.g.cost_ram, self.ram_gcd)
    budget = self.budget / self.ram_gcd
    # optional gaussian jitter on CPU costs (breaks ties between symmetric solutions)
    permute_eps = lambda cost_dict, eps: {k: v * (1.0 + eps * np.random.randn()) for k, v in cost_dict.items()}
    permute_cpu = dict_val_div(self.g.cost_cpu, self.g.cpu_gcd())
    if self.eps_noise:
        permute_cpu = permute_eps(permute_cpu, self.eps_noise)
    with Timer("Gurobi model construction", extra_data={"T": str(T), "budget": str(budget)}):
        with Timer("Objective construction", extra_data={"T": str(T), "budget": str(budget)}):
            # seed solver with a baseline strategy
            if self.seed_s is not None:
                # pin S to zero wherever the seed strategy does not checkpoint
                for x in range(T):
                    for y in range(T):
                        if self.seed_s[x, y] < 1:
                            self.init_constraints.append(self.m.addLConstr(self.S[x, y], GRB.EQUAL, 0))
                self.m.update()
            # define objective function: minimize total (scaled) recomputation cost
            self.m.setObjective(quicksum(self.R[t, i] * permute_cpu[i] for t in range(T) for i in range(T)), GRB.MINIMIZE)
        with Timer("Variable initialization", extra_data={"T": str(T), "budget": str(budget)}):
            if self.imposed_schedule == ImposedSchedule.FULL_SCHEDULE:
                # lower-triangular R, strictly-lower-triangular S, and R[t, t] == 1 for all t
                self.m.addLConstr(quicksum(self.R[t, i] for t in range(T) for i in range(t + 1, T)), GRB.EQUAL, 0)
                self.m.addLConstr(quicksum(self.S[t, i] for t in range(T) for i in range(t, T)), GRB.EQUAL, 0)
                self.m.addLConstr(quicksum(self.R[t, t] for t in range(T)), GRB.EQUAL, T)
            elif self.imposed_schedule == ImposedSchedule.COVER_ALL_NODES:
                # nothing checkpointed at stage 0; every node computed at least once
                self.m.addLConstr(quicksum(self.S[0, i] for i in range(T)), GRB.EQUAL, 0)
                for i in range(T):
                    self.m.addLConstr(quicksum(self.R[t, i] for t in range(T)), GRB.GREATER_EQUAL, 1)
            elif self.imposed_schedule == ImposedSchedule.COVER_LAST_NODE:
                self.m.addLConstr(quicksum(self.S[0, i] for i in range(T)), GRB.EQUAL, 0)
                # note: the integrality gap is very large as this constraint
                # is only applied to the last node (last column of self.R).
                self.m.addLConstr(quicksum(self.R[t, T - 1] for t in range(T)), GRB.GREATER_EQUAL, 1)
        with Timer("Correctness constraints", extra_data={"T": str(T), "budget": str(budget)}):
            # ensure all checkpoints are in memory
            for t in range(T - 1):
                for i in range(T):
                    self.m.addLConstr(self.S[t + 1, i], GRB.LESS_EQUAL, self.S[t, i] + self.R[t, i])
            # ensure all computations are possible
            for (u, v) in self.g.edge_list:
                for t in range(T):
                    self.m.addLConstr(self.R[t, v], GRB.LESS_EQUAL, self.R[t, u] + self.S[t, u])

        # define memory constraints
        def _num_hazards(t, i, k):
            # count of reasons the value of i (consumed by k) must stay resident in stage t
            if t + 1 < T:
                return 1 - self.R[t, k] + self.S[t + 1, i] + quicksum(self.R[t, j] for j in self.g.successors(i) if j > k)
            return 1 - self.R[t, k] + quicksum(self.R[t, j] for j in self.g.successors(i) if j > k)

        def _max_num_hazards(t, i, k):
            # static upper bound on _num_hazards, used to linearize the Free_E indicator
            num_uses_after_k = sum(1 for j in self.g.successors(i) if j > k)
            if t + 1 < T:
                return 2 + num_uses_after_k
            return 1 + num_uses_after_k

        with Timer("Constraint: upper bound for 1 - Free_E", extra_data={"T": str(T), "budget": str(budget)}):
            for t in range(T):
                for eidx, (i, k) in enumerate(self.g.edge_list):
                    self.m.addLConstr(1 - self.Free_E[t, eidx], GRB.LESS_EQUAL, _num_hazards(t, i, k))
        with Timer("Constraint: lower bound for 1 - Free_E", extra_data={"T": str(T), "budget": str(budget)}):
            for t in range(T):
                for eidx, (i, k) in enumerate(self.g.edge_list):
                    self.m.addLConstr(
                        _max_num_hazards(t, i, k) * (1 - self.Free_E[t, eidx]), GRB.GREATER_EQUAL, _num_hazards(t, i, k)
                    )
        with Timer(
            "Constraint: initialize memory usage (includes spurious checkpoints)",
            extra_data={"T": str(T), "budget": str(budget)},
        ):
            for t in range(T):
                # memory at column 0: node 0's output (if computed) plus all held checkpoints
                self.m.addLConstr(
                    self.U[t, 0],
                    GRB.EQUAL,
                    self.R[t, 0] * permute_ram[0] + quicksum(self.S[t, i] * permute_ram[i] for i in range(T)),
                )
        with Timer("Constraint: memory recurrence", extra_data={"T": str(T), "budget": str(budget)}):
            for t in range(T):
                for k in range(T - 1):
                    # memory after k+1 = memory after k + newly computed output - freed predecessors
                    mem_freed = quicksum(permute_ram[i] * self.Free_E[t, eidx] for (eidx, i) in self.g.predecessors_indexed(k))
                    self.m.addLConstr(
                        self.U[t, k + 1], GRB.EQUAL, self.U[t, k] + self.R[t, k + 1] * permute_ram[k + 1] - mem_freed
                    )
    if self.model_file is not None and self.g.size < 200:  # skip for big models to save runtime
        with Timer("Saving model", extra_data={"T": str(T), "budget": str(budget)}):
            self.m.write(self.model_file)
    return None  # return value ensures ray remote call can be chained
def solve(self):
    """Optimize the (possibly fractional) Gurobi model and extract variable values.

    When a seed strategy was supplied, a constrained presolve is attempted first;
    if that seed is infeasible, the seed constraints are removed and the solve restarts.

    Returns:
        (Rout, Sout, Uout, Free_Eout) numpy arrays, or (None, None, None, None)
        if variable values could not be read after optimization.

    Raises:
        ValueError: if the model is infeasible, or reports feasible but holds no solution.
    """
    T = self.g.size
    with Timer("Gurobi model optimization", extra_data={"T": str(T), "budget": str(self.budget)}):
        if self.seed_s is not None:
            # short constrained presolve with the seed schedule pinned
            self.m.Params.TimeLimit = self.GRB_CONSTRAINED_PRESOLVE_TIME_LIMIT
            self.m.optimize()
            if self.m.status == GRB.INFEASIBLE:
                print("Infeasible ILP seed at budget {:.2E}".format(self.budget))
            # drop the seed constraints and restart with the user time limit
            self.m.remove(self.init_constraints)
            self.m.Params.TimeLimit = self.gurobi_params.get("TimeLimit", 0)
            self.m.message("\n\nRestarting solve\n\n")
        with Timer("ILPSolve") as solve_ilp:
            self.m.optimize()
        self.solve_time = solve_ilp.elapsed

    if self.m.status == GRB.INFEASIBLE:
        raise ValueError("Infeasible model, check constraints carefully. Insufficient memory?")
    if self.m.solCount < 1:
        raise ValueError("Model status is {} (not infeasible), but solCount is {}".format(self.m.status, self.m.solCount))

    # BUGFIX: np.float was removed in NumPy 1.24; use the builtin float for the
    # fractional (non-integral) case. Hoist the dtype choice out of the four calls.
    value_dtype = checkmate.core.utils.solver_common.SOLVER_DTYPE if self.integral else float
    Rout = np.zeros((T, T), dtype=value_dtype)
    Sout = np.zeros((T, T), dtype=value_dtype)
    Uout = np.zeros((T, T), dtype=value_dtype)
    Free_Eout = np.zeros((T, len(self.g.edge_list)), dtype=checkmate.core.utils.solver_common.SOLVER_DTYPE)
    solver_dtype_cast = int if self.integral else float
    try:
        for t in range(T):
            for i in range(T):
                # Gurobi Vars expose .X; relaxed/plain values are used directly.
                try:
                    Rout[t][i] = solver_dtype_cast(self.R[t, i].X)
                except (AttributeError, TypeError):
                    Rout[t][i] = solver_dtype_cast(self.R[t, i])
                # CONSISTENCY FIX: probe .X first like R/U/Free_E (previously the
                # raw value was tried first; both orders produce the same result,
                # this just makes all four extractions uniform).
                try:
                    Sout[t][i] = solver_dtype_cast(self.S[t, i].X)
                except (AttributeError, TypeError):
                    Sout[t][i] = solver_dtype_cast(self.S[t, i])
                try:
                    Uout[t][i] = self.U[t, i].X * self.ram_gcd
                except (AttributeError, TypeError):
                    Uout[t][i] = self.U[t, i] * self.ram_gcd
            for e in range(len(self.g.edge_list)):
                try:
                    Free_Eout[t][e] = solver_dtype_cast(self.Free_E[t, e].X)
                except (AttributeError, TypeError):
                    Free_Eout[t][e] = solver_dtype_cast(self.Free_E[t, e])
    except AttributeError as e:
        logging.exception(e)
        return None, None, None, None

    # prune R using closed-form solver
    if self.solve_r and self.integral:
        Rout = solve_r_opt(self.g, Sout)
    return Rout, Sout, Uout, Free_Eout
logging.error( "Skipping Griewank baselines as it was broken in parasj/checkmate#65" ) # scheduler_result_griewank = solve_griewank(g, B) # plot_schedule(scheduler_result_griewank, False, save_file=scratch_dir / "GRIEWANK.png") # data.append( # { # "Strategy": str(scheduler_result_griewank.solve_strategy.value), # "Name": "GRIEWANK", # "CPU": scheduler_result_griewank.schedule_aux_data.cpu, # "Activation RAM": scheduler_result_griewank.schedule_aux_data.activation_ram, # } # ) with Timer("ilp") as timer_ilp: scheduler_result_ilp = solve_ilp_gurobi( g, B, seed_s=scheduler_result_sqrtn.schedule_aux_data.S) plot_schedule(scheduler_result_ilp, False, save_file=scratch_dir / "CHECKMATE_ILP.png") data.append({ "Strategy": str(scheduler_result_ilp.solve_strategy.value), "Name": "CHECKMATE_ILP", "CPU": scheduler_result_ilp.schedule_aux_data.cpu, "Activation RAM": scheduler_result_ilp.schedule_aux_data.activation_ram, })