def graph_latency(graph: Graph, batch_size=1, warmup=2, number=6, repeat=6, profile_stage=False):
    if lib is None:
        raise FileNotFoundError('IOS Backend library not found')
    num_stages = sum(len(b.stages) for b in graph.blocks)
    # Buffers the backend fills in: one latency per repeat, and optionally one per (repeat, stage) pair.
    results_t = ctypes.c_float * int(repeat)
    results = results_t()
    stage_results_t = ctypes.c_float * int(repeat * num_stages)
    stage_results = stage_results_t()
    for i in range(repeat * num_stages):
        stage_results[i] = 0.0
    lib.graph_latency(
        c_string(json.dumps(graph.export_config())),
        ctypes.c_int(batch_size),
        ctypes.c_int(warmup),
        ctypes.c_int(number),
        ctypes.c_int(repeat),
        ctypes.c_int(1 if profile_stage else 0),
        ctypes.cast(results, ctypes.POINTER(ctypes.c_float)),
        ctypes.cast(stage_results, ctypes.POINTER(ctypes.c_float)))
    if profile_stage:
        return [float(v) for v in results], [float(v) for v in stage_results]
    else:
        return [float(v) for v in results]
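
# A minimal usage sketch of graph_latency, kept as an uncalled helper so importing this module has
# no side effects. The schedule path below is hypothetical; loading a Graph from a JSON config
# mirrors how main() in this repository does it.
def _example_graph_latency():
    with open('schedules/example.json', 'r') as f:  # hypothetical schedule config
        g = Graph.from_config(json.load(f))
    per_repeat = graph_latency(g, batch_size=1, warmup=2, number=6, repeat=6)
    print('mean latency over repeats:', sum(per_repeat) / len(per_repeat))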
def graph_inference(graph: Graph, batch_size, input: np.ndarray):
    # Allocate the output buffer; its shape is determined by the exit node of the last block.
    output = np.empty(shape=(batch_size, *graph.blocks[-1].exit_node.output_shape), dtype=np.float32)
    conv_nodes: List[Conv] = list(get_nodes_by_type(graph.nodes(), Conv))
    output_data = output.ctypes.data_as(ctypes.c_void_p)
    lib.graph_inference(
        c_string(json.dumps(graph.export_config())),
        ctypes.c_int(batch_size),
        input.ctypes.data_as(ctypes.c_void_p),
        ctypes.c_int(len(conv_nodes)),
        c_string_list([node.name for node in conv_nodes]),
        c_ndarray_list([node.weight for node in conv_nodes]),
        c_ndarray_list([node.bias for node in conv_nodes]),
        output_data)
    return output
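
# A minimal usage sketch of graph_inference, also uncalled. It assumes the network input shape is
# (batch_size, *graph.input.output_shape), mirroring how the output shape above is taken from the
# exit node of the last block; the Conv nodes are expected to carry real weights and biases.
def _example_graph_inference(graph: Graph, batch_size: int = 1) -> np.ndarray:
    x = np.random.uniform(size=(batch_size, *graph.input.output_shape)).astype(np.float32)
    return graph_inference(graph, batch_size, x)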
def main():
    logs = {}
    with open(f'schedules/{args.graph}.json', 'r') as f:
        graph = Graph.from_config(json.load(f))
    cost_model = IOSCostModel()
    name = graph.name
    graph_latency = cost_model.get_graph_latency(graph, args.bs, warmup=args.warmup, number=args.number, repeat=args.repeat)
    block_latency = [
        np.mean(cost_model.get_block_latency(block, args.bs, args.warmup, args.number, args.repeat))
        for block in graph.blocks
    ]
    logs[name] = {}
    logs[name]['latency'] = graph_latency
    logs[name]['mean'] = float(np.mean(graph_latency))
    logs[name]['std'] = float(np.std(graph_latency))
    logs[name]['block_latency'] = block_latency
    summary = summary_str(np.mean(graph_latency))
    print(summary)
    for bindex, block in enumerate(graph.blocks):
        block_dir = f'{expr_dir}/{name}_blocks'
        os.makedirs(block_dir, exist_ok=True)
        draw_block(block, f'{block_dir}/{bindex}.png',
                   f'{name} block {bindex}, latency {block_latency[bindex]:.3f}')
    draw(graph, f"{expr_dir}/{name}.png", label=f'{name}, latency {float(np.mean(graph_latency)):.3f}')
    with open(f"{expr_dir}/{name}.json", "w") as f:
        json.dump(graph.export_config(), f, indent=2)
    with open(f'{expr_dir}/latency.json', 'w') as f:
        json.dump(logs, f, indent=2)
    with open(f'{expr_dir}/summary.txt', 'w') as f:
        f.write(summary + "\n")
    with open(f'{expr_dir}/arguments.txt', 'w') as f:
        json.dump(args.__dict__, f, indent=2)
def optimize(graph: Graph,
             batch_size=1,
             cost_model: Optional[CostModel] = None,
             opt_type: str = 'dp_parallel_merge',
             warmup=2, number=6, repeat=6,
             max_num_groups=8,
             max_part_size=100,
             compute_weight=False,
             max_group_size=3,
             debug_dp_info=None,
             verbose=False) -> Graph:
    """
    Optimize the computation graph and generate a highly optimized schedule.

    :param graph: Graph
        The computation graph to be optimized.
    :param batch_size: int
        The batch size that IOS optimizes for.
    :param cost_model: ios.cost_model.CostModel
        The cost model used to measure the latency of stages. It can be either ios.cost_model.IOSCostModel()
        or ios.cost_model.RandomCostModel(); the latter can be used to generate a random schedule. The
        default cost model is ios.cost_model.IOSCostModel().
    :param opt_type: str, one of 'dp_parallel_merge', 'dp_parallel', and 'dp_merge'
        The optimization type, which specifies the parallelization strategies that can be used in the
        optimization. There are two parallelization strategies for each stage: 'operator merge' and
        'concurrent execution'. When opt_type='dp_parallel', only 'concurrent execution' is used. When
        opt_type='dp_merge', only 'operator merge' is used. When opt_type='dp_parallel_merge', both
        strategies are used.
    :param warmup: int, default 2
        The number of warm-up executions in a stage latency measurement.
    :param number: int, default 6
        The number of executions in one repeat during the latency measurement.
    :param repeat: int, default 6
        The number of repeats during the latency measurement. In total, a stage is executed
        warmup + number * repeat times in a stage latency measurement. The average latency is used to
        estimate the stage latency.
    :param max_num_groups: int, default 8
        The maximum number of parallel groups in a stage.
    :param max_part_size: int, default 100
        When the number of operators in a block is larger than max_part_size, the block is split into
        multiple parts, each containing at most max_part_size operators.
    :param compute_weight: boolean, default False
        Compute the weights for the optimized computation graph. This is not necessary when we only want
        to measure the latency of the optimized computation graph. However, when you want to execute the
        optimized computation graph with real weights and data, set it to True.
    :param max_group_size: int, default 3
        The maximum size of a group in each stage.
    :param debug_dp_info: None or a dict, default None
        Collect debug information of the optimization. The debug information is stored in the dict when it
        is not None.
    :param verbose: boolean, default False
        Print verbose information during optimization.
    :return: Graph
        The optimized computation graph and execution schedule.
""" if cost_model is None: cost_model = IOSCostModel() graph_enter = Placeholder(graph.input.name, graph.input.hint_name, graph.input.output_shape) graph_enter.output_shape = graph.enter_node.output_shape blocks = [] on_debug = debug_dp_info is not None if on_debug: assert isinstance(debug_dp_info, dict) debug_dp_info['dp'] = {} debug_dp_info['stage_latency'] = {} debug_dp_info['#states'] = [] debug_dp_info['#transitions'] = [] debug_dp_info['#schedules'] = [] debug_dp_info['width'] = [] debug_dp_info['#operators'] = [] debug_dp_info['meta'] = [] log(f"optimize {opt_type} on {graph.name}", verbose) assert 'parallel' in opt_type or 'merge' in opt_type for bindex, block in enumerate(graph.blocks): all_nodes = block.inner_nodes + [block.exit_node] node_parts = block.parts nid: Dict[Node, int] = {node: i for i, node in enumerate(all_nodes)} idn: Dict[int, Node] = {i: node for i, node in enumerate(all_nodes)} if node_parts is None: node_parts = [] for idx_part in range( (len(all_nodes) + max_part_size - 1) // max_part_size): begin = idx_part * max_part_size end = min((idx_part + 1) * max_part_size, len(all_nodes)) node_parts.append([all_nodes[i] for i in range(begin, end)]) log( f"block {bindex} with {len(all_nodes)} nodes {len(node_parts)} parts", verbose) stage_list = [] for part_index, npart in enumerate(node_parts): log(f"part {part_index} with {len(npart)} nodes", verbose) ipart = [nid[nd] for nd in npart] dp: Dict[int, float] = {} ep: Dict[int, Tuple[List[int], str]] = {} merge_latency: Dict[int, float] = {} parallel_latency: Dict[int, float] = {} part_graph = build_graph(npart, nid) chains = graph_chain_decomposition(part_graph) max_num_endings = functools.reduce( operator.mul, [len(chain) + 1 for chain in chains]) if verbose: print(f"#Chains: {len(chains)}") print(f"Max number of endings: {max_num_endings}") if on_debug: debug_dp_info['#states'].append(0) debug_dp_info['#transitions'].append(0) debug_dp_info['#schedules'].append(0) debug_dp_info['width'].append(0) debug_dp_info['#operators'].append(len(npart)) debug_dp_info['meta'].append({0: 1}) ustate = sum(1 << i for i in ipart) dop(ustate, block, chains, on_debug, debug_dp_info, idn, nid, dp, ep, opt_type, max_group_size, max_num_groups, merge_latency, parallel_latency, cost_model, batch_size, warmup, number, repeat) stage_list.extend(get_stage_list(ep, ustate)) if on_debug: for ss in parallel_latency: stage_name = ' '.join( ['parallel'] + [nd.name for nd in state2nset(ss, idn)]) debug_dp_info['stage_latency'][ stage_name] = parallel_latency[ss] for ss in merge_latency: stage_name = ' '.join( ['merge'] + [nd.name for nd in state2nset(ss, idn)]) debug_dp_info['stage_latency'][stage_name] = merge_latency[ ss] for s in dp: state_name = ' '.join(nd.name for nd in state2nset(s, idn)) stage_name = ' '.join(nd.name for nd in state2nset( sum(1 << u for u in itertools.chain(*ep[s][0])), idn)) debug_dp_info['dp'][ state_name] = f'{dp[s]:.3f} ({ep[s][1]} {stage_name})' debug_dp_info['#schedules'][-1] = debug_dp_info['meta'][-1][ ustate] new_block = construct(stage_list, block, blocks, graph_enter, idn, nid, compute_weight) blocks.append(new_block) new_graph = Graph(graph.name + "_" + opt_type, graph_enter, blocks) new_graph.infer_shape() return new_graph