def convert(self, model: "keras.models.Model") -> Graph:
    """convert(model)

    Convert Keras model into WebDNN IR Graph. First, WebDNN tries to convert the backend
    TensorFlow graph with TensorFlowConverter. If TensorFlowConverter fails to convert,
    KerasConverter converts the model by itself.

    Args:
        model (`keras.models.Model`): keras model

    .. admonition:: example

        .. code::

            model = keras.models.load_model("pre_trained_model.h5")
            graph = KerasConverter(batch_size=1).convert(model)

    Returns:
        (:class:`~webdnn.graph.graph.Graph`): WebDNN IR Graph
    """
    if not self._use_tensorflow_converter:
        return self._convert_fallback(model)

    else:
        # noinspection PyBroadException
        try:
            return TensorFlowConverter(session=K.get_session(), batch_size=self._batch_size).convert(model.inputs, model.outputs)

        except Exception:
            self._use_tensorflow_converter = False
            console.debug(traceback.format_exc())
            console.debug("[KerasConverter] TensorFlowConverter failed to convert.")

            return self._convert_fallback(model)
def optimize(self, graph: Graph) -> Tuple[Graph, bool]:
    """optimize(graph)

    Optimize the given graph. In a single call, this rule is applied repeatedly until the
    graph is no longer changed.

    Args:
        graph(:class:`~webdnn.Graph`): Computational graph

    Returns:
        (tuple of :class:`~webdnn.Graph` and bool): Optimized graph and a flag indicating
        whether the graph was changed or not.
    """
    if not all(self.flags()):
        return graph, False

    flag_retry = True
    flag_totally_changed = False
    while flag_retry:
        flag_retry = False

        for sub_rule in self.sub_rules:
            if not all(sub_rule.flags()):
                continue

            graph, flag_changed = sub_rule.optimize(graph)
            if flag_changed:
                console.debug(f"[OptimizeRule] apply: {sub_rule.__class__.__name__}")

            flag_retry |= flag_changed

        flag_totally_changed |= flag_retry

    return graph, flag_totally_changed
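# A minimal sketch of a sub-rule that plugs into the fixed-point loop above (the RemoveNopRule
# and Nop operator names are hypothetical, not part of WebDNN): a sub-rule reports
# flag_changed=True only when it mutated the graph, so `optimize` keeps re-running all
# sub-rules until a whole pass leaves the graph unchanged.
class RemoveNopRule(OptimizeRule):
    def flags(self):
        return [flags.optimize.OPTIMIZE]

    def optimize(self, graph: Graph) -> Tuple[Graph, bool]:
        flag_changed = False
        for op in traverse.listup_operators(graph):
            if isinstance(op, Nop):  # hypothetical identity operator
                x = op.inputs["x0"]
                y = op.outputs["y"]
                op.remove_all()      # detach the operator from the graph
                x.replace(y)         # splice x's producer directly onto y
                flag_changed = True
        return graph, flag_changed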
def _optimize_inplace(operators: List[Operator], allocations_dict: AllocationDict):
    if not (flags.optimize.OPTIMIZE and
            flags.optimize.OPTIMIZE_MEMORY_ALLOCATION and
            flags.optimize.OPTIMIZE_INPLACE_OPERATION):
        console.debug('_optimize_inplace is skipped')
        return

    for op in operators:
        for attr in op.get_attribute(Inplace):
            v_in = attr.get_input()
            v_out = attr.get_output()

            if v_in.has_attribute(Input):
                continue

            if isinstance(v_in, ConstantVariable):
                continue

            if any(v_in.stride_dict[a] != v_out.stride_dict[a]
                   for a in v_out.order.axes if a in v_in.order.axes):
                continue

            _merge_allocation(allocations_dict, allocations_dict[v_in], allocations_dict[v_out])
def _optimize_inplace(operators: List[Operator], allocations_dict: AllocationDict):
    if not (flags.optimize.OPTIMIZE and
            flags.optimize.OPTIMIZE_MEMORY_ALLOCATION and
            flags.optimize.OPTIMIZE_INPLACE_OPERATION):
        console.debug('_optimize_inplace is skipped')
        return

    for op in operators:
        for attr in op.get_attribute(Inplace):  # type: Inplace
            _merge_allocation(allocations_dict,
                              allocations_dict[attr.get_input()],
                              allocations_dict[attr.get_output()])
def _convert_operator(self, proto: INodeProto):
    console.debug(f"-----------------------------------------------------------")
    console.debug(f"Type  : {proto.op_type}")
    console.debug(f"Input : {proto.input}")
    console.debug(f"Output: {proto.output}")
    for name, val in attribute_dict(proto).items():
        console.debug(f"Attr  : {name} = {val}")

    super(ONNXConverter, self)._convert_operator(proto)
def dump_op(op: Operator):  # pragma: no cover
    parameters_sorted = [repr(key) + ': ' + str(op.parameters[key]) for key in sorted(op.parameters.keys())]
    console.debug(f"{op.__class__.__name__} : {op.name}")
    console.debug(f"    In  : {op.inputs}")
    console.debug(f"    Out : {op.outputs}")
    console.debug(f"    Attr: {', '.join(sorted(str(attr) for attr in op.attributes))}")
    console.debug(f"    Parameters: {{{', '.join(parameters_sorted)}}}")
def validate_kernel_source(descriptor: GraphDescriptor):
    # FIXME: WebGPU supports multiple shader languages, but this test assumes the language is METAL.
    source = descriptor.concat_kernel_sources()

    if os.name != 'posix':
        # os.name on mac is 'posix', and the 'xcrun' command exists only on mac
        console.warning("[WebGPUDescriptorGenerator] 'xcrun' command is available only on macOS. Validation of generated "
                        "source code in webgpu backend is skipped.")
        return

    with tmp.TemporaryDirectory() as tmpdir:
        source_path = path.join(tmpdir, "kernel.metal")
        lib_path = path.join(tmpdir, "kernel.air")

        with open(source_path, "w+") as f:
            f.write(source)

        try:
            result = subprocess.run(["xcrun", "-sdk", "macosx", "metal", source_path, "-o", lib_path],
                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            if result.returncode == 0:
                if result.stderr == b"":
                    console.debug("[WebGPUDescriptorGenerator] Generated kernel source is valid.")

                else:
                    console.warning("[WebGPUDescriptorGenerator] In validating kernel source, warnings are generated.")
                    console.stderr(result.stderr.decode("utf-8"))

            else:
                console.error("[WebGPUDescriptorGenerator] Generated kernel source is invalid.")
                console.stderr(result.stderr.decode("utf-8"))
                exit(result.returncode)

        except FileNotFoundError:
            console.warning("[WebGPUDescriptorGenerator] 'xcrun' command is not found. Validation of generated source code "
                            "in webgpu backend is skipped.")
            return
def generate(cls, graph: Graph, **kwargs):
    if flags.DEBUG:
        traverse.dump(graph)

    memory_layout = allocate(graph)
    console.debug(f"[FallbackDescriptorGenerator] memory_layout total size: {memory_layout.total_size * 4}[B]")
    console.debug(f"[FallbackDescriptorGenerator] memory_layout static size: {memory_layout.static_size * 4}[B]")
    console.debug(f"[FallbackDescriptorGenerator] memory_layout dynamic size: {memory_layout.dynamic_size * 4}[B]")

    constant_encoder = ConstantEncoder.get_encoder(kwargs.get("constant_encoder_name", None))
    constants_bytes = constant_encoder.encode(memory_layout)
    console.debug(f"[FallbackDescriptorGenerator] constants encoded size: {len(constants_bytes)}[B]")

    descriptor = GraphDescriptor(
        kernels=cls.generate_kernels(graph, memory_layout),
        memory_layout=memory_layout,
        inputs=graph.inputs,
        outputs=graph.outputs,
        constants_encoding=constant_encoder.name,
        licenses=graph.licenses)

    return GraphExecutionData(graph, descriptor, constants_bytes)
def generate(cls, graph: Graph, **kwargs):
    graph, _ = WebGPUOptimizeRule().optimize(graph)

    if flags.DEBUG:
        traverse.dump(graph)

    memory_layout = allocate(graph)
    console.debug(f"[WebGPUDescriptorGenerator] memory_layout total size: {memory_layout.total_size * 4}[B]")
    console.debug(f"[WebGPUDescriptorGenerator] memory_layout static size: {memory_layout.static_size * 4}[B]")
    console.debug(f"[WebGPUDescriptorGenerator] memory_layout dynamic size: {memory_layout.dynamic_size * 4}[B]")

    constant_encoder = ConstantEncoder.get_encoder(kwargs.get("constant_encoder_name", None))
    constants_bytes = constant_encoder.encode(memory_layout)
    console.debug(f"[WebGPUDescriptorGenerator] constants encoded size: {len(constants_bytes)}[B]")

    kernels = cls.generate_kernels(graph, memory_layout)

    descriptor = GraphDescriptor(
        kernels=kernels,
        memory_layout=memory_layout,
        inputs=graph.inputs,
        outputs=graph.outputs,
        constants_encoding=constant_encoder.name,
        licenses=graph.licenses)

    if flags.optimize.VALIDATE_GENERATED_SOURCE:
        validate_kernel_source(descriptor)

    return GraphExecutionData(graph, descriptor, constants_bytes)
def convert(self, chainer_computational_graph: chainer.computational_graph.ComputationalGraph,
            input_c_vars: List[chainer.Variable],
            output_c_vars: List[chainer.Variable]) -> Graph:
    # In chainer v2, variables are represented as Variable and VariableNode objects, and
    # graph information such as edge connections is contained in the variable node.
    # Therefore all chainer variables must be normalized into variable nodes.
    input_c_vars = [_to_variable_node(v) for v in input_c_vars]
    output_c_vars = [_to_variable_node(v) for v in output_c_vars]

    # Append InputVariable attribute to input variables
    input_n_vars = []
    for c_var in input_c_vars:
        n_var = self._convert_var(c_var)
        n_var.attributes.add(Input(n_var))
        input_n_vars.append(n_var)

    self._convert_weight_vars(chainer_computational_graph)

    pending_c_oprs = [c_opr for c_opr in chainer_computational_graph.nodes if isinstance(c_opr, chainer.Function)]

    while len(pending_c_oprs) > 0:
        for c_opr in pending_c_oprs:
            if all(self.has_variable(_to_variable_node(c_var)) for c_var in c_opr.inputs):
                # All input variables of `c_opr` are converted, so `c_opr` itself can be converted.
                self.convert_operator(c_opr)
                pending_c_oprs.remove(c_opr)
                break  # for c_opr in pending_c_oprs
        else:
            console.debug(pending_c_oprs)
            raise ValueError("Inputs to functions cannot be resolved.")

    # Append OutputVariable attribute to output variables
    output_n_vars = []
    for c_var in output_c_vars:
        if not self.has_variable(c_var):
            raise ValueError("Output variable is not generated by graph.")

        n_var = self.get_variable(c_var)
        n_var.attributes.add(Output(n_var))
        output_n_vars.append(n_var)

    # Convert variable order into the typical one in Chainer
    self._transpose_vars()

    return Graph(input_n_vars, output_n_vars)
def _convert_batch_normalization_function(converter: ChainerConverter,
                                          c_op: chainer.functions.normalization.batch_normalization.BatchNormalizationFunction):
    x = converter.get_variable(c_op.inputs[0])
    gamma = converter.get_variable(c_op.inputs[1])
    beta = converter.get_variable(c_op.inputs[2])

    if len(c_op.inputs) == 5:
        # noinspection PyUnresolvedReferences
        mean_data = converter.get_variable(c_op.inputs[3]).data
        # noinspection PyUnresolvedReferences
        variance_data = converter.get_variable(c_op.inputs[4]).data

    elif len(c_op.inputs) == 3:
        variance_data = c_op.running_var
        mean_data = c_op.running_mean

    else:
        raise ValueError("The number of inputs to BatchNormalizationFunction must be 5 or 3.")
    console.debug(variance_data)

    # Simplify scale and bias
    #
    # from:
    #   y = (x - mean) / sqrt(var + eps) * gamma + beta
    #
    # to:
    #   y = x * gamma_div_std + beta_scaled
    #
    #   gamma_div_std = gamma / sqrt(var + eps)
    #   beta_scaled   = beta - mean * gamma_div_std

    # noinspection PyUnresolvedReferences
    gamma_div_std = gamma.data / np.sqrt(variance_data + c_op.eps)
    # noinspection PyUnresolvedReferences
    beta_scaled = beta.data - mean_data * gamma_div_std

    scale_opr = AxiswiseScale(None, axis=Axis.C)
    gamma_div_std_const = ConstantVariable(gamma_div_std, OrderC)
    scale_out, = scale_opr(x, gamma_div_std_const)

    offset_opr = AxiswiseBias(None, axis=Axis.C)
    beta_scaled_const = ConstantVariable(beta_scaled, OrderC)
    offset_out, = offset_opr(scale_out, beta_scaled_const)

    converter.set_variable(c_op.outputs[0](), offset_out)
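# The folding above can be checked numerically with NumPy (a standalone sketch with made-up
# sample values): the fused scale/bias must reproduce the textbook batch-normalization
# formula for any input.
import numpy as np

x = np.random.rand(4).astype(np.float32)
gamma, beta = 1.5, -0.25
mean, var, eps = 0.3, 0.8, 1e-5

y_reference = (x - mean) / np.sqrt(var + eps) * gamma + beta

gamma_div_std = gamma / np.sqrt(var + eps)
beta_scaled = beta - mean * gamma_div_std
y_fused = x * gamma_div_std + beta_scaled

assert np.allclose(y_reference, y_fused)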
def _optimize_inplace(operators: Sequence[Operator], allocations_dict: AllocationDict):
    if not (flags.optimize.OPTIMIZE and
            flags.optimize.OPTIMIZE_MEMORY_ALLOCATION and
            flags.optimize.OPTIMIZE_INPLACE_OPERATION):
        console.debug('_optimize_inplace is skipped')
        return

    for op in operators:
        for attr in op.get_attribute(Inplace):  # type: Inplace
            a1 = allocations_dict[attr.get_input()]
            a2 = allocations_dict[attr.get_output()]
            if not Placeholder.check_resolved(a1.size) or not Placeholder.check_resolved(a2.size):
                continue

            _merge_allocation(allocations_dict, a1, a2)
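# A small usage sketch of the Inplace attribute consumed above (standalone helper; `ops` is
# any iterable of WebDNN operators): collect every (input, output) variable pair that an
# operator declares safe to compute in place, which is exactly the candidate set that
# _optimize_inplace tries to merge into a single allocation.
def list_inplace_pairs(ops):
    pairs = []
    for op in ops:
        for attr in op.get_attribute(Inplace):  # type: Inplace
            pairs.append((attr.get_input(), attr.get_output()))
    return pairs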
def convert(self, model: "keras.models.Model") -> Graph:
    """convert(model)

    Convert Keras model into WebDNN IR Graph. First, WebDNN tries to convert the backend
    TensorFlow graph with TensorFlowConverter. If TensorFlowConverter fails to convert,
    KerasConverter converts the model by itself.

    Args:
        model (`keras.models.Model`): keras model

    .. admonition:: example

        Convert pre-trained keras ResNet model.

        .. code::

            import keras
            from webdnn.frontend.keras import KerasConverter

            model = keras.applications.resnet50.ResNet50(include_top=True, weights='imagenet')
            graph = KerasConverter(batch_size=1).convert(model)

    Returns:
        (:class:`~webdnn.graph.graph.Graph`): WebDNN IR Graph
    """
    if not self._use_tensorflow_converter:
        return self._convert_fallback(model)

    else:
        # noinspection PyBroadException
        try:
            return TensorFlowConverter(session=K.get_session(), batch_size=self._batch_size).convert(model.inputs, model.outputs)

        except Exception:
            self._use_tensorflow_converter = False
            console.debug(traceback.format_exc())
            console.debug("[KerasConverter] TensorFlowConverter failed to convert.")

            return self._convert_fallback(model)
def generate(cls, graph: Graph, **kwargs):
    graph, _ = WebassemblyOptimizeRule().optimize(graph)

    if flags.DEBUG:
        traverse.dump(graph)

    memory_layout = Allocator.allocate(graph)
    console.debug(f"[WebassemblyDescriptorGenerator] memory_layout total size: {memory_layout.total_size * 4}[B]")
    console.debug(f"[WebassemblyDescriptorGenerator] memory_layout static size: {memory_layout.static_size * 4}[B]")
    console.debug(f"[WebassemblyDescriptorGenerator] memory_layout dynamic size: {memory_layout.dynamic_size * 4}[B]")

    constant_encoder = ConstantEncoder.get_encoder(kwargs.get("constant_encoder_name", None))
    constants_bytes = constant_encoder.encode(memory_layout)
    console.debug(f"[WebassemblyDescriptorGenerator] constants encoded size: {len(constants_bytes)}[B]")

    kernels = cls.generate_kernels(graph, memory_layout)

    heap_block_size = 16 * 1024 * 1024
    if isinstance(memory_layout.dynamic_size, int):
        dynamic_size_byte_int = memory_layout.dynamic_size * 4
    else:
        dynamic_size_byte_int = kwargs.get("dynamic_allocation_size", heap_block_size)

    total_size_byte = memory_layout.static_size * 4 + dynamic_size_byte_int

    # Round the total size up to a multiple of the block size, then reserve one extra block.
    required_heap = ((total_size_byte + heap_block_size - 1) // heap_block_size + 1) * heap_block_size

    descriptor = GraphDescriptor(
        kernels=kernels,
        memory_layout=memory_layout,
        inputs=graph.inputs,
        outputs=graph.outputs,
        constants_encoding=constant_encoder.name,
        required_heap=required_heap,
        licenses=graph.licenses)

    return GraphExecutionData(graph, descriptor, constants_bytes)
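# A worked example of the required_heap arithmetic above (hard-coded sample numbers): with a
# 16MiB block and a 20MiB total, the ceiling division yields 2 blocks, and one extra block is
# reserved on top, so 48MiB is requested.
heap_block_size = 16 * 1024 * 1024
total_size_byte = 20 * 1024 * 1024
required_heap = ((total_size_byte + heap_block_size - 1) // heap_block_size + 1) * heap_block_size
assert required_heap == 48 * 1024 * 1024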
def _listup_operations(inputs: Sequence[T_NODE], outputs: Sequence[T_NODE]):
    def get_prev_nodes(node: T_NODE) -> Sequence[T_NODE]:
        if node in inputs:
            return []

        elif isinstance(node, tf.Tensor):
            return [node.op]

        else:
            return node.inputs

    result = []  # type: List[tf.Operation]
    stack = [(node, None) for node in outputs]  # type: List[Tuple[T_NODE, T_NODE]]
    dependency_count = {}  # type: Dict[T_NODE, int]

    while len(stack) > 0:
        node_from, node_to = stack.pop()

        if node_from not in dependency_count:
            stack.append((node_from, node_to))

            prev_nodes = get_prev_nodes(node_from)
            dependency_count[node_from] = 0
            for prev_node in prev_nodes:
                if dependency_count.get(prev_node, 1) > 0:
                    dependency_count[node_from] += 1
                    stack.append((prev_node, node_from))

        elif dependency_count[node_from] == 0:
            if isinstance(node_from, tf.Operation):
                result.append(node_from)

            if node_to is not None:
                dependency_count[node_to] -= 1

        else:
            console.debug("[TensorFlowConverter] Cycle is detected in computation graph")
            console.debug("cycle starting node:")
            console.debug(node_from)
            raise CyclicGraphError("[TensorFlowConverter] Cycles are detected, but TensorFlowConverter cannot convert a cyclic graph")

    return result
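# The traversal above is an iterative DFS that counts unresolved predecessors. The same idea
# on a toy dict-based graph, free of TensorFlow types (node names and graph structure are
# made up for illustration):
def toposort(outputs, prev):  # prev maps node -> list of predecessor nodes
    result, stack, dep = [], [(n, None) for n in outputs], {}
    while stack:
        node, successor = stack.pop()
        if node not in dep:
            stack.append((node, successor))  # revisit once predecessors are resolved
            dep[node] = 0
            for p in prev.get(node, []):
                if dep.get(p, 1) > 0:
                    dep[node] += 1
                    stack.append((p, node))
        elif dep[node] == 0:
            result.append(node)
            if successor is not None:
                dep[successor] -= 1
        else:
            raise ValueError("cycle detected")
    return result

assert toposort(["y"], {"y": ["a", "b"], "a": ["x"], "b": ["x"]}) == ["x", "b", "a", "y"]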
def _optimize_buffer_reuse(allocations_dict: AllocationDict):
    """
    Optimize memory size by reusing buffers where possible.

    Algorithm:

    Consider 5 variables with the following sizes and lifetimes.

    Size and Lifetime)

        var |size| Lifetime (t=0 -> ...)
        ----+----+---------------
         a  |  5 | [0, 2)
         b  |  4 | [2, 4)
         c  |  2 | [3, 5)
         d  |  1 | [6, 8)
         e  |  3 | [0, 5)
        ----+----+---------------

    In this case, we want to get the following optimized allocation:

             ---------> address
        time | aaaaa_e
             | aaaaa_e
             | bbbb__e
             | bbbbcce
             | ____cce
             V d______
               d______

    First, construct the "Merge Offset Table".

        table = {                                  reduced_size = {
            a: {},                                     a: {},
            b: { a: 0 },                               b: { a: 4 },
            c: { a: 0, b: 4 },                         c: { a: 2, b: 0 }
            d: { a: 0, b: 0, c: 0, e: 0 },             d: { a: 1, b: 1, c: 1, e: 1 },
            e: { a: 5, b: 4, c: 2, d: 0 }              e: { a: 0, b: 0, c: 0, d: 1 }
        }                                          }

    `table[x][y]` is the offset value used when variable `x` is merged into variable `y`.
    For example, when `b` is merged into (= reuses the memory allocated for) `a`, the offset
    value is `0` because they do not exist at the same time. However, when `c` is merged into
    `b`, the offset value is `4` because they exist at the same time (t=3).

    Next, for each mergeable pair, calculate the size reduced if the two variables are merged.
    For example, if `b` is merged into `a`, the reduced size is `4`.

    Then merge the pair with the largest reduced size. In this case such a pair is `a` and `b`.
    Update the table:

        table = {                            reduced_size = {
            ab: {},                              ab: {},
            c: { ab: 4 },                        c: { ab: 1 }
            d: { ab: 0, c: 0, e: 0 },            d: { ab: 1, c: 1, e: 1 },
            e: { ab: 5, c: 2, d: 0 }             e: { ab: 0, c: 0, d: 1 }
        }                                    }

    Iterate this procedure until all variables are merged into a single allocation.

    Merge `d` into `ab` with offset `0`:

        table = {                        reduced_size = {
            abd: {},                         abd: {},
            c: { abd: 4 },                   c: { abd: 1 },
            e: { abd: 5, c: 2 }              e: { abd: 0, c: 0 }
        }                                }

    Merge `c` into `abd` with offset `4`:

        table = {                   reduced_size = {
            abcd: {},                   abcd: {},
            e: { abcd: 5 }              e: { abcd: 0 }
        }                           }

    Merge `e` into `abcd` with offset `5`:

        table = {               reduced_size = {
            abcde: {}               abcde: {}
        }                       }

    Finish.

    Time order:
        Build table: O(N^2)
        Iteration: O(N) times, each table update: O(N)
        Total: O(N^2)
    """
    if not (flags.optimize.OPTIMIZE and flags.optimize.OPTIMIZE_MEMORY_ALLOCATION):
        console.debug('_optimize_buffer_reuse is skipped')
        return

    allocations = list(set(filter(
        lambda x: Placeholder.check_resolved(x.size) and Placeholder.check_resolved(x.offset),
        allocations_dict.values())))
    allocations = sorted(allocations, key=lambda a: a.size, reverse=True)

    # Construct offset table
    offset_table = {a2: {} for a2 in allocations}
    for i1, a1 in enumerate(allocations):
        for i2, a2 in enumerate(allocations[i1 + 1:]):
            # align offset to 16-byte alignment
            offset_table[a2][a1] = 0 if (a1.end <= a2.begin or a2.end <= a1.begin) else _align(a1.size)

    # Merge
    merge_tree = {}  # type: Dict[Allocation, Tuple[Allocation, int]]
    while len(offset_table) > 1:
        if len(offset_table) % 10 == 0:
            console.debug(f"Memory allocation optimization: {(1 - len(offset_table) / len(allocations)) * 100:4.1f}% complete.")

        # Get max score pair
        max_score = -1
        max_a1 = None
        max_a2 = None
        for a2, a1s in offset_table.items():
            for a1, offset in a1s.items():
                score = max(min(a1.size - offset, a2.size), 0)
                if max_score < score:
                    max_score = score
                    max_a1 = a1
                    max_a2 = a2

        # Merge
        a1 = max_a1
        a2 = max_a2
        offset12 = offset_table[a2][a1]
        merge_tree[a2] = (a1, offset12)

        # Update offset table
        for a3, offset32 in offset_table[a2].items():
            if a1 in offset_table[a3]:
                # +-------+
                # |       V
                # a2->a3->a1
                #
                # condition
                #   - min(a1.size - offset12, a2.size) > min(a1.size - offset13, a3.size)
                #   - a2.size < a3.size < a1.size
                #
                # ==========================================================
                # case 1) offset13 < offset12 + a2.size
                #
                # before)
                # a1 |a1...................|
                # a2 <-offset12---->|a2...|
                # a3 <-offset13--------->|a3............|
                #            <-offset23->|a2...|
                #
                # after)
                # a1 |a1...................|
                # a3 <-offset13------------>|a3............|
                #
                # ==========================================================
                # case 2) offset13 > offset12 + a2.size
                #
                # before)
                # a1 |a1...................|
                # a2 <-offset12---->|a2...|
                # a3 <-offset13------------->|a3............|
                #             <-offset23->|a2...|
                #
                # after)
                # a1 |a1...................|
                # a3 <-offset13------------->|a3............|
                #
                offset_table[a3][a1] = max(offset_table[a3][a1], _align(offset12 + a2.size))

            elif a3 in offset_table[a1]:
                # +-------+
                # |       V
                # a2->a1->a3
                #
                # condition
                #   - min(a1.size - offset12, a2.size) > min(a1.size - offset13, a3.size)
                #   - a2.size < a1.size < a3.size
                #
                # ==========================================================
                # case 1) offset32 <= offset31 + offset12
                #
                # before)
                #    <-offset32----------->|a2...|
                # a3 |a3..........|
                # a1 <-offset31--->|a1........|
                # a2               <-offset12->|a2...|
                #
                # after) nothing changed
                # a3 |a3..........|
                # a1 <-offset31--->|a1........||a2...|
                #
                # ==========================================================
                # case 2) offset32 > offset31 + offset12
                #
                # before)
                #    <-offset32------------------->|a2...........................|
                # a3 |a3.........................|
                # a1 <-offset31->|a1........................................|
                # a2             <-offset12->|a2...........................|
                #
                # after) offset31 = offset32 - offset12
                # a3 |a3.........................|
                # a1 <------offset31->|a1........................................|
                #
                offset_table[a1][a3] = max(offset_table[a1][a3], _align(offset32 - offset12))

        del offset_table[a2]
        for a3, a4s in offset_table.items():
            if a3 == a1:
                continue

            if a2 in a4s:
                if a1 in a4s:
                    # +-------+
                    # |       V
                    # a3->a2->a1
                    a4s[a1] = max(a4s[a1], offset12 + a4s[a2])
                    del a4s[a2]

                else:
                    raise NotImplementedError
                    # # a3->a2->a1
                    # a4s[a1] = offset12 + a4s[a2]
                    # del a4s[a2]

    console.debug(f"Memory allocation optimization: 100.0% complete.")

    if len(offset_table) > 0:
        # Shift the allocation block to 0-offset.
        list(offset_table.keys())[0].offset = 0

    for a2, (a1, offset) in merge_tree.items():
        while a1 in merge_tree:
            a1, offset2 = merge_tree[a1]
            offset += offset2

        a2.offset = offset
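# A small numeric illustration of the scoring rule in the merge loop above, using the sizes
# and offsets from the docstring example: merging `a2` into `a1` at a given offset saves
# min(a1.size - offset, a2.size) bytes, clamped at zero.
def merge_score(a1_size, a2_size, offset):
    return max(min(a1_size - offset, a2_size), 0)

assert merge_score(5, 4, 0) == 4  # b into a (offset 0): reduced size 4, as in the table
assert merge_score(4, 2, 4) == 0  # c into b (offset 4): nothing is saved
assert merge_score(5, 3, 5) == 0  # e into a (offset 5): nothing is saved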
from webdnn.graph.order import OrderNC, OrderNTC
from webdnn.graph.variable import Variable
from webdnn.graph.variables.attributes.input import Input
from webdnn.graph.variables.attributes.output import Output
from webdnn.graph.variables.constant_variable import ConstantVariable
from webdnn.util import console

FLAG_KERAS_INSTALLED = False

try:
    import keras
    import keras.backend as K
    import tensorflow as tf

    if not "2." <= keras.__version__ < "3.":
        console.debug(f"WebDNN supports Keras v2.*.*. Currently, keras {keras.__version__} is installed.")

    FLAG_KERAS_INSTALLED = True

except ImportError as e:
    console.debug("Keras and TensorFlow are not completely installed.")
    pass


def get_default_order(tf_tensor: "tf.Tensor"):
    if len(tf_tensor.shape) == 2:
        return OrderNC

    elif len(tf_tensor.shape) == 3:
        return OrderNTC
def convert(self, chainer_computational_graph: "chainer.computational_graph.ComputationalGraph",
            input_c_vars: List["chainer.Variable"],
            output_c_vars: List["chainer.Variable"]) -> Graph:
    """convert(chainer_computational_graph, input_c_vars, output_c_vars)

    Convert a chainer computational graph into WebDNN IR.

    Instead of using this method directly, you should use
    :func:`convert_from_inout_vars<webdnn.frontend.chainer.ChainerConverter.convert_from_inout_vars>`.

    Args:
        chainer_computational_graph(chainer.computational_graph.ComputationalGraph): chainer computational graph
        input_c_vars(list of chainer.Variable): input chainer variables
        output_c_vars(list of chainer.Variable): output chainer variables

    Returns:
        (:class:`~webdnn.Graph`): WebDNN Graph
    """
    # In chainer v2, variables are represented as Variable and VariableNode objects, and
    # graph information such as edge connections is contained in the variable node.
    # Therefore all chainer variables must be normalized into variable nodes.
    input_c_vars = [_to_variable_node(v) for v in input_c_vars]
    output_c_vars = [_to_variable_node(v) for v in output_c_vars]

    # Append InputVariable attribute to input variables
    input_n_vars = []
    for c_var in input_c_vars:
        n_var = self._convert_var(c_var)
        n_var.attributes.add(Input(n_var))
        input_n_vars.append(n_var)

    self._convert_weight_vars(chainer_computational_graph)

    pending_c_oprs = [c_opr for c_opr in chainer_computational_graph.nodes if isinstance(c_opr, chainer.Function)]

    while len(pending_c_oprs) > 0:
        for c_opr in pending_c_oprs:
            if all(self.has_variable(_to_variable_node(c_var)) for c_var in c_opr.inputs):
                # All input variables of `c_opr` are converted, so `c_opr` itself can be converted.
                self._convert_operator(c_opr)
                pending_c_oprs.remove(c_opr)
                break  # for c_opr in pending_c_oprs
        else:
            console.debug(pending_c_oprs)
            raise ValueError("Inputs to functions cannot be resolved.")

    # Append OutputVariable attribute to output variables
    output_n_vars = []
    for c_var in output_c_vars:
        if not self.has_variable(c_var):
            raise ValueError("Output variable is not generated by graph.")

        n_var = self.get_variable(c_var)
        n_var.attributes.add(Output(n_var))
        output_n_vars.append(n_var)

    graph = Graph(input_n_vars, output_n_vars)

    # Convert variable order into the typical one in Chainer
    self._transpose_vars(graph)

    return graph
try:
    import chainer
    import chainer.computational_graph

    if chainer.__version__ >= "2.":
        chainer_v2 = True
        # noinspection PyUnresolvedReferences
        VariableNode = chainer.variable.VariableNode
    else:
        chainer_v2 = False
        VariableNode = chainer.variable.Variable

    FLAG_CHAINER_INSTALLED = True

except ImportError as e:
    console.debug("Chainer is not completely installed.")
    pass


def _to_variable_node(chainer_variable: Union["chainer.Variable", "VariableNode"]) -> "VariableNode":
    if chainer_v2 and not isinstance(chainer_variable, VariableNode):
        # noinspection PyUnresolvedReferences
        return chainer_variable.node
    else:
        return chainer_variable


class ChainerConverter(Converter["chainer.Function"]):
    """ChainerConverter()
def optimize(self, graph: Graph) -> Tuple[Graph, bool]:
    if not (flags.optimize.OPTIMIZE and flags.optimize.CONCAT_SCALAR_OPERATION):
        return graph, False

    flag_changed = False
    matches = search_sub_structure(graph, [ScalarOperation, Variable, ScalarOperation])
    while len(matches) > 0:
        match = matches[0]
        op1 = match[0]  # type: Operator
        op2 = match[2]  # type: Operator
        y1 = op1.outputs["y"]
        y2 = op2.outputs["y"]

        if isinstance(op1, ScalarAffine):
            if isinstance(op2, ScalarAffine):
                op1.scale = op1.scale * op2.scale
                op1.bias = op1.bias * op2.scale + op2.bias
                op2.remove_all()
                op1.replace_output(y1, y2)

            elif isinstance(op2, ScalarAdd):
                op1.bias += op2.value
                op2.remove_all()
                op1.replace_output(y1, y2)

            elif isinstance(op2, ScalarMul):
                op1.scale *= op2.value
                op1.bias *= op2.value
                op2.remove_all()
                op1.replace_output(y1, y2)

            else:
                console.debug(f"[ConcatScalarOperation] unhandled pair: {type(op1)} and {type(op2)}")

        elif isinstance(op1, ScalarAdd):
            if isinstance(op2, ScalarAffine):
                op2.bias += op1.value * op2.scale
                x = op1.inputs["x0"]
                op1.remove_all()
                x.replace(y1)

            elif isinstance(op2, ScalarAdd):
                op1.parameters["value"] += op2.value
                op2.remove_all()
                op1.replace_output(y1, y2)

            elif isinstance(op2, ScalarMul):
                x = op1.inputs["x0"]
                new_op = ScalarAffine(None, scale=op2.value, bias=op1.value * op2.value)
                new_y, = new_op(x)
                op1.remove_all()
                op2.remove_all()
                y2.replace(new_y)

            else:
                console.debug(f"[ConcatScalarOperation] unhandled pair: {type(op1)} and {type(op2)}")

        elif isinstance(op1, ScalarMul):
            if isinstance(op2, ScalarAffine):
                op2.scale *= op1.value
                x = op1.inputs["x0"]
                op1.remove_all()
                x.replace(y1)

            elif isinstance(op2, ScalarAdd):
                x = op1.inputs["x0"]
                new_op = ScalarAffine(None, scale=op1.value, bias=op2.value)
                new_y, = new_op(x)
                op1.remove_all()
                op2.remove_all()
                y2.replace(new_y)

            elif isinstance(op2, ScalarMul):
                op1.parameters["value"] *= op2.value
                op2.remove_all()
                op1.replace_output(y1, y2)

            else:
                console.debug(f"[ConcatScalarOperation] unhandled pair: {type(op1)} and {type(op2)}")

        flag_changed = True
        matches = search_sub_structure(graph, [ScalarOperation, Variable, ScalarOperation])

    return graph, flag_changed
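# A quick numeric check of the affine-composition rules above (standalone sketch): applying
# y = x * s1 + b1 followed by y' = y * s2 + b2 equals a single affine map with scale s1 * s2
# and bias b1 * s2 + b2, which is exactly how two ScalarAffine operators are folded into one.
s1, b1 = 2.0, 3.0
s2, b2 = 4.0, -1.0
x = 5.0
assert (x * s1 + b1) * s2 + b2 == x * (s1 * s2) + (b1 * s2 + b2)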
from webdnn.graph.graph import Graph
from webdnn.graph.order import Order
from webdnn.graph.variable import Variable
from webdnn.graph.variables.attributes.input import Input
from webdnn.graph.variables.attributes.output import Output
from webdnn.graph.variables.constant_variable import ConstantVariable
from webdnn.util import console

FLAG_ONNX_INSTALLED = False

try:
    import onnx

    FLAG_ONNX_INSTALLED = True

except ImportError as e:
    console.debug("ONNX is not completely installed.")
    pass


def attribute_dict(proto: INodeProto) -> Dict[str, IAttributeProto]:
    return {attr.name: attr for attr in proto.attribute}


class ONNXConverter(Converter["onnx.NodeProto"]):
    """ONNXConverter()

    Converter for `Open Neural Network Exchange (ONNX) <http://onnx.ai/>`_.

    To use this converter, you need to install the ONNX python module. See the
    `ONNX github repository <https://github.com/onnx/onnx>`_.
    """
def dump(graph: Graph):
    indent = ""
    for op in listup_operators(graph):
        parameters_sorted = [repr(key) + ': ' + str(op.parameters[key]) for key in sorted(op.parameters.keys())]
        console.debug(f"---------------------------------------------------------------------------")
        console.debug(f"{indent}{op.__class__.__name__} : {op.name}")
        console.debug(f"{indent}    In  : {op.inputs}")
        console.debug(f"{indent}    Out : {op.outputs}")
        console.debug(f"{indent}    Attr: {sorted([attr.__class__.__name__ for attr in op.attributes])}")
        console.debug(f"{indent}    Parameters: {{{', '.join(parameters_sorted)}}}")
def dump(graph: Graph):
    for op in listup_operators(graph):
        console.debug(f"---------------------------------------------------------------------------")
        dump_op(op)
def _optimize_buffer_reuse(allocations_dict: AllocationDict):
    """
    Optimize memory size by reusing buffers where possible.

    Algorithm:

    Consider 5 variables with the following sizes and lifetimes.

    Size and Lifetime)

        var |size| Lifetime (t=0 -> ...)
        ----+----+---------------
         a  |  5 | [0, 2)
         b  |  4 | [2, 4)
         c  |  2 | [3, 5)
         d  |  1 | [6, 8)
         e  |  3 | [0, 5)
        ----+----+---------------

    In this case, we want to get the following optimized allocation:

             ---------> address
        time | aaaaa_e
             | aaaaa_e
             | bbbb__e
             | bbbbcce
             | ____cce
             V d______
               d______

    First, construct the "Merge Offset Table".

        table = {
            a: {},
            b: { a: 0 },
            c: { a: 0, b: 4, e: 3 },
            d: { a: 0, b: 0, c: 0, e: 0 },
            e: { a: 5, b: 4 }
        }

    `table[x][y]` is the offset value used when variable `x` is merged into variable `y`.
    For example, when `b` is merged into (= reuses the memory allocated for) `a`, the offset
    value is `0` because they do not exist at the same time. However, when `c` is merged into
    `b`, the offset value is `4` because they exist at the same time (t=3).

    Next, for each mergeable pair, calculate the size reduced if the two variables are merged.
    For example, if `b` is merged into `a`, the reduced size is `4`.

    Then merge the pair with the largest reduced size. In this case such a pair is `a` and `b`.
    Update the table:

        table = {
            ab: {},
            c: { ab: 4, e: 3 },
            d: { ab: 0, c: 0, e: 0 },
            e: { ab: 5 }
        }

    Iterate this procedure until all variables are merged into a single allocation.

    Merge `d` into `ab` with offset `0`:

        table = {
            abd: {},
            c: { abd: 4, e: 3 },
            e: { abd: 5 }
        }

    Merge `c` into `abd` with offset `4`:

        table = {
            abcd: {},
            e: { abcd: 5 }
        }

    Merge `e` into `abcd` with offset `5`:

        table = {
            abcde: {},
        }

    Finish.

    Time order:
        Build table: O(N^2)
        Iteration: O(N) times, each table update: O(N)
        Total: O(N^2)
    """
    if not flags.optimize.OPTIMIZE:
        console.debug('_optimize_buffer_reuse is skipped')
        return

    allocations = list(set(filter(lambda x: Placeholder.check_resolved(x), allocations_dict.values())))
    allocations = sorted(allocations, key=lambda a: a.size, reverse=True)

    # Construct offset table
    offset_table = {a2: {} for a2 in allocations}
    for i1, a1 in enumerate(allocations):
        for i2, a2 in enumerate(allocations[i1 + 1:]):
            # align offset as 16-byte alignment
            offset_table[a2][a1] = 0 if (a1.end <= a2.begin or a2.end <= a1.begin) else _align(a1.size)

    # Merge
    merge_tree = {}  # type: Dict[Allocation, Tuple[Allocation, int]]
    while len(offset_table) > 1:
        # Get max score pair
        max_score = -1
        max_a1 = None
        max_a2 = None
        for a2, a1s in offset_table.items():
            for a1, offset in a1s.items():
                score = max(min(a1.size - offset, a2.size), 0)
                if max_score < score:
                    max_score = score
                    max_a1 = a1
                    max_a2 = a2

        # Merge
        a1 = max_a1
        a2 = max_a2
        offset12 = offset_table[a2][a1]
        merge_tree[a2] = (a1, offset12)

        # Update offset table
        for a3, offset23 in offset_table[a2].items():
            if a1 in offset_table[a3]:
                # a2->a3->a1
                # |       |
                # +-------+
                #
                # a1 |/////a1///////|
                # a2 <----offset12-->|//a2//|
                #          <-offset32->
                # a3           |////a3////|
                offset_table[a3][a1] = max(offset_table[a3][a1], _align(offset12 + a2.size))

        del offset_table[a2]
        for a3, a4s in offset_table.items():
            if a3 == a1:
                continue

            if a2 in a4s:
                if a1 in a4s:
                    # a3->a2->a1
                    # |       |
                    # +-------+
                    a4s[a1] = max(a4s[a1], offset12 + a4s[a2])
                    del a4s[a2]

                else:
                    a4s[a1] = offset12 + a4s[a2]
                    del a4s[a2]

    # Update all allocation offset values
    list(offset_table.keys())[0].offset = 0
    for a2, (a1, offset) in merge_tree.items():
        while a1 in merge_tree:
            a1, offset2 = merge_tree[a1]
            offset += offset2

        a2.offset = offset
def dump(graph: Graph):  # pragma: no cover
    for op in listup_operators(graph):
        console.debug(f"---------------------------------------------------------------------------")
        dump_op(op)
def _split_axis(v: Variable, axis: Axis, graph):
    """
    Split the variable along the specified axis.
    """
    s1 = v.shape_dict[axis] // 2
    s2 = v.shape_dict[axis] - s1

    if isinstance(v, ConstantVariable):
        v_datum = np.split(v.data, [s1], v.order.axes_dict[axis])
        v1 = ConstantVariable(v_datum[0], v.order)
        v2 = ConstantVariable(v_datum[1], v.order)

    else:
        v1 = Variable([s1 if a == axis else v.shape_dict[a] for a in v.order.axes], v.order)
        v2 = Variable([s2 if a == axis else v.shape_dict[a] for a in v.order.axes], v.order)

    ops = list(v.input_to)
    if v.output_from is not None:
        ops += [v.output_from]

    for op in ops:
        if all(isinstance(w, ConstantVariable) for w in op.inputs.values()):
            op.fold_constance()

        elif isinstance(op, SplitAxis):
            _split_splitaxis(graph, op, v, [v1, v2], axis)

        elif isinstance(op, Concat):
            _split_concat(graph, op, v, [v1, v2], axis)

        elif isinstance(op, Im2Col):
            _split_im2col(graph, op, v, [v1, v2], axis)

        elif isinstance(op, PartialIm2Col):
            _split_partial_im2col(graph, op, v, [v1, v2], axis)

        elif isinstance(op, Reshape):
            _split_reshape(graph, op, v, [v1, v2], axis)

        elif isinstance(op, Sgemm):
            _split_sgemm(graph, op, v, [v1, v2], axis)

        elif Tensorwise.check_splittable(op, axis):
            _split_tensorwise(graph, op, v, [v1, v2], axis)

        else:
            console.debug("-------------------------------------------------")
            console.debug(f"{v}")
            console.debug(f"  original order: {v.order}")
            console.debug(f"  original shape: {v.shape}")
            console.debug(f"")
            console.debug(f"  split axis: {axis}")
            console.debug(f"")
            console.debug(f"  related operators:")
            for related_op in ops:
                console.debug(f"    {related_op}")
            console.debug(f"")

            with open("cg-failed.dot", "w") as f:
                f.write(traverse.dump_dot(graph))

            raise NotImplementedError(f"Variable is too large to handle in WebGL backend: {v}")
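# The ConstantVariable branch above splits the backing array with numpy.split and a single
# boundary index. A standalone sketch with a toy (2, 6) array split along its second axis,
# where s1 = 6 // 2 = 3:
import numpy as np

data = np.arange(12).reshape(2, 6)
first, second = np.split(data, [3], 1)
assert first.shape == (2, 3) and second.shape == (2, 3)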
def _choose_split_axis(v: Variable) -> Axis:
    """
    For a too-large texture `v`, choose the axis which is the best one to reduce the texture
    size by splitting `v` along that axis.

    Args:
        v: Variable whose size is too large (= this variable has the :code:`SplitTarget` attribute)

    Returns:
        axis
    """
    ops = list(v.input_to)
    if v.output_from is not None:
        ops += [v.output_from]

    splittable_axes = list(v.order.axes)
    for op in ops:
        _op_splittable_axes = _listup_splittable_axis(v, op) + [attr.axis for attr in op.get_attribute(Tensorwise)]
        for a in list(splittable_axes):
            if a not in _op_splittable_axes:
                splittable_axes.remove(a)

    if len(splittable_axes) == 0:
        raise ValueError("No axis is splittable")

    # Calculate the size of the texture side which would change when each axis is split
    #
    # ex) OrderNC, N=512, C=2048, texture(width=2048, height=512)
    #     => If axis `N` is split, then height will be changed => N: 512 (=height)
    #        If axis `C` is split, then width will be changed  => C: 2048 (=width)
    #
    # ex) OrderNCHW, N=1, C=512, H=13, W=13, texture(width=2048, height=43)
    #     => TexW == W*H*(partial of C): texture width consists of axes W, H and C.
    #        TexH == (partial of C)*N:   texture height consists of axes C and N.
    #     => N cannot be split                                              => N: -1
    #        C is related to both width and height; use the larger one      => C: 2048
    #        H is included in width                                         => H: 2048
    #        W is also included in width                                    => W: 2048

    axis_corresponding_texture_size = AxisKeyDict()
    element_per_pixel = ChannelMode.elements_per_pixel(v)
    tex_h, tex_w = TextureShape.get(v)
    tex_w = (tex_w + element_per_pixel - 1) // element_per_pixel
    for a in v.order.axes:
        if v.shape_dict[a] == 1:
            # This axis cannot be split
            axis_corresponding_texture_size[a] = -1

        elif v.stride_dict[a] >= tex_w * element_per_pixel:
            axis_corresponding_texture_size[a] = tex_h

        elif v.stride_dict[a] * v.shape_dict[a] >= tex_w * element_per_pixel:
            axis_corresponding_texture_size[a] = max(tex_h, tex_w)

        else:
            axis_corresponding_texture_size[a] = tex_w

    splittable_axes.sort(key=lambda a: axis_corresponding_texture_size[a], reverse=True)
    target_axis = splittable_axes[0]

    console.debug(f"===========================================================================")
    console.debug(f"{v}")
    console.debug(f"  original order: {v.order}")
    console.debug(f"  original shape: {v.shape}")
    console.debug(f"  texture shape: {TextureShape.get(v)}")
    console.debug(f"")
    console.debug(f"  splittable axis: {splittable_axes}")
    console.debug(f"  split axis: {target_axis}")
    console.debug(f"")
    console.debug(f"  related operators:")
    for related_op in ops:
        console.debug(f"---------------------------------------------------------------------------")
        traverse.dump_op(related_op)
    console.debug(f"")

    if axis_corresponding_texture_size[target_axis] <= 0:
        raise NotImplementedError(f"Variable is too large to handle in WebGL backend: {v}")

    return target_axis
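# Tracing the first docstring example (OrderNC, N=512, C=2048, 1 element per pixel, texture
# width 2048) through a simplified two-branch version of the stride test above: axis N has
# stride 2048, which fills a whole texture row, so splitting N shrinks the height (512);
# axis C lies within a row, so splitting C shrinks the width (2048). The larger value wins,
# so C is chosen. (Standalone sketch with hard-coded numbers.)
tex_w, tex_h, element_per_pixel = 2048, 512, 1
stride = {"N": 2048, "C": 1}
size = {a: (tex_h if stride[a] >= tex_w * element_per_pixel else tex_w) for a in ("N", "C")}
assert max(size, key=size.get) == "C"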
from webdnn.graph.graph import Graph
from webdnn.graph.order import Order, OrderNC, OrderNTC, OrderNHWC, OrderC
from webdnn.graph.placeholder import Placeholder
from webdnn.graph.variable import Variable
from webdnn.graph.variables.constant_variable import ConstantVariable
from webdnn.optimizer.tensorflow_frontend_optimize_rule import TensorFlowFrontendOptimizeRule
from webdnn.util import console
from webdnn.util import flags

FLAG_TF_INSTALLED = True

try:
    import tensorflow as tf

except ImportError as e:
    console.debug("TensorFlow is not completely installed.")
    FLAG_TF_INSTALLED = False
    pass


def get_default_order(ndim: int):
    if ndim == 1:
        return OrderC

    elif ndim == 2:
        return OrderNC

    elif ndim == 3:
        return OrderNTC

    elif ndim == 4:
        return OrderNHWC