def axiswise_bias_same_order(op: AxiswiseBias, memory_layout: MemoryLayout) -> List[Kernel]:
    x = memory_layout[op.inputs["x"]]
    b = memory_layout[op.inputs["b"]]
    y = memory_layout[op.outputs["y"]]

    target_axis_index = x.variable.order.axes_dict[op.axis]
    D1 = mul(x.variable.shape[:target_axis_index])
    D2 = x.variable.shape[target_axis_index]
    D3 = mul(x.variable.shape[target_axis_index + 1:])

    buffer_injector = BufferInjector()
    buffer_injector.register({
        "axiswise_bias_X": x,
        "axiswise_bias_B": b,
        "axiswise_bias_Y": y,
        "axiswise_bias_D1": D1,
        "axiswise_bias_D2": D2,
        "axiswise_bias_D3": D3
    })

    name_injector = KernelNameInjector(op)

    source = generate_template_same_order(D1, D3)
    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel = Kernel({name_injector.name: source},
                    name_injector.name,
                    GPUSize(8, 1, 1),
                    GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
                    buffer_injector.buffer,
                    buffer_injector.unresolved_value_list)

    return [kernel]
def softmax_same_order(op: Softmax, memory_layout: MemoryLayout) -> List[Kernel]:
    x = op.inputs["x"]
    y = op.outputs["y"]

    target_axis = op.parameters["axis"]
    target_axis_index = x.order.axes_dict[target_axis]
    D1 = mul(x.shape[:target_axis_index])
    D2 = x.shape[target_axis_index]
    D3 = mul(x.shape[target_axis_index + 1:])

    buffer_injector = BufferInjector()
    buffer_injector.register({
        "softmax_X": memory_layout[x],
        "softmax_Y": memory_layout[y],
        "softmax_D1": D1,
        "softmax_D2": D2,
        "softmax_D3": D3
    })

    name_injector = KernelNameInjector(op)

    source = template_same_order
    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel = Kernel({name_injector.name: source},
                    name_injector.name,
                    GPUSize(8, 1, 1),
                    GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
                    buffer_injector.buffer,
                    buffer_injector.unresolved_value_list)

    return [kernel]
def template(x_shape, axis, description: str = ""):
    np_axis = 1 if axis is None else axis
    vx = np.random.rand(*x_shape)
    new_shape = [mul(vx.shape[:np_axis]), mul(vx.shape[np_axis:])]
    max_i = np.argmax(vx.reshape(new_shape), axis=1)
    vy = np.zeros(new_shape)
    vy[np.arange(vy.shape[0]), max_i] = 1

    x = make_tensor_value_info("x", vx.shape)
    y = make_tensor_value_info("y", vy.shape)

    kwargs = {}
    if axis is not None:
        kwargs["axis"] = axis
    operator = make_node("Hardmax", ["x"], ["y"], **kwargs)

    model = make_model([operator], [x], [y])

    graph = ONNXConverter().convert(model)

    assert tuple(vy.shape) == tuple(graph.outputs[0].shape), \
        f"vy: {vy.shape}, graph.outputs[0]: {graph.outputs[0].shape}"

    generate_kernel_test_case(
        description=f"[ONNX] Hardmax {description}",
        graph=graph,
        backend=["webgpu", "webgl", "webassembly"],
        inputs={graph.inputs[0]: vx},
        expected={graph.outputs[0]: vy},
    )
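# A minimal usage sketch of the Hardmax test template above. The shapes, axis values and
# descriptions are hypothetical (not taken from the original test suite); each call builds
# an ONNX model, converts it, and registers one kernel test case.
def test_hardmax_default_axis():
    template(x_shape=(3, 4), axis=None, description="default axis")


def test_hardmax_axis_1():
    template(x_shape=(3, 4), axis=1, description="axis=1")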
def axiswise_scale_same_order(op: AxiswiseScale, memory_layout: MemoryLayout) -> List[Kernel]:
    x = memory_layout[op.inputs["x"]]
    s = memory_layout[op.inputs["s"]]
    y = memory_layout[op.outputs["y"]]

    target_axis_index = x.variable.order.axes_dict[op.axis]
    D1 = mul(x.variable.shape[:target_axis_index])
    D2 = x.variable.shape[target_axis_index]
    D3 = mul(x.variable.shape[target_axis_index + 1:])

    buffer_injector = BufferInjector()
    buffer_injector.register({
        "axiswise_scale_X": x,
        "axiswise_scale_S": s,
        "axiswise_scale_Y": y,
        "axiswise_scale_D1": D1,
        "axiswise_scale_D2": D2,
        "axiswise_scale_D3": D3
    })

    name_injector = KernelNameInjector(op)

    source = template_same_order
    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel = Kernel({name_injector.name: source},
                    name_injector.name,
                    buffer_injector.buffer,
                    buffer_injector.unresolved_value_list)

    return [kernel]
def axiswise_scale_general(op: AxiswiseScale, memory_layout: MemoryLayout) -> List[Kernel]:
    x = memory_layout[op.inputs["x"]]
    s = memory_layout[op.inputs["s"]]
    y = memory_layout[op.outputs["y"]]

    x_shape = x.variable.shape

    target_axis_index = x.variable.order.axes_dict[op.axis]
    D1 = mul(x_shape[:target_axis_index])
    D2 = x_shape[target_axis_index]
    D3 = mul(x_shape[target_axis_index + 1:])

    y_strides = []
    stride = 1
    for sh in reversed(y.variable.shape):
        y_strides.insert(0, stride)
        stride *= sh

    x_stride_in_y = [y_strides[y.variable.order.axes_dict[axis]] for axis in x.variable.order.axes]

    buffer_injector = BufferInjector()
    buffer_injector.register({
        "axiswise_scale_X": x,
        "axiswise_scale_S": s,
        "axiswise_scale_Y": y,
        "axiswise_scale_D1": D1,
        "axiswise_scale_D2": D2,
        "axiswise_scale_D3": D3,
        "axiswise_scale_D": x.variable.ndim,
        "axiswise_scale_d_target": x.variable.order.axes_dict[op.axis],
        "axiswise_scale_x_shape": x_shape,
        "axiswise_scale_x_stride_in_y": x_stride_in_y,
    })

    name_injector = KernelNameInjector(op)

    source = template_general
    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel = Kernel({name_injector.name: source},
                    name_injector.name,
                    buffer_injector.buffer,
                    buffer_injector.unresolved_value_list)

    return [kernel]
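# A standalone sketch (plain Python, hypothetical shapes and axis orders) of the stride
# computation used in axiswise_scale_general above: row-major strides of y are built from
# its shape, then picked out in x's axis order to get the stride of each x axis in y's buffer.
y_shape = [2, 3, 4, 5]               # e.g. y stored in NHWC order
y_strides = []
stride = 1
for sh in reversed(y_shape):
    y_strides.insert(0, stride)
    stride *= sh
assert y_strides == [60, 20, 5, 1]

# If x's axes appear in y at positions [0, 3, 1, 2] (e.g. x is NCHW while y is NHWC):
x_axis_positions_in_y = [0, 3, 1, 2]
x_stride_in_y = [y_strides[p] for p in x_axis_positions_in_y]
assert x_stride_in_y == [60, 1, 20, 5]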
def col2im(op: Col2Im) -> List[Kernel]:
    col = op.inputs["col"]
    im = op.outputs["im"]

    assert col.order.check_same_axes(Order([Axis.N, Axis.H, Axis.W, Axis.KH, Axis.KW, Axis.C]))
    assert col.order.axes_dict[Axis.KH] + 2 == col.order.axes_dict[Axis.KW] + 1 == col.order.axes_dict[Axis.C] == 5
    assert im.order.check_same_axes(OrderNHWC)
    assert ChannelMode.get(col) == ChannelModeEnum.R
    assert ChannelMode.get(im) == ChannelModeEnum.R

    col_shape = col.shape[0:3] + (mul(col.shape[3:6]),)
    col_stride = [mul(col_shape[i + 1:]) for i in range(len(col_shape))]
    col_order = Order(col.order.axes[0:3] + (Axis.C,))

    code = KernelCode(["""
void main() {
    ivec4 variable_position_im = """, change_order(get_output_position(im), im.order, OrderNHWC), f""";

    int n  = variable_position_im.x;
    int h1 = variable_position_im.y;
    int w1 = variable_position_im.z;
    int c1 = variable_position_im.w;

    float sum = 0.0;

    for (int kh = 0; kh < {op.KH}; kh++) {{
        int h2 = (h1 + {op.PH} - kh) / {op.SH};
        if (mod(h1 + {op.PH} - kh, {op.SH}) != 0 || h2 < 0 || h2 >= {col.shape_dict[Axis.H]}) continue;

        for (int kw = 0; kw < {op.KW}; kw++) {{
            int w2 = (w1 + {op.PW} - kw) / {op.SW};
            if (mod(w1 + {op.PW} - kw, {op.SW}) != 0 || w2 < 0 || w2 >= {col.shape_dict[Axis.W]}) continue;

            int khkwc1 = (kh * {op.KW} + kw) * {im.shape_dict[Axis.C]} + c1;

            sum += texture2D(""", col, ",",
                       convert_coord(change_order("vec4(n, h2, w2, khkwc1)", OrderNHWC, col_order),
                                     col_shape, col_stride,
                                     texture_shape(col)[:2][::-1], texture_stride(col)[:2][::-1]), """).r;
        }
    }

    gl_FragColor.r = sum;
}
"""], name=op.__class__.__name__)
    source = code.generate()
    return [Kernel(source, code.name, code.samplers, code.uniforms, im)]
def _convert_flatten(converter: ONNXConverter, onnx_op: INodeProto):
    x = converter.get_variable(onnx_op.input[0])

    attrs = attribute_dict(onnx_op)
    axis = attrs["axis"].i if "axis" in attrs else 1

    new_shape = [mul(x.shape[:axis]), mul(x.shape[axis:])]
    new_order = Order([None, None])
    y = x.reshape(shape=new_shape, order=new_order)

    converter.set_variable(onnx_op.output[0], y)
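# A standalone sketch of the Flatten shape arithmetic used above, with a hypothetical input
# shape and a local product helper standing in for webdnn's mul: with the default axis=1,
# an input of shape (N, C, H, W) collapses to (N, C*H*W).
from functools import reduce
from operator import mul as _mul

x_shape = (2, 3, 4, 5)
axis = 1
new_shape = [reduce(_mul, x_shape[:axis], 1), reduce(_mul, x_shape[axis:], 1)]
assert new_shape == [2, 60]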
def _convert_hardmax(converter: ONNXConverter, onnx_op: INodeProto):
    # Hardmax: 1.0 at the position of the maximum along the given axis, 0.0 elsewhere.
    x = converter.get_variable(onnx_op.input[0])

    attrs = attribute_dict(onnx_op)
    axis = attrs["axis"].i if "axis" in attrs else 1

    new_shape = [mul(x.shape[:axis]), mul(x.shape[axis:])]
    new_order = Order([None, None])
    x = x.reshape(shape=new_shape, order=new_order)
    max_x, = Max(None, axis=x.order.axes[1])(x)
    y = x >= max_x

    converter.set_variable(onnx_op.output[0], y)
def tensordot(op: Tensordot, memory_layout: MemoryLayout) -> List[Kernel]:
    A = op.inputs["A"]
    B = op.inputs["B"]
    C = op.outputs["C"]

    shape_A_reduced_axes = [A.shape_dict[a] for a in op.axes[0]]
    shape_B_reduced_axes = [B.shape_dict[a] for a in op.axes[1]]

    kernel = Kernel({"tensordot": source},
                    "tensordot",
                    inputs=[memory_layout[A], memory_layout[B]],
                    outputs=[memory_layout[C]],
                    call_option={
                        "reduction_size": mul(A.shape_dict[a] for a in op.axes[0]),
                        "stride_A": A.stride,
                        "stride_B": B.stride,
                        "stride_C": C.stride,
                        "shape_C": C.shape,
                        "stride_A_for_C_axes": [0 if a not in A.order.axes or a in op.axes[0] else A.stride_dict[a]
                                                for a in C.order.axes],
                        "stride_B_for_C_axes": [0 if a not in B.order.axes or a in op.axes[1] else B.stride_dict[a]
                                                for a in C.order.axes],
                        "shape_A_reduced_axes": shape_A_reduced_axes,
                        "stride_A_reduced_axes": [mul(shape_A_reduced_axes[i + 1:])
                                                  for i in range(len(shape_A_reduced_axes))],
                        "stride_A_reduced_axes_for_whole": [A.stride_dict[a] for a in op.axes[0]],
                        "shape_B_reduced_axes": shape_B_reduced_axes,
                        "stride_B_reduced_axes": [mul(shape_B_reduced_axes[i + 1:])
                                                  for i in range(len(shape_B_reduced_axes))],
                        "stride_B_reduced_axes_for_whole": [B.stride_dict[a] for a in op.axes[1]]
                    })

    return [kernel]
def optimize(self, graph: Graph) -> Tuple[Graph, bool]:
    flag_changed = False
    for op in traverse.filter_nodes(traverse.listup_operators(graph), Deconvolution2D):  # type: Deconvolution2D
        x = op.inputs["x"]
        w = op.inputs["w"]
        y = op.outputs["y"]
        flag_changed = True
        op.remove_all()

        a_filter, a_kh, a_kw = Axis(), Axis(), Axis()
        w, = ReinterpretAxis(None, in_order=OrderNHWC, out_order=Order([Axis.C, a_kh, a_kw, a_filter]))(w)
        x, = ReinterpretAxis(None, in_order=OrderNHWC, out_order=Order([Axis.N, Axis.H, Axis.W, a_filter]))(x)

        col, = Tensordot(None, axes=a_filter)(x, w)
        col = col.transpose(Order([Axis.N, Axis.H, Axis.W, a_kh, a_kw, Axis.C]))
        col = col.reshape(shape=[*col.shape[0:3], mul(col.shape[3:6])], order=OrderNHWC)

        new_y, = Col2Im(None, ksize=op.ksize, stride=op.stride, padding=op.padding)(col)
        OptimizeRule.replace_variable(graph, new_y.transpose_like(y), y)

    return graph, flag_changed
def axiswise_bias(op: AxiswiseBias, memory_layout: MemoryLayout) -> List[Kernel]:
    # Pass the size and stride of the target axis to the kernel.
    x = op.inputs["x"]
    b = op.inputs["b"]
    y = op.outputs["y"]

    assert b.ndim == 1
    axis_pos = x.order.axes_dict[op.parameters["axis"]]  # e.g. 1 for NCHW with axis=C
    axis_size = x.shape[axis_pos]
    assert axis_size == b.size
    axis_stride = mul(x.shape[axis_pos + 1:])

    kernel = Kernel({"axiswise_bias": source},
                    "axiswise_bias",
                    inputs=[x, b],
                    outputs=[y],
                    call_option={
                        "n": x.size,
                        "axis_stride": axis_stride,
                        "axis_size": axis_size
                    })

    return [kernel]
def _listup_splittable_axis(v: Variable, op: Operator) -> List[Axis]:
    if isinstance(op, (Concat, SplitAxis)):
        return list(v.order.axes)

    elif isinstance(op, Reshape):
        """
        For more detail on this condition check, see the documentation comment of `_split_reshape`.
        """
        splittable_axes = []  # type: List[Axis]
        v1 = v
        v2 = op.outputs["y"] if v == op.inputs["x"] else op.inputs["x"]

        for a1 in v1.order.axes:
            d1 = mul(v1.shape[v1.order.axes_dict[a1]:])
            d2 = 1
            for a2 in reversed(v2.order.axes):
                d2 *= v2.shape_dict[a2]

                if d2 == d1:
                    splittable_axes.append(a1)
                    continue

                elif d2 > d1:
                    continue

        return splittable_axes

    elif isinstance(op, Im2Col):
        op = op  # type: Im2Col
        if v in op.outputs.values():
            if v.shape_dict[Axis.C] % (op.ksize[0] * op.ksize[1]) == 0:
                return [Axis.N, Axis.H, Axis.W, Axis.C]
            else:
                return [Axis.N, Axis.H, Axis.W]
        else:
            return []

    elif isinstance(op, PartialIm2Col):
        op = op  # type: PartialIm2Col
        if v in op.outputs.values():
            return []
        else:
            return [op.axis]

    elif isinstance(op, Sgemm):
        if v == op.outputs["C"]:
            return []
        else:
            return list(v.order.axes)

    elif isinstance(op, Tensordot):
        if v == op.outputs["C"]:
            return []
        else:
            return list(v.order.axes)

    else:
        return list(attr.axis for attr in op.get_attribute(Tensorwise))
def template(axis=1, ndim=2, description: str = ""):
    if chainer.__version__ < "1.24" and axis != 1:
        raise SkipTest(
            f"chainer.functions.softmax supports the \"axis\" parameter since v1.24; "
            f"currently installed version is {chainer.__version__}")

    shape = (np.arange(ndim) + 2).tolist()
    vx = chainer.Variable(np.arange(mul(shape)).reshape(shape).astype(np.float32))
    if chainer.__version__ < "1.24":
        vy = chainer.functions.softmax(vx)
    else:
        vy = chainer.functions.softmax(vx, axis=axis)

    graph = ChainerConverter().convert([vx], [vy])

    x = graph.inputs[0]
    y = graph.outputs[0]

    generate_kernel_test_case(
        description=f"[chainer] F.softmax {description}",
        graph=graph,
        inputs={x: vx.data},
        backend=["webgpu", "webassembly"],
        expected={y: vy.data},
    )
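# A minimal usage sketch of the F.softmax test template above. The parameter values and
# descriptions are hypothetical, not taken from the original test suite.
def test_softmax_default():
    template(description="default (2D, axis=1)")


def test_softmax_3d_axis_2():
    template(axis=2, ndim=3, description="3D, axis=2")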
def reshape(op: Reshape, memory_layout: MemoryLayout) -> List[Kernel]:
    x = op.inputs["x"]
    y = op.outputs["y"]

    if memory_layout[x] == memory_layout[y]:
        # This is an in-place operation
        return []

    assert x.order == op.parameters["in_order"]
    assert y.order == op.parameters["out_order"]
    assert y.size == mul(op.parameters["out_shape"])

    buffer_injector = BufferInjector()
    buffer_injector.register({
        "reshape_x": memory_layout[x],
        "reshape_y": memory_layout[y],
        "reshape_N": y.size,
    })

    name_injector = KernelNameInjector(op)

    source = template
    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel = Kernel({name_injector.name: source},
                    name_injector.name,
                    GPUSize(8, 1, 1),
                    GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
                    buffer_injector.buffer,
                    buffer_injector.unresolved_value_list)

    return [kernel]
def reshape(op: Reshape, memory_layout: MemoryLayout) -> List[Kernel]:
    x = memory_layout[op.inputs["x"]]
    y = memory_layout[op.outputs["y"]]

    assert x.variable.order == op.parameters["in_order"]
    assert y.variable.order == op.parameters["out_order"]
    assert y.variable.size == mul(op.parameters["out_shape"])

    buffer_injector = BufferInjector()
    buffer_injector.register({
        "reshape_x": x,
        "reshape_y": y,
        "reshape_N": y.variable.size,
    })

    name_injector = KernelNameInjector(op)

    source = template
    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel = Kernel({name_injector.name: source},
                    name_injector.name,
                    GPUSize(8, 1, 1),
                    GPUSize(1024, 1, 1),
                    buffer_injector.buffer,
                    buffer_injector.unresolved_value_list)

    return [kernel]
def reshape(op: Reshape, memory_layout: MemoryLayout) -> List[Kernel]:
    # Only reshape operations that require no transposition are currently supported.
    x = op.inputs["x"]
    y = op.outputs["y"]

    if memory_layout[x] == memory_layout[y]:
        # This is an in-place operation
        return []

    assert x.order == op.parameters["in_order"]
    assert y.order == op.parameters["out_order"]
    assert y.size == mul(op.parameters["out_shape"])

    buffer_injector = BufferInjector()
    buffer_injector.register({
        "reshape_x": memory_layout[x],
        "reshape_y": memory_layout[y],
        "reshape_N": y.size,
    })

    name_injector = KernelNameInjector(op)

    source = template
    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel = Kernel({name_injector.name: source},
                    name_injector.name,
                    buffer_injector.buffer,
                    buffer_injector.unresolved_value_list)

    return [kernel]
def __call__(self, x: Variable):
    """
    Args:
        x (:class:`~webdnn.graph.variable.Variable`): Input

    Returns:
        tuple of :class:`~webdnn.graph.variable.Variable`: Output
    """
    out_axes = list(x.order.axes)
    for axis in self.parameters["in_axes"]:
        if axis not in out_axes:
            raise ValueError(f"Axis {axis} is not contained in input variable")

        out_axes.remove(axis)

    out_shape = [x.shape_dict[axis] for axis in out_axes]

    if self.parameters["out_axis"] in out_axes:
        raise ValueError(f"Axis {self.parameters['out_axis']} is duplicated")

    out_axes.append(self.parameters["out_axis"])
    out_shape.append(mul([x.shape_dict[axis] for axis in self.parameters["in_axes"]]))

    y = Variable(out_shape, Order(out_axes))
    self.append_input("x", x)
    self.append_output("y", y)

    return y,
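# A standalone sketch (hypothetical axis names and sizes, plain strings in place of Axis
# objects) of the axis merging performed above: the axes listed in "in_axes" are removed
# from the output order and replaced by a single "out_axis" whose size is their product.
in_axes = ["H", "W"]
out_axis = "C2"
x_axes = ["N", "H", "W", "C"]
x_shape_dict = {"N": 2, "H": 3, "W": 4, "C": 5}

out_axes = [a for a in x_axes if a not in in_axes]       # ["N", "C"]
out_shape = [x_shape_dict[a] for a in out_axes]          # [2, 5]
out_axes.append(out_axis)
out_shape.append(x_shape_dict["H"] * x_shape_dict["W"])  # 3 * 4 = 12
assert out_axes == ["N", "C", "C2"] and out_shape == [2, 5, 12]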
def _convert_reshape(converter: ChainerConverter, c_op: "chainer.functions.Reshape"):
    assert len(c_op.inputs) == 1, \
        f"For 'Reshape' operator in chainer, expected number of inputs is 1, but actual is {len(c_op.inputs)}"

    x = converter.get_variable(c_op.inputs[0])

    out_shape = list(c_op.shape)  # c_op.shape is a tuple
    if len(out_shape) == 1:
        out_order = OrderC
    elif len(out_shape) == 2:
        out_order = OrderNC
    elif len(out_shape) == 4:
        out_order = OrderNCHW
    else:
        raise NotImplementedError("Reshaping into a number of dimensions other than 1, 2, or 4 is not supported.")
    assert mul(out_shape) == x.size

    y, = Reshape(None, in_order=x.order, out_order=out_order, out_shape=out_shape)(x)
    converter.set_variable(c_op.outputs[0](), y)
def _convert_reshape(converter: ONNXConverter, onnx_op: INodeProto):
    x = converter.get_variable(onnx_op.input[0])

    if converter.opset_version >= 5:
        # The output shape is specified by onnx_op.input[1], which has to be a ConstantVariable.
        # TODO: test for different operator set versions
        shape_var = converter.get_variable(onnx_op.input[1])
        assert isinstance(shape_var, ConstantVariable), \
            "Shape specifier of Reshape operator has to be constant."
        out_shape = [int(d) for d in shape_var.data]

    else:
        # Reshape-1
        attrs = attribute_dict(onnx_op)
        out_shape = [r if s == 0 else s for r, s in zip(x.shape, attrs["shape"].ints)]

    if -1 in out_shape:
        i = out_shape.index(-1)
        out_shape.remove(-1)
        out_shape.insert(i, x.size // mul(out_shape))

    out_order = Order([None] * len(out_shape))

    y, = Reshape(None, in_order=x.order, out_order=out_order, out_shape=out_shape)(x)
    converter.set_variable(onnx_op.output[0], y)
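# A standalone sketch (hypothetical sizes, local product helper in place of webdnn's mul)
# of the "-1" inference performed above: the -1 entry is replaced by x.size divided by the
# product of the remaining entries.
from functools import reduce
from operator import mul as _mul

x_size = 2 * 3 * 4            # 24 elements in x
out_shape = [4, -1]
if -1 in out_shape:
    i = out_shape.index(-1)
    out_shape.remove(-1)
    out_shape.insert(i, x_size // reduce(_mul, out_shape, 1))
assert out_shape == [4, 6]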
def _listup_splittable_axis(v: Variable, op: Operator) -> List[Axis]:
    if isinstance(op, (Concat, SplitAxis)):
        return list(v.order.axes)

    if isinstance(op, Reshape):
        """
        For more detail on this condition check, see the documentation comment of `_split_reshape`.
        """
        splittable_axes = []  # type: List[Axis]
        v1 = v
        v2 = op.outputs["y"] if v == op.inputs["x"] else op.inputs["x"]
        v1_order = op.in_order if v1 == op.inputs["x"] else op.out_order
        v2_order = op.in_order if v2 == op.inputs["x"] else op.out_order
        v1_shape = [v1.shape_dict[a] for a in v1_order.axes]

        for a1 in v1_order.axes:
            d1 = mul(v1_shape[v1_order.axes_dict[a1]:])
            d2 = 1
            axes = []
            for a2 in reversed(v2_order.axes):
                d2 *= v2.shape_dict[a2]
                axes.append(a2)

                if d2 == d1 and any(v2.shape_dict[a3] % 2 == 0 for a3 in axes):  # TODO
                    splittable_axes.append(a1)
                    continue

                elif d2 > d1:
                    continue

        return splittable_axes

    if isinstance(op, Im2Col):
        op = op  # type: Im2Col
        if v in op.outputs.values():
            return [Axis.N, Axis.H, Axis.W, Axis.C]
        else:
            return []

    if isinstance(op, PartialIm2Col):
        op = op  # type: PartialIm2Col
        if v in op.outputs.values():
            axes = [Axis.N, Axis.C]
            if op.axis not in axes:
                axes.append(op.axis)
            return axes
        else:
            return []

    if isinstance(op, Tensordot):
        return list(v.order.axes)

    if isinstance(op, Pooling2D):
        return [Axis.H, Axis.W]

    return []
def convert_layer_global_average_pooling2d(converter: KerasConverter,
                                           k_op: "keras.layers.GlobalAveragePooling2D"):
    x = converter.get_variable(converter.get_input_tensor(k_op)[0])

    if k_op.data_format == "channels_first":
        assert x.order == OrderNCHW

    elif k_op.data_format == "channels_last":
        assert x.order == OrderNHWC

    else:
        raise ValueError(f"[KerasConverter] Unknown data format: {k_op.data_format}")

    y, = AveragePooling2D(None,
                          ksize=(x.shape_dict[Axis.H], x.shape_dict[Axis.W]),
                          stride=(1, 1),
                          padding=(0, 0))(x)

    # flatten without changing memory layout
    z, = Reshape(None, in_order=y.order, out_order=OrderNC, out_shape=[y.shape[0], mul(y.shape[1:])])(y)

    converter.set_variable(converter.get_output_tensor(k_op)[0], z)
def template(axis=1, ndim=2, description: str = ""):
    shape = (np.arange(ndim) + 2).tolist()
    vx = chainer.Variable(np.arange(mul(shape)).reshape(shape).astype(np.float32))
    vy = chainer.functions.softmax(vx, axis)

    graph = ChainerConverter().convert_from_inout_vars([vx], [vy])

    x = graph.inputs[0]
    y = graph.outputs[0]

    generate_kernel_test_case(
        description=f"[chainer] F.softmax {description}",
        graph=graph,
        inputs={x: np.transpose(vx.data, [default_order[ndim].axes_dict[a] for a in x.order.axes])},
        expected={y: np.transpose(vy.data, [default_order[ndim].axes_dict[a] for a in y.order.axes])},
    )
def _convert_linear_function(converter: ChainerConverter,
                             c_op: "chainer.functions.connection.linear.LinearFunction"):
    x = converter.get_variable(c_op.inputs[0])
    w = converter.get_variable(c_op.inputs[1])  # type: ConstantVariable

    x2, = Reshape(None, in_order=x.order, out_order=OrderNC, out_shape=[x.shape[0], mul(x.shape[1:])])(x)
    w2, = ReinterpretAxis(None, in_order=w.order, out_order=OrderNC)(w)
    w2, = Transpose(None)(w2)
    w2.change_order(OrderCN)
    y, = Linear(None)(x2, w2)
    y, = ReinterpretAxis(None, in_order=y.order, out_order=Order([x.order.axes[0], w.order.axes[0]]))(y)

    if len(c_op.inputs) == 3:
        # with bias
        b = converter.get_variable(c_op.inputs[2])
        check_broadcast_constraints(y, b)
        y = y + b

    converter.set_variable(c_op.outputs[0](), y)
def __call__(self, A: Variable, B: Variable):
    for axis in self.axes[0]:
        assert axis in A.order.axes, f"""
[Tensordot] Input variable "A" must have axis "{axis}":
    (op) = {self}
    (op.axes[0]) = {self.axes[0]}
    (A) = {A}"""

    for axis in A.order.axes:
        if axis not in self.axes[0]:
            assert axis in self.axes[1] or axis not in B.order.axes, f"""
[Tensordot] Axes of "A" which are not reduced must not be contained in "B":
    (op) = {self}
    (A.order.axes) = {A.order.axes}
    (B.order.axes) = {B.order.axes}
    (op.axes) = {self.axes}"""

    for axis in self.axes[1]:
        assert axis in B.order.axes, f"""
[Tensordot] Input variable "B" must have axis "{axis}":
    (op) = {self}
    (op.axes[1]) = {self.axes[1]}
    (B) = {B}"""

    for axis in B.order.axes:
        if axis not in self.axes[1]:
            assert axis in self.axes[0] or axis not in A.order.axes, f"""
[Tensordot] Axes of "B" which are not reduced must not be contained in "A":
    (op) = {self}
    (A.order.axes) = {A.order.axes}
    (B.order.axes) = {B.order.axes}
    (op.axes) = {self.axes}"""

    reduction_size_a = mul(A.shape_dict[a] for a in self.axes[0])
    reduction_size_b = mul(B.shape_dict[a] for a in self.axes[1])
    assert reduction_size_a == reduction_size_b, f"""
[Tensordot] Reduction sizes of "A" and "B" must be the same:
    (A) = {A}
    (B) = {B}
    (axes) = {self.axes}
    (reduction size of A) = {reduction_size_a}
    (reduction size of B) = {reduction_size_b}
"""

    self.append_input("A", A)
    self.append_input("B", B)
    return self.exec()
def tensordot(op: Tensordot, memory_layout: MemoryLayout) -> List[Kernel]:
    A = op.inputs["A"]
    B = op.inputs["B"]
    C = op.outputs["C"]

    axes = op.axes

    # Reduced axes must be located at the innermost positions of the input variables.
    assert A.order.axes[-len(axes[0]):] == axes[0]
    assert B.order.axes[-len(axes[1]):] == axes[1]

    # The output variable's axis order must be [*a_remained_axes, *b_remained_axes].
    assert C.order.axes[:A.ndim - len(axes[0])] == A.order.axes[:-len(axes[0])]
    assert C.order.axes[-(B.ndim - len(axes[1])):] == B.order.axes[:-len(axes[1])]
    assert C.ndim == A.ndim - len(axes[0]) + B.ndim - len(axes[1])

    K = mul(A.shape_dict[a] for a in axes[0])
    M = A.size // K
    N = B.size // K

    buffer_injector = BufferInjector()
    buffer_injector.register({
        "sgemm_A": memory_layout[A],
        "sgemm_B": memory_layout[B],
        "sgemm_C": memory_layout[C],
        "sgemm_M": M,
        "sgemm_N": N,
        "sgemm_K": K
    })

    if op.has_attribute(UseEigenAttribute):
        source = generate_template_eigen(True, False)

    else:
        source = generate_template(True, False)

    name_injector = KernelNameInjector(op)

    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel = Kernel({name_injector.name: source},
                    name_injector.name,
                    buffer_injector.buffer,
                    buffer_injector.unresolved_value_list)

    return [kernel]
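# A standalone sketch (hypothetical shapes) of how the tensordot above is mapped onto a
# single GEMM: K is the product of the reduced axis sizes, and M and N are the sizes of
# A and B after dividing out the reduced part.
A_shape = (5, 6, 3, 4)   # last two axes are reduced
B_shape = (7, 3, 4)      # last two axes are reduced
K = 3 * 4                # product of reduced axis sizes
A_size = 5 * 6 * 3 * 4
B_size = 7 * 3 * 4
M = A_size // K          # = 30
N = B_size // K          # = 7
assert (M, N, K) == (30, 7, 12)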
def local_response_normalization_same_order(op: LocalResponseNormalization,
                                            memory_layout: MemoryLayout) -> List[Kernel]:
    x = op.inputs["x"]
    y = op.outputs["y"]

    target_axis = Axis.C  # FIXME

    target_axis_index = x.order.axes_dict[target_axis]
    D1 = mul(x.shape[:target_axis_index])
    D2 = x.shape[target_axis_index]
    D3 = mul(x.shape[target_axis_index + 1:])

    buffer_injector = BufferInjector()
    buffer_injector.register({
        "local_response_normalization_X": memory_layout[x],
        "local_response_normalization_Y": memory_layout[y],
        "local_response_normalization_D1": D1,
        "local_response_normalization_D2": D2,
        "local_response_normalization_D3": D3,
        "local_response_normalization_param_half_n": int(op.parameters["n"] // 2),
        "local_response_normalization_param_k": float(op.parameters["k"]),
        "local_response_normalization_param_alpha": float(op.parameters["alpha"]),
        "local_response_normalization_param_minus_beta": float(-op.parameters["beta"])
    })

    name_injector = KernelNameInjector(op)

    source = template_same_order
    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel = Kernel({name_injector.name: source},
                    name_injector.name,
                    GPUSize(8, 1, 1),
                    GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
                    buffer_injector.buffer,
                    buffer_injector.unresolved_value_list)

    return [kernel]
def _convert_reshape(converter: ChainerConverter, c_op: "chainer.functions.Reshape"):
    x = converter.get_variable(c_op.inputs[0])

    if any(not Placeholder.check_resolved(v) for v in x.shape):
        raise NotImplementedError("[ChainerConverter] \"Reshape\" for dynamic shape variable is not supported")

    out_shape = list(c_op.shape)
    out_order = Order([None] * len(out_shape))

    if -1 in out_shape:
        i = out_shape.index(-1)
        out_shape.pop(i)
        out_shape.insert(i, x.size // mul(out_shape))

    assert mul(out_shape) == x.size, \
        f"[ChainerConverter] Shape mismatch: mul(out_shape)={mul(out_shape)}, x.size={x.size}"

    y = x.reshape(out_shape, out_order)
    converter.set_variable(c_op.outputs[0](), y)
def optimize_loop_structure(variables: List[Variable], key_variable: Variable):
    """
    Optimize the loop structure used to iterate over each element of the variables.

    Returns:
        (tuple): two elements are returned

        - First one is the shape dictionary of all variables.
        - Second one is the stride dictionary of all variables.
    """
    orders, shape_dicts = _simplify_orders(variables)  # type: Dict[Variable, Order], Dict[Variable, AxisKeyDict[List[int]]]
    shapes = {v: [shape_dicts[v][a] for a in orders[v].axes] for v in variables}
    strides = {v: [mul(shapes[v][orders[v].axes_dict[a] + 1:]) for a in orders[v].axes] for v in variables}
    stride_dicts = {v: AxisKeyDict(orders[v].axes, strides[v]) for v in variables}

    # re-ordering
    axes = []
    for v in sorted(variables, key=lambda v: orders[v].ndim):
        axes += [axis for axis in orders[v].axes if axis not in axes]

    orders = {v: Order(list(filter(lambda x: x in orders[v].axes, axes))) for v in variables}
    shapes = {v: [shape_dicts[v][a] for a in orders[v].axes] for v in variables}
    strides = {v: [stride_dicts[v][a] for a in orders[v].axes] for v in variables}

    key_order = orders[key_variable]
    if key_order.ndim > 4:
        raise NotImplementedError("Currently, loop nest depth larger than 4 is not supported")

    for v in variables:
        shape = shapes[v]
        stride = strides[v]
        while len(shape) < 4:
            stride.append(1)
            shape.append(1)

    return shapes, strides
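# A standalone sketch (hypothetical shape) of two steps performed above: row-major strides
# are derived from a variable's shape, and the shape/stride lists are padded with 1 up to
# four entries so that the generated loop nest always has depth 4.
shape = [2, 3, 4]
strides = [1] * len(shape)
for i in reversed(range(len(shape) - 1)):
    strides[i] = strides[i + 1] * shape[i + 1]
assert strides == [12, 4, 1]

while len(shape) < 4:
    strides.append(1)
    shape.append(1)
assert shape == [2, 3, 4, 1] and strides == [12, 4, 1, 1]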
def __init__(self,
             name: Optional[str],
             M: Union[int, Placeholder],
             N: Union[int, Placeholder],
             K: Union[int, Placeholder],
             out_shape: Sequence[Union[int, Placeholder]],
             out_order: Order,
             transpose_A: bool,
             transpose_B: bool):
    super().__init__(name)

    assert len(out_shape) == out_order.ndim
    if Placeholder.check_resolved(mul(out_shape)) and Placeholder.check_resolved(M * N):
        assert mul(out_shape) == M * N

    self.parameters["M"] = M
    self.parameters["N"] = N
    self.parameters["K"] = K
    self.parameters["out_shape"] = out_shape
    self.parameters["out_order"] = out_order
    self.parameters["transpose_A"] = transpose_A
    self.parameters["transpose_B"] = transpose_B
def _convert_flatten(converter: KerasConverter, k_op: "keras.layers.Flatten"):
    x = converter.get_variable(converter.get_input_tensor(k_op)[0])

    # flatten without changing memory layout
    y, = Reshape(None, in_order=x.order, out_order=OrderNC, out_shape=[x.shape[0], mul(x.shape[1:])])(x)

    converter.set_variable(converter.get_output_tensor(k_op)[0], y)