def elementwise_add(op: Sgemm) -> List[Kernel]:
    """Build the kernel list for an ``Sgemm`` operator.

    Reads inputs ``"A"`` and ``"B"`` and output ``"C"`` from ``op``, registers
    the uniforms consumed by the generated template, injects them and the
    kernel name into the source, and wraps the result in one ``Kernel``.

    Args:
        op: Operator providing ``inputs``, ``outputs``, ``M``, ``N``, ``K``,
            ``transpose_A`` and ``transpose_B``.

    Returns:
        A single-element list containing the generated kernel.

    Raises:
        AssertionError: If ``A`` and ``B`` do not share the same channel mode.
    """
    mat_a = op.inputs["A"]
    mat_b = op.inputs["B"]
    mat_c = op.outputs["C"]

    # Both operands must use the same channel mode.
    assert ChannelMode.get_mode(mat_a) == ChannelMode.get_mode(mat_b)

    name_injector = KernelNameInjector(op)
    uniform_injector = UniformInjector()

    # Logical strides of A and B depend on the transpose flags.
    if op.transpose_A:
        logical_stride_a = [op.K, 1]
    else:
        logical_stride_a = [1, op.M]

    if op.transpose_B:
        logical_stride_b = [op.N, 1]
    else:
        logical_stride_b = [1, op.K]

    uniforms = {
        "A": mat_a,
        "B": mat_b,
        "s_c": texture_stride(mat_c),
        "d_C": [op.M, op.N],
        "s_C": [op.N, 1],
        "d_a": texture_shape(mat_a),
        "s_a": texture_stride(mat_a),
        "s_A": logical_stride_a,
        "d_b": texture_shape(mat_b),
        "s_b": texture_stride(mat_b),
        "s_B": logical_stride_b,
        "K": op.K
    }
    uniform_injector.register(uniforms)

    # The template is generated from A's channel mode and K, then the
    # uniforms are injected before the kernel name.
    source = generate_template(mode=ChannelMode.get_mode(mat_a), K=op.K)
    for injector in (uniform_injector, name_injector):
        source = injector.inject(source)

    return [
        Kernel(
            source,
            name_injector.name,
            uniform_injector.samplers,
            uniform_injector.uniforms,
            mat_c,
        )
    ]
def elementwise_add(op: Softsign) -> List[Kernel]:
    """Build the kernel list for a ``Softsign`` operator.

    Reads input ``"x0"`` and output ``"y"`` from ``op``, obtains per-variable
    shapes and strides from ``optimize_loop_structure``, registers the
    uniforms, and wraps the injected shader source in one ``Kernel``.

    Args:
        op: Operator providing ``inputs["x0"]`` and ``outputs["y"]``.

    Returns:
        A single-element list containing the generated kernel.
    """
    source_var = op.inputs["x0"]
    result_var = op.outputs["y"]

    shapes, strides = optimize_loop_structure([source_var, result_var], result_var)

    name_injector = KernelNameInjector(op)
    uniform_injector = UniformInjector()

    uniforms = {
        "X0": source_var,
        "s_y": texture_stride(result_var),
        "d_Y": shapes[result_var],
        "s_Y": strides[result_var],
        "d_x0": texture_shape(source_var),
        "s_x0": texture_stride(source_var),
        "d_X0": shapes[source_var],
        "s_X0": strides[source_var],
    }
    uniform_injector.register(uniforms)

    # The template is selected by the output's channel mode; anything other
    # than R falls back to the RGBA template.
    if ChannelMode.get_mode(result_var) == ChannelModeEnum.R:
        source = template_R
    else:
        source = template_RGBA

    # Uniforms are injected first, then the kernel name.
    for injector in (uniform_injector, name_injector):
        source = injector.inject(source)

    return [
        Kernel(
            source,
            name_injector.name,
            uniform_injector.samplers,
            uniform_injector.uniforms,
            result_var,
        )
    ]
def texture_shape(v: Variable):
    """Compute the two-element texture shape used for ``v``.

    The texture length is ``v.size`` in ``R`` mode and ``v.size`` divided by
    four, rounded up, in ``RGBA`` mode (presumably four values per texel --
    confirm against the shader templates).

    Args:
        v: Variable whose channel mode and ``size`` determine the shape.

    Returns:
        A list ``[a, b]`` where ``a`` is the texture length capped at 2048 and
        ``b`` is the texture length divided by 2048, rounded up.

    Raises:
        NotImplementedError: If the channel mode is neither ``R`` nor ``RGBA``.
    """
    channel_mode = ChannelMode.get_mode(v)

    if channel_mode == ChannelModeEnum.R:
        length = v.size
    elif channel_mode == ChannelModeEnum.RGBA:
        # Ceiling division by 4.
        length = (v.size + 4 - 1) // 4
    else:
        raise NotImplementedError(f"Unknown channel mode: {channel_mode}")

    # First extent: the length itself, capped at 2048.
    if length < 2048:
        first_extent = length
    else:
        first_extent = 2048

    # Second extent: ceiling division of the length by 2048.
    second_extent = (length + 2048 - 1) // 2048

    return [first_extent, second_extent]
def texture_stride(v: Variable):
    """Compute the stride list matching ``texture_shape(v)``.

    The first stride is 1 in ``R`` mode and 4 in ``RGBA`` mode; each following
    stride is the previous stride multiplied by the previous extent of
    ``texture_shape(v)``.

    Args:
        v: Variable whose channel mode and texture shape determine the strides.

    Returns:
        A list with one stride per entry of ``texture_shape(v)``.

    Raises:
        NotImplementedError: If the channel mode is neither ``R`` nor ``RGBA``.
    """
    channel_mode = ChannelMode.get_mode(v)

    if channel_mode == ChannelModeEnum.R:
        step = 1
    elif channel_mode == ChannelModeEnum.RGBA:
        step = 4
    else:
        raise NotImplementedError(f"Unknown channel mode: {channel_mode}")

    strides = []
    for extent in texture_shape(v):
        # Record the stride for this axis, then scale it for the next one.
        strides.append(step)
        step *= extent

    return strides
def elementwise_add(op: Im2Col) -> List[Kernel]:
    """Build the kernel list for an ``Im2Col`` operator.

    Reads input ``"im"`` and output ``"col"`` from ``op``, checks their data
    orders, registers the uniforms (texture and logical shapes/strides, the
    input's C/H/W extents, and the operator's ``KH``/``KW``/``DH``/``DW``/
    ``SH``/``SW``/``PH``/``PW`` attributes), and wraps the injected shader
    source in one ``Kernel``.

    Args:
        op: Operator providing ``inputs["im"]``, ``outputs["col"]`` and the
            attributes listed above.

    Returns:
        A single-element list containing the generated kernel.

    Raises:
        AssertionError: If ``im`` is not ``OrderNHWC``, or ``col`` is neither
            ``OrderNHWC`` nor ``OrderCNHW``.
    """
    image = op.inputs["im"]
    column = op.outputs["col"]

    # Only these data orders are accepted by this handler.
    assert image.order == OrderNHWC
    column_order = column.order
    assert column_order == OrderNHWC or column_order == OrderCNHW

    name_injector = KernelNameInjector(op)
    uniform_injector = UniformInjector()

    image_extents = image.shape_dict
    uniforms = {
        "im": image,
        "s_col": texture_stride(column),
        "d_Col": column.shape,
        "s_Col": column.stride,
        "d_im": texture_shape(image),
        "s_im": texture_stride(image),
        "d_Im": image.shape,
        "s_Im": image.stride,
        "C1": image_extents[Axis.C],
        "H1": image_extents[Axis.H],
        "W1": image_extents[Axis.W],
        "KH": op.KH,
        "KW": op.KW,
        "DH": op.DH,
        "DW": op.DW,
        "SH": op.SH,
        "SW": op.SW,
        "PH": op.PH,
        "PW": op.PW,
    }
    uniform_injector.register(uniforms)

    # The template is selected by the output's channel mode; anything other
    # than R falls back to the RGBA template.
    if ChannelMode.get_mode(column) == ChannelModeEnum.R:
        source = template_R
    else:
        source = template_RGBA

    # Uniforms are injected first, then the kernel name.
    for injector in (uniform_injector, name_injector):
        source = injector.inject(source)

    return [
        Kernel(
            source,
            name_injector.name,
            uniform_injector.samplers,
            uniform_injector.uniforms,
            column,
        )
    ]
def generate(cls, graph: Graph, **kwargs):
    """Optimize ``graph`` and produce its execution data.

    Steps, in order: run ``WebGLOptimizeRule`` on the graph; when
    ``flags.DEBUG`` is set, dump the graph and write its dot representation to
    ``cg.dot``; allocate memory; build the per-variable allocation table and
    the constants map; encode constants; generate kernels via
    ``cls.generate_kernels``; assemble the descriptor.

    Args:
        graph: Graph to generate execution data for.
        **kwargs: ``constant_encoder_name`` (optional) is passed to
            ``ConstantEncoder.get_encoder``; ``None`` is passed when absent.

    Returns:
        ``GraphExecutionData`` built from the optimized graph, the descriptor
        and the encoded constant bytes.
    """
    graph, _ = WebGLOptimizeRule().optimize(graph)

    if flags.DEBUG:
        traverse.dump(graph)
        with open("cg.dot", "w") as dot_file:
            dot_file.write(traverse.dump_dot(graph))

    memory_layout = allocate(graph)

    # Pair every allocated variable with its channel mode.
    allocations = {
        variable: WebGLAllocation(
            allocation=allocation,
            channel_mode=ChannelMode.get_mode(variable),
        )
        for variable, allocation in memory_layout.allocations.items()
    }

    # Offsets are multiplied by 4 to obtain byte offsets (presumably 4 bytes
    # per element -- confirm against the allocator).
    constant_nodes = traverse.filter_nodes(
        traverse.listup_nodes(graph), ConstantVariable
    )
    constants_map = {
        constant.name: {
            "byte_offset": memory_layout[constant].offset * 4,
            "size": constant.size
        }
        for constant in constant_nodes
    }

    encoder_name = kwargs.get("constant_encoder_name")
    constant_encoder = ConstantEncoder.get_encoder(encoder_name)
    constants_bytes = constant_encoder.encode(memory_layout)

    kernels = cls.generate_kernels(graph)

    descriptor = GraphDescriptor(
        kernels=kernels,
        memory_layout=memory_layout,
        inputs=graph.inputs,
        outputs=graph.outputs,
        constants_encoding=constant_encoder.name,
        allocations=allocations,
        constants_map=constants_map,
        licenses=graph.licenses,
    )

    return GraphExecutionData(graph, descriptor, constants_bytes)