def batch_matmul_2D(data1, data2, bias=None, out_dtype="float32",
                    layout1="NHDT", layout2="NHDT", layout_out="NHDT"):
    """batch matmul for 2-D data"""
    layout1_dict = {}
    layout2_dict = {}
    # Strip the two leading batch letters ("NH") and map D->m/n, T->k.
    layout1 = layout1[2:]
    layout2 = layout2[2:]
    layout1_str = layout1.replace('D', 'm').replace('T', 'k')
    layout2_str = layout2.replace('D', 'n').replace('T', 'k')
    layout1_list = list(layout1_str)
    layout2_list = list(layout2_str)
    for i in range(len(layout1)):
        layout1_dict[layout1_list[i]] = data1.shape[i]
        layout2_dict[layout2_list[i]] = data2.shape[i]
    reduce_axis = tvm.reduce_axis((0, layout1_dict['k']), name='reduce_axis')
    if out_dtype == "float32":
        res = tvm.compute(
            (layout1_dict['m'], layout2_dict['n']),
            lambda i, j: tvm.sum(
                data1[i if layout1_list[0] == 'm' else reduce_axis,
                      reduce_axis if layout1_list[1] == 'k' else i].astype("float") *
                data2[j if layout2_list[0] == 'n' else reduce_axis,
                      reduce_axis if layout2_list[1] == 'k' else j].astype("float"),
                axis=reduce_axis))
    else:
        res = tvm.compute(
            (layout1_dict['m'], layout2_dict['n']),
            lambda i, j: tvm.sum(
                data1[i if layout1_list[0] == 'm' else reduce_axis,
                      reduce_axis if layout1_list[1] == 'k' else i] *
                data2[j if layout2_list[0] == 'n' else reduce_axis,
                      reduce_axis if layout2_list[1] == 'k' else j],
                axis=reduce_axis))
    if bias is not None:
        res = topi.add(res, bias)
    if layout_out != "NHDT":
        res = auto_out_transpose(res, layout_out)
    return res
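# Hedged usage sketch (illustrative only, not part of the original module). It assumes
# tvm.placeholder is available in the same tvm namespace used by the computes above, and
# that with the default "NHDT" layouts the 2-D operands are data1=(m, k) and data2=(n, k).
def _example_batch_matmul_2d(m=32, n=64, k=128):
    data1 = tvm.placeholder((m, k), dtype="float16", name="example_data1")
    data2 = tvm.placeholder((n, k), dtype="float16", name="example_data2")
    # Returns an (m, n) float32 compute tensor: res[i, j] = sum_k data1[i, k] * data2[j, k].
    return batch_matmul_2D(data1, data2, bias=None, out_dtype="float32")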
def TensorcoreConv(data, weight, stride=[1, 1], pad=[0, 0, 0, 0], dilation=[1, 1],
                   out_dtype="float32", name="out", target=utils.CUDA):
    """conv2d on NHWC data with OHWI weight, intended for tensor core execution"""
    batch, in_h, in_w, in_c = data.shape
    out_c, k_h, k_w, _ = weight.shape
    pad_top, pad_bottom, pad_left, pad_right = pad
    s_h, s_w = stride
    d_h, d_w = dilation
    k_h_d = (k_h - 1) * d_h + 1
    k_w_d = (k_w - 1) * d_w + 1
    o_h = (in_h + pad_top + pad_bottom - k_h_d) // s_h + 1
    o_w = (in_w + pad_left + pad_right - k_w_d) // s_w + 1

    has_pad = not (pad_left == 0 and pad_right == 0 and pad_top == 0 and pad_bottom == 0)
    if has_pad:
        # Zero-pad the input; the valid region is pad_top <= h < in_h + pad_top and
        # pad_left <= w < in_w + pad_left.
        data_pad = tvm.compute(
            (batch, in_h + pad_top + pad_bottom, in_w + pad_left + pad_right, in_c),
            lambda n, h, w, i: tvm.if_then_else(
                tvm.all(h >= pad_top, h - pad_top < in_h, w >= pad_left, w - pad_left < in_w),
                data[n, h - pad_top, w - pad_left, i],
                tvm.const(0.0, "float16"),
            ),
            name="Pad",
        )
    else:
        data_pad = data

    rc = tvm.reduce_axis((0, in_c), name="rc")
    rh = tvm.reduce_axis((0, k_h), name="rh")
    rw = tvm.reduce_axis((0, k_w), name="rw")

    if out_dtype == "float32":
        out = tvm.compute(
            (batch, o_h, o_w, out_c),
            lambda n, h, w, o: tvm.sum(
                data_pad[n, (h * s_h + rh * d_h), (w * s_w + rw * d_w), rc].astype("float32") *
                weight[o, rh, rw, rc].astype("float32"),
                axis=[rc, rh, rw]),
            name=name)
    else:
        out = tvm.compute(
            (batch, o_h, o_w, out_c),
            lambda n, h, w, o: tvm.sum(
                data_pad[n, (h * s_h + rh * d_h), (w * s_w + rw * d_w), rc] *
                weight[o, rh, rw, rc],
                axis=[rc, rh, rw]),
            name=name)
    return out
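# Hedged usage sketch (illustrative only). It assumes tvm.placeholder is available and that
# both operands are float16, matching the float16 padding constant and the tensor-core intent
# of the compute above; data is NHWC and weight is OHWI.
def _example_tensorcore_conv():
    data = tvm.placeholder((1, 28, 28, 64), dtype="float16", name="example_data")
    weight = tvm.placeholder((128, 3, 3, 64), dtype="float16", name="example_weight")
    # 3x3 convolution with unit stride and padding 1: output shape (1, 28, 28, 128), float32.
    return TensorcoreConv(data, weight, stride=[1, 1], pad=[1, 1, 1, 1], dilation=[1, 1])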
def Conv(data, weight, stride=[1, 1], pad=[0, 0, 0, 0], dilation=[1, 1], name="out",
         target=utils.CUDA):
    """
    Supported Platforms:
        'GPU'
    """
    if target != utils.CUDA:
        raise RuntimeError("the target %s is not supported!" % target)

    batch, in_c, in_h, in_w = data.shape
    out_c, in_c, k_h, k_w = weight.shape
    pad_top, pad_bottom, pad_left, pad_right = pad
    s_h, s_w = stride
    d_h, d_w = dilation
    k_h_d = (k_h - 1) * d_h + 1
    k_w_d = (k_w - 1) * d_w + 1
    o_h = (in_h + pad_top + pad_bottom - k_h_d) // s_h + 1
    o_w = (in_w + pad_left + pad_right - k_w_d) // s_w + 1
    out_shape = (batch, out_c, o_h, o_w)

    data_pad = topi.nn.pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right], 0.0)

    rc = tvm.reduce_axis((0, in_c), name="rc")
    rh = tvm.reduce_axis((0, k_h), name="rh")
    rw = tvm.reduce_axis((0, k_w), name="rw")

    out = tvm.compute(
        out_shape,
        lambda n, c, h, w: tvm.sum(
            data_pad[n, rc, h * s_h + rh * d_h, w * s_w + rw * d_w] * weight[c, rc, rh, rw],
            axis=[rc, rh, rw]),
        name=name)

    # use for relu condition
    # out = tvm.compute(out.shape, lambda *i: tvm.max(out(*i), tvm.const(0, out.dtype)), name="relu")
    return out
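# Hedged usage sketch (illustrative only), assuming tvm.placeholder is available. Conv expects
# NCHW data and OIHW weight, and only accepts the CUDA target.
def _example_conv():
    data = tvm.placeholder((1, 64, 56, 56), dtype="float32", name="example_data")
    weight = tvm.placeholder((128, 64, 3, 3), dtype="float32", name="example_weight")
    # 3x3 convolution, stride 1, padding 1: output shape (1, 128, 56, 56).
    return Conv(data, weight, stride=[1, 1], pad=[1, 1, 1, 1], dilation=[1, 1])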
def batch_matmul_4d(data1, data2, attrs):
    """batch matmul for 4-D data"""
    bias, out_dtype, layout1, layout2, layout_out = attrs
    layout1_dict = {}
    layout2_dict = {}
    layout1_str = layout1.replace('N', 'B').replace('H', 'b').replace('D', 'm').replace('T', 'k')
    layout2_str = layout2.replace('N', 'B').replace('H', 'b').replace('D', 'n').replace('T', 'k')
    layout1_list = list(layout1_str)
    layout2_list = list(layout2_str)
    for i in range(len(layout1)):
        layout1_dict[layout1_list[i]] = data1.shape[i]
        layout2_dict[layout2_list[i]] = data2.shape[i]
    reduce_axis = tvm.reduce_axis((0, layout1_dict.get('k')), name='reduce_axis')
    if out_dtype == "float32":
        res = tvm.compute(
            (layout1_dict.get('B'), layout1_dict.get('b'),
             layout1_dict.get('m'), layout2_dict.get('n')),
            lambda B, b, i, j: tvm.sum(
                data1[B, b, i if layout1_list[2] == 'm' else reduce_axis,
                      reduce_axis if layout1_list[3] == 'k' else i].astype("float") *
                data2[B, b, j if layout2_list[2] == 'n' else reduce_axis,
                      reduce_axis if layout2_list[3] == 'k' else j].astype("float"),
                axis=reduce_axis))
    else:
        res = tvm.compute(
            (layout1_dict.get('B'), layout1_dict.get('b'),
             layout1_dict.get('m'), layout2_dict.get('n')),
            lambda B, b, i, j: tvm.sum(
                data1[B, b, i if layout1_list[2] == 'm' else reduce_axis,
                      reduce_axis if layout1_list[3] == 'k' else i] *
                data2[B, b, j if layout2_list[2] == 'n' else reduce_axis,
                      reduce_axis if layout2_list[3] == 'k' else j],
                axis=reduce_axis))
    if bias is not None:
        res = topi.add(res, bias)
    if layout_out != "NHDT":
        res = auto_out_transpose(res, layout_out)
    return res
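# Hedged usage sketch (illustrative only), assuming tvm.placeholder is available. attrs is the
# positional tuple (bias, out_dtype, layout1, layout2, layout_out) unpacked at the top of
# batch_matmul_4d; with "NHDT" layouts the operands are (B, b, m, k) and (B, b, n, k).
def _example_batch_matmul_4d():
    data1 = tvm.placeholder((8, 12, 128, 64), dtype="float16", name="example_data1")
    data2 = tvm.placeholder((8, 12, 256, 64), dtype="float16", name="example_data2")
    attrs = (None, "float32", "NHDT", "NHDT", "NHDT")
    # Returns an (8, 12, 128, 256) float32 compute tensor.
    return batch_matmul_4d(data1, data2, attrs)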
def batch_matmul_4D(data1, data2, bias, layout1="NHDT", layout2="NHDT", layout_out="NHDT"):
    """batch matmul for 4-D data, normalizing both operands to the "NHDT" layout first"""
    if layout1 != "NHDT":
        data1 = auto_in_transpose(data1, layout1)
    if layout2 != "NHDT":
        data2 = auto_in_transpose(data2, layout2)
    b1, b2, m, k = data1.shape
    b1, b2, n, k = data2.shape
    reduce_axis = tvm.reduce_axis((0, k), name='reduce_axis')
    res = tvm.compute(
        (b1, b2, m, n),
        lambda i_b1, i_b2, i_m, i_n: tvm.sum(
            data1[i_b1, i_b2, i_m, reduce_axis] * data2[i_b1, i_b2, i_n, reduce_axis],
            axis=reduce_axis),
        name='matmul_compute')
    if bias is not None:
        res = topi.add(res, bias)
    if layout_out != "NHDT":
        res = auto_out_transpose(res, layout_out)
    return res
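# Hedged usage sketch (illustrative only), assuming tvm.placeholder is available. Both operands
# are already in the "NHDT" layout, i.e. (b1, b2, m, k) and (b1, b2, n, k).
def _example_batch_matmul_4D():
    data1 = tvm.placeholder((8, 12, 128, 64), dtype="float16", name="example_data1")
    data2 = tvm.placeholder((8, 12, 256, 64), dtype="float16", name="example_data2")
    # Returns an (8, 12, 128, 256) tensor in the operands' dtype.
    return batch_matmul_4D(data1, data2, bias=None)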
def conv2d_nhwc(inputs, attrs):
    """conv2d on NHWC data with OHWI weight (no padding, float16 only)"""
    attrs = {k: v for k, v in attrs.items()}

    # Check inputs and attrs
    if len(inputs) != 2:
        raise ValueError("length of inputs should be 2, but got %d." % len(inputs))
    if "stride" not in attrs:
        raise ValueError("stride not found in attrs")
    data = inputs[0]
    weight = inputs[1]
    output_name = "T_conv2d_nhwc_" + data.op.name + "_" + weight.op.name
    stride = attrs["stride"]
    data_dtype = data.dtype
    weight_dtype = weight.dtype

    # Check data type
    vc_util.ops_dtype_check(data_dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.ops_dtype_check(weight_dtype, vc_util.DtypeForDavinci.FLOAT16)

    # Check shape
    if len(data.shape) != 4 or len(weight.shape) != 4:
        raise ValueError("shape of data and weight should be 4-dim, but got %d and %d."
                         % (len(data.shape), len(weight.shape)))

    # Compute output
    n, in_h, in_w, in_c = data.shape
    out_c, k_h, k_w, in_c = weight.shape
    _, _, s_h, s_w = stride
    o_h = (in_h - k_h) // s_h + 1
    o_w = (in_w - k_w) // s_w + 1
    rc = tvm.reduce_axis((0, in_c), name="rc")
    rh = tvm.reduce_axis((0, k_h), name="rh")
    rw = tvm.reduce_axis((0, k_w), name="rw")
    output = tvm.compute(
        (n, o_h, o_w, out_c),
        lambda n, h, w, o: tvm.sum(
            data[n, (h * s_h + rh), (w * s_w + rw), rc] * weight[o, rh, rw, rc],
            axis=[rc, rh, rw]),
        name=output_name)
    return output
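# Hedged usage sketch (illustrative only), assuming tvm.placeholder is available. conv2d_nhwc
# takes a two-element input list and an attrs dict whose "stride" holds four values, of which
# only the last two (s_h, s_w) are used; both tensors must be float16.
def _example_conv2d_nhwc():
    data = tvm.placeholder((1, 28, 28, 16), dtype="float16", name="example_data")
    weight = tvm.placeholder((32, 3, 3, 16), dtype="float16", name="example_weight")
    # No padding is applied, so the output shape is (1, 26, 26, 32).
    return conv2d_nhwc([data, weight], {"stride": [1, 1, 1, 1]})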