# NOTE: these functions rely on module-level imports not shown in this excerpt
# (numpy as np, the topi/tvm helpers, tag, utils, and fused_bn_follow).
def fused_bn_follow_relu(data0, data1, data2, data3, data4,
                         layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    input:
    data0-4: bn parameters for conv2d tensor, length is 5
        data0: param0 beta
        data1: param1 gamma
        data2: param2 BNupdate: xi_variance
        data3: param6 BNreduce: xi_mean
        data4: param7 xi_conv2d, float16
    layout: only (N, H, W, C), (N, C, H, W) supported
    out_dtype: float16

    output:
    ReLU: max(batch-normalized tensor, 0)
    """
    if layout == 'NCHW':
        data4 = topi.transpose(data4, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    add0 = fused_bn_follow(data0, data1, data2, data3, data4)
    add0 = topi.cast(add0, out_dtype)
    output = topi.maximum(add0, 0)

    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))

    return output

def fused_bn_follow_relu_avgpool(data0, data1, data2, data3, data4, data5,
                                 layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    input:
    data: length is 6
        data0: tensor1 after bn_double_relu
        data1-5: bn parameters for conv2d tensor2
    layout: only (N, H, W, C), (N, C, H, W) supported
    out_dtype: float16

    output:
    avg-pooling( max(batch-normalized tensor1 + batch-normalized tensor2, 0) )
    """
    if layout == 'NCHW':
        data0 = topi.transpose(data0, (0, 2, 3, 1))
        data5 = topi.transpose(data5, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    n, h, w, c = data0.shape
    inter_dtype = 'float32'
    add0 = fused_bn_follow(data1, data2, data3, data4, data5)
    add0 = topi.cast(add0, data0.dtype)
    add1 = topi.add(data0, add0)
    output = topi.maximum(add1, 0)
    output = topi.cast(output, inter_dtype)
    output = topi.sum(output, axis=(1, 2))
    output = topi.divide(output, h * w)
    output = topi.cast(output, out_dtype)

    return output

def fused_bn_reduce_grad(data0, data1, data2, data3, data4, data5, data6, data7,
                         layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    Gradient of the fused BN reduce pattern.

    data3 and data7 are 4-D tensors (NHWC or NCHW); the remaining inputs are
    per-channel (C) statistics and gradients that are broadcast over N, H, W.
    """
    if layout == 'NCHW':
        data3 = topi.transpose(data3, (0, 2, 3, 1))
        data7 = topi.transpose(data7, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    n, h, w, c = data3.shape
    const = n * h * w
    inter_dtype = 'float32'

    out1 = topi.multiply(data4, data5)
    out1 = topi.divide(out1, const)
    out1 = topi.expand_dims(out1, axis=0, num_newaxis=3)
    out1 = topi.broadcast_to(out1, (n, h, w, c))

    data3 = topi.cast(data3, inter_dtype)
    data2 = topi.expand_dims(data2, axis=0, num_newaxis=3)
    data2 = topi.broadcast_to(data2, (n, h, w, c))
    out2 = topi.multiply(data3, const)
    out2 = topi.subtract(out2, data2)

    data1 = topi.expand_dims(data1, axis=0, num_newaxis=3)
    data1 = topi.broadcast_to(data1, (n, h, w, c))
    data7 = topi.cast(data7, inter_dtype)
    out3 = topi.divide(data6, const)
    out3 = topi.subtract(data7, out3)
    out3 = topi.multiply(data1, out3)
    out3 = topi.divide(out3, data0)

    output = topi.subtract(out2, out3)
    output = topi.multiply(output, out1)
    output = topi.cast(output, out_dtype)

    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))

    return output

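# A minimal NumPy sketch of the arithmetic above (hypothetical helper, not part of the
# library; NHWC arrays for data3/data7, (c,)-shaped arrays for the rest; assumes the
# module's `import numpy as np`).
def fused_bn_reduce_grad_np(data0, data1, data2, data3, data4, data5, data6, data7,
                            out_dtype='float16'):
    n, h, w, c = data3.shape
    const = n * h * w
    out1 = data4 * data5 / const                                    # per-channel scale, broadcast over N, H, W
    out2 = data3.astype('float32') * const - data2
    out3 = data1 * (data7.astype('float32') - data6 / const) / data0
    return ((out2 - out3) * out1).astype(out_dtype)
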
def fused_bn_reduce(data, layout, out_dtype):
    """
    input:
    data: 4-D Tensor
    layout: input layout, only 'NCHW', 'NHWC' supported
    out_dtype: "float16" or "float32"

    output:
    out1_sum: 1-D tensor (C), per-channel sum of the input over the N, H, W axes
    out2_squared_sum: 1-D tensor (C), per-channel sum of squares of the input over the N, H, W axes
    """
    if layout == "NCHW":
        data = topi.transpose(data, axes=(0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    inter_dtype = 'float32'
    data_cast = topi.cast(data, inter_dtype)

    out1_sum = topi.sum(data_cast, axis=(0, 1, 2))
    out1_sum = topi.cast(out1_sum, out_dtype)

    squared = topi.multiply(data_cast, data_cast)
    out2_squared_sum = topi.sum(squared, axis=(0, 1, 2))
    out2_squared_sum = topi.cast(out2_squared_sum, out_dtype)

    return [out1_sum, out2_squared_sum]

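# A minimal NumPy sketch of the same reduction, usable as an "expect" reference in tests
# (hypothetical helper; NHWC input; assumes the module's `import numpy as np`).
def fused_bn_reduce_np(data, out_dtype='float32'):
    x = data.astype('float32')
    out1_sum = x.sum(axis=(0, 1, 2)).astype(out_dtype)                 # per-channel sum
    out2_squared_sum = (x * x).sum(axis=(0, 1, 2)).astype(out_dtype)   # per-channel sum of squares
    return [out1_sum, out2_squared_sum]
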
def fused_relu_grad_bn_double_update_grad(data_1, data_2, data_3, data_4, data_5, data_6, data_7,
                                          layout='NHWC'):
    if layout == "NCHW":
        # Transpose the 4-D inputs to NHWC.
        data_2, data_4, data_5, data_6, data_7 = (
            topi.transpose(t, axes=(0, 2, 3, 1)) for t in (data_2, data_4, data_5, data_6, data_7))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    data_tmp1 = topi.full_like(data_7, 0.0)
    data_tmp2 = topi.greater(data_7, data_tmp1)
    data_tmp3 = topi.add(data_5, data_6)
    data_tmp4 = topi.where(data_tmp2, data_tmp3, data_tmp1)
    data_tmp5 = topi.cast(data_tmp4, 'float32')
    data_tmp7 = topi.sum(data_tmp5, axis=(0, 1, 2))

    n, h, w, c = data_7.shape
    data_tmp8 = topi.cast(data_2, 'float32')
    data_tmp9 = topi.full_like(data_tmp7, 1.0 / (n * h * w))
    data_tmp10 = topi.multiply(data_1, data_tmp9)
    data_tmp11 = topi.broadcast_to(data_tmp10, data_tmp8.shape)
    data_tmp12 = topi.subtract(data_tmp8, data_tmp11)
    data_tmp13 = topi.multiply(data_tmp5, data_tmp12)
    data_tmp15 = topi.sum(data_tmp13, axis=(0, 1, 2))

    data_tmp16 = topi.cast(data_4, 'float32')
    data_tmp17 = topi.multiply(data_3, data_tmp9)
    data_tmp18 = topi.broadcast_to(data_tmp17, data_tmp16.shape)
    data_tmp19 = topi.subtract(data_tmp16, data_tmp18)
    data_tmp20 = topi.multiply(data_tmp5, data_tmp19)
    data_tmp22 = topi.sum(data_tmp20, axis=(0, 1, 2))

    return [data_tmp7, data_tmp15, data_tmp22]

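# A minimal NumPy sketch of the three reductions above (hypothetical helper; NHWC arrays
# for data_2/data_4/data_5/data_6/data_7, (c,)-shaped arrays for data_1/data_3; assumes
# the module's `import numpy as np`).
def fused_relu_grad_bn_double_update_grad_np(data_1, data_2, data_3, data_4, data_5, data_6, data_7):
    n, h, w, c = data_7.shape
    dout = np.where(data_7 > 0, data_5 + data_6, 0).astype('float32')  # ReLU-masked sum of the two heads
    out1 = dout.sum(axis=(0, 1, 2))
    out2 = (dout * (data_2.astype('float32') - data_1 / (n * h * w))).sum(axis=(0, 1, 2))
    out3 = (dout * (data_4.astype('float32') - data_3 / (n * h * w))).sum(axis=(0, 1, 2))
    return [out1, out2, out3]
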
def fused_pad(input, pad_before, pad_after, layout='NHWC', pad_value=0.0, target=utils.CUDA):
    """
    fused_pad.

    Args:
        input: tvm.Tensor or Expr.
        pad_before: list / tuple of n ints. Pad width on each dimension, applied before the axis begins.
        pad_after: list / tuple of n ints. Pad width on each dimension, applied after the axis ends.
        pad_value: float. The value to pad with.

    Returns:
        tvm.Tensor.
    """
    if layout == "NCHW":
        input = topi.transpose(input, axes=(0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    cast_after = topi.cast(input, 'float16')
    output = topi.nn.pad(cast_after, pad_before, pad_after, pad_value)
    return output

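# A minimal NumPy sketch of the padding (hypothetical helper; NHWC input; assumes the
# module's `import numpy as np`). np.pad takes per-dimension (before, after) pairs.
def fused_pad_np(data, pad_before, pad_after, pad_value=0.0):
    pad_width = list(zip(pad_before, pad_after))
    return np.pad(data.astype('float16'), pad_width, mode='constant', constant_values=pad_value)
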
def auto_in_transpose(data, layout="NHDT"):
    """Transpose `data` from the given layout string into canonical NHDT order."""
    layout_int = layout.replace('N', '0').replace('H', '1').replace('D', '2').replace('T', '3')
    layout_list = list(layout_int)
    layout_axis = np.argsort(layout_list)
    data = topi.transpose(data, axes=tuple(layout_axis))
    return data

def fused_l2loss_grad(data_f16, data_f32, layout='NHWC', fill_data=4e-05, target=utils.CUDA):
    """
    fused_l2loss_grad.

    Args:
        data_f16: tvm.tensor.Tensor, float16 input.
        data_f32: tvm.tensor.Tensor, float32 input.
        layout: input layout, only 'NCHW', 'NHWC' supported.
        fill_data: float, scaling constant applied to data_f32.

    Returns:
        tvm.tensor.Tensor: data_f16 (cast to float32) + fill_data * data_f32.
    """
    if layout == "NCHW":
        data_f16 = topi.transpose(data_f16, axes=(0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    data_f16 = topi.cast(data_f16, 'float32')
    constant_tmp = topi.cast(fill_data, 'float32')
    data_constant = topi.full_like(data_f32, constant_tmp)
    data_out = topi.multiply(data_constant, data_f32)
    data_out = topi.add(data_f16, data_out)
    return data_out

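# A minimal NumPy sketch of the computation (hypothetical helper; assumes the module's
# `import numpy as np`): the result is data_f16 in float32 plus fill_data * data_f32.
def fused_l2loss_grad_np(data_f16, data_f32, fill_data=4e-05):
    return data_f16.astype('float32') + np.float32(fill_data) * data_f32
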
def bn_beta_grad_np(head, layout='NHWC'):
    """NumPy reference for the BN beta gradient: per-channel sum of `head`."""
    if layout == 'NCHW':
        # `head` is a NumPy array here, so transpose with NumPy rather than topi.
        head = np.transpose(head, (0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('layout is not supported {} '.format(layout))

    bn_beta_grad = np.sum(head, axis=(0, 1, 2))
    return bn_beta_grad

def auto_out_transpose(expect, layout_out="NHDT"):
    """Apply the axis permutation derived from `layout_out` (relative to NHDT) to `expect`;
    for 3-D and 2-D tensors the leading layout characters are dropped first."""
    if len(expect.shape) == 3:
        layout_out = layout_out[1:]
    if len(expect.shape) == 2:
        layout_out = layout_out[2:]
    layout_out_int = layout_out.replace('N', '0').replace('H', '1').replace('D', '2').replace('T', '3')
    layout_out_list = list(layout_out_int)
    layout_out_axis = np.argsort(layout_out_list)
    expect = topi.transpose(expect, axes=tuple(layout_out_axis))
    return expect

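# A small sketch of the permutation the two layout helpers above derive (hypothetical
# demo, not part of the library; assumes the module's `import numpy as np`): each layout
# character is mapped to its NHDT position and np.argsort yields the axes tuple.
def _layout_axes_demo():
    for layout in ("NHDT", "NDHT", "DNHT"):
        digits = list(layout.replace('N', '0').replace('H', '1').replace('D', '2').replace('T', '3'))
        axes = tuple(int(a) for a in np.argsort(digits))
        print(layout, "->", axes)   # e.g. DNHT -> (1, 2, 0, 3)
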
def fused_bn_double_follow_relu(data0, data1, data2, data3, data4,
                                data5, data6, data7, data8, data9,
                                layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    input:
    data: length is 10
        data0-4: bn parameters for conv2d tensor 1
        data5-9: bn parameters for conv2d tensor 2
    layout: only (N, H, W, C), (N, C, H, W) supported
    out_dtype: float16

    output:
    ReLU: max(batch-normalized tensor1 + batch-normalized tensor2, 0)
    """
    if layout == 'NCHW':
        data4 = topi.transpose(data4, (0, 2, 3, 1))
        data9 = topi.transpose(data9, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    add0 = fused_bn_follow(data0, data1, data2, data3, data4)
    add1 = fused_bn_follow(data5, data6, data7, data8, data9)
    add0 = topi.cast(add0, out_dtype)
    add1 = topi.cast(add1, out_dtype)
    add2 = topi.add(add0, add1)
    output = topi.maximum(add2, 0)

    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))

    return output

def fused_relu_grad_bn_reduce_grad(data_1, data_2, data_3, data_4, data_5, data_6,
                                   data_7, data_8, data_9, layout='NHWC', target=utils.CUDA):
    """
    fused_relu_grad_bn_reduce_grad.

    Args:
        data_1~data_9: tvm.tensor.Tensor.
        layout: input layout, only 'NCHW', 'NHWC' supported.

    Returns:
        tvm.tensor.Tensor.
    """
    if layout == "NCHW":
        # Transpose the 4-D inputs to NHWC.
        data_7, data_8, data_9 = (
            topi.transpose(t, axes=(0, 2, 3, 1)) for t in (data_7, data_8, data_9))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    data_tmp1 = topi.multiply(data_4, data_5)
    n, h, w, c = data_9.shape
    data_tmp2 = topi.full_like(data_tmp1, 1.0 / (n * h * w))
    data_tmp3 = topi.multiply(data_tmp1, data_tmp2)

    data_tmp5 = topi.full_like(data_9, 0.0)
    data_tmp6 = topi.greater(data_9, data_tmp5)
    data_tmp7 = topi.where(data_tmp6, data_8, data_tmp5)
    data_tmp8 = topi.cast(data_tmp7, 'float32')
    data_tmp9 = topi.full_like(data_tmp8, n * h * w)
    data_tmp10 = topi.multiply(data_tmp8, data_tmp9)
    data_tmp12 = topi.subtract(data_tmp10, data_3)

    data_tmp14 = topi.cast(data_7, 'float32')
    data_tmp15 = topi.multiply(data_6, data_tmp2)
    data_tmp17 = topi.subtract(data_tmp14, data_tmp15)
    data_tmp18 = topi.multiply(data_2, data_tmp17)
    data_tmp20 = topi.divide(data_tmp18, data_1)
    data_tmp21 = topi.subtract(data_tmp12, data_tmp20)
    data_tmp22 = topi.multiply(data_tmp3, data_tmp21)

    data_out = topi.cast(data_tmp22, 'float16')
    return data_out

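# A minimal NumPy sketch of the arithmetic above (hypothetical helper; NHWC arrays for
# data_7/data_8/data_9, (c,)-shaped arrays for the rest; assumes `import numpy as np`).
def fused_relu_grad_bn_reduce_grad_np(data_1, data_2, data_3, data_4, data_5, data_6,
                                      data_7, data_8, data_9):
    n, h, w, c = data_9.shape
    scale = data_4 * data_5 / (n * h * w)
    dout = np.where(data_9 > 0, data_8, 0).astype('float32') * (n * h * w)  # ReLU-masked grad, rescaled
    term1 = dout - data_3
    term2 = data_2 * (data_7.astype('float32') - data_6 / (n * h * w)) / data_1
    return (scale * (term1 - term2)).astype('float16')
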
def bn_gamma_grad_np(head, in_data, data_sum, layout='NHWC'):
    """NumPy reference for the BN gamma gradient."""
    if layout == 'NCHW':
        # NumPy arrays are expected here, so transpose with NumPy rather than topi.
        head = np.transpose(head, (0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('layout is not supported {} '.format(layout))

    n, h, w, c = head.shape
    mean = np.divide(data_sum, n * h * w)
    x_hat = np.subtract(in_data, mean)
    x_hat_mul = np.multiply(x_hat, head)
    bn_gamma_grad = np.sum(x_hat_mul, axis=(0, 1, 2))
    return bn_gamma_grad

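# A hypothetical usage sketch of the two NumPy references above with random NHWC data
# (assumes the module's `import numpy as np`).
def _bn_grad_np_demo():
    head = np.random.rand(2, 4, 4, 8).astype('float32')      # gradient w.r.t. the BN output
    in_data = np.random.rand(2, 4, 4, 8).astype('float32')   # BN input
    data_sum = in_data.sum(axis=(0, 1, 2))                   # per-channel sum used to form the mean
    dbeta = bn_beta_grad_np(head, layout='NHWC')                        # shape (8,)
    dgamma = bn_gamma_grad_np(head, in_data, data_sum, layout='NHWC')   # shape (8,)
    return dbeta, dgamma
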
def fused_relu_grad_bn_double_reduce_grad(data0, data1, data2, data3, data4, data5, data6, data7,
                                          data8, data9, data10, data11, data12, data13, data14, data15,
                                          layout="NHWC", out_dtype="float16", target=utils.CUDA):
    """Fused ReluGrad + BN reduce-grad for two branches sharing one ReLU gradient;
    returns the gradients of both branches."""
    if layout == 'NCHW':
        data5 = topi.transpose(data5, (0, 2, 3, 1))
        data9 = topi.transpose(data9, (0, 2, 3, 1))
        data13 = topi.transpose(data13, (0, 2, 3, 1))
        data14 = topi.transpose(data14, (0, 2, 3, 1))
        data15 = topi.transpose(data15, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    inter_dtype = "float32"
    n, h, w, c = data5.shape
    scale = n * h * w

    mul = topi.multiply(data2, data3)
    mul1221 = topi.divide(mul, scale)

    # ReluGrad
    zero = tvm.const(0, data15.dtype)
    add = topi.add(data13, data14)
    addgrad = tvm.compute(add.shape,
                          lambda *i: tvm.if_then_else(data15(*i) >= zero, add(*i), zero),
                          tag=tag.INJECTIVE)
    addgrad = topi.cast(addgrad, inter_dtype)
    mul3283 = topi.multiply(scale, addgrad)
    sub1159 = topi.subtract(mul3283, data6)

    data5_cast = topi.cast(data5, inter_dtype)
    mul2372 = topi.divide(data4, scale)
    sub631 = topi.subtract(data5_cast, mul2372)
    mul1220 = topi.multiply(sub631, data1)
    div = topi.divide(mul1220, data0)
    sub271 = topi.subtract(sub1159, div)
    mul1218 = topi.multiply(mul1221, sub271)
    mul1218_cast = topi.cast(mul1218, out_dtype)

    mul1231 = topi.multiply(data11, data12)
    mul1230 = topi.divide(mul1231, scale)
    data9_cast = topi.cast(data9, inter_dtype)
    mul2364 = topi.divide(data8, scale)
    sub625 = topi.subtract(data9_cast, mul2364)
    mul1229 = topi.multiply(data10, sub625)
    div272 = topi.divide(mul1229, data7)
    sub272 = topi.subtract(sub1159, div272)
    mul1228 = topi.multiply(mul1230, sub272)
    mul1228_cast = topi.cast(mul1228, out_dtype)

    if layout == "NCHW":
        mul1218_cast = topi.transpose(mul1218_cast, (0, 3, 1, 2))
        mul1228_cast = topi.transpose(mul1228_cast, (0, 3, 1, 2))

    return [mul1218_cast, mul1228_cast]

def fused_is_finite(data, layout='NHWC'):
    """
    fused_is_finite.

    Args:
        data: tvm.tensor.Tensor.
        layout: input layout, only 'NCHW', 'NHWC' supported.

    Returns:
        0-D boolean tensor: True if every element of `data` is finite.
    """
    if layout == "NCHW":
        data = topi.transpose(data, axes=(0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    data_isfinite = topi.isfinite(data)
    data_out = topi.all(data_isfinite, axis=(0, 1, 2, 3))
    return data_out

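# A minimal NumPy sketch of the check (hypothetical helper; assumes the module's
# `import numpy as np`).
def fused_is_finite_np(data):
    return np.isfinite(data).all()
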