# Assumed module-level imports (TVM 0.x TOPI helpers) for the operators below.
import tvm
from topi import nn
from topi.nn.pad import pad
from topi.nn.util import get_pad_tuple
from topi.util import get_const_tuple, simplify


def conv1d_transpose_ncw(cfg, data, kernel, stride, padding, out_dtype):
    """Transposed 1D convolution ncw forward operator.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template
    Input : tvm.Tensor
        3-D with shape [batch, in_channel, inp_width]
    Filter : tvm.Tensor
        3-D with shape [in_channel, num_filter, kernel_size]
    stride : tuple of one int
        The spatial stride along width
    padding : int, tuple, or string
        int: padding size
        tuple of 2 ints: (pad_left, pad_right) for left and right padding
        string: ['VALID', 'SAME']
    out_dtype: str
        The output type. This is used in mixed precision

    Returns
    -------
    Output : tvm.Tensor
        3-D with shape [batch, out_channel, out_width]
    """
    if isinstance(stride, (tuple, list)):
        stride = stride[0]
    cfg.stride = stride
    batch, inp_channels, inp_width = get_const_tuple(data.shape)
    _, out_channels, kernel_size = get_const_tuple(kernel.shape)
    pad_left, pad_right = nn.get_pad_tuple1d(padding, kernel_size)
    out_width = (inp_width - 1) * stride + kernel_size - pad_left - pad_right
    pad_left = kernel_size - 1 - pad_left
    pad_right = kernel_size - 1 - pad_right
    dilated_width = stride * (inp_width - 1) + 1

    # Dilate the input with zeros between elements and pad it, so the
    # transposed convolution can be computed as a direct convolution.
    data = tvm.compute(
        (batch, inp_channels, pad_left + dilated_width + pad_right),
        lambda n, c, x: tvm.if_then_else(
            tvm.all(x >= pad_left,
                    x < pad_left + dilated_width,
                    tvm.indexmod(x - pad_left, stride).equal(0)),
            data[n, c, tvm.indexdiv(x - pad_left, stride)],
            tvm.const(0., "float32")),
        name='data_pad')

    dc = tvm.reduce_axis((0, inp_channels), name='dc')
    dw = tvm.reduce_axis((0, kernel_size), name='dw')
    data_out = tvm.compute(
        (batch, out_channels, out_width),
        lambda b, c, w: tvm.sum(
            data[b, dc, w + dw].astype(out_dtype) *
            kernel[dc, c, kernel_size - 1 - dw].astype(out_dtype),
            axis=[dc, dw]),
        tag="conv1d_transpose_ncw")

    return data_out
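
# Usage sketch (illustrative only, not part of the operator above). The
# SimpleNamespace is a hypothetical stand-in for the autotvm config object,
# which is only used here to record cfg.stride; shapes and dtypes are
# arbitrary example values.
def _example_conv1d_transpose():
    from types import SimpleNamespace
    data = tvm.placeholder((1, 16, 32), name="data", dtype="float32")
    kernel = tvm.placeholder((16, 32, 3), name="kernel", dtype="float32")
    out = conv1d_transpose_ncw(SimpleNamespace(), data, kernel,
                               stride=(2,), padding=1, out_dtype="float32")
    # out_width = (32 - 1) * 2 + 3 - 1 - 1 = 63, so out.shape is (1, 32, 63)
    return out
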
def conv2d_transpose_nchw_cuda(cfg, data, kernel, stride, padding, out_dtype):
    """Transposed 2D convolution nchw forward operator.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]
    Filter : tvm.Tensor
        4-D with shape [in_channel, num_filter, filter_height, filter_width]
    stride : tuple of two ints
        The spatial stride along height and width
    padding : int or str
        Padding size, or ['VALID', 'SAME']
    out_dtype: str
        The output type. This is used in mixed precision

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    batch, inp_channels, inp_height, inp_width = get_const_tuple(data.shape)
    _, out_channels, kernel_height, kernel_width = get_const_tuple(kernel.shape)
    stride_height, stride_width = stride
    cfg.stride = stride
    pad_top, pad_left, pad_bottom, pad_right = nn.get_pad_tuple(
        padding, (kernel_height, kernel_width))

    out_width = (inp_width - 1) * stride_width + \
                kernel_width - pad_left - pad_right
    pad_left = kernel_width - 1 - pad_left
    pad_right = kernel_width - 1 - pad_right
    dilated_width = stride_width * (inp_width - 1) + 1

    out_height = (inp_height - 1) * stride_height + \
                 kernel_height - pad_top - pad_bottom
    pad_top = kernel_height - 1 - pad_top
    pad_bottom = kernel_height - 1 - pad_bottom
    dilated_height = stride_height * (inp_height - 1) + 1

    # compute pad
    data = tvm.compute(
        (batch, inp_channels,
         pad_top + dilated_height + pad_bottom,
         pad_left + dilated_width + pad_right),
        lambda n, c, y, x: tvm.if_then_else(
            tvm.all(x >= pad_left,
                    x < pad_left + dilated_width,
                    tvm.indexmod(x - pad_left, stride_width).equal(0),
                    y >= pad_top,
                    y < pad_top + dilated_height,
                    tvm.indexmod(y - pad_top, stride_height).equal(0)),
            data[n, c,
                 tvm.indexdiv(y - pad_top, stride_height),
                 tvm.indexdiv(x - pad_left, stride_width)],
            tvm.const(0., "float32")),
        name='data_pad')

    # compute transposed conv
    dc = tvm.reduce_axis((0, inp_channels), name='dc')
    dh = tvm.reduce_axis((0, kernel_height), name='dh')
    dw = tvm.reduce_axis((0, kernel_width), name='dw')
    data_out = tvm.compute(
        (batch, out_channels, out_height, out_width),
        lambda b, c, h, w: tvm.sum(
            data[b, dc, h + dh, w + dw].astype(out_dtype) *
            kernel[dc, c,
                   kernel_height - 1 - dh,
                   kernel_width - 1 - dw].astype(out_dtype),
            axis=[dc, dh, dw]),
        tag="conv2d_transpose_nchw")

    return data_out
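
# Usage sketch (illustrative only). As above, a SimpleNamespace stands in for
# the autotvm config, since the operator only assigns cfg.stride; the shapes
# below are arbitrary example values.
def _example_conv2d_transpose():
    from types import SimpleNamespace
    data = tvm.placeholder((1, 16, 8, 8), name="data", dtype="float32")
    kernel = tvm.placeholder((16, 32, 4, 4), name="kernel", dtype="float32")
    out = conv2d_transpose_nchw_cuda(SimpleNamespace(), data, kernel,
                                     stride=(2, 2), padding=1,
                                     out_dtype="float32")
    # out_height = out_width = (8 - 1) * 2 + 4 - 1 - 1 = 16,
    # so out.shape is (1, 32, 16, 16)
    return out
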
def fused_convs(input_data, filters, resnet_block=False):
    out_dtype = input_data.dtype

    Input = None
    nodes = [input_data]
    params = [input_data]

    # Counters used to give each intermediate stage a distinct name.
    padded_count = 0
    conv_count = 0
    depthwise_count = 0

    for f in filters:
        Input = nodes[-1]
        Filter = f.placeholder
        layout = f.layout
        depthwise = f.depthwise
        kernel = f.kernel
        stride = f.stride
        padding = f.padding
        dilation = f.dilation

        assert not (depthwise and kernel == 1)  # Don't consider 1x1 depthwise

        if isinstance(stride, int):
            stride_h = stride_w = stride
        else:
            stride_h, stride_w = stride

        if isinstance(dilation, int):
            dilation_h = dilation_w = dilation
        else:
            dilation_h, dilation_w = dilation

        batch, in_height, in_width, in_channel = Input.shape
        if f.NHWC_transpose:  # HWOI
            kernel_h, kernel_w, tmp, kernel_channel = Filter.shape
        else:  # HWIO
            kernel_h, kernel_w, kernel_channel, tmp = Filter.shape
        if depthwise:
            channel_multiplier = tmp
        else:
            num_filter = tmp

        # compute the output shape
        dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
        dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
        pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
            padding, (dilated_kernel_h, dilated_kernel_w))
        out_channel = simplify(in_channel * channel_multiplier) if depthwise else num_filter
        out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
        out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

        if f.kernel > 1:
            print("Padding is needed!")
            pad_before = [0, pad_top, pad_left, 0]
            pad_after = [0, pad_down, pad_right, 0]
            PaddedInput = pad(Input, pad_before, pad_after,
                              name="PaddedInput_{}".format(padded_count))
            padded_count += 1
            nodes.append(PaddedInput)

            # Update Input
            Input = PaddedInput
            batch, in_height, in_width, in_channel = Input.shape

        if not depthwise:
            rc = tvm.reduce_axis((0, in_channel), name='rc')
        if kernel > 1:
            ry = tvm.reduce_axis((0, kernel_h), name='ry')
            rx = tvm.reduce_axis((0, kernel_w), name='rx')

        if not depthwise:  # Normal convolution
            if kernel > 1:
                Output = tvm.compute(
                    (batch, out_height, out_width, out_channel),
                    lambda nn, yy, xx, ff: tvm.sum(
                        Input[nn, yy * stride_h + ry * dilation_h,
                              xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
                        (Filter[ry, rx, ff, rc] if f.NHWC_transpose
                         else Filter[ry, rx, rc, ff]).astype(out_dtype),
                        axis=[ry, rx, rc]),
                    name="Conv2dOutput_{}".format(conv_count),
                    tag="conv2d_nhwc")
            else:  # Only reduce rc axis
                Output = tvm.compute(
                    (batch, out_height, out_width, out_channel),
                    lambda nn, yy, xx, ff: tvm.sum(
                        Input[nn, yy * stride_h, xx * stride_w, rc].astype(out_dtype) *
                        (Filter[0, 0, ff, rc] if f.NHWC_transpose
                         else Filter[0, 0, rc, ff]).astype(out_dtype),
                        axis=[rc]),
                    name="Conv2dOutput_{}".format(conv_count),
                    tag="conv2d_nhwc")
            conv_count += 1
        else:  # Depthwise convolution (kernel > 1)
            Output = tvm.compute(
                (batch, out_height, out_width, out_channel),
                lambda b, i, j, c: tvm.sum(
                    (Input[b, i * stride_h + ry * dilation_h,
                           j * stride_w + rx * dilation_w,
                           tvm.indexdiv(c, channel_multiplier)].astype(out_dtype) *
                     (Filter[ry, rx,
                             tvm.indexmod(c, channel_multiplier),
                             tvm.indexdiv(c, channel_multiplier)]
                      if f.NHWC_transpose else
                      Filter[ry, rx,
                             tvm.indexdiv(c, channel_multiplier),
                             tvm.indexmod(c, channel_multiplier)]).astype(out_dtype)),
                    axis=[ry, rx]),
                name='DepthwiseConv2dOutput_{}'.format(depthwise_count),
                tag="depthwise_nhwc")
            depthwise_count += 1

        nodes.append(Output)
        params.append(Filter)

    if resnet_block:
        First = nodes[0]
        Last = nodes[-1]
        assert First.shape == Last.shape

        # Elementwise residual add: there is no reduction axis, so this is a
        # plain addition rather than a tvm.sum.
        Output = tvm.compute(
            (batch, out_height, out_width, out_channel),
            lambda b, i, j, c:
                First[b, i, j, c].astype(out_dtype) +
                Last[b, i, j, c].astype(out_dtype),
            name='ElementwiseAddOutput_{}'.format(depthwise_count),
            tag="elem_nhwc")
        nodes.append(Output)

    params.append(nodes[-1])  # Final output
    return nodes, params
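
# Usage sketch (illustrative only). The original filter-descriptor type is not
# shown in this file; the namedtuple below is a hypothetical stand-in exposing
# exactly the attributes fused_convs reads (placeholder, layout, depthwise,
# kernel, stride, padding, dilation, NHWC_transpose). Shapes are arbitrary.
def _example_fused_convs():
    from collections import namedtuple
    FilterSpec = namedtuple(
        "FilterSpec",
        ["placeholder", "layout", "depthwise", "kernel",
         "stride", "padding", "dilation", "NHWC_transpose"])

    data = tvm.placeholder((1, 56, 56, 32), name="input", dtype="float32")
    # A 3x3 depthwise stage (HWIO, channel_multiplier = 1) followed by a
    # 1x1 pointwise convolution, as in a depthwise-separable block.
    dw_filter = FilterSpec(
        placeholder=tvm.placeholder((3, 3, 32, 1), name="dw_kernel"),
        layout="NHWC", depthwise=True, kernel=3,
        stride=1, padding="SAME", dilation=1, NHWC_transpose=False)
    pw_filter = FilterSpec(
        placeholder=tvm.placeholder((1, 1, 32, 64), name="pw_kernel"),
        layout="NHWC", depthwise=False, kernel=1,
        stride=1, padding="SAME", dilation=1, NHWC_transpose=False)

    nodes, params = fused_convs(data, [dw_filter, pw_filter])
    return nodes[-1]  # (1, 56, 56, 64) output of the fused pair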