def _sample(i, c, ph, pw):
    """Compute one ROI-align output element by averaging bilinear samples.

    `i` selects the ROI, `c` the channel, and (`ph`, `pw`) the pooled cell.
    """
    roi = rois[i]
    batch_index = roi[0].astype('int32')
    # ROI layout: [batch_idx, w_start, h_start, w_end, h_end]; scale the
    # box coordinates from image space to feature-map space.
    start_w = roi[1] * spatial_scale
    start_h = roi[2] * spatial_scale
    end_w = roi[3] * spatial_scale
    end_h = roi[4] * spatial_scale

    # Force malformed ROIs to be at least 1x1.
    roi_h = tvm.max(end_h - start_h, tvm.const(1.0, dtype))
    roi_w = tvm.max(end_w - start_w, tvm.const(1.0, dtype))

    bin_h = roi_h / pooled_size_h
    bin_w = roi_w / pooled_size_w

    # Number of interpolation samples per bin: fixed when sample_ratio is
    # positive, otherwise adaptive to the bin size.
    if sample_ratio > 0:
        grid_h = grid_w = tvm.const(sample_ratio, 'int32')
    else:
        grid_h = tvm.ceil(roi_h / pooled_size_h).astype('int32')
        grid_w = tvm.ceil(roi_w / pooled_size_w).astype('int32')
    count = grid_h * grid_w

    rh = tvm.reduce_axis((0, grid_h))
    rw = tvm.reduce_axis((0, grid_w))
    # Top-left corner of this pooled cell inside the ROI.
    cell_h = start_h + ph * bin_h
    cell_w = start_w + pw * bin_w
    return tvm.sum(
        _bilinear(batch_index, c,
                  cell_h + (rh + 0.5) * bin_h / grid_h,
                  cell_w + (rw + 0.5) * bin_w / grid_w) / count,
        axis=[rh, rw])
def _sample(i, c, ph, pw):
    """Return the pooled value at (i, c, ph, pw) for ROI align.

    Averages `count` bilinear samples taken on a regular grid inside
    the pooled bin.
    """
    roi = rois[i]
    batch_index = roi[0].astype('int32')
    # ROI layout: [batch_index, w_start, h_start, w_end, h_end].
    roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[
        3], roi[4]
    # Scale ROI coordinates from image space to feature-map space.
    roi_start_h *= spatial_scale
    roi_end_h *= spatial_scale
    roi_start_w *= spatial_scale
    roi_end_w *= spatial_scale

    # force malformed ROIs to be 1x1
    roi_h = tvm.max(roi_end_h - roi_start_h, tvm.const(1.0, dtype))
    roi_w = tvm.max(roi_end_w - roi_start_w, tvm.const(1.0, dtype))

    bin_h = roi_h / pooled_size_h
    bin_w = roi_w / pooled_size_w

    if sample_ratio > 0:
        # Fixed number of sampling points per bin.
        roi_bin_grid_h = roi_bin_grid_w = tvm.const(sample_ratio, 'int32')
    else:
        # Adaptive: roughly one sample per input pixel covered by the bin.
        roi_bin_grid_h = tvm.ceil(roi_h / pooled_size_h).astype('int32')
        roi_bin_grid_w = tvm.ceil(roi_w / pooled_size_w).astype('int32')

    count = roi_bin_grid_h * roi_bin_grid_w
    rh = tvm.reduce_axis((0, roi_bin_grid_h))
    rw = tvm.reduce_axis((0, roi_bin_grid_w))
    # Move to the top-left corner of the (ph, pw) bin before sampling.
    roi_start_h += ph * bin_h
    roi_start_w += pw * bin_w
    return tvm.sum(
        _bilinear(batch_index, c,
                  roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
                  roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w) / count,
        axis=[rh, rw])
def _bilinear(i, c, y, x):
    """Bilinearly interpolate data[i, c] at fractional coordinates (y, x).

    NOTE(review): `astype('int32')` truncates toward zero, so for negative
    coordinates y_low/x_low would not be a true floor, and the low indices
    are not clamped to 0 — this appears to assume callers only pass
    in-range coordinates; confirm against the sampling code.
    """
    y_low = y.astype('int32')
    x_low = x.astype('int32')
    # Clamp the high neighbour so it stays inside the feature map.
    y_high = tvm.min(tvm.ceil(y).astype('int32'), height - 1)
    x_high = tvm.min(tvm.ceil(x).astype('int32'), width - 1)
    # Fractional offsets used as interpolation weights.
    y_lerp = y - y_low
    x_lerp = x - x_low
    bottom = x_lerp * data[i, c, y_high, x_high] + \
        (1-x_lerp) * data[i, c, y_high, x_low]
    top = x_lerp * data[i, c, y_low, x_high] + \
        (1-x_lerp) * data[i, c, y_low, x_low]
    return y_lerp * bottom + (1 - y_lerp) * top
def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf):
    """Copy output after applying nms to continuous memory.

    Parameters
    ----------
    sorted_bbox_buf : tvm.schedule.Buffer
        3-D with shape [batch, num_bbox, 5]. The last dimension is in format of
        [w_start, h_start, w_end, h_end, score].

    remove_mask_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box
        should be removed.

    out_buf : tvm.schedule.Buffer
        2-D with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is
        in format of [batch_index, w_start, h_start, w_end, h_end].

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch, num_bbox, _ = get_const_tuple(sorted_bbox_buf.shape)
    rpn_post_nms_top_n = get_const_int(out_buf.shape[0]) // batch
    ib = tvm.ir_builder.create()
    # i[b] tracks the next output slot for batch b.
    i = ib.allocate('int32', (batch, ), 'i', scope='local')
    p_sorted_bbox = ib.buffer_ptr(sorted_bbox_buf)
    p_remove = ib.buffer_ptr(remove_mask_buf)
    p_out = ib.buffer_ptr(out_buf)
    # nkeep[b] counts boxes in batch b that survived NMS.
    nkeep = ib.allocate('int32', (batch, ), 'nkeep', scope='local')
    with ib.for_range(0, batch) as b:
        nkeep[b] = 0
        i[b] = 0
    # First pass: count surviving boxes per batch.
    with ib.for_range(0, num_bbox) as j:
        with ib.for_range(0, batch) as b:
            with ib.if_scope(p_remove[b * num_bbox + j] == False):
                nkeep[b] += 1
    # Second pass: copy surviving boxes to the output, cycling through them
    # ceil(top_n / nkeep) times so the output is padded to exactly
    # rpn_post_nms_top_n rows per batch.
    with ib.for_range(0, batch) as b:
        with ib.if_scope(nkeep[b] > 0):
            with ib.for_range(
                    0,
                    tvm.ceil(
                        tvm.const(rpn_post_nms_top_n, 'float32') /
                        nkeep[b]).astype('int32')):
                with ib.for_range(0, num_bbox) as j:
                    offset_j = (b * num_bbox + j) * 5
                    offset_i = (b * rpn_post_nms_top_n + i[b]) * 5
                    with ib.if_scope(
                            tvm.all(i[b] < rpn_post_nms_top_n,
                                    p_remove[(b * num_bbox + j)] == False)):
                        # Column 0 carries the batch index as float.
                        p_out[offset_i] = tvm.expr.Cast('float32', b)
                        with ib.for_range(0, 4, for_type='unroll') as k:
                            p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k]
                        i[b] = i[b] + 1
    body = ib.get()
    return body
def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf):
    """Copy output after applying nms to continuous memory.

    Parameters
    ----------
    sorted_bbox_buf : tvm.schedule.Buffer
        3-D with shape [batch, num_bbox, 5]. The last dimension is in format of
        [w_start, h_start, w_end, h_end, score].

    remove_mask_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box
        should be removed.

    out_buf : tvm.schedule.Buffer
        2-D with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is
        in format of [batch_index, w_start, h_start, w_end, h_end].

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch, num_bbox, _ = get_const_tuple(sorted_bbox_buf.shape)
    rpn_post_nms_top_n = get_const_int(out_buf.shape[0]) // batch
    # One GPU thread per batch element; each thread compacts its own batch.
    nthread_tx = batch
    tx = tvm.thread_axis("threadIdx.x")
    ib = tvm.ir_builder.create()
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    # i[0] tracks the next output slot for this thread's batch.
    i = ib.allocate('int32', (1,), 'i', scope='local')
    i[0] = 0
    p_sorted_bbox = ib.buffer_ptr(sorted_bbox_buf)
    p_remove = ib.buffer_ptr(remove_mask_buf)
    p_out = ib.buffer_ptr(out_buf)
    b = tx

    nkeep = ib.allocate('int32', (1,), 'nkeep', scope='local')
    nkeep[0] = 0  # number of bbox after nms

    # First pass: count surviving boxes in this batch.
    with ib.for_range(0, num_bbox) as j:
        with ib.if_scope(p_remove[b * num_bbox + j] == False):
            nkeep[0] += 1
    # Second pass: copy surviving boxes, cycling through them
    # ceil(top_n / nkeep) times so the output is padded to exactly
    # rpn_post_nms_top_n rows per batch.
    with ib.if_scope(nkeep[0] > 0):
        with ib.for_range(0, tvm.ceil(
                tvm.const(rpn_post_nms_top_n, 'float32') / nkeep[0]).astype('int32')):
            with ib.for_range(0, num_bbox) as j:
                offset_j = (b * num_bbox + j) * 5
                offset_i = (b * rpn_post_nms_top_n + i[0]) * 5
                with ib.if_scope(tvm.all(i[0] < rpn_post_nms_top_n,
                                         p_remove[(b*num_bbox+j)] == False)):
                    # Column 0 carries the batch index as float.
                    p_out[offset_i] = tvm.expr.Cast('float32', b)
                    with ib.for_range(0, 4, for_type='unroll') as k:
                        p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k]
                    i[0] = i[0] + 1
    body = ib.get()
    return body
def _pool(i, c, ph, pw):
    """Max-pool the feature map over the (ph, pw) bin of ROI i, channel c."""
    roi = rois[i]
    batch_index = roi[0].astype('int32')
    # ROI layout: [batch_index, w_start, h_start, w_end, h_end].
    roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[
        3], roi[4]
    # Scale to feature-map coordinates and round to integer pixels.
    roi_start_h = tvm.round(roi_start_h * spatial_scale).astype('int32')
    roi_start_w = tvm.round(roi_start_w * spatial_scale).astype('int32')
    roi_end_h = tvm.round(roi_end_h * spatial_scale).astype('int32')
    roi_end_w = tvm.round(roi_end_w * spatial_scale).astype('int32')

    # force malformed ROIs to be 1x1
    roi_h = tvm.max(roi_end_h - roi_start_h + 1, tvm.const(1, 'int32'))
    roi_w = tvm.max(roi_end_w - roi_start_w + 1, tvm.const(1, 'int32'))

    bin_h = roi_h.astype(dtype) / pooled_size_h
    bin_w = roi_w.astype(dtype) / pooled_size_w

    # use epsilon to prevent floating point precision loss in floor/ceil
    epsilon = tvm.const(0.00001, dtype)
    hstart = tvm.floor(ph * bin_h + epsilon).astype('int32')
    wstart = tvm.floor(pw * bin_w + epsilon).astype('int32')
    hend = tvm.ceil((ph + 1) * bin_h - epsilon).astype('int32')
    wend = tvm.ceil((pw + 1) * bin_w - epsilon).astype('int32')
    # Clip the bin to the feature-map extent.
    hstart = tvm.min(tvm.max(hstart + roi_start_h, 0), height)
    wstart = tvm.min(tvm.max(wstart + roi_start_w, 0), width)
    hend = tvm.min(tvm.max(hend + roi_start_h, 0), height)
    wend = tvm.min(tvm.max(wend + roi_start_w, 0), width)

    non_empty = tvm.all(hstart < hend, wstart < wend)
    # Reducer identity: -inf for a non-empty bin (true max), but 0 for an
    # empty bin so the output is 0 instead of the dtype's minimum value.
    min_value = lambda dtype: tvm.if_then_else(
        non_empty, tvm.min_value(dtype), tvm.const(0.0, dtype))
    # pylint: disable=unnecessary-lambda
    _max = tvm.comm_reducer(lambda x, y: tvm.make._OpMax(x, y), min_value,
                            name='max')
    rh = tvm.reduce_axis((0, hend - hstart), 'rh')
    rw = tvm.reduce_axis((0, wend - wstart), 'rw')
    return _max(data[batch_index, c, hstart + rh, wstart + rw],
                axis=[rh, rw])
def ceil(x):
    """Take ceil of input x.

    Parameters
    ----------
    x : tvm.Tensor
        Input argument.

    Returns
    -------
    y : tvm.Tensor
        The result.
    """
    def _elementwise(*indices):
        # Apply the ceil intrinsic at every output coordinate.
        return tvm.ceil(x(*indices))

    return tvm.compute(x.shape, _elementwise)
def test_const_propagation():
    """Constant expressions should fold to immediates eagerly."""
    base = tvm.const(4, "int32")
    added = base + 5
    assert isinstance(added, tvm.expr.IntImm) and added.value == 9
    divided = added / 3
    assert isinstance(divided, tvm.expr.IntImm) and divided.value == 3
    as_float = divided + 0.5
    assert isinstance(as_float, tvm.expr.FloatImm) and as_float.value == 3.5
    ceiled = tvm.ceil(as_float)
    assert isinstance(ceiled, tvm.expr.FloatImm) and ceiled.value == 4
    as_int = ceiled.astype('int')
    assert isinstance(as_int, tvm.expr.IntImm) and as_int.value == 4
    # A compound round/divide/add chain should also fold to a constant.
    rounded = (tvm.round((tvm.const(6.5, 'float32') - 1) / 1.5) + 2).astype('int')
    assert isinstance(rounded, tvm.expr.IntImm) and rounded.value == 6
def test_const_fold4():
    """Check eager constant folding across int/float arithmetic and casts."""
    x1 = tvm.const(4, "int32")
    x2 = x1 + 5
    assert isinstance(x2, tvm.expr.IntImm) and x2.value == 9
    x3 = x2 / 3
    assert isinstance(x3, tvm.expr.IntImm) and x3.value == 3
    x4 = x3 + 0.55
    # Float fold compared with a tolerance, not exact equality.
    assert isinstance(x4, tvm.expr.FloatImm) and abs(x4.value - 3.55) < 1e-6
    x5 = tvm.ceil(x4)
    assert isinstance(x5, tvm.expr.FloatImm) and x5.value == 4
    x6 = x5.astype('int')
    assert isinstance(x6, tvm.expr.IntImm) and x6.value == 4, "x6={}".format(x6)
    # A compound round/divide/add chain should also fold to a constant.
    y = (tvm.round((tvm.const(6.5, 'float32') - 1) / 1.5) + 2).astype('int')
    assert isinstance(y, tvm.expr.IntImm) and y.value == 6
def resize_bilinear(indices, data, image_height, image_width, target_height,
                    target_width, boxes=None, box_indices=None,
                    extrapolation_value=None, layout='NCHW',
                    coordinate_transformation_mode="align_corners",
                    out_dtype=None):
    """Perform resize operation with bilinear method on the data.
    For details about Bilinear interpolation please refer to
    https://en.wikipedia.org/wiki/Bilinear_interpolation.

    Parameters
    ----------
    indices : tuple
        The indices of input data

    data : tvm.Tensor
        inputs is a 4-D tensor with shape
        [batch, channel, in_height, in_width]
        or  [batch, in_height, in_width, channel]

    image_height : integer
        Input image height

    image_width : integer
        Input image width

    target_height : integer
        The target resized image height

    target_width : integer
        The target resized image width

    boxes : tvm.Tensor, optional
        A 2-D tensor of shape [num_boxes, 4]. Each row of the tensor specifies
        the coordinates of a box.

    box_indices : tvm.Tensor, optional
        A 1-D tensor of shape [num_boxes], box_indices[i] specifies the data that
        the i-th box refers to.

    extrapolation_value: float, optional
        Value used for extrapolation, when applicable.

    layout: string, optional
        "NCHW", "NHWC", or "NCHWc".

    coordinate_transformation_mode: string, optional
        Describes how to transform the coordinate in the resized tensor
        to the coordinate in the original tensor.
        Refer to the ONNX Resize operator specification for details.
        Available options are "half_pixel", "align_corners" and "asymmetric".

    out_dtype: string, optional
        Type to return. If left None will be same as input type.

    Returns
    -------
    output : out_dtype
        The computed result with type out_dtype
    """
    def _cast_output(value, data_dtype="float32", out_dtype=None):
        # Cast the interpolated value back to the requested output dtype,
        # falling back to the input dtype when none is given.
        if out_dtype:
            dtype = out_dtype
        else:
            dtype = data_dtype
        return value.astype(dtype)

    def _lerp(A, B, t):
        # Linear interpolation between A and B with weight t in [0, 1].
        return A * (1.0 - t) + B * t

    def _get_indices(indices, layout='NCHW'):
        # Unpack the output indices according to the data layout; `cc` is
        # the inner-channel index, used only for NCHWxc.
        if layout == 'NHWC':
            n, y, x, c = indices
            cc = None
        elif layout == 'NCHW':
            n, c, y, x = indices
            cc = None
        else:
            n, c, y, x, cc = indices
        return n, c, y, x, cc

    def _get_pixel(data, layout, n, c, y, x, cc):
        # Read one input pixel, clamping coordinates to the image when not
        # in crop-and-resize (boxes) mode.
        if boxes is None:
            y = tvm.max(tvm.min(y, image_height - 1), 0)
            x = tvm.max(tvm.min(x, image_width - 1), 0)
        if layout == 'NHWC':
            return data(n, y, x, c).astype('float')
        if layout == 'NCHW':
            return data(n, c, y, x).astype('float')
        # else must be NCHWxc
        return data(n, c, y, x, cc).astype('float')

    n, c, y, x, cc = _get_indices(indices, layout=layout)
    box_idx = box_indices(n) if box_indices is not None else n

    if boxes is not None:
        # Crop-and-resize: map the output pixel into the normalized box.
        y1, x1 = boxes(n, 0), boxes(n, 1)
        y2, x2 = boxes(n, 2), boxes(n, 3)

        in_h = (image_height - 1) * (y2 - y1)
        in_w = (image_width - 1) * (x2 - x1)
        h_scale = in_h.astype('float') / (target_height - 1)
        w_scale = in_w.astype('float') / (target_width - 1)

        in_y = y1 * (image_height - 1) + h_scale * y
        in_x = x1 * (image_width - 1) + w_scale * x
    else:
        # Plain resize: choose the scale per the coordinate transform mode.
        if coordinate_transformation_mode == "align_corners":
            h_scale = (image_height - 1).astype('float') / (target_height - 1)
            w_scale = (image_width - 1).astype('float') / (target_width - 1)
        elif coordinate_transformation_mode in ["asymmetric", "half_pixel"]:
            h_scale = image_height.astype('float') / target_height
            w_scale = image_width.astype('float') / target_width
        else:
            raise ValueError("Unsupported coordinate_transformation_mode: {}".format(
                coordinate_transformation_mode))

        if coordinate_transformation_mode == "half_pixel":
            in_y = h_scale * (y + 0.5) - 0.5
            in_x = w_scale * (x + 0.5) - 0.5
        else:
            in_y = h_scale * y
            in_x = w_scale * x

    # Neighbouring integer coordinates and the fractional weights.
    top_y_index = tvm.floor(in_y).astype('int32')
    bottom_y_index = tvm.ceil(in_y).astype('int32')
    y_lerp = in_y - top_y_index

    left_x_index = tvm.floor(in_x).astype('int32')
    right_x_index = tvm.ceil(in_x).astype('int32')
    x_lerp = in_x - left_x_index

    top_left = _get_pixel(data, layout, box_idx, c,
                          top_y_index, left_x_index, cc)
    top_right = _get_pixel(data, layout, box_idx, c,
                           top_y_index, right_x_index, cc)
    bottom_left = _get_pixel(data, layout, box_idx, c,
                             bottom_y_index, left_x_index, cc)
    bottom_right = _get_pixel(data, layout, box_idx, c,
                              bottom_y_index, right_x_index, cc)

    top = _lerp(top_left, top_right, x_lerp)
    bottom = _lerp(bottom_left, bottom_right, x_lerp)
    value = _lerp(top, bottom, y_lerp)

    # use extrapolation_value if in_y/in_x is out of boundary
    if extrapolation_value is not None:
        out = tvm.if_then_else(in_y < 0,
                               extrapolation_value,
                               tvm.if_then_else(in_y > image_height - 1,
                                                extrapolation_value,
                                                value))
        value = tvm.if_then_else(in_x < 0,
                                 extrapolation_value,
                                 tvm.if_then_else(in_x > image_width - 1,
                                                  extrapolation_value,
                                                  out))
    return _cast_output(value, data.dtype, out_dtype=out_dtype)