def forward(self, inputs):
    self.retain_inputs((0,))
    xp = backend.get_array_module(*inputs)
    x, gy = inputs
    self._gy_shape = gy.shape
    gW = xp.zeros(self.w_shape, dtype=gy.dtype)

    if xp is numpy:
        # It is equivalent to `numpy.add.at(gW, x, gy)` but ufunc.at is
        # too slow.
        for ix, igy in six.moves.zip(x.ravel(), gy.reshape(x.size, -1)):
            if ix == self.ignore_label:
                continue
            gW[ix] += igy
    else:
        utils.nondeterministic('atomicAdd')
        if self.ignore_label is None:
            cuda.elementwise(
                'T gy, S x, S n_out', 'raw T gW',
                'ptrdiff_t w_ind[] = {x, i % n_out};'
                'atomicAdd(&gW[w_ind], gy)',
                'embed_id_bwd')(
                    gy, xp.expand_dims(x, -1), gW.shape[1], gW)
        else:
            cuda.elementwise(
                'T gy, S x, S n_out, S ignore', 'raw T gW',
                '''
                if (x != ignore) {
                    ptrdiff_t w_ind[] = {x, i % n_out};
                    atomicAdd(&gW[w_ind], gy);
                }
                ''',
                'embed_id_bwd_ignore_label')(
                    gy, xp.expand_dims(x, -1), gW.shape[1],
                    self.ignore_label, gW)
    return gW,
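# Both branches above perform the same scatter-add: each row of `gy` is
# accumulated into the `gW` row selected by the corresponding id in `x`.
# On GPU, atomicAdd makes the float accumulation order nondeterministic.
# A minimal NumPy sketch of that scatter-add; names are illustrative,
# not part of the original code:
import numpy

def embed_id_grad_reference(x, gy, n_vocab, ignore_label=-1):
    gW = numpy.zeros((n_vocab, gy.shape[-1]), dtype=gy.dtype)
    flat_x = x.ravel()
    flat_gy = gy.reshape(flat_x.size, -1)
    keep = flat_x != ignore_label
    # ufunc.at handles repeated indices correctly, unlike fancy-index +=
    numpy.add.at(gW, flat_x[keep], flat_gy[keep])
    return gW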
def label_probability(self, label_size, path, path_length,
                      multiply_seq, xp):
    seq_length = len(multiply_seq)
    n_batch = len(path)
    dtype = multiply_seq.dtype
    ret = xp.zeros((seq_length, n_batch, label_size), dtype)

    if xp is numpy:
        for b in six.moves.range(len(path)):
            target_path = path[b, :path_length[b]]
            chars = {c for c in target_path}
            for c in chars:
                ret[:, b, c] = xp.sum(
                    multiply_seq[:, b, 0:path_length[b]]
                    [:, target_path == c], axis=1)
    else:
        utils.nondeterministic('atomicAdd')
        cuda.elementwise(
            'T prob, I path, I path_length, I max_path_length',
            'raw T cum_prob',
            '''
            I t = i % max_path_length;
            if (t < path_length) {
                int n_batch = cum_prob.shape()[1];
                I s = i / (max_path_length * n_batch);
                I b = (i - s * (max_path_length * n_batch))
                    / max_path_length;
                int ind[] = {s, b, path};
                atomicAdd(&cum_prob[ind], prob);
            }
            ''', 'ctc_label_prob_sum')(
                multiply_seq, path, path_length[:, None],
                path.shape[1], ret)
    return ret
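# A NumPy sketch of what both branches compute: ret[t, b, c] is the sum
# of multiply_seq[t, b, s] over all path positions s of batch element b
# whose label is c. Names are illustrative:
import numpy

def label_probability_reference(label_size, path, path_length,
                                multiply_seq):
    seq_length, n_batch, _ = multiply_seq.shape
    ret = numpy.zeros((seq_length, n_batch, label_size),
                      multiply_seq.dtype)
    for b in range(n_batch):
        for s in range(path_length[b]):
            # repeated labels accumulate, as atomicAdd does on GPU
            ret[:, b, path[b, s]] += multiply_seq[:, b, s]
    return ret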
def forward_gpu(self, inputs):
    utils.nondeterministic('atomicAdd')
    self.retain_inputs((0, 1, 2))
    x, W, gy = inputs
    if self.reduce == 'no':
        gy = gy[:, None]

    samples = self.samples
    wx = self.wx.astype(x.dtype, copy=False)

    g = cuda.elementwise(
        'T wx, T gy, int32 m', 'T g',
        '''
        T y;
        if (i % m == 0) {
            y = 1;
        } else {
            y = -1;
        }
        g = -y * gy / (1.0f + __expf(wx * y));
        ''',
        'negative_sampling_calculate_g'
    )(wx, gy, self.sample_size + 1)

    cupy = cuda.cupy
    gx = cupy.zeros_like(x)
    n_in = x.shape[1]
    cuda.elementwise(
        'raw T g, raw T W, bool mask, raw S k, int32 c, int32 m', 'T gx',
        '''
        int d = i / c;
        T w = 0;
        if (mask == 1) {
            for (int j = 0; j < m; ++j) {
                w += g[d * m + j] * W[k[d * m + j] * c + i % c];
            }
        }
        gx = w;
        ''',
        'negative_sampling_calculate_gx'
    )(g, W, self.ignore_mask[:, None], samples, n_in,
      self.sample_size + 1, gx)

    gW = cupy.zeros_like(W)
    cuda.elementwise(
        'T g, raw T x, S k, bool mask, int32 c, int32 m', 'raw T gW',
        '''
        T gi = g;
        if (mask == 1) {
            for (int j = 0; j < c; ++j) {
                atomicAdd(&gW[k * c + j], gi * x[(i / m) * c + j]);
            }
        }
        ''',
        'negative_sampling_calculate_gw'
    )(g, x, samples, self.ignore_mask[:, None], n_in,
      self.sample_size + 1, gW)
    return gx, None, gW
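# A NumPy sketch of the first kernel above (illustrative names): for
# each row, column 0 holds the positive sample (y = +1) and the other
# m - 1 columns hold negatives (y = -1). Note 1 / (1 + exp(wx * y)) is
# sigmoid(-wx * y), i.e. the log-sigmoid loss gradient:
import numpy

def negative_sampling_g_reference(wx, gy, m):
    wx = wx.reshape(-1, m)
    y = numpy.full_like(wx, -1.0)
    y[:, 0] = 1.0  # column 0 of each row is the positive sample
    # gy broadcasts over the sample axis, matching gy[:, None] above
    return -y * gy / (1.0 + numpy.exp(wx * y))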
def backward_gpu(self, inputs, gy):
    utils.nondeterministic('atomicAdd')
    bottom_rois, bottom_roi_indices = inputs[1:]
    channels, height, width = self._bottom_data_shape[1:]
    bottom_diff = cuda.cupy.zeros(
        self._bottom_data_shape, bottom_rois.dtype)

    cuda.elementwise(
        '''
        raw T top_diff, raw int32 argmax_data,
        raw T bottom_rois, raw int32 bottom_roi_indices,
        int32 num_rois, T spatial_scale, int32 channels,
        int32 height, int32 width,
        int32 pooled_height, int32 pooled_width
        ''',
        'raw T bottom_diff',
        '''
        int pw = i % pooled_width;
        int ph = (i / pooled_width) % pooled_height;
        int c = (i / pooled_width / pooled_height) % channels;
        int n = i / pooled_width / pooled_height / channels;

        int roi_batch_ind = bottom_roi_indices[n];
        int bottom_diff_offset =
            (roi_batch_ind * channels + c) * height * width;
        int top_diff_offset =
            (n * channels + c) * pooled_height * pooled_width;

        int max_index =
            argmax_data[top_diff_offset + ph * pooled_width + pw];
        if (max_index != -1) {
            atomicAdd(
                &bottom_diff[bottom_diff_offset + max_index],
                top_diff[top_diff_offset + ph * pooled_width + pw]);
        }
        ''', 'roi_max_pooling_2d_bwd'
    )(gy[0], self.argmax_data, bottom_rois, bottom_roi_indices,
      bottom_rois.shape[0], self.spatial_scale, channels, height, width,
      self.outh, self.outw, bottom_diff, size=gy[0].size)

    return bottom_diff, None, None
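# A NumPy sketch of the routing this kernel performs (shapes and names
# illustrative): each pooled output cell sends its gradient to the one
# input location recorded in argmax_data during the forward pass.
# Different ROIs can hit the same location, hence atomicAdd on GPU:
import numpy

def roi_max_pool_bwd_reference(top_diff, argmax_data, roi_indices,
                               bottom_shape):
    # top_diff, argmax_data: (n_rois, channels, outh, outw)
    bottom_diff = numpy.zeros(bottom_shape, top_diff.dtype)
    n_rois, channels = top_diff.shape[:2]
    for n in range(n_rois):
        b = roi_indices[n]
        for c in range(channels):
            plane = bottom_diff[b, c].ravel()  # view into bottom_diff
            for idx, g in zip(argmax_data[n, c].ravel(),
                              top_diff[n, c].ravel()):
                if idx != -1:
                    plane[idx] += g
    return bottom_diff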
def _cupy_coo_matmul():
    utils.nondeterministic('atomicAdd')
    return cuda.elementwise(
        'int32 nb, int32 _m, int32 _n, int32 _k, int32 nnz, int32 chunk, \
        raw A A_data, raw T A_row, raw T A_col, \
        raw B _B',
        'raw C _C',
        '''
        int i_n = (i % _n);
        int i0 = (i / _n) * chunk;
        int i_C = -1;
        C val_C = 0;
        for (int i1 = 0; i1 < chunk; i1++) {
            int i_A = i0 + i1;
            int i_b = i_A / nnz;
            if (i_b >= nb) {
                continue;
            }
            int i_k = A_col[i_A];
            if (i_k < 0) {
                continue;
            }
            assert(i_k < _k);
            int i_m = A_row[i_A];
            if (i_m < 0) {
                continue;
            }
            assert(i_m < _m);
            int i_B = i_n + _n * (i_k + _k * i_b);
            int i_C_now = i_n + _n * (i_m + _m * i_b);
            A val_A = A_data[i_A];
            B val_B = _B[i_B];
            C val_C_now = static_cast<C>(val_A * val_B);
            if (i_C >= 0 && i_C != i_C_now) {
                atomicAdd(&_C[i_C], val_C);
                val_C = 0;
            }
            i_C = i_C_now;
            val_C += val_C_now;
        }
        if (i_C >= 0) {
            atomicAdd(&_C[i_C], val_C);
        }
        ''',
        'coo_matmul')
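# A dense NumPy reference (illustrative names) for what the kernel
# computes per batch: each COO nonzero (row r, col k, value v) of A
# contributes v * B[b, k, :] into C[b, r, :]. Entries with negative
# row/col indices are padding and are skipped, as in the kernel:
import numpy

def coo_matmul_reference(A_data, A_row, A_col, B, m):
    nb, nnz = A_data.shape
    _, _k, _n = B.shape
    C = numpy.zeros((nb, m, _n), B.dtype)
    for b in range(nb):
        for a in range(nnz):
            r, k = A_row[b, a], A_col[b, a]
            if r < 0 or k < 0:
                continue  # padding entry
            C[b, r] += A_data[b, a] * B[b, k]
    return C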
def backward_gpu(self, inputs, grad_outputs):
    utils.nondeterministic('atomicAdd')
    x, t, W = inputs
    gloss, = grad_outputs
    n_in = x.shape[1]
    gx = cuda.cupy.zeros_like(x)
    gW = cuda.cupy.zeros_like(W)
    cuda.elementwise(
        '''T wxy, raw T x, raw T w, raw int32 ts, raw int32 paths,
        raw T codes, raw int32 begins, raw T gloss, int32 c,
        int32 max_length''',
        'raw T gx, raw T gw',
        '''
        int ind = i / max_length;
        int offset = i - ind * max_length;
        int t = ts[ind];

        int begin = begins[t];
        int length = begins[t + 1] - begins[t];

        if (offset < length) {
            int p = begin + offset;
            int node = paths[p];
            T code = codes[p];

            T g = -gloss[0] * code / (1.0 + exp(wxy));
            for (int j = 0; j < c; ++j) {
                int w_ind[] = {node, j};
                int x_ind[] = {ind, j};
                atomicAdd(&gx[x_ind], g * w[w_ind]);
                atomicAdd(&gw[w_ind], g * x[x_ind]);
            }
        }
        ''', 'binary_hierarchical_softmax_bwd'
    )(self.wxy, x, W, t, self.paths, self.codes, self.begins, gloss,
      n_in, self.max_length, gx, gW)
    return gx, None, gW
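# A sketch of the scalar factor the kernel applies per visited tree
# node, assuming (not shown here) that the forward pass computes
# wxy = code * dot(w[node], x) and the per-node loss is
# softplus(-wxy) = -log(sigmoid(wxy)). Then
# d loss / d(dot) = -code / (1 + exp(wxy)), scaled by the upstream
# scalar gloss:
import numpy

def hsm_node_grad_reference(wxy, code, gloss):
    g = -gloss * code / (1.0 + numpy.exp(wxy))
    # the kernel then accumulates g * w[node] into gx[row]
    # and g * x[row] into gw[node] with atomicAdd
    return g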
def backward_gpu(self, inputs, gy):
    utils.nondeterministic('atomicAdd')
    bottom_rois, bottom_roi_indices = inputs[1:]
    channels, height, width = self._bottom_data_shape[1:]
    bottom_diff = cuda.cupy.zeros(self._bottom_data_shape, gy[0].dtype)

    if self.sampling_ratio[0] is None:
        sampling_ratio_h = 0
    else:
        sampling_ratio_h = self.sampling_ratio[0]
    if self.sampling_ratio[1] is None:
        sampling_ratio_w = 0
    else:
        sampling_ratio_w = self.sampling_ratio[1]
    cuda.elementwise(
        '''
        raw T top_diff, T spatial_scale, int32 channels,
        int32 height, int32 width,
        int32 pooled_height, int32 pooled_width,
        int32 sampling_ratio_h, int32 sampling_ratio_w,
        raw T bottom_rois, raw int32 bottom_roi_indices
        ''',
        'raw T bottom_diff, raw int32 argmax_data',
        '''
        // (n, c, h, w) coords in bottom data
        int pw = i % pooled_width;
        int ph = (i / pooled_width) % pooled_height;
        int c = (i / pooled_width / pooled_height) % channels;
        int n = i / pooled_width / pooled_height / channels;

        // Do not use rounding; this implementation detail is critical
        int roi_batch_ind = bottom_roi_indices[n];
        T roi_start_h = bottom_rois[n * 4 + 0] * spatial_scale;
        T roi_start_w = bottom_rois[n * 4 + 1] * spatial_scale;
        T roi_end_h = bottom_rois[n * 4 + 2] * spatial_scale;
        T roi_end_w = bottom_rois[n * 4 + 3] * spatial_scale;

        // Force malformed ROIs to be 1x1
        T roi_width = max(roi_end_w - roi_start_w, (T)1.);
        T roi_height = max(roi_end_h - roi_start_h, (T)1.);
        T bin_size_h = static_cast<T>(roi_height) /
            static_cast<T>(pooled_height);
        T bin_size_w = static_cast<T>(roi_width) /
            static_cast<T>(pooled_width);

        int bottom_diff_offset =
            (roi_batch_ind * channels + c) * height * width;
        int top_offset =
            (n * channels + c) * pooled_height * pooled_width;

        int max_index = argmax_data[top_offset + ph * pooled_width + pw];
        if (max_index != -1) {
            T top_diff_this_bin =
                top_diff[top_offset + ph * pooled_width + pw];

            // We use roi_bin_grid to sample the grid and mimic integral
            int roi_bin_grid_h = (sampling_ratio_h > 0)
                ? sampling_ratio_h
                : ceil(roi_height / pooled_height);  // e.g. = 2
            int roi_bin_grid_w = (sampling_ratio_w > 0)
                ? sampling_ratio_w
                : ceil(roi_width / pooled_width);

            int iy = max_index / roi_bin_grid_w;
            int ix = max_index % roi_bin_grid_w;

            T y = roi_start_h + ph * bin_size_h +
                static_cast<T>(iy + .5f) * bin_size_h /
                    static_cast<T>(roi_bin_grid_h);  // e.g. 0.5, 1.5
            T x = roi_start_w + pw * bin_size_w +
                static_cast<T>(ix + .5f) * bin_size_w /
                    static_cast<T>(roi_bin_grid_w);

            // bilinear_interpolation_gradient {{
            int y_low, x_low, y_high, x_high;
            T w1, w2, w3, w4;
            bool y_ret = get_bounds(y, height, y_low, y_high);
            bool x_ret = get_bounds(x, width, x_low, x_high);
            if (!x_ret || !y_ret) continue;
            get_bilinear_interp_params(
                y, x, y_low, x_low, y_high, x_high, w1, w2, w3, w4);

            if (w1 > 0 && y_low >= 0 && x_low >= 0) {
                T g1 = top_diff_this_bin * w1;
                atomicAdd(&bottom_diff[
                    bottom_diff_offset + y_low * width + x_low], g1);
            }
            if (w2 > 0 && y_low >= 0 && x_high <= width - 1) {
                T g2 = top_diff_this_bin * w2;
                atomicAdd(&bottom_diff[
                    bottom_diff_offset + y_low * width + x_high], g2);
            }
            if (w3 > 0 && y_high <= height - 1 && x_low >= 0) {
                T g3 = top_diff_this_bin * w3;
                atomicAdd(&bottom_diff[
                    bottom_diff_offset + y_high * width + x_low], g3);
            }
            if (w4 > 0 && y_high <= height - 1 && x_high <= width - 1) {
                T g4 = top_diff_this_bin * w4;
                atomicAdd(&bottom_diff[
                    bottom_diff_offset + y_high * width + x_high], g4);
            }
        }
        // }}
        ''',
        'roi_max_align_2d_bwd',
        preamble=_GET_BILINEAR_INTERP_KERNEL,
    )(gy[0], self.spatial_scale, channels, height, width,
      self.outh, self.outw, sampling_ratio_h, sampling_ratio_w,
      bottom_rois, bottom_roi_indices, bottom_diff, self.argmax_data,
      size=gy[0].size)
    return bottom_diff, None, None
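# The helpers get_bounds / get_bilinear_interp_params come from the
# _GET_BILINEAR_INTERP_KERNEL preamble, which is not shown here. A
# NumPy sketch of the standard ROIAlign formulation of those two steps
# (an assumption; the actual preamble may differ in detail):
import numpy

def bilinear_interp_params_reference(y, x, height, width):
    if y < -1. or y > height or x < -1. or x > width:
        return None  # sample point too far outside the feature map
    y, x = max(y, 0.), max(x, 0.)
    y_low, x_low = int(y), int(x)
    if y_low >= height - 1:
        y_low = y_high = height - 1
        y = float(y_low)
    else:
        y_high = y_low + 1
    if x_low >= width - 1:
        x_low = x_high = width - 1
        x = float(x_low)
    else:
        x_high = x_low + 1
    ly, lx = y - y_low, x - x_low
    # corner weights: (low,low), (low,high), (high,low), (high,high)
    w1 = (1. - ly) * (1. - lx)
    w2 = (1. - ly) * lx
    w3 = ly * (1. - lx)
    w4 = ly * lx
    return y_low, x_low, y_high, x_high, w1, w2, w3, w4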
def backward_gpu(self, inputs, gy):
    utils.nondeterministic('atomicAdd')
    bottom_rois, bottom_roi_indices = inputs[1:]
    channels, height, width = self._bottom_data_shape[1:]
    bottom_diff = cuda.cupy.zeros(self._bottom_data_shape, gy[0].dtype)

    cuda.elementwise(
        '''
        raw T top_diff, raw T bottom_rois, raw int32 bottom_roi_indices,
        T spatial_scale, int32 channels, int32 height, int32 width,
        int32 pooled_height, int32 pooled_width
        ''',
        'raw T bottom_diff',
        '''
        // pos in output filter
        int pw = i % pooled_width;
        int ph = (i / pooled_width) % pooled_height;
        int c = (i / pooled_width / pooled_height) % channels;
        int n = i / pooled_width / pooled_height / channels;

        int roi_batch_ind = bottom_roi_indices[n];
        int roi_start_h = round(bottom_rois[n * 4 + 0] * spatial_scale);
        int roi_start_w = round(bottom_rois[n * 4 + 1] * spatial_scale);
        int roi_end_h = round(bottom_rois[n * 4 + 2] * spatial_scale);
        int roi_end_w = round(bottom_rois[n * 4 + 3] * spatial_scale);

        // Force malformed ROIs to be 1x1
        int roi_height = max(roi_end_h - roi_start_h, 1);
        int roi_width = max(roi_end_w - roi_start_w, 1);

        T bin_size_h = static_cast<T>(roi_height) /
            static_cast<T>(pooled_height);
        T bin_size_w = static_cast<T>(roi_width) /
            static_cast<T>(pooled_width);

        int hstart = static_cast<int>(floor(static_cast<T>(ph)
            * bin_size_h));
        int wstart = static_cast<int>(floor(static_cast<T>(pw)
            * bin_size_w));
        int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
            * bin_size_h));
        int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
            * bin_size_w));

        // Add roi offsets and clip to input boundaries
        hstart = min(max(hstart + roi_start_h, 0), height);
        hend = min(max(hend + roi_start_h, 0), height);
        wstart = min(max(wstart + roi_start_w, 0), width);
        wend = min(max(wend + roi_start_w, 0), width);
        bool is_empty = (hend <= hstart) || (wend <= wstart);

        int bottom_diff_offset =
            (roi_batch_ind * channels + c) * height * width;
        int top_offset =
            (n * channels + c) * pooled_height * pooled_width;

        T count = (hend - hstart) * (wend - wstart);
        T diff_val = is_empty ? 0. :
            top_diff[top_offset + ph * pooled_width + pw] / count;
        for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
                int bottom_index = h * width + w;
                atomicAdd(
                    &bottom_diff[bottom_diff_offset + bottom_index],
                    diff_val);
            }
        }
        ''', 'roi_average_pooling_2d_bwd'
    )(gy[0], bottom_rois, bottom_roi_indices, self.spatial_scale,
      channels, height, width, self.outh, self.outw, bottom_diff,
      size=gy[0].size)

    return bottom_diff, None, None
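# A NumPy sketch of the inner loop above (illustrative names): each
# pooled cell spreads its gradient uniformly over the input bin it
# averaged in the forward pass. Overlapping ROIs write to the same
# cells, which is why the kernel needs atomicAdd:
import numpy

def spread_bin_grad_reference(bottom_diff_plane, g,
                              hstart, hend, wstart, wend):
    # bottom_diff_plane: one (height, width) channel of bottom_diff
    count = (hend - hstart) * (wend - wstart)
    if count > 0:
        bottom_diff_plane[hstart:hend, wstart:wend] += g / count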
def backward_gpu(self, inputs, gy):
    utils.nondeterministic('atomicAdd')
    bottom_rois, bottom_roi_indices = inputs[1:]
    channels, height, width = self._bottom_data_shape[1:]
    bottom_diff = cuda.cupy.zeros(self._bottom_data_shape, gy[0].dtype)

    if self.sampling_ratio[0] is None:
        sampling_ratio_h = 0
    else:
        sampling_ratio_h = self.sampling_ratio[0]
    if self.sampling_ratio[1] is None:
        sampling_ratio_w = 0
    else:
        sampling_ratio_w = self.sampling_ratio[1]
    cuda.elementwise(
        '''
        raw T top_diff, T spatial_scale, int32 channels,
        int32 height, int32 width,
        int32 pooled_height, int32 pooled_width,
        int32 sampling_ratio_h, int32 sampling_ratio_w,
        raw T bottom_rois, raw int32 bottom_roi_indices
        ''',
        'raw T bottom_diff, raw int32 argmax_data',
        '''
        // (n, c, h, w) coords in bottom data
        int pw = i % pooled_width;
        int ph = (i / pooled_width) % pooled_height;
        int c = (i / pooled_width / pooled_height) % channels;
        int n = i / pooled_width / pooled_height / channels;

        // Do not use rounding; this implementation detail is critical
        int roi_batch_ind = bottom_roi_indices[n];
        T roi_start_h = bottom_rois[n * 4 + 0] * spatial_scale;
        T roi_start_w = bottom_rois[n * 4 + 1] * spatial_scale;
        T roi_end_h = bottom_rois[n * 4 + 2] * spatial_scale;
        T roi_end_w = bottom_rois[n * 4 + 3] * spatial_scale;

        // Force malformed ROIs to be 1x1
        T roi_width = max(roi_end_w - roi_start_w, (T)1.);
        T roi_height = max(roi_end_h - roi_start_h, (T)1.);
        T bin_size_h = static_cast<T>(roi_height) /
            static_cast<T>(pooled_height);
        T bin_size_w = static_cast<T>(roi_width) /
            static_cast<T>(pooled_width);

        int bottom_diff_offset =
            (roi_batch_ind * channels + c) * height * width;
        int top_offset =
            (n * channels + c) * pooled_height * pooled_width;

        int max_index = argmax_data[top_offset + ph * pooled_width + pw];
        if (max_index != -1) {
            T top_diff_this_bin =
                top_diff[top_offset + ph * pooled_width + pw];

            // We use roi_bin_grid to sample the grid and mimic integral
            int roi_bin_grid_h = (sampling_ratio_h > 0)
                ? sampling_ratio_h
                : ceil(roi_height / pooled_height);  // e.g. = 2
            int roi_bin_grid_w = (sampling_ratio_w > 0)
                ? sampling_ratio_w
                : ceil(roi_width / pooled_width);

            int iy = max_index / roi_bin_grid_w;
            int ix = max_index % roi_bin_grid_w;

            T y = roi_start_h + ph * bin_size_h +
                static_cast<T>(iy + .5f) * bin_size_h /
                    static_cast<T>(roi_bin_grid_h);  // e.g. 0.5, 1.5
            T x = roi_start_w + pw * bin_size_w +
                static_cast<T>(ix + .5f) * bin_size_w /
                    static_cast<T>(roi_bin_grid_w);

            // bilinear_interpolation_gradient {{
            int y_low, x_low, y_high, x_high;
            T w1, w2, w3, w4;
            bool y_ret = get_bounds(y, height, y_low, y_high);
            bool x_ret = get_bounds(x, width, x_low, x_high);
            if (!x_ret || !y_ret) continue;
            get_bilinear_interp_params(
                y, x, y_low, x_low, y_high, x_high, w1, w2, w3, w4);

            T g1 = top_diff_this_bin * w1;
            T g2 = top_diff_this_bin * w2;
            T g3 = top_diff_this_bin * w3;
            T g4 = top_diff_this_bin * w4;

            if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
                atomicAdd(&bottom_diff[
                    bottom_diff_offset + y_low * width + x_low], g1);
                atomicAdd(&bottom_diff[
                    bottom_diff_offset + y_low * width + x_high], g2);
                atomicAdd(&bottom_diff[
                    bottom_diff_offset + y_high * width + x_low], g3);
                atomicAdd(&bottom_diff[
                    bottom_diff_offset + y_high * width + x_high], g4);
            }
        }
        // }}
        ''',
        'roi_max_align_2d_bwd',
        preamble=_GET_BILINEAR_INTERP_KERNEL,
    )(gy[0], self.spatial_scale, channels, height, width,
      self.outh, self.outw, sampling_ratio_h, sampling_ratio_w,
      bottom_rois, bottom_roi_indices, bottom_diff, self.argmax_data,
      size=gy[0].size)
    return bottom_diff, None, None