def _helper_boxes_shape(self, func):
    # test boxes as Tensor[N, 5]
    with self.assertRaises(AssertionError):
        a = torch.linspace(1, 8 * 8, 8 * 8).reshape(1, 1, 8, 8)
        boxes = torch.tensor([[0, 0, 3, 3]], dtype=a.dtype)
        func(a, boxes, output_size=(2, 2))

    # test boxes as List[Tensor[N, 4]]
    with self.assertRaises(AssertionError):
        a = torch.linspace(1, 8 * 8, 8 * 8).reshape(1, 1, 8, 8)
        boxes = torch.tensor([[0, 0, 3]], dtype=a.dtype)
        ops.roi_pool(a, [boxes], output_size=(2, 2))
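# A minimal usage sketch (not part of the test above) showing the two box formats that
# torchvision.ops.roi_pool accepts: a single Tensor[K, 5] whose first column is the batch
# index, or a List[Tensor[N, 4]] with one (x1, y1, x2, y2) tensor per image. The tensor
# names and shapes below are illustrative assumptions only.
import torch
from torchvision import ops

feat = torch.rand(2, 3, 8, 8)                                  # (batch, channels, H, W)
rois = torch.tensor([[0, 0, 0, 4, 4],                          # box on image 0
                     [1, 1, 1, 6, 6]], dtype=feat.dtype)       # box on image 1
pooled = ops.roi_pool(feat, rois, output_size=(2, 2))          # Tensor[K, 5] form
pooled_list = ops.roi_pool(feat, [rois[:1, 1:], rois[1:, 1:]],  # List[Tensor[N, 4]] form
                           output_size=(2, 2))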
def forward(self, batch, proposals):
    """
    Feed forward the proposal regions into the RCNN head to predict object
    ROIs and corresponding classes.
    """
    # Perform ROI Max Pooling to create feature sets of the same size for
    # the obtained proposals.
    # ______________________________________________________________________
    # Append batch indices to proposal coordinates and permute them to the
    # [k, x1, y1, x2, y2] format required by roi_pool:
    B, N, _ = proposals.size()
    batchids = torch.from_numpy(np.repeat(np.arange(B), N))
    batchids = batchids.view(-1, 1).to(proposals.device).to(proposals.dtype)
    rois = torch.cat((batchids, proposals.view(-1, 4)), dim=1)

    # NOTE: IMPORTANT - have to convert YX -> XY
    xyROIs = rois[:, [0, 2, 1, 4, 3]]

    # Perform pooling:
    scale = 1 / float(self.spatial_scale)
    pool = roi_pool(batch, xyROIs, self.pool_size, spatial_scale=scale)

    # Feed pooled features to the RCNN head, obtain ROI targets and scores:
    # ______________________________________________________________________
    pool_features = self.Classifier(pool.view(pool.size(0), -1))
    roi_targets = self.RCNNBBox(pool_features)
    roi_scores = self.RCNNClass(pool_features)

    # Resize the predictions to batch-specific scores:
    # ______________________________________________________________________
    roi_targets = roi_targets.view([B, N, roi_targets.size(1)])
    roi_scores = roi_scores.view([B, N, roi_scores.size(1)])

    return roi_targets, roi_scores
def forward(self, x):
    feature = x[0]
    print("feature = {}".format(feature.shape))
    rois = x[1].view(-1, 4)
    print("rois = {}".format(rois.shape))
    # torchvision's roi_pool expects boxes as Tensor[K, 5] (with a leading batch index)
    # or as List[Tensor[N, 4]]; wrap the (N, 4) rois in a list, assuming a single-image batch.
    samples = ops.roi_pool(input=feature,
                           boxes=[rois],
                           output_size=(self.pooling_regions, self.pooling_regions))
    return samples
def forward(self, batch_imgs, batch_boxes, batch_scores):
    # assume batch size is 1
    batch_boxes = [batch_boxes[0]]
    out = self.features(batch_imgs)  # [1, 256, 21, 29]
    out = roi_pool(out, batch_boxes, self.roi_output_size, 1.0 / 16)
    out = out.view(len(batch_boxes[0]), -1)
    out = out * batch_scores[0]  # apply box scores
    out = self.fcs(out)  # [4000, 4096]
    classification_scores = F.softmax(self.fc_c(out), dim=1)
    detection_scores = F.softmax(self.fc_d(out), dim=0)
    combined_scores = classification_scores * detection_scores
    return combined_scores
def cal_perc_feat(self, x, target_bbox=None):
    initial_size = x.size()
    image_w = initial_size[2]
    output = {}
    mask_output = {}
    roi_cnt = 0
    for name, module in self.vgg_layers._modules.items():
        x = module(x)
        if name in self.layers_mapping:
            if target_bbox is not None:
                spatial_scale = x.shape[2] / image_w
                mask_output[self.layers_mapping[name]] = roi_pool(
                    x, target_bbox, self.roi_size[roi_cnt], spatial_scale)
                roi_cnt += 1
            output[self.layers_mapping[name]] = x
    return output, mask_output
def train(self, epoch):
    self.network.train()
    num_trains = len(self.train_loader.dataset)
    for idx, (data, target) in enumerate(self.train_loader):
        # Multi-scale training is not used, so all images are resized to the same scale
        # and can be stacked and processed as a batch, which is faster.
        data = torch.stack(data, 0)
        if self.use_cuda:
            data = data.to(self.device)
            target = [{k: v.to(self.device) for k, v in targ.items()} for targ in target]

        output, feature = self.network(data)
        loss_dict = self.loss_func(output, target)

        # """
        proposals = self.loss_func.proposal(output)
        # ROI pooling
        outs = []
        proposal_list = []
        fh, fw = feature.shape[-2:]
        stride_h, stride_w = self.resize[0] / fh, self.resize[1] / fw
        for i, proposal in enumerate(proposals):
            # Rescale the proposals from input-image coordinates to feature-map coordinates.
            boxes = proposal["boxes"] / torch.as_tensor(
                [stride_w, stride_h, stride_w, stride_h],
                dtype=torch.float32, device=self.device).unsqueeze(0)
            # roi_pool expects boxes as List[Tensor[N, 4]] for a single-image feature map;
            # use the stride-scaled boxes computed above.
            roi_out = roi_pool(feature[i].unsqueeze(0), [boxes], [7, 7])
            outs.append(roi_out)
            proposal_list.append(boxes)
        output = self.network.doRCNN(outs)
        loss_dict_rcnn = self.loss_func_rcnn(output, proposal_list, target)
        loss_dict.update(loss_dict_rcnn)
        # """

        losses = sum(loss for loss in loss_dict.values())

        self.optimizer.zero_grad()
        losses.backward()
        self.optimizer.step()

        # Log to TensorBoard
        self.writer.add_scalar('total_loss', losses.item(),
                               epoch * num_trains // self.batch_size + idx)
        for key, loss in loss_dict.items():
            self.writer.add_scalar(key, loss.item(),
                                   epoch * num_trains // self.batch_size + idx)

        if idx % self.print_freq == 0:
            ss = "epoch:{}-({}/{})".format(epoch, idx * self.batch_size, num_trains)
            ss += "\ttotal:{:.3f}".format(losses.item())
            for key, loss in loss_dict.items():
                ss += "\t{}:{:.3f}".format(key, loss.item())
            print(ss)
def forward(self, x, pred_boxes, pred_batch_idx):
    num_regions = len(pred_batch_idx)
    pred_indices_and_boxes = np.concatenate(
        (pred_batch_idx.reshape(-1, 1), pred_boxes), axis=1)
    pred_indices_and_boxes = torch.from_numpy(pred_indices_and_boxes).to(x)

    # TODO should I use roi_align? RoI sampling should be configurable
    # regions = ops.roi_align(x, pred_indices_and_boxes, self.roi_align_size, self.spatial_scale)
    regions = ops.roi_pool(x, pred_indices_and_boxes, self.roi_align_size, self.spatial_scale)

    y = self.avgpool(self.layer4(regions))
    y = torch.flatten(y, start_dim=1)

    pred_roi_cls = self.fc_cls(y)
    pred_roi_loc = self.fc_loc(y).view(num_regions, -1, 4)
    return pred_roi_cls, pred_roi_loc
def forward(self, x, region_proposal):
    # assume batch size is 1
    # print(region_proposal.shape)
    region_proposal = [region_proposal[0]]
    # print(x.shape)
    out = self.features(x)  # [1, 512, 30, 30]
    # print(out.shape)
    # print(len(region_proposal[0]))
    out = roi_pool(out, region_proposal, self.roi_output_size, 1.0 / 16)  # spp
    # print(out.shape)
    out = out.view(len(region_proposal[0]), -1)
    # print(out.shape)
    # out = out * batch_scores[0]  # apply box scores
    out = self.fc6_and_fc7(out)  # [4000, 4096]
    # print(out.shape)
    classification_scores = F.softmax(self.fc8c(out), dim=1)
    detection_scores = F.softmax(self.fc8d(out), dim=0)
    combined_scores = classification_scores * detection_scores
    # print(combined_scores.shape)
    return combined_scores
def relocalize(xA_, yA_, xB_, yB_, score_, feature_A_2x, feature_B_2x,
               N_matches=None, upsample_positions=True, crop_size=2):
    assert crop_size == 3 or crop_size == 2

    if N_matches is None:
        N_matches = xA_.shape[1]
    else:
        idx = torch.argsort(-score_.view(-1))
        N_matches = min(N_matches, idx.shape[0])
        idx = idx[:N_matches]
        score_ = score_[:, idx]
        xA_ = xA_[:, idx]
        yA_ = yA_[:, idx]
        xB_ = xB_[:, idx]
        yB_ = yB_[:, idx]

    if upsample_positions:
        xA_ = xA_ * 2
        yA_ = yA_ * 2
        xB_ = xB_ * 2
        yB_ = yB_ * 2

    coords_A = torch.cat(
        (torch.zeros(1, N_matches).to(xA_.device),
         xA_ - (crop_size % 2), yA_ - (crop_size % 2),
         xA_ + 1, yA_ + 1), dim=0).t()
    coords_B = torch.cat(
        (torch.zeros(1, N_matches).to(xB_.device),
         xB_ - (crop_size % 2), yB_ - (crop_size % 2),
         xB_ + 1, yB_ + 1), dim=0).t()

    ch = feature_A_2x.shape[1]

    feature_A_local = O.roi_pool(feature_A_2x, coords_A,
                                 output_size=(crop_size, crop_size)).view(N_matches, ch, -1, 1)
    feature_B_local = O.roi_pool(feature_B_2x, coords_B,
                                 output_size=(crop_size, crop_size)).view(N_matches, ch, 1, -1)

    deltaY, deltaX = torch.meshgrid(
        torch.linspace(-(crop_size % 2), 1, crop_size),
        torch.linspace(-(crop_size % 2), 1, crop_size))
    deltaX = deltaX.contiguous().view(-1).to(xA_.device)
    deltaY = deltaY.contiguous().view(-1).to(xA_.device)

    corr_local = (feature_A_local * feature_B_local).sum(dim=1)

    delta_A_idx = torch.argmax(corr_local.max(dim=2, keepdim=True)[0], dim=1)
    delta_B_idx = torch.argmax(corr_local.max(dim=1, keepdim=True)[0], dim=2)

    xA_ = xA_ + deltaX[delta_A_idx].t()
    yA_ = yA_ + deltaY[delta_A_idx].t()
    xB_ = xB_ + deltaX[delta_B_idx].t()
    yB_ = yB_ + deltaY[delta_B_idx].t()

    return xA_, yA_, xB_, yB_, score_
def forward(self, x, rois):
    return roi_pool(x, rois, (self.outh, self.outw), self.spatial_scale)
def forward(self, features, rois):
    return roi_pool(features, rois, self.out_size, self.spatial_scale)
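# A minimal sketch of how a thin roi_pool wrapper module like the forward above might be
# instantiated and called. The class name RoIPool2D and its constructor arguments are
# assumptions for illustration; only torchvision.ops.roi_pool itself is a known API.
import torch
import torch.nn as nn
from torchvision.ops import roi_pool


class RoIPool2D(nn.Module):
    def __init__(self, out_size, spatial_scale):
        super().__init__()
        self.out_size = out_size            # (height, width) of each pooled region
        self.spatial_scale = spatial_scale  # maps box coordinates from image space to feature space

    def forward(self, features, rois):
        return roi_pool(features, rois, self.out_size, self.spatial_scale)


pooler = RoIPool2D(out_size=(7, 7), spatial_scale=1.0 / 16)
features = torch.rand(1, 256, 32, 32)
rois = torch.tensor([[0.0, 0.0, 0.0, 128.0, 128.0]])  # (batch_idx, x1, y1, x2, y2) in image coords
pooled = pooler(features, rois)                        # -> (1, 256, 7, 7)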
                j, y1, x1, y2, x2, oH, oW,
            )
            # (i, j) define the number and the position of the 4x3 grid cells,
            # so for each sample and each bounding box this is done 4x3 = 12 times,
            # because 12 sectors are needed over which to take the max and obtain a 4x3 final matrix.
            # IMPORTANT: regions can overlap! This does not happen if .floor and .ceil are replaced with .round.
            slice = input[n, :, y_start:y_end, x_start:x_end]
            # input is torch.Size([2, 16, 5, 7]) -> take one sample (index 0 to n-1), all channels,
            # and the portion defined by this group of coordinates -> torch.Size([16, 2, 2])
            slice, _ = torch.max(torch.max(slice, dim=1)[0], dim=1)
            # IMPORTANT PART: from this tensor (a 2x2 patch with 16 channels) take the max value
            # in each grid cell, along rows and along columns (the cells are only 2x2 here because
            # the input is small). In numpy this would be np.amax(slice, axis=(-1, -2)); in PyTorch
            # torch.max is applied twice over axis 1 -> torch.Size([16]).
            # inner torch.max: torch.Size([16, 2, 2]) -> torch.Size([16, 2])
            # outer torch.max: torch.Size([16, 2])   -> torch.Size([16])
            # torch.max returns both the max values and their indices, so the second value is discarded.
            out[n, l, :, i, j] = slice  # insert a whole channel via ':'

out_pytorch = roi_pool(input, boxes, (oH, oW), spatial_scale=1.0)
out_pytorch = out_pytorch.reshape((N, L, C, oH, oW))

# Computes element-wise equality
# print(torch.eq(out, out_pytorch, out=None))  # not equal
# slice = input.numpy()[:, :, 3:6, 0:4]  # convert to numpy
# print(slice.data)  # get tensor data
def prepare_decoder(self, diffs, boxes_resized, image_features, image_sizes,
                    lengths, batched=False):
    B, L, F = diffs.shape
    bounds = (torch.cumsum(lengths, dim=0) - 1).tolist()
    keep = torch.tensor(
        sorted(
            list(
                set(range(len(image_sizes) - 1)).difference(
                    set(bounds[:-1])))))

    if self.use_pre_conv:
        assert not isinstance(image_features, list)
        image_features = self.pre_conv(image_features)

    # roi pooling on enlarged areas around boxes
    widths, heights = get_width(boxes_resized[keep]), get_height(boxes_resized[keep])
    dx = ((self.correlation_args['patch_size'] - 1) * widths *
          self.correlation_args['dilation_patch']) / (2 * self.roi_output_size)
    dy = ((self.correlation_args['patch_size'] - 1) * heights *
          self.correlation_args['dilation_patch']) / (2 * self.roi_output_size)
    if not self.use_roi_align:
        dx, dy = dx.ceil(), dy.ceil()
    if self.fixed_env:
        dx, dy = dx + widths, dy + heights
    dpos = torch.stack([-dx, -dy, dx, dy], dim=1)
    proposals = list((boxes_resized[keep] + dpos).unsqueeze(1))

    if self.use_roi_align:
        if self.fixed_env:
            if batched:
                box_to_images = torch.cat([
                    torch.arange(lengths.max() - l, lengths.max() - 1)
                    for l in lengths
                ])
                enlarged_boxes = boxes_resized[keep] + dpos
                proposals = [
                    enlarged_boxes[box_to_images == l]
                    for l in range(lengths.max() - 1)
                ]
                image_sizes = image_sizes[0].repeat(len(image_features) - 1, 1)
                perm = torch.zeros_like(box_to_images)
                current_i = 0
                for i in range(box_to_images.max().item() + 1):
                    mask = box_to_images == i
                    perm[mask] = torch.arange(current_i, current_i + mask.sum().item())
                    current_i = perm.max() + 1
                prev_features = self.roi_pool_env_ext(
                    OrderedDict([(0, image_features[:-1])]), proposals,
                    image_sizes.tolist())[perm]
                next_features = self.roi_pool_env_ext(
                    OrderedDict([(0, image_features[1:])]), proposals,
                    image_sizes.tolist())[perm]
            else:
                prev_features = self.roi_pool_env_ext(
                    OrderedDict([(0, image_features[keep])]), proposals,
                    image_sizes[keep].tolist())
                next_features = self.roi_pool_env_ext(
                    OrderedDict([(0, image_features[keep + 1])]), proposals,
                    image_sizes[keep + 1].tolist())
        else:
            assert not batched
            prev_features = self.roi_pool_ext(
                OrderedDict([(0, image_features[keep])]), proposals,
                image_sizes[keep].tolist())
            next_features = self.roi_pool_ext(
                OrderedDict([(0, image_features[keep + 1])]), proposals,
                image_sizes[keep + 1].tolist())
    else:
        output_size = (self.roi_output_size_ext, self.roi_output_size_ext)
        prev_features = roi_pool(image_features[keep], proposals, output_size,
                                 spatial_scale=0.125)
        next_features = roi_pool(image_features[keep + 1], proposals, output_size,
                                 spatial_scale=0.125)

    # correlate
    correlation = correlate(prev_features, next_features, self.correlation_args)

    if self.fixed_env:
        # for boxes with height > threshold, set appropriate locations to zero
        del_idc = heights > 120
        margin = int((self.roi_output_size_env_ext - self.roi_output_size) / 2)
        mask = torch.ones_like(correlation).cuda()
        mask[del_idc, :, :margin] = 0
        mask[del_idc, :, -margin:] = 0
        mask[del_idc, :, :, :margin] = 0
        mask[del_idc, :, :, -margin:] = 0
        correlation = correlation * mask
        # now extract box features
        margin = int((self.roi_output_size_env_ext - self.roi_output_size_env) / 2)
        correlation = correlation[:, :, margin:-margin, margin:-margin]
    elif not self.use_env_features:
        assert not self.fixed_env
        # isolate correlation features which belong to the bounding box
        margin = int((self.roi_output_size_ext - self.roi_output_size) / 2)
        correlation = correlation[:, :, margin:-margin, margin:-margin]

    if self.correlation_only:
        if self.refine_correlation:
            out_conv3 = self.conv3_1(correlation)
            box_features = self.conv4_1(self.conv4(out_conv3))
        else:
            box_features = self.conv_reduce(correlation)
    else:
        # roi pool image features and append them to corr features
        box_proposals = list(boxes_resized[keep].unsqueeze(1))
        roi_out = self.roi_pool(OrderedDict([(0, image_features[keep])]),
                                box_proposals, image_sizes[keep].tolist())
        out_conv_redir = self.conv_redir(roi_out)
        in_conv3_1 = torch.cat([out_conv_redir, correlation], dim=1)
        out_conv3 = self.conv3_1(in_conv3_1)
        box_features = self.conv4_1(self.conv4(out_conv3))

    if self.avg_box_features:
        assert not self.max_box_features
        box_features = box_features.view(*box_features.shape[:2], -1).mean(2).unsqueeze(2)
    elif self.max_box_features:
        box_features = box_features.view(*box_features.shape[:2], -1).max(dim=2, keepdim=True)[0]

    corr_lengths = lengths - 1
    target_idc = (torch.cumsum(corr_lengths, dim=0) - 1).tolist()
    in_idc = list(set(range(len(keep))).difference(set(target_idc)))

    encoder_in = torch.zeros(B, L, self.input_size).cuda()
    encoder_in[:, :, :F] = diffs
    mask = torch.zeros(encoder_in.shape[:2], dtype=torch.bool)
    for i, l in enumerate(corr_lengths):
        if l - 1 > 0:
            mask[i, -(l - 1):] = True
    if len(in_idc) > 0 and not self.correlation_last_only:
        t_tmp = encoder_in[mask]
        t_tmp[:, F:] = box_features[in_idc].view(len(in_idc), -1)
        encoder_in[mask] = t_tmp

    # feed features into encoder, retrieve hidden states
    encoder_out = self.encoder(encoder_in)  # encoder_out[0]: 32, 60, 48
    decoder_h = encoder_out[1][0]
    decoder_c = torch.zeros(self.n_layers, B, self.hidden_size).cuda()

    # construct decoder input
    decoder_in = torch.zeros(B, 1, self.input_size).cuda()
    decoder_in[:, 0, F - 2] = 1.  # start token
    decoder_in[:, 0, F:] = box_features[target_idc].view(len(target_idc), -1)

    return encoder_out, decoder_in, decoder_h, decoder_c
def forward(self, x, targetObject_img, coords):
    x2_feat, x3_feat, x4_feat = self.encoder(x)
    ## (torch.Size([20, 128, 38, 38]), torch.Size([20, 256, 19, 19]), torch.Size([20, 512, 10, 10]))

    targetObject_img = nn.functional.interpolate(targetObject_img, size=[100, 100])
    con_x2_feat, con_x3_feat, con_x4_feat = self.encoder(targetObject_img)
    ## (torch.Size([20, 128, 13, 13]), torch.Size([20, 256, 7, 7]), torch.Size([20, 512, 4, 4]))

    # Mutual Adaptation Module
    DC_2 = AdaptiveConv2d(x2_feat.size(0) * x2_feat.size(1),
                          x2_feat.size(0) * x2_feat.size(1), 5, padding=1,
                          groups=x2_feat.size(0) * x2_feat.size(1), bias=False)
    DC_3 = AdaptiveConv2d(x3_feat.size(0) * x3_feat.size(1),
                          x3_feat.size(0) * x3_feat.size(1), 5, padding=1,
                          groups=x3_feat.size(0) * x3_feat.size(1), bias=False)
    DC_4 = AdaptiveConv2d(x4_feat.size(0) * x4_feat.size(1),
                          x4_feat.size(0) * x4_feat.size(1), 5, padding=1,
                          groups=x4_feat.size(0) * x4_feat.size(1), bias=False)

    dc_feats_2 = DC_2(x2_feat, con_x2_feat)  ## torch.Size([20, 128, 28, 28])
    # dc_feats_2 = self.relu(dc_feats_2)
    dc_feats_3 = DC_3(x3_feat, con_x3_feat)  ## torch.Size([20, 256, 15, 15])
    # dc_feats_3 = self.relu(dc_feats_3)
    dc_feats_4 = DC_4(x4_feat, con_x4_feat)  ## torch.Size([20, 512, 9, 9])
    # dc_feats_4 = self.relu(dc_feats_4)

    gated_2 = torch.sigmoid(dc_feats_2)
    gated_3 = torch.sigmoid(dc_feats_3)
    gated_4 = torch.sigmoid(dc_feats_4)

    gated_output_2 = gated_2 * dc_feats_2  ## torch.Size([20, 128, 28, 28])
    gated_output_3 = gated_3 * dc_feats_3  ## torch.Size([20, 256, 15, 15])
    gated_output_4 = gated_4 * dc_feats_4  ## torch.Size([20, 512, 9, 9])

    # encoded_feat = gated_output_2 + gated_output_3 + gated_output_4
    # pdb.set_trace()
    gated_output_3 = nn.functional.interpolate(gated_output_3, size=[18, 18])  ## torch.Size([20, 256, 18, 18])
    gated_output_2 = nn.functional.interpolate(gated_output_2, size=[36, 36])

    ####################################################
    ###### decoding + concat path
    ####################################################
    gated_output_4 = self.CT_1(gated_output_4)
    gated_output_4 = self.CT_2(gated_output_4)
    gated_output_4 = self.CT_3(gated_output_4)
    gated_output_4 = self.relu(gated_output_4)
    ## gated_output_4.shape: torch.Size([20, 512, 9, 9])

    gated_output_4_new = torch.zeros(gated_output_4.shape[0],
                                     gated_output_4.shape[1] + 2,
                                     gated_output_4.shape[2],
                                     gated_output_4.shape[3])
    # pdb.set_trace()
    for point_idx in range(gated_output_4.shape[0]):
        feat_map = gated_output_4[point_idx]
        point = coords[point_idx]
        coords_feat = self.addcoords(feat_map, point)
        coords_feat = torch.squeeze(coords_feat, dim=0)
        fused_feats = torch.cat((coords_feat, feat_map), dim=0)
        gated_output_4_new[point_idx] = fused_feats
    gated_output_4_new = gated_output_4_new.cuda()

    bi = torch.arange(coords.shape[0])
    bi = torch.unsqueeze(bi, dim=1)  ## (batchSize, 1)
    rois = torch.cat((coords * 9 // 300, (coords) * 9 // 300), dim=1)  ## (x1, y1, x2, y2)
    bi = bi.type(torch.FloatTensor)
    rois = torch.cat((bi, rois), dim=1).cuda()
    output_size = (1, 1)
    www = torchops.roi_pool(gated_output_4, rois, output_size, spatial_scale=1.0)  ## torch.Size([20, 512, 1, 1])
    www = torch.squeeze(www, dim=2)  ## (20, 512, 1)
    www = torch.squeeze(www, dim=2)  ## (20, 512)
    adaIN_input = self.fc_controler(www)  ## (20, 512)
    # pdb.set_trace()

    dc_feats_4 = self.CT_4(gated_output_4_new)
    dc_feats_4 = self.CT_5(dc_feats_4)
    dc_feats_4 = self.CT_6(dc_feats_4)
    dc_feats_4 = self.relu(dc_feats_4)

    up_d4 = self.Upsamp_2(dc_feats_4)
    ## dc_feats_4: torch.Size([20, 512, 9, 9])
    ## up_d4: torch.Size([20, 512, 18, 18])

    AdaIN_output = self.AdaIN(up_d4, adaIN_input)  ## torch.Size([20, 512, 18, 18])
    AdaIN_output = torch.cat((AdaIN_output, up_d4), dim=1)
    # pdb.set_trace()

    dc_feats_3 = self.CT_7(AdaIN_output)
    dc_feats_3 = self.CT_8(torch.cat((dc_feats_3, gated_output_3), dim=1))
    dc_feats_3 = self.CT_9(dc_feats_3)
    dc_feats_3 = self.relu(dc_feats_3)

    up_d3 = self.Upsamp_3(dc_feats_3)  ## up_d3: torch.Size([20, 768, 36, 36])

    dc_feats_2 = self.CT_10(up_d3)  ## dc_feats_2: torch.Size([20, 512, 36, 36])
    dc_feats_2 = self.CT_11(torch.cat((dc_feats_2, gated_output_2), dim=1))
    dc_feats_2 = self.relu(dc_feats_2)  ## dc_feats_2: torch.Size([20, 640, 36, 36])

    up_d2 = self.Upsamp_4(dc_feats_2)  ## torch.Size([20, 640, 72, 72])

    dc_feats_1 = self.CT_12(up_d2)
    dc_feats_1 = self.CT_13(dc_feats_1)
    dc_feats_1 = self.relu(dc_feats_1)
    dc_feats_1 = self.Upsamp_5(dc_feats_1)
    dc_feats_1 = self.relu(dc_feats_1)  ## torch.Size([20, 64, 216, 216])
    # pdb.set_trace()

    # output = self.Conv_1x1(dc_feats_1)
    output = self.mymodules[0](dc_feats_1)
    output = self.mymodules[1](output)
    # output = nn.functional.interpolate(output, size=[300, 300])
    return output
def forward(self, im_data, gt_boxes, im_info):
    batch_size = im_data.size(0)

    im_info = im_info.data
    if gt_boxes is not None:
        gt_boxes = gt_boxes.data

    # feed image data to base model to obtain base feature map
    base_feat = self.RCNN_base(im_data)

    # feed base feature map to RPN to obtain rois
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info, gt_boxes)

    # if it is training phase, then use ground truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)

    # do roi pooling based on predicted rois
    if cfg.pooling_mode == 'align':
        # pooled_feat = self.RCNN_roi_align(feature_map, rois.view(-1, 5))
        pooled_feat = roi_align(base_feat, rois.view(-1, 5),
                                (cfg.pool_size, cfg.pool_size), 1.0 / 16)
    elif cfg.pooling_mode == 'pool':
        # pooled_feat = self.RCNN_roi_pool(feature_map, rois.view(-1, 5))
        pooled_feat = roi_pool(base_feat, rois.view(-1, 5),
                               (cfg.pool_size, cfg.pool_size), 1.0 / 16)

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(pooled_feat)

    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # select the corresponding columns according to roi labels
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # compute object classification probability
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        # bounding box regression L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
def relocalize_soft(xA_, yA_, xB_, yB_, score_, feature_A_2x, feature_B_2x,
                    N_matches=None, sigma=10, upsample_positions=True):
    if N_matches is None:
        N_matches = xA_.shape[1]
    else:
        idx = torch.argsort(-score_.view(-1))
        N_matches = min(N_matches, idx.shape[0])
        idx = idx[:N_matches]
        score_ = score_[:, idx]
        xA_ = xA_[:, idx]
        yA_ = yA_[:, idx]
        xB_ = xB_[:, idx]
        yB_ = yB_[:, idx]

    if upsample_positions:
        xA_ = xA_ * 2
        yA_ = yA_ * 2
        xB_ = xB_ * 2
        yB_ = yB_ * 2

    coords_A = torch.cat((torch.zeros(1, N_matches).to(xA_.device),
                          xA_ - 1, yA_ - 1, xA_ + 1, yA_ + 1), dim=0).t()
    coords_B = torch.cat((torch.zeros(1, N_matches).to(xB_.device),
                          xB_ - 1, yB_ - 1, xB_ + 1, yB_ + 1), dim=0).t()

    ch = feature_A_2x.shape[1]

    feature_A_local = O.roi_pool(feature_A_2x, coords_A, output_size=(3, 3))
    feature_B_local = O.roi_pool(feature_B_2x, coords_B, output_size=(3, 3))

    deltaY, deltaX = torch.meshgrid(torch.linspace(-1, 1, 3),
                                    torch.linspace(-1, 1, 3))
    deltaX = deltaX.contiguous().to(xA_.device).unsqueeze(0)
    deltaY = deltaY.contiguous().to(xA_.device).unsqueeze(0)

    corrA_B = (feature_A_local[:, :, 1:2, 1:2] * feature_B_local).sum(dim=1).mul(
        sigma).view(N_matches, -1).softmax(dim=1).view(N_matches, 3, 3)
    corrB_A = (feature_B_local[:, :, 1:2, 1:2] * feature_A_local).sum(dim=1).mul(
        sigma).view(N_matches, -1).softmax(dim=1).view(N_matches, 3, 3)

    deltaX_B = (corrA_B * deltaX).view(N_matches, -1).sum(dim=1).unsqueeze(0)
    deltaY_B = (corrA_B * deltaY).view(N_matches, -1).sum(dim=1).unsqueeze(0)
    deltaX_A = (corrB_A * deltaX).view(N_matches, -1).sum(dim=1).unsqueeze(0)
    deltaY_A = (corrB_A * deltaY).view(N_matches, -1).sum(dim=1).unsqueeze(0)

    xA_ = xA_ + deltaX_A
    yA_ = yA_ + deltaY_A
    xB_ = xB_ + deltaX_B
    yB_ = yB_ + deltaY_B

    return xA_, yA_, xB_, yB_, score_
def _roi_pool(pred_heatmap, rois, patch_size=8):
    from torchvision.ops import roi_pool
    patches = roi_pool(pred_heatmap, rois.float(), (patch_size, patch_size),
                       spatial_scale=1.0)
    return patches
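# A minimal sketch of calling the helper above on a dummy heatmap. The rois tensor uses
# torchvision's Tensor[K, 5] convention (batch index followed by x1, y1, x2, y2); the
# shapes and values here are illustrative assumptions, not values from the original code.
import torch

pred_heatmap = torch.rand(1, 1, 64, 64)                  # (batch, channels, H, W)
rois = torch.tensor([[0, 10, 10, 42, 42],                # box on image 0
                     [0, 0, 0, 20, 20]], dtype=torch.float32)
patches = _roi_pool(pred_heatmap, rois, patch_size=8)    # -> (2, 1, 8, 8)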
def forward(self, input, rois):
    return roi_pool(input, rois, self.output_size, self.spatial_scale)
def forward(self, images: torch.Tensor, boxes_coordinate: torch.Tensor,
            transcripts: torch.Tensor, src_key_padding_mask: torch.Tensor):
    '''
    :param images: whole images, shape is (B, N, H, W, C), where B is the batch size, N is the
            number of segments of the documents, H is the image height, W is the image width,
            and C is the number of image channels (default is 3).
    :param boxes_coordinate: box coordinates, shape is (B, N, 8), where 8 is the coordinates
            (x1, y1, x2, y2, x3, y3, x4, y4).
    :param transcripts: text segments, shape is (B, N, T, D), where T is the max length of the
            transcripts and D is the model dimension.
    :param src_key_padding_mask: text padding mask, shape is (B*N, T), True for padding values.
            If provided, the specified padding elements in the key will be ignored by the
            attention. This is a binary mask: where the value is True, the corresponding value
            on the attention layer of the Transformer will be filled with -inf.
    need_weights: output attn_output_weights.
    :return: set of nodes X, shape is (B*N, T, D)
    '''
    B, N, T, D = transcripts.shape

    # get image embedding using cnn
    # (B, 3, H, W)
    _, _, origin_H, origin_W = images.shape

    # image embedding: (B, C, H/16, W/16)
    images = self.cnn(images)
    _, C, H, W = images.shape

    # generate rois for roi pooling, rois shape is (B, N, 5), 5 means (batch_index, x0, y0, x1, y1)
    rois_batch = torch.zeros(B, N, 5, device=images.device)
    # Loop over every image.
    for i in range(B):  # (B, N, 8)
        # (N, 8)
        doc_boxes = boxes_coordinate[i]
        # (N, 4)
        pos = torch.stack([
            doc_boxes[:, 0], doc_boxes[:, 1], doc_boxes[:, 4], doc_boxes[:, 5]
        ], dim=1)
        rois_batch[i, :, 1:5] = pos
        rois_batch[i, :, 0] = i

    spatial_scale = float(H / origin_H)
    # use roi pooling to get image segments
    # (B*N, C, roi_pooling_size, roi_pooling_size)
    if self.roi_pooling_mode == 'roi_align':
        image_segments = roi_align(images, rois_batch.view(-1, 5),
                                   self.roi_pooling_size, spatial_scale)
    else:
        image_segments = roi_pool(images, rois_batch.view(-1, 5),
                                  self.roi_pooling_size, spatial_scale)

    # (B*N, D, 1, 1)
    image_segments = F.relu(self.bn(self.conv(image_segments)))
    # (B*N, D,)
    image_segments = image_segments.squeeze()
    # (B*N, 1, D)
    image_segments = image_segments.unsqueeze(dim=1)

    # add positional embedding
    transcripts_segments = self.pe_droput(
        transcripts + self.position_embedding[:, :, :transcripts.size(2), :])
    # (B*N, T, D)
    transcripts_segments = transcripts_segments.reshape(B * N, T, D)

    # (B*N, T, D)
    image_segments = image_segments.expand_as(transcripts_segments)

    # here we first add image embedding and text embedding together, then use the sum as the
    # input of the transformer to get non-local fusion features, which differs from the
    # process described in the paper.
    out = image_segments + transcripts_segments

    # (T, B*N, D)
    out = out.transpose(0, 1).contiguous()

    # (T, B*N, D)
    out = self.transformer_encoder(out, src_key_padding_mask=src_key_padding_mask)

    # (B*N, T, D)
    out = out.transpose(0, 1).contiguous()
    out = self.norm(out)
    out = F.dropout(out, p=self.dropout, training=self.training)

    return out
def script_func(input, rois):
    return ops.roi_pool(input, rois, 5, 1.0)[0]
def script_fn(input, rois, pool_size):
    # type: (Tensor, Tensor, int) -> Tensor
    return ops.roi_pool(input, rois, pool_size, 1.0)[0]
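# A minimal sketch of compiling a small roi_pool wrapper like script_fn above with TorchScript.
# torch.jit.script and torchvision.ops.roi_pool are known APIs; the function name
# pool_first_roi and the tensor shapes below are illustrative assumptions only.
import torch
from torchvision import ops


def pool_first_roi(input: torch.Tensor, rois: torch.Tensor, pool_size: int) -> torch.Tensor:
    # roi_pool returns (K, C, pool_size, pool_size); index [0] keeps the first ROI's features
    return ops.roi_pool(input, rois, pool_size, 1.0)[0]


scripted = torch.jit.script(pool_first_roi)
feat = torch.rand(1, 4, 16, 16)
rois = torch.tensor([[0.0, 0.0, 0.0, 8.0, 8.0]])  # (batch_idx, x1, y1, x2, y2)
out = scripted(feat, rois, 5)                     # -> (4, 5, 5), pooled features of the first ROI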
def forward(self, features, rois):
    outputs = roi_pool(features, rois, (self.pooled_height, self.pooled_width),
                       self.spatial_scale)
    return outputs