def forward(self, x, rois, roi_indices):
    """Forward the chain.

    We assume that there are :math:`N` batches.

    Args:
        x (Variable): 4D image variable.
        rois (Tensor): A bounding box array containing coordinates of
            proposal boxes. This is a concatenation of bounding box
            arrays from multiple images in the batch. Its shape is
            :math:`(R', 4)`. Given :math:`R_i` proposed RoIs from the
            :math:`i` th image, :math:`R' = \\sum _{i=1} ^ N R_i`.
        roi_indices (Tensor): An array containing indices of images to
            which bounding boxes correspond. Its shape is :math:`(R',)`.

    """
    # In case roi_indices is an ndarray, convert both inputs to tensors.
    roi_indices = at.to_tensor(roi_indices).float()
    rois = at.to_tensor(rois).float()
    indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)
    # NOTE: important: the RoI layer expects (index, x_min, y_min, x_max,
    # y_max), so reorder the box coordinates from (y, x) to (x, y).
    xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
    indices_and_rois = xy_indices_and_rois.contiguous()

    pool = self.roi(x, indices_and_rois)
    pool = pool.view(pool.size(0), -1)
    fc7 = self.classifier(pool)
    roi_cls_locs = self.cls_loc(fc7)
    roi_scores = self.score(fc7)
    return roi_cls_locs, roi_scores
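
# A minimal, self-contained sketch (not part of the model) of how the RoI
# array is assembled and reordered in forward() above; the rois and
# roi_indices values here are toy inputs for illustration only.
import torch

rois = torch.tensor([[10., 20., 50., 60.],   # (y_min, x_min, y_max, x_max)
                     [15., 25., 55., 65.]])
roi_indices = torch.tensor([0., 0.])         # both boxes belong to image 0

indices_and_rois = torch.cat([roi_indices[:, None], rois], dim=1)
# Columns are now (index, y_min, x_min, y_max, x_max); permute to
# (index, x_min, y_min, x_max, y_max) as the RoI layer expects.
xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
assert torch.equal(xy_indices_and_rois[0],
                   torch.tensor([0., 20., 10., 60., 50.]))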
def train_forward_net(self, img_names, imgs, bboxes, labels, indexs,
                      gt_relations, scale):
    n = bboxes.shape[0]
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    # Since the batch size is one, convert variables to singular form.
    bboxes = bboxes[0]
    gt_relations = gt_relations[0]

    _, _, H, W = imgs.shape
    # Tensor of shape (C, H, W)
    feature = self.net.extractor(imgs)
    combined_features, rel_flip = get_combined_feature(
        feature, bboxes, use_spatial_feature=self.conf.use_spatial_feature)

    relation_scores = []
    for combined_feature in combined_features:
        relation_score = self.net(*combined_feature)
        relation_scores.append(relation_score)
    relation_scores = torch.stack(relation_scores, dim=1).squeeze(0)

    # If a pair was flipped while building its combined feature, swap the
    # direction-dependent relation labels (1 <-> 2) to match.
    for i, flip in enumerate(rel_flip):
        if flip:
            if gt_relations[i] == 1:
                gt_relations[i] = 2
            elif gt_relations[i] == 2:
                gt_relations[i] = 1

    gt_relations = at.to_tensor(gt_relations).long().cuda()
    #self.rel_cm.add(relation_scores, gt_relations.data)
    # CrossEntropyLoss expects raw scores and long class indices
    # (not one-hot targets).
    relation_loss = nn.CrossEntropyLoss()(relation_scores, gt_relations)
    return LossTuple(total_loss=relation_loss)
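
# A minimal sketch of the loss call above: nn.CrossEntropyLoss takes raw
# (unnormalized) scores of shape (N, C) and integer class indices of shape
# (N,), not one-hot targets. Values here are toy data for illustration.
import torch
import torch.nn as nn

scores = torch.randn(4, 3)            # 4 object pairs, 3 relation classes
targets = torch.tensor([0, 2, 1, 1])  # class indices, not one-hot vectors
loss = nn.CrossEntropyLoss()(scores, targets)
print(loss.item())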
def _smooth_l1_loss(x, t, in_weight, sigma):
    sigma2 = sigma**2
    diff = in_weight * (x - t)
    abs_diff = diff.abs()
    # Quadratic branch where |diff| < 1 / sigma^2, linear branch elsewhere.
    flag = (abs_diff.data < (1. / sigma2)).float()
    flag = at.to_tensor(flag)
    y = (flag * (sigma2 / 2.) * (diff**2) +
         (1 - flag) * (abs_diff - 0.5 / sigma2))
    return y.sum()
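
# A minimal numeric check of the smooth L1 shape used above (sigma = 1):
# |d| < 1 falls on the quadratic branch d^2 / 2, |d| >= 1 on the linear
# branch |d| - 0.5. smooth_l1_ref is an illustrative stand-in written here
# to stay self-contained; it is not part of the repo.
import torch

def smooth_l1_ref(d, sigma=1.0):
    sigma2 = sigma ** 2
    abs_d = d.abs()
    flag = (abs_d < 1. / sigma2).float()
    return (flag * (sigma2 / 2.) * d**2 +
            (1 - flag) * (abs_d - 0.5 / sigma2)).sum()

assert torch.isclose(smooth_l1_ref(torch.tensor([0.5])),
                     torch.tensor(0.125))  # 0.5^2 / 2
assert torch.isclose(smooth_l1_ref(torch.tensor([2.0])),
                     torch.tensor(1.5))    # 2.0 - 0.5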
def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    in_weight = torch.zeros(gt_loc.shape).cuda()
    # The localization loss is computed only for positive RoIs.
    # NOTE: unlike the original implementation, we don't need separate
    # inside_weight and outside_weight; both can be derived from gt_label.
    in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight).cuda()] = 1
    loc_loss = _smooth_l1_loss(pred_loc, gt_loc, at.to_tensor(in_weight),
                               sigma)
    # Normalize by the total number of negative and positive RoIs;
    # gt_label == -1 is ignored (e.g. for the RPN loss).
    loc_loss /= (gt_label >= 0).sum().to(torch.float)
    return loc_loss
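
# A minimal sketch of the masking above (on CPU, without the repo helpers):
# only rows whose label is positive contribute to the localization loss.
import torch

gt_label = torch.tensor([1, 0, -1, 2])
gt_loc = torch.randn(4, 4)
in_weight = torch.zeros(gt_loc.shape)
in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight)] = 1
# Rows 0 and 3 (positive labels) are all ones; rows 1 and 2 stay zero.
assert in_weight.sum().item() == 8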
def forward(self, x, rois, roi_indices, if_color=False):
    """Forward the chain.

    We assume that there are :math:`N` batches.

    Args:
        x (Variable): 4D image variable.
        rois (Tensor): A bounding box array containing coordinates of
            proposal boxes. This is a concatenation of bounding box
            arrays from multiple images in the batch. Its shape is
            :math:`(R', 4)`. Given :math:`R_i` proposed RoIs from the
            :math:`i` th image, :math:`R' = \\sum _{i=1} ^ N R_i`.
        roi_indices (Tensor): An array containing indices of images to
            which bounding boxes correspond. Its shape is :math:`(R',)`.

    """
    # In case roi_indices is an ndarray, convert both inputs to tensors.
    roi_indices = at.to_tensor(roi_indices).float()
    rois = at.to_tensor(rois).float()
    indices_and_rois = torch.cat([roi_indices[:, None], rois], dim=1)
    # NOTE: important: reorder box coordinates from (y, x) to (x, y).
    xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
    indices_and_rois = xy_indices_and_rois.contiguous()

    pool = self.roi(x, indices_and_rois)  # e.g. (128, 1024, 7, 7)
    # Global average pooling over the spatial axes; used when the
    # classifier is a conv block (e.g. ResNet layer4) instead of fc
    # layers, in which case the flattening view() above is not needed.
    fc7 = self.classifier(pool).mean(3).mean(2)
    roi_cls_locs = self.cls_loc(fc7)
    roi_scores = self.score(fc7)
    return roi_cls_locs, roi_scores
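
# A minimal sketch of the global average pooling above: taking the mean
# over the width axis and then the height axis turns (N, C, H, W) features
# into (N, C) vectors. Shapes here are toy values for illustration.
import torch

feat = torch.randn(8, 1024, 7, 7)
vec = feat.mean(3).mean(2)
assert vec.shape == (8, 1024)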
def predict(self, img, scale, visualize=False):
    """Detect objects from images.

    This method predicts objects for each image.

    Args:
        img: Image object.

    Returns:
        tuple of lists:
        This method returns a tuple of five lists,
        :obj:`(bboxes, names, name_scores, colors, color_scores)`.

        * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
          where :math:`R` is the number of bounding boxes in an image. \
          Each bounding box is organized by \
          :math:`(y_{min}, x_{min}, y_{max}, x_{max})` in the second axis.
        * **names** / **colors**: Lists of integer arrays of shape \
          :math:`(R,)`. Each value indicates the class of the bounding \
          box. Values are in range :math:`[0, L - 1]`, where :math:`L` is \
          the number of foreground classes of the respective head.
        * **name_scores** / **color_scores**: Lists of float arrays of \
          shape :math:`(R,)`. Each value indicates how confident the \
          prediction is.

    """
    self.eval()
    if visualize:
        self.use_preset('visualize')
    else:
        self.use_preset('evaluate')
    with t.no_grad():
        # Recover the original image size from the preprocessing scale.
        scale = float(scale.numpy()[0])
        size = np.round([img.size(2) / scale, img.size(3) / scale])
        img = img.float().cuda()
        roi_name_locs, roi_name_scores, roi_color_locs, roi_color_scores, \
            rois, _ = self(img, scale=scale)
        # We are assuming that batch size is 1.
        roi_name_scores = roi_name_scores.data
        roi_name_locs = roi_name_locs.data
        # color
        roi_color_scores = roi_color_scores.data
        roi_color_locs = roi_color_locs.data
        roi_org = at.to_tensor(rois) / scale

        # Convert predictions to bounding boxes in image coordinates.
        # Bounding boxes are scaled to the scale of the input images.
        mean = t.Tensor(self.loc_normalize_mean).cuda(). \
            repeat(self.head_name.n_class)[None]
        std = t.Tensor(self.loc_normalize_std).cuda(). \
            repeat(self.head_name.n_class)[None]
        roi_name_locs = (roi_name_locs * std + mean)
        roi_name_locs = roi_name_locs.view(-1, self.head_name.n_class, 4)
        roi = roi_org.view(-1, 1, 4).expand_as(roi_name_locs)
        name_bbox = loc2bbox(at.to_np(roi).reshape((-1, 4)),
                             at.to_np(roi_name_locs).reshape((-1, 4)))
        name_bbox = at.to_tensor(name_bbox)
        name_bbox = name_bbox.view(-1, self.head_name.n_class * 4)
        # Clip bounding boxes to the image boundary.
        name_bbox[:, 0::2] = (name_bbox[:, 0::2]).clamp(min=0, max=size[0])
        name_bbox[:, 1::2] = (name_bbox[:, 1::2]).clamp(min=0, max=size[1])
        prob = at.to_np(F.softmax(at.to_tensor(roi_name_scores), dim=1))
        raw_name_bboxes_ = at.to_np(name_bbox)
        raw_name_probs_ = at.to_np(prob)

        # Repeat the same decoding for the color head.
        mean = t.Tensor(self.loc_normalize_mean).cuda(). \
            repeat(self.head_color.n_class)[None]
        std = t.Tensor(self.loc_normalize_std).cuda(). \
            repeat(self.head_color.n_class)[None]
        roi_color_locs = (roi_color_locs * std + mean)
        roi_color_locs = roi_color_locs.view(-1, self.head_color.n_class, 4)
        roi = roi_org.view(-1, 1, 4).expand_as(roi_color_locs)
        color_bbox = loc2bbox(at.to_np(roi).reshape((-1, 4)),
                              at.to_np(roi_color_locs).reshape((-1, 4)))
        color_bbox = at.to_tensor(color_bbox)
        color_bbox = color_bbox.view(-1, self.head_color.n_class * 4)
        # Clip bounding boxes to the image boundary.
        color_bbox[:, 0::2] = (color_bbox[:, 0::2]).clamp(min=0, max=size[0])
        color_bbox[:, 1::2] = (color_bbox[:, 1::2]).clamp(min=0, max=size[1])
        prob = at.to_np(F.softmax(at.to_tensor(roi_color_scores), dim=1))
        raw_color_bboxes_ = at.to_np(color_bbox)
        raw_color_probs_ = at.to_np(prob)

        bboxes_, names_, name_scores_, colors_, color_scores_ = \
            self._suppress(raw_name_bboxes_, raw_name_probs_,
                           raw_color_bboxes_, raw_color_probs_,
                           prob_thresh=0.7)

        # Sanity checks: the name and color heads must agree on the number
        # of raw and suppressed detections.
        if raw_name_bboxes_.shape[0] != raw_color_bboxes_.shape[0]:
            raise AssertionError(
                'raw box counts differ: raw_name_bboxes_ {0}, '
                'raw_color_bboxes_ {1}'.format(raw_name_bboxes_.shape,
                                               raw_color_bboxes_.shape))
        if names_.shape[0] != colors_.shape[0]:
            raise AssertionError(
                'suppressed counts differ: names_ {0}, colors_ {1}, '
                'bboxes_ {2}'.format(names_.shape, colors_.shape,
                                     bboxes_.shape))

    self.use_preset('evaluate')
    self.train()
    return bboxes_, names_, name_scores_, colors_, color_scores_
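
# A minimal sketch of the de-normalization step above: the per-coordinate
# mean/std (4 values) are tiled across all classes and broadcast over the
# RoIs. The mean/std values here are the conventional Faster R-CNN defaults,
# used only for illustration.
import torch

n_class = 3
loc_normalize_mean = torch.tensor([0., 0., 0., 0.])
loc_normalize_std = torch.tensor([0.1, 0.1, 0.2, 0.2])
mean = loc_normalize_mean.repeat(n_class)[None]   # shape (1, 12)
std = loc_normalize_std.repeat(n_class)[None]
roi_locs = torch.randn(5, n_class * 4)            # 5 RoIs, 3 classes
decoded = roi_locs * std + mean                   # broadcast over the RoIs
assert decoded.shape == (5, 12)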
def train_forward_net(self, img_names, imgs, indexs, bboxes, names, shapes,
                      colors, _, scale):
    n = bboxes.shape[0]
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, H, W = imgs.shape
    img_size = (H, W)

    features = self.net.extractor(imgs)  # vgg16: (1, 512, 37, 50)
    rpn_locs, rpn_scores, rois, _, anchor = self.net.rpn(
        features, img_size, scale)

    # Since the batch size is one, convert variables to singular form.
    bbox = bboxes[0]
    name = names[0]
    color = colors[0]
    rpn_score = rpn_scores[0]
    rpn_loc = rpn_locs[0]
    roi = rois

    # Sample RoIs (for training the classification heads) and forward them.
    sample_roi, gt_roi_loc, gt_roi_name, gt_roi_color = self.proposal_target(
        roi, at.to_np(bbox), at.to_np(name), at.to_np(color))
    # NOTE: all indices are zero because only batch size 1 is supported.
    sample_roi_index = torch.zeros(len(sample_roi))
    roi_name_loc, roi_name_score = self.net.head_name(
        features, sample_roi, sample_roi_index)
    _, roi_color_score = self.net.head_color(features, sample_roi,
                                             sample_roi_index)

    # ------------------ RPN losses ------------------ #
    # Targets for the RPN come from the anchor target generator.
    gt_rpn_loc, gt_rpn_label = self.anchor_target(at.to_np(bbox), anchor,
                                                  img_size)
    gt_rpn_label = at.to_tensor(gt_rpn_label).long()
    gt_rpn_loc = at.to_tensor(gt_rpn_loc)
    rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc,
                                       gt_rpn_label.data,
                                       self.conf.rpn_sigma)
    # NOTE: the default value of ignore_index is -100, so pass -1 explicitly.
    rpn_cls_loss = torch.nn.functional.cross_entropy(rpn_score,
                                                     gt_rpn_label.cuda(),
                                                     ignore_index=-1)
    _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
    _rpn_score = at.to_np(rpn_score)[at.to_np(gt_rpn_label) > -1]
    self.rpn_cm.add(at.to_tensor(_rpn_score, False),
                    _gt_rpn_label.data.long())

    # ------------- RoI losses (Fast R-CNN loss) ------------- #
    ## name ##
    n_sample = roi_name_loc.shape[0]
    roi_name_loc = roi_name_loc.view(n_sample, -1, 4)
    # Pick the predicted loc of the ground-truth class for each sample
    # (gt_roi_name holds class indices, not one-hot vectors).
    name_roi_loc = roi_name_loc[torch.arange(0, n_sample).long().cuda(),
                                at.to_tensor(gt_roi_name).long()]
    gt_roi_name = at.to_tensor(gt_roi_name).long()
    gt_roi_loc = at.to_tensor(gt_roi_loc)
    roi_loc_loss = _fast_rcnn_loc_loss(name_roi_loc.contiguous(),
                                       gt_roi_loc, gt_roi_name.data,
                                       self.conf.roi_sigma)
    roi_name_loss = nn.CrossEntropyLoss()(roi_name_score,
                                          gt_roi_name.cuda())
    self.roi_name_cm.add(at.to_tensor(roi_name_score, False),
                         gt_roi_name.data.long())

    ## color ##
    gt_roi_color = at.to_tensor(gt_roi_color).long()
    roi_color_loss = nn.CrossEntropyLoss()(roi_color_score,
                                           gt_roi_color.cuda())
    self.roi_color_cm.add(at.to_tensor(roi_color_score, False),
                          gt_roi_color.data.long())

    ## sum up all losses ##
    losses = [
        rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_name_loss,
        roi_color_loss
    ]
    losses = losses + [sum(losses)]
    return LossTuple(*losses)
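
# A minimal sketch of the per-class loc selection above: advanced indexing
# with paired (row, class) indices picks one (4,)-vector per sample from a
# (n_sample, n_class, 4) tensor. Shapes and labels are toy values.
import torch

n_sample, n_class = 4, 3
roi_loc = torch.arange(n_sample * n_class * 4,
                       dtype=torch.float).view(n_sample, n_class, 4)
gt_class = torch.tensor([2, 0, 1, 2])
picked = roi_loc[torch.arange(n_sample), gt_class]
assert picked.shape == (4, 4)
assert torch.equal(picked[0], roi_loc[0, 2])  # sample 0, class 2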
def get_combined_feature(feature, bboxes, use_spatial_feature=False,
                         roi_size=7, spatial_scale=1 / 16, flip=True):
    """
    :param feature: feature map produced by the extractor, shape [1, 512, 37, 50]
    :param bboxes: shape [N, 4], where N is the number of objects
    :return:
        combined_features: list of combined features (in the same order as
            the ground-truth relations); each element is a list of features
            of the form [obj_a, obj_b, rel] or
            [obj_a, obj_b, rel, spatial_feature]
        rel_flip: list of flags marking the relations that were flipped
    """
    roi_pooling = RoIPooling2D(roi_size, roi_size,
                               spatial_scale)  # spatial scale is not important
    num_of_bbox = bboxes.shape[0]

    # Rescale the bboxes from image coordinates to feature-map coordinates.
    # TODO: derive the scales from the actual input size instead of (600, 1000).
    scale_x_imgtofeature = float(feature.size(3) / 1000)
    scale_y_imgtofeature = float(feature.size(2) / 600)
    # bboxes_f holds the bboxes resized to the feature-map scale (37, 50).
    bboxes_f = np.zeros(bboxes.shape, dtype=int)
    for (bbox, bbox_f) in zip(bboxes, bboxes_f):
        bbox_f[0] = int(bbox[0] * scale_y_imgtofeature)
        bbox_f[2] = int(bbox[2] * scale_y_imgtofeature)
        bbox_f[1] = int(bbox[1] * scale_x_imgtofeature)
        bbox_f[3] = int(bbox[3] * scale_x_imgtofeature)
        assert bbox_f[0] in range(feature.size(2) + 1) \
            and bbox_f[2] in range(feature.size(2) + 1) \
            and bbox_f[1] in range(feature.size(3) + 1) \
            and bbox_f[3] in range(feature.size(3) + 1), \
            "bbox:{0} {1}".format(bbox, feature.shape)

    # Build combined features for every pair of objects.
    rel_flip = []
    combined_features = []
    for i_obj_a in range(num_of_bbox):
        for i_obj_b in range(i_obj_a + 1, num_of_bbox):
            bbox_f_a = bboxes_f[i_obj_a]  # in (ymin, xmin, ymax, xmax)
            bbox_f_b = bboxes_f[i_obj_b]
            rel_flip.append(False)
            if rd.random() > 0.5 and flip:
                # Randomly swap obj_a and obj_b as augmentation.
                bbox_f_a, bbox_f_b = bbox_f_b, bbox_f_a
                rel_flip[-1] = True

            # Union box coordinates of the pair.
            union_cord = np.zeros(4)
            union_cord[0] = min(bbox_f_a[0], bbox_f_b[0])
            union_cord[1] = min(bbox_f_a[1], bbox_f_b[1])
            union_cord[2] = max(bbox_f_a[2], bbox_f_b[2])
            union_cord[3] = max(bbox_f_a[3], bbox_f_b[3])

            # Spatial feature: a dual-channel binary mask of the two boxes.
            if use_spatial_feature:
                dual_channel_feature = torch.zeros(
                    2, 37, 50)  # TODO: derive from the feature shape
                for ii, bbox in enumerate([bbox_f_a, bbox_f_b]):  # obj_a first
                    dual_channel_feature[ii, bbox[0]:bbox[2],
                                         bbox[1]:bbox[3]] = 1

            # Combine the features: pool obj_a, obj_b, and their union.
            rois = np.stack([bbox_f_a, bbox_f_b, union_cord], axis=0)
            roi_indices = np.zeros(3)
            roi_indices = at.to_tensor(roi_indices).float()
            rois = at.to_tensor(rois).float()
            indices_and_rois = torch.cat([roi_indices[:, None], rois], dim=1)
            xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
            indices_and_rois = xy_indices_and_rois.contiguous()
            combined_feature_arr = roi_pooling(
                feature, indices_and_rois)  # in obj_a, obj_b, union order
            combined_feature = []
            for idx in range(combined_feature_arr.size(0)):
                # Add a batch dimension to each pooled feature.
                combined_feature.append(
                    torch.unsqueeze(combined_feature_arr[idx], 0))
                assert combined_feature[-1].shape == (1, 512, 7, 7)
            combined_features.append(combined_feature)
    return combined_features, rel_flip
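
# A minimal sketch of the union-box step above: the union of two
# (ymin, xmin, ymax, xmax) boxes takes element-wise minimums of the
# top-left corner and maximums of the bottom-right corner. Box values
# here are toy data for illustration.
import numpy as np

box_a = np.array([2, 3, 10, 12])
box_b = np.array([5, 1, 14, 9])
union = np.concatenate([np.minimum(box_a[:2], box_b[:2]),
                        np.maximum(box_a[2:], box_b[2:])])
assert (union == np.array([2, 1, 14, 12])).all()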