def threshold_bbox(self, proposal_bbox_inst, thres=0.7, proposal_type="roih"):
    if proposal_type == "rpn":
        valid_map = proposal_bbox_inst.objectness_logits > thres

        # create instances containing boxes and gt_classes
        image_shape = proposal_bbox_inst.image_size
        new_proposal_inst = Instances(image_shape)

        # create box
        new_bbox_loc = proposal_bbox_inst.proposal_boxes.tensor[valid_map, :]
        new_boxes = Boxes(new_bbox_loc)

        # add boxes to instances
        new_proposal_inst.gt_boxes = new_boxes
        new_proposal_inst.objectness_logits = proposal_bbox_inst.objectness_logits[valid_map]
    elif proposal_type == "roih":
        valid_map = proposal_bbox_inst.scores > thres

        # create instances containing boxes and gt_classes
        image_shape = proposal_bbox_inst.image_size
        new_proposal_inst = Instances(image_shape)

        # create box
        new_bbox_loc = proposal_bbox_inst.pred_boxes.tensor[valid_map, :]
        new_boxes = Boxes(new_bbox_loc)

        # add boxes to instances
        new_proposal_inst.gt_boxes = new_boxes
        new_proposal_inst.gt_classes = proposal_bbox_inst.pred_classes[valid_map]
        new_proposal_inst.scores = proposal_bbox_inst.scores[valid_map]
    else:
        # guard against returning an unbound variable for unknown types
        raise ValueError("Unknown proposal_type: {}".format(proposal_type))

    return new_proposal_inst
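# A minimal self-contained sketch of the "roih" branch above, showing how
# score thresholding turns predictions into pseudo ground truth. The boxes,
# scores, and the 0.7 threshold below are illustrative values only.
import torch
from detectron2.structures import Boxes, Instances

inst = Instances((480, 640))
inst.pred_boxes = Boxes(torch.tensor([[10., 10., 50., 50.], [0., 0., 20., 20.]]))
inst.scores = torch.tensor([0.9, 0.3])
inst.pred_classes = torch.tensor([1, 2])

keep = inst.scores > 0.7                 # same filter threshold_bbox applies
pseudo = Instances(inst.image_size)
pseudo.gt_boxes = inst.pred_boxes[keep]  # only the 0.9-score box survives
pseudo.gt_classes = inst.pred_classes[keep]
pseudo.scores = inst.scores[keep]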
def doit(raw_image, raw_boxes, predictor):
    # Process Boxes
    raw_boxes = Boxes(torch.from_numpy(raw_boxes).cuda())

    with torch.no_grad():
        raw_height, raw_width = raw_image.shape[:2]
        # print("Original image size: ", (raw_height, raw_width))

        # Preprocessing
        image = predictor.transform_gen.get_transform(raw_image).apply_image(raw_image)
        # print("Transformed image size: ", image.shape[:2])

        # Scale the box
        new_height, new_width = image.shape[:2]
        scale_x = 1. * new_width / raw_width
        scale_y = 1. * new_height / raw_height
        # print(scale_x, scale_y)
        boxes = raw_boxes.clone()
        boxes.scale(scale_x=scale_x, scale_y=scale_y)

        # ----
        image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
        inputs = [{"image": image, "height": raw_height, "width": raw_width}]
        images = predictor.model.preprocess_image(inputs)

        # Run Backbone Res1-Res4
        features = predictor.model.backbone(images.tensor)

        # Run RoI head for each proposal (RoI Pooling + Res5)
        proposal_boxes = [boxes]
        features = [features[f] for f in predictor.model.roi_heads.in_features]
        box_features = predictor.model.roi_heads._shared_roi_transform(
            features, proposal_boxes
        )
        feature_pooled = box_features.mean(dim=[2, 3])  # pooled to 1x1
        # print('Pooled features size:', feature_pooled.shape)

        # Predict classes and boxes for each proposal.
        pred_class_logits, pred_attr_logits, pred_proposal_deltas = predictor.model.roi_heads.box_predictor(
            feature_pooled)
        pred_class_prob = nn.functional.softmax(pred_class_logits, -1)
        pred_scores, pred_classes = pred_class_prob[..., :-1].max(-1)
        attr_prob = pred_attr_logits[..., :-1].softmax(-1)
        max_attr_prob, max_attr_label = attr_prob.max(-1)

        # Detectron2 Formatting (for visualization only)
        roi_features = feature_pooled
        instances = Instances(
            image_size=(raw_height, raw_width),
            pred_boxes=raw_boxes,
            scores=pred_scores,
            pred_classes=pred_classes,
            attr_scores=max_attr_prob,
            attr_classes=max_attr_label
        )

        return instances, roi_features
def generate_proposals(images, model, score_threshold=0):
    inputs = [{
        "image": torch.as_tensor(image.astype("float32").transpose(2, 0, 1)),
        "height": image.shape[0],
        "width": image.shape[1]
    } for image in images]
    with torch.no_grad():
        images = model.preprocess_image(inputs)
        features = model.backbone(images.tensor)
        proposals, _ = model.proposal_generator(images, features, None)

        features_ = [features[f] for f in model.roi_heads.box_in_features]
        box_features = model.roi_heads.box_pooler(
            features_, [x.proposal_boxes for x in proposals])
        box_features = model.roi_heads.box_head(box_features)
        proposals_scores, proposals_deltas = model.roi_heads.box_predictor(
            box_features)
        boxes_tensors = model.roi_heads.box_predictor.predict_boxes(
            (proposals_scores, proposals_deltas), proposals)
        scores = model.roi_heads.box_predictor.predict_probs(
            (proposals_scores, proposals_deltas), proposals)

        result = []
        for i in range(len(inputs)):
            image_size = proposals[i].image_size
            num_bbox_reg_classes = boxes_tensors[i].shape[1] // 4
            boxes = Boxes(boxes_tensors[i].reshape(-1, 4))
            boxes.clip(image_size)
            boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)

            img_scores = scores[i][:, :-1]  # drop the background column
            max_scores, pred_classes = torch.max(img_scores, dim=1)
            keep_mask = max_scores > score_threshold
            filtered_scores = img_scores[keep_mask, :]
            filtered_max_scores = max_scores[keep_mask]
            filtered_pred_classes = pred_classes[keep_mask]
            boxes = boxes[keep_mask, filtered_pred_classes, :]

            result_instance = Instances(image_size)
            result_instance.pred_boxes = Boxes(boxes)
            result_instance.scores = filtered_max_scores
            result_instance.pred_classes = filtered_pred_classes
            result_instance.class_distributions = filtered_scores
            result.append(result_instance)
    return result
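# Hedged usage sketch for generate_proposals; the model-zoo config and the
# dummy image below are assumptions, not part of the source. model_zoo.get
# builds a detectron2 GeneralizedRCNN with pretrained weights.
import numpy as np
from detectron2 import model_zoo

model = model_zoo.get("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml", trained=True).eval()
imgs = [np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)]
instances = generate_proposals(imgs, model, score_threshold=0.5)
print(instances[0].pred_boxes, instances[0].class_distributions.shape)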
def get_instance(self, frame_id):
    if frame_id not in self.frame_ids:
        return None
    else:
        i = self.frame_ids.index(frame_id)
        if self.proposal_instances[i] is not None:
            return self.proposal_instances[i]
        else:
            # This has been a skipped frame ... interpolate the box from the
            # neighboring instances. Check the index bound before accessing
            # the list so the scan cannot run past the end.
            index_before = index_after = i
            while index_before > 0 and self.proposal_instances[index_before] is None:
                index_before -= 1
            while (index_after < len(self.proposal_instances) - 1
                   and self.proposal_instances[index_after] is None):
                index_after += 1

            instance_before = self.proposal_instances[index_before]
            instance_after = self.proposal_instances[index_after]
            interpolation_factor = (i - index_before) / (index_after - index_before)

            interpolated_instance = Instances(instance_before.image_size)
            interpolated_instance.pred_boxes = Boxes(
                instance_before.pred_boxes.tensor + interpolation_factor *
                (instance_after.pred_boxes.tensor - instance_before.pred_boxes.tensor))
            interpolated_instance.scores = torch.tensor([0])
            interpolated_instance.pred_classes = instance_before.pred_classes
            interpolated_instance.class_distributions = instance_before.class_distributions
            interpolated_instance.generation_process = ["I"]
            return interpolated_instance
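# Worked example of the interpolation above (values invented): a box of
# [0, 0, 10, 10] at index 2, a box of [8, 4, 18, 14] at index 6, and a
# skipped frame at index 3 give a factor of (3 - 2) / (6 - 2) = 0.25.
import torch

before = torch.tensor([0., 0., 10., 10.])
after = torch.tensor([8., 4., 18., 14.])
factor = (3 - 2) / (6 - 2)
print(before + factor * (after - before))  # tensor([ 2.,  1., 12., 11.])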
def get_box_union(boxes: Boxes):
    """Merge all boxes into a single box."""
    if len(boxes) == 0:
        return boxes
    bt = boxes.tensor
    union_bt = torch.cat(
        (torch.min(bt[:, :2], 0).values, torch.max(bt[:, 2:], 0).values)
    ).reshape(1, -1)
    return Boxes(union_bt)
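# Quick sanity check for get_box_union (arbitrary values): the union of
# [0, 0, 10, 10] and [5, 5, 20, 30] should be [0, 0, 20, 30].
import torch
from detectron2.structures import Boxes

b = Boxes(torch.tensor([[0., 0., 10., 10.], [5., 5., 20., 30.]]))
print(get_box_union(b).tensor)  # tensor([[ 0.,  0., 20., 30.]])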
def get2d2box(box):
    xmin = min(box[0])
    xmax = max(box[0])
    ymin = min(box[1])
    ymax = max(box[1])
    return Boxes(
        torch.as_tensor([[xmin, ymin, xmax, ymax]],
                        dtype=torch.float32, device='cuda'))
def get2d2boxes(boxes):
    list_boxes = []
    for box in boxes:
        xmin = min(box[0])
        xmax = max(box[0])
        ymin = min(box[1])
        ymax = max(box[1])
        list_boxes.append([xmin, ymin, xmax, ymax])
    return Boxes(
        torch.as_tensor(list_boxes, dtype=torch.float32, device='cuda'))
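# Hedged usage sketch: get2d2boxes appears to expect each box as a pair of
# coordinate sequences (box[0] = all x values, box[1] = all y values), e.g.
# projected corner points. The values are made up, and a CUDA device is
# assumed to be available, matching device='cuda' in the functions above.
corners = [
    ([12.0, 48.0, 30.0], [7.0, 25.0, 40.0]),  # -> [12, 7, 48, 40]
    ([100.0, 140.0], [60.0, 90.0]),           # -> [100, 60, 140, 90]
]
print(get2d2boxes(corners).tensor)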
def test_clip_area_0_degree(self):
    for _ in range(50):
        num_boxes = 100
        boxes_5d = torch.zeros(num_boxes, 5)
        boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
        boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
        boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, 500)
        boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, 500)
        # Convert from (x_ctr, y_ctr, w, h, 0) to (x1, y1, x2, y2)
        boxes_4d = torch.zeros(num_boxes, 4)
        boxes_4d[:, 0] = boxes_5d[:, 0] - boxes_5d[:, 2] / 2.0
        boxes_4d[:, 1] = boxes_5d[:, 1] - boxes_5d[:, 3] / 2.0
        boxes_4d[:, 2] = boxes_5d[:, 0] + boxes_5d[:, 2] / 2.0
        boxes_4d[:, 3] = boxes_5d[:, 1] + boxes_5d[:, 3] / 2.0

        image_size = (500, 600)
        test_boxes_4d = Boxes(boxes_4d)
        test_boxes_5d = RotatedBoxes(boxes_5d)
        # Before clip
        areas_4d = test_boxes_4d.area()
        areas_5d = test_boxes_5d.area()
        self.assertTrue(torch.allclose(areas_4d, areas_5d, atol=1e-1, rtol=1e-5))
        # After clip
        test_boxes_4d.clip(image_size)
        test_boxes_5d.clip(image_size)
        areas_4d = test_boxes_4d.area()
        areas_5d = test_boxes_5d.area()
        self.assertTrue(torch.allclose(areas_4d, areas_5d, atol=1e-1, rtol=1e-5))
def highest_only(self, predict):
    instance = predict["instances"].to(self.cpu_device)
    image_size = instance.image_size
    get_scores = instance.get("scores")
    pred_classes_index = []
    if len(get_scores.tolist()) != 0:
        _, highest_index = torch.max(get_scores, 0)
        pred_classes_index.append(highest_index)
    pred_classes = self.tensor_transform(instance.get("pred_classes"),
                                         pred_classes_index)
    scores = self.tensor_transform(instance.get("scores"), pred_classes_index)
    pred_boxes = Boxes(
        self.tensor_transform(
            instance.get("pred_boxes").tensor, pred_classes_index))
    return Instances(image_size=image_size,
                     pred_boxes=pred_boxes,
                     scores=scores,
                     pred_classes=pred_classes)
def generate_instance(self, instance: Instances, class_names: List[str],
                      total_classes: List[str]):
    instance = instance.to('cpu')
    boxes = instance.pred_boxes.tensor.numpy()
    masks = None
    scores = instance.scores.numpy()
    if instance.has("pred_masks"):
        masks = instance.pred_masks.numpy()
    for index, name in enumerate(class_names):
        if name not in total_classes:
            # zero out detections whose class is not requested
            boxes[index:index + 1] = 0
            scores[index] = 0
            if masks is not None:
                masks[index:index + 1] = False
    instance.pred_boxes = Boxes(torch.from_numpy(boxes))
    if masks is not None:
        instance.pred_masks = torch.from_numpy(masks)
    instance.scores = torch.from_numpy(scores)
    return instance
def flaw_only(predict):
    '''
    The predictions contain normal components, watermarks, and flaws; this
    keeps only the flaw detections and discards everything else.
    :param predict: the model's raw prediction output: a set of rectangular
        boxes with their positions and sizes (given by the top-left and
        bottom-right corners), each box's predicted class and score, the
        pixel-level mask inside each box (a boolean matrix), and so on
    :return: the filtered predictions
    '''
    cpu_device = torch.device("cpu")
    instance = predict["instances"].to(cpu_device)
    image_size = instance.image_size
    get_pred_classes = instance.get("pred_classes").numpy()
    pred_classes_index = []
    pred_classes = []
    for c in range(len(get_pred_classes)):
        if get_pred_classes[c] != 0 and get_pred_classes[c] != 1:
            pred_classes_index.append(c)
            pred_classes.append(get_pred_classes[c])
    pred_classes = torch.from_numpy(np.asarray(pred_classes))
    scores = tensor_transform(instance.get("scores"), pred_classes_index)
    pred_masks = tensor_transform(instance.get("pred_masks"), pred_classes_index)
    pred_boxes = Boxes(tensor_transform(instance.get("pred_boxes").tensor,
                                        pred_classes_index))
    return Instances(image_size=image_size,
                     pred_boxes=pred_boxes,
                     scores=scores,
                     pred_classes=pred_classes,
                     pred_masks=pred_masks)
def flaw_only(self, predict):
    instance = predict["instances"].to(self.cpu_device)
    image_size = instance.image_size
    get_pred_classes = instance.get("pred_classes").numpy()
    pred_classes_index = []
    pred_classes = []
    for c in range(len(get_pred_classes)):
        if get_pred_classes[c] != 0 and get_pred_classes[c] != 1:
            pred_classes_index.append(c)
            pred_classes.append(get_pred_classes[c])
    pred_classes = torch.from_numpy(np.asarray(pred_classes))
    scores = self.tensor_transform(instance.get("scores"), pred_classes_index)
    pred_masks = self.tensor_transform(instance.get("pred_masks"),
                                       pred_classes_index)
    pred_boxes = Boxes(
        self.tensor_transform(
            instance.get("pred_boxes").tensor, pred_classes_index))
    return Instances(image_size=image_size,
                     pred_boxes=pred_boxes,
                     scores=scores,
                     pred_classes=pred_classes,
                     pred_masks=pred_masks)
def project_proposal_instance(self, frame_index):
    if len(self.proposal_instances) == 1:
        # We consider the first frame; there is nothing to project here
        return self.proposal_instances[0]

    instance_index_current = self.last_key_instance_index[-1]
    instance_index_before = self.last_key_instance_index[-2]
    frame_index_current = self.frame_ids[instance_index_current]
    # >= rather than >, otherwise the replacement branch below is unreachable
    assert frame_index >= frame_index_current

    if frame_index == frame_index_current:
        # We are replacing the most recent proposal instance
        if len(self.proposal_instances) < 3:
            instance_index_current = self.last_key_instance_index[-2]
            return self.proposal_instances[instance_index_current]
        else:
            instance_index_current = self.last_key_instance_index[-2]
            instance_index_before = self.last_key_instance_index[-3]
            frame_index_current = self.frame_ids[instance_index_current]

    instance_current = self.proposal_instances[instance_index_current]
    instance_before = self.proposal_instances[instance_index_before]

    centers_current = instance_current.pred_boxes.get_centers()
    centers_before = instance_before.pred_boxes.get_centers()
    centers_delta = (centers_current - centers_before) / (
        instance_index_current - instance_index_before) * (
        frame_index - frame_index_current)

    projected_instance = Instances(instance_current.image_size)
    projected_instance.pred_boxes = Boxes(
        instance_current.pred_boxes.tensor + centers_delta.repeat(1, 2))
    projected_instance.scores = instance_current.scores
    projected_instance.pred_classes = instance_current.pred_classes
    projected_instance.class_distributions = instance_current.class_distributions
    projected_instance.generation_process = ["P"]
    return projected_instance
def draw_instance_predictions(self, predictions, category=None):
    """
    Draw instance-level prediction results on an image.

    Args:
        predictions (Instances): the output of an instance detection/segmentation
            model. The following fields will be used to draw:
            "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
        category: a list of integer categories to display, or None for all of them.

    Returns:
        output (VisImage): image object with visualizations.
    """
    # start additional code
    if category is None:
        boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
        scores = predictions.scores if predictions.has("scores") else None
        classes = predictions.pred_classes if predictions.has("pred_classes") else None
        labels = self._create_text_labels(
            classes, scores, self.metadata.get("thing_classes", None))
        keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
    else:
        all_boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
        all_scores = predictions.scores if predictions.has("scores") else None
        all_classes = predictions.pred_classes if predictions.has("pred_classes") else None
        all_labels = self._create_text_labels(
            all_classes, all_scores, self.metadata.get("thing_classes", None))
        all_keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None

        boxes = [] if all_boxes is not None else None
        scores = [] if all_scores is not None else None
        classes = [] if all_classes is not None else None
        labels = [] if all_labels is not None else None
        keypoints = [] if all_keypoints is not None else None

        for c in category:
            for i in range(0, len(all_classes)):
                if all_classes[i] == c:
                    classes.append(all_classes[i])
                    if all_boxes is not None:
                        boxes.append(all_boxes[i])
                    if all_scores is not None:
                        scores.append(all_scores[i])
                    if all_labels is not None:
                        labels.append(all_labels[i])
                    if all_keypoints is not None:
                        keypoints.append(all_keypoints[i])

        if boxes is not None and len(boxes) > 0:
            boxes = Boxes(torch.cat([b.tensor for b in boxes], dim=0))
        if scores is not None and len(scores) > 0:
            scores = torch.stack(scores)
        if classes is not None and len(classes) > 0:
            classes = torch.stack(classes)
    # end additional code

    # removed alpha from here and put it as a fixed value
    if predictions.has("pred_masks"):
        masks = np.asarray(predictions.pred_masks)
        masks = [
            GenericMask(x, self.output.height, self.output.width) for x in masks
        ]
    else:
        masks = None

    if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
        colors = [
            self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
            for c in classes
        ]
    else:
        colors = None

    if self._instance_mode == ColorMode.IMAGE_BW:
        self.output.img = self._create_grayscale_image(
            (predictions.pred_masks.any(dim=0) > 0).numpy())

    self.overlay_instances(labels=labels,
                           boxes=boxes,
                           masks=masks,
                           keypoints=keypoints,
                           assigned_colors=colors,
                           alpha=1)
    return self.output
def to_boxes_from_xywh(bbox_xywh: torch.Tensor) -> Boxes:
    return Boxes(get_bbox_xyxy_from_xywh(bbox_xywh).unsqueeze(0))
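# get_bbox_xyxy_from_xywh is referenced above but not defined in this file;
# a minimal sketch consistent with the call site (XYWH -> XYXY on a 1-D
# tensor, which the caller then unsqueezes to shape (1, 4)):
import torch

def get_bbox_xyxy_from_xywh(bbox_xywh: torch.Tensor) -> torch.Tensor:
    x, y, w, h = bbox_xywh.unbind(-1)
    return torch.stack((x, y, x + w, y + h), dim=-1)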
    for i in range(dims):
        shape.append(struct.unpack("=i", f.read(4))[0])
    count = np.prod(shape)
    data = []
    for i in range(count):
        data.append(struct.unpack("=f", f.read(4))[0])
    return np.asarray(data, dtype=np.float32).reshape(shape)


if __name__ == '__main__':
    priorbox0 = OriginPriorBox().forward().numpy()
    print(priorbox0, priorbox0.shape)
    # priorbox1 = load_priors('/media/ps/A1/XPC/data/CCPD/ccpd_rotate_coco/output/model_0335999.anc')
    # print(priorbox1, priorbox1.shape)

    fmap = [
        torch.randn(1, 3, 100, 100),
        torch.randn(1, 3, 50, 50),
        torch.randn(1, 3, 25, 25),
    ]
    dag = DefaultAnchorGenerator(
        sizes=[[16, 32], [64, 128], [256, 512]],
        aspect_ratios=[[1.0]],
        strides=[8, 16, 32],
        offset=0.5,
    )
    anc = dag(fmap)
    anc = Boxes.cat(anc).tensor.detach().cpu().numpy()
    print(anc, anc.shape)
def _geometric_aug_func(x, target, angle=0, translate=(0, 0), scale=1,
                        shear=(0, 0), hflip=False, boxes_sample_prob=[],
                        scale_ratio=1.0):
    use_mask = ('gt_masks' in target)
    boxes_and_labels = [(target['gt_boxes'].tensor[i], target['gt_classes'][i],
                         target['gt_masks'].polygons[i] if use_mask else None)
                        for i in range(len(target['gt_boxes']))
                        if random.random() < boxes_sample_prob[i]]
    boxes = [b_and_l[0] for b_and_l in boxes_and_labels]
    labels = [b_and_l[1] for b_and_l in boxes_and_labels]
    masks = [b_and_l[2] for b_and_l in boxes_and_labels]

    if random.random() < 0.5:
        angle *= -1
        translate = (-translate[0], -translate[1])
        shear = (-shear[0], -shear[1])
        translate = (0, 0)

    height, width = x.shape[1], x.shape[2]
    x_crops = []
    boxes_crops = []
    boxes_new = []
    labels_new = []
    masks_new = []
    for i, box in enumerate(boxes):
        box_crop = scale_area(box, height, width, scale_ratio)
        y1, x1, y2, x2 = box_crop.long()
        x_crop = x[:, x1:x2, y1:y2]
        boxes_crops.append(box_crop)
        if x1 >= x2 or y1 >= y2:
            x_crops.append(x_crop)
            continue
        if hflip:
            x_crop = x_crop.flip(-1)
        elif translate[0] + translate[1] != 0:
            offset_y = (y2 + translate[0]).clamp(0, width).long().tolist() - y2
            offset_x = (x2 + translate[1]).clamp(0, height).long().tolist() - x2
            if offset_x != 0 or offset_y != 0:
                offset = [offset_y, offset_x]
                boxes_new.append(box + torch.Tensor(offset * 2))
                labels_new.append(labels[i])
                if use_mask:
                    polys = masks[i]
                    polys_out = []
                    for poly in polys:
                        poly_new = copy.deepcopy(poly)
                        poly_new[0::2] = poly_new[0::2] + offset_y
                        poly_new[1::2] = poly_new[1::2] + offset_x
                        polys_out.append(poly_new)
                    masks_new.append(polys_out)
        else:
            x_crop = transforms.functional.to_pil_image(x_crop.cpu())
            x_crop = transforms.functional.affine(
                x_crop, angle, translate, scale, shear, resample=2,
                fillcolor=tuple([int(i) for i in pixel_mean]))
            x_crop = transforms.functional.to_tensor(x_crop).to(x.device)
        x_crops.append(x_crop)

    y = _transform(x, x_crops, boxes_crops, translate)

    if translate[0] + translate[1] != 0 and len(boxes_new) > 0:
        # concatenate on the underlying tensor; torch.cat cannot take a
        # Boxes object directly
        target['gt_boxes'] = Boxes(
            torch.cat((target['gt_boxes'].tensor, torch.stack(boxes_new))))
        target['gt_classes'] = torch.cat(
            (target['gt_classes'], torch.Tensor(labels_new).long()))
        if use_mask:
            target['gt_masks'] = PolygonMasks(target['gt_masks'].polygons + masks_new)
    return y, target
def vis_training_targets(cfg, fcose_outputs, image_list, idx=0):
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    import numpy as np

    colors = np.array([[1, 1, 198], [51, 1, 148], [101, 1, 98],
                       [151, 1, 48], [201, 1, 8]]) / 255.

    num_loc_list = [len(loc) for loc in fcose_outputs.locations]
    fcose_outputs.num_loc_list = num_loc_list

    # compute locations to size ranges
    loc_to_size_range = []
    for l, loc_per_level in enumerate(fcose_outputs.locations):
        loc_to_size_range_per_level = loc_per_level.new_tensor(
            fcose_outputs.sizes_of_interest[l])
        loc_to_size_range.append(
            loc_to_size_range_per_level[None].expand(num_loc_list[l], -1))

    # (Sigma_{levels_points}, 2)
    loc_to_size_range = torch.cat(loc_to_size_range, dim=0)
    locations = torch.cat(fcose_outputs.locations, dim=0)

    training_targets = fcose_outputs.compute_targets_for_locations(
        locations, fcose_outputs.gt_instances, loc_to_size_range)
    training_target = {k: v[idx] for k, v in training_targets.items()}

    fig, ax = plt.subplots(1, figsize=(20, 10))
    fig.tight_layout()

    labels = training_target['labels']
    reg_targets = training_target['reg_targets']
    ext_targets = training_target['ext_targets']

    idxOfloc_of_interest = torch.where(labels != 20)[0]

    global locxys, reg_targets_oi, ext_targets_oi, detections
    locxys = locations[idxOfloc_of_interest]
    reg_targets_oi = reg_targets[idxOfloc_of_interest]
    ext_targets_oi = ext_targets[idxOfloc_of_interest]

    detections = torch.stack([
        locxys[:, 0] - reg_targets_oi[:, 0],
        locxys[:, 1] - reg_targets_oi[:, 1],
        locxys[:, 0] + reg_targets_oi[:, 2],
        locxys[:, 1] + reg_targets_oi[:, 3],
    ], dim=1)

    global tmp, ext_points
    ext_points = ExtremePoints.from_boxes(Boxes(detections), ext_targets_oi,
                                          locxys).tensor.cpu().numpy()
    tmp = ext_points

    im = image_list.tensor[idx]
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(im.device).view(-1, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(im.device).view(-1, 1, 1)
    im_norm = ((im * pixel_std) + pixel_mean).cpu().numpy().transpose(1, 2, 0).astype(np.uint8)
    ax.imshow(im_norm)

    locxys_np = locxys.cpu().numpy()
    reg_targets_oi_np = reg_targets_oi.cpu().numpy()
    ext_targets_oi_np = ext_targets_oi.cpu().numpy()
    detections_np = detections.cpu().numpy()

    for i in range(len(locxys_np)):
        ax.scatter(locxys_np[i, 0], locxys_np[i, 1],
                   color=colors[i % len(colors)].tolist(), marker='*')
        x1, y1, x2, y2 = detections_np[i, :]
        rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=1,
                                 edgecolor=colors[i % len(colors)].tolist(),
                                 facecolor='none', fill=False)
        ax.add_patch(rect)
        ax.scatter(ext_points[i][:, 0], ext_points[i][:, 1],
                   color=colors[i % len(colors)].tolist(), marker='+')

    plt.show()
def to_boxes_from_xywh(bbox_xywh):
    return Boxes(get_bbox_xyxy_from_xywh(bbox_xywh).unsqueeze(0))
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances (optional): groundtruth :class:`Instances`
            * proposals (optional): :class:`Instances`, precomputed proposals.

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        list[dict]: Each dict is the output for one input image. The dict
            contains one key "instances" whose value is a :class:`Instances`.
            The :class:`Instances` object has the following keys:
            "pred_boxes", "pred_classes", "scores", "pred_masks",
            "pred_keypoints"
    """
    if not self.training:
        return self.inference(batched_inputs)

    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    proposals, proposal_losses = self.proposal_generator(images, features,
                                                         gt_instances)
    _, outputs_classic, outputs = self.tsd(images, features, proposals,
                                           gt_instances)

    detector_classic_losses = outputs_classic.losses()
    detector_losses = outputs.losses()
    detector_classic_losses['loss_cls_classic'] = detector_classic_losses.pop('loss_cls')
    detector_classic_losses['loss_box_reg_classic'] = detector_classic_losses.pop('loss_box_reg')

    if self.vis_period > 0:
        storage = get_event_storage()
        if storage.iter % self.vis_period == 0:
            self.visualize_training(batched_inputs, proposals)

    # Progressive constraints
    margin_regression_losses = 0
    predict_boxes_classic = outputs_classic.predict_boxes_for_gt_classes()
    predict_boxes = outputs.predict_boxes_for_gt_classes()
    idx = -1
    endIdx = 0
    ind = outputs.gt_classes != (outputs.pred_proposal_deltas.size(1) / 4)
    for pbc, pb in zip(predict_boxes_classic, predict_boxes):
        idx += 1
        startIdx = endIdx
        endIdx += outputs.num_preds_per_image[idx]
        iind = ind[startIdx:endIdx]
        margin_regression_losses += F.relu(self.MR - abs(
            matched_boxlist_iou(Boxes(pbc[iind]),
                                outputs.gt_boxes[startIdx:endIdx][iind]) -
            matched_boxlist_iou(Boxes(pb[iind]),
                                outputs.gt_boxes[startIdx:endIdx][iind]))).mean()
    margin_regression_losses = margin_regression_losses / len(predict_boxes)

    margin_classification_losses = 0
    for ppc, pc in zip(outputs_classic.predict_probs(), outputs.predict_probs()):
        margin_classification_losses += F.relu(self.MC - (abs(ppc - pc)).sum(1)).mean()
    margin_classification_losses = margin_classification_losses / len(
        outputs.predict_probs())

    losses = {}
    losses.update(detector_classic_losses)
    losses.update(detector_losses)
    losses.update(proposal_losses)
    losses.update({
        'loss_margin_classification': margin_classification_losses,
        'loss_margin_regression': margin_regression_losses
    })
    return losses
def forward(self, features, all_phrase_ids, targets, precomp_boxes,
            precomp_score, precomp_det_label, image_scale, all_sent_sgs,
            all_sentences, image_unique_id, det_label_embedding):
    """
    :param features: feature maps from the backbone
    :param all_phrase_ids: phrase ids for each image
    :param targets: gt relation labels
    :param precomp_boxes: proposals for each image
    :param object_vocab, object_vocab_len: [[xxx, xxx], [xxx], [xxx]], [2, 1, 1]
    :param all_sent_sgs: sentence scene graphs
    :return: predictions and losses
    Note that the first dimension is images.
    """
    img_num_per_gpu = len(features)

    batch_decode_logits = []
    batch_topk_decoder_logits = []
    batch_pred_similarity = []
    batch_precomp_boxes = []
    batch_topk_precomp_boxes = []
    batch_pred_boxes = []
    batch_topk_pred_boxes = []
    batch_topk_fusion_pred_boxes = []
    batch_topk_pred_similarity = []
    batch_topk_fusion_similarity = []
    batch_boxes_targets = []
    batch_ctx_embed = []
    batch_ctx_s1_embed = []
    batch_pred_targets = []
    batch_topk_pred_targets = []

    """ Language Embedding """
    batch_phrase_ids, batch_phrase_types, batch_phrase_embed, batch_phrase_len, \
        batch_phrase_dec_ids, batch_phrase_mask, batch_decoder_word_embed, \
        batch_phrase_glove_embed, batch_rel_phrase_embed, batch_relation_conn, \
        batch_sent_embed, batch_decoder_rel_word_embed, batch_rel_mask, \
        batch_rel_dec_idx = self.phrase_embed(all_sentences, all_phrase_ids,
                                              all_sent_sgs)

    h, w = features.shape[-2:]
    # self.storage = get_event_storage()

    for bid in range(img_num_per_gpu):
        """ Visual Embedding """
        precomp_boxes_bid = precomp_boxes[bid].to(self.device)  # 100*4
        order = []
        for phr_ids in batch_phrase_ids[bid]:
            order.append(all_phrase_ids[bid].index(phr_ids))
        target_filter = targets[bid][np.array(order)]
        batch_boxes_targets.append(target_filter.to(self.device))
        batch_precomp_boxes.append(precomp_boxes_bid)

        img_feat_bid = features[[bid]]
        visual_features_bid = self.rcnn_top(
            self.det_roi_pooler([img_feat_bid], [precomp_boxes_bid])
        ).mean(dim=[2, 3]).contiguous()
        if cfg.MODEL.VG.SPATIAL_FEAT:
            spa_feat = meshgrid_generation(h, w)
            spa_feat = self.det_roi_pooler([spa_feat], [precomp_boxes_bid]).view(
                visual_features_bid.shape[0], -1)
            spa_feat = self.spatial_trans(spa_feat)
            visual_features_bid = torch.cat((visual_features_bid, spa_feat), dim=1)
        visual_features_bid = self.visual_embedding(visual_features_bid)
        visual_features_bid = self.vis_batchnorm(visual_features_bid)

        """ Noun Phrase Embedding """
        phrase_embed_bid = batch_phrase_embed[bid]
        if phrase_embed_bid.shape[0] == 1 and self.training:
            phrase_embed_bid = self.phr_batchnorm(phrase_embed_bid.repeat(2, 1))[[0]]
        else:
            phrase_embed_bid = self.phr_batchnorm(phrase_embed_bid)

        """ Similarity and Attention Prediction """
        num_box = precomp_boxes_bid.tensor.size(0)
        num_phrase = phrase_embed_bid.size(0)
        phr_inds, obj_inds = self.make_pair(num_phrase, num_box)
        pred_similarity_bid, pred_targets_bid = self.similarity(
            visual_features_bid, phrase_embed_bid, obj_inds, phr_inds)
        pred_similarity_bid = pred_similarity_bid.reshape(num_phrase, num_box)
        pred_targets_bid = pred_targets_bid.reshape(num_phrase, num_box, 4)
        batch_pred_targets.append(pred_targets_bid)

        if cfg.MODEL.VG.USING_DET_KNOWLEDGE:
            det_label_embedding_bid = det_label_embedding[bid].to(self.device)
            sim = self.cal_det_label_sim_max(det_label_embedding_bid,
                                             batch_phrase_glove_embed[bid])
            pred_similarity_bid = pred_similarity_bid * sim
            sim_mask = (sim > 0).float()
            atten_bid = numerical_stability_masked_softmax(
                pred_similarity_bid, sim_mask, dim=1)
        else:
            atten_bid = F.softmax(pred_similarity_bid, dim=1)

        # reconstruct visual features
        visual_reconst_bid = torch.mm(atten_bid, visual_features_bid)
        decode_phr_logits = self.phrase_decoder(visual_reconst_bid,
                                                batch_decoder_word_embed[bid])
        batch_decode_logits.append(decode_phr_logits)

        atten_score_topk, atten_ranking_topk = torch.topk(
            atten_bid, dim=1, k=self.s2_topk)  # (N, 10)
        ind_phr_topk = np.arange(num_phrase).repeat(self.s2_topk)

        ## -----------------------------------------------------##
        ## crop second-stage features
        ## -----------------------------------------------------##
        if self.storage.iter <= cfg.SOLVER.REG_START_ITER:
            visual_features_topk_bid = visual_features_bid[atten_ranking_topk.reshape(-1)]
            precomp_boxes_topk_bid = precomp_boxes_bid[atten_ranking_topk.reshape(-1)]
            batch_topk_precomp_boxes.append(precomp_boxes_topk_bid)
        else:
            topk_box_ids = atten_ranking_topk.reshape(-1) + torch.as_tensor(
                ind_phr_topk, dtype=torch.long).to(self.device) * num_box
            precomp_boxes_tensor, box_size = precomp_boxes_bid.tensor, precomp_boxes_bid.size
            precomp_boxes_topk_tensor = precomp_boxes_tensor[atten_ranking_topk.reshape(-1)]  # (N*10, 4)
            pred_targets_s0 = pred_targets_bid.view(-1, 4)[topk_box_ids]
            precomp_boxes_topk_bid = self.box2box_translation.apply_deltas(
                pred_targets_s0, precomp_boxes_topk_tensor)
            precomp_boxes_topk_bid = Boxes(precomp_boxes_topk_bid, box_size)
            precomp_boxes_topk_bid.clip()
            batch_topk_precomp_boxes.append(precomp_boxes_topk_bid)

            # re-extract features for the regressed top-k boxes
            visual_features_topk_bid = self.rcnn_top(
                self.det_roi_pooler([img_feat_bid], [precomp_boxes_topk_bid])
            ).mean(dim=[2, 3]).contiguous()
            if cfg.MODEL.VG.SPATIAL_FEAT:
                spa_feat = meshgrid_generation(h, w)
                spa_feat = self.det_roi_pooler([spa_feat], [precomp_boxes_topk_bid]).view(
                    visual_features_topk_bid.shape[0], -1)
                spa_feat = self.spatial_trans(spa_feat)
                visual_features_topk_bid = torch.cat(
                    (visual_features_topk_bid, spa_feat), dim=1)
            visual_features_topk_bid = self.visual_embedding(visual_features_topk_bid)  # (N*10, 1024)
            visual_features_topk_bid = self.vis_batchnorm(visual_features_topk_bid)

        pred_similarity_topk_bid, pred_targets_topk_bid = self.similarity_topk(
            visual_features_topk_bid, phrase_embed_bid, ind_phr_topk)
        pred_similarity_topk_bid = pred_similarity_topk_bid.reshape(num_phrase, self.s2_topk)
        pred_targets_topk_bid = pred_targets_topk_bid.reshape(num_phrase, self.s2_topk, 4)
        batch_topk_pred_targets.append(pred_targets_topk_bid)

        if cfg.MODEL.VG.USING_DET_KNOWLEDGE:
            sim_topk = torch.gather(sim, dim=1, index=atten_ranking_topk.long())
            sim_mask = (sim_topk > 0).float()
            pred_similarity_topk_bid = pred_similarity_topk_bid * sim_topk
            atten_topk_bid = numerical_stability_masked_softmax(
                pred_similarity_topk_bid, sim_mask, dim=1)
        else:
            atten_topk_bid = F.softmax(pred_similarity_topk_bid, dim=1)

        atten_fusion = atten_topk_bid * atten_score_topk  # N*10
        visual_features_topk_bid = visual_features_topk_bid.view(
            num_phrase, self.s2_topk, -1)
        visual_reconst_topk_bid = (atten_fusion.unsqueeze(2) *
                                   visual_features_topk_bid).sum(1)  # N*1024
        decoder_phr_topk_logits = self.phrase_decoder(visual_reconst_topk_bid,
                                                      batch_decoder_word_embed[bid])
        batch_topk_decoder_logits.append(decoder_phr_topk_logits)

        # construct the discriminative loss
        batch_ctx_s1_embed.append(self.visual_mlp(visual_reconst_bid.mean(0, keepdim=True)))
        batch_ctx_embed.append(self.visual_mlp(visual_reconst_topk_bid.mean(0, keepdim=True)))

        batch_pred_similarity.append(atten_bid)
        batch_topk_pred_similarity.append(atten_topk_bid)
        batch_topk_fusion_similarity.append(atten_fusion)

        # transform boxes for stage-1
        num_phrase_indices = torch.arange(num_phrase).long().to(self.device)
        max_box_ind = atten_bid.detach().cpu().numpy().argmax(1)
        precomp_boxes_delta_max = pred_targets_bid[num_phrase_indices, max_box_ind]  # num_phrase*4

        max_topk_id = torch.topk(atten_topk_bid, dim=1, k=1)[1].long().squeeze(1)
        precomp_boxes_delta_max_topk = pred_targets_topk_bid[num_phrase_indices, max_topk_id]  # num_phrase*4
        precomp_boxes_topk_bid_tensor = precomp_boxes_topk_bid.tensor.reshape(-1, self.s2_topk, 4)

        max_fusion_topk_id = torch.topk(atten_fusion, dim=1, k=1)[1].long().squeeze()
        precomp_boxes_delta_max_topk_fusion = pred_targets_topk_bid[
            num_phrase_indices, max_fusion_topk_id]  # num_phrase*4

        phr_index = torch.arange(num_phrase).to(self.device) * self.s2_topk
        if self.storage.iter <= cfg.SOLVER.REG_START_ITER:
            max_select_boxes = precomp_boxes_bid[max_box_ind]
            max_precomp_boxes = precomp_boxes_topk_bid[max_topk_id + phr_index]
            max_fusion_precomp_boxes = precomp_boxes_topk_bid[max_fusion_topk_id + phr_index]
        else:
            max_select_boxes = Boxes(
                self.box2box_translation.apply_deltas(
                    precomp_boxes_delta_max,
                    precomp_boxes_bid[max_box_ind].tensor),
                precomp_boxes_bid.size)
            max_precomp_boxes = Boxes(
                self.box2box_translation.apply_deltas(
                    precomp_boxes_delta_max_topk,
                    precomp_boxes_topk_bid_tensor[num_phrase_indices, max_topk_id]),
                precomp_boxes_bid.size)
            max_fusion_precomp_boxes = Boxes(
                self.box2box_translation.apply_deltas(
                    precomp_boxes_delta_max_topk_fusion,
                    precomp_boxes_topk_bid_tensor[num_phrase_indices, max_fusion_topk_id]),
                precomp_boxes_bid.size)

        batch_pred_boxes.append(max_select_boxes)
        batch_topk_pred_boxes.append(max_precomp_boxes)
        batch_topk_fusion_pred_boxes.append(max_fusion_precomp_boxes)

    batch_ctx_sim, batch_ctx_sim_s1 = self.generate_image_sent_discriminative(
        batch_sent_embed, batch_ctx_embed, batch_ctx_s1_embed)

    noun_reconst_loss, noun_topk_reconst_loss, disc_img_sent_loss_s1, \
        disc_img_sent_loss_s2, reg_loss, reg_loss_s1 = self.VGLoss(
            batch_phrase_mask, batch_decode_logits, batch_topk_decoder_logits,
            batch_phrase_dec_ids, batch_ctx_sim, batch_ctx_sim_s1,
            batch_pred_similarity, batch_topk_pred_similarity,
            batch_boxes_targets, batch_precomp_boxes, batch_pred_targets,
            batch_topk_pred_targets, batch_topk_precomp_boxes)

    all_loss = dict(noun_reconst_loss=noun_reconst_loss,
                    noun_topk_reconst_loss=noun_topk_reconst_loss,
                    disc_img_sent_loss_s1=disc_img_sent_loss_s1,
                    disc_img_sent_loss_s2=disc_img_sent_loss_s2,
                    reg_loss_s1=reg_loss,
                    reg_loss_s2=reg_loss_s1)

    if self.training:
        return all_loss, None
    else:
        return all_loss, (batch_phrase_ids, batch_phrase_types,
                          move2cpu(batch_pred_boxes),
                          move2cpu(batch_pred_similarity),
                          move2cpu(batch_boxes_targets),
                          move2cpu(batch_precomp_boxes),
                          image_unique_id,
                          move2cpu(batch_topk_pred_similarity),
                          move2cpu(batch_topk_fusion_similarity),
                          move2cpu(batch_topk_pred_boxes),
                          move2cpu(batch_topk_fusion_pred_boxes),
                          move2cpu(batch_topk_precomp_boxes),
                          move2cpu(batch_topk_pred_targets),
                          move2cpu(batch_pred_targets))
def __call__(self, tensor, target):
    if self.ratio >= 1.0:
        return tensor, target

    self.img_pool.append({'tensor': tensor, 'target': target})
    if len(self.img_pool) > self.img_pool_size:
        self.img_pool.pop(0)
    if len(self.img_pool) < 4:
        return tensor, target

    use_mask = ('gt_masks' in target)
    bbox = target['gt_boxes']
    classes = target['gt_classes']
    masks = target['gt_masks'] if use_mask else None

    c, h, w = tensor.shape
    h = int(math.ceil(h / self.size_divisible) * self.size_divisible)
    w = int(math.ceil(w / self.size_divisible) * self.size_divisible)
    new_h, new_w = int(self.ratio * h), int(self.ratio * w)

    in_tensor, in_bbox, in_mask = scale_jitter(tensor, bbox, self.ratio,
                                               (new_h, new_w), masks)

    pad_imgs = random.sample(self.img_pool, 3)
    pad_tensors, pad_bboxes, pad_masks = [], [], []
    for img in pad_imgs:
        pad_tensor, pad_bbox, pad_mask = scale_jitter(
            img['tensor'], img['target']['gt_boxes'], self.ratio,
            (new_h, new_w),
            img['target']['gt_masks'] if use_mask else None)
        pad_tensors.append(pad_tensor)
        pad_bboxes.append(pad_bbox)
        pad_masks.append(pad_mask)

    crop_boxes = [(new_h, w - new_w), (h - new_h, new_w), (h - new_h, w - new_w)]
    tensor_out = in_tensor.new(*(c, h, w)).zero_()
    tensor_out[:c, :new_h, :new_w].copy_(in_tensor)
    tensor_out[:c, :new_h, new_w:].copy_(
        pad_tensors[0][:c, :crop_boxes[0][0], :crop_boxes[0][1]])
    tensor_out[:c, new_h:, :new_w].copy_(
        pad_tensors[1][:c, :crop_boxes[1][0], :crop_boxes[1][1]])
    tensor_out[:c, new_h:, new_w:].copy_(
        pad_tensors[2][:c, :crop_boxes[2][0], :crop_boxes[2][1]])

    crop_bboxes, crop_classes, crop_masks = [], [], []
    for i, pad_bbox in enumerate(pad_bboxes):
        crop_bbox = copy.deepcopy(pad_bbox)
        crop_bbox.clip(crop_boxes[i])
        ious = crop_bbox.area() / pad_bbox.area()
        inds = ious >= self.iou_threshold
        crop_bbox = crop_bbox[inds]
        crop_bboxes.append(crop_bbox)
        crop_classes.append(pad_imgs[i]['target']['gt_classes'][inds])
        if use_mask:
            crop_masks.append(
                [mask for j, mask in enumerate(pad_masks[i]) if inds[j]])

    offsets_box = [
        torch.Tensor([0.0, 0.0, 0.0, 0.0]),
        torch.Tensor([new_w, 0.0, new_w, 0.0]),
        torch.Tensor([0.0, new_h, 0.0, new_h]),
        torch.Tensor([new_w, new_h, new_w, new_h])
    ]
    offsets_mask = [[0.0, 0.0], [0.0, new_w], [new_h, 0], [new_h, new_w]]

    bbox_out = Boxes(
        torch.cat([
            target.tensor + offsets_box[i]
            for i, target in enumerate([in_bbox] + crop_bboxes)
        ], dim=0))
    classes_out = torch.cat([classes] + crop_classes, dim=0)
    target_out = {'gt_boxes': bbox_out, 'gt_classes': classes_out}

    if use_mask:
        masks_out = []
        for i, crop_mask in enumerate([in_mask] + crop_masks):
            mask_out = []
            for polys in crop_mask:
                poly_out = []
                for poly in polys:
                    poly_new = copy.deepcopy(poly)
                    poly_new[0::2] = poly_new[0::2] + offsets_mask[i][1]
                    poly_new[1::2] = poly_new[1::2] + offsets_mask[i][0]
                    poly_out.append(poly_new)
                mask_out.append(poly_out)
            masks_out += mask_out
        masks_out = PolygonMasks(masks_out)
        target_out['gt_masks'] = masks_out

    return tensor_out, target_out