def forward(self, samples: NestedTensor): """ The forward expects a NestedTensor, which consists of: - samples.tensor: batched images, of shape [batch_size x 3 x H x W] - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels It returns a dict with the following elements: - "pred_logits": the classification logits (including no-object) for all queries. Shape= [batch_size x num_queries x (num_classes + 1)] - "pred_boxes": The normalized boxes coordinates for all queries, represented as (center_x, center_y, height, width). These values are normalized in [0, 1], relative to the size of each individual image (disregarding possible padding). See PostProcess for information on how to retrieve the unnormalized bounding box. - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of dictionnaries containing the two above keys for each decoder layer. """ if isinstance(samples, (list, torch.Tensor)): samples = nested_tensor_from_tensor_list(samples) features, pos = self.backbone(samples) src, mask = features[-1].decompose() assert mask is not None hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0] outputs_class = self.class_embed(hs) outputs_coord = self.bbox_embed(hs).sigmoid() out = { 'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1] } if self.aux_loss: out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord) return out
def forward(self, samples: NestedTensor):
    if isinstance(samples, (list, torch.Tensor)):
        samples = nested_tensor_from_tensor_list(samples)
    features, pos = self.detr.backbone(samples)

    bs = features[-1].tensors.shape[0]

    src, mask = features[-1].decompose()
    assert mask is not None
    src_proj = self.detr.input_proj(src)
    hs, memory = self.detr.transformer(src_proj, mask, self.detr.query_embed.weight, pos[-1])

    # Standard DETR detection outputs.
    outputs_class = self.detr.class_embed(hs)
    outputs_coord = self.detr.bbox_embed(hs).sigmoid()
    out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]}
    if self.detr.aux_loss:
        out["aux_outputs"] = self.detr._set_aux_loss(outputs_class, outputs_coord)

    # FIXME h_boxes takes the last one computed, keep this in mind
    bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask)

    # The mask head fuses the projected features with the backbone maps
    # (note the reversed, coarse-to-fine order) to produce per-query masks.
    seg_masks = self.mask_head(src_proj, bbox_mask,
                               [features[2].tensors, features[1].tensors, features[0].tensors])
    outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries,
                                       seg_masks.shape[-2], seg_masks.shape[-1])

    out["pred_masks"] = outputs_seg_masks
    return out
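A hedged inference sketch for the segmentation wrapper above; `seg_model`, the input size, and the post-processing step are illustrative assumptions (the repository's post-processors handle upsampling and thresholding in practice):

```python
import torch

seg_model.eval()
with torch.no_grad():
    out = seg_model(nested_tensor_from_tensor_list([torch.rand(3, 480, 640)]))

# Raw, low-resolution mask logits, one per query; upsample to the image size
# and apply a sigmoid before thresholding.
masks = out["pred_masks"]  # [batch, num_queries, h', w']
probs = torch.nn.functional.interpolate(
    masks, size=(480, 640), mode="bilinear", align_corners=False
).sigmoid()
```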
def run_model(self, model, inputs_list, tolerate_small_mismatch=False,
              do_constant_folding=True, dynamic_axes=None,
              output_names=None, input_names=None):
    model.eval()

    onnx_io = io.BytesIO()
    # Export to ONNX with the first input in the list.
    torch.onnx.export(model, inputs_list[0], onnx_io,
                      do_constant_folding=do_constant_folding, opset_version=12,
                      dynamic_axes=dynamic_axes,
                      input_names=input_names, output_names=output_names)
    # Validate the exported model against ONNX Runtime on every input.
    for test_inputs in inputs_list:
        with torch.no_grad():
            if isinstance(test_inputs, (torch.Tensor, list)):
                test_inputs = (nested_tensor_from_tensor_list(test_inputs),)
            test_outputs = model(*test_inputs)
            if isinstance(test_outputs, torch.Tensor):
                test_outputs = (test_outputs,)
        self.ort_validate(onnx_io, test_inputs, test_outputs, tolerate_small_mismatch)
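A sketch of how this helper might be invoked from a test; the model constructor call and the name lists are assumptions, and note that without `dynamic_axes` every validation input must keep the exported shape:

```python
model = detr_resnet50(pretrained=False).eval()
inputs_list = [
    [torch.rand(3, 200, 200)],
    [torch.rand(3, 200, 200)],
]
self.run_model(model, inputs_list, tolerate_small_mismatch=True,
               input_names=["inputs"], output_names=["pred_logits", "pred_boxes"])
```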
def loss_masks(self, outputs, targets, indices, num_boxes):
    """Compute the losses related to the masks: the focal loss and the dice loss.
       targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
    """
    assert "pred_masks" in outputs

    src_idx = self._get_src_permutation_idx(indices)
    tgt_idx = self._get_tgt_permutation_idx(indices)
    src_masks = outputs["pred_masks"]
    src_masks = src_masks[src_idx]
    masks = [t["masks"] for t in targets]
    target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
    target_masks = target_masks.to(src_masks)
    target_masks = target_masks[tgt_idx]

    # upsample predictions to the target size
    src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:],
                            mode="bilinear", align_corners=False)
    src_masks = src_masks[:, 0].flatten(1)

    target_masks = target_masks.flatten(1)
    target_masks = target_masks.view(src_masks.shape)
    losses = {
        "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes),
        "loss_dice": dice_loss(src_masks, target_masks, num_boxes),
    }
    return losses
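For context, a minimal sketch of a dice loss consistent with the call above (sigmoid on the logits, per-mask dice averaged over `num_boxes`); the repository's actual implementation may differ in details:

```python
import torch

def dice_loss_sketch(inputs: torch.Tensor, targets: torch.Tensor, num_boxes: float):
    # inputs: [num_matched, H*W] mask logits; targets: binary, same shape.
    inputs = inputs.sigmoid()
    numerator = 2 * (inputs * targets).sum(1)
    denominator = inputs.sum(1) + targets.sum(1)
    # Additive smoothing avoids division by zero for empty masks.
    loss = 1 - (numerator + 1) / (denominator + 1)
    return loss.sum() / num_boxes
```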
def _assert_model_output(self, model, scripted_model):
    x = nested_tensor_from_tensor_list([torch.rand(3, 200, 200), torch.rand(3, 200, 250)])
    out = model(x)
    out_script = scripted_model(x)
    self.assertTrue(out["pred_logits"].equal(out_script["pred_logits"]))
    self.assertTrue(out["pred_boxes"].equal(out_script["pred_boxes"]))
def test_model_script_detection(self):
    model = detr_resnet50(pretrained=False).eval()
    scripted_model = torch.jit.script(model)
    self._assert_model_output(model, scripted_model)
def test_model_detection_different_inputs(self):
    model = detr_resnet50(pretrained=False).eval()
    # support NestedTensor
    x = nested_tensor_from_tensor_list([torch.rand(3, 200, 200), torch.rand(3, 200, 250)])
    out = model(x)
    self.assertIn('pred_logits', out)
    # and 4d Tensor
    x = torch.rand(1, 3, 200, 200)
    out = model(x)
    self.assertIn('pred_logits', out)
    # and List[Tensor[C, H, W]]
    x = torch.rand(3, 200, 200)
    out = model([x])
    self.assertIn('pred_logits', out)
def unwrap_collate_fn(batch):
    # Batch the images into a NestedTensor, then unwrap it into a plain dict
    # of tensors for consumers that cannot handle the NestedTensor type.
    batch = list(zip(*batch))
    batch[0] = nested_tensor_from_tensor_list(batch[0])
    batch[0] = {"tensors": batch[0].tensors, "mask": batch[0].mask}
    return tuple(batch)
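One possible way to wire this collate function into a `DataLoader`; the dataset (assumed to yield `(image, target)` pairs) and batch size are placeholders:

```python
from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=2, collate_fn=unwrap_collate_fn)
for batch, targets in loader:
    # The NestedTensor has been unwrapped into a plain dict of tensors.
    print(batch["tensors"].shape, batch["mask"].shape)
    break
```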
def forward(self, samples: NestedTensor): """ The forward expects a NestedTensor, which consists of: - samples.tensor: batched images, of shape [batch_size x 3 x H x W] - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels It returns a dict with the following elements: - "pred_logits": the classification logits (including no-object) for all queries. Shape= [batch_size x num_queries x (num_classes + 1)] - "pred_boxes": The normalized boxes coordinates for all queries, represented as (center_x, center_y, height, width). These values are normalized in [0, 1], relative to the size of each individual image (disregarding possible padding). See PostProcess for information on how to retrieve the unnormalized bounding box. - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of dictionnaries containing the two above keys for each decoder layer. """ # Global weight norm. #params = [p for n, p in self.named_parameters() if ("backbone" not in n and "global_scale" not in n) and p.requires_grad] #params = [p for p in self.parameters()] #contiguous = torch.cat([p.view(-1) for p in params]) #mean = contiguous.mean() #std = contiguous.std() # Initial std: 0.0392 #print("\n\nstd", std, "\n\n") #scale_parameter_member(self, 0.022 / std, mean) #scale_parameter_member(self, 0.022 / std) #scale_parameter_member(self, self.global_scale / std) if not self.scaled: self.scaled = True ignore = [ "backbone", "global_scale", "bbox_embed.layers.2", "query_embed.weight", "class_embed." ] named_params = [(n, p) for n, p in self.named_parameters() if all(i not in n for i in ignore) and p.requires_grad] names, params = zip(*named_params) contiguous = torch.cat([p.view(-1) for p in params]) std = contiguous.std() with torch.no_grad(): for param in params: param.data.mul_(0.022 / std) # scaled_params = multitensor_scaling((0.022 / std), *params) # # param_dict = dict(named_params) # # scaled_params = [] # # for (n, p) in named_params: # # # Replace bias with weight for fan in computation. 
# # fanin_name = n # # if n.endswith("bias"): # # scaled_params.append(p) # # fanin_name = n[:-4] + "weight" # # fanin_shape = param_dict[fanin_name].shape # # fanin = fanin_shape.numel() / fanin_shape[0] # # scale = sqrt(1/fanin) / std # # print(fanin, float(std), sqrt(1/fanin), float(scale), float((p*scale).std())) # # scaled_params.append(p * scale) # #scaled_params = multitensor_scaling((self.global_scale.squeeze() / std), *params) # scaled_param_dict = dict(zip(names, scaled_params)) # # a_names, attention_params = zip(*[(n, p) for n, p in named_params if "in_proj" in n]) # # rest_names, rest_params = zip(*[(n, p) for n, p in named_params if "in_proj" not in n]) # # #a_scaled_params = multitensor_scaling(sqrt(0.022) / std.sqrt(), *attention_params) # # a_scaled_params = multitensor_scaling((0.022 / std).sqrt(), *attention_params) # # rest_scaled_params = multitensor_scaling(0.022 / std, *rest_params) # # scaled_param_dict = dict(zip(a_names, a_scaled_params)) # # scaled_param_dict.update(dict(zip(rest_names, rest_scaled_params))) # assign_scaled_parameter(self, scaled_param_dict) if isinstance(samples, (list, torch.Tensor)): samples = nested_tensor_from_tensor_list(samples) features, pos = self.backbone(samples) if self.high_def: # Use the middle feature map as reference reference = 1 pos = pos[reference] src, mask = features[reference].decompose() src = self.projections[reference](src) for i, (projection, fmap) in enumerate(zip(self.projections, features)): if i == reference: continue fmap = projection(fmap.tensors) src += torch.nn.functional.interpolate(fmap, src.shape[2:4], mode="bilinear", align_corners=True) else: pos = pos[-1] src, mask = features[-1].decompose() assert mask is not None src = self.input_proj(src) if self.bootstrap_steps and self.training: stride = sum([self.epoch < e for e in self.bootstrap_steps]) + 1 self.stride = stride if stride > 1: x_i, y_i = random.choices(range(stride), k=2) src = src[:, :, y_i::stride, x_i::stride] mask = mask[:, y_i::stride, x_i::stride] pos = pos[:, :, y_i::stride, x_i::stride] hs = self.transformer(src, mask, self.query_embed.weight, pos)[0] #hs = hs[:, :, :self.num_queries] outputs_class = self.class_embed(hs) #outputs_coord = self.bbox_embed(hs).sigmoid() outputs_coord = self.bbox_embed(hs) outputs_coord[..., 2:] = torch.exp(outputs_coord[..., 2:]) out = { 'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1] } if self.aux_loss: out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord) #restore_params(self) return out
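Note the departure from upstream DETR in the box head above: instead of squashing all four coordinates with a sigmoid, this variant leaves the centers unconstrained and maps width/height through `exp`. A minimal illustration of the two parameterizations (the values are arbitrary):

```python
import torch

raw = torch.randn(2, 4)  # (center_x, center_y, w, h) regression outputs

# Upstream DETR: every coordinate squashed into [0, 1].
boxes_sigmoid = raw.sigmoid()

# Variant above: raw centers, positive sizes via log-space regression.
boxes_exp = raw.clone()
boxes_exp[..., 2:] = boxes_exp[..., 2:].exp()
```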
def forward(self, inputs: List[Tensor]):
    sample = nested_tensor_from_tensor_list(inputs)
    return self.model(sample)