def support_encoding_net(self, x, return_interm_layers=False):
    out: Dict[str, NestedTensor] = {}
    m = x.mask
    # x = self.meta_conv(x.tensors)
    x = self.backbone.conv1(x.tensors)
    x = self.backbone.bn1(x)
    x = self.backbone.relu(x)
    x = self.backbone.maxpool(x)
    x = self.backbone.layer1(x)
    x = self.backbone.layer2(x)
    if return_interm_layers:
        mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
        out['0'] = NestedTensor(x, mask)
    x = self.backbone.layer3(x)
    if return_interm_layers:
        mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
        out['1'] = NestedTensor(x, mask)
    x = self.backbone.layer4(x)
    if return_interm_layers:
        mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
        out['2'] = NestedTensor(x, mask)
    if return_interm_layers:
        return out
    else:
        mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
        out['0'] = NestedTensor(x, mask)
        return out
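# All of the snippets in this section lean on the DETR-style NestedTensor container.
# For reference, a minimal sketch consistent with DETR's util.misc (not part of the
# original snippets; the real class also carries a .to(device) helper):
import torch

class NestedTensor:
    """Pairs a padded batch tensor with a boolean mask that is True on padded pixels."""

    def __init__(self, tensors: torch.Tensor, mask: torch.Tensor):
        self.tensors = tensors
        self.mask = mask

    def decompose(self):
        return self.tensors, self.mask

    def __repr__(self):
        return str(self.tensors)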
def forward(self, tensor_list: NestedTensor):
    xs = self.body(tensor_list.tensors)
    out: Dict[str, NestedTensor] = {}
    m = tensor_list.mask
    assert m is not None
    for name, x in xs.items():
        # mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
        if name == '0':
            scale_map = self.c3_conv(x)
        elif name == '1':
            scale_map = self.c4_conv(x)
        else:
            scale_map = self.c5_conv(x)
        mask = F.interpolate(m[None].float(), size=scale_map.shape[-2:]).to(torch.bool)[0]
        out[name] = NestedTensor(scale_map, mask)
    c6 = self.c6_conv(xs['2'])
    mask = F.interpolate(m[None].float(), size=c6.shape[-2:]).to(torch.bool)[0]
    out['3'] = NestedTensor(c6, mask)
    return out
def forward(self, tensor_list):
    xs = self.body(tensor_list.tensors)
    out = OrderedDict()
    for name, x in xs.items():
        mask = F.interpolate(tensor_list.mask[None].float(), size=x.shape[-2:]).bool()[0]
        out[name] = NestedTensor(x, mask)
    return out
def forward(self, samples: NestedTensor):
    """ The forward expects a NestedTensor, which consists of:
           - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
           - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

        It returns a dict with the following elements:
           - "pred_logits": the classification logits (including no-object) for all queries.
                            Shape = [batch_size x num_queries x (num_classes + 1)]
           - "pred_boxes": the normalized box coordinates for all queries, represented as
                           (center_x, center_y, width, height). These values are normalized in [0, 1],
                           relative to the size of each individual image (disregarding possible padding).
                           See PostProcess for information on how to retrieve the unnormalized bounding box.
           - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                            dictionaries containing the two above keys for each decoder layer.
    """
    if not isinstance(samples, NestedTensor):
        samples = NestedTensor.from_tensor_list(samples)
    features, pos = self.backbone(samples)

    src, mask = features[-1].decompose()
    hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0]

    outputs_class = self.class_embed(hs)
    outputs_coord = self.bbox_embed(hs).sigmoid()
    out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
    if self.aux_loss:
        out['aux_outputs'] = [{'pred_logits': a, 'pred_boxes': b}
                              for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
    return out
def forward(self, tensor_list: NestedTensor):
    xs = self.body(tensor_list.tensors)
    out: Dict[str, NestedTensor] = {}
    for name, x in xs.items():
        if 'layer' + name not in self.return_layers:
            continue
        m = tensor_list.mask
        assert m is not None
        mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
        # Workaround: a mask that is True everywhere (a fully padded sample) leaves
        # the attention with no valid keys and produces NaNs, so reset such masks.
        invalid_indices = (torch.logical_not(mask).sum(dim=[1, 2]) == 0).nonzero().squeeze(-1)
        if len(invalid_indices):
            mask[invalid_indices] = torch.zeros(x.shape[-2:], dtype=torch.bool, device=mask.device)
        out[name] = NestedTensor(x, mask)
    return out, xs
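# Why the workaround above is needed: if a sample's mask is True everywhere (every
# pixel padded), attention has no valid keys, and softmax over a row of -inf logits
# yields NaN. A two-line illustration of the failure mode:
import torch
import torch.nn.functional as F

scores = torch.full((1, 4), float('-inf'))  # a fully masked attention row
print(F.softmax(scores, dim=-1))            # tensor([[nan, nan, nan, nan]])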
def main():
    embedding = PositionEmbeddingSine(num_pos_feats=128)
    images = torch.rand((1, 3, 64, 64)).type(torch.FloatTensor)
    # The padding mask should be boolean, True on padded pixels; an all-False mask
    # means nothing is padded. (The original rand-then-LongTensor cast produced an
    # all-zero mask with the wrong dtype.)
    masks = torch.zeros((1, 64, 64), dtype=torch.bool)
    inputs = NestedTensor(images, masks)
    pos = embedding(inputs)
    print(pos.size())
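# With num_pos_feats=128, PositionEmbeddingSine produces 128 sine/cosine channels per
# spatial axis and concatenates them, so the printed size should be
# torch.Size([1, 256, 64, 64]).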
def loss_masks(self, outputs, targets, indices, num_boxes):
    """Compute the losses related to the masks: the focal loss and the dice loss.
       targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
    """
    assert "pred_masks" in outputs

    src_idx = self._get_src_permutation_idx(indices)
    tgt_idx = self._get_tgt_permutation_idx(indices)

    src_masks = outputs["pred_masks"]

    # TODO use valid to mask invalid areas due to padding in loss
    target_masks, valid = NestedTensor.from_tensor_list([t["masks"] for t in targets]).decompose()
    target_masks = target_masks.to(src_masks)

    src_masks = src_masks[src_idx]
    # upsample predictions to the target size
    src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:],
                            mode="bilinear", align_corners=False)
    src_masks = src_masks[:, 0].flatten(1)

    target_masks = target_masks[tgt_idx].flatten(1)

    losses = {
        "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes),
        "loss_dice": dice_loss(src_masks, target_masks, num_boxes),
    }
    return losses
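# For context, sketches of the two loss terms called above, consistent with the
# reference DETR segmentation code (not part of the original snippets; signatures
# assumed to match the calls in loss_masks):
import torch
import torch.nn.functional as F

def dice_loss(inputs, targets, num_boxes):
    """Soft dice loss over flattened per-query masks; inputs are raw logits."""
    inputs = inputs.sigmoid().flatten(1)
    numerator = 2 * (inputs * targets).sum(1)
    denominator = inputs.sum(-1) + targets.sum(-1)
    loss = 1 - (numerator + 1) / (denominator + 1)
    return loss.sum() / num_boxes

def sigmoid_focal_loss(inputs, targets, num_boxes, alpha=0.25, gamma=2):
    """Focal loss on raw logits, averaged per mask and normalized by the box count."""
    prob = inputs.sigmoid()
    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    p_t = prob * targets + (1 - prob) * (1 - targets)
    loss = ce_loss * ((1 - p_t) ** gamma)
    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss
    return loss.mean(1).sum() / num_boxes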
def forward(self, tensor_list: NestedTensor):
    xs = self.body(tensor_list.tensors)
    out: Dict[str, NestedTensor] = {}
    for name, x in xs.items():
        m = tensor_list.mask
        assert m is not None
        mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
        out[name] = NestedTensor(x, mask)
    return out
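# The forward above is the canonical pattern shared by most variants in this section:
# run the padded batch through the backbone, then nearest-neighbor-resize the padding
# mask to each feature map. A self-contained demonstration of just the mask step:
import torch
import torch.nn.functional as F

mask = torch.zeros(2, 64, 64, dtype=torch.bool)  # batch of 2; True marks padding
mask[0, :, 48:] = True                           # image 0 is padded on the right quarter
feat = torch.randn(2, 256, 8, 8)                 # a stride-8 feature map

# F.interpolate needs a float NCHW tensor, hence the [None] / .float() / [0] dance.
small = F.interpolate(mask[None].float(), size=feat.shape[-2:]).to(torch.bool)[0]
print(small.shape)             # torch.Size([2, 8, 8])
print(small[0].sum().item())   # 16: the right quarter of the 8x8 grid stays masked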
def _forward_single_image(self, samples, track_instances: Instances):
    features, pos = self.backbone(samples)
    src, mask = features[-1].decompose()
    assert mask is not None

    srcs = []
    masks = []
    for l, feat in enumerate(features):
        src, mask = feat.decompose()
        srcs.append(self.input_proj[l](src))
        masks.append(mask)
        assert mask is not None

    if self.num_feature_levels > len(srcs):
        _len_srcs = len(srcs)
        for l in range(_len_srcs, self.num_feature_levels):
            if l == _len_srcs:
                src = self.input_proj[l](features[-1].tensors)
            else:
                src = self.input_proj[l](srcs[-1])
            m = samples.mask
            mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
            pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
            srcs.append(src)
            masks.append(mask)
            pos.append(pos_l)

    hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
        srcs, masks, pos, track_instances.query_pos, ref_pts=track_instances.ref_pts)

    outputs_classes = []
    outputs_coords = []
    for lvl in range(hs.shape[0]):
        if lvl == 0:
            reference = init_reference
        else:
            reference = inter_references[lvl - 1]
        reference = inverse_sigmoid(reference)
        outputs_class = self.class_embed[lvl](hs[lvl])
        tmp = self.bbox_embed[lvl](hs[lvl])
        if reference.shape[-1] == 4:
            tmp += reference
        else:
            assert reference.shape[-1] == 2
            tmp[..., :2] += reference
        outputs_coord = tmp.sigmoid()
        outputs_classes.append(outputs_class)
        outputs_coords.append(outputs_coord)
    outputs_class = torch.stack(outputs_classes)
    outputs_coord = torch.stack(outputs_coords)

    ref_pts_all = torch.cat([init_reference[None], inter_references[:, :, :, :2]], dim=0)
    # Index 5 picks the reference points feeding the last layer of a 6-layer decoder.
    out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1], 'ref_pts': ref_pts_all[5]}
    if self.aux_loss:
        out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)
    out['hs'] = hs[-1]
    return out
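# The decoder heads above repeatedly map reference points through inverse_sigmoid.
# A sketch consistent with the Deformable DETR utility (not part of the original
# snippets):
import torch

def inverse_sigmoid(x, eps=1e-5):
    """Numerically clamped logit, i.e. the inverse of sigmoid on [0, 1]."""
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    return torch.log(x1 / x2)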
def forward(self, images):
    features = self.backbone(images.tensor)
    masks = self.mask_out_padding(
        [features_per_level.shape for features_per_level in features.values()],
        images.image_sizes,
        images.tensor.device,
    )
    assert len(features) == len(masks)
    for i, k in enumerate(features.keys()):
        features[k] = NestedTensor(features[k], masks[i])
    return features
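# mask_out_padding is not shown in this section. A plausible sketch matching the
# detectron2-style DETR wrapper, assuming the module stores per-level
# self.feature_strides (an assumption, not confirmed by the snippet above):
import math
import torch

def mask_out_padding(self, feature_shapes, image_sizes, device):
    """Build a True-on-padding mask per feature level from the real image sizes."""
    masks = []
    assert len(feature_shapes) == len(self.feature_strides)
    for idx, shape in enumerate(feature_shapes):
        N, _, H, W = shape
        level_mask = torch.ones((N, H, W), dtype=torch.bool, device=device)
        for img_idx, (h, w) in enumerate(image_sizes):
            # Unmask the region actually covered by this image at this stride.
            level_mask[img_idx,
                       :int(math.ceil(h / self.feature_strides[idx])),
                       :int(math.ceil(w / self.feature_strides[idx]))] = False
        masks.append(level_mask)
    return masks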
def forward(self, tensor_list):
    xs = self.body(tensor_list.tensors)
    if not self.interm:
        xs = [xs[self.main_layer]]
    else:
        # self.body returns a dict of intermediate layers; iterate over its tensors
        # (enumerating the dict itself would yield key strings, not feature maps).
        xs = list(xs.values())
    out = OrderedDict()
    for i, x in enumerate(xs):
        mask = F.interpolate(tensor_list.mask[None].float(), size=x.shape[-2:]).bool()[0]
        out[f"layer{i}"] = NestedTensor(x, mask)
    return out
def forward(self, tensor_list: NestedTensor):
    # Get the image features from the requested intermediate layers.
    xs = self.body(tensor_list.tensors)
    out: Dict[str, NestedTensor] = {}
    for name, x in xs.items():  # for each intermediate layer tensor
        m = tensor_list.mask
        assert m is not None
        # Scale the padding mask to the size of this intermediate layer tensor.
        mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
        out[name] = NestedTensor(x, mask)
    return out
def forward(self, tensor_list):
    xs = self.body(tensor_list.tensors)
    out = OrderedDict()
    for name, x in xs.items():
        mask = F.interpolate(tensor_list.mask[None].float(), size=x.shape[-2:]).bool()[0]
        out[name] = NestedTensor(x, mask)
    return out
def forward(self, samples: NestedTensor, pre_embed=None):
    """ The forward expects a NestedTensor, which consists of:
           - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
           - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

        It returns a dict with the following elements:
           - "pred_logits": the classification logits (including no-object) for all queries.
                            Shape = [batch_size x num_queries x (num_classes + 1)]
           - "pred_boxes": the normalized box coordinates for all queries, represented as
                           (center_x, center_y, width, height), normalized in [0, 1] relative to the size
                           of each individual image (disregarding possible padding). See PostProcess for
                           information on how to retrieve the unnormalized bounding box.
           - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                            dictionaries containing the two above keys for each decoder layer.
    """
    if isinstance(samples, (list, torch.Tensor)):
        samples = nested_tensor_from_tensor_list(samples)

    # detection only: when tracking is on but no embedding was carried over,
    # detect on the previous frame's channels.
    if self.track_on and pre_embed is None:
        samples.tensor = samples.tensor[:, 3:, :, :]

    # backbone features.
    features, pos = self.backbone(samples)
    src, mask = features[self.index_feedforward].decompose()
    assert mask is not None

    # embedding features.
    hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight,
                          pos[self.index_feedforward], tgt=pre_embed)[0]

    # individual branches.
    outputs_class = self.class_embed(hs)
    outputs_coord = self.bbox_embed(hs).sigmoid()
    out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}

    outputs_track = None
    if self.track_on:
        outputs_track = self.track_embed(hs)
        out['pred_tracks'] = outputs_track[-1]

    if self.aux_loss:
        out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord, outputs_track)

    # pre embed for the next frame.
    pre_embed = hs[-1].detach()
    return out, pre_embed
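# Several forwards here call nested_tensor_from_tensor_list to pad a list of
# differently sized images into one batch. A sketch consistent with the DETR helper
# (the ONNX-export branch of the original is omitted):
import torch

def nested_tensor_from_tensor_list(tensor_list):
    """Pad CxHxW images to a common size and record the padding in a bool mask."""
    max_size = [max(s) for s in zip(*[img.shape for img in tensor_list])]
    batch_shape = [len(tensor_list)] + max_size
    b, c, h, w = batch_shape
    tensor = torch.zeros(batch_shape, dtype=tensor_list[0].dtype,
                         device=tensor_list[0].device)
    mask = torch.ones((b, h, w), dtype=torch.bool, device=tensor.device)
    for img, pad_img, m in zip(tensor_list, tensor, mask):
        pad_img[:img.shape[0], :img.shape[1], :img.shape[2]].copy_(img)
        m[:img.shape[1], :img.shape[2]] = False
    return NestedTensor(tensor, mask)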
def forward(self, tensor_list: NestedTensor):
    # NOTE: tensor_list.tensors is the merged (padded) batch tensor.
    xs = self.backbone.forward_features(tensor_list.tensors)
    out: Dict[str, NestedTensor] = {}
    for name, x in xs.items():
        m = tensor_list.mask
        assert m is not None
        mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
        out[name] = NestedTensor(x, mask)
    # A dict of NestedTensors holding the features and their interpolated masks.
    return out
def forward(self, tensor_list: NestedTensor):
    xs = self.body(tensor_list.tensors)  # e.g. torch.Size([2, 256, 38, 60])
    xs = {'0': xs}
    out: Dict[str, NestedTensor] = {}
    for name, x in xs.items():
        m = tensor_list.mask
        assert m is not None
        mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
        out[name] = NestedTensor(x, mask)  # e.g. torch.Size([2, 256, 38, 60])
    return out
def forward(self, tensor_list):
    """Supports both NestedTensor and torch.Tensor inputs."""
    if isinstance(tensor_list, NestedTensor):
        xs = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for name, x in xs.items():
            m = tensor_list.mask
            assert m is not None
            mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
            out[name] = NestedTensor(x, mask)
    else:
        out = self.body(tensor_list)
    return out
def forward(self, tensor_list: NestedTensor):
    # input: torch.Size([2, 3, 604, 960]) -> xs['0']: torch.Size([2, 2048, 19, 30]) (OrderedDict)
    xs = self.body(tensor_list.tensors)
    out: Dict[str, NestedTensor] = {}
    for name, x in xs.items():
        m = tensor_list.mask
        assert m is not None
        mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
        # x: torch.Size([2, 2048, 19, 30]), mask: torch.Size([2, 19, 30])
        out[name] = NestedTensor(x, mask)
    return out
def forward(self, tensor_list: torch.Tensor):
    # Flatten a clip of shape (B, T, C, H, W) into a batch of frames (B*T, C, H, W).
    self.batch_size = tensor_list.shape[0]
    self.sequence_length = tensor_list.shape[1]
    tensor_list = tensor_list.reshape(
        (self.batch_size * self.sequence_length,
         tensor_list.shape[2], tensor_list.shape[3], tensor_list.shape[4]))
    tensor_list = nested_tensor_from_tensor_list(tensor_list)
    xs = self.body(tensor_list.tensors)
    out: Dict[str, NestedTensor] = {}
    for name, x in xs.items():
        m = tensor_list.mask
        assert m is not None
        mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
        out[name] = NestedTensor(x, mask)
    return out
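# A quick check of the clip-flattening step above: a (B, T, C, H, W) clip becomes a
# (B*T, C, H, W) batch of frames before entering the 2D backbone.
import torch

clip = torch.randn(2, 4, 3, 224, 224)   # B=2 sequences of T=4 frames
frames = clip.reshape(-1, *clip.shape[2:])
print(frames.shape)                     # torch.Size([8, 3, 224, 224])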
def forward(self, tensor_list: NestedTensor):
    # PaddlePaddle variant; L is presumably paddle.fluid.layers.
    xs = self.body(tensor_list.tensors)
    out: Dict[str, NestedTensor] = {}
    for name, x in xs.items():
        m = tensor_list.mask
        assert m is not None
        m = L.unsqueeze(m, 1)  # [batch_size, h, w] -> [batch_size, 1, h, w]
        m = m.astype("float32")
        mask = L.image_resize(m, out_shape=x.shape[-2:], resample="NEAREST")
        mask = mask.astype("bool")
        mask = L.squeeze(mask, [1])  # [batch_size, 1, h, w] -> [batch_size, h, w]
        out[name] = NestedTensor(x, mask)
    return out
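# Note: the Paddle resample="NEAREST" matches the PyTorch snippets above, since
# F.interpolate also defaults to nearest-neighbor when no mode is given; the
# unsqueeze/squeeze pair plays the role of the m[None] ... [0] indexing.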
def forward(self, tensor_list: NestedTensor):
    x = tensor_list.tensors
    x = self.body.patch_embed(x)
    x = self.body.pos_drop(x)
    for module in self.body.blocks:
        x = module(x)
    x = self.body.norm(x)
    # NOTE: hard-coded for one input resolution; the token sequence is reshaped
    # back into a (2048, 18, 12) feature map.
    x = torch.reshape(x, (-1, 2048, 18, 12))
    x = self.body.pre_logits(x)

    m = tensor_list.mask
    mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
    out: Dict[str, NestedTensor] = {}
    out['0'] = NestedTensor(x, mask)
    return out
def forward(self, samples: NestedTensor):
    if not isinstance(samples, NestedTensor):
        samples = NestedTensor.from_tensor_list(samples)
    features, pos = self.detr.backbone(samples)

    bs = features[-1].tensors.shape[0]

    src, mask = features[-1].decompose()
    src_proj = self.detr.input_proj(src)
    hs, memory = self.detr.transformer(src_proj, mask, self.detr.query_embed.weight, pos[-1])

    outputs_class = self.detr.class_embed(hs)
    outputs_coord = self.detr.bbox_embed(hs).sigmoid()
    out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]}
    if self.detr.aux_loss:
        out["aux_outputs"] = [{"pred_logits": a, "pred_boxes": b}
                              for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]

    # FIXME h_boxes takes the last one computed, keep this in mind
    bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask)

    seg_masks = self.mask_head(src_proj, bbox_mask,
                               [features[2].tensors, features[1].tensors, features[0].tensors])
    outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries,
                                       seg_masks.shape[-2], seg_masks.shape[-1])

    out["pred_masks"] = outputs_seg_masks
    return out
def forward(self, images):
    cur_images = images.tensor
    cur_dim = cur_images.shape[1]
    if cur_dim == 3:
        # Single frame: reuse it as the "previous" frame.
        pre_images = cur_images.clone()
    elif cur_dim == 6:
        # Two stacked frames: channels [3:] hold the previous frame.
        pre_images = cur_images[:, 3:, :, :]
        cur_images = cur_images[:, :3, :, :]
    else:
        raise NotImplementedError
    features = self.backbone(cur_images, pre_images)
    masks = self.mask_out_padding(
        [features_per_level.shape for features_per_level in features.values()],
        images.image_sizes,
        images.tensor.device,
    )
    assert len(features) == len(masks)
    for i, k in enumerate(features.keys()):
        features[k] = NestedTensor(features[k], masks[i])
    return features
def forward(self, samples: NestedTensor):
    """ The forward expects a NestedTensor, which consists of:
           - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
           - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

        It returns a dict with the following elements:
           - "pred_logits": the classification logits (including no-object) for all queries.
                            Shape = [batch_size x num_queries x (num_classes + 1)]
           - "pred_boxes": the normalized box coordinates for all queries, represented as
                           (center_x, center_y, width, height), normalized in [0, 1] relative to the size
                           of each individual image (disregarding possible padding). See PostProcess for
                           information on how to retrieve the unnormalized bounding box.
           - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                            dictionaries containing the two above keys for each decoder layer.
    """
    if not isinstance(samples, NestedTensor):
        samples = nested_tensor_from_tensor_list(samples)
    features, pos = self.backbone(samples)

    srcs = []
    masks = []
    for l, feat in enumerate(features):
        src, mask = feat.decompose()
        srcs.append(self.input_proj[l](src))
        masks.append(mask)
        assert mask is not None

    if self.num_feature_levels > len(srcs):
        _len_srcs = len(srcs)
        for l in range(_len_srcs, self.num_feature_levels):
            if l == _len_srcs:
                src = self.input_proj[l](features[-1].tensors)
            else:
                src = self.input_proj[l](srcs[-1])
            m = samples.mask
            mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
            pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
            srcs.append(src)
            masks.append(mask)
            pos.append(pos_l)

    query_embeds = None
    if not self.two_stage:
        query_embeds = self.query_embed.weight

    valid_ratio = None
    if self.accurate_ratio:
        valid_ratio = self._get_valid_ratio(samples.mask)

    hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
        srcs, masks, pos, query_embeds, valid_ratio=valid_ratio)

    outputs_classes = []
    outputs_coords = []
    for lvl in range(hs.shape[0]):
        if lvl == 0:
            reference = init_reference
        else:
            reference = inter_references[lvl - 1]
        reference = inverse_sigmoid(reference)
        outputs_class = self.class_embed[lvl](hs[lvl])
        tmp = self.bbox_embed[lvl](hs[lvl])
        if reference.shape[-1] == 4:
            tmp += reference
        else:
            assert reference.shape[-1] == 2
            tmp[..., :2] += reference
        outputs_coord = tmp.sigmoid()
        outputs_classes.append(outputs_class)
        outputs_coords.append(outputs_coord)
    outputs_class = torch.stack(outputs_classes)
    outputs_coord = torch.stack(outputs_coords)

    if not self.two_stage:
        ref_pts = torch.cat([init_reference[None], inter_references[:, :, :, :2]])
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1],
               'ref_pts': ref_pts, 'logits_all': outputs_class, 'boxes_all': outputs_coord}
    else:
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}

    if self.aux_loss:
        out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)

    if self.two_stage:
        enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
        out['enc_outputs'] = {'pred_logits': enc_outputs_class, 'pred_boxes': enc_outputs_coord}
    return out
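# _set_aux_loss packages the per-decoder-layer heads for the auxiliary losses.
# A sketch consistent with the Deformable DETR implementation (not part of the
# original snippets):
import torch

@torch.jit.unused
def _set_aux_loss(self, outputs_class, outputs_coord):
    # Keep every decoder layer except the last, which already forms the main output.
    return [{'pred_logits': a, 'pred_boxes': b}
            for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]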
def forward(self, samples: NestedTensor):
    """ The forward expects a NestedTensor, which consists of:
           - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
           - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

        It returns a dict with the following elements:
           - "pred_logits": the classification logits (including no-object) for all queries.
                            Shape = [batch_size x num_queries x (num_classes + 1)]
           - "pred_boxes": the normalized box coordinates for all queries, represented as
                           (center_x, center_y, width, height), normalized in [0, 1] relative to the size
                           of each individual image (disregarding possible padding). See PostProcess for
                           information on how to retrieve the unnormalized bounding box.
           - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                            dictionaries containing the two above keys for each decoder layer.

        The backbone has two components: index 0 is the forward (CNN) part, index 1 the
        positional encoding.
    """
    if not isinstance(samples, NestedTensor):
        samples = nested_tensor_from_tensor_list(samples)
    features, pos = self.backbone(samples)
    print('sample shape:', samples.tensors.shape)
    print('feature:', features[0].tensors.shape)
    print('features length:', len(features))
    print('pos length:', len(pos))

    srcs = []
    masks = []
    for l, feat in enumerate(features):
        src, mask = feat.decompose()
        srcs.append(self.input_proj[l](src))
        masks.append(mask)
        assert mask is not None
        print(src.shape)
    print(self.backbone[1])

    if self.num_feature_levels > len(srcs):
        _len_srcs = len(srcs)
        for l in range(_len_srcs, self.num_feature_levels):
            if l == _len_srcs:
                src = self.input_proj[l](features[-1].tensors)
            else:
                src = self.input_proj[l](srcs[-1])
            m = samples.mask
            mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
            pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
            srcs.append(src)
            masks.append(mask)
            pos.append(pos_l)
    print('pos len:', len(pos))
    print('pos shape:', pos[0].shape)

    # Up to this point, the batch of input images (batch size 2 in the walkthrough)
    # has been passed through the backbone, whose two components are the CNN body
    # and the positional encoding.
    query_embeds = None
    if not self.two_stage:
        query_embeds = self.query_embed.weight
    # This call runs the deformable transformer itself and returns its outputs.
    hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
        srcs, masks, pos, query_embeds)
    print('hs shape:', hs.shape)
    print('init_reference shape:', init_reference.shape)

    outputs_classes = []
    outputs_coords = []
    for lvl in range(hs.shape[0]):
        if lvl == 0:
            reference = init_reference
        else:
            reference = inter_references[lvl - 1]
        reference = inverse_sigmoid(reference)
        outputs_class = self.class_embed[lvl](hs[lvl])
        tmp = self.bbox_embed[lvl](hs[lvl])
        if reference.shape[-1] == 4:
            tmp += reference
        else:
            assert reference.shape[-1] == 2
            tmp[..., :2] += reference
        outputs_coord = tmp.sigmoid()
        outputs_classes.append(outputs_class)
        outputs_coords.append(outputs_coord)
    outputs_class = torch.stack(outputs_classes)
    outputs_coord = torch.stack(outputs_coords)

    out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
    if self.aux_loss:
        out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)

    if self.two_stage:
        enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
        out['enc_outputs'] = {'pred_logits': enc_outputs_class, 'pred_boxes': enc_outputs_coord}
    return out
def forward_once(self, samples: NestedTensor, train_samples: NestedTensor):
    if not isinstance(samples, NestedTensor):
        samples = nested_tensor_from_tensor_list(samples)
    features, pos = self.backbone(samples)

    if not isinstance(train_samples, NestedTensor):
        train_samples = nested_tensor_from_tensor_list(train_samples)
    pre_feat, _ = self.backbone(train_samples)

    srcs = []
    masks = []
    for l, (feat, feat2) in enumerate(zip(features, pre_feat)):
        src, mask = feat.decompose()
        src2, _ = feat2.decompose()
        # Fuse current-frame and previous-frame features at each level.
        srcs.append(self.combine(torch.cat([self.input_proj[l](src),
                                            self.input_proj[l](src2)], dim=1)))
        masks.append(mask)
        assert mask is not None

    if self.num_feature_levels > len(srcs):
        _len_srcs = len(srcs)
        for l in range(_len_srcs, self.num_feature_levels):
            if l == _len_srcs:
                src = self.combine(torch.cat([self.input_proj[l](features[-1].tensors),
                                              self.input_proj[l](pre_feat[-1].tensors)], dim=1))
            else:
                src = self.input_proj[l](srcs[-1])
            m = samples.mask
            mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
            pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
            srcs.append(src)
            masks.append(mask)
            pos.append(pos_l)

    query_embeds = None
    if not self.two_stage:
        query_embeds = self.query_embed.weight
    hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact, _ = self.transformer(
        srcs, masks, pos, query_embeds)

    outputs_classes = []
    outputs_coords = []
    for lvl in range(hs.shape[0]):
        if lvl == 0:
            reference = init_reference
        else:
            reference = inter_references[lvl - 1]
        reference = inverse_sigmoid(reference)
        outputs_class = self.class_embed[lvl](hs[lvl])
        tmp = self.bbox_embed[lvl](hs[lvl])
        if reference.shape[-1] == 4:
            tmp += reference
        else:
            assert reference.shape[-1] == 2
            tmp[..., :2] += reference
        outputs_coord = tmp.sigmoid()
        outputs_classes.append(outputs_class)
        outputs_coords.append(outputs_coord)
    outputs_class = torch.stack(outputs_classes)
    outputs_coord = torch.stack(outputs_coords)

    out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
    pre_embed = {'reference': outputs_coord[-1], 'tgt': hs[-1], 'feat': features}

    if self.aux_loss:
        out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)

    if self.two_stage:
        enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
        out['enc_outputs'] = {'pred_logits': enc_outputs_class, 'pred_boxes': enc_outputs_coord}
    return out, pre_embed
def _forward_single_image(self, samples, track_instances: Instances):
    features, pos = self.backbone(samples)
    src, mask = features[-1].decompose()
    assert mask is not None

    srcs = []
    masks = []
    for l, feat in enumerate(features):
        src, mask = feat.decompose()
        srcs.append(self.input_proj[l](src))
        masks.append(mask)
        assert mask is not None

    if self.num_feature_levels > len(srcs):
        _len_srcs = len(srcs)
        for l in range(_len_srcs, self.num_feature_levels):
            if l == _len_srcs:
                src = self.input_proj[l](features[-1].tensors)
            else:
                src = self.input_proj[l](srcs[-1])
            m = samples.mask
            mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
            pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
            srcs.append(src)
            masks.append(mask)
            pos.append(pos_l)

    hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
        srcs, masks, pos, track_instances.query_pos, ref_pts=track_instances.ref_pts)

    outputs_classes = []
    outputs_coords = []
    for lvl in range(hs.shape[0]):
        if lvl == 0:
            reference = init_reference
        else:
            reference = inter_references[lvl - 1]
        reference = inverse_sigmoid(reference)
        outputs_class = self.class_embed[lvl](hs[lvl])
        tmp = self.bbox_embed[lvl](hs[lvl])
        if reference.shape[-1] == 4:
            tmp += reference
        else:
            assert reference.shape[-1] == 2
            tmp[..., :2] += reference
        outputs_coord = tmp.sigmoid()
        outputs_classes.append(outputs_class)
        outputs_coords.append(outputs_coord)
    outputs_class = torch.stack(outputs_classes)
    outputs_coord = torch.stack(outputs_coords)

    ref_pts_all = torch.cat([init_reference[None], inter_references[:, :, :, :2]], dim=0)
    out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1], 'ref_pts': ref_pts_all[5]}
    if self.aux_loss:
        out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)

    with torch.no_grad():
        if self.training:
            track_scores = outputs_class[-1, 0, :].sigmoid().max(dim=-1).values
        else:
            track_scores = outputs_class[-1, 0, :, 0].sigmoid()

    track_instances.scores = track_scores
    track_instances.pred_logits = outputs_class[-1, 0]
    track_instances.pred_boxes = outputs_coord[-1, 0]
    track_instances.output_embedding = hs[-1, 0]
    if self.training:
        # the track id will be assigned by the matcher.
        out['track_instances'] = track_instances
        track_instances = self.criterion.match_for_single_frame(out)
    else:
        # each track will be assigned a unique global id by the track base.
        self.track_base.update(track_instances)
    if self.memory_bank is not None:
        track_instances = self.memory_bank(track_instances)
        # track_instances.track_scores = track_instances.track_scores[..., 0]
        # track_instances.scores = track_instances.track_scores.sigmoid()
        if self.training:
            self.criterion.calc_loss_for_track_scores(track_instances)
    tmp = {}
    tmp['init_track_instances'] = self._generate_empty_tracks()
    tmp['track_instances'] = track_instances
    out_track_instances = self.track_embed(tmp)
    out['track_instances'] = out_track_instances
    return out
    criterion.to(device)
    postprocessors = {'bbox': PostProcess()}
    if args.masks:
        postprocessors['segm'] = PostProcessSegm()
        if args.dataset_file == "coco_panoptic":
            is_thing_map = {i: i <= 90 for i in range(201)}
            postprocessors["panoptic"] = PostProcessPanoptic(is_thing_map, threshold=0.85)

    return model, criterion, postprocessors


if __name__ == '__main__':
    import argparse
    from util.misc import NestedTensor
    from main import get_args_parser

    parser = argparse.ArgumentParser('DETR training and evaluation script',
                                     parents=[get_args_parser()])
    args = parser.parse_args()

    tensor = torch.randn(4, 3, 384, 384)
    mask = (tensor > 0)[:, 0, :, :]
    nt = NestedTensor(tensor, mask)

    model, criterion, _ = build(args)
    y = model(nt)
    for k, v in y.items():
        if isinstance(v, torch.Tensor):
            print("%s: %s" % (k, str(v.shape)))
def forward(self, samples: NestedTensor, pre_embed=None):
    """ The forward expects a NestedTensor, which consists of:
           - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
           - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

        It returns a dict with the following elements:
           - "pred_logits": the classification logits (including no-object) for all queries.
                            Shape = [batch_size x num_queries x (num_classes + 1)]
           - "pred_boxes": the normalized box coordinates for all queries, represented as
                           (center_x, center_y, width, height), normalized in [0, 1] relative to the size
                           of each individual image (disregarding possible padding). See PostProcess for
                           information on how to retrieve the unnormalized bounding box.
           - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                            dictionaries containing the two above keys for each decoder layer.
    """
    assert not self.training, 'this forward is inference-only'
    assert samples.tensors.shape[0] == 1, 'tracking only supports batch size 1'
    if not isinstance(samples, NestedTensor):
        samples = nested_tensor_from_tensor_list(samples)
    features, pos = self.backbone(samples)

    if pre_embed is not None:
        pre_feat = pre_embed['feat']
    else:
        pre_feat = features

    srcs = []
    masks = []
    for l, (feat, feat2) in enumerate(zip(features, pre_feat)):
        src, mask = feat.decompose()
        src2, _ = feat2.decompose()
        srcs.append(self.combine(torch.cat([self.input_proj[l](src),
                                            self.input_proj[l](src2)], dim=1)))
        masks.append(mask)
        assert mask is not None

    if self.num_feature_levels > len(srcs):
        _len_srcs = len(srcs)
        for l in range(_len_srcs, self.num_feature_levels):
            if l == _len_srcs:
                src = self.combine(torch.cat([self.input_proj[l](features[-1].tensors),
                                              self.input_proj[l](pre_feat[-1].tensors)], dim=1))
            else:
                src = self.input_proj[l](srcs[-1])
            m = samples.mask
            mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
            pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
            srcs.append(src)
            masks.append(mask)
            pos.append(pos_l)

    # detection mode
    query_embeds = None
    if not self.two_stage:
        query_embeds = self.query_embed.weight
    hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact, memory = \
        self.transformer(srcs, masks, pos, query_embeds)
    cur_hs = hs

    outputs_classes = []
    outputs_coords = []
    for lvl in range(hs.shape[0]):
        if lvl == 0:
            reference = init_reference
        else:
            reference = inter_references[lvl - 1]
        reference = inverse_sigmoid(reference)
        outputs_class = self.class_embed[lvl](hs[lvl])
        tmp = self.bbox_embed[lvl](hs[lvl])
        if reference.shape[-1] == 4:
            tmp += reference
        else:
            assert reference.shape[-1] == 2
            tmp[..., :2] += reference
        outputs_coord = tmp.sigmoid()
        outputs_classes.append(outputs_class)
        outputs_coords.append(outputs_coord)
    outputs_class = torch.stack(outputs_classes)
    outputs_coord = torch.stack(outputs_coords)

    cur_class = outputs_class[-1]
    cur_box = outputs_coord[-1]
    cur_reference = cur_box
    cur_tgt = cur_hs[-1]

    if pre_embed is not None:
        # track mode
        pre_reference, pre_tgt = pre_embed['reference'], pre_embed['tgt']
        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact, _ = \
            self.transformer(srcs, masks, pos, query_embeds, pre_reference, pre_tgt, memory)

        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)
        pre_class, pre_box = outputs_class[-1], outputs_coord[-1]
    else:
        pre_class, pre_box = cur_class, cur_box

    out = {'pred_logits': cur_class, 'pred_boxes': cur_box,
           'tracking_logits': pre_class, 'tracking_boxes': pre_box}
    pre_embed = {'reference': cur_reference, 'tgt': cur_tgt, 'feat': features}

    if self.aux_loss:
        out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)

    if self.two_stage:
        enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
        out['enc_outputs'] = {'pred_logits': enc_outputs_class, 'pred_boxes': enc_outputs_coord}
    return out, pre_embed
def forward(self, samples: NestedTensor):
    """ The forward expects a NestedTensor, which consists of:
           - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
           - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

        It returns a dict with the following elements:
           - "pred_logits": the classification logits (including no-object) for all queries.
                            Shape = [batch_size x num_queries x (num_classes + 1)]
           - "pred_boxes": the normalized box coordinates for all queries, represented as
                           (center_x, center_y, width, height), normalized in [0, 1] relative to the size
                           of each individual image (disregarding possible padding). See PostProcess for
                           information on how to retrieve the unnormalized bounding box.
           - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                            dictionaries containing the two above keys for each decoder layer.
    """
    print('run forward')
    if not isinstance(samples, NestedTensor):
        samples = nested_tensor_from_tensor_list(samples)
    features, pos = self.backbone(samples)

    print('feature size', features[2].tensors.size())
    # Debug hook kept from the exploration script: per-channel feature maps of
    # features[2] can be dumped with save_image, e.g.
    # for i, feat in enumerate(features[2].tensors[0]):
    #     save_image(feat, 'features/feature_3_%d.png' % i)
    channel1 = features[2].tensors[0, :, :, :]

    srcs = []
    masks = []
    for l, feat in enumerate(features):
        src, mask = feat.decompose()
        srcs.append(self.input_proj[l](src))
        masks.append(mask)
        assert mask is not None

    if self.num_feature_levels > len(srcs):
        _len_srcs = len(srcs)
        for l in range(_len_srcs, self.num_feature_levels):
            if l == _len_srcs:
                src = self.input_proj[l](features[-1].tensors)
            else:
                src = self.input_proj[l](srcs[-1])
            m = samples.mask
            mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
            pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
            srcs.append(src)
            masks.append(mask)
            pos.append(pos_l)
    print(len(srcs))
    print(srcs[0].size())

    query_embeds = None
    if not self.two_stage:
        query_embeds = self.query_embed.weight
    print(query_embeds)
    hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
        srcs, masks, pos, query_embeds)
    print('self.class_embed', self.class_embed)

    outputs_classes = []
    outputs_coords = []
    for lvl in range(hs.shape[0]):
        if lvl == 0:
            reference = init_reference
        else:
            reference = inter_references[lvl - 1]
        reference = inverse_sigmoid(reference)
        outputs_class = self.class_embed[lvl](hs[lvl])
        tmp = self.bbox_embed[lvl](hs[lvl])
        if reference.shape[-1] == 4:
            tmp += reference
        else:
            assert reference.shape[-1] == 2
            tmp[..., :2] += reference
        outputs_coord = tmp.sigmoid()
        outputs_classes.append(outputs_class)
        outputs_coords.append(outputs_coord)
    outputs_class = torch.stack(outputs_classes)
    outputs_coord = torch.stack(outputs_coords)

    out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
    if self.aux_loss:
        out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)

    if self.two_stage:
        enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
        print('two stage enc_outputs_coord and enc_outputs_class',
              enc_outputs_coord.shape, enc_outputs_coord_unact[0][0][:])
        out['enc_outputs'] = {'pred_logits': enc_outputs_class, 'pred_boxes': enc_outputs_coord}
    return out