def decode_results(predictions):
    # Assumes torch and this repo's dboxes300_coco / Encoder utilities are in scope.
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    ploc, plabel = [val.float() for val in predictions]
    # NMS at IoU 0.5, keeping at most 20 detections per image
    results = encoder.decode_batch(ploc, plabel, criteria=0.5, max_output=20)
    return [[pred.detach().cpu().numpy() for pred in detections]
            for detections in results]
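For context, a minimal sketch of driving decode_results; the fabricated logits below only mimic SSD300's output shapes (8732 default boxes; 81 classes is this codebase's COCO convention and is an assumption here), purely to exercise the function:

import torch

# Random stand-ins for (ploc, plabel) from a trained model: [N, 4, 8732] box
# regressions and [N, num_classes, 8732] class logits for a batch of 2 images.
ploc = torch.randn(2, 4, 8732)
plabel = torch.randn(2, 81, 8732)
for boxes, labels, scores in decode_results((ploc, plabel)):
    print(boxes.shape, labels.shape, scores.shape)  # numpy arrays, one triple per image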
def test(opt):
    # Assumes torch, numpy as np, cv2, os, shutil and this repo's SSD, ResNet,
    # generate_dboxes, Encoder, SSDTransformer, CocoDataset and colors are imported.
    model = SSD(backbone=ResNet())
    checkpoint = torch.load(opt.pretrained_model)
    model.load_state_dict(checkpoint["model_state_dict"])
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    dboxes = generate_dboxes()
    test_set = CocoDataset(opt.data_path, 2017, "val",
                           SSDTransformer(dboxes, (300, 300), val=True))
    encoder = Encoder(dboxes)
    # Start from a clean output directory
    if os.path.isdir(opt.output):
        shutil.rmtree(opt.output)
    os.makedirs(opt.output)
    for img, img_id, img_size, _, _ in test_set:
        if img is None:
            continue
        if torch.cuda.is_available():
            img = img.cuda()
        with torch.no_grad():
            ploc, plabel = model(img.unsqueeze(dim=0))
            result = encoder.decode_batch(ploc, plabel, opt.nms_threshold, 20)[0]
            loc, label, prob = [r.cpu().numpy() for r in result]
            # Keep only detections above the confidence threshold
            best = np.argwhere(prob > opt.cls_threshold).squeeze(axis=1)
            loc = loc[best]
            label = label[best]
            prob = prob[best]
            if len(loc) > 0:
                path = test_set.coco.loadImgs(img_id)[0]["file_name"]
                output_img = cv2.imread(os.path.join(opt.data_path, "val2017", path))
                height, width, _ = output_img.shape
                # Boxes are normalized to [0, 1]; scale back to pixel coordinates
                loc[:, 0::2] *= width
                loc[:, 1::2] *= height
                loc = loc.astype(np.int32)
                for box, lb, pr in zip(loc, label, prob):
                    category = test_set.label_info[lb]
                    color = colors[lb]
                    xmin, ymin, xmax, ymax = box
                    cv2.rectangle(output_img, (xmin, ymin), (xmax, ymax), color, 2)
                    text_size = cv2.getTextSize(category + " : %.2f" % pr,
                                                cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
                    cv2.rectangle(output_img, (xmin, ymin),
                                  (xmin + text_size[0] + 3, ymin + text_size[1] + 4),
                                  color, -1)
                    cv2.putText(output_img, category + " : %.2f" % pr,
                                (xmin, ymin + text_size[1] + 4),
                                cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1)
                cv2.imwrite("{}/{}_prediction.jpg".format(opt.output, path[:-4]),
                            output_img)
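The opt object above is just an attribute bag. A hedged sketch of a matching argparse front end follows; the flag names mirror the attributes test reads, while the defaults are assumptions, not taken from the original script:

import argparse

def get_args():
    parser = argparse.ArgumentParser("SSD COCO evaluation (illustrative)")
    parser.add_argument("--data-path", dest="data_path", type=str, default="coco")
    parser.add_argument("--pretrained-model", dest="pretrained_model", type=str,
                        default="trained_models/SSD.pth")  # assumed checkpoint path
    parser.add_argument("--cls-threshold", dest="cls_threshold", type=float, default=0.3)
    parser.add_argument("--nms-threshold", dest="nms_threshold", type=float, default=0.5)
    parser.add_argument("--output", type=str, default="predictions")
    return parser.parse_args()

if __name__ == "__main__":
    test(get_args())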
def test(opt):
    # Assumes torch, numpy as np, cv2, PIL's Image and this repo's SSD, ResNet,
    # generate_dboxes, Encoder, SSDTransformer, coco_classes and colors are imported.
    model = SSD(backbone=ResNet())
    checkpoint = torch.load(opt.pretrained_model)
    model.load_state_dict(checkpoint["model_state_dict"])
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    dboxes = generate_dboxes()
    transformer = SSDTransformer(dboxes, (300, 300), val=True)
    img = Image.open(opt.input).convert("RGB")
    # Dummy boxes/labels: the transformer signature expects them even at inference
    img, _, _, _ = transformer(img, None, torch.zeros(1, 4), torch.zeros(1))
    encoder = Encoder(dboxes)
    if torch.cuda.is_available():
        img = img.cuda()
    with torch.no_grad():
        ploc, plabel = model(img.unsqueeze(dim=0))
        result = encoder.decode_batch(ploc, plabel, opt.nms_threshold, 20)[0]
        loc, label, prob = [r.cpu().numpy() for r in result]
        # Keep only detections above the confidence threshold
        best = np.argwhere(prob > opt.cls_threshold).squeeze(axis=1)
        loc = loc[best]
        label = label[best]
        prob = prob[best]
    output_img = cv2.imread(opt.input)
    if len(loc) > 0:
        height, width, _ = output_img.shape
        # Scale normalized boxes back to the original image resolution
        loc[:, 0::2] *= width
        loc[:, 1::2] *= height
        loc = loc.astype(np.int32)
        for box, lb, pr in zip(loc, label, prob):
            category = coco_classes[lb]
            color = colors[lb]
            xmin, ymin, xmax, ymax = box
            cv2.rectangle(output_img, (xmin, ymin), (xmax, ymax), color, 2)
            text_size = cv2.getTextSize(category + " : %.2f" % pr,
                                        cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
            cv2.rectangle(output_img, (xmin, ymin),
                          (xmin + text_size[0] + 3, ymin + text_size[1] + 4),
                          color, -1)
            cv2.putText(output_img, category + " : %.2f" % pr,
                        (xmin, ymin + text_size[1] + 4),
                        cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1)
    if opt.output is None:
        output = "{}_prediction.jpg".format(opt.input[:-4])
    else:
        output = opt.output
    cv2.imwrite(output, output_img)
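For quick experiments the same function can be driven without argparse. A minimal sketch; the checkpoint path, input file, and threshold values are all illustrative assumptions:

from types import SimpleNamespace

opt = SimpleNamespace(
    pretrained_model="trained_models/SSD.pth",  # assumed checkpoint location
    input="test_image.jpg",
    output=None,  # None -> writes test_image_prediction.jpg next to the input
    nms_threshold=0.5,
    cls_threshold=0.3,
)
test(opt)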
class SSD300(nn.Module):
    # Assumes torch, torch.nn as nn and this repo's dboxes300_coco, Loss and
    # Encoder utilities are imported.
    def __init__(self, backbone=None, num_classes=21):
        super(SSD300, self).__init__()
        if backbone is None:
            raise Exception("backbone is None")
        if not hasattr(backbone, "out_channels"):
            raise Exception("the backbone does not have attribute: out_channels")
        self.feature_extractor = backbone
        self.num_classes = num_classes
        # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50
        self._build_additional_features(self.feature_extractor.out_channels)
        self.num_defaults = [4, 6, 6, 6, 4, 4]
        location_extractors = []
        confidence_extractors = []
        # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50
        for nd, oc in zip(self.num_defaults, self.feature_extractor.out_channels):
            # nd is number_default_boxes, oc is output_channel
            location_extractors.append(
                nn.Conv2d(oc, nd * 4, kernel_size=3, padding=1))
            confidence_extractors.append(
                nn.Conv2d(oc, nd * self.num_classes, kernel_size=3, padding=1))
        self.loc = nn.ModuleList(location_extractors)
        self.conf = nn.ModuleList(confidence_extractors)
        self._init_weights()
        default_box = dboxes300_coco()
        self.compute_loss = Loss(default_box)
        self.encoder = Encoder(default_box)

    def _build_additional_features(self, input_size):
        """
        Append a series of extra convolutional layers to the backbone (resnet50),
        yielding the corresponding series of feature extractors.
        :param input_size: per-level channel counts of the feature maps
        :return:
        """
        additional_blocks = []
        # input_size = [1024, 512, 512, 256, 256, 256] for resnet50
        middle_channels = [256, 256, 128, 128, 128]
        for i, (input_ch, output_ch, middle_ch) in enumerate(
                zip(input_size[:-1], input_size[1:], middle_channels)):
            padding, stride = (1, 2) if i < 3 else (0, 1)
            layer = nn.Sequential(
                nn.Conv2d(input_ch, middle_ch, kernel_size=1, bias=False),
                nn.BatchNorm2d(middle_ch),
                nn.ReLU(inplace=True),
                nn.Conv2d(middle_ch, output_ch, kernel_size=3,
                          padding=padding, stride=stride, bias=False),
                nn.BatchNorm2d(output_ch),
                nn.ReLU(inplace=True),
            )
            additional_blocks.append(layer)
        self.additional_blocks = nn.ModuleList(additional_blocks)

    def _init_weights(self):
        layers = [*self.additional_blocks, *self.loc, *self.conf]
        for layer in layers:
            for param in layer.parameters():
                if param.dim() > 1:
                    nn.init.xavier_uniform_(param)

    # Shape the classifier to the view of bboxes
    def bbox_view(self, features, loc_extractor, conf_extractor):
        locs = []
        confs = []
        for f, l, c in zip(features, loc_extractor, conf_extractor):
            # [batch, n*4, feat_size, feat_size] -> [batch, 4, -1]
            locs.append(l(f).view(f.size(0), 4, -1))
            confs.append(c(f).view(f.size(0), self.num_classes, -1))
        locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous()
        return locs, confs

    def forward(self, image, targets=None):
        x = self.feature_extractor(image)

        # Feature maps: 38x38x1024, 19x19x512, 10x10x512, 5x5x256, 3x3x256, 1x1x256
        detection_features = [x]
        for layer in self.additional_blocks:
            x = layer(x)
            detection_features.append(x)

        # Default boxes per cell: 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4
        locs, confs = self.bbox_view(detection_features, self.loc, self.conf)

        # For SSD300 this yields nbatch x 8732 x {nlabels, nlocs} results:
        # 38x38x4 + 19x19x6 + 10x10x6 + 5x5x6 + 3x3x4 + 1x1x4 = 8732
        if self.training:
            # bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
            bboxes_out = targets['boxes']
            bboxes_out = bboxes_out.transpose(1, 2).contiguous()
            labels_out = targets['labels']
            # ploc, plabel, gloc, glabel
            loss = self.compute_loss(locs, confs, bboxes_out, labels_out)
            return {"total_losses": loss}

        # Apply the predicted regression offsets to the default boxes to get the
        # final boxes, then run non-maximum suppression to drop overlapping ones.
        results = self.encoder.decode_batch(locs, confs)
        return results
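To sanity-check the 8732-box geometry, here is a hedged shape-check sketch. DummyBackbone and the random 300x300 input are illustrative assumptions standing in for the resnet50 wrapper; the repo's dboxes300_coco, Loss, and Encoder must be importable for SSD300 to construct:

import torch
import torch.nn as nn

class DummyBackbone(nn.Module):
    # Stand-in for the resnet50 feature extractor: the only contract SSD300
    # relies on is the out_channels list and a 38x38x1024 map for 300x300 input.
    def __init__(self):
        super().__init__()
        self.out_channels = [1024, 512, 512, 256, 256, 256]
        # (300 + 2*3 - 7) // 8 + 1 = 38, so this single conv yields a 38x38 grid
        self.stem = nn.Conv2d(3, 1024, kernel_size=7, stride=8, padding=3)

    def forward(self, x):
        return self.stem(x)

model = SSD300(backbone=DummyBackbone(), num_classes=21).eval()
with torch.no_grad():
    detections = model(torch.rand(1, 3, 300, 300))  # targets defaults to None
boxes, labels, scores = detections[0]  # decoded, NMS-filtered output per image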
def test(opt):
    # Assumes torch, numpy as np, cv2, PIL's Image and this repo's SSD, ResNet,
    # generate_dboxes, Encoder, SSDTransformer, coco_classes and colors are imported.
    model = SSD(backbone=ResNet())
    checkpoint = torch.load(opt.pretrained_model)
    model.load_state_dict(checkpoint["model_state_dict"])
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    dboxes = generate_dboxes()
    transformer = SSDTransformer(dboxes, (300, 300), val=True)
    cap = cv2.VideoCapture(opt.input)
    if opt.output is None:
        output = "{}_prediction.mp4".format(opt.input[:-4])
    else:
        output = opt.output
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    out = cv2.VideoWriter(output, cv2.VideoWriter_fourcc(*"MJPG"),
                          int(cap.get(cv2.CAP_PROP_FPS)), (width, height))
    encoder = Encoder(dboxes)
    while cap.isOpened():
        flag, frame = cap.read()
        if not flag:
            break
        output_frame = np.copy(frame)  # keep the original BGR frame for drawing
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = Image.fromarray(frame)
        # Dummy boxes/labels: the transformer signature expects them even at inference
        frame, _, _, _ = transformer(frame, None, torch.zeros(1, 4), torch.zeros(1))
        if torch.cuda.is_available():
            frame = frame.cuda()
        with torch.no_grad():
            ploc, plabel = model(frame.unsqueeze(dim=0))
            result = encoder.decode_batch(ploc, plabel, opt.nms_threshold, 20)[0]
            loc, label, prob = [r.cpu().numpy() for r in result]
            # Keep only detections above the confidence threshold
            best = np.argwhere(prob > opt.cls_threshold).squeeze(axis=1)
            loc = loc[best]
            label = label[best]
            prob = prob[best]
            if len(loc) > 0:
                # Scale normalized boxes back to the frame resolution
                loc[:, 0::2] *= width
                loc[:, 1::2] *= height
                loc = loc.astype(np.int32)
                for box, lb, pr in zip(loc, label, prob):
                    category = coco_classes[lb]
                    color = colors[lb]
                    xmin, ymin, xmax, ymax = box
                    cv2.rectangle(output_frame, (xmin, ymin), (xmax, ymax), color, 2)
                    text_size = cv2.getTextSize(category + " : %.2f" % pr,
                                                cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
                    cv2.rectangle(output_frame, (xmin, ymin),
                                  (xmin + text_size[0] + 3, ymin + text_size[1] + 4),
                                  color, -1)
                    cv2.putText(output_frame, category + " : %.2f" % pr,
                                (xmin, ymin + text_size[1] + 4),
                                cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1)
        out.write(output_frame)
    cap.release()
    out.release()
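One practical caveat: pairing the MJPG fourcc with an .mp4 container can fail silently on some OpenCV builds, producing an empty file. A small hedged helper that picks the fourcc from the output extension; writer_for is an illustrative name, not part of the original script:

import os
import cv2

def writer_for(path, fps, size):
    # Choose a fourcc matching the container: "mp4v" for .mp4, MJPG otherwise.
    fourcc = "mp4v" if os.path.splitext(path)[1].lower() == ".mp4" else "MJPG"
    return cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*fourcc), fps, size)

# e.g. out = writer_for(output, int(cap.get(cv2.CAP_PROP_FPS)), (width, height))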