def preprocess_fn(self, img, num_objects, keypoints, bboxes, category_id):
    """Image pre-process and augmentation for multi-pose training.

    Decodes the raw image buffer, applies random crop/scale/shift, optional
    rotation, horizontal flip and color augmentation, then builds the CenterNet
    multi-pose training targets (center heatmap, keypoint heatmaps, offsets,
    size regression and their masks).

    Args:
        img: encoded image buffer (decoded here via cv2.imdecode).
        num_objects: number of annotated objects in the image.
        keypoints: per-object keypoint triples (x, y, visibility).
        bboxes: per-object COCO-format boxes.
        category_id: per-object 1-based category ids.

    Returns:
        Tuple of training targets starting with (inp, hm, reg_mask, ind, wh),
        followed by dense or sparse keypoint targets depending on
        ``self.net_opt.dense_hp``, then (reg, hm_hp, hp_offset, hp_ind, hp_mask).

    Raises:
        ValueError: if the configured output resolution is not square.
    """
    num_objs = min(num_objects, self.data_opt.max_objs)
    img = cv2.imdecode(img, cv2.IMREAD_COLOR)
    width = img.shape[1]
    # Augmentation center and scale start from the full image.
    c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32)
    s = max(img.shape[0], img.shape[1]) * 1.0
    rot = 0
    flipped = False
    if self.data_opt.rand_crop:
        s = s * np.random.choice(np.arange(0.6, 1.4, 0.1))
        h_border = self._get_border(self.data_opt.input_res[0], img.shape[0])
        w_border = self._get_border(self.data_opt.input_res[1], img.shape[1])
        c[1] = np.random.randint(low=h_border, high=img.shape[0] - h_border)
        c[0] = np.random.randint(low=w_border, high=img.shape[1] - w_border)
    else:
        sf = self.data_opt.scale
        cf = self.data_opt.shift
        c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
        c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
        s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
    if np.random.random() < self.data_opt.aug_rot:
        rf = self.data_opt.rotate
        rot = np.clip(np.random.randn() * rf, -rf * 2, rf * 2)
    if np.random.random() < self.data_opt.flip_prop:
        flipped = True
        img = img[:, ::-1, :]
        c[0] = width - c[0] - 1
    trans_input = get_affine_transform(c, s, rot, self.data_opt.input_res)
    inp = cv2.warpAffine(img, trans_input,
                         (self.data_opt.input_res[0], self.data_opt.input_res[1]),
                         flags=cv2.INTER_LINEAR)
    if self.run_mode == "train" and self.data_opt.color_aug:
        # BUG FIX: previously `color_aug(self._data_rng, inp / 255., ...)` was
        # called — `inp / 255.` creates a temporary array, so the in-place
        # color augmentation was silently discarded, and the subsequent
        # `inp *= 255.` hit the raw uint8 warp output. Convert to float32
        # once, augment in place, then rescale (same pattern as the
        # detection-variant preprocess_fn in this file).
        inp = inp.astype(np.float32) / 255.
        color_aug(self._data_rng, inp, self.data_opt.eig_val, self.data_opt.eig_vec)
        inp *= 255.
    # caution: image normalization and transpose to nchw will both be done on device
    # inp = (inp.astype(np.float32) / 255. - self.data_opt.mean) / self.data_opt.std
    # inp = inp.transpose(2, 0, 1)
    if self.data_opt.output_res[0] != self.data_opt.output_res[1]:
        raise ValueError("Only square image was supported to used as output for convinient")
    output_res = self.data_opt.output_res[0]
    num_joints = self.data_opt.num_joints
    max_objs = self.data_opt.max_objs
    num_classes = self.data_opt.num_classes
    trans_output_rot = get_affine_transform(c, s, rot, [output_res, output_res])
    # Target buffers, all sized for the maximum object/keypoint count.
    hm = np.zeros((num_classes, output_res, output_res), dtype=np.float32)
    hm_hp = np.zeros((num_joints, output_res, output_res), dtype=np.float32)
    dense_kps = np.zeros((num_joints, 2, output_res, output_res), dtype=np.float32)
    dense_kps_mask = np.zeros((num_joints, output_res, output_res), dtype=np.float32)
    wh = np.zeros((max_objs, 2), dtype=np.float32)
    kps = np.zeros((max_objs, num_joints * 2), dtype=np.float32)
    reg = np.zeros((max_objs, 2), dtype=np.float32)
    ind = np.zeros((max_objs), dtype=np.int32)
    reg_mask = np.zeros((max_objs), dtype=np.int32)
    kps_mask = np.zeros((max_objs, num_joints * 2), dtype=np.int32)
    hp_offset = np.zeros((max_objs * num_joints, 2), dtype=np.float32)
    hp_ind = np.zeros((max_objs * num_joints), dtype=np.int32)
    hp_mask = np.zeros((max_objs * num_joints), dtype=np.int32)
    draw_gaussian = draw_msra_gaussian if self.net_opt.mse_loss else draw_umich_gaussian
    ground_truth = []
    for k in range(num_objs):
        bbox = self._coco_box_to_bbox(bboxes[k])
        cls_id = int(category_id[k]) - 1
        pts = np.array(keypoints[k], np.float32).reshape(num_joints, 3)
        if flipped:
            bbox[[0, 2]] = width - bbox[[2, 0]] - 1  # index begin from zero
            pts[:, 0] = width - pts[:, 0] - 1
            # Swap left/right joint pairs after horizontal flip.
            for e in self.data_opt.flip_idx:
                pts[e[0]], pts[e[1]] = pts[e[1]].copy(), pts[e[0]].copy()
        lt = [bbox[0], bbox[3]]
        rb = [bbox[2], bbox[1]]
        bbox[:2] = affine_transform(bbox[:2], trans_output_rot)
        bbox[2:] = affine_transform(bbox[2:], trans_output_rot)
        if rot != 0:
            # Under rotation also transform the other two box corners and take
            # the axis-aligned hull of all four.
            lt = affine_transform(lt, trans_output_rot)
            rb = affine_transform(rb, trans_output_rot)
            bbox[0] = min(lt[0], rb[0], bbox[0], bbox[2])
            bbox[2] = max(lt[0], rb[0], bbox[0], bbox[2])
            bbox[1] = min(lt[1], rb[1], bbox[1], bbox[3])
            bbox[3] = max(lt[1], rb[1], bbox[1], bbox[3])
        bbox = np.clip(bbox, 0, output_res - 1)
        h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
        if h <= 0 or w <= 0:
            continue
        radius = gaussian_radius((math.ceil(h), math.ceil(w)))
        ct = np.array([(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
        ct_int = ct.astype(np.int32)
        wh[k] = 1. * w, 1. * h
        ind[k] = ct_int[1] * output_res + ct_int[0]  # flattened center index
        reg[k] = ct - ct_int  # sub-pixel center offset
        reg_mask[k] = 1
        num_kpts = pts[:, 2].sum()
        if num_kpts == 0:
            # Object with no visible keypoints: mark as near-positive on the
            # heatmap but exclude it from regression.
            hm[cls_id, ct_int[1], ct_int[0]] = 0.9999
            reg_mask[k] = 0
        hp_radius = radius
        for j in range(num_joints):
            if pts[j, 2] > 0:
                pts[j, :2] = affine_transform(pts[j, :2], trans_output_rot)
                if pts[j, 0] >= 0 and pts[j, 0] < output_res and \
                        pts[j, 1] >= 0 and pts[j, 1] < output_res:
                    kps[k, j * 2: j * 2 + 2] = pts[j, :2] - ct_int
                    kps_mask[k, j * 2: j * 2 + 2] = 1
                    pt_int = pts[j, :2].astype(np.int32)
                    hp_offset[k * num_joints + j] = pts[j, :2] - pt_int
                    hp_ind[k * num_joints + j] = pt_int[1] * output_res + pt_int[0]
                    hp_mask[k * num_joints + j] = 1
                    if self.net_opt.dense_hp:
                        # must be before draw center hm gaussian
                        draw_dense_reg(dense_kps[j], hm[cls_id], ct_int,
                                       pts[j, :2] - ct_int, radius, is_offset=True)
                        draw_gaussian(dense_kps_mask[j], ct_int, radius)
                    draw_gaussian(hm_hp[j], pt_int, hp_radius)
        draw_gaussian(hm[cls_id], ct_int, radius)
        if self.enable_visual_image:
            gt = {
                "category_id": int(cls_id + 1),
                "bbox": [ct[0] - w / 2, ct[1] - h / 2, w, h],
                "score": float("{:.2f}".format(1)),
                "keypoints": pts.reshape(num_joints * 3).tolist(),
            }
            ground_truth.append(gt)
    ret = (inp, hm, reg_mask, ind, wh)
    if self.net_opt.dense_hp:
        # Interleave x/y channels into (num_joints * 2, H, W) layouts.
        dense_kps = dense_kps.reshape((num_joints * 2, output_res, output_res))
        dense_kps_mask = dense_kps_mask.reshape((num_joints, 1, output_res, output_res))
        dense_kps_mask = np.concatenate([dense_kps_mask, dense_kps_mask], axis=1)
        dense_kps_mask = dense_kps_mask.reshape((num_joints * 2, output_res, output_res))
        ret += (dense_kps, dense_kps_mask)
    else:
        ret += (kps, kps_mask)
    ret += (reg, hm_hp, hp_offset, hp_ind, hp_mask)
    if self.enable_visual_image:
        out_img = cv2.warpAffine(img, trans_output_rot, (output_res, output_res),
                                 flags=cv2.INTER_LINEAR)
        visual_image(out_img, ground_truth, self.save_path,
                     ratio=self.data_opt.input_res[0] // output_res)
    return ret
def preprocess_fn(self, img, num_objects, keypoints, bboxes, category_id):
    """image pre-process and augmentation

    Refactored multi-pose variant: the decode/crop/scale/flip augmentation is
    delegated to ``self.get_aug_param`` and the remaining code builds the
    CenterNet multi-pose training targets.  NOTE(review): the target-building
    loop is order-sensitive (dense regression must be drawn before the center
    heatmap gaussian) — keep statement order intact when editing.

    Returns a tuple starting with (inp, hm, reg_mask, ind, wh), then either
    dense or sparse keypoint targets depending on ``self.net_opt.dense_hp``,
    then (reg, hm_hp, hp_offset, hp_ind, hp_mask).
    """
    num_objs = min(num_objects, self.data_opt.max_objs)
    # get_aug_param returns the (possibly flipped) image plus the affine
    # parameters: original width, crop center, scale, rotation, flip flag.
    img, width, c, s, rot, flipped = self.get_aug_param(img)
    trans_input = get_affine_transform(c, s, rot, self.data_opt.input_res)
    inp = cv2.warpAffine(
        img, trans_input,
        (self.data_opt.input_res[0], self.data_opt.input_res[1]),
        flags=cv2.INTER_LINEAR)
    # Only square output resolutions are supported.
    assert self.data_opt.output_res[0] == self.data_opt.output_res[1]
    output_res = self.data_opt.output_res[0]
    num_joints = self.data_opt.num_joints
    max_objs = self.data_opt.max_objs
    num_classes = self.data_opt.num_classes
    trans_output_rot = get_affine_transform(c, s, rot, [output_res, output_res])
    # Target buffers, sized for the maximum object/keypoint count.
    hm = np.zeros((num_classes, output_res, output_res), dtype=np.float32)
    hm_hp = np.zeros((num_joints, output_res, output_res), dtype=np.float32)
    dense_kps = np.zeros((num_joints, 2, output_res, output_res), dtype=np.float32)
    dense_kps_mask = np.zeros((num_joints, output_res, output_res), dtype=np.float32)
    wh = np.zeros((max_objs, 2), dtype=np.float32)
    kps = np.zeros((max_objs, num_joints * 2), dtype=np.float32)
    reg = np.zeros((max_objs, 2), dtype=np.float32)
    ind = np.zeros((max_objs), dtype=np.int32)
    reg_mask = np.zeros((max_objs), dtype=np.int32)
    kps_mask = np.zeros((max_objs, num_joints * 2), dtype=np.int32)
    hp_offset = np.zeros((max_objs * num_joints, 2), dtype=np.float32)
    hp_ind = np.zeros((max_objs * num_joints), dtype=np.int32)
    hp_mask = np.zeros((max_objs * num_joints), dtype=np.int32)
    draw_gaussian = draw_msra_gaussian if self.net_opt.mse_loss else draw_umich_gaussian
    ground_truth = []
    for k in range(num_objs):
        bbox = self._coco_box_to_bbox(bboxes[k])
        cls_id = int(category_id[k]) - 1
        pts = np.array(keypoints[k], np.float32).reshape(num_joints, 3)
        if flipped:
            bbox[[0, 2]] = width - bbox[[2, 0]] - 1  # index begin from zero
            pts[:, 0] = width - pts[:, 0] - 1
            # Swap left/right joint pairs after horizontal flip.
            for e in self.data_opt.flip_idx:
                pts[e[0]], pts[e[1]] = pts[e[1]].copy(), pts[e[0]].copy()
        lt, rb = [bbox[0], bbox[3]], [bbox[2], bbox[1]]
        bbox[:2] = affine_transform(bbox[:2], trans_output_rot)
        bbox[2:] = affine_transform(bbox[2:], trans_output_rot)
        if rot != 0:
            # Under rotation also transform the other two corners and take the
            # axis-aligned hull of all four.
            lt = affine_transform(lt, trans_output_rot)
            rb = affine_transform(rb, trans_output_rot)
            for i in range(2):
                bbox[i] = min(lt[i], rb[i], bbox[i], bbox[i + 2])
                bbox[i + 2] = max(lt[i], rb[i], bbox[i], bbox[i + 2])
        bbox = np.clip(bbox, 0, output_res - 1)
        h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
        if h <= 0 or w <= 0:
            # Box degenerated after transform/clipping — skip this object.
            continue
        hp_radius = radius = gaussian_radius((math.ceil(h), math.ceil(w)))
        ct = np.array([(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
                      dtype=np.float32)
        ct_int = ct.astype(np.int32)
        wh[k] = 1. * w, 1. * h
        ind[k] = ct_int[1] * output_res + ct_int[0]  # flattened center index
        reg[k] = ct - ct_int  # sub-pixel center offset
        reg_mask[k] = 1
        num_kpts = pts[:, 2].sum()
        if num_kpts == 0:
            # Object with no visible keypoints: mark as near-positive on the
            # heatmap but exclude it from regression.
            hm[cls_id, ct_int[1], ct_int[0]] = 0.9999
            reg_mask[k] = 0
        for j in range(num_joints):
            if pts[j, 2] > 0:
                pts[j, :2] = affine_transform(pts[j, :2], trans_output_rot)
                if pts[j, 0] >= 0 and pts[j, 0] < output_res and \
                        pts[j, 1] >= 0 and pts[j, 1] < output_res:
                    kps[k, j * 2:j * 2 + 2] = pts[j, :2] - ct_int
                    kps_mask[k, j * 2:j * 2 + 2] = 1
                    pt_int = pts[j, :2].astype(np.int32)
                    hp_offset[k * num_joints + j] = pts[j, :2] - pt_int
                    hp_ind[k * num_joints + j] = pt_int[1] * output_res + pt_int[0]
                    hp_mask[k * num_joints + j] = 1
                    if self.net_opt.dense_hp:
                        # must be before draw center hm gaussian
                        draw_dense_reg(dense_kps[j], hm[cls_id], ct_int,
                                       pts[j, :2] - ct_int, radius, is_offset=True)
                        draw_gaussian(dense_kps_mask[j], ct_int, radius)
                    draw_gaussian(hm_hp[j], pt_int, hp_radius)
        draw_gaussian(hm[cls_id], ct_int, radius)
        if self.enable_visual_image:
            gt = {
                "category_id": int(cls_id + 1),
                "bbox": [ct[0] - w / 2, ct[1] - h / 2, w, h],
                "score": float("{:.2f}".format(1)),
                "keypoints": pts.reshape(num_joints * 3).tolist(),
            }
            ground_truth.append(gt)
    ret = (inp, hm, reg_mask, ind, wh)
    if self.net_opt.dense_hp:
        # Interleave x/y channels into (num_joints * 2, H, W) layouts.
        dense_kps = dense_kps.reshape(
            (num_joints * 2, output_res, output_res))
        dense_kps_mask = dense_kps_mask.reshape(
            (num_joints, 1, output_res, output_res))
        dense_kps_mask = np.concatenate([dense_kps_mask, dense_kps_mask], axis=1)
        dense_kps_mask = dense_kps_mask.reshape(
            (num_joints * 2, output_res, output_res))
        ret += (dense_kps, dense_kps_mask)
    else:
        ret += (kps, kps_mask)
    ret += (reg, hm_hp, hp_offset, hp_ind, hp_mask)
    if self.enable_visual_image:
        out_img = cv2.warpAffine(img, trans_output_rot, (output_res, output_res),
                                 flags=cv2.INTER_LINEAR)
        visual_image(out_img, ground_truth, self.save_path,
                     ratio=self.data_opt.input_res[0] // output_res)
    return ret
def pre_process_for_test(self, image, img_id, scale):
    """Image pre-process for evaluation.

    Rescales the single input image by ``scale``, warps it to the network
    input resolution, and normalizes it to an NCHW float tensor.

    Args:
        image: batched image array of shape (1, h, w, ch).
        img_id: image id, used for visualization / annotation lookup.
        scale: multi-scale testing factor applied to the image size.

    Returns:
        Tuple of (eval_image, meta): the normalized NCHW image and a dict
        with the affine center 'c', scale 's' and the output feature-map
        size ('out_height', 'out_width').

    Raises:
        ValueError: if more than one image is passed in the batch.
    """
    b, h, w, ch = image.shape
    # FIX: was `assert b == 1, ...` — asserts are stripped under `python -O`,
    # so validate explicitly (consistent with the ValueError raises used by
    # the training preprocess functions in this file).
    if b != 1:
        raise ValueError("only single image was supported here")
    image = image.reshape((h, w, ch))
    height, width = image.shape[0:2]
    new_height = int(height * scale)
    new_width = int(width * scale)
    if self.keep_res:
        # Keep original resolution, padded up to the next multiple implied by
        # self.pad (bitwise-or trick rounds up to pad+1 alignment).
        inp_height = (new_height | self.pad) + 1
        inp_width = (new_width | self.pad) + 1
        c = np.array([new_width // 2, new_height // 2], dtype=np.float32)
        s = np.array([inp_width, inp_height], dtype=np.float32)
    else:
        inp_height, inp_width = self.data_opt.input_res[0], self.data_opt.input_res[1]
        c = np.array([new_width / 2., new_height / 2.], dtype=np.float32)
        s = max(height, width) * 1.0
    trans_input = get_affine_transform(c, s, 0, [inp_width, inp_height])
    resized_image = cv2.resize(image, (new_width, new_height))
    inp_image = cv2.warpAffine(resized_image, trans_input, (inp_width, inp_height),
                               flags=cv2.INTER_LINEAR)
    # Unlike training, normalization happens here on host.
    inp_img = (inp_image.astype(np.float32) / 255. - self.data_opt.mean) / self.data_opt.std
    eval_image = inp_img.reshape((1,) + inp_img.shape)
    eval_image = eval_image.transpose(0, 3, 1, 2)  # NHWC -> NCHW
    meta = {'c': c, 's': s,
            'out_height': inp_height // self.net_opt.down_ratio,
            'out_width': inp_width // self.net_opt.down_ratio}
    if self.enable_visual_image:
        if self.run_mode != "test":
            # Project the ground-truth annotations into the network input frame
            # and dump a visualization image.
            annos = self.coco.loadAnns(self.anns[img_id])
            num_objs = min(len(annos), self.data_opt.max_objs)
            num_joints = self.data_opt.num_joints
            ground_truth = []
            for k in range(num_objs):
                ann = annos[k]
                bbox = self._coco_box_to_bbox(ann['bbox']) * scale
                cls_id = int(ann['category_id']) - 1
                pts = np.array(ann['keypoints'], np.float32).reshape(num_joints, 3)
                bbox[:2] = affine_transform(bbox[:2], trans_input)
                bbox[2:] = affine_transform(bbox[2:], trans_input)
                bbox[0::2] = np.clip(bbox[0::2], 0, inp_width - 1)
                bbox[1::2] = np.clip(bbox[1::2], 0, inp_height - 1)
                h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
                if h <= 0 or w <= 0:
                    continue
                for j in range(num_joints):
                    if pts[j, 2] > 0:
                        pts[j, :2] = affine_transform(pts[j, :2] * scale, trans_input)
                bbox = [bbox[0], bbox[1], w, h]
                gt = {
                    "image_id": int(img_id),
                    "category_id": int(cls_id + 1),
                    "bbox": bbox,
                    "score": float("{:.2f}".format(1)),
                    "keypoints": pts.reshape(num_joints * 3).tolist(),
                    "id": self.anns[img_id][k]
                }
                ground_truth.append(gt)
            visual_image(inp_image, ground_truth, self.save_path, height=inp_height,
                         width=inp_width, name="_scale" + str(scale))
        else:
            # No annotations in test mode — just save the pre-processed image.
            image_name = "gt_" + self.run_mode + "_image_" + str(img_id) + "_scale_" + str(scale) + ".png"
            cv2.imwrite("{}/{}".format(self.save_path, image_name), inp_image)
    return eval_image, meta
def preprocess_fn(self, image, num_objects, bboxes, category_id):
    """Image pre-process and augmentation for detection training.

    Decodes the raw image buffer, applies random crop/scale/shift and flip
    augmentation plus color augmentation, then builds the CenterNet detection
    training targets (center heatmap, size regression, offsets and masks).

    Args:
        image: encoded image buffer (decoded here via cv2.imdecode).
        num_objects: number of annotated objects in the image.
        bboxes: per-object boxes in (x1, y1, x2, y2) form after conversion.
        category_id: per-object 1-based category ids.

    Returns:
        Tuple of training targets starting with (inp, hm, reg_mask, ind, wh),
        optionally extended with dense or per-category size targets and the
        center offset regression depending on ``self.net_opt`` flags.

    Raises:
        ValueError: if the configured output resolution is not square.
    """
    num_objs = min(num_objects, self.data_opt.max_objs)
    img = cv2.imdecode(image, cv2.IMREAD_COLOR)
    height = img.shape[0]
    width = img.shape[1]
    # Augmentation center and scale start from the full image.
    c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32)
    s = max(height, width) * 1.0
    input_h, input_w = self.data_opt.input_res[0], self.data_opt.input_res[1]
    rot = 0
    flipped = False
    if self.data_opt.rand_crop:
        s = s * np.random.choice(np.arange(0.6, 1.4, 0.1))
        h_border = self._get_border(128, img.shape[0])
        w_border = self._get_border(128, img.shape[1])
        c[0] = np.random.randint(low=w_border, high=img.shape[1] - w_border)
        c[1] = np.random.randint(low=h_border, high=img.shape[0] - h_border)
    else:
        sf = self.data_opt.scale
        cf = self.data_opt.shift
        c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
        c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
        s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
    if np.random.random() < self.data_opt.flip_prop:
        flipped = True
        img = img[:, ::-1, :]
        c[0] = width - c[0] - 1
    trans_input = get_affine_transform(c, s, rot, [input_w, input_h])
    inp = cv2.warpAffine(img, trans_input, (input_w, input_h),
                         flags=cv2.INTER_LINEAR)
    inp = (inp.astype(np.float32) / 255.)
    if self.run_mode == "train" and self.data_opt.color_aug:
        color_aug(self._data_rng, inp, self.data_opt.eig_val, self.data_opt.eig_vec)
    if self.data_opt.output_res[0] != self.data_opt.output_res[1]:
        raise ValueError(
            "Only square image was supported to used as output for convenient"
        )
    output_h = input_h // self.data_opt.down_ratio
    output_w = input_w // self.data_opt.down_ratio
    max_objs = self.data_opt.max_objs
    num_classes = self.data_opt.num_classes
    trans_output_rot = get_affine_transform(c, s, rot, [output_w, output_h])
    # Target buffers, sized for the maximum object count.
    hm = np.zeros((num_classes, output_h, output_w), dtype=np.float32)
    wh = np.zeros((max_objs, 2), dtype=np.float32)
    dense_wh = np.zeros((2, output_h, output_w), dtype=np.float32)
    reg = np.zeros((max_objs, 2), dtype=np.float32)
    ind = np.zeros((max_objs), dtype=np.int32)
    reg_mask = np.zeros((max_objs), dtype=np.int32)
    cat_spec_wh = np.zeros((max_objs, num_classes * 2), dtype=np.float32)
    cat_spec_mask = np.zeros((max_objs, num_classes * 2), dtype=np.int32)
    draw_gaussian = draw_msra_gaussian if self.net_opt.mse_loss else draw_umich_gaussian
    ground_truth = []
    for k in range(num_objs):
        bbox = bboxes[k]
        cls_id = category_id[k] - 1
        if flipped:
            bbox[[0, 2]] = width - bbox[[2, 0]] - 1  # index begin from zero
        bbox[:2] = affine_transform(bbox[:2], trans_output_rot)
        bbox[2:] = affine_transform(bbox[2:], trans_output_rot)
        bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
        bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
        h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
        # BUG FIX: was `h <= 0 and w <= 0` — a box degenerate in only one
        # dimension slipped through and produced a zero-area target. The
        # multi-pose/eval functions in this file all use `or`.
        if h <= 0 or w <= 0:
            continue
        radius = gaussian_radius((math.ceil(h), math.ceil(w)))
        radius = max(0, int(radius))
        ct = np.array([(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
                      dtype=np.float32)
        ct_int = ct.astype(np.int32)
        draw_gaussian(hm[cls_id], ct_int, radius)
        wh[k] = 1. * w, 1. * h
        ind[k] = ct_int[1] * output_w + ct_int[0]  # flattened center index
        reg[k] = ct - ct_int  # sub-pixel center offset
        reg_mask[k] = 1
        cat_spec_wh[k, cls_id * 2:cls_id * 2 + 2] = wh[k]
        cat_spec_mask[k, cls_id * 2:cls_id * 2 + 2] = 1
        if self.net_opt.dense_wh:
            draw_dense_reg(dense_wh, hm.max(axis=0), ct_int, wh[k], radius)
        # NOTE(review): ground_truth is collected but not consumed in this
        # function; presumably kept for debugging/visualization — verify.
        ground_truth.append([
            ct[0] - w / 2, ct[1] - h / 2, ct[0] + w / 2, ct[1] + h / 2, 1,
            cls_id
        ])
    ret = (inp, hm, reg_mask, ind, wh)
    if self.net_opt.dense_wh:
        # BUG FIX: was `hm.max(axis=0)` (shape (H, W)), so concatenating on
        # axis 0 yielded a (2H, W) array instead of a (2, H, W) mask matching
        # dense_wh. keepdims=True keeps the leading channel axis.
        hm_a = hm.max(axis=0, keepdims=True)
        dense_wh_mask = np.concatenate([hm_a, hm_a], axis=0)
        ret += (dense_wh, dense_wh_mask)
    elif self.net_opt.cat_spec_wh:
        ret += (cat_spec_wh, cat_spec_mask)
    if self.net_opt.reg_offset:
        ret += (reg, )
    return ret