Example no. 1
def seg_maskrcnnresults():
    classifier = PointNetInstanceSeg(n_classes=opt.n_cat)
    if opt.model != '':
        classifier.load_state_dict(torch.load(opt.model))
    classifier.cuda()
    classifier = classifier.eval()
    if opt.dataset == 'Real':
        file_path = os.path.join(opt.dataset, 'test_list.txt')
        cam_fx, cam_fy, cam_cx, cam_cy = 591.0125, 590.16775, 322.525, 244.11084
        result_dir = 'results/mrcnn_results/{}_test_pointnet_seg'.format(
            opt.dataset)
    else:
        file_path = os.path.join(opt.dataset, 'val_list.txt')
        cam_fx, cam_fy, cam_cx, cam_cy = 577.5, 577.5, 319.5, 239.5
        result_dir = 'results/mrcnn_results/{}_val_pointnet_seg'.format(
            opt.dataset)
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    norm_scale = 1000.0
    xmap = np.array([[i for i in range(640)] for j in range(480)])
    ymap = np.array([[j for i in range(640)] for j in range(480)])
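    # xmap/ymap hold the per-pixel u (column) and v (row) coordinates of a
    # 640x480 image; they are used below to back-project sampled depth pixels
    # into 3D camera coordinates.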
    # get test data list
    img_list = [
        os.path.join(file_path.split('/')[0], line.rstrip('\n'))
        for line in open(os.path.join(opt.data_dir, file_path))
    ]

    t_start = time.time()
    for path in tqdm(img_list):
        img_path = os.path.join(opt.data_dir, path)
        depth = load_depth(img_path)
        # load mask-rcnn detection results
        img_path_parsing = img_path.split('/')
        mrcnn_path = os.path.join(
            'results/mrcnn_results', opt.data, 'results_{}_{}_{}.pkl'.format(
                opt.data.split('_')[-1], img_path_parsing[-2],
                img_path_parsing[-1]))
        with open(mrcnn_path, 'rb') as f:
            mrcnn_result = cPickle.load(f)
        num_insts = len(mrcnn_result['class_ids'])
        f_mask = np.zeros((num_insts, depth.shape[0], depth.shape[1]),
                          dtype=int)
        # prepare frame data
        f_points, f_choose, f_catId = [], [], []
        valid_inst = []
        result = {}
        for i in range(num_insts):
            cat_id = mrcnn_result['class_ids'][i] - 1
            rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][i])

            # sample points
            depth_valid = depth > 0
            choose_depth = depth_valid[rmin:rmax,
                                       cmin:cmax].flatten().nonzero()[0]
            if len(choose_depth) < 32:
                continue
            else:
                valid_inst.append(i)
            # process objects with valid depth observation
            if len(choose_depth) > opt.n_pts:
                c_mask = np.zeros(len(choose_depth), dtype=int)
                c_mask[:opt.n_pts] = 1
                np.random.shuffle(c_mask)
                choose_depth = choose_depth[c_mask.nonzero()]
            else:
                choose_depth = np.pad(choose_depth,
                                      (0, opt.n_pts - len(choose_depth)),
                                      'wrap')

            depth_masked = depth[rmin:rmax,
                                 cmin:cmax].flatten()[choose_depth][:,
                                                                    np.newaxis]
            xmap_masked = xmap[rmin:rmax,
                               cmin:cmax].flatten()[choose_depth][:,
                                                                  np.newaxis]
            ymap_masked = ymap[rmin:rmax,
                               cmin:cmax].flatten()[choose_depth][:,
                                                                  np.newaxis]
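            # Back-project the sampled pixels with the pinhole model:
            # Z = depth / norm_scale (mm -> m), X = (u - cx) * Z / fx,
            # Y = (v - cy) * Z / fy.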
            pt2 = depth_masked / norm_scale
            pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
            pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
            points = np.concatenate((pt0, pt1, pt2), axis=1)
            # Get frustum angle (according to center pixel in 2D BOX)
            box2d_center = np.array([(cmin + cmax) / 2.0, (rmin + rmax) / 2.0])
            depth_center = 1.0
            x_center = (box2d_center[0] - cam_cx) * depth_center / cam_fx
            y_center = (cam_cy - box2d_center[1]) * depth_center / cam_fy
            angle_y = -1 * np.arctan2(depth_center, x_center)
            angle_x = -1 * np.arctan2(
                (depth_center**2 + x_center**2)**0.5, y_center)
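            # angle_y / angle_x define the frustum rotation that aligns the ray
            # through the box centre with the depth axis, so the point cloud is
            # roughly viewpoint-normalized before segmentation.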

            # Get point cloud
            points = get_center_view_point_set(
                points, angle_y, angle_x)  # (n,3) #pts after Frustum rotation
            f_points.append(points)
            f_catId.append(cat_id)
            f_choose.append(choose_depth)
        if len(valid_inst):
            f_points = torch.cuda.FloatTensor(f_points)
            f_catId = torch.cuda.LongTensor(f_catId)
            f_one_hot_vec = F.one_hot(f_catId, opt.n_cat)
            f_points = f_points.transpose(2, 1)

            logits = classifier(f_points, f_one_hot_vec)
            logits_choice = logits.data.max(2)[1]
            logits_np = logits_choice.cpu().data.numpy()
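            # Per-point argmax over the segmentation logits gives a
            # foreground/background label for each sampled point; the loop
            # below scatters the foreground points back into the RoI to build
            # a full-resolution instance mask.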
            for i in range(len(valid_inst)):
                inst_idx = valid_inst[i]
                choose_depth = f_choose[i]
                logits_np_inst = logits_np[i]
                choose_logits_np = logits_np_inst.nonzero()
                rmin, rmax, cmin, cmax = get_bbox(
                    mrcnn_result['rois'][inst_idx])
                roi_mask = np.zeros(((rmax - rmin) * (cmax - cmin)), dtype=int)
                roi_mask[choose_depth[choose_logits_np]] = 1
                roi_mask = roi_mask.reshape((rmax - rmin, cmax - cmin))
                f_mask[inst_idx][rmin:rmax, cmin:cmax] = roi_mask
        result['class_ids'] = mrcnn_result['class_ids']
        result['rois'] = mrcnn_result['rois']
        result['scores'] = mrcnn_result['scores']
        result['masks'] = (f_mask.transpose(1, 2, 0) > 0)
        save_path = os.path.join(
            result_dir, 'results_{}_{}_{}.pkl'.format(
                opt.data.split('_')[-1], img_path_parsing[-2],
                img_path_parsing[-1]))
        with open(save_path, 'wb') as f:
            cPickle.dump(result, f)
Example no. 2
def detect():
    # resume model
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    estimator = DeformNet(opt.n_cat, opt.nv_prior)
    estimator.cuda()
    estimator.load_state_dict(torch.load(opt.model))
    estimator.eval()
    # get test data list
    img_list = [
        os.path.join(file_path.split('/')[0], line.rstrip('\n'))
        for line in open(os.path.join(opt.data_dir, file_path))
    ]
    # frame by frame test
    t_inference = 0.0
    t_umeyama = 0.0
    inst_count = 0
    img_count = 0
    t_start = time.time()
    for path in tqdm(img_list):
        img_path = os.path.join(opt.data_dir, path)
        raw_rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
        raw_rgb = raw_rgb[:, :, ::-1]
        raw_depth = load_depth(img_path)
        # load mask-rcnn detection results
        img_path_parsing = img_path.split('/')
        mrcnn_path = os.path.join(
            'results/mrcnn_results', opt.data, 'results_{}_{}_{}.pkl'.format(
                opt.data.split('_')[-1], img_path_parsing[-2],
                img_path_parsing[-1]))
        with open(mrcnn_path, 'rb') as f:
            mrcnn_result = cPickle.load(f)
        num_insts = len(mrcnn_result['class_ids'])
        f_sRT = np.zeros((num_insts, 4, 4), dtype=float)
        f_size = np.zeros((num_insts, 3), dtype=float)
        # prepare frame data
        f_points, f_rgb, f_choose, f_catId, f_prior = [], [], [], [], []
        valid_inst = []
        for i in range(num_insts):
            cat_id = mrcnn_result['class_ids'][i] - 1
            prior = mean_shapes[cat_id]
            rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][i])
            mask = np.logical_and(mrcnn_result['masks'][:, :, i],
                                  raw_depth > 0)
            choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
            # The background has no depth observation in the CAMERA dataset.
            # Because of how the bounding box is computed in get_bbox, there
            # may be no foreground points left after cropping with the mask,
            # e.g. when a Mask R-CNN false positive covers mostly background.
            if len(choose) < 32:
                f_sRT[i] = np.identity(4, dtype=float)
                f_size[i] = 2 * np.amax(np.abs(prior), axis=0)
                continue
            else:
                valid_inst.append(i)
            # process objects with valid depth observation
            if len(choose) > opt.n_pts:
                c_mask = np.zeros(len(choose), dtype=int)
                c_mask[:opt.n_pts] = 1
                np.random.shuffle(c_mask)
                choose = choose[c_mask.nonzero()]
            else:
                choose = np.pad(choose, (0, opt.n_pts - len(choose)), 'wrap')
            depth_masked = raw_depth[rmin:rmax,
                                     cmin:cmax].flatten()[choose][:,
                                                                  np.newaxis]
            xmap_masked = xmap[rmin:rmax,
                               cmin:cmax].flatten()[choose][:, np.newaxis]
            ymap_masked = ymap[rmin:rmax,
                               cmin:cmax].flatten()[choose][:, np.newaxis]
            pt2 = depth_masked / norm_scale
            pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
            pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
            points = np.concatenate((pt0, pt1, pt2), axis=1)
            rgb = raw_rgb[rmin:rmax, cmin:cmax, :]
            rgb = cv2.resize(rgb, (opt.img_size, opt.img_size),
                             interpolation=cv2.INTER_LINEAR)
            rgb = norm_color(rgb)
            crop_w = rmax - rmin
            ratio = opt.img_size / crop_w
            col_idx = choose % crop_w
            row_idx = choose // crop_w
            choose = (np.floor(row_idx * ratio) * opt.img_size +
                      np.floor(col_idx * ratio)).astype(np.int64)
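            # Remap 'choose' from indices in the cropped window to indices in
            # the resized img_size x img_size crop so it addresses the
            # corresponding pixels of the resized RGB; this assumes get_bbox
            # returns a square window (rmax - rmin == cmax - cmin).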
            # concatenate instances
            f_points.append(points)
            f_rgb.append(rgb)
            f_choose.append(choose)
            f_catId.append(cat_id)
            f_prior.append(prior)
        if len(valid_inst):
            f_points = torch.cuda.FloatTensor(f_points)
            f_rgb = torch.stack(f_rgb, dim=0).cuda()
            f_choose = torch.cuda.LongTensor(f_choose)
            f_catId = torch.cuda.LongTensor(f_catId)
            f_prior = torch.cuda.FloatTensor(f_prior)
            # inference
            torch.cuda.synchronize()
            t_now = time.time()
            assign_mat, deltas = estimator(f_points, f_rgb, f_choose, f_catId,
                                           f_prior)
            # assign_mat, deltas = estimator(f_rgb, f_choose, f_catId, f_prior)
            inst_shape = f_prior + deltas
            assign_mat = F.softmax(assign_mat, dim=2)
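            # Soft assignment: each observed point is expressed as a convex
            # combination of the deformed prior vertices, yielding its
            # predicted NOCS coordinate.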
            f_coords = torch.bmm(assign_mat, inst_shape)  # bs x n_pts x 3
            torch.cuda.synchronize()
            t_inference += (time.time() - t_now)
            f_coords = f_coords.detach().cpu().numpy()
            f_points = f_points.cpu().numpy()
            f_choose = f_choose.cpu().numpy()
            f_insts = inst_shape.detach().cpu().numpy()
            t_now = time.time()
            for i in range(len(valid_inst)):
                inst_idx = valid_inst[i]
                choose = f_choose[i]
                _, choose = np.unique(choose, return_index=True)
                nocs_coords = f_coords[i, choose, :]
                f_size[inst_idx] = 2 * np.amax(np.abs(f_insts[i]), axis=0)
                points = f_points[i, choose, :]
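                # Recover scale, rotation and translation by aligning the
                # predicted NOCS coordinates with the observed camera-space
                # points (Umeyama-style similarity fit); fall back to the
                # identity if the fit fails.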
                _, _, _, pred_sRT = estimateSimilarityTransform(
                    nocs_coords, points)
                if pred_sRT is None:
                    pred_sRT = np.identity(4, dtype=float)
                f_sRT[inst_idx] = pred_sRT
            t_umeyama += (time.time() - t_now)
            img_count += 1
            inst_count += len(valid_inst)

        # save results
        result = {}
        with open(img_path + '_label.pkl', 'rb') as f:
            gts = cPickle.load(f)
        result['gt_class_ids'] = gts['class_ids']
        result['gt_bboxes'] = gts['bboxes']
        result['gt_RTs'] = gts['poses']
        result['gt_scales'] = gts['size']
        result['gt_handle_visibility'] = gts['handle_visibility']

        result['pred_class_ids'] = mrcnn_result['class_ids']
        result['pred_bboxes'] = mrcnn_result['rois']
        result['pred_scores'] = mrcnn_result['scores']
        result['pred_RTs'] = f_sRT
        result['pred_scales'] = f_size

        image_short_path = '_'.join(img_path_parsing[-3:])
        save_path = os.path.join(result_dir,
                                 'results_{}.pkl'.format(image_short_path))
        with open(save_path, 'wb') as f:
            cPickle.dump(result, f)
    # write statistics
    fw = open('{0}/eval_logs.txt'.format(result_dir), 'w')
    messages = []
    messages.append("Total images: {}".format(len(img_list)))
    messages.append(
        "Valid images: {},  Total instances: {},  Average: {:.2f}/image".
        format(img_count, inst_count, inst_count / img_count))
    messages.append("Inference time: {:06f}  Average: {:06f}/image".format(
        t_inference, t_inference / img_count))
    messages.append("Umeyama time: {:06f}  Average: {:06f}/image".format(
        t_umeyama, t_umeyama / img_count))
    messages.append("Total time: {:06f}".format(time.time() - t_start))
    for msg in messages:
        print(msg)
        fw.write(msg + '\n')
    fw.close()
def evaluate():
    # get test data list
    img_list = [
        os.path.join(file_path.split('/')[0], line.rstrip('\n'))
        for line in open(os.path.join(opt.data_dir, file_path))
    ]

    total_count = np.zeros((opt.n_cat, ), dtype=int)
    acc = np.zeros((opt.n_cat, ), dtype=float)  #accuracy
    pcs = np.zeros((opt.n_cat, ), dtype=float)  #precision
    rcal = np.zeros((opt.n_cat, ), dtype=float)  #recall
    all_dtc_num = 0
    no_gt_num = 0

    t_start = time.time()
    for path in tqdm(img_list):
        img_path = os.path.join(opt.data_dir, path)
        raw_depth = load_depth(img_path)

        # load mask-rcnn detection results
        img_path_parsing = img_path.split('/')
        mrcnn_path = os.path.join(
            'results/mrcnn_results', opt.data, 'results_{}_{}_{}.pkl'.format(
                opt.data.split('_')[-1], img_path_parsing[-2],
                img_path_parsing[-1]))
        with open(mrcnn_path, 'rb') as f:
            mrcnn_result = cPickle.load(f)
        pred_num_insts = len(mrcnn_result['class_ids'])
        pred_class_ids = mrcnn_result['class_ids']

        #load label
        with open(img_path + '_label.pkl', 'rb') as f:
            gts = cPickle.load(f)
        mask = cv2.imread(img_path + '_mask.png')[:, :, 2]
        gt_num_insts = len(gts['class_ids'])
        gt_class_ids = gts['class_ids']

        for i in range(pred_num_insts):
            all_dtc_num += 1
            map_to_gt = []
            for j in range(len(gt_class_ids)):
                if gt_class_ids[j] != pred_class_ids[i]:
                    continue
                rmin1, rmax1, cmin1, cmax1 = get_bbox(mrcnn_result['rois'][i])
                rmin2, rmax2, cmin2, cmax2 = get_bbox(gts['bboxes'][j])
                pred_box = [cmin1, rmin1, cmax1, rmax1]
                gt_box = [cmin2, rmin2, cmax2, rmax2]
                iou = cal_iou(pred_box, gt_box)
                if iou < opt.iou_thd:
                    continue
                # match found
                map_to_gt.append(np.array([j, iou]))
            if len(map_to_gt) == 0:
                no_gt_num += 1
                continue

            max_iou_idx = np.argmax(np.array(map_to_gt)[:, 1])
            j = int(map_to_gt[max_iou_idx][0])
            # calculate segmentation accuracy
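            # Per-instance metrics are restricted to pixels with valid depth
            # inside the predicted box: accuracy = fraction of those pixels
            # where prediction and GT agree, precision = TP / predicted
            # foreground, recall = TP / GT foreground.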
            gt_mask = mask == gts['instance_ids'][j]
            pre_mask = mrcnn_result['masks'][:, :, i]
            mask_bias = gt_mask == pre_mask
            ins_mask_bias = mask_bias[rmin1:rmax1, cmin1:cmax1]
            mask_TP = np.logical_and(gt_mask, pre_mask)
            ins_mask_TP = mask_TP[rmin1:rmax1, cmin1:cmax1]
            ins_depth = raw_depth[rmin1:rmax1, cmin1:cmax1]
            ins_depth_idxs = np.where(ins_depth > 0)
            correct_seg_num = np.sum(
                ins_mask_bias[ins_depth_idxs[0],
                              ins_depth_idxs[1]].astype(float))
            TP_seg_num = np.sum(ins_mask_TP[ins_depth_idxs[0],
                                            ins_depth_idxs[1]].astype(float))
            acc_ins = correct_seg_num / ins_depth_idxs[0].shape[0]
            pcs_ins = TP_seg_num / np.sum(pre_mask[rmin1:rmax1, cmin1:cmax1][
                ins_depth_idxs[0], ins_depth_idxs[1]].astype(float))
            rcal_ins = TP_seg_num / np.sum(gt_mask[rmin1:rmax1, cmin1:cmax1][
                ins_depth_idxs[0], ins_depth_idxs[1]].astype(float))
            total_count[pred_class_ids[i] - 1] += 1
            acc[pred_class_ids[i] - 1] += acc_ins
            pcs[pred_class_ids[i] - 1] += pcs_ins
            rcal[pred_class_ids[i] - 1] += rcal_ins

    # compute accuracy
    catId_to_name = {
        0: 'bottle',
        1: 'bowl',
        2: 'camera',
        3: 'can',
        4: 'laptop',
        5: 'mug'
    }
    acc, pcs, rcal = 100 * (acc / total_count), 100 * (
        pcs / total_count), 100 * (rcal / total_count)
    overall_acc, overall_pcs, overall_rcal = np.mean(acc), np.mean(
        pcs), np.mean(rcal)

    no_gt_ratio = 100 * (no_gt_num / all_dtc_num)
    fw = open('{0}/seg_acc_pcs_rcal.txt'.format(result_dir), 'a')
    messages = []
    messages.append('segmentation results:')
    messages.append('{:>12s}{:>12s}{:>12s}{:>12s}'.format(
        'category', 'accuracy', 'precision', 'recall'))
    for i in range(acc.shape[0]):
        messages.append("{:>12s}{:>12.2f}{:>12.2f}{:>12.2f}".format(
            catId_to_name[i], acc[i], pcs[i], rcal[i]))
    messages.append("{:>12s}{:>12.2f}{:>12.2f}{:>12.2f}".format(
        'overall', overall_acc, overall_pcs, overall_rcal))
    messages.append("{:>12s}{:>12.2f}".format('mismatch', no_gt_ratio))
    for msg in messages:
        print(msg)
        fw.write(msg + '\n')
    fw.close()
    def __getitem__(self, index):
        data_parsing = self.data_list[index].split('_')
        assert self.source in ['CAMERA', 'CAMERA+Real']
        if 'scene' in data_parsing[0]:
            img_path = os.path.join(self.data_dir, 'Real',
                                    '_'.join(data_parsing[:2]))
            cam_fx, cam_fy, cam_cx, cam_cy = self.real_intrinsics
        else:
            img_path = os.path.join(self.data_dir, 'CAMERA', data_parsing[0])
            cam_fx, cam_fy, cam_cx, cam_cy = self.camera_intrinsics
        rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
        rgb = rgb[:, :, ::-1]
        mask = cv2.imread(img_path + '_mask.png')[:, :, 2]
        with open(img_path + '_label.pkl', 'rb') as f:
            gts = cPickle.load(f)
        # select one foreground object
        inst_id = int(data_parsing[-1])
        idx = np.where(np.array(gts['instance_ids']) == inst_id)[0][0]
        rmin, rmax, cmin, cmax = get_bbox(gts['bboxes'][idx])
        # sample points
        depth = load_depth(img_path)
        mask = np.equal(mask, inst_id)
        mask = np.logical_and(mask, depth > 0)
        depth_valid = depth > 0
        choose_depth = depth_valid[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
        seg = mask[rmin:rmax, cmin:cmax].flatten().astype(np.float64)
        if len(choose_depth) > self.n_pts:
            c_mask = np.zeros(len(choose_depth), dtype=int)
            c_mask[:self.n_pts] = 1
            np.random.shuffle(c_mask)
            choose_depth = choose_depth[c_mask.nonzero()]
        else:
            choose_depth = np.pad(choose_depth,
                                  (0, self.n_pts - len(choose_depth)), 'wrap')

        seg = seg[choose_depth]
        depth_masked = depth[rmin:rmax,
                             cmin:cmax].flatten()[choose_depth][:, np.newaxis]
        xmap_masked = self.xmap[rmin:rmax,
                                cmin:cmax].flatten()[choose_depth][:,
                                                                   np.newaxis]
        ymap_masked = self.ymap[rmin:rmax,
                                cmin:cmax].flatten()[choose_depth][:,
                                                                   np.newaxis]
        pt2 = depth_masked / self.norm_scale
        pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
        pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
        points = np.concatenate((pt0, pt1, pt2), axis=1)
        # resize cropped image to standard size and adjust 'choose' accordingly
        rgb = rgb[rmin:rmax, cmin:cmax, :]
        rgb = cv2.resize(rgb, (self.img_size, self.img_size),
                         interpolation=cv2.INTER_LINEAR)
        crop_w = rmax - rmin
        ratio = self.img_size / crop_w
        col_idx = choose_depth % crop_w
        row_idx = choose_depth // crop_w
        choose_depth = (np.floor(row_idx * ratio) * self.img_size +
                        np.floor(col_idx * ratio)).astype(np.int64)
        # Get frustum angle (according to center pixel in 2D BOX)
        box2d_center = np.array([(cmin + cmax) / 2.0, (rmin + rmax) / 2.0])
        depth_center = 1.0
        x_center = (box2d_center[0] - cam_cx) * depth_center / cam_fx
        y_center = (cam_cy - box2d_center[1]) * depth_center / cam_fy
        angle_y = -1 * np.arctan2(depth_center, x_center)
        angle_x = -1 * np.arctan2(
            (depth_center**2 + x_center**2)**0.5, y_center)
        # Get point cloud
        if self.rotate_to_center:  # True
            points = self.get_center_view_point_set(
                points, angle_y, angle_x)  # (n,3) points after frustum rotation

        # visual_points(points)
        # label
        cat_id = gts['class_ids'][idx] - 1  # convert to 0-indexed

        # data augmentation
        translation = gts['translations'][idx]
        if self.mode == 'train':
            # color jitter
            rgb = self.colorjitter(Image.fromarray(np.uint8(rgb)))
            rgb = np.array(rgb)
            # point shift
            add_t = np.random.uniform(-self.shift_range, self.shift_range,
                                      (1, 3))
            translation = translation + add_t[0]
            # point jitter
            add_t = add_t + np.clip(
                0.001 * np.random.randn(points.shape[0], 3), -0.005, 0.005)
            points = np.add(points, add_t)
        rgb = self.transform(rgb)
        points = points.astype(np.float32)

        return points, rgb, seg, cat_id, choose_depth
    def __getitem__(self, index):
        img_path = os.path.join(self.data_dir, self.img_list[index])
        rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
        img = cv2.imread(img_path + '_color.png')
        rgb = rgb[:, :, ::-1]
        depth = load_depth(img_path)
        mask = cv2.imread(img_path + '_mask.png')[:, :, 2]
        coord = cv2.imread(img_path + '_coord.png')[:, :, :3]
        coord = coord[:, :, (2, 1, 0)]
        coord = np.array(coord, dtype=np.float32) / 255
        coord[:, :, 2] = 1 - coord[:, :, 2]
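        # The coord image stores NOCS coordinates in [0, 1] with the z channel
        # flipped; undoing the flip here lets nocs = coord - 0.5 below lie in
        # [-0.5, 0.5].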
        with open(img_path + '_label.pkl', 'rb') as f:
            gts = cPickle.load(f)
        if 'CAMERA' in img_path.split('/'):
            cam_fx, cam_fy, cam_cx, cam_cy = self.camera_intrinsics
        else:
            cam_fx, cam_fy, cam_cx, cam_cy = self.real_intrinsics

        # select one foreground object
        idx = random.randint(0, len(gts['instance_ids']) - 1)
        inst_id = gts['instance_ids'][idx]
        rmin, rmax, cmin, cmax = get_bbox(gts['bboxes'][idx])
        if self.vis:
            cv2.rectangle(img, (cmin, rmin), (cmax, rmax), (255, 0, 0), 1)
            cv2.imshow('image', img)
            cv2.waitKey()

        # sample points
        mask = np.equal(mask, inst_id)
        mask = np.logical_and(mask, depth > 0)
        choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
        if len(choose) > self.n_pts:
            c_mask = np.zeros(len(choose), dtype=int)
            c_mask[:self.n_pts] = 1
            np.random.shuffle(c_mask)
            choose = choose[c_mask.nonzero()]
        else:
            choose = np.pad(choose, (0, self.n_pts - len(choose)), 'wrap')
        depth_masked = depth[rmin:rmax,
                             cmin:cmax].flatten()[choose][:, np.newaxis]
        xmap_masked = self.xmap[rmin:rmax,
                                cmin:cmax].flatten()[choose][:, np.newaxis]
        ymap_masked = self.ymap[rmin:rmax,
                                cmin:cmax].flatten()[choose][:, np.newaxis]
        pt2 = depth_masked / self.norm_scale
        pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
        pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
        points = np.concatenate((pt0, pt1, pt2), axis=1)
        nocs = coord[rmin:rmax, cmin:cmax, :].reshape((-1, 3))[choose, :] - 0.5
        # resize cropped image to standard size and adjust 'choose' accordingly
        rgb = rgb[rmin:rmax, cmin:cmax, :]
        rgb = cv2.resize(rgb, (self.img_size, self.img_size),
                         interpolation=cv2.INTER_LINEAR)
        crop_w = rmax - rmin
        ratio = self.img_size / crop_w
        col_idx = choose % crop_w
        row_idx = choose // crop_w
        choose = (np.floor(row_idx * ratio) * self.img_size +
                  np.floor(col_idx * ratio)).astype(np.int64)
        # label
        cat_id = gts['class_ids'][idx] - 1  # convert to 0-indexed
        model = self.models[gts['model_list'][idx]].astype(
            np.float32)  # 1024 points
        prior = self.mean_shapes[cat_id].astype(np.float32)
        scale = gts['scales'][idx]
        rotation = gts['rotations'][idx]
        translation = gts['translations'][idx]
        # data augmentation
        if self.mode == 'train':
            # color jitter
            rgb = self.colorjitter(Image.fromarray(np.uint8(rgb)))
            rgb = np.array(rgb)
            # point shift
            add_t = np.random.uniform(-self.shift_range, self.shift_range,
                                      (1, 3))
            translation = translation + add_t[0]
            # point jitter
            add_t = add_t + np.clip(
                0.001 * np.random.randn(points.shape[0], 3), -0.005, 0.005)
            points = np.add(points, add_t)
        rgb = self.transform(rgb)
        points = points.astype(np.float32)
        points_item = np.copy(points)
        points_fru, points_NP, points_SC = [], [], []
        if 'fru' in self.points_process:
            #get frustum angle
            box2d_center = np.array([(cmin + cmax) / 2.0, (rmin + rmax) / 2.0])
            center_depth = 1.0
            center_x = (box2d_center[0] - cam_cx) * center_depth / cam_fx
            center_y = (cam_cy - box2d_center[1]) * center_depth / cam_fy
            frustum_angle_for_y = -1 * np.arctan2(center_depth, center_x)
            frustum_angle_for_x = -1 * np.arctan2(
                (center_depth**2 + center_x**2)**0.5, center_y)
            # Get point cloud
            points_item = get_center_view_point_set(points_item,
                                                    frustum_angle_for_y,
                                                    frustum_angle_for_x)
            points_item = points_item.astype(np.float32)
            points_fru = np.copy(points_item)
        if 'NP' in self.points_process:
            # normalize the points' coordinates (zero-mean)
            points_item = np.subtract(points_item, points_item.mean(axis=0))
            points_item = points_item.astype(np.float32)
            points_NP = np.copy(points_item)
        if 'SC' in self.points_process:
            # scale the points to the same size as the prior
            x_coplanar = (np.unique(points_item[:, 0])).size == 1
            y_coplanar = (np.unique(points_item[:, 1])).size == 1
            z_coplanar = (np.unique(points_item[:, 2])).size == 1
            if x_coplanar or y_coplanar or z_coplanar:
                pass
            else:
                prior_pcd = o3d.geometry.PointCloud()
                prior_pcd.points = o3d.utility.Vector3dVector(prior)
                prior_box = prior_pcd.get_axis_aligned_bounding_box()
                prior_extent = prior_box.get_extent()
                points_item_pcd = o3d.geometry.PointCloud()
                points_item_pcd.points = o3d.utility.Vector3dVector(
                    points_item)
                points_item_box = points_item_pcd.get_oriented_bounding_box()
                points_item_extent = points_item_box.extent
                scale_ = np.linalg.norm(prior_extent) / np.linalg.norm(
                    points_item_extent)
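                # Scale factor: ratio of the diagonal of the prior's
                # axis-aligned box to the diagonal of the observed points'
                # oriented box; scaling about the box centre keeps the points
                # centred while matching the prior's overall size.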
                box_points = points_item_box.get_box_points()
                box_points_np = np.zeros((8, 3))
                for i in range(8):
                    box_points_np[i] = box_points.pop()
                points_item_box_center = np.mean(box_points_np, axis=0)
                points_item_pcd.scale(scale_, points_item_box_center)
                points_item = np.asarray(points_item_pcd.points)
                points_item = points_item.astype(np.float32)
                points_SC = np.copy(points_item)
        # if self.points_process == 'fru':
        #     points_pro = points_fru
        # elif self.points_process == 'NP':
        #     #normalization points' coordinates
        #     points_NP = np.subtract(points_fru, points_fru.mean(axis=0))
        #     points_NP = points_NP.astype(np.float32)
        #     points_pro = points_NP
        # else:
        #     print('points_process error flag')
        # visualization of points before and after processing
        points_pro = points_item
        if self.vis:
            visual_points(points, points_fru, points_NP, points_SC, prior)

        # adjust nocs coords for mug category
        if cat_id == 5:
            T0 = self.mug_meta[gts['model_list'][idx]][0]
            s0 = self.mug_meta[gts['model_list'][idx]][1]
            nocs = s0 * (nocs + T0)
        # map ambiguous rotation to canonical rotation
        if cat_id in self.sym_ids:
            rotation = gts['rotations'][idx]
            # assume continuous axis rotation symmetry
            theta_x = rotation[0, 0] + rotation[2, 2]
            theta_y = rotation[0, 2] - rotation[2, 0]
            r_norm = math.sqrt(theta_x**2 + theta_y**2)
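            # s_map is a rotation about the (vertical) symmetry axis by the
            # object's current yaw, so rotation @ s_map maps every rotation in
            # the symmetric family to the same canonical rotation; nocs is
            # rotated the same way to stay consistent with it.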
            s_map = np.array([[theta_x / r_norm, 0.0, -theta_y / r_norm],
                              [0.0, 1.0, 0.0],
                              [theta_y / r_norm, 0.0, theta_x / r_norm]])
            rotation = rotation @ s_map
            nocs = nocs @ s_map
        sRT = np.identity(4, dtype=np.float32)
        sRT[:3, :3] = scale * rotation
        sRT[:3, 3] = translation
        nocs = nocs.astype(np.float32)

        return points, points_pro, rgb, choose, cat_id, model, prior, sRT, nocs
Example no. 6
def detect():
    # resume model
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    estimator = DeformNet(opt.n_cat, opt.nv_prior)
    estimator.cuda()
    estimator.load_state_dict(torch.load(opt.model))
    estimator.eval()
    # get test data list
    img_list = [
        os.path.join(file_path.split('/')[0], line.rstrip('\n'))
        for line in open(os.path.join(opt.data_dir, file_path))
    ]
    # frame by frame test
    t_inference = 0.0
    t_umeyama = 0.0
    inst_count = 0
    img_count = 0
    t_start = time.time()
    for path in tqdm(img_list):
        img_path = os.path.join(opt.data_dir, path)
        raw_rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
        img = cv2.imread(img_path + '_color.png')
        raw_rgb = raw_rgb[:, :, ::-1]
        raw_depth = load_depth(img_path)
        # load detection results from Mask R-CNN or Mask R-CNN + fusion segmentation
        img_path_parsing = img_path.split('/')
        if opt.fusseg:
            if opt.data == 'val':
                mrcnn_path = os.path.join(
                    'results/mrcnn_results', 'CAMERA_val_fus_seg',
                    'results_{}_{}_{}.pkl'.format(
                        opt.data.split('_')[-1], img_path_parsing[-2],
                        img_path_parsing[-1]))
            else:
                mrcnn_path = os.path.join(
                    'results/mrcnn_results', 'Real_test_fus_seg',
                    'results_{}_{}_{}.pkl'.format(
                        opt.data.split('_')[-1], img_path_parsing[-2],
                        img_path_parsing[-1]))
        else:
            mrcnn_path = os.path.join(
                'results/mrcnn_results', opt.data,
                'results_{}_{}_{}.pkl'.format(
                    opt.data.split('_')[-1], img_path_parsing[-2],
                    img_path_parsing[-1]))
        with open(mrcnn_path, 'rb') as f:
            mrcnn_result = cPickle.load(f)
        num_insts = len(mrcnn_result['class_ids'])
        f_sRT = np.zeros((num_insts, 4, 4), dtype=float)
        f_size = np.zeros((num_insts, 3), dtype=float)
        # prepare frame data
        f_points, f_points_pro, f_rgb, f_choose, f_catId, f_prior = [], [], [], [], [], []
        valid_inst = []
        for i in range(num_insts):
            cat_id = mrcnn_result['class_ids'][i] - 1
            prior = mean_shapes[cat_id]
            rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][i])
            mask = np.logical_and(mrcnn_result['masks'][:, :, i],
                                  raw_depth > 0)
            choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
            # The background has no depth observation in the CAMERA dataset.
            # Because of how the bounding box is computed in get_bbox, there
            # may be no foreground points left after cropping with the mask,
            # e.g. when a Mask R-CNN false positive covers mostly background.
            if len(choose) < 32:
                f_sRT[i] = np.identity(4, dtype=float)
                f_size[i] = 2 * np.amax(np.abs(prior), axis=0)
                continue
            else:
                valid_inst.append(i)
            # process objects with valid depth observation
            if len(choose) > opt.n_pts:
                c_mask = np.zeros(len(choose), dtype=int)
                c_mask[:opt.n_pts] = 1
                np.random.shuffle(c_mask)
                choose = choose[c_mask.nonzero()]
            else:
                choose = np.pad(choose, (0, opt.n_pts - len(choose)), 'wrap')
            depth_masked = raw_depth[rmin:rmax,
                                     cmin:cmax].flatten()[choose][:,
                                                                  np.newaxis]
            xmap_masked = xmap[rmin:rmax,
                               cmin:cmax].flatten()[choose][:, np.newaxis]
            ymap_masked = ymap[rmin:rmax,
                               cmin:cmax].flatten()[choose][:, np.newaxis]
            pt2 = depth_masked / norm_scale
            pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
            pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
            points = np.concatenate((pt0, pt1, pt2), axis=1)
            # point normalization (optional)
            points_item = np.copy(points)
            if opt.points_process != '':
                points_fru, points_NP, points_SC = [], [], []
                if 'fru' in opt.points_process:
                    #get frustum angle
                    box2d_center = np.array([(cmin + cmax) / 2.0,
                                             (rmin + rmax) / 2.0])
                    center_depth = 1.0
                    center_x = (box2d_center[0] -
                                cam_cx) * center_depth / cam_fx
                    center_y = (cam_cy -
                                box2d_center[1]) * center_depth / cam_fy
                    frustum_angle_for_y = -1 * np.arctan2(
                        center_depth, center_x)
                    frustum_angle_for_x = -1 * np.arctan2(
                        (center_depth**2 + center_x**2)**0.5, center_y)
                    # Get point cloud
                    points_item = get_center_view_point_set(
                        points_item, frustum_angle_for_y, frustum_angle_for_x)
                    points_item = points_item.astype(np.float32)
                    points_fru = np.copy(points_item)
                if 'NP' in opt.points_process:
                    # normalize the points' coordinates
                    points_item = np.subtract(points_item,
                                              points_item.mean(axis=0))
                    points_item = points_item.astype(np.float32)
                    points_NP = np.copy(points_item)
                if 'SC' in opt.points_process:
                    x_coplanar = (np.unique(points_item[:, 0])).size == 1
                    y_coplanar = (np.unique(points_item[:, 1])).size == 1
                    z_coplanar = (np.unique(points_item[:, 2])).size == 1
                    if x_coplanar or y_coplanar or z_coplanar:
                        print('coplanar img_path: {}'.format(img_path))
                    # scale the points to the same size as the prior
                    else:
                        prior_pcd = o3d.geometry.PointCloud()
                        prior_pcd.points = o3d.utility.Vector3dVector(prior)
                        prior_box = prior_pcd.get_axis_aligned_bounding_box()
                        prior_extent = prior_box.get_extent()
                        points_item_pcd = o3d.geometry.PointCloud()
                        points_item_pcd.points = o3d.utility.Vector3dVector(
                            points_item)
                        points_item_box = points_item_pcd.get_oriented_bounding_box(
                        )
                        points_item_extent = points_item_box.extent
                        scale_ = np.linalg.norm(prior_extent) / np.linalg.norm(
                            points_item_extent)
                        box_points = points_item_box.get_box_points()
                        box_points_np = np.zeros((8, 3))
                        for k in range(8):  # avoid shadowing the instance index i
                            box_points_np[k] = box_points.pop()
                        points_item_box_center = np.mean(box_points_np, axis=0)
                        points_item_pcd.scale(scale_, points_item_box_center)
                        points_item = np.asarray(points_item_pcd.points)
                        points_item = points_item.astype(np.float32)
                    points_SC = np.copy(points_item)
                if opt.vis:
                    cv2.rectangle(img, (cmin, rmin), (cmax, rmax), (255, 0, 0),
                                  1)
                    cv2.imshow('image', img)
                    cv2.waitKey()
                    visual_points(points, points_fru, points_NP, points_SC,
                                  prior)
            points_pro = points_item
            # if opt.vis:
            #     cv2.rectangle(img, (cmin,rmin), (cmax,rmax), (255,0,0), 1)
            #     cv2.imshow('image', img)
            #     cv2.waitKey()
            #     visual_points(points_pro, prior)
            rgb = raw_rgb[rmin:rmax, cmin:cmax, :]
            rgb = cv2.resize(rgb, (opt.img_size, opt.img_size),
                             interpolation=cv2.INTER_LINEAR)
            rgb = norm_color(rgb)
            crop_w = rmax - rmin
            ratio = opt.img_size / crop_w
            col_idx = choose % crop_w
            row_idx = choose // crop_w
            choose = (np.floor(row_idx * ratio) * opt.img_size +
                      np.floor(col_idx * ratio)).astype(np.int64)
            # concatenate instances
            f_points.append(points)
            f_points_pro.append(points_pro)
            f_rgb.append(rgb)
            f_choose.append(choose)
            f_catId.append(cat_id)
            f_prior.append(prior)
        if len(valid_inst):
            f_points = torch.cuda.FloatTensor(f_points)
            f_points_pro = torch.cuda.FloatTensor(f_points_pro)
            f_rgb = torch.stack(f_rgb, dim=0).cuda()
            f_choose = torch.cuda.LongTensor(f_choose)
            f_catId = torch.cuda.LongTensor(f_catId)
            f_prior = torch.cuda.FloatTensor(f_prior)
            # inference
            torch.cuda.synchronize()
            t_now = time.time()
            assign_mat, deltas = estimator(f_points_pro, f_rgb, f_choose,
                                           f_catId, f_prior)
            # assign_mat, deltas = estimator(f_rgb, f_choose, f_catId, f_prior)
            inst_shape = f_prior + deltas
            assign_mat = F.softmax(assign_mat, dim=2)
            f_coords = torch.bmm(assign_mat, inst_shape)  # bs x n_pts x 3
            torch.cuda.synchronize()
            t_inference += (time.time() - t_now)
            f_coords = f_coords.detach().cpu().numpy()
            f_points = f_points.cpu().numpy()
            f_choose = f_choose.cpu().numpy()
            f_insts = inst_shape.detach().cpu().numpy()
            t_now = time.time()
            for i in range(len(valid_inst)):
                inst_idx = valid_inst[i]
                choose = f_choose[i]
                _, choose = np.unique(choose, return_index=True)
                nocs_coords = f_coords[i, choose, :]
                f_size[inst_idx] = 2 * np.amax(np.abs(f_insts[i]), axis=0)
                points = f_points[i, choose, :]
                _, _, _, pred_sRT = estimateSimilarityTransform(
                    nocs_coords, points)
                if pred_sRT is None:
                    pred_sRT = np.identity(4, dtype=float)
                f_sRT[inst_idx] = pred_sRT
            t_umeyama += (time.time() - t_now)
            img_count += 1
            inst_count += len(valid_inst)

        # save results
        result = {}
        with open(img_path + '_label.pkl', 'rb') as f:
            gts = cPickle.load(f)
        result['gt_class_ids'] = gts['class_ids']
        result['gt_bboxes'] = gts['bboxes']
        result['gt_RTs'] = gts['poses']
        result['gt_scales'] = gts['size']
        result['gt_handle_visibility'] = gts['handle_visibility']

        result['pred_class_ids'] = mrcnn_result['class_ids']
        result['pred_bboxes'] = mrcnn_result['rois']
        result['pred_scores'] = mrcnn_result['scores']
        result['pred_RTs'] = f_sRT
        result['pred_scales'] = f_size

        image_short_path = '_'.join(img_path_parsing[-3:])
        save_path = os.path.join(opt.result_dir,
                                 'results_{}.pkl'.format(image_short_path))
        with open(save_path, 'wb') as f:
            cPickle.dump(result, f)
    # write statistics
    fw = open('{0}/eval_logs.txt'.format(opt.result_dir), 'w')
    messages = []
    messages.append("Total images: {}".format(len(img_list)))
    messages.append(
        "Valid images: {},  Total instances: {},  Average: {:.2f}/image".
        format(img_count, inst_count, inst_count / img_count))
    messages.append("Inference time: {:06f}  Average: {:06f}/image".format(
        t_inference, t_inference / img_count))
    messages.append("Umeyama time: {:06f}  Average: {:06f}/image".format(
        t_umeyama, t_umeyama / img_count))
    messages.append("Total time: {:06f}".format(time.time() - t_start))
    for msg in messages:
        print(msg)
        fw.write(msg + '\n')
    fw.close()
Example no. 7
    def __getitem__(self, index):
        img_path = os.path.join(self.data_dir, self.img_list[index])
        rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
        rgb = rgb[:, :, ::-1]
        depth = load_depth(img_path)
        mask = cv2.imread(img_path + '_mask.png')[:, :, 2]
        coord = cv2.imread(img_path + '_coord.png')[:, :, :3]
        coord = coord[:, :, (2, 1, 0)]
        coord = np.array(coord, dtype=np.float32) / 255
        coord[:, :, 2] = 1 - coord[:, :, 2]
        with open(img_path + '_label.pkl', 'rb') as f:
            gts = cPickle.load(f)
        if 'CAMERA' in img_path.split('/'):
            cam_fx, cam_fy, cam_cx, cam_cy = self.camera_intrinsics
        else:
            cam_fx, cam_fy, cam_cx, cam_cy = self.real_intrinsics

        # select one foreground object
        idx = random.randint(0, len(gts['instance_ids'])-1)
        inst_id = gts['instance_ids'][idx]
        rmin, rmax, cmin, cmax = get_bbox(gts['bboxes'][idx])
        # sample points
        mask = np.equal(mask, inst_id)
        mask = np.logical_and(mask, depth > 0)
        choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
        if len(choose) > self.n_pts:
            c_mask = np.zeros(len(choose), dtype=int)
            c_mask[:self.n_pts] = 1
            np.random.shuffle(c_mask)
            choose = choose[c_mask.nonzero()]
        else:
            choose = np.pad(choose, (0, self.n_pts-len(choose)), 'wrap')
        depth_masked = depth[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
        xmap_masked = self.xmap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
        ymap_masked = self.ymap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
        pt2 = depth_masked / self.norm_scale
        pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
        pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
        points = np.concatenate((pt0, pt1, pt2), axis=1)
        nocs = coord[rmin:rmax, cmin:cmax, :].reshape((-1, 3))[choose, :] - 0.5
        # resize cropped image to standard size and adjust 'choose' accordingly
        rgb = rgb[rmin:rmax, cmin:cmax, :]
        rgb = cv2.resize(rgb, (self.img_size, self.img_size), interpolation=cv2.INTER_LINEAR)
        crop_w = rmax - rmin
        ratio = self.img_size / crop_w
        col_idx = choose % crop_w
        row_idx = choose // crop_w
        choose = (np.floor(row_idx * ratio) * self.img_size + np.floor(col_idx * ratio)).astype(np.int64)
        # label
        cat_id = gts['class_ids'][idx] - 1    # convert to 0-indexed
        model = self.models[gts['model_list'][idx]].astype(np.float32)     # 1024 points
        prior = self.mean_shapes[cat_id].astype(np.float32)
        scale = gts['scales'][idx]
        rotation = gts['rotations'][idx]
        translation = gts['translations'][idx]
        # data augmentation
        if self.mode == 'train':
            # color jitter
            rgb = self.colorjitter(Image.fromarray(np.uint8(rgb)))
            rgb = np.array(rgb)
            # point shift
            add_t = np.random.uniform(-self.shift_range, self.shift_range, (1, 3))
            translation = translation + add_t[0]
            # point jitter
            add_t = add_t + np.clip(0.001*np.random.randn(points.shape[0], 3), -0.005, 0.005)
            points = np.add(points, add_t)
        rgb = self.transform(rgb)
        points = points.astype(np.float32)
        # adjust nocs coords for mug category
        if cat_id == 5:
            T0 = self.mug_meta[gts['model_list'][idx]][0]
            s0 = self.mug_meta[gts['model_list'][idx]][1]
            nocs = s0 * (nocs + T0)
        # map ambiguous rotation to canonical rotation
        if cat_id in self.sym_ids:
            rotation = gts['rotations'][idx]
            # assume continuous axis rotation symmetry
            theta_x = rotation[0, 0] + rotation[2, 2]
            theta_y = rotation[0, 2] - rotation[2, 0]
            r_norm = math.sqrt(theta_x**2 + theta_y**2)
            s_map = np.array([[theta_x/r_norm, 0.0, -theta_y/r_norm],
                              [0.0,            1.0,  0.0           ],
                              [theta_y/r_norm, 0.0,  theta_x/r_norm]])
            rotation = rotation @ s_map
            nocs = nocs @ s_map
        sRT = np.identity(4, dtype=np.float32)
        sRT[:3, :3] = scale * rotation
        sRT[:3, 3] = translation
        nocs = nocs.astype(np.float32)

        return points, rgb, choose, cat_id, model, prior, sRT, nocs
Example no. 8
def detect():
    # resume model
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    estimator = DeformNet(opt.n_cat, opt.nv_prior)
    estimator.cuda()
    estimator.load_state_dict(torch.load(opt.model))
    estimator.eval()
    # get test data list
    img_list = [
        os.path.join(file_path.split('/')[0], line.rstrip('\n'))
        for line in open(os.path.join(opt.data_dir, file_path))
    ]
    # TODO: test, chamfer distance
    chamferD = ChamferLoss()
    cd_num = torch.zeros(6)
    prior_cd = torch.zeros(6)
    deform_cd = torch.zeros(6)
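    # Per-category accumulators (6 categories): instance counts, chamfer
    # distance from the mean-shape prior to the GT model, and from the
    # deformed reconstruction to the GT model.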
    for path in tqdm(img_list):
        img_path = os.path.join(opt.data_dir, path)
        raw_rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
        raw_rgb = raw_rgb[:, :, ::-1]
        raw_depth = load_depth(img_path)
        # load mask-rcnn detection results
        img_path_parsing = img_path.split('/')
        mrcnn_path = os.path.join(
            'results/mrcnn_results', opt.data, 'results_{}_{}_{}.pkl'.format(
                opt.data.split('_')[-1], img_path_parsing[-2],
                img_path_parsing[-1]))
        with open(mrcnn_path, 'rb') as f:
            mrcnn_result = cPickle.load(f)
        with open(img_path + '_label.pkl', 'rb') as f:
            gts = cPickle.load(f)
        num_insts = len(mrcnn_result['class_ids'])
        f_sRT = np.zeros((num_insts, 4, 4), dtype=float)
        f_size = np.zeros((num_insts, 3), dtype=float)
        # prepare frame data
        f_points, f_rgb, f_choose, f_catId, f_prior, f_model = [], [], [], [], [], []
        valid_inst = []
        for i in range(num_insts):
            cat_id = mrcnn_result['class_ids'][i] - 1
            prior = mean_shapes[cat_id]
            rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][i])
            mask = np.logical_and(mrcnn_result['masks'][:, :, i],
                                  raw_depth > 0)
            choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
            # The background has no depth observation in the CAMERA dataset.
            # Because of how the bounding box is computed in get_bbox, there
            # may be no foreground points left after cropping with the mask,
            # e.g. when a Mask R-CNN false positive covers mostly background.
            if len(choose) < 32:
                f_sRT[i] = np.identity(4, dtype=float)
                f_size[i] = 2 * np.amax(np.abs(prior), axis=0)
                continue
            else:
                valid_inst.append(i)
            # process objects with valid depth observation
            if len(choose) > opt.n_pts:
                c_mask = np.zeros(len(choose), dtype=int)
                c_mask[:opt.n_pts] = 1
                np.random.shuffle(c_mask)
                choose = choose[c_mask.nonzero()]
            else:
                choose = np.pad(choose, (0, opt.n_pts - len(choose)), 'wrap')
            depth_masked = raw_depth[rmin:rmax,
                                     cmin:cmax].flatten()[choose][:,
                                                                  np.newaxis]
            xmap_masked = xmap[rmin:rmax,
                               cmin:cmax].flatten()[choose][:, np.newaxis]
            ymap_masked = ymap[rmin:rmax,
                               cmin:cmax].flatten()[choose][:, np.newaxis]
            pt2 = depth_masked / norm_scale
            pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
            pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
            points = np.concatenate((pt0, pt1, pt2), axis=1)
            rgb = raw_rgb[rmin:rmax, cmin:cmax, :]
            rgb = cv2.resize(rgb, (opt.img_size, opt.img_size),
                             interpolation=cv2.INTER_LINEAR)
            rgb = norm_color(rgb)
            crop_w = rmax - rmin
            ratio = opt.img_size / crop_w
            col_idx = choose % crop_w
            row_idx = choose // crop_w
            choose = (np.floor(row_idx * ratio) * opt.img_size +
                      np.floor(col_idx * ratio)).astype(np.int64)
            # concatenate instances
            try:
                idx_gt = np.argwhere(gts['class_ids'] - 1 == cat_id).item()
            except ValueError:  # no single GT instance of this category
                valid_inst.remove(i)
                continue
            model = models[gts['model_list'][idx_gt]].astype(
                np.float32)  # 1024 points
            f_model.append(model)
            f_points.append(points)
            f_rgb.append(rgb)
            f_choose.append(choose)
            f_catId.append(cat_id)
            f_prior.append(prior)
        if len(valid_inst):
            f_points = torch.cuda.FloatTensor(f_points)
            f_rgb = torch.stack(f_rgb, dim=0).cuda()
            f_choose = torch.cuda.LongTensor(f_choose)
            f_catId = torch.cuda.LongTensor(f_catId)
            f_prior = torch.cuda.FloatTensor(f_prior)
            f_model = torch.cuda.FloatTensor(f_model)
            # inference
            torch.cuda.synchronize()
            assign_mat, deltas = estimator(f_points, f_rgb, f_choose, f_catId,
                                           f_prior)
            # assign_mat, deltas = estimator(f_rgb, f_choose, f_catId, f_prior)
            # reconstruction points
            inst_shape = f_prior + deltas.detach()
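            # The reconstructed instance shape is the categorical prior
            # deformed by the predicted per-vertex deltas; below it is
            # compared to the ground-truth model with the chamfer distance.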

            for i in range(len(valid_inst)):
                prior_loss, _, _ = chamferD(f_prior[i].unsqueeze(0),
                                            f_model[i].unsqueeze(0))
                deform_loss, _, _ = chamferD(inst_shape[i].unsqueeze(0),
                                             f_model[i].unsqueeze(0))

                idx = f_catId[i]
                cd_num[idx] += 1
                prior_cd[idx] += prior_loss.item()
                deform_cd[idx] += deform_loss.item()

    deform_cd_metric = (deform_cd / cd_num) * 1000
    print(
        "recon: {:.2f} , {:.2f} , {:.2f} , {:.2f} , {:.2f} , {:.2f} , {:.2f}".
        format(deform_cd_metric[0], deform_cd_metric[1], deform_cd_metric[2],
               deform_cd_metric[3], deform_cd_metric[4], deform_cd_metric[5],
               torch.mean(deform_cd_metric)))
    prior_cd_metric = (prior_cd / cd_num) * 1000
    print(
        "prior: {:.2f} , {:.2f} , {:.2f} , {:.2f} , {:.2f} , {:.2f} , {:.2f}".
        format(prior_cd_metric[0], prior_cd_metric[1], prior_cd_metric[2],
               prior_cd_metric[3], prior_cd_metric[4], prior_cd_metric[5],
               torch.mean(prior_cd_metric)))
def seg_maskrcnnresults():
    classifier = FusionInstanceSeg(n_classes=opt.n_cat)
    if opt.model != '':
        classifier.load_state_dict(torch.load(opt.model))
    classifier.cuda()
    classifier = classifier.eval()

    if opt.dataset == 'Real':
        file_path = os.path.join(opt.dataset, 'test_list.txt')
        cam_fx, cam_fy, cam_cx, cam_cy = 591.0125, 590.16775, 322.525, 244.11084
        result_dir = 'results/mrcnn_results/{}_test_fus_seg'.format(
            opt.dataset)
    else:
        file_path = os.path.join(opt.dataset, 'val_list.txt')
        cam_fx, cam_fy, cam_cx, cam_cy = 577.5, 577.5, 319.5, 239.5
        result_dir = 'results/mrcnn_results/{}_val_fus_seg'.format(opt.dataset)
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    norm_scale = 1000.0
    norm_color = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    xmap = np.array([[i for i in range(640)] for j in range(480)])
    ymap = np.array([[j for i in range(640)] for j in range(480)])
    # get test data list
    img_list = [
        os.path.join(file_path.split('/')[0], line.rstrip('\n'))
        for line in open(os.path.join(opt.data_dir, file_path))
    ]

    total_count = np.zeros((opt.n_cat, ), dtype=int)
    acc = np.zeros((opt.n_cat, ), dtype=float)  #accuracy
    pcs = np.zeros((opt.n_cat, ), dtype=float)  #precision
    rcal = np.zeros((opt.n_cat, ), dtype=float)  #recall
    all_dtc_num = 0
    no_gt_num = 0

    t_start = time.time()
    for path in tqdm(img_list):
        img_path = os.path.join(opt.data_dir, path)
        raw_rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
        raw_rgb = raw_rgb[:, :, ::-1]
        depth = load_depth(img_path)
        #load label
        with open(img_path + '_label.pkl', 'rb') as f:
            gts = cPickle.load(f)
        gt_mask = cv2.imread(img_path + '_mask.png')[:, :, 2]
        gt_num_insts = len(gts['class_ids'])
        gt_class_ids = gts['class_ids']

        # load mask-rcnn detection results
        img_path_parsing = img_path.split('/')
        mrcnn_path = os.path.join(
            'results/mrcnn_results', opt.data, 'results_{}_{}_{}.pkl'.format(
                opt.data.split('_')[-1], img_path_parsing[-2],
                img_path_parsing[-1]))
        with open(mrcnn_path, 'rb') as f:
            mrcnn_result = cPickle.load(f)
        num_insts = len(mrcnn_result['class_ids'])
        mrcnn_class_ids = mrcnn_result['class_ids']
        f_mask = np.zeros((num_insts, depth.shape[0], depth.shape[1]),
                          dtype=int)
        # prepare frame data
        f_points, f_rgb, f_choose, f_catId = [], [], [], []
        f_raw_choose = []
        valid_inst = []
        result = {}

        for i in range(num_insts):
            cat_id = mrcnn_result['class_ids'][i] - 1
            rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][i])

            # sample points
            depth_valid = depth > 0
            choose_depth = depth_valid[rmin:rmax,
                                       cmin:cmax].flatten().nonzero()[0]
            if len(choose_depth) < 32:
                continue
            else:
                valid_inst.append(i)
            # process objects with valid depth observation
            if len(choose_depth) > opt.n_pts:
                c_mask = np.zeros(len(choose_depth), dtype=int)
                c_mask[:opt.n_pts] = 1
                np.random.shuffle(c_mask)
                choose_depth = choose_depth[c_mask.nonzero()]
            else:
                choose_depth = np.pad(choose_depth,
                                      (0, opt.n_pts - len(choose_depth)),
                                      'wrap')

            depth_masked = depth[rmin:rmax,
                                 cmin:cmax].flatten()[choose_depth][:,
                                                                    np.newaxis]
            xmap_masked = xmap[rmin:rmax,
                               cmin:cmax].flatten()[choose_depth][:,
                                                                  np.newaxis]
            ymap_masked = ymap[rmin:rmax,
                               cmin:cmax].flatten()[choose_depth][:,
                                                                  np.newaxis]
            pt2 = depth_masked / norm_scale
            pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
            pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
            points = np.concatenate((pt0, pt1, pt2), axis=1)
            # Get frustum angle (according to center pixel in 2D BOX)
            box2d_center = np.array([(cmin + cmax) / 2.0, (rmin + rmax) / 2.0])
            depth_center = 1.0
            x_center = (box2d_center[0] - cam_cx) * depth_center / cam_fx
            y_center = (cam_cy - box2d_center[1]) * depth_center / cam_fy
            angle_y = -1 * np.arctan2(depth_center, x_center)
            angle_x = -1 * np.arctan2(
                (depth_center**2 + x_center**2)**0.5, y_center)
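            # NOTE: angle_y / angle_x give the azimuth / elevation of the ray
            # through the 2D box center at a nominal depth of 1.0; they are
            # used below to rotate the points into a frustum-centered frame
            # (in the spirit of Frustum PointNets).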

            # Get point cloud in the frustum-centered view
            points = get_center_view_point_set(
                points, angle_y, angle_x)  # (n, 3) points after frustum rotation
            rgb = raw_rgb[rmin:rmax, cmin:cmax, :]
            rgb = cv2.resize(rgb, (opt.img_size, opt.img_size),
                             interpolation=cv2.INTER_LINEAR)
            rgb = norm_color(rgb)
            crop_w = rmax - rmin
            ratio = opt.img_size / crop_w
            col_idx = choose_depth % crop_w
            row_idx = choose_depth // crop_w
            raw_choose = np.copy(choose_depth)
            choose_depth = (np.floor(row_idx * ratio) * opt.img_size +
                            np.floor(col_idx * ratio)).astype(np.int64)
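            # NOTE: choose_depth is remapped from indices into the flattened
            # bbox window to indices into the resized img_size x img_size RGB
            # crop; this assumes get_bbox returns a square window, so that
            # crop_w == rmax - rmin == cmax - cmin.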
            f_points.append(points)
            f_rgb.append(rgb)
            f_catId.append(cat_id)
            f_choose.append(choose_depth)
            f_raw_choose.append(raw_choose)
        if len(valid_inst):
            f_points = torch.cuda.FloatTensor(f_points)
            f_rgb = torch.stack(f_rgb, dim=0).cuda()
            f_catId = torch.cuda.LongTensor(f_catId)
            f_one_hot_vec = F.one_hot(f_catId, opt.n_cat)
            f_choose = torch.cuda.LongTensor(f_choose)
            f_points = f_points.transpose(2, 1)

            logits = classifier(f_points, f_rgb, f_one_hot_vec, f_choose)
            logits_choice = logits.data.max(2)[1]
            logits_np = logits_choice.cpu().data.numpy()
            f_choose = f_choose.cpu().numpy()
            for i in range(len(valid_inst)):
                inst_idx = valid_inst[i]
                choose_depth = f_choose[i]
                raw_choose = f_raw_choose[i]
                logits_np_inst = logits_np[i]
                choose_logits_np = logits_np_inst.nonzero()
                rmin, rmax, cmin, cmax = get_bbox(
                    mrcnn_result['rois'][inst_idx])
                roi_mask = np.zeros(((rmax - rmin) * (cmax - cmin)), dtype=int)
                roi_mask[raw_choose[choose_logits_np]] = 1
                roi_mask = roi_mask.reshape((rmax - rmin, cmax - cmin))
                f_mask[inst_idx][rmin:rmax, cmin:cmax] = roi_mask
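                # NOTE: the per-point foreground prediction is scattered back
                # into a full-resolution binary mask for this instance.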
                all_dtc_num += 1
                map_to_gt = []
                for j in range(len(gt_class_ids)):
                    if gt_class_ids[j] != mrcnn_class_ids[inst_idx]:
                        continue
                    pred_box = [cmin, rmin, cmax, rmax]
                    rmin2, rmax2, cmin2, cmax2 = get_bbox(gts['bboxes'][j])
                    gt_box = [cmin2, rmin2, cmax2, rmax2]
                    iou = cal_iou(pred_box, gt_box)
                    if iou < opt.iou_thd:
                        continue
                    # match found
                    map_to_gt.append(np.array([j, iou]))
                if len(map_to_gt) == 0:
                    no_gt_num += 1
                else:
                    max_iou_idx = np.argmax(np.array(map_to_gt)[:, 1])
                    j = int(map_to_gt[max_iou_idx][0])
                    gt_mask_ins = gt_mask == gts['instance_ids'][j]
                    gt_roi_mask = gt_mask_ins[rmin:rmax, cmin:cmax]
                    raw_choose, choose_raw_choose = np.unique(
                        raw_choose, return_index=True)
                    gt_logits = gt_roi_mask.flatten()[raw_choose]
                    logits_bias = logits_np_inst[
                        choose_raw_choose] == gt_logits
                    logits_TP = np.logical_and(
                        logits_np_inst[choose_raw_choose], gt_logits)
                    correct_seg_num = np.sum(np.array(logits_bias))
                    TP_seg_num = np.sum(np.array(logits_TP))
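                    # NOTE (per-instance metrics over the unique sampled points):
                    # acc_ins  - fraction of points whose predicted label matches
                    #            the ground-truth mask,
                    # pcs_ins  - TP / predicted foreground points (precision),
                    # rcal_ins - TP / ground-truth foreground points (recall).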
                    acc_ins = correct_seg_num / len(raw_choose)
                    pcs_ins = TP_seg_num / np.sum(
                        logits_np_inst[choose_raw_choose])
                    rcal_ins = TP_seg_num / np.sum(gt_logits)
                    total_count[mrcnn_class_ids[inst_idx] - 1] += 1
                    acc[mrcnn_class_ids[inst_idx] - 1] += acc_ins
                    pcs[mrcnn_class_ids[inst_idx] - 1] += pcs_ins
                    rcal[mrcnn_class_ids[inst_idx] - 1] += rcal_ins

        result['class_ids'] = mrcnn_result['class_ids']
        result['rois'] = mrcnn_result['rois']
        result['scores'] = mrcnn_result['scores']
        result['masks'] = (f_mask.transpose(1, 2, 0) > 0)
        if opt.save_pkl:
            save_path = os.path.join(
                result_dir, 'results_{}_{}_{}.pkl'.format(
                    opt.data.split('_')[-1], img_path_parsing[-2],
                    img_path_parsing[-1]))
            with open(save_path, 'wb') as f:
                cPickle.dump(result, f)
    # compute per-category accuracy, precision, and recall
    catId_to_name = {
        0: 'bottle',
        1: 'bowl',
        2: 'camera',
        3: 'can',
        4: 'laptop',
        5: 'mug'
    }
    acc, pcs, rcal = 100 * (acc / total_count), 100 * (
        pcs / total_count), 100 * (rcal / total_count)
    overall_acc, overall_pcs, overall_rcal = np.mean(acc), np.mean(
        pcs), np.mean(rcal)
    no_gt_ratio = 100 * (no_gt_num / all_dtc_num)
    with open('{0}/seg_acc_pcs.txt'.format(result_dir), 'a') as fw:
        messages = []
        messages.append('segmentation results:')
        messages.append('{:>12s}{:>12s}{:>12s}{:>12s}'.format(
            'category', 'accuracy', 'precision', 'recall'))
        for i in range(acc.shape[0]):
            messages.append("{:>12s}{:>12.2f}{:>12.2f}{:>12.2f}".format(
                catId_to_name[i], acc[i], pcs[i], rcal[i]))
        messages.append("{:>12s}{:>12.2f}{:>12.2f}{:>12.2f}".format(
            'overall', overall_acc, overall_pcs, overall_rcal))
        messages.append("{:>12s}{:>12.2f}".format('mismatch', no_gt_ratio))
        for msg in messages:
            print(msg)
            fw.write(msg + '\n')
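

# --- NOTE (added) ----------------------------------------------------------
# `cal_iou` (and `get_center_view_point_set`) are referenced above but not
# defined in this excerpt. The sketch below is an assumed implementation of
# `cal_iou` for axis-aligned [x1, y1, x2, y2] boxes, matching how it is called
# with pred_box/gt_box above; the original helper may differ.
def cal_iou_sketch(box_a, box_b):
    """Intersection-over-union of two axis-aligned [x1, y1, x2, y2] boxes."""
    x1 = max(box_a[0], box_b[0])
    y1 = max(box_a[1], box_b[1])
    x2 = min(box_a[2], box_b[2])
    y2 = min(box_a[3], box_b[3])
    inter = max(0, x2 - x1) * max(0, y2 - y1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    # a detection is matched above only when IoU >= opt.iou_thd
    return inter / union if union > 0 else 0.0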