def imshow_postive_anchors(images, anchors, annotations):
    """Visualize the anchors assigned as positive (best IoU >= 0.5) on each image in the batch."""
    import matplotlib.pyplot as plt
    import cv2
    import numpy as np
    import torch

    batch_size = images.size()[0]
    for i in range(batch_size):
        image = images[i, :, :, :]
        anno = annotations[i, :, :]
        # drop the [-1, -1, -1, -1, -1] rows added to pad annotations within the batch
        anno = anno[anno[:, 0] != -1]
        # IoU of every anchor against every ground-truth box: [num_anchors, num_gt]
        iou = cal_iou(anchors[:, :], anno[:, :-1])
        iou_max, iou_max_ind = torch.max(iou, dim=1)
        # anchors whose best IoU is at least 0.5 are treated as positive
        pos_ind = torch.ge(iou_max, 0.5)
        pos_anchors = anchors[pos_ind, :]
        print('positive anchor number:', pos_anchors.size())
    
        unnormalize = UnNormalizer()
        image = 255 * unnormalize(image)
        image = torch.clamp(image, min=0, max=255).data.numpy()
        image = np.transpose(image, (1, 2, 0)).astype(np.uint8)

        for x1, y1, x2, y2 in pos_anchors:
            x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) 
            image = cv2.rectangle(image, (x1, y1), (x2, y2), (0,0,255), 1)

        print(image.shape)
        plt.figure() 
        image = image[:,:,[2,1,0]]
        plt.imshow(image)
        plt.show()
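None of these examples include cal_iou itself. For the PyTorch examples (this one and the focal-loss forward below), the call sites imply a pairwise [N, 4] x [M, 4] -> [N, M] IoU matrix over (x1, y1, x2, y2) boxes; a minimal sketch under that assumption follows (pairwise_iou is a hypothetical name, not the original helper).

import torch

def pairwise_iou(boxes_a, boxes_b):
    # boxes_a: [N, 4], boxes_b: [M, 4], both as (x1, y1, x2, y2)
    area_a = (boxes_a[:, 2] - boxes_a[:, 0]) * (boxes_a[:, 3] - boxes_a[:, 1])   # [N]
    area_b = (boxes_b[:, 2] - boxes_b[:, 0]) * (boxes_b[:, 3] - boxes_b[:, 1])   # [M]
    # intersection corners, broadcast to [N, M, 2]
    lt = torch.max(boxes_a[:, None, :2], boxes_b[None, :, :2])
    rb = torch.min(boxes_a[:, None, 2:], boxes_b[None, :, 2:])
    wh = torch.clamp(rb - lt, min=0)
    inter = wh[..., 0] * wh[..., 1]                                              # [N, M]
    union = area_a[:, None] + area_b[None, :] - inter
    return inter / torch.clamp(union, min=1e-8)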
Example #2
    def validate(self, image, y, name):
        # temp = tf.map_fn(lambda x : self.image_encoder(x, training = False), image)

        self.state = self.image_encoder(image[:, 0, :, :, :], training = False) # shape = [batch_size, 1024]
        used = [0]
        
        for _t in range(params.time_steps - 1):
            prob = self.actor(self.state).numpy()

            action = utils.choose_action(prob, used)
            used.append(action)

            append_state = self.image_encoder(image[:, action, :, :, :], training = False)
            # merge the newly selected view into the state by element-wise max pooling
            self.state = tf.reduce_max(tf.stack([self.state, append_state], axis = 1), axis = 1)

        voxel = self.generator(self.state, training = False)

        voxel = utils.dicide_voxel(voxel)
        utils.save_voxel(voxel, '{}_pridict'.format(name))
        utils.save_voxel(y, '{}_true'.format(name))

        y = y[0]
        y = np.argmax(y, -1)
        voxel = np.argmax(voxel, -1)

        iou = utils.cal_iou(y, voxel)
        print('iou = ', iou)
Example #3
def preprocess_gt_boxes(gt_boxes, grid_shapes):
    """
    args:
        gt_boxes : [m][x1,y1,x2,y2,cls]  np.array  m is gt_box numbers ,x1y1 is left up coord,x2y2 is right bottom coord
    return:
        y_true:list of array , like [(52,52,3,(4+4+1)+cls),(26,26,3,(4+4+1)+cls),(13,13,3,(4+4+1)+cls)]
        4+4+1 : 4 is tx ty tw th , second 4 is gt_centerx gt_centery gt_w gt_h  , second 4 is for calc iou 
    """
    y_true = [
        np.zeros(
            (grid_shapes[l][0], grid_shapes[l][1],
             len(config.yolo_layer_anchor[l]), 4 + 4 + 1 + config.num_classes),
            dtype='float32') for l in range(config.num_layers)
    ]

    orign_gt_boxes = np.copy(gt_boxes[:, 0:4])
    orign_gt_boxes = utils.convert_boxes_to_origin(orign_gt_boxes)

    anchors = np.array(config.anchors)
    orign_anchors = np.zeros((anchors.shape[0], 4))

    #print('anchors:',config.anchors)

    # anchors hold (w, h) pairs; build [0, 0, w, h] boxes from them
    orign_anchors[:, 2:4] = anchors
    orign_anchors = utils.convert_boxes_to_origin(orign_anchors)

    for id, ogt in enumerate(orign_gt_boxes):
        gt_box = gt_boxes[id]
        # 1. find the anchor with the highest IoU for this gt_box
        iou = utils.cal_iou(ogt, orign_anchors)
        best_anchor = anchors[np.argmax(iou, axis=-1)]

        #print('gt_box:',gt_box,'ogt_box:',ogt,'best_anchor:',best_anchor,'o_anchors',orign_anchors)

        # 2. which layer and which anchor index the best anchor belongs to
        lindx, aindx = config.yolo_anchor_layerIndex[tuple(best_anchor)]
        # 3. the gt_box's cell position on this layer's grid and the offsets tx, ty
        grid_shape = config.stride[lindx]  # despite the name, this is the stride of layer lindx
        py, px, ty, tx = utils.cal_box_offset_pos(gt_box, grid_shape)
        # 4. compute gt_center_x, gt_center_y, gt_w, gt_h
        gt_w, gt_h = (gt_box[2] - gt_box[0], gt_box[3] - gt_box[1])
        gt_center_x = (px + tx) * grid_shape
        gt_center_y = (py + ty) * grid_shape
        assert gt_w > 0 and gt_h > 0, 'gt_box w,h <= 0'
        # 5. compute tw, th
        anchor_w, anchor_h = best_anchor
        tw = np.log(gt_w / anchor_w)
        th = np.log(gt_h / anchor_h)
        cls = int(gt_box[-1])
        aindx = int(aindx)
        lindx = int(lindx)
        y_true[lindx][py, px, aindx, 0:4] = (tx, ty, tw, th)
        y_true[lindx][py, px, aindx,
                      4:8] = (gt_center_x, gt_center_y, gt_w, gt_h)
        y_true[lindx][py, px, aindx, 8] = 1
        y_true[lindx][py, px, aindx, 9 + cls] = 1

        #print('gt_box;',gt_box,gt_center_x,gt_center_y,gt_w,gt_h)

    return y_true
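A small numeric sketch of the target encoding above, with illustrative values (a stride of 8 and an anchor of (32, 64) are assumptions, not taken from config):

import numpy as np

gt_box = np.array([20.0, 30.0, 60.0, 110.0])        # (x1, y1, x2, y2)
stride, anchor_w, anchor_h = 8, 32.0, 64.0

gt_w, gt_h = gt_box[2] - gt_box[0], gt_box[3] - gt_box[1]      # 40, 80
gt_cx, gt_cy = gt_box[0] + gt_w / 2, gt_box[1] + gt_h / 2      # 40, 70
px, py = int(gt_cx // stride), int(gt_cy // stride)            # grid cell (5, 8)
tx, ty = gt_cx / stride - px, gt_cy / stride - py              # 0.0, 0.75
tw, th = np.log(gt_w / anchor_w), np.log(gt_h / anchor_h)      # ~0.223, ~0.223
# y_true[layer][py, px, anchor_idx, 0:4] would then hold (tx, ty, tw, th),
# channels 4:8 hold (gt_cx, gt_cy, gt_w, gt_h), and channel 8 the objectness flag.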
Example #4
def create_landmark(argument=True):
    # whether to augment the images
    # argument = True
    image_id = 0

    ftxt = os.path.join(data_dir, 'trainImageList.txt')
    data = get_landmark(ftxt, data_dir)
    idx = 0
    landmark_list = []

    for (imgPath, box, landmarkGt) in tqdm(data):
        # if image_id > 3:
        #     break
        # store face crops and landmarks
        F_imgs = []
        F_landmarks = []
        # cv2.imread takes a read flag, not a color-conversion code; convert explicitly
        img = cv2.cvtColor(cv2.imread(imgPath), cv2.COLOR_BGR2RGB)

        img_h, img_w, img_c = img.shape
        gt_box = np.array([box.left, box.top, box.right, box.bottom])
        # face crop
        f_face = img[box.top:box.bottom + 1, box.left:box.right + 1]
        # resize to the network input size
        f_face = cv2.resize(f_face, (size, size))

        landmark = np.zeros((5, 2))
        for index, one in enumerate(landmarkGt):
            # landmark offset relative to the top-left corner, normalized by box size
            rv = ((one[0] - gt_box[0]) / (gt_box[2] - gt_box[0]),
                  (one[1] - gt_box[1]) / (gt_box[3] - gt_box[1]))
            landmark[index] = rv
        F_imgs.append(f_face)
        F_landmarks.append(landmark.reshape(10))
        landmark = np.zeros((5, 2))
        if argument:
            # augment the image
            idx = idx + 1
            x1, y1, x2, y2 = gt_box
            gt_w = x2 - x1 + 1
            gt_h = y2 - y1 + 1
            # skip faces that are too small or out of bounds
            if max(gt_w, gt_h) < 40 or x1 < 0 or y1 < 0:
                continue
            for i in range(10):
                # random crop size
                box_size = npr.randint(int(min(gt_w, gt_h) * 0.8),
                                       np.ceil(1.25 * max(gt_w, gt_h)))
                # random offset for the top-left corner
                delta_x = npr.randint(-gt_w * 0.2, gt_w * 0.2)
                delta_y = npr.randint(-gt_h * 0.2, gt_h * 0.2)
                # compute the new top-left corner
                nx1 = int(max(x1 + gt_w / 2 - box_size / 2 + delta_x, 0))
                ny1 = int(max(y1 + gt_h / 2 - box_size / 2 + delta_y, 0))
                nx2 = nx1 + box_size
                ny2 = ny1 + box_size
                # skip crops that go beyond the image border
                if nx2 > img_w or ny2 > img_h:
                    continue
                # crop box and image patch
                crop_box = np.array([nx1, ny1, nx2, ny2])
                cropped_im = img[ny1:ny2 + 1, nx1:nx2 + 1, :]
                resized_im = cv2.resize(cropped_im, (size, size))
                iou = cal_iou(crop_box, np.expand_dims(gt_box, 0))
                # keep only positive crops
                if iou > 0.65:
                    F_imgs.append(resized_im)
                    # landmark offsets relative to the crop
                    for index, one in enumerate(landmarkGt):
                        rv = ((one[0] - nx1) / box_size,
                              (one[1] - ny1) / box_size)
                        landmark[index] = rv
                    F_landmarks.append(landmark.reshape(10))
                    landmark = np.zeros((5, 2))
                    landmark_ = F_landmarks[-1].reshape(-1, 2)
                    box = BBox([nx1, ny1, nx2, ny2])
                    # mirror (horizontal flip)
                    if random.choice([0, 1]) > 0:
                        face_flipped, landmark_flipped = flip(
                            resized_im, landmark_)
                        face_flipped = cv2.resize(face_flipped, (size, size))
                        F_imgs.append(face_flipped)
                        F_landmarks.append(landmark_flipped.reshape(10))
                    # rotate counter-clockwise
                    if random.choice([0, 1]) > 0:
                        face_rotated_by_alpha, landmark_rorated = rotate(
                            img, box, box.reprojectLandmark(landmark_), 5)
                        # re-project landmarks into the crop
                        landmark_rorated = box.projectLandmark(
                            landmark_rorated)
                        face_rotated_by_alpha = cv2.resize(
                            face_rotated_by_alpha, (size, size))
                        F_imgs.append(face_rotated_by_alpha)
                        F_landmarks.append(landmark_rorated.reshape(10))

                        # horizontal flip
                        face_flipped, landmark_flipped = flip(
                            face_rotated_by_alpha, landmark_rorated)
                        face_flipped = cv2.resize(face_flipped, (size, size))
                        F_imgs.append(face_flipped)
                        F_landmarks.append(landmark_flipped.reshape(10))
                    # rotate clockwise
                    if random.choice([0, 1]) > 0:
                        face_rotated_by_alpha, landmark_rorated = rotate(
                            img, box, box.reprojectLandmark(landmark_), -5)
                        # re-project landmarks into the crop
                        landmark_rorated = box.projectLandmark(
                            landmark_rorated)
                        face_rotated_by_alpha = cv2.resize(
                            face_rotated_by_alpha, (size, size))
                        F_imgs.append(face_rotated_by_alpha)
                        F_landmarks.append(landmark_rorated.reshape(10))

                        # horizontal flip
                        face_flipped, landmark_flipped = flip(
                            face_rotated_by_alpha, landmark_rorated)
                        face_flipped = cv2.resize(face_flipped, (size, size))
                        F_imgs.append(face_flipped)
                        F_landmarks.append(landmark_flipped.reshape(10))
        F_imgs, F_landmarks = np.asarray(F_imgs), np.asarray(F_landmarks)

        for i in range(len(F_imgs)):
            # keep only samples whose landmark offsets lie within (0, 1)
            if np.sum(np.where(F_landmarks[i] <= 0, 1, 0)) > 0:
                continue
            if np.sum(np.where(F_landmarks[i] >= 1, 1, 0)) > 0:
                continue
            landmark_list.append([F_imgs[i], F_landmarks[i]])

            image_id += 1

    print("landmark数量:", image_id)
    return landmark_list
Example #5
def cal_loss(img, y_preds, ground_truths, categories):
    '''
    :param img: 3D array like (W, H, C)
    :param y_preds: ndarray like (S*S, B, 5 + class_num)
    :param ground_truths: JSON-like object, e.g.
    {
        'large_vehicle':[[(341, 292),...,(346, 457), 0], [(341, 292),...,(346, 457), 1]...],
        'small_vehicle':[[(341, 292),...,(346, 457), 0], [(341, 292),...,(346, 457), 0]...]
    }
    :param categories: list of category names like [small_vehicle, ...]
    :return: total loss
    '''
    loss = 0
    class_num = len(categories)
    W, H, C = img.shape
    S, _, B = int(np.sqrt(y_preds.shape[0])), int(np.sqrt(
        y_preds.shape[0])), int(y_preds.shape[1])
    ### define parameters lambda_coord, lambda_noob
    lambda_coord, lambda_noob = 0.5, 0.5

    ### 1. loss on the predicted center coordinates
    ground_truths_centeroid_idx = confirm_cell_index(
        (W, H), S, cal_centeroid(ground_truths))
    for k in ground_truths_centeroid_idx.keys():
        idxs = ground_truths_centeroid_idx[k]
        for i, idx in enumerate(idxs):
            idx_x, idx_y = idx
            idx_flat = idx_x * S + idx_y
            res_bbox = y_preds[idx_flat, :, :]
            ground_truth_box = ground_truths[k][i]
            ious = []
            boxes = res_bbox
            boxes = np.array(boxes).reshape((B, class_num + 5))
            for box in boxes:
                x_pred, y_pred, w_pred, h_pred, conf, class_prob_array = box[
                    0], box[1], box[2], box[3], box[4], box[5:]
                # decode this box's prediction for the current cell idx
                pred_position = decode_position(
                    img, cell=idx, x=x_pred, y=y_pred, w=w_pred, h=h_pred, S=S)
                iou = cal_iou(ground_truth_box, pred_position)
                ious.append(iou)
        confirm_iou_idx = np.argmax(ious)
        # pick the predicted box (not the IoU value) with the highest IoU
        confirm_box = boxes[confirm_iou_idx]
        confirm_box = np.array(confirm_box).reshape(-1, )
        box_pts = decode_position(img,
                                  cell=idx,
                                  x=confirm_box[0],
                                  y=confirm_box[1],
                                  w=confirm_box[2],
                                  h=confirm_box[3],
                                  S=S)
        ground_truth_box_pts = ground_truth_box[0]

        ### center-point and bounding-box loss
        centerAndBox_loss = cal_centeroid_loss(lambda_coord,
                                               ground_truth_box_pts, box_pts)

        ### classification loss
        assert k in categories
        k_idx = categories.index(k)
        true_prob_array = np.zeros((class_num, ))
        true_prob_array[k_idx] = 1
        class_loss = calculateMSE(true_prob_array, class_prob_array)

        ### confidence loss
        ground_truth_box_pts_idx = [
            confirm_cell_index((W, H), S, p)
            for p in np.array(ground_truth_box_pts)
        ]
        ground_truth_box_pts_idx_1d = [
            twoD2oneD(pt, S) for pt in ground_truth_box_pts_idx
        ]

        confidence_true = np.zeros(shape=(S, S))
        confidence_true[ground_truth_box_pts_idx_1d] = 1

        confidence_pred = np.array(y_preds[:, confirm_iou_idx, 4]).reshape(
            (S, S))
        confidence_loss = calculateMSE(confidence_true, confidence_pred)
        # accumulate the loss over all ground-truth objects
        loss += centerAndBox_loss + class_loss + confidence_loss
    return loss
Example #6
def save_hard_example(save_size, save_dir, data_gt, det_boxes):
    """将网络识别的box用来裁剪原图像作为下一个网络的输入"""
    img_list = data_gt['images']
    gt_boxes_list = data_gt['boxes']
    num_of_images = len(img_list)

    assert len(det_boxes) == num_of_images, "det_boxes and images do not match"

    n_idx = 0
    p_idx = 0
    d_idx = 0
    image_done = 0

    positive_list = []
    negative_list = []
    part_list = []

    for img, dets, gts in tqdm(zip(img_list, det_boxes, gt_boxes_list)):
        gts = np.array(gts, dtype=np.float32).reshape(-1, 4)
        image_done += 1

        if dets is None or dets.shape[0] == 0:
            continue
        # img = cv2.imread(im_idx)
        # convert detected boxes to squares
        dets = convert_to_square(dets)
        dets[:, 0:4] = np.round(dets[:, 0:4])
        neg_num = 0
        for box in dets:
            x_left, y_top, x_right, y_bottom, _ = box.astype(int)
            width = x_right - x_left + 1
            height = y_bottom - y_top + 1

            # skip boxes that are too small or fall outside the image
            if width < 20 or x_left < 0 or y_top < 0 or x_right > img.shape[1] - 1 or y_bottom > img.shape[0] - 1:
                continue

            iou = cal_iou(box, gts)
            cropped_im = img[y_top:y_bottom + 1, x_left:x_right + 1, :]
            resized_im = cv2.resize(cropped_im, (save_size, save_size),
                                    interpolation=cv2.INTER_LINEAR)

            # assign the sample type by IoU
            if np.max(iou) < 0.3 and neg_num < 60:
                negative_list.append(resized_im)
                n_idx += 1
                neg_num += 1
            else:
               
                idx = np.argmax(iou)
                assigned_gt = gts[idx]
                x1, y1, x2, y2 = assigned_gt

                # regression offsets relative to the detected box
                offset_x1 = (x1 - x_left) / float(width)
                offset_y1 = (y1 - y_top) / float(height)
                offset_x2 = (x2 - x_right) / float(width)
                offset_y2 = (y2 - y_bottom) / float(height)
                roi = np.array([float(offset_x1), float(offset_y1), float(offset_x2), float(offset_y2)])

                # positive and part samples
                if np.max(iou) >= 0.65:
                    positive_list.append([resized_im, roi])
                    p_idx += 1

                elif np.max(iou) >= 0.4:
                    part_list.append([resized_im, roi])
                    d_idx += 1

    print('%s images processed, pos: %s  part: %s  neg: %s' % (image_done, p_idx, d_idx, n_idx))

    base_num = 100000
    if len(negative_list) > base_num * 3:
        neg_keep = npr.choice(len(negative_list), size=base_num * 3, replace=True)
        negative_list = np.asarray(negative_list)[neg_keep]

    sum_p = len(negative_list) // 3
    pos_keep = npr.choice(len(positive_list), sum_p, replace=True)
    part_keep = npr.choice(len(part_list), sum_p, replace=True)

    positive_list = np.asarray(positive_list)[pos_keep]
    part_list = np.asarray(part_list)[part_keep]
    print('neg: {}  pos: {}  part: {}'.format(len(negative_list), len(pos_keep), len(part_keep)))

    create_h5_box(positive_list, filename=save_dir + '/positive.h5')
    create_h5_box(part_list, filename=save_dir + '/part.h5')
    create_h5_img(negative_list, filename=save_dir + '/negative.h5')
Example #7
    def forward(self, classification, localization, anchors, annotations):
        batch_size = classification.size()[0]
        cls_losses = []
        loc_losses = []

        for i in range(batch_size):
            pred_cls = classification[i, :, :]
            pred_loc = localization[i, :, :]
            anno = annotations[i, :, :]
            # drop the [-1, -1, -1, -1, -1] rows added so every sample in the batch has the same shape
            anno = anno[anno[:, 0] != -1]

            # first handle the case where there are no annotations at all
            if anno.size()[0] == 0:
                if cuda:
                    cls_losses.append(torch.tensor(0).float().cuda())
                    loc_losses.append(torch.tensor(0).float().cuda())
                else:
                    cls_losses.append(torch.tensor(0).float())
                    loc_losses.append(torch.tensor(0).float())
                continue
            '''
            Before taking log() of a value, clamp it first so that extremely
            small values do not turn the result into NaN.
            '''
            # Cross-entropy involves log(); values in pred_cls too close to 0 or 1
            # would make the cross-entropy NaN, so clamp them to a safe range.
            pred_cls = torch.clamp(pred_cls, min=1e-4, max=1-1e-4)

            iou = cal_iou(anchors[:, :], anno[:, :-1])

            # for each anchor, find the maximum IoU over all annotations and record which annotation achieves it
            # iou: [n, m]    iou_max: [n, ]
            iou_max, iou_max_ind = torch.max(iou, dim=1)

            '''
            Compute the classification loss.

            Quoting 'Focal Loss for Dense Object Detection':
            'Specifically, anchors are assigned to ground-truth object boxes 
            using an intersection-over-union (IoU) threshold of 0.5; and to 
            background if their IoU is in [0, 0.4). As each anchor is assigned 
            to at most one object box, we set the corresponding entry in its 
            length K label vector to 1 and all other entries to 0. If an anchor 
            is unassigned, which may happen with overlap in [0.4, 0.5), it is 
            ignored during training.'
            '''
            # Stores the class target assigned to each anchor: positives get a
            # one-hot encoding (filled in below), negatives get all zeros, and
            # ignored anchors get all -1.
            anchors_onehot = torch.ones(pred_cls.size()) * -1 # [-1, 80]
            if cuda:
                anchors_onehot = anchors_onehot.cuda()
            '''
            torch.lt(input, other, out=None)
            : element-wise comparison input < other
            :param input (Tensor): tensor to compare
            :param other (Tensor or float): tensor or float to compare against
            :param out (Tensor, optional): output tensor
            '''
            # negative anchors get an all-zero encoding
            anchors_onehot[torch.lt(iou_max, 0.4), :] = 0
            '''
            torch.ge(input, other, out=None)
            : element-wise comparison input >= other (torch.gt() checks input > other)
            :param input (Tensor): tensor to compare
            :param other (Tensor or float): tensor or float to compare against
            :param out (Tensor, optional): output tensor
            '''
            pos_ind = torch.ge(iou_max, 0.5)
            pos_num = pos_ind.sum()
            # print('positive anchor number:', pos_num)
            # the ground-truth box and class assigned to each anchor (by best IoU),
            # shape: [-1, 5],    [x1, y1, x2, y2, cls]
            gt = anno[iou_max_ind, :] 

            # positive anchors use a one-hot encoding
            anchors_onehot[pos_ind, :] = 0
            anchors_onehot[pos_ind, gt[pos_ind, -1].long()] = 1

            if cuda:
                alpha = torch.ones(anchors_onehot.size()).cuda() * self.alpha
            else:
                alpha = torch.ones(anchors_onehot.size()) * self.alpha
            '''
            torch.where(condition, x, y) -> Tensor
            Element-wise: returns the value from x where condition holds,
            and the value from y otherwise.
            '''
            alpha = torch.where(anchors_onehot.eq(1), alpha, 1 - alpha)
            pt = torch.where(anchors_onehot.eq(1), pred_cls, 1 - pred_cls)
            # focal_weight = alpha(1-pt)^gamma
            focal_weight = alpha * torch.pow((1 - pt), self.gamma)
            # binary cross-entropy
            bce_loss = -1 * ( \
                anchors_onehot * torch.log(pred_cls) + \
                (1 - anchors_onehot) * torch.log(1 - pred_cls) \
                )
            cls_loss = focal_weight * bce_loss 
            # losses from ignored anchors must be removed from cls_loss
            if cuda:
                cls_loss = torch.where(
                    torch.eq(anchors_onehot, -1), 
                    torch.zeros(cls_loss.size()).cuda(), 
                    cls_loss
                )
            else:
                cls_loss = torch.where(
                    torch.eq(anchors_onehot, -1), 
                    torch.zeros(cls_loss.size()), 
                    cls_loss
                )
            cls_losses.append(cls_loss.sum() / torch.clamp(pos_num.float(), min=1.0))
            # print(cls_losses) 

            '''
            Compute the localization loss.
            '''
            if pos_num <= 0:
                # if there are no positive anchors, set loc_loss to 0
                if cuda:
                    loc_losses.append(torch.tensor(0).float().cuda())
                else:
                    loc_losses.append(torch.tensor(0).float())
            else:
                anchors_w = anchors[:, 2] - anchors[:, 0]
                anchors_h = anchors[:, 3] - anchors[:, 1]
                anchors_cx = anchors[:, 0] + 0.5 * anchors_w
                anchors_cy = anchors[:, 1] + 0.5 * anchors_h
                # during backpropagation, only positive anchors are trained
                pos_w = anchors_w[pos_ind]
                pos_h = anchors_h[pos_ind]
                pos_cx = anchors_cx[pos_ind]
                pos_cy = anchors_cy[pos_ind]

                pos_gt = gt[pos_ind, :]
                gt_w = pos_gt[:, 2] - pos_gt[:, 0]
                gt_h = pos_gt[:, 3] - pos_gt[:, 1]
                gt_cx = pos_gt[:, 0] + 0.5 * gt_w
                gt_cy = pos_gt[:, 1] + 0.5 * gt_h 
                '''
                Before taking log() of a value, clamp it first so that extremely
                small values do not turn the result into NaN.
                '''
                # Likewise, loc_loss also involves log(); if gt_w or gt_h were too
                # small, the final loc_loss would become NaN.
                gt_w = torch.clamp(gt_w, min=1)
                gt_h = torch.clamp(gt_h, min=1)

                # the box-regression offsets the network has to learn
                dx = (gt_cx - pos_cx) / pos_w 
                dy = (gt_cy - pos_cy) / pos_h 
                dw = torch.log(gt_w / pos_w)
                dh = torch.log(gt_h / pos_h)

                d_stack = torch.stack((dx, dy, dw, dh)) 
                d_stack = d_stack.t()  # transpose
                '''
                Quoting 'Focal Loss for Dense Object Detection':
                'The training loss is the sum the focal loss and the standard 
                smooth L1 loss used for box regression [10].'
                Therefore we use the smooth L1 loss for loc_loss.
                '''
                if cuda:
                    d_stack = d_stack / torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda()
                else:
                    d_stack = d_stack / torch.Tensor([[0.1, 0.1, 0.2, 0.2]]) 
                loc_loss = torch.abs(d_stack - pred_loc[pos_ind, :])
                '''
                torch.le(input, other, out=None)
                : element-wise comparison input <= other
                :param input (Tensor): tensor to compare
                :param other (Tensor or float): tensor or float to compare against
                :param out (Tensor, optional): output tensor
                '''
                loc_loss = torch.where(
                    torch.le(loc_loss, 1.0 / 9.0),
                    0.5 * 9.0 * torch.pow(loc_loss, 2),
                    loc_loss - 0.5 / 9.0
                )
                loc_losses.append(loc_loss.mean())
        
        cls_loss = torch.stack(cls_losses).mean(dim=0, keepdim=True)
        loc_loss = torch.stack(loc_losses).mean(dim=0, keepdim=True)
        # print(cls_loss)
        # print(loc_loss)

        return cls_loss, loc_loss
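A quick numeric check of the focal weighting used above, with the common defaults alpha = 0.25 and gamma = 2 assumed here (the real values come from self.alpha and self.gamma):

import torch

alpha, gamma = 0.25, 2.0
# pt is the predicted probability of the assigned target; easy examples (pt close
# to 1) get a tiny weight, hard examples keep a weight close to alpha.
pt = torch.tensor([0.9, 0.5, 0.1])
print(alpha * (1 - pt) ** gamma)   # tensor([0.0025, 0.0625, 0.2025])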
Example #8
def main(args):

    # Class category of PASCAL VOC that the RL agent will be searching for
    device = torch.device("cuda:0" if (torch.cuda.is_available() and args.use_gpu) else "cpu")
    image_names = np.array(load_images_names_in_data_set('aeroplane_trainval', path_voc))
    feature_exactrator = torchvision.models.vgg16(pretrained=True).features.to(device)
    single_plane_image_names = []
    single_plane_image_gts = []
    dqn = DQN(device)
    EPISILO = args.EPISILO


    for image_name in image_names:
        annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc)
        if(len(annotation)>1):
            continue
        single_plane_image_names.append(image_name)
        single_plane_image_gts.append(annotation[0][1:])        #[[x1,x2,y1,y2] ...]

    trans = T.Compose([
        T.Resize((224,224)),
        T.ToTensor(),
    ])


    for i in range(epochs):
        ep_reward = 0
        for index, image_name in enumerate(single_plane_image_names):
            image_path = os.path.join(path_voc + "JPEGImages", image_name + ".jpg")
            image_original = Image.open(image_path)
            width, height = image_original.size
            #image_original = image_original.resize((224,224))
            bbx_gt = single_plane_image_gts[index]
            #draw = ImageDraw.Draw(image_original)
            #draw.rectangle([bbx_gt[0],bbx_gt[2],bbx_gt[1],bbx_gt[3]],outline='red')
            #image_original.show()
            #return

            image = init_process(image_original, trans).to(device)
            #print(image.shape)
            bbx = [0, width, 0, height]
            history_action = np.zeros(his_actions*NUM_ACTIONS)
            with torch.no_grad():
                vector = feature_exactrator(image).cpu().detach().numpy().reshape(7*7*512)
            state = np.concatenate([history_action, vector])
            step = 0
            while(step<10):
                iou = cal_iou(bbx, bbx_gt)
                if iou>0.5:
                    action = 5
                else:
                    action = dqn.choose_action(state, EPISILO)
                #print(action)

                #execute action and step to new bbx
                new_bbx = update_bbx(bbx, action)
                reward = reward_func(bbx, new_bbx, bbx_gt, action)

                #get new state
                action_vec = np.zeros(NUM_ACTIONS)
                action_vec[action] = 1.0
                history_action = np.concatenate([history_action[NUM_ACTIONS:], action_vec])

                with torch.no_grad():
                    vector = feature_exactrator(inter_process(image_original,new_bbx,trans).to(device)).cpu().detach().numpy().reshape(7*7*512)
                next_state = np.concatenate([history_action,vector])

                #store transition
                dqn.store_transition(state, action, reward, next_state)

                ep_reward += reward

                if dqn.memory_counter >= MEMORY_CAPACITY:
                    print("episode: {},".format(i),end=' ')
                    dqn.learn()

                # termination
                if action==5:
                    break

                state = next_state
                bbx = new_bbx
                step += 1

        if (EPISILO>0.1):
            EPISILO -= 0.1
        print("episode: {} , this epoch reward is {}".format(i, round(ep_reward, 3)))  # 0.001 precision
Example #9
def yolo_loss(y_true, y_pred):
    """

    :param y_true: [batch_size, 7, 7, 25]
    :param y_pred: [batch_size, 7, 7, 30]
    :return:
    """
    # class labels
    _classes = y_pred[..., 10:]
    classes = y_true[..., 5:]
    # (batch_size, 7, 7, 2)
    _confidences = y_pred[..., 8:10]
    # (batch_size, 7, 7, 1)
    confidences = y_true[..., 4:5]

    # (batch_size, 7, 7, 4)
    bboxes = y_true[..., 0:4]
    # (batch_size, 7, 7, 1, 4)
    bboxes = tf.reshape(bboxes, (-1, cfg.CELL_SIZE, cfg.CELL_SIZE, 1, 4))
    _bboxes = y_pred[..., 0:8]
    # (batch_size, 7, 7, 2, 4)
    _bboxes = tf.reshape(_bboxes, (-1, cfg.CELL_SIZE, cfg.CELL_SIZE, cfg.B, 4))

    grid_x = tf.range(cfg.CELL_SIZE, dtype=tf.float32)
    grid_y = tf.range(cfg.CELL_SIZE, dtype=tf.float32)
    grid_x, grid_y = tf.meshgrid(grid_x, grid_y)
    x_offset = tf.reshape(grid_x, (-1, 1))
    y_offset = tf.reshape(grid_y, (-1, 1))
    x_y_offset = tf.concat([x_offset, y_offset], axis=-1)
    x_y_offset = tf.cast(
        tf.reshape(x_y_offset, [cfg.CELL_SIZE, cfg.CELL_SIZE, 1, 2]),
        tf.float32)

    # map _bboxes back to full-grid (normalized image) coordinates
    _bboxes_normal = tf.stack([
        (_bboxes[..., 0] + x_y_offset[..., 0]) / cfg.CELL_SIZE,
        (_bboxes[..., 1] + x_y_offset[..., 1]) / cfg.CELL_SIZE,
        tf.square(_bboxes[..., 2]),
        tf.square(_bboxes[..., 3]),
    ],
                              axis=-1)

    # bboxes_ious: (n, 7, 7, 2)
    bboxes_ious = cal_iou(_bboxes_normal, bboxes)
    object_mask = tf.reduce_max(bboxes_ious, axis=-1, keep_dims=True)
    # the j-th bbox in the i-th cell is responsible for the object
    object_mask = tf.cast(bboxes_ious >= object_mask,
                          dtype=tf.float32) * confidences
    noobject_mask = tf.ones_like(object_mask, dtype=tf.float32) - object_mask

    # _bboxes[..., 0:2] = (_bboxes[..., 0:2] + x_y_offset) / cfg.CELL_SIZE
    # bboxes = bboxes[..., 0:2] * cfg.CELL_SIZE - x_y_offset
    # bboxes = tf.sqrt(bboxes[..., 2:4])
    bboxes_normal = tf.stack([
        bboxes[..., 0] * cfg.CELL_SIZE - x_y_offset[..., 0],
        bboxes[..., 1] * cfg.CELL_SIZE - x_y_offset[..., 1],
        tf.sqrt(bboxes[..., 2]),
        tf.sqrt(bboxes[..., 3]),
    ],
                             axis=-1)

    object_delta = object_mask * (_confidences - bboxes_ious)
    object_loss = tf.reduce_mean(
        tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3
                                                     ])) * cfg.OBJECT_SCALE

    onobject_delta = noobject_mask * _confidences
    nobject_loss = tf.reduce_mean(
        tf.reduce_sum(tf.square(onobject_delta), axis=[1, 2, 3
                                                       ])) * cfg.NOOBJECT_SCALE

    # classification loss
    cls_delta = confidences * (classes - _classes)
    cls_loss = tf.reduce_mean(
        tf.reduce_sum(tf.square(cls_delta), axis=[1, 2, 3])) * cfg.CLASS_SCALE

    # bounding-box loss
    bbox_mask = tf.expand_dims(object_mask, axis=-1)
    bboxes_xy_delta = bbox_mask * (_bboxes[..., 0:2] - bboxes_normal[..., 0:2])
    bboxes_wh_delta = bbox_mask * (_bboxes[..., 2:4] - bboxes_normal[..., 2:4])
    bboxes_loss = tf.reduce_mean(tf.reduce_sum(tf.square(bboxes_xy_delta), axis=[1, 2, 3, 4])) * cfg.BBOX_SCALE + \
                  tf.reduce_mean(tf.reduce_sum(tf.square(bboxes_wh_delta), axis=[1, 2, 3, 4])) * cfg.BBOX_SCALE
    total_loss = cls_loss + object_loss + nobject_loss + bboxes_loss
    return total_loss
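For reference, a standalone sketch of how the x_y_offset grid above maps a cell-relative center prediction back to normalized image coordinates (TF2 eager mode, with a toy CELL_SIZE = 3 for brevity):

import tensorflow as tf

CELL_SIZE = 3
grid_x, grid_y = tf.meshgrid(tf.range(CELL_SIZE, dtype=tf.float32),
                             tf.range(CELL_SIZE, dtype=tf.float32))
x_y_offset = tf.reshape(
    tf.concat([tf.reshape(grid_x, (-1, 1)), tf.reshape(grid_y, (-1, 1))], axis=-1),
    [CELL_SIZE, CELL_SIZE, 1, 2])

# a predicted center offset of 0.5 in cell (row=1, col=2) lands at
# ((2 + 0.5) / 3, (1 + 0.5) / 3) = (0.833, 0.5) in normalized image coordinates
print((0.5 + x_y_offset[1, 2, 0]) / CELL_SIZE)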