Example no. 1
class SiamMask(nn.Module):
    def __init__(self, anchors=None, o_sz=127, g_sz=127):
        super(SiamMask, self).__init__()
        self.anchors = anchors  # anchor_cfg
        self.anchor_num = len(self.anchors["ratios"]) * len(self.anchors["scales"])
        self.anchor = Anchors(anchors)
        self.features = None
        self.rpn_model = None
        self.mask_model = None
        self.o_sz = o_sz
        self.g_sz = g_sz
        self.all_anchors = None

    def set_all_anchors(self, image_center, size):
        # cx,cy,w,h
        if not self.anchor.generate_all_anchors(image_center, size):
            return
        all_anchors = self.anchor.all_anchors[1]  # cx, cy, w, h
        self.all_anchors = torch.from_numpy(all_anchors).float().cuda()
        self.all_anchors = [self.all_anchors[i] for i in range(4)]

    def feature_extractor(self, x):
        return self.features(x)

    def rpn(self, template, search):
        pred_cls, pred_loc = self.rpn_model(template, search)
        return pred_cls, pred_loc

    def mask(self, template, search):
        pred_mask = self.mask_model(template, search)
        return pred_mask

    def template(self, z):
        self.zf = self.feature_extractor(z)
        cls_kernel, loc_kernel = self.rpn_model.template(self.zf)
        return cls_kernel, loc_kernel

    def track(self, x, cls_kernel=None, loc_kernel=None, softmax=False):
        xf = self.feature_extractor(x)
        rpn_pred_cls, rpn_pred_loc = self.rpn_model.track(xf, cls_kernel, loc_kernel)
        if softmax:
            rpn_pred_cls = self.softmax(rpn_pred_cls)
        return rpn_pred_cls, rpn_pred_loc
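
For reference, the anchor_num computed in __init__ is just the cross product of the configured ratios and scales. A minimal sketch with an illustrative anchor config (the concrete values are assumptions, not taken from this snippet):

anchor_cfg = {"stride": 8, "ratios": [0.33, 0.5, 1, 2, 3], "scales": [8]}
anchor_num = len(anchor_cfg["ratios"]) * len(anchor_cfg["scales"])
print(anchor_num)  # -> 5 anchors per spatial position of the RPN output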
Example no. 2
class DataSets(Dataset):
    def __init__(self, cfg, anchor_cfg, num_epoch=1):
        super(DataSets, self).__init__()
        global logger
        logger = logging.getLogger('global')

        # anchors
        self.anchors = Anchors(anchor_cfg)
        # size
        self.template_size = 127
        self.origin_size = 127
        self.search_size = 255
        self.size = 17
        self.base_size = 0
        self.crop_size = 0
        # override the defaults from the config file
        if 'template_size' in cfg:
            self.template_size = cfg['template_size']
        if 'origin_size' in cfg:
            self.origin_size = cfg['origin_size']
        if 'search_size' in cfg:
            self.search_size = cfg['search_size']
        if 'base_size' in cfg:
            self.base_size = cfg['base_size']
        if 'size' in cfg:
            self.size = cfg['size']

        if (self.search_size - self.template_size) / self.anchors.stride + 1 + self.base_size != self.size:
            raise Exception("size not match!")  # TODO: calculate size online
        if 'crop_size' in cfg:
            self.crop_size = cfg['crop_size']
        self.template_small = False
        if 'template_small' in cfg and cfg['template_small']:
            self.template_small = True
        # generate all anchors
        self.anchors.generate_all_anchors(im_c=self.search_size//2, size=self.size)
        if 'anchor_target' not in cfg:
            cfg['anchor_target'] = {}
        # layer that builds the per-anchor targets: cls, reg, mask
        self.anchor_target = AnchorTargetLayer(cfg['anchor_target'])

        # data sets
        if 'datasets' not in cfg:
            raise(Exception('DataSet need "{}"'.format('datasets')))

        self.all_data = []
        start = 0
        self.num = 0
        for name in cfg['datasets']:
            dataset = cfg['datasets'][name]
            dataset['mark'] = name
            dataset['start'] = start
            # load the sub-dataset
            dataset = SubDataSet(dataset)
            dataset.log()
            self.all_data.append(dataset)

            start += dataset.num  # real video number
            self.num += dataset.num_use  # the number used for subset shuffle

        # data augmentation
        aug_cfg = cfg['augmentation']
        self.template_aug = Augmentation(aug_cfg['template'])
        self.search_aug = Augmentation(aug_cfg['search'])
        self.gray = aug_cfg['gray']
        self.neg = aug_cfg['neg']
        self.inner_neg = 0 if 'inner_neg' not in aug_cfg else aug_cfg['inner_neg']

        self.pick = None  # list to save id for each img
        if 'num' in cfg:  # number used in training for all dataset
            self.num = int(cfg['num'])
        self.num *= num_epoch
        self.shuffle()

        self.infos = {
                'template': self.template_size,
                'search': self.search_size,
                'template_small': self.template_small,
                'gray': self.gray,
                'neg': self.neg,
                'inner_neg': self.inner_neg,
                'crop_size': self.crop_size,
                'anchor_target': self.anchor_target.__dict__,
                'num': self.num // num_epoch
                }
        logger.info('dataset information: \n{}'.format(json.dumps(self.infos, indent=4)))

    def imread(self, path):
        # read the image
        img = cv2.imread(path)
        if self.origin_size == self.template_size:
            # no rescaling needed
            return img, 1.0

        def map_size(exe, size):
            return int(round(((exe + 1) / (self.origin_size + 1) * (size+1) - 1)))
        # target size after resizing
        nsize = map_size(self.template_size, img.shape[1])
        # compute the scale before resizing; after cv2.resize, img.shape[1] == nsize
        scale = nsize / img.shape[1]
        # resize the image
        img = cv2.resize(img, (nsize, nsize))
        # return the image and its scale factor
        return img, scale

    def shuffle(self):
        "shuffle sample indices across all sub-datasets"
        pick = []
        m = 0
        # keep drawing until at least self.num indices are collected
        while m < self.num:
            p = []
            for subset in self.all_data:
                sub_p = subset.shuffle()
                p += sub_p
            # shuffle this round of indices
            sample_random.shuffle(p)
            # append the shuffled round
            pick += p
            m = len(pick)
        # store the shuffled indices
        self.pick = pick
        logger.info("shuffle done!")
        logger.info("dataset length {}".format(self.num))

    def __len__(self):
        return self.num

    def find_dataset(self, index):
        "map a global index to (sub-dataset, local index)"
        for dataset in self.all_data:
            if dataset.start + dataset.num > index:
                # index falls inside this sub-dataset's range
                return dataset, index - dataset.start

    def __getitem__(self, index, debug=False):
        # find the index in the shuffled list
        index = self.pick[index]
        # resolve the sub-dataset and its local index
        dataset, index = self.find_dataset(index)
        # grayscale flag
        gray = self.gray and self.gray > random.random()
        # negative-sample flag
        neg = self.neg and self.neg > random.random()
        if neg:
            # get the template
            template = dataset.get_random_target(index)
            # per config, draw the negative from this dataset or from a random dataset
            if self.inner_neg and self.inner_neg > random.random():
                search = dataset.get_random_target()
            else:
                search = random.choice(self.all_data).get_random_target()
        else:
            # get a positive pair
            template, search = dataset.get_positive_pair(index)

        # crop the central size x size region of the image
        def center_crop(img, size):
            # image width (crops are square)
            shape = img.shape[1]
            # already the requested size: return as-is
            if shape == size: return img
            # otherwise crop a size x size window around the center
            c = shape // 2
            l = c - size // 2
            r = c + size // 2 + 1
            return img[l:r, l:r]
        # read the template image
        template_image, scale_z = self.imread(template[0])
        # with the small-template setting, crop the template image down
        if self.template_small:
            template_image = center_crop(template_image, self.template_size)
        # read the search image
        search_image, scale_x = self.imread(search[0])
        # mask available and not a negative sample
        if dataset.has_mask and not neg:
            # read the mask
            search_mask = (cv2.imread(search[2], 0) > 0).astype(np.float32)
        else:
            # otherwise use an all-zero mask
            search_mask = np.zeros(search_image.shape[:2], dtype=np.float32)
        # if a crop size is configured, crop the search image and mask
        if self.crop_size > 0:
            search_image = center_crop(search_image, self.crop_size)
            search_mask = center_crop(search_mask, self.crop_size)
        # build a bbox from the image size; `shape` is the annotated box of the target
        def toBBox(image, shape):
            # image size
            imh, imw = image.shape[:2]
            # width/height of the annotation
            if len(shape) == 4:
                w, h = shape[2]-shape[0], shape[3]-shape[1]
            else:
                w, h = shape
            # context ratio
            context_amount = 0.5
            # exemplar size
            exemplar_size = self.template_size  # 127
            # context-padded width/height
            wc_z = w + context_amount * (w+h)
            hc_z = h + context_amount * (w+h)
            # equivalent square side
            s_z = np.sqrt(wc_z * hc_z)
            # scale factor
            scale_z = exemplar_size / s_z
            # rescaled width/height
            w = w*scale_z
            h = h*scale_z
            # center coordinates
            cx, cy = imw//2, imh//2
            bbox = center2corner(Center(cx, cy, w, h))
            return bbox
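        # Worked numbers for toBBox (illustrative): a 100x80 annotation gives
        # wc_z = 190, hc_z = 170, s_z = sqrt(190*170) ~= 179.7,
        # scale_z = 127/179.7 ~= 0.707, so the box becomes roughly 70.7 x 56.6,
        # centered in the crop.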
        # bboxes in the template and search images
        template_box = toBBox(template_image, template[1])
        search_box = toBBox(search_image, search[1])
        # template augmentation
        template, _, _ = self.template_aug(template_image, template_box, self.template_size, gray=gray)
        # search-image augmentation
        search, bbox, mask = self.search_aug(search_image, search_box, self.search_size, gray=gray, mask=search_mask)

        # def draw(image, box, name):
        #     image = image.copy()
        #     x1, y1, x2, y2 = map(lambda x: int(round(x)), box)
        #     cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0))
        #     cv2.imwrite(name, image)
        #
        # if debug:
        #     draw(template_image, template_box, "debug/{:06d}_ot.jpg".format(index))
        #     draw(search_image, search_box, "debug/{:06d}_os.jpg".format(index))
        #     draw(template, _, "debug/{:06d}_t.jpg".format(index))
        #     draw(search, bbox, "debug/{:06d}_s.jpg".format(index))
        # build the anchor targets: cls, reg and their weights
        cls, delta, delta_weight = self.anchor_target(self.anchors, bbox, self.size, neg)
        if dataset.has_mask and not neg:
            # mask loss weight from the positive anchor map
            mask_weight = cls.max(axis=0, keepdims=True)
        else:
            mask_weight = np.zeros([1, cls.shape[1], cls.shape[2]], dtype=np.float32)
        # HWC -> CHW float32 for template and search crops
        template, search = map(lambda x: np.transpose(x, (2, 0, 1)).astype(np.float32), [template, search])
        # binarize the mask to {-1, +1} targets
        mask = (np.expand_dims(mask, axis=0) > 0.5) * 2 - 1  # 1*H*W
        # return all training targets
        return template, search, cls, delta, delta_weight, np.array(bbox, np.float32), \
               np.array(mask, np.float32), np.array(mask_weight, np.float32)
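
A minimal sketch of how a DataSets instance is typically wired into training, assuming cfg and anchor_cfg dicts loaded from the project's JSON config (the file name and keys here are illustrative, not from this snippet); shuffle=False because the dataset pre-shuffles indices itself via self.pick:

import json
from torch.utils.data import DataLoader

with open("config.json") as f:  # hypothetical config path
    cfg = json.load(f)
train_set = DataSets(cfg["train_datasets"], cfg["anchors"], num_epoch=20)
train_loader = DataLoader(train_set, batch_size=64, num_workers=8, shuffle=False)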
Example no. 3
class SiamMask(nn.Module):
    def __init__(self, anchors=None, o_sz=127, g_sz=127):
        super(SiamMask, self).__init__()
        self.anchors = anchors  # anchor_cfg
        self.anchor_num = len(self.anchors["ratios"]) * len(
            self.anchors["scales"])
        self.anchor = Anchors(anchors)
        self.features = None
        self.rpn_model = None
        self.mask_model = None
        self.o_sz = o_sz
        self.g_sz = g_sz
        self.upSample = nn.UpsamplingBilinear2d(size=[g_sz, g_sz])

        self.all_anchors = None

    def set_all_anchors(self, image_center, size):
        # cx,cy,w,h
        if not self.anchor.generate_all_anchors(image_center, size):
            return
        all_anchors = self.anchor.all_anchors[1]  # cx, cy, w, h
        self.all_anchors = torch.from_numpy(all_anchors).float().cuda()
        self.all_anchors = [self.all_anchors[i] for i in range(4)]

    def feature_extractor(self, x):
        return self.features(x)

    def rpn(self, template, search):
        pred_cls, pred_loc = self.rpn_model(template, search)
        return pred_cls, pred_loc

    def mask(self, template, search):
        pred_mask = self.mask_model(template, search)
        return pred_mask

    def _add_rpn_loss(self, label_cls, label_loc, lable_loc_weight, label_mask,
                      label_mask_weight, rpn_pred_cls, rpn_pred_loc,
                      rpn_pred_mask):
        rpn_loss_cls = select_cross_entropy_loss(rpn_pred_cls, label_cls)

        rpn_loss_loc = weight_l1_loss(rpn_pred_loc, label_loc,
                                      lable_loc_weight)

        rpn_loss_mask, iou_m, iou_5, iou_7 = select_mask_logistic_loss(
            rpn_pred_mask, label_mask, label_mask_weight)

        return rpn_loss_cls, rpn_loss_loc, rpn_loss_mask, iou_m, iou_5, iou_7

    def run(self, template, search, softmax=False):
        """
        run network
        """
        template_feature = self.feature_extractor(template)
        feature, search_feature = self.features.forward_all(search)
        rpn_pred_cls, rpn_pred_loc = self.rpn(template_feature, search_feature)
        corr_feature = self.mask_model.mask.forward_corr(
            template_feature, search_feature)  # (b, 256, w, h)
        rpn_pred_mask = self.refine_model(feature, corr_feature)

        if softmax:
            rpn_pred_cls = self.softmax(rpn_pred_cls)
        return rpn_pred_cls, rpn_pred_loc, rpn_pred_mask, template_feature, search_feature

    def softmax(self, cls):
        b, a2, h, w = cls.size()
        cls = cls.view(b, 2, a2 // 2, h, w)
        cls = cls.permute(0, 2, 3, 4, 1).contiguous()
        cls = F.log_softmax(cls, dim=4)
        return cls
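
    # Shape walk-through (illustrative): with b=16 and anchor_num=5 the raw head
    # output is [16, 10, 25, 25]; view -> [16, 2, 5, 25, 25]; permute ->
    # [16, 5, 25, 25, 2]; log_softmax over the last dim then gives per-anchor
    # log-probabilities for the two classes (background/foreground).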

    def forward(self, input):
        """
        :param input: dict of input with keys of:
                'template': [b, 3, h1, w1], input template image.
                'search': [b, 3, h2, w2], input search image.
                'label_cls':[b, max_num_gts, 5] or None(self.training==False),
                                     each gt contains x1,y1,x2,y2,class.
        :return: dict of loss, predict, accuracy
        """
        template = input['template']
        search = input['search']
        if self.training:
            label_cls = input['label_cls']
            label_loc = input['label_loc']
            lable_loc_weight = input['label_loc_weight']
            label_mask = input['label_mask']
            label_mask_weight = input['label_mask_weight']

        rpn_pred_cls, rpn_pred_loc, rpn_pred_mask, template_feature, search_feature = \
            self.run(template, search, softmax=self.training)

        outputs = dict()

        outputs['predict'] = [
            rpn_pred_loc, rpn_pred_cls, rpn_pred_mask, template_feature,
            search_feature
        ]

        if self.training:
            rpn_loss_cls, rpn_loss_loc, rpn_loss_mask, iou_acc_mean, iou_acc_5, iou_acc_7 = \
                self._add_rpn_loss(label_cls, label_loc, lable_loc_weight, label_mask, label_mask_weight,
                                   rpn_pred_cls, rpn_pred_loc, rpn_pred_mask)
            outputs['losses'] = [rpn_loss_cls, rpn_loss_loc, rpn_loss_mask]
            outputs['accuracy'] = [iou_acc_mean, iou_acc_5, iou_acc_7]

        return outputs

    def template(self, z):
        self.zf = self.feature_extractor(z)
        cls_kernel, loc_kernel = self.rpn_model.template(self.zf)
        return cls_kernel, loc_kernel

    def track(self, x, cls_kernel=None, loc_kernel=None, softmax=False):
        xf = self.feature_extractor(x)
        rpn_pred_cls, rpn_pred_loc = self.rpn_model.track(
            xf, cls_kernel, loc_kernel)
        if softmax:
            rpn_pred_cls = self.softmax(rpn_pred_cls)
        return rpn_pred_cls, rpn_pred_loc
Example no. 4
class DataSets(Dataset):
    def __init__(self, cfg, anchor_cfg, num_epoch=1):
        super(DataSets, self).__init__()
        global logger
        logger = logging.getLogger('global')

        # anchors
        self.anchors = Anchors(anchor_cfg)

        # size
        self.template_size = 127
        self.origin_size = 127
        self.search_size = 255
        self.size = 17
        self.base_size = 0
        self.crop_size = 0

        if 'template_size' in cfg:
            self.template_size = cfg['template_size']
        if 'origin_size' in cfg:
            self.origin_size = cfg['origin_size']
        if 'search_size' in cfg:
            self.search_size = cfg['search_size']
        if 'base_size' in cfg:
            self.base_size = cfg['base_size']
        if 'size' in cfg:
            self.size = cfg['size']

        if (self.search_size - self.template_size) / self.anchors.stride + 1 + self.base_size != self.size:
            raise Exception("size not match!")  # TODO: calculate size online
        if 'crop_size' in cfg:
            self.crop_size = cfg['crop_size']
        self.template_small = False
        if 'template_small' in cfg and cfg['template_small']:
            self.template_small = True

        self.anchors.generate_all_anchors(im_c=self.search_size//2, size=self.size)

        if 'anchor_target' not in cfg:
            cfg['anchor_target'] = {}
        self.anchor_target = AnchorTargetLayer(cfg['anchor_target'])

        # data sets
        if 'datasets' not in cfg:
            raise(Exception('DataSet need "{}"'.format('datasets')))

        self.all_data = []
        start = 0
        self.num = 0
        for name in cfg['datasets']:
            dataset = cfg['datasets'][name]
            dataset['mark'] = name
            dataset['start'] = start

            dataset = SubDataSet(dataset)
            dataset.log()
            self.all_data.append(dataset)

            start += dataset.num  # real video number
            self.num += dataset.num_use  # the number used for subset shuffle

        # data augmentation
        aug_cfg = cfg['augmentation']
        self.template_aug = Augmentation(aug_cfg['template'])
        self.search_aug = Augmentation(aug_cfg['search'])
        self.gray = aug_cfg['gray']
        self.neg = aug_cfg['neg']
        self.inner_neg = 0 if 'inner_neg' not in aug_cfg else aug_cfg['inner_neg']

        self.pick = None  # list to save id for each img
        if 'num' in cfg:  # number used in training for all dataset
            self.num = int(cfg['num'])
        self.num *= num_epoch
        self.shuffle()

        self.infos = {
                'template': self.template_size,
                'search': self.search_size,
                'template_small': self.template_small,
                'gray': self.gray,
                'neg': self.neg,
                'inner_neg': self.inner_neg,
                'crop_size': self.crop_size,
                'anchor_target': self.anchor_target.__dict__,
                'num': self.num // num_epoch
                }
        logger.info('dataset information: \n{}'.format(json.dumps(self.infos, indent=4)))

    def imread(self, path):
        img = cv2.imread(path)

        if self.origin_size == self.template_size:
            return img, 1.0

        def map_size(exe, size):
            return int(round(((exe + 1) / (self.origin_size + 1) * (size+1) - 1)))

        nsize = map_size(self.template_size, img.shape[1])
        # compute the scale before resizing; after cv2.resize, img.shape[1] == nsize
        scale = nsize / img.shape[1]
        img = cv2.resize(img, (nsize, nsize))

        return img, scale

    def shuffle(self):
        pick = []
        m = 0
        while m < self.num:
            p = []
            for subset in self.all_data:
                sub_p = subset.shuffle()
                p += sub_p

            sample_random.shuffle(p)

            pick += p
            m = len(pick)
        self.pick = pick
        logger.info("shuffle done!")
        logger.info("dataset length {}".format(self.num))

    def __len__(self):
        return self.num

    def find_dataset(self, index):
        for dataset in self.all_data:
            if dataset.start + dataset.num > index:
                return dataset, index - dataset.start

    def __getitem__(self, index, debug=False):
        index = self.pick[index]
        dataset, index = self.find_dataset(index)

        gray = self.gray and self.gray > random.random()
        neg = self.neg and self.neg > random.random()

        if neg:
            template = dataset.get_random_target(index)
            if self.inner_neg and self.inner_neg > random.random():
                search = dataset.get_random_target()
            else:
                search = random.choice(self.all_data).get_random_target()
        else:
            template, search = dataset.get_positive_pair(index)

        def center_crop(img, size):
            shape = img.shape[1]
            if shape == size: return img
            c = shape // 2
            l = c - size // 2
            r = c + size // 2 + 1
            return img[l:r, l:r]

        template_image, scale_z = self.imread(template[0])

        if self.template_small:
            template_image = center_crop(template_image, self.template_size)

        search_image, scale_x = self.imread(search[0])
        if self.crop_size > 0:
            search_image = center_crop(search_image, self.crop_size)

        def toBBox(image, shape):
            imh, imw = image.shape[:2]
            if len(shape) == 4:
                w, h = shape[2]-shape[0], shape[3]-shape[1]
            else:
                w, h = shape
            context_amount = 0.5
            exemplar_size = self.template_size  # 127
            wc_z = w + context_amount * (w+h)
            hc_z = h + context_amount * (w+h)
            s_z = np.sqrt(wc_z * hc_z)
            scale_z = exemplar_size / s_z
            w = w*scale_z
            h = h*scale_z
            cx, cy = imw//2, imh//2
            bbox = center2corner(Center(cx, cy, w, h))
            return bbox

        template_box = toBBox(template_image, template[1])
        search_box = toBBox(search_image, search[1])

        template, _ = self.template_aug(template_image, template_box, self.template_size, gray=gray)
        search, bbox = self.search_aug(search_image, search_box, self.search_size, gray=gray)

        def draw(image, box, name):
            image = image.copy()
            x1, y1, x2, y2 = map(lambda x: int(round(x)), box)
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0))
            cv2.imwrite(name, image)

        if debug:
            draw(template_image, template_box, "debug/{:06d}_ot.jpg".format(index))
            draw(search_image, search_box, "debug/{:06d}_os.jpg".format(index))
            draw(template, _, "debug/{:06d}_t.jpg".format(index))
            draw(search, bbox, "debug/{:06d}_s.jpg".format(index))

        cls, delta, delta_weight = self.anchor_target(self.anchors, bbox, self.size, neg)

        template, search = map(lambda x: np.transpose(x, (2, 0, 1)).astype(np.float32), [template, search])

        return template, search, cls, delta, delta_weight, np.array(bbox, np.float32)
Example no. 5
class DataSets(Dataset):
    def __init__(self, cfg, anchor_cfg):
        super(DataSets, self).__init__()
        global logger
        logger = logging.getLogger('global')

        # anchors
        self.anchors = Anchors(anchor_cfg)

        # size
        self.template_size = 127
        self.origin_size = 127
        self.search_size = 255
        self.heatmap_size = (255, 255)
        self.image_size = 255
        self.size = 17
        self.sigma = 4
        self.base_size = 0
        self.crop_size = 0
        self.target_type = 'gaussian'
        self.single_heatmap = False
        self.output_res = 56
        self.num_joints = 17  # added
        self.mse_loss = False
        self.hm_gauss = 5

        if 'template_size' in cfg:
            self.template_size = cfg['template_size']
        if 'origin_size' in cfg:
            self.origin_size = cfg['origin_size']
        if 'search_size' in cfg:
            self.search_size = cfg['search_size']
        if 'base_size' in cfg:
            self.base_size = cfg['base_size']
        if 'size' in cfg:
            self.size = cfg['size']
        if 'single_heatmap' in cfg:
            self.single_heatmap = cfg['single_heatmap']

        if (self.search_size - self.template_size
            ) / self.anchors.stride + 1 + self.base_size != self.size:
            raise Exception("size not match!")  # TODO: calculate size online
        if 'crop_size' in cfg:
            self.crop_size = cfg['crop_size']
        self.template_small = False
        if 'template_small' in cfg and cfg['template_small']:
            self.template_small = True

        self.anchors.generate_all_anchors(im_c=self.search_size // 2,
                                          size=self.size)

        if 'anchor_target' not in cfg:
            cfg['anchor_target'] = {}
        if 'kp_anchor' not in anchor_cfg:
            self.anchor_target = AnchorTargetLayer(cfg['anchor_target'])
            self.kp_anchor = False
        else:
            self.anchor_target = AnchorTargetWithKPLayer(cfg['anchor_target'])
            self.kp_anchor = True

        # data sets
        if 'datasets' not in cfg:
            raise (Exception('DataSet need "{}"'.format('datasets')))

        self.all_data = []
        start = 0
        self.num = 0
        for name in cfg['datasets']:
            dataset = cfg['datasets'][name]
            dataset['mark'] = name
            dataset['start'] = start

            dataset = SubDataSet(dataset)
            dataset.log()
            self.all_data.append(dataset)

            start += dataset.num  # real video number
            self.num += dataset.num_use  # the number used for subset shuffle

        # data augmentation
        aug_cfg = cfg['augmentation']
        self.template_aug = Augmentation(aug_cfg['template'])
        self.search_aug = Augmentation(aug_cfg['search'])
        self.gray = aug_cfg['gray']
        self.neg = aug_cfg['neg']
        self.inner_neg = 0 if 'inner_neg' not in aug_cfg else aug_cfg[
            'inner_neg']

        self.pick = None  # list to save id for each img
        if 'num' in cfg:  # number used in training for all dataset
            self.num = int(cfg['num'])
        self.shuffle()

        self.infos = {
            'template': self.template_size,
            'search': self.search_size,
            'template_small': self.template_small,
            'gray': self.gray,
            'neg': self.neg,
            'inner_neg': self.inner_neg,
            'crop_size': self.crop_size,
            'anchor_target': self.anchor_target.__dict__,
            'num': self.num
        }
        logger.info('dataset information: \n{}'.format(
            json.dumps(self.infos, indent=4)))

    def generate_target(self, joints, joints_vis):
        '''
        :param joints:  [num_joints, 3]
        :param joints_vis: [num_joints, 3]
        :return: target, target_weight(1: visible, 0: invisible)
        '''
        target_weight = np.ones((self.num_joints, 1), dtype=np.float32)
        target_weight[:, 0] = joints_vis[:, 0]

        assert self.target_type == 'gaussian', \
            'Only support gaussian map now!'

        if self.target_type == 'gaussian':
            target = np.zeros(
                (self.num_joints, self.heatmap_size[1], self.heatmap_size[0]),
                dtype=np.float32)

            tmp_size = self.sigma * 3

            for joint_id in range(self.num_joints):
                feat_stride = [
                    self.image_size / self.heatmap_size[0],
                    self.image_size / self.heatmap_size[1]
                ]
                mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)
                mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)
                # Check that any part of the gaussian is in-bounds
                ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
                br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
                if ul[0] >= self.heatmap_size[0] or ul[1] >= self.heatmap_size[1] \
                        or br[0] < 0 or br[1] < 0:
                    # If not, just return the image as is
                    target_weight[joint_id] = 0
                    continue

                # Generate gaussian
                size = 2 * tmp_size + 1
                x = np.arange(0, size, 1, np.float32)
                y = x[:, np.newaxis]
                x0 = y0 = size // 2
                # The gaussian is not normalized, we want the center value to equal 1
                g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2))

                # Usable gaussian range
                g_x = max(0, -ul[0]), min(br[0], self.heatmap_size[0]) - ul[0]
                g_y = max(0, -ul[1]), min(br[1], self.heatmap_size[1]) - ul[1]
                # Image range
                img_x = max(0, ul[0]), min(br[0], self.heatmap_size[0])
                img_y = max(0, ul[1]), min(br[1], self.heatmap_size[1])

                v = target_weight[joint_id]
                if v > 0.5:
                    target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \
                        g[g_y[0]:g_y[1], g_x[0]:g_x[1]]

        return target, target_weight
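
    # Worked example for generate_target (illustrative): sigma=4 gives
    # tmp_size=12 and a 25x25 gaussian patch; a joint at pixel (10, 10) with
    # heatmap_size (255, 255) has ul=(-2, -2), br=(23, 23), so only the
    # in-bounds slice g[2:25, 2:25] lands in target[joint_id][0:23, 0:23].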

    def generate_target_in_single_map(self, joints, joints_vis):
        '''
        :param joints:  [num_joints, 3]
        :return: target, target_weight(1: visible, 0: invisible)
        '''
        target_weight = np.ones((self.num_joints, 1), dtype=np.float32)
        target_weight[:, 0] = joints_vis[:, 0]

        assert self.target_type == 'gaussian', \
            'Only support gaussian map now!'

        if self.target_type == 'gaussian':
            target = np.zeros((1, self.heatmap_size[1], self.heatmap_size[0]),
                              dtype=np.float32)

            masked_gaussian = np.zeros(
                (1, self.heatmap_size[1], self.heatmap_size[0]),
                dtype=np.float32)

            tmp_size = self.sigma * 3

            for joint_id in range(self.num_joints):
                feat_stride = [
                    self.image_size / self.heatmap_size[0],
                    self.image_size / self.heatmap_size[1]
                ]
                mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)
                mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)
                # Check that any part of the gaussian is in-bounds
                ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
                br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
                if ul[0] >= self.heatmap_size[0] or ul[1] >= self.heatmap_size[1] \
                        or br[0] < 0 or br[1] < 0:
                    # If not, just return the image as is
                    target_weight[joint_id] = 0
                    continue

                # Generate gaussian
                size = 2 * tmp_size + 1
                x = np.arange(0, size, 1, np.float32)
                y = x[:, np.newaxis]
                x0 = y0 = size // 2
                # The gaussian is not normalized, we want the center value to equal 1
                g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2))

                # Usable gaussian range
                g_x = max(0, -ul[0]), min(br[0], self.heatmap_size[0]) - ul[0]
                g_y = max(0, -ul[1]), min(br[1], self.heatmap_size[1]) - ul[1]
                # Image range
                img_x = max(0, ul[0]), min(br[0], self.heatmap_size[0])
                img_y = max(0, ul[1]), min(br[1], self.heatmap_size[1])

                v = target_weight[joint_id]
                if v > 0.5:
                    masked_gaussian[:, img_y[0]:img_y[1], img_x[0]:img_x[1]] = \
                        g[g_y[0]:g_y[1], g_x[0]:g_x[1]]
                    np.maximum(target, masked_gaussian, out=target)

        return target, target_weight

    def imread(self, path):
        img = cv2.imread(path)

        if self.origin_size == self.template_size:
            return img, 1.0

        def map_size(exe, size):
            return int(
                round(((exe + 1) / (self.origin_size + 1) * (size + 1) - 1)))

        nsize = map_size(self.template_size, img.shape[1])
        # compute the scale before resizing; after cv2.resize, img.shape[1] == nsize
        scale = nsize / img.shape[1]
        img = cv2.resize(img, (nsize, nsize))

        return img, scale

    def shuffle(self):
        pick = []
        m = 0
        while m < self.num:
            p = []
            for subset in self.all_data:
                sub_p = subset.shuffle()
                p += sub_p

            sample_random.shuffle(p)

            pick += p
            m = len(pick)
        self.pick = pick
        logger.info("shuffle done!")
        logger.info("dataset length {}".format(self.num))

    def __len__(self):
        return self.num

    def find_dataset(self, index):
        for dataset in self.all_data:
            if dataset.start + dataset.num > index:
                return dataset, index - dataset.start

    def __getitem__(self, index, debug=False):
        index = self.pick[index]
        dataset, index = self.find_dataset(index)

        gray = self.gray and self.gray > random.random()
        neg = self.neg and self.neg > random.random()

        if neg:
            template = dataset.get_random_target(index)
            if self.inner_neg and self.inner_neg > random.random():
                search = dataset.get_random_target()
            else:
                search = random.choice(self.all_data).get_random_target()
        else:
            template, search = dataset.get_positive_pair(index)

        def center_crop(img, size):
            shape = img.shape[1]
            if shape == size: return img
            c = shape // 2
            l = c - size // 2
            r = c + size // 2 + 1
            return img[l:r, l:r]

        template_image, scale_z = self.imread(template[0])

        if self.template_small:
            template_image = center_crop(template_image, self.template_size)

        search_image, scale_x = self.imread(search[0])

        if dataset.has_mask:
            if not neg:
                search_mask = (cv2.imread(search[2], 0) > 0).astype(np.float32)
            else:
                search_mask = np.zeros(search_image.shape[:2],
                                       dtype=np.float32)
        else:
            if not neg:
                search_kp = np.array(search[2], dtype=np.float32)
            else:
                search_kp = np.zeros(51, dtype=np.float32)

        if self.crop_size > 0:
            search_image = center_crop(search_image, self.crop_size)

        def toBBox(image, shape):
            imh, imw = image.shape[:2]
            if len(shape) == 4:
                w, h = shape[2] - shape[0], shape[3] - shape[1]
            else:
                w, h = shape
            context_amount = 0.5
            exemplar_size = self.template_size  # 127
            wc_z = w + context_amount * (w + h)
            hc_z = h + context_amount * (w + h)
            s_z = np.sqrt(wc_z * hc_z)
            scale_z = exemplar_size / s_z
            w = w * scale_z
            h = h * scale_z
            cx, cy = imw // 2, imh // 2
            bbox = center2corner(Center(cx, cy, w, h))
            return bbox

        template_box = toBBox(template_image, template[1])
        search_box = toBBox(search_image, search[1])
        # bbox = search_box
        template, _, _ = self.template_aug(template_image,
                                           template_box,
                                           self.template_size,
                                           gray=gray)
        search, bbox, mask = self.search_aug(search_image,
                                             search_box,
                                             self.search_size,
                                             gray=gray)

        def draw(image, box, name):
            image = image.copy()
            x1, y1, x2, y2 = map(lambda x: int(round(x)), box)
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0))
            cv2.imwrite(name, image)

        def crop_hwc(bbox, out_sz=255):
            # affine mapping that sends bbox to an out_sz x out_sz crop;
            # only the 2x3 matrix is returned here, not the warped image
            a = (out_sz - 1) / (bbox[2] - bbox[0])
            b = (out_sz - 1) / (bbox[3] - bbox[1])
            c = -a * bbox[0]
            d = -b * bbox[1]
            mapping = np.array([[a, 0, c], [0, b, d]]).astype(np.float64)
            # crop = cv2.warpAffine(image, mapping, (out_sz, out_sz),
            # borderMode=cv2.BORDER_CONSTANT, borderValue=padding)
            return mapping

        def crop_hwc1(image, bbox, out_sz, padding=(0, 0, 0)):
            a = (out_sz - 1) / (bbox[2] - bbox[0])
            b = (out_sz - 1) / (bbox[3] - bbox[1])
            c = -a * bbox[0]
            d = -b * bbox[1]
            mapping = np.array([[a, 0, c], [0, b, d]]).astype(np.float64)
            crop = cv2.warpAffine(image, mapping, (out_sz, out_sz))
            return crop

        def pos_s_2_bbox(pos, s):
            bbox = [
                pos[0] - s / 2, pos[1] - s / 2, pos[0] + s / 2, pos[1] + s / 2
            ]
            return bbox

        def crop_like_SiamFCx(bbox,
                              exemplar_size=127,
                              context_amount=0.5,
                              search_size=255):
            target_pos = [(bbox[2] + bbox[0]) / 2., (bbox[3] + bbox[1]) / 2.]
            target_size = [bbox[2] - bbox[0] + 1, bbox[3] - bbox[1] + 1]
            wc_z = target_size[1] + context_amount * sum(target_size)
            hc_z = target_size[0] + context_amount * sum(target_size)
            s_z = np.sqrt(wc_z * hc_z)
            scale_z = exemplar_size / s_z
            d_search = (search_size - exemplar_size) / 2
            pad = d_search / scale_z
            s_x = s_z + 2 * pad

            # x = crop_hwc1(image, pos_s_2_bbox(target_pos, s_x), search_size, padding)
            return target_pos, s_x
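
        # Worked numbers for crop_like_SiamFCx (illustrative): a 100x80 box gives
        # target_size = [101, 81], s_z = sqrt(172*192) ~= 181.7,
        # scale_z = 127/181.7 ~= 0.699, d_search = (255 - 127)/2 = 64,
        # pad ~= 64/0.699 ~= 91.6, so s_x ~= 181.7 + 2*91.6 ~= 364.9 original pixels.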

        def kp_conversion(KeyPoints, matrix):
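            # maps COCO-style flat keypoints [x1, y1, v1, x2, y2, v2, ...]
            # through the 2x3 affine `matrix`; points with visibility v == 0
            # remain zeroed in the output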

            key_points = []
            kps_conversion = []
            skeleton = [0, 0]
            Skeleton = []

            for i in range(0, int(len(KeyPoints) / 3)):
                skeleton[0] = KeyPoints[i * 3 + 0]
                skeleton[1] = KeyPoints[i * 3 + 1]
                Skeleton.append(skeleton[:])
                lis = Skeleton[i]
                lis.append(1)
                key_points.append(lis)

            key_points = np.array(key_points)

            for i in range(0, int(len(KeyPoints) / 3)):
                if KeyPoints[i * 3 + 2] != 0:
                    ky_conversion = np.matmul(matrix,
                                              key_points[i, :]).tolist()
                    kps_conversion.append(ky_conversion[0])
                    kps_conversion.append(ky_conversion[1])
                    kps_conversion.append(KeyPoints[i * 3 + 2])
                else:
                    kps_conversion.append(0)
                    kps_conversion.append(0)
                    kps_conversion.append(0)

            return kps_conversion

        if debug:
            draw(template_image, template_box,
                 "debug/{:06d}_ot.jpg".format(index))
            draw(search_image, search_box, "debug/{:06d}_os.jpg".format(index))
            draw(template, _, "debug/{:06d}_t.jpg".format(index))
            draw(search, bbox, "debug/{:06d}_s.jpg".format(index))

        cls, delta, delta_weight = self.anchor_target(self.anchors, bbox,
                                                      self.size, neg)
        if not dataset.has_mask:
            pos, s = crop_like_SiamFCx(search_box,
                                       exemplar_size=127,
                                       context_amount=0.5,
                                       search_size=255)
            mapping_bbox = pos_s_2_bbox(pos, s)

            mapping = crop_hwc(mapping_bbox, out_sz=255)

            keypoints = kp_conversion(search_kp.tolist(), mapping)

            joints_3d = np.zeros((self.num_joints, 3), dtype=np.float64)
            joints_3d_vis = np.zeros((self.num_joints, 3), dtype=np.float64)
            for ipt in range(self.num_joints):
                joints_3d[ipt, 0] = keypoints[ipt * 3 + 0]
                joints_3d[ipt, 1] = keypoints[ipt * 3 + 1]
                joints_3d[ipt, 2] = keypoints[ipt * 3 + 2]
                t_vis = search_kp[ipt * 3 + 2]
                if t_vis > 1:
                    t_vis = 1
                joints_3d_vis[ipt, 0] = t_vis
                joints_3d_vis[ipt, 1] = t_vis
                joints_3d_vis[ipt, 2] = 0

            img = search.copy()
            # joints_3d = joints_3d / 255

            if not neg:
                kp_weight = cls.max(axis=0, keepdims=True)
            else:
                kp_weight = np.zeros([1, cls.shape[1], cls.shape[2]],
                                     dtype=np.float32)

            # now process the ct part
            c = np.array([img.shape[1] / 2., img.shape[0] / 2.],
                         dtype=np.float32)
            s = max(img.shape[0], img.shape[1]) * 1.0
            rot = 0

            output_res = self.output_res
            num_joints = self.num_joints
            trans_output_rot = get_affine_transform(c, s, rot,
                                                    [output_res, output_res])
            trans_output = get_affine_transform(c, s, 0,
                                                [output_res, output_res])

            ind = np.zeros(1, dtype=np.int64)
            # hm_hp = np.zeros((num_joints, output_res, output_res), dtype=np.float32)
            hm_hp = np.zeros((num_joints, output_res, output_res),
                             dtype=np.float32)
            kps = np.zeros(num_joints * 2, dtype=np.float32)
            kps_mask = np.zeros((self.num_joints * 2), dtype=np.uint8)
            hp_offset = np.zeros((num_joints, 2), dtype=np.float32)
            hp_ind = np.zeros(num_joints, dtype=np.int64)
            hp_mask = np.zeros(num_joints, dtype=np.int64)


            draw_gaussian = draw_msra_gaussian if self.mse_loss else \
                        draw_umich_gaussian

            pts = joints_3d.copy()
            bbox = np.array(bbox, np.float32)
            bbox_reg = np.array(bbox, np.float32)
            bbox[:2] = affine_transform(bbox[:2], trans_output)
            bbox[2:] = affine_transform(bbox[2:], trans_output)
            bbox = np.clip(bbox, 0, output_res - 1)
            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
            if (h > 0 and w > 0):
                ct = np.array([(bbox[0] + bbox[2]) / 2,
                               (bbox[1] + bbox[3]) / 2],
                              dtype=np.float32)
                ct_int = ct.astype(np.int32)
            hp_radius = gaussian_radius(
                (math.ceil(h) * 2.3, math.ceil(w) * 2.3))
            hp_radius = self.hm_gauss \
                        if self.mse_loss else max(0, int(hp_radius))
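            # CenterNet-style flat index of the box center on the output grid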
            ind[0] = ct_int[1] * output_res + ct_int[0]
            for j in range(num_joints):
                if pts[j, 2] > 0:
                    pts[j, :2] = affine_transform(pts[j, :2], trans_output_rot)
                    if pts[j, 0] >= 0 and pts[j, 0] < output_res and \
                       pts[j, 1] >= 0 and pts[j, 1] < output_res:
                        kps[j * 2:j * 2 + 2] = pts[j, :2] - ct_int
                        kps_mask[j * 2:j * 2 + 2] = 1
                        pt_int = pts[j, :2].astype(np.int32)
                        # print('ct_int: ', ct_int)
                        # print('pt_int: ', pt_int)
                        hp_offset[j] = pts[j, :2] - pt_int
                        hp_ind[j] = pt_int[1] * output_res + pt_int[0]
                        hp_mask[j] = 1

                        draw_gaussian(hm_hp[j], pt_int, hp_radius)
                        # pt_ori = joints_3d[j, :2].astype(np.int32)
                        # draw_gaussian(hm_hp[j], pt_ori, hp_radius)

            ret = {'hps': kps, 'hm_hp': hm_hp, 'hp_mask': hp_mask}
            # print('kps: ', ret['hps'])
            ret.update({
                'hp_offset': hp_offset,
                'hp_ind': hp_ind,
                'hps_mask': kps_mask,
                'ind': ind
            })

        # print('hp_offset: ', hp_offset)
        # note: joints_3d, kp_weight and ret are only set on the keypoint branch
        # above, so this return path assumes dataset.has_mask is False
        joints_3d_out = joints_3d.transpose(1, 0)

        template, search = map(
            lambda x: np.transpose(x, (2, 0, 1)).astype(np.float32),
            [template, search])
        return template, search, cls, delta, \
          delta_weight, bbox_reg, \
          np.array(kp_weight, np.float32), ret, joints_3d_out
Example no. 6
class SiamRPN(nn.Module):
    def __init__(self, anchors=None):
        super(SiamRPN, self).__init__()
        self.anchors = anchors  # anchor_cfg
        self.anchor = Anchors(anchors)
        self.anchor_num = self.anchor.anchor_num
        self.features = None
        self.rpn_model = None

        self.all_anchors = None

    def set_all_anchors(self, image_center, size):
        # cx,cy,w,h
        if not self.anchor.generate_all_anchors(image_center, size):
            return
        all_anchors = self.anchor.all_anchors[1] # cx, cy, w, h
        self.all_anchors = torch.from_numpy(all_anchors).float().cuda()
        self.all_anchors = [self.all_anchors[i] for i in range(4)]

    def feature_extractor(self, x):
        return self.features(x)

    def rpn(self, template, search):
        pred_cls, pred_loc = self.rpn_model(template, search)
        return pred_cls, pred_loc

    def _add_rpn_loss(self, label_cls, label_loc, lable_loc_weight, rpn_pred_cls,
                      rpn_pred_loc):
        '''
        :param rpn_pred_cls: [B, num_anchors * 2, h, w], output of rpn for classification.
        :param rpn_pred_loc: [B, num_anchors * 4, h, w], output of rpn for localization.
        :return: loss of classification and localization, respectively.
        '''
        rpn_loss_cls = select_cross_entropy_loss(rpn_pred_cls, label_cls)

        rpn_loss_loc = weight_l1_loss(rpn_pred_loc, label_loc, lable_loc_weight)

        # classification accuracy, top1
        acc = torch.zeros(1)  # TODO
        return rpn_loss_cls, rpn_loss_loc, acc

    def run(self, template, search, softmax=False):
        """
        run network
        """
        template_feature = self.feature_extractor(template)
        search_feature = self.feature_extractor(search)
        rpn_pred_cls, rpn_pred_loc = self.rpn(template_feature, search_feature)
        if softmax:
            rpn_pred_cls = self.softmax(rpn_pred_cls)
        return rpn_pred_cls, rpn_pred_loc, template_feature, search_feature

    def softmax(self, cls):
        b, a2, h, w = cls.size()
        cls = cls.view(b, 2, a2//2, h, w)
        cls = cls.permute(0, 2, 3, 4, 1).contiguous()
        cls = F.log_softmax(cls, dim=4)
        return cls

    def forward(self, input):
        """
        :param input: dict of input with keys of:
                'template': [b, 3, h1, w1], input template image.
                'search': [b, 3, h2, w2], input search image.
                'label_cls':[b, max_num_gts, 5] or None(self.training==False),
                                     each gt contains x1,y1,x2,y2,class.
        :return: dict of loss, predict, accuracy
        """
        template = input['template']
        search = input['search']
        if self.training:
            label_cls = input['label_cls']
            label_loc = input['label_loc']
            lable_loc_weight = input['label_loc_weight']

        rpn_pred_cls, rpn_pred_loc, template_feature, search_feature = self.run(template, search, softmax=self.training)

        outputs = dict(predict=[], losses=[], accuracy=[])

        outputs['predict'] = [rpn_pred_loc, rpn_pred_cls, template_feature, search_feature]
        if self.training:
            rpn_loss_cls, rpn_loss_loc, rpn_acc = self._add_rpn_loss(label_cls, label_loc, lable_loc_weight,
                                                                     rpn_pred_cls, rpn_pred_loc)
            outputs['losses'] = [rpn_loss_cls, rpn_loss_loc]
        return outputs

    def template(self, z):
        self.zf = self.feature_extractor(z)
        cls_kernel, loc_kernel = self.rpn_model.template(self.zf)
        return cls_kernel, loc_kernel

    def track(self, x, cls_kernel=None, loc_kernel=None, softmax=False):
        xf = self.feature_extractor(x)
        rpn_pred_cls, rpn_pred_loc = self.rpn_model.track(xf, cls_kernel, loc_kernel)
        if softmax:
            rpn_pred_cls = self.softmax(rpn_pred_cls)
        return rpn_pred_cls, rpn_pred_loc
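
The template/track split above is the standard Siamese inference pattern: the template kernels are computed once per target, then reused for every search frame. A minimal sketch of that loop, assuming model is a fully-built SiamRPN subclass with features and rpn_model set, and that z and the search frames are already preprocessed tensors (all names here are illustrative):

model.eval()
with torch.no_grad():
    cls_kernel, loc_kernel = model.template(z)   # once, on the exemplar crop
    for x in search_frames:                      # per-frame search crops
        pred_cls, pred_loc = model.track(x, cls_kernel, loc_kernel, softmax=True)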
Example no. 7
class SiamMask(nn.Module):
    """
    Defines the overall SiamMask network skeleton and its main modules.
    """
    def __init__(self, anchors=None, o_sz=63, g_sz=127):
        super(SiamMask, self).__init__()
        self.anchors = anchors  # anchor_cfg, the anchor configuration
        self.anchor_num = len(self.anchors["ratios"]) * len(self.anchors["scales"])  # number of anchors
        self.anchor = Anchors(anchors)  # anchor helper
        self.features = None  # feature-extraction backbone
        self.rpn_model = None  # RPN model
        self.mask_model = None  # segmentation model
        self.o_sz = o_sz  # input size
        self.g_sz = g_sz  # output size
        self.upSample = nn.UpsamplingBilinear2d(size=[g_sz, g_sz])  # 2d bilinear upsampling

        self.all_anchors = None

    def set_all_anchors(self, image_center, size):
        """
        Initialize all anchors (this method is unused).
        :param image_center: image center
        :param size:
        :return:
        """
        # cx,cy,w,h
        if not self.anchor.generate_all_anchors(image_center, size):
            return
        all_anchors = self.anchor.all_anchors[1]  # cx, cy, w, h
        self.all_anchors = torch.from_numpy(all_anchors).float().cuda()
        self.all_anchors = [self.all_anchors[i] for i in range(4)]

    def feature_extractor(self, x):
        """
        Extract features.
        :param x: input data
        :return: feature map
        """
        return self.features(x)

    def rpn(self, template, search):
        """
        RPN head.
        :param template: template features
        :param search: search-image features
        :return:
        """
        # predicted classification and localization
        pred_cls, pred_loc = self.rpn_model(template, search)
        return pred_cls, pred_loc

    def mask(self, template, search):
        """
        Segmentation branch.
        :param template: template features
        :param search: search-image features
        :return: predicted mask
        """
        pred_mask = self.mask_model(template, search)
        return pred_mask

    def _add_rpn_loss(self, label_cls, label_loc, lable_loc_weight, label_mask, label_mask_weight,
                      rpn_pred_cls, rpn_pred_loc, rpn_pred_mask):
        """
        RPN losses.
        """
        # classification loss (cross entropy)
        rpn_loss_cls = select_cross_entropy_loss(rpn_pred_cls, label_cls)
        # regression loss
        rpn_loss_loc = weight_l1_loss(rpn_pred_loc, label_loc, lable_loc_weight)
        # segmentation loss and IoU accuracies
        rpn_loss_mask, iou_m, iou_5, iou_7 = select_mask_logistic_loss(rpn_pred_mask, label_mask, label_mask_weight)

        return rpn_loss_cls, rpn_loss_loc, rpn_loss_mask, iou_m, iou_5, iou_7
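
    # (In the training scripts these three losses are typically combined as
    #  loss = cls_weight * rpn_loss_cls + reg_weight * rpn_loss_loc
    #         + mask_weight * rpn_loss_mask, with weights taken from the config;
    #  the exact weights are not part of this snippet.)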

    def run(self, template, search, softmax=False):
        '''
        Run the network.
        :param template: template image
        :param search: search image
        :param softmax:
        :return:
        '''
        # extract template features
        template_feature = self.feature_extractor(template)
        # extract search-image features
        search_feature = self.feature_extractor(search)
        # predictions
        rpn_pred_cls, rpn_pred_loc = self.rpn(template_feature, search_feature)
        rpn_pred_mask = self.mask(template_feature, search_feature)  # (b, 63*63, w, h)
        # apply (log-)softmax to the classification scores
        if softmax:
            rpn_pred_cls = self.softmax(rpn_pred_cls)
        return rpn_pred_cls, rpn_pred_loc, rpn_pred_mask, template_feature, search_feature

    def softmax(self, cls):
        """
        Log-softmax over the two class channels.
        :param cls:
        :return:
        """
        # batch size and channel dimensions
        b, a2, h, w = cls.size()
        # split the channel dim into (2 classes, anchor_num)
        cls = cls.view(b, 2, a2//2, h, w)
        # move the class dim last
        cls = cls.permute(0, 2, 3, 4, 1).contiguous()
        # log-softmax along the class dimension (dim=4)
        cls = F.log_softmax(cls, dim=4)
        return cls


    def forward(self, input):
        """
        Forward pass; subclasses of nn.Module override this function.
        :param input: dict of input with keys of:
                'template': [b, 3, h1, w1], input template image.
                'search': [b, 3, h2, w2], input search image.
                'label_cls':[b, max_num_gts, 5] or None(self.training==False),
                                     each gt contains x1,y1,x2,y2,class.
        :return: dict of loss, predict, accuracy
        """
        # unpack inputs
        template = input['template']
        search = input['search']
        if self.training:
            label_cls = input['label_cls']
            label_loc = input['label_loc']
            lable_loc_weight = input['label_loc_weight']
            label_mask = input['label_mask']
            label_mask_weight = input['label_mask_weight']

        # run the network
        rpn_pred_cls, rpn_pred_loc, rpn_pred_mask, template_feature, search_feature = \
            self.run(template, search, softmax=self.training)

        outputs = dict()
        # predictions
        outputs['predict'] = [rpn_pred_loc, rpn_pred_cls, rpn_pred_mask, template_feature, search_feature]

        if self.training:
            # losses
            rpn_loss_cls, rpn_loss_loc, rpn_loss_mask, iou_acc_mean, iou_acc_5, iou_acc_7 = \
                self._add_rpn_loss(label_cls, label_loc, lable_loc_weight, label_mask, label_mask_weight,
                                   rpn_pred_cls, rpn_pred_loc, rpn_pred_mask)
            # expose losses and accuracies
            outputs['losses'] = [rpn_loss_cls, rpn_loss_loc, rpn_loss_mask]
            outputs['accuracy'] = [iou_acc_mean, iou_acc_5, iou_acc_7]

        return outputs

    def template(self, z):
        """
        Process the template image.
        :param z: template of the tracked target
        :return: classification and regression kernels from the template
        """
        self.zf = self.feature_extractor(z)
        cls_kernel, loc_kernel = self.rpn_model.template(self.zf)
        return cls_kernel, loc_kernel

    def track(self, x, cls_kernel=None, loc_kernel=None, softmax=False):
        """
        Track the target.
        :param x:
        :param cls_kernel:
        :param loc_kernel:
        :param softmax:
        :return:
        """
        # feature extraction
        xf = self.feature_extractor(x)
        # tracking predictions
        rpn_pred_cls, rpn_pred_loc = self.rpn_model.track(xf, cls_kernel, loc_kernel)
        if softmax:
            rpn_pred_cls = self.softmax(rpn_pred_cls)
        # return the localization and classification results for the tracked target
        return rpn_pred_cls, rpn_pred_loc
class DataSets(Dataset):
    def __init__(self, cfg, anchor_cfg, num_epoch=1):
        super(DataSets, self).__init__()  # subclass of torch.utils.data.Dataset
        global logger  # global logger
        logger = logging.getLogger('global')  # instantiate a logger named 'global'

        # anchors
        self.anchors = Anchors(anchor_cfg)  # instantiate Anchors

        # size
        self.template_size = 127
        self.origin_size = 127
        self.search_size = 255
        self.size = 17  # implies the anchors live on the response map produced after the depth-wise xcorr
        self.base_size = 0  # not sure what base_size is for
        self.crop_size = 0

        if 'template_size' in cfg:
            self.template_size = cfg['template_size']
        if 'origin_size' in cfg:
            self.origin_size = cfg['origin_size']
        if 'search_size' in cfg:
            self.search_size = cfg['search_size']
        if 'base_size' in cfg:
            self.base_size = cfg['base_size']
        if 'size' in cfg:
            self.size = cfg['size']
        # Rough reading of the line below (cf. Table 8 of the paper): the exemplar-style
        # crop operations are transferred onto the search image, while the regions of the
        # search image not covered by it continue with the earlier stride = 8 processing
        if (self.search_size - self.template_size
            ) / self.anchors.stride + 1 + self.base_size != self.size:
            raise Exception("size not match!")  # TODO: calculate size online
        if 'crop_size' in cfg:
            self.crop_size = cfg['crop_size']
        self.template_small = False
        if 'template_small' in cfg and cfg['template_small']:
            self.template_small = True

        self.anchors.generate_all_anchors(
            im_c=self.search_size // 2,
            size=self.size)  # call into the Anchors class to build all anchors; this step matters

        if 'anchor_target' not in cfg:
            cfg['anchor_target'] = {}
        self.anchor_target = AnchorTargetLayer(
            cfg['anchor_target'])  # instantiate AnchorTargetLayer

        # data sets
        if 'datasets' not in cfg:
            raise (Exception('DataSet need "{}"'.format('datasets')))

        self.all_data = []
        start = 0
        self.num = 0
        for name in cfg['datasets']:
            dataset = cfg['datasets'][name]
            dataset['mark'] = name
            dataset['start'] = start

            dataset = SubDataSet(dataset)  # what exactly is SubDataSet for???
            dataset.log()
            self.all_data.append(dataset)

            start += dataset.num  # real video number
            self.num += dataset.num_use  # the number used for subset shuffle

        # data augmentation
        aug_cfg = cfg['augmentation']  # how the augmentation works in detail is still unclear
        self.template_aug = Augmentation(aug_cfg['template'])
        self.search_aug = Augmentation(aug_cfg['search'])
        self.gray = aug_cfg['gray']
        self.neg = aug_cfg['neg']
        self.inner_neg = aug_cfg.get('inner_neg', 0)

        self.pick = None  # list of sample ids, one per training sample
        if 'num' in cfg:  # total number of samples used for training across all datasets
            self.num = int(cfg['num'])
        self.num *= num_epoch
        self.shuffle()

        self.infos = {
            'template': self.template_size,
            'search': self.search_size,
            'template_small': self.template_small,
            'gray': self.gray,
            'neg': self.neg,
            'inner_neg': self.inner_neg,
            'crop_size': self.crop_size,
            'anchor_target': self.anchor_target.__dict__,
            'num': self.num // num_epoch
        }
        logger.info('dataset information: \n{}'.format(
            json.dumps(self.infos, indent=4)))

    def imread(self, path):
        img = cv2.imread(path)
        # origin_size is the exemplar-patch size implied by the ground-truth bbox; it must be resized to template_size = 127
        if self.origin_size == self.template_size:
            return img, 1.0

        def map_size(exe, size):
            return int(
                round(((exe + 1) / (self.origin_size + 1) * (size + 1) - 1)))

        original_width = img.shape[1]
        nsize = map_size(self.template_size, original_width)

        img = cv2.resize(img, (nsize, nsize))

        # the scale must be computed against the width before the resize; dividing by
        # img.shape[1] after cv2.resize (as the original line did) always yields 1.0
        return img, nsize / original_width
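
    # Worked example for map_size above (hypothetical config with origin_size = 255):
    # map_size(127, 511) = round((127 + 1) / (255 + 1) * (511 + 1) - 1) = 255, so a
    # 511x511 crop prepared for a 255-pixel exemplar is halved to match the
    # 127-pixel template the network expects.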

    def shuffle(self):
        pick = []
        m = 0
        while m < self.num:
            p = []
            for subset in self.all_data:
                sub_p = subset.shuffle()
                p += sub_p

            sample_random.shuffle(p)

            pick += p
            m = len(pick)
        self.pick = pick
        logger.info("shuffle done!")
        logger.info("dataset length {}".format(self.num))

    def __len__(self):
        return self.num

    def find_dataset(self, index):
        for dataset in self.all_data:
            if dataset.start + dataset.num > index:
                return dataset, index - dataset.start
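
    # Example of the start/num bookkeeping (hypothetical sizes): for two subsets
    # with num = 100 and num = 200, the start offsets are 0 and 100, so a global
    # index of 150 maps to the second subset with local index 50.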

    def __getitem__(self, index, debug=False):
        index = self.pick[index]
        dataset, index = self.find_dataset(index)

        gray = self.gray and self.gray > random.random()
        neg = self.neg and self.neg > random.random()

        if neg:
            template = dataset.get_random_target(index)
            if self.inner_neg and self.inner_neg > random.random():
                # inner negative pair: sample the search frame from the same dataset (e.g. coco or ytb_vos)
                search = dataset.get_random_target()
            else:
                search = random.choice(self.all_data).get_random_target()
        else:
            template, search = dataset.get_positive_pair(
                index)  # template = (image_path, image_anno, mask_path)

        def center_crop(img, size):
            shape = img.shape[1]
            if shape == size: return img
            c = shape // 2
            l = c - size // 2
            r = c + size // 2 + 1
            return img[l:r, l:r]
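        # e.g. for a 511x511 crop and size = 255 (hypothetical sizes): c = 255,
        # l = 128, r = 383, so img[128:383, 128:383] is the central 255x255 window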

        template_image, scale_z = self.imread(template[0])  # typically a 511x511x3 preprocessing crop

        if self.template_small:  # center-crop the loaded image down to template_size before augmentation
            template_image = center_crop(template_image, self.template_size)

        search_image, scale_x = self.imread(search[0])

        if dataset.has_mask and not neg:
            search_mask = (cv2.imread(search[2], 0) > 0).astype(np.float32)
        else:
            search_mask = np.zeros(search_image.shape[:2], dtype=np.float32)

        if self.crop_size > 0:  # optional center crop applied to the search image and its mask
            search_image = center_crop(search_image, self.crop_size)
            search_mask = center_crop(search_mask, self.crop_size)

        def toBBox(image, shape):
            # convert the annotation to a bbox in the frame of the template_size = 127 crop
            imh, imw = image.shape[:2]
            if len(shape) == 4:
                w, h = shape[2] - shape[0], shape[3] - shape[1]
            else:
                w, h = shape
            context_amount = 0.5
            exemplar_size = self.template_size  # 127
            wc_z = w + context_amount * (w + h)
            hc_z = h + context_amount * (w + h)
            s_z = np.sqrt(wc_z * hc_z)
            scale_z = exemplar_size / s_z
            w = w * scale_z
            h = h * scale_z

            # the bbox is centred on the image because the preprocessing stage
            # (see par_crop.py) already places the target at the centre of each crop
            cx, cy = imw // 2, imh // 2
            bbox = center2corner(Center(cx, cy, w, h))
            return bbox
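
        # Worked example for toBBox with a hypothetical 100x50 box: wc_z = 175,
        # hc_z = 125, s_z = sqrt(175 * 125) ~ 147.9, scale_z = 127 / 147.9 ~ 0.86,
        # so the box becomes roughly 86 x 43 pixels centred on the crop.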

        template_box = toBBox(template_image, template[1])
        search_box = toBBox(search_image, search[1])

        template, _, _ = self.template_aug(template_image,
                                           template_box,
                                           self.template_size,
                                           gray=gray)
        search, bbox, mask = self.search_aug(search_image,
                                             search_box,
                                             self.search_size,
                                             gray=gray,
                                             mask=search_mask)

        # the search branch needs the bbox and mask as ground-truth labels

        def draw(image, box, name):
            image = image.copy()
            x1, y1, x2, y2 = map(lambda x: int(round(x)), box)
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0))
            cv2.imwrite(name, image)

        if debug:
            draw(template_image, template_box,
                 "debug/{:06d}_ot.jpg".format(index))
            draw(search_image, search_box, "debug/{:06d}_os.jpg".format(index))
            draw(template, _, "debug/{:06d}_t.jpg".format(index))
            draw(search, bbox, "debug/{:06d}_s.jpg".format(index))

        # build the classification and regression training targets from the augmented bbox
        cls, delta, delta_weight = self.anchor_target(self.anchors, bbox,
                                                      self.size, neg)
        # no IoU computation is needed here: need_iou is not passed, so it defaults
        # to False and the layer only produces training targets
        if dataset.has_mask and not neg:  # neg == True means a negative pair was sampled
            mask_weight = cls.max(
                axis=0, keepdims=True)  # keepdims=True keeps the reduced axis as size 1,
            # matching the 1 x H x W shape built explicitly in the else branch below
        else:
            mask_weight = np.zeros([1, cls.shape[1], cls.shape[2]],
                                   dtype=np.float32)

        template, search = map(
            lambda x: np.transpose(x, (2, 0, 1)).astype(np.float32),
            [template, search])

        mask = (np.expand_dims(mask, axis=0) > 0.5) * 2 - 1  # binarize to {-1, +1}, shape 1*H*W

        return template, search, cls, delta, delta_weight, np.array(bbox, np.float32), \
               np.array(mask, np.float32), np.array(mask_weight, np.float32)
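
# A hedged sketch (not from the original repo) of wrapping DataSets in a standard
# PyTorch DataLoader; `cfg` and `anchor_cfg` are placeholder dicts that must follow
# the structure consumed by __init__ above.
def make_loader_sketch(cfg, anchor_cfg, batch_size=16):
    from torch.utils.data import DataLoader
    dataset = DataSets(cfg, anchor_cfg, num_epoch=1)
    # each item is (template, search, cls, delta, delta_weight, bbox, mask, mask_weight)
    return DataLoader(dataset, batch_size=batch_size, num_workers=4)
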
Esempio n. 9
0
class SiamMask(nn.Module):
    def __init__(self, anchors=None, o_sz=63, g_sz=127):
        super(SiamMask, self).__init__()
        self.anchors = anchors  # anchor_cfg
        self.anchor_num = len(self.anchors["ratios"]) * len(
            self.anchors["scales"])
        self.anchor = Anchors(anchors)
        self.features = None
        self.rpn_model = None  # overridden by the subclass's rpn_model
        self.mask_model = None  # overridden by the subclass's mask_model
        self.o_sz = o_sz
        self.g_sz = g_sz
        self.upSample = nn.UpsamplingBilinear2d(size=[g_sz, g_sz])

        self.all_anchors = None

    def set_all_anchors(self, image_center, size):
        # cx,cy,w,h
        if not self.anchor.generate_all_anchors(image_center, size):
            # generate_all_anchors returns False if the anchors were already set up
            return
        all_anchors = self.anchor.all_anchors[1]  # cx, cy, w, h
        self.all_anchors = torch.from_numpy(all_anchors).float().cuda()  # as a float tensor on the GPU
        self.all_anchors = [self.all_anchors[i] for i in range(4)]

    def feature_extractor(self, x):
        # shared backbone for both the template and search branches
        return self.features(x)

    def rpn(self, template, search):
        pred_cls, pred_loc = self.rpn_model(template, search)
        return pred_cls, pred_loc

    def mask(self, template, search):
        pred_mask = self.mask_model(template, search)
        return pred_mask

    def _add_rpn_loss(self, label_cls, label_loc, label_loc_weight, label_mask,
                      label_mask_weight, rpn_pred_cls, rpn_pred_loc,
                      rpn_pred_mask):
        rpn_loss_cls = select_cross_entropy_loss(rpn_pred_cls, label_cls)

        rpn_loss_loc = weight_l1_loss(rpn_pred_loc, label_loc,
                                      label_loc_weight)

        rpn_loss_mask, iou_m, iou_5, iou_7 = select_mask_logistic_loss(
            rpn_pred_mask, label_mask, label_mask_weight)

        return rpn_loss_cls, rpn_loss_loc, rpn_loss_mask, iou_m, iou_5, iou_7

    def run(self, template, search, softmax=False):
        """
        run network
        """
        template_feature = self.feature_extractor(template)
        search_feature = self.feature_extractor(search)
        rpn_pred_cls, rpn_pred_loc = self.rpn(template_feature, search_feature)
        rpn_pred_mask = self.mask(template_feature,
                                  search_feature)  # (b, 63*63, w, h)

        if softmax:  # apply softmax during training; at inference time it is not needed
            rpn_pred_cls = self.softmax(rpn_pred_cls)
        return rpn_pred_cls, rpn_pred_loc, rpn_pred_mask, template_feature, search_feature

    def softmax(self, cls):
        b, a2, h, w = cls.size()  # two-way classification: a2 // 2 is the number of anchors per location
        cls = cls.view(b, 2, a2 // 2, h, w)
        cls = cls.permute(0, 2, 3, 4, 1).contiguous()  # make the permuted tensor contiguous in memory
        cls = F.log_softmax(cls, dim=4)  # log-softmax over the class dimension (dim=4); shape b*k*h*w*2
        return cls
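
    # Shape walkthrough for softmax (hypothetical sizes): with b = 16, a2 = 10
    # (5 anchors x 2 classes) and h = w = 25, view yields [16, 2, 5, 25, 25],
    # permute yields [16, 5, 25, 25, 2], and log_softmax normalises the final
    # 2-class dimension.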

    def forward(self, input):
        """
        :param input: dict of input with keys of:
                'template': [b, 3, h1, w1], input template image.
                'search': [b, 3, h2, w2], input search image.
                'label_cls': [b, max_num_gts, 5] or None (self.training == False),
                                     each gt (ground truth) contains x1, y1, x2, y2, class.
        :return: dict of loss, predict, accuracy
        """
        template = input['template']
        search = input['search']
        if self.training:  # self.training is inherited from nn.Module and toggled by train()/eval()
            label_cls = input['label_cls']
            label_loc = input['label_loc']
            label_loc_weight = input['label_loc_weight']
            label_mask = input['label_mask']
            label_mask_weight = input['label_mask_weight']

        rpn_pred_cls, rpn_pred_loc, rpn_pred_mask, template_feature, search_feature = \
            self.run(template, search, softmax=self.training)

        outputs = dict()

        outputs['predict'] = [
            rpn_pred_loc, rpn_pred_cls, rpn_pred_mask, template_feature,
            search_feature
        ]

        if self.training:
            rpn_loss_cls, rpn_loss_loc, rpn_loss_mask, iou_acc_mean, iou_acc_5, iou_acc_7 = \
                self._add_rpn_loss(label_cls, label_loc, label_loc_weight, label_mask, label_mask_weight,
                                   rpn_pred_cls, rpn_pred_loc, rpn_pred_mask)
            outputs['losses'] = [rpn_loss_cls, rpn_loss_loc, rpn_loss_mask]
            outputs['accuracy'] = [iou_acc_mean, iou_acc_5, iou_acc_7]

        return outputs

    def template(self, z):
        self.zf = self.feature_extractor(z)
        cls_kernel, loc_kernel = self.rpn_model.template(self.zf)  # precompute correlation kernels from the template features
        return cls_kernel, loc_kernel

    def track(self, x, cls_kernel=None, loc_kernel=None, softmax=False):
        xf = self.feature_extractor(x)
        rpn_pred_cls, rpn_pred_loc = self.rpn_model.track(
            xf, cls_kernel, loc_kernel)
        if softmax:
            rpn_pred_cls = self.softmax(rpn_pred_cls)
        return rpn_pred_cls, rpn_pred_loc
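
# A hedged training-step sketch (not from the original repo) showing how the dict
# interface of forward() above can drive one optimisation step. `model`, `batch`
# and `optimizer` are placeholders; `batch` must carry the keys read in forward(),
# and the 1 : 1.2 : 36 loss weighting is illustrative only.
def train_step_sketch(model, batch, optimizer):
    model.train()
    outputs = model(batch)
    rpn_loss_cls, rpn_loss_loc, rpn_loss_mask = outputs['losses']
    loss = rpn_loss_cls + 1.2 * rpn_loss_loc + 36 * rpn_loss_mask
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()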