Example #1
def project_masks_on_boxes(gt_masks, boxes, matched_idxs, M):
    """
    Given segmentation masks and the bounding boxes corresponding
    to the location of the masks in the image, this function
    crops and resizes the masks in the position defined by the
    boxes. This prepares the masks for them to be fed to the
    loss computation as the targets.
    """
    matched_idxs = matched_idxs.to(boxes)
    rois = torch.cat([matched_idxs[:, None], boxes], dim=1)
    gt_masks = gt_masks[:, None].to(rois)
    return roi_align(gt_masks, rois, (M, M), 1)[:, 0]
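A minimal usage sketch for the helper above; the shapes and values are illustrative assumptions, and torch / torchvision.ops.roi_align are assumed to be imported as the function requires.

import torch
from torchvision.ops import roi_align  # used inside project_masks_on_boxes

gt_masks = (torch.rand(4, 56, 56) > 0.5).float()          # 4 full-image GT masks
boxes = torch.tensor([[4., 4., 40., 40.]]).repeat(4, 1)   # one xyxy box per proposal
matched_idxs = torch.arange(4, dtype=torch.float32)       # proposal i uses mask i
targets = project_masks_on_boxes(gt_masks, boxes, matched_idxs, M=28)
print(targets.shape)  # torch.Size([4, 28, 28])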
Example #2
 def get_yolo_feature_vec(self, coords):
     feature_map = self.get_feature_map()
     ratio = self.img_size / feature_map.size()[2]
     # prepend the batch index expected by roi_align: (batch_idx, x1, y1, x2, y2)
     coords = torch.cat((torch.Tensor([0]), torch.Tensor(coords))).view(1, 5).cuda()
     with torch.no_grad():
         roi = roi_align(feature_map, coords, (3, 3), spatial_scale=1 / ratio)
     vec = F.adaptive_avg_pool2d(roi, (1, 1))
     return np.squeeze(vec.cpu().detach().numpy())
Example #3
    def __call__(self, feats, masks_to_concat, num_obj):
        out_instance_features = []
        masks_to_concat = F.interpolate(masks_to_concat.unsqueeze(0),
                                        size=feats.shape[-2:])
        for obj_idx in range(num_obj):
            obj_mask = masks_to_concat[:, obj_idx, ...]
            bbx = extract_bboxes(obj_mask, self.dilate)
            instance_features = roi_align(feats, bbx, self.spatial_size)
            out_instance_features.append(instance_features)

        # note: the ROI-aligned per-instance features are collected above, but the
        # return value only concatenates each object mask onto the feature map
        return [
            torch.cat((feats, masks_to_concat[:, obj_idx, ...]),
                      dim=0).unsqueeze(0) for obj_idx in range(num_obj)
        ]
Example #4
    def forward(self, input, targets):
        boxes = targets[:, [0, 2, 3, 4, 5]]
        _, _, h, w = input.shape
        boxes[:, [2, 4]] *= h
        boxes[:, [1, 3]] *= w
        # roi_align needs an integer output size; use the mean box height/width
        o_h, o_w = int(torch.mean(boxes[:, 4])), int(torch.mean(boxes[:, 3]))
        boxes[:, 1:] = xywh2xyxy(boxes[:, 1:])
        feat = roi_align(input, boxes, output_size=(o_h, o_w))
        out = self.bn(feat)
        out = self.act(out)
        out = self.pooling(out)
        out = self.linear(out.squeeze())

        return out
Example #5
def extract_features(args):
    model = get_model(args)
    # model = RoiModel()
    # model = nn.DataParallel(model)
    # model = model.cuda()
    # model.eval()

    bboxes, keys = get_bbox(args)

    loader = get_dataloader(args, keys)

    N = len(loader.dataset)

    # out_channels = 2048 if args.arch == 'resnet152' else 512
    if args.arch == 'resnet152':
        out_channels = 2048
    elif args.arch == 'vgg16':
        out_channels = 512
    else:
        raise ValueError('unsupported arch: {}'.format(args.arch))

    fp = open_memmap(
        os.path.join(args.output_dir, 'data.npy'),
        mode='w+',
        dtype=np.float32,
        shape=(N, args.num_boxes, out_channels)
    )

    with torch.no_grad():
        for i, images in tqdm(enumerate(loader), total=len(loader)):
            images = images.cuda()

            output = model(images)

            current_batch_size = images.shape[0]

            current_index = i * args.batch_size
            current_boxes = bboxes[current_index: current_index +
                                   current_batch_size]
            current_boxes = [b.cuda() for b in current_boxes]
            output = roi_align(output, current_boxes, (1, 1))

            fp[current_index: current_index + current_batch_size] = output.view(
                current_batch_size, args.num_boxes, out_channels).cpu().numpy()

    print(fp[N - 1])
    del fp

    loader.dataset.save_indices(args.output_dir)
Example #6
 def forward(self, input, rois):
     """
     Args:
         input: NCHW images
         rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
     """
     assert rois.dim() == 2 and rois.size(1) == 5
     return roi_align(
         input,
         rois.to(dtype=input.dtype),
         self.output_size,
         self.spatial_scale,
         self.sampling_ratio,
         self.aligned,
     )
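A quick sketch (made-up sizes) of the Bx5 roi convention documented above, calling torchvision's roi_align directly:

import torch
from torchvision.ops import roi_align

feats = torch.rand(2, 3, 64, 64)              # NCHW feature maps
rois = torch.tensor([[0., 4., 4., 20., 20.],  # column 0: index into N
                     [1., 8., 8., 48., 32.]]) # columns 1-4: x1, y1, x2, y2
out = roi_align(feats, rois, output_size=(7, 7), spatial_scale=1.0,
                sampling_ratio=2, aligned=True)
print(out.shape)  # torch.Size([2, 3, 7, 7])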
Example #7
    def crop_tracklets(self, boxes, frame, ber=None):
        """
        Crops relevant areas from frame based on a priori (pre_locations) object locations
        """
        if ber is None:
            ber = self.ber

        #box_ids = []
        #box_list = []

        # # convert to array
        # for id in pre_locations:
        #     box_ids.append(id)
        #     box_list.append(pre_locations[id][:4])
        # boxes = np.array(box_list)
        # boxes = pre_locations

        temp = np.zeros(boxes.shape)
        temp[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0
        temp[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0
        temp[:, 2] = boxes[:, 2] - boxes[:, 0]
        temp[:, 3] = (boxes[:, 3] - boxes[:, 1]) / (temp[:, 2] + 1e-07)
        boxes = temp

        # the first column is the batch index (0, since the batch holds a single frame) for roi_align
        new_boxes = np.zeros([len(boxes), 5])

        # use either s or s x r for both dimensions, whichever is larger, so the crop is square
        box_scales = np.max(np.stack((boxes[:, 2], boxes[:, 2] * boxes[:, 3]),
                                     axis=1),
                            axis=1)  #/2.0

        #expand box slightly
        box_scales = box_scales * ber  # box expansion ratio

        new_boxes[:, 1] = boxes[:, 0] - box_scales / 2
        new_boxes[:, 3] = boxes[:, 0] + box_scales / 2
        new_boxes[:, 2] = boxes[:, 1] - box_scales / 2
        new_boxes[:, 4] = boxes[:, 1] + box_scales / 2

        torch_boxes = torch.from_numpy(new_boxes).float().to(self.device)

        # crop using roi align
        crops = roi_align(frame.unsqueeze(0), torch_boxes, (self.cs, self.cs))

        return crops, new_boxes, box_scales
Example #8
def extract_patch_from_frame(image, coordinates, output_shape):
    """ This should be the inverse operation to translate_and_scale_patch
    """

    # Translate from coordinates in (x_center, y_center, w, h) to (minx, miny, maxx, maxy)
    xyxy = torch.zeros_like(coordinates)
    xyxy[:, 0] = coordinates[:, 0] - image.shape[-2] * coordinates[:, 2] / 2
    xyxy[:, 1] = coordinates[:, 1] - image.shape[-1] * coordinates[:, 3] / 2
    xyxy[:, 2] = coordinates[:, 0] + image.shape[-2] * coordinates[:, 2] / 2
    xyxy[:, 3] = coordinates[:, 1] + image.shape[-1] * coordinates[:, 3] / 2
    xyxy_with_index = torch.cat(
        (torch.arange(xyxy.shape[0], dtype=xyxy.dtype,
                      device=xyxy.device).view(-1, 1), xyxy),
        dim=1)

    patches = roi_align(image, xyxy_with_index, output_shape, aligned=True)
    return patches
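A small, self-contained illustration (values assumed, not from the original) of the aligned=True flag used above; with aligned=True the half-pixel offset is removed before bilinear sampling, which shifts the sampled values slightly:

import torch
from torchvision.ops import roi_align

img = torch.arange(16.).reshape(1, 1, 4, 4)  # a tiny 4x4 "image"
roi = torch.tensor([[0., 0., 0., 2., 2.]])   # (batch_idx, x1, y1, x2, y2)
print(roi_align(img, roi, (2, 2), aligned=False))
print(roi_align(img, roi, (2, 2), aligned=True))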
Example #9
    def MultiScaleRoiAlign(self, fpn_feat_list, proposals, P=7):
        #####################################
        # Here you can use torchvision.ops.RoIAlign check the docs
        #####################################

        proposals = torch.cat(proposals)
        total_proposals = len(proposals)
        x1, y1, x2, y2 = proposals.T
        p_width = torch.abs(x1 - x2)
        p_height = torch.abs(y1 - y2)
        # k-values are clipped to [2, 5] according to piazza
        fpn_idx = (4 + torch.log2(torch.sqrt(p_width * p_height) / 224)).floor().clamp(min=2, max=5)
        fpn_idx -= 2

        feature_vectors = torch.zeros(
            (len(proposals), 256 * P * P)).to(self.device)
        for i in range(len(fpn_feat_list)):

            matched_proposals_idx = torch.where(fpn_idx == i)
            img_id = matched_proposals_idx[0] // 200
            matched_proposals = proposals[matched_proposals_idx]
            matched_proposals = torch.cat(
                (img_id.unsqueeze(dim=1), matched_proposals), dim=1)
            # convert proposal box from img coord to featuremap coord
            # matched_proposals *= fpn_feat_list[i].shape[-1]/ 1088

            # rescale the x-coords to the feature-map width
            matched_proposals[:, (1, 3)] *= fpn_feat_list[i].shape[-1] / 1088
            # rescale the y-coords to the feature-map height
            matched_proposals[:, (2, 4)] *= fpn_feat_list[i].shape[-2] / 800

            aligned_box = ops.roi_align(
                fpn_feat_list[i], boxes=matched_proposals,
                output_size=P)  #shape: #proposals in feature level i, 256*p*p
            aligned_box = torch.flatten(aligned_box, -3, -1)
            feature_vectors[matched_proposals_idx] = aligned_box.to(
                self.device)  #

        assert feature_vectors.shape[0] == total_proposals

        return feature_vectors
Example #10
    def get_local_features(self, features, boxes, picture_width, picture_height):
        features_heights = features.shape[3]
        features_width = features.shape[2]
        boxes_copy = boxes.clone()
        boxes_copy[:, 0] = (boxes_copy[:, 0]*features_heights) / picture_height
        boxes_copy[:, 2] = (boxes_copy[:, 2]*features_heights) / picture_height
        boxes_copy[:, 1] = (boxes_copy[:, 1]*features_width) / picture_width
        boxes_copy[:, 3] = (boxes_copy[:, 3]*features_width) / picture_width
        batch = torch.arange(boxes_copy.shape[0]).unsqueeze(1).cuda().float()
        box_input = torch.cat((batch, boxes_copy), dim=1)
        roi_align_output = ops.roi_align(features, box_input, (1,1)).squeeze()
        roi_align_output[boxes[:, 0] == -1, :] = F.adaptive_avg_pool2d(features, (1,1)).squeeze()[boxes[:, 0] == -1, :]
        roi_align_output = roi_align_output.squeeze()

        if len(roi_align_output.shape) == 1:
            roi_align_output = roi_align_output.unsqueeze(0)

        return roi_align_output
Example #11
def project_masks_on_boxes(true_masks: Tensor, boxes: Tensor,
                           matched_indexes: Tensor, M: int) -> Tensor:
    """
    Given segmentation masks and the bounding boxes corresponding
    to the location of the masks in the image, this function
    crops and resizes the masks in the position defined by the
    boxes. This prepares the masks for them to be fed to the
    loss computation as the targets.
    Args:
        true_masks (Tensor): ground-truth segmentation masks
        boxes (Tensor): boxes (x1, y1, x2, y2) locating the masks in the image
        matched_indexes (Tensor): index of the matched ground-truth mask for each box
        M (int): side length of the output masks
    """
    matched_indexes = matched_indexes.to(boxes)
    rois = torch.cat([matched_indexes[:, None], boxes], dim=1)
    true_masks = true_masks[:, None].to(rois)
    return roi_align(true_masks, rois, (M, M), 1.)[:, 0]
Example #12
def adaptive_feature_pooling(scaled_features, rois):
    base_size = 28
    roi_size = 14

    first_f = True
    for i in range(len(scaled_features) - 1, -1, -1):
        # scale the rois to the resolution of feature level i without
        # overwriting the original boxes across iterations
        rois_scaled = rois * 2**i * base_size
        rois_pool_temp = ops.roi_align(input=scaled_features[i],
                                       boxes=[rois_scaled],
                                       output_size=roi_size)
        if first_f:
            rois_pool = rois_pool_temp
            first_f = False
        else:
            rois_pool = torch.maximum(rois_pool, rois_pool_temp)
            del rois_pool_temp

    return rois_pool
Example #13
    def __call__(self, pred__, feature):
        # detections with shape: nx6 (x1, y1, x2, y2, conf, cls)
        # bs = len(pr)
        # ig = (feature[0][0].permute(1,2,0).cpu().detach().numpy()*255).astype(np.int)
        # im_name  = str(time.time())+'.jpg'
        # cv2.imwrite('yanzheng/'+im_name,ig)
        feature = feature[0]
        preds = non_max_suppression_refine(pred__,
                                           self.conf_thres,
                                           self.iou_thres,
                                           classes=None)
        # print(self.conf_thres, self.iou_thres)
        # NCHW: shape[2] is the height, shape[3] the width
        pic_h, pic_w = feature.shape[2], feature.shape[3]

        boxes = []
        for i, pred_ in enumerate(preds):
            # print(torch.max(pred_[:,4]),pred_[0,4:])
            pred = pred_[:, 0:4]
            if pred.shape[0] > 20:
                pred = pred[0:20, :]
            # pred = torch.tensor([[0,160,160,320]]).to(feature.device).float()
            prd = torch.zeros_like(pred).to(feature.device)
            if pred.shape[0] != 0:
                w = (pred[:, 2] - pred[:, 0]).unsqueeze(0)
                h = (pred[:, 3] - pred[:, 1]).unsqueeze(0)
                c_y = (pred[:, 3] + pred[:, 1]) / 2
                c_x = (pred[:, 2] + pred[:, 0]) / 2
                wh = torch.cat((w, h), 0)
                wh = torch.max(wh, 0)[0] * 2
                prd[:, 3] = (c_y + wh / 2).clamp(0, pic_h - 1)
                prd[:, 1] = (c_y - wh / 2).clamp(0, pic_h - 1)
                prd[:, 2] = (c_x + wh / 2).clamp(0, pic_w - 1)
                prd[:, 0] = (c_x - wh / 2).clamp(0, pic_w - 1)

                # pred[]
                # pred = torch.tensor([[20.,20.,100.,100.]]).to(device)
                # print(pred.shape)
            boxes.append(prd)

        per_fear = ops.roi_align(feature, boxes, [32, 32])
        # ig = (per_fear[0].permute(1,2,0).cpu().detach().numpy()*255).astype(np.int)
        # im_name  = +'.jpg'
        # cv2.imwrite('yanzheng/'+im_name,ig)
        return per_fear, boxes
Example #14
    def crop_tracklets(self, pre_locations, frame):
        """
        Crops relevant areas from frame based on a priori (pre_locations) object locations
        """
        start = time.time()
        box_ids = []
        box_list = []

        # convert to array
        for id in pre_locations:
            box_ids.append(id)
            box_list.append(pre_locations[id][:4])
        boxes = np.array(box_list)

        temp = np.zeros(boxes.shape)
        temp[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0
        temp[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0
        temp[:, 2] = boxes[:, 2] - boxes[:, 0]
        temp[:, 3] = (boxes[:, 3] - boxes[:, 1]) / temp[:, 2]
        boxes = temp

        # convert xysr boxes into xmin, ymin, xmax, ymax
        # the first column is the batch index (0, since the batch holds a single frame) for roi_align
        new_boxes = np.zeros([len(boxes), 5])

        # use either s or s x r for both dimensions, whichever is smaller, so the crop is square
        box_scales = np.min(np.stack((boxes[:, 2], boxes[:, 2] * boxes[:, 3]),
                                     axis=1),
                            axis=1)  #/2.0

        #expand box slightly
        box_scales = box_scales * self.ber  # box expansion ratio

        new_boxes[:, 1] = boxes[:, 0] - box_scales / 2
        new_boxes[:, 3] = boxes[:, 0] + box_scales / 2
        new_boxes[:, 2] = boxes[:, 1] - box_scales / 2
        new_boxes[:, 4] = boxes[:, 1] + box_scales / 2
        torch_boxes = torch.from_numpy(new_boxes).float().to(self.device)

        # crop using roi align
        crops = roi_align(frame.unsqueeze(0), torch_boxes, (self.cs, self.cs))
        self.time_metrics['pre_localize and align'] += time.time() - start

        return crops, new_boxes, box_ids, box_scales
Example #15
def main():
    import torch
    import numpy as np
    import random
    from detectron2.structures import Boxes
    from detectron2.modeling.poolers import ROIPooler
    # from detectron2.layers import ROIAlign
    from torchvision.ops import roi_align
    import tensorflow as tf  # needed for the TF reference path below
    # _selective_crop_and_resize is assumed to be imported elsewhere from the TF
    # reference implementation this snippet compares against

    # np.printoptions(precision=4)
    # img = np.arange(25).reshape(5, 5).astype("float32")
    # img = np.tile(np.expand_dims(img, -1), [1, 1, 3])
    # inputs = tf.convert_to_tensor(img)
    # inputs = tf.reshape(inputs, [1, 1, 5, 5, 3])
    # boxes = tf.constant([[[1, 1, 3, 3]]], dtype=tf.float32)
    # pooled = _selective_crop_and_resize(inputs, boxes, tf.constant([[0]]), tf.constant([[5, 5]], tf.float32), 3, 3, 3, 0.5, True)
    # print(pooled[0, 0, :, :, 0])
    # inputs = torch.from_numpy(img.transpose(2, 0, 1)[None, :, :].astype("float32"))
    # rois = torch.from_numpy(np.array([0, 1, 1, 3, 3]).astype("float32"))[None, :]
    # output = roi_align(inputs, rois, (3, 3), 1, 0, True)
    # print(output[0, 0])

    np.random.seed(4)
    noise = np.random.uniform(0, 1, [32, 32, 1])
    img = np.arange(32 * 32).reshape(32, 32, 1).astype("float32")
    img += noise
    inputs = tf.convert_to_tensor(img, tf.float32)
    inputs = tf.reshape(inputs, [1, 1, 32, 32, 1])
    boxes = tf.constant([[[1, 1, 17, 17]]], dtype=tf.float32)
    pooled = _selective_crop_and_resize(inputs, boxes, tf.constant([[0]]),
                                        tf.constant([[31, 31]], tf.float32), 5,
                                        5, 1, 0.5, True)

    print(pooled[0, 0, :, :, 0])
    inputs = torch.from_numpy(
        img.transpose(2, 0, 1)[None, :, :].astype("float32"))
    # print(inputs.shape)
    rois = torch.from_numpy(np.array([0, 1, 1, 17,
                                      17]).astype("float32"))[None, :]
    output = roi_align(inputs, rois, (5, 5), 1, 0, True)
    output = output.permute(0, 2, 3, 1)
    print(output[0, :, :, 0])
Example #16
 def compute_mask_loss(self, mask_predict, positive_gt_idx, box_predicts,
                       targets):
     mask_gt = targets['mask'].split(targets['batch_len'])
     box_gt = targets['target'].split(targets['batch_len'])
     loss_mask_predicts = list()
     loss_mask_target = list()
     for mg, bg, mp, pgi, bp in zip(mask_gt, box_gt, mask_predict,
                                    positive_gt_idx, box_predicts):
         cls_idx = bg[:, 0].long()[pgi]
         mg_t = mg[pgi]
         mp = mp[range(len(mp)), cls_idx, :]
         bp_extend = torch.cat(
             [torch.arange(len(bp), device=bp.device)[:, None], bp], dim=-1)
         mt = roi_align(mg_t[:, None, :, :], bp_extend,
                        (mp.shape[-1], mp.shape[-1]), 1.)[:, 0]
         loss_mask_target.append(mt)
         loss_mask_predicts.append(mp)
     loss_mask_predicts = torch.cat(loss_mask_predicts)
     loss_mask_target = torch.cat(loss_mask_target)
     mask_loss = self.bce(loss_mask_predicts, loss_mask_target)
     return mask_loss
Example #17
def roi_pooler(fpn_fms, rois, stride, pool_shape, pooler_type):
    if pooler_type == "ROIAlign":
        pooler_aligned = False
    elif pooler_type == "ROIAlignV2":
        pooler_aligned = True
    else:
        raise ValueError("Unknown pooler type: {}".format(pooler_type))
    assert len(fpn_fms) == len(stride)
    max_level = int(math.log2(stride[-1]))
    min_level = int(math.log2(stride[0]))
    assert (len(stride) == max_level - min_level + 1)
    level_assignments = assign_boxes_to_levels(rois, min_level, max_level, 224, 4)
    dtype, device = fpn_fms[0].dtype, fpn_fms[0].device
    output = torch.zeros((len(rois), fpn_fms[0].shape[1], pool_shape[0], pool_shape[1]),
            dtype=dtype, device=device)
    for level, (fm_level, scale_level) in enumerate(zip(fpn_fms, stride)):
        inds = torch.nonzero(level_assignments == level, as_tuple=False).squeeze(1)
        rois_level = rois[inds]
        output[inds] = roi_align(fm_level, rois_level, pool_shape, spatial_scale=1.0/scale_level,
                sampling_ratio=-1, aligned=pooler_aligned)
    return output
Example #18
    def torchvision_roi_align(self, features, proposals, spatial_levels):
        output_features = torch.zeros((len(proposals), 256, 7, 7)).cuda()
        for i, scale in enumerate(self.pooler_scales):
            # get feature level
            feature = features[i]
            idxs = torch.where(spatial_levels == i)[0]

            if len(idxs) == 0:
                continue

            # get proposal
            proposal = proposals[idxs]

            output_feature = ops.roi_align(feature.unsqueeze(0), [proposal],
                                           output_size=self.output_size,
                                           spatial_scale=scale,
                                           sampling_ratio=2)
            output_features[idxs, :, :, :] = output_features[
                idxs, :, :, :] + output_feature

        return output_features
Example #19
def test_2():
    """Authenticate the pooled box pair features """
    f = torch.rand(1, 3, 512, 512)

    boxes_h = torch.rand(256, 4) * 256; boxes_h[:, 2:] += boxes_h[:, :2]
    boxes_h = torch.cat([torch.zeros(256, 1), boxes_h], 1)
    boxes_o = torch.rand(256, 4) * 256; boxes_o[:, 2:] += boxes_o[:, :2]
    boxes_o = torch.cat([torch.zeros(256, 1), boxes_o], 1)

    boxes_union = torch.zeros_like(boxes_h)
    boxes_union[:, 1] = torch.min(boxes_h[:, 1], boxes_o[:, 1])
    boxes_union[:, 2] = torch.min(boxes_h[:, 2], boxes_o[:, 2])
    boxes_union[:, 3] = torch.max(boxes_h[:, 3], boxes_o[:, 3])
    boxes_union[:, 4] = torch.max(boxes_h[:, 4], boxes_o[:, 4])

    m = MaskedBoxPairPool(
        output_size=7,
        spatial_scale=[1.0],
        sampling_ratio=4
    )
    # Compute pooled box pair features
    out1 = m([f], [boxes_h[:, 1:]], [boxes_o[:, 1:]])

    masks = m.construct_masks_for_box_pairs(f, 0, boxes_h, boxes_o)
    # Apply masks on feature maps
    f_stacked = f[boxes_union[:, 0].long()] * masks
    boxes_union[:, 0] = torch.arange(256)
    # Compute pooled box union features
    out2 = roi_align(f_stacked, boxes_union, 
        output_size=(7,7), spatial_scale=1.0, sampling_ratio=4)

    # Compare the pooled features
    # The two feature maps should be exactly the same
    assert out1.shape == out2.shape, \
        "Inconsistent feature map size"
    print("Feature maps are {}% matched.".format(
        100 * torch.eq(out1, out2).sum() / torch.as_tensor(out1.shape).prod()))
Example #20
    def forward(self, features, rois):
        batch_pooled_feats=[]
        batch_size,_,height_0, width_0 = features[0].size()
        for b in range(batch_size):
            pooled_feats = []
            for i in range(len(features)-1,-1,-1):
                keep_inds = (rois[b][:,6] == i)
                if (torch.sum(keep_inds) == 0):
                    continue
                roi = rois[b][keep_inds]
                rois_cords = self.resize_rois(roi[:, 1:5], features[i], height_0, width_0)
                # printing rois_cords.shape caused an illegal memory error; converting the
                # boxes to a list seems to work (1/30)
                x = roi_align(features[i][b:b + 1], [rois_cords],
                              output_size=(self.aligned_width, self.aligned_height))
                x = F.avg_pool2d(x, kernel_size=2, stride=1)
                pooled_feats.append(x)

            pooled_feats = torch.cat(pooled_feats, dim =0)
            pooled_feats = torch.unsqueeze(pooled_feats, dim = 0)
            batch_pooled_feats.append(pooled_feats)
        batch_pooled_feats = torch.cat(batch_pooled_feats, dim=0)
        batch_size , n_roi, c, h, w = batch_pooled_feats.size()
        batch_pooled_feats=batch_pooled_feats.view(batch_size*n_roi,-1)
        #print(batch_pooled_feats.size())
        return batch_pooled_feats, batch_size, n_roi, c ,h ,w
Example #21
    def forward(self, x, coords):
        self.cshape = coords.shape

        batch_size, num_points = self.cshape[0], self.cshape[1]
        ctx_kwarg = self._ctx_kwarg(coords)
        coords = torch.reshape(coords, shape=(-1, 2))
        idx = [i for i in range(coords.shape[1]-1, -1, -1)]
        idx = torch.LongTensor(idx).cuda()
        coords = torch.index_select(coords, 1, idx)

        if self.extraction_method == 'ROIAlign':
            coords = coords - 0.5 / self.spatial_scale
            coords2 = coords
        else:
            coords2 = coords
        rois = torch.cat((coords, coords2), dim=1)
        bi = torch.arange(start=0, end=batch_size, step=1, **ctx_kwarg)
        bi = bi.repeat(num_points)
        bi = torch.reshape(bi, shape=(-1, 1)).type(torch.float32)
        rois = torch.cat((bi.cuda(), rois.float()), dim=1)
        w = roi_align(x, rois, (1, 1), spatial_scale=self.spatial_scale)
        w = torch.reshape(w, shape=(w.shape[0], -1))

        return w
Example #22
def skip_track(track_path, tracker, det_step = 1, srr = 0, ber = 1, PLOT = True):
        
    init_frames = 3
    
    fsld_max = det_step
    
    # CUDA for PyTorch
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    torch.cuda.empty_cache() 
            
    # get CNNs
    try:
        detector
        localizer
    except:
        detector,localizer = load_models(device)
        
    
    localizer.eval()
         
    # Loop Setup
    frames,n_frames = load_all_frames(track_path,det_step,init_frames,cutoff = None)
    
    frame_num = 0               # iteration counter   
    next_obj_id = 0             # next id for a new object (incremented during tracking)
    fsld = {}                   # fsld[id] stores frames since last detected for object id
    
    all_tracks = {}             # stores states for each object
    all_classes = {}            # stores class evidence for each object
    
    # for keeping track of what's using up time
    time_metrics = {            
        "gpu_load":0,
        "predict":0,
        "pre_localize and align":0,
        "localize":0,
        "post_localize":0,
        "detect":0,
        "parse":0,
        "match":0,
        "match2":0,
        "update":0,
        "add and remove":0,
        "store":0,
        "plot":0
        }
                
                
    # 3. Main Loop

    for (frame,dim,original_im) in frames:
        
        # 1. Move image to GPU
        start = time.time()
        frame = frame.to(device,non_blocking = True)
        if frame_num % det_step < init_frames: #if frame_num % det_step == 0:
            dim = dim.to(device,non_blocking = True)                      
        time_metrics['gpu_load'] += time.time() - start
        

        # 2. Predict next object locations
        start = time.time()
        try: # in the case that there are no active objects will throw exception

            tracker.predict()
            pre_locations = tracker.objs()
        except:
            pre_locations = []    
        time_metrics['predict'] += time.time() - start
    
       
        if frame_num % det_step < init_frames: #Use YOLO
            # 3a. YOLO detect                            
            detections = detector.detect2(frame,dim)
            torch.cuda.synchronize(device)
            time_metrics['detect'] += time.time() - start
            
            start = time.time()
            detections = detections.cpu()
            time_metrics['gpu_load'] += time.time() - start

            
            # postprocess detections
            start = time.time()
            detections = parse_detections(detections)
            time_metrics['parse'] += time.time() - start
         
            # 4a. Match, using Hungarian Algorithm        
            start = time.time()
            
            pre_ids = []
            pre_loc = []
            for id in pre_locations:
                pre_ids.append(id)
                pre_loc.append(pre_locations[id])
            pre_loc = np.array(pre_loc)
            
            # matchings[i] = [a,b] where a is index of pre_loc and b is index of detection
            matchings = match_hungarian(pre_loc,detections[:,:4],iou_cutoff = 0.05)
            time_metrics['match'] += time.time() - start
            
            # try:
            #     start = time.time()
            #     matchings2 = match_greedy(pre_loc,detections[:,:4], threshold = 200)
            #     time_metrics['match2'] += time.time() - start
            # except:
            #     print("failed")
            
            # 5a. Update tracked objects
            start = time.time()
    
            update_array = np.zeros([len(matchings),4])
            update_ids = []
            for i in range(len(matchings)):
                a = matchings[i,0] # index of pre_loc
                b = matchings[i,1] # index of detections
               
                update_array[i,:] = detections[b,:4]
                update_ids.append(pre_ids[a])
                fsld[pre_ids[a]] = 0 # fsld = 0 since this id was detected this frame
            
            if len(update_array) > 0:    
                tracker.update(update_array,update_ids)
              
                time_metrics['update'] += time.time() - start
                  
            
            # 6a. For each detection not in matchings, add a new object
            start = time.time()
            
            new_array = np.zeros([len(detections) - len(matchings),4])
            new_ids = []
            cur_row = 0
            for i in range(len(detections)):
                if len(matchings) == 0 or i not in matchings[:,1]:
                    
                    new_ids.append(next_obj_id)
                    new_array[cur_row,:] = detections[i,:4]
    
                    fsld[next_obj_id] = 0
                    all_tracks[next_obj_id] = np.zeros([n_frames,7])
                    all_classes[next_obj_id] = np.zeros(13)
                    
                    next_obj_id += 1
                    cur_row += 1
           
            if len(new_array) > 0:        
                tracker.add(new_array,new_ids)
            
            
            # 7a. For each untracked object, increment fsld        
            for i in range(len(pre_ids)):
                try:
                    if i not in matchings[:,0]:
                        fsld[pre_ids[i]] += 1
                except:
                    fsld[pre_ids[i]] += 1
            
            # 8a. remove lost objects
            removals = []
            for id in pre_ids:
                if fsld[id] > fsld_max:
                    removals.append(id)
           
            if len(removals) > 0:
                tracker.remove(removals)    
            
            time_metrics['add and remove'] += time.time() - start


            
        elif True: # use Resnet  
            # 3b. crop tracked objects from image
            start = time.time()
            # use predicted states to crop relevant portions of frame 
            box_ids = []
            box_list = []
            
            # convert to array
            for id in pre_locations:
                box_ids.append(id)
                box_list.append(pre_locations[id][:4])
            boxes = np.array(box_list)
            
            # convert xysr boxes into xmin, ymin, xmax, ymax
            # the first column holds the batch index for roi_align (filled in per-crop below)
            new_boxes = np.zeros([len(boxes),5])

            # use either s or s x r for both dimensions, whichever is smaller, so the crop is square
            #box_scales = np.max(np.stack((boxes[:,2],boxes[:,2]*boxes[:,3]),axis = 1),axis = 1)
            box_scales = np.min(np.stack((boxes[:,2],boxes[:,2]*boxes[:,3]),axis = 1),axis = 1) #/2.0

            # expand box slightly
            box_scales = box_scales * ber  # box expansion ratio
            
            new_boxes[:,1] = boxes[:,0] - box_scales/2
            new_boxes[:,3] = boxes[:,0] + box_scales/2 
            new_boxes[:,2] = boxes[:,1] - box_scales/2 
            new_boxes[:,4] = boxes[:,1] + box_scales/2 
            torch_boxes = torch.from_numpy(new_boxes).float().to(device)
            
            if True: # mask other bboxes
                # these boxes are not square
                rect_boxes = np.zeros([len(boxes),4])
                rect_boxes[:,0] = boxes[:,0] - boxes[:,2] / 2.0
                rect_boxes[:,1] = boxes[:,1] - boxes[:,2] * boxes[:,3] / 2.0 
                rect_boxes[:,2] = boxes[:,0] + boxes[:,2] / 2.0
                rect_boxes[:,3] = boxes[:,1] + boxes[:,2] * boxes[:,3] / 2.0 
                rect_boxes = rect_boxes.astype(int)
                frame_copy = frame.clone()
                for rec in rect_boxes:
                    frame_copy[:,rec[1]:rec[3],rec[0]:rec[2]] = 0
                frame_copy = frame_copy.unsqueeze(0).repeat(len(boxes),1,1,1)
                
                # in each crop, replace active box with correct pixels
                for i in range(len(rect_boxes)):
                    torch_boxes[i,0] = i # so images are indexed correctly
                    rec = rect_boxes[i]
                    frame_copy[i,:,rec[1]:rec[3],rec[0]:rec[2]] = frame[:,rec[1]:rec[3],rec[0]:rec[2]]
            
            else:
                frame_copy = frame.unsqueeze(0)
            # crop using roi align 
            crops = roi_align(frame_copy,torch_boxes,(224,224))
            time_metrics['pre_localize and align'] += time.time() - start
            
            
            
            # 4b. Localize objects using localizer
            start= time.time()
            cls_out,reg_out = localizer(crops)
            torch.cuda.synchronize()
            time_metrics['localize'] += time.time() - start
            
            start = time.time()
            if  False:
                test_outputs(reg_out,crops)
            
            # store class predictions
            highest_conf,cls_preds = torch.max(cls_out,1)
            for i in range(len(cls_preds)):
                all_classes[box_ids[i]][cls_preds[i].item()] += 1
            
            
            # 5b. convert to global image coordinates 
                
            # these detections are relative to crops - convert to global image coords
            wer = 3 # window expansion ratio, was set during training
            
            detections = (reg_out* 224*wer - 224*(wer-1)/2)
            detections = detections.data.cpu()
            
            # add in original box offsets and scale outputs by original box scales
            detections[:,0] = detections[:,0]*box_scales/224 + new_boxes[:,1]
            detections[:,2] = detections[:,2]*box_scales/224 + new_boxes[:,1]
            detections[:,1] = detections[:,1]*box_scales/224 + new_boxes[:,2]
            detections[:,3] = detections[:,3]*box_scales/224 + new_boxes[:,2]

            # convert into xysr form 
            output = np.zeros([len(detections),4])
            output[:,0] = (detections[:,0] + detections[:,2]) / 2.0
            output[:,1] = (detections[:,1] + detections[:,3]) / 2.0
            output[:,2] = (detections[:,2] - detections[:,0])
            output[:,3] = (detections[:,3] - detections[:,1]) / output[:,2]
            
            
            #lastly, replace scale and ratio with original values 
            ## NOTE this is kind of a cludgey fix and ideally localizer should be better
            output[:,2:4] = srr*output[:,2:4] + (1-srr)*boxes[:,2:4] 
            time_metrics['post_localize'] += time.time() - start
            detections = output

            # 6b. Update tracker
            start = time.time()
            # map regressed bboxes directly to objects for update step
            tracker.update(output,box_ids)
            time_metrics['update'] += time.time() - start
            
            # 7b. increment all fslds
            for i in range(len(pre_ids)):
                    fsld[pre_ids[i]] += 1
        
        
            # Low confidence removals
            if True:
                removals = []
                locations = tracker.objs()
                for i in range(len(box_ids)):
                    if highest_conf[i] < 3 and box_ids[i] in locations:
                        removals.append(box_ids[i])
                        print("Removed low confidence object")
                tracker.remove(removals)
        
        # IOU suppression on overlapping bounding boxes
        if True:
            removals = []
            locations = tracker.objs()
            for i in locations:
                for j in locations:
                    if i != j:
                        iou_metric = iou(locations[i],locations[j])
                        if iou_metric > 0.5:
                            # keep whichever object has accumulated more class evidence
                            # (len() here would always be 13, so compare the summed counts instead)
                            if np.sum(all_classes[i]) > np.sum(all_classes[j]):
                                removals.append(j)
                            else:
                                removals.append(i)
            removals = list(set(removals))
            tracker.remove(removals)
            
            
        # 9. Get all object locations and store in output dict
        start = time.time()
        post_locations = tracker.objs()
        for id in post_locations:
            all_tracks[id][frame_num,:] = post_locations[id][:7]        
        time_metrics['store'] += time.time() - start  
        
        
        # 10. Plot
        start = time.time()
        if PLOT:
            plot(original_im,detections,post_locations,all_classes,class_dict,frame = frame_num)
        time_metrics['plot'] += time.time() - start
   
            
        # increment frame counter 
        if frame_num % 1000 == 0:
            print("Finished frame {}".format(frame_num))
        frame_num += 1
        torch.cuda.empty_cache()
            
    
    cv2.destroyAllWindows()
    
    del frames
    
    total_time = 0
    for key in time_metrics:
        total_time += time_metrics[key]
    
    if False:
        print("Finished file {} for det_step {}".format(track_path,det_step))
        print("\n\nTotal Framerate: {:.2f} fps".format(n_frames/total_time))
        print("---------- per operation ----------")
        for key in time_metrics:
            print("{:.3f}s ({:.2f}%) on {}".format(time_metrics[key],time_metrics[key]/total_time*100,key))


    #write final output 
        
    final_output = []
    for frame in range(n_frames):
        frame_objs = []
        
        for id in all_tracks:
            bbox = all_tracks[id][frame]
            if bbox[0] != 0:
                obj_dict = {}
                obj_dict["id"] = id
                obj_dict["class_num"] = np.argmax(all_classes[id])
                x0 = bbox[0] - bbox[2]/2.0
                x1 = bbox[0] + bbox[2]/2.0
                y0 = bbox[1] - bbox[2]*bbox[3]/2.0
                y1 = bbox[1] + bbox[2]*bbox[3]/2.0
                obj_dict["bbox"] = np.array([x0,y0,x1,y1])
                
                frame_objs.append(obj_dict)
        
        final_output.append(frame_objs)
        
    return final_output, n_frames/total_time, time_metrics
Example #23
 def script_fn(input, rois, pool_size):
     # type: (Tensor, Tensor, int) -> Tensor
     return ops.roi_align(input, rois, pool_size, 1.0)[0]
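The `# type:` comment above suggests this helper is meant to be TorchScript-compatible; a hedged sketch of exercising it that way, assuming script_fn (and the torchvision.ops import it uses as `ops`) is defined as above:

import torch
from torchvision import ops

scripted = torch.jit.script(script_fn)
feats = torch.rand(1, 3, 32, 32)
rois = torch.tensor([[0., 2., 2., 16., 16.]])
print(scripted(feats, rois, 7).shape)  # torch.Size([3, 7, 7])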
Example #24
    def forward(
        self,
        x: Dict[str, Tensor],
        boxes: List[Tensor],
        image_shapes: List[Tuple[int, int]],
    ) -> Tensor:
        """
        Args:
            x (OrderedDict[Tensor]): feature maps for each level. They are assumed to have
                all the same number of channels, but they can have different sizes.
            boxes (List[Tensor[N, 4]]): boxes to be used to perform the pooling operation, in
                (x1, y1, x2, y2) format and in the image reference size, not the feature map
                reference. The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
            image_shapes (List[Tuple[height, width]]): the sizes of each image before they
                have been fed to a CNN to obtain feature maps. This allows us to infer the
                scale factor for each one of the levels to be pooled.
        Returns:
            result (Tensor)
        """
        x_filtered = []
        for k, v in x.items():
            if k in self.featmap_names:
                x_filtered.append(v)
        num_levels = len(x_filtered)
        rois = self.convert_to_roi_format(boxes)
        if self.scales is None:
            self.setup_scales(x_filtered, image_shapes)

        scales = self.scales
        assert scales is not None

        if num_levels == 1:
            return roi_align(x_filtered[0],
                             rois,
                             output_size=self.output_size,
                             spatial_scale=scales[0],
                             sampling_ratio=self.sampling_ratio)

        mapper = self.map_levels
        assert mapper is not None

        levels = mapper(boxes)

        num_rois = len(rois)
        num_channels = x_filtered[0].shape[1]

        dtype, device = x_filtered[0].dtype, x_filtered[0].device
        result = torch.zeros(
            (
                num_rois,
                num_channels,
            ) + self.output_size,
            dtype=dtype,
            device=device,
        )

        tracing_results = []
        for level, (per_level_feature,
                    scale) in enumerate(zip(x_filtered, scales)):
            idx_in_level = torch.where(levels == level)[0]
            rois_per_level = rois[idx_in_level]

            result_idx_in_level = roi_align(per_level_feature,
                                            rois_per_level,
                                            output_size=self.output_size,
                                            spatial_scale=scale,
                                            sampling_ratio=self.sampling_ratio)

            if torchvision._is_tracing():
                tracing_results.append(result_idx_in_level.to(dtype))
            else:
                # result and result_idx_in_level's dtypes are based on dtypes of different
                # elements in x_filtered.  x_filtered contains tensors output by different
                # layers.  When autocast is active, it may choose different dtypes for
                # different layers' outputs.  Therefore, we defensively match result's dtype
                # before copying elements from result_idx_in_level in the following op.
                # We need to cast manually (can't rely on autocast to cast for us) because
                # the op acts on result in-place, and autocast only affects out-of-place ops.
                result[idx_in_level] = result_idx_in_level.to(result.dtype)

        if torchvision._is_tracing():
            result = _onnx_merge_levels(levels, tracing_results)

        return result
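This forward matches torchvision's multi-scale pooler; for reference, a usage sketch through the public MultiScaleRoIAlign wrapper (feature names and sizes below are illustrative):

import torch
from collections import OrderedDict
from torchvision.ops import MultiScaleRoIAlign

pooler = MultiScaleRoIAlign(featmap_names=['feat1', 'feat2'], output_size=7,
                            sampling_ratio=2)
feats = OrderedDict(feat1=torch.rand(1, 5, 64, 64), feat2=torch.rand(1, 5, 32, 32))
boxes = [torch.tensor([[10., 10., 80., 80.], [20., 20., 120., 120.]])]
image_shapes = [(256, 256)]  # original image sizes, used to infer the scales
pooled = pooler(feats, boxes, image_shapes)
print(pooled.shape)  # torch.Size([2, 5, 7, 7])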
Example #25
            box_scales = np.min(np.stack(
                (boxes[:, 2], boxes[:, 2] * boxes[:, 3]), axis=1),
                                axis=1)  #/2.0

            #expand box slightly
            ber = 1
            box_scales = box_scales * ber  # box expansion ratio

            new_boxes[:, 1] = boxes[:, 0] - box_scales / 2
            new_boxes[:, 3] = boxes[:, 0] + box_scales / 2
            new_boxes[:, 2] = boxes[:, 1] - box_scales / 2
            new_boxes[:, 4] = boxes[:, 1] + box_scales / 2
            torch_boxes = torch.from_numpy(new_boxes).float().to(device)

            # crop using roi align
            crops = roi_align(frame.unsqueeze(0), torch_boxes, (224, 224))

            # 4b. Localize objects using localizer

            cls_out, reg_out = localizer(crops)
            torch.cuda.synchronize()

            # 5b. convert to global image coordinates

            # these detections are relative to crops - convert to global image coords
            wer = 3  # window expansion ratio, was set during training

            detections = (reg_out * 224 * wer - 224 * (wer - 1) / 2)
            detections = detections.data.cpu()

            # add in original box offsets and scale outputs by original box scales
Example #26
    def test_qroialign(self):
        """Make sure quantized version of RoIAlign is close to float version"""
        pool_size = 5
        img_size = 10
        n_channels = 2
        num_imgs = 1
        dtype = torch.float

        def make_rois(num_rois=1000):
            rois = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype)
            rois[:, 0] = torch.randint(0, num_imgs, size=(num_rois,))  # set batch index
            rois[:, 3:] += rois[:, 1:3]  # make sure boxes aren't degenerate
            return rois

        for aligned in (True, False):
            for scale, zero_point in ((1, 0), (2, 10), (0.1, 50)):
                for qdtype in (torch.qint8, torch.quint8, torch.qint32):

                    x = torch.randint(50, 100, size=(num_imgs, n_channels, img_size, img_size)).to(dtype)
                    qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype)

                    rois = make_rois()
                    qrois = torch.quantize_per_tensor(rois, scale=scale, zero_point=zero_point, dtype=qdtype)

                    x, rois = qx.dequantize(), qrois.dequantize()  # we want to pass the same inputs

                    y = ops.roi_align(
                        x,
                        rois,
                        output_size=pool_size,
                        spatial_scale=1,
                        sampling_ratio=-1,
                        aligned=aligned,
                    )
                    qy = ops.roi_align(
                        qx,
                        qrois,
                        output_size=pool_size,
                        spatial_scale=1,
                        sampling_ratio=-1,
                        aligned=aligned,
                    )

                    # The output qy is itself a quantized tensor and there might have been a loss of info when it was
                    # quantized. For a fair comparison we need to quantize y as well
                    quantized_float_y = torch.quantize_per_tensor(y, scale=scale, zero_point=zero_point, dtype=qdtype)

                    try:
                        # Ideally, we would assert this, which passes with (scale, zero) == (1, 0)
                        self.assertTrue((qy == quantized_float_y).all())
                    except AssertionError:
                        # But because the computations aren't exactly the same between the 2 RoIAlign procedures, some
                        # rounding error may lead to a difference of 2 in the output.
                        # For example with (scale, zero) = (2, 10), 45.00000... will be quantized to 44
                        # but 45.00000001 will be rounded to 46. We make sure below that:
                        # - such discrepancies between qy and quantized_float_y are very rare (less than 5%)
                        # - any difference between qy and quantized_float_y is == scale
                        diff_idx = torch.where(qy != quantized_float_y)
                        num_diff = diff_idx[0].numel()
                        self.assertTrue(num_diff / qy.numel() < .05)

                        abs_diff = torch.abs(qy[diff_idx].dequantize() - quantized_float_y[diff_idx].dequantize())
                        t_scale = torch.full_like(abs_diff, fill_value=scale)
                        torch.testing.assert_close(abs_diff, t_scale, rtol=1e-5, atol=1e-5)

        x = torch.randint(50, 100, size=(2, 3, 10, 10)).to(dtype)
        qx = torch.quantize_per_tensor(x, scale=1, zero_point=0, dtype=torch.qint8)
        rois = make_rois(10)
        qrois = torch.quantize_per_tensor(rois, scale=1, zero_point=0, dtype=torch.qint8)
        with self.assertRaisesRegex(RuntimeError, "Only one image per batch is allowed"):
            ops.roi_align(qx, qrois, output_size=pool_size)
Example #27
    def forward(self, images: torch.Tensor, boxes_coordinate: torch.Tensor,
                transcripts: torch.Tensor, src_key_padding_mask: torch.Tensor):
        '''

        :param images: whole_images, shape is (B, N, H, W, C), where B is batch size, N is the number of segments of
                the documents, H is height of image, W is width of image, C is channel of images (default is 3).
        :param boxes_coordinate: boxes coordinate, shape is (B, N, 8),
                where 8 is coordinates (x1, y1, x2, y2, x3, y3, x4, y4).
        :param transcripts: text segments, shape is (B, N, T, D), where T is the max length of transcripts,
                                D is dimension of model.
        :param src_key_padding_mask: text padding mask, shape is (B*N, T), True for padding value.
            if provided, specified padding elements in the key will be ignored by the attention.
            This is a binary mask. When the value is True, the corresponding value on the attention layer of the Transformer
            will be filled with -inf.
        need_weights: output attn_output_weights.
        :return: set of nodes X, shape is (B*N, T, D)
        '''

        B, N, T, D = transcripts.shape

        # get image embedding using cnn
        # (B, 3, H, W)
        _, _, origin_H, origin_W = images.shape

        # image embedding: (B, C, H/16, W/16)
        images = self.cnn(images)
        _, C, H, W = images.shape

        # generate rois for roi pooling, rois shape is (B, N, 5), 5 means (batch_index, x0, y0, x1, y1)
        rois_batch = torch.zeros(B, N, 5, device=images.device)
        # Loop on the every image.
        for i in range(B):  # (B, N, 8)
            # (N, 8)
            doc_boxes = boxes_coordinate[i]
            # (N, 4)
            pos = torch.stack(
                [doc_boxes[:, 0], doc_boxes[:, 1], doc_boxes[:, 4], doc_boxes[:, 5]],
                dim=1)
            rois_batch[i, :, 1:5] = pos
            rois_batch[i, :, 0] = i

        spatial_scale = float(H / origin_H)
        # use roi pooling get image segments
        # (B*N, C, roi_pooling_size, roi_pooling_size)
        if self.roi_pooling_mode == 'roi_align':
            image_segments = roi_align(images, rois_batch.view(-1, 5),
                                       self.roi_pooling_size, spatial_scale)
        else:
            image_segments = roi_pool(images, rois_batch.view(-1, 5),
                                      self.roi_pooling_size, spatial_scale)

        # (B*N, D, 1, 1)
        image_segments = F.relu(self.bn(self.conv(image_segments)))
        # (B*N, D)
        image_segments = image_segments.squeeze()

        # (B*N, 1, D)
        image_segments = image_segments.unsqueeze(dim=1)

        # add positional embedding
        transcripts_segments = self.pe_droput(
            transcripts +
            self.position_embedding[:, :, :transcripts.size(2), :])
        # (B*N, T ,D)
        transcripts_segments = transcripts_segments.reshape(B * N, T, D)

        # (B*N, T, D)
        image_segments = image_segments.expand_as(transcripts_segments)

        # here we first add image embedding and text embedding together,
        # then as the input of transformer to get a non-local fusion features, different from paper process.
        out = image_segments + transcripts_segments

        # (T, B*N, D)
        out = out.transpose(0, 1).contiguous()

        # (T, B*N, D)
        out = self.transformer_encoder(
            out, src_key_padding_mask=src_key_padding_mask)

        # (B*N, T, D)
        out = out.transpose(0, 1).contiguous()
        out = self.norm(out)
        out = F.dropout(out, p=self.dropout, training=self.training)

        return out
Example #28
    def forward(self, input):
        # args = get_args()

        if self.training:
            (intmd_fea, image, flang, bbox, pred_anchor, args) = input
            anchors_full = get_archors_full(args)
            batch_size = args.batch_size
            # n_neg=3
            roi_feat_all = []
            scores = []
            # iou_all=best_n_list
            roi_batch_all = []
            label_batch_all = []
            lang_all = []

            FG_THRESH = 0.5
            BG_THRESH_HI = 0.5
            BG_THRESH_LO = 0.00
            fg_rois_per_image = 2
            rois_per_image = 8

            for scale_ii in range(len(pred_anchor)):

                grid = args.size // (32 // (2**scale_ii))
                grid_size = 32 // (2**scale_ii)
                anchor_idxs = [x + 3 * scale_ii for x in [0, 1, 2]]
                anchors = [anchors_full[i] for i in anchor_idxs]
                # scaled_anchors = torch.from_numpy(np.asarray([(x[0] / (args.anchor_imsize / grid), \
                #                    x[1] / (args.anchor_imsize / grid)) for x in anchors])).float()

                ws = np.asarray([
                    np.round(x[0] * grid_size / (args.anchor_imsize / grid))
                    for x in anchors
                ])
                hs = np.asarray([
                    np.round(x[1] * grid_size / (args.anchor_imsize / grid))
                    for x in anchors
                ])

                x_ctr, y_ctr = (grid_size - 1) * 0.5, (grid_size - 1) * 0.5

                scaled_anchors = torch.from_numpy(
                    _mkanchors(ws, hs, x_ctr, y_ctr)).float().cuda()

                bbox_deltas = pred_anchor[scale_ii][:, :, :4, :, :]

                feat_height, feat_width = grid, grid
                shift_x = np.arange(0, feat_width) * grid_size
                shift_y = np.arange(0, feat_height) * grid_size
                shift_x, shift_y = np.meshgrid(shift_x, shift_y)
                shifts = torch.from_numpy(
                    np.vstack(
                        (shift_x.ravel(), shift_y.ravel(), shift_x.ravel(),
                         shift_y.ravel())).transpose())
                shifts = shifts.contiguous().type_as(bbox_deltas).float()

                A = 3
                K = shifts.size(0)

                # self._anchors = self._anchors.type_as(scores)
                # anchors = self._anchors.view(1, A, 4) + shifts.view(1, K, 4).permute(1, 0, 2).contiguous()
                anchors = scaled_anchors.view(1, A, 4) + shifts.view(K, 1, 4)
                anchors = anchors.view(1, K * A,
                                       4).expand(batch_size, K * A, 4)

                bbox_deltas = bbox_deltas.permute(0, 1, 3, 4, 2).contiguous()
                bbox_deltas = bbox_deltas.view(batch_size, -1, 4)

                proposals = bbox_transform_inv(anchors, bbox_deltas,
                                               batch_size, grid_size)  # xyxy

                proposals = clip_boxes(proposals, args.size, batch_size)

                gt_boxes = bbox.clone().unsqueeze(1)  #xyxy

                # gt_boxes_append = gt_boxes.new(gt_boxes.size()).zero_()
                # gt_boxes_append[:, :, 1:5] = gt_boxes[:, :, :4]

                # Include ground-truth boxes in the set of candidate rois
                all_rois = torch.cat([proposals, gt_boxes], 1)

                overlaps = bbox_overlaps_batch(all_rois, gt_boxes)

                max_overlaps, gt_assignment = torch.max(overlaps, 2)

                batch_size = overlaps.size(0)
                num_proposal = overlaps.size(1)
                num_boxes_per_img = overlaps.size(2)

                offset = torch.arange(0, batch_size) * gt_boxes.size(1)
                offset = offset.view(-1,
                                     1).type_as(gt_assignment) + gt_assignment

                labels = gt_boxes[:, :, 3]
                labels[:, :] = 1.
                labels = labels.contiguous().view(-1)[offset.view(-1)] \
                    .view(batch_size, -1)
                # labels = torch.ones(batch_size,1).cuda()

                # roi_size=(scale_ii+1)*7

                labels_batch = labels.new(batch_size, rois_per_image).zero_()
                rois_batch = all_rois.new(batch_size, rois_per_image,
                                          5).zero_()
                lang_batch = []

                for i in range(batch_size):
                    fg_inds = torch.nonzero(
                        max_overlaps[i] >= FG_THRESH).view(-1)
                    fg_num_rois = fg_inds.numel()

                    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
                    bg_inds = torch.nonzero(
                        (max_overlaps[i] < BG_THRESH_HI)
                        & (max_overlaps[i] >= BG_THRESH_LO)).view(-1)
                    bg_num_rois = bg_inds.numel()

                    if fg_num_rois > 0 and bg_num_rois > 0:
                        # sampling fg
                        fg_rois_per_this_image = fg_rois_per_image  #min(fg_rois_per_image, fg_num_rois)

                        # torch.randperm seems to have a bug in the multi-GPU setting that causes a segfault.
                        # See https://github.com/pytorch/pytorch/issues/1868 for more details.
                        # use numpy instead.
                        # rand_num = torch.randperm(fg_num_rois).long().cuda()
                        if fg_rois_per_image < fg_num_rois:
                            rand_num = torch.from_numpy(
                                np.random.permutation(fg_num_rois)).type_as(
                                    gt_boxes).long()
                            fg_inds = fg_inds[
                                rand_num[:fg_rois_per_this_image]]
                        else:
                            rand_num = torch.from_numpy(
                                np.random.choice(
                                    fg_num_rois,
                                    fg_rois_per_image,
                                    replace=True)).type_as(gt_boxes).long()
                            fg_inds = fg_inds[rand_num]
                        # sampling bg
                        bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image

                        # torch.rand seems to have a bug: it can generate very large numbers and cause an error.
                        # Use numpy's rand instead.
                        # rand_num = (torch.rand(bg_rois_per_this_image) * bg_num_rois).long().cuda()
                        rand_num = np.floor(
                            np.random.rand(bg_rois_per_this_image) *
                            bg_num_rois)
                        rand_num = torch.from_numpy(rand_num).type_as(
                            gt_boxes).long()
                        bg_inds = bg_inds[rand_num]

                    elif fg_num_rois > 0 and bg_num_rois == 0:
                        # sampling fg
                        # rand_num = torch.floor(torch.rand(rois_per_image) * fg_num_rois).long().cuda()
                        rand_num = np.floor(
                            np.random.rand(rois_per_image) * fg_num_rois)
                        rand_num = torch.from_numpy(rand_num).type_as(
                            gt_boxes).long()
                        fg_inds = fg_inds[rand_num]
                        fg_rois_per_this_image = rois_per_image
                        bg_rois_per_this_image = 0
                    elif bg_num_rois > 0 and fg_num_rois == 0:
                        # sampling bg
                        # rand_num = torch.floor(torch.rand(rois_per_image) * bg_num_rois).long().cuda()
                        rand_num = np.floor(
                            np.random.rand(rois_per_image) * bg_num_rois)
                        rand_num = torch.from_numpy(rand_num).type_as(
                            gt_boxes).long()

                        bg_inds = bg_inds[rand_num]
                        bg_rois_per_this_image = rois_per_image
                        fg_rois_per_this_image = 0
                    else:
                        raise ValueError(
                            "bg_num_rois = 0 and fg_num_rois = 0, this should not happen!"
                        )

                    # The indices that we're selecting (both fg and bg)
                    keep_inds = torch.cat([fg_inds, bg_inds], 0)

                    # Select sampled values from various arrays:
                    labels_batch[i].copy_(labels[i][keep_inds])

                    # Clamp labels for the background RoIs to 0
                    if fg_rois_per_this_image < rois_per_image:
                        labels_batch[i][fg_rois_per_this_image:] = 0

                    rois_batch[i, :, 1:] = all_rois[i][keep_inds]
                    rois_batch[i, :, 0] = i
                    lang_batch.append(torch.stack([flang[i]] * rois_per_image))
                roi_batch_all.append(rois_batch)
                label_batch_all.append(labels_batch)
                lang_all.append(torch.stack(lang_batch))
            # for i in range(batch_size):

            roi_batch_all = torch.cat(roi_batch_all)
            label_batch_all = torch.cat(label_batch_all)
            flang = torch.cat(lang_all)

            for scale_ii in range(len(intmd_fea)):
                grid, grid_size = args.size // (32 // (2**scale_ii)), 32 // (
                    2**scale_ii)
                roi_size = (scale_ii + 1) * 7

                feat_map = intmd_fea[scale_ii]
                # roi_scale=torch.cat([roi_batch_all.view(-1, 5)[:,0].unsqueeze(1),roi_batch_all.view(-1, 5)[:,1:]/grid_size],dim=1)

                # Pool feature-map crops (boxes rescaled by 1/grid_size) and raw image crops
                # (boxes are already in image coordinates, so the default spatial_scale of 1 applies).
                roi_feat = roi_align(feat_map, roi_batch_all.view(-1, 5),
                                     [roi_size, roi_size], 1. / grid_size)
                roi_img = roi_align(image, roi_batch_all.view(-1, 5),
                                    [roi_size, roi_size])

                roi_feat_all.append(torch.cat([roi_img, roi_feat], dim=1))
                scores.append(label_batch_all.view(-1))

            cam, cam_rv, bi_score = [], [], []
            for ii in range(len(roi_feat_all)):
                # output=self.fcn_out._modules[str(ii)](roi_feat_all[ii])
                emb = self.fcn_emb(roi_feat_all[ii])
                output = self.fcn_out(emb)
                cam.append(output)
                cam_rv.append(self.PCM(output, emb, flang))
                bi_score.append(
                    F.adaptive_avg_pool2d(cam[ii], (1, 1)).squeeze())

            return cam, cam_rv, bi_score, scores
        else:
            (intmd_fea, image, flang, seg_bbox, args) = input
            batch_size = seg_bbox.size(0)
            # feats = seg_bbox.unsqueeze(0)
            rois_batch = seg_bbox.new(batch_size, 5).zero_()
            for ii in range(batch_size):
                rois_batch[ii, 1:] = seg_bbox[ii]
                rois_batch[ii, 0] = ii

            roi_feat_all = []
            for scale_ii in range(len(intmd_fea)):
                grid, grid_size = args.size // (32 // (2**scale_ii)), 32 // (
                    2**scale_ii)
                roi_size = (scale_ii + 1) * 7
                # for ii in range(batch_size):
                #[x.unsqueeze(0) for x in seg_bbox[scale_ii]]
                feat_map = intmd_fea[scale_ii]
                roi_feat = roi_align(feat_map, rois_batch,
                                     [roi_size, roi_size], 1. / grid_size)
                roi_img = roi_align(image, rois_batch, [roi_size, roi_size])
                roi_feat_all.append(torch.cat([roi_img, roi_feat], dim=1))

            cam, cam_rv, bi_score = [], [], []
            for ii in range(len(roi_feat_all)):
                # output=self.fcn_out._modules[str(ii)](roi_feat_all[ii])
                emb = self.fcn_emb(roi_feat_all[ii])
                output = self.fcn_out(emb)
                cam.append(output)
                cam_rv.append(self.PCM(output, emb, flang))
                bi_score.append(
                    F.adaptive_avg_pool2d(cam[ii], (1, 1)).squeeze())

            return cam, cam_rv, bi_score
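The example above builds its RoIs as a flat Tensor[K, 5] whose first column is the batch index, and passes spatial_scale = 1/grid_size so that image-space boxes land on the right feature-map cells. A minimal, self-contained sketch of that calling convention follows; the feature map, strides, and boxes are random stand-ins (assumptions for illustration, not the model above).

import torch
from torchvision.ops import roi_align

# Assumed stand-ins: a batch of 2 feature maps at stride 8 and two boxes per image.
feat = torch.randn(2, 256, 64, 64)           # feature maps for 512x512 inputs (stride 8)
boxes = torch.tensor([[ 32.,  32., 160., 160.],
                      [256., 128., 480., 352.]])

# Prepend the batch index to get the Tensor[K, 5] layout that roi_align expects.
rois = torch.cat([
    torch.cat([torch.zeros(2, 1), boxes], dim=1),   # boxes for image 0
    torch.cat([torch.ones(2, 1),  boxes], dim=1),   # boxes for image 1
], dim=0)

# spatial_scale maps image coordinates to feature-map coordinates (1 / stride).
pooled = roi_align(feat, rois, output_size=(7, 7), spatial_scale=1.0 / 8)
print(pooled.shape)  # torch.Size([4, 256, 7, 7])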
Example #29
    def forward(self, x, boxes, image_shapes):
        # type: (Dict[str, Tensor], List[Tensor], List[Tuple[int, int]]) -> Tensor
        """
        Arguments:
            x (OrderedDict[Tensor]): feature maps for each level. They are assumed to have
                all the same number of channels, but they can have different sizes.
            boxes (List[Tensor[N, 4]]): boxes to be used to perform the pooling operation, in
                (x1, y1, x2, y2) format and in the image reference size, not the feature map
                reference.
            image_shapes (List[Tuple[height, width]]): the sizes of each image before they
                have been fed to a CNN to obtain feature maps. This allows us to infer the
                scale factor for each one of the levels to be pooled.
        Returns:
            result (Tensor)
        """
        x_filtered = []
        for k, v in x.items():
            if k in self.featmap_names:
                x_filtered.append(v)
        num_levels = len(x_filtered)
        rois = self.convert_to_roi_format(boxes)
        if self.scales is None:
            self.setup_scales(x_filtered, image_shapes)

        scales = self.scales
        assert scales is not None

        if num_levels == 1:
            return roi_align(x_filtered[0],
                             rois,
                             output_size=self.output_size,
                             spatial_scale=scales[0],
                             sampling_ratio=self.sampling_ratio)

        mapper = self.map_levels
        assert mapper is not None

        levels = mapper(boxes)

        num_rois = len(rois)
        num_channels = x_filtered[0].shape[1]

        dtype, device = x_filtered[0].dtype, x_filtered[0].device
        result = torch.zeros(
            (
                num_rois,
                num_channels,
            ) + self.output_size,
            dtype=dtype,
            device=device,
        )

        tracing_results = []
        for level, (per_level_feature,
                    scale) in enumerate(zip(x_filtered, scales)):
            idx_in_level = torch.nonzero(levels == level).squeeze(1)
            rois_per_level = rois[idx_in_level]

            result_idx_in_level = roi_align(per_level_feature,
                                            rois_per_level,
                                            output_size=self.output_size,
                                            spatial_scale=scale,
                                            sampling_ratio=self.sampling_ratio)

            if torchvision._is_tracing():
                tracing_results.append(result_idx_in_level.to(dtype))
            else:
                result[idx_in_level] = result_idx_in_level

        if torchvision._is_tracing():
            result = _onnx_merge_levels(levels, tracing_results)

        return result
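Example #29 is essentially the body of torchvision's MultiScaleRoIAlign: boxes given in image coordinates are assigned to a pyramid level, pooled with roi_align at that level's scale, and written back in their original order. If only the pooled features are needed, the public module can be used directly; a small sketch, assuming a recent torchvision, where the feature names '0'/'1', the strides, and the boxes are illustrative only.

import torch
from collections import OrderedDict
from torchvision.ops import MultiScaleRoIAlign

# Assumed two-level pyramid; featmap_names must match the keys of the feature dict.
pooler = MultiScaleRoIAlign(featmap_names=['0', '1'], output_size=7, sampling_ratio=2)

features = OrderedDict([
    ('0', torch.randn(1, 256, 64, 64)),   # stride 8 for a 512x512 image
    ('1', torch.randn(1, 256, 32, 32)),   # stride 16
])
boxes = [torch.tensor([[ 40.,  40., 120., 120.],
                       [100.,  60., 400., 300.]])]
image_shapes = [(512, 512)]

# Scales are inferred from the feature sizes and image shapes on the first call.
pooled = pooler(features, boxes, image_shapes)
print(pooled.shape)  # torch.Size([2, 256, 7, 7])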
Example #30
    def _PyramidRoI_Feat(self, feat_maps, rois, im_info):
        '''RoI pooling on pyramid feature maps.'''
        # do roi pooling based on predicted rois
        img_area = im_info[0][0] * im_info[0][1]
        h = rois.data[:, 4] - rois.data[:, 2] + 1
        w = rois.data[:, 3] - rois.data[:, 1] + 1
        # FPN level assignment (Lin et al., Eq. 1): k = floor(k0 + log2(sqrt(w*h) / 224)), with k0 = 4.
        roi_level = torch.log2(torch.sqrt(h * w) / 224.0)
        roi_level = torch.round(roi_level + 4)
        roi_level[roi_level < 2] = 2
        roi_level[roi_level > 5] = 5
        # roi_level.fill_(5)
        if cfg.POOLING_MODE == 'crop':
            # pdb.set_trace()
            # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
            # NOTE: pyramid support is still missing here; base_feat is not defined in this function.
            grid_xy = _affine_grid_gen(rois,
                                       base_feat.size()[2:], self.grid_size)
            grid_yx = torch.stack(
                [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]],
                3).contiguous()
            roi_pool_feat = self.RCNN_roi_crop(base_feat,
                                               Variable(grid_yx).detach())
            if cfg.CROP_RESIZE_WITH_MAX_POOL:
                roi_pool_feat = F.max_pool2d(roi_pool_feat, 2, 2)

        elif cfg.POOLING_MODE == 'align':
            roi_pool_feats = []
            box_to_levels = []
            for i, l in enumerate(range(2, 6)):
                if (roi_level == l).sum() == 0:
                    continue
                # idx_l = (roi_level == l).nonzero().squeeze()
                idx_l = (roi_level == l).nonzero()
                if idx_l.shape[0] > 1:
                    idx_l = idx_l.squeeze()
                else:
                    idx_l = idx_l.view(-1)
                box_to_levels.append(idx_l)
                scale = feat_maps[i].size(2) / im_info[0][0]

                # pdb.set_trace()
                # feat = self.RCNN_roi_align(feat_maps[i], rois[idx_l], scale)
                feat = roi_align(feat_maps[i],
                                 rois[idx_l],
                                 (cfg.POOLING_SIZE, cfg.POOLING_SIZE),
                                 spatial_scale=scale,
                                 sampling_ratio=0)

                roi_pool_feats.append(feat)
            roi_pool_feat = torch.cat(roi_pool_feats, 0)
            box_to_level = torch.cat(box_to_levels, 0)
            idx_sorted, order = torch.sort(box_to_level)
            roi_pool_feat = roi_pool_feat[order]

        elif cfg.POOLING_MODE == 'pool':
            roi_pool_feats = []
            box_to_levels = []
            for i, l in enumerate(range(2, 6)):
                if (roi_level == l).sum() == 0:
                    continue
                idx_l = (roi_level == l).nonzero().squeeze()
                box_to_levels.append(idx_l)
                scale = feat_maps[i].size(2) / im_info[0][0]
                feat = self.RCNN_roi_pool(feat_maps[i], rois[idx_l], scale)
                roi_pool_feats.append(feat)
            roi_pool_feat = torch.cat(roi_pool_feats, 0)
            box_to_level = torch.cat(box_to_levels, 0)
            idx_sorted, order = torch.sort(box_to_level)
            roi_pool_feat = roi_pool_feat[order]

        return roi_pool_feat
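The 'align' and 'pool' branches above share the same bookkeeping trick: RoIs are pooled level by level, so the concatenated result comes out grouped by level and has to be put back into the original RoI order via the sort over box_to_level. A compact sketch of just that level-assignment and re-ordering step, using a hypothetical helper with made-up strides and a random pyramid (assumptions for illustration, not the repo's cfg or model):

import torch
from torchvision.ops import roi_align

def pyramid_roi_align(feat_maps, rois, image_size, pool_size=7, k0=4):
    """Pool each RoI from the FPN level given by k = round(k0 + log2(sqrt(w*h) / 224))."""
    h = rois[:, 4] - rois[:, 2] + 1
    w = rois[:, 3] - rois[:, 1] + 1
    levels = torch.round(k0 + torch.log2(torch.sqrt(h * w) / 224.0)).clamp(2, 5).long()

    feats, order = [], []
    for i, level in enumerate(range(2, 6)):
        idx = torch.nonzero(levels == level).view(-1)
        if idx.numel() == 0:
            continue
        scale = feat_maps[i].size(2) / image_size          # spatial_scale for this level
        feats.append(roi_align(feat_maps[i], rois[idx],
                               (pool_size, pool_size), spatial_scale=scale,
                               sampling_ratio=0))
        order.append(idx)
    # Undo the per-level grouping so output row k corresponds to input RoI k.
    order = torch.cat(order)
    return torch.cat(feats)[torch.argsort(order)]

# Illustrative use with random tensors (strides 4..32 for a 512x512 image).
pyramid = [torch.randn(1, 256, s, s) for s in (128, 64, 32, 16)]
rois = torch.tensor([[0.,  16.,  16., 112., 112.],
                     [0.,  64.,  64., 448., 448.]])
print(pyramid_roi_align(pyramid, rois, image_size=512).shape)  # torch.Size([2, 256, 7, 7])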