def __getitem__(self, idx):
    """Load one instance (image crop or video frame crop), augment it in train
    mode, and return ``(image_tensor, instance_id, category_id)``.

    Reads ``self.json_list[idx]``, a dict that must carry at least:
    ``type`` ('image' or 'video'), ``file_name``, ``bbox``, ``aspect_group``,
    ``instance_id``, ``category_id`` and — for videos — ``frame``.
    """
    dataset_dict = self.json_list[idx]  # dict of single instance of an image/video

    source_type = dataset_dict['type']
    if source_type == 'image':
        image = cv2.imread(dataset_dict["file_name"])
    elif source_type == 'video':
        image = faster_read_frame_at_index(dataset_dict["file_name"],
                                           dataset_dict["frame"])
    else:
        # Fix: an unrecognized type previously left `image` unbound and the
        # method crashed later with an opaque NameError; fail fast instead.
        raise ValueError(
            "Unsupported dataset_dict['type']: {!r}".format(source_type))

    image = get_cropped_img_fast(image, dataset_dict['bbox'])
    # Target size is chosen per aspect-ratio bucket; presumably (width, height)
    # as cv2.resize expects — TODO confirm against self.sizes construction.
    new_size = self.sizes[dataset_dict['aspect_group']]
    image = cv2.resize(image, new_size)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    if self.mode == 'train':
        # Albumentations-style augmentor: takes/returns a dict keyed "image".
        image = self.augmentor(image=image)["image"]

        if random.random() < 0.7:
            image = random_affine(image, degrees=8, translate=.0625,
                                  scale=.1, shear=8)

        if random.random() < 0.7:
            # Cutout patch sized relative to the resize target dimensions.
            image = cutout(image, int(new_size[1] * 0.63),
                           int(new_size[0] * 0.63), fill_value=114)

    image = standardize(transforms.functional.to_tensor(image))
    return image, dataset_dict['instance_id'], dataset_dict['category_id']
    def __getitem__(self, idx):
        """Return ``(processed_instance_on_DEVICE, idx)`` for the instance at
        ``self.instance_list[idx]``.

        The dict must provide ``type`` ('image' or 'video'), ``file_name``,
        ``bbox``, ``aspect_group`` and — for videos — ``frame``.
        """
        inst_dict = self.instance_list[idx]

        inst_type = inst_dict['type']
        if inst_type == 'image':
            img = cv2.imread(inst_dict['file_name'])
        elif inst_type == 'video':
            img = faster_read_frame_at_index(inst_dict["file_name"],
                                             inst_dict["frame"])
        else:
            # Fix: unknown types previously left `img` unbound and produced a
            # confusing NameError in inst_process; raise a clear error instead.
            raise ValueError(
                "Unsupported instance type: {!r}".format(inst_type))

        inst = inst_process(img, inst_dict['bbox'], inst_dict['aspect_group'],
                            scale=self.scale)

        return inst.to(DEVICE), idx
Example #3
0
def infer_vid(cfg,
              metric_net,
              dataset,
              bbox_scale='S',
              frames=(40, 120, 200, 280, 360)):
    """Run detection + metric-feature extraction over selected video frames.

    For every video dict in ``dataset`` and every frame index in ``frames``,
    runs a detectron2 ``DefaultPredictor`` to get boxes, then embeds each
    detected crop with ``metric_net`` and records an instance dict with
    ``bbox``, ``score``, ``frame``, ``bbox_aspect``, ``aspect_group`` and the
    L2-normalized ``feat`` vector.

    Args:
        cfg: detectron2 config used to build the predictor.
        metric_net: embedding network; moved to DEVICE and set to eval mode.
        dataset: iterable of dicts, each with at least ``file_name``.
        bbox_scale: crop-scale flag forwarded to ``inst_process``.
        frames: frame indices to sample from each video. (Changed from a
            list default to a tuple — avoids the mutable-default pitfall;
            callers may still pass any iterable.)

    Returns:
        list of per-detection instance dicts.
    """
    metric_net.to(DEVICE)
    metric_net.eval()

    # Canonical width/height aspect-ratio buckets used to group crops.
    aspect_template = np.array([
        0.25, 0.33333333, 0.41666667, 0.5, 0.57142857, 0.66666667, 0.8, 1.,
        1.25, 1.5, 1.75, 2.
    ])

    result = []
    predictor = DefaultPredictor(cfg)

    for image_dict in tqdm(dataset):
        for frame in frames:
            img = faster_read_frame_at_index(image_dict['file_name'], frame)
            outputs = predictor(img)
            scores = outputs['instances'].get_fields()['scores']
            pred_boxes = outputs['instances'].get_fields(
            )['pred_boxes'].tensor.cpu().numpy().astype(int).tolist()

            with torch.no_grad():
                for bbox, score in zip(pred_boxes, scores):
                    height = bbox[3] - bbox[1]
                    if height <= 0:
                        # Fix: a zero-height box (possible after the int cast)
                        # previously raised ZeroDivisionError; skip it.
                        continue

                    inst_dict = copy.deepcopy(image_dict)
                    inst_dict['bbox'] = bbox
                    inst_dict['score'] = float(score)
                    inst_dict['frame'] = frame
                    inst_dict['bbox_aspect'] = (bbox[2] - bbox[0]) / height
                    # Snap to the nearest canonical aspect bucket.
                    inst_dict['aspect_group'] = int(
                        np.abs(inst_dict['bbox_aspect'] -
                               aspect_template).argmin())

                    inst = inst_process(img,
                                        bbox,
                                        inst_dict['aspect_group'],
                                        scale=bbox_scale).to(DEVICE)
                    feat = F.normalize(metric_net(
                        inst.unsqueeze(0)))[0].detach().cpu().numpy().tolist()
                    inst_dict['feat'] = feat

                    result.append(inst_dict)

    return result
Example #4
0
    def __getitem__(self, idx):
        """Load, crop, resize and grayscale-normalize one instance; return the
        standardized image tensor.

        Reads ``self.json_list[idx]``: ``type`` ('image' or 'video'),
        ``file_name``, ``bbox``, ``aspect_group`` and — for videos — ``frame``.
        """
        dataset_dict = self.json_list[
            idx]  # dict of single instance of an image/video

        source_type = dataset_dict['type']
        if source_type == 'image':
            image = cv2.imread(dataset_dict["file_name"])
        elif source_type == 'video':
            image = faster_read_frame_at_index(dataset_dict["file_name"],
                                               dataset_dict["frame"])
        else:
            # Fix: an unrecognized type previously left `image` unbound and
            # crashed later with a NameError; raise a clear error instead.
            raise ValueError(
                "Unsupported dataset_dict['type']: {!r}".format(source_type))

        image = get_cropped_img(image, dataset_dict['bbox'], is_mask=False)
        image = cv2.resize(image, self.sizes[dataset_dict['aspect_group']])

        # NOTE(review): the original had a commented-out 50% condition around
        # this grayscale round-trip, so it now always runs — confirm whether
        # unconditional grayscaling is intended.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)

        image = standardize(transforms.functional.to_tensor(image))
        return image
Example #5
0
def infer_vid(bbox_net, dataset, frames=tuple(range(80, 400, 20))):
    """Run ``bbox_net`` over sampled frames of each video and collect one dict
    per detected box (bbox, score, frame, aspect ratio and aspect group).

    Args:
        bbox_net: detection model taking detectron2-style input dicts; moved
            to DEVICE and set to eval mode, then deleted at the end to free
            GPU memory.
        dataset: iterable of dicts with ``file_name``, ``height``, ``width``.
        frames: frame indices to sample. (Changed from a list default to a
            tuple — avoids the mutable-default-argument pitfall; behavior for
            callers is unchanged since it is only iterated.)

    Returns:
        list of per-detection instance dicts.
    """
    # Canonical width/height aspect-ratio buckets used to group crops.
    aspect_template = np.array([0.25, 0.335, 0.415, 0.5, 0.5721925, 0.66857143, 0.8, 1., 1.25, 1.4957264, 1.74766355, 2.])

    bbox_net.to(DEVICE)
    bbox_net.eval()

    inst_ds = []
    with torch.no_grad():
        for image_dict in tqdm(dataset):
            for frame in frames:
                img = faster_read_frame_at_index(image_dict['file_name'], frame)
                if img is None:
                    # Fix: frame readers return None on failure; the original
                    # relied on catching the resulting AttributeError from
                    # `.astype` — test the condition explicitly instead.
                    print('Video {} not found (at frame {})'.format(image_dict['file_name'], frame))
                    continue

                # HWC uint8 -> CHW float32, detectron2-style input dict.
                inputs = {'image': torch.as_tensor(img.astype("float32").transpose(2, 0, 1)), 'height': image_dict['height'], 'width': image_dict['width']}
                outputs = bbox_net([inputs])[0]
                scores = outputs['instances'].scores
                pred_boxes = outputs['instances'].pred_boxes.tensor.cpu().numpy().astype(int).tolist()

                for bbox, score in zip(pred_boxes, scores):
                    box_height = bbox[3] - bbox[1]
                    if box_height <= 0:
                        # Fix: zero-height boxes (possible after the int cast)
                        # previously raised ZeroDivisionError; skip them.
                        continue

                    inst_dict = copy.deepcopy(image_dict)
                    inst_dict['bbox'] = bbox
                    inst_dict['score'] = float(score)
                    inst_dict['frame'] = frame
                    inst_dict['bbox_aspect'] = (bbox[2] - bbox[0]) / box_height
                    # Snap to the nearest canonical aspect bucket.
                    inst_dict['aspect_group'] = int(np.abs(inst_dict['bbox_aspect'] - aspect_template).argmin())

                    inst_ds.append(inst_dict)

    # Release the detector's GPU memory before downstream stages run.
    del bbox_net
    torch.cuda.empty_cache()

    return inst_ds