# Imports used across the snippets below; repo-local helpers such as
# faster_read_frame_at_index, get_cropped_img(_fast), inst_process,
# random_affine, cutout, standardize and DEVICE are defined elsewhere
# in the project.
import copy
import random

import cv2
import numpy as np
import torch
import torch.nn.functional as F
from torchvision import transforms
from tqdm import tqdm
from detectron2.engine import DefaultPredictor


def __getitem__(self, idx):
    # dict describing a single instance of an image/video
    dataset_dict = self.json_list[idx]
    if dataset_dict['type'] == 'image':
        image = cv2.imread(dataset_dict["file_name"])
    elif dataset_dict['type'] == 'video':
        image = faster_read_frame_at_index(dataset_dict["file_name"],
                                           dataset_dict["frame"])
    # crop to the detection box, then resize to this aspect group's fixed size
    image = get_cropped_img_fast(image, dataset_dict['bbox'])
    new_size = self.sizes[dataset_dict['aspect_group']]
    image = cv2.resize(image, new_size)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    if self.mode == 'train':
        # augmentor called in the albumentations style, followed by extra
        # geometric and cutout noise
        augmented = self.augmentor(image=image)
        image = augmented["image"]
        if random.random() < 0.7:
            image = random_affine(image, degrees=8, translate=.0625,
                                  scale=.1, shear=8)
        if random.random() < 0.7:
            image = cutout(image, int(new_size[1] * 0.63),
                           int(new_size[0] * 0.63), fill_value=114)
        # if random.randint(0, 1) == 0:
        #     image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        #     image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    image = standardize(transforms.functional.to_tensor(image))
    return image, dataset_dict['instance_id'], dataset_dict['category_id']
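
# --- Hedged usage sketch (not part of the original code) --------------------
# Because __getitem__ above resizes every crop to self.sizes[aspect_group],
# tensors from different aspect groups have different shapes and cannot be
# stacked into one batch. A minimal batch sampler that keeps each batch inside
# a single aspect group; the attribute name `json_list` is taken from the
# method above, everything else here is an assumption.
from collections import defaultdict
from torch.utils.data import Sampler


class AspectGroupBatchSampler(Sampler):
    def __init__(self, json_list, batch_size):
        self.batch_size = batch_size
        self.groups = defaultdict(list)
        for i, d in enumerate(json_list):
            self.groups[d['aspect_group']].append(i)

    def __iter__(self):
        batches = []
        for idxs in self.groups.values():
            random.shuffle(idxs)
            batches += [idxs[i:i + self.batch_size]
                        for i in range(0, len(idxs), self.batch_size)]
        random.shuffle(batches)  # mix the groups across the epoch
        return iter(batches)

    def __len__(self):
        return sum(-(-len(v) // self.batch_size)  # ceil division per group
                   for v in self.groups.values())

# e.g. DataLoader(train_ds, batch_sampler=AspectGroupBatchSampler(
#          train_ds.json_list, batch_size=32))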
def __getitem__(self, idx):
    inst_dict = self.instance_list[idx]
    if inst_dict['type'] == 'image':
        img = cv2.imread(inst_dict['file_name'])
    elif inst_dict['type'] == 'video':
        img = faster_read_frame_at_index(inst_dict["file_name"],
                                         inst_dict["frame"])
    inst = inst_process(img, inst_dict['bbox'], inst_dict['aspect_group'],
                        scale=self.scale)
    return inst.to(DEVICE), idx
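
# --- Hedged usage sketch (not part of the original code) --------------------
# Extracting features with this dataset: since __getitem__ already moves each
# tensor to DEVICE, using DataLoader workers would create CUDA tensors inside
# subprocesses, so index the dataset directly (or use num_workers=0). Crops
# from different aspect groups also differ in shape, hence one crop at a time.
# `metric_net` and `infer_ds` are assumed names.
feats = {}
metric_net.eval()
with torch.no_grad():
    for i in range(len(infer_ds)):
        inst, idx = infer_ds[i]
        f = F.normalize(metric_net(inst.unsqueeze(0)))[0]  # unit-norm embedding
        feats[idx] = f.cpu().numpy()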
def infer_vid(cfg, metric_net, dataset, bbox_scale='S',
              frames=(40, 120, 200, 280, 360)):
    metric_net.to(DEVICE)
    metric_net.eval()
    # canonical aspect-ratio bucket centers; each detection is assigned to
    # its nearest bucket so it can be resized to that bucket's fixed size
    aspect_template = np.array([
        0.25, 0.33333333, 0.41666667, 0.5, 0.57142857, 0.66666667,
        0.8, 1., 1.25, 1.5, 1.75, 2.
    ])
    result = []
    predictor = DefaultPredictor(cfg)
    for image_dict in tqdm(dataset):
        for frame in frames:
            img = faster_read_frame_at_index(image_dict['file_name'], frame)
            outputs = predictor(img)
            scores = outputs['instances'].scores
            pred_boxes = (outputs['instances'].pred_boxes.tensor
                          .cpu().numpy().astype(int).tolist())
            with torch.no_grad():
                for bbox, score in zip(pred_boxes, scores):
                    inst_dict = copy.deepcopy(image_dict)
                    inst_dict['bbox'] = bbox
                    inst_dict['score'] = float(score)
                    inst_dict['frame'] = frame
                    inst_dict['bbox_aspect'] = ((bbox[2] - bbox[0]) /
                                                (bbox[3] - bbox[1]))
                    inst_dict['aspect_group'] = int(
                        np.abs(inst_dict['bbox_aspect'] -
                               aspect_template).argmin())
                    inst = inst_process(img, bbox, inst_dict['aspect_group'],
                                        scale=bbox_scale).to(DEVICE)
                    # L2-normalized embedding for this crop
                    feat = F.normalize(metric_net(inst.unsqueeze(0)))[0]
                    inst_dict['feat'] = feat.detach().cpu().numpy().tolist()
                    result.append(inst_dict)
    return result
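
# --- Hedged usage sketch (not part of the original code) --------------------
# Because infer_vid stores L2-normalized embeddings under 'feat', retrieval is
# a plain dot product (= cosine similarity). A minimal query over the returned
# detections; `result` is the list produced by infer_vid above, the helper
# itself is an assumption.
def top_k_matches(result, query_idx, k=5):
    feats = np.array([d['feat'] for d in result])  # (N, D), unit-norm rows
    sims = feats @ feats[query_idx]                # cosine similarity to query
    order = np.argsort(-sims)
    order = order[order != query_idx][:k]          # drop the self-match
    return [(int(i), float(sims[i])) for i in order]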
def __getitem__(self, idx):
    # dict describing a single instance of an image/video
    dataset_dict = self.json_list[idx]
    if dataset_dict['type'] == 'image':
        image = cv2.imread(dataset_dict["file_name"])
    elif dataset_dict['type'] == 'video':
        image = faster_read_frame_at_index(dataset_dict["file_name"],
                                           dataset_dict["frame"])
    image = get_cropped_img(image, dataset_dict['bbox'], is_mask=False)
    image = cv2.resize(image, self.sizes[dataset_dict['aspect_group']])
    # grayscale conversion applied unconditionally here
    # (only the random coin flip is commented out):
    # if random.randint(0, 1) == 0:
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    image = standardize(transforms.functional.to_tensor(image))
    return image
def infer_vid(bbox_net, dataset, frames=tuple(range(80, 400, 20))):
    # infer bboxes on sampled frames of each video
    aspect_template = np.array([0.25, 0.335, 0.415, 0.5, 0.5721925,
                                0.66857143, 0.8, 1., 1.25, 1.4957264,
                                1.74766355, 2.])
    bbox_net.to(DEVICE)
    bbox_net.eval()
    inst_ds = []
    with torch.no_grad():
        for image_dict in tqdm(dataset):
            for frame in frames:
                img = faster_read_frame_at_index(image_dict['file_name'],
                                                 frame)
                try:
                    # a failed read returns None, so .astype raises
                    # AttributeError and the frame is skipped
                    inputs = {'image': torch.as_tensor(
                                  img.astype("float32").transpose(2, 0, 1)),
                              'height': image_dict['height'],
                              'width': image_dict['width']}
                except AttributeError:
                    print('Video {} not found (at frame {})'.format(
                        image_dict['file_name'], frame))
                    continue
                outputs = bbox_net([inputs])[0]
                scores = outputs['instances'].scores
                pred_boxes = (outputs['instances'].pred_boxes.tensor
                              .cpu().numpy().astype(int).tolist())
                for bbox, score in zip(pred_boxes, scores):
                    inst_dict = copy.deepcopy(image_dict)
                    inst_dict['bbox'] = bbox
                    inst_dict['score'] = float(score)
                    inst_dict['frame'] = frame
                    inst_dict['bbox_aspect'] = ((bbox[2] - bbox[0]) /
                                                (bbox[3] - bbox[1]))
                    inst_dict['aspect_group'] = int(
                        np.abs(inst_dict['bbox_aspect'] -
                               aspect_template).argmin())
                    inst_ds.append(inst_dict)
    del bbox_net
    torch.cuda.empty_cache()
    return inst_ds
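
# --- Hedged worked example (not part of the original code) ------------------
# How the nearest-bucket assignment above behaves: a raw box aspect ratio is
# snapped to the closest of the 12 template ratios, and that index selects the
# fixed resize shape used by the datasets above. The box values are made up.
aspect_template = np.array([0.25, 0.335, 0.415, 0.5, 0.5721925, 0.66857143,
                            0.8, 1., 1.25, 1.4957264, 1.74766355, 2.])
bbox = [100, 50, 260, 250]                              # x1, y1, x2, y2
aspect = (bbox[2] - bbox[0]) / (bbox[3] - bbox[1])      # 160 / 200 = 0.8
group = int(np.abs(aspect - aspect_template).argmin())  # -> 6, the 0.8 bucket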