class Frame:
    def __init__(self, image_path='', objects=None):
        self.image_path = image_path
        # avoid a shared mutable default argument
        self.objects = objects if objects is not None else []
        self.image = PTImage(pil_image_path=self.image_path, persist=False)

    @classmethod
    def from_image_and_objects(cls, ptimage, objects=None):
        frame = cls('', objects)
        frame.image = ptimage
        return frame

    def get_objects(self):
        return self.objects

    def show_raw_image(self):
        self.image.visualize('frame image')

    def visualize(self, axes=None, display=False, title='Frame Visualization'):
        axes = self.image.visualize(axes=axes, title=title, display=False)
        # draw each object's bounding box plus a label with its id, type, and coordinates
        for obj in self.objects:
            rect = patches.Rectangle(obj.box.xy_min(),
                                     obj.box.edges()[0],
                                     obj.box.edges()[1],
                                     linewidth=1,
                                     edgecolor='r',
                                     facecolor='none')
            axes.add_patch(rect)
            coord_string = str([int(round(x)) for x in obj.box.to_single_array()])
            axes.text(obj.box.xmin,
                      obj.box.ymin,
                      str(obj.unique_id) + ' ' + str(obj.obj_type) + ' ' + coord_string,
                      color='white',
                      fontsize=12,
                      bbox={'facecolor': 'red', 'alpha': 0.5, 'pad': 2})
        if display:
            plt.show(block=True)
            plt.close()
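# A minimal usage sketch for Frame (illustration only, not part of the repo):
# the image path, box coordinates, and object type below are made up, and the
# Box and Object constructors are assumed to follow their use elsewhere in this code.
#
#     frame = Frame(image_path='/path/to/image.jpg',
#                   objects=[Object(Box(10, 20, 110, 220), obj_type='car')])
#     frame.visualize(display=True)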
def visualize(self, parameters={}):
    image_original = PTImage.from_cwh_torch(self.data[0])
    drawing_image = image_original.to_order_and_class(
        Ordering.HWC, ValueClass.BYTE0255).get_data().copy()
    boxes, classes = self.output[1:]  # Nx4 boxes and N class tensor
    valid_boxes, valid_classes = MultiObjectDetector.post_process_boxes(
        self.data[0], boxes, classes, len(self.class_lookup))
    # convert targets: rows with a class label of -1 are padding, keep only the real ones
    real_targets = self.target[0][:, 0] > -1
    filtered_targets = self.target[0][real_targets].reshape(-1, self.target[0].shape[1])
    target_boxes = filtered_targets[:, 1:]
    target_classes = filtered_targets[:, 0]
    if target_boxes.shape[0] > 0:
        draw_objects_on_np_image(drawing_image,
                                 self.__convert_to_objects(target_boxes, target_classes),
                                 color=(255, 0, 0))
    if valid_boxes.shape[0] > 0:
        draw_objects_on_np_image(drawing_image,
                                 self.__convert_to_objects(valid_boxes, valid_classes),
                                 color=None)
    ImageVisualizer().set_image(PTImage(drawing_image),
                                parameters.get('title', '') + ' : Output')
def visualize(self, parameters={}):
    image_original = PTImage.from_cwh_torch(self.data[0])
    ImageVisualizer().set_image(image_original, parameters.get('title', '') + ' : Input')
    # need to draw the mask layers on top of the data with transparency
    target_mask_chw = self.target[0]
    output_mask_chw = self.output[0][-1]
    # draw a separate image for each channel for now
    for i in range(target_mask_chw.size(0)):
        imt = PTImage.from_cwh_torch(target_mask_chw[i, :, :].unsqueeze(0))
        imo = PTImage.from_cwh_torch(output_mask_chw[i, :, :].unsqueeze(0))
        ImageVisualizer().set_image(imt, parameters.get('title', '') + ' : Target-{}'.format(self.class_lookup[i]))
        ImageVisualizer().set_image(imo, parameters.get('title', '') + ' : LOutput-{}'.format(self.class_lookup[i]))
def apply_to_image(self, image, _output_size):
    output_size = [int(_output_size[0]), int(_output_size[1])]
    # swap the x/y (column/row) ordering of the inverse transform so it operates
    # in (row, col) image indexing
    inverse_transform = self.inverse.copy()
    inverse_transform[0:2, 0:2] = self.inverse[1:None:-1, 1:None:-1]
    inverse_transform[0:2, 2] = self.inverse[1:None:-1, 2]
    assert image.ordering == Ordering.HWC, 'Ordering must be HWC to apply the affine transform!'
    img_data = image.get_data()
    # if the image only has 1 channel, duplicate it to 3 channels
    if len(img_data.shape) == 2:
        img_data = np.stack((img_data,) * 3, axis=2)
    assert len(img_data.shape) == 3, 'Input image must have 3 channels! found {}'.format(img_data.shape)
    newimage = PTImage(data=np.empty([output_size[0], output_size[1], img_data.shape[2]],
                                     dtype=image.vc['dtype']),
                       ordering=Ordering.HWC,
                       vc=image.vc)
    newimage_data = newimage.get_data()
    # scipy's affine_transform only accepts a 2x2 affine matrix and requires the offset
    # to be specified on the input side, e.g.:
    #     for i in range(0, image.data.shape[2]):
    #         newimage.data[:, :, i] = affine_transform(image.data[:, :, i],
    #                                                   self.inverse[0:2, 0:2],
    #                                                   offset=-self.transform[0:2, 2],
    #                                                   output_shape=output_size).astype(image.vc['dtype'])
    # Instead, apply the affine here and use map_coordinates for the interpolation:
    # 1) create an augmented matrix of 3 x (m*n) output points
    px, py = np.mgrid[0:output_size[0]:1, 0:output_size[1]:1]
    points = np.c_[px.ravel(), py.ravel()]
    points_aug = np.concatenate((points, np.ones((points.shape[0], 1))), axis=1)
    # 2) apply the inverse transform to find the input points to sample at
    inv_points = np.dot(inverse_transform, points_aug.T)
    # 3) use map_coordinates to interpolate the input image at the required points
    for i in range(0, img_data.shape[2]):
        newimage_data[:, :, i] = map_coordinates(img_data[:, :, i],
                                                 inv_points[0:2, :],
                                                 order=self.interp_order).reshape(output_size)
    return newimage
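# Standalone sketch of the same warping approach on a plain numpy image (not part
# of the repo): build the output pixel grid, map it through an inverse affine in
# (row, col) order, and sample with map_coordinates. The sizes and translation
# values are arbitrary and only for illustration.
import numpy as np
from scipy.ndimage import map_coordinates

def warp_affine_sketch():
    img = np.random.rand(64, 64).astype(np.float32)
    out_h, out_w = 48, 48
    # inverse affine: output pixel (r, c) samples input pixel (r + 8, c + 4)
    inverse = np.array([[1.0, 0.0, 8.0],
                        [0.0, 1.0, 4.0],
                        [0.0, 0.0, 1.0]])
    pr, pc = np.mgrid[0:out_h, 0:out_w]
    points_aug = np.c_[pr.ravel(), pc.ravel(), np.ones(out_h * out_w)]
    inv_points = inverse @ points_aug.T
    return map_coordinates(img, inv_points[0:2, :], order=1).reshape(out_h, out_w)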
def visualize(self, parameters={}):
    # visualizes a sequence
    for i in range(self.data[0].shape[0]):
        img = PTImage.from_cwh_torch(self.data[0][i])
        ImageVisualizer().set_image(img, parameters.get('title', '') + ' : Image {}'.format(i))
    for i in range(self.output[2].shape[0]):
        dmap = self.output[2][i]
        depth_map = PTImage.from_2d_wh_torch(dmap)
        ImageVisualizer().set_image(depth_map, parameters.get('title', '') + ' : DepthMap {}'.format(i))
def visualize(self, parameters={}):
    # here output[0] could either be a single image or a sequence of images
    if isinstance(self.output[0], list):
        image_target = PTImage.from_cwh_torch(self.target[0])
        ImageVisualizer().set_image(image_target, parameters.get('title', '') + ' : Target')
        for i, o in enumerate(self.output[0]):
            image_output = PTImage.from_cwh_torch(o)
            ImageVisualizer().set_image(image_output, parameters.get('title', '') + ' : Output{:02d}'.format(i))
    else:
        image_target = PTImage.from_cwh_torch(self.target[0])
        image_output = PTImage.from_cwh_torch(self.output[0])
        ImageVisualizer().set_image(image_target, parameters.get('title', '') + ' : Target')
        ImageVisualizer().set_image(image_output, parameters.get('title', '') + ' : Output')
def visualize(self, parameters={}):
    # image_frame = PTImage.from_cwh_torch(self.data[0])
    if parameters.get('mode', 'train') == 'train':
        image_pos = PTImage.from_cwh_torch(self.data[0])
        image_neg = PTImage.from_cwh_torch(self.data[1])
        image_anchor = PTImage.from_cwh_torch(self.output[0])
        image_pos_map = PTImage.from_2d_wh_torch(F.sigmoid(self.output[1]).data)
        image_neg_map = PTImage.from_2d_wh_torch(F.sigmoid(self.output[2]).data)
        image_pos_tar = PTImage.from_2d_wh_torch(self.target[0])
        image_neg_tar = PTImage.from_2d_wh_torch(self.target[1])
        # target_box = Box.tensor_to_box(self.target[0].cpu(), image_pos.get_wh())
        # objs = [Object(target_box, 0, obj_type='T')]
        # pos_frame = Frame.from_image_and_objects(image_pos, objs)
        # ImageVisualizer().set_image(image_frame, parameters.get('title', '') + ' : Frame')
        ImageVisualizer().set_image(image_anchor, parameters.get('title', '') + ' : anchor')
        ImageVisualizer().set_image(image_pos, parameters.get('title', '') + ' : pos_frame')
        ImageVisualizer().set_image(image_neg, parameters.get('title', '') + ' : neg_frame')
        ImageVisualizer().set_image(image_pos_tar, parameters.get('title', '') + ' : pos_target')
        ImageVisualizer().set_image(image_neg_tar, parameters.get('title', '') + ' : neg_target')
        ImageVisualizer().set_image(image_pos_map, parameters.get('title', '') + ' : pos_res')
        ImageVisualizer().set_image(image_neg_map, parameters.get('title', '') + ' : neg_res')
    else:
        img_frame = PTImage.from_cwh_torch(self.data[0])
        img_frame_xcor = PTImage.from_2d_wh_torch(F.sigmoid(self.output[0]).data)
        # img_pos = PTImage.from_cwh_torch(self.data[1])
        # img_neg = PTImage.from_cwh_torch(self.data[2])
        # image_pos_map = PTImage.from_2d_wh_torch(F.sigmoid(self.output[1]).data)
        # image_neg_map = PTImage.from_2d_wh_torch(F.sigmoid(self.output[2]).data)
        ImageVisualizer().set_image(img_frame, parameters.get('title', '') + ' : Frame')
        ImageVisualizer().set_image(img_frame_xcor, parameters.get('title', '') + ' : Frame xcor')
def __getitem__(self, index):
    image, labels = self.dataset[index]
    np_arr = np.asarray(image)
    ptimage = PTImage.from_numpy_array(np_arr)
    objects = []
    for t in labels:
        box = Box.from_xywh(t['bbox'])
        obj_type = self.coco.loadCats([t['category_id']])[0]['name']
        # Convert the segmentation to polygons using pycocotools.
        # Note the segmentation can be in one of several formats, for example the custom
        # COCO RLE; converting RLE back to polygons is a bit of a pain, so those are
        # ignored for now. According to the COCO site most of the data is in polygon form
        # (not sure why there's a discrepancy?), and I'd rather not store 2D binary masks
        # with every object.
        polygon = t.get('segmentation')
        # reshape each flat segmentation to a 2D polygon (assume it's a convex hull?)
        polys = []
        if polygon and isinstance(polygon, list):
            for seg in polygon:
                polys.append(Polygon(np.array(seg).reshape((len(seg) // 2, 2))))
        objects.append(Object(box, obj_type=obj_type, polygons=polys))
    frame = Frame.from_image_and_objects(ptimage, objects)
    return frame
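# Sketch of the polygon reshape above on a made-up COCO-style segmentation (not real
# data): a flat [x0, y0, x1, y1, ...] list becomes an Nx2 array of (x, y) vertices.
#
#     seg = [10.0, 20.0, 30.0, 20.0, 30.0, 40.0, 10.0, 40.0]
#     vertices = np.array(seg).reshape((len(seg) // 2, 2))   # 4 vertices, each (x, y)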
def __getitem__(self, index):
    pil_img, label = self.dataset[index]
    # assert 2D here
    np_arr = np.asarray(pil_img)
    # add an extra channel dimension
    np_arr = np.expand_dims(np_arr, axis=2)
    # create the PTImage and a single object that spans the whole frame
    ptimage = PTImage.from_numpy_array(np_arr)
    obj = Object(Box(0, 0, pil_img.size[0], pil_img.size[1]))
    frame = Frame.from_image_and_objects(ptimage, [obj])
    return frame
def forward(self, x):
    batch_size, chans, height, width = x.size()
    # need to first determine the hidden state size, which is tied to the cnn feature size
    dummy_glimpse = torch.Tensor(batch_size, chans, self.attn_grid_size, self.attn_grid_size)
    if x.is_cuda:
        dummy_glimpse = dummy_glimpse.cuda()
    dummy_feature_map = self.encoder.forward(dummy_glimpse)
    self.att_rnn.forward(dummy_feature_map.view(batch_size, dummy_feature_map.nelement() // batch_size))
    self.att_rnn.reset_hidden_state(batch_size, x.data.is_cuda)

    outputs = []
    init_tensor = torch.zeros(batch_size, self.num_classes, height, width)
    if x.data.is_cuda:
        init_tensor = init_tensor.cuda()
    outputs.append(init_tensor)
    self.init_weights(self.att_rnn.get_hidden_state())

    for t in range(self.timesteps):
        # 1) decode hidden state to generate gaussian attention parameters
        state = self.att_rnn.get_hidden_state()
        gauss_attn_params = F.tanh(F.linear(state, self.att_decoder_weights))
        # 2) extract glimpse
        glimpse = self.attn_reader.forward(x, gauss_attn_params, self.attn_grid_size)
        # visualize first glimpse in batch for all t
        torch_glimpses = torch.chunk(glimpse, batch_size, dim=0)
        ImageVisualizer().set_image(PTImage.from_cwh_torch(torch_glimpses[0].squeeze().data),
                                    'zGlimpse {}'.format(t))
        # 3) use conv stack or resnet to extract features
        feature_map = self.encoder.forward(glimpse)
        conv_output_dims = self.encoder.get_output_dims()[:-1][::-1]
        conv_output_dims.append(glimpse.size())
        # 4) update hidden state
        # think about this connection a bit more
        self.att_rnn.forward(feature_map.view(batch_size, feature_map.nelement() // batch_size))
        # 5) use deconv network to get partial masks
        partial_mask = self.decoder.forward(feature_map, conv_output_dims)
        # 6) write masks additively to the mask canvas
        partial_canvas = self.attn_writer.forward(partial_mask, gauss_attn_params, (height, width))
        outputs.append(torch.add(outputs[-1], partial_canvas))

    # return the sigmoided versions
    for i in range(len(outputs)):
        outputs[i] = F.sigmoid(outputs[i])
    return outputs
def process_single_batch(original_images,
                         ego_motion_vectors,
                         disp_maps,
                         calib_frames,
                         batch_number=0,
                         mask_loss_factor=0.1):
    cam_coords = []
    num_frames = calib_frames.shape[0]
    Logger().set('loss_component.disp_maps_mean', disp_maps.data.mean().item())
    Logger().set('loss_component.disp_maps_min', disp_maps.data.min().item())
    Logger().set('loss_component.disp_maps_max', disp_maps.data.max().item())
    Logger().set('loss_component.ego_motion_vectors[0]',
                 np.array2string(ego_motion_vectors[0].detach().cpu().numpy()))

    # step 1) use the inverse cam_matrix and depth to convert
    # frames 1,2,3 into camera coordinates
    for i in range(0, num_frames):
        cam_coords.append(image_to_cam(original_images[i], disp_maps[i], calib_frames[i]))

    # step 2) generate transformation matrices from the ego_motion_vectors
    transforms = []
    for i in range(0, num_frames - 1):
        # fake_ego_motion_vec = torch.zeros_like(ego_motion_vectors[i])
        transforms.append(six_dof_vec_to_matrix(ego_motion_vectors[i]))

    # step 3) transform frame i (cam coords) -> frame i+1 (cam coords),
    # then construct a new 2D image using the new projection matrix
    total_re_loss = torch.zeros([], dtype=original_images.dtype, device=original_images.device)
    total_ssim_loss = torch.zeros([], dtype=original_images.dtype, device=original_images.device)
    total_mask_loss = torch.zeros([], dtype=original_images.dtype, device=original_images.device)
    out_images = []
    for i in range(0, num_frames - 1):
        # augment cam coords with a row of 1's to get 4D homogeneous vectors
        ones_row = torch.ones_like(cam_coords[i])[0, :].unsqueeze(0)
        augmented_vecs = torch.cat((cam_coords[i], ones_row), dim=0)
        cur_frame_coords = torch.matmul(transforms[i], augmented_vecs)

        # pad the 3x3 intrinsics to a 4x4 homogeneous calibration matrix
        intrin_filler_right = torch.zeros(3, dtype=original_images.dtype,
                                          device=original_images.device).unsqueeze(1)
        intrin_filler_bottom = torch.zeros(4, dtype=original_images.dtype,
                                           device=original_images.device).unsqueeze(0)
        intrin_filler_bottom[0, 3] = 1
        hom_calib = torch.cat((calib_frames[i], intrin_filler_right), dim=1)
        hom_calib = torch.cat((hom_calib, intrin_filler_bottom), dim=0)

        warped_image, mask = cam_to_image(hom_calib, cur_frame_coords, original_images[i])
        out_images.append(warped_image)

        # compare warped_image to the next real image; don't use 0 pixels for the loss
        ptimage = PTImage.from_cwh_torch(warped_image)
        ptmask = PTImage.from_2d_wh_torch(mask)
        orig_image = PTImage.from_cwh_torch(original_images[i])
        # ImageVisualizer().set_image(orig_image, 'original_images {}'.format(i))
        ImageVisualizer().set_image(ptimage, 'warped_image {}-{}'.format(batch_number, i))
        ImageVisualizer().set_image(ptmask, 'mask {}-{}'.format(batch_number, i))
        Logger().set('loss_component.mask_mean.{}-{}'.format(batch_number, i),
                     mask.mean().data.item())

        masked_warp_image = warped_image.unsqueeze(0) * mask
        masked_gt_image = original_images[i + 1].unsqueeze(0) * mask
        re_loss = F.smooth_l1_loss(masked_warp_image, masked_gt_image, reduction='none')
        # add loss to prevent the mask from going to 0
        # total_mask_loss += mask_loss_factor * F.smooth_l1_loss(mask, torch.ones_like(mask))
        total_re_loss += re_loss.mean()
        total_ssim_loss += (1 - ssim(masked_warp_image, masked_gt_image)) / 2

    Logger().set('loss_component.mask_loss.{}'.format(batch_number), total_mask_loss.data.item())
    Logger().set('loss_component.batch_re_loss.{}'.format(batch_number), total_re_loss.data.item())
    Logger().set('loss_component.batch_ssim_loss.{}'.format(batch_number), total_ssim_loss.data.item())
    return total_re_loss + total_ssim_loss + total_mask_loss, out_images
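# Self-contained sketch (not from the repo) of the geometry in step 3 above, using
# synthetic tensors: augment 3D camera coordinates to homogeneous form, apply a 4x4
# rigid-body transform, pad the intrinsics to 4x4, and project back to pixels. The
# intrinsics and translation values are made up, and the real image_to_cam /
# cam_to_image helpers are assumed to handle the per-pixel version of this.
import torch

def warp_points_sketch():
    n_points = 5
    cam_points = torch.rand(3, n_points)                  # 3 x N camera coordinates
    transform = torch.eye(4)                              # identity rotation ...
    transform[0, 3] = 0.5                                 # ... plus a small x translation
    ones_row = torch.ones(1, n_points)
    augmented = torch.cat((cam_points, ones_row), dim=0)  # 4 x N homogeneous points
    moved = transform @ augmented                         # points in the next frame
    intrinsics = torch.tensor([[700.0, 0.0, 320.0],
                               [0.0, 700.0, 240.0],
                               [0.0, 0.0, 1.0]])
    hom_calib = torch.eye(4)
    hom_calib[0:3, 0:3] = intrinsics                      # 4x4 padded calibration matrix
    projected = hom_calib @ moved
    return projected[0:2, :] / projected[2, :]            # perspective divide -> pixel coords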
def __init__(self, image_path='', objects=None):
    self.image_path = image_path
    # avoid a shared mutable default argument
    self.objects = objects if objects is not None else []
    self.image = PTImage(pil_image_path=self.image_path, persist=False)
def __init__(self, image_path='', objs=[], calib_mat=None):
    self.image_path = image_path
    self.objects = copy.deepcopy(objs)
    self.image = PTImage(pil_image_path=self.image_path, persist=False)
    # 4x3 calibration matrix
    self.calib_mat = calib_mat
def __init__(self, image_path='', objs=[]):
    self.image_path = image_path
    self.objects = copy.deepcopy(objs)
    self.image = PTImage(pil_image_path=self.image_path, persist=False)