def getObjectPointFromImage(camera):
    """Get the object pose from a camera image."""
    import torch
    from cv_bridge import CvBridge
    from geometry_msgs.msg import Point, PointStamped
    from specs import localize_target

    max_pix_value = 1.0
    normalizer = 255.0 / max_pix_value

    # Convert the ROS image to a normalized torch tensor (channels first)
    cam_img = CvBridge().imgmsg_to_cv2(camera, 'rgb8') / normalizer
    cam_img = torch.tensor(cam_img).permute(2, 1, 0)

    # Localize the target in pixel space, then map it to world coordinates
    # (the constants are calibration values for this camera setup)
    targ = localize_target(cam_img)
    point = PointStamped(header=camera.header,
                         point=Point(x=(1.581 - targ[0] / 154.29),
                                     y=(-0.16 - targ[1] / 154.29),
                                     z=0.964))
    return point
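# A minimal usage sketch (not part of the original source): a node that feeds
# camera frames to getObjectPointFromImage and republishes the result. The
# topic names '/camera/image_raw' and '/object_position' are assumptions made
# for illustration only.
def _example_object_point_node():
    import rospy
    from sensor_msgs.msg import Image
    from geometry_msgs.msg import PointStamped

    rospy.init_node('object_point_example')
    pub = rospy.Publisher('/object_position', PointStamped, queue_size=1)

    def on_image(msg):
        # msg is a sensor_msgs/Image, as expected by getObjectPointFromImage
        pub.publish(getObjectPointFromImage(msg))

    rospy.Subscriber('/camera/image_raw', Image, on_image)
    rospy.spin()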
def find_object_1(t, camera, obj_pos_topic):
    underSmpl = 5
    # Only process every underSmpl-th simulation frame (50 Hz base rate)
    if int(t * 50) % underSmpl == 0:
        import torch
        from cv_bridge import CvBridge
        from geometry_msgs.msg import Point
        from specs import localize_target

        max_pix_value = 1.0
        normalizer = 255.0 / max_pix_value
        cam_img = CvBridge().imgmsg_to_cv2(camera.value, 'rgb8') / normalizer
        cam_img = torch.tensor(cam_img).permute(2, 1, 0)

        targ = localize_target(cam_img)
        if targ[1] < 100:  # otherwise it is detecting something else
            targ = (-1, -1)

        msg = Point()
        msg.x = targ[0]
        msg.y = targ[1]
        obj_pos_topic.send_message(msg)
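# `localize_target` is defined in specs.py, which is not shown here. Purely as
# an illustration of the assumed interface (a (C, W, H) float tensor in, an
# (x, y) pixel coordinate out), a minimal sketch that picks the brightest
# pixel of one channel; the real implementation may differ substantially.
def _example_localize_target(img):
    """Return an (x, y) pixel coordinate for the target in `img`."""
    import torch
    saliency = img[0]                   # use one color channel as a saliency map
    idx = int(torch.argmax(saliency))   # flat index of the brightest pixel
    x = idx // saliency.shape[1]        # first axis of the (W, H) layout
    y = idx % saliency.shape[1]         # second axis of the (W, H) layout
    return (x, y)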
def img_to_pred(t, camera, plot_topic, latent_topic, pred_pos_topic, pred_msg,
                model, model_path, model_inputs, optimizer, scheduler, run_step):
    # Imports
    import os
    import torch
    import torch.nn.functional as F
    import numpy as np
    from prednet import PredNet
    from cv_bridge import CvBridge
    from std_msgs.msg import Float32MultiArray, MultiArrayLayout, MultiArrayDimension
    from specs import localize_target, complete_target_positions, mark_target, exp_dir

    # Image and model parameters
    underSmpl = 5   # Avoid too sharp a time resolution (no change between frames)
    nt = 15         # Number of "past" frames given to the network
    t_extrap = 5    # After this frame, input is not used for future predictions
    n_feat = 1      # Factor for the number of features used in the network
    max_pix_value = 1.0
    normalizer = 255.0 / max_pix_value
    C_channels = 3  # 1 or 3 (color channels)
    A_channels = (C_channels, n_feat * 4, n_feat * 8, n_feat * 16)
    R_channels = (C_channels, n_feat * 4, n_feat * 8, n_feat * 16)
    scale = 4       # 2 or 4 (how much the layers down/upsample images)
    pad = 8 if scale == 4 else 0  # For up/downsampling to work
    model_name = 'model' + str(n_feat) + '.pt'
    new_model_path = os.getcwd() + '/resources/' + model_name
    trained_w_path = exp_dir + model_name  # exp_dir computed in specs.py
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Training parameters
    use_new_w = False      # If True, do not use weights that are saved in new_model_path
    use_trained_w = True   # If the above is False, use trained_w_path as model weights
    do_train = False       # Train on present frames if True, predict the future if False
    initial_lr = 1e-4      # The learning rate is then scheduled with cosine annealing
    epoch_loop = 100       # Every epoch_loop steps, a prediction is made to monitor progress
    n_batches = 1          # For now, not usable (could roll images for multiple batches)

    # Check that the simulation frame is far enough
    if camera.value is not None and int(t * 50) % underSmpl == 0:

        # Collect the input image and initialize the network input
        cam_img = CvBridge().imgmsg_to_cv2(camera.value, 'rgb8') / normalizer
        if C_channels == 3:
            # Note: this should be (2, 0, 1), but the model was already trained
            # with this axis order (channels first, width and height swapped)
            cam_img = torch.tensor(cam_img, device=device).permute(2, 1, 0)
        if C_channels == 1:
            cam_img = cam_img[:, :, 1]  # .mean(axis=2)
            cam_img = torch.tensor(cam_img, device=device).unsqueeze(dim=2).permute(2, 1, 0)
        img_shp = cam_img.shape
        cam_img = F.pad(cam_img, (pad, pad), 'constant', 0.0)  # width may need to be 256
        if model_inputs.value is None:
            model_inputs.value = torch.zeros((1, nt) + cam_img.shape, device=device)

        # Update the model or the mode, if needed
        run_step.value = run_step.value + 1
        if new_model_path != model_path.value:

            # Update the model path if new or changed and reset the prediction plot
            model_path.value = new_model_path
            pred_msg.value = torch.ones(img_shp[0], img_shp[1] * (nt - t_extrap),
                                        img_shp[2] + 10) * 64.0

            # Load or reload the model
            model.value = PredNet(R_channels, A_channels, device=device,
                                  t_extrap=t_extrap, scale=scale)
            if device == 'cuda':
                model.value = model.value.to('cuda')
            if run_step.value == 1:
                try:
                    if use_new_w:
                        raise IOError('Forcing random weight initialization.')
                    if use_trained_w:
                        model.value.load_state_dict(torch.load(trained_w_path))
                        clientLogger.info('Model initialized with pre-trained weights.')
                    else:
                        model.value.load_state_dict(torch.load(model_path.value))
                        clientLogger.info('Learning weights loaded in the model.')
                except Exception:
                    clientLogger.info('No existing weight file found. Model initialized randomly.')

        # Initialize some variables needed for training
        time_loss_w = [1.0 / (nt - 1) if s > 0 else 0.0 for s in range(nt)]
        if t_extrap < nt:
            time_loss_w = [w if n < t_extrap else 2.0 * w
                           for n, w in enumerate(time_loss_w)]
        if None in [optimizer.value, scheduler.value]:
            optimizer.value = torch.optim.Adam(model.value.parameters(), lr=initial_lr)
            scheduler.value = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
                optimizer.value, T_0=50)

        # Save the model at each epoch
        if run_step.value % epoch_loop == 1:
            torch.save(model.value.state_dict(), model_path.value)

        # Check that the model exists and initialize the plot message
        if model.value is not None:

            # Feed the network and train it or compute a prediction
            model_inputs.value = model_inputs.value.roll(-1, dims=1)
            model_inputs.value[0, -1, :, :, :] = cam_img
            if run_step.value > nt:

                # Compute the prediction along present frames and update the weights
                if do_train:

                    # Compute the prediction loss for every frame
                    pred, latent = model.value(model_inputs.value, nt)
                    loss = torch.tensor([0.0], device=device)
                    for s in range(nt):
                        error = (pred[s][0] - model_inputs.value[0][s]) ** 2
                        loss += torch.sum(error) * time_loss_w[s]

                    # Backward pass and weight updates
                    optimizer.value.zero_grad()
                    loss.backward()
                    optimizer.value.step()
                    scheduler.value.step()

                # Predict future frames without weight updates
                else:
                    with torch.no_grad():
                        pred, latent = model.value(
                            model_inputs.value[:, -t_extrap:, :, :, :], nt)

                    # Collect the prediction frames
                    displays = []
                    targ_pos = []
                    for s in range(nt - t_extrap):
                        disp = torch.detach(pred[t_extrap + s].clamp(
                            0.0, 1.0)[0, :, :, pad:-pad]).cpu()
                        # disp = model_inputs.value[0, -(s+1), :, :, pad:-pad].cpu()  # for tests
                        targ_pos.append(localize_target(disp))
                        displays.append(disp)

                    # Complete missing target positions, highlight the target
                    # and set the display message
                    if 0 < np.sum([any([np.isnan(p) for p in pos])
                                   for pos in targ_pos]) < len(targ_pos) - 2:
                        targ_pos = complete_target_positions(targ_pos)
                    for s, (disp, pos) in enumerate(zip(displays, targ_pos)):
                        pred_msg.value[:, s * img_shp[1]:(s + 1) * img_shp[1],
                                       :img_shp[2]] = mark_target(disp, pos)

                # Print loss or prediction messages
                if do_train:
                    clientLogger.info('Epoch: %2i - step: %2i - error: %5.4f - lr: %5.4f'
                                      % (int(run_step.value / epoch_loop),
                                         run_step.value % epoch_loop,
                                         loss.item(), scheduler.value.get_lr()[0]))
                else:
                    clientLogger.info('Prediction for future target locations: ' + str(targ_pos))

                # Send the latent state message (latent[0] to remove the batch dimension)
                latent_msg = list(latent[0].cpu().numpy().flatten())
                layout_msg = MultiArrayLayout(
                    dim=[MultiArrayDimension(size=d) for d in latent[0].shape])
                latent_topic.send_message(Float32MultiArray(layout=layout_msg, data=latent_msg))

                # Send the predicted positions mapped to world coordinates; targ_pos
                # only exists in prediction mode, hence the guard (z is constant)
                if not do_train:
                    pos_3d_msg = [[1.562 - p[0] / 156.274, -0.14 - p[1] / 152.691, 0.964]
                                  for p in targ_pos]
                    pos_3d_msg = [p for pos in pos_3d_msg for p in pos]  # flatten the list
                    layout_msg = MultiArrayLayout(dim=[
                        MultiArrayDimension(size=d) for d in [len(targ_pos), 3]])
                    pred_pos_topic.send_message(
                        Float32MultiArray(layout=layout_msg, data=pos_3d_msg))

                # Collect the input frames
                inpt_msg = torch.zeros(img_shp[0], img_shp[1] * (nt - t_extrap), img_shp[2])
                for s in range(nt - t_extrap):
                    inpt_msg[:, s * img_shp[1]:(s + 1) * img_shp[1], :] = \
                        model_inputs.value[0, t_extrap + s, :, :, pad:-pad]

                # Build and send the display message
                plot_msg = torch.cat((pred_msg.value, inpt_msg), 2).numpy().transpose(
                    2, 1, 0) * int(normalizer)
                if C_channels == 1:
                    plot_msg = np.dstack((plot_msg, plot_msg, plot_msg))
                plot_topic.send_message(CvBridge().cv2_to_imgmsg(
                    plot_msg.astype(np.uint8), 'rgb8'))
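# A minimal sketch (not in the original source) of a subscriber that unpacks
# the Float32MultiArray published on pred_pos_topic above. The topic name
# '/pred_positions' is an assumption; the layout encodes [n_frames, row_length]
# with rows (x, y, z) here, or (x, y, z, t) in the class-based variant below.
def _example_pred_pos_listener():
    import rospy
    from std_msgs.msg import Float32MultiArray

    def on_positions(msg):
        n, d = (dim.size for dim in msg.layout.dim)  # n rows of d floats each
        positions = [msg.data[i * d:(i + 1) * d] for i in range(n)]
        rospy.loginfo('Predicted target positions: %s' % (positions,))

    rospy.init_node('pred_pos_listener')
    rospy.Subscriber('/pred_positions', Float32MultiArray, on_positions)
    rospy.spin()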
def run_step(self):
    # (module-level imports of rospy, torch, F, np, CvBridge, PredNet and the
    # specs helpers are assumed, as in the original file)
    # Check that the camera device is on and that it is the right time-step
    if self.camera is not None:
        t = self.camera.header.stamp.to_sec() * 1000.0  # in milliseconds
        if t > self.last_cam_time + 20 * self.underSmpl:  # one ros time-step is 20 ms
            self.last_cam_time = t

            # Collect the input image and initialize the network input
            cam_img = CvBridge().imgmsg_to_cv2(self.camera, 'rgb8') / self.normalizer
            if self.C_channels == 3:
                # Note: this should be (2, 0, 1), but the model was already trained
                # with this axis order (channels first, width and height swapped)
                cam_img = torch.tensor(cam_img, device=self.device).permute(2, 1, 0)
            if self.C_channels == 1:
                cam_img = cam_img[:, :, 1]  # .mean(axis=2)
                cam_img = torch.tensor(cam_img, device=self.device).unsqueeze(dim=2).permute(2, 1, 0)
            img_shp = cam_img.shape
            cam_inp = F.pad(cam_img, (self.pad, self.pad), 'constant', 0.0)  # width may need to be 256
            if self.model_inputs is None:
                self.model_inputs = torch.zeros((1, self.nt) + cam_inp.shape, device=self.device)

            # Update the model or the mode, if needed
            self.running_step = self.running_step + 1
            if self.new_model_path != self.model_path:

                # Update the model path if new or changed and reset the prediction plot
                self.model_path = self.new_model_path
                self.pred_msg = torch.ones(img_shp[0],
                                           img_shp[1] * (self.nt - self.t_extrap + 1),
                                           img_shp[2] + 10) * 64.0

                # Load or reload the model
                self.model = PredNet(self.R_channels, self.A_channels,
                                     device=self.device, t_extrap=self.t_extrap,
                                     scale=self.scale)
                if self.device == 'cuda':
                    self.model = self.model.to('cuda')
                if self.running_step == 1:
                    try:
                        if self.use_new_w:
                            raise IOError('Forcing random weight initialization.')
                        if self.use_trained_w:
                            self.model.load_state_dict(torch.load(self.trained_w_path))
                            rospy.loginfo('Model initialized with pre-trained weights.')
                        else:
                            self.model.load_state_dict(torch.load(self.model_path))
                            rospy.loginfo('Learning weights loaded in the model.')
                    except Exception:
                        rospy.loginfo('No existing weight file found. Model initialized randomly.')

            # Initialize some variables needed for training
            time_loss_w = [1.0 / (self.nt - 1) if s > 0 else 0.0 for s in range(self.nt)]
            if self.t_extrap < self.nt:
                time_loss_w = [w if n < self.t_extrap else 2.0 * w
                               for n, w in enumerate(time_loss_w)]

            # Initialize the optimizer and the scheduler if needed
            if None in [self.optimizer, self.scheduler]:
                self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.initial_lr)
                self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
                    self.optimizer, T_0=50)

            # Save the model at each epoch
            if self.running_step % self.epoch_loop == 1:
                torch.save(self.model.state_dict(), self.model_path)

            # Check that the model exists and initialize the plot message
            if self.model is not None:

                # Feed the network and train it or compute a prediction
                self.model_inputs = self.model_inputs.roll(-1, dims=1)
                self.model_inputs[0, -1, :, :, :] = cam_inp
                if self.running_step > self.nt:

                    # Compute the prediction along present frames and update the weights
                    if self.do_train:

                        # Compute the prediction loss for every frame
                        pred, latent = self.model(self.model_inputs, self.nt)
                        loss = torch.tensor([0.0], device=self.device)
                        for s in range(self.nt):
                            error = (pred[s][0] - self.model_inputs[0][s]) ** 2
                            loss += torch.sum(error) * time_loss_w[s]

                        # Backward pass and weight updates
                        self.optimizer.zero_grad()
                        loss.backward()
                        self.optimizer.step()
                        self.scheduler.step()

                    # Predict future frames without weight updates
                    else:
                        with torch.no_grad():
                            pred, latent = self.model(
                                self.model_inputs[:, -self.t_extrap:, :, :, :], self.nt)

                        # Collect the prediction frames
                        displays = [cam_img]                   # First displayed frame is the present frame
                        targ_pos = [localize_target(cam_img)]  # Localize the target on the present camera frame
                        t_stamps = [t]                         # Time of the present frame is the camera rostime
                        for s in range(self.nt - self.t_extrap):
                            disp = torch.detach(pred[self.t_extrap + s].clamp(
                                0.0, 1.0)[0, :, :, self.pad:-self.pad]).cpu()
                            targ_pos.append(localize_target(disp))
                            displays.append(disp)
                            # One frame step is 20 ms * underSmpl (t is in milliseconds)
                            t_stamps.append(t + (s + 1) * 20.0 * self.underSmpl)

                        # Complete missing target positions, highlight the target
                        # and set the display message
                        if 0 < np.sum([any([np.isnan(p) for p in pos])
                                       for pos in targ_pos]) < len(targ_pos) - 2:
                            targ_pos = complete_target_positions(targ_pos)
                        for s, (disp, pos) in enumerate(zip(displays, targ_pos)):
                            self.pred_msg[:, s * img_shp[1]:(s + 1) * img_shp[1],
                                          :img_shp[2]] = mark_target(disp, pos)

                    # Print loss or prediction messages
                    if self.do_train:
                        rospy.loginfo('Epoch: %2i - step: %2i - error: %5.4f - lr: %5.4f'
                                      % (int(self.running_step / self.epoch_loop),
                                         self.running_step % self.epoch_loop,
                                         loss.item(), self.scheduler.get_lr()[0]))
                    else:
                        rospy.loginfo('Prediction for future target locations: ' + str(targ_pos))

                    # Send the latent state message (latent[0] to remove the batch dimension)
                    latent_msg = list(latent[0].cpu().numpy().flatten())
                    layout_msg = MultiArrayLayout(
                        dim=[MultiArrayDimension(size=d) for d in latent[0].shape])
                    self.latent_pub.publish(Float32MultiArray(layout=layout_msg, data=latent_msg))

                    # Send the predicted positions, mapped to world coordinates and
                    # time-stamped; targ_pos only exists in prediction mode, hence
                    # the guard (z is constant at the table height)
                    if not self.do_train:
                        pos_3d_msg = [[1.562 - p[0] / 156.274, -0.14 - p[1] / 152.691, 0.964]
                                      for p in targ_pos]
                        pos_4d_msg = [[p[0], p[1], p[2], s]
                                      for (s, p) in zip(t_stamps, pos_3d_msg)]  # add time stamps
                        pos_4d_msg = [p for pos in pos_4d_msg for p in pos]  # flatten the list
                        layout_msg = MultiArrayLayout(dim=[
                            MultiArrayDimension(size=d) for d in [len(targ_pos), 4]])
                        self.pred_pos_pub.publish(
                            Float32MultiArray(layout=layout_msg, data=pos_4d_msg))

                    # Collect the input frames
                    inpt_msg = torch.zeros(img_shp[0],
                                           img_shp[1] * (self.nt - self.t_extrap + 1),
                                           img_shp[2])
                    for s in range(self.nt - self.t_extrap):
                        inpt_msg[:, (s + 1) * img_shp[1]:(s + 2) * img_shp[1], :] = \
                            self.model_inputs[0, self.t_extrap + s, :, :, self.pad:-self.pad]

                    # Build and send the display message
                    plot_msg = torch.cat((self.pred_msg, inpt_msg), 2).numpy().transpose(
                        2, 1, 0) * int(self.normalizer)
                    if self.C_channels == 1:
                        plot_msg = np.dstack((plot_msg, plot_msg, plot_msg))
                    self.plot_pub.publish(CvBridge().cv2_to_imgmsg(
                        plot_msg.astype(np.uint8), 'rgb8'))
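# `complete_target_positions` comes from specs.py, which is not shown here. As
# a purely illustrative sketch of the assumed behavior, the helper below
# linearly interpolates (x, y) positions for frames where the localizer
# returned NaN; the original implementation may work differently.
def _example_complete_target_positions(targ_pos):
    import numpy as np
    pos = np.array(targ_pos, dtype=float)   # shape (n_frames, 2)
    valid = ~np.isnan(pos).any(axis=1)      # frames with a successful detection
    frames = np.arange(len(pos))
    for col in range(pos.shape[1]):         # interpolate x and y independently
        pos[~valid, col] = np.interp(frames[~valid], frames[valid], pos[valid, col])
    return [tuple(p) for p in pos]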