def act(self, ob):
    if self.policy is None:
        raise AttributeError('Policy attribute has to be set!')
    state = JointState(self.get_full_state(), ob)
    action = self.policy.predict(state)
    return action
def state_callback(self, observe_info):
    # Build the robot's full state and the pedestrians' observable states
    # from the incoming observation message.
    robot_state = observe_info.robot_state
    robot_full_state = FullState(robot_state.pos_x, robot_state.pos_y,
                                 robot_state.vel_x, robot_state.vel_y,
                                 robot_state.radius, robot_state.goal_x,
                                 robot_state.goal_y, robot_state.vmax,
                                 robot_state.theta)
    observable_states = [
        ObservableState(ped_state.pos_x, ped_state.pos_y,
                        ped_state.vel_x, ped_state.vel_y, ped_state.radius)
        for ped_state in observe_info.ped_states
    ]
    self.cur_state = JointState(robot_full_state, observable_states)

    action_cmd = ActionCmd()
    # Stop once the robot is within 0.3 m of its goal.
    dis = np.sqrt((robot_full_state.px - robot_full_state.gx) ** 2 +
                  (robot_full_state.py - robot_full_state.gy) ** 2)
    if dis < 0.3:
        action_cmd.stop = True
        action_cmd.vel_x = 0.0
        action_cmd.vel_y = 0.0
    else:
        print("state callback")
        action_cmd.stop = False
        robot_action, robot_action_index = self.robot_policy.predict(self.cur_state)
        print('robot_action', robot_action.v, robot_action.r)
        # The command message carries (v, r) in its vel_x / vel_y fields.
        action_cmd.vel_x = robot_action.v
        action_cmd.vel_y = robot_action.r
    self.robot_action_pub.publish(action_cmd)
def act(self, obs):
    robot_state, humans_state, local_map = obs
    state = JointState(robot_state, humans_state)
    action = self.policy.predict(state, local_map, None)
    # de-normalize: scale the predicted speed by the robot's preferred velocity
    action = ActionRot(robot_state.v_pref * action.v, action.r)
    return action
def act(self, ob): if self.policy is None: raise AttributeError("Policy attribute has to be set!") state = JointState(self.get_full_state(), ob) action = self.policy.predict(state) return action
def act(self, ob): """ The state for human is its full state and all other agents' observable states :param ob: :return: """ state = JointState(self.get_full_state(), ob) action = self.policy.predict(state) return action
def act_static(self, ob):
    """
    Static variant: build the joint state as above but always return a zero action.

    :param ob: list of ObservableState for all other agents
    :return: ActionXY(0, 0)
    """
    state = JointState(self.get_full_state(), ob)
    action = ActionXY(0, 0)
    return action
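# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original code): shows how the
# act() methods above are typically driven, using only the constructors that
# already appear in this file. The import paths follow the common CrowdNav
# layout and are an assumption; GoalSeekingPolicy is a hypothetical stand-in
# for any policy object exposing predict(). Some forks name the robot part of
# JointState `self_state`, others `robot_state`; `self_state` is assumed here.
# ---------------------------------------------------------------------------
import numpy as np

from crowd_sim.envs.utils.state import FullState, ObservableState, JointState  # assumed path
from crowd_sim.envs.utils.action import ActionXY  # assumed path


class GoalSeekingPolicy:
    """Hypothetical stand-in policy: head straight for the goal at v_pref."""

    def predict(self, state):
        s = state.self_state
        dx, dy = s.gx - s.px, s.gy - s.py
        norm = np.hypot(dx, dy) or 1.0
        return ActionXY(s.v_pref * dx / norm, s.v_pref * dy / norm)


def demo_act_once():
    """Build a joint state by hand and query a policy once (sketch only)."""
    # Robot: at (0, -4), stationary, radius 0.3, goal (0, 4), v_pref 1.0, heading pi/2.
    robot_full_state = FullState(0.0, -4.0, 0.0, 0.0, 0.3, 0.0, 4.0, 1.0, np.pi / 2)
    # Observed humans: position, velocity, radius.
    ob = [ObservableState(1.0, 0.0, 0.0, -1.0, 0.3),
          ObservableState(-1.0, 0.5, 0.2, -0.8, 0.3)]
    state = JointState(robot_full_state, ob)
    return GoalSeekingPolicy().predict(state)  # -> ActionXY(vx, vy)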
def planner(self):
    # update robot
    robot.set(self.px, self.py, self.gx, self.gy, self.vx, self.vy, self.theta)
    dist_to_goal = np.linalg.norm(
        np.array(robot.get_position()) - np.array(robot.get_goal_position()))

    # compute command velocity
    if robot.reached_destination():
        self.cmd_vel.linear.x = 0
        self.cmd_vel.linear.y = 0
        self.cmd_vel.linear.z = 0
        self.cmd_vel.angular.x = 0
        self.cmd_vel.angular.y = 0
        self.cmd_vel.angular.z = 0
        self.Is_goal_reached = True
    else:
        # self state: FullState(px, py, vx, vy, radius, gx, gy, v_pref, theta)
        # ob: [ObservableState(px1, py1, vx1, vy1, radius1),
        #      ObservableState(px2, py2, vx2, vy2, radius2),
        #      ...,
        #      ObservableState(pxn, pyn, vxn, vyn, radiusn)]
        # The policy expects at least one human, so insert a distant fake
        # pedestrian when none are observed.
        if len(self.ob) == 0:
            self.ob = [ObservableState(FAKE_HUMAN_PX, FAKE_HUMAN_PY, 0, 0, HUMAN_RADIUS)]
        self.state = JointState(robot.get_full_state(), self.ob)
        action = policy.predict(self.state)  # max_action
        self.cmd_vel.linear.x = action.v
        self.cmd_vel.linear.y = 0
        self.cmd_vel.linear.z = 0
        self.cmd_vel.angular.x = 0
        self.cmd_vel.angular.y = 0
        self.cmd_vel.angular.z = action.r

    ########### for debug ##########
    # if self.plan_counter % 10 == 0:
    #     rospy.loginfo("robot position:(%s,%s)" % (self.px, self.py))
    #     rospy.loginfo("Distance to goal is %s" % dist_to_goal)
    #     rospy.loginfo("self state:\n %s" % self.state.self_state)
    #     for i in range(len(self.state.human_states)):
    #         rospy.loginfo("human %s :\n %s" % (i + 1, self.state.human_states[i]))
    #     rospy.loginfo("%s-th action is planned: \n v: %s m/s \n r: %s rad/s"
    #                   % (self.plan_counter, self.cmd_vel.linear.x, self.cmd_vel.angular.z))

    # publish velocity
    self.cmd_vel_pub.publish(self.cmd_vel)
    self.plan_counter += 1
    self.visualize_action()
def act(self, ob):
    if self.policy is None:
        raise AttributeError('Policy attribute has to be set!')
    state = JointState(self.get_full_state(), ob)
    action = self.policy.predict(state)
    # IMPORTANT: the predicted action is overridden with a zero action here so the
    # robot stays static while creating simulation data; change this if you do not
    # want to create simulation data.
    action = ActionXY(0, 0)
    return action
def act(self, ob, imitation_learning=True, recorded_unrotated_state=None, occ=None):
    if self.policy is None:
        raise AttributeError('Policy attribute has to be set!')
    if imitation_learning:
        state = JointState(self.get_full_state(), ob)
        action = self.policy.predict(state)
    else:  # RL
        human_xy = recorded_unrotated_state[:, :, 9:11]   # [hist_len, n_humans, 2(x, y)]
        robot_xy = recorded_unrotated_state[:, 0:1, 0:2]  # [hist_len, 1, 2(x, y)]
        joint_xy = torch.cat([robot_xy, human_xy], dim=-2)
        state = JointState(self.get_full_state(), ob)
        # joint_xy: [hist_len, n_humans + 1, 2(x, y)] = [5, 3, 2]; occ: [hist_len - 1, 32] = [4, 32]
        action = self.policy.predict(state, joint_xy, occ)
    return action
def predict(self, obs, env):
    self.simulator.time_step = env._get_dt()
    other_agent_states = [
        agent.get_observable_state()
        for agent in env.soadrl_sim.humans + env.soadrl_sim.other_robots
    ]
    action = self.simulator.predict(
        JointState(env.soadrl_sim.robot.get_full_state(), other_agent_states),
        env.soadrl_sim.obstacle_vertices,
        env.soadrl_sim.robot,
    )
    if self.suicide_if_stuck:
        if action.v < 0.1:
            return Suicide()
    vx = action.v * np.cos(action.r)
    vy = action.v * np.sin(action.r)
    return np.array([vx, vy, 0.1 * (np.random.random() - 0.5)])
def compute_coordinate_observation(self, with_fov=False):
    # TODO: only consider humans in the field of view
    px = self.robot_states[-1].position.x_val
    py = self.robot_states[-1].position.y_val
    if len(self.robot_states) == 1:
        vx = vy = 0
    else:
        # TODO: use kinematics.linear_velocity?
        # Note: this displacement is not divided by self.time_step, unlike the
        # human velocities below.
        vx = self.robot_states[-1].position.x_val - self.robot_states[-2].position.x_val
        vy = self.robot_states[-1].position.y_val - self.robot_states[-2].position.y_val
    r = self.robot_radius
    gx = self.goal_position[0]
    gy = self.goal_position[1]
    v_pref = 1
    _, _, theta = airsim.to_eularian_angles(self.robot_states[-1].orientation)
    robot_state = FullState(px, py, vx, vy, r, gx, gy, v_pref, theta)

    human_states = []
    for i in range(self.human_num):
        if len(self.human_states[i]) == 1:
            vx = vy = 0
        else:
            vx = (self.human_states[i][-1].position.x_val
                  - self.human_states[i][-2].position.x_val) / self.time_step
            vy = (self.human_states[i][-1].position.y_val
                  - self.human_states[i][-2].position.y_val) / self.time_step
        px = self.human_states[i][-1].position.x_val
        py = self.human_states[i][-1].position.y_val
        if with_fov:
            # Skip humans outside the robot's field of view.
            angle = np.arctan2(py - robot_state.py, px - robot_state.px)
            if abs(angle - robot_state.theta) > self.fov / 2:
                continue
        human_state = ObservableState(px, py, vx, vy, self.human_radius)
        human_states.append(human_state)
    if not human_states:
        # The downstream policy expects at least one human; add a distant dummy.
        human_states.append(ObservableState(-6, 0, 0, 0, self.human_radius))

    joint_state = JointState(robot_state, human_states)
    return joint_state
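# ---------------------------------------------------------------------------
# Optional sketch (not part of the original code): the FOV check above
# compares raw angle differences, which can reject visible humans when the
# difference wraps past +/- pi. A wrap-aware variant could look like this.
# ---------------------------------------------------------------------------
import numpy as np


def in_fov(robot_px, robot_py, robot_theta, human_px, human_py, fov):
    """Return True if the human lies inside the robot's field of view."""
    angle = np.arctan2(human_py - robot_py, human_px - robot_px)
    # wrap the heading difference into [-pi, pi] before comparing
    diff = (angle - robot_theta + np.pi) % (2 * np.pi) - np.pi
    return abs(diff) <= fov / 2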
def run_k_episodes(self, k, phase, update_memory=False, imitation_learning=False,
                   episode=None, print_failure=False, noise_explore=0.0,
                   progressbar=True, notebook=False):
    self.robot.policy.set_phase(phase)
    success_times = []
    collision_times = []
    timeout_times = []
    success = 0
    collision = 0
    timeout = 0
    too_close = 0
    min_dist = []
    cumulative_rewards = []
    collision_cases = []
    timeout_cases = []
    if imitation_learning:
        human_obsv = []

    episodes = range(k)
    if progressbar:
        episodes = tqdm_notebook(episodes, leave=False) if notebook else tqdm(episodes, leave=False)

    for i in episodes:
        ob = self.env.reset(phase)
        done = False
        states = []
        actions = []
        rewards = []
        scene = []
        while not done:
            # infer action from policy
            if 'RNN' in self.robot.policy.name:
                action = self.robot.act(ob)
            elif self.num_frames == 1:
                action = self.robot.act(ob)
            else:
                last_ob = latest_frame(ob, self.num_frames)
                action = self.robot.act(last_ob)
            actions.append(action)

            # record the state used for learning
            if imitation_learning:
                if 'RNN' in self.target_policy.name:
                    # TODO: enumerate and test different combinations of policies
                    joint_state = JointState(self.robot.get_full_state(), ob)
                elif 'SAIL' in self.target_policy.name or 'SARL' in self.target_policy.name:
                    joint_state = JointState(self.robot.get_full_state(), ob)
                else:
                    joint_state = self.robot.policy.last_state
                last_state = self.target_policy.transform(joint_state)
                states.append(last_state)
                scene.append(obs_to_frame(ob))
            else:
                states.append(self.robot.policy.last_state)

            # env step, optionally with exploration noise added to the expert action
            if imitation_learning and noise_explore > 0.0:
                noise_magnitude = noise_explore * 2.0
                if self.robot.policy.kinematics == 'holonomic':
                    action = ActionXY(action[0] + torch.rand(1).sub(0.5) * noise_magnitude,
                                      action[1] + torch.rand(1).sub(0.5) * noise_magnitude)
                else:
                    action = ActionRot(action[0] + torch.rand(1).sub(0.5) * noise_magnitude,
                                       action[1] + torch.rand(1).sub(0.5) * noise_magnitude)
            ob, reward, done, info = self.env.step(action)
            rewards.append(reward)

            if isinstance(info, Danger):
                too_close += 1
                min_dist.append(info.min_dist)

        if imitation_learning:
            human_obsv.append(torch.FloatTensor(scene))

        if isinstance(info, ReachGoal):
            success += 1
            success_times.append(self.env.global_time)
        elif isinstance(info, Collision):
            collision += 1
            collision_cases.append(i)
            collision_times.append(self.env.global_time)
        elif isinstance(info, Timeout):
            timeout += 1
            timeout_cases.append(i)
            timeout_times.append(self.env.time_limit)
        else:
            raise ValueError('Invalid end signal from environment')

        if update_memory:
            self.update_memory(states, actions, rewards, imitation_learning)

        # episode result: discounted return
        cumulative_rewards.append(
            sum([pow(self.gamma, t * self.robot.time_step * self.robot.v_pref) * reward
                 for t, reward in enumerate(rewards)]))

    # episode statistics
    success_rate = success / k
    collision_rate = collision / k
    assert success + collision + timeout == k
    avg_nav_time = sum(success_times) / len(success_times) if success_times else self.env.time_limit
    extra_info = '' if episode is None else 'in episode {} '.format(episode)
    reward_mean = statistics.mean(cumulative_rewards)
    reward_std = statistics.stdev(cumulative_rewards) if len(cumulative_rewards) > 1 else 0.0
    logging.info('{:<5} {} success: {:.2f}, collision: {:.2f}, nav time: {:.2f}, reward: {:.4f} +- {:.4f}'
                 .format(phase.upper(), extra_info, success_rate, collision_rate,
                         avg_nav_time, reward_mean, reward_std))
    print('{:<5} {} success: {:.2f}, collision: {:.2f}, nav time: {:.2f}, reward: {:.4f} +- {:.4f}'
          .format(phase.upper(), extra_info, success_rate, collision_rate,
                  avg_nav_time, reward_mean, reward_std))

    if phase in ['val', 'test']:
        total_time = sum(success_times + collision_times + timeout_times) * self.robot.time_step
        logging.info('Frequency of being in danger: %.2f', too_close / total_time)

    if print_failure:
        logging.info('Collision cases: ' + ' '.join([str(x) for x in collision_cases]))
        logging.info('Timeout cases: ' + ' '.join([str(x) for x in timeout_cases]))

    if imitation_learning:
        return human_obsv
def run_k_episodes(self, k, phase, update_memory=False, imitation_learning=False,
                   episode=None, traj_head=None, print_failure=False, noise_explore=0.0,
                   progressbar=False, output_info=False, notebook=False, print_info=False,
                   return_states=False, suffix=None):
    self.robot.policy.set_phase(phase)
    success_times = []
    collision_times = []
    timeout_times = []
    success = 0
    collision = 0
    timeout = 0
    too_close = 0
    min_dist = []
    cumulative_rewards = []
    collision_cases = []
    timeout_cases = []
    states_s = []
    if imitation_learning:
        human_obsv = []

    episodes = range(k)
    if progressbar:
        episodes = tqdm_notebook(episodes, leave=False) if notebook else tqdm(episodes, leave=False)

    traj_acc_sum = 0
    for i in episodes:
        ob = self.env.reset(phase)
        done = False
        states = []
        actions = []
        rewards = []
        scene = []
        while not done:
            # infer action from policy
            if 'TRAJ' in self.robot.policy.name:
                action = self.robot.act(ob)
            elif self.num_frames == 1:
                action = self.robot.act(ob)
            else:
                last_ob = latest_frame(ob, self.num_frames)
                action = self.robot.act(last_ob)
            actions.append(action)

            # record the state used for learning
            if imitation_learning:
                joint_state = JointState(self.robot.get_full_state(), ob)
                last_state = ExplorerDs.transform_mult(joint_state)
                states.append(last_state)
                scene.append(obs_to_frame(ob))
            else:
                states.append(self.robot.policy.last_state)

            # env step, optionally with exploration noise added to the expert action
            if imitation_learning and noise_explore > 0.0:
                noise_magnitude = noise_explore * 2.0
                if self.robot.policy.kinematics == 'holonomic':
                    action = ActionXY(action[0] + torch.rand(1).sub(0.5) * noise_magnitude,
                                      action[1] + torch.rand(1).sub(0.5) * noise_magnitude)
                else:
                    action = ActionRot(action[0] + torch.rand(1).sub(0.5) * noise_magnitude,
                                       action[1] + torch.rand(1).sub(0.5) * noise_magnitude)
            ob, reward, done, info = self.env.step(action)
            rewards.append(reward)

            if isinstance(info, Danger):
                too_close += 1
                min_dist.append(info.min_dist)

        if imitation_learning:
            human_obsv.append(torch.FloatTensor(scene))

        if isinstance(info, ReachGoal):
            success += 1
            success_times.append(self.env.global_time)
        elif isinstance(info, Collision):
            collision += 1
            collision_cases.append(i)
            collision_times.append(self.env.global_time)
        elif isinstance(info, Timeout):
            timeout += 1
            timeout_cases.append(i)
            timeout_times.append(self.env.time_limit)
        else:
            raise ValueError('Invalid end signal from environment')

        # evaluate the trajectory-prediction head on the recorded states
        traj_accuracy = 0
        if traj_head is not None:
            for j, ([rob, hum], human_feats) in enumerate(states):
                next_human_pos = []
                for step in range(4):
                    n_p = states[min(j + step + 1, len(states) - 1)][0][1][-1, :, :2]
                    next_human_pos.append(n_p)
                with torch.no_grad():
                    next_human_pos = torch.cat(next_human_pos, dim=1)
                    next_human_pos_pred = traj_head(human_feats)
                accuracy = ((next_human_pos - next_human_pos_pred) ** 2).mean().item()
                traj_accuracy += accuracy
        traj_acc_sum += traj_accuracy

        if update_memory:
            self.update_memory(states, actions, rewards, imitation_learning)
        states_s.append(states)

        # episode result: discounted return
        cumulative_rewards.append(
            sum([pow(self.gamma, t * self.robot.time_step * self.robot.v_pref) * reward
                 for t, reward in enumerate(rewards)]))

    traj_acc_sum = traj_acc_sum / sum(len(s) for s in states_s)

    # episode statistics
    success_rate = success / k
    collision_rate = collision / k
    timeout_rate = timeout / k
    assert success + collision + timeout == k
    avg_nav_time = sum(success_times) / len(success_times) if success_times else self.env.time_limit

    if not print_info:
        extra_info = '' if episode is None else 'in episode {} '.format(episode)
        logging.info('{:<5} {} success: {:.2f}, collision: {:.2f}, nav time: {:.2f}, reward: {:.4f} +- {:.4f}'
                     .format(phase.upper(), extra_info, success_rate, collision_rate, avg_nav_time,
                             statistics.mean(cumulative_rewards),
                             (statistics.stdev(cumulative_rewards) if len(cumulative_rewards) > 1 else 0.0)))

    info = {
        'success': success_rate,
        'collision': collision_rate,
        'nav time': avg_nav_time,
        'reward': statistics.mean(cumulative_rewards)
    }
    if print_info:
        info = 'success: {:.4f}, collision: {:.4f}, nav time: {:.2f}, reward: {:.4f}, timeout: {:.4f}'.format(
            success_rate, collision_rate, avg_nav_time, statistics.mean(cumulative_rewards), timeout_rate)
        if traj_head is not None:
            info += ', traj accuracy: {:.4f}'.format(traj_acc_sum)
        if suffix is not None:
            info = suffix + info
        print(info)

    if print_failure:
        logging.info('Collision cases: ' + ' '.join([str(x) for x in collision_cases]))
        logging.info('Timeout cases: ' + ' '.join([str(x) for x in timeout_cases]))

    info_dic = {
        'success': success_rate,
        'collision': collision_rate,
        'nav time': avg_nav_time,
        'reward': statistics.mean(cumulative_rewards),
        'traj accuracy': traj_acc_sum
    }
    if output_info:
        return (info_dic, states_s) if return_states else info_dic
    if imitation_learning:
        return human_obsv
def predict(self, state):
    """
    A base class for all methods that take the pairwise joint state as input to the value network.
    The input to the value network is always of shape (batch_size, # humans, rotated joint state length).
    """
    if self.phase is None or self.device is None:
        raise AttributeError('Phase, device attributes have to be set!')
    if self.phase == 'train' and self.epsilon is None:
        raise AttributeError('Epsilon attribute has to be set in training phase')

    if self.reach_destination(state):
        return ActionXY(0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
    if self.action_space is None:
        self.build_action_space(state.robot_state.v_pref)

    if not state.human_states:
        assert self.phase != 'train'
        if hasattr(self, 'attention_weights'):
            self.attention_weights = list()
        return self.select_greedy_action(state.robot_state)

    max_action_index = 0
    occupancy_maps = None
    probability = np.random.random()
    if self.phase == 'train' and probability < self.epsilon:
        # epsilon-greedy exploration: pick a random action
        max_action_index = np.random.choice(len(self.action_space))
        max_action = self.action_space[max_action_index]
    else:
        self.action_values = list()
        max_value = float('-inf')
        max_action = None
        rewards = []
        batch_input_tensor = None
        for action in self.action_space:
            next_robot_state = self.propagate(state.robot_state, action)
            if self.query_env:
                next_human_states, reward, done, info = self.env.onestep_lookahead(action)
            else:
                next_human_states = [self.propagate(human_state, ActionXY(human_state.vx, human_state.vy))
                                     for human_state in state.human_states]
                next_state = JointState(next_robot_state, next_human_states)
                reward, _ = self.reward_estimator.estimate_reward_on_predictor(state, next_state)
            rewards.append(reward)
            batch_next_states = torch.cat([torch.Tensor([next_robot_state + next_human_state]).to(self.device)
                                           for next_human_state in next_human_states], dim=0)
            rotated_batch_input = self.rotate(batch_next_states).unsqueeze(0)
            if self.with_om:
                if occupancy_maps is None:
                    occupancy_maps = self.build_occupancy_maps(next_human_states).unsqueeze(0)
                rotated_batch_input = torch.cat([rotated_batch_input, occupancy_maps], dim=2)
            if batch_input_tensor is None:
                batch_input_tensor = rotated_batch_input
            else:
                batch_input_tensor = torch.cat([batch_input_tensor, rotated_batch_input], dim=0)

        # one-step lookahead: value = reward + gamma^(time_step * v_pref) * V(next_state)
        next_value = self.model(batch_input_tensor).squeeze(1)
        rewards_tensor = torch.tensor(rewards).to(self.device)
        value = rewards_tensor + next_value * pow(self.gamma, self.time_step * state.robot_state.v_pref)
        max_action_index = value.argmax()
        best_value = value[max_action_index]
        if best_value > max_value:
            max_action = self.action_space[max_action_index]
        if max_action is None:
            raise ValueError('Value network is not well trained. ')

    if self.phase == 'train':
        self.last_state = self.transform(state)

    return max_action, int(max_action_index)
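# ---------------------------------------------------------------------------
# Illustrative helper (not part of the original code): the scoring rule used
# in predict() above, value = reward + gamma ** (time_step * v_pref) * V(s'),
# written out for a single action with plain floats.
# ---------------------------------------------------------------------------
def one_step_lookahead_value(reward, next_state_value, gamma, time_step, v_pref):
    """Discounted one-step return used to rank candidate actions."""
    return reward + pow(gamma, time_step * v_pref) * next_state_value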
def get_state(self, ob):
    if self.policy is None:
        raise AttributeError('Policy attribute has to be set!')
    state = JointState(self.get_full_state(), ob)
    return self.policy.transform(state)
def run_k_episodes(self, k_train, k_val, phase, update_memory=False, episode=None,
                   imitation_learning=False, print_failure=False, with_render=False):
    self.robot.policy.set_phase(phase)
    success_times = []
    collision_times = []
    timeout_times = []
    success = 0
    collision = 0
    timeout = 0
    too_close = 0
    min_dist = []
    cumulative_rewards = []
    collision_cases = []
    timeout_cases = []
    k = k_val + k_train

    for i in range(k):
        ob = self.env.reset(phase='gen')
        robot_state = self.robot.get_full_state()
        done = False
        states = []
        actions = []
        rewards = []
        prev_obs = [JointState(robot_state, ob)]

        # Warm-up: collect obs_len - 1 observations before planning.
        # If the policy is TLSGAN, either stay and observe or move according to ORCA.
        for j in range(self.obs_len - 1):
            if isinstance(self.robot.policy, TLSGAN):
                action = self.robot.act(prev_obs, test_policy=True)
            else:
                action = self.robot.act(prev_obs)
            ob, reward, done, info = self.env.step(action)
            robot_state = self.robot.get_full_state()
            prev_obs.append(JointState(robot_state, ob))

        while not done:
            action = self.robot.act(prev_obs, test_policy=False)
            ob, reward, done, info = self.env.step(action)
            robot_state = self.robot.get_full_state()
            states.append(prev_obs)
            actions.append(action)
            rewards.append(reward)
            # slide the observation window by one frame
            prev_obs = prev_obs[1:]
            prev_obs.append(JointState(robot_state, ob))
            if isinstance(info, Danger):
                too_close += 1
                min_dist.append(info.min_dist)

        # Render only the first episode
        if with_render and (i == 0):
            self.env.render(mode='video')

        if isinstance(info, ReachGoal):
            success += 1
            success_times.append(self.env.global_time)
        elif isinstance(info, Collision):
            collision += 1
            collision_cases.append(i)
            collision_times.append(self.env.global_time)
        elif isinstance(info, Timeout):
            timeout += 1
            timeout_cases.append(i)
            timeout_times.append(self.env.time_limit)
        else:
            raise ValueError('Invalid end signal from environment')

        if update_memory:
            if isinstance(info, ReachGoal) or isinstance(info, Collision):
                # only add positive (success) or negative (collision) experience to the experience set
                memory = self.train_memory if i < k_train else self.val_memory
                self.update_memory(memory, states, actions, rewards, imitation_learning)

        cumulative_rewards.append(
            sum([pow(self.gamma, t * self.robot.time_step * self.robot.v_pref) * reward
                 for t, reward in enumerate(rewards)]))

    success_rate = success / k
    collision_rate = collision / k
    assert success + collision + timeout == k
    avg_nav_time = sum(success_times) / len(success_times) if success_times else self.env.time_limit
    extra_info = '' if episode is None else 'in episode {} '.format(episode)
    self.logger.info('{:<5} {}has success rate: {:.2f}, collision rate: {:.2f}, '
                     'nav time: {:.2f}, total reward: {:.4f}'
                     .format(phase.upper(), extra_info, success_rate, collision_rate,
                             avg_nav_time, average(cumulative_rewards)))

    if phase in ['val', 'test']:
        total_time = sum(success_times + collision_times + timeout_times) * self.robot.time_step
        self.logger.info('Frequency of being in danger: %.2f and average min separate distance in danger: %.2f',
                         too_close / total_time, average(min_dist))

    if print_failure:
        self.logger.info('Collision cases: ' + ' '.join([str(x) for x in collision_cases]))
        self.logger.info('Timeout cases: ' + ' '.join([str(x) for x in timeout_cases]))
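# ---------------------------------------------------------------------------
# Optional sketch (not part of the original code): the sliding window of the
# last obs_len joint states that run_k_episodes() maintains with list slicing
# can equivalently be kept in a fixed-length deque, which evicts the oldest
# frame automatically on append.
# ---------------------------------------------------------------------------
from collections import deque


def make_obs_window(first_joint_state, obs_len):
    """Create a fixed-length observation history seeded with the first state."""
    return deque([first_joint_state], maxlen=obs_len)

# usage (hypothetical): prev_obs = make_obs_window(JointState(robot_state, ob), self.obs_len)
#                       prev_obs.append(JointState(robot_state, ob))  # oldest frame drops out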
def predict(self, states):
    """
    A base class for all methods that take the pairwise joint state as input to the value network.
    The input to the value network is always of shape (batch_size, # humans, rotated joint state length).

    `states` is a list of joint states (a short history, most recent last).
    """
    if self.phase is None or self.device is None:
        raise AttributeError('Phase, device attributes have to be set!')
    if self.phase == 'train' and self.epsilon is None:
        raise AttributeError('Epsilon attribute has to be set in training phase')
    if self.reach_destination(states[-1]):
        return ActionXY(0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
    if self.action_space is None:
        self.build_action_space(states[-1].self_state.v_pref)

    if self.policy_learning:
        # Directly regress the next position and convert it into a velocity action.
        trajs, rel_trajs, self_info = stateToTrajTensors([states])
        next_position = self.model((trajs, rel_trajs, self_info)).data.squeeze(0)
        action = ActionXY(next_position[0], next_position[1])
        return action

    occupancy_maps = None
    probability = np.random.random()
    if self.phase == 'train' and probability < self.epsilon:
        # epsilon-greedy exploration: pick a random action
        max_action = self.action_space[np.random.choice(len(self.action_space))]
    else:
        self.action_values = list()
        max_value = float('-inf')
        max_action = None
        for action in self.action_space:
            # FullState fields: px, py, vx, vy, radius, gx, gy, v_pref, theta
            next_self_state = self.propagate(states[-1].self_state, action)
            if self.query_env:
                next_human_states, reward, done, info = self.env.onestep_lookahead(action)
            else:
                # ObservableState fields: px, py, vx, vy, radius
                next_human_states = [self.propagate(human_state, ActionXY(human_state.vx, human_state.vy))
                                     for human_state in states[-1].human_states]
                reward = self.compute_reward(next_self_state, next_human_states)
            # VALUE UPDATE: one-step lookahead with the learned value network
            trajs, rel_trajs, self_info = stateToTrajTensors(
                [states + [JointState(next_self_state, next_human_states)]])
            next_state_value = self.model((trajs, rel_trajs, self_info)).data.item()
            value = reward + pow(self.gamma, self.time_step * states[-1].self_state.v_pref) * next_state_value
            self.action_values.append(value)
            if value > max_value:
                max_value = value
                max_action = action
        if max_action is None:
            raise ValueError('Value network is not well trained. ')

    if self.phase == 'train':
        self.last_state = states

    return max_action