def predict(self, state):
    if self.phase is None or self.device is None:
        raise AttributeError('Phase, device attributes have to be set!')
    if self.phase == 'train' and self.epsilon is None:
        raise AttributeError('Epsilon attribute has to be set in training phase')
    if self.reach_destination(state):
        return ActionXY(0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)

    self.last_state = self.transform(state)
    action, feats = self.model(self.last_state[0].unsqueeze(0),
                               self.last_state[1].unsqueeze(0))
    action = action.squeeze()
    action = ActionXY(action[0].item(), action[1].item()) \
        if self.kinematics == 'holonomic' else ActionRot(action[0].item(), action[1].item())
    self.last_state = self.last_state, feats
    return action
def build_action_space(self, v_pref): """ Action space consists of 25 uniformly sampled actions in permitted range and 25 randomly sampled actions. """ holonomic = True if self.kinematics == 'holonomic' else False speeds = [(np.exp( (i + 1) / self.speed_samples) - 1) / (np.e - 1) * v_pref for i in range(self.speed_samples)] if holonomic: rotations = np.linspace(0, 2 * np.pi, self.rotation_samples, endpoint=False) else: rotations = np.linspace(-np.pi / 2, np.pi / 2, self.rotation_samples) action_space = [ActionXY(0, 0) if holonomic else ActionRot(0, 0)] for rotation, speed in itertools.product(rotations, speeds): if holonomic: action_space.append( ActionXY(speed * np.cos(rotation), speed * np.sin(rotation))) else: action_space.append(ActionRot(speed, rotation)) self.speeds = speeds self.rotations = rotations self.action_space = action_space
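# Hypothetical usage sketch (not part of the original module): the exponential
# spacing above concentrates speed samples near zero, giving finer control at
# low speed. Assuming speed_samples=5 and v_pref=1.0:
def _example_exponential_speeds(speed_samples=5, v_pref=1.0):
    import numpy as np
    # the i-th speed is (e^((i+1)/n) - 1) / (e - 1) * v_pref
    return [(np.exp((i + 1) / speed_samples) - 1) / (np.e - 1) * v_pref
            for i in range(speed_samples)]

# _example_exponential_speeds() -> approximately [0.13, 0.29, 0.48, 0.71, 1.0]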
def predict(self, state): """ A base class for all methods that takes pairwise joint state as input to value network. The input to the value network is always of shape (batch_size, # humans, rotated joint state length) """ if self.phase is None or self.device is None: raise AttributeError('Phase, device attributes have to be set!') if self.phase == 'train' and self.epsilon is None: raise AttributeError('Epsilon attribute has to be set in training phase') if self.phase == 'train': self.last_state = self.transform(state) if self.reach_destination(state): return ActionXY(0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0) state_tensor = state.to_tensor(add_batch_size=True, device=self.device) if self.phase == 'train': action = ( self.actor(state_tensor).squeeze().detach().numpy() + np.random.normal(0, self.max_action * self.expl_noise, size=self.action_dim) ).clip(-self.max_action, self.max_action) speed = action[0] theta = action[1] Action = ActionXY(speed * np.cos(theta), speed * np.sin(theta)) \ if self.kinematics == 'holonomic' else ActionRot(speed, theta) return Action, torch.tensor(action).float() else: with torch.no_grad(): action = self.actor(state_tensor).squeeze().numpy() speed = action[0] theta = action[1] Action = ActionXY(speed * np.cos(theta), speed * np.sin(theta)) \ if self.kinematics == 'holonomic' else ActionRot(speed, theta) return Action, torch.tensor(action).float()
def build_action_space(self, v_pref): """ Action space consists of 25 uniformly sampled actions in permitted range and 25 randomly sampled actions. """ holonomic = True if self.kinematics == 'holonomic' else False # speeds = [(np.exp((i + 1) / self.speed_samples) - 1) / (np.e - 1) * v_pref for i in range(self.speed_samples)] speeds = [(i + 1) / self.speed_samples * v_pref for i in range(self.speed_samples)] if holonomic: rotations = np.linspace(0, 2 * np.pi, self.rotation_samples, endpoint=False) else: if self.rotation_constraint == np.pi: rotations = np.linspace(-self.rotation_constraint, self.rotation_constraint, self.rotation_samples, endpoint=False) else: rotations = np.linspace(-self.rotation_constraint, self.rotation_constraint, self.rotation_samples) action_space = [ActionXY(0, 0) if holonomic else ActionRot(0, 0)] self.action_group_index.append(0) for j, speed in enumerate(speeds): for i, rotation in enumerate(rotations): action_index = j * self.rotation_samples + i + 1 self.action_group_index.append(action_index) if holonomic: action_space.append(ActionXY(speed * np.cos(rotation), speed * np.sin(rotation))) else: action_space.append(ActionRot(speed, rotation)) self.speeds = speeds self.rotations = rotations self.action_space = action_space
def _build_action_space(self):
    speeds = [(np.exp((i + 1) / self.speed_samples) - 1) / (np.e - 1) * self.max_speed
              for i in range(self.speed_samples)]
    rotations = np.linspace(-np.pi / 4, np.pi / 4, self.rotation_samples)
    actions = [ActionRot(0, 0)]
    for rotation, speed in itertools.product(rotations, speeds):
        actions.append(ActionRot(speed, rotation))
    return actions
def build_action_space(self, v_pref): """ Action space consists of 25 uniformly sampled actions in permitted range and 25 randomly sampled actions. """ holonomic = True if self.kinematics == "holonomic" else False speeds = [(np.exp( (i + 1) / self.speed_samples) - 1) / (np.e - 1) * v_pref for i in range(self.speed_samples)] if holonomic: rotations = np.linspace(0, 2 * np.pi, self.rotation_samples, endpoint=False) else: rotations = np.linspace( -self.rotation_constraint, self.rotation_constraint, self.rotation_samples, ) action_space = [ActionXY(0, 0) if holonomic else ActionRot(0, 0)] for j, speed in enumerate(speeds): if j == 0: # index for action (0, 0) self.action_group_index.append(0) # only two groups in speeds if j < 3: speed_index = 0 else: speed_index = 1 for i, rotation in enumerate(rotations): rotation_index = i // 2 action_index = (speed_index * self.sparse_rotation_samples + rotation_index) self.action_group_index.append(action_index) if holonomic: action_space.append( ActionXY(speed * np.cos(rotation), speed * np.sin(rotation))) else: action_space.append(ActionRot(speed, rotation)) self.speeds = speeds self.rotations = rotations self.action_space = action_space
def clip_action(self, raw_action, v_pref):
    """
    Clip a raw action to the permitted range: for holonomic kinematics the
    velocity vector is rescaled to at most v_pref; otherwise the velocity and
    heading increments are clipped (for sim2real).
    """
    holonomic = self.config.action_space.kinematics == 'holonomic'
    if holonomic:
        act_norm = np.linalg.norm(raw_action)
        if act_norm > v_pref:
            raw_action[0] = raw_action[0] / act_norm * v_pref
            raw_action[1] = raw_action[1] / act_norm * v_pref
        return ActionXY(raw_action[0], raw_action[1])
    else:
        # for sim2real
        raw_action[0] = np.clip(raw_action[0], -0.1, 0.1)  # action[0] is change of v
        # raw[0, 1] = np.clip(raw[0, 1], -0.25, 0.25)  # action[1] is change of w
        # raw[0, 0] = np.clip(raw[0, 0], -state.self_state.v_pref, state.self_state.v_pref)  # action[0] is v
        raw_action[1] = np.clip(raw_action[1], -0.1, 0.1)  # action[1] is change of theta
        return ActionRot(raw_action[0], raw_action[1])
def act(self, obs):
    robot_state, humans_state, local_map = obs
    state = JointState(robot_state, humans_state)
    action = self.policy.predict(state, local_map, None)
    action = ActionRot(robot_state.v_pref * action.v, action.r)  # de-normalize
    return action
def predict(self, state): """ Input state is the joint state of robot concatenated by the observable state of other agents To predict the best action, agent samples actions and propagates one step to see how good the next state is thus the reward function is needed """ if self.phase is None or self.device is None: raise AttributeError('Phase, device attributes have to be set!') if self.phase == 'train' and self.epsilon is None: raise AttributeError( 'Epsilon attribute has to be set in training phase') if self.reach_destination(state): return ActionXY( 0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0) host_agent = agent.Agent(state.self_state.px, state.self_state.py, state.self_state.gx, state.self_state.gy, state.self_state.radius, state.self_state.v_pref, state.self_state.theta, 0) host_agent.vel_global_frame = np.array( [state.self_state.vx, state.self_state.vy]) other_agents = [] for i, human_state in enumerate(state.human_states): if self.human_state_in_FOV(state.self_state, human_state): x = human_state.px y = human_state.py v_x = human_state.vx v_y = human_state.vy heading_angle = np.arctan2(v_y, v_x) pref_speed = np.linalg.norm(np.array([v_x, v_y])) goal_x = x + 5.0 goal_y = y + 5.0 other_agents.append( agent.Agent(x, y, goal_x, goal_y, human_state.radius, pref_speed, heading_angle, i + 1)) other_agents[-1].vel_global_frame = np.array([v_x, v_y]) obs = host_agent.observe(other_agents)[1:] obs = np.expand_dims(obs, axis=0) predictions = self.net.predict_p(obs, None)[0] raw_action = self.possible_actions.actions[np.argmax(predictions)] action = ActionRot(host_agent.pref_speed * raw_action[0], util.wrap(raw_action[1])) return action
def predict(self, state): """ A base class for all methods that takes pairwise joint state as input to value network. The input to the value network is always of shape (batch_size, # humans, rotated joint state length) """ if self.phase is None or self.device is None: raise AttributeError('Phase, device attributes have to be set!') if self.phase == 'train' and self.epsilon is None: raise AttributeError( 'Epsilon attribute has to be set in training phase') if self.reach_destination(state): return ActionXY( 0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0) if self.action_space is None: self.build_action_space(state.robot_state.v_pref) probability = np.random.random() if self.phase == 'train' and probability < self.epsilon: max_action = self.action_space[np.random.choice( len(self.action_space))] else: max_action = None max_value = float('-inf') max_traj = None if self.do_action_clip: state_tensor = state.to_tensor(add_batch_size=True, device=self.device) action_space_clipped = self.action_clip( state_tensor, self.action_space, self.planning_width) else: action_space_clipped = self.action_space for action in action_space_clipped: state_tensor = state.to_tensor(add_batch_size=True, device=self.device) next_state = self.state_predictor(state_tensor, action) max_next_return, max_next_traj = self.V_planning( next_state, self.planning_depth, self.planning_width) reward_est = self.estimate_reward(state, action) value = reward_est + self.get_normalized_gamma( ) * max_next_return if value > max_value: max_value = value max_action = action max_traj = [(state_tensor, action, reward_est) ] + max_next_traj if max_action is None: raise ValueError('Value network is not well trained.') if self.phase == 'train': self.last_state = self.transform(state) else: self.traj = max_traj return max_action
def predict(self, state): """ A base class for all methods that takes pairwise joint state as input to value network. The input to the value network is always of shape (batch_size, # humans, rotated joint state length) """ if self.phase is None or self.device is None: raise AttributeError('Phase, device attributes have to be set!') if self.phase == 'train' and self.epsilon is None: raise AttributeError('Epsilon attribute has to be set in training phase') if self.reach_destination(state): return ActionXY(0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0) if self.action_space is None: self.build_action_space(state.self_state.v_pref) occupancy_maps = None probability = np.random.random() if self.phase == 'train' and probability < self.epsilon: max_action = self.action_space[np.random.choice(len(self.action_space))] else: self.action_values = list() max_value = float('-inf') max_action = None for action in self.action_space: next_self_state = self.propagate(state.self_state, action) if self.query_env: next_human_states, reward, done, info = self.env.onestep_lookahead(action) else: next_human_states = [self.propagate(human_state, ActionXY(human_state.vx, human_state.vy)) for human_state in state.human_states] reward = self.compute_reward(next_self_state, next_human_states) batch_next_states = torch.cat([torch.Tensor([next_self_state + next_human_state]).to(self.device) for next_human_state in next_human_states], dim=0) #print('multi human_rl --- line 45 ') #print('prediction phase! ----> in RL ----') #print('rotation is callled ---------------------------------------------------------------------------------------------------------------------!') rotated_batch_input = self.rotate(batch_next_states).unsqueeze(0) if self.with_om: if occupancy_maps is None: occupancy_maps = self.build_occupancy_maps(next_human_states).unsqueeze(0) rotated_batch_input = torch.cat([rotated_batch_input, occupancy_maps], dim=2) # VALUE UPDATE next_state_value = self.model(rotated_batch_input).data.item() value = reward + pow(self.gamma, self.time_step * state.self_state.v_pref) * next_state_value self.action_values.append(value) if value > max_value: max_value = value max_action = action if max_action is None: raise ValueError('Value network is not well trained. ') if self.phase == 'train': #print('---------multi_human_rl.py line 64 ----->transformation done!') self.last_state = self.transform(state) return max_action
def predict(self, state): """ Input state is the joint state of robot concatenated by the observable state of other agents To predict the best action, agent samples actions and propagates one step to see how good the next state is thus the reward function is needed """ if self.phase is None or self.device is None: raise AttributeError('Phase, device attributes have to be set!') if self.phase == 'train' and self.epsilon is None: raise AttributeError( 'Epsilon attribute has to be set in training phase') if self.reach_destination(state): return ActionXY( 0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0) if self.action_space is None: self.build_action_space(state.self_state.v_pref) probability = np.random.random() if self.phase == 'train' and probability < self.epsilon: max_action = self.action_space[np.random.choice( len(self.action_space))] else: self.action_values = list() max_min_value = float('-inf') max_action = None for action in self.action_space: next_self_state = self.propagate(state.self_state, action) ob, reward, done, info = self.env.onestep_lookahead(action) batch_next_states = torch.cat([ torch.Tensor([next_self_state + next_human_state]).to( self.device) for next_human_state in ob ], dim=0) # VALUE UPDATE outputs = self.model( self.rotate(batch_next_states)) # Successor feature outputs = (outputs * self.model.w_vec).sum(1) min_output, min_index = torch.min(outputs, 0) min_value = reward + pow( self.gamma, self.time_step * state.self_state.v_pref) * min_output.data.item() self.action_values.append(min_value) if min_value > max_min_value: max_min_value = min_value max_action = action if self.phase == 'train': self.last_state = self.transform(state) # print("max action: ", max_action) return max_action
def select_greedy_action(self, self_state):
    # find the greedy action given kinematic constraints and return the
    # closest action in the action space
    direction = np.arctan2(self_state.gy - self_state.py, self_state.gx - self_state.px)
    distance = np.linalg.norm((self_state.gy - self_state.py, self_state.gx - self_state.px))
    if self.kinematics == 'holonomic':
        speed = min(distance / self.time_step, self_state.v_pref)
        vx = np.cos(direction) * speed
        vy = np.sin(direction) * speed
        min_diff = float('inf')
        closest_action = None
        for action in self.action_space:
            diff = np.linalg.norm(np.array(action) - np.array((vx, vy)))
            if diff < min_diff:
                min_diff = diff
                closest_action = action
    else:
        rotation = direction - self_state.theta
        # if the goal is not in the field of view, always rotate first
        if rotation < self.rotations[0]:
            closest_action = ActionRot(self.speeds[0], self.rotations[0])
        elif rotation > self.rotations[-1]:
            closest_action = ActionRot(self.speeds[0], self.rotations[-1])
        else:
            speed = min(distance / self.time_step, self_state.v_pref)
            min_diff = float('inf')
            closest_action = None
            for action in self.action_space:
                diff = np.linalg.norm(
                    np.array((np.cos(action.r) * action.v, np.sin(action.r) * action.v))
                    - np.array((np.cos(rotation) * speed, np.sin(rotation) * speed)))
                if diff < min_diff:
                    min_diff = diff
                    closest_action = action
    return closest_action
def test_non_holonomic_left_movement(self):
    radius = 1
    time_step = .1
    state = FullState(0, 0, 0, 0, radius, 0, 0, 0, 0)
    r = np.pi
    action = ActionRot(1., r)
    next_state = propagate(state, action, time_step=time_step, kinematics='non_holonomic')
    expected_state = FullState(time_step * np.cos(r), time_step * np.sin(r),
                               np.cos(r), np.sin(r), radius, 0, 0, 0, r)
    self.assertEqual(expected_state, next_state)
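# Hypothetical sketch of the non-holonomic propagation this test exercises
# (semantics inferred from the expected state): rotate by action.r first,
# then move at action.v along the new heading for one time step.
def _example_propagate_unicycle(px, py, theta, action_v, action_r, time_step):
    import numpy as np
    theta = theta + action_r
    vx = action_v * np.cos(theta)
    vy = action_v * np.sin(theta)
    return px + vx * time_step, py + vy * time_step, vx, vy, theta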
def action_clip(self, state, action_space, width, depth=1):
    values = []
    actions = []
    if self.kinematics == "holonomic":
        actions.append(ActionXY(0, 0))
    else:
        actions.append(ActionRot(0, 0))

    next_robot_states = None
    next_human_states = None
    # predict the humans' next states once, under a stand-still robot action
    pre_next_state = self.state_predictor(state, actions)
    for action in action_space:
        next_robot_state = self.compute_next_robot_state(state[0], action)
        next_human_state = pre_next_state[1]
        if next_robot_states is None and next_human_states is None:
            next_robot_states = next_robot_state
            next_human_states = next_human_state
        else:
            next_robot_states = torch.cat((next_robot_states, next_robot_state), dim=0)
            next_human_states = torch.cat((next_human_states, next_human_state), dim=0)
        next_state_tensor = (next_robot_state, next_human_state)
        next_state = tensor_to_joint_state(next_state_tensor)
        reward_est, _ = self.reward_estimator.estimate_reward_on_predictor(state, next_state)
        values.append(reward_est)

    next_return = self.value_estimator((next_robot_states, next_human_states)).squeeze()
    next_return = np.array(next_return.data.detach())
    values = np.array(values) + self.get_normalized_gamma() * next_return
    values = values.tolist()

    if self.sparse_search:
        # search in a sparse-grained action space: keep the best action per group
        added_groups = set()
        max_indices = np.argsort(np.array(values))[::-1]
        clipped_action_space = []
        for index in max_indices:
            if self.action_group_index[index] not in added_groups:
                clipped_action_space.append(action_space[index])
                added_groups.add(self.action_group_index[index])
                if len(clipped_action_space) == width:
                    break
    else:
        max_indexes = np.argpartition(np.array(values), -width)[-width:]
        clipped_action_space = [action_space[i] for i in max_indexes]
    return clipped_action_space
def reset(self): """ IANEnv destroys and re-creates its iarlenv at every reset, so apply our changes here """ self.env.reset() # we raytrace at a higher resolution, then downsample back to the original soadrl resolution # this avoids missing small obstacles due to the small soadrl resolution self.env.iarlenv.rlenv.virtual_peppers[ 0].kLidarMergedMaxAngle = self.angular_map_max_angle self.env.iarlenv.rlenv.virtual_peppers[ 0].kLidarMergedMinAngle = self.angular_map_min_angle self.env.iarlenv.rlenv.virtual_peppers[0].kLidarAngleIncrement = \ self.angular_map_angle_increment / self.lidar_upsampling self.env.iarlenv.rlenv.kMergedScanSize = self.angular_map_dim * self.lidar_upsampling self.episode_statistics = self.env.episode_statistics obs, _, _, _ = self.step(ActionRot(0., 0.)) return obs
def predict(self, state): """ A base class for all methods that takes pairwise joint state as input to value network. The input to the value network is always of shape (batch_size, # humans, rotated joint state length) """ self.count = self.count + 1 # if self.count == 34: # print('debug') if self.phase is None or self.device is None: raise AttributeError('Phase, device attributes have to be set!') if self.phase == 'train' and self.epsilon is None: raise AttributeError( 'Epsilon attribute has to be set in training phase') # self.v_pref = state.robot_state.v_pref if self.reach_destination(state): return ActionXY( 0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0) if self.action_space is None: self.build_action_space(self.v_pref) max_action = None origin_max_value = float('-inf') state_tensor = state.to_tensor(add_batch_size=True, device=self.device) probability = np.random.random() if self.phase == 'train' and probability < self.epsilon and self.use_noisy_net is False: max_action_index = np.random.choice(len(self.action_space)) max_action = self.action_space[max_action_index] self.last_state = self.transform(state) return max_action, max_action_index else: max_value, max_action_index, max_traj = self.V_planning( state_tensor, self.planning_depth, self.planning_width) if max_value[0] > origin_max_value: max_action = self.action_space[max_action_index[0]] if max_action is None: raise ValueError('Value network is not well trained.') if self.phase == 'train': self.last_state = self.transform(state) else: self.last_state = self.transform(state) self.traj = max_traj[0] return max_action, int(max_action_index[0])
def step(self, action, update=True):
    """
    Compute actions for all agents, detect collision, update environment
    and return (ob, reward, done, info)
    """
    action = self.robot.policy.clip_action(action, self.robot.v_pref)

    if self.robot.kinematics == 'unicycle':
        self.desiredVelocity[0] = np.clip(self.desiredVelocity[0] + action.v,
                                          -self.robot.v_pref, self.robot.v_pref)
        action = ActionRot(self.desiredVelocity[0], action.r)

    human_actions = self.get_human_actions()

    # compute reward and episode info
    reward, done, episode_info = self.calc_reward(action)

    # apply action and update all agents
    self.robot.step(action)
    for i, human_action in enumerate(human_actions):
        self.humans[i].step(human_action)
    self.global_time += self.time_step  # max episode length = time_limit / time_step

    # compute the observation
    ob = self.generate_ob(reset=False)
    info = {'info': episode_info}

    # Update all humans' goals randomly midway through episode
    if self.random_goal_changing:
        if self.global_time % 5 == 0:
            self.update_human_goals_randomly()

    # Update a specific human's goal once it has reached its original goal
    if self.end_goal_changing:
        for human in self.humans:
            if norm((human.gx - human.px, human.gy - human.py)) < human.radius:
                self.update_human_goal(human)

    return ob, reward, done, info
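# Sketch of the unicycle branch above: the policy outputs a velocity
# *increment* action.v; the environment integrates it into desiredVelocity
# and clips to [-v_pref, v_pref] before stepping the robot.
def _example_integrate_unicycle_v(desired_v, dv, v_pref):
    import numpy as np
    return float(np.clip(desired_v + dv, -v_pref, v_pref))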
def V_planning(self, state, depth, width):
    """
    Plans n steps into the future. Computes the value for the current state as
    well as the trajectories, defined as a list of (state, action, reward)
    triples.
    """
    current_state_value = self.value_estimator(state)
    if depth == 1:
        return current_state_value, [(state, None, None)]

    if self.do_action_clip:
        action_space_clipped = self.action_clip(state, self.action_space, width)
    else:
        action_space_clipped = self.action_space

    returns = []
    trajs = []

    actions = []
    if self.kinematics == "holonomic":
        actions.append(ActionXY(0, 0))
    else:
        actions.append(ActionRot(0, 0))
    # predict the humans' next states once, under a stand-still robot action
    pre_next_state = self.state_predictor(state, actions)

    for action in action_space_clipped:
        next_robot_state = self.compute_next_robot_state(state[0], action)
        next_state_est = next_robot_state, pre_next_state[1]
        reward_est, _ = self.reward_estimator.estimate_reward_on_predictor(
            tensor_to_joint_state(state), tensor_to_joint_state(next_state_est))
        next_value, next_traj = self.V_planning(next_state_est, depth - 1, self.planning_width)
        return_value = current_state_value / depth \
            + (depth - 1) / depth * (self.get_normalized_gamma() * next_value + reward_est)
        returns.append(return_value)
        trajs.append([(state, action, reward_est)] + next_traj)

    max_index = np.argmax(returns)
    max_return = returns[max_index]
    max_traj = trajs[max_index]
    return max_return, max_traj
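# Sketch of the return blending in V_planning above: at depth d the learned
# value of the current state is mixed with the recursive lookahead return,
#   R_d(s) = V(s) / d + (d - 1) / d * (r + gamma_hat * R_{d-1}(s')),
# so deeper rollouts rely more on simulated rewards and less on V(s).
def _example_blended_return(depth, current_value, reward_est, gamma_hat, next_return):
    return current_value / depth \
        + (depth - 1) / depth * (gamma_hat * next_return + reward_est)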
def predict(self, state): """ A base class for all methods that takes pairwise joint state as input to value network. The input to the value network is always of shape (batch_size, # humans, rotated joint state length) """ if self.phase is None or self.device is None: raise AttributeError("Phase, device attributes have to be set!") if self.phase == "train" and self.epsilon is None: raise AttributeError("Epsilon attribute has to be set in training phase") if self.reach_destination(state): return ActionXY(0, 0) if self.kinematics == "holonomic" else ActionRot(0, 0) if self.action_space is None: self.build_action_space(state.robot_state.v_pref) if not state.human_states: assert self.phase != "train" if hasattr(self, "attention_weights"): self.attention_weights = list() return self.select_greedy_action(state.robot_state) occupancy_maps = None probability = np.random.random() if self.phase == "train" and probability < self.epsilon: max_action = self.action_space[np.random.choice(len(self.action_space))] else: self.action_values = list() max_value = float("-inf") max_action = None for action in self.action_space: next_robot_state = self.propagate(state.robot_state, action) if self.query_env: next_human_states, reward, done, info = self.env.onestep_lookahead( action ) else: next_human_states = [ self.propagate( human_state, ActionXY(human_state.vx, human_state.vy) ) for human_state in state.human_states ] reward = self.compute_reward(next_robot_state, next_human_states) batch_next_states = torch.cat( [ torch.Tensor([next_robot_state + next_human_state]).to( self.device ) for next_human_state in next_human_states ], dim=0, ) rotated_batch_input = self.rotate(batch_next_states).unsqueeze(0) if self.with_om: if occupancy_maps is None: occupancy_maps = self.build_occupancy_maps( next_human_states ).unsqueeze(0) rotated_batch_input = torch.cat( [rotated_batch_input, occupancy_maps], dim=2 ) # VALUE UPDATE next_state_value = self.model(rotated_batch_input).data.item() value = ( reward + pow(self.gamma, self.time_step * state.robot_state.v_pref) * next_state_value ) self.action_values.append(value) if value > max_value: max_value = value max_action = action if hasattr(self, "attention_weights"): self.attention_weights = self.model.attention_weights if max_action is None: raise ValueError("Value network is not well trained. ") if self.phase == "train": self.last_state = self.transform(state) return max_action
def predict(self, states): """ A base class for all methods that takes pairwise joint state as input to value network. The input to the value network is always of shape (batch_size, # humans, rotated joint state length) states is a list of joint states """ if self.phase is None or self.device is None: raise AttributeError('Phase, device attributes have to be set!') if self.phase == 'train' and self.epsilon is None: raise AttributeError( 'Epsilon attribute has to be set in training phase') if self.reach_destination(states[-1]): return ActionXY( 0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0) if self.action_space is None: self.build_action_space(states[-1].self_state.v_pref) if self.policy_learning: trajs, rel_trajs, self_info = stateToTrajTensors([states]) next_position = self.model( (trajs, rel_trajs, self_info)).data.squeeze(0) action = ActionXY(next_position[0], next_position[1]) return action else: occupancy_maps = None probability = np.random.random() if self.phase == 'train' and probability < self.epsilon: max_action = self.action_space[np.random.choice( len(self.action_space))] else: self.action_values = list() max_value = float('-inf') max_action = None for action in self.action_space: next_self_state = self.propagate( states[-1].self_state, action ) # 'px', 'py', 'vx', 'vy', 'radius', 'gx', 'gy', 'v_pref', 'theta' if self.query_env: next_human_states, reward, done, info = self.env.onestep_lookahead( action) else: next_human_states = [ self.propagate( human_state, ActionXY(human_state.vx, human_state.vy)) for human_state in states.human_states ] # 'px1', 'py1', 'vx1', 'vy1', 'radius1' reward = self.compute_reward(next_self_state, next_human_states) # VALUE UPDATE trajs, rel_trajs, self_info = stateToTrajTensors([ states + [JointState(next_self_state, next_human_states)] ]) next_state_value = self.model( (trajs, rel_trajs, self_info)).data.item() value = reward + pow( self.gamma, self.time_step * states[-1].self_state.v_pref) * next_state_value self.action_values.append(value) if value > max_value: max_value = value max_action = action if max_action is None: raise ValueError('Value network is not well trained. ') if self.phase == 'train': self.last_state = states return max_action
def predict(self, state, joint_xy_history, occ_history):
    """
    One-step lookahead over the discrete action space, where the value network
    additionally conditions on a representation of the joint position history
    and pooled occupancy features.
    """
    if self.phase is None or self.device is None:
        raise AttributeError('Phase, device attributes have to be set!')
    if self.phase == 'train' and self.epsilon is None:
        raise AttributeError('Epsilon attribute has to be set in training phase')
    if self.reach_destination(state):
        return ActionXY(0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
    if self.action_space is None:
        self.build_action_space(state.self_state.v_pref)

    occupancy_maps = None
    probability = np.random.random()
    if self.phase == 'train' and probability < self.epsilon:
        max_action = self.action_space[np.random.choice(len(self.action_space))]
    else:
        self.action_values = list()
        max_value = float('-inf')
        max_action = None
        for action in self.action_space:
            next_self_state = self.propagate(state.self_state, action)
            if self.query_env:  # usually true
                next_human_states, reward, done, info = self.env.onestep_lookahead(action)
            else:
                next_human_states = [
                    self.propagate(human_state, ActionXY(human_state.vx, human_state.vy))
                    for human_state in state.human_states
                ]
                reward = self.compute_reward(next_self_state, next_human_states)
            batch_next_states = torch.cat([
                torch.Tensor([next_self_state + next_human_state]).to(self.device)
                for next_human_state in next_human_states
            ], dim=0)  # [n_human, 14]
            rotated_batch_input = self.rotate(batch_next_states).unsqueeze(0)
            if self.with_om:
                if occupancy_maps is None:
                    occupancy_maps = self.build_occupancy_maps(next_human_states).unsqueeze(0)
                rotated_batch_input = torch.cat(
                    [rotated_batch_input, occupancy_maps.to(self.device)], dim=2)

            # VALUE UPDATE: append the predicted next positions to the history
            if len(joint_xy_history.size()) == 3:
                joint_xy_history = joint_xy_history.unsqueeze(0)  # [1, hist_len, n_human+1, 2]
                occ_history = occ_history.unsqueeze(0)  # [1, hist_len-1, 32]
            human_xy_next = batch_next_states[:, 9:11]
            robot_xy_next = batch_next_states[0:1, 0:2]
            joint_xy_next = torch.cat([robot_xy_next, human_xy_next], dim=-2)  # [n_human+1, 2]
            joint_xy_current = joint_xy_history[:, -1, :, :].squeeze(0)  # [n_human+1, 2]
            last_occ = self.pool(None, joint_xy_current, joint_xy_next).unsqueeze(0)  # [1, 1, 32]
            joint_xy_history_plus = torch.cat(
                [joint_xy_history[:, 1:, :, :], joint_xy_next.unsqueeze(0).unsqueeze(0)],
                dim=-3)  # [batch, hist_len, n_human+1, 2]
            occ_history_plus = torch.cat([occ_history[:, 1:, :], last_occ], dim=-2)
            # joint_xy: [batch, hist_len, n_human+1, 2]; occ_history: [batch, hist_len-1, 32]
            representation_env_plus = self.traj_model(joint_xy_history_plus, occ_history_plus)[0].data
            next_state_value = self.model(rotated_batch_input, representation_env_plus).data.item()
            value = reward + pow(self.gamma, self.time_step * state.self_state.v_pref) * next_state_value
            self.action_values.append(value)
            if value > max_value:
                max_value = value
                max_action = action
        if max_action is None:
            raise ValueError('Value network is not well trained.')

    if self.phase == 'train':
        self.last_state = self.transform(state)
        # in the RL phase the unrotated state is pushed into the replay memory
        self.last_state_unrotated = torch.cat([
            torch.Tensor([state.self_state + human_state]).to(self.device)
            for human_state in state.human_states
        ], dim=0)
    return max_action
def predict(self, state): """ A base class for all methods that takes pairwise joint state as input to value network. The input to the value network is always of shape (batch_size, # humans, rotated joint state length) """ if self.phase is None or self.device is None: raise AttributeError('Phase, device attributes have to be set!') if self.phase == 'train' and self.epsilon is None: raise AttributeError('Epsilon attribute has to be set in training phase') if self.reach_destination(state): return ActionXY(0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0) if self.action_space is None: self.build_action_space(state.robot_state.v_pref) probability = np.random.random() if self.phase == 'train' and probability < self.epsilon: max_action_index = np.random.choice(len(self.action_space)) max_action = self.action_space[max_action_index] else: max_action = None max_value = float('-inf') max_traj = None if self.do_action_clip: state_tensor = state.to_tensor(add_batch_size=True, device=self.device) action_space_clipped = self.action_clip(state_tensor, self.action_space, self.planning_width) else: action_space_clipped = self.action_space state_tensor = state.to_tensor(add_batch_size=True, device=self.device) actions = [] if self.kinematics == "holonomic": actions.append(ActionXY(0, 0)) else: actions.append(ActionRot(0, 0)) # actions.append(ActionXY(0, 0)) pre_next_state = self.state_predictor(state_tensor, actions) next_robot_states = None next_human_states = None next_value = [] rewards = [] for action in action_space_clipped: next_robot_state = self.compute_next_robot_state(state_tensor[0], action) next_human_state = pre_next_state[1] if next_robot_states is None and next_human_states is None: next_robot_states = next_robot_state next_human_states = next_human_state else: next_robot_states = torch.cat((next_robot_states, next_robot_state), dim=0) next_human_states = torch.cat((next_human_states, next_human_state), dim=0) next_state = tensor_to_joint_state((next_robot_state, next_human_state)) reward_est, _ = self.reward_estimator.estimate_reward_on_predictor(state, next_state) max_next_return, max_next_traj = self.V_planning((next_robot_state, next_human_state), self.planning_depth, self.planning_width) value = reward_est + self.get_normalized_gamma() * max_next_return if value > max_value: max_value = value max_action = action max_traj = [(state_tensor, action, reward_est)] + max_next_traj # reward_est = self.estimate_reward(state, action) # rewards.append(reward_est) # next_state = self.state_predictor(state_tensor, action) # rewards_tensor = torch.tensor(rewards).to(self.device) # next_state_batch = (next_robot_states, next_human_states) # next_value = self.value_estimator(next_state_batch).squeeze(1) # value = rewards_tensor + next_value * self.get_normalized_gamma() # max_action_index = value.argmax() # best_value = value[max_action_index] # if best_value > max_value: # max_action = action_space_clipped[max_action_index] # # next_state = tensor_to_joint_state((next_robot_states[max_action_index], next_human_states[max_action_index])) # max_next_traj = [(next_state.to_tensor(), None, None)] # # max_next_return, max_next_traj = self.V_planning(next_state, self.planning_depth, self.planning_width) # # reward_est = self.estimate_reward(state, action) # # value = reward_est + self.get_normalized_gamma() * max_next_return # # if value > max_value: # # max_value = value # # max_action = action # max_traj = [(state_tensor, max_action, rewards[max_action_index])] + max_next_traj if max_action is 
None: raise ValueError('Value network is not well trained.') if self.phase == 'train': self.last_state = self.transform(state) else: self.traj = max_traj for action_index in range(len(self.action_space)): action = self.action_space[action_index] if action is max_action: max_action_index = action_index break return max_action, int(max_action_index)
def run_k_episodes(self, k, phase, update_memory=False, imitation_learning=False,
                   episode=None, traj_head=None, print_failure=False,
                   noise_explore=0.0, progressbar=False, output_info=False,
                   notebook=False, print_info=False, return_states=False, suffix=None):
    self.robot.policy.set_phase(phase)
    success_times = []
    collision_times = []
    timeout_times = []
    success = 0
    collision = 0
    timeout = 0
    too_close = 0
    min_dist = []
    cumulative_rewards = []
    collision_cases = []
    timeout_cases = []
    states_s = []
    if imitation_learning:
        human_obsv = []

    iter = range(k)
    if progressbar:
        if notebook:
            iter = tqdm_notebook(iter, leave=False)
        else:
            iter = tqdm(iter, leave=False)

    traj_acc_sum = 0
    for i in iter:
        ob = self.env.reset(phase)
        done = False
        states = []
        actions = []
        rewards = []
        scene = []
        while not done:
            # infer action from policy
            if 'TRAJ' in self.robot.policy.name:
                action = self.robot.act(ob)
            elif self.num_frames == 1:
                action = self.robot.act(ob)
            else:
                last_ob = latest_frame(ob, self.num_frames)
                action = self.robot.act(last_ob)
            actions.append(action)

            # state append
            if imitation_learning:
                joint_state = JointState(self.robot.get_full_state(), ob)
                last_state = ExplorerDs.transform_mult(joint_state)
                states.append(last_state)
                scene.append(obs_to_frame(ob))
            else:
                states.append(self.robot.policy.last_state)

            # env step
            if imitation_learning and noise_explore > 0.0:
                noise_magnitude = noise_explore * 2.0
                action = ActionXY(action[0] + torch.rand(1).sub(0.5) * noise_magnitude,
                                  action[1] + torch.rand(1).sub(0.5) * noise_magnitude) \
                    if self.robot.policy.kinematics == 'holonomic' else \
                    ActionRot(action[0] + torch.rand(1).sub(0.5) * noise_magnitude,
                              action[1] + torch.rand(1).sub(0.5) * noise_magnitude)
            ob, reward, done, info = self.env.step(action)
            rewards.append(reward)

            if isinstance(info, Danger):
                too_close += 1
                min_dist.append(info.min_dist)

        if imitation_learning:
            human_obsv.append(torch.FloatTensor(scene))

        if isinstance(info, ReachGoal):
            success += 1
            success_times.append(self.env.global_time)
        elif isinstance(info, Collision):
            collision += 1
            collision_cases.append(i)
            collision_times.append(self.env.global_time)
        elif isinstance(info, Timeout):
            timeout += 1
            timeout_cases.append(i)
            timeout_times.append(self.env.time_limit)
        else:
            raise ValueError('Invalid end signal from environment')

        traj_accuracy = 0
        if traj_head is not None:
            for i, ([rob, hum], human_feats) in enumerate(states):
                next_human_pos = []
                for j in range(4):
                    n_p = states[min(i + j + 1, len(states) - 1)][0][1][-1, :, :2]
                    next_human_pos.append(n_p)
                with torch.no_grad():
                    next_human_pos = torch.cat(next_human_pos, dim=1)
                    next_human_pos_pred = traj_head(human_feats)
                accuracy = ((next_human_pos - next_human_pos_pred) ** 2).mean().item()
                traj_accuracy += accuracy
            traj_acc_sum += traj_accuracy

        if update_memory:
            self.update_memory(states, actions, rewards, imitation_learning)
        states_s.append(states)

        # episode result
        cumulative_rewards.append(sum([
            pow(self.gamma, t * self.robot.time_step * self.robot.v_pref) * reward
            for t, reward in enumerate(rewards)
        ]))

    traj_acc_sum = traj_acc_sum / sum(len(s) for s in states_s)

    # episodes statistics
    success_rate = success / k
    collision_rate = collision / k
    timeout_rate = timeout / k
    assert success + collision + timeout == k
    avg_nav_time = sum(success_times) / len(success_times) if success_times else self.env.time_limit

    if not print_info:
        extra_info = '' if episode is None else 'in episode {} '.format(episode)
        logging.info(
            '{:<5} {} success: {:.2f}, collision: {:.2f}, nav time: {:.2f}, reward: {:.4f} +- {:.4f}'
            .format(phase.upper(), extra_info, success_rate, collision_rate, avg_nav_time,
                    statistics.mean(cumulative_rewards),
                    (statistics.stdev(cumulative_rewards) if len(cumulative_rewards) > 1 else 0.0)))
    info = {
        'success': success_rate,
        'collision': collision_rate,
        'nav time': avg_nav_time,
        'reward': statistics.mean(cumulative_rewards)
    }
    if print_info:
        info = 'success: {:.4f},collision: {:.4f},nav time: {:.2f},reward: {:.4f}, timeout : {:.4f}'.\
            format(success_rate, collision_rate, avg_nav_time,
                   statistics.mean(cumulative_rewards), timeout_rate)
        if traj_head is not None:
            info += ', traj accuracy : {:.4f}'.format(traj_acc_sum)
        if suffix is not None:
            info = suffix + info
        print(info)

    if print_failure:
        logging.info('Collision cases: ' + ' '.join([str(x) for x in collision_cases]))
        logging.info('Timeout cases: ' + ' '.join([str(x) for x in timeout_cases]))

    info_dic = {
        'success': success_rate,
        'collision': collision_rate,
        'nav time': avg_nav_time,
        'reward': statistics.mean(cumulative_rewards),
        'traj accuracy': traj_acc_sum
    }
    if output_info:
        if return_states:
            return info_dic, states_s
        else:
            return info_dic
    if imitation_learning:
        return human_obsv
def run_k_episodes(self, k, phase, update_memory=False, imitation_learning=False,
                   episode=None, print_failure=False, noise_explore=0.0,
                   progressbar=True, notebook=False):
    self.robot.policy.set_phase(phase)
    success_times = []
    collision_times = []
    timeout_times = []
    success = 0
    collision = 0
    timeout = 0
    too_close = 0
    min_dist = []
    cumulative_rewards = []
    collision_cases = []
    timeout_cases = []
    if imitation_learning:
        human_obsv = []

    iter = range(k)
    if progressbar:
        if notebook:
            iter = tqdm_notebook(iter, leave=False)
        else:
            iter = tqdm(iter, leave=False)

    for i in iter:
        ob = self.env.reset(phase)
        done = False
        states = []
        actions = []
        rewards = []
        scene = []
        while not done:
            # infer action from policy
            if 'RNN' in self.robot.policy.name:
                action = self.robot.act(ob)
            elif self.num_frames == 1:
                action = self.robot.act(ob)
            else:
                last_ob = latest_frame(ob, self.num_frames)
                action = self.robot.act(last_ob)
            actions.append(action)

            # state append
            if imitation_learning:
                if 'RNN' in self.target_policy.name:
                    # TODO: enumerate and test different combinations of policies
                    joint_state = JointState(self.robot.get_full_state(), ob)
                elif 'SAIL' in self.target_policy.name or 'SARL' in self.target_policy.name:
                    joint_state = JointState(self.robot.get_full_state(), ob)
                else:
                    joint_state = self.robot.policy.last_state
                last_state = self.target_policy.transform(joint_state)
                states.append(last_state)
                scene.append(obs_to_frame(ob))
            else:
                states.append(self.robot.policy.last_state)

            # env step
            if imitation_learning and noise_explore > 0.0:
                noise_magnitude = noise_explore * 2.0
                action = ActionXY(action[0] + torch.rand(1).sub(0.5) * noise_magnitude,
                                  action[1] + torch.rand(1).sub(0.5) * noise_magnitude) \
                    if self.robot.policy.kinematics == 'holonomic' else \
                    ActionRot(action[0] + torch.rand(1).sub(0.5) * noise_magnitude,
                              action[1] + torch.rand(1).sub(0.5) * noise_magnitude)
            ob, reward, done, info = self.env.step(action)
            rewards.append(reward)

            if isinstance(info, Danger):
                too_close += 1
                min_dist.append(info.min_dist)

        if imitation_learning:
            human_obsv.append(torch.FloatTensor(scene))

        if isinstance(info, ReachGoal):
            success += 1
            success_times.append(self.env.global_time)
        elif isinstance(info, Collision):
            collision += 1
            collision_cases.append(i)
            collision_times.append(self.env.global_time)
        elif isinstance(info, Timeout):
            timeout += 1
            timeout_cases.append(i)
            timeout_times.append(self.env.time_limit)
        else:
            raise ValueError('Invalid end signal from environment')

        if update_memory:
            self.update_memory(states, actions, rewards, imitation_learning)

        # episode result
        cumulative_rewards.append(sum([
            pow(self.gamma, t * self.robot.time_step * self.robot.v_pref) * reward
            for t, reward in enumerate(rewards)
        ]))

    # episodes statistics
    success_rate = success / k
    collision_rate = collision / k
    assert success + collision + timeout == k
    avg_nav_time = sum(success_times) / len(success_times) if success_times else self.env.time_limit

    extra_info = '' if episode is None else 'in episode {} '.format(episode)
    logging.info(
        '{:<5} {} success: {:.2f}, collision: {:.2f}, nav time: {:.2f}, reward: {:.4f} +- {:.4f}'
        .format(phase.upper(), extra_info, success_rate, collision_rate, avg_nav_time,
                statistics.mean(cumulative_rewards),
                (statistics.stdev(cumulative_rewards) if len(cumulative_rewards) > 1 else 0.0)))
    print('{:<5} {} success: {:.2f}, collision: {:.2f}, nav time: {:.2f}, reward: {:.4f} +- {:.4f}'
          .format(phase.upper(), extra_info, success_rate, collision_rate, avg_nav_time,
                  statistics.mean(cumulative_rewards),
                  (statistics.stdev(cumulative_rewards) if len(cumulative_rewards) > 1 else 0.0)))

    if phase in ['val', 'test']:
        total_time = sum(success_times + collision_times + timeout_times) * self.robot.time_step
        logging.info('Frequency of being in danger: %.2f', too_close / total_time)

    if print_failure:
        logging.info('Collision cases: ' + ' '.join([str(x) for x in collision_cases]))
        logging.info('Timeout cases: ' + ' '.join([str(x) for x in timeout_cases]))

    if imitation_learning:
        return human_obsv
def predict(self, state): """ A base class for all methods that takes pairwise joint state as input to value network. The input to the value network is always of shape (batch_size, # humans, rotated joint state length) """ if hasattr(self.model, 'training'): self.model.eval() if self.phase is None or self.device is None: raise AttributeError('Phase, device attributes have to be set!') if self.phase == 'train' and self.epsilon is None: raise AttributeError( 'Epsilon attribute has to be set in training phase') if self.reach_destination(state): return ActionXY( 0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0) if self.action_space is None: self.build_action_space(state.self_state.v_pref) occupancy_maps = None probability = np.random.random() if self.phase == 'train' and probability < self.epsilon: max_action = self.action_space[np.random.choice( len(self.action_space))] max_activations = torch.randn(10, 2) else: self.action_values = list() max_value = float('-inf') max_action = None max_activations = None action_states = [ self.compute_action_vals_activs(action, state) for action in self.action_space ] action_order = [] next_states = [] rewards = [] for i in range(len(action_states)): action_order.append(action_states[i][0]) next_states.append(action_states[i][1]) rewards.append(action_states[i][2]) model_input = torch.cat(next_states, dim=0) values = self.model(model_input) for i in range(len(action_order)): action = action_order[i] value = values[i].item() reward = rewards[i] action_value = reward + pow( self.gamma, self.time_step * state.self_state.v_pref) * value if action_value > max_value: max_value = action_value max_action = action if self.phase == 'train': self.last_state = self.transform(state) self.last_activation = max_activations return max_action
def predict(self, state): """ A base class for all methods that takes pairwise joint state as input to value network. The input to the value network is always of shape (batch_size, # humans, rotated joint state length) """ if self.phase is None or self.device is None: raise AttributeError('Phase, device attributes have to be set!') if self.phase == 'train' and self.epsilon is None: raise AttributeError( 'Epsilon attribute has to be set in training phase') if self.reach_destination(state): return ActionXY( 0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0) if self.action_space is None: self.build_action_space(state.self_state.v_pref) occupancy_maps = None probability = np.random.random() if self.phase == 'train' and probability < self.epsilon: max_action = self.action_space[np.random.choice( len(self.action_space))] else: self.action_values = list() max_value = float('-inf') max_action = None need_build_map = True single_step_cnt = 0 for action in self.action_space: next_self_state = self.propagate(state.self_state, action) if self.query_env: next_human_states, reward, done, info = self.env.onestep_lookahead( action, need_build_map) need_build_map = False else: next_human_states = [ self.propagate( human_state, ActionXY(human_state.vx, human_state.vy)) for human_state in state.human_states ] self.map.build_map_cpu(next_human_states) collision_probability = self.map.compute_occupied_probability( next_self_state) reward = self.compute_reward(next_self_state, next_human_states, collision_probability) batch_next_states = torch.cat([ torch.Tensor([next_self_state + next_human_state]).to( self.device) for next_human_state in next_human_states ], dim=0) rotated_batch_input = self.rotate(batch_next_states).unsqueeze( 0) if self.with_om: if occupancy_maps is None: occupancy_maps = self.build_occupancy_maps( next_human_states).unsqueeze(0) rotated_batch_input = torch.cat( [rotated_batch_input, occupancy_maps], dim=2) # VALUE UPDATE value, score = self.model(rotated_batch_input) next_state_value = value.data.item() value = reward + pow( self.gamma, self.time_step * state.self_state.v_pref) * next_state_value self.action_values.append(value) # ionly save the step cnt of used action if value > max_value: max_value = value max_action = action if self.use_rate_statistic: single_step_cnt = self.model.get_step_cnt() if max_action is None: raise ValueError('Value network is not well trained. ') if self.use_rate_statistic: if single_step_cnt < len(self.statistic_info): self.statistic_info[single_step_cnt] += 1 else: print("step count too large!!") pass if self.phase == 'train': self.last_state = self.transform(state) return max_action
def V_planning(self, state, depth, width):
    """
    Plans n steps into the future based on the state-action value function.
    Computes the value for the current state as well as the trajectories,
    defined as a list of (state, action, reward) triples. The two branches
    handle batches with and without human states; their planning logic is
    identical and is shared below.
    """
    robot_state_batch = state[0]
    human_state_batch = state[1]
    no_humans = state[1] is None

    if depth == 0:
        q_value = torch.Tensor(self.value_estimator(state))
        max_action_value, max_action_indexes = torch.max(q_value, dim=1)
        trajs = []
        for i in range(robot_state_batch.shape[0]):
            if no_humans:
                cur_state = (robot_state_batch[i, :, :].unsqueeze(0), None)
            else:
                cur_state = (robot_state_batch[i, :, :].unsqueeze(0),
                             human_state_batch[i, :, :].unsqueeze(0))
            trajs.append([(cur_state, None, None)])
        return max_action_value, max_action_indexes, trajs

    q_value = torch.Tensor(self.value_estimator(state))
    max_action_value, max_action_indexes = torch.topk(q_value, width, dim=1)

    pre_next_state = None
    if not no_humans:
        # predict the humans' next states once, under a stand-still robot action
        action_stay = []
        for i in range(robot_state_batch.shape[0]):
            if self.kinematics == "holonomic":
                action_stay.append(ActionXY(0, 0))
            else:
                action_stay.append(ActionRot(0, 0))
        _, pre_next_state = self.state_predictor(state, action_stay)

    next_robot_state_batch = None
    next_human_state_batch = None
    reward_est = torch.zeros(state[0].shape[0], width) * float('inf')
    for i in range(robot_state_batch.shape[0]):
        if no_humans:
            cur_state = (robot_state_batch[i, :, :].unsqueeze(0), None)
            next_human_state = None
        else:
            cur_state = (robot_state_batch[i, :, :].unsqueeze(0),
                         human_state_batch[i, :, :].unsqueeze(0))
            next_human_state = pre_next_state[i, :, :].unsqueeze(0)
        for j in range(width):
            cur_action = self.action_space[max_action_indexes[i][j]]
            next_robot_state = self.compute_next_robot_state(cur_state[0], cur_action)
            if next_robot_state_batch is None:
                next_robot_state_batch = next_robot_state
                next_human_state_batch = next_human_state
            else:
                next_robot_state_batch = torch.cat((next_robot_state_batch, next_robot_state), dim=0)
                if not no_humans:
                    next_human_state_batch = torch.cat((next_human_state_batch, next_human_state), dim=0)
            reward_est[i][j], _ = self.reward_estimator.estimate_reward_on_predictor(
                tensor_to_joint_state(cur_state),
                tensor_to_joint_state((next_robot_state, next_human_state)))
    next_state_batch = (next_robot_state_batch, next_human_state_batch)

    if self.planning_depth - depth >= 2 and self.planning_depth > 2:
        cur_width = 1
    else:
        cur_width = int(self.planning_width / 2)
    next_values, next_action_indexes, next_trajs = self.V_planning(next_state_batch, depth - 1, cur_width)
    next_values = next_values.view(state[0].shape[0], width)
    returns = (reward_est + self.get_normalized_gamma() * next_values + max_action_value) / 2
    max_action_return, max_action_index = torch.max(returns, dim=1)

    trajs = []
    max_returns = []
    max_actions = []
    for i in range(robot_state_batch.shape[0]):
        if no_humans:
            cur_state = (robot_state_batch[i, :, :].unsqueeze(0), None)
        else:
            cur_state = (robot_state_batch[i, :, :].unsqueeze(0),
                         human_state_batch[i, :, :].unsqueeze(0))
        action_id = max_action_index[i]
        trajs_id = i * width + action_id
        action = max_action_indexes[i][action_id]
        next_traj = next_trajs[trajs_id]
        trajs.append([(cur_state, action, reward_est)] + next_traj)
        max_returns.append(max_action_return[i].data)
        max_actions.append(action)
    max_returns = torch.tensor(max_returns)
    return max_returns, max_actions, trajs
def predict(self, state): """ Takes pairwise joint state as input to value network and output action. The input to the value network is always of shape (batch_size, # humans, rotated joint state length). If with_costmap is True, the dangerous actions predicted by the value network will be screened out to avoid static obstacles on the map. """ if self.phase is None or self.device is None: raise AttributeError('Phase, device attributes have to be set!') if self.phase == 'train' and self.epsilon is None: raise AttributeError( 'Epsilon attribute has to be set in training phase') if self.reach_destination(state): return ActionXY( 0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0) if self.action_space is None: self.build_action_space(state.self_state.v_pref) occupancy_maps = None probability = np.random.random() if self.phase == 'train' and probability < self.epsilon: max_action = self.action_space[np.random.choice( len(self.action_space))] else: self.action_values = list() max_value = float('-inf') max_action = None for action in self.action_space: next_self_state = self.propagate(state.self_state, action) next_self_state_further = self.propagate_more( state.self_state, action) # abort actions which will probably cause collision with static obstacles in the costmap if self.with_costmap is True: cost = self.compute_cost(next_self_state_further) if cost > 0: print("********** Abort action:", action, " with cost:", cost, " that will hit the obstacles.") continue if self.query_env: next_human_states, reward, done, info = self.env.onestep_lookahead( action) else: next_human_states = [ self.propagate( human_state, ActionXY(human_state.vx, human_state.vy)) for human_state in state.human_states ] reward = self.compute_reward(next_self_state, next_human_states) batch_next_states = torch.cat([ torch.Tensor([next_self_state + next_human_state]).to( self.device) for next_human_state in next_human_states ], dim=0) rotated_batch_input = self.rotate(batch_next_states).unsqueeze( 0) if self.with_om: if occupancy_maps is None: occupancy_maps = self.build_occupancy_maps( next_human_states).unsqueeze(0) rotated_batch_input = torch.cat( [rotated_batch_input, occupancy_maps], dim=2) # VALUE UPDATE next_state_value = self.model(rotated_batch_input).data.item() value = reward + pow( self.gamma, self.time_step * state.self_state.v_pref) * next_state_value self.action_values.append(value) if value > max_value: max_value = value max_action = action # print("********** choose action:", action) # print("********** cost:", cost) if max_action is None: # if the robot is trapped, choose the turning action to escape max_action = ActionRot(0, 0.78) print("The robot is trapped. Rotate in place to escape......") if self.phase == 'train': self.last_state = self.transform(state) return max_action
def step(self, action, update=True):
    """
    Compute actions for all agents, detect collision, update environment
    and return (ob, reward, done, info)
    """
    action = self.robot.policy.clip_action(action, self.robot.v_pref)

    if self.robot.kinematics == 'unicycle':
        self.desiredVelocity[0] = np.clip(self.desiredVelocity[0] + action.v,
                                          -self.robot.v_pref, self.robot.v_pref)
        action = ActionRot(self.desiredVelocity[0], action.r)

    human_actions = []  # a list of all humans' actions
    for i, human in enumerate(self.humans):
        # observation for humans is always coordinates
        ob = []
        for other_human in self.humans:
            if other_human != human:
                # Chance for one human to be blind to some other humans
                if self.random_unobservability and i == 0:
                    if np.random.random() <= self.unobservable_chance \
                            or not self.detect_visible(human, other_human):
                        ob.append(self.dummy_human.get_observable_state())
                    else:
                        ob.append(other_human.get_observable_state())
                # Else detectable humans are always observable to each other
                elif self.detect_visible(human, other_human):
                    ob.append(other_human.get_observable_state())
                else:
                    ob.append(self.dummy_human.get_observable_state())

        if self.robot.visible:
            # Chance for one human to be blind to the robot
            if self.random_unobservability and i == 0:
                if np.random.random() <= self.unobservable_chance \
                        or not self.detect_visible(human, self.robot):
                    ob += [self.dummy_robot.get_observable_state()]
                else:
                    ob += [self.robot.get_observable_state()]
            # Else humans will always see visible robots
            elif self.detect_visible(human, self.robot):
                ob += [self.robot.get_observable_state()]
            else:
                ob += [self.dummy_robot.get_observable_state()]

        human_actions.append(human.act(ob))

    # compute reward and episode info
    reward, done, episode_info = self.calc_reward(action)

    # apply action and update all agents
    self.robot.step(action)
    for i, human_action in enumerate(human_actions):
        self.humans[i].step(human_action)
    self.global_time += self.time_step  # max episode length = time_limit / time_step

    # compute the observation
    ob = self.generate_ob(reset=False)
    info = {'info': episode_info}

    # Update all humans' goals randomly midway through episode
    if self.random_goal_changing:
        if self.global_time % 5 == 0:
            self.update_human_goals_randomly()

    # Update a specific human's goal once it has reached its original goal
    if self.end_goal_changing:
        for human in self.humans:
            if norm((human.gx - human.px, human.gy - human.py)) < human.radius:
                self.update_human_goal(human)

    return ob, reward, done, info