import numpy as np

def benchmark_data(self, agent, world):
    # Returns (reward, collision count, summed min distances, occupied
    # landmarks) for benchmarking.
    collisions = 0
    occupied_landmarks = 0
    min_dists = 0
    for l in world.landmarks:
        dists = [np.linalg.norm(a.state.p_pos - l.state.p_pos) for a in world.agents]
        min_dist = min(dists)
        min_dists += min_dist
        # a landmark counts as occupied if some agent is within 0.1 of it
        if min_dist < 0.1:
            occupied_landmarks += 1
    if agent.collide:
        for a in world.agents:
            if is_collision(a, agent):
                collisions += 1
    return (self.reward(agent, world), collisions, min_dists, occupied_landmarks)
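# The snippets above and below call an `is_collision` helper that is not shown
# here. A minimal sketch, assuming the convention of the standard multi-agent
# particle environments (entities collide when their bounding circles overlap):
def is_collision(agent1, agent2):
    # collide when the distance between centers is less than the sum of radii
    delta = agent1.state.p_pos - agent2.state.p_pos
    dist = np.sqrt(np.sum(np.square(delta)))
    return dist < (agent1.size + agent2.size)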
def reward(self, agent, world):
    # Agents are rewarded based on minimum agent distance to each landmark,
    # penalized for collisions
    rew = 0
    for l in world.landmarks:
        dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos)))
                 for a in world.agents]
        rew -= min(dists)
    if agent.collide:
        for a in world.agents:
            if is_collision(a, agent):
                rew -= 1
    return rew
def reward(self, agent, world):
    # Agents are rewarded based on minimum agent distance to each non-hazard
    # landmark, penalized for collisions
    assert world.identical_rewards
    rew = 0
    for lm in [l for l in world.landmarks if not l.is_hazard]:
        # dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents]
        dists = [distance(ag, lm) for ag in world.agents]
        rew -= min(dists)
    if agent.collide:
        for a in world.agents:
            if is_collision(a, agent):
                rew -= 1
    return rew
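# `distance` is not defined in this snippet; judging from the commented-out
# line it replaced, it is presumably the Euclidean distance between two
# entities' positions. A minimal sketch under that assumption:
def distance(entity1, entity2):
    return np.sqrt(np.sum(np.square(entity1.state.p_pos - entity2.state.p_pos)))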
def reward(self, agent, world):
    # Agents are rewarded based on minimum agent distance to each landmark
    # and for keeping distance from known obstacles, penalized for collisions
    rew = 0
    for l in world.landmarks:
        dists = [np.linalg.norm(a.state.p_pos - l.state.p_pos)
                 for a in world.agents]
        rew -= min(dists)
    for o in world.obstacles:
        if o.known:
            dists = [np.linalg.norm(a.state.p_pos - o.state.p_pos)
                     for a in world.agents]
            rew += min(dists)
    if agent.collide:
        for a in world.agents:
            if is_collision(a, agent):
                rew -= 1
    return rew
def step(self):
    super().step()

    # decrement observability and reward function of certain landmarks
    for landmark in self.landmarks:
        if isinstance(landmark, TemporarilyObservableRiskRewardLandmark):
            landmark.decrement_observe_clock(self.dt)
            # landmark.update_reward_function()

    # check for casualties
    for i, agent in enumerate(self.agents):
        # skip terminated agents since already a casualty
        if agent.terminated:
            # self.render_geoms = None  # force resetting of object rendering list
            continue

        # check for destruction by hazard
        for landmark in self.landmarks:
            if isinstance(landmark, RiskRewardLandmark):
                rel_pos = delta_pos(agent, landmark)
                agent_failure = landmark.risk_fn.sample_failure(rel_pos[0], rel_pos[1])
                if agent_failure:
                    agent.terminate_agent()
                    # if landmark caused a failure, make landmark observable
                    if isinstance(landmark, TemporarilyObservableRiskRewardLandmark):
                        landmark.set_observe_clock()
                    break

        # check for destruction by flyoff or excess velocity
        # NOTE: this is a hack. Certain scenarios were causing early training to
        # send agents to positions and velocities that overflowed floating-point
        # operations, which caused the policy to return NaN actions. By using the
        # termination state, we shouldn't need to impose "walls" on the motion of
        # agents; instead we just terminate them if they fly too far off.
        if (np.linalg.norm(agent.state.p_pos) > self.flyoff_termination_radius or
                np.linalg.norm(agent.state.p_vel) > self.flyoff_termination_speed):
            agent.terminate_agent()

        # check for spontaneous destruction of agent
        if np.random.rand() < self.spontaneous_termination_probability:
            agent.terminate_agent()

        # recheck for terminated agents and skip
        if agent.terminated:
            # self.render_geoms = None  # force resetting of object rendering list
            continue

        # check for collisions
        for other_agent in self.agents[i + 1:]:
            if other_agent.terminated:
                # skip if other agent already terminated
                continue
            if is_collision(agent, other_agent):
                if np.random.random() < self.collision_termination_probability:
                    # terminate other agent
                    other_agent.terminate_agent()
                    # logging.info('fatal crash')
                if np.random.random() < self.collision_termination_probability:
                    # terminate current agent
                    agent.terminate_agent()
                    # logging.info('fatal crash')
                    break
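# `delta_pos` and the landmark's `risk_fn.sample_failure` are assumed above but
# not shown. A minimal, hypothetical sketch: `delta_pos` follows the usual
# particle-env convention of a relative-position vector, and the risk function
# samples a Bernoulli failure whose probability decays with distance from the
# hazard center. The class name and the `peak_prob`/`length_scale` parameters
# are illustrative only, not part of the original code.
def delta_pos(entity1, entity2):
    # relative position of entity1 with respect to entity2
    return entity1.state.p_pos - entity2.state.p_pos

class GaussianRiskFunction:
    def __init__(self, peak_prob=0.5, length_scale=0.25):
        self.peak_prob = peak_prob        # failure probability at the hazard center
        self.length_scale = length_scale  # distance over which risk decays

    def sample_failure(self, dx, dy):
        # Bernoulli sample with probability decaying as a Gaussian in distance
        prob = self.peak_prob * np.exp(-(dx ** 2 + dy ** 2) / (2 * self.length_scale ** 2))
        return np.random.rand() < prob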