def act(self, action: np.ndarray) -> None:
    if self.clip:
        action = np.clip(action, -1, 1)
    if self.longitudinal and self.lateral:
        self.controlled_vehicle.act({
            "acceleration": utils.lmap(action[0], [-1, 1], self.acceleration_range),
            "steering": utils.lmap(action[1], [-1, 1], self.steering_range),
        })
    elif self.longitudinal:
        self.controlled_vehicle.act({
            "acceleration": utils.lmap(action[0], [-1, 1], self.acceleration_range),
            "steering": 0,
        })
    elif self.lateral:
        self.controlled_vehicle.act({
            "acceleration": 0,
            "steering": utils.lmap(action[0], [-1, 1], self.steering_range),
        })
    self.last_action = action

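# For reference, a minimal sketch of the `utils.lmap` linear-mapping helper used throughout
# these snippets. This is assumed to behave like highway-env's `highway_env.utils.lmap`:
# it rescales a value v from the source range x to the target range y, without clipping.
def lmap(v, x, y):
    """Linearly map value v from range x = [x0, x1] to range y = [y0, y1]."""
    return y[0] + (v - x[0]) * (y[1] - y[0]) / (x[1] - x[0])

# Example: lmap(0.5, [-1, 1], [-5, 5]) == 2.5, so if acceleration_range were (-5, 5) m/s^2
# (an illustrative value), a normalized action of 0.5 would map to an acceleration of 2.5 m/s^2.
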
def normalize_obs(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize the observation values.

    For now, assume that the road is straight along the x axis.

    :param pd.DataFrame df: observation data
    """
    if not self.features_range:
        side_lanes = self.env.road.network.all_side_lanes(self.observer_vehicle.lane_index)
        self.features_range = {
            "x": [-5.0 * MDPVehicle.SPEED_MAX, 5.0 * MDPVehicle.SPEED_MAX],
            "y": [-AbstractLane.DEFAULT_WIDTH * len(side_lanes),
                  AbstractLane.DEFAULT_WIDTH * len(side_lanes)],
            "vx": [-2 * MDPVehicle.SPEED_MAX, 2 * MDPVehicle.SPEED_MAX],
            "vy": [-2 * MDPVehicle.SPEED_MAX, 2 * MDPVehicle.SPEED_MAX],
        }
    for feature, f_range in self.features_range.items():
        if feature in df:
            df[feature] = utils.lmap(df[feature], [f_range[0], f_range[1]], [-1, 1])
            if self.clip:
                df[feature] = np.clip(df[feature], -1, 1)
    return df

def _reward(self, action: int) -> float:
    """
    The vehicle is rewarded for driving at high speed on the rightmost lanes and for avoiding
    collisions, but an additional altruistic penalty is suffered if any vehicle on the merging
    lane has a low speed.

    :param action: the action performed
    :return: the reward of the state-action transition
    """
    action_reward = {
        0: self.LANE_CHANGE_REWARD,
        1: 0,
        2: self.LANE_CHANGE_REWARD,
        3: 0,
        4: 0,
    }
    reward = self.COLLISION_REWARD * self.vehicle.crashed \
        + self.RIGHT_LANE_REWARD * self.vehicle.lane_index[2] / 1 \
        + self.HIGH_SPEED_REWARD * self.vehicle.speed_index / (self.vehicle.SPEED_COUNT - 1)

    # Altruistic penalty: penalize slowing down controlled vehicles on the merging lane ("b", "c", 2)
    for vehicle in self.road.vehicles:
        if vehicle.lane_index == ("b", "c", 2) and isinstance(vehicle, ControlledVehicle):
            reward += self.MERGING_SPEED_REWARD * \
                (vehicle.target_speed - vehicle.speed) / vehicle.target_speed

    return utils.lmap(action_reward[action] + reward,
                      [self.COLLISION_REWARD + self.MERGING_SPEED_REWARD,
                       self.HIGH_SPEED_REWARD + self.RIGHT_LANE_REWARD],
                      [0, 1])

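# A minimal worked example of the normalization above, using illustrative constant values
# COLLISION_REWARD = -1, MERGING_SPEED_REWARD = -0.5, HIGH_SPEED_REWARD = 0.2 and
# RIGHT_LANE_REWARD = 0.1 (hypothetical numbers for this sketch, not necessarily the env
# defaults). The raw reward then lies in [-1.5, 0.3], and lmap rescales it so the worst case
# maps to 0 and the best case to 1.
raw_low = -1 + -0.5      # crashed while slowing a merging vehicle to a standstill
raw_high = 0.2 + 0.1     # full speed on the rightmost lane, no collision, no penalty
normalized = lmap(0.3, [raw_low, raw_high], [0, 1])   # == 1.0: best-case reward maps to 1
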
def _agent_reward(self, action: int, vehicle: Vehicle) -> float:
    """Reward an agent for arriving at its destination at maximal speed while avoiding collisions."""
    reward = self.config["collision_reward"] * vehicle.crashed \
        + self.HIGH_SPEED_REWARD * (vehicle.speed_index == vehicle.SPEED_COUNT - 1)
    reward = self.ARRIVED_REWARD if self.has_arrived(vehicle) else reward
    if self.config["normalize_reward"]:
        reward = utils.lmap(reward,
                            [self.config["collision_reward"], self.ARRIVED_REWARD],
                            [0, 1])
    return reward

def _reward(self, action: int) -> float:
    """Reward high speed, and penalize collisions and lane changes; normalize to [0, 1]."""
    lane_change = action in (0, 2)
    reward = self.COLLISION_REWARD * self.vehicle.crashed \
        + self.HIGH_SPEED_REWARD * MDPVehicle.get_speed_index(self.vehicle) / max(MDPVehicle.SPEED_COUNT - 1, 1) \
        + self.LANE_CHANGE_REWARD * lane_change
    return utils.lmap(reward,
                      [self.COLLISION_REWARD + self.LANE_CHANGE_REWARD, self.HIGH_SPEED_REWARD],
                      [0, 1])

def observe(self) -> np.ndarray:
    if not self.env.road:
        return np.zeros(self.space().shape)

    if self.absolute:
        raise NotImplementedError()
    else:
        # Add nearby traffic
        self.grid.fill(0)
        df = pd.DataFrame.from_records(
            [v.to_dict(self.observer_vehicle) for v in self.env.road.vehicles])
        # Normalize
        df = self.normalize(df)
        # Fill-in features
        for layer, feature in enumerate(self.features):
            for _, vehicle in df.iterrows():
                x, y = vehicle["x"], vehicle["y"]
                # Recover unnormalized coordinates for cell index
                if "x" in self.features_range:
                    x = utils.lmap(x, [-1, 1],
                                   [self.features_range["x"][0], self.features_range["x"][1]])
                if "y" in self.features_range:
                    y = utils.lmap(y, [-1, 1],
                                   [self.features_range["y"][0], self.features_range["y"][1]])
                cell = (int((x - self.grid_size[0, 0]) / self.grid_step[0]),
                        int((y - self.grid_size[1, 0]) / self.grid_step[1]))
                if 0 <= cell[1] < self.grid.shape[-2] and 0 <= cell[0] < self.grid.shape[-1]:
                    self.grid[layer, cell[1], cell[0]] = vehicle[feature]
        # Clip
        obs = np.clip(self.grid, -1, 1)
        return obs

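# A self-contained sketch of the cell-index computation used in observe() above, with assumed
# example values: a grid covering x, y in [-20, 20] m around the observer with 5 m cells
# (hypothetical numbers, not the env defaults).
import numpy as np

grid_size = np.array([[-20.0, 20.0],   # x range of the grid, relative to the observer
                      [-20.0, 20.0]])  # y range of the grid
grid_step = np.array([5.0, 5.0])       # cell size along x and y

x, y = 7.5, -12.0                      # unnormalized vehicle position relative to the observer
cell = (int((x - grid_size[0, 0]) / grid_step[0]),   # column index along x -> 5
        int((y - grid_size[1, 0]) / grid_step[1]))   # row index along y -> 1
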
def _reward(self, action: Action) -> List[float]:
    """
    The reward is defined to foster driving at high speed, on the rightmost lanes, and to avoid
    collisions. Unlike the single-agent variant, this returns one reward per controlled vehicle.

    :param action: the last action performed
    :return: the list of rewards, one per controlled vehicle
    """
    n_defenders = self.config["n_defenders"]
    n_attackers = self.config["n_attackers"]
    n_dummies = self.config["n_dummies"]
    rewards = []
    for vehicle in self.controlled_vehicles:
        neighbours = self.road.network.all_side_lanes(vehicle.lane_index)
        lane = vehicle.target_lane_index[2] if isinstance(vehicle, ControlledVehicle) \
            else vehicle.lane_index[2]
        scaled_speed = utils.lmap(vehicle.speed, self.config["reward_speed_range"], [0, 1])
        reward = self.config["collision_reward"] * vehicle.crashed \
            + self.RIGHT_LANE_REWARD * lane / max(len(neighbours) - 1, 1) \
            + self.HIGH_SPEED_REWARD * np.clip(scaled_speed, 0, 1)
        # Off-road vehicles receive a flat penalty instead of the shaped reward
        reward = -1 if not vehicle.on_road else reward
        rewards.append(reward)

    # Save the current configuration so it can be restored by the backtracking mechanism
    if self.config["enable_backtrack"]:
        self._save_backtrack_configuration(action)

    # Reward correction based on the last few steps' behaviour; the reward value itself is
    # currently unchanged, this only triggers the backtrack logic
    if self.vehicle.crashed:
        reward = self._reward_backward_correction(reward)

    return rewards

def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize the observation values.

    For now, assume that the road is straight along the x axis.

    :param pd.DataFrame df: observation data
    """
    if not self.features_range:
        self.features_range = {
            "vx": [-2 * MDPVehicle.SPEED_MAX, 2 * MDPVehicle.SPEED_MAX],
            "vy": [-2 * MDPVehicle.SPEED_MAX, 2 * MDPVehicle.SPEED_MAX],
        }
    for feature, f_range in self.features_range.items():
        if feature in df:
            df[feature] = utils.lmap(df[feature], [f_range[0], f_range[1]], [-1, 1])
    return df

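# Example usage sketch: normalizing a toy observation frame with the same lmap-based scheme,
# assuming MDPVehicle.SPEED_MAX = 30 m/s (an assumed value for illustration only).
import numpy as np
import pandas as pd

speed_max = 30.0
df = pd.DataFrame({"vx": [15.0, -30.0], "vy": [0.0, 5.0]})
for feature in ("vx", "vy"):
    # lmap works element-wise on a pandas Series since it only uses arithmetic
    df[feature] = lmap(df[feature], [-2 * speed_max, 2 * speed_max], [-1, 1])
# df["vx"] is now [0.25, -0.5]; df["vy"] is [0.0, ~0.083]
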
def _reward(self, action: Action) -> List[float]:
    """
    The reward is defined to foster driving at high speed, on the rightmost lanes, and to avoid
    collisions. Unlike the single-agent variant, this returns one reward per controlled vehicle.

    :param action: the last action performed
    :return: the list of rewards, one per controlled vehicle
    """
    rewards = []
    for i, vehicle in enumerate(self.controlled_vehicles):
        neighbours = self.road.network.all_side_lanes(vehicle.lane_index)
        lane = vehicle.target_lane_index[2] if isinstance(vehicle, ControlledVehicle) \
            else vehicle.lane_index[2]
        scaled_speed = utils.lmap(vehicle.speed, self.config["reward_speed_range"], [0, 1])
        reward = self.config["collision_reward"] * vehicle.crashed \
            + self.RIGHT_LANE_REWARD * lane / max(len(neighbours) - 1, 1) \
            + self.HIGH_SPEED_REWARD * np.clip(scaled_speed, 0, 1)
        # Off-road vehicles receive a flat penalty instead of the shaped reward
        reward = -1 if not vehicle.on_road else reward

        # In the attack task, attackers (the indices following the defenders) ignore the shaped
        # driving reward and are only penalized for crashing or leaving the road
        if self.config["task_type"] == "attack":
            if self.config["n_defenders"] <= i < self.config["n_defenders"] + self.config["n_attackers"]:
                reward = -0.5 if not vehicle.on_road or vehicle.crashed else 0

        rewards.append(reward)
    return rewards