def create_buffer(brain_infos, brain_params, sequence_length, memory_size=8):
    buffer = ProcessingBuffer()
    update_buffer = AgentBuffer()
    # Make a buffer
    for idx, experience in enumerate(brain_infos):
        if idx > len(brain_infos) - 2:
            break
        current_brain_info = brain_infos[idx]
        next_brain_info = brain_infos[idx + 1]
        buffer[0].last_brain_info = current_brain_info
        buffer[0]["done"].append(next_brain_info.local_done[0])
        buffer[0]["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            buffer[0]["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
            buffer[0]["next_visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
        if brain_params.vector_observation_space_size > 0:
            buffer[0]["vector_obs"].append(current_brain_info.vector_observations[0])
            buffer[0]["next_vector_in"].append(
                current_brain_info.vector_observations[0]
            )
        fake_action_size = len(brain_params.vector_action_space_size)
        if brain_params.vector_action_space_type == "continuous":
            fake_action_size = brain_params.vector_action_space_size[0]
        buffer[0]["actions"].append(np.zeros(fake_action_size, dtype=np.float32))
        buffer[0]["prev_action"].append(np.zeros(fake_action_size, dtype=np.float32))
        buffer[0]["masks"].append(1.0)
        buffer[0]["advantages"].append(1.0)
        if brain_params.vector_action_space_type == "discrete":
            buffer[0]["action_probs"].append(
                np.ones(sum(brain_params.vector_action_space_size), dtype=np.float32)
            )
        else:
            buffer[0]["action_probs"].append(
                np.ones(buffer[0]["actions"][0].shape, dtype=np.float32)
            )
        buffer[0]["actions_pre"].append(
            np.ones(buffer[0]["actions"][0].shape, dtype=np.float32)
        )
        buffer[0]["action_mask"].append(
            np.ones(np.sum(brain_params.vector_action_space_size), dtype=np.float32)
        )
        buffer[0]["memory"].append(np.ones(memory_size, dtype=np.float32))

    buffer.append_to_update_buffer(
        update_buffer, 0, batch_size=None, training_length=sequence_length
    )
    return update_buffer
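# --- Illustration (not from the source): a minimal sketch of the
# ProcessingBuffer -> AgentBuffer flow that create_buffer relies on.
# The helper name is hypothetical, and the import path is an assumption
# that may differ between ml-agents versions.
def _sketch_processing_to_update_buffer():
    import numpy as np
    from mlagents.trainers.buffer import AgentBuffer, ProcessingBuffer  # assumed path

    processing_buffer = ProcessingBuffer()
    update_buffer = AgentBuffer()
    # Accumulate a few fake steps for agent 0, keyed by field name.
    for step in range(6):
        processing_buffer[0]["vector_obs"].append(np.zeros(8, dtype=np.float32))
        processing_buffer[0]["actions"].append(np.zeros(2, dtype=np.float32))
        processing_buffer[0]["rewards"].append(0.0)
        processing_buffer[0]["done"].append(step == 5)
    # Flush agent 0's local history into the flat update buffer, split into
    # sequences of length 3 (mirrors the call at the end of create_buffer).
    processing_buffer.append_to_update_buffer(
        update_buffer, 0, batch_size=None, training_length=3
    )
    return update_buffer.num_experiences  # expected: 6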
def construct_fake_processing_buffer():
    b = ProcessingBuffer()
    for fake_agent_id in range(4):
        for step in range(9):
            b[fake_agent_id]["vector_observation"].append(
                [
                    100 * fake_agent_id + 10 * step + 1,
                    100 * fake_agent_id + 10 * step + 2,
                    100 * fake_agent_id + 10 * step + 3,
                ]
            )
            b[fake_agent_id]["action"].append(
                [
                    100 * fake_agent_id + 10 * step + 4,
                    100 * fake_agent_id + 10 * step + 5,
                ]
            )
    return b
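# --- Illustration (not from the source): the values in the fake buffer encode
# agent, step, and component as 100 * agent_id + 10 * step + k, so a test can
# check exactly which entry ended up where. The helper name is hypothetical.
def _sketch_decode_fake_buffer():
    import numpy as np

    b = construct_fake_processing_buffer()
    first_obs = np.asarray(b[2]["vector_observation"][0])  # agent 2, step 0
    # 100 * 2 + 10 * 0 + (1, 2, 3) -> [201, 202, 203]
    last_action = np.asarray(b[3]["action"][8])  # agent 3, step 8
    # 100 * 3 + 10 * 8 + (4, 5) -> [384, 385]
    return first_obs, last_action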
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    brain_params: BrainParameters,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_process_buffer = ProcessingBuffer()
    demo_buffer = AgentBuffer()
    for idx, experience in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        current_pair_info = pair_infos[idx]
        next_pair_info = pair_infos[idx + 1]
        current_brain_info = BrainInfo.from_agent_proto(
            0, [current_pair_info.agent_info], brain_params
        )
        next_brain_info = BrainInfo.from_agent_proto(
            0, [next_pair_info.agent_info], brain_params
        )
        previous_action = (
            np.array(pair_infos[idx].action_info.vector_actions, dtype=np.float32) * 0
        )
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )
        demo_process_buffer[0].last_brain_info = current_brain_info
        demo_process_buffer[0]["done"].append(next_brain_info.local_done[0])
        demo_process_buffer[0]["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            demo_process_buffer[0]["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
        if brain_params.vector_observation_space_size > 0:
            demo_process_buffer[0]["vector_obs"].append(
                current_brain_info.vector_observations[0]
            )
        demo_process_buffer[0]["actions"].append(
            current_pair_info.action_info.vector_actions
        )
        demo_process_buffer[0]["prev_action"].append(previous_action)
        if next_brain_info.local_done[0]:
            demo_process_buffer.append_to_update_buffer(
                demo_buffer, 0, batch_size=None, training_length=sequence_length
            )
            demo_process_buffer.reset_local_buffers()
    demo_process_buffer.append_to_update_buffer(
        demo_buffer, 0, batch_size=None, training_length=sequence_length
    )
    return demo_buffer
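# --- Illustration (not from the source): a hedged sketch of how a demo buffer
# produced by make_demo_buffer is typically consumed. shuffle and
# make_mini_batch are the same AgentBuffer calls used in BCTrainer.update_policy
# below; the helper name and the numeric values are assumptions.
def _sketch_iterate_demo_buffer(demo_buffer: AgentBuffer, sequence_length: int = 1):
    n_sequences = 64  # illustrative value, not a default from the source
    batch_size = n_sequences * sequence_length
    demo_buffer.shuffle(sequence_length)
    num_batches = demo_buffer.num_experiences // batch_size
    for start in range(0, num_batches * batch_size, batch_size):
        # Each mini_batch is a dict-like slice keyed by the fields appended
        # above ("vector_obs", "actions", "prev_action", ...).
        mini_batch = demo_buffer.make_mini_batch(start, start + batch_size)
        yield mini_batch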
class BCTrainer(Trainer):
    """The BCTrainer is an implementation of Behavioral Cloning."""

    def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
        """
        Responsible for collecting experiences and training a behavioral cloning (BC) model.
        :param brain: The brain associated with this trainer.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with.
        :param run_id: The identifier of the current run.
        """
        super(BCTrainer, self).__init__(brain, trainer_parameters, training, run_id)
        self.policy = BCPolicy(seed, brain, trainer_parameters, load)
        self.n_sequences = 1
        self.cumulative_rewards = {}
        self.episode_steps = {}
        self.stats = {
            "Losses/Cloning Loss": [],
            "Environment/Episode Length": [],
            "Environment/Cumulative Reward": [],
        }
        self.batches_per_epoch = trainer_parameters["batches_per_epoch"]
        self.demonstration_buffer = AgentBuffer()
        self.evaluation_buffer = ProcessingBuffer()

    def add_experiences(
        self,
        curr_info: BrainInfo,
        next_info: BrainInfo,
        take_action_outputs: ActionInfoOutputs,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param curr_info: Current BrainInfo.
        :param next_info: Next BrainInfo.
        :param take_action_outputs: The outputs of the take action method.
        """
        # Used to collect information about student performance.
        for agent_id in curr_info.agents:
            self.evaluation_buffer[agent_id].last_brain_info = curr_info

        for agent_id in next_info.agents:
            stored_info = self.evaluation_buffer[agent_id].last_brain_info
            if stored_info is None:
                continue
            next_idx = next_info.agents.index(agent_id)
            if agent_id not in self.cumulative_rewards:
                self.cumulative_rewards[agent_id] = 0
            self.cumulative_rewards[agent_id] += next_info.rewards[next_idx]
            if not next_info.local_done[next_idx]:
                if agent_id not in self.episode_steps:
                    self.episode_steps[agent_id] = 0
                self.episode_steps[agent_id] += 1

    def process_experiences(
        self, current_info: BrainInfo, next_info: BrainInfo
    ) -> None:
        """
        Checks agent histories for the processing condition, and processes them as necessary.
        Processing involves calculating value and advantage targets for the model update step.
        :param current_info: Current BrainInfo.
        :param next_info: Next BrainInfo.
        """
        for idx in range(len(next_info.agents)):
            if next_info.local_done[idx]:
                agent_id = next_info.agents[idx]
                self.stats["Environment/Cumulative Reward"].append(
                    self.cumulative_rewards.get(agent_id, 0)
                )
                self.stats["Environment/Episode Length"].append(
                    self.episode_steps.get(agent_id, 0)
                )
                self.reward_buffer.appendleft(self.cumulative_rewards.get(agent_id, 0))
                self.cumulative_rewards[agent_id] = 0
                self.episode_steps[agent_id] = 0

    def end_episode(self):
        """
        A signal that the episode has ended. The buffer must be reset.
        Called only when the academy resets.
        """
        self.evaluation_buffer.reset_local_buffers()
        for agent_id in self.cumulative_rewards:
            self.cumulative_rewards[agent_id] = 0
        for agent_id in self.episode_steps:
            self.episode_steps[agent_id] = 0

    def is_ready_update(self):
        """
        Returns whether or not the trainer has enough elements to run a model update.
        :return: A boolean corresponding to whether or not update_model() can be run.
        """
        return self.demonstration_buffer.num_experiences > self.n_sequences

    def update_policy(self):
        """
        Updates the policy.
        """
        self.demonstration_buffer.shuffle(self.policy.sequence_length)
        batch_losses = []
        batch_size = self.n_sequences * self.policy.sequence_length
        # We either divide the entire buffer into num_batches batches, or limit
        # the number of batches to batches_per_epoch.
        num_batches = min(
            self.demonstration_buffer.num_experiences // batch_size,
            self.batches_per_epoch,
        )
        for i in range(0, num_batches * batch_size, batch_size):
            update_buffer = self.demonstration_buffer
            mini_batch = update_buffer.make_mini_batch(i, i + batch_size)
            run_out = self.policy.update(mini_batch, self.n_sequences)
            loss = run_out["policy_loss"]
            batch_losses.append(loss)
        if len(batch_losses) > 0:
            self.stats["Losses/Cloning Loss"].append(np.mean(batch_losses))
        else:
            self.stats["Losses/Cloning Loss"].append(0)
class RLTrainer(Trainer):
    """
    This class is the base class for trainers that use Reward Signals.
    Contains methods for adding BrainInfos to the Buffer.
    """

    def __init__(self, *args, **kwargs):
        super(RLTrainer, self).__init__(*args, **kwargs)
        # Make sure we have at least one reward_signal
        if not self.trainer_parameters["reward_signals"]:
            raise UnityTrainerException(
                "No reward signals were defined. At least one must be used with {}.".format(
                    self.__class__.__name__
                )
            )
        # collected_rewards is a dictionary from name of reward signal to a dictionary
        # of agent_id to cumulative reward, used for reporting only. We always want to
        # report the environment reward to Tensorboard, regardless of which reward
        # signals are actually present.
        self.collected_rewards = {"environment": {}}
        self.processing_buffer = ProcessingBuffer()
        self.update_buffer = AgentBuffer()
        self.episode_steps = {}

    def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
        """
        Constructs a BrainInfo which contains the most recent previous experiences for all
        agents which correspond to the agents in a provided next_info.
        :param next_info: A t+1 BrainInfo.
        :return: Reconstructed BrainInfo to match the agents of next_info.
        """
        visual_observations: List[List[Any]] = [
            [] for _ in next_info.visual_observations
        ]  # TODO add types to brain.py methods
        vector_observations = []
        rewards = []
        local_dones = []
        max_reacheds = []
        agents = []
        action_masks = []
        for agent_id in next_info.agents:
            agent_brain_info = self.processing_buffer[agent_id].last_brain_info
            if agent_brain_info is None:
                agent_brain_info = next_info
            agent_index = agent_brain_info.agents.index(agent_id)
            for i in range(len(next_info.visual_observations)):
                visual_observations[i].append(
                    agent_brain_info.visual_observations[i][agent_index]
                )
            vector_observations.append(
                agent_brain_info.vector_observations[agent_index]
            )
            rewards.append(agent_brain_info.rewards[agent_index])
            local_dones.append(agent_brain_info.local_done[agent_index])
            max_reacheds.append(agent_brain_info.max_reached[agent_index])
            agents.append(agent_brain_info.agents[agent_index])
            action_masks.append(agent_brain_info.action_masks[agent_index])
        curr_info = BrainInfo(
            visual_observations,
            vector_observations,
            rewards,
            agents,
            local_dones,
            max_reacheds,
            action_masks,
        )
        return curr_info

    def add_experiences(
        self,
        curr_info: BrainInfo,
        next_info: BrainInfo,
        take_action_outputs: ActionInfoOutputs,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param curr_info: current BrainInfo.
        :param next_info: next BrainInfo.
        :param take_action_outputs: The outputs of the Policy's get_action method.
        """
        self.trainer_metrics.start_experience_collection_timer()
        if take_action_outputs:
            self.stats["Policy/Entropy"].append(take_action_outputs["entropy"].mean())
            self.stats["Policy/Learning Rate"].append(
                take_action_outputs["learning_rate"]
            )
            for name, signal in self.policy.reward_signals.items():
                self.stats[signal.value_name].append(
                    np.mean(take_action_outputs["value_heads"][name])
                )

        for agent_id in curr_info.agents:
            self.processing_buffer[agent_id].last_brain_info = curr_info
            self.processing_buffer[
                agent_id
            ].last_take_action_outputs = take_action_outputs

        if curr_info.agents != next_info.agents:
            curr_to_use = self.construct_curr_info(next_info)
        else:
            curr_to_use = curr_info

        # Evaluate and store the reward signals
        tmp_reward_signal_outs = {}
        for name, signal in self.policy.reward_signals.items():
            tmp_reward_signal_outs[name] = signal.evaluate(
                curr_to_use, take_action_outputs["action"], next_info
            )
        # Store the environment reward
        tmp_environment = np.array(next_info.rewards, dtype=np.float32)

        rewards_out = AllRewardsOutput(
            reward_signals=tmp_reward_signal_outs, environment=tmp_environment
        )

        for agent_id in next_info.agents:
            stored_info = self.processing_buffer[agent_id].last_brain_info
            stored_take_action_outputs = self.processing_buffer[
                agent_id
            ].last_take_action_outputs
            if stored_info is not None:
                idx = stored_info.agents.index(agent_id)
                next_idx = next_info.agents.index(agent_id)
                if not stored_info.local_done[idx]:
                    for i, _ in enumerate(stored_info.visual_observations):
                        self.processing_buffer[agent_id]["visual_obs%d" % i].append(
                            stored_info.visual_observations[i][idx]
                        )
                        self.processing_buffer[agent_id][
                            "next_visual_obs%d" % i
                        ].append(next_info.visual_observations[i][next_idx])
                    if self.policy.use_vec_obs:
                        self.processing_buffer[agent_id]["vector_obs"].append(
                            stored_info.vector_observations[idx]
                        )
                        self.processing_buffer[agent_id]["next_vector_in"].append(
                            next_info.vector_observations[next_idx]
                        )
                    if self.policy.use_recurrent:
                        self.processing_buffer[agent_id]["memory"].append(
                            self.policy.retrieve_memories([agent_id])[0, :]
                        )

                    self.processing_buffer[agent_id]["masks"].append(1.0)
                    self.processing_buffer[agent_id]["done"].append(
                        next_info.local_done[next_idx]
                    )
                    # Add the outputs of the last eval
                    self.add_policy_outputs(stored_take_action_outputs, agent_id, idx)
                    # Store action masks if necessary
                    if not self.policy.use_continuous_act:
                        self.processing_buffer[agent_id]["action_mask"].append(
                            stored_info.action_masks[idx], padding_value=1
                        )
                    self.processing_buffer[agent_id]["prev_action"].append(
                        self.policy.retrieve_previous_action([agent_id])[0, :]
                    )

                    values = stored_take_action_outputs["value_heads"]

                    # Add the value outputs if needed
                    self.add_rewards_outputs(
                        rewards_out, values, agent_id, idx, next_idx
                    )

                    for name, rewards in self.collected_rewards.items():
                        if agent_id not in rewards:
                            rewards[agent_id] = 0
                        if name == "environment":
                            # Report the reward from the environment
                            rewards[agent_id] += rewards_out.environment[next_idx]
                        else:
                            # Report the reward signals
                            rewards[agent_id] += rewards_out.reward_signals[
                                name
                            ].scaled_reward[next_idx]
                if not next_info.local_done[next_idx]:
                    if agent_id not in self.episode_steps:
                        self.episode_steps[agent_id] = 0
                    self.episode_steps[agent_id] += 1
        self.policy.save_previous_action(
            curr_info.agents, take_action_outputs["action"]
        )
        self.trainer_metrics.end_experience_collection_timer()

    def end_episode(self) -> None:
        """
        A signal that the episode has ended. The buffer must be reset.
        Called only when the academy resets.
        """
        self.processing_buffer.reset_local_buffers()
        for agent_id in self.episode_steps:
            self.episode_steps[agent_id] = 0
        for rewards in self.collected_rewards.values():
            for agent_id in rewards:
                rewards[agent_id] = 0

    def clear_update_buffer(self) -> None:
        """
        Clear the buffers that have been built up during inference. If
        we're not training, this should be called instead of update_policy.
        """
        self.update_buffer.reset_agent()

    def add_policy_outputs(
        self,
        take_action_outputs: ActionInfoOutputs,
        agent_id: str,
        agent_idx: int,
    ) -> None:
        """
        Takes the output of the last action and stores it into the training buffer.
        We break this out from add_experiences since it is highly dependent on the
        type of trainer.
        :param take_action_outputs: The outputs of the Policy's get_action method.
        :param agent_id: the Agent we're adding to.
        :param agent_idx: the index of the Agent agent_id.
        """
        raise UnityTrainerException(
            "The add_policy_outputs method was not implemented."
        )

    def add_rewards_outputs(
        self,
        rewards_out: AllRewardsOutput,
        values: Dict[str, np.ndarray],
        agent_id: str,
        agent_idx: int,
        agent_next_idx: int,
    ) -> None:
        """
        Takes the value and evaluated rewards output of the last action and stores them
        into the training buffer. We break this out from add_experiences since it is
        highly dependent on the type of trainer.
        :param rewards_out: Rewards from the environment and all reward signals after evaluation.
        :param values: Dict of value estimates from the Policy's value heads.
        :param agent_id: the Agent we're adding to.
        :param agent_idx: the index of the Agent agent_id in the current brain info.
        :param agent_next_idx: the index of the Agent agent_id in the next brain info.
        """
        raise UnityTrainerException(
            "The add_rewards_outputs method was not implemented."
        )

    def advance(self):
        """
        Eventually logic from TrainerController.advance() will live here.
        """
        self.clear_update_buffer()