def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]: """ Updates Curiosity model using training buffer. Divides training buffer into mini batches and performs gradient descent. :param update_buffer: Update buffer from which to pull data from. :param num_sequences: Number of sequences in the update buffer. :return: Dict of stats that should be reported to Tensorboard. """ forward_total: List[float] = [] inverse_total: List[float] = [] for _ in range(self.num_epoch): update_buffer.shuffle() buffer = update_buffer for l in range(len(update_buffer["actions"]) // num_sequences): start = l * num_sequences end = (l + 1) * num_sequences run_out_curio = self._update_batch( buffer.make_mini_batch(start, end), num_sequences ) inverse_total.append(run_out_curio["inverse_loss"]) forward_total.append(run_out_curio["forward_loss"]) update_stats = { "Losses/Curiosity Forward Loss": np.mean(forward_total), "Losses/Curiosity Inverse Loss": np.mean(inverse_total), } return update_stats
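The update() above follows the same epoch/mini-batch pattern used by every trainer in this section: shuffle the update buffer, walk it in contiguous slices of num_sequences, and average the per-batch losses. Below is a minimal sketch of that pattern, with a plain dict of numpy arrays standing in for the Buffer class and a stubbed-out loss in place of _update_batch; both stand-ins are hypothetical, not ML-Agents API, and the sequence-aware shuffling a recurrent policy would need is ignored.

import numpy as np
from typing import Dict, List

def run_epochs(update_buffer: Dict[str, np.ndarray], num_sequences: int, num_epoch: int) -> float:
    losses: List[float] = []
    n = len(update_buffer["actions"])
    for _ in range(num_epoch):
        perm = np.random.permutation(n)  # analogous to update_buffer.shuffle()
        shuffled = {key: value[perm] for key, value in update_buffer.items()}
        for i in range(n // num_sequences):  # same slicing as make_mini_batch(start, end)
            start, end = i * num_sequences, (i + 1) * num_sequences
            mini_batch = {key: value[start:end] for key, value in shuffled.items()}
            losses.append(float(np.mean(mini_batch["actions"])))  # stand-in for _update_batch
    return float(np.mean(losses))

# Example: 12 fake "actions" split into mini-batches of 3 over 2 epochs.
print(run_epochs({"actions": np.arange(12, dtype=np.float32)}, num_sequences=3, num_epoch=2))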
def __init__(self, sess, brain, trainer_parameters, training, seed, run_id): """ Responsible for collecting experiences and training a behavioral cloning model. :param sess: Tensorflow session. :param trainer_parameters: The parameters for the trainer (dictionary). :param training: Whether the trainer is set for training. """ super(BehavioralCloningTrainer, self).__init__(sess, brain, trainer_parameters, training, run_id) self.param_keys = ['brain_to_imitate', 'batch_size', 'time_horizon', 'graph_scope', 'summary_freq', 'max_steps', 'batches_per_epoch', 'use_recurrent', 'hidden_units', 'learning_rate', 'num_layers', 'sequence_length', 'memory_size'] for k in self.param_keys: if k not in trainer_parameters: raise UnityTrainerException("The hyperparameter {0} could not be found for the Imitation trainer of " "brain {1}.".format(k, brain.brain_name)) self.policy = BCPolicy(seed, brain, trainer_parameters, sess) self.brain_name = brain.brain_name self.brain_to_imitate = trainer_parameters['brain_to_imitate'] self.batches_per_epoch = trainer_parameters['batches_per_epoch'] self.n_sequences = max(int(trainer_parameters['batch_size'] / self.policy.sequence_length), 1) self.cumulative_rewards = {} self.episode_steps = {} self.stats = {'losses': [], 'episode_length': [], 'cumulative_reward': []} self.training_buffer = Buffer() self.summary_path = trainer_parameters['summary_path'] if not os.path.exists(self.summary_path): os.makedirs(self.summary_path) self.summary_writer = tf.summary.FileWriter(self.summary_path)
def __init__(self, brain, trainer_parameters, training, load, seed, run_id): """ Responsible for collecting experiences and training a behavioral cloning model. :param trainer_parameters: The parameters for the trainer (dictionary). :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with :param run_id: The identifier of the current run """ super(BCTrainer, self).__init__(brain, trainer_parameters, training, run_id) self.policy = BCPolicy(seed, brain, trainer_parameters, load) self.n_sequences = 1 self.cumulative_rewards = {} self.episode_steps = {} self.stats = { "Losses/Cloning Loss": [], "Environment/Episode Length": [], "Environment/Cumulative Reward": [], } self.batches_per_epoch = trainer_parameters["batches_per_epoch"] self.demonstration_buffer = Buffer() self.evaluation_buffer = Buffer()
def __init__(self, brain, trainer_parameters, training, load, seed, run_id): """ Responsible for collecting experiences and training a behavioral cloning model. :param trainer_parameters: The parameters for the trainer (dictionary). :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with :param run_id: The identifier of the current run """ super(BCTrainer, self).__init__(brain, trainer_parameters, training, run_id) self.policy = BCPolicy(seed, brain, trainer_parameters, load) self.n_sequences = 1 self.cumulative_rewards = {} self.episode_steps = {} self.stats = {'Losses/Cloning Loss': [], 'Environment/Episode Length': [], 'Environment/Cumulative Reward': []} self.summary_path = trainer_parameters['summary_path'] self.batches_per_epoch = trainer_parameters['batches_per_epoch'] if not os.path.exists(self.summary_path): os.makedirs(self.summary_path) self.demonstration_buffer = Buffer() self.evaluation_buffer = Buffer() self.summary_writer = tf.summary.FileWriter(self.summary_path)
def test_buffer(): b = Buffer() for fake_agent_id in range(4): for step in range(9): b[fake_agent_id]['vector_observation'].append( [100 * fake_agent_id + 10 * step + 1, 100 * fake_agent_id + 10 * step + 2, 100 * fake_agent_id + 10 * step + 3] ) b[fake_agent_id]['action'].append([100 * fake_agent_id + 10 * step + 4, 100 * fake_agent_id + 10 * step + 5]) a = b[1]['vector_observation'].get_batch(batch_size=2, training_length=1, sequential=True) assert_array(a, np.array([[171, 172, 173], [181, 182, 183]])) a = b[2]['vector_observation'].get_batch(batch_size=2, training_length=3, sequential=True) assert_array(a, np.array([ [[231, 232, 233], [241, 242, 243], [251, 252, 253]], [[261, 262, 263], [271, 272, 273], [281, 282, 283]] ])) a = b[2]['vector_observation'].get_batch(batch_size=2, training_length=3, sequential=False) assert_array(a, np.array([ [[251, 252, 253], [261, 262, 263], [271, 272, 273]], [[261, 262, 263], [271, 272, 273], [281, 282, 283]] ])) b[4].reset_agent() assert len(b[4]) == 0 b.append_update_buffer(3, batch_size=None, training_length=2) b.append_update_buffer(2, batch_size=None, training_length=2) assert len(b.update_buffer['action']) == 10 assert np.array(b.update_buffer['action']).shape == (10, 2, 2) c = b.update_buffer.make_mini_batch(start=0, end=1) assert c.keys() == b.update_buffer.keys() assert c['action'].shape == (1, 2, 2)
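For reference, the calls exercised by test_buffer above are the same ones the trainers below rely on: per-agent appends, get_batch for trailing windows, append_update_buffer to flush an agent's history into the shared update buffer, and make_mini_batch for training slices. A small self-contained sketch, assuming Buffer is importable from mlagents.trainers.buffer (the values are made up for illustration):

import numpy as np
from mlagents.trainers.buffer import Buffer  # assumed import path

b = Buffer()
for step in range(6):
    b[0]["vector_observation"].append([float(step), float(step) + 0.5])
    b[0]["action"].append([float(step)])

# Trailing window of the last two observations for agent 0.
last_two = b[0]["vector_observation"].get_batch(batch_size=2, training_length=1, sequential=True)
print(np.array(last_two).shape)  # (2, 2)

# Flush agent 0's local history into the shared update buffer as length-3 sequences,
# then draw a mini-batch the way the update loops below do.
b.append_update_buffer(0, batch_size=None, training_length=3)
mini_batch = b.update_buffer.make_mini_batch(start=0, end=1)
print(sorted(mini_batch.keys()))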
def __init__(self, sess, brain, reward_buff_cap, trainer_parameters, training, seed, run_id): """ Responsible for collecting experiences and training PPO model. :param sess: Tensorflow session. :param trainer_parameters: The parameters for the trainer (dictionary). :param training: Whether the trainer is set for training. """ super(PPOTrainer, self).__init__(sess, brain.brain_name, trainer_parameters, training, run_id) self.param_keys = [ 'batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units', 'lambd', 'learning_rate', 'max_steps', 'normalize', 'num_epoch', 'num_layers', 'time_horizon', 'sequence_length', 'summary_freq', 'use_recurrent', 'graph_scope', 'summary_path', 'memory_size', 'use_curiosity', 'curiosity_strength', 'curiosity_enc_size' ] for k in self.param_keys: if k not in trainer_parameters: raise UnityTrainerException( "The hyperparameter {0} could not be found for the PPO trainer of " "brain {1}.".format(k, brain.brain_name)) self.use_curiosity = bool(trainer_parameters['use_curiosity']) self.step = 0 self.policy = PPOPolicy(seed, brain, trainer_parameters, sess, self.is_training) stats = { 'cumulative_reward': [], 'episode_length': [], 'value_estimate': [], 'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': [] } if self.use_curiosity: stats['forward_loss'] = [] stats['inverse_loss'] = [] stats['intrinsic_reward'] = [] self.intrinsic_rewards = {} self.stats = stats self.training_buffer = Buffer() self.training_buffer2 = Buffer() self.cumulative_rewards = {} self._reward_buffer = deque(maxlen=reward_buff_cap) self.episode_steps = {} self.summary_path = trainer_parameters['summary_path'] if not os.path.exists(self.summary_path): os.makedirs(self.summary_path) self.summary_writer = tf.summary.FileWriter(self.summary_path)
def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id): """ Responsible for collecting experiences and training PPO model. :param trainer_parameters: The parameters for the trainer (dictionary). :param reward_buff_cap: Max reward history to track in the reward buffer :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with :param run_id: The identifier of the current run """ super().__init__(brain, trainer_parameters, training, run_id, reward_buff_cap) self.param_keys = [ "batch_size", "beta", "buffer_size", "epsilon", "hidden_units", "lambd", "learning_rate", "max_steps", "normalize", "num_epoch", "num_layers", "time_horizon", "sequence_length", "summary_freq", "use_recurrent", "summary_path", "memory_size", "model_path", "reward_signals", ] self.check_param_keys() # Make sure we have at least one reward_signal if not self.trainer_parameters["reward_signals"]: raise UnityTrainerException( "No reward signals were defined. At least one must be used with {}." .format(self.__class__.__name__)) self.step = 0 self.policy = PPOPolicy(seed, brain, trainer_parameters, self.is_training, load) stats = defaultdict(list) # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward # used for reporting only. We always want to report the environment reward to Tensorboard, regardless # of what reward signals are actually present. self.collected_rewards = {"environment": {}} for _reward_signal in self.policy.reward_signals.keys(): self.collected_rewards[_reward_signal] = {} self.stats = stats self.training_buffer = Buffer() self.episode_steps = {}
def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id): """ Responsible for collecting experiences and training PPO model. :param trainer_parameters: The parameters for the trainer (dictionary). :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with :param run_id: The identifier of the current run """ super(PPOTrainer, self).__init__(brain, trainer_parameters, training, run_id) self.param_keys = [ 'batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units', 'lambd', 'learning_rate', 'max_steps', 'normalize', 'num_epoch', 'num_layers', 'time_horizon', 'sequence_length', 'summary_freq', 'use_recurrent', 'summary_path', 'memory_size', 'use_curiosity', 'curiosity_strength', 'curiosity_enc_size', 'model_path' ] self.check_param_keys() self.use_curiosity = bool(trainer_parameters['use_curiosity']) self.step = 0 self.policy = PPOPolicy(seed, brain, trainer_parameters, self.is_training, load) stats = { 'Environment/Cumulative Reward': [], 'Environment/Episode Length': [], 'Policy/Value Estimate': [], 'Policy/Entropy': [], 'Losses/Value Loss': [], 'Losses/Policy Loss': [], 'Policy/Learning Rate': [] } if self.use_curiosity: stats['Losses/Forward Loss'] = [] stats['Losses/Inverse Loss'] = [] stats['Policy/Curiosity Reward'] = [] self.intrinsic_rewards = {} self.stats = stats self.training_buffer = Buffer() self.cumulative_rewards = {} self._reward_buffer = deque(maxlen=reward_buff_cap) self.episode_steps = {} self.summary_path = trainer_parameters['summary_path'] if not os.path.exists(self.summary_path): os.makedirs(self.summary_path) self.summary_writer = tf.summary.FileWriter(self.summary_path)
def __init__(self, *args, **kwargs): super(RLTrainer, self).__init__(*args, **kwargs) # Make sure we have at least one reward_signal if not self.trainer_parameters["reward_signals"]: raise UnityTrainerException( "No reward signals were defined. At least one must be used with {}." .format(self.__class__.__name__)) # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward # used for reporting only. We always want to report the environment reward to Tensorboard, regardless # of what reward signals are actually present. self.collected_rewards = {"environment": {}} self.training_buffer = Buffer() self.episode_steps = {}
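The reward_signals check above only requires that trainer_parameters carry a non-empty mapping under that key. A hypothetical fragment that would satisfy it, with the "extrinsic" entry mirroring the usual ML-Agents defaults (the exact keys you need depend on which reward signals are enabled):

trainer_parameters = {
    "reward_signals": {
        "extrinsic": {"strength": 1.0, "gamma": 0.99},
    },
    # ... remaining trainer hyperparameters ...
}
assert trainer_parameters["reward_signals"]  # non-empty, so no UnityTrainerException is raised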
def make_demo_buffer(brain_infos, brain_params, sequence_length): # Create and populate buffer using experiences demo_buffer = Buffer() for idx, experience in enumerate(brain_infos): if idx > len(brain_infos) - 2: break current_brain_info = brain_infos[idx] next_brain_info = brain_infos[idx + 1] demo_buffer[0].last_brain_info = current_brain_info demo_buffer[0]['done'].append(next_brain_info.local_done[0]) demo_buffer[0]['rewards'].append(next_brain_info.rewards[0]) for i in range(brain_params.number_visual_observations): demo_buffer[0]['visual_obs%d' % i] \ .append(current_brain_info.visual_observations[i][0]) if brain_params.vector_observation_space_size > 0: demo_buffer[0]['vector_obs'] \ .append(current_brain_info.vector_observations[0]) demo_buffer[0]['actions'].append(next_brain_info.previous_vector_actions[0]) if next_brain_info.local_done[0]: demo_buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length) demo_buffer.reset_local_buffers() demo_buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length) return demo_buffer
def make_demo_buffer( brain_infos: List[BrainInfo], brain_params: BrainParameters, sequence_length: int ) -> Buffer: # Create and populate buffer using experiences demo_buffer = Buffer() for idx, experience in enumerate(brain_infos): if idx > len(brain_infos) - 2: break current_brain_info = brain_infos[idx] next_brain_info = brain_infos[idx + 1] demo_buffer[0].last_brain_info = current_brain_info demo_buffer[0]["done"].append(next_brain_info.local_done[0]) demo_buffer[0]["rewards"].append(next_brain_info.rewards[0]) for i in range(brain_params.number_visual_observations): demo_buffer[0]["visual_obs%d" % i].append( current_brain_info.visual_observations[i][0] ) if brain_params.vector_observation_space_size > 0: demo_buffer[0]["vector_obs"].append( current_brain_info.vector_observations[0] ) demo_buffer[0]["actions"].append(next_brain_info.previous_vector_actions[0]) demo_buffer[0]["prev_action"].append( current_brain_info.previous_vector_actions[0] ) if next_brain_info.local_done[0]: demo_buffer.append_update_buffer( 0, batch_size=None, training_length=sequence_length ) demo_buffer.reset_local_buffers() demo_buffer.append_update_buffer( 0, batch_size=None, training_length=sequence_length ) return demo_buffer
def create_buffer(brain_infos, brain_params, sequence_length, memory_size=8): buffer = Buffer() # Make a buffer for idx, experience in enumerate(brain_infos): if idx > len(brain_infos) - 2: break current_brain_info = brain_infos[idx] next_brain_info = brain_infos[idx + 1] buffer[0].last_brain_info = current_brain_info buffer[0]["done"].append(next_brain_info.local_done[0]) buffer[0]["rewards"].append(next_brain_info.rewards[0]) for i in range(brain_params.number_visual_observations): buffer[0]["visual_obs%d" % i].append( current_brain_info.visual_observations[i][0]) buffer[0]["next_visual_obs%d" % i].append( current_brain_info.visual_observations[i][0]) if brain_params.vector_observation_space_size > 0: buffer[0]["vector_obs"].append( current_brain_info.vector_observations[0]) buffer[0]["next_vector_in"].append( current_brain_info.vector_observations[0]) fake_action_size = len(brain_params.vector_action_space_size) if brain_params.vector_action_space_type == "continuous": fake_action_size = brain_params.vector_action_space_size[0] buffer[0]["actions"].append(np.zeros(fake_action_size)) buffer[0]["prev_action"].append(np.zeros(fake_action_size)) buffer[0]["masks"].append(1.0) buffer[0]["advantages"].append(1.0) if brain_params.vector_action_space_type == "discrete": buffer[0]["action_probs"].append( np.ones(sum(brain_params.vector_action_space_size))) else: buffer[0]["action_probs"].append( np.ones(buffer[0]["actions"][0].shape)) buffer[0]["actions_pre"].append(np.ones(buffer[0]["actions"][0].shape)) buffer[0]["random_normal_epsilon"].append( np.ones(buffer[0]["actions"][0].shape)) buffer[0]["action_mask"].append( np.ones(np.sum(brain_params.vector_action_space_size))) buffer[0]["memory"].append(np.ones(memory_size)) buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length) return buffer
def construct_fake_buffer(): b = Buffer() for fake_agent_id in range(4): for step in range(9): b[fake_agent_id]["vector_observation"].append([ 100 * fake_agent_id + 10 * step + 1, 100 * fake_agent_id + 10 * step + 2, 100 * fake_agent_id + 10 * step + 3, ]) b[fake_agent_id]["action"].append([ 100 * fake_agent_id + 10 * step + 4, 100 * fake_agent_id + 10 * step + 5, ]) return b
def make_demo_buffer( pair_infos: List[AgentInfoActionPairProto], brain_params: BrainParameters, sequence_length: int, ) -> Buffer: # Create and populate buffer using experiences demo_buffer = Buffer() for idx, experience in enumerate(pair_infos): if idx > len(pair_infos) - 2: break current_pair_info = pair_infos[idx] next_pair_info = pair_infos[idx + 1] current_brain_info = BrainInfo.from_agent_proto( 0, [current_pair_info.agent_info], brain_params) next_brain_info = BrainInfo.from_agent_proto( 0, [next_pair_info.agent_info], brain_params) previous_action = np.array( pair_infos[idx].action_info.vector_actions) * 0 if idx > 0: previous_action = np.array( pair_infos[idx - 1].action_info.vector_actions) demo_buffer[0].last_brain_info = current_brain_info demo_buffer[0]["done"].append(next_brain_info.local_done[0]) demo_buffer[0]["rewards"].append(next_brain_info.rewards[0]) for i in range(brain_params.number_visual_observations): demo_buffer[0]["visual_obs%d" % i].append( current_brain_info.visual_observations[i][0]) if brain_params.vector_observation_space_size > 0: demo_buffer[0]["vector_obs"].append( current_brain_info.vector_observations[0]) demo_buffer[0]["actions"].append( current_pair_info.action_info.vector_actions) demo_buffer[0]["prev_action"].append(previous_action) if next_brain_info.local_done[0]: demo_buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length) demo_buffer.reset_local_buffers() demo_buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length) return demo_buffer
class RLTrainer(Trainer): """ This class is the base class for trainers that use Reward Signals. Contains methods for adding BrainInfos to the Buffer. """ def __init__(self, *args, **kwargs): super(RLTrainer, self).__init__(*args, **kwargs) self.step = 0 # Make sure we have at least one reward_signal if not self.trainer_parameters["reward_signals"]: raise UnityTrainerException( "No reward signals were defined. At least one must be used with {}." .format(self.__class__.__name__)) # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward # used for reporting only. We always want to report the environment reward to Tensorboard, regardless # of what reward signals are actually present. self.collected_rewards = {"environment": {}} self.training_buffer = Buffer() self.episode_steps = {} def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo: """ Constructs a BrainInfo which contains the most recent previous experiences for all agents which correspond to the agents in a provided next_info. :BrainInfo next_info: A t+1 BrainInfo. :return: curr_info: Reconstructed BrainInfo to match agents of next_info. """ visual_observations: List[List[Any]] = [ [] ] # TODO add types to brain.py methods vector_observations = [] text_observations = [] memories = [] rewards = [] local_dones = [] max_reacheds = [] agents = [] prev_vector_actions = [] prev_text_actions = [] action_masks = [] for agent_id in next_info.agents: agent_brain_info = self.training_buffer[agent_id].last_brain_info if agent_brain_info is None: agent_brain_info = next_info agent_index = agent_brain_info.agents.index(agent_id) for i in range(len(next_info.visual_observations)): visual_observations[i].append( agent_brain_info.visual_observations[i][agent_index]) vector_observations.append( agent_brain_info.vector_observations[agent_index]) text_observations.append( agent_brain_info.text_observations[agent_index]) if self.policy.use_recurrent: if len(agent_brain_info.memories) > 0: memories.append(agent_brain_info.memories[agent_index]) else: memories.append(self.policy.make_empty_memory(1)) rewards.append(agent_brain_info.rewards[agent_index]) local_dones.append(agent_brain_info.local_done[agent_index]) max_reacheds.append(agent_brain_info.max_reached[agent_index]) agents.append(agent_brain_info.agents[agent_index]) prev_vector_actions.append( agent_brain_info.previous_vector_actions[agent_index]) prev_text_actions.append( agent_brain_info.previous_text_actions[agent_index]) action_masks.append(agent_brain_info.action_masks[agent_index]) if self.policy.use_recurrent: memories = np.vstack(memories) curr_info = BrainInfo( visual_observations, vector_observations, text_observations, memories, rewards, agents, local_dones, prev_vector_actions, prev_text_actions, max_reacheds, action_masks, ) return curr_info def add_experiences( self, curr_all_info: AllBrainInfo, next_all_info: AllBrainInfo, take_action_outputs: ActionInfoOutputs, ) -> None: """ Adds experiences to each agent's experience history. :param curr_all_info: Dictionary of all current brains and corresponding BrainInfo. :param next_all_info: Dictionary of all current brains and corresponding BrainInfo. :param take_action_outputs: The outputs of the Policy's get_action method. 
""" self.trainer_metrics.start_experience_collection_timer() if take_action_outputs: self.stats["Policy/Entropy"].append( take_action_outputs["entropy"].mean()) self.stats["Policy/Learning Rate"].append( take_action_outputs["learning_rate"]) for name, signal in self.policy.reward_signals.items(): self.stats[signal.value_name].append( np.mean(take_action_outputs["value_heads"][name])) curr_info = curr_all_info[self.brain_name] next_info = next_all_info[self.brain_name] for agent_id in curr_info.agents: self.training_buffer[agent_id].last_brain_info = curr_info self.training_buffer[ agent_id].last_take_action_outputs = take_action_outputs if curr_info.agents != next_info.agents: curr_to_use = self.construct_curr_info(next_info) else: curr_to_use = curr_info # Evaluate and store the reward signals tmp_reward_signal_outs = {} for name, signal in self.policy.reward_signals.items(): tmp_reward_signal_outs[name] = signal.evaluate( curr_to_use, next_info) # Store the environment reward tmp_environment = np.array(next_info.rewards) rewards_out = AllRewardsOutput(reward_signals=tmp_reward_signal_outs, environment=tmp_environment) for agent_id in next_info.agents: stored_info = self.training_buffer[agent_id].last_brain_info stored_take_action_outputs = self.training_buffer[ agent_id].last_take_action_outputs if stored_info is not None: idx = stored_info.agents.index(agent_id) next_idx = next_info.agents.index(agent_id) if not stored_info.local_done[idx]: for i, _ in enumerate(stored_info.visual_observations): self.training_buffer[agent_id][ "visual_obs%d" % i].append( stored_info.visual_observations[i][idx]) self.training_buffer[agent_id][ "next_visual_obs%d" % i].append( next_info.visual_observations[i][next_idx]) if self.policy.use_vec_obs: self.training_buffer[agent_id]["vector_obs"].append( stored_info.vector_observations[idx]) self.training_buffer[agent_id][ "next_vector_in"].append( next_info.vector_observations[next_idx]) if self.policy.use_recurrent: if stored_info.memories.shape[1] == 0: stored_info.memories = np.zeros( (len(stored_info.agents), self.policy.m_size)) self.training_buffer[agent_id]["memory"].append( stored_info.memories[idx]) self.training_buffer[agent_id]["masks"].append(1.0) self.training_buffer[agent_id]["done"].append( next_info.local_done[next_idx]) # Add the outputs of the last eval self.add_policy_outputs(stored_take_action_outputs, agent_id, idx) # Store action masks if neccessary if not self.policy.use_continuous_act: self.training_buffer[agent_id]["action_mask"].append( stored_info.action_masks[idx], padding_value=1) self.training_buffer[agent_id]["prev_action"].append( stored_info.previous_vector_actions[idx]) values = stored_take_action_outputs["value_heads"] # Add the value outputs if needed self.add_rewards_outputs(rewards_out, values, agent_id, idx, next_idx) for name, rewards in self.collected_rewards.items(): if agent_id not in rewards: rewards[agent_id] = 0 if name == "environment": # Report the reward from the environment rewards[agent_id] += rewards_out.environment[ next_idx] else: # Report the reward signals rewards[agent_id] += rewards_out.reward_signals[ name].scaled_reward[next_idx] if not next_info.local_done[next_idx]: if agent_id not in self.episode_steps: self.episode_steps[agent_id] = 0 self.episode_steps[agent_id] += 1 self.trainer_metrics.end_experience_collection_timer() def end_episode(self) -> None: """ A signal that the Episode has ended. The buffer must be reset. Get only called when the academy resets. 
""" self.training_buffer.reset_local_buffers() for agent_id in self.episode_steps: self.episode_steps[agent_id] = 0 for rewards in self.collected_rewards.values(): for agent_id in rewards: rewards[agent_id] = 0 def add_policy_outputs(self, take_action_outputs: ActionInfoOutputs, agent_id: str, agent_idx: int) -> None: """ Takes the output of the last action and store it into the training buffer. We break this out from add_experiences since it is very highly dependent on the type of trainer. :param take_action_outputs: The outputs of the Policy's get_action method. :param agent_id: the Agent we're adding to. :param agent_idx: the index of the Agent agent_id """ raise UnityTrainerException( "The process_experiences method was not implemented.") def add_rewards_outputs( self, rewards_out: AllRewardsOutput, values: Dict[str, np.ndarray], agent_id: str, agent_idx: int, agent_next_idx: int, ) -> None: """ Takes the value and evaluated rewards output of the last action and store it into the training buffer. We break this out from add_experiences since it is very highly dependent on the type of trainer. :param take_action_outputs: The outputs of the Policy's get_action method. :param rewards_dict: Dict of rewards after evaluation :param agent_id: the Agent we're adding to. :param agent_idx: the index of the Agent agent_id in the current brain info :param agent_next_idx: the index of the Agent agent_id in the next brain info """ raise UnityTrainerException( "The process_experiences method was not implemented.")
class PPOTrainer(Trainer): """The PPOTrainer is an implementation of the PPO algorithm.""" def __init__(self, sess, brain, reward_buff_cap, trainer_parameters, training, seed, run_id): """ Responsible for collecting experiences and training PPO model. :param sess: Tensorflow session. :param trainer_parameters: The parameters for the trainer (dictionary). :param training: Whether the trainer is set for training. """ super(PPOTrainer, self).__init__(sess, brain.brain_name, trainer_parameters, training, run_id) self.param_keys = [ 'batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units', 'lambd', 'learning_rate', 'max_steps', 'normalize', 'num_epoch', 'num_layers', 'time_horizon', 'sequence_length', 'summary_freq', 'use_recurrent', 'graph_scope', 'summary_path', 'memory_size', 'use_curiosity', 'curiosity_strength', 'curiosity_enc_size' ] for k in self.param_keys: if k not in trainer_parameters: raise UnityTrainerException( "The hyperparameter {0} could not be found for the PPO trainer of " "brain {1}.".format(k, brain.brain_name)) self.use_curiosity = bool(trainer_parameters['use_curiosity']) self.step = 0 self.policy = PPOPolicy(seed, brain, trainer_parameters, sess, self.is_training) stats = { 'cumulative_reward': [], 'episode_length': [], 'value_estimate': [], 'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': [] } if self.use_curiosity: stats['forward_loss'] = [] stats['inverse_loss'] = [] stats['intrinsic_reward'] = [] self.intrinsic_rewards = {} self.stats = stats self.training_buffer = Buffer() self.training_buffer2 = Buffer() self.cumulative_rewards = {} self._reward_buffer = deque(maxlen=reward_buff_cap) self.episode_steps = {} self.summary_path = trainer_parameters['summary_path'] if not os.path.exists(self.summary_path): os.makedirs(self.summary_path) self.summary_writer = tf.summary.FileWriter(self.summary_path) def __str__(self): return '''Hyperparameters for the PPO Trainer of brain {0}: \n{1}'''.format( self.brain_name, '\n'.join([ '\t{0}:\t{1}'.format(x, self.trainer_parameters[x]) for x in self.param_keys ])) @property def parameters(self): """ Returns the trainer parameters of the trainer. """ return self.trainer_parameters @property def get_max_steps(self): """ Returns the maximum number of steps. Is used to know when the trainer should be stopped. :return: The maximum number of steps of the trainer """ return float(self.trainer_parameters['max_steps']) @property def get_step(self): """ Returns the number of steps the trainer has performed :return: the step count of the trainer """ return self.step @property def reward_buffer(self): """ Returns the reward buffer. The reward buffer contains the cumulative rewards of the most recent episodes completed by agents using this trainer. :return: the reward buffer. """ return self._reward_buffer def increment_step_and_update_last_reward(self): """ Increment the step count of the trainer and Updates the last reward """ if len(self.stats['cumulative_reward']) > 0: mean_reward = np.mean(self.stats['cumulative_reward']) self.policy.update_reward(mean_reward) self.policy.increment_step() self.step = self.policy.get_current_step() def take_action(self, all_brain_info: AllBrainInfo): """ Decides actions given observations information, and takes them in environment. :param all_brain_info: A dictionary of brain names and BrainInfo from environment. 
:return: a tuple containing action, memories, values and an object to be passed to add experiences """ curr_brain_info = all_brain_info[self.brain_name] if len(curr_brain_info.agents) == 0: return [], [], [], None, None run_out = self.policy.evaluate(curr_brain_info) self.stats['value_estimate'].append(run_out['value'].mean()) self.stats['entropy'].append(run_out['entropy'].mean()) self.stats['learning_rate'].append(run_out['learning_rate']) if self.policy.use_recurrent: return run_out['action'], run_out['memory_out'], None, \ run_out['value'], run_out else: return run_out['action'], None, None, run_out['value'], run_out def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo: """ Constructs a BrainInfo which contains the most recent previous experiences for all agents which correspond to the agents in a provided next_info. :BrainInfo next_info: A t+1 BrainInfo. :return: curr_info: Reconstructed BrainInfo to match agents of next_info. """ visual_observations = [[]] vector_observations = [] text_observations = [] memories = [] rewards = [] local_dones = [] max_reacheds = [] agents = [] prev_vector_actions = [] prev_text_actions = [] for agent_id in next_info.agents: agent_brain_info = self.training_buffer[agent_id].last_brain_info if agent_brain_info is None: agent_brain_info = next_info agent_index = agent_brain_info.agents.index(agent_id) for i in range(len(next_info.visual_observations)): visual_observations[i].append( agent_brain_info.visual_observations[i][agent_index]) vector_observations.append( agent_brain_info.vector_observations[agent_index]) text_observations.append( agent_brain_info.text_observations[agent_index]) if self.policy.use_recurrent: if len(agent_brain_info.memories) > 0: memories.append(agent_brain_info.memories[agent_index]) else: memories.append(self.policy.make_empty_memory(1)) rewards.append(agent_brain_info.rewards[agent_index]) local_dones.append(agent_brain_info.local_done[agent_index]) max_reacheds.append(agent_brain_info.max_reached[agent_index]) agents.append(agent_brain_info.agents[agent_index]) prev_vector_actions.append( agent_brain_info.previous_vector_actions[agent_index]) prev_text_actions.append( agent_brain_info.previous_text_actions[agent_index]) if self.policy.use_recurrent: memories = np.vstack(memories) curr_info = BrainInfo(visual_observations, vector_observations, text_observations, memories, rewards, agents, local_dones, prev_vector_actions, prev_text_actions, max_reacheds) return curr_info def construct_curr_info2(self, next_info: BrainInfo) -> BrainInfo: """ Constructs a BrainInfo which contains the most recent previous experiences for all agents which correspond to the agents in a provided next_info. :BrainInfo next_info: A t+1 BrainInfo. :return: curr_info: Reconstructed BrainInfo to match agents of next_info. 
""" visual_observations = [[]] vector_observations2 = [] text_observations = [] memories = [] rewards = [] local_dones = [] max_reacheds = [] agents = [] prev_vector_actions = [] prev_text_actions = [] for agent_id in next_info.agents: agent_brain_info = self.training_buffer[agent_id].last_brain_info agent_brain_info2 = next_info agent_index1 = agent_brain_info.agents.index(agent_id) agent_index2 = agent_brain_info2.agents.index(agent_id) for i in range(len(next_info.visual_observations)): visual_observations[i].append( agent_brain_info.visual_observations[i][agent_index]) vector_observations2.append( agent_brain_info.vector_observations[agent_index1], agent_brain_info2.vector_observations[agent_index2]) text_observations.append( agent_brain_info.text_observations[agent_index]) if self.policy.use_recurrent: if len(agent_brain_info.memories > 0): memories.append(agent_brain_info.memories[agent_index]) else: memories.append(self.policy.make_empty_memory(1)) rewards.append(agent_brain_info.rewards[agent_index]) local_dones.append(agent_brain_info.local_done[agent_index]) max_reacheds.append(agent_brain_info.max_reached[agent_index]) agents.append(agent_brain_info.agents[agent_index]) prev_vector_actions.append( agent_brain_info.previous_vector_actions[agent_index]) prev_text_actions.append( agent_brain_info.previous_text_actions[agent_index]) if self.policy.use_recurrent: memories = np.vstack(memories) curr_info2 = BrainInfo(visual_observations, vector_observations2, text_observations, memories, rewards, agents, local_dones, prev_vector_actions, prev_text_actions, max_reacheds) return curr_info2 def add_experiences(self, curr_all_info: AllBrainInfo, next_all_info: AllBrainInfo, take_action_outputs): """ Adds experiences to each agent's experience history. :param curr_all_info: Dictionary of all current brains and corresponding BrainInfo. :param next_all_info: Dictionary of all current brains and corresponding BrainInfo. :param take_action_outputs: The outputs of the take action method. 
""" curr_info = curr_all_info[self.brain_name] curr_info2 = curr_all_info[self.brain_name] next_info = next_all_info[self.brain_name] for agent_id in curr_info.agents: self.training_buffer[agent_id].last_brain_info = curr_info self.training_buffer[ agent_id].last_take_action_outputs = take_action_outputs for agent_id in curr_info2.agents: self.training_buffer2[agent_id].last_brain_info = curr_info2 self.training_buffer2[ agent_id].last_take_action_outputs = take_action_outputs if curr_info.agents != next_info.agents: curr_to_use = self.construct_curr_info(next_info) else: curr_to_use = curr_info intrinsic_rewards = self.policy.get_intrinsic_rewards( curr_to_use, next_info) for agent_id in next_info.agents: stored_info = self.training_buffer[agent_id].last_brain_info stored_take_action_outputs = self.training_buffer[ agent_id].last_take_action_outputs stored_take_action_outputs2 = self.training_buffer2[ agent_id].last_take_action_outputs if stored_info is not None: idx = stored_info.agents.index(agent_id) next_idx = next_info.agents.index(agent_id) if not stored_info.local_done[idx]: for i, _ in enumerate(stored_info.visual_observations): self.training_buffer[agent_id][ 'visual_obs%d' % i].append( stored_info.visual_observations[i][idx]) self.training_buffer[agent_id][ 'next_visual_obs%d' % i].append( next_info.visual_observations[i][next_idx]) if self.policy.use_vec_obs: self.training_buffer[agent_id]['vector_obs'].append( stored_info.vector_observations[idx]) self.training_buffer[agent_id][ 'next_vector_in'].append( next_info.vector_observations[next_idx]) if self.policy.use_recurrent: if stored_info.memories.shape[1] == 0: stored_info.memories = np.zeros( (len(stored_info.agents), self.policy.m_size)) self.training_buffer[agent_id]['memory'].append( stored_info.memories[idx]) actions = stored_take_action_outputs['action'] if self.policy.use_continuous_act: actions_pre = stored_take_action_outputs['pre_action'] self.training_buffer[agent_id]['actions_pre'].append( actions_pre[idx]) else: self.training_buffer[agent_id]['action_mask'].append( stored_info.action_masks[idx]) a_dist = stored_take_action_outputs['log_probs'] value1 = stored_take_action_outputs['value'] value2 = stored_take_action_outputs2['value'] if sum(value1) > sum(value2): value = value1 else: value = value2 self.training_buffer[agent_id]['actions'].append( actions[idx]) self.training_buffer[agent_id]['prev_action'].append( stored_info.previous_vector_actions[idx]) self.training_buffer[agent_id]['masks'].append(1.0) if self.use_curiosity: self.training_buffer[agent_id]['rewards'].append( next_info.rewards[next_idx] + intrinsic_rewards[next_idx]) else: self.training_buffer[agent_id]['rewards'].append( next_info.rewards[next_idx]) self.training_buffer[agent_id]['action_probs'].append( a_dist[idx]) self.training_buffer[agent_id]['value_estimates'].append( value[idx][0]) if agent_id not in self.cumulative_rewards: self.cumulative_rewards[agent_id] = 0 self.cumulative_rewards[agent_id] += next_info.rewards[ next_idx] if self.use_curiosity: if agent_id not in self.intrinsic_rewards: self.intrinsic_rewards[agent_id] = 0 self.intrinsic_rewards[agent_id] += intrinsic_rewards[ next_idx] if not next_info.local_done[next_idx]: if agent_id not in self.episode_steps: self.episode_steps[agent_id] = 0 self.episode_steps[agent_id] += 1 def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo): """ Checks agent histories for processing condition, and processes them as necessary. 
Processing involves calculating value and advantage targets for model updating step. :param current_info: Dictionary of all current brains and corresponding BrainInfo. :param new_info: Dictionary of all next brains and corresponding BrainInfo. """ info = new_info[self.brain_name] for l in range(len(info.agents)): agent_actions = self.training_buffer[info.agents[l]]['actions'] if ((info.local_done[l] or len(agent_actions) > self.trainer_parameters['time_horizon']) and len(agent_actions) > 0): agent_id = info.agents[l] if info.local_done[l] and not info.max_reached[l]: value_next = 0.0 else: if info.max_reached[l]: bootstrapping_info = self.training_buffer[ agent_id].last_brain_info idx = bootstrapping_info.agents.index(agent_id) else: bootstrapping_info = info idx = l value_next = self.policy.get_value_estimate( bootstrapping_info, idx) self.training_buffer[agent_id]['advantages'].set( get_gae(rewards=self.training_buffer[agent_id] ['rewards'].get_batch(), value_estimates=self.training_buffer[agent_id] ['value_estimates'].get_batch(), value_next=value_next, gamma=self.trainer_parameters['gamma'], lambd=self.trainer_parameters['lambd'])) self.training_buffer[agent_id]['discounted_returns'].set( self.training_buffer[agent_id]['advantages'].get_batch() + self.training_buffer[agent_id] ['value_estimates'].get_batch()) self.training_buffer.append_update_buffer( agent_id, batch_size=None, training_length=self.policy.sequence_length) self.training_buffer[agent_id].reset_agent() if info.local_done[l]: self.stats['cumulative_reward'].append( self.cumulative_rewards.get(agent_id, 0)) self.reward_buffer.appendleft( self.cumulative_rewards.get(agent_id, 0)) self.stats['episode_length'].append( self.episode_steps.get(agent_id, 0)) self.cumulative_rewards[agent_id] = 0 self.episode_steps[agent_id] = 0 if self.use_curiosity: self.stats['intrinsic_reward'].append( self.intrinsic_rewards.get(agent_id, 0)) self.intrinsic_rewards[agent_id] = 0 def end_episode(self): """ A signal that the Episode has ended. The buffer must be reset. Get only called when the academy resets. """ self.training_buffer.reset_all() for agent_id in self.cumulative_rewards: self.cumulative_rewards[agent_id] = 0 for agent_id in self.episode_steps: self.episode_steps[agent_id] = 0 if self.use_curiosity: for agent_id in self.intrinsic_rewards: self.intrinsic_rewards[agent_id] = 0 def is_ready_update(self): """ Returns whether or not the trainer has enough elements to run update model :return: A boolean corresponding to whether or not update_model() can be run """ size_of_buffer = len(self.training_buffer.update_buffer['actions']) return size_of_buffer > max( int(self.trainer_parameters['buffer_size'] / self.policy.sequence_length), 1) def update_policy(self): """ Uses training_buffer to update the policy. 
""" n_sequences = max( int(self.trainer_parameters['batch_size'] / self.policy.sequence_length), 1) value_total, policy_total, forward_total, inverse_total = [], [], [], [] advantages = self.training_buffer.update_buffer[ 'advantages'].get_batch() self.training_buffer.update_buffer['advantages'].set( (advantages - advantages.mean()) / (advantages.std() + 1e-10)) num_epoch = self.trainer_parameters['num_epoch'] for k in range(num_epoch): self.training_buffer.update_buffer.shuffle() buffer = self.training_buffer.update_buffer for l in range( len(self.training_buffer.update_buffer['actions']) // n_sequences): start = l * n_sequences end = (l + 1) * n_sequences run_out = self.policy.update( buffer.make_mini_batch(start, end), n_sequences) value_total.append(run_out['value_loss']) policy_total.append(np.abs(run_out['policy_loss'])) if self.use_curiosity: inverse_total.append(run_out['inverse_loss']) forward_total.append(run_out['forward_loss']) self.stats['value_loss'].append(np.mean(value_total)) self.stats['policy_loss'].append(np.mean(policy_total)) if self.use_curiosity: self.stats['forward_loss'].append(np.mean(forward_total)) self.stats['inverse_loss'].append(np.mean(inverse_total)) self.training_buffer.reset_update_buffer()
class PPOTrainer(Trainer): """The PPOTrainer is an implementation of the PPO algorithm.""" def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id): """ Responsible for collecting experiences and training PPO model. :param trainer_parameters: The parameters for the trainer (dictionary). :param reward_buff_cap: Max reward history to track in the reward buffer :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with :param run_id: The identifier of the current run """ super().__init__(brain, trainer_parameters, training, run_id, reward_buff_cap) self.param_keys = [ "batch_size", "beta", "buffer_size", "epsilon", "hidden_units", "lambd", "learning_rate", "max_steps", "normalize", "num_epoch", "num_layers", "time_horizon", "sequence_length", "summary_freq", "use_recurrent", "summary_path", "memory_size", "model_path", "reward_signals", ] self.check_param_keys() # Make sure we have at least one reward_signal if not self.trainer_parameters["reward_signals"]: raise UnityTrainerException( "No reward signals were defined. At least one must be used with {}." .format(self.__class__.__name__)) self.step = 0 self.policy = PPOPolicy(seed, brain, trainer_parameters, self.is_training, load) stats = defaultdict(list) # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward # used for reporting only. We always want to report the environment reward to Tensorboard, regardless # of what reward signals are actually present. self.collected_rewards = {"environment": {}} for _reward_signal in self.policy.reward_signals.keys(): self.collected_rewards[_reward_signal] = {} self.stats = stats self.training_buffer = Buffer() self.episode_steps = {} def __str__(self): return """Hyperparameters for the {0} of brain {1}: \n{2}""".format( self.__class__.__name__, self.brain_name, self.dict_to_str(self.trainer_parameters, 0), ) @property def parameters(self): """ Returns the trainer parameters of the trainer. """ return self.trainer_parameters @property def get_max_steps(self): """ Returns the maximum number of steps. Is used to know when the trainer should be stopped. :return: The maximum number of steps of the trainer """ return float(self.trainer_parameters["max_steps"]) @property def get_step(self): """ Returns the number of steps the trainer has performed :return: the step count of the trainer """ return self.step def increment_step(self, n_steps: int) -> None: """ Increment the step count of the trainer :param n_steps: number of steps to increment the step count by """ self.step = self.policy.increment_step(n_steps) def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo: """ Constructs a BrainInfo which contains the most recent previous experiences for all agents which correspond to the agents in a provided next_info. :BrainInfo next_info: A t+1 BrainInfo. :return: curr_info: Reconstructed BrainInfo to match agents of next_info. 
""" visual_observations: List[List[Any]] = [ [] ] # TODO add types to brain.py methods vector_observations = [] text_observations = [] memories = [] rewards = [] local_dones = [] max_reacheds = [] agents = [] prev_vector_actions = [] prev_text_actions = [] action_masks = [] for agent_id in next_info.agents: agent_brain_info = self.training_buffer[agent_id].last_brain_info if agent_brain_info is None: agent_brain_info = next_info agent_index = agent_brain_info.agents.index(agent_id) for i in range(len(next_info.visual_observations)): visual_observations[i].append( agent_brain_info.visual_observations[i][agent_index]) vector_observations.append( agent_brain_info.vector_observations[agent_index]) text_observations.append( agent_brain_info.text_observations[agent_index]) if self.policy.use_recurrent: if len(agent_brain_info.memories) > 0: memories.append(agent_brain_info.memories[agent_index]) else: memories.append(self.policy.make_empty_memory(1)) rewards.append(agent_brain_info.rewards[agent_index]) local_dones.append(agent_brain_info.local_done[agent_index]) max_reacheds.append(agent_brain_info.max_reached[agent_index]) agents.append(agent_brain_info.agents[agent_index]) prev_vector_actions.append( agent_brain_info.previous_vector_actions[agent_index]) prev_text_actions.append( agent_brain_info.previous_text_actions[agent_index]) action_masks.append(agent_brain_info.action_masks[agent_index]) if self.policy.use_recurrent: memories = np.vstack(memories) curr_info = BrainInfo( visual_observations, vector_observations, text_observations, memories, rewards, agents, local_dones, prev_vector_actions, prev_text_actions, max_reacheds, action_masks, ) return curr_info def add_experiences( self, curr_all_info: AllBrainInfo, next_all_info: AllBrainInfo, take_action_outputs: ActionInfoOutputs, ) -> None: """ Adds experiences to each agent's experience history. :param curr_all_info: Dictionary of all current brains and corresponding BrainInfo. :param next_all_info: Dictionary of all current brains and corresponding BrainInfo. :param take_action_outputs: The outputs of the Policy's get_action method. 
""" self.trainer_metrics.start_experience_collection_timer() if take_action_outputs: self.stats["Policy/Entropy"].append( take_action_outputs["entropy"].mean()) self.stats["Policy/Learning Rate"].append( take_action_outputs["learning_rate"]) for name, signal in self.policy.reward_signals.items(): self.stats[signal.value_name].append( np.mean(take_action_outputs["value"][name])) curr_info = curr_all_info[self.brain_name] next_info = next_all_info[self.brain_name] for agent_id in curr_info.agents: self.training_buffer[agent_id].last_brain_info = curr_info self.training_buffer[ agent_id].last_take_action_outputs = take_action_outputs if curr_info.agents != next_info.agents: curr_to_use = self.construct_curr_info(next_info) else: curr_to_use = curr_info tmp_rewards_dict = {} for name, signal in self.policy.reward_signals.items(): tmp_rewards_dict[name] = signal.evaluate(curr_to_use, next_info) for agent_id in next_info.agents: stored_info = self.training_buffer[agent_id].last_brain_info stored_take_action_outputs = self.training_buffer[ agent_id].last_take_action_outputs if stored_info is not None: idx = stored_info.agents.index(agent_id) next_idx = next_info.agents.index(agent_id) if not stored_info.local_done[idx]: for i, _ in enumerate(stored_info.visual_observations): self.training_buffer[agent_id][ "visual_obs%d" % i].append( stored_info.visual_observations[i][idx]) self.training_buffer[agent_id][ "next_visual_obs%d" % i].append( next_info.visual_observations[i][next_idx]) if self.policy.use_vec_obs: self.training_buffer[agent_id]["vector_obs"].append( stored_info.vector_observations[idx]) self.training_buffer[agent_id][ "next_vector_in"].append( next_info.vector_observations[next_idx]) if self.policy.use_recurrent: if stored_info.memories.shape[1] == 0: stored_info.memories = np.zeros( (len(stored_info.agents), self.policy.m_size)) self.training_buffer[agent_id]["memory"].append( stored_info.memories[idx]) actions = stored_take_action_outputs["action"] if self.policy.use_continuous_act: actions_pre = stored_take_action_outputs["pre_action"] self.training_buffer[agent_id]["actions_pre"].append( actions_pre[idx]) epsilons = stored_take_action_outputs[ "random_normal_epsilon"] self.training_buffer[agent_id][ "random_normal_epsilon"].append(epsilons[idx]) else: self.training_buffer[agent_id]["action_mask"].append( stored_info.action_masks[idx], padding_value=1) a_dist = stored_take_action_outputs["log_probs"] # value is a dictionary from name of reward to value estimate of the value head value = stored_take_action_outputs["value"] self.training_buffer[agent_id]["actions"].append( actions[idx]) self.training_buffer[agent_id]["prev_action"].append( stored_info.previous_vector_actions[idx]) self.training_buffer[agent_id]["masks"].append(1.0) self.training_buffer[agent_id]["done"].append( next_info.local_done[next_idx]) for name, reward_result in tmp_rewards_dict.items(): # 0 because we use the scaled reward to train the agent self.training_buffer[agent_id]["{}_rewards".format( name)].append( reward_result.scaled_reward[next_idx]) self.training_buffer[agent_id][ "{}_value_estimates".format(name)].append( value[name][idx][0]) self.training_buffer[agent_id]["action_probs"].append( a_dist[idx]) for name, rewards in self.collected_rewards.items(): if agent_id not in rewards: rewards[agent_id] = 0 if name == "environment": # Report the reward from the environment rewards[agent_id] += np.array( next_info.rewards)[next_idx] else: # Report the reward signals rewards[agent_id] += tmp_rewards_dict[ 
name].scaled_reward[next_idx] if not next_info.local_done[next_idx]: if agent_id not in self.episode_steps: self.episode_steps[agent_id] = 0 self.episode_steps[agent_id] += 1 self.trainer_metrics.end_experience_collection_timer() def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo) -> None: """ Checks agent histories for processing condition, and processes them as necessary. Processing involves calculating value and advantage targets for model updating step. :param current_info: Dictionary of all current brains and corresponding BrainInfo. :param new_info: Dictionary of all next brains and corresponding BrainInfo. """ info = new_info[self.brain_name] for l in range(len(info.agents)): agent_actions = self.training_buffer[info.agents[l]]["actions"] if (info.local_done[l] or len(agent_actions) > self.trainer_parameters["time_horizon"] ) and len(agent_actions) > 0: agent_id = info.agents[l] if info.max_reached[l]: bootstrapping_info = self.training_buffer[ agent_id].last_brain_info idx = bootstrapping_info.agents.index(agent_id) else: bootstrapping_info = info idx = l value_next = self.policy.get_value_estimates( bootstrapping_info, idx, info.local_done[l] and not info.max_reached[l], ) tmp_advantages = [] tmp_returns = [] for name in self.policy.reward_signals: bootstrap_value = value_next[name] local_rewards = self.training_buffer[agent_id][ "{}_rewards".format(name)].get_batch() local_value_estimates = self.training_buffer[agent_id][ "{}_value_estimates".format(name)].get_batch() local_advantage = get_gae( rewards=local_rewards, value_estimates=local_value_estimates, value_next=bootstrap_value, gamma=self.policy.reward_signals[name].gamma, lambd=self.trainer_parameters["lambd"], ) local_return = local_advantage + local_value_estimates # This is later use as target for the different value estimates self.training_buffer[agent_id]["{}_returns".format( name)].set(local_return) self.training_buffer[agent_id]["{}_advantage".format( name)].set(local_advantage) tmp_advantages.append(local_advantage) tmp_returns.append(local_return) global_advantages = list( np.mean(np.array(tmp_advantages), axis=0)) global_returns = list(np.mean(np.array(tmp_returns), axis=0)) self.training_buffer[agent_id]["advantages"].set( global_advantages) self.training_buffer[agent_id]["discounted_returns"].set( global_returns) self.training_buffer.append_update_buffer( agent_id, batch_size=None, training_length=self.policy.sequence_length, ) self.training_buffer[agent_id].reset_agent() if info.local_done[l]: self.stats["Environment/Episode Length"].append( self.episode_steps.get(agent_id, 0)) self.episode_steps[agent_id] = 0 for name, rewards in self.collected_rewards.items(): if name == "environment": self.cumulative_returns_since_policy_update.append( rewards.get(agent_id, 0)) self.stats["Environment/Cumulative Reward"].append( rewards.get(agent_id, 0)) self.reward_buffer.appendleft( rewards.get(agent_id, 0)) rewards[agent_id] = 0 else: self.stats[self.policy.reward_signals[name]. stat_name].append( rewards.get(agent_id, 0)) rewards[agent_id] = 0 def end_episode(self): """ A signal that the Episode has ended. The buffer must be reset. Get only called when the academy resets. 
""" self.training_buffer.reset_local_buffers() for agent_id in self.episode_steps: self.episode_steps[agent_id] = 0 for rewards in self.collected_rewards.values(): for agent_id in rewards: rewards[agent_id] = 0 def is_ready_update(self): """ Returns whether or not the trainer has enough elements to run update model :return: A boolean corresponding to whether or not update_model() can be run """ size_of_buffer = len(self.training_buffer.update_buffer["actions"]) return size_of_buffer > max( int(self.trainer_parameters["buffer_size"] / self.policy.sequence_length), 1) def update_policy(self): """ Uses demonstration_buffer to update the policy. The reward signal generators must be updated in this method at their own pace. """ self.trainer_metrics.start_policy_update_timer( number_experiences=len( self.training_buffer.update_buffer["actions"]), mean_return=float( np.mean(self.cumulative_returns_since_policy_update)), ) self.cumulative_returns_since_policy_update = [] n_sequences = max( int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1) value_total, policy_total = [], [] advantages = self.training_buffer.update_buffer[ "advantages"].get_batch() self.training_buffer.update_buffer["advantages"].set( (advantages - advantages.mean()) / (advantages.std() + 1e-10)) num_epoch = self.trainer_parameters["num_epoch"] for _ in range(num_epoch): self.training_buffer.update_buffer.shuffle() buffer = self.training_buffer.update_buffer for l in range( len(self.training_buffer.update_buffer["actions"]) // n_sequences): start = l * n_sequences end = (l + 1) * n_sequences run_out = self.policy.update( buffer.make_mini_batch(start, end), n_sequences) value_total.append(run_out["value_loss"]) policy_total.append(np.abs(run_out["policy_loss"])) self.stats["Losses/Value Loss"].append(np.mean(value_total)) self.stats["Losses/Policy Loss"].append(np.mean(policy_total)) for _, reward_signal in self.policy.reward_signals.items(): update_stats = reward_signal.update( self.training_buffer.update_buffer, n_sequences) for stat, val in update_stats.items(): self.stats[stat].append(val) if self.policy.bc_module: update_stats = self.policy.bc_module.update() for stat, val in update_stats.items(): self.stats[stat].append(val) self.training_buffer.reset_update_buffer() self.trainer_metrics.end_policy_update()
class BCTrainer(Trainer): """The BCTrainer is an implementation of Behavioral Cloning.""" def __init__(self, brain, trainer_parameters, training, load, seed, run_id): """ Responsible for collecting experiences and training a behavioral cloning model. :param trainer_parameters: The parameters for the trainer (dictionary). :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with :param run_id: The identifier of the current run """ super(BCTrainer, self).__init__(brain, trainer_parameters, training, run_id) self.policy = BCPolicy(seed, brain, trainer_parameters, load) self.n_sequences = 1 self.cumulative_rewards = {} self.episode_steps = {} self.stats = {'Losses/Cloning Loss': [], 'Environment/Episode Length': [], 'Environment/Cumulative Reward': []} self.summary_path = trainer_parameters['summary_path'] self.batches_per_epoch = trainer_parameters['batches_per_epoch'] if not os.path.exists(self.summary_path): os.makedirs(self.summary_path) self.demonstration_buffer = Buffer() self.evaluation_buffer = Buffer() self.summary_writer = tf.summary.FileWriter(self.summary_path) @property def parameters(self): """ Returns the trainer parameters of the trainer. """ return self.trainer_parameters @property def get_max_steps(self): """ Returns the maximum number of steps. Is used to know when the trainer should be stopped. :return: The maximum number of steps of the trainer """ return float(self.trainer_parameters['max_steps']) @property def get_step(self): """ Returns the number of steps the trainer has performed :return: the step count of the trainer """ return self.policy.get_current_step() @property def get_last_reward(self): """ Returns the last reward the trainer has had :return: the new last reward """ if len(self.stats['Environment/Cumulative Reward']) > 0: return np.mean(self.stats['Environment/Cumulative Reward']) else: return 0 def increment_step_and_update_last_reward(self): """ Increment the step count of the trainer and Updates the last reward """ self.policy.increment_step() return def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo, take_action_outputs): """ Adds experiences to each agent's experience history. :param curr_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). :param next_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). :param take_action_outputs: The outputs of the take action method. """ # Used to collect information about student performance. info_student = curr_info[self.brain_name] next_info_student = next_info[self.brain_name] for agent_id in info_student.agents: self.evaluation_buffer[agent_id].last_brain_info = info_student for agent_id in next_info_student.agents: stored_info_student = self.evaluation_buffer[agent_id].last_brain_info if stored_info_student is None: continue else: next_idx = next_info_student.agents.index(agent_id) if agent_id not in self.cumulative_rewards: self.cumulative_rewards[agent_id] = 0 self.cumulative_rewards[agent_id] += next_info_student.rewards[next_idx] if not next_info_student.local_done[next_idx]: if agent_id not in self.episode_steps: self.episode_steps[agent_id] = 0 self.episode_steps[agent_id] += 1 def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInfo): """ Checks agent histories for processing condition, and processes them as necessary. 
Processing involves calculating value and advantage targets for model updating step. :param current_info: Current AllBrainInfo :param next_info: Next AllBrainInfo """ info_student = next_info[self.brain_name] for l in range(len(info_student.agents)): if info_student.local_done[l]: agent_id = info_student.agents[l] self.stats['Environment/Cumulative Reward'].append( self.cumulative_rewards.get(agent_id, 0)) self.stats['Environment/Episode Length'].append( self.episode_steps.get(agent_id, 0)) self.cumulative_rewards[agent_id] = 0 self.episode_steps[agent_id] = 0 def end_episode(self): """ A signal that the Episode has ended. The buffer must be reset. Get only called when the academy resets. """ self.evaluation_buffer.reset_local_buffers() for agent_id in self.cumulative_rewards: self.cumulative_rewards[agent_id] = 0 for agent_id in self.episode_steps: self.episode_steps[agent_id] = 0 def is_ready_update(self): """ Returns whether or not the trainer has enough elements to run update model :return: A boolean corresponding to whether or not update_model() can be run """ return len(self.demonstration_buffer.update_buffer['actions']) > self.n_sequences def update_policy(self): """ Updates the policy. """ self.demonstration_buffer.update_buffer.shuffle() batch_losses = [] num_batches = min(len(self.demonstration_buffer.update_buffer['actions']) // self.n_sequences, self.batches_per_epoch) for i in range(num_batches): update_buffer = self.demonstration_buffer.update_buffer start = i * self.n_sequences end = (i + 1) * self.n_sequences mini_batch = update_buffer.make_mini_batch(start, end) run_out = self.policy.update(mini_batch, self.n_sequences) loss = run_out['policy_loss'] batch_losses.append(loss) if len(batch_losses) > 0: self.stats['Losses/Cloning Loss'].append(np.mean(batch_losses)) else: self.stats['Losses/Cloning Loss'].append(0)
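# --- Illustrative sketch (not part of the trainer above) ---------------------
# How the BC update bounds the number of mini-batches per call. The sizes below
# are hypothetical; only the min() between buffer capacity and batches_per_epoch
# mirrors the logic in update_policy.
num_demo_actions = 1000   # len(demonstration_buffer.update_buffer['actions'])
n_sequences = 1           # BCTrainer fixes this to 1
batches_per_epoch = 64    # trainer_parameters['batches_per_epoch']

num_batches = min(num_demo_actions // n_sequences, batches_per_epoch)
assert num_batches == 64  # the epoch cap, not the buffer size, limits this update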
class BCTrainer(Trainer): """The BCTrainer is an implementation of Behavioral Cloning.""" def __init__(self, brain, trainer_parameters, training, load, seed, run_id): """ Responsible for collecting experiences and training PPO model. :param trainer_parameters: The parameters for the trainer (dictionary). :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with :param run_id: The identifier of the current run """ super(BCTrainer, self).__init__(brain, trainer_parameters, training, run_id) self.policy = BCPolicy(seed, brain, trainer_parameters, load) self.n_sequences = 1 self.cumulative_rewards = {} self.episode_steps = {} self.stats = { "Losses/Cloning Loss": [], "Environment/Episode Length": [], "Environment/Cumulative Reward": [], } self.batches_per_epoch = trainer_parameters["batches_per_epoch"] self.demonstration_buffer = Buffer() self.evaluation_buffer = Buffer() def add_experiences( self, curr_info: AllBrainInfo, next_info: AllBrainInfo, take_action_outputs: ActionInfoOutputs, ) -> None: """ Adds experiences to each agent's experience history. :param curr_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). :param next_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). :param take_action_outputs: The outputs of the take action method. """ # Used to collect information about student performance. info_student = curr_info[self.brain_name] next_info_student = next_info[self.brain_name] for agent_id in info_student.agents: self.evaluation_buffer[agent_id].last_brain_info = info_student for agent_id in next_info_student.agents: stored_info_student = self.evaluation_buffer[agent_id].last_brain_info if stored_info_student is None: continue else: next_idx = next_info_student.agents.index(agent_id) if agent_id not in self.cumulative_rewards: self.cumulative_rewards[agent_id] = 0 self.cumulative_rewards[agent_id] += next_info_student.rewards[next_idx] if not next_info_student.local_done[next_idx]: if agent_id not in self.episode_steps: self.episode_steps[agent_id] = 0 self.episode_steps[agent_id] += 1 def process_experiences( self, current_info: AllBrainInfo, next_info: AllBrainInfo ) -> None: """ Checks agent histories for processing condition, and processes them as necessary. Processing involves calculating value and advantage targets for model updating step. :param current_info: Current AllBrainInfo :param next_info: Next AllBrainInfo """ info_student = next_info[self.brain_name] for l in range(len(info_student.agents)): if info_student.local_done[l]: agent_id = info_student.agents[l] self.stats["Environment/Cumulative Reward"].append( self.cumulative_rewards.get(agent_id, 0) ) self.stats["Environment/Episode Length"].append( self.episode_steps.get(agent_id, 0) ) self.reward_buffer.appendleft(self.cumulative_rewards.get(agent_id, 0)) self.cumulative_rewards[agent_id] = 0 self.episode_steps[agent_id] = 0 def end_episode(self): """ A signal that the Episode has ended. The buffer must be reset. Get only called when the academy resets. 
""" self.evaluation_buffer.reset_local_buffers() for agent_id in self.cumulative_rewards: self.cumulative_rewards[agent_id] = 0 for agent_id in self.episode_steps: self.episode_steps[agent_id] = 0 def is_ready_update(self): """ Returns whether or not the trainer has enough elements to run update model :return: A boolean corresponding to whether or not update_model() can be run """ return ( len(self.demonstration_buffer.update_buffer["actions"]) > self.n_sequences ) def update_policy(self): """ Updates the policy. """ self.demonstration_buffer.update_buffer.shuffle(self.policy.sequence_length) batch_losses = [] num_batches = min( len(self.demonstration_buffer.update_buffer["actions"]) // self.n_sequences, self.batches_per_epoch, ) for i in range(num_batches): update_buffer = self.demonstration_buffer.update_buffer start = i * self.n_sequences end = (i + 1) * self.n_sequences mini_batch = update_buffer.make_mini_batch(start, end) run_out = self.policy.update(mini_batch, self.n_sequences) loss = run_out["policy_loss"] batch_losses.append(loss) if len(batch_losses) > 0: self.stats["Losses/Cloning Loss"].append(np.mean(batch_losses)) else: self.stats["Losses/Cloning Loss"].append(0)
def demo_to_buffer(file_path: str, sequence_length: int) -> Tuple[BrainParameters, Buffer]: """ Loads demonstration file and uses it to fill training buffer. :param file_path: Location of demonstration file (.demo). :param sequence_length: Length of trajectories to fill buffer. :return: """ # early exit if inference mode # export EVALUATION_STAGE='testing' EVALUATION_STAGE = os.getenv('EVALUATION_STAGE', '') if EVALUATION_STAGE == 'testing': demo_buffer = Buffer() brain_params = MineRLToMLAgentWrapper.get_brain_params(file_path) return brain_params, demo_buffer # # The dataset is available in data/ directory from repository root. # MINERL_DATA_ROOT = os.getenv('MINERL_DATA_ROOT', 'data/') logger.info("Building data pipeline for {}".format(file_path)) data = minerl.data.make(file_path) report_trajs = [] trajs = [ 'v1_other_pomegranite_orc-12_24007-29518', 'v1_right_mushroom_fire-breathing_dragon_41653-47509', 'v1_juvenile_apple_angel-7_205561-212353', 'v1_juvenile_apple_angel-6_221-11831', 'v1_equal_olive_chimera-7_10379-19453', 'v1_unselfish_blood_orange_savage-18_19656-23843', 'v1_other_pomegranite_orc-12_31579-36826', 'v1_svelte_cherry_devil-17_314-11959', 'v1_agonizing_kale_tree_nymph-7_133235-141843', 'v1_unselfish_blood_orange_savage-18_14639-19416', 'v1_unselfish_blood_orange_savage-18_399-10066', 'v1_courageous_rutabaga_nessie-1_3069-13764', 'v1_agonizing_kale_tree_nymph-20_289-7919', 'v1_right_mushroom_fire-breathing_dragon_88565-95177', 'v1_last_prune_swamp_monster-2_2208-8442', 'v1_excellent_mango_beast-6_43472-48953', 'v1_bogus_guava_djinn-17_23146-31716', 'v1_splendid_brussels_sprout_pegasus-5_45696-54118', 'v1_agonizing_kale_tree_nymph-7_106750-114380', 'v1_right_mushroom_fire-breathing_dragon_7211-17977', 'v1_agonizing_kale_tree_nymph-20_7989-16044', 'v1_excellent_mango_beast-6_20909-29943', 'v1_villainous_black_eyed_peas_loch_ness_monster-1_82621-93105', 'v1_subtle_iceberg_lettuce_nymph-4_16111-20545', 'v1_agonizing_kale_tree_nymph-7_74962-82761', 'v1_juvenile_apple_angel-5_4254-15273', 'v1_conscious_tangerine_rain_bird-23_48769-59333', 'v1_absolute_grape_changeling-6_37339-46767', 'v1_equal_olive_chimera-9_14563-24740', 'v1_juvenile_apple_angel-7_158092-167444', 'v1_bogus_guava_djinn-2_19159-30071', 'v1_other_pomegranite_orc-12_16800-22992' ] # trajs = data.get_trajectory_names() all_demo = dict() brain_infos = [] brain_params = MineRLToMLAgentWrapper.get_brain_params(file_path) agent_id = 'fake_id' # stream_name = random.choice(trajs) for stream_name in trajs: demo = Object() logger.info("Loading data for {}...".format(stream_name)) demo.data_frames = list( data.load_data(stream_name, include_metadata=True)) demo.meta = demo.data_frames[0][-1] cum_rewards = np.cumsum([x[2] for x in demo.data_frames]) demo.file_len = len(demo.data_frames) logger.info("Data loading complete!".format(stream_name)) logger.info("META DATA: {}".format(demo.meta)) demo.height, demo.width = data.observation_space.spaces[ 'pov'].shape[:2] # all_demo[stream_name]=demo if not demo.meta['success']: logger.info("SKIP as success=False") continue if int(demo.meta['duration_steps']) > 12000: logger.info("****HACK**** SKIP as > 12k steps") continue if int(demo.meta['total_reward']) < 1024: logger.info( "ERROR score must be > 1024 because of dimond = 1024 points") continue logger.info("*** PASSED CHECKS ****") report_trajs.append(stream_name) running_reward = 0 for i, frame in enumerate(demo.data_frames): ob = frame[0] action = frame[1] # action=np.hstack([v for v in action.values()]) reward = 
float(frame[2]) ob = frame[3] done = frame[4] meta_data = frame[5] running_reward += reward info = { 'stream_name': meta_data['stream_name'], 'duration_steps': meta_data['duration_steps'], 'total_reward': meta_data['total_reward'], 'success': meta_data['success'], 'step': i, 'running_reward': running_reward } max_reached = i + 1 == meta_data['duration_steps'] brain_info = MineRLToMLAgentWrapper.create_brain_info( ob=ob, agent_id=agent_id, brain_params=brain_params, reward=reward, done=done, info=info, action=action, max_reached=max_reached) brain_info = MineRLToMLAgentWrapper.process_brain_info_through_wrapped_envs( file_path, brain_info) brain_infos.append(brain_info) del frame[3] # obs, free for memory del demo.data_frames del demo import gc gc.collect() # brain_params, brain_infos, _ = load_demonstration(file_path) demo_buffer = make_demo_buffer(brain_infos, brain_params, sequence_length) del brain_infos import gc gc.collect() logger.info("report_trajs = " + str([str(i) for i in report_trajs])) return brain_params, demo_buffer
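# --- Reference restatement of the trajectory filter in demo_to_buffer --------
# The thresholds (success flag, 12000-step cap, 1024 minimum reward) mirror the
# checks above; the metadata layout follows the MineRL trajectory metadata used
# there. Shown as a self-contained predicate for clarity.
def keep_trajectory(meta: dict) -> bool:
    if not meta['success']:
        return False                           # only successful runs are imitated
    if int(meta['duration_steps']) > 12000:
        return False                           # skip very long episodes
    if int(meta['total_reward']) < 1024:
        return False                           # must include a diamond (1024 points)
    return True

assert keep_trajectory({'success': True, 'duration_steps': 8000, 'total_reward': 1155})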
def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id): """ Responsible for collecting experiences and training PPO model. :param trainer_parameters: The parameters for the trainer (dictionary). :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with :param run_id: The identifier of the current run """ super(PPOTrainer, self).__init__(brain, trainer_parameters, training, run_id) self.param_keys = [ "batch_size", "beta", "buffer_size", "epsilon", "gamma", "hidden_units", "lambd", "learning_rate", "max_steps", "normalize", "num_epoch", "num_layers", "time_horizon", "sequence_length", "summary_freq", "use_recurrent", "summary_path", "memory_size", "use_curiosity", "curiosity_strength", "curiosity_enc_size", "model_path", ] self.check_param_keys() self.use_curiosity = bool(trainer_parameters["use_curiosity"]) self.step = 0 self.policy = PPOPolicy(seed, brain, trainer_parameters, self.is_training, load) stats = { "Environment/Cumulative Reward": [], "Environment/Episode Length": [], "Policy/Value Estimate": [], "Policy/Entropy": [], "Losses/Value Loss": [], "Losses/Policy Loss": [], "Policy/Learning Rate": [], } if self.use_curiosity: stats["Losses/Forward Loss"] = [] stats["Losses/Inverse Loss"] = [] stats["Policy/Curiosity Reward"] = [] self.intrinsic_rewards = {} self.stats = stats self.training_buffer = Buffer() self.cumulative_rewards = {} self._reward_buffer = deque(maxlen=reward_buff_cap) self.episode_steps = {}
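# --- Hypothetical trainer_parameters example ----------------------------------
# A dictionary covering every key validated by check_param_keys() in the PPO
# trainer above. The values are placeholders for illustration, not recommended
# settings.
ppo_trainer_parameters = {
    "batch_size": 1024, "beta": 5e-3, "buffer_size": 10240, "epsilon": 0.2,
    "gamma": 0.99, "hidden_units": 128, "lambd": 0.95, "learning_rate": 3e-4,
    "max_steps": 5e5, "normalize": False, "num_epoch": 3, "num_layers": 2,
    "time_horizon": 64, "sequence_length": 64, "summary_freq": 1000,
    "use_recurrent": False, "summary_path": "./summaries/run-0",
    "memory_size": 256, "use_curiosity": False, "curiosity_strength": 0.01,
    "curiosity_enc_size": 128, "model_path": "./models/run-0",
}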
class PPOTrainer(Trainer): """The PPOTrainer is an implementation of the PPO algorithm.""" def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id): """ Responsible for collecting experiences and training PPO model. :param trainer_parameters: The parameters for the trainer (dictionary). :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with :param run_id: The identifier of the current run """ super(PPOTrainer, self).__init__(brain, trainer_parameters, training, run_id) self.param_keys = [ "batch_size", "beta", "buffer_size", "epsilon", "gamma", "hidden_units", "lambd", "learning_rate", "max_steps", "normalize", "num_epoch", "num_layers", "time_horizon", "sequence_length", "summary_freq", "use_recurrent", "summary_path", "memory_size", "use_curiosity", "curiosity_strength", "curiosity_enc_size", "model_path", ] self.check_param_keys() self.use_curiosity = bool(trainer_parameters["use_curiosity"]) self.step = 0 self.policy = PPOPolicy(seed, brain, trainer_parameters, self.is_training, load) stats = { "Environment/Cumulative Reward": [], "Environment/Episode Length": [], "Policy/Value Estimate": [], "Policy/Entropy": [], "Losses/Value Loss": [], "Losses/Policy Loss": [], "Policy/Learning Rate": [], } if self.use_curiosity: stats["Losses/Forward Loss"] = [] stats["Losses/Inverse Loss"] = [] stats["Policy/Curiosity Reward"] = [] self.intrinsic_rewards = {} self.stats = stats self.training_buffer = Buffer() self.cumulative_rewards = {} self._reward_buffer = deque(maxlen=reward_buff_cap) self.episode_steps = {} def __str__(self): return """Hyperparameters for the PPO Trainer of brain {0}: \n{1}""".format( self.brain_name, "\n".join([ "\t{0}:\t{1}".format(x, self.trainer_parameters[x]) for x in self.param_keys ]), ) @property def parameters(self): """ Returns the trainer parameters of the trainer. """ return self.trainer_parameters @property def get_max_steps(self): """ Returns the maximum number of steps. Is used to know when the trainer should be stopped. :return: The maximum number of steps of the trainer """ return float(self.trainer_parameters["max_steps"]) @property def get_step(self): """ Returns the number of steps the trainer has performed :return: the step count of the trainer """ return self.step @property def reward_buffer(self): """ Returns the reward buffer. The reward buffer contains the cumulative rewards of the most recent episodes completed by agents using this trainer. :return: the reward buffer. """ return self._reward_buffer def increment_step_and_update_last_reward(self): """ Increment the step count of the trainer and Updates the last reward """ if len(self.stats["Environment/Cumulative Reward"]) > 0: mean_reward = np.mean(self.stats["Environment/Cumulative Reward"]) self.policy.update_reward(mean_reward) self.policy.increment_step() self.step = self.policy.get_current_step() def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo: """ Constructs a BrainInfo which contains the most recent previous experiences for all agents info which correspond to the agents in a provided next_info. :BrainInfo next_info: A t+1 BrainInfo. :return: curr_info: Reconstructed BrainInfo to match agents of next_info. 
""" visual_observations = [[]] vector_observations = [] text_observations = [] memories = [] rewards = [] local_dones = [] max_reacheds = [] agents = [] prev_vector_actions = [] prev_text_actions = [] action_masks = [] for agent_id in next_info.agents: agent_brain_info = self.training_buffer[agent_id].last_brain_info if agent_brain_info is None: agent_brain_info = next_info agent_index = agent_brain_info.agents.index(agent_id) for i in range(len(next_info.visual_observations)): visual_observations[i].append( agent_brain_info.visual_observations[i][agent_index]) vector_observations.append( agent_brain_info.vector_observations[agent_index]) text_observations.append( agent_brain_info.text_observations[agent_index]) if self.policy.use_recurrent: if len(agent_brain_info.memories) > 0: memories.append(agent_brain_info.memories[agent_index]) else: memories.append(self.policy.make_empty_memory(1)) rewards.append(agent_brain_info.rewards[agent_index]) local_dones.append(agent_brain_info.local_done[agent_index]) max_reacheds.append(agent_brain_info.max_reached[agent_index]) agents.append(agent_brain_info.agents[agent_index]) prev_vector_actions.append( agent_brain_info.previous_vector_actions[agent_index]) prev_text_actions.append( agent_brain_info.previous_text_actions[agent_index]) action_masks.append(agent_brain_info.action_masks[agent_index]) if self.policy.use_recurrent: memories = np.vstack(memories) curr_info = BrainInfo( visual_observations, vector_observations, text_observations, memories, rewards, agents, local_dones, prev_vector_actions, prev_text_actions, max_reacheds, action_masks, ) return curr_info def add_experiences( self, curr_all_info: AllBrainInfo, next_all_info: AllBrainInfo, take_action_outputs, ): """ Adds experiences to each agent's experience history. :param curr_all_info: Dictionary of all current brains and corresponding BrainInfo. :param next_all_info: Dictionary of all current brains and corresponding BrainInfo. :param take_action_outputs: The outputs of the Policy's get_action method. 
""" self.trainer_metrics.start_experience_collection_timer() if take_action_outputs: self.stats["Policy/Value Estimate"].append( take_action_outputs["value"].mean()) self.stats["Policy/Entropy"].append( take_action_outputs["entropy"].mean()) self.stats["Policy/Learning Rate"].append( take_action_outputs["learning_rate"]) curr_info = curr_all_info[self.brain_name] next_info = next_all_info[self.brain_name] for agent_id in curr_info.agents: self.training_buffer[agent_id].last_brain_info = curr_info self.training_buffer[ agent_id].last_take_action_outputs = take_action_outputs if curr_info.agents != next_info.agents: curr_to_use = self.construct_curr_info(next_info) else: curr_to_use = curr_info intrinsic_rewards = self.policy.get_intrinsic_rewards( curr_to_use, next_info) for agent_id in next_info.agents: stored_info = self.training_buffer[agent_id].last_brain_info stored_take_action_outputs = self.training_buffer[ agent_id].last_take_action_outputs if stored_info is not None: idx = stored_info.agents.index(agent_id) next_idx = next_info.agents.index(agent_id) if not stored_info.local_done[idx]: for i, _ in enumerate(stored_info.visual_observations): self.training_buffer[agent_id][ "visual_obs%d" % i].append( stored_info.visual_observations[i][idx]) self.training_buffer[agent_id][ "next_visual_obs%d" % i].append( next_info.visual_observations[i][next_idx]) if self.policy.use_vec_obs: self.training_buffer[agent_id]["vector_obs"].append( stored_info.vector_observations[idx]) self.training_buffer[agent_id][ "next_vector_in"].append( next_info.vector_observations[next_idx]) if self.policy.use_recurrent: if stored_info.memories.shape[1] == 0: stored_info.memories = np.zeros( (len(stored_info.agents), self.policy.m_size)) self.training_buffer[agent_id]["memory"].append( stored_info.memories[idx]) actions = stored_take_action_outputs["action"] if self.policy.use_continuous_act: actions_pre = stored_take_action_outputs["pre_action"] self.training_buffer[agent_id]["actions_pre"].append( actions_pre[idx]) epsilons = stored_take_action_outputs[ "random_normal_epsilon"] self.training_buffer[agent_id][ "random_normal_epsilon"].append(epsilons[idx]) else: self.training_buffer[agent_id]["action_mask"].append( stored_info.action_masks[idx], padding_value=1) a_dist = stored_take_action_outputs["log_probs"] value = stored_take_action_outputs["value"] self.training_buffer[agent_id]["actions"].append( actions[idx]) self.training_buffer[agent_id]["prev_action"].append( stored_info.previous_vector_actions[idx]) self.training_buffer[agent_id]["masks"].append(1.0) if self.use_curiosity: self.training_buffer[agent_id]["rewards"].append( next_info.rewards[next_idx] + intrinsic_rewards[next_idx]) else: self.training_buffer[agent_id]["rewards"].append( next_info.rewards[next_idx]) self.training_buffer[agent_id]["action_probs"].append( a_dist[idx]) self.training_buffer[agent_id]["value_estimates"].append( value[idx][0]) if agent_id not in self.cumulative_rewards: self.cumulative_rewards[agent_id] = 0 self.cumulative_rewards[agent_id] += next_info.rewards[ next_idx] if self.use_curiosity: if agent_id not in self.intrinsic_rewards: self.intrinsic_rewards[agent_id] = 0 self.intrinsic_rewards[agent_id] += intrinsic_rewards[ next_idx] if not next_info.local_done[next_idx]: if agent_id not in self.episode_steps: self.episode_steps[agent_id] = 0 self.episode_steps[agent_id] += 1 self.trainer_metrics.end_experience_collection_timer() def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo): """ Checks 
agent histories for processing condition, and processes them as necessary. Processing involves calculating value and advantage targets for model updating step. :param current_info: Dictionary of all current brains and corresponding BrainInfo. :param new_info: Dictionary of all next brains and corresponding BrainInfo. """ self.trainer_metrics.start_experience_collection_timer() info = new_info[self.brain_name] for l in range(len(info.agents)): agent_actions = self.training_buffer[info.agents[l]]["actions"] if (info.local_done[l] or len(agent_actions) > self.trainer_parameters["time_horizon"] ) and len(agent_actions) > 0: agent_id = info.agents[l] if info.local_done[l] and not info.max_reached[l]: value_next = 0.0 else: if info.max_reached[l]: bootstrapping_info = self.training_buffer[ agent_id].last_brain_info idx = bootstrapping_info.agents.index(agent_id) else: bootstrapping_info = info idx = l value_next = self.policy.get_value_estimate( bootstrapping_info, idx) self.training_buffer[agent_id]["advantages"].set( get_gae( rewards=self.training_buffer[agent_id] ["rewards"].get_batch(), value_estimates=self.training_buffer[agent_id] ["value_estimates"].get_batch(), value_next=value_next, gamma=self.trainer_parameters["gamma"], lambd=self.trainer_parameters["lambd"], )) self.training_buffer[agent_id]["discounted_returns"].set( self.training_buffer[agent_id]["advantages"].get_batch() + self.training_buffer[agent_id] ["value_estimates"].get_batch()) self.training_buffer.append_update_buffer( agent_id, batch_size=None, training_length=self.policy.sequence_length, ) self.training_buffer[agent_id].reset_agent() if info.local_done[l]: self.cumulative_returns_since_policy_update.append( self.cumulative_rewards.get(agent_id, 0)) self.stats["Environment/Cumulative Reward"].append( self.cumulative_rewards.get(agent_id, 0)) self.reward_buffer.appendleft( self.cumulative_rewards.get(agent_id, 0)) self.stats["Environment/Episode Length"].append( self.episode_steps.get(agent_id, 0)) self.cumulative_rewards[agent_id] = 0 self.episode_steps[agent_id] = 0 if self.use_curiosity: self.stats["Policy/Curiosity Reward"].append( self.intrinsic_rewards.get(agent_id, 0)) self.intrinsic_rewards[agent_id] = 0 self.trainer_metrics.end_experience_collection_timer() def end_episode(self): """ A signal that the Episode has ended. The buffer must be reset. Get only called when the academy resets. """ self.training_buffer.reset_local_buffers() for agent_id in self.cumulative_rewards: self.cumulative_rewards[agent_id] = 0 for agent_id in self.episode_steps: self.episode_steps[agent_id] = 0 if self.use_curiosity: for agent_id in self.intrinsic_rewards: self.intrinsic_rewards[agent_id] = 0 def is_ready_update(self): """ Returns whether or not the trainer has enough elements to run update model :return: A boolean corresponding to whether or not update_model() can be run """ size_of_buffer = len(self.training_buffer.update_buffer["actions"]) return size_of_buffer > max( int(self.trainer_parameters["buffer_size"] / self.policy.sequence_length), 1) def update_policy(self): """ Uses demonstration_buffer to update the policy. 
""" self.trainer_metrics.start_policy_update_timer( number_experiences=len( self.training_buffer.update_buffer["actions"]), mean_return=float( np.mean(self.cumulative_returns_since_policy_update)), ) self.cumulative_returns_since_policy_update = [] n_sequences = max( int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1) value_total, policy_total, forward_total, inverse_total = [], [], [], [] advantages = self.training_buffer.update_buffer[ "advantages"].get_batch() self.training_buffer.update_buffer["advantages"].set( (advantages - advantages.mean()) / (advantages.std() + 1e-10)) num_epoch = self.trainer_parameters["num_epoch"] for _ in range(num_epoch): self.training_buffer.update_buffer.shuffle() buffer = self.training_buffer.update_buffer for l in range( len(self.training_buffer.update_buffer["actions"]) // n_sequences): start = l * n_sequences end = (l + 1) * n_sequences run_out = self.policy.update( buffer.make_mini_batch(start, end), n_sequences) value_total.append(run_out["value_loss"]) policy_total.append(np.abs(run_out["policy_loss"])) if self.use_curiosity: inverse_total.append(run_out["inverse_loss"]) forward_total.append(run_out["forward_loss"]) self.stats["Losses/Value Loss"].append(np.mean(value_total)) self.stats["Losses/Policy Loss"].append(np.mean(policy_total)) if self.use_curiosity: self.stats["Losses/Forward Loss"].append(np.mean(forward_total)) self.stats["Losses/Inverse Loss"].append(np.mean(inverse_total)) self.training_buffer.reset_update_buffer() self.trainer_metrics.end_policy_update()
class BehavioralCloningTrainer(Trainer):
    """The BehavioralCloningTrainer is an implementation of imitation learning."""

    def __init__(self, sess, brain, trainer_parameters, training, seed, run_id):
        """
        Responsible for collecting experiences and training an imitation (behavioral cloning) model.
        :param sess: Tensorflow session.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        """
        super(BehavioralCloningTrainer, self).__init__(sess, brain, trainer_parameters, training, run_id)
        self.param_keys = ['brain_to_imitate', 'batch_size', 'time_horizon', 'graph_scope',
                           'summary_freq', 'max_steps', 'batches_per_epoch', 'use_recurrent',
                           'hidden_units', 'learning_rate', 'num_layers', 'sequence_length',
                           'memory_size']
        for k in self.param_keys:
            if k not in trainer_parameters:
                raise UnityTrainerException("The hyperparameter {0} could not be found for the Imitation trainer of "
                                            "brain {1}.".format(k, brain.brain_name))
        self.policy = BCPolicy(seed, brain, trainer_parameters, sess)
        self.brain_name = brain.brain_name
        self.brain_to_imitate = trainer_parameters['brain_to_imitate']
        self.batches_per_epoch = trainer_parameters['batches_per_epoch']
        self.n_sequences = max(int(trainer_parameters['batch_size'] / self.policy.sequence_length), 1)
        self.cumulative_rewards = {}
        self.episode_steps = {}
        self.stats = {'losses': [], 'episode_length': [], 'cumulative_reward': []}
        self.training_buffer = Buffer()
        self.summary_path = trainer_parameters['summary_path']
        if not os.path.exists(self.summary_path):
            os.makedirs(self.summary_path)
        self.summary_writer = tf.summary.FileWriter(self.summary_path)

    def __str__(self):
        return '''Hyperparameters for the Imitation Trainer of brain {0}: \n{1}'''.format(
            self.brain_name, '\n'.join(['\t{0}:\t{1}'.format(x, self.trainer_parameters[x])
                                        for x in self.param_keys]))

    @property
    def parameters(self):
        """
        Returns the trainer parameters of the trainer.
        """
        return self.trainer_parameters

    @property
    def get_max_steps(self):
        """
        Returns the maximum number of steps. Is used to know when the trainer should be stopped.
        :return: The maximum number of steps of the trainer
        """
        return float(self.trainer_parameters['max_steps'])

    @property
    def get_step(self):
        """
        Returns the number of steps the trainer has performed
        :return: the step count of the trainer
        """
        return self.policy.get_current_step()

    @property
    def get_last_reward(self):
        """
        Returns the last reward the trainer has received
        :return: the new last reward
        """
        if len(self.stats['cumulative_reward']) > 0:
            return np.mean(self.stats['cumulative_reward'])
        else:
            return 0

    def increment_step_and_update_last_reward(self):
        """
        Increments the step count of the trainer and updates the last reward
        """
        self.policy.increment_step()
        return

    def take_action(self, all_brain_info: AllBrainInfo):
        """
        Decides actions using policy given current brain info.
        :param all_brain_info: AllBrainInfo from environment.
        :return: a tuple containing action, memories, values and an object
        to be passed to add experiences
        """
        if len(all_brain_info[self.brain_name].agents) == 0:
            return [], [], [], None, None

        agent_brain = all_brain_info[self.brain_name]
        run_out = self.policy.evaluate(agent_brain)
        if self.policy.use_recurrent:
            return run_out['action'], run_out['memory_out'], None, None, None
        else:
            return run_out['action'], None, None, None, None

    def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo,
                        take_action_outputs):
        """
        Adds experiences to each agent's experience history.
:param curr_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). :param next_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). :param take_action_outputs: The outputs of the take action method. """ # Used to collect teacher experience into training buffer info_teacher = curr_info[self.brain_to_imitate] next_info_teacher = next_info[self.brain_to_imitate] for agent_id in info_teacher.agents: self.training_buffer[agent_id].last_brain_info = info_teacher for agent_id in next_info_teacher.agents: stored_info_teacher = self.training_buffer[agent_id].last_brain_info if stored_info_teacher is None: continue else: idx = stored_info_teacher.agents.index(agent_id) next_idx = next_info_teacher.agents.index(agent_id) if stored_info_teacher.text_observations[idx] != "": info_teacher_record, info_teacher_reset = \ stored_info_teacher.text_observations[idx].lower().split(",") next_info_teacher_record, next_info_teacher_reset = next_info_teacher.text_observations[idx].\ lower().split(",") if next_info_teacher_reset == "true": self.training_buffer.reset_update_buffer() else: info_teacher_record, next_info_teacher_record = "true", "true" if info_teacher_record == "true" and next_info_teacher_record == "true": if not stored_info_teacher.local_done[idx]: for i in range(self.policy.vis_obs_size): self.training_buffer[agent_id]['visual_obs%d' % i]\ .append(stored_info_teacher.visual_observations[i][idx]) if self.policy.use_vec_obs: self.training_buffer[agent_id]['vector_obs']\ .append(stored_info_teacher.vector_observations[idx]) if self.policy.use_recurrent: if stored_info_teacher.memories.shape[1] == 0: stored_info_teacher.memories = np.zeros((len(stored_info_teacher.agents), self.policy.m_size)) self.training_buffer[agent_id]['memory'].append(stored_info_teacher.memories[idx]) self.training_buffer[agent_id]['actions'].append(next_info_teacher. previous_vector_actions[next_idx]) info_student = curr_info[self.brain_name] next_info_student = next_info[self.brain_name] for agent_id in info_student.agents: self.training_buffer[agent_id].last_brain_info = info_student # Used to collect information about student performance. for agent_id in next_info_student.agents: stored_info_student = self.training_buffer[agent_id].last_brain_info if stored_info_student is None: continue else: next_idx = next_info_student.agents.index(agent_id) if agent_id not in self.cumulative_rewards: self.cumulative_rewards[agent_id] = 0 self.cumulative_rewards[agent_id] += next_info_student.rewards[next_idx] if not next_info_student.local_done[next_idx]: if agent_id not in self.episode_steps: self.episode_steps[agent_id] = 0 self.episode_steps[agent_id] += 1 def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInfo): """ Checks agent histories for processing condition, and processes them as necessary. Processing involves calculating value and advantage targets for model updating step. 
:param current_info: Current AllBrainInfo :param next_info: Next AllBrainInfo """ info_teacher = next_info[self.brain_to_imitate] for l in range(len(info_teacher.agents)): teacher_action_list = len(self.training_buffer[info_teacher.agents[l]]['actions']) horizon_reached = teacher_action_list > self.trainer_parameters['time_horizon'] teacher_filled = len(self.training_buffer[info_teacher.agents[l]]['actions']) > 0 if ((info_teacher.local_done[l] or horizon_reached) and teacher_filled): agent_id = info_teacher.agents[l] self.training_buffer.append_update_buffer( agent_id, batch_size=None, training_length=self.policy.sequence_length) self.training_buffer[agent_id].reset_agent() info_student = next_info[self.brain_name] for l in range(len(info_student.agents)): if info_student.local_done[l]: agent_id = info_student.agents[l] self.stats['cumulative_reward'].append( self.cumulative_rewards.get(agent_id, 0)) self.stats['episode_length'].append( self.episode_steps.get(agent_id, 0)) self.cumulative_rewards[agent_id] = 0 self.episode_steps[agent_id] = 0 def end_episode(self): """ A signal that the Episode has ended. The buffer must be reset. Get only called when the academy resets. """ self.training_buffer.reset_all() for agent_id in self.cumulative_rewards: self.cumulative_rewards[agent_id] = 0 for agent_id in self.episode_steps: self.episode_steps[agent_id] = 0 def is_ready_update(self): """ Returns whether or not the trainer has enough elements to run update model :return: A boolean corresponding to whether or not update_model() can be run """ return len(self.training_buffer.update_buffer['actions']) > self.n_sequences def update_policy(self): """ Updates the policy. """ self.training_buffer.update_buffer.shuffle() batch_losses = [] num_batches = min(len(self.training_buffer.update_buffer['actions']) // self.n_sequences, self.batches_per_epoch) for i in range(num_batches): buffer = self.training_buffer.update_buffer start = i * self.n_sequences end = (i + 1) * self.n_sequences mini_batch = buffer.make_mini_batch(start, end) run_out = self.policy.update(mini_batch, self.n_sequences) loss = run_out['policy_loss'] batch_losses.append(loss) if len(batch_losses) > 0: self.stats['losses'].append(np.mean(batch_losses)) else: self.stats['losses'].append(0)
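# --- Illustration of the teacher "record,reset" text-observation protocol ----
# add_experiences above parses the teacher's text observation as a lowercase
# comma-separated pair of booleans; an empty observation means "record, do not
# reset". This helper is a condensed restatement, not trainer code.
def parse_teacher_flags(text_obs: str):
    if text_obs == "":
        return True, False
    record, reset = text_obs.lower().split(",")
    return record == "true", reset == "true"

assert parse_teacher_flags("True,False") == (True, False)
assert parse_teacher_flags("") == (True, False)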
def update(self, update_buffer: Buffer, n_sequences: int) -> Dict[str, float]: """ Updates model using buffer. :param update_buffer: The policy buffer containing the trajectories for the current policy. :param n_sequences: The number of sequences from demo and policy used in each mini batch. :return: The loss of the update. """ batch_losses = [] # Divide by 2 since we have two buffers, so we have roughly the same batch size n_sequences = max(n_sequences // 2, 1) possible_demo_batches = ( len(self.demonstration_buffer.update_buffer["actions"]) // n_sequences ) possible_policy_batches = len(update_buffer["actions"]) // n_sequences possible_batches = min(possible_policy_batches, possible_demo_batches) max_batches = self.samples_per_update // n_sequences kl_loss = [] policy_estimate = [] expert_estimate = [] z_log_sigma_sq = [] z_mean_expert = [] z_mean_policy = [] n_epoch = self.num_epoch for _epoch in range(n_epoch): self.demonstration_buffer.update_buffer.shuffle() update_buffer.shuffle() if max_batches == 0: num_batches = possible_batches else: num_batches = min(possible_batches, max_batches) for i in range(num_batches): demo_update_buffer = self.demonstration_buffer.update_buffer policy_update_buffer = update_buffer start = i * n_sequences end = (i + 1) * n_sequences mini_batch_demo = demo_update_buffer.make_mini_batch(start, end) mini_batch_policy = policy_update_buffer.make_mini_batch(start, end) run_out = self._update_batch(mini_batch_demo, mini_batch_policy) loss = run_out["gail_loss"] policy_estimate.append(run_out["policy_estimate"]) expert_estimate.append(run_out["expert_estimate"]) if self.model.use_vail: kl_loss.append(run_out["kl_loss"]) z_log_sigma_sq.append(run_out["z_log_sigma_sq"]) z_mean_policy.append(run_out["z_mean_policy"]) z_mean_expert.append(run_out["z_mean_expert"]) batch_losses.append(loss) self.has_updated = True print_list = ["n_epoch", "beta", "policy_estimate", "expert_estimate"] print_vals = [ n_epoch, self.policy.sess.run(self.model.beta), np.mean(policy_estimate), np.mean(expert_estimate), ] if self.model.use_vail: print_list += [ "kl_loss", "z_mean_expert", "z_mean_policy", "z_log_sigma_sq", ] print_vals += [ np.mean(kl_loss), np.mean(z_mean_expert), np.mean(z_mean_policy), np.mean(z_log_sigma_sq), ] LOGGER.debug( "GAIL Debug:\n\t\t" + "\n\t\t".join( "{0}: {1}".format(_name, _val) for _name, _val in zip(print_list, print_vals) ) ) update_stats = {"Losses/GAIL Loss": np.mean(batch_losses)} return update_stats
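# --- Standalone sketch of the GAIL batch bookkeeping --------------------------
# Mirrors the arithmetic at the top of update() with hypothetical buffer sizes:
# n_sequences is halved so the combined demo + policy mini-batch stays roughly
# the size of a normal policy mini-batch, and samples_per_update == 0 means
# "use every possible batch".
n_sequences = max(8 // 2, 1)                  # incoming n_sequences = 8, halved to 4
possible_demo_batches = 600 // n_sequences    # demo buffer holds 600 actions
possible_policy_batches = 400 // n_sequences  # policy buffer holds 400 actions
possible_batches = min(possible_demo_batches, possible_policy_batches)

samples_per_update = 0
max_batches = samples_per_update // n_sequences
num_batches = possible_batches if max_batches == 0 else min(possible_batches, max_batches)
assert num_batches == 100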