def advance(self, env: EnvManager) -> int:
    """
    Step the environment once, forward the new experiences to the processor
    of each brain that acted, then let every trainer either train or advance.

    :param env: Environment manager that is stepped; it also receives the
        trainer's policy again after each policy update.
    :return: The number of step infos returned by this environment step.
    """
    with hierarchical_timer("env_step"):
        step_batch = env.step()
    step_count = len(step_batch)

    # Forward each step's data to the processor of every brain that has
    # actions recorded in that step.
    for info in step_batch:
        for brain_name in self.trainers:
            if not info.has_actions_for_brain(brain_name):
                continue
            brain_processor = self.managers[brain_name].processor
            brain_processor.add_experiences(
                info.previous_all_brain_info[brain_name],
                info.current_all_brain_info[brain_name],
                info.brain_name_to_action_info[brain_name].outputs,
            )

    for brain_name, trainer in self.trainers.items():
        keep_training = (
            self.train_model and trainer.get_step <= trainer.get_max_steps
        )
        if not keep_training:
            # Avoid a memory leak during inference. Eventually this whole
            # block is meant to live in trainer.advance(); per the original
            # note, that currently only calls clear_update_buffer() in
            # RLTrainer and does nothing in the base class.
            trainer.advance()
            continue

        trainer.increment_step(step_count)
        if trainer.is_ready_update():
            # Perform gradient descent with the experience buffer, then hand
            # the trainer's policy back to the environment.
            with hierarchical_timer("update_policy"):
                trainer.update_policy()
            env.set_policy(brain_name, trainer.policy)

    return step_count
def _get_and_process_experiences(self, env: EnvManager) -> int:
    """
    Apply any queued policies to the environment, step it once, and pass the
    resulting experiences to the matching agent managers.

    :param env: Environment manager that receives new policies and is stepped.
    :return: The number of step infos returned by this environment step.
    """
    with hierarchical_timer("env_step"):
        # Lazily walk every behavior id registered under every trainer's
        # brain name, in the same order as a nested loop would.
        behavior_ids = (
            behavior_id
            for brain_name in self.trainers
            for behavior_id in self.brain_name_to_identifier[brain_name]
        )
        for behavior_id in behavior_ids:
            # Pick up a new policy if one is waiting; an empty queue simply
            # means there is nothing to update for this behavior.
            try:
                queued_policy = self.managers[behavior_id].policy_queue.get_nowait()
                env.set_policy(behavior_id, queued_policy)
            except AgentManagerQueue.Empty:
                pass
        # Step the environment
        step_batch = env.step()

    # Add the new experiences to the agent managers.
    for info in step_batch:
        for behavior_id in info.name_behavior_ids:
            if behavior_id in self.managers:
                self.managers[behavior_id].add_experiences(
                    info.previous_all_brain_info[behavior_id],
                    info.current_all_brain_info[behavior_id],
                    info.brain_name_to_action_info[behavior_id].outputs,
                )
            else:
                # No manager exists for this behavior id: warn and skip it.
                self.logger.warning(
                    "Agent manager was not created for behavior id {}.".format(
                        behavior_id
                    )
                )
    return len(step_batch)
def _get_and_process_experiences(self, env: EnvManager) -> int:
    """
    Apply any queued policies to the environment, step it once, and delegate
    handling of the resulting step infos to ``self._process_step_infos``.

    :param env: Environment manager that receives new policies and is stepped.
    :return: Whatever ``self._process_step_infos`` returns for the new step
        infos (annotated as an int).
    """
    with hierarchical_timer("env_step"):
        # Lazily walk every behavior id registered under every trainer's
        # brain name, in the same order as a nested loop would.
        behavior_ids = (
            behavior_id
            for brain_name in self.trainers
            for behavior_id in self.brain_name_to_identifier[brain_name]
        )
        for behavior_id in behavior_ids:
            # Pick up a new policy if one is waiting; an empty queue simply
            # means there is nothing to update for this behavior.
            try:
                queued_policy = self.managers[behavior_id].policy_queue.get_nowait()
                env.set_policy(behavior_id, queued_policy)
            except AgentManagerQueue.Empty:
                pass
        # Step the environment
        step_batch = env.step()

    # Add to AgentProcessor
    return self._process_step_infos(step_batch)
def advance(self, env: EnvManager) -> int:
    """
    Step the environment once, forward the new experiences to the processor
    of each known behavior id, then let every trainer either train or advance.

    :param env: Environment manager that is stepped; after a policy update it
        receives the trainer's policy for each of the brain's behavior ids.
    :return: The number of step infos returned by this environment step.
    """
    with hierarchical_timer("env_step"):
        step_batch = env.step()
    step_count = len(step_batch)

    for info in step_batch:
        for behavior_id in info.name_behavior_ids:
            if behavior_id in self.managers:
                behavior_processor = self.managers[behavior_id].processor
                behavior_processor.add_experiences(
                    info.previous_all_brain_info[behavior_id],
                    info.current_all_brain_info[behavior_id],
                    info.brain_name_to_action_info[behavior_id].outputs,
                )
            else:
                # No manager exists for this behavior id: warn and skip it.
                self.logger.warning(
                    "Agent manager was not created for behavior id {}.".format(
                        behavior_id
                    )
                )

    for brain_name, trainer in self.trainers.items():
        keep_training = (
            self.train_model and trainer.get_step <= trainer.get_max_steps
        )
        if not keep_training:
            # Avoid a memory leak during inference. Eventually this whole
            # block is meant to live in trainer.advance(); per the original
            # note, that currently only calls clear_update_buffer() in
            # RLTrainer and does nothing in the base class.
            trainer.advance()
            continue

        # Advance the step counter of the trainer and of each of its policies.
        trainer.increment_step(step_count)
        for behavior_id in self.brain_name_to_identifier[brain_name]:
            trainer.get_policy(behavior_id).increment_step(step_count)

        if trainer.is_ready_update():
            # Perform gradient descent with the experience buffer, then hand
            # every behavior's policy back to the environment.
            with hierarchical_timer("update_policy"):
                trainer.update_policy()
            for behavior_id in self.brain_name_to_identifier[brain_name]:
                env.set_policy(behavior_id, trainer.get_policy(behavior_id))

    return step_count
def advance(self, env: EnvManager) -> int:
    """
    Step the environment once while timing the step, feed the new experiences
    directly to the trainers, then let every trainer either train or clear
    its update buffer.

    :param env: Environment manager that is stepped; it also receives the
        trainer's policy again after each policy update.
    :return: The number of step infos returned by this environment step.
    """
    with hierarchical_timer("env_step"):
        step_started_at = time()
        step_batch = env.step()
        step_duration = time() - step_started_at
    step_count = len(step_batch)

    for info in step_batch:
        for brain_name, trainer in self.trainers.items():
            # Record the step duration for brains that have metrics tracked.
            if brain_name in self.trainer_metrics:
                self.trainer_metrics[brain_name].add_delta_step(step_duration)
            if not info.has_actions_for_brain(brain_name):
                continue
            trainer.add_experiences(
                info.previous_all_brain_info[brain_name],
                info.current_all_brain_info[brain_name],
                info.brain_name_to_action_info[brain_name].outputs,
            )
            trainer.process_experiences(
                info.previous_all_brain_info[brain_name],
                info.current_all_brain_info[brain_name],
            )

    for brain_name, trainer in self.trainers.items():
        # NOTE(review): add_delta_step is called once per (step info, brain)
        # pair above and once more per brain here, always with the same
        # duration - confirm the metrics are meant to accumulate this way.
        if brain_name in self.trainer_metrics:
            self.trainer_metrics[brain_name].add_delta_step(step_duration)

        keep_training = (
            self.train_model and trainer.get_step <= trainer.get_max_steps
        )
        if not keep_training:
            # Avoid memory leak during inference
            trainer.clear_update_buffer()
            continue

        trainer.increment_step(step_count)
        if trainer.is_ready_update():
            # Perform gradient descent with the experience buffer, then hand
            # the trainer's policy back to the environment.
            with hierarchical_timer("update_policy"):
                trainer.update_policy()
            env.set_policy(brain_name, trainer.policy)

    return step_count