def advance(self, env: EnvManager) -> int:
    with hierarchical_timer("env_step"):
        new_step_infos = env.step()
    for step_info in new_step_infos:
        for brain_name, trainer in self.trainers.items():
            if step_info.has_actions_for_brain(brain_name):
                _processor = self.managers[brain_name].processor
                _processor.add_experiences(
                    step_info.previous_all_brain_info[brain_name],
                    step_info.current_all_brain_info[brain_name],
                    step_info.brain_name_to_action_info[brain_name].outputs,
                )
    for brain_name, trainer in self.trainers.items():
        if self.train_model and trainer.get_step <= trainer.get_max_steps:
            trainer.increment_step(len(new_step_infos))
            if trainer.is_ready_update():
                # Perform gradient descent with experience buffer
                with hierarchical_timer("update_policy"):
                    trainer.update_policy()
                env.set_policy(brain_name, trainer.policy)
        else:
            # Avoid memory leak during inference.
            # Eventually this whole block will take place in advance(),
            # but currently this only calls clear_update_buffer() in
            # RLTrainer and nothing in the base class.
            trainer.advance()
    return len(new_step_infos)
def _create_trainer_and_manager(self, env_manager: EnvManager,
                                name_behavior_id: str) -> None:
    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        name_behavior_id).brain_name
    try:
        trainer = self.trainers[brain_name]
    except KeyError:
        trainer = self.trainer_factory.generate(brain_name)
        self.trainers[brain_name] = trainer
        self.logger.info(trainer)
        if self.train_model:
            trainer.write_tensorboard_text("Hyperparameters",
                                           trainer.parameters)
    policy = trainer.create_policy(
        env_manager.external_brains[name_behavior_id])
    trainer.add_policy(name_behavior_id, policy)
    agent_manager = AgentManager(
        policy,
        name_behavior_id,
        trainer.stats_reporter,
        trainer.parameters.get("time_horizon", sys.maxsize),
    )
    env_manager.set_agent_manager(name_behavior_id, agent_manager)
    env_manager.set_policy(name_behavior_id, policy)
    self.brain_name_to_identifier[brain_name].add(name_behavior_id)
    trainer.publish_policy_queue(agent_manager.policy_queue)
    trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue)
def start_trainer(self, trainer: Trainer, env_manager: EnvManager) -> None:
    self.trainers[trainer.brain_name] = trainer
    self.logger.info(trainer)
    if self.train_model:
        trainer.write_tensorboard_text("Hyperparameters", trainer.parameters)
    env_manager.set_policy(trainer.brain_name, trainer.policy)
def _create_trainer_and_manager(self, env_manager: EnvManager,
                                name_behavior_id: str) -> None:
    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(
        name_behavior_id)
    brain_name = parsed_behavior_id.brain_name
    try:
        trainer = self.trainers[brain_name]
    except KeyError:
        trainer = self.trainer_factory.generate(brain_name)
        self.trainers[brain_name] = trainer
    policy = trainer.create_policy(
        parsed_behavior_id, env_manager.external_brains[name_behavior_id])
    trainer.add_policy(parsed_behavior_id, policy)
    agent_manager = AgentManager(
        policy,
        name_behavior_id,
        trainer.stats_reporter,
        trainer.parameters.get("time_horizon", sys.maxsize),
    )
    env_manager.set_agent_manager(name_behavior_id, agent_manager)
    env_manager.set_policy(name_behavior_id, policy)
    self.brain_name_to_identifier[brain_name].add(name_behavior_id)
    trainer.publish_policy_queue(agent_manager.policy_queue)
    trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue)
    if trainer.threaded:
        # Start trainer thread
        trainerthread = threading.Thread(
            target=self.trainer_update_func, args=(trainer,), daemon=True)
        trainerthread.start()
        self.trainer_threads.append(trainerthread)
def _get_and_process_experiences(self, env: EnvManager) -> int: with hierarchical_timer("env_step"): # Get new policies if found for brain_name in self.trainers.keys(): for name_behavior_id in self.brain_name_to_identifier[ brain_name]: try: _policy = self.managers[ name_behavior_id].policy_queue.get_nowait() env.set_policy(name_behavior_id, _policy) except AgentManagerQueue.Empty: pass # Step the environment new_step_infos = env.step() # Add to AgentProcessor for step_info in new_step_infos: for name_behavior_id in step_info.name_behavior_ids: if name_behavior_id not in self.managers: self.logger.warning( "Agent manager was not created for behavior id {}.". format(name_behavior_id)) continue self.managers[name_behavior_id].add_experiences( step_info.previous_all_brain_info[name_behavior_id], step_info.current_all_brain_info[name_behavior_id], step_info.brain_name_to_action_info[name_behavior_id]. outputs, ) return len(new_step_infos)
def _create_trainer_and_manager(
    self, env_manager: EnvManager, name_behavior_id: str
) -> None:
    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id)
    brain_name = parsed_behavior_id.brain_name
    try:
        trainer = self.trainers[brain_name]
    except KeyError:
        trainer = self.trainer_factory.generate(brain_name)
        self.trainers[brain_name] = trainer
    policy = trainer.create_policy(
        parsed_behavior_id, env_manager.external_brains[name_behavior_id]
    )
    trainer.add_policy(parsed_behavior_id, policy)
    agent_manager = AgentManager(
        policy,
        name_behavior_id,
        trainer.stats_reporter,
        trainer.parameters.get("time_horizon", sys.maxsize),
    )
    env_manager.set_agent_manager(name_behavior_id, agent_manager)
    env_manager.set_policy(name_behavior_id, policy)
    self.brain_name_to_identifier[brain_name].add(name_behavior_id)
    trainer.publish_policy_queue(agent_manager.policy_queue)
    trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue)
def reset_env_if_ready(self, env: EnvManager) -> None:
    # Get the sizes of the reward buffers.
    reward_buff = {k: list(t.reward_buffer) for (k, t) in self.trainers.items()}
    curr_step = {k: int(t.step) for (k, t) in self.trainers.items()}
    max_step = {k: int(t.get_max_steps) for (k, t) in self.trainers.items()}
    # Attempt to increment the lessons of the brains that were ready.
    updated, param_must_reset = self.param_manager.update_lessons(
        curr_step, max_step, reward_buff)
    if updated:
        for trainer in self.trainers.values():
            trainer.reward_buffer.clear()
    # If the ghost trainer swapped teams, a reset is also required.
    ghost_controller_reset = self.ghost_controller.should_reset()
    if param_must_reset or ghost_controller_reset:
        self._reset_env(env)  # This reset also sends the new config to env
        self.end_trainer_episodes()
    elif updated:
        env.set_env_parameters(self.param_manager.get_current_samplers())
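# reset_env_if_ready() above calls end_trainer_episodes(), which is not shown
# in this section. A minimal sketch of what it might look like, assuming each
# trainer exposes an end_episode() hook that discards partial trajectories
# (the hook name is an assumption, not the confirmed implementation):
def end_trainer_episodes(self) -> None:
    # Tell every trainer the environment was reset mid-episode so it can
    # drop in-flight trajectories and reset its episode bookkeeping.
    for trainer in self.trainers.values():
        trainer.end_episode()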
def _reset_env(self, env: EnvManager) -> None: """Resets the environment. Returns: A Data structure corresponding to the initial reset state of the environment. """ new_config = self.param_manager.get_current_samplers() env.reset(config=new_config)
def _reset_env(self, env: EnvManager) -> None: """Resets the environment. Returns: A Data structure corresponding to the initial reset state of the environment. """ new_meta_curriculum_config = (self.meta_curriculum.get_config() if self.meta_curriculum else None) env.reset(config=new_meta_curriculum_config)
def _reset_env(self, env_manager: EnvManager) -> None: """Resets the environment. Returns: A Data structure corresponding to the initial reset state of the environment. """ new_config = self.param_manager.get_current_samplers() env_manager.reset(config=new_config) # Register any new behavior ids that were generated on the reset. self._register_new_behaviors(env_manager, env_manager.first_step_infos)
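# _reset_env() above relies on _register_new_behaviors(), which is not shown
# in this section. A minimal sketch, assuming the controller keeps a
# self.registered_behavior_ids set and that each step info exposes
# name_behavior_ids (the set attribute and the EnvironmentStep type
# annotation are assumptions for illustration):
def _register_new_behaviors(self, env_manager: EnvManager,
                            step_infos: List[EnvironmentStep]) -> None:
    # Collect every behavior id present in the given step infos.
    step_behavior_ids: Set[str] = set()
    for s in step_infos:
        step_behavior_ids |= set(s.name_behavior_ids)
    # Create trainers and managers only for ids we have not seen before.
    new_behavior_ids = step_behavior_ids - self.registered_behavior_ids
    for behavior_id in new_behavior_ids:
        self._create_trainer_and_manager(env_manager, behavior_id)
    self.registered_behavior_ids |= new_behavior_ids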
def _reset_env(self, env_manager: EnvManager) -> None: """Resets the environment. Returns: A Data structure corresponding to the initial reset state of the environment. """ sampled_reset_param = self.sampler_manager.sample_all() new_meta_curriculum_config = (self.meta_curriculum.get_config() if self.meta_curriculum else {}) sampled_reset_param.update(new_meta_curriculum_config) env_manager.reset(config=sampled_reset_param)
def _get_and_process_experiences(self, env: EnvManager) -> int: with hierarchical_timer("env_step"): # Get new policies if found for brain_name in self.trainers.keys(): for name_behavior_id in self.brain_name_to_identifier[ brain_name]: try: _policy = self.managers[ name_behavior_id].policy_queue.get_nowait() env.set_policy(name_behavior_id, _policy) except AgentManagerQueue.Empty: pass # Step the environment new_step_infos = env.step() # Add to AgentProcessor num_step_infos = self._process_step_infos(new_step_infos) return num_step_infos
def _create_trainer_and_manager(self, env_manager: EnvManager,
                                name_behavior_id: str) -> None:
    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(
        name_behavior_id)
    brain_name = parsed_behavior_id.brain_name
    trainerthread = None
    if brain_name in self.trainers:
        trainer = self.trainers[brain_name]
    else:
        trainer = self.trainer_factory.generate(brain_name)
        self.trainers[brain_name] = trainer
        if trainer.threaded:
            # Only create trainer thread for new trainers
            trainerthread = threading.Thread(
                target=self.trainer_update_func, args=(trainer,), daemon=True)
            self.trainer_threads.append(trainerthread)
    policy = trainer.create_policy(
        parsed_behavior_id,
        env_manager.training_behaviors[name_behavior_id],
        create_graph=True,
    )
    trainer.add_policy(parsed_behavior_id, policy)
    agent_manager = AgentManager(
        policy,
        name_behavior_id,
        trainer.stats_reporter,
        trainer.parameters.time_horizon,
        threaded=trainer.threaded,
    )
    env_manager.set_agent_manager(name_behavior_id, agent_manager)
    env_manager.set_policy(name_behavior_id, policy)
    self.brain_name_to_identifier[brain_name].add(name_behavior_id)
    trainer.publish_policy_queue(agent_manager.policy_queue)
    trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue)
    # Only start new trainers
    if trainerthread is not None:
        trainerthread.start()
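# The threaded variants above pass self.trainer_update_func as the thread
# target without showing it. A minimal sketch, assuming a self.kill_trainers
# flag that the controller sets when shutting threads down (the flag name is
# an assumption for illustration):
def trainer_update_func(self, trainer: Trainer) -> None:
    # Run the trainer's advance loop until the controller signals shutdown.
    while not self.kill_trainers:
        with hierarchical_timer("trainer_advance"):
            trainer.advance()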
def advance(self, env: EnvManager) -> int: with hierarchical_timer("env_step"): new_step_infos = env.step() for step_info in new_step_infos: for name_behavior_id in step_info.name_behavior_ids: if name_behavior_id not in self.managers: self.logger.warning( "Agent manager was not created for behavior id {}.". format(name_behavior_id)) continue _processor = self.managers[name_behavior_id].processor _processor.add_experiences( step_info.previous_all_brain_info[name_behavior_id], step_info.current_all_brain_info[name_behavior_id], step_info.brain_name_to_action_info[name_behavior_id]. outputs, ) for brain_name, trainer in self.trainers.items(): if self.train_model and trainer.get_step <= trainer.get_max_steps: n_steps = len(new_step_infos) trainer.increment_step(n_steps) for name_behavior_id in self.brain_name_to_identifier[ brain_name]: trainer.get_policy(name_behavior_id).increment_step( n_steps) if trainer.is_ready_update(): # Perform gradient descent with experience buffer with hierarchical_timer("update_policy"): trainer.update_policy() for name_behavior_id in self.brain_name_to_identifier[ brain_name]: env.set_policy(name_behavior_id, trainer.get_policy(name_behavior_id)) else: # Avoid memory leak during inference # Eventually this whole block will take place in advance() # But currently this only calls clear_update_buffer() in RLTrainer # and nothing in the base class trainer.advance() return len(new_step_infos)
def advance(self, env_manager: EnvManager) -> int: # Get steps with hierarchical_timer("env_step"): new_step_infos = env_manager.get_steps() self._register_new_behaviors(env_manager, new_step_infos) num_steps = env_manager.process_steps(new_step_infos) # Report current lesson for each environment parameter for ( param_name, lesson_number, ) in self.param_manager.get_current_lesson_number().items(): for trainer in self.trainers.values(): trainer.stats_reporter.set_stat( f"Environment/Lesson Number/{param_name}", lesson_number) for trainer in self.trainers.values(): if not trainer.threaded: with hierarchical_timer("trainer_advance"): trainer.advance() return num_steps
def advance(self, env: EnvManager) -> int: with hierarchical_timer("env_step"): time_start_step = time() new_step_infos = env.step() delta_time_step = time() - time_start_step for step_info in new_step_infos: for brain_name, trainer in self.trainers.items(): if brain_name in self.trainer_metrics: self.trainer_metrics[brain_name].add_delta_step( delta_time_step) if step_info.has_actions_for_brain(brain_name): trainer.add_experiences( step_info.previous_all_brain_info[brain_name], step_info.current_all_brain_info[brain_name], step_info.brain_name_to_action_info[brain_name]. outputs, ) trainer.process_experiences( step_info.previous_all_brain_info[brain_name], step_info.current_all_brain_info[brain_name], ) for brain_name, trainer in self.trainers.items(): if brain_name in self.trainer_metrics: self.trainer_metrics[brain_name].add_delta_step( delta_time_step) if self.train_model and trainer.get_step <= trainer.get_max_steps: trainer.increment_step(len(new_step_infos)) if trainer.is_ready_update(): # Perform gradient descent with experience buffer with hierarchical_timer("update_policy"): trainer.update_policy() env.set_policy(brain_name, trainer.policy) else: # Avoid memory leak during inference trainer.clear_update_buffer() return len(new_step_infos)
def advance(self, env: EnvManager) -> int: # Get steps with hierarchical_timer("env_step"): num_steps = env.advance() # Report current lesson if self.meta_curriculum: for brain_name, curr in self.meta_curriculum.brains_to_curricula.items( ): if brain_name in self.trainers: self.trainers[brain_name].stats_reporter.set_stat( "Environment/Lesson", curr.lesson_num) # Advance trainers. This can be done in a separate loop in the future. with hierarchical_timer("trainer_advance"): for trainer in self.trainers.values(): trainer.advance() return num_steps
def advance(self, env: EnvManager) -> int: # Get steps with hierarchical_timer("env_step"): num_steps = env.advance() # Report current lesson if self.meta_curriculum: for brain_name, curr in self.meta_curriculum.brains_to_curricula.items( ): if brain_name in self.trainers: self.trainers[brain_name].stats_reporter.set_stat( "Environment/Lesson", curr.lesson_num) for trainer in self.trainers.values(): if not trainer.threaded: with hierarchical_timer("trainer_advance"): trainer.advance() return num_steps
def start_learning(self, env_manager: EnvManager) -> None:
    self._create_model_path(self.model_path)
    tf.reset_default_graph()
    global_step = 0
    last_brain_behavior_ids: Set[str] = set()
    try:
        self._reset_env(env_manager)
        while self._not_done_training():
            external_brain_behavior_ids = set(env_manager.external_brains.keys())
            new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
            for name_behavior_id in new_behavior_ids:
                try:
                    brain_name, _ = name_behavior_id.split("?")
                except ValueError:
                    brain_name = name_behavior_id
                try:
                    trainer = self.trainers[brain_name]
                except KeyError:
                    trainer = self.trainer_factory.generate(brain_name)
                    self.trainers[brain_name] = trainer
                    self.logger.info(trainer)
                    if self.train_model:
                        trainer.write_tensorboard_text(
                            "Hyperparameters", trainer.parameters)
                policy = trainer.create_policy(
                    env_manager.external_brains[name_behavior_id])
                trainer.add_policy(name_behavior_id, policy)
                env_manager.set_policy(name_behavior_id, policy)
                self.brain_name_to_identifier[brain_name].add(name_behavior_id)
                agent_manager = AgentManager(
                    processor=AgentProcessor(
                        trainer,
                        policy,
                        name_behavior_id,
                        trainer.stats_reporter,
                        trainer.parameters.get("time_horizon", sys.maxsize),
                    )
                )
                self.managers[name_behavior_id] = agent_manager
            last_brain_behavior_ids = external_brain_behavior_ids
            n_steps = self.advance(env_manager)
            for _ in range(n_steps):
                global_step += 1
                self.reset_env_if_ready(env_manager, global_step)
                if self._should_save_model(global_step):
                    # Save TensorFlow model
                    self._save_model()
                self.write_to_tensorboard(global_step)
        # Final save of the TensorFlow model
        if global_step != 0 and self.train_model:
            self._save_model()
    except (KeyboardInterrupt, UnityCommunicationException):
        if self.train_model:
            self._save_model_when_interrupted()
    if self.train_model:
        self._export_graph()
    self._write_timing_tree()
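# A minimal usage sketch for start_learning(), assuming tc is an already
# constructed TrainerController and env_manager is any EnvManager
# implementation (construction of both is outside this section; the close()
# call assumes the manager owns environment worker processes):
try:
    tc.start_learning(env_manager)
finally:
    # Shut the environment workers down even if training raised.
    env_manager.close()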