def start_trainer(self, trainer: Trainer, env_manager: EnvManager) -> None:
    """
    Register a trainer with this controller and pass its policy to the environment.

    :param trainer: Trainer stored in ``self.trainers`` under ``trainer.brain_name``.
    :param env_manager: Environment manager that receives ``trainer.policy``.
    """
    brain_name = trainer.brain_name
    self.trainers[brain_name] = trainer
    self.logger.info(trainer)

    # Hyperparameters are only written out when a model is being trained.
    if self.train_model:
        trainer.write_tensorboard_text("Hyperparameters", trainer.parameters)

    env_manager.set_policy(brain_name, trainer.policy)
def _create_trainer_and_manager(
    self, env_manager: EnvManager, name_behavior_id: str
) -> None:
    """
    Set up the trainer, policy and agent manager for a single behavior id.

    The trainer is looked up by the brain name parsed from ``name_behavior_id``.
    When none is registered yet, one is generated by ``self.trainer_factory``,
    stored, logged, and (if ``self.train_model``) its hyperparameters are
    written as tensorboard text.

    :param env_manager: Environment manager that supplies the external brain
        for this behavior and receives the new policy and agent manager.
    :param name_behavior_id: Full behavior identifier to set up.
    """
    identifiers = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id)
    brain_name = identifiers.brain_name

    try:
        trainer = self.trainers[brain_name]
    except KeyError:
        # No trainer registered for this brain yet: build one and announce it.
        trainer = self.trainer_factory.generate(brain_name)
        self.trainers[brain_name] = trainer
        self.logger.info(trainer)
        if self.train_model:
            trainer.write_tensorboard_text("Hyperparameters", trainer.parameters)

    external_brain = env_manager.external_brains[name_behavior_id]
    policy = trainer.create_policy(external_brain)
    trainer.add_policy(name_behavior_id, policy)

    # Fall back to sys.maxsize when the trainer config has no "time_horizon".
    time_horizon = trainer.parameters.get("time_horizon", sys.maxsize)
    manager = AgentManager(
        policy, name_behavior_id, trainer.stats_reporter, time_horizon
    )
    env_manager.set_agent_manager(name_behavior_id, manager)
    env_manager.set_policy(name_behavior_id, policy)
    self.brain_name_to_identifier[brain_name].add(name_behavior_id)

    # Connect the trainer to the manager's policy and trajectory queues.
    trainer.publish_policy_queue(manager.policy_queue)
    trainer.subscribe_trajectory_queue(manager.trajectory_queue)
def advance(self, env: EnvManager) -> int:
    """
    Step the environment, forward the new experiences, and advance each trainer.

    :param env: Environment manager that is stepped and that receives a
        trainer's policy after that trainer updates it.
    :return: Number of step infos returned by ``env.step()``.
    """
    with hierarchical_timer("env_step"):
        step_infos = env.step()

    # Hand each step to the processor of every brain that has actions in it.
    for step_info in step_infos:
        for brain_name in self.trainers:
            if not step_info.has_actions_for_brain(brain_name):
                continue
            processor = self.managers[brain_name].processor
            previous_info = step_info.previous_all_brain_info[brain_name]
            current_info = step_info.current_all_brain_info[brain_name]
            action_outputs = step_info.brain_name_to_action_info[brain_name].outputs
            processor.add_experiences(previous_info, current_info, action_outputs)

    for brain_name, trainer in self.trainers.items():
        keep_training = (
            self.train_model and trainer.get_step <= trainer.get_max_steps
        )
        if not keep_training:
            # Avoid a memory leak during inference. Eventually the whole
            # training branch is meant to live in trainer.advance(); for now
            # that call only clears the update buffer in RLTrainer and does
            # nothing in the base class.
            trainer.advance()
            continue

        trainer.increment_step(len(step_infos))
        if trainer.is_ready_update():
            # Perform gradient descent with the experience buffer.
            with hierarchical_timer("update_policy"):
                trainer.update_policy()
            env.set_policy(brain_name, trainer.policy)

    return len(step_infos)
def _create_trainer_and_manager(
    self, env_manager: EnvManager, name_behavior_id: str
) -> None:
    """
    Create (or reuse) a trainer for a behavior id and attach a policy and manager.

    The trainer is keyed by the brain name parsed from ``name_behavior_id``; a
    missing trainer is generated by ``self.trainer_factory`` and stored in
    ``self.trainers``.

    :param env_manager: Environment manager that supplies the external brain
        for this behavior and receives the new policy and agent manager.
    :param name_behavior_id: Full behavior identifier to set up.
    """
    behavior_identifiers = BehaviorIdentifiers.from_name_behavior_id(
        name_behavior_id
    )
    brain_name = behavior_identifiers.brain_name

    try:
        trainer = self.trainers[brain_name]
    except KeyError:
        # First behavior seen for this brain name.
        trainer = self.trainer_factory.generate(brain_name)
        self.trainers[brain_name] = trainer

    external_brain = env_manager.external_brains[name_behavior_id]
    policy = trainer.create_policy(behavior_identifiers, external_brain)
    trainer.add_policy(behavior_identifiers, policy)

    # Fall back to sys.maxsize when the trainer config has no "time_horizon".
    time_horizon = trainer.parameters.get("time_horizon", sys.maxsize)
    manager = AgentManager(
        policy, name_behavior_id, trainer.stats_reporter, time_horizon
    )
    env_manager.set_agent_manager(name_behavior_id, manager)
    env_manager.set_policy(name_behavior_id, policy)
    self.brain_name_to_identifier[brain_name].add(name_behavior_id)

    # Connect the trainer to the manager's policy and trajectory queues.
    trainer.publish_policy_queue(manager.policy_queue)
    trainer.subscribe_trajectory_queue(manager.trajectory_queue)
def _get_and_process_experiences(self, env: EnvManager) -> int:
    """
    Apply any queued policies, step the environment, and record the experiences.

    :param env: Environment manager that receives queued policies and is stepped.
    :return: Number of step infos returned by ``env.step()``.
    """
    with hierarchical_timer("env_step"):
        # Pick up a new policy for each known behavior if one is waiting.
        for brain_name in self.trainers:
            for behavior_id in self.brain_name_to_identifier[brain_name]:
                policy_queue = self.managers[behavior_id].policy_queue
                try:
                    queued_policy = policy_queue.get_nowait()
                    env.set_policy(behavior_id, queued_policy)
                except AgentManagerQueue.Empty:
                    # Nothing queued for this behavior; keep the current policy.
                    pass
        # Step the environment.
        step_infos = env.step()

    # Forward every step to the manager registered for its behavior id.
    for step_info in step_infos:
        for behavior_id in step_info.name_behavior_ids:
            if behavior_id in self.managers:
                self.managers[behavior_id].add_experiences(
                    step_info.previous_all_brain_info[behavior_id],
                    step_info.current_all_brain_info[behavior_id],
                    step_info.brain_name_to_action_info[behavior_id].outputs,
                )
            else:
                self.logger.warning(
                    "Agent manager was not created for behavior id {}.".format(
                        behavior_id
                    )
                )
    return len(step_infos)
def _create_trainer_and_manager(
    self, env_manager: EnvManager, name_behavior_id: str
) -> None:
    """
    Create (or reuse) a trainer for a behavior id and attach a policy and manager.

    The trainer is keyed by the brain name parsed from ``name_behavior_id``; a
    missing trainer is generated by ``self.trainer_factory`` and stored in
    ``self.trainers``. When a newly generated trainer has ``trainer.threaded``
    set, a single daemon thread running ``self.trainer_update_func(trainer)``
    is started and recorded in ``self.trainer_threads``.

    A trainer that already exists (another behavior id with the same brain
    name was set up earlier) does NOT get a second update thread; previously
    every call started one, so a shared trainer ended up with several threads
    running its update function concurrently.

    :param env_manager: Environment manager that supplies the external brain
        for this behavior and receives the new policy and agent manager.
    :param name_behavior_id: Full behavior identifier to set up.
    """
    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(
        name_behavior_id
    )
    brain_name = parsed_behavior_id.brain_name

    trainer_thread = None
    try:
        trainer = self.trainers[brain_name]
    except KeyError:
        trainer = self.trainer_factory.generate(brain_name)
        self.trainers[brain_name] = trainer
        if trainer.threaded:
            # Only a newly created trainer gets an update thread.
            trainer_thread = threading.Thread(
                target=self.trainer_update_func, args=(trainer,), daemon=True
            )

    policy = trainer.create_policy(
        parsed_behavior_id, env_manager.external_brains[name_behavior_id]
    )
    trainer.add_policy(parsed_behavior_id, policy)

    agent_manager = AgentManager(
        policy,
        name_behavior_id,
        trainer.stats_reporter,
        # Fall back to sys.maxsize when the config has no "time_horizon".
        trainer.parameters.get("time_horizon", sys.maxsize),
    )
    env_manager.set_agent_manager(name_behavior_id, agent_manager)
    env_manager.set_policy(name_behavior_id, policy)
    self.brain_name_to_identifier[brain_name].add(name_behavior_id)

    trainer.publish_policy_queue(agent_manager.policy_queue)
    trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue)

    # Start the thread last, once the policy and queues above are wired up.
    if trainer_thread is not None:
        trainer_thread.start()
        self.trainer_threads.append(trainer_thread)
def _create_trainer_and_manager(
    self, env_manager: EnvManager, name_behavior_id: str
) -> None:
    """
    Create (or reuse) a trainer for a behavior id and attach a policy and manager.

    For a brain name with no trainer yet, a trainer is generated and stored,
    an update thread is prepared when ``trainer.threaded`` is set, and
    ``env_manager.on_training_started`` is called with that brain's trainer
    config. The prepared thread is started only after the policy and queues
    have been wired up.

    :param env_manager: Environment manager that supplies the training behavior
        for this id and receives the new policy and agent manager.
    :param name_behavior_id: Full behavior identifier to set up.
    """
    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(
        name_behavior_id
    )
    brain_name = parsed_behavior_id.brain_name

    pending_thread = None
    if brain_name not in self.trainers:
        trainer = self.trainer_factory.generate(brain_name)
        self.trainers[brain_name] = trainer
        if trainer.threaded:
            # Only create a trainer thread for new trainers.
            pending_thread = threading.Thread(
                target=self.trainer_update_func, args=(trainer,), daemon=True
            )
            self.trainer_threads.append(pending_thread)
        trainer_config = self.trainer_factory.trainer_config[brain_name]
        env_manager.on_training_started(brain_name, trainer_config)
    else:
        trainer = self.trainers[brain_name]

    training_behavior = env_manager.training_behaviors[name_behavior_id]
    policy = trainer.create_policy(
        parsed_behavior_id, training_behavior, create_graph=True
    )
    trainer.add_policy(parsed_behavior_id, policy)

    manager = AgentManager(
        policy,
        name_behavior_id,
        trainer.stats_reporter,
        trainer.parameters.time_horizon,
        threaded=trainer.threaded,
    )
    env_manager.set_agent_manager(name_behavior_id, manager)
    env_manager.set_policy(name_behavior_id, policy)
    self.brain_name_to_identifier[brain_name].add(name_behavior_id)

    trainer.publish_policy_queue(manager.policy_queue)
    trainer.subscribe_trajectory_queue(manager.trajectory_queue)

    # Reused trainers have no pending thread, so there is nothing to start.
    if pending_thread is None:
        return
    pending_thread.start()
def _get_and_process_experiences(self, env: EnvManager) -> int:
    """
    Apply any queued policies, step the environment, and process the step infos.

    :param env: Environment manager that receives queued policies and is stepped.
    :return: Whatever ``self._process_step_infos`` returns for the new step infos.
    """
    with hierarchical_timer("env_step"):
        # Pick up a new policy for each known behavior if one is waiting.
        for brain_name in self.trainers:
            for behavior_id in self.brain_name_to_identifier[brain_name]:
                policy_queue = self.managers[behavior_id].policy_queue
                try:
                    queued_policy = policy_queue.get_nowait()
                    env.set_policy(behavior_id, queued_policy)
                except AgentManagerQueue.Empty:
                    # Nothing queued for this behavior; keep the current policy.
                    pass
        # Step the environment.
        step_infos = env.step()

    # Processing the step infos is delegated to _process_step_infos.
    return self._process_step_infos(step_infos)
def advance(self, env: EnvManager) -> int:
    """
    Step the environment, forward the new experiences, and advance each trainer.

    :param env: Environment manager that is stepped and that receives the
        trainer's policies after a policy update.
    :return: Number of step infos returned by ``env.step()``.
    """
    with hierarchical_timer("env_step"):
        step_infos = env.step()

    # Route every step to the processor registered for its behavior id.
    for step_info in step_infos:
        for behavior_id in step_info.name_behavior_ids:
            if behavior_id in self.managers:
                processor = self.managers[behavior_id].processor
                processor.add_experiences(
                    step_info.previous_all_brain_info[behavior_id],
                    step_info.current_all_brain_info[behavior_id],
                    step_info.brain_name_to_action_info[behavior_id].outputs,
                )
            else:
                self.logger.warning(
                    "Agent manager was not created for behavior id {}.".format(
                        behavior_id
                    )
                )

    for brain_name, trainer in self.trainers.items():
        keep_training = (
            self.train_model and trainer.get_step <= trainer.get_max_steps
        )
        if not keep_training:
            # Avoid a memory leak during inference. Eventually the whole
            # training branch is meant to live in trainer.advance(); for now
            # that call only clears the update buffer in RLTrainer and does
            # nothing in the base class.
            trainer.advance()
            continue

        n_steps = len(step_infos)
        trainer.increment_step(n_steps)
        behavior_ids = self.brain_name_to_identifier[brain_name]
        for behavior_id in behavior_ids:
            trainer.get_policy(behavior_id).increment_step(n_steps)

        if trainer.is_ready_update():
            # Perform gradient descent with the experience buffer.
            with hierarchical_timer("update_policy"):
                trainer.update_policy()
            # Push the trainer's policy for each of its behaviors to the env.
            for behavior_id in behavior_ids:
                env.set_policy(behavior_id, trainer.get_policy(behavior_id))

    return len(step_infos)
def advance(self, env: EnvManager) -> int:
    """
    Step the environment, feed the experiences to the trainers, and update them.

    The wall-clock duration of ``env.step()`` is recorded into
    ``self.trainer_metrics`` for every trainer that has a metrics entry.

    :param env: Environment manager that is stepped and that receives a
        trainer's policy after that trainer updates it.
    :return: Number of step infos returned by ``env.step()``.
    """
    with hierarchical_timer("env_step"):
        step_started_at = time()
        step_infos = env.step()
        step_duration = time() - step_started_at

    def record_step_duration(name):
        # Only brains that have a metrics entry are tracked.
        if name in self.trainer_metrics:
            self.trainer_metrics[name].add_delta_step(step_duration)

    for step_info in step_infos:
        for brain_name, trainer in self.trainers.items():
            record_step_duration(brain_name)
            if not step_info.has_actions_for_brain(brain_name):
                continue
            trainer.add_experiences(
                step_info.previous_all_brain_info[brain_name],
                step_info.current_all_brain_info[brain_name],
                step_info.brain_name_to_action_info[brain_name].outputs,
            )
            trainer.process_experiences(
                step_info.previous_all_brain_info[brain_name],
                step_info.current_all_brain_info[brain_name],
            )

    for brain_name, trainer in self.trainers.items():
        record_step_duration(brain_name)
        keep_training = (
            self.train_model and trainer.get_step <= trainer.get_max_steps
        )
        if not keep_training:
            # Avoid a memory leak during inference.
            trainer.clear_update_buffer()
            continue

        trainer.increment_step(len(step_infos))
        if trainer.is_ready_update():
            # Perform gradient descent with the experience buffer.
            with hierarchical_timer("update_policy"):
                trainer.update_policy()
            env.set_policy(brain_name, trainer.policy)

    return len(step_infos)
def start_learning(self, env_manager: EnvManager) -> None:
    """
    Run the main loop until ``self._not_done_training()`` is false or interrupted.

    Each iteration sets up a trainer, policy and agent manager for any
    behavior id that newly appeared in ``env_manager.external_brains``, then
    calls ``self.advance`` and does per-step bookkeeping (environment reset
    check, periodic model save, tensorboard write). ``KeyboardInterrupt`` and
    ``UnityCommunicationException`` end the loop; when ``self.train_model`` is
    set the model is saved through ``self._save_model_when_interrupted``.

    :param env_manager: Environment manager to reset, step and read behaviors from.
    """
    self._create_model_path(self.model_path)
    tf.reset_default_graph()
    global_step = 0
    known_behavior_ids: Set[str] = set()
    try:
        self._reset_env(env_manager)
        while self._not_done_training():
            current_behavior_ids = set(env_manager.external_brains.keys())
            for behavior_id in current_behavior_ids - known_behavior_ids:
                # An id of the form "<brain>?<rest>" yields the part before
                # the "?"; any other shape is used as the brain name as-is.
                id_parts = behavior_id.split("?")
                brain_name = id_parts[0] if len(id_parts) == 2 else behavior_id

                try:
                    trainer = self.trainers[brain_name]
                except KeyError:
                    trainer = self.trainer_factory.generate(brain_name)
                    self.trainers[brain_name] = trainer
                    self.logger.info(trainer)
                    if self.train_model:
                        trainer.write_tensorboard_text(
                            "Hyperparameters", trainer.parameters
                        )

                policy = trainer.create_policy(
                    env_manager.external_brains[behavior_id]
                )
                trainer.add_policy(behavior_id, policy)
                env_manager.set_policy(behavior_id, policy)
                self.brain_name_to_identifier[brain_name].add(behavior_id)

                processor = AgentProcessor(
                    trainer,
                    policy,
                    behavior_id,
                    trainer.stats_reporter,
                    # Fall back to sys.maxsize when no "time_horizon" is set.
                    trainer.parameters.get("time_horizon", sys.maxsize),
                )
                self.managers[behavior_id] = AgentManager(processor=processor)
            known_behavior_ids = current_behavior_ids

            # advance() reports how many steps were taken; account for each.
            for _ in range(self.advance(env_manager)):
                global_step += 1
                self.reset_env_if_ready(env_manager, global_step)
                if self._should_save_model(global_step):
                    # Save the Tensorflow model.
                    self._save_model()
                self.write_to_tensorboard(global_step)

        # Final save of the Tensorflow model.
        if global_step != 0 and self.train_model:
            self._save_model()
    except (KeyboardInterrupt, UnityCommunicationException):
        if self.train_model:
            self._save_model_when_interrupted()

    if self.train_model:
        self._export_graph()
    self._write_timing_tree()