def _check_environment_trains(
    env,
    trainer_config,
    reward_processor=default_reward_processor,
    meta_curriculum=None,
    success_threshold=0.9,
    env_manager=None,
):
    # Create controller and begin training.
    with tempfile.TemporaryDirectory() as dir:
        run_id = "id"
        save_freq = 99999
        seed = 1337
        StatsReporter.writers.clear()  # Clear StatsReporters so we don't write to file
        debug_writer = DebugWriter()
        StatsReporter.add_writer(debug_writer)
        # Make sure threading is turned off for determinism
        trainer_config["threading"] = False
        if env_manager is None:
            env_manager = SimpleEnvManager(env, FloatPropertiesChannel())
        trainer_factory = TrainerFactory(
            trainer_config=trainer_config,
            summaries_dir=dir,
            run_id=run_id,
            model_path=dir,
            keep_checkpoints=1,
            train_model=True,
            load_model=False,
            seed=seed,
            meta_curriculum=meta_curriculum,
            multi_gpu=False,
        )
        tc = TrainerController(
            trainer_factory=trainer_factory,
            summaries_dir=dir,
            model_path=dir,
            run_id=run_id,
            meta_curriculum=meta_curriculum,
            train=True,
            training_seed=seed,
            sampler_manager=SamplerManager(None),
            resampling_interval=None,
            save_freq=save_freq,
        )
        # Begin training
        tc.start_learning(env_manager)
        if success_threshold is not None:
            # For tests where we are just checking setup and not reward
            processed_rewards = [
                reward_processor(rewards) for rewards in env.final_rewards.values()
            ]
            assert all(not math.isnan(reward) for reward in processed_rewards)
            assert all(reward > success_threshold for reward in processed_rewards)
def check_environment_trains(
    env,
    trainer_config,
    reward_processor=default_reward_processor,
    env_parameter_manager=None,
    success_threshold=0.9,
    env_manager=None,
    training_seed=None,
):
    if env_parameter_manager is None:
        env_parameter_manager = EnvironmentParameterManager()
    # Create controller and begin training.
    with tempfile.TemporaryDirectory() as dir:
        run_id = "id"
        seed = 1337 if training_seed is None else training_seed
        StatsReporter.writers.clear()  # Clear StatsReporters so we don't write to file
        debug_writer = DebugWriter()
        StatsReporter.add_writer(debug_writer)
        if env_manager is None:
            env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
        trainer_factory = TrainerFactory(
            trainer_config=trainer_config,
            output_path=dir,
            train_model=True,
            load_model=False,
            seed=seed,
            param_manager=env_parameter_manager,
            multi_gpu=False,
        )
        tc = TrainerController(
            trainer_factory=trainer_factory,
            output_path=dir,
            run_id=run_id,
            param_manager=env_parameter_manager,
            train=True,
            training_seed=seed,
        )
        # Begin training
        tc.start_learning(env_manager)
        if success_threshold is not None:
            # For tests where we are just checking setup and not reward
            processed_rewards = [
                reward_processor(rewards) for rewards in env.final_rewards.values()
            ]
            assert all(not math.isnan(reward) for reward in processed_rewards)
            assert all(reward > success_threshold for reward in processed_rewards)
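# A minimal sketch of how this helper is usually driven from a test. The toy
# environment and config names below (SimpleEnvironment, PPO_TORCH_CONFIG) are
# assumptions about the surrounding test package, not guaranteed by this file:
#
#     env = SimpleEnvironment(["Simple"], action_sizes=(0, 1))
#     config = {"Simple": PPO_TORCH_CONFIG}
#     check_environment_trains(env, config)
#
# Training counts as successful when every behavior's processed final reward
# clears success_threshold; pass success_threshold=None to smoke-test only the
# setup path without asserting on reward.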
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    trainer = mock.Mock()
    processor = AgentProcessor(
        trainer,
        policy,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": [0.1, 0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1, 0.1],
        "log_probs": [0.1, 0.1],
    }
    mock_braininfo = mb.create_mock_braininfo(
        num_agents=2,
        num_vector_observations=8,
        num_vector_acts=2,
        num_vis_observations=num_vis_obs,
    )
    for _ in range(5):
        processor.add_experiences(mock_braininfo, mock_braininfo, fake_action_outputs)

    # Assert that two trajectories have been added to the Trainer
    assert len(trainer.process_trajectory.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = trainer.process_trajectory.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
    }
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_steps.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0
def __init__(
    self,
    brain_name: str,
    trainer_settings: TrainerSettings,
    training: bool,
    load: bool,
    artifact_path: str,
    reward_buff_cap: int = 1,
):
    """
    Responsible for collecting experiences and training a neural network model.
    :param brain_name: Brain name of brain to be trained.
    :param trainer_settings: The parameters for the trainer.
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded from a previous run.
    :param artifact_path: The directory within which to store artifacts from this trainer.
    :param reward_buff_cap: Maximum number of episode rewards kept in the reward buffer.
    """
    self.brain_name = brain_name
    self.trainer_settings = trainer_settings
    self._threaded = trainer_settings.threaded
    self._stats_reporter = StatsReporter(brain_name)
    self.is_training = training
    self.load = load
    self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
    self.policy_queues: List[AgentManagerQueue[Policy]] = []
    self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
    self.step: int = 0
    self.artifact_path = artifact_path
    self.summary_freq = self.trainer_settings.summary_freq
    self.policies: Dict[str, Policy] = {}
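# Context for reward_buff_cap above: _reward_buffer is a bounded deque, so once
# reward_buff_cap episode returns have been recorded, each new return silently
# evicts the oldest one. A self-contained sketch of that behavior (stdlib only):
from collections import deque

reward_buffer = deque(maxlen=3)  # as if reward_buff_cap=3
for episode_return in [1.0, 2.0, 3.0, 4.0]:
    reward_buffer.append(episode_return)
assert list(reward_buffer) == [2.0, 3.0, 4.0]  # 1.0 was evicted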
def __init__(
    self,
    brain_name: str,
    trainer_settings: TrainerSettings,
    training: bool,
    run_id: str,
    reward_buff_cap: int = 1,
):
    """
    Responsible for collecting experiences and training a neural network model.
    :param brain_name: Brain name of brain to be trained.
    :param trainer_settings: The parameters for the trainer.
    :param training: Whether the trainer is set for training.
    :param run_id: The identifier of the current run.
    :param reward_buff_cap: Maximum number of episode rewards kept in the reward buffer.
    """
    self.brain_name = brain_name
    self.run_id = run_id
    self.trainer_settings = trainer_settings
    self._threaded = trainer_settings.threaded
    self._stats_reporter = StatsReporter(brain_name)
    self.is_training = training
    self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
    self.policy_queues: List[AgentManagerQueue[Policy]] = []
    self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
    self.step: int = 0
    self.summary_freq = self.trainer_settings.summary_freq
def test_agent_deletion():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": [0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1],
        "log_probs": [0.1],
    }
    mock_step = mb.create_mock_batchedstep(
        num_agents=1,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=0,
    )
    mock_done_step = mb.create_mock_batchedstep(
        num_agents=1,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=0,
        done=True,
    )
    fake_action_info = ActionInfo(
        action=[0.1],
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_step.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(mock_step, 0, ActionInfo.empty())

    # Run 3 trajectories, with different workers (to simulate different agents)
    add_calls = []
    remove_calls = []
    for _ep in range(3):
        for _ in range(5):
            processor.add_experiences(mock_step, _ep, fake_action_info)
            add_calls.append(mock.call([get_global_agent_id(_ep, 0)], [0.1]))
        processor.add_experiences(mock_done_step, _ep, fake_action_info)
        # Make sure we don't add experiences from the prior agents after the done
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))

    policy.save_previous_action.assert_has_calls(add_calls)
    policy.remove_previous_action.assert_has_calls(remove_calls)

    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
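# Note on the ids asserted above: get_global_agent_id folds the worker index and
# the per-worker agent id into a single key (in ml-agents it is roughly
# f"${worker_id}-{agent_id}", though the exact format is an implementation
# detail), which is why reusing local agent id 0 across three worker ids
# exercises three independent entries in the processor's bookkeeping dicts.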
def test_agent_manager_stats_report(aggregation_type):
    stats_reporter = StatsReporter("recorder_name")
    manager = AgentManager(None, "behaviorName", stats_reporter)
    values = range(5)
    env_stats = {"stat": [(i, aggregation_type) for i in values]}
    manager.record_environment_stats(env_stats, 0)

    summary = stats_reporter.get_stats_summaries("stat")
    aggregation_result = {
        StatsAggregationMethod.AVERAGE: sum(values) / len(values),
        StatsAggregationMethod.MOST_RECENT: values[-1],
        StatsAggregationMethod.SUM: sum(values),
        StatsAggregationMethod.HISTOGRAM: sum(values) / len(values),
    }
    assert summary.aggregated_value == aggregation_result[aggregation_type]
    stats_reporter.write_stats(0)
def test_group_statuses():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=4,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        grouped=True,
    )
    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(2):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Make terminal steps for some dead agents
    mock_decision_steps_2, mock_terminal_steps_2 = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=True,
        grouped=True,
    )
    processor.add_experiences(
        mock_decision_steps_2, mock_terminal_steps_2, 0, fake_action_info
    )

    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    for _ in range(3):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that four trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 4
    # The last trajectory should be the longest
    trajectory = tqueue.put.call_args_list[0][0][-1]

    # Make sure the trajectory has the right groupmate experiences
    for step in trajectory.steps[0:3]:
        assert len(step.group_status) == 3
    # After 2 agents have died
    for step in trajectory.steps[3:]:
        assert len(step.group_status) == 1
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": [0.1, 0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1, 0.1],
        "log_probs": [0.1, 0.1],
    }
    mock_step = mb.create_mock_batchedstep(
        num_agents=2,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=num_vis_obs,
    )
    fake_action_info = ActionInfo(
        action=[0.1, 0.1],
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_step.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(mock_step, 0, ActionInfo.empty())
    for _ in range(5):
        processor.add_experiences(mock_step, 0, fake_action_info)

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty BatchedStepResult
    mock_step = mb.create_mock_batchedstep(
        num_agents=0,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=num_vis_obs,
    )
    processor.add_experiences(mock_step, 0, ActionInfo([], [], {}, []))
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes(
            [(8,)] + num_vis_obs * [(84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = _create_action_info(2, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5
    # Make sure ungrouped agents don't have team obs
    for step in trajectory.steps:
        assert len(step.group_status) == 0

    # Assert that the AgentProcessor is empty
    assert len(processor._experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_specs=create_observation_specs_with_shapes(
            [(8,)] + num_vis_obs * [(84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    # Assert that the AgentProcessor is still empty
    assert len(processor._experience_buffers[0]) == 0
def test_end_episode():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
    }
    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1]])),
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )
    # Run 3 trajectories, with different workers (to simulate different agents)
    remove_calls = []
    for _ep in range(3):
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )
            # Make sure we don't add experiences from the prior agents after the done

    # Call end_episode
    processor.end_episode()
    # Check that we removed every agent
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
def test_agent_manager():
    policy = create_mock_policy()
    name_behavior_id = "test_brain_name"
    manager = AgentManager(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    assert len(manager.trajectory_queues) == 1
    assert isinstance(manager.trajectory_queues[0], AgentManagerQueue)
def test_agent_manager_stats():
    policy = mock.Mock()
    stats_reporter = StatsReporter("FakeCategory")
    writer = mock.Mock()
    stats_reporter.add_writer(writer)
    manager = AgentManager(policy, "MyBehavior", stats_reporter)

    all_env_stats = [
        {
            "averaged": [(1.0, StatsAggregationMethod.AVERAGE)],
            "most_recent": [(2.0, StatsAggregationMethod.MOST_RECENT)],
        },
        {
            "averaged": [(3.0, StatsAggregationMethod.AVERAGE)],
            "most_recent": [(4.0, StatsAggregationMethod.MOST_RECENT)],
        },
    ]
    for env_stats in all_env_stats:
        manager.record_environment_stats(env_stats, worker_id=0)

    expected_stats = {
        "averaged": StatsSummary(mean=2.0, std=mock.ANY, num=2),
        "most_recent": StatsSummary(mean=4.0, std=0.0, num=1),
    }
    stats_reporter.write_stats(123)
    writer.write_stats.assert_any_call("FakeCategory", expected_stats, 123)

    # Clean up our Mock from the global list
    StatsReporter.writers.remove(writer)
def test_stat_reporter_property():
    # Test add_writer
    mock_writer = mock.Mock()
    StatsReporter.writers.clear()
    StatsReporter.add_writer(mock_writer)
    assert len(StatsReporter.writers) == 1

    statsreporter1 = StatsReporter("category1")
    # Test add_property
    statsreporter1.add_property("key", "this is a text")
    mock_writer.add_property.assert_called_once_with(
        "category1", "key", "this is a text"
    )
def test_stat_reporter_text():
    # Test add_writer
    mock_writer = mock.Mock()
    StatsReporter.writers.clear()
    StatsReporter.add_writer(mock_writer)
    assert len(StatsReporter.writers) == 1

    statsreporter1 = StatsReporter("category1")
    # Test write_text
    step = 10
    statsreporter1.write_text("this is a text", step)
    mock_writer.write_text.assert_called_once_with("category1", "this is a text", step)
def test_stat_reporter_add_summary_write():
    # Test add_writer
    StatsReporter.writers.clear()
    mock_writer1 = mock.Mock()
    mock_writer2 = mock.Mock()
    StatsReporter.add_writer(mock_writer1)
    StatsReporter.add_writer(mock_writer2)
    assert len(StatsReporter.writers) == 2

    # Test add_stat and summaries
    statsreporter1 = StatsReporter("category1")
    statsreporter2 = StatsReporter("category2")
    for i in range(10):
        statsreporter1.add_stat("key1", float(i))
        statsreporter2.add_stat("key2", float(i))

    statssummary1 = statsreporter1.get_stats_summaries("key1")
    statssummary2 = statsreporter2.get_stats_summaries("key2")
    assert statssummary1.num == 10
    assert statssummary2.num == 10
    assert statssummary1.mean == 4.5
    assert statssummary2.mean == 4.5
    assert statssummary1.std == pytest.approx(2.9, abs=0.1)
    assert statssummary2.std == pytest.approx(2.9, abs=0.1)

    # Test write_stats
    step = 10
    statsreporter1.write_stats(step)
    mock_writer1.write_stats.assert_called_once_with(
        "category1", {"key1": statssummary1}, step
    )
    mock_writer2.write_stats.assert_called_once_with(
        "category1", {"key1": statssummary1}, step
    )
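# The expected values in the test above are just the population statistics of
# 0..9, which can be double-checked with the stdlib (statistics.pstdev is the
# population standard deviation, matching np.std's default ddof=0):
import statistics

values = [float(i) for i in range(10)]
assert statistics.mean(values) == 4.5
assert abs(statistics.pstdev(values) - 2.872) < 0.01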
def run_training(run_seed: int, options: RunOptions) -> None:
    """
    Launches training session.
    :param options: parsed command line arguments
    :param run_seed: Random seed used for training.
    """
    model_path = f"./models/{options.run_id}"
    summaries_dir = "./summaries"
    port = options.base_port

    # Configure CSV, Tensorboard Writers and StatsReporter
    # We assume reward and episode length are needed in the CSV.
    csv_writer = CSVWriter(
        summaries_dir,
        required_fields=[
            "Environment/Cumulative Reward",
            "Environment/Episode Length",
        ],
    )
    tb_writer = TensorboardWriter(summaries_dir)
    StatsReporter.add_writer(tb_writer)
    StatsReporter.add_writer(csv_writer)

    if options.env_path is None:
        port = 5004  # Default port for in-Editor training
    env_factory = create_environment_factory(
        options.env_path,
        options.no_graphics,
        run_seed,
        port,
        options.env_args,
        options.env_id,
        options.n_steps,
    )
    env_manager = SubprocessEnvManager(env_factory=env_factory, n_env=options.num_envs)
    maybe_meta_curriculum = try_create_meta_curriculum(
        options.curriculum_config, env_manager, options.lesson
    )
    sampler_manager, resampling_interval = create_sampler_manager(
        options.sampler_config, run_seed
    )
    trainer_factory = TrainerFactory(
        options.trainer_config,
        summaries_dir,
        options.run_id,
        model_path,
        options.keep_checkpoints,
        options.train_model,
        options.load_model,
        run_seed,
        maybe_meta_curriculum,
        options.multi_gpu,
    )
    # Create controller and begin training.
    tc = TrainerController(
        trainer_factory=trainer_factory,
        model_path=model_path,
        summaries_dir=summaries_dir,
        run_id=options.run_id,
        save_freq=options.save_freq,
        meta_curriculum=maybe_meta_curriculum,
        train=options.train_model,
        training_seed=run_seed,
        sampler_manager=sampler_manager,
        resampling_interval=resampling_interval,
        n_steps=options.n_steps,
    )
    # Begin training
    try:
        tc.start_learning(env_manager)
    finally:
        env_manager.close()
def run_training(run_seed: int, options: RunOptions) -> None:
    """
    Launches training session.
    :param options: parsed command line arguments
    :param run_seed: Random seed used for training.
    """
    with hierarchical_timer("run_training.setup"):
        model_path = f"./models/{options.run_id}"
        maybe_init_path = (
            f"./models/{options.initialize_from}" if options.initialize_from else None
        )
        summaries_dir = "./summaries"
        port = options.base_port

        # Configure CSV, Tensorboard Writers and StatsReporter
        # We assume reward and episode length are needed in the CSV.
        csv_writer = CSVWriter(
            summaries_dir,
            required_fields=[
                "Environment/Cumulative Reward",
                "Environment/Episode Length",
            ],
        )
        handle_existing_directories(
            model_path, summaries_dir, options.resume, options.force, maybe_init_path
        )
        tb_writer = TensorboardWriter(summaries_dir, clear_past_data=not options.resume)
        gauge_write = GaugeWriter()
        console_writer = ConsoleWriter()
        StatsReporter.add_writer(tb_writer)
        StatsReporter.add_writer(csv_writer)
        StatsReporter.add_writer(gauge_write)
        StatsReporter.add_writer(console_writer)

        if options.env_path is None:
            port = UnityEnvironment.DEFAULT_EDITOR_PORT
        env_factory = create_environment_factory(
            options.env_path, options.no_graphics, run_seed, port, options.env_args
        )
        engine_config = EngineConfig(
            width=options.width,
            height=options.height,
            quality_level=options.quality_level,
            time_scale=options.time_scale,
            target_frame_rate=options.target_frame_rate,
            capture_frame_rate=options.capture_frame_rate,
        )
        env_manager = SubprocessEnvManager(env_factory, engine_config, options.num_envs)
        maybe_meta_curriculum = try_create_meta_curriculum(
            options.curriculum_config, env_manager, options.lesson
        )
        sampler_manager, resampling_interval = create_sampler_manager(
            options.sampler_config, run_seed
        )
        trainer_factory = TrainerFactory(
            options.trainer_config,
            summaries_dir,
            options.run_id,
            model_path,
            options.keep_checkpoints,
            not options.inference,
            options.resume,
            run_seed,
            maybe_init_path,
            maybe_meta_curriculum,
            options.multi_gpu,
        )
        # Create controller and begin training.
        tc = TrainerController(
            trainer_factory,
            model_path,
            summaries_dir,
            options.run_id,
            options.save_freq,
            maybe_meta_curriculum,
            not options.inference,
            run_seed,
            sampler_manager,
            resampling_interval,
        )

    # Begin training
    try:
        tc.start_learning(env_manager)
    finally:
        env_manager.close()
        write_timing_tree(summaries_dir, options.run_id)
def run_training(run_seed: int, options: RunOptions) -> None:
    """
    Launches training session.
    :param options: parsed command line arguments
    :param run_seed: Random seed used for training.
    """
    # Recognize and use docker volume if one is passed as an argument
    if not options.docker_target_name:
        model_path = f"./models/{options.run_id}"
        summaries_dir = "./summaries"
    else:
        model_path = f"/{options.docker_target_name}/models/{options.run_id}"
        summaries_dir = f"/{options.docker_target_name}/summaries"
    port = options.base_port

    # Configure CSV, Tensorboard Writers and StatsReporter
    # We assume reward and episode length are needed in the CSV.
    csv_writer = CSVWriter(
        summaries_dir,
        required_fields=[
            "Environment/Cumulative Reward",
            "Environment/Episode Length",
        ],
    )
    tb_writer = TensorboardWriter(summaries_dir)
    StatsReporter.add_writer(tb_writer)
    StatsReporter.add_writer(csv_writer)

    if options.env_path is None:
        port = UnityEnvironment.DEFAULT_EDITOR_PORT
    env_factory = create_environment_factory(
        options.env_path,
        options.docker_target_name,
        options.no_graphics,
        run_seed,
        port,
        options.env_args,
    )
    engine_config = EngineConfig(
        options.width,
        options.height,
        options.quality_level,
        options.time_scale,
        options.target_frame_rate,
    )
    env_manager = SubprocessEnvManager(env_factory, engine_config, options.num_envs)
    maybe_meta_curriculum = try_create_meta_curriculum(
        options.curriculum_config, env_manager, options.lesson
    )
    sampler_manager, resampling_interval = create_sampler_manager(
        options.sampler_config, run_seed
    )
    trainer_factory = TrainerFactory(
        options.trainer_config,
        summaries_dir,
        options.run_id,
        model_path,
        options.keep_checkpoints,
        options.train_model,
        options.load_model,
        run_seed,
        maybe_meta_curriculum,
        options.multi_gpu,
    )
    # Create controller and begin training.
    tc = TrainerController(
        trainer_factory,
        model_path,
        summaries_dir,
        options.run_id,
        options.save_freq,
        maybe_meta_curriculum,
        options.train_model,
        run_seed,
        sampler_manager,
        resampling_interval,
    )
    # Begin training
    try:
        tc.start_learning(env_manager)
    finally:
        env_manager.close()
class Trainer(abc.ABC):
    """This class is the base class for the mlagents_envs.trainers"""

    def __init__(
        self,
        brain_name: str,
        trainer_parameters: dict,
        training: bool,
        run_id: str,
        reward_buff_cap: int = 1,
    ):
        """
        Responsible for collecting experiences and training a neural network model.
        :param brain_name: Name of the brain to be trained.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        :param run_id: The identifier of the current run.
        :param reward_buff_cap: Maximum number of episode rewards kept in the reward buffer.
        """
        self.param_keys: List[str] = []
        self.brain_name = brain_name
        self.run_id = run_id
        self.trainer_parameters = trainer_parameters
        self.summary_path = trainer_parameters["summary_path"]
        self.stats_reporter = StatsReporter(self.summary_path)
        self.cumulative_returns_since_policy_update: List[float] = []
        self.is_training = training
        self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
        self.policy_queues: List[AgentManagerQueue[Policy]] = []
        self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
        self.step: int = 0
        self.training_start_time = time.time()
        self.summary_freq = self.trainer_parameters["summary_freq"]
        self.next_summary_step = self.summary_freq

    def _check_param_keys(self):
        for k in self.param_keys:
            if k not in self.trainer_parameters:
                raise UnityTrainerException(
                    "The hyper-parameter {0} could not be found for the {1} trainer of "
                    "brain {2}.".format(k, self.__class__, self.brain_name)
                )

    def write_tensorboard_text(self, key: str, input_dict: Dict[str, Any]) -> None:
        """
        Saves text to Tensorboard.
        Note: Only works on tensorflow r1.2 or above.
        :param key: The name of the text.
        :param input_dict: A dictionary that will be displayed in a table on Tensorboard.
        """
        try:
            with tf.Session(config=tf_utils.generate_session_config()) as sess:
                s_op = tf.summary.text(
                    key,
                    tf.convert_to_tensor(
                        [[str(x), str(input_dict[x])] for x in input_dict]
                    ),
                )
                s = sess.run(s_op)
                self.stats_reporter.write_text(s, self.get_step)
        except Exception:
            LOGGER.info("Could not write text summary for Tensorboard.")

    def _dict_to_str(self, param_dict: Dict[str, Any], num_tabs: int) -> str:
        """
        Takes a parameter dictionary and converts it to a human-readable string.
        Recurses if there are multiple levels of dict. Used to print out hyperparameters.
        :param param_dict: A dictionary of key-value parameters.
        :return: A string version of this dictionary.
        """
        if not isinstance(param_dict, dict):
            return str(param_dict)
        else:
            append_newline = "\n" if num_tabs > 0 else ""
            return append_newline + "\n".join(
                [
                    "\t"
                    + " " * num_tabs
                    + "{0}:\t{1}".format(
                        x, self._dict_to_str(param_dict[x], num_tabs + 1)
                    )
                    for x in param_dict
                ]
            )

    def __str__(self) -> str:
        return """Hyperparameters for the {0} of brain {1}: \n{2}""".format(
            self.__class__.__name__,
            self.brain_name,
            self._dict_to_str(self.trainer_parameters, 0),
        )

    @property
    def parameters(self) -> Dict[str, Any]:
        """
        Returns the trainer parameters of the trainer.
        """
        return self.trainer_parameters

    @property
    def get_max_steps(self) -> int:
        """
        Returns the maximum number of steps. Is used to know when the trainer should be stopped.
        :return: The maximum number of steps of the trainer
        """
        return int(float(self.trainer_parameters["max_steps"]))

    @property
    def get_step(self) -> int:
        """
        Returns the number of steps the trainer has performed
        :return: the step count of the trainer
        """
        return self.step

    @property
    def should_still_train(self) -> bool:
        """
        Returns whether or not the trainer should train. A Trainer could
        stop training if it wasn't training to begin with, or if max_steps
        is reached.
        """
        return self.is_training and self.get_step <= self.get_max_steps

    @property
    def reward_buffer(self) -> Deque[float]:
        """
        Returns the reward buffer. The reward buffer contains the cumulative
        rewards of the most recent episodes completed by agents using this
        trainer.
        :return: the reward buffer.
        """
        return self._reward_buffer

    def _increment_step(self, n_steps: int, name_behavior_id: str) -> None:
        """
        Increment the step count of the trainer
        :param n_steps: number of steps to increment the step count by
        """
        self.step += n_steps
        self.next_summary_step = self._get_next_summary_step()
        p = self.get_policy(name_behavior_id)
        if p:
            p.increment_step(n_steps)

    def _get_next_summary_step(self) -> int:
        """
        Get the next step count that should result in a summary write.
        """
        return self.step + (self.summary_freq - self.step % self.summary_freq)

    def save_model(self, name_behavior_id: str) -> None:
        """
        Saves the model
        """
        self.get_policy(name_behavior_id).save_model(self.get_step)

    def export_model(self, name_behavior_id: str) -> None:
        """
        Exports the model
        """
        policy = self.get_policy(name_behavior_id)
        settings = SerializationSettings(policy.model_path, policy.brain.brain_name)
        export_policy_model(settings, policy.graph, policy.sess)

    def _write_summary(self, step: int) -> None:
        """
        Saves training statistics to Tensorboard.
        """
        is_training = "Training." if self.should_still_train else "Not Training."
        stats_summary = self.stats_reporter.get_stats_summaries(
            "Environment/Cumulative Reward"
        )
        if stats_summary.num > 0:
            LOGGER.info(
                " {}: {}: Step: {}. "
                "Time Elapsed: {:0.3f} s "
                "Mean "
                "Reward: {:0.3f}"
                ". Std of Reward: {:0.3f}. {}".format(
                    self.run_id,
                    self.brain_name,
                    step,
                    time.time() - self.training_start_time,
                    stats_summary.mean,
                    stats_summary.std,
                    is_training,
                )
            )
            set_gauge(f"{self.brain_name}.mean_reward", stats_summary.mean)
        else:
            LOGGER.info(
                " {}: {}: Step: {}. No episode was completed since last summary. {}".format(
                    self.run_id, self.brain_name, step, is_training
                )
            )
        self.stats_reporter.write_stats(int(step))

    @abc.abstractmethod
    def _process_trajectory(self, trajectory: Trajectory) -> None:
        """
        Takes a trajectory and processes it, putting it into the update buffer.
        :param trajectory: The Trajectory tuple containing the steps to be processed.
        """
        self._maybe_write_summary(self.get_step + len(trajectory.steps))
        self._increment_step(len(trajectory.steps), trajectory.behavior_id)

    def _maybe_write_summary(self, step_after_process: int) -> None:
        """
        If processing the trajectory will make the step exceed the next summary write,
        write the summary. This logic ensures summaries are written on the update step
        and not in between.
        :param step_after_process: the step count after processing the next trajectory.
        """
        if step_after_process >= self.next_summary_step and self.get_step != 0:
            self._write_summary(self.next_summary_step)

    @abc.abstractmethod
    def end_episode(self):
        """
        A signal that the Episode has ended. The buffer must be reset.
        Only called when the academy resets.
        """
        pass

    @abc.abstractmethod
    def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
        """
        Creates policy
        """
        pass

    @abc.abstractmethod
    def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
        """
        Adds policy to trainer
        """
        pass

    @abc.abstractmethod
    def get_policy(self, name_behavior_id: str) -> TFPolicy:
        """
        Gets policy from trainer
        """
        pass

    @abc.abstractmethod
    def _is_ready_update(self):
        """
        Returns whether or not the trainer has enough elements to run update model
        :return: A boolean corresponding to whether or not update_model() can be run
        """
        return False

    @abc.abstractmethod
    def _update_policy(self):
        """
        Uses demonstration_buffer to update model.
        """
        pass

    def advance(self) -> None:
        """
        Steps the trainer, taking in trajectories and updates if ready.
        """
        with hierarchical_timer("process_trajectory"):
            for traj_queue in self.trajectory_queues:
                # We grab at most the maximum length of the queue.
                # This ensures that even if the queue is being filled faster than it is
                # being emptied, the trajectories in the queue are on-policy.
                for _ in range(traj_queue.maxlen):
                    try:
                        t = traj_queue.get_nowait()
                        self._process_trajectory(t)
                    except AgentManagerQueue.Empty:
                        break
        if self.should_still_train:
            if self._is_ready_update():
                with hierarchical_timer("_update_policy"):
                    self._update_policy()
                    for q in self.policy_queues:
                        # Get policies that correspond to the policy queue in question
                        q.put(self.get_policy(q.behavior_id))

    def publish_policy_queue(self, policy_queue: AgentManagerQueue[Policy]) -> None:
        """
        Adds a policy queue to the list of queues to publish to when this Trainer
        makes a policy update.
        :param policy_queue: Policy queue to publish to.
        """
        self.policy_queues.append(policy_queue)

    def subscribe_trajectory_queue(
        self, trajectory_queue: AgentManagerQueue[Trajectory]
    ) -> None:
        """
        Adds a trajectory queue to the list of queues for the trainer to ingest
        Trajectories from.
        :param trajectory_queue: Trajectory queue to read from.
        """
        self.trajectory_queues.append(trajectory_queue)
def run_training(run_seed: int, options: RunOptions) -> None:
    """
    Launches training session.
    :param options: parsed command line arguments
    :param run_seed: Random seed used for training.
    """
    with hierarchical_timer("run_training.setup"):
        checkpoint_settings = options.checkpoint_settings
        env_settings = options.env_settings
        engine_settings = options.engine_settings

        base_path = "results"
        write_path = os.path.join(base_path, checkpoint_settings.run_id)
        maybe_init_path = (
            os.path.join(base_path, checkpoint_settings.initialize_from)
            if checkpoint_settings.initialize_from is not None
            else None
        )
        run_logs_dir = os.path.join(write_path, "run_logs")
        port: Optional[int] = env_settings.base_port

        # Check if directory exists
        validate_existing_directories(
            write_path,
            checkpoint_settings.resume,
            checkpoint_settings.force,
            maybe_init_path,
        )
        # Make run logs directory
        os.makedirs(run_logs_dir, exist_ok=True)
        # Load any needed states
        if checkpoint_settings.resume:
            GlobalTrainingStatus.load_state(
                os.path.join(run_logs_dir, "training_status.json")
            )

        # Configure Tensorboard Writers and StatsReporter
        tb_writer = TensorboardWriter(
            write_path, clear_past_data=not checkpoint_settings.resume
        )
        gauge_write = GaugeWriter()
        console_writer = ConsoleWriter()
        StatsReporter.add_writer(tb_writer)
        StatsReporter.add_writer(gauge_write)
        StatsReporter.add_writer(console_writer)

        if env_settings.env_path is None:
            port = None
        env_factory = create_environment_factory(
            env_settings.env_path,
            engine_settings.no_graphics,
            run_seed,
            port,
            env_settings.env_args,
            os.path.abspath(run_logs_dir),  # Unity environment requires absolute path
        )
        engine_config = EngineConfig(
            width=engine_settings.width,
            height=engine_settings.height,
            quality_level=engine_settings.quality_level,
            time_scale=engine_settings.time_scale,
            target_frame_rate=engine_settings.target_frame_rate,
            capture_frame_rate=engine_settings.capture_frame_rate,
        )
        env_manager = SubprocessEnvManager(
            env_factory, engine_config, env_settings.num_envs
        )
        env_parameter_manager = EnvironmentParameterManager(
            options.environment_parameters, run_seed, restore=checkpoint_settings.resume
        )

        trainer_factory = TrainerFactory(
            trainer_config=options.behaviors,
            output_path=write_path,
            train_model=not checkpoint_settings.inference,
            load_model=checkpoint_settings.resume,
            seed=run_seed,
            param_manager=env_parameter_manager,
            init_path=maybe_init_path,
            multi_gpu=False,
        )
        # Create controller and begin training.
        tc = TrainerController(
            trainer_factory,
            write_path,
            checkpoint_settings.run_id,
            env_parameter_manager,
            not checkpoint_settings.inference,
            run_seed,
        )

    # Begin training
    try:
        tc.start_learning(env_manager)
    finally:
        env_manager.close()
        write_run_options(write_path, options)
        write_timing_tree(run_logs_dir)
        write_training_status(run_logs_dir)
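# For context: this function is normally reached through the CLI entry point,
# roughly as follows (helper names may differ slightly between ml-agents
# releases, so treat this as a sketch rather than the exact call chain):
#
#     options = parse_command_line()         # builds a RunOptions from argv
#     run_seed = options.env_settings.seed   # -1 means "pick a random seed"
#     run_training(run_seed, options)
#
# so by the time execution reaches here, the settings objects have already been
# validated by the settings layer.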
def run_training(run_seed: int, options: RunOptions) -> None:
    """
    Launches training session.
    :param options: parsed command line arguments
    :param run_seed: Random seed used for training.
    """
    options.checkpoint_settings.run_id = "test8"
    with hierarchical_timer("run_training.setup"):
        checkpoint_settings = options.checkpoint_settings
        env_settings = options.env_settings
        engine_settings = options.engine_settings

        base_path = "results"
        write_path = os.path.join(base_path, checkpoint_settings.run_id)
        maybe_init_path = (
            os.path.join(base_path, checkpoint_settings.initialize_from)
            if checkpoint_settings.initialize_from
            else None
        )
        run_logs_dir = os.path.join(write_path, "run_logs")
        port: Optional[int] = env_settings.base_port

        # Check if directory exists
        handle_existing_directories(
            write_path,
            checkpoint_settings.resume,
            checkpoint_settings.force,
            maybe_init_path,
        )
        # Make run logs directory
        os.makedirs(run_logs_dir, exist_ok=True)
        # Load any needed states
        if checkpoint_settings.resume:
            GlobalTrainingStatus.load_state(
                os.path.join(run_logs_dir, "training_status.json")
            )

        # Configure CSV, Tensorboard Writers and StatsReporter
        # We assume reward and episode length are needed in the CSV.
        csv_writer = CSVWriter(
            write_path,
            required_fields=[
                "Environment/Cumulative Reward",
                "Environment/Episode Length",
            ],
        )
        tb_writer = TensorboardWriter(
            write_path, clear_past_data=not checkpoint_settings.resume
        )
        gauge_write = GaugeWriter()
        console_writer = ConsoleWriter()
        StatsReporter.add_writer(tb_writer)
        StatsReporter.add_writer(csv_writer)
        StatsReporter.add_writer(gauge_write)
        StatsReporter.add_writer(console_writer)

        engine_config = EngineConfig(
            width=engine_settings.width,
            height=engine_settings.height,
            quality_level=engine_settings.quality_level,
            time_scale=engine_settings.time_scale,
            target_frame_rate=engine_settings.target_frame_rate,
            capture_frame_rate=engine_settings.capture_frame_rate,
        )
        if env_settings.env_path is None:
            port = None
        # Start the meta-learning run on the Find Target build
        env_settings.env_path = "C:/Users/Sebastian/Desktop/RLUnity/Training/mFindTarget_new/RLProject.exe"
        env_factory = create_environment_factory(
            env_settings.env_path,
            engine_settings.no_graphics,
            run_seed,
            port,
            env_settings.env_args,
            os.path.abspath(run_logs_dir),  # Unity environment requires absolute path
        )
        env_manager = SubprocessEnvManager(
            env_factory, engine_config, env_settings.num_envs
        )
        maybe_meta_curriculum = try_create_meta_curriculum(
            options.curriculum, env_manager, restore=checkpoint_settings.resume
        )
        sampler_manager, resampling_interval = create_sampler_manager(
            options.parameter_randomization, run_seed
        )

        # Run a short warm-up phase first, only to extract the initial weights
        max_steps = options.behaviors["Brain"].max_steps
        options.behaviors["Brain"].max_steps = 10
        trainer_factory = TrainerFactory(
            options,
            write_path,
            not checkpoint_settings.inference,
            checkpoint_settings.resume,
            run_seed,
            maybe_init_path,
            maybe_meta_curriculum,
            False,
            total_steps=0,
        )
        trainer_factory.trainer_config[
            "Brain"
        ].hyperparameters.learning_rate_schedule = ScheduleType.CONSTANT

        # Create controller and begin training.
        tc = TrainerController(
            trainer_factory,
            write_path,
            checkpoint_settings.run_id,
            maybe_meta_curriculum,
            not checkpoint_settings.inference,
            run_seed,
            sampler_manager,
            resampling_interval,
        )
    try:
        # Get initial weights
        tc.init_weights(env_manager)
        initial_weights = deepcopy(tc.weights)
    finally:
        env_manager.close()
        write_run_options(write_path, options)
        write_timing_tree(run_logs_dir)
        write_training_status(run_logs_dir)

    options.behaviors["Brain"].max_steps = max_steps
    step = 0
    counter = 0
    max_meta_updates = 200
    while counter < max_meta_updates:
        sample = np.random.random_sample()
        if sample > 1:
            print("Performing Meta-learning on Carry Object stage")
            env_settings.env_path = "C:/Users/Sebastian/Desktop/RLUnity/Training/mCarryObject_new/RLProject.exe"
        else:
            print("Performing Meta-learning on Find Target stage")
            env_settings.env_path = "C:/Users/Sebastian/Desktop/RLUnity/Training/mFindTarget_new/RLProject.exe"
        env_factory = create_environment_factory(
            env_settings.env_path,
            engine_settings.no_graphics,
            run_seed,
            port,
            env_settings.env_args,
            os.path.abspath(run_logs_dir),  # Unity environment requires absolute path
        )
        env_manager = SubprocessEnvManager(
            env_factory, engine_config, env_settings.num_envs
        )
        maybe_meta_curriculum = try_create_meta_curriculum(
            options.curriculum, env_manager, restore=checkpoint_settings.resume
        )
        sampler_manager, resampling_interval = create_sampler_manager(
            options.parameter_randomization, run_seed
        )
        trainer_factory = TrainerFactory(
            options,
            write_path,
            not checkpoint_settings.inference,
            checkpoint_settings.resume,
            run_seed,
            maybe_init_path,
            maybe_meta_curriculum,
            False,
            total_steps=step,
        )
        # Anneal learning rate, beta and epsilon linearly over the meta updates
        trainer_factory.trainer_config[
            "Brain"
        ].hyperparameters.learning_rate_schedule = ScheduleType.CONSTANT
        trainer_factory.trainer_config["Brain"].hyperparameters.learning_rate = (
            0.0005 * (1 - counter / max_meta_updates)
        )
        trainer_factory.trainer_config["Brain"].hyperparameters.beta = 0.005 * (
            1 - counter / max_meta_updates
        )
        trainer_factory.trainer_config["Brain"].hyperparameters.epsilon = 0.2 * (
            1 - counter / max_meta_updates
        )
        print(
            "Current lr: {}\nCurrent beta: {}\nCurrent epsilon: {}".format(
                trainer_factory.trainer_config["Brain"].hyperparameters.learning_rate,
                trainer_factory.trainer_config["Brain"].hyperparameters.beta,
                trainer_factory.trainer_config["Brain"].hyperparameters.epsilon,
            )
        )
        # Create controller and begin training.
        tc = TrainerController(
            trainer_factory,
            write_path,
            checkpoint_settings.run_id,
            maybe_meta_curriculum,
            not checkpoint_settings.inference,
            run_seed,
            sampler_manager,
            resampling_interval,
        )
        try:
            print(
                "Start learning at step: " + str(step) + " meta_step: " + str(counter)
            )
            print("Initial weights: " + str(initial_weights[8]))
            weights_after_train = tc.start_learning(env_manager, initial_weights)
            print(tc.trainers["Brain"].optimizer)
            # weights_after_train = tc.weights
            # print("Trained weights: " + str(weights_after_train[8]))
            step += options.behaviors["Brain"].max_steps
            print("meta step: " + str(step))
            # print(weights_after_train)
            # equal = []
            # for i, weight in enumerate(tc.weights):
            #     equal.append(np.array_equal(initial_weights[i], weights_after_train[i]))
            # print(all(equal))
        finally:
            print(len(weights_after_train), len(initial_weights))
            for i, weight in enumerate(weights_after_train):
                initial_weights[i] = weights_after_train[i]
            env_manager.close()
            write_run_options(write_path, options)
            write_timing_tree(run_logs_dir)
            write_training_status(run_logs_dir)
        counter += 1
class Trainer(object):
    """This class is the base class for the mlagents_envs.trainers"""

    def __init__(
        self,
        brain: BrainParameters,
        trainer_parameters: dict,
        training: bool,
        run_id: str,
        reward_buff_cap: int = 1,
    ):
        """
        Responsible for collecting experiences and training a neural network model.
        :param brain: Brain to be trained.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        :param run_id: The identifier of the current run.
        :param reward_buff_cap: Maximum number of episode rewards kept in the reward buffer.
        """
        self.param_keys: List[str] = []
        self.brain_name = brain.brain_name
        self.run_id = run_id
        self.trainer_parameters = trainer_parameters
        self.summary_path = trainer_parameters["summary_path"]
        self.stats_reporter = StatsReporter(self.summary_path)
        self.cumulative_returns_since_policy_update: List[float] = []
        self.is_training = training
        self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
        self.policy: TFPolicy = None  # type: ignore  # this will always get set
        self.step: int = 0

    def check_param_keys(self):
        for k in self.param_keys:
            if k not in self.trainer_parameters:
                raise UnityTrainerException(
                    "The hyper-parameter {0} could not be found for the {1} trainer of "
                    "brain {2}.".format(k, self.__class__, self.brain_name)
                )

    def write_tensorboard_text(self, key: str, input_dict: Dict[str, Any]) -> None:
        """
        Saves text to Tensorboard.
        Note: Only works on tensorflow r1.2 or above.
        :param key: The name of the text.
        :param input_dict: A dictionary that will be displayed in a table on Tensorboard.
        """
        try:
            with tf.Session() as sess:
                s_op = tf.summary.text(
                    key,
                    tf.convert_to_tensor(
                        [[str(x), str(input_dict[x])] for x in input_dict]
                    ),
                )
                s = sess.run(s_op)
                self.stats_reporter.write_text(s, self.get_step)
        except Exception:
            LOGGER.info("Could not write text summary for Tensorboard.")

    def dict_to_str(self, param_dict: Dict[str, Any], num_tabs: int) -> str:
        """
        Takes a parameter dictionary and converts it to a human-readable string.
        Recurses if there are multiple levels of dict. Used to print out hyperparameters.
        :param param_dict: A dictionary of key-value parameters.
        :return: A string version of this dictionary.
        """
        if not isinstance(param_dict, dict):
            return str(param_dict)
        else:
            append_newline = "\n" if num_tabs > 0 else ""
            return append_newline + "\n".join(
                [
                    "\t"
                    + " " * num_tabs
                    + "{0}:\t{1}".format(
                        x, self.dict_to_str(param_dict[x], num_tabs + 1)
                    )
                    for x in param_dict
                ]
            )

    def __str__(self) -> str:
        return """Hyperparameters for the {0} of brain {1}: \n{2}""".format(
            self.__class__.__name__,
            self.brain_name,
            self.dict_to_str(self.trainer_parameters, 0),
        )

    @property
    def parameters(self) -> Dict[str, Any]:
        """
        Returns the trainer parameters of the trainer.
        """
        return self.trainer_parameters

    @property
    def get_max_steps(self) -> float:
        """
        Returns the maximum number of steps. Is used to know when the trainer should be stopped.
        :return: The maximum number of steps of the trainer
        """
        return float(self.trainer_parameters["max_steps"])

    @property
    def get_step(self) -> int:
        """
        Returns the number of steps the trainer has performed
        :return: the step count of the trainer
        """
        return self.step

    @property
    def reward_buffer(self) -> Deque[float]:
        """
        Returns the reward buffer. The reward buffer contains the cumulative
        rewards of the most recent episodes completed by agents using this
        trainer.
        :return: the reward buffer.
        """
        return self._reward_buffer

    def increment_step(self, n_steps: int) -> None:
        """
        Increment the step count of the trainer
        :param n_steps: number of steps to increment the step count by
        """
        self.step = self.policy.increment_step(n_steps)

    def save_model(self) -> None:
        """
        Saves the model
        """
        self.policy.save_model(self.get_step)

    def export_model(self) -> None:
        """
        Exports the model
        """
        self.policy.export_model()

    def write_summary(self, global_step: int, delta_train_start: float) -> None:
        """
        Saves training statistics to Tensorboard.
        :param delta_train_start: Time elapsed since training started.
        :param global_step: The number of steps the simulation has been going for
        """
        if (
            global_step % self.trainer_parameters["summary_freq"] == 0
            and global_step != 0
        ):
            is_training = (
                "Training."
                if self.is_training and self.get_step <= self.get_max_steps
                else "Not Training."
            )
            step = min(self.get_step, self.get_max_steps)
            stats_summary = self.stats_reporter.get_stats_summaries(
                "Environment/Cumulative Reward"
            )
            if stats_summary.num > 0:
                LOGGER.info(
                    " {}: {}: Step: {}. "
                    "Time Elapsed: {:0.3f} s "
                    "Mean "
                    "Reward: {:0.3f}"
                    ". Std of Reward: {:0.3f}. {}".format(
                        self.run_id,
                        self.brain_name,
                        step,
                        delta_train_start,
                        stats_summary.mean,
                        stats_summary.std,
                        is_training,
                    )
                )
                set_gauge(f"{self.brain_name}.mean_reward", stats_summary.mean)
            else:
                LOGGER.info(
                    " {}: {}: Step: {}. No episode was completed since last summary. {}".format(
                        self.run_id, self.brain_name, step, is_training
                    )
                )
            self.stats_reporter.write_stats(int(step))

    def process_trajectory(self, trajectory: Trajectory) -> None:
        """
        Takes a trajectory and processes it, putting it into the update buffer.
        Processing involves calculating value and advantage targets for model updating step.
        :param trajectory: The Trajectory tuple containing the steps to be processed.
        """
        raise UnityTrainerException("The process_trajectory method was not implemented.")

    def end_episode(self):
        """
        A signal that the Episode has ended. The buffer must be reset.
        Only called when the academy resets.
        """
        raise UnityTrainerException("The end_episode method was not implemented.")

    def is_ready_update(self):
        """
        Returns whether or not the trainer has enough elements to run update model
        :return: A boolean corresponding to whether or not update_model() can be run
        """
        raise UnityTrainerException("The is_ready_update method was not implemented.")

    def update_policy(self):
        """
        Uses demonstration_buffer to update model.
        """
        raise UnityTrainerException("The update_policy method was not implemented.")

    def advance(self) -> None:
        pass
def run_training(
    sub_id: int, run_seed: int, options: CommandLineOptions, process_queue: Queue
) -> None:
    """
    Launches training session.
    :param process_queue: Queue used to send signal back to main.
    :param sub_id: Unique id for training session.
    :param options: parsed command line arguments
    :param run_seed: Random seed used for training.
    """
    # Docker Parameters
    trainer_config_path = options.trainer_config_path
    curriculum_folder = options.curriculum_folder

    # Recognize and use docker volume if one is passed as an argument
    if not options.docker_target_name:
        model_path = "./models/{run_id}-{sub_id}".format(
            run_id=options.run_id, sub_id=sub_id
        )
        summaries_dir = "./summaries"
    else:
        trainer_config_path = "/{docker_target_name}/{trainer_config_path}".format(
            docker_target_name=options.docker_target_name,
            trainer_config_path=trainer_config_path,
        )
        if curriculum_folder is not None:
            curriculum_folder = "/{docker_target_name}/{curriculum_folder}".format(
                docker_target_name=options.docker_target_name,
                curriculum_folder=curriculum_folder,
            )
        model_path = "/{docker_target_name}/models/{run_id}-{sub_id}".format(
            docker_target_name=options.docker_target_name,
            run_id=options.run_id,
            sub_id=sub_id,
        )
        summaries_dir = "/{docker_target_name}/summaries".format(
            docker_target_name=options.docker_target_name
        )
    trainer_config = load_config(trainer_config_path)
    port = options.base_port + (sub_id * options.num_envs)

    # Configure CSV, Tensorboard Writers and StatsReporter
    # We assume reward and episode length are needed in the CSV.
    csv_writer = CSVWriter(
        summaries_dir,
        required_fields=[
            "Environment/Cumulative Reward",
            "Environment/Episode Length",
        ],
    )
    tb_writer = TensorboardWriter(summaries_dir)
    StatsReporter.add_writer(tb_writer)
    StatsReporter.add_writer(csv_writer)

    if options.env_path is None:
        port = 5004  # Default port for in-Editor training
    env_factory = create_environment_factory(
        options.env_path,
        options.docker_target_name,
        options.no_graphics,
        run_seed,
        port,
        options.env_args,
    )
    engine_config = EngineConfig(
        options.width,
        options.height,
        options.quality_level,
        options.time_scale,
        options.target_frame_rate,
    )
    env_manager = SubprocessEnvManager(env_factory, engine_config, options.num_envs)
    maybe_meta_curriculum = try_create_meta_curriculum(
        curriculum_folder, env_manager, options.lesson
    )
    sampler_manager, resampling_interval = create_sampler_manager(
        options.sampler_file_path, run_seed
    )
    trainer_factory = TrainerFactory(
        trainer_config,
        summaries_dir,
        options.run_id,
        model_path,
        options.keep_checkpoints,
        options.train_model,
        options.load_model,
        run_seed,
        maybe_meta_curriculum,
        options.multi_gpu,
    )
    # Create controller and begin training.
    tc = TrainerController(
        trainer_factory,
        model_path,
        summaries_dir,
        options.run_id + "-" + str(sub_id),
        options.save_freq,
        maybe_meta_curriculum,
        options.train_model,
        run_seed,
        sampler_manager,
        resampling_interval,
    )
    # Signal that environment has been launched.
    process_queue.put(True)
    # Begin training
    try:
        tc.start_learning(env_manager)
    finally:
        env_manager.close()
def run_training(run_seed: int, options: RunOptions, num_areas: int) -> None:
    """
    Launches training session.
    :param run_seed: Random seed used for training.
    :param num_areas: Number of training areas to instantiate
    :param options: parsed command line arguments
    """
    with hierarchical_timer("run_training.setup"):
        torch_utils.set_torch_config(options.torch_settings)
        checkpoint_settings = options.checkpoint_settings
        env_settings = options.env_settings
        engine_settings = options.engine_settings

        run_logs_dir = checkpoint_settings.run_logs_dir
        port: Optional[int] = env_settings.base_port
        # Check if directory exists
        validate_existing_directories(
            checkpoint_settings.write_path,
            checkpoint_settings.resume,
            checkpoint_settings.force,
            checkpoint_settings.maybe_init_path,
        )
        # Make run logs directory
        os.makedirs(run_logs_dir, exist_ok=True)
        # Load any needed states in case of resume
        if checkpoint_settings.resume:
            GlobalTrainingStatus.load_state(
                os.path.join(run_logs_dir, "training_status.json")
            )
        # In case of initialization, set full init_path for all behaviors
        elif checkpoint_settings.maybe_init_path is not None:
            setup_init_path(options.behaviors, checkpoint_settings.maybe_init_path)

        # Configure Tensorboard Writers and StatsReporter
        stats_writers = register_stats_writer_plugins(options)
        for sw in stats_writers:
            StatsReporter.add_writer(sw)

        if env_settings.env_path is None:
            port = None
        env_factory = create_environment_factory(
            env_settings.env_path,
            engine_settings.no_graphics,
            run_seed,
            num_areas,
            port,
            env_settings.env_args,
            os.path.abspath(run_logs_dir),  # Unity environment requires absolute path
        )
        env_manager = SubprocessEnvManager(env_factory, options, env_settings.num_envs)
        env_parameter_manager = EnvironmentParameterManager(
            options.environment_parameters, run_seed, restore=checkpoint_settings.resume
        )

        trainer_factory = TrainerFactory(
            trainer_config=options.behaviors,
            output_path=checkpoint_settings.write_path,
            train_model=not checkpoint_settings.inference,
            load_model=checkpoint_settings.resume,
            seed=run_seed,
            param_manager=env_parameter_manager,
            init_path=checkpoint_settings.maybe_init_path,
            multi_gpu=False,
        )
        # Create controller and begin training.
        tc = TrainerController(
            trainer_factory,
            checkpoint_settings.write_path,
            checkpoint_settings.run_id,
            env_parameter_manager,
            not checkpoint_settings.inference,
            run_seed,
        )

    # Begin training
    try:
        tc.start_learning(env_manager)
    finally:
        env_manager.close()
        write_run_options(checkpoint_settings.write_path, options)
        write_timing_tree(run_logs_dir)
        write_training_status(run_logs_dir)
def run_training_aai(run_seed: int, options: RunOptionsAAI) -> None:
    """
    Launches training session.
    :param run_seed: Random seed used for training.
    :param options: training parameters
    """
    with hierarchical_timer("run_training.setup"):
        # Recognize and use docker volume if one is passed as an argument
        # if not options.docker_target_name:
        model_path = f"./models/{options.run_id}"
        summaries_dir = "./summaries"
        # else:
        #     model_path = f"/{options.docker_target_name}/models/{options.run_id}"
        #     summaries_dir = f"/{options.docker_target_name}/summaries"
        port = options.base_port

        # Configure CSV, Tensorboard Writers and StatsReporter
        # We assume reward and episode length are needed in the CSV.
        csv_writer = CSVWriter(
            summaries_dir,
            required_fields=[
                "Environment/Cumulative Reward",
                "Environment/Episode Length",
            ],
        )
        tb_writer = TensorboardWriter(summaries_dir)
        gauge_write = GaugeWriter()
        StatsReporter.add_writer(tb_writer)
        StatsReporter.add_writer(csv_writer)
        StatsReporter.add_writer(gauge_write)

        if options.env_path is None:
            port = AnimalAIEnvironment.DEFAULT_EDITOR_PORT
        env_factory = create_environment_factory_aai(
            options.env_path,
            # options.docker_target_name,
            run_seed,
            port,
            options.n_arenas_per_env,
            options.arena_config,
            options.resolution,
        )
        if options.train_model:
            engine_config = EngineConfig(
                options.width,
                options.height,
                AnimalAIEnvironment.QUALITY_LEVEL.train,
                AnimalAIEnvironment.TIMESCALE.train,
                AnimalAIEnvironment.TARGET_FRAME_RATE.train,
            )
        else:
            engine_config = EngineConfig(
                AnimalAIEnvironment.WINDOW_WIDTH.play,
                AnimalAIEnvironment.WINDOW_HEIGHT.play,
                AnimalAIEnvironment.QUALITY_LEVEL.play,
                AnimalAIEnvironment.TIMESCALE.play,
                AnimalAIEnvironment.TARGET_FRAME_RATE.play,
            )
        env_manager = SubprocessEnvManagerAAI(
            env_factory, engine_config, options.num_envs
        )
        maybe_meta_curriculum = try_create_meta_curriculum(
            options.curriculum_config, env_manager, options.lesson
        )
        trainer_factory = TrainerFactory(
            options.trainer_config,
            summaries_dir,
            options.run_id,
            model_path,
            options.keep_checkpoints,
            options.train_model,
            options.load_model,
            run_seed,
            maybe_meta_curriculum,
            # options.multi_gpu,
        )
        # Create controller and begin training.
        tc = TrainerControllerAAI(
            trainer_factory,
            model_path,
            summaries_dir,
            options.run_id,
            options.save_freq,
            maybe_meta_curriculum,
            options.train_model,
            run_seed,
        )

    # Begin training
    try:
        tc.start_learning(env_manager)
    finally:
        env_manager.close()
        write_timing_tree(summaries_dir, options.run_id)
def run_training(run_seed: int, options: RunOptions) -> None:
    """
    Launches training session.
    :param options: parsed command line arguments
    :param run_seed: Random seed used for training.
    """
    with hierarchical_timer("run_training.setup"):
        checkpoint_settings = options.checkpoint_settings
        env_settings = options.env_settings
        engine_settings = options.engine_settings

        base_path = "results"
        write_path = os.path.join(base_path, checkpoint_settings.run_id)
        maybe_init_path = (
            os.path.join(base_path, checkpoint_settings.initialize_from)
            if checkpoint_settings.initialize_from
            else None
        )
        run_logs_dir = os.path.join(write_path, "run_logs")
        port: Optional[int] = env_settings.base_port

        # Check if directory exists
        handle_existing_directories(
            write_path,
            checkpoint_settings.resume,
            checkpoint_settings.force,
            maybe_init_path,
        )
        # Make run logs directory
        os.makedirs(run_logs_dir, exist_ok=True)
        # Load any needed states
        if checkpoint_settings.resume:
            GlobalTrainingStatus.load_state(
                os.path.join(run_logs_dir, "training_status.json")
            )

        # Configure CSV, Tensorboard Writers and StatsReporter
        # We assume reward and episode length are needed in the CSV.
        csv_writer = CSVWriter(
            write_path,
            required_fields=[
                "Environment/Cumulative Reward",
                "Environment/Episode Length",
            ],
        )
        tb_writer = TensorboardWriter(
            write_path, clear_past_data=not checkpoint_settings.resume
        )
        gauge_write = GaugeWriter()
        console_writer = ConsoleWriter()
        StatsReporter.add_writer(tb_writer)
        StatsReporter.add_writer(csv_writer)
        StatsReporter.add_writer(gauge_write)
        StatsReporter.add_writer(console_writer)

        if env_settings.env_path is None:
            port = None
        env_factory = create_environment_factory(
            env_settings.env_path,
            engine_settings.no_graphics,
            run_seed,
            port,
            env_settings.env_args,
            os.path.abspath(run_logs_dir),  # Unity environment requires absolute path
        )
        engine_config = EngineConfig(
            width=engine_settings.width,
            height=engine_settings.height,
            quality_level=engine_settings.quality_level,
            time_scale=engine_settings.time_scale,
            target_frame_rate=engine_settings.target_frame_rate,
            capture_frame_rate=engine_settings.capture_frame_rate,
        )
        env_manager = SubprocessEnvManager(
            env_factory, engine_config, env_settings.num_envs
        )
        maybe_meta_curriculum = try_create_meta_curriculum(
            options.curriculum, env_manager, restore=checkpoint_settings.resume
        )
        maybe_add_samplers(options.parameter_randomization, env_manager, run_seed)
        trainer_factory = TrainerFactory(
            options.behaviors,
            write_path,
            not checkpoint_settings.inference,
            checkpoint_settings.resume,
            run_seed,
            maybe_init_path,
            maybe_meta_curriculum,
            False,
        )
        # Create controller and begin training.
        tc = TrainerController(
            trainer_factory,
            write_path,
            checkpoint_settings.run_id,
            maybe_meta_curriculum,
            not checkpoint_settings.inference,
            run_seed,
        )

    # Begin training
    try:
        tc.start_learning(env_manager)
    finally:
        env_manager.close()
        write_run_options(write_path, options)
        write_timing_tree(run_logs_dir)
        write_training_status(run_logs_dir)