def test_ppo_model_cc_vector_rnn(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            memory_size = 128
            model = PPOModel(env.brains["RealFakeBrain"],
                             use_recurrent=True, m_size=memory_size)
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output, model.all_log_probs, model.value,
                model.entropy, model.learning_rate, model.memory_out
            ]
            feed_dict = {
                model.batch_size: 1,
                model.sequence_length: 2,
                model.memory_in: np.zeros((1, memory_size)),
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.epsilon: np.array([[0, 1]])
            }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_step(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0)
    env = UnityEnvironment(' ')
    brain = env.brains['RealFakeBrain']
    brain_info = env.reset()
    brain_info = env.step([0] * brain.vector_action_space_size[0] *
                          len(brain_info['RealFakeBrain'].agents))
    with pytest.raises(UnityActionException):
        env.step([0])
    brain_info = env.step([-1] * brain.vector_action_space_size[0] *
                          len(brain_info['RealFakeBrain'].agents))
    with pytest.raises(UnityActionException):
        env.step([0] * brain.vector_action_space_size[0] *
                 len(brain_info['RealFakeBrain'].agents))
    env.close()

    assert env.global_done
    assert isinstance(brain_info, dict)
    assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
    assert isinstance(brain_info['RealFakeBrain'].visual_observations, list)
    assert isinstance(brain_info['RealFakeBrain'].vector_observations, np.ndarray)
    assert len(brain_info['RealFakeBrain'].visual_observations) == \
        brain.number_visual_observations
    assert len(brain_info['RealFakeBrain'].vector_observations) == \
        len(brain_info['RealFakeBrain'].agents)
    assert len(brain_info['RealFakeBrain'].vector_observations[0]) == \
        brain.vector_observation_space_size * brain.num_stacked_vector_observations
    print("\n\n\n\n\n\n\n" + str(brain_info['RealFakeBrain'].local_done))
    assert not brain_info['RealFakeBrain'].local_done[0]
    assert brain_info['RealFakeBrain'].local_done[2]
def test_ppo_get_value_estimates(mock_communicator, mock_launcher, dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0
    )
    env = UnityEnvironment(" ")
    brain_infos = env.reset()
    brain_info = brain_infos[env.brain_names[0]]

    trainer_parameters = dummy_config
    model_path = env.brain_names[0]
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    policy = PPOPolicy(
        0, env.brains[env.brain_names[0]], trainer_parameters, False, False
    )
    run_out = policy.get_value_estimates(brain_info, 0, done=False)
    for key, val in run_out.items():
        assert type(key) is str
        assert type(val) is float

    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    policy.reward_signals["extrinsic"].use_terminal_states = False
    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val != 0.0

    env.close()
def test_ppo_model_dc_vector_curio(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=True, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True)
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output, model.all_log_probs, model.value,
                model.entropy, model.learning_rate, model.intrinsic_reward
            ]
            feed_dict = {
                model.batch_size: 2,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.next_vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                [3, 4, 5, 3, 4, 5]]),
                model.action_holder: [[0], [0]],
                model.action_masks: np.ones([2, 2])
            }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_ppo_model_cc_visual(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=2)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output, model.log_probs, model.value,
                model.entropy, model.learning_rate
            ]
            feed_dict = {
                model.batch_size: 2,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.visual_in[0]: np.ones([2, 40, 30, 3]),
                model.visual_in[1]: np.ones([2, 40, 30, 3]),
                model.epsilon: np.array([[0, 1], [2, 3]])
            }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_unity(self):
    from mlagents.envs import UnityEnvironment

    env = UnityEnvironment(file_name='env/' + env_set['env_name'], worker_id=0)
    default_brain = env.brain_names[0]
    env_info = env.reset(config={'Mass': 1, 'Length': 1.5 * 3.0})[default_brain]
    states = env_info.vector_observations

    self.saver.restore(self.sess,
                       save_path='runs/td3_' + env_set['env_name'] + '/save')

    scores = np.zeros([self.worker_size])
    score = deque(maxlen=1000)
    for i in range(10000):
        actions = self.get_action(states, 0.05)
        env_info = env.step(actions)[default_brain]
        states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += rewards
        for idx, d in enumerate(dones):
            if d:
                score.append(scores[idx])
                scores[idx] = 0
        print('score : ', "{0:.2f}".format(np.mean(score)))
    env.close()
def test_close(mock_communicator, mock_launcher):
    comm = MockCommunicator(discrete_action=False, visual_inputs=0)
    mock_communicator.return_value = comm
    env = UnityEnvironment(' ')
    assert env._loaded
    env.close()
    assert not env._loaded
    assert comm.has_been_closed
def test_initialization(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0)
    env = UnityEnvironment(' ')
    with pytest.raises(UnityActionException):
        env.step([0])
    assert env.brain_names[0] == 'RealFakeBrain'
    env.close()
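# The tests above receive `mock_communicator` and `mock_launcher` as injected
# mocks rather than creating them. A minimal sketch of how such a test is
# typically wired up with unittest.mock; the patch targets and the import path
# of MockCommunicator are assumptions and differ between ml-agents releases.
from unittest import mock

from mlagents.envs import UnityEnvironment
from tests.mock_communicator import MockCommunicator  # assumed test-helper location


# mock.patch decorators apply bottom-up, so the bottom decorator feeds the
# first test argument (mock_communicator) and the top one feeds mock_launcher.
@mock.patch("mlagents.envs.UnityEnvironment.executable_launcher")
@mock.patch("mlagents.envs.UnityEnvironment.get_communicator")
def test_example(mock_communicator, mock_launcher):
    # The mocked communicator answers the handshake, so no Unity build is launched.
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0)
    env = UnityEnvironment(' ')
    assert env.brain_names[0] == 'RealFakeBrain'
    env.close()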
def main():
    env = UnityEnvironment(file_name='../env/Pong/Pong')
    default_brain = env.brain_names[0]
    brain = env.brains[default_brain]

    env_info = env.reset(train_mode=False)[default_brain]
    obs_dim = env_info.vector_observations[0].shape[0]
    act_num = brain.vector_action_space_size[0]

    mlp = MLP(obs_dim, act_num).to(device)

    if args.load is not None:
        pretrained_model_path = os.path.join('./save_model/' + str(args.load))
        pretrained_model = torch.load(pretrained_model_path)
        mlp.load_state_dict(pretrained_model)

    sum_returns = 0.
    num_episodes = 0

    for episode in range(1, 10001):
        total_reward = 0.
        obs = env_info.vector_observations[0]
        done = False

        while not done:
            action = mlp(torch.Tensor(obs).to(device)).argmax().detach().cpu().numpy()
            env_info = env.step(int(action))[default_brain]
            next_obs = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]

            total_reward += reward
            obs = next_obs

        sum_returns += total_reward
        num_episodes += 1
        average_return = sum_returns / num_episodes if num_episodes > 0 else 0.0

        if episode % 10 == 0:
            print('---------------------------------------')
            print('Episodes:', num_episodes)
            print('AverageReturn:', average_return)
            print('---------------------------------------')
    env.close()
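# The evaluation script above assumes that MLP, device, and args are defined
# elsewhere in the module. A minimal PyTorch sketch compatible with how
# MLP(obs_dim, act_num) is called here; the hidden-layer sizes are assumptions.
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class MLP(nn.Module):
    """Small fully connected network mapping observations to per-action values."""

    def __init__(self, obs_dim, act_num, hidden_size=128):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(obs_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, act_num),
        )

    def forward(self, x):
        return self.layers(x)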
def evaluate_model(self, model_name): """ Session for evaluating every model. Uses the concept of the Trainer in a deterministic manner :param model_name: Name of the model to be evaluated """ print( "\n==================== New Evaluation {} ============================" .format(model_name)) if self.use_executable: env = UnityEnvironment(file_name=self.env_name) else: env = UnityEnvironment(file_name=None) default_brain = env.brain_names[0] env_info = env.reset(train_mode=False)[default_brain] num_output = len(env_info.action_masks[0]) # Fetching model model_path = self.path_to_models + model_name + ".h5" model_manager = ModelManager(load=True, num_views=num_output, num_output=num_output, model_name=model_path) # Change the model name if "_" in model_name and False: model_name = "evaluation_" + model_name.split("_", 1)[1] else: model_name = "eval_" + model_name model_name = "eval_coverage_progression" # Evaluating the model trainer = Trainer(model_manager, env, self.max_step) synopsis = SynopsisManager(trainer, model_manager, run_name=model_name, max_step=self.max_step) trainer.evaluate_solution(self.evaluation_size) # Close environment env.close() # Cleanup # del trainer.memory del trainer del synopsis del model_manager
def execute_session(self, model_name, alpha_acc, exp_acc, alpha_dist, exp_dist, alpha_steps, t): """ Executes a single training session of a model :param model_name: Name of the model :param alpha_acc: Coverage reward :param exp_acc: Coverage exponential reward :param alpha_dist: Distance reward :param exp_dist: Distance Exponential Reward :param alpha_steps: Step Reward :param t: Which architecture to be used """ print("\n==================== New Session {} ============================".format(model_name)) print("acc: {} - {}, dist: {} - {}, steps {}, views: {}, LR: {}\n" .format(alpha_acc, exp_acc, alpha_dist, exp_dist, alpha_steps, self.alpha_views, self.learning_rate)) if self.use_executable: env = UnityEnvironment(file_name=self.env_name) else: env = UnityEnvironment(file_name=None) default_brain = env.brain_names[0] env_info = env.reset(train_mode=False)[default_brain] num_output = len(env_info.action_masks[0]) # Fetching model model_manager = ModelManager(load=self.load_model, num_views=num_output, num_output=num_output, model_name=model_name, learning_rate=self.learning_rate, variation=t) # Train trainer = Trainer(model_manager, env, self.max_step) trainer.set_reward_values(alpha_acc, exp_acc, alpha_dist, exp_dist, alpha_steps, self.alpha_views) synopsis = SynopsisManager(trainer, model_manager, run_name=model_name, max_step=self.max_step) trainer.train(self.num_generations, self.num_batches, self.batch_size, self.test_size) synopsis.print_training_summary() trainer.evaluate_solution(self.evaluation_size) # Close environment env.close() # Save model model_manager.save_model() # Cleanup # del trainer.memory del trainer del synopsis del model_manager
def test_ppo_policy_evaluate(mock_communicator, mock_launcher, dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0)
    env = UnityEnvironment(' ')
    brain_infos = env.reset()
    brain_info = brain_infos[env.brain_names[0]]

    trainer_parameters = dummy_config
    model_path = env.brain_names[0]
    trainer_parameters['model_path'] = model_path
    trainer_parameters['keep_checkpoints'] = 3
    policy = PPOPolicy(0, env.brains[env.brain_names[0]],
                       trainer_parameters, False, False)
    run_out = policy.evaluate(brain_info)
    assert run_out['action'].shape == (3, 2)
    env.close()
class Drone:
    spec = None
    name = None
    action_space = None
    observation_space = None

    def __init__(
        self,
        env_path: str,
        env_name: str,
        cfg: dict,
        train_mode: bool = True,
        worker_id: int = 1,
    ):
        self.env = UnityEnvironment(file_name=env_path, worker_id=worker_id)
        self.default_brain = self.env.brain_names[0]
        self.cfg = cfg
        self.name = env_name
        self.action_space = spaces.Box(low=-1, high=1,
                                       shape=(3, ), dtype=np.float32)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(9, ), dtype=np.float32)
        self.train_mode = train_mode

    def reset(self):
        env_info = self.env.reset(train_mode=self.train_mode,
                                  config=self.cfg)[self.default_brain]
        return env_info.vector_observations[0]

    def step(self, action):
        env_info = self.env.step(action.tolist())[self.default_brain]
        observation = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        info = None
        return observation, reward, done, info

    def close(self):
        self.env.close()

    def seed(self, seed):
        pass
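# A minimal random-action rollout against the Drone wrapper above; the
# environment path, name, and config dict are placeholders for illustration only.
import numpy as np

env = Drone(env_path="./envs/Drone/Drone", env_name="Drone",
            cfg={}, train_mode=False, worker_id=2)
obs = env.reset()
episode_return = 0.0
done = False
while not done:
    # Sample a continuous action in [-1, 1]^3, matching the declared action_space.
    action = np.random.uniform(-1, 1, size=3).astype(np.float32)
    obs, reward, done, _ = env.step(action)
    episode_return += reward
env.close()
print("episode return:", episode_return)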
def test_cc_bc_model(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = BehavioralCloningModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.sample_action, model.policy]
            feed_dict = {
                model.batch_size: 2,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]])
            }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_reset(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0)
    env = UnityEnvironment(' ')
    brain = env.brains['RealFakeBrain']
    brain_info = env.reset()
    env.close()

    assert not env.global_done
    assert isinstance(brain_info, dict)
    assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
    assert isinstance(brain_info['RealFakeBrain'].visual_observations, list)
    assert isinstance(brain_info['RealFakeBrain'].vector_observations, np.ndarray)
    assert len(brain_info['RealFakeBrain'].visual_observations) == \
        brain.number_visual_observations
    assert len(brain_info['RealFakeBrain'].vector_observations) == \
        len(brain_info['RealFakeBrain'].agents)
    assert len(brain_info['RealFakeBrain'].vector_observations[0]) == \
        brain.vector_observation_space_size * brain.num_stacked_vector_observations
def test_ppo_policy_evaluate(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        mock_communicator.return_value = MockCommunicator(
            discrete_action=False, visual_inputs=0)
        env = UnityEnvironment(' ')
        brain_infos = env.reset()
        brain_info = brain_infos[env.brain_names[0]]

        trainer_parameters = dummy_config()
        graph_scope = env.brain_names[0]
        trainer_parameters['graph_scope'] = graph_scope
        policy = PPOPolicy(0, env.brains[env.brain_names[0]],
                           trainer_parameters, sess, False)
        init = tf.global_variables_initializer()
        sess.run(init)

        run_out = policy.evaluate(brain_info)
        assert run_out['action'].shape == (3, 2)
        env.close()
class Sokoban:
    spec = None
    name = None
    action_space = None
    observation_space = None

    def __init__(
        self,
        env_path: str,
        env_name: str,
        cfg: dict,
        train_mode=True,
        worker_id: int = 1,
    ):
        self.env = UnityEnvironment(file_name=env_path, worker_id=worker_id)
        self.default_brain = self.env.brain_names[0]
        self.cfg = cfg
        self.name = env_name
        self.action_space = spaces.Discrete(5)
        self.observation_space = spaces.Box(low=0, high=255,
                                            shape=(3, 84, 84), dtype=np.uint8)
        self.train_mode = train_mode

    def reset(self):
        env_info = self.env.reset(train_mode=self.train_mode,
                                  config=self.cfg)[self.default_brain]
        return env_info.visual_observations[0][0].reshape(3, 84, 84)

    def step(self, action):
        env_info = self.env.step(action.tolist())[self.default_brain]
        observation = env_info.visual_observations[0][0].reshape(3, 84, 84)
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        info = None
        return observation, reward, done, info

    def close(self):
        self.env.close()

    def seed(self, seed):
        pass
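# Unlike Drone, the Sokoban wrapper above returns a channel-first 3x84x84 uint8
# image and takes a discrete action. A tiny sketch; the path and config are
# placeholders.
import numpy as np

env = Sokoban(env_path="./envs/Sokoban/Sokoban", env_name="Sokoban",
              cfg={}, train_mode=False, worker_id=3)
obs = env.reset()  # np.ndarray of shape (3, 84, 84)
# step() calls .tolist() on the action, so pass a numpy scalar rather than a plain int.
action = np.int64(env.action_space.sample())
obs, reward, done, _ = env.step(action)
env.close()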
class Sokoban_env():
    def __init__(self, env_path, env_cfg=Sokoban_env_cfg):
        self.env = UnityEnvironment(file_name=env_path)
        self.default_brain = self.env.brain_names[0]
        self.env_cfg = env_cfg

    def reset(self):
        env_info = self.env.reset(train_mode=True,
                                  config=self.env_cfg)[self.default_brain]
        return env_info.visual_observations[0][0].reshape(1, 3, 84, 84)

    def step(self, action):
        env_info = self.env.step(action)[self.default_brain]
        observation = env_info.visual_observations[0][0].reshape(1, 3, 84, 84)
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        info = None
        return observation, reward, done, info

    def close(self):
        self.env.close()
def test_visual_dc_bc_model(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=True, visual_inputs=2)
            env = UnityEnvironment(" ")
            model = BehavioralCloningModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.sample_action, model.action_probs]
            feed_dict = {
                model.batch_size: 2,
                model.dropout_rate: 1.0,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.visual_in[0]: np.ones([2, 40, 30, 3]),
                model.visual_in[1]: np.ones([2, 40, 30, 3]),
                model.action_masks: np.ones([2, 2]),
            }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
class UnityEnvBase(gym.Env): """ Provides Gym wrapper for Unity Learning Environments. Multi-agent environments use lists for object types, as done here: https://github.com/openai/multiagent-particle-envs """ worker_id = UNIVERSAL_LOCK def __init__( self, environment_filename: str, use_visual=True, uint8_visual=True, multiagent=False, flatten_branched=False, no_graphics=False, allow_multiple_visual_obs=False, ): """ Environment initialization :param environment_filename: The UnityEnvironment path or file to be wrapped in the gym. :param worker_id: Worker number for environment. :param use_visual: Whether to use visual observation or vector observation. :param uint8_visual: Return visual observations as uint8 (0-255) matrices instead of float (0.0-1.0). :param multiagent: Whether to run in multi-agent mode (lists of obs, reward, done). :param flatten_branched: If True, turn branched discrete action spaces into a Discrete space rather than MultiDiscrete. :param no_graphics: Whether to run the Unity simulator in no-graphics mode :param allow_multiple_visual_obs: If True, return a list of visual observations instead of only one. """ worker_id = UnityEnvBase._generate_new_env_id() self.worker_id = worker_id self._env = UnityEnvironment(environment_filename, worker_id, no_graphics=no_graphics) self.name = self._env.academy_name self.visual_obs = None self._current_state = None self._n_agents = None self._multiagent = multiagent self._flattener = None self.game_over = ( False ) # Hidden flag used by Atari environments to determine if the game is over self._allow_multiple_visual_obs = allow_multiple_visual_obs # Check brain configuration if len(self._env.brains) != 1: raise UnityEnvBaseException( "There can only be one brain in a UnityEnvironment " "if it is wrapped in a gym.") if len(self._env.external_brain_names) <= 0: raise UnityEnvBaseException( "There are not any external brain in the UnityEnvironment") self.brain_name = self._env.external_brain_names[0] brain = self._env.brains[self.brain_name] if use_visual and brain.number_visual_observations == 0: raise UnityEnvBaseException( "`use_visual` was set to True, however there are no" " visual observations as part of this environment.") self.use_visual = brain.number_visual_observations >= 1 and use_visual if not use_visual and uint8_visual: logger.warning( "`uint8_visual was set to true, but visual observations are not in use. " "This setting will not have any effect.") else: self.uint8_visual = uint8_visual if brain.number_visual_observations > 1 and not self._allow_multiple_visual_obs: logger.warning( "The environment contains more than one visual observation. " "You must define allow_multiple_visual_obs=True to received them all. " "Otherwise, please note that only the first will be provided in the observation." ) if brain.num_stacked_vector_observations != 1: raise UnityEnvBaseException( "There can only be one stacked vector observation in a UnityEnvironment " "if it is wrapped in a gym.") # Check for number of agents in scene. 
initial_info = self._env.reset()[self.brain_name] self._check_agents(len(initial_info.agents)) # Set observation and action spaces if brain.vector_action_space_type == "discrete": if len(brain.vector_action_space_size) == 1: self._action_space = spaces.Discrete( brain.vector_action_space_size[0]) else: if flatten_branched: self._flattener = ActionFlattener( brain.vector_action_space_size) self._action_space = self._flattener.action_space else: self._action_space = spaces.MultiDiscrete( brain.vector_action_space_size) else: if flatten_branched: logger.warning( "The environment has a non-discrete action space. It will " "not be flattened.") high = np.array([np.inf] * brain.vector_action_space_size[0]) self._action_space = spaces.Box(-high, high, dtype=np.float32) high = np.array([np.inf] * brain.vector_observation_space_size) self.action_meanings = brain.vector_action_descriptions if self.use_visual: if brain.camera_resolutions[0]["blackAndWhite"]: depth = 1 else: depth = 3 self._observation_space = spaces.Box( 0, 1, dtype=np.float32, shape=( brain.camera_resolutions[0]["height"], brain.camera_resolutions[0]["width"], depth, ), ) else: self._observation_space = spaces.Box(-high, high, dtype=np.float32) def reset(self): """Resets the state of the environment and returns an initial observation. In the case of multi-agent environments, this is a list. Returns: observation (object/list): the initial observation of the space. """ info = self._env.reset()[self.brain_name] n_agents = len(info.agents) self._check_agents(n_agents) self.game_over = False if not self._multiagent: obs, reward, done, info = self._single_step(info) else: obs, reward, done, info = self._multi_step(info) return obs def step(self, action): """Run one timestep of the environment's dynamics. When end of episode is reached, you are responsible for calling `reset()` to reset this environment's state. Accepts an action and returns a tuple (observation, reward, done, info). In the case of multi-agent environments, these are lists. Args: action (object/list): an action provided by the environment Returns: observation (object/list): agent's observation of the current environment reward (float/list) : amount of reward returned after previous action done (boolean/list): whether the episode has ended. info (dict): contains auxiliary diagnostic information, including BrainInfo. """ # Use random actions for all other agents in environment. if self._multiagent: if not isinstance(action, list): raise UnityEnvBaseException( "The environment was expecting `action` to be a list.") if len(action) != self._n_agents: raise UnityEnvBaseException( "The environment was expecting a list of {} actions.". 
format(self._n_agents)) else: if self._flattener is not None: # Action space is discrete and flattened - we expect a list of scalars action = [ self._flattener.lookup_action(_act) for _act in action ] action = np.array(action) else: if self._flattener is not None: # Translate action into list action = self._flattener.lookup_action(action) info = self._env.step(action)[self.brain_name] n_agents = len(info.agents) self._check_agents(n_agents) self._current_state = info if not self._multiagent: obs, reward, done, info = self._single_step(info) self.game_over = done else: obs, reward, done, info = self._multi_step(info) self.game_over = all(done) return obs, reward, done, info def _single_step(self, info): if self.use_visual: visual_obs = info.visual_observations if isinstance(visual_obs, list): visual_obs = np.array(visual_obs) if self._allow_multiple_visual_obs: visual_obs_list = [] for obs in visual_obs: visual_obs_list.append( self._preprocess_single(obs[0, :, :, :])) self.visual_obs = visual_obs_list else: self.visual_obs = self._preprocess_single( visual_obs[0][0, :, :, :]) default_observation = self.visual_obs else: default_observation = info.vector_observations[0, :] return ( default_observation, info.rewards[0], info.local_done[0], { "text_observation": info.text_observations[0], "brain_info": info, "vector_observations": info.vector_observations[0, :] }, ) def _preprocess_single(self, single_visual_obs): if self.uint8_visual: return (255.0 * single_visual_obs).astype(np.uint8) else: return single_visual_obs def _multi_step(self, info): if self.use_visual: self.visual_obs = self._preprocess_multi(info.visual_observations) default_observation = self.visual_obs else: default_observation = info.vector_observations return ( list(default_observation), info.rewards, info.local_done, { "text_observation": info.text_observations, "brain_info": info }, ) def _preprocess_multi(self, multiple_visual_obs): if self.uint8_visual: return [(255.0 * _visual_obs).astype(np.uint8) for _visual_obs in multiple_visual_obs] else: return multiple_visual_obs def render(self, mode="rgb_array"): return self.visual_obs def close(self): """Override _close in your subclass to perform any necessary cleanup. Environments will automatically close() themselves when garbage collected or when the program exits. """ try: self._env.close() except: pass def __del__(self): try: self._env.close() except: pass def get_action_meanings(self): return self.action_meanings def seed(self, seed=None): """Sets the seed for this env's random number generator(s). Currently not implemented. """ logger.warn("Could not seed environment %s", self.name) return def _check_agents(self, n_agents): if not self._multiagent and n_agents > 1: raise UnityEnvBaseException( "The environment was launched as a single-agent environment, however" "there is more than one agent in the scene.") elif self._multiagent and n_agents <= 1: raise UnityEnvBaseException( "The environment was launched as a mutli-agent environment, however" "there is only one agent in the scene.") if self._n_agents is None: self._n_agents = n_agents logger.info("{} agents within environment.".format(n_agents)) elif self._n_agents != n_agents: raise UnityEnvBaseException( "The number of agents in the environment has changed since " "initialization. 
This is not supported.") @staticmethod def _generate_new_env_id(): with UnityEnvBase.worker_id.get_lock(): new_id = UnityEnvBase.worker_id.value UnityEnvBase.worker_id.value += 1 return new_id @property def metadata(self): return {"render.modes": ["rgb_array"]} @property def reward_range(self): return -float("inf"), float("inf") @property def spec(self): return None @property def action_space(self): return self._action_space @property def observation_space(self): return self._observation_space @property def number_agents(self): return self._n_agents
class SocTwoEnv(): def __init__(self, env_path, worker_id, train_mode=True, n_str=16, n_goalie=16): self.env = UnityEnvironment(file_name=env_path, worker_id=0) self.striker_brain_name, self.goalie_brain_name = self.env.brain_names self.striker_brain = self.env.brains[self.striker_brain_name] self.goalie_brain = self.env.brains[self.goalie_brain_name] self.done_str = [False] * 16 self.done_goalie = [False] * 16 self.train_mode = train_mode self.done_hist_str = [False] * 16 self.done_hist_goalie = [False] * 16 self.episode_str_rewards = 0 self.episode_goalie_rewards = 0 self.n_str = n_str self.n_goalie = n_goalie self.act_str_hist = [[] for x in range(n_str)] self.act_goalie_hist = [[] for x in range(n_goalie)] self.observation_str_hist = [[] for x in range(SIZE_OBSERVATION)] self.observation_goalie_hist = [[] for x in range(SIZE_OBSERVATION)] self.observation_str = None self.observation_goalie = None return def reset(self): """ Reset the all environments and agents. """ self.env_info_str = self.env.reset( train_mode=self.train_mode)[self.striker_brain_name] self.env_info_goalie = self.env.reset( train_mode=self.train_mode)[self.goalie_brain_name] self.episode_rewards = 0 self.done_str = [False] * 16 self.done_goalie = [False] * 16 self.done_hist_str = np.array([False] * 16) self.done_hist_goalie = np.array([False] * 16) return {'str': self.env_info_str, 'goalie': self.env_info_goalie} def step(self, action_str, action_goalie): """ In each timestep, give each striker and goalie a instruction to do action. And then, get the current observation stored at observation_str and observation_goalie. """ self.env_info = self.env.step({ self.striker_brain_name: action_str, self.goalie_brain_name: action_goalie }) self.observation_str = np.array( self.env_info[self.striker_brain_name].vector_observations) self.observation_goalie = np.array( self.env_info[self.goalie_brain_name].vector_observations) return self.env_info def reward(self): self.episode_str_rewards = np.array( self.env_info[self.striker_brain_name].rewards) self.episode_goalie_rewards = np.array( self.env_info[self.goalie_brain_name].rewards) return self.episode_str_rewards, self.episode_goalie_rewards def close(self): """ Close the simulation Unity environment. """ self.env.close() return def done(self): self.done_str = np.array( self.env_info[self.striker_brain_name].local_done) self.done_goalie = np.array( self.env_info[self.goalie_brain_name].local_done) def reset_some_agents(self, str_arg, goalie_arg): """ params: str_arg, mark which striker's history that wants to be cleared. goalie_arg, mark which goalie's history that wants to be cleared. Clear the history of specific agents. """ for i in str_arg: self.act_str_hist[i[0]] = [] self.observation_str_hist[i[0]] = [] for i in goalie_arg: self.act_goalie_hist[i[0]] = [] def print_r(self, episode): print("Total reward this episode_{}: {}".format( episode, self.episode_rewards)) return
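# A short sketch of stepping SocTwoEnv with random discrete actions for all
# 16 strikers and 16 goalies. The executable path and the per-agent action-space
# size (7 here) are assumptions for illustration.
import numpy as np

env = SocTwoEnv("./builds/SoccerTwos", worker_id=0, train_mode=True)
env.reset()
for _ in range(200):
    action_str = np.random.randint(0, 7, size=16)     # one action per striker
    action_goalie = np.random.randint(0, 7, size=16)  # one action per goalie
    env.step(action_str, action_goalie)
    str_rewards, goalie_rewards = env.reward()
    env.done()  # refresh done_str / done_goalie from the latest BrainInfo
env.close()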
class ObstacleTowerEnv(gym.Env): ALLOWED_VERSIONS = ['1'] def __init__(self, environment_filename=None, docker_training=False, worker_id=0, retro=True): """ Arguments: environment_filename: The file path to the Unity executable. Does not require the extension. docker_training: Whether this is running within a docker environment and should use a virtual frame buffer (xvfb). worker_id: The index of the worker in the case where multiple environments are running. Each environment reserves port (5005 + worker_id) for communication with the Unity executable. retro: Resize visual observation to 84x84 (int8) and flattens action space. """ if self.is_grading(): environment_filename = None docker_training = True self._env = UnityEnvironment(environment_filename, worker_id, docker_training=docker_training) split_name = self._env.academy_name.split('-v') if len(split_name) == 2 and split_name[0] == "ObstacleTower": self.name, self.version = split_name else: raise UnityGymException( "Attempting to launch non-Obstacle Tower environment") if self.version not in self.ALLOWED_VERSIONS: raise UnityGymException( "Invalid Obstacle Tower version. Your build is v" + self.version + \ " but only the following versions are compatible with this gym: " + \ str(self.ALLOWED_VERSIONS) ) self.visual_obs = None self._current_state = None self._n_agents = None self._done_grading = False self._flattener = None self._seed = None self._floor = None self.game_over = False # Hidden flag used by Atari environments to determine if the game is over self.retro = retro flatten_branched = self.retro uint8_visual = self.retro # Check brain configuration if len(self._env.brains) != 1: raise UnityGymException( "There can only be one brain in a UnityEnvironment " "if it is wrapped in a gym.") self.brain_name = self._env.external_brain_names[0] brain = self._env.brains[self.brain_name] if brain.number_visual_observations == 0: raise UnityGymException( "Environment provides no visual observations.") self.uint8_visual = uint8_visual if brain.number_visual_observations > 1: logger.warning( "The environment contains more than one visual observation. " "Please note that only the first will be provided in the observation." ) # Check for number of agents in scene. 
initial_info = self._env.reset()[self.brain_name] self._check_agents(len(initial_info.agents)) # Set observation and action spaces if len(brain.vector_action_space_size) == 1: self._action_space = spaces.Discrete( brain.vector_action_space_size[0]) else: if flatten_branched: self._flattener = ActionFlattener( brain.vector_action_space_size) self._action_space = self._flattener.action_space else: self._action_space = spaces.MultiDiscrete( brain.vector_action_space_size) high = np.array([np.inf] * brain.vector_observation_space_size) self.action_meanings = brain.vector_action_descriptions depth = 3 image_space_max = 1.0 image_space_dtype = np.float32 camera_height = brain.camera_resolutions[0]["height"] camera_width = brain.camera_resolutions[0]["width"] if self.retro: image_space_max = 255 image_space_dtype = np.uint8 camera_height = 84 camera_width = 84 image_space = spaces.Box(0, image_space_max, dtype=image_space_dtype, shape=(camera_height, camera_width, depth)) if self.retro: self._observation_space = image_space else: max_float = np.finfo(np.float32).max keys_space = spaces.Discrete(5) time_remaining_space = spaces.Box(low=0.0, high=max_float, shape=(1, ), dtype=np.float32) self._observation_space = spaces.Tuple( (image_space, keys_space, time_remaining_space)) def done_grading(self): return self._done_grading def is_grading(self): return os.getenv('OTC_EVALUATION_ENABLED', False) def reset(self): """Resets the state of the environment and returns an initial observation. In the case of multi-agent environments, this is a list. Returns: observation (object/list): the initial observation of the space. """ reset_params = {} if self._floor is not None: reset_params['floor-number'] = self._floor if self._seed is not None: reset_params['tower-seed'] = self._seed info = self._env.reset(config=reset_params)[self.brain_name] n_agents = len(info.agents) self._check_agents(n_agents) self.game_over = False obs, reward, done, info = self._single_step(info) return obs def step(self, action): """Run one timestep of the environment's dynamics. When end of episode is reached, you are responsible for calling `reset()` to reset this environment's state. Accepts an action and returns a tuple (observation, reward, done, info). In the case of multi-agent environments, these are lists. Args: action (object/list): an action provided by the environment Returns: observation (object/list): agent's observation of the current environment reward (float/list) : amount of reward returned after previous action done (boolean/list): whether the episode has ended. info (dict): contains auxiliary diagnostic information, including BrainInfo. """ # Use random actions for all other agents in environment. 
if self._flattener is not None: # Translate action into list action = self._flattener.lookup_action(action) info = self._env.step(action)[self.brain_name] n_agents = len(info.agents) self._check_agents(n_agents) self._current_state = info obs, reward, done, info = self._single_step(info) self.game_over = done if info.get('text_observation') == 'evaluation_complete': done = True self._done_grading = True return obs, reward, done, info def _single_step(self, info): self.visual_obs = self._preprocess_single( info.visual_observations[0][0, :, :, :]) if self.retro: self.visual_obs = self._resize_observation(self.visual_obs) self.visual_obs = self._add_stats_to_image( self.visual_obs, info.vector_observations[0]) default_observation = self.visual_obs else: default_observation = self._prepare_tuple_observation( self.visual_obs, info.vector_observations[0]) return default_observation, info.rewards[0], info.local_done[0], { "text_observation": info.text_observations[0], "brain_info": info } def _preprocess_single(self, single_visual_obs): if self.uint8_visual: return (255.0 * single_visual_obs).astype(np.uint8) else: return single_visual_obs def render(self, mode='rgb_array'): return self.visual_obs def close(self): """Override _close in your subclass to perform any necessary cleanup. Environments will automatically close() themselves when garbage collected or when the program exits. """ self._env.close() def get_action_meanings(self): return self.action_meanings def seed(self, seed=None): """Sets a fixed seed for this env's random number generator(s). The valid range for seeds is [0, 100). By default a random seed will be chosen. """ if seed is None: self._seed = seed return seed = int(seed) if seed < 0 or seed >= 100: logger.warn("Seed outside of valid range [0, 100). A random seed " "within the valid range will be used on next reset.") logger.warn("New seed " + str(seed) + " will apply on next reset.") self._seed = seed def floor(self, floor=None): """Sets the starting floor to a fixed floor number on subsequent environment resets.""" if floor is None: self._floor = floor return floor = int(floor) if floor < 0 or floor >= 25: logger.warn( "Starting floor outside of valid range [0, 25). 
Floor 0 will be used" "on next reset.") logger.warn("New starting floor " + str(floor) + " will apply on next reset.") self._floor = floor @staticmethod def _resize_observation(observation): """ Re-sizes visual observation to 84x84 """ obs_image = Image.fromarray(observation) obs_image = obs_image.resize((84, 84), Image.NEAREST) return np.array(obs_image) @staticmethod def _prepare_tuple_observation(vis_obs, vector_obs): """ Converts separate visual and vector observation into prepared tuple """ key = vector_obs[0:6] time = vector_obs[6] key_num = np.argmax(key, axis=0) return vis_obs, key_num, time @staticmethod def _add_stats_to_image(vis_obs, vector_obs): """ Displays time left and number of keys on visual observation """ key = vector_obs[0:6] time = vector_obs[6] key_num = np.argmax(key, axis=0) time_num = min(time, 10000) / 10000 vis_obs[0:10, :, :] = 0 for i in range(key_num): start = int(i * 16.8) + 4 end = start + 10 vis_obs[1:5, start:end, 0:2] = 255 vis_obs[6:10, 0:int(time_num * 84), 1] = 255 return vis_obs def _check_agents(self, n_agents): if n_agents > 1: raise UnityGymException( "The environment was launched as a single-agent environment, however" "there is more than one agent in the scene.") if self._n_agents is None: self._n_agents = n_agents logger.info("{} agents within environment.".format(n_agents)) elif self._n_agents != n_agents: raise UnityGymException( "The number of agents in the environment has changed since " "initialization. This is not supported.") @property def metadata(self): return {'render.modes': ['rgb_array']} @property def reward_range(self): return -float('inf'), float('inf') @property def spec(self): return None @property def action_space(self): return self._action_space @property def observation_space(self): return self._observation_space @property def number_agents(self): return self._n_agents
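# A minimal retro-mode rollout with the ObstacleTowerEnv wrapper above; the
# build path is a placeholder for a local Obstacle Tower executable.
env = ObstacleTowerEnv("./ObstacleTower/obstacletower", worker_id=0, retro=True)
env.seed(5)    # fix the tower layout on the next reset (valid range [0, 100))
env.floor(0)   # start from the bottom floor
obs = env.reset()  # 84x84x3 uint8 image in retro mode
done = False
while not done:
    action = env.action_space.sample()  # flattened Discrete action in retro mode
    obs, reward, done, info = env.step(action)
env.close()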
def run(options, runLog, minimumAcceptableFitness=None): isFunctionInValidationMode = \ isinstance(minimumAcceptableFitness, float) runLog.Append("This is run.py -> script for running pretrained models!") locationOfPretrainedModel = options["--model"] resultsRepository = TrainingResultsRepository() bestAgent = resultsRepository.LoadBestModel(locationOfPretrainedModel) if bestAgent is None: runLog.Append("run.run() error: Cannot load model, location " \ "'training_results/{0}' does not exist!".format( locationOfPretrainedModel)) exit() runLog.Append("Run model from 'training_results/{0}'!".format( locationOfPretrainedModel)) pathToEnv = options["--env-path"] env = UnityEnvironment(file_name=pathToEnv) if pathToEnv is None: runLog.Append("Established connection with Unity Editor!") else: runLog.Append("Established connection with Unity build '{0}'!" \ .format(pathToEnv)) del pathToEnv brainName = env.brain_names[0] if isFunctionInValidationMode: fitness = 0.0 shouldRunBeExecuted = True try: while shouldRunBeExecuted: envInfo = env.reset(train_mode=False)[brainName] inputData = envInfo.vector_observations.tolist() inputData = inputData[0][0:-1] while shouldRunBeExecuted: outputData = bestAgent.forward(inputData) envInfo = env.step([outputData])[brainName] inputData = envInfo.vector_observations.tolist() if isFunctionInValidationMode: episodeReward = inputData[0][-1] if episodeReward > fitness: fitness = episodeReward inputData = inputData[0][:-1] if envInfo.local_done[0]: if isFunctionInValidationMode: shouldRunBeExecuted = False break except KeyboardInterrupt: runLog.Append("\nRun interrupted because of KeyboardInterrupt!") runLog.Append("End of run!") env.close() runLog.Append("Closed Unity environment.") if isFunctionInValidationMode: return fitness >= minimumAcceptableFitness else: return False
def unity_run(default_args, share_args, options, max_step, max_episode, save_frequency, name): from mlagents.envs import UnityEnvironment from utils.sampler import create_sampler_manager try: tf_version, (model, policy_mode, _) = get_model_info(options['--algorithm']) algorithm_config = sth.load_config( f'./Algorithms/{tf_version}/config.yaml')[options['--algorithm']] ma = options['--algorithm'][:3] == 'ma_' except KeyError: raise NotImplementedError reset_config = default_args['reset_config'] if options['--unity']: env = UnityEnvironment() env_name = 'unity' else: file_name = default_args['exe_file'] if options[ '--env'] == 'None' else options['--env'] if os.path.exists(file_name): env = UnityEnvironment(file_name=file_name, base_port=int(options['--port']), no_graphics=False if options['--inference'] else not options['--graphic']) env_dir = os.path.split(file_name)[0] env_name = os.path.join(*env_dir.replace('\\', '/').replace( r'//', r'/').split('/')[-2:]) sys.path.append(env_dir) if os.path.exists(env_dir + '/env_config.py'): import env_config reset_config = env_config.reset_config max_step = env_config.max_step if os.path.exists(env_dir + '/env_loop.py'): from env_loop import Loop else: raise Exception('can not find this file.') sampler_manager, resampling_interval = create_sampler_manager( options['--sampler'], env.reset_parameters) if 'Loop' not in locals().keys(): if ma: from ma_loop import Loop else: from loop import Loop if options['--config-file'] != 'None': algorithm_config = update_config(algorithm_config, options['--config-file']) _base_dir = os.path.join(share_args['base_dir'], env_name, options['--algorithm']) base_dir = os.path.join(_base_dir, name) show_config(algorithm_config) brain_names = env.external_brain_names brains = env.brains brain_num = len(brain_names) visual_resolutions = {} for i in brain_names: if brains[i].number_visual_observations: visual_resolutions[f'{i}'] = [ brains[i].camera_resolutions[0]['height'], brains[i].camera_resolutions[0]['width'], 1 if brains[i].camera_resolutions[0]['blackAndWhite'] else 3 ] else: visual_resolutions[f'{i}'] = [] model_params = [{ 's_dim': brains[i].vector_observation_space_size * brains[i].num_stacked_vector_observations, 'a_dim_or_list': brains[i].vector_action_space_size, 'action_type': brains[i].vector_action_space_type, 'max_episode': max_episode, 'base_dir': os.path.join(base_dir, i), 'logger2file': share_args['logger2file'], 'out_graph': share_args['out_graph'], } for i in brain_names] if ma: assert brain_num > 1, 'if using ma* algorithms, number of brains must larger than 1' data = ExperienceReplay(share_args['ma']['batch_size'], share_args['ma']['capacity']) extra_params = {'data': data} models = [ model(n=brain_num, i=i, **model_params[i], **algorithm_config) for i in range(brain_num) ] else: extra_params = {} models = [ model(visual_sources=brains[i].number_visual_observations, visual_resolution=visual_resolutions[f'{i}'], **model_params[index], **algorithm_config) for index, i in enumerate(brain_names) ] [ models[index].init_or_restore( os.path.join( _base_dir, name if options['--load'] == 'None' else options['--load'], i)) for index, i in enumerate(brain_names) ] begin_episode = models[0].get_init_episode() params = { 'env': env, 'brain_names': brain_names, 'models': models, 'begin_episode': begin_episode, 'save_frequency': save_frequency, 'reset_config': reset_config, 'max_step': max_step, 'max_episode': max_episode, 'sampler_manager': sampler_manager, 'resampling_interval': resampling_interval, 
'policy_mode': policy_mode } if 'batch_size' in algorithm_config.keys() and options['--fill-in']: steps = algorithm_config['batch_size'] else: steps = default_args['no_op_steps'] no_op_params = { 'env': env, 'brain_names': brain_names, 'models': models, 'brains': brains, 'steps': steps, 'choose': options['--noop-choose'] } params.update(extra_params) no_op_params.update(extra_params) if options['--inference']: Loop.inference(env, brain_names, models, reset_config=reset_config, sampler_manager=sampler_manager, resampling_interval=resampling_interval) else: try: [ sth.save_config(os.path.join(base_dir, i, 'config'), algorithm_config) for i in brain_names ] Loop.no_op(**no_op_params) Loop.train(**params) except Exception as e: print(e) finally: try: [models[i].close() for i in range(len(models))] except Exception as e: print(e) finally: env.close() sys.exit()
    # (Over-) Print current average score
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score),
          end="")

    # Print average score every scores_average_window episodes
    if i_episode % scores_average_window == 0:
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))

    # Check to see if the task is solved (i.e., average_score > solved_score).
    # If yes, save the network weights and scores and end training.
    if average_score >= solved_score:
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
            i_episode, average_score))

        # Save trained neural network weights
        timestr = time.strftime("%Y%m%d-%H%M%S")
        nn_filename = "dqnAgent_Trained_Model_" + timestr + ".pth"
        torch.save(agent.network.state_dict(), nn_filename)

        # Save the recorded scores data
        scores_filename = "dqnAgent_scores_" + timestr + ".csv"
        np.savetxt(scores_filename, scores, delimiter=",")
        break

env.close()

# END :)
#############
def train_de(options, trainingLog, dataCollector=None): isTrainInExperimentMode = isinstance(dataCollector, ExperimentDataCollector) trainingLog.Append( "This is train_de.py -> Differential Evolution training!") if options["--track-1"]: trackNumber = 1 elif options["--track-2"]: trackNumber = 2 elif options["--track-3"]: trackNumber = 3 trainingLog.Append("Training on RaceTrack_{0}.".format(trackNumber)) # --- Load config data from file --- # pathToConfigFile = options["<config-file-path>"] CONFIG_DATA = loadConfigData(pathToConfigFile) trainingLog.Append( "Config data has been loaded from file: {0}".format(pathToConfigFile)) del pathToConfigFile # --- Set random seed --- # TRAINING_PARAMS = CONFIG_DATA["TrainingParameters"] RANDOM_SEED = TRAINING_PARAMS["randomSeed"] if isinstance(RANDOM_SEED, int): random.seed(RANDOM_SEED) torch.manual_seed(RANDOM_SEED) trainingLog.Append("Random seed set to value: {0}".format(RANDOM_SEED)) # --- Establish connection with Unity environment --- # pathToEnv = options["--env-path"] env = UnityEnvironment(file_name=pathToEnv) if pathToEnv is None: trainingLog.Append("Established connection with Unity Editor!") else: trainingLog.Append("Established connection with Unity build '{0}'!" \ .format(pathToEnv)) del pathToEnv # --- Get info from Unity environment --- # brainName = env.brain_names[0] trainingLog.Append("Brain name: {0}".format(brainName)) brain = env.brains[brainName] brainInfo = env.reset(train_mode=True)[brainName] observationSize = brainInfo.vector_observations.shape[1] actionSize = brain.vector_action_space_size[0] trainingLog.Append( "Loaded from Unity environment: observationSize = {0}, " \ "actionSize = {1}".format(observationSize, actionSize)) # --- Compute agent dimensions -- # HIDDEN_DIMENSIONS = TRAINING_PARAMS["networkHiddenDimensions"] agentDimensions = [observationSize - 1] + HIDDEN_DIMENSIONS + [actionSize] trainingLog.Append("Computed agentDimensions: {0}".format(agentDimensions)) # --- Create population ---- # locationForPretrainedPopulation = options["--population"] DIFF_EVO_PARAMS = CONFIG_DATA["LearningAlgorithms"]["diff_evo"] NUM_OF_AGENTS = DIFF_EVO_PARAMS["numberOfAgents"] resultsRepository = TrainingResultsRepository(trainingLog) try: MUTATION_FACTOR = DIFF_EVO_PARAMS["mutationFactor"] CROSS_PROBABILITY = DIFF_EVO_PARAMS["crossProbability"] NUM_OF_PARAMS = computeNumOfParameters(agentDimensions) trackName = "RaceTrack_{0}".format(trackNumber) MINIMAL_ACCEPTABLE_FITNESS = \ TRAINING_PARAMS["minimalAcceptableFitness"][trackName] del trackName fitnessEvaluation = AgentFitnessEvaluator(env, brainName) MAX_EPISODES_NUMBER = TRAINING_PARAMS["maxNumberOfEpisodes"] MAX_REPEATS_NUMBER = TRAINING_PARAMS[ "maxNumberOfRepeatsIfTrainingFails"] trainingLog.Append( "Start training with parameters: MAX_EPISODES_NUMBER = {0}, " \ "MAX_REPEATS_NUMBER = {1}, MUTATION_FACTOR = {2}, " \ "CROSS_PROBABILITY = {3}, NUM_OF_PARAMS = {4}, " \ "MINIMAL_ACCEPTABLE_FITNESS = {5}, fitnessFunction = {6}".format( MAX_EPISODES_NUMBER, MAX_REPEATS_NUMBER, MUTATION_FACTOR, CROSS_PROBABILITY, NUM_OF_PARAMS, MINIMAL_ACCEPTABLE_FITNESS, type(fitnessEvaluation))) shouldContinueTraining = True for repeatCounter in range(MAX_REPEATS_NUMBER): if not shouldContinueTraining: break if locationForPretrainedPopulation is None: population = [ AgentNeuralNetwork(agentDimensions) \ for _ in range(NUM_OF_AGENTS) ] trainingLog.Append("Created new population, with parameters: " \ "NUM_OF_AGENTS = {0}, agentDimensions = {1}, ".format( NUM_OF_AGENTS, agentDimensions)) else: 
population = \ resultsRepository.LoadPopulation( locationForPretrainedPopulation) if population is None: env.close() exit() fitnessList = [] if isTrainInExperimentMode: bestFitnessSequence = [] meanFitnessSequence = [] stdevFitnessSequence = [] searchCounter = NUM_OF_AGENTS timeOfBegin = time.time() for agentIndex in range(NUM_OF_AGENTS): agentFitness = fitnessEvaluation(population[agentIndex]) fitnessList.append(agentFitness) bestFitness = max(fitnessList) indexOfBestFitness = fitnessList.index(bestFitness) bestAgent = deepcopy(population[indexOfBestFitness]) meanFitness = statistics.mean(fitnessList) stdDevFitness = statistics.stdev(fitnessList) if isTrainInExperimentMode: bestFitnessSequence.append(bestFitness), meanFitnessSequence.append(meanFitness) stdevFitnessSequence.append(stdDevFitness) pop_denorm = retrieveParametersFromAgentList(population) pop_norm = pop_denorm / 4 + 0.5 for episodeCounter in range(MAX_EPISODES_NUMBER): for j in range(NUM_OF_AGENTS): if isTrainInExperimentMode: searchCounter += 1 indices = [ index for index in range(NUM_OF_AGENTS) if index != j ] a_idx, b_idx, c_idx = random.sample(indices, 3) a, b, c = pop_norm[a_idx], pop_norm[b_idx], pop_norm[c_idx] mutant = torch.clamp(a + MUTATION_FACTOR * (b - c), 0.0, 1.0) cross_points = [ random.uniform(0, 1) for _ in range(NUM_OF_PARAMS) ] trial_norm = torch.zeros(NUM_OF_PARAMS) for k in range(NUM_OF_PARAMS): if cross_points[k] < CROSS_PROBABILITY: trial_norm[k] = mutant[k] else: trial_norm[k] = pop_norm[j][k] trial_denorm = trial_norm * 4 - 2 setNewParametersOnAgent(population[j], trial_denorm) agentIndex = j fitness_trial = fitnessEvaluation(population[agentIndex]) if fitness_trial > fitnessList[j]: fitnessList[j] = fitness_trial pop_denorm[j] = trial_denorm pop_norm[j] = trial_norm if fitness_trial > bestFitness: bestFitness = fitness_trial bestAgent = deepcopy(population[j]) else: setNewParametersOnAgent(population[j], pop_denorm[j]) meanFitness = statistics.mean(fitnessList) stdDevFitness = statistics.stdev(fitnessList) if isTrainInExperimentMode: bestFitnessSequence.append(bestFitness), meanFitnessSequence.append(meanFitness) stdevFitnessSequence.append(stdDevFitness) trainingLog.Append( "Episode {0}: best = {1}, mean = {2}, stdDev = {3}".format( episodeCounter, bestFitness, meanFitness, stdDevFitness)) if bestFitness >= MINIMAL_ACCEPTABLE_FITNESS: trainingLog.Append( "Training interrupted after {0} episodes, reason: " \ "reached minimal acceptable value for bestFitness!" 
\ " (minimalAcceptableFitness = {1}, bestFitness = {2})" \ .format( episodeCounter + 1, MINIMAL_ACCEPTABLE_FITNESS, bestFitness)) if isTrainInExperimentMode: timeOfEnd = time.time() trainingTime = timeOfEnd - timeOfBegin dataCollector.AppendBestFitnessSequence( trackNumber, "DE", bestFitnessSequence) dataCollector.AppendMeanFitnessSequence( trackNumber, "DE", meanFitnessSequence) dataCollector.AppendStdevFitnessSequence( trackNumber, "DE", stdevFitnessSequence) dataCollector.AddTimeInSecondsFromTraining( trackNumber, "DE", trainingTime) trainingLog.Append("Training time in seconds: {0}" \ .format(trainingTime)) dataCollector.AddTimeInEpisodesFromTraining( trackNumber, "DE", episodeCounter + 1) trainingLog.Append("Training time in episodes: {0}" \ .format(episodeCounter + 1)) dataCollector.AddToSearchCounter( trackNumber, "DE", searchCounter) trainingLog.Append( "searchCounter = {0}".format(searchCounter)) shouldContinueTraining = False break if episodeCounter >= (MAX_EPISODES_NUMBER - 1): message = "Cannot train population in current repeat " \ "(MAX_EPISODES_NUMBER = {0}, repeatCounter = {1})!" \ .format(MAX_EPISODES_NUMBER, repeatCounter) if repeatCounter < MAX_REPEATS_NUMBER - 1: message += " Try again to train population!" else: message += " Unfortunately, cannot try again. " \ "Reason: achieved maximum number of repeats!" trainingLog.Append(message) except KeyboardInterrupt: trainingLog.Append( "\nTraining interrupted because of KeyboardInterrupt!") trainingLog.Append("End of training!") # --- Close environment --- # env.close() trainingLog.Append("Closed Unity environment.") # --- Save training results --- # shouldSavePopulation = options["--save-population"] resultsRepository.Save(population, bestAgent, shouldSavePopulation) if isTrainInExperimentMode: dataCollector.PathToLastSavedModel = \ resultsRepository._pathToLastSavedModel
class Game: # set up unity ml agent environment def __init__(self): self.loadEnv(0) def loadEnv(self, wid): # load env env_name = ENV_LOCATION self.env = UnityEnvironment(env_name, worker_id=wid) # Set the default brain to work with self.default_brain = self.env.brain_names[0] self.brain = self.env.brains[self.default_brain] # Reset the environment - train mode enabled env_info = self.env.reset(train_mode=True)[self.default_brain] # this frogger game action space is 5, actions[0] = selected action (action = [[1]]) # actions # 1 - up, 2 - down , 3- left , 4 -right , 0 - do nothing def performAction(self, actionValue, numberOfFrames=STACK_SIZE): action = [[0]] action[0] = actionValue terminal = False # indication of terminal state size = (IMAGE_HEIGTH, IMAGE_WIDTH, numberOfFrames ) # create list to keep frames stack = np.zeros(size) reward = 0 # rewards for all the frames # first frame after action env_info = self.env.step(action)[ self.default_brain] # send action to brain reward = round(env_info.rewards[0], 5) # get reward newState = env_info.visual_observations[0][ 0] # get state visual observation newStateGray = skimage.color.rgb2gray(newState) # covert to gray scale newStateGray = skimage.transform.resize(newStateGray, (IMAGE_HEIGTH, IMAGE_WIDTH)) # check terminal reached if reward == -1 or reward == -2: terminal = True # add the state to the 0 th position stack[:, :, 0] = newStateGray # get stack of frames after the action for i in range(1, numberOfFrames): env_info = self.env.step( )[self. default_brain] # change environment to next step without action st = env_info.visual_observations[0][0] stGray = skimage.color.rgb2gray(st) stGray = skimage.transform.resize(stGray, (IMAGE_HEIGTH, IMAGE_WIDTH)) stack[:, :, i] = stGray # if terminal only consider the reward for terminal if env_info.rewards[0] == -1 or env_info.rewards[0] == -2: terminal = True reward = round(env_info.rewards[0], 5) elif not terminal: # if it got a positive reward for move up let it have it if reward < 0: reward = round(env_info.rewards[0], 5) # get reward # reshape for Keras # noinspection PyArgumentList stack = stack.reshape(1, stack.shape[0], stack.shape[1], stack.shape[2]) # 1*100*100*4 return reward, stack, terminal # close environment def close(self): self.env.close() def reset(self): self.close() self.loadEnv(0)
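# A brief sketch of using the Game wrapper above in a DQN-style loop. The
# constants (ENV_LOCATION, STACK_SIZE, image sizes) come from the surrounding
# module, and the random policy here is a placeholder.
import random

game = Game()  # launches the Unity build at ENV_LOCATION
total_reward = 0.0
for step in range(1000):
    # Placeholder policy: pick one of the 5 Frogger actions at random
    # (0 = do nothing, 1 = up, 2 = down, 3 = left, 4 = right).
    action_value = [random.randint(0, 4)]
    reward, state_stack, terminal = game.performAction(action_value)
    total_reward += reward
    if terminal:
        game.reset()  # closes and relaunches the environment
game.close()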
class UnityEnv(gym.Env): """ Provides Gym wrapper for Unity Learning Environments. Multi-agent environments use lists for object types, as done here: https://github.com/openai/multiagent-particle-envs """ def __init__(self, environment_filename: str, worker_id=0, use_visual=False, multiagent=False): """ Environment initialization :param environment_filename: The UnityEnvironment path or file to be wrapped in the gym. :param worker_id: Worker number for environment. :param use_visual: Whether to use visual observation or vector observation. :param multiagent: Whether to run in multi-agent mode (lists of obs, reward, done). """ self._env = UnityEnvironment(environment_filename, worker_id) self.name = self._env.academy_name self.visual_obs = None self._current_state = None self._n_agents = None self._multiagent = multiagent # Check brain configuration if len(self._env.brains) != 1: raise UnityGymException( "There can only be one brain in a UnityEnvironment " "if it is wrapped in a gym.") self.brain_name = self._env.external_brain_names[0] brain = self._env.brains[self.brain_name] if use_visual and brain.number_visual_observations == 0: raise UnityGymException("`use_visual` was set to True, however there are no" " visual observations as part of this environment.") self.use_visual = brain.number_visual_observations == 1 and use_visual if brain.num_stacked_vector_observations != 1: raise UnityGymException( "There can only be one stacked vector observation in a UnityEnvironment " "if it is wrapped in a gym.") # Check for number of agents in scene. initial_info = self._env.reset()[self.brain_name] self._check_agents(len(initial_info.agents)) # Set observation and action spaces if brain.vector_action_space_type == "discrete": if len(brain.vector_action_space_size) == 1: self._action_space = spaces.Discrete(brain.vector_action_space_size[0]) else: self._action_space = spaces.MultiDiscrete(brain.vector_action_space_size) else: high = np.array([1] * brain.vector_action_space_size[0]) self._action_space = spaces.Box(-high, high, dtype=np.float32) high = np.array([np.inf] * brain.vector_observation_space_size) self.action_meanings = brain.vector_action_descriptions if self.use_visual: if brain.camera_resolutions[0]["blackAndWhite"]: depth = 1 else: depth = 3 self._observation_space = spaces.Box(0, 1, dtype=np.float32, shape=(brain.camera_resolutions[0]["height"], brain.camera_resolutions[0]["width"], depth)) else: self._observation_space = spaces.Box(-high, high, dtype=np.float32) def reset(self): """Resets the state of the environment and returns an initial observation. In the case of multi-agent environments, this is a list. Returns: observation (object/list): the initial observation of the space. """ info = self._env.reset()[self.brain_name] n_agents = len(info.agents) self._check_agents(n_agents) if not self._multiagent: obs, reward, done, info = self._single_step(info) else: obs, reward, done, info = self._multi_step(info) return obs def step(self, action): """Run one timestep of the environment's dynamics. When end of episode is reached, you are responsible for calling `reset()` to reset this environment's state. Accepts an action and returns a tuple (observation, reward, done, info). In the case of multi-agent environments, these are lists. 
        Args:
            action (object/list): an action provided by the agent
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list): amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information, including BrainInfo.
        """
        if self._multiagent:
            if not isinstance(action, list):
                raise UnityGymException("The environment was expecting `action` to be a list.")
            if len(action) != self._n_agents:
                raise UnityGymException(
                    "The environment was expecting a list of {} actions.".format(self._n_agents))
        else:
            action = np.array(action)

        info = self._env.step(action)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self._current_state = info

        if not self._multiagent:
            obs, reward, done, info = self._single_step(info)
        else:
            obs, reward, done, info = self._multi_step(info)
        return obs, reward, done, info

    def _single_step(self, info):
        if self.use_visual:
            self.visual_obs = info.visual_observations[0][0, :, :, :]
            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations[0, :]
        return (default_observation, info.rewards[0], info.local_done[0],
                {"text_observation": info.text_observations[0], "brain_info": info})

    def _multi_step(self, info):
        if self.use_visual:
            self.visual_obs = info.visual_observations
            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations
        return (list(default_observation), info.rewards, info.local_done,
                {"text_observation": info.text_observations, "brain_info": info})

    def render(self, mode='rgb_array'):
        return self.visual_obs

    def close(self):
        """Override _close in your subclass to perform any necessary cleanup.
        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        self._env.close()

    def get_action_meanings(self):
        return self.action_meanings

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        Currently not implemented.
        """
        logger.warn("Could not seed environment %s", self.name)
        return

    def _check_agents(self, n_agents):
        if not self._multiagent and n_agents > 1:
            raise UnityGymException("The environment was launched as a single-agent environment, "
                                    "however there is more than one agent in the scene.")
        elif self._multiagent and n_agents <= 1:
            raise UnityGymException("The environment was launched as a multi-agent environment, "
                                    "however there is only one agent in the scene.")
        if self._n_agents is None:
            self._n_agents = n_agents
            logger.info("{} agents within environment.".format(n_agents))
        elif self._n_agents != n_agents:
            raise UnityGymException("The number of agents in the environment has changed since "
                                    "initialization. This is not supported.")

    @property
    def metadata(self):
        return {'render.modes': ['rgb_array']}

    @property
    def reward_range(self):
        return -float('inf'), float('inf')

    @property
    def spec(self):
        return None

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def number_agents(self):
        return self._n_agents
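# --- Usage sketch (illustrative) ---
# A minimal loop driving the gym wrapper above, assuming a Unity build at the
# hypothetical path "./builds/3DBall" and a single (non-visual) agent; the
# random policy comes from the wrapped action_space.
env = UnityEnv("./builds/3DBall", worker_id=1, use_visual=False)
obs = env.reset()
for _ in range(100):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()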
def train_wrapper(env_config, wrapper_config):
    """
    Set the Training Parameters
    :param env_config: dictionary, used to pass parameters into the environment
    :param wrapper_config: dictionary of user-defined variables.
    """
    # num_episodes (int): maximum number of training episodes
    num_episodes = wrapper_config['num_episodes']
    # scores_average_window (int): the window size employed for calculating the average score
    scores_average_window = wrapper_config['scores_avg_window']
    # solved_score (float): the average score required for the environment to be considered solved
    solved_score = wrapper_config['solved_score']
    # load_weights (bool): whether or not to start training with loaded weights
    load_weights = wrapper_config['load_weights']
    # weights_path: path to the directory containing the weights (same directory to save them)
    weights_path = wrapper_config['weights_path']
    if load_weights and not os.path.isdir(weights_path):
        print('weights dir does not exist')
        raise NotADirectoryError
    # save_mem (bool): whether or not to save memory
    save_mem = wrapper_config['save_mem']
    # load_mem (bool): whether or not to continue training with loaded memory
    load_mem = wrapper_config['load_mem']
    # mem_path: path to the directory containing the memory to load
    mem_path = wrapper_config['mem_path']
    if load_mem and not os.path.isdir(mem_path):
        print('mem dir does not exist')
        raise NotADirectoryError
    # build_path: path to the build of the unity environment.
    build_path = None if wrapper_config['build'] == 'None' else wrapper_config['build']
    if (build_path is not None) and (not os.path.isfile(build_path)):
        print('--build is not a valid path')
        raise FileNotFoundError
    # no_graphics (bool): whether or not to start the environment without graphics (default = True in training)
    no_graphics_in = not wrapper_config['show_graphics']
    # agent_type (DDPG | MDDPG | MADDPG)
    agent_type = wrapper_config['agent']
    if not issubclass(agent_type, AgentABC):
        print('invalid agent type')
        raise TypeError
    # print_agent_loss (bool): whether or not to print the agent's loss (MSE for the critic) after every episode
    print_agent_loss = wrapper_config['print_agent_loss']
    # save_log (bool): whether or not to save the episode scores (csv format, default is True)
    save_log = wrapper_config['save_score_log']
    # save_best_weights (bool): also save the best weights of the session (by average score)
    save_best_weights = wrapper_config['save_best_weights']
    # episode_scores (list of float): records the score obtained in each episode
    episode_scores = []

    """
    Start the Unity Environment
    """
    env = UnityEnvironment(file_name=build_path, no_graphics=no_graphics_in)

    """
    Get The Unity Environment Brain
    Unity ML-Agents applications or Environments contain "BRAINS", which are responsible for
    deciding the actions an agent or set of agents should take given a current set of
    environment (state) observations. The Race environment has a single Brain, thus we just
    need to access the first brain available (i.e., the default brain). We then set the
    default brain as the brain that will be controlled.
    """
    # Get the default brain
    brain_name = env.brain_names[0]
    # Assign the default brain as the brain to be controlled
    brain = env.brains[brain_name]

    """
    Determine the size of the Action and State Spaces and the Number of Agents.
    The observation space consists of variables corresponding to ray casts in different
    directions, velocity and direction. Each action is a vector with 2 numbers, corresponding
    to steer left/right and brake/drive (in this order); each action component is a number
    between -1 and 1.
    num_agents corresponds to the number of agents using the same brain (since all cars use
    the same action / observation space, they all use the same brain). If, in the future,
    different cars should use different observation spaces, they would need to be split
    into different brains.
    """
    # Set the number of actions or action size
    action_size = brain.vector_action_space_size
    # Set the size of state observations or state size
    state_size = brain.vector_observation_space_size
    # Get number of agents in Environment
    env_info = env.reset(train_mode=True, config=env_config)[brain_name]
    num_agents = len(env_info.agents)
    print('\nNumber of Agents: ', num_agents)

    """
    Create an Agent from the Agent Class in Agent.py
    Every agent is initialized with the following parameters.
    ======
    state_size (int): dimension of each state (required)
    action_size (int): dimension of each action (required)
    num_agents (int): number of agents in the unity environment
    seed (int): random seed for initializing training point (default = 0)
    Here we initialize an agent using the Unity environment's state and action sizes and the
    number of agents determined above.
    """
    agent: AgentABC = agent_type(state_size=state_size, action_size=action_size[0],
                                 num_agents=num_agents, random_seed=0)

    # Load trained model weights
    if load_weights:
        agent.load_weights(weights_path)
    if load_mem:
        agent.load_mem(mem_path)

    """
    ###################################
    STEP 6: Run the Training Sequence
    The Training Process involves the agent learning from repeated episodes of behaviour
    to map states to actions that maximize rewards received via environmental interaction.
    The agent training process involves the following:
    (1) Reset the environment at the beginning of each episode.
    (2) Obtain (observe) the current state, s(t), of the environment at time t.
    (3) Perform an action, a(t), in the environment given s(t).
    (4) Observe the result of the action in terms of the reward received and the state of
        the environment at time t+1 (i.e., s(t+1)).
    (5) Update agent memory and learn from experience (i.e., agent.step).
    (6) Update the episode score (total reward received) and set s(t) -> s(t+1).
    (7) If the episode is done, break and repeat from (1); otherwise repeat from (3).
    Below we also exit the training process early if the environment is solved, that is,
    if the average score over the previous scores_average_window episodes exceeds solved_score.
    """
    best_score = -np.inf  # best average score so far (for saving best_weights)

    # loop over num_episodes
    for i_episode in range(1, num_episodes + 1):
        # reset the unity environment at the beginning of each episode
        env_info = env.reset(train_mode=True, config=env_config)[brain_name]
        # get the initial state of the unity environment
        states = env_info.vector_observations
        # reset the training agent for a new episode
        agent.reset()
        # set the initial episode score to zero.
        agent_scores = np.zeros(num_agents)

        # Run the episode training loop;
        # At each loop step take an action as a function of the current state observations.
        # Based on the resultant environment state (next_state) and reward received, update the agent ('step' method).
        # If the environment episode is done, exit the loop...
        # Otherwise repeat until done == True
        steps = 0
        while True:
            steps = steps + 1
            # determine actions for the unity agents from the current state
            actions = agent.act(states)
            # send the actions to the unity agents in the environment and receive the resultant environment information
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get the next states for each unity agent in the environment
            rewards = env_info.rewards  # get the rewards for each unity agent in the environment
            dones = env_info.local_done  # see if the episode has finished for each unity agent in the environment
            # Send (S, A, R, S') info to the training agent for the replay buffer (memory) and network updates
            agent.step(states, actions, rewards, next_states, dones)
            # set new states to current states for determining next actions
            states = next_states
            # Update the episode score for each unity agent
            agent_scores += rewards
            # If any unity agent indicates that the episode is done,
            # then exit the episode loop to begin a new episode
            if np.any(dones):
                break

        # Add the episode score to episode_scores and
        # calculate the mean score over the last scores_average_window episodes
        # (over all episodes so far while i_episode < scores_average_window)
        episode_scores.append(np.mean(agent_scores))
        average_score = np.mean(
            episode_scores[i_episode - min(i_episode, scores_average_window):i_episode + 1])

        # Print the current and average score, and the number of steps in the episode.
        print('\nEpisode {}\tEpisode Score: {:.3f}\tAverage Score: {:.3f}\tNumber Of Steps: {}'
              .format(i_episode, episode_scores[i_episode - 1], average_score, steps), end="")
        if print_agent_loss:
            # print the agent's loss (useful for babysitting the training)
            print('\t episode loss: {}'.format(agent.debug_loss))

        if save_log:
            # Save the recorded scores data (in weights path)
            if not os.path.isdir(weights_path):
                os.mkdir(weights_path)
            scores_filename = "Agent_Scores.csv"
            # noinspection PyTypeChecker
            np.savetxt(os.path.join(weights_path, scores_filename), episode_scores, delimiter=",")

        # Save trained Actor and Critic network weights after each episode
        agent.save_weights(weights_path)
        if save_best_weights:
            if best_score < average_score:
                best_score = average_score
                agent.save_weights(weights_path + '_best')
        if save_mem and (i_episode % 50) == 0:
            agent.save_mem(mem_path)

        # Check whether the task is solved (i.e., average_score >= solved_score over the window).
        # If yes, save the network weights and scores and end training.
        if i_episode > scores_average_window * 2 and average_score >= solved_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'
                  .format(i_episode, average_score))
            break

    agent.save_mem(mem_path)

    """
    ###################################
    STEP 7: Everything is Finished -> Close the Environment.
    """
    env.close()
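# --- Example configuration (illustrative) ---
# A sample wrapper_config for train_wrapper. The keys mirror the lookups at
# the top of the function; the values, and the DDPG class standing in for an
# AgentABC subclass, are placeholders that depend on the surrounding project.
example_wrapper_config = {
    'num_episodes': 2000,
    'scores_avg_window': 100,
    'solved_score': 30.0,
    'load_weights': False,
    'weights_path': './weights',
    'save_mem': False,
    'load_mem': False,
    'mem_path': './memory',
    'build': 'None',            # the string 'None' means: attach to the Unity editor
    'show_graphics': False,
    'agent': DDPG,              # any AgentABC subclass from this project
    'print_agent_loss': False,
    'save_score_log': True,
    'save_best_weights': True,
}
train_wrapper(env_config={}, wrapper_config=example_wrapper_config)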
def main():
    # Initialize environment
    env = UnityEnvironment(file_name='../env/Hopper/Hopper')
    default_brain = env.brain_names[0]
    brain = env.brains[default_brain]
    env_info = env.reset(train_mode=True)[default_brain]

    obs_dim = env_info.vector_observations[0].shape[0]
    act_dim = brain.vector_action_space_size[0]
    print('State dimension:', obs_dim)
    print('Action dimension:', act_dim)

    # Set a random seed
    np.random.seed(0)
    torch.manual_seed(0)

    # Create a SummaryWriter object for TensorBoard
    dir_name = 'runs/' + 'Hopper' + '_' + time.ctime()
    writer = SummaryWriter(log_dir=dir_name)

    # Main networks
    actor = GaussianPolicy(obs_dim, act_dim).to(device)
    qf1 = FlattenMLP(obs_dim + act_dim, 1).to(device)
    qf2 = FlattenMLP(obs_dim + act_dim, 1).to(device)

    # Target networks
    qf1_target = FlattenMLP(obs_dim + act_dim, 1).to(device)
    qf2_target = FlattenMLP(obs_dim + act_dim, 1).to(device)

    # Initialize target parameters to match main parameters
    hard_target_update(qf1, qf1_target)
    hard_target_update(qf2, qf2_target)

    # Create optimizers
    actor_optimizer = optim.Adam(actor.parameters(), lr=args.actor_lr)
    qf1_optimizer = optim.Adam(qf1.parameters(), lr=args.qf_lr)
    qf2_optimizer = optim.Adam(qf2.parameters(), lr=args.qf_lr)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim, act_dim, args.buffer_size)

    # If automatic entropy tuning is enabled, initialize a target entropy,
    # a log alpha and an alpha optimizer
    if args.automatic_entropy_tuning:
        target_entropy = -np.prod((act_dim,)).item()
        log_alpha = torch.zeros(1, requires_grad=True, device=device)
        alpha_optimizer = optim.Adam([log_alpha], lr=args.alpha_lr)
    else:
        target_entropy = None
        log_alpha = None
        alpha_optimizer = None

    def run_one_episode(steps, eval_mode):
        total_reward = 0.
        env_info = env.reset(train_mode=True)[default_brain]
        obs = env_info.vector_observations[0]
        done = False

        # Keep interacting until the agent reaches a terminal state.
        while not done:
            steps += 1
            if eval_mode:
                # Act deterministically (mean action) during evaluation
                action, _, _ = actor(torch.Tensor(obs).to(device))
                action = action.detach().cpu().numpy()
                env_info = env.step(action)[default_brain]
                next_obs = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]
            else:
                # Collect experience (s, a, r, s') using the stochastic policy
                _, action, _ = actor(torch.Tensor(obs).to(device))
                action = action.detach().cpu().numpy()
                env_info = env.step(action)[default_brain]
                next_obs = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]

                # Add the experience to the replay buffer
                replay_buffer.add(obs, action, reward, next_obs, done)

                # Start training once the number of stored experiences exceeds the batch size
                if steps > args.batch_size:
                    batch = replay_buffer.sample(args.batch_size)
                    args.alpha = train_model(actor, qf1, qf2, qf1_target, qf2_target,
                                             actor_optimizer, qf1_optimizer, qf2_optimizer,
                                             batch, target_entropy, log_alpha, alpha_optimizer)

            total_reward += reward
            obs = next_obs
        return steps, total_reward, args.alpha

    train_sum_returns = 0.
    train_num_episodes = 0
    start_time = time.time()
    steps = 0

    for episode in range(1, args.training_eps + 1):
        # Perform the training phase, during which the agent learns
        eval_mode = False

        # Run one episode
        steps, train_episode_return, args.alpha = run_one_episode(steps, eval_mode)

        train_sum_returns += train_episode_return
        train_num_episodes += 1
        train_average_return = train_sum_returns / train_num_episodes if train_num_episodes > 0 else 0.0

        # Log experiment results for training episodes
        writer.add_scalar('Train/AverageReturns', train_average_return, episode)
        writer.add_scalar('Train/EpisodeReturns', train_episode_return, episode)
        if args.automatic_entropy_tuning:
            writer.add_scalar('Train/Alpha', args.alpha, episode)

        # Perform the evaluation phase -- no learning
        if episode > 0 and episode % args.eval_per_train == 0:
            eval_mode = True
            eval_sum_returns = 0.
            eval_num_episodes = 0

            for _ in range(args.evaluation_eps):
                # Run one episode
                steps, eval_episode_return, _ = run_one_episode(steps, eval_mode)

                eval_sum_returns += eval_episode_return
                eval_num_episodes += 1

            eval_average_return = eval_sum_returns / eval_num_episodes if eval_num_episodes > 0 else 0.0

            # Log experiment results for evaluation episodes
            writer.add_scalar('Eval/AverageReturns', eval_average_return, episode)
            writer.add_scalar('Eval/EpisodeReturns', eval_episode_return, episode)

            print('---------------------------------------')
            print('Episodes:', episode)
            print('AverageReturn:', round(train_average_return, 2))
            print('EvalEpisodes:', eval_num_episodes)
            print('EvalAverageReturn:', round(eval_average_return, 2))
            print('Time:', int(time.time() - start_time))
            print('---------------------------------------')

            # Save the trained model once it clears the return threshold, then stop
            if eval_average_return >= args.threshold_return:
                if not os.path.exists('./save_model'):
                    os.mkdir('./save_model')
                ckpt_path = os.path.join('./save_model',
                                         'Hopper' + '_ep_' + str(episode)
                                         + '_rt_' + str(round(eval_average_return, 2))
                                         + '_t_' + str(int(time.time() - start_time)) + '.pt')
                torch.save(actor.state_dict(), ckpt_path)
                break

    env.close()
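# --- Helper sketch (illustrative) ---
# hard_target_update is defined elsewhere in this project; a minimal version
# consistent with the calls above simply copies the main network's parameters
# into the target network. A soft (Polyak) variant of the same idea is what a
# train_model implementation would typically apply after each gradient step.
# Both sketches assume torch is already imported, as in the script above.
def hard_target_update(main, target):
    # target <- main (exact copy of all parameters and buffers)
    target.load_state_dict(main.state_dict())

def soft_target_update(main, target, tau=0.005):
    # target <- tau * main + (1 - tau) * target (Polyak averaging)
    with torch.no_grad():
        for p, p_targ in zip(main.parameters(), target.parameters()):
            p_targ.mul_(1.0 - tau)
            p_targ.add_(tau * p)

if __name__ == '__main__':
    main()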