def test_ppo_model_cc_vector_rnn(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            memory_size = 128
            model = PPOModel(env.brains["RealFakeBrain"],
                             use_recurrent=True, m_size=memory_size)
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output, model.all_log_probs, model.value,
                model.entropy, model.learning_rate, model.memory_out
            ]
            feed_dict = {
                model.batch_size: 1,
                model.sequence_length: 2,
                model.memory_in: np.zeros((1, memory_size)),
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.epsilon: np.array([[0, 1]])
            }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_step(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0)
    env = UnityEnvironment(' ')
    brain = env.brains['RealFakeBrain']
    brain_info = env.reset()
    brain_info = env.step([0] * brain.vector_action_space_size[0] *
                          len(brain_info['RealFakeBrain'].agents))
    with pytest.raises(UnityActionException):
        env.step([0])
    brain_info = env.step([-1] * brain.vector_action_space_size[0] *
                          len(brain_info['RealFakeBrain'].agents))
    with pytest.raises(UnityActionException):
        env.step([0] * brain.vector_action_space_size[0] *
                 len(brain_info['RealFakeBrain'].agents))
    env.close()

    assert env.global_done
    assert isinstance(brain_info, dict)
    assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
    assert isinstance(brain_info['RealFakeBrain'].visual_observations, list)
    assert isinstance(brain_info['RealFakeBrain'].vector_observations, np.ndarray)
    assert len(brain_info['RealFakeBrain'].visual_observations) == \
        brain.number_visual_observations
    assert len(brain_info['RealFakeBrain'].vector_observations) == \
        len(brain_info['RealFakeBrain'].agents)
    assert len(brain_info['RealFakeBrain'].vector_observations[0]) == \
        brain.vector_observation_space_size * brain.num_stacked_vector_observations
    print("\n\n\n\n\n\n\n" + str(brain_info['RealFakeBrain'].local_done))
    assert not brain_info['RealFakeBrain'].local_done[0]
    assert brain_info['RealFakeBrain'].local_done[2]
def test_ppo_get_value_estimates(mock_communicator, mock_launcher, dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0
    )
    env = UnityEnvironment(" ")
    brain_infos = env.reset()
    brain_info = brain_infos[env.brain_names[0]]

    trainer_parameters = dummy_config
    model_path = env.brain_names[0]
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    policy = PPOPolicy(
        0, env.brains[env.brain_names[0]], trainer_parameters, False, False
    )
    run_out = policy.get_value_estimates(brain_info, 0, done=False)
    for key, val in run_out.items():
        assert type(key) is str
        assert type(val) is float

    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    policy.reward_signals["extrinsic"].use_terminal_states = False
    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val != 0.0

    env.close()
def test_ppo_model_dc_vector_curio(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=True, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True)
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output, model.all_log_probs, model.value,
                model.entropy, model.learning_rate, model.intrinsic_reward
            ]
            feed_dict = {
                model.batch_size: 2,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.next_vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                [3, 4, 5, 3, 4, 5]]),
                model.action_holder: [[0], [0]],
                model.action_masks: np.ones([2, 2])
            }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_ppo_model_cc_visual(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=2)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output, model.log_probs, model.value,
                model.entropy, model.learning_rate
            ]
            feed_dict = {
                model.batch_size: 2,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.visual_in[0]: np.ones([2, 40, 30, 3]),
                model.visual_in[1]: np.ones([2, 40, 30, 3]),
                model.epsilon: np.array([[0, 1], [2, 3]])
            }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_unity(self):
    from mlagents.envs import UnityEnvironment

    env = UnityEnvironment(file_name='env/' + env_set['env_name'], worker_id=0)
    default_brain = env.brain_names[0]
    env_info = env.reset(config={'Mass': 1, 'Length': 1.5 * 3.0})[default_brain]
    states = env_info.vector_observations

    self.saver.restore(self.sess,
                       save_path='runs/td3_' + env_set['env_name'] + '/save')

    scores = np.zeros([self.worker_size])
    score = deque(maxlen=1000)
    for i in range(10000):
        actions = self.get_action(states, 0.05)
        env_info = env.step(actions)[default_brain]
        states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += rewards
        for idx, d in enumerate(dones):
            if d:
                score.append(scores[idx])
                scores[idx] = 0
        print('score : ', "{0:.2f}".format(np.mean(score)))
    env.close()
def test_close(mock_communicator, mock_launcher):
    comm = MockCommunicator(discrete_action=False, visual_inputs=0)
    mock_communicator.return_value = comm
    env = UnityEnvironment(' ')
    assert env._loaded
    env.close()
    assert not env._loaded
    assert comm.has_been_closed
def test_initialization(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0)
    env = UnityEnvironment(' ')
    with pytest.raises(UnityActionException):
        env.step([0])
    assert env.brain_names[0] == 'RealFakeBrain'
    env.close()
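# The tests above receive `mock_communicator` and `mock_launcher` as injected
# mocks rather than creating them. A minimal sketch of how such a test is
# typically wired up with unittest.mock; the patch targets and the import path
# of MockCommunicator are assumptions and differ between ml-agents releases.
from unittest import mock

from mlagents.envs import UnityEnvironment
from tests.mock_communicator import MockCommunicator  # assumed test-helper location


# mock.patch decorators apply bottom-up, so the bottom decorator feeds the
# first test argument (mock_communicator) and the top one feeds mock_launcher.
@mock.patch("mlagents.envs.UnityEnvironment.executable_launcher")
@mock.patch("mlagents.envs.UnityEnvironment.get_communicator")
def test_example(mock_communicator, mock_launcher):
    # The mocked communicator answers the handshake, so no Unity build is launched.
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0)
    env = UnityEnvironment(' ')
    assert env.brain_names[0] == 'RealFakeBrain'
    env.close()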
def main():
    env = UnityEnvironment(file_name='../env/Pong/Pong')
    default_brain = env.brain_names[0]
    brain = env.brains[default_brain]

    env_info = env.reset(train_mode=False)[default_brain]
    obs_dim = env_info.vector_observations[0].shape[0]
    act_num = brain.vector_action_space_size[0]

    mlp = MLP(obs_dim, act_num).to(device)

    if args.load is not None:
        pretrained_model_path = os.path.join('./save_model/' + str(args.load))
        pretrained_model = torch.load(pretrained_model_path)
        mlp.load_state_dict(pretrained_model)

    sum_returns = 0.
    num_episodes = 0

    for episode in range(1, 10001):
        total_reward = 0.
        obs = env_info.vector_observations[0]
        done = False

        while not done:
            action = mlp(torch.Tensor(obs).to(device)).argmax().detach().cpu().numpy()
            env_info = env.step(int(action))[default_brain]
            next_obs = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]

            total_reward += reward
            obs = next_obs

        sum_returns += total_reward
        num_episodes += 1
        average_return = sum_returns / num_episodes if num_episodes > 0 else 0.0

        if episode % 10 == 0:
            print('---------------------------------------')
            print('Episodes:', num_episodes)
            print('AverageReturn:', average_return)
            print('---------------------------------------')
    env.close()
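# The evaluation script above assumes that MLP, device, and args are defined
# elsewhere in the module. A minimal PyTorch sketch compatible with how
# MLP(obs_dim, act_num) is called here; the hidden-layer sizes are assumptions.
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class MLP(nn.Module):
    """Small fully connected network mapping observations to per-action values."""

    def __init__(self, obs_dim, act_num, hidden_size=128):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(obs_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, act_num),
        )

    def forward(self, x):
        return self.layers(x)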
def evaluate_model(self, model_name): """ Session for evaluating every model. Uses the concept of the Trainer in a deterministic manner :param model_name: Name of the model to be evaluated """ print( "\n==================== New Evaluation {} ============================" .format(model_name)) if self.use_executable: env = UnityEnvironment(file_name=self.env_name) else: env = UnityEnvironment(file_name=None) default_brain = env.brain_names[0] env_info = env.reset(train_mode=False)[default_brain] num_output = len(env_info.action_masks[0]) # Fetching model model_path = self.path_to_models + model_name + ".h5" model_manager = ModelManager(load=True, num_views=num_output, num_output=num_output, model_name=model_path) # Change the model name if "_" in model_name and False: model_name = "evaluation_" + model_name.split("_", 1)[1] else: model_name = "eval_" + model_name model_name = "eval_coverage_progression" # Evaluating the model trainer = Trainer(model_manager, env, self.max_step) synopsis = SynopsisManager(trainer, model_manager, run_name=model_name, max_step=self.max_step) trainer.evaluate_solution(self.evaluation_size) # Close environment env.close() # Cleanup # del trainer.memory del trainer del synopsis del model_manager
def execute_session(self, model_name, alpha_acc, exp_acc, alpha_dist, exp_dist, alpha_steps, t): """ Executes a single training session of a model :param model_name: Name of the model :param alpha_acc: Coverage reward :param exp_acc: Coverage exponential reward :param alpha_dist: Distance reward :param exp_dist: Distance Exponential Reward :param alpha_steps: Step Reward :param t: Which architecture to be used """ print("\n==================== New Session {} ============================".format(model_name)) print("acc: {} - {}, dist: {} - {}, steps {}, views: {}, LR: {}\n" .format(alpha_acc, exp_acc, alpha_dist, exp_dist, alpha_steps, self.alpha_views, self.learning_rate)) if self.use_executable: env = UnityEnvironment(file_name=self.env_name) else: env = UnityEnvironment(file_name=None) default_brain = env.brain_names[0] env_info = env.reset(train_mode=False)[default_brain] num_output = len(env_info.action_masks[0]) # Fetching model model_manager = ModelManager(load=self.load_model, num_views=num_output, num_output=num_output, model_name=model_name, learning_rate=self.learning_rate, variation=t) # Train trainer = Trainer(model_manager, env, self.max_step) trainer.set_reward_values(alpha_acc, exp_acc, alpha_dist, exp_dist, alpha_steps, self.alpha_views) synopsis = SynopsisManager(trainer, model_manager, run_name=model_name, max_step=self.max_step) trainer.train(self.num_generations, self.num_batches, self.batch_size, self.test_size) synopsis.print_training_summary() trainer.evaluate_solution(self.evaluation_size) # Close environment env.close() # Save model model_manager.save_model() # Cleanup # del trainer.memory del trainer del synopsis del model_manager
def test_ppo_policy_evaluate(mock_communicator, mock_launcher, dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0)
    env = UnityEnvironment(' ')
    brain_infos = env.reset()
    brain_info = brain_infos[env.brain_names[0]]

    trainer_parameters = dummy_config
    model_path = env.brain_names[0]
    trainer_parameters['model_path'] = model_path
    trainer_parameters['keep_checkpoints'] = 3
    policy = PPOPolicy(0, env.brains[env.brain_names[0]],
                       trainer_parameters, False, False)
    run_out = policy.evaluate(brain_info)
    assert run_out['action'].shape == (3, 2)
    env.close()
class Drone:
    spec = None
    name = None
    action_space = None
    observation_space = None

    def __init__(
        self,
        env_path: str,
        env_name: str,
        cfg: dict,
        train_mode: bool = True,
        worker_id: int = 1,
    ):
        self.env = UnityEnvironment(file_name=env_path, worker_id=worker_id)
        self.default_brain = self.env.brain_names[0]
        self.cfg = cfg
        self.name = env_name
        self.action_space = spaces.Box(low=-1, high=1,
                                       shape=(3, ), dtype=np.float32)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(9, ), dtype=np.float32)
        self.train_mode = train_mode

    def reset(self):
        env_info = self.env.reset(train_mode=self.train_mode,
                                  config=self.cfg)[self.default_brain]
        return env_info.vector_observations[0]

    def step(self, action):
        env_info = self.env.step(action.tolist())[self.default_brain]
        observation = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        info = None
        return observation, reward, done, info

    def close(self):
        self.env.close()

    def seed(self, seed):
        pass
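# A minimal random-action rollout against the Drone wrapper above; the
# environment path, name, and config dict are placeholders for illustration only.
import numpy as np

env = Drone(env_path="./envs/Drone/Drone", env_name="Drone",
            cfg={}, train_mode=False, worker_id=2)
obs = env.reset()
episode_return = 0.0
done = False
while not done:
    # Sample a continuous action in [-1, 1]^3, matching the declared action_space.
    action = np.random.uniform(-1, 1, size=3).astype(np.float32)
    obs, reward, done, _ = env.step(action)
    episode_return += reward
env.close()
print("episode return:", episode_return)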
def test_cc_bc_model(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = BehavioralCloningModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.sample_action, model.policy]
            feed_dict = {
                model.batch_size: 2,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]])
            }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_reset(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0)
    env = UnityEnvironment(' ')
    brain = env.brains['RealFakeBrain']
    brain_info = env.reset()
    env.close()

    assert not env.global_done
    assert isinstance(brain_info, dict)
    assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
    assert isinstance(brain_info['RealFakeBrain'].visual_observations, list)
    assert isinstance(brain_info['RealFakeBrain'].vector_observations, np.ndarray)
    assert len(brain_info['RealFakeBrain'].visual_observations) == \
        brain.number_visual_observations
    assert len(brain_info['RealFakeBrain'].vector_observations) == \
        len(brain_info['RealFakeBrain'].agents)
    assert len(brain_info['RealFakeBrain'].vector_observations[0]) == \
        brain.vector_observation_space_size * brain.num_stacked_vector_observations
def test_ppo_policy_evaluate(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        mock_communicator.return_value = MockCommunicator(
            discrete_action=False, visual_inputs=0)
        env = UnityEnvironment(' ')
        brain_infos = env.reset()
        brain_info = brain_infos[env.brain_names[0]]

        trainer_parameters = dummy_config()
        graph_scope = env.brain_names[0]
        trainer_parameters['graph_scope'] = graph_scope
        policy = PPOPolicy(0, env.brains[env.brain_names[0]],
                           trainer_parameters, sess, False)
        init = tf.global_variables_initializer()
        sess.run(init)

        run_out = policy.evaluate(brain_info)
        assert run_out['action'].shape == (3, 2)
        env.close()
class Sokoban:
    spec = None
    name = None
    action_space = None
    observation_space = None

    def __init__(
        self,
        env_path: str,
        env_name: str,
        cfg: dict,
        train_mode=True,
        worker_id: int = 1,
    ):
        self.env = UnityEnvironment(file_name=env_path, worker_id=worker_id)
        self.default_brain = self.env.brain_names[0]
        self.cfg = cfg
        self.name = env_name
        self.action_space = spaces.Discrete(5)
        self.observation_space = spaces.Box(low=0, high=255,
                                            shape=(3, 84, 84), dtype=np.uint8)
        self.train_mode = train_mode

    def reset(self):
        env_info = self.env.reset(train_mode=self.train_mode,
                                  config=self.cfg)[self.default_brain]
        return env_info.visual_observations[0][0].reshape(3, 84, 84)

    def step(self, action):
        env_info = self.env.step(action.tolist())[self.default_brain]
        observation = env_info.visual_observations[0][0].reshape(3, 84, 84)
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        info = None
        return observation, reward, done, info

    def close(self):
        self.env.close()

    def seed(self, seed):
        pass
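# Unlike Drone, the Sokoban wrapper above returns a channel-first 3x84x84 uint8
# image and takes a discrete action. A tiny sketch; the path and config are
# placeholders.
import numpy as np

env = Sokoban(env_path="./envs/Sokoban/Sokoban", env_name="Sokoban",
              cfg={}, train_mode=False, worker_id=3)
obs = env.reset()  # np.ndarray of shape (3, 84, 84)
# step() calls .tolist() on the action, so pass a numpy scalar rather than a plain int.
action = np.int64(env.action_space.sample())
obs, reward, done, _ = env.step(action)
env.close()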
class Sokoban_env():
    def __init__(self, env_path, env_cfg=Sokoban_env_cfg):
        self.env = UnityEnvironment(file_name=env_path)
        self.default_brain = self.env.brain_names[0]
        self.env_cfg = env_cfg

    def reset(self):
        env_info = self.env.reset(train_mode=True,
                                  config=self.env_cfg)[self.default_brain]
        return env_info.visual_observations[0][0].reshape(1, 3, 84, 84)

    def step(self, action):
        env_info = self.env.step(action)[self.default_brain]
        observation = env_info.visual_observations[0][0].reshape(1, 3, 84, 84)
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        info = None
        return observation, reward, done, info

    def close(self):
        self.env.close()
def test_visual_dc_bc_model(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=True, visual_inputs=2)
            env = UnityEnvironment(" ")
            model = BehavioralCloningModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.sample_action, model.action_probs]
            feed_dict = {
                model.batch_size: 2,
                model.dropout_rate: 1.0,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.visual_in[0]: np.ones([2, 40, 30, 3]),
                model.visual_in[1]: np.ones([2, 40, 30, 3]),
                model.action_masks: np.ones([2, 2]),
            }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
class UnityEnvBase(gym.Env): """ Provides Gym wrapper for Unity Learning Environments. Multi-agent environments use lists for object types, as done here: https://github.com/openai/multiagent-particle-envs """ worker_id = UNIVERSAL_LOCK def __init__( self, environment_filename: str, use_visual=True, uint8_visual=True, multiagent=False, flatten_branched=False, no_graphics=False, allow_multiple_visual_obs=False, ): """ Environment initialization :param environment_filename: The UnityEnvironment path or file to be wrapped in the gym. :param worker_id: Worker number for environment. :param use_visual: Whether to use visual observation or vector observation. :param uint8_visual: Return visual observations as uint8 (0-255) matrices instead of float (0.0-1.0). :param multiagent: Whether to run in multi-agent mode (lists of obs, reward, done). :param flatten_branched: If True, turn branched discrete action spaces into a Discrete space rather than MultiDiscrete. :param no_graphics: Whether to run the Unity simulator in no-graphics mode :param allow_multiple_visual_obs: If True, return a list of visual observations instead of only one. """ worker_id = UnityEnvBase._generate_new_env_id() self.worker_id = worker_id self._env = UnityEnvironment(environment_filename, worker_id, no_graphics=no_graphics) self.name = self._env.academy_name self.visual_obs = None self._current_state = None self._n_agents = None self._multiagent = multiagent self._flattener = None self.game_over = ( False ) # Hidden flag used by Atari environments to determine if the game is over self._allow_multiple_visual_obs = allow_multiple_visual_obs # Check brain configuration if len(self._env.brains) != 1: raise UnityEnvBaseException( "There can only be one brain in a UnityEnvironment " "if it is wrapped in a gym.") if len(self._env.external_brain_names) <= 0: raise UnityEnvBaseException( "There are not any external brain in the UnityEnvironment") self.brain_name = self._env.external_brain_names[0] brain = self._env.brains[self.brain_name] if use_visual and brain.number_visual_observations == 0: raise UnityEnvBaseException( "`use_visual` was set to True, however there are no" " visual observations as part of this environment.") self.use_visual = brain.number_visual_observations >= 1 and use_visual if not use_visual and uint8_visual: logger.warning( "`uint8_visual was set to true, but visual observations are not in use. " "This setting will not have any effect.") else: self.uint8_visual = uint8_visual if brain.number_visual_observations > 1 and not self._allow_multiple_visual_obs: logger.warning( "The environment contains more than one visual observation. " "You must define allow_multiple_visual_obs=True to received them all. " "Otherwise, please note that only the first will be provided in the observation." ) if brain.num_stacked_vector_observations != 1: raise UnityEnvBaseException( "There can only be one stacked vector observation in a UnityEnvironment " "if it is wrapped in a gym.") # Check for number of agents in scene. 
initial_info = self._env.reset()[self.brain_name] self._check_agents(len(initial_info.agents)) # Set observation and action spaces if brain.vector_action_space_type == "discrete": if len(brain.vector_action_space_size) == 1: self._action_space = spaces.Discrete( brain.vector_action_space_size[0]) else: if flatten_branched: self._flattener = ActionFlattener( brain.vector_action_space_size) self._action_space = self._flattener.action_space else: self._action_space = spaces.MultiDiscrete( brain.vector_action_space_size) else: if flatten_branched: logger.warning( "The environment has a non-discrete action space. It will " "not be flattened.") high = np.array([np.inf] * brain.vector_action_space_size[0]) self._action_space = spaces.Box(-high, high, dtype=np.float32) high = np.array([np.inf] * brain.vector_observation_space_size) self.action_meanings = brain.vector_action_descriptions if self.use_visual: if brain.camera_resolutions[0]["blackAndWhite"]: depth = 1 else: depth = 3 self._observation_space = spaces.Box( 0, 1, dtype=np.float32, shape=( brain.camera_resolutions[0]["height"], brain.camera_resolutions[0]["width"], depth, ), ) else: self._observation_space = spaces.Box(-high, high, dtype=np.float32) def reset(self): """Resets the state of the environment and returns an initial observation. In the case of multi-agent environments, this is a list. Returns: observation (object/list): the initial observation of the space. """ info = self._env.reset()[self.brain_name] n_agents = len(info.agents) self._check_agents(n_agents) self.game_over = False if not self._multiagent: obs, reward, done, info = self._single_step(info) else: obs, reward, done, info = self._multi_step(info) return obs def step(self, action): """Run one timestep of the environment's dynamics. When end of episode is reached, you are responsible for calling `reset()` to reset this environment's state. Accepts an action and returns a tuple (observation, reward, done, info). In the case of multi-agent environments, these are lists. Args: action (object/list): an action provided by the environment Returns: observation (object/list): agent's observation of the current environment reward (float/list) : amount of reward returned after previous action done (boolean/list): whether the episode has ended. info (dict): contains auxiliary diagnostic information, including BrainInfo. """ # Use random actions for all other agents in environment. if self._multiagent: if not isinstance(action, list): raise UnityEnvBaseException( "The environment was expecting `action` to be a list.") if len(action) != self._n_agents: raise UnityEnvBaseException( "The environment was expecting a list of {} actions.". 
format(self._n_agents)) else: if self._flattener is not None: # Action space is discrete and flattened - we expect a list of scalars action = [ self._flattener.lookup_action(_act) for _act in action ] action = np.array(action) else: if self._flattener is not None: # Translate action into list action = self._flattener.lookup_action(action) info = self._env.step(action)[self.brain_name] n_agents = len(info.agents) self._check_agents(n_agents) self._current_state = info if not self._multiagent: obs, reward, done, info = self._single_step(info) self.game_over = done else: obs, reward, done, info = self._multi_step(info) self.game_over = all(done) return obs, reward, done, info def _single_step(self, info): if self.use_visual: visual_obs = info.visual_observations if isinstance(visual_obs, list): visual_obs = np.array(visual_obs) if self._allow_multiple_visual_obs: visual_obs_list = [] for obs in visual_obs: visual_obs_list.append( self._preprocess_single(obs[0, :, :, :])) self.visual_obs = visual_obs_list else: self.visual_obs = self._preprocess_single( visual_obs[0][0, :, :, :]) default_observation = self.visual_obs else: default_observation = info.vector_observations[0, :] return ( default_observation, info.rewards[0], info.local_done[0], { "text_observation": info.text_observations[0], "brain_info": info, "vector_observations": info.vector_observations[0, :] }, ) def _preprocess_single(self, single_visual_obs): if self.uint8_visual: return (255.0 * single_visual_obs).astype(np.uint8) else: return single_visual_obs def _multi_step(self, info): if self.use_visual: self.visual_obs = self._preprocess_multi(info.visual_observations) default_observation = self.visual_obs else: default_observation = info.vector_observations return ( list(default_observation), info.rewards, info.local_done, { "text_observation": info.text_observations, "brain_info": info }, ) def _preprocess_multi(self, multiple_visual_obs): if self.uint8_visual: return [(255.0 * _visual_obs).astype(np.uint8) for _visual_obs in multiple_visual_obs] else: return multiple_visual_obs def render(self, mode="rgb_array"): return self.visual_obs def close(self): """Override _close in your subclass to perform any necessary cleanup. Environments will automatically close() themselves when garbage collected or when the program exits. """ try: self._env.close() except: pass def __del__(self): try: self._env.close() except: pass def get_action_meanings(self): return self.action_meanings def seed(self, seed=None): """Sets the seed for this env's random number generator(s). Currently not implemented. """ logger.warn("Could not seed environment %s", self.name) return def _check_agents(self, n_agents): if not self._multiagent and n_agents > 1: raise UnityEnvBaseException( "The environment was launched as a single-agent environment, however" "there is more than one agent in the scene.") elif self._multiagent and n_agents <= 1: raise UnityEnvBaseException( "The environment was launched as a mutli-agent environment, however" "there is only one agent in the scene.") if self._n_agents is None: self._n_agents = n_agents logger.info("{} agents within environment.".format(n_agents)) elif self._n_agents != n_agents: raise UnityEnvBaseException( "The number of agents in the environment has changed since " "initialization. 
This is not supported.") @staticmethod def _generate_new_env_id(): with UnityEnvBase.worker_id.get_lock(): new_id = UnityEnvBase.worker_id.value UnityEnvBase.worker_id.value += 1 return new_id @property def metadata(self): return {"render.modes": ["rgb_array"]} @property def reward_range(self): return -float("inf"), float("inf") @property def spec(self): return None @property def action_space(self): return self._action_space @property def observation_space(self): return self._observation_space @property def number_agents(self): return self._n_agents
class SocTwoEnv(): def __init__(self, env_path, worker_id, train_mode=True, n_str=16, n_goalie=16): self.env = UnityEnvironment(file_name=env_path, worker_id=0) self.striker_brain_name, self.goalie_brain_name = self.env.brain_names self.striker_brain = self.env.brains[self.striker_brain_name] self.goalie_brain = self.env.brains[self.goalie_brain_name] self.done_str = [False] * 16 self.done_goalie = [False] * 16 self.train_mode = train_mode self.done_hist_str = [False] * 16 self.done_hist_goalie = [False] * 16 self.episode_str_rewards = 0 self.episode_goalie_rewards = 0 self.n_str = n_str self.n_goalie = n_goalie self.act_str_hist = [[] for x in range(n_str)] self.act_goalie_hist = [[] for x in range(n_goalie)] self.observation_str_hist = [[] for x in range(SIZE_OBSERVATION)] self.observation_goalie_hist = [[] for x in range(SIZE_OBSERVATION)] self.observation_str = None self.observation_goalie = None return def reset(self): """ Reset the all environments and agents. """ self.env_info_str = self.env.reset( train_mode=self.train_mode)[self.striker_brain_name] self.env_info_goalie = self.env.reset( train_mode=self.train_mode)[self.goalie_brain_name] self.episode_rewards = 0 self.done_str = [False] * 16 self.done_goalie = [False] * 16 self.done_hist_str = np.array([False] * 16) self.done_hist_goalie = np.array([False] * 16) return {'str': self.env_info_str, 'goalie': self.env_info_goalie} def step(self, action_str, action_goalie): """ In each timestep, give each striker and goalie a instruction to do action. And then, get the current observation stored at observation_str and observation_goalie. """ self.env_info = self.env.step({ self.striker_brain_name: action_str, self.goalie_brain_name: action_goalie }) self.observation_str = np.array( self.env_info[self.striker_brain_name].vector_observations) self.observation_goalie = np.array( self.env_info[self.goalie_brain_name].vector_observations) return self.env_info def reward(self): self.episode_str_rewards = np.array( self.env_info[self.striker_brain_name].rewards) self.episode_goalie_rewards = np.array( self.env_info[self.goalie_brain_name].rewards) return self.episode_str_rewards, self.episode_goalie_rewards def close(self): """ Close the simulation Unity environment. """ self.env.close() return def done(self): self.done_str = np.array( self.env_info[self.striker_brain_name].local_done) self.done_goalie = np.array( self.env_info[self.goalie_brain_name].local_done) def reset_some_agents(self, str_arg, goalie_arg): """ params: str_arg, mark which striker's history that wants to be cleared. goalie_arg, mark which goalie's history that wants to be cleared. Clear the history of specific agents. """ for i in str_arg: self.act_str_hist[i[0]] = [] self.observation_str_hist[i[0]] = [] for i in goalie_arg: self.act_goalie_hist[i[0]] = [] def print_r(self, episode): print("Total reward this episode_{}: {}".format( episode, self.episode_rewards)) return
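# A short sketch of stepping SocTwoEnv with random discrete actions for all
# 16 strikers and 16 goalies. The executable path and the per-agent action-space
# size (7 here) are assumptions for illustration.
import numpy as np

env = SocTwoEnv("./builds/SoccerTwos", worker_id=0, train_mode=True)
env.reset()
for _ in range(200):
    action_str = np.random.randint(0, 7, size=16)     # one action per striker
    action_goalie = np.random.randint(0, 7, size=16)  # one action per goalie
    env.step(action_str, action_goalie)
    str_rewards, goalie_rewards = env.reward()
    env.done()  # refresh done_str / done_goalie from the latest BrainInfo
env.close()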
class ObstacleTowerEnv(gym.Env): ALLOWED_VERSIONS = ['1'] def __init__(self, environment_filename=None, docker_training=False, worker_id=0, retro=True): """ Arguments: environment_filename: The file path to the Unity executable. Does not require the extension. docker_training: Whether this is running within a docker environment and should use a virtual frame buffer (xvfb). worker_id: The index of the worker in the case where multiple environments are running. Each environment reserves port (5005 + worker_id) for communication with the Unity executable. retro: Resize visual observation to 84x84 (int8) and flattens action space. """ if self.is_grading(): environment_filename = None docker_training = True self._env = UnityEnvironment(environment_filename, worker_id, docker_training=docker_training) split_name = self._env.academy_name.split('-v') if len(split_name) == 2 and split_name[0] == "ObstacleTower": self.name, self.version = split_name else: raise UnityGymException( "Attempting to launch non-Obstacle Tower environment") if self.version not in self.ALLOWED_VERSIONS: raise UnityGymException( "Invalid Obstacle Tower version. Your build is v" + self.version + \ " but only the following versions are compatible with this gym: " + \ str(self.ALLOWED_VERSIONS) ) self.visual_obs = None self._current_state = None self._n_agents = None self._done_grading = False self._flattener = None self._seed = None self._floor = None self.game_over = False # Hidden flag used by Atari environments to determine if the game is over self.retro = retro flatten_branched = self.retro uint8_visual = self.retro # Check brain configuration if len(self._env.brains) != 1: raise UnityGymException( "There can only be one brain in a UnityEnvironment " "if it is wrapped in a gym.") self.brain_name = self._env.external_brain_names[0] brain = self._env.brains[self.brain_name] if brain.number_visual_observations == 0: raise UnityGymException( "Environment provides no visual observations.") self.uint8_visual = uint8_visual if brain.number_visual_observations > 1: logger.warning( "The environment contains more than one visual observation. " "Please note that only the first will be provided in the observation." ) # Check for number of agents in scene. 
initial_info = self._env.reset()[self.brain_name] self._check_agents(len(initial_info.agents)) # Set observation and action spaces if len(brain.vector_action_space_size) == 1: self._action_space = spaces.Discrete( brain.vector_action_space_size[0]) else: if flatten_branched: self._flattener = ActionFlattener( brain.vector_action_space_size) self._action_space = self._flattener.action_space else: self._action_space = spaces.MultiDiscrete( brain.vector_action_space_size) high = np.array([np.inf] * brain.vector_observation_space_size) self.action_meanings = brain.vector_action_descriptions depth = 3 image_space_max = 1.0 image_space_dtype = np.float32 camera_height = brain.camera_resolutions[0]["height"] camera_width = brain.camera_resolutions[0]["width"] if self.retro: image_space_max = 255 image_space_dtype = np.uint8 camera_height = 84 camera_width = 84 image_space = spaces.Box(0, image_space_max, dtype=image_space_dtype, shape=(camera_height, camera_width, depth)) if self.retro: self._observation_space = image_space else: max_float = np.finfo(np.float32).max keys_space = spaces.Discrete(5) time_remaining_space = spaces.Box(low=0.0, high=max_float, shape=(1, ), dtype=np.float32) self._observation_space = spaces.Tuple( (image_space, keys_space, time_remaining_space)) def done_grading(self): return self._done_grading def is_grading(self): return os.getenv('OTC_EVALUATION_ENABLED', False) def reset(self): """Resets the state of the environment and returns an initial observation. In the case of multi-agent environments, this is a list. Returns: observation (object/list): the initial observation of the space. """ reset_params = {} if self._floor is not None: reset_params['floor-number'] = self._floor if self._seed is not None: reset_params['tower-seed'] = self._seed info = self._env.reset(config=reset_params)[self.brain_name] n_agents = len(info.agents) self._check_agents(n_agents) self.game_over = False obs, reward, done, info = self._single_step(info) return obs def step(self, action): """Run one timestep of the environment's dynamics. When end of episode is reached, you are responsible for calling `reset()` to reset this environment's state. Accepts an action and returns a tuple (observation, reward, done, info). In the case of multi-agent environments, these are lists. Args: action (object/list): an action provided by the environment Returns: observation (object/list): agent's observation of the current environment reward (float/list) : amount of reward returned after previous action done (boolean/list): whether the episode has ended. info (dict): contains auxiliary diagnostic information, including BrainInfo. """ # Use random actions for all other agents in environment. 
if self._flattener is not None: # Translate action into list action = self._flattener.lookup_action(action) info = self._env.step(action)[self.brain_name] n_agents = len(info.agents) self._check_agents(n_agents) self._current_state = info obs, reward, done, info = self._single_step(info) self.game_over = done if info.get('text_observation') == 'evaluation_complete': done = True self._done_grading = True return obs, reward, done, info def _single_step(self, info): self.visual_obs = self._preprocess_single( info.visual_observations[0][0, :, :, :]) if self.retro: self.visual_obs = self._resize_observation(self.visual_obs) self.visual_obs = self._add_stats_to_image( self.visual_obs, info.vector_observations[0]) default_observation = self.visual_obs else: default_observation = self._prepare_tuple_observation( self.visual_obs, info.vector_observations[0]) return default_observation, info.rewards[0], info.local_done[0], { "text_observation": info.text_observations[0], "brain_info": info } def _preprocess_single(self, single_visual_obs): if self.uint8_visual: return (255.0 * single_visual_obs).astype(np.uint8) else: return single_visual_obs def render(self, mode='rgb_array'): return self.visual_obs def close(self): """Override _close in your subclass to perform any necessary cleanup. Environments will automatically close() themselves when garbage collected or when the program exits. """ self._env.close() def get_action_meanings(self): return self.action_meanings def seed(self, seed=None): """Sets a fixed seed for this env's random number generator(s). The valid range for seeds is [0, 100). By default a random seed will be chosen. """ if seed is None: self._seed = seed return seed = int(seed) if seed < 0 or seed >= 100: logger.warn("Seed outside of valid range [0, 100). A random seed " "within the valid range will be used on next reset.") logger.warn("New seed " + str(seed) + " will apply on next reset.") self._seed = seed def floor(self, floor=None): """Sets the starting floor to a fixed floor number on subsequent environment resets.""" if floor is None: self._floor = floor return floor = int(floor) if floor < 0 or floor >= 25: logger.warn( "Starting floor outside of valid range [0, 25). 
Floor 0 will be used" "on next reset.") logger.warn("New starting floor " + str(floor) + " will apply on next reset.") self._floor = floor @staticmethod def _resize_observation(observation): """ Re-sizes visual observation to 84x84 """ obs_image = Image.fromarray(observation) obs_image = obs_image.resize((84, 84), Image.NEAREST) return np.array(obs_image) @staticmethod def _prepare_tuple_observation(vis_obs, vector_obs): """ Converts separate visual and vector observation into prepared tuple """ key = vector_obs[0:6] time = vector_obs[6] key_num = np.argmax(key, axis=0) return vis_obs, key_num, time @staticmethod def _add_stats_to_image(vis_obs, vector_obs): """ Displays time left and number of keys on visual observation """ key = vector_obs[0:6] time = vector_obs[6] key_num = np.argmax(key, axis=0) time_num = min(time, 10000) / 10000 vis_obs[0:10, :, :] = 0 for i in range(key_num): start = int(i * 16.8) + 4 end = start + 10 vis_obs[1:5, start:end, 0:2] = 255 vis_obs[6:10, 0:int(time_num * 84), 1] = 255 return vis_obs def _check_agents(self, n_agents): if n_agents > 1: raise UnityGymException( "The environment was launched as a single-agent environment, however" "there is more than one agent in the scene.") if self._n_agents is None: self._n_agents = n_agents logger.info("{} agents within environment.".format(n_agents)) elif self._n_agents != n_agents: raise UnityGymException( "The number of agents in the environment has changed since " "initialization. This is not supported.") @property def metadata(self): return {'render.modes': ['rgb_array']} @property def reward_range(self): return -float('inf'), float('inf') @property def spec(self): return None @property def action_space(self): return self._action_space @property def observation_space(self): return self._observation_space @property def number_agents(self): return self._n_agents
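# A minimal retro-mode rollout with the ObstacleTowerEnv wrapper above; the
# build path is a placeholder for a local Obstacle Tower executable.
env = ObstacleTowerEnv("./ObstacleTower/obstacletower", worker_id=0, retro=True)
env.seed(5)    # fix the tower layout on the next reset (valid range [0, 100))
env.floor(0)   # start from the bottom floor
obs = env.reset()  # 84x84x3 uint8 image in retro mode
done = False
while not done:
    action = env.action_space.sample()  # flattened Discrete action in retro mode
    obs, reward, done, info = env.step(action)
env.close()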
def run(options, runLog, minimumAcceptableFitness=None): isFunctionInValidationMode = \ isinstance(minimumAcceptableFitness, float) runLog.Append("This is run.py -> script for running pretrained models!") locationOfPretrainedModel = options["--model"] resultsRepository = TrainingResultsRepository() bestAgent = resultsRepository.LoadBestModel(locationOfPretrainedModel) if bestAgent is None: runLog.Append("run.run() error: Cannot load model, location " \ "'training_results/{0}' does not exist!".format( locationOfPretrainedModel)) exit() runLog.Append("Run model from 'training_results/{0}'!".format( locationOfPretrainedModel)) pathToEnv = options["--env-path"] env = UnityEnvironment(file_name=pathToEnv) if pathToEnv is None: runLog.Append("Established connection with Unity Editor!") else: runLog.Append("Established connection with Unity build '{0}'!" \ .format(pathToEnv)) del pathToEnv brainName = env.brain_names[0] if isFunctionInValidationMode: fitness = 0.0 shouldRunBeExecuted = True try: while shouldRunBeExecuted: envInfo = env.reset(train_mode=False)[brainName] inputData = envInfo.vector_observations.tolist() inputData = inputData[0][0:-1] while shouldRunBeExecuted: outputData = bestAgent.forward(inputData) envInfo = env.step([outputData])[brainName] inputData = envInfo.vector_observations.tolist() if isFunctionInValidationMode: episodeReward = inputData[0][-1] if episodeReward > fitness: fitness = episodeReward inputData = inputData[0][:-1] if envInfo.local_done[0]: if isFunctionInValidationMode: shouldRunBeExecuted = False break except KeyboardInterrupt: runLog.Append("\nRun interrupted because of KeyboardInterrupt!") runLog.Append("End of run!") env.close() runLog.Append("Closed Unity environment.") if isFunctionInValidationMode: return fitness >= minimumAcceptableFitness else: return False
def unity_run(default_args, share_args, options, max_step, max_episode, save_frequency, name): from mlagents.envs import UnityEnvironment from utils.sampler import create_sampler_manager try: tf_version, (model, policy_mode, _) = get_model_info(options['--algorithm']) algorithm_config = sth.load_config( f'./Algorithms/{tf_version}/config.yaml')[options['--algorithm']] ma = options['--algorithm'][:3] == 'ma_' except KeyError: raise NotImplementedError reset_config = default_args['reset_config'] if options['--unity']: env = UnityEnvironment() env_name = 'unity' else: file_name = default_args['exe_file'] if options[ '--env'] == 'None' else options['--env'] if os.path.exists(file_name): env = UnityEnvironment(file_name=file_name, base_port=int(options['--port']), no_graphics=False if options['--inference'] else not options['--graphic']) env_dir = os.path.split(file_name)[0] env_name = os.path.join(*env_dir.replace('\\', '/').replace( r'//', r'/').split('/')[-2:]) sys.path.append(env_dir) if os.path.exists(env_dir + '/env_config.py'): import env_config reset_config = env_config.reset_config max_step = env_config.max_step if os.path.exists(env_dir + '/env_loop.py'): from env_loop import Loop else: raise Exception('can not find this file.') sampler_manager, resampling_interval = create_sampler_manager( options['--sampler'], env.reset_parameters) if 'Loop' not in locals().keys(): if ma: from ma_loop import Loop else: from loop import Loop if options['--config-file'] != 'None': algorithm_config = update_config(algorithm_config, options['--config-file']) _base_dir = os.path.join(share_args['base_dir'], env_name, options['--algorithm']) base_dir = os.path.join(_base_dir, name) show_config(algorithm_config) brain_names = env.external_brain_names brains = env.brains brain_num = len(brain_names) visual_resolutions = {} for i in brain_names: if brains[i].number_visual_observations: visual_resolutions[f'{i}'] = [ brains[i].camera_resolutions[0]['height'], brains[i].camera_resolutions[0]['width'], 1 if brains[i].camera_resolutions[0]['blackAndWhite'] else 3 ] else: visual_resolutions[f'{i}'] = [] model_params = [{ 's_dim': brains[i].vector_observation_space_size * brains[i].num_stacked_vector_observations, 'a_dim_or_list': brains[i].vector_action_space_size, 'action_type': brains[i].vector_action_space_type, 'max_episode': max_episode, 'base_dir': os.path.join(base_dir, i), 'logger2file': share_args['logger2file'], 'out_graph': share_args['out_graph'], } for i in brain_names] if ma: assert brain_num > 1, 'if using ma* algorithms, number of brains must larger than 1' data = ExperienceReplay(share_args['ma']['batch_size'], share_args['ma']['capacity']) extra_params = {'data': data} models = [ model(n=brain_num, i=i, **model_params[i], **algorithm_config) for i in range(brain_num) ] else: extra_params = {} models = [ model(visual_sources=brains[i].number_visual_observations, visual_resolution=visual_resolutions[f'{i}'], **model_params[index], **algorithm_config) for index, i in enumerate(brain_names) ] [ models[index].init_or_restore( os.path.join( _base_dir, name if options['--load'] == 'None' else options['--load'], i)) for index, i in enumerate(brain_names) ] begin_episode = models[0].get_init_episode() params = { 'env': env, 'brain_names': brain_names, 'models': models, 'begin_episode': begin_episode, 'save_frequency': save_frequency, 'reset_config': reset_config, 'max_step': max_step, 'max_episode': max_episode, 'sampler_manager': sampler_manager, 'resampling_interval': resampling_interval, 
'policy_mode': policy_mode } if 'batch_size' in algorithm_config.keys() and options['--fill-in']: steps = algorithm_config['batch_size'] else: steps = default_args['no_op_steps'] no_op_params = { 'env': env, 'brain_names': brain_names, 'models': models, 'brains': brains, 'steps': steps, 'choose': options['--noop-choose'] } params.update(extra_params) no_op_params.update(extra_params) if options['--inference']: Loop.inference(env, brain_names, models, reset_config=reset_config, sampler_manager=sampler_manager, resampling_interval=resampling_interval) else: try: [ sth.save_config(os.path.join(base_dir, i, 'config'), algorithm_config) for i in brain_names ] Loop.no_op(**no_op_params) Loop.train(**params) except Exception as e: print(e) finally: try: [models[i].close() for i in range(len(models))] except Exception as e: print(e) finally: env.close() sys.exit()
    # (Over-) Print current average score
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score),
          end="")

    # Print average score every scores_average_window episodes
    if i_episode % scores_average_window == 0:
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))

    # Check to see if the task is solved (i.e., average_score > solved_score).
    # If yes, save the network weights and scores and end training.
    if average_score >= solved_score:
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
            i_episode, average_score))

        # Save trained neural network weights
        timestr = time.strftime("%Y%m%d-%H%M%S")
        nn_filename = "dqnAgent_Trained_Model_" + timestr + ".pth"
        torch.save(agent.network.state_dict(), nn_filename)

        # Save the recorded scores data
        scores_filename = "dqnAgent_scores_" + timestr + ".csv"
        np.savetxt(scores_filename, scores, delimiter=",")
        break

env.close()

# END :)
#############
def train_de(options, trainingLog, dataCollector=None): isTrainInExperimentMode = isinstance(dataCollector, ExperimentDataCollector) trainingLog.Append( "This is train_de.py -> Differential Evolution training!") if options["--track-1"]: trackNumber = 1 elif options["--track-2"]: trackNumber = 2 elif options["--track-3"]: trackNumber = 3 trainingLog.Append("Training on RaceTrack_{0}.".format(trackNumber)) # --- Load config data from file --- # pathToConfigFile = options["<config-file-path>"] CONFIG_DATA = loadConfigData(pathToConfigFile) trainingLog.Append( "Config data has been loaded from file: {0}".format(pathToConfigFile)) del pathToConfigFile # --- Set random seed --- # TRAINING_PARAMS = CONFIG_DATA["TrainingParameters"] RANDOM_SEED = TRAINING_PARAMS["randomSeed"] if isinstance(RANDOM_SEED, int): random.seed(RANDOM_SEED) torch.manual_seed(RANDOM_SEED) trainingLog.Append("Random seed set to value: {0}".format(RANDOM_SEED)) # --- Establish connection with Unity environment --- # pathToEnv = options["--env-path"] env = UnityEnvironment(file_name=pathToEnv) if pathToEnv is None: trainingLog.Append("Established connection with Unity Editor!") else: trainingLog.Append("Established connection with Unity build '{0}'!" \ .format(pathToEnv)) del pathToEnv # --- Get info from Unity environment --- # brainName = env.brain_names[0] trainingLog.Append("Brain name: {0}".format(brainName)) brain = env.brains[brainName] brainInfo = env.reset(train_mode=True)[brainName] observationSize = brainInfo.vector_observations.shape[1] actionSize = brain.vector_action_space_size[0] trainingLog.Append( "Loaded from Unity environment: observationSize = {0}, " \ "actionSize = {1}".format(observationSize, actionSize)) # --- Compute agent dimensions -- # HIDDEN_DIMENSIONS = TRAINING_PARAMS["networkHiddenDimensions"] agentDimensions = [observationSize - 1] + HIDDEN_DIMENSIONS + [actionSize] trainingLog.Append("Computed agentDimensions: {0}".format(agentDimensions)) # --- Create population ---- # locationForPretrainedPopulation = options["--population"] DIFF_EVO_PARAMS = CONFIG_DATA["LearningAlgorithms"]["diff_evo"] NUM_OF_AGENTS = DIFF_EVO_PARAMS["numberOfAgents"] resultsRepository = TrainingResultsRepository(trainingLog) try: MUTATION_FACTOR = DIFF_EVO_PARAMS["mutationFactor"] CROSS_PROBABILITY = DIFF_EVO_PARAMS["crossProbability"] NUM_OF_PARAMS = computeNumOfParameters(agentDimensions) trackName = "RaceTrack_{0}".format(trackNumber) MINIMAL_ACCEPTABLE_FITNESS = \ TRAINING_PARAMS["minimalAcceptableFitness"][trackName] del trackName fitnessEvaluation = AgentFitnessEvaluator(env, brainName) MAX_EPISODES_NUMBER = TRAINING_PARAMS["maxNumberOfEpisodes"] MAX_REPEATS_NUMBER = TRAINING_PARAMS[ "maxNumberOfRepeatsIfTrainingFails"] trainingLog.Append( "Start training with parameters: MAX_EPISODES_NUMBER = {0}, " \ "MAX_REPEATS_NUMBER = {1}, MUTATION_FACTOR = {2}, " \ "CROSS_PROBABILITY = {3}, NUM_OF_PARAMS = {4}, " \ "MINIMAL_ACCEPTABLE_FITNESS = {5}, fitnessFunction = {6}".format( MAX_EPISODES_NUMBER, MAX_REPEATS_NUMBER, MUTATION_FACTOR, CROSS_PROBABILITY, NUM_OF_PARAMS, MINIMAL_ACCEPTABLE_FITNESS, type(fitnessEvaluation))) shouldContinueTraining = True for repeatCounter in range(MAX_REPEATS_NUMBER): if not shouldContinueTraining: break if locationForPretrainedPopulation is None: population = [ AgentNeuralNetwork(agentDimensions) \ for _ in range(NUM_OF_AGENTS) ] trainingLog.Append("Created new population, with parameters: " \ "NUM_OF_AGENTS = {0}, agentDimensions = {1}, ".format( NUM_OF_AGENTS, agentDimensions)) else: 
population = \ resultsRepository.LoadPopulation( locationForPretrainedPopulation) if population is None: env.close() exit() fitnessList = [] if isTrainInExperimentMode: bestFitnessSequence = [] meanFitnessSequence = [] stdevFitnessSequence = [] searchCounter = NUM_OF_AGENTS timeOfBegin = time.time() for agentIndex in range(NUM_OF_AGENTS): agentFitness = fitnessEvaluation(population[agentIndex]) fitnessList.append(agentFitness) bestFitness = max(fitnessList) indexOfBestFitness = fitnessList.index(bestFitness) bestAgent = deepcopy(population[indexOfBestFitness]) meanFitness = statistics.mean(fitnessList) stdDevFitness = statistics.stdev(fitnessList) if isTrainInExperimentMode: bestFitnessSequence.append(bestFitness), meanFitnessSequence.append(meanFitness) stdevFitnessSequence.append(stdDevFitness) pop_denorm = retrieveParametersFromAgentList(population) pop_norm = pop_denorm / 4 + 0.5 for episodeCounter in range(MAX_EPISODES_NUMBER): for j in range(NUM_OF_AGENTS): if isTrainInExperimentMode: searchCounter += 1 indices = [ index for index in range(NUM_OF_AGENTS) if index != j ] a_idx, b_idx, c_idx = random.sample(indices, 3) a, b, c = pop_norm[a_idx], pop_norm[b_idx], pop_norm[c_idx] mutant = torch.clamp(a + MUTATION_FACTOR * (b - c), 0.0, 1.0) cross_points = [ random.uniform(0, 1) for _ in range(NUM_OF_PARAMS) ] trial_norm = torch.zeros(NUM_OF_PARAMS) for k in range(NUM_OF_PARAMS): if cross_points[k] < CROSS_PROBABILITY: trial_norm[k] = mutant[k] else: trial_norm[k] = pop_norm[j][k] trial_denorm = trial_norm * 4 - 2 setNewParametersOnAgent(population[j], trial_denorm) agentIndex = j fitness_trial = fitnessEvaluation(population[agentIndex]) if fitness_trial > fitnessList[j]: fitnessList[j] = fitness_trial pop_denorm[j] = trial_denorm pop_norm[j] = trial_norm if fitness_trial > bestFitness: bestFitness = fitness_trial bestAgent = deepcopy(population[j]) else: setNewParametersOnAgent(population[j], pop_denorm[j]) meanFitness = statistics.mean(fitnessList) stdDevFitness = statistics.stdev(fitnessList) if isTrainInExperimentMode: bestFitnessSequence.append(bestFitness), meanFitnessSequence.append(meanFitness) stdevFitnessSequence.append(stdDevFitness) trainingLog.Append( "Episode {0}: best = {1}, mean = {2}, stdDev = {3}".format( episodeCounter, bestFitness, meanFitness, stdDevFitness)) if bestFitness >= MINIMAL_ACCEPTABLE_FITNESS: trainingLog.Append( "Training interrupted after {0} episodes, reason: " \ "reached minimal acceptable value for bestFitness!" 
\ " (minimalAcceptableFitness = {1}, bestFitness = {2})" \ .format( episodeCounter + 1, MINIMAL_ACCEPTABLE_FITNESS, bestFitness)) if isTrainInExperimentMode: timeOfEnd = time.time() trainingTime = timeOfEnd - timeOfBegin dataCollector.AppendBestFitnessSequence( trackNumber, "DE", bestFitnessSequence) dataCollector.AppendMeanFitnessSequence( trackNumber, "DE", meanFitnessSequence) dataCollector.AppendStdevFitnessSequence( trackNumber, "DE", stdevFitnessSequence) dataCollector.AddTimeInSecondsFromTraining( trackNumber, "DE", trainingTime) trainingLog.Append("Training time in seconds: {0}" \ .format(trainingTime)) dataCollector.AddTimeInEpisodesFromTraining( trackNumber, "DE", episodeCounter + 1) trainingLog.Append("Training time in episodes: {0}" \ .format(episodeCounter + 1)) dataCollector.AddToSearchCounter( trackNumber, "DE", searchCounter) trainingLog.Append( "searchCounter = {0}".format(searchCounter)) shouldContinueTraining = False break if episodeCounter >= (MAX_EPISODES_NUMBER - 1): message = "Cannot train population in current repeat " \ "(MAX_EPISODES_NUMBER = {0}, repeatCounter = {1})!" \ .format(MAX_EPISODES_NUMBER, repeatCounter) if repeatCounter < MAX_REPEATS_NUMBER - 1: message += " Try again to train population!" else: message += " Unfortunately, cannot try again. " \ "Reason: achieved maximum number of repeats!" trainingLog.Append(message) except KeyboardInterrupt: trainingLog.Append( "\nTraining interrupted because of KeyboardInterrupt!") trainingLog.Append("End of training!") # --- Close environment --- # env.close() trainingLog.Append("Closed Unity environment.") # --- Save training results --- # shouldSavePopulation = options["--save-population"] resultsRepository.Save(population, bestAgent, shouldSavePopulation) if isTrainInExperimentMode: dataCollector.PathToLastSavedModel = \ resultsRepository._pathToLastSavedModel
class Game: # set up unity ml agent environment def __init__(self): self.loadEnv(0) def loadEnv(self, wid): # load env env_name = ENV_LOCATION self.env = UnityEnvironment(env_name, worker_id=wid) # Set the default brain to work with self.default_brain = self.env.brain_names[0] self.brain = self.env.brains[self.default_brain] # Reset the environment - train mode enabled env_info = self.env.reset(train_mode=True)[self.default_brain] # this frogger game action space is 5, actions[0] = selected action (action = [[1]]) # actions # 1 - up, 2 - down , 3- left , 4 -right , 0 - do nothing def performAction(self, actionValue, numberOfFrames=STACK_SIZE): action = [[0]] action[0] = actionValue terminal = False # indication of terminal state size = (IMAGE_HEIGTH, IMAGE_WIDTH, numberOfFrames ) # create list to keep frames stack = np.zeros(size) reward = 0 # rewards for all the frames # first frame after action env_info = self.env.step(action)[ self.default_brain] # send action to brain reward = round(env_info.rewards[0], 5) # get reward newState = env_info.visual_observations[0][ 0] # get state visual observation newStateGray = skimage.color.rgb2gray(newState) # covert to gray scale newStateGray = skimage.transform.resize(newStateGray, (IMAGE_HEIGTH, IMAGE_WIDTH)) # check terminal reached if reward == -1 or reward == -2: terminal = True # add the state to the 0 th position stack[:, :, 0] = newStateGray # get stack of frames after the action for i in range(1, numberOfFrames): env_info = self.env.step( )[self. default_brain] # change environment to next step without action st = env_info.visual_observations[0][0] stGray = skimage.color.rgb2gray(st) stGray = skimage.transform.resize(stGray, (IMAGE_HEIGTH, IMAGE_WIDTH)) stack[:, :, i] = stGray # if terminal only consider the reward for terminal if env_info.rewards[0] == -1 or env_info.rewards[0] == -2: terminal = True reward = round(env_info.rewards[0], 5) elif not terminal: # if it got a positive reward for move up let it have it if reward < 0: reward = round(env_info.rewards[0], 5) # get reward # reshape for Keras # noinspection PyArgumentList stack = stack.reshape(1, stack.shape[0], stack.shape[1], stack.shape[2]) # 1*100*100*4 return reward, stack, terminal # close environment def close(self): self.env.close() def reset(self): self.close() self.loadEnv(0)
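# A brief sketch of using the Game wrapper above in a DQN-style loop. The
# constants (ENV_LOCATION, STACK_SIZE, image sizes) come from the surrounding
# module, and the random policy here is a placeholder.
import random

game = Game()  # launches the Unity build at ENV_LOCATION
total_reward = 0.0
for step in range(1000):
    # Placeholder policy: pick one of the 5 Frogger actions at random
    # (0 = do nothing, 1 = up, 2 = down, 3 = left, 4 = right).
    action_value = [random.randint(0, 4)]
    reward, state_stack, terminal = game.performAction(action_value)
    total_reward += reward
    if terminal:
        game.reset()  # closes and relaunches the environment
game.close()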
class UnityEnv(gym.Env): """ Provides Gym wrapper for Unity Learning Environments. Multi-agent environments use lists for object types, as done here: https://github.com/openai/multiagent-particle-envs """ def __init__(self, environment_filename: str, worker_id=0, use_visual=False, multiagent=False): """ Environment initialization :param environment_filename: The UnityEnvironment path or file to be wrapped in the gym. :param worker_id: Worker number for environment. :param use_visual: Whether to use visual observation or vector observation. :param multiagent: Whether to run in multi-agent mode (lists of obs, reward, done). """ self._env = UnityEnvironment(environment_filename, worker_id) self.name = self._env.academy_name self.visual_obs = None self._current_state = None self._n_agents = None self._multiagent = multiagent # Check brain configuration if len(self._env.brains) != 1: raise UnityGymException( "There can only be one brain in a UnityEnvironment " "if it is wrapped in a gym.") self.brain_name = self._env.external_brain_names[0] brain = self._env.brains[self.brain_name] if use_visual and brain.number_visual_observations == 0: raise UnityGymException("`use_visual` was set to True, however there are no" " visual observations as part of this environment.") self.use_visual = brain.number_visual_observations == 1 and use_visual if brain.num_stacked_vector_observations != 1: raise UnityGymException( "There can only be one stacked vector observation in a UnityEnvironment " "if it is wrapped in a gym.") # Check for number of agents in scene. initial_info = self._env.reset()[self.brain_name] self._check_agents(len(initial_info.agents)) # Set observation and action spaces if brain.vector_action_space_type == "discrete": if len(brain.vector_action_space_size) == 1: self._action_space = spaces.Discrete(brain.vector_action_space_size[0]) else: self._action_space = spaces.MultiDiscrete(brain.vector_action_space_size) else: high = np.array([1] * brain.vector_action_space_size[0]) self._action_space = spaces.Box(-high, high, dtype=np.float32) high = np.array([np.inf] * brain.vector_observation_space_size) self.action_meanings = brain.vector_action_descriptions if self.use_visual: if brain.camera_resolutions[0]["blackAndWhite"]: depth = 1 else: depth = 3 self._observation_space = spaces.Box(0, 1, dtype=np.float32, shape=(brain.camera_resolutions[0]["height"], brain.camera_resolutions[0]["width"], depth)) else: self._observation_space = spaces.Box(-high, high, dtype=np.float32) def reset(self): """Resets the state of the environment and returns an initial observation. In the case of multi-agent environments, this is a list. Returns: observation (object/list): the initial observation of the space. """ info = self._env.reset()[self.brain_name] n_agents = len(info.agents) self._check_agents(n_agents) if not self._multiagent: obs, reward, done, info = self._single_step(info) else: obs, reward, done, info = self._multi_step(info) return obs def step(self, action): """Run one timestep of the environment's dynamics. When end of episode is reached, you are responsible for calling `reset()` to reset this environment's state. Accepts an action and returns a tuple (observation, reward, done, info). In the case of multi-agent environments, these are lists. 
        Args:
            action (object/list): an action provided by the agent
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list): amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information, including BrainInfo.
        """
        if self._multiagent:
            if not isinstance(action, list):
                raise UnityGymException("The environment was expecting `action` to be a list.")
            if len(action) != self._n_agents:
                raise UnityGymException(
                    "The environment was expecting a list of {} actions.".format(self._n_agents))
        else:
            action = np.array(action)

        info = self._env.step(action)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self._current_state = info

        if not self._multiagent:
            obs, reward, done, info = self._single_step(info)
        else:
            obs, reward, done, info = self._multi_step(info)
        return obs, reward, done, info

    def _single_step(self, info):
        if self.use_visual:
            self.visual_obs = info.visual_observations[0][0, :, :, :]
            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations[0, :]
        return (default_observation, info.rewards[0], info.local_done[0],
                {"text_observation": info.text_observations[0], "brain_info": info})

    def _multi_step(self, info):
        if self.use_visual:
            self.visual_obs = info.visual_observations
            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations
        return (list(default_observation), info.rewards, info.local_done,
                {"text_observation": info.text_observations, "brain_info": info})

    def render(self, mode='rgb_array'):
        return self.visual_obs

    def close(self):
        """Override _close in your subclass to perform any necessary cleanup.
        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        self._env.close()

    def get_action_meanings(self):
        return self.action_meanings

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        Currently not implemented.
        """
        logger.warn("Could not seed environment %s", self.name)
        return

    def _check_agents(self, n_agents):
        if not self._multiagent and n_agents > 1:
            raise UnityGymException("The environment was launched as a single-agent environment, "
                                    "however there is more than one agent in the scene.")
        elif self._multiagent and n_agents <= 1:
            raise UnityGymException("The environment was launched as a multi-agent environment, "
                                    "however there is only one agent in the scene.")
        if self._n_agents is None:
            self._n_agents = n_agents
            logger.info("{} agents within environment.".format(n_agents))
        elif self._n_agents != n_agents:
            raise UnityGymException("The number of agents in the environment has changed since "
                                    "initialization. This is not supported.")

    @property
    def metadata(self):
        return {'render.modes': ['rgb_array']}

    @property
    def reward_range(self):
        return -float('inf'), float('inf')

    @property
    def spec(self):
        return None

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def number_agents(self):
        return self._n_agents
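# --- Usage sketch (illustrative) ---
# A minimal loop driving the gym wrapper above, assuming a Unity build at the
# hypothetical path "./builds/3DBall" and a single (non-visual) agent; the
# random policy comes from the wrapped action_space.
env = UnityEnv("./builds/3DBall", worker_id=1, use_visual=False)
obs = env.reset()
for _ in range(100):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()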
def train_wrapper(env_config, wrapper_config):
    """
    Set the Training Parameters
    :param env_config: dictionary, used to pass parameters into the environment
    :param wrapper_config: dictionary of user-defined variables.
    """
    # num_episodes (int): maximum number of training episodes
    num_episodes = wrapper_config['num_episodes']
    # scores_average_window (int): the window size employed for calculating the average score
    scores_average_window = wrapper_config['scores_avg_window']
    # solved_score (float): the average score required for the environment to be considered solved
    solved_score = wrapper_config['solved_score']
    # load_weights (bool): whether or not to start training with loaded weights
    load_weights = wrapper_config['load_weights']
    # weights_path: path to the directory containing the weights (same directory to save them)
    weights_path = wrapper_config['weights_path']
    if load_weights and not os.path.isdir(weights_path):
        print('weights dir does not exist')
        raise NotADirectoryError
    # save_mem (bool): whether or not to save memory
    save_mem = wrapper_config['save_mem']
    # load_mem (bool): whether or not to continue training with loaded memory
    load_mem = wrapper_config['load_mem']
    # mem_path: path to the directory containing the memory to load
    mem_path = wrapper_config['mem_path']
    if load_mem and not os.path.isdir(mem_path):
        print('mem dir does not exist')
        raise NotADirectoryError
    # build_path: path to the build of the unity environment.
    build_path = None if wrapper_config['build'] == 'None' else wrapper_config['build']
    if (build_path is not None) and (not os.path.isfile(build_path)):
        print('--build is not a valid path')
        raise FileNotFoundError
    # no_graphics (bool): whether or not to start the environment without graphics (default = True in training)
    no_graphics_in = not wrapper_config['show_graphics']
    # agent_type (DDPG | MDDPG | MADDPG)
    agent_type = wrapper_config['agent']
    if not issubclass(agent_type, AgentABC):
        print('invalid agent type')
        raise TypeError
    # print_agent_loss (bool): whether or not to print the agent's loss (MSE for the critic) after every episode
    print_agent_loss = wrapper_config['print_agent_loss']
    # save_log (bool): whether or not to save the episode scores (csv format, default is True)
    save_log = wrapper_config['save_score_log']
    # save_best_weights (bool): also save the best weights of the session (by average score)
    save_best_weights = wrapper_config['save_best_weights']
    # episode_scores (list of float): records the score obtained in each episode
    episode_scores = []

    """
    Start the Unity Environment
    """
    env = UnityEnvironment(file_name=build_path, no_graphics=no_graphics_in)

    """
    Get The Unity Environment Brain
    Unity ML-Agents applications or Environments contain "BRAINS", which are responsible for
    deciding the actions an agent or set of agents should take given a current set of
    environment (state) observations. The Race environment has a single Brain, thus we just
    need to access the first brain available (i.e., the default brain). We then set the
    default brain as the brain that will be controlled.
    """
    # Get the default brain
    brain_name = env.brain_names[0]
    # Assign the default brain as the brain to be controlled
    brain = env.brains[brain_name]

    """
    Determine the size of the Action and State Spaces and the Number of Agents.
    The observation space consists of variables corresponding to ray casts in different
    directions, velocity and direction. Each action is a vector with 2 numbers, corresponding
    to steer left/right and brake/drive (in this order); each action component is a number
    between -1 and 1.
    num_agents corresponds to the number of agents using the same brain (since all cars use
    the same action / observation space, they all use the same brain). If, in the future,
    different cars should use different observation spaces, they would need to be split
    into different brains.
    """
    # Set the number of actions or action size
    action_size = brain.vector_action_space_size
    # Set the size of state observations or state size
    state_size = brain.vector_observation_space_size
    # Get number of agents in Environment
    env_info = env.reset(train_mode=True, config=env_config)[brain_name]
    num_agents = len(env_info.agents)
    print('\nNumber of Agents: ', num_agents)

    """
    Create an Agent from the Agent Class in Agent.py
    Every agent is initialized with the following parameters.
    ======
    state_size (int): dimension of each state (required)
    action_size (int): dimension of each action (required)
    num_agents (int): number of agents in the unity environment
    seed (int): random seed for initializing training point (default = 0)
    Here we initialize an agent using the Unity environment's state and action sizes and the
    number of agents determined above.
    """
    agent: AgentABC = agent_type(state_size=state_size, action_size=action_size[0],
                                 num_agents=num_agents, random_seed=0)

    # Load trained model weights
    if load_weights:
        agent.load_weights(weights_path)
    if load_mem:
        agent.load_mem(mem_path)

    """
    ###################################
    STEP 6: Run the Training Sequence
    The Training Process involves the agent learning from repeated episodes of behaviour
    to map states to actions that maximize rewards received via environmental interaction.
    The agent training process involves the following:
    (1) Reset the environment at the beginning of each episode.
    (2) Obtain (observe) the current state, s(t), of the environment at time t.
    (3) Perform an action, a(t), in the environment given s(t).
    (4) Observe the result of the action in terms of the reward received and the state of
        the environment at time t+1 (i.e., s(t+1)).
    (5) Update agent memory and learn from experience (i.e., agent.step).
    (6) Update the episode score (total reward received) and set s(t) -> s(t+1).
    (7) If the episode is done, break and repeat from (1); otherwise repeat from (3).
    Below we also exit the training process early if the environment is solved, that is,
    if the average score over the previous scores_average_window episodes exceeds solved_score.
    """
    best_score = -np.inf  # best average score so far (for saving best_weights)

    # loop over num_episodes
    for i_episode in range(1, num_episodes + 1):
        # reset the unity environment at the beginning of each episode
        env_info = env.reset(train_mode=True, config=env_config)[brain_name]
        # get the initial state of the unity environment
        states = env_info.vector_observations
        # reset the training agent for a new episode
        agent.reset()
        # set the initial episode score to zero.
        agent_scores = np.zeros(num_agents)

        # Run the episode training loop;
        # At each loop step take an action as a function of the current state observations.
        # Based on the resultant environment state (next_state) and reward received, update the agent ('step' method).
        # If the environment episode is done, exit the loop...
        # Otherwise repeat until done == True
        steps = 0
        while True:
            steps = steps + 1
            # determine actions for the unity agents from the current state
            actions = agent.act(states)
            # send the actions to the unity agents in the environment and receive the resultant environment information
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get the next states for each unity agent in the environment
            rewards = env_info.rewards  # get the rewards for each unity agent in the environment
            dones = env_info.local_done  # see if the episode has finished for each unity agent in the environment
            # Send (S, A, R, S') info to the training agent for the replay buffer (memory) and network updates
            agent.step(states, actions, rewards, next_states, dones)
            # set new states to current states for determining next actions
            states = next_states
            # Update the episode score for each unity agent
            agent_scores += rewards
            # If any unity agent indicates that the episode is done,
            # then exit the episode loop to begin a new episode
            if np.any(dones):
                break

        # Add the episode score to episode_scores and
        # calculate the mean score over the last scores_average_window episodes
        # (over all episodes so far while i_episode < scores_average_window)
        episode_scores.append(np.mean(agent_scores))
        average_score = np.mean(
            episode_scores[i_episode - min(i_episode, scores_average_window):i_episode + 1])

        # Print the current and average score, and the number of steps in the episode.
        print('\nEpisode {}\tEpisode Score: {:.3f}\tAverage Score: {:.3f}\tNumber Of Steps: {}'
              .format(i_episode, episode_scores[i_episode - 1], average_score, steps), end="")
        if print_agent_loss:
            # print the agent's loss (useful for babysitting the training)
            print('\t episode loss: {}'.format(agent.debug_loss))

        if save_log:
            # Save the recorded scores data (in weights path)
            if not os.path.isdir(weights_path):
                os.mkdir(weights_path)
            scores_filename = "Agent_Scores.csv"
            # noinspection PyTypeChecker
            np.savetxt(os.path.join(weights_path, scores_filename), episode_scores, delimiter=",")

        # Save trained Actor and Critic network weights after each episode
        agent.save_weights(weights_path)
        if save_best_weights:
            if best_score < average_score:
                best_score = average_score
                agent.save_weights(weights_path + '_best')
        if save_mem and (i_episode % 50) == 0:
            agent.save_mem(mem_path)

        # Check whether the task is solved (i.e., average_score >= solved_score over the window).
        # If yes, save the network weights and scores and end training.
        if i_episode > scores_average_window * 2 and average_score >= solved_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'
                  .format(i_episode, average_score))
            break

    agent.save_mem(mem_path)

    """
    ###################################
    STEP 7: Everything is Finished -> Close the Environment.
    """
    env.close()
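# --- Example configuration (illustrative) ---
# A sample wrapper_config for train_wrapper. The keys mirror the lookups at
# the top of the function; the values, and the DDPG class standing in for an
# AgentABC subclass, are placeholders that depend on the surrounding project.
example_wrapper_config = {
    'num_episodes': 2000,
    'scores_avg_window': 100,
    'solved_score': 30.0,
    'load_weights': False,
    'weights_path': './weights',
    'save_mem': False,
    'load_mem': False,
    'mem_path': './memory',
    'build': 'None',            # the string 'None' means: attach to the Unity editor
    'show_graphics': False,
    'agent': DDPG,              # any AgentABC subclass from this project
    'print_agent_loss': False,
    'save_score_log': True,
    'save_best_weights': True,
}
train_wrapper(env_config={}, wrapper_config=example_wrapper_config)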
def main():
    # Initialize environment
    env = UnityEnvironment(file_name='../env/Hopper/Hopper')
    default_brain = env.brain_names[0]
    brain = env.brains[default_brain]
    env_info = env.reset(train_mode=True)[default_brain]

    obs_dim = env_info.vector_observations[0].shape[0]
    act_dim = brain.vector_action_space_size[0]
    print('State dimension:', obs_dim)
    print('Action dimension:', act_dim)

    # Set a random seed
    np.random.seed(0)
    torch.manual_seed(0)

    # Create a SummaryWriter object for TensorBoard
    dir_name = 'runs/' + 'Hopper' + '_' + time.ctime()
    writer = SummaryWriter(log_dir=dir_name)

    # Main networks
    actor = GaussianPolicy(obs_dim, act_dim).to(device)
    qf1 = FlattenMLP(obs_dim + act_dim, 1).to(device)
    qf2 = FlattenMLP(obs_dim + act_dim, 1).to(device)

    # Target networks
    qf1_target = FlattenMLP(obs_dim + act_dim, 1).to(device)
    qf2_target = FlattenMLP(obs_dim + act_dim, 1).to(device)

    # Initialize target parameters to match main parameters
    hard_target_update(qf1, qf1_target)
    hard_target_update(qf2, qf2_target)

    # Create optimizers
    actor_optimizer = optim.Adam(actor.parameters(), lr=args.actor_lr)
    qf1_optimizer = optim.Adam(qf1.parameters(), lr=args.qf_lr)
    qf2_optimizer = optim.Adam(qf2.parameters(), lr=args.qf_lr)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim, act_dim, args.buffer_size)

    # If automatic entropy tuning is enabled, initialize a target entropy,
    # a log alpha and an alpha optimizer
    if args.automatic_entropy_tuning:
        target_entropy = -np.prod((act_dim,)).item()
        log_alpha = torch.zeros(1, requires_grad=True, device=device)
        alpha_optimizer = optim.Adam([log_alpha], lr=args.alpha_lr)
    else:
        target_entropy = None
        log_alpha = None
        alpha_optimizer = None

    def run_one_episode(steps, eval_mode):
        total_reward = 0.
        env_info = env.reset(train_mode=True)[default_brain]
        obs = env_info.vector_observations[0]
        done = False

        # Keep interacting until the agent reaches a terminal state.
        while not done:
            steps += 1
            if eval_mode:
                # Act deterministically (mean action) during evaluation
                action, _, _ = actor(torch.Tensor(obs).to(device))
                action = action.detach().cpu().numpy()
                env_info = env.step(action)[default_brain]
                next_obs = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]
            else:
                # Collect experience (s, a, r, s') using the stochastic policy
                _, action, _ = actor(torch.Tensor(obs).to(device))
                action = action.detach().cpu().numpy()
                env_info = env.step(action)[default_brain]
                next_obs = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]

                # Add the experience to the replay buffer
                replay_buffer.add(obs, action, reward, next_obs, done)

                # Start training once the number of stored experiences exceeds the batch size
                if steps > args.batch_size:
                    batch = replay_buffer.sample(args.batch_size)
                    args.alpha = train_model(actor, qf1, qf2, qf1_target, qf2_target,
                                             actor_optimizer, qf1_optimizer, qf2_optimizer,
                                             batch, target_entropy, log_alpha, alpha_optimizer)

            total_reward += reward
            obs = next_obs
        return steps, total_reward, args.alpha

    train_sum_returns = 0.
    train_num_episodes = 0
    start_time = time.time()
    steps = 0

    for episode in range(1, args.training_eps + 1):
        # Perform the training phase, during which the agent learns
        eval_mode = False

        # Run one episode
        steps, train_episode_return, args.alpha = run_one_episode(steps, eval_mode)

        train_sum_returns += train_episode_return
        train_num_episodes += 1
        train_average_return = train_sum_returns / train_num_episodes if train_num_episodes > 0 else 0.0

        # Log experiment results for training episodes
        writer.add_scalar('Train/AverageReturns', train_average_return, episode)
        writer.add_scalar('Train/EpisodeReturns', train_episode_return, episode)
        if args.automatic_entropy_tuning:
            writer.add_scalar('Train/Alpha', args.alpha, episode)

        # Perform the evaluation phase -- no learning
        if episode > 0 and episode % args.eval_per_train == 0:
            eval_mode = True
            eval_sum_returns = 0.
            eval_num_episodes = 0

            for _ in range(args.evaluation_eps):
                # Run one episode
                steps, eval_episode_return, _ = run_one_episode(steps, eval_mode)

                eval_sum_returns += eval_episode_return
                eval_num_episodes += 1

            eval_average_return = eval_sum_returns / eval_num_episodes if eval_num_episodes > 0 else 0.0

            # Log experiment results for evaluation episodes
            writer.add_scalar('Eval/AverageReturns', eval_average_return, episode)
            writer.add_scalar('Eval/EpisodeReturns', eval_episode_return, episode)

            print('---------------------------------------')
            print('Episodes:', episode)
            print('AverageReturn:', round(train_average_return, 2))
            print('EvalEpisodes:', eval_num_episodes)
            print('EvalAverageReturn:', round(eval_average_return, 2))
            print('Time:', int(time.time() - start_time))
            print('---------------------------------------')

            # Save the trained model once it clears the return threshold, then stop
            if eval_average_return >= args.threshold_return:
                if not os.path.exists('./save_model'):
                    os.mkdir('./save_model')
                ckpt_path = os.path.join('./save_model',
                                         'Hopper' + '_ep_' + str(episode)
                                         + '_rt_' + str(round(eval_average_return, 2))
                                         + '_t_' + str(int(time.time() - start_time)) + '.pt')
                torch.save(actor.state_dict(), ckpt_path)
                break

    env.close()
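# --- Helper sketch (illustrative) ---
# hard_target_update is defined elsewhere in this project; a minimal version
# consistent with the calls above simply copies the main network's parameters
# into the target network. A soft (Polyak) variant of the same idea is what a
# train_model implementation would typically apply after each gradient step.
# Both sketches assume torch is already imported, as in the script above.
def hard_target_update(main, target):
    # target <- main (exact copy of all parameters and buffers)
    target.load_state_dict(main.state_dict())

def soft_target_update(main, target, tau=0.005):
    # target <- tau * main + (1 - tau) * target (Polyak averaging)
    with torch.no_grad():
        for p, p_targ in zip(main.parameters(), target.parameters()):
            p_targ.mul_(1.0 - tau)
            p_targ.add_(tau * p)

if __name__ == '__main__':
    main()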