Example #1
def init_unity_env(env_path, show_visuals=True):
    worker_id = 0
    done = False

    while not done:
        if worker_id > 64:
            sys.exit()
        try:
            env = UnityEnvironment(env_path,
                                   worker_id=worker_id,
                                   no_graphics=not show_visuals)
            done = True
        except mlagents.envs.exception.UnityWorkerInUseException:
            worker_id += 1

    env.reset(train_mode=True)
    brain_name = list(env.brains.keys())[0]

    state_space = env.brains[brain_name].vector_observation_space_size
    action_space = env.brains[brain_name].vector_action_space_size

    n_agents = env._n_agents[brain_name]

    multiagent = n_agents > 1

    return env, state_space, action_space, n_agents, multiagent, brain_name
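A minimal usage sketch for the helper above; the import paths match ML-Agents 0.x and the executable path is a placeholder:

import sys
import mlagents.envs.exception
from mlagents.envs import UnityEnvironment

env, state_space, action_space, n_agents, multiagent, brain_name = init_unity_env(
    "builds/Walker/Walker.x86_64", show_visuals=False)
print(brain_name, state_space, action_space, n_agents, multiagent)
env.close()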
Example #2
def get_unity_envs():
    # check the python environment
    print("Python version: ", sys.version)
    if sys.version_info[0] < 3:
        raise Exception("ERROR: ML-Agents Toolkit requires Python 3")

    # set the unity environment
    env = UnityEnvironment(file_name=UNITY_PATH, base_port=5005)
    brain = env.brain_names[0]
    env.reset(train_mode=True)[brain]

    return env, brain
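A short usage sketch, assuming UNITY_PATH points at a built ML-Agents 0.x environment:

env, brain = get_unity_envs()
info = env.reset(train_mode=True)[brain]
print("agents:", len(info.agents), "obs shape:", info.vector_observations.shape)
env.close()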
Example #3
def test_reset(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(discrete_action=False,
                                                      visual_inputs=0)
    env = UnityEnvironment(" ")
    spec = env.get_agent_group_spec("RealFakeBrain")
    env.reset()
    batched_step_result = env.get_step_result("RealFakeBrain")
    env.close()
    assert isinstance(batched_step_result, BatchedStepResult)
    assert len(spec.observation_shapes) == len(batched_step_result.obs)
    n_agents = batched_step_result.n_agents()
    for shape, obs in zip(spec.observation_shapes, batched_step_result.obs):
        assert (n_agents, ) + shape == obs.shape
Example #4
def do_rollout(env: UnityEnvironment, brain_name):
    """ Builds a path by running through an environment with random actions. """
    obs, rewards, actions, human_obs = [], [], [], []

    # env.reset()/env.step() return a dict mapping brain names to BrainInfo
    curr_info = env.reset(train_mode=False)[brain_name]
    # Primary environment loop
    while not env.global_done:
        # record agent 0's vector observation
        ob = curr_info.vector_observations[0]
        action = 2 * np.random.rand() - 1
        obs.append(ob)
        actions.append(action)
        # Step once and read reward/done from the returned BrainInfo
        new_info = env.step(vector_action=action)[brain_name]
        rewards.append(new_info.rewards[0])
        # Visual observations stand in for a human-viewable record of the step
        human_obs.append(new_info.visual_observations)
        curr_info = new_info
        if new_info.local_done[0]:
            break
    # Build path dictionary
    path = {
        "obs": np.array(obs),
        "original_rewards": np.array(rewards),
        "actions": np.array(actions),
        "human_obs": np.array(human_obs)
    }
    return path
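A hedged driver for do_rollout; the file name is a placeholder and the brain is assumed to take a single continuous action, matching the random scalar sampled above:

env = UnityEnvironment(file_name="builds/Pendulum")  # hypothetical build
brain_name = env.external_brain_names[0]
path = do_rollout(env, brain_name)
print(path["obs"].shape, path["original_rewards"].sum())
env.close()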
Example #5
def test_ppo_get_value_estimates(mock_communicator, mock_launcher,
                                 dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(discrete_action=False,
                                                      visual_inputs=0)
    env = UnityEnvironment(" ")
    brain_infos = env.reset()
    brain_info = brain_infos[env.external_brain_names[0]]

    trainer_parameters = dummy_config
    model_path = env.external_brain_names[0]
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    policy = PPOPolicy(0, env.brains[env.external_brain_names[0]],
                       trainer_parameters, False, False)
    run_out = policy.get_value_estimates(brain_info, 0, done=False)
    for key, val in run_out.items():
        assert type(key) is str
        assert type(val) is float

    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    policy.reward_signals["extrinsic"].use_terminal_states = False
    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val != 0.0

    env.close()
Example #6
class UnityEnv():
    """Unity Reacher Environment Wrapper
        https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Examples.md

    """
    def __init__(self,
                 env_file='data/Reacher.exe',
                 no_graphics=True,
                 mlagents=False):
        if mlagents:
            from mlagents.envs.environment import UnityEnvironment
        else:
            from unityagents import UnityEnvironment
        self.env = UnityEnvironment(file_name=env_file,
                                    no_graphics=no_graphics)
        self.brain_name = self.env.brain_names[0]
        brain = self.env.brains[self.brain_name]
        self.action_size = brain.vector_action_space_size
        if not isinstance(self.action_size, int):
            self.action_size = self.action_size[0]
        env_info = self.env.reset(train_mode=True)[self.brain_name]
        self.state_size = env_info.vector_observations.shape[1]
        self.num_agents = len(env_info.agents)

    def reset(self, train=True):
        env_info = self.env.reset(train_mode=train)[self.brain_name]
        return env_info.vector_observations

    def close(self):
        self.env.close()

    def step(self, actions):
        actions = np.clip(actions, -1, 1)
        env_info = self.env.step(actions)[self.brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        return next_states, np.array(rewards), np.array(dones)

    @property
    def action_shape(self):
        return (self.num_agents, self.action_size)
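A minimal interaction sketch for this wrapper, assuming the Reacher build sits at the default data/Reacher.exe path:

import numpy as np

env = UnityEnv(no_graphics=True)
states = env.reset(train=True)
for _ in range(10):
    actions = np.random.uniform(-1, 1, size=env.action_shape)
    next_states, rewards, dones = env.step(actions)
    if dones.any():
        break
env.close()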
Example #7
def test_ppo_policy_evaluate(mock_communicator, mock_launcher, dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(discrete_action=False,
                                                      visual_inputs=0)
    env = UnityEnvironment(" ")
    env.reset()
    brain_name = env.get_agent_groups()[0]
    brain_info = step_result_to_brain_info(
        env.get_step_result(brain_name), env.get_agent_group_spec(brain_name))
    brain_params = group_spec_to_brain_parameters(
        brain_name, env.get_agent_group_spec(brain_name))

    trainer_parameters = dummy_config
    model_path = brain_name
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    policy = PPOPolicy(0, brain_params, trainer_parameters, False, False)
    run_out = policy.evaluate(brain_info)
    assert run_out["action"].shape == (3, 2)
    env.close()
Example #8
def run(train_mode, load_model, env_name):
    env = UnityEnvironment(file_name=env_name)
    default_brain = env.brain_names[0]

    agent = DDPGAgent(state_size, action_size, train_mode, load_model)
    rewards = deque(maxlen=print_interval)
    success_cnt = 0
    step = 0

    for episode in range(run_episode + test_episode):
        if episode == run_episode:
            train_mode = False

        env_info = env.reset(train_mode=train_mode)[default_brain]
        state = env_info.vector_observations[0]
        episode_rewards = 0
        done = False

        while not done:
            step += 1

            action = agent.get_action([state])[0]
            #print(action)
            env_info = env.step(action)[default_brain]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]

            episode_rewards += reward

            if train_mode:
                agent.append_sample(state, action, reward, next_state, done)

            state = next_state

            if episode > start_train_episode and train_mode:
                agent.train_model()

        success_cnt = success_cnt + 1 if reward == 1 else success_cnt
        rewards.append(episode_rewards)
        agent.save_samples(episode)

        if episode % print_interval == 0 and episode != 0:
            print("step: {} / episode: {} / reward: {:.3f} / success_cnt: {}".
                  format(step, episode, np.mean(rewards), success_cnt))
            agent.Write_Summray(np.mean(rewards), success_cnt, episode)
            success_cnt = 0

        if train_mode and episode % save_interval == 0 and episode != 0:
            print("model saved")
            agent.save_model()

    env.close()
Example #9
def init_unity_env(env_path, show_visuals=True):

    # Find a worker_id < 64 that's not in use
    worker_id = 0
    done = False

    while not done:
        if worker_id > 64:
            sys.exit()
        try:
            env = UnityEnvironment(env_path,
                                   worker_id=worker_id,
                                   no_graphics=not show_visuals)
            done = True
        except mlagents.envs.exception.UnityWorkerInUseException:
            worker_id += 1

    # Get state and action space, as well as multiagent and multibrain info from environment
    env.reset(train_mode=not show_visuals)
    # brain_name = list(env.brains.keys())[0]
    brain_names = list(env.brains.keys())

    if len(brain_names) > 1:
        multibrain = True
        n_agents = env._n_agents[brain_names[0]] + env._n_agents[
            brain_names[1]]
    else:
        multibrain = False
        n_agents = env._n_agents[brain_names[0]]

    # WalkerVis is a version of the Walker environment with one brain 'WalkerVis'
    # having visual observations, whereas 'Walker' brain does not.
    # The visual observations are used for recording episodes
    state_space = env.brains[brain_names[0]].vector_observation_space_size
    action_space = env.brains[brain_names[0]].vector_action_space_size

    multiagent = n_agents > 1

    return env, state_space, action_space, n_agents, multiagent, brain_names, multibrain
Example #10
def test_ppo_policy_evaluate(mock_communicator, mock_launcher, dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(discrete_action=False,
                                                      visual_inputs=0)
    env = UnityEnvironment(" ")
    brain_infos = env.reset()
    brain_info = brain_infos[env.external_brain_names[0]]

    trainer_parameters = dummy_config
    model_path = env.external_brain_names[0]
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    policy = PPOPolicy(0, env.brains[env.external_brain_names[0]],
                       trainer_parameters, False, False)
    run_out = policy.evaluate(brain_info)
    assert run_out["action"].shape == (3, 2)
    env.close()
Example #11
def test_reset(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(discrete_action=False,
                                                      visual_inputs=0)
    env = UnityEnvironment(" ")
    brain = env.brains["RealFakeBrain"]
    brain_info = env.reset()
    env.close()
    assert isinstance(brain_info, dict)
    assert isinstance(brain_info["RealFakeBrain"], BrainInfo)
    assert isinstance(brain_info["RealFakeBrain"].visual_observations, list)
    assert isinstance(brain_info["RealFakeBrain"].vector_observations,
                      np.ndarray)
    assert (len(brain_info["RealFakeBrain"].visual_observations) ==
            brain.number_visual_observations)
    assert len(brain_info["RealFakeBrain"].vector_observations) == len(
        brain_info["RealFakeBrain"].agents)
    assert (len(brain_info["RealFakeBrain"].vector_observations[0]) ==
            brain.vector_observation_space_size)
Example #12
class UnityEnv(gym.Env):
    """
    Provides Gym wrapper for Unity Learning Environments.
    Multi-agent environments use lists for object types, as done here:
    https://github.com/openai/multiagent-particle-envs
    """
    def __init__(
        self,
        environment_filename: str,
        worker_id: int = 0,
        use_visual: bool = False,
        uint8_visual: bool = False,
        multiagent: bool = False,
        flatten_branched: bool = False,
        no_graphics: bool = False,
        allow_multiple_visual_obs: bool = False,
    ):
        """
        Environment initialization
        :param environment_filename: The UnityEnvironment path or file to be wrapped in the gym.
        :param worker_id: Worker number for environment.
        :param use_visual: Whether to use visual observation or vector observation.
        :param uint8_visual: Return visual observations as uint8 (0-255) matrices instead of float (0.0-1.0).
        :param multiagent: Whether to run in multi-agent mode (lists of obs, reward, done).
        :param flatten_branched: If True, turn branched discrete action spaces into a Discrete space rather than
            MultiDiscrete.
        :param no_graphics: Whether to run the Unity simulator in no-graphics mode
        :param allow_multiple_visual_obs: If True, return a list of visual observations instead of only one.
        """
        self._env = UnityEnvironment(environment_filename,
                                     worker_id,
                                     no_graphics=no_graphics)

        # Take a single step so that the brain information will be sent over
        if not self._env.brains:
            self._env.step()

        self.name = self._env.academy_name
        self.visual_obs = None
        self._current_state = None
        self._n_agents = None
        self._multiagent = multiagent
        self._flattener = None
        self.game_over = (
            False
        )  # Hidden flag used by Atari environments to determine if the game is over
        self._allow_multiple_visual_obs = allow_multiple_visual_obs

        # Check brain configuration
        if len(self._env.brains) != 1:
            raise UnityGymException(
                "There can only be one brain in a UnityEnvironment "
                "if it is wrapped in a gym.")
        if len(self._env.external_brain_names) <= 0:
            raise UnityGymException(
                "There are not any external brain in the UnityEnvironment")

        self.brain_name = self._env.external_brain_names[0]
        brain = self._env.brains[self.brain_name]

        if use_visual and brain.number_visual_observations == 0:
            raise UnityGymException(
                "`use_visual` was set to True, however there are no"
                " visual observations as part of this environment.")
        self.use_visual = brain.number_visual_observations >= 1 and use_visual

        if not use_visual and uint8_visual:
            logger.warning(
                "`uint8_visual` was set to true, but visual observations are not in use. "
                "This setting will not have any effect.")
            self.uint8_visual = False
        else:
            self.uint8_visual = uint8_visual

        if brain.number_visual_observations > 1 and not self._allow_multiple_visual_obs:
            logger.warning(
                "The environment contains more than one visual observation. "
                "You must define allow_multiple_visual_obs=True to received them all. "
                "Otherwise, please note that only the first will be provided in the observation."
            )

        if brain.num_stacked_vector_observations != 1:
            raise UnityGymException(
                "There can only be one stacked vector observation in a UnityEnvironment "
                "if it is wrapped in a gym.")

        # Check for number of agents in scene.
        initial_info = self._env.reset()[self.brain_name]
        self._check_agents(len(initial_info.agents))

        # Set observation and action spaces
        if brain.vector_action_space_type == "discrete":
            if len(brain.vector_action_space_size) == 1:
                self._action_space = spaces.Discrete(
                    brain.vector_action_space_size[0])
            else:
                if flatten_branched:
                    self._flattener = ActionFlattener(
                        brain.vector_action_space_size)
                    self._action_space = self._flattener.action_space
                else:
                    self._action_space = spaces.MultiDiscrete(
                        brain.vector_action_space_size)

        else:
            if flatten_branched:
                logger.warning(
                    "The environment has a non-discrete action space. It will "
                    "not be flattened.")
            high = np.array([1] * brain.vector_action_space_size[0])
            self._action_space = spaces.Box(-high, high, dtype=np.float32)
        high = np.array([np.inf] * brain.vector_observation_space_size)
        self.action_meanings = brain.vector_action_descriptions
        if self.use_visual:
            shape = (
                brain.camera_resolutions[0].height,
                brain.camera_resolutions[0].width,
                brain.camera_resolutions[0].num_channels,
            )
            if uint8_visual:
                self._observation_space = spaces.Box(0,
                                                     255,
                                                     dtype=np.uint8,
                                                     shape=shape)
            else:
                self._observation_space = spaces.Box(0,
                                                     1,
                                                     dtype=np.float32,
                                                     shape=shape)

        else:
            self._observation_space = spaces.Box(-high, high, dtype=np.float32)

    def reset(self):
        """Resets the state of the environment and returns an initial observation.
        In the case of multi-agent environments, this is a list.
        Returns: observation (object/list): the initial observation of the
            space.
        """
        info = self._env.reset()[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self.game_over = False

        if not self._multiagent:
            obs, reward, done, info = self._single_step(info)
        else:
            obs, reward, done, info = self._multi_step(info)
        return obs

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        In the case of multi-agent environments, these are lists.
        Args:
            action (object/list): an action provided by the agent
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list) : amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information, including BrainInfo.
        """

        # Validate the incoming action(s) and translate them if the action space was flattened.
        if self._multiagent:
            if not isinstance(action, list):
                raise UnityGymException(
                    "The environment was expecting `action` to be a list.")
            if len(action) != self._n_agents:
                raise UnityGymException(
                    "The environment was expecting a list of {} actions.".
                    format(self._n_agents))
            else:
                if self._flattener is not None:
                    # Action space is discrete and flattened - we expect a list of scalars
                    action = [
                        self._flattener.lookup_action(_act) for _act in action
                    ]
                action = np.array(action)
        else:
            if self._flattener is not None:
                # Translate action into list
                action = self._flattener.lookup_action(action)

        info = self._env.step(action)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self._current_state = info

        if not self._multiagent:
            obs, reward, done, info = self._single_step(info)
            self.game_over = done
        else:
            obs, reward, done, info = self._multi_step(info)
            self.game_over = all(done)
        return obs, reward, done, info

    def _single_step(self, info):
        if self.use_visual:
            visual_obs = info.visual_observations

            if self._allow_multiple_visual_obs:
                visual_obs_list = []
                for obs in visual_obs:
                    visual_obs_list.append(self._preprocess_single(obs[0]))
                self.visual_obs = visual_obs_list
            else:
                self.visual_obs = self._preprocess_single(visual_obs[0][0])

            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations[0, :]

        return (
            default_observation,
            info.rewards[0],
            info.local_done[0],
            {
                "text_observation": info.text_observations[0],
                "brain_info": info
            },
        )

    def _preprocess_single(self, single_visual_obs):
        if self.uint8_visual:
            return (255.0 * single_visual_obs).astype(np.uint8)
        else:
            return single_visual_obs

    def _multi_step(self, info):
        if self.use_visual:
            self.visual_obs = self._preprocess_multi(info.visual_observations)
            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations
        return (
            list(default_observation),
            info.rewards,
            info.local_done,
            {
                "text_observation": info.text_observations,
                "brain_info": info
            },
        )

    def _preprocess_multi(self, multiple_visual_obs):
        if self.uint8_visual:
            return [(255.0 * _visual_obs).astype(np.uint8)
                    for _visual_obs in multiple_visual_obs]
        else:
            return multiple_visual_obs

    def render(self, mode="rgb_array"):
        return self.visual_obs

    def close(self):
        """Override _close in your subclass to perform any necessary cleanup.
        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        self._env.close()

    def get_action_meanings(self):
        return self.action_meanings

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        Currently not implemented.
        """
        logger.warn("Could not seed environment %s", self.name)
        return

    def _check_agents(self, n_agents):
        if not self._multiagent and n_agents > 1:
            raise UnityGymException(
                "The environment was launched as a single-agent environment, "
                "however there is more than one agent in the scene.")
        elif self._multiagent and n_agents <= 1:
            raise UnityGymException(
                "The environment was launched as a multi-agent environment, "
                "however there is only one agent in the scene.")
        if self._n_agents is None:
            self._n_agents = n_agents
            logger.info("{} agents within environment.".format(n_agents))
        elif self._n_agents != n_agents:
            raise UnityGymException(
                "The number of agents in the environment has changed since "
                "initialization. This is not supported.")

    @property
    def metadata(self):
        return {"render.modes": ["rgb_array"]}

    @property
    def reward_range(self):
        return -float("inf"), float("inf")

    @property
    def spec(self):
        return None

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def number_agents(self):
        return self._n_agents
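A gym-style usage sketch for the wrapper above; the executable name is a placeholder and a single-agent, vector-observation scene is assumed:

env = UnityEnv("builds/3DBall", worker_id=1, use_visual=False)
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
env.close()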
Example #13
class Agent:
    def __init__(self):
        # Unity variables
        self.env = None
        self.env_info = None
        self.default_brain = None

        # Actions (Nothing, forward, turn left, reverse, turn right)
        self.actions = [0, 1, 2, 3, 4]

        # Robot constants
        self.robot_length_forward = 2.4
        self.robot_length_backwards = 1.1

        # Action constants
        self.thrust_constant = 0.1 * 5
        self.rotation_constant = 2 * 5

        self.features = [
            self.feature_1,
            self.throttling_into_wall,
            self.reversing_into_wall,
            self.getting_closer_to_debris_1,
            self.getting_closer_to_dropzone,
            self.rotation,
            self.pointed_towards_debris,
            self.angle_to_debris_1,
            self.distance_to_debris_1,
            # self.debris_to_dropzone_1
        ]

        # Observations
        self.observations = [0] * 86
        self.sensors_front = []
        self.sensors_behind = []
        # Observation list class(IntEnum)
        self.obs = observation.RobotObservations

    # Setup connection between Unity and Python
    def setup_connection_with_unity(self, build_scene):
        # Connect to Unity and get environment
        self.env = UnityEnvironment(file_name=build_scene, worker_id=0, seed=1)

        # Reset the environment
        self.env_info = self.env.reset(train_mode=True)

        # Set the default brain to work with
        self.default_brain = "Robot"

    # Update observations variable with information about the environment without dropzone
    def update_observations(self):
        self.observations = self.env_info[
            self.default_brain].vector_observations[0]
        self.velocity_z = self.get_obs(self.obs.robot_velocity_z)
        #self.sensors_front = [self.get_obs(self.obs.sensor_measurement_1), self.get_obs(self.obs.sensor_measurement_2), self.get_obs(self.obs.sensor_measurement_30)]
        #self.sensors_behind = [-self.get_obs(self.obs.sensor_measurement_16), -self.get_obs(self.obs.sensor_measurement_15), -self.get_obs(self.obs.sensor_measurement_17)]

        self.sensors_front = [
            self.get_obs(self.obs.sensor_measurement_1),
            self.get_obs(self.obs.sensor_measurement_2),
            self.get_obs(self.obs.sensor_measurement_30)
        ]
        self.sensors_behind = [
            -self.get_obs(self.obs.sensor_measurement_16),
            -self.get_obs(self.obs.sensor_measurement_15),
            -self.get_obs(self.obs.sensor_measurement_17)
        ]

    def get_obs(self, index):
        return self.observations[index]

    def perform_action(self, action):
        self.env_info = self.env.step({self.default_brain: action})

    def get_reward(self):
        return self.env_info[self.default_brain].rewards[0]

    def is_done(self):
        return self.env_info[self.default_brain].local_done[0]

    def reset_simulation(self):
        self.env.reset()

    # States
    def get_state(self):
        state = [1]

        self.update_observations()

        # Throttling into wall
        state.append(1 if self.sensors_front < [self.robot_length_forward] else 0)

        # Reversing into wall
        state.append(1 if self.sensors_behind > [-self.robot_length_backwards] else 0)

        # Robot within dropZone
        state.append(1 if self.get_obs(self.obs.robot_in_dropzone) else 0)

        # Getting closer to debris 1
        #state.append(1) if self.get_obs(self.obs.getting_closer_to_debris_1) else state.append(0)

        # Ready to pickup debris
        state.append(1 if self.get_obs(self.obs.shovel_position) == 330 else 0)

        # Debris is in shovel
        state.append(1 if self.get_obs(self.obs.debris_in_shovel) else 0)

        # Debris in front of shovel
        state.append(1 if self.get_obs(self.obs.debris_in_front) else 0)

        # Getting closer to dropzone
        state.append(1 if self.get_obs(self.obs.robot_facing_debris) else 0)

        # Rotation
        #state.append(int(self.get_obs(self.obs.robot_rotation)))

        # Pointed towards debris
        state.append(1 if self.get_obs(self.obs.robot_facing_debris) else 0)

        # Angle to debris
        state.append(int(self.get_obs(self.obs.angle_robot_debris_1)))

        # Distance to debris
        state.append(int(self.get_obs(self.obs.getting_closer_to_debris_1)))

        # Debris distance to dropzone
        # state.append(int(self.get_obs(self.obs.debris_to_dropzone_1)))

        return state

    # Features
    def feature_1(self, state, action):
        return 1

    def throttling_into_wall(self, state, action):
        constant = 1

        if action == 1:
            constant = self.thrust_constant
        elif action == -1:
            constant = -self.thrust_constant

        return 1 if self.sensors_front < [
            self.robot_length_forward + constant
        ] else 0

    def reversing_into_wall(self, state, action):
        constant = 1

        if action == -1:
            constant = -self.thrust_constant
        elif action == 1:
            constant = self.thrust_constant

        return 1 if self.sensors_behind > [
            self.robot_length_backwards + constant
        ] else 0

    def distance_to_debris_1(self, state, action):
        debrisPosition_1 = [
            self.obs.debris_1_position_x, self.obs.debris_1_position_z
        ]
        return self.distance_to_debris(state, action, debrisPosition_1)

    def distance_to_debris_2(self, state, action):
        debrisPosition_2 = [
            self.obs.debris_2_position_x, self.obs.debris_2_position_y
        ]
        return self.distance_to_debris(state, action, debrisPosition_2)

    def distance_to_debris_3(self, state, action):
        debrisPosition_3 = [
            self.obs.debris_3_position_x, self.obs.debris_3_position_z
        ]
        return self.distance_to_debris(state, action, debrisPosition_3)

    def distance_to_debris_4(self, state, action):
        debrisPosition_4 = [
            self.obs.debris_4_position_x, self.obs.debris_4_position_z
        ]
        return self.distance_to_debris(state, action, debrisPosition_4)

    def distance_to_debris_5(self, state, action):
        debrisPosition_5 = [
            self.obs.debris_5_position_x, self.obs.debris_5_position_z
        ]
        return self.distance_to_debris(state, action, debrisPosition_5)

    def distance_to_debris_6(self, state, action):
        debrisPosition_6 = [
            self.obs.debris_6_position_x, self.obs.debris_6_position_z
        ]
        return self.distance_to_debris(state, action, debrisPosition_6)

    def distance_to_debris(self, state, action, debrisNumber):
        robotPosition = [self.obs.robot_position_x, self.obs.robot_position_z]
        distance = 0

        if action == 0:  # standing still
            Angle = 0
            distance = self.distance_solver(robotPosition, debrisNumber, 0,
                                            Angle)

        if action == 1:  # forward
            Angle = 0
            distance = self.distance_solver(robotPosition, debrisNumber, 1,
                                            Angle)

        if action == 2:  # left while not throttling
            Angle = self.rotation_constant
            distance = self.distance_solver(robotPosition, debrisNumber, 0,
                                            Angle)

        if action == 3:  # backward
            Angle = 0
            distance = self.distance_solver(robotPosition, debrisNumber, -1,
                                            Angle)

        if action == 4:  # right while not throttling
            Angle = -self.rotation_constant
            distance = self.distance_solver(robotPosition, debrisNumber, 0,
                                            Angle)

        transform_value = 1 / 20

        return distance * transform_value

    def distance_solver(self, robotPosition, debrisNumber, throttle, angle):
        robotPosition_new = []
        # find robot's position based on current rotation
        for i in range(len(robotPosition)):
            if throttle == 1:  # if moving forwards, add constant
                robotPosition[i] += self.thrust_constant
                robotPosition_new.append(robotPosition[i])
            elif throttle == -1:  # if moving backwards, subtract constant
                robotPosition[i] -= self.thrust_constant
                robotPosition_new.append(robotPosition[i])
            elif throttle == 0:  # if not moving save robot's current position as new position
                robotPosition_new.append(robotPosition[i])

        # find robot's new position based on its previous position
        if angle != 0:
            robotPosition_new = self.robotPostion_new(robotPosition_new, angle)

        distance_vector = []
        # find vector from robot to debris
        for i in range(len(robotPosition_new)):
            result = debrisNumber[i] - robotPosition_new[i]
            distance_vector.append(result)

        result = 0
        # find distance between robot and debris
        for i in distance_vector:
            result += i * i
        distance = math.sqrt(result)

        return distance

    def robotPostion_new(self, robotPosition, angle):
        robotPosition_new = []

        # robotPosition is [x, z]; rotate it by `angle` degrees to get the new x- and z-coordinates
        rbPos_X = robotPosition[0] * math.cos(math.radians(
            angle)) - robotPosition[1] * math.sin(math.radians(angle))
        rbPos_Z = robotPosition[0] * math.sin(math.radians(
            angle)) + robotPosition[1] * math.cos(math.radians(angle))

        robotPosition_new.append(rbPos_X)
        robotPosition_new.append(rbPos_Z)

        return robotPosition_new

    # Getting closer to debris number 1 (2-6 following)
    def getting_closer_to_debris_1(self, state, action):
        return self.getting_closer_to_debris(state, action,
                                             self.obs.angle_robot_debris_1)

    # TODO: add 2-6 to feature list (line 33)
    def getting_closer_to_debris_2(self, state, action):
        return self.getting_closer_to_debris(state, action,
                                             self.obs.angle_robot_debris_2)

    def getting_closer_to_debris_3(self, state, action):
        return self.getting_closer_to_debris(state, action,
                                             self.obs.angle_robot_debris_3)

    def getting_closer_to_debris_4(self, state, action):
        return self.getting_closer_to_debris(state, action,
                                             self.obs.angle_robot_debris_4)

    def getting_closer_to_debris_5(self, state, action):
        return self.getting_closer_to_debris(state, action,
                                             self.obs.angle_robot_debris_5)

    def getting_closer_to_debris_6(self, state, action):
        return self.getting_closer_to_debris(state, action,
                                             self.obs.angle_robot_debris_6)

    # TODO Check if it reverses towards debris
    def getting_closer_to_debris(self, state, action, obs_num):
        action_list = action
        # Angle on each side of the robot's forward vector
        angle_range = 45  # TODO: Figure out the exact value
        angle_to_debris = self.get_obs(obs_num)

        getting_closer = 0

        # steer 1 = more negative, steer -1 = more positive
        # Move directly towards debris
        if action == 1:
            if action == 1:
                if -angle_range < (angle_to_debris +
                                   self.rotation_constant) < angle_range:
                    getting_closer = 1
            elif action == -1:
                if -angle_range < (angle_to_debris -
                                   self.rotation_constant) < angle_range:
                    getting_closer = 1
            elif action == 0:
                if -angle_range < angle_to_debris < angle_range:
                    getting_closer = 1

        return getting_closer

    # TODO Check if it reverses towards debris
    def getting_closer_to_dropzone(self, state, action):
        action_list = action
        angle_to_dropzone = self.get_obs(self.obs.angle_to_dropzone)
        getting_closer_to_dropzone = 0

        if action == 1:
            if action == 0:
                if -45 < angle_to_dropzone < 45:
                    getting_closer_to_dropzone = 1
            elif action == 1:
                if -45 < angle_to_dropzone + self.rotation_constant < 45:
                    getting_closer_to_dropzone = 1
            elif action == -1:
                if -45 < angle_to_dropzone - self.rotation_constant < 45:
                    getting_closer_to_dropzone = 1

        return getting_closer_to_dropzone

    def debris_to_dropzone_1(self, state, action):
        return self.debris_to_dropzone(state, action,
                                       self.obs.debris_to_dropzone_1)

    def debris_to_dropzone_2(self, state, action):
        return self.debris_to_dropzone(state, action,
                                       self.obs.debris_to_dropzone_2)

    def debris_to_dropzone_3(self, state, action):
        return self.debris_to_dropzone(state, action,
                                       self.obs.debris_to_dropzone_3)

    def debris_to_dropzone_4(self, state, action):
        return self.debris_to_dropzone(state, action,
                                       self.obs.debris_to_dropzone_4)

    def debris_to_dropzone_5(self, state, action):
        return self.debris_to_dropzone(state, action,
                                       self.obs.debris_to_dropzone_5)

    def debris_to_dropzone_6(self, state, action):
        return self.debris_to_dropzone(state, action,
                                       self.obs.debris_to_dropzone_6)

    def debris_to_dropzone(self, state, action, observation_index):
        closer_to_dropzone = 0  # the domain is either 1 or 0 (boolean-like)

        # For throttle/steer actions (1, 0, -1) the observation itself already
        # encodes whether the debris got closer to the dropzone.
        if action in (1, 0, -1) and self.get_obs(observation_index) == 1:
            closer_to_dropzone = self.get_obs(observation_index)

        return closer_to_dropzone

    def angle_to_debris_1(self, state, action):
        debris_1 = [self.obs.debris_1_position_x, self.obs.debris_1_position_z]
        return self.angle_to_debris(state, action, debris_1)

    def angle_to_debris_2(self, state, action):
        debris_2 = [self.obs.debris_2_position_x, self.obs.debris_2_position_z]
        return self.angle_to_debris(state, action, debris_2)

    def angle_to_debris_3(self, state, action):
        debris_3 = [self.obs.debris_3_position_x, self.obs.debris_3_position_z]
        return self.angle_to_debris(state, action, debris_3)

    def angle_to_debris_4(self, state, action):
        debris_4 = [self.obs.debris_4_position_x, self.obs.debris_4_position_z]
        return self.angle_to_debris(state, action, debris_4)

    def angle_to_debris_5(self, state, action):
        debris_5 = [self.obs.debris_5_position_x, self.obs.debris_5_position_z]
        return self.angle_to_debris(state, action, debris_5)

    def angle_to_debris_6(self, state, action):
        debris_6 = [self.obs.debris_6_position_x, self.obs.debris_6_position_z]
        return self.angle_to_debris(state, action, debris_6)

    def angle_to_debris(self, state, action, debrisNumber):
        transform_value = 1 / 360
        robotPosition = [self.obs.robot_position_x, self.obs.robot_position_z]
        angle_to_debris = 0

        if action == 0:  # standing still
            Angle = 0
            angle_to_debris = self.robotAngle_New(robotPosition, debrisNumber,
                                                  0, Angle)

        if action == 1:  # forward straight
            Angle = 0
            angle_to_debris = self.robotAngle_New(robotPosition, debrisNumber,
                                                  1, Angle)

        if action == 2:  # left with no throttling
            Angle = self.rotation_constant
            angle_to_debris = self.robotAngle_New(robotPosition, debrisNumber,
                                                  0, Angle)

        if action == 3:  # backward straight
            Angle = 0
            angle_to_debris = self.robotAngle_New(robotPosition, debrisNumber,
                                                  -1, Angle)

        if action == 4:  # right with no throttling
            Angle = -self.rotation_constant
            angle_to_debris = self.robotAngle_New(robotPosition, debrisNumber,
                                                  0, Angle)

        return angle_to_debris * transform_value

    def robotAngle_New(self, robotPosition, debrisNumber, throttle, angle):
        robotPosition_new = []

        # find the vector straight from the robot based on thrust
        for i in range(len(robotPosition)):
            if throttle == 1:
                robotPosition[i] += self.thrust_constant
                robotPosition_new.append(robotPosition[i])
            elif throttle == -1:
                robotPosition[i] -= self.thrust_constant
                robotPosition_new.append(robotPosition[i])
            elif throttle == 0:
                robotPosition_new.append(robotPosition[i])

        # find robot's new position by predicting
        robotPosition_new = self.robotPostion_new(robotPosition_new, angle)

        # find vector from robot's predicted position to a certain debris
        debrisVector = []
        for i in range(len(robotPosition_new)):
            result = debrisNumber[i] - robotPosition_new[i]
            debrisVector.append(result)

        # find the vector pointing forward from the robot's new position
        robotVector = []
        for i in range(len(robotPosition_new)):
            if throttle == 1:
                result = robotPosition_new[i] + self.thrust_constant
                robotVector.append(result)
            elif throttle == -1:
                result = robotPosition_new[i] + self.thrust_constant
                robotVector.append(result)
            elif throttle == 0:
                result = robotPosition_new[i] + self.thrust_constant
                robotVector.append(result)
                continue

        for i in range(len(robotVector)):
            robotVector[i] = robotVector[i] - robotPosition_new[i]

        # find angle from robot pointing forward to vector from robot to debris
        distance_vector = []
        for i in range(len(robotPosition_new)):
            result = debrisNumber[i] - robotPosition_new[i]
            distance_vector.append(result)

        # find angle between 'robotVector' and 'distance_vector'
        robotRotation_new = self.angle(robotVector, distance_vector)

        return robotRotation_new

    def dotproduct(self, v1, v2):  # find dot product of two vectors
        return sum((a * b) for a, b in zip(v1, v2))

    def length(self, v):  # find the length of a vector
        return math.sqrt(self.dotproduct(v, v))

    def angle(self, v1, v2):  # find the angle between two vectors
        return math.degrees(
            math.acos(
                self.dotproduct(v1, v2) / (self.length(v1) * self.length(v2))))

    # Rotation: 0 to 360
    def rotation(self, state, action):
        rotation = self.get_obs(self.obs.robot_rotation)
        new_rotation = 0

        # Transform rotation into a value between 0 and 1
        transform_value = 1 / 360

        if action == 0:
            return int(rotation) * transform_value
        elif action == 1:
            total_rotation = rotation + self.rotation_constant

            # Wrap rotations of 360 or more back around past zero
            if total_rotation >= 360:
                new_rotation = int(total_rotation - 360)
            else:
                new_rotation = int(total_rotation)
        elif action == -1:
            total_rotation = rotation - self.rotation_constant

            # Wrap negative rotations back around below 360
            if total_rotation < 0:
                new_rotation = int(360 - abs(total_rotation))
            else:
                new_rotation = int(total_rotation)

        return new_rotation * transform_value

    # Pointing / turning towards debris based on angle
    def pointed_towards_debris(self, state, action):
        pointing = 0

        if action == 1:
            if self.get_obs(self.obs.angle_robot_debris_1
                            ) + self.rotation_constant < 0:
                pointing = 1
        elif action == -1:
            if self.get_obs(self.obs.angle_robot_debris_1
                            ) - self.rotation_constant > 0:
                pointing = 1
        elif action == 0:
            if self.get_obs(self.obs.robot_facing_debris):
                pointing = 1

        return pointing
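A rough driver sketch for the Agent class; the build path is a placeholder and the observation module providing RobotObservations is assumed to be importable as in the original project:

agent = Agent()
agent.setup_connection_with_unity("builds/RobotScene")  # hypothetical build path
state = agent.get_state()
agent.perform_action(1)  # throttle forward
print(agent.get_reward(), agent.is_done())
agent.reset_simulation()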
Example #14
class MLAgentsEnv(VectorEnv):
    """
    An Environment sitting behind a tcp connection and communicating through this adapter.
    Note: Communication between Unity and Python takes place over an open socket without authentication.
    Ensure that the network where training takes place is secure.
    """
    def __init__(self, file_name=None, worker_id=0, base_port=5005, seed=0, docker_training=False, no_graphics=False,
                 timeout_wait=30, train_mode=True, **kwargs):
        """
        Args:
            file_name (Optional[str]): Name of Unity environment binary.
            base_port (int): Port number to connect to Unity environment. `worker_id` increments on top of this.
            worker_id (int): Number to add to `base_port`. Used for asynchronous agent scenarios.
            docker_training (bool): Informs this class, whether the process is being run within a container.
                Default: False.
            no_graphics (bool): Whether to run the Unity simulator in no-graphics mode. Default: False.
            timeout_wait (int): Time (in seconds) to wait for connection from environment.
            train_mode (bool): Whether to run in training mode, speeding up the simulation. Default: True.
        """
        # First create the UnityMLAgentsEnvironment to get state and action spaces, then create RLgraph Environment
        # instance.
        self.mlagents_env = UnityEnvironment(
            file_name, worker_id, base_port, seed, docker_training, no_graphics
        )
        all_brain_info = self.mlagents_env.reset()
        # Get all possible information from AllBrainInfo.
        # TODO: Which scene do we pick?
        self.scene_key = next(iter(all_brain_info))
        first_brain_info = all_brain_info[self.scene_key]
        num_environments = len(first_brain_info.agents)

        state_space = {}
        if len(first_brain_info.vector_observations[0]) > 0:
            state_space["vector"] = get_space_from_op(first_brain_info.vector_observations[0])
            # TODO: This is a hack.
            if state_space["vector"].dtype == np.float64:
                state_space["vector"].dtype = np.float32
        if len(first_brain_info.visual_observations) > 0:
            state_space["visual"] = get_space_from_op(first_brain_info.visual_observations[0])
        if first_brain_info.text_observations[0]:
            state_space["text"] = get_space_from_op(first_brain_info.text_observations[0])

        if len(state_space) == 1:
            self.state_key = next(iter(state_space))
            state_space = state_space[self.state_key]
        else:
            self.state_key = None
            state_space = Dict(state_space)
        brain_params = next(iter(self.mlagents_env.brains.values()))
        if brain_params.vector_action_space_type == "discrete":
            highs = brain_params.vector_action_space_size
            # MultiDiscrete (Tuple(IntBox)).
            if any(h != highs[0] for h in highs):
                action_space = Tuple([IntBox(h) for h in highs])
            # Normal IntBox:
            else:
                action_space = IntBox(
                    low=np.zeros_like(highs, dtype=np.int32),
                    high=np.array(highs, dtype=np.int32),
                    shape=(len(highs),)
                )
        else:
            action_space = get_space_from_op(first_brain_info.action_masks[0])
        if action_space.dtype == np.float64:
            action_space.dtype = np.float32

        super(MLAgentsEnv, self).__init__(
            num_environments=num_environments, state_space=state_space, action_space=action_space, **kwargs
        )

        # Caches the last observation we made (after stepping or resetting).
        self.last_state = None

    def get_env(self):
        return self

    def reset(self, index=0):
        # Reset the entire MLAgentsEnv if global_done is True or no state has been cached yet.
        if self.mlagents_env.global_done is True or self.last_state is None:
            self.reset_all()
        return self.last_state[index]

    def reset_all(self):
        all_brain_info = self.mlagents_env.reset()
        self.last_state = self._get_state_from_brain_info(all_brain_info)
        return self.last_state

    def step(self, actions, text_actions=None, **kwargs):
        # MLAgents Envs don't like tuple-actions.
        if isinstance(actions[0], tuple):
            actions = [list(a) for a in actions]
        all_brain_info = self.mlagents_env.step(
            # TODO: Only support vector actions for now.
            vector_action=actions, memory=None, text_action=text_actions, value=None
        )
        self.last_state = self._get_state_from_brain_info(all_brain_info)
        r = self._get_reward_from_brain_info(all_brain_info)
        t = self._get_terminal_from_brain_info(all_brain_info)
        return self.last_state, r, t, None

    def render(self):
        # TODO: If no_graphics is True, maybe user can render through this method manually?
        pass

    def terminate(self):
        self.mlagents_env.close()

    def terminate_all(self):
        return self.terminate()

    def __str__(self):
        return "MLAgentsEnv(port={}{})".format(
            self.mlagents_env.port, " [loaded]" if self.mlagents_env._loaded else ""
        )

    def _get_state_from_brain_info(self, all_brain_info):
        brain_info = all_brain_info[self.scene_key]
        if self.state_key is None:
            return {"vector": list(brain_info.vector_observations), "visual": list(brain_info.visual_observations),
                    "text": list(brain_info.text_observations)}
        elif self.state_key == "vector":
            return list(brain_info.vector_observations)
        elif self.state_key == "visual":
            return list(brain_info.visual_observations)
        elif self.state_key == "text":
            return list(brain_info.text_observations)

    def _get_reward_from_brain_info(self, all_brain_info):
        brain_info = all_brain_info[self.scene_key]
        return [np.array(r_, dtype=np.float32) for r_ in brain_info.rewards]

    def _get_terminal_from_brain_info(self, all_brain_info):
        brain_info = all_brain_info[self.scene_key]
        return brain_info.local_done
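A brief usage sketch for the vectorized adapter; num_environments and Space.sample() follow RLgraph's VectorEnv/Space conventions and the binary name is a placeholder:

vec_env = MLAgentsEnv(file_name="builds/3DBall", no_graphics=True)
states = vec_env.reset_all()
actions = [vec_env.action_space.sample() for _ in range(vec_env.num_environments)]
next_states, rewards, terminals, _ = vec_env.step(actions)
vec_env.terminate()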
Example #15
class TrainerController(object):
    def __init__(self, env_path, run_id, save_freq, curriculum_folder,
                 fast_simulation, load, train, worker_id, keep_checkpoints,
                 lesson, seed, docker_target_name, trainer_config_path,
                 no_graphics):
        """
        :param env_path: Location to the environment executable to be loaded.
        :param run_id: The sub-directory name for model and summary statistics
        :param save_freq: Frequency at which to save model
        :param curriculum_folder: Folder containing JSON curriculums for the
               environment.
        :param fast_simulation: Whether to run the game at training speed.
        :param load: Whether to load the model or randomly initialize.
        :param train: Whether to train model, or only run inference.
        :param worker_id: Number to add to communication port (5005).
               Used for multi-environment
        :param keep_checkpoints: How many model checkpoints to keep.
        :param lesson: Start learning from this lesson.
        :param seed: Random seed used for training.
        :param docker_target_name: Name of docker volume that will contain all
               data.
        :param trainer_config_path: Fully qualified path to location of trainer
               configuration file.
        :param no_graphics: Whether to run the Unity simulator in no-graphics
                            mode.
        """
        if env_path is not None:
            # Strip out executable extensions if passed
            env_path = (env_path.strip().replace('.app', '').replace(
                '.exe', '').replace('.x86_64', '').replace('.x86', ''))

        # Recognize and use docker volume if one is passed as an argument
        if not docker_target_name:
            self.docker_training = False
            self.trainer_config_path = trainer_config_path
            self.model_path = './models/{run_id}'.format(run_id=run_id)
            self.curriculum_folder = curriculum_folder
            self.summaries_dir = './summaries'
        else:
            self.docker_training = True
            self.trainer_config_path = \
                '/{docker_target_name}/{trainer_config_path}'.format(
                    docker_target_name=docker_target_name,
                    trainer_config_path = trainer_config_path)
            self.model_path = '/{docker_target_name}/models/{run_id}'.format(
                docker_target_name=docker_target_name, run_id=run_id)
            if env_path is not None:
                env_path = '/{docker_target_name}/{env_name}'.format(
                    docker_target_name=docker_target_name, env_name=env_path)
            if curriculum_folder is not None:
                self.curriculum_folder = \
                    '/{docker_target_name}/{curriculum_folder}'.format(
                        docker_target_name=docker_target_name,
                        curriculum_folder=curriculum_folder)

            self.summaries_dir = '/{docker_target_name}/summaries'.format(
                docker_target_name=docker_target_name)

        self.logger = logging.getLogger('mlagents.envs')
        self.run_id = run_id
        self.save_freq = save_freq
        self.lesson = lesson
        self.fast_simulation = fast_simulation
        self.load_model = load
        self.train_model = train
        self.worker_id = worker_id
        self.keep_checkpoints = keep_checkpoints
        self.trainers = {}
        self.seed = seed
        np.random.seed(self.seed)
        tf.set_random_seed(self.seed)
        self.env = UnityEnvironment(file_name=env_path,
                                    worker_id=self.worker_id,
                                    seed=self.seed,
                                    docker_training=self.docker_training,
                                    no_graphics=no_graphics)
        if env_path is None:
            self.env_name = 'editor_' + self.env.academy_name
        else:
            # Extract out name of environment
            self.env_name = os.path.basename(os.path.normpath(env_path))

        if curriculum_folder is None:
            self.meta_curriculum = None
        else:
            self.meta_curriculum = MetaCurriculum(self.curriculum_folder,
                                                  self.env._resetParameters)

        if self.meta_curriculum:
            for brain_name in self.meta_curriculum.brains_to_curriculums.keys(
            ):
                if brain_name not in self.env.external_brain_names:
                    raise MetaCurriculumError('One of the curriculums '
                                              'defined in ' +
                                              self.curriculum_folder + ' '
                                              'does not have a corresponding '
                                              'Brain. Check that the '
                                              'curriculum file has the same '
                                              'name as the Brain '
                                              'whose curriculum it defines.')

    def _get_measure_vals(self):
        if self.meta_curriculum:
            brain_names_to_measure_vals = {}
            for brain_name, curriculum \
                in self.meta_curriculum.brains_to_curriculums.items():
                if curriculum.measure == 'progress':
                    measure_val = (self.trainers[brain_name].get_step /
                                   self.trainers[brain_name].get_max_steps)
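                    # e.g. a trainer at step 250,000 out of max_steps 1,000,000
                    # gives a measure value of 0.25 for that brain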
                    brain_names_to_measure_vals[brain_name] = measure_val
                elif curriculum.measure == 'reward':
                    measure_val = np.mean(
                        self.trainers[brain_name].reward_buffer)
                    brain_names_to_measure_vals[brain_name] = measure_val
            return brain_names_to_measure_vals
        else:
            return None

    def _process_graph(self):
        nodes = []
        scopes = []
        for brain_name in self.trainers.keys():
            if self.trainers[brain_name].policy.graph_scope is not None:
                scope = self.trainers[brain_name].policy.graph_scope + '/'
                if scope == '/':
                    scope = ''
                scopes += [scope]
                if self.trainers[brain_name].parameters['trainer'] \
                        == 'imitation':
                    nodes += [scope + x for x in ['action']]
                else:
                    nodes += [
                        scope + x
                        for x in ['action', 'value_estimate', 'action_probs']
                    ]
                if self.trainers[brain_name].parameters['use_recurrent']:
                    nodes += [
                        scope + x for x in ['recurrent_out', 'memory_size']
                    ]
        if len(scopes) > 1:
            self.logger.info('List of available scopes :')
            for scope in scopes:
                self.logger.info('\t' + scope)
        self.logger.info('List of nodes to export :')
        for n in nodes:
            self.logger.info('\t' + n)
        return nodes

    def _save_model(self, sess, saver, steps=0):
        """
        Saves current model to checkpoint folder.
        :param sess: Current Tensorflow session.
        :param steps: Current number of steps in training process.
        :param saver: Tensorflow saver for session.
        """
        last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk'
        saver.save(sess, last_checkpoint)
        tf.train.write_graph(sess.graph_def,
                             self.model_path,
                             'raw_graph_def.pb',
                             as_text=False)
        self.logger.info('Saved Model')

    def _export_graph(self):
        """
        Exports latest saved model to .bytes format for Unity embedding.
        """
        target_nodes = ','.join(self._process_graph())
        ckpt = tf.train.get_checkpoint_state(self.model_path)
        freeze_graph.freeze_graph(
            input_graph=self.model_path + '/raw_graph_def.pb',
            input_binary=True,
            input_checkpoint=ckpt.model_checkpoint_path,
            output_node_names=target_nodes,
            output_graph=(self.model_path + '/' + self.env_name + '_' +
                          self.run_id + '.bytes'),
            clear_devices=True,
            initializer_nodes='',
            input_saver='',
            restore_op_name='save/restore_all',
            filename_tensor_name='save/Const:0')

    def _initialize_trainers(self, trainer_config, sess):
        trainer_parameters_dict = {}
        # TODO: This probably doesn't need to be reinitialized.
        self.trainers = {}
        for brain_name in self.env.external_brain_names:
            trainer_parameters = trainer_config['default'].copy()
            if len(self.env.external_brain_names) > 1:
                graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
                trainer_parameters['graph_scope'] = graph_scope
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id) + '_' + graph_scope)
            else:
                trainer_parameters['graph_scope'] = ''
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir, name=str(self.run_id))
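            # Per-brain overrides: with a hypothetical trainer config such as
            #   default: {trainer: ppo, max_steps: 5.0e5, ...}
            #   MyBrain: {max_steps: 1.0e6}
            # the brain-specific keys below replace the matching 'default'
            # entries; a string value (e.g. MyBrain: SomeOtherKey) is followed
            # as a reference until a dict of overrides is reached.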
            if brain_name in trainer_config:
                _brain_key = brain_name
                while not isinstance(trainer_config[_brain_key], dict):
                    _brain_key = trainer_config[_brain_key]
                for k in trainer_config[_brain_key]:
                    trainer_parameters[k] = trainer_config[_brain_key][k]
            trainer_parameters_dict[brain_name] = trainer_parameters.copy()
        for brain_name in self.env.external_brain_names:
            if trainer_parameters_dict[brain_name]['trainer'] == 'imitation':
                self.trainers[brain_name] = BehavioralCloningTrainer(
                    sess, self.env.brains[brain_name],
                    trainer_parameters_dict[brain_name], self.train_model,
                    self.seed, self.run_id)
            elif trainer_parameters_dict[brain_name]['trainer'] == 'ppo':

                ###############################################################################
                #######      The external brain becomes an internal brain here       ##########
                ###############################################################################

                self.trainers[brain_name] = PPOTrainer(
                    sess, self.env.brains[brain_name],
                    self.meta_curriculum.brains_to_curriculums[brain_name].
                    min_lesson_length if self.meta_curriculum else 0,
                    trainer_parameters_dict[brain_name], self.train_model,
                    self.seed, self.run_id)
            else:
                raise UnityEnvironmentException('The trainer config contains '
                                                'an unknown trainer type for '
                                                'brain {}'.format(brain_name))

    def _load_config(self):
        try:
            with open(self.trainer_config_path) as data_file:
                trainer_config = yaml.load(data_file, Loader=yaml.FullLoader)
                return trainer_config
        except IOError:
            raise UnityEnvironmentException(
                'Parameter file could not be found '
                'at {}.'.format(self.trainer_config_path))
        except UnicodeDecodeError:
            raise UnityEnvironmentException(
                'There was an error decoding '
                'Trainer Config from this path : {}'.format(
                    self.trainer_config_path))

    @staticmethod
    def _create_model_path(model_path):
        try:
            if not os.path.exists(model_path):
                os.makedirs(model_path)
        except Exception:
            raise UnityEnvironmentException(
                'The folder {} containing the '
                'generated model could not be '
                'accessed. Please make sure the '
                'permissions are set correctly.'.format(model_path))

    def _reset_env(self):
        """Resets the environment.

        Returns:
            A Data structure corresponding to the initial reset state of the
            environment.
        """
        if self.meta_curriculum is not None:
            return self.env.reset(config=self.meta_curriculum.get_config(),
                                  train_mode=self.fast_simulation)
        else:
            return self.env.reset(train_mode=self.fast_simulation)

    def start_learning(self):
        # TODO: Should be able to start learning at different lesson numbers
        # for each curriculum.
        if self.meta_curriculum is not None:
            self.meta_curriculum.set_all_curriculums_to_lesson_num(self.lesson)
        trainer_config = self._load_config()
        self._create_model_path(self.model_path)

        tf.reset_default_graph()

        # Prevent a single session from taking all GPU memory.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            self._initialize_trainers(trainer_config, sess)
            for _, t in self.trainers.items():
                self.logger.info(t)
            init = tf.global_variables_initializer()
            saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
            # Instantiate model parameters
            if self.load_model:
                self.logger.info('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(self.model_path)
                if ckpt is None:
                    raise UnityEnvironmentException(
                        'The model {0} could not be found. Make sure you '
                        'specified the right --run-id'.format(self.model_path))
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(init)
                #######################################################################################
                ##########    Modify here to partially load the model --> for the segmentation network    #####
                #######################################################################################

                print("modify here to partially load the model")

                # the weights from the segmentation network also have to be frozen

            global_step = 0  # This is only for saving the model
            curr_info = self._reset_env()
            if self.train_model:
                for brain_name, trainer in self.trainers.items():
                    trainer.write_tensorboard_text('Hyperparameters',
                                                   trainer.parameters)
            try:
                while any([t.get_step <= t.get_max_steps \
                           for k, t in self.trainers.items()]) \
                      or not self.train_model:
                    if self.meta_curriculum:
                        # Get the sizes of the reward buffers.
                        reward_buff_sizes = {k:len(t.reward_buffer) \
                                            for (k,t) in self.trainers.items()}
                        # Attempt to increment the lessons of the brains who
                        # were ready.
                        lessons_incremented = \
                            self.meta_curriculum.increment_lessons(
                                self._get_measure_vals(),
                                reward_buff_sizes=reward_buff_sizes)

                    # If any lessons were incremented or the environment is
                    # ready to be reset
                    if (self.meta_curriculum
                            and any(lessons_incremented.values())):
                        curr_info = self._reset_env()
                        for brain_name, trainer in self.trainers.items():
                            trainer.end_episode()
                        for brain_name, changed in lessons_incremented.items():
                            if changed:
                                self.trainers[brain_name].reward_buffer.clear()
                    elif self.env.global_done:
                        curr_info = self._reset_env()
                        for brain_name, trainer in self.trainers.items():
                            trainer.end_episode()

                    # Decide and take an action
                    take_action_vector, \
                    take_action_memories, \
                    take_action_text, \
                    take_action_value, \
                    take_action_outputs \
                        = {}, {}, {}, {}, {}
                    for brain_name, trainer in self.trainers.items():
                        (take_action_vector[brain_name],
                         take_action_memories[brain_name],
                         take_action_text[brain_name],
                         take_action_value[brain_name],
                         take_action_outputs[brain_name]) = \
                            trainer.take_action(curr_info)
                    new_info = self.env.step(vector_action=take_action_vector,
                                             memory=take_action_memories,
                                             text_action=take_action_text,
                                             value=take_action_value)
                    for brain_name, trainer in self.trainers.items():
                        trainer.add_experiences(
                            curr_info, new_info,
                            take_action_outputs[brain_name])
                        trainer.process_experiences(curr_info, new_info)
                        if trainer.is_ready_update() and self.train_model \
                                and trainer.get_step <= trainer.get_max_steps:
                            # Perform gradient descent with experience buffer
                            trainer.update_policy()
                        # Write training statistics to Tensorboard.
                        if self.meta_curriculum is not None:
                            trainer.write_summary(
                                global_step,
                                lesson_num=self.meta_curriculum.
                                brains_to_curriculums[brain_name].lesson_num)
                        else:
                            trainer.write_summary(global_step)
                        if self.train_model \
                                and trainer.get_step <= trainer.get_max_steps:
                            trainer.increment_step_and_update_last_reward()
                    global_step += 1
                    if global_step % self.save_freq == 0 and global_step != 0 \
                            and self.train_model:
                        # Save Tensorflow model
                        self._save_model(sess, steps=global_step, saver=saver)
                    curr_info = new_info
                # Final save Tensorflow model
                if global_step != 0 and self.train_model:
                    self._save_model(sess, steps=global_step, saver=saver)
            except KeyboardInterrupt:
                print(
                    '--------------------------Now saving model--------------'
                    '-----------')
                if self.train_model:
                    self.logger.info('Learning was interrupted. Please wait '
                                     'while the graph is generated.')
                    self._save_model(sess, steps=global_step, saver=saver)
                pass
        self.env.close()
        if self.train_model:
            self._export_graph()
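
# A minimal usage sketch (hypothetical: the controller class name and argument
# values are assumed, since the surrounding CLI wiring is not shown here):
#
#   tc = TrainerController(env_path='3DBall', run_id='run-0', save_freq=50000,
#                          curriculum_folder=None, fast_simulation=True,
#                          load=False, train=True, worker_id=0,
#                          keep_checkpoints=5, lesson=0, seed=0,
#                          docker_target_name='',
#                          trainer_config_path='trainer_config.yaml',
#                          no_graphics=False)
#   tc.start_learning()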
Ejemplo n.º 16
0
class ObstacleTowerEnv(gym.Env):
    ALLOWED_VERSIONS = ['3.1']

    def __init__(self,
                 environment_filename=None,
                 docker_training=False,
                 worker_id=0,
                 retro=True,
                 timeout_wait=30,
                 realtime_mode=False,
                 config=None,
                 greyscale=False):
        """
        Arguments:
          environment_filename: The file path to the Unity executable.  Does not require the extension.
          docker_training: Whether this is running within a docker environment and should use a virtual
            frame buffer (xvfb).
          worker_id: The index of the worker in the case where multiple environments are running.  Each
            environment reserves port (5005 + worker_id) for communication with the Unity executable.
          retro: Resize visual observation to 84x84 (int8) and flattens action space.
          timeout_wait: Time for python interface to wait for environment to connect.
          realtime_mode: Whether to render the environment window image and run environment at realtime.
        """
        self._env = UnityEnvironment(environment_filename,
                                     worker_id,
                                     docker_training=docker_training,
                                     timeout_wait=timeout_wait)

        split_name = self._env.academy_name.split('-v')
        if len(split_name) == 2 and split_name[0] == "ObstacleTower":
            self.name, self.version = split_name
        else:
            raise UnityGymException(
                "Attempting to launch non-Obstacle Tower environment")

        if self.version not in self.ALLOWED_VERSIONS:
            raise UnityGymException(
                "Invalid Obstacle Tower version.  Your build is v" +
                self.version +
                " but only the following versions are compatible with this gym: "
                + str(self.ALLOWED_VERSIONS))

        self.visual_obs = None
        self._current_state = None
        self._n_agents = None
        self._flattener = None
        self._greyscale = greyscale

        # Environment reset parameters
        self._seed = None
        self._floor = None

        self.realtime_mode = realtime_mode
        self.game_over = False  # Hidden flag used by Atari environments to determine if the game is over
        self.retro = retro
        self.config = config

        flatten_branched = self.retro
        uint8_visual = self.retro

        # Check brain configuration
        if len(self._env.brains) != 1:
            raise UnityGymException(
                "There can only be one brain in a UnityEnvironment "
                "if it is wrapped in a gym.")
        self.brain_name = self._env.external_brain_names[0]
        brain = self._env.brains[self.brain_name]

        if brain.number_visual_observations == 0:
            raise UnityGymException(
                "Environment provides no visual observations.")

        self.uint8_visual = uint8_visual

        if brain.number_visual_observations > 1:
            logger.warning(
                "The environment contains more than one visual observation. "
                "Please note that only the first will be provided in the observation."
            )

        # Check for number of agents in scene.
        initial_info = self._env.reset(
            train_mode=not self.realtime_mode)[self.brain_name]
        self._check_agents(len(initial_info.agents))

        # Set observation and action spaces
        if len(brain.vector_action_space_size) == 1:
            self._action_space = spaces.Discrete(
                brain.vector_action_space_size[0])
        else:
            if flatten_branched:
                self._flattener = ActionFlattener(
                    brain.vector_action_space_size)
                self._action_space = self._flattener.action_space
            else:
                self._action_space = spaces.MultiDiscrete(
                    brain.vector_action_space_size)

        high = np.array([np.inf] * brain.vector_observation_space_size)
        self.action_meanings = brain.vector_action_descriptions

        if self._greyscale:
            depth = 1
        else:
            depth = 3
        image_space_max = 1.0
        image_space_dtype = np.float32
        camera_height = brain.camera_resolutions[0]["height"]
        camera_width = brain.camera_resolutions[0]["width"]
        if self.retro:
            image_space_max = 255
            image_space_dtype = np.uint8
            camera_height = 84
            camera_width = 84

        image_space = spaces.Box(0,
                                 image_space_max,
                                 dtype=image_space_dtype,
                                 shape=(camera_height, camera_width, depth))
        if self.retro:
            self._observation_space = image_space
        else:
            max_float = np.finfo(np.float32).max
            keys_space = spaces.Discrete(5)
            time_remaining_space = spaces.Box(low=0.0,
                                              high=max_float,
                                              shape=(1, ),
                                              dtype=np.float32)
            floor_space = spaces.Discrete(9999)
            self._observation_space = spaces.Tuple(
                (image_space, keys_space, time_remaining_space, floor_space))

    def reset(self, config=None):
        """Resets the state of the environment and returns an initial observation.
        In the case of multi-agent environments, this is a list.
        Returns: observation (object/list): the initial observation of the
            space.
        """
        if config is None:
            reset_params = {}
            if self.config is not None:
                reset_params = self.config
        else:
            reset_params = config
        if self._floor is not None:
            reset_params['starting-floor'] = self._floor
        if self._seed is not None:
            reset_params['tower-seed'] = self._seed

        self.reset_params = self._env.reset_parameters
        info = self._env.reset(
            config=reset_params,
            train_mode=not self.realtime_mode)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self.game_over = False

        obs, reward, done, info = self._single_step(info)
        return obs

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        In the case of multi-agent environments, these are lists.
        Args:
            action (object/list): an action provided by the environment
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list) : amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information, including BrainInfo.
        """

        # If the action space was flattened, translate the single discrete
        # action index back into its branched (multi-discrete) form.
        if self._flattener is not None:
            action = self._flattener.lookup_action(action)

        info = self._env.step(action)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self._current_state = info

        obs, reward, done, info = self._single_step(info)
        self.game_over = done

        return obs, reward, done, info

    def _single_step(self, info):
        self.visual_obs = self._preprocess_single(
            info.visual_observations[0][0][:, :, :])

        self.visual_obs, keys, time, current_floor = self._prepare_tuple_observation(
            self.visual_obs, info.vector_observations[0])

        if self.retro:
            self.visual_obs = self._resize_observation(self.visual_obs)
            self.visual_obs = self._add_stats_to_image(
                self.visual_obs, info.vector_observations[0])
            default_observation = self.visual_obs
        else:
            default_observation = self.visual_obs, keys, time, current_floor

        if self._greyscale:
            default_observation = self._greyscale_obs(default_observation)

        return default_observation, info.rewards[0], info.local_done[0], {
            "text_observation": info.text_observations[0],
            "brain_info": info,
            "total_keys": keys,
            "time_remaining": time,
            "current_floor": current_floor
        }

    def _greyscale_obs(self, obs):
        new_obs = np.floor(np.expand_dims(np.mean(obs, axis=2),
                                          axis=2)).astype(np.uint8)
        return new_obs

    def _preprocess_single(self, single_visual_obs):
        if self.uint8_visual:
            return (255.0 * single_visual_obs).astype(np.uint8)
        else:
            return single_visual_obs

    def render(self, mode='rgb_array'):
        return self.visual_obs

    def close(self):
        """Override _close in your subclass to perform any necessary cleanup.
        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        self._env.close()

    def get_action_meanings(self):
        return self.action_meanings

    def seed(self, seed=None):
        """Sets a fixed seed for this env's random number generator(s).
        The valid range for seeds is [0, 99999). By default a random seed
        will be chosen.
        """
        if seed is None:
            self._seed = seed
            return

        seed = int(seed)
        if seed < 0 or seed >= 99999:
            logger.warning(
                "Seed outside of valid range [0, 99999). A random seed "
                "within the valid range will be used on next reset.")
        logger.warning("New seed " + str(seed) + " will apply on next reset.")
        self._seed = seed

    def floor(self, floor=None):
        """Sets the starting floor to a fixed floor number on subsequent environment
        resets."""
        if floor is None:
            self._floor = floor
            return

        floor = int(floor)
        if floor < 0 or floor > 99:
            logger.warning(
                "Starting floor outside of valid range [0, 99]. Floor 0 will "
                "be used on next reset.")
        logger.warning("New starting floor " + str(floor) +
                       " will apply on next reset.")
        self._floor = floor

    @staticmethod
    def _resize_observation(observation):
        """
        Re-sizes visual observation to 84x84
        """
        obs_image = Image.fromarray(observation)
        obs_image = obs_image.resize((84, 84), Image.NEAREST)
        return np.array(obs_image)

    @staticmethod
    def _prepare_tuple_observation(vis_obs, vector_obs):
        """
        Converts separate visual and vector observation into prepared tuple
        """
        key = vector_obs[0:6]
        time = vector_obs[6]
        floor_number = vector_obs[7]
        key_num = np.argmax(key, axis=0)
        return vis_obs, key_num, time, floor_number

    @staticmethod
    def _add_stats_to_image(vis_obs, vector_obs):
        """
        Displays time left and number of keys on visual observation
        """
        key = vector_obs[0:6]
        time = vector_obs[6]
        key_num = int(np.argmax(key, axis=0))
        time_num = min(time, 10000) / 10000

        vis_obs[0:10, :, :] = 0
        for i in range(key_num):
            start = int(i * 16.8) + 4
            end = start + 10
            vis_obs[1:5, start:end, 0:2] = 255
        vis_obs[6:10, 0:int(time_num * 84), 1] = 255
        return vis_obs

    def _check_agents(self, n_agents):
        if n_agents > 1:
            raise UnityGymException(
                "The environment was launched as a single-agent environment, "
                "however there is more than one agent in the scene.")
        if self._n_agents is None:
            self._n_agents = n_agents
            logger.info("{} agents within environment.".format(n_agents))
        elif self._n_agents != n_agents:
            raise UnityGymException(
                "The number of agents in the environment has changed since "
                "initialization. This is not supported.")

    @property
    def metadata(self):
        return {'render.modes': ['rgb_array']}

    @property
    def reward_range(self):
        return -float('inf'), float('inf')

    @property
    def spec(self):
        return None

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def number_agents(self):
        return self._n_agents
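
# A minimal usage sketch (the build path is assumed, not part of the original
# example):
#
#   env = ObstacleTowerEnv('./ObstacleTower/obstacletower', retro=True)
#   obs = env.reset()
#   for _ in range(100):
#       obs, reward, done, info = env.step(env.action_space.sample())
#       if done:
#           obs = env.reset()
#   env.close()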
Ejemplo n.º 17
0
class Main(object):
    train_mode = True
    _agent_class = Agent

    def __init__(self, argv):
        self._now = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))

        self.config, self.reset_config, replay_config, sac_config, model_root_path = self._init_config(
            argv)
        self._init_env(replay_config, sac_config, model_root_path)
        self._run()

    def _init_config(self, argv):
        config = dict()

        with open(
                f'{Path(__file__).resolve().parent}/default_config.yaml') as f:
            default_config_file = yaml.load(f, Loader=yaml.FullLoader)
            config = default_config_file

        # define command line arguments
        try:
            opts, args = getopt.getopt(argv, 'rc:n:b:p:', [
                'run', 'config=', 'name=', 'build=', 'port=', 'logger_file=',
                'seed=', 'sac=', 'agents='
            ])
        except getopt.GetoptError:
            raise Exception('ARGS ERROR')

        # initialize config from config.yaml
        for opt, arg in opts:
            if opt in ('-c', '--config'):
                with open(arg) as f:
                    config_file = yaml.load(f, Loader=yaml.FullLoader)
                    for k, v in config_file.items():
                        assert k in config.keys(), f'{k} is invalid'
                        if v is not None:
                            for kk, vv in v.items():
                                assert kk in config[k].keys(
                                ), f'{kk} is invalid in {k}'
                                config[k][kk] = vv
                break

        config['base_config']['build_path'] = config['base_config'][
            'build_path'][sys.platform]

        # initialize config from command line arguments
        logger_file = None
        for opt, arg in opts:
            if opt in ('-r', '--run'):
                self.train_mode = False
            elif opt in ('-n', '--name'):
                config['base_config']['name'] = arg
            elif opt in ('-b', '--build'):
                config['base_config']['build_path'] = arg
            elif opt in ('-p', '--port'):
                config['base_config']['port'] = int(arg)
            elif opt == '--logger_file':
                logger_file = arg
            elif opt == '--seed':
                config['sac_config']['seed'] = int(arg)
            elif opt == '--sac':
                config['base_config']['sac'] = arg
            elif opt == '--agents':
                config['reset_config']['copy'] = int(arg)

        # logger config
        _log = logging.getLogger()
        _log.setLevel(logging.INFO)
        # remove default root logger handler
        _log.handlers = []

        # create stream handler
        sh = logging.StreamHandler()
        sh.setLevel(logging.INFO)

        # add handler and formatter to logger
        sh.setFormatter(
            logging.Formatter('[%(levelname)s] - [%(name)s] - %(message)s'))
        _log.addHandler(sh)

        _log = logging.getLogger('tensorflow')
        _log.setLevel(level=logging.ERROR)

        self.logger = logging.getLogger('sac')
        self.logger.setLevel(level=logging.INFO)

        if logger_file is not None:
            # create file handler
            fh = logging.handlers.RotatingFileHandler(logger_file,
                                                      maxBytes=1024 * 100,
                                                      backupCount=5)
            fh.setLevel(logging.INFO)

            # add handler and formatter to logger
            fh.setFormatter(
                logging.Formatter(
                    '%(asctime)-15s [%(levelname)s] - [%(name)s] - %(message)s'
                ))
            self.logger.addHandler(fh)

        config['base_config']['name'] = config['base_config']['name'].replace(
            '{time}', self._now)
        model_root_path = f'models/{config["base_config"]["name"]}'

        # save config
        if self.train_mode:
            if not os.path.exists(model_root_path):
                os.makedirs(model_root_path)
            with open(f'{model_root_path}/config.yaml', 'w') as f:
                yaml.dump(config, f, default_flow_style=False)

        # display config
        config_str = ''
        for k, v in config.items():
            config_str += f'\n{k}'
            for kk, vv in v.items():
                config_str += f'\n{kk:>25}: {vv}'
        self.logger.info(config_str)

        return (config['base_config'], config['reset_config'],
                config['replay_config'], config['sac_config'], model_root_path)

    def _init_env(self, replay_config, sac_config, model_root_path):
        if self.config['build_path'] is None or self.config['build_path'] == '':
            self.env = UnityEnvironment()
        else:
            self.env = UnityEnvironment(file_name=self.config['build_path'],
                                        no_graphics=self.train_mode,
                                        base_port=self.config['port'],
                                        args=['--scene', self.config['scene']])

        self.default_brain_name = self.env.brain_names[0]

        brain_params = self.env.brains[self.default_brain_name]
        state_dim = brain_params.vector_observation_space_size * brain_params.num_stacked_vector_observations
        action_dim = brain_params.vector_action_space_size[0]

        custom_sac_model = importlib.import_module(self.config['sac'])
        shutil.copyfile(f'{self.config["sac"]}.py',
                        f'{model_root_path}/{self.config["sac"]}.py')

        self.sac = SAC_Base(state_dim=state_dim,
                            action_dim=action_dim,
                            model_root_path=model_root_path,
                            model=custom_sac_model,
                            train_mode=self.train_mode,
                            use_rnn=self.config['use_rnn'],
                            replay_config=replay_config,
                            burn_in_step=self.config['burn_in_step'],
                            n_step=self.config['n_step'],
                            **sac_config)

    def _run(self):
        brain_info = self.env.reset(
            train_mode=self.train_mode,
            config=self.reset_config)[self.default_brain_name]
        if self.config['use_rnn']:
            initial_rnn_state = self.sac.get_initial_rnn_state(
                len(brain_info.agents))
            rnn_state = initial_rnn_state

        for iteration in range(self.config['max_iter'] + 1):
            if self.config['reset_on_iteration']:
                brain_info = self.env.reset(
                    train_mode=self.train_mode)[self.default_brain_name]
                if self.config['use_rnn']:
                    rnn_state = initial_rnn_state

            agents = [
                self._agent_class(i,
                                  tran_len=self.config['burn_in_step'] +
                                  self.config['n_step'],
                                  stagger=self.config['stagger'],
                                  use_rnn=self.config['use_rnn'])
                for i in brain_info.agents
            ]
            """
            s0    s1    s2    s3    s4    s5    s6
             └──burn_in_step───┘     └───n_step──┘
             └───────────deque_maxlen────────────┘
                         s2    s3    s4    s5    s6    s7    s8
             └─────┘
             stagger
            """

            states = brain_info.vector_observations
            step = 0

            while False in [a.done for a in agents]:
                if self.config['use_rnn']:
                    actions, next_rnn_state = self.sac.choose_rnn_action(
                        states.astype(np.float32), rnn_state)
                    next_rnn_state = next_rnn_state.numpy()
                else:
                    actions = self.sac.choose_action(states.astype(np.float32))

                actions = actions.numpy()

                brain_info = self.env.step({self.default_brain_name:
                                            actions})[self.default_brain_name]

                states_ = brain_info.vector_observations
                if step == self.config['max_step']:
                    brain_info.local_done = [True] * len(brain_info.agents)
                    brain_info.max_reached = [True] * len(brain_info.agents)

                trans_list = [
                    agents[i].add_transition(
                        states[i], actions[i], brain_info.rewards[i],
                        brain_info.local_done[i], brain_info.max_reached[i],
                        states_[i],
                        rnn_state[i] if self.config['use_rnn'] else None)
                    for i in range(len(agents))
                ]

                trans_list = [t for t in trans_list if t is not None]
                if len(trans_list) != 0:
                    # n_states, n_actions, n_rewards, done, rnn_state
                    trans = [
                        np.concatenate(t, axis=0) for t in zip(*trans_list)
                    ]

                    if self.train_mode:
                        self.sac.fill_replay_buffer(*trans)
                        self.sac.train()

                states = states_
                if self.config['use_rnn']:
                    rnn_state = next_rnn_state
                    rnn_state[brain_info.local_done] = initial_rnn_state[
                        brain_info.local_done]

                step += 1

            if self.train_mode:
                self._log_episode_summaries(iteration, agents)

                if iteration % self.config['save_model_per_iter'] == 0:
                    self.sac.save_model(iteration)

            self._log_episode_info(iteration, agents)

        self.env.close()

    def _log_episode_summaries(self, iteration, agents):
        rewards = np.array([a.reward for a in agents])
        self.sac.write_constant_summaries([{
            'tag': 'reward/mean',
            'simple_value': rewards.mean()
        }, {
            'tag': 'reward/max',
            'simple_value': rewards.max()
        }, {
            'tag': 'reward/min',
            'simple_value': rewards.min()
        }], iteration)

    def _log_episode_info(self, iteration, agents):
        rewards = [a.reward for a in agents]
        rewards_sorted = ", ".join([f"{i:.1f}" for i in sorted(rewards)])
        self.logger.info(f'iter {iteration}, rewards {rewards_sorted}')
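
# Hypothetical entry point (not included in the original excerpt):
#
#   if __name__ == '__main__':
#       Main(sys.argv[1:])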
Ejemplo n.º 18
0
import torch.nn.functional as F
import torch.optim as optim
from collections import namedtuple, deque
import random
import numpy as np
import matplotlib.pyplot as plt
from torch.distributions import Normal, Categorical
from mlagents.envs.environment import UnityEnvironment
#from DDPG import MADDPG
from DDPG import DDPG

#Transition = namedtuple('Transition',['state', 'action', 'reward', 'a_log_prob', 'next_state'])
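# env_name is not defined in this truncated snippet; a placeholder build path
# is assumed here purely for illustration.
env_name = "Reacher"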
env = UnityEnvironment(file_name=env_name, worker_id=1, seed=1)
default_brain = env.brain_names[0]
brain = env.brains[default_brain]
env_info = env.reset(train_mode=True)[default_brain]
max_step = 1000
#maddpg = MADDPG()
ddpg = DDPG()
rewards = []

for eps in range(1):
    env_info = env.reset(train_mode=True)[default_brain]
    done = False
    eps_reward = 0
    state = env_info.vector_observations
    #state = torch.from_numpy(state).float()
    score = 0
    #running_reward = -1000

    while True:
Ejemplo n.º 19
0
def main():

    env_path = 'AutoBenchExecutable/AutoBenchExecutable'
    # env_path = None
    curriculum_file = 'config/curricula/autobench/AutoBenchBrain.json'
    no_graphics = False  # True if you want Unity Environment to train in background
    max_step = 1e10  # total training step

    # Set True: 100x time scale, small window, 10 agents
    # Set False: 1x time scale, big window, 1 agent with an observation camera
    fast_simulation = False
    benchmark = False
    benchmark_episode = 1000

    # Setup the Unity Environment
    env_config = get_env_config(curriculum_file)
    env = UnityEnvironment(
        file_name=env_path,
        no_graphics=no_graphics,
        camera_res_overwrite=extract_camera_config(curriculum_file))
    brain_name = env.brain_names[0]  # Get brain_name, assume only have 1 brain

    curr_info = env.reset(config=env_config,
                          train_mode=fast_simulation)[brain_name]
    agent_size = len(curr_info.agents)

    BenchmarkManager(agent_amount=agent_size,
                     benchmark_episode=benchmark_episode,
                     success_threshold=env_config['goal_reward'] +
                     env_config['time_penalty'],
                     verbose=False)

    last_update_time = time.perf_counter()

    ### Standard RL training loop
    for global_step in range(int(max_step)):

        # Implement your own decide algorithm
        action = decide(curr_info)
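        # (decide() is defined elsewhere in the original script; a trivial
        # stand-in could return random continuous actions, e.g.
        # np.random.uniform(-1, 1, (agent_size, action_dim)) with an assumed
        # action_dim.)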

        # Send the action into the Unity Environment and get back new_info (type: BrainInfo)
        # See brain.py for the detailed data structure (commented there)
        new_info = env.step(vector_action={brain_name: action},
                            memory={brain_name: None},
                            text_action={brain_name: None})[brain_name]
        if benchmark:
            BenchmarkManager.add_result(new_info)
            if BenchmarkManager.is_complete():
                BenchmarkManager.analyze()
                break

        # Calculate and print the training speed
        if global_step % 100 == 0:
            print("Steps:{:,}".format(global_step), " ||  Speed:",
                  format(100 / (time.perf_counter() - last_update_time), ".2f"))
            last_update_time = time.perf_counter()

        # Assign new_info to curr_info for next timestep training
        curr_info = new_info

    env.close()
Ejemplo n.º 20
0
class Learner(object):
    _training_lock = threading.Lock()

    def __init__(self, argv, agent_class=Agent):
        self._agent_class = agent_class
        self._now = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))

        (self.config, self.net_config, self.reset_config, _, sac_config,
         model_root_path) = self._init_config(argv)

        self.replay_base_url = f'http://{self.net_config["replay_host"]}:{self.net_config["replay_port"]}'

        self._init_env(sac_config, model_root_path)
        self._run()

    def _init_config(self, argv):
        config = dict()

        with open(
                f'{Path(__file__).resolve().parent}/default_config.yaml') as f:
            default_config_file = yaml.load(f, Loader=yaml.FullLoader)
            config = default_config_file

        # define command line arguments
        try:
            opts, args = getopt.getopt(argv, 'c:', [
                'run', 'config=', 'build_path=', 'build_port=', 'logger_file=',
                'sac=', 'agents='
            ])
        except getopt.GetoptError:
            raise Exception('ARGS ERROR')

        # initialize config from config.yaml
        for opt, arg in opts:
            if opt in ('-c', '--config'):
                with open(arg) as f:
                    config_file = yaml.load(f, Loader=yaml.FullLoader)
                    for k, v in config_file.items():
                        assert k in config.keys(), f'{k} is invalid'
                        if v is not None:
                            for kk, vv in v.items():
                                assert kk in config[k].keys(
                                ), f'{kk} is invalid in {k}'
                                config[k][kk] = vv
                break

        config['base_config']['build_path'] = config['base_config'][
            'build_path'][sys.platform]

        logger_file = None
        for opt, arg in opts:
            if opt == '--run':
                self._train_mode = False
            elif opt == '--build_path':
                config['base_config']['build_path'] = arg
            elif opt == '--build_port':
                config['base_config']['build_port'] = int(arg)
            elif opt == '--logger_file':
                logger_file = arg
            elif opt == '--sac':
                config['base_config']['sac'] = arg
            elif opt == '--agents':
                config['reset_config']['copy'] = int(arg)

        # logger config
        _log = logging.getLogger()
        _log.setLevel(logging.INFO)
        # remove default root logger handler
        _log.handlers = []

        # create stream handler
        sh = logging.StreamHandler()
        sh.setLevel(logging.INFO)

        # add handler and formatter to logger
        sh.setFormatter(
            logging.Formatter('[%(levelname)s] - [%(name)s] - %(message)s'))
        _log.addHandler(sh)

        _log = logging.getLogger('werkzeug')
        _log.setLevel(level=logging.ERROR)

        self.logger = logging.getLogger('sac.ds.learner')
        self.logger.setLevel(level=logging.INFO)

        if logger_file is not None:
            # create file handler
            fh = logging.handlers.RotatingFileHandler(logger_file,
                                                      maxBytes=1024 * 100,
                                                      backupCount=5)
            fh.setLevel(logging.INFO)

            # add handler and formatter to logger
            fh.setFormatter(
                logging.Formatter(
                    '%(asctime)-15s [%(levelname)s] - [%(name)s] - %(message)s'
                ))
            self.logger.addHandler(fh)

        config['base_config']['name'] = config['base_config']['name'].replace(
            '{time}', self._now)
        model_root_path = f'models/{config["base_config"]["name"]}'

        # save config
        if not os.path.exists(model_root_path):
            os.makedirs(model_root_path)
        with open(f'{model_root_path}/config.yaml', 'w') as f:
            yaml.dump(config, f, default_flow_style=False)

        # display config
        config_str = ''
        for k, v in config.items():
            config_str += f'\n{k}'
            for kk, vv in v.items():
                config_str += f'\n{kk:>25}: {vv}'
        self.logger.info(config_str)

        return (config['base_config'], config['net_config'],
                config['reset_config'], config['replay_config'],
                config['sac_config'], model_root_path)

    def _init_env(self, sac_config, model_root_path):
        if self.config['build_path'] is None or self.config['build_path'] == '':
            self.env = UnityEnvironment()
        else:
            self.env = UnityEnvironment(file_name=self.config['build_path'],
                                        no_graphics=True,
                                        base_port=self.config['build_port'],
                                        args=['--scene', self.config['scene']])

        self.logger.info(f'{self.config["build_path"]} initialized')

        self.default_brain_name = self.env.brain_names[0]

        brain_params = self.env.brains[self.default_brain_name]
        state_dim = brain_params.vector_observation_space_size * brain_params.num_stacked_vector_observations
        action_dim = brain_params.vector_action_space_size[0]

        custom_sac_model = importlib.import_module(self.config['sac'])
        shutil.copyfile(f'{self.config["sac"]}.py',
                        f'{model_root_path}/{self.config["sac"]}.py')

        self.sac = SAC_DS_Base(state_dim=state_dim,
                               action_dim=action_dim,
                               model_root_path=model_root_path,
                               model=custom_sac_model,
                               use_rnn=self.config['use_rnn'],
                               burn_in_step=self.config['burn_in_step'],
                               n_step=self.config['n_step'],
                               **sac_config)

    def _start_policy_evaluation(self):
        iteration = 0
        start_time = time.time()

        brain_info = self.env.reset(
            train_mode=False,
            config=self.reset_config)[self.default_brain_name]
        if self.config['use_rnn']:
            initial_rnn_state = self.sac.get_initial_rnn_state(
                len(brain_info.agents))
            rnn_state = initial_rnn_state

        while True:
            if self.config['reset_on_iteration']:
                brain_info = self.env.reset(
                    train_mode=False)[self.default_brain_name]

            agents = [
                self._agent_class(i,
                                  tran_len=self.config['burn_in_step'] +
                                  self.config['n_step'],
                                  stagger=self.config['stagger'],
                                  use_rnn=self.config['use_rnn'])
                for i in brain_info.agents
            ]

            states = brain_info.vector_observations
            step = 0

            while False in [a.done for a in agents]:
                with self._training_lock:
                    if self.config['use_rnn']:
                        actions, next_rnn_state = self.sac.choose_rnn_action(
                            states.astype(np.float32), rnn_state)
                        next_rnn_state = next_rnn_state.numpy()
                    else:
                        actions = self.sac.choose_action(
                            states.astype(np.float32))

                actions = actions.numpy()

                brain_info = self.env.step({self.default_brain_name:
                                            actions})[self.default_brain_name]

                states_ = brain_info.vector_observations
                if step == self.config['max_step']:
                    brain_info.local_done = [True] * len(brain_info.agents)
                    brain_info.max_reached = [True] * len(brain_info.agents)

                for i, agent in enumerate(agents):
                    agent.add_transition(
                        states[i], actions[i], brain_info.rewards[i],
                        brain_info.local_done[i], brain_info.max_reached[i],
                        states_[i],
                        rnn_state[i] if self.config['use_rnn'] else None)

                states = states_
                if self.config['use_rnn']:
                    rnn_state = next_rnn_state
                    rnn_state[brain_info.local_done] = initial_rnn_state[
                        brain_info.local_done]

                step += 1

            with self._training_lock:
                self._log_episode_info(iteration, start_time, agents)

            iteration += 1

    def _log_episode_info(self, iteration, start_time, agents):
        rewards = np.array([a.reward for a in agents])

        self.sac.write_constant_summaries([{
            'tag': 'reward/mean',
            'simple_value': rewards.mean()
        }, {
            'tag': 'reward/max',
            'simple_value': rewards.max()
        }, {
            'tag': 'reward/min',
            'simple_value': rewards.min()
        }], iteration)

        time_elapse = (time.time() - start_time) / 60
        rewards_sorted = ", ".join([f"{i:.1f}" for i in sorted(rewards)])
        self.logger.info(
            f'{iteration}, {time_elapse:.2f}min, rewards {rewards_sorted}')

    def _run_learner_server(self):
        app = Flask('learner')

        @app.route('/get_policy_variables')
        def get_policy_variables():
            with self._training_lock:
                variables = self.sac.get_policy_variables()

            return jsonify(variables)

        @app.route('/get_td_errors', methods=['POST'])
        def get_td_errors():
            trans = request.get_json()
            trans = [np.array(t, dtype=np.float32) for t in trans]

            with self._training_lock:
                td_errors = self.sac.get_td_error(*trans)

            return jsonify(td_errors.numpy().flatten().tolist())

        app.run(host='0.0.0.0', port=self.net_config['learner_port'])
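
        # Actor processes can then pull the latest policy weights over HTTP,
        # e.g. (host/port names assumed for illustration):
        #   requests.get(f'http://{learner_host}:{learner_port}'
        #                '/get_policy_variables').json()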

    def _get_sampled_data(self):
        while True:
            try:
                r = requests.get(f'{self.replay_base_url}/sample')
            except requests.ConnectionError:
                self.logger.error(f'get_sampled_data connecting error')
                time.sleep(1)
            except Exception as e:
                self.logger.error(
                    f'get_sampled_data error {type(e)}, {str(e)}')
                time.sleep(1)
            else:
                break
        return r.json()

    def _update_td_errors(self, pointers, td_errors):
        while True:
            try:
                requests.post(f'{self.replay_base_url}/update',
                              json={
                                  'pointers': pointers,
                                  'td_errors': td_errors
                              })
            except requests.ConnectionError:
                self.logger.error(f'update_td_errors connecting error')
                time.sleep(1)
            except Exception as e:
                self.logger.error(
                    f'update_td_errors error {type(e)}, {str(e)}')
                time.sleep(1)
            else:
                break

    def _update_transitions(self, pointers, index, data):
        while True:
            try:
                requests.post(f'{self.replay_base_url}/update_transitions',
                              json={
                                  'pointers': pointers,
                                  'index': index,
                                  'data': data
                              })
            except requests.ConnectionError:
                self.logger.error(f'_update_transitions connecting error')
                time.sleep(1)
            except Exception as e:
                self.logger.error(
                    f'update_transitions error {type(e)}, {str(e)}')
                time.sleep(1)
            else:
                break

    def _clear_replay_buffer(self):
        while True:
            try:
                requests.get(f'{self.replay_base_url}/clear')
            except requests.ConnectionError:
                self.logger.error(f'clear_replay_buffer connecting error')
                time.sleep(1)
            except Exception as e:
                self.logger.error(
                    f'clear_replay_buffer error {type(e)}, {str(e)}')
                time.sleep(1)
            else:
                break

    def _run_training_client(self):
        # asyncio.run(self._websocket_server.send_to_all('aaa'))
        self._clear_replay_buffer()

        while True:
            data = self._get_sampled_data()

            if data:
                pointers = data['pointers']
                trans = [np.array(t, dtype=np.float32) for t in data['trans']]
                priority_is = np.array(data['priority_is'], dtype=np.float32)

                if self.config['use_rnn']:
                    n_states, n_actions, n_rewards, state_, done, mu_n_probs, rnn_state = trans
                else:
                    n_states, n_actions, n_rewards, state_, done, mu_n_probs = trans

                with self._training_lock:
                    td_errors, pi_n_probs = self.sac.train(
                        n_states, n_actions, n_rewards, state_, done,
                        mu_n_probs, priority_is,
                        rnn_state if self.config['use_rnn'] else None)

                self._update_td_errors(pointers, td_errors.tolist())
                if self.config['use_rnn']:
                    self._update_transitions(pointers, 5, pi_n_probs.tolist())
            else:
                self.logger.warning('no data sampled')
                time.sleep(1)

    def _run(self):
        # TODO
        self._websocket_server = WebsocketServer(
            {}, self.net_config['websocket_port'])

        t_learner = threading.Thread(target=self._run_learner_server)
        t_training = threading.Thread(target=self._run_training_client)
        t_evaluation = threading.Thread(target=self._start_policy_evaluation)

        t_learner.start()
        t_training.start()
        t_evaluation.start()

        asyncio.get_event_loop().run_forever()
Ejemplo n.º 21
0
LEARNING_RATE = 0.005
DISCOUNT_RATE = 0.95
EXPLORATION_RATE = 1
EXPLORATION_RATE_DECAY = 1
TARGET_NETWORK_UPDATE_INTERVAL = 100
REPLAY_MEMORY_SIZE = 100
MINIBATCH_SIZE = 10

# progress tracking and saving
COLLECT_DATA = True
RANDOM_STATES = 10
CHECKPOINT_EPOCHS = 5

# initialize simulation
env = UnityEnvironment(ENVIRONMENT)
bi = env.reset()
BRAIN_NAME = env.external_brain_names[0]
brain_parameters = env.external_brains[BRAIN_NAME]
STATE_SPACE_SIZE = brain_parameters.vector_observation_space_size
ACTION_SPACE_SIZE = brain_parameters.vector_action_space_size[0]

# sample states
random_states = utils.sample_states(env, BRAIN_NAME, ACTION_SPACE_SIZE,
                                    RANDOM_STATES)

# initialize policy network, target network, and optimizer
qnet = NETWORK(STATE_SPACE_SIZE, ACTION_SPACE_SIZE)
qnet.load_state_dict(torch.load("qnet_parameters.pt"))
optimizer = torch.optim.SGD(qnet.parameters(), LEARNING_RATE)
tnet = NETWORK(STATE_SPACE_SIZE, ACTION_SPACE_SIZE)
tnet.load_state_dict(qnet.state_dict())
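
# A hedged sketch (not part of the original script): in a standard DQN-style
# training loop, TARGET_NETWORK_UPDATE_INTERVAL would typically trigger a hard
# re-sync of the target network with the policy network, e.g.:
#
#     if step % TARGET_NETWORK_UPDATE_INTERVAL == 0:
#         tnet.load_state_dict(qnet.state_dict())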
Ejemplo n.º 22
0
# env_name = "../envs/GridWorld"  # Name of the Unity environment binary to launch
env_name = None  # to use the Unity editor
train_mode = True  # Whether to run the environment in training or inference mode
engine_configuration_channel = EngineConfigurationChannel()
# env = UnityEnvironment(base_port = 5006, file_name=env_name, side_channels = [engine_configuration_channel])
env = UnityEnvironment(base_port=5004,
                       file_name=env_name,
                       side_channels=[engine_configuration_channel])
#%%

# Reset the environment
env.reset()

# Set the default brain to work with
group_name = env.get_agent_groups()[0]
group_spec = env.get_agent_group_spec(group_name)

# Set the time scale of the engine
engine_configuration_channel.set_configuration_parameters(time_scale=3.0)

'''Examine the observation and state spaces

We can reset the environment to obtain an initial set of observations and states for all the agents within it. In ML-Agents, states refer to a vector of variables corresponding to relevant aspects of the environment for an agent, while observations refer to a set of relevant pixel-wise visuals for an agent.
'''
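
# A minimal sketch (not in the original snippet), assuming the low-level API shown
# above: after the reset, the group spec and the batched step result can be used
# to inspect the observation space described in the note above.
step_result = env.get_step_result(group_name)
print('Observation shapes:', group_spec.observation_shapes)
print('Number of agents:', step_result.n_agents())
for obs in step_result.obs:
    # Each entry is a batched array of shape (n_agents,) + observation_shape.
    print('Batched observation shape:', obs.shape)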
Ejemplo n.º 23
0
            if i == 1:
                if j == 0:
                    A_[i][j] = 1
                    A_[j][i] = 1
                if 1 + n < j < 2 + 2 * n:
                    A_[i][j] = 1
                    A_[j][i] = 1
                if 1 + 3 * n < j < 2 + 4 * n:
                    A_[i][j] = 1
                    A_[j][i] = 1
    for i in range(2 + 4 * n):
        A_[i][i] = 1
    for _ in range(NUM_AGENT):
        A.append(preprocess_adj(A_))


    return np.array(A), np.array(X)



if __name__ == '__main__':
    env = UnityEnvironment()
    obs = env.reset(train_mode=True)
    brain_name = env.brain_names[0]
    obs = obs[brain_name].vector_observations
    a, x = preprocess_observation_n(obs)
    print(a)
    print(x)
    env.close()

Ejemplo n.º 24
0
class Actor(object):
    _train_mode = True
    _websocket_connected = False

    def __init__(self, argv, agent_class=Agent):
        self._agent_class = agent_class

        self.config, net_config, self.reset_config = self._init_config(argv)
        self.replay_base_url = f'http://{net_config["replay_host"]}:{net_config["replay_port"]}'
        self.learner_base_url = f'http://{net_config["learner_host"]}:{net_config["learner_port"]}'
        self.websocket_base_url = f'ws://{net_config["websocket_host"]}:{net_config["websocket_port"]}'

        self._init_websocket_client()
        self._init_env()
        self._trans_cache = TransCache()
        self._run()

    def _init_config(self, argv):
        config = dict()

        with open(
                f'{Path(__file__).resolve().parent}/default_config.yaml') as f:
            default_config_file = yaml.load(f, Loader=yaml.FullLoader)
            config = default_config_file

        # parse command line arguments
        try:
            opts, args = getopt.getopt(argv, 'c:', [
                'config=', 'run', 'build_path=', 'build_port=', 'logger_file=',
                'sac=', 'agents='
            ])
        except getopt.GetoptError:
            raise Exception('ARGS ERROR')

        # override defaults with the config file passed via -c/--config
        for opt, arg in opts:
            if opt in ('-c', '--config'):
                with open(arg) as f:
                    config_file = yaml.load(f, Loader=yaml.FullLoader)
                    for k, v in config_file.items():
                        assert k in config, f'{k} is invalid'
                        if v is not None:
                            for kk, vv in v.items():
                                assert kk in config[k], f'{kk} is invalid in {k}'
                                config[k][kk] = vv
                break

        config['base_config']['build_path'] = config['base_config'][
            'build_path'][sys.platform]

        logger_file = None
        for opt, arg in opts:
            if opt == '--run':
                self._train_mode = False
            elif opt == '--build_path':
                config['base_config']['build_path'] = arg
            elif opt == '--build_port':
                config['base_config']['build_port'] = int(arg)
            elif opt == '--logger_file':
                logger_file = arg
            elif opt == '--sac':
                config['base_config']['sac'] = arg
            elif opt == '--agents':
                config['reset_config']['copy'] = int(arg)

        # logger config
        _log = logging.getLogger()
        _log.setLevel(logging.INFO)
        # remove default root logger handler
        _log.handlers = []

        # create stream handler
        sh = logging.StreamHandler()
        sh.setLevel(logging.INFO)

        # add handler and formatter to logger
        sh.setFormatter(
            logging.Formatter('[%(levelname)s] - [%(name)s] - %(message)s'))
        _log.addHandler(sh)

        _log = logging.getLogger('tensorflow')
        _log.setLevel(level=logging.ERROR)

        self.logger = logging.getLogger('sac.ds.actor')
        self.logger.setLevel(level=logging.INFO)

        if logger_file is not None:
            # create file handler
            fh = logging.handlers.RotatingFileHandler(logger_file,
                                                      maxBytes=1024 * 100,
                                                      backupCount=5)
            fh.setLevel(logging.INFO)

            # add handler and formatter to logger
            fh.setFormatter(
                logging.Formatter(
                    '%(asctime)-15s [%(levelname)s] - [%(name)s] - %(message)s'
                ))
            self.logger.addHandler(fh)

        # display config
        config_str = ''
        for k, v in config.items():
            config_str += f'\n{k}'
            for kk, vv in v.items():
                config_str += f'\n{kk:>25}: {vv}'
        self.logger.info(config_str)

        return (config['base_config'], config['net_config'],
                config['reset_config'])

    def _init_websocket_client(self):
        loop = asyncio.get_event_loop()
        loop.run_in_executor(None,
                             lambda: asyncio.run(self._connect_websocket()))

    async def _connect_websocket(self):
        while True:
            try:
                async with websockets.connect(
                        self.websocket_base_url) as websocket:
                    await websocket.send(json.dumps({'cmd': 'actor'}))
                    self.logger.info('websocket connected')
                    while True:
                        try:
                            raw_message = await websocket.recv()
                            message = json.loads(raw_message)
                            if message['cmd'] == 'reset':
                                self._websocket_connected = True
                                self.config = dict(self.config,
                                                   **message['config'])
                                self.logger.info(
                                    f'reinitialize config: {message["config"]}'
                                )
                        except websockets.ConnectionClosed:
                            self.logger.error('websocket connection closed')
                            break
                        except json.JSONDecodeError:
                            self.logger.error(
                                f'websocket json decode error, {raw_message}')
            except (ConnectionRefusedError, websockets.InvalidMessage):
                self.logger.error('websocket connection failed')
                time.sleep(1)
            except Exception as e:
                self.logger.error(
                    f'websocket connecting error {type(e)}, {str(e)}')
                time.sleep(1)
            finally:
                self._websocket_connected = False

    def _init_env(self):
        if self.config['build_path'] is None or self.config['build_path'] == '':
            self.env = UnityEnvironment()
        else:
            self.env = UnityEnvironment(file_name=self.config['build_path'],
                                        no_graphics=self._train_mode,
                                        base_port=self.config['build_port'],
                                        args=['--scene', self.config['scene']])

        self.logger.info(f'{self.config["build_path"]} initialized')

        self.default_brain_name = self.env.brain_names[0]

    def _init_sac(self):
        brain_params = self.env.brains[self.default_brain_name]
        state_dim = brain_params.vector_observation_space_size * brain_params.num_stacked_vector_observations
        action_dim = brain_params.vector_action_space_size[0]

        custom_sac_model = importlib.import_module(self.config['sac'])

        self.sac_actor = SAC_DS_Base(state_dim=state_dim,
                                     action_dim=action_dim,
                                     model_root_path=None,
                                     model=custom_sac_model,
                                     use_rnn=self.config['use_rnn'])

        self.logger.info('actor initialized')

    def _update_policy_variables(self):
        while self._websocket_connected:
            try:
                r = requests.get(
                    f'{self.learner_base_url}/get_policy_variables')

                new_variables = r.json()
                self.sac_actor.update_policy_variables(new_variables)
            except requests.ConnectionError:
                self.logger.error('update_policy_variables connection error')
                time.sleep(1)
            except Exception as e:
                self.logger.error(
                    f'update_policy_variables error {type(e)}, {str(e)}')
                break
            else:
                break

    def _add_trans(self, *trans):
        # n_states, n_actions, n_rewards, state_, done, mu_n_probs, rnn_state
        self._trans_cache.add(*trans)

        if self._trans_cache.size > self.config['add_trans_threshold']:
            trans = self._trans_cache.get_trans_list_and_clear()
            while self._websocket_connected:
                try:
                    requests.post(f'{self.replay_base_url}/add', json=trans)
                except requests.ConnectionError:
                    self.logger.error('add_trans connection error')
                    time.sleep(1)
                except Exception as e:
                    self.logger.error(f'add_trans error {type(e)}, {str(e)}')
                    break
                else:
                    break

    def _run(self):
        iteration = 0

        while True:
            # learner is offline, waiting...
            if not self._websocket_connected:
                iteration = 0
                time.sleep(1)
                continue

            # learner is online, reset all settings
            if iteration == 0 and self._websocket_connected:
                self._trans_cache.clear()
                self._init_sac()

                brain_info = self.env.reset(
                    train_mode=self._train_mode,
                    config=self.reset_config)[self.default_brain_name]
                if self.config['use_rnn']:
                    initial_rnn_state = self.sac_actor.get_initial_rnn_state(
                        len(brain_info.agents))
                    rnn_state = initial_rnn_state

            if self.config['reset_on_iteration']:
                brain_info = self.env.reset(
                    train_mode=self._train_mode)[self.default_brain_name]
                if self.config['use_rnn']:
                    rnn_state = initial_rnn_state

            agents = [
                self._agent_class(i,
                                  tran_len=self.config['burn_in_step'] +
                                  self.config['n_step'],
                                  stagger=self.config['stagger'],
                                  use_rnn=self.config['use_rnn'])
                for i in brain_info.agents
            ]

            states = brain_info.vector_observations
            step = 0

            if self.config['update_policy_variables_per_step'] == -1:
                self._update_policy_variables()

            while not all(a.done for a in agents) and self._websocket_connected:
                update_interval = self.config['update_policy_variables_per_step']
                if update_interval != -1 and step % update_interval == 0:
                    self._update_policy_variables()

                if self.config['use_rnn']:
                    actions, next_rnn_state = self.sac_actor.choose_rnn_action(
                        states, rnn_state)
                    next_rnn_state = next_rnn_state.numpy()
                else:
                    actions = self.sac_actor.choose_action(states)

                actions = actions.numpy()

                brain_info = self.env.step({self.default_brain_name:
                                            actions})[self.default_brain_name]

                states_ = brain_info.vector_observations
                if step == self.config['max_step']:
                    brain_info.local_done = [True] * len(brain_info.agents)
                    brain_info.max_reached = [True] * len(brain_info.agents)

                trans_list = [
                    agents[i].add_transition(
                        states[i], actions[i], brain_info.rewards[i],
                        brain_info.local_done[i], brain_info.max_reached[i],
                        states_[i],
                        rnn_state[i] if self.config['use_rnn'] else None)
                    for i in range(len(agents))
                ]

                trans_list = [t for t in trans_list if t is not None]
                if len(trans_list) != 0:
                    # n_states, n_actions, n_rewards, state_, done, rnn_state
                    trans = [
                        np.concatenate(t, axis=0) for t in zip(*trans_list)
                    ]

                    if self.config['use_rnn']:
                        n_states, n_actions, n_rewards, state_, done, rnn_state = trans
                        # TODO: only need [:, burn_in_step:, :]
                        mu_n_probs = self.sac_actor.get_n_step_probs(
                            n_states, n_actions, rnn_state).numpy()
                        self._add_trans(n_states, n_actions, n_rewards, state_,
                                        done, mu_n_probs, rnn_state)
                    else:
                        n_states, n_actions, n_rewards, state_, done = trans
                        mu_n_probs = self.sac_actor.get_n_step_probs(
                            n_states, n_actions).numpy()
                        self._add_trans(n_states, n_actions, n_rewards, state_,
                                        done, mu_n_probs)

                states = states_
                if self.config['use_rnn']:
                    rnn_state = next_rnn_state
                    rnn_state[brain_info.local_done] = initial_rnn_state[
                        brain_info.local_done]

                step += 1

            self._log_episode_info(iteration, agents)
            iteration += 1

    def _log_episode_info(self, iteration, agents):
        rewards = [a.reward for a in agents]
        rewards_sorted = ", ".join([f"{i:.1f}" for i in sorted(rewards)])
        self.logger.info(f'{iteration}, rewards {rewards_sorted}')
Ejemplo n.º 25
0
class TrainerController(object):
    def __init__(self, env_path, run_id, save_freq, curriculum_folder,
                 fast_simulation, load, train, worker_id, keep_checkpoints,
                 lesson, seed, docker_target_name, trainer_config_path,
                 no_graphics, camera_res_overwrite, benchmark,
                 benchmark_episode, success_threshold, benchmark_verbose):
        """
        :param env_path: Location to the environment executable to be loaded.
        :param run_id: The sub-directory name for model and summary statistics
        :param save_freq: Frequency at which to save model
        :param curriculum_folder: Folder containing JSON curriculums for the
               environment.
        :param fast_simulation: Whether to run the game at training speed.
        :param load: Whether to load the model or randomly initialize.
        :param train: Whether to train model, or only run inference.
        :param worker_id: Number to add to communication port (5005).
               Used for multi-environment
        :param keep_checkpoints: How many model checkpoints to keep.
        :param lesson: Start learning from this lesson.
        :param seed: Random seed used for training.
        :param docker_target_name: Name of docker volume that will contain all
               data.
        :param trainer_config_path: Fully qualified path to location of trainer
               configuration file.
        :param no_graphics: Whether to run the Unity simulator in no-graphics
                            mode.
        :param camera_res_overwrite: Camera resolution override passed to the
               Unity environment.
        :param benchmark: Whether to collect benchmark results via the
               BenchmarkManager.
        :param benchmark_episode: Episode count passed to the BenchmarkManager.
        :param success_threshold: Success threshold passed to the
               BenchmarkManager.
        :param benchmark_verbose: Verbosity flag passed to the BenchmarkManager.
        """
        if env_path is not None:
            # Strip out executable extensions if passed
            env_path = (env_path.strip().replace('.app', '').replace(
                '.exe', '').replace('.x86_64', '').replace('.x86', ''))

        # Recognize and use docker volume if one is passed as an argument
        if not docker_target_name:
            self.docker_training = False
            self.trainer_config_path = trainer_config_path
            self.model_path = './models/{run_id}'.format(run_id=run_id)
            self.curriculum_folder = curriculum_folder
            self.summaries_dir = './summaries'
        else:
            self.docker_training = True
            self.trainer_config_path = \
                '/{docker_target_name}/{trainer_config_path}'.format(
                    docker_target_name=docker_target_name,
                    trainer_config_path=trainer_config_path)
            self.model_path = '/{docker_target_name}/models/{run_id}'.format(
                docker_target_name=docker_target_name, run_id=run_id)
            if env_path is not None:
                """
                Comments for future maintenance:
                    Some OS/VM instances (e.g. COS GCP Image) mount filesystems 
                    with COS flag which prevents execution of the Unity scene, 
                    to get around this, we will copy the executable into the 
                    container.
                """
                # Navigate in docker path and find env_path and copy it.
                env_path = self._prepare_for_docker_run(
                    docker_target_name, env_path)
            if curriculum_folder is not None:
                self.curriculum_folder = \
                    '/{docker_target_name}/{curriculum_folder}'.format(
                        docker_target_name=docker_target_name,
                        curriculum_folder=curriculum_folder)

            self.summaries_dir = '/{docker_target_name}/summaries'.format(
                docker_target_name=docker_target_name)

        self.logger = logging.getLogger('mlagents.envs')
        self.run_id = run_id
        self.save_freq = save_freq
        self.lesson = lesson
        self.fast_simulation = fast_simulation
        self.load_model = load
        self.train_model = train
        self.worker_id = worker_id
        self.keep_checkpoints = keep_checkpoints
        self.trainers = {}
        self.seed = seed
        self.benchmark = benchmark
        self.global_step = 0
        self.benchmark_episode = benchmark_episode
        self.success_threshold = success_threshold
        self.benchmark_verbose = benchmark_verbose
        np.random.seed(self.seed)
        tf.set_random_seed(self.seed)
        self.env = UnityEnvironment(file_name=env_path,
                                    worker_id=self.worker_id,
                                    seed=self.seed,
                                    docker_training=self.docker_training,
                                    no_graphics=no_graphics,
                                    camera_res_overwrite=camera_res_overwrite)
        # Get brain_name; assume there is only one brain
        self.brain_name = self.env.brain_names[0]

        if env_path is None:
            self.env_name = 'editor_' + self.env.academy_name
        else:
            # Extract out name of environment
            self.env_name = os.path.basename(os.path.normpath(env_path))

        if curriculum_folder is None:
            self.meta_curriculum = None
        else:
            self.meta_curriculum = MetaCurriculum(self.curriculum_folder,
                                                  self.env._resetParameters)

        if self.meta_curriculum:
            for brain_name in self.meta_curriculum.brains_to_curriculums:
                if brain_name not in self.env.external_brain_names:
                    raise MetaCurriculumError('One of the curriculums '
                                              'defined in ' +
                                              self.curriculum_folder + ' '
                                              'does not have a corresponding '
                                              'Brain. Check that the '
                                              'curriculum file has the same '
                                              'name as the Brain '
                                              'whose curriculum it defines.')

    def _prepare_for_docker_run(self, docker_target_name, env_path):
        for f in glob.glob('/{docker_target_name}/*'.format(
                docker_target_name=docker_target_name)):
            if env_path in f:
                try:
                    b = os.path.basename(f)
                    if os.path.isdir(f):
                        shutil.copytree(f, '/ml-agents/{b}'.format(b=b))
                    else:
                        src_f = '/{docker_target_name}/{b}'.format(
                            docker_target_name=docker_target_name, b=b)
                        dst_f = '/ml-agents/{b}'.format(b=b)
                        shutil.copyfile(src_f, dst_f)
                        os.chmod(dst_f, 0o775)  # Make executable
                except Exception as e:
                    self.logger.info(e)
        env_path = '/ml-agents/{env_name}'.format(env_name=env_path)
        return env_path

    def _get_measure_vals(self):
        if self.meta_curriculum:
            brain_names_to_measure_vals = {}
            for brain_name, curriculum \
                in self.meta_curriculum.brains_to_curriculums.items():
                if curriculum.measure == 'progress':
                    measure_val = (self.trainers[brain_name].get_step /
                                   self.trainers[brain_name].get_max_steps)
                    brain_names_to_measure_vals[brain_name] = measure_val
                elif curriculum.measure == 'reward':
                    measure_val = np.mean(
                        self.trainers[brain_name].reward_buffer)
                    brain_names_to_measure_vals[brain_name] = measure_val
            return brain_names_to_measure_vals
        else:
            return None

    def _save_model(self, steps=0):
        """
        Saves current model to checkpoint folder.
        :param steps: Current number of steps in training process.
        :param saver: Tensorflow saver for session.
        """
        for brain_name in self.trainers.keys():
            self.trainers[brain_name].save_model()
        self.logger.info('Saved Model')

    def _save_model_when_interrupted(self, steps=0):
        self.logger.info('Learning was interrupted. Please wait '
                         'while the graph is generated.')
        self._save_model(steps)

    def _win_handler(self, event):
        """
        This function gets triggered after ctrl-c or ctrl-break is pressed
        under Windows platform.
        """
        if event in (win32con.CTRL_C_EVENT, win32con.CTRL_BREAK_EVENT):
            self._save_model_when_interrupted(self.global_step)
            self._export_graph()
            sys.exit()
            return True
        return False

    def _export_graph(self):
        """
        Exports latest saved models to .bytes format for Unity embedding.
        """
        for brain_name in self.trainers.keys():
            self.trainers[brain_name].export_model()

    def _initialize_trainers(self, trainer_config):
        """
        Initialization of the trainers
        :param trainer_config: The configurations of the trainers
        """
        trainer_parameters_dict = {}
        for brain_name in self.env.external_brain_names:
            trainer_parameters = trainer_config['default'].copy()
            trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                basedir=self.summaries_dir,
                name=str(self.run_id) + '_' + brain_name)
            trainer_parameters['model_path'] = '{basedir}/{name}'.format(
                basedir=self.model_path, name=brain_name)
            trainer_parameters['keep_checkpoints'] = self.keep_checkpoints
            if brain_name in trainer_config:
                _brain_key = brain_name
                while not isinstance(trainer_config[_brain_key], dict):
                    _brain_key = trainer_config[_brain_key]
                for k in trainer_config[_brain_key]:
                    trainer_parameters[k] = trainer_config[_brain_key][k]
            trainer_parameters_dict[brain_name] = trainer_parameters.copy()
        for brain_name in self.env.external_brain_names:
            if trainer_parameters_dict[brain_name]['trainer'] == 'offline_bc':
                self.trainers[brain_name] = OfflineBCTrainer(
                    self.env.brains[brain_name],
                    trainer_parameters_dict[brain_name], self.train_model,
                    self.load_model, self.seed, self.run_id)
            elif trainer_parameters_dict[brain_name]['trainer'] == 'online_bc':
                self.trainers[brain_name] = OnlineBCTrainer(
                    self.env.brains[brain_name],
                    trainer_parameters_dict[brain_name], self.train_model,
                    self.load_model, self.seed, self.run_id)
            elif trainer_parameters_dict[brain_name]['trainer'] == 'ppo':
                self.trainers[brain_name] = PPOTrainer(
                    self.env.brains[brain_name],
                    self.meta_curriculum.brains_to_curriculums[brain_name].
                    min_lesson_length if self.meta_curriculum else 0,
                    trainer_parameters_dict[brain_name], self.train_model,
                    self.load_model, self.seed, self.run_id)
            else:
                raise UnityEnvironmentException('The trainer config contains '
                                                'an unknown trainer type for '
                                                'brain {}'.format(brain_name))

    def _load_config(self):
        try:
            with open(self.trainer_config_path) as data_file:
                trainer_config = yaml.load(data_file, Loader=yaml.FullLoader)
                return trainer_config
        except IOError:
            raise UnityEnvironmentException(
                'Parameter file could not be found '
                'at {}.'.format(self.trainer_config_path))
        except UnicodeDecodeError:
            raise UnityEnvironmentException(
                'There was an error decoding '
                'Trainer Config from this path : {}'.format(
                    self.trainer_config_path))

    @staticmethod
    def _create_model_path(model_path):
        try:
            if not os.path.exists(model_path):
                os.makedirs(model_path)
        except Exception:
            raise UnityEnvironmentException(
                'The folder {} containing the '
                'generated model could not be '
                'accessed. Please make sure the '
                'permissions are set correctly.'.format(model_path))

    def _reset_env(self):
        """Resets the environment.

        Returns:
            A Data structure corresponding to the initial reset state of the
            environment.
        """
        if self.meta_curriculum is not None:
            return self.env.reset(config=self.meta_curriculum.get_config(),
                                  train_mode=self.fast_simulation)
        else:
            return self.env.reset(train_mode=self.fast_simulation)

    ### Main Learning Loop
    def start_learning(self):
        # TODO: Should be able to start learning at different lesson numbers
        # for each curriculum.
        if self.meta_curriculum is not None:
            self.meta_curriculum.set_all_curriculums_to_lesson_num(self.lesson)
        trainer_config = self._load_config()
        self._create_model_path(self.model_path)

        tf.reset_default_graph()

        # Prevent a single session from taking all GPU memory.
        self._initialize_trainers(trainer_config)
        for _, t in self.trainers.items():
            self.logger.info(t)
        curr_info = self._reset_env()
        BenchmarkManager(len(curr_info[self.brain_name].agents),
                         self.benchmark_episode, self.success_threshold,
                         self.benchmark_verbose)

        if self.train_model:
            for brain_name, trainer in self.trainers.items():
                trainer.write_tensorboard_text('Hyperparameters',
                                               trainer.parameters)
            if sys.platform.startswith('win'):
                # Add the _win_handler function to the windows console's handler function list
                win32api.SetConsoleCtrlHandler(self._win_handler, True)
        last_update_time = time.perf_counter()
        try:
            while any([t.get_step <= t.get_max_steps \
                       for k, t in self.trainers.items()]) \
                  or not self.train_model:
                if self.meta_curriculum:
                    # Get the sizes of the reward buffers.
                    reward_buff_sizes = {k:len(t.reward_buffer) \
                                        for (k,t) in self.trainers.items()}
                    # Attempt to increment the lessons of the brains who
                    # were ready.
                    lessons_incremented = \
                        self.meta_curriculum.increment_lessons(
                            self._get_measure_vals(),
                            reward_buff_sizes=reward_buff_sizes)

                # If any lessons were incremented or the environment is
                # ready to be reset
                if (self.meta_curriculum
                        and any(lessons_incremented.values())):
                    curr_info = self._reset_env()
                    for brain_name, trainer in self.trainers.items():
                        trainer.end_episode()
                    for brain_name, changed in lessons_incremented.items():
                        if changed:
                            self.trainers[brain_name].reward_buffer.clear()
                elif self.env.global_done:
                    curr_info = self._reset_env()
                    for brain_name, trainer in self.trainers.items():
                        trainer.end_episode()

                # Decide and take an action
                take_action_vector, \
                take_action_memories, \
                take_action_text, \
                take_action_value, \
                take_action_outputs \
                    = {}, {}, {}, {}, {}
                for brain_name, trainer in self.trainers.items():
                    (take_action_vector[brain_name],
                     take_action_memories[brain_name],
                     take_action_text[brain_name],
                     take_action_value[brain_name],
                     take_action_outputs[brain_name]) = \
                        trainer.take_action(curr_info)

                # Send the action to the Unity environment and receive new_info: AllBrainInfo
                # See brain.py, where the detailed data structure is commented
                # AllBrainInfo type = {str: BrainInfo}
                new_info = self.env.step(vector_action=take_action_vector,
                                         memory=take_action_memories,
                                         text_action=take_action_text,
                                         value=take_action_value)

                if self.benchmark:
                    BenchmarkManager.add_result(new_info[self.brain_name])
                    if BenchmarkManager.is_complete():
                        BenchmarkManager.analyze()
                        break

                for brain_name, trainer in self.trainers.items():
                    # Calculate and print the training speed (steps per second)
                    if trainer.get_step % 100 == 0 and trainer.is_training:
                        print(
                            'Steps:{:,}'.format(trainer.get_step),
                            ' || Speed:',
                            format(100 / (time.perf_counter() - last_update_time),
                                   '.2f'))
                        last_update_time = time.perf_counter()
                    trainer.add_experiences(curr_info, new_info,
                                            take_action_outputs[brain_name])
                    trainer.process_experiences(curr_info, new_info)

                    if trainer.is_ready_update() and self.train_model \
                            and trainer.get_step <= trainer.get_max_steps:
                        learn_timer = time.time()
                        print('Start Learning...')
                        # Perform gradient descent with experience buffer
                        trainer.update_policy()
                        print('Learning Time:', time.time() - learn_timer)

                    # Write training statistics to Tensorboard.
                    if self.meta_curriculum is not None:
                        trainer.write_summary(
                            self.global_step,
                            lesson_num=self.meta_curriculum.
                            brains_to_curriculums[brain_name].lesson_num)
                    else:
                        trainer.write_summary(self.global_step)
                    if self.train_model \
                            and trainer.get_step <= trainer.get_max_steps:
                        trainer.increment_step_and_update_last_reward()
                self.global_step += 1
                if self.global_step % self.save_freq == 0 and self.global_step != 0 \
                        and self.train_model:
                    # Save Tensorflow model
                    self._save_model(steps=self.global_step)
                curr_info = new_info
            # Final save Tensorflow model
            if self.global_step != 0 and self.train_model:
                self._save_model(steps=self.global_step)
        except KeyboardInterrupt:
            if self.train_model:
                self._save_model_when_interrupted(steps=self.global_step)
        self.env.close()
        if self.train_model:
            self._export_graph()