Example #1
from unittest import mock

import numpy as np
import pytest

from unityagents import UnityEnvironment, UnityActionException, BrainInfo


# dummy_start, dummy_reset and dummy_step are canned socket payloads that the
# original test module defines elsewhere; they are assumed to be in scope here.
def test_step():
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                mock_glob.return_value = ['FakeLaunchPath']
                mock_socket.return_value.accept.return_value = (mock_socket, 0)
                mock_socket.recv.return_value.decode.return_value = dummy_start
                env = UnityEnvironment(' ')
                brain = env.brains['RealFakeBrain']
                mock_socket.recv.side_effect = dummy_reset
                brain_info = env.reset()
                mock_socket.recv.side_effect = dummy_step
                brain_info = env.step([0] * brain.vector_action_space_size * len(brain_info['RealFakeBrain'].agents))
                with pytest.raises(UnityActionException):
                    env.step([0])
                brain_info = env.step([0] * brain.vector_action_space_size * len(brain_info['RealFakeBrain'].agents))
                with pytest.raises(UnityActionException):
                    env.step([0] * brain.vector_action_space_size * len(brain_info['RealFakeBrain'].agents))
                env.close()
                assert env.global_done
                assert isinstance(brain_info, dict)
                assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
                assert isinstance(brain_info['RealFakeBrain'].visual_observations, list)
                assert isinstance(brain_info['RealFakeBrain'].vector_observations, np.ndarray)
                assert len(brain_info['RealFakeBrain'].visual_observations) == brain.number_visual_observations
                assert brain_info['RealFakeBrain'].vector_observations.shape[0] == \
                       len(brain_info['RealFakeBrain'].agents)
                assert brain_info['RealFakeBrain'].vector_observations.shape[1] == \
                       brain.vector_observation_space_size * brain.num_stacked_vector_observations
                assert not brain_info['RealFakeBrain'].local_done[0]
                assert brain_info['RealFakeBrain'].local_done[2]
Example #2
brain = env.brains[brain_name]


# ### 2. Examine the State and Action Spaces
# 
# In this environment, a double-jointed arm can move to target locations. A reward of `+0.1` is provided for each step that the agent's hand is in the goal location. Thus, the goal of your agent is to maintain its position at the target location for as many time steps as possible.
# 
# The observation space consists of `33` variables corresponding to position, rotation, velocity, and angular velocities of the arm.  Each action is a vector with four numbers, corresponding to torque applicable to two joints.  Every entry in the action vector must be a number between `-1` and `1`.
# 
# Run the code cell below to print some information about the environment.

# In[4]:


# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])
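# As a quick sanity check (a minimal sketch, not part of the excerpt above), the cell below sends random actions to all agents, clipping every entry to the required `[-1, 1]` range, and accumulates the rewards until any agent reports done.

scores = np.zeros(num_agents)                           # cumulative reward for each agent
while True:
    actions = np.random.randn(num_agents, action_size)  # one random action vector per agent
    actions = np.clip(actions, -1, 1)                   # every entry must lie between -1 and 1
    env_info = env.step(actions)[brain_name]            # send all actions to the environment
    scores += env_info.rewards                          # accumulate the per-agent rewards
    if np.any(env_info.local_done):                     # stop as soon as any agent is done
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))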
Example #3
class UnityEnvV0(Env, Serializable):
    def __init__(self,
                 app_name,
                 time_state=False,
                 idx=0,
                 is_render=False,
                 no_graphics=False,
                 recording=True):
        Serializable.quick_init(self, locals())

        # Unity scene
        self._env = UnityEnvironment(file_name=app_name,
                                     worker_id=idx,
                                     no_graphics=no_graphics)
        self.id = 0

        self.name = app_name
        self.idx = idx
        self.is_render = is_render

        self.time_state = time_state
        self.time_step = 0

        # Check brain configuration
        assert len(self._env.brains) == 1
        self.brain_name = self._env.external_brain_names[0]
        brain = self._env.brains[self.brain_name]

        # Check for number of agents in scene
        initial_info = self._env.reset()[self.brain_name]
        self.use_visual = (brain.number_visual_observations == 1) and False
        self.recording = brain.number_visual_observations == 1 and recording

        # Set observation and action spaces
        if brain.vector_action_space_type == "discrete":
            self._action_space = Discrete(1)
        else:
            high = np.array([np.inf] * (brain.vector_action_space_size))
            self._action_space = Box(-high, high)
        # ----------------------------------
        if self.use_visual and False and no_graphics:
            high = np.array([np.inf] * brain.camera_resolutions[0]["height"] *
                            brain.camera_resolutions[0]["width"] * 3)
            self._observation_space = Box(-high, high)
        else:
            if self.time_state:
                high = np.array([np.inf] *
                                (brain.vector_observation_space_size + 1))
            else:
                high = np.array([np.inf] *
                                (brain.vector_observation_space_size))
            self._observation_space = Box(-high, high)

        # video buffer
        self.frames = []

    def reset(self):
        self.frames = []
        info = self._env.reset()[self.brain_name]
        if self.is_render: self.observation = info.visual_observations[0]
        state = info.vector_observations[0][:]
        self._pos = info.vector_observations[0][:2]
        if self.time_state:
            state = np.hstack((state, [self.time_step]))
            self.time_step += 1
        self._collect_frames(info.visual_observations[0][0])
        return state.flatten()

    def step(self, action):
        info = self._env.step([action])[self.brain_name]
        if self.is_render: self.observation = info.visual_observations[0]
        state = info.vector_observations[0][:]
        self._pos = info.vector_observations[0][:2]
        reward = info.rewards[0]
        done = info.local_done[0]
        if self.time_state:
            state = np.hstack((state, [self.time_step]))
            self.time_step += 1
            if done: self.time_step = 0
        self._collect_frames(info.visual_observations[0][0])
        return Step(observation=state.flatten(), reward=reward, done=done)

    def terminate(self):
        self._env.close()

    def render(self, mode=None):
        if self.is_render:
            x = self.observation[0] * 255
            return np.array(x).astype('uint8')
        else:
            return np.zeros((480, 360, 3))

    def _collect_frames(self, frame):
        if self.recording:
            self.frames.append(np.uint8(frame * 255))

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def position(self):
        return self._pos
Example #4
def main(seed=seed):
    # ---------------------------------------------------------------------------------------------------
    #  Logger
    # ---------------------------------------------------------------------------------------------------
    save_path = f"./results/Reacher_DDPG_{pd.Timestamp.utcnow().value}"
    os.makedirs(save_path, exist_ok=True)

    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s : %(message)s')

    handler = logging.FileHandler(
        f"{save_path}/logs_navigation_{pd.Timestamp.utcnow().value}.log")
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # ---------------------------------------------------------------------------------------------------
    #  Inputs
    # ---------------------------------------------------------------------------------------------------
    n_episodes = 300
    config = dict(
        # Environment parameters
        env_name="Reacher",
        n_episodes=n_episodes,
        length_episode=1500,
        save_every=100,
        save_path=save_path,
        mode="train",  # "train" or "test"
        evaluate_every=
        5000,  # Number of training episodes before 1 evaluation episode
        eps_decay=1,  # Epsilon decay rate

        # Agent Parameters
        agent="DDPG",
        hidden_layers_actor=(200, 150),  # (50, 50, 15),  # (200, 150),  #
        hidden_layers_critic_body=(400, ),  # (50, 50,),  #
        hidden_layers_critic_head=(300, ),  # (50,),   # (300,)
        func_critic_body="F.leaky_relu",  #
        func_critic_head="F.leaky_relu",  #
        func_actor_body="F.leaky_relu",  #
        lr_scheduler=
        None,  #{'scheduler_type': "multistep",  # "step", "exp" or "decay", "multistep"
        #               'gamma': 0.5,  # 0.99999,
        #               'step_size': 1,
        #               'milestones': [15*1000 * i for i in range(1, 6)],
        #               'max_epochs': n_episodes},
        TAU=1e-3,  # for soft update of target parameters
        BUFFER_SIZE=int(1e6),  # replay buffer size
        BATCH_SIZE=128,  # minibatch size
        GAMMA=0.99,  # discount factor
        LR_ACTOR=1e-3,  # learning rate of the actor
        LR_CRITIC=1e-3,  # learning rate of the critic
        WEIGHT_DECAY=0,  # L2 weight decay
        UPDATE_EVERY=1,  # Number of actions before making a learning step
        action_noise="OU",  #
        action_noise_scale=1,
        weights_noise=None,  #
        state_normalizer="BatchNorm",  # "RunningMeanStd" or "BatchNorm"
        warmup=0,  # Number of random actions to start with as a warm-up
        start_time=str(pd.Timestamp.utcnow()),
        random_seed=seed,
        threshold=30)
    logger.warning("+=" * 90)
    logger.warning(f"  RUNNING SIMULATION WITH PARAMETERS config={config}")
    logger.warning("+=" * 90)

    # ------------------------------------------------------------
    #  1. Initialization
    # ------------------------------------------------------------
    # 1. Start the Environment

    # env = UnityEnvironment(file_name=f'./Reacher_Linux_2/Reacher.x86_64')  # Linux
    env = UnityEnvironment(file_name=f'./{config["env_name"]}')  # mac OS

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    config["n_agents"] = num_agents

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])
    config.update(dict(action_size=action_size, state_size=state_size))

    # ------------------------------------------------------------
    #  2. Training
    # ------------------------------------------------------------
    # Unity Monitor
    monitor = UnityMonitor(env=env, config=config)

    if config["mode"] == "train":
        # Actor model
        seed = 0
        actor = SimpleNeuralNetHead(action_size,
                                    SimpleNeuralNetBody(
                                        state_size,
                                        config["hidden_layers_actor"],
                                        seed=seed),
                                    func=F.tanh,
                                    seed=seed)
        actor_target = SimpleNeuralNetHead(action_size,
                                           SimpleNeuralNetBody(
                                               state_size,
                                               config["hidden_layers_actor"],
                                               seed=seed),
                                           func=F.tanh,
                                           seed=seed)
        # Critic model
        critic = DeepNeuralNetHeadCritic(
            action_size,
            SimpleNeuralNetBody(state_size,
                                config["hidden_layers_critic_body"],
                                func=eval(config["func_critic_body"]),
                                seed=seed),
            hidden_layers_sizes=config["hidden_layers_critic_head"],
            func=eval(config["func_critic_head"]),
            end_func=None,
            seed=seed)

        critic_target = DeepNeuralNetHeadCritic(
            action_size,
            SimpleNeuralNetBody(state_size,
                                config["hidden_layers_critic_body"],
                                func=eval(config["func_critic_body"]),
                                seed=seed),
            hidden_layers_sizes=config["hidden_layers_critic_head"],
            func=eval(config["func_critic_head"]),
            end_func=None,
            seed=seed)

        # DDPG Agent
        agent = DDPGAgent(
            state_size=state_size,
            action_size=action_size,
            model_actor=actor,
            model_critic=critic,
            # actor_target=actor_target, critic_target=critic_target,
            action_space_low=-1,
            action_space_high=1,
            config=config,
        )

        # Training
        start = pd.Timestamp.utcnow()
        scores = monitor.run(agent)
        logger.info("Average Score last 100 episodes: {}".format(
            np.mean(scores[-100:])))
        elapsed_time = pd.Timedelta(pd.Timestamp.utcnow() -
                                    start).total_seconds()
        logger.info(f"Elapsed Time: {elapsed_time} seconds")

    # ------------------------------------------------------------
    #  3. Testing
    # ------------------------------------------------------------
    else:
        agent = DDPGAgent.load(filepath=config['save_path'], mode="test")
        scores = monitor.run(agent)
        logger.info(
            f"Test Score over {len(scores)} episodes: {np.mean(scores)}")
        config["test_scores"] = scores
        config["best_test_score"] = max(scores)
        config["avg_test_score"] = np.mean(scores)

    # When finished, you can close the environment.
    logger.info("Closing...")
    env.close()
Example #5
class UnityEnv(IEnvironment):
    def __init__(self, name):

        drl_logger.info("Initializing environment.",
                        extra={"params": {
                            "name": name,
                        }})

        self.env = UnityEnvironment(file_name=name)
        self.brain_name = self.env.brain_names[0]
        self.termination_reward = 0

    def action_offset(self):
        return 0

    def close(self):
        self.env.close()

    def get_action_space(self):
        # isDiscrete = isinstance(self.__env.action_space, Discrete)
        #
        # if isDiscrete:
        #     num_action_space = self.__env.action_space.n
        #     logging.debug("Env action space is discrete")
        #     logging.debug("Env action space: {}".format(num_action_space))
        #
        # logging.debug("Env observation space: {}".format(self.__env.observation_space))
        pass

    def render(self, mode):
        pass

    def reset(self):
        brain_name = self.env.brain_names[0]
        # brain = self.__env.brains[brain_name]

        env_info = self.env.reset(
            train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]  # get the current state
        # state = env_info.vector_observations  # get the current state

        new_life = True

        return state, new_life

    def start_game_action(self):
        return None

    def step(self, action):
        env_info = self.env.step(action)[
            self.brain_name]  # send the action to the environment

        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]  # get the reward
        done = env_info.local_done[0]  # see if episode has finished

        if done:
            reward += self.termination_reward

        new_life = False

        return next_state, reward, done, new_life
Example #6
class TrainerController(object):
    def __init__(self, env_path, run_id, save_freq, curriculum_file,
                 fast_simulation, load, train, worker_id, keep_checkpoints,
                 lesson, seed, docker_target_name, trainer_config_path,
                 no_graphics):
        """
        :param env_path: Location to the environment executable to be loaded.
        :param run_id: The sub-directory name for model and summary statistics
        :param save_freq: Frequency at which to save model
        :param curriculum_file: Curriculum json file for environment
        :param fast_simulation: Whether to run the game at training speed
        :param load: Whether to load the model or randomly initialize
        :param train: Whether to train model, or only run inference
        :param worker_id: Number to add to communication port (5005). Used for multi-environment
        :param keep_checkpoints: How many model checkpoints to keep
        :param lesson: Start learning from this lesson
        :param seed: Random seed used for training.
        :param docker_target_name: Name of docker volume that will contain all data.
        :param trainer_config_path: Fully qualified path to location of trainer configuration file
        :param no_graphics: Whether to run the Unity simulator in no-graphics mode
        """
        self.trainer_config_path = trainer_config_path
        if env_path is not None:
            env_path = (env_path.strip().replace('.app', '').replace(
                '.exe', '').replace('.x86_64', '').replace('.x86', '')
                        )  # Strip out executable extensions if passed
        # Recognize and use docker volume if one is passed as an argument
        if docker_target_name == '':
            self.docker_training = False
            self.model_path = './experiments/{run_id}'.format(run_id=run_id)
            self.curriculum_file = curriculum_file
            self.summaries_dir = './summaries'
        else:
            self.docker_training = True
            self.model_path = '/{docker_target_name}/experiments/{run_id}'.format(
                docker_target_name=docker_target_name, run_id=run_id)
            if env_path is not None:
                env_path = '/{docker_target_name}/{env_name}'.format(
                    docker_target_name=docker_target_name, env_name=env_path)
            if curriculum_file is None:
                self.curriculum_file = None
            else:
                self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format(
                    docker_target_name=docker_target_name,
                    curriculum_file=curriculum_file)
            self.summaries_dir = '/{docker_target_name}/summaries'.format(
                docker_target_name=docker_target_name)
        self.logger = logging.getLogger("unityagents")
        self.run_id = run_id
        self.save_freq = save_freq
        self.lesson = lesson
        self.fast_simulation = fast_simulation
        self.load_model = load
        self.train_model = train
        self.worker_id = worker_id
        self.keep_checkpoints = keep_checkpoints
        self.trainers = {}
        if seed == -1:
            seed = np.random.randint(0, 999999)
        self.seed = seed
        np.random.seed(self.seed)
        tf.set_random_seed(self.seed)
        self.env = UnityEnvironment(file_name=env_path,
                                    worker_id=self.worker_id,
                                    curriculum=self.curriculum_file,
                                    seed=self.seed,
                                    docker_training=self.docker_training,
                                    no_graphics=no_graphics)
        if env_path is None:
            self.env_name = 'editor_' + self.env.academy_name
        else:
            self.env_name = os.path.basename(
                os.path.normpath(env_path))  # Extract out name of environment

    def _get_progress(self):
        if self.curriculum_file is not None:
            progress = 0
            if self.env.curriculum.measure_type == "progress":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[
                        brain_name].get_step / self.trainers[
                            brain_name].get_max_steps
                return progress / len(self.env.external_brain_names)
            elif self.env.curriculum.measure_type == "reward":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_last_reward
                return progress
            else:
                return None
        else:
            return None

    def _process_graph(self):
        nodes = []
        scopes = []
        for brain_name in self.trainers.keys():
            if self.trainers[brain_name].graph_scope is not None:
                scope = self.trainers[brain_name].graph_scope + '/'
                if scope == '/':
                    scope = ''
                scopes += [scope]
                if self.trainers[brain_name].parameters[
                        "trainer"] == "imitation":
                    nodes += [scope + x for x in ["action"]]
                else:
                    nodes += [
                        scope + x
                        for x in ["action", "value_estimate", "action_probs"]
                    ]
                if self.trainers[brain_name].parameters["use_recurrent"]:
                    nodes += [
                        scope + x for x in ["recurrent_out", "memory_size"]
                    ]
        if len(scopes) > 1:
            self.logger.info("List of available scopes :")
            for scope in scopes:
                self.logger.info("\t" + scope)
        self.logger.info("List of nodes to export :")
        for n in nodes:
            self.logger.info("\t" + n)
        return nodes

    def _save_model(self, sess, saver, steps=0):
        """
        Saves current model to checkpoint folder.
        :param sess: Current Tensorflow session.
        :param steps: Current number of steps in training process.
        :param saver: Tensorflow saver for session.
        """
        last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk'
        saver.save(sess, last_checkpoint)
        tf.train.write_graph(sess.graph_def,
                             self.model_path,
                             'raw_graph_def.pb',
                             as_text=False)
        self.logger.info("Saved Model")

    def _export_graph(self):
        """
        Exports latest saved model to .bytes format for Unity embedding.
        """
        target_nodes = ','.join(self._process_graph())
        ckpt = tf.train.get_checkpoint_state(self.model_path)
        freeze_graph.freeze_graph(
            input_graph=self.model_path + '/raw_graph_def.pb',
            input_binary=True,
            input_checkpoint=ckpt.model_checkpoint_path,
            output_node_names=target_nodes,
            output_graph=self.model_path + '/' + self.env_name + "_" +
            self.run_id + '.bytes',
            clear_devices=True,
            initializer_nodes="",
            input_saver="",
            restore_op_name="save/restore_all",
            filename_tensor_name="save/Const:0")

    def _initialize_trainers(self, trainer_config, sess):
        trainer_parameters_dict = {}
        self.trainers = {}
        for brain_name in self.env.external_brain_names:
            trainer_parameters = trainer_config['default'].copy()
            if len(self.env.external_brain_names) > 1:
                graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
                trainer_parameters['graph_scope'] = graph_scope
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id) + '_' + graph_scope)
            else:
                trainer_parameters['graph_scope'] = ''
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir, name=str(self.run_id))
            if brain_name in trainer_config:
                _brain_key = brain_name
                while not isinstance(trainer_config[_brain_key], dict):
                    _brain_key = trainer_config[_brain_key]
                for k in trainer_config[_brain_key]:
                    trainer_parameters[k] = trainer_config[_brain_key][k]
            trainer_parameters_dict[brain_name] = trainer_parameters.copy()
        for brain_name in self.env.external_brain_names:
            if trainer_parameters_dict[brain_name]['trainer'] == "imitation":
                self.trainers[brain_name] = BehavioralCloningTrainer(
                    sess, self.env, brain_name,
                    trainer_parameters_dict[brain_name], self.train_model,
                    self.seed)
            elif trainer_parameters_dict[brain_name]['trainer'] == "ppo":
                self.trainers[brain_name] = PPOTrainer(
                    sess, self.env, brain_name,
                    trainer_parameters_dict[brain_name], self.train_model,
                    self.seed)
            else:
                raise UnityEnvironmentException(
                    "The trainer config contains an unknown trainer type for brain {}"
                    .format(brain_name))

    def _load_config(self):
        try:
            with open(self.trainer_config_path) as data_file:
                trainer_config = yaml.load(data_file)
                return trainer_config
        except IOError:
            raise UnityEnvironmentException(
                """Parameter file could not be found here {}.
                                            Will use default Hyper parameters"""
                .format(self.trainer_config_path))
        except UnicodeDecodeError:
            raise UnityEnvironmentException(
                "There was an error decoding Trainer Config from this path : {}"
                .format(self.trainer_config_path))

    @staticmethod
    def _create_model_path(model_path):
        try:
            if not os.path.exists(model_path):
                os.makedirs(model_path)
        except Exception:
            raise UnityEnvironmentException(
                "The folder {} containing the generated model could not be accessed."
                " Please make sure the permissions are set correctly.".format(
                    model_path))

    def start_learning(self):
        self.env.curriculum.set_lesson_number(self.lesson)
        trainer_config = self._load_config()
        self._create_model_path(self.model_path)

        tf.reset_default_graph()

        with tf.Session() as sess:
            self._initialize_trainers(trainer_config, sess)
            for k, t in self.trainers.items():
                self.logger.info(t)
            init = tf.global_variables_initializer()
            saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
            # Instantiate model parameters
            if self.load_model:
                self.logger.info('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(self.model_path)
                if ckpt is None:
                    self.logger.info(
                        'The model {0} could not be found. Make sure you specified the right '
                        '--run-id'.format(self.model_path))
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(init)
            global_step = 0  # This is only for saving the model
            self.env.curriculum.increment_lesson(self._get_progress())
            curr_info = self.env.reset(train_mode=self.fast_simulation)
            if self.train_model:
                for brain_name, trainer in self.trainers.items():
                    trainer.write_tensorboard_text('Hyperparameters',
                                                   trainer.parameters)
            try:
                while any([
                        t.get_step <= t.get_max_steps
                        for k, t in self.trainers.items()
                ]) or not self.train_model:
                    if self.env.global_done:
                        self.env.curriculum.increment_lesson(
                            self._get_progress())
                        curr_info = self.env.reset(
                            train_mode=self.fast_simulation)
                        for brain_name, trainer in self.trainers.items():
                            trainer.end_episode()
                    # Decide and take an action
                    take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {}
                    for brain_name, trainer in self.trainers.items():
                        (take_action_vector[brain_name],
                         take_action_memories[brain_name],
                         take_action_text[brain_name],
                         take_action_outputs[brain_name]
                         ) = trainer.take_action(curr_info)
                    new_info = self.env.step(vector_action=take_action_vector,
                                             memory=take_action_memories,
                                             text_action=take_action_text)
                    for brain_name, trainer in self.trainers.items():
                        trainer.add_experiences(
                            curr_info, new_info,
                            take_action_outputs[brain_name])
                        trainer.process_experiences(curr_info, new_info)
                        if trainer.is_ready_update(
                        ) and self.train_model and trainer.get_step <= trainer.get_max_steps:
                            # Perform gradient descent with experience buffer
                            trainer.update_model()
                        # Write training statistics to Tensorboard.
                        trainer.write_summary(
                            self.env.curriculum.lesson_number)
                        if self.train_model and trainer.get_step <= trainer.get_max_steps:
                            trainer.increment_step_and_update_last_reward()
                    if self.train_model:
                        global_step += 1
                    if global_step % self.save_freq == 0 and global_step != 0 and self.train_model:
                        # Save Tensorflow model
                        self._save_model(sess, steps=global_step, saver=saver)
                    curr_info = new_info
                # Final save Tensorflow model
                if global_step != 0 and self.train_model:
                    self._save_model(sess, steps=global_step, saver=saver)
            except KeyboardInterrupt:
                print(
                    '--------------------------Now saving model-------------------------'
                )
                if self.train_model:
                    self.logger.info(
                        "Learning was interrupted. Please wait while the graph is generated."
                    )
                    self._save_model(sess, steps=global_step, saver=saver)
                pass
        self.env.close()
        if self.train_model:
            self._export_graph()
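The constructor parameters documented in the docstring above map directly onto a training entry point. The sketch below shows one way the controller might be driven; every value is illustrative, and the environment binary and YAML paths are placeholders rather than files from the original project.

# Hypothetical driver for the TrainerController defined above. All values are
# illustrative; the environment binary and YAML paths are placeholders.
if __name__ == '__main__':
    controller = TrainerController(
        env_path='./envs/3DBall',            # compiled Unity environment (extension is stripped internally)
        run_id='ppo_run_1',                  # sub-directory for models and summary statistics
        save_freq=50000,                     # save a checkpoint every 50k global steps
        curriculum_file=None,                # no curriculum for this run
        fast_simulation=True,                # run the game at training speed
        load=False,                          # start from a random initialization
        train=True,                          # train rather than run inference only
        worker_id=0,                         # offset added to the base communication port (5005)
        keep_checkpoints=5,                  # number of model checkpoints to keep
        lesson=0,                            # start from the first curriculum lesson
        seed=-1,                             # -1 lets the controller draw a random seed
        docker_target_name='',               # empty string: no docker volume in use
        trainer_config_path='./trainer_config.yaml',
        no_graphics=False)
    controller.start_learning()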
Example #7
import numpy as np

from unityagents import UnityEnvironment

env_name = "tv_maze"
env = UnityEnvironment(file_name=env_name, worker_id = 2)
print(str(env))

default_brain = env.brain_names[0]
brain = env.brains[default_brain]
train_mode = False
prevState = None
np.random.seed(13)

d = {"startLoc" : 1, "render" : 1.0, "tv" : 0.0, "door" : 1.0}
for episode in range(300):
    env_info = env.reset(train_mode=train_mode, config = d)[default_brain]
    done = False
    episode_rewards = 0
    for i in range(1000):
        print(env_info.states)
        if brain.action_space_type == 'continuous':

            act = np.random.randn(len(env_info.agents), brain.action_space_size)
            if False:
                quaternion = [1,0,0,0]
                quaternion = np.array(quaternion)
                act[:, :4] = quaternion
            env_info = env.step(act)[default_brain]
        else:
            a = int(input("input: "))
            env_info = env.step(a)[default_brain]
Example #8
import pickle

NO_GRAPHICS = True
GPU_SERVER = True
MONITOR_INTERVAL = 10
TRAIN_MODE = True

env = UnityEnvironment(file_name='../Reacher_Linux_NoVis/Reacher.x86_64'
                       if GPU_SERVER else '../Reacher.app',
                       no_graphics=NO_GRAPHICS)

brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=TRAIN_MODE)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])
Example #9
def main():
    parser = argparse.ArgumentParser(
        description=
        'Train a ddpg agent to play the Unity Environment Reacher app')
    parser.add_argument("--episodes",
                        type=int,
                        help="Number of training episodes to run",
                        default=200)
    parser.add_argument("--max_steps",
                        type=int,
                        help="Maximum steps per episode",
                        default=1000)
    parser.add_argument(
        "--saveto",
        help=
        "Save agent after training.  agent- and critic- are prepended to the specified name.",
        default='checkpoint.pth')
    parser.add_argument("--loadfrom",
                        help="Load previously saved model before training")
    parser.add_argument(
        "--min_score",
        type=float,
        help="Only save the model if the it achieves this score",
        default=30.)
    parser.add_argument("--saveplot", help="Location to save plot of scores")
    parser.add_argument(
        "--environment",
        help="Path to Unity environment for game (i.e. ./Reacher.App)",
        default="./Reacher.app")
    parser.add_argument(
        "--eval",
        action="store_true",  # 'type=bool' would treat any non-empty string, even "False", as True
        help=
        "Turns on eval mode, which affects the unity environment and removes the random noise from the predicted agent actions")
    args = parser.parse_args()

    env = UnityEnvironment(file_name=args.environment)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of actions
    action_size = brain.vector_action_space_size

    # examine the state space
    state = env_info.vector_observations[0]
    state_size = len(state)

    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # Create agent and start training
    _agent = ddpg_agent.DDPGAgent(state_size, action_size, num_agents)
    if args.loadfrom:
        _agent.load(args.loadfrom)
    _coach = coach.Coach(_agent, env)
    scores = _coach.run_episodes(args.episodes,
                                 args.max_steps,
                                 train=not args.eval)
    mean_score = np.mean(scores[-100:])

    # Save the network if successful
    if mean_score > args.min_score and args.saveto:
        _agent.save(args.saveto)
        print("Training succeeded!")

    # Plot scores
    plt.plot(scores)
    plt.plot(moving_average(scores, 100), color='red')
    plt.ylabel('Episode scores')
    if args.saveplot:
        plt.savefig(args.saveplot, bbox_inches='tight')

    print("Your agent received a final mean score of {}".format(mean_score))
Example #10
def train_unity_ddpg(PATH, env_name, platform, env_path, policy,
                     score_threshold, timestamp, start, n_episodes, max_t,
                     num_agents):
    """ Trains unity environments with DDPG policy """
    total_scores = []
    from unityagents import UnityEnvironment
    env_path = PATH + f"data/{env_path}"
    env = UnityEnvironment(file_name=env_path)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    print(f"Number of agents: {num_agents}")
    states = env_info.vector_observations
    state_size = states.shape[1]
    print(
        f"There are {states.shape[0]} agents.  Each observes a state with length {state_size}"
    )
    print(f"The state for the first agent looks like:\n{states[0]}")
    action_size = brain.vector_action_space_size
    print(f"Size of each action: {action_size}")
    policy = policy(state_size, action_size, num_agents)
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        policy.reset()
        for t in range(max_t):
            actions = policy.act(states)
            env_info = env.step(actions)[
                brain_name]  # send the action to the environment
            next_states = env_info.vector_observations
            rewards = env_info.rewards  # get the reward
            dones = env_info.local_done
            policy.step(states, actions, rewards, next_states, dones, t)
            states = next_states
            scores += env_info.rewards
            if np.any(dones):
                break
        score_length = len(total_scores) if len(total_scores) < 100 else 100
        mean_score = np.mean(scores)
        min_score = np.min(scores)
        max_score = np.max(scores)
        total_scores.append(mean_score)
        total_average_score = np.mean(total_scores[-score_length:])
        end = time.time()
        print(
            f'\rEpisode {i_episode}\tScore TAS/Mean/Max/Min: {total_average_score:.2f}/{mean_score:.2f}/{max_score:.2f}/{min_score:.2f}\t{calc_runtime(end-start)}',
            end=" ")
        if i_episode % 20 == 0 or total_average_score >= score_threshold:
            fap = PATH + f'results/{env_name}_{timestamp}_checkpoint_actor.pth'
            torch.save(policy.actor.state_dict(), fap)
            fcp = PATH + f'results/{env_name}_{timestamp}_checkpoint_critic.pth'
            torch.save(policy.critic.state_dict(), fcp)
            print(
                f'\rEpisode {i_episode}\tScore TAS/Mean/Max/Min: {total_average_score:.2f}/{mean_score:.2f}/{max_score:.2f}/{min_score:.2f}\t{calc_runtime(end-start)}'
            )
        if total_average_score > score_threshold:
            print(f"Solved in {i_episode} and {calc_runtime(end-start)}")
            break
    env.close()
    return total_scores
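Given the signature and docstring above, a call could look like the sketch below; the paths, the DDPGPolicy class and the threshold are placeholders, not values taken from the original project.

# Illustrative call only: DDPGPolicy, the directory layout and the threshold are
# placeholders. The function expects the environment under PATH + 'data/' and
# writes checkpoints under PATH + 'results/'.
import time

start = time.time()
scores = train_unity_ddpg(PATH='./',
                          env_name='Reacher',
                          platform='linux',
                          env_path='Reacher_Linux/Reacher.x86_64',
                          policy=DDPGPolicy,   # any class taking (state_size, action_size, num_agents)
                          score_threshold=30.0,
                          timestamp=int(start),
                          start=start,
                          n_episodes=500,
                          max_t=1000,
                          num_agents=20)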
Example #11
    summary.value.add(tag="Cumulated Reward", simple_value=episodeReward)
    summary.value.add(tag="Epsilon", simple_value=epsilon)
    summary.value.add(tag="Learning Rate", simple_value=lr)
    summary.value.add(tag="Episode Length", simple_value=episodeStep)
    writer.add_summary(summary, episode)
    writer.flush()


with tf.Session(config=config) as sess:
    sess.run(init)
    totalStep = 0
    for episode in range(maxEpisode):
        # initial observation
        episodeStep = 0
        episodeReward = 0
        info = env.reset()[brain_name]
        state = info.states[0]
        while True:
            action = RL.choose_action(state)
            new_info = env.step({brain_name: [action]})[brain_name]
            RL.store_transition(state, action, new_info.rewards[0],
                                new_info.states[0])
            episodeReward += new_info.rewards[0]
            if (totalStep > 200) and (totalStep % learning_freq == 0):
                RL.learn()
            # advance the state and check for episode termination on every
            # step, not only on learning steps
            state = new_info.states[0]
            if new_info.local_done[0]:
                break
            totalStep += 1
            episodeStep += 1
        if (episode % summary_freq == 0):
Example #12
class UnityEnv(Env):
    allowed_modes = ['vector', 'visual']

    def __init__(self, filename: str, mode='vector',
                 frame_size=(84, 84), use_grayscale=True, n_frames=4,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        if mode not in self.allowed_modes:
            raise Exception("Allowed modes : %s" % self.allowed_modes)

        if "headless" in kwargs:
            del kwargs["headless"]

        if "train_mode" in kwargs:
            del kwargs["train_mode"]

        self.mode = mode
        self.env = UnityEnvironment(filename, no_graphics=self._headless, **kwargs)
        self.brain_name = self.env.brain_names[0]
        brain = self.env.brains[self.brain_name]
        env_info = self.env.reset(train_mode=self._train_mode)[self.brain_name]

        self.nA = brain.vector_action_space_size
        self.action_shape = (self.nA,)

        if mode == 'vector':
            self.nS = len(env_info.vector_observations[0])
            self.state_shape = (self.nS,)
        elif mode == 'visual':
            self.frame_size = tuple(frame_size)
            self.use_grayscale = use_grayscale
            self.n_frames = n_frames
            self.frame_buffer = deque(maxlen=self.n_frames)
            num_channels = 1
            if not use_grayscale:
                num_channels = 3
            self.state_shape = self.frame_size + (num_channels * n_frames,)

    def reset(self):
        if self.mode == 'visual':
            self.frame_buffer.clear()
        env_info = self.env.reset(train_mode=self._train_mode)[self.brain_name]
        return self._to_state(env_info)

    def step(self, action):
        env_info = self.env.step(action)[self.brain_name]
        next_state = self._to_state(env_info)
        reward = env_info.rewards[0]
        done = env_info.local_done[0]

        return next_state, reward, done, env_info

    def render(self, **kwargs):
        pass

    def close(self):
        pass

    def _process_frame(self, frame):
        frame = np.squeeze(frame, axis=0)
        frame = resize(frame, self.frame_size, mode='constant', anti_aliasing=True)
        if self.use_grayscale:
            frame = np.expand_dims(rgb2gray(frame), axis=2)
        return frame

    def _to_state(self, env_info):
        if self.mode == 'vector':
            return env_info.vector_observations[0]
        elif self.mode == 'visual':
            frame = self._process_frame(env_info.visual_observations[0])
            if len(self.frame_buffer) == 0:
                for i in range(self.n_frames):
                    self.frame_buffer.append(frame)
            else:
                self.frame_buffer.append(frame)

            result = np.reshape(self.frame_buffer, self.state_shape)
            result = np.expand_dims(result, axis=0)
            return result
Example #13
def main():
    # ---------------------------------------------------------------------------------------------------
    #  Logger
    # ---------------------------------------------------------------------------------------------------
    save_path = f"./results/Tennis_DDPG_{pd.Timestamp.utcnow().value}"
    os.makedirs(save_path, exist_ok=True)

    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s : %(message)s')

    handler = logging.FileHandler(
        f"{save_path}/logs_p3_{pd.Timestamp.utcnow().value}.log")
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # ---------------------------------------------------------------------------------------------------
    #  Inputs
    # ---------------------------------------------------------------------------------------------------
    import json
    with open(f"./assets/best_agent/config.json", "r") as f:
        config = json.load(f)
    config["mode"] = "test"
    config["n_episodes"] = 10
    config["warmup"] = 0

    logger.warning("+=" * 90)
    logger.warning(f"  RUNNING SIMULATION WITH PARAMETERS config={config}")
    logger.warning("+=" * 90)

    # ------------------------------------------------------------
    #  1. Initialization
    # ------------------------------------------------------------
    # 1. Start the Environment
    env = UnityEnvironment(file_name=f'./{config["env_name"]}')  # mac OS

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    config["n_agents"] = num_agents

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])
    config.update(dict(action_size=action_size, state_size=state_size))

    # ------------------------------------------------------------
    #  2. Training
    # ------------------------------------------------------------
    # Unity Monitor
    monitor = UnityMonitor(env=env, config=config)

    # Actor model
    seed = 0
    actor = SimpleNeuralNetHead(action_size,
                                SimpleNeuralNetBody(
                                    state_size,
                                    config["hidden_layers_actor"],
                                    seed=seed),
                                func=torch.tanh,
                                seed=seed)
    # Critic model
    critic = DeepNeuralNetHeadCritic(
        action_size * num_agents,
        SimpleNeuralNetBody(state_size * num_agents,
                            config["hidden_layers_critic_body"],
                            func=eval(config["func_critic_body"]),
                            seed=seed),
        hidden_layers_sizes=config["hidden_layers_critic_head"],
        func=eval(config["func_critic_head"]),
        end_func=None,
        seed=seed)

    # MADDPG Agent
    agent = MADDPGAgent(
        state_size=state_size,
        action_size=action_size,
        model_actor=actor,
        model_critic=critic,
        action_space_low=-1,
        action_space_high=1,
        config=config,
    )

    # ------------------------------------------------------------
    #  3. Testing
    # ------------------------------------------------------------
    logger.warning("Entering Test Mode!")
    monitor.n_episodes = 100
    env.reset(train_mode=False)
    env.warmup = 0
    agent.warmup = 0
    for a in agent.agents:
        a.warmup = 0
    agent.load(filepath="./assets/best_agent", mode="test")
    scores = monitor.run(agent)
    logger.info(f"Test Score over {len(scores)} episodes: {np.mean(scores)}")
    config["test_scores"] = scores
    config["best_test_score"] = max(scores)
    config["avg_test_score"] = np.mean(scores)

    # When finished, you can close the environment.
    logger.info("Closing...")
    env.close()
Example #14
class Env:
    '''A convenience class for generating episodes and memories

    This convenience class provides a context manager that wraps a Unity
    environment. The Unity environment and the OpenAI Gym environment behave
    slightly differently, which makes it difficult to write a single uniform
    algorithm that handles both. This wrapper tries to solve that problem.
    '''
    def __init__(self, fileName, showEnv=False, trainMode=True):
        '''Initialize the environment

        This stores the settings that will later be used to generate the
        Unity environment. A compiled binary file for the environment must be
        provided. The environment can be generated in *headless* mode by
        passing ``showEnv=False``, in which case no window is shown at
        startup; this is useful for training, and for running without an X
        server, for example when running remotely. Passing ``trainMode=True``
        primes the environment for training: each frame finishes as quickly
        as possible, which is not convenient for watching what is happening
        but significantly increases the speed of training.

        Arguments:
            fileName {str} -- Path to the binary file. This file must match
                the version of the ``unityagents`` package in use.

        Keyword Arguments:
            showEnv {bool} -- Set this to ``True`` if you want to view the
                environment (default: {False})
            trainMode {bool} -- Set this to ``True`` if you want the environment
                to be in training mode (i.e. fast execution) (default: {True})
        '''

        try:
            self.no_graphics = not showEnv
            self.trainMode = trainMode
            self.fileName = fileName
            self.states = None
        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.__init__ - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])
        return

    def __enter__(self):
        '''generate a context manager

        This generates the context manager so that the class can be used
        within a ``with`` statement. It is the method that actually
        initializes the Unity environment and keeps it available until it is
        needed.

        Returns:
            ``self`` -- an instance of this class
        '''

        try:
            self.env = UnityEnvironment(file_name=self.fileName,
                                        no_graphics=self.no_graphics)

            # get the default brain
            self.brain_name = self.env.brain_names[0]
            self.brain = self.env.brains[self.brain_name]
            self.env_info = self.env.reset(
                train_mode=self.trainMode)[self.brain_name]

            self.num_agents = len(self.env_info.agents)
            self.action_size = self.brain.vector_action_space_size
        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.__enter__ - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])

        return self

    def reset(self):
        '''reset the environment before starting an episode

        Returns:
            states -- the current set of observations after the reset
        '''
        try:
            # capture the BrainInfo returned by the reset so that the stored
            # observations describe the new episode rather than a stale one
            self.env_info = self.env.reset(
                train_mode=self.trainMode)[self.brain_name]
            self.states = self.env_info.vector_observations
        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.reset - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])
        return self.states

    def step(self, policy):
        '''advance one step by taking an action

        This function takes a policy function, generates an action according
        to that policy, and advances the episode by one step, returning the
        reward and the next state along with the done information.

        Arguments:
            policy {function} -- A function that takes a state vector and
                returns an action vector. It is assumed to be a valid policy
                for the current environment, returning actions of the right
                type and shape; its validity is not checked.

        Returns:
            list -- A list of tuples ``(s_t, a_t, r_{t+1}, s_{t+1}, d)``,
                one tuple per agent. Even in the single-agent case this is
                still a list.
        '''

        try:
            states = self.states.copy()
            actions = policy(states)
            env_info = self.env.step(actions)[self.brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            self.states = next_states

            results = []
            for i in range(self.num_agents):
                state = states[i]
                action = actions[i]
                reward = rewards[i]
                next_state = next_states[i]
                done = dones[i]

                results.append((state, action, reward, next_state, done))

        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.step - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])

        return results

    def episode(self, policy, maxSteps=None):
        '''generate data for an entire episode

        This function plays out an entire episode. It first resets the
        environment to the beginning and then plays the game for a given
        number of steps (or until the game terminates). It generates one list
        of tuples per agent. Remember that even when the number of agents is
        1, a list of lists is still returned.

        Arguments:
            policy {function} -- The function that takes the current state and
                returns the action vector.

        Keyword Arguments:
            maxSteps {int or None} -- The maximum number of steps to play
                before the episode is terminated (default: {None}, in which
                case the episode continues until it actually finishes)

        Returns:
            list -- The list of tuples for the entire episode. Again, this is
                a list of lists, one per agent.
        '''

        try:
            self.reset()
            stepCount = 0
            allResults = [[] for _ in range(self.num_agents)]

            while True:

                stepCount += 1
                finished = False
                results = self.step(policy)
                for agent in range(self.num_agents):
                    state, action, reward, next_state, done = results[agent]
                    allResults[agent].append(results[agent])
                    finished = finished or done

                if finished:
                    break

                if (maxSteps is not None) and (stepCount >= maxSteps):
                    break
        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.episode - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])

        return allResults

    def __exit__(self, exc, value, traceback):
        '''Exit the context manager

        This is called when the ``with`` block is left; it closes the
        underlying Unity environment. Typically any error would be checked at
        this point, but that is handled at a higher level.

        Arguments:
            exc {type or None} -- the exception type, if one was raised
            value {Exception or None} -- the exception instance
            traceback {traceback or None} -- the associated traceback
        '''

        if not exc:
            self.env.close()
            return True
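Since the class is written as a context manager, a typical use looks like the sketch below; the binary path and the random policy are placeholders, not code from the original project.

# Usage sketch for the Env wrapper above; the binary path and the random policy
# are placeholders.
import numpy as np

with Env('./Reacher.app', showEnv=False, trainMode=True) as env:

    def random_policy(states):
        # one action vector per agent, clipped to the [-1, 1] range Unity expects
        return np.clip(np.random.randn(len(states), env.action_size), -1, 1)

    allResults = env.episode(random_policy, maxSteps=300)
    for agent_idx, agentResults in enumerate(allResults):
        episode_return = sum(r for (_, _, r, _, _) in agentResults)
        print('agent {} return: {}'.format(agent_idx, episode_return))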
Example #15
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , '''
        
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])
Example #16
0
class UnityEnv:
    '''
    Class for all Envs.
    Standardizes the UnityEnv design to work in Lab.
    Access Agents properties by: Agents - AgentSpace - AEBSpace - EnvSpace - Envs
    '''

    def __init__(self, env_spec, env_space, e=0):
        self.env_spec = env_spec
        self.env_space = env_space
        self.info_space = env_space.info_space
        self.e = e
        util.set_attr(self, self.env_spec)
        self.name = self.env_spec['name']
        self.body_e = None
        self.nanflat_body_e = None  # nanflatten version of bodies
        self.body_num = None

        worker_id = int(f'{os.getpid()}{self.e+int(ps.unique_id())}'[-4:])
        self.u_env = UnityEnvironment(file_name=util.get_env_path(self.name), worker_id=worker_id)
        # spaces for NN auto input/output inference
        logger.warn('Unity environment observation_space and action_space are constructed with invalid range. Use only their shapes.')
        self.observation_spaces = []
        self.action_spaces = []
        for a in range(len(self.u_env.brain_names)):
            observation_shape = (self.get_observable_dim(a)['state'],)
            if self.get_brain(a).state_space_type == 'discrete':
                observation_space = gym.spaces.Box(low=0, high=1, shape=observation_shape, dtype=np.int32)
            else:
                observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=observation_shape, dtype=np.float32)
            self.observation_spaces.append(observation_space)
            if self.is_discrete(a):
                action_space = gym.spaces.Discrete(self.get_action_dim(a))
            else:
                action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
            self.action_spaces.append(action_space)
        for observation_space, action_space in zip(self.observation_spaces, self.action_spaces):
            set_gym_space_attr(observation_space)
            set_gym_space_attr(action_space)

        # TODO experiment to find out optimal benchmarking max_timestep, set
        # TODO ensure clock_speed from env_spec
        self.clock_speed = 1
        self.clock = Clock(self.clock_speed)
        self.done = False

    def check_u_brain_to_agent(self):
        '''Check the size match between unity brain and agent'''
        u_brain_num = self.u_env.number_brains
        agent_num = len(self.body_e)
        assert u_brain_num == agent_num, f'There must be a Unity brain for each agent. e:{self.e}, brain: {u_brain_num} != agent: {agent_num}.'

    def check_u_agent_to_body(self, env_info_a, a):
        '''Check the size match between unity agent and body'''
        u_agent_num = len(env_info_a.agents)
        body_num = util.count_nonan(self.body_e[a])
        assert u_agent_num == body_num, f'There must be a Unity agent for each body; a:{a}, e:{self.e}, agent_num: {u_agent_num} != body_num: {body_num}.'

    def get_brain(self, a):
        '''Get the unity-equivalent of agent, i.e. brain, to access its info'''
        name_a = self.u_env.brain_names[a]
        brain_a = self.u_env.brains[name_a]
        return brain_a

    def get_env_info(self, env_info_dict, a):
        name_a = self.u_env.brain_names[a]
        env_info_a = env_info_dict[name_a]
        return env_info_a

    @lab_api
    def post_body_init(self):
        '''Run init for components that need bodies to exist first, e.g. memory or architecture.'''
        self.nanflat_body_e = util.nanflatten(self.body_e)
        for idx, body in enumerate(self.nanflat_body_e):
            body.nanflat_e_idx = idx
        self.body_num = len(self.nanflat_body_e)
        self.check_u_brain_to_agent()
        logger.info(util.self_desc(self))

    def is_discrete(self, a):
        '''Check if an agent (brain) is subject to discrete actions'''
        return self.get_brain(a).is_discrete()

    def get_action_dim(self, a):
        '''Get the action dim for an agent (brain) in env'''
        return self.get_brain(a).get_action_dim()

    def get_action_space(self, a):
        return self.action_spaces[a]

    def get_observable_dim(self, a):
        '''Get the observable dim for an agent (brain) in env'''
        return self.get_brain(a).get_observable_dim()

    def get_observable_types(self, a):
        '''Get the observable for an agent (brain) in env'''
        return self.get_brain(a).get_observable_types()

    def get_observation_space(self, a):
        return self.observation_spaces[a]

    @lab_api
    def reset(self):
        self.done = False
        env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('unity'))
        _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
        for (a, b), body in util.ndenumerate_nonan(self.body_e):
            env_info_a = self.get_env_info(env_info_dict, a)
            self.check_u_agent_to_body(env_info_a, a)
            state = env_info_a.states[b]
            state_e[(a, b)] = state
            done_e[(a, b)] = self.done
        return _reward_e, state_e, done_e

    @lab_api
    def step(self, action_e):
        # TODO implement clock_speed: step only if self.clock.to_step()
        if self.done:
            return self.reset()
        action_e = util.nanflatten(action_e)
        env_info_dict = self.u_env.step(action_e)
        reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
        for (a, b), body in util.ndenumerate_nonan(self.body_e):
            env_info_a = self.get_env_info(env_info_dict, a)
            reward_e[(a, b)] = env_info_a.rewards[b]
            state_e[(a, b)] = env_info_a.states[b]
            done_e[(a, b)] = env_info_a.local_done[b]
        self.done = (util.nonan_all(done_e) or self.clock.get('t') > self.max_timestep)
        return reward_e, state_e, done_e

    @lab_api
    def close(self):
        self.u_env.close()
Example #17
0
class UnityEnv(BaseEnv):
    r"""
    Basic Unity ML Agent environment.

    config example:
    "env": {
        "name": "Reacher",
        "type": "unity",
        "seed": 0,
        "to_render": True,
        "frame_sleep": 0.001,
        "max_steps": 1000,
        "one_hot": None,
        "action_bins": None,
        "reward_scale": None,
        "num_envs": None,
    }
    """

    def __init__(self, config):
        super(UnityEnv, self).__init__(config)

        self._env = UnityEnvironment(file_name=get_env_path(self.name), seed=self.seed)
        self.patch_gym_spaces(self._env)
        self._set_attr_from_u_env(self._env)

        # TODO: Logging
        print(utils.describe(self))

    def reset(self):
        self.done = False
        info_dict = self._env.reset(train_mode=self.to_render)
        env_info = self._get_env_info(info_dict, 0)
        state = env_info.vector_observations[0]
        return state

    def step(self, action):
        info_dict = self._env.step(action)
        env_info = self._get_env_info(info_dict, 0)
        state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        return state, reward, done, env_info

    def render(self):
        pass

    def close(self):
        self._env.close()

    def _get_brain(self, env, brain_index):
        r"""
        Get the unity-equivalent of agent, i.e. brain, to access its info
        :param env:
        :param brain_index:
        :return:
        """
        brain_name = env.brain_names[brain_index]
        brain = env.brains[brain_name]
        return brain

    def patch_gym_spaces(self, env):
        r"""
        For standardization, use gym spaces to represent observation and action spaces for Unity.
        This method iterates through the multiple brains (multiagent) then constructs and returns lists of observation_spaces and action_spaces
        :param env:
        :return:
        """

        observation_spaces = []
        action_spaces = []
        for brain_index in range(len(env.brain_names)):
            brain = self._get_brain(env, brain_index)

            # TODO: Logging
            utils.describe(brain)

            observation_shape = (brain.get_observable_dim()['state'],)
            action_dim = (brain.get_action_dim(),)

            if brain.is_discrete():
                dtype = np.int32
                action_space = spaces.Discrete(brain.get_action_dim())
            else:
                dtype = np.float32
                action_space = spaces.Box(low=0.0, high=1.0, shape=action_dim, dtype=dtype)

            observation_space = spaces.Box(low=0, high=1, shape=observation_shape, dtype=dtype)
            utils.set_gym_space_attr(observation_space)
            utils.set_gym_space_attr(action_space)
            observation_spaces.append(observation_space)
            action_spaces.append(action_space)

        # set for singleton
        env.observation_space = observation_spaces[0]
        env.action_space = action_spaces[0]

        return observation_spaces, action_spaces

    def _get_env_info(self, env_info_dict, index):
        r"""
        Unity API returns a env_info_dict. Use this method to pull brain(env)-specific
        :param env_info_dict:
        :param index:
        :return:
        """
        brain_name = self._env.brain_names[index]
        env_info = env_info_dict[brain_name]
        return env_info
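
# A minimal usage sketch for the class above, reusing the config example from
# its docstring. It assumes the surrounding project supplies BaseEnv,
# get_env_path and utils, and that a 4-dimensional continuous action (as in
# Reacher) is appropriate; the loop itself only relies on the reset/step/close
# methods defined above.
config = {
    "name": "Reacher",
    "type": "unity",
    "seed": 0,
    "to_render": True,
    "frame_sleep": 0.001,
    "max_steps": 1000,
    "one_hot": None,
    "action_bins": None,
    "reward_scale": None,
    "num_envs": None,
}

env = UnityEnv(config)
state = env.reset()
for _ in range(config["max_steps"]):
    action = [0.0, 0.0, 0.0, 0.0]  # placeholder action vector
    state, reward, done, env_info = env.step(action)
    if done:
        break
env.close()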
Example #18
0
class ExperienceManager:
    def __init__(self):
        #define the params for later usage
        self.env = None
        self.brain_name = None
        self.agent = None

    #initialize environment and set the state space size and action space size
    def initEnviroment(self):
        print('Initialize env')
        #initialize Unity env
        #update BANANA_INSTALLATION to point at your local environment binary
        self.env = UnityEnvironment(file_name=BANANA_INSTALLATION)
        #get the default brain
        self.brain_name = self.env.brain_names[0]
        #reset the environment
        env_info = self.env.reset(train_mode=TRAIN_MODE)[self.brain_name]
        #get size of action and state
        self.action_size = self.env.brains[
            self.brain_name].vector_action_space_size
        self.state_size = len(env_info.vector_observations[0])
        #initiate Agent
        self.agent = Agent(state_size=self.state_size,
                           action_size=self.action_size)
        print('Env init done')

    #run one episode and return total reward
    def runEpisode(self):
        #init score to 0 and reset env
        score = 0
        state = self.env.reset(
            train_mode=TRAIN_MODE)[self.brain_name].vector_observations[0]
        while True:
            #get greedy action
            action = self.agent.greedy_action(state)
            #perform action
            env_info = self.env.step(action)[self.brain_name]
            #get the step result
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]  # see if episode has finished
            #store step result and perform learning
            self.agent.step(state, action, reward, next_state, done)
            #update state and score
            state = next_state
            score += reward
            if done:
                #finished
                break
        #return the score of whole episode
        return score

    #run the whole experiment = the defined number of episodes
    def runEperiment(self, n_episodes=EPISODES_NUM):
        #init environment
        self.initEnviroment()
        scores = []
        scores_window = deque(maxlen=100)  # last 100 scores
        for i_episode in range(1, n_episodes + 1):
            #run one episode
            score = self.runEpisode()
            #store the score of episode
            scores_window.append(score)
            scores.append(score)
            #print progress
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            #print the running average every 100 episodes
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))
        return scores
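
# Minimal usage sketch, assuming the module-level constants referenced above
# (BANANA_INSTALLATION, TRAIN_MODE, EPISODES_NUM), the Agent class and numpy
# (as np) are already defined/imported in this file. runEperiment() initialises
# the environment itself, so a full run reduces to:
if __name__ == '__main__':
    manager = ExperienceManager()
    scores = manager.runEperiment()
    print('\nAverage score over the last 100 episodes:', np.mean(scores[-100:]))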
Example #19
0
class TrainerController(object):
    def __init__(self, env_path, run_id, save_freq, curriculum_file,
                 fast_simulation, load, train, worker_id, keep_checkpoints,
                 lesson, seed, docker_target_name, trainer_config_path,
                 use_data_gatherer):
        """

        :param env_path: Location to the environment executable to be loaded.
        :param run_id: The sub-directory name for model and summary statistics
        :param save_freq: Frequency at which to save model
        :param curriculum_file: Curriculum json file for environment
        :param fast_simulation: Whether to run the game at training speed
        :param load: Whether to load the model or randomly initialize
        :param train: Whether to train model, or only run inference
        :param worker_id: Number to add to communication port (5005). Used for multi-environment
        :param keep_checkpoints: How many model checkpoints to keep
        :param lesson: Start learning from this lesson
        :param seed: Random seed used for training.
        :param docker_target_name: Name of docker volume that will contain all data.
        :param trainer_config_path: Fully qualified path to location of trainer configuration file
        :param use_data_gatherer: Whether to run in data-gathering mode (enabled via the '--data-gatherer' flag) instead of normal training
        """
        ''' Here's a small change (this only happens if code is launched with the '--data-gatherer' flag) '''
        self.use_data_gatherer = use_data_gatherer

        self.trainer_config_path = trainer_config_path
        env_path = (env_path.strip().replace('.app', '').replace(
            '.exe', '').replace('.x86_64', '').replace('.x86', '')
                    )  # Strip out executable extensions if passed
        # Recognize and use docker volume if one is passed as an argument
        if docker_target_name == '':
            self.docker_training = False
            self.model_path = './models/{run_id}'.format(run_id=run_id)
            self.curriculum_file = curriculum_file
            self.summaries_dir = './summaries'
        else:
            self.docker_training = True
            self.model_path = '/{docker_target_name}/models/{run_id}'.format(
                docker_target_name=docker_target_name, run_id=run_id)
            env_path = '/{docker_target_name}/{env_name}'.format(
                docker_target_name=docker_target_name, env_name=env_path)
            if curriculum_file is None:
                self.curriculum_file = None
            else:
                self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format(
                    docker_target_name=docker_target_name,
                    curriculum_file=curriculum_file)
            self.summaries_dir = '/{docker_target_name}/summaries'.format(
                docker_target_name=docker_target_name)
        self.logger = logging.getLogger("unityagents")
        self.run_id = run_id
        self.save_freq = save_freq
        self.lesson = lesson
        self.fast_simulation = fast_simulation
        self.load_model = load
        self.train_model = train
        self.worker_id = worker_id
        self.keep_checkpoints = keep_checkpoints
        self.trainers = {}
        if seed == -1:
            seed = np.random.randint(0, 999999)
        self.seed = seed
        np.random.seed(self.seed)
        tf.set_random_seed(self.seed)
        self.env = UnityEnvironment(file_name=env_path,
                                    worker_id=self.worker_id,
                                    curriculum=self.curriculum_file,
                                    seed=self.seed,
                                    docker_training=self.docker_training)
        self.env_name = os.path.basename(
            os.path.normpath(env_path))  # Extract out name of environment

    def _get_progress(self):
        if self.curriculum_file is not None:
            progress = 0
            if self.env.curriculum.measure_type == "progress":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[
                        brain_name].get_step / self.trainers[
                            brain_name].get_max_steps
                return progress / len(self.env.external_brain_names)
            elif self.env.curriculum.measure_type == "reward":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_last_reward
                return progress
            else:
                return None
        else:
            return None

    def _process_graph(self):
        nodes = []
        scopes = []
        for brain_name in self.trainers.keys():
            if self.trainers[brain_name].graph_scope is not None:
                scope = self.trainers[brain_name].graph_scope + '/'
                if scope == '/':
                    scope = ''
                scopes += [scope]
                if self.trainers[brain_name].parameters[
                        "trainer"] == "imitation":
                    nodes += [scope + x for x in ["action"]]
                elif not self.trainers[brain_name].parameters["use_recurrent"]:
                    nodes += [
                        scope + x
                        for x in ["action", "value_estimate", "action_probs"]
                    ]
                else:
                    node_list = [
                        "action", "value_estimate", "action_probs",
                        "recurrent_out", "memory_size"
                    ]
                    nodes += [scope + x for x in node_list]
        if len(scopes) > 1:
            self.logger.info("List of available scopes :")
            for scope in scopes:
                self.logger.info("\t" + scope)
        self.logger.info("List of nodes to export :")
        for n in nodes:
            self.logger.info("\t" + n)
        return nodes

    def _save_model(self, sess, saver, steps=0):
        """
        Saves current model to checkpoint folder.
        :param sess: Current Tensorflow session.
        :param steps: Current number of steps in training process.
        :param saver: Tensorflow saver for session.
        """
        last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk'
        saver.save(sess, last_checkpoint)
        tf.train.write_graph(sess.graph_def,
                             self.model_path,
                             'raw_graph_def.pb',
                             as_text=False)
        self.logger.info("Saved Model")

    def _export_graph(self):
        """
        Exports latest saved model to .bytes format for Unity embedding.
        """
        target_nodes = ','.join(self._process_graph())
        ckpt = tf.train.get_checkpoint_state(self.model_path)

        freeze_graph.freeze_graph(
            input_graph=self.model_path + '/raw_graph_def.pb',
            input_binary=True,
            input_checkpoint=ckpt.model_checkpoint_path,
            output_node_names=target_nodes,
            ######## FOLLOWING LINE UGLY FIX: only use first 20 characters of run_id ######
            output_graph=self.model_path + '/' + self.env_name + "_" +
            self.run_id[:20] + '.bytes',
            clear_devices=True,
            initializer_nodes="",
            input_saver="",
            restore_op_name="save/restore_all",
            filename_tensor_name="save/Const:0")

    def _initialize_trainers(self, trainer_config, sess):
        trainer_parameters_dict = {}
        self.trainers = {}
        for brain_name in self.env.external_brain_names:
            trainer_parameters = trainer_config['default'].copy()
            if len(self.env.external_brain_names) > 1:
                graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
                trainer_parameters['graph_scope'] = graph_scope
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id) + '_' + graph_scope)
            else:
                trainer_parameters['graph_scope'] = ''
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir, name=str(self.run_id))
            if brain_name in trainer_config:
                _brain_key = brain_name
                while not isinstance(trainer_config[_brain_key], dict):
                    _brain_key = trainer_config[_brain_key]
                for k in trainer_config[_brain_key]:
                    trainer_parameters[k] = trainer_config[_brain_key][k]
            trainer_parameters_dict[brain_name] = trainer_parameters.copy()
        for brain_name in self.env.external_brain_names:
            if trainer_parameters_dict[brain_name]['trainer'] == "imitation":
                self.trainers[brain_name] = BehavioralCloningTrainer(
                    sess, self.env, brain_name,
                    trainer_parameters_dict[brain_name], self.train_model,
                    self.seed)
            elif trainer_parameters_dict[brain_name]['trainer'] == "ppo":
                self.trainers[brain_name] = PPOTrainer(
                    sess, self.env, brain_name,
                    trainer_parameters_dict[brain_name], self.train_model,
                    self.seed, self.use_data_gatherer)
            else:
                raise UnityEnvironmentException(
                    "The trainer config contains an unknown trainer type for brain {}"
                    .format(brain_name))

    def _load_config(self):
        try:
            with open(self.trainer_config_path) as data_file:
                trainer_config = yaml.load(data_file)
                return trainer_config
        except IOError:
            raise UnityEnvironmentException(
                """Parameter file could not be found here {}.
                                            Will use default Hyper parameters"""
                .format(self.trainer_config_path))
        except UnicodeDecodeError:
            raise UnityEnvironmentException(
                "There was an error decoding Trainer Config from this path : {}"
                .format(self.trainer_config_path))

    @staticmethod
    def _create_model_path(model_path):
        try:
            if not os.path.exists(model_path):
                os.makedirs(model_path)
        except Exception:
            raise UnityEnvironmentException(
                "The folder {} containing the generated model could not be accessed."
                " Please make sure the permissions are set correctly.".format(
                    model_path))

    def start_learning(self):
        self.env.curriculum.set_lesson_number(self.lesson)
        trainer_config = self._load_config()
        self._create_model_path(self.model_path)

        tf.reset_default_graph()
        with tf.Session() as sess:
            self._initialize_trainers(trainer_config, sess)
            for k, t in self.trainers.items():
                self.logger.info(t)
            init = tf.global_variables_initializer()
            saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
            # Instantiate model parameters
            if self.load_model:
                self.logger.info('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(self.model_path)
                if ckpt is None:
                    self.logger.info(
                        'The model {0} could not be found. Make sure you specified the right '
                        '--run-id'.format(self.model_path))
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(init)
            global_step = 0  # This is only for saving the model
            self.env.curriculum.increment_lesson(self._get_progress())
            curr_info = self.env.reset(train_mode=self.fast_simulation)
            if self.train_model:
                for brain_name, trainer in self.trainers.items():
                    trainer.write_tensorboard_text('Hyperparameters',
                                                   trainer.parameters)
            try:
                while any([
                        t.get_step <= t.get_max_steps
                        for k, t in self.trainers.items()
                ]) or not self.train_model:
                    if debug_print:
                        print("|", end='', flush=True)
                    if self.env.global_done:
                        self.env.curriculum.increment_lesson(
                            self._get_progress())
                        curr_info = self.env.reset(
                            train_mode=self.fast_simulation)
                        for brain_name, trainer in self.trainers.items():
                            trainer.end_episode()
                    if data_gatherer['reset_after_each_frame']:
                        curr_info = self.env.reset(
                            train_mode=self.fast_simulation)

                    # Decide and take an action
                    take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {}
                    for brain_name, trainer in self.trainers.items():
                        (take_action_vector[brain_name],
                         take_action_memories[brain_name],
                         take_action_text[brain_name],
                         take_action_outputs[brain_name]
                         ) = trainer.take_action(curr_info)
                    new_info = self.env.step(vector_action=take_action_vector,
                                             memory=take_action_memories,
                                             text_action=take_action_text)
                    ''' ----- '''
                    ''' Enabling data gathering disables the normal functionality.... '''
                    if self.use_data_gatherer:
                        if data_gatherer['firstRun']:
                            print("---")
                            print("NORMAL FUNCTIONALITY DISABLED!")
                            print(
                                "Now we just sample stats from the initial distribution and save them:"
                            )
                            print("Save dir: {}".format(data_gatherer['dir']))
                            print("---")
                            print(
                                "If you did not expect to see this, NOW is the time to [ctrl-C]! (otherwise: [enter] to continue...)"
                            )
                            ''' Create the folder-structure if it is needed: '''
                            paths = [
                                settings['dir_base'],
                                settings['dir_base'] + settings['project'],
                                data_gatherer['dir']
                            ]
                            for p in paths:
                                if not os.path.isdir(p):
                                    os.makedirs(p)
                                    print("Created path: {}".format(p))
                                else:
                                    print("Reusing existing: {}".format(p))
                            ape = input()
                            data_gatherer['firstRun'] = False

                        #if data_gatherer['reset_after_each_frame']:
                        #    curr_info = self.env.reset(train_mode=self.fast_simulation)
                        #    take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {}
                        #    for brain_name, trainer in self.trainers.items():
                        #        (take_action_vector[brain_name],
                        #        take_action_memories[brain_name],
                        #        take_action_text[brain_name],
                        #        take_action_outputs[brain_name]) = trainer.take_action(curr_info)
                        #    new_info = self.env.step(vector_action=take_action_vector, memory=take_action_memories,
                        #                         text_action=take_action_text)

                        is_done = False
                        for x in new_info:
                            for l in range(len(new_info[x].agents)):
                                is_done = is_done or new_info[x].local_done[l]

                        if data_gatherer['idx'] == data_gatherer[
                                'n'] or is_done:
                            #WRITE_TO_FILE....
                            print("Saving chunk {}... ({} samples)".format(
                                data_gatherer['n_chunks'],
                                data_gatherer['idx']))
                            with open(
                                    data_gatherer['dir'] +
                                    data_gatherer['file_base'] +
                                    "chunk{}.pkl".format(
                                        str(data_gatherer['n_chunks']).zfill(
                                            5)), 'wb') as outfile:
                                pickle.dump(
                                    data_gatherer['data']
                                    [:data_gatherer['idx'], :, :, :].reshape(
                                        (-1, ) + data_gatherer['obs_size']),
                                    outfile, pickle.HIGHEST_PROTOCOL)
                            #Prep next:
                            data_gatherer['n_chunks'] += 1
                            data_gatherer['data'] = np.empty(
                                data_gatherer['size'], dtype=np.uint8)
                            data_gatherer['idx'] = 0

                            if data_gatherer['n_chunks'] == 1500:
                                print("Total samples gathered: {}".format(
                                    (data_gatherer['n_chunks'] - 1000) * 1000))
                                exit()
                        data_gatherer['data'][
                            data_gatherer['idx'], :, :, :] = (
                                255 *
                                new_info["PepperBrain"].visual_observations[0]
                            ).astype(np.uint8)
                        data_gatherer['idx'] += 1

                        if data_gatherer['reset_after_each_frame']:
                            continue
                    ''' ----- '''

                    if settings['store_as_int']:
                        for key in new_info:
                            for x in range(
                                    len(new_info[key].visual_observations)):
                                new_info[key].visual_observations[x] = (
                                    255 * new_info[key].visual_observations[x]
                                ).astype(np.uint8)

                    for brain_name, trainer in self.trainers.items():
                        if debug_print:
                            print(".", end='', flush=True)
                        trainer.add_experiences(
                            curr_info, new_info,
                            take_action_outputs[brain_name])
                        trainer.process_experiences(curr_info, new_info)
                        if trainer.is_ready_update(
                        ) and self.train_model and trainer.get_step <= trainer.get_max_steps:
                            if debug_print:
                                print("!", end='', flush=True)
                            # Perform gradient descent with experience buffer
                            print("Updating model... ", end='', flush=True)
                            t = time.time()
                            trainer.update_model()
                            print("[x] Done in {} seconds.".format(
                                time.time()))
                        # Write training statistics to Tensorboard.
                        if debug_print:
                            print(",", end='', flush=True)
                        trainer.write_summary(
                            self.env.curriculum.lesson_number)
                        if self.train_model and trainer.get_step <= trainer.get_max_steps:
                            if debug_print:
                                print("?", end='', flush=True)
                            trainer.increment_step()
                            trainer.update_last_reward()
                    if self.train_model and trainer.get_step <= trainer.get_max_steps:
                        global_step += 1
                    if global_step % self.save_freq == 0 and global_step != 0 and self.train_model:
                        if debug_print:
                            print("x", end='', flush=True)
                        # Save Tensorflow model
                        self._save_model(sess, steps=global_step, saver=saver)
                    curr_info = new_info
                # Final save Tensorflow model
                if global_step != 0 and self.train_model:
                    self._save_model(sess, steps=global_step, saver=saver)
            except KeyboardInterrupt:
                if self.train_model:
                    self.logger.info(
                        "Learning was interrupted. Please wait while the graph is generated."
                    )
                    self._save_model(sess, steps=global_step, saver=saver)
                pass
        self.env.close()
        if self.train_model:
            self._export_graph()
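
# Illustrative wiring for the controller above; every argument value here is a
# placeholder (a real entry point would normally parse them from the command
# line), and module-level globals used inside start_learning() (debug_print,
# settings, data_gatherer) are assumed to be configured elsewhere in the file.
tc = TrainerController(env_path='envs/Reacher',
                       run_id='run-0',
                       save_freq=50000,
                       curriculum_file=None,
                       fast_simulation=True,
                       load=False,
                       train=True,
                       worker_id=0,
                       keep_checkpoints=5,
                       lesson=0,
                       seed=-1,
                       docker_target_name='',
                       trainer_config_path='trainer_config.yaml',
                       use_data_gatherer=False)
tc.start_learning()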
Example #20
0
num_epochs = 300

batch_size = 128
model.fit(states_train,
          actions_train,
          validation_split=0.1,
          batch_size=batch_size,
          epochs=num_epochs,
          shuffle=True)
print()
print(model.metrics_names)
print("Test error:", model.evaluate(states_test, actions_test))

env = UnityEnvironment(file_name="drone_sim_external", worker_id=0)
num_dagger_iterations = 10
steps = 1000
for iterations in range(num_dagger_iterations):
    done = False
    env.reset(train_mode=False)
    states = np.zeros((1, 13))
    threshold = 10
    for i in range(steps):
        action = model.predict(states)
        action = np.hstack((action[0], 0))
        brainInf = env.step(action)['DroneBrain']
        states = brainInf.states
        norm = np.linalg.norm(states[0][3:6] - states[0][9:12])

        #TODO: FIGURE OUT HOW TO GENERATE LABELED ACTIONS
model.save("trained_model.h5")
Example #21
0
class UnityEnvHelper:

    # constructor - give file_name of agent environment

    def __init__(self, file_name, no_graphics=True, seed=8888):

        self.seed = seed
        self.uenv = UnityEnvironment(file_name=file_name,
                                     seed=self.seed,
                                     no_graphics=no_graphics)

        # pick the first brain (there is only one in this environment)

        self.brain_name = self.uenv.brain_names[0]
        self.brain = self.uenv.brains[self.brain_name]

        # get the action space size

        self.action_size = self.brain.vector_action_space_size

        # reset the environment , in training mode

        self.reset(True)

        # get the state space size
        self.state_size = len(self.ue_info.vector_observations[0])

    def __del__(self):

        # make sure we close the environment
        try:
            self.uenv.close()
            del self.uenv
        except:
            pass

    def reset(self, train_mode=True):

        # tell the unity agent to restart an episode
        # training mode simply seems to run the simulation at full speed
        self.ue_info = self.uenv.reset(train_mode=train_mode)[self.brain_name]

    # we pass in current state for convenience
    def step(self, state, action):

        # perform action on environment  and get observation
        self.ue_info = self.uenv.step(action)[self.brain_name]
        # return state, action, reward, next state and done flag
        return {
            'state': state,
            'action': action,
            'reward': self.reward(),
            'next_state': self.state(),
            'done': self.done()
        }

    def state(self):
        # just last observation state
        return self.ue_info.vector_observations[0]

    def reward(self):
        # return reward from last observation
        return self.ue_info.rewards[0]

    def done(self):
        # return done flag
        return self.ue_info.local_done[0]
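
# Quick usage sketch for UnityEnvHelper. The file name and the assumption of a
# discrete action space (as in the Banana environment) are illustrative only.
import numpy as np

helper = UnityEnvHelper('Banana.x86_64', no_graphics=True)
helper.reset(train_mode=True)
transition = helper.step(helper.state(), np.random.randint(helper.action_size))
print(transition['reward'], transition['done'])
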
class Environment():
    """
    This is a wrapper class for a Unity environment

    The Unity environment is wrapped such that the API
    is similar to a Gym environment.

    Using this class, DQN algorithms written for Gym environments
    can be re-used with minimal changes.
    """
    def __init__(self,
                 filename_path,
                 worker_id=0,
                 train_mode=True,
                 no_graphics=False,
                 seed=0):
        # Create new environment

        # Create Unity environment
        self._env = UnityEnvironment(file_name=filename_path, \
                                    worker_id=worker_id,\
                                    no_graphics=no_graphics, \
                                    seed=seed)

        # get the default brain
        self._brain_name = self._env.brain_names[0]
        self._brain = self._env.brains[self._brain_name]

        # set the initial state
        self.train_mode = train_mode
        self._env_info = self._env.reset(
            train_mode=train_mode)[self._brain_name]
        self._state = self._env_info.vector_observations[0]

        # define state_size and action_size
        self.state_size = len(self._state)
        self.action_size = self._brain.vector_action_space_size

    def reset(self):
        # reset the environment
        self._env_info = self._env.reset(
            train_mode=self.train_mode)[self._brain_name]
        self._state = self._env_info.vector_observations[0]

        # return the state vector
        return self._state

    def step(self, action):
        # send the action to the environment
        self._env_info = self._env.step(action)[self._brain_name]
        # get the next state
        next_state = self._env_info.vector_observations[0]
        # get the reward
        reward = self._env_info.rewards[0]
        # check if terminal state is reached
        done = self._env_info.local_done[0]
        # create dummy value to keep API compatible
        dummy = 0

        # return the next_state vector, the reward,
        # and whether the terminal state was reached
        return next_state, reward, done, dummy

    def close(self):
        self._env.close()
        pass
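
# Because the wrapper above mirrors the Gym API, the standard interaction loop
# carries over unchanged. The executable path and the random discrete action
# below are placeholders for illustration.
import numpy as np

env = Environment('Banana.x86_64', no_graphics=True)
state = env.reset()
done = False
while not done:
    action = np.random.randint(env.action_size)  # assumes a discrete action space
    state, reward, done, _ = env.step(action)
env.close()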
Example #23
0
    os.makedirs(summary_path)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    # Instantiate model parameters
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(init)
    steps = sess.run(ppo_model.global_step)
    summary_writer = tf.summary.FileWriter(summary_path)
    info = env.reset(train_mode=train_model)[brain_name]
    trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations)
    while steps <= max_steps or not train_model:
        if env.global_done:
            info = env.reset(train_mode=train_model)[brain_name]
        # Decide and take an action
        new_info = trainer.take_action(info, env, brain_name)
        info = new_info
        trainer.process_experiences(info, time_horizon, gamma, lambd)
        if len(trainer.training_buffer['actions']) > buffer_size and train_model:
            # Perform gradient descent with experience buffer
            trainer.update_model(batch_size, num_epoch)
        if steps % summary_freq == 0 and steps != 0 and train_model:
            # Write training statistics to tensorboard.
            trainer.write_summary(summary_writer, steps)
        if steps % save_freq == 0 and steps != 0 and train_model:
            if avg_score >= TARGET_SCORE:
                torch.save(agents.actor_local.state_dict(),
                           "ckpt/{}".format(ACTOR_CHECKPOINT_NAME))
                torch.save(agents.critic_local.state_dict(),
                           "ckpt/{}".format(CRITIC_CHECKPOINT_NAME))
                break

    return scores


env = UnityEnvironment(file_name=ENV_PATH, no_graphics=GRAPHICS_OFF)
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]

print('Number of agents: {}'.format(num_agents))
print('Number of actions: {}'.format(action_size))
print('Number of states: {}'.format(state_size))

print('First state: {}'.format(states[0]))

if torch.cuda.is_available():
    print("trainining on GPU")
else:
    print("training on CPU")
Example #25
0
# env_1.reset(train_mode=False)
# env_2.reset(train_mode=False)

env_path = util.get_env_path('gridworld')
# use train_mode = False to debug, i.e. render env at real size, real time
train_mode = False

# UnityEnvironment interfaces python with Unity,
# and contains brains for controlling connected agents.
env = UnityEnvironment(file_name=env_path)
print(str(env))

# get the default brain
default_brain = env.brain_names[0]
brain = env.brains[default_brain]
env_info = env.reset(train_mode=train_mode)[default_brain]
'''
is_continuous = (brain.action_space_type == 'continuous')
use_observations = (brain.number_observations > 0)
use_states = (brain.state_space_size > 0)

- reset env with param, returns dict of {brain: BrainInfo}
env.reset(train_mode=train_mode)
env_info = env.reset(train_mode=train_mode)[default_brain]

- list of 4D np arrays. nth element = nth observation (pixel-wise) of the brain
env_info.observations
- 2D np array of (batch_size, state_size) for cont and discrete
env_info.states.shape

- 2D np array of (batch_size, memory_size) which corresponds to
                      critic_layer_dim_1=args['critic_layer_dim_1'],
                      critic_layer_dim_2=args['critic_layer_dim_2'],
                      critic_layer_dim_3=args['critic_layer_dim_3'])
    return agent


projects = [
    "01Run", "02Run", "03Run", "04Run", "05Run", "06Run", "07Run", "08Run",
    "09Run", "10Run"
]

dfs_args = []

agents = []

unity_environment_path = "./Tennis_Linux/Tennis.x86_64"
env = UnityEnvironment(file_name=unity_environment_path)
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=False)[brain_name]
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]

agent_2 = loadagent_chkpt("04Run", 9)
agent_1 = loadagent_chkpt("06Run", 4)

game = TourDDPG(agent_1, agent_2)
result = play(env, game, 100)

print(result)
Example #27
0
def evaluate(agent_dir: Path,
             number_of_episodes: int = 1000,
             maximum_timesteps: int = 1000,
             environment_path: str = DEFAULT_ENVIRONMENT_EXECUTABLE_PATH):
    """Evaluate an agent on some episodes. Note that the agent is not trained during the evaluation and the
    exploration is set to 0. Thus the results really reflect the final performance of the agent."""
    agent_path = agent_dir / 'checkpoint.pth'
    if not agent_path.exists():
        logging.warning(f'No saved parameters found for agent in {agent_dir}.')
        return
    hist_path = agent_dir / 'evaluation_histogram.png'
    scores_path = agent_dir / 'scores_evaluation.csv'

    env = UnityEnvironment(file_name=environment_path, no_graphics=True)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=True)[brain_name]
    state_size = len(env_info.vector_observations[0])

    agent = DqnAgent(state_size=state_size,
                     action_size=action_size,
                     device=DEVICE)
    agent.load(agent_path)

    scores = []

    for _ in tqdm(list(range(1, number_of_episodes + 1))):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for t in range(maximum_timesteps):
            action = agent.act(state, epsilon=0.0)

            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]

            state = next_state
            score += reward
            if done:
                break
        scores.append(score)

    scores_ts = pd.Series(scores)

    plt.hist(scores, bins=100, color='steelblue')
    ylims = plt.ylim()
    med = scores_ts.median()
    plt.vlines(med,
               *ylims,
               linewidth=2,
               linestyle='--',
               color='orange',
               label=f'median: {med}')
    plt.legend()

    plt.savefig(hist_path)
    scores_ts.to_csv(scores_path, index=False)
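
# Example invocation of evaluate(); the agent directory is a placeholder, and
# DEFAULT_ENVIRONMENT_EXECUTABLE_PATH is assumed to be defined alongside the
# function, as referenced in its signature.
from pathlib import Path

if __name__ == '__main__':
    evaluate(Path('runs/dqn_agent'), number_of_episodes=100)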
Example #28
0
from unityagents import UnityEnvironment

env_name = "env19"  # Name of the Unity environment binary to launch
train_mode = True  # Whether to run the environment in training or inference mode

env = UnityEnvironment(file_name=env_name)

# Examine environment parameters
print(str(env))

# Set the default brain to work with
default_brain = env.brain_names[0]
brain = env.brains[default_brain]

# Reset the environment
env_info = env.reset(train_mode=train_mode)[default_brain]

# Examine the state space for the default brain
print("Agent state looks like: \n{}".format(env_info.states[0]))

# Examine the observation space for the default brain
# for observation in env_info.observations:
#     print("Agent observations look like:")
#     if observation.shape[3] == 3:
#         plt.imshow(observation[0,:,:,:])
#     else:
#         plt.imshow(observation[0,:,:,0])

for episode in range(100):
    env_info = env.reset(train_mode=train_mode)[default_brain]
    done = False
Example #29
0
class UnityEnv(gym.Env):
    def __init__(self, app_name=None, idx=0):
        # parameter
        app_path = os.path.join(os.path.dirname(__file__), 'assets', app_name)
        idx = idx
        no_graphics = False
        self.num_envs = 1

        # create environment
        self._env = UnityEnvironment(file_name=app_path,
                                     worker_id=idx,
                                     no_graphics=no_graphics)
        self.name = app_name

        # Only Accept Environment with Only One Brain
        assert len(self._env.brains) == 1
        self.brain_name = self._env.external_brain_names[0]
        self.brain = self._env.brains[self.brain_name]

        # visualization
        self.use_visual = (self.brain.number_visual_observations == 1)

        # action space dimension
        if self.brain.vector_action_space_type == "discrete":
            self._a_dim = Discrete(1)
        else:
            high = np.array([np.inf] * (self.brain.vector_action_space_size))
            self._a_dim = Box(-high, high)

        # observation space dimension
        if self.use_visual and False and no_graphics:
            high = np.array([np.inf] *
                            self.brain.camera_resolutions[0]["height"] *
                            self.brain.camera_resolutions[0]["width"] * 3)
            self._ob_dim = Box(-high, high)
        else:
            high = np.array([np.inf] *
                            self.brain.vector_observation_space_size)
            self._ob_dim = Box(-high, high)

        # video buffer
        self.frames = []

    def reset(self):
        self.frames = []
        info = self._env.reset()[self.brain_name]
        state = info.vector_observations[0]
        return np.array([state])

    def step(self, action):
        info = self._env.step([action])[self.brain_name]

        state = info.vector_observations[0]
        reward = info.rewards[0]
        done = info.local_done[0]

        self._collect_frames(info.visual_observations[0])
        return np.array([state]), np.array([reward]), np.array([done]), np.array([None])

    def close(self):
        self._env.close()

    def _collect_frames(self, frame):
        if self.use_visual:
            self.frames.append(frame)

    @property
    def action_space(self):
        return self._a_dim

    @property
    def observation_space(self):
        return self._ob_dim
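
# Illustrative rollout with the gym-style wrapper above. The app name is a
# placeholder, and the zero action vector assumes a continuous brain whose
# action dimension equals vector_action_space_size.
import numpy as np

env = UnityEnv(app_name='Reacher.x86_64', idx=0)
state = env.reset()
for _ in range(10):
    action = np.zeros(env.brain.vector_action_space_size)
    state, reward, done, _ = env.step(action)
    if done[0]:
        state = env.reset()
env.close()
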
def dqn(n_episodes=10000,
        max_t=1000,
        eps_start=1.0,
        eps_end=0.05,
        eps_decay=0.995,
        train_mode=True):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        train_mode (bool): set environment into training mode if True. 
    """
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon

    env = UnityEnvironment(file_name="Banana/Banana.exe",
                           base_port=64738,
                           no_graphics=True)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=train_mode)[brain_name]
    state_size = len(env_info.vector_observations[0])

    agent = Agent(state_size=state_size, action_size=action_size, seed=0)

    for i_episode in range(1, n_episodes + 1):
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = np.int32(agent.act(state, eps))
            #next_state, reward, done, _ = env.step(action)
            env_info = env.step(action)[
                brain_name]  # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]  # see if episode has finished

            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                env_info = env.reset(train_mode=train_mode)[brain_name]
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) > 13.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(),
                       'checkpoint_vanilla.pth')
            break
    return scores
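
# Running the training loop above is a single call; plotting the returned
# scores with matplotlib is optional and shown here as an assumed dependency.
import matplotlib.pyplot as plt

scores = dqn(n_episodes=2000, train_mode=True)
plt.plot(scores)
plt.xlabel('Episode')
plt.ylabel('Score')
plt.show()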
Example #31
0
def train_agent(
    env: unityagents.UnityEnvironment,
    agent: agents.DDPGAgent,
    n_episodes: int = 200,
    mean_score_threshold: float = 30.0,
    max_t: int = 1000,
    has_ou_noise: bool = True,
    scores_maxlen: int = 100,
    ou_noise_sigma_start: float = 0.5,
    ou_noise_sigma_end: float = 0.01,
    ou_noise_sigma_decay: float = 0.99,
    n_random_episodes: int = 100,
    logging_freq: int = 10,
    checkpoints_dir: typing.Optional[pathlib.Path] = None,
    checkpoints_freq: int = 50,
) -> pd.DataFrame:
    """
    Train agent for Unity Tennis environment and return results.

    Parameters
    ----------
    env
        Unity environment
    agent
        An instance of a Deep Reinforcement Learning agent from the drl_ctrl.agents module
    n_episodes
        Maximum number of episodes
    mean_score_threshold
        Threshold on the mean of the last 100 scores at which to stop training and save results
    max_t
        Maximum number of time steps per episode
    has_ou_noise
        If True, Ornstein-Uhlenbeck noise is added to actions
    scores_maxlen
        Maximum length of scores window
    ou_noise_sigma_start
        Ornstein-Uhlenbeck noise sigma starting value per episode
    ou_noise_sigma_end
        Ornstein-Uhlenbeck noise sigma minimum value per episode
    ou_noise_sigma_decay
        Ornstein-Uhlenbeck noise sigma multiplicative decay
    n_random_episodes
        Number of random episodes to gather experience
    logging_freq
        Logging frequency
    checkpoints_dir
        Model checkpoints output directory
    checkpoints_freq
        Checkpoint frequency to check if agent scores achieves average score threshold

    """

    logger = logging.getLogger(__name__)

    scores = []
    scores_avg100 = []
    scores_window = deque(maxlen=scores_maxlen)
    time_started = time.time()
    times_total = []
    times_per_episode = []
    time_steps = []

    i_last_checkpoint = 0
    for i_episode in range(1, (n_random_episodes + n_episodes + 1)):

        time_started_episode = time.time()

        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()
        states = env_info.vector_observations
        num_agents = len(env_info.agents)
        agent_scores = np.zeros(num_agents)

        ou_noise_sigma = ou_noise_sigma_start

        t = 1
        while True:
            # choose action (for each agent)
            if i_episode <= n_random_episodes:
                action_size = env.brains[brain_name].vector_action_space_size
                actions = np.random.randn(num_agents, action_size)
                actions = np.clip(actions, -1, 1)
            else:
                actions = agent.act(states,
                                    ou_noise_sigma=ou_noise_sigma,
                                    add_noise=has_ou_noise)
            ou_noise_sigma = max(ou_noise_sigma_end,
                                 ou_noise_sigma * ou_noise_sigma_decay)

            # take action in the environment(for each agent)
            env_info = env.step(actions)[brain_name]

            # get next state (for each agent)
            next_states = env_info.vector_observations

            # see if episode finished
            dones = env_info.local_done

            # update the score (for each agent)
            agent_scores += env_info.rewards

            if i_episode <= n_random_episodes:
                agent.memory.add_batch(states, actions, env_info.rewards,
                                       next_states, dones)
            else:
                agent.step(states, actions, env_info.rewards, next_states,
                           dones)

            # roll over states to next time step
            states = next_states

            # exit loop if episode finished
            if np.any(dones):
                break
            t += 1

        score = float(np.max(agent_scores))
        scores_window.append(score)
        scores.append(score)
        scores_avg100.append(np.mean(scores_window))

        times_total.append(time.time() - time_started)
        times_per_episode.append(time.time() - time_started_episode)
        time_steps.append(t)

        if i_episode % logging_freq == 0:
            logger.info(f'\rEp: {i_episode}'
                        f'\tSigma({t}): {ou_noise_sigma:.3f}'
                        f'\tScore: {score:.2f}'
                        f'\tAvg. Score: {np.mean(scores_window):.2f}'
                        f'\tTime_e: {times_per_episode[-1]:.3f}s'
                        f'\tTime: {times_total[-1]:.3f}s')

        if len(scores_window) == scores_maxlen and np.mean(
                scores_window) >= mean_score_threshold:
            if (checkpoints_dir is not None and
                ((i_episode - i_last_checkpoint) % checkpoints_freq) == 0):
                checkpoint_dir = checkpoints_dir.joinpath(
                    f"episode_{i_episode}")
                checkpoint_dir.mkdir(parents=True, exist_ok=True)

                torch.save(
                    agent.actor_local.state_dict(),
                    str(path_util.mk_path_weights_actor_local(checkpoint_dir)))
                torch.save(
                    agent.actor_target.state_dict(),
                    str(path_util.mk_path_weights_actor_target(
                        checkpoint_dir)))
                torch.save(
                    agent.critic_local.state_dict(),
                    str(path_util.mk_path_weights_critic_local(
                        checkpoint_dir)))
                torch.save(
                    agent.critic_target.state_dict(),
                    str(path_util.mk_path_weights_critic_target(
                        checkpoint_dir)))

                logger.info(
                    f'\nSaved model checkpoint to {str(checkpoint_dir)}')
            else:
                logger.info(
                    f'\nEnvironment solved in {i_episode - scores_maxlen:d} episodes!'
                    f'\nScore: {score:.2f}'
                    f'\tAverage Score: {np.mean(scores_window):.2f}'
                    f'\tAverage Time_e: {np.mean(times_per_episode):.3f}s'
                    f'\tTotal Time: {times_total[-1]:.3f}s')
                break

    return pd.DataFrame.from_records(
        zip(range(len(scores)), scores, scores_avg100, time_steps,
            times_per_episode, times_total),
        columns=[
            cfg.COL_EPISODE, cfg.COL_SCORE, cfg.COL_SCORE_AVG100,
            cfg.COL_N_TIME_STEPS, cfg.COL_TIME_PER_EPISODE, cfg.COL_TIME_TOTAL
        ])
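

# Usage sketch (not part of the original example): how the training function above might be
# invoked. The function name `train`, the `Agent` class, the environment file name, and the
# state size are assumptions for illustration only.
from pathlib import Path

from unityagents import UnityEnvironment
from drl_ctrl.agents import Agent  # class name assumed

env = UnityEnvironment(file_name='Tennis.app')  # executable path assumed
brain = env.brains[env.brain_names[0]]
agent = Agent(state_size=24,  # per-agent observation size assumed for the Tennis environment
              action_size=brain.vector_action_space_size,
              random_seed=0)
results = train(env, agent,  # `train` refers to the function above (actual name not shown here)
                n_episodes=5000,
                mean_score_threshold=0.5,  # commonly used threshold for Tennis, assumed
                checkpoints_dir=Path('checkpoints'))
results.to_csv('scores.csv', index=False)
env.close()
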
class TrainerController(object):
    def __init__(self, env_path, run_id, save_freq, curriculum_file, fast_simulation, load, train,
                 worker_id, keep_checkpoints, lesson, seed, docker_target_name, trainer_config_path,
                 no_graphics):
        """
        :param env_path: Location to the environment executable to be loaded.
        :param run_id: The sub-directory name for model and summary statistics
        :param save_freq: Frequency at which to save model
        :param curriculum_file: Curriculum json file for environment
        :param fast_simulation: Whether to run the game at training speed
        :param load: Whether to load the model or randomly initialize
        :param train: Whether to train model, or only run inference
        :param worker_id: Number to add to communication port (5005). Used for multi-environment
        :param keep_checkpoints: How many model checkpoints to keep
        :param lesson: Start learning from this lesson
        :param seed: Random seed used for training.
        :param docker_target_name: Name of docker volume that will contain all data.
        :param trainer_config_path: Fully qualified path to location of trainer configuration file
        :param no_graphics: Whether to run the Unity simulator in no-graphics mode
        """
        self.trainer_config_path = trainer_config_path
        if env_path is not None:
            env_path = (env_path.strip()
                        .replace('.app', '')
                        .replace('.exe', '')
                        .replace('.x86_64', '')
                        .replace('.x86', ''))  # Strip out executable extensions if passed
        # Recognize and use docker volume if one is passed as an argument
        if docker_target_name == '':
            self.docker_training = False
            self.model_path = './models/{run_id}'.format(run_id=run_id)
            self.curriculum_file = curriculum_file
            self.summaries_dir = './summaries'
        else:
            self.docker_training = True
            self.model_path = '/{docker_target_name}/models/{run_id}'.format(
                docker_target_name=docker_target_name,
                run_id=run_id)
            if env_path is not None:
                env_path = '/{docker_target_name}/{env_name}'.format(docker_target_name=docker_target_name,
                                                                     env_name=env_path)
            if curriculum_file is None:
                self.curriculum_file = None
            else:
                self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format(
                    docker_target_name=docker_target_name,
                    curriculum_file=curriculum_file)
            self.summaries_dir = '/{docker_target_name}/summaries'.format(docker_target_name=docker_target_name)
        self.logger = logging.getLogger("unityagents")
        self.run_id = run_id
        self.save_freq = save_freq
        self.lesson = lesson
        self.fast_simulation = fast_simulation
        self.load_model = load
        self.train_model = train
        self.worker_id = worker_id
        self.keep_checkpoints = keep_checkpoints
        self.trainers = {}
        if seed == -1:
            seed = np.random.randint(0, 999999)
        self.seed = seed
        np.random.seed(self.seed)
        tf.set_random_seed(self.seed)
        self.env = UnityEnvironment(file_name=env_path, worker_id=self.worker_id,
                                    curriculum=self.curriculum_file, seed=self.seed,
                                    docker_training=self.docker_training,
                                    no_graphics=no_graphics)
        if env_path is None:
            self.env_name = 'editor_'+self.env.academy_name
        else:
            self.env_name = os.path.basename(os.path.normpath(env_path))  # Extract out name of environment

    def _get_progress(self):
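        """Return curriculum progress: the mean of step/max_steps over external brains for the
        'progress' measure, the summed last reward for the 'reward' measure, otherwise None."""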
        if self.curriculum_file is not None:
            progress = 0
            if self.env.curriculum.measure_type == "progress":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_step / self.trainers[brain_name].get_max_steps
                return progress / len(self.env.external_brain_names)
            elif self.env.curriculum.measure_type == "reward":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_last_reward
                return progress
            else:
                return None
        else:
            return None

    def _process_graph(self):
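        """Collect the graph node names (per trainer graph scope) that must be kept when the
        TensorFlow graph is frozen for Unity embedding."""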
        nodes = []
        scopes = []
        for brain_name in self.trainers.keys():
            if self.trainers[brain_name].graph_scope is not None:
                scope = self.trainers[brain_name].graph_scope + '/'
                if scope == '/':
                    scope = ''
                scopes += [scope]
                if self.trainers[brain_name].parameters["trainer"] == "imitation":
                    nodes += [scope + x for x in ["action"]]
                else:
                    nodes += [scope + x for x in ["action", "value_estimate", "action_probs"]]
                if self.trainers[brain_name].parameters["use_recurrent"]:
                    nodes += [scope + x for x in ["recurrent_out", "memory_size"]]
        if len(scopes) > 1:
            self.logger.info("List of available scopes :")
            for scope in scopes:
                self.logger.info("\t" + scope)
        self.logger.info("List of nodes to export :")
        for n in nodes:
            self.logger.info("\t" + n)
        return nodes

    def _save_model(self, sess, saver, steps=0):
        """
        Saves current model to checkpoint folder.
        :param sess: Current Tensorflow session.
        :param steps: Current number of steps in training process.
        :param saver: Tensorflow saver for session.
        """
        last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk'
        saver.save(sess, last_checkpoint)
        tf.train.write_graph(sess.graph_def, self.model_path, 'raw_graph_def.pb', as_text=False)
        self.logger.info("Saved Model")

    def _export_graph(self):
        """
        Exports latest saved model to .bytes format for Unity embedding.
        """
        target_nodes = ','.join(self._process_graph())
        ckpt = tf.train.get_checkpoint_state(self.model_path)
        freeze_graph.freeze_graph(input_graph=self.model_path + '/raw_graph_def.pb',
                                  input_binary=True,
                                  input_checkpoint=ckpt.model_checkpoint_path,
                                  output_node_names=target_nodes,
                                  output_graph=self.model_path + '/' + self.env_name + "_" + self.run_id + '.bytes',
                                  clear_devices=True, initializer_nodes="", input_saver="",
                                  restore_op_name="save/restore_all", filename_tensor_name="save/Const:0")

    def _initialize_trainers(self, trainer_config, sess):
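        """Build per-brain trainer parameters from the config (starting from the 'default'
        section) and instantiate a PPO or behavioral-cloning trainer for each external brain."""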
        trainer_parameters_dict = {}
        self.trainers = {}
        for brain_name in self.env.external_brain_names:
            trainer_parameters = trainer_config['default'].copy()
            if len(self.env.external_brain_names) > 1:
                graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
                trainer_parameters['graph_scope'] = graph_scope
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id) + '_' + graph_scope)
            else:
                trainer_parameters['graph_scope'] = ''
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id))
            if brain_name in trainer_config:
                _brain_key = brain_name
                while not isinstance(trainer_config[_brain_key], dict):
                    _brain_key = trainer_config[_brain_key]
                for k in trainer_config[_brain_key]:
                    trainer_parameters[k] = trainer_config[_brain_key][k]
            trainer_parameters_dict[brain_name] = trainer_parameters.copy()
        for brain_name in self.env.external_brain_names:
            if trainer_parameters_dict[brain_name]['trainer'] == "imitation":
                self.trainers[brain_name] = BehavioralCloningTrainer(sess, self.env, brain_name,
                                                                     trainer_parameters_dict[brain_name],
                                                                     self.train_model, self.seed)
            elif trainer_parameters_dict[brain_name]['trainer'] == "ppo":
                self.trainers[brain_name] = PPOTrainer(sess, self.env, brain_name, trainer_parameters_dict[brain_name],
                                                       self.train_model, self.seed)
            else:
                raise UnityEnvironmentException("The trainer config contains an unknown trainer type for brain {}"
                                                .format(brain_name))

    def _load_config(self):
        try:
            with open(self.trainer_config_path) as data_file:
                trainer_config = yaml.load(data_file)
                return trainer_config
        except IOError:
            raise UnityEnvironmentException("""Parameter file could not be found here {}.
                                            Will use default Hyper parameters"""
                                            .format(self.trainer_config_path))
        except UnicodeDecodeError:
            raise UnityEnvironmentException("There was an error decoding Trainer Config from this path : {}"
                                            .format(self.trainer_config_path))

    @staticmethod
    def _create_model_path(model_path):
        try:
            if not os.path.exists(model_path):
                os.makedirs(model_path)
        except Exception:
            raise UnityEnvironmentException("The folder {} containing the generated model could not be accessed."
                                            " Please make sure the permissions are set correctly."
                                            .format(model_path))

    def start_learning(self):
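        """Main training loop: initialize trainers, step the environment, feed experiences to
        the trainers, update models when ready, and periodically save/export the graph."""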
        self.env.curriculum.set_lesson_number(self.lesson)
        trainer_config = self._load_config()
        self._create_model_path(self.model_path)

        tf.reset_default_graph()

        with tf.Session() as sess:
            self._initialize_trainers(trainer_config, sess)
            for k, t in self.trainers.items():
                self.logger.info(t)
            init = tf.global_variables_initializer()
            saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
            # Instantiate model parameters
            if self.load_model:
                self.logger.info('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(self.model_path)
                if ckpt is None:
                    self.logger.info('The model {0} could not be found. Make sure you specified the right '
                                     '--run-id'.format(self.model_path))
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(init)
            global_step = 0  # This is only for saving the model
            self.env.curriculum.increment_lesson(self._get_progress())
            curr_info = self.env.reset(train_mode=self.fast_simulation)
            if self.train_model:
                for brain_name, trainer in self.trainers.items():
                    trainer.write_tensorboard_text('Hyperparameters', trainer.parameters)
            try:
                while any([t.get_step <= t.get_max_steps for k, t in self.trainers.items()]) or not self.train_model:
                    if self.env.global_done:
                        self.env.curriculum.increment_lesson(self._get_progress())
                        curr_info = self.env.reset(train_mode=self.fast_simulation)
                        for brain_name, trainer in self.trainers.items():
                            trainer.end_episode()
                    # Decide and take an action
                    take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {}
                    for brain_name, trainer in self.trainers.items():
                        (take_action_vector[brain_name],
                         take_action_memories[brain_name],
                         take_action_text[brain_name],
                         take_action_outputs[brain_name]) = trainer.take_action(curr_info)
                    new_info = self.env.step(vector_action=take_action_vector, memory=take_action_memories,
                                             text_action=take_action_text)
                    for brain_name, trainer in self.trainers.items():
                        trainer.add_experiences(curr_info, new_info, take_action_outputs[brain_name])
                        trainer.process_experiences(curr_info, new_info)
                        if trainer.is_ready_update() and self.train_model and trainer.get_step <= trainer.get_max_steps:
                            # Perform gradient descent with experience buffer
                            trainer.update_model()
                        # Write training statistics to Tensorboard.
                        trainer.write_summary(self.env.curriculum.lesson_number)
                        if self.train_model and trainer.get_step <= trainer.get_max_steps:
                            trainer.increment_step_and_update_last_reward()
                    if self.train_model:
                        global_step += 1
                    if global_step % self.save_freq == 0 and global_step != 0 and self.train_model:
                        # Save Tensorflow model
                        self._save_model(sess, steps=global_step, saver=saver)
                    curr_info = new_info
                # Final save Tensorflow model
                if global_step != 0 and self.train_model:
                    self._save_model(sess, steps=global_step, saver=saver)
            except KeyboardInterrupt:
                print('--------------------------Now saving model-------------------------')
                if self.train_model:
                    self.logger.info("Learning was interrupted. Please wait while the graph is generated.")
                    self._save_model(sess, steps=global_step, saver=saver)
                pass
        self.env.close()
        if self.train_model:
            self._export_graph()
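

# Usage sketch (not from the original example): TrainerController is typically constructed
# from command-line arguments (e.g. by ML-Agents' learn.py); the argument values below are
# illustrative only.
tc = TrainerController(env_path='3DBall', run_id='run-0', save_freq=50000,
                       curriculum_file=None, fast_simulation=True, load=False,
                       train=True, worker_id=0, keep_checkpoints=5, lesson=0,
                       seed=-1, docker_target_name='',
                       trainer_config_path='trainer_config.yaml',
                       no_graphics=False)
tc.start_learning()
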
Example #33
0
class UnityEnv(gym.Env):
    """
    Provides Gym wrapper for Unity Learning Environments.
    Multi-agent environments use lists for object types, as done here:
    https://github.com/openai/multiagent-particle-envs
    """
    def __init__(self, params):
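        """
        :param params: dict with keys 'path' (environment executable), 'worker_id', 'seed',
            'visual_mode' (use visual observations), and 'multiagent_mode'.
        """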

        environment_filename = params['path']
        worker_id = params['worker_id']
        seed = params['seed']
        use_visual = params['visual_mode']
        multiagent = params['multiagent_mode']

        self._env = UnityEnvironment(environment_filename, worker_id=worker_id, seed=seed)
        self.name = self._env.academy_name
        self.visual_obs = None
        self._action_space_size = None
        self._current_state = None
        self._n_agents = None
        self._multiagent = multiagent

        # Check brain configuration
        if len(self._env.brains) != 1:
            raise UnityGymException(
                "There can only be one brain in a UnityEnvironment "
                "if it is wrapped in a gym.")
        self.brain_name = self._env.external_brain_names[0]
        brain = self._env.brains[self.brain_name]

        if use_visual and brain.number_visual_observations == 0:
            raise UnityGymException(
                "`use_visual` was set to True, however there are no"
                " visual observations as part of this environment.")
        self.use_visual = brain.number_visual_observations >= 1 and use_visual

        if brain.number_visual_observations > 1:
            logger.warning(
                "The environment contains more than one visual observation. "
                "Please note that only the first will be provided in the observation."
            )

        if brain.num_stacked_vector_observations != 1:
            raise UnityGymException(
                "There can only be one stacked vector observation in a UnityEnvironment "
                "if it is wrapped in a gym.")

        # Check for number of agents in scene.
        initial_info = self._env.reset()[self.brain_name]
        self._check_agents(len(initial_info.agents))

        # Set observation and action spaces
        if brain.vector_action_space_type == "discrete":
            if len(brain.vector_action_space_size) == 1:
                self._action_space = spaces.Discrete(
                    brain.vector_action_space_size[0])
            else:
                self._action_space = spaces.MultiDiscrete(
                    brain.vector_action_space_size)
        else:
            self._action_space_size = brain.vector_action_space_size
            high = np.array([1] * brain.vector_action_space_size)
            self._action_space = spaces.Box(-high, high, dtype=np.float32)

        high = np.array([np.inf] * brain.vector_observation_space_size)
        self.action_meanings = brain.vector_action_descriptions
        if self.use_visual:
            if brain.camera_resolutions[0]["blackAndWhite"]:
                depth = 1
            else:
                depth = 3
            self._observation_space = spaces.Box(
                0,
                1,
                dtype=np.float32,
                shape=(brain.camera_resolutions[0]["height"],
                       brain.camera_resolutions[0]["width"], depth))
        else:
            self._observation_space = spaces.Box(-high, high, dtype=np.float32)

    def reset(self, train_mode=True):
        """Resets the state of the environment and returns an initial observation.
        In the case of multi-agent environments, this is a list.
        Returns: observation (object/list): the initial observation of the
            space.
        """
        info = self._env.reset(train_mode)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)

        if not self._multiagent:
            obs, reward, done, info = self._single_step(info)
        else:
            obs, reward, done, info = self._multi_step(info)
        return obs

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        In the case of multi-agent environments, these are lists.
        Args:
            action (object/list): an action provided by the environment
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list) : amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information, including BrainInfo.
        """

        # In multi-agent mode, `action` must be a list with one entry per agent.
        if self._multiagent:
            if not isinstance(action, list):
                raise UnityGymException(
                    "The environment was expecting `action` to be a list.")
            if len(action) != self._n_agents:
                raise UnityGymException(
                    "The environment was expecting a list of {} actions.".
                    format(self._n_agents))
            else:
                action = np.array(action)

        info = self._env.step(action)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self._current_state = info

        if not self._multiagent:
            obs, reward, done, info = self._single_step(info)
        else:
            obs, reward, done, info = self._multi_step(info)
        return obs, reward, done, info

    def _single_step(self, info):
        if self.use_visual:
            self.visual_obs = info.visual_observations[0][0, :, :, :]
            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations[0, :]

        return default_observation, info.rewards[0], info.local_done[0], {
            "text_observation": info.text_observations[0],
            "brain_info": info
        }

    def _multi_step(self, info):
        if self.use_visual:
            self.visual_obs = info.visual_observations
            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations
        return list(default_observation), info.rewards, info.local_done, {
            "text_observation": info.text_observations,
            "brain_info": info
        }

    def render(self, mode='rgb_array'):
        return self.visual_obs

    def close(self):
        """Override _close in your subclass to perform any necessary cleanup.
        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        self._env.close()

    def get_action_meanings(self):
        return self.action_meanings

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        Currently not implemented.
        """
        logger.warning("Could not seed environment %s", self.name)
        return

    def _check_agents(self, n_agents):
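        """Validate that the agent count matches the single-/multi-agent mode and has not
        changed since initialization."""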
        if not self._multiagent and n_agents > 1:
            raise UnityGymException(
                "The environment was launched as a single-agent environment, however"
                "there is more than one agent in the scene.")
        elif self._multiagent and n_agents <= 1:
            raise UnityGymException(
                "The environment was launched as a mutli-agent environment, however"
                "there is only one agent in the scene.")
        if self._n_agents is None:
            self._n_agents = n_agents
            logger.info("{} agents within environment.".format(n_agents))
        elif self._n_agents != n_agents:
            raise UnityGymException(
                "The number of agents in the environment has changed since "
                "initialization. This is not supported.")

    @property
    def metadata(self):
        return {'render.modes': ['rgb_array']}

    @property
    def reward_range(self):
        return -float('inf'), float('inf')

    @property
    def spec(self):
        return None

    @property
    def action_space_size(self):
        return self._action_space_size

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def number_agents(self):
        return self._n_agents
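

# Usage sketch (not from the original example): constructing the gym wrapper with the params
# dict read in __init__ and running one episode with random actions. Values are illustrative.
params = {
    'path': 'GridWorld',        # Unity environment executable (illustrative)
    'worker_id': 0,
    'seed': 0,
    'visual_mode': False,
    'multiagent_mode': False,
}
env = UnityEnv(params)
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
env.close()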