Example #1
    def _reset_env(self, env: BaseUnityEnvironment):
        """Resets the environment.

        Returns:
            A data structure corresponding to the initial reset state of the
            environment.
        """
        if self.meta_curriculum is not None:
            return env.reset(train_mode=self.fast_simulation,
                             config=self.meta_curriculum.get_config())
        else:
            return env.reset(train_mode=self.fast_simulation)

    def start_learning(self, env: BaseUnityEnvironment, trainer_config):
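        """Runs the main training loop.

        Initializes curricula and trainers, then repeatedly steps the
        environment until every trainer reaches its max step count (or
        indefinitely when not training), saving the TensorFlow model at the
        configured frequency and exporting the graph at the end.
        """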
        # TODO: Should be able to start learning at different lesson numbers
        # for each curriculum.
        if self.meta_curriculum is not None:
            self.meta_curriculum.set_all_curriculums_to_lesson_num(self.lesson)
        self._create_model_path(self.model_path)

        tf.reset_default_graph()

        # Create the trainers from the trainer configuration.
        self.initialize_trainers(trainer_config)
        for trainer in self.trainers.values():
            self.logger.info(trainer)

        if self.train_model:
            for trainer in self.trainers.values():
                trainer.write_tensorboard_text(
                    "Hyperparameters", trainer.parameters)
        try:
            curr_info = self._reset_env(env)
            while (
                any(t.get_step <= t.get_max_steps for t in self.trainers.values())
                or not self.train_model
            ):
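                # Advance the simulation by one step and count it towards
                # the periodic model save below.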
                new_info = self.take_step(env, curr_info)
                self.global_step += 1
                if (
                    self.global_step % self.save_freq == 0
                    and self.global_step != 0
                    and self.train_model
                ):
                    # Save Tensorflow model
                    self._save_model(steps=self.global_step)
                curr_info = new_info
            # Final save Tensorflow model
            if self.global_step != 0 and self.train_model:
                self._save_model(steps=self.global_step)
        except KeyboardInterrupt:
            if self.train_model:
                self._save_model_when_interrupted(steps=self.global_step)
        env.close()
        if self.train_model:
            self._write_training_metrics()
            self._export_graph()
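
A minimal sketch of how this entry point might be driven. The controller
class name (TrainerController), its constructor arguments, and the
load_config() helper are illustrative assumptions, not part of the code
above; the import paths can also vary between ML-Agents versions.

import yaml
from mlagents.envs import UnityEnvironment  # path may differ by version
from mlagents.trainers.trainer_controller import TrainerController  # assumed location


def load_config(path):
    # Hypothetical helper: load the per-brain trainer hyperparameters.
    with open(path) as f:
        return yaml.safe_load(f)


env = UnityEnvironment(file_name="3DBall")  # any BaseUnityEnvironment works
# The constructor arguments below are assumed for illustration only.
controller = TrainerController(model_path="./models",
                               save_freq=50000,
                               meta_curriculum=None,
                               lesson=0,
                               train_model=True)
# Runs the loop above; the environment is closed inside start_learning.
controller.start_learning(env, load_config("trainer_config.yaml"))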
Example #3
    def take_step(self, env: BaseUnityEnvironment, curr_info: AllBrainInfo):
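        """Advances the simulation by a single step.

        Increments curriculum lessons for brains whose reward buffers are
        ready, resets the environment when a lesson changes or the episode
        is globally done, queries each trainer for actions, steps the
        environment, and feeds the resulting experiences back to the
        trainers.

        Returns:
            The AllBrainInfo returned by the environment for this step.
        """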
        if self.meta_curriculum:
            # Get the sizes of the reward buffers.
            reward_buff_sizes = {
                k: len(t.reward_buffer)
                for (k, t) in self.trainers.items()
            }
            # Attempt to increment the lessons of the brains who
            # were ready.
            lessons_incremented = self.meta_curriculum.increment_lessons(
                self._get_measure_vals(),
                reward_buff_sizes=reward_buff_sizes)
        else:
            lessons_incremented = {}

        # If any lessons were incremented, reset the environment and end the
        # current episode for every trainer so training continues on the new
        # lesson configuration.
        if self.meta_curriculum and any(lessons_incremented.values()):
            curr_info = self._reset_env(env)
            print("___________Current info of agent in trainer_controller")
            print(curr_info)
            for brain_name, trainer in self.trainers.items():
                trainer.end_episode()
            for brain_name, changed in lessons_incremented.items():
                if changed:
                    self.trainers[brain_name].reward_buffer.clear()
        elif env.global_done:
            curr_info = self._reset_env(env)
            for brain_name, trainer in self.trainers.items():
                trainer.end_episode()

        # Decide and take an action
        take_action_vector = {}
        take_action_memories = {}
        take_action_text = {}
        take_action_value = {}
        take_action_outputs = {}
        for brain_name, trainer in self.trainers.items():
            action_info = trainer.get_action(curr_info[brain_name])
            take_action_vector[brain_name] = action_info.action
            take_action_memories[brain_name] = action_info.memory
            take_action_text[brain_name] = action_info.text
            take_action_value[brain_name] = action_info.value
            take_action_outputs[brain_name] = action_info.outputs
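        # Step the environment with the per-brain actions and time the call
        # so the duration can be recorded in the trainer metrics.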
        time_start_step = time()
        new_info = env.step(vector_action=take_action_vector,
                            memory=take_action_memories,
                            text_action=take_action_text,
                            value=take_action_value)
        delta_time_step = time() - time_start_step
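        # Hand the (curr_info -> new_info) transition to every trainer and
        # let trainers that are ready update their policies.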
        for brain_name, trainer in self.trainers.items():
            if brain_name in self.trainer_metrics:
                self.trainer_metrics[brain_name].add_delta_step(
                    delta_time_step)
            trainer.add_experiences(curr_info, new_info,
                                    take_action_outputs[brain_name])
            trainer.process_experiences(curr_info, new_info)
            if (trainer.is_ready_update() and self.train_model
                    and trainer.get_step <= trainer.get_max_steps):
                # Perform gradient descent with the experience buffer.
                trainer.update_policy()
            # Write training statistics to Tensorboard.
            delta_train_start = time() - self.training_start_time
            if self.meta_curriculum is not None:
                curriculum = \
                    self.meta_curriculum.brains_to_curriculums[brain_name]
                trainer.write_summary(
                    self.global_step,
                    delta_train_start,
                    lesson_num=curriculum.lesson_num)
            else:
                trainer.write_summary(self.global_step, delta_train_start)
            if (self.train_model
                    and trainer.get_step <= trainer.get_max_steps):
                trainer.increment_step_and_update_last_reward()
        return new_info