Example 1
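These examples are methods excerpted from a trainer class. They reference module-level names not shown here: the standard imports random, numpy as np, torch, time and torch.autograd.Variable, the project classes MADDPG, IQL, ReplayBuffer, Logger, Plotter, FrameSaver and ModelSaver, and the constants USE_CUDA and GOAL_EPSILON.
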
    def eval(self):
        """
        Abstract evaluation flow
        """
        print("EVALUATION RUN")
        print("No exploration and dropout will be used")
        self.arglist.exploration_bonus = 0.0
        self.arglist.init_noise_scale = 0.0
        self.arglist.dropout_p = 0.0
        if self.arglist.load_models is None:
            print("WARNING: Evaluation run without loading any models!")

        # set random seeds before model creation
        if self.arglist.seed is not None:
            random.seed(self.arglist.seed)
            np.random.seed(self.arglist.seed)
            torch.manual_seed(self.arglist.seed)
            torch.cuda.manual_seed(self.arglist.seed)
            if torch.cuda.is_available():
                torch.backends.cudnn.deterministic = True
                torch.backends.cudnn.benchmark = False

        env, env_name, task_name, n_agents, observation_sizes, action_sizes, discrete_actions = (
            self.create_environment())
        self.env = env
        self.n_agents = n_agents

        print("Observation sizes: ", observation_sizes)
        print("Action sizes: ", action_sizes)

        # Create curiosity instances
        if self.arglist.curiosity is None:
            print("No curiosity is to be used!")
        elif self.arglist.curiosity == "icm":
            print("Training uses Intrinsic Curiosity Module (ICM)!")
        elif self.arglist.curiosity == "rnd":
            print("Training uses Random Network Distillation (RND)!")
        elif self.arglist.curiosity == "count":
            print("Training uses hash-based counting exploration bonus!")
            # TODO: add count based ones
        else:
            raise ValueError("Unknown curiosity: " + self.arglist.curiosity)

        # create algorithm trainer
        if self.arglist.alg == "maddpg":
            self.alg = MADDPG(n_agents, observation_sizes, action_sizes,
                              discrete_actions, self.arglist)
            print(
                "Training multi-agent deep deterministic policy gradient (MADDPG) on "
                + env_name + " environment")
        elif self.arglist.alg == "iql":
            self.alg = IQL(n_agents, observation_sizes, action_sizes,
                           discrete_actions, self.arglist)
            print("Training independent q-learning (IQL) on " + env_name +
                  " environment")
        else:
            raise ValueError("Unknown algorithm: " + self.arglist.alg)

        # set random seeds past model creation
        if self.arglist.seed is not None:
            random.seed(self.arglist.seed)
            np.random.seed(self.arglist.seed)
            torch.manual_seed(self.arglist.seed)
            torch.cuda.manual_seed(self.arglist.seed)
            if torch.cuda.is_available():
                torch.backends.cudnn.deterministic = True
                torch.backends.cudnn.benchmark = False

        if self.arglist.load_models is not None:
            print("Loading models from " + self.arglist.load_models +
                  " with extension " + self.arglist.load_models_extension)
            self.alg.load_model_networks(
                self.arglist.load_models,
                "_" + self.arglist.load_models_extension)

        self.logger = Logger(n_agents, task_name, None, self.arglist.alg,
                             self.arglist.curiosity)
        self.plotter = Plotter(
            self.logger,
            n_agents,
            task_name,
            self.arglist.run,
            self.arglist.alg,
            self.arglist.curiosity,
        )
        if self.arglist.save_gifs:
            self.frame_saver = FrameSaver(self.arglist.eta, task_name,
                                          self.arglist.run, self.arglist.alg)

        print("Starting iterations...")
        start_time = time.time()
        t = 0
        for ep in range(self.arglist.num_episodes):
            obs = self.reset_environment()
            self.alg.reset(ep)

            episode_rewards = np.array([0.0] * n_agents)
            episode_length = 0
            done = False

            while not done and episode_length < self.arglist.max_episode_len:
                torch_obs = [
                    Variable(torch.Tensor(obs[i]), requires_grad=False)
                    for i in range(n_agents)
                ]

                actions, agent_actions = self.select_actions(torch_obs)
                rewards, dones, next_obs = self.environment_step(actions)

                t += 1

                episode_rewards += rewards

                # for displaying learned policies
                self.environment_render()
                if self.arglist.save_gifs:
                    self.frame_saver.add_frame(
                        self.env.render("rgb_array")[0], ep)

                obs = next_obs
                episode_length += 1
                done = all(dones)

            if self.arglist.alg == "maddpg":
                self.logger.log_episode(
                    ep,
                    episode_rewards,
                    [0.0] * n_agents,
                    self.alg.agents[0].get_exploration_scale(),
                )
            if self.arglist.alg == "iql":
                self.logger.log_episode(
                    ep,
                    episode_rewards,
                    [0.0] * n_agents,
                    self.alg.agents[0].epsilon,
                )
            self.logger.dump_episodes(1)

            if self.arglist.save_gifs:
                self.frame_saver.save_episode_gif()

            if ep % 20 == 0 and ep > 0:
                # update plots
                self.plotter.update_reward_plot(True)
                self.plotter.update_exploration_plot(True)

        duration = time.time() - start_time
        print("Overall duration: %.2fs" % duration)

        env.close()
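
A minimal sketch (not part of the source) of how this evaluation flow might be launched. The class name Runner and the concrete values are assumptions; the attribute names mirror the self.arglist.* fields that eval() reads.

# Hypothetical driver for the eval() flow above; Runner stands in for
# whatever class defines eval(), and the paths/values are placeholders.
from types import SimpleNamespace

arglist = SimpleNamespace(
    seed=1,
    alg="maddpg",
    curiosity=None,
    num_episodes=10,
    max_episode_len=25,
    load_models="results/example_run/models",  # placeholder directory
    load_models_extension="final",
    save_gifs=False,
    run="eval_run",
    # exploration_bonus, init_noise_scale and dropout_p are overwritten
    # to 0.0 inside eval() itself
    exploration_bonus=0.0,
    init_noise_scale=0.0,
    dropout_p=0.0,
)

runner = Runner(arglist)  # Runner: hypothetical class exposing eval()
runner.eval()
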
Example 2
    def train(self):
        """
        Abstract training flow
        """
        # set random seeds before model creation
        if self.arglist.seed is not None:
            random.seed(self.arglist.seed)
            np.random.seed(self.arglist.seed)
            torch.manual_seed(self.arglist.seed)
            torch.cuda.manual_seed(self.arglist.seed)
            if torch.cuda.is_available():
                torch.backends.cudnn.deterministic = True
                torch.backends.cudnn.benchmark = False

        # use number of threads if no GPUs are available
        if not USE_CUDA:
            torch.set_num_threads(self.arglist.n_training_threads)

        env, env_name, task_name, n_agents, observation_sizes, action_sizes, discrete_actions = (
            self.create_environment())
        self.env = env
        self.n_agents = n_agents

        steps = self.arglist.num_episodes * self.arglist.max_episode_len
        # steps-th root of GOAL_EPSILON
        decay_epsilon = GOAL_EPSILON**(1 / float(steps))
        self.arglist.decay_factor = decay_epsilon
        print("Epsilon is decaying with factor %.7f to %.3f over %d steps." %
              (decay_epsilon, GOAL_EPSILON, steps))
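        # In other words, decay_epsilon ** steps == GOAL_EPSILON, so an epsilon
        # multiplied by decay_epsilon once per environment step shrinks by a
        # total factor of GOAL_EPSILON over the planned number of steps (e.g.
        # with hypothetical values GOAL_EPSILON = 0.05 and steps = 1_000_000,
        # decay_epsilon = 0.05 ** 1e-6 ≈ 0.9999970).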

        print("Observation sizes: ", observation_sizes)
        print("Action sizes: ", action_sizes)

        # Create curiosity instances
        if self.arglist.curiosity is None:
            print("No curiosity is to be used!")
        elif self.arglist.curiosity == "icm":
            print("Training uses Intrinsic Curiosity Module (ICM)!")
        elif self.arglist.curiosity == "rnd":
            print("Training uses Random Network Distillation (RND)!")
        elif self.arglist.curiosity == "count":
            print("Training uses hash-based counting exploration bonus!")
        else:
            raise ValueError("Unknown curiosity: " + self.arglist.curiosity)

        # create algorithm trainer
        if self.arglist.alg == "maddpg":
            self.alg = MADDPG(n_agents, observation_sizes, action_sizes,
                              discrete_actions, self.arglist)
            print(
                "Training multi-agent deep deterministic policy gradient (MADDPG) on "
                + env_name + " environment")
        elif self.arglist.alg == "iql":
            self.alg = IQL(n_agents, observation_sizes, action_sizes,
                           discrete_actions, self.arglist)
            print("Training independent q-learning (IQL) on " + env_name +
                  " environment")
        else:
            raise ValueError("Unknown algorithm: " + self.arglist.alg)

        self.memory = ReplayBuffer(
            self.arglist.buffer_capacity,
            n_agents,
            observation_sizes,
            action_sizes,
            self.arglist.no_rewards,
        )

        # set random seeds past model creation
        if self.arglist.seed is not None:
            random.seed(self.arglist.seed)
            np.random.seed(self.arglist.seed)
            torch.manual_seed(self.arglist.seed)
            torch.cuda.manual_seed(self.arglist.seed)
            if torch.cuda.is_available():
                torch.backends.cudnn.deterministic = True
                torch.backends.cudnn.benchmark = False

        if self.arglist.load_models is not None:
            print("Loading models from " + self.arglist.load_models +
                  " with extension " + self.arglist.load_models_extension)
            self.alg.load_model_networks(
                self.arglist.load_models,
                "_" + self.arglist.load_models_extension)

        self.model_saver = ModelSaver(self.arglist.save_models_dir,
                                      self.arglist.run, self.arglist.alg)
        self.logger = Logger(
            n_agents,
            self.arglist.eta,
            task_name,
            self.arglist.run,
            self.arglist.alg,
            self.arglist.curiosity,
        )
        self.plotter = Plotter(
            self.logger,
            n_agents,
            self.arglist.eval_frequency,
            task_name,
            self.arglist.run,
            self.arglist.alg,
            self.arglist.curiosity,
        )
        if self.arglist.save_frames:
            self.frame_saver = FrameSaver(self.arglist.eta, task_name,
                                          self.arglist.run, self.arglist.alg)

        print("Starting iterations...")
        start_time = time.time()
        t = 0

        for ep in range(self.arglist.num_episodes):
            obs = self.reset_environment()
            self.alg.reset(ep)

            episode_rewards = np.array([0.0] * n_agents)
            if self.arglist.sparse_rewards:
                sparse_rewards = np.array([0.0] * n_agents)
            episode_length = 0
            done = False
            interesting_episode = False

            while not done and episode_length < self.arglist.max_episode_len:
                torch_obs = [
                    Variable(torch.Tensor(obs[i]), requires_grad=False)
                    for i in range(n_agents)
                ]

                actions, agent_actions = self.select_actions(
                    torch_obs, not self.arglist.no_exploration)
                rewards, dones, next_obs = self.environment_step(actions)

                episode_rewards += rewards
                if self.arglist.sparse_rewards:
                    sparse_rewards += rewards

                if self.arglist.no_rewards:
                    rewards = [0.0] * n_agents
                elif self.arglist.sparse_rewards:
                    if (episode_length + 1) % self.arglist.sparse_freq == 0:
                        rewards = list(sparse_rewards /
                                       self.arglist.sparse_freq)
                    else:
                        rewards = [0.0] * n_agents
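                # At this point rewards holds the true per-step rewards, zeros
                # (no_rewards, or an off step under sparse_rewards), or the
                # accumulated reward divided by sparse_freq (sparse_rewards).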
                self.memory.push(obs, agent_actions, rewards, next_obs, dones)

                t += 1

                if (len(self.memory) >= self.arglist.batch_size
                        and (t % self.arglist.steps_per_update) == 0):
                    losses = self.alg.update(self.memory, USE_CUDA)
                    self.logger.log_losses(ep, losses)
                    if self.arglist.dump_losses:
                        self.logger.dump_losses(1)

                # for displaying learned policies
                if self.arglist.display:
                    self.environment_render()
                if self.arglist.save_frames:
                    self.frame_saver.add_frame(
                        self.env.render("rgb_array")[0], ep)
                    if self.arglist.curiosity is not None:
                        curiosities = self.alg.get_curiosities(
                            obs, agent_actions, next_obs)
                        interesting = self.frame_saver.save_interesting_frame(
                            curiosities)
                        interesting_episode = interesting_episode or interesting

                obs = next_obs
                episode_length += 1
                done = all(dones)

            if ep % self.arglist.eval_frequency == 0:
                eval_rewards = np.zeros((self.arglist.eval_episodes, n_agents))
                for i in range(self.arglist.eval_episodes):
                    ep_rewards, _, _ = self.eval(ep, n_agents)
                    eval_rewards[i, :] = ep_rewards
                if self.arglist.alg == "maddpg":
                    self.logger.log_episode(
                        ep,
                        eval_rewards.mean(0),
                        eval_rewards.var(0),
                        self.alg.agents[0].get_exploration_scale(),
                    )
                if self.arglist.alg == "iql":
                    self.logger.log_episode(ep, eval_rewards.mean(0),
                                            eval_rewards.var(0),
                                            self.alg.agents[0].epsilon)
                self.logger.dump_episodes(1)
            if ep % 100 == 0 and ep > 0:
                duration = time.time() - start_time
                self.logger.dump_train_progress(ep, self.arglist.num_episodes,
                                                duration)

            if interesting_episode:
                self.frame_saver.save_episode_gif()

            if ep % (self.arglist.save_interval // 2) == 0 and ep > 0:
                # update plots
                self.plotter.update_reward_plot(self.arglist.plot)
                self.plotter.update_exploration_plot(self.arglist.plot)
                self.plotter.update_alg_loss_plot(self.arglist.plot)
                if self.arglist.curiosity is not None:
                    self.plotter.update_cur_loss_plot(self.arglist.plot)
                    self.plotter.update_intrinsic_reward_plot(
                        self.arglist.plot)

            if ep % self.arglist.save_interval == 0 and ep > 0:
                # save plots
                print("Remove previous plots")
                self.plotter.clear_plots()
                print("Saving intermediate plots")
                self.plotter.save_reward_plot(str(ep))
                self.plotter.save_exploration_plot(str(ep))
                self.plotter.save_alg_loss_plots(str(ep))
                self.plotter.save_cur_loss_plots(str(ep))
                self.plotter.save_intrinsic_reward_plot(str(ep))
                # save models
                print("Remove previous models")
                self.model_saver.clear_models()
                print("Saving intermediate models")
                self.model_saver.save_models(self.alg, str(ep))
                # save logs
                print("Remove previous logs")
                self.logger.clear_logs()
                print("Saving intermediate logs")
                self.logger.save_episodes(extension=str(ep))
                self.logger.save_losses(extension=str(ep))
                # save parameter log
                self.logger.save_parameters(
                    env_name,
                    task_name,
                    n_agents,
                    observation_sizes,
                    action_sizes,
                    discrete_actions,
                    self.arglist,
                )

        duration = time.time() - start_time
        print("Overall duration: %.2fs" % duration)

        print("Remove previous plots")
        self.plotter.clear_plots()
        print("Saving final plots")
        self.plotter.save_reward_plot("final")
        self.plotter.save_exploration_plot("final")
        self.plotter.save_alg_loss_plots("final")
        self.plotter.save_cur_loss_plots("final")
        self.plotter.save_intrinsic_reward_plot("final")

        # save models
        print("Remove previous models")
        self.model_saver.clear_models()
        print("Saving final models")
        self.model_saver.save_models(self.alg, "final")

        # save logs
        print("Remove previous logs")
        self.logger.clear_logs()
        print("Saving final logs")
        self.logger.save_episodes(extension="final")
        self.logger.save_losses(extension="final")
        self.logger.save_duration_cuda(duration, torch.cuda.is_available())

        # save parameter log
        self.logger.save_parameters(
            env_name,
            task_name,
            n_agents,
            observation_sizes,
            action_sizes,
            discrete_actions,
            self.arglist,
        )

        env.close()
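
Both methods repeat the same seeding block four times; a possible refactor (a sketch, not from the source, with the helper name set_global_seeds chosen here) would centralize it:

import random

import numpy as np
import torch


def set_global_seeds(seed):
    """Seed all RNGs used above and make cuDNN deterministic (sketch)."""
    if seed is None:
        return
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

Each of the four seeding blocks above would then reduce to a single call: set_global_seeds(self.arglist.seed).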