Example #1
    def learn(self, total_timesteps, callback=None, log_interval=1,
              eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="PPO", reset_num_timesteps=True):

        timesteps_since_eval, iteration, evaluations, obs, eval_env = self._setup_learn(eval_env)

        if self.tensorboard_log is not None:
            self.tb_writer = tf.summary.create_file_writer(os.path.join(self.tensorboard_log, tb_log_name))

        while self.num_timesteps < total_timesteps:

            if callback is not None:
                # Only stop training if return value is False, not when it is None.
                if callback(locals(), globals()) is False:
                    break

            # Collect n_steps transitions per environment with the current policy
            obs = self.collect_rollouts(self.env, self.rollout_buffer, n_rollout_steps=self.n_steps, obs=obs)
            iteration += 1
            self.num_timesteps += self.n_steps * self.n_envs
            timesteps_since_eval += self.n_steps * self.n_envs
            self._update_current_progress(self.num_timesteps, total_timesteps)

            # Display training infos
            if self.verbose >= 1 and log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.logkv("iterations", iteration)
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    logger.logkv('ep_rew_mean', self.safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer]))
                    logger.logkv('ep_len_mean', self.safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer]))
                logger.logkv("fps", fps)
                logger.logkv('time_elapsed', int(time.time() - self.start_time))
                logger.logkv("total timesteps", self.num_timesteps)
                logger.dumpkvs()

            # Update the policy for n_epochs epochs on the collected rollout buffer
            self.train(self.n_epochs, batch_size=self.batch_size)

            # Evaluate the agent
            timesteps_since_eval = self._eval_policy(eval_freq, eval_env, n_eval_episodes,
                                                     timesteps_since_eval, deterministic=True)
            # For tensorboard integration
            # if self.tb_writer is not None:
            #     with self.tb_writer.as_default():
            #         tf.summary.scalar('Eval/reward', mean_reward, self.num_timesteps)

        return self
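
A minimal usage sketch for this learn() method, assuming model is an already constructed instance of the PPO class from Example #3; the callback contract (training stops only when the callback returns False) follows the check inside the loop above, and the timestep budget and log interval are illustrative:

def stop_callback(locals_, globals_):
    # Keep training while this returns True (or None); returning False stops learn()
    return locals_["self"].num_timesteps < 1_000_000

model.learn(total_timesteps=1_000_000,
            callback=stop_callback,
            log_interval=10,
            tb_log_name="PPO",
            reset_num_timesteps=True)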
Example #2
    def collect_rollouts(self,
                         env,
                         n_episodes=1,
                         n_steps=-1,
                         action_noise=None,
                         deterministic=False,
                         callback=None,
                         learning_starts=0,
                         num_timesteps=0,
                         replay_buffer=None,
                         obs=None,
                         episode_num=0,
                         log_interval=None):
        """
        Collect rollout using the current policy (and possibly fill the replay buffer)
        TODO: move this method to off-policy base class.

        :param env: (VecEnv) The training environment (must contain a single env)
        :param n_episodes: (int) Number of episodes to collect
        :param n_steps: (int) Number of steps to collect (-1 to rely on n_episodes only)
        :param action_noise: (ActionNoise) Noise added to the scaled action for exploration
        :param deterministic: (bool) Whether to select actions deterministically
        :param callback: (callable) Callback called at each step
        :param learning_starts: (int) Number of warmup timesteps sampled uniformly at random
        :param num_timesteps: (int) Total number of timesteps collected so far
        :param replay_buffer: (ReplayBuffer) Buffer where the transitions are stored
        :param obs: (np.ndarray) Last observation (the VecEnv is not reset between calls)
        :param episode_num: (int) Number of episodes already collected (used for logging)
        :param log_interval: (int) Log training infos every log_interval episodes
        :return: (float, int, int, np.ndarray) Mean episode reward, number of steps and
            episodes collected, and the last observation
        """
        episode_rewards = []
        total_timesteps = []
        total_steps, total_episodes = 0, 0
        assert isinstance(env, VecEnv)
        assert env.num_envs == 1

        while total_steps < n_steps or total_episodes < n_episodes:
            done = False
            # Reset environment: not needed for VecEnv
            # obs = env.reset()
            episode_reward, episode_timesteps = 0.0, 0

            while not done:

                # Select action randomly or according to policy
                if num_timesteps < learning_starts:
                    # Warmup phase
                    unscaled_action = np.array([self.action_space.sample()])
                else:
                    unscaled_action = self.predict(obs)

                # Rescale the action from [low, high] to [-1, 1]
                scaled_action = self.scale_action(unscaled_action)

                # Add noise to the action (improve exploration)
                if action_noise is not None:
                    scaled_action = np.clip(scaled_action + action_noise(), -1, 1)

                # Rescale and perform action
                new_obs, reward, done, infos = env.step(self.unscale_action(scaled_action))

                done_bool = [float(done[0])]
                episode_reward += reward

                # Retrieve reward and episode length if using Monitor wrapper
                self._update_info_buffer(infos)

                # Store data in replay buffer
                if replay_buffer is not None:
                    replay_buffer.add(obs, new_obs, scaled_action, reward,
                                      done_bool)

                obs = new_obs

                num_timesteps += 1
                episode_timesteps += 1
                total_steps += 1
                if 0 < n_steps <= total_steps:
                    break

            if done:
                total_episodes += 1
                episode_rewards.append(episode_reward)
                total_timesteps.append(episode_timesteps)
                if action_noise is not None:
                    action_noise.reset()

                # Display training infos
                if self.verbose >= 1 and log_interval is not None and (episode_num + total_episodes) % log_interval == 0:
                    fps = int(num_timesteps / (time.time() - self.start_time))
                    logger.logkv("episodes", episode_num + total_episodes)
                    if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                        logger.logkv('ep_rew_mean', self.safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer]))
                        logger.logkv('ep_len_mean', self.safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer]))
                    # logger.logkv("n_updates", n_updates)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - self.start_time))
                    logger.logkv("total timesteps", num_timesteps)
                    logger.dumpkvs()

        mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0

        return mean_reward, total_steps, total_episodes, obs
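
For context, a sketch of the kind of off-policy learn() loop that could drive collect_rollouts(), written as it would appear inside a hypothetical learn() method; the outer loop, the train() call, and its arguments are assumptions based on the signature and return values above, not code from this example:

obs = env.reset()
num_timesteps, episode_num = 0, 0

while num_timesteps < total_timesteps:
    # Collect one full episode (n_steps=-1), filling the replay buffer with exploration noise
    mean_reward, steps, episodes, obs = self.collect_rollouts(
        env, n_episodes=1, n_steps=-1, action_noise=action_noise,
        learning_starts=learning_starts, num_timesteps=num_timesteps,
        replay_buffer=replay_buffer, obs=obs, episode_num=episode_num,
        log_interval=log_interval)
    num_timesteps += steps
    episode_num += episodes

    if num_timesteps > learning_starts:
        # Hypothetical update step: one gradient step per collected transition
        self.train(gradient_steps=steps, batch_size=batch_size)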
Example #3
    def __init__(self,
                 policy,
                 env,
                 learning_rate=3e-4,
                 n_steps=2048,
                 batch_size=64,
                 n_epochs=10,
                 gamma=0.99,
                 gae_lambda=0.95,
                 clip_range=0.2,
                 clip_range_vf=None,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 target_kl=None,
                 tensorboard_log=None,
                 create_eval_env=False,
                 policy_kwargs=None,
                 verbose=0,
                 seed=0,
                 _init_setup_model=True,
                 modelpath=None,
                 logpath=None):

        super(PPO, self).__init__(policy,
                                  env,
                                  PPOPolicy,
                                  policy_kwargs=policy_kwargs,
                                  verbose=verbose,
                                  create_eval_env=create_eval_env,
                                  support_multi_env=True,
                                  seed=seed)

        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.n_steps = n_steps
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.clip_range = clip_range
        self.clip_range_vf = clip_range_vf
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.rollout_buffer = None
        self.target_kl = target_kl
        self.tensorboard_log = tensorboard_log
        self.tb_writer = None

        self.iteration_start = 0
        self.time_elapsed_start = 0
        self.num_timesteps_start = 0

        self.modelpath = modelpath

        params_loaded, policy_loaded = self._setup_model(modelpath)

        if logpath is not None:

            p = None
            # When resuming from a checkpoint, try to reload the previous progress log
            if modelpath is not None and params_loaded and policy_loaded:
                try:
                    fname = osp.join(logpath, 'progress.csv')
                    p = pd.read_csv(fname, delimiter=',', dtype=float)
                except Exception:
                    # No previous progress.csv, or it could not be parsed: start fresh
                    pass

            format_strs = os.getenv('', 'stdout,log,csv').split(',')
            logger.configure(os.path.abspath(logpath), format_strs)

            if p is not None:
                keys = p.keys()
                vals = p.values
                # Resume the iteration/timestep/time counters from the last logged row
                self.iteration_start = p['iterations'].values[-1]
                self.num_timesteps_start = p['total timesteps'].values[-1]
                self.time_elapsed_start = p['time_elapsed'].values[-1]
                # Replay the previous rows so the new progress.csv continues the old one
                for i in range(vals.shape[0]):
                    for j in range(len(keys)):
                        logger.logkv(keys[j], vals[i, j])
                    logger.dumpkvs()
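
A construction sketch for this PPO class; whether policy is passed as a string alias or a policy class depends on the base class (not shown), and the environment id, paths, and hyperparameter overrides below are purely illustrative:

import gym

env = gym.make("Pendulum-v0")  # illustrative continuous-control task

model = PPO("MlpPolicy", env,
            learning_rate=3e-4,
            n_steps=2048,
            batch_size=64,
            n_epochs=10,
            ent_coef=0.01,
            verbose=1,
            seed=0,
            modelpath="runs/ppo_pendulum/model",  # hypothetical checkpoint to resume from
            logpath="runs/ppo_pendulum")          # progress.csv here is replayed into the logger

model.learn(total_timesteps=200_000)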