Example #1
    def _update_learning_rate(self, optimizers):
        """
        Update the optimizers' learning rate using the current learning rate schedule
        and the current progress (from 1 to 0).

        :param optimizers: ([th.optim.Optimizer] or Optimizer) An optimizer
            or a list of optimizers.
        """
        # Log the current learning rate
        logger.logkv("learning_rate",
                     self.learning_rate(self._current_progress))
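The body above only logs the scheduled value; according to the docstring, the method should also apply it to the given optimizer(s). A minimal sketch of that missing part, assuming tf.keras-style optimizers (as used in the other examples here) rather than the th.optim.Optimizer mentioned in the docstring:

        # Hedged sketch (not part of the original snippet): apply the scheduled
        # learning rate to each optimizer.
        new_lr = self.learning_rate(self._current_progress)
        if not isinstance(optimizers, (list, tuple)):
            optimizers = [optimizers]
        for optimizer in optimizers:
            # Keras optimizers allow assigning the learning rate directly
            optimizer.learning_rate = new_lr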
Example #2
    def collect_rollouts(self,
                         env,
                         n_episodes=1,
                         n_steps=-1,
                         action_noise=None,
                         deterministic=False,
                         callback=None,
                         learning_starts=0,
                         num_timesteps=0,
                         replay_buffer=None,
                         obs=None,
                         episode_num=0,
                         log_interval=None):
        """
        Collect rollouts using the current policy (and possibly fill the replay buffer).
        TODO: move this method to off-policy base class.

        :param env: (VecEnv) The training environment (must contain exactly one env)
        :param n_episodes: (int) Maximum number of episodes to collect
        :param n_steps: (int) Maximum number of steps to collect (ignored if <= 0)
        :param action_noise: (ActionNoise) Noise added to the scaled actions for exploration
        :param deterministic: (bool) Whether to use deterministic actions
        :param callback: (callable) Callback called at every step
        :param learning_starts: (int) Number of timesteps during which actions are
            sampled at random (warmup) instead of using the policy
        :param num_timesteps: (int) Total number of timesteps collected so far
        :param replay_buffer: (ReplayBuffer) Buffer where the transitions are stored
        :param obs: (np.ndarray) Last observation returned by the environment
        :param episode_num: (int) Number of episodes already collected before this call
        :param log_interval: (int) Log training infos every log_interval episodes
        """
        episode_rewards = []
        total_timesteps = []
        total_steps, total_episodes = 0, 0
        assert isinstance(env, VecEnv)
        assert env.num_envs == 1

        while total_steps < n_steps or total_episodes < n_episodes:
            done = False
            # Reset environment: not needed for VecEnv
            # obs = env.reset()
            episode_reward, episode_timesteps = 0.0, 0

            while not done:

                # Select action randomly or according to policy
                if num_timesteps < learning_starts:
                    # Warmup phase
                    unscaled_action = np.array([self.action_space.sample()])
                else:
                    unscaled_action = self.predict(obs)

                # Rescale the action from [low, high] to [-1, 1]
                scaled_action = self.scale_action(unscaled_action)

                # Add noise to the action (improve exploration)
                if action_noise is not None:
                    scaled_action = np.clip(scaled_action + action_noise(), -1,
                                            1)

                # Rescale and perform action
                new_obs, reward, done, infos = env.step(
                    self.unscale_action(scaled_action))

                done_bool = [float(done[0])]
                episode_reward += reward

                # Retrieve reward and episode length if using Monitor wrapper
                self._update_info_buffer(infos)

                # Store data in replay buffer
                if replay_buffer is not None:
                    replay_buffer.add(obs, new_obs, scaled_action, reward,
                                      done_bool)

                obs = new_obs

                num_timesteps += 1
                episode_timesteps += 1
                total_steps += 1
                if 0 < n_steps <= total_steps:
                    break

            if done:
                total_episodes += 1
                episode_rewards.append(episode_reward)
                total_timesteps.append(episode_timesteps)
                if action_noise is not None:
                    action_noise.reset()

                # Display training infos
                if self.verbose >= 1 and log_interval is not None and (
                        episode_num + total_episodes) % log_interval == 0:
                    fps = int(num_timesteps / (time.time() - self.start_time))
                    logger.logkv("episodes", episode_num + total_episodes)
                    if len(self.ep_info_buffer) > 0 and len(
                            self.ep_info_buffer[0]) > 0:
                        logger.logkv(
                            'ep_rew_mean',
                            self.safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buffer
                            ]))
                        logger.logkv(
                            'ep_len_mean',
                            self.safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buffer
                            ]))
                    # logger.logkv("n_updates", n_updates)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed',
                                 int(time.time() - self.start_time))
                    logger.logkv("total timesteps", num_timesteps)
                    logger.dumpkvs()

        mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0

        return mean_reward, total_steps, total_episodes, obs
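The loop above relies on scale_action and unscale_action helpers that are not included in the snippet. A plausible sketch of these helpers, assuming a continuous (Box) action space with finite bounds; the original class may implement them differently:

    def scale_action(self, action):
        # Hypothetical helper: map an action from [low, high] to [-1, 1]
        low, high = self.action_space.low, self.action_space.high
        return 2.0 * ((action - low) / (high - low)) - 1.0

    def unscale_action(self, scaled_action):
        # Hypothetical helper: map an action from [-1, 1] back to [low, high]
        low, high = self.action_space.low, self.action_space.high
        return low + 0.5 * (scaled_action + 1.0) * (high - low)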
Example #3
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=1,
              eval_env=None,
              eval_freq=-1,
              n_eval_episodes=5,
              tb_log_name="PPO",
              reset_num_timesteps=True):

        timesteps_since_eval, iteration, evaluations, obs, eval_env = self._setup_learn(
            eval_env)
        iteration += self.iteration_start

        if self.tensorboard_log is not None:
            self.tb_writer = tf.summary.create_file_writer(
                os.path.join(self.tensorboard_log,
                             f'{tb_log_name}_{time.time()}'))

        while self.num_timesteps < total_timesteps:

            if callback is not None:
                # Only stop training if return value is False, not when it is None.
                if callback(locals(), globals()) is False:
                    break

            obs = self.collect_rollouts(self.env,
                                        self.rollout_buffer,
                                        n_rollout_steps=self.n_steps,
                                        obs=obs)
            iteration += 1
            self.num_timesteps += self.n_steps * self.n_envs
            timesteps_since_eval += self.n_steps * self.n_envs
            self._update_current_progress(self.num_timesteps, total_timesteps)

            # Display training infos

            if self.verbose >= 1 and log_interval is not None and iteration % log_interval == 0:
                if len(self.ep_reward_buffer) > 0:
                    fps = int(self.num_timesteps /
                              (time.time() - self.start_time))
                    logger.logkv("iterations", iteration)
                    logger.logkv('ep_rew_mean',
                                 self.safe_mean(self.ep_reward_buffer))
                    logger.logkv("fps", fps)
                    logger.logkv(
                        'time_elapsed',
                        int(time.time() - self.start_time +
                            self.time_elapsed_start))
                    logger.logkv("total timesteps",
                                 self.num_timesteps + self.num_timesteps_start)
                    logger.dumpkvs()
                    if iteration > self.iteration_start + 1:
                        self.save(self.modelpath)

            self.train(self.n_epochs, batch_size=self.batch_size)

            # Evaluate the agent

            timesteps_since_eval = self._eval_policy(eval_freq,
                                                     eval_env,
                                                     n_eval_episodes,
                                                     timesteps_since_eval,
                                                     deterministic=True)

            # For tensorboard integration

            if self.tb_writer is not None:
                with self.tb_writer.as_default():
                    if len(self.ep_reward_buffer) > 0:
                        tf.summary.scalar(
                            'Reward', self.safe_mean(self.ep_reward_buffer),
                            self.num_timesteps)

        return self
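learn() calls self._update_current_progress, which is not shown in these examples. Given the docstring of Example #1 ("current progress (from 1 to 0)"), it presumably looks roughly like the following sketch (name and formula assumed, not taken from the source):

    def _update_current_progress(self, num_timesteps, total_timesteps):
        # Hypothetical helper: progress decays from 1 (start of training) to 0 (end);
        # the learning rate and clip range schedules consume this value.
        self._current_progress = 1.0 - float(num_timesteps) / float(total_timesteps)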
Example #4
    def __init__(self,
                 policy,
                 env,
                 learning_rate=3e-4,
                 n_steps=2048,
                 batch_size=64,
                 n_epochs=10,
                 gamma=0.99,
                 gae_lambda=0.95,
                 clip_range=0.2,
                 clip_range_vf=None,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 target_kl=None,
                 tensorboard_log=None,
                 create_eval_env=False,
                 policy_kwargs=None,
                 verbose=0,
                 seed=0,
                 _init_setup_model=True,
                 modelpath=None,
                 logpath=None):

        super(PPO, self).__init__(policy,
                                  env,
                                  PPOPolicy,
                                  policy_kwargs=policy_kwargs,
                                  verbose=verbose,
                                  create_eval_env=create_eval_env,
                                  support_multi_env=True,
                                  seed=seed)

        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.n_steps = n_steps
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.clip_range = clip_range
        self.clip_range_vf = clip_range_vf
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.rollout_buffer = None
        self.target_kl = target_kl
        self.tensorboard_log = tensorboard_log
        self.tb_writer = None

        self.iteration_start = 0
        self.time_elapsed_start = 0
        self.num_timesteps_start = 0

        self.modelpath = modelpath

        params_loaded, policy_loaded = self._setup_model(modelpath)

        if logpath is not None:

            # Try to resume from an existing progress.csv if a saved model was loaded
            p = None
            if modelpath is not None and params_loaded and policy_loaded:
                try:
                    fname = osp.join(logpath, 'progress.csv')
                    p = pd.read_csv(fname, delimiter=',', dtype=float)
                except Exception:
                    # No readable previous log: start logging from scratch
                    pass

            # Default logger output formats (stdout, log file, CSV)
            format_strs = 'stdout,log,csv'.split(',')
            logger.configure(os.path.abspath(logpath), format_strs)

            if p is not None:
                # Restore counters from the last logged row and replay all previous
                # rows so that the resumed progress.csv stays continuous
                keys = p.keys()
                vals = p.values
                self.iteration_start = p['iterations'].values[-1]
                self.num_timesteps_start = p['total timesteps'].values[-1]
                self.time_elapsed_start = p['time_elapsed'].values[-1]
                for i in range(vals.shape[0]):
                    for j in range(len(keys)):
                        logger.logkv(keys[j], vals[i, j])
                    logger.dumpkvs()
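For context, a hedged usage sketch combining this constructor with learn() from Example #3. The policy identifier 'MlpPolicy', the environment wrapper import and the paths are assumptions, not taken from the snippets:

# Hypothetical usage sketch
import gym
from stable_baselines.common.vec_env import DummyVecEnv  # assumed import path

env = DummyVecEnv([lambda: gym.make('Pendulum-v0')])
model = PPO('MlpPolicy', env,
            n_steps=2048,
            batch_size=64,
            verbose=1,
            modelpath='./ppo_pendulum',  # where checkpoints are saved
            logpath='./logs')            # where progress.csv is written
model.learn(total_timesteps=100000, log_interval=10)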
Example #5
    def train(self, gradient_steps, batch_size=64):
        # Update optimizer learning rate
        # self._update_learning_rate(self.policy.optimizer)

        # Compute current clip range
        clip_range = self.clip_range(self._current_progress)
        if self.clip_range_vf is not None:
            clip_range_vf = self.clip_range_vf(self._current_progress)
        else:
            clip_range_vf = None

        for gradient_step in range(gradient_steps):
            approx_kl_divs = []
            # Sample the rollout buffer
            for replay_data in self.rollout_buffer.get(batch_size):
                # Unpack
                obs, action, old_values, old_log_prob, advantage, return_batch = replay_data

                if isinstance(self.action_space, spaces.Discrete):
                    # Convert discrete actions from float to long
                    action = action.astype(np.int64).flatten()

                with tf.GradientTape() as tape:
                    tape.watch(self.policy.trainable_variables)
                    values, log_prob, entropy = self.policy.evaluate_actions(
                        obs, action)
                    # Flatten
                    values = tf.reshape(values, [-1])

                    policy_loss = self.policy_loss(advantage, log_prob,
                                                   old_log_prob, clip_range)
                    value_loss = self.value_loss(values, old_values,
                                                 return_batch, clip_range_vf)

                    # Entropy loss favors exploration
                    entropy_loss = -tf.reduce_mean(entropy)

                    loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss

                # Optimization step
                gradients = tape.gradient(loss,
                                          self.policy.trainable_variables)
                # Clip grad norm (disabled here); for a list of gradients this would be
                # gradients, _ = tf.clip_by_global_norm(gradients, self.max_grad_norm)
                self.policy.optimizer.apply_gradients(
                    zip(gradients, self.policy.trainable_variables))
                approx_kl_divs.append(
                    tf.reduce_mean(old_log_prob - log_prob).numpy())

            if self.target_kl is not None and np.mean(
                    approx_kl_divs) > 1.5 * self.target_kl:
                print(
                    "Early stopping at step {} due to reaching max kl: {:.2f}".
                    format(gradient_step, np.mean(approx_kl_divs)))
                break

        explained_var = explained_variance(
            self.rollout_buffer.returns.flatten(),
            self.rollout_buffer.values.flatten())

        logger.logkv("clip_range", clip_range)
        if self.clip_range_vf is not None:
            logger.logkv("clip_range_vf", clip_range_vf)

        logger.logkv("explained_variance", explained_var)
        # TODO: gather stats for the entropy and other losses?
        logger.logkv("entropy", entropy.numpy().mean())
        logger.logkv("policy_loss", policy_loss.numpy())
        logger.logkv("value_loss", value_loss.numpy())
        if hasattr(self.policy, 'log_std'):
            logger.logkv("std", tf.exp(self.policy.log_std).numpy().mean())
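train() delegates to self.policy_loss and self.value_loss, which are not included above. A hedged sketch of what these typically compute in PPO: the clipped surrogate objective and an (optionally clipped) value regression loss. The advantage normalization is a common choice, not confirmed by the snippet:

    def policy_loss(self, advantage, log_prob, old_log_prob, clip_range):
        # Sketch of the PPO clipped surrogate objective (negated for gradient descent)
        advantage = (advantage - tf.reduce_mean(advantage)) / (tf.math.reduce_std(advantage) + 1e-8)
        ratio = tf.exp(log_prob - old_log_prob)
        unclipped = advantage * ratio
        clipped = advantage * tf.clip_by_value(ratio, 1.0 - clip_range, 1.0 + clip_range)
        return -tf.reduce_mean(tf.minimum(unclipped, clipped))

    def value_loss(self, values, old_values, return_batch, clip_range_vf=None):
        # Sketch of the value loss, optionally clipping the update around the old predictions
        if clip_range_vf is not None:
            values = old_values + tf.clip_by_value(values - old_values, -clip_range_vf, clip_range_vf)
        return tf.reduce_mean(tf.square(return_batch - values))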