Example #1
    def __init__(self,
                 buffer,
                 env,
                 act_fn,
                 device,
                 learning_starts=1000,
                 update_period=1):
        """Init."""
        self.env = ensure_vec_env(env)
        if self.env.num_envs > 1 and not isinstance(buffer,
                                                    BatchedReplayBuffer):
            raise ValueError(
                "when num_envs > 1, you must pass a BatchedReplayBuffer"
                " to the ReplayBufferDataManager.")
        if not isinstance(buffer, BatchedReplayBuffer):
            buffer = BatchedReplayBuffer(buffer)
        if self.env.num_envs != buffer.n:
            raise ValueError(
                f"Found {self.env.num_envs} envs and {buffer.n} "
                "buffers. The number of envs must be equal to the "
                "number of buffers!")
        self.act = act_fn
        self.buffer = buffer
        self.device = device
        self.learning_starts = learning_starts
        self.update_period = update_period
        self._ob = None
Example #2
    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 buffer_size=10000,
                 frame_stack=1,
                 learning_starts=1000,
                 update_period=1,
                 batch_size=256,
                 policy_lr=1e-3,
                 qf_lr=1e-3,
                 gamma=0.99,
                 target_update_period=1,
                 policy_update_period=1,
                 target_smoothing_coef=0.005,
                 alpha=0.2,
                 automatic_entropy_tuning=True,
                 target_entropy=None,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=1000):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.frame_stack = frame_stack
        self.learning_starts = learning_starts
        self.update_period = update_period
        self.batch_size = batch_size
        if target_update_period < self.update_period:
            self.target_update_period = self.update_period
        else:
            self.target_update_period = target_update_period - (
                target_update_period % self.update_period)
        if policy_update_period < self.update_period:
            self.policy_update_period = self.update_period
        else:
            self.policy_update_period = policy_update_period - (
                policy_update_period % self.update_period)
        self.target_smoothing_coef = target_smoothing_coef
        self.log_period = log_period

        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi = policy_fn(eval_env)
        self.qf1 = qf_fn(eval_env)
        self.qf2 = qf_fn(eval_env)
        self.target_qf1 = qf_fn(eval_env)
        self.target_qf2 = qf_fn(eval_env)

        self.pi.to(self.device)
        self.qf1.to(self.device)
        self.qf2.to(self.device)
        self.target_qf1.to(self.device)
        self.target_qf2.to(self.device)

        self.opt_pi = optimizer(self.pi.parameters(), lr=policy_lr)
        self.opt_qf1 = optimizer(self.qf1.parameters(), lr=qf_lr)
        self.opt_qf2 = optimizer(self.qf2.parameters(), lr=qf_lr)

        self.target_qf1.load_state_dict(self.qf1.state_dict())
        self.target_qf2.load_state_dict(self.qf2.state_dict())

        self.buffer = BatchedReplayBuffer(*[
            ReplayBuffer(buffer_size, frame_stack) for _ in range(self.nenv)
        ])
        self.data_manager = ReplayBufferDataManager(self.buffer, self.env,
                                                    SACActor(self.pi),
                                                    self.device,
                                                    self.learning_starts,
                                                    self.update_period)

        self.alpha = alpha
        self.automatic_entropy_tuning = automatic_entropy_tuning
        if self.automatic_entropy_tuning:
            if target_entropy is not None:
                self.target_entropy = target_entropy
            else:
                target_entropies = nest.map_structure(
                    lambda space: -np.prod(space.shape).item(),
                    misc.unpack_space(self.env.action_space))
                self.target_entropy = sum(nest.flatten(target_entropies))

            self.log_alpha = torch.tensor(np.log([self.alpha]),
                                          requires_grad=True,
                                          device=self.device,
                                          dtype=torch.float32)
            self.opt_alpha = optimizer([self.log_alpha], lr=policy_lr)
        else:
            self.target_entropy = None
            self.log_alpha = None
            self.opt_alpha = None

        self.mse_loss = torch.nn.MSELoss()

        self.t = 0
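
The automatic entropy tuning block above keeps a learnable log_alpha with its own optimizer. A minimal, self-contained sketch of that update, where the 6-dimensional action space and the batch of log-probabilities are made up for illustration:

import numpy as np
import torch

alpha = 0.2
action_dim = 6                       # assumed Box action space of shape (6,)
target_entropy = -float(action_dim)  # same heuristic as -np.prod(space.shape)

log_alpha = torch.tensor(np.log([alpha]), dtype=torch.float32,
                         requires_grad=True)
opt_alpha = torch.optim.Adam([log_alpha], lr=1e-3)

logp = torch.randn(256)              # stand-in for pi_out.dist.log_prob(action)
ent_error = logp + target_entropy
alpha_loss = -(log_alpha * ent_error.detach()).mean()
opt_alpha.zero_grad()
alpha_loss.backward()
opt_alpha.step()
print(log_alpha.exp().item())        # the temperature used in the SAC losses
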
Example #3
class SAC(Algorithm):
    """SAC algorithm."""
    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 buffer_size=10000,
                 frame_stack=1,
                 learning_starts=1000,
                 update_period=1,
                 batch_size=256,
                 policy_lr=1e-3,
                 qf_lr=1e-3,
                 gamma=0.99,
                 target_update_period=1,
                 policy_update_period=1,
                 target_smoothing_coef=0.005,
                 alpha=0.2,
                 automatic_entropy_tuning=True,
                 target_entropy=None,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=1000):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.frame_stack = frame_stack
        self.learning_starts = learning_starts
        self.update_period = update_period
        self.batch_size = batch_size
        if target_update_period < self.update_period:
            self.target_update_period = self.update_period
        else:
            self.target_update_period = target_update_period - (
                target_update_period % self.update_period)
        if policy_update_period < self.update_period:
            self.policy_update_period = self.update_period
        else:
            self.policy_update_period = policy_update_period - (
                policy_update_period % self.update_period)
        self.target_smoothing_coef = target_smoothing_coef
        self.log_period = log_period

        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi = policy_fn(eval_env)
        self.qf1 = qf_fn(eval_env)
        self.qf2 = qf_fn(eval_env)
        self.target_qf1 = qf_fn(eval_env)
        self.target_qf2 = qf_fn(eval_env)

        self.pi.to(self.device)
        self.qf1.to(self.device)
        self.qf2.to(self.device)
        self.target_qf1.to(self.device)
        self.target_qf2.to(self.device)

        self.opt_pi = optimizer(self.pi.parameters(), lr=policy_lr)
        self.opt_qf1 = optimizer(self.qf1.parameters(), lr=qf_lr)
        self.opt_qf2 = optimizer(self.qf2.parameters(), lr=qf_lr)

        self.target_qf1.load_state_dict(self.qf1.state_dict())
        self.target_qf2.load_state_dict(self.qf2.state_dict())

        self.buffer = BatchedReplayBuffer(*[
            ReplayBuffer(buffer_size, frame_stack) for _ in range(self.nenv)
        ])
        self.data_manager = ReplayBufferDataManager(self.buffer, self.env,
                                                    SACActor(self.pi),
                                                    self.device,
                                                    self.learning_starts,
                                                    self.update_period)

        self.alpha = alpha
        self.automatic_entropy_tuning = automatic_entropy_tuning
        if self.automatic_entropy_tuning:
            if target_entropy is not None:
                self.target_entropy = target_entropy
            else:
                target_entropies = nest.map_structure(
                    lambda space: -np.prod(space.shape).item(),
                    misc.unpack_space(self.env.action_space))
                self.target_entropy = sum(nest.flatten(target_entropies))

            self.log_alpha = torch.tensor(np.log([self.alpha]),
                                          requires_grad=True,
                                          device=self.device,
                                          dtype=torch.float32)
            self.opt_alpha = optimizer([self.log_alpha], lr=policy_lr)
        else:
            self.target_entropy = None
            self.log_alpha = None
            self.opt_alpha = None

        self.mse_loss = torch.nn.MSELoss()

        self.t = 0

    def loss(self, batch):
        """Loss function."""
        pi_out = self.pi(batch['obs'], reparameterization_trick=True)
        logp = pi_out.dist.log_prob(pi_out.action)
        q1 = self.qf1(batch['obs'], batch['action']).value
        q2 = self.qf2(batch['obs'], batch['action']).value

        # alpha loss
        if self.automatic_entropy_tuning:
            ent_error = logp + self.target_entropy
            alpha_loss = -(self.log_alpha * ent_error.detach()).mean()
            self.opt_alpha.zero_grad()
            alpha_loss.backward()
            self.opt_alpha.step()
            alpha = self.log_alpha.exp()
        else:
            alpha = self.alpha
            alpha_loss = 0

        # qf loss
        with torch.no_grad():
            next_pi_out = self.pi(batch['next_obs'])
            next_ac_logp = next_pi_out.dist.log_prob(next_pi_out.action)
            q1_next = self.target_qf1(batch['next_obs'],
                                      next_pi_out.action).value
            q2_next = self.target_qf2(batch['next_obs'],
                                      next_pi_out.action).value
            qnext = torch.min(q1_next, q2_next) - alpha * next_ac_logp
            qtarg = batch['reward'] + (1.0 -
                                       batch['done']) * self.gamma * qnext

        assert qtarg.shape == q1.shape
        assert qtarg.shape == q2.shape
        qf1_loss = self.mse_loss(q1, qtarg)
        qf2_loss = self.mse_loss(q2, qtarg)

        # pi loss
        pi_loss = None
        if self.t % self.policy_update_period == 0:
            q1_pi = self.qf1(batch['obs'], pi_out.action).value
            q2_pi = self.qf2(batch['obs'], pi_out.action).value
            min_q_pi = torch.min(q1_pi, q2_pi)
            assert min_q_pi.shape == logp.shape
            pi_loss = (alpha * logp - min_q_pi).mean()

            # log pi loss about as frequently as other losses
            if self.t % self.log_period < self.policy_update_period:
                logger.add_scalar('loss/pi', pi_loss, self.t, time.time())

        if self.t % self.log_period < self.update_period:
            if self.automatic_entropy_tuning:
                logger.add_scalar('alg/log_alpha',
                                  self.log_alpha.detach().cpu().numpy(),
                                  self.t, time.time())
                scalars = {
                    "target": self.target_entropy,
                    "entropy": -torch.mean(logp.detach()).cpu().numpy().item()
                }
                logger.add_scalars('alg/entropy', scalars, self.t, time.time())
            else:
                logger.add_scalar(
                    'alg/entropy',
                    -torch.mean(logp.detach()).cpu().numpy().item(), self.t,
                    time.time())
            logger.add_scalar('loss/qf1', qf1_loss, self.t, time.time())
            logger.add_scalar('loss/qf2', qf2_loss, self.t, time.time())
            logger.add_scalar('alg/qf1',
                              q1.mean().detach().cpu().numpy(), self.t,
                              time.time())
            logger.add_scalar('alg/qf2',
                              q2.mean().detach().cpu().numpy(), self.t,
                              time.time())
        return pi_loss, qf1_loss, qf2_loss

    def step(self):
        """Step optimization."""
        self.t += self.data_manager.step_until_update()
        if self.t % self.target_update_period == 0:
            soft_target_update(self.target_qf1, self.qf1,
                               self.target_smoothing_coef)
            soft_target_update(self.target_qf2, self.qf2,
                               self.target_smoothing_coef)

        if self.t % self.update_period == 0:
            batch = self.data_manager.sample(self.batch_size)

            pi_loss, qf1_loss, qf2_loss = self.loss(batch)

            # update
            if pi_loss is not None:
                self.opt_pi.zero_grad()
                pi_loss.backward()
                self.opt_pi.step()

            self.opt_qf1.zero_grad()
            qf1_loss.backward()
            self.opt_qf1.step()

            self.opt_qf2.zero_grad()
            qf2_loss.backward()
            self.opt_qf2.step()

        return self.t

    def evaluate(self):
        """Evaluate."""
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi.eval()
        misc.set_env_to_eval_mode(eval_env)

        # Eval policy
        os.makedirs(os.path.join(self.logdir, 'eval'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'eval',
                               self.ckptr.format.format(self.t) + '.json')
        stats = rl_evaluate(eval_env, self.pi, self.eval_num_episodes, outfile,
                            self.device)
        logger.add_scalar('eval/mean_episode_reward', stats['mean_reward'],
                          self.t, time.time())
        logger.add_scalar('eval/mean_episode_length', stats['mean_length'],
                          self.t, time.time())

        # Record policy
        os.makedirs(os.path.join(self.logdir, 'video'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'video',
                               self.ckptr.format.format(self.t) + '.mp4')
        rl_record(eval_env, self.pi, self.record_num_episodes, outfile,
                  self.device)

        self.pi.train()
        misc.set_env_to_train_mode(self.env)
        self.data_manager.manual_reset()

    def save(self):
        """Save."""
        state_dict = {
            'pi': self.pi.state_dict(),
            'qf1': self.qf1.state_dict(),
            'qf2': self.qf2.state_dict(),
            'target_qf1': self.target_qf1.state_dict(),
            'target_qf2': self.target_qf2.state_dict(),
            'opt_pi': self.opt_pi.state_dict(),
            'opt_qf1': self.opt_qf1.state_dict(),
            'opt_qf2': self.opt_qf2.state_dict(),
            'log_alpha': (self.log_alpha
                          if self.automatic_entropy_tuning else None),
            'opt_alpha': (self.opt_alpha.state_dict()
                          if self.automatic_entropy_tuning else None),
            'env': misc.env_state_dict(self.env),
            't': self.t
        }
        buffer_dict = self.buffer.state_dict()
        state_dict['buffer_format'] = nest.get_structure(buffer_dict)
        self.ckptr.save(state_dict, self.t)

        # save buffer separately and only once (because it can be huge)
        np.savez(
            os.path.join(self.ckptr.ckptdir, 'buffer.npz'),
            **{f'{i:04d}': x
               for i, x in enumerate(nest.flatten(buffer_dict))})

    def load(self, t=None):
        """Load."""
        state_dict = self.ckptr.load(t)
        if state_dict is None:
            self.t = 0
            return self.t
        self.pi.load_state_dict(state_dict['pi'])
        self.qf1.load_state_dict(state_dict['qf1'])
        self.qf2.load_state_dict(state_dict['qf2'])
        self.target_qf1.load_state_dict(state_dict['target_qf1'])
        self.target_qf2.load_state_dict(state_dict['target_qf2'])

        self.opt_pi.load_state_dict(state_dict['opt_pi'])
        self.opt_qf1.load_state_dict(state_dict['opt_qf1'])
        self.opt_qf2.load_state_dict(state_dict['opt_qf2'])

        if state_dict['log_alpha'] is not None:
            with torch.no_grad():
                self.log_alpha.copy_(state_dict['log_alpha'])
            self.opt_alpha.load_state_dict(state_dict['opt_alpha'])
        misc.env_load_state_dict(self.env, state_dict['env'])
        self.t = state_dict['t']

        buffer_format = state_dict['buffer_format']
        buffer_state = dict(
            np.load(os.path.join(self.ckptr.ckptdir, 'buffer.npz'),
                    allow_pickle=True))
        buffer_state = nest.flatten(buffer_state)
        self.buffer.load_state_dict(
            nest.pack_sequence_as(buffer_state, buffer_format))
        self.data_manager.manual_reset()
        return self.t

    def close(self):
        """Close environment."""
        try:
            self.env.close()
        except Exception:
            pass
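
step() relies on soft_target_update, which is not defined in this snippet. A common implementation is Polyak averaging with the same (target, online, coefficient) argument order as the calls above; a sketch under that assumption:

import torch

def soft_target_update(target_net, net, tau):
    """Move each target parameter a fraction tau toward the online parameter."""
    with torch.no_grad():
        for p_targ, p in zip(target_net.parameters(), net.parameters()):
            p_targ.mul_(1.0 - tau).add_(tau * p)

# Throwaway networks just to exercise the update.
qf = torch.nn.Linear(4, 1)
target_qf = torch.nn.Linear(4, 1)
target_qf.load_state_dict(qf.state_dict())
soft_target_update(target_qf, qf, tau=0.005)
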
Example #4
    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 buffer_size=10000,
                 frame_stack=1,
                 learning_starts=1000,
                 update_period=1,
                 batch_size=256,
                 policy_lr=1e-4,
                 qf_lr=1e-3,
                 qf_weight_decay=0.01,
                 gamma=0.99,
                 noise_theta=0.15,
                 noise_sigma=0.2,
                 noise_sigma_final=0.01,
                 noise_decay_period=10000,
                 target_update_period=1,
                 target_smoothing_coef=0.005,
                 reward_scale=1,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=1000):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.frame_stack = frame_stack
        self.learning_starts = learning_starts
        self.update_period = update_period
        self.batch_size = batch_size
        if target_update_period < self.update_period:
            self.target_update_period = self.update_period
        else:
            self.target_update_period = target_update_period - (
                target_update_period % self.update_period)
        self.reward_scale = reward_scale
        self.target_smoothing_coef = target_smoothing_coef
        self.log_period = log_period

        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
        self.t = 0

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        self.policy_fn = policy_fn
        self.qf_fn = qf_fn
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi = policy_fn(eval_env)
        self.qf = qf_fn(eval_env)
        self.target_pi = policy_fn(eval_env)
        self.target_qf = qf_fn(eval_env)

        self.pi.to(self.device)
        self.qf.to(self.device)
        self.target_pi.to(self.device)
        self.target_qf.to(self.device)

        self.optimizer = optimizer
        self.policy_lr = policy_lr
        self.qf_lr = qf_lr
        self.qf_weight_decay = qf_weight_decay
        self.opt_pi = optimizer(self.pi.parameters(), lr=policy_lr)
        self.opt_qf = optimizer(self.qf.parameters(),
                                lr=qf_lr,
                                weight_decay=qf_weight_decay)

        self.target_pi.load_state_dict(self.pi.state_dict())
        self.target_qf.load_state_dict(self.qf.state_dict())

        self.noise_schedule = LinearSchedule(noise_decay_period,
                                             noise_sigma_final, noise_sigma)
        self._actor = DDPGActor(self.pi, self.env.action_space, noise_theta,
                                self.noise_schedule.value(self.t))
        self.buffer = BatchedReplayBuffer(*[
            ReplayBuffer(buffer_size, frame_stack) for _ in range(self.nenv)
        ])
        self.data_manager = ReplayBufferDataManager(self.buffer, self.env,
                                                    self._actor, self.device,
                                                    self.learning_starts,
                                                    self.update_period)

        self.qf_criterion = torch.nn.MSELoss()
        if self.env.action_space.__class__.__name__ == 'Discrete':
            raise ValueError("Action space must be continuous!")
Example #5
class DDPG(Algorithm):
    """DDPG algorithm."""
    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 buffer_size=10000,
                 frame_stack=1,
                 learning_starts=1000,
                 update_period=1,
                 batch_size=256,
                 policy_lr=1e-4,
                 qf_lr=1e-3,
                 qf_weight_decay=0.01,
                 gamma=0.99,
                 noise_theta=0.15,
                 noise_sigma=0.2,
                 noise_sigma_final=0.01,
                 noise_decay_period=10000,
                 target_update_period=1,
                 target_smoothing_coef=0.005,
                 reward_scale=1,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=1000):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.frame_stack = frame_stack
        self.learning_starts = learning_starts
        self.update_period = update_period
        self.batch_size = batch_size
        if target_update_period < self.update_period:
            self.target_update_period = self.update_period
        else:
            self.target_update_period = target_update_period - (
                target_update_period % self.update_period)
        self.reward_scale = reward_scale
        self.target_smoothing_coef = target_smoothing_coef
        self.log_period = log_period

        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
        self.t = 0

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        self.policy_fn = policy_fn
        self.qf_fn = qf_fn
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi = policy_fn(eval_env)
        self.qf = qf_fn(eval_env)
        self.target_pi = policy_fn(eval_env)
        self.target_qf = qf_fn(eval_env)

        self.pi.to(self.device)
        self.qf.to(self.device)
        self.target_pi.to(self.device)
        self.target_qf.to(self.device)

        self.optimizer = optimizer
        self.policy_lr = policy_lr
        self.qf_lr = qf_lr
        self.qf_weight_decay = qf_weight_decay
        self.opt_pi = optimizer(self.pi.parameters(), lr=policy_lr)
        self.opt_qf = optimizer(self.qf.parameters(),
                                lr=qf_lr,
                                weight_decay=qf_weight_decay)

        self.target_pi.load_state_dict(self.pi.state_dict())
        self.target_qf.load_state_dict(self.qf.state_dict())

        self.noise_schedule = LinearSchedule(noise_decay_period,
                                             noise_sigma_final, noise_sigma)
        self._actor = DDPGActor(self.pi, self.env.action_space, noise_theta,
                                self.noise_schedule.value(self.t))
        self.buffer = BatchedReplayBuffer(*[
            ReplayBuffer(buffer_size, frame_stack) for _ in range(self.nenv)
        ])
        self.data_manager = ReplayBufferDataManager(self.buffer, self.env,
                                                    self._actor, self.device,
                                                    self.learning_starts,
                                                    self.update_period)

        self.qf_criterion = torch.nn.MSELoss()
        if self.env.action_space.__class__.__name__ == 'Discrete':
            raise ValueError("Action space must be continuous!")

    def loss(self, batch):
        """Loss function."""
        # compute QFunction loss.
        with torch.no_grad():
            target_action = self.target_pi(batch['next_obs']).action
            target_q = self.target_qf(batch['next_obs'], target_action).value
            qtarg = self.reward_scale * batch['reward'].float() + (
                (1.0 - batch['done']) * self.gamma * target_q)

        q = self.qf(batch['obs'], batch['action']).value
        assert qtarg.shape == q.shape
        qf_loss = self.qf_criterion(q, qtarg)

        # compute policy loss
        action = self.pi(batch['obs'], deterministic=True).action
        q = self.qf(batch['obs'], action).value
        pi_loss = -q.mean()

        # log losses
        if self.t % self.log_period < self.update_period:
            logger.add_scalar('loss/qf', qf_loss, self.t, time.time())
            logger.add_scalar('loss/pi', pi_loss, self.t, time.time())
        return pi_loss, qf_loss

    def step(self):
        """Step optimization."""
        self._actor.update_sigma(self.noise_schedule.value(self.t))
        self.t += self.data_manager.step_until_update()
        if self.t % self.target_update_period == 0:
            soft_target_update(self.target_pi, self.pi,
                               self.target_smoothing_coef)
            soft_target_update(self.target_qf, self.qf,
                               self.target_smoothing_coef)

        if self.t % self.update_period == 0:
            batch = self.data_manager.sample(self.batch_size)

            pi_loss, qf_loss = self.loss(batch)

            # update
            self.opt_pi.zero_grad()
            pi_loss.backward()
            self.opt_pi.step()

            self.opt_qf.zero_grad()
            qf_loss.backward()
            self.opt_qf.step()

        return self.t

    def evaluate(self):
        """Evaluate."""
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi.eval()
        misc.set_env_to_eval_mode(eval_env)

        # Eval policy
        os.makedirs(os.path.join(self.logdir, 'eval'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'eval',
                               self.ckptr.format.format(self.t) + '.json')
        stats = rl_evaluate(eval_env, self.pi, self.eval_num_episodes, outfile,
                            self.device)
        logger.add_scalar('eval/mean_episode_reward', stats['mean_reward'],
                          self.t, time.time())
        logger.add_scalar('eval/mean_episode_length', stats['mean_length'],
                          self.t, time.time())

        # Record policy
        os.makedirs(os.path.join(self.logdir, 'video'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'video',
                               self.ckptr.format.format(self.t) + '.mp4')
        rl_record(eval_env, self.pi, self.record_num_episodes, outfile,
                  self.device)

        self.pi.train()
        misc.set_env_to_train_mode(self.env)
        self.data_manager.manual_reset()

    def save(self):
        """Save."""
        state_dict = {
            'pi': self.pi.state_dict(),
            'qf': self.qf.state_dict(),
            'target_pi': self.target_pi.state_dict(),
            'target_qf': self.target_qf.state_dict(),
            'opt_pi': self.opt_pi.state_dict(),
            'opt_qf': self.opt_qf.state_dict(),
            'env': misc.env_state_dict(self.env),
            't': self.t
        }
        buffer_dict = self.buffer.state_dict()
        state_dict['buffer_format'] = nest.get_structure(buffer_dict)
        self.ckptr.save(state_dict, self.t)

        # save buffer separately and only once (because it can be huge)
        np.savez(
            os.path.join(self.ckptr.ckptdir, 'buffer.npz'),
            **{f'{i:04d}': x
               for i, x in enumerate(nest.flatten(buffer_dict))})

    def load(self, t=None):
        """Load."""
        state_dict = self.ckptr.load(t)
        if state_dict is None:
            self.t = 0
            return self.t
        self.pi.load_state_dict(state_dict['pi'])
        self.qf.load_state_dict(state_dict['qf'])
        self.target_pi.load_state_dict(state_dict['target_pi'])
        self.target_qf.load_state_dict(state_dict['target_qf'])

        self.opt_pi.load_state_dict(state_dict['opt_pi'])
        self.opt_qf.load_state_dict(state_dict['opt_qf'])
        misc.env_load_state_dict(self.env, state_dict['env'])
        self.t = state_dict['t']

        buffer_format = state_dict['buffer_format']
        buffer_state = dict(
            np.load(os.path.join(self.ckptr.ckptdir, 'buffer.npz')))
        buffer_state = nest.flatten(buffer_state)
        self.buffer.load_state_dict(
            nest.pack_sequence_as(buffer_state, buffer_format))
        self.data_manager.manual_reset()
        return self.t

    def close(self):
        """Close environment."""
        try:
            self.env.close()
        except Exception:
            pass
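
The critic target in DDPG's loss() is the standard one-step Bellman backup computed under torch.no_grad(). Isolated with stand-in tensors instead of networks, the arithmetic looks like this:

import torch

gamma = 0.99
reward = torch.tensor([1.0, 0.5])
done = torch.tensor([0.0, 1.0])
# Stand-in for target_qf(next_obs, target_pi(next_obs).action).value.
target_q = torch.tensor([10.0, 3.0])

with torch.no_grad():
    qtarg = reward + (1.0 - done) * gamma * target_q   # tensor([10.9000, 0.5000])

# Stand-in for qf(obs, action).value; the MSE loss drives it toward qtarg.
q = torch.tensor([9.0, 1.0], requires_grad=True)
qf_loss = torch.nn.functional.mse_loss(q, qtarg)
qf_loss.backward()
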
Example #6
    def __init__(self,
                 logdir,
                 env_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.RMSprop,
                 buffer_size=100000,
                 frame_stack=1,
                 learning_starts=10000,
                 update_period=1,
                 gamma=0.99,
                 huber_loss=True,
                 exploration_timesteps=1000000,
                 final_eps=0.1,
                 eval_eps=0.05,
                 target_update_period=10000,
                 batch_size=32,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=10):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.frame_stack = frame_stack
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.learning_starts = learning_starts
        self.update_period = update_period
        self.eval_eps = eval_eps
        self.target_update_period = target_update_period - (
            target_update_period % self.update_period)
        self.log_period = log_period
        self.device = torch.device('cuda:0' if gpu and torch.cuda.is_available()
                                   else 'cpu')

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        stacked_env = VecFrameStack(env_fn(nenv=nenv), self.frame_stack)

        self.qf = qf_fn(stacked_env).to(self.device)
        self.qf_targ = qf_fn(stacked_env).to(self.device)
        self.opt = optimizer(self.qf.parameters())
        if huber_loss:
            self.criterion = torch.nn.SmoothL1Loss(reduction='none')
        else:
            self.criterion = torch.nn.MSELoss(reduction='none')
        self.eps_schedule = LinearSchedule(exploration_timesteps, final_eps,
                                           1.0)
        self._actor = EpsilonGreedyActor(self.qf, self.eps_schedule,
                                         self.env.action_space, self.nenv)

        self.buffer = BatchedReplayBuffer(*[
            ReplayBuffer(buffer_size, frame_stack) for _ in range(self.nenv)
        ])
        self.data_manager = ReplayBufferDataManager(self.buffer,
                                                    self.env,
                                                    self._actor,
                                                    self.device,
                                                    self.learning_starts,
                                                    self.update_period)
        self.t = 0
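
EpsilonGreedyActor is referenced above but not defined in the snippet; the action rule it implements can be sketched directly over a batch of Q-values:

import torch

def epsilon_greedy(q_values, epsilon):
    """Greedy actions; each is replaced by a random action with prob. epsilon."""
    greedy = q_values.argmax(dim=-1)
    random_actions = torch.randint(q_values.shape[-1], greedy.shape)
    use_random = torch.rand(greedy.shape) < epsilon
    return torch.where(use_random, random_actions, greedy)

q = torch.tensor([[0.1, 0.9, 0.3],   # one row of Q-values per environment
                  [0.5, 0.2, 0.7]])
print(epsilon_greedy(q, epsilon=0.05))
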
Example #7
class DQN(Algorithm):
    """DQN algorithm."""

    def __init__(self,
                 logdir,
                 env_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.RMSprop,
                 buffer_size=100000,
                 frame_stack=1,
                 learning_starts=10000,
                 update_period=1,
                 gamma=0.99,
                 huber_loss=True,
                 exploration_timesteps=1000000,
                 final_eps=0.1,
                 eval_eps=0.05,
                 target_update_period=10000,
                 batch_size=32,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=10):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.frame_stack = frame_stack
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.learning_starts = learning_starts
        self.update_period = update_period
        self.eval_eps = eval_eps
        self.target_update_period = target_update_period - (
            target_update_period % self.update_period)
        self.log_period = log_period
        self.device = torch.device('cuda:0' if gpu and torch.cuda.is_available()
                                   else 'cpu')

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        stacked_env = VecFrameStack(env_fn(nenv=nenv), self.frame_stack)

        self.qf = qf_fn(stacked_env).to(self.device)
        self.qf_targ = qf_fn(stacked_env).to(self.device)
        self.opt = optimizer(self.qf.parameters())
        if huber_loss:
            self.criterion = torch.nn.SmoothL1Loss(reduction='none')
        else:
            self.criterion = torch.nn.MSELoss(reduction='none')
        self.eps_schedule = LinearSchedule(exploration_timesteps, final_eps,
                                           1.0)
        self._actor = EpsilonGreedyActor(self.qf, self.eps_schedule,
                                         self.env.action_space, self.nenv)

        self.buffer = BatchedReplayBuffer(*[
            ReplayBuffer(buffer_size, frame_stack) for _ in range(self.nenv)
        ])
        self.data_manager = ReplayBufferDataManager(self.buffer,
                                                    self.env,
                                                    self._actor,
                                                    self.device,
                                                    self.learning_starts,
                                                    self.update_period)
        self.t = 0

    def _compute_target(self, rew, next_ob, done):
        qtarg = self.qf_targ(next_ob).max_q
        return rew + (1.0 - done) * self.gamma * qtarg

    def _get_batch(self):
        return self.data_manager.sample(self.batch_size)

    def loss(self, batch):
        """Compute loss."""
        q = self.qf(batch['obs'], batch['action']).value

        with torch.no_grad():
            target = self._compute_target(batch['reward'], batch['next_obs'],
                                          batch['done'])

        assert target.shape == q.shape
        loss = self.criterion(target, q).mean()
        if self.t % self.log_period < self.update_period:
            logger.add_scalar('alg/maxq', torch.max(q).detach().cpu().numpy(),
                              self.t, time.time())
            logger.add_scalar('alg/loss', loss.detach().cpu().numpy(), self.t,
                              time.time())
            logger.add_scalar('alg/epsilon',
                              self.eps_schedule.value(self._actor.t),
                              self.t, time.time())
        return loss

    def step(self):
        """Step."""
        self.t += self.data_manager.step_until_update()
        if self.t % self.target_update_period == 0:
            self.qf_targ.load_state_dict(self.qf.state_dict())

        self.opt.zero_grad()
        loss = self.loss(self._get_batch())
        loss.backward()
        self.opt.step()
        return self.t

    def evaluate(self):
        """Evaluate."""
        eval_env = VecEpsilonGreedy(VecFrameStack(self.env, self.frame_stack),
                                    self.eval_eps)
        self.qf.eval()
        misc.set_env_to_eval_mode(eval_env)

        # Eval policy
        os.makedirs(os.path.join(self.logdir, 'eval'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'eval',
                               self.ckptr.format.format(self.t) + '.json')
        stats = rl_evaluate(eval_env, self.qf, self.eval_num_episodes,
                            outfile, self.device)
        logger.add_scalar('eval/mean_episode_reward', stats['mean_reward'],
                          self.t, time.time())
        logger.add_scalar('eval/mean_episode_length', stats['mean_length'],
                          self.t, time.time())

        # Record policy
        os.makedirs(os.path.join(self.logdir, 'video'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'video',
                               self.ckptr.format.format(self.t) + '.mp4')
        rl_record(eval_env, self.qf, self.record_num_episodes, outfile,
                  self.device)

        self.qf.train()
        misc.set_env_to_train_mode(self.env)
        self.data_manager.manual_reset()

    def save(self):
        """Save."""
        state_dict = {
            'qf': self.qf.state_dict(),
            'qf_targ': self.qf_targ.state_dict(),
            'opt': self.opt.state_dict(),
            '_actor': self._actor.state_dict(),
            'env': misc.env_state_dict(self.env),
            't': self.t
        }
        buffer_dict = self.buffer.state_dict()
        state_dict['buffer_format'] = nest.get_structure(buffer_dict)
        self.ckptr.save(state_dict, self.t)

        # save buffer separately and only once (because it can be huge)
        np.savez(os.path.join(self.ckptr.ckptdir, 'buffer.npz'),
                 **{f'{i:04d}': x for i, x in
                    enumerate(nest.flatten(buffer_dict))})

    def load(self, t=None):
        """Load."""
        state_dict = self.ckptr.load(t)
        if state_dict is None:
            self.t = 0
            return self.t
        self.qf.load_state_dict(state_dict['qf'])
        self.qf_targ.load_state_dict(state_dict['qf_targ'])
        self.opt.load_state_dict(state_dict['opt'])
        self._actor.load_state_dict(state_dict['_actor'])
        misc.env_load_state_dict(self.env, state_dict['env'])
        self.t = state_dict['t']

        buffer_format = state_dict['buffer_format']
        buffer_state = dict(np.load(os.path.join(self.ckptr.ckptdir,
                                                 'buffer.npz')))
        buffer_state = nest.flatten(buffer_state)
        self.buffer.load_state_dict(nest.pack_sequence_as(buffer_state,
                                                          buffer_format))
        self.data_manager.manual_reset()
        return self.t

    def close(self):
        """Close environment."""
        try:
            self.env.close()
        except Exception:
            pass
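
The huber_loss flag above only switches the TD-error criterion between SmoothL1Loss and MSELoss; the practical difference is how strongly large errors are penalized. A quick numeric comparison:

import torch

pred = torch.tensor([0.0, 0.0, 0.0])
target = torch.tensor([0.5, 2.0, 10.0])

huber = torch.nn.SmoothL1Loss(reduction='none')(pred, target)
mse = torch.nn.MSELoss(reduction='none')(pred, target)
print(huber)  # 0.125, 1.5, 9.5   -> linear beyond |error| = 1
print(mse)    # 0.25, 4.0, 100.0  -> quadratic everywhere
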
Example #8
    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 buffer_size=int(1e6),
                 frame_stack=1,
                 learning_starts=10000,
                 update_period=1,
                 batch_size=256,
                 lr=3e-4,
                 policy_update_period=2,
                 target_smoothing_coef=0.005,
                 reward_scale=1,
                 gamma=0.99,
                 exploration_noise=0.1,
                 policy_noise=0.2,
                 policy_noise_clip=0.5,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=1000):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.frame_stack = frame_stack
        self.learning_starts = learning_starts
        self.update_period = update_period
        if policy_update_period < self.update_period:
            self.policy_update_period = self.update_period
        else:
            self.policy_update_period = policy_update_period - (
                policy_update_period % self.update_period)
        self.reward_scale = reward_scale
        self.target_smoothing_coef = target_smoothing_coef
        self.exploration_noise = exploration_noise
        self.policy_noise = policy_noise
        self.policy_noise_clip = policy_noise_clip
        self.log_period = log_period

        self.device = torch.device('cuda:0' if gpu and torch.cuda.is_available()
                                   else 'cpu')

        self.policy_fn = policy_fn
        self.qf_fn = qf_fn
        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi = policy_fn(eval_env)
        self.qf1 = qf_fn(eval_env)
        self.qf2 = qf_fn(eval_env)
        self.target_pi = policy_fn(eval_env)
        self.target_qf1 = qf_fn(eval_env)
        self.target_qf2 = qf_fn(eval_env)

        self.pi.to(self.device)
        self.qf1.to(self.device)
        self.qf2.to(self.device)
        self.target_pi.to(self.device)
        self.target_qf1.to(self.device)
        self.target_qf2.to(self.device)

        self.optimizer = optimizer
        self.lr = lr
        self.opt_pi = optimizer(self.pi.parameters(), lr=lr)
        self.opt_qf = optimizer(list(self.qf1.parameters())
                                + list(self.qf2.parameters()), lr=lr)

        self.target_pi.load_state_dict(self.pi.state_dict())
        self.target_qf1.load_state_dict(self.qf1.state_dict())
        self.target_qf2.load_state_dict(self.qf2.state_dict())

        self._actor = TD3Actor(self.pi, self.env.action_space,
                               exploration_noise)
        self.buffer = BatchedReplayBuffer(*[
            ReplayBuffer(buffer_size, frame_stack) for _ in range(self.nenv)
        ])
        self.data_manager = ReplayBufferDataManager(self.buffer,
                                                    self.env,
                                                    self._actor,
                                                    self.device,
                                                    self.learning_starts,
                                                    self.update_period)

        self.qf_criterion = torch.nn.MSELoss()
        if self.env.action_space.__class__.__name__ == 'Discrete':
            raise ValueError("Action space must be continuous!")

        self.low = torch.from_numpy(self.env.action_space.low).to(self.device)
        self.high = torch.from_numpy(self.env.action_space.high).to(self.device)

        self.t = 0
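
Target policy smoothing in the TD3 loss (Example #9 below) adds clipped Gaussian noise to the target action before bootstrapping; isolated with a stand-in action tensor:

import torch

policy_noise = 0.2
policy_noise_clip = 0.5
target_action = torch.tensor([[0.3, -0.8],   # stand-in for target_pi(next_obs).action
                              [0.9, 0.1]])

noise = (torch.randn_like(target_action) * policy_noise).clamp(
    -policy_noise_clip, policy_noise_clip)
smoothed_action = (target_action + noise).clamp(-1.0, 1.0)
print(smoothed_action)
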
Example #9
class TD3(Algorithm):
    """TD3 algorithm."""

    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 buffer_size=int(1e6),
                 frame_stack=1,
                 learning_starts=10000,
                 update_period=1,
                 batch_size=256,
                 lr=3e-4,
                 policy_update_period=2,
                 target_smoothing_coef=0.005,
                 reward_scale=1,
                 gamma=0.99,
                 exploration_noise=0.1,
                 policy_noise=0.2,
                 policy_noise_clip=0.5,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=1000):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.frame_stack = frame_stack
        self.learning_starts = learning_starts
        self.update_period = update_period
        if policy_update_period < self.update_period:
            self.policy_update_period = self.update_period
        else:
            self.policy_update_period = policy_update_period - (
                policy_update_period % self.update_period)
        self.reward_scale = reward_scale
        self.target_smoothing_coef = target_smoothing_coef
        self.exploration_noise = exploration_noise
        self.policy_noise = policy_noise
        self.policy_noise_clip = policy_noise_clip
        self.log_period = log_period

        self.device = torch.device('cuda:0' if gpu and torch.cuda.is_available()
                                   else 'cpu')

        self.policy_fn = policy_fn
        self.qf_fn = qf_fn
        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi = policy_fn(eval_env)
        self.qf1 = qf_fn(eval_env)
        self.qf2 = qf_fn(eval_env)
        self.target_pi = policy_fn(eval_env)
        self.target_qf1 = qf_fn(eval_env)
        self.target_qf2 = qf_fn(eval_env)

        self.pi.to(self.device)
        self.qf1.to(self.device)
        self.qf2.to(self.device)
        self.target_pi.to(self.device)
        self.target_qf1.to(self.device)
        self.target_qf2.to(self.device)

        self.optimizer = optimizer
        self.lr = lr
        self.opt_pi = optimizer(self.pi.parameters(), lr=lr)
        self.opt_qf = optimizer(list(self.qf1.parameters())
                                + list(self.qf2.parameters()), lr=lr)

        self.target_pi.load_state_dict(self.pi.state_dict())
        self.target_qf1.load_state_dict(self.qf1.state_dict())
        self.target_qf2.load_state_dict(self.qf2.state_dict())

        self._actor = TD3Actor(self.pi, self.env.action_space,
                               exploration_noise)
        self.buffer = BatchedReplayBuffer(*[
            ReplayBuffer(buffer_size, frame_stack) for _ in range(self.nenv)
        ])
        self.data_manager = ReplayBufferDataManager(self.buffer,
                                                    self.env,
                                                    self._actor,
                                                    self.device,
                                                    self.learning_starts,
                                                    self.update_period)

        self.qf_criterion = torch.nn.MSELoss()
        if self.env.action_space.__class__.__name__ == 'Discrete':
            raise ValueError("Action space must be continuous!")

        self.low = torch.from_numpy(self.env.action_space.low).to(self.device)
        self.high = torch.from_numpy(self.env.action_space.high).to(self.device)

        self.t = 0

    def loss(self, batch):
        """Loss function."""
        # compute QFunction loss.
        with torch.no_grad():
            target_action = self.target_pi(batch['next_obs']).action
            noise = (
                torch.randn_like(target_action) * self.policy_noise
            ).clamp(-self.policy_noise_clip, self.policy_noise_clip)
            target_action = (target_action + noise).clamp(-1., 1.)
            target_q1 = self.target_qf1(batch['next_obs'], target_action).value
            target_q2 = self.target_qf2(batch['next_obs'], target_action).value
            target_q = torch.min(target_q1, target_q2)
            qtarg = self.reward_scale * batch['reward'].float() + (
                    (1.0 - batch['done']) * self.gamma * target_q)

        q1 = self.qf1(batch['obs'], batch['action']).value
        q2 = self.qf2(batch['obs'], batch['action']).value
        assert qtarg.shape == q1.shape
        assert qtarg.shape == q2.shape
        qf_loss = self.qf_criterion(q1, qtarg) + self.qf_criterion(q2, qtarg)

        # compute policy loss
        if self.t % self.policy_update_period == 0:
            action = self.pi(batch['obs'], deterministic=True).action
            q = self.qf1(batch['obs'], action).value
            pi_loss = -q.mean()
        else:
            pi_loss = torch.zeros_like(qf_loss)

        # log losses
        if self.t % self.log_period < self.update_period:
            logger.add_scalar('loss/qf', qf_loss, self.t, time.time())
            if self.t % self.policy_update_period == 0:
                logger.add_scalar('loss/pi', pi_loss, self.t, time.time())
        return pi_loss, qf_loss

    def step(self):
        """Step optimization."""
        self.t += self.data_manager.step_until_update()
        batch = self.data_manager.sample(self.batch_size)

        pi_loss, qf_loss = self.loss(batch)

        # update
        self.opt_qf.zero_grad()
        qf_loss.backward()
        self.opt_qf.step()

        if self.t % self.policy_update_period == 0:
            self.opt_pi.zero_grad()
            pi_loss.backward()
            self.opt_pi.step()

            # update target networks
            soft_target_update(self.target_pi, self.pi,
                               self.target_smoothing_coef)
            soft_target_update(self.target_qf1, self.qf1,
                               self.target_smoothing_coef)
            soft_target_update(self.target_qf2, self.qf2,
                               self.target_smoothing_coef)
        return self.t

    def evaluate(self):
        """Evaluate."""
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi.eval()
        misc.set_env_to_eval_mode(eval_env)

        # Eval policy
        os.makedirs(os.path.join(self.logdir, 'eval'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'eval',
                               self.ckptr.format.format(self.t) + '.json')
        stats = rl_evaluate(eval_env, self.pi, self.eval_num_episodes,
                            outfile, self.device)
        logger.add_scalar('eval/mean_episode_reward', stats['mean_reward'],
                          self.t, time.time())
        logger.add_scalar('eval/mean_episode_length', stats['mean_length'],
                          self.t, time.time())

        # Record policy
        os.makedirs(os.path.join(self.logdir, 'video'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'video',
                               self.ckptr.format.format(self.t) + '.mp4')
        rl_record(eval_env, self.pi, self.record_num_episodes, outfile,
                  self.device)

        self.pi.train()
        misc.set_env_to_train_mode(self.env)
        self.data_manager.manual_reset()

    def save(self):
        """Save."""
        state_dict = {
            'pi': self.pi.state_dict(),
            'qf1': self.qf1.state_dict(),
            'qf2': self.qf2.state_dict(),
            'target_pi': self.target_pi.state_dict(),
            'target_qf1': self.target_qf1.state_dict(),
            'target_qf2': self.target_qf2.state_dict(),
            'opt_pi': self.opt_pi.state_dict(),
            'opt_qf': self.opt_qf.state_dict(),
            'env': misc.env_state_dict(self.env),
            't': self.t
        }
        buffer_dict = self.buffer.state_dict()
        state_dict['buffer_format'] = nest.get_structure(buffer_dict)
        self.ckptr.save(state_dict, self.t)

        # save buffer separately and only once (because it can be huge)
        np.savez(os.path.join(self.ckptr.ckptdir, 'buffer.npz'),
                 **{f'{i:04d}': x for i, x in
                    enumerate(nest.flatten(buffer_dict))})

    def load(self, t=None):
        """Load."""
        state_dict = self.ckptr.load(t)
        if state_dict is None:
            self.t = 0
            return self.t
        self.pi.load_state_dict(state_dict['pi'])
        self.qf1.load_state_dict(state_dict['qf1'])
        self.qf2.load_state_dict(state_dict['qf2'])
        self.target_pi.load_state_dict(state_dict['target_pi'])
        self.target_qf1.load_state_dict(state_dict['target_qf1'])
        self.target_qf2.load_state_dict(state_dict['target_qf2'])
        self.opt_pi.load_state_dict(state_dict['opt_pi'])
        self.opt_qf.load_state_dict(state_dict['opt_qf'])
        misc.env_load_state_dict(self.env, state_dict['env'])
        self.t = state_dict['t']

        buffer_format = state_dict['buffer_format']
        buffer_state = dict(np.load(os.path.join(self.ckptr.ckptdir,
                                                 'buffer.npz')))
        buffer_state = nest.flatten(buffer_state)
        self.buffer.load_state_dict(nest.pack_sequence_as(buffer_state,
                                                          buffer_format))
        self.data_manager.manual_reset()
        return self.t

    def close(self):
        """Close environment."""
        try:
            self.env.close()
        except Exception:
            pass
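
All of the save()/load() pairs above serialize the flattened replay buffer into a single buffer.npz keyed by zero-padded indices. The same round trip with plain arrays standing in for nest.flatten(buffer_dict):

import io
import numpy as np

# Stand-in for nest.flatten(buffer_dict) in the save() methods above.
flat_buffer = [np.arange(4), np.ones((2, 2)), np.zeros(3)]

f = io.BytesIO()  # the real code writes to os.path.join(ckptdir, 'buffer.npz')
np.savez(f, **{f'{i:04d}': x for i, x in enumerate(flat_buffer)})
f.seek(0)

loaded = np.load(f)
# Zero-padded keys sort back into their original order.
restored = [loaded[k] for k in sorted(loaded.files)]
assert all(np.array_equal(a, b) for a, b in zip(flat_buffer, restored))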