Example #1
class Runner(object):
    def __init__(self,
                 net,
                 env,
                 params,
                 is_cuda=True,
                 seed=42,
                 log_dir=abspath("/data/patrik")):
        super().__init__()

        # constants
        self.timestamp = strftime("%Y-%m-%d %H_%M_%S", gmtime())
        self.seed = seed
        self.is_cuda = torch.cuda.is_available() and is_cuda

        # parameters
        self.params = params
        """Logger"""
        self.logger = TemporalLogger(self.params.env_name, self.timestamp,
                                     log_dir, *["rewards", "features"])
        self.checkpointer = AgentCheckpointer(self.params.env_name,
                                              self.params.num_updates,
                                              self.timestamp)
        """Environment"""
        self.env = env

        self.storage = RolloutStorage(self.params.rollout_size,
                                      self.params.num_envs,
                                      self.env.observation_space.shape[0:-1],
                                      self.params.n_stack,
                                      is_cuda=self.is_cuda)
        """Network"""
        self.net = net

        if self.is_cuda:
            self.net = self.net.cuda()

    def train(self):
        """Environment reset"""
        obs = self.env.reset()
        self.storage.states[0].copy_(self.storage.obs2tensor(obs))

        for num_update in range(self.params.num_updates):

            final_value, entropy = self.episode_rollout()

            self.net.optimizer.zero_grad()
            """ICM prediction """
            # tensors for the curiosity-based loss
            # feature, feature_pred: fwd_loss
            # a_t_pred: inv_loss
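            # For reference, a sketch of the standard ICM loss (Pathak et al.,
            # 2017); the exact form inside self.net.icm is not shown here:
            #   fwd_loss = 0.5 * ||phi(s_{t+1}) - phi_pred(s_t, a_t)||^2
            #   inv_loss = CrossEntropy(a_t_pred, a_t)
            #   icm_loss = (1 - beta) * inv_loss + beta * fwd_loss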
            icm_loss = self.net.icm(
                self.params.num_envs,
                self.storage.states.view(-1, self.params.n_stack,
                                         *self.storage.frame_shape),
                self.storage.actions.view(-1))
            """Assemble loss"""
            a2c_loss, rewards = self.storage.a2c_loss(
                final_value, entropy, self.params.value_coeff,
                self.params.entropy_coeff)

            loss = a2c_loss + icm_loss

            loss.backward(retain_graph=False)

            # gradient clipping
            nn.utils.clip_grad_norm_(self.net.parameters(),
                                     self.params.max_grad_norm)
            """Log rewards & features"""
            if len(self.storage.episode_rewards) > 1:
                self.logger.log(
                    **{
                        "rewards":
                        np.array(self.storage.episode_rewards),
                        "features":
                        self.storage.features[-1].detach().cpu().numpy()
                    })

            self.net.optimizer.step()

            # the storage holds a lot of data that keeps the computation graph
            # alive and lets it grow out of memory, so it is crucial to reset it
            self.storage.after_update()

            if len(self.storage.episode_rewards) > 1:
                self.checkpointer.checkpoint(loss,
                                             self.storage.episode_rewards,
                                             self.net)

            if num_update % 1000 == 0:
                print("current loss: ", loss.item(), " at update #",
                      num_update)
                self.storage.print_reward_stats()
                # torch.save(self.net.state_dict(), "a2c_time_log_no_norm")

        self.env.close()

        self.logger.save(*["rewards", "features"])
        self.params.save(self.logger.data_dir, self.timestamp)

    def episode_rollout(self):
        episode_entropy = 0
        for step in range(self.params.rollout_size):
            """Interact with the environments """
            # call A2C
            a_t, log_p_a_t, entropy, value, a2c_features = self.net.a2c.get_action(
                self.storage.get_state(step))
            # accumulate episode entropy
            episode_entropy += entropy

            # interact
            obs, rewards, dones, infos = self.env.step(a_t.cpu().numpy())

            # save episode reward
            self.storage.log_episode_rewards(infos)

            self.storage.insert(step, rewards, obs, a_t, log_p_a_t, value,
                                dones, a2c_features)
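            # reset the recurrent (LSTM) state of environments whose episode
            # just ended, so hidden state does not leak across episodes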
            self.net.a2c.reset_recurrent_buffers(reset_indices=dones)

        # Note:
        # get the estimate of the final reward
        # that's why we have the CRITIC --> estimate the final reward
        # detach (torch.no_grad), as the final value is only used as a
        # bootstrap target and should not receive gradients
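        # Sketch of the n-step A2C return this value bootstraps (assumed to be
        # what storage.a2c_loss computes), with T = rollout_size:
        #   R_t = r_t + gamma * r_{t+1} + ... + gamma^(T-t) * V(s_T)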
        with torch.no_grad():
            _, _, _, final_value, final_features = self.net.a2c.get_action(
                self.storage.get_state(step + 1))

        self.storage.features[step + 1].copy_(final_features)

        return final_value, episode_entropy
Example #2
class A2CTrial(PyTorchTrial):
    def __init__(self, trial_context: PyTorchTrialContext) -> None:
        self.context = trial_context
        self.download_directory = f"/tmp/data-rank{self.context.distributed.get_rank()}"
        # self.logger = TorchWriter()
        self.n_stack = self.context.get_hparam("n_stack")
        self.env_name = self.context.get_hparam("env_name")
        self.num_envs = self.context.get_hparam("num_envs")
        self.rollout_size = self.context.get_hparam("rollout_size")
        self.curiousity = self.context.get_hparam("curiousity")
        self.lr = self.context.get_hparam("lr")
        self.icm_beta = self.context.get_hparam("icm_beta")
        self.value_coeff = self.context.get_hparam("value_coeff")
        self.entropy_coeff = self.context.get_hparam("entropy_coeff")
        self.max_grad_norm = self.context.get_hparam("max_grad_norm")

        env = make_atari_env(self.env_name, num_env=self.num_envs, seed=42)
        self.env = VecFrameStack(env, n_stack=self.n_stack)
        eval_env = make_atari_env(self.env_name, num_env=1, seed=42)
        self.eval_env = VecFrameStack(eval_env, n_stack=self.n_stack)

        # constants
        self.in_size = self.context.get_hparam("in_size")  # in_size
        self.num_actions = env.action_space.n

        def init_(m):
            return init(m, nn.init.orthogonal_,
                        lambda x: nn.init.constant_(x, 0))
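        # init_ presumably wraps an `init(module, weight_init, bias_init)`
        # helper: orthogonal weights and zero biases for the linear
        # actor/critic heads created below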

        self.feat_enc_net = self.context.Model(
            FeatureEncoderNet(self.n_stack, self.in_size))
        self.actor = self.context.Model(
            init_(nn.Linear(self.feat_enc_net.hidden_size, self.num_actions)))
        self.critic = self.context.Model(
            init_(nn.Linear(self.feat_enc_net.hidden_size, 1)))
        self.set_recurrent_buffers(self.num_envs)

        params = list(self.feat_enc_net.parameters()) + list(
            self.actor.parameters()) + list(self.critic.parameters())
        self.opt = self.context.Optimizer(torch.optim.Adam(params, self.lr))

        self.is_cuda = torch.cuda.is_available()
        self.storage = RolloutStorage(self.rollout_size,
                                      self.num_envs,
                                      self.env.observation_space.shape[0:-1],
                                      self.n_stack,
                                      is_cuda=self.is_cuda,
                                      value_coeff=self.value_coeff,
                                      entropy_coeff=self.entropy_coeff)

        obs = self.env.reset()
        self.storage.states[0].copy_(self.storage.obs2tensor(obs))

        self.writer = SummaryWriter(log_dir="/tmp/tensorboard")
        self.global_eval_count = 0

    def set_recurrent_buffers(self, buf_size):
        self.feat_enc_net.reset_lstm(buf_size=buf_size)

    def reset_recurrent_buffers(self, reset_indices):
        self.feat_enc_net.reset_lstm(reset_indices=reset_indices)

    def build_training_data_loader(self) -> DataLoader:
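        # This MNIST loader appears to be a placeholder required by the
        # PyTorchTrial interface; train_batch() below ignores the batch and
        # collects training data from the environment via episode_rollout().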
        ds = torchvision.datasets.MNIST(
            self.download_directory,
            train=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                # These are the precomputed mean and standard deviation of the
                # MNIST data; this normalizes the data to have zero mean and unit
                # standard deviation.
                transforms.Normalize((0.1307, ), (0.3081, )),
            ]),
            download=True)
        return DataLoader(ds, batch_size=1)

    def build_validation_data_loader(self) -> DataLoader:
        ds = torchvision.datasets.MNIST(
            self.download_directory,
            train=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                # These are the precomputed mean and standard deviation of the
                # MNIST data; this normalizes the data to have zero mean and unit
                # standard deviation.
                transforms.Normalize((0.1307, ), (0.3081, )),
            ]),
            download=True)
        return DataLoader(ds, batch_size=1)

    def train_batch(self, batch: TorchData, model: nn.Module, epoch_idx: int,
                    batch_idx: int) -> Dict[str, torch.Tensor]:
        final_value, entropy = self.episode_rollout()
        self.opt.zero_grad()
        total_loss, value_loss, policy_loss, entropy_loss = self.storage.a2c_loss(
            final_value, entropy)
        self.context.backward(total_loss)

        def clip_grads(parameters):
            torch.nn.utils.clip_grad_norm_(parameters, self.max_grad_norm)

        self.context.step_optimizer(self.opt, clip_grads)
        self.storage.after_update()

        return {
            'loss': total_loss,
            'value_loss': value_loss,
            'policy_loss': policy_loss,
            'entropy_loss': entropy_loss
        }

    def get_action(self, state, deterministic=False):
        feature = self.feat_enc_net(state)

        # calculate policy and value function
        policy = self.actor(feature)
        value = torch.squeeze(self.critic(feature))

        action_prob = F.softmax(policy, dim=-1)
        cat = Categorical(action_prob)
        if not deterministic:
            action = cat.sample()
            return (action, cat.log_prob(action), cat.entropy().mean(), value,
                    feature)
        else:
            action = np.argmax(action_prob.detach().cpu().numpy(), axis=1)
            return (action, [], [], value, feature)

    def episode_rollout(self):
        episode_entropy = 0
        for step in range(self.rollout_size):
            """Interact with the environments """
            # call A2C
            a_t, log_p_a_t, entropy, value, a2c_features = self.get_action(
                self.storage.get_state(step))
            # accumulate episode entropy
            episode_entropy += entropy

            # interact
            obs, rewards, dones, infos = self.env.step(a_t.cpu().numpy())

            # save episode reward
            self.storage.log_episode_rewards(infos)

            self.storage.insert(step, rewards, obs, a_t, log_p_a_t, value,
                                dones)
            self.reset_recurrent_buffers(reset_indices=dones)

        # Note:
        # get the estimate of the final reward
        # that's why we have the CRITIC --> estimate the final reward
        # detach (torch.no_grad), as the final value is only used as a
        # bootstrap target and should not receive gradients
        with torch.no_grad():
            state = self.storage.get_state(step + 1)
            final_features = self.feat_enc_net(state)
            final_value = torch.squeeze(self.critic(final_features))

        return final_value, episode_entropy

    def evaluate_full_dataset(self, data_loader, model) -> Dict[str, Any]:
        self.global_eval_count += 1
        episode_rewards, episode_lengths = [], []
        n_eval_episodes = 10
        self.set_recurrent_buffers(1)
        frames = []
        with torch.no_grad():
            for episode in range(n_eval_episodes):
                obs = self.eval_env.reset()
                done, state = False, None
                episode_reward = 0.0
                episode_length = 0
                while not done:
                    state = self.storage.obs2tensor(obs)
                    if episode == 0:
                        frame = torch.unsqueeze(torch.squeeze(state)[0],
                                                0).detach()
                        frames.append(frame)
                    action, _, _, _, _ = self.get_action(state,
                                                         deterministic=True)
                    obs, reward, done, _info = self.eval_env.step(action)
                    reward = reward[0]
                    done = done[0]
                    episode_reward += reward
                    episode_length += 1
                if episode == 0:
                    video = torch.unsqueeze(torch.stack(frames), 0)
                    self.writer.add_video('policy',
                                          video,
                                          global_step=self.global_eval_count,
                                          fps=20)
                episode_rewards.append(episode_reward)
                episode_lengths.append(episode_length)

        mean_reward = np.mean(episode_rewards)
        std_reward = np.std(episode_rewards)
        self.set_recurrent_buffers(self.num_envs)
        return {'mean_reward': mean_reward}
Example #3
class Runner(object):
    def __init__(self,
                 net,
                 env,
                 num_envs,
                 n_stack,
                 rollout_size=5,
                 num_updates=2500000,
                 max_grad_norm=0.5,
                 value_coeff=0.5,
                 entropy_coeff=0.02,
                 tensorboard_log=False,
                 log_path="./log",
                 is_cuda=True,
                 seed=42):
        super().__init__()

        # constants
        self.num_envs = num_envs
        self.rollout_size = rollout_size
        self.num_updates = num_updates
        self.n_stack = n_stack
        self.seed = seed

        self.max_grad_norm = max_grad_norm

        # loss scaling coefficients
        self.is_cuda = torch.cuda.is_available() and is_cuda

        # objects
        """Tensorboard logger"""
        self.writer = SummaryWriter(
            comment="statistics",
            log_dir=log_path) if tensorboard_log else None
        """Environment"""
        self.env = env

        self.storage = RolloutStorage(self.rollout_size,
                                      self.num_envs,
                                      self.env.observation_space.shape[0:-1],
                                      self.n_stack,
                                      is_cuda=self.is_cuda,
                                      value_coeff=value_coeff,
                                      entropy_coeff=entropy_coeff,
                                      writer=self.writer)
        """Network"""
        self.net = net
        self.net.a2c.writer = self.writer

        if self.is_cuda:
            self.net = self.net.cuda()

        # self.writer.add_graph(self.net, input_to_model=(self.storage.states[0],)) --> not working for LSTMCell

    def train(self):
        """Environment reset"""
        obs = self.env.reset()
        self.storage.states[0].copy_(self.storage.obs2tensor(obs))
        best_loss = np.inf

        for num_update in range(self.num_updates):

            final_value, entropy = self.episode_rollout()

            self.net.optimizer.zero_grad()
            """Assemble loss"""
            loss = self.storage.a2c_loss(final_value, entropy)
            loss.backward(retain_graph=False)

            # gradient clipping
            nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)

            if self.writer is not None:
                self.writer.add_scalar("loss", loss.item())

            self.net.optimizer.step()

            # the storage holds a lot of data that keeps the computation graph
            # alive and lets it grow out of memory, so it is crucial to reset it
            self.storage.after_update()

            if loss < best_loss:
                best_loss = loss.item()
                print("model saved with best loss: ", best_loss,
                      " at update #", num_update)
                torch.save(self.net.state_dict(), "a2c_best_loss")

            elif num_update % 10 == 0:
                print("current loss: ", loss.item(), " at update #",
                      num_update)
                self.storage.print_reward_stats()

            # independent `if`: every multiple of 100 is also a multiple of 10,
            # so an `elif` here would never fire
            if num_update % 100 == 0:
                torch.save(self.net.state_dict(), "a2c_time_log_no_norm")

            if self.writer is not None and len(
                    self.storage.episode_rewards) > 1:
                self.writer.add_histogram(
                    "episode_rewards",
                    torch.tensor(self.storage.episode_rewards))

        self.env.close()

    def episode_rollout(self):
        episode_entropy = 0
        for step in range(self.rollout_size):
            """Interact with the environments """
            # call A2C
            a_t, log_p_a_t, entropy, value, a2c_features = self.net.a2c.get_action(
                self.storage.get_state(step))
            # accumulate episode entropy
            episode_entropy += entropy

            # interact
            obs, rewards, dones, infos = self.env.step(a_t.cpu().numpy())

            # save episode reward
            self.storage.log_episode_rewards(infos)

            self.storage.insert(step, rewards, obs, a_t, log_p_a_t, value,
                                dones)
            self.net.a2c.reset_recurrent_buffers(reset_indices=dones)

        # Note:
        # get the estimate of the final reward
        # that's why we have the CRITIC --> estimate the final reward
        # detach (torch.no_grad), as the final value is only used as a
        # bootstrap target and should not receive gradients
        with torch.no_grad():
            _, _, _, final_value, final_features = self.net.a2c.get_action(
                self.storage.get_state(step + 1))

        return final_value, episode_entropy
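
A minimal usage sketch for the Runner in Example #3, assuming a hypothetical A2CNet wrapper that exposes the .a2c policy (providing get_action and reset_recurrent_buffers) and an .optimizer attribute, as Runner.train() expects. make_atari_env and VecFrameStack are the helpers of the same names used in Example #2 (assumed here to come from stable-baselines v2); the environment id and hyperparameter values are illustrative only.

from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack

NUM_ENVS = 4
N_STACK = 4

# vectorized, frame-stacked Atari environment, set up as in Example #2
env = make_atari_env("PongNoFrameskip-v4", num_env=NUM_ENVS, seed=42)
env = VecFrameStack(env, n_stack=N_STACK)

# hypothetical network wrapper: must expose .a2c (with get_action and
# reset_recurrent_buffers) and .optimizer, which Runner.train() relies on
net = A2CNet(n_stack=N_STACK, num_actions=env.action_space.n)

runner = Runner(net, env, NUM_ENVS, N_STACK,
                rollout_size=5, tensorboard_log=True)
runner.train()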