Example #1
    def step(self):
        config = self.config
        storage = VPGStorageBuffer(config.rollout_length)
        states = self.states
        for _ in range(config.rollout_length):
            action_tr, log_prob_tr, entropy_tr, v_tr = self.network(
                config.state_normalizer(states))
            next_states, rewards, terminals, _ = self.task.step(
                toNumpy(action_tr))
            self.online_rewards += rewards
            rewards = config.reward_normalizer(rewards)
            for i, terminal in enumerate(terminals):
                if terminals[i]:
                    self.episode_rewards.append(self.online_rewards[i])
                    self.online_rewards[i] = 0
            storage.store_next(states=toTensor(states),
                               actions=action_tr,
                               values=v_tr,
                               log_pi=log_prob_tr,
                               entropy=entropy_tr,
                               rewards=toTensor(rewards).unsqueeze(-1),
                               terminals=toTensor(1 - terminals).unsqueeze(-1))
            states = next_states

        self.states = states
        action_tr, log_prob_tr, entropy_tr, v_tr = self.network(
            config.state_normalizer(states))
        storage.values.append(v_tr)

        advantages = toTensor(np.zeros((config.num_workers, 1)))
        returns = v_tr.detach()
        for i in reversed(range(config.rollout_length)):
            returns = storage.rewards[i] + config.discount * storage.terminals[i] * returns
            if not config.use_gae:
                advantages = returns - storage.values[i]
            else:
                td_error = storage.rewards[i] + config.discount * storage.terminals[i] * storage.values[i + 1] \
                           - storage.values[i]
                advantages = (storage.terminals[i] * config.gae_tau * config.discount * advantages +
                              td_error)
            storage.advantages[i] = advantages.detach()
            storage.returns[i] = returns.detach()

        log_prob, entropy, values, returns, advantages = storage.cat(
            ['log_pi', 'entropy', 'values', 'returns', 'advantages'])
        advantages = (advantages - advantages.mean()) / advantages.std()

        # Vanilla policy gradient: gradients flow through log_prob, entropy and
        # values; the return and advantage targets were detached above.
        policy_loss = -(log_prob * advantages).mean()
        value_loss = 0.5 * (returns - values).pow(2).mean()
        entropy_loss = entropy.mean()
        loss = policy_loss - config.entropy_weight * entropy_loss + config.value_loss_weight * value_loss
        self.actor_optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.network.parameters(), config.gradient_clip)
        self.actor_optimizer.step()

        self.total_steps += config.rollout_length * config.num_workers
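
The backward pass over the rollout in Example #1 (and again in Example #8) computes discounted returns and, when use_gae is set, generalized advantage estimates. As a reference, here is a minimal standalone sketch of the same recursion in plain NumPy; the function and its argument names are illustrative, not part of the original code.

import numpy as np

def compute_gae(rewards, values, masks, discount=0.99, gae_tau=0.95):
    """Backward pass over one rollout.

    rewards, masks: shape (T,); values: shape (T + 1,) including the
    bootstrap value of the final state; masks[t] is 0 when the episode
    terminated at step t, else 1.
    Returns (returns, advantages), each of shape (T,).
    """
    T = len(rewards)
    returns = np.zeros(T)
    advantages = np.zeros(T)
    ret = values[-1]
    adv = 0.0
    for t in reversed(range(T)):
        # discounted return used as the value-function target
        ret = rewards[t] + discount * masks[t] * ret
        # GAE: discounted sum of TD errors
        td_error = rewards[t] + discount * masks[t] * values[t + 1] - values[t]
        adv = discount * gae_tau * masks[t] * adv + td_error
        returns[t] = ret
        advantages[t] = adv
    return returns, advantages

The masks array plays the role of storage.terminals above: it is 0 at steps where the episode ended, so neither returns nor advantages leak across episode boundaries.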
Example #2
 def eval_step(self, state):
     self.config.state_normalizer.set_read_only()
     state = self.config.state_normalizer(state)
     q = self.network(state)
     action = np.argmax(toNumpy(q))
     self.config.state_normalizer.unset_read_only()
     return action
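
set_read_only/unset_read_only in Example #2 indicate a state normalizer that keeps running statistics and freezes them during evaluation. Only those two method names come from the example; the sketch below shows one plausible way such a normalizer could be implemented.

import numpy as np

class RunningMeanStdNormalizer:
    """Normalizes states with running mean/std; updates can be frozen."""

    def __init__(self, eps=1e-8):
        self.mean = None
        self.var = None
        self.count = eps
        self.read_only = False

    def set_read_only(self):
        self.read_only = True

    def unset_read_only(self):
        self.read_only = False

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        if self.mean is None:
            self.mean = np.zeros(x.shape[-1])
            self.var = np.ones(x.shape[-1])
        if not self.read_only:
            batch = x.reshape(-1, x.shape[-1])
            batch_mean = batch.mean(axis=0)
            batch_var = batch.var(axis=0)
            batch_count = batch.shape[0]
            # merge batch statistics into the running statistics
            delta = batch_mean - self.mean
            total = self.count + batch_count
            m_a = self.var * self.count
            m_b = batch_var * batch_count
            self.mean = self.mean + delta * batch_count / total
            self.var = (m_a + m_b + delta ** 2 * self.count * batch_count / total) / total
            self.count = total
        return (x - self.mean) / np.sqrt(self.var + 1e-8)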
Example #3
    def step(self):
        config = self.config
        actions = self.network(self.states)
        actions = toNumpy(actions)
        actions += self.random_process.sample()
        next_states, rewards, dones, _ = self.task.step(actions)
        next_states = self.config.state_normalizer(next_states)
        rewards = self.config.reward_normalizer(rewards)
        self.replay.store([
            self.states, actions, rewards, next_states,
            dones.astype(np.uint8)
        ])
        if dones[0]:
            self.random_process.reset_states()
        self.states = next_states
        self.total_steps += 1

        if self.replay.size >= config.min_memory_size:
            experiences = self.replay.sample(config.batch_size)
            states, actions, rewards, next_states, terminals = experiences
            states = states.squeeze(1)
            actions = actions.squeeze(1)
            # reshape rewards/terminals to (batch, 1) so they broadcast
            # against the critic output q_next
            rewards = toTensor(rewards).unsqueeze(-1)
            next_states = next_states.squeeze(1)
            terminals = toTensor(terminals).unsqueeze(-1)

            phi_next = self.target_network.feature(next_states)
            a_next = self.target_network.actor(phi_next)
            q_next = self.target_network.critic(phi_next, a_next)
            q_next = config.discount * q_next * (1 - terminals)
            q_next.add_(rewards)
            q_next = q_next.detach()
            phi = self.network.feature(states)
            q = self.network.critic(phi, toTensor(actions))
            critic_loss = (q - q_next).pow(2).mul(0.5).sum(-1).mean()

            self.network.zero_grad()
            critic_loss.backward()
            self.network.critic_opt.step()

            phi = self.network.feature(states)
            action = self.network.actor(phi)
            policy_loss = -self.network.critic(phi.detach(), action).mean()

            self.network.zero_grad()
            policy_loss.backward()
            self.network.actor_opt.step()

            # soft_update
            for target_param, param in zip(self.target_network.parameters(),
                                           self.network.parameters()):
                target_param.detach_()
                target_param.copy_(target_param *
                                   (1.0 - self.config.target_network_mix) +
                                   param * self.config.target_network_mix)
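
The loop under # soft_update in Example #3 is Polyak averaging of the online parameters into the target network. A small helper capturing the same update, assuming PyTorch modules (the helper itself is not part of the original code):

import torch

def soft_update(target_net, online_net, mix):
    """Polyak-average online parameters into the target network.

    target <- (1 - mix) * target + mix * online
    """
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(),
                                       online_net.parameters()):
            target_param.mul_(1.0 - mix).add_(param, alpha=mix)

With it, the loop above collapses to soft_update(self.target_network, self.network, self.config.target_network_mix).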
Example #4
    def step(self):
        config = self.config

        # rollout
        for _ in range(self.config.rollout_length):
            # choose according to max(Q)
            q = self.network(config.state_normalizer(self.states)).mean(-1)
            epsilon = config.random_action_prob(config.num_workers)
            actions = epsilon_greedy(epsilon, toNumpy(q))

            next_states, rewards, dones, infos = self.task.step(actions)
            state, reward, next_state, done, info = (
                self.states[0], rewards[0], next_states[0], int(dones[0]), infos[0])
            self.states = next_states
            self.total_steps += 1

            reward = config.reward_normalizer(reward)
            self.replay.store([state, actions[0], reward, next_state, done])

        if self.total_steps > config.exploration_steps:
            # minibatch gradient descent
            experiences = self.replay.sample(config.batch_size)
            states, actions, rewards, next_states, terminals = experiences
            states = config.state_normalizer(states)
            next_states = config.state_normalizer(next_states)

            quantiles_next = self.target_network(next_states).detach()
            a_next = torch.argmax(quantiles_next.sum(-1), dim=-1)
            quantiles_next = quantiles_next[self.batch_indices, a_next, :]

            rewards = toTensor(rewards).unsqueeze(-1)
            terminals = toTensor(terminals).unsqueeze(-1)
            quantiles_next = rewards + self.config.discount * (
                1 - terminals) * quantiles_next

            quantiles = self.network(states)
            actions = toTensor(actions).long()
            quantiles = quantiles[self.batch_indices, actions, :]

            quantiles_next = quantiles_next.t().unsqueeze(-1)
            diff = quantiles_next - quantiles
            loss = huber_loss(diff) * (self.cumulative_density -
                                       (diff.detach() < 0).float()).abs()

            self.optimizer.zero_grad()
            loss.mean(0).mean(1).sum().backward()
            clip_grad_norm_(self.network.parameters(),
                            self.config.gradient_clip)
            self.optimizer.step()

        if self.total_steps // config.rollout_length % config.target_network_update_freq == 0:
            self.target_network.load_state_dict(self.network.state_dict())
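
Example #4 is a quantile-regression DQN update; huber_loss and self.cumulative_density are used but not defined in the snippet. The sketch below reproduces the quantile Huber loss under the assumption that cumulative_density holds the quantile midpoints (2i + 1) / (2N); both functions are illustrative.

import torch

def huber(x, k=1.0):
    # elementwise Huber loss with threshold k
    return torch.where(x.abs() < k, 0.5 * x.pow(2), k * (x.abs() - 0.5 * k))

def quantile_huber_loss(quantiles, target_quantiles, cumulative_density):
    """quantiles: (batch, N) predicted quantiles; target_quantiles: (batch, N)
    detached targets; cumulative_density: (N,) with entries (2i + 1) / (2N)."""
    # pairwise differences: each target quantile minus each predicted quantile
    diff = target_quantiles.t().unsqueeze(-1) - quantiles  # (N, batch, N)
    loss = huber(diff) * (cumulative_density.view(1, 1, -1) -
                          (diff.detach() < 0).float()).abs()
    # mirrors loss.mean(0).mean(1).sum() in the example
    return loss.mean(0).mean(1).sum()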
Example #5
    def __init__(self, config):
        super().__init__(config)
        self.network = config.network_fn()
        self.target_network = config.network_fn()
        self.target_network.load_state_dict(self.network.state_dict())
        self.optimizer = config.optimizer_fn(self.network.parameters())

        self.task = config.task_fn()
        self.states = config.state_normalizer(self.task.reset())

        self.q_options, self.betas, self.log_pi = self.network(self.states)
        self.options = epsilon_greedy(
            config.random_option_prob(config.num_workers),
            toNumpy(self.q_options))
        self.is_initial_betas = np.ones(self.config.num_workers)
        self.prev_options = np.copy(self.options)
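
Examples #4 through #7 and #9 all draw actions (or options, as in Example #5) through epsilon_greedy(epsilon, q). A plausible vectorized implementation, assuming q has shape (num_workers, num_actions) and epsilon is a scalar or a per-worker array:

import numpy as np

def epsilon_greedy(epsilon, q):
    """Pick argmax(q) per row, replaced by a uniform random action
    with probability epsilon."""
    q = np.asarray(q)
    if q.ndim == 1:
        q = q[np.newaxis, :]
    num_workers, num_actions = q.shape
    greedy = np.argmax(q, axis=-1)
    random_actions = np.random.randint(num_actions, size=num_workers)
    explore = np.random.rand(num_workers) < epsilon
    return np.where(explore, random_actions, greedy)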
Example #6
 def step(self):
     config = self.config
     
     # rollout
     for _ in range(self.config.rollout_length):
         # choose according to max(Q)
         q = self.network(config.state_normalizer(self.states))
         epsilon = config.random_action_prob(config.num_workers)
         actions = epsilon_greedy(epsilon, toNumpy(q))
         
         next_states, rewards, dones, infos = self.task.step(actions)
         rewards = config.reward_normalizer(rewards)
         self.replay.store([self.states[0], actions[0], rewards[0], next_states[0], dones[0]])
         
         self.states = next_states
         self.total_steps += 1
     
     if self.total_steps > config.exploration_steps:
         # minibatch gradient descent
         experiences = self.replay.sample(config.batch_size)
         states, actions, rewards, next_states, terminals = experiences
         states = config.state_normalizer(states)
         next_states = config.state_normalizer(next_states)
         q_next = self.target_network(next_states).detach()
         if config.double_q:
             best_actions = torch.argmax(self.network(next_states), dim=-1)
             q_next = q_next[self.batch_indices, best_actions]
         else:
             q_next = q_next.max(1)[0]
         terminals = toTensor(terminals)
         rewards = toTensor(rewards)
         q_next = rewards + config.discount * q_next * (1 - terminals)
         
         actions = toTensor(actions).long()
         q = self.network(states)
         q = q[self.batch_indices, actions]
         
         loss = (q_next - q).pow(2).mul(0.5).mean()
         
         self.optimizer.zero_grad()
         loss.backward()
         clip_grad_norm_(self.network.parameters(), config.gradient_clip)
         self.optimizer.step()
     
     if self.total_steps // config.rollout_length % config.target_network_update_freq == 0:
         self.target_network.load_state_dict(self.network.state_dict())
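
config.random_action_prob (and random_option_prob in Example #5) is called once per rollout step with config.num_workers and returns the current exploration rate, which suggests a decaying schedule advanced by that many steps. A minimal sketch of such a schedule; the class name and its parameters are assumptions:

class LinearSchedule:
    """Linearly anneal a value from start down to end (assumes end <= start)."""

    def __init__(self, start=1.0, end=0.1, steps=10000):
        self.current = start
        self.end = end
        self.increment = (end - start) / float(steps)

    def __call__(self, steps=1):
        # return the current value, then advance the schedule by `steps` calls
        value = self.current
        self.current = max(self.current + self.increment * steps, self.end)
        return value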
Example #7
 def step(self):
     config = self.config
     
     rollout = []
     states = self.states
     for _ in range(self.config.rollout_length):
         # choose according to max(Q)
         q = self.network(config.state_normalizer(states))
         epsilon = config.random_action_prob(config.num_workers)
         actions = epsilon_greedy(epsilon, toNumpy(q))
         
         next_states, rewards, terminals, infos = self.task.step(actions)
         rewards = config.reward_normalizer(rewards)
         rollout.append([q, actions, rewards, 1 - terminals])
         states = next_states
         
         self.total_steps += config.num_workers
         if self.total_steps // config.num_workers % config.target_network_update_freq == 0:
             self.target_network.load_state_dict(self.network.state_dict())
     
     self.states = states
     
     processed_rollout = [None] * len(rollout)
     returns = self.target_network(config.state_normalizer(states)).detach()
     returns, _ = torch.max(returns, dim=-1, keepdim=True)
     for i in reversed(range(len(rollout))):
         q, actions, rewards, terminals = rollout[i]
         actions = toTensor(actions).unsqueeze(1).long()
         q = q.gather(1, actions)
         terminals = toTensor(terminals).unsqueeze(1)
         rewards = toTensor(rewards).unsqueeze(1)
         returns = rewards + config.discount * terminals * returns
         processed_rollout[i] = [q, returns]
     
     q, returns = map(lambda x: torch.cat(x, dim=0), zip(*processed_rollout))
     # loss = F.smooth_l1_loss(q, returns)
     # loss = huber_loss(q - returns)
     loss = 0.5 * (q - returns).pow(2).mean()
     self.optimizer.zero_grad()
     loss.backward()
     clip_grad_norm_(self.network.parameters(), config.gradient_clip)
     self.optimizer.step()
     
     if self.total_steps // config.rollout_length % config.target_network_update_freq == 0:
         self.target_network.load_state_dict(self.network.state_dict())
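
Every example converts between NumPy arrays and torch tensors through toNumpy and toTensor. They are presumably thin wrappers along the lines of the sketch below; the device handling is an assumption.

import numpy as np
import torch

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def toTensor(x):
    # pass tensors through unchanged, convert everything else to float32
    if isinstance(x, torch.Tensor):
        return x.to(DEVICE)
    return torch.tensor(np.asarray(x), dtype=torch.float32, device=DEVICE)

def toNumpy(t):
    # detach from the graph and move to host memory before converting
    return t.detach().cpu().numpy()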
Example #8
    def step(self):
        config = self.config
        storage = PPOStorageBuffer(config.rollout_length)
        states = self.states
        for _ in range(config.rollout_length):
            action_tr, log_prob_tr, entropy_tr, v_tr = self.network(states)
            next_states, rewards, terminals, infos = self.task.step(
                toNumpy(action_tr))
            rewards = config.reward_normalizer(rewards)
            storage.store_next(states=toTensor(states),
                               actions=action_tr,
                               values=v_tr,
                               log_pi=log_prob_tr,
                               entropy=entropy_tr,
                               rewards=toTensor(rewards).unsqueeze(-1),
                               terminals=toTensor(1 - terminals).unsqueeze(-1))
            states = config.state_normalizer(next_states)

        self.states = states
        action_tr, log_prob_tr, entropy_tr, v_tr = self.network(states)
        storage.values.append(v_tr)

        advantages = toTensor(np.zeros((config.num_workers, 1)))
        returns = v_tr.detach()
        for i in reversed(range(config.rollout_length)):
            returns = storage.rewards[i] + config.discount * storage.terminals[i] * returns
            if not config.use_gae:
                advantages = returns - storage.values[i]
            else:
                td_error = storage.rewards[i] + config.discount * storage.terminals[i] * storage.values[i + 1] \
                           - storage.values[i]
                advantages = (storage.terminals[i] * config.gae_tau * config.discount * advantages +
                              td_error)
            storage.advantages[i] = advantages.detach()
            storage.returns[i] = returns.detach()

        states, actions, log_prob_old, returns, advantages = storage.cat(
            ['states', 'actions', 'log_pi', 'returns', 'advantages'])
        actions = actions.detach()
        log_prob_old = log_prob_old.detach()
        advantages = (advantages - advantages.mean()) / advantages.std()

        for _ in range(config.optimization_epochs):
            sampler = random_sample(np.arange(states.size(0)),
                                    config.mini_batch_size)
            for batch_indices in sampler:
                batch_indices = toTensor(batch_indices).long()
                sampled_states = states[batch_indices]
                sampled_actions = actions[batch_indices]
                sampled_log_prob_old = log_prob_old[batch_indices]
                sampled_returns = returns[batch_indices]
                sampled_advantages = advantages[batch_indices]

                action_tr, log_prob_tr, entropy_tr, v_tr = self.network(
                    sampled_states, sampled_actions)
                ratio = (log_prob_tr - sampled_log_prob_old).exp()
                obj = ratio * sampled_advantages
                obj_clipped = ratio.clamp(
                    1.0 - config.ppo_ratio_clip,
                    1.0 + config.ppo_ratio_clip) * sampled_advantages
                policy_loss = -torch.min(obj, obj_clipped).mean() \
                              - config.entropy_weight * entropy_tr.mean()
                value_loss = 0.5 * (sampled_returns - v_tr).pow(2).mean()
                self.optimizer.zero_grad()
                (policy_loss + value_loss).backward()
                clip_grad_norm_(self.network.parameters(),
                                config.gradient_clip)
                self.optimizer.step()

        self.total_steps += config.rollout_length * config.num_workers
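
The PPO optimization epochs in Example #8 iterate over index batches produced by random_sample(np.arange(states.size(0)), config.mini_batch_size). One way such a generator could look (assumed, not taken from the original code):

import numpy as np

def random_sample(indices, batch_size):
    """Yield shuffled index batches, plus a final short batch if any remain."""
    indices = np.random.permutation(indices)
    full = len(indices) // batch_size * batch_size
    for batch in indices[:full].reshape(-1, batch_size):
        yield batch
    remainder = len(indices) - full
    if remainder:
        yield indices[-remainder:]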
Example #9
    def step(self):
        config = self.config

        # rollout
        for _ in range(self.config.rollout_length):
            # choose according to max(Q)
            probs, _ = self.network(config.state_normalizer(self.states))
            q = (probs * self.atoms).sum(-1)
            epsilon = config.random_action_prob(config.num_workers)
            actions = epsilon_greedy(epsilon, toNumpy(q))

            next_states, rewards, dones, infos = self.task.step(actions)
            state, reward, next_state, done, info = (
                self.states[0], rewards[0], next_states[0], int(dones[0]), infos[0])
            self.states = next_states
            self.total_steps += 1

            reward = config.reward_normalizer(reward)
            self.replay.store([state, actions[0], reward, next_state, done])

        if self.total_steps > config.exploration_steps:
            # minibatch gradient descent
            experiences = self.replay.sample(config.batch_size)
            states, actions, rewards, next_states, terminals = experiences
            states = config.state_normalizer(states)
            next_states = config.state_normalizer(next_states)

            prob_next, _ = self.target_network(next_states)
            prob_next = prob_next.detach()
            q_next = (prob_next * self.atoms).sum(-1)
            a_next = torch.argmax(q_next, dim=-1)
            prob_next = prob_next[self.batch_indices, a_next, :]

            rewards = toTensor(rewards).unsqueeze(-1)
            terminals = toTensor(terminals).unsqueeze(-1)
            atoms_next = rewards + self.config.discount * (
                1 - terminals) * self.atoms.view(1, -1)

            atoms_next.clamp_(self.config.categorical_v_min,
                              self.config.categorical_v_max)
            b = (atoms_next - self.config.categorical_v_min) / self.delta_atom
            l = b.floor()
            u = b.ceil()
            d_m_l = (u + (l == u).float() - b) * prob_next
            d_m_u = (b - l) * prob_next
            target_prob = toTensor(np.zeros(prob_next.size()))
            for i in range(target_prob.size(0)):
                target_prob[i].index_add_(0, l[i].long(), d_m_l[i])
                target_prob[i].index_add_(0, u[i].long(), d_m_u[i])

            _, log_prob = self.network(states)
            actions = toTensor(actions).long()
            log_prob = log_prob[self.batch_indices, actions, :]
            loss = -(target_prob * log_prob).sum(-1).mean()

            self.optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(self.network.parameters(),
                            self.config.gradient_clip)
            self.optimizer.step()

        if self.total_steps // config.rollout_length % config.target_network_update_freq == 0:
            self.target_network.load_state_dict(self.network.state_dict())
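
Example #9 is a categorical (C51) update and presupposes self.atoms, self.delta_atom and self.batch_indices. They would typically be built once in the agent's __init__; the helper below sketches that setup (names and defaults are assumptions).

import torch

def categorical_support(v_min, v_max, n_atoms=51, batch_size=32):
    """Build the fixed support used by the C51 projection in Example #9.

    Returns (atoms, delta_atom, batch_indices): the atom locations, their
    spacing, and row indices for advanced indexing into (batch, action, atom)
    probability tensors.
    """
    atoms = torch.linspace(v_min, v_max, n_atoms)
    delta_atom = (v_max - v_min) / float(n_atoms - 1)
    batch_indices = torch.arange(batch_size).long()
    return atoms, delta_atom, batch_indices

In the example, atoms is the support of the return distribution, delta_atom the spacing used to project atoms_next back onto that support, and batch_indices selects one row per sampled transition.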