Example #1
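All five examples below omit their imports; the following is a hedged reconstruction of what they rely on. The project-specific helpers are assumed to come from the surrounding repository and are only listed, not given concrete module paths.

import os
from copy import deepcopy

import numpy as np
import torch as th
import torch.nn as nn
from torch.optim import Adam, RMSprop

# Project-specific helpers assumed importable from the surrounding repository:
# Agent (base class), ActorNetwork, CriticNetwork, identity, to_tensor_var,
# index_to_one_hot, entropy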
class PPO(Agent):
    """
    An agent learned with PPO using Advantage Actor-Critic framework
    - Actor takes state as input
    - Critic takes both state and action as input
    - agent interact with environment to collect experience
    - agent training with experience to update policy
    - adam seems better than rmsprop for ppo
    """
    def __init__(self,
                 env,
                 state_dim,
                 action_dim,
                 memory_capacity=10000,
                 max_steps=None,
                 roll_out_n_steps=1,
                 target_tau=1.,
                 target_update_steps=5,
                 clip_param=0.2,
                 reward_gamma=0.99,
                 reward_scale=1.,
                 done_penalty=None,
                 actor_hidden_size=32,
                 critic_hidden_size=32,
                 actor_output_act=nn.functional.log_softmax,
                 critic_loss="mse",
                 actor_lr=0.001,
                 critic_lr=0.001,
                 optimizer_type="adam",
                 entropy_reg=0.01,
                 max_grad_norm=0.5,
                 batch_size=100,
                 episodes_before_train=100,
                 epsilon_start=0.9,
                 epsilon_end=0.01,
                 epsilon_decay=200,
                 use_cuda=True):
        super(PPO,
              self).__init__(env, state_dim, action_dim, memory_capacity,
                             max_steps, reward_gamma, reward_scale,
                             done_penalty, actor_hidden_size,
                             critic_hidden_size, actor_output_act, critic_loss,
                             actor_lr, critic_lr, optimizer_type, entropy_reg,
                             max_grad_norm, batch_size, episodes_before_train,
                             epsilon_start, epsilon_end, epsilon_decay,
                             use_cuda)

        self.roll_out_n_steps = roll_out_n_steps
        self.target_tau = target_tau
        self.target_update_steps = target_update_steps
        self.clip_param = clip_param

        self.actor = ActorNetwork(self.state_dim, self.actor_hidden_size,
                                  self.action_dim, self.actor_output_act)
        self.critic = CriticNetwork(self.state_dim, self.action_dim,
                                    self.critic_hidden_size, 1)
        # ensure the target networks start with the same weights as the learning networks
        self.actor_target = deepcopy(self.actor)
        self.critic_target = deepcopy(self.critic)

        if self.optimizer_type == "adam":
            self.actor_optimizer = Adam(self.actor.parameters(),
                                        lr=self.actor_lr)
            self.critic_optimizer = Adam(self.critic.parameters(),
                                         lr=self.critic_lr)
        elif self.optimizer_type == "rmsprop":
            self.actor_optimizer = RMSprop(self.actor.parameters(),
                                           lr=self.actor_lr)
            self.critic_optimizer = RMSprop(self.critic.parameters(),
                                            lr=self.critic_lr)

        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()

    # agent interacts with the environment to collect experience
    def interact(self):
        super(PPO, self)._take_n_steps()

    # train on a roll out batch
    def train(self):
        if self.n_episodes <= self.episodes_before_train:
            return

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(batch.states,
                                   self.use_cuda).view(-1, self.state_dim)
        one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
        actions_var = to_tensor_var(one_hot_actions,
                                    self.use_cuda).view(-1, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

        # update actor network
        self.actor_optimizer.zero_grad()
        values = self.critic_target(states_var, actions_var).detach()
        # flatten to shape (batch,) so it multiplies element-wise with the ratios below
        advantages = (rewards_var - values).view(-1)
        # # normalizing advantages does not seem to work correctly here
        # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
        action_log_probs = self.actor(states_var)
        action_log_probs = th.sum(action_log_probs * actions_var, 1)
        old_action_log_probs = self.actor_target(states_var).detach()
        old_action_log_probs = th.sum(old_action_log_probs * actions_var, 1)
        ratio = th.exp(action_log_probs - old_action_log_probs)
        surr1 = ratio * advantages
        surr2 = th.clamp(ratio, 1.0 - self.clip_param,
                         1.0 + self.clip_param) * advantages
        # PPO's pessimistic surrogate (L^CLIP)
        actor_loss = -th.mean(th.min(surr1, surr2))
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor.parameters(),
                                     self.max_grad_norm)
        self.actor_optimizer.step()

        # update critic network
        self.critic_optimizer.zero_grad()
        target_values = rewards_var
        values = self.critic(states_var, actions_var)
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.critic.parameters(),
                                     self.max_grad_norm)
        self.critic_optimizer.step()

        # update actor target network and critic target network
        if self.n_steps % self.target_update_steps == 0 and self.n_steps > 0:
            super(PPO, self)._soft_update_target(self.actor_target, self.actor)
            super(PPO, self)._soft_update_target(self.critic_target,
                                                 self.critic)

    # predict softmax action based on state
    def _softmax_action(self, state):
        state_var = to_tensor_var([state], self.use_cuda)
        softmax_action_var = th.exp(self.actor(state_var))
        if self.use_cuda:
            softmax_action = softmax_action_var.data.cpu().numpy()[0]
        else:
            softmax_action = softmax_action_var.data.numpy()[0]
        return softmax_action

    # choose an action based on state with random noise added for exploration in training
    def exploration_action(self, state):
        softmax_action = self._softmax_action(state)
        epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                                  np.exp(-1. * self.n_steps / self.epsilon_decay)
        if np.random.rand() < epsilon:
            action = np.random.choice(self.action_dim)
        else:
            action = np.argmax(softmax_action)
        return action

    # choose an action based on state for execution
    def action(self, state):
        softmax_action = self._softmax_action(state)
        action = np.argmax(softmax_action)
        return action

    # evaluate value for a state-action pair
    def value(self, state, action):
        state_var = to_tensor_var([state], self.use_cuda)
        action = index_to_one_hot(action, self.action_dim)
        action_var = to_tensor_var([action], self.use_cuda)
        value_var = self.critic(state_var, action_var)
        if self.use_cuda:
            value = value_var.data.cpu().numpy()[0]
        else:
            value = value_var.data.numpy()[0]
        return value
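A minimal training-loop sketch for this PPO agent could look as follows. It assumes a Gym-style environment with a discrete action space and that the class above is importable; the environment name and episode budget are illustrative, not taken from the original source.

# Hedged usage sketch (CartPole-v0 and MAX_EPISODES are illustrative choices).
import gym

MAX_EPISODES = 2000

env = gym.make("CartPole-v0")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

ppo = PPO(env, state_dim, action_dim,
          batch_size=32, episodes_before_train=50, use_cuda=False)

while ppo.n_episodes < MAX_EPISODES:
    ppo.interact()                                   # roll out and store experience
    if ppo.n_episodes >= ppo.episodes_before_train:
        ppo.train()                                  # one clipped-surrogate update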
Example #2
class A2C(Agent):
    """
    An agent learned with Advantage Actor-Critic
    - Actor takes state as input
    - Critic takes both state and action as input
    - agent interact with environment to collect experience
    - agent training with experience to update policy
    """
    def __init__(self,
                 env,
                 state_dim,
                 action_dim,
                 memory_capacity=10000,
                 max_steps=None,
                 roll_out_n_steps=10,
                 reward_gamma=0.99,
                 reward_scale=1.,
                 done_penalty=None,
                 actor_hidden_size=32,
                 critic_hidden_size=32,
                 actor_output_act=nn.functional.log_softmax,
                 critic_loss="mse",
                 actor_lr=0.001,
                 critic_lr=0.001,
                 optimizer_type="rmsprop",
                 entropy_reg=0.01,
                 max_grad_norm=0.5,
                 batch_size=100,
                 episodes_before_train=100,
                 epsilon_start=0.9,
                 epsilon_end=0.01,
                 epsilon_decay=200,
                 use_cuda=True):
        super(A2C,
              self).__init__(env, state_dim, action_dim, memory_capacity,
                             max_steps, reward_gamma, reward_scale,
                             done_penalty, actor_hidden_size,
                             critic_hidden_size, actor_output_act, critic_loss,
                             actor_lr, critic_lr, optimizer_type, entropy_reg,
                             max_grad_norm, batch_size, episodes_before_train,
                             epsilon_start, epsilon_end, epsilon_decay,
                             use_cuda)

        self.roll_out_n_steps = roll_out_n_steps

        self.actor = ActorNetwork(self.state_dim, self.actor_hidden_size,
                                  self.action_dim, self.actor_output_act)
        self.critic = CriticNetwork(self.state_dim, self.action_dim,
                                    self.critic_hidden_size, 1)
        if self.optimizer_type == "adam":
            self.actor_optimizer = Adam(self.actor.parameters(),
                                        lr=self.actor_lr)
            self.critic_optimizer = Adam(self.critic.parameters(),
                                         lr=self.critic_lr)
        elif self.optimizer_type == "rmsprop":
            self.actor_optimizer = RMSprop(self.actor.parameters(),
                                           lr=self.actor_lr)
            self.critic_optimizer = RMSprop(self.critic.parameters(),
                                            lr=self.critic_lr)
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()

    # agent interacts with the environment to collect experience
    def interact(self):
        super(A2C, self)._take_n_steps()

    # train on a roll out batch
    def train(self):
        if self.n_episodes <= self.episodes_before_train:
            return

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(batch.states,
                                   self.use_cuda).view(-1, self.state_dim)
        one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
        actions_var = to_tensor_var(one_hot_actions,
                                    self.use_cuda).view(-1, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

        # update actor network
        self.actor_optimizer.zero_grad()
        action_log_probs = self.actor(states_var)
        entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
        action_log_probs = th.sum(action_log_probs * actions_var, 1)
        values = self.critic(states_var, actions_var)
        # flatten to shape (batch,) so it multiplies element-wise with the log-probs
        advantages = (rewards_var - values.detach()).view(-1)
        pg_loss = -th.mean(action_log_probs * advantages)
        actor_loss = pg_loss - entropy_loss * self.entropy_reg
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor.parameters(),
                                     self.max_grad_norm)
        self.actor_optimizer.step()

        # update critic network
        self.critic_optimizer.zero_grad()
        target_values = rewards_var
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.critic.parameters(),
                                     self.max_grad_norm)
        self.critic_optimizer.step()

    # predict softmax action based on state
    def _softmax_action(self, state):
        state_var = to_tensor_var([state], self.use_cuda)
        softmax_action_var = th.exp(self.actor(state_var))
        if self.use_cuda:
            softmax_action = softmax_action_var.data.cpu().numpy()[0]
        else:
            softmax_action = softmax_action_var.data.numpy()[0]
        return softmax_action

    # choose an action based on state with random noise added for exploration in training
    def exploration_action(self, state):
        softmax_action = self._softmax_action(state)
        epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                                  np.exp(-1. * self.n_steps / self.epsilon_decay)
        if np.random.rand() < epsilon:
            action = np.random.choice(self.action_dim)
        else:
            action = np.argmax(softmax_action)
        return action

    # choose an action based on state for execution
    def action(self, state):
        softmax_action = self._softmax_action(state)
        action = np.argmax(softmax_action)
        return action

    # evaluate value for a state-action pair
    def value(self, state, action):
        state_var = to_tensor_var([state], self.use_cuda)
        action = index_to_one_hot(action, self.action_dim)
        action_var = to_tensor_var([action], self.use_cuda)
        value_var = self.critic(state_var, action_var)
        if self.use_cuda:
            value = value_var.data.cpu().numpy()[0]
        else:
            value = value_var.data.numpy()[0]
        return value
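To make the actor update in train() concrete, here is a self-contained sketch of the entropy-regularized policy-gradient loss on random tensors. It mirrors the computation above but replaces the repository's entropy helper with an explicit formula; all shapes and the entropy_reg value are illustrative.

# Standalone illustration of the A2C actor loss (random data, illustrative shapes).
import torch as th
import torch.nn.functional as F

batch_size, action_dim, entropy_reg = 4, 3, 0.01

logits = th.randn(batch_size, action_dim, requires_grad=True)
actions = F.one_hot(th.randint(action_dim, (batch_size,)), action_dim).float()
advantages = th.randn(batch_size)                           # stand-in, no gradient

log_probs = F.log_softmax(logits, dim=-1)
probs = th.exp(log_probs)
entropy_loss = th.mean(th.sum(-probs * log_probs, dim=1))   # mean policy entropy
chosen_log_probs = th.sum(log_probs * actions, dim=1)       # log pi(a_t | s_t)
pg_loss = -th.mean(chosen_log_probs * advantages)           # policy-gradient term
actor_loss = pg_loss - entropy_reg * entropy_loss           # entropy bonus
actor_loss.backward()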
Example #3
class DQN(Agent):
    """
    An agent learned with DQN using replay memory and temporal difference
    - use a value network to estimate the state-action value
    """
    def __init__(self,
                 env,
                 state_dim,
                 action_dim,
                 memory_capacity=10000,
                 max_steps=10000,
                 reward_gamma=0.99,
                 reward_scale=1.,
                 done_penalty=None,
                 actor_hidden_size=32,
                 critic_hidden_size=32,
                 actor_output_act=identity,
                 critic_loss="mse",
                 actor_lr=0.001,
                 critic_lr=0.001,
                 optimizer_type="rmsprop",
                 entropy_reg=0.01,
                 max_grad_norm=0.5,
                 batch_size=100,
                 episodes_before_train=100,
                 epsilon_start=0.9,
                 epsilon_end=0.01,
                 epsilon_decay=200,
                 use_cuda=True):
        super(DQN,
              self).__init__(env, state_dim, action_dim, memory_capacity,
                             max_steps, reward_gamma, reward_scale,
                             done_penalty, actor_hidden_size,
                             critic_hidden_size, actor_output_act, critic_loss,
                             actor_lr, critic_lr, optimizer_type, entropy_reg,
                             max_grad_norm, batch_size, episodes_before_train,
                             epsilon_start, epsilon_end, epsilon_decay,
                             use_cuda)

        self.actor = ActorNetwork(self.state_dim, self.actor_hidden_size,
                                  self.action_dim, self.actor_output_act)
        if self.optimizer_type == "adam":
            self.actor_optimizer = Adam(self.actor.parameters(),
                                        lr=self.actor_lr)
        elif self.optimizer_type == "rmsprop":
            self.actor_optimizer = RMSprop(self.actor.parameters(),
                                           lr=self.actor_lr)
        if self.use_cuda:
            self.actor.cuda()

    # agent interacts with the environment to collect experience
    def interact(self):
        super(DQN, self)._take_one_step()

    # train on a sample batch
    def train(self):
        if self.n_episodes <= self.episodes_before_train:
            return

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(batch.states,
                                   self.use_cuda).view(-1, self.state_dim)
        actions_var = to_tensor_var(batch.actions, self.use_cuda,
                                    "long").view(-1, 1)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)
        next_states_var = to_tensor_var(batch.next_states, self.use_cuda).view(
            -1, self.state_dim)
        dones_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)

        # compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken
        current_q = self.actor(states_var).gather(1, actions_var)

        # compute Q(s_{t+1}, a) for all next states and actions,
        # then take max_a Q(s_{t+1}, a)
        next_state_action_values = self.actor(next_states_var).detach()
        next_q = th.max(next_state_action_values, 1)[0].view(-1, 1)
        # compute the target q as: r + gamma * max_a Q(s_{t+1}, a)
        target_q = self.reward_scale * rewards_var + self.reward_gamma * next_q * (
            1. - dones_var)

        # update value network
        self.actor_optimizer.zero_grad()
        if self.critic_loss == "huber":
            loss = th.nn.functional.smooth_l1_loss(current_q, target_q)
        else:
            loss = th.nn.MSELoss()(current_q, target_q)
        loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor.parameters(),
                                     self.max_grad_norm)
        self.actor_optimizer.step()

    # choose an action based on state with random noise added for exploration in training
    def exploration_action(self, state):
        epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                                  np.exp(-1. * self.n_steps / self.epsilon_decay)
        if np.random.rand() < epsilon:
            action = np.random.choice(self.action_dim)
        else:
            action = self.action(state)
        return action

    # choose an action based on state for execution
    def action(self, state):
        state_var = to_tensor_var([state], self.use_cuda)
        state_action_value_var = self.actor(state_var)
        if self.use_cuda:
            state_action_value = state_action_value_var.data.cpu().numpy()[0]
        else:
            state_action_value = state_action_value_var.data.numpy()[0]
        action = np.argmax(state_action_value)
        return action
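The gather/max pattern used to form the TD target above can be reproduced in isolation. The following sketch uses a dummy linear network and random tensors, so every shape and constant is an illustrative assumption.

# Standalone illustration of the DQN target computation on random data.
import torch as th
import torch.nn as nn

batch_size, state_dim, action_dim, gamma = 5, 4, 3, 0.99
q_net = nn.Linear(state_dim, action_dim)                 # stand-in for the value network

states = th.randn(batch_size, state_dim)
next_states = th.randn(batch_size, state_dim)
actions = th.randint(action_dim, (batch_size, 1))        # indices of the taken actions
rewards = th.randn(batch_size, 1)
dones = th.zeros(batch_size, 1)

current_q = q_net(states).gather(1, actions)                     # Q(s_t, a_t)
next_q = q_net(next_states).detach().max(1, keepdim=True)[0]     # max_a Q(s_{t+1}, a)
target_q = rewards + gamma * next_q * (1. - dones)               # TD target
loss = nn.functional.smooth_l1_loss(current_q, target_q)         # Huber variant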
Example #4
class DDPG(Agent):
    """
    An agent learned with Deep Deterministic Policy Gradient using Actor-Critic framework
    - Actor takes state as input
    - Critic takes both state and action as input
    - Critic uses gradient temporal-difference learning
    """
    def __init__(self,
                 env,
                 state_dim,
                 action_dim,
                 memory_capacity=10000,
                 max_steps=None,
                 target_tau=0.01,
                 target_update_steps=5,
                 reward_gamma=0.99,
                 reward_scale=1.,
                 done_penalty=None,
                 actor_hidden_size=32,
                 critic_hidden_size=32,
                 actor_output_act=th.tanh,
                 critic_loss="mse",
                 actor_lr=0.001,
                 critic_lr=0.001,
                 optimizer_type="adam",
                 entropy_reg=0.01,
                 max_grad_norm=0.5,
                 batch_size=100,
                 episodes_before_train=100,
                 epsilon_start=0.9,
                 epsilon_end=0.01,
                 epsilon_decay=200,
                 use_cuda=True):
        super(DDPG,
              self).__init__(env, state_dim, action_dim, memory_capacity,
                             max_steps, reward_gamma, reward_scale,
                             done_penalty, actor_hidden_size,
                             critic_hidden_size, actor_output_act, critic_loss,
                             actor_lr, critic_lr, optimizer_type, entropy_reg,
                             max_grad_norm, batch_size, episodes_before_train,
                             epsilon_start, epsilon_end, epsilon_decay,
                             use_cuda)

        self.target_tau = target_tau
        self.target_update_steps = target_update_steps

        self.actor = ActorNetwork(self.state_dim, self.actor_hidden_size,
                                  self.action_dim, self.actor_output_act)
        self.critic = CriticNetwork(self.state_dim, self.action_dim,
                                    self.critic_hidden_size, 1)
        # ensure the target networks start with the same weights as the learning networks
        self.actor_target = deepcopy(self.actor)
        self.critic_target = deepcopy(self.critic)

        if self.optimizer_type == "adam":
            self.actor_optimizer = Adam(self.actor.parameters(),
                                        lr=self.actor_lr)
            self.critic_optimizer = Adam(self.critic.parameters(),
                                         lr=self.critic_lr)
        elif self.optimizer_type == "rmsprop":
            self.actor_optimizer = RMSprop(self.actor.parameters(),
                                           lr=self.actor_lr)
            self.critic_optimizer = RMSprop(self.critic.parameters(),
                                            lr=self.critic_lr)

        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()

    # agent interacts with the environment to collect experience
    def interact(self):
        super(DDPG, self)._take_one_step()

    # train on a sample batch
    def train(self):
        # do not train until there has been enough exploration
        if self.n_episodes <= self.episodes_before_train:
            return

        batch = self.memory.sample(self.batch_size)
        state_var = to_tensor_var(batch.states,
                                  self.use_cuda).view(-1, self.state_dim)
        action_var = to_tensor_var(batch.actions,
                                   self.use_cuda).view(-1, self.action_dim)
        reward_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)
        next_state_var = to_tensor_var(batch.next_states,
                                       self.use_cuda).view(-1, self.state_dim)
        done_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)

        # estimate the target q with actor_target network and critic_target network
        next_action_var = self.actor_target(next_state_var)
        next_q = self.critic_target(next_state_var, next_action_var).detach()
        target_q = self.reward_scale * reward_var + self.reward_gamma * next_q * (
            1. - done_var)

        # update critic network
        self.critic_optimizer.zero_grad()
        # current Q values
        current_q = self.critic(state_var, action_var)
        # regress the current Q values towards the target Q values computed above
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(current_q, target_q)
        else:
            critic_loss = nn.MSELoss()(current_q, target_q)
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.critic.parameters(),
                                     self.max_grad_norm)
        self.critic_optimizer.step()

        # update actor network
        self.actor_optimizer.zero_grad()
        # actions predicted by the current actor network
        action = self.actor(state_var)
        # actor_loss is used to maximize the Q value for the predicted action
        actor_loss = -self.critic(state_var, action)
        actor_loss = actor_loss.mean()
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor.parameters(),
                                     self.max_grad_norm)
        self.actor_optimizer.step()

        # update actor target network and critic target network
        if self.n_steps % self.target_update_steps == 0 and self.n_steps > 0:
            super(DDPG, self)._soft_update_target(self.critic_target,
                                                  self.critic)
            super(DDPG, self)._soft_update_target(self.actor_target,
                                                  self.actor)

    # choose an action based on state with random noise added for exploration in training
    def exploration_action(self, state):
        action = self.action(state)
        epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                                  np.exp(-1. * self.n_steps / self.epsilon_decay)
        # add noise
        noise = np.random.randn(self.action_dim) * epsilon
        action += noise
        return action

    # choose an action based on state for execution
    def action(self, state):
        action_var = self.actor(to_tensor_var([state], self.use_cuda))
        if self.use_cuda:
            action = action_var.data.cpu().numpy()[0]
        else:
            action = action_var.data.numpy()[0]
        return action
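The `_soft_update_target` helper used above is defined in the Agent base class and not shown in this example; a common implementation is Polyak averaging, sketched here purely as an assumption about what that helper does.

# Hedged sketch of a soft (Polyak) target update; the actual helper in the
# Agent base class may differ in detail.
import torch as th
import torch.nn as nn

def soft_update(target, source, tau):
    """target <- tau * source + (1 - tau) * target, parameter by parameter."""
    with th.no_grad():
        for t_param, param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(tau * param.data + (1.0 - tau) * t_param.data)

critic = nn.Linear(6, 1)
critic_target = nn.Linear(6, 1)
critic_target.load_state_dict(critic.state_dict())   # start from identical weights
soft_update(critic_target, critic, tau=0.01)          # small step towards the online net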
Example #5
class A2C(Agent):
    """
    An agent learned with Advantage Actor-Critic
    - Actor takes state as input
    - Critic takes both state and action as input
    - agent interact with environment to collect experience
    - agent training with experience to update policy
    """
    def __init__(self, env, state_dim, action_dim,
                 memory_capacity=10000, max_steps=None,
                 roll_out_n_steps=10,
                 reward_gamma=0.99, reward_scale=1., done_penalty=None,
                 actor_hidden_size=32, critic_hidden_size=32,
                 actor_output_act=nn.functional.log_softmax, critic_loss="mse",
                 actor_lr=0.001, critic_lr=0.001,
                 optimizer_type="rmsprop", entropy_reg=0.01,
                 max_grad_norm=0.5, batch_size=100, episodes_before_train=100,
                 epsilon_start=0.9, epsilon_end=0.01, epsilon_decay=200,
                 use_cuda=True):
        super(A2C, self).__init__(env, state_dim, action_dim,
                 memory_capacity, max_steps,
                 reward_gamma, reward_scale, done_penalty,
                 actor_hidden_size, critic_hidden_size,
                 actor_output_act, critic_loss,
                 actor_lr, critic_lr,
                 optimizer_type, entropy_reg,
                 max_grad_norm, batch_size, episodes_before_train,
                 epsilon_start, epsilon_end, epsilon_decay,
                 use_cuda)

        self.roll_out_n_steps = roll_out_n_steps

        self.actor = ActorNetwork(self.state_dim, self.actor_hidden_size,
                                  self.action_dim, self.actor_output_act)
        self.critic = CriticNetwork(self.state_dim, self.action_dim,
                                    self.critic_hidden_size, 1)
        if self.optimizer_type == "adam":
            self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
            self.critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr)
        elif self.optimizer_type == "rmsprop":
            self.actor_optimizer = RMSprop(self.actor.parameters(), lr=self.actor_lr)
            self.critic_optimizer = RMSprop(self.critic.parameters(), lr=self.critic_lr)
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()

    # agent interacts with the environment to collect experience
    def interact(self):
        super(A2C, self)._take_n_steps()

    def get_loss(self):
        if self.n_episodes <= self.episodes_before_train:
            return None

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(
            batch.states, self.use_cuda).view(-1, self.state_dim)
        one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
        actions_var = to_tensor_var(
            one_hot_actions, self.use_cuda).view(-1, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

        # Get the actor network loss
        self.actor_optimizer.zero_grad()
        action_log_probs = self.actor(states_var)
        entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
        action_log_probs = th.sum(action_log_probs * actions_var, 1)
        values = self.critic(states_var, actions_var)
        # flatten to shape (batch,) so it multiplies element-wise with the log-probs
        advantages = (rewards_var - values.detach()).view(-1)
        pg_loss = -th.mean(action_log_probs * advantages)
        actor_loss = pg_loss - entropy_loss * self.entropy_reg

        # Get the critic network loss
        self.critic_optimizer.zero_grad()
        target_values = rewards_var
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)

        combined_loss = {'actor_loss': actor_loss,
                         'critic_loss': critic_loss}

        return combined_loss

    def update_net(self, combined_loss):
        actor_loss = combined_loss['actor_loss']
        critic_loss = combined_loss['critic_loss']

        # Update the actor network
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(
                self.actor.parameters(), self.max_grad_norm)
        self.actor_optimizer.step()

        # Update the critic network
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(
                self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()


    # train on a roll out batch
    def train(self):
        if self.n_episodes <= self.episodes_before_train:
            return

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
        one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
        actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

        # update actor network
        self.actor_optimizer.zero_grad()
        action_log_probs = self.actor(states_var)
        entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
        action_log_probs = th.sum(action_log_probs * actions_var, 1)
        values = self.critic(states_var, actions_var)
        # flatten to shape (batch,) so it multiplies element-wise with the log-probs
        advantages = (rewards_var - values.detach()).view(-1)
        pg_loss = -th.mean(action_log_probs * advantages)
        actor_loss = pg_loss - entropy_loss * self.entropy_reg
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
        self.actor_optimizer.step()

        # update critic network
        self.critic_optimizer.zero_grad()
        target_values = rewards_var
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()

    # predict softmax action based on state
    def _softmax_action(self, state):
        state_var = to_tensor_var([state], self.use_cuda)
        softmax_action_var = th.exp(self.actor(state_var))
        if self.use_cuda:
            softmax_action = softmax_action_var.data.cpu().numpy()[0]
        else:
            softmax_action = softmax_action_var.data.numpy()[0]
        return softmax_action

    # choose an action based on state with random noise added for exploration in training
    def exploration_action(self, state):
        softmax_action = self._softmax_action(state)
        epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                                  np.exp(-1. * self.n_steps / self.epsilon_decay)
        if np.random.rand() < epsilon:
            action = np.random.choice(self.action_dim)
        else:
            action = np.argmax(softmax_action)
        return action

    # choose an action based on state for execution
    def action(self, state):
        softmax_action = self._softmax_action(state)
        action = np.argmax(softmax_action)
        return action

    # evaluate value for a state-action pair
    def value(self, state, action):
        state_var = to_tensor_var([state], self.use_cuda)
        action = index_to_one_hot(action, self.action_dim)
        action_var = to_tensor_var([action], self.use_cuda)

        value_var = self.critic(state_var, action_var)
        if self.use_cuda:
            value = value_var.data.cpu().numpy()[0]
        else:
            value = value_var.data.numpy()[0]
        return value

    def get_weights(self, id=None):
        # `id` is an optional identifier to store in the checkpoint dicts
        state_actor = {'id': id,
                       'state_dict': self.actor.state_dict(),
                       'optimizer': self.actor_optimizer.state_dict()}

        state_critic = {'id': id,
                        'state_dict': self.critic.state_dict(),
                        'optimizer': self.critic_optimizer.state_dict()}

        state_dicts = {'state_actor': state_actor,
                       'state_critic': state_critic}
        return state_dicts

    def set_weights(self, state_dicts):
        actor_checkpoint = state_dicts['state_actor']
        critic_checkpoint = state_dicts['state_critic']

        self.actor.load_state_dict(actor_checkpoint['state_dict'])
        self.critic.load_state_dict(critic_checkpoint['state_dict'])

        self.actor_optimizer.load_state_dict(actor_checkpoint['optimizer'])
        self.critic_optimizer.load_state_dict(critic_checkpoint['optimizer'])

    def save_weights(self, env_name, id):
        state_actor = {'id': id,
                       'state_dict': self.actor.state_dict(),
                       'optimizer': self.actor_optimizer.state_dict()}

        state_critic = {'id': id,
                        'state_dict': self.critic.state_dict(),
                        'optimizer': self.critic_optimizer.state_dict()}

        base_path = os.path.join(env_name, 'a2c_weights')
        if not os.path.exists(base_path):
            os.makedirs(base_path)
        file_name_actor = os.path.join(base_path, 'actor_' + str(id) + '.pth.tar')
        file_name_critic = os.path.join(
            base_path, 'critic_' + str(id) + '.pth.tar')

        th.save(state_actor, file_name_actor)
        th.save(state_critic, file_name_critic)

    def load_weights(self, actor_weight_file, critic_weight_file):
        print("=> loading checkpoint '{}, {}'".format(
            actor_weight_file, critic_weight_file))
        actor_checkpoint = th.load(actor_weight_file)
        critic_checkpoint = th.load(critic_weight_file)

        self.actor.load_state_dict(actor_checkpoint['state_dict'])
        self.critic.load_state_dict(critic_checkpoint['state_dict'])

        self.actor_optimizer.load_state_dict(actor_checkpoint['optimizer'])
        self.critic_optimizer.load_state_dict(critic_checkpoint['optimizer'])

        print("=> loaded checkpoint '{}, {}'".format(
            actor_weight_file, critic_weight_file))
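This A2C variant separates loss computation (get_loss) from the optimizer step (update_net) and adds checkpointing. A hedged usage sketch, assuming the same kind of Gym environment as in the earlier sketch and an illustrative agent id, could look like this:

# Hedged usage sketch for the get_loss / update_net split and checkpointing
# (the environment, loop length, and agent id are illustrative assumptions).
import gym

env = gym.make("CartPole-v0")
a2c = A2C(env, env.observation_space.shape[0], env.action_space.n,
          episodes_before_train=50, use_cuda=False)

for _ in range(1000):
    a2c.interact()
    losses = a2c.get_loss()            # {'actor_loss': ..., 'critic_loss': ...}
    if losses is not None:
        a2c.update_net(losses)         # backward passes + clipped optimizer steps

a2c.save_weights("CartPole-v0", id=0)  # writes CartPole-v0/a2c_weights/*.pth.tar
state = a2c.get_weights()              # in-memory checkpoint dicts
a2c.set_weights(state)                 # restore from the same dicts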