Example #1
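A DDPG agent built on TensorFlow 1.x sessions: actor and critic networks with target copies, Ornstein-Uhlenbeck exploration noise, a replay buffer, and Polyak-averaged target updates. The ReplayBuffer, Actor, Critic and OUActionNoise classes are defined elsewhere in the same project.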
class Agent(object):
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 env,
                 gamma=0.99,
                 max_size=10000,
                 layer1_size=400,
                 layer2_size=300,
                 batch_size=64):
        n_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.sess = tf.Session()

        self.actor = Actor(alpha,
                           n_actions,
                           'Actor',
                           input_dims,
                           self.sess,
                           layer1_size,
                           layer2_size,
                           env.action_space.high,
                           self.batch_size,
                           ckpt_dir='tmp/ddpg/actor')

        self.critic = Critic(beta,
                             n_actions,
                             'Critic',
                             input_dims,
                             self.sess,
                             layer1_size,
                             layer2_size,
                             self.batch_size,
                             ckpt_dir='tmp/ddpg/critic')

        self.target_actor = Actor(alpha,
                                  n_actions,
                                  'TargetActor',
                                  input_dims,
                                  self.sess,
                                  layer1_size,
                                  layer2_size,
                                  env.action_space.high,
                                  self.batch_size,
                                  ckpt_dir='tmp/ddpg/target_actor')

        self.target_critic = Critic(beta,
                                    n_actions,
                                    'TargetCritic',
                                    input_dims,
                                    self.sess,
                                    layer1_size,
                                    layer2_size,
                                    self.batch_size,
                                    ckpt_dir='tmp/ddpg/target_critic')

        # Ornstein-Uhlenbeck process: temporally correlated exploration noise
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        # Soft-update ops: Polyak-average the online weights into the targets,
        # theta_target <- tau * theta_online + (1 - tau) * theta_target.
        self.update_actor = [
            self.target_actor.params[i].assign(
                tf.multiply(self.actor.params[i], self.tau) +
                tf.multiply(self.target_actor.params[i], 1. - self.tau))
            for i in range(len(self.target_actor.params))
        ]

        self.update_critic = [
            self.target_critic.params[i].assign(
                tf.multiply(self.critic.params[i], self.tau) +
                tf.multiply(self.target_critic.params[i], 1. - self.tau))
            for i in range(len(self.target_critic.params))
        ]

        # Hard-copy ops for the initial sync: tau is baked into the soft-update
        # ops at graph-build time, so verbatim copies need their own assigns.
        self.init_actor = [
            self.target_actor.params[i].assign(self.actor.params[i])
            for i in range(len(self.target_actor.params))
        ]

        self.init_critic = [
            self.target_critic.params[i].assign(self.critic.params[i])
            for i in range(len(self.target_critic.params))
        ]

        self.sess.run(tf.global_variables_initializer())

        self.update_target_network_parameters(first=True)

    def update_target_network_parameters(self, first=False):
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                if first:
                    # initial hard copy: target networks start identical to
                    # the online networks (equivalent to tau = 1)
                    self.target_actor.sess.run(self.init_actor)
                    self.target_critic.sess.run(self.init_critic)
                else:
                    # soft (Polyak) update with the configured tau
                    self.target_critic.sess.run(self.update_critic)
                    self.target_actor.sess.run(self.update_actor)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        # print("State[0]: ",state[0].shape)
        # print("State[1]: ",state[1].shape)
        state1 = state[0][np.newaxis, :]
        state2 = state[1][np.newaxis, :]
        state = [state1, state2]
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                mu = self.actor.predict(state)
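        # add temporally correlated OU noise to the deterministic action for exploration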
        noise = self.noise()
        mu_prime = mu + noise

        return mu_prime[0]

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                state, action, reward, new_state, done = \
                                            self.memory.sample_buffer(self.batch_size)
                # target Q-value for new_state, using the target actor's bounded action
                critic_value_ = self.target_critic.predict(
                    new_state, self.target_actor.predict(new_state))

                # Bellman targets; done is assumed to hold the terminal mask
                # (1 - done flag) stored by the replay buffer
                target = []
                for j in range(self.batch_size):
                    target.append(reward[j] +
                                  self.gamma * critic_value_[j] * done[j])

                target = np.reshape(target, (self.batch_size, 1))

                _ = self.critic.train(state, action, target)  #s_i, a_i and y_i

                # a = mu(s_i)
                a_outs = self.actor.predict(state)
                # gradients of Q w.r.t actions
                grads = self.critic.get_action_gradients(state, a_outs)

                self.actor.train(state, grads[0])

                # soft-update the target networks towards the online networks
                self.update_target_network_parameters()

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()
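
The class above only defines the agent. A minimal driver loop might look like the sketch below; it assumes a Gym-style environment that returns the two-part observations choose_action expects, and make_env as well as every hyperparameter shown is an illustrative placeholder rather than part of the original example.

env = make_env()  # hypothetical: any Gym-style env with two-part observations
agent = Agent(alpha=0.0001, beta=0.001,
              input_dims=env.observation_space.shape,
              tau=0.001, env=env)

for episode in range(1000):
    obs = env.reset()
    done, score = False, 0.0
    while not done:
        act = agent.choose_action(obs)                   # deterministic action + OU noise
        new_obs, reward, done, _ = env.step(act)
        agent.remember(obs, act, reward, new_obs, done)  # store the transition
        agent.learn()                                    # one minibatch update per env step
        obs, score = new_obs, score + reward
    print('episode {} score {:.1f}'.format(episode, score))

agent.save_models()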
Example #2
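The same DDPG agent implemented with PyTorch; again, ReplayBuffer, Actor, Critic and OUActionNoise are defined elsewhere. The target networks are kept in sync by Polyak-averaging the state dicts.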
class Agent(object):
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
                 mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
                 critic_l2_size, batch_size):

        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(mem_size, n_states, n_actions)
        self.batch_size = batch_size

        self.actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                           actor_l2_size)
        self.critic = Critic(lr_critic, n_states, n_actions, critic_l1_size,
                             critic_l2_size)

        self.target_actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                                  actor_l2_size)
        self.target_critic = Critic(lr_critic, n_states, n_actions,
                                    critic_l1_size, critic_l2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)

        # add noise to action - for exploration
        mu_prime = mu + torch.tensor(self.noise(), dtype=torch.float).to(
            self.actor.device)
        self.actor.train()

        return mu_prime.cpu().detach().numpy()

    def choose_action_no_train(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)

        return mu.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.push(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.idx_last < self.batch_size:
            # not enough data in replay buffer
            return

        # select random events
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)

        reward = torch.tensor(reward, dtype=torch.float).to(self.critic.device)
        done = torch.tensor(done).to(self.critic.device)
        new_state = torch.tensor(new_state,
                                 dtype=torch.float).to(self.critic.device)
        action = torch.tensor(action, dtype=torch.float).to(self.critic.device)
        state = torch.tensor(state, dtype=torch.float).to(self.critic.device)

        # eval() disables batch-norm/dropout training behaviour while the
        # targets and the current Q-values are computed
        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()
        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state, target_actions)
        critic_value = self.critic.forward(state, action)

        # Bellman targets; done is assumed to hold the terminal mask
        # (1 - done flag) stored by the replay buffer
        target = []
        for j in range(self.batch_size):
            target.append(reward[j] + self.gamma * critic_value_[j] * done[j])
        target = torch.tensor(target).to(self.critic.device)
        target = target.view(self.batch_size, 1)

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.critic.eval()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        self.actor.train()
        # deterministic policy gradient: maximize Q(s, mu(s)) by minimizing its negation
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = torch.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        # Polyak average: theta_target <- tau*theta + (1 - tau)*theta_target
        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + \
                                      (1-tau)*target_critic_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)

        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                                      (1-tau)*target_actor_dict[name].clone()
        self.target_actor.load_state_dict(actor_state_dict)

    def save_models(self):
        timestamp = time.strftime("%Y%m%d-%H%M%S")

        self.actor.save("actor_" + timestamp)
        self.target_actor.save("target_actor_" + timestamp)
        self.critic.save("critic_" + timestamp)
        self.target_critic.save("target_critic_" + timestamp)

    def load_models(self, fn_actor, fn_target_actor, fn_critic,
                    fn_target_critic):
        self.actor.load_checkpoint(fn_actor)
        self.target_actor.load_checkpoint(fn_target_actor)
        self.critic.load_checkpoint(fn_critic)
        self.target_critic.load_checkpoint(fn_target_critic)
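
A minimal training loop for this PyTorch agent might look like the following sketch; the environment (Pendulum-v0) and every constructor argument are illustrative assumptions, since the original snippet does not show how the agent is instantiated.

import gym

env = gym.make('Pendulum-v0')  # any continuous-action Gym task; version name may differ
agent = Agent(n_states=env.observation_space.shape[0],
              n_actions=env.action_space.shape[0],
              lr_actor=0.000025, lr_critic=0.00025,
              tau=0.001, gamma=0.99, mem_size=1000000,
              actor_l1_size=400, actor_l2_size=300,
              critic_l1_size=400, critic_l2_size=300,
              batch_size=64)

for episode in range(1000):
    obs = env.reset()
    done, score = False, 0.0
    while not done:
        act = agent.choose_action(obs)
        new_obs, reward, done, _ = env.step(act)
        agent.remember(obs, act, reward, new_obs, done)
        agent.learn()
        obs, score = new_obs, score + reward
    print('episode {} score {:.1f}'.format(episode, score))

agent.save_models()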