Code Example #1
File: td3.py  Project: TomoyaAkiyama/TD3
class TD3RolloutActor:
    def __init__(self, state_dim, action_dim, action_max, exploration_noise):
        self.actor = Actor(state_dim, 256, action_dim, action_max).eval()
        self.exploration_noise = exploration_noise

    def select_action(self, state):
        state = torch.tensor(state.reshape(1, -1), dtype=torch.float)
        action = self.actor.forward(state)
        noise = torch.randn_like(action) * self.exploration_noise
        action = action + noise
        return action.cpu().detach().numpy().flatten()

    def deterministic_action(self, state):
        state = torch.tensor(state.reshape(1, -1), dtype=torch.float)
        action = self.actor.forward(state)
        return action.cpu().detach().numpy().flatten()

    def parameters(self):
        return self.actor.parameters()
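
A minimal usage sketch for TD3RolloutActor, assuming the project's Actor is a torch.nn.Module constructed as Actor(state_dim, hidden_dim, action_dim, action_max) that maps a state batch to bounded actions. SimpleActor below is a hypothetical stand-in for illustration, not the project's own network.

import numpy as np
import torch
import torch.nn as nn

class SimpleActor(nn.Module):
    # Hypothetical stand-in with the constructor signature the rollout actor expects.
    def __init__(self, state_dim, hidden_dim, action_dim, action_max):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, action_dim), nn.Tanh(),
        )
        self.action_max = action_max

    def forward(self, state):
        return self.action_max * self.net(state)

Actor = SimpleActor  # bind the Actor name so TD3RolloutActor resolves it standalone
rollout = TD3RolloutActor(state_dim=3, action_dim=1, action_max=1.0,
                          exploration_noise=0.1)
state = np.zeros(3, dtype=np.float32)
print(rollout.select_action(state))         # noisy action for exploration
print(rollout.deterministic_action(state))  # raw policy output for evaluation
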
Code Example #2
class DDPGAgent:
    def __init__(self,
                 state_space_dim,
                 action_space_dim,
                 min_action_val,
                 max_action_val,
                 hidden_layer_size=512,
                 gamma=0.99,
                 tau=0.0001,
                 path_to_load=None):
        self.gamma = gamma
        self.tau = tau
        self.min_action_val = min_action_val
        self.max_action_val = max_action_val
        self.buffer = Buffer(state_space_dim, action_space_dim)
        self.noise_generator = GaussianNoise(0., 0.2, action_space_dim)

        self.actor = Actor(state_space_dim, action_space_dim, max_action_val,
                           hidden_layer_size)
        self.critic = Critic(state_space_dim, action_space_dim,
                             hidden_layer_size)

        if path_to_load is not None:
            if os.path.exists(path_to_load + "_actor.h5") and \
                    os.path.exists(path_to_load + "_critic.h5"):
                self.load(path_to_load)

        self.target_actor = Actor(state_space_dim, action_space_dim,
                                  max_action_val, hidden_layer_size)
        self.target_critic = Critic(state_space_dim, action_space_dim,
                                    hidden_layer_size)

        self.target_actor.model.set_weights(self.actor.model.get_weights())
        self.target_critic.model.set_weights(self.critic.model.get_weights())

        critic_lr = 0.002
        actor_lr = 0.001

        self.critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
        self.actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

    @tf.function
    def _apply_gradients(self, states, actions, next_states, rewards):
        with tf.GradientTape() as tape:
            target_actions = self.target_actor.forward(next_states)
            y = tf.cast(rewards, tf.float32) + self.gamma * self.target_critic.forward(
                [next_states, target_actions])
            critic_value = self.critic.forward([states, actions])
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss,
                                    self.critic.model.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.model.trainable_variables))

        with tf.GradientTape() as tape:
            actions = self.actor.forward(states)
            critic_value = self.critic.forward([states, actions])
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss,
                                   self.actor.model.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.model.trainable_variables))

    def learn(self):
        states, actions, next_states, rewards = self.buffer.sample()
        self._apply_gradients(states, actions, next_states, rewards)

    def remember_step(self, info):
        self.buffer.remember(info)

    def update_targets(self):
        new_weights = []
        target_variables = self.target_critic.model.weights
        for i, variable in enumerate(self.critic.model.weights):
            new_weights.append(variable * self.tau + target_variables[i] *
                               (1 - self.tau))

        self.target_critic.model.set_weights(new_weights)

        new_weights = []
        target_variables = self.target_actor.model.weights
        for i, variable in enumerate(self.actor.model.weights):
            new_weights.append(variable * self.tau + target_variables[i] *
                               (1 - self.tau))

        self.target_actor.model.set_weights(new_weights)

    def get_best_action(self, state):
        tf_state = tf.expand_dims(tf.convert_to_tensor(state), 0)
        return tf.squeeze(self.actor.forward(tf_state)).numpy()

    def get_action(self, state):
        actions = self.get_best_action(
            state) + self.noise_generator.get_noise()
        return np.clip(actions, self.min_action_val, self.max_action_val)

    def save(self, path):
        print(f"Model has been saved as '{path}'")
        self.actor.save(path)
        self.critic.save(path)

    def load(self, path):
        print(f"Model has been loaded from '{path}'")
        self.actor.load(path)
        self.critic.load(path)
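
The target networks above track the online networks through a soft (Polyak) update: target <- tau * online + (1 - tau) * target, as in update_targets. The snippet below is a self-contained sketch of that rule on two toy Keras models; they are stand-ins, not the agent's Actor and Critic.

import tensorflow as tf

tau = 0.0001
online = tf.keras.Sequential([tf.keras.Input(shape=(3,)), tf.keras.layers.Dense(4)])
target = tf.keras.Sequential([tf.keras.Input(shape=(3,)), tf.keras.layers.Dense(4)])
target.set_weights(online.get_weights())  # hard copy at initialization

# After each learning step, nudge the target weights toward the online weights.
new_weights = [tau * w + (1 - tau) * tw
               for w, tw in zip(online.get_weights(), target.get_weights())]
target.set_weights(new_weights)
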
Code Example #3
class Agent(object):
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
                 mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
                 critic_l2_size, batch_size):

        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(mem_size, n_states, n_actions)
        self.batch_size = batch_size

        self.actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                           actor_l2_size)
        self.critic = Critic(lr_critic, n_states, n_actions, critic_l1_size,
                             critic_l2_size)

        self.target_actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                                  actor_l2_size)
        self.target_critic = Critic(lr_critic, n_states, n_actions,
                                    critic_l1_size, critic_l2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)

        # add noise to action - for exploration
        mu_prime = mu + torch.tensor(self.noise(), dtype=torch.float).to(
            self.actor.device)
        self.actor.train()

        return mu_prime.cpu().detach().numpy()

    def choose_action_no_train(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)

        return mu.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.push(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.idx_last < self.batch_size:
            # not enough data in replay buffer
            return

        # select random events
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)

        reward = torch.tensor(reward, dtype=torch.float).to(self.critic.device)
        done = torch.tensor(done).to(self.critic.device)
        new_state = torch.tensor(new_state,
                                 dtype=torch.float).to(self.critic.device)
        action = torch.tensor(action, dtype=torch.float).to(self.critic.device)
        state = torch.tensor(state, dtype=torch.float).to(self.critic.device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()
        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state, target_actions)
        critic_value = self.critic.forward(state, action)

        target = []
        for j in range(self.batch_size):
            target.append(reward[j] + self.gamma * critic_value_[j] * done[j])
        target = torch.tensor(target).to(self.critic.device)
        target = target.view(self.batch_size, 1)

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.critic.eval()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        self.actor.train()
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = torch.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + \
                                      (1-tau)*target_critic_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)

        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                                      (1-tau)*target_actor_dict[name].clone()
        self.target_actor.load_state_dict(actor_state_dict)

    def save_models(self):
        timestamp = time.strftime("%Y%m%d-%H%M%S")

        self.actor.save("actor_" + timestamp)
        self.target_actor.save("target_actor_" + timestamp)
        self.critic.save("critic_" + timestamp)
        self.target_critic.save("target_critic_" + timestamp)

    def load_models(self, fn_actor, fn_target_actor, fn_critic,
                    fn_target_critic):
        self.actor.load_checkpoint(fn_actor)
        self.target_actor.load_checkpoint(fn_target_actor)
        self.critic.load_checkpoint(fn_critic)
        self.target_critic.load_checkpoint(fn_target_critic)
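
The per-sample loop in learn() builds the TD target y_j = r_j + gamma * Q'(s'_j, mu'(s'_j)) * mask_j. The snippet below is a vectorized sketch of the same computation on toy tensors; it assumes, as the `* done[j]` factor suggests, that `done` is stored as a continuation mask (1.0 for non-terminal transitions, 0.0 for terminal ones).

import torch

gamma = 0.99
reward = torch.tensor([1.0, 0.5, -0.2])
done = torch.tensor([1.0, 1.0, 0.0])                 # continuation mask
critic_value_ = torch.tensor([[2.0], [1.5], [3.0]])  # toy Q'(s', mu'(s')) values

# target <- r + gamma * Q'(s', mu'(s')) * mask, shaped (batch_size, 1)
target = reward.view(-1, 1) + gamma * critic_value_ * done.view(-1, 1)
print(target)
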
Code Example #4
File: td3.py  Project: TomoyaAkiyama/TD3
class TD3:
    def __init__(self,
                 device,
                 state_dim,
                 action_dim,
                 action_max,
                 gamma=0.99,
                 tau=0.005,
                 lr=3e-4,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 exploration_noise=0.1,
                 policy_freq=2):

        self.actor = Actor(state_dim, 256, action_dim, action_max).to(device)
        self.target_actor = copy.deepcopy(self.actor)
        self.actor_optimizer = optim.Adam(params=self.actor.parameters(),
                                          lr=lr)
        self.critic = Critic(state_dim, 256, action_dim).to(device)
        self.target_critic = copy.deepcopy(self.critic)
        self.critic_optimizer = optim.Adam(params=self.critic.parameters(),
                                           lr=lr)

        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.rollout_actor = TD3RolloutActor(state_dim, action_dim, action_max,
                                             exploration_noise)
        self.sync_rollout_actor()

        self.iteration_num = 0

    def train(self, replay_buffer, batch_size=256):
        self.iteration_num += 1

        st, nx_st, ac, rw, mask = replay_buffer.sample(batch_size)
        with torch.no_grad():
            noise = (torch.randn_like(ac) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)
            nx_ac = self.target_actor.forward(nx_st, noise)

            target_q1, target_q2 = self.target_critic.forward(nx_st, nx_ac)
            min_q = torch.min(target_q1, target_q2)
            target_q = rw + mask * self.gamma * min_q

        q1, q2 = self.critic.forward(st, ac)
        critic_loss = F.mse_loss(q1, target_q) + F.mse_loss(q2, target_q)
        self.critic.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        if self.iteration_num % self.policy_freq == 0:
            actor_loss = -self.critic.q1(st, self.actor.forward(st)).mean()
            self.actor.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            for param, target_param in zip(self.critic.parameters(),
                                           self.target_critic.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.target_actor.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

        self.sync_rollout_actor()

    def sync_rollout_actor(self):
        for param, target_param in zip(self.actor.parameters(),
                                       self.rollout_actor.parameters()):
            target_param.data.copy_(param.data.cpu())

    def save(self, path):
        torch.save(self.critic.state_dict(), os.path.join(path, 'critic.pth'))
        torch.save(self.target_critic.state_dict(),
                   os.path.join(path, 'target_critic.pth'))
        torch.save(self.critic_optimizer.state_dict(),
                   os.path.join(path, 'critic_optimizer.pth'))

        torch.save(self.actor.state_dict(), os.path.join(path, 'actor.pth'))
        torch.save(self.target_actor.state_dict(),
                   os.path.join(path, 'target_actor.pth'))
        torch.save(self.actor_optimizer.state_dict(),
                   os.path.join(path, 'actor_optimizer.pth'))

    def load(self, path):
        self.critic.load_state_dict(
            torch.load(os.path.join(path, 'critic.pth')))
        self.target_critic.load_state_dict(
            torch.load(os.path.join(path, 'target_critic.pth')))
        self.critic_optimizer.load_state_dict(
            torch.load(os.path.join(path, 'critic_optimizer.pth')))

        self.actor.load_state_dict(torch.load(os.path.join(path, 'actor.pth')))
        self.target_actor.load_state_dict(
            torch.load(os.path.join(path, 'target_actor.pth')))
        self.actor_optimizer.load_state_dict(
            torch.load(os.path.join(path, 'actor_optimizer.pth')))
        self.sync_rollout_actor()
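
The core of train() is the clipped double-Q target: Gaussian noise clamped to [-noise_clip, noise_clip] perturbs the target policy's action, and the smaller of the two target critic values forms the bootstrap term. The snippet below reproduces that target computation on toy tensors; the random q-values are stand-ins for the project's target critic outputs.

import torch

policy_noise, noise_clip, gamma = 0.2, 0.5, 0.99
ac = torch.zeros(4, 1)                         # batch of actions
rw = torch.ones(4, 1)                          # rewards
mask = torch.tensor([[1.], [1.], [0.], [1.]])  # 0 marks terminal transitions

noise = (torch.randn_like(ac) * policy_noise).clamp(-noise_clip, noise_clip)
target_q1 = torch.rand(4, 1)                   # stand-in for target critic head 1
target_q2 = torch.rand(4, 1)                   # stand-in for target critic head 2
min_q = torch.min(target_q1, target_q2)        # clipped double-Q
target_q = rw + mask * gamma * min_q           # same formula as in train()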