import numpy as np

# Assumes that Actor, Critic, OUNoise and ReplayBuffer are provided by
# companion modules (Keras-based model wrappers, an Ornstein-Uhlenbeck noise
# process and a simple experience-replay buffer).


class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.name = "DDPG"
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 'actor_local')
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  'actor_target')

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   'critic_local')
        self.critic_target = Critic(self.state_size, self.action_size,
                                    'critic_target')

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process (Ornstein-Uhlenbeck): mu is the long-run mean,
        # theta the mean-reversion rate and sigma the noise scale
        self.exploration_mu = 0.0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Reward counter
        self.total_reward = 0
        self.n_steps = 0

    def load(self):
        self.actor_local.load()
        self.actor_target.load()
        self.critic_local.load()
        self.critic_target.load()
        print("Agent's weights loaded from disk.")

    def save(self):
        self.actor_local.save()
        self.actor_target.save()
        self.critic_local.save()
        self.critic_target.save()
        print("Agent's weights saved to disk.")

    def reset_episode(self):
        self.total_reward = 0
        self.n_steps = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        # Add reward to total
        self.total_reward += reward
        self.n_steps += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, add_noise=True):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Hack (disabled): rescale rotor revs into a band around their mean
        # rev_mean = np.mean(action)
        # action = (action-450)/450
        # action *= 50
        # action += rev_mean

        if add_noise:
            action += self.noise.sample()  # additive noise for exploration
        return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
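        # The trailing 0 / 1 values passed below are presumably the Keras
        # learning-phase flag (0 = inference, 1 = training) expected by the
        # custom backend functions get_action_gradients and train_fn.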
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft-update model parameters: target = tau * local + (1 - tau) * target."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = [self.tau * lw + (1.0 - self.tau) * tw
                       for lw, tw in zip(local_weights, target_weights)]
        target_model.set_weights(new_weights)
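

# Usage sketch (added for illustration, not part of the original example): a
# minimal training loop for the DDPG agent above. It assumes a task object
# exposing state_size, action_size, action_low, action_high, reset() and
# step(action) -> (next_state, reward, done); the function name and episode
# count are hypothetical placeholders.
def run_ddpg(task, num_episodes=500):
    agent = DDPG(task)
    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()  # resets the task, noise and reward counters
        done = False
        while not done:
            action = agent.act(state)  # policy action plus OU exploration noise
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)  # store and (maybe) learn
            state = next_state
        print("Episode {:4d}  reward: {:8.3f}".format(i_episode,
                                                      agent.total_reward))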


Example #2

import numpy as np
import torch

# Assumes that PyTorch-based Actor and Critic classes (with get_policy_action,
# get_value, update and save methods) are provided by companion modules.

class Agent:
    def __init__(self, env, gamma, gae_lambda, batch_size, lr_rate,
                 ratio_clipping, epochs):

        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]

        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.batch_size = batch_size
        self.epochs = epochs

        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           lr_rate[0], ratio_clipping)
        self.critic = Critic(self.state_dim, lr_rate[1])

        self.save_epi_reward = []

    def gae_target(self, rewards, v_values, next_v_value, done):
        n_step_targets = torch.zeros_like(rewards)
        gae = torch.zeros_like(rewards)
        gae_cumulative = 0.
        forward_val = 0.

        if not done:
            forward_val = next_v_value

        # Backward pass over the trajectory (Generalized Advantage Estimation):
        #   delta_k  = r_k + gamma * V(s_{k+1}) - V(s_k)
        #   A_k      = delta_k + gamma * lambda * A_{k+1}
        #   target_k = A_k + V(s_k)
        for k in reversed(range(0, len(rewards))):
            delta = rewards[k] + self.gamma * forward_val - v_values[k]
            gae_cumulative = self.gamma * self.gae_lambda * gae_cumulative + delta
            gae[k] = gae_cumulative
            forward_val = v_values[k]
            n_step_targets[k] = gae[k] + v_values[k]

        return gae, n_step_targets

    def unpack_batch(self, batch):
        # Concatenate a list of (1, dim) tensors into a single (N, dim) tensor
        return torch.cat(batch, dim=0)

    def train(self, max_episode_num, save_path, save_names):
        batch_state, batch_action, batch_reward = [], [], []
        batch_log_old_policy_pdf = []

        for episode in range(max_episode_num):
            time, episode_reward, done = 0, 0, False
            state = self.env.reset()
            state = torch.from_numpy(state).type(torch.FloatTensor)

            while not done:
                #env.render()
                mu_old, std_old, action = self.actor.get_policy_action(state)
                action = np.array([action.item()])
                mu_old = np.array([mu_old.item()])
                std_old = np.array([std_old.item()])
                action = np.clip(action, -self.action_bound, self.action_bound)

                # Log-density of the sampled action under the old (behaviour)
                # Gaussian policy, used later to form the PPO importance ratio.
                var_old = std_old**2
                log_old_policy_pdf = -0.5 * (
                    action - mu_old)**2 / var_old - 0.5 * np.log(
                        var_old * 2 * np.pi)
                log_old_policy_pdf = np.sum(log_old_policy_pdf)

                next_state, reward, done, _ = self.env.step(action)
                next_state = torch.from_numpy(next_state).type(
                    torch.FloatTensor)
                action = torch.from_numpy(action).type(torch.FloatTensor)
                reward = torch.FloatTensor([reward])
                log_old_policy_pdf = torch.FloatTensor([log_old_policy_pdf])

                state = state.view(1, self.state_dim)
                next_state = next_state.view(1, self.state_dim)
                action = action.view(1, self.action_dim)
                reward = reward.view(1, 1)
                log_old_policy_pdf = log_old_policy_pdf.view(1, 1)

                batch_state.append(state)
                batch_action.append(action)
                batch_reward.append((reward + 8) / 8)  # Pendulum-style reward shaping: roughly [-16, 0] -> [-1, 1]
                batch_log_old_policy_pdf.append(log_old_policy_pdf)

                if len(batch_state) < self.batch_size:
                    state = next_state[0]
                    episode_reward += reward[0]
                    time += 1
                    continue

                states = self.unpack_batch(batch_state)
                actions = self.unpack_batch(batch_action)
                rewards = self.unpack_batch(batch_reward)
                log_old_policy_pdfs = self.unpack_batch(
                    batch_log_old_policy_pdf)
                batch_state, batch_action, batch_reward = [], [], []
                batch_log_old_policy_pdf = []

                v_values = self.critic.get_value(states)
                next_v_value = self.critic.get_value(next_state)
                gaes, y_i = self.gae_target(rewards, v_values, next_v_value,
                                            done)

                for _ in range(self.epochs):
                    self.actor.update(states, actions, gaes,
                                      log_old_policy_pdfs)
                    self.critic.update(states, y_i)

                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            self.save_epi_reward.append(episode_reward.item())

            if len(self.save_epi_reward) < 20:
                print('Episode:', episode + 1, 'Time:',
                      time, 'Reward(ave of recent20):',
                      np.mean(self.save_epi_reward))
            else:
                print('Episode:', episode + 1, 'Time:', time,
                      'Reward(ave of recent20):',
                      np.mean(self.save_epi_reward[-20:]))

            if episode % 10 == 0:
                self.actor.save(save_path, save_names[0])
                self.critic.save(save_path, save_names[1])
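

# Usage sketch (added for illustration, not part of the original example): how
# this PPO-style agent might be driven on a continuous-control Gym task. The
# environment name, hyperparameters and file names below are assumptions, and
# the code relies on the old Gym API (env.reset() returning only the state),
# as the train() loop above does.
def run_ppo_example():
    import gym  # assumed dependency
    env = gym.make('Pendulum-v0')
    agent = Agent(env,
                  gamma=0.95,
                  gae_lambda=0.9,
                  batch_size=32,
                  lr_rate=(1e-4, 1e-3),  # (actor learning rate, critic learning rate)
                  ratio_clipping=0.2,
                  epochs=10)
    agent.train(max_episode_num=1000,
                save_path='./save_weights',
                save_names=('ppo_actor.pt', 'ppo_critic.pt'))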


Example #3

import numpy as np
import torch

# Assumes that PyTorch-based Actor, Critic and ReplayBuffer classes (with
# target networks and predict/train/save methods) are provided by companion
# modules.

class Agent:
    def __init__(self, env, gamma, batch_size, buffer_size, lr_rate, tau):

        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]

        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           lr_rate[0], tau)
        self.critic = Critic(self.state_dim, self.action_dim, lr_rate[1], tau)

        self.buffer = ReplayBuffer(self.buffer_size)
        self.save_epi_reward = []

    def ou_noise(self, x, rho=0.15, mu=0., dt=1e-1, sigma=0.2, dim=1):
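        # Discretised Ornstein-Uhlenbeck step:
        #   x_{t+1} = x_t + rho * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)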
        rho = torch.FloatTensor([rho])
        mu = torch.FloatTensor([mu])
        dt = torch.FloatTensor([dt])
        return x + rho * (mu - x) * dt + torch.sqrt(dt) * torch.normal(
            0., sigma, size=(dim, ))

    def td_target(self, rewards, q_values, dones):
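        # One-step TD targets: y_i = r_i for terminal transitions,
        # otherwise y_i = r_i + gamma * Q_target(s'_i, a'_i).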
        y_k = torch.zeros(q_values.shape)

        for i in range(q_values.shape[0]):
            if dones[i]:
                y_k[i] = rewards[i]
            else:
                y_k[i] = rewards[i] + self.gamma * q_values[i]
        return y_k

    def train(self, max_episode_num, save_path, save_names):
        self.actor.update_target_network()
        self.critic.update_target_network()

        for episode in range(max_episode_num):
            time, episode_reward, done = 0, 0, False
            state = self.env.reset()
            state = torch.from_numpy(state).type(torch.FloatTensor)

            pre_noise = torch.zeros(self.action_dim)

            while not done:
                #env.render()
                action = self.actor.predict(state)[0]
                noise = self.ou_noise(pre_noise, dim=self.action_dim)

                action = np.array([action.item()])
                action = np.clip(action, -self.action_bound, self.action_bound)

                next_state, reward, done, _ = self.env.step(action)
                next_state = torch.from_numpy(next_state).type(
                    torch.FloatTensor)
                action = torch.from_numpy(action).type(torch.FloatTensor)
                reward = torch.FloatTensor([reward])
                train_reward = torch.FloatTensor([(reward + 8) / 8])

                state = state.view(1, self.state_dim)
                next_state = next_state.view(1, self.state_dim)
                action = action.view(1, self.action_dim)
                reward = reward.view(1, 1)
                train_reward = train_reward.view(1, 1)  # keep the shaped reward that goes into the replay buffer

                self.buffer.add_buffer(state, action, train_reward, next_state,
                                       done)
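                # Start updating the networks only once the buffer reports
                # more than 1000 stored transitions.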
                if self.buffer.buffer_size > 1000:
                    states, actions, rewards, next_states, dones = self.buffer.sample_batch(
                        self.batch_size)

                    actions_ = self.actor.target_predict(next_states)
                    actions_ = actions_.view(next_states.shape[0],
                                             self.action_dim)
                    target_qs = self.critic.target_predict(
                        next_states, actions_)
                    y_i = self.td_target(rewards, target_qs, dones)
                    self.critic.train(states, actions, y_i)

                    # Deterministic policy gradient step: update the actor to
                    # maximise the critic's Q estimate of its own actions.
                    s_actions = self.actor.predict(states)
                    policy_loss = self.critic.predict(states, s_actions)
                    self.actor.train(policy_loss)

                    self.actor.update_target_network()
                    self.critic.update_target_network()

                pre_noise = noise
                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            self.save_epi_reward.append(episode_reward.item())

            if len(self.save_epi_reward) < 20:
                print('Episode:', episode + 1, 'Time:',
                      time, 'Reward(ave of recent20):',
                      np.mean(self.save_epi_reward))
            else:
                print('Episode:', episode + 1, 'Time:', time,
                      'Reward(ave of recent20):',
                      np.mean(self.save_epi_reward[-20:]))

            if episode % 10 == 0:
                self.actor.save(save_path, save_names[0])
                self.critic.save(save_path, save_names[1])
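

# Usage sketch (added for illustration, not part of the original example),
# mirroring the PPO sketch above but with DDPG-specific arguments; the
# environment name, hyperparameters and file names are assumptions.
def run_ddpg_example():
    import gym  # assumed dependency
    env = gym.make('Pendulum-v0')
    agent = Agent(env,
                  gamma=0.95,
                  batch_size=32,
                  buffer_size=20000,
                  lr_rate=(1e-4, 1e-3),  # (actor learning rate, critic learning rate)
                  tau=0.001)
    agent.train(max_episode_num=200,
                save_path='./save_weights',
                save_names=('ddpg_actor.pt', 'ddpg_critic.pt'))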