import numpy as np

# Actor, Critic, OUNoise, and ReplayBuffer are assumed to be defined elsewhere in the project.

class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
        
    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add OU noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)   
        

    def soft_update(self, local_model, target_model):
        """Soft-update model parameters: target = tau * local + (1 - tau) * target."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = [self.tau * lw + (1 - self.tau) * tw
                       for lw, tw in zip(local_weights, target_weights)]
        target_model.set_weights(new_weights)
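
A minimal sketch of the OUNoise and ReplayBuffer helpers assumed above (names and interfaces are inferred from how the DDPG class uses them; the actual project code may differ):

import random
from collections import deque, namedtuple

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""
    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state += dx
        return self.state


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly at random."""
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Add a new experience tuple to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        """Return the current number of stored experiences."""
        return len(self.memory)
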
Example no. 2
import numpy as np

# Actor, Critic, ReplayBuffer, OUNoise and the hyperparameter constants
# (BUFFER_SIZE, BATCH_SIZE, GAMMA, REPLAY_START) are assumed to be defined elsewhere.

class Agent:
    def __init__(self, env, sess):
        # Environment
        self.n_state = env.observation_space.shape[0]
        self.n_action = env.action_space.shape[0]

        # Neural Networks
        self.sess = sess
        self.actor = Actor(self.sess, self.n_state, self.n_action)
        self.critic = Critic(self.sess, self.n_state, self.n_action)

        # Replay Buffer
        self.replay_buffer = ReplayBuffer(BUFFER_SIZE)
        # Ornstein-Uhlenbeck Noise
        self.exploration_noise = OUNoise(self.n_action)

    def noise_action(self, state):
        '''Get action with noise'''
        return self.action(state) + self.exploration_noise.noise()

    def action(self, state):
        '''Get action from online actor'''
        return self.actor.action(state)

    def train(self):
        '''Train Networks'''
        # Draw sample from Replay Buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([d[0] for d in minibatch])
        action_batch = np.asarray([d[1] for d in minibatch])
        reward_batch = np.asarray([d[2] for d in minibatch])
        next_state_batch = np.asarray([d[3] for d in minibatch])
        done_batch = np.asarray([d[4] for d in minibatch])

        # Train Critic
        next_action_batch = self.actor.target_actions(next_state_batch)
        target_q_value_batch = self.critic.target_q(next_state_batch,
                                                    next_action_batch)
        # q = r if done else r + gamma * target_q
        done_mask = 1. - done_batch.reshape(BATCH_SIZE, 1).astype(float)
        q_batch = reward_batch.reshape(BATCH_SIZE, 1) + done_mask * GAMMA * target_q_value_batch
        self.critic.train(q_batch, state_batch, action_batch)

        # Train Actor
        action_batch_grads = self.actor.actions(state_batch)
        q_grads_batch = self.critic.gradients(state_batch, action_batch_grads)
        self.actor.train(q_grads_batch, state_batch)

        # Slowly update Target Networks
        self.actor.update_target()
        self.critic.update_target()

    def perceive(self, state, action, reward, next_state, done):
        '''Add a transition to the replay buffer and train once enough transitions have been collected'''
        # Add samples
        self.replay_buffer.add(state, action, reward, next_state, done)
        # Train once there are enough samples
        if self.replay_buffer.count() > REPLAY_START:
            self.train()
        # Reset the noise process for the next episode
        if done:
            self.exploration_noise.reset()
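
A training loop for this Agent might look like the sketch below. The environment name, episode/step counts and the explicit variable initialization are illustrative assumptions; the agent only needs a continuous-control gym environment and a TensorFlow 1.x session.

import gym
import tensorflow as tf

EPISODES = 1000
MAX_STEPS = 1000

env = gym.make("Pendulum-v0")  # any env with Box observation and action spaces
with tf.Session() as sess:
    agent = Agent(env, sess)
    sess.run(tf.global_variables_initializer())  # may already happen inside Actor/Critic

    for episode in range(EPISODES):
        state = env.reset()
        episode_reward = 0.0
        for _ in range(MAX_STEPS):
            action = agent.noise_action(state)  # exploratory action with OU noise
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)  # store and (maybe) train
            state = next_state
            episode_reward += reward
            if done:
                break
        print("Episode {}: reward {:.2f}".format(episode, episode_reward))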