Python OUNoise.reset Examples

Programming Language: Python

Namespace/Package Name: agents.ou_noise

Class/Type: OUNoise

Method/Function: reset

Examples at hotexamples.com: 8

Python OUNoise.reset - 8 examples found. These are the top rated real world Python examples of agents.ou_noise.OUNoise.reset extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

OUNoise(15)

sample(8)

reset(7)

Frequently Used Methods

OUNoise (15)

sample (8)

reset (7)

Example #1

Show file

File: agent.py Project: Bonz07/RL-Quadcopter-2

class Agent():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15 # same direction
        self.exploration_sigma = 0.001 # random noise
        
        #self.exploration_mu = 0
        #self.exploration_theta = 0.15
        #self.exploration_sigma = 0.2
        
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.1  # for soft update of target parameters

        
        # Compute the ongoing top score
        self.top_score = -np.inf
        self.score = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
         # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        # stats
        self.score += reward
        if done:
            if self.score > self.top_score:
                self.top_score = self.score

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

Example #2

Show file

class DDPG_Agent:
    """Reinforcement learning agent that learns through DDPG."""
    def __init__(self, task):
        """Initialize DDPG Agent instance."""
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # Initializing local and target Actor Models
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # Initializing local and target Critic Models
        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Additional Parameters
        self.best_score = -np.inf
        self.total_reward = 0.0
        self.count = 0
        self.score = 0

    def reset_episode(self):
        """Reset episode to initial state."""
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Take a step."""
        self.total_reward += reward
        self.count += 1
        # Save experience/reward
        self.memory.memorize(self.last_state, action, reward, next_state, done)

        # Learn if enough samples are available in memory.
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state):
        """Returns actions for state(s) according to given policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]

        # Add some noise to action for exploration and return
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience
        tuples."""

        self.score = self.total_reward / \
            float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.vstack([e.action for e in experiences
                             if e is not None]).astype(np.float32).reshape(
                                 -1, self.action_size)
        rewards = np.vstack([e.reward for e in experiences if e is not None
                             ]).astype(np.float32).reshape(-1, 1)
        dones = np.vstack([e.done for e in experiences
                           if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, next_actions])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        # [states, actions, 0] 0 is for No learning Phase
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights), "Local and target model parameters must \
            have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights

        target_model.set_weights(new_weights)

Example #3

Show file

File: agent.py Project: vladhc/RL-Quadcopter-2

class Agent():
    def __init__(self, task, sess, stats):
        self.sess = sess
        self.task = task
        self.stats = stats

        tau = 0.01
        learning_rate = 2e-4

        self.critic_local = QNetwork(sess,
                                     task,
                                     stats,
                                     name='critic_local',
                                     hidden_units=64,
                                     dropout_rate=0.2)
        self.critic_target = QNetwork(sess,
                                      task,
                                      stats,
                                      name='critic_target',
                                      hidden_units=64,
                                      dropout_rate=0.2)
        self.actor_local = Policy(sess,
                                  task,
                                  stats,
                                  name='actor_local',
                                  hidden_units=32,
                                  dropout_rate=0.2)
        self.actor_target = Policy(sess,
                                   task,
                                   stats,
                                   name='actor_target',
                                   hidden_units=32,
                                   dropout_rate=0.2)
        soft_copy_critic_ops = self._create_soft_copy_op('critic_local',
                                                         'critic_target',
                                                         tau=tau)
        soft_copy_actor_ops = self._create_soft_copy_op('actor_local',
                                                        'actor_target',
                                                        tau=tau)
        self._soft_copy_ops = []
        self._soft_copy_ops.extend(soft_copy_critic_ops)
        self._soft_copy_ops.extend(soft_copy_actor_ops)

        self.gamma = 0.99  # reward discount rate

        # Exploration noise process
        exploration_mu = 0
        exploration_theta = 0.15
        exploration_sigma = 0.15
        self.noise = OUNoise(task.action_size, exploration_mu,
                             exploration_theta, exploration_sigma)

        # Replay memory
        self.batch_size = 256
        self.memory = ReplayBuffer(buffer_size=10000, decay_steps=1000)

        self.sess.run(tf.global_variables_initializer())

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.memory.decay_a()
        return state

    def step(self, action, reward, next_state, done):
        # Save experience
        self._save_experience(self.last_state, action, reward, next_state,
                              done)

        # Learn, if enough samples are available in memory
        self.learn()

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, explore=False):
        """Returns actions for given state(s) as per current policy."""
        actor = self.actor_local if explore else self.actor_target
        action = actor.act([state], explore)[0]
        assert not np.any(np.isnan(action))

        if explore:
            action = action + self.noise.sample()
            action = np.maximum(action, self.task.action_low)
            action = np.minimum(action, self.task.action_high)

        assert not np.any(np.isnan(action))
        assert np.all(action >= self.task.action_low
                      ), "expected less than {:7.3f}, but was {}".format(
                          task.action_low, action)
        assert np.all(action <= self.task.action_high)

        return action

    def learn(self):
        """Update policy and value parameters using given batch of experience tuples."""

        if len(self.memory) < self.batch_size:
            return

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        experiences, experience_indexes = self.memory.sample(self.batch_size)
        action_size = self.task.action_size
        states = np.vstack([e.state for e in experiences])
        actions = np.array([e.action for e in experiences
                            ]).astype(np.float32).reshape(-1, action_size)
        rewards = np.array([e.reward for e in experiences
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          ]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences])

        # Get predicted next-state actions, Q and V values
        actions_next = self.actor_target.act(next_states)
        Q_targets_next, V_targets_next = self.critic_target.get_q_and_v(
            next_states, actions_next)

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        V_targets = rewards + self.gamma * V_targets_next * (1 - dones)
        td_errs = self.critic_local.learn(states, actions, Q_targets,
                                          V_targets)

        self.memory.update_td_err(experience_indexes, td_errs)

        self.memory.scrape_stats(self.stats)

        # Train actor model
        actions = self.actor_target.act(states)
        action_gradients = self.critic_target.get_action_gradients(
            states, actions)
        self.actor_local.learn(states, action_gradients)

        self._soft_copy()

    def _save_experience(self, state, action, reward, next_state, done):
        """Adds experience into ReplayBuffer. As a side effect, also learns q network on this sample."""
        # Get predicted next-state actions and Q values
        actions_next = self.actor_local.act([next_state])
        Q_targets_next, _ = self.critic_local.get_q_and_v([next_state],
                                                          actions_next)
        Q_target_next = Q_targets_next[0]

        Q_target = reward + self.gamma * Q_target_next * (1 - done)
        td_err = self.critic_local.get_td_err([state], [action], [Q_target])

        self.memory.add(Experience(state, action, reward, next_state, done),
                        td_err)

    def _soft_copy(self):
        self.sess.run(self._soft_copy_ops)

    def _create_soft_copy_op(self, scope_src, scope_dst, tau=0.01):
        var_src = tf.trainable_variables(scope=scope_src)
        var_dst = tf.trainable_variables(scope=scope_dst)
        copy_ops = []
        for src, dst in zip(var_src, var_dst):
            mixed = tau * src + (1.0 - tau) * dst
            copy_op = tf.assign(dst, mixed)
            copy_ops.append(copy_op)
        return copy_ops

Example #4

Show file

class DDPG:
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.001

        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)

        self.gamma = 0.99
        self.tau = 0.1
        self.learning_rate = 0.0005

        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 learning_rate=self.learning_rate)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  learning_rate=self.learning_rate)

        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   learning_rate=self.learning_rate)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    learning_rate=self.learning_rate)

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        self.last_state = self.task.reset()
        return self.last_state

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.total_reward += reward
        self.count += 1

        if self.memory.size() > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        self.last_state = next_state

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

Example #5

Show file

File: agent.py Project: Rababalkhalifa/QuadCop

class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # from plicy search
        
        self.action_range = self.action_high - self.action_low
        
        self.w = np.random.normal(
            size=(self.state_size, self.action_size),  # weights for simple linear policy: state_space x action_space
            scale=(self.action_range / (2 * self.state_size))) # start producing actions in a decent range
        
        # Score tracker and learning parameters
        self.score = -np.inf
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1
        
        #counter
        self.count = 0
        
        
        
        
    def reset_episode(self):
        self.noise.reset()
        self.count = 0
        self.total_reward = 0.0
        self.score = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
         # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        
        self.count += 1
        self.total_reward += reward
        
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state
        
        #from the tutorial SRC
        self.score += reward
        
        if done:
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)  
        
        # from policy search
        
        # Learn by random policy search, using a reward-based score
        # self.score = self.total_reward / float(self.count) if self.count else 0.0
        # if self.score > self.best_score:
        #     self.best_score = self.score
        #     self.best_w = self.w
        #     self.noise_scale = max(0.5 * self.noise_scale, 0.01)
        # else:
        #     self.w = self.best_w
        #     self.noise_scale = min(2.0 * self.noise_scale, 3.2)
        # self.w = self.w + self.noise_scale * np.random.normal(size=self.w.shape)  # equal noise in all directions

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

Example #6

Show file

class AE_DDPG_Agent():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # AE: Although OUNoise gives me a convenient set of randomness for each of the rotors, I still need
        # AE: to make a decision myself on how to apply the randomness and how to manage its magnitude
        # AE: (i.e. my eplore vs exploit strategy). These variables will do that.
        self.explore_start = 1.0  # AE: exploration probability at start
        self.explore_stop = 0.001  # AE: minimum exploration probability
        self.decay_rate = 0.003  # AE: exponential decay rate for exploration prob
        self.magnitude_coeff = 0.1  # AE: a coefficient to limit randomness

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0  # AE: additive to the noise. mu * theta will be directly added

        self.exploration_theta = 0.15  # AE: old noise will be multiplied by this
        self.exploration_sigma = 0.2  # AE: new noise will be multiplied by this
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        # AE: The learning rate. How much we trust the new values compared to the old ones.
        self.tau = 0.0001  # for soft update of target parameters

        # AE: current reward in learning procedure (for statistics)
        self.score = -np.inf

        # Episode variables
        self.reset_episode()

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state

        self.best_score = -np.inf
        self.score = -np.inf
        self.total_reward = 0.0
        self.count = 0

        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        self.total_reward += reward
        self.count += 1

        # AE: Score (average reward in this episode so far) and best score for statistics
        self.score = self.total_reward / float(self.count)
        if self.score > self.best_score:
            self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])

        # AE: directly sampling approximated value from learned action-value function.
        action = self.actor_local.model.predict(state)[0]

        # AE: and adding some noise to that for unpredictability.
        # AE: The magnitude of noise has to drop over time.
        explore_p = self.explore_stop + (self.explore_start -
                                         self.explore_stop) * np.exp(
                                             -self.decay_rate * self.count)
        #self.noise.update_mu(explore_p)
        noise_sample = self.magnitude_coeff * explore_p * self.noise.sample()
        #noise_sample = explore_p * np.random.randn(self.action_size)
        #print("Noi=", s)

        return list(
            action +
            noise_sample * self.action_size)  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        # AE: Updating NN weights directly in the passed model (actor or critic).
        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

Example #7

Show file

File: agent.py Project: srinivascreddy/RLQuadcopter

class RLA():
    """ Reinfocement learning agent"""
    
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        #actor model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        
        #Critic model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        
        #Initialize target model params with local params
        self.critic_target.model.set_weights(
                self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
                self.actor_local.model.get_weights())
        
        #Initialize noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)
        
        #Replay memory Initialization
        self.buffer_size, self.batch_size = 2000000, 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        
        #Initialize algorithm parameters
        self.gamma, self.tau = 0.95, 0.001
        
        #Initialize scores
        self.score, self.best_score = -np.inf, -np.inf
    
    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state
    
    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        
        #Learn from samples in memory if they are greater than batch size
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        
        #Preserve state as last_state
        self.last_state = next_state
        
        #Update score with reward from this step
        self.score += reward
        
        if done:
            #Preserve best score
            if self.score > self.best_score:
                self.best_score = self.score
        
    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())
    
    def learn(self, experiences):
        #Convert experiences seperate arrays
        states = np.vstack([exp.state for exp in experiences if exp is not None])
        actions = np.array([exp.action for exp in experiences if exp is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([exp.reward for exp in experiences if exp is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([exp.done for exp in experiences if exp is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([exp.next_state for exp in experiences if exp is not None])
        
        #predict next_state actions and Q values from target model...
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states,actions], y=Q_targets)
        
        #Train local actor model
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])
        
        #Update target models
        self.update(self.critic_local.model, self.critic_target.model)
        self.update(self.actor_local.model, self.actor_target.model)
        
    def update(self, local_model, target_model):
        """Update model parameters"""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

Example #8

Show file

File: ddpg_agent.py Project: xadahiya/RL-Quadcopter-2

class DDPG_Agent:
    """Reinforcement learning agent that learns through DDPG."""

    def __init__(self, task):
        """Initialize DDPG Agent instance."""
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # Initializing local and target Actor Models
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # Initializing local and target Critic Models
        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Additional Parameters
        self.best_score = -np.inf
        self.total_reward = 0.0
        self.count = 0
        self.score = 0

    def reset_episode(self):
        """Reset episode to initial state."""
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Take a step."""
        self.total_reward += reward
        self.count += 1
        # Save experience/reward
        self.memory.memorize(self.last_state, action, reward, next_state, done)

        # Learn if enough samples are available in memory.
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state):
        """Returns actions for state(s) according to given policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]

        # Add some noise to action for exploration and return
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience
        tuples."""

        self.score = self.total_reward / \
            float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.vstack(
            [e.action for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.vstack(
            [e.reward for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.vstack(
            [e.done for e in experiences if e is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, next_actions])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(
            x=[states, actions], y=Q_targets)

        # Train actor model (local)
        # [states, actions, 0] 0 is for No learning Phase
        action_gradients = np.reshape(self.critic_local.get_action_gradients(
            [states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights), "Local and target model parameters must \
            have the same size"

        new_weights = self.tau*local_weights + (1-self.tau)*target_weights

        target_model.set_weights(new_weights)