Code example #1
File: agent.py  Project: Bonz07/RL-Quadcopter-2
class Agent():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15  # rate of mean reversion
        self.exploration_sigma = 0.001  # scale of the random noise
        
        #self.exploration_mu = 0
        #self.exploration_theta = 0.15
        #self.exploration_sigma = 0.2
        
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.1  # for soft update of target parameters

        
        # Track the best episode score seen so far
        self.top_score = -np.inf
        self.score = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        # stats
        self.score += reward
        if done:
            if self.score > self.top_score:
                self.top_score = self.score

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
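
The Actor, Critic, ReplayBuffer, and OUNoise classes above come from elsewhere in the project and are not shown. As a point of reference only, a minimal Ornstein-Uhlenbeck noise process compatible with the constructor and methods used here (size, mu, theta, sigma; reset(); sample()) could be sketched as follows; this is an illustrative assumption, not the project's actual code.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process (sketch matching the interface used above)."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state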
Code example #2
File: agent.py  Project: vladhc/RL-Quadcopter-2
class Agent():
    def __init__(self, task, sess, stats):
        self.sess = sess
        self.task = task
        self.stats = stats

        tau = 0.01
        learning_rate = 2e-4

        self.critic_local = QNetwork(sess,
                                     task,
                                     stats,
                                     name='critic_local',
                                     hidden_units=64,
                                     dropout_rate=0.2)
        self.critic_target = QNetwork(sess,
                                      task,
                                      stats,
                                      name='critic_target',
                                      hidden_units=64,
                                      dropout_rate=0.2)
        self.actor_local = Policy(sess,
                                  task,
                                  stats,
                                  name='actor_local',
                                  hidden_units=32,
                                  dropout_rate=0.2)
        self.actor_target = Policy(sess,
                                   task,
                                   stats,
                                   name='actor_target',
                                   hidden_units=32,
                                   dropout_rate=0.2)
        soft_copy_critic_ops = self._create_soft_copy_op('critic_local',
                                                         'critic_target',
                                                         tau=tau)
        soft_copy_actor_ops = self._create_soft_copy_op('actor_local',
                                                        'actor_target',
                                                        tau=tau)
        self._soft_copy_ops = []
        self._soft_copy_ops.extend(soft_copy_critic_ops)
        self._soft_copy_ops.extend(soft_copy_actor_ops)

        self.gamma = 0.99  # reward discount rate

        # Exploration noise process
        exploration_mu = 0
        exploration_theta = 0.15
        exploration_sigma = 0.15
        self.noise = OUNoise(task.action_size, exploration_mu,
                             exploration_theta, exploration_sigma)

        # Replay memory
        self.batch_size = 256
        self.memory = ReplayBuffer(buffer_size=10000, decay_steps=1000)

        self.sess.run(tf.global_variables_initializer())

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.memory.decay_a()
        return state

    def step(self, action, reward, next_state, done):
        # Save experience
        self._save_experience(self.last_state, action, reward, next_state,
                              done)

        # Learn, if enough samples are available in memory
        self.learn()

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, explore=False):
        """Returns actions for given state(s) as per current policy."""
        actor = self.actor_local if explore else self.actor_target
        action = actor.act([state], explore)[0]
        assert not np.any(np.isnan(action))

        if explore:
            action = action + self.noise.sample()
            action = np.maximum(action, self.task.action_low)
            action = np.minimum(action, self.task.action_high)

        assert not np.any(np.isnan(action))
        assert np.all(action >= self.task.action_low), \
            "expected at least {:7.3f}, but was {}".format(
                self.task.action_low, action)
        assert np.all(action <= self.task.action_high)

        return action

    def learn(self):
        """Update policy and value parameters using given batch of experience tuples."""

        if len(self.memory) < self.batch_size:
            return

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        experiences, experience_indexes = self.memory.sample(self.batch_size)
        action_size = self.task.action_size
        states = np.vstack([e.state for e in experiences])
        actions = np.array([e.action for e in experiences
                            ]).astype(np.float32).reshape(-1, action_size)
        rewards = np.array([e.reward for e in experiences
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          ]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences])

        # Get predicted next-state actions, Q and V values
        actions_next = self.actor_target.act(next_states)
        Q_targets_next, V_targets_next = self.critic_target.get_q_and_v(
            next_states, actions_next)

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        V_targets = rewards + self.gamma * V_targets_next * (1 - dones)
        td_errs = self.critic_local.learn(states, actions, Q_targets,
                                          V_targets)

        self.memory.update_td_err(experience_indexes, td_errs)

        self.memory.scrape_stats(self.stats)

        # Train actor model
        actions = self.actor_target.act(states)
        action_gradients = self.critic_target.get_action_gradients(
            states, actions)
        self.actor_local.learn(states, action_gradients)

        self._soft_copy()

    def _save_experience(self, state, action, reward, next_state, done):
        """Adds experience into ReplayBuffer. As a side effect, also learns q network on this sample."""
        # Get predicted next-state actions and Q values
        actions_next = self.actor_local.act([next_state])
        Q_targets_next, _ = self.critic_local.get_q_and_v([next_state],
                                                          actions_next)
        Q_target_next = Q_targets_next[0]

        Q_target = reward + self.gamma * Q_target_next * (1 - done)
        td_err = self.critic_local.get_td_err([state], [action], [Q_target])

        self.memory.add(Experience(state, action, reward, next_state, done),
                        td_err)

    def _soft_copy(self):
        self.sess.run(self._soft_copy_ops)

    def _create_soft_copy_op(self, scope_src, scope_dst, tau=0.01):
        var_src = tf.trainable_variables(scope=scope_src)
        var_dst = tf.trainable_variables(scope=scope_dst)
        copy_ops = []
        for src, dst in zip(var_src, var_dst):
            mixed = tau * src + (1.0 - tau) * dst
            copy_op = tf.assign(dst, mixed)
            copy_ops.append(copy_op)
        return copy_ops
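
This agent is session-based (TensorFlow 1.x). A hypothetical driver loop, where task, stats, and the episode count are placeholder assumptions rather than names from the project, might look like this:

import tensorflow as tf

# Hypothetical training driver; `task` and `stats` are assumed objects,
# and the Task step signature below is an assumption as well.
with tf.Session() as sess:
    agent = Agent(task, sess, stats)
    for episode in range(1000):
        state = agent.reset_episode()
        done = False
        while not done:
            action = agent.act(state, explore=True)
            next_state, reward, done = task.step(action)  # assumed Task API
            agent.step(action, reward, next_state, done)
            state = next_state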
Code example #3
class DDPG_Land():
    def __init__(self, task, seed=None, render=False):

        self.env = task.env
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        self.total_reward = 0
        self.steps = 0
        self.action_repeat = 3
        self.render = render

        # Score tracker and learning parameters
        self.score = -np.inf
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1

        #counter
        self.count = 0

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(1, self.exploration_mu, self.exploration_theta,
                             self.exploration_sigma)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def act(self, s):
        #         # print('act')
        #         # a = lunder.heuristic(self.env, s)
        #         # 1. Testing.
        #         # 2. Demonstration rollout.
        #         angle_targ = s[0]*0.5 + s[2]*1.0         # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed)
        #         if angle_targ >  0.4: angle_targ =  0.4  # more than 0.4 radians (22 degrees) is bad
        #         if angle_targ < -0.4: angle_targ = -0.4
        #         hover_targ = 0.55*np.abs(s[0])           # target y should be proportional to horizontal offset

        #         # PID controller: s[4] angle, s[5] angularSpeed
        #         angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0
        #         #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo))

        #         # PID controller: s[1] vertical coordinate s[3] vertical speed
        #         hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5
        #         #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo))

        #         if s[6] or s[7]: # legs have contact
        #             angle_todo = 0
        #             hover_todo = -(s[3])*0.5  # override to reduce fall speed, that's all we need after contact

        #         if self.env.continuous:
        #             a = np.array( [hover_todo*20 - 1, -angle_todo*20] )
        #             a = np.clip(a, -1, +1)
        #         else:
        #             a = 0
        #             if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2
        #             elif angle_todo < -0.05: a = 3
        #             elif angle_todo > +0.05: a = 1
        #         # return a
        # state = s
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(s, [-1, 24])  # hard-coded: 8-dim observation * action_repeat of 3
        action = self.actor_local.model.predict(state)[0]

        return list(action + self.noise.sample())

    def step(self, action, reward, next_state, done):
        # print ("step")
        # ob, reward, done, info = self.env.step(action)
        # print(ob)
        # next_state = ob
        # Save experience / reward
        reward = np.clip(reward, a_min=-100, a_max=100)
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.count += 1
        self.total_reward += reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        #from the tutorial SRC
        self.score += reward

        if done:
            # self.score = np.clip(self.score,a_min=-100,a_max=100)
            if self.score > self.best_score:
                self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

#         #from the tutorial SRC
#         self.score += reward

#         if done:

#             if self.score > self.best_score:
#                 self.best_score = self.score

#         # return ob, reward, done

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

#         # from policy search

#         # Learn by random policy search, using a reward-based score
#         # self.score = self.total_reward / float(self.count) if self.count else 0.0
#         # if self.score > self.best_score:
#         #     self.best_score = self.score
#         #     self.best_w = self.w
#         #     self.noise_scale = max(0.5 * self.noise_scale, 0.01)
#         # else:
#         #     self.w = self.best_w
#         #     self.noise_scale = min(2.0 * self.noise_scale, 3.2)
#         # self.w = self.w + self.noise_scale * np.random.normal(size=self.w.shape)  # equal noise in all directions

    def reset(self):
        self.steps = 0
        self.total_reward = 0
        self.count = 0
        self.score = 0
        # self.best_score = 0
        """Reset the sim to start a new episode."""
        ob = self.env.reset()
        state = np.concatenate([ob] * self.action_repeat)

        self.last_state = state
        return state
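
Several of these examples construct ReplayBuffer(buffer_size, batch_size) and rely on add(), sample(), and len(). A minimal uniform-sampling buffer compatible with that interface (a sketch, not any of these projects' actual implementations) is:

import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        field_names=["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling (sketch)."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)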
Code example #4
class DDPGAgent(Agent):
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self,
                 actor_model,
                 tgt_actor_model,
                 critic_model,
                 tgt_critic_model,
                 action_limits,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 critic_decay=1e-2,
                 tau=1e-3,
                 gamma=0.99,
                 process=None,
                 rb_size=1e6,
                 minibatch_size=64,
                 warmup_episodes=0,
                 episodes_trained=0,
                 train_scores=None,
                 test_scores=None,
                 best_train_score=-np.inf):
        # Changed this to use generic env instead of Task
        super().__init__(warmup_episodes, episodes_trained, train_scores,
                         test_scores, best_train_score)
        self.actor = Actor(actor_model, critic_model, lr=actor_lr)
        self.tgt_actor = Actor(tgt_actor_model, tgt_critic_model, lr=actor_lr)
        self.tgt_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(critic_model, lr=critic_lr, decay=critic_decay)
        self.tgt_critic = Critic(tgt_critic_model,
                                 lr=critic_lr,
                                 decay=critic_decay)
        self.tgt_critic.set_weights(self.critic.get_weights())

        self.action_limits = action_limits
        self.process = process
        self.minibatch_size = minibatch_size
        self.buffer = ReplayBuffer(int(rb_size), self.minibatch_size)
        self.tau = tau
        self.gamma = gamma

        self.state_space = K.int_shape(critic_model.inputs[0])[1]
        self.action_space = K.int_shape(critic_model.inputs[1])[1]

        self.learning_phase = 1
        if process is None:
            self.process = OUNoise(size=self.action_space,
                                   theta=0.15,
                                   mu=0,
                                   sigma=0.2)
        else:
            self.process = process

    def sense(self, s, a, r, s_new, done):
        s = np.reshape(s, [-1, self.state_space])
        s_new = np.reshape(s_new, [-1, self.state_space])
        self.buffer.add(s, a, r, s_new, done)

    def act(self, s):
        s = np.reshape(s, [-1, self.state_space])
        a = self.tgt_actor(s)
        # Cache.
        self.last_state = np.copy(s)
        self.last_action = np.copy(a)
        if self.learning_phase:
            a += self.process.sample()
        a = np.clip(a, self.action_limits[0], self.action_limits[1])

        self.last_action_noisy = np.copy(a)
        return a

    def new_episode(self):
        self.process.reset()

    def train_step(self):
        if len(self.buffer.memory) < self.minibatch_size:
            return

        minibatch = self.buffer.sample(self.minibatch_size)
        states = np.zeros([len(minibatch), self.state_space])
        states_new = np.zeros([len(minibatch), self.state_space])
        actions = np.zeros([len(minibatch), self.action_space])
        r = np.zeros([len(minibatch), 1])
        dones = np.zeros([len(minibatch), 1])

        for i in range(len(minibatch)):
            states[i], actions[i], r[i], states_new[i], dones[i] = minibatch[i]

        # Estimate next-state Q-values from the target networks
        tgt_critic_out = self.tgt_critic(states_new,
                                         self.tgt_actor(states_new))

        # Bootstrapped Q-value targets, masking terminal transitions
        ys = r + self.gamma * tgt_critic_out * (1 - dones)

        # Train local critic and actor
        self.critic.step(states, actions, ys)
        self.actor.step(states)

        # Soft weight updates for target critic and actor
        critic_weights = self.critic.get_weights()
        tgt_critic_weights = self.tgt_critic.get_weights()
        for i in range(len(critic_weights)):
            tgt_critic_weights[i] = (1 - self.tau) * tgt_critic_weights[i] + \
                self.tau * critic_weights[i]
        self.tgt_critic.set_weights(tgt_critic_weights)

        actor_weights = self.actor.get_weights()
        tgt_actor_weights = self.tgt_actor.get_weights()
        for i in range(len(actor_weights)):
            tgt_actor_weights[i] = (1 - self.tau) * tgt_actor_weights[i] + \
                self.tau * actor_weights[i]
        self.tgt_actor.set_weights(tgt_actor_weights)
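
The listing does not show how DDPGAgent is driven. A hypothetical interaction loop against a Gym-style continuous-control environment (env, the actor/critic Keras models, n_episodes, and max_steps are placeholder assumptions) could be:

import numpy as np

# Hypothetical loop; `env` is any Gym-style env, and actor_model/critic_model
# plus their target copies are assumed to be Keras models built elsewhere.
agent = DDPGAgent(actor_model, tgt_actor_model, critic_model, tgt_critic_model,
                  action_limits=(env.action_space.low, env.action_space.high))

for episode in range(n_episodes):          # n_episodes, max_steps: placeholders
    state = env.reset()
    agent.new_episode()
    for t in range(max_steps):
        action = agent.act(state)          # may return a (1, action_size) batch
        next_state, reward, done, _ = env.step(np.ravel(action))
        agent.sense(state, action, reward, next_state, done)
        agent.train_step()
        state = next_state
        if done:
            break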
Code example #5
class DDPGAgentVersion3(BaseAgent):
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 fc1_units=400,
                 fc2_units=300,
                 buffer_size=int(1e5),
                 batch_size=128,
                 gamma=0.99,
                 tau=1e-3,
                 max_norm=1.0,
                 learn_period=20,
                 learn_sampling_num=10):
        """Initialize an Agent object.

        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            
            max_norm (float): value of clip_grad_norm for critic optimizer
        """

        super().__init__()

        self.state_size = state_size
        self.num_agents = num_agents
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.max_norm = max_norm
        self.learn_period = learn_period
        self.learn_sampling_num = learn_sampling_num

        # Actor Network (w/ Target Network)
        self.actor_local = DDPGActor(state_size,
                                     action_size,
                                     random_seed,
                                     fc1_units=fc1_units,
                                     fc2_units=fc2_units).to(device)
        self.actor_target = DDPGActor(state_size,
                                      action_size,
                                      random_seed,
                                      fc1_units=fc1_units,
                                      fc2_units=fc2_units).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPGCritic(state_size,
                                       action_size,
                                       random_seed,
                                       fcs1_units=fc1_units,
                                       fc2_units=fc2_units).to(device)
        self.critic_target = DDPGCritic(state_size,
                                        action_size,
                                        random_seed,
                                        fcs1_units=fc1_units,
                                        fc2_units=fc2_units).to(device)

        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic)

        # Noise process for actions
        self.exploration_mu = 0
        self.exploration_theta = 0.15  # (Timothy Lillicrap, 2016)
        self.exploration_sigma = 0.2  # (Timothy Lillicrap, 2016)
        #         self.noise = OUNoise(action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)
        self.noise = OUNoiseMultivariate((num_agents, action_size),
                                         random_seed,
                                         mu=self.exploration_mu,
                                         theta=self.exploration_theta,
                                         sigma=self.exploration_sigma)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed, device)

        # parameter of discounted reward
        self.gamma = gamma

        # soft update parameter
        self.tau = tau

        self.batch_size = batch_size

    def step(self, states, actions, rewards, next_states, dones, time_step):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(self.num_agents):
            self.memory.add(states[i, :], actions[i, :], rewards[i],
                            next_states[i, :], dones[i])
        #self.memory.add_batch(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if (len(self.memory) > self.batch_size) and (time_step %
                                                     self.learn_period == 0):
            for _ in range(self.learn_sampling_num):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        
        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        
        Args:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # train critic
        # loss function = Q_target (TD 1-step bootstrapping) - Q_local (current)
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                       self.max_norm)
        self.critic_optimizer.step()

        # train actor (policy gradient)
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # update critic_target
        self.soft_update(self.critic_local, self.critic_target, self.tau)

        # update actor_target
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def model_dicts(self):
        return {'actor': self.actor_target, 'critic': self.critic_target}
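
DDPGActor and DDPGCritic are imported from the project's model module, which is not shown. Purely as an illustration of a network compatible with the constructor arguments used above (state_size, action_size, random_seed, fc1_units, fc2_units), an actor could look like the sketch below; the project's actual architecture may differ.

import torch
import torch.nn as nn
import torch.nn.functional as F

class DDPGActor(nn.Module):
    """Illustrative actor network: state -> tanh-bounded action in [-1, 1]."""

    def __init__(self, state_size, action_size, seed, fc1_units=400, fc2_units=300):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))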
Code example #6
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task, verbose=False):
        self.verbose = verbose

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        #log_path = '/tmp/logs'
        #self.callback = callbacks.TensorBoard(log_dir=log_path, histogram_freq=1,
        #                        write_images=False, write_grads=True, write_graph=False)
        #self.callback.set_model(self.critic_local.model)

        #log_path = '/tmp/logs'
        #self.writer = tf.summary.FileWriter(log_path)

        #self.learn_counter = 0

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.1
        self.exploration_theta = 0.2
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 512
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.015  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        #self.learn_counter = 0
        return state

    def mimic(self, experience_to_mimic):
        print("ready to mimic")
        self.memory.memory = experience_to_mimic

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        def save_grads(writer, model):
            for layer in model.layers:
                for weight in layer.weights:
                    mapped_weight_name = weight.name.replace(':', '_')
                    tf.summary.histogram(mapped_weight_name, weight)

                    grads = model.optimizer.get_gradients(
                        model.total_loss, weight)

                    def is_indexed_slices(grad):
                        return type(grad).__name__ == 'IndexedSlices'

                    grads = [
                        grad.values if is_indexed_slices(grad) else grad
                        for grad in grads
                    ]
                    tf.summary.histogram('{}_grad'.format(mapped_weight_name),
                                         grads)
                    merged = tf.summary.merge_all()
                    writer.flush()
                    writer.close()

        #save_grads(self.writer, self.critic_local.model)
        #def write_log(callback, names, logs, batch_no):
        #    for name, value in zip(names, logs):
        #        summary = tf.Summary()
        #        summary_value = summary.value.add()
        #        summary_value.simple_value = value
        #        summary_value.tag = name
        #        callback.writer.add_summary(summary, batch_no)
        #        callback.writer.flush()

        #train_names = ['train_loss', 'train_mae']
        #print("about to write log")
        #write_log(self.callback, train_names, logs, self.learn_counter)
        #trainable_weights = critic_local.model.trainable_weights
        #gradients = critic_local.model.optimizer.get_gradients(critic_local.model.total_loss, trainable_weights)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        #self.learn_counter += 1

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def _save_weight(self, model, directory_name, file_name):
        cwd = os.getcwd()
        directory_path = os.path.join(cwd, directory_name)
        if not os.path.exists(directory_path):
            os.makedirs(directory_path)

        file_path = os.path.join(directory_path, file_name)

        mv_file_to_dir_with_date(file_path, directory_path)

        model.save_weights(file_path)

    def save_weights(self, location='weights_backup'):
        if self.verbose:
            print("start save_weights")

        self._save_weight(self.critic_local.model, location, "critic_local.h5")
        self._save_weight(self.critic_target.model, location,
                          "critic_target.h5")
        self._save_weight(self.actor_local.model, location, "actor_local.h5")
        self._save_weight(self.actor_target.model, location, "actor_target.h5")

        if self.verbose:
            print("done save_weights")

    def _h5(self, model, file_path):
        if os.path.exists(file_path):
            model.load_weights(file_path)
        else:
            print(f'could not find weight to load from [{file_path}]')

    def load_weights(self, location='weights_backup'):
        if self.verbose:
            print("start load_weights")

        cwd = os.getcwd()
        directory_path = os.path.join(cwd, location)

        self._h5(self.critic_local.model,
                 os.path.join(directory_path, "critic_local.h5"))
        self._h5(self.critic_target.model,
                 os.path.join(directory_path, "critic_target.h5"))
        self._h5(self.actor_local.model,
                 os.path.join(directory_path, "actor_local.h5"))
        self._h5(self.actor_target.model,
                 os.path.join(directory_path, "actor_target.h5"))

        if self.verbose:
            print("done load_weights")
Code example #7
class DDPG:
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.001

        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)

        self.gamma = 0.99
        self.tau = 0.1
        self.learning_rate = 0.0005

        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 learning_rate=self.learning_rate)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  learning_rate=self.learning_rate)

        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   learning_rate=self.learning_rate)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    learning_rate=self.learning_rate)

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        self.last_state = self.task.reset()
        return self.last_state

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.total_reward += reward
        self.count += 1

        if self.memory.size() > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        self.last_state = next_state

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
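
Note that this example assumes a different replay-buffer interface from the earlier ones: the constructor takes only the buffer size, sampling takes an explicit batch size, and the fill check uses size(). A compatible uniform-sampling sketch (an assumption, not the project's code) is:

import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        field_names=["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    """Single-argument variant with size() and sample(batch_size), as used above."""

    def __init__(self, buffer_size):
        self.memory = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.memory, k=batch_size)

    def size(self):
        return len(self.memory)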
Code example #8
class DDPG():
    """
    Reinforcement Learning agent that learns using DDPG.
    Deep DPG as described by Lillicrap et al. (2015)
    """
    def __init__(self, task, prioritized_replay=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15  #0.15 #0.1
        self.exploration_sigma = 0.2  #0.2 #0.1
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 100000
        self.batch_size = 64  # 64

        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_alpha = 0.6
        self.prioritized_replay_beta0 = 0.4
        self.prioritized_replay_beta_iters = None
        self.prioritized_replay_eps = 1e-6
        self.max_timesteps = 100000

        # Replay buffer
        if self.prioritized_replay:
            self.memory = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                self.prioritized_replay_beta_iters = self.max_timesteps
            self.beta_schedule = LinearSchedule(
                self.prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
        #self.tau = 0.001 # 0.001 per paper

        self.td_errors_list = []
        self.actor_loss_list = []
        self.critic_loss_list = []

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            if self.prioritized_replay:
                samples = self.memory.sample(self.batch_size,
                                             beta=self.beta_schedule.value(
                                                 len(self.memory)))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = samples
                experiences = []
                for i in range(len(obses_t)):
                    experiences.append(
                        namedtuple("PrioritizedExperience",
                                   field_names=[
                                       "state", "action", "reward",
                                       "next_state", "done", "weight",
                                       "batch_idx"
                                   ])(obses_t[i:i + 1], actions[i:i + 1],
                                      rewards[i:i + 1], obses_tp1[i:i + 1],
                                      dones[i:i + 1], weights[i:i + 1],
                                      batch_idxes[i:i + 1]))
                self.learn(experiences)
            else:
                experiences = self.memory.sample()
                self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]

        #actions = list(action + self.noise.sample())
        #print("act {}".format(actions))
        #return actions  # add some noise for exploration

        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        critic_loss = self.critic_local.model.train_on_batch(
            x=[states, actions], y=Q_targets)

        # Train actor model (local) using action gradients
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        actor_loss = self.actor_local.train_fn([states, action_gradients,
                                                1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        if self.prioritized_replay:
            # Update replay buffer priorities (note: derived from |Q_targets|
            # here, whereas canonical prioritized replay uses |TD error|)
            batch_idxes = np.vstack(
                [e.batch_idx[0] for e in experiences if e is not None])
            new_priorities = np.abs(Q_targets) + self.prioritized_replay_eps
            self.memory.update_priorities(batch_idxes, new_priorities)

        self.td_errors_list.append(Q_targets.T)  # stores the Q targets used for this update
        self.actor_loss_list.append(actor_loss[0])
        self.critic_loss_list.append(critic_loss)

        #print("states {} next states {} critic_loss {} actor_loss {}".format(states, actions_next, critic_loss, actor_loss))

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def save_weights(self):
        self.actor_local.model.save_weights("DDPG_actor_weights.h5")
        self.critic_local.model.save_weights("DDPG_critic_weights.h5")

    def save_td_errors(self, i_episode):
        with open("DDPG_agent_td_errors_episode_{}.csv".format(i_episode),
                  'w') as csvfile:
            writer = csv.writer(csvfile)
            for td_errors in self.td_errors_list:
                writer.writerow([td_errors])
        self.td_errors_list.clear()

    def save_losses(self, i_episode):
        with open(
                "DDPG_agent_actor_critic_loss_episode_{}.csv".format(
                    i_episode), 'w') as csvfile:
            writer = csv.writer(csvfile)
            for actor_loss, critic_loss in zip(self.actor_loss_list,
                                               self.critic_loss_list):
                writer.writerow([actor_loss, critic_loss])

        self.actor_loss_list.clear()
        self.critic_loss_list.clear()
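
PrioritizedReplayBuffer and LinearSchedule follow OpenAI Baselines-style interfaces and are not shown here. A beta schedule consistent with the keyword arguments used above (linear interpolation from initial_p to final_p over a fixed number of steps) can be sketched as:

class LinearSchedule:
    """Linearly interpolate from initial_p to final_p over schedule_timesteps (sketch)."""

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.0):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)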
Code example #9
class DDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

    def create_models(self,
                      hidden_sizes_actor=(512, 256),
                      hidden_sizes_critic=(512, 256, 256)):
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 hidden_sizes=hidden_sizes_actor)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  hidden_sizes=hidden_sizes_actor)
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   hidden_sizes=hidden_sizes_critic)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    hidden_sizes=hidden_sizes_critic)
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

    def set_params(self,
                   mu=0.1,
                   sigma=0.1,
                   theta=0.1,
                   buffer_size=1e+8,
                   batch_size=128,
                   gamma=0.99,
                   tau=1e-3):
        self.exploration_mu = mu
        self.exploration_sigma = sigma
        self.exploration_theta = theta
        # `noise` is the project's own noise class (exposes calc_noise() and reset())
        self.noise = noise(self.action_size, self.exploration_mu,
                           self.exploration_theta, self.exploration_sigma)

        self.buffer_size = int(buffer_size)
        self.batch_size = int(batch_size)
        self.buffer = ReplayBuffer(self.buffer_size)

        self.gamma = gamma
        self.tau = tau

    def act(self, states):
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.calc_noise())

    def learn(self):
        states, actions, rewards, dones, next_states = self.buffer.sample(
            self.batch_size, self.action_size, self.state_size)

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # soft_update
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.buffer.add(self.last_state, action, reward, next_state, done)
        self.learn()
        self.last_state = next_state

    def soft_update(self, local_model, target_model):
        target_model.set_weights(
            self.tau * np.array(local_model.get_weights()) +
            (1 - self.tau) * np.array(target_model.get_weights()))
Code example #10
File: agent.py  Project: Rababalkhalifa/QuadCop
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # from policy search
        
        self.action_range = self.action_high - self.action_low
        
        self.w = np.random.normal(
            size=(self.state_size, self.action_size),  # weights for simple linear policy: state_space x action_space
            scale=(self.action_range / (2 * self.state_size))) # start producing actions in a decent range
        
        # Score tracker and learning parameters
        self.score = -np.inf
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1
        
        # counter
        self.count = 0

    def reset_episode(self):
        self.noise.reset()
        self.count = 0
        self.total_reward = 0.0
        self.score = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
         # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        
        self.count += 1
        self.total_reward += reward
        
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state
        
        #from the tutorial SRC
        self.score += reward
        
        if done:
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)  
        
        # from policy search
        
        # Learn by random policy search, using a reward-based score
        # self.score = self.total_reward / float(self.count) if self.count else 0.0
        # if self.score > self.best_score:
        #     self.best_score = self.score
        #     self.best_w = self.w
        #     self.noise_scale = max(0.5 * self.noise_scale, 0.01)
        # else:
        #     self.w = self.best_w
        #     self.noise_scale = min(2.0 * self.noise_scale, 3.2)
        # self.w = self.w + self.noise_scale * np.random.normal(size=self.w.shape)  # equal noise in all directions

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
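
All of these agents drive training through the same interface: reset_episode() at the start of an episode, act(state) to get a (noisy) action, and step(action, reward, next_state, done) to store the experience and learn from replay. As a point of reference, a minimal training loop exercising that interface might look like the sketch below; the Task environment (with a step() method returning next_state, reward, done) and the episode count are illustrative assumptions, not part of the example above.

# Minimal training-loop sketch (assumed Task API; not taken from the original project)
num_episodes = 500
task = Task()                                   # hypothetical environment/task
agent = DDPG(task)

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()               # also resets the OU noise process
    done = False
    while not done:
        action = agent.act(state)               # policy output plus exploration noise
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print("Episode {:4d} | best score so far: {:8.3f}".format(i_episode, agent.best_score))
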
Code example #11
class DDPG(Agent):
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, env):
        # Changed this to use generic env instead of Task
        super().__init__(env)

        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 1e-2  # for soft update of target parameters

        # Critic Params
        self.critic_lr = 1e-3
        self.critic_decay = 1e-2

        # Actor Params
        self.actor_lr = 1e-4
        self.actor_decay = 0

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr, self.actor_decay)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr, self.actor_decay)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_lr, self.critic_decay)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_lr, self.critic_decay)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Counter of learning updates, incremented in step(); initialized here in case
        # the base Agent class does not define it
        self.steps_trained = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.env.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done, training=True):
        # Since DDPG is an off-policy learner, add a training flag

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if training and len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
            self.steps_trained += 1

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, training=True):
        # Add a training flag to decide whether to explore
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        if training:
            return list(action +
                        self.noise.sample())  # add some noise for exploration
        else:
            return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def save_model(self, filename):
        al = self.actor_local
        at = self.actor_target
        cl = self.critic_local
        ct = self.critic_target

        self.actor_local = None
        self.actor_target = None
        self.critic_local = None
        self.critic_target = None

        with open(filename + '.ddpg_agent', 'wb') as f:  # pickle requires binary mode
            pickle.dump(self, f)

        al.save(filename + '.actor_local')
        at.save(filename + '.actor_target')
        cl.save(filename + '.critic_local')
        ct.save(filename + '.critic_target')

        self.actor_local = al
        self.actor_target = at
        self.critic_local = cl
        self.critic_target = ct

    @classmethod
    def load_model(cls, filename):
        with open(filename + '.ddpg_agent', 'rb') as f:  # pickle requires binary mode
            m = pickle.load(f)
        m.actor_local = load_model(filename + '.actor_local')
        m.actor_target = load_model(filename + '.actor_target')
        m.critic_local = load_model(filename + '.critic_local')
        m.critic_target = load_model(filename + '.critic_target')
        return m
Code example #12
class DDPG():
    """Reinforcement Learning agent that learns using DDPG.    """
    def __init__(self, env_reset, state_size, action_size, action_low, action_high):
        """Params:
        env_reset: callback function to reset the environment at the end of an episode
        state_size: dimension of state space
        action_size: dimension of action space
        action_low: float - minimum action value
        action_high: float - maximum action value
        """
        self.training_steps = 0 # number of training steps run so far

        self.env_reset = env_reset
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 1e-3  # for soft update of target parameters
        self.critic_decay = 1e-2 # L2 weight decay for critic (regularization)
        self.critic_lr = 1e-3 # Learning rate for critic
        self.critic_alpha = 1e-2 # Leaky ReLU alpha for critic
        self.actor_lr = 1e-4 # Learning rate for actor
        self.actor_alpha = 1e-2 # Leaky ReLU alpha for actor

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr, self.actor_alpha)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr, self.actor_alpha)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_lr, self.critic_decay, self.critic_alpha)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_lr, self.critic_decay,self.critic_alpha)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = int(1e6)
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)



    def reset_episode(self):
        self.noise.reset()
        state = self.env_reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done, training=True):
         # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if training and len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
            self.training_steps += 1

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, training=True):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        if training: # add some noise for exploration
            return list(action + self.noise.sample())
        else:
            return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function


        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
Code example #13
class AE_DDPG_Agent():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # AE: Although OUNoise gives me a convenient set of randomness for each of the rotors, I still need
        # AE: to make a decision myself on how to apply the randomness and how to manage its magnitude
        # AE: (i.e. my explore-vs-exploit strategy). These variables will do that.
        self.explore_start = 1.0  # AE: exploration probability at start
        self.explore_stop = 0.001  # AE: minimum exploration probability
        self.decay_rate = 0.003  # AE: exponential decay rate for exploration prob
        self.magnitude_coeff = 0.1  # AE: a coefficient to limit randomness

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0  # AE: additive to the noise. mu * theta will be directly added

        self.exploration_theta = 0.15  # AE: old noise will be multiplied by this
        self.exploration_sigma = 0.2  # AE: new noise will be multiplied by this
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        # AE: The learning rate. How much we trust the new values compared to the old ones.
        self.tau = 0.0001  # for soft update of target parameters

        # AE: current reward in learning procedure (for statistics)
        self.score = -np.inf

        # Episode variables
        self.reset_episode()

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state

        self.best_score = -np.inf
        self.score = -np.inf
        self.total_reward = 0.0
        self.count = 0

        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        self.total_reward += reward
        self.count += 1

        # AE: Score (average reward in this episode so far) and best score for statistics
        self.score = self.total_reward / float(self.count)
        if self.score > self.best_score:
            self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])

        # AE: directly sampling approximated value from learned action-value function.
        action = self.actor_local.model.predict(state)[0]

        # AE: and adding some noise to that for unpredictability.
        # AE: The magnitude of noise has to drop over time.
        explore_p = self.explore_stop + (self.explore_start -
                                         self.explore_stop) * np.exp(
                                             -self.decay_rate * self.count)
        #self.noise.update_mu(explore_p)
        noise_sample = self.magnitude_coeff * explore_p * self.noise.sample()
        #noise_sample = explore_p * np.random.randn(self.action_size)
        #print("Noi=", s)

        return list(
            action +
            noise_sample * self.action_size)  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        # AE: Updating NN weights directly in the passed model (actor or critic).
        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
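
Most of these agents build their exploration noise as OUNoise(action_size, mu, theta, sigma), calling reset() at the start of each episode and sample() on every action, but the OUNoise class itself is not shown in this listing. Below is a minimal Ornstein-Uhlenbeck process consistent with that usage; the internals are an assumption inferred from how the class is called, not code taken from these projects.

import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process (sketch matching the constructor and usage above)."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)   # long-running mean the process is pulled toward
        self.theta = theta             # strength of the pull back toward mu
        self.sigma = sigma             # scale of the random perturbation
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state
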
Code example #14
class Agent():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        
        self.exploration_mu = 0
        self.exploration_theta = 0.10 # same direction
        self.exploration_sigma = 0.001 # random noise
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.gamma = 0.90  # discount factor
        self.tau = 0.1  # for soft update of target parameters

        self.best_score = -np.inf
        self.score = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state
        
        self.score += reward
        if done:
            if self.score > self.best_score:
                self.best_score = self.score
                
    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)
        
    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights)

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
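
Most of these agents construct ReplayBuffer(buffer_size, batch_size) and rely on add(state, action, reward, next_state, done), sample(), and len(), while learn() reads the sampled experiences by attribute (e.state, e.action, ...), which points to a deque of named tuples. The following is a minimal sketch consistent with that usage; it is an assumption, since the buffer implementation is not included in this listing.

import random
from collections import deque, namedtuple

class ReplayBuffer:
    """Fixed-size buffer of experience tuples (sketch matching the usage above)."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)   # oldest experiences drop out when full
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Store one experience tuple."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        """Return a random batch of experiences."""
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        """Number of stored experiences (used for the warm-up check in step())."""
        return len(self.memory)
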
Code example #15
class DDPG():
    '''reinforcement learning agent that learns using Deep Deterministic Policy Gradient'''
    def __init__(self, task):
        '''
        Params
        ======
        task (object)   : environment

        '''
        '''
        Reference: Continuous Control With Deep Reinforcement Learning (2016)
        Playing CartPole through Asynchronous Advantage Actor Critic (A3C) with tf.keras
        =========
        gamma   : 0.99
        tau     : 0.001
        buffer_size (ReplayBuffer)  : 1e6
        batch_size (ReplayBuffer)   : 64
        theta (Ornstein-Uhlenbeck process)  : 0.15
        sigma (Ornstein-Uhlenbeck process)  : 0.2


        '''

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # actor (policy) model - use two copies of model for updating model and producing target
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # critic (value) model - use two copies of model for updating model and producing target
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # reward history
        self.best_avg_score = -np.inf
        self.accumulated_reward = 0
        self.count = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.accumulated_reward = 0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
        # save experience and reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # roll over last state and action
        self.last_state = next_state

        # accumulate reward
        self.accumulated_reward += reward
        self.count += 1

        # record best average score
        if done:
            if float(self.accumulated_reward /
                     self.count) > self.best_avg_score:
                self.best_avg_score = float(self.accumulated_reward /
                                            self.count)

    def act(self, state):
        '''returns actions for given state(s) as per current policy'''
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration
        # both action and self.noise.sample() are numpy arrays, so + adds them
        # element-wise rather than concatenating them

    def learn(self, experiences):
        '''update policy and value parameters using given batch of experience tuples'''
        # convert experience tuples to separate arrays for each element(states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).\
                           astype(np.float32).reshape(-1,self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).\
                           astype(np.float32).reshape(-1,1)
        dones = np.array([e.done for e in experiences if e is not None]).\
                           astype(np.uint8).reshape(-1,1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # get predicted next-state actions and Q-values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # compute Q targets for current states and train critic model (local)
        # Value (critic) loss: L = (1/N) * sum((R_{t+1} + gamma * Q_{t+1} - Q_t)^2)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # train actor model (local)
        # Policy (actor) objective: maximize (1/N) * sum(Q(s, pi(s))) by following the critic's action gradients
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function
        # The learning phase flag is a bool tensor (0 = test, 1 = train)
        # to be passed as input to any Keras function
        # that uses a different behavior at train time and test time.

        # soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        '''soft update model parameters'''
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights),\
            'Local and target model parameters must have the same size'

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
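
For reference, the updates that learn() and soft_update() implement can be written compactly. In standard DDPG notation (mu is the actor, primes denote target networks, d_i is the done flag), the code above corresponds to:

    y_i = r_i + \gamma (1 - d_i) \, Q'\big(s_{i+1}, \mu'(s_{i+1})\big)
    L_{\mathrm{critic}} = \frac{1}{N} \sum_i \big(y_i - Q(s_i, a_i)\big)^2
    \nabla_{\theta^{\mu}} J \approx \frac{1}{N} \sum_i \nabla_a Q(s_i, a)\big|_{a=\mu(s_i)} \, \nabla_{\theta^{\mu}} \mu(s_i)
    \theta' \leftarrow \tau \theta + (1 - \tau) \theta'
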
Code example #16
File: agent.py  Project: srinivascreddy/RLQuadcopter
class RLA():
    """ Reinfocement learning agent"""
    
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        #actor model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        
        #Critic model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        
        #Initialize target model params with local params
        self.critic_target.model.set_weights(
                self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
                self.actor_local.model.get_weights())
        
        #Initialize noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)
        
        #Replay memory Initialization
        self.buffer_size, self.batch_size = 2000000, 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        
        #Initialize algorithm parameters
        self.gamma, self.tau = 0.95, 0.001
        
        #Initialize scores
        self.score, self.best_score = -np.inf, -np.inf
    
    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state
    
    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        
        #Learn from samples in memory if they are greater than batch size
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        
        #Preserve state as last_state
        self.last_state = next_state
        
        #Update score with reward from this step
        self.score += reward
        
        if done:
            #Preserve best score
            if self.score > self.best_score:
                self.best_score = self.score
        
    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())
    
    def learn(self, experiences):
        #Convert experiences into separate arrays
        states = np.vstack([exp.state for exp in experiences if exp is not None])
        actions = np.array([exp.action for exp in experiences if exp is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([exp.reward for exp in experiences if exp is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([exp.done for exp in experiences if exp is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([exp.next_state for exp in experiences if exp is not None])
        
        #predict next_state actions and Q values from target model...
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states,actions], y=Q_targets)
        
        #Train local actor model
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])
        
        #Update target models
        self.update(self.critic_local.model, self.critic_target.model)
        self.update(self.actor_local.model, self.actor_target.model)
        
    def update(self, local_model, target_model):
        """Update model parameters"""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)