class Agent():
    """Reinforcement Learning agent that learns using DDPG.

    Keeps a local and a target copy of an actor and a critic network, stores
    transitions in a replay buffer and trains on batches sampled from it.
    """

    def __init__(self, task):
        """Create the networks, the exploration noise and the replay memory.

        Args:
            task: Environment object. Its ``state_size``, ``action_size``,
                ``action_low`` and ``action_high`` attributes are read here;
                its ``reset()`` method is called by ``reset_episode``.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15  # same direction
        self.exploration_sigma = 0.001  # random noise
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.1  # for soft update of target parameters

        # Running episode score and the best episode score seen so far
        self.top_score = -np.inf
        self.score = 0

    def reset_episode(self):
        """Reset the noise process, the task and the episode score.

        Returns:
            The initial state returned by ``task.reset()``.
        """
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        """Record one transition, learn if possible and update the score.

        Args:
            action: Action that was taken from ``self.last_state``.
            reward: Reward received for that action.
            next_state: State reached after the action.
            done: Truthy when the episode ended with this transition.
        """
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

        # Stats: accumulate the episode score, keep the best finished one
        self.score += reward
        if done and self.score > self.top_score:
            self.top_score = self.score

    def act(self, states):
        """Returns actions for given state(s) as per current policy.

        The input is reshaped to ``(-1, state_size)``; only the first row of
        the local actor's prediction is used, and a noise sample is added to
        it for exploration.

        Returns:
            list: The noisy action, one entry per action dimension.
        """
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.

        Args:
            experiences: Iterable of objects with ``state``, ``action``,
                ``reward``, ``next_state`` and ``done`` attributes; ``None``
                entries are skipped.
        """
        batch = [e for e in experiences if e is not None]

        # Convert experience tuples to separate arrays for each element
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local);
        # (1 - dones) removes the bootstrap term on terminal transitions.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local).
        # The trailing 0 / 1 are presumably the Keras learning-phase flag
        # (0 = inference, 1 = training) -- TODO confirm against Actor/Critic.
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Sets every target tensor to
        ``tau * local + (1 - tau) * target``.

        The blend is done per tensor: the weight tensors of a network have
        different shapes, so packing the whole list into one ``np.array``
        fails on ragged input.

        Args:
            local_model: Object whose ``get_weights()`` returns a list of
                arrays.
            target_model: Object with matching ``get_weights()`` and a
                ``set_weights(list)`` method; it is updated in place.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = [
            self.tau * np.asarray(local) + (1 - self.tau) * np.asarray(target)
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)
class DDPG_Agent:
    """Reinforcement learning agent that learns through DDPG."""

    def __init__(self, task):
        """Initialize DDPG Agent instance.

        Args:
            task: Environment object. Its ``state_size``, ``action_size``,
                ``action_high`` and ``action_low`` attributes are read here;
                its ``reset()`` method is called by ``reset_episode``.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # Actor (Policy) Model: local and target copies.
        # NOTE(review): the bounds are passed as (high, low) here -- confirm
        # this matches the parameter order of Actor.__init__.
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # Critic (Value) Model: local and target copies
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # Exploration noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score bookkeeping
        self.best_score = -np.inf
        self.total_reward = 0.0
        self.count = 0
        self.score = 0

    def reset_episode(self):
        """Reset episode to initial state.

        Returns:
            The initial state returned by ``task.reset()``.
        """
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Take a step: record the transition and learn when possible.

        Args:
            action: Action that was taken from ``self.last_state``.
            reward: Reward received for that action.
            next_state: State reached after the action.
            done: Truthy when the episode ended with this transition.
        """
        self.total_reward += reward
        self.count += 1

        # Save experience/reward
        self.memory.memorize(self.last_state, action, reward, next_state,
                             done)

        # Learn if enough samples are available in memory.
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over the state; without this every stored transition would
        # start from the state returned by reset_episode().
        self.last_state = next_state

    def act(self, state):
        """Returns actions for state(s) according to given policy.

        The input is reshaped to ``(-1, state_size)``; only the first row of
        the local actor's prediction is used.

        Returns:
            list: The predicted action plus a noise sample for exploration.
        """
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Add some noise to action for exploration and return
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.

        Also refreshes ``score`` (mean reward per step of the current
        episode) and ``best_score``.

        Args:
            experiences: Iterable of objects with ``state``, ``action``,
                ``reward``, ``next_state`` and ``done`` attributes; ``None``
                entries are skipped.
        """
        self.score = self.total_reward / \
            float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

        batch = [e for e in experiences if e is not None]
        states = np.vstack([e.state for e in batch])
        actions = np.vstack([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.vstack([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.vstack([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, next_actions])

        # Compute Q targets for current states and train critic model (local);
        # (1 - dones) removes the bootstrap term on terminal transitions.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        # [states, actions, 0] 0 is for No learning Phase
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Sets every target tensor to ``tau * local + (1 - tau) * target``.
        The blend is done per tensor because the weight tensors of a network
        have different shapes and cannot be packed into one ``np.array``.

        Args:
            local_model: Object whose ``get_weights()`` returns a list of
                arrays.
            target_model: Object with matching ``get_weights()`` and a
                ``set_weights(list)`` method; it is updated in place.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), (
            "Local and target model parameters must have the same size")

        new_weights = [
            self.tau * np.asarray(local) + (1 - self.tau) * np.asarray(target)
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)
class Agent():
    """DDPG-style agent built on a shared TensorFlow session.

    Holds local and target copies of a ``QNetwork`` critic and a ``Policy``
    actor, a replay buffer that is fed TD errors, and the assign ops that
    blend local variables into the target variables.
    """

    def __init__(self, task, sess, stats):
        """Create the networks, soft-copy ops, noise and replay memory.

        Args:
            task: Environment object; ``action_size``, ``action_low``,
                ``action_high`` and ``reset()`` are used by this class.
            sess: Session object used to run the soft-copy ops and the
                global variable initializer.
            stats: Statistics collector handed to the networks and to
                ``memory.scrape_stats``.
        """
        self.sess = sess
        self.task = task
        self.stats = stats

        tau = 0.01  # interpolation factor for the soft-copy ops

        self.critic_local = QNetwork(sess, task, stats, name='critic_local',
                                     hidden_units=64, dropout_rate=0.2)
        self.critic_target = QNetwork(sess, task, stats, name='critic_target',
                                      hidden_units=64, dropout_rate=0.2)

        self.actor_local = Policy(sess, task, stats, name='actor_local',
                                  hidden_units=32, dropout_rate=0.2)
        self.actor_target = Policy(sess, task, stats, name='actor_target',
                                   hidden_units=32, dropout_rate=0.2)

        soft_copy_critic_ops = self._create_soft_copy_op(
            'critic_local', 'critic_target', tau=tau)
        soft_copy_actor_ops = self._create_soft_copy_op(
            'actor_local', 'actor_target', tau=tau)

        self._soft_copy_ops = []
        self._soft_copy_ops.extend(soft_copy_critic_ops)
        self._soft_copy_ops.extend(soft_copy_actor_ops)

        self.gamma = 0.99  # reward discount rate

        # Exploration noise process
        exploration_mu = 0
        exploration_theta = 0.15
        exploration_sigma = 0.15
        self.noise = OUNoise(task.action_size, exploration_mu,
                             exploration_theta, exploration_sigma)

        # Replay memory
        self.batch_size = 256
        self.memory = ReplayBuffer(buffer_size=10000, decay_steps=1000)

        self.sess.run(tf.global_variables_initializer())

    def reset_episode(self):
        """Reset noise and task, and call ``memory.decay_a()``.

        Returns:
            The initial state returned by ``task.reset()``.
        """
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.memory.decay_a()
        return state

    def step(self, action, reward, next_state, done):
        """Store one transition, run a learning step and roll the state.

        Args:
            action: Action that was taken from ``self.last_state``.
            reward: Reward received for that action.
            next_state: State reached after the action.
            done: Truthy when the episode ended with this transition.
        """
        # Save experience
        self._save_experience(self.last_state, action, reward, next_state,
                              done)

        # Learn, if enough samples are available in memory
        self.learn()

        # Roll over last state
        self.last_state = next_state

    def act(self, state, explore=False):
        """Returns actions for given state(s) as per current policy.

        Args:
            state: A single state; it is wrapped in a one-element batch.
            explore: When true the local actor is used, a noise sample is
                added and the result is clipped to the task's action
                bounds. When false the target actor is used and its output
                is only checked against the bounds, not clipped.

        Returns:
            The action for ``state``.

        Raises:
            AssertionError: If the action contains NaN or lies outside
                ``[task.action_low, task.action_high]``.
        """
        actor = self.actor_local if explore else self.actor_target
        action = actor.act([state], explore)[0]
        assert not np.any(np.isnan(action))

        if explore:
            action = action + self.noise.sample()
            action = np.maximum(action, self.task.action_low)
            action = np.minimum(action, self.task.action_high)

        assert not np.any(np.isnan(action))
        # The message is built from self.task; a bare `task` is not defined
        # in this scope and would mask the failure with a NameError.
        assert np.all(action >= self.task.action_low), \
            "expected at least {}, but was {}".format(
                self.task.action_low, action)
        assert np.all(action <= self.task.action_high)
        return action

    def learn(self):
        """Update policy and value parameters using a sampled batch.

        Does nothing until the memory holds at least ``batch_size`` items.
        """
        if len(self.memory) < self.batch_size:
            return

        # Convert experience tuples to separate arrays for each element
        experiences, experience_indexes = self.memory.sample(self.batch_size)
        action_size = self.task.action_size
        states = np.vstack([e.state for e in experiences])
        actions = np.array([e.action for e in experiences]).astype(
            np.float32).reshape(-1, action_size)
        rewards = np.array([e.reward for e in experiences]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences])

        # Get predicted next-state actions, Q and V values
        actions_next = self.actor_target.act(next_states)
        Q_targets_next, V_targets_next = self.critic_target.get_q_and_v(
            next_states, actions_next)

        # Compute Q targets for current states and train critic model (local);
        # (1 - dones) removes the bootstrap term on terminal transitions.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        V_targets = rewards + self.gamma * V_targets_next * (1 - dones)
        td_errs = self.critic_local.learn(states, actions, Q_targets,
                                          V_targets)

        # Feed the new TD errors back to the memory for the sampled items
        self.memory.update_td_err(experience_indexes, td_errs)
        self.memory.scrape_stats(self.stats)

        # Train actor model.
        # NOTE(review): actions and action gradients come from the *target*
        # actor/critic here; textbook DDPG uses the local networks for the
        # policy gradient -- confirm this is intentional.
        actions = self.actor_target.act(states)
        action_gradients = self.critic_target.get_action_gradients(
            states, actions)
        self.actor_local.learn(states, action_gradients)

        self._soft_copy()

    def _save_experience(self, state, action, reward, next_state, done):
        """Adds experience into ReplayBuffer together with its TD error.

        The TD error is computed with the local actor and critic.
        NOTE(review): the original docstring said this also trains the Q
        network on the sample; whether ``get_td_err`` has that side effect
        cannot be seen from here -- confirm against QNetwork.
        """
        # Get predicted next-state actions and Q values
        actions_next = self.actor_local.act([next_state])
        Q_targets_next, _ = self.critic_local.get_q_and_v([next_state],
                                                          actions_next)
        Q_target_next = Q_targets_next[0]
        Q_target = reward + self.gamma * Q_target_next * (1 - done)
        td_err = self.critic_local.get_td_err([state], [action], [Q_target])

        self.memory.add(Experience(state, action, reward, next_state, done),
                        td_err)

    def _soft_copy(self):
        """Run all soft-copy assign ops in the session."""
        self.sess.run(self._soft_copy_ops)

    def _create_soft_copy_op(self, scope_src, scope_dst, tau=0.01):
        """Build ops assigning ``tau * src + (1 - tau) * dst`` to each dst.

        Variables are paired in the order ``tf.trainable_variables`` returns
        them for each scope.

        Args:
            scope_src: Variable scope of the source network.
            scope_dst: Variable scope of the destination network.
            tau: Interpolation factor.

        Returns:
            list: One assign op per variable pair.
        """
        var_src = tf.trainable_variables(scope=scope_src)
        var_dst = tf.trainable_variables(scope=scope_dst)

        copy_ops = []
        for src, dst in zip(var_src, var_dst):
            mixed = tau * src + (1.0 - tau) * dst
            copy_op = tf.assign(dst, mixed)
            copy_ops.append(copy_op)
        return copy_ops
class DDPG:
    """Agent that learns with DDPG using local/target actor and critic."""

    def __init__(self, task):
        """Create the replay memory, noise process and networks.

        Args:
            task: Environment object. Its ``state_size``, ``action_size``,
                ``action_low`` and ``action_high`` attributes are read here;
                its ``reset()`` method is called by ``reset_episode``.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Exploration noise parameters
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.001

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.1  # soft-update interpolation factor
        self.learning_rate = 0.0005

        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 learning_rate=self.learning_rate)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  learning_rate=self.learning_rate)

        self.critic_local = Critic(self.state_size, self.action_size,
                                   learning_rate=self.learning_rate)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    learning_rate=self.learning_rate)

        # Start the target networks as exact copies of the local ones so the
        # first bootstrap targets are not produced by unrelated weights.
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

    def reset_episode(self):
        """Reset the episode counters, the noise process and the task.

        Returns:
            The initial state returned by ``task.reset()``.
        """
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        self.last_state = self.task.reset()
        return self.last_state

    def act(self, state):
        """Return the local actor's action for ``state`` plus noise.

        The input is reshaped to ``(-1, state_size)``; only the first row of
        the prediction is used.

        Returns:
            list: The noisy action, one entry per action dimension.
        """
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def step(self, action, reward, next_state, done):
        """Record one transition, learn if possible and roll the state.

        Args:
            action: Action that was taken from ``self.last_state``.
            reward: Reward received for that action.
            next_state: State reached after the action.
            done: Truthy when the episode ended with this transition.
        """
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        # Learn once the memory holds more than one batch worth of samples
        if self.memory.size() > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        self.last_state = next_state

    def learn(self, experiences):
        """Train critic and actor on a batch, then soft-update the targets.

        Args:
            experiences: Iterable of objects with ``state``, ``action``,
                ``reward``, ``next_state`` and ``done`` attributes; ``None``
                entries are skipped.
        """
        batch = [e for e in experiences if e is not None]
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # Bootstrap targets from the target networks
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # (1 - dones) removes the bootstrap term on terminal transitions
        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=q_targets)

        # The trailing 0 / 1 are presumably the Keras learning-phase flag
        # (0 = inference, 1 = training) -- TODO confirm against Actor/Critic.
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Blend local weights into the target model.

        Sets every target tensor to ``tau * local + (1 - tau) * target``.
        The blend is done per tensor because the weight tensors of a network
        have different shapes and cannot be packed into one ``np.array``.

        Args:
            local_model: Object whose ``get_weights()`` returns a list of
                arrays.
            target_model: Object with matching ``get_weights()`` and a
                ``set_weights(list)`` method; it is updated in place.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = [
            self.tau * np.asarray(local) + (1 - self.tau) * np.asarray(target)
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        """Create the networks, the exploration noise and the replay memory.

        Args:
            task: Environment object. Its ``state_size``, ``action_size``,
                ``action_low`` and ``action_high`` attributes are read here;
                its ``reset()`` method is called by ``reset_episode``.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Left over from an earlier random policy search. Nothing in this
        # class reads these attributes; they are kept so external code that
        # inspects them keeps working.
        self.action_range = self.action_high - self.action_low
        self.w = np.random.normal(
            # weights for simple linear policy: state_space x action_space
            size=(self.state_size, self.action_size),
            # start producing actions in a decent range
            scale=(self.action_range / (2 * self.state_size)))

        # Score tracker and learning parameters
        self.score = -np.inf
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1

        # Step counter for the current episode
        self.count = 0

    def reset_episode(self):
        """Reset the noise process, the episode counters and the task.

        Returns:
            The initial state returned by ``task.reset()``.
        """
        self.noise.reset()
        self.count = 0
        self.total_reward = 0.0
        self.score = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Record one transition, learn if possible and update the score.

        Args:
            action: Action that was taken from ``self.last_state``.
            reward: Reward received for that action.
            next_state: State reached after the action.
            done: Truthy when the episode ended with this transition.
        """
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.count += 1
        self.total_reward += reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

        # Accumulate the episode score; keep the best finished episode
        self.score += reward
        if done and self.score > self.best_score:
            self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy.

        The input is reshaped to ``(-1, state_size)``; only the first row of
        the local actor's prediction is used, and a noise sample is added to
        it for exploration.

        Returns:
            list: The noisy action, one entry per action dimension.
        """
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.

        Args:
            experiences: Iterable of objects with ``state``, ``action``,
                ``reward``, ``next_state`` and ``done`` attributes; ``None``
                entries are skipped.
        """
        batch = [e for e in experiences if e is not None]

        # Convert experience tuples to separate arrays for each element
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local);
        # (1 - dones) removes the bootstrap term on terminal transitions.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local).
        # The trailing 0 / 1 are presumably the Keras learning-phase flag
        # (0 = inference, 1 = training) -- TODO confirm against Actor/Critic.
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Sets every target tensor to ``tau * local + (1 - tau) * target``.
        The blend is done per tensor because the weight tensors of a network
        have different shapes and cannot be packed into one ``np.array``.

        Args:
            local_model: Object whose ``get_weights()`` returns a list of
                arrays.
            target_model: Object with matching ``get_weights()`` and a
                ``set_weights(list)`` method; it is updated in place.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = [
            self.tau * np.asarray(local) + (1 - self.tau) * np.asarray(target)
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)
class AE_DDPG_Agent():
    """Reinforcement Learning agent that learns using DDPG.

    Exploration noise is scaled by a factor that decays exponentially with
    the number of steps taken in the current episode.
    """

    def __init__(self, task):
        """Create networks, noise, replay memory and start an episode.

        Note that construction ends with ``reset_episode()``, which calls
        ``task.reset()``.

        Args:
            task: Environment object. Its ``state_size``, ``action_size``,
                ``action_low`` and ``action_high`` attributes are read here.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # AE: OUNoise supplies the random samples; these variables control
        # AE: how strongly they are applied (explore vs exploit strategy).
        self.explore_start = 1.0  # AE: exploration factor at start
        self.explore_stop = 0.001  # AE: minimum exploration factor
        self.decay_rate = 0.003  # AE: exponential decay rate of the factor
        self.magnitude_coeff = 0.1  # AE: a coefficient to limit randomness

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process. The per-parameter remarks are the original
        # author's reading of OUNoise -- verify against its implementation.
        self.exploration_mu = 0  # AE: additive term (mu * theta is added)
        self.exploration_theta = 0.15  # AE: multiplier for the old noise
        self.exploration_sigma = 0.2  # AE: multiplier for the new noise
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        # Interpolation factor of the soft target update: how much of the
        # local weights is mixed into the target weights on every update.
        self.tau = 0.0001

        # AE: current reward in learning procedure (for statistics)
        self.score = -np.inf

        # Episode variables
        self.reset_episode()

    def reset_episode(self):
        """Reset noise, task and all per-episode statistics.

        NOTE(review): ``best_score`` is reset here as well, so it is the
        best running average within one episode, not across episodes.

        Returns:
            The initial state returned by ``task.reset()``.
        """
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.best_score = -np.inf
        self.score = -np.inf
        self.total_reward = 0.0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
        """Record one transition, learn if possible and update statistics.

        Args:
            action: Action that was taken from ``self.last_state``.
            reward: Reward received for that action.
            next_state: State reached after the action.
            done: Truthy when the episode ended with this transition.
        """
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

        self.total_reward += reward
        self.count += 1

        # AE: Score (average reward in this episode so far) and best score
        self.score = self.total_reward / float(self.count)
        if self.score > self.best_score:
            self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy.

        The input is reshaped to ``(-1, state_size)``; only the first row of
        the local actor's prediction is used. The added noise is
        ``magnitude_coeff * explore_p * noise.sample() * action_size``.

        Returns:
            list: The noisy action, one entry per action dimension.
        """
        state = np.reshape(state, [-1, self.state_size])

        # AE: action proposed by the local actor (policy) network
        action = self.actor_local.model.predict(state)[0]

        # AE: the magnitude of the noise drops as steps are taken. `count`
        # AE: is zeroed in reset_episode, so the decay restarts per episode.
        explore_p = self.explore_stop + (
            self.explore_start - self.explore_stop) * np.exp(
                -self.decay_rate * self.count)
        noise_sample = self.magnitude_coeff * explore_p * self.noise.sample()

        # add some noise for exploration
        return list(action + noise_sample * self.action_size)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.

        Args:
            experiences: Iterable of objects with ``state``, ``action``,
                ``reward``, ``next_state`` and ``done`` attributes; ``None``
                entries are skipped.
        """
        batch = [e for e in experiences if e is not None]

        # Convert experience tuples to separate arrays for each element
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local);
        # (1 - dones) removes the bootstrap term on terminal transitions.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local).
        # The trailing 0 / 1 are presumably the Keras learning-phase flag
        # (0 = inference, 1 = training) -- TODO confirm against Actor/Critic.
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Sets every target tensor to ``tau * local + (1 - tau) * target``.
        The blend is done per tensor because the weight tensors of a network
        have different shapes and cannot be packed into one ``np.array``.

        Args:
            local_model: Object whose ``get_weights()`` returns a list of
                arrays.
            target_model: Object with matching ``get_weights()`` and a
                ``set_weights(list)`` method; it is updated in place.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        # AE: Updating NN weights directly in the passed target model.
        new_weights = [
            self.tau * np.asarray(local) + (1 - self.tau) * np.asarray(target)
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)
class RLA():
    """Reinforcement learning agent using local/target actor and critic."""

    def __init__(self, task):
        """Create the networks, the exploration noise and the replay memory.

        Args:
            task: Environment object. Its ``state_size``, ``action_size``,
                ``action_low`` and ``action_high`` attributes are read here;
                its ``reset()`` method is called by ``reset_episode``.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model params with local params
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Initialize noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory initialization
        self.buffer_size, self.batch_size = 2000000, 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters: discount factor and soft-update factor
        self.gamma, self.tau = 0.95, 0.001

        # Running episode score and best finished episode score
        self.score, self.best_score = -np.inf, -np.inf

    def reset_episode(self):
        """Reset the noise process, the task and the episode score.

        Returns:
            The initial state returned by ``task.reset()``.
        """
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        """Record one transition, learn if possible and update the score.

        Args:
            action: Action that was taken from ``self.last_state``.
            reward: Reward received for that action.
            next_state: State reached after the action.
            done: Truthy when the episode ended with this transition.
        """
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn once the memory holds more samples than one batch
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Preserve state as last_state
        self.last_state = next_state

        # Update score with reward from this step
        self.score += reward

        # Preserve best score of a finished episode
        if done and self.score > self.best_score:
            self.best_score = self.score

    def act(self, state):
        """Return the local actor's action for ``state`` plus noise.

        The input is reshaped to ``(-1, state_size)``; only the first row of
        the prediction is used.

        Returns:
            list: The noisy action, one entry per action dimension.
        """
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Train critic and actor on a batch, then update the targets.

        Args:
            experiences: Iterable of objects with ``state``, ``action``,
                ``reward``, ``next_state`` and ``done`` attributes; ``None``
                entries are skipped.
        """
        batch = [exp for exp in experiences if exp is not None]

        # Convert experiences to separate arrays
        states = np.vstack([exp.state for exp in batch])
        actions = np.array([exp.action for exp in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([exp.reward for exp in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([exp.done for exp in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([exp.next_state for exp in batch])

        # Predict next-state actions and Q values from the target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # (1 - dones) removes the bootstrap term on terminal transitions
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train local actor model.
        # The trailing 0 / 1 are presumably the Keras learning-phase flag
        # (0 = inference, 1 = training) -- TODO confirm against Actor/Critic.
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Update target models
        self.update(self.critic_local.model, self.critic_target.model)
        self.update(self.actor_local.model, self.actor_target.model)

    def update(self, local_model, target_model):
        """Update model parameters.

        Sets every target tensor to ``tau * local + (1 - tau) * target``.
        The blend is done per tensor because the weight tensors of a network
        have different shapes and cannot be packed into one ``np.array``.

        Args:
            local_model: Object whose ``get_weights()`` returns a list of
                arrays.
            target_model: Object with matching ``get_weights()`` and a
                ``set_weights(list)`` method; it is updated in place.

        Raises:
            ValueError: If the two models return a different number of
                weight tensors.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        # zip() would silently drop unmatched tensors, so check up front.
        if len(local_weights) != len(target_weights):
            raise ValueError(
                "Local and target model parameters must have the same size")

        new_weights = [
            self.tau * np.asarray(local) + (1 - self.tau) * np.asarray(target)
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)
class DDPG_Agent:
    """Reinforcement learning agent that learns through DDPG."""

    def __init__(self, task):
        """Initialize DDPG Agent instance.

        Args:
            task: Environment object. Its ``state_size``, ``action_size``,
                ``action_high`` and ``action_low`` attributes are read here;
                its ``reset()`` method is called by ``reset_episode``.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # Actor (Policy) Model: local and target copies.
        # NOTE(review): the bounds are passed as (high, low) here -- confirm
        # this matches the parameter order of Actor.__init__.
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # Critic (Value) Model: local and target copies
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # Exploration noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score bookkeeping
        self.best_score = -np.inf
        self.total_reward = 0.0
        self.count = 0
        self.score = 0

    def reset_episode(self):
        """Reset episode to initial state.

        Returns:
            The initial state returned by ``task.reset()``.
        """
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Take a step: record the transition and learn when possible.

        Args:
            action: Action that was taken from ``self.last_state``.
            reward: Reward received for that action.
            next_state: State reached after the action.
            done: Truthy when the episode ended with this transition.
        """
        self.total_reward += reward
        self.count += 1

        # Save experience/reward
        self.memory.memorize(self.last_state, action, reward, next_state,
                             done)

        # Learn if enough samples are available in memory.
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over the state; without this every stored transition would
        # start from the state returned by reset_episode().
        self.last_state = next_state

    def act(self, state):
        """Returns actions for state(s) according to given policy.

        The input is reshaped to ``(-1, state_size)``; only the first row of
        the local actor's prediction is used.

        Returns:
            list: The predicted action plus a noise sample for exploration.
        """
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Add some noise to action for exploration and return
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.

        Also refreshes ``score`` (mean reward per step of the current
        episode) and ``best_score``.

        Args:
            experiences: Iterable of objects with ``state``, ``action``,
                ``reward``, ``next_state`` and ``done`` attributes; ``None``
                entries are skipped.
        """
        self.score = self.total_reward / \
            float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

        batch = [e for e in experiences if e is not None]
        states = np.vstack([e.state for e in batch])
        actions = np.vstack([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.vstack([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.vstack([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, next_actions])

        # Compute Q targets for current states and train critic model (local);
        # (1 - dones) removes the bootstrap term on terminal transitions.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(
            x=[states, actions], y=Q_targets)

        # Train actor model (local)
        # [states, actions, 0] 0 is for No learning Phase
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Sets every target tensor to ``tau * local + (1 - tau) * target``.
        The blend is done per tensor because the weight tensors of a network
        have different shapes and cannot be packed into one ``np.array``.

        Args:
            local_model: Object whose ``get_weights()`` returns a list of
                arrays.
            target_model: Object with matching ``get_weights()`` and a
                ``set_weights(list)`` method; it is updated in place.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), (
            "Local and target model parameters must have the same size")

        new_weights = [
            self.tau * np.asarray(local) + (1 - self.tau) * np.asarray(target)
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)