class DDPG(): """Reinforcement learning agent who learns using DDPG""" def __init__(self,task): """Initialize models""" self.env = task self.state_size = task.observation_space.shape[0] self.action_size = task.action_space.shape[0] self.action_high = task.action_space.high self.action_low = task.action_space.low # Initialize Actor (policy) models self.actor_local = Actor(self.state_size,self.action_size,self.action_low,self.action_high) self.actor_target = Actor(self.state_size,self.action_size,self.action_low,self.action_high) # Initialize Critic (value) models self.critic_local = Critic(self.state_size,self.action_size) self.critic_target = Critic(self.state_size,self.action_size) # Initialize target model parameters with local model parameters self.actor_target.model.set_weights(self.actor_local.model.get_weights()) self.critic_target.model.set_weights(self.critic_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay buffer self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size,self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters def reset_episode(self,task): """Return state after reseting task""" self.noise.reset() state = task.reset() self.last_state = state return state def step(self,action,reward,next_state,done): # Add experience to memory self.memory.add_experience(self.last_state,action,reward,next_state,done) # Learn is memory is larger than batch size if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over state self.last_state = next_state def act(self,state): """Returns action using the policy network """ state = np.reshape(state,[-1,self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action+self.noise.sample()) def learn(self,experiences): # Convert experience tuples to separate arrays for each element states = np.vstack([e.state for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.state_size) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.action_size) next_states = np.vstack([e.next_state for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.state_size) rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1,1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1,1) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict(next_states) Q_targets_next = self.critic_target.model.predict([next_states,actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma*Q_targets_next*(1-dones) self.critic_local.model.train_on_batch(x=[states,actions],y=Q_targets) # Train actor model (local) action_gradients = np.reshape(self.critic_local.get_action_gradients([states,actions,0]), [-1,self.action_size]) self.actor_local.train_fn([states,action_gradients,1]) # Soft-update target models self.soft_update(self.actor_local.model,self.actor_target.model) self.soft_update(self.critic_local.model,self.critic_target.model) def 
soft_update(self,local_model,target_model): local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size" new_weights = self.tau*local_weights + (1-self.tau)*target_weights target_model.set_weights(new_weights) def save_model(self,path): self.actor_local.model.save_weights(path) def load_model(self,path): self.actor_local.model.load_weights(path) def act_only(self,state): state = np.reshape(state,[-1,self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action)
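# The agents in this file construct their exploration noise as
# OUNoise(size, mu, theta, sigma) and call reset() and sample() on it, but the
# class itself is not defined here. Below is a minimal sketch of an
# Ornstein-Uhlenbeck process matching that interface, using the standard
# discretised update dx = theta * (mu - x) + sigma * N(0, 1); the exact
# implementation these agents were written against may differ.
import copy
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck noise process (sketch of the assumed interface)."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state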
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task, verbose=False): self.verbose = verbose self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) #log_path = '/tmp/logs' #self.callback = callbacks.TensorBoard(log_dir=log_path, histogram_freq=1, # write_images=False, write_grads=True, write_graph=False) #self.callback.set_model(self.critic_local.model) #log_path = '/tmp/logs' #self.writer = tf.summary.FileWriter(log_path) #self.learn_counter = 0 # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0.1 self.exploration_theta = 0.2 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 512 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.015 # for soft update of target parameters def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state #self.learn_counter = 0 return state def mimic(self, experience_to_mimic): print("ready to mimic") self.memory.memory = experience_to_mimic def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) def save_grads(writer, model): for layer in model.layers: for weight in layer.weights: mapped_weight_name = weight.name.replace(':', '_') tf.summary.histogram(mapped_weight_name, weight) grads = model.optimizer.get_gradients( model.total_loss, weight) def is_indexed_slices(grad): return type(grad).__name__ == 'IndexedSlices' grads = [ grad.values if is_indexed_slices(grad) else grad for grad in grads ] tf.summary.histogram('{}_grad'.format(mapped_weight_name), grads) merged = tf.summary.merge_all() writer.flush() writer.close() #save_grads(self.writer, self.critic_local.model) #def write_log(callback, names, logs, batch_no): # for name, value in zip(names, logs): # summary = tf.Summary() # summary_value = summary.value.add() # summary_value.simple_value = value # summary_value.tag = name # callback.writer.add_summary(summary, batch_no) # callback.writer.flush() #train_names = ['train_loss', 'train_mae'] #print("about to write log") #write_log(self.callback, train_names, logs, self.learn_counter) #trainable_weights = critic_local.model.trainable_weights #gradients = critic_local.model.optimizer.get_gradients(critic_local.model.total_loss, trainable_weights) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) #self.learn_counter += 1 def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights) def _save_weight(self, model, directory_name, file_name): cwd = os.getcwd() directory_path = os.path.join(cwd, directory_name) if not os.path.exists(directory_path): os.makedirs(directory_path) file_path = os.path.join(directory_path, file_name) mv_file_to_dir_with_date(file_path, directory_path) model.save_weights(file_path) def save_weights(self, location='weights_backup'): if self.verbose: print("start save_weights") self._save_weight(self.critic_local.model, location, "critic_local.h5") self._save_weight(self.critic_target.model, location, "critic_target.h5") 
self._save_weight(self.actor_local.model, location, "actor_local.h5") self._save_weight(self.actor_target.model, location, "actor_target.h5") if self.verbose: print("done save_weights") def _h5(self, model, file_path): if os.path.exists(file_path): model.load_weights(file_path) else: print(f'could not find weight to load from [{file_path}]') def load_weights(self, location='weights_backup'): if self.verbose: print("start load_weights") cwd = os.getcwd() directory_path = os.path.join(cwd, location) self._h5(self.critic_local.model, os.path.join(directory_path, "critic_local.h5")) self._h5(self.critic_target.model, os.path.join(directory_path, "critic_target.h5")) self._h5(self.actor_local.model, os.path.join(directory_path, "actor_local.h5")) self._h5(self.actor_target.model, os.path.join(directory_path, "actor_target.h5")) if self.verbose: print("done load_weights")
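# ReplayBuffer is referenced throughout this file but never defined. Below is a
# minimal sketch matching the interface most of these agents call: a
# (buffer_size, batch_size) constructor plus add(), sample() and len(). Note
# that some variants assume a different interface (a one-argument constructor,
# memory.size(), or a sample() that returns pre-split arrays); this sketch is
# an assumption, not the original implementation.
import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer of experience tuples (interface sketch)."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)   # oldest experiences drop out first
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Store a new experience tuple."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        """Return a random batch of experience tuples."""
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)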
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task, basename): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # learning rates self.actor_learning_rate = 0.0001 self.critic_learning_rate = 0.001 # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size, self.critic_learning_rate) self.critic_target = Critic(self.state_size, self.action_size, self.critic_learning_rate) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 1000000 self.batch_size = 128 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters # keep track of the best run self.nEpisode = 0 self.bestEpisode = [] self.bestEpisodeAt = -1 # logging business self.state_labels = self.task.get_state_labels() self.action_labels = [ 'ac{}'.format(i) for i in range(self.action_size) ] self.df_columns = [ 't' ] + self.state_labels.tolist() + self.action_labels + ['R'] self.basename = os.path.join('log', basename) self.currentEpisode = [] self.bestCumReward = -np.inf def reset_episode(self): self.noise.reset() self.last_state = self.task.reset() self.currentEpisode = [] return self.last_state def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights),\ "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights) def step(self, action): last_state_variables = self.task.get_state_variables() last_t = self.task.sim.get_time() # call the model for state transition next_state, reward, done = self.task.step(action) # logging the current episode self.currentEpisode += [ np.hstack([last_t, last_state_variables, action, reward]) ] # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state if done: # log the episode df = pd.DataFrame(self.currentEpisode, columns=self.df_columns) fn_i = '{}_{}'.format(self.basename, self.nEpisode) df.to_csv(fn_i + '.csv') cumR = df.R.sum() if len(df) > len(self.bestEpisode) or \ (len(df) == len(self.bestEpisode) and cumR > self.bestCumReward): self.bestCumReward = cumR self.bestEpisode = df self.bestEpisodeAt = self.nEpisode self.plot_episode(df, self.nEpisode, fn_i) sys.stdout.write( "\rEp#{:4d} dur_{} cumR_{:5.3f} best@{} dur_{} cumR_{:5.3f} ". 
format(self.nEpisode, len(self.bestEpisode), cumR, self.bestEpisodeAt, len(self.bestEpisode), self.bestCumReward)) self.nEpisode += 1 return next_state, done def train(self, num_episodes=1): for ep_i in range(num_episodes): state, done = self.reset_episode(), False while not done: action = self.act(state) state, done = self.step(action) def plot_episode(self, df, episNo, filename=''): fig = plt.figure(1) fig.clf() ax2 = fig.add_subplot(313) ax1 = fig.add_subplot(312, sharex=ax2) ax0 = fig.add_subplot(311, sharex=ax2) # plot selected state variables ax0.set_title('Ep#{} dur={:5.2f} sec'.format(episNo, df.t.iloc[-1])) df.plot(x='t', y=self.state_labels[:6], ax=ax0, style='.:') df.plot(x='t', y=self.state_labels[6:], ax=ax1, style='.:') df.plot(x='t', y=self.action_labels, ax=ax2, style='.:') df.plot(x='t', y='R', ax=ax2, secondary_y=True) plt.ylabel('Reward') plt.show() if len(filename) > 0: fig.savefig(filename)
class DDPG(): """ Reinforcement Learning Agent. """ def __init__(self, task, exp_mu, exp_theta, exp_sigma, gamma, tau): self.task = task self.s_size = task.s_size self.a_size = task.a_size self.a_low = task.a_low self.a_high = task.a_high # Actor Model self.actor_local = Actor(self.s_size, self.a_size, self.a_low, self.a_high) self.actor_target = Actor(self.s_size, self.a_size, self.a_low, self.a_high) # Critic Model self.critic_local = Critic(self.s_size, self.a_size) self.critic_target = Critic(self.s_size, self.a_size) # Initialize target model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # initialize noise self.exp_mu = exp_mu self.exp_theta = exp_theta self.exp_sigma = exp_sigma self.noise = OUNoise(self.a_size, self.exp_mu, self.exp_theta, self.exp_sigma) # For Replay buffer self.buff_size = 1024 * 1024 self.batch_size = 64 self.memory = ReplayBuffer(self.buff_size, self.batch_size) # discount factor self.gamma = gamma # for soft update of target parameters self.tau = tau def reset_episode(self): self.noise.reset() state = self.task.reset() # last state self.l_state = state return state # A - Action, R - Reward, D - Done def step(self, A, R, nState, D): # save experience to memory self.memory.add(self.l_state, A, R, nState, D) # Learn, if enough samples (experiences) are available in memory if len(self.memory) > self.batch_size: self.learn(self.memory.sample()) self.l_state = nState def act(self, states): S = np.reshape(states, [-1, self.s_size]) A = self.actor_local.model.predict(S)[0] return list(A + self.noise.sample()) def learn(self, exp): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) S = np.vstack([e.state for e in exp if e is not None]) A = np.array([e.action for e in exp if e is not None ]).astype(np.float32).reshape(-1, self.a_size) R = np.array([e.reward for e in exp if e is not None]).astype(np.float32).reshape(-1, 1) D = np.array([e.done for e in exp if e is not None]).astype(np.uint8).reshape(-1, 1) nS = np.vstack([e.next_state for e in exp if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) a_next = self.actor_target.model.predict_on_batch(nS) t_next = self.critic_target.model.predict_on_batch([nS, a_next]) # Compute Q targets for current state and train critic model (local) Q_targets = R + self.gamma * t_next * (1 - D) self.critic_local.model.train_on_batch(x=[S, A], y=Q_targets) # Train actor model (local) a_grad = np.reshape(self.critic_local.get_action_gradients([S, A, 0]), (-1, self.a_size)) self.actor_local.train_fn([S, a_grad, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" l_weights = np.array(local_model.get_weights()) t_weights = np.array(target_model.get_weights()) assert len(l_weights) == len( t_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * l_weights + (1 - self.tau) * t_weights target_model.set_weights(new_weights)
class DDPG():

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

    def create_models(self, hidden_sizes_actor=(512, 256), hidden_sizes_critic=(512, 256, 256)):
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 hidden_sizes=hidden_sizes_actor)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  hidden_sizes=hidden_sizes_actor)
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        self.critic_local = Critic(self.state_size, self.action_size,
                                   hidden_sizes=hidden_sizes_critic)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    hidden_sizes=hidden_sizes_critic)
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())

    def set_params(self, mu=0.1, sigma=0.1, theta=0.1, buffer_size=1e+8,
                   batch_size=128, gamma=0.99, tau=1e-3):
        self.exploration_mu = mu
        self.exploration_sigma = sigma
        self.exploration_theta = theta
        self.noise = noise(self.action_size, self.exploration_mu,
                           self.exploration_theta, self.exploration_sigma)
        self.buffer_size = int(buffer_size)
        self.batch_size = int(batch_size)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.gamma = gamma
        self.tau = tau

    def act(self, states):
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.calc_noise())

    def learn(self):
        states, actions, rewards, dones, next_states = self.buffer.sample(
            self.batch_size, self.action_size, self.state_size)

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # soft_update
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.buffer.add(self.last_state, action, reward, next_state, done)
        self.learn()
        self.last_state = next_state

    def soft_update(self, local_model, target_model):
        target_model.set_weights(
            self.tau * np.array(local_model.get_weights())
            + (1 - self.tau) * np.array(target_model.get_weights()))
class DDPG:

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.001

        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)

        self.gamma = 0.99
        self.tau = 0.1
        self.learning_rate = 0.0005

        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 learning_rate=self.learning_rate)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  learning_rate=self.learning_rate)

        self.critic_local = Critic(self.state_size, self.action_size,
                                   learning_rate=self.learning_rate)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    learning_rate=self.learning_rate)

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        self.last_state = self.task.reset()
        return self.last_state

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        if self.memory.size() > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        self.last_state = next_state

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))
        self.critic_local.model.train_on_batch(x=[states, actions], y=q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
class DDPG_Land():

    def __init__(self, task, seed=None, render=False):
        self.env = task.env
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        self.total_reward = 0
        self.steps = 0
        self.action_repeat = 3
        self.render = render

        # Score tracker and learning parameters
        self.score = -np.inf
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1

        # counter
        self.count = 0

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(1, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

    def act(self, s):
        # # print('act')
        # # a = lunder.heuristic(self.env, s)
        # # 1. Testing.
        # # 2. Demonstration rollout.
        # angle_targ = s[0]*0.5 + s[2]*1.0  # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed)
        # if angle_targ > 0.4: angle_targ = 0.4  # more than 0.4 radians (22 degrees) is bad
        # if angle_targ < -0.4: angle_targ = -0.4
        # hover_targ = 0.55*np.abs(s[0])  # target y should be proportional to horizontal offset
        # # PID controller: s[4] angle, s[5] angularSpeed
        # angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0
        # # print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo))
        # # PID controller: s[1] vertical coordinate s[3] vertical speed
        # hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5
        # # print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo))
        # if s[6] or s[7]:  # legs have contact
        #     angle_todo = 0
        #     hover_todo = -(s[3])*0.5  # override to reduce fall speed, that's all we need after contact
        # if self.env.continuous:
        #     a = np.array([hover_todo*20 - 1, -angle_todo*20])
        #     a = np.clip(a, -1, +1)
        # else:
        #     a = 0
        #     if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2
        #     elif angle_todo < -0.05: a = 3
        #     elif angle_todo > +0.05: a = 1
        # # return a
        # # state = s

        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(s, [-1, 24])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def step(self, action, reward, next_state, done):
        # print("step")
        # ob, reward, done, info = self.env.step(action)
        # print(ob)
        # next_state = ob

        # Save experience / reward
        reward = np.clip(reward, a_min=-100, a_max=100)
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.count += 1
        self.total_reward += reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        # from the tutorial SRC
        self.score += reward
        if done:
            # self.score = np.clip(self.score, a_min=-100, a_max=100)
            if self.score > self.best_score:
                self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    # # from the tutorial SRC
    # self.score += reward
    # if done:
    #     if self.score > self.best_score:
    #         self.best_score = self.score
    # # return ob, reward, done

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        # # from policy search
        # # Learn by random policy search, using a reward-based score
        # # self.score = self.total_reward / float(self.count) if self.count else 0.0
        # # if self.score > self.best_score:
        # #     self.best_score = self.score
        # #     self.best_w = self.w
        # #     self.noise_scale = max(0.5 * self.noise_scale, 0.01)
        # # else:
        # #     self.w = self.best_w
        # #     self.noise_scale = min(2.0 * self.noise_scale, 3.2)
        # # self.w = self.w + self.noise_scale * np.random.normal(size=self.w.shape)  # equal noise in all directions

    def reset(self):
        self.steps = 0
        self.total_reward = 0
        self.count = 0
        self.score = 0
        # self.best_score = 0
        """Reset the sim to start a new episode."""
        ob = self.env.reset()
        state = np.concatenate([ob] * self.action_repeat)
        self.last_state = state
        return state
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights(self.critic_local.model.get_weights()) self.actor_target.model.set_weights(self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.01 # for soft update of target parameters # from plicy search self.action_range = self.action_high - self.action_low self.w = np.random.normal( size=(self.state_size, self.action_size), # weights for simple linear policy: state_space x action_space scale=(self.action_range / (2 * self.state_size))) # start producing actions in a decent range # Score tracker and learning parameters self.score = -np.inf self.best_w = None self.best_score = -np.inf self.noise_scale = 0.1 #counter self.count = 0 def reset_episode(self): self.noise.reset() self.count = 0 self.total_reward = 0.0 self.score = 0 state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) self.count += 1 self.total_reward += reward # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state #from the tutorial SRC self.score += reward if done: if self.score > self.best_score: self.best_score = self.score def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) # from policy search # Learn by random policy search, using a reward-based score # self.score = self.total_reward / float(self.count) if self.count else 0.0 # if self.score > self.best_score: # self.best_score = self.score # self.best_w = self.w # self.noise_scale = max(0.5 * self.noise_scale, 0.01) # else: # self.w = self.best_w # self.noise_scale = min(2.0 * self.noise_scale, 3.2) # self.w = self.w + self.noise_scale * np.random.normal(size=self.w.shape) # equal noise in all directions def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) self.critic_target.model.set_weights(self.critic_local.model.get_weights()) self.actor_target.model.set_weights(self.actor_local.model.get_weights()) self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.3 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) self.buffer_size = 1000000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) self.gamma = 0.99 self.tau = 0.001 def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): self.memory.add(self.last_state, action, reward, next_state, done) if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) self.last_state = next_state def act(self, states): """Returns actions for given state(s) as per current policy.""" state = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([e.next_state for e in experiences if e is not None]) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
class TD3(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task, lra, lrc, db): self.task = task self.s_sz = task.state_size self.a_sz = task.action_size self.a_max = task.max_action # Actor (Policy) Model self.actor_local = Actor(self.s_sz, self.a_sz, lra) self.actor_target = Actor(self.s_sz, self.a_sz, lra) # First Critic (Value) Model self.critic_local_1 = Critic(self.s_sz, self.a_sz, lrc) self.critic_target_1 = Critic(self.s_sz, self.a_sz, lrc) # Second Critic (Value) Model self.critic_local_2 = Critic(self.s_sz, self.a_sz, lrc) self.critic_target_2 = Critic(self.s_sz, self.a_sz, lrc) # Initialize target model parameters with local model parameters self.critic_target_1.model.set_weights( self.critic_local_1.model.get_weights()) self.critic_target_2.model.set_weights( self.critic_local_2.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.noise = GaussianNoise(self.a_sz) # Replay memory self.num_exp = 0 self.batch = 32 self.buffer = 10000 labels = ["state", "action", "reward", "next_state", "done"] self.experience = namedtuple("Experience", field_names=labels) self.memory = PrioritizedReplayBuffer(self.buffer, self.batch, db) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.005 # for soft update of target parameters def reset_episode(self): state = self.task.reset() self.last_state = state self.num_exp return state def step(self, action, reward, next_state, done, PER_init=False): # Save experience / reward exp = self.experience(self.last_state, action, reward, next_state, done) self.memory.add(exp) self.num_exp += 1 # Roll over last state and action self.last_state = next_state # Learn, if enough samples are available in memory if PER_init: p_idx, weights, experiences = self.memory.sample() mean_abs_error, loss = self.learn(experiences, weights, p_idx) return mean_abs_error, loss def act(self, state, training=True): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.s_sz]) action = self.actor_local.model.predict(state) if training: noise = self.noise.sample(0.1) return list((action + noise)[0]) # add some noise for exploration else: action = self.actor_target.model.predict(state) return list(action[0]) def learn(self, exp, weights, p_idx): states = np.vstack([e.state for e in exp]) actions = np.array([e.action for e in exp ]).astype(np.float32).reshape(-1, self.a_sz) rewards = np.array([e.reward for e in exp]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in exp]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([e.next_state for e in exp]) weights = np.ndarray.flatten( np.array([w for w in weights]).astype(np.float32)) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) target_noise = self.noise.sample(0.2, self.batch, True) actions_next = np.clip(actions_next + target_noise, -self.a_max, self.a_max) Q_targets_1 = self.critic_target_1.model.predict_on_batch( [next_states, actions_next]).reshape(-1, 1) Q_targets_2 = self.critic_target_2.model.predict_on_batch( [next_states, actions_next]).reshape(-1, 1) Q_targets_next = np.minimum(Q_targets_1, Q_targets_2) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) # Compute Q targets for current states and train critic model (local) Q_local_1 = self.critic_local_1.model.predict_on_batch( 
[states, actions]) Q_local_2 = self.critic_local_2.model.predict_on_batch( [states, actions]) loss_1 = self.critic_local_1.model.train_on_batch([states, actions], Q_targets, weights) loss_2 = self.critic_local_2.model.train_on_batch([states, actions], Q_targets, weights) Q_error_1 = np.absolute(Q_targets - Q_local_1) Q_error_2 = np.absolute(Q_targets - Q_local_2) Q_error = np.mean([Q_error_1, Q_error_2], axis=0) self.memory.update_weights(p_idx, Q_error) # Train actor model (local) actor_actions = self.actor_local.model.predict_on_batch(states) action_grads = self.critic_local_1.get_gradients( [states, actor_actions, 0]) action_grads = np.reshape(action_grads, (-1, self.a_sz)) self.actor_local.train_fn([states, action_grads, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local_1.model, self.critic_target_1.model) self.soft_update(self.critic_local_2.model, self.critic_target_2.model) self.soft_update(self.actor_local.model, self.actor_target.model) return np.mean(Q_error), np.mean([loss_1, loss_2]) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) message = "Local and target model parameters must have the same size" assert len(local_weights) == len(target_weights), message new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
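# GaussianNoise is referenced by the TD3 agent above but not defined here. A
# sketch matching its two call patterns: sample(sigma) for exploration noise in
# act(), and sample(sigma, batch, clip) for the clipped noise added to target
# actions in learn(). The clip range of 0.5 follows the TD3 paper's suggestion
# and is an assumption, not taken from the original code.
import numpy as np


class GaussianNoise:
    """Zero-mean Gaussian action noise (interface sketch)."""

    def __init__(self, action_size, clip_range=0.5):
        self.action_size = action_size
        self.clip_range = clip_range

    def sample(self, sigma, batch_size=1, clip=False):
        """Draw noise of shape (batch_size, action_size); optionally clip it,
        as TD3 does for target-policy smoothing."""
        noise = sigma * np.random.standard_normal((batch_size, self.action_size))
        if clip:
            noise = np.clip(noise, -self.clip_range, self.clip_range)
        return noise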
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.3 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 1000000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.95 # discount factor self.tau = 0.002 # for soft update of target parameters def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, states): state = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
class DDPG():

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Critic
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())

        # Exploration noise
        self.exploration_mu = 0.1
        self.exploration_sigma = 0.1
        self.exploration_theta = 0.1
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Experience
        self.buffer_size = 100000000
        self.batch_size = 64
        self.buffer = ReplayBuffer(self.buffer_size)

        # Parameters
        self.gamma = 0.99
        self.tau = 0.001

    def act(self, states):
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self):
        # Sample
        states, actions, rewards, dones, next_states = self.buffer.sample(
            self.batch_size, self.action_size, self.state_size)

        # Predict
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        # Train Critic
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train Actor
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Update weights
        self.update_target_weights(self.critic_local.model, self.critic_target.model)
        self.update_target_weights(self.actor_local.model, self.actor_target.model)

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.buffer.add(self.last_state, action, reward, next_state, done)
        self.learn()
        self.last_state = next_state

    def update_target_weights(self, local_model, target_model):
        target_model.set_weights(
            self.tau * np.array(local_model.get_weights())
            + (1 - self.tau) * np.array(target_model.get_weights()))
class Agent():

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        self.exploration_mu = 0
        self.exploration_theta = 0.10   # same direction
        self.exploration_sigma = 0.001  # random noise
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.gamma = 0.90  # discount factor
        self.tau = 0.1     # for soft update of target parameters

        self.best_score = -np.inf
        self.score = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

        self.score += reward
        if done:
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
class DDPG(): '''reinforcement learning agent that learns using Deep Deterministic Policy Gradient''' def __init__(self, task): ''' Params ====== task (object) : environment ''' ''' Reference: Continuous Control With Deep Reinforcement Learning(2016) Playing CartPole through Asynchronous Advantage Actor Critic (A3C) with tf.keras ========= gamma : 0.99 tau : 0.001 buffer_size (ReplayBuffer) : 1e6 batch_size (ReplayBuffer) : 64 theta (Ornstein-Uhlenbeck process) : 0.15 sigma (Ornstein-Uhlenbeck process) : 0.2 ''' self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # actor (policy) model - use two copies of model for updating model and producing target self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # critic (value) model - use two copies of model for updating model and producing target self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # initialize target model parameters with local model parameters self.actor_target.model.set_weights( self.actor_local.model.get_weights()) self.critic_target.model.set_weights( self.critic_local.model.get_weights()) # noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # replay memory self.buffer_size = 1000000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters # reward history self.best_avg_score = -np.inf self.accumulated_reward = 0 self.count = 0 def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state self.accumulated_reward = 0 self.count = 0 return state def step(self, action, reward, next_state, done): # save experience and reward self.memory.add(self.last_state, action, reward, next_state, done) # learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # roll over last state and action self.last_state = next_state # accumulate reward self.accumulated_reward += reward self.count += 1 # record best average score if done: if float(self.accumulated_reward / self.count) > self.best_avg_score: self.best_avg_score = float(self.accumulated_reward / self.count) def act(self, state): '''returns actions for given state(s) as per current policy''' state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration # both action and self.noise.sample() are numpy object, + means sum up both, # instead of concatenation def learn(self, experiences): '''update policy and value parameters using given batch of experience tuples''' # convert experience tuples to separate arrays for each element(states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).\ astype(np.float32).reshape(-1,self.action_size) rewards = np.array([e.reward for e in experiences if e is not None]).\ astype(np.float32).reshape(-1,1) dones = np.array([e.done for e in experiences if e is not None]).\ astype(np.uint8).reshape(-1,1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # get predicted next-state actions and Q-values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # compute Q targets for current states and train critic model (local) # critic (value) loss: L = (1/N) * sum_t (r_t + gamma * Q'(s_t+1, mu'(s_t+1)) - Q(s_t, a_t))^2 Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # train actor model (local) # actor (policy) loss: L = -(1/N) * sum_t Q(s_t, mu(s_t)), driven by the critic's dQ/da gradients action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # The learning phase flag is a bool tensor (0 = test, 1 = train) # to be passed as input to any Keras function # that uses a different behavior at train time and test time. # soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): '''soft update model parameters''' local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights),\ 'Local and target model parameters must have the same size' new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
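The learn() methods above rely on two hooks that live inside the Actor and Critic classes but are not reproduced in these snippets: critic_local.get_action_gradients([states, actions, learning_phase]) and actor_local.train_fn([states, action_gradients, learning_phase]). The sketch below shows one way those hooks are typically wired up with standalone Keras on a TF1-style backend; the layer sizes and learning rates are illustrative assumptions, not values taken from any of the agents here.

from keras import layers, models, optimizers
from keras import backend as K

class Actor:
    def __init__(self, state_size, action_size, action_low, action_high):
        states = layers.Input(shape=(state_size,), name='states')
        net = layers.Dense(32, activation='relu')(states)
        raw = layers.Dense(action_size, activation='sigmoid')(net)
        # Scale the sigmoid output [0, 1] to the task's action range.
        actions = layers.Lambda(
            lambda x: x * (action_high - action_low) + action_low)(raw)
        self.model = models.Model(inputs=states, outputs=actions)

        # Loss: -Q(s, mu(s)); the dQ/da gradients are fed in from the critic.
        action_gradients = layers.Input(shape=(action_size,))
        loss = K.mean(-action_gradients * actions)
        updates_op = optimizers.Adam(lr=1e-4).get_updates(
            params=self.model.trainable_weights, loss=loss)
        # train_fn([states, action_gradients, 1]) -- trailing 1 = Keras learning phase "train"
        self.train_fn = K.function(
            inputs=[self.model.input, action_gradients, K.learning_phase()],
            outputs=[], updates=updates_op)

class Critic:
    def __init__(self, state_size, action_size):
        states = layers.Input(shape=(state_size,), name='states')
        actions = layers.Input(shape=(action_size,), name='actions')
        net = layers.Concatenate()([layers.Dense(32, activation='relu')(states),
                                    layers.Dense(32, activation='relu')(actions)])
        Q_values = layers.Dense(1, name='q_values')(layers.Activation('relu')(net))
        self.model = models.Model(inputs=[states, actions], outputs=Q_values)
        self.model.compile(optimizer=optimizers.Adam(lr=1e-3), loss='mse')

        # dQ/da, consumed by the actor update above.
        self.get_action_gradients = K.function(
            inputs=[*self.model.input, K.learning_phase()],
            outputs=K.gradients(Q_values, actions))

learn() then calls get_action_gradients with learning phase 0 (inference) to read dQ/da, and train_fn with learning phase 1 (training) to nudge the policy in the direction that increases Q.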
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.action_range = self.action_high - self.action_low self.score = 0 # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size, self.action_low, self.action_high) self.critic_target = Critic(self.state_size, self.action_size, self.action_low, self.action_high) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 # Taken from paper - changed from 10000 originally self.batch_size = 64 # Taken from paper - changed from 64 originally self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.005 # Tau for soft update of target parameters. Taken from paper - changed from 0.01 originally # Reset the episode when model set up self.reset_episode() def reset_episode(self): self.total_reward = 0.0 self.score = 0.0 self.count = 0 self.best_score = 0.0 self.noise.reset() state = self.task.reset() self.last_state = state return state def normalise_actions(self, actions): # Added square root to action to keep the actions closer to the middle of the rotor speed range normalised_actions = (np.sign(actions) * np.sqrt(np.abs(actions)) * self.action_range / 2) + self.action_low + self.action_range / 2 return normalised_actions def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > 10000: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state self.count += 1 self.total_reward += reward def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] action = self.normalise_actions(action) return list(action + self.noise.sample()) # add some noise for exploration def act_eval(self, state): """Returns actions without exploration for final evaluation.""" state = np.reshape(state, [-1, self.state_size]) # Scale [0, 1] output for each action dimension to proper range action = self.actor_local.model.predict(state)[0] action = self.normalise_actions(action) action = list(action) return list( action ) # No noise for exploration. Evaluate final performance of quadcopter def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) actions_next = self.normalise_actions(actions_next) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) # Update the score metric so it can be tracked through training self.score = self.total_reward / float( self.count) if self.count else 0.0 def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
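Every agent samples minibatches from a ReplayBuffer(buffer_size, batch_size), but the buffer class is also left out (and its storage method goes by add, add_experience, or memorize across the different snippets). A minimal sketch under those assumptions:

import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        field_names=["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences drop off automatically
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)

The len(self.memory) > self.batch_size guards in the step() methods simply wait until at least one full minibatch can be drawn (one of the variants above waits for 10,000 samples before it starts learning).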
class DDPG(): def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Create the actor instances for local and target self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Create the critic instances for local and target self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model weights from the local models self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise hyperparameters self.exploration_mu = 0 self.exploration_theta = 0.35 self.exploration_sigma = 0.1 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Set the replay memory self.buffer_size = 100000 self.batch_size = 32 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Update function hyperparameters self.gamma = 0.99 self.tau = 0.001 def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): self.memory.add(self.last_state, action, reward, next_state, done) # Save current experience # Once the memory holds more than one batch of samples, train if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over the last state self.last_state = next_state def act(self, state): # NOTE: Returns the action for the given state according to the current policy state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) def learn(self, experiences): states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q_targets Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train the local actor model action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # Soft-update of target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" #name: is a name to use to save the netural Network models #load: load data from existing models or cretae an entirly new model def __init__(self, task, name, loadfile=False): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) self.name = name if loadfile: self.actor_local.model.load_weights("./weights/" + name + "_actor.h5") self.critic_local.model.load_weights("./weights/" + name + "_critic.h5") # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 #0.3 #original 0.15 self.exploration_sigma = 0.3 #0.3 #original 0.3 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 1000000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, states): """Returns actions for given state(s) as per current policy.""" state = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) #rewards = np.interp(rewards, (rewards.min(), rewards.max()), (-1, +1)) #TESTING to scale rewards to a small number. 
dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def save_weights(self): self.actor_local.model.save_weights("./weights/" + self.name + "_actor.h5") self.critic_local.model.save_weights("./weights/" + self.name + "_critic.h5") #Notice that after training over a batch of experiences, we could just copy our newly learned weights (from the local model) to the target model. #However, individual batches can introduce a lot of variance into the process, so it's better to perform a soft update, controlled by the parameter tau. def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
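A short, hypothetical usage example for the weight persistence added in this variant. The task object is assumed to exist, the name "quadcopter_hover" is made up for illustration, and the ./weights/ directory is created explicitly since load_weights/save_weights expect it to be there.

import os

os.makedirs("./weights", exist_ok=True)

agent = DDPG(task, name="quadcopter_hover")   # fresh actor/critic networks
# ... run training episodes ...
agent.save_weights()                          # writes ./weights/quadcopter_hover_actor.h5 and _critic.h5

# Later: resume from disk. loadfile=True restores the local networks,
# and the target networks are initialized from them inside __init__.
agent = DDPG(task, name="quadcopter_hover", loadfile=True)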
class DDPG_Agent: """Reinforcement learning agent that learns through DDPG.""" def __init__(self, task): """Initialize DDPG Agent instance.""" self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_high = task.action_high self.action_low = task.action_low # Initializing local and target Actor Models # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_high, self.action_low) self.actor_target = Actor(self.state_size, self.action_size, self.action_high, self.action_low) # Initializing local and target Critic Models # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.actor_target.model.set_weights( self.actor_local.model.get_weights()) self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay Memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.01 # for soft update of target parameters # Additional Parameters self.best_score = -np.inf self.total_reward = 0.0 self.count = 0 self.score = 0 def reset_episode(self): """Reset episode to initial state.""" self.total_reward = 0.0 self.count = 0 self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): """Take a step.""" self.total_reward += reward self.count += 1 # Save experience/reward self.memory.memorize(self.last_state, action, reward, next_state, done) # Learn if enough samples are available in memory. 
if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over the last state (at the end of step(), outside the if-block) self.last_state = next_state def act(self, state): """Returns actions for state(s) according to the current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] # Add some noise to action for exploration and return return list(action + self.noise.sample()) def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" self.score = self.total_reward / \ float(self.count) if self.count else 0.0 if self.score > self.best_score: self.best_score = self.score states = np.vstack([e.state for e in experiences if e is not None]) actions = np.vstack([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.vstack([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) next_actions = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, next_actions]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) # [states, actions, 0]: the trailing 0 is the Keras learning-phase flag (0 = test) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights), "Local and target model parameters must \ have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
class DDPG(): """Reinforcement learning agent that learns using DDPG.""" def __init__(self, task, train=True): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Set the learning rate suggested by paper: https://pdfs.semanticscholar.org/71f2/03de1a53deae81a7707143f0ed564661e279.pdf self.actor_learning_rate = 0.001 self.actor_decay = 0.0 self.critic_learning_rate = 0.001 self.critic_decay = 0.0 # Actor Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate, self.actor_decay) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate, self.actor_decay) # Critic Model self.critic_local = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay) self.critic_target = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay) # initialize targets model parameters with local model parameters self.critic_target.model.set_weights(self.critic_local.model.get_weights()) self.actor_target.model.set_weights(self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 # self.exploration_theta = 0.15 # self.exploration_sigma = 0.2 self.exploration_theta = 0.01 self.exploration_sigma = 0.02 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) self.best_w = None self.best_score = -np.inf # self.noise_scale = 0.7 self.score = 0 # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.01 # for soft update of target parameters # Indicate if we want to learn (or use to predict without learn) self.set_train(train) def reset_episode(self): self.total_reward = 0.0 self.score = 0 self.step_count = 0 self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): self.total_reward += reward self.step_count += 1 # Save experience /reward self.memory.add(self.last_state, action, reward, next_state, done) self.score = self.total_reward / float(self.step_count) if self.step_count else 0.0 # Update the noise factor depending on the new score value if self.score >= self.best_score: self.best_score = self.score # Learn, if enough samples are available in memory if self.train and len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, done) # Roll over last state and action self.last_state= next_state def act(self, state): """Returns actions for given state(s) as per current policy""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add more noise for exploration def learn(self, experiences, done): """Update policy and value parameters using give batch experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1) # keep rewards as float; casting to uint8 would corrupt negative or fractional rewards dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_state = np.vstack([e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) next_action = self.actor_target.model.predict_on_batch(next_state) Q_targets_next = self.critic_target.model.predict_on_batch([next_state, next_action]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights) def set_train(self, train): self.train = train
class Christophers_Agent(): def __init__(self, task): # Task (environment) information self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.action_range = self.action_high - self.action_low self.w = np.random.normal( size=( self.state_size, self.action_size ), # weights for simple linear policy: state_space x action_space scale=(self.action_range / (2 * self.state_size) )) # start producing actions in a decent range self.actor = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.critic = Critic(self.state_size, self.action_size) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.critic_target = Critic(self.state_size, self.action_size) self.gamma = 0.95 self.tau = 0.001 self.best_w = None self.best_score = -np.inf self.exploration_mu = 0.5 self.exploration_theta = 0.2 self.exploration_sigma = 0.4 self.noise = Noise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) self.buffer_size = 100000 self.batch_size = 32 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) self.best_score = -np.inf self.num_steps = 0 # Episode variables self.reset_episode() def reset_episode(self): if self.get_score() > self.best_score: self.best_score = self.get_score() self.total_reward = 0.0 self.num_steps = 0 self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): self.total_reward += reward self.num_steps += 1 self.memory.add(self.last_state, action, reward, next_state, done) if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) self.last_state = next_state def act(self, state): state = np.reshape(state, [-1, self.state_size]) action = self.actor.model.predict(state)[0] action = list(action + self.noise.sample()) # add some noise for exploration return action def get_score(self): return -np.inf if self.num_steps == 0 else self.total_reward / self.num_steps def learn(self, experiences): states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) done = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) Q_targets = rewards + self.gamma * Q_targets_next * (1 - done) self.critic.model.train_on_batch(x=[states, actions], y=Q_targets) action_gradients = np.reshape( self.critic.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor.train_fn([states, action_gradients, 1]) self.soft_update(self.critic.model, self.critic_target.model) self.soft_update(self.actor.model, self.actor_target.model) def soft_update(self, local_model, target_model): local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights) new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
class Agent(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights(self.critic_local.model.get_weights()) self.actor_target.model.set_weights(self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 # same direction self.exploration_sigma = 0.001 # random noise #self.exploration_mu = 0 #self.exploration_theta = 0.15 #self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.1 # for soft update of target parameters # Compute the ongoing top score self.top_score = -np.inf self.score = 0 def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state self.score = 0 return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state # stats self.score += reward if done: if self.score > self.top_score: self.top_score = self.score def act(self, states): """Returns actions for given state(s) as per current policy.""" state = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
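For reference, a sketch of the episode loop these agents are written against. The task.step(action) return signature and the episode budget are assumptions; the agent calls (reset_episode, act, step) and the score/top_score attributes are exactly those defined in the Agent class above.

agent = Agent(task)
num_episodes = 500  # assumed training budget

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)                     # noisy action from the local actor
        next_state, reward, done = task.step(action)  # assumed task interface
        agent.step(action, reward, next_state, done)  # store experience, learn, roll over state
        state = next_state
    print("Episode {:4d}  score: {:8.3f}  best: {:8.3f}".format(
        i_episode, agent.score, agent.top_score))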
class RLA(): """Reinforcement learning agent""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high #actor model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) #Critic model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) #Initialize target model params with local params self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) #Initialize noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) #Replay memory Initialization self.buffer_size, self.batch_size = 2000000, 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) #Initialize algorithm parameters self.gamma, self.tau = 0.95, 0.001 #Initialize scores self.score, self.best_score = -np.inf, -np.inf def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state self.score = 0 return state def step(self, action, reward, next_state, done): self.memory.add(self.last_state, action, reward, next_state, done) #Learn from samples in memory once there are more of them than the batch size if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) #Preserve state as last_state self.last_state = next_state #Update score with reward from this step self.score += reward if done: #Preserve best score if self.score > self.best_score: self.best_score = self.score def act(self, state): state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) def learn(self, experiences): #Convert experiences into separate arrays states = np.vstack([exp.state for exp in experiences if exp is not None]) actions = np.array([exp.action for exp in experiences if exp is not None]).astype(np.float32).reshape(-1, self.action_size) rewards = np.array([exp.reward for exp in experiences if exp is not None]).astype(np.float32).reshape(-1, 1) dones = np.array([exp.done for exp in experiences if exp is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([exp.next_state for exp in experiences if exp is not None]) #predict next-state actions and Q values from the target models...
actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states,actions], y=Q_targets) #Train local actor model action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) #Update target models self.update(self.critic_local.model, self.critic_target.model) self.update(self.actor_local.model, self.actor_target.model) def update(self, local_model, target_model): """Update model parameters""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)