class DDPG():
    """Reinforcement Learning agent that learns using DDPG.

    The agent owns a local and a target copy of an actor and of a critic,
    an exploration-noise process and a replay buffer.  ``Actor``, ``Critic``,
    ``OUNoise`` and ``ReplayBuffer`` are defined elsewhere in the project.

    Args:
        task: Environment wrapper exposing ``state_size``, ``action_size``,
            ``action_low``, ``action_high`` and ``reset()``.
        explore_mu: Passed to ``OUNoise`` as its second positional argument.
        explore_theta: Passed to ``OUNoise`` as its third positional argument.
        explore_sigma: Passed to ``OUNoise`` as its fourth positional
            argument.
        gamma: Discount factor used when building the TD targets.
        tau: Interpolation rate for the soft update of the target networks.
    """

    def __init__(self, task, explore_mu=0, explore_theta=0.15,
                 explore_sigma=0.2, gamma=0.99, tau=0.01):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = explore_mu
        self.exploration_theta = explore_theta
        self.exploration_sigma = explore_sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        # Soft-update rate.  This must come from ``tau``: assigning ``gamma``
        # here would make the target networks track the local ones at 0.99.
        self.tau = tau

    def reset_episode(self):
        """Reset the noise process and the task; return the initial state."""
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Store one transition and learn once enough samples are buffered.

        The transition is recorded as starting from ``self.last_state``, so
        ``reset_episode`` must have been called beforehand.
        """
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters from a batch of experiences.

        Args:
            experiences: Iterable of objects with ``state``, ``action``,
                ``reward``, ``next_state`` and ``done`` attributes; ``None``
                entries are skipped.
        """
        batch = [e for e in experiences if e is not None]

        # Convert experience tuples to separate arrays for each element
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # Get predicted next-state actions and Q values from target models:
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local).
        # (1 - dones) drops the bootstrap term for terminal transitions.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local).  The trailing 0 / 1 is presumably the
        # Keras learning-phase flag (0 = test, 1 = train) -- TODO confirm
        # against the Actor / Critic definitions.
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Sets every target weight array to
        ``tau * local + (1 - tau) * target``.  The blend is done layer by
        layer because the per-layer arrays have different shapes and cannot
        be packed into a single ``np.array``.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = [
            self.tau * np.asarray(local) + (1 - self.tau) * np.asarray(target)
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)
class agentDDPG():
    """DDPG agent with local/target actor and critic networks.

    ``actor``, ``critic``, ``OUNoise`` and ``memoryBuffer`` are defined
    elsewhere in the project.

    Args:
        task: Environment wrapper exposing ``state_size``, ``action_size``,
            ``action_high``, ``action_low`` and ``reset()``.
    """

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.rotor_high = task.action_high
        self.rotor_low = task.action_low

        # The local networks are trained on every learning step; the target
        # networks follow them slowly through soft updates.
        self.actor_local = actor(self.state_size, self.action_size,
                                 h1=64, h2=32, lr=0.001,
                                 r_h=self.rotor_high, r_l=self.rotor_low)
        self.actor_target = actor(self.state_size, self.action_size,
                                  h1=64, h2=32, lr=0.001,
                                  r_h=self.rotor_high, r_l=self.rotor_low)
        self.critic_local = critic(self.state_size, self.action_size,
                                   h1=32, h2=24, lr=0.001)
        self.critic_target = critic(self.state_size, self.action_size,
                                    h1=32, h2=24, lr=0.001)

        # Start the target networks from the same weights as the local ones
        self.actor_target.actorModel.set_weights(
            self.actor_local.actorModel.get_weights())
        self.critic_target.criticModel.set_weights(
            self.critic_local.criticModel.get_weights())

        # Exploration noise.
        # NOTE(review): the values are passed positionally as
        # (size, mu, sigma, theta); confirm this matches the parameter order
        # of OUNoise, whose definition is not visible here.
        self.mu = 0
        self.sigma = 0.15
        self.theta = 0.2
        self.OUNoise = OUNoise(self.action_size, self.mu, self.sigma,
                               self.theta)

        # Replay memory
        self.bufferSize = 100000
        self.batch_size = 64
        self.memory = memoryBuffer(self.bufferSize, self.batch_size)

        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # soft-update rate for the target networks

    def reset_episode(self):
        """Reset the noise process and the task; return the initial state."""
        self.OUNoise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn once enough samples are buffered."""
        self.memory.add(state, action, reward, next_state, done)
        if self.memory.len() > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        self.last_state = next_state

    def learn(self, experience):
        """Run one actor/critic update from a batch of experiences.

        Args:
            experience: Iterable of objects with ``state``, ``action``,
                ``reward``, ``next_state`` and ``done`` attributes; ``None``
                entries are skipped.
        """
        batch = [e for e in experience if e is not None]

        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])
        done = np.array([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)

        # Bootstrap the TD target from the *target* networks.  Using the
        # local networks here would make the critic chase its own freshly
        # updated estimates and leave the target networks without any role.
        actions_next = self.actor_target.actorModel.predict_on_batch(
            next_states)
        Q_targets_next = self.critic_target.criticModel.predict_on_batch(
            [next_states, actions_next])

        # Train the critic on the TD target; (1 - done) removes the
        # bootstrap term for terminal transitions.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - done)
        self.critic_local.criticModel.train_on_batch(x=[states, actions],
                                                     y=Q_targets)

        # Train the actor with the action gradients supplied by the critic.
        # The trailing 0 / 1 is presumably the Keras learning-phase flag
        # (0 = test, 1 = train) -- TODO confirm against actor / critic.
        action_gradients = self.critic_local.get_action_gradients(
            inputs=[states, actions, 0])
        action_gradients = np.reshape(action_gradients,
                                      (-1, self.action_size))
        self.actor_local.train_actor(inputs=[states, action_gradients, 1])

        self.soft_update(self.actor_local.actorModel,
                         self.actor_target.actorModel)
        self.soft_update(self.critic_local.criticModel,
                         self.critic_target.criticModel)

    def soft_update(self, local_model, target_model):
        """Blend local weights into the target model.

        Sets every target weight array to
        ``tau * local + (1 - tau) * target``.  The blend is done layer by
        layer because the per-layer arrays have different shapes and cannot
        be packed into a single ``np.array``.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()
        assert len(local_weights) == len(target_weights)

        new_weights = [
            np.asarray(local) * self.tau + np.asarray(target) * (1 - self.tau)
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)

    def act(self, state):
        """Return the local actor's action for ``state`` plus noise."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.actorModel.predict(state)[0]
        return list(action + self.OUNoise.sample())
class DDPG():
    """Reinforcement Learning agent that learns using DDPG.

    Besides the usual actor/critic pairs, noise process and replay buffer,
    this variant tracks a per-episode score and maintains a randomly
    perturbed weight matrix ``w`` in ``score_update``.  ``w`` is not read by
    ``act`` or ``learn`` in this class.

    ``Actor``, ``Critic``, ``OUNoise`` and ``ReplayBuffer`` are defined
    elsewhere in the project.

    Args:
        task: Environment wrapper exposing ``state_size``, ``action_size``,
            ``action_low``, ``action_high`` and ``reset()``.
    """

    def __init__(self, task):
        print('loaded DDPG ')
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1
        # Weights for a simple linear policy (state_space x action_space),
        # scaled so that initial actions fall in a decent range.
        self.w = np.random.normal(
            size=(self.state_size, self.action_size),
            scale=(self.action_range / (2 * self.state_size)))

        # Episode variables.  The task itself is not reset here (callers do
        # that through reset_episode), but the counters are initialised so
        # that the attributes always exist.
        self.total_reward = 0.0
        self.count = 0
        self.last_state = None

    def reset_episode(self):
        """Reset episode statistics, noise and task; return initial state."""
        self.total_reward = 0.0
        self.count = 0
        self.noise_scale = 0.1
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Store one transition, learn if possible, and track the score.

        The transition is recorded as starting from ``self.last_state``.
        When ``done`` is true, ``score_update`` is called.
        """
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

        if done:
            self.score_update()

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters from a batch of experiences.

        Args:
            experiences: Iterable of objects with ``state``, ``action``,
                ``reward``, ``next_state`` and ``done`` attributes; ``None``
                entries are skipped.
        """
        batch = [e for e in experiences if e is not None]

        # Convert experience tuples to separate arrays for each element
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # Get predicted next-state actions and Q values from target models:
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local).
        # (1 - dones) drops the bootstrap term for terminal transitions.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local).  The trailing 0 / 1 is presumably the
        # Keras learning-phase flag (0 = test, 1 = train) -- TODO confirm
        # against the Actor / Critic definitions.
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Sets every target weight array to
        ``tau * local + (1 - tau) * target``.  The blend is done layer by
        layer because the per-layer arrays have different shapes and cannot
        be packed into a single ``np.array``.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = [
            self.tau * np.asarray(local) + (1 - self.tau) * np.asarray(target)
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)

    def score_update(self):
        """Random policy search step driven by the episode's mean reward.

        The score is ``total_reward / count`` (0.0 when ``count`` is zero).
        A new best score stores the current ``w`` and halves the noise scale
        (floor 0.01); otherwise ``w`` is restored from ``best_w`` and the
        noise scale is doubled (cap 3.2).  In both cases ``w`` is then
        perturbed with Gaussian noise of the resulting scale.
        """
        self.score = (self.total_reward / float(self.count)
                      if self.count else 0.0)
        if self.score > self.best_score:
            self.best_score = self.score
            self.best_w = self.w
            self.noise_scale = max(0.5 * self.noise_scale, 0.01)
        else:
            # best_w is still None if no score has ever beaten the initial
            # -inf (e.g. a NaN score); keep the current weights in that case.
            if self.best_w is not None:
                self.w = self.best_w
            self.noise_scale = min(2.0 * self.noise_scale, 3.2)
        # Equal noise in all directions
        self.w = self.w + self.noise_scale * np.random.normal(
            size=self.w.shape)
class DDPG():
    """Reinforcement learning agent using Deep Deterministic Policy Gradient.

    ``Actor``, ``Critic``, ``OUNoise`` and ``ReplayBuffer`` are defined
    elsewhere in the project.
    """

    def __init__(self, task):
        """Build the networks, noise process and replay memory.

        Params
        ======
            task (object) : environment; must expose ``state_size``,
                ``action_size``, ``action_low``, ``action_high`` and
                ``reset()``

        Reference
        =========
            Continuous Control With Deep Reinforcement Learning (2016)
            Playing CartPole through Asynchronous Advantage Actor Critic
            (A3C) with tf.keras

        Hyper-parameters
        ================
            gamma : 0.99
            tau : 0.001
            buffer_size (ReplayBuffer) : 1e6
            batch_size (ReplayBuffer) : 64
            theta (Ornstein-Uhlenbeck process) : 0.15
            sigma (Ornstein-Uhlenbeck process) : 0.2
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # actor (policy) model - two copies: one is trained, one produces
        # the targets
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # critic (value) model - two copies: one is trained, one produces
        # the targets
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # reward history
        self.best_avg_score = -np.inf
        self.accumulated_reward = 0
        self.count = 0

    def reset_episode(self):
        """Reset noise, task and episode statistics; return initial state."""
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.accumulated_reward = 0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
        """Store one transition, learn if possible, and track the score.

        The transition is recorded as starting from ``self.last_state``, so
        ``reset_episode`` must have been called beforehand.  When ``done``
        is true, the episode's mean reward per step replaces
        ``best_avg_score`` if it is higher.
        """
        # save experience and reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # roll over last state
        self.last_state = next_state

        # accumulate reward
        self.accumulated_reward += reward
        self.count += 1

        # record best average score
        if done:
            avg_score = float(self.accumulated_reward / self.count)
            if avg_score > self.best_avg_score:
                self.best_avg_score = avg_score

    def act(self, state):
        """Return actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # add some noise for exploration; ``+`` is applied between the
        # predicted action and the noise sample before conversion to a list
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters from a batch of experiences.

        Params
        ======
            experiences : iterable of objects with ``state``, ``action``,
                ``reward``, ``next_state`` and ``done`` attributes; ``None``
                entries are skipped
        """
        batch = [e for e in experiences if e is not None]

        # convert experience tuples to separate arrays for each element
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # get predicted next-state actions and Q-values from target models:
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # compute Q targets for current states and train critic model (local);
        # (1 - dones) drops the bootstrap term for terminal transitions
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # train actor model (local) with the critic's action gradients.
        # The learning phase flag (0 = test, 1 = train) is passed as the
        # last input to Keras functions that behave differently at train
        # time and test time.
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Sets every target weight array to
        ``tau * local + (1 - tau) * target``.  The blend is done layer by
        layer because the per-layer arrays have different shapes and cannot
        be packed into a single ``np.array``.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), \
            'Local and target model parameters must have the same size'

        new_weights = [
            self.tau * np.asarray(local) + (1 - self.tau) * np.asarray(target)
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)
class DDPG_Agent:
    """DDPG agent with configurable noise, memory and network settings.

    ``Actor``, ``Critic``, ``OUNoise`` and ``ExperienceReplayBuffer`` are
    defined elsewhere in the project.

    Args:
        task: Environment wrapper exposing ``action_low``, ``action_high``,
            ``state_size``, ``action_size`` and ``reset()``.
        noise: Indexable; items 0, 1, 2 are stored as ``mean``, ``sigma``
            and ``theta``.
        memory: Indexable; item 0 is the buffer capacity, item 1 the batch
            size.
        rl_param: Indexable; item 0 is the discount factor ``gamma``, item 1
            the soft-update rate ``t``.
        nn_hidden: Indexable; item 0 is passed as ``hidden_units`` to both
            actors, item 1 to both critics.
        actor_lr: ``learning_rate`` passed to both actors.
        critic_lr: ``learning_rate`` passed to both critics.
        q_lambda: Passed through to every actor and critic.
    """

    def __init__(self, task, noise, memory, rl_param, nn_hidden, actor_lr,
                 critic_lr, q_lambda):
        # Adapted for this gym
        self.task = task
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.state_space = task.state_size
        self.action_space = task.action_size
        self.q_lambda = q_lambda

        # Instantiate Actors and Critics.
        self.actor = Actor(self.state_space, self.action_space,
                           self.action_low, self.action_high,
                           hidden_units=nn_hidden[0],
                           learning_rate=actor_lr, q_lambda=q_lambda)
        self.actor_target = Actor(self.state_space, self.action_space,
                                  self.action_low, self.action_high,
                                  hidden_units=nn_hidden[0],
                                  learning_rate=actor_lr, q_lambda=q_lambda)
        self.critic = Critic(self.state_space, self.action_space,
                             hidden_units=nn_hidden[1],
                             learning_rate=critic_lr, q_lambda=q_lambda)
        self.critic_target = Critic(self.state_space, self.action_space,
                                    hidden_units=nn_hidden[1],
                                    learning_rate=critic_lr,
                                    q_lambda=q_lambda)

        # Set same weights in target.
        self.actor_target.model.set_weights(self.actor.model.get_weights())
        self.critic_target.model.set_weights(self.critic.model.get_weights())

        # Noise for exploration.
        # NOTE(review): values are passed positionally as
        # (size, mean, sigma, theta); confirm against the OUNoise signature.
        self.mean = noise[0]
        self.sigma = noise[1]
        self.theta = noise[2]
        self.ounoise = OUNoise(self.action_space, self.mean, self.sigma,
                               self.theta)

        # Experience Replay memory.
        self.capacity = memory[0]
        self.batch_size = memory[1]
        self.er_buffer = ExperienceReplayBuffer(capacity=self.capacity,
                                                batch_size=self.batch_size)

        # RL parameters.
        self.gamma = rl_param[0]
        self.t = rl_param[1]

        # Keeping track of learning.
        self.learning_rewards = list()
        self.total_reward = None  # None until the first restart_task()
        self.best_reward = -np.inf
        self.losses = list()

    def restart_task(self):
        """Record the finished episode, then reset task and noise.

        The previous episode's total reward (if any) is appended to
        ``learning_rewards`` and may replace ``best_reward``.

        Returns:
            The state returned by ``task.reset()``.
        """
        if self.total_reward is not None:
            self.learning_rewards.append(self.total_reward)
            if self.total_reward > self.best_reward:
                self.best_reward = self.total_reward
        self.total_reward = 0
        state = self.task.reset()
        self.state = state
        self.ounoise.restart()
        return state

    def act(self, state, epsilon):
        """Return a noisy action clipped to the task's action bounds.

        Args:
            state: Current state; reshaped to ``(-1, state_space)``.
            epsilon: Multiplier applied to the sampled noise.

        Returns:
            Array of shape ``(-1, action_space)`` clipped to
            ``[action_low, action_high]``.
        """
        self.action_wo_noise = self.actor.model.predict(
            np.reshape(state, newshape=(-1, self.state_space)))
        self.step_noise = self.ounoise.sample() * epsilon
        # NOTE(review): step_noise[0] assumes the noise sample carries a
        # leading batch axis; with a 1-D sample this would add the same
        # scalar to every action dimension -- confirm against OUNoise.
        action = np.array(self.action_wo_noise[0]
                          + self.step_noise[0]).reshape(-1, self.action_space)
        action_clipped = np.clip(a=action, a_min=self.action_low,
                                 a_max=self.action_high)
        return action_clipped

    def store_learn(self, state, action, reward, done, next_state):
        """Save an experience into memory and update actor-critic weights."""
        # Store experience into exp replay memory.
        self.er_buffer.add_env_reaction(
            (state, action, reward, done, next_state))

        # Learn if agent has enough experiences.
        if len(self.er_buffer.mem) > self.batch_size:
            self.learn()

        # total_reward is None until restart_task() has run; start from zero
        # in that case rather than failing on ``None += reward``.
        if self.total_reward is None:
            self.total_reward = 0
        self.total_reward += reward

        # Update to the current state of the environment.
        self.state = next_state

    @staticmethod
    def _blend(current_weights, target_weights, rate):
        """Return ``target * (1 - rate) + rate * current`` for each layer."""
        return [
            np.asarray(target) * (1 - rate) + rate * np.asarray(current)
            for current, target in zip(current_weights, target_weights)
        ]

    def soft_update(self):
        """Move both target networks towards their trained counterparts.

        The blend is done layer by layer because the per-layer weight arrays
        have different shapes and cannot be packed into a single
        ``np.array``.
        """
        self.actor_target.model.set_weights(self._blend(
            self.actor.model.get_weights(),
            self.actor_target.model.get_weights(), self.t))
        self.critic_target.model.set_weights(self._blend(
            self.critic.model.get_weights(),
            self.critic_target.model.get_weights(), self.t))

    def learn(self):
        """Run one update of actor, critic and both target networks.

        Samples a batch from the replay buffer, trains the critic on the TD
        target (recording the loss in ``self.losses``), trains the actor with
        the critic's action gradients and soft-updates the targets.
        """
        states, actions, rewards, dones, next_states = \
            self.er_buffer.sample_batch()
        states = np.vstack(states)
        actions = np.array(actions, dtype=np.float32).reshape(
            -1, self.action_space)
        rewards = np.array(rewards, dtype=np.float32).reshape(-1, 1)
        dones = np.array(dones, dtype=np.uint8).reshape(-1, 1)
        next_states = np.vstack(next_states)

        # Get next actions from the deterministic target policy.
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        next_q_values = self.critic_target.model.predict_on_batch(
            [next_states, next_actions])

        # (1 - dones) handles the done case: no bootstrap on terminal steps.
        targets = rewards + self.gamma * next_q_values * (1 - dones)
        loss = self.critic.model.train_on_batch(x=[states, actions],
                                                y=targets)
        self.losses.append(loss)

        # Action gradients are taken from the critic after its update above.
        # The trailing 0 is presumably the Keras learning-phase flag
        # (0 = test): only the gradient is wanted, no weight update.
        action_gradients = self.critic.get_action_gradients(
            [states, actions, 0])
        action_gradients = np.reshape(action_gradients[0],
                                      (-1, self.action_space))

        # Train the actor (flag 1, presumably train mode).
        self.actor.train_fn([states, action_gradients, 1])

        # Do soft update on weights.
        self.soft_update()
class DDPG():
    """DDPG agent with local/target actor and critic networks.

    ``Actor``, ``Critic``, ``OUNoise`` and ``ReplayBuffer`` are defined
    elsewhere in the project.

    Args:
        task: Environment wrapper exposing ``state_size``, ``action_size``,
            ``action_low``, ``action_high`` and ``reset()``.
    """

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Policy Model & Value Model
        self.actorLocal = Actor(self.state_size, self.action_size,
                                self.action_low, self.action_high)
        self.criticLocal = Critic(self.state_size, self.action_size)

        self.actorTarget = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.criticTarget = Critic(self.state_size, self.action_size)

        # Initializing target model with local model params
        self.criticTarget.model.set_weights(
            self.criticLocal.model.get_weights())
        self.actorTarget.model.set_weights(
            self.actorLocal.model.get_weights())

        # Replay Buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.noise = OUNoise(self.action_size, 0, 0.1, 0.25)
        self.discountGamma = 0.9  # discount factor
        self.tau = 0.01  # soft-update rate for the target networks

    def reset_episode(self):
        """Reset the noise process and the task; return the initial state."""
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Store one transition and learn once enough samples are buffered.

        The transition is recorded as starting from ``self.last_state``, so
        ``reset_episode`` must have been called beforehand.
        """
        self.memory.add(self.last_state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size:
            exp = self.memory.sample()
            self.learn(exp)

        self.last_state = next_state

    def act(self, state):
        """Return the local actor's action for ``state`` plus noise."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actorLocal.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, exp):
        """Run one actor/critic update from a batch of experiences.

        The per-field values are stacked vertically into batch arrays, see
        https://docs.scipy.org/doc/numpy/reference/generated/numpy.vstack.html

        Args:
            exp: Iterable of objects with ``state``, ``action``, ``reward``,
                ``next_state`` and ``done`` attributes; ``None`` entries are
                skipped.
        """
        batch = [ex for ex in exp if ex is not None]

        state = np.vstack([ex.state for ex in batch])
        action = np.array([ex.action for ex in batch]).reshape(
            -1, self.action_size)
        reward = np.array([ex.reward for ex in batch]).reshape(-1, 1)
        done = np.array([ex.done for ex in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([ex.next_state for ex in batch])

        # TD target from the target networks; (1 - done) drops the bootstrap
        # term for terminal transitions.
        actions_next = self.actorTarget.model.predict_on_batch(next_states)
        QTargets_next = self.criticTarget.model.predict_on_batch(
            [next_states, actions_next])
        Q_targets = reward + self.discountGamma * QTargets_next * (1 - done)
        self.criticLocal.model.train_on_batch(x=[state, action], y=Q_targets)

        # Train the actor with the critic's action gradients.  The trailing
        # 0 / 1 is presumably the Keras learning-phase flag (0 = test,
        # 1 = train) -- TODO confirm against the Actor / Critic definitions.
        actionGradients = np.reshape(
            self.criticLocal.get_action_gradients([state, action, 0]),
            (-1, self.action_size))
        self.actorLocal.train_fn([state, actionGradients, 1])

        # Soft-update target models
        self._soft_update(self.criticLocal.model, self.criticTarget.model)
        self._soft_update(self.actorLocal.model, self.actorTarget.model)

    def _soft_update(self, local_model, target_model):
        """Set each target layer to ``tau * local + (1 - tau) * target``.

        The blend is done layer by layer because the per-layer weight arrays
        have different shapes and cannot be packed into a single
        ``np.array``.
        """
        target_model.set_weights([
            self.tau * np.asarray(local) + (1 - self.tau) * np.asarray(target)
            for local, target in zip(local_model.get_weights(),
                                     target_model.get_weights())
        ])