class Agent(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights(self.critic_local.model.get_weights()) self.actor_target.model.set_weights(self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 # same direction self.exploration_sigma = 0.001 # random noise #self.exploration_mu = 0 #self.exploration_theta = 0.15 #self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.1 # for soft update of target parameters # Compute the ongoing top score self.top_score = -np.inf self.score = 0 def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state self.score = 0 return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state # stats self.score += reward if done: if self.score > self.top_score: self.top_score = self.score def act(self, states): """Returns actions for given state(s) as per current policy.""" state = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
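# --- Hedged reference sketch (not part of the original file) ---
# Every Keras-based agent in this file calls self.actor_local.train_fn([states,
# action_gradients, 1]); the Actor class itself is defined elsewhere. The sketch
# below is a minimal, standard DDPG actor exposing that interface, assuming
# Keras on the TensorFlow 1.x backend. Layer sizes and the Adam learning rate
# are illustrative assumptions, not values taken from this project.
from keras import layers, models, optimizers
from keras import backend as K


class ActorSketch:
    """Deterministic policy network mapping states to bounded actions."""

    def __init__(self, state_size, action_size, action_low, action_high, learning_rate=1e-4):
        action_range = action_high - action_low

        states = layers.Input(shape=(state_size,), name='states')
        net = layers.Dense(400, activation='relu')(states)
        net = layers.Dense(300, activation='relu')(net)
        raw_actions = layers.Dense(action_size, activation='sigmoid', name='raw_actions')(net)
        # Rescale the sigmoid output from [0, 1] to [action_low, action_high]
        actions = layers.Lambda(lambda x: x * action_range + action_low, name='actions')(raw_actions)
        self.model = models.Model(inputs=states, outputs=actions)

        # Policy-gradient loss: follow dQ/da supplied by the critic at train time
        action_gradients = layers.Input(shape=(action_size,))
        loss = K.mean(-action_gradients * actions)
        optimizer = optimizers.Adam(lr=learning_rate)
        updates_op = optimizer.get_updates(params=self.model.trainable_weights, loss=loss)
        # Matches the call signature used above: train_fn([states, action_gradients, learning_phase])
        self.train_fn = K.function(
            inputs=[self.model.input, action_gradients, K.learning_phase()],
            outputs=[],
            updates=updates_op)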
class Agent(): def __init__(self, task, sess, stats): self.sess = sess self.task = task self.stats = stats tau = 0.01 learning_rate = 2e-4 self.critic_local = QNetwork(sess, task, stats, name='critic_local', hidden_units=64, dropout_rate=0.2) self.critic_target = QNetwork(sess, task, stats, name='critic_target', hidden_units=64, dropout_rate=0.2) self.actor_local = Policy(sess, task, stats, name='actor_local', hidden_units=32, dropout_rate=0.2) self.actor_target = Policy(sess, task, stats, name='actor_target', hidden_units=32, dropout_rate=0.2) soft_copy_critic_ops = self._create_soft_copy_op('critic_local', 'critic_target', tau=tau) soft_copy_actor_ops = self._create_soft_copy_op('actor_local', 'actor_target', tau=tau) self._soft_copy_ops = [] self._soft_copy_ops.extend(soft_copy_critic_ops) self._soft_copy_ops.extend(soft_copy_actor_ops) self.gamma = 0.99 # reward discount rate # Exploration noise process exploration_mu = 0 exploration_theta = 0.15 exploration_sigma = 0.15 self.noise = OUNoise(task.action_size, exploration_mu, exploration_theta, exploration_sigma) # Replay memory self.batch_size = 256 self.memory = ReplayBuffer(buffer_size=10000, decay_steps=1000) self.sess.run(tf.global_variables_initializer()) def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state self.memory.decay_a() return state def step(self, action, reward, next_state, done): # Save experience self._save_experience(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory self.learn() # Roll over last state and action self.last_state = next_state def act(self, state, explore=False): """Returns actions for given state(s) as per current policy.""" actor = self.actor_local if explore else self.actor_target action = actor.act([state], explore)[0] assert not np.any(np.isnan(action)) if explore: action = action + self.noise.sample() action = np.maximum(action, self.task.action_low) action = np.minimum(action, self.task.action_high) assert not np.any(np.isnan(action)) assert np.all(action >= self.task.action_low ), "expected less than {:7.3f}, but was {}".format( task.action_low, action) assert np.all(action <= self.task.action_high) return action def learn(self): """Update policy and value parameters using given batch of experience tuples.""" if len(self.memory) < self.batch_size: return # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
experiences, experience_indexes = self.memory.sample(self.batch_size) action_size = self.task.action_size states = np.vstack([e.state for e in experiences]) actions = np.array([e.action for e in experiences ]).astype(np.float32).reshape(-1, action_size) rewards = np.array([e.reward for e in experiences ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences ]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([e.next_state for e in experiences]) # Get predicted next-state actions, Q and V values actions_next = self.actor_target.act(next_states) Q_targets_next, V_targets_next = self.critic_target.get_q_and_v( next_states, actions_next) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) V_targets = rewards + self.gamma * V_targets_next * (1 - dones) td_errs = self.critic_local.learn(states, actions, Q_targets, V_targets) self.memory.update_td_err(experience_indexes, td_errs) self.memory.scrape_stats(self.stats) # Train actor model actions = self.actor_target.act(states) action_gradients = self.critic_target.get_action_gradients( states, actions) self.actor_local.learn(states, action_gradients) self._soft_copy() def _save_experience(self, state, action, reward, next_state, done): """Adds experience into ReplayBuffer. As a side effect, also learns q network on this sample.""" # Get predicted next-state actions and Q values actions_next = self.actor_local.act([next_state]) Q_targets_next, _ = self.critic_local.get_q_and_v([next_state], actions_next) Q_target_next = Q_targets_next[0] Q_target = reward + self.gamma * Q_target_next * (1 - done) td_err = self.critic_local.get_td_err([state], [action], [Q_target]) self.memory.add(Experience(state, action, reward, next_state, done), td_err) def _soft_copy(self): self.sess.run(self._soft_copy_ops) def _create_soft_copy_op(self, scope_src, scope_dst, tau=0.01): var_src = tf.trainable_variables(scope=scope_src) var_dst = tf.trainable_variables(scope=scope_dst) copy_ops = [] for src, dst in zip(var_src, var_dst): mixed = tau * src + (1.0 - tau) * dst copy_op = tf.assign(dst, mixed) copy_ops.append(copy_op) return copy_ops
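# --- Hedged illustration (not part of the original file) ---
# What _create_soft_copy_op above builds, shown in isolation on two toy
# variables. Assumes TensorFlow 1.x (tf.assign, tf.trainable_variables), the
# same API the Agent above uses; the scope names here are illustrative.
import numpy as np
import tensorflow as tf

tau = 0.01
with tf.variable_scope('toy_local'):
    w_local = tf.get_variable('w', initializer=np.ones(3, dtype=np.float32))
with tf.variable_scope('toy_target'):
    w_target = tf.get_variable('w', initializer=np.zeros(3, dtype=np.float32))

# dst <- tau * src + (1 - tau) * dst, one assign op per variable pair
soft_copy_ops = [
    tf.assign(dst, tau * src + (1.0 - tau) * dst)
    for src, dst in zip(tf.trainable_variables('toy_local'),
                        tf.trainable_variables('toy_target'))
]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(soft_copy_ops)
    print(sess.run(w_target))  # target moves 1% of the way toward local: [0.01 0.01 0.01]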
class DDPG_Land(): def __init__(self, task, seed=None, render=False): self.env = task.env self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) self.total_reward = 0 self.steps = 0 self.action_repeat = 3 self.render = render # Score tracker and learning parameters self.score = -np.inf self.best_w = None self.best_score = -np.inf self.noise_scale = 0.1 #counter self.count = 0 # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(1, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.01 # for soft update of target parameters def act(self, s): # # print('act') # # a = lunder.heuristic(self.env, s) # # 1. Testing. # # 2. Demonstration rollout. # angle_targ = s[0]*0.5 + s[2]*1.0 # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed) # if angle_targ > 0.4: angle_targ = 0.4 # more than 0.4 radians (22 degrees) is bad # if angle_targ < -0.4: angle_targ = -0.4 # hover_targ = 0.55*np.abs(s[0]) # target y should be proporional to horizontal offset # # PID controller: s[4] angle, s[5] angularSpeed # angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0 # #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo)) # # PID controller: s[1] vertical coordinate s[3] vertical speed # hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5 # #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo)) # if s[6] or s[7]: # legs have contact # angle_todo = 0 # hover_todo = -(s[3])*0.5 # override to reduce fall speed, that's all we need after contact # if self.env.continuous: # a = np.array( [hover_todo*20 - 1, -angle_todo*20] ) # a = np.clip(a, -1, +1) # else: # a = 0 # if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2 # elif angle_todo < -0.05: a = 3 # elif angle_todo > +0.05: a = 1 # # return a # state = s """Returns actions for given state(s) as per current policy.""" state = np.reshape(s, [-1, 24]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) def step(self, action, reward, next_state, done): # print ("step") # ob, reward, done, info = self.env.step(action) # print(ob) # next_state = ob # Save experience / reward reward = np.clip(reward, a_min=-100, a_max=100) self.memory.add(self.last_state, action, reward, next_state, done) self.count += 1 self.total_reward += reward # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state #from the tutorial SRC self.score += reward if done: # self.score = 
np.clip(self.score,a_min=-100,a_max=100) if self.score > self.best_score: self.best_score = self.score def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights) # #from the tutorial SRC # self.score += reward # if done: # if self.score > self.best_score: # self.best_score = self.score # # return ob, reward, done def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) # # from policy search # # Learn by random policy search, using a reward-based score # # self.score = self.total_reward / float(self.count) if self.count else 0.0 # # if self.score > self.best_score: # # self.best_score = self.score # # self.best_w = self.w # # self.noise_scale = max(0.5 * self.noise_scale, 0.01) # # else: # # self.w = self.best_w # # self.noise_scale = min(2.0 * self.noise_scale, 3.2) # # self.w = self.w + self.noise_scale * np.random.normal(size=self.w.shape) # equal noise in all directions def reset(self): self.steps = 0 self.total_reward = 0 self.count = 0 self.score = 0 # self.best_score = 0 """Reset the sim to start a new episode.""" ob = self.env.reset() state = np.concatenate([ob] * self.action_repeat) self.last_state = state return state
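# --- Hedged reference sketch (not part of the original file) ---
# The agents above construct ReplayBuffer(buffer_size, batch_size) and call
# add(), sample(), and len(); the class itself is defined elsewhere. This is a
# minimal uniform-sampling version with that interface. (Other agents in this
# file use different buffer variants, e.g. with priorities, TD-error updates,
# or a size() method.)
import random
from collections import deque, namedtuple

Experience = namedtuple('Experience',
                        field_names=['state', 'action', 'reward', 'next_state', 'done'])


class ReplayBufferSketch:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences are evicted first
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)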
class DDPGAgent(Agent):
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, actor_model, tgt_actor_model, critic_model,
                 tgt_critic_model, action_limits, actor_lr=1e-4, critic_lr=1e-3,
                 critic_decay=1e-2, tau=1e-3, gamma=0.99, process=None,
                 rb_size=1e6, minibatch_size=64, warmup_episodes=0,
                 episodes_trained=0, train_scores=None, test_scores=None,
                 best_train_score=-np.inf):
        # Changed this to use a generic env instead of Task
        super().__init__(warmup_episodes, episodes_trained, train_scores,
                         test_scores, best_train_score)
        self.actor = Actor(actor_model, critic_model, lr=actor_lr)
        self.tgt_actor = Actor(tgt_actor_model, tgt_critic_model, lr=actor_lr)
        self.tgt_actor.set_weights(self.actor.get_weights())
        self.critic = Critic(critic_model, lr=critic_lr, decay=critic_decay)
        self.tgt_critic = Critic(tgt_critic_model, lr=critic_lr, decay=critic_decay)
        self.tgt_critic.set_weights(self.critic.get_weights())
        self.action_limits = action_limits
        self.minibatch_size = minibatch_size
        self.buffer = ReplayBuffer(int(rb_size), self.minibatch_size)
        self.tau = tau
        self.gamma = gamma
        self.state_space = K.int_shape(critic_model.inputs[0])[1]
        self.action_space = K.int_shape(critic_model.inputs[1])[1]
        self.learning_phase = 1
        if process is None:
            self.process = OUNoise(size=self.action_space, theta=0.15, mu=0, sigma=0.2)
        else:
            self.process = process

    def sense(self, s, a, r, s_new, done):
        s = np.reshape(s, [-1, self.state_space])
        s_new = np.reshape(s_new, [-1, self.state_space])
        self.buffer.add(s, a, r, s_new, done)

    def act(self, s):
        s = np.reshape(s, [-1, self.state_space])
        a = self.tgt_actor(s)
        # Cache.
        self.last_state = np.copy(s)
        self.last_action = np.copy(a)
        if self.learning_phase:
            a += self.process.sample()
            a = np.clip(a, self.action_limits[0], self.action_limits[1])
        self.last_action_noisy = np.copy(a)
        return a

    def new_episode(self):
        self.process.reset()

    def train_step(self):
        if len(self.buffer.memory) < self.minibatch_size:
            return
        minibatch = self.buffer.sample(self.minibatch_size)
        states = np.zeros([len(minibatch), self.state_space])
        states_new = np.zeros([len(minibatch), self.state_space])
        actions = np.zeros([len(minibatch), self.action_space])
        r = np.zeros([len(minibatch), 1])
        dones = np.zeros([len(minibatch), 1])
        for i in range(len(minibatch)):
            states[i], actions[i], r[i], states_new[i], dones[i] = minibatch[i]

        # Estimate Q_values
        critic_out = self.critic(states_new, self.actor(states_new))
        tgt_critic_out = self.tgt_critic(states_new, self.tgt_actor(states_new))
        # Q targets from the target critic; (1 - dones) zeroes the bootstrap
        # term for terminal transitions
        ys = r + self.gamma * tgt_critic_out * (1 - dones)

        # Train local critic and actor
        self.critic.step(states, actions, ys)
        self.actor.step(states)

        # Soft weight updates for target critic and actor
        critic_weights = self.critic.get_weights()
        tgt_critic_weights = self.tgt_critic.get_weights()
        for i in range(len(critic_weights)):
            tgt_critic_weights[i] = (1 - self.tau) * tgt_critic_weights[i] + \
                self.tau * critic_weights[i]
        self.tgt_critic.set_weights(tgt_critic_weights)

        actor_weights = self.actor.get_weights()
        tgt_actor_weights = self.tgt_actor.get_weights()
        for i in range(len(actor_weights)):
            tgt_actor_weights[i] = (1 - self.tau) * tgt_actor_weights[i] + \
                self.tau * actor_weights[i]
        self.tgt_actor.set_weights(tgt_actor_weights)
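# --- Hedged usage sketch (not part of the original file) ---
# How the generic-env DDPGAgent above would be driven with a gym-style
# environment. The agent is assumed to be constructed elsewhere with Keras
# actor/critic models and action_limits; run_episode and max_steps are
# illustrative names, not part of the original code.
import numpy as np


def run_episode(agent, env, max_steps=1000):
    state = env.reset()
    agent.new_episode()                     # reset the exploration noise process
    total_reward = 0.0
    for _ in range(max_steps):
        action = agent.act(state)           # noisy while agent.learning_phase == 1
        next_state, reward, done, _ = env.step(np.ravel(action))
        agent.sense(state, action, reward, next_state, done)  # store the transition
        agent.train_step()                  # one minibatch update once the buffer is warm
        state = next_state
        total_reward += reward
        if done:
            break
    return total_reward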
class DDPGAgentVersion3(BaseAgent): def __init__(self, state_size, action_size, num_agents, random_seed, lr_actor=1e-4, lr_critic=1e-3, fc1_units=400, fc2_units=300, buffer_size=int(1e5), batch_size=128, gamma=0.99, tau=1e-3, max_norm=1.0, learn_period=20, learn_sampling_num=10): """Initialize an Agent object. Args: state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed max_norm (float): value of clip_grad_norm for critic optimizer """ super().__init__() self.state_size = state_size self.num_agents = num_agents self.action_size = action_size self.seed = random.seed(random_seed) self.max_norm = max_norm self.learn_period = learn_period self.learn_sampling_num = learn_sampling_num # Actor Network (w/ Target Network) self.actor_local = DDPGActor(state_size, action_size, random_seed, fc1_units=fc1_units, fc2_units=fc2_units).to(device) self.actor_target = DDPGActor(state_size, action_size, random_seed, fc1_units=fc1_units, fc2_units=fc2_units).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network (w/ Target Network) self.critic_local = DDPGCritic(state_size, action_size, random_seed, fcs1_units=fc1_units, fc2_units=fc2_units).to(device) self.critic_target = DDPGCritic(state_size, action_size, random_seed, fcs1_units=fc1_units, fc2_units=fc2_units).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic) # Noise process for action # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 # (Timothy Lillicrap, 2016) self.exploration_sigma = 0.2 # (Timothy Lillicrap, 2016) # self.noise = OUNoise(action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) self.noise = OUNoiseMultivariate((num_agents, action_size), random_seed, mu=self.exploration_mu, theta=self.exploration_theta, sigma=self.exploration_sigma) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device) # parameter of discounted reward self.gamma = gamma # soft update parameter self.tau = tau self.batch_size = batch_size def step(self, states, actions, rewards, next_states, dones, time_step): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for i in range(self.num_agents): self.memory.add(states[i, :], actions[i, :], rewards[i], next_states[i, :], dones[i]) #self.memory.add_batch(states, actions, rewards, next_states, dones) # Learn, if enough samples are available in memory if (len(self.memory) > self.batch_size) and (time_step % self.learn_period == 0): for _ in range(self.learn_sampling_num): experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + gamma * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Args: experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # train critic # loss fuction = Q_target(TD 1-step boostrapping) - Q_local(current) actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.max_norm) self.critic_optimizer.step() # train actor (policy gradient) actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # update critic_target self.soft_update(self.critic_local, self.critic_target, self.tau) # update actor_target self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Args: local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def model_dicts(self): return {'actor': self.actor_target, 'critic': self.critic_target}
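# --- Hedged reference sketch (not part of the original file) ---
# The PyTorch agent above constructs OUNoiseMultivariate((num_agents,
# action_size), random_seed, mu=..., theta=..., sigma=...); the class itself is
# defined elsewhere. This is a minimal Ornstein-Uhlenbeck process matching that
# constructor and the reset()/sample() calls used above.
import numpy as np


class OUNoiseMultivariateSketch:
    """OU process over an arbitrary shape, e.g. (num_agents, action_size)."""

    def __init__(self, shape, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(shape)
        self.theta = theta                      # pull strength back toward mu
        self.sigma = sigma                      # scale of the random kicks
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        self.state = np.copy(self.mu)

    def sample(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state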
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task, verbose=False): self.verbose = verbose self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) #log_path = '/tmp/logs' #self.callback = callbacks.TensorBoard(log_dir=log_path, histogram_freq=1, # write_images=False, write_grads=True, write_graph=False) #self.callback.set_model(self.critic_local.model) #log_path = '/tmp/logs' #self.writer = tf.summary.FileWriter(log_path) #self.learn_counter = 0 # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0.1 self.exploration_theta = 0.2 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 512 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.015 # for soft update of target parameters def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state #self.learn_counter = 0 return state def mimic(self, experience_to_mimic): print("ready to mimic") self.memory.memory = experience_to_mimic def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) def save_grads(writer, model): for layer in model.layers: for weight in layer.weights: mapped_weight_name = weight.name.replace(':', '_') tf.summary.histogram(mapped_weight_name, weight) grads = model.optimizer.get_gradients( model.total_loss, weight) def is_indexed_slices(grad): return type(grad).__name__ == 'IndexedSlices' grads = [ grad.values if is_indexed_slices(grad) else grad for grad in grads ] tf.summary.histogram('{}_grad'.format(mapped_weight_name), grads) merged = tf.summary.merge_all() writer.flush() writer.close() #save_grads(self.writer, self.critic_local.model) #def write_log(callback, names, logs, batch_no): # for name, value in zip(names, logs): # summary = tf.Summary() # summary_value = summary.value.add() # summary_value.simple_value = value # summary_value.tag = name # callback.writer.add_summary(summary, batch_no) # callback.writer.flush() #train_names = ['train_loss', 'train_mae'] #print("about to write log") #write_log(self.callback, train_names, logs, self.learn_counter) #trainable_weights = critic_local.model.trainable_weights #gradients = critic_local.model.optimizer.get_gradients(critic_local.model.total_loss, trainable_weights) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) #self.learn_counter += 1 def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights) def _save_weight(self, model, directory_name, file_name): cwd = os.getcwd() directory_path = os.path.join(cwd, directory_name) if not os.path.exists(directory_path): os.makedirs(directory_path) file_path = os.path.join(directory_path, file_name) mv_file_to_dir_with_date(file_path, directory_path) model.save_weights(file_path) def save_weights(self, location='weights_backup'): if self.verbose: print("start save_weights") self._save_weight(self.critic_local.model, location, "critic_local.h5") self._save_weight(self.critic_target.model, location, "critic_target.h5") 
self._save_weight(self.actor_local.model, location, "actor_local.h5") self._save_weight(self.actor_target.model, location, "actor_target.h5") if self.verbose: print("done save_weights") def _h5(self, model, file_path): if os.path.exists(file_path): model.load_weights(file_path) else: print(f'could not find weight to load from [{file_path}]') def load_weights(self, location='weights_backup'): if self.verbose: print("start load_weights") cwd = os.getcwd() directory_path = os.path.join(cwd, location) self._h5(self.critic_local.model, os.path.join(directory_path, "critic_local.h5")) self._h5(self.critic_target.model, os.path.join(directory_path, "critic_target.h5")) self._h5(self.actor_local.model, os.path.join(directory_path, "actor_local.h5")) self._h5(self.actor_target.model, os.path.join(directory_path, "actor_target.h5")) if self.verbose: print("done load_weights")
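# --- Hedged reference sketch (not part of the original file) ---
# _save_weight above calls mv_file_to_dir_with_date(file_path, directory_path),
# which is not shown in this section. A plausible implementation is sketched
# below: timestamp and move any existing weight file aside so repeated calls to
# save_weights never silently overwrite an earlier backup. Treat this as an
# assumption about the helper, not its actual definition.
import os
import shutil
from datetime import datetime


def mv_file_to_dir_with_date(file_path, directory_path):
    """Rename an existing file into directory_path with a timestamp prefix.

    NOTE: assumed behavior; the project's real helper may differ.
    """
    if not os.path.exists(file_path):
        return
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_name = '{}_{}'.format(stamp, os.path.basename(file_path))
    shutil.move(file_path, os.path.join(directory_path, backup_name))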
class DDPG: def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.001 self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size) self.gamma = 0.99 self.tau = 0.1 self.learning_rate = 0.0005 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, learning_rate=self.learning_rate) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, learning_rate=self.learning_rate) self.critic_local = Critic(self.state_size, self.action_size, learning_rate=self.learning_rate) self.critic_target = Critic(self.state_size, self.action_size, learning_rate=self.learning_rate) def reset_episode(self): self.total_reward = 0.0 self.count = 0 self.noise.reset() self.last_state = self.task.reset() return self.last_state def act(self, state): state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) def step(self, action, reward, next_state, done): self.memory.add(self.last_state, action, reward, next_state, done) self.total_reward += reward self.count += 1 if self.memory.size() > self.batch_size: experiences = self.memory.sample(self.batch_size) self.learn(experiences) self.last_state = next_state def learn(self, experiences): states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) actions_next = self.actor_target.model.predict_on_batch(next_states) q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) q_targets = rewards + (self.gamma * q_targets_next * (1 - dones)) self.critic_local.model.train_on_batch(x=[states, actions], y=q_targets) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
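# --- Hedged reference sketch (not part of the original file) ---
# The Keras agents in this file call critic_local.model.train_on_batch(...) and
# critic_local.get_action_gradients([states, actions, 0]); the Critic class is
# defined elsewhere. This is a minimal state/action value network with that
# interface, assuming Keras on the TensorFlow 1.x backend; layer sizes are
# illustrative assumptions.
from keras import layers, models, optimizers
from keras import backend as K


class CriticSketch:
    """Q-network: (state, action) -> scalar value, exposing dQ/da for the actor."""

    def __init__(self, state_size, action_size, learning_rate=1e-3):
        states = layers.Input(shape=(state_size,), name='states')
        actions = layers.Input(shape=(action_size,), name='actions')

        net_states = layers.Dense(400, activation='relu')(states)
        net_actions = layers.Dense(400, activation='relu')(actions)
        net = layers.Add()([net_states, net_actions])
        net = layers.Dense(300, activation='relu')(net)
        Q_values = layers.Dense(1, name='q_values')(net)

        self.model = models.Model(inputs=[states, actions], outputs=Q_values)
        self.model.compile(optimizer=optimizers.Adam(lr=learning_rate), loss='mse')

        # Gradient of Q with respect to the action input, consumed by the actor's train_fn
        action_gradients = K.gradients(Q_values, actions)
        self.get_action_gradients = K.function(
            inputs=[*self.model.input, K.learning_phase()],
            outputs=action_gradients)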
class DDPG(): """ Reinforcement Learning agent that learns using DDPG. Deep DPG as described by Lillicrap et al. (2015) """ def __init__(self, task, prioritized_replay=True): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 #0.15 #0.1 self.exploration_sigma = 0.2 #0.2 #0.1 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) self.buffer_size = 100000 self.batch_size = 64 # 64 self.prioritized_replay = prioritized_replay self.prioritized_replay_alpha = 0.6 self.prioritized_replay_beta0 = 0.4 self.prioritized_replay_beta_iters = None self.prioritized_replay_eps = 1e-6 self.max_timesteps = 100000 # Replay buffer if self.prioritized_replay: self.memory = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: self.prioritized_replay_beta_iters = self.max_timesteps self.beta_schedule = LinearSchedule( self.prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.01 # for soft update of target parameters #self.tau = 0.001 # 0.001 per paper self.td_errors_list = [] self.actor_loss_list = [] self.critic_loss_list = [] def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: if self.prioritized_replay: samples = self.memory.sample(self.batch_size, beta=self.beta_schedule.value( len(self.memory))) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = samples experiences = [] for i in range(len(obses_t)): experiences.append( namedtuple("PrioritizedExperience", field_names=[ "state", "action", "reward", "next_state", "done", "weight", "batch_idx" ])(obses_t[i:i + 1], actions[i:i + 1], rewards[i:i + 1], obses_tp1[i:i + 1], dones[i:i + 1], weights[i:i + 1], batch_idxes[i:i + 1])) self.learn(experiences) else: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] #actions = list(action + self.noise.sample()) #print("act {}".format(actions)) #return actions # add some noise for exploration return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters 
using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) critic_loss = self.critic_local.model.train_on_batch( x=[states, actions], y=Q_targets) # Train actor model (local) using action gradients action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) actor_loss = self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) if self.prioritized_replay: # Update replay buffer priorities batch_idxes = np.vstack( [e.batch_idx[0] for e in experiences if e is not None]) new_priorities = np.abs(Q_targets) + self.prioritized_replay_eps self.memory.update_priorities(batch_idxes, new_priorities) self.td_errors_list.append(Q_targets.T) self.actor_loss_list.append(actor_loss[0]) self.critic_loss_list.append(critic_loss) #print("states {} next states {} critic_loss {} actor_loss {}".format(states, actions_next, critic_loss, actor_loss)) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights) def save_weights(self): self.actor_local.model.save_weights("DDPG_actor_weights.h5") self.critic_local.model.save_weights("DDPG_critic_weights.h5") def save_td_errors(self, i_episode): with open("DDPG_agent_td_errors_episode_{}.csv".format(i_episode), 'w') as csvfile: writer = csv.writer(csvfile) for td_errors in self.td_errors_list: writer.writerow([td_errors]) self.td_errors_list.clear() def save_losses(self, i_episode): with open( "DDPG_agent_actor_critic_loss_episode_{}.csv".format( i_episode), 'w') as csvfile: writer = csv.writer(csvfile) for actor_loss, critic_loss in zip(self.actor_loss_list, self.critic_loss_list): writer.writerow([actor_loss, critic_loss]) self.actor_loss_list.clear() self.critic_loss_list.clear()
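# --- Hedged reference sketch (not part of the original file) ---
# The prioritized-replay agent above builds LinearSchedule(beta_iters,
# initial_p=beta0, final_p=1.0) and calls value(t) to anneal the importance-
# sampling exponent beta toward 1. The class is not shown here; this is a
# minimal baselines-style version matching that usage.
class LinearScheduleSketch:
    """Linearly interpolate from initial_p to final_p over schedule_timesteps."""

    def __init__(self, schedule_timesteps, initial_p=0.4, final_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)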
class DDPG(): def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high def create_models(self, hidden_sizes_actor=(512, 256), hidden_sizes_critic=(512, 256, 256)): self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, hidden_sizes=hidden_sizes_actor) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, hidden_sizes=hidden_sizes_actor) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) self.critic_local = Critic(self.state_size, self.action_size, hidden_sizes=hidden_sizes_critic) self.critic_target = Critic(self.state_size, self.action_size, hidden_sizes=hidden_sizes_critic) self.critic_target.model.set_weights( self.critic_local.model.get_weights()) def set_params(self, mu=0.1, sigma=0.1, theta=0.1, buffer_size=1e+8, batch_size=128, gamma=0.99, tau=1e-3): self.exploration_mu = mu self.exploration_sigma = sigma self.exploration_theta = theta self.noise = noise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) self.buffer_size = int(buffer_size) self.batch_size = int(batch_size) self.buffer = ReplayBuffer(self.buffer_size) self.gamma = gamma self.tau = tau def act(self, states): state = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.calc_noise()) def learn(self): states, actions, rewards, dones, next_states = self.buffer.sample( self.batch_size, self.action_size, self.state_size) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # soft_update self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): self.buffer.add(self.last_state, action, reward, next_state, done) self.learn() self.last_state = next_state def soft_update(self, local_model, target_model): target_model.set_weights( self.tau * np.array(local_model.get_weights()) + (1 - self.tau) * np.array(target_model.get_weights()))
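# --- Hedged usage sketch (not part of the original file) ---
# Unlike the other agents in this file, the DDPG variant above is not ready to
# act straight after __init__: create_models() and set_params() must both be
# called first. The hyperparameter values below simply repeat the defaults
# already present in the class; the task argument is whatever Task object the
# constructor expects.
def build_agent(task):
    agent = DDPG(task)
    agent.create_models(hidden_sizes_actor=(512, 256),
                        hidden_sizes_critic=(512, 256, 256))
    agent.set_params(mu=0.1, sigma=0.1, theta=0.1,
                     buffer_size=1e8, batch_size=128, gamma=0.99, tau=1e-3)
    return agent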
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights(self.critic_local.model.get_weights()) self.actor_target.model.set_weights(self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.01 # for soft update of target parameters # from plicy search self.action_range = self.action_high - self.action_low self.w = np.random.normal( size=(self.state_size, self.action_size), # weights for simple linear policy: state_space x action_space scale=(self.action_range / (2 * self.state_size))) # start producing actions in a decent range # Score tracker and learning parameters self.score = -np.inf self.best_w = None self.best_score = -np.inf self.noise_scale = 0.1 #counter self.count = 0 def reset_episode(self): self.noise.reset() self.count = 0 self.total_reward = 0.0 self.score = 0 state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) self.count += 1 self.total_reward += reward # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state #from the tutorial SRC self.score += reward if done: if self.score > self.best_score: self.best_score = self.score def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) # from policy search # Learn by random policy search, using a reward-based score # self.score = self.total_reward / float(self.count) if self.count else 0.0 # if self.score > self.best_score: # self.best_score = self.score # self.best_w = self.w # self.noise_scale = max(0.5 * self.noise_scale, 0.01) # else: # self.w = self.best_w # self.noise_scale = min(2.0 * self.noise_scale, 3.2) # self.w = self.w + self.noise_scale * np.random.normal(size=self.w.shape) # equal noise in all directions def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
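# --- Hedged reference sketch (not part of the original file) ---
# Most agents in this file construct OUNoise(action_size, mu, theta, sigma) and
# call reset()/sample(); the class itself is defined elsewhere. This is a
# minimal, standard Ornstein-Uhlenbeck process with that interface, useful for
# reading the exploration_theta / exploration_sigma settings above.
import numpy as np


class OUNoiseSketch:
    """Temporally correlated exploration noise for continuous actions."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta    # how strongly the process reverts to mu
        self.sigma = sigma    # scale of the per-step random perturbation
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state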
class DDPG(Agent): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, env): # Changed this to use generic env instead of Task super().__init__(env) self.state_size = env.observation_space.shape[0] self.action_size = env.action_space.shape[0] self.action_low = env.action_space.low self.action_high = env.action_space.high # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 1e-2 # for soft update of target parameters # Critic Params self.critic_lr = 1e-3 self.critic_decay = 1e-2 # Actor Params self.actor_lr = 1e-4 self.actor_decay = 0 # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr, self.actor_decay) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr, self.actor_decay) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size, self.critic_lr, self.critic_decay) self.critic_target = Critic(self.state_size, self.action_size, self.critic_lr, self.critic_decay) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) def reset_episode(self): self.noise.reset() state = self.env.reset() self.last_state = state return state def step(self, action, reward, next_state, done, training=True): # Since DDPG is an off-policy learner, add a training flag # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if training and len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) self.steps_trained += 1 # Roll over last state and action self.last_state = next_state def act(self, state, training=True): # Add a training flag to decide whether to explore """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] if training: return list(action + self.noise.sample()) # add some noise for exploration else: return list(action) def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def save_model(self, filename):
        # Detach the Keras model wrappers so the rest of the agent can be pickled
        al = self.actor_local
        at = self.actor_target
        cl = self.critic_local
        ct = self.critic_target
        self.actor_local = None
        self.actor_target = None
        self.critic_local = None
        self.critic_target = None
        with open(filename + '.ddpg_agent', 'wb') as f:  # pickle requires binary write mode
            pickle.dump(self, f)
        al.save(filename + '.actor_local')
        at.save(filename + '.actor_target')
        cl.save(filename + '.critic_local')
        ct.save(filename + '.critic_target')
        self.actor_local = al
        self.actor_target = at
        self.critic_local = cl
        self.critic_target = ct

    @classmethod
    def load_model(cls, filename):
        with open(filename + '.ddpg_agent', 'rb') as f:  # pickle requires binary read mode
            m = pickle.load(f)
        m.actor_local = load_model(filename + '.actor_local')
        m.actor_target = load_model(filename + '.actor_target')
        m.critic_local = load_model(filename + '.critic_local')
        m.critic_target = load_model(filename + '.critic_target')
        return m
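# --- Hedged usage note (not part of the original file) ---
# How save_model/load_model above are meant to be used: the agent is pickled
# with its model wrappers detached, and each network is saved and restored
# separately. load_model inside load_model() is assumed to be
# keras.models.load_model; the checkpoint prefix below is illustrative.
#
#     agent.save_model('checkpoints/ddpg_agent')
#     restored = DDPG.load_model('checkpoints/ddpg_agent')
#
# load_model() assigns plain Keras models to actor_local / critic_local, while
# the rest of the class accesses them through a .model attribute, so callers
# may need to re-wrap the restored networks before calling act() or learn().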
class DDPG(): """Reinforcement Learning agent that learns using DDPG. """ def __init__(self, env_reset, state_size, action_size, action_low, action_high): """Params: env_reset: callback function to reset environemnt at end of episode state_size: dimension of state space action_size: dimension of action space action_low: float - minimum action value action_high: float - maximum action value """ self.training_steps = 0 # number of training steps run so far self.env_reset = env_reset self.state_size = state_size self.action_size = action_size self.action_low = action_low self.action_high = action_high # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 1e-3 # for soft update of target parameters self.critic_decay = 1e-2 # L2 weight decay for critic (regularization) self.critic_lr = 1e-3 # Learning rate for critic self.critic_alpha = 1e-2 # Leaky ReLU alpha for critic self.actor_lr = 1e-4 # Learning rate for actor self.actor_alpha = 1e-2 # Leaky ReLU alpha for actor # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr, self.actor_alpha) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr, self.actor_alpha) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size, self.critic_lr, self.critic_decay, self.critic_alpha) self.critic_target = Critic(self.state_size, self.action_size, self.critic_lr, self.critic_decay,self.critic_alpha) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights(self.critic_local.model.get_weights()) self.actor_target.model.set_weights(self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = int(1e6) self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) def reset_episode(self): self.noise.reset() state = self.env_reset() self.last_state = state return state def step(self, action, reward, next_state, done, training=True): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if training and len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) self.training_steps += 1 # Roll over last state and action self.last_state = next_state def act(self, state, training=True): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] if training: # add some noise for exploration return list(action + self.noise.sample()) else: return list(action) def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
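# Every agent in this file constructs OUNoise(size, mu, theta, sigma) and calls reset() /
# sample(), but the helper itself is not shown anywhere in the listing. The sketch below is
# a minimal, typical Ornstein-Uhlenbeck process matching that interface; it is an assumed
# illustration of the missing helper, not necessarily the original implementation.
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)   # long-running mean the process reverts to
        self.theta = theta             # mean-reversion rate (pull back toward mu, "same direction")
        self.sigma = sigma             # scale of the fresh Gaussian noise added each step
        self.reset()

    def reset(self):
        """Reset the internal state to the mean (called at the start of each episode)."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state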
class AE_DDPG_Agent(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # AE: Although OUNoise gives me a convenient set of randomness for each of the rotors, I still need # AE: to make a decision myself on how to apply the randomness and how to manage its magnitude # AE: (i.e. my explore vs exploit strategy). These variables will do that. self.explore_start = 1.0 # AE: exploration probability at start self.explore_stop = 0.001 # AE: minimum exploration probability self.decay_rate = 0.003 # AE: exponential decay rate for exploration prob self.magnitude_coeff = 0.1 # AE: a coefficient to limit randomness # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights(self.critic_local.model.get_weights()) self.actor_target.model.set_weights(self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 # AE: additive to the noise. mu * theta will be directly added self.exploration_theta = 0.15 # AE: old noise will be multiplied by this self.exploration_sigma = 0.2 # AE: new noise will be multiplied by this self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor # AE: The learning rate. How much we trust the new values compared to the old ones. self.tau = 0.0001 # for soft update of target parameters # AE: current score in the learning procedure (for statistics) self.score = -np.inf # Episode variables self.reset_episode() def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state self.best_score = -np.inf self.score = -np.inf self.total_reward = 0.0 self.count = 0 return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state self.total_reward += reward self.count += 1 # AE: Score (average reward in this episode so far) and best score for statistics self.score = self.total_reward / float(self.count) if self.score > self.best_score: self.best_score = self.score def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) # AE: take the action suggested by the learned policy (the actor network)... action = self.actor_local.model.predict(state)[0] # AE: ...and add some noise to it for unpredictability. # AE: The magnitude of the noise has to drop over time.
explore_p = self.explore_stop + (self.explore_start - self.explore_stop) * np.exp( -self.decay_rate * self.count) #self.noise.update_mu(explore_p) noise_sample = self.magnitude_coeff * explore_p * self.noise.sample() #noise_sample = explore_p * np.random.randn(self.action_size) #print("Noi=", s) return list( action + noise_sample * self.action_size) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" # AE: Updating NN weights directly in the passed model (actor or critic). new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
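# Like OUNoise, the ReplayBuffer(buffer_size, batch_size) used by every agent above is not
# shown in this listing; the agents only rely on add(state, action, reward, next_state, done),
# sample(), __len__(), and namedtuple-style fields (e.state, e.action, ...). The sketch below
# is a minimal buffer matching that assumed interface, not necessarily the original code.
import random
from collections import namedtuple, deque

Experience = namedtuple("Experience",
                        field_names=["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences are dropped automatically
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)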
class Agent(): def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) self.critic_target.model.set_weights(self.critic_local.model.get_weights()) self.actor_target.model.set_weights(self.actor_local.model.get_weights()) self.exploration_mu = 0 self.exploration_theta = 0.10 # same direction self.exploration_sigma = 0.001 # random noise self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) self.gamma = 0.90 # discount factor self.tau = 0.1 # for soft update of target parameters self.best_score = -np.inf self.score = 0 def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state self.score = 0 return state def step(self, action, reward, next_state, done): self.memory.add(self.last_state, action, reward, next_state, done) if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) self.last_state = next_state self.score += reward if done: if self.score > self.best_score: self.best_score = self.score def act(self, state): state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([e.next_state for e in experiences if e is not None]) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights) new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
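# The agents in this file use very different tau values for the soft update (0.1, 1e-3, 1e-4).
# The small standalone illustration below (not part of any agent above) shows how tau controls
# how quickly a target parameter tracks the local one under
# theta_target <- tau * theta_local + (1 - tau) * theta_target.
import numpy as np

def steps_to_track(tau, fraction=0.95):
    """Number of soft updates needed for the target to cover `fraction` of the gap to the local value."""
    target, local = 0.0, 1.0
    steps = 0
    while target < fraction * local:
        target = tau * local + (1 - tau) * target
        steps += 1
    return steps

# With tau = 0.1 the target covers 95% of the gap in about 29 updates; with tau = 1e-3 it takes
# roughly 3000, which is why small tau values are usually paired with many more training steps.
print(steps_to_track(0.1), steps_to_track(1e-3))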
class DDPG(): '''reinforcement learning agent that learns using Deep Deterministic Policy Gradient''' def __init__(self, task): ''' Params ====== task (object) : environment ''' ''' Reference: Continuous Control With Deep Reinforcement Learning (2016) Playing CartPole through Asynchronous Advantage Actor Critic (A3C) with tf.keras ========= gamma : 0.99 tau : 0.001 buffer_size (ReplayBuffer) : 1e6 batch_size (ReplayBuffer) : 64 theta (Ornstein-Uhlenbeck process) : 0.15 sigma (Ornstein-Uhlenbeck process) : 0.2 ''' self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # actor (policy) model - use two copies of model for updating model and producing target self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # critic (value) model - use two copies of model for updating model and producing target self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # initialize target model parameters with local model parameters self.actor_target.model.set_weights(self.actor_local.model.get_weights()) self.critic_target.model.set_weights(self.critic_local.model.get_weights()) # noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # replay memory self.buffer_size = 1000000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters # reward history self.best_avg_score = -np.inf self.accumulated_reward = 0 self.count = 0 def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state self.accumulated_reward = 0 self.count = 0 return state def step(self, action, reward, next_state, done): # save experience and reward self.memory.add(self.last_state, action, reward, next_state, done) # learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # roll over last state and action self.last_state = next_state # accumulate reward self.accumulated_reward += reward self.count += 1 # record best average score if done: if float(self.accumulated_reward / self.count) > self.best_avg_score: self.best_avg_score = float(self.accumulated_reward / self.count) def act(self, state): '''returns actions for given state(s) as per current policy''' state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration # both action and self.noise.sample() are numpy arrays; + performs element-wise addition, # not concatenation def learn(self, experiences): '''update policy and value parameters using given batch of experience tuples''' # convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).\ astype(np.float32).reshape(-1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None]).\ astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).\ astype(np.uint8).reshape(-1, 1) next_states = np.vstack([e.next_state for e in experiences if e is not None]) # get predicted next-state actions and Q-values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) # compute Q targets for current states and train critic model (local) # Value loss: L = (1/N) * sum((r + gamma * Q_target(s', a') - Q(s, a))^2) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # train actor model (local) # Policy loss (deterministic policy gradient): L = -(1/N) * sum(Q(s, mu(s))), # minimized by feeding the critic's dQ/da into the actor's custom training function action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # The learning phase flag is a bool tensor (0 = test, 1 = train) # to be passed as input to any Keras function # that uses a different behavior at train time and test time. # soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): '''soft update model parameters''' local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights),\ 'Local and target model parameters must have the same size' new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
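# All of the learn() methods above call critic_local.get_action_gradients([states, actions, 0])
# and actor_local.train_fn([states, action_gradients, 1]), but the Actor/Critic classes that
# define those helpers are not part of this listing. The sketch below shows one common way of
# wiring them up, assuming Keras 2 with a TF1-style backend; the helper function names are
# hypothetical and the layer/tensor arguments are placeholders for the model builders' tensors.
from keras import layers, optimizers
from keras import backend as K

def build_critic_gradient_fn(critic_model, state_input, action_input):
    """Return a backend function mapping [states, actions, learning_phase] -> dQ/da."""
    action_gradients = K.gradients(critic_model.output, action_input)
    return K.function(inputs=[state_input, action_input, K.learning_phase()],
                      outputs=action_gradients)

def build_actor_train_fn(actor_model, action_size, lr=1e-4):
    """Return a backend function that applies one policy-gradient step to the actor."""
    action_gradients = layers.Input(shape=(action_size,))
    # Deterministic policy gradient: ascend Q by minimizing -dQ/da * pi(s)
    loss = K.mean(-action_gradients * actor_model.output)
    updates_op = optimizers.Adam(lr=lr).get_updates(
        params=actor_model.trainable_weights, loss=loss)
    return K.function(inputs=[actor_model.input, action_gradients, K.learning_phase()],
                      outputs=[], updates=updates_op)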
class RLA(): """Reinforcement learning agent""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high #Actor model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) #Critic model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) #Initialize target model params with local params self.critic_target.model.set_weights(self.critic_local.model.get_weights()) self.actor_target.model.set_weights(self.actor_local.model.get_weights()) #Initialize noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) #Replay memory initialization self.buffer_size, self.batch_size = 2000000, 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) #Initialize algorithm parameters self.gamma, self.tau = 0.95, 0.001 #Initialize scores self.score, self.best_score = -np.inf, -np.inf def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state self.score = 0 return state def step(self, action, reward, next_state, done): self.memory.add(self.last_state, action, reward, next_state, done) #Learn once more samples are stored in memory than the batch size if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) #Preserve state as last_state self.last_state = next_state #Update score with reward from this step self.score += reward if done: #Preserve best score if self.score > self.best_score: self.best_score = self.score def act(self, state): state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) def learn(self, experiences): #Convert experiences to separate arrays states = np.vstack([exp.state for exp in experiences if exp is not None]) actions = np.array([exp.action for exp in experiences if exp is not None]).astype(np.float32).reshape(-1, self.action_size) rewards = np.array([exp.reward for exp in experiences if exp is not None]).astype(np.float32).reshape(-1, 1) dones = np.array([exp.done for exp in experiences if exp is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([exp.next_state for exp in experiences if exp is not None]) #Predict next-state actions and Q values from the target models...
actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) #Compute Q targets and train the local critic Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) #Train local actor model action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) #Soft-update target models self.update(self.critic_local.model, self.critic_target.model) self.update(self.actor_local.model, self.actor_target.model) def update(self, local_model, target_model): """Soft-update model parameters toward the local model""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
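# The task-based agents in this file (Agent, AE_DDPG_Agent, RLA, the task-based DDPG) share the
# same reset_episode() / act() / step() interface, so a generic driver can train any of them.
# The sketch below is illustrative only; it assumes a task whose step(action) returns
# (next_state, reward, done), which is not shown in this listing.
def run_training(agent, num_episodes=500):
    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()
        episode_reward, done = 0.0, False
        while not done:
            action = agent.act(state)
            next_state, reward, done = agent.task.step(action)  # assumed Task API
            agent.step(action, reward, next_state, done)
            state = next_state
            episode_reward += reward
        print("Episode {:4d}: reward = {:8.3f}".format(i_episode, episode_reward))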