class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.actor_lr = 1e-5
        self.critic_lr = 1e-4
        self.network = [128, 256, 128]
        self.train = train

        network = self.network
        actor_lr = self.actor_lr
        critic_lr = self.critic_lr

        if self.train:
            # Actor (Policy) Model
            self.actor_local = Actor(self.state_size, self.action_size,
                                     self.action_low, self.action_high,
                                     actor_lr, network)
            self.actor_target = Actor(self.state_size, self.action_size,
                                      self.action_low, self.action_high,
                                      actor_lr, network)

            # Critic (Value) Model
            self.critic_local = Critic(self.state_size, self.action_size,
                                       critic_lr, network)
            self.critic_target = Critic(self.state_size, self.action_size,
                                        critic_lr, network)

            # Initialize target model parameters with local model parameters
            self.critic_target.model.set_weights(
                self.critic_local.model.get_weights())
            self.actor_target.model.set_weights(
                self.actor_local.model.get_weights())

            # Noise process
            self.exploration_mu = 0        # mean
            self.exploration_theta = 0.15  # how fast the variable reverts to the mean
            self.exploration_sigma = 0.2   # degree of volatility
            self.noise = OUNoise(self.action_size, self.exploration_mu,
                                 self.exploration_theta, self.exploration_sigma)

            # Replay memory
            self.buffer_size = 5000
            self.batch_size = 16
            self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
            self.targets = ReplayBuffer(self.buffer_size, self.batch_size)

            # Algorithm parameters
            self.gamma = 0.99  # discount factor
            self.tau = 0.01    # for soft update of target parameters

            print("DDPG init", "Actor: ", actor_lr, "Critic: ", critic_lr)
            print("Tau: ", self.tau, "Sigma: ", self.exploration_sigma)
            self.actor_local.model.summary()
            self.critic_local.model.summary()

            # https://stackoverflow.com/questions/44861149/keras-use-tensorboard-with-train-on-batch?rq=1
            # Create the TensorBoard callback, which we will drive manually
            self.tensorboard = keras.callbacks.TensorBoard(
                log_dir='logdir',
                histogram_freq=0,
                batch_size=self.batch_size,
                write_graph=True,
                write_grads=True)
            self.tensorboard.set_model(self.critic_local.model)
            self.summary_writer = tf.summary.FileWriter("scores")
            self.batch_id = 0

    def reset_episode(self):
        if self.train:
            self.noise.reset()
        self.noise_arr = []
        self.noise_matrix = [0., 0., 0., 0.]
        state = self.task.reset()
        self.last_state = state
        return state

    def save_initial_weights(self):
        self.actor_local.model.save_weights('actor_local.h5')
        self.actor_target.model.save_weights('actor_target.h5')
        self.critic_local.model.save_weights('critic_local.h5')
        self.critic_target.model.save_weights('critic_target.h5')

    def load_initial_weights(self):
        self.actor_local.model.load_weights('actor_local.h5')
        self.actor_target.model.load_weights('actor_target.h5')
        self.critic_local.model.load_weights('critic_local.h5')
        self.critic_target.model.load_weights('critic_target.h5')

    def save_model(self):
        # Save the actor weights; they are all that is needed for inference
        self.actor_local.model.save_weights('model_weights.h5')

    def load_weights(self, option=None):
        if option is None:
            self.trained = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr, self.network)
            self.trained.model.load_weights('model_weights.h5')
        else:
            self.trained = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr, self.network)
            self.trained.model.load_weights('weights-best.hdf5')
        self.trained.model.summary()

    def predict(self, state):
        """Returns actions for given state(s) as per the loaded policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.trained.model.predict(state)[0]
        return action

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size * 2:
            experiences = self.memory.sample()
            self.learn(experiences)

        if len(self.memory) == self.buffer_size:
            self.memory.memory.clear()
            print("buffer cleared")

        # Roll over last state
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy, plus exploration noise."""
        state = np.reshape(state, [-1, self.state_size])
        noise = self.noise.sample()
        action = list(self.actor_local.model.predict(state)[0] + noise)
        return action, noise

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Debugging shapes (disabled):
        # print("States", states.shape); print("actions", actions.shape)
        # print("rewards", rewards.shape); print("dones", dones.shape)
        # print("Next states", next_states.shape)

        # Keep training actor_local and critic_local; use values from the
        # target models to build the training targets. The target models are
        # never trained directly, only soft-updated.

        # Get predicted next-state actions and Q values from target models:
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(
            next_states)  # actions predicted by the target actor
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train the local critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        critic_loss = self.critic_local.model.train_on_batch(
            x=[states, actions], y=Q_targets)

        # Train the local actor model using the critic's action gradients
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        actor_loss = self.actor_local.train_fn(
            [states, action_gradients, 1])  # custom training function

        self.tensorboard.on_epoch_end(
            self.batch_id, named_logs(self.critic_local.model, [critic_loss]))
        self.batch_id += 1

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
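# The DDPG class above relies on two helpers defined elsewhere in the project:
# an Ornstein-Uhlenbeck noise process and a named_logs() utility for the
# manually driven TensorBoard callback. The sketches below are assumptions
# about their shape (names and signatures inferred from how they are called
# above), not the project's actual implementations.

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (sketch; signature inferred from DDPG.__init__)."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta  # how fast the state reverts to the mean
        self.sigma = sigma  # scale of the random perturbation
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state


def named_logs(model, logs):
    """Pair a Keras model's metric names with train_on_batch outputs (assumed helper)."""
    return dict(zip(model.metrics_names, logs))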
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed,
                 fc1_units, fc2_units, weighted=False, individual=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON_MAX

        # Actor Network (w/ Target Network)
        if weighted:
            self.actor_local = Weight_adapter(state_size, action_size).to(device)
            self.actor_target = Weight_adapter(state_size, action_size).to(device)
        elif individual:
            self.actor_local = IndividualModel(state_size, action_size,
                                               random_seed, fc1_units).to(device)
            self.actor_target = IndividualModel(state_size, action_size,
                                                random_seed, fc1_units).to(device)
        else:
            self.actor_local = Actor(state_size, action_size, random_seed,
                                     fc1_units, fc2_units).to(device)
            self.actor_target = Actor(state_size, action_size, random_seed,
                                      fc1_units, fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, mu=0., theta=0.15, sigma=0.2)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Make sure the target networks start with the same weights as the local networks
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use a random sample from the buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > LEARN_START:
            # Learn every UPDATE_EVERY time steps.
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                # Learn, if enough samples are available in memory
                if len(self.memory) > BATCH_SIZE:
                    for _ in range(UPDATES_PER_STEP):
                        experiences = self.memory.sample()
                        self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ----------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ------------------------ #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ------------------------------ #
        if self.epsilon - EPSILON_DECAY > EPSILON_MIN:
            self.epsilon -= EPSILON_DECAY
        else:
            self.epsilon = EPSILON_MIN
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
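# Usage sketch for the PyTorch Agent above: a minimal training loop assuming a
# gym-style environment (env.reset() -> state, env.step(action) ->
# (next_state, reward, done, info)) and that the module-level constants the
# class references (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC,
# WEIGHT_DECAY, EPSILON_*, LEARN_START, UPDATE_EVERY, UPDATES_PER_STEP) are
# defined elsewhere. Episode and step counts here are illustrative, not the
# project's settings.

def train(agent, env, n_episodes=500, max_t=1000):
    scores = []
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()  # reset the OU noise process each episode
        score = 0.0
        for t in range(max_t):
            action = agent.act(state, add_noise=True)
            next_state, reward, done, _ = env.step(action)
            # Store the transition and (possibly) trigger learning updates
            agent.step(state, action, reward, next_state, done, t)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
    return scores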
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent.

        Params
        ======
            state_size (int): state dimension
            action_size (int): action dimension
            num_agents (int): number of simultaneously running agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(random_seed)

        # Actor Network and its target network
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network and its target network
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise object
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   EXPERIENCES_PER_SAMPLING, device, random_seed)

        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use a prioritized sample from the buffer to learn."""
        # Save memory for each agent
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn every UPDATE_NN_EVERY time steps.
        self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY
        self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY
        self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY
        if self.t_step_mem_par == 0:
            self.memory.update_parameters()
        if self.t_step_nn == 0:
            # Learn from memory if enough samples exist
            if self.memory.experience_count > EXPERIENCES_PER_SAMPLING:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)
        if self.t_step_mem == 0:
            self.memory.update_memory_sampling()

    def act(self, states, add_noise=True):
        """Returns actions for the given states as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[i, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, indices = experiences

        # Update Critic
        # Get predicted next-state actions and Q values from the target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute Critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update Actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # Update priorities from the TD errors
        delta = abs(Q_targets - Q_expected).detach().cpu().numpy()
        self.memory.update_priorities(delta, indices)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_model_param, local_model_param in zip(
                target_model.parameters(), local_model.parameters()):
            target_model_param.data.copy_(tau * local_model_param.data +
                                          (1. - tau) * target_model_param.data)
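# The prioritized-replay Agent above depends on a ReplayBuffer with a richer
# interface than the uniform buffer used earlier. The stub below only documents
# the contract the agent calls against (names and argument order inferred from
# the calls above); the actual prioritized sampling, priority exponents, and
# importance weights live in the project's real implementation.

class ReplayBuffer:
    """Prioritized experience replay buffer (interface sketch, not the real implementation)."""

    def __init__(self, action_size, buffer_size, batch_size,
                 experiences_per_sampling, device, seed):
        self.experience_count = 0  # number of experiences added so far

    def add(self, state, action, reward, next_state, done):
        """Store one transition and give it an initial priority."""
        self.experience_count += 1

    def update_memory_sampling(self):
        """Refresh the pre-drawn sample batches (called every UPDATE_MEM_EVERY steps)."""

    def update_parameters(self):
        """Anneal the priority/importance-sampling parameters (called every UPDATE_MEM_PAR_EVERY steps)."""

    def sample(self):
        """Return (states, actions, rewards, next_states, dones, indices) as torch tensors."""

    def update_priorities(self, deltas, indices):
        """Update stored priorities from the TD errors of the sampled transitions."""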