# Shared imports for the implementations below. The Actor, Critic, OUNoise and
# ReplayBuffer classes are assumed to be defined in accompanying modules.
import random
from collections import deque

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim


class DDPG_Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, seed, device,
                 buffer_size=int(1e5), batch_size=128, num_batches=5, update_every=10,
                 gamma=0.99, tau=8e-3, learning_rate_actor=1e-3, learning_rate_critic=1e-3,
                 weight_decay=0.0001, hidden_layers_actor=[32, 32], hidden_layers_critic=[32, 32, 32],
                 add_noise=True, start_eps=5.0, end_eps=0.0, end_eps_episode=500, agent_id=-1):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            seed (int): random seed
            hidden_layers_X (list of int; optional): nodes per hidden layer for X = actor or critic
            buffer_size (int; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate_X (float; optional): learning rate for X = actor or critic
        """
        print('In DDPG_AGENT: seed = ', seed)
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = seed
        random.seed(seed)
        self.device = device
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.num_batches = num_batches
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = learning_rate_actor
        self.lr_critic = learning_rate_critic
        self.weight_decay_critic = weight_decay
        self.add_noise = add_noise
        self.start_eps = start_eps
        self.eps = start_eps
        self.end_eps = end_eps
        self.eps_decay = 1 / (end_eps_episode * num_batches)  # decay rate based on the epsilon end target
        self.timestep = 0
        self.agent_id = agent_id

        ### SET UP THE ACTOR NETWORK ###
        # Assign model parameters and assign device
        model_params_actor = [state_size, action_size, seed, hidden_layers_actor]

        # Create the Actor network (with target network)
        self.actor_local = Actor(*model_params_actor).to(self.device)
        self.actor_target = Actor(*model_params_actor).to(self.device)

        # Set up the optimizer for the Actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        ### SET UP THE CRITIC NETWORK ###
        model_params_critic = [state_size, action_size, seed, hidden_layers_critic]

        # Create the Critic network (with target network)
        self.critic_local = Critic(*model_params_critic).to(self.device)
        self.critic_target = Critic(*model_params_critic).to(self.device)

        # Set up the optimizer for the Critic network
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic,
                                           weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed, device)

    def step(self, states, actions, rewards, next_states, dones, agent_number):
        # Increment the timestep
        self.timestep += 1

        # Save the experience in replay memory
        self.memory.add(states, actions, rewards, next_states, dones)

        # If there are enough samples and a model update is due at this timestep
        if len(self.memory) > self.batch_size and self.timestep % self.update_every == 0:
            for i in range(self.num_batches):
                # Sample experiences from memory and learn from them
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma, agent_number)

    def act(self, state, scale_noise=True):
        """Returns actions for the given state as per the current policy.

        Params
        ======
            state (array_like): current state
            scale_noise (bool): whether to add eps-scaled exploration noise to the action
        """
        state = torch.from_numpy(state).float().to(self.device)

        # Switch to evaluation mode and get the action for the current state
        self.actor_local.eval()
        with torch.no_grad():
            # Get the action for this agent
            action = [self.actor_local(state[0]).cpu().data.numpy()]
        # Switch back to training mode
        self.actor_local.train()

        # Stack into an ndarray so the noise can be added element-wise
        action = np.asarray(action)

        # Add noise to the action; its magnitude decreases as the agent keeps learning
        action += int(scale_noise) * self.eps * self.noise.sample()
        return np.clip(action, -1.0, 1.0)

    def reset(self):
        """Reset the noise and all neural-network parameters for the current agent."""
        self.noise.reset()
        self.eps = self.start_eps
        self.timestep = 0
        self.critic_local.reset_parameters()
        self.actor_local.reset_parameters()
        self.critic_target.reset_parameters()
        self.actor_target.reset_parameters()

        # Re-create the optimizer for the Actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)
        # Re-create the optimizer for the Critic network
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic,
                                           weight_decay=self.weight_decay_critic)
        # Clear the experience buffer
        self.memory.clear_buffer()

    def reset_noise(self):
        """Reset the noise only."""
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        #### DRAW FROM MEMORY AND PREPARE SARS DATA ####
        # From the experience buffer, separate out S_t, A_t, R_t, S_t+1 and the done flags.
        # NOTE: actions has dimension batch_size x (concatenated actions of all agents);
        # the slicing below assumes two agents with two-dimensional actions each.
        states, actions, rewards, next_states, dones = experiences

        # Get the next action for the current agent for the entire batch
        actions_next = self.actor_target(next_states)

        # Construct the next-action vector for the agent
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        #### UPDATE CRITIC ####
        # Get predicted next-state Q values from the target model
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute the critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip gradients at norm 1
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        #### UPDATE ACTOR ####
        # Compute the actor loss
        actions_pred = self.actor_local(states)

        # Construct the action-prediction vector relative to each agent
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)

        # Calculate the loss; note the negative sign, since we perform gradient ascent
        # on the critic's value estimate
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the target networks from the local networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # Update the noise-decay parameter
        self.eps -= self.eps_decay
        self.eps = max(self.eps, self.end_eps)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft-update model parameters: X_target = tau*X_local + (1 - tau)*X_target.

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
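# --- Usage sketch (illustrative, not part of the original code) -------------
# A minimal training loop for DDPG_Agent above, assuming a Gym-style
# environment whose reset() returns a (1, state_size) observation array and
# whose step() returns (next_state, reward, done, info). The function name,
# episode structure and the agent_number=0 choice are assumptions made for
# illustration only.
def train_ddpg_agent(env, agent, n_episodes=1000):
    scores = []
    for episode in range(n_episodes):
        state = env.reset()
        agent.reset_noise()                                   # fresh OU noise each episode
        score, done = 0.0, False
        while not done:
            action = agent.act(state)                         # noisy action from the local actor
            next_state, reward, done, _ = env.step(action)
            # store the transition and (periodically) learn from replayed minibatches
            agent.step(state, action, reward, next_state, done, agent_number=0)
            state, score = next_state, score + float(reward)
        scores.append(score)
    return scores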
class DdpgAgent():
    def __init__(self, config, seed, device="cpu"):
        self.seed = seed

        # -- Set environment
        self.action_size = config["env"]["action_size"]
        self.env = config["env"]["simulator"]
        self.brain_name = config["env"]["brain_name"]
        self.num_agents = config["env"]["num_agents"]

        # -- Construct Actor/Critic models (local and target)
        self.actor_local = Actor(config["env"]["state_size"], config["env"]["action_size"],
                                 seed, config["actor"]["hidden_layers"]).to(device)
        self.actor_target = Actor(config["env"]["state_size"], config["env"]["action_size"],
                                  seed, config["actor"]["hidden_layers"]).to(device)
        self.checkpoint = {"state_size": config["env"]["state_size"],
                           "action_size": config["env"]["action_size"],
                           "hidden_layers": config["actor"]["hidden_layers"],
                           "state_dict": self.actor_local.state_dict()}
        self.critic_local = Critic(config["env"]["state_size"], config["env"]["action_size"],
                                   seed, config["critic"]["hidden_layers"]).to(device)
        self.critic_target = Critic(config["env"]["state_size"], config["env"]["action_size"],
                                    seed, config["critic"]["hidden_layers"]).to(device)

        # -- Configure optimizers and (optional) learning-rate decay
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config["learning"]["lr_actor"])
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=config["learning"]["lr_critic"])
        self.optimizer_lr_decay = config["learning"]["lr_decay"]["activate"]
        self.actor_optimizer_lr_scheduler = optim.lr_scheduler.StepLR(
            self.actor_optimizer,
            step_size=config["learning"]["lr_decay"]["actor_step"],
            gamma=config["learning"]["lr_decay"]["actor_gamma"])
        self.critic_optimizer_lr_scheduler = optim.lr_scheduler.StepLR(
            self.critic_optimizer,
            step_size=config["learning"]["lr_decay"]["critic_step"],
            gamma=config["learning"]["lr_decay"]["critic_gamma"])

        # -- Set learning parameters
        self.batch_size = config["learning"]["batch_size"]
        self.buffer_size = config["learning"]["buffer_size"]
        self.discount = config["learning"]["discount"]
        self.max_t = config["learning"]["max_t"]
        self.tau = config["learning"]["soft_update_tau"]
        self.learn_every_n_steps = config["learning"]["learn_every_n_steps"]
        self.num_learn_steps = config["learning"]["num_learn_steps"]
        self.checkpointfile = config["learning"]["checkpointfile"]

        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed, device)
        self.device = device
        self.add_noise = True
        self.ou_noise = OUNoise(self.action_size, seed)

        # Start the target networks as exact copies of the local networks
        self.hard_copy(self.actor_local, self.actor_target)
        self.hard_copy(self.critic_local, self.critic_target)

    def steps(self):
        """Run one full episode in the (Unity-style) environment and return (score, step count)."""
        if self.optimizer_lr_decay:
            self.actor_optimizer_lr_scheduler.step()
            self.critic_optimizer_lr_scheduler.step()

        env_info = self.env.reset(train_mode=True)[self.brain_name]
        self.ou_noise.reset()
        state = env_info.vector_observations
        score = np.zeros(self.num_agents)
        self.step_ctr = 0
        while True:
            action = self.act(state)
            env_info = self.env.step(action)[self.brain_name]
            next_state = env_info.vector_observations  # get next state (for each agent)
            reward = env_info.rewards                  # get reward (for each agent)
            done = env_info.local_done
            self.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if np.any(done):
                break
        return score, self.step_ctr

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.step_ctr += 1
        # Learn every learn_every_n_steps steps, once enough samples are available
        if len(self.memory) > self.batch_size and self.step_ctr % self.learn_every_n_steps == 0:
            for _ in range(self.num_learn_steps):
                self.learn()

    def act(self, state):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()   # switch to evaluation mode
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()  # switch back to training mode
        if self.add_noise:
            action += self.ou_noise.sample()
        return np.clip(action, -1, 1)

    def learn(self):
        states, actions, rewards, next_states, dones = self.memory.sample_random()

        # -------------------- Update Critic -----------------------------
        # Get predicted next-state actions and Q values from the target models
        next_actions = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, next_actions).detach()
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.discount * Q_targets_next * (1 - dones))
        # Compute the critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # -------------------- Update Actor ------------------------------
        # Compute the actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------------------ Update Target Networks ----------------------
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

    def hard_copy(self, model_a, model_b):
        """Copy the parameters of model_a into model_b."""
        for param_a, param_b in zip(model_a.parameters(), model_b.parameters()):
            param_b.data.copy_(param_a.data)

    def reset(self):
        self.actor_local.reset_parameters()
        self.actor_target.reset_parameters()
        self.critic_local.reset_parameters()
        self.critic_target.reset_parameters()

    def set_lr(self, actor_lr=None, critic_lr=None):
        """Update the optimizer learning rates in place."""
        if actor_lr is not None:
            for param_group in self.actor_optimizer.param_groups:
                param_group["lr"] = actor_lr
        if critic_lr is not None:
            for param_group in self.critic_optimizer.param_groups:
                param_group["lr"] = critic_lr

    def save_model(self):
        torch.save(self.checkpoint, self.checkpointfile)

    def add_noise_on_act(self, noise_on_act):
        """When noise_on_act is True, OU noise is added in act()."""
        self.add_noise = noise_on_act
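# --- Example configuration (illustrative values) -----------------------------
# DdpgAgent reads all of its hyperparameters from a nested config dict. The
# keys below are exactly the ones the constructor accesses; the numeric values
# and the Unity-style environment fields are placeholder assumptions.
example_config = {
    "env": {
        "simulator": None,           # e.g. a UnityEnvironment instance
        "brain_name": "BrainName",   # placeholder
        "state_size": 33,
        "action_size": 4,
        "num_agents": 1,
    },
    "actor": {"hidden_layers": [128, 128]},
    "critic": {"hidden_layers": [128, 128]},
    "learning": {
        "lr_actor": 1e-3,
        "lr_critic": 1e-3,
        "lr_decay": {"activate": False,
                     "actor_step": 100, "actor_gamma": 0.5,
                     "critic_step": 100, "critic_gamma": 0.5},
        "batch_size": 128,
        "buffer_size": int(1e5),
        "discount": 0.99,
        "max_t": 1000,
        "soft_update_tau": 1e-3,
        "learn_every_n_steps": 20,
        "num_learn_steps": 10,
        "checkpointfile": "checkpoint.pth",
    },
}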
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.qvalue_local = Critic(state_size, action_size, seed).to(device)
        self.qvalue_target = Critic(state_size, action_size, seed).to(device)
        self.seed = seed

        # Start the target networks as exact copies of the local networks
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.qvalue_target.load_state_dict(self.qvalue_local.state_dict())

        self.noise = OUNoise(action_size, self.seed)
        self.qvalue_optimizer = optim.Adam(self.qvalue_local.parameters(), lr=LR_CRITIC)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    def reset(self):
        self.actor_local.reset_parameters()
        self.qvalue_local.reset_parameters()
        self.actor_target.reset_parameters()
        self.qvalue_target.reset_parameters()
        self.noise.reset()

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def into_tensor(self, state):
        return torch.tensor(state).float().to(device)

    def play(self, max_len=200):
        """Play one episode, updating the networks online at every step (no replay buffer)."""
        state1 = env.reset()
        score = 0
        for _ in range(max_len):
            action1 = self.act(state1)             # act() expects a NumPy state
            state1_t = self.into_tensor(state1)
            action1_t = self.into_tensor(action1)
            value1_pred = self.qvalue_local(state1_t, action1_t)

            state2, reward, done, _ = env.step(action1)
            state2_t = self.into_tensor(state2)
            action2 = self.actor_target(state2_t)
            value2 = self.qvalue_target(state2_t, action2)
            if not done:
                # Detach the bootstrapped value so the target critic receives no gradients
                expected_value = self.into_tensor(reward) + GAMMA * value2.detach()
            else:
                expected_value = self.into_tensor(reward)
            score += reward

            ## Critic update
            qvalue_loss = F.mse_loss(value1_pred, expected_value)
            self.qvalue_optimizer.zero_grad()
            qvalue_loss.backward()
            self.qvalue_optimizer.step()

            ## Actor update
            # Evaluate the local critic on the local actor's output so that
            # gradients flow back into the actor parameters
            actor_loss = -self.qvalue_local(state1_t, self.actor_local(state1_t)).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            state1 = state2
            self.soft_update(self.actor_local, self.actor_target, TAU)
            self.soft_update(self.qvalue_local, self.qvalue_target, TAU)
            if done:
                break
        return score

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def run(self, iteration, max_len=200):
        self.score_deque = deque(maxlen=100)
        for episode in range(iteration):
            score = self.play(max_len)
            self.score_deque.append(score)
            print("\rIteration {} with Current Score {}".format(episode, score), end=" ")
            if episode % 50 == 0:
                print("\rIteration {} with Average Score {}".format(
                    episode, sum(self.score_deque) / len(self.score_deque)))
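# --- Usage sketch (illustrative, not part of the original code) -------------
# Agent.play() relies on module-level globals (env, device, GAMMA, TAU,
# LR_ACTOR, LR_CRITIC) rather than constructor arguments. The helper below is
# one assumed way to wire those globals up for a Gym-style environment whose
# step() returns (next_state, reward, done, info); every name and value here
# is a placeholder assumption.
def run_online_ddpg(environment, state_size, action_size, seed=0, episodes=500, max_len=200):
    global env, device, GAMMA, TAU, LR_ACTOR, LR_CRITIC
    env = environment
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    GAMMA, TAU = 0.99, 1e-3            # assumed discount and soft-update factors
    LR_ACTOR, LR_CRITIC = 1e-4, 1e-3   # assumed learning rates
    agent = Agent(state_size, action_size, seed)
    agent.run(iteration=episodes, max_len=max_len)
    return agent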